146 files changed, 32918 insertions, 333 deletions
diff --git a/COPYRIGHT b/COPYRIGHT
new file mode 100644
index 0000000..75a9c94
--- /dev/null
+++ b/COPYRIGHT
@@ -0,0 +1,28 @@
+
+
+
+				  COPYRIGHT
+
+The following is a notice of limited availability of the code, and disclaimer
+which must be included in the prologue of the code and in all source listings
+of the code.
+
+Copyright Notice
+ + 2010 Computer System Laboratory, Institute of Information Science, 
+   Academia Sinica, Taiwan
+ + 2016 COVART Laboratory, Department of Computer Science and Information
+   Engineering, National Taiwan University, Taiwan.
+
+
+Permission is hereby granted to use, reproduce, prepare derivative works, and
+to redistribute to others. This software was authored by the following authors,
+sorted by surname:
+
+Name:  Sheng-Yu Fu
+Email: d03922013@csie.ntu.edu.tw
+
+Name:  Ding-Yong Hong
+Email: dyhong@iis.sinica.edu.tw
+
+Name:  Yu-Ping Liu
+Email: r04922005@csie.ntu.edu.tw
diff --git a/Makefile.target b/Makefile.target
index 962d004..4e4b1fe 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -158,6 +158,8 @@ GENERATED_HEADERS += hmp-commands.h hmp-commands-info.h qmp-commands-old.h
 
 endif # CONFIG_SOFTMMU
 
+include $(SRC_PATH)/llvm/hqemu.mk
+
 # Workaround for http://gcc.gnu.org/PR55489, see configure.
 %/translate.o: QEMU_CFLAGS += $(TRANSLATE_OPT_CFLAGS)
 
@@ -189,8 +191,8 @@ all-obj-$(CONFIG_SOFTMMU) += $(crypto-obj-y)
 $(QEMU_PROG_BUILD): config-devices.mak
 
 # build either PROG or PROGW
-$(QEMU_PROG_BUILD): $(all-obj-y) ../libqemuutil.a ../libqemustub.a
-	$(call LINK, $(filter-out %.mak, $^))
+$(QEMU_PROG_BUILD): $(all-obj-y) ../libqemuutil.a ../libqemustub.a $(LLVM_BITCODE)
+	$(call LINK, $(filter-out %.mak %.bc, $^))
 ifdef CONFIG_DARWIN
 	$(call quiet-command,Rez -append $(SRC_PATH)/pc-bios/qemu.rsrc -o $@,"  REZ   $(TARGET_DIR)$@")
 	$(call quiet-command,SetFile -a C $@,"  SETFILE $(TARGET_DIR)$@")
@@ -225,6 +227,9 @@ ifdef CONFIG_TRACE_SYSTEMTAP
 	$(INSTALL_DATA) $(QEMU_PROG).stp-installed "$(DESTDIR)$(qemu_datadir)/../systemtap/tapset/$(QEMU_PROG).stp"
 	$(INSTALL_DATA) $(QEMU_PROG)-simpletrace.stp "$(DESTDIR)$(qemu_datadir)/../systemtap/tapset/$(QEMU_PROG)-simpletrace.stp"
 endif
+ifneq ($(LLVM_BITCODE),)
+	$(INSTALL) -m 644 $(LLVM_BITCODE) "$(DESTDIR)$(bindir)"
+endif
 
 GENERATED_HEADERS += config-target.h
 Makefile: $(GENERATED_HEADERS)
diff --git a/configure b/configure
index 6ca6c64..80c9396 100755
--- a/configure
+++ b/configure
@@ -345,6 +345,9 @@ vhdx=""
 numa=""
 tcmalloc="no"
 jemalloc="no"
+llvm="no"
+bcflags=""
+libopencsd=""
 
 # parse CC options first
 for opt do
@@ -1169,6 +1172,12 @@ for opt do
   ;;
   --enable-jemalloc) jemalloc="yes"
   ;;
+  --enable-llvm) llvm="yes"
+  ;;
+  --clang-flags=*) bcflags="$optarg"
+  ;;
+  --with-libopencsd=*) libopencsd="$optarg"
+  ;;
   *)
       echo "ERROR: unknown option $opt"
       echo "Try '$0 --help' for more information"
@@ -1391,12 +1400,26 @@ disabled with --disable-FEATURE, default is enabled if available:
   numa            libnuma support
   tcmalloc        tcmalloc support
   jemalloc        jemalloc support
+  llvm            enable LLVM optimization
+  --clang-flags   flags for clang compiler
+  --with-libopencsd path to libopencsd library
 
 NOTE: The object files are built at the place where configure is launched
 EOF
 exit 0
 fi
 
+# LLVM support needs llvm-config in the PATH and LLVM 3.5 or later.
+if test "$llvm" != "no" ; then
+    llvm-config --version > /dev/null 2>&1 || { echo >&2 "llvm-config is not in the PATH"; exit 1; }
+    llvm_major=`llvm-config --version | cut -d'.' -f1`
+    llvm_minor=`llvm-config --version | cut -d'.' -f2`
+    # Reject any major version below 3, and 3.x releases older than 3.5.
+    if test "$llvm_major" -lt "3" || { test "$llvm_major" -eq "3" && test "$llvm_minor" -lt "5" ; } ; then
+        error_exit "LLVM version too old. Version 3.5 or later is required."
+    fi
+fi
+
 # Now we have handled --enable-tcg-interpreter and know we're not just
 # printing the help message, bail out if the host CPU isn't supported.
 if test "$ARCH" = "unknown"; then
@@ -1469,6 +1492,7 @@ gcc_flags="-Wmissing-include-dirs -Wempty-body -Wnested-externs $gcc_flags"
 gcc_flags="-Wendif-labels $gcc_flags"
 gcc_flags="-Wno-initializer-overrides $gcc_flags"
 gcc_flags="-Wno-string-plus-int $gcc_flags"
+gcc_flags="-Wno-format-truncation $gcc_flags"
 # Note that we do not add -Werror to gcc_flags here, because that would
 # enable it for all configure tests. If a configure test failed due
 # to -Werror this would just silently disable some features,
@@ -4847,6 +4871,11 @@ echo "bzip2 support     $bzip2"
 echo "NUMA host support $numa"
 echo "tcmalloc support  $tcmalloc"
 echo "jemalloc support  $jemalloc"
+# llvm-config may be missing when LLVM is disabled; keep its error out of the summary.
+echo "LLVM enabled      $llvm (version `llvm-config --version 2>/dev/null`)"
+if test "$libopencsd" != ""; then
+  echo "libopencsd        $libopencsd"
+fi
 
 if test "$sdl_too_old" = "yes"; then
 echo "-> Your SDL version is too old - please upgrade to have SDL support"
@@ -5252,6 +5281,21 @@ if test "$seccomp" = "yes"; then
   echo "CONFIG_SECCOMP=y" >> $config_host_mak
 fi
 
+if test "$llvm" != "no" ; then
+  echo "CONFIG_LLVM=y" >> $config_host_mak
+  echo "BCFLAGS=$bcflags" >> $config_host_mak
+  echo "LLVM_VERSION=LLVM_V`llvm-config --version | sed -e "s/\.//g" | cut -c 1-2`" >> $config_host_mak
+  echo "LLVM_CFLAGS=`llvm-config --cflags`" >> $config_host_mak
+  echo "LLVM_CXXFLAGS=`llvm-config --cxxflags`" >> $config_host_mak
+  echo "LLVM_LDFLAGS=`llvm-config --ldflags`" >> $config_host_mak
+  echo "LLVM_LIBS=`llvm-config --libs`" >> $config_host_mak
+fi
+
+if test "$libopencsd" != "" ; then
+  echo "CONFIG_LIBOPENCSD=y" >> $config_host_mak
+  echo "LIBOPENCSD=$libopencsd" >> $config_host_mak
+fi
+
 # XXX: suppress that
 if [ "$bsd" = "yes" ] ; then
   echo "CONFIG_BSD=y" >> $config_host_mak
@@ -5852,6 +5896,23 @@ fi
 echo "LDFLAGS+=$ldflags" >> $config_target_mak
 echo "QEMU_CFLAGS+=$cflags" >> $config_target_mak
 
+if test "$cpu" = "i386" -o "$cpu" = "x86_64" -o "$cpu" = "arm" ; then
+  case "$target_name" in
+  i386|x86_64)
+    echo "CONFIG_COREMU=y" >> $config_target_mak
+    ;;
+  esac
+fi
+
+if test "$llvm" != "no" ; then
+  # The helper bitcode is named per target; softmmu targets get a "_softmmu" suffix.
+  bitcode="llvm_helper_$target_name"
+  test "$target_softmmu" = "yes" && bitcode=$bitcode"_softmmu"
+  echo "LLVM_EXTRA_FLAGS+=-I. -I\$(SRC_PATH) $cflags $LLVM_EXTRA_FLAGS" >> $config_target_mak
+  # Makefile.target installs the bitcode into $(bindir), so record that directory
+  echo "CONFIG_LLVM_BITCODE=\"`eval echo $bindir`/$bitcode.bc\"" >> $config_target_mak
+fi
+
 done # for target in $targets
 
 if [ "$pixman" = "internal" ]; then
diff --git a/cpu-exec.c b/cpu-exec.c
index c88d0ff..5e1f380 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -31,6 +31,7 @@
 #include "hw/i386/apic.h"
 #endif
 #include "sysemu/replay.h"
+#include "hqemu.h"
 
 /* -icount align implementation. */
 
@@ -104,6 +105,7 @@ static void print_delay(const SyncClocks *sc)
 static void init_delay_params(SyncClocks *sc,
                               const CPUState *cpu)
 {
+    memset(sc, 0, sizeof(SyncClocks));
     if (!icount_align_option) {
         return;
     }
@@ -159,6 +161,10 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, uint8_t *tb_ptr)
     trace_exec_tb_exit((void *) (next_tb & ~TB_EXIT_MASK),
                        next_tb & TB_EXIT_MASK);
 
+#if defined(CONFIG_LLVM)
+    if ((next_tb & TB_EXIT_MASK) == TB_EXIT_LLVM)
+        return next_tb;
+#endif
     if ((next_tb & TB_EXIT_MASK) > TB_EXIT_IDX1) {
         /* We didn't start executing this TB (eg because the instruction
          * counter hit zero); we must restore the guest PC to the address
@@ -197,7 +203,7 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
     tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base, orig_tb->flags,
                      max_cycles | CF_NOCACHE
                          | (ignore_icount ? CF_IGNORE_ICOUNT : 0));
-    tb->orig_tb = tcg_ctx.tb_ctx.tb_invalidated_flag ? NULL : orig_tb;
+    tb->orig_tb = tcg_ctx.tb_ctx->tb_invalidated_flag ? NULL : orig_tb;
     cpu->current_tb = tb;
     /* execute the generated code */
     trace_exec_tb_nocache(tb, tb->pc);
@@ -218,13 +224,13 @@ static TranslationBlock *tb_find_physical(CPUState *cpu,
     tb_page_addr_t phys_pc, phys_page1;
     target_ulong virt_page2;
 
-    tcg_ctx.tb_ctx.tb_invalidated_flag = 0;
+    tcg_ctx.tb_ctx->tb_invalidated_flag = 0;
 
     /* find translated block using physical mappings */
     phys_pc = get_page_addr_code(env, pc);
     phys_page1 = phys_pc & TARGET_PAGE_MASK;
-    h = tb_phys_hash_func(phys_pc);
-    ptb1 = &tcg_ctx.tb_ctx.tb_phys_hash[h];
+    h = tb_phys_hash_func(pc);
+    ptb1 = &tcg_ctx.tb_ctx->tb_phys_hash[h];
     for(;;) {
         tb = *ptb1;
         if (!tb) {
@@ -253,8 +259,8 @@ static TranslationBlock *tb_find_physical(CPUState *cpu,
 
     /* Move the TB to the head of the list */
     *ptb1 = tb->phys_hash_next;
-    tb->phys_hash_next = tcg_ctx.tb_ctx.tb_phys_hash[h];
-    tcg_ctx.tb_ctx.tb_phys_hash[h] = tb;
+    tb->phys_hash_next = tcg_ctx.tb_ctx->tb_phys_hash[h];
+    tcg_ctx.tb_ctx->tb_phys_hash[h] = tb;
     return tb;
 }
 
@@ -315,6 +321,10 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu)
                  tb->flags != flags)) {
         tb = tb_find_slow(cpu, pc, cs_base, flags);
     }
+
+    itlb_update_entry(env, tb);
+    ibtc_update_entry(env, tb);
+
     return tb;
 }
 
@@ -492,29 +502,23 @@ int cpu_exec(CPUState *cpu)
                 tb = tb_find_fast(cpu);
                 /* Note: we do it here to avoid a gcc bug on Mac OS X when
                    doing it in tb_find_slow */
-                if (tcg_ctx.tb_ctx.tb_invalidated_flag) {
+                if (tcg_ctx.tb_ctx->tb_invalidated_flag) {
                     /* as some TB could have been invalidated because
                        of memory exceptions while generating the code, we
                        must recompute the hash index here */
                     next_tb = 0;
-                    tcg_ctx.tb_ctx.tb_invalidated_flag = 0;
+                    tcg_ctx.tb_ctx->tb_invalidated_flag = 0;
                 }
                 if (qemu_loglevel_mask(CPU_LOG_EXEC)) {
                     qemu_log("Trace %p [" TARGET_FMT_lx "] %s\n",
                              tb->tc_ptr, tb->pc, lookup_symbol(tb->pc));
                 }
-                /* see if we can patch the calling TB. When the TB
-                   spans two pages, we cannot safely do a direct
-                   jump. */
-                if (next_tb != 0 && tb->page_addr[1] == -1
-                    && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
-                    tb_add_jump((TranslationBlock *)(next_tb & ~TB_EXIT_MASK),
-                                next_tb & TB_EXIT_MASK, tb);
-                }
+
+                tracer_exec_tb(cpu->env_ptr, next_tb, tb);
                 tb_unlock();
                 if (likely(!cpu->exit_request)) {
                     trace_exec_tb(tb, tb->pc);
-                    tc_ptr = tb->tc_ptr;
+                    tc_ptr = tb->opt_ptr;
                     /* execute the generated code */
                     cpu->current_tb = tb;
                     next_tb = cpu_tb_exec(cpu, tc_ptr);
@@ -533,9 +537,14 @@ int cpu_exec(CPUState *cpu)
                          */
                         smp_rmb();
                         next_tb = 0;
+
+                        tracer_reset(cpu->env_ptr);
                         break;
                     case TB_EXIT_ICOUNT_EXPIRED:
                     {
+#if defined(CONFIG_LLVM)
+                        break;
+#endif
                         /* Instruction counter expired.  */
                         int insns_left = cpu->icount_decr.u32;
                         if (cpu->icount_extra && insns_left >= 0) {
@@ -590,6 +599,8 @@ int cpu_exec(CPUState *cpu)
 #endif /* buggy compiler */
             cpu->can_do_io = 1;
             tb_lock_reset();
+
+            tracer_reset(cpu->env_ptr);
         }
     } /* for(;;) */
 
diff --git a/cpus.c b/cpus.c
index 747f14d..04da0ad 100644
--- a/cpus.c
+++ b/cpus.c
@@ -66,6 +66,9 @@
 
 #endif /* CONFIG_LINUX */
 
+#include "tcg.h"
+#include "hqemu.h"
+
 static CPUState *next_cpu;
 int64_t max_delay;
 int64_t max_advance;
@@ -892,6 +895,18 @@ void qemu_init_cpu_loop(void)
     qemu_thread_get_self(&io_thread);
 }
 
+/* Call optimization_finalize() on every CPU, then llvm_finalize() if CONFIG_LLVM. */
+void qemu_end_cpu_loop(void)
+{
+    CPUState *cpu;
+    CPU_FOREACH(cpu) {
+        optimization_finalize(cpu->env_ptr);
+    }
+#if defined(CONFIG_LLVM)
+    llvm_finalize();
+#endif
+}
+
 void run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
 {
     struct qemu_work_item wi;
@@ -1134,6 +1149,16 @@ static void *qemu_tcg_cpu_thread_fn(void *arg)
     /* process any pending work */
     atomic_mb_set(&exit_request, 1);
 
+#if defined(CONFIG_LLVM)
+    llvm_init();
+#endif
+    /* we can safely initialize optimization resources after
+     * the setup of CPUArchState is completed. */
+    CPU_FOREACH(cpu) {
+        copy_tcg_context();
+        optimization_init(cpu->env_ptr);
+    }
+
     while (1) {
         tcg_exec_all();
 
diff --git a/cputlb.c b/cputlb.c
index bf1d50a..c81c3be 100644
--- a/cputlb.c
+++ b/cputlb.c
@@ -19,6 +19,7 @@
 
 #include "config.h"
 #include "cpu.h"
+#include "exec/tb-hash.h"
 #include "exec/exec-all.h"
 #include "exec/memory.h"
 #include "exec/address-spaces.h"
@@ -30,12 +31,38 @@
 #include "exec/ram_addr.h"
 #include "tcg/tcg.h"
 
+#include "hqemu.h"
+
+#if defined(ENABLE_TLBVERSION)  /* TLB addresses also carry a version tag */
+#define TLB_NONIO_MASK       (TARGET_PAGE_MASK | TLB_INVALID_MASK | TLB_VERSION_MASK)
+#define page_val(addr, env)  (((tlbaddr_t)(addr) & TARGET_PAGE_MASK) | tlb_version(env))
+#else  /* plain page address; env is not evaluated */
+#define TLB_NONIO_MASK       (TARGET_PAGE_MASK | TLB_INVALID_MASK)
+#define page_val(addr, env)  ((addr) & TARGET_PAGE_MASK)
+#endif
+
 //#define DEBUG_TLB
 //#define DEBUG_TLB_CHECK
 
 /* statistics */
 int tlb_flush_count;
 
+/* Reset the TLB.  With ENABLE_TLBVERSION the version tag is advanced and the
+ * tables are only wiped when the tag reaches TLB_VERSION_SIZE and wraps to 0;
+ * without it both tables are wiped on every call. */
+static inline void tlb_reset(CPUArchState *env)
+{
+#if defined(ENABLE_TLBVERSION)
+    tlbaddr_t next = (env->tlb_version >> TLB_VERSION_SHIFT) + 1;
+    int wrapped = (next == TLB_VERSION_SIZE);
+    env->tlb_version = (wrapped ? (tlbaddr_t)0 : next) << TLB_VERSION_SHIFT;
+    if (!wrapped)
+        return;
+#endif
+    memset(env->tlb_table, -1, sizeof(env->tlb_table));
+    memset(env->tlb_v_table, -1, sizeof(env->tlb_v_table));
+}
+
 /* NOTE:
  * If flush_global is true (the usual case), flush all tlb entries.
  * If flush_global is false, flush (at least) all tlb entries not
@@ -59,10 +86,12 @@ void tlb_flush(CPUState *cpu, int flush_global)
        links while we are modifying them */
     cpu->current_tb = NULL;
 
-    memset(env->tlb_table, -1, sizeof(env->tlb_table));
-    memset(env->tlb_v_table, -1, sizeof(env->tlb_v_table));
+    tlb_reset(env);
     memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache));
 
+    optimization_reset(env, 0);
+    lpt_reset(env);
+
     env->vtlb_index = 0;
     env->tlb_flush_addr = -1;
     env->tlb_flush_mask = 0;
@@ -110,18 +139,67 @@ void tlb_flush_by_mmuidx(CPUState *cpu, ...)
     va_end(argp);
 }
 
-static inline void tlb_flush_entry(CPUTLBEntry *tlb_entry, target_ulong addr)
+static inline void tlb_flush_entry(CPUArchState *env, CPUTLBEntry *tlb_entry,
+                                   target_ulong addr)
 {
-    if (addr == (tlb_entry->addr_read &
-                 (TARGET_PAGE_MASK | TLB_INVALID_MASK)) ||
-        addr == (tlb_entry->addr_write &
-                 (TARGET_PAGE_MASK | TLB_INVALID_MASK)) ||
-        addr == (tlb_entry->addr_code &
-                 (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
+    if (page_val(addr, env) == (tlb_entry->addr_read & TLB_NONIO_MASK) ||
+        page_val(addr, env) == (tlb_entry->addr_write & TLB_NONIO_MASK) ||
+        page_val(addr, env) == (tlb_entry->addr_code & TLB_NONIO_MASK)) {
         memset(tlb_entry, -1, sizeof(*tlb_entry));
     }
 }
 
+#ifdef ENABLE_LPAGE
+/* Flush the large page that contains addr.  Returns 0 when lpt_flush_page()
+ * returns 0 and 1 after a partial or full TLB flush.  NOTE(review): assumes
+ * lpt_flush_page() stores the base of that page in page_addr -- confirm. */
+static int tlb_flush_large_page(CPUState *cpu, target_ulong addr)
+{
+    int i, j, k, mmu_idx, num_base_pages, max_flush_pages;
+    target_ulong page_addr, page_size, flush_addr;
+    CPUArchState *env = cpu->env_ptr;
+#if defined(DEBUG_TLB)
+    printf("tlb_flush:\n");
+#endif
+    /* must reset current TB so that interrupts cannot modify the
+       links while we are modifying them */
+    cpu->current_tb = NULL;
+
+    if (lpt_flush_page(env, addr, &page_addr, &page_size) == 0)
+        return 0;
+
+    /* If the large page occupies a small set of the tlb, do a partial flush
+     * optimization, otherwise, do a full flush. */
+    num_base_pages = page_size / TARGET_PAGE_SIZE;
+    max_flush_pages = (CPU_TLB_SIZE / 4 < 1024) ? CPU_TLB_SIZE / 4 : 1024;
+    if (num_base_pages > max_flush_pages) {
+        tlb_flush(cpu, 1);
+        return 1;
+    }
+
+    /* Walk the whole page from its base rather than from the requested addr. */
+    for (i = 0; i < num_base_pages; i++) {
+        flush_addr = page_addr + i * TARGET_PAGE_SIZE;
+        j = (flush_addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
+        for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
+            tlb_flush_entry(env, &env->tlb_table[mmu_idx][j], flush_addr);
+            /* the victim tlb is fully associative, so scan every slot */
+            for (k = 0; k < CPU_VTLB_SIZE; k++)
+                tlb_flush_entry(env, &env->tlb_v_table[mmu_idx][k], flush_addr);
+        }
+    }
+
+    /* i starts at -1 so the jump cache of the preceding page is cleared too */
+    for (i = -1; i < num_base_pages; i++) {
+        j = tb_jmp_cache_hash_page(page_addr + i * TARGET_PAGE_SIZE);
+        memset(&cpu->tb_jmp_cache[j], 0,
+               TB_JMP_PAGE_SIZE * sizeof(TranslationBlock *));
+    }
+    optimization_reset(env, 0);
+    return 1;
+}
+#endif
+
 void tlb_flush_page(CPUState *cpu, target_ulong addr)
 {
     CPUArchState *env = cpu->env_ptr;
@@ -138,8 +216,14 @@ void tlb_flush_page(CPUState *cpu, target_ulong addr)
                TARGET_FMT_lx "/" TARGET_FMT_lx ")\n",
                env->tlb_flush_addr, env->tlb_flush_mask);
 #endif
+
+#ifdef ENABLE_LPAGE
+        if (tlb_flush_large_page(cpu, addr))
+            return;
+#else
         tlb_flush(cpu, 1);
         return;
+#endif
     }
     /* must reset current TB so that interrupts cannot modify the
        links while we are modifying them */
@@ -148,18 +232,19 @@ void tlb_flush_page(CPUState *cpu, target_ulong addr)
     addr &= TARGET_PAGE_MASK;
     i = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
     for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
-        tlb_flush_entry(&env->tlb_table[mmu_idx][i], addr);
+        tlb_flush_entry(env, &env->tlb_table[mmu_idx][i], addr);
     }
 
     /* check whether there are entries that need to be flushed in the vtlb */
     for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
         int k;
         for (k = 0; k < CPU_VTLB_SIZE; k++) {
-            tlb_flush_entry(&env->tlb_v_table[mmu_idx][k], addr);
+            tlb_flush_entry(env, &env->tlb_v_table[mmu_idx][k], addr);
         }
     }
 
     tb_flush_jmp_cache(cpu, addr);
+    optimization_flush_page(env, addr);
 }
 
 void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, ...)
@@ -202,11 +287,11 @@ void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, ...)
         printf(" %d", mmu_idx);
 #endif
 
-        tlb_flush_entry(&env->tlb_table[mmu_idx][i], addr);
+        tlb_flush_entry(env, &env->tlb_table[mmu_idx][i], addr);
 
         /* check whether there are vltb entries that need to be flushed */
         for (k = 0; k < CPU_VTLB_SIZE; k++) {
-            tlb_flush_entry(&env->tlb_v_table[mmu_idx][k], addr);
+            tlb_flush_entry(env, &env->tlb_v_table[mmu_idx][k], addr);
         }
     }
     va_end(argp);
@@ -284,10 +369,11 @@ void tlb_reset_dirty(CPUState *cpu, ram_addr_t start1, ram_addr_t length)
     }
 }
 
-static inline void tlb_set_dirty1(CPUTLBEntry *tlb_entry, target_ulong vaddr)
+static inline void tlb_set_dirty1(CPUTLBEntry *tlb_entry, target_ulong vaddr,
+                                  tlbaddr_t version)
 {
-    if (tlb_entry->addr_write == (vaddr | TLB_NOTDIRTY)) {
-        tlb_entry->addr_write = vaddr;
+    if (tlb_entry->addr_write == (vaddr | TLB_NOTDIRTY | version)) {
+        tlb_entry->addr_write = vaddr | version;
     }
 }
 
@@ -302,13 +388,13 @@ void tlb_set_dirty(CPUState *cpu, target_ulong vaddr)
     vaddr &= TARGET_PAGE_MASK;
     i = (vaddr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
     for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
-        tlb_set_dirty1(&env->tlb_table[mmu_idx][i], vaddr);
+        tlb_set_dirty1(&env->tlb_table[mmu_idx][i], vaddr, tlb_version(env));
     }
 
     for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
         int k;
         for (k = 0; k < CPU_VTLB_SIZE; k++) {
-            tlb_set_dirty1(&env->tlb_v_table[mmu_idx][k], vaddr);
+            tlb_set_dirty1(&env->tlb_v_table[mmu_idx][k], vaddr, tlb_version(env));
         }
     }
 }
@@ -360,6 +446,7 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
     assert(size >= TARGET_PAGE_SIZE);
     if (size != TARGET_PAGE_SIZE) {
         tlb_add_large_page(env, vaddr, size);
+        lpt_add_page(env, vaddr, size);
     }
 
     sz = size;
@@ -424,6 +511,13 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
     } else {
         te->addr_write = -1;
     }
+
+#ifdef ENABLE_TLBVERSION
+    tlbaddr_t version = tlb_version(env);
+    te->addr_read |= version;
+    te->addr_write |= version;
+    te->addr_code |= version;
+#endif
 }
 
 /* Add a new TLB entry, but without specifying the memory
@@ -452,7 +546,7 @@ tb_page_addr_t get_page_addr_code(CPUArchState *env1, target_ulong addr)
     page_index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
     mmu_idx = cpu_mmu_index(env1, true);
     if (unlikely(env1->tlb_table[mmu_idx][page_index].addr_code !=
-                 (addr & TARGET_PAGE_MASK))) {
+                 page_val(addr, env1))) {
         cpu_ldub_code(env1, addr);
     }
     pd = env1->iotlb[mmu_idx][page_index].addr & ~TARGET_PAGE_MASK;
@@ -471,6 +565,9 @@ tb_page_addr_t get_page_addr_code(CPUArchState *env1, target_ulong addr)
     return qemu_ram_addr_from_host_nofail(p);
 }
 
+#undef TLB_NONIO_MASK
+#undef page_val
+
 #define MMUSUFFIX _mmu
 
 #define SHIFT 0
diff --git a/exec.c b/exec.c
index 0bf0a6e..30ab09f 100644
--- a/exec.c
+++ b/exec.c
@@ -706,7 +706,7 @@ int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
     }
     wp = g_malloc(sizeof(*wp));
 
-    wp->vaddr = addr;
+    wp->addr = addr;
     wp->len = len;
     wp->flags = flags;
 
@@ -731,7 +731,7 @@ int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
     CPUWatchpoint *wp;
 
     QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
-        if (addr == wp->vaddr && len == wp->len
+        if (addr == wp->addr && len == wp->len
                 && flags == (wp->flags & ~BP_WATCHPOINT_HIT)) {
             cpu_watchpoint_remove_by_ref(cpu, wp);
             return 0;
@@ -745,7 +745,7 @@ void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
 {
     QTAILQ_REMOVE(&cpu->watchpoints, watchpoint, entry);
 
-    tlb_flush_page(cpu, watchpoint->vaddr);
+    tlb_flush_page(cpu, watchpoint->addr);
 
     g_free(watchpoint);
 }
@@ -776,10 +776,10 @@ static inline bool cpu_watchpoint_address_matches(CPUWatchpoint *wp,
      * exactly at the top of the address space and so addr + len
      * wraps round to zero.
      */
-    vaddr wpend = wp->vaddr + wp->len - 1;
+    vaddr wpend = wp->addr + wp->len - 1;
     vaddr addrend = addr + len - 1;
 
-    return !(addr > wpend || wp->vaddr > addrend);
+    return !(addr > wpend || wp->addr > addrend);
 }
 
 #endif
diff --git a/gdbstub.c b/gdbstub.c
index 9c29aa0..a24d9ef 100644
--- a/gdbstub.c
+++ b/gdbstub.c
@@ -1267,7 +1267,7 @@ static void gdb_vm_state_change(void *opaque, int running, RunState state)
             snprintf(buf, sizeof(buf),
                      "T%02xthread:%02x;%swatch:" TARGET_FMT_lx ";",
                      GDB_SIGNAL_TRAP, cpu_index(cpu), type,
-                     (target_ulong)cpu->watchpoint_hit->vaddr);
+                     (target_ulong)cpu->watchpoint_hit->addr);
             cpu->watchpoint_hit = NULL;
             goto send_packet;
         }
diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
index 83b1781..9471dc6 100644
--- a/include/exec/cpu-all.h
+++ b/include/exec/cpu-all.h
@@ -271,12 +271,12 @@ CPUArchState *cpu_copy(CPUArchState *env);
 /* Flags stored in the low bits of the TLB virtual address.  These are
    defined so that fast path ram access is all zeros.  */
 /* Zero if TLB entry is valid.  */
-#define TLB_INVALID_MASK   (1 << 3)
+#define TLB_INVALID_MASK   (1 << TLB_INVALID_SHIFT)
 /* Set if TLB entry references a clean RAM page.  The iotlb entry will
    contain the page physical address.  */
-#define TLB_NOTDIRTY    (1 << 4)
+#define TLB_NOTDIRTY    (1 << TLB_NOTDIRTY_SHIFT)
 /* Set if TLB entry is an IO callback.  */
-#define TLB_MMIO        (1 << 5)
+#define TLB_MMIO        (1 << TLB_MMIO_SHIFT)
 
 void dump_exec_info(FILE *f, fprintf_function cpu_fprintf);
 void dump_opcount_info(FILE *f, fprintf_function cpu_fprintf);
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 85aa403..ce7deb9 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -76,12 +76,12 @@ void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
 static inline void cpu_physical_memory_read(hwaddr addr,
                                             void *buf, int len)
 {
-    cpu_physical_memory_rw(addr, buf, len, 0);
+    cpu_physical_memory_rw(addr, (uint8_t *)buf, len, 0);
 }
 static inline void cpu_physical_memory_write(hwaddr addr,
                                              const void *buf, int len)
 {
-    cpu_physical_memory_rw(addr, (void *)buf, len, 1);
+    cpu_physical_memory_rw(addr, (uint8_t *)buf, len, 1);
 }
 void *cpu_physical_memory_map(hwaddr addr,
                               hwaddr *plen,
diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
index 5093be2..b44e3f2 100644
--- a/include/exec/cpu-defs.h
+++ b/include/exec/cpu-defs.h
@@ -56,6 +56,8 @@ typedef uint64_t target_ulong;
 #error TARGET_LONG_SIZE undefined
 #endif
 
+#include "hqemu-config.h"
+
 #if !defined(CONFIG_USER_ONLY)
 /* use a fully associative victim tlb of 8 entries */
 #define CPU_VTLB_SIZE 8
@@ -89,7 +91,7 @@ typedef uint64_t target_ulong;
  * of tlb_table inside env (which is non-trivial but not huge).
  */
 #define CPU_TLB_BITS                                             \
-    MIN(8,                                                       \
+    MIN(12,                                                      \
         TCG_TARGET_TLB_DISPLACEMENT_BITS - CPU_TLB_ENTRY_BITS -  \
         (NB_MMU_MODES <= 1 ? 0 :                                 \
          NB_MMU_MODES <= 2 ? 1 :                                 \
@@ -107,9 +109,9 @@ typedef struct CPUTLBEntry {
     */
     union {
         struct {
-            target_ulong addr_read;
-            target_ulong addr_write;
-            target_ulong addr_code;
+            tlbaddr_t addr_read;
+            tlbaddr_t addr_write;
+            tlbaddr_t addr_code;
             /* Addend to virtual address to get host address.  IO accesses
                use the corresponding iotlb value.  */
             uintptr_t addend;
@@ -140,6 +142,7 @@ typedef struct CPUIOTLBEntry {
     target_ulong tlb_flush_addr;                                        \
     target_ulong tlb_flush_mask;                                        \
     target_ulong vtlb_index;                                            \
+    tlbaddr_t tlb_version;                                              \
 
 #else
 
diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
index b573df5..72acce7 100644
--- a/include/exec/cpu_ldst.h
+++ b/include/exec/cpu_ldst.h
@@ -405,7 +405,7 @@ static inline void *tlb_vaddr_to_host(CPUArchState *env, target_ulong addr,
 #else
     int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
     CPUTLBEntry *tlbentry = &env->tlb_table[mmu_idx][index];
-    target_ulong tlb_addr;
+    tlbaddr_t tlb_addr;
     uintptr_t haddr;
 
     switch (access_type) {
@@ -422,13 +422,22 @@ static inline void *tlb_vaddr_to_host(CPUArchState *env, target_ulong addr,
         g_assert_not_reached();
     }
 
+#if defined(ENABLE_TLBVERSION)
+    if (tlb_version(env) != (tlb_addr & TLB_VERSION_MASK))
+        return NULL;
+#endif
+
     if ((addr & TARGET_PAGE_MASK)
         != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
         /* TLB entry is for a different page */
         return NULL;
     }
 
+#if defined(ENABLE_TLBVERSION)
+    if (tlb_addr & (TLB_NOTDIRTY | TLB_MMIO)) {
+#else
     if (tlb_addr & ~TARGET_PAGE_MASK) {
+#endif
         /* IO access */
         return NULL;
     }
diff --git a/include/exec/cpu_ldst_template.h b/include/exec/cpu_ldst_template.h
index 3091c00..2a01c6f 100644
--- a/include/exec/cpu_ldst_template.h
+++ b/include/exec/cpu_ldst_template.h
@@ -67,6 +67,14 @@
 #define SRETSUFFIX glue(s, SUFFIX)
 #endif
 
+#include "hqemu.h"
+
+#if defined(ENABLE_TLBVERSION)  /* page of the last accessed byte, plus version tag */
+#define page_val(addr, env)  ((((tlbaddr_t)(addr) + DATA_SIZE - 1) & TARGET_PAGE_MASK) | tlb_version(env))
+#else  /* page of addr; the low DATA_SIZE-1 bits stay in the comparison */
+#define page_val(addr, env)  ((addr) & (TARGET_PAGE_MASK | (DATA_SIZE - 1)))
+#endif
+
 /* generic load/store macros */
 
 static inline RES_TYPE
@@ -80,12 +88,17 @@ glue(glue(glue(cpu_ld, USUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
     int mmu_idx;
     TCGMemOpIdx oi;
 
+#ifdef SOFTMMU_CODE_ACCESS
+    if (build_llvm_only(env))
+        return glue(glue(ld, USUFFIX), _p)((uint8_t *)env->image_base + ptr);
+#endif
+
     addr = ptr;
     page_index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
     mmu_idx = CPU_MMU_INDEX;
     if (unlikely(env->tlb_table[mmu_idx][page_index].ADDR_READ !=
-                 (addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) {
-        oi = make_memop_idx(SHIFT, mmu_idx);
+                 page_val(addr, env))) {
+        oi = make_memop_idx((TCGMemOp)SHIFT, mmu_idx);
         res = glue(glue(helper_ret_ld, URETSUFFIX), MMUSUFFIX)(env, addr,
                                                             oi, retaddr);
     } else {
@@ -112,12 +125,17 @@ glue(glue(glue(cpu_lds, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
     int mmu_idx;
     TCGMemOpIdx oi;
 
+#ifdef SOFTMMU_CODE_ACCESS
+    if (build_llvm_only(env))
+        return glue(glue(lds, SUFFIX), _p)((uint8_t *)env->image_base + ptr);
+#endif
+
     addr = ptr;
     page_index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
     mmu_idx = CPU_MMU_INDEX;
     if (unlikely(env->tlb_table[mmu_idx][page_index].ADDR_READ !=
-                 (addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) {
-        oi = make_memop_idx(SHIFT, mmu_idx);
+                 page_val(addr, env))) {
+        oi = make_memop_idx((TCGMemOp)SHIFT, mmu_idx);
         res = (DATA_STYPE)glue(glue(helper_ret_ld, SRETSUFFIX),
                                MMUSUFFIX)(env, addr, oi, retaddr);
     } else {
@@ -152,8 +170,8 @@ glue(glue(glue(cpu_st, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
     page_index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
     mmu_idx = CPU_MMU_INDEX;
     if (unlikely(env->tlb_table[mmu_idx][page_index].addr_write !=
-                 (addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) {
-        oi = make_memop_idx(SHIFT, mmu_idx);
+                 page_val(addr, env))) {
+        oi = make_memop_idx((TCGMemOp)SHIFT, mmu_idx);
         glue(glue(helper_ret_st, SUFFIX), MMUSUFFIX)(env, addr, v, oi,
                                                      retaddr);
     } else {
@@ -171,6 +189,7 @@ glue(glue(cpu_st, SUFFIX), MEMSUFFIX)(CPUArchState *env, target_ulong ptr,
 
 #endif /* !SOFTMMU_CODE_ACCESS */
 
+#undef page_val
 #undef RES_TYPE
 #undef DATA_TYPE
 #undef DATA_STYPE
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index d900b0d..a225bea 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -21,6 +21,7 @@
 #define _EXEC_ALL_H_
 
 #include "qemu-common.h"
+#include "hqemu-config.h"
 
 /* allow to see translation results - the slowdown should be negligible, so we leave it */
 #define DEBUG_DISAS
@@ -59,7 +60,7 @@ typedef struct TranslationBlock TranslationBlock;
  * and up to 4 + N parameters on 64-bit archs
  * (N = number of input arguments + output arguments).  */
 #define MAX_OPC_PARAM (4 + (MAX_OPC_PARAM_PER_ARG * MAX_OPC_PARAM_ARGS))
-#define OPC_BUF_SIZE 640
+#define OPC_BUF_SIZE 2048
 #define OPC_MAX_SIZE (OPC_BUF_SIZE - MAX_OP_PER_INSTR)
 
 #define OPPARAM_BUF_SIZE (OPC_BUF_SIZE * MAX_OPC_PARAM)
@@ -216,6 +217,8 @@ struct TranslationBlock {
        jmp_first */
     struct TranslationBlock *jmp_next[2];
     struct TranslationBlock *jmp_first;
+
+    TB_OPTIMIZATION_COMMON
 };
 
 #include "qemu/thread.h"
@@ -305,7 +308,7 @@ static inline void tb_set_jmp_target(TranslationBlock *tb,
                                      int n, uintptr_t addr)
 {
     uint16_t offset = tb->tb_jmp_offset[n];
-    tb_set_jmp_target1((uintptr_t)(tb->tc_ptr + offset), addr);
+    tb_set_jmp_target1((uintptr_t)((uint8_t *)tb->tc_ptr + offset), addr);
 }
 
 #else
@@ -405,4 +408,6 @@ extern int singlestep;
 extern CPUState *tcg_current_cpu;
 extern bool exit_request;
 
+size_t get_cpu_size(void);
+
 #endif
diff --git a/include/exec/memory.h b/include/exec/memory.h
index 0f07159..c2a1cd3 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -208,9 +208,9 @@ struct MemoryListener {
     void (*region_del)(MemoryListener *listener, MemoryRegionSection *section);
     void (*region_nop)(MemoryListener *listener, MemoryRegionSection *section);
     void (*log_start)(MemoryListener *listener, MemoryRegionSection *section,
-                      int old, int new);
+                      int _old, int _new);
     void (*log_stop)(MemoryListener *listener, MemoryRegionSection *section,
-                     int old, int new);
+                     int _old, int _new);
     void (*log_sync)(MemoryListener *listener, MemoryRegionSection *section);
     void (*log_global_start)(MemoryListener *listener);
     void (*log_global_stop)(MemoryListener *listener);
diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h
index c537969..4453e5b 100644
--- a/include/hw/qdev-core.h
+++ b/include/hw/qdev-core.h
@@ -10,6 +10,8 @@
 #include "qapi/error.h"
 #include "hw/hotplug.h"
 
+#define typename QEMUtypename
+
 enum {
     DEV_NVECTORS_UNSPECIFIED = -1,
 };
@@ -401,4 +403,6 @@ static inline bool qbus_is_hotpluggable(BusState *bus)
 void device_listener_register(DeviceListener *listener);
 void device_listener_unregister(DeviceListener *listener);
 
+#undef typename
+
 #endif
diff --git a/include/qemu-common.h b/include/qemu-common.h
index 405364f..d0c2e20 100644
--- a/include/qemu-common.h
+++ b/include/qemu-common.h
@@ -454,7 +454,7 @@ int mod_utf8_codepoint(const char *s, size_t n, char **end);
 void qemu_hexdump(const char *buf, FILE *fp, const char *prefix, size_t size);
 
 /* vector definitions */
-#ifdef __ALTIVEC__
+#if defined(__ALTIVEC__) && !defined(__clang__)
 #include <altivec.h>
 /* The altivec.h header says we're allowed to undef these for
  * C++ compatibility.  Here we don't care about C++, but we
diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h
index bd2c075..e2125bd 100644
--- a/include/qemu/atomic.h
+++ b/include/qemu/atomic.h
@@ -158,13 +158,13 @@
 #ifndef atomic_rcu_read
 #ifdef __ATOMIC_CONSUME
 #define atomic_rcu_read(ptr)    ({                \
-    typeof(*ptr) _val;                            \
+    __typeof__(*ptr) _val;                        \
      __atomic_load(ptr, &_val, __ATOMIC_CONSUME); \
     _val;                                         \
 })
 #else
 #define atomic_rcu_read(ptr)    ({                \
-    typeof(*ptr) _val = atomic_read(ptr);         \
+    __typeof__(*ptr) _val = atomic_read(ptr);     \
     smp_read_barrier_depends();                   \
     _val;                                         \
 })
@@ -185,7 +185,7 @@
 #ifndef atomic_rcu_set
 #ifdef __ATOMIC_RELEASE
 #define atomic_rcu_set(ptr, i)  do {              \
-    typeof(*ptr) _val = (i);                      \
+    __typeof__(*ptr) _val = (i);                  \
     __atomic_store(ptr, &_val, __ATOMIC_RELEASE); \
 } while(0)
 #else
@@ -220,7 +220,7 @@
  */
 #ifndef atomic_mb_read
 #define atomic_mb_read(ptr)    ({           \
-    typeof(*ptr) _val = atomic_read(ptr);   \
+    __typeof__(*ptr) _val = atomic_read(ptr);   \
     smp_rmb();                              \
     _val;                                   \
 })
@@ -239,7 +239,7 @@
 #define atomic_xchg(ptr, i)    __sync_swap(ptr, i)
 #elif defined(__ATOMIC_SEQ_CST)
 #define atomic_xchg(ptr, i)    ({                           \
-    typeof(*ptr) _new = (i), _old;                          \
+    __typeof__(*ptr) _new = (i), _old;                      \
     __atomic_exchange(ptr, &_new, &_old, __ATOMIC_SEQ_CST); \
     _old;                                                   \
 })
diff --git a/include/qemu/bitmap.h b/include/qemu/bitmap.h
index 86dd9cd..b53f462 100644
--- a/include/qemu/bitmap.h
+++ b/include/qemu/bitmap.h
@@ -71,7 +71,7 @@
         unsigned long name[BITS_TO_LONGS(bits)]
 
 #define small_nbits(nbits)                      \
-        ((nbits) <= BITS_PER_LONG)
+        ((nbits) <= (long)BITS_PER_LONG)
 
 int slow_bitmap_empty(const unsigned long *bitmap, long bits);
 int slow_bitmap_full(const unsigned long *bitmap, long bits);
@@ -97,7 +97,7 @@ int slow_bitmap_intersects(const unsigned long *bitmap1,
 static inline unsigned long *bitmap_try_new(long nbits)
 {
     long len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
-    return g_try_malloc0(len);
+    return (unsigned long *)g_try_malloc0(len);
 }
 
 static inline unsigned long *bitmap_new(long nbits)
@@ -241,9 +241,9 @@ static inline unsigned long *bitmap_zero_extend(unsigned long *old,
                                                 long old_nbits, long new_nbits)
 {
     long new_len = BITS_TO_LONGS(new_nbits) * sizeof(unsigned long);
-    unsigned long *new = g_realloc(old, new_len);
-    bitmap_clear(new, old_nbits, new_nbits - old_nbits);
-    return new;
+    unsigned long *new_bitmap = (unsigned long *)g_realloc(old, new_len);
+    bitmap_clear(new_bitmap, old_nbits, new_nbits - old_nbits);
+    return new_bitmap;
 }
 
 #endif /* BITMAP_H */
diff --git a/include/qemu/compiler.h b/include/qemu/compiler.h
index d22eb01..0abf0f8 100644
--- a/include/qemu/compiler.h
+++ b/include/qemu/compiler.h
@@ -60,7 +60,7 @@
 
 #ifndef container_of
 #define container_of(ptr, type, member) ({                      \
-        const typeof(((type *) 0)->member) *__mptr = (ptr);     \
+        const __typeof__(((type *) 0)->member) *__mptr = (ptr); \
         (type *) ((char *) __mptr - offsetof(type, member));})
 #endif
 
@@ -74,7 +74,7 @@
 #define DO_UPCAST(type, field, dev) container_of(dev, type, field)
 #endif
 
-#define typeof_field(type, field) typeof(((type *)0)->field)
+#define typeof_field(type, field) __typeof__(((type *)0)->field)
 #define type_check(t1,t2) ((t1*)0 - (t2*)0)
 
 #ifndef always_inline
diff --git a/include/qemu/queue.h b/include/qemu/queue.h
index f781aa2..b56bce5 100644
--- a/include/qemu/queue.h
+++ b/include/qemu/queue.h
@@ -198,7 +198,7 @@ struct {                                                                \
 } while (/*CONSTCOND*/0)
 
 #define QSLIST_INSERT_HEAD_ATOMIC(head, elm, field) do {                     \
-        typeof(elm) save_sle_next;                                           \
+        __typeof__(elm) save_sle_next;                                       \
         do {                                                                 \
             save_sle_next = (elm)->field.sle_next = (head)->slh_first;       \
         } while (atomic_cmpxchg(&(head)->slh_first, save_sle_next, (elm)) != \
diff --git a/include/qemu/rcu.h b/include/qemu/rcu.h
index f6d1d56..0d9f677 100644
--- a/include/qemu/rcu.h
+++ b/include/qemu/rcu.h
@@ -135,8 +135,8 @@ extern void call_rcu1(struct rcu_head *head, RCUCBFunc *func);
 #define call_rcu(head, func, field)                                      \
     call_rcu1(({                                                         \
          char __attribute__((unused))                                    \
-            offset_must_be_zero[-offsetof(typeof(*(head)), field)],      \
-            func_type_invalid = (func) - (void (*)(typeof(head)))(func); \
+            offset_must_be_zero[-offsetof(__typeof__(*(head)), field)],  \
+            func_type_invalid = (func) - (void (*)(__typeof__(head)))(func); \
          &(head)->field;                                                 \
       }),                                                                \
       (RCUCBFunc *)(func))
@@ -144,7 +144,7 @@ extern void call_rcu1(struct rcu_head *head, RCUCBFunc *func);
 #define g_free_rcu(obj, field) \
     call_rcu1(({                                                         \
         char __attribute__((unused))                                     \
-            offset_must_be_zero[-offsetof(typeof(*(obj)), field)];       \
+            offset_must_be_zero[-offsetof(__typeof__(*(obj)), field)];   \
         &(obj)->field;                                                   \
       }),                                                                \
       (RCUCBFunc *)g_free);
diff --git a/include/qemu/timer.h b/include/qemu/timer.h
index d0946cb..a16effa 100644
--- a/include/qemu/timer.h
+++ b/include/qemu/timer.h
@@ -523,7 +523,7 @@ static inline QEMUTimer *timer_new_tl(QEMUTimerList *timer_list,
                                       QEMUTimerCB *cb,
                                       void *opaque)
 {
-    QEMUTimer *ts = g_malloc0(sizeof(QEMUTimer));
+    QEMUTimer *ts = (QEMUTimer *)g_malloc0(sizeof(QEMUTimer));
     timer_init_tl(ts, timer_list, scale, cb, opaque);
     return ts;
 }
@@ -965,7 +965,7 @@ static inline int64_t cpu_get_host_ticks (void)
 #define MIPS_RDHWR(rd, value) {                         \
         __asm__ __volatile__ (".set   push\n\t"         \
                               ".set mips32r2\n\t"       \
-                              "rdhwr  %0, "rd"\n\t"     \
+                              "rdhwr  %0, " rd "\n\t"   \
                               ".set   pop"              \
                               : "=r" (value));          \
     }
diff --git a/include/qom/cpu.h b/include/qom/cpu.h
index 51a1323..4b005ff 100644
--- a/include/qom/cpu.h
+++ b/include/qom/cpu.h
@@ -30,6 +30,8 @@
 #include "qemu/thread.h"
 #include "qemu/typedefs.h"
 
+#define typename QEMUtypename
+
 typedef int (*WriteCoreDumpFunction)(const void *buf, size_t size,
                                      void *opaque);
 
@@ -196,7 +198,7 @@ typedef struct CPUBreakpoint {
 } CPUBreakpoint;
 
 typedef struct CPUWatchpoint {
-    vaddr vaddr;
+    vaddr addr;
     vaddr len;
     vaddr hitaddr;
     MemTxAttrs hitattrs;
@@ -775,4 +777,7 @@ extern const struct VMStateDescription vmstate_cpu_common;
     .offset = 0,                                                            \
 }
 
+CPUState *cpu_create(void);
+#undef typename
+
 #endif
diff --git a/include/qom/object.h b/include/qom/object.h
index 4509166..118c227 100644
--- a/include/qom/object.h
+++ b/include/qom/object.h
@@ -20,6 +20,10 @@
 #include "qemu/queue.h"
 #include "qapi/error.h"
 
+#define Type     QEMUType
+#define class    QEMUclass
+#define typename QEMUtypename
+
 struct Visitor;
 
 struct TypeImpl;
@@ -1570,5 +1574,8 @@ int object_child_foreach_recursive(Object *obj,
  */
 Object *container_get(Object *root, const char *path);
 
+#undef Type
+#undef class
+#undef typename
 
 #endif
diff --git a/include/sysemu/cpus.h b/include/sysemu/cpus.h
index 3d1e5ba..d594ebf 100644
--- a/include/sysemu/cpus.h
+++ b/include/sysemu/cpus.h
@@ -4,6 +4,7 @@
 /* cpus.c */
 bool qemu_in_vcpu_thread(void);
 void qemu_init_cpu_loop(void);
+void qemu_end_cpu_loop(void);
 void resume_all_vcpus(void);
 void pause_all_vcpus(void);
 void cpu_stop_current(void);
diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index 8b17c0e..7be6e71 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -2001,9 +2001,13 @@ static void load_elf_image(const char *image_name, int image_fd,
         info->brk = info->end_code;
     }
 
+#if defined(CONFIG_LLVM)
+    load_symbols(ehdr, image_fd, load_bias);
+#else
     if (qemu_log_enabled()) {
         load_symbols(ehdr, image_fd, load_bias);
     }
+#endif
 
     close(image_fd);
     return;
diff --git a/linux-user/main.c b/linux-user/main.c
index 8acfe0f..0f67ad4 100644
--- a/linux-user/main.c
+++ b/linux-user/main.c
@@ -33,11 +33,12 @@
 #include "qemu/timer.h"
 #include "qemu/envlist.h"
 #include "elf.h"
+#include "hqemu.h"
 
 char *exec_path;
 
 int singlestep;
-static const char *filename;
+const char *filename;
 static const char *argv0;
 static int gdbstub_port;
 static envlist_t *envlist;
@@ -105,7 +106,10 @@ static int pending_cpus;
 /* Make sure everything is in a consistent state for calling fork().  */
 void fork_start(void)
 {
-    qemu_mutex_lock(&tcg_ctx.tb_ctx.tb_lock);
+#if defined(CONFIG_LLVM)
+    llvm_fork_start();
+#endif
+    qemu_mutex_lock(&tcg_ctx.tb_ctx->tb_lock);
     pthread_mutex_lock(&exclusive_lock);
     mmap_fork_start();
 }
@@ -127,12 +131,15 @@ void fork_end(int child)
         pthread_mutex_init(&cpu_list_mutex, NULL);
         pthread_cond_init(&exclusive_cond, NULL);
         pthread_cond_init(&exclusive_resume, NULL);
-        qemu_mutex_init(&tcg_ctx.tb_ctx.tb_lock);
+        qemu_mutex_init(&tcg_ctx.tb_ctx->tb_lock);
         gdbserver_fork(thread_cpu);
     } else {
         pthread_mutex_unlock(&exclusive_lock);
-        qemu_mutex_unlock(&tcg_ctx.tb_ctx.tb_lock);
+        qemu_mutex_unlock(&tcg_ctx.tb_ctx->tb_lock);
     }
+#if defined(CONFIG_LLVM)
+    llvm_fork_end(child);
+#endif
 }
 
 /* Wait for pending exclusive operations to complete.  The exclusive lock
@@ -276,6 +283,9 @@ void cpu_loop(CPUX86State *env)
     abi_ulong pc;
     target_siginfo_t info;
 
+    copy_tcg_context();
+    optimization_init(env);
+
     for(;;) {
         cpu_exec_start(cs);
         trapnr = cpu_x86_exec(cs);
@@ -670,6 +680,9 @@ void cpu_loop(CPUARMState *env)
     target_siginfo_t info;
     uint32_t addr;
 
+    copy_tcg_context();
+    optimization_init(env);
+
     for(;;) {
         cpu_exec_start(cs);
         trapnr = cpu_arm_exec(cs);
@@ -1001,6 +1014,9 @@ void cpu_loop(CPUARMState *env)
     int trapnr, sig;
     target_siginfo_t info;
 
+    copy_tcg_context();
+    optimization_init(env);
+
     for (;;) {
         cpu_exec_start(cs);
         trapnr = cpu_arm_exec(cs);
@@ -1083,6 +1099,9 @@ void cpu_loop(CPUUniCore32State *env)
     unsigned int n, insn;
     target_siginfo_t info;
 
+    copy_tcg_context();
+    optimization_init(env);
+
     for (;;) {
         cpu_exec_start(cs);
         trapnr = uc32_cpu_exec(cs);
@@ -1284,6 +1303,9 @@ void cpu_loop (CPUSPARCState *env)
     abi_long ret;
     target_siginfo_t info;
 
+    copy_tcg_context();
+    optimization_init(env);
+
     while (1) {
         cpu_exec_start(cs);
         trapnr = cpu_sparc_exec(cs);
@@ -1564,6 +1586,9 @@ void cpu_loop(CPUPPCState *env)
     int trapnr;
     target_ulong ret;
 
+    copy_tcg_context();
+    optimization_init(env);
+
     for(;;) {
         cpu_exec_start(cs);
         trapnr = cpu_ppc_exec(cs);
@@ -2416,6 +2441,9 @@ void cpu_loop(CPUMIPSState *env)
     unsigned int syscall_num;
 # endif
 
+    copy_tcg_context();
+    optimization_init(env);
+
     for(;;) {
         cpu_exec_start(cs);
         trapnr = cpu_mips_exec(cs);
@@ -2653,6 +2681,9 @@ void cpu_loop(CPUOpenRISCState *env)
     CPUState *cs = CPU(openrisc_env_get_cpu(env));
     int trapnr, gdbsig;
 
+    copy_tcg_context();
+    optimization_init(env);
+
     for (;;) {
         cpu_exec_start(cs);
         trapnr = cpu_openrisc_exec(cs);
@@ -2743,6 +2774,9 @@ void cpu_loop(CPUSH4State *env)
     int trapnr, ret;
     target_siginfo_t info;
 
+    copy_tcg_context();
+    optimization_init(env);
+
     while (1) {
         cpu_exec_start(cs);
         trapnr = cpu_sh4_exec(cs);
@@ -2805,6 +2839,9 @@ void cpu_loop(CPUCRISState *env)
     int trapnr, ret;
     target_siginfo_t info;
     
+    copy_tcg_context();
+    optimization_init(env);
+
     while (1) {
         cpu_exec_start(cs);
         trapnr = cpu_cris_exec(cs);
@@ -2866,6 +2903,9 @@ void cpu_loop(CPUMBState *env)
     int trapnr, ret;
     target_siginfo_t info;
     
+    copy_tcg_context();
+    optimization_init(env);
+
     while (1) {
         cpu_exec_start(cs);
         trapnr = cpu_mb_exec(cs);
@@ -2971,6 +3011,9 @@ void cpu_loop(CPUM68KState *env)
     target_siginfo_t info;
     TaskState *ts = cs->opaque;
 
+    copy_tcg_context();
+    optimization_init(env);
+
     for(;;) {
         cpu_exec_start(cs);
         trapnr = cpu_m68k_exec(cs);
@@ -3110,6 +3153,9 @@ void cpu_loop(CPUAlphaState *env)
     target_siginfo_t info;
     abi_long sysret;
 
+    copy_tcg_context();
+    optimization_init(env);
+
     while (1) {
         cpu_exec_start(cs);
         trapnr = cpu_alpha_exec(cs);
@@ -3298,6 +3344,9 @@ void cpu_loop(CPUS390XState *env)
     target_siginfo_t info;
     target_ulong addr;
 
+    copy_tcg_context();
+    optimization_init(env);
+
     while (1) {
         cpu_exec_start(cs);
         trapnr = cpu_s390x_exec(cs);
@@ -3602,6 +3651,9 @@ void cpu_loop(CPUTLGState *env)
     CPUState *cs = CPU(tilegx_env_get_cpu(env));
     int trapnr;
 
+    copy_tcg_context();
+    optimization_init(env);
+
     while (1) {
         cpu_exec_start(cs);
         trapnr = cpu_tilegx_exec(cs);
@@ -3711,7 +3763,7 @@ CPUArchState *cpu_copy(CPUArchState *env)
         cpu_breakpoint_insert(new_cpu, bp->pc, bp->flags, NULL);
     }
     QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
-        cpu_watchpoint_insert(new_cpu, wp->vaddr, wp->len, wp->flags, NULL);
+        cpu_watchpoint_insert(new_cpu, wp->addr, wp->len, wp->flags, NULL);
     }
 
     return new_env;
@@ -4009,6 +4061,12 @@ static void usage(int exitcode)
            "Note that if you provide several changes to a single variable\n"
            "the last change will stay in effect.\n");
 
+#if defined(CONFIG_LLVM)
+    printf("\n\nHQEMU ");
+    fflush(stdout);
+    hqemu_help();
+#endif
+
     exit(exitcode);
 }
 
@@ -4324,7 +4382,11 @@ int main(int argc, char **argv, char **envp)
     /* Now that we've loaded the binary, GUEST_BASE is fixed.  Delay
        generating the prologue until now so that the prologue can take
        the real value of GUEST_BASE into account.  */
-    tcg_prologue_init(&tcg_ctx);
+    tcg_prologue_init(&tcg_ctx_global);
+
+#if defined(CONFIG_LLVM)
+    llvm_init();
+#endif
 
 #if defined(TARGET_I386)
     env->cr[0] = CR0_PG_MASK | CR0_WP_MASK | CR0_PE_MASK;
@@ -4663,6 +4725,7 @@ int main(int argc, char **argv, char **envp)
         }
         gdb_handlesig(cpu, 0);
     }
+
     cpu_loop(env);
     /* never exits */
     return 0;
diff --git a/linux-user/strace.c b/linux-user/strace.c
index ea6c1d2..69d5408 100644
--- a/linux-user/strace.c
+++ b/linux-user/strace.c
@@ -7,6 +7,7 @@
 #include <sys/types.h>
 #include <sys/mount.h>
 #include <sys/mman.h>
+#include <sys/sysmacros.h>
 #include <unistd.h>
 #include <sched.h>
 #include "qemu.h"
diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index 6c64ba6..030eb2a 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -114,6 +114,7 @@ int __clone2(int (*fn)(void *), void *child_stack_base,
 #include "uname.h"
 
 #include "qemu.h"
+#include "hqemu.h"
 
 #define CLONE_NPTL_FLAGS2 (CLONE_SETTLS | \
     CLONE_PARENT_SETTID | CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID)
@@ -4495,7 +4496,7 @@ abi_long do_arch_prctl(CPUX86State *env, int code, abi_ulong addr)
 
 #endif /* defined(TARGET_I386) */
 
-#define NEW_STACK_SIZE 0x40000
+#define NEW_STACK_SIZE 0x80000
 
 
 static pthread_mutex_t clone_lock = PTHREAD_MUTEX_INITIALIZER;
@@ -5710,6 +5711,12 @@ abi_long do_syscall(void *cpu_env, int num, abi_long arg1,
             rcu_unregister_thread();
             pthread_exit(NULL);
         }
+
+        optimization_finalize((CPUArchState *)cpu_env);
+#if defined(CONFIG_LLVM)
+        llvm_finalize();
+#endif
+
 #ifdef TARGET_GPROF
         _mcleanup();
 #endif
@@ -7615,6 +7622,10 @@ abi_long do_syscall(void *cpu_env, int num, abi_long arg1,
 #ifdef __NR_exit_group
         /* new thread calls */
     case TARGET_NR_exit_group:
+        optimization_finalize((CPUArchState *)cpu_env);
+#if defined(CONFIG_LLVM)
+        llvm_finalize();
+#endif
 #ifdef TARGET_GPROF
         _mcleanup();
 #endif
diff --git a/llvm/analysis/InnerLoopAnalysis.cpp b/llvm/analysis/InnerLoopAnalysis.cpp
new file mode 100644
index 0000000..f67d380
--- /dev/null
+++ b/llvm/analysis/InnerLoopAnalysis.cpp
@@ -0,0 +1,631 @@
+/*
+ *  (C) 2016 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm-debug.h"
+#include "llvm-opc.h"
+#include "llvm-pass.h"
+#include "InnerLoopAnalysis.h"
+
+
+/*
+ * The InnerLoop class represents a single innermost loop. The shape of
+ * InnerLoop is specific to the DBT decoded guest loop, and its loop definition
+ * differs from that of a natural loop, e.g., latch and exiting block.
+ * For example, the binary of a natural loop (a) will be translated to the loop
+ * CFG (b), which includes an additional block L (loopback) to check flag
+ * `tcg_exit_req' and exits the loop to block E if the flag is raised; otherwise
+ * it goes back to the loop header A.
+ *
+ * In loop (b), a latch is split into two blocks, B and L. The loop bottom test
+ * is in block B and the backward branch is included in block L (which also
+ * has an exit block E attached to it). We include block L in the loop body and
+ * have the following definitions: (1) block B and L are latch head and tail,
+ * respectively; (2) a latch tail is the source of a backedge; (3) block B is a
+ * loop exiting block, but block L is not, and block E is not included in the
+ * exit blocks.
+ *
+ *  (a) A    (b) A
+ *      ||       |
+ *      B        B
+ *     /        / \
+ *    C        C   L -> A
+ *                  \
+ *                   E
+ */
+InnerLoop::InnerLoop(Loop *loop)
+    : TheLoop(*loop), Blocks(TheLoop.getBlocks()), UnknownPhi(false)
+{
+    for (auto BB : Blocks)
+        DenseBlockSet.insert(BB);
+    /* Besides the block set filled above, record every latch of the loop;
+     * split latches are also mapped from latch tail to latch head. */
+    SmallVector<BasicBlock *, 8> LoopLatches;
+    TheLoop.getLoopLatches(LoopLatches);
+    for (BasicBlock *BB : LoopLatches) {
+        Latches.push_back(BB);
+        /* Split latch: isLoop() accepts the terminator, one predecessor. */
+        if (MDFactory::isLoop(BB->getTerminator()) &&
+            BB->getSinglePredecessor()) {
+            /* Map latch tail to latch head. */
+            SplitLatches[BB] = BB->getSinglePredecessor();
+        }
+    }
+}
+
+
+/* True if the terminator of BB can branch to a block outside of the current
+ * loop. A latch tail (a key of SplitLatches) is never reported as exiting. */
+bool InnerLoop::isLoopExiting(BasicBlock *BB) const
+{
+    if (SplitLatches.find(BB) != SplitLatches.end())
+        return false;
+    /* Any successor that is not contained in the loop makes BB exiting. */
+    typedef GraphTraits<const BasicBlock*> BlockTraits;
+    for (typename BlockTraits::ChildIteratorType SI =
+         BlockTraits::child_begin(BB),
+         SE = BlockTraits::child_end(BB); SI != SE; ++SI) {
+        if (!contains(*SI))
+            return true;
+    }
+    return false;
+}
+
+/* Calculate the number of back edges to the loop header. */
+unsigned InnerLoop::getNumBackEdges() const
+{
+    unsigned NumBackEdges = 0;
+    BasicBlock *H = getHeader();
+    /* A back edge is an edge into the header from a block inside the loop. */
+    typedef GraphTraits<Inverse<BasicBlock*> > InvBlockTraits;
+    for (typename InvBlockTraits::ChildIteratorType I =
+         InvBlockTraits::child_begin(H),
+         E = InvBlockTraits::child_end(H); I != E; ++I)
+        if (contains(*I))
+            ++NumBackEdges;
+
+    return NumBackEdges;
+}
+
+/* Collect the loop blocks, latch tails excepted, with a successor outside the loop. */
+void InnerLoop::getExitingBlocks(SmallVectorImpl<BasicBlock *> &ExitingBlocks) const
+{
+    typedef GraphTraits<BasicBlock *> BlockTraits;
+    for (block_iterator BI = block_begin(), BE = block_end(); BI != BE; ++BI) {
+        /* Skip the latch tail block. */
+        if (SplitLatches.find(*BI) != SplitLatches.end())
+            continue;
+
+        for (typename BlockTraits::ChildIteratorType I =
+             BlockTraits::child_begin(*BI), E = BlockTraits::child_end(*BI);
+             I != E; ++I)
+            if (!contains(*I)) {
+                /* Successor outside the loop: *BI is an exiting block. */
+                ExitingBlocks.push_back(*BI);
+                break;
+            }
+    }
+}
+
+/* Return the unique exiting block of this loop, i.e. the single block that
+ * getExitingBlocks() collects, or null when there is not exactly one. */
+BasicBlock *InnerLoop::getExitingBlock() const
+{
+    SmallVector<BasicBlock *, 8> Exiting;
+    getExitingBlocks(Exiting);
+
+    /* Zero or several exiting blocks means there is no unique answer. */
+    return Exiting.size() == 1 ? Exiting.front() : nullptr;
+}
+
+/* Collect every out-of-loop successor of the loop blocks, latch tails excepted. */
+void InnerLoop::getExitBlocks(SmallVectorImpl<BasicBlock *> &ExitBlocks) const
+{
+    typedef GraphTraits<BasicBlock *> BlockTraits;
+    for (block_iterator BI = block_begin(), BE = block_end(); BI != BE; ++BI) {
+        /* Skip the latch tail block. */
+        if (SplitLatches.find(*BI) != SplitLatches.end())
+            continue;
+
+        for (typename BlockTraits::ChildIteratorType I =
+             BlockTraits::child_begin(*BI), E = BlockTraits::child_end(*BI);
+             I != E; ++I)
+            if (!contains(*I))
+                /* Not in current loop? It must be an exit block (added per edge). */
+                ExitBlocks.push_back(*I);
+    }
+}
+
+/* Return the only entry that getExitBlocks() produces for this loop, or
+ * null when it produces none or more than one. */
+BasicBlock *InnerLoop::getExitBlock() const
+{
+    SmallVector<BasicBlock *, 8> Exits;
+    getExitBlocks(Exits);
+
+    /* Anything other than a single entry yields no answer. */
+    return Exits.size() == 1 ? Exits.front() : nullptr;
+}
+
+/* If there is a preheader for this loop, return it. A loop has a preheader
+ * if there is only one edge to the header of the loop from outside of the
+ * loop. If this is the case, the block branching to the header of the loop
+ * is the preheader node.
+ *
+ * This method returns null if there is no preheader for the loop. */
+BasicBlock *InnerLoop::getLoopPreheader() const
+{
+    /* Keep track of nodes outside the loop branching to the header. */
+    BasicBlock *Out = getLoopPredecessor();
+    if (!Out) return nullptr;
+    /* Make sure there is only one exit out of the preheader. Out branches to
+     * the header, so it has a first successor that ++SI can step past. */
+    typedef GraphTraits<BasicBlock *> BlockTraits;
+    typename BlockTraits::ChildIteratorType SI = BlockTraits::child_begin(Out);
+    ++SI;
+    if (SI != BlockTraits::child_end(Out))
+        return nullptr;  /* Multiple exits from the block, must not be a preheader. */
+
+    /* The predecessor has exactly one successor, so it is a preheader. */
+    return Out;
+}
+
+/* If the given loop's header has exactly one unique predecessor outside the
+ * loop, return it. Otherwise return null.
+ * This is less strict than the loop "preheader" concept, which requires
+ * the predecessor to have exactly one successor. */
+BasicBlock *InnerLoop::getLoopPredecessor() const
+{
+    /* Keep track of nodes outside the loop branching to the header. */
+    BasicBlock *Out = nullptr;
+
+    /* Loop over the predecessors of the header node. */
+    BasicBlock *Header = getHeader();
+#if defined(LLVM_V35) || defined(LLVM_V38) || defined(LLVM_V39)
+    typedef GraphTraits<Inverse<BasicBlock *> > InvBlockTraits;
+    for (typename InvBlockTraits::ChildIteratorType PI =
+         InvBlockTraits::child_begin(Header),
+         PE = InvBlockTraits::child_end(Header); PI != PE; ++PI) {
+        typename InvBlockTraits::NodeType *N = *PI;
+        if (!contains(N)) {      /* If the block is not in the loop. */
+            if (Out && Out != N)
+                return nullptr;  /* Multiple predecessors outside the loop */
+            Out = N;
+        }
+    }
+#else
+    for (const auto Pred : children<Inverse<BasicBlock *> >(Header)) {
+        if (!contains(Pred)) {   /* If the block is not in the loop. */
+            if (Out && Out != Pred)
+                return nullptr;  /* Multiple predecessors outside the loop */
+            Out = Pred;
+        }
+    }
+#endif
+
+    return Out;
+}
+
+/* Return true if To is reachable from From through def-use chains, looking
+ * only at users located in this loop; both must themselves be in the loop. */
+bool InnerLoop::isReachable(Instruction *From, Instruction *To)
+{
+    if (!contains(From->getParent()) || !contains(To->getParent()))
+        return false;
+    if (From == To)
+        return true;
+
+    /* Depth-first walk over the users, starting at From. */
+    SmallPtrSet<Instruction*, 8> Seen;
+    SmallVector<Instruction*, 8> Pending;
+    Pending.push_back(From);
+
+    while (!Pending.empty()) {
+        Instruction *Cur = Pending.pop_back_val();
+        if (Seen.count(Cur))
+            continue;
+        Seen.insert(Cur);
+
+        for (User *U : Cur->users()) {
+            Instruction *UserInst = cast<Instruction>(U);
+            if (UserInst == To)
+                return true;
+            if (contains(UserInst->getParent()))
+                Pending.push_back(UserInst);
+        }
+    }
+
+    return false;
+}
+
+
+/*
+ * InnerLoopAnalysis
+ */
+/* Collect the innermost loops nested in L (L itself if it has no subloops). */
+static void addInnerLoop(Loop &L, std::vector<Loop *> &Loops)
+{
+    if (L.empty()) {
+        /* Innermost loop.
+         * If any basic block of current loop has been included in another
+         * loop, skip this loop. Note that L.begin()/L.end() walk the (empty)
+         * subloop list, so the blocks must be visited via block_begin(). */
+        for (Loop *InnerL : Loops)
+            for (auto I = L.block_begin(), E = L.block_end(); I != E; ++I)
+                if (InnerL->contains(*I))
+                    return;
+        Loops.push_back(&L);
+        return;
+    }
+    for (Loop *InnerL : L)
+        addInnerLoop(*InnerL, Loops);
+}
+
+
+void InnerLoopAnalysis::analyze(LoopInfo *LI, ScalarEvolution *SE)
+{
+    std::vector<Loop *> Loops;
+    for (Loop *L : *LI)
+        addInnerLoop(*L, Loops);
+    /* Wrap each loop gathered above in a heap-allocated InnerLoop. */
+    for (auto L : Loops)
+        InnerLoops.push_back(new InnerLoop(L));
+    /* Run the phi analysis on every entry of InnerLoops. */
+    for (auto L : InnerLoops)
+        analyzePhi(*L, SE);
+}
+
+/* Try to recognize Phi as an induction variable of TheLoop. On success the
+ * phi, start value and step go to TheLoop.addInduction(); returns true. */
+bool InnerLoopAnalysis::analyzeInduction(InnerLoop &TheLoop,
+                                         ScalarEvolution *SE,
+                                         PHINode *Phi)
+{
+    /* Only integer and pointer induction variables are handled. */
+    Type *PhiTy = Phi->getType();
+    if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
+        return false;
+
+    /* Users outside the loop are tolerated only if they are stores. */
+    for (User *U : Phi->users()) {
+        Instruction *UI = cast<Instruction>(U);
+        if (!TheLoop.contains(UI) && !isa<StoreInst>(UI))
+            return false;
+    }
+
+    /* The phi must be an add-recurrence with a loop-invariant step. */
+    const auto *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Phi));
+    if (!AR)
+        return false;
+    const SCEV *Step = AR->getStepRecurrence(*SE);
+    if (!isa<SCEVConstant>(Step) && !SE->isLoopInvariant(Step, AR->getLoop()))
+        return false;
+
+    /* The start value comes in from the preheader. Without one, bail out
+     * instead of handing a null block to getIncomingValueForBlock(). */
+    BasicBlock *Preheader = AR->getLoop()->getLoopPreheader();
+    if (!Preheader)
+        return false;
+    TheLoop.addInduction(Phi, Phi->getIncomingValueForBlock(Preheader), Step);
+    return true;
+}
+
+/*
+ * isReductionInstr()
+ *  Check if the reduction operation is supported (PHI and bitcast pass through
+ *  without touching Kind or Ty). Kind is set by the first operation seen; an
+ *  operation of a different kind is rejected, and Ty gets I's scalar type.
+ */
+static bool isReductionInstr(Instruction *I, ReductionDesc::ReductionKind &Kind,
+                             Type *&Ty)
+{
+    ReductionDesc::ReductionKind K = ReductionDesc::NoReduction;
+    switch (I->getOpcode()) {
+    default:
+        return false;
+    case Instruction::PHI:
+    case Instruction::BitCast:
+        return true;
+    case Instruction::Add:
+    case Instruction::Sub:
+        K = ReductionDesc::IntegerAdd;
+        break;
+    case Instruction::Mul:
+        K = ReductionDesc::IntegerMult;
+        break;
+    case Instruction::And:
+        K = ReductionDesc::IntegerAnd;
+        break;
+    case Instruction::Or:
+        K = ReductionDesc::IntegerOr;
+        break;
+    case Instruction::Xor:
+        K = ReductionDesc::IntegerXor;
+        break;
+    case Instruction::FAdd:
+    case Instruction::FSub:
+        K = ReductionDesc::FloatAdd;
+        break;
+    case Instruction::FMul:
+        K = ReductionDesc::FloatMult;
+        break;
+    }
+    /* Report the element type for vectors, the type itself otherwise. */
+    if (VectorType *VecTy = dyn_cast<VectorType>(I->getType()))
+        Ty = VecTy->getScalarType();
+    else
+        Ty = I->getType();
+
+    if (Kind == ReductionDesc::NoReduction) {
+        Kind = K;
+        return true;
+    }
+
+    if (Kind != K) {
+        /* Different reduction operation to the previous one. */
+        return false;
+    }
+    return true;
+}
+
+static bool hasMultipleUsesOf(Instruction *I,
+                              SmallPtrSet<Instruction *, 8> &Insts)
+{
+    /* Count the operands of I that are members of Insts and report
+     * whether more than one was found. Non-instruction operands become
+     * nullptr after dyn_cast and are looked up as such. */
+    unsigned Hits = 0;
+    for (User::op_iterator Op = I->op_begin(), End = I->op_end(); Op != End; ++Op)
+        if (Insts.count(dyn_cast<Instruction>(*Op)) && ++Hits > 1)
+            return true;
+    return false;
+}
+
+static bool isLegalUser(Instruction *I)
+{
+    /* A user is skipped by the reduction scan only when it is a store
+     * that MDFactory::isGuestMemory() does not accept. */
+    return isa<StoreInst>(I) && !MDFactory::isGuestMemory(I);
+}
+/* Try to recognize Phi as a reduction of TheLoop; record it on success. */
+bool InnerLoopAnalysis::analyzeReduction(InnerLoop &TheLoop, PHINode *Phi)
+{
+    if (Phi->getNumIncomingValues() != 2)
+        return false;
+
+    /* Reduction variables are only found in the loop header block. */
+    if (Phi->getParent() != TheLoop.getHeader())
+        return false;
+
+    /* Obtain the reduction start value from the loop preheader. */
+    Value *StartValue = Phi->getIncomingValueForBlock(TheLoop.getLoopPreheader());
+
+    /* ExitInstruction is the single value which is used outside the loop.
+     * We only allow for a single reduction value to be used outside the loop.
+     * This includes users of the reduction, variables (which form a cycle
+     * which ends in the phi node). */
+    Instruction *ExitInstruction = nullptr;
+    /* Indicates that we found a reduction operation in our scan. */
+    bool FoundReduxOp = false;
+
+    /* We start with the PHI node and scan for all of the users of this
+     * instruction. All users must be instructions that can be used as reduction
+     * variables (such as ADD). We must have a single out-of-block user. The cycle
+     * must include the original PHI. */
+    bool FoundStartPHI = false;
+
+    ReductionDesc::ReductionKind Kind = ReductionDesc::NoReduction;
+    Type *Ty = nullptr;
+
+    SmallPtrSet<Instruction *, 8> VisitedInsts;
+    SmallVector<Instruction *, 8> Worklist;
+    Worklist.push_back(Phi);
+    VisitedInsts.insert(Phi);
+
+    /* A value in the reduction can be used:
+     *   - By the reduction:
+     *     - Reduction operation:
+     *       - One use of reduction value (safe).
+     *       - Multiple use of reduction value (not safe).
+     *     - PHI:
+     *       - All uses of the PHI must be the reduction (safe).
+     *       - Otherwise, not safe.
+     *   - By one or no instruction outside of the loop (safe).
+     *   - By further instructions outside of the loop (not safe).
+     *   - By an instruction that is not part of the reduction (not safe).
+     *     This is either:
+     *       An instruction type other than PHI or the reduction operation.
+     *       A PHI in the header other than the initial PHI. */
+    while (!Worklist.empty()) {
+        Instruction *Cur = Worklist.back();
+        Worklist.pop_back();
+
+        /* No Users.
+         * If the instruction has no users then this is a broken chain and
+         * cannot be a reduction variable. */
+        if (Cur->use_empty())
+            return false;
+
+        bool IsAPhi = isa<PHINode>(Cur);
+        bool IsBitCast = isa<BitCastInst>(Cur);
+
+        /* Currently, we don't handle a reduction used by another PHI other
+         * than the original PHI. */
+        if (IsAPhi && Cur != Phi)
+            return false;
+
+        /* Any reduction instruction must be of one of the allowed kinds. */
+        if (!isReductionInstr(Cur, Kind, Ty))
+            return false;
+
+        /* A non-commutative instruction such as Sub can only be part of the
+         * reduction if its LHS (operand 0) is the reduction variable. */
+        if (!IsAPhi && !Cur->isCommutative() &&
+            !VisitedInsts.count(dyn_cast<Instruction>(Cur->getOperand(0))))
+            return false;
+
+        /* A reduction operation must only have one use of the reduction value. */
+        if (!IsAPhi && hasMultipleUsesOf(Cur, VisitedInsts))
+            return false;
+
+        /* Check whether we found a reduction operator. */
+        FoundReduxOp |= (!IsAPhi && !IsBitCast);
+
+        /* Process users of current instruction. Push non-PHI nodes after PHI
+         * nodes onto the stack. This way we are going to have seen all inputs
+         * to PHI nodes once we get to them. */
+        SmallVector<Instruction *, 8> NonPHIs;
+        SmallVector<Instruction *, 8> PHIs;
+        for (User *U : Cur->users()) {
+            Instruction *UI = cast<Instruction>(U);
+
+            if (isLegalUser(UI))
+                continue; /* such stores do not count as reduction users */
+
+            /* Check if we found the exit user. */
+            BasicBlock *Parent = UI->getParent();
+            if (!TheLoop.contains(Parent)) {
+                /* Exit if you find multiple outside users or if the header phi node is
+                 * being used. In this case the user uses the value of the previous
+                 * iteration, in which case we would lose "VF-1" iterations of the
+                 * reduction operation if we vectorize. */
+                if (ExitInstruction != nullptr || Cur == Phi)
+                    return false;
+
+                /* The instruction used by an outside user must be the last instruction
+                 * before we feed back to the reduction phi. Otherwise, we lose VF-1
+                 * operations on the value. */
+                if (std::find(Phi->op_begin(), Phi->op_end(), Cur) == Phi->op_end())
+                    return false;
+
+                ExitInstruction = Cur;
+                continue;
+            }
+
+            /* Process instructions only once (termination). Each reduction cycle
+             * value must only be used once, except by phi nodes; a repeated
+             * non-PHI user rejects the reduction. */
+            if (!VisitedInsts.count(UI)) {
+                VisitedInsts.insert(UI);
+                if (isa<PHINode>(UI))
+                    PHIs.push_back(UI);
+                else
+                    NonPHIs.push_back(UI);
+            } else if (!isa<PHINode>(UI))
+                return false;
+
+            /* Remember that we completed the cycle. */
+            if (UI == Phi)
+                FoundStartPHI = true;
+        }
+        Worklist.append(PHIs.begin(), PHIs.end());
+        Worklist.append(NonPHIs.begin(), NonPHIs.end());
+    }
+
+    /* If no exit instruction was found, use the value fed back to the
+     * reduction phi from the latch tail; it must be an instruction. */
+    if (!ExitInstruction) {
+        Value *NextValue = Phi->getIncomingValueForBlock(TheLoop.getSingleLatchTail());
+        if (!isa<Instruction>(NextValue))
+            return false;
+        ExitInstruction = cast<Instruction>(NextValue);
+    }
+
+    if (!FoundStartPHI || !FoundReduxOp)
+        return false;
+
+    /* We found a reduction variable. */
+    TheLoop.addReduction(Phi, StartValue, ExitInstruction, Kind, Ty);
+
+    return true;
+}
+
+void InnerLoopAnalysis::analyzePhi(InnerLoop &TheLoop, ScalarEvolution *SE)
+{
+    BasicBlock *LoopHeader = TheLoop.getHeader();
+    for (BasicBlock *Block : TheLoop.blocks()) {
+        /* Visit only the leading PHI nodes of the block. */
+        auto PhiEnd = BasicBlock::iterator(Block->getFirstNonPHI());
+        for (auto It = Block->begin(); It != PhiEnd; ++It) {
+            /* A PHI can be analyzed only if it sits in the header of a
+             * loop that has a preheader and a single latch tail. PHIs in
+             * non-header blocks are not handled at all. In either failing
+             * case the loop is flagged and the scan stops. */
+            bool Analyzable = Block == LoopHeader &&
+                              TheLoop.getLoopPreheader() &&
+                              TheLoop.getSingleLatchTail();
+            if (!Analyzable) {
+                TheLoop.UnknownPhi = true;
+                return;
+            }
+
+            /* Try induction first, then reduction; anything else is an
+             * unknown PHI, but the remaining PHIs are still examined. */
+            PHINode *Phi = cast<PHINode>(It);
+            if (analyzeInduction(TheLoop, SE, Phi))
+                continue;
+            if (!analyzeReduction(TheLoop, Phi))
+                TheLoop.UnknownPhi = true;
+        }
+    }
+}
+
+
+/*
+ * InnerLoopAnalysisWrapperPass: pass wrapper that runs the analysis object LA
+ * on each function (see runOnFunction) and needs LoopInfo + ScalarEvolution. */
+char InnerLoopAnalysisWrapperPass::ID = 0;
+INITIALIZE_PASS_BEGIN(InnerLoopAnalysisWrapperPass, "InnerLoopAnalysis",
+        "Inner Loop Analysis", true, true)
+#if defined(LLVM_V35)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+#else
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+#endif /* LLVM_V35 */
+INITIALIZE_PASS_END(InnerLoopAnalysisWrapperPass, "InnerLoopAnalysis",
+        "Inner Loop Analysis", true, true)
+
+void InnerLoopAnalysisWrapperPass::releaseMemory() {
+    LA.releaseMemory(); /* forward to the wrapped analysis object LA */
+}
+
+void InnerLoopAnalysisWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.setPreservesAll(); /* this pass declares that it preserves everything */
+#if defined(LLVM_V35)
+    AU.addRequired<LoopInfo>();
+    AU.addRequired<ScalarEvolution>();
+#else /* same two requirements under their wrapper-pass names */
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+#endif /* LLVM_V35 */
+}
+void InnerLoopAnalysisWrapperPass::print(raw_ostream &OS, const Module *) const {
+    LA.print(OS); /* the Module argument is unused; LA does the printing */
+}
+
+void InnerLoopAnalysisWrapperPass::verifyAnalysis() const {
+    LA.verify(); /* forward to the wrapped analysis object LA */
+}
+
+bool InnerLoopAnalysisWrapperPass::runOnFunction(Function &F) {
+#if defined(LLVM_V35)
+    LoopInfo &Loops = getAnalysis<LoopInfo>();
+    ScalarEvolution &Evolution = getAnalysis<ScalarEvolution>();
+#else
+    LoopInfo &Loops = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    ScalarEvolution &Evolution = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+#endif
+    /* Hand both analyses to LA; this function always returns false. */
+    LA.analyze(&Loops, &Evolution);
+    return false;
+}
+
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
+
diff --git a/llvm/atomic/atomic-arm.c b/llvm/atomic/atomic-arm.c
new file mode 100644
index 0000000..4176caa
--- /dev/null
+++ b/llvm/atomic/atomic-arm.c
@@ -0,0 +1,158 @@
+/*
+ * Copyright (C) 2010 Parallel Processing Institute (PPI), Fudan Univ.
+ *  <http://ppi.fudan.edu.cn/system_research_group>
+ *
+ * Authors:
+ *  Zhaoguo Wang    <zgwang@fudan.edu.cn>
+ *  Yufei Chen      <chenyufei@fudan.edu.cn>
+ *  Ran Liu         <naruilone@gmail.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* We include this file in op_helper.c */
+
+#include <stdlib.h>
+#include <pthread.h>
+#include "coremu-atomic.h"
+
+__thread uint64_t cm_exclusive_val; /* value read by this thread's last load_exclusive helper */
+__thread uint32_t cm_exclusive_addr = -1; /* guest address of that load; -1 after a store/clear */
+/* Emit HELPER(load_exclusive##type); the TYPE argument is not used by the body. */
+#define GEN_LOAD_EXCLUSIVE(type, TYPE) \
+void HELPER(load_exclusive##type)(CPUArchState *env, uint32_t reg,    \
+                uint32_t addr)                                        \
+{                                                                     \
+    unsigned long q_addr = 0;                                         \
+    DATA_##type val = 0;                                              \
+    /* record address and loaded value for a later store_exclusive */ \
+    cm_exclusive_addr = addr;                                         \
+    CM_GET_QEMU_ADDR(env, q_addr,addr);                               \
+    val = *(DATA_##type *)q_addr;                                     \
+    cm_exclusive_val = val;                                           \
+    env->regs[reg] = val;                                             \
+}
+/* Instantiate the byte, word and long variants. */
+GEN_LOAD_EXCLUSIVE(b, B);
+GEN_LOAD_EXCLUSIVE(w, W);
+GEN_LOAD_EXCLUSIVE(l, L);
+/* The q variant is not generated here; HELPER(load_exclusiveq) is hand-written. */
+/* Emit HELPER(store_exclusive##type): regs[res] gets 0 on success, 1 on failure. */
+#define GEN_STORE_EXCLUSIVE(type, TYPE) \
+void HELPER(store_exclusive##type)(CPUArchState *env, uint32_t res,           \
+                uint32_t reg, uint32_t addr)                                  \
+{                                                                             \
+    unsigned long q_addr = 0;                                                 \
+    DATA_##type val = 0;                                                      \
+    DATA_##type r = 0;                                                        \
+    /* fail at once if addr is not the address of the last load_exclusive */  \
+    if(addr != cm_exclusive_addr)                                             \
+        goto fail;                                                            \
+                                                                              \
+    CM_GET_QEMU_ADDR(env, q_addr,addr);                                       \
+    val = (DATA_##type)env->regs[reg];                                        \
+    /* store val only if memory still holds the value seen by the load */     \
+    r = atomic_compare_exchange##type((DATA_##type *)q_addr,                  \
+                                    (DATA_##type)cm_exclusive_val, val);      \
+                                                                              \
+    if(r == (DATA_##type)cm_exclusive_val) {                                  \
+        env->regs[res] = 0;                                                   \
+        goto done;                                                            \
+    } else {                                                                  \
+        goto fail;                                                            \
+    }                                                                         \
+                                                                              \
+fail:                                                                         \
+    env->regs[res] = 1;                                                       \
+    /* both outcomes reset the recorded address */                            \
+done:                                                                         \
+    cm_exclusive_addr = -1;                                                   \
+    return;                                                                   \
+}
+/* Instantiate the byte, word and long variants. */
+GEN_STORE_EXCLUSIVE(b, B);
+GEN_STORE_EXCLUSIVE(w, W);
+GEN_STORE_EXCLUSIVE(l, L);
+/* The q variant is not generated here; HELPER(store_exclusiveq) is hand-written. */
+
+void HELPER(load_exclusiveq)(CPUArchState *env, uint32_t reg, uint32_t addr)
+{
+    /* 64-bit variant: low word goes to regs[reg], high word to regs[reg + 1]. */
+    unsigned long host_addr = 0;
+    uint64_t loaded;
+    cm_exclusive_addr = addr;
+    CM_GET_QEMU_ADDR(env, host_addr, addr);
+    loaded = *(uint64_t *)host_addr;
+    cm_exclusive_val = loaded;
+    env->regs[reg] = (uint32_t)loaded;
+    env->regs[reg + 1] = (uint32_t)(loaded >> 32);
+}
+
+void HELPER(store_exclusiveq)(CPUArchState *env, uint32_t res, uint32_t reg, uint32_t addr)
+{
+    /* 64-bit store-exclusive: the register pair regs[reg] (low word) and
+     * regs[reg + 1] (high word) is written to guest address addr only if
+     * addr matches the address recorded by the last load-exclusive and
+     * memory still holds the value recorded then. regs[res] receives 0 on
+     * success and 1 on failure; the recorded address is always cleared. */
+    unsigned long host_addr = 0;
+    uint32_t status = 1;
+
+    if (addr == cm_exclusive_addr) {
+        uint64_t expected, desired, observed;
+
+        CM_GET_QEMU_ADDR(env, host_addr, addr);
+        desired = (uint32_t)env->regs[reg];
+        desired |= ((uint64_t)env->regs[reg + 1]) << 32;
+
+        /* Compare-and-swap against the value seen by the load. */
+        expected = (uint64_t)cm_exclusive_val;
+        observed = atomic_compare_exchangeq((uint64_t *)host_addr,
+                                            expected, desired);
+        if (observed == expected)
+            status = 0;
+    }
+
+    /* Report the outcome, then drop the recorded address whether or not
+     * the store happened. */
+    env->regs[res] = status;
+    cm_exclusive_addr = -1;
+}
+
+void HELPER(clear_exclusive)(CPUArchState *env)
+{
+    cm_exclusive_addr = -1; /* forget the address recorded by load_exclusive */
+}
+
+void HELPER(swpb)(CPUArchState *env, uint32_t dst, uint32_t src, uint32_t addr)
+{
+    /* Byte swap helper: exchange the low byte of regs[src] with the guest
+     * byte addressed by regs[addr]; the previous byte lands in regs[dst]. */
+    unsigned long host_addr;
+    uint8_t previous;
+    CM_GET_QEMU_ADDR(env, host_addr, env->regs[addr]);
+    previous = atomic_exchangeb((uint8_t *)host_addr, (uint8_t)env->regs[src]);
+    env->regs[dst] = previous;
+}
+
+void HELPER(swp)(CPUArchState *env, uint32_t dst, uint32_t src, uint32_t addr)
+{
+    /* Word swap helper: exchange regs[src] with the guest word addressed
+     * by regs[addr]; the previous word lands in regs[dst]. */
+    unsigned long host_addr;
+    uint32_t previous;
+    CM_GET_QEMU_ADDR(env, host_addr, env->regs[addr]);
+    previous = atomic_exchangel((uint32_t *)host_addr, env->regs[src]);
+    env->regs[dst] = previous;
+}
diff --git a/llvm/atomic/atomic-helper.h b/llvm/atomic/atomic-helper.h
new file mode 100644
index 0000000..9e3cedf
--- /dev/null
+++ b/llvm/atomic/atomic-helper.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (C) 2010 Parallel Processing Institute (PPI), Fudan Univ.
+ *  <http://ppi.fudan.edu.cn/system_research_group>
+ *
+ * Authors:
+ *  Zhaoguo Wang    <zgwang@fudan.edu.cn>
+ *  Yufei Chen      <chenyufei@fudan.edu.cn>
+ *  Ran Liu         <naruilone@gmail.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+/* Helper declarations for the atomic emulation; only active with CONFIG_COREMU. */
+#include "config-target.h"
+
+#ifdef CONFIG_COREMU
+/* x86: one set of helpers per operand-size suffix (b, w, l and, on x86-64, q). */
+#if defined(TARGET_I386)
+#define __GEN_HEADER(type) \
+DEF_HELPER_3(atomic_inc##type, void, env, tl, int)                \
+DEF_HELPER_4(xchg##type, void, env, tl, int, int)                 \
+DEF_HELPER_4(atomic_op##type, void, env, tl, tl, int)             \
+DEF_HELPER_4(atomic_xadd##type, void, env, tl, int, int)          \
+DEF_HELPER_4(atomic_cmpxchg##type, void, env, tl, int, int)       \
+DEF_HELPER_2(atomic_not##type, void, env, tl)                     \
+DEF_HELPER_2(atomic_neg##type, void, env, tl)
+
+__GEN_HEADER(b)
+__GEN_HEADER(w)
+__GEN_HEADER(l)
+#ifdef TARGET_X86_64
+__GEN_HEADER(q)
+#endif /* TARGET_X86_64 */
+
+DEF_HELPER_2(atomic_cmpxchg8b, void, env, tl)
+DEF_HELPER_2(atomic_cmpxchg16b, void, env, tl)
+
+DEF_HELPER_4(atomic_bts, void, env, tl, tl, int)
+DEF_HELPER_4(atomic_btr, void, env, tl, tl, int)
+DEF_HELPER_4(atomic_btc, void, env, tl, tl, int)
+
+/* fence */
+DEF_HELPER_1(fence, void, env)
+
+#elif defined(TARGET_ARM)
+#define __GEN_HEADER(type) \
+DEF_HELPER_3(load_exclusive##type, void, env, i32, i32)           \
+DEF_HELPER_4(store_exclusive##type, void, env, i32, i32, i32)
+/* ARM: exclusive load/store helpers for the b, w, l and q suffixes. */
+__GEN_HEADER(b)
+__GEN_HEADER(w)
+__GEN_HEADER(l)
+__GEN_HEADER(q)
+
+DEF_HELPER_1(clear_exclusive, void, env)
+
+DEF_HELPER_4(swpb, void, env, i32, i32, i32)
+DEF_HELPER_4(swp, void, env, i32, i32, i32)
+#else
+#error "unsupported processor type"
+#endif /* TARGET_I386 / TARGET_ARM */
+
+#endif /* CONFIG_COREMU */
+
diff --git a/llvm/atomic/atomic-x86.c b/llvm/atomic/atomic-x86.c
new file mode 100644
index 0000000..dc0baf0
--- /dev/null
+++ b/llvm/atomic/atomic-x86.c
@@ -0,0 +1,504 @@
+/*
+ * Copyright (C) 2010 Parallel Processing Institute (PPI), Fudan Univ.
+ *  <http://ppi.fudan.edu.cn/system_research_group>
+ *
+ * Authors:
+ *  Zhaoguo Wang    <zgwang@fudan.edu.cn>
+ *  Yufei Chen      <chenyufei@fudan.edu.cn>
+ *  Ran Liu         <naruilone@gmail.com>
+ *  Xi Wu           <wuxi@fudan.edu.cn>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* We include this file in op_helper.c */
+
+#include <stdlib.h>
+#include <pthread.h>
+#include <assert.h>
+#include "coremu-atomic.h"
+/* Guest register shorthands; they expect a variable named `env` in scope. */
+#define EAX (env->regs[R_EAX])
+#define ECX (env->regs[R_ECX])
+#define EDX (env->regs[R_EDX])
+#define EBX (env->regs[R_EBX])
+
+/* Byte offsets of sub-registers inside a register slot (copied from translate.c) */
+#if defined(WORDS_BIGENDIAN)
+#define REG_B_OFFSET (sizeof(target_ulong) - 1)
+#define REG_H_OFFSET (sizeof(target_ulong) - 2)
+#define REG_W_OFFSET (sizeof(target_ulong) - 2)
+#define REG_L_OFFSET (sizeof(target_ulong) - 4)
+#define REG_LH_OFFSET (sizeof(target_ulong) - 8)
+#else
+#define REG_B_OFFSET 0
+#define REG_H_OFFSET 1
+#define REG_W_OFFSET 0
+#define REG_L_OFFSET 0
+#define REG_LH_OFFSET 4
+#endif /* WORDS_BIGENDIAN */
+/* X86_64_DEF(x) expands to x when TARGET_X86_64 is defined, else to nothing. */
+#ifdef TARGET_X86_64
+#define X86_64_DEF(...)  __VA_ARGS__
+#else
+#define X86_64_DEF(...)
+#endif /* TARGET_X86_64 */
+/* Mask that keeps the low 32 bits of a value (0xffffffff). */
+#define REG_LOW_MASK (~(uint64_t)0x0>>32)
+
+/* Operation selectors for the `op` argument of helper_atomic_op##type. */
+/* i386 arith/logic operations */
+enum {
+    OP_ADDL,
+    OP_ORL,
+    OP_ADCL,
+    OP_SBBL,
+    OP_ANDL,
+    OP_SUBL,
+    OP_XORL,
+    OP_CMPL,
+};
+
+/* Fetch general register `reg` for an operand of size class `ot`. */
+static target_ulong cm_get_reg_val(CPUX86State *env, int ot, int hregs, int reg)
+{
+    /* Only byte operands (ot == 0, OT_BYTE) need special care: unless
+     * reg < 4 (or, on x86-64 builds, reg >= 8 or hregs is set), the index
+     * selects the byte at REG_H_OFFSET inside regs[reg - 4]. */
+    int whole_reg = 1;
+
+    if (ot == 0) {
+        whole_reg = (reg < 4 X86_64_DEF( || reg >= 8 || hregs));
+    }
+
+    if (!whole_reg) {
+        const uint8_t *env_bytes = (const uint8_t *)env;
+        size_t high_byte = offsetof(CPUX86State, regs[reg - 4]) + REG_H_OFFSET;
+
+        return env_bytes[high_byte];
+    }
+
+    /* Everything else yields the full register; the callers in this file
+     * narrow it with a cast to the operand type. */
+    return env->regs[reg];
+}
+
+static void cm_set_reg_val(CPUX86State *env, int ot, int hregs, int reg, target_ulong val)
+{
+    /* Store `val` into general register `reg` with the width selected by
+     * `ot`. Byte and word stores overwrite only the addressed bytes of the
+     * register slot; a long store keeps only the low 32 bits of val; any
+     * other size stores val unchanged. */
+    uint8_t *env_bytes = (uint8_t *)env;
+
+    switch (ot) {
+    case 0: /* OT_BYTE */
+        /* Same register selection rule as cm_get_reg_val(). */
+        if (reg < 4 X86_64_DEF (|| reg >= 8 || hregs)) {
+            env_bytes[offsetof(CPUX86State, regs[reg]) + REG_B_OFFSET] = (uint8_t)val;
+        } else {
+            env_bytes[offsetof(CPUX86State, regs[reg - 4]) + REG_H_OFFSET] = (uint8_t)val;
+        }
+        break;
+    case 1: /* OT_WORD */
+        *(uint16_t *)(env_bytes + offsetof(CPUX86State, regs[reg]) + REG_W_OFFSET) = (uint16_t)val;
+        break;
+    case 2: /* OT_LONG */
+        env->regs[reg] = val & REG_LOW_MASK;
+        break;
+    case 3: /* OT_QUAD */
+    default:
+        env->regs[reg] = val;
+        break;
+    }
+}
+/* Loader selected by TX for each operand-size suffix (via LD_##type). */
+#define LD_b ldub_p
+#define LD_w lduw_p
+#define LD_l ldl_p
+#define LD_q ldq_p
+/* TX: compare-and-swap retry loop. It is not wrapped in a block: it declares
+ * __q_addr, __oldv and `value` in the caller's scope and needs `env` there. */
+#define TX(vaddr, type, value, command) \
+    unsigned long __q_addr;                                   \
+    DATA_##type __oldv;                                       \
+    DATA_##type value;                                        \
+    /* load, run command on value, retry until the CAS hits */\
+    CM_GET_QEMU_ADDR(env, __q_addr, vaddr);                   \
+    do {                                                      \
+        __oldv = value = LD_##type((DATA_##type *)__q_addr);  \
+        {command;};                                           \
+        mb();                                                 \
+    } while (__oldv != (atomic_compare_exchange##type(        \
+                    (DATA_##type *)__q_addr, __oldv, value)))
+
+/* Atomically emulate INC/DEC on guest memory with the TX compare-and-swap
+ * retry loop: c > 0 increments the value at a0, anything else decrements. */
+#define GEN_ATOMIC_INC(type, TYPE) \
+void helper_atomic_inc##type(CPUX86State *env, target_ulong a0, int c) \
+{                                                                      \
+    int eflags_c, eflags;                                              \
+    int cc_op;                                                         \
+                                                                       \
+    /* compute the previous instruction c flags */                     \
+    eflags_c = helper_cc_compute_c(CC_DST, CC_SRC, CC_SRC2, CC_OP);    \
+                                                                       \
+    TX(a0, type, value, {                                              \
+        if (c > 0) {                                                   \
+            value++;                                                   \
+            cc_op = CC_OP_INC##TYPE;                                   \
+        } else {                                                       \
+            value--;                                                   \
+            cc_op = CC_OP_DEC##TYPE;                                   \
+        }                                                              \
+    });                                                                \
+    /* value (declared by TX) now holds the stored result */           \
+    CC_SRC = eflags_c;                                                 \
+    CC_DST = value;                                                    \
+    /* CC_SRC ends up holding the flags computed for cc_op */          \
+    eflags = helper_cc_compute_all(CC_DST, CC_SRC, CC_SRC2, cc_op);    \
+    CC_SRC = eflags;                                                   \
+}                                                                      \
+
+GEN_ATOMIC_INC(b, B);
+GEN_ATOMIC_INC(w, W);
+GEN_ATOMIC_INC(l, L);
+#ifdef TARGET_X86_64
+GEN_ATOMIC_INC(q, Q);
+#endif /* TARGET_X86_64 */
+/* Operand-size codes passed as `ot` to cm_get_reg_val()/cm_set_reg_val(). */
+#define OT_b 0
+#define OT_w 1
+#define OT_l 2
+#define OT_q 3
+/* Emit helper_xchg##type: swap register `reg` with the guest memory at a0. */
+#define GEN_ATOMIC_XCHG(type) \
+void helper_xchg##type(CPUX86State *env, target_ulong a0, int reg, \
+                int hreg)                                          \
+{                                                                  \
+    DATA_##type val, out;                                          \
+    unsigned long q_addr;                                          \
+                                                                   \
+    CM_GET_QEMU_ADDR(env, q_addr, a0);                             \
+    val = (DATA_##type)cm_get_reg_val(env, OT_##type, hreg, reg);  \
+    out = atomic_exchange##type((DATA_##type *)q_addr, val);       \
+    mb();                                                          \
+    /* the previous memory content goes back into the register */  \
+    cm_set_reg_val(env, OT_##type, hreg, reg, out);                \
+}
+
+GEN_ATOMIC_XCHG(b);
+GEN_ATOMIC_XCHG(w);
+GEN_ATOMIC_XCHG(l);
+#ifdef TARGET_X86_64
+GEN_ATOMIC_XCHG(q);
+#endif /* TARGET_X86_64 */
+/* Emit helper_atomic_op##type: apply the OP_* operation `op` with t1 to memory at a0. */
+#define GEN_ATOMIC_OP(type, TYPE) \
+void helper_atomic_op##type(CPUX86State *env, target_ulong a0,      \
+                target_ulong t1, int op)                            \
+{                                                                   \
+    DATA_##type operand;                                            \
+    int eflags_c, eflags;                                           \
+    int cc_op;                                                      \
+                                                                    \
+    /* compute the previous instruction c flags */                  \
+    eflags_c = helper_cc_compute_c(CC_DST, CC_SRC, CC_SRC2, CC_OP); \
+    operand = (DATA_##type)t1;                                      \
+    /* unknown op values take the OP_ANDL path; OP_CMPL aborts */   \
+    TX(a0, type, value, {                                           \
+        switch(op) {                                                \
+        case OP_ADCL:                                               \
+            value += operand + eflags_c;                            \
+            cc_op = CC_OP_ADD##TYPE + (eflags_c << 2);              \
+            CC_SRC = operand;                                       \
+            break;                                                  \
+        case OP_SBBL:                                               \
+            value = value - operand - eflags_c;                     \
+            cc_op = CC_OP_SUB##TYPE + (eflags_c << 2);              \
+            CC_SRC = operand;                                       \
+            break;                                                  \
+        case OP_ADDL:                                               \
+            value += operand;                                       \
+            cc_op = CC_OP_ADD##TYPE;                                \
+            CC_SRC = operand;                                       \
+            break;                                                  \
+        case OP_SUBL:                                               \
+            value -= operand;                                       \
+            cc_op = CC_OP_SUB##TYPE;                                \
+            CC_SRC = operand;                                       \
+            break;                                                  \
+        default:                                                    \
+        case OP_ANDL:                                               \
+            value &= operand;                                       \
+            cc_op = CC_OP_LOGIC##TYPE;                              \
+            break;                                                  \
+        case OP_ORL:                                                \
+            value |= operand;                                       \
+            cc_op = CC_OP_LOGIC##TYPE;                              \
+            break;                                                  \
+        case OP_XORL:                                               \
+            value ^= operand;                                       \
+            cc_op = CC_OP_LOGIC##TYPE;                              \
+            break;                                                  \
+        case OP_CMPL:                                               \
+            abort();                                                \
+            break;                                                  \
+        }                                                           \
+    });                                                             \
+    CC_DST = value;                                                 \
+    /* successful transaction, compute the eflags */                \
+    eflags = helper_cc_compute_all(CC_DST, CC_SRC, CC_SRC2, cc_op); \
+    CC_SRC = eflags;                                                \
+}
+
+GEN_ATOMIC_OP(b, B);
+GEN_ATOMIC_OP(w, W);
+GEN_ATOMIC_OP(l, L);
+#ifdef TARGET_X86_64
+GEN_ATOMIC_OP(q, Q);
+#endif /* TARGET_X86_64 */
+/* xadd: emit helper_atomic_xadd##type, which adds register `reg` to the
+ * memory at a0 and writes the previous memory value back into `reg`. */
+#define GEN_ATOMIC_XADD(type, TYPE) \
+void helper_atomic_xadd##type(CPUX86State *env, target_ulong a0, \
+                int reg, int hreg)                               \
+{                                                                \
+    DATA_##type operand, oldv;                                   \
+    int eflags;                                                  \
+                                                                 \
+    operand = (DATA_##type)cm_get_reg_val(                       \
+            env, OT_##type, hreg, reg);                          \
+    /* oldv keeps the value loaded by the final TX iteration */  \
+    TX(a0, type, newv, {                                         \
+        oldv = newv;                                             \
+        newv += operand;                                         \
+    });                                                          \
+                                                                 \
+    /* the transaction succeeded */                              \
+    /* xchg the register and compute the eflags */               \
+    cm_set_reg_val(env, OT_##type, hreg, reg, oldv);             \
+    CC_SRC = oldv;                                               \
+    CC_DST = newv;                                               \
+                                                                 \
+    eflags = helper_cc_compute_all(CC_DST, CC_SRC, CC_SRC2,      \
+		    CC_OP_ADD##TYPE);                            \
+    CC_SRC = eflags;                                             \
+}
+
+GEN_ATOMIC_XADD(b, B);
+GEN_ATOMIC_XADD(w, W);
+GEN_ATOMIC_XADD(l, L);
+#ifdef TARGET_X86_64
+GEN_ATOMIC_XADD(q, Q);
+#endif /* TARGET_X86_64 */
+
+/* cmpxchg: compare R_EAX with memory at a0 and swap in the register value */
+#define GEN_ATOMIC_CMPXCHG(type, TYPE) \
+void helper_atomic_cmpxchg##type(CPUX86State *env, target_ulong a0, \
+                int reg, int hreg)                                  \
+{                                                                   \
+    unsigned long host;                                             \
+    DATA_##type expected, desired, seen;                            \
+    int flags;                                                      \
+                                                                    \
+    CM_GET_QEMU_ADDR(env, host, a0);                                \
+    desired = (DATA_##type)cm_get_reg_val(env, OT_##type, hreg, reg); \
+    expected = (DATA_##type)cm_get_reg_val(env, OT_##type, 0, R_EAX); \
+                                                                    \
+    seen = atomic_compare_exchange##type(                           \
+            (DATA_##type *)host, expected, desired);                \
+    mb();                                                           \
+    /* on a mismatch R_EAX receives the value found in memory */    \
+    if (seen != expected) {                                         \
+        cm_set_reg_val(env, OT_##type, 0, R_EAX, seen);             \
+    }                                                               \
+    CC_DST = expected - seen;                                       \
+    CC_SRC = seen;                                                  \
+                                                                    \
+    flags = helper_cc_compute_all(CC_DST, CC_SRC, CC_SRC2,          \
+                                  CC_OP_SUB##TYPE);                 \
+    CC_SRC = flags;                                                 \
+}
+
+GEN_ATOMIC_CMPXCHG(b, B);
+GEN_ATOMIC_CMPXCHG(w, W);
+GEN_ATOMIC_CMPXCHG(l, L);
+#ifdef TARGET_X86_64
+GEN_ATOMIC_CMPXCHG(q, Q);
+#endif
+
+#if defined(_LP64)
+/* cmpxchg8b / cmpxchg16b helpers; this is the 8-byte variant */
+void helper_atomic_cmpxchg8b(CPUX86State *env, target_ulong a0)
+{
+    unsigned long host;
+    uint64_t expected, desired, seen;
+    /* Start from the currently computed flags; only CC_Z is changed below. */
+    int flags = helper_cc_compute_all(CC_DST, CC_SRC, CC_SRC2, CC_OP);
+
+    CM_GET_QEMU_ADDR(env, host, a0);
+
+    expected = ((uint64_t)EDX << 32) | (uint32_t)EAX;
+    desired = ((uint64_t)ECX << 32) | (uint32_t)EBX;
+
+    seen = atomic_compare_exchangeq((uint64_t *)host, expected, desired);
+    mb();
+
+    if (seen != expected) {
+        /* No swap: EDX:EAX receive the value found in memory. */
+        EAX = (uint32_t)seen;
+        EDX = (uint32_t)(seen >> 32);
+        flags &= ~CC_Z;
+    } else {
+        flags |= CC_Z;
+    }
+    CC_SRC = flags;
+}
+#else
+void helper_atomic_cmpxchg8b(CPUX86State *env, target_ulong a0)
+{   /* Non-_LP64 host stub: no 64-bit CAS is available, so fail loudly. */
+    assert(0 && "helper_atomic_cmpxchg8b: not supported.\n");
+    abort(); /* still fatal under NDEBUG; exit(0) would have reported success */
+}
+#endif
+/* cmpxchg16b: 16-byte compare-and-swap at a0 (EDX:EAX expected, ECX:EBX new). */
+void helper_atomic_cmpxchg16b(CPUX86State *env, target_ulong a0)
+{
+    uint8_t res;
+    int eflags;
+    unsigned long q_addr;
+
+    eflags = helper_cc_compute_all(CC_DST, CC_SRC, CC_SRC2, CC_OP);
+    CM_GET_QEMU_ADDR(env, q_addr, a0);
+    /* NOTE(review): snapshot taken by two plain loads before the CAS, so it may be stale - confirm. */
+    uint64_t old_rax = *(uint64_t *)q_addr;         /* low 8 bytes */
+    uint64_t old_rdx = *(uint64_t *)(q_addr + 8);   /* high 8 bytes */
+    res = atomic_compare_exchange16b((uint64_t *)q_addr, EAX, EDX, EBX, ECX);
+    mb();
+
+    if (res) {
+        eflags |= CC_Z;         /* swap succeeded */
+    } else {
+        EDX = old_rdx;
+        EAX = old_rax;
+        eflags &= ~CC_Z;        /* no swap: registers get the earlier snapshot */
+    }
+
+    CC_SRC = eflags;            /* hand the updated flags back through CC_SRC */
+}
+
+/* not: complement the operand at a0 through TX; no CC_* state is written */
+#define GEN_ATOMIC_NOT(type) \
+void helper_atomic_not##type(CPUX86State *env, \
+                target_ulong a0)               \
+{                                              \
+    TX(a0, type, value, {                      \
+        value = ~value;                        \
+    });                                        \
+}
+
+GEN_ATOMIC_NOT(b);
+GEN_ATOMIC_NOT(w);
+GEN_ATOMIC_NOT(l);
+#ifdef TARGET_X86_64
+GEN_ATOMIC_NOT(q);
+#endif
+
+/* neg: negate the operand at a0 and compute the flags as for 0 - old */
+#define GEN_ATOMIC_NEG(type, TYPE) \
+void helper_atomic_neg##type(CPUX86State *env,              \
+                target_ulong a0)                            \
+{                                                           \
+    int eflags;                                             \
+                                                            \
+    TX(a0, type, value, {                                   \
+        value = -value;                                     \
+    });                                                     \
+                                                            \
+    /* value is the result: CC_DST = result, CC_SRC = old */ \
+    CC_DST = value;                                         \
+    CC_SRC = (DATA_##type)-value;                           \
+    eflags = helper_cc_compute_all(CC_DST, CC_SRC, CC_SRC2, \
+		    CC_OP_SUB##TYPE);                       \
+    CC_SRC = eflags;                                        \
+}
+
+GEN_ATOMIC_NEG(b, B);
+GEN_ATOMIC_NEG(w, W);
+GEN_ATOMIC_NEG(l, L);
+#ifdef TARGET_X86_64
+GEN_ATOMIC_NEG(q, Q);
+#endif
+
+/* Read-modify-write loop used only by the BTx helpers, which address memory
+ * with an additional bit offset.  With a register bit offset the value can be
+ * larger than operand size - 1 (operand size can be 16/32/64), see the Intel
+ * manual 2A page 3-11, so the byte holding the bit is what gets translated. */
+#define TX2(vaddr, type, value, offset, command) \
+    unsigned long __q_addr;                                   \
+    DATA_##type __oldv;                                       \
+    DATA_##type value;                                        \
+                                                              \
+    /* translate the byte really accessed: it may sit on another page */ \
+    CM_GET_QEMU_ADDR(env, __q_addr, ((vaddr) + ((offset) >> 3))); \
+    do {                                                      \
+        __oldv = value = LD_##type((DATA_##type *)__q_addr);  \
+        {command;};                                           \
+        mb();                                                 \
+    } while (__oldv != (atomic_compare_exchange##type(        \
+                    (DATA_##type *)__q_addr, __oldv, value)))
+
+#define GEN_ATOMIC_BTX(ins, command) \
+void helper_atomic_##ins(CPUX86State *env, target_ulong a0,   \
+                target_ulong offset, int ot)                  \
+{                                                             \
+    int flags;                                                \
+    uint8_t prev;                                             \
+    /* operate on the single byte that holds the addressed bit */ \
+    TX2(a0, b, value, offset, {                               \
+        prev = value;                                         \
+        {command;};                                           \
+    });                                                       \
+    /* bit 0 of CC_SRC is the selected bit before the update */ \
+    CC_DST = 0;                                               \
+    CC_SRC = (prev >> (offset & 0x7));                        \
+    flags = helper_cc_compute_all(CC_DST, CC_SRC, CC_SRC2,    \
+                                  CC_OP_SARB + ot);           \
+    CC_SRC = flags;                                           \
+}
+
+/* bts: set the selected bit */
+GEN_ATOMIC_BTX(bts, {
+    value |= (1 << (offset & 0x7));
+});
+/* btr: clear the selected bit */
+GEN_ATOMIC_BTX(btr, {
+    value &= ~(1 << (offset & 0x7));
+});
+/* btc: toggle the selected bit */
+GEN_ATOMIC_BTX(btc, {
+    value ^= (1 << (offset & 0x7));
+});
+
+/* fence: helper that simply issues the host barrier mb(); env is not used */
+void helper_fence(CPUX86State *env)
+{
+    mb();
+}
+
+#undef EAX
+#undef ECX
+#undef EDX
+#undef EBX
diff --git a/llvm/atomic/coremu-atomic.h b/llvm/atomic/coremu-atomic.h
new file mode 100644
index 0000000..998232b
--- /dev/null
+++ b/llvm/atomic/coremu-atomic.h
@@ -0,0 +1,412 @@
+/*
+ * COREMU Parallel Emulator Framework
+ *
+ * Atomic support for COREMU system.
+ * XXX: Now only support x86-64 architecture.
+ *
+ * Copyright (C) 2010 Parallel Processing Institute (PPI), Fudan Univ.
+ *  <http://ppi.fudan.edu.cn/system_research_group>
+ *
+ * Authors:
+ *  Zhaoguo Wang    <zgwang@fudan.edu.cn>
+ *  Yufei Chen      <chenyufei@fudan.edu.cn>
+ *  Ran Liu         <naruilone@gmail.com>
+ *  Xi Wu           <wuxi@fudan.edu.cn>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _COREMU_ATOMIC_H
+#define _COREMU_ATOMIC_H
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <assert.h>
+#include "config-target.h"
+#include "hqemu.h"
+
+/* Given the guest virtual address, get the corresponding host address.
+ * This macro resembles ldxxx in softmmu_template.h
+ * NOTE: This must be inlined since the use of GETPC needs to get the
+ * return address. Using always inline also works, we use macro here to be more
+ * explicit. Arguments are parenthesized so that expressions can be passed. */
+#if defined(CONFIG_USER_ONLY)
+#define CM_GET_QEMU_ADDR(__env1, q_addr, v_addr) \
+do {                                             \
+    (q_addr) = (v_addr) + GUEST_BASE;            \
+} while (0)
+
+#else
+#define CM_GET_QEMU_ADDR(__env1, q_addr, v_addr) \
+do {                                                                        \
+    CPUState *__cpu = ENV_GET_CPU(__env1);                                  \
+    int __mmu_idx, __index;                                                 \
+    uintptr_t __retaddr;                                                    \
+    __index = ((v_addr) >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);          \
+    /* get the CPL, hence determine the MMU mode */                         \
+    __mmu_idx = cpu_mmu_index(__env1, false);                               \
+    /* This macro backs the atomic instructions, which modify the memory */ \
+    /* they address, so the write entry (addr_write) is the one checked. */ \
+    if (unlikely((__env1)->tlb_table[__mmu_idx][__index].addr_write         \
+                != (((v_addr) & TARGET_PAGE_MASK) | tlb_version(__env1)))) {\
+        __retaddr = GETPC();                                                \
+        tlb_fill(__cpu, (v_addr), 1, __mmu_idx, __retaddr);                 \
+    }                                                                       \
+    (q_addr) = (v_addr) + (__env1)->tlb_table[__mmu_idx][__index].addend;   \
+} while(0)
+#endif
+
+/* XXX These are also used by atomic instruction handling.
+ * Put these defines in some other files? */
+#define DATA_b uint8_t
+#define DATA_w uint16_t
+#define DATA_l uint32_t
+#define DATA_q uint64_t
+
+#define __inline__ inline __attribute__((always_inline))
+
+#if defined(__i386__) || defined(__x86_64__)
+/* _LP64 hosts: 16-byte CAS via cmpxchg16b; returns 1 if the swap happened, else 0. */
+#if defined(_LP64)
+static __inline__ uint8_t
+atomic_compare_exchange16b(uint64_t *memp,
+                           uint64_t rax, uint64_t rdx,
+                           uint64_t rbx, uint64_t rcx)
+{
+    uint8_t z;                              /* receives ZF through setz */
+    __asm __volatile__ ( "lock; cmpxchg16b %3\n\t"
+                         "setz %2\n\t"
+                         : "=a" (rax), "=d" (rdx), "=r" (z), "+m" (*memp)  /* rax/rdx are by-value copies */
+                         : "a" (rax), "d" (rdx), "b" (rbx), "c" (rcx)
+                         : "memory", "cc" );
+    return z;   /* the value left in rax/rdx by the instruction is not returned */
+}
+#else
+static __inline__ uint8_t
+atomic_compare_exchange16b(uint64_t *memp,
+                           uint64_t rax, uint64_t rdx,
+                           uint64_t rbx, uint64_t rcx)
+{   /* Non-_LP64 host stub: no 16-byte CAS here, so terminate abnormally. */
+    assert(0 && "atomic_compare_exchange16b: not supported.\n");
+    abort(); /* fatal even under NDEBUG; exit(0) would have signalled success */
+}
+
+static __inline__ uint64_t
+atomic_compare_exchangeq(uint64_t *addr,
+                uint64_t oldval, uint64_t newval)
+{   /* Non-_LP64 host stub; typed uint64_t like the other 64-bit CAS variants. */
+    assert(0 && "atomic_compare_exchangeq: not supported.\n");
+    abort(); /* fatal even under NDEBUG; exit(0) would have signalled success */
+}
+
+#endif
+
+/* Memory Barriers: x86-64 ONLY now */
+#define mb()    asm volatile("mfence":::"memory")
+#define rmb()   asm volatile("lfence":::"memory")
+#define wmb()   asm volatile("sfence" ::: "memory")
+
+#define LOCK_PREFIX "lock; "
+
+#define coremu_xglue(a, b) a ## b
+// If a/b is macro, it will expand first, then pass to coremu_xglue
+#define coremu_glue(a, b) coremu_xglue(a, b)
+
+#define coremu_xstr(s) # s
+#define coremu_str(s) coremu_xstr(s)
+
+#define DATA_BITS 8
+#include "coremu-template.h"
+
+#define DATA_BITS 16
+#include "coremu-template.h"
+
+#define DATA_BITS 32
+#include "coremu-template.h"
+
+#if defined(_LP64)
+#define DATA_BITS 64
+#include "coremu-template.h"
+#else
+static inline uint64_t atomic_exchangeq(uint64_t *p, uint64_t val)
+{   /* Non-_LP64 host stub: the 64-bit template is not instantiated here. */
+    assert(0 && "atomic_exchangeq: not supported.\n");
+    abort(); /* fatal even under NDEBUG; exit(0) would have signalled success */
+}
+
+#endif
+
+#elif defined(__arm__)
+
+#if defined(__ARM_ARCH_7__)   || \
+    defined(__ARM_ARCH_7A__)  || \
+    defined(__ARM_ARCH_7EM__) || \
+    defined(__ARM_ARCH_7M__)  || \
+    defined(__ARM_ARCH_7R__)  || \
+    defined(__ARM_ARCH_6J__)  || \
+    defined(__ARM_ARCH_6K__)  || \
+    defined(__ARM_ARCH_6T2__) || \
+    defined(__ARM_ARCH_6Z__)  || \
+    defined(__ARM_ARCH_6ZK__)
+#define USE_ARMV6_INSTRUCTIONS
+#endif
+/* ARM barrier and CPSR save/restore macros. NOTE(review): cpsid/msr presumably need a privileged mode - confirm. */
+#ifdef USE_ARMV6_INSTRUCTIONS
+#define mb()	__asm__ __volatile__("dmb" : : : "memory")  /* dmb instruction plus compiler barrier */
+#define raw_local_irq_save(x)                                   \
+        ({                                                      \
+        __asm__ __volatile__(                                   \
+        "mrs    %0, cpsr                @ local_irq_save\n"     \
+        "cpsid  i"                                              \
+        : "=r" (x) : : "memory", "cc");                         \
+        })  /* x receives the CPSR value read by mrs */
+#else
+#define mb()    __asm__ __volatile__("":::"memory")  /* compiler barrier only: empty asm */
+#define raw_local_irq_save(x)                                   \
+        ({                                                      \
+                unsigned long temp;                             \
+                (void) (&temp == &x);                           \
+        __asm__ __volatile__(                                   \
+        "mrs    %0, cpsr                @ local_irq_save\n"     \
+"       orr     %1, %0, #128\n"                                 \
+"       msr     cpsr_c, %1"                                     \
+        : "=r" (x), "=r" (temp)                                 \
+        :                                                       \
+        : "memory", "cc");                                      \
+        })  /* x = CPSR as read; CPSR control field rewritten with bit 7 (#128) set */
+#endif
+
+#define raw_local_irq_restore(x)                                \
+	__asm__ __volatile(                                     \
+	"msr    cpsr_c, %0              @ local_irq_restore\n"  \
+	:                                                       \
+	: "r" (x)                                               \
+	: "memory", "cc")  /* writes x back to the CPSR control field */
+/* 8-bit CAS: store newval if *addr equals oldval; returns the value read from *addr. */
+static __inline__ uint8_t atomic_compare_exchangeb(uint8_t *addr,
+        uint8_t oldval, uint8_t newval)
+{
+    uint8_t ret;
+#ifdef USE_ARMV6_INSTRUCTIONS
+    unsigned long tmp;
+    __asm__ __volatile__("@ atomic_cmpxchgl\n"
+    "1:     ldrexb  %1, [%3]\n"          /* ret = *addr (exclusive load) */
+    "       mov    %0, #0\n"             /* tmp = 0 */
+    "       teq    %1, %4\n"             /* ret == oldval ? */
+    "       strexbeq %0, %5, [%3]\n"     /* if equal: store newval, tmp = status */
+    "       teq    %0, #0\n"
+    "       bne    1b\n"                 /* retry while tmp != 0 */
+            : "=&r" (tmp), "=&r" (ret), "+Qo" (*addr)
+            : "r" (addr), "Ir" (oldval), "r" (newval)
+            : "cc");
+#else
+    unsigned long flags;
+    raw_local_irq_save(flags);           /* fallback: plain compare/store between irq save/restore */
+    ret = *addr;
+    if (likely(ret == oldval))
+        *addr = newval;
+    raw_local_irq_restore(flags);
+#endif
+    return ret;
+}
+/* 16-bit CAS: store newval if *addr equals oldval; returns the value read from *addr. */
+static __inline__ uint16_t atomic_compare_exchangew(uint16_t *addr,
+        uint16_t oldval, uint16_t newval)
+{
+    uint16_t ret;
+#ifdef USE_ARMV6_INSTRUCTIONS
+    unsigned long tmp;
+    __asm__ __volatile__("@ atomic_cmpxchgl\n"
+    "1:     ldrexh  %1, [%3]\n"          /* ret = *addr (exclusive load) */
+    "       mov    %0, #0\n"             /* tmp = 0 */
+    "       teq    %1, %4\n"             /* ret == oldval ? */
+    "       strexheq %0, %5, [%3]\n"     /* if equal: store newval, tmp = status */
+    "       teq    %0, #0\n"
+    "       bne    1b\n"                 /* retry while tmp != 0 */
+            : "=&r" (tmp), "=&r" (ret), "+Qo" (*addr)
+            : "r" (addr), "Ir" (oldval), "r" (newval)
+            : "cc");
+#else
+    unsigned long flags;
+    raw_local_irq_save(flags);           /* fallback: plain compare/store between irq save/restore */
+    ret = *addr;
+    if (likely(ret == oldval))
+        *addr = newval;
+    raw_local_irq_restore(flags);
+#endif
+    return ret;
+}
+/* 32-bit CAS: store newval if *addr equals oldval; returns the value read from *addr. */
+static __inline__ uint32_t atomic_compare_exchangel(uint32_t *addr,
+        uint32_t oldval, uint32_t newval)
+{
+    uint32_t ret;
+#ifdef USE_ARMV6_INSTRUCTIONS
+    unsigned long tmp;
+    __asm__ __volatile__("@ atomic_cmpxchgl\n"
+    "1:     ldrex  %1, [%3]\n"           /* ret = *addr (exclusive load) */
+    "       mov    %0, #0\n"             /* tmp = 0 */
+    "       teq    %1, %4\n"             /* ret == oldval ? */
+    "       strexeq %0, %5, [%3]\n"      /* if equal: store newval, tmp = status */
+    "       teq    %0, #0\n"
+    "       bne    1b\n"                 /* retry while tmp != 0 */
+            : "=&r" (tmp), "=&r" (ret), "+Qo" (*addr)
+            : "r" (addr), "Ir" (oldval), "r" (newval)
+            : "cc");
+#else
+    unsigned long flags;
+    raw_local_irq_save(flags);           /* fallback: plain compare/store between irq save/restore */
+    ret = *addr;
+    if (likely(ret == oldval))
+        *addr = newval;
+    raw_local_irq_restore(flags);
+#endif
+    return ret;
+}
+/* 64-bit CAS: store newval if *addr equals oldval; returns the value read from *addr. */
+static __inline__ uint64_t atomic_compare_exchangeq(uint64_t *addr,
+        uint64_t oldval, uint64_t newval)
+{
+    uint64_t ret;
+#ifdef USE_ARMV6_INSTRUCTIONS
+    unsigned long tmp;
+    __asm__ __volatile__("@ atomic_cmpxchgl\n"
+    "1:     ldrexd  %1, %H1, [%3]\n"     /* ret = *addr (exclusive, both halves) */
+    "       mov    %0, #0\n"             /* tmp = 0 */
+    "       teq    %1, %4\n"             /* low halves equal ? */
+    "       teqeq  %H1, %H4\n"           /* ... and high halves equal ? */
+    "       strexdeq %0, %5, %H5, [%3]\n" /* if equal: store newval, tmp = status */
+    "       teq    %0, #0\n"
+    "       bne    1b\n"                 /* retry while tmp != 0 */
+            : "=&r" (tmp), "=&r" (ret), "+Qo" (*addr)
+            : "r" (addr), "r" (oldval), "r" (newval)  /* "r": %H4 needs a register pair */
+            : "cc");
+#else
+    unsigned long flags;
+    raw_local_irq_save(flags);           /* fallback: plain compare/store between irq save/restore */
+    ret = *addr;
+    if (likely(ret == oldval))
+        *addr = newval;
+    raw_local_irq_restore(flags);
+#endif
+    return ret;
+}
+/* 16-byte CAS emulation: returns 1 if both halves matched and were replaced, else 0. */
+static __inline__ uint8_t
+atomic_compare_exchange16b(uint64_t *memp,
+                           uint64_t old_less, uint64_t old_most,
+                           uint64_t new_less, uint64_t new_most)
+{
+    uint8_t ret = 0;
+    unsigned long flags;
+    raw_local_irq_save(flags);
+    /* ret stays 0 on a mismatch; it must not pick up the low byte of *memp */
+    if (likely(memp[0] == old_less && memp[1] == old_most))
+    {
+        memp[0] = new_less;
+        memp[1] = new_most;
+        ret = 1;
+    }
+    raw_local_irq_restore(flags);
+    return ret;
+}
+/* Exchange x with the 1-, 2- or 4-byte object at ptr (per size); returns the old value. */
+static __inline__ unsigned long __xchg(unsigned long x, volatile void *ptr, int size)
+{
+    unsigned long ret;
+#ifdef USE_ARMV6_INSTRUCTIONS
+    unsigned int tmp;
+#endif
+
+    mb();
+
+    switch (size) {
+#ifdef USE_ARMV6_INSTRUCTIONS
+        case 1:
+		__asm __volatile("@	__xchg1\n"
+		"1:	ldrexb	%0, [%3]\n"
+		"	strexb	%1, %2, [%3]\n"
+		"	teq	%1, #0\n"
+		"	bne	1b"
+			: "=&r" (ret), "=&r" (tmp)
+			: "r" (x), "r" (ptr)
+			: "memory", "cc");
+		break;
+        case 2:
+		__asm __volatile("@	__xchg1\n"
+		"1:	ldrexh	%0, [%3]\n"
+		"	strexh	%1, %2, [%3]\n"
+		"	teq	%1, #0\n"
+		"	bne	1b"
+			: "=&r" (ret), "=&r" (tmp)
+			: "r" (x), "r" (ptr)
+			: "memory", "cc");
+		break;
+	case 4:
+		__asm __volatile("@	__xchg4\n"
+		"1:	ldrex	%0, [%3]\n"
+		"	strex	%1, %2, [%3]\n"
+		"	teq	%1, #0\n"
+		"	bne	1b"
+			: "=&r" (ret), "=&r" (tmp)
+			: "r" (x), "r" (ptr)
+			: "memory", "cc");
+		break;
+#else
+	case 1:
+		__asm __volatile("@	__xchg1\n"
+		"	swpb	%0, %1, [%2]"
+			: "=&r" (ret)
+			: "r" (x), "r" (ptr)
+			: "memory", "cc");
+		break;
+
+	case 4:
+		__asm __volatile("@	__xchg4\n"
+		"	swp	%0, %1, [%2]"
+			: "=&r" (ret)
+			: "r" (x), "r" (ptr)
+			: "memory", "cc");
+		break;
+	case 2:
+		{
+		/* no halfword swap here: plain load/store between irq save/restore */
+		unsigned long flags = 0;
+		raw_local_irq_save(flags);
+		ret = *(volatile uint16_t *)ptr;
+		*(volatile uint16_t *)ptr = x;
+		raw_local_irq_restore(flags);
+		break;
+		}
+#endif
+	default:
+		abort();	/* unsupported size is a caller bug; exit(0) reported success */
+    }
+    mb();
+
+    return ret;
+}
+/* xchg(ptr, x): calls __xchg with sizeof(*ptr) and casts the result to the type of *ptr. */
+#define xchg(ptr,x) ((__typeof__(*(ptr)))__xchg((unsigned long)(x),(ptr),sizeof(*(ptr))))
+#define GEN_ATOMIC_XCHG_HELPER(TYPE) \
+static __inline__ DATA_##TYPE atomic_exchange##TYPE(DATA_##TYPE *p, DATA_##TYPE val) { return xchg(p, val); }
+
+GEN_ATOMIC_XCHG_HELPER(b);  /* defines atomic_exchangeb */
+GEN_ATOMIC_XCHG_HELPER(w);  /* defines atomic_exchangew */
+GEN_ATOMIC_XCHG_HELPER(l);  /* defines atomic_exchangel */
+
+#endif
+
+#endif /* _COREMU_ATOMIC_H */
+
diff --git a/llvm/atomic/coremu-template.h b/llvm/atomic/coremu-template.h
new file mode 100644
index 0000000..66b185c
--- /dev/null
+++ b/llvm/atomic/coremu-template.h
@@ -0,0 +1,101 @@
+/* The following code may be included multiple times in a single file. */
+
+#if DATA_BITS == 64
+#  define DATA_TYPE uint64_t
+#  define SUFFIX q
+#elif DATA_BITS == 32
+#  define DATA_TYPE uint32_t
+#  define SUFFIX l
+#elif DATA_BITS == 16
+#  define DATA_TYPE uint16_t
+#  define SUFFIX w
+#elif DATA_BITS == 8
+#  define DATA_TYPE uint8_t
+#  define SUFFIX b
+#else
+#error unsupported data size
+#endif
+/* atomic_inc<SUFFIX>: LOCK-prefixed increment of *p; nothing is returned. */
+static __inline__ void coremu_glue(atomic_inc, SUFFIX)(DATA_TYPE *p) {
+    asm volatile(
+        LOCK_PREFIX "inc"coremu_str(SUFFIX)" %0"
+        : "+m"(*p)      /* *p is both read and written */
+        :
+        : "cc");
+}
+/* atomic_dec<SUFFIX>: LOCK-prefixed decrement of *p; nothing is returned. */
+static __inline__ void coremu_glue(atomic_dec, SUFFIX)(DATA_TYPE *p) {
+    asm volatile(
+        LOCK_PREFIX "dec"coremu_str(SUFFIX)" %0"
+        : "+m"(*p)      /* *p is both read and written */
+        :
+        : "cc");
+}
+/* atomic_add<SUFFIX>: LOCK-prefixed add of val to *addr; nothing is returned. */
+static __inline__ void coremu_glue(atomic_add, SUFFIX)(DATA_TYPE* addr,
+        DATA_TYPE val) {
+    asm volatile(
+        LOCK_PREFIX "add"coremu_str(SUFFIX)" %1, %0"
+        : "+m"(*addr)
+        : "a"(val)      /* val is passed in the accumulator register */
+        : "cc");
+}
+
+/* Swap the value VAL and *p and return the value swapped out from memory.
+ * The "memory" clobber stops the compiler moving other accesses across it. */
+static inline DATA_TYPE coremu_glue(atomic_exchange, SUFFIX)(
+        DATA_TYPE *p, DATA_TYPE val)
+{
+    DATA_TYPE out;
+    __asm __volatile(
+            "lock; xchg"coremu_str(SUFFIX)" %1,%2 \n\t"
+            : "=a" (out), "+m" (*p)
+            : "a" (val)
+            : "memory");
+    return out;
+}
+/* Return previous value in addr. So if the return value is the same as oldval,
+ * the swap occurred. "memory" makes the CAS a compiler barrier as well. */
+static __inline__ DATA_TYPE coremu_glue(atomic_compare_exchange, SUFFIX)(DATA_TYPE *addr,
+        DATA_TYPE oldval, DATA_TYPE newval) {
+    asm volatile(
+        LOCK_PREFIX "cmpxchg"coremu_str(SUFFIX)" %2, %1"
+        : "+a"(oldval), "+m"(*addr)
+        : "q"(newval)
+        : "cc", "memory");
+
+    return oldval;
+}
+/* atomic_and<SUFFIX>: LOCK-prefixed AND of mask into *addr; nothing is returned. */
+static __inline__ void coremu_glue(atomic_and, SUFFIX)(DATA_TYPE *addr,
+        DATA_TYPE mask) {
+    asm volatile(
+        LOCK_PREFIX "and"coremu_str(SUFFIX)" %1, %0"
+        : "+m"(*addr)
+        : "q"(mask)     /* "q" guarantees a byte-addressable register on i386 */
+        : "cc");
+}
+/* atomic_or<SUFFIX>: LOCK-prefixed OR of mask into *addr; nothing is returned. */
+static __inline__ void coremu_glue(atomic_or, SUFFIX)(DATA_TYPE *addr,
+        DATA_TYPE mask) {
+    asm volatile(
+        LOCK_PREFIX "or"coremu_str(SUFFIX)" %1, %0"
+        : "+m"(*addr)
+        : "q"(mask)     /* "q" guarantees a byte-addressable register on i386 */
+        : "cc");
+}
+/* atomic_xadd<SUFFIX>: LOCK-prefixed xadd of val into *addr; returns val as left by xadd. */
+static __inline__ DATA_TYPE coremu_glue(atomic_xadd, SUFFIX)(
+        DATA_TYPE* addr, DATA_TYPE val) {
+    asm volatile(
+        LOCK_PREFIX "xadd"coremu_str(SUFFIX)" %0, %1"
+        : "+a"(val), "+m"(*addr)    /* both operands are read and written */
+        :
+        : "cc");
+
+    return val;
+}
+
+#undef DATA_BITS
+#undef DATA_TYPE
+#undef SUFFIX
diff --git a/llvm/fpu/softfloat-native-def.h b/llvm/fpu/softfloat-native-def.h
new file mode 100644
index 0000000..4b0fd22
--- /dev/null
+++ b/llvm/fpu/softfloat-native-def.h
@@ -0,0 +1,127 @@
+/*
+ * QEMU float support
+ *
+ * Derived from SoftFloat.
+ */
+
+/*============================================================================
+
+This C header file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
+Package, Release 2b.
+
+Written by John R. Hauser.  This work was made possible in part by the
+International Computer Science Institute, located at Suite 600, 1947 Center
+Street, Berkeley, California 94704.  Funding was partially provided by the
+National Science Foundation under grant MIP-9311980.  The original version
+of this code was written as part of a project to build a fixed-point vector
+processor in collaboration with the University of California at Berkeley,
+overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
+is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
+arithmetic/SoftFloat.html'.
+
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
+been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
+RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
+AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
+COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
+EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
+INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
+OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
+
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+
+=============================================================================*/
+
+#ifndef SOFTFLOAT_NATIVE_DEF_H
+#define SOFTFLOAT_NATIVE_DEF_H
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#include "fpu/softfloat.h"
+
+int num_native_fpu_helpers(void);
+void *get_native_fpu_helpers(void);
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE integer-to-floating-point conversion routines.
+*----------------------------------------------------------------------------*/
+float32 llvm_int32_to_float32(int32_t v);
+float64 llvm_int32_to_float64(int32_t v);
+float32 llvm_uint32_to_float32(uint32_t v);
+float64 llvm_uint32_to_float64(uint32_t v);
+float32 llvm_int64_to_float32(int64_t v);
+float32 llvm_uint64_to_float32(uint64_t v);
+float64 llvm_int64_to_float64(int64_t v);
+float64 llvm_uint64_to_float64(uint64_t v);
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE single-precision conversion routines.
+*----------------------------------------------------------------------------*/
+int32 llvm_float32_to_int32( float32 a );
+int32 llvm_float32_to_int32_round_to_zero( float32 a );
+int64 llvm_float32_to_int64( float32 a );
+int64 llvm_float32_to_int64_round_to_zero( float32 a );
+float64 llvm_float32_to_float64( float32 a );
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE single-precision operations.
+*----------------------------------------------------------------------------*/
+float32 llvm_float32_round_to_int( float32 a );
+float32 llvm_float32_add( float32 a, float32 b );
+float32 llvm_float32_sub( float32 a, float32 b );
+float32 llvm_float32_mul( float32 a, float32 b );
+float32 llvm_float32_div( float32 a, float32 b );
+float32 llvm_float32_rem( float32 a, float32 b );
+float32 llvm_float32_sqrt( float32 a );
+int llvm_float32_eq( float32 a, float32 b );
+int llvm_float32_le( float32 a, float32 b );
+int llvm_float32_lt( float32 a, float32 b );
+int llvm_float32_unordered( float32 a, float32 b );
+float32 llvm_float32_abs(float32 a);
+float32 llvm_float32_chs(float32 a);
+
+float32 llvm_float32_muladd( float32 a, float32 b, float32 c );
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE double-precision conversion routines.
+*----------------------------------------------------------------------------*/
+int32 llvm_float64_to_int32( float64 a );
+int32 llvm_float64_to_int32_round_to_zero( float64 a );
+int64 llvm_float64_to_int64( float64 a );
+int64 llvm_float64_to_int64_round_to_zero( float64 a );
+float32 llvm_float64_to_float32( float64 a );
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE double-precision operations.
+*----------------------------------------------------------------------------*/
+float64 llvm_float64_round_to_int( float64 a );
+float64 llvm_float64_trunc_to_int( float64 a );
+float64 llvm_float64_add( float64 a, float64 b );
+float64 llvm_float64_sub( float64 a, float64 b );
+float64 llvm_float64_mul( float64 a, float64 b );
+float64 llvm_float64_div( float64 a, float64 b );
+float64 llvm_float64_rem( float64 a, float64 b );
+float64 llvm_float64_sqrt( float64 a );
+int llvm_float64_eq( float64 a, float64 b );
+int llvm_float64_le( float64 a, float64 b );
+int llvm_float64_lt( float64 a, float64 b );
+int llvm_float64_unordered( float64 a, float64 b );
+float64 llvm_float64_abs(float64 a);
+float64 llvm_float64_chs(float64 a);
+
+float64 llvm_float64_muladd( float64 a, float64 b, float64 c );
+
+float32 llvm_float32_maybe_silence_nan( float32 a );
+float64 llvm_float64_maybe_silence_nan( float64 a );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !SOFTFLOAT_NATIVE_DEF_H */
diff --git a/llvm/fpu/softfloat-native.h b/llvm/fpu/softfloat-native.h
new file mode 100644
index 0000000..c12f62b
--- /dev/null
+++ b/llvm/fpu/softfloat-native.h
@@ -0,0 +1,248 @@
+/*
+ * QEMU float support
+ *
+ * Derived from SoftFloat.
+ */
+
+/*============================================================================
+
+This C header file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
+Package, Release 2b.
+
+Written by John R. Hauser.  This work was made possible in part by the
+International Computer Science Institute, located at Suite 600, 1947 Center
+Street, Berkeley, California 94704.  Funding was partially provided by the
+National Science Foundation under grant MIP-9311980.  The original version
+of this code was written as part of a project to build a fixed-point vector
+processor in collaboration with the University of California at Berkeley,
+overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
+is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
+arithmetic/SoftFloat.html'.
+
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
+been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
+RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
+AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
+COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
+EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
+INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
+OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
+
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+
+=============================================================================*/
+
+#ifndef SOFTFLOAT_NATIVE_H
+#define SOFTFLOAT_NATIVE_H
+
+#include <math.h>
+#include "fpu/softfloat-native-def.h"
+
+typedef union {    /* the same storage, viewed four ways */
+   float32 f;      /* as the float32 type used in the helper prototypes */
+   int32_t i;      /* as a signed integer */
+   uint32_t u;     /* as an unsigned integer (used for the NaN bit tests) */
+   float s;        /* as a host single-precision float */
+} llvm_float32;
+/* Counterpart of llvm_float32 for float64 / double. */
+typedef union {
+   float64 f;      /* as the float64 type used in the helper prototypes */
+   int64_t i;      /* as a signed integer */
+   uint64_t u;     /* as an unsigned integer (used for the NaN bit tests) */
+   double d;       /* as a host double-precision float */
+} llvm_float64;
+
+#ifdef float32_val  /* drop any earlier definition; redefined just below */
+#undef float32_val
+#endif
+#ifdef float64_val
+#undef float64_val
+#endif
+/* Each macro casts x to the union (GNU C cast-to-union) and reads one member. */
+#define float32_val(x)  ((llvm_float32)(x)).f
+#define float64_val(x)  ((llvm_float64)(x)).f
+#define lfloat(x)       ((llvm_float32)(x)).s
+#define ldouble(x)      ((llvm_float64)(x)).d
+
+#define DEF_HELPER(name) { (void *)llvm_##name, "llvm_"#name }
+static TCGHelperInfo native_fpu_helpers[] = { /* { function, "llvm_<name>" } */
+    DEF_HELPER(int32_to_float32),
+    DEF_HELPER(int32_to_float64),
+    DEF_HELPER(uint32_to_float32),
+    DEF_HELPER(uint32_to_float64),
+    DEF_HELPER(int64_to_float32),
+    DEF_HELPER(uint64_to_float32),
+    DEF_HELPER(int64_to_float64),
+    DEF_HELPER(uint64_to_float64),
+    DEF_HELPER(float32_to_int32),
+    DEF_HELPER(float32_to_int64),
+    DEF_HELPER(float32_to_float64),
+    DEF_HELPER(float32_add),
+    DEF_HELPER(float32_sub),
+    DEF_HELPER(float32_mul),
+    DEF_HELPER(float32_div),
+    DEF_HELPER(float32_rem),
+    DEF_HELPER(float32_sqrt),
+    DEF_HELPER(float32_abs),
+    DEF_HELPER(float32_chs),
+    DEF_HELPER(float64_to_int32),
+    DEF_HELPER(float64_to_int64),
+    DEF_HELPER(float64_to_float32),
+    DEF_HELPER(float64_add),
+    DEF_HELPER(float64_sub),
+    DEF_HELPER(float64_mul),
+    DEF_HELPER(float64_div),
+    DEF_HELPER(float64_rem),
+    DEF_HELPER(float64_sqrt),
+    DEF_HELPER(float64_abs),
+    DEF_HELPER(float64_chs),
+    /* multiply-add */
+    DEF_HELPER(float32_muladd),
+    DEF_HELPER(float64_muladd),
+    /* signaling-NaN quieting */
+    DEF_HELPER(float32_maybe_silence_nan),
+    DEF_HELPER(float64_maybe_silence_nan),
+#if 0 /* defined further down in this header, but kept out of the table */
+    DEF_HELPER(float32_to_int32_round_to_zero),
+    DEF_HELPER(float32_to_int64_round_to_zero),
+    DEF_HELPER(float32_round_to_int),
+    DEF_HELPER(float32_eq),
+    DEF_HELPER(float32_le),
+    DEF_HELPER(float32_lt),
+    DEF_HELPER(float32_unordered),
+    DEF_HELPER(float64_to_int32_round_to_zero),
+    DEF_HELPER(float64_to_int64_round_to_zero),
+    DEF_HELPER(float64_round_to_int),
+    DEF_HELPER(float64_trunc_to_int),
+    DEF_HELPER(float64_eq),
+    DEF_HELPER(float64_le),
+    DEF_HELPER(float64_lt),
+    DEF_HELPER(float64_unordered),
+#endif
+};
+#undef DEF_HELPER
+
+int num_native_fpu_helpers(void)    /* number of entries in the helper table */
+{
+    return sizeof(native_fpu_helpers) / sizeof(native_fpu_helpers[0]);
+}
+
+void *get_native_fpu_helpers(void)  /* start of the table, as an untyped pointer */
+{
+    return &native_fpu_helpers[0];
+}
+
+/* Narrow a `long' (the result of lrint()/lrintf() in the callers) to int32.
+ * XXX: this code implements the x86 behaviour, not the IEEE one.  */
+static inline int32 long_to_int32(long a)
+{
+#if TCG_TARGET_REG_BITS == 32
+    /* No range check in this configuration: plain conversion. */
+    return a;
+#else
+    /* A value that does not survive a round trip through int32_t is replaced
+     * by 0x80000000 before the conversion to the return type. */
+    const long indefinite = 0x80000000;
+    return (a == (int32_t)a) ? a : indefinite;
+#endif
+}
+
+/*----------------------------------------------------------------------------
+| Integer-to-floating-point conversion routines, done with host C casts.
+*----------------------------------------------------------------------------*/
+float32 llvm_int32_to_float32(int32_t v)   { return float32_val((float)v);  }
+float64 llvm_int32_to_float64(int32_t v)   { return float64_val((double)v); }
+float32 llvm_uint32_to_float32(uint32_t v) { return float32_val((float)v);  }
+float64 llvm_uint32_to_float64(uint32_t v) { return float64_val((double)v); }
+float32 llvm_int64_to_float32(int64_t v)   { return float32_val((float)v);  }
+float32 llvm_uint64_to_float32(uint64_t v) { return float32_val((float)v);  }
+float64 llvm_int64_to_float64(int64_t v)   { return float64_val((double)v); }
+float64 llvm_uint64_to_float64(uint64_t v) { return float64_val((double)v); }
+
+/*----------------------------------------------------------------------------
+| Single-precision conversion routines (host lrintf/llrintf and C casts).
+*----------------------------------------------------------------------------*/
+int32 llvm_float32_to_int32( float32 a ) { return long_to_int32(lrintf(lfloat(a))); }
+int32 llvm_float32_to_int32_round_to_zero( float32 a ) { return (int32)lfloat(a); }
+int64 llvm_float32_to_int64( float32 a ) { return llrintf(lfloat(a)); }
+int64 llvm_float32_to_int64_round_to_zero( float32 a ) { return (int64)lfloat(a); }
+float64 llvm_float32_to_float64( float32 a ) { return float64_val((double)lfloat(a)); }
+
+/*----------------------------------------------------------------------------
+| Single-precision operations, evaluated with host float arithmetic and libm.
+*----------------------------------------------------------------------------*/
+float32 llvm_float32_round_to_int( float32 a ) { return float32_val(rintf(lfloat(a))); }
+float32 llvm_float32_add( float32 a, float32 b ) { return float32_val(lfloat(a) + lfloat(b)); }
+float32 llvm_float32_sub( float32 a, float32 b ) { return float32_val(lfloat(a) - lfloat(b)); }
+float32 llvm_float32_mul( float32 a, float32 b ) { return float32_val(lfloat(a) * lfloat(b)); }
+float32 llvm_float32_div( float32 a, float32 b ) { return float32_val(lfloat(a) / lfloat(b)); }
+float32 llvm_float32_rem( float32 a, float32 b ) { return float32_val(remainderf(lfloat(a), lfloat(b))); }
+float32 llvm_float32_sqrt( float32 a ) { return float32_val(sqrtf(lfloat(a))); }
+int llvm_float32_eq( float32 a, float32 b ) { return lfloat(a) == lfloat(b); }
+int llvm_float32_le( float32 a, float32 b ) { return lfloat(a) <= lfloat(b); }
+int llvm_float32_lt( float32 a, float32 b ) { return lfloat(a) < lfloat(b); }
+int llvm_float32_unordered( float32 a, float32 b ) { return isunordered(lfloat(a), lfloat(b)); }
+float32 llvm_float32_abs(float32 a) { return float32_val(fabsf(lfloat(a))); }
+float32 llvm_float32_chs(float32 a) { return float32_val(-lfloat(a)); }
+/* Fused multiply-add: a * b + c with one rounding, via fmaf() (was a * b + c). */
+float32 llvm_float32_muladd( float32 a, float32 b, float32 c ) { return float32_val(fmaf(lfloat(a), lfloat(b), lfloat(c))); }
+
+/*----------------------------------------------------------------------------
+| Double-precision conversion routines (host lrint/llrint and C casts).
+*----------------------------------------------------------------------------*/
+int32 llvm_float64_to_int32( float64 a ) { return long_to_int32(lrint(ldouble(a))); }
+int32 llvm_float64_to_int32_round_to_zero( float64 a ) { return (int32)ldouble(a); }
+int64 llvm_float64_to_int64( float64 a ) { return llrint(ldouble(a)); }
+int64 llvm_float64_to_int64_round_to_zero( float64 a ) { return (int64)ldouble(a); }
+float32 llvm_float64_to_float32( float64 a ) { return float32_val((float)ldouble(a)); }
+
+/*----------------------------------------------------------------------------
+| Double-precision operations, evaluated with host double arithmetic and libm.
+*----------------------------------------------------------------------------*/
+float64 llvm_float64_round_to_int( float64 a ) { return float64_val(rint(ldouble(a))); }
+float64 llvm_float64_trunc_to_int( float64 a ) { return float64_val(trunc(ldouble(a))); }
+float64 llvm_float64_add( float64 a, float64 b ) { return float64_val(ldouble(a) + ldouble(b)); }
+float64 llvm_float64_sub( float64 a, float64 b ) { return float64_val(ldouble(a) - ldouble(b)); }
+float64 llvm_float64_mul( float64 a, float64 b ) { return float64_val(ldouble(a) * ldouble(b)); }
+float64 llvm_float64_div( float64 a, float64 b ) { return float64_val(ldouble(a) / ldouble(b)); }
+float64 llvm_float64_rem( float64 a, float64 b ) { return float64_val(remainder(ldouble(a), ldouble(b))); }
+float64 llvm_float64_sqrt( float64 a ) { return float64_val(sqrt(ldouble(a))); }
+int llvm_float64_eq( float64 a, float64 b ) { return ldouble(a) == ldouble(b); }
+int llvm_float64_le( float64 a, float64 b ) { return ldouble(a) <= ldouble(b); }
+int llvm_float64_lt( float64 a, float64 b ) { return ldouble(a) < ldouble(b); }
+int llvm_float64_unordered( float64 a, float64 b ) { return isunordered(ldouble(a), ldouble(b)); }
+float64 llvm_float64_abs(float64 a) { return float64_val(fabs(ldouble(a))); }
+float64 llvm_float64_chs(float64 a) { return float64_val(-ldouble(a)); }
+/* Fused multiply-add: a * b + c with one rounding, via fma() (was a * b + c). */
+float64 llvm_float64_muladd( float64 a, float64 b, float64 c ) { return float64_val(fma(ldouble(a), ldouble(b), ldouble(c))); }
+
+/* Quiet an sNaN (exp all ones, bit 22 clear, low mantissa != 0) by setting bit 22. */
+float32 llvm_float32_maybe_silence_nan( float32 a ) {
+    uint32_t bits = ((llvm_float32)(a)).u;
+    if (((bits >> 22) & 0x1FF) != 0x1FE || (bits & 0x003FFFFF) == 0)
+        return a;
+    bits |= (1 << 22);
+    return float32_val(bits);
+}
+/* Quiet an sNaN (exp all ones, bit 51 clear, low mantissa != 0) by setting bit 51. */
+float64 llvm_float64_maybe_silence_nan( float64 a ) {
+    uint64_t bits = ((llvm_float64)(a)).u;
+    if (((bits >> 51) & 0xFFF) != 0xFFE || (bits & 0x0007FFFFFFFFFFFFLL) == 0)
+        return a;
+    bits |= 0x0008000000000000LL;
+    return float64_val(bits);
+}
+
+#undef float32_val
+#undef float64_val
+#undef lfloat
+#undef ldouble
+
+#endif /* !SOFTFLOAT_NATIVE_H */
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/hqemu-helper.c b/llvm/hqemu-helper.c
new file mode 100644
index 0000000..6325716
--- /dev/null
+++ b/llvm/hqemu-helper.c
@@ -0,0 +1,77 @@
+#include "cpu.h"
+#include "tcg.h"
+#include "exec/helper-proto.h"
+#include "hqemu.h"
+#include "fpu/softfloat-native.h"
+
+CPUArchState basereg;
+target_ulong pcid;
+/* NOTE(review): basereg, pcid and xmm_reg are unused in this file -- confirm use. */
+#if defined(TARGET_I386)
+XMMReg xmm_reg;
+#endif
+
+extern TranslationBlock *tbs; /* indexed by block id in helper_verify_tb() */
+/* Lookup/validation routines declared here; their definitions are not in this file. */
+void *ibtc_lookup(CPUArchState *env);
+void *cpbl_lookup(CPUArchState *env);
+int cpbl_validate(CPUArchState *env, target_ulong pc, int id);
+
+/* This helper is a hack that keeps helper symbols in the LLVM bitcode file by
+ * referencing them here. If a target reports a missing function or variable
+ * symbol, reference that symbol in this helper as well. */
+void helper_export_hqemu(CPUArchState *env)
+{
+    helper_lookup_ibtc(env);
+    helper_lookup_cpbl(env);
+    helper_validate_cpbl(env, 0, 0);
+
+#if defined(CONFIG_SOFTMMU) && defined(CONFIG_LLVM)
+    target_ulong ptr = 0;
+    llvm_ret_ldub_mmu(env, ptr, 0);
+    llvm_le_lduw_mmu(env, ptr, 0);
+    llvm_le_ldul_mmu(env, ptr, 0);
+    llvm_le_ldq_mmu(env, ptr, 0);
+    llvm_be_lduw_mmu(env, ptr, 0);
+    llvm_be_ldul_mmu(env, ptr, 0);
+    llvm_be_ldq_mmu(env, ptr, 0);
+    llvm_ret_ldsb_mmu(env, ptr, 0);
+    llvm_le_ldsw_mmu(env, ptr, 0);
+    llvm_le_ldsl_mmu(env, ptr, 0);
+    llvm_be_ldsw_mmu(env, ptr, 0);
+    llvm_be_ldsl_mmu(env, ptr, 0);
+    llvm_ret_stb_mmu(env, ptr, 0, 0);
+    llvm_le_stw_mmu(env, ptr, 0, 0);
+    llvm_le_stl_mmu(env, ptr, 0, 0);
+    llvm_le_stq_mmu(env, ptr, 0, 0);
+    llvm_be_stw_mmu(env, ptr, 0, 0);
+    llvm_be_stl_mmu(env, ptr, 0, 0);
+    llvm_be_stq_mmu(env, ptr, 0, 0);
+#endif
+}
+/* Debug check: report on stderr when block `id' of tbs[] is BLOCK_INVALID. */
+void helper_verify_tb(CPUArchState *env, int id)
+{
+    static TranslationBlock *last_tb;   /* block checked by the previous call */
+    TranslationBlock *tb = &tbs[id];
+    if (tb->mode == BLOCK_INVALID) {
+        fprintf(stderr, "%s: tb=%p pc=" TARGET_FMT_lx " last_pc="
+                TARGET_FMT_lx "\n", __func__, (void *)tb, tb->pc, /* %p: void * */
+                (last_tb) ? last_tb->pc : (target_ulong)-1); /* -1: no previous */
+    }
+    last_tb = tb;
+}
+
+/* Profiling hook for LLVM translated code: counter_p is treated as an array
+ * of uint64_t counter arrays, one per CPU; the array selected by the current
+ * CPU's cpu_index gets its element `idx' incremented. */
+void helper_profile_exec(CPUArchState *env, void *counter_p, int idx)
+{
+    uint64_t *row = ((uint64_t **)counter_p)[ENV_GET_CPU(env)->cpu_index];
+
+    row[idx]++;
+}
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/hqemu.mk b/llvm/hqemu.mk
new file mode 100644
index 0000000..01de6d6
--- /dev/null
+++ b/llvm/hqemu.mk
@@ -0,0 +1,191 @@
+# Makefile for HQEMU.
+
+QEMU_CFLAGS += -I$(SRC_PATH)/llvm -I$(SRC_PATH)/llvm/include -I$(SRC_PATH)/llvm/atomic
+QEMU_CXXFLAGS += -std=c++11 -Wno-narrowing
+obj-y += llvm/optimization.o llvm/tracer.o llvm/utils.o llvm/hqemu-helper.o
+
+# LLVM
+ifdef CONFIG_LLVM
+
+LLVM_EXTRA_FLAGS += -Wall -DNEED_CPU_H -D$(LLVM_VERSION) -I..
+
+LLVM_CXXFLAGS := $(patsubst -Wcast-qual, ,$(LLVM_CXXFLAGS))
+LLVM_CXXFLAGS := $(patsubst -fno-exceptions, ,$(LLVM_CXXFLAGS))
+LLVM_CXXFLAGS := $(patsubst -pedantic, ,$(LLVM_CXXFLAGS))
+LLVM_CXXFLAGS += -Wno-unused-local-typedefs -Wno-cast-qual -fno-rtti
+LLVM_CFLAGS += $(patsubst -O2, ,$(patsubst -g, ,$(CFLAGS)))
+LLVM_CFLAGS += $(LLVM_EXTRA_FLAGS) $(QEMU_INCLUDES) \
+	       -I$(SRC_PATH)/linux-user -I$(SRC_PATH)/linux-user/$(TARGET_ABI_DIR) \
+	       -I$(SRC_PATH)/target-$(TARGET_BASE_ARCH) -I$(SRC_PATH)/llvm \
+	       -I$(SRC_PATH)/llvm/atomic -I$(SRC_PATH)/llvm/include/pmu
+LLVM_CFLAGS := $(patsubst -pedantic, ,$(LLVM_CFLAGS))
+LLVM_CFLAGS := $(patsubst -g, ,$(LLVM_CFLAGS))
+
+PASS := llvm/pass
+ANALYSIS := llvm/analysis
+HPM := llvm/pmu
+QEMU_CXXFLAGS += $(LLVM_CXXFLAGS) $(LLVM_EXTRA_FLAGS) -Wno-undef
+LDFLAGS += $(LLVM_LDFLAGS)
+LIBS += $(LLVM_LIBS) -ldl -lz -lncurses
+
+
+ifeq ($(CONFIG_WIN32), y)
+LIBS += -lpthread -limagehlp -lpsapi
+endif
+
+obj-y += llvm/xml/tinyxml2.o
+obj-y += llvm/llvm.o              \
+         llvm/llvm-translator.o   \
+         llvm/llvm-opc.o          \
+         llvm/llvm-opc-vector.o   \
+         llvm/llvm-opc-mmu.o      \
+         llvm/llvm-debug.o        \
+         llvm/llvm-target.o       \
+         llvm/llvm-soft-perfmon.o \
+         llvm/llvm-hard-perfmon.o \
+         llvm/llvm-annotate.o
+obj-y += $(PASS)/ProfileExec.o        \
+         $(PASS)/ReplaceIntrinsic.o   \
+         $(PASS)/CombineGuestMemory.o \
+         $(PASS)/CombineCasts.o       \
+         $(PASS)/CombineZExtTrunc.o   \
+         $(PASS)/FastMathPass.o       \
+         $(PASS)/StateMappingPass.o   \
+         $(PASS)/RedundantStateElimination.o   \
+         $(PASS)/SimplifyPointer.o
+obj-y += $(ANALYSIS)/InnerLoopAnalysis.o
+
+# HPM
+obj-y += $(HPM)/pmu.o \
+         $(HPM)/pmu-events.o
+
+ifeq ($(ARCH),$(filter $(ARCH),i386 x86_64))
+obj-y += $(HPM)/x86/x86-events.o
+endif
+ifeq ($(ARCH),$(filter $(ARCH),arm aarch64))
+obj-y += $(HPM)/arm/arm-events.o
+endif
+ifeq ($(ARCH),$(filter $(ARCH),ppc64))
+obj-y += $(HPM)/ppc/ppc-events.o
+endif
+
+#
+# LLVM bitcode file: the helper modules listed in LLVM_HELPER are linked into
+# $(LLVM_BITCODE). MMU_HELPER names TARGET_PATH before it is assigned below;
+# that works because '=' variables are expanded when used, not when defined.
+ifdef CONFIG_SOFTMMU
+BCSUF = _softmmu
+MMU_HELPER = $(TARGET_PATH)/mmu_helper.bc
+endif
+
+LLVM_BITCODE = llvm_helper_${TARGET_NAME}${BCSUF}.bc
+TARGET_PATH = target-$(TARGET_BASE_ARCH)
+LLVM_HELPER += tcg-runtime.bc llvm/hqemu-helper.bc $(TARGET_PATH)/helper.bc
+
+ifeq ($(TARGET_I386), y)
+LLVM_HELPER += $(TARGET_PATH)/cc_helper.bc   \
+	       $(TARGET_PATH)/int_helper.bc  \
+	       $(TARGET_PATH)/smm_helper.bc  \
+	       $(TARGET_PATH)/excp_helper.bc \
+	       $(TARGET_PATH)/mem_helper.bc  \
+	       $(TARGET_PATH)/svm_helper.bc  \
+	       $(TARGET_PATH)/fpu_helper.bc  \
+	       $(TARGET_PATH)/misc_helper.bc \
+	       $(TARGET_PATH)/seg_helper.bc  \
+	       $(TARGET_PATH)/bpt_helper.bc
+endif
+ifeq ($(TARGET_X86_64), y)
+LLVM_HELPER += $(TARGET_PATH)/cc_helper.bc   \
+	       $(TARGET_PATH)/int_helper.bc  \
+	       $(TARGET_PATH)/smm_helper.bc  \
+	       $(TARGET_PATH)/excp_helper.bc \
+	       $(TARGET_PATH)/mem_helper.bc  \
+	       $(TARGET_PATH)/svm_helper.bc  \
+	       $(TARGET_PATH)/fpu_helper.bc  \
+	       $(TARGET_PATH)/misc_helper.bc \
+	       $(TARGET_PATH)/seg_helper.bc
+endif
+ifeq ($(TARGET_ALPHA), y)
+LLVM_HELPER += $(TARGET_PATH)/fpu_helper.bc \
+	       $(TARGET_PATH)/int_helper.bc \
+	       $(TARGET_PATH)/mem_helper.bc \
+	       $(TARGET_PATH)/sys_helper.bc
+endif
+ifeq ($(TARGET_ARM), y)
+LLVM_HELPER += $(TARGET_PATH)/op_helper.bc \
+	       $(TARGET_PATH)/neon_helper.bc
+endif
+ifeq ($(TARGET_AARCH64), y)
+LLVM_HELPER += $(TARGET_PATH)/op_helper.bc \
+	       $(TARGET_PATH)/helper-a64.bc \
+	       $(TARGET_PATH)/neon_helper.bc
+endif
+ifeq ($(TARGET_MICROBLAZE), y)
+LLVM_HELPER += $(TARGET_PATH)/op_helper.bc
+endif
+ifeq ($(TARGET_MIPS), y)
+LLVM_HELPER += $(TARGET_PATH)/op_helper.bc  \
+	       $(TARGET_PATH)/dsp_helper.bc \
+	       $(TARGET_PATH)/lmi_helper.bc
+endif
+ifeq ($(TARGET_OPENRISC), y)
+LLVM_HELPER += $(TARGET_PATH)/exception_helper.bc \
+	       $(TARGET_PATH)/fpu_helper.bc \
+	       $(TARGET_PATH)/interrupt_helper.bc \
+	       $(TARGET_PATH)/int_helper.bc \
+	       $(TARGET_PATH)/sys_helper.bc \
+	       $(MMU_HELPER)
+endif
+ifeq ($(TARGET_PPC), y)
+LLVM_HELPER += $(TARGET_PATH)/excp_helper.bc \
+	       $(TARGET_PATH)/int_helper.bc  \
+	       $(TARGET_PATH)/misc_helper.bc \
+	       $(TARGET_PATH)/fpu_helper.bc  \
+	       $(TARGET_PATH)/mem_helper.bc  \
+	       $(TARGET_PATH)/timebase_helper.bc \
+	       $(MMU_HELPER)
+endif
+ifeq ($(TARGET_PPC64), y)
+LLVM_HELPER += $(TARGET_PATH)/excp_helper.bc \
+	       $(TARGET_PATH)/int_helper.bc  \
+	       $(TARGET_PATH)/misc_helper.bc \
+	       $(TARGET_PATH)/fpu_helper.bc  \
+	       $(TARGET_PATH)/mem_helper.bc  \
+	       $(TARGET_PATH)/timebase_helper.bc \
+	       $(MMU_HELPER)
+endif
+ifeq ($(TARGET_SH4), y)
+LLVM_HELPER += $(TARGET_PATH)/op_helper.bc
+endif
+ifeq ($(TARGET_SPARC), y)
+LLVM_HELPER += $(TARGET_PATH)/cc_helper.bc    \
+	       $(TARGET_PATH)/fop_helper.bc   \
+	       $(TARGET_PATH)/int32_helper.bc \
+	       $(TARGET_PATH)/ldst_helper.bc  \
+	       $(TARGET_PATH)/vis_helper.bc   \
+	       $(TARGET_PATH)/win_helper.bc   \
+	       $(MMU_HELPER)
+endif
+ifeq ($(TARGET_SPARC64), y)
+LLVM_HELPER += $(TARGET_PATH)/cc_helper.bc    \
+	       $(TARGET_PATH)/fop_helper.bc   \
+	       $(TARGET_PATH)/int64_helper.bc \
+	       $(TARGET_PATH)/ldst_helper.bc  \
+	       $(TARGET_PATH)/vis_helper.bc   \
+	       $(TARGET_PATH)/win_helper.bc   \
+	       $(MMU_HELPER)
+endif
+
+LOCAL_BC := clang
+LOCAL_BC_CFLAGS := -S -emit-llvm $(BCFLAGS) -I$(SRC_PATH)/llvm/include $(LLVM_CFLAGS) \
+	           -Wno-missing-prototypes -Wno-sign-compare -Wno-unused-function \
+		   -Wno-constant-conversion
+# Compile one C source to LLVM IR with $(LOCAL_BC) and $(LOCAL_BC_CFLAGS).
+%.bc: %.c
+	$(call quiet-command,$(LOCAL_BC) $(LOCAL_BC_CFLAGS) -c -o $@ $<, "  LCC   $(TARGET_DIR)$@")
+
+# Link every module in $(LLVM_HELPER) into the per-target bitcode file.
+$(LLVM_BITCODE): $(LLVM_HELPER)
+	$(call quiet-command,llvm-link -o $@ $^, "  LCC   $(TARGET_DIR)$@")
+
+endif
diff --git a/llvm/include/InnerLoopAnalysis.h b/llvm/include/InnerLoopAnalysis.h
new file mode 100644
index 0000000..f11225d
--- /dev/null
+++ b/llvm/include/InnerLoopAnalysis.h
@@ -0,0 +1,291 @@
+/*
+ *  (C) 2016 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef __INNERLOOPANALYSIS_H
+#define __INNERLOOPANALYSIS_H
+
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm-types.h"
+
+
+class InductionDesc {
+    /* Start value. */
+    Value *StartValue;
+    /* Step value. */
+    const SCEV *Step;
+
+public:
+    InductionDesc() : StartValue(nullptr), Step(nullptr) {}
+    InductionDesc(Value *Start, const SCEV *Step)
+        : StartValue(Start), Step(Step) {}
+
+    Value *getStartValue() const { return StartValue; }
+    const SCEV *getStep() const { return Step; }
+};
+
+class ReductionDesc {
+public:
+
+    enum ReductionKind {
+        NoReduction, /* Not a reduction. */
+        IntegerAdd,  /* Sum of numbers. */
+        IntegerMult, /* Product of numbers. */
+        IntegerOr,   /* Bitwise or logical OR of numbers. */
+        IntegerAnd,  /* Bitwise or logical AND of numbers. */
+        IntegerXor,  /* Bitwise or logical XOR of numbers. */
+        FloatAdd,    /* Sum of float numbers. */
+        FloatMult,   /* Product of float numbers. */
+    };
+
+    ReductionDesc()
+        : StartValue(nullptr), LoopExitInstr(nullptr),
+          Kind(ReductionKind::NoReduction), Ty(nullptr) {}
+    ReductionDesc(Value *Start, Instruction *Exit, ReductionKind K, Type *Ty)
+        : StartValue(Start), LoopExitInstr(Exit), Kind(K), Ty(Ty) {}
+
+    Value *getStartValue() const { return StartValue; }
+    Value *getNextValue() const { return LoopExitInstr; }
+    Instruction *getLoopExitInstr() { return LoopExitInstr; }
+    ReductionKind getReductionKind() { return Kind; }
+    Type *getScalarType() { return Ty; }
+
+private:
+    /* The starting value of the recurrence. */
+    Value *StartValue;
+    /* The instruction whose value is used outside the loop. */
+    Instruction *LoopExitInstr;
+    /* The kind of the recurrence.*/
+    ReductionKind Kind;
+    /* The scalar type. */
+    Type *Ty;
+};
+
+/*
+ * The InnerLoop class represents a single innermost loop. The InnerLoop has
+ * a special shape that is specific to the DBT decoded guest loop, and its loop
+ * definition differs from that of a natural loop, e.g., latch and exiting block.
+ */
+class InnerLoop {
+public:
+    typedef std::map<PHINode *, InductionDesc> InductionList;
+    typedef std::map<PHINode *, ReductionDesc> ReductionList;
+
+private:
+    Loop &TheLoop;
+
+    /* The list of blocks in this loop. First entry is the header node. */
+    std::vector<BasicBlock *> Blocks;
+    SmallPtrSet<const BasicBlock *, 8> DenseBlockSet;
+
+    std::vector<BasicBlock *> Latches;
+    std::map<BasicBlock *, BasicBlock *> SplitLatches;
+
+    bool UnknownPhi;
+    InductionList Inductions;
+    ReductionList Reductions;
+
+    void addInduction(PHINode *Phi, Value *Start, const SCEV *Step) {
+        Inductions[Phi] = InductionDesc(Start, Step);
+    }
+
+    void addReduction(PHINode *Phi, Value *Start, Instruction *Exit,
+                      ReductionDesc::ReductionKind K, Type *Ty) {
+        Reductions[Phi] = ReductionDesc(Start, Exit, K, Ty);
+    }
+
+    InnerLoop(const InnerLoop &) = delete;
+    const InnerLoop& operator=(const InnerLoop &) = delete;
+
+    friend class InnerLoopAnalysis;
+
+public:
+    InnerLoop(Loop *loop);
+    ~InnerLoop() {}
+
+    Loop &getLoop() const { return TheLoop; }
+
+    BasicBlock *getHeader() const { return Blocks.front(); }
+
+    /* Return true if the specified basic block is in this loop. */
+    bool contains(const BasicBlock *BB) const {
+        return DenseBlockSet.count(BB);
+    }
+
+    /* Return true if the specified instruction is in this loop. */
+    bool contains(const Instruction *Inst) const {
+        return contains(Inst->getParent());
+    }
+
+    /* Get a list of the basic blocks which make up this loop. */
+    typedef typename std::vector<BasicBlock*>::const_iterator block_iterator;
+    const std::vector<BasicBlock*> &getBlocks() const { return Blocks; }
+    block_iterator block_begin() const { return Blocks.begin(); }
+    block_iterator block_end() const { return Blocks.end(); }
+    inline iterator_range<block_iterator> blocks() const {
+        return make_range(block_begin(), block_end());
+    }
+
+    /* Get the number of blocks in this loop in constant time. */
+    unsigned getNumBlocks() const { return Blocks.size(); }
+
+    /* True if terminator in the block can branch to another block that is 
+     * outside of the current loop. */
+    bool isLoopExiting(BasicBlock *BB) const;
+
+    /* Calculate the number of back edges to the loop header. */
+    unsigned getNumBackEdges() const;
+
+    /* Return all blocks inside the loop that have successors outside of the
+     * loop. */
+    void getExitingBlocks(SmallVectorImpl<BasicBlock *> &ExitingBlocks) const;
+
+    /* If getExitingBlocks would return exactly one block, return that block.
+     * Otherwise return null. */
+    BasicBlock *getExitingBlock() const;
+
+    /* Return all of the successor blocks of this loop. */
+    void getExitBlocks(SmallVectorImpl<BasicBlock *> &ExitBlocks) const;
+
+    /* If getExitBlocks would return exactly one block, return that block.
+     * Otherwise return null. */
+    BasicBlock *getExitBlock() const;
+
+    /* If there is a preheader for this loop, return it. A loop has a preheader
+     * if there is only one edge to the header of the loop from outside of the
+     * loop. If this is the case, the block branching to the header of the loop
+     * is the preheader node.
+     *
+     * This method returns null if there is no preheader for the loop. */
+    BasicBlock *getLoopPreheader() const;
+
+    /* If the given loop's header has exactly one unique predecessor outside the
+     * loop, return it. Otherwise return null.
+     * This is less strict than the loop "preheader" concept, which requires
+     * the predecessor to have exactly one successor. */
+    BasicBlock *getLoopPredecessor() const;
+
+    unsigned getNumLoopLatches()  const { return Latches.size(); }
+    unsigned getNumSplitLatches() const { return SplitLatches.size(); }
+
+    /* Return all loop latch blocks of this loop. A latch block is a block that
+     * contains a branch back to the header. */
+    void getLoopLatches(SmallVectorImpl<BasicBlock *> &LoopLatches) const {
+        for (auto I : Latches)
+            LoopLatches.push_back(I);
+    }
+
+    /* If there is a latch tail, return it. */
+    BasicBlock *getSingleLatchTail() const {
+        return (SplitLatches.size() == 1) ? SplitLatches.begin()->first :
+                                            nullptr;
+    }
+
+    /* If there is a latch head, return it. */
+    BasicBlock *getSingleLatchHead() const {
+        return (SplitLatches.size() == 1) ? SplitLatches.begin()->second :
+                                            nullptr;
+    }
+
+    /* Return all of the latch tails of this loop. */
+    void getLatchTails(SmallVectorImpl<BasicBlock *> &LatchTails) const {
+        for (auto &I : SplitLatches)
+            LatchTails.push_back(I.first);
+    }
+
+    /* Given a latch tail, return its latch head. */
+    BasicBlock *getLatchHead(BasicBlock *BB) {
+        if (SplitLatches.find(BB) == SplitLatches.end())
+            return nullptr;
+        return SplitLatches[BB];
+    }
+
+    /* If the given phi is an induction of the loop, return the induction. */
+    InductionDesc *getInduction(PHINode *Phi) {
+        if (Inductions.find(Phi) == Inductions.end())
+            return nullptr;
+        return &Inductions[Phi];
+    }
+
+    /* If the given phi is a reduction of the loop, return the reduction. */
+    ReductionDesc *getReduction(PHINode *Phi) {
+        if (Reductions.find(Phi) == Reductions.end())
+            return nullptr;
+        return &Reductions[Phi];
+    }
+
+    /* Return true if the loop has unknown phi(s). A loop has unknown phi(s) if
+     * a phi node is not identified, or the loop has no preheader or latch tail.
+     * 
+     * If the loop has unknown phi(s), the data structure of Inductions and
+     * Reductions can be undefined. */
+    bool hasUnknownPhi() { return UnknownPhi; }
+
+    /* Return true if the instruction `From' can flow to instruction `To' in
+     * the loop. */
+    bool isReachable(Instruction *From, Instruction *To);
+};
+
+class InnerLoopAnalysis {
+    std::vector<InnerLoop *> InnerLoops;
+
+    void analyzePhi(InnerLoop &TheLoop, ScalarEvolution *SE);
+    bool analyzeInduction(InnerLoop &TheLoop, ScalarEvolution *SE, PHINode *Phi);
+    bool analyzeReduction(InnerLoop &TheLoop, PHINode *Phi);
+
+public:
+    InnerLoopAnalysis() {}
+    ~InnerLoopAnalysis() { releaseMemory(); }
+
+    void releaseMemory() {
+        while (!InnerLoops.empty()) {
+            InnerLoop *L = InnerLoops.back();
+            InnerLoops.pop_back();
+            delete L;
+        }
+    }
+    void print(raw_ostream &OS, const Module * = nullptr) const {}
+    void verify() const {}
+    void analyze(LoopInfo *LI, ScalarEvolution *SE);
+
+    /* iterator/begin/end - The interface to the innermost loops. */
+    typedef typename std::vector<InnerLoop *>::const_iterator iterator;
+    typedef typename std::vector<InnerLoop *>::const_reverse_iterator
+        reverse_iterator;
+    iterator begin() const { return InnerLoops.begin(); }
+    iterator end() const { return InnerLoops.end(); }
+    reverse_iterator rbegin() const { return InnerLoops.rbegin(); }
+    reverse_iterator rend() const { return InnerLoops.rend(); }
+    bool empty() const { return InnerLoops.empty(); }
+    unsigned size() { return InnerLoops.size(); }
+};
+
+/*
+ * InnerLoopAnalysisWrapperPass Pass
+ */
+class InnerLoopAnalysisWrapperPass : public FunctionPass {
+    InnerLoopAnalysis LA;
+
+public:
+    static char ID;
+    InnerLoopAnalysisWrapperPass() : FunctionPass(ID) {
+        initializeInnerLoopAnalysisWrapperPassPass(*PassRegistry::getPassRegistry());
+    }
+
+    InnerLoopAnalysis &getLoopAnalysis() { return LA; }
+    const InnerLoopAnalysis &getLoopAnalysis() const { return LA; }
+
+    void releaseMemory() override;
+    void getAnalysisUsage(AnalysisUsage &AU) const override;
+    void print(raw_ostream &OS, const Module * = nullptr) const override;
+    void verifyAnalysis() const override;
+    bool runOnFunction(Function &F) override;
+};
+
+#endif
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/include/JIT.h b/llvm/include/JIT.h
new file mode 100644
index 0000000..a1b3c8d
--- /dev/null
+++ b/llvm/include/JIT.h
@@ -0,0 +1,228 @@
+//===-- JIT.h - Class definition for the JIT --------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the top-level JIT data structure.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __JIT_H
+#define __JIT_H
+
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/PassManager.h"
+
+namespace llvm {
+
+class Function;
+struct JITEvent_EmittedFunctionDetails;
+class MachineCodeEmitter;
+class MachineCodeInfo;
+class TargetJITInfo;
+class TargetMachine;
+
+/// JITState - Holds the pass manager, its module and the pending functions.
+class JITState {
+private:
+  FunctionPassManager PM;  // Passes to compile a function
+  Module *M;               // Module used to create the PM
+
+  /// PendingFunctions - Functions which have not been code generated yet, but
+  /// were called from a function being code generated.
+  std::vector<AssertingVH<Function> > PendingFunctions;
+
+public:
+  explicit JITState(Module *M) : PM(M), M(M) {}
+
+  FunctionPassManager &getPM() {
+    return PM;
+  }
+
+  Module *getModule() const { return M; }
+  std::vector<AssertingVH<Function> > &getPendingFunctions() {
+    return PendingFunctions;
+  }
+};
+
+class JIT : public ExecutionEngine {
+  /// types
+  typedef ValueMap<const BasicBlock *, void *>
+      BasicBlockAddressMapTy;
+  /// data
+  TargetMachine &TM;       // The current target we are compiling to
+  TargetJITInfo &TJI;      // The JITInfo for the target we are compiling to
+  JITCodeEmitter *JCE;     // JCE object
+  JITMemoryManager *JMM;
+  std::vector<JITEventListener*> EventListeners;
+
+  /// AllocateGVsWithCode - Some applications require that global variables and
+  /// code be allocated into the same region of memory, in which case this flag
+  /// should be set to true.  Doing so breaks freeMachineCodeForFunction.
+  bool AllocateGVsWithCode;
+
+  /// True while the JIT is generating code.  Used to assert against recursive
+  /// entry.
+  bool isAlreadyCodeGenerating;
+
+  JITState *jitstate;
+
+  /// BasicBlockAddressMap - A mapping between LLVM basic blocks and their
+  /// actualized version, only filled for basic blocks that have their address
+  /// taken.
+  BasicBlockAddressMapTy BasicBlockAddressMap;
+
+
+  JIT(Module *M, TargetMachine &tm, TargetJITInfo &tji,
+      JITMemoryManager *JMM, bool AllocateGVsWithCode);
+public:
+  ~JIT();
+
+  static void Register() {
+    JITCtor = createJIT;
+  }
+
+  /// getJITInfo - Return the target JIT information structure.
+  ///
+  TargetJITInfo &getJITInfo() const { return TJI; }
+
+  /// create - Create and return a new JIT compiler if there is one available
+  /// for the current target.  Otherwise, return null.
+  ///
+  static ExecutionEngine *create(Module *M,
+                                 std::string *Err,
+                                 JITMemoryManager *JMM,
+                                 CodeGenOpt::Level OptLevel =
+                                   CodeGenOpt::Default,
+                                 bool GVsWithCode = true,
+                                 Reloc::Model RM = Reloc::Default,
+                                 CodeModel::Model CMM = CodeModel::JITDefault) {
+    return ExecutionEngine::createJIT(M, Err, JMM, OptLevel, GVsWithCode,
+                                      RM, CMM);
+  }
+
+  void addModule(Module *M) override;
+
+  /// removeModule - Remove a Module from the list of modules.  Returns true if
+  /// M is found.
+  bool removeModule(Module *M) override;
+
+  /// runFunction - Start execution with the specified function and arguments.
+  ///
+  GenericValue runFunction(Function *F,
+                           const std::vector<GenericValue> &ArgValues) override;
+
+  /// getPointerToNamedFunction - This method returns the address of the
+  /// specified function by using the MemoryManager. As such it is only
+  /// useful for resolving library symbols, not code generated symbols.
+  ///
+  /// If AbortOnFailure is false and no function with the given name is
+  /// found, this function silently returns a null pointer. Otherwise,
+  /// it prints a message to stderr and aborts.
+  ///
+  void *getPointerToNamedFunction(const std::string &Name,
+                                  bool AbortOnFailure = true) override;
+
+  // CompilationCallback - Invoked the first time that a call site is found,
+  // which causes lazy compilation of the target function.
+  //
+  static void CompilationCallback();
+
+  /// getPointerToFunction - This returns the address of the specified function,
+  /// compiling it if necessary.
+  ///
+  void *getPointerToFunction(Function *F) override;
+
+  /// addPointerToBasicBlock - Adds address of the specific basic block.
+  void addPointerToBasicBlock(const BasicBlock *BB, void *Addr);
+
+  /// clearPointerToBasicBlock - Removes address of specific basic block.
+  void clearPointerToBasicBlock(const BasicBlock *BB);
+
+  /// getPointerToBasicBlock - This returns the address of the specified basic
+  /// block, assuming function is compiled.
+  void *getPointerToBasicBlock(BasicBlock *BB) override;
+
+  /// getOrEmitGlobalVariable - Return the address of the specified global
+  /// variable, possibly emitting it to memory if needed.  This is used by the
+  /// Emitter.
+  void *getOrEmitGlobalVariable(const GlobalVariable *GV) override;
+
+  /// getPointerToFunctionOrStub - If the specified function has been
+  /// code-gen'd, return a pointer to the function.  If not, compile it, or use
+  /// a stub to implement lazy compilation if available.
+  ///
+  void *getPointerToFunctionOrStub(Function *F) override;
+
+  /// recompileAndRelinkFunction - This method is used to force a function
+  /// which has already been compiled, to be compiled again, possibly
+  /// after it has been modified. Then the entry to the old copy is overwritten
+  /// with a branch to the new copy. If there was no old copy, this acts
+  /// just like JIT::getPointerToFunction().
+  ///
+  void *recompileAndRelinkFunction(Function *F) override;
+
+  /// freeMachineCodeForFunction - deallocate memory used to code-generate this
+  /// Function.
+  ///
+  void freeMachineCodeForFunction(Function *F) override;
+
+  /// addPendingFunction - while jitting non-lazily, a called but non-codegen'd
+  /// function was encountered.  Add it to a pending list to be processed after
+  /// the current function.
+  ///
+  void addPendingFunction(Function *F);
+
+  /// getCodeEmitter - Return the code emitter this JIT is emitting into.
+  ///
+  JITCodeEmitter *getCodeEmitter() const { return JCE; }
+
+  static ExecutionEngine *createJIT(Module *M,
+                                    std::string *ErrorStr,
+                                    JITMemoryManager *JMM,
+                                    bool GVsWithCode,
+                                    TargetMachine *TM);
+
+  // Run the JIT on F and return information about the generated code
+  void runJITOnFunction(Function *F, MachineCodeInfo *MCI = nullptr) override;
+
+  void RegisterJITEventListener(JITEventListener *L) override;
+  void UnregisterJITEventListener(JITEventListener *L) override;
+
+  TargetMachine *getTargetMachine() override { return &TM; }
+
+  /// These functions correspond to the methods on JITEventListener.  They
+  /// iterate over the registered listeners and call the corresponding method on
+  /// each.
+  void NotifyFunctionEmitted(
+      const Function &F, void *Code, size_t Size,
+      const JITEvent_EmittedFunctionDetails &Details);
+  void NotifyFreeingMachineCode(void *OldPtr);
+
+  BasicBlockAddressMapTy &
+  getBasicBlockAddressMap() {
+    return BasicBlockAddressMap;
+  }
+
+
+private:
+  static JITCodeEmitter *createEmitter(JIT &J, JITMemoryManager *JMM,
+                                       TargetMachine &tm);
+  void runJITOnFunctionUnlocked(Function *F);
+  void updateFunctionStubUnlocked(Function *F);
+  void jitTheFunctionUnlocked(Function *F);
+
+protected:
+
+  /// getMemoryForGV - Allocate memory for a global variable.
+  char* getMemoryForGV(const GlobalVariable* GV) override;
+
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/llvm/include/JITMemoryManager.h b/llvm/include/JITMemoryManager.h
new file mode 100644
index 0000000..301d227
--- /dev/null
+++ b/llvm/include/JITMemoryManager.h
@@ -0,0 +1,318 @@
+//===-- JITMemoryManager.h - Memory Allocator for JIT'd code --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the DefaultJITMemoryManager class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __JITMEMORYMANAGER_H
+#define __JITMEMORYMANAGER_H
+
+#include <sys/mman.h>
+#include "llvm/Support/DynamicLibrary.h"
+#include "llvm/ExecutionEngine/JITMemoryManager.h"
+#include "llvm-debug.h"
+#include "utils.h"
+
+using namespace llvm;
+
+#define MIN_CODE_CACHE_SIZE         (1 * 1024 * 1024)
+#define DEFAULT_GLOBAL_SIZE         (64 * 1024)
+#define DEFAULT_THRESHOLD           (32 * 1024)
+
+
+// AtExitHandlers - List of functions to call when the program exits,
+// registered with the atexit() library function.
+static std::vector<void (*)()> AtExitHandlers;
+
+/// runAtExitHandlers - Run the functions stored in AtExitHandlers (filled by
+/// jit_atexit), most recently registered first, removing each one from the
+/// list before it is called.
+///
+static void runAtExitHandlers() {
+  while (!AtExitHandlers.empty()) {
+    void (*Fn)() = AtExitHandlers.back();
+    AtExitHandlers.pop_back();
+    Fn();
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Function stubs that are invoked instead of certain library calls
+//
+// Force the following functions to be linked in to anything that uses the
+// JIT. This is a hack designed to work around the all-too-clever Glibc
+// strategy of making these functions work differently when inlined vs. when
+// not inlined, and hiding their real definitions in a separate archive file
+// that the dynamic linker can't see. For more info, search for
+// 'libc_nonshared.a' on Google, or read http://llvm.org/PR274.
+#if defined(__linux__) && defined(__GLIBC__)
+/* stat functions are redirecting to __xstat with a version number.  On x86-64
+ * linking with libc_nonshared.a and -Wl,--export-dynamic doesn't make 'stat'
+ * available as an exported symbol, so we have to add it explicitly.
+ */
+namespace {
+class StatSymbols {  // registers the symbols below by name
+public:
+  StatSymbols() {
+    sys::DynamicLibrary::AddSymbol("stat", (void*)(intptr_t)stat);
+    sys::DynamicLibrary::AddSymbol("fstat", (void*)(intptr_t)fstat);
+    sys::DynamicLibrary::AddSymbol("lstat", (void*)(intptr_t)lstat);
+    sys::DynamicLibrary::AddSymbol("stat64", (void*)(intptr_t)stat64);
+    sys::DynamicLibrary::AddSymbol("\x1stat64", (void*)(intptr_t)stat64);
+    sys::DynamicLibrary::AddSymbol("\x1open64", (void*)(intptr_t)open64);
+    sys::DynamicLibrary::AddSymbol("\x1lseek64", (void*)(intptr_t)lseek64);
+    sys::DynamicLibrary::AddSymbol("fstat64", (void*)(intptr_t)fstat64);
+    sys::DynamicLibrary::AddSymbol("lstat64", (void*)(intptr_t)lstat64);
+    sys::DynamicLibrary::AddSymbol("atexit", (void*)(intptr_t)atexit);
+    sys::DynamicLibrary::AddSymbol("mknod", (void*)(intptr_t)mknod);
+  }
+};
+}  // end anonymous namespace
+static StatSymbols initStatSymbols;  // its constructor does the registration
+#endif // __linux__
+
+// jit_exit - Used to intercept the "exit" library call.
+static void jit_exit(int Status) {
+  runAtExitHandlers();   // Run atexit handlers...
+  exit(Status);
+}
+
+// jit_atexit - Used to intercept the "atexit" library call.
+static int jit_atexit(void (*Fn)()) {
+  AtExitHandlers.push_back(Fn);    // Take note of atexit handler...
+  return 0;  // Always successful
+}
+
+// jit_noop - Stand-in returned for "__main"; does nothing and returns 0.
+static int jit_noop() {
+  return 0;
+}
+
+/// DefaultJITMemoryManager - Manage trace cache memory  for the JIT code generation.
+class DefaultJITMemoryManager : public JITMemoryManager {
+  uint8_t *TraceCache;
+  size_t TraceCacheSize;
+
+  uint8_t *GlobalBase;  /* section for global data used by QEMU helpers */
+  uint8_t *CodeBase;    /* section for emitting trace code */
+  uint8_t *CodeGenPtr;
+
+  size_t GlobalRemain;
+  size_t CodeRemain;
+  size_t Threshold;
+
+  hqemu::Mutex lock;
+
+public:
+  /// Split the cache: DEFAULT_GLOBAL_SIZE bytes of global data first, then the
+  /// code area starting at the next CODE_GEN_ALIGN boundary.
+  DefaultJITMemoryManager(uint8_t *Cache, size_t Size)
+    : TraceCache(Cache), TraceCacheSize(Size), GlobalBase(Cache),
+      GlobalRemain(DEFAULT_GLOBAL_SIZE), Threshold(DEFAULT_THRESHOLD)
+  {
+    uintptr_t CodeStart = (uintptr_t)(Cache + DEFAULT_GLOBAL_SIZE);
+    CodeStart = (CodeStart + CODE_GEN_ALIGN - 1) & ~(CODE_GEN_ALIGN - 1);
+    CodeBase = CodeGenPtr = (uint8_t *)CodeStart;
+    CodeRemain = (uintptr_t)Cache + Size - CodeStart;
+  }
+
+  ~DefaultJITMemoryManager() {}
+
+  //===----------------------------------------------------------------------===//
+  //
+  /// getPointerToNamedFunction - This method returns the address of the specified
+  /// function by using the dynamic loader interface.  As such it is only useful
+  /// for resolving library symbols, not code generated symbols.
+  ///
+  void *getPointerToNamedFunction(const std::string &Name,
+                                  bool AbortOnFailure = true) override {
+    // Check to see if this is one of the functions we want to intercept.  Note,
+    // we cast to intptr_t here to silence a -pedantic warning that complains
+    // about casting a function pointer to a normal pointer.
+    if (Name == "exit") return (void*)(intptr_t)&jit_exit;
+    if (Name == "atexit") return (void*)(intptr_t)&jit_atexit;
+  
+    // We should not invoke parent's ctors/dtors from generated main()!
+    // On Mingw and Cygwin, the symbol __main is resolved to
+    // callee's(eg. tools/lli) one, to invoke wrong duplicated ctors
+    // (and register wrong callee's dtors with atexit(3)).
+    // We expect ExecutionEngine::runStaticConstructorsDestructors()
+    // is called before ExecutionEngine::runFunctionAsMain() is called.
+    if (Name == "__main") return (void*)(intptr_t)&jit_noop;
+  
+    const char *NameStr = Name.c_str();
+    // If this is an asm specifier (leading '\1' byte), skip the sentinel.
+    if (NameStr[0] == 1) ++NameStr;
+  
+    // If it's an external function, look it up in the process image...
+    void *Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr);
+    if (Ptr) return Ptr;
+  
+    // If it wasn't found and if it starts with an underscore ('_') character,
+    // try again without the underscore.
+    if (NameStr[0] == '_') {
+      Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr+1);
+      if (Ptr) return Ptr;
+    }
+  
+    // Darwin/PPC adds $LDBLStub suffixes to various symbols like printf.  These
+    // are references to hidden visibility symbols that dlsym cannot resolve.
+    // If we have one of these, strip off $LDBLStub and try again.
+#if defined(__APPLE__) && defined(__ppc__)
+    if (Name.size() > 9 && Name[Name.size()-9] == '$' &&
+        memcmp(&Name[Name.size()-8], "LDBLStub", 8) == 0) {
+      // First try turning $LDBLStub into $LDBL128. If that fails, strip it off.
+      // This mirrors logic in libSystemStubs.a.
+      std::string Prefix = std::string(Name.begin(), Name.end()-9);
+      if (void *Ptr = getPointerToNamedFunction(Prefix+"$LDBL128", false))
+        return Ptr;
+      if (void *Ptr = getPointerToNamedFunction(Prefix, false))
+        return Ptr;
+    }
+#endif
+  
+    if (AbortOnFailure) {
+      report_fatal_error("Program used external function '"+Name+
+                        "' which could not be resolved!");
+    }
+    return nullptr;
+  }
+
+  void AllocateGOT() override { hqemu_error("fixme.\n"); }
+
+  // Testing methods: unsupported here, each one only calls hqemu_error().
+  bool CheckInvariants(std::string &ErrorStr) override { hqemu_error("fixme.\n"); return false; }
+  size_t GetDefaultCodeSlabSize() override { hqemu_error("fixme.\n"); return 0; }
+  size_t GetDefaultDataSlabSize() override { hqemu_error("fixme.\n"); return 0; }
+  size_t GetDefaultStubSlabSize() override { hqemu_error("fixme.\n"); return 0; }
+  unsigned GetNumCodeSlabs() override { hqemu_error("fixme.\n"); return 0; }
+  unsigned GetNumDataSlabs() override { hqemu_error("fixme.\n"); return 0; }
+  unsigned GetNumStubSlabs() override { hqemu_error("fixme.\n"); return 0; }
+
+  /// startFunctionBody - Acquire the manager lock and offer the whole remaining
+  /// code region: returns CodeGenPtr and sets ActualSize to CodeRemain.
+  uint8_t *startFunctionBody(const Function *F,
+                             uintptr_t &ActualSize) override {
+    lock.acquire();  // not released here; endFunctionBody() calls release()
+    if (unlikely(CodeRemain < Threshold))
+      hqemu_error("internal error (fixme).\n");
+
+    ActualSize = CodeRemain;
+    return CodeGenPtr;
+  }
+
+  /// endFunctionBody - Commit the emitted bytes, align CodeGenPtr up, release
+  /// the lock. CodeRemain clamps to 0 if alignment steps past the cache end.
+  void endFunctionBody(const Function *F, uint8_t *FunctionStart,
+                       uint8_t *FunctionEnd) override {
+    assert(FunctionEnd > FunctionStart);
+    size_t GenSize = FunctionEnd - FunctionStart;
+    if (unlikely(GenSize > CodeRemain))
+      hqemu_error("exceeds available cache size.\n");
+    uintptr_t CacheEnd = (uintptr_t)TraceCache + TraceCacheSize;
+    uintptr_t NextPtr = ((uintptr_t)CodeGenPtr + GenSize + CODE_GEN_ALIGN - 1)
+                        & ~(CODE_GEN_ALIGN - 1);
+    CodeGenPtr = (uint8_t *)NextPtr;
+    CodeRemain = NextPtr < CacheEnd ? CacheEnd - NextPtr : 0;
+    lock.release();
+  }
+
+  /// allocateSpace - Allocate a memory block of the given size.  This method
+  /// cannot be called between calls to startFunctionBody and endFunctionBody.
+  uint8_t *allocateSpace(intptr_t Size, unsigned Alignment) override {
+    hqemu_error("fixme.\n");
+    return nullptr;
+  }
+
+  /// allocateStub - Allocate memory for a function stub.
+  uint8_t *allocateStub(const GlobalValue* F, unsigned StubSize,
+                        unsigned Alignment) override {
+    return allocateGlobal(StubSize, Alignment);
+  }
+
+  /// allocateGlobal - Carve Size bytes, aligned to Alignment, off GlobalBase.
+  uint8_t *allocateGlobal(uintptr_t Size, unsigned Alignment) override {
+    hqemu::MutexGuard locked(lock);
+
+    /* A zero alignment request falls back to 16 bytes. */
+    if (Alignment == 0)
+      Alignment = 16;
+    if ((Alignment & (Alignment - 1)) != 0)
+      hqemu_error("alignment must be a power of two.\n");
+
+    /* Padding needed to bring GlobalBase up to the requested alignment. */
+    unsigned Offset = (intptr_t)GlobalBase & (Alignment - 1);
+    unsigned Padding = Offset ? Alignment - Offset : 0;
+    if (Size + Padding > GlobalRemain)
+      hqemu_error("exceeds available global size.\n");
+
+    uint8_t *Result = GlobalBase + Padding;
+    GlobalRemain -= (Size + Padding);
+    GlobalBase = Result + Size;
+    return Result;
+  }
+
+  /// allocateCodeSection - Allocate memory for a code section.
+  uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
+                               unsigned SectionID,
+                               StringRef SectionName) override {
+    hqemu_error("fixme.\n"); return nullptr;
+  }
+
+  /// allocateDataSection - Allocate memory for a data section.
+  uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
+                               unsigned SectionID, StringRef SectionName,
+                               bool IsReadOnly) override {
+    hqemu_error("fixme.\n"); return nullptr;
+  }
+
+  bool finalizeMemory(std::string *ErrMsg) override { return false; }
+
+  uint8_t *getGOTBase() const override { return nullptr; }
+
+  void deallocateBlock(void *Block) {}
+
+  /// deallocateFunctionBody - Deallocate all memory for the specified
+  /// function body.
+  void deallocateFunctionBody(void *Body) override {}
+
+  /// setMemoryWritable - When code generation is in progress,
+  /// the code pages may need permissions changed.
+  void setMemoryWritable() override {}
+  /// setMemoryExecutable - When code generation is done and we're ready to
+  /// start execution, the code pages may need permissions changed.
+  void setMemoryExecutable() override {}
+
+  /// setPoisonMemory - Controls whether we write garbage over freed memory.
+  ///
+  void setPoisonMemory(bool poison) override {}
+
+  size_t getCodeSize()      { return CodeGenPtr - CodeBase; }
+  bool isSizeAvailable()    {
+    hqemu::MutexGuard locked(lock);
+    return CodeRemain >= Threshold ? 1 : 0;
+  }
+  void Flush() {  // rewind the code area; global area and lock are untouched
+    CodeGenPtr = CodeBase;
+    CodeRemain = (uintptr_t)TraceCache + TraceCacheSize - (uintptr_t)CodeBase;
+  }
+
+  static DefaultJITMemoryManager *Create(uint8_t *Cache, size_t Size) {
+    if (Size < MIN_CODE_CACHE_SIZE)
+      hqemu_error("Trace cache size is too small.\n");
+    return new DefaultJITMemoryManager(Cache, Size);
+  }
+};
+
+#endif
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/include/MCJITMemoryManager.h b/llvm/include/MCJITMemoryManager.h
new file mode 100644
index 0000000..33059a5
--- /dev/null
+++ b/llvm/include/MCJITMemoryManager.h
@@ -0,0 +1,213 @@
+//===-- MCJITMemoryManager.h - Memory manager for MC-JIT -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Interface of the MCJIT memory manager base class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __MCJITMEMORYMANAGER_H
+#define __MCJITMEMORYMANAGER_H
+
+#include "llvm/ExecutionEngine/RTDyldMemoryManager.h"
+#include "llvm-debug.h"
+#include "utils.h"
+
+using namespace llvm;
+
+#define MIN_CODE_CACHE_SIZE         (1 * 1024 * 1024)
+#define DEFAULT_GLOBAL_SIZE         (64 * 1024)
+#define DEFAULT_THRESHOLD           (32 * 1024)
+
+// RuntimeDyld clients often want to handle the memory management of
+// what gets placed where. For JIT clients, this is the subset of
+// JITMemoryManager required for dynamic loading of binaries.
+//
+// FIXME: As the RuntimeDyld fills out, additional routines will be needed
+//        for the varying types of objects to be allocated.
+class DefaultMCJITMemoryManager : public RTDyldMemoryManager {
+  uint8_t *TraceCache;
+  size_t TraceCacheSize;
+
+  uint8_t *GlobalBase;  /* section for global data used by QEMU helpers */
+  uint8_t *CodeBase;    /* section for emitting trace code */
+  uint8_t *CodeGenPtr;
+
+  size_t GlobalRemain;
+  size_t CodeRemain;
+  size_t Threshold;
+
+  hqemu::Mutex lock;
+
+  SymbolMap Symbols;
+
+public:
+  /// Split the cache: DEFAULT_GLOBAL_SIZE bytes of global data first, then the
+  /// code area starting at the next CODE_GEN_ALIGN boundary.
+  DefaultMCJITMemoryManager(uint8_t *Cache, size_t Size)
+    : TraceCache(Cache), TraceCacheSize(Size), GlobalBase(Cache),
+      GlobalRemain(DEFAULT_GLOBAL_SIZE), Threshold(DEFAULT_THRESHOLD)
+  {
+    uintptr_t CodeStart = (uintptr_t)(Cache + DEFAULT_GLOBAL_SIZE);
+    CodeStart = (CodeStart + CODE_GEN_ALIGN - 1) & ~(CODE_GEN_ALIGN - 1);
+    CodeBase = CodeGenPtr = (uint8_t *)CodeStart;
+    CodeRemain = (uintptr_t)Cache + Size - CodeStart;
+  }
+  ~DefaultMCJITMemoryManager() {}
+
+  /// Allocate \p Size bytes of the trace cache at \p Alignment (16 if zero; a
+  /// power of two is required). SectionID and SectionName are unused. A request
+  /// that does not fit in the cache is reported through hqemu_error().
+  uint8_t *allocateCodeSection(
+    uintptr_t Size, unsigned Alignment, unsigned SectionID,
+    StringRef SectionName) override {
+    hqemu::MutexGuard locked(lock);
+    if (!Alignment)
+      Alignment = 16;
+    if (Alignment & (Alignment - 1))
+      hqemu_error("Alignment must be a power of two.\n");
+    uintptr_t CacheEnd = (uintptr_t)TraceCache + TraceCacheSize;
+    uintptr_t CurGenPtr = (uintptr_t)CodeGenPtr;
+    CurGenPtr = (CurGenPtr + Alignment - 1) & ~(uintptr_t)(Alignment - 1);
+    if (CurGenPtr > CacheEnd || Size > CacheEnd - CurGenPtr)
+      hqemu_error("exceeds available cache size.\n");
+    uintptr_t NextGenPtr = (CurGenPtr + Size + CODE_GEN_ALIGN - 1) &
+                           ~(uintptr_t)(CODE_GEN_ALIGN - 1);
+    CodeGenPtr = (uint8_t *)NextGenPtr;
+    CodeRemain = NextGenPtr < CacheEnd ? CacheEnd - NextGenPtr : 0;
+    return (uint8_t *)CurGenPtr;
+  }
+
+  /// Allocate a memory block of (at least) the given size suitable for data.
+  /// This simply forwards to allocateCodeSection(), so data is placed in the
+  /// same region as code; IsReadOnly is ignored.
+  uint8_t *allocateDataSection(
+    uintptr_t Size, unsigned Alignment, unsigned SectionID,
+    StringRef SectionName, bool IsReadOnly) override {
+    return allocateCodeSection(Size, Alignment, SectionID, SectionName);
+  }
+
+  /// Inform the memory manager about the total amount of memory required to
+  /// allocate all sections to be loaded:
+  /// \p CodeSize - the total size of all code sections
+  /// \p DataSizeRO - the total size of all read-only data sections
+  /// \p DataSizeRW - the total size of all read-write data sections
+  /// 
+  /// Note that by default the callback is disabled. To enable it
+  /// redefine the method needsToReserveAllocationSpace to return true.
+  void reserveAllocationSpace(
+    uintptr_t CodeSize, uintptr_t DataSizeRO, uintptr_t DataSizeRW) {
+    hqemu_error("fixme.\n");
+  }
+  
+  /// Override to return true to enable the reserveAllocationSpace callback.
+  bool needsToReserveAllocationSpace() { return false; }
+
+  /// Register the EH frames with the runtime so that c++ exceptions work.
+  ///
+  /// \p Addr parameter provides the local address of the EH frame section
+  /// data, while \p LoadAddr provides the address of the data in the target
+  /// address space.  If the section has not been remapped (which will usually
+  /// be the case for local execution) these two values will be the same.
+  void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) {
+    hqemu_error("fixme.\n");
+  }
+
+  void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) {
+    hqemu_error("fixme.\n");
+  }
+
+  /// Return the address registered for \p Name through AddSymbols(); an unknown
+  /// name is reported through hqemu_error(). Used to resolve link-time symbols.
+  uint64_t getSymbolAddress(const std::string &Name) {
+    hqemu::MutexGuard locked(lock);
+    auto It = Symbols.find(Name);  // single lookup; never inserts a new entry
+    if (It != Symbols.end()) return It->second;
+    std::string ErrMsg = "Program used external symbol '" + Name +
+                         "' which could not be resolved!\n";
+    hqemu_error(ErrMsg.c_str());
+    return 0;
+  }
+
+  /// This method returns the address of the specified function. As such it is
+  /// only useful for resolving library symbols, not code generated symbols.
+  ///
+  /// No lookup is performed here: the symbol is reported through
+  /// hqemu_error() when \p AbortOnFailure is true, and the only value this
+  /// implementation returns is a null pointer.
+  ///
+  /// This function is deprecated for memory managers to be used with
+  /// MCJIT or RuntimeDyld.  Use getSymbolAddress instead.
+  void *getPointerToNamedFunction(const std::string &Name,
+                                  bool AbortOnFailure = true) {
+    if (AbortOnFailure) {
+      std::string ErrMsg = "Program used external symbol '" + Name +
+                           "' which could not be resolved!\n";
+      hqemu_error(ErrMsg.c_str());
+    }
+    return nullptr;
+  }
+
+  /// This method is called after an object has been loaded into memory but
+  /// before relocations are applied to the loaded sections.  The object load
+  /// may have been initiated by MCJIT to resolve an external symbol for another
+  /// object that is being finalized.  In that case, the object about which
+  /// the memory manager is being notified will be finalized immediately after
+  /// the memory manager returns from this call.
+  ///
+  /// Memory managers which are preparing code for execution in an external
+  /// address space can use this call to remap the section addresses for the
+  /// newly loaded object.
+#if defined(LLVM_V35)
+  void notifyObjectLoaded(ExecutionEngine *EE,
+                          const ObjectImage *Obj) {
+  }
+#else
+  void notifyObjectLoaded(RuntimeDyld &RTDyld,
+		          const object::ObjectFile &Obj) {
+  }
+#endif
+
+  /// This method is called when object loading is complete and section page
+  /// permissions can be applied.  It is up to the memory manager implementation
+  /// to decide whether or not to act on this method.  The memory manager will
+  /// typically allocate all sections as read-write and then apply specific
+  /// permissions when this method is called.  Code sections cannot be executed
+  /// until this function has been called.  In addition, any cache coherency
+  /// operations needed to reliably use the memory are also performed.
+  ///
+  /// Returns true if an error occurred, false otherwise.
+  bool finalizeMemory(std::string *ErrMsg = nullptr) override {
+    return false;
+  }
+
+  void AddSymbols(SymbolMap &symbols) {
+    Symbols = symbols;
+  }
+
+  size_t getCodeSize()      { return CodeGenPtr - CodeBase; }
+  bool isSizeAvailable()    {
+    hqemu::MutexGuard locked(lock);
+    return CodeRemain >= Threshold ? 1 : 0;
+  }
+  void Flush() {  // rewind the code area; global area and lock are untouched
+    CodeGenPtr = CodeBase;
+    CodeRemain = (uintptr_t)TraceCache + TraceCacheSize - (uintptr_t)CodeBase;
+  }
+
+  /// Create a manager over Cache/Size; a Size below MIN_CODE_CACHE_SIZE is
+  /// reported through hqemu_error().
+  static DefaultMCJITMemoryManager *Create(uint8_t *Cache, size_t Size) {
+    if (Size < MIN_CODE_CACHE_SIZE)
+      hqemu_error(("Trace cache size is too small (" +
+                   std::to_string(Size) + ").\n").c_str());
+    return new DefaultMCJITMemoryManager(Cache, Size);
+  }
+};
+
+#endif
diff --git a/llvm/include/hqemu-config.h b/llvm/include/hqemu-config.h
new file mode 100644
index 0000000..2e2f42f
--- /dev/null
+++ b/llvm/include/hqemu-config.h
@@ -0,0 +1,142 @@
+/*
+ *  (C) 2016 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef __HQEMU_CONFIG_H
+#define __HQEMU_CONFIG_H
+
+
+#define PACKAGE_NAME "HQEMU"
+#define PACKAGE_VERSION_MAJOR "2.5"
+#define PACKAGE_VERSION_MINOR "2"
+
+#define ENABLE_IBTC
+#define ENABLE_CPBL
+#define ENABLE_LPAGE
+#define ENABLE_PASSES
+#define ENABLE_MCJIT
+//#define ENABLE_HPM_THREAD
+//#define ENABLE_TLBVERSION
+//#define ENABLE_CPU_PROFILE
+//#define USE_TRACETREE_ONLY
+
+
+#if defined(CONFIG_USER_ONLY)
+#  define ENABLE_TCG_VECTOR
+#  define GUEST_BASE guest_base   /* refers to the variable named guest_base */
+#else  /* !CONFIG_USER_ONLY */
+#  define GUEST_BASE (0UL)        /* constant zero in this configuration */
+#endif /* CONFIG_USER_ONLY */
+
+#if defined(ENABLE_TLBVERSION)
+#  if defined(ALIGNED_ONLY)
+#    undef ENABLE_TLBVERSION   /* switched off whenever ALIGNED_ONLY is set */
+#  elif HOST_LONG_BITS == 64 && TARGET_LONG_BITS == 32 && defined(HOST_X86_64)
+#    define ENABLE_TLBVERSION_EXT
+#  endif
+#endif
+
+#ifndef ENABLE_TLBVERSION      /* no versioning: tlb_version() is constant 0 */
+#  define TLB_INVALID_SHIFT   3
+#  define TLB_NOTDIRTY_SHIFT  4
+#  define TLB_MMIO_SHIFT      5
+#  define TLB_VERSION_BITS    0
+#  define TLB_VERSION_MASK    0
+#  define TLB_VERSION_SHIFT   (0)
+#  define tlb_version(__env)  0
+typedef target_ulong tlbaddr_t;
+#elif defined(ENABLE_TLBVERSION_EXT)  /* version occupies the upper 32 bits */
+#  define TLB_INVALID_SHIFT   3
+#  define TLB_NOTDIRTY_SHIFT  4
+#  define TLB_MMIO_SHIFT      5
+#  define TLB_VERSION_BITS    32
+#  define TLB_VERSION_SIZE    (1UL << TLB_VERSION_BITS)
+#  define TLB_VERSION_MASK    (0xFFFFFFFF00000000UL)
+#  define TLB_VERSION_SHIFT   (32)
+#  define tlb_version(__env)  ((__env)->tlb_version) /* argument parenthesized */
+typedef unsigned long tlbaddr_t;
+#else                          /* version occupies the low TLB_VERSION_BITS bits */
+#  define TLB_INVALID_SHIFT   (TARGET_PAGE_BITS - 3)
+#  define TLB_NOTDIRTY_SHIFT  (TARGET_PAGE_BITS - 2)
+#  define TLB_MMIO_SHIFT      (TARGET_PAGE_BITS - 1)
+#  define TLB_VERSION_BITS    (TARGET_PAGE_BITS - 3)
+#  define TLB_VERSION_SIZE    (1 << TLB_VERSION_BITS)
+#  define TLB_VERSION_MASK    (TLB_VERSION_SIZE - 1)
+#  define TLB_VERSION_SHIFT   (0)
+#  define tlb_version(__env)  ((__env)->tlb_version) /* argument parenthesized */
+typedef target_ulong tlbaddr_t;
+#endif
+
+
+typedef int BlockID;
+typedef int TraceID;
+#define BUILD_NONE  ((uint16_t)0)        /* no build_mode bit set */
+#define BUILD_TCG   ((uint16_t)1 << 0)   /* build_mode bit 0 */
+#define BUILD_LLVM  ((uint16_t)1 << 1)   /* build_mode bit 1 */
+
+/* Optimization-related members, meant to be expanded inside a struct body. */
+#define CPU_OPTIMIZATION_COMMON \
+    unsigned long sp;           \
+    void *opt_link;             \
+    uint16_t build_mode;        \
+    int start_trace_prediction; \
+    int fallthrough;            \
+    uintptr_t image_base;       \
+    uint32_t restore_val;       \
+    uint64_t num_trace_exits;
+
+#define TB_OPTIMIZATION_COMMON                                     \
+    BlockID id;             /* block id */                         \
+    TraceID tid;            /* trace id */                         \
+    int mode;               /* current state */                    \
+    void *opt_ptr;          /* pointer to the optimized code */    \
+    uint32_t exec_count;    /* trace profile execution count */    \
+    uint16_t patch_jmp;     /* offset of trace trampoline */       \
+    uint16_t patch_next;    /* offset of trace prediction stub */  \
+    target_ulong jmp_pc[2]; /* pc of the succeeding blocks */      \
+    void *image;            /* image, state, chain: opaque here; */\
+    void *state;            /* TODO(review): document their use */ \
+    void *chain;
+
+
+enum {  /* presumably the values of a TB's 'mode' ("current state") field; confirm */
+    BLOCK_NONE = 0,
+    BLOCK_ACTIVE,
+    BLOCK_TRACEHEAD,
+    BLOCK_OPTIMIZED,
+    BLOCK_INVALID,
+};
+
+enum {  /* results of getTransMode() */
+    TRANS_MODE_NONE = 0,    /* LLVM_MODE=none */
+    TRANS_MODE_BLOCK,       /* LLVM_MODE=block */
+    TRANS_MODE_HYBRIDS,     /* LLVM_MODE=hybrids */
+    TRANS_MODE_HYBRIDM,     /* LLVM_MODE=hybridm, and the default when unset */
+    TRANS_MODE_INVALID,     /* any other LLVM_MODE value */
+};
+
+/* Select the translation mode named by env-variable LLVM_MODE; an unset
+ * variable means hybridm and an unrecognized name is reported as invalid. */
+static inline int getTransMode(void) {
+    const char *mode = getenv("LLVM_MODE");
+    if (mode == NULL || strcmp(mode, "hybridm") == 0)
+        return TRANS_MODE_HYBRIDM;
+    if (strcmp(mode, "hybrids") == 0) return TRANS_MODE_HYBRIDS;
+    if (strcmp(mode, "block") == 0)   return TRANS_MODE_BLOCK;
+    return strcmp(mode, "none") == 0 ? TRANS_MODE_NONE : TRANS_MODE_INVALID;
+}
+
+/* Annotation/attribute for traces; each non-zero value is a distinct bit. */
+enum {
+    A_None   = ((uint32_t)0),
+    A_SetCC  = ((uint32_t)1 << 0),
+    A_NoSIMDization = ((uint32_t)1 << 1),
+};
+
+#endif
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
+
diff --git a/llvm/include/hqemu-helper.h b/llvm/include/hqemu-helper.h
new file mode 100644
index 0000000..dfcb396
--- /dev/null
+++ b/llvm/include/hqemu-helper.h
@@ -0,0 +1,8 @@
+DEF_HELPER_1(export_hqemu, void, env)
+DEF_HELPER_1(lookup_ibtc, ptr, env)
+DEF_HELPER_1(lookup_cpbl, ptr, env)
+DEF_HELPER_3(validate_cpbl, int, env, tl, int)
+DEF_HELPER_2(NET_profile, void, env, int)
+DEF_HELPER_2(NET_predict, void, env, int)
+DEF_HELPER_2(verify_tb, void, env, int) 
+DEF_HELPER_3(profile_exec, void, env, ptr, int)
diff --git a/llvm/include/hqemu.h b/llvm/include/hqemu.h
new file mode 100644
index 0000000..f5e7180
--- /dev/null
+++ b/llvm/include/hqemu.h
@@ -0,0 +1,84 @@
+/*
+ *  (C) 2015 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef __HQEMU_H
+#define __HQEMU_H
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#include "config-host.h"
+#include "config-target.h"
+#include "hqemu-config.h"
+
+#define build_tcg(_env)        ((_env)->build_mode & BUILD_TCG)    /* non-zero when the BUILD_TCG bit is set */
+#define build_llvm(_env)       ((_env)->build_mode & BUILD_LLVM)   /* non-zero when the BUILD_LLVM bit is set */
+#define build_llvm_only(_env)  ((_env)->build_mode == BUILD_LLVM)  /* true when BUILD_LLVM is the only bit set */
+
+void hqemu_help(void);
+
+/* Optimizations */
+int optimization_init(CPUArchState *env);
+int optimization_finalize(CPUArchState *env);
+int optimization_reset(CPUArchState *env, int force_flush);
+int optimization_remove_entry(CPUArchState *env, TranslationBlock *tb);
+int optimization_flush_page(CPUArchState *env, target_ulong pc);
+int optimization_init_tb(TranslationBlock *tb, int id);
+
+void itlb_update_entry(CPUArchState *env, TranslationBlock *tb);
+void ibtc_update_entry(CPUArchState *env, TranslationBlock *tb);
+
+int lpt_reset(CPUArchState *env);
+int lpt_add_page(CPUArchState *env, target_ulong addr, target_ulong size);
+int lpt_search_page(CPUArchState *env, target_ulong addr, target_ulong *addrp, target_ulong *sizep);
+int lpt_flush_page(CPUArchState *env, target_ulong addr, target_ulong *addrp, target_ulong *sizep);
+
+
+/* Tracer */
+void tracer_exec_tb(CPUArchState *env, uintptr_t next_tb, TranslationBlock *tb);
+void tracer_reset(CPUArchState *env);
+
+
+/* LLVM */
+int llvm_init(void);
+int llvm_finalize(void);
+int llvm_alloc_cache(void);
+int llvm_check_cache(void);
+int llvm_tb_flush(void);
+int llvm_tb_remove(TranslationBlock *tb);
+void llvm_handle_chaining(uintptr_t next_tb, TranslationBlock *tb);
+int llvm_locate_trace(uintptr_t searched_pc);
+TranslationBlock *llvm_find_pc(CPUState *cpu, uintptr_t searched_pc);
+int llvm_restore_state(CPUState *cpu, TranslationBlock *tb, uintptr_t searched_pc);
+void llvm_fork_start(void);
+void llvm_fork_end(int child);
+
+
+/* Annotation */
+enum {
+    ANNOTATION_NONE = 0,
+    ANNOTATION_LOOP,
+};
+int llvm_has_annotation(target_ulong addr, int annotation);
+
+
+/* External variables */
+extern int tracer_mode;
+extern target_ulong pcid;
+extern unsigned long alignment_count[]; /* 0: misaligned, 1: aligned. */
+extern unsigned long aligned_boundary;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
+
diff --git a/llvm/include/llvm-annotate.h b/llvm/include/llvm-annotate.h
new file mode 100644
index 0000000..25454ed
--- /dev/null
+++ b/llvm/include/llvm-annotate.h
@@ -0,0 +1,51 @@
+/*
+ *  (C) 2015 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef __LLVM_ANNOTATE_H
+#define __LLVM_ANNOTATE_H
+
+#include <map>
+#include <cstdint>
+#include "qemu-types.h"
+#include "llvm-types.h"
+#include "utils.h"
+
+/* Loop metadata */
+struct LoopMetadata {
+    LoopMetadata()  /* every field starts at -1, except Distance (INT_MIN) */
+        : Address(-1), Length(-1), VS(-1), VF(-1), Distance(INT_MIN), Start(-1),
+          End(-1), Stride(-1) {}
+    target_ulong Address;
+    uint32_t Length;   /* initialised from -1, i.e. UINT32_MAX */
+    uint32_t VS, VF;   /* likewise UINT32_MAX after construction */
+    int Distance;
+    int Start, End;
+    int Stride;
+};
+
+/*
+ * The AnnotationFactory class manages the metadata information.
+ */
+class AnnotationFactory {
+    typedef std::map<uintptr_t, LoopMetadata*> LoopList; /* NOTE(review): keyed by uintptr_t while the lookups below take target_ulong */
+
+    std::string MetaFile;
+
+    int ParseXML(const char *name);
+
+public:
+    AnnotationFactory();
+    ~AnnotationFactory();
+
+    LoopList Loops;  /* public data member, although the LoopList typedef is private */
+    LoopMetadata *getLoopAnnotation(target_ulong addr);
+    bool hasLoopAnnotation(target_ulong addr);
+};
+
+#endif
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/include/llvm-debug.h b/llvm/include/llvm-debug.h
new file mode 100644
index 0000000..405b466
--- /dev/null
+++ b/llvm/include/llvm-debug.h
@@ -0,0 +1,247 @@
+/*
+ *  (C) 2010 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef __LLVM_DEBUG_H
+#define __LLVM_DEBUG_H
+
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include <sstream>
+#include <cstdarg>
+#include <unistd.h>
+#include <sys/time.h>
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCInstrAnalysis.h"
+#include "llvm/Support/FileSystem.h"
+#include "utils.h"
+
+
+/* Bit mask of debug categories (see LLVMDebug::LLVMDebugMode). */
+struct DebugMode {
+    uint64_t Mode; /* OR of the category bits */
+
+    DebugMode(uint64_t M) : Mode(M) {}
+
+    bool operator==(const DebugMode &RHS) const { return Mode == RHS.Mode; }
+    /* True when the two masks share at least one bit. */
+    bool operator&(const DebugMode &RHS) const {
+        return (Mode & RHS.Mode) != 0;
+    }
+    /* Union of both masks; const so it also works on const operands. */
+    DebugMode operator|(const DebugMode &RHS) const {
+        return DebugMode(Mode | RHS.Mode);
+    }
+    DebugMode &operator|=(const DebugMode &RHS) {
+        Mode |= RHS.Mode;
+        return *this;
+    }
+};
+
+/*
+ * LLVMDebug provides facilities to debug the LLVM translator, based on the
+ * debug levels.
+ */
+class LLVMDebug {
+public:
+    enum LLVMDebugMode {
+        D_NONE     = ((uint64_t)0),
+        D_LLVM     = ((uint64_t)1 << 0),
+        D_INASM    = ((uint64_t)1 << 1),
+        D_OP       = ((uint64_t)1 << 2),
+        D_OUTASM   = ((uint64_t)1 << 3),
+        D_IR       = ((uint64_t)1 << 4),
+        D_IR_OPT   = ((uint64_t)1 << 5),
+        D_ENTRY    = ((uint64_t)1 << 6),
+        D_VERIFY   = ((uint64_t)1 << 7),
+        D_PASS     = ((uint64_t)1 << 8),
+        D_ANNOTATE = ((uint64_t)1 << 9),
+        D_HPM      = ((uint64_t)1 << 10),
+        D_ASM      = (D_INASM | D_OP | D_OUTASM),
+        D_DEBUG    = (D_LLVM | D_IR_OPT | D_OUTASM | D_PASS),
+        D_ALL      = (D_LLVM | D_INASM | D_OP | D_OUTASM | D_IR | D_IR_OPT |
+                      D_ENTRY | D_VERIFY | D_PASS | D_ANNOTATE | D_HPM),
+    };
+
+    /* Start with no category enabled. stdout/stderr are wrapped in unbuffered
+     * streams that do not close the descriptors when destroyed. */
+    LLVMDebug() : Mode(D_NONE) {
+        hqemu_out.reset(new llvm::raw_fd_ostream(STDOUT_FILENO, false, true));
+        hqemu_dbg.reset(new llvm::raw_fd_ostream(STDERR_FILENO, false, true));
+        hqemu_null.SetUnbuffered();
+        gettimeofday(&uptime, nullptr);
+        std::string Empty;
+        ParseDebugMode(Empty, false); /* only fills the Modes table */
+    }
+
+    /* The set of currently enabled debug categories. */
+    DebugMode &getDebugMode() { return Mode; }
+
+    /* The registered DebugMode object for M; unknown values map to D_NONE. */
+    DebugMode &getDebugMode(LLVMDebugMode M) {
+        if (Modes.find(M) == Modes.end())
+            M = D_NONE;
+        return *Modes[M];
+    }
+
+    /* Apply DebugLevel; redirect debug output to DebugFile if it opens. */
+    void setDebugMode(std::string &DebugLevel, std::string &DebugFile) {
+        ParseDebugMode(DebugLevel);
+        if (DebugFile.empty())
+            return;
+        std::error_code EC;
+        std::unique_ptr<llvm::raw_fd_ostream> OS(
+            new llvm::raw_fd_ostream(DebugFile, EC, llvm::sys::fs::F_Text));
+        if (EC) { /* keep the current stream instead of adopting a failed one */
+            *hqemu_dbg << "Error: failed to open debug file " << DebugFile
+                       << ". (" << EC.message().c_str() << ")\n";
+            return;
+        }
+        OS->SetUnbuffered();
+        hqemu_dbg.reset(OS.release());
+    }
+
+    /* Flush pending debug output. */
+    void Flush() { hqemu_dbg->flush(); }
+
+    /* Print a printf-style message tagged with fname, then terminate the
+     * process. Over-long messages are truncated, not written past str. */
+    void error(const char *fname, const char *fmt, ...) {
+        char str[256];
+        va_list ap;
+        va_start(ap, fmt);
+        vsnprintf(str, sizeof(str), fmt, ap);
+        va_end(ap);
+        *hqemu_dbg << timestamp() << " Error: " << fname << " - " << str;
+        exit(0); /* NOTE(review): exit status is 0 on this error path */
+    }
+
+    /* Stream bound to stdout. */
+    llvm::raw_ostream &output() { return *hqemu_out; }
+
+    /* Stream bound to stderr, or to the debug file once one is opened. */
+    llvm::raw_ostream &debug() { return *hqemu_dbg; }
+
+    /* Pick the stream for a message of category M: the debug stream (after
+     * writing a timestamp) when M is enabled, otherwise a discarding sink. */
+    llvm::raw_ostream &operator<<(DebugMode &M) {
+        if (!(M & Mode))
+            return hqemu_null;
+        *hqemu_dbg << timestamp() << " ";
+        return *hqemu_dbg;
+    }
+
+private:
+    llvm::raw_null_ostream hqemu_null; /* returned for disabled categories */
+    std::unique_ptr<llvm::raw_fd_ostream> hqemu_out;
+    std::unique_ptr<llvm::raw_fd_ostream> hqemu_dbg;
+    struct timeval uptime; /* The startup time of the DBT */
+    DebugMode Mode;        /* The debug level */
+    std::map<LLVMDebugMode, DebugMode*> Modes;
+
+    /* Time elapsed since construction, as "[HH:MM:SS.uuuuuu]". */
+    std::string timestamp() {
+        struct timeval tv;
+        char buf[32];
+        gettimeofday(&tv, nullptr);
+        timersub(&tv, &uptime, &tv);
+        size_t n = strftime(buf, sizeof(buf), "[%H:%M:%S", gmtime(&tv.tv_sec));
+        snprintf(buf + n, sizeof(buf) - n, ".%06ld]", (long)tv.tv_usec);
+        return buf;
+    }
+
+    /* Update == false: only build the Modes table (the constructor's call).
+     * Otherwise enable every category named in comma-separated DebugLevel. */
+    void ParseDebugMode(std::string &DebugLevel, bool Update=true) {
+        static std::string debug_str[] = {
+            "none", "llvm", "in_asm", "op", "out_asm", "ir", "ir_opt", "entry",
+            "verify", "pass", "annotate", "hpm", "asm", "debug", "all"
+        };
+        static LLVMDebugMode debug_enum[] = {
+            D_NONE, D_LLVM, D_INASM, D_OP, D_OUTASM, D_IR, D_IR_OPT, D_ENTRY,
+            D_VERIFY, D_PASS, D_ANNOTATE, D_HPM, D_ASM, D_DEBUG, D_ALL
+        };
+
+        if (!Update) {
+            for (auto M : debug_enum)
+                Modes[M] = new DebugMode(M);
+            return;
+        }
+
+        std::istringstream ss(DebugLevel);
+        std::string token;
+        while (std::getline(ss, token, ',')) {
+            for (unsigned i = 0, e = ARRAY_SIZE(debug_enum); i != e; ++i) {
+                if (token == debug_str[i]) {
+                    Mode |= getDebugMode(debug_enum[i]);
+                    break;
+                }
+            }
+        }
+    }
+};
+
+extern LLVMDebug DM;
+
+/* Print messages to stdout. Should not use this function in release mode. */
+static inline llvm::raw_ostream &out() {
+    return DM.output();
+}
+/* Debug output gated by DebugMode (stderr by default): dbg() << DEBUG_X << ... */
+static inline LLVMDebug &dbg() {
+    return DM;
+}
+/* Print error messages to stderr and terminate the process. */
+#define hqemu_error(msg,args...) do { DM.error(__func__,msg,##args); } while(0)
+
+/* Macros to get defined DebugMode. */
+#define DEBUG_NONE      DM.getDebugMode(LLVMDebug::LLVMDebugMode::D_NONE)
+#define DEBUG_LLVM      DM.getDebugMode(LLVMDebug::LLVMDebugMode::D_LLVM)
+#define DEBUG_INASM     DM.getDebugMode(LLVMDebug::LLVMDebugMode::D_INASM)
+#define DEBUG_OP        DM.getDebugMode(LLVMDebug::LLVMDebugMode::D_OP)
+#define DEBUG_OUTASM    DM.getDebugMode(LLVMDebug::LLVMDebugMode::D_OUTASM)
+#define DEBUG_IR        DM.getDebugMode(LLVMDebug::LLVMDebugMode::D_IR)
+#define DEBUG_IR_OPT    DM.getDebugMode(LLVMDebug::LLVMDebugMode::D_IR_OPT)
+#define DEBUG_ENTRY     DM.getDebugMode(LLVMDebug::LLVMDebugMode::D_ENTRY)
+#define DEBUG_VERIFY    DM.getDebugMode(LLVMDebug::LLVMDebugMode::D_VERIFY)
+#define DEBUG_PASS      DM.getDebugMode(LLVMDebug::LLVMDebugMode::D_PASS)
+#define DEBUG_ANNOTATE  DM.getDebugMode(LLVMDebug::LLVMDebugMode::D_ANNOTATE)
+#define DEBUG_HPM       DM.getDebugMode(LLVMDebug::LLVMDebugMode::D_HPM)
+#define DEBUG_ASM       DM.getDebugMode(LLVMDebug::LLVMDebugMode::D_ASM)
+#define DEBUG_DEBUG     DM.getDebugMode(LLVMDebug::LLVMDebugMode::D_DEBUG)
+#define DEBUG_ALL       DM.getDebugMode(LLVMDebug::LLVMDebugMode::D_ALL)
+
+
+
+/*
+ * Binary disassembler using MCDisassembler.
+ */
+class MCDisasm {
+    const llvm::MCDisassembler *DisAsm;
+    const llvm::MCSubtargetInfo *STI;
+    llvm::MCInstPrinter *IP;
+    const llvm::MCInstrAnalysis *MIA;
+    bool HostDisAsm;
+    bool NoShowRawInsn;
+
+    MCDisasm(const llvm::Target *TheTarget, std::string TripleName,
+             bool isHost);  /* private: outside code must use CreateMCDisasm() */
+
+    void DumpBytes(llvm::ArrayRef<uint8_t> bytes, llvm::raw_ostream &OS);
+
+public:
+    ~MCDisasm();
+    void PrintInAsm(uint64_t Addr, uint64_t Size, uint64_t GuestAddr);
+    void PrintOutAsm(uint64_t Addr, uint64_t Size);
+
+    static MCDisasm *CreateMCDisasm(std::string TripleName, bool isHost);  /* static factory */
+};
+
+#endif
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/include/llvm-hard-perfmon.h b/llvm/include/llvm-hard-perfmon.h
new file mode 100644
index 0000000..ac03b23
--- /dev/null
+++ b/llvm/include/llvm-hard-perfmon.h
@@ -0,0 +1,87 @@
+/*
+ *  (C) 2018 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef __LLVM_HARD_PERFMON_H
+#define __LLVM_HARD_PERFMON_H
+
+#include <map>
+#include <thread>
+#include "pmu/pmu.h"
+#include "utils.h"
+
+class PerfmonData;
+class BaseTracer;
+
+enum HPMControl {
+    HPM_INIT = 0,
+    HPM_FINALIZE,
+    HPM_START,
+    HPM_STOP,
+};
+
+/*
+ * Hardware Performance Monitor (HPM)
+ */
+class HardwarePerfmon {
+    std::thread MonThread;  /* Monitor thread */
+    int MonThreadID;        /* Monitor thread id */
+    bool MonThreadStop;     /* Monitor thread is stopped or not */
+    hqemu::Mutex Lock;
+
+    /* Start monitor thread. */
+    void StartMonThread();
+
+    /* Monitor thread routine. */
+    void MonitorFunc();
+
+public:
+    HardwarePerfmon();
+    ~HardwarePerfmon();
+
+    /* Set up HPM with the monitor thread id */
+    void Init(int monitor_thread_tid);
+
+    /* Register a thread to be monitored. */
+    void RegisterThread(BaseTracer *Tracer);
+
+    /* Unregister a thread from being monitored. */
+    void UnregisterThread(BaseTracer *Tracer);
+
+    /* Notify that the execution enters/leaves the code cache. */
+    void NotifyCacheEnter(BaseTracer *Tracer);
+    void NotifyCacheLeave(BaseTracer *Tracer);
+
+    /* Stop the monitor. */
+    void Pause();
+
+    /* Restart the monitor. */
+    void Resume();
+};
+
+
+class PerfmonData {
+public:
+    PerfmonData(int tid);
+    ~PerfmonData();
+
+    int TID;
+    pmu::Handle ICountHndl;
+    pmu::Handle BranchHndl;
+    pmu::Handle MemLoadHndl;
+    pmu::Handle MemStoreHndl;
+    pmu::Handle CoverSetHndl;
+    uint64_t LastNumBranches, LastNumLoads, LastNumStores;
+
+    void MonitorBasic(HPMControl Ctl);
+    void MonitorCoverSet(HPMControl Ctl);
+};
+
+extern HardwarePerfmon *HP;
+
+#endif /* __LLVM_HARD_PERFMON_H */
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/include/llvm-helper.h b/llvm/include/llvm-helper.h
new file mode 100644
index 0000000..2d24f81
--- /dev/null
+++ b/llvm/include/llvm-helper.h
@@ -0,0 +1,755 @@
+/*
+ *  (C) 2010 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *   This file defines the QEMU helper functions that could be inlined by
+ *   the LLVM translators.
+ */
+
+#ifndef __LLVM_HELPER_H
+#define __LLVM_HELPER_H
+
+/* Special TCG runtime helper */
+    "tcg_helper_div_i32",
+    "tcg_helper_rem_i32",
+    "tcg_helper_divu_i32",
+    "tcg_helper_remu_i32",
+    "tcg_helper_shl_i64",
+    "tcg_helper_shr_i64",
+    "tcg_helper_sar_i64",
+    "tcg_helper_div_i64",
+    "tcg_helper_rem_i64",
+    "tcg_helper_divu_i64",
+    "tcg_helper_remu_i64",
+
+#if defined(TARGET_I386)
+    /* General */
+    "helper_cc_compute_c",
+    "helper_cc_compute_all",
+    "helper_load_seg",
+    "helper_write_eflags",
+    "helper_read_eflags",
+    "helper_cli",
+    "helper_sti",
+    "helper_set_inhibit_irq",
+    "helper_reset_inhibit_irq",
+    /* FPU */
+    "helper_divb_AL", 
+    "helper_idivb_AL", 
+    "helper_divw_AX", 
+    "helper_idivw_AX", 
+    "helper_divl_EAX", 
+    "helper_idivl_EAX", 
+    "helper_flds_FT0", 
+    "helper_fldl_FT0", 
+    "helper_fildl_FT0", 
+    "helper_flds_ST0", 
+    "helper_fldl_ST0", 
+    "helper_fildl_ST0", 
+    "helper_fildll_ST0",
+    "helper_fsts_ST0",
+    "helper_fstl_ST0", 
+    "helper_fist_ST0", 
+    "helper_fistl_ST0", 
+    "helper_fistll_ST0", 
+    "helper_fistt_ST0", 
+    "helper_fisttl_ST0", 
+    "helper_fisttll_ST0",
+    "helper_fldt_ST0", 
+    "helper_fstt_ST0", 
+    "helper_fpush", 
+    "helper_fpop", 
+    "helper_fdecstp", 
+    "helper_fincstp", 
+    "helper_ffree_STN", 
+    "helper_fmov_ST0_FT0", 
+    "helper_fmov_FT0_STN", 
+    "helper_fmov_ST0_STN", 
+    "helper_fmov_STN_ST0", 
+    "helper_fxchg_ST0_STN",
+    "helper_fcom_ST0_FT0", 
+    "helper_fucom_ST0_FT0", 
+    "helper_fcomi_ST0_FT0", 
+    "helper_fucomi_ST0_FT0",
+    "helper_fadd_ST0_FT0", 
+    "helper_fmul_ST0_FT0", 
+    "helper_fsub_ST0_FT0",
+    "helper_fsubr_ST0_FT0", 
+    "helper_fdiv_ST0_FT0", 
+    "helper_fdivr_ST0_FT0", 
+    "helper_fadd_STN_ST0", 
+    "helper_fmul_STN_ST0", 
+    "helper_fsub_STN_ST0", 
+    "helper_fsubr_STN_ST0", 
+    "helper_fdiv_STN_ST0", 
+    "helper_fdivr_STN_ST0", 
+    "helper_fchs_ST0",
+    "helper_fabs_ST0",
+#if defined(TCG_TARGET_I386) && TCG_TARGET_REG_BITS == 64
+    "helper_fxam_ST0",
+#endif
+    "helper_fld1_ST0",
+    "helper_fldl2t_ST0",
+    "helper_fldl2e_ST0",
+    "helper_fldpi_ST0", 
+    "helper_fldlg2_ST0",
+    "helper_fldln2_ST0",
+    "helper_fldz_ST0", 
+    "helper_fldz_FT0", 
+    "helper_fnstsw", 
+    "helper_fnstcw", 
+    "helper_fldcw",
+    "helper_fclex",
+    "helper_fwait", 
+    "helper_fninit", 
+    "helper_fbld_ST0",
+    "helper_fbst_ST0",
+    "helper_f2xm1", 
+    "helper_fyl2x", 
+    "helper_fptan", 
+    "helper_fpatan", 
+    "helper_fxtract",
+    "helper_fprem1", 
+    "helper_fprem", 
+    "helper_fyl2xp1",
+    "helper_fsqrt", 
+    "helper_fsincos",
+    "helper_frndint",
+    "helper_fscale", 
+    "helper_fsin", 
+    "helper_fcos", 
+    "helper_fstenv", 
+    "helper_fldenv", 
+    "helper_fsave", 
+    "helper_frstor", 
+    "helper_fxsave", 
+    "helper_fxrstor",
+    "helper_bsf",
+    "helper_bsr",
+    "helper_lzcnt", 
+
+    /* MMX/SSE */
+    "helper_psrlw_xmm", 
+    "helper_psraw_xmm",
+    "helper_psllw_xmm",
+    "helper_psrld_xmm",
+    "helper_psrad_xmm",
+    "helper_pslld_xmm",
+    "helper_psrlq_xmm",
+    "helper_psllq_xmm",
+    "helper_psrldq_xmm", 
+    "helper_pslldq_xmm",
+    "helper_paddb_xmm", 
+    "helper_paddw_xmm",
+    "helper_paddl_xmm",
+    "helper_paddq_xmm",
+    "helper_psubb_xmm",
+    "helper_psubw_xmm",
+    "helper_psubl_xmm",
+    "helper_psubq_xmm",
+    "helper_paddusb_xmm",
+    "helper_paddsb_xmm",
+    "helper_psubusb_xmm",
+    "helper_psubsb_xmm", 
+    "helper_paddusw_xmm",
+    "helper_paddsw_xmm",
+    "helper_psubusw_xmm",
+    "helper_psubsw_xmm", 
+    "helper_pminub_xmm",
+    "helper_pmaxub_xmm",
+    "helper_pminsw_xmm",
+    "helper_pmaxsw_xmm",
+    "helper_pand_xmm",
+    "helper_pandn_xmm",
+    "helper_por_xmm",
+    "helper_pxor_xmm",
+    "helper_pcmpgtb_xmm", 
+    "helper_pcmpgtw_xmm",
+    "helper_pcmpgtl_xmm",
+    "helper_pcmpeqb_xmm",
+    "helper_pcmpeqw_xmm",
+    "helper_pcmpeql_xmm",
+    "helper_pmullw_xmm",
+    "helper_pmulhuw_xmm",
+    "helper_pmulhw_xmm",
+    "helper_pavgb_xmm",
+    "helper_pavgw_xmm",
+    "helper_pmuludq_xmm",
+    "helper_pmaddwd_xmm",
+    "helper_psadbw_xmm",
+    "helper_maskmov_xmm",
+    "helper_movl_mm_T0_xmm",
+    "helper_shufps_xmm",
+    "helper_shufpd_xmm",
+#if !defined(TCG_TARGET_ARM)
+    "helper_pshufd_xmm",
+    "helper_pshuflw_xmm",
+    "helper_pshufhw_xmm",
+    "helper_punpcklbw_xmm", 
+    "helper_punpcklwd_xmm",
+    "helper_punpckldq_xmm",
+    "helper_punpckhbw_xmm",
+    "helper_punpckhwd_xmm",
+    "helper_punpckhdq_xmm",
+#endif
+    "helper_punpcklqdq_xmm",
+    "helper_punpckhqdq_xmm",
+
+    "helper_enter_mmx",
+    "helper_psrlw_mmx", 
+    "helper_psraw_mmx",
+    "helper_psllw_mmx",
+    "helper_psrld_mmx",
+    "helper_psrad_mmx",
+    "helper_pslld_mmx",
+    "helper_psrlq_mmx",
+    "helper_psllq_mmx",
+    "helper_psrldq_mmx", 
+    "helper_pslldq_mmx",
+    "helper_paddb_mmx", 
+    "helper_paddw_mmx",
+    "helper_paddl_mmx",
+    "helper_paddq_mmx",
+    "helper_psubb_mmx",
+    "helper_psubw_mmx",
+    "helper_psubl_mmx",
+    "helper_psubq_mmx",
+    "helper_paddusb_mmx",
+    "helper_paddsb_mmx",
+    "helper_psubusb_mmx",
+    "helper_psubsb_mmx", 
+    "helper_paddusw_mmx",
+    "helper_paddsw_mmx",
+    "helper_psubusw_mmx",
+    "helper_psubsw_mmx", 
+    "helper_pminub_mmx",
+    "helper_pmaxub_mmx",
+    "helper_pminsw_mmx",
+    "helper_pmaxsw_mmx",
+    "helper_pand_mmx",
+    "helper_pandn_mmx",
+    "helper_por_mmx",
+    "helper_pxor_mmx",
+    "helper_pcmpgtb_mmx", 
+    "helper_pcmpgtw_mmx",
+    "helper_pcmpgtl_mmx",
+    "helper_pcmpeqb_mmx",
+    "helper_pcmpeqw_mmx",
+    "helper_pcmpeql_mmx",
+    "helper_pmullw_mmx",
+    "helper_pmulhuw_mmx",
+    "helper_pmulhw_mmx",
+    "helper_pavgb_mmx",
+    "helper_pavgw_mmx",
+    "helper_pmuludq_mmx",
+    "helper_pmaddwd_mmx",
+    "helper_psadbw_mmx",
+    "helper_maskmov_mmx",
+    "helper_movl_mm_T0_mmx",
+    "helper_shufps_mmx",
+    "helper_shufpd_mmx",
+#if !defined(TCG_TARGET_ARM)
+    "helper_pshufd_mmx",
+    "helper_pshuflw_mmx",
+    "helper_pshufhw_mmx",
+    "helper_punpcklbw_mmx", 
+    "helper_punpcklwd_mmx",
+    "helper_punpckldq_mmx",
+    "helper_punpckhbw_mmx",
+    "helper_punpckhwd_mmx",
+    "helper_punpckhdq_mmx",
+#endif
+    "helper_punpcklqdq_mmx",
+    "helper_punpckhqdq_mmx",
+
+    "helper_addps",
+    "helper_addss",
+    "helper_addpd",
+    "helper_addsd",
+    "helper_subps",
+    "helper_subss",
+    "helper_subpd",
+    "helper_subsd",
+    "helper_mulps",
+    "helper_mulss",
+    "helper_mulpd",
+    "helper_mulsd",
+    "helper_divps",
+    "helper_divss",
+    "helper_divpd",
+    "helper_divsd",
+    "helper_minps",
+    "helper_minss",
+    "helper_minpd",
+    "helper_minsd",
+    "helper_maxps",
+    "helper_maxss",
+    "helper_maxpd",
+    "helper_maxsd",
+    "helper_sqrtps",
+    "helper_sqrtss",
+    "helper_sqrtpd",
+    "helper_sqrtsd",
+    "helper_shufps",
+    "helper_shufpd",
+
+    "helper_cmpeqps",
+    "helper_cmpeqss",
+    "helper_cmpeqpd",
+    "helper_cmpeqsd",
+    "helper_cmpltps",
+    "helper_cmpltss",
+    "helper_cmpltpd",
+    "helper_cmpltsd",
+    "helper_cmpleps",
+    "helper_cmpless",
+    "helper_cmplepd",
+    "helper_cmplesd",
+    "helper_cmpunordps",
+    "helper_cmpunordss",
+    "helper_cmpunordpd",
+    "helper_cmpunordsd",
+    "helper_cmpneqps",
+    "helper_cmpneqss",
+    "helper_cmpneqpd",
+    "helper_cmpneqsd",
+    "helper_cmpnltps",
+    "helper_cmpnltss",
+    "helper_cmpnltpd",
+    "helper_cmpnltsd",
+    "helper_cmpnleps",
+    "helper_cmpnless",
+    "helper_cmpnlepd",
+    "helper_cmpnlesd",
+    "helper_cmpordps",
+    "helper_cmpordss",
+    "helper_cmpordpd",
+    "helper_cmpordsd",
+
+    "helper_cvtps2pd",
+    "helper_cvtpd2ps",
+    "helper_cvtss2sd",
+    "helper_cvtsd2ss",
+    "helper_cvtdq2ps",
+    "helper_cvtdq2pd",
+    "helper_cvtpi2ps",
+    "helper_cvtpi2pd",
+    "helper_cvtsi2ss",
+    "helper_cvtsi2sd",
+    "helper_cvtps2dq",
+    "helper_cvtpd2dq",
+    "helper_cvtps2pi",
+    "helper_cvtpd2pi",
+    "helper_cvtss2si",
+    "helper_cvtsd2si",
+    "helper_cvttps2dq",
+    "helper_cvttpd2dq",
+    "helper_cvttps2pi",
+    "helper_cvttpd2pi",
+    "helper_cvttss2si",
+    "helper_cvttsd2si",
+
+    "helper_cmpeqps",
+    "helper_cmpeqss",
+    "helper_cmpeqpd",
+    "helper_cmpeqsd",
+    "helper_cmpltps",
+    "helper_cmpltss",
+    "helper_cmpltpd",
+    "helper_cmpltsd",
+    "helper_cmpleps",
+    "helper_cmpless",
+    "helper_cmplepd",
+    "helper_cmplesd",
+    "helper_cmpunordps",
+    "helper_cmpunordss",
+    "helper_cmpunordpd",
+    "helper_cmpunordsd",
+    "helper_cmpneqps",
+    "helper_cmpneqss",
+    "helper_cmpneqpd",
+    "helper_cmpneqsd",
+    "helper_cmpnltps",
+    "helper_cmpnltss",
+    "helper_cmpnltpd",
+    "helper_cmpnltsd",
+    "helper_cmpnleps",
+    "helper_cmpnless",
+    "helper_cmpnlepd",
+    "helper_cmpnlesd",
+    "helper_cmpordps",
+    "helper_cmpordss",
+    "helper_cmpordpd",
+    "helper_cmpordsd",
+
+    "helper_ucomisd",
+    "helper_comisd",
+    "helper_ucomiss",
+    "helper_comiss",
+
+    "helper_packuswb_xmm",
+    "helper_packsswb_xmm",
+    "helper_pmovmskb_xmm",
+    "helper_pshufw_mmx",
+
+#elif defined(TARGET_ARM)
+    "helper_add_cc",
+    "helper_sub_cc",
+    "helper_shl_cc",
+    "helper_shr_cc",
+    "helper_sar_cc",
+    "helper_adc_cc",
+    "helper_sbc_cc",
+    "helper_shl",
+    "helper_shr",
+    "helper_sar",
+    "helper_clz",
+
+    "helper_sadd8",
+    "helper_sadd16",
+    "helper_ssub8",
+    "helper_ssub16",
+    "helper_ssubaddx",
+    "helper_saddsubx",
+    "helper_uadd8",
+    "helper_uadd16",
+    "helper_usub8",
+    "helper_usub16",
+    "helper_usubaddx",
+    "helper_uaddsubx",
+
+    "helper_qadd8",
+    "helper_qadd16",
+    "helper_qsub8",
+    "helper_qsub16",
+    "helper_qsubaddx",
+    "helper_qaddsubx",
+    "helper_uqadd8",
+    "helper_uqadd16",
+    "helper_uqsub8",
+    "helper_uqsub16",
+    "helper_uqsubaddx",
+    "helper_uqaddsubx",
+
+    "helper_set_rmode",
+    "helper_cpsr_write_nzcv",
+    "helper_cpsr_write",
+    "helper_cpsr_read",
+    "helper_vfp_get_fpscr",
+    "helper_vfp_set_fpscr",
+    "helper_vfp_adds",
+    "helper_vfp_addd",
+    "helper_vfp_subs",
+    "helper_vfp_subd",
+    "helper_vfp_muls",
+    "helper_vfp_muld",
+    "helper_vfp_divs",
+    "helper_vfp_divd",
+    "helper_vfp_negs",
+    "helper_vfp_negd",
+    "helper_vfp_abss",
+    "helper_vfp_absd",
+    "helper_vfp_sqrts",
+    "helper_vfp_sqrtd",
+    "helper_vfp_cmps",
+    "helper_vfp_cmpd",
+    "helper_vfp_cmpes",
+    "helper_vfp_cmped",
+
+    "helper_vfp_muladds",
+    "helper_vfp_muladdd",
+
+#if defined(TARGET_AARCH64)
+    "helper_vfp_cmps_a64",
+    "helper_vfp_cmpd_a64",
+    "helper_vfp_cmpes_a64",
+    "helper_vfp_cmped_a64",
+    "helper_vfp_minnums",
+    "helper_vfp_maxnums",
+    "helper_vfp_minnumd",
+    "helper_vfp_maxnumd",
+#endif
+#if !defined(TCG_TARGET_PPC64)
+    "helper_vfp_fcvtds",
+    "helper_vfp_fcvtsd",
+    "helper_vfp_uitos",
+    "helper_vfp_uitod",
+    "helper_vfp_sitos",
+    "helper_vfp_sitod",
+    "helper_vfp_touis",
+    "helper_vfp_touid",
+    "helper_vfp_touizs",
+    "helper_vfp_touizd",
+    "helper_vfp_tosis",
+    "helper_vfp_tosid",
+    "helper_vfp_tosizs",
+    "helper_vfp_tosizd",
+    "helper_vfp_toshs", 
+    "helper_vfp_tosls", 
+    "helper_vfp_touhs", 
+    "helper_vfp_touls", 
+    "helper_vfp_toshd", 
+    "helper_vfp_tosld", 
+    "helper_vfp_touhd", 
+    "helper_vfp_tould", 
+    "helper_vfp_shtos", 
+    "helper_vfp_sltos", 
+    "helper_vfp_uhtos", 
+    "helper_vfp_ultos", 
+    "helper_vfp_shtod", 
+    "helper_vfp_sltod", 
+    "helper_vfp_uhtod", 
+    "helper_vfp_ultod", 
+#endif
+
+    /* neon helper */
+    "helper_neon_qadd_u8",
+    "helper_neon_qadd_s8",
+    "helper_neon_qadd_u16",
+    "helper_neon_qadd_s16",
+    "helper_neon_qsub_u8",
+    "helper_neon_qsub_s8",
+    "helper_neon_qsub_u16",
+    "helper_neon_qsub_s16",
+ 
+    "helper_neon_hadd_s8",
+    "helper_neon_hadd_u8",
+    "helper_neon_hadd_s16",
+    "helper_neon_hadd_u16",
+    "helper_neon_hadd_s32",
+    "helper_neon_hadd_u32",
+    "helper_neon_rhadd_s8",
+    "helper_neon_rhadd_u8",
+    "helper_neon_rhadd_s16",
+    "helper_neon_rhadd_u16",
+    "helper_neon_rhadd_s32",
+    "helper_neon_rhadd_u32",
+    "helper_neon_hsub_s8",
+    "helper_neon_hsub_u8",
+    "helper_neon_hsub_s16",
+    "helper_neon_hsub_u16",
+    "helper_neon_hsub_s32",
+    "helper_neon_hsub_u32",
+
+    "helper_neon_cgt_u8",
+    "helper_neon_cgt_s8",
+    "helper_neon_cgt_u16",
+    "helper_neon_cgt_s16",
+    "helper_neon_cgt_u32",
+    "helper_neon_cgt_s32",
+    "helper_neon_cge_u8",
+    "helper_neon_cge_s8",
+    "helper_neon_cge_u16",
+    "helper_neon_cge_s16",
+    "helper_neon_cge_u32",
+    "helper_neon_cge_s32",
+
+    "helper_neon_min_u8",
+    "helper_neon_min_s8",
+    "helper_neon_min_u16",
+    "helper_neon_min_s16",
+    "helper_neon_min_u32",
+    "helper_neon_min_s32",
+    "helper_neon_max_u8",
+    "helper_neon_max_s8",
+    "helper_neon_max_u16",
+    "helper_neon_max_s16",
+    "helper_neon_max_u32",
+    "helper_neon_max_s32",
+    "helper_neon_pmin_u8",
+    "helper_neon_pmin_s8",
+    "helper_neon_pmin_u16",
+    "helper_neon_pmin_s16",
+    "helper_neon_pmax_u8",
+    "helper_neon_pmax_s8",
+    "helper_neon_pmax_u16",
+    "helper_neon_pmax_s16",
+
+    "helper_neon_abd_u8",
+    "helper_neon_abd_s8",
+    "helper_neon_abd_u16",
+    "helper_neon_abd_s16",
+    "helper_neon_abd_u32",
+    "helper_neon_abd_s32",
+
+    "helper_neon_shl_u8",
+    "helper_neon_shl_s8",
+    "helper_neon_shl_u16",
+    "helper_neon_shl_s16",
+    "helper_neon_shl_u32",
+    "helper_neon_shl_s32",
+    "helper_neon_shl_u64",
+    "helper_neon_shl_s64",
+    "helper_neon_rshl_u8",
+    "helper_neon_rshl_s8",
+    "helper_neon_rshl_u16",
+    "helper_neon_rshl_s16",
+    "helper_neon_rshl_u32",
+    "helper_neon_rshl_s32",
+    "helper_neon_rshl_u64",
+    "helper_neon_rshl_s64",
+    "helper_neon_qshl_u8",
+    "helper_neon_qshl_s8",
+    "helper_neon_qshl_u16",
+    "helper_neon_qshl_s16",
+    "helper_neon_qshl_u32",
+    "helper_neon_qshl_s32",
+    "helper_neon_qshl_u64",
+    "helper_neon_qshl_s64",
+    "helper_neon_qrshl_u8",
+    "helper_neon_qrshl_s8",
+    "helper_neon_qrshl_u16",
+    "helper_neon_qrshl_s16",
+    "helper_neon_qrshl_u32",
+    "helper_neon_qrshl_s32",
+    "helper_neon_qrshl_u64",
+    "helper_neon_qrshl_s64",
+
+    "helper_neon_add_u8",
+    "helper_neon_add_u16",
+    "helper_neon_padd_u8",
+    "helper_neon_padd_u16",
+    "helper_neon_sub_u8",
+    "helper_neon_sub_u16",
+    "helper_neon_mul_u8",
+    "helper_neon_mul_u16",
+    "helper_neon_mul_p8",
+
+    "helper_neon_tst_u8",
+    "helper_neon_tst_u16",
+    "helper_neon_tst_u32",
+    "helper_neon_ceq_u8",
+    "helper_neon_ceq_u16",
+    "helper_neon_ceq_u32",
+
+    "helper_neon_abs_s8",
+    "helper_neon_abs_s16",
+    "helper_neon_clz_u8",
+    "helper_neon_clz_u16",
+    "helper_neon_cls_s8",
+    "helper_neon_cls_s16",
+    "helper_neon_cls_s32",
+    "helper_neon_cnt_u8",
+
+    "helper_neon_qdmulh_s16",
+    "helper_neon_qrdmulh_s16",
+    "helper_neon_qdmulh_s32",
+    "helper_neon_qrdmulh_s32",
+
+    "helper_neon_narrow_u8",
+    "helper_neon_narrow_u16",
+    "helper_neon_narrow_sat_u8",
+    "helper_neon_narrow_sat_s8",
+    "helper_neon_narrow_sat_u16",
+    "helper_neon_narrow_sat_s16",
+    "helper_neon_narrow_sat_u32",
+    "helper_neon_narrow_sat_s32",
+    "helper_neon_narrow_high_u8",
+    "helper_neon_narrow_high_u16",
+    "helper_neon_narrow_round_high_u8",
+    "helper_neon_narrow_round_high_u16",
+    "helper_neon_widen_u8",
+    "helper_neon_widen_s8",
+    "helper_neon_widen_u16",
+    "helper_neon_widen_s16",
+
+    "helper_neon_addl_u16",
+    "helper_neon_addl_u32",
+    "helper_neon_paddl_u16",
+    "helper_neon_paddl_u32",
+    "helper_neon_subl_u16",
+    "helper_neon_subl_u32",
+    "helper_neon_addl_saturate_s32",
+    "helper_neon_addl_saturate_s64",
+    "helper_neon_abdl_u16",
+    "helper_neon_abdl_s16",
+    "helper_neon_abdl_u32",
+    "helper_neon_abdl_s32",
+    "helper_neon_abdl_u64",
+    "helper_neon_abdl_s64",
+    "helper_neon_mull_u8",
+    "helper_neon_mull_s8",
+    "helper_neon_mull_u16",
+    "helper_neon_mull_s16",
+
+    "helper_neon_negl_u16",
+    "helper_neon_negl_u32",
+    "helper_neon_negl_u64",
+
+    "helper_neon_qabs_s8",
+    "helper_neon_qabs_s16",
+    "helper_neon_qabs_s32",
+    "helper_neon_qneg_s8",
+    "helper_neon_qneg_s16",
+    "helper_neon_qneg_s32",
+
+    "helper_neon_min_f32",
+    "helper_neon_max_f32",
+    "helper_neon_abd_f32",
+    "helper_neon_add_f32",
+    "helper_neon_sub_f32",
+    "helper_neon_mul_f32",
+    "helper_neon_ceq_f32",
+    "helper_neon_cge_f32",
+    "helper_neon_cgt_f32",
+    "helper_neon_acge_f32",
+    "helper_neon_acgt_f32",
+
+#elif defined(TARGET_PPC)
+    "helper_popcntb",
+    "helper_cntlzw",
+    "helper_cntlsw32",
+    "helper_cntlzw32",
+
+    "helper_compute_fprf",
+    "helper_store_fpscr",
+    "helper_fpscr_clrbit",
+    "helper_fpscr_setbit",
+    "helper_fcmpo",
+    "helper_fcmpu",
+
+    "helper_fctiw",
+    "helper_fctiwz",
+    "helper_frsp",
+    "helper_frin",
+    "helper_friz",
+    "helper_frip",
+    "helper_frim",
+
+    "helper_fadd",
+    "helper_fsub",
+    "helper_fmul",
+    "helper_fdiv",
+    "helper_fmadd",
+    "helper_fmsub",
+    "helper_fnmadd",
+    "helper_fnmsub",
+    "helper_fabs",
+    "helper_fnabs",
+    "helper_fneg",
+    "helper_fsqrt",
+    "helper_fre",
+    "helper_fres",
+    "helper_frsqrte",
+    "helper_fsel",
+
+#elif defined(TARGET_MICROBLAZE)
+    "helper_addkc",
+    "helper_subkc",
+    "helper_cmp",
+    "helper_cmpu",
+    "helper_divs",
+    "helper_divu",
+#elif defined(TARGET_MIPS)
+    "helper_lwl",
+    "helper_lwr",
+    "helper_swl",
+    "helper_swr",
+#endif
+
+#endif
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
+
diff --git a/llvm/include/llvm-macro.h b/llvm/include/llvm-macro.h
new file mode 100644
index 0000000..7b0e613
--- /dev/null
+++ b/llvm/include/llvm-macro.h
@@ -0,0 +1,88 @@
+/*
+ *  (C) 2010 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef __LLVM_MACRO_H
+#define __LLVM_MACRO_H
+
+#if defined(CONFIG_SOFTMMU)
+#define SaveStates()    SaveGlobals(COHERENCE_GLOBAL, LastInst)
+#else
+#define SaveStates()
+#endif
+
+#define CONST8(a)       ConstantInt::get(Int8Ty, a)
+#define CONST16(a)      ConstantInt::get(Int16Ty, a)
+#define CONST32(a)      ConstantInt::get(Int32Ty, a)
+#define CONST64(a)      ConstantInt::get(Int64Ty, a)
+#define CONST128(a)     ConstantInt::get(Int128Ty, a)
+#define CONSTPtr(a)     ConstantInt::get(IntPtrTy, a)
+
+#define FPCONST32(a)    ConstantFP::get(FloatTy, a)
+#define FPCONST64(a)    ConstantFP::get(DoubleTy, a)
+#define FPCONST80(a)    ConstantFP::get(FP80Ty, a)
+#define FPCONST128(a)   ConstantFP::get(FP128Ty, a)
+
+#define ICMP(a,b,pred)  new ICmpInst(LastInst, pred, a, b, "")
+
+#define AND(a,b)        BinaryOperator::Create(Instruction::And,  a, b, "", LastInst)
+#define OR(a,b)         BinaryOperator::Create(Instruction::Or,   a, b, "", LastInst)
+#define XOR(a,b)        BinaryOperator::Create(Instruction::Xor,  a, b, "", LastInst)
+#define SHL(a,b)        BinaryOperator::Create(Instruction::Shl,  a, b, "", LastInst)
+#define LSHR(a,b)       BinaryOperator::Create(Instruction::LShr, a, b, "", LastInst)
+#define ASHR(a,b)       BinaryOperator::Create(Instruction::AShr, a, b, "", LastInst)
+#define ADD(a,b)        BinaryOperator::Create(Instruction::Add,  a, b, "", LastInst)
+#define SUB(a,b)        BinaryOperator::Create(Instruction::Sub,  a, b, "", LastInst)
+#define MUL(a,b)        BinaryOperator::Create(Instruction::Mul,  a, b, "", LastInst)
+#define SDIV(a,b)       BinaryOperator::Create(Instruction::SDiv, a, b, "", LastInst)
+#define UDIV(a,b)       BinaryOperator::Create(Instruction::UDiv, a, b, "", LastInst)
+#define SREM(a,b)       BinaryOperator::Create(Instruction::SRem, a, b, "", LastInst)
+#define UREM(a,b)       BinaryOperator::Create(Instruction::URem, a, b, "", LastInst)
+
+#define FADD(a,b)       BinaryOperator::Create(Instruction::FAdd, a, b, "", LastInst)
+#define FSUB(a,b)       BinaryOperator::Create(Instruction::FSub, a, b, "", LastInst)
+#define FMUL(a,b)       BinaryOperator::Create(Instruction::FMul, a, b, "", LastInst)
+#define FDIV(a,b)       BinaryOperator::Create(Instruction::FDiv, a, b, "", LastInst)
+
+#define CAST(a,t)       new BitCastInst(a, t, "", LastInst)
+#define CASTPTR8(a)     CAST(a,Int8PtrTy)
+#define CASTPTR16(a)    CAST(a,Int16PtrTy)
+#define CASTPTR32(a)    CAST(a,Int32PtrTy)
+#define CASTPTR64(a)    CAST(a,Int64PtrTy)
+
+#define ITP(a,t)        new IntToPtrInst(a, t, "", LastInst)
+#define ITP8(a)         ITP(a,Int8PtrTy)
+#define ITP16(a)        ITP(a,Int16PtrTy)
+#define ITP32(a)        ITP(a,Int32PtrTy)
+#define ITP64(a)        ITP(a,Int64PtrTy)
+
+#define TRUNC(a,t)      new TruncInst(a, t, "", LastInst)
+#define TRUNC8(a)       TRUNC(a, Int8Ty)
+#define TRUNC16(a)      TRUNC(a, Int16Ty)
+#define TRUNC32(a)      TRUNC(a, Int32Ty)
+#define TRUNC64(a)      TRUNC(a, Int64Ty)
+
+#define ZEXT(a,t)       new ZExtInst(a, t, "", LastInst)
+#define ZEXT8(a)        ZEXT(a, Int8Ty)
+#define ZEXT16(a)       ZEXT(a, Int16Ty)
+#define ZEXT32(a)       ZEXT(a, Int32Ty)
+#define ZEXT64(a)       ZEXT(a, Int64Ty)
+#define ZEXT128(a)      ZEXT(a, Int128Ty)
+#define SEXT(a,t)       new SExtInst(a, t, "", LastInst)
+#define SEXT8(a)        SEXT(a, Int8Ty)
+#define SEXT16(a)       SEXT(a, Int16Ty)
+#define SEXT32(a)       SEXT(a, Int32Ty)
+#define SEXT64(a)       SEXT(a, Int64Ty)
+#define SEXT128(a)      SEXT(a, Int128Ty)
+
+#define BSWAP16(a)      CreateBSwap(Int16Ty, a, LastInst)
+#define BSWAP32(a)      CreateBSwap(Int32Ty, a, LastInst)
+#define BSWAP64(a)      CreateBSwap(Int64Ty, a, LastInst)
+
+
+#endif
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/include/llvm-opc.h b/llvm/include/llvm-opc.h
new file mode 100644
index 0000000..9454dac
--- /dev/null
+++ b/llvm/include/llvm-opc.h
@@ -0,0 +1,494 @@
+/*
+ *  (C) 2010 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef __LLVM_OPC_H
+#define __LLVM_OPC_H
+
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "qemu-types.h"
+#include "llvm-types.h"
+#include "llvm-translator.h"
+#include "llvm.h"
+
+//#define ASSERT
+//#define VERIFY_TB
+
+
+#define IRDebug(idx)     \
+    do {                 \
+        dbg() << DEBUG_ENTRY << "op_" << llvm_op_defs[idx].name << ": " \
+              << llvm_op_defs[idx].nb_oargs << " "   \
+              << llvm_op_defs[idx].nb_iargs << " "   \
+              << llvm_op_defs[idx].nb_cargs << "\n"; \
+    } while (0)
+#define IRError(fmt,args...)  hqemu_error(fmt,##args)
+
+#ifdef ASSERT
+#define AssertType(t)                           \
+    do {                                        \
+        if (!(t))                               \
+            hqemu_error("invalid type.\n");     \
+    } while(0)
+#else
+#define AssertType(t)
+#endif
+
+#define IRAbort()                               \
+    do {                                        \
+        if (!LLEnv->isTraceMode()) {            \
+            Func->dump();                       \
+            hqemu_error("fixme.\n");            \
+        }                                       \
+        Builder->Abort();                       \
+    } while (0)
+
+
+class LLVMTranslator;
+class NotifyInfo;
+class OptimizationInfo;
+
+
+/* Patch flags.
+ * NOTE: patch flags must be synchronized with those in the LLVM backend. */
+enum {
+    PATCH_HQEMU = 0x4182U,
+    PATCH_DUMMY,
+    PATCH_EXIT_TB,
+    PATCH_DIRECT_JUMP,
+    PATCH_TRACE_BLOCK_CHAINING,
+    PATCH_QMMU,
+};
+
+/*
+ * Register is used to describe the pseudo registers used by QEMU TCG op.
+ */
+struct Register {
+    /* Status bits of the register; State holds an OR of these values. */
+    enum {
+        STATE_NONE = 0x0,
+        STATE_REV  = 0x1,   /* Register is reserved */
+        STATE_REG  = 0x2,   /* Register is promoted */
+        STATE_MEM  = 0x4,   /* Register is in CPUArchState memory */
+        STATE_LOC  = 0x8,   /* Register is a local register */
+        STATE_TMP  = 0x10,  /* Register is a tmp register */
+    };
+
+    int State;        /* State of the register (bitmask of STATE_*) */
+    int Base;         /* Base value given to set(); -1 until set() is called */
+    intptr_t Off;     /* Register offset of CPUArchState; -1 until set() */
+    int Size;         /* Register size; 0 until reset() is called */
+    std::string Name; /* Name string of this register */
+    bool Dirty;       /* This register is updated or not */
+    Type *Ty;         /* Register type in LLVM */
+    Value *Data;      /* Data value if this register is promoted */
+    Value *AI;        /* Register as Alloca */
+    Register *Alias;  /* If non-null, setData() is forwarded to this register */
+    /* Every scalar member gets a defined value, so nothing reads as garbage. */
+    Register() : State(STATE_NONE), Base(-1), Off(-1), Size(0), Dirty(false),
+                 Ty(nullptr), Data(nullptr), AI(nullptr), Alias(nullptr) {}
+    /* Bind the register to a base, a CPU state offset and a name. */
+    void set(int base, intptr_t off, std::string name) {
+        Base = base;
+        Off = off;
+        Name = name;
+    }
+    void reset(int state, int size, Type *ty) {
+        State = state;
+        Size = size;
+        Ty = ty;
+        Dirty = false;
+        Data = AI = nullptr;   /* Base, Off, Name and Alias are left as-is */
+    }
+
+    void Promote()            { State |= STATE_REG;  }
+    void Demote()             { State &= ~STATE_REG; }
+
+    Value *getData()          { return Data;      }
+    Register &getAlias()      { return *Alias;    }  /* Requires isAlias() */
+
+    void setState(int state)  { State = state;    }
+    void setData(Value *data, bool dirty = false) {
+        if (Alias) {
+            Alias->setData(data, dirty);  /* The alias target owns the data */
+            return;
+        }
+        Data = data;
+        Dirty = dirty;
+        Promote();
+    }
+    bool isRev()    { return State & STATE_REV; }
+    bool isReg()    { return State & STATE_REG; }
+    bool isMem()    { return State & STATE_MEM; }
+    bool isLocal()  { return State & STATE_LOC; }
+    bool isDirty()  { return Dirty;            }
+    bool isAlias()  { return Alias != nullptr; }
+};
+
+/*
+ * TraceBuilder provides the facilities to build a trace in IRFactory.
+ */
+class TraceBuilder {
+    typedef std::map<target_ulong,
+                     std::pair<GraphNode*, BasicBlock*> > NodeBuildMap;
+    typedef std::vector<std::pair<BranchInst*, GraphNode*> > BranchList;
+
+    IRFactory *IF;
+    OptimizationInfo *Opt;
+    GraphNode *CurrNode;   /* The current CFG node to process */
+    NodeBuildMap Nodes;
+    BranchList Branches;
+    NodeVec NodeQueue;     /* CFG nodes to be translated */
+    NodeSet NodeVisisted;
+    NodeVec NodeUsed;
+    bool Aborted;
+    uint32_t Attribute;
+
+    TraceInfo *Trace;
+
+public:
+    TraceBuilder(IRFactory *IRF, OptimizationInfo *Opt);
+    ~TraceBuilder() {}
+
+    void ConvertToTCGIR(CPUArchState *env);
+    void ConvertToLLVMIR();
+    void Abort();
+    void Finalize();
+    bool isAborted() { return Aborted; }
+
+    OptimizationInfo *getOpt() { return Opt;             }
+    TraceInfo *getTrace()      { return Trace;           }
+    GraphNode *getEntryNode()  { return Opt->getCFG();   }
+    GraphNode *getCurrNode()   { return CurrNode;        }
+    unsigned getNumNodes()     { return Nodes.size();    }
+    std::string getPCString(GraphNode *Node) {
+        std::stringstream ss;
+        ss << std::hex << Node->getGuestPC();
+        return ss.str();
+    }
+
+    GraphNode *getNextNode()  {
+        if (NodeQueue.empty())
+            return nullptr;
+        CurrNode = NodeQueue.back();
+        NodeQueue.pop_back();
+
+        if (NodeVisisted.find(CurrNode) != NodeVisisted.end())
+            return getNextNode();
+
+        NodeVisisted.insert(CurrNode);
+        NodeUsed.push_back(CurrNode);
+        return CurrNode;
+    }
+
+    target_ulong getGuestPC(GraphNode *Node) {
+#if defined(TARGET_I386)
+        return Node->getTB()->pc - Node->getTB()->cs_base;
+#else
+        return Node->getTB()->pc;
+#endif
+    }
+    void setUniqueNode(GraphNode *Node) {
+        target_ulong gpc = getGuestPC(Node);
+        if (Nodes.find(gpc) == Nodes.end())
+            Nodes[gpc] = std::make_pair(Node, nullptr);
+    }
+    void setBasicBlock(GraphNode *Node, BasicBlock *BB) {
+        target_ulong gpc = getGuestPC(Node);
+        if (Nodes.find(gpc) == Nodes.end())
+            hqemu_error("internal error.\n");
+        Nodes[gpc].second = BB;
+    }
+    void setBranch(BranchInst *BI, GraphNode *Node) {
+        Branches.push_back(std::make_pair(BI, Node));
+        target_ulong gpc = getGuestPC(Node);
+        if (!Nodes[gpc].second)
+            NodeQueue.push_back(Node);
+    }
+    GraphNode *getNode(target_ulong gpc) {
+        return Nodes.find(gpc) == Nodes.end() ? nullptr : Nodes[gpc].first;
+    }
+    BasicBlock *getBasicBlock(GraphNode *Node) {
+        target_ulong gpc = getGuestPC(Node);
+        if (Nodes.find(gpc) == Nodes.end())
+            hqemu_error("internal error.\n");
+        return Nodes[gpc].second;
+    }
+    void addAttribute(uint32_t Attr) {
+        Attribute |= Attr;
+    }
+};
+
+
+#define META_CONST  "const"
+#define META_GVA    "gva"
+#define META_LOOP   "loop"
+#define META_EXIT   "exit"
+#define META_CC     "cc"
+
+class MDFactory {
+    uint32_t UID;
+    LLVMContext &Context;
+    MDNode *Dummy;
+
+    ConstantInt *getUID() {
+        return ConstantInt::get(IntegerType::get(Context, 32), UID++);
+    }
+
+public:
+    MDFactory(Module *M);
+    ~MDFactory();
+
+    MDNode *getMDNode(ArrayRef<ConstantInt*> V);
+    DebugLoc getDebugLoc(unsigned Line, unsigned Col, Function *F,
+                         ArrayRef<ConstantInt*> Meta);
+
+    void setConst(Instruction *I)       { I->setMetadata(META_CONST, Dummy); }
+    void setGuestMemory(Instruction *I) { I->setMetadata(META_GVA, Dummy);   }
+    void setLoop(Instruction *I)        { I->setMetadata(META_LOOP, Dummy);  }
+    void setExit(Instruction *I)        { I->setMetadata(META_EXIT, Dummy);  }
+    void setCondition(Instruction *I)   { I->setMetadata(META_CC, Dummy);    }
+
+    static bool isConst(Instruction *I) {
+        return I->getMetadata(META_CONST);
+    }
+    static bool isGuestMemory(Instruction *I) {
+        return I->getMetadata(META_GVA);
+    }
+    static bool isLoop(Instruction *I) {
+        return I->getMetadata(META_LOOP);
+    }
+    static bool isExit(Instruction *I) {
+        return I->getMetadata(META_EXIT);
+    }
+    static bool isCondition(Instruction *I) {
+        return I->getMetadata(META_CC);
+    }
+
+    static void setConstStatic(LLVMContext &Context, Instruction *I,
+                               ArrayRef<ConstantInt*> V);
+};
+
+/*
+ * IRFactory converts QEMU TCG opcodes to LLVM IR.
+ */
+class IRFactory {
+    typedef std::map<std::pair<intptr_t, Type *>, Value *> StatePtrMap;
+    typedef std::map<TCGArg, BasicBlock *> LabelMap;
+
+    enum {
+        COHERENCE_NONE = 0,
+        COHERENCE_GLOBAL,
+        COHERENCE_ALL,
+    };
+
+    bool InitOnce;
+
+    /* Basic types */
+    Type *VoidTy;
+    IntegerType *Int8Ty;
+    IntegerType *Int16Ty;
+    IntegerType *Int32Ty;
+    IntegerType *Int64Ty;
+    IntegerType *Int128Ty;
+    IntegerType *IntPtrTy;
+    PointerType *Int8PtrTy;
+    PointerType *Int16PtrTy;
+    PointerType *Int32PtrTy;
+    PointerType *Int64PtrTy;
+    Type *FloatTy;
+    Type *DoubleTy;
+    Type *FP80Ty;
+    Type *FP128Ty;
+
+    ConstantInt *ExitAddr;
+
+    LLVMTranslator &Translator; /* Uplink to the LLVMTranslator instance */
+    LLVMContext *Context;       /* Translator local context */
+    Module *Mod;                /* The LLVM module */
+    ExecutionEngine *EE;        /* The JIT compiler */
+    EventListener *Listener;    /* The JIT listener */
+    JITEventListener *IntelJIT; /* The Intel JIT listener */
+    const DataLayout *DL;       /* Data layout */
+    TraceBuilder *Builder;
+    MDFactory *MF;
+    MCDisasm *HostDisAsm;
+
+    HelperMap &Helpers;
+    std::vector<BaseRegister> &BaseReg;  /* TCG base register */
+    std::vector<Register> Reg;           /* TCG virtual registers */
+    LabelMap Labels;                     /* TCG labels */
+    int Segment;
+    GuestBaseRegister &GuestBaseReg;     /* Reserved guest base register */
+
+    Function *Func;          /* The container of LLVM IR to be translated */
+    BasicBlock *InitBB;      /* BasicBlock for variable decalaration */
+    BasicBlock *CurrBB;      /* Current BasicBlock to insert LLVM IR */
+    BasicBlock *ExitBB;      /* Temp BasicBlock as the exit-function stub */
+    BranchInst *LastInst;    /* Position to insert LLVM IR */
+
+    Instruction *CPU;           /* Base register with (char*) type */
+    Instruction *CPUStruct;     /* Base register with (struct CPUArchState*) type */
+    Instruction *GEPInsertPos;  /* Position to insert GEP instruction */
+
+    StatePtrMap StatePtr;
+    IVec InlineCalls;    /* Helpers to be inlined */
+    std::map<std::string, BasicBlock*> CommonBB;
+    IVec IndirectBrs;
+    IVec toErase;
+    BBVec toSink;
+    std::set<Function *> ClonedFuncs;
+    bool runPasses;
+
+    void CreateJIT();
+    void DeleteJIT();
+
+    /* Initialize basic types used during IR conversion. */
+    void InitializeTypes();
+
+    /* Store dirty states back to CPU state in the memory. */
+    void SaveGlobals(int level, Instruction *InsertPos);
+
+    /* Sync PC to CPU state in the memory. */
+    void CreateStorePC(Instruction *InsertPos);
+
+    /* Get or insert the pointer to the CPU state. */
+    Value *StatePointer(Register &reg);
+    Value *StatePointer(Register &reg, intptr_t Off, Type *PTy);
+
+    /* Load value from the CPU state in the memory. */
+    Value *LoadState(Register &reg);
+    void StoreState(Register &reg, Instruction *InsertPos);
+
+    /* Load/Store data from/to the guest memory. */
+    Value *QEMULoad(Value *AddrL, Value *AddrH, TCGMemOpIdx oi);
+    void QEMUStore(Value *Data, Value *AddrL, Value *AddrH, TCGMemOpIdx oi);
+
+    Value *ConvertCPUType(Function *F, int Idx, Instruction *InsertPos);
+    Value *ConvertCPUType(Function *F, int Idx, BasicBlock *InsertPos);
+
+    Value *ConvertEndian(Value *V, int opc);
+    Value *getExtendValue(Value *V, Type *Ty, int opc);
+    Value *getTruncValue(Value *V, int opc);
+    int getSizeInBits(int opc) {
+        return 8 * (1 << (opc & MO_SIZE));
+    }
+
+    Value *ConcatTLBVersion(Value *GVA);
+
+    /* Return the LLVM instruction that stores PC. When the guest's register
+     * size is larger than the host's, the multiple store-PC instructions
+     * are replaced with one single store-PC instruction. */
+    StoreInst *getStorePC();
+
+    /* Create both chaining and exiting stubs. */
+    void InsertLinkAndExit(Instruction *InsertPos);
+
+    /* Create exit stub */
+    void InsertExit(uintptr_t RetVal, bool setExit = false);
+
+    /* Find the next node of a trace according to the branch pc.
+     * Return null if we cannot find one. */
+    GraphNode *findNextNode(target_ulong pc);
+
+    /* Perform internal linking of basic blocks to form a region. */
+    void TraceLink(StoreInst *SI);
+
+    /* Link basic blocks of direct branch. */
+    void TraceLinkDirectJump(GraphNode *NextNode, StoreInst *SI);
+    void TraceLinkDirectJump(StoreInst *SI);
+
+    /* Link basic blocks of indirect branch. */
+    void TraceLinkIndirectJump(GraphNode *NextNode, StoreInst *SI);
+
+    /* Insert code for IBTC hash table lookup. */
+    void InsertLookupIBTC(GraphNode *CurrNode);
+
+    /* Insert code for CPBL hash table lookup. */
+    void InsertLookupCPBL(GraphNode *CurrNode);
+
+    void TraceValidateCPBL(GraphNode *NextNode, StoreInst *StorePC);
+
+    /* Insert bswap intrinsic instruction. */
+    Value *CreateBSwap(Type *Ty, Value *V, Instruction *InsertPos);
+
+    /* Given the size, return its PointerType. */
+    PointerType *getPointerTy(int Size, unsigned AS = 0);
+
+    /* Analyze a helper function to determine if it will be inlined or not. */
+    int AnalyzeInlineCost(CallSite CS);
+
+    /* Perform helper function inlining. */
+    void ProcessInline();
+
+    void VerifyFunction(Function &F);
+
+    /* Legalize LLVM IR before running the pre-defined passes. */
+    void PreProcess();
+
+    void Optimize();
+
+    /* Legalize LLVM IR after running the pre-defined passes. */
+    void PostProcess();
+
+    void FinalizeObject();
+
+    void InitializeLLVMPasses(legacy::FunctionPassManager *FPM);
+
+    uint32_t setRestorePoint(TCGMemOpIdx oi) {
+        if (oi != (uint16_t)oi)
+            hqemu_error("key value too large.\n");
+        return (NI.setRestorePoint() << 16) | oi;
+    }
+
+public:
+    typedef void (IRFactory::*FuncPtr)(const TCGArg *);
+
+    NotifyInfo &NI;             /* Info to pass among translator and JIT */
+
+    /* QEMU TCG IR to LLVM IR conversion routines. */
+#define DEF(name, oargs, iargs, cargs, flags) void op_ ## name(const TCGArg *);
+#include "tcg-opc.h"
+#undef DEF
+
+    IRFactory(LLVMTranslator *Trans);
+    ~IRFactory();
+
+    void CreateSession(TraceBuilder *builder);
+    void DeleteSession();
+
+    /* Prepare the initial LLVM Function, BasicBlocks and variables. */
+    void CreateFunction();
+    void CreateBlock();
+
+    /* Start LLVM JIT compilation. */
+    void Compile();
+
+    /* Set instruction BI to jump to the basic block BB. */
+    void setSuccessor(BranchInst *BI, BasicBlock *BB);
+
+    /* Get function pointer of the IR conversion routines. */
+    void *getOpcFunc();
+
+    Function *ResolveFunction(std::string Name);
+
+    LLVMTranslator &getTranslator()   { return Translator;   }
+    LLVMContext &getContext()         { return *Context;     }
+    const DataLayout *getDL()         { return DL;           }
+    MDFactory *getMDFactory()         { return MF;           }
+    HelperMap &getHelpers()           { return Helpers;      }
+    TraceInfo *getTrace()             { return Builder->getTrace(); }
+    Value *getGuestBase()             { return GuestBaseReg.Base;   }
+    Instruction *getDefaultCPU(Function &F);
+
+public:
+    static bool isStateOfPC(intptr_t Off);
+};
+
+#endif
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/include/llvm-pass.h b/llvm/include/llvm-pass.h
new file mode 100644
index 0000000..75bcf4a
--- /dev/null
+++ b/llvm/include/llvm-pass.h
@@ -0,0 +1,205 @@
+/*
+ *  (C) 2010 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef __LLVM_PASS_H
+#define __LLVM_PASS_H
+
+#include <map>
+#include <vector>
+#include "llvm-types.h"
+
+class IRFactory;
+
+
+static inline Value *getPointerOperand(Value *I) {
+    if (LoadInst *LI = dyn_cast<LoadInst>(I))
+        return LI->getPointerOperand();
+    if (StoreInst *SI = dyn_cast<StoreInst>(I))
+        return SI->getPointerOperand();
+    return nullptr;
+}
+
+static inline Value *getValueOperand(Value *I) {
+    if (LoadInst *LI = dyn_cast<LoadInst>(I))
+        return LI;
+    if (StoreInst *SI = dyn_cast<StoreInst>(I))
+        return SI->getValueOperand();
+    return nullptr;
+}
+
+static inline unsigned getAddressSpaceOperand(Value *I) {
+    if (LoadInst *LI = dyn_cast<LoadInst>(I))
+        return LI->getPointerAddressSpace();
+    if (StoreInst *SI = dyn_cast<StoreInst>(I))
+        return SI->getPointerAddressSpace();
+    return -1;
+}
+
+/* A CPU state reference. */
+struct StateRef {
+    StateRef(intptr_t Start, intptr_t End, Instruction *I)
+        : Start(Start), End(End), I(I) {}
+    intptr_t Start;
+    intptr_t End;
+    Instruction *I;
+
+    intptr_t getSize() {
+        return End - Start;
+    }
+    Type *getType() {
+        return getValueOperand(I)->getType();
+    }
+};
+
+/* A group of references to a CPU state. */
+struct StateData {
+    intptr_t Start;
+    intptr_t End;
+    std::vector<StateRef*> Refs;
+
+    void reset(StateRef &Ref) {
+        Start = Ref.Start;
+        End = Ref.End;
+        Refs.clear();
+        Refs.push_back(&Ref);
+    }
+    void insert(StateRef &Ref) {
+        End = std::max(End, Ref.End);
+        Refs.push_back(&Ref);
+    }
+};
+
+typedef std::map<intptr_t, intptr_t> StateRange;
+typedef std::vector<StateData> StateList;
+typedef std::vector<CallInst*> CallList;
+
+/*
+ * The purpose of StateAnalyzer is to analyze loads/stores of CPU states and
+ * group loads/stores of the same CPU state into the same bucket (StateData).
+ */
+class StateAnalyzer {
+    const DataLayout *DL;
+    std::vector<StateRef> StateRefs;
+    CallList Calls;
+    StateList States;   /* Groups from computeState(); they point into StateRefs */
+
+    /* Sort state references by the state offset. */
+    void sortStateRefs() {
+        if (StateRefs.empty())
+            return;
+        std::sort(StateRefs.begin(), StateRefs.end(),
+                  [](const StateRef &lhs, const StateRef &rhs) -> bool {
+                     return lhs.Start < rhs.Start;
+                  });
+    }
+
+public:
+    StateAnalyzer(const DataLayout *DL) : DL(DL) {}
+
+    void clear() {
+        StateRefs.clear();
+        Calls.clear();
+        States.clear();
+    }
+
+    /* Add a CPU state reference; I must be a LoadInst or a StoreInst. */
+    void addStateRef(Instruction *I, intptr_t Off) {
+        Type *Ty = getValueOperand(I)->getType();
+        intptr_t Start = Off;
+        intptr_t End = Off + DL->getTypeSizeInBits(Ty) / 8;
+        StateRefs.push_back(StateRef(Start, End, I));
+    }
+
+    /* Add a helper function call. */
+    void addCall(CallInst *CI) {
+        Calls.push_back(CI);
+    }
+
+    /* Return non-overlapped ranges of states (runs computeState() first). */
+    void computeStateRange(StateRange &Reads, StateRange &Writes) {
+        computeState();
+        if (StateRefs.empty())
+            return;
+
+        const uint8_t READ  = 0x1;
+        const uint8_t WRITE = 0x2;
+        for (auto &State : States) {
+            uint8_t RW = 0;
+            for (auto &Ref : State.Refs)
+                RW |= isa<LoadInst>(Ref->I) ? READ : WRITE;
+            if (RW & READ)
+                Reads[State.Start] = State.End;
+            if (RW & WRITE)
+                Writes[State.Start] = State.End;
+        }
+    }
+    /* Compute referenced states and group instructions. Safe to call more
+     * than once: groups from an earlier call are discarded first. */
+    void computeState() {
+        States.clear();   /* Old groups would be duplicated and point at re-sorted refs. */
+        sortStateRefs();
+        if (StateRefs.empty())
+            return;
+
+        StateData State;
+        State.reset(StateRefs.front());
+        for (unsigned i = 1, e = StateRefs.size(); i != e; ++i) {
+            StateRef &Next = StateRefs[i];
+            if (State.End <= Next.Start) {
+                /* The next reference is not overlapped with the previous
+                 * reference. A new state is found. */
+                States.push_back(State);
+                /* Start a new group at the next reference. */
+                State.reset(Next);
+            } else {
+                /* Overlap and merge. */
+                State.insert(Next);
+            }
+        }
+        /* The last state. */
+        States.push_back(State);
+    }
+
+    StateList &getStateList() {
+        return States;
+    }
+
+    CallList &getCalls() {
+        return Calls;
+    }
+};
+
+
+namespace llvm {
+/* Passes */
+FunctionPass *createReplaceIntrinsic();
+FunctionPass *createFastMathPass();
+FunctionPass *createProfileExec(IRFactory *IF);
+FunctionPass *createStateMappingPass(IRFactory *IF);
+FunctionPass *createRedundantStateElimination(IRFactory *IF);
+FunctionPass *createCombineGuestMemory(IRFactory *IF);
+FunctionPass *createCombineCasts(IRFactory *IF);
+FunctionPass *createCombineZExtTrunc();
+FunctionPass *createSimplifyPointer(IRFactory *IF);
+
+void initializeReplaceIntrinsicPass(llvm::PassRegistry&);
+void initializeFastMathPassPass(llvm::PassRegistry&);
+void initializeProfileExecPass(llvm::PassRegistry&);
+void initializeStateMappingPassPass(llvm::PassRegistry&);
+void initializeRedundantStateEliminationPass(llvm::PassRegistry&);
+void initializeCombineGuestMemoryPass(llvm::PassRegistry&);
+void initializeCombineCastsPass(llvm::PassRegistry&);
+void initializeCombineZExtTruncPass(llvm::PassRegistry&);
+void initializeSimplifyPointerPass(llvm::PassRegistry&);
+
+/* Analysis */
+void initializeInnerLoopAnalysisWrapperPassPass(llvm::PassRegistry&);
+}
+
+#endif
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/include/llvm-soft-perfmon.h b/llvm/include/llvm-soft-perfmon.h
new file mode 100644
index 0000000..c55201e
--- /dev/null
+++ b/llvm/include/llvm-soft-perfmon.h
@@ -0,0 +1,74 @@
+/*
+ *  (C) 2010 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef __LLVM_SOFT_PERFMON_H
+#define __LLVM_SOFT_PERFMON_H
+
+#include "utils.h"
+
+#define MAX_SPM_THREADS 256
+
+#define SPM_NONE      ((uint64_t)0)        /* No profiling (see isEnabled()) */
+#define SPM_BASIC     ((uint64_t)1 << 0)
+#define SPM_TRACE     ((uint64_t)1 << 1)
+#define SPM_CACHE     ((uint64_t)1 << 2)
+#define SPM_PASS      ((uint64_t)1 << 3)
+#define SPM_HPM       ((uint64_t)1 << 4)
+#define SPM_EXIT      ((uint64_t)1 << 5)
+#define SPM_HOTSPOT   ((uint64_t)1 << 6)
+#define SPM_ALL       (SPM_BASIC | SPM_TRACE | SPM_CACHE | SPM_PASS | SPM_HPM | \
+                       SPM_EXIT | SPM_HOTSPOT)  /* Parenthesized: safe in `x & SPM_ALL' */
+#define SPM_NUM       9  /* NOTE(review): only 7 mode bits are defined here; confirm */
+
+
+/*
+ * Software Performance Monitor (SPM)
+ */
+class SoftwarePerfmon {
+public:
+    typedef void (*ExitFuncPtr)(void);  /* Callback type for registerExitFn() */
+
+    uint64_t Mode;         /* Profile level */
+    uint64_t NumInsns;     /* Number of instructions */
+    uint64_t NumBranches;  /* Number of branches */
+    uint64_t NumLoads;     /* Number of memory loads */
+    uint64_t NumStores;    /* Number of memory stores */
+    uint64_t NumTraceExits;    /* Count of trace exits */
+    uint64_t SampleTime;   /* Process time of the sampling handler. */
+    unsigned CoverSet;     /* Defaults to 90; NOTE(review): presumably a percentage */
+    std::vector<std::vector<uint64_t> *> SampleListVec;  /* Raw pointers; owner not visible here */
+
+    SoftwarePerfmon()      /* Profiling off (SPM_NONE), all counters zero */
+        : Mode(SPM_NONE), NumInsns(0), NumBranches(0), NumLoads(0), NumStores(0),
+          NumTraceExits(0), SampleTime(0), CoverSet(90) {}
+    SoftwarePerfmon(std::string &ProfileLevel) : SoftwarePerfmon() {
+        ParseProfileMode(ProfileLevel);  /* Applied on top of the defaults above */
+    }
+
+    bool isEnabled() {     /* True for any Mode other than SPM_NONE */
+        return Mode != SPM_NONE;
+    }
+
+    void registerExitFn(ExitFuncPtr F) {  /* Appends F to the private ExitFunc list */
+        ExitFunc.push_back(F);
+    }
+
+    void printProfile();
+
+private:
+    std::vector<ExitFuncPtr> ExitFunc;  /* Callbacks added by registerExitFn() */
+
+    void ParseProfileMode(std::string &ProfileLevel);
+    void printBlockProfile();
+    void printTraceProfile();
+};
+
+extern SoftwarePerfmon *SP;
+
+#endif /* __LLVM_SOFT_PERFMON_H */
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/include/llvm-state.h b/llvm/include/llvm-state.h
new file mode 100644
index 0000000..e573073
--- /dev/null
+++ b/llvm/include/llvm-state.h
@@ -0,0 +1,194 @@
+/*
+ *  (C) 2010 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *   This file implements the routines that save and restore the CPUArchState
+ *   fields affecting how the TCG front-end decodes guest code, so a block is
+ *   decoded identically when it is translated again during trace formation.
+ */
+
+#ifndef __LLVM_STATE_H
+#define __LLVM_STATE_H
+
+#define COPY_STATE(_dst, _src, _e) do { _dst->_e = _src->_e; } while(0)
+
+/*
+ * The following data structure and routine are used to save/restore the states
+ * of CPUArchState. Only the states that could affect decoding the guest binary by
+ * the TCG front-end are saved/restored. Such states are saved when translating
+ * the block at the first time because the states could change later and are
+ * restored to the saved values when the block is decoded again during the
+ * trace formation.
+ */
+#if defined(TARGET_I386) || defined(TARGET_X86_64)
+typedef struct i386_env {
+    int singlestep_enabled;
+    uint32_t hflags;
+    target_ulong eflags;
+} cpustate;
+#elif defined(TARGET_ARM)
+typedef struct arm_env {
+    int singlestep_enabled;
+    uint32_t pstate;
+    uint32_t aarch64;
+    struct {
+        uint32_t c15_cpar;
+        uint64_t scr_el3;
+    } cp15;
+    uint32_t uncached_cpsr;
+    uint64_t features;
+} cpustate;
+#elif defined(TARGET_PPC) || defined(TARGET_PPC64)
+typedef struct ppc_env {
+    int singlestep_enabled;
+    target_ulong msr;
+    int mmu_idx;
+    uint32_t flags;
+    uint64_t insns_flags;
+    uint64_t insns_flags2;
+    target_ulong hflags;
+} cpustate;
+#elif defined(TARGET_SH4)
+typedef struct sh4_env {
+    int singlestep_enabled;
+    uint32_t sr;	/* status register */
+    uint32_t fpscr;	/* floating point status/control register */
+    uint32_t features;
+} cpustate;
+#elif defined(TARGET_M68K)
+typedef struct m68k_env {
+    int singlestep_enabled;
+    uint32_t sr;	/* status register */
+    uint32_t fpcr;	/* floating point status/control register */
+} cpustate;
+#elif defined(TARGET_MIPS)
+typedef struct mips_env {
+    int singlestep_enabled;
+    target_ulong btarget;
+} cpustate;
+#else
+typedef struct dummy_env {
+    int dummy;
+} cpustate;
+#endif
+
+static inline void tcg_save_state(CPUArchState *env, TranslationBlock *tb)
+{   /* Copy the decode-relevant fields of env/cpu into a new record kept on tb. */
+#if defined(TARGET_I386) || defined(TARGET_X86_64)
+    CPUState *cpu = ENV_GET_CPU(env);
+    struct i386_env *s = new struct i386_env;
+    COPY_STATE(s, cpu, singlestep_enabled);
+    COPY_STATE(s, env, hflags);
+    COPY_STATE(s, env, eflags);
+#elif defined(TARGET_ARM)
+    CPUState *cpu = ENV_GET_CPU(env);
+    struct arm_env *s = new struct arm_env;
+    COPY_STATE(s, cpu, singlestep_enabled);
+    COPY_STATE(s, env, cp15.c15_cpar);
+    COPY_STATE(s, env, cp15.scr_el3);
+    COPY_STATE(s, env, uncached_cpsr);
+    COPY_STATE(s, env, features);
+    COPY_STATE(s, env, pstate);
+    COPY_STATE(s, env, aarch64);
+#elif defined(TARGET_PPC) || defined(TARGET_PPC64)
+    CPUState *cpu = ENV_GET_CPU(env);
+    struct ppc_env *s = new struct ppc_env;
+    COPY_STATE(s, cpu, singlestep_enabled);
+    COPY_STATE(s, env, msr);
+    COPY_STATE(s, env, mmu_idx);
+    COPY_STATE(s, env, flags);
+    COPY_STATE(s, env, insns_flags);
+    COPY_STATE(s, env, insns_flags2);
+    COPY_STATE(s, env, hflags);
+#elif defined(TARGET_SH4)
+    CPUState *cpu = ENV_GET_CPU(env);
+    struct sh4_env *s = new struct sh4_env;
+    COPY_STATE(s, cpu, singlestep_enabled);
+    COPY_STATE(s, env, sr);
+    COPY_STATE(s, env, fpscr);
+    COPY_STATE(s, env, features);
+#elif defined(TARGET_M68K)
+    CPUState *cpu = ENV_GET_CPU(env);
+    struct m68k_env *s = new struct m68k_env;
+    COPY_STATE(s, cpu, singlestep_enabled);
+    COPY_STATE(s, env, sr);
+    COPY_STATE(s, env, fpcr);
+#elif defined(TARGET_MIPS)
+    CPUState *cpu = ENV_GET_CPU(env);
+    struct mips_env *s = new struct mips_env;
+    COPY_STATE(s, cpu, singlestep_enabled);
+    COPY_STATE(s, env, btarget);
+#else
+    void *s = nullptr;     /* No target-specific record: nothing is saved. */
+#endif
+
+    tb->state = (void *)s; /* Overwrites any previous tb->state without freeing it. */
+}
+
+/*
+ * tcg_copy_state()
+ *  Reset env/cpu to the states stored in tb->state by tcg_save_state().
+ */
+static inline void tcg_copy_state(CPUArchState *env, TranslationBlock *tb)
+{   /* tb->state is dereferenced without a null check. */
+#if defined(TARGET_I386) || defined(TARGET_X86_64)
+    CPUState *cpu = ENV_GET_CPU(env);
+    struct i386_env *i386e = (struct i386_env *)tb->state;
+    COPY_STATE(cpu, i386e, singlestep_enabled);
+    COPY_STATE(env, i386e, hflags);
+    COPY_STATE(env, i386e, eflags);
+#elif defined(TARGET_ARM)
+    CPUState *cpu = ENV_GET_CPU(env);
+    struct arm_env *arme = (struct arm_env *)tb->state;
+    COPY_STATE(cpu, arme, singlestep_enabled);
+    COPY_STATE(env, arme, cp15.c15_cpar);
+    COPY_STATE(env, arme, cp15.scr_el3);
+    COPY_STATE(env, arme, uncached_cpsr);
+    COPY_STATE(env, arme, features);
+    COPY_STATE(env, arme, pstate);
+    COPY_STATE(env, arme, aarch64);
+#elif defined(TARGET_PPC) || defined(TARGET_PPC64)
+    CPUState *cpu = ENV_GET_CPU(env);
+    struct ppc_env *ppce = (struct ppc_env *)tb->state;
+    COPY_STATE(cpu, ppce, singlestep_enabled);
+    COPY_STATE(env, ppce, msr);
+    COPY_STATE(env, ppce, mmu_idx);
+    COPY_STATE(env, ppce, flags);
+    COPY_STATE(env, ppce, insns_flags);
+    COPY_STATE(env, ppce, insns_flags2);
+    COPY_STATE(env, ppce, hflags);
+#elif defined(TARGET_SH4)
+    CPUState *cpu = ENV_GET_CPU(env);
+    struct sh4_env *sh4e = (struct sh4_env *)tb->state;
+    COPY_STATE(cpu, sh4e, singlestep_enabled);
+    COPY_STATE(env, sh4e, sr);
+    COPY_STATE(env, sh4e, fpscr);
+    COPY_STATE(env, sh4e, features);
+#elif defined(TARGET_M68K)
+    CPUState *cpu = ENV_GET_CPU(env);
+    struct m68k_env *m68ke = (struct m68k_env *)tb->state;
+    COPY_STATE(cpu, m68ke, singlestep_enabled);
+    COPY_STATE(env, m68ke, sr);
+    COPY_STATE(env, m68ke, fpcr);
+#elif defined(TARGET_MIPS)
+    CPUState *cpu = ENV_GET_CPU(env);
+    struct mips_env *mipse = (struct mips_env *)tb->state;
+    COPY_STATE(cpu, mipse, singlestep_enabled);
+    COPY_STATE(env, mipse, btarget);
+#endif
+}
+
+static inline void delete_state(TranslationBlock *tb)
+{   /* Free the record stored in tb->state; deleting a null state is harmless. */
+    delete (cpustate *)tb->state;
+    tb->state = nullptr;    /* Leave no dangling pointer behind. */
+}
+
+#undef COPY_STATE
+#endif  /* __LLVM_STATE_H */
+
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
+
diff --git a/llvm/include/llvm-target.h b/llvm/include/llvm-target.h
new file mode 100644
index 0000000..1784942
--- /dev/null
+++ b/llvm/include/llvm-target.h
@@ -0,0 +1,116 @@
+/*
+ *  (C) 2010 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef __LLVM_TARGET_H
+#define __LLVM_TARGET_H
+
+#include "llvm/ExecutionEngine/JITEventListener.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm-types.h"
+#include "llvm-translator.h"
+
+#ifndef __PRI64_PREFIX
+#  if __WORDSIZE == 64
+#    define __PRI64_PREFIX  "l"
+#  else
+#    define __PRI64_PREFIX  "ll"
+#  endif
+#endif
+
+#if TARGET_LONG_BITS == 32
+#  define PRId  "d"
+#  define PRIx  "x"
+#else
+#  define PRId  __PRI64_PREFIX "d"
+#  define PRIx  __PRI64_PREFIX "x"
+#endif
+
+#define PRId64  __PRI64_PREFIX "d"
+#define PRIu64  __PRI64_PREFIX "u"
+
+/* Emits raw bytes at a caller-supplied address; no bounds checking is done. */
+class code_ostream {
+    char *OutBufStart;  /* Address passed to the constructor */
+    char *OutBufCur;    /* Next byte to be written */
+    /* Bytewise copy (host order): no misaligned or type-punned wide stores. */
+    void Write(const void *Src, size_t Size) {
+        const char *Bytes = (const char *)Src;
+        for (size_t i = 0; i != Size; ++i)
+            *OutBufCur++ = Bytes[i];
+    }
+public:
+    code_ostream(uintptr_t Ptr)
+        : OutBufStart((char *)Ptr), OutBufCur((char *)Ptr) {}
+    void Skip(unsigned Size) { OutBufCur += Size; }  /* Advance, write nothing */
+    code_ostream &operator<<(char C) {
+        *OutBufCur++ = C;
+        return *this;
+    }
+    code_ostream &operator<<(unsigned char C) {
+        Write(&C, sizeof(C));
+        return *this;
+    }
+    code_ostream &operator<<(unsigned int C) {
+        Write(&C, sizeof(C));
+        return *this;
+    }
+    code_ostream &operator<<(unsigned long C) {
+        Write(&C, sizeof(C));
+        return *this;
+    }
+};
+
+static inline void EmitByte(code_ostream &OS, unsigned char C)
+{   /* Routed through the `char' overload of code_ostream::operator<<. */
+    OS << static_cast<char>(C);
+}
+static inline void EmitConstant(code_ostream &OS, uint64_t Val, unsigned Size)
+{   /* Emit the low Size bytes of Val, least-significant byte first. */
+    for (unsigned Left = Size; Left != 0; --Left) {
+        EmitByte(OS, static_cast<unsigned char>(Val & 0xff));
+        Val >>= 8;
+    }
+}
+
+/*
+ * EventListener is used by the JIT to notify clients about significant events
+ * during compilation.
+ */
+class EventListener : public JITEventListener {
+    NotifyInfo &NI;     /* Held by reference; not owned by the listener */
+
+public:
+    EventListener(NotifyInfo &NI) : NI(NI) {}
+    ~EventListener() {}
+    virtual void NotifyFunctionEmitted(const Function &F, void *Code, size_t Size,
+                                       const EmittedFunctionDetails &Details);
+#if defined(LLVM_V35)  /* The object-emitted callback signature depends on the LLVM version */
+    virtual void NotifyObjectEmitted(const ObjectImage &Obj);
+#else
+    virtual void NotifyObjectEmitted(const object::ObjectFile &Obj,
+                                     const RuntimeDyld::LoadedObjectInfo &L);
+#endif
+};
+
+
+const char *getMMUFName(const void *func);
+bool isMMUFunction(std::string &Name);
+bool isLMTFunction(std::string &Name);
+bool isIllegalHelper(const void *func);
+bool isLibcall(std::string &Name);
+bool isSoftFPcall(std::string &Name);
+void AddDependentSymbols(LLVMTranslator *Translator);
+Value *StripPointer(Value *Ptr);
+Value *StripPointerWithConstantOffset(const DataLayout *DL, Value *Ptr,
+                                      APInt &Offset, Value *GuestBase);
+Value *getBaseWithConstantOffset(const DataLayout *DL, Value *Ptr, intptr_t &Offset);
+void ProcessErase(IVec &toErase);
+
+#endif
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
+
diff --git a/llvm/include/llvm-translator.h b/llvm/include/llvm-translator.h
new file mode 100644
index 0000000..d1d92c5
--- /dev/null
+++ b/llvm/include/llvm-translator.h
@@ -0,0 +1,270 @@
+/*
+ *  (C) 2010 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef __LLVM_TRANSLATOR_H
+#define __LLVM_TRANSLATOR_H
+
+#include <map>
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm-types.h"
+#include "llvm-pass.h"
+#include "llvm.h"
+
+
+class OptimizationInfo;
+class EventListener;
+class NotifyInfo;
+class IRFactory;
+class TraceBuilder;
+
+
+/*
+ * BaseRegister is used to describe the `reserved' registers by QEMU TCG.
+ * Ex: R14 for the x86 host or R7 for the ARM host.
+ */
+struct BaseRegister {
+    BaseRegister() : RegNo(0), Ty(nullptr), Base(nullptr) {}  /* No indeterminate members */
+    int RegNo;          /* Register number */
+    std::string Name;   /* Register name string */
+    Type *Ty;           /* Type (struct CPUArchState) */
+    Instruction *Base;  /* CallInst to retrieve basereg */
+};
+
+struct GuestBaseRegister {
+    std::string Name;   /* Register name string; empty after construction */
+    Value *Base;        /* Value used to retrieve the guest base; null initially */
+    GuestBaseRegister() : Base(nullptr) {}
+};
+
+/*
+ * Information of helper functions defined in llvm-helper.h.
+ */
+struct HelperInfo {
+    /* Weights attached to one function argument. */
+    struct ArgInfo {
+        unsigned ConstantWeight;  /* Weight if the argument is a constant */
+        unsigned AllocaWeight;    /* Weight if the argument is an alloca */
+        ArgInfo(unsigned ConstW, unsigned AllocaW)
+            : ConstantWeight(ConstW), AllocaWeight(AllocaW) {}
+    };
+
+    HelperInfo()
+        : ConflictSize(0), mayConflictArg(false), hasNestedCall(false) {}
+
+    Function *Func;          /* Function symbol to be inlined */
+    Function *FuncNoInline;  /* Function symbol not to be inlined */
+    std::vector<std::pair<Instruction*, intptr_t> > States;
+    std::vector<CallInst*> NestedCalls;
+    StateRange StateUse;     /* Ranges recorded with isWrite == false */
+    StateRange StateDef;     /* Ranges recorded with isWrite == true */
+    CodeMetrics Metrics;     /* Inlining metrics */
+    std::vector<ArgInfo> ArgumentWeights;  /* Weight of the function arguments */
+    intptr_t ConflictSize;
+
+    bool mayConflictArg;     /* Arguments conflict with state mapping or not */
+    bool hasNestedCall;      /* This function has nested function or not */
+
+    void CalculateMetrics(Function *F);
+
+    /* Merge Range into StateDef when isWrite is set, into StateUse otherwise. */
+    void insertState(StateRange &Range, bool isWrite) {
+        StateRange &Target = isWrite ? StateDef : StateUse;
+        Target.insert(Range.begin(), Range.end());
+    }
+};
+
+/* 
+ * NotifyInfo is used to pass information between LLVMTranslator, IRFactory and
+ * the JIT listener.
+ */
+class NotifyInfo {
+#define MAX_CHAINSLOT   256
+public:
+    struct SlotInfo {
+        size_t Key;         /* Stored by setChainSlot() */
+        uintptr_t Addr;     /* Its address is handed out by getChainSlotAddr() */
+    };
+
+    struct PatchInfo {
+        PatchInfo(unsigned ty, unsigned idx, uintptr_t addr)
+            : Type(ty), Idx(idx), Addr(addr) {}
+        unsigned Type;
+        unsigned Idx;
+        uintptr_t Addr;
+    };
+
+    NotifyInfo()    /* Every scalar member starts defined, even before reset(). */
+        : Func(nullptr), Op(nullptr), TB(nullptr), NumInsts(0), NumChainSlot(0),
+          ChainSlot(new SlotInfo[MAX_CHAINSLOT]), Size(0), Code(nullptr) {}
+    ~NotifyInfo() {
+        delete[] ChainSlot;     /* Array allocated with new[] */
+    }
+
+    Function *Func;        /* LLVM Function of this translation unit */
+    TCGOp *Op;             /* Set by setOp() */
+    TranslationBlock *TB;  /* Set by setTB(); its id is used by setRestorePoint() */
+    uint16_t NumInsts;
+    RestoreVec Restore;    /* (TB id, NumInsts) pairs pushed by setRestorePoint() */
+    unsigned NumChainSlot; /* Number of entries of ChainSlot in use */
+    SlotInfo *ChainSlot;   /* Owned array of MAX_CHAINSLOT entries */
+
+    uint32_t Size;         /* Size of the translated host code */
+    uint8_t *Code;         /* Start PC of the translated host code */
+    std::vector<PatchInfo> Patches;
+
+    void reset() {         /* Func, Op, TB, Size and Code are left as they are */
+        Restore.clear();
+        Patches.clear();
+        NumInsts = 0;
+        NumChainSlot = 0;
+    }
+    unsigned setChainSlot(size_t Key) {  /* Returns the index of the claimed slot */
+        if (NumChainSlot >= MAX_CHAINSLOT)
+            hqemu_error("run out of chain slot.\n");
+        unsigned Curr = NumChainSlot;
+        ChainSlot[NumChainSlot++].Key = Key;
+        return Curr;
+    }
+    uintptr_t getChainSlotAddr(unsigned Idx) {
+        if (Idx >= MAX_CHAINSLOT)   /* Validate the index actually used below */
+            hqemu_error("invalid chain slot index.\n");
+        return (uintptr_t)&ChainSlot[Idx].Addr;
+    }
+    void addPatch(unsigned Type, unsigned Idx, uintptr_t Addr) {
+        Patches.push_back(PatchInfo(Type, Idx, Addr));
+    }
+    void setOp(TCGOp *op) { Op = op; }
+    void setTB(TranslationBlock *tb) {   /* Also restarts the instruction count */
+        TB = tb;
+        NumInsts = 0;
+    }
+    uint32_t setRestorePoint() {         /* Returns the index of the new entry */
+        uint32_t Idx = Restore.size();
+        if (Idx != (uint16_t)Idx)        /* The index must fit in 16 bits */
+            hqemu_error("key value too large.\n");
+        Restore.push_back(std::make_pair(TB->id, NumInsts));
+        return Idx;
+    }
+};
+
+/*
+ * LLVM Translator
+ */
+class LLVMTranslator {
+    unsigned MyID;           /* Translator ID */
+    CPUArchState *Env;
+
+    /* Basic types */
+    Type *VoidTy;
+    IntegerType *Int8Ty;
+    IntegerType *Int16Ty;
+    IntegerType *Int32Ty;
+    IntegerType *Int64Ty;
+    IntegerType *Int128Ty;
+    IntegerType *IntPtrTy;
+    PointerType *Int8PtrTy;
+    PointerType *Int16PtrTy;
+    PointerType *Int32PtrTy;
+    PointerType *Int64PtrTy;
+    Type *FloatTy;
+    Type *DoubleTy;
+    PointerType *FloatPtrTy;
+    PointerType *DoublePtrTy;
+
+    LLVMContext Context;     /* Translator local context */
+    Module *Mod;             /* The LLVM module */
+    const DataLayout *DL;    /* Data layout */
+    NotifyInfo NI;           /* Info to set/use by the JIT listener */
+
+    std::vector<BaseRegister> BaseReg;  /* Reserved base registers */
+    GuestBaseRegister GuestBaseReg;     /* Reserved guest base register */
+    FlatType StateType;      /* Offset and type of guest registers */
+    TCGHelperMap TCGHelpers; /* <func_ptr, func_name> */
+    HelperMap Helpers;       /* Helper name -> HelperInfo */
+    std::set<std::string> ConstHelpers;
+    SymbolMap Symbols;       /* Name -> address; filled by AddSymbol() */
+
+    MCDisasm *GuestDisAsm;
+    MCDisasm *HostDisAsm;
+
+    IRFactory *IF;           /* TCG-to-LLVM IR converter */
+
+    /* Initialize the LLVM module. */
+    void InitializeModule();
+
+    /* Create the JIT compiler. */
+    void InitializeJIT();
+
+    /* Initialize required LLVM types. */
+    void InitializeType();
+
+    /* Setup guest and host dependent structures. */
+    void InitializeTarget();
+
+    /* Setup special registers. */
+    void DefineSpecialReg(std::map<Type*, Type*> &SpecialReg);
+
+    /* Convert the CPUArchState structure type to a list of primitive types. */
+    void FlattenCPUState(Type *Ty, intptr_t &Off, std::map<Type*, Type*> &SpecialReg);
+
+    /* Initialize helper functions. */
+    void InitializeHelpers();
+
+    /* Analyze and optimize a helper function. */
+    bool OptimizeHelper(HelperInfo &Helper);
+
+    void InitializeDisasm();
+
+    void InitializeConstHelpers();
+
+    void Commit(TraceBuilder &Builder);
+
+    void Abort(TraceBuilder &Builder);
+
+    void dump(CPUArchState *env, TranslationBlock *tb);
+
+    LLVMTranslator(unsigned id, CPUArchState *env);  /* Private: use CreateLLVMTranslator() */
+
+public:
+    ~LLVMTranslator();
+
+    void GenBlock(CPUArchState *env, OptimizationInfo *Opt);
+    void GenTrace(CPUArchState *env, OptimizationInfo *Opt);
+
+    unsigned getID()            { return MyID;      }
+    LLVMContext *getContext()   { return &Context;  }
+    Module *getModule()         { return Mod;       }
+    NotifyInfo &getNotifyInfo() { return NI;        }
+    std::vector<BaseRegister> &getBaseReg() { return BaseReg;      }
+    GuestBaseRegister &getGuestBaseReg()    { return GuestBaseReg; }
+    TCGHelperMap &getTCGHelpers()      { return TCGHelpers;   }
+    HelperMap &getHelpers()            { return Helpers;      }
+    std::set<std::string> &getConstHelpers()     { return ConstHelpers; }
+    FlatType &getStateType()    { return StateType; } 
+    SymbolMap &getSymbols()     { return Symbols;   }
+    MCDisasm *getHostDisAsm()   { return HostDisAsm;}
+
+    void AddSymbol(std::string Name, void *FP) {  /* Replaces an existing entry for Name */
+        Symbols[Name] = (uintptr_t)FP;
+    }
+
+    /* Create the LLVMTranslator instance. */
+    static LLVMTranslator *CreateLLVMTranslator(int id, CPUArchState *env) {
+        return new LLVMTranslator(id, env);
+    }
+
+    /* Show guest assembly code for each compiled TB. */
+    void printAsm(CPUArchState *env, TranslationBlock *tb);
+
+    /* Show TCG micro ops for each compiled TB. */
+    void printOp(CPUArchState *env, TranslationBlock *tb);
+};
+
+#endif
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/include/llvm-types.h b/llvm/include/llvm-types.h
new file mode 100644
index 0000000..1b8d09c
--- /dev/null
+++ b/llvm/include/llvm-types.h
@@ -0,0 +1,127 @@
+/*
+ *  (C) 2010 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef __LLVM_TYPES_H
+#define __LLVM_TYPES_H
+
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/DynamicLibrary.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+#if defined(LLVM_V35)
+#include "llvm/MC/MCDisassembler.h"
+#include "llvm/ExecutionEngine/ObjectImage.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/MemoryObject.h"
+#elif defined(LLVM_V38)
+#include "llvm/MC/MCDisassembler.h"
+#include "llvm/Object/SymbolSize.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/MemoryObject.h"
+#elif defined(LLVM_V39)
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/Object/SymbolSize.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/Support/MemoryObject.h"
+#else
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/Object/SymbolSize.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#endif
+
+#include <vector>
+#include <set>
+#include <map>
+#include "llvm-macro.h"
+#include "qemu-types.h"
+
+using namespace llvm;
+
+class HelperInfo;
+
+typedef std::vector<TranslationBlock *> TBVec;
+typedef std::vector<std::pair<BlockID, uint16_t> > RestoreVec;
+typedef std::map<uintptr_t, std::string> TCGHelperMap;   /* <func_ptr, func_name> */
+typedef std::map<std::string, HelperInfo*> HelperMap;
+typedef std::map<std::string, uintptr_t> SymbolMap;
+typedef std::map<intptr_t, Type *> FlatType;       /* <state_off, state_ty> */
+typedef std::vector<Instruction *> IVec;
+typedef std::vector<BasicBlock *> BBVec;
+
+
+static inline const DataLayout *getDataLayout(Module *Mod) {
+#if defined(LLVM_V35)
+    return Mod->getDataLayout();    /* Used directly as the pointer result */
+#else
+    return &Mod->getDataLayout();   /* Address of the module's layout object */
+#endif
+}
+
+static inline AllocaInst *CreateAlloca(Type *Ty, unsigned AddrSpace,
+                                       const Twine &Name,
+                                       Instruction *InsertBefore = nullptr) {
+#if defined(LLVM_V35) || defined(LLVM_V38) || defined(LLVM_V39)
+    return new AllocaInst(Ty, Name, InsertBefore);  /* AddrSpace is ignored here */
+#else
+    return new AllocaInst(Ty, AddrSpace, Name, InsertBefore);
+#endif
+}
+
+static inline AllocaInst *CreateAlloca(Type *Ty, unsigned AddrSpace,
+                                       Value *ArraySize = nullptr,
+                                       const Twine &Name = "",
+                                       Instruction *InsertBefore = nullptr) {
+#if defined(LLVM_V35) || defined(LLVM_V38) || defined(LLVM_V39)
+    return new AllocaInst(Ty, ArraySize, Name, InsertBefore);  /* AddrSpace is ignored here */
+#else
+    return new AllocaInst(Ty, AddrSpace, ArraySize, Name, InsertBefore);
+#endif
+}
+
+static inline void InlineFunc(CallInst *CI) {
+#if defined(LLVM_V38) || defined(LLVM_V39)
+    AssumptionCacheTracker ACT;     /* Local tracker passed to InlineFunctionInfo */
+    InlineFunctionInfo IFI(nullptr, &ACT);
+#else
+    InlineFunctionInfo IFI;
+#endif
+    InlineFunction(CI, IFI);        /* The result of InlineFunction() is not checked */
+}
+
+#endif
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/include/llvm.h b/llvm/include/llvm.h
new file mode 100644
index 0000000..67bff2f
--- /dev/null
+++ b/llvm/include/llvm.h
@@ -0,0 +1,278 @@
+/*
+ *  (C) 2010 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef __LLVM_H
+#define __LLVM_H
+
+#include <memory>
+#include <vector>
+#include "llvm/ADT/STLExtras.h"
+#include "llvm-types.h"
+#include "llvm-debug.h"
+#include "utils.h"
+
+#if defined(ENABLE_MCJIT)
+#include "llvm/ExecutionEngine/MCJIT.h"
+#include "MCJITMemoryManager.h"
+typedef class DefaultMCJITMemoryManager MemoryManager;
+#else
+#if defined(LLVM_V35)
+#include "JIT.h"
+#include "JITMemoryManager.h"
+#else
+#  error "LLVM version >3.5 supports MCJIT only. ENABLE_MCJIT must be enabled."
+#endif
+typedef class DefaultJITMemoryManager MemoryManager;
+#endif
+
+
+extern cl::OptionCategory CategoryHQEMU;
+
+class LLVMTranslator;
+class OptimizationInfo;
+class TranslatedCode;
+
+typedef std::unique_ptr<OptimizationInfo> OptRequest;
+
+
+/*
+ * LLVMEnv is the top level container of whole LLVM translation environment
+ * which manages the LLVM translator(s) and globally shared resources. The
+ * LLVMEnv instance must be initialized before using the underlying translation
+ * service and can only be initialized ONCE.
+ */
+class LLVMEnv {
+public:
+    typedef std::vector<TranslatedCode *> TransCodeList;
+    typedef std::map<uintptr_t, TranslatedCode *> TransCodeMap;
+    typedef std::vector<uintptr_t> ChainSlot;
+    typedef std::pair<size_t, uintptr_t> SlotInfo;
+
+private:
+    std::shared_ptr<MemoryManager> MM;        /* Trace cache manager */
+    unsigned NumTranslator;                   /* Number of LLVM translators */
+    std::vector<LLVMTranslator *> Translator; /* LLVM translators */
+    std::vector<pthread_t> HelperThread;      /* LLVM translation threads */
+    std::vector<CPUState *> ThreadEnv;        /* Indexed by getThreadEnv() */
+
+    TransCodeList TransCode;  /* Translated traces. */
+    TransCodeMap SortedCode;  /* Sorted traces in code cache address order. */
+    ChainSlot ChainPoint;     /* Address of stubs for trace-to-block linking */
+
+    bool UseThreading; /* Whether multithreaded translators are used or not. */
+    unsigned NumFlush;
+
+    LLVMEnv();         /* Private constructor; see the static CreateLLVMEnv() */
+
+    /* Parse the command line options. */
+    void ParseCommandLineOptions();
+
+    /* Test whether HQEMU is running in Intel VTune. */
+    void ProbeIntelVTune();
+
+public:
+    QemuMutex mutex;
+
+    ~LLVMEnv();
+
+    /* Start/stop/restart LLVM translators and worker threads. */
+    void CreateTranslator();
+    void DeleteTranslator();
+    void RestartTranslator();
+    void StartThread();
+    void StopThread();
+
+    /* Get the LLVM translator with index. */
+    LLVMTranslator *getTranslator(unsigned ID) {
+        if (ID >= Translator.size())
+            hqemu_error("invalid translator ID.\n");
+        return Translator[ID];
+    }
+
+    /* Acquire and lock the first LLVM translator. */
+    LLVMTranslator *AcquireSingleTranslator();
+
+    /* Release the first LLVM translator. */
+    void ReleaseSingleTranslator();
+
+    /* Get CPUState of the LLVM translator with index (ID is not range-checked). */
+    CPUState *getThreadEnv(int ID)              { return ThreadEnv[ID];  }
+
+    std::vector<pthread_t> &getHelperThread()   { return HelperThread;   }
+    std::shared_ptr<MemoryManager> getMemoryManager() { return MM;       }
+    TransCodeList &getTransCode()               { return TransCode;      }
+    TransCodeMap &getSortedCode()               { return SortedCode;     }
+    ChainSlot &getChainPoint()                  { return ChainPoint;     }
+    TraceID insertTransCode(TranslatedCode *TC);
+    SlotInfo getChainSlot();
+
+    bool isThreading()     { return UseThreading;      }
+    void incNumFlush()     { NumFlush++;               }
+    unsigned getNumFlush() { return NumFlush;          }
+
+    /*
+     * static public members
+     */
+    static bool InitOnce;  /* LLVMEnv is initialized or not? */
+    static int TransMode;
+    static uint8_t *TraceCache;
+    static size_t TraceCacheSize;
+    static bool RunWithVTune;
+
+    static void CreateLLVMEnv();
+    static void DeleteLLVMEnv();
+    static int OptimizeBlock(CPUArchState *env, OptRequest Request);
+    static int OptimizeTrace(CPUArchState *env, OptRequest Request);
+    static void setTransMode(int Mode) { TransMode = Mode; }
+    static int isTraceMode() {  /* Nonzero for the HYBRIDS and HYBRIDM modes only */
+        return (TransMode == TRANS_MODE_HYBRIDS ||
+                TransMode == TRANS_MODE_HYBRIDM);
+    }
+};
+
+class QueueManager {
+    std::vector<Queue *> ActiveQueue;
+    Queue *CurrentQueue;
+
+public:
+    QueueManager();
+    ~QueueManager();
+    void Enqueue(OptimizationInfo *Opt);  /* NOTE(review): ownership of Opt is not visible here */
+    void *Dequeue();                      /* Untyped result; presumably an OptimizationInfo - confirm */
+    void Flush();
+};
+
+/*
+ * OptimizationInfo is the description to an optimization request. It consists
+ * of the optimization mode and the control-flow-graph of the trace.
+ */
+class OptimizationInfo {
+public:
+    typedef std::set<TranslationBlock *> TraceNode;
+    typedef std::map<TranslationBlock *, TraceNode> TraceEdge;
+
+    /* The CFG, when one has been built, is released together with the request. */
+    ~OptimizationInfo() {
+        if (CFG != nullptr)
+            GraphNode::DeleteCFG(CFG);
+    }
+
+    void ComposeCFG();
+    GraphNode *getCFG() { return CFG; }
+    bool isTrace()      { return isBlock == false; }
+
+    /* Factories: the constructors are private; requests come out as OptRequest. */
+    static OptRequest CreateRequest(TranslationBlock *tb) {
+        return OptRequest(new OptimizationInfo(tb));
+    }
+    static OptRequest CreateRequest(TBVec &trace, int idx) {
+        return OptRequest(new OptimizationInfo(trace, idx));
+    }
+    static OptRequest CreateRequest(TranslationBlock *head, TraceEdge &edges) {
+        return OptRequest(new OptimizationInfo(head, edges));
+    }
+
+private:
+    TBVec Trace;       /* Trace of a list of TBs */
+    int LoopHeadIdx;   /* Index to the loopback block */
+    bool isUserTrace;  /* Trace of all user-mode blocks */
+    bool isBlock;      /* Trace of a single block */
+    GraphNode *CFG;    /* CFG of the trace */
+
+    /* Single-block request: the CFG is one node and there is no loop head. */
+    OptimizationInfo(TranslationBlock *tb)
+        : Trace(1, tb), LoopHeadIdx(-1), isUserTrace(true), isBlock(true),
+          CFG(new GraphNode(tb)) {}
+    /* Trace request: the CFG is left unset here. */
+    OptimizationInfo(TBVec &trace, int idx)
+        : LoopHeadIdx(idx), isUserTrace(true), isBlock(false), CFG(nullptr) {
+        if (trace.empty())
+            hqemu_error("trace length cannot be zero.\n");
+        Trace = trace;
+    }
+    OptimizationInfo(TranslationBlock *HeadTB, TraceEdge &Edges);
+
+    void SearchCycle(TraceNode &SearchNodes, TraceNode &Nodes,
+                     TraceEdge &Edges, TBVec &Visited, int Depth);
+    void ExpandTrace(TranslationBlock *HeadTB, TraceEdge &Edges);
+};
+
+class TraceInfo {
+public:
+    TBVec TBs;               /* Blocks of the trace; TBs[0] is the entry block */
+    unsigned NumLoop;
+    unsigned NumExit;
+    unsigned NumIndirectBr;
+    uint64_t **ExecCount;
+    uint64_t TransTime;      /* Set by setTransTime(), in microseconds */
+    uint32_t Attribute;
+
+    /* Collect the TB of every node; an empty node list is reported as an error. */
+    TraceInfo(NodeVec &Nodes, uint32_t Attr = A_None)
+        : NumLoop(0), NumExit(0), NumIndirectBr(0), ExecCount(nullptr),
+          TransTime(0), Attribute(Attr) {
+        if (Nodes.empty())
+            hqemu_error("number of nodes cannot be zero.\n");
+        for (unsigned Idx = 0, End = Nodes.size(); Idx < End; ++Idx)
+            TBs.push_back(Nodes[Idx]->getTB());
+    }
+
+    TranslationBlock *getEntryTB() { return TBs.front(); }
+    target_ulong getEntryPC()      { return TBs.front()->pc; }
+    unsigned getNumBlock()         { return TBs.size(); }
+    void setTransTime(struct timeval *start, struct timeval *end) {
+        struct timeval Elapsed;
+        timersub(end, start, &Elapsed);
+        TransTime = Elapsed.tv_sec * 1e6 + Elapsed.tv_usec;
+    }
+    bool hasAttribute(uint32_t Attr) {
+        return (Attribute & Attr) != 0;
+    }
+};
+
+struct ChainInfo {
+    std::vector<uintptr_t> Chains;
+    std::vector<BlockID> DepTraces;
+
+    void insertChain(uintptr_t addr)  { Chains.push_back(addr); }
+    void insertDepTrace(BlockID id)   { DepTraces.push_back(id); }
+    /* Return the ChainInfo attached to tb, creating it on first use. */
+    static ChainInfo *get(TranslationBlock *tb) {
+        if (tb->chain == nullptr)
+            tb->chain = new ChainInfo;
+        return (ChainInfo *)tb->chain;
+    }
+
+    /* Destroy the ChainInfo attached to tb (if any) and clear the link. */
+    static void free(TranslationBlock *tb) {
+        ChainInfo *Info = (ChainInfo *)tb->chain;
+        tb->chain = nullptr;
+        delete Info;
+    }
+};
+
+class TranslatedCode {
+public:
+    /* All members get defined defaults; Trace is owned and freed on destruction. */
+    TranslatedCode()
+        : Active(false), Size(0), Code(nullptr), EntryTB(nullptr),
+          Trace(nullptr), SampleCount(0) {}
+    ~TranslatedCode() { delete Trace; }  /* Deleting a null pointer is a no-op */
+
+    bool Active;
+    uint32_t Size;             /* Size of the translated host code */
+    uint8_t *Code;             /* Start PC of the translated host code */
+    TranslationBlock *EntryTB; /* The entry block of the region */
+    RestoreVec Restore;
+    TraceInfo *Trace;
+    uint64_t SampleCount;
+};
+
+
+#endif
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/include/optimization.h b/llvm/include/optimization.h
new file mode 100644
index 0000000..bdafb3a
--- /dev/null
+++ b/llvm/include/optimization.h
@@ -0,0 +1,261 @@
+/*
+ *  (C) 2015 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef __OPTIMIZATION_H
+#define __OPTIMIZATION_H
+
+#include <iostream>
+#include <list>
+#include "qemu-types.h"
+
+
+extern "C" TranslationBlock *tbs;
+
+/*
+ * Instruction TLB (iTLB)
+ */
+#define ITLB_CACHE_BITS     (10)
+#define ITLB_CACHE_SIZE     (1U << ITLB_CACHE_BITS)
+#define ITLB_CACHE_MASK     (ITLB_CACHE_SIZE - 1)
+
+class ITLB {
+    struct itlb_t { tb_page_addr_t paddr; };
+    itlb_t Cache[ITLB_CACHE_SIZE];
+
+public:
+    ITLB() { reset(); }
+    ~ITLB() {}
+
+    /* Slot picked by the low ITLB_CACHE_BITS bits of the page number. */
+    inline itlb_t &cache(target_ulong vaddr) {
+        return Cache[(vaddr >> TARGET_PAGE_BITS) & ITLB_CACHE_MASK];
+    }
+    /* Store -1 into the paddr of every slot. */
+    void reset() {
+        for (itlb_t *P = Cache, *E = Cache + ITLB_CACHE_SIZE; P != E; ++P)
+            P->paddr = (tb_page_addr_t)-1;
+    }
+    void flush(target_ulong vaddr) { insert(vaddr, (tb_page_addr_t)-1); }
+    /* Overwrite the slot vaddr maps to; no tag is kept alongside paddr. */
+    void insert(target_ulong vaddr, tb_page_addr_t paddr) {
+        itlb_t &Slot = cache(vaddr);
+        Slot.paddr = paddr;
+    }
+    tb_page_addr_t get(target_ulong vaddr) { return cache(vaddr).paddr; }
+};
+
+
+/*
+ * Indirect Branch Target Cache (IBTC)
+ */
+#define IBTC_CACHE_BITS     (16)
+#define IBTC_CACHE_SIZE     (1U << IBTC_CACHE_BITS)
+#define IBTC_CACHE_MASK     (IBTC_CACHE_SIZE - 1)
+
+class IBTC {
+    typedef std::pair<target_ulong, TranslationBlock *> ibtc_t;
+    ibtc_t Cache[IBTC_CACHE_SIZE];
+    bool NeedUpdate;
+    uint64_t Total;         /* Total access count */
+    uint64_t Miss;          /* Miss count */
+
+public:
+    IBTC() : NeedUpdate(false), Total(0), Miss(0) { reset(); }
+    ~IBTC() {}
+
+    inline ibtc_t &cache(target_ulong pc) {
+        return Cache[(pc >> 2) & IBTC_CACHE_MASK];
+    }
+    void reset() {                      /* tag every slot with -1 */
+        for (unsigned i = 0; i < IBTC_CACHE_SIZE; ++i)
+            Cache[i].first = (target_ulong)-1;
+    }
+    void remove(TranslationBlock *tb) { /* untag only if slot holds tb->pc */
+        if (cache(tb->pc).first == tb->pc)
+            cache(tb->pc).first = (target_ulong)-1;
+    }
+    void insert(target_ulong pc, TranslationBlock *tb) {
+        cache(pc) = std::make_pair(pc, tb);
+    }
+    /* Return the cached tb only if the slot is tagged with pc, else nullptr. */
+    TranslationBlock *get(target_ulong pc) {
+        return (cache(pc).first == pc) ? cache(pc).second : nullptr;
+    }
+    void setUpdate()   { NeedUpdate = true;  }
+    void resetUpdate() { NeedUpdate = false; }
+    bool needUpdate()  { return NeedUpdate;  }
+    inline void incTotal() { Total++; }
+    inline void incMiss()  { Miss++;  }
+    /* Print miss/total to stderr; the rate is 0 (not 0/0 = NaN) if unused. */
+    void dump() {
+        double HitRate = Total ? (double)(Total - Miss) * 100 / Total : 0.0;
+        std::cerr << "\nibtc.miss = " << Miss << "/" << Total <<
+                     "  (hit rate=" << HitRate << "%)\n";
+    }
+};
+
+/*
+ * Cross-Page Block Linking (CPBL)
+ */
+class CPBL {
+    uint64_t Total, Miss;                   /* Access count / miss count */
+    uint64_t ValidateTotal, ValidateMiss;   /* Validation count / miss count */
+public:
+    CPBL() : Total(0), Miss(0), ValidateTotal(0), ValidateMiss(0) {}
+
+    inline void incTotal()  { Total++; }
+    inline void incMiss()   { Miss++; }
+    inline void incValidateTotal() { ValidateTotal++; }
+    inline void incValidateMiss()  { ValidateMiss++;  }
+    /* Print both hit rates, each over its own total (0 if that total is 0). */
+    void dump() {
+        double HitRate = Total ? (double)(Total - Miss) * 100 / Total : 0.0;
+        double HitRate2 = !ValidateTotal ? 0.0 :
+            (double)(ValidateTotal - ValidateMiss) * 100 / ValidateTotal;
+        std::cerr << "cpbl.miss = " << Miss << "/" << Total <<
+                     "  (hit rate=" << HitRate << "%)\n" <<
+                     "validate.miss = " << ValidateMiss << "/" << ValidateTotal <<
+                     "  (hit rate=" << HitRate2 << "%)\n";
+    }
+};
+
+/*
+ * Large Page Table
+ *
+ * This handling tracks every large page created by the guest system.
+ * Once a `possibly' large page is invalidated, search the tracked pages
+ * to determine if it is really a large page invalidation. If it cannot
+ * be found, this is a false alert and we can fall back to the default-size
+ * page flushing. Otherwise it is a true large page flush, and SoftTLB,
+ * IBTC/CPBL optimization, etc. need a partial or full cleanup.
+ */
+#define MAX_NUM_LARGEPAGE   (1024)
+
+class LargePageTable {
+    typedef std::pair<target_ulong, target_ulong> PTE;  /* (base, mask) */
+    typedef std::list<PTE> PTEList;
+    PTEList Used;       /* tracked pages; last inserted/hit entry first */
+    PTEList Free;       /* pool of unused entries */
+    CPUState *CS;
+    uint64_t Total;
+    uint64_t Miss;
+
+public:
+    LargePageTable(CPUState *cpu) : CS(cpu), Total(0), Miss(0) {
+        Free.resize(MAX_NUM_LARGEPAGE);
+    }
+    ~LargePageTable() {}
+
+    enum {              /* values for the mode argument of search() */
+        SEARCH = 0,
+        FLUSH,
+    };
+
+    /* Move every tracked page back to the free pool. */
+    void reset() { Free.splice(Free.end(), Used); }
+    /* Move the tracked entry at I back to the free pool. */
+    void remove(PTEList::iterator I) { Free.splice(Free.begin(), Used, I); }
+    void allocate(PTE pte) {
+        /* An empty free list is refilled by tlb_flush(), which should invoke
+         * LPT::reset(); reset directly if it did not (front() needs a node). */
+        if (Free.empty())
+            tlb_flush(CS, 0);
+        if (Free.empty())
+            reset();
+        Free.front() = pte;
+        Used.splice(Used.begin(), Free, Free.begin());
+    }
+    /* Track a large page; a page already tracked just moves to the front. */
+    void insert(target_ulong addr, target_ulong size) {
+        for (PTEList::iterator I = Used.begin(), E = Used.end(); I != E; ++I) {
+            if (I->first == (addr & I->second)) {
+                Used.splice(Used.begin(), Used, I);
+                return;
+            }
+        }
+        target_ulong mask = ~(size - 1);
+        allocate(PTE(addr & mask, mask));
+    }
+    /* On a hit, report the page's base/size; FLUSH mode also untracks it. */
+    bool search(target_ulong addr, bool mode, target_ulong *addrp,
+                target_ulong *sizep) {
+        for (PTEList::iterator I = Used.begin(), E = Used.end(); I != E; ++I) {
+            if (I->first != (addr & I->second))
+                continue;
+            *addrp = I->first;
+            *sizep = ~I->second + 1;
+            if (mode == FLUSH)
+                remove(I);
+            return true;
+        }
+        return false;
+    }
+    void incTotal() { Total++; }
+    void incMiss()  { Miss++;  }
+    void dump() {       /* Rate is printed as 0 (not 0/0 = NaN) if Total is 0 */
+        double Rate = Total ? (double)(Total - Miss) * 100 / Total : 0.0;
+        std::cerr << "lpt.miss = " << Miss << "/" << Total <<
+                     " (false flushing=" << Rate << "% #pages=" <<
+                     Used.size() << ")\n";
+    }
+};
+
+
+class BaseTracer;
+
+struct CPUOptimization {
+    /* lpt is built in place from cpu (no temporary table is copied). */
+    CPUOptimization(CPUState *cpu, BaseTracer *tracer) : lpt(cpu), pt(tracer) {}
+
+    ITLB itlb;          /* instruction TLB */
+    IBTC ibtc;          /* indirect branch target cache */
+    CPBL cpbl;          /* cross-page block linking */
+    LargePageTable lpt; /* large page handling */
+    BaseTracer *pt;     /* processor tracer */
+};
+
+
+static inline int isUserTB(TranslationBlock *tb) { /* per-target test of tb->flags */
+    int is_user = 1;    /* returned unchanged when CONFIG_SOFTMMU is not defined */
+#if defined(CONFIG_SOFTMMU)
+#if defined(TARGET_ALPHA)
+    is_user = (tb->flags & TB_FLAGS_USER_MODE);     /* masked bits, not 0/1 */
+#elif defined(TARGET_ARM)
+    is_user = ((ARM_TBFLAG_MMUIDX(tb->flags) & 3) == 0);
+#elif defined(TARGET_I386)
+    is_user = ((tb->flags >> HF_CPL_SHIFT) & 3) == 3;
+#elif defined(TARGET_MIPS)
+    is_user = (tb->flags & MIPS_HFLAG_UM);          /* masked bits, not 0/1 */
+#elif defined(TARGET_PPC)
+    is_user = ((tb->flags >> MSR_PR) & 1);
+#else
+#error "unsupported processor type"
+#endif /* TARGET_* */
+#endif /* CONFIG_SOFTMMU */
+    return is_user;     /* callers should test for nonzero, not == 1 */
+}
+
+static inline ITLB &cpu_get_itlb(CPUArchState *env) {      /* instruction TLB */
+    return ((CPUOptimization *)env->opt_link)->itlb;    /* opt_link not null-checked */
+}
+static inline IBTC &cpu_get_ibtc(CPUArchState *env) {      /* indirect branch cache */
+    return ((CPUOptimization *)env->opt_link)->ibtc;
+}
+static inline CPBL &cpu_get_cpbl(CPUArchState *env) {      /* cross-page linking */
+    return ((CPUOptimization *)env->opt_link)->cpbl;
+}
+static inline LargePageTable &cpu_get_lpt(CPUArchState *env) { /* large pages */
+    return ((CPUOptimization *)env->opt_link)->lpt;
+}
+static inline BaseTracer *cpu_get_tracer(CPUArchState *env) {  /* the pt member */
+    return ((CPUOptimization *)env->opt_link)->pt;
+}
+
+#endif
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
+
diff --git a/llvm/include/pmu/arm/arm-events.h b/llvm/include/pmu/arm/arm-events.h
new file mode 100644
index 0000000..b3bb1d7
--- /dev/null
+++ b/llvm/include/pmu/arm/arm-events.h
@@ -0,0 +1,35 @@
+/*
+ *  (C) 2018 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef __ARM_EVENTS_H
+#define __ARM_EVENTS_H
+
+#include <vector>
+#include "pmu/pmu.h"
+
+namespace pmu {
+
+class PMUEvent;
+/* pmu_mb / pmu_rmb / pmu_wmb: barrier macros; undefined on other hosts. */
+#if defined(__arm__)
+#define pmu_mb()    ((void(*)(void))0xffff0fa0)()
+#define pmu_rmb()   ((void(*)(void))0xffff0fa0)()
+#define pmu_wmb()   ((void(*)(void))0xffff0fa0)()
+#elif defined(__aarch64__)
+#define pmu_mb()    asm volatile("dmb ish" ::: "memory")
+#define pmu_rmb()   asm volatile("dmb ishld" ::: "memory")
+#define pmu_wmb()   asm volatile("dmb ishst" ::: "memory")
+#endif /* __arm__ / __aarch64__ */
+/* NOTE(review): 0xffff0fa0 looks like Linux's __kuser_memory_barrier helper
+ * -- confirm. ARMInit() is defined elsewhere; its result is not shown here. */
+int ARMInit(void);
+
+} /* namespace pmu */
+
+#endif /* __ARM_EVENTS_H */
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/include/pmu/perf_event.h b/llvm/include/pmu/perf_event.h
new file mode 100644
index 0000000..81fed4a
--- /dev/null
+++ b/llvm/include/pmu/perf_event.h
@@ -0,0 +1,992 @@
+/*
+ * This file is copied from linux-4.11/include/uapi/linux/perf_event.h.
+ *
+ * Performance events:
+ *
+ *    Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de>
+ *    Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar
+ *    Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra
+ *
+ * Data type definitions, declarations, prototypes.
+ *
+ *    Started by: Thomas Gleixner and Ingo Molnar
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+#ifndef _UAPI_LINUX_PERF_EVENT_H
+#define _UAPI_LINUX_PERF_EVENT_H
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/*
+ * User-space ABI bits:
+ */
+
+/*
+ * attr.type
+ */
+enum perf_type_id {				/* values of attr.type */
+	PERF_TYPE_HARDWARE			= 0,
+	PERF_TYPE_SOFTWARE			= 1,
+	PERF_TYPE_TRACEPOINT			= 2,
+	PERF_TYPE_HW_CACHE			= 3,
+	PERF_TYPE_RAW				= 4,
+	PERF_TYPE_BREAKPOINT			= 5,
+
+	PERF_TYPE_MAX,				/* non-ABI: one past the last id */
+};
+
+/*
+ * Generalized performance event ids, passed in attr.config (there is no
+ * attr.event_id field in struct perf_event_attr) to the
+ * sys_perf_event_open() syscall:
+ */
+enum perf_hw_id {
+	/*
+	 * Common hardware events, generalized by the kernel:
+	 */
+	PERF_COUNT_HW_CPU_CYCLES		= 0,
+	PERF_COUNT_HW_INSTRUCTIONS		= 1,
+	PERF_COUNT_HW_CACHE_REFERENCES		= 2,
+	PERF_COUNT_HW_CACHE_MISSES		= 3,
+	PERF_COUNT_HW_BRANCH_INSTRUCTIONS	= 4,
+	PERF_COUNT_HW_BRANCH_MISSES		= 5,
+	PERF_COUNT_HW_BUS_CYCLES		= 6,
+	PERF_COUNT_HW_STALLED_CYCLES_FRONTEND	= 7,
+	PERF_COUNT_HW_STALLED_CYCLES_BACKEND	= 8,
+	PERF_COUNT_HW_REF_CPU_CYCLES		= 9,
+
+	PERF_COUNT_HW_MAX,			/* non-ABI: one past the last id */
+};
+
+/*
+ * Generalized hardware cache events:
+ *
+ *       { L1-D, L1-I, LLC, ITLB, DTLB, BPU, NODE } x
+ *       { read, write, prefetch } x
+ *       { accesses, misses }
+ */
+enum perf_hw_cache_id {				/* which cache-like unit */
+	PERF_COUNT_HW_CACHE_L1D			= 0,
+	PERF_COUNT_HW_CACHE_L1I			= 1,
+	PERF_COUNT_HW_CACHE_LL			= 2,
+	PERF_COUNT_HW_CACHE_DTLB		= 3,
+	PERF_COUNT_HW_CACHE_ITLB		= 4,
+	PERF_COUNT_HW_CACHE_BPU			= 5,
+	PERF_COUNT_HW_CACHE_NODE		= 6,
+
+	PERF_COUNT_HW_CACHE_MAX,		/* non-ABI: one past the last id */
+};
+
+enum perf_hw_cache_op_id {			/* the { read, write, prefetch } axis */
+	PERF_COUNT_HW_CACHE_OP_READ		= 0,
+	PERF_COUNT_HW_CACHE_OP_WRITE		= 1,
+	PERF_COUNT_HW_CACHE_OP_PREFETCH		= 2,
+
+	PERF_COUNT_HW_CACHE_OP_MAX,		/* non-ABI: one past the last id */
+};
+
+enum perf_hw_cache_op_result_id {		/* the { accesses, misses } axis */
+	PERF_COUNT_HW_CACHE_RESULT_ACCESS	= 0,
+	PERF_COUNT_HW_CACHE_RESULT_MISS		= 1,
+
+	PERF_COUNT_HW_CACHE_RESULT_MAX,		/* non-ABI: one past the last id */
+};
+
+/*
+ * Special "software" events provided by the kernel, even if the hardware
+ * does not support performance events. These events measure various
+ * physical and sw events of the kernel (and allow the profiling of them as
+ * well):
+ */
+enum perf_sw_ids {				/* kernel-provided software events */
+	PERF_COUNT_SW_CPU_CLOCK			= 0,
+	PERF_COUNT_SW_TASK_CLOCK		= 1,
+	PERF_COUNT_SW_PAGE_FAULTS		= 2,
+	PERF_COUNT_SW_CONTEXT_SWITCHES		= 3,
+	PERF_COUNT_SW_CPU_MIGRATIONS		= 4,
+	PERF_COUNT_SW_PAGE_FAULTS_MIN		= 5,
+	PERF_COUNT_SW_PAGE_FAULTS_MAJ		= 6,
+	PERF_COUNT_SW_ALIGNMENT_FAULTS		= 7,
+	PERF_COUNT_SW_EMULATION_FAULTS		= 8,
+	PERF_COUNT_SW_DUMMY			= 9,
+	PERF_COUNT_SW_BPF_OUTPUT		= 10,
+
+	PERF_COUNT_SW_MAX,			/* non-ABI: one past the last id */
+};
+
+/*
+ * Bits that can be set in attr.sample_type to request information
+ * in the overflow packets.
+ */
+enum perf_event_sample_format {			/* one bit per attr.sample_type option */
+	PERF_SAMPLE_IP				= 1U << 0,
+	PERF_SAMPLE_TID				= 1U << 1,
+	PERF_SAMPLE_TIME			= 1U << 2,
+	PERF_SAMPLE_ADDR			= 1U << 3,
+	PERF_SAMPLE_READ			= 1U << 4,
+	PERF_SAMPLE_CALLCHAIN			= 1U << 5,
+	PERF_SAMPLE_ID				= 1U << 6,
+	PERF_SAMPLE_CPU				= 1U << 7,
+	PERF_SAMPLE_PERIOD			= 1U << 8,
+	PERF_SAMPLE_STREAM_ID			= 1U << 9,
+	PERF_SAMPLE_RAW				= 1U << 10,
+	PERF_SAMPLE_BRANCH_STACK		= 1U << 11,
+	PERF_SAMPLE_REGS_USER			= 1U << 12,
+	PERF_SAMPLE_STACK_USER			= 1U << 13,
+	PERF_SAMPLE_WEIGHT			= 1U << 14,
+	PERF_SAMPLE_DATA_SRC			= 1U << 15,
+	PERF_SAMPLE_IDENTIFIER			= 1U << 16,
+	PERF_SAMPLE_TRANSACTION			= 1U << 17,
+	PERF_SAMPLE_REGS_INTR			= 1U << 18,
+
+	PERF_SAMPLE_MAX = 1U << 19,		/* non-ABI: first bit not used above */
+};
+
+/*
+ * values to program into branch_sample_type when PERF_SAMPLE_BRANCH_STACK is set
+ *
+ * If the user does not pass priv level information via branch_sample_type,
+ * the kernel uses the event's priv level. Branch and event priv levels do
+ * not have to match. Branch priv level is checked for permissions.
+ *
+ * The branch types can be combined, however BRANCH_ANY covers all types
+ * of branches and therefore it supersedes all the other types.
+ */
+enum perf_branch_sample_type_shift {		/* bit positions, not masks */
+	PERF_SAMPLE_BRANCH_USER_SHIFT		= 0, /* user branches */
+	PERF_SAMPLE_BRANCH_KERNEL_SHIFT		= 1, /* kernel branches */
+	PERF_SAMPLE_BRANCH_HV_SHIFT		= 2, /* hypervisor branches */
+
+	PERF_SAMPLE_BRANCH_ANY_SHIFT		= 3, /* any branch types */
+	PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT	= 4, /* any call branch */
+	PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT	= 5, /* any return branch */
+	PERF_SAMPLE_BRANCH_IND_CALL_SHIFT	= 6, /* indirect calls */
+	PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT	= 7, /* transaction aborts */
+	PERF_SAMPLE_BRANCH_IN_TX_SHIFT		= 8, /* in transaction */
+	PERF_SAMPLE_BRANCH_NO_TX_SHIFT		= 9, /* not in transaction */
+	PERF_SAMPLE_BRANCH_COND_SHIFT		= 10, /* conditional branches */
+
+	PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT	= 11, /* call/ret stack */
+	PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT	= 12, /* indirect jumps */
+	PERF_SAMPLE_BRANCH_CALL_SHIFT		= 13, /* direct call */
+
+	PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT	= 14, /* no flags */
+	PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT	= 15, /* no cycles */
+
+	PERF_SAMPLE_BRANCH_MAX_SHIFT		/* non-ABI: one past the last shift */
+};
+
+enum perf_branch_sample_type {		/* each mask is 1U << its *_SHIFT value */
+	PERF_SAMPLE_BRANCH_USER		= 1U << PERF_SAMPLE_BRANCH_USER_SHIFT,
+	PERF_SAMPLE_BRANCH_KERNEL	= 1U << PERF_SAMPLE_BRANCH_KERNEL_SHIFT,
+	PERF_SAMPLE_BRANCH_HV		= 1U << PERF_SAMPLE_BRANCH_HV_SHIFT,
+
+	PERF_SAMPLE_BRANCH_ANY		= 1U << PERF_SAMPLE_BRANCH_ANY_SHIFT,
+	PERF_SAMPLE_BRANCH_ANY_CALL	= 1U << PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT,
+	PERF_SAMPLE_BRANCH_ANY_RETURN	= 1U << PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT,
+	PERF_SAMPLE_BRANCH_IND_CALL	= 1U << PERF_SAMPLE_BRANCH_IND_CALL_SHIFT,
+	PERF_SAMPLE_BRANCH_ABORT_TX	= 1U << PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT,
+	PERF_SAMPLE_BRANCH_IN_TX	= 1U << PERF_SAMPLE_BRANCH_IN_TX_SHIFT,
+	PERF_SAMPLE_BRANCH_NO_TX	= 1U << PERF_SAMPLE_BRANCH_NO_TX_SHIFT,
+	PERF_SAMPLE_BRANCH_COND		= 1U << PERF_SAMPLE_BRANCH_COND_SHIFT,
+
+	PERF_SAMPLE_BRANCH_CALL_STACK	= 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT,
+	PERF_SAMPLE_BRANCH_IND_JUMP	= 1U << PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT,
+	PERF_SAMPLE_BRANCH_CALL		= 1U << PERF_SAMPLE_BRANCH_CALL_SHIFT,
+
+	PERF_SAMPLE_BRANCH_NO_FLAGS	= 1U << PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT,
+	PERF_SAMPLE_BRANCH_NO_CYCLES	= 1U << PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT,
+
+	PERF_SAMPLE_BRANCH_MAX		= 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, /* non-ABI */
+};
+
+#define PERF_SAMPLE_BRANCH_PLM_ALL \
+	(PERF_SAMPLE_BRANCH_USER|\
+	 PERF_SAMPLE_BRANCH_KERNEL|\
+	 PERF_SAMPLE_BRANCH_HV)
+
+/*
+ * Values to determine ABI of the registers dump.
+ */
+enum perf_sample_regs_abi {	/* the "abi" word of REGS_USER / REGS_INTR samples */
+	PERF_SAMPLE_REGS_ABI_NONE	= 0,
+	PERF_SAMPLE_REGS_ABI_32		= 1,
+	PERF_SAMPLE_REGS_ABI_64		= 2,
+};
+
+/*
+ * Values for the memory transaction event qualifier, mostly for
+ * abort events. Multiple bits can be set.
+ */
+enum {
+	PERF_TXN_ELISION        = (1 << 0), /* From elision */
+	PERF_TXN_TRANSACTION    = (1 << 1), /* From transaction */
+	PERF_TXN_SYNC           = (1 << 2), /* Instruction is related */
+	PERF_TXN_ASYNC          = (1 << 3), /* Instruction not related */
+	PERF_TXN_RETRY          = (1 << 4), /* Retry possible */
+	PERF_TXN_CONFLICT       = (1 << 5), /* Conflict abort */
+	PERF_TXN_CAPACITY_WRITE = (1 << 6), /* Capacity write abort */
+	PERF_TXN_CAPACITY_READ  = (1 << 7), /* Capacity read abort */
+
+	PERF_TXN_MAX	        = (1 << 8), /* non-ABI */
+
+	/* bits 32..63 are reserved for the abort code */
+
+	PERF_TXN_ABORT_MASK  = (0xffffffffULL << 32), /* selects bits 32..63 */
+	PERF_TXN_ABORT_SHIFT = 32,	/* shift that moves the abort code to bit 0 */
+};
+
+/*
+ * The format of the data returned by read() on a perf event fd,
+ * as specified by attr.read_format:
+ *
+ * struct read_format {
+ *	{ u64		value;
+ *	  { u64		time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
+ *	  { u64		time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
+ *	  { u64		id;           } && PERF_FORMAT_ID
+ *	} && !PERF_FORMAT_GROUP
+ *
+ *	{ u64		nr;
+ *	  { u64		time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
+ *	  { u64		time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
+ *	  { u64		value;
+ *	    { u64	id;           } && PERF_FORMAT_ID
+ *	  }		cntr[nr];
+ *	} && PERF_FORMAT_GROUP
+ * };
+ */
+enum perf_event_read_format {			/* one bit per attr.read_format option */
+	PERF_FORMAT_TOTAL_TIME_ENABLED		= 1U << 0,
+	PERF_FORMAT_TOTAL_TIME_RUNNING		= 1U << 1,
+	PERF_FORMAT_ID				= 1U << 2,
+	PERF_FORMAT_GROUP			= 1U << 3,
+
+	PERF_FORMAT_MAX = 1U << 4,		/* non-ABI: first bit not used above */
+};
+
+#define PERF_ATTR_SIZE_VER0	64	/* sizeof first published struct */
+#define PERF_ATTR_SIZE_VER1	72	/* add: config2 */
+#define PERF_ATTR_SIZE_VER2	80	/* add: branch_sample_type */
+#define PERF_ATTR_SIZE_VER3	96	/* add: sample_regs_user */
+					/* add: sample_stack_user */
+#define PERF_ATTR_SIZE_VER4	104	/* add: sample_regs_intr */
+#define PERF_ATTR_SIZE_VER5	112	/* add: aux_watermark */
+
+/*
+ * Hardware event_id to monitor via a performance monitoring event:
+ *
+ * @sample_max_stack: Max number of frame pointers in a callchain,
+ *		      should be < /proc/sys/kernel/perf_event_max_stack
+ */
+struct perf_event_attr {
+
+	/*
+	 * Major type: hardware/software/tracepoint/etc.
+	 */
+	uint32_t type;
+
+	/*
+	 * Size of the attr structure, for fwd/bwd compat.
+	 */
+	uint32_t size;
+
+	/*
+	 * Type specific configuration information.
+	 */
+	uint64_t config;
+
+	union {
+		uint64_t sample_period;
+		uint64_t sample_freq;
+	};
+
+	uint64_t sample_type;	/* bits of enum perf_event_sample_format */
+	uint64_t read_format;	/* bits of enum perf_event_read_format */
+
+	uint64_t disabled       :  1, /* off by default        */
+		 inherit	:  1, /* children inherit it   */
+		 pinned	        :  1, /* must always be on PMU */
+		 exclusive      :  1, /* only group on PMU     */
+		 exclude_user   :  1, /* don't count user      */
+		 exclude_kernel :  1, /* ditto kernel          */
+		 exclude_hv     :  1, /* ditto hypervisor      */
+		 exclude_idle   :  1, /* don't count when idle */
+		 mmap           :  1, /* include mmap data     */
+		 comm	        :  1, /* include comm data     */
+		 freq           :  1, /* use freq, not period  */
+		 inherit_stat   :  1, /* per task counts       */
+		 enable_on_exec :  1, /* next exec enables     */
+		 task           :  1, /* trace fork/exit       */
+		 watermark      :  1, /* wakeup_watermark      */
+		 /*
+		  * precise_ip:
+		  *
+		  *  0 - SAMPLE_IP can have arbitrary skid
+		  *  1 - SAMPLE_IP must have constant skid
+		  *  2 - SAMPLE_IP requested to have 0 skid
+		  *  3 - SAMPLE_IP must have 0 skid
+		  *
+		  *  See also PERF_RECORD_MISC_EXACT_IP
+		  */
+		 precise_ip     :  2, /* skid constraint       */
+		 mmap_data      :  1, /* non-exec mmap data    */
+		 sample_id_all  :  1, /* sample_type all events */
+
+		 exclude_host   :  1, /* don't count in host   */
+		 exclude_guest  :  1, /* don't count in guest  */
+
+		 exclude_callchain_kernel : 1, /* exclude kernel callchains */
+		 exclude_callchain_user   : 1, /* exclude user callchains */
+		 mmap2          :  1, /* include mmap with inode data     */
+		 comm_exec      :  1, /* flag comm events that are due to an exec */
+		 use_clockid    :  1, /* use @clockid for time fields */
+		 context_switch :  1, /* context switch data */
+		 write_backward :  1, /* Write ring buffer from end to beginning */
+		 __reserved_1   : 36;
+
+	union {
+		uint32_t wakeup_events;	  /* wakeup every n events */
+		uint32_t wakeup_watermark; /* bytes before wakeup   */
+	};
+
+	uint32_t bp_type;
+	union {
+		uint64_t bp_addr;
+		uint64_t config1; /* extension of config */
+	};
+	union {
+		uint64_t bp_len;
+		uint64_t config2; /* extension of config1 */
+	};
+	uint64_t branch_sample_type; /* enum perf_branch_sample_type */
+
+	/*
+	 * Defines set of user regs to dump on samples.
+	 * See asm/perf_regs.h for details.
+	 */
+	uint64_t sample_regs_user;
+
+	/*
+	 * Defines size of the user stack to dump on samples.
+	 */
+	uint32_t sample_stack_user;
+
+	int32_t	clockid;	/* clock for time fields when use_clockid is set */
+	/*
+	 * Defines set of regs to dump for each sample
+	 * state captured on:
+	 *  - precise = 0: PMU interrupt
+	 *  - precise > 0: sampled instruction
+	 *
+	 * See asm/perf_regs.h for details.
+	 */
+	uint64_t sample_regs_intr;
+
+	/*
+	 * Wakeup watermark for AUX area
+	 */
+	uint32_t aux_watermark;
+	uint16_t sample_max_stack;
+	uint16_t __reserved_2;	/* align to uint64_t */
+};
+
+#define perf_flags(attr)	(*(&(attr)->read_format + 1))
+
+/*
+ * Ioctls that can be done on a perf event fd:
+ */
+#define PERF_EVENT_IOC_ENABLE		_IO ('$', 0)
+#define PERF_EVENT_IOC_DISABLE		_IO ('$', 1)
+#define PERF_EVENT_IOC_REFRESH		_IO ('$', 2)
+#define PERF_EVENT_IOC_RESET		_IO ('$', 3)
+#define PERF_EVENT_IOC_PERIOD		_IOW('$', 4, uint64_t)
+#define PERF_EVENT_IOC_SET_OUTPUT	_IO ('$', 5)
+#define PERF_EVENT_IOC_SET_FILTER	_IOW('$', 6, char *)
+#define PERF_EVENT_IOC_ID		_IOR('$', 7, uint64_t *)
+#define PERF_EVENT_IOC_SET_BPF		_IOW('$', 8, uint32_t)
+#define PERF_EVENT_IOC_PAUSE_OUTPUT	_IOW('$', 9, uint32_t)
+
+enum perf_event_ioc_flags {	/* NOTE(review): presumably the ioctl argument -- confirm */
+	PERF_IOC_FLAG_GROUP		= 1U << 0,
+};
+
+/*
+ * Structure of the page that can be mapped via mmap
+ */
+struct perf_event_mmap_page {
+	uint32_t version;		/* version number of this structure */
+	uint32_t compat_version;	/* lowest version this is compat with */
+
+	/*
+	 * Bits needed to read the hw events in user-space.
+	 *
+	 *   u32 seq, time_mult, time_shift, index, width;
+	 *   u64 count, enabled, running;
+	 *   u64 cyc, time_offset;
+	 *   s64 pmc = 0;
+	 *
+	 *   do {
+	 *     seq = pc->lock;
+	 *     barrier()
+	 *
+	 *     enabled = pc->time_enabled;
+	 *     running = pc->time_running;
+	 *
+	 *     if (pc->cap_user_time && enabled != running) {
+	 *       cyc = rdtsc();
+	 *       time_offset = pc->time_offset;
+	 *       time_mult   = pc->time_mult;
+	 *       time_shift  = pc->time_shift;
+	 *     }
+	 *
+	 *     index = pc->index;
+	 *     count = pc->offset;
+	 *     if (pc->cap_user_rdpmc && index) {
+	 *       width = pc->pmc_width;
+	 *       pmc = rdpmc(index - 1);
+	 *     }
+	 *
+	 *     barrier();
+	 *   } while (pc->lock != seq);
+	 *
+	 * NOTE: for obvious reasons this only works on self-monitoring
+	 *       processes.
+	 */
+	uint32_t lock;			/* seqlock for synchronization */
+	uint32_t index;			/* hardware event identifier */
+	int64_t	 offset;		/* add to hardware event value */
+	uint64_t time_enabled;		/* time event active */
+	uint64_t time_running;		/* time event on cpu */
+	union {
+		uint64_t capabilities;
+		struct {
+			uint64_t cap_bit0		: 1, /* Always 0, deprecated, see commit 860f085b74e9 */
+				 cap_bit0_is_deprecated	: 1, /* Always 1, signals that bit 0 is zero */
+
+				 cap_user_rdpmc		: 1, /* The RDPMC instruction can be used to read counts */
+				 cap_user_time		: 1, /* The time_* fields are used */
+				 cap_user_time_zero	: 1, /* The time_zero field is used */
+				 cap_____res		: 59;
+		};
+	};
+
+	/*
+	 * If cap_user_rdpmc this field provides the bit-width of the value
+	 * read using the rdpmc() or equivalent instruction. This can be used
+	 * to sign extend the result like:
+	 *
+	 *   pmc <<= 64 - width;
+	 *   pmc >>= 64 - width; // signed shift right
+	 *   count += pmc;
+	 */
+	uint16_t pmc_width;
+
+	/*
+	 * If cap_user_time the below fields can be used to compute the time
+	 * delta since time_enabled (in ns) using rdtsc or similar.
+	 *
+	 *   u64 quot, rem;
+	 *   u64 delta;
+	 *
+	 *   quot = (cyc >> time_shift);
+	 *   rem = cyc & (((u64)1 << time_shift) - 1);
+	 *   delta = time_offset + quot * time_mult +
+	 *              ((rem * time_mult) >> time_shift);
+	 *
+	 * Where time_offset,time_mult,time_shift and cyc are read in the
+	 * seqcount loop described above. This delta can then be added to
+	 * enabled and possible running (if index), improving the scaling:
+	 *
+	 *   enabled += delta;
+	 *   if (index)
+	 *     running += delta;
+	 *
+	 *   quot = count / running;
+	 *   rem  = count % running;
+	 *   count = quot * enabled + (rem * enabled) / running;
+	 */
+	uint16_t time_shift;
+	uint32_t time_mult;
+	uint64_t time_offset;
+	/*
+	 * If cap_user_time_zero, the hardware clock (e.g. TSC) can be calculated
+	 * from sample timestamps.
+	 *
+	 *   time = timestamp - time_zero;
+	 *   quot = time / time_mult;
+	 *   rem  = time % time_mult;
+	 *   cyc = (quot << time_shift) + (rem << time_shift) / time_mult;
+	 *
+	 * And vice versa:
+	 *
+	 *   quot = cyc >> time_shift;
+	 *   rem  = cyc & (((u64)1 << time_shift) - 1);
+	 *   timestamp = time_zero + quot * time_mult +
+	 *               ((rem * time_mult) >> time_shift);
+	 */
+	uint64_t time_zero;
+	uint32_t size;			/* Header size up to __reserved[] fields. */
+
+		/*
+		 * Hole for extension of the self monitor capabilities
+		 */
+
+	uint8_t	__reserved[118*8+4];	/* align to 1k. */
+
+	/*
+	 * Control data for the mmap() data buffer.
+	 *
+	 * User-space reading the @data_head value should issue an smp_rmb(),
+	 * after reading this value.
+	 *
+	 * When the mapping is PROT_WRITE the @data_tail value should be
+	 * written by userspace to reflect the last read data, after issuing
+	 * an smp_mb() to separate the data read from the ->data_tail store.
+	 * In this case the kernel will not over-write unread data.
+	 *
+	 * See perf_output_put_handle() for the data ordering.
+	 *
+	 * data_{offset,size} indicate the location and size of the perf record
+	 * buffer within the mmapped area.
+	 */
+	uint64_t data_head;		/* head in the data section */
+	uint64_t data_tail;		/* user-space written tail */
+	uint64_t data_offset;		/* where the buffer starts */
+	uint64_t data_size;		/* data buffer size */
+
+	/*
+	 * AUX area is defined by aux_{offset,size} fields that should be set
+	 * by the userspace, so that
+	 *
+	 *   aux_offset >= data_offset + data_size
+	 *
+	 * prior to mmap()ing it. Size of the mmap()ed area should be aux_size.
+	 *
+	 * Ring buffer pointers aux_{head,tail} have the same semantics as
+	 * data_{head,tail} and same ordering rules apply.
+	 */
+	uint64_t aux_head;
+	uint64_t aux_tail;
+	uint64_t aux_offset;
+	uint64_t aux_size;
+};
+
+#define PERF_RECORD_MISC_CPUMODE_MASK		(7 << 0)
+#define PERF_RECORD_MISC_CPUMODE_UNKNOWN	(0 << 0)
+#define PERF_RECORD_MISC_KERNEL			(1 << 0)
+#define PERF_RECORD_MISC_USER			(2 << 0)
+#define PERF_RECORD_MISC_HYPERVISOR		(3 << 0)
+#define PERF_RECORD_MISC_GUEST_KERNEL		(4 << 0)
+#define PERF_RECORD_MISC_GUEST_USER		(5 << 0)
+
+/*
+ * Indicates that /proc/PID/maps parsing was truncated by a timeout.
+ */
+#define PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT	(1 << 12)
+/*
+ * PERF_RECORD_MISC_MMAP_DATA and PERF_RECORD_MISC_COMM_EXEC are used on
+ * different events so can reuse the same bit position.
+ * Ditto PERF_RECORD_MISC_SWITCH_OUT.
+ */
+#define PERF_RECORD_MISC_MMAP_DATA		(1 << 13)
+#define PERF_RECORD_MISC_COMM_EXEC		(1 << 13)
+#define PERF_RECORD_MISC_SWITCH_OUT		(1 << 13)
+/*
+ * Indicates that the content of PERF_SAMPLE_IP points to
+ * the actual instruction that triggered the event. See also
+ * perf_event_attr::precise_ip.
+ */
+#define PERF_RECORD_MISC_EXACT_IP		(1 << 14)
+/*
+ * Reserve the last bit to indicate some extended misc field
+ */
+#define PERF_RECORD_MISC_EXT_RESERVED		(1 << 15)
+
+struct perf_event_header {	/* first member of every PERF_RECORD_* layout */
+	uint32_t type;		/* NOTE(review): presumably an enum perf_event_type value */
+	uint16_t misc;		/* NOTE(review): presumably PERF_RECORD_MISC_* bits */
+	uint16_t size;		/* NOTE(review): presumably the record size in bytes -- confirm */
+};
+
+enum perf_event_type {
+
+	/*
+	 * If perf_event_attr.sample_id_all is set then all event types will
+	 * have the sample_type selected fields related to where/when
+	 * (identity) an event took place (TID, TIME, ID, STREAM_ID, CPU,
+	 * IDENTIFIER) described in PERF_RECORD_SAMPLE below, it will be stashed
+	 * just after the perf_event_header and the fields already present for
+	 * the existing fields, i.e. at the end of the payload. That way a newer
+	 * perf.data file will be supported by older perf tools, with these new
+	 * optional fields being ignored.
+	 *
+	 * struct sample_id {
+	 * 	{ u32			pid, tid; } && PERF_SAMPLE_TID
+	 * 	{ u64			time;     } && PERF_SAMPLE_TIME
+	 * 	{ u64			id;       } && PERF_SAMPLE_ID
+	 * 	{ u64			stream_id;} && PERF_SAMPLE_STREAM_ID
+	 * 	{ u32			cpu, res; } && PERF_SAMPLE_CPU
+	 *	{ u64			id;	  } && PERF_SAMPLE_IDENTIFIER
+	 * } && perf_event_attr::sample_id_all
+	 *
+	 * Note that PERF_SAMPLE_IDENTIFIER duplicates PERF_SAMPLE_ID.  The
+	 * advantage of PERF_SAMPLE_IDENTIFIER is that its position is fixed
+	 * relative to header.size.
+	 */
+
+	/*
+	 * The MMAP events record the PROT_EXEC mappings so that we can
+	 * correlate userspace IPs to code. They have the following structure:
+	 *
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *
+	 *	u32				pid, tid;
+	 *	u64				addr;
+	 *	u64				len;
+	 *	u64				pgoff;
+	 *	char				filename[];
+	 * 	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_MMAP			= 1,
+
+	/*
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u64				id;
+	 *	u64				lost;
+	 * 	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_LOST			= 2,
+
+	/*
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *
+	 *	u32				pid, tid;
+	 *	char				comm[];
+	 * 	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_COMM			= 3,
+
+	/*
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u32				pid, ppid;
+	 *	u32				tid, ptid;
+	 *	u64				time;
+	 * 	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_EXIT			= 4,
+
+	/*
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u64				time;
+	 *	u64				id;
+	 *	u64				stream_id;
+	 * 	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_THROTTLE			= 5,
+	PERF_RECORD_UNTHROTTLE			= 6,
+
+	/*
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u32				pid, ppid;
+	 *	u32				tid, ptid;
+	 *	u64				time;
+	 * 	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_FORK			= 7,
+
+	/*
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u32				pid, tid;
+	 *
+	 *	struct read_format		values;
+	 * 	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_READ			= 8,
+
+	/*
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *
+	 *	#
+	 *	# Note that PERF_SAMPLE_IDENTIFIER duplicates PERF_SAMPLE_ID.
+	 *	# The advantage of PERF_SAMPLE_IDENTIFIER is that its position
+	 *	# is fixed relative to header.
+	 *	#
+	 *
+	 *	{ u64			id;	  } && PERF_SAMPLE_IDENTIFIER
+	 *	{ u64			ip;	  } && PERF_SAMPLE_IP
+	 *	{ u32			pid, tid; } && PERF_SAMPLE_TID
+	 *	{ u64			time;     } && PERF_SAMPLE_TIME
+	 *	{ u64			addr;     } && PERF_SAMPLE_ADDR
+	 *	{ u64			id;	  } && PERF_SAMPLE_ID
+	 *	{ u64			stream_id;} && PERF_SAMPLE_STREAM_ID
+	 *	{ u32			cpu, res; } && PERF_SAMPLE_CPU
+	 *	{ u64			period;   } && PERF_SAMPLE_PERIOD
+	 *
+	 *	{ struct read_format	values;	  } && PERF_SAMPLE_READ
+	 *
+	 *	{ u64			nr,
+	 *	  u64			ips[nr];  } && PERF_SAMPLE_CALLCHAIN
+	 *
+	 *	#
+	 *	# The RAW record below is opaque data wrt the ABI
+	 *	#
+	 *	# That is, the ABI doesn't make any promises wrt to
+	 *	# the stability of its content, it may vary depending
+	 *	# on event, hardware, kernel version and phase of
+	 *	# the moon.
+	 *	#
+	 *	# In other words, PERF_SAMPLE_RAW contents are not an ABI.
+	 *	#
+	 *
+	 *	{ u32			size;
+	 *	  char                  data[size];}&& PERF_SAMPLE_RAW
+	 *
+	 *	{ u64                   nr;
+	 *        { u64 from, to, flags } lbr[nr];} && PERF_SAMPLE_BRANCH_STACK
+	 *
+	 * 	{ u64			abi; # enum perf_sample_regs_abi
+	 * 	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER
+	 *
+	 * 	{ u64			size;
+	 * 	  char			data[size];
+	 * 	  u64			dyn_size; } && PERF_SAMPLE_STACK_USER
+	 *
+	 *	{ u64			weight;   } && PERF_SAMPLE_WEIGHT
+	 *	{ u64			data_src; } && PERF_SAMPLE_DATA_SRC
+	 *	{ u64			transaction; } && PERF_SAMPLE_TRANSACTION
+	 *	{ u64			abi; # enum perf_sample_regs_abi
+	 *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
+	 * };
+	 */
+	PERF_RECORD_SAMPLE			= 9,
+
+	/*
+	 * The MMAP2 records are an augmented version of MMAP, they add
+	 * maj, min, ino numbers to be used to uniquely identify each mapping
+	 *
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *
+	 *	u32				pid, tid;
+	 *	u64				addr;
+	 *	u64				len;
+	 *	u64				pgoff;
+	 *	u32				maj;
+	 *	u32				min;
+	 *	u64				ino;
+	 *	u64				ino_generation;
+	 *	u32				prot, flags;
+	 *	char				filename[];
+	 * 	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_MMAP2			= 10,
+
+	/*
+	 * Records that new data landed in the AUX buffer part.
+	 *
+	 * struct {
+	 * 	struct perf_event_header	header;
+	 *
+	 * 	u64				aux_offset;
+	 * 	u64				aux_size;
+	 *	u64				flags;
+	 * 	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_AUX				= 11,
+
+	/*
+	 * Indicates that instruction trace has started
+	 *
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u32				pid;
+	 *	u32				tid;
+	 * };
+	 */
+	PERF_RECORD_ITRACE_START		= 12,
+
+	/*
+	 * Records the dropped/lost sample number.
+	 *
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *
+	 *	u64				lost;
+	 *	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_LOST_SAMPLES		= 13,
+
+	/*
+	 * Records a context switch in or out (flagged by
+	 * PERF_RECORD_MISC_SWITCH_OUT). See also
+	 * PERF_RECORD_SWITCH_CPU_WIDE.
+	 *
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_SWITCH			= 14,
+
+	/*
+	 * CPU-wide version of PERF_RECORD_SWITCH with next_prev_pid and
+	 * next_prev_tid that are the next (switching out) or previous
+	 * (switching in) pid/tid.
+	 *
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u32				next_prev_pid;
+	 *	u32				next_prev_tid;
+	 *	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_SWITCH_CPU_WIDE		= 15,
+
+	PERF_RECORD_MAX,			/* non-ABI */
+};
+
+#define PERF_MAX_STACK_DEPTH		127
+#define PERF_MAX_CONTEXTS_PER_STACK	  8
+
+enum perf_callchain_context {
+	PERF_CONTEXT_HV			= (uint64_t)-32,
+	PERF_CONTEXT_KERNEL		= (uint64_t)-128,
+	PERF_CONTEXT_USER		= (uint64_t)-512,
+
+	PERF_CONTEXT_GUEST		= (uint64_t)-2048,
+	PERF_CONTEXT_GUEST_KERNEL	= (uint64_t)-2176,
+	PERF_CONTEXT_GUEST_USER		= (uint64_t)-2560,
+
+	PERF_CONTEXT_MAX		= (uint64_t)-4095,
+};
+
+/**
+ * PERF_RECORD_AUX::flags bits
+ */
+#define PERF_AUX_FLAG_TRUNCATED		0x01	/* record was truncated to fit */
+#define PERF_AUX_FLAG_OVERWRITE		0x02	/* snapshot from overwrite mode */
+
+#define PERF_FLAG_FD_NO_GROUP		(1UL << 0)
+#define PERF_FLAG_FD_OUTPUT		(1UL << 1)
+#define PERF_FLAG_PID_CGROUP		(1UL << 2) /* pid=cgroup id, per-cpu mode only */
+#define PERF_FLAG_FD_CLOEXEC		(1UL << 3) /* O_CLOEXEC */
+
+union perf_mem_data_src {
+	uint64_t val;
+	struct {
+		uint64_t mem_op:5,	/* type of opcode */
+			 mem_lvl:14,	/* memory hierarchy level */
+			 mem_snoop:5,	/* snoop mode */
+			 mem_lock:2,	/* lock instr */
+			 mem_dtlb:7,	/* tlb access */
+			 mem_rsvd:31;
+	};
+};
+
+/* type of opcode (load/store/prefetch,code) */
+#define PERF_MEM_OP_NA		0x01 /* not available */
+#define PERF_MEM_OP_LOAD	0x02 /* load instruction */
+#define PERF_MEM_OP_STORE	0x04 /* store instruction */
+#define PERF_MEM_OP_PFETCH	0x08 /* prefetch */
+#define PERF_MEM_OP_EXEC	0x10 /* code (execution) */
+#define PERF_MEM_OP_SHIFT	0
+
+/* memory hierarchy (memory level, hit or miss) */
+#define PERF_MEM_LVL_NA		0x01  /* not available */
+#define PERF_MEM_LVL_HIT	0x02  /* hit level */
+#define PERF_MEM_LVL_MISS	0x04  /* miss level  */
+#define PERF_MEM_LVL_L1		0x08  /* L1 */
+#define PERF_MEM_LVL_LFB	0x10  /* Line Fill Buffer */
+#define PERF_MEM_LVL_L2		0x20  /* L2 */
+#define PERF_MEM_LVL_L3		0x40  /* L3 */
+#define PERF_MEM_LVL_LOC_RAM	0x80  /* Local DRAM */
+#define PERF_MEM_LVL_REM_RAM1	0x100 /* Remote DRAM (1 hop) */
+#define PERF_MEM_LVL_REM_RAM2	0x200 /* Remote DRAM (2 hops) */
+#define PERF_MEM_LVL_REM_CCE1	0x400 /* Remote Cache (1 hop) */
+#define PERF_MEM_LVL_REM_CCE2	0x800 /* Remote Cache (2 hops) */
+#define PERF_MEM_LVL_IO		0x1000 /* I/O memory */
+#define PERF_MEM_LVL_UNC	0x2000 /* Uncached memory */
+#define PERF_MEM_LVL_SHIFT	5
+
+/* snoop mode */
+#define PERF_MEM_SNOOP_NA	0x01 /* not available */
+#define PERF_MEM_SNOOP_NONE	0x02 /* no snoop */
+#define PERF_MEM_SNOOP_HIT	0x04 /* snoop hit */
+#define PERF_MEM_SNOOP_MISS	0x08 /* snoop miss */
+#define PERF_MEM_SNOOP_HITM	0x10 /* snoop hit modified */
+#define PERF_MEM_SNOOP_SHIFT	19
+
+/* locked instruction */
+#define PERF_MEM_LOCK_NA	0x01 /* not available */
+#define PERF_MEM_LOCK_LOCKED	0x02 /* locked transaction */
+#define PERF_MEM_LOCK_SHIFT	24
+
+/* TLB access */
+#define PERF_MEM_TLB_NA		0x01 /* not available */
+#define PERF_MEM_TLB_HIT	0x02 /* hit level */
+#define PERF_MEM_TLB_MISS	0x04 /* miss level */
+#define PERF_MEM_TLB_L1		0x08 /* L1 */
+#define PERF_MEM_TLB_L2		0x10 /* L2 */
+#define PERF_MEM_TLB_WK		0x20 /* Hardware Walker*/
+#define PERF_MEM_TLB_OS		0x40 /* OS fault handler */
+#define PERF_MEM_TLB_SHIFT	26
+
+#define PERF_MEM_S(a, s) \
+	(((uint64_t)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT)
+
+/*
+ * single taken branch record layout:
+ *
+ *      from: source instruction (may not always be a branch insn)
+ *        to: branch target
+ *   mispred: branch target was mispredicted
+ * predicted: branch target was predicted
+ *
+ * support for mispred, predicted is optional. In case it
+ * is not supported mispred = predicted = 0.
+ *
+ *     in_tx: running in a hardware transaction
+ *     abort: aborting a hardware transaction
+ *    cycles: cycles from last branch (or 0 if not supported)
+ */
+struct perf_branch_entry {
+	uint64_t from;
+	uint64_t to;
+	uint64_t mispred:1,  /* target mispredicted */
+		 predicted:1,/* target predicted */
+		 in_tx:1,    /* in transaction */
+		 abort:1,    /* transaction abort */
+		 cycles:16,  /* cycle count to last branch */
+		 reserved:44;
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _UAPI_LINUX_PERF_EVENT_H */
diff --git a/llvm/include/pmu/pmu-events.h b/llvm/include/pmu/pmu-events.h
new file mode 100644
index 0000000..2c31ae9
--- /dev/null
+++ b/llvm/include/pmu/pmu-events.h
@@ -0,0 +1,131 @@
+/*
+ *  (C) 2018 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef __PMU_EVENTS_H
+#define __PMU_EVENTS_H
+
+#include <list>
+#include <vector>
+#include <signal.h>
+#include "pmu-global.h"
+#include "pmu.h"
+
+namespace pmu {
+
+#define PMU_MAX_EVENTS  (1024)
+
+class Timer;
+
+/* Mode of the event. */
+enum {
+    MODE_NONE                = 0,
+    MODE_COUNTER             = ((uint32_t)1U << 1),
+    MODE_SAMPLE              = ((uint32_t)1U << 2),
+    MODE_SAMPLE_IP           = ((uint32_t)1U << 3),
+    MODE_SAMPLE_READ         = ((uint32_t)1U << 4),
+};
+
+/* State of the event. */
+enum {
+    STATE_STOP       = 0,
+    STATE_START      = ((uint32_t)1U << 1),
+    STATE_GOTO_STOP  = ((uint32_t)1U << 2),
+    STATE_GOTO_START = ((uint32_t)1U << 3),
+};
+
+/* Sampling mmap buffer information. */
+struct MMap {
+    void *Base;
+    uint64_t Size;
+    uint64_t Prev;
+};
+
+/* A PMU event; construction leaves every field zeroed, null or empty. */
+struct PMUEvent {
+    PMUEvent() : Hndl(0), Mode(MODE_NONE), State(STATE_STOP), Data(), Aux(),
+                 Watermark(0), OverflowHandler(nullptr), Opaque(nullptr) {}
+    Handle Hndl;          /* Unique handle value */
+    int Mode;             /* Event mode */
+    int State;            /* Current event state */
+    std::vector<int> FD;  /* Opened fd(s) of this event */
+    MMap Data;            /* mmap data info */
+    MMap Aux;             /* mmap aux info */
+    uint64_t Watermark;   /* The bytes before wakeup */
+    /* Overflow handling function pointer (untyped and typed views) */
+    union {
+        void *OverflowHandler;
+        SampleHandlerTy SampleHandler;
+    };
+    void *Opaque;         /* Opaque pointer passed to the overflow handler. */
+
+    int getFD() { return FD[0]; }   /* Group leader fd; FD must be non-empty */
+};
+
+/*
+ * Event Manager.
+ */
+class EventManager {
+    typedef std::list<PMUEvent *> EventList;
+
+    PMUEvent Events[PMU_MAX_EVENTS]; /* Pre-allocated events */
+    EventList FreeEvents;            /* Free events */
+    EventList SampleEvents;          /* Sampling events */
+    Timer *EventTimer;               /* Timer for sampling events. */
+    std::vector<PMUEvent *> ChangedEvents;
+
+public:
+    EventManager();
+    ~EventManager();
+
+    /* Return the event of the input handle. */
+    PMUEvent *GetEvent(Handle Hndl);
+
+    /* Add a counting event and return its handle. */
+    Handle AddEvent(int fd);
+
+    /* Add a sampling event and return its handle. */
+    Handle AddSampleEvent(unsigned NumFDs, int *FD, uint64_t DataSize, void *Data,
+                          uint32_t Mode, SampleConfig &Config);
+
+    /* Notify that an event is started. */
+    void StartEvent(PMUEvent *Event, bool ShouldLock = true);
+
+    /* Notify that an event is stopped. */
+    void StopEvent(PMUEvent *Event, bool ShouldLock = true);
+
+    /* Notify that an event is deleted. */
+    void DeleteEvent(PMUEvent *Event);
+
+    /* Stop the event manager. */
+    void Pause();
+
+    /* Restart the event manager. */
+    void Resume();
+
+    friend void DefaultHandler(int signum, siginfo_t *info, void *data);
+};
+
+/* Interval timer. */
+class Timer {
+    timer_t T;
+
+public:
+    Timer(int Signum, int TID);
+    ~Timer();
+
+    /* Start a timer that expires just once.  */
+    void Start();
+
+    /* Stop a timer.*/
+    void Stop();
+};
+
+} /* namespace pmu */
+
+#endif /* __PMU_EVENTS_H */
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/include/pmu/pmu-global.h b/llvm/include/pmu/pmu-global.h
new file mode 100644
index 0000000..ed059a4
--- /dev/null
+++ b/llvm/include/pmu/pmu-global.h
@@ -0,0 +1,52 @@
+/*
+ *  (C) 2018 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef __PMU_GLOBAL_H
+#define __PMU_GLOBAL_H
+
+#if defined(__i386__) || defined(__x86_64__)
+#include "pmu/x86/x86-events.h"
+#elif defined(__arm__) || defined(__aarch64__)
+#include "pmu/arm/arm-events.h"
+#elif defined(_ARCH_PPC) || defined(_ARCH_PPC64)
+#include "pmu/ppc/ppc-events.h"
+#endif
+
+#include "pmu/pmu-utils.h"
+#include "pmu/pmu.h"
+
+namespace pmu {
+
+#define PMU_SIGNAL_NUM     SIGIO
+#define PMU_SAMPLE_PERIOD  1e6
+#define PMU_SAMPLE_PAGES   4
+
+class EventManager;
+
+/* Pre-defined event identity. */
+struct EventID {
+    int Type;     /* Perf major type: hardware/software/etc */
+    int Config;   /* Perf type specific configuration information */
+};
+
+/* System-wide configuration. */
+struct GlobalConfig {
+    int PageSize;       /* Host page size */
+    int SignalReceiver; /* TID of the signal receiver */
+    uint32_t Timeout;   /* Timer period in nanosecond */
+    int PerfVersion;    /* Perf version used in this PMU tool */
+    int OSPerfVersion;  /* Perf version used in the OS kernel */
+};
+
+extern EventManager *EventMgr;
+extern GlobalConfig SysConfig;
+
+} /* namespace pmu */
+
+#endif /* __PMU_GLOBAL_H */
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/include/pmu/pmu-utils.h b/llvm/include/pmu/pmu-utils.h
new file mode 100644
index 0000000..5e3e014
--- /dev/null
+++ b/llvm/include/pmu/pmu-utils.h
@@ -0,0 +1,106 @@
+/*
+ *  (C) 2018 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef __PMU_UTILS_H
+#define __PMU_UTILS_H
+
+#include <unistd.h>
+#include <string.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+#include "perf_event.h"
+
+#ifndef ACCESS_ONCE
+#define ACCESS_ONCE(x) (*(volatile decltype(x) *)&(x))
+#endif
+
+namespace pmu {
+
+static inline int sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
+                                      int cpu, int group_fd,
+                                      unsigned long flags) {
+    return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
+}
+
+static inline void perf_attr_init(struct perf_event_attr *attr, int type,
+                                  int config) {
+    memset(attr, 0, sizeof(struct perf_event_attr));
+    attr->type = type;
+    attr->config = config;
+    attr->size = sizeof(struct perf_event_attr);
+    attr->disabled = 1;
+    attr->exclude_kernel = 1;
+    attr->exclude_guest = 1;
+    attr->exclude_hv = 1;
+}
+
+static inline int perf_event_start(int fd) {
+    return ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
+}
+
+static inline int perf_event_stop(int fd) {
+    return ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
+}
+
+static inline int perf_event_reset(int fd) {
+    return ioctl(fd, PERF_EVENT_IOC_RESET, 0);
+}
+
+static inline int perf_event_set_filter(int fd, const char *arg) {
+    return ioctl(fd, PERF_EVENT_IOC_SET_FILTER, (void *)arg);
+}
+
+static inline uint64_t perf_read_data_head(void *header) {
+    struct perf_event_mmap_page *pc = (struct perf_event_mmap_page *)header;
+    uint64_t head = ACCESS_ONCE(pc->data_head);
+    pmu_rmb();
+    return head;
+}
+
+static inline void perf_write_data_tail(void *header, uint64_t val) {
+    struct perf_event_mmap_page *pc = (struct perf_event_mmap_page *)header;
+    pmu_mb();
+    pc->data_tail = val;
+}
+
+static inline uint64_t perf_read_aux_head(void *header) {
+    struct perf_event_mmap_page *pc = (struct perf_event_mmap_page *)header;
+    uint64_t head = ACCESS_ONCE(pc->aux_head);
+    pmu_rmb();
+    return head;
+}
+
+static inline void perf_write_aux_tail(void *header, uint64_t val) {
+    struct perf_event_mmap_page *pc = (struct perf_event_mmap_page *)header;
+    pmu_mb();
+    pc->aux_tail = val;
+}
+
+static inline int isPowerOf2(uint64_t value) {
+    /* value & (value - 1) clears the lowest set bit, so the result is zero
+     * only when at most one bit was set; zero itself is excluded up front. */
+    return value != 0 && (value & (value - 1)) == 0;
+}
+
+/* Convert system errno to PMU error code. */
+static inline int ErrorCode(int err)
+{
+    switch (err) {
+        case EPERM:
+        case EACCES: return PMU_EPERM;
+        case ENOMEM: return PMU_ENOMEM;
+        default:     return PMU_EEVENT;
+    }
+}
+
+} /* namespace pmu */
+
+#endif /* __PMU_UTILS_H */
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/include/pmu/pmu.h b/llvm/include/pmu/pmu.h
new file mode 100644
index 0000000..89a7c98
--- /dev/null
+++ b/llvm/include/pmu/pmu.h
@@ -0,0 +1,170 @@
+/*
+ *  (C) 2018 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *  Hardware Performance Monitoring Unit (PMU), C++ interfaces.
+ */
+
+#ifndef __PMU_H
+#define __PMU_H
+
+#include <vector>
+#include <memory>
+#include <stdint.h>
+
+namespace pmu {
+
+#define PMU_GROUP_EVENTS   (8)
+#define PMU_TIMER_PERIOD   (400)     /* micro-second */
+#define PMU_INVALID_HNDL   ((Handle)-1)
+
+typedef unsigned Handle;
+/* Sampling event overflow handling. */
+typedef std::vector<uint64_t> SampleList;
+typedef std::unique_ptr<SampleList> SampleDataPtr;
+typedef void (*SampleHandlerTy)(Handle Hndl, SampleDataPtr Data, void *Opaque);
+
+/* Error code. */
+enum {
+    PMU_OK       = 0,   /* No error */
+    PMU_EINVAL   = -1,  /* Invalid argument */
+    PMU_ENOMEM   = -2,  /* Insufficient memory */
+    PMU_ENOEVENT = -3,  /* Pre-defined event not available */
+    PMU_EEVENT   = -4,  /* Hardware event error */
+    PMU_EPERM    = -5,  /* Permission denied */
+    PMU_EINTER   = -6,  /* Internal error */
+    PMU_EDECODER = -7,  /* Instruction trace decoder error */
+};
+
+/* Pre-defined event code. */
+enum {
+    /* Basic events */
+    PMU_CPU_CYCLES = 0,
+    PMU_REF_CPU_CYCLES,
+    PMU_INSTRUCTIONS,
+    PMU_LLC_REFERENCES,
+    PMU_LLC_MISSES,
+    PMU_BRANCH_INSTRUCTIONS,
+    PMU_BRANCH_MISSES,
+    /* Instruction cache events */
+    PMU_ICACHE_HITS,
+    PMU_ICACHE_MISSES,
+    /* Memory instruction events */
+    PMU_MEM_LOADS,
+    PMU_MEM_STORES,
+
+    PMU_EVENT_MAX,
+};
+
+/* PMU initial configuration. */
+struct PMUConfig {
+    /* Input */
+    int SignalReceiver; /* TID of the signal receiver. 0 for auto-select. */
+    uint32_t Timeout;   /* Timer period in micro-second. 0 for auto-select.  */
+
+    /* Output */
+    int PerfVersion;   /* Perf version used in this PMU tool */
+    int OSPerfVersion; /* Perf version used in the OS kernel */
+};
+
+/* Config for sampling with one or multiple event(s).*/
+struct SampleConfig {
+    unsigned NumEvents; /* Number of events in the event group */
+    unsigned EventCode[PMU_GROUP_EVENTS]; /* Event group. The 1st event is the leader. */
+    unsigned NumPages;  /* Number of pages as the sample buffer size. (must be 2^n) */
+    uint64_t Period;    /* Sampling period of the group leader. */
+    uint64_t Watermark; /* Bytes before wakeup. 0 for every timer period. */
+    SampleHandlerTy SampleHandler; /* User handler routine */
+    void *Opaque;       /* An opaque pointer passed to the overflow handler. */
+};
+
+/* Config for sampling with only one event. */
+struct Sample1Config {
+    unsigned EventCode; /* Pre-defined event to trigger counter overflow */
+    unsigned NumPages;  /* Number of pages as the sample buffer size. (must be 2^n) */
+    uint64_t Period;    /* Sampling period */
+    uint64_t Watermark; /* Bytes before wakeup. 0 for every timer period. */
+    SampleHandlerTy SampleHandler; /* User handler routine */
+    void *Opaque;       /* An opaque pointer passed to the overflow handler. */
+};
+
+/*
+ * PMU main tools.
+ */
+class PMU {
+    PMU()  = delete;
+    ~PMU() = delete;
+
+public:
+    /* Initialize the PMU module. */
+    static int Init(PMUConfig &Config);
+
+    /* Finalize the PMU module. */
+    static int Finalize(void);
+
+    /* Stop the PMU module. When the PMU module is paused, the user can continue
+     * to use counting events, but the overflow handler will not be invoked. */
+    static int Pause(void);
+
+    /* Restart the PMU module. After the PMU module is resumed, the overflow
+     * handler will be invoked. */
+    static int Resume(void);
+
+    /* Start a counting/sampling/tracing event. */
+    static int Start(Handle Hndl);
+
+    /* Stop a counting/sampling/tracing event. */
+    static int Stop(Handle Hndl);
+
+    /* Reset the hardware counter. */
+    static int Reset(Handle Hndl);
+
+    /* Remove an event. */
+    static int Cleanup(Handle Hndl);
+
+    /* Start/stop a sampling/tracing event without acquiring a lock.
+     * Note that these two functions should only be used within the overflow
+     * handler. Since the overflow handling is already in a locked section,
+     * acquiring a lock is not required. */
+    static int StartUnlocked(Handle Hndl);
+    static int StopUnlocked(Handle Hndl);
+
+    /* Open an event using the pre-defined event code. */
+    static int CreateEvent(unsigned EventCode, Handle &Hndl);
+
+    /* Open an event using the raw event number and umask value.
+     * The raw event code is computed as (RawEvent | (Umask << 8)). */
+    static int CreateRawEvent(unsigned RawEvent, unsigned Umask, Handle &Hndl);
+
+    /* Open a sampling event, with the 1st EventCode as the interrupt event.
+     * The sample data will be recorded in a vector of type 'uint64_t'.
+     * The following vector shows the data format of sampling with N events:
+     *     { pc, val1, val2, ..., valN,      # 1st sample
+     *       ...
+     *       pc, val1, val2, ..., valN };    # nth sample
+     *
+     * Note that ownership of the output vector is transferred to the user.
+     * It is the user's responsibility to free the resource of the vector. */
+    static int CreateSampleEvent(SampleConfig &Config, Handle &Hndl);
+
+    /* Generate an IP histogram, using EventCode as the interrupt event.
+     * The IP histogram will be recorded in a vector of type 'uint64_t' with
+     * the format: { pc1, pc2, pc3, ..., pcN }.
+     * Note that ownership of the output vector is transferred to the user.
+     * It is the user's responsibility to free the resource of the vector. */
+    static int CreateSampleIP(Sample1Config &Config, Handle &Hndl);
+
+    /* Read value from the hardware counter. */
+    static int ReadEvent(Handle Hndl, uint64_t &Value);
+
+    /* Convert error code to string. */
+    static const char *strerror(int ErrCode);
+};
+
+} /* namespace pmu */
+
+#endif /* __PMU_H */
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/include/pmu/ppc/ppc-events.h b/llvm/include/pmu/ppc/ppc-events.h
new file mode 100644
index 0000000..f48e10d
--- /dev/null
+++ b/llvm/include/pmu/ppc/ppc-events.h
@@ -0,0 +1,30 @@
+/*
+ *  (C) 2018 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef __PPC_EVENTS_H
+#define __PPC_EVENTS_H
+
+#include <vector>
+#include "pmu/pmu.h"
+
+namespace pmu {
+
+class PMUEvent;
+
+#if defined(_ARCH_PPC) || defined(_ARCH_PPC64)
+#define pmu_mb()     __asm__ __volatile__ ("sync" : : : "memory")
+#define pmu_rmb()    __asm__ __volatile__ ("sync" : : : "memory")
+#define pmu_wmb()    __asm__ __volatile__ ("sync" : : : "memory")
+#endif 
+
+int PPCInit(void);
+
+} /* namespace pmu */
+
+#endif /* __PPC_EVENTS_H */
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/include/pmu/x86/x86-events.h b/llvm/include/pmu/x86/x86-events.h
new file mode 100644
index 0000000..c6fdb95
--- /dev/null
+++ b/llvm/include/pmu/x86/x86-events.h
@@ -0,0 +1,38 @@
+/*
+ *  (C) 2018 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef __X86_EVENTS_H
+#define __X86_EVENTS_H
+
+#include <vector>
+#include "pmu/pmu.h"
+
+namespace pmu {
+
+class PMUEvent;
+
+#if defined(__i386__)
+/*
+ * Some non-Intel clones support out of order store. wmb() ceases to be a
+ * nop for these.
+ */
+#define pmu_mb()    asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
+#define pmu_rmb()   asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
+#define pmu_wmb()   asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
+#elif defined(__x86_64__)
+#define pmu_mb()    asm volatile("mfence" ::: "memory")
+#define pmu_rmb()   asm volatile("lfence" ::: "memory")
+#define pmu_wmb()   asm volatile("sfence" ::: "memory")
+#endif 
+
+int X86Init(void);
+
+} /* namespace pmu */
+
+#endif /* __X86_EVENTS_H */
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/include/qemu-types.h b/llvm/include/qemu-types.h
new file mode 100644
index 0000000..f2430e0
--- /dev/null
+++ b/llvm/include/qemu-types.h
@@ -0,0 +1,33 @@
+/*
+ *  (C) 2016 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef __QEMU_TYPES_H
+#define __QEMU_TYPES_H
+
+extern "C" {
+#include "cpu.h"
+#include "exec/tb-hash.h"
+#include "exec/exec-all.h"
+#include "exec/helper-proto.h"
+#include "exec/cpu_ldst.h"
+#include "tcg/tcg.h"
+#include "qemu/atomic.h"
+#include "hqemu.h"
+
+extern uint8_t *tb_ret_addr;
+extern uint8_t *ibtc_ret_addr;
+
+}
+
+#ifdef inline
+#undef inline
+#endif
+
+#endif
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
+
diff --git a/llvm/include/tcg-opc-vector.h b/llvm/include/tcg-opc-vector.h
new file mode 100644
index 0000000..bc03ea1
--- /dev/null
+++ b/llvm/include/tcg-opc-vector.h
@@ -0,0 +1,80 @@
+DEF(vector_start, 0, 0, 0, 0)
+
+DEF(vmov_128, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vload_128, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vstore_128, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+
+DEF(vsitofp_128, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vuitofp_128, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vfptosi_128, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vfptoui_128, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+
+DEF(vadd_i8_128, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vadd_i16_128, 0, 0, 0,  TCG_OPF_SIDE_EFFECTS)
+DEF(vadd_i32_128, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vadd_i64_128, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vadd_i8_64, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vadd_i16_64, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vadd_i32_64, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+
+DEF(vsub_i8_128, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vsub_i16_128, 0, 0, 0,  TCG_OPF_SIDE_EFFECTS)
+DEF(vsub_i32_128, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vsub_i64_128, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vsub_i8_64, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vsub_i16_64, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vsub_i32_64, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+
+DEF(vadd_f32_128, 0, 0, 0, 0)
+DEF(vadd_f64_128, 0, 0, 0, 0)
+DEF(vadd_f32_64, 0, 0, 0, 0)
+DEF(vpadd_f32_128, 0, 0, 0, 0)
+DEF(vpadd_f64_128, 0, 0, 0, 0)
+DEF(vpadd_f32_64, 0, 0, 0, 0)
+DEF(vsub_f32_128, 0, 0, 0, 0)
+DEF(vsub_f64_128, 0, 0, 0,0)
+DEF(vsub_f32_64, 0, 0, 0, 0)
+DEF(vabd_f32_128, 0, 0, 0 ,0)
+DEF(vabd_f64_128, 0, 0, 0 ,0)
+DEF(vabd_f32_64, 0, 0, 0, 0)
+
+DEF(vfma_f32_128, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vfma_f64_128, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vfma_f32_64, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vfms_f32_128, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vfms_f64_128, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vfms_f32_64, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+
+DEF(vmul_f32_128, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vmul_f64_128, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vmul_f32_64, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vmla_f32_128, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vmla_f64_128, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vmla_f32_64, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vmls_f32_128, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vmls_f64_128, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vmls_f32_64, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+
+DEF(vdiv_f32_128, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vdiv_f64_128, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vdiv_f32_64, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+
+DEF(vand_128, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vand_64, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vbic_128, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vbic_64, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vorr_128, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vorr_64, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vorn_128, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vorn_64, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(veor_128, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(veor_64, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+
+DEF(vbif_128, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vbif_64, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vbit_128, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vbit_64, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vbsl_128, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+DEF(vbsl_64, 0, 0, 0, TCG_OPF_SIDE_EFFECTS)
+
+DEF(vector_end, 0, 0, 0, 0)
diff --git a/llvm/include/tracer.h b/llvm/include/tracer.h
new file mode 100644
index 0000000..2813e0e
--- /dev/null
+++ b/llvm/include/tracer.h
@@ -0,0 +1,109 @@
+/*
+ *  (C) 2016 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef __TRACE_H
+#define __TRACE_H
+
+#include <vector>
+#include <iostream>
+#include "qemu-types.h"
+#include "optimization.h"
+#include "utils.h"
+
+
+/* 
+ * Base processor tracer
+ */
+class BaseTracer {
+public:
+    CPUArchState *Env;
+    void *Perf;
+
+    BaseTracer(CPUArchState *env) : Env(env), Perf(nullptr) {}
+    virtual ~BaseTracer() {}
+    virtual void Reset() {}
+    virtual void Record(uintptr_t next_tb, TranslationBlock *tb) {}
+
+    /* Create and return the tracer object based on LLVM_MODE. */
+    static BaseTracer *CreateTracer(CPUArchState *env);
+
+    /* Release the trace resources. */
+    static void DeleteTracer(CPUArchState *env);
+};
+
+
+/*
+ * Trace of a single basic block
+ */
+class SingleBlockTracer : public BaseTracer {
+    TranslationBlock *TB;
+
+public:
+    SingleBlockTracer(CPUArchState *env);
+
+    void Record(uintptr_t next_tb, TranslationBlock *tb) override;
+};
+
+
+/*
+ * Trace with NET trace formation algorithm
+ */
+#define NET_PROFILE_THRESHOLD 50
+#if defined(CONFIG_SOFTMMU)
+#  define NET_PREDICT_THRESHOLD 16
+#else
+#  define NET_PREDICT_THRESHOLD 64
+#endif
+class NETTracer : public BaseTracer {
+    bool isTraceHead(uintptr_t next_tb, TranslationBlock *tb, bool NewTB);
+
+public:
+    typedef std::vector<TranslationBlock *> TBVec;
+    TBVec TBs;
+
+    NETTracer(CPUArchState *env, int Mode);
+    ~NETTracer();
+
+    void Reset() override;
+    void Record(uintptr_t next_tb, TranslationBlock *tb) override;
+    inline void Profile(TranslationBlock *tb);
+    inline void Predict(TranslationBlock *tb);
+};
+
+/* Return the address of the patch point to the trace code. */
+static inline uintptr_t tb_get_jmp_entry(TranslationBlock *tb) {
+    return (uintptr_t)tb->tc_ptr + tb->patch_jmp;
+}
+/* Return the initial jump target address of the patch point. */
+static inline uintptr_t tb_get_jmp_next(TranslationBlock *tb) {
+    return (uintptr_t)tb->tc_ptr + tb->patch_next;
+}
+static inline SingleBlockTracer &getSingleBlockTracer(CPUArchState *env) {
+    return *static_cast<SingleBlockTracer *>(cpu_get_tracer(env));
+}
+static inline NETTracer &getNETTracer(CPUArchState *env) {
+    return *static_cast<NETTracer *>(cpu_get_tracer(env));
+}
+
+static inline void delete_image(TranslationBlock *tb)
+{   /* Release tb->image and clear the pointer; compiled in only for LLVM + softmmu builds. */
+#if defined(CONFIG_LLVM) && defined(CONFIG_SOFTMMU)
+    delete (char *)tb->image;  /* NOTE(review): scalar delete; must be delete[] if image comes from new[] -- confirm */
+    tb->image = nullptr;       /* a repeated call then deletes a null pointer, which is a no-op */
+#endif
+}
+
+static inline bool update_tb_mode(TranslationBlock *tb, int from, int to) {
+    /* Plain read first: when the block is visibly not in mode 'from' the
+     * Atomic<int>::testandset call is skipped and false is returned. */
+    return tb->mode == from && Atomic<int>::testandset(&tb->mode, from, to);
+}
+
+#endif
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
+
diff --git a/llvm/include/utils.h b/llvm/include/utils.h
new file mode 100644
index 0000000..90b36d9
--- /dev/null
+++ b/llvm/include/utils.h
@@ -0,0 +1,260 @@
+/*
+ *  (C) 2016 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef __UTILS_H
+#define __UTILS_H
+
+#include <cstdint>
+#include <cstdlib>
+#include <sstream>
+#include <iomanip>
+#include <set>
+#include <map>
+#include <vector>
+#include "qemu-types.h"
+
+
+#ifndef timersub   /* fallback, used only when no timersub macro exists: *result = *a - *b, borrowing a second when tv_usec goes negative */
+# define timersub(a, b, result)                                               \
+  do {                                                                        \
+    (result)->tv_sec = (a)->tv_sec - (b)->tv_sec;                             \
+    (result)->tv_usec = (a)->tv_usec - (b)->tv_usec;                          \
+    if ((result)->tv_usec < 0) {                                              \
+      --(result)->tv_sec;                                                     \
+      (result)->tv_usec += 1000000;                                           \
+    }                                                                         \
+  } while (0)
+#endif  /* timersub */
+
+#if !defined(__i386__) && !defined(__x86_64__)   /* hosts other than x86 / x86-64 use pthread mutexes */
+#define USE_PTHREAD_MUTEX
+#endif
+/* hqemu_lock_*: a pthread mutex, or on x86 a spinlock over a volatile int (0 = free, 1 = held). */
+#if defined(USE_PTHREAD_MUTEX)
+#  define hqemu_lock_t           pthread_mutex_t
+#  define hqemu_lock_init(lock)  pthread_mutex_init(lock, nullptr)
+#  define hqemu_lock(lock)       pthread_mutex_lock(lock)
+#  define hqemu_unlock(lock)     pthread_mutex_unlock(lock)
+#else
+#  define hqemu_lock_t           volatile int
+#  define hqemu_lock_init(lock)  do { *(lock) = 0; } while(0)
+#  define hqemu_lock(lock)    \
+      do {                    \
+          while (!Atomic<int>::testandset((lock), 0, 1)) { \
+              while(*(lock)) _mm_pause();                \
+          }                                              \
+      } while(0)
+#  define hqemu_unlock(lock)  \
+      do {                    \
+          barrier();          \
+          *(lock) = 0;        \
+      } while(0)
+#endif  /* USE_PTHREAD_MUTEX */
+
+
+/*
+ * Atomic Utilities: thin wrappers over the compiler's __sync builtins.
+ */
+template<class T>
+class Atomic {
+public:
+    static T inc_return(volatile T *p) {    /* atomically add 1 to *p and return the new value */
+        return __sync_add_and_fetch(p, 1);
+    }
+    static bool testandset(volatile T *p, T _old, T _new) {   /* compare-and-swap: true when *p was _old and is now _new */
+        return __sync_val_compare_and_swap(p, _old, _new) == _old;
+    }
+};
+
+
+/* Mutex wraps hqemu_lock_t; MutexGuard acquires in its constructor and releases in its destructor. */
+namespace hqemu {
+class Mutex {
+    hqemu_lock_t M;
+public:
+    Mutex() { hqemu_lock_init(&M); }
+    inline void acquire() { hqemu_lock(&M);   }
+    inline void release() { hqemu_unlock(&M); }
+};
+
+class MutexGuard {
+    Mutex &M;
+public:
+    explicit MutexGuard(Mutex &M) : M(M) { M.acquire(); }
+    /* A copy would release the same Mutex a second time, so copying is disabled. */
+    MutexGuard(const MutexGuard &) = delete;
+    ~MutexGuard() { M.release(); }
+};
+};
+
+
+/*
+ * GraphNode describes one node in a CFG: a TranslationBlock plus its child nodes.
+ */
+class GraphNode;
+typedef std::vector<GraphNode *> NodeVec;
+typedef std::set<GraphNode *> NodeSet;
+
+class GraphNode {
+    TranslationBlock *TB;   /* the block this node stands for */
+    NodeVec Children;       /* child nodes, in insertion order */
+
+public:
+    GraphNode(TranslationBlock *tb) : TB(tb) {}
+
+    TranslationBlock *getTB()   { return TB;        }
+    target_ulong getGuestPC()   { return TB->pc;    }   /* reads TB->pc; TB must be non-null */
+    NodeVec &getChildren()      { return Children;  }
+    void insertChild(GraphNode *Node) {
+        Children.push_back(Node);   /* appended as-is; duplicates are not filtered */
+    }
+
+    static void DeleteCFG(GraphNode *Root);     /* defined outside this header */
+};
+
+/*
+ * ControlFlowGraph is used to build the whole program control flow graph (CFG).
+ * GlobalCFG uses this structure to maintain a whole program CFG connected by
+ * direct branches.
+ */
+class ControlFlowGraph {
+    hqemu::Mutex lock;      /* taken by reset() and insertLink(); exposed through getLock() */
+
+public:
+    typedef std::vector<TranslationBlock *> TBVec;
+    typedef std::map<TranslationBlock*, TBVec> SuccMap;
+    SuccMap SuccCFG;        /* successor lists, keyed by source block */
+
+    ControlFlowGraph() {}
+
+    hqemu::Mutex &getLock() { return lock; }
+    TBVec &getSuccessor(TranslationBlock *tb) {     /* does not take the lock itself */
+        return SuccCFG[tb];     /* map operator[]: creates an empty list for an unknown tb */
+    }
+
+    void reset() {
+        hqemu::MutexGuard locked(lock);
+        SuccCFG.clear();
+    }
+    void insertLink(TranslationBlock *src, TranslationBlock *dst) {
+        hqemu::MutexGuard locked(lock);
+        SuccCFG[src].push_back(dst);    /* appended as-is; duplicates are not filtered */
+    }
+};
+
+
+/*
+ * Queue of void* items: a lock-free variant on x86-64, a pthread-mutex variant elsewhere.
+ */
+#if defined(__x86_64__)
+#define LOCK_FREE
+#endif
+
+#ifdef LOCK_FREE
+struct pointer_t {
+    struct node_t *ptr;
+    unsigned long int count;    /* presumably a modification counter updated together with ptr; TODO confirm in enqueue/dequeue */
+};
+
+struct node_t {
+    struct pointer_t next;
+    void *value;
+};
+
+/* Lock-free MS-queue */
+class Queue {
+    struct queue_t {
+        struct pointer_t head;
+        struct pointer_t tail;
+    };
+
+    node_t *new_node(void *value) {
+        node_t *node = new node_t;
+        node->next.ptr = nullptr;   /* next.count is not set here */
+        node->value = value;
+        return node;
+    }
+    void delete_node(node_t *node) {
+        delete node;
+    }
+
+    queue_t Q;
+
+public:
+    Queue();                    /* constructor and both operations are defined outside this header */
+    void enqueue(void *data);
+    void *dequeue();
+};
+#else
+class Queue {
+    struct node_t  {
+        struct node_t *next;
+        void *value;
+        node_t(void *v) : next(nullptr), value(v) {}
+    };
+    struct queue_t  {
+        struct node_t *head;
+        struct node_t *tail;
+    };
+
+    pthread_mutex_t lock;       /* presumably guards Q in enqueue()/dequeue(); TODO confirm */
+    queue_t Q;
+
+public:
+    Queue();                    /* constructor and both operations are defined outside this header */
+    void enqueue(void *data);
+    void *dequeue();
+};
+#endif
+
+
+class UUID {
+    static uint64_t uuid;   /* shared counter; defined outside this header */
+
+public:
+#if defined(__x86_64__)
+    static uint64_t gen() {     /* atomic: lock xadd adds i to uuid and leaves the previous counter value in i */
+        uint64_t i = 1;
+        asm volatile("lock; xaddq %0, %1"
+            : "+r" (i), "+m" (uuid) :: "memory");
+        return i + 1;           /* i.e. the counter value after the increment */
+    }
+#else
+    static uint64_t gen() {     /* mutex-protected increment */
+        static pthread_mutex_t uuid_lock = PTHREAD_MUTEX_INITIALIZER;
+        pthread_mutex_lock(&uuid_lock);
+        uint64_t id = uuid++;   /* NOTE(review): yields the value BEFORE the increment, unlike the x86-64 path -- confirm intended */
+        pthread_mutex_unlock(&uuid_lock);
+        return id;
+    }
+#endif
+};
+
+/* Format Num in hexadecimal with a leading "0x". */
+template <class T>
+static inline std::string toHexString(T Num) {
+    std::ostringstream Out;
+    Out << "0x" << std::hex << Num;
+    return Out.str();
+}
+
+/* Format Num left-padded with '0' to a minimum width of 16 characters. */
+template <class T>
+static inline std::string toZextStr(T Num) {
+    std::ostringstream Out;
+    Out << std::setw(16) << std::setfill('0') << Num;
+    return Out.str();
+}
+
+/* Misc utilities (declarations only; the definitions are outside this header). */
+pid_t gettid();     /* used in this patch to obtain the calling thread's id */
+void patch_jmp(volatile uintptr_t patch_addr, volatile uintptr_t addr);    /* overload taking the target as an integer */
+void patch_jmp(volatile uintptr_t patch_addr, volatile void *addr);        /* overload taking the target as a pointer */
+
+#endif
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
+
diff --git a/llvm/llvm-annotate.cpp b/llvm/llvm-annotate.cpp
new file mode 100644
index 0000000..040c771
--- /dev/null
+++ b/llvm/llvm-annotate.cpp
@@ -0,0 +1,136 @@
+/*
+ *  (C) 2015 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include "xml/tinyxml2.h"
+#include "optimization.h"
+#include "llvm-debug.h"
+#include "llvm-annotate.h"
+
+
+using namespace tinyxml2;
+static hqemu::Mutex Lock;   /* taken by getLoopAnnotation() and hasLoopAnnotation() */
+
+#if defined(CONFIG_USER_ONLY)
+extern "C" const char *filename;    /* defined elsewhere; the constructor appends ".xml" to it */
+#endif
+
+/* When CONFIG_USER_ONLY is defined, try to load loop annotations from the
+ * file named by `filename` with ".xml" appended. */
+AnnotationFactory::AnnotationFactory()
+{
+#if defined(CONFIG_USER_ONLY)
+    MetaFile = std::string(filename) + ".xml";
+    /* A non-zero result only means that no annotations were loaded; it is ignored. */
+    (void)ParseXML(MetaFile.c_str());
+#endif
+}
+
+AnnotationFactory::~AnnotationFactory()
+{
+    for (auto I = Loops.begin(), E = Loops.end(); I != E; ++I)
+        delete I->second;   /* every LoopMetadata stored in Loops is freed here */
+}
+
+static inline const char *getAttrName(XMLElement *Attr)
+{   /* the element's name, compared by ParseXMLLoop() against "address", "length", ... */
+    return Attr->Name();
+}
+
+static inline const char *getAttrValue(XMLElement *Attr)
+{   /* Text of the element, or "" when it has none: GetText() is null unless the first child is a text node. */
+    return Attr->GetText() ? Attr->GetText() : "";
+}
+
+/* Build a LoopMetadata from the child elements of one loop node. Returns nullptr
+ * when LoopNode is null or when no address was filled in. */
+static LoopMetadata *ParseXMLLoop(XMLElement *LoopNode)
+{
+    if (LoopNode == nullptr)
+        return nullptr;
+
+    LoopMetadata *LoopMD = new LoopMetadata();
+    for (XMLElement *Attr = LoopNode->FirstChildElement(); Attr;
+         Attr = Attr->NextSiblingElement()) {
+        const std::string Name = getAttrName(Attr);
+        const char *Val = getAttrValue(Attr);
+        if (Val[0] == '\0')
+            continue;   /* elements without text leave their field untouched */
+
+        if (Name == "address")
+            LoopMD->Address = (target_ulong)strtoull(Val, nullptr, 16);
+        else if (Name == "length")
+            LoopMD->Length = (uint32_t)strtoul(Val, nullptr, 10);
+        else if (Name == "vs")
+            LoopMD->VS = (uint32_t)strtoul(Val, nullptr, 10);
+        else if (Name == "vf")
+            LoopMD->VF = (uint32_t)strtoul(Val, nullptr, 10);
+        else if (Name == "distance") {
+            /* A parsed distance of 0 is stored as INT_MAX. */
+            LoopMD->Distance = atoi(Val);
+            if (LoopMD->Distance == 0)
+                LoopMD->Distance = INT_MAX;
+        }
+        else if (Name == "start")    LoopMD->Start = atoi(Val);
+        else if (Name == "end")      LoopMD->End = atoi(Val);
+        else if (Name == "stride")   LoopMD->Stride = atoi(Val);
+    }
+
+    if (LoopMD->Address != (target_ulong)-1)
+        return LoopMD;
+    /* Address still holds the all-ones value (presumably LoopMetadata's default): discard. */
+    delete LoopMD;
+    return nullptr;
+}
+
+/* Load <loop> annotations from XML file `name` into Loops; returns 0 on success, 1 on failure. */
+int AnnotationFactory::ParseXML(const char *name)
+{
+    XMLDocument Doc;
+    if (Doc.LoadFile(name) != 0) {
+        dbg() << DEBUG_ANNOTATE << "Disable annotation support."
+              << " (cannot find " << name << ")\n";
+        return 1;
+    }
+    dbg() << DEBUG_ANNOTATE << "Found an annotation file " << name << "\n";
+
+    /* A legal annotation should be embedded within the <hqemu> tag. For example:
+     *   <hqemu><loop><address>...</address></loop></hqemu> */
+    XMLElement *RootNode = Doc.FirstChildElement("hqemu");
+    if (RootNode == nullptr)
+        return 1;
+
+    for (XMLElement *LoopNode = RootNode->FirstChildElement("loop"); LoopNode;
+         LoopNode = LoopNode->NextSiblingElement("loop")) { /* <loop> siblings only */
+        LoopMetadata *LoopMD = ParseXMLLoop(LoopNode);
+        if (LoopMD == nullptr)
+            continue;
+        auto &Slot = Loops[LoopMD->Address];
+        delete Slot;    /* drop an earlier annotation for the same address instead of leaking it */
+        Slot = LoopMD;
+    }
+
+    dbg() << DEBUG_ANNOTATE
+          << "Found " << Loops.size() << " loop annotation(s).\n";
+    return 0;
+}
+
+/* Return the loop annotation stored for address `addr`, or nullptr when there is none. */
+LoopMetadata *AnnotationFactory::getLoopAnnotation(target_ulong addr)
+{
+    hqemu::MutexGuard Guard(Lock);   /* the file-level Lock serializes lookups */
+
+    auto Found = Loops.find(addr);
+    return (Found == Loops.end()) ? nullptr : Found->second;
+}
+
+bool AnnotationFactory::hasLoopAnnotation(target_ulong addr)
+{
+    hqemu::MutexGuard Guard(Lock);   /* held while Loops is queried */
+    return Loops.count(addr) != 0;
+}
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/llvm-debug.cpp b/llvm/llvm-debug.cpp
new file mode 100644
index 0000000..e5d715a
--- /dev/null
+++ b/llvm/llvm-debug.cpp
@@ -0,0 +1,229 @@
+/*
+ *  (C) 2016 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCInstrAnalysis.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm-debug.h"
+#include "llvm.h"
+
+
+/* Look up the LLVM Target registered for TripleName; nullptr when there is none. */
+static const Target *getTarget(std::string TripleName)
+{
+    /*
+     * Get the target specific parser. lookupTarget() also fills Error on
+     * failure; only its return value is forwarded, so the message text is
+     * dropped here.
+     */
+    std::string Error;
+    return TargetRegistry::lookupTarget(TripleName.c_str(), Error);
+}
+
+/* Factory: build an MCDisasm for TripleName, or return nullptr when the triple is
+ * empty, equals "UnknownArch", or has no registered LLVM target. */
+MCDisasm *MCDisasm::CreateMCDisasm(std::string TripleName, bool isHost)
+{
+    const bool Unusable = TripleName.empty() || TripleName == "UnknownArch";
+    const Target *TheTarget = Unusable ? nullptr : getTarget(TripleName);
+    if (TheTarget == nullptr)
+        return nullptr;
+
+    return new MCDisasm(TheTarget, TripleName, isHost);
+}
+
+/*
+ * Create the LLVM MC objects (register, asm, subtarget and instruction info,
+ * disassembler, instruction printer) used to disassemble code for TripleName.
+ * None of the objects created here is released by this class.
+ */
+MCDisasm::MCDisasm(const llvm::Target *TheTarget, std::string TripleName,
+                   bool isHost)
+    : HostDisAsm(isHost), NoShowRawInsn(false)
+{
+    const char *triple = TripleName.c_str();
+    Triple TheTriple(Triple::normalize(TripleName));
+
+    /* Raw instruction bytes are suppressed for x86 and x86-64 only. */
+    const auto Arch = TheTriple.getArch();
+    NoShowRawInsn = (Arch == Triple::x86 || Arch == Triple::x86_64);
+
+    const MCRegisterInfo *MRI = TheTarget->createMCRegInfo(TripleName);
+    if (!MRI)
+        hqemu_error("no register info for target %s.\n", triple);
+    const MCAsmInfo *MAI = TheTarget->createMCAsmInfo(*MRI, TripleName);
+    if (!MAI)
+        hqemu_error("no assembly info for target %s\n", triple);
+    const MCSubtargetInfo *STI = TheTarget->createMCSubtargetInfo(TripleName, "", "");
+    if (!STI)
+        hqemu_error("no subtarget info for target %s\n", triple);
+    const MCInstrInfo *MII = TheTarget->createMCInstrInfo();
+    if (!MII)
+        hqemu_error("no instruction info for target %s\n", triple);
+
+    /* The disassembler keeps a reference to its MCContext, so the context must
+     * outlive this constructor; like the other MC objects it is never freed. */
+    MCContext *Ctx = new MCContext(MAI, MRI, nullptr);
+    const MCDisassembler *DisAsm = TheTarget->createMCDisassembler(*STI, *Ctx);
+    if (!DisAsm)
+        hqemu_error("no disassembler for target %s\n", TripleName.c_str());
+
+    const MCInstrAnalysis *MIA = TheTarget->createMCInstrAnalysis(MII);
+
+    int AsmPrinterVariant = MAI->getAssemblerDialect();
+#if defined(LLVM_V35)
+    MCInstPrinter *IP = TheTarget->createMCInstPrinter(
+            AsmPrinterVariant, *MAI, *MII, *MRI, *STI);
+#else
+    MCInstPrinter *IP = TheTarget->createMCInstPrinter(Triple(TripleName),
+            AsmPrinterVariant, *MAI, *MII, *MRI);
+#endif
+    if (!IP)
+        hqemu_error("no instruction printer for target %s\n", TripleName.c_str());
+
+    IP->setPrintImmHex(true);
+
+    this->DisAsm = DisAsm;
+    this->STI = STI;
+    this->IP = IP;
+    this->MIA = MIA;
+}
+
+MCDisasm::~MCDisasm()
+{   /* empty: the objects stored by the constructor (DisAsm, STI, IP, MIA) are not released here */
+}
+
+
+/* Append `bytes` to OS as lowercase hex, last byte first, each pair followed by
+ * a space. Prints nothing when NoShowRawInsn is set. */
+void MCDisasm::DumpBytes(ArrayRef<uint8_t> bytes, raw_ostream &OS)
+{
+    if (NoShowRawInsn)
+        return;
+
+    static const char Digits[] = "0123456789abcdef";
+    OS << "  ";
+    for (size_t Left = bytes.size(); Left > 0; --Left) {
+        const uint8_t Byte = bytes[Left - 1];
+        OS << Digits[Byte >> 4] << Digits[Byte & 0xF] << ' ';
+    }
+}
+
+#if defined(LLVM_V35)
+class DisasmMemoryObject : public MemoryObject {
+    uint8_t *Bytes;     /* start of the buffer being disassembled (not owned) */
+    uint64_t Size;      /* number of bytes in the buffer */
+    uint64_t BasePC;    /* address that corresponds to Bytes[0] */
+public:
+    DisasmMemoryObject(uint8_t *bytes, uint64_t size, uint64_t basePC) :
+                       Bytes(bytes), Size(size), BasePC(basePC) {}
+
+    uint64_t getBase() const override { return BasePC; }
+    uint64_t getExtent() const override { return Size; }
+
+    int readByte(uint64_t Addr, uint8_t *Byte) const override {
+        if (Addr - BasePC >= Size)  /* unsigned compare: also rejects Addr below BasePC */
+            return -1;
+        *Byte = Bytes[Addr - BasePC];
+        return 0;
+    }
+    ArrayRef<uint8_t> slice(size_t N, size_t M) const {    /* M bytes starting at offset N; no bounds check */
+        return makeArrayRef<uint8_t>(Bytes+N, M);
+    }
+};
+
+void MCDisasm::PrintInAsm(uint64_t Addr, uint64_t Size, uint64_t GuestAddr)
+{
+    uint64_t Len;
+    DisasmMemoryObject MemoryObject((uint8_t *)Addr, Size, Addr);
+    /* Disassemble the Size bytes at Addr one instruction per iteration; each output line is labelled with GuestAddr. */
+    for (uint64_t Start = 0; Start < Size; Start += Len) {
+        MCInst Inst;
+        std::string Str;
+        raw_string_ostream OS(Str);
+        if (DisAsm->getInstruction(Inst, Len, MemoryObject,
+                                   Addr + Start, nulls(), nulls())) {
+            OS << format("0x%08" PRIx64 ":", GuestAddr);
+            /* raw bytes (unless suppressed), then the printed instruction */
+            DumpBytes(MemoryObject.slice(Start, Len), OS);
+            IP->printInst(&Inst, OS, "");
+            /* for calls and branches, append the target when it can be evaluated */
+            if (MIA && (MIA->isCall(Inst) || MIA->isUnconditionalBranch(Inst) ||
+                MIA->isConditionalBranch(Inst))) {
+                uint64_t Target;
+                if (MIA->evaluateBranch(Inst, GuestAddr, Len, Target)) {
+                    OS << " <" << format("0x%08" PRIx64, Target) << ">";
+                    if (HostDisAsm) {
+                        if (Target == (uint64_t)tb_ret_addr)
+                            OS << " !tb_ret_addr";
+                    }
+                }
+            }
+        } else {
+            OS << "\t<internal disassembler error>";
+            if (Len == 0)   /* always advance by at least one byte so the loop makes progress */
+                Len = 1;
+        }
+
+        DM.debug() << OS.str() << "\n";
+        GuestAddr += Len;
+    }
+}
+#else
+void MCDisasm::PrintInAsm(uint64_t Addr, uint64_t Size, uint64_t GuestAddr)
+{
+    uint64_t Len;
+    ArrayRef<uint8_t> Bytes(reinterpret_cast<const uint8_t *>(Addr), Size);
+    /* Disassemble the Size bytes at Addr one instruction per iteration; each output line is labelled with GuestAddr. */
+    for (uint64_t Start = 0; Start < Size; Start += Len) {
+        MCInst Inst;
+        std::string Str;
+        raw_string_ostream OS(Str);
+        if (DisAsm->getInstruction(Inst, Len, Bytes.slice(Start),
+                                   Addr + Start, nulls(), nulls())) {
+            OS << format("0x%08" PRIx64 ":", GuestAddr);
+            /* raw bytes (unless suppressed), then the printed instruction */
+            DumpBytes(Bytes.slice(Start, Len), OS);
+            IP->printInst(&Inst, OS, "", *STI);
+            /* for calls and branches, append the target when it can be evaluated */
+            if (MIA && (MIA->isCall(Inst) || MIA->isUnconditionalBranch(Inst) ||
+                MIA->isConditionalBranch(Inst))) {
+                uint64_t Target;
+                if (MIA->evaluateBranch(Inst, GuestAddr, Len, Target)) {
+                    OS << " <" << format("0x%08" PRIx64, Target) << ">";
+                    if (HostDisAsm) {
+                        if (Target == (uint64_t)tb_ret_addr)
+                            OS << " !tb_ret_addr";
+                    }
+                }
+            }
+        } else {
+            OS << "\t<internal disassembler error>";
+            if (Len == 0)   /* always advance by at least one byte so the loop makes progress */
+                Len = 1;
+        }
+
+        DM.debug() << OS.str() << "\n";
+        GuestAddr += Len;
+    }
+}
+#endif
+
+void MCDisasm::PrintOutAsm(uint64_t Addr, uint64_t Size)
+{   /* Print an "OUT:" header with the size, then the disassembly of Size bytes at Addr. */
+    auto &OS = DM.debug();
+    OS << "\nOUT: [size=" << Size << "]\n";
+    PrintInAsm(Addr, Size, Addr);   /* Addr doubles as the address label on each line */
+    OS << "\n";
+}
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/llvm-hard-perfmon.cpp b/llvm/llvm-hard-perfmon.cpp
new file mode 100644
index 0000000..051ee02
--- /dev/null
+++ b/llvm/llvm-hard-perfmon.cpp
@@ -0,0 +1,289 @@
+/*
+ *  (C) 2018 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <string.h>
+#include "config-target.h"
+#include "tracer.h"
+#include "llvm.h"
+#include "llvm-soft-perfmon.h"
+#include "llvm-hard-perfmon.h"
+
+using namespace pmu;
+
+
+HardwarePerfmon::HardwarePerfmon() : MonThreadID(-1), MonThreadStop(true)
+{   /* initial state: monitor thread id -1 and the stop flag set */
+}
+
+HardwarePerfmon::~HardwarePerfmon()
+{
+    if (LLVMEnv::RunWithVTune)
+        return;     /* Init() also returns early in this mode, so there is nothing to finalize */
+
+    PMU::Finalize();
+
+    if (!MonThreadStop)
+        MonThreadStop = true;   /* MonitorFunc() polls this flag and leaves its loop once it is set */
+}
+
+/* Set up hardware performance monitoring (HPM). monitor_thread_tid becomes the PMU
+ * signal receiver unless StartMonThread() replaces it with the monitor thread's id. */
+void HardwarePerfmon::Init(int monitor_thread_tid)
+{
+    if (LLVMEnv::RunWithVTune)
+        return;
+
+    MonThreadID = monitor_thread_tid;
+
+#if defined(ENABLE_HPM_THREAD)
+    StartMonThread();       /* always run the HPM thread in this configuration */
+#else
+    /* If we attempt to profile hotspot but do not run the HPM translation mode,
+     * we enable the HPM monitor thread for the hotspot profiling in order to
+     * avoid deadlock. */
+    if (SP->Mode & SPM_HOTSPOT)
+        StartMonThread();
+#endif
+
+    /* Zero-filled PMU configuration; only the two fields below are set. */
+    PMUConfig Cfg;
+    memset(&Cfg, 0, sizeof(Cfg));
+    Cfg.SignalReceiver = MonThreadID;
+    Cfg.Timeout = 400;
+
+    const int Status = PMU::Init(Cfg);
+    if (Status == PMU_OK)
+        return;
+    dbg() << DEBUG_HPM << "Failed to initialize PMU (" << PMU::strerror(Status)
+          << ").\n";
+}
+
+/*
+ * Stop the monitor. Nothing is done when running with VTune.
+ */
+void HardwarePerfmon::Pause()
+{
+    if (!LLVMEnv::RunWithVTune)
+        PMU::Pause();
+}
+
+/*
+ * Restart the monitor. Nothing is done when running with VTune.
+ */
+void HardwarePerfmon::Resume()
+{
+    if (!LLVMEnv::RunWithVTune)
+        PMU::Resume();
+}
+
+/* Start the monitor thread and wait until it has stored its thread id in MonThreadID. */
+void HardwarePerfmon::StartMonThread()
+{
+    /* -1 marks "id not yet published"; MonitorFunc() overwrites it with gettid(). */
+    MonThreadID = -1;
+    MonThreadStop = false;
+    MonThread = std::thread(
+                    [=]() { MonitorFunc(); }
+                );
+    /* Detached: the thread is never joined; MonitorFunc() returns once MonThreadStop is set. */
+    MonThread.detach();
+    while (MonThreadID == -1)   /* NOTE(review): cross-thread polling -- confirm MonThreadID is volatile or atomic */
+        usleep(200);
+}
+
+/* Monitor thread routine: publish this thread's id, then idle until asked to stop. */
+void HardwarePerfmon::MonitorFunc()
+{
+    MonThreadID = gettid();     /* releases the wait loop in StartMonThread() */
+    copy_tcg_context();
+
+    while (!MonThreadStop)      /* polled every 10 ms; the destructor sets the flag */
+        usleep(10000);
+}
+
+static void CoverSetHandler(Handle Hndl, std::unique_ptr<SampleList> DataPtr,
+                            void *Opaque)
+{   /* Installed as the SampleHandler in MonitorCoverSet(); Hndl and Opaque are not used. */
+    /* Just attach the sampled IPs to the profile list. The soft-perfmon will
+     * release the resource later. */
+    SP->SampleListVec.push_back(DataPtr.release());     /* ownership leaves the unique_ptr here */
+}
+
+void HardwarePerfmon::RegisterThread(BaseTracer *Tracer)
+{
+    hqemu::MutexGuard Locked(Lock);
+    /* Create the calling thread's PerfmonData, initialize its monitors and store it in Tracer->Perf. */
+    dbg() << DEBUG_HPM << "Register thread " << gettid() << ".\n";
+
+    if (LLVMEnv::RunWithVTune)
+        return;     /* Tracer->Perf is not assigned in this mode */
+
+    PerfmonData *Perf = new PerfmonData(gettid());
+    Perf->MonitorBasic(HPM_INIT);
+    Perf->MonitorCoverSet(HPM_INIT);
+
+    Tracer->Perf = static_cast<void *>(Perf);   /* freed again by UnregisterThread() */
+}
+
+void HardwarePerfmon::UnregisterThread(BaseTracer *Tracer)
+{
+    hqemu::MutexGuard Locked(Lock);
+    /* Finalize and free the PerfmonData attached to Tracer, if there is one. */
+    dbg() << DEBUG_HPM << "Unregister thread " << gettid() << ".\n";
+
+    if (LLVMEnv::RunWithVTune)
+        return;
+    if (!Tracer->Perf)
+        return;     /* nothing was registered, or it was already unregistered */
+
+    auto Perf = static_cast<PerfmonData *>(Tracer->Perf);
+    Perf->MonitorBasic(HPM_FINALIZE);
+    Perf->MonitorCoverSet(HPM_FINALIZE);
+
+    delete Perf;
+    Tracer->Perf = nullptr;     /* makes a second call a no-op */
+}
+
+/* Issue HPM_START on the counters attached to Tracer, if any were registered
+ * (judging by the name, called on entry to the code cache). */
+void HardwarePerfmon::NotifyCacheEnter(BaseTracer *Tracer)
+{
+    hqemu::MutexGuard Guard(Lock);
+    PerfmonData *Data = static_cast<PerfmonData *>(Tracer->Perf);
+    if (Data != nullptr)
+        Data->MonitorBasic(HPM_START);
+}
+
+/* Issue HPM_STOP on the counters attached to Tracer, if any were registered
+ * (judging by the name, called on exit from the code cache). */
+void HardwarePerfmon::NotifyCacheLeave(BaseTracer *Tracer)
+{
+    hqemu::MutexGuard Guard(Lock);
+    PerfmonData *Data = static_cast<PerfmonData *>(Tracer->Perf);
+    if (Data != nullptr)
+        Data->MonitorBasic(HPM_STOP);
+}
+
+/*
+ * PerfmonData: the PMU event handles of one thread (RegisterThread() creates one per registered thread).
+ */
+PerfmonData::PerfmonData(int tid) : TID(tid)
+{   /* only TID is set here; NOTE(review): confirm the handles default to PMU_INVALID_HNDL in the class definition */
+}
+
+PerfmonData::~PerfmonData()
+{   /* empty: handles are cleaned up explicitly through the HPM_FINALIZE control */
+}
+
+void PerfmonData::MonitorBasic(HPMControl Ctl)
+{   /* Drive the basic event counters (instructions, branches, loads, stores) according to Ctl. */
+    if (!(SP->Mode & SPM_HPM))
+        return;     /* basic counters are used only when SPM_HPM is set in the mode */
+
+    switch (Ctl) {
+    case HPM_INIT:      /* create each event and start it when creation succeeds */
+        if (PMU::CreateEvent(PMU_INSTRUCTIONS, ICountHndl) == PMU_OK) {
+            dbg() << DEBUG_HPM << "Register event: # instructions.\n";
+            PMU::Start(ICountHndl);
+        }
+        if (PMU::CreateEvent(PMU_BRANCH_INSTRUCTIONS, BranchHndl) == PMU_OK) {
+            dbg() << DEBUG_HPM << "Register event: # branch instructions.\n";
+            PMU::Start(BranchHndl);
+        }
+        if (PMU::CreateEvent(PMU_MEM_LOADS, MemLoadHndl) == PMU_OK) {
+            dbg() << DEBUG_HPM << "Register event: # load instructions.\n";
+            PMU::Start(MemLoadHndl);
+        }
+        if (PMU::CreateEvent(PMU_MEM_STORES, MemStoreHndl) == PMU_OK) {
+            dbg() << DEBUG_HPM << "Register event: # store instructions.\n";
+            PMU::Start(MemStoreHndl);
+        }
+        break;
+    case HPM_FINALIZE:  /* read the final counts, release the events, add the counts to SP's totals */
+    {
+        uint64_t NumInsns = 0, NumBranches = 0, NumLoads = 0, NumStores = 0;
+        if (ICountHndl != PMU_INVALID_HNDL) {
+            PMU::ReadEvent(ICountHndl, NumInsns);
+            PMU::Cleanup(ICountHndl);
+        }
+        if (BranchHndl != PMU_INVALID_HNDL) {
+            PMU::ReadEvent(BranchHndl, NumBranches);
+            PMU::Cleanup(BranchHndl);
+        }
+        if (MemLoadHndl != PMU_INVALID_HNDL) {
+            PMU::ReadEvent(MemLoadHndl, NumLoads);
+            PMU::Cleanup(MemLoadHndl);
+        }
+        if (MemStoreHndl != PMU_INVALID_HNDL) {
+            PMU::ReadEvent(MemStoreHndl, NumStores);
+            PMU::Cleanup(MemStoreHndl);
+        }
+
+        SP->NumInsns += NumInsns;
+        SP->NumBranches += NumBranches;
+        SP->NumLoads += NumLoads;
+        SP->NumStores += NumStores;
+        break;
+    }
+    case HPM_START:     /* snapshot the current counts into the LastNum* members (instructions are not snapshotted) */
+        if (BranchHndl != PMU_INVALID_HNDL)
+            PMU::ReadEvent(BranchHndl, LastNumBranches);
+        if (MemLoadHndl != PMU_INVALID_HNDL)
+            PMU::ReadEvent(MemLoadHndl, LastNumLoads);
+        if (MemStoreHndl != PMU_INVALID_HNDL)
+            PMU::ReadEvent(MemStoreHndl, LastNumStores);
+        break;
+    case HPM_STOP:      /* NOTE(review): the counts read below are never used -- confirm whether deltas against LastNum* were intended */
+    {
+        uint64_t NumBranches = 0, NumLoads = 0, NumStores = 0;
+        if (BranchHndl != PMU_INVALID_HNDL)
+            PMU::ReadEvent(BranchHndl, NumBranches);
+        if (MemLoadHndl != PMU_INVALID_HNDL)
+            PMU::ReadEvent(MemLoadHndl, NumLoads);
+        if (MemStoreHndl != PMU_INVALID_HNDL)
+            PMU::ReadEvent(MemStoreHndl, NumStores);
+        break;
+    }
+    default:
+        break;
+    }
+}
+
+void PerfmonData::MonitorCoverSet(HPMControl Ctl)
+{   /* Set up or tear down instruction-pointer sampling for cover-set (hotspot) profiling. */
+    if (!(SP->Mode & SPM_HOTSPOT))
+        return;     /* sampling is used only when SPM_HOTSPOT is set in the mode */
+
+    switch (Ctl) {
+    case HPM_INIT: {
+        Sample1Config IPConfig;
+        memset(&IPConfig, 0, sizeof(Sample1Config));
+        IPConfig.EventCode = PMU_INSTRUCTIONS;
+        IPConfig.NumPages = 4;
+        IPConfig.Period = 1e5;      /* floating-point literal 100000, converted to the field's type */
+        IPConfig.Watermark = IPConfig.NumPages * getpagesize() / 2;     /* half of the NumPages buffer, in bytes */
+        IPConfig.SampleHandler = CoverSetHandler;
+        IPConfig.Opaque = static_cast<void *>(this);    /* passed through to the handler, which ignores it */
+
+        if (PMU::CreateSampleIP(IPConfig, CoverSetHndl) == PMU_OK) {
+            dbg() << DEBUG_HPM << "Register event: cover set sampling.\n";
+            PMU::Start(CoverSetHndl);
+        }
+        break;
+    }
+    case HPM_FINALIZE:
+        if (CoverSetHndl != PMU_INVALID_HNDL)
+            PMU::Cleanup(CoverSetHndl);
+        break;
+    case HPM_START:     /* start/stop need no action for sampling */
+    case HPM_STOP:
+    default:
+        break;
+    }
+}
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/llvm-opc-mmu.cpp b/llvm/llvm-opc-mmu.cpp
new file mode 100644
index 0000000..9d2e60f
--- /dev/null
+++ b/llvm/llvm-opc-mmu.cpp
@@ -0,0 +1,344 @@
+/*
+ *  (C) 2015 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *   This file provides LLVM IR generator in terms of basic block and trace.
+ */
+
+#include "llvm-debug.h"
+#include "llvm.h"
+#include "llvm-opc.h"
+#include "llvm-target.h"
+#include "utils.h"
+
+#if defined(CONFIG_SOFTMMU)
+extern "C" {
+extern const void * const llvm_ld_helpers[16];  /* load helpers; QEMULoad() indexes this with opc & (MO_BSWAP | MO_SIZE) */
+extern const void * const llvm_st_helpers[16];  /* store helpers; defined elsewhere, like the load table */
+};
+#endif
+
+
+#if defined(CONFIG_USER_ONLY)
+/* User-mode guest load from address AddrL (AddrH is not used in this variant). */
+Value *IRFactory::QEMULoad(Value *AddrL, Value *AddrH, TCGMemOpIdx oi)
+{
+    TCGMemOp opc = get_memop(oi);
+    PointerType *PtrTy = getPointerTy(getSizeInBits(opc), Segment);
+    Value *Ptr;
+
+    if (GUEST_BASE != 0 && Segment == 0) {
+        /* Rebase: byte-wise GEP from the guest address by GuestBaseReg.Base. */
+        Ptr = ITP(AddrL, Int8PtrTy);
+        Ptr = GetElementPtrInst::CreateInBounds(Ptr, GuestBaseReg.Base, "", LastInst);
+        if (Ptr->getType() != PtrTy)
+            Ptr = CAST(Ptr, PtrTy);
+    } else {
+        Ptr = ITP(AddrL, PtrTy);
+    }
+
+    LoadInst *LI = new LoadInst(Ptr, "", true, LastInst);   /* emitted as a volatile load */
+    MF->setGuestMemory(LI);
+    return ConvertEndian(LI, opc);
+}
+
+/* User-mode guest store of Data to address AddrL (AddrH is not used in this variant). */
+void IRFactory::QEMUStore(Value *Data, Value *AddrL, Value *AddrH, TCGMemOpIdx oi)
+{
+    TCGMemOp opc = get_memop(oi);
+    PointerType *PtrTy = getPointerTy(getSizeInBits(opc), Segment);
+    Value *Ptr;
+
+    /* Byte order is fixed up before the address is materialized. */
+    Data = ConvertEndian(Data, opc);
+
+    if (GUEST_BASE != 0 && Segment == 0) {
+        Ptr = ITP(AddrL, Int8PtrTy);
+        Ptr = GetElementPtrInst::CreateInBounds(Ptr, GuestBaseReg.Base, "", LastInst);
+        if (Ptr->getType() != PtrTy)
+            Ptr = CAST(Ptr, PtrTy);
+    } else {
+        Ptr = ITP(AddrL, PtrTy);
+    }
+    StoreInst *SI = new StoreInst(Data, Ptr, true, LastInst);   /* emitted as a volatile store */
+    MF->setGuestMemory(SI);
+}
+
+#else /* !CONFIG_USER_ONLY */
+
+/* Byte offset of tlb_table[mem_index][0] within CPUArchState; only indices 0-5 below NB_MMU_MODES are handled. */
+inline long getTLBOffset(int mem_index)
+{
+    long Offset = 0;
+    switch (mem_index) {
+#if NB_MMU_MODES > 0
+    case 0: Offset = offsetof(CPUArchState, tlb_table[0][0]); break;
+#endif
+#if NB_MMU_MODES > 1
+    case 1: Offset = offsetof(CPUArchState, tlb_table[1][0]); break;
+#endif
+#if NB_MMU_MODES > 2
+    case 2: Offset = offsetof(CPUArchState, tlb_table[2][0]); break;
+#endif
+#if NB_MMU_MODES > 3
+    case 3: Offset = offsetof(CPUArchState, tlb_table[3][0]); break;
+#endif
+#if NB_MMU_MODES > 4
+    case 4: Offset = offsetof(CPUArchState, tlb_table[4][0]); break;
+#endif
+#if NB_MMU_MODES > 5
+    case 5: Offset = offsetof(CPUArchState, tlb_table[5][0]); break;  /* must not fall into the error case */
+#endif
+    default:
+        IRError("%s: internal error. mem_index=%d\n", __func__, mem_index);
+    }
+
+    return Offset;
+}
+
+Value *IRFactory::ConcatTLBVersion(Value *GVA)
+{   /* Emit IR that loads CPUArchState::tlb_version and ORs it into GVA; returns the OR result. */
+#if defined(ENABLE_TLBVERSION_EXT)
+    GVA = ZEXT64(GVA);      /* with the EXT variant the address is widened to 64 bits first */
+#endif
+    Type *PtrTy = getPointerTy(DL->getTypeSizeInBits(GVA->getType()));    /* pointer to an integer as wide as GVA */
+    Value *TLBVersion = GetElementPtrInst::CreateInBounds(CPU,
+            CONSTPtr(offsetof(CPUArchState, tlb_version)), "", LastInst);
+    TLBVersion = new BitCastInst(TLBVersion, PtrTy, "", LastInst);
+    TLBVersion = new LoadInst(TLBVersion, "version", true, LastInst);     /* emitted as a volatile load */
+    return OR(GVA, TLBVersion);
+}
+
+Value *IRFactory::QEMULoad(Value *AddrL, Value *AddrH, TCGMemOpIdx oi)
+{   /* Softmmu guest load: emit an inline TLB lookup with a helper call on a miss; returns the PHI that merges both results. */
+    TCGMemOp opc = get_memop(oi);
+    int mem_index = get_mmuidx(oi);
+    IntegerType *AccessTy;
+    PointerType *GuestPtrTy, *HostPtrTy;
+    int Size, s_bits = opc & MO_SIZE;
+
+    Size = 8 * 1 << s_bits; /* data size (bits) for this load */
+
+    const void *helper = llvm_ld_helpers[opc & (MO_BSWAP | MO_SIZE)];
+    Function *MissFunc = ResolveFunction(getMMUFName(helper));
+    if (!MissFunc)
+        IRError("%s: internal error.\n", __func__);
+
+    GuestPtrTy = (TARGET_LONG_BITS == 32) ? Int32PtrTy : Int64PtrTy;
+    HostPtrTy = (TCG_TARGET_REG_BITS == 32) ? Int32PtrTy : Int64PtrTy;
+
+#if defined(ENABLE_TLBVERSION_EXT)
+    GuestPtrTy = Int64PtrTy;
+#endif
+
+    /* Create TLB basic blocks. */
+    BasicBlock *tlb_hit = BasicBlock::Create(*Context, "tlb_hit", Func);
+    BasicBlock *tlb_miss = BasicBlock::Create(*Context, "tlb_miss", Func);
+    BasicBlock *tlb_exit = BasicBlock::Create(*Context, "tlb_exit", Func);
+    toSink.push_back(tlb_miss);
+
+    /* Load compared value in TLB. QEMU uses only addrlo to index the TLB entry. */
+    Value *TLBEntry, *TLBValue, *CPUAddr;
+    AccessTy = (TCG_TARGET_REG_BITS == 64 && TARGET_LONG_BITS == 64) ? Int64Ty : Int32Ty;
+    size_t Offset = getTLBOffset(mem_index) + offsetof(CPUTLBEntry, addr_read);
+    TLBEntry = LSHR(AddrL, ConstantInt::get(AccessTy, TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS));
+    TLBEntry = AND(TLBEntry, ConstantInt::get(AccessTy, (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS));
+    TLBEntry = ADD(TLBEntry, ConstantInt::get(AccessTy, Offset));
+
+    if (TLBEntry->getType() != IntPtrTy)
+        TLBEntry = new ZExtInst(TLBEntry, IntPtrTy, "", LastInst);
+
+    CPUAddr = new PtrToIntInst(CPU, IntPtrTy, "", LastInst);
+    TLBEntry = ADD(CPUAddr, TLBEntry);      /* host address of the entry's addr_read field */
+    TLBValue = new IntToPtrInst(TLBEntry, GuestPtrTy, "", LastInst);
+    TLBValue = new LoadInst(TLBValue, "tlb.read", false, LastInst);
+
+    /* Compare GVA and TLB value. */
+    Value *GVA, *Cond, *GuestPC = AddrL;
+    AccessTy = (TARGET_LONG_BITS == 32) ? Int32Ty : Int64Ty;
+    if (AddrH) { /* guest is 64-bit and host is 32-bit. */
+        GuestPC = SHL(ZEXT64(AddrH), CONST64(32));
+        GuestPC = OR(GuestPC, ZEXT64(AddrL));
+    }
+#if defined(ALIGNED_ONLY)
+    GVA = AND(GuestPC, ConstantInt::get(AccessTy,
+              TARGET_PAGE_MASK | ((1 << s_bits) - 1)));
+#elif defined(ENABLE_TLBVERSION)
+    GVA = ADD(GuestPC, ConstantInt::get(AccessTy, (1 << s_bits) - 1));
+    GVA = AND(GVA, ConstantInt::get(AccessTy, TARGET_PAGE_MASK));
+    GVA = ConcatTLBVersion(GVA);
+#else
+    GVA = ADD(GuestPC, ConstantInt::get(AccessTy, (1 << s_bits) - 1));
+    GVA = AND(GVA, ConstantInt::get(AccessTy, TARGET_PAGE_MASK));
+#endif
+    Cond = ICMP(GVA, TLBValue, ICmpInst::ICMP_EQ);
+    BranchInst::Create(tlb_hit, tlb_miss, Cond, LastInst);
+    LastInst->eraseFromParent();    /* the instruction used as insertion point so far is removed; the branch above now ends the block */
+
+    /* TLB hit. */
+    Value *PhyAddr, *Addend, *HitData, *Addr=AddrL;
+
+    LastInst = BranchInst::Create(tlb_exit, tlb_hit);
+    if (Addr->getType() != IntPtrTy)
+        Addr = new ZExtInst(Addr, IntPtrTy, "", LastInst);
+
+    Offset = offsetof(CPUTLBEntry, addend) - offsetof(CPUTLBEntry, addr_read);
+    Addend = ADD(TLBEntry, ConstantInt::get(IntPtrTy, Offset));     /* same entry, addend field */
+    Addend = new IntToPtrInst(Addend, HostPtrTy, "", LastInst);
+    Addend = new LoadInst(Addend, "tlb.addend", false, LastInst);
+    PhyAddr = ADD(Addr, Addend);    /* host address = guest address + TLB addend */
+    PhyAddr = ITP(PhyAddr, getPointerTy(Size));
+    HitData = new LoadInst(PhyAddr, "hit", true, LastInst);
+
+    HitData = ConvertEndian(HitData, opc);
+
+    /* TLB miss. */
+    LastInst = BranchInst::Create(tlb_exit, tlb_miss);
+    SmallVector<Value *, 4> Params;
+    uint32_t restore_val = setRestorePoint(oi);
+    Params.push_back(CPUStruct);
+    Params.push_back(GuestPC);
+    Params.push_back(CONST32(restore_val));
+
+    CallInst *MissCall = CallInst::Create(MissFunc, Params, "", LastInst);
+    Value *MissData = MissCall;
+    switch (opc & MO_SSIZE) {   /* resize the helper's return value to the access width when it differs */
+    case MO_UB:
+    case MO_SB:
+        if (DL->getTypeSizeInBits(MissData->getType()) != 8)
+            MissData = TRUNC8(MissCall);
+        break;
+    case MO_UW:
+    case MO_SW:
+        if (DL->getTypeSizeInBits(MissData->getType()) != 16)
+            MissData = TRUNC16(MissCall);
+        break;
+    case MO_UL:
+    case MO_SL:
+        if (DL->getTypeSizeInBits(MissData->getType()) != 32)
+            MissData = TRUNC32(MissCall);
+        break;
+    case MO_Q:
+        if (DL->getTypeSizeInBits(MissData->getType()) != 64)
+            MissData = ZEXT64(MissCall);
+        break;
+    default:
+        IRError("%s: invalid size (opc=%d)\n", __func__, opc);
+        break;
+    }
+
+    /* TLB exit. */
+    CurrBB = tlb_exit;
+    LastInst = BranchInst::Create(ExitBB, CurrBB);
+    PHINode *PH = PHINode::Create(HitData->getType(), 2, "", LastInst);
+    PH->addIncoming(HitData, tlb_hit);
+    PH->addIncoming(MissData, tlb_miss);
+
+    return PH;
+}
+
+void IRFactory::QEMUStore(Value *Data, Value *AddrL, Value *AddrH, TCGMemOpIdx oi)
+{   /* Emits IR for a guest store via the TLB: inline lookup, direct store on a hit, helper call on a miss. */
+    TCGMemOp opc = get_memop(oi);   /* memory-op descriptor taken from oi */
+    int mem_index = get_mmuidx(oi); /* MMU index taken from oi; selects the TLB offset below */
+    IntegerType *AccessTy;
+    PointerType *GuestPtrTy, *HostPtrTy;
+    int Size, s_bits = opc & MO_SIZE; /* s_bits: log2 of the access width in bytes */
+
+    Size = 8 * 1 << s_bits; /* data size (bits) for this store; evaluates as (8 * 1) << s_bits */
+
+    const void *helper = llvm_st_helpers[opc & (MO_BSWAP | MO_SIZE)]; /* miss-path helper, keyed by the MO_BSWAP and MO_SIZE bits */
+    Function *MissFunc = ResolveFunction(getMMUFName(helper));
+    if (!MissFunc)
+        IRError("%s: internal error.\n", __func__); /* the helper could not be resolved to a Function */
+
+    GuestPtrTy = (TARGET_LONG_BITS == 32) ? Int32PtrTy : Int64PtrTy;   /* pointer type used to load the TLB comparator */
+    HostPtrTy = (TCG_TARGET_REG_BITS == 32) ? Int32PtrTy : Int64PtrTy; /* pointer type used to load the TLB addend */
+
+#if defined(ENABLE_TLBVERSION_EXT)
+    GuestPtrTy = Int64PtrTy;
+#endif
+
+    /* Create TLB basic blocks: hit path, miss path, and the join block. */
+    BasicBlock *tlb_hit = BasicBlock::Create(*Context, "tlb_hit", Func);
+    BasicBlock *tlb_miss = BasicBlock::Create(*Context, "tlb_miss", Func);
+    BasicBlock *tlb_exit = BasicBlock::Create(*Context, "tlb_exit", Func);
+    toSink.push_back(tlb_miss); /* record the miss block in toSink (presumably to move it out of line later -- confirm) */
+
+    /* Load compared value in TLB. QEMU uses only addrlo to index the TLB entry. */
+    Value *TLBEntry, *TLBValue, *CPUAddr;
+    AccessTy = (TCG_TARGET_REG_BITS == 64 && TARGET_LONG_BITS == 64) ? Int64Ty : Int32Ty;
+    size_t Offset = getTLBOffset(mem_index) + offsetof(CPUTLBEntry, addr_write); /* offset of the addr_write field for this MMU index */
+    TLBEntry = LSHR(AddrL, ConstantInt::get(AccessTy, TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS)); /* entry offset = ((AddrL >> shift) & mask) + Offset */
+    TLBEntry = AND(TLBEntry, ConstantInt::get(AccessTy, (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS));
+    TLBEntry = ADD(TLBEntry, ConstantInt::get(AccessTy, Offset));
+
+    if (TLBEntry->getType() != IntPtrTy)
+        TLBEntry = new ZExtInst(TLBEntry, IntPtrTy, "", LastInst); /* zero-extend the offset to IntPtrTy */
+
+    CPUAddr = new PtrToIntInst(CPU, IntPtrTy, "", LastInst);
+    TLBEntry = ADD(CPUAddr, TLBEntry); /* CPU address + offset: address of the addr_write field */
+    TLBValue = new IntToPtrInst(TLBEntry, GuestPtrTy, "", LastInst);
+    TLBValue = new LoadInst(TLBValue, "tlb.write", false, LastInst); /* non-volatile load of the comparator */
+
+    /* Compare GVA and TLB value. */
+    Value *GVA, *Cond, *GuestPC = AddrL;
+    AccessTy = (TARGET_LONG_BITS == 32) ? Int32Ty : Int64Ty;
+    if (AddrH != nullptr) { /* guest is 64-bit and host is 32-bit. */
+        GuestPC = SHL(ZEXT64(AddrH), CONST64(32)); /* rebuild the 64-bit address as (AddrH << 32) | AddrL */
+        GuestPC = OR(GuestPC, ZEXT64(AddrL));
+    }
+#if defined(ALIGNED_ONLY)
+    GVA = AND(GuestPC, ConstantInt::get(AccessTy,
+              TARGET_PAGE_MASK | ((1 << s_bits) - 1))); /* keeps the low alignment bits, so a misaligned address fails the compare */
+#elif defined(ENABLE_TLBVERSION)
+    GVA = ADD(GuestPC, ConstantInt::get(AccessTy, (1 << s_bits) - 1)); /* address of the last byte accessed */
+    GVA = AND(GVA, ConstantInt::get(AccessTy, TARGET_PAGE_MASK));
+    GVA = ConcatTLBVersion(GVA);
+#else
+    GVA = ADD(GuestPC, ConstantInt::get(AccessTy, (1 << s_bits) - 1)); /* address of the last byte accessed */
+    GVA = AND(GVA, ConstantInt::get(AccessTy, TARGET_PAGE_MASK));
+#endif
+    Cond = ICMP(GVA, TLBValue, ICmpInst::ICMP_EQ);
+    BranchInst::Create(tlb_hit, tlb_miss, Cond, LastInst); /* equal -> tlb_hit, otherwise -> tlb_miss */
+    LastInst->eraseFromParent(); /* the old insertion-point instruction is replaced by the branch above */
+
+    /* TLB hit: store directly to (AddrL + addend). */
+    Value *PhyAddr, *Addend, *Addr=AddrL;
+
+    LastInst = BranchInst::Create(tlb_exit, tlb_hit); /* terminator of tlb_hit; later instructions are inserted before it */
+    if (Addr->getType() != IntPtrTy)
+        Addr = new ZExtInst(Addr, IntPtrTy, "", LastInst);
+
+    Offset = offsetof(CPUTLBEntry, addend) - offsetof(CPUTLBEntry, addr_write); /* distance from addr_write to addend within the entry */
+    Addend = ADD(TLBEntry, ConstantInt::get(IntPtrTy, Offset));
+    Addend = new IntToPtrInst(Addend, HostPtrTy, "", LastInst);
+    Addend = new LoadInst(Addend, "tlb.addend", false, LastInst);
+    PhyAddr = ADD(Addr, Addend);
+    PhyAddr = ITP(PhyAddr, getPointerTy(Size)); /* pointer to a Size-bit value */
+
+    Value *HitData = ConvertEndian(Data, opc); /* data passed through ConvertEndian according to opc */
+
+    new StoreInst(HitData, PhyAddr, true, LastInst); /* volatile store on the hit path */
+
+    /* TLB miss: call the helper with (CPUStruct, guest address, data, restore value). */
+    LastInst = BranchInst::Create(tlb_exit, tlb_miss);
+    SmallVector<Value *, 4> Params;
+    uint32_t restore_val = setRestorePoint(oi);
+    Params.push_back(CPUStruct);
+    Params.push_back(GuestPC);
+    Params.push_back(Data); /* the helper receives the unconverted Data, not HitData */
+    Params.push_back(CONST32(restore_val));
+
+    CallInst::Create(MissFunc, Params, "", LastInst);
+
+    /* TLB exit: later IR is emitted into tlb_exit, before its branch to ExitBB. */
+    CurrBB = tlb_exit;
+    LastInst = BranchInst::Create(ExitBB, CurrBB);
+}
+
+#endif /* CONFIG_USER_ONLY */
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/llvm-opc-vector.cpp b/llvm/llvm-opc-vector.cpp
new file mode 100644
index 0000000..3ce5f68
--- /dev/null
+++ b/llvm/llvm-opc-vector.cpp
@@ -0,0 +1,943 @@
+/*
+ *  (C) 2015 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *   This file provides TCG vector IR to LLVM IR conversions.
+ */
+
+#include "llvm.h"
+#include "llvm-debug.h"
+#include "llvm-opc.h"
+#include "utils.h"
+
+
+extern TCGOpDef llvm_op_defs[];
+
+
+void IRFactory::op_vector_start(const TCGArg *args)
+{   /* Not a translatable opcode: reaching this handler is reported as an error. */
+    IRError("%s: this function should never be called.\n", __func__);
+}
+
+void IRFactory::op_vector_end(const TCGArg *args)
+{   /* Not a translatable opcode: reaching this handler is reported as an error. */
+    IRError("%s: this function should never be called.\n", __func__);
+}
+
+void IRFactory::op_vmov_128(const TCGArg *args)
+{   /* Copies a 128-bit vector from CPU + args[1] to CPU + args[0]. */
+    IRDebug(INDEX_op_vmov_128);
+
+    TCGArg DstOff = args[0]; /* destination offset from CPU */
+    TCGArg SrcOff = args[1]; /* source offset from CPU */
+    Value *Dst, *Src;
+
+    VectorType *VectorTy = VectorType::get(Int8Ty, 16); /* 16 x Int8Ty = 128 bits */
+    PointerType *PtrTy = PointerType::getUnqual(VectorTy);
+    Src = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(SrcOff), "", LastInst);
+    Src = new BitCastInst(Src, PtrTy, "", LastInst);
+    Dst = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(DstOff), "", LastInst);
+    Dst = new BitCastInst(Dst, PtrTy, "", LastInst);
+
+    Src = new LoadInst(Src, "", false, LastInst); /* non-volatile load of the whole vector */
+    new StoreInst(Src, Dst, false, LastInst);     /* non-volatile store of the whole vector */
+}
+
+void IRFactory::op_vload_128(const TCGArg *args)
+{   /* Loads a 128-bit vector from guest memory and stores it at CPU + args[0]. */
+    IRDebug(INDEX_op_vload_128);
+
+    TCGArg Off = args[0];        /* destination offset from CPU */
+    Register &In = Reg[args[1]]; /* register holding the guest address */
+    TCGArg Alignment = (args[2] == (TCGArg)-1) ? 4 : args[2] / 8; /* -1 selects 4; otherwise args[2] / 8 (presumably bits -> bytes; confirm) */
+    Value *Base = LoadState(In);
+    LoadInst *LI;
+
+    AssertType(In.Size == 32 || In.Size == 64);
+
+    VectorType *VectorTy = VectorType::get(Int8Ty, 16); /* 16 x Int8Ty = 128 bits */
+    PointerType *PtrTy = PointerType::get(VectorTy, Segment); /* vector pointer in address space Segment */
+
+    if (GUEST_BASE == 0 || Segment != 0) {
+        Base = ITP(Base, PtrTy); /* the guest address is used as the pointer directly */
+        LI = new LoadInst(Base, "", true, LastInst); /* volatile load */
+    } else {
+        Base = ITP(Base, Int8PtrTy);
+        Base = GetElementPtrInst::CreateInBounds(Base, GuestBaseReg.Base, "", LastInst); /* offset the address by GuestBaseReg.Base */
+        if (Base->getType() != PtrTy)
+            Base = CAST(Base, PtrTy);
+        LI = new LoadInst(Base, "", true, LastInst); /* volatile load */
+    }
+    LI->setAlignment(Alignment);
+
+    MF->setGuestMemory(LI); /* tag the load via MF->setGuestMemory */
+
+    PtrTy = PointerType::getUnqual(VectorTy); /* from here on PtrTy is the default-address-space vector pointer */
+    Value *V = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(Off), "", LastInst);
+    V = new BitCastInst(V, PtrTy, "", LastInst);
+    new StoreInst(LI, V, false, LastInst); /* non-volatile store of the loaded vector at CPU + Off */
+}
+
+void IRFactory::op_vstore_128(const TCGArg *args)
+{   /* Loads a 128-bit vector from CPU + args[0] and stores it to guest memory. */
+    IRDebug(INDEX_op_vstore_128);
+
+    TCGArg Off = args[0];        /* source offset from CPU */
+    Register &In = Reg[args[1]]; /* register holding the guest address */
+    TCGArg Alignment = (args[2] == (TCGArg)-1) ? 4 : args[2] / 8; /* -1 selects 4; otherwise args[2] / 8 (presumably bits -> bytes; confirm) */
+    Value *Base = LoadState(In);
+    StoreInst *SI;
+
+    AssertType(In.Size == 32 || In.Size == 64);
+
+    VectorType *VectorTy = VectorType::get(Int8Ty, 16); /* 16 x Int8Ty = 128 bits */
+    PointerType *PtrTy = nullptr;
+
+    PtrTy = PointerType::getUnqual(VectorTy); /* default-address-space pointer for the CPU-side load */
+    Value *V = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(Off), "", LastInst);
+    V = new BitCastInst(V, PtrTy, "", LastInst);
+    V = new LoadInst(V, "", false, LastInst); /* non-volatile load of the vector at CPU + Off */
+
+    PtrTy = PointerType::get(VectorTy, Segment); /* vector pointer in address space Segment for the guest access */
+    if (GUEST_BASE == 0 || Segment != 0) {
+        Base = ITP(Base, PtrTy); /* the guest address is used as the pointer directly */
+        SI = new StoreInst(V, Base, true, LastInst); /* volatile store */
+    } else {
+        Base = ITP(Base, Int8PtrTy);
+        Base = GetElementPtrInst::CreateInBounds(Base, GuestBaseReg.Base, "", LastInst); /* offset the address by GuestBaseReg.Base */
+        if (Base->getType() != PtrTy)
+            Base = CAST(Base, PtrTy);
+        SI = new StoreInst(V, Base, true, LastInst); /* volatile store */
+    }
+
+    SI->setAlignment(Alignment);
+
+    MF->setGuestMemory(SI); /* tag the store via MF->setGuestMemory */
+}
+
+#define llvm_gen_vop(_Fn,_Num,_Ty)      /* two-operand vector op: [Out] = _Fn([In1], [In2]) */ \
+do {                                    /* uses `args`, `CPU` and `LastInst` from the expanding scope */ \
+    TCGArg Out = args[0];               /* result offset from CPU */ \
+    TCGArg In1 = args[1];               /* first operand offset from CPU */ \
+    TCGArg In2 = args[2];               /* second operand offset from CPU */ \
+    Value *OutPtr, *InPtr1, *InPtr2;    \
+    VectorType *VectorTy = VectorType::get(_Ty, _Num);     /* _Num elements of type _Ty */ \
+    PointerType *PtrTy = PointerType::getUnqual(VectorTy); \
+                                                           \
+    InPtr1 = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(In1), "", LastInst); \
+    InPtr1 = new BitCastInst(InPtr1, PtrTy, "", LastInst);                \
+    InPtr2 = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(In2), "", LastInst); \
+    InPtr2 = new BitCastInst(InPtr2, PtrTy, "", LastInst);                \
+    OutPtr = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(Out), "", LastInst); \
+    OutPtr = new BitCastInst(OutPtr, PtrTy, "", LastInst);                \
+                                                                          \
+    Value *InData1 = new LoadInst(InPtr1, "", false, LastInst);           \
+    Value *InData2 = new LoadInst(InPtr2, "", false, LastInst);           \
+    InData1 = _Fn(InData1, InData2);     /* operand order: In1 first, In2 second */ \
+    new StoreInst(InData1, OutPtr, false, LastInst);                      \
+} while (0)
+
+#define llvm_gen_vop2(_Fn1,_Fn2,_Num,_Ty)      /* accumulating op: [Out] = _Fn2([Out], _Fn1([In1], [In2])) */ \
+do {                                           /* uses `args`, `CPU` and `LastInst` from the expanding scope */ \
+    TCGArg Out = args[0];                      /* accumulator/result offset from CPU */ \
+    TCGArg In1 = args[1];                      /* first operand offset from CPU */ \
+    TCGArg In2 = args[2];                      /* second operand offset from CPU */ \
+    Value *OutPtr, *InPtr1, *InPtr2;           \
+    VectorType *VectorTy = VectorType::get(_Ty, _Num);     /* _Num elements of type _Ty */ \
+    PointerType *PtrTy = PointerType::getUnqual(VectorTy); \
+                                                           \
+    InPtr1 = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(In1), "", LastInst); \
+    InPtr1 = new BitCastInst(InPtr1, PtrTy, "", LastInst);                \
+    InPtr2 = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(In2), "", LastInst); \
+    InPtr2 = new BitCastInst(InPtr2, PtrTy, "", LastInst);                \
+    OutPtr = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(Out), "", LastInst); \
+    OutPtr = new BitCastInst(OutPtr, PtrTy, "", LastInst);                \
+                                                                          \
+    Value *InData1 = new LoadInst(InPtr1, "", false, LastInst);           \
+    Value *InData2 = new LoadInst(InPtr2, "", false, LastInst);           \
+    Value *InData3 = new LoadInst(OutPtr, "", false, LastInst);  /* current value of the destination */ \
+    InData1 = _Fn2(InData3, _Fn1(InData1, InData2));  /* destination is the first operand of _Fn2 */ \
+    new StoreInst(InData1, OutPtr, false, LastInst);                      \
+} while (0)
+
+
+void IRFactory::op_vadd_i8_128(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vadd_i8_128);
+    llvm_gen_vop(ADD, 16, Int8Ty); /* 16 x Int8Ty: [args[0]] = ADD([args[1]], [args[2]]) */
+}
+
+void IRFactory::op_vadd_i16_128(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vadd_i16_128);
+    llvm_gen_vop(ADD, 8, Int16Ty); /* 8 x Int16Ty: [args[0]] = ADD([args[1]], [args[2]]) */
+}
+
+void IRFactory::op_vadd_i32_128(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vadd_i32_128);
+    llvm_gen_vop(ADD, 4, Int32Ty); /* 4 x Int32Ty: [args[0]] = ADD([args[1]], [args[2]]) */
+}
+
+void IRFactory::op_vadd_i64_128(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vadd_i64_128);
+    llvm_gen_vop(ADD, 2, Int64Ty); /* 2 x Int64Ty: [args[0]] = ADD([args[1]], [args[2]]) */
+}
+
+void IRFactory::op_vadd_i8_64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vadd_i8_64);
+    llvm_gen_vop(ADD, 8, Int8Ty); /* 8 x Int8Ty (64 bits): [args[0]] = ADD([args[1]], [args[2]]) */
+}
+
+void IRFactory::op_vadd_i16_64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vadd_i16_64);
+    llvm_gen_vop(ADD, 4, Int16Ty); /* 4 x Int16Ty (64 bits): [args[0]] = ADD([args[1]], [args[2]]) */
+}
+
+void IRFactory::op_vadd_i32_64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vadd_i32_64);
+    llvm_gen_vop(ADD, 2, Int32Ty); /* 2 x Int32Ty (64 bits): [args[0]] = ADD([args[1]], [args[2]]) */
+}
+
+void IRFactory::op_vsub_i8_128(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vsub_i8_128);
+    llvm_gen_vop(SUB, 16, Int8Ty); /* 16 x Int8Ty: [args[0]] = SUB([args[1]], [args[2]]) */
+}
+
+void IRFactory::op_vsub_i16_128(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vsub_i16_128);
+    llvm_gen_vop(SUB, 8, Int16Ty); /* 8 x Int16Ty: [args[0]] = SUB([args[1]], [args[2]]) */
+}
+
+void IRFactory::op_vsub_i32_128(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vsub_i32_128);
+    llvm_gen_vop(SUB, 4, Int32Ty); /* 4 x Int32Ty: [args[0]] = SUB([args[1]], [args[2]]) */
+}
+
+void IRFactory::op_vsub_i64_128(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vsub_i64_128);
+    llvm_gen_vop(SUB, 2, Int64Ty); /* 2 x Int64Ty: [args[0]] = SUB([args[1]], [args[2]]) */
+}
+
+void IRFactory::op_vsub_i8_64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vsub_i8_64);
+    llvm_gen_vop(SUB, 8, Int8Ty); /* 8 x Int8Ty (64 bits): [args[0]] = SUB([args[1]], [args[2]]) */
+}
+
+void IRFactory::op_vsub_i16_64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vsub_i16_64);
+    llvm_gen_vop(SUB, 4, Int16Ty); /* 4 x Int16Ty (64 bits): [args[0]] = SUB([args[1]], [args[2]]) */
+}
+
+void IRFactory::op_vsub_i32_64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vsub_i32_64);
+    llvm_gen_vop(SUB, 2, Int32Ty); /* 2 x Int32Ty (64 bits): [args[0]] = SUB([args[1]], [args[2]]) */
+}
+
+void IRFactory::op_vadd_f32_128(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vadd_f32_128);
+    llvm_gen_vop(FADD, 4, FloatTy); /* 4 x FloatTy: [args[0]] = FADD([args[1]], [args[2]]) */
+}
+
+void IRFactory::op_vadd_f64_128(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vadd_f64_128);
+    llvm_gen_vop(FADD, 2, DoubleTy); /* 2 x DoubleTy: [args[0]] = FADD([args[1]], [args[2]]) */
+}
+
+void IRFactory::op_vadd_f32_64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vadd_f32_64);
+    llvm_gen_vop(FADD, 2, FloatTy); /* 2 x FloatTy (64 bits): [args[0]] = FADD([args[1]], [args[2]]) */
+}
+
+void IRFactory::op_vpadd_f32_128(const TCGArg *args)
+{   /* Stub: no IR is generated; the opcode is reported through IRError. */
+    IRError("%s not implemented.\n", __func__);
+}
+
+void IRFactory::op_vpadd_f64_128(const TCGArg *args)
+{   /* Stub: no IR is generated; the opcode is reported through IRError. */
+    IRError("%s not implemented.\n", __func__);
+}
+
+void IRFactory::op_vpadd_f32_64(const TCGArg *args)
+{   /* Stub: no IR is generated; the opcode is reported through IRError. */
+    IRError("%s not implemented.\n", __func__);
+}
+
+void IRFactory::op_vsub_f32_128(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vsub_f32_128);
+    llvm_gen_vop(FSUB, 4, FloatTy); /* 4 x FloatTy: [args[0]] = FSUB([args[1]], [args[2]]) */
+}
+
+void IRFactory::op_vsub_f64_128(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vsub_f64_128);
+    llvm_gen_vop(FSUB, 2, DoubleTy); /* 2 x DoubleTy: [args[0]] = FSUB([args[1]], [args[2]]) */
+}
+
+void IRFactory::op_vsub_f32_64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vsub_f32_64);
+    llvm_gen_vop(FSUB, 2, FloatTy); /* 2 x FloatTy (64 bits): [args[0]] = FSUB([args[1]], [args[2]]) */
+}
+
+void IRFactory::op_vabd_f32_128(const TCGArg *args)
+{   /* Stub: no IR is generated; the opcode is reported through IRError. */
+    IRError("%s not implemented.\n", __func__);
+}
+
+void IRFactory::op_vabd_f64_128(const TCGArg *args)
+{   /* Stub: no IR is generated; the opcode is reported through IRError. */
+    IRError("%s not implemented.\n", __func__);
+}
+
+void IRFactory::op_vabd_f32_64(const TCGArg *args)
+{   /* Stub: no IR is generated; the opcode is reported through IRError. */
+    IRError("%s not implemented.\n", __func__);
+}
+
+void IRFactory::op_vfma_f32_128(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vfma_f32_128);
+    llvm_gen_vop2(FMUL, FADD, 4, FloatTy); /* 4 x FloatTy: [args[0]] = FADD([args[0]], FMUL([args[1]], [args[2]])), as two separate operations */
+}
+
+void IRFactory::op_vfma_f64_128(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vfma_f64_128);
+    llvm_gen_vop2(FMUL, FADD, 2, DoubleTy); /* 2 x DoubleTy: [args[0]] = FADD([args[0]], FMUL([args[1]], [args[2]])), as two separate operations */
+}
+
+void IRFactory::op_vfma_f32_64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vfma_f32_64);
+    llvm_gen_vop2(FMUL, FADD, 2, FloatTy); /* 2 x FloatTy (64 bits): [args[0]] = FADD([args[0]], FMUL([args[1]], [args[2]])), as two separate operations */
+}
+
+void IRFactory::op_vfms_f32_128(const TCGArg *args)
+{   /* Stub: no IR is generated; the opcode is reported through IRError. */
+    IRError("%s not implemented.\n", __func__);
+}
+
+void IRFactory::op_vfms_f64_128(const TCGArg *args)
+{   /* Stub: no IR is generated; the opcode is reported through IRError. */
+    IRError("%s not implemented.\n", __func__);
+}
+
+void IRFactory::op_vfms_f32_64(const TCGArg *args)
+{   /* Stub: no IR is generated; the opcode is reported through IRError. */
+    IRError("%s not implemented.\n", __func__);
+}
+
+void IRFactory::op_vmul_f32_128(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vmul_f32_128);
+    llvm_gen_vop(FMUL, 4, FloatTy); /* 4 x FloatTy: [args[0]] = FMUL([args[1]], [args[2]]) */
+}
+
+void IRFactory::op_vmul_f64_128(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vmul_f64_128);
+    llvm_gen_vop(FMUL, 2, DoubleTy); /* 2 x DoubleTy: [args[0]] = FMUL([args[1]], [args[2]]) */
+}
+
+void IRFactory::op_vmul_f32_64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vmul_f32_64);
+    llvm_gen_vop(FMUL, 2, FloatTy); /* 2 x FloatTy (64 bits): [args[0]] = FMUL([args[1]], [args[2]]) */
+}
+
+void IRFactory::op_vmla_f32_128(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vmla_f32_128);
+    llvm_gen_vop2(FMUL, FADD, 4, FloatTy); /* 4 x FloatTy: [args[0]] = FADD([args[0]], FMUL([args[1]], [args[2]])) */
+}
+
+void IRFactory::op_vmla_f64_128(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vmla_f64_128);
+    llvm_gen_vop2(FMUL, FADD, 2, DoubleTy); /* 2 x DoubleTy: [args[0]] = FADD([args[0]], FMUL([args[1]], [args[2]])) */
+}
+
+void IRFactory::op_vmla_f32_64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vmla_f32_64);
+    llvm_gen_vop2(FMUL, FADD, 2, FloatTy); /* 2 x FloatTy (64 bits): [args[0]] = FADD([args[0]], FMUL([args[1]], [args[2]])) */
+}
+
+void IRFactory::op_vmls_f32_128(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vmls_f32_128);
+    llvm_gen_vop2(FMUL, FSUB, 4, FloatTy); /* 4 x FloatTy: [args[0]] = FSUB([args[0]], FMUL([args[1]], [args[2]])) */
+}
+
+void IRFactory::op_vmls_f64_128(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vmls_f64_128);
+    llvm_gen_vop2(FMUL, FSUB, 2, DoubleTy); /* 2 x DoubleTy: [args[0]] = FSUB([args[0]], FMUL([args[1]], [args[2]])) */
+}
+
+void IRFactory::op_vmls_f32_64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vmls_f32_64);
+    llvm_gen_vop2(FMUL, FSUB, 2, FloatTy); /* 2 x FloatTy (64 bits): [args[0]] = FSUB([args[0]], FMUL([args[1]], [args[2]])) */
+}
+
+void IRFactory::op_vdiv_f32_128(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vdiv_f32_128);
+    llvm_gen_vop(FDIV, 4, FloatTy); /* 4 x FloatTy: [args[0]] = FDIV([args[1]], [args[2]]) */
+}
+
+void IRFactory::op_vdiv_f64_128(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vdiv_f64_128);
+    llvm_gen_vop(FDIV, 2, DoubleTy); /* 2 x DoubleTy: [args[0]] = FDIV([args[1]], [args[2]]) */
+}
+
+void IRFactory::op_vdiv_f32_64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vdiv_f32_64);
+    llvm_gen_vop(FDIV, 2, FloatTy); /* 2 x FloatTy (64 bits): [args[0]] = FDIV([args[1]], [args[2]]) */
+}
+
+void IRFactory::op_vand_128(const TCGArg *args)
+{   /* 128-bit bitwise AND of the vectors at CPU + args[1] and CPU + args[2] into CPU + args[0]. */
+    IRDebug(INDEX_op_vand_128);
+    if (args[1] == args[2]) { /* x AND x == x: identical sources reduce to a copy */
+        op_vmov_128(args);    /* op_vmov_128 copies args[1] to args[0] */
+        return;
+    }
+    llvm_gen_vop(AND, 4, Int32Ty); /* 4 x Int32Ty: [args[0]] = AND([args[1]], [args[2]]) */
+}
+
+void IRFactory::op_vand_64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vand_64);
+    llvm_gen_vop(AND, 2, Int32Ty); /* 2 x Int32Ty (64 bits): [args[0]] = AND([args[1]], [args[2]]) */
+}
+
+void IRFactory::op_vbic_128(const TCGArg *args)
+{   /* 128-bit bit clear: [args[0]] = [args[1]] AND NOT [args[2]]. */
+    IRDebug(INDEX_op_vbic_128);
+
+    TCGArg Out = args[0]; /* result offset from CPU */
+    TCGArg In1 = args[1]; /* first operand offset from CPU */
+    TCGArg In2 = args[2]; /* offset of the operand that gets inverted */
+    Value *OutPtr, *InPtr1, *InPtr2;
+    VectorType *VectorTy = VectorType::get(Int32Ty, 4); /* 4 x Int32Ty = 128 bits */
+    PointerType *PtrTy = PointerType::getUnqual(VectorTy);
+
+    InPtr1 = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(In1), "", LastInst);
+    InPtr1 = new BitCastInst(InPtr1, PtrTy, "", LastInst);
+    InPtr2 = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(In2), "", LastInst);
+    InPtr2 = new BitCastInst(InPtr2, PtrTy, "", LastInst);
+    OutPtr = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(Out), "", LastInst);
+    OutPtr = new BitCastInst(OutPtr, PtrTy, "", LastInst);
+
+    std::vector<Constant *> V; /* all-ones constant vector, used to invert by XOR */
+    for (int i = 0; i < 4; i++)
+        V.push_back(CONST32(-1U));
+    Value *VecMinusOne = ConstantVector::get(V);
+
+    Value *InData1 = new LoadInst(InPtr1, "", false, LastInst);
+    Value *InData2 = new LoadInst(InPtr2, "", false, LastInst);
+    InData2 = XOR(InData2, VecMinusOne); /* bitwise NOT of the second operand */
+    InData1 = AND(InData1, InData2);
+    new StoreInst(InData1, OutPtr, false, LastInst);
+}
+
+void IRFactory::op_vbic_64(const TCGArg *args)
+{   /* 64-bit bit clear: [args[0]] = [args[1]] AND NOT [args[2]]. */
+    IRDebug(INDEX_op_vbic_64);
+
+    TCGArg Out = args[0]; /* result offset from CPU */
+    TCGArg In1 = args[1]; /* first operand offset from CPU */
+    TCGArg In2 = args[2]; /* offset of the operand that gets inverted */
+    Value *OutPtr, *InPtr1, *InPtr2;
+    VectorType *VectorTy = VectorType::get(Int32Ty, 2); /* 2 x Int32Ty = 64 bits */
+    PointerType *PtrTy = PointerType::getUnqual(VectorTy);
+
+    InPtr1 = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(In1), "", LastInst);
+    InPtr1 = new BitCastInst(InPtr1, PtrTy, "", LastInst);
+    InPtr2 = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(In2), "", LastInst);
+    InPtr2 = new BitCastInst(InPtr2, PtrTy, "", LastInst);
+    OutPtr = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(Out), "", LastInst);
+    OutPtr = new BitCastInst(OutPtr, PtrTy, "", LastInst);
+
+    std::vector<Constant *> V; /* all-ones constant vector, used to invert by XOR */
+    for (int i = 0; i < 2; i++)
+        V.push_back(CONST32(-1U));
+    Value *VecMinusOne = ConstantVector::get(V);
+
+    Value *InData1 = new LoadInst(InPtr1, "", false, LastInst);
+    Value *InData2 = new LoadInst(InPtr2, "", false, LastInst);
+    InData2 = XOR(InData2, VecMinusOne); /* bitwise NOT of the second operand */
+    InData1 = AND(InData1, InData2);
+    new StoreInst(InData1, OutPtr, false, LastInst);
+}
+
+void IRFactory::op_vorr_128(const TCGArg *args)
+{   /* 128-bit bitwise OR of the vectors at CPU + args[1] and CPU + args[2] into CPU + args[0]. */
+    IRDebug(INDEX_op_vorr_128);
+    if (args[1] == args[2]) { /* x OR x == x: identical sources reduce to a copy */
+        op_vmov_128(args);    /* op_vmov_128 copies args[1] to args[0] */
+        return;
+    }
+    llvm_gen_vop(OR, 4, Int32Ty); /* 4 x Int32Ty: [args[0]] = OR([args[1]], [args[2]]) */
+}
+
+void IRFactory::op_vorr_64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vorr_64);
+    llvm_gen_vop(OR, 2, Int32Ty); /* 2 x Int32Ty (64 bits): [args[0]] = OR([args[1]], [args[2]]) */
+}
+
+void IRFactory::op_vorn_128(const TCGArg *args)
+{   /* 128-bit OR-NOT: [args[0]] = [args[1]] OR NOT [args[2]]. */
+    IRDebug(INDEX_op_vorn_128);
+
+    TCGArg Out = args[0]; /* result offset from CPU */
+    TCGArg In1 = args[1]; /* first operand offset from CPU */
+    TCGArg In2 = args[2]; /* offset of the operand that gets inverted */
+    Value *OutPtr, *InPtr1, *InPtr2;
+    VectorType *VectorTy = VectorType::get(Int32Ty, 4); /* 4 x Int32Ty = 128 bits */
+    PointerType *PtrTy = PointerType::getUnqual(VectorTy);
+
+    InPtr1 = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(In1), "", LastInst);
+    InPtr1 = new BitCastInst(InPtr1, PtrTy, "", LastInst);
+    InPtr2 = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(In2), "", LastInst);
+    InPtr2 = new BitCastInst(InPtr2, PtrTy, "", LastInst);
+    OutPtr = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(Out), "", LastInst);
+    OutPtr = new BitCastInst(OutPtr, PtrTy, "", LastInst);
+
+    std::vector<Constant *> V; /* all-ones constant vector, used to invert by XOR */
+    for (int i = 0; i < 4; i++)
+        V.push_back(CONST32(-1U));
+    Value *VecMinusOne = ConstantVector::get(V);
+
+    Value *InData1 = new LoadInst(InPtr1, "", false, LastInst);
+    Value *InData2 = new LoadInst(InPtr2, "", false, LastInst);
+    InData2 = XOR(InData2, VecMinusOne); /* bitwise NOT of the second operand */
+    InData1 = OR(InData1, InData2);
+    new StoreInst(InData1, OutPtr, false, LastInst);
+}
+
+void IRFactory::op_vorn_64(const TCGArg *args)
+{   /* 64-bit OR-NOT: [args[0]] = [args[1]] OR NOT [args[2]]. */
+    IRDebug(INDEX_op_vorn_64);
+
+    TCGArg Out = args[0]; /* result offset from CPU */
+    TCGArg In1 = args[1]; /* first operand offset from CPU */
+    TCGArg In2 = args[2]; /* offset of the operand that gets inverted */
+    Value *OutPtr, *InPtr1, *InPtr2;
+    VectorType *VectorTy = VectorType::get(Int32Ty, 2); /* 2 x Int32Ty = 64 bits */
+    PointerType *PtrTy = PointerType::getUnqual(VectorTy);
+
+    InPtr1 = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(In1), "", LastInst);
+    InPtr1 = new BitCastInst(InPtr1, PtrTy, "", LastInst);
+    InPtr2 = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(In2), "", LastInst);
+    InPtr2 = new BitCastInst(InPtr2, PtrTy, "", LastInst);
+    OutPtr = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(Out), "", LastInst);
+    OutPtr = new BitCastInst(OutPtr, PtrTy, "", LastInst);
+
+    std::vector<Constant *> V; /* all-ones constant vector, used to invert by XOR */
+    for (int i = 0; i < 2; i++)
+        V.push_back(CONST32(-1U));
+    Value *VecMinusOne = ConstantVector::get(V);
+
+    Value *InData1 = new LoadInst(InPtr1, "", false, LastInst);
+    Value *InData2 = new LoadInst(InPtr2, "", false, LastInst);
+    InData2 = XOR(InData2, VecMinusOne); /* bitwise NOT of the second operand */
+    InData1 = OR(InData1, InData2);
+    new StoreInst(InData1, OutPtr, false, LastInst);
+}
+
+void IRFactory::op_veor_128(const TCGArg *args)
+{
+    IRDebug(INDEX_op_veor_128);
+    llvm_gen_vop(XOR, 4, Int32Ty); /* 4 x Int32Ty: [args[0]] = XOR([args[1]], [args[2]]) */
+}
+
+void IRFactory::op_veor_64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_veor_64);
+    llvm_gen_vop(XOR, 2, Int32Ty); /* 2 x Int32Ty (64 bits): [args[0]] = XOR([args[1]], [args[2]]) */
+}
+
+void IRFactory::op_vbif_128(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vbif_128);
+
+    /* vbif rd, rn, rm   (128-bit; rd = args[0], rn = args[1], rm = args[2])
+     *   operation: rd <- (rd & rm) | (rn & ~rm) */
+    TCGArg Out = args[0]; /* rd: read as an input and overwritten with the result */
+    TCGArg In1 = args[1]; /* rn */
+    TCGArg In2 = args[2]; /* rm: the bit mask */
+    Value *OutPtr, *InPtr1, *InPtr2;
+    VectorType *VectorTy = VectorType::get(Int32Ty, 4); /* 4 x Int32Ty = 128 bits */
+    PointerType *PtrTy = PointerType::getUnqual(VectorTy);
+
+    InPtr1 = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(In1), "", LastInst);
+    InPtr1 = new BitCastInst(InPtr1, PtrTy, "", LastInst);
+    InPtr2 = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(In2), "", LastInst);
+    InPtr2 = new BitCastInst(InPtr2, PtrTy, "", LastInst);
+    OutPtr = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(Out), "", LastInst);
+    OutPtr = new BitCastInst(OutPtr, PtrTy, "", LastInst);
+
+    std::vector<Constant *> V; /* all-ones constant vector, used to invert by XOR */
+    for (int i = 0; i < 4; i++)
+        V.push_back(CONST32(-1U));
+    Value *VecMinusOne = ConstantVector::get(V);
+
+    Value *InData1 = new LoadInst(InPtr1, "", false, LastInst);
+    Value *InData2 = new LoadInst(InPtr2, "", false, LastInst);
+    Value *InData3 = new LoadInst(OutPtr, "", false, LastInst); /* current rd */
+
+    InData3 = AND(InData3, InData2);                   /* rd & rm */
+    InData1 = AND(InData1, XOR(InData2, VecMinusOne)); /* rn & ~rm */
+    InData3 = OR(InData1, InData3);
+    new StoreInst(InData3, OutPtr, false, LastInst);
+}
+
+void IRFactory::op_vbif_64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vbif_64);
+
+    /* vbif rd, rn, rm   (64-bit; rd = args[0], rn = args[1], rm = args[2])
+     *   operation: rd <- (rd & rm) | (rn & ~rm) */
+    TCGArg Out = args[0]; /* rd: read as an input and overwritten with the result */
+    TCGArg In1 = args[1]; /* rn */
+    TCGArg In2 = args[2]; /* rm: the bit mask */
+    Value *OutPtr, *InPtr1, *InPtr2;
+    VectorType *VectorTy = VectorType::get(Int32Ty, 2); /* 2 x Int32Ty = 64 bits */
+    PointerType *PtrTy = PointerType::getUnqual(VectorTy);
+
+    InPtr1 = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(In1), "", LastInst);
+    InPtr1 = new BitCastInst(InPtr1, PtrTy, "", LastInst);
+    InPtr2 = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(In2), "", LastInst);
+    InPtr2 = new BitCastInst(InPtr2, PtrTy, "", LastInst);
+    OutPtr = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(Out), "", LastInst);
+    OutPtr = new BitCastInst(OutPtr, PtrTy, "", LastInst);
+
+    std::vector<Constant *> V; /* all-ones constant vector, used to invert by XOR */
+    for (int i = 0; i < 2; i++)
+        V.push_back(CONST32(-1U));
+    Value *VecMinusOne = ConstantVector::get(V);
+
+    Value *InData1 = new LoadInst(InPtr1, "", false, LastInst);
+    Value *InData2 = new LoadInst(InPtr2, "", false, LastInst);
+    Value *InData3 = new LoadInst(OutPtr, "", false, LastInst); /* current rd */
+
+    InData3 = AND(InData3, InData2);                   /* rd & rm */
+    InData1 = AND(InData1, XOR(InData2, VecMinusOne)); /* rn & ~rm */
+    InData3 = OR(InData1, InData3);
+    new StoreInst(InData3, OutPtr, false, LastInst);
+}
+
+void IRFactory::op_vbit_128(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vbit_128);
+
+    /* vbit rd, rn, rm   (128-bit; rd = args[0], rn = args[1], rm = args[2])
+     *   operation: rd <- (rn & rm) | (rd & ~rm) */
+    TCGArg Out = args[0]; /* rd: read as an input and overwritten with the result */
+    TCGArg In1 = args[1]; /* rn */
+    TCGArg In2 = args[2]; /* rm: the bit mask */
+    Value *OutPtr, *InPtr1, *InPtr2;
+    VectorType *VectorTy = VectorType::get(Int32Ty, 4); /* 4 x Int32Ty = 128 bits */
+    PointerType *PtrTy = PointerType::getUnqual(VectorTy);
+
+    InPtr1 = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(In1), "", LastInst);
+    InPtr1 = new BitCastInst(InPtr1, PtrTy, "", LastInst);
+    InPtr2 = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(In2), "", LastInst);
+    InPtr2 = new BitCastInst(InPtr2, PtrTy, "", LastInst);
+    OutPtr = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(Out), "", LastInst);
+    OutPtr = new BitCastInst(OutPtr, PtrTy, "", LastInst);
+
+    std::vector<Constant *> V; /* all-ones constant vector, used to invert by XOR */
+    for (int i = 0; i < 4; i++)
+        V.push_back(CONST32(-1U));
+    Value *VecMinusOne = ConstantVector::get(V);
+
+    Value *InData1 = new LoadInst(InPtr1, "", false, LastInst);
+    Value *InData2 = new LoadInst(InPtr2, "", false, LastInst);
+    Value *InData3 = new LoadInst(OutPtr, "", false, LastInst); /* current rd */
+
+    InData1 = AND(InData1, InData2);                   /* rn & rm */
+    InData3 = AND(InData3, XOR(InData2, VecMinusOne)); /* rd & ~rm */
+    InData3 = OR(InData1, InData3);
+    new StoreInst(InData3, OutPtr, false, LastInst);
+}
+
+void IRFactory::op_vbit_64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vbit_64);
+
+    /* vbit rd, rn, rm   (64-bit; rd = args[0], rn = args[1], rm = args[2])
+     *   operation: rd <- (rn & rm) | (rd & ~rm) */
+    TCGArg Out = args[0]; /* rd: read as an input and overwritten with the result */
+    TCGArg In1 = args[1]; /* rn */
+    TCGArg In2 = args[2]; /* rm: the bit mask */
+    Value *OutPtr, *InPtr1, *InPtr2;
+    VectorType *VectorTy = VectorType::get(Int32Ty, 2); /* 2 x Int32Ty = 64 bits */
+    PointerType *PtrTy = PointerType::getUnqual(VectorTy);
+
+    InPtr1 = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(In1), "", LastInst);
+    InPtr1 = new BitCastInst(InPtr1, PtrTy, "", LastInst);
+    InPtr2 = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(In2), "", LastInst);
+    InPtr2 = new BitCastInst(InPtr2, PtrTy, "", LastInst);
+    OutPtr = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(Out), "", LastInst);
+    OutPtr = new BitCastInst(OutPtr, PtrTy, "", LastInst);
+
+    std::vector<Constant *> V; /* all-ones constant vector, used to invert by XOR */
+    for (int i = 0; i < 2; i++)
+        V.push_back(CONST32(-1U));
+    Value *VecMinusOne = ConstantVector::get(V);
+
+    Value *InData1 = new LoadInst(InPtr1, "", false, LastInst);
+    Value *InData2 = new LoadInst(InPtr2, "", false, LastInst);
+    Value *InData3 = new LoadInst(OutPtr, "", false, LastInst); /* current rd */
+
+    InData1 = AND(InData1, InData2);                   /* rn & rm */
+    InData3 = AND(InData3, XOR(InData2, VecMinusOne)); /* rd & ~rm */
+    InData3 = OR(InData1, InData3);
+    new StoreInst(InData3, OutPtr, false, LastInst);
+}
+
+void IRFactory::op_vbsl_128(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vbsl_128);
+
+    /* vbsl rd, rn, rm   (128-bit; rd = args[0], rn = args[1], rm = args[2])
+     *   operation: rd <- (rn & rd) | (rm & ~rd) */
+    TCGArg Out = args[0]; /* rd: acts as the bit mask and is overwritten with the result */
+    TCGArg In1 = args[1]; /* rn */
+    TCGArg In2 = args[2]; /* rm */
+    Value *OutPtr, *InPtr1, *InPtr2;
+    VectorType *VectorTy = VectorType::get(Int32Ty, 4); /* 4 x Int32Ty = 128 bits */
+    PointerType *PtrTy = PointerType::getUnqual(VectorTy);
+
+    InPtr1 = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(In1), "", LastInst);
+    InPtr1 = new BitCastInst(InPtr1, PtrTy, "", LastInst);
+    InPtr2 = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(In2), "", LastInst);
+    InPtr2 = new BitCastInst(InPtr2, PtrTy, "", LastInst);
+    OutPtr = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(Out), "", LastInst);
+    OutPtr = new BitCastInst(OutPtr, PtrTy, "", LastInst);
+
+    std::vector<Constant *> V; /* all-ones constant vector, used to invert by XOR */
+    for (int i = 0; i < 4; i++)
+        V.push_back(CONST32(-1U));
+    Value *VecMinusOne = ConstantVector::get(V);
+
+    Value *InData1 = new LoadInst(InPtr1, "", false, LastInst);
+    Value *InData2 = new LoadInst(InPtr2, "", false, LastInst);
+    Value *InData3 = new LoadInst(OutPtr, "", false, LastInst); /* current rd */
+
+    InData1 = AND(InData1, InData3);                   /* rn & rd */
+    InData2 = AND(InData2, XOR(InData3, VecMinusOne)); /* rm & ~rd */
+    InData3 = OR(InData1, InData2);
+    new StoreInst(InData3, OutPtr, false, LastInst);
+}
+
+void IRFactory::op_vbsl_64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vbsl_64);
+
+    /* vbsl rd, rn, rm   (64-bit; rd = args[0], rn = args[1], rm = args[2])
+     *   operation: rd <- (rn & rd) | (rm & ~rd) */
+    TCGArg Out = args[0]; /* rd: acts as the bit mask and is overwritten with the result */
+    TCGArg In1 = args[1]; /* rn */
+    TCGArg In2 = args[2]; /* rm */
+    Value *OutPtr, *InPtr1, *InPtr2;
+    VectorType *VectorTy = VectorType::get(Int32Ty, 2); /* 2 x Int32Ty = 64 bits */
+    PointerType *PtrTy = PointerType::getUnqual(VectorTy);
+
+    InPtr1 = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(In1), "", LastInst);
+    InPtr1 = new BitCastInst(InPtr1, PtrTy, "", LastInst);
+    InPtr2 = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(In2), "", LastInst);
+    InPtr2 = new BitCastInst(InPtr2, PtrTy, "", LastInst);
+    OutPtr = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(Out), "", LastInst);
+    OutPtr = new BitCastInst(OutPtr, PtrTy, "", LastInst);
+
+    std::vector<Constant *> V; /* all-ones constant vector, used to invert by XOR */
+    for (int i = 0; i < 2; i++)
+        V.push_back(CONST32(-1U));
+    Value *VecMinusOne = ConstantVector::get(V);
+
+    Value *InData1 = new LoadInst(InPtr1, "", false, LastInst);
+    Value *InData2 = new LoadInst(InPtr2, "", false, LastInst);
+    Value *InData3 = new LoadInst(OutPtr, "", false, LastInst); /* current rd */
+
+    InData1 = AND(InData1, InData3);                   /* rn & rd */
+    InData2 = AND(InData2, XOR(InData3, VecMinusOne)); /* rm & ~rd */
+    InData3 = OR(InData1, InData2);
+    new StoreInst(InData3, OutPtr, false, LastInst);
+}
+
+void IRFactory::op_vsitofp_128(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vsitofp_128);
+
+    /* args: [0] result offset, [1] operand offset, [2] element width in bits. */
+    const TCGArg DstOff = args[0];
+    const TCGArg SrcOff = args[1];
+    unsigned Lanes = 0;
+    Type *IntElemTy = nullptr, *FPElemTy = nullptr;
+    switch (args[2]) {
+    case 32:    /* 4 x i32 -> 4 x float */
+        Lanes = 4;
+        IntElemTy = Int32Ty;
+        FPElemTy = FloatTy;
+        break;
+    case 64:    /* 2 x i64 -> 2 x double */
+        Lanes = 2;
+        IntElemTy = Int64Ty;
+        FPElemTy = DoubleTy;
+        break;
+    default:
+        IRError("%s: invalid element size.\n", __func__);
+    }
+
+    VectorType *IntVecTy = VectorType::get(IntElemTy, Lanes);
+    VectorType *FPVecTy = VectorType::get(FPElemTy, Lanes);
+
+    Value *SrcPtr = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(SrcOff), "", LastInst);
+    SrcPtr = new BitCastInst(SrcPtr, PointerType::getUnqual(IntVecTy), "", LastInst);
+    Value *DstPtr = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(DstOff), "", LastInst);
+    DstPtr = new BitCastInst(DstPtr, PointerType::getUnqual(FPVecTy), "", LastInst);
+
+    Value *IntVec = new LoadInst(SrcPtr, "", false, LastInst);
+    Value *FPVec = new SIToFPInst(IntVec, FPVecTy, "", LastInst);
+    new StoreInst(FPVec, DstPtr, false, LastInst);
+}
+
+void IRFactory::op_vuitofp_128(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vuitofp_128);
+
+    /* args: [0] result offset, [1] operand offset, [2] element width in bits. */
+    const TCGArg DstOff = args[0];
+    const TCGArg SrcOff = args[1];
+    unsigned Lanes = 0;
+    Type *IntElemTy = nullptr, *FPElemTy = nullptr;
+    switch (args[2]) {
+    case 32:    /* 4 x i32 (unsigned) -> 4 x float */
+        Lanes = 4;
+        IntElemTy = Int32Ty;
+        FPElemTy = FloatTy;
+        break;
+    case 64:    /* 2 x i64 (unsigned) -> 2 x double */
+        Lanes = 2;
+        IntElemTy = Int64Ty;
+        FPElemTy = DoubleTy;
+        break;
+    default:
+        IRError("%s: invalid element size.\n", __func__);
+    }
+
+    VectorType *IntVecTy = VectorType::get(IntElemTy, Lanes);
+    VectorType *FPVecTy = VectorType::get(FPElemTy, Lanes);
+
+    Value *SrcPtr = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(SrcOff), "", LastInst);
+    SrcPtr = new BitCastInst(SrcPtr, PointerType::getUnqual(IntVecTy), "", LastInst);
+    Value *DstPtr = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(DstOff), "", LastInst);
+    DstPtr = new BitCastInst(DstPtr, PointerType::getUnqual(FPVecTy), "", LastInst);
+
+    Value *IntVec = new LoadInst(SrcPtr, "", false, LastInst);
+    Value *FPVec = new UIToFPInst(IntVec, FPVecTy, "", LastInst);
+    new StoreInst(FPVec, DstPtr, false, LastInst);
+}
+
+void IRFactory::op_vfptosi_128(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vfptosi_128);
+
+    /* args: [0] result offset, [1] operand offset, [2] element width in bits. */
+    const TCGArg DstOff = args[0];
+    const TCGArg SrcOff = args[1];
+    unsigned Lanes = 0;
+    Type *FPElemTy = nullptr, *IntElemTy = nullptr;
+    switch (args[2]) {
+    case 32:    /* 4 x float -> 4 x i32 (signed) */
+        Lanes = 4;
+        FPElemTy = FloatTy;
+        IntElemTy = Int32Ty;
+        break;
+    case 64:    /* 2 x double -> 2 x i64 (signed) */
+        Lanes = 2;
+        FPElemTy = DoubleTy;
+        IntElemTy = Int64Ty;
+        break;
+    default:
+        IRError("%s: invalid element size.\n", __func__);
+    }
+
+    VectorType *FPVecTy = VectorType::get(FPElemTy, Lanes);
+    VectorType *IntVecTy = VectorType::get(IntElemTy, Lanes);
+
+    Value *SrcPtr = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(SrcOff), "", LastInst);
+    SrcPtr = new BitCastInst(SrcPtr, PointerType::getUnqual(FPVecTy), "", LastInst);
+    Value *DstPtr = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(DstOff), "", LastInst);
+    DstPtr = new BitCastInst(DstPtr, PointerType::getUnqual(IntVecTy), "", LastInst);
+
+    Value *FPVec = new LoadInst(SrcPtr, "", false, LastInst);
+    Value *IntVec = new FPToSIInst(FPVec, IntVecTy, "", LastInst);
+    new StoreInst(IntVec, DstPtr, false, LastInst);
+}
+
+void IRFactory::op_vfptoui_128(const TCGArg *args)
+{
+    IRDebug(INDEX_op_vfptoui_128);
+
+    /* args: [0] result offset, [1] operand offset, [2] element width in bits. */
+    const TCGArg DstOff = args[0];
+    const TCGArg SrcOff = args[1];
+    unsigned Lanes = 0;
+    Type *FPElemTy = nullptr, *IntElemTy = nullptr;
+    switch (args[2]) {
+    case 32:    /* 4 x float -> 4 x i32 (unsigned) */
+        Lanes = 4;
+        FPElemTy = FloatTy;
+        IntElemTy = Int32Ty;
+        break;
+    case 64:    /* 2 x double -> 2 x i64 (unsigned) */
+        Lanes = 2;
+        FPElemTy = DoubleTy;
+        IntElemTy = Int64Ty;
+        break;
+    default:
+        IRError("%s: invalid element size.\n", __func__);
+    }
+
+    VectorType *FPVecTy = VectorType::get(FPElemTy, Lanes);
+    VectorType *IntVecTy = VectorType::get(IntElemTy, Lanes);
+
+    Value *SrcPtr = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(SrcOff), "", LastInst);
+    SrcPtr = new BitCastInst(SrcPtr, PointerType::getUnqual(FPVecTy), "", LastInst);
+    Value *DstPtr = GetElementPtrInst::CreateInBounds(CPU, CONSTPtr(DstOff), "", LastInst);
+    DstPtr = new BitCastInst(DstPtr, PointerType::getUnqual(IntVecTy), "", LastInst);
+
+    Value *FPVec = new LoadInst(SrcPtr, "", false, LastInst);
+    Value *IntVec = new FPToUIInst(FPVec, IntVecTy, "", LastInst);
+    new StoreInst(IntVec, DstPtr, false, LastInst);
+}
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/llvm-opc.cpp b/llvm/llvm-opc.cpp
new file mode 100644
index 0000000..cc8436c
--- /dev/null
+++ b/llvm/llvm-opc.cpp
@@ -0,0 +1,4431 @@
+/*
+ *  (C) 2010 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *   This file provides the LLVM IR generator for basic blocks and traces.
+ */
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm-debug.h"
+#include "llvm-pass.h"
+#include "llvm-translator.h"
+#include "llvm-target.h"
+#include "llvm-state.h"
+#include "llvm-opc.h"
+
+
+#define INLINE_THRESHOLD    100     /* max # inlined instructions */
+#define INLINE_INSTCOUNT    20      /* max instruction count for inlining a small function */
+
+/* Passes that run by default; setting the flag (default: false) turns them off. */
+static cl::opt<bool> DisableStateMapping("disable-sm", cl::init(false),
+    cl::cat(CategoryHQEMU), cl::desc("Disable state mapping"));
+
+/* Passes that are off by default; setting the flag (default: false) turns them on. */
+static cl::opt<bool> EnableSimplifyPointer("enable-simptr", cl::init(false),
+    cl::cat(CategoryHQEMU), cl::desc("Enable SimplifyPointer"));
+
+
+TCGOpDef llvm_op_defs[] = {    /* one entry per DEF() in tcg-opc.h; 5th field is the total arg count */
+#define DEF(s, oargs, iargs, cargs, flags) \
+    { #s , oargs, iargs, cargs, iargs + oargs + cargs, flags },
+#include "tcg-opc.h"
+#undef DEF
+};
+
+static IRFactory::FuncPtr OpcFunc[] = {    /* &IRFactory::op_<name> per DEF(), in tcg-opc.h order */
+#define DEF(name, oargs, iargs, cargs, flags) &IRFactory::op_ ## name,
+#include "tcg-opc.h"
+#undef DEF
+};
+
+extern LLVMEnv *LLEnv;
+extern hqemu::Mutex llvm_global_lock;
+extern hqemu::Mutex llvm_debug_lock;
+
+/*
+ * IRFactory(): bind to a translator and describe the TCG global registers.
+ */
+IRFactory::IRFactory(LLVMTranslator *Trans)
+    : InitOnce(false), Translator(*Trans), EE(nullptr),
+      HostDisAsm(Translator.getHostDisAsm()), Helpers(Translator.getHelpers()),
+      BaseReg(Translator.getBaseReg()), GuestBaseReg(Translator.getGuestBaseReg()),
+      NI(Translator.getNotifyInfo())
+{
+    /* One tracking slot per possible TCG temp. */
+    Reg.resize(TCG_MAX_TEMPS);
+
+    TCGContext *Ctx = &tcg_ctx_global;
+    const int NumGlobals = Ctx->nb_globals;
+    for (int Idx = 0; Idx < NumGlobals; ++Idx) {
+        TCGTemp *Tmp = &Ctx->temps[Idx];
+        if (Tmp->type != TCG_TYPE_I32 && Tmp->type != TCG_TYPE_I64)
+            hqemu_error("unsupported register type.\n");
+        /* Fixed globals: (host reg, -1); others: (memory base reg, offset). */
+        int Base = Tmp->fixed_reg ? Tmp->reg : Tmp->mem_reg;
+        intptr_t Off = Tmp->fixed_reg ? -1 : Tmp->mem_offset;
+        Reg[Idx].set(Base, Off, Tmp->name);
+    }
+
+    for (int Lead = 0; Lead < NumGlobals; ++Lead) {
+        TCGTemp *First = &Ctx->temps[Lead];
+        if (First->fixed_reg)   /* fixed registers never take part in aliasing */
+            continue;
+        for (int Next = Lead + 1; Next < NumGlobals; ++Next) {
+            TCGTemp *Other = &Ctx->temps[Next];
+            if (Other->fixed_reg || Reg[Next].Alias)
+                continue;
+            if (First->mem_offset == Other->mem_offset && First->type == Other->type)
+                Reg[Next].Alias = &Reg[Lead];   /* same slot and type: alias the earlier one */
+        }
+    }
+
+    Segment = 0;
+#if defined(__x86_64__) && defined(__linux__)
+    if (GUEST_BASE)
+        Segment = 256;  /* GS: 256 */
+#endif
+
+    dbg() << DEBUG_LLVM << "LLVM IR Factory initialized.\n";
+}
+
+IRFactory::~IRFactory()
+{
+    if (!EE)    /* EE is null: there is no engine left to tear down */
+        return;
+    EE->UnregisterJITEventListener(Listener);
+    EE->removeModule(Mod);
+    delete Listener;
+    delete EE;
+}
+
+void IRFactory::CreateSession(TraceBuilder *builder)
+{
+    Builder = builder;
+    /* CreateJIT() sets Context, Mod and DL, which InitializeTypes() reads. */
+    CreateJIT();
+    InitializeTypes();
+    /* Metadata factory for the module, and tb_ret_addr as a pointer-sized constant. */
+    MF = new MDFactory(Mod);
+    ExitAddr = CONSTPtr((uintptr_t)tb_ret_addr);
+    /* Optimize() only runs its passes while this flag is set. */
+    runPasses = true;
+
+    /* Start the session with empty bookkeeping containers. */
+    StatePtr.clear();
+    InlineCalls.clear();
+    IndirectBrs.clear();
+    CommonBB.clear();
+    toErase.clear();
+    toSink.clear();
+    ClonedFuncs.clear();
+    NI.reset();
+}
+
+void IRFactory::DeleteSession()
+{
+    if (Func) {
+        Func->eraseFromParent();    /* unlinks the function from its module and deletes it */
+        Func = nullptr;
+    }
+    delete MF;
+    MF = nullptr;                   /* like Func: a repeated call must not delete twice */
+    DeleteJIT();                    /* last: the module is only released here */
+}
+
+static void setHostAttrs(std::string &MCPU, std::vector<std::string> &MAttrs,
+                         TargetOptions &Options)
+{
+    /* Target the host CPU, listing each detected feature as "+name" or "-name". */
+    MCPU = sys::getHostCPUName();
+    StringMap<bool> Detected;
+    sys::getHostCPUFeatures(Detected);
+    for (auto &Feature : Detected)
+        MAttrs.push_back((Feature.getValue() ? "+" : "-") + Feature.getKey().str());
+
+    if (MCPU == "core-avx2" || MCPU == "haswell" || MCPU == "knl")
+        Options.AllowFPOpFusion = FPOpFusion::Fast;     /* only for these CPU names */
+}
+
+#if defined(ENABLE_MCJIT)
+#if defined(LLVM_V35)
+void IRFactory::CreateJIT()
+{
+    /* MCJIT flavor for LLVM_V35 builds: new module and engine on every call. */
+    Module *InitMod = Translator.getModule();
+    Context = &InitMod->getContext();
+    Mod = new Module(InitMod->getModuleIdentifier(), *Context);
+    Mod->setDataLayout(InitMod->getDataLayout());
+    Mod->setTargetTriple(InitMod->getTargetTriple());
+    DL = getDataLayout(Mod);
+
+    /* Configure the engine for the host CPU; build errors land in ErrorMsg. */
+    std::string ErrorMsg, MCPU;
+    std::vector<std::string> MAttrs;
+    TargetOptions Options;
+
+    setHostAttrs(MCPU, MAttrs, Options);
+
+    EngineBuilder builder(Mod);
+    builder.setMCPU(MCPU);
+    builder.setMAttrs(MAttrs);
+    builder.setErrorStr(&ErrorMsg);
+    builder.setEngineKind(EngineKind::JIT);
+    builder.setOptLevel(CodeGenOpt::Default);
+    builder.setUseMCJIT(true);
+    builder.setMCJITMemoryManager(LLEnv->getMemoryManager().get());
+    builder.setTargetOptions(Options);
+
+    EE = builder.create();
+
+    if (!EE)
+        hqemu_error("%s\n", ErrorMsg.c_str());
+
+    /* Register an event listener constructed over NI with the engine. */
+    Listener = new EventListener(NI);
+
+    EE->RegisterJITEventListener(Listener);
+
+    /* Ask LLVM to reserve the TCG_AREG0 base register (setHQEMUReservedRegs is
+     * presumably an HQEMU-specific LLVM extension; const is cast away to call it). */
+    auto TM = EE->getTargetMachine();
+    auto TRI = const_cast<TargetRegisterInfo*>(TM->getRegisterInfo());
+    TRI->setHQEMUReservedRegs(BaseReg[TCG_AREG0].Name);
+    dbg() << DEBUG_LLVM << "LLVM MCJIT initialized.\n";
+}
+#else
+void IRFactory::CreateJIT()
+{
+    /* MCJIT flavor for newer LLVM: new module and engine on every call. */
+    Module *InitMod = Translator.getModule();
+    Context = &InitMod->getContext();
+    std::unique_ptr<Module> Owner(
+                new Module(InitMod->getModuleIdentifier(), *Context));
+    Mod = Owner.get();      /* raw alias kept; ownership moves into the builder below */
+    Mod->setDataLayout(InitMod->getDataLayout());
+    Mod->setTargetTriple(InitMod->getTargetTriple());
+    DL = getDataLayout(Mod);
+
+    /* Configure the engine for the host CPU; build errors land in ErrorMsg. */
+    std::string ErrorMsg, MCPU;
+    std::vector<std::string> MAttrs;
+    TargetOptions Options;
+
+    setHostAttrs(MCPU, MAttrs, Options);
+
+    EngineBuilder builder(std::move(Owner));
+    builder.setMCPU(MCPU);
+    builder.setMAttrs(MAttrs);
+    builder.setErrorStr(&ErrorMsg);
+    builder.setEngineKind(EngineKind::JIT);
+    builder.setOptLevel(CodeGenOpt::Default);
+    builder.setMCJITMemoryManager(LLEnv->getMemoryManager());
+    builder.setTargetOptions(Options);
+
+    EE = builder.create();
+
+    if (!EE)
+        hqemu_error("%s\n", ErrorMsg.c_str());
+
+    /* Register an event listener constructed over NI with the engine. */
+    Listener = new EventListener(NI);
+    EE->RegisterJITEventListener(Listener);
+
+#if LLVM_USE_INTEL_JITEVENTS
+    IntelJIT = JITEventListener::createIntelJITEventListener();
+    EE->RegisterJITEventListener(IntelJIT);
+#endif
+
+    /* Hand tb_ret_addr to the MC instruction info as the HQEMU exit address. */
+    auto TM = EE->getTargetMachine();
+    auto MII = const_cast<MCInstrInfo *>(TM->getMCInstrInfo());
+    MII->setHQEMUExitAddr((unsigned long)tb_ret_addr);
+
+    dbg() << DEBUG_LLVM << "LLVM MCJIT initialized.\n";
+}
+#endif
+
+void IRFactory::DeleteJIT()
+{
+    EE->UnregisterJITEventListener(Listener);   /* NOTE(review): EE is used unchecked; assumes CreateJIT() succeeded */
+#if LLVM_USE_INTEL_JITEVENTS
+    EE->UnregisterJITEventListener(IntelJIT);
+    delete IntelJIT;
+#endif
+    EE->removeModule(Mod);      /* detach the module before the engine is destroyed */
+    delete Listener;
+    delete EE;
+    delete Mod;                 /* deleted separately, after the engine */
+    EE = nullptr;               /* ~IRFactory() skips its teardown when EE is null */
+}
+
+Function *IRFactory::ResolveFunction(std::string Name)
+{
+    if (Function *Known = Mod->getFunction(Name))   /* already in this module */
+        return Known;
+
+    Function *Proto = Translator.getModule()->getFunction(Name);
+    if (!Proto)
+        IRError("%s: unknown function %s.\n", __func__, Name.c_str());
+
+    /* Declare a twin with the same type, linkage, name and attributes. */
+    auto *ProtoTy = cast<FunctionType>(Proto->getType()->getElementType());
+    Function *Twin = Function::Create(ProtoTy, Proto->getLinkage(),
+                                      Proto->getName(), Mod);
+    Twin->copyAttributesFrom(Proto);
+
+    ValueToValueMapTy VMap;
+    VMap[Proto] = Twin;
+    const bool IsHelper = Helpers.find(Name) != Helpers.end();
+    if (IsHelper && !Proto->isDeclaration()) {      /* known helper with a body: clone it */
+        Function::arg_iterator To = Twin->arg_begin();
+        for (auto From = Proto->arg_begin(); From != Proto->arg_end(); ++From) {
+            To->setName(From->getName());
+            VMap[&*From] = &*To++;
+        }
+        SmallVector<ReturnInst*, 8> Returns;
+        CloneFunctionInto(Twin, Proto, VMap, /*ModuleLevelChanges=*/true, Returns);
+    }
+    ClonedFuncs.insert(Twin);
+    return Twin;
+}
+
+#else
+void IRFactory::CreateJIT()
+{
+    if (InitOnce)
+        return;
+    /* Legacy JIT: built once on the translator's own context and module. */
+    Context = Translator.getContext();
+    Mod = Translator.getModule();
+    DL = getDataLayout(Mod);
+
+    /* Configure the engine for the host CPU; build errors land in ErrorMsg. */
+    std::string ErrorMsg, MCPU;
+    std::vector<std::string> MAttrs;
+    TargetOptions Options;
+
+    setHostAttrs(MCPU, MAttrs, Options);
+
+    EngineBuilder builder(Mod);
+    builder.setMCPU(MCPU);
+    builder.setMAttrs(MAttrs);
+    builder.setAllocateGVsWithCode(false);
+    builder.setJITMemoryManager(LLEnv->getMemoryManager().get());
+    builder.setErrorStr(&ErrorMsg);
+    builder.setEngineKind(EngineKind::JIT);
+    builder.setOptLevel(CodeGenOpt::Default);
+    builder.setTargetOptions(Options);
+
+    EE = builder.create();
+
+    if (!EE)
+        hqemu_error("%s\n", ErrorMsg.c_str());
+
+    /* Register an event listener constructed over NI with the engine. */
+    Listener = new EventListener(NI);
+
+    EE->RegisterJITEventListener(Listener);
+    EE->DisableLazyCompilation(false);
+
+    /* Ask LLVM to reserve the TCG_AREG0 base register (presumably an HQEMU LLVM extension). */
+    auto TM = EE->getTargetMachine();
+    auto TRI = const_cast<TargetRegisterInfo*>(TM->getRegisterInfo());
+    TRI->setHQEMUReservedRegs(BaseReg[TCG_AREG0].Name);
+
+    /* Give the engine the address of each translator symbol the module names. */
+    SymbolMap &Symbols = Translator.getSymbols();
+    for (auto I = Symbols.begin(), E = Symbols.end(); I != E; ++I) {
+        std::string Name = I->first;
+        if (!Mod->getNamedValue(Name))
+            continue;
+        EE->updateGlobalMapping(Mod->getNamedValue(Name), (void*)I->second);
+    }
+
+    dbg() << DEBUG_LLVM << "LLVM JIT initialized.\n";
+
+    InitOnce = true;    /* later calls return immediately at the top */
+}
+
+void IRFactory::DeleteJIT()
+{
+    /* Nothing to release: the legacy JIT is created once (InitOnce) and kept. */
+}
+
+Function *IRFactory::ResolveFunction(std::string Name)
+{
+    Function *Found = Mod->getFunction(Name);   /* the legacy JIT looks in the shared module only */
+    if (Found == nullptr)
+        IRError("%s: unknown function %s.\n", __func__, Name.c_str());
+    return Found;
+}
+#endif
+
+/* Cache the scalar and pointer types used throughout IR conversion. */
+void IRFactory::InitializeTypes()
+{
+    VoidTy   = Type::getVoidTy(*Context);
+    Int8Ty   = Type::getIntNTy(*Context, 8);
+    Int16Ty  = Type::getIntNTy(*Context, 16);
+    Int32Ty  = Type::getIntNTy(*Context, 32);
+    Int64Ty  = Type::getIntNTy(*Context, 64);
+    Int128Ty = Type::getIntNTy(*Context, 128);
+
+    IntPtrTy    = DL->getIntPtrType(*Context);  /* pointer-sized integer, per the data layout */
+    Int8PtrTy   = Type::getInt8PtrTy(*Context);
+    Int16PtrTy  = Type::getInt16PtrTy(*Context);
+    Int32PtrTy  = Type::getInt32PtrTy(*Context);
+    Int64PtrTy  = Type::getInt64PtrTy(*Context);
+
+    FloatTy  = Type::getFloatTy(*Context);
+    DoubleTy = Type::getDoubleTy(*Context);
+}
+
+/* Return the OpcFunc table of IR conversion routines as an untyped pointer. */
+void *IRFactory::getOpcFunc()
+{
+    return OpcFunc;
+}
+
+
+/* Return the CPU pointer, but only if it sits in the entry block of function
+ * F; return null when it is unset, detached, or located in another block. */
+Instruction *IRFactory::getDefaultCPU(Function &F)
+{
+    /* Short-circuiting keeps F.getEntryBlock() from being queried when CPU
+     * is null or has no parent block. */
+    const bool InEntry = CPU && CPU->getParent() &&
+                         CPU->getParent() == &F.getEntryBlock();
+    return InEntry ? CPU : nullptr;
+}
+
+static inline std::string getGuestSymbol(target_ulong pc)
+{
+#if defined(CONFIG_USER_ONLY)
+    hqemu::MutexGuard locked(llvm_global_lock);
+
+    const std::string Found = lookup_symbol(pc);
+    if (Found.empty())              /* no symbol name for this pc */
+        return Found;
+    return "<" + Found + ">:";
+#else
+    return "";
+#endif
+}
+
+/* Create the LLVM function for the trace, its init/exit/entry blocks, and the CPU (and guest base) values. */
+void IRFactory::CreateFunction()
+{
+    target_ulong pc = Builder->getEntryNode()->getGuestPC();
+    std::string Name = getGuestSymbol(pc) +
+                       Builder->getPCString(Builder->getEntryNode());
+
+    dbg() << DEBUG_LLVM << "Requested trace info: pc "
+          << format("0x%" PRIx, pc) << " length " << Builder->getNumNodes()
+          << "\n";
+
+    FunctionType *FuncTy = FunctionType::get(IntPtrTy, false);
+    Func = Function::Create(FuncTy, GlobalVariable::ExternalLinkage, Name, Mod);
+    Func->setCallingConv(CallingConv::C);
+    Func->addFnAttr(Attribute::NoUnwind);
+    Func->addFnAttr(Attribute::Naked);
+    Func->addFnAttr("hqemu");
+
+    /* Prepare all basic blocks: "init" branches to "entry"; "exit" is unreachable. */
+    InitBB = BasicBlock::Create(*Context, "init", Func);
+    ExitBB = BasicBlock::Create(*Context, "exit", Func);
+    CurrBB = BasicBlock::Create(*Context, "entry", Func);
+    LastInst = BranchInst::Create(CurrBB, InitBB);
+    new UnreachableInst(*Context, ExitBB);
+
+    /* Setup base register for CPUArchState pointer, and register for
+     * guest_base. */
+    for (int i = 0; i < TCG_TARGET_NB_REGS; i++)
+        BaseReg[i].Base = nullptr;
+
+    BaseRegister &CPUReg = BaseReg[TCG_AREG0];
+    /* Built as std::string so a long register name cannot overflow a fixed buffer. */
+    std::string Constraint = std::string("={") + CPUReg.Name.c_str() + "}";
+    auto IA = InlineAsm::get(FunctionType::get(Int8PtrTy, false), "",
+                             Constraint, true);
+    CPUReg.Base = CallInst::Create(IA, "cpu", LastInst);
+
+    /* Guest base: a constant by default, or read from its own register when one is named. */
+    GuestBaseReg.Base = CONSTPtr(GUEST_BASE);
+    if (GuestBaseReg.Name != "") {
+        Constraint = std::string("={") + GuestBaseReg.Name.c_str() + "}";
+        IA = InlineAsm::get(FunctionType::get(Int8PtrTy, false), "",
+                            Constraint, true);
+        GuestBaseReg.Base = new PtrToIntInst(
+                CallInst::Create(IA, "", LastInst),
+                IntPtrTy, "guest_base", LastInst);
+    }
+
+    CPU = CPUReg.Base;
+    CPUStruct = new BitCastInst(CPU, CPUReg.Ty, "cpu.struct", LastInst);
+    GEPInsertPos = CPUStruct;
+}
+
+/* Open the LLVM basic block that will receive the current guest block. */
+void IRFactory::CreateBlock()
+{
+    GraphNode *Node = Builder->getCurrNode();
+    const bool IsEntry = (Node == Builder->getEntryNode());
+    const std::string BlockName = Builder->getPCString(Node);
+
+    dbg() << DEBUG_LLVM << "  - Process block pc "
+          << format("0x%" PRIx, Node->getGuestPC()) << "\n";
+
+    /* The entry node keeps the current CurrBB; every other node gets a new block. */
+    if (!IsEntry)
+        CurrBB = BasicBlock::Create(*Context, BlockName, Func);
+    LastInst = BranchInst::Create(ExitBB, CurrBB);
+    Builder->setBasicBlock(Node, CurrBB);
+
+    /* Reject any temp that is neither 32-bit nor 64-bit. */
+    const int GlobalCount = tcg_ctx.nb_globals;
+    const int TempCount = tcg_ctx.nb_temps;
+    for (int Idx = 0; Idx < TempCount; ++Idx) {
+        TCGTemp *Tmp = &tcg_ctx.temps[Idx];
+        if (Tmp->type != TCG_TYPE_I32 && Tmp->type != TCG_TYPE_I64)
+            hqemu_error("unsupported register type.\n");
+    }
+
+    /* Globals start in STATE_MEM; fixed ones additionally carry STATE_REV. */
+    for (int Idx = 0; Idx < GlobalCount; ++Idx) {
+        TCGTemp *Tmp = &tcg_ctx.temps[Idx];
+        const bool Is32 = (Tmp->type == TCG_TYPE_I32);
+        int State = Register::STATE_MEM;
+        if (Tmp->fixed_reg)
+            State |= Register::STATE_REV;
+        Reg[Idx].reset(State, Is32 ? 32 : 64, Is32 ? Int32Ty : Int64Ty);
+    }
+
+    /* Remaining temps are either local (STATE_LOC) or plain (STATE_TMP). */
+    for (int Idx = GlobalCount; Idx < TempCount; ++Idx) {
+        TCGTemp *Tmp = &tcg_ctx.temps[Idx];
+        const bool Is32 = (Tmp->type == TCG_TYPE_I32);
+        int State = Register::STATE_TMP;
+        if (Tmp->temp_local)
+            State = Register::STATE_LOC;
+        Reg[Idx].reset(State, Is32 ? 32 : 64, Is32 ? Int32Ty : Int64Ty);
+    }
+
+    Labels.clear();
+
+#ifdef VERIFY_TB
+    Function *F = ResolveFunction("helper_verify_tb");
+    SmallVector<Value *, 4> Params;
+    Params.push_back(CPUStruct);
+    Params.push_back(CONST32(Node->getTB()->id));
+    CallInst *CI = CallInst::Create(F, Params, "", LastInst);
+    MF->setConst(CI);
+#endif
+}
+
+
+/* Retarget branch BI by making BB its first (index 0) successor. */
+void IRFactory::setSuccessor(BranchInst *BI, BasicBlock *BB)
+{
+    BI->setSuccessor(0, BB);
+}
+
+/* Estimate how profitable it is to inline the helper called at CS. */
+int IRFactory::AnalyzeInlineCost(CallSite CS)
+{
+    HelperInfo *Info = Helpers[CS.getCalledFunction()->getName()];
+
+    /* Helpers of at most INLINE_INSTCOUNT instructions get the fixed score 1. */
+    if (Info->Metrics.NumInsts <= INLINE_INSTCOUNT)
+        return 1;
+    /* Start from the remaining instruction budget, scaled by InstrCost. */
+    int Score = INLINE_THRESHOLD - Info->Metrics.NumInsts;
+    Score *= InlineConstants::InstrCost;
+    /* Charge one InstrCost per argument, then add its alloca/constant weight. */
+    unsigned ArgIdx = 0;
+    for (auto Arg = CS.arg_begin(), End = CS.arg_end(); Arg != End; ++Arg) {
+        Score -= InlineConstants::InstrCost;
+        if (isa<AllocaInst>(Arg))
+            Score += Info->ArgumentWeights[ArgIdx].AllocaWeight;
+        else if (isa<Constant>(Arg))
+            Score += Info->ArgumentWeights[ArgIdx].ConstantWeight;
+        ++ArgIdx;
+    }
+    return Score;
+}
+
+
+/* Inline every queued helper call, taking calls from the back of InlineCalls. */
+void IRFactory::ProcessInline()
+{
+    while (!InlineCalls.empty()) {
+        CallInst *CI = static_cast<CallInst *>(InlineCalls.back());
+        InlineCalls.pop_back();     /* removed from the queue before InlineFunc() runs */
+        InlineFunc(CI);
+    }
+}
+
+void IRFactory::VerifyFunction(Function &F)
+{
+    if (DM.getDebugMode() & DEBUG_VERIFY)       /* only when DEBUG_VERIFY is enabled */
+        verifyFunction(F, &DM.debug());         /* findings go to the debug stream */
+}
+
+/* Bring the function into a legal form and inline helper calls. The function
+ * must be well formed before any further optimization (i.e. inlining calls);
+ * otherwise, the optimization may fail or the result may be wrong. */
+void IRFactory::PreProcess()
+{
+    dbg() << DEBUG_LLVM << __func__ << " entered.\n";
+
+    ProcessErase(toErase);
+
+    /* Insert terminator instruction to basic blocks that branch to ExitBB.
+     * This could happen when the last TCG opc is a call instruction. */
+    for (auto PI = pred_begin(ExitBB), PE = pred_end(ExitBB); PI != PE; PI++) {
+        Instruction *InsertPos = (*PI)->getTerminator();
+        new UnreachableInst(*Context, InsertPos);
+        toErase.push_back(InsertPos);
+    }
+    ProcessErase(toErase);
+    ExitBB->eraseFromParent();
+
+    /* Remove instructions after indirect branches. */
+    std::set<Instruction *> AfterIB;
+    for (unsigned i = 0, e = IndirectBrs.size(); i != e; ++i) {
+        BasicBlock *BB = IndirectBrs[i]->getParent();
+        for (auto I = ++BasicBlock::iterator(IndirectBrs[i]), E = BB->end();
+             I != E; ++I)
+            AfterIB.insert(&*I);
+    }
+    for (auto I = AfterIB.begin(), E = AfterIB.end(); I != E; ++I)
+        toErase.push_back(*I);
+    ProcessErase(toErase);
+
+    /* Sink blocks to the end. */
+    Function::iterator InsertPos = Func->end();
+    Function::BasicBlockListType &Blocks = Func->getBasicBlockList();
+    for (unsigned i = 0, e = toSink.size(); i != e; ++i) {
+        /* end() must never be dereferenced; splice() already leaves a block
+         * that is last in the list where it is. */
+        Blocks.splice(InsertPos, Blocks, toSink[i]);
+    }
+
+    VerifyFunction(*Func);
+
+    /* Inline helper functions. */
+    ProcessInline();
+
+    SmallVector<std::pair<const BasicBlock*,const BasicBlock*>, 32> BackEdges;
+    FindFunctionBackedges(*Func, BackEdges);
+
+    TraceInfo *Trace = Builder->getTrace();
+    Trace->NumLoop = BackEdges.size();
+    dbg() << DEBUG_LLVM << __func__ << ": trace formation with pc "
+          << format("0x%" PRIx, Trace->getEntryPC())
+          << " length " << Trace->getNumBlock()
+          << " is_loop " << (Trace->NumLoop ? true : false) << "\n";
+
+#if 1 || defined(CONFIG_SOFTMMU)
+    if (Trace->NumLoop) {
+        intptr_t Offset = offsetof(CPUState, tcg_exit_req) - ENV_OFFSET;
+        Value *ExitRequestPtr = GetElementPtrInst::CreateInBounds(CPU,
+                                        CONSTPtr(Offset),
+                                        "", InitBB->getTerminator());
+        ExitRequestPtr = new BitCastInst(ExitRequestPtr, Int32PtrTy,
+                                        "tcg_exit_req",
+                                        InitBB->getTerminator());
+
+        /* Create the exit stub. */
+        for (unsigned i = 0, e = BackEdges.size(); i != e; ++i) {
+            BasicBlock *TCGExitBB = BasicBlock::Create(*Context, "exit", Func);
+            LastInst = BranchInst::Create(TCGExitBB, TCGExitBB);
+            StoreInst *SI = new StoreInst(CONST32(0), ExitRequestPtr, true, LastInst);
+            InsertExit(0);
+            LastInst->eraseFromParent();
+
+            MF->setExit(SI);
+
+            auto BackEdgeBB = const_cast<BasicBlock*>(BackEdges[i].first);
+            auto LoopHeader = const_cast<BasicBlock*>(BackEdges[i].second);
+            auto BI = const_cast<TerminatorInst *>(BackEdgeBB->getTerminator());
+
+            toErase.push_back(BI);
+
+            Value *ExitRequest = new LoadInst(ExitRequestPtr, "", true, BI);
+            Value *Cond = new ICmpInst(BI, ICmpInst::ICMP_EQ, ExitRequest,
+                                       CONST32(0), "");
+            BI = BranchInst::Create(LoopHeader, TCGExitBB, Cond, BI);
+            BI->getParent()->setName("loopback");
+            MF->setLoop(BI);
+        }
+    }
+#else
+    if (Trace->NumLoop) {
+        for (unsigned i = 0, e = BackEdges.size(); i != e; ++i) {
+            auto BackEdgeBB = const_cast<BasicBlock*>(BackEdges[i].first);
+            auto BI = const_cast<TerminatorInst *>(BackEdgeBB->getTerminator());
+            BI->getParent()->setName("loopback");
+            MF->setLoop(BI);
+
+            for (auto BI = BackEdgeBB->begin(), BE = BackEdgeBB->end(); BI != BE; ++BI) {
+                if (auto SI = dyn_cast<StoreInst>(BI)) {
+                    intptr_t Off = 0;
+                    Value *Base = getBaseWithConstantOffset(DL, getPointerOperand(SI), Off);
+                    if (Base == CPU && isStateOfPC(Off))
+                        toErase.push_back(SI);
+                }
+            }
+        }
+    }
+#endif
+
+    ProcessErase(toErase);
+
+    if (DM.getDebugMode() & DEBUG_IR) {
+        hqemu::MutexGuard locked(llvm_debug_lock);
+        Func->print(DM.debug());
+    }
+}
+
+void IRFactory::InitializeLLVMPasses(legacy::FunctionPassManager *FPM)
+{
+    auto TM = EE->getTargetMachine();   /* target info comes from the live engine */
+#if defined(LLVM_V35)
+    TM->addAnalysisPasses(*FPM);
+    FPM->add(new DataLayoutPass(Mod));
+    FPM->add(createBasicTargetTransformInfoPass(TM));
+#else
+    PassRegistry &PassReg = *PassRegistry::getPassRegistry();
+    initializeTargetTransformInfoWrapperPassPass(PassReg);
+    /* Add the target's transform-info analysis to the pass manager. */
+    FPM->add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
+#endif
+}
+
+void IRFactory::Optimize()
+{
+#define addPass(PM, P) do { PM->add(P); } while(0)
+#define addPassOptional(PM, P, Disable) \
+    do { \
+        if (!Disable) PM->add(P); \
+    } while(0)
+
+#if defined(ENABLE_PASSES)
+    if (runPasses) {
+        legacy::FunctionPassManager *FPM = new legacy::FunctionPassManager(Mod);
+        /* Target analyses first, then the pipeline in the order listed below. */
+        InitializeLLVMPasses(FPM);
+
+        addPass(FPM, createProfileExec(this));
+        addPass(FPM, createCombineGuestMemory(this));
+        addPass(FPM, createCombineZExtTrunc());
+        addPassOptional(FPM, createStateMappingPass(this), DisableStateMapping);
+        addPass(FPM, createPromoteMemoryToRegisterPass());
+        addPass(FPM, createCombineCasts(this));
+        addPassOptional(FPM, createSimplifyPointer(this), !EnableSimplifyPointer);
+        addPass(FPM, createAggressiveDCEPass());
+        addPass(FPM, createCFGSimplificationPass());
+        addPass(FPM, createInstructionCombiningPass());
+        addPass(FPM, createRedundantStateElimination(this));
+        addPass(FPM, createCombineCasts(this));
+        /* Run the whole pipeline over the trace function only. */
+        FPM->run(*Func);
+
+        delete FPM;
+    }
+#endif
+
+#undef addPass
+#undef addPassOptional
+}
+
+
+/* Fix up the module after the optimization passes and before code generation. */
+void IRFactory::PostProcess()
+{
+    dbg() << DEBUG_LLVM << __func__ << " entered.\n";
+
+#if defined(ENABLE_MCJIT)
+    /* Detach cloned functions that have a body; declarations stay in place. */
+    for (Function *Cloned : ClonedFuncs) {
+        if (!Cloned->isDeclaration())
+            Cloned->removeFromParent();
+    }
+    /* Bind addresses to external symbols that the module actually names. */
+    SymbolMap &Symbols = Translator.getSymbols();
+    for (auto &Sym : Symbols) {
+        GlobalValue *GV = Mod->getNamedValue(Sym.first);
+        if (!GV)
+            continue;
+        EE->updateGlobalMapping(GV, (void*)Sym.second);
+    }
+#endif
+
+    if (DM.getDebugMode() & DEBUG_IR_OPT) {
+        hqemu::MutexGuard locked(llvm_debug_lock);
+        Func->print(DM.debug());
+    }
+}
+
+/* Apply the recorded patch points to the emitted host code, flush it, and optionally dump it. */
+void IRFactory::FinalizeObject()
+{
+    dbg() << DEBUG_LLVM << __func__ << " entered.\n";
+
+    uintptr_t Code = (uintptr_t)NI.Code;
+    uint32_t Size = NI.Size;
+
+#if defined(ENABLE_MCJIT)
+    for (unsigned i = 0, e = NI.Patches.size(); i != e; ++i) {
+        NotifyInfo::PatchInfo &Patch = NI.Patches[i];
+        uintptr_t Addr = Patch.Addr;
+        code_ostream OS(Addr);
+
+        /* If the address to patch is outside this code region (before its
+         * start or at/after its end), skip this invalid patch point. This
+         * should not happen, but LLVM v35 seems to report such addresses. */
+        if (Addr < Code || Addr >= Code + Size)
+            continue;
+        if (Patch.Type == PATCH_EXIT_TB) {
+#if defined(LLVM_V35) && defined(TCG_TARGET_I386)
+            EmitByte(OS, 0xE9);
+            EmitConstant(OS, (uintptr_t)tb_ret_addr - Addr - 5, 4);
+#endif
+        } else if (Patch.Type == PATCH_TRACE_BLOCK_CHAINING) {
+#if defined(TCG_TARGET_I386)
+            unsigned NumSkip = 3 - Addr % 4;    /* puts the 4-byte operand after 0xE9 on a 4-byte boundary */
+            OS.Skip(NumSkip);
+            EmitByte(OS, 0xE9);
+            EmitConstant(OS, 0, 4);
+            NI.ChainSlot[Patch.Idx].Addr = Addr + NumSkip;
+#elif defined(TCG_TARGET_PPC64)
+            unsigned NumSkip = 0;
+            if (Addr & 7)
+                NumSkip = 4;
+            OS.Skip(NumSkip);
+            EmitConstant(OS, 0x48000000 | (16 & 0x3fffffc), 4); /* b .+16    */
+            EmitConstant(OS, 0x60000000, 4);                    /* nop       */
+            EmitConstant(OS, 0x7C0903A6 | (12 << 21), 4);       /* mtctr r12 */
+            EmitConstant(OS, 0x4E800420, 4);                    /* bctr      */
+            NI.ChainSlot[Patch.Idx].Addr = Addr + NumSkip;
+#else
+            NI.ChainSlot[Patch.Idx].Addr = Addr;
+#endif
+        }
+    }
+#endif
+
+    /* Flush instruction cache */
+    flush_icache_range(Code, Code + Size);
+
+    if (DM.getDebugMode() & DEBUG_OUTASM) {
+        hqemu::MutexGuard locked(llvm_debug_lock);
+        if (HostDisAsm)
+            HostDisAsm->PrintOutAsm((uint64_t)Code, (uint64_t)Size);
+        else {
+            auto &OS = DM.debug();
+            OS << "\nOUT: [size=" << Size << "]\n";
+            disas(stderr, (void *)Code, Size);
+            OS << "\n";
+        }
+    }
+}
+
+/*
+ * Compile()
+ *  Start the LLVM JIT compilation of Func: run the PreProcess(), Optimize()
+ *  and PostProcess() stages, verify the function, hand it to the execution
+ *  engine and finally call FinalizeObject().
+ */
+void IRFactory::Compile()
+{
+    dbg() << DEBUG_LLVM
+          << "Translator " << Translator.getID() << " starts compiling...\n";
+
+    /* Run optimization passes. */
+    PreProcess();
+    Optimize();
+    PostProcess();
+
+    VerifyFunction(*Func);
+
+    /* JIT. NI.Func is set before the execution engine is invoked; the
+     * pointer returned by getPointerToFunction() is not used here. */
+    NI.Func = Func;
+    EE->getPointerToFunction(Func);
+    EE->finalizeObject();
+
+    FinalizeObject();
+
+    dbg() << DEBUG_LLVM << __func__ << ": done.\n";
+}
+
+/*
+ * getPointerTy()
+ *  Return the pointer type to an integer of `Size` bits (8, 16, 32 or 64) in
+ *  address space `AS`. Any other width is reported through IRError(); the
+ *  fall-through return value is nullptr.
+ */
+PointerType *IRFactory::getPointerTy(int Size, unsigned AS)
+{
+    if (Size == 32)
+        return Type::getInt32PtrTy(*Context, AS);
+    if (Size == 64)
+        return Type::getInt64PtrTy(*Context, AS);
+    if (Size == 16)
+        return Type::getInt16PtrTy(*Context, AS);
+    if (Size == 8)
+        return Type::getInt8PtrTy(*Context, AS);
+
+    IRError("%s: invalid bit type %d.\n", __func__, Size);
+    return nullptr;
+}
+
+/*
+ * getExtendValue()
+ *  Widen V to type Ty. V is returned unchanged when both types have the same
+ *  bit size; otherwise SEXT is used when MO_SIGN is set in opc and ZEXT when
+ *  it is not. A target type narrower than V is reported through IRError().
+ */
+Value *IRFactory::getExtendValue(Value *V, Type *Ty, int opc)
+{
+    int SrcBits = DL->getTypeSizeInBits(V->getType());
+    int DstBits = DL->getTypeSizeInBits(Ty);
+
+    if (SrcBits > DstBits)
+        IRError("%s: invalid size old=%d new=%d\n", __func__, SrcBits, DstBits);
+
+    if (SrcBits != DstBits) {
+        if (opc & MO_SIGN)
+            return SEXT(V, Ty);
+        return ZEXT(V, Ty);
+    }
+    return V;
+}
+
+/*
+ * getTruncValue()
+ *  Narrow V to the bit size given by getSizeInBits(opc). V is returned
+ *  unchanged when it already has that size; a requested size larger than V
+ *  is reported through IRError().
+ */
+Value *IRFactory::getTruncValue(Value *V, int opc)
+{
+    int SrcBits = DL->getTypeSizeInBits(V->getType());
+    int DstBits = getSizeInBits(opc);
+
+    if (SrcBits < DstBits)
+        IRError("%s: invalid size old=%d new=%d\n", __func__, SrcBits, DstBits);
+
+    if (SrcBits != DstBits) {
+        Type *NarrowTy = Type::getIntNTy(*Context, DstBits);
+        return TRUNC(V, NarrowTy);
+    }
+    return V;
+}
+
+/*
+ * ConvertEndian()
+ *  When NEED_BSWAP is defined, byte-swap V according to the access size in
+ *  (opc & MO_SIZE); 8-bit values are returned as they are and an unknown
+ *  size is reported through IRError(). Without NEED_BSWAP, V is returned
+ *  unchanged.
+ */
+Value *IRFactory::ConvertEndian(Value *V, int opc)
+{
+#ifdef NEED_BSWAP
+    switch (opc & MO_SIZE) {
+    case MO_16:
+        return BSWAP16(V);
+    case MO_32:
+        return BSWAP32(V);
+    case MO_64:
+        return BSWAP64(V);
+    case MO_8:
+        break;
+    default:
+        IRError("%s: invalid size (opc=%d)\n", __func__, opc);
+        break;
+    }
+#endif
+    return V;
+}
+
+/*
+ * CreateBSwap()
+ *  Insert, before InsertPos, a call to the llvm.bswap intrinsic overloaded
+ *  on Ty with V as its only argument, and return the call.
+ */
+Value *IRFactory::CreateBSwap(Type *Ty, Value *V, Instruction *InsertPos)
+{
+    Type *OverloadTys[] = { Ty };
+    Function *BSwapFn = Intrinsic::getDeclaration(Mod, Intrinsic::bswap,
+                                                  OverloadTys);
+
+    SmallVector<Value *, 4> CallArgs;
+    CallArgs.push_back(V);
+    return CallInst::Create(BSwapFn, CallArgs, "", InsertPos);
+}
+
+/*
+ * ConvertCPUType()
+ *  Return a CPU-state pointer typed like parameter Idx of F: CPUStruct when
+ *  its type already matches, otherwise a bitcast of CPU to the parameter
+ *  type inserted before InsertPos.
+ */
+Value *IRFactory::ConvertCPUType(Function *F, int Idx, Instruction *InsertPos)
+{
+    Type *WantedTy = F->getFunctionType()->getParamType(Idx);
+    if (CPUStruct->getType() == WantedTy)
+        return CPUStruct;
+    return new BitCastInst(CPU, WantedTy, "", InsertPos);
+}
+
+/*
+ * ConvertCPUType()
+ *  Same as the Instruction overload, except that a needed bitcast of CPU is
+ *  created with the basic block InsertPos as its insertion point.
+ */
+Value *IRFactory::ConvertCPUType(Function *F, int Idx, BasicBlock *InsertPos)
+{
+    Type *WantedTy = F->getFunctionType()->getParamType(Idx);
+    if (CPUStruct->getType() == WantedTy)
+        return CPUStruct;
+    return new BitCastInst(CPU, WantedTy, "", InsertPos);
+}
+
+/*
+ * isStateOfPC()
+ *  Return true if Off, an offset into CPUArchState, falls inside the guest
+ *  program-counter field, i.e. in [field offset, field offset +
+ *  TARGET_LONG_SIZE). On SPARC both `pc` and `npc` count as PC state.
+ */
+bool IRFactory::isStateOfPC(intptr_t Off)
+{
+    intptr_t IPOffset;
+
+    /* Targets whose program counter lives in a field named `pc` come first;
+     * TARGET_AARCH64 is deliberately tested before TARGET_ARM. */
+#if defined(TARGET_ALPHA) || defined(TARGET_AARCH64) || defined(TARGET_CRIS) || \
+    defined(TARGET_M68K) || defined(TARGET_SH4)
+    IPOffset = offsetof(CPUArchState, pc);
+#elif defined(TARGET_ARM)
+    IPOffset = offsetof(CPUArchState, regs[15]);
+#elif defined(TARGET_I386)
+    IPOffset = offsetof(CPUArchState, eip);
+#elif defined(TARGET_MICROBLAZE)
+    IPOffset = offsetof(CPUArchState, sregs[0]);
+#elif defined(TARGET_MIPS)
+    IPOffset = offsetof(CPUArchState, active_tc.PC);
+#elif defined(TARGET_PPC)
+    IPOffset = offsetof(CPUArchState, nip);
+#elif defined(TARGET_SPARC)
+    IPOffset = offsetof(CPUArchState, pc);
+#else
+#error "unsupported processor type"
+#endif
+
+    bool InPC = (Off >= IPOffset && Off < IPOffset + TARGET_LONG_SIZE);
+#if defined(TARGET_SPARC)
+    intptr_t NextIPOffset = offsetof(CPUArchState, npc);
+    InPC = InPC ||
+           (Off >= NextIPOffset && Off < NextIPOffset + TARGET_LONG_SIZE);
+#endif
+    return InPC;
+}
+
+/*
+ * CreateStorePC()
+ *  Trace building requires a store-IP instruction to link basic blocks, but
+ *  on some architectures the IP is promoted to a register, so the store has
+ *  to be regenerated. Every global that is held in a register, is dirty and
+ *  maps to PC state is stored before InsertPos and then demoted.
+ */
+void IRFactory::CreateStorePC(Instruction *InsertPos)
+{
+    const int NumGlobals = tcg_ctx.nb_globals;
+    for (int Idx = 0; Idx != NumGlobals; ++Idx) {
+        Register &R = Reg[Idx];
+        if (!R.isReg() || !R.isDirty())
+            continue;
+        if (!isStateOfPC(R.Off))
+            continue;
+
+        StoreState(R, InsertPos);
+        R.Demote();
+    }
+}
+
+/*
+ * SaveGlobals()
+ *  Store dirty states back to memory before InsertPos, depending on `level`:
+ *    COHERENCE_NONE   - do nothing.
+ *    COHERENCE_GLOBAL - store dirty globals and demote every global.
+ *    otherwise        - additionally store dirty local temporaries and
+ *                       demote every temporary.
+ */
+void IRFactory::SaveGlobals(int level, Instruction *InsertPos)
+{
+    if (level == COHERENCE_NONE)
+        return;
+
+    const int GlobalEnd = tcg_ctx.nb_globals;
+    const int TempEnd = tcg_ctx.nb_temps;
+
+    /* Globals: write back the dirty ones; demotion is unconditional. */
+    for (int Idx = 0; Idx < GlobalEnd; ++Idx) {
+        Register &R = Reg[Idx];
+        if (R.isReg() && R.isDirty())
+            StoreState(R, InsertPos);
+        R.Demote();
+    }
+
+    if (level != COHERENCE_GLOBAL) {
+        /* Store local registers to stack. */
+        for (int Idx = GlobalEnd; Idx < TempEnd; ++Idx) {
+            Register &R = Reg[Idx];
+            if (R.isReg() && R.isLocal() && R.isDirty())
+                StoreState(R, InsertPos);
+            R.Demote();
+        }
+    }
+}
+
+/*
+ * StatePointer()
+ *  Get the pointer to the CPU register `reg`, creating and caching it in
+ *  StatePtr (keyed by offset and pointer type) on first use. The pointer
+ *  type is Int32PtrTy for 32-bit registers and Int64PtrTy otherwise.
+ */
+Value *IRFactory::StatePointer(Register &reg)
+{
+    intptr_t Off = reg.Off;
+    PointerType *PTy = (reg.Size == 32) ? Int32PtrTy : Int64PtrTy;
+    std::pair<intptr_t, Type *> Key(Off, PTy);
+
+    auto Cached = StatePtr.find(Key);
+    if (Cached != StatePtr.end())
+        return Cached->second;
+
+    /* Not cached yet: compute base + Off at GEPInsertPos and cast it at the
+     * end of InitBB. */
+    std::string Name = isStateOfPC(Off) ? "pc" : reg.Name;
+    auto GEP = GetElementPtrInst::CreateInBounds(BaseReg[reg.Base].Base,
+                                         CONSTPtr(Off), "", GEPInsertPos);
+    StatePtr[Key] = new BitCastInst(GEP, PTy, Name, InitBB->getTerminator());
+    return StatePtr[Key];
+}
+
+/*
+ * StatePointer()
+ *  Get a pointer of type PTy to offset Off from the base of `reg`, creating
+ *  and caching it in StatePtr on first use. `reg` must satisfy isRev();
+ *  anything else is reported through IRError().
+ */
+Value *IRFactory::StatePointer(Register &reg, intptr_t Off, Type *PTy)
+{
+    if (!reg.isRev())
+        IRError("%s: internal error.\n", __func__);
+
+    std::pair<intptr_t, Type *> Key(Off, PTy);
+
+    auto Cached = StatePtr.find(Key);
+    if (Cached != StatePtr.end())
+        return Cached->second;
+
+    /* Not cached yet: compute base + Off at GEPInsertPos and cast it at the
+     * end of InitBB. */
+    std::string Name = isStateOfPC(Off) ? "pc" : "";
+    auto GEP = GetElementPtrInst::CreateInBounds(BaseReg[reg.Base].Base,
+                                         CONSTPtr(Off), "", GEPInsertPos);
+    StatePtr[Key] = new BitCastInst(GEP, PTy, Name, InitBB->getTerminator());
+    return StatePtr[Key];
+}
+
+/*
+ * LoadState()
+ *  Retrieve the current value of `reg`, checked in this order:
+ *    isRev()   - the base pointer BaseReg[reg.Base].Base;
+ *    isAlias() - the value of the aliased register;
+ *    isReg()   - the value already attached to the register;
+ *    isLocal() - a load from the register's stack slot (allocated on demand);
+ *    otherwise - a load from CPUArchState, which is then attached to `reg`.
+ */
+Value *IRFactory::LoadState(Register &reg)
+{
+    if (reg.isRev())
+        return BaseReg[reg.Base].Base;
+    if (reg.isAlias())
+        return LoadState(reg.getAlias());
+    if (reg.isReg())
+        return reg.getData();
+
+    if (!reg.isLocal()) {
+        /* If we go here, the state is not loaded into a LLVM virtual
+         * register. Load it from CPUArchState. */
+        Value *Loaded = new LoadInst(StatePointer(reg), "", false, LastInst);
+        reg.setData(Loaded);
+        return Loaded;
+    }
+
+    if (!reg.AI)
+        reg.AI = CreateAlloca(reg.Ty, 0, "loc", InitBB->getTerminator());
+    return new LoadInst(reg.AI, "", false, LastInst);
+}
+
+/*
+ * StoreState()
+ *  Write the value attached to `reg` back before InsertPos: local registers
+ *  go to their stack slot (allocated on demand), everything else goes to
+ *  its CPU state pointer, with a volatile store when the register is PC
+ *  state. Registers satisfying isRev() are reported through IRError().
+ */
+void IRFactory::StoreState(Register &reg, Instruction *InsertPos)
+{
+#ifdef ASSERT
+    int DataBits = DL->getTypeSizeInBits(reg.getData()->getType());
+    if (DataBits != reg.Size)
+        IRError("%s: internal error\n", __func__);
+#endif
+    if (reg.isRev())
+        IRError("%s: fatal error\n", __func__);
+
+    if (!reg.isLocal()) {
+        bool IsPC = isStateOfPC(reg.Off);
+        new StoreInst(reg.getData(), StatePointer(reg), IsPC, InsertPos);
+        return;
+    }
+
+    if (!reg.AI)
+        reg.AI = CreateAlloca(reg.Ty, 0, "loc", InitBB->getTerminator());
+    new StoreInst(reg.getData(), reg.AI, false, InsertPos);
+}
+
+
+/*
+ * TCG opcode to LLVM IR translation functions.
+ */
+/*
+ * op_hotpatch()
+ *  No IR is generated for this opcode here; `args` is not read and only the
+ *  IRDebug() call is made.
+ */
+void IRFactory::op_hotpatch(const TCGArg *args)
+{
+    IRDebug(INDEX_op_hotpatch);
+}
+
+/*
+ * op_annotate()
+ *  args[0]: Annotation kind
+ *
+ *  A_SetCC marks the instruction preceding LastInst as a condition through
+ *  MF->setCondition(), provided LastInst exists and is not the first
+ *  instruction of its block. A_NoSIMDization is forwarded to the builder as
+ *  an attribute. Other annotations are ignored.
+ */
+void IRFactory::op_annotate(const TCGArg *args)
+{
+    IRDebug(INDEX_op_annotate);
+
+    uint32_t Kind = args[0];
+
+    if (Kind == A_SetCC) {
+        if (LastInst && LastInst != &*LastInst->getParent()->begin())
+            MF->setCondition(&*--BasicBlock::iterator(LastInst));
+        return;
+    }
+
+    if (Kind == A_NoSIMDization)
+        Builder->addAttribute(A_NoSIMDization);
+}
+
+/*
+ * op_jmp()
+ *  args[0]: In (jump target)
+ *
+ *  Save all state with SaveGlobals(COHERENCE_ALL), then emit an indirect
+ *  branch to the target before LastInst and register it as an exit with MF.
+ *  A target that is not a pointer is first converted with ITP8.
+ */
+void IRFactory::op_jmp(const TCGArg *args)
+{
+    IRDebug(INDEX_op_jmp);
+
+    Register &Target = Reg[args[0]];
+    Value *Dest = LoadState(Target);
+
+    SaveGlobals(COHERENCE_ALL, LastInst);
+
+    if (!Dest->getType()->isPointerTy())
+        Dest = ITP8(Dest);
+
+    IndirectBrInst *Jump = IndirectBrInst::Create(Dest, 1, LastInst);
+    MF->setExit(Jump);
+}
+
+/*
+ * op_discard()
+ *  args[0]: In
+ *
+ *  Demote the register if it currently holds a value (isReg()); nothing is
+ *  stored back.
+ */
+void IRFactory::op_discard(const TCGArg *args)
+{
+    IRDebug(INDEX_op_discard);
+
+    Register &Dead = Reg[args[0]];
+    if (!Dead.isReg())
+        return;
+    Dead.Demote();
+}
+
+/*
+ * op_set_label()
+ *  args[0]: Label number
+ *
+ *  Save all state with SaveGlobals(COHERENCE_ALL), make the block bound to
+ *  the label (created on first reference) the current block, and append a
+ *  new branch to ExitBB to it, which becomes LastInst.
+ */
+void IRFactory::op_set_label(const TCGArg *args)
+{
+    IRDebug(INDEX_op_set_label);
+
+    SaveGlobals(COHERENCE_ALL, LastInst);
+
+    const TCGArg LabelId = args[0];
+    if (Labels.find(LabelId) == Labels.end())
+        Labels[LabelId] = BasicBlock::Create(*Context, "true_dest", Func);
+    CurrBB = Labels[LabelId];
+
+    if (LastInst) {
+        /* If the previous LastInst directly follows an indirect branch it is
+         * erased; otherwise it is pointed at the label's block. */
+        bool FollowsIndirectBr =
+            LastInst != &*LastInst->getParent()->begin() &&
+            isa<IndirectBrInst>(--BasicBlock::iterator(LastInst));
+        if (FollowsIndirectBr)
+            LastInst->eraseFromParent();
+        else
+            setSuccessor(LastInst, CurrBB);
+    }
+
+    LastInst = BranchInst::Create(ExitBB, CurrBB);
+}
+
+/*
+ * op_call()
+ *  The numbers of output and input arguments come from the TCG op itself
+ *  (NI.Op->callo and NI.Op->calli). With those, `args` is read as:
+ *  args[0 ~ #nb_oargs-1]                : out args
+ *  args[#nb_oargs ~ #nb_oargs+#nb_iargs-1] : function parameters
+ *  args[#nb_oargs+#nb_iargs]            : function address
+ *  args[#nb_oargs+#nb_iargs+1]          : flags
+ */
+void IRFactory::op_call(const TCGArg *args)
+{
+    IRDebug(INDEX_op_call);
+
+    TCGOp * const op = NI.Op;
+    int nb_oargs = op->callo;
+    int nb_iargs = op->calli;
+    int nb_params = nb_iargs;
+    tcg_insn_unit *func_addr = (tcg_insn_unit *)(intptr_t)args[nb_oargs + nb_iargs];
+    int flags = args[nb_oargs + nb_iargs + 1];
+    SmallVector<Value *, 4> Params;
+
+    /* If the called function is an illegal helper, skip this trace. */
+    if (isIllegalHelper((void *)func_addr)) {
+        Builder->Abort();
+        return;
+    }
+
+    /* Get function declaration from LLVM module. */
+    TCGHelperMap &TCGHelpers = Translator.getTCGHelpers();
+    if (TCGHelpers.find((uintptr_t)func_addr) == TCGHelpers.end())
+        IRError("%s: cannot resolve funtion.\n", __func__);
+
+    std::string FName = TCGHelpers[(uintptr_t)func_addr];
+    Function *F = ResolveFunction(FName);
+
+    /* Helpers listed as constant are treated as not reading globals,
+     * whatever flags the op carried. */
+    std::set<std::string> &ConstHelpers = Translator.getConstHelpers();
+    if (ConstHelpers.find(FName) != ConstHelpers.end())
+        flags |= TCG_CALL_NO_READ_GLOBALS;
+
+    /* Package the function parameters.
+       NOTE: There are situations where the numbers of given arguments
+       are greater than the *real* function parameters. Ex:
+           declare void foo(int64, int64);
+              and
+           call foo(int32, int32, int32, int32);
+     */
+    int real_nb_params = F->getFunctionType()->getNumParams();
+    if (nb_params == real_nb_params) {
+        /* One TCG argument per parameter: convert to a pointer or truncate
+         * where needed, then bitcast if the types still differ. */
+        for (int i = 0; i < real_nb_params; ++i) {
+            Type *ParamTy = F->getFunctionType()->getParamType(i);
+            Register &In = Reg[args[nb_oargs + i]];
+            Value *InData = LoadState(In);
+
+            size_t real_size = DL->getTypeSizeInBits(ParamTy);
+            size_t size = DL->getTypeSizeInBits(InData->getType());
+
+            if (ParamTy->isPointerTy() && !InData->getType()->isPointerTy())
+                InData = ITP8(InData);
+            else if (real_size < size)
+                InData = TRUNC(InData, IntegerType::get(*Context, real_size));
+
+            if (InData->getType() != ParamTy)
+                InData = new BitCastInst(InData, ParamTy, "", LastInst);
+            Params.push_back(InData);
+        }
+    } else {
+        /* The counts differ: consecutive TCG arguments (indexed by idx) are
+         * packed into one parameter. `remain` counts the bits of the
+         * parameter that are still unfilled; the first piece is extended
+         * into the low bits and every later piece is shifted left by the
+         * number of bits already filled before being OR-ed in. */
+        int idx = 0;
+        for (int i = 0; i < real_nb_params; ++i) {
+            Value *V = nullptr;
+            Type *ParamTy = F->getFunctionType()->getParamType(i);
+            size_t real_size = DL->getTypeSizeInBits(ParamTy);
+            size_t size, remain = real_size;
+
+next:
+            Register &In = Reg[args[nb_oargs + idx]];
+            Value *InData = LoadState(In);
+
+            size = DL->getTypeSizeInBits(InData->getType());
+            if (size == real_size) {
+                /* The argument already has the parameter's size. */
+                if (InData->getType() != ParamTy)
+                    InData = new BitCastInst(InData, ParamTy, "", LastInst);
+                Params.push_back(InData);
+                idx++;
+            } else {
+                if (remain == real_size)
+                    V = ZEXT(InData, IntegerType::get(*Context, real_size));
+                else {
+                    InData = ZEXT(InData, ParamTy);
+                    InData = SHL(InData, ConstantInt::get(ParamTy, real_size-remain));
+                    V = OR(V, InData);
+                }
+
+                /* The piece must not be wider than what is left to fill. */
+                if (remain < size)
+                    IRError("%s: fatal error.\n", __func__);
+
+                remain -= size;
+                idx++;
+
+                /* Keep consuming arguments until the parameter is full. */
+                if (remain)
+                    goto next;
+
+                Params.push_back(V);
+            }
+        }
+
+        /* Every TCG argument must have been consumed. */
+        if (idx != nb_params)
+            IRError("%s: num params not matched.\n", __func__);
+    }
+
+
+    /* Save global registers if this function is not TCG constant function.
+       Otherwise, mark this call instruction for state mapping use.
+       The rules can be found in tcg_reg_alloc_call() in tcg/tcg.c */
+    if (!(flags & TCG_CALL_NO_READ_GLOBALS))
+        SaveGlobals(COHERENCE_GLOBAL, LastInst);
+
+    /* handle COREMU's lightweight memory transaction helper: the index
+     * returned by NI.setRestorePoint() is written with a volatile 32-bit
+     * store to CPUArchState::restore_val before the call. */
+    if (isLMTFunction(FName)) {
+        uint32_t Idx = NI.setRestorePoint();
+        Value *ResVal = GetElementPtrInst::CreateInBounds(CPU,
+                CONSTPtr(offsetof(CPUArchState, restore_val)), "", LastInst);
+        ResVal = new BitCastInst(ResVal, Int32PtrTy, "", LastInst);
+        new StoreInst(CONST32(Idx), ResVal, true, LastInst);
+    }
+
+    CallInst *CI = CallInst::Create(F, Params, "", LastInst);
+
+    if (flags & TCG_CALL_NO_READ_GLOBALS)
+        MF->setConst(CI);
+
+    /* Determine if this function can be inlined. A positive result from
+     * AnalyzeInlineCost() queues the call in InlineCalls; otherwise the call
+     * is redirected to the helper's FuncNoInline variant. */
+    if (Helpers.find(FName) != Helpers.end()) {
+        bool MustInline = false;
+        HelperInfo *Helper = Helpers[FName];
+        if (AnalyzeInlineCost(CallSite(CI)) > 0) {
+            MustInline = true;
+            InlineCalls.push_back(CI);
+        }
+
+        if (!MustInline) {
+            Function *NoInlineF = ResolveFunction(Helper->FuncNoInline->getName());
+            CI->setCalledFunction(NoInlineF);
+        }
+    }
+
+    /* Format the return value.
+       NOTE: There are situations where the return value is split and
+       is used by different instructions. Ex:
+           int64 ret = call foo();
+           ... = opcode ret[0..31];
+           ... = opcode ret[32..64];
+     */
+    if (nb_oargs == 1) {
+        Register &Out = Reg[args[0]];
+        Out.setData(CI, true);
+    } else if (nb_oargs > 1) {
+        /* Split the result into nb_oargs equally sized pieces, lowest bits
+         * first: truncate for the current output, then shift the remainder
+         * right for the next one. */
+        Value *V = CI;
+        size_t size = DL->getTypeSizeInBits(F->getReturnType());
+        size_t subsize = size / nb_oargs;
+        for (int i = 0; i < nb_oargs; ++i) {
+            Register &Out = Reg[args[i]];
+            Value *OutData = TRUNC(V, IntegerType::get(*Context, subsize));
+            Out.setData(OutData, true);
+            if (i != nb_oargs - 1)
+                V = LSHR(V, ConstantInt::get(IntegerType::get(*Context, size), subsize));
+        }
+    }
+}
+
+/*
+ * op_br()
+ *  args[0]: Label number
+ *
+ *  Save all state with SaveGlobals(COHERENCE_ALL), point LastInst at the
+ *  block bound to the label (created on first reference) and clear
+ *  LastInst.
+ */
+void IRFactory::op_br(const TCGArg *args)
+{
+    IRDebug(INDEX_op_br);
+
+    SaveGlobals(COHERENCE_ALL, LastInst);
+
+    const TCGArg LabelId = args[0];
+    if (Labels.find(LabelId) == Labels.end())
+        Labels[LabelId] = BasicBlock::Create(*Context, "direct_jump_tb", Func);
+    setSuccessor(LastInst, Labels[LabelId]);
+
+    LastInst = nullptr;
+}
+
+/*
+ * op_mov_i32()
+ *  args[0]: Out
+ *  args[1]: In
+ *
+ *  Copy In to Out. A source value whose size is not 32 bits is passed
+ *  through TRUNC32 first.
+ */
+void IRFactory::op_mov_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_mov_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+
+    AssertType(Out.Size == 32);
+
+    Value *Src = LoadState(In);
+    if (DL->getTypeSizeInBits(Src->getType()) != 32)
+        Src = TRUNC32(Src);
+
+    Out.setData(Src, true);
+}
+
+/*
+ * op_movi_i32()
+ *  args[0]: Out
+ *  args[1]: In  (const value)
+ *
+ *  Attach the constant CONST32(args[1]) to Out.
+ */
+void IRFactory::op_movi_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_movi_i32);
+
+    Register &Out = Reg[args[0]];
+
+    AssertType(Out.Size == 32);
+
+    Value *Imm = CONST32(args[1]);
+    Out.setData(Imm, true);
+}
+
+/*
+ * getPred()
+ *  Map a TCG condition code to the matching LLVM integer-compare predicate.
+ *  An unsupported code is reported through IRError(); the fall-through
+ *  return value is BAD_ICMP_PREDICATE.
+ */
+static inline CmpInst::Predicate getPred(const TCGArg cond)
+{
+    switch (cond) {
+    /* equality */
+    case TCG_COND_EQ:  return ICmpInst::ICMP_EQ;
+    case TCG_COND_NE:  return ICmpInst::ICMP_NE;
+    /* signed */
+    case TCG_COND_LT:  return ICmpInst::ICMP_SLT;
+    case TCG_COND_GE:  return ICmpInst::ICMP_SGE;
+    case TCG_COND_LE:  return ICmpInst::ICMP_SLE;
+    case TCG_COND_GT:  return ICmpInst::ICMP_SGT;
+    /* unsigned */
+    case TCG_COND_LTU: return ICmpInst::ICMP_ULT;
+    case TCG_COND_GEU: return ICmpInst::ICMP_UGE;
+    case TCG_COND_LEU: return ICmpInst::ICMP_ULE;
+    case TCG_COND_GTU: return ICmpInst::ICMP_UGT;
+    default:
+        IRError("%s - unsupported predicate\n", __func__);
+        break;
+    }
+    return ICmpInst::BAD_ICMP_PREDICATE;
+}
+
+/*
+ * op_setcond_i32()
+ *  args[0]: Out
+ *  args[1]: In1
+ *  args[2]: In2
+ *  args[3]: In3 (condition code)
+ *
+ *  Compare In1 with In2 under the predicate for the condition code and
+ *  attach the ZEXT32 of the comparison result to Out.
+ */
+void IRFactory::op_setcond_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_setcond_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+    CmpInst::Predicate Pred = getPred(args[3]);
+
+    AssertType(Out.Size == 32 && In1.Size == 32 && In2.Size == 32);
+
+    Value *LHS = LoadState(In1);
+    Value *RHS = LoadState(In2);
+    Value *Flag = ICMP(LHS, RHS, Pred);
+    Out.setData(ZEXT32(Flag), true);
+}
+
+/*
+ * op_movcond_i32()
+ *  args[0]: Out
+ *  args[1]: In1
+ *  args[2]: In2
+ *  args[3]: In3
+ *  args[4]: In4
+ *  args[5]: In5 (condition code)
+ *
+ *  Out = (In1 <cond> In2) ? In3 : In4, emitted as a compare followed by a
+ *  select.
+ */
+void IRFactory::op_movcond_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_movcond_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+    Register &In3 = Reg[args[3]];
+    Register &In4 = Reg[args[4]];
+    CmpInst::Predicate Pred = getPred(args[5]);
+
+    AssertType(Out.Size == 32 && In1.Size == 32 && In2.Size == 32 &&
+               In3.Size == 32 && In4.Size == 32);
+
+    Value *CmpLHS = LoadState(In1);
+    Value *CmpRHS = LoadState(In2);
+    Value *IfTrue = LoadState(In3);
+    Value *IfFalse = LoadState(In4);
+
+    Value *Cond = ICMP(CmpLHS, CmpRHS, Pred);
+    Value *Picked = SelectInst::Create(Cond, IfTrue, IfFalse, "", LastInst);
+    Out.setData(Picked, true);
+}
+
+/* load/store */
+/*
+ * op_ld8u_i32()
+ *  args[0]: Out (ret)
+ *  args[1]: In1 (addr)
+ *  args[2]: In2 (offset)
+ *
+ *  Load the byte at addr + offset and attach its ZEXT32 to Out.
+ */
+void IRFactory::op_ld8u_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_ld8u_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+    TCGArg Off = args[2];
+
+    AssertType(Out.Size == 32);
+
+    /* Form addr + Off on an i8 pointer. */
+    Value *Base = LoadState(In);
+    if (Base->getType() != Int8PtrTy)
+        Base = CASTPTR8(Base);
+    Value *Addr = GetElementPtrInst::CreateInBounds(Base, CONSTPtr(Off), "", LastInst);
+
+    Value *Byte = new LoadInst(Addr, "", false, LastInst);
+    Out.setData(ZEXT32(Byte), true);
+}
+
+/*
+ * op_ld8s_i32()
+ *  args[0]: Out (ret)
+ *  args[1]: In1 (addr)
+ *  args[2]: In2 (offset)
+ *
+ *  Load the byte at addr + offset and attach its SEXT32 to Out.
+ */
+void IRFactory::op_ld8s_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_ld8s_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+    TCGArg Off = args[2];
+
+    AssertType(Out.Size == 32);
+
+    /* Form addr + Off on an i8 pointer. */
+    Value *Base = LoadState(In);
+    if (Base->getType() != Int8PtrTy)
+        Base = CASTPTR8(Base);
+    Value *Addr = GetElementPtrInst::CreateInBounds(Base, CONSTPtr(Off), "", LastInst);
+
+    Value *Byte = new LoadInst(Addr, "", false, LastInst);
+    Out.setData(SEXT32(Byte), true);
+}
+
+/*
+ * op_ld16u_i32()
+ *  args[0]: Out (ret)
+ *  args[1]: In1 (addr)
+ *  args[2]: In2 (offset)
+ *
+ *  Load through a CASTPTR16 pointer at addr + offset and attach the ZEXT32
+ *  of the loaded value to Out.
+ */
+void IRFactory::op_ld16u_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_ld16u_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+    TCGArg Off = args[2];
+
+    AssertType(Out.Size == 32);
+
+    /* Form addr + Off on an i8 pointer, then retype it for the access. */
+    Value *Base = LoadState(In);
+    if (Base->getType() != Int8PtrTy)
+        Base = CASTPTR8(Base);
+    Value *Addr = GetElementPtrInst::CreateInBounds(Base, CONSTPtr(Off), "", LastInst);
+    Addr = CASTPTR16(Addr);
+
+    Value *Half = new LoadInst(Addr, "", false, LastInst);
+    Out.setData(ZEXT32(Half), true);
+}
+
+/*
+ * op_ld16s_i32()
+ *  args[0]: Out (ret)
+ *  args[1]: In1 (addr)
+ *  args[2]: In2 (offset)
+ *
+ *  Load through a CASTPTR16 pointer at addr + offset and attach the SEXT32
+ *  of the loaded value to Out.
+ */
+void IRFactory::op_ld16s_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_ld16s_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+    TCGArg Off = args[2];
+
+    AssertType(Out.Size == 32);
+
+    /* Form addr + Off on an i8 pointer, then retype it for the access. */
+    Value *Base = LoadState(In);
+    if (Base->getType() != Int8PtrTy)
+        Base = CASTPTR8(Base);
+    Value *Addr = GetElementPtrInst::CreateInBounds(Base, CONSTPtr(Off), "", LastInst);
+    Addr = CASTPTR16(Addr);
+
+    Value *Half = new LoadInst(Addr, "", false, LastInst);
+    Out.setData(SEXT32(Half), true);
+}
+
+/*
+ * op_ld_i32()
+ *  args[0]: Out (ret)
+ *  args[1]: In1 (addr)
+ *  args[2]: In2 (offset)
+ *
+ *  When In1 satisfies isRev(), the load goes through the cached state
+ *  pointer for the offset and is made volatile if the offset is PC state.
+ *  Otherwise the address is computed as addr + offset and loaded through a
+ *  CASTPTR32 pointer.
+ */
+void IRFactory::op_ld_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_ld_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+    TCGArg Off = args[2];
+
+    AssertType(Out.Size == 32);
+
+    Value *Loaded;
+    if (!In.isRev()) {
+        Value *Addr = LoadState(In);
+        if (Addr->getType() != Int8PtrTy)
+            Addr = CASTPTR8(Addr);
+        Addr = GetElementPtrInst::CreateInBounds(Addr, CONSTPtr(Off), "", LastInst);
+        Addr = CASTPTR32(Addr);
+        Loaded = new LoadInst(Addr, "", false, LastInst);
+    } else {
+        Value *StateAddr = StatePointer(In, Off, Int32PtrTy);
+        LoadInst *StateLoad = new LoadInst(StateAddr, "", false, LastInst);
+        if (isStateOfPC(Off))
+            StateLoad->setVolatile(true);
+        Loaded = StateLoad;
+    }
+    Out.setData(Loaded, true);
+}
+
+/*
+ * op_st8_i32()
+ *  args[0]: In1 (value)
+ *  args[1]: In2 (base)
+ *  args[2]: In3 (offset)
+ *
+ *  Store the TRUNC8 of the value to base + offset.
+ */
+void IRFactory::op_st8_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_st8_i32);
+
+    Register &In1 = Reg[args[0]];
+    Register &In2 = Reg[args[1]];
+    TCGArg Off = args[2];
+
+    AssertType(In1.Size == 32);
+
+    Value *Data = LoadState(In1);
+    Value *Addr = LoadState(In2);
+
+    Data = TRUNC8(Data);
+
+    /* Form base + Off on an i8 pointer. */
+    if (Addr->getType() != Int8PtrTy)
+        Addr = CASTPTR8(Addr);
+    Addr = GetElementPtrInst::CreateInBounds(Addr, CONSTPtr(Off), "", LastInst);
+
+    new StoreInst(Data, Addr, false, LastInst);
+}
+
+/*
+ * op_st16_i32()
+ *  args[0]: In1 (value)
+ *  args[1]: In2 (base)
+ *  args[2]: In3 (offset)
+ *
+ *  Store the TRUNC16 of the value through a CASTPTR16 pointer at
+ *  base + offset.
+ */
+void IRFactory::op_st16_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_st16_i32);
+
+    Register &In1 = Reg[args[0]];
+    Register &In2 = Reg[args[1]];
+    TCGArg Off = args[2];
+
+    AssertType(In1.Size == 32);
+
+    Value *Data = LoadState(In1);
+    Value *Addr = LoadState(In2);
+
+    Data = TRUNC16(Data);
+
+    /* Form base + Off on an i8 pointer, then retype it for the access. */
+    if (Addr->getType() != Int8PtrTy)
+        Addr = CASTPTR8(Addr);
+    Addr = GetElementPtrInst::CreateInBounds(Addr, CONSTPtr(Off), "", LastInst);
+    Addr = CASTPTR16(Addr);
+
+    new StoreInst(Data, Addr, false, LastInst);
+}
+
+/*
+ * op_st_i32()
+ *  args[0]: In1
+ *  args[1]: In2 (base)
+ *  args[2]: In3 (offset)
+ *
+ *  When the base satisfies isRev(), the store goes through the cached state
+ *  pointer for the offset and is made volatile if the offset is PC state.
+ *  Otherwise the address is computed as base + offset and the value is
+ *  stored through a CASTPTR32 pointer.
+ */
+void IRFactory::op_st_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_st_i32);
+
+    Register &In1 = Reg[args[0]];
+    Register &In2 = Reg[args[1]];
+    TCGArg Off = args[2];
+
+    AssertType(In1.Size == 32);
+
+    Value *Data = LoadState(In1);
+
+    if (!In2.isRev()) {
+        Value *Addr = LoadState(In2);
+        if (Addr->getType() != Int8PtrTy)
+            Addr = CASTPTR8(Addr);
+        Addr = GetElementPtrInst::CreateInBounds(Addr, CONSTPtr(Off), "", LastInst);
+        Addr = CASTPTR32(Addr);
+        new StoreInst(Data, Addr, false, LastInst);
+        return;
+    }
+
+    Value *StateAddr = StatePointer(In2, Off, Int32PtrTy);
+    StoreInst *StateStore = new StoreInst(Data, StateAddr, false, LastInst);
+    if (isStateOfPC(Off))
+        StateStore->setVolatile(true);
+}
+
+/* arith */
+/*
+ * op_add_i32()
+ *  args[0]: Out
+ *  args[1]: In1
+ *  args[2]: In2
+ *
+ *  Out = In1 + In2. When In1 satisfies isRev(), the sum is instead formed
+ *  as the cached state pointer for offset In2, which therefore has to be a
+ *  constant; a non-constant In2 is reported through IRError().
+ */
+void IRFactory::op_add_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_add_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 32 && In1.Size == 32 && In2.Size == 32);
+
+    Value *InData1 = LoadState(In1);
+    Value *InData2 = LoadState(In2);
+    Value *OutData;
+    if (In1.isRev()) {
+        /* The offset is read out of the IR constant. Check the dynamic type
+         * first: blindly casting an arbitrary Value to ConstantInt and
+         * reading it would be undefined behavior. */
+        ConstantInt *ConstOff = dyn_cast<ConstantInt>(InData2);
+        if (!ConstOff)
+            IRError("%s: non-constant offset.\n", __func__);
+        intptr_t Off = ConstOff->getSExtValue();
+        OutData = StatePointer(In1, Off, Int32PtrTy);
+    } else
+        OutData = ADD(InData1, InData2);
+
+    Out.setData(OutData, true);
+}
+
+/*
+ * op_sub_i32()
+ *  args[0]: Out
+ *  args[1]: In1
+ *  args[2]: In2
+ *
+ *  Out = SUB(In1, In2).
+ */
+void IRFactory::op_sub_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_sub_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 32 && In1.Size == 32 && In2.Size == 32);
+
+    Value *LHS = LoadState(In1);
+    Value *RHS = LoadState(In2);
+    Out.setData(SUB(LHS, RHS), true);
+}
+
+/*
+ * op_mul_i32()
+ *  args[0]: Out
+ *  args[1]: In1
+ *  args[2]: In2
+ *
+ *  Out = MUL(In1, In2).
+ */
+void IRFactory::op_mul_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_mul_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 32 && In1.Size == 32 && In2.Size == 32);
+
+    Value *LHS = LoadState(In1);
+    Value *RHS = LoadState(In2);
+    Out.setData(MUL(LHS, RHS), true);
+}
+
+/*
+ * op_div_i32()
+ *  args[0]: Out
+ *  args[1]: In1 (dividend)
+ *  args[2]: In2 (divisor)
+ *
+ *  Out = SDIV(In1, In2).
+ */
+void IRFactory::op_div_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_div_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 32 && In1.Size == 32 && In2.Size == 32);
+
+    Value *Dividend = LoadState(In1);
+    Value *Divisor = LoadState(In2);
+    Out.setData(SDIV(Dividend, Divisor), true);
+}
+
+/*
+ * op_divu_i32()
+ *  args[0]: Out
+ *  args[1]: In1 (dividend)
+ *  args[2]: In2 (divisor)
+ *
+ *  Out = UDIV(In1, In2).
+ */
+void IRFactory::op_divu_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_divu_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 32 && In1.Size == 32 && In2.Size == 32);
+
+    Value *Dividend = LoadState(In1);
+    Value *Divisor = LoadState(In2);
+    Out.setData(UDIV(Dividend, Divisor), true);
+}
+
+/*
+ * op_rem_i32()
+ *  args[0]: Out
+ *  args[1]: In1 (dividend)
+ *  args[2]: In2 (divisor)
+ *
+ *  Out = SREM(In1, In2).
+ */
+void IRFactory::op_rem_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_rem_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 32 && In1.Size == 32 && In2.Size == 32);
+
+    Value *Dividend = LoadState(In1);
+    Value *Divisor = LoadState(In2);
+    Out.setData(SREM(Dividend, Divisor), true);
+}
+
+/*
+ * op_remu_i32()
+ *  args[0]: Out
+ *  args[1]: In1 (dividend)
+ *  args[2]: In2 (divisor)
+ *
+ *  Out = UREM(In1, In2).
+ */
+void IRFactory::op_remu_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_remu_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 32 && In1.Size == 32 && In2.Size == 32);
+
+    Value *Dividend = LoadState(In1);
+    Value *Divisor = LoadState(In2);
+    Out.setData(UREM(Dividend, Divisor), true);
+}
+
+/*
+ * op_div2_i32()
+ *  args[0]: Out1 (receives SDIV(In1, In3))
+ *  args[1]: Out2 (receives SREM(In1, In3))
+ *  args[2]: In1
+ *  args[3]: In2 (not read by this translation)
+ *  args[4]: In3
+ */
+void IRFactory::op_div2_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_div2_i32);
+
+    Register &Out1 = Reg[args[0]];
+    Register &Out2 = Reg[args[1]];
+    Register &In1 = Reg[args[2]];
+    Register &In3 = Reg[args[4]];
+
+    AssertType(Out1.Size == 32 && Out2.Size == 32 &&
+               In1.Size == 32 && In3.Size == 32);
+
+    Value *Dividend = LoadState(In1);
+    Value *Divisor = LoadState(In3);
+
+    Value *Quotient = SDIV(Dividend, Divisor);
+    Value *Remainder = SREM(Dividend, Divisor);
+    Out1.setData(Quotient, true);
+    Out2.setData(Remainder, true);
+}
+
+/*
+ * op_divu2_i32()
+ *  args[0]: Out1 (receives UDIV(In1, In3))
+ *  args[1]: Out2 (receives UREM(In1, In3))
+ *  args[2]: In1
+ *  args[3]: In2 (not read by this translation)
+ *  args[4]: In3
+ */
+void IRFactory::op_divu2_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_divu2_i32);
+
+    Register &Out1 = Reg[args[0]];
+    Register &Out2 = Reg[args[1]];
+    Register &In1 = Reg[args[2]];
+    Register &In3 = Reg[args[4]];
+
+    AssertType(Out1.Size == 32 && Out2.Size == 32 &&
+               In1.Size == 32 && In3.Size == 32);
+
+    Value *Dividend = LoadState(In1);
+    Value *Divisor = LoadState(In3);
+
+    Value *Quotient = UDIV(Dividend, Divisor);
+    Value *Remainder = UREM(Dividend, Divisor);
+    Out1.setData(Quotient, true);
+    Out2.setData(Remainder, true);
+}
+
+/*
+ * op_and_i32()
+ *  args[0]: Out
+ *  args[1]: In1
+ *  args[2]: In2
+ *
+ *  Out = AND(In1, In2).
+ */
+void IRFactory::op_and_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_and_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 32 && In1.Size == 32 && In2.Size == 32);
+
+    Value *LHS = LoadState(In1);
+    Value *RHS = LoadState(In2);
+    Out.setData(AND(LHS, RHS), true);
+}
+
+/*
+ * op_or_i32()
+ *  args[0]: Out
+ *  args[1]: In1
+ *  args[2]: In2
+ *
+ *  Out = OR(In1, In2).
+ */
+void IRFactory::op_or_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_or_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 32 && In1.Size == 32 && In2.Size == 32);
+
+    Value *LHS = LoadState(In1);
+    Value *RHS = LoadState(In2);
+    Out.setData(OR(LHS, RHS), true);
+}
+
+/*
+ * op_xor_i32()
+ *  args[0]: Out
+ *  args[1]: In1
+ *  args[2]: In2
+ *  Out <- XOR(In1, In2); AssertType() checks all registers are 32 bits.
+ */
+void IRFactory::op_xor_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_xor_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 32 && In1.Size == 32 && In2.Size == 32);
+
+    Value *Lhs = LoadState(In1);
+    Value *Rhs = LoadState(In2);
+    Out.setData(XOR(Lhs, Rhs), true);
+}
+
+/* shifts/rotates */
+/*
+ * op_shl_i32()
+ *  args[0]: Out
+ *  args[1]: In1 (value being shifted)
+ *  args[2]: In2 (shift count)
+ *  Out <- SHL(In1, In2); AssertType() checks all registers are 32 bits.
+ */
+void IRFactory::op_shl_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_shl_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 32 && In1.Size == 32 && In2.Size == 32);
+
+    Value *Val = LoadState(In1);
+    Value *Count = LoadState(In2);
+    Out.setData(SHL(Val, Count), true);
+}
+
+/*
+ * op_shr_i32()
+ *  args[0]: Out
+ *  args[1]: In1 (value being shifted)
+ *  args[2]: In2 (shift count)
+ *  Out <- LSHR(In1, In2); AssertType() checks all registers are 32 bits.
+ */
+void IRFactory::op_shr_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_shr_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 32 && In1.Size == 32 && In2.Size == 32);
+
+    Value *Val = LoadState(In1);
+    Value *Count = LoadState(In2);
+    Out.setData(LSHR(Val, Count), true);
+}
+
+/*
+ * op_sar_i32()
+ *  args[0]: Out
+ *  args[1]: In1 (value being shifted)
+ *  args[2]: In2 (shift count)
+ *  Out <- ASHR(In1, In2); AssertType() checks all registers are 32 bits.
+ */
+void IRFactory::op_sar_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_sar_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 32 && In1.Size == 32 && In2.Size == 32);
+
+    Value *Val = LoadState(In1);
+    Value *Count = LoadState(In2);
+    Out.setData(ASHR(Val, Count), true);
+}
+
+/*
+ * op_rotl_i32()
+ *  args[0]: Out (In1 rotated left by In2 bits)
+ *  args[1]: In1
+ *  args[2]: In2
+ */
+void IRFactory::op_rotl_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_rotl_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 32 && In1.Size == 32 && In2.Size == 32);
+
+    Value *InData1 = LoadState(In1);
+    Value *InData2 = LoadState(In2);
+
+    /* The wrap-around count is (-In2) & 31 rather than 32 - In2, so that a
+     * rotate by 0 never shifts by the full width (undefined/poison in LLVM). */
+    Value *Wrap = LSHR(InData1, AND(SUB(CONST32(0), InData2), CONST32(31)));
+    Out.setData(OR(SHL(InData1, InData2), Wrap), true);
+}
+
+/* op_rotr_i32(): args[0] <- args[1] rotated right by args[2] bits. */
+void IRFactory::op_rotr_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_rotr_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 32 && In1.Size == 32 && In2.Size == 32);
+
+    Value *InData1 = LoadState(In1);
+    Value *InData2 = LoadState(In2);
+
+    /* Wrap count is (-In2) & 31, not 32 - In2: no full-width shift at 0. */
+    Value *Wrap = SHL(InData1, AND(SUB(CONST32(0), InData2), CONST32(31)));
+    Out.setData(OR(LSHR(InData1, InData2), Wrap), true);
+}
+
+/*
+ * op_deposit_i32()
+ *  args[0]: Out
+ *  args[1]: In1 (value that receives the field)
+ *  args[2]: In2 (value whose low bits supply the field)
+ *  args[3]: In3 (field offset from LSB, used as an immediate)
+ *  args[4]: In4 (field length, used as an immediate)
+ */
+void IRFactory::op_deposit_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_deposit_i32);
+
+    /* FieldMask covers bits [args[3], args[3] + args[4]): the slot of In1
+     * that is overwritten with the low args[4] bits of In2. */
+    APInt FieldMask = APInt::getBitsSet(32, args[3], args[3] + args[4]);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 32 && In1.Size == 32 && In2.Size == 32);
+
+    Value *Base = LoadState(In1);
+    Value *Field = LoadState(In2);
+    if (args[3])
+        Field = SHL(Field, CONST32(args[3]));
+    Field = AND(Field, ConstantInt::get(*Context, FieldMask));
+    Base = AND(Base, ConstantInt::get(*Context, ~FieldMask));
+    /* Merge the shifted, masked field into the cleared slot. */
+    Out.setData(OR(Base, Field), true);
+}
+
+/*
+ * op_brcond_i32()
+ *  args[0]: In1
+ *  args[1]: In2
+ *  args[2]: In3 (condition code, mapped through getPred())
+ *  args[3]: In4 (label of the taken target)
+ */
+void IRFactory::op_brcond_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_brcond_i32);
+
+    /* Shape of the control flow built here:
+     *  brcond_i32 op1,op2,cond,<Taken>
+     *  <NotTaken>:     (fresh block; becomes CurrBB)
+     *      A
+     *  <Taken>:        (block kept in Labels[args[3]])
+     *      B
+     */
+    Register &In1 = Reg[args[0]];
+    Register &In2 = Reg[args[1]];
+    CmpInst::Predicate Pred = getPred(args[2]);
+
+    AssertType(In1.Size == 32 && In2.Size == 32);
+
+    Value *Lhs = LoadState(In1);
+    Value *Rhs = LoadState(In2);
+
+    SaveGlobals(COHERENCE_ALL, LastInst);
+
+    /* Reuse the block already recorded for this label, or create it now. */
+    TCGArg Target = args[3];
+    if (Labels.find(Target) == Labels.end())
+        Labels[Target] = BasicBlock::Create(*Context, "succ", Func);
+    BasicBlock *Taken = Labels[Target];
+    BasicBlock *NotTaken = BasicBlock::Create(*Context, "succ", Func);
+
+    /* The conditional branch goes in before LastInst, which is then erased. */
+    BranchInst::Create(Taken, NotTaken, ICMP(Lhs, Rhs, Pred), LastInst);
+    LastInst->eraseFromParent();
+
+    CurrBB = NotTaken;
+    LastInst = BranchInst::Create(ExitBB, CurrBB);
+}
+
+/* op_add2_i32(): (Out2:Out1) <- (In2:In1) + (In4:In3), each pair as 64 bits. */
+void IRFactory::op_add2_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_add2_i32);
+
+    Register &Out1 = Reg[args[0]];
+    Register &Out2 = Reg[args[1]];
+    Register &In1 = Reg[args[2]];
+    Register &In2 = Reg[args[3]];
+    Register &In3 = Reg[args[4]];
+    Register &In4 = Reg[args[5]];
+
+    AssertType(Out1.Size == 32 && Out2.Size == 32 &&
+               In1.Size == 32 && In2.Size == 32 &&
+               In3.Size == 32 && In4.Size == 32);
+
+    Value *LoA = LoadState(In1);
+    Value *HiA = LoadState(In2);
+    Value *LoB = LoadState(In3);
+    Value *HiB = LoadState(In4);
+
+    /* Fuse each pair into one 64-bit value: (high << 32) | low. */
+    LoA = ZEXT64(LoA);
+    HiA = SHL(ZEXT64(HiA), CONST64(32));
+    Value *WideA = OR(HiA, LoA);
+    LoB = ZEXT64(LoB);
+    HiB = SHL(ZEXT64(HiB), CONST64(32));
+    Value *WideB = OR(HiB, LoB);
+    Value *Sum = ADD(WideA, WideB);
+
+    Value *SumLo = TRUNC32(Sum);
+    Value *SumHi = TRUNC32(LSHR(Sum, CONST64(32)));
+    Out1.setData(SumLo, true);
+    Out2.setData(SumHi, true);
+}
+
+/* op_sub2_i32(): (Out2:Out1) <- (In2:In1) - (In4:In3), each pair as 64 bits. */
+void IRFactory::op_sub2_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_sub2_i32);
+
+    Register &Out1 = Reg[args[0]];
+    Register &Out2 = Reg[args[1]];
+    Register &In1 = Reg[args[2]];
+    Register &In2 = Reg[args[3]];
+    Register &In3 = Reg[args[4]];
+    Register &In4 = Reg[args[5]];
+
+    AssertType(Out1.Size == 32 && Out2.Size == 32 &&
+               In1.Size == 32 && In2.Size == 32 &&
+               In3.Size == 32 && In4.Size == 32);
+
+    Value *LoA = LoadState(In1);
+    Value *HiA = LoadState(In2);
+    Value *LoB = LoadState(In3);
+    Value *HiB = LoadState(In4);
+
+    /* Fuse each pair into one 64-bit value: (high << 32) | low. */
+    LoA = ZEXT64(LoA);
+    HiA = SHL(ZEXT64(HiA), CONST64(32));
+    Value *WideA = OR(HiA, LoA);
+    LoB = ZEXT64(LoB);
+    HiB = SHL(ZEXT64(HiB), CONST64(32));
+    Value *WideB = OR(HiB, LoB);
+    Value *Diff = SUB(WideA, WideB);
+
+    Value *DiffLo = TRUNC32(Diff);
+    Value *DiffHi = TRUNC32(LSHR(Diff, CONST64(32)));
+    Out1.setData(DiffLo, true);
+    Out2.setData(DiffHi, true);
+}
+
+/* op_mulu2_i32(): (Out2:Out1) <- MUL(ZEXT64(In1), ZEXT64(In2)), high:low. */
+void IRFactory::op_mulu2_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_mulu2_i32);
+
+    Register &Out1 = Reg[args[0]];
+    Register &Out2 = Reg[args[1]];
+    Register &In1 = Reg[args[2]];
+    Register &In2 = Reg[args[3]];
+
+    AssertType(Out1.Size == 32 && Out2.Size == 32 &&
+               In1.Size == 32 && In2.Size == 32);
+
+    Value *Lhs = LoadState(In1);
+    Value *Rhs = LoadState(In2);
+    Lhs = ZEXT64(Lhs);
+    Rhs = ZEXT64(Rhs);
+
+    Value *Product = MUL(Lhs, Rhs);
+    Value *ProdLo = TRUNC32(Product);
+    Value *ProdHi = TRUNC32(LSHR(Product, CONST64(32)));
+    Out1.setData(ProdLo, true);
+    Out2.setData(ProdHi, true);
+}
+
+/* op_muls2_i32(): (Out2:Out1) <- MUL(SEXT64(In1), SEXT64(In2)), high:low. */
+void IRFactory::op_muls2_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_muls2_i32);
+
+    Register &Out1 = Reg[args[0]];
+    Register &Out2 = Reg[args[1]];
+    Register &In1 = Reg[args[2]];
+    Register &In2 = Reg[args[3]];
+
+    AssertType(Out1.Size == 32 && Out2.Size == 32 &&
+               In1.Size == 32 && In2.Size == 32);
+
+    Value *Lhs = LoadState(In1);
+    Value *Rhs = LoadState(In2);
+    Lhs = SEXT64(Lhs);
+    Rhs = SEXT64(Rhs);
+
+    Value *Product = MUL(Lhs, Rhs);
+    Value *ProdLo = TRUNC32(Product);
+    Value *ProdHi = TRUNC32(LSHR(Product, CONST64(32)));
+    Out1.setData(ProdLo, true);
+    Out2.setData(ProdHi, true);
+}
+
+/* op_muluh_i32(): Out1 <- high 32 bits of MUL(ZEXT64(In1), ZEXT64(In2)). */
+void IRFactory::op_muluh_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_muluh_i32);
+
+    Register &Out1 = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out1.Size == 32 && In1.Size == 32 && In2.Size == 32);
+
+    Value *Lhs = LoadState(In1);
+    Value *Rhs = LoadState(In2);
+    Lhs = ZEXT64(Lhs);
+    Rhs = ZEXT64(Rhs);
+
+    Value *Product = MUL(Lhs, Rhs);
+    /* Keep only bits 63..32 of the 64-bit product. */
+    Out1.setData(TRUNC32(LSHR(Product, CONST64(32))), true);
+}
+
+/* op_mulsh_i32(): Out1 <- high 32 bits of MUL(SEXT64(In1), SEXT64(In2)). */
+void IRFactory::op_mulsh_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_mulsh_i32);
+
+    Register &Out1 = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out1.Size == 32 && In1.Size == 32 && In2.Size == 32);
+
+    Value *Lhs = LoadState(In1);
+    Value *Rhs = LoadState(In2);
+    Lhs = SEXT64(Lhs);
+    Rhs = SEXT64(Rhs);
+
+    Value *Product = MUL(Lhs, Rhs);
+    /* Keep only bits 63..32 of the 64-bit product. */
+    Out1.setData(TRUNC32(LSHR(Product, CONST64(32))), true);
+}
+
+/*
+ * op_brcond2_i32(): branch to label args[5] when the 64-bit comparison
+ *  (In2:In1) <args[4]> (In4:In3) holds; In1..In4 are args[0..3].
+ */
+void IRFactory::op_brcond2_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_brcond2_i32);
+
+    Register &In1 = Reg[args[0]];
+    Register &In2 = Reg[args[1]];
+    Register &In3 = Reg[args[2]];
+    Register &In4 = Reg[args[3]];
+    CmpInst::Predicate Pred = getPred(args[4]);
+
+    AssertType(In1.Size == 32 && In2.Size == 32 &&
+               In3.Size == 32 && In4.Size == 32);
+
+    Value *LoA = LoadState(In1);
+    Value *HiA = LoadState(In2);
+    Value *LoB = LoadState(In3);
+    Value *HiB = LoadState(In4);
+
+    SaveGlobals(COHERENCE_ALL, LastInst);
+    /* Fuse each pair into one 64-bit value: (high << 32) | low. */
+    LoA = ZEXT64(LoA);
+    HiA = SHL(ZEXT64(HiA), CONST64(32));
+    LoB = ZEXT64(LoB);
+    HiB = SHL(ZEXT64(HiB), CONST64(32));
+    Value *WideA = OR(HiA, LoA);
+    Value *WideB = OR(HiB, LoB);
+
+    TCGArg Target = args[5];
+    if (Labels.find(Target) == Labels.end())
+        Labels[Target] = BasicBlock::Create(*Context, "succ", Func);
+    BasicBlock *Taken = Labels[Target];
+    BasicBlock *NotTaken = BasicBlock::Create(*Context, "succ", Func);
+
+    BranchInst::Create(Taken, NotTaken, ICMP(WideA, WideB, Pred), LastInst);
+    LastInst->eraseFromParent();
+    CurrBB = NotTaken;
+    LastInst = BranchInst::Create(ExitBB, CurrBB);
+}
+
+/* op_setcond2_i32(): Out <- ZEXT32((In2:In1) <args[5]> (In4:In3));
+ * Out is args[0] and In1..In4 are args[1..4]. */
+void IRFactory::op_setcond2_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_setcond2_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+    Register &In3 = Reg[args[3]];
+    Register &In4 = Reg[args[4]];
+    CmpInst::Predicate Pred = getPred(args[5]);
+
+    AssertType(Out.Size == 32 && In1.Size == 32 && In2.Size == 32 &&
+               In3.Size == 32 && In4.Size == 32);
+
+    Value *LoA = LoadState(In1);
+    Value *HiA = LoadState(In2);
+    Value *LoB = LoadState(In3);
+    Value *HiB = LoadState(In4);
+
+    LoA = ZEXT64(LoA);
+    HiA = SHL(ZEXT64(HiA), CONST64(32));
+    LoB = ZEXT64(LoB);
+    HiB = SHL(ZEXT64(HiB), CONST64(32));
+    Value *WideA = OR(HiA, LoA);
+    Value *WideB = OR(HiB, LoB);
+
+    Value *Flag = ICMP(WideA, WideB, Pred);
+    Out.setData(ZEXT32(Flag), true);
+}
+
+/* op_ext8s_i32(): Out (args[0]) <- SEXT32(TRUNC8(In)), In being args[1]. */
+void IRFactory::op_ext8s_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_ext8s_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+
+    AssertType(Out.Size == 32 && In.Size == 32);
+
+    Value *Src = LoadState(In);
+    Value *Low8 = TRUNC8(Src);
+    Out.setData(SEXT32(Low8), true);
+}
+
+/* op_ext16s_i32(): Out (args[0]) <- SEXT32(TRUNC16(In)), In being args[1]. */
+void IRFactory::op_ext16s_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_ext16s_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+
+    AssertType(Out.Size == 32 && In.Size == 32);
+
+    Value *Src = LoadState(In);
+    Value *Low16 = TRUNC16(Src);
+    Out.setData(SEXT32(Low16), true);
+}
+
+/* op_ext8u_i32(): Out (args[0]) <- AND(In, 0xFF), In being args[1]. */
+void IRFactory::op_ext8u_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_ext8u_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+
+    AssertType(Out.Size == 32 && In.Size == 32);
+
+    Value *Src = LoadState(In);
+    Out.setData(AND(Src, CONST32(0xFF)), true);
+}
+
+/* op_ext16u_i32(): Out (args[0]) <- AND(In, 0xFFFF), In being args[1]. */
+void IRFactory::op_ext16u_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_ext16u_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+
+    AssertType(Out.Size == 32 && In.Size == 32);
+
+    Value *Src = LoadState(In);
+    Out.setData(AND(Src, CONST32(0xFFFF)), true);
+}
+
+/* op_bswap16_i32(): Out (args[0]) <- ZEXT32(BSWAP16(TRUNC16(In))). */
+void IRFactory::op_bswap16_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_bswap16_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+
+    AssertType(Out.Size == 32 && In.Size == 32);
+
+    Value *Src = LoadState(In);
+    Value *Half = TRUNC16(Src);
+    Value *Swapped = BSWAP16(Half);
+    Out.setData(ZEXT32(Swapped), true);
+}
+
+/* op_bswap32_i32(): Out (args[0]) <- BSWAP32(In), In being args[1]. */
+void IRFactory::op_bswap32_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_bswap32_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+
+    AssertType(Out.Size == 32 && In.Size == 32);
+
+    Value *Src = LoadState(In);
+    Out.setData(BSWAP32(Src), true);
+}
+
+/*
+ * op_not_i32()
+ *  args[0]: Out
+ *  args[1]: In
+ *  Out <- XOR(In, 0xFFFFFFFF), i.e. every bit of In inverted.
+ */
+void IRFactory::op_not_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_not_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+
+    AssertType(Out.Size == 32 && In.Size == 32);
+
+    Value *Src = LoadState(In);
+    Out.setData(XOR(Src, CONST32((uint32_t)-1)), true);
+}
+
+/*
+ * op_neg_i32()
+ *  args[0]: Out
+ *  args[1]: In
+ *  Out <- SUB(0, In).
+ */
+void IRFactory::op_neg_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_neg_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+
+    AssertType(Out.Size == 32 && In.Size == 32);
+
+    Value *Src = LoadState(In);
+    Out.setData(SUB(CONST32(0), Src), true);
+}
+
+/* op_andc_i32(): Out <- AND(In1, ~In2), with ~In2 built as XOR(In2, -1). */
+void IRFactory::op_andc_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_andc_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 32 && In1.Size == 32 && In2.Size == 32);
+
+    Value *Lhs = LoadState(In1);
+    Value *Rhs = LoadState(In2);
+
+    Value *NotRhs = XOR(Rhs, CONST32((uint32_t)-1));
+    Out.setData(AND(Lhs, NotRhs), true);
+}
+
+/* op_orc_i32(): Out <- OR(In1, ~In2), with ~In2 built as XOR(In2, -1). */
+void IRFactory::op_orc_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_orc_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 32 && In1.Size == 32 && In2.Size == 32);
+
+    Value *Lhs = LoadState(In1);
+    Value *Rhs = LoadState(In2);
+
+    Value *NotRhs = XOR(Rhs, CONST32((uint32_t)-1));
+    Out.setData(OR(Lhs, NotRhs), true);
+}
+
+/* op_eqv_i32(): Out <- XOR(In1, ~In2), with ~In2 built as XOR(In2, -1). */
+void IRFactory::op_eqv_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_eqv_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 32 && In1.Size == 32 && In2.Size == 32);
+
+    Value *Lhs = LoadState(In1);
+    Value *Rhs = LoadState(In2);
+
+    Value *NotRhs = XOR(Rhs, CONST32((uint32_t)-1));
+    Out.setData(XOR(Lhs, NotRhs), true);
+}
+
+/* op_nand_i32(): Out <- ~AND(In1, In2), the inversion being XOR(..., -1). */
+void IRFactory::op_nand_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_nand_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 32 && In1.Size == 32 && In2.Size == 32);
+
+    Value *Lhs = LoadState(In1);
+    Value *Rhs = LoadState(In2);
+
+    Value *Conj = AND(Lhs, Rhs);
+    Out.setData(XOR(Conj, CONST32((uint32_t)-1)), true);
+}
+
+/* op_nor_i32(): Out <- ~OR(In1, In2), the inversion being XOR(..., -1). */
+void IRFactory::op_nor_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_nor_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 32 && In1.Size == 32 && In2.Size == 32);
+
+    Value *Lhs = LoadState(In1);
+    Value *Rhs = LoadState(In2);
+
+    Value *Disj = OR(Lhs, Rhs);
+    Out.setData(XOR(Disj, CONST32((uint32_t)-1)), true);
+}
+
+/* op_mov_i64(): Out <- In, passed through ZEXT64 when In is not 64 bits. */
+void IRFactory::op_mov_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_mov_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+
+    AssertType(Out.Size == 64);
+
+    Value *Src = LoadState(In);
+    /* Only Out is size-checked above; the source width is tested here. */
+    size_t SrcBits = DL->getTypeSizeInBits(Src->getType());
+    if (SrcBits != 64)
+        Src = ZEXT64(Src);
+    Out.setData(Src, true);
+}
+
+/*
+ * op_movi_i64()
+ *  args[0]: Out (64-bit register receiving the constant)
+ *  args[1]: In  (const value, wrapped with CONST64())
+ */
+void IRFactory::op_movi_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_movi_i64);
+
+    Register &Out = Reg[args[0]];
+    AssertType(Out.Size == 64);
+
+    Value *Imm = CONST64(args[1]);
+    Out.setData(Imm, true);
+}
+
+/* op_setcond_i64(): Out <- ZEXT64(ICMP(In1, In2, getPred(args[3]))). */
+void IRFactory::op_setcond_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_setcond_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+    CmpInst::Predicate Pred = getPred(args[3]);
+
+    AssertType(Out.Size == 64 && In1.Size == 64 && In2.Size == 64);
+
+    Value *Lhs = LoadState(In1);
+    Value *Rhs = LoadState(In2);
+    Value *Flag = ICMP(Lhs, Rhs, Pred);
+    Out.setData(ZEXT64(Flag), true);
+}
+
+/* op_movcond_i64(): Out <- (In1 <args[5]> In2) ? In3 : In4. */
+void IRFactory::op_movcond_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_movcond_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+    Register &In3 = Reg[args[3]];
+    Register &In4 = Reg[args[4]];
+    CmpInst::Predicate Pred = getPred(args[5]);
+
+    AssertType(Out.Size == 64 && In1.Size == 64 && In2.Size == 64 &&
+               In3.Size == 64 && In4.Size == 64);
+
+    Value *Lhs = LoadState(In1);
+    Value *Rhs = LoadState(In2);
+    Value *IfTrue = LoadState(In3);
+    Value *IfFalse = LoadState(In4);
+    Value *Cond = ICMP(Lhs, Rhs, Pred);
+    Out.setData(SelectInst::Create(Cond, IfTrue, IfFalse, "", LastInst), true);
+}
+
+
+/* load/store */
+/* op_ld8u_i64(): Out <- ZEXT64 of the value loaded at In + args[2] (i8 ptr). */
+void IRFactory::op_ld8u_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_ld8u_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+    TCGArg Offset = args[2];
+
+    AssertType(Out.Size == 64);
+
+    Value *Addr = LoadState(In);
+    if (Addr->getType() != Int8PtrTy)
+        Addr = CASTPTR8(Addr);
+
+    Addr = GetElementPtrInst::CreateInBounds(Addr, CONSTPtr(Offset), "", LastInst);
+    Value *Loaded = new LoadInst(Addr, "", false, LastInst);
+    Out.setData(ZEXT64(Loaded), true);
+}
+
+/* op_ld8s_i64(): Out <- SEXT64 of the value loaded at In + args[2] (i8 ptr). */
+void IRFactory::op_ld8s_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_ld8s_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+    TCGArg Offset = args[2];
+
+    AssertType(Out.Size == 64);
+
+    Value *Addr = LoadState(In);
+    if (Addr->getType() != Int8PtrTy)
+        Addr = CASTPTR8(Addr);
+
+    Addr = GetElementPtrInst::CreateInBounds(Addr, CONSTPtr(Offset), "", LastInst);
+    Value *Loaded = new LoadInst(Addr, "", false, LastInst);
+    Out.setData(SEXT64(Loaded), true);
+}
+
+/* op_ld16u_i64(): Out <- ZEXT64 of the load through CASTPTR16(In + args[2]). */
+void IRFactory::op_ld16u_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_ld16u_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+    TCGArg Offset = args[2];
+
+    AssertType(Out.Size == 64);
+
+    Value *Addr = LoadState(In);
+    if (Addr->getType() != Int8PtrTy)
+        Addr = CASTPTR8(Addr);
+
+    Addr = GetElementPtrInst::CreateInBounds(Addr, CONSTPtr(Offset), "", LastInst);
+    Addr = CASTPTR16(Addr);
+    Value *Loaded = new LoadInst(Addr, "", false, LastInst);
+    Out.setData(ZEXT64(Loaded), true);
+}
+
+/* op_ld16s_i64(): Out <- SEXT64 of the load through CASTPTR16(In + args[2]). */
+void IRFactory::op_ld16s_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_ld16s_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+    TCGArg Offset = args[2];
+
+    AssertType(Out.Size == 64);
+
+    Value *Addr = LoadState(In);
+    if (Addr->getType() != Int8PtrTy)
+        Addr = CASTPTR8(Addr);
+
+    Addr = GetElementPtrInst::CreateInBounds(Addr, CONSTPtr(Offset), "", LastInst);
+    Addr = CASTPTR16(Addr);
+    Value *Loaded = new LoadInst(Addr, "", false, LastInst);
+    Out.setData(SEXT64(Loaded), true);
+}
+
+/* op_ld32u_i64(): Out <- ZEXT64 of the load through CASTPTR32(In + args[2]). */
+void IRFactory::op_ld32u_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_ld32u_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+    TCGArg Offset = args[2];
+
+    AssertType(Out.Size == 64);
+
+    Value *Addr = LoadState(In);
+    if (Addr->getType() != Int8PtrTy)
+        Addr = CASTPTR8(Addr);
+
+    Addr = GetElementPtrInst::CreateInBounds(Addr, CONSTPtr(Offset), "", LastInst);
+    Addr = CASTPTR32(Addr);
+    Value *Loaded = new LoadInst(Addr, "", false, LastInst);
+    Out.setData(ZEXT64(Loaded), true);
+}
+
+/* op_ld32s_i64(): Out <- SEXT64 of the load through CASTPTR32(In + args[2]). */
+void IRFactory::op_ld32s_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_ld32s_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+    TCGArg Offset = args[2];
+
+    AssertType(Out.Size == 64);
+
+    Value *Addr = LoadState(In);
+    if (Addr->getType() != Int8PtrTy)
+        Addr = CASTPTR8(Addr);
+
+    Addr = GetElementPtrInst::CreateInBounds(Addr, CONSTPtr(Offset), "", LastInst);
+    Addr = CASTPTR32(Addr);
+    Value *Loaded = new LoadInst(Addr, "", false, LastInst);
+    Out.setData(SEXT64(Loaded), true);
+}
+
+/*
+ * op_ld_i64()
+ *  args[0]: Out
+ *  args[1]: In  (base; addressed via StatePointer() when In.isRev())
+ *  args[2]: In  (offset; a load at an isStateOfPC() offset is made volatile)
+ */
+void IRFactory::op_ld_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_ld_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+    TCGArg Offset = args[2];
+
+    AssertType(Out.Size == 64);
+
+    Value *Loaded;
+    if (!In.isRev()) {
+        Value *Addr = LoadState(In);
+        if (Addr->getType() != Int8PtrTy)
+            Addr = CASTPTR8(Addr);
+        Addr = GetElementPtrInst::CreateInBounds(Addr, CONSTPtr(Offset), "", LastInst);
+        Loaded = new LoadInst(CASTPTR64(Addr), "", false, LastInst);
+    } else {
+        Value *Slot = StatePointer(In, Offset, Int64PtrTy);
+        LoadInst *LI = new LoadInst(Slot, "", false, LastInst);
+        if (isStateOfPC(Offset))
+            LI->setVolatile(true);
+        Loaded = LI;
+    }
+    Out.setData(Loaded, true);
+}
+
+/* op_st8_i64(): store TRUNC8(In1) through the i8 pointer In2 + args[2]. */
+void IRFactory::op_st8_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_st8_i64);
+
+    Register &In1 = Reg[args[0]];
+    Register &In2 = Reg[args[1]];
+    TCGArg Offset = args[2];
+
+    AssertType(In1.Size == 64);
+
+    Value *Data = LoadState(In1);
+    Value *Addr = LoadState(In2);
+    Data = TRUNC8(Data);
+    if (Addr->getType() != Int8PtrTy)
+        Addr = CASTPTR8(Addr);
+    Addr = GetElementPtrInst::CreateInBounds(Addr, CONSTPtr(Offset), "", LastInst);
+    new StoreInst(Data, Addr, false, LastInst);
+}
+
+/* op_st16_i64(): store TRUNC16(In1) through CASTPTR16(In2 + args[2]). */
+void IRFactory::op_st16_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_st16_i64);
+
+    Register &In1 = Reg[args[0]];
+    Register &In2 = Reg[args[1]];
+    TCGArg Offset = args[2];
+
+    AssertType(In1.Size == 64);
+
+    Value *Data = LoadState(In1);
+    Value *Addr = LoadState(In2);
+
+    Data = TRUNC16(Data);
+    if (Addr->getType() != Int8PtrTy)
+        Addr = CASTPTR8(Addr);
+    Addr = GetElementPtrInst::CreateInBounds(Addr, CONSTPtr(Offset), "", LastInst);
+    new StoreInst(Data, CASTPTR16(Addr), false, LastInst);
+}
+
+/* op_st32_i64(): store TRUNC32(In1) through CASTPTR32(In2 + args[2]). */
+void IRFactory::op_st32_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_st32_i64);
+
+    Register &In1 = Reg[args[0]];
+    Register &In2 = Reg[args[1]];
+    TCGArg Offset = args[2];
+
+    AssertType(In1.Size == 64);
+
+    Value *Data = LoadState(In1);
+    Value *Addr = LoadState(In2);
+
+    Data = TRUNC32(Data);
+    if (Addr->getType() != Int8PtrTy)
+        Addr = CASTPTR8(Addr);
+    Addr = GetElementPtrInst::CreateInBounds(Addr, CONSTPtr(Offset), "", LastInst);
+    new StoreInst(Data, CASTPTR32(Addr), false, LastInst);
+}
+
+/*
+ * op_st_i64()
+ *  args[0]: In1 (64-bit value to store)
+ *  args[1]: In2 (base; addressed via StatePointer() when In2.isRev())
+ *  args[2]: In3 (offset; a store at an isStateOfPC() offset is made volatile)
+ */
+void IRFactory::op_st_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_st_i64);
+
+    Register &In1 = Reg[args[0]];
+    Register &In2 = Reg[args[1]];
+    TCGArg Offset = args[2];
+
+    AssertType(In1.Size == 64);
+
+    Value *Data = LoadState(In1);
+
+    if (!In2.isRev()) {
+        Value *Addr = LoadState(In2);
+        if (Addr->getType() != Int8PtrTy)
+            Addr = CASTPTR8(Addr);
+        Addr = GetElementPtrInst::CreateInBounds(Addr, CONSTPtr(Offset), "", LastInst);
+        new StoreInst(Data, CASTPTR64(Addr), false, LastInst);
+        return;
+    }
+
+    Value *Slot = StatePointer(In2, Offset, Int64PtrTy);
+    StoreInst *SI = new StoreInst(Data, Slot, false, LastInst);
+    if (isStateOfPC(Offset))
+        SI->setVolatile(true);
+}
+
+/* arith */
+/*
+ * op_add_i64()
+ *  args[0]: Out
+ *  args[1]: In1 (when In1.isRev(), Out becomes a StatePointer() instead)
+ *  args[2]: In2
+ */
+void IRFactory::op_add_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_add_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 64 && In1.Size == 64 && In2.Size == 64);
+
+    Value *Lhs = LoadState(In1);
+    Value *Rhs = LoadState(In2);
+    if (!In1.isRev()) {
+        Out.setData(ADD(Lhs, Rhs), true);
+        return;
+    }
+    /* In2's value is cast to ConstantInt without a check and used as offset. */
+    intptr_t Offset = static_cast<ConstantInt*>(Rhs)->getSExtValue();
+    Value *StatePtr = StatePointer(In1, Offset, Int64PtrTy);
+    Out.setData(StatePtr, true);
+}
+
+/* op_sub_i64(): Out <- SUB(In1, In2); args[0..2] = Out, In1, In2. */
+void IRFactory::op_sub_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_sub_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 64 && In1.Size == 64 && In2.Size == 64);
+
+    Value *Lhs = LoadState(In1);
+    Value *Rhs = LoadState(In2);
+    Out.setData(SUB(Lhs, Rhs), true);
+}
+
+/*
+ * op_mul_i64()
+ *  args[0]: Out
+ *  args[1]: In1
+ *  args[2]: In2
+ *  Out <- MUL(In1, In2); AssertType() checks all registers are 64 bits.
+ */
+void IRFactory::op_mul_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_mul_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 64 && In1.Size == 64 && In2.Size == 64);
+
+    Value *Lhs = LoadState(In1);
+    Value *Rhs = LoadState(In2);
+    Out.setData(MUL(Lhs, Rhs), true);
+}
+
+/* op_div_i64(): Out <- SDIV(In1, In2); args[0..2] = Out, In1, In2. */
+void IRFactory::op_div_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_div_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 64 && In1.Size == 64 && In2.Size == 64);
+
+    Value *Dividend = LoadState(In1);
+    Value *Divisor = LoadState(In2);
+    Out.setData(SDIV(Dividend, Divisor), true);
+}
+
+/* op_divu_i64(): Out <- UDIV(In1, In2); args[0..2] = Out, In1, In2. */
+void IRFactory::op_divu_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_divu_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 64 && In1.Size == 64 && In2.Size == 64);
+
+    Value *Dividend = LoadState(In1);
+    Value *Divisor = LoadState(In2);
+    Out.setData(UDIV(Dividend, Divisor), true);
+}
+
+/* op_rem_i64(): Out <- SREM(In1, In2); args[0..2] = Out, In1, In2. */
+void IRFactory::op_rem_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_rem_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 64 && In1.Size == 64 && In2.Size == 64);
+
+    Value *Dividend = LoadState(In1);
+    Value *Divisor = LoadState(In2);
+    Out.setData(SREM(Dividend, Divisor), true);
+}
+
+/* op_remu_i64(): Out <- UREM(In1, In2); args[0..2] = Out, In1, In2. */
+void IRFactory::op_remu_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_remu_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 64 && In1.Size == 64 && In2.Size == 64);
+
+    Value *Dividend = LoadState(In1);
+    Value *Divisor = LoadState(In2);
+    Out.setData(UREM(Dividend, Divisor), true);
+}
+
+void IRFactory::op_div2_i64(const TCGArg *args)
+{
+    IRError("%s not implemented.\n", __func__); /* no translation; args is never read */
+}
+
+void IRFactory::op_divu2_i64(const TCGArg *args)
+{
+    IRError("%s not implemented.\n", __func__); /* no translation; args is never read */
+}
+
+/* op_and_i64(): Out <- AND(In1, In2). No AssertType() here; an operand
+ * whose loaded value is 32 bits wide is first extended with ZEXT64. */
+void IRFactory::op_and_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_and_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    Value *Lhs = LoadState(In1);
+    Value *Rhs = LoadState(In2);
+
+    size_t Bits = DL->getTypeSizeInBits(Lhs->getType());
+    if (Bits == 32)
+        Lhs = ZEXT64(Lhs);
+    Bits = DL->getTypeSizeInBits(Rhs->getType());
+    if (Bits == 32)
+        Rhs = ZEXT64(Rhs);
+    Out.setData(AND(Lhs, Rhs), true);
+}
+
+/* op_or_i64(): Out <- OR(In1, In2); args[0..2] = Out, In1, In2. */
+void IRFactory::op_or_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_or_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 64 && In1.Size == 64 && In2.Size == 64);
+
+    Value *Lhs = LoadState(In1);
+    Value *Rhs = LoadState(In2);
+    Out.setData(OR(Lhs, Rhs), true);
+}
+
+/* op_xor_i64(): Out <- XOR(In1, In2); args[0..2] = Out, In1, In2. */
+void IRFactory::op_xor_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_xor_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 64 && In1.Size == 64 && In2.Size == 64);
+
+    Value *Lhs = LoadState(In1);
+    Value *Rhs = LoadState(In2);
+    Out.setData(XOR(Lhs, Rhs), true);
+}
+
+/* shifts/rotates */
+/* op_shl_i64(): Out <- SHL(In1, In2); args[0..2] = Out, In1, In2. */
+void IRFactory::op_shl_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_shl_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 64 && In1.Size == 64 && In2.Size == 64);
+
+    Value *Val = LoadState(In1);
+    Value *Count = LoadState(In2);
+    Out.setData(SHL(Val, Count), true);
+}
+
+/*
+ * op_shr_i64()
+ *  args[0]: Out
+ *  args[1]: In1 (value)
+ *  args[2]: In2 (shift count)
+ *  Out = LSHR(In1, In2); all three registers must be 64 bits wide.
+ */
+void IRFactory::op_shr_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_shr_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 64 && In1.Size == 64 && In2.Size == 64);
+
+    Value *Val = LoadState(In1);
+    Value *Count = LoadState(In2);
+    Value *Shifted = LSHR(Val, Count);
+    Out.setData(Shifted, true);
+}
+
+/*
+ * op_sar_i64()
+ *  args[0]: Out
+ *  args[1]: In1 (value)
+ *  args[2]: In2 (shift count)
+ *  Out = ASHR(In1, In2); all three registers must be 64 bits wide.
+ */
+void IRFactory::op_sar_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_sar_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 64 && In1.Size == 64 && In2.Size == 64);
+
+    Value *Val = LoadState(In1);
+    Value *Count = LoadState(In2);
+    Value *Shifted = ASHR(Val, Count);
+    Out.setData(Shifted, true);
+}
+
+/*
+ * op_rotl_i64()
+ *  args[0]: Out
+ *  args[1]: In1 (value)
+ *  args[2]: In2 (rotate count)
+ *  Out = (In1 << In2) | (In1 >> ((0 - In2) & 63)); all three registers must
+ *  be 64 bits wide.
+ */
+void IRFactory::op_rotl_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_rotl_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 64 && In1.Size == 64 && In2.Size == 64);
+
+    Value *InData1 = LoadState(In1);
+    Value *InData2 = LoadState(In2);
+
+    /* The complementary count is taken modulo 64. For counts 1..63 this is
+     * the same as (64 - count); for a count of 0 it yields 0 rather than 64,
+     * so no shift by the full bit width (a poison value in LLVM IR) is ever
+     * emitted and a rotate by zero returns the input unchanged. */
+    Value *Comp = SUB(CONST64(0), InData2);
+    Comp = AND(Comp, CONST64(63));
+
+    Value *C = LSHR(InData1, Comp);
+    Value *OutData = SHL(InData1, InData2);
+    OutData = OR(OutData, C);
+    Out.setData(OutData, true);
+}
+
+/*
+ * op_rotr_i64()
+ *  args[0]: Out
+ *  args[1]: In1 (value)
+ *  args[2]: In2 (rotate count)
+ *  Out = (In1 >> In2) | (In1 << ((0 - In2) & 63)); all three registers must
+ *  be 64 bits wide.
+ */
+void IRFactory::op_rotr_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_rotr_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 64 && In1.Size == 64 && In2.Size == 64);
+
+    Value *InData1 = LoadState(In1);
+    Value *InData2 = LoadState(In2);
+
+    /* The complementary count is taken modulo 64. For counts 1..63 this is
+     * the same as (64 - count); for a count of 0 it yields 0 rather than 64,
+     * so no shift by the full bit width (a poison value in LLVM IR) is ever
+     * emitted and a rotate by zero returns the input unchanged. */
+    Value *Comp = SUB(CONST64(0), InData2);
+    Comp = AND(Comp, CONST64(63));
+
+    Value *C = SHL(InData1, Comp);
+    Value *OutData = LSHR(InData1, InData2);
+    OutData = OR(OutData, C);
+    Out.setData(OutData, true);
+}
+
+/*
+ * op_deposit_i64()
+ *  args[0]: Out
+ *  args[1]: In1 (value receiving the field)
+ *  args[2]: In2 (value supplying the field)
+ *  args[3]: first bit of the field
+ *  args[4]: width of the field in bits
+ *  Replaces bits [args[3], args[3] + args[4]) of In1 with the lowest args[4]
+ *  bits of In2 and stores the result in Out.
+ */
+void IRFactory::op_deposit_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_deposit_i64);
+
+    const TCGArg Pos = args[3];
+    const TCGArg Len = args[4];
+
+    /* Ones exactly over the destination field. */
+    APInt FieldMask = APInt::getBitsSet(64, Pos, Pos + Len);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 64 && In1.Size == 64 && In2.Size == 64);
+
+    Value *Base = LoadState(In1);
+    Value *Field = LoadState(In2);
+
+    /* Move the field into position; no shift is emitted for position 0. */
+    if (Pos)
+        Field = SHL(Field, CONST64(Pos));
+    Field = AND(Field, ConstantInt::get(*Context, FieldMask));
+
+    /* Clear the destination bits, then merge in the field. */
+    Base = AND(Base, ConstantInt::get(*Context, ~FieldMask));
+    Base = OR(Base, Field);
+    Out.setData(Base, true);
+}
+
+/*
+ * op_ext_i32_i64()
+ *  args[0]: Out (64-bit)
+ *  args[1]: In  (32-bit)
+ *  Out = SEXT64(In).
+ */
+void IRFactory::op_ext_i32_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_ext_i32_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+
+    AssertType(Out.Size == 64 && In.Size == 32);
+
+    Value *Narrow = LoadState(In);
+    Value *Wide = SEXT64(Narrow);
+    Out.setData(Wide, true);
+}
+
+/*
+ * op_extu_i32_i64()
+ *  args[0]: Out (64-bit)
+ *  args[1]: In  (32-bit)
+ *  Out = ZEXT64(In).
+ */
+void IRFactory::op_extu_i32_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_extu_i32_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+
+    AssertType(Out.Size == 64 && In.Size == 32);
+
+    Value *Narrow = LoadState(In);
+    Value *Wide = ZEXT64(Narrow);
+    Out.setData(Wide, true);
+}
+
+/*
+ * op_extrl_i64_i32()
+ *  args[0]: Out (32-bit)
+ *  args[1]: In  (64-bit)
+ *  Out = TRUNC32(In), i.e. the low half of In.
+ */
+void IRFactory::op_extrl_i64_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_extrl_i64_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+
+    AssertType(Out.Size == 32 && In.Size == 64);
+
+    Value *Wide = LoadState(In);
+    Value *Low = TRUNC32(Wide);
+    Out.setData(Low, true);
+}
+
+/*
+ * op_extrh_i64_i32()
+ *  args[0]: Out (32-bit)
+ *  args[1]: In  (64-bit)
+ *  Out = TRUNC32(In >> 32), i.e. the high half of In.
+ */
+void IRFactory::op_extrh_i64_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_extrh_i64_i32);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+
+    AssertType(Out.Size == 32 && In.Size == 64);
+
+    Value *Wide = LoadState(In);
+    Value *Shifted = LSHR(Wide, CONST64(32));
+    Value *High = TRUNC32(Shifted);
+    Out.setData(High, true);
+}
+
+/*
+ * op_brcond_i64()
+ *  args[0]: In1
+ *  args[1]: In2
+ *  args[2]: condition
+ *  args[3]: label of the taken target
+ *
+ *  brcond_i64 format:
+ *   brcond_i64 op1,op2,cond,<ifTrue>
+ *   <ifFalse>:
+ *       A
+ *   <ifTrue>:
+ *       B
+ */
+void IRFactory::op_brcond_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_brcond_i64);
+
+    Register &In1 = Reg[args[0]];
+    Register &In2 = Reg[args[1]];
+    CmpInst::Predicate Pred = getPred(args[2]);
+
+    AssertType(In1.Size == 64 && In2.Size == 64);
+
+    Value *Lhs = LoadState(In1);
+    Value *Rhs = LoadState(In2);
+
+    SaveGlobals(COHERENCE_ALL, LastInst);
+
+    /* Create the block for the label on its first use. */
+    TCGArg Target = args[3];
+    auto Known = Labels.find(Target);
+    if (Known == Labels.end())
+        Labels[Target] = BasicBlock::Create(*Context, "succ", Func);
+
+    BasicBlock *Taken = Labels[Target];
+    BasicBlock *FallThrough = BasicBlock::Create(*Context, "succ", Func);
+
+    /* Replace the current terminator with the conditional branch. */
+    Value *Cond = ICMP(Lhs, Rhs, Pred);
+    BranchInst::Create(Taken, FallThrough, Cond, LastInst);
+    LastInst->eraseFromParent();
+
+    /* Translation continues in the fall-through block, which starts out with
+     * a branch to ExitBB as its terminator. */
+    CurrBB = FallThrough;
+    LastInst = BranchInst::Create(ExitBB, CurrBB);
+}
+
+/*
+ * op_ext8s_i64()
+ *  args[0]: Out
+ *  args[1]: In
+ *  Out = SEXT64(TRUNC8(In)).
+ */
+void IRFactory::op_ext8s_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_ext8s_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+
+    AssertType(Out.Size == 64 && In.Size == 64);
+
+    Value *Src = LoadState(In);
+    Value *Byte = TRUNC8(Src);
+    Value *Wide = SEXT64(Byte);
+    Out.setData(Wide, true);
+}
+
+/*
+ * op_ext16s_i64()
+ *  args[0]: Out
+ *  args[1]: In
+ *  Out = SEXT64(TRUNC16(In)).
+ */
+void IRFactory::op_ext16s_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_ext16s_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+
+    AssertType(Out.Size == 64 && In.Size == 64);
+
+    Value *Src = LoadState(In);
+    Value *Half = TRUNC16(Src);
+    Value *Wide = SEXT64(Half);
+    Out.setData(Wide, true);
+}
+
+/*
+ * op_ext32s_i64()
+ *  args[0]: Out
+ *  args[1]: In
+ *  Out = SEXT64 of the low 32 bits of In. The TRUNC32 step is skipped when
+ *  the loaded value is already 32 bits wide.
+ */
+void IRFactory::op_ext32s_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_ext32s_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+
+    AssertType(Out.Size == 64 && In.Size == 64);
+
+    Value *Src = LoadState(In);
+    const bool Already32 = DL->getTypeSizeInBits(Src->getType()) == 32;
+    if (!Already32)
+        Src = TRUNC32(Src);
+
+    Value *Wide = SEXT64(Src);
+    Out.setData(Wide, true);
+}
+
+/*
+ * op_ext8u_i64()
+ *  args[0]: Out
+ *  args[1]: In
+ *  Out = In & 0xFF.
+ */
+void IRFactory::op_ext8u_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_ext8u_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+
+    AssertType(Out.Size == 64 && In.Size == 64);
+
+    Value *Src = LoadState(In);
+    Value *Masked = AND(Src, CONST64(0xFF));
+    Out.setData(Masked, true);
+}
+
+/*
+ * op_ext16u_i64()
+ *  args[0]: Out
+ *  args[1]: In
+ *  Out = In & 0xFFFF.
+ */
+void IRFactory::op_ext16u_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_ext16u_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+
+    AssertType(Out.Size == 64 && In.Size == 64);
+
+    Value *Src = LoadState(In);
+    Value *Masked = AND(Src, CONST64(0xFFFF));
+    Out.setData(Masked, true);
+}
+
+/*
+ * op_ext32u_i64()
+ *  args[0]: Out
+ *  args[1]: In
+ *  Out = the low 32 bits of In, zero-extended. A loaded value that is already
+ *  32 bits wide goes through ZEXT64; any other value is masked with
+ *  0xFFFFFFFF.
+ */
+void IRFactory::op_ext32u_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_ext32u_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+
+    AssertType(Out.Size == 64 && In.Size == 64);
+
+    Value *Src = LoadState(In);
+    if (DL->getTypeSizeInBits(Src->getType()) != 32)
+        Src = AND(Src, CONST64(0xFFFFFFFF));
+    else
+        Src = ZEXT64(Src);
+    Out.setData(Src, true);
+}
+
+/*
+ * op_bswap16_i64()
+ *  args[0]: Out
+ *  args[1]: In
+ *  Out = ZEXT64(BSWAP16(TRUNC16(In))).
+ */
+void IRFactory::op_bswap16_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_bswap16_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+
+    AssertType(Out.Size == 64 && In.Size == 64);
+
+    Value *Src = LoadState(In);
+    Value *Half = TRUNC16(Src);
+    Value *Swapped = BSWAP16(Half);
+    Value *Wide = ZEXT64(Swapped);
+    Out.setData(Wide, true);
+}
+
+/*
+ * op_bswap32_i64()
+ *  args[0]: Out
+ *  args[1]: In
+ *  Out = ZEXT64(BSWAP32(TRUNC32(In))).
+ */
+void IRFactory::op_bswap32_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_bswap32_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+
+    AssertType(Out.Size == 64 && In.Size == 64);
+
+    Value *Src = LoadState(In);
+    Value *Word = TRUNC32(Src);
+    Value *Swapped = BSWAP32(Word);
+    Value *Wide = ZEXT64(Swapped);
+    Out.setData(Wide, true);
+}
+
+/*
+ * op_bswap64_i64()
+ *  args[0]: Out
+ *  args[1]: In
+ *  Out = BSWAP64(In).
+ */
+void IRFactory::op_bswap64_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_bswap64_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+
+    AssertType(Out.Size == 64 && In.Size == 64);
+
+    Value *Src = LoadState(In);
+    Value *Swapped = BSWAP64(Src);
+    Out.setData(Swapped, true);
+}
+
+/*
+ * op_not_i64()
+ *  args[0]: Out
+ *  args[1]: In
+ *  Out = In ^ all-ones, i.e. the bitwise complement of In.
+ */
+void IRFactory::op_not_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_not_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+
+    AssertType(Out.Size == 64 && In.Size == 64);
+
+    const uint64_t AllOnes = ~(uint64_t)0;
+    Value *Src = LoadState(In);
+    Value *Inverted = XOR(Src, CONST64(AllOnes));
+    Out.setData(Inverted, true);
+}
+
+/*
+ * op_neg_i64()
+ *  args[0]: Out
+ *  args[1]: In
+ *  Out = 0 - In.
+ */
+void IRFactory::op_neg_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_neg_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In = Reg[args[1]];
+
+    AssertType(Out.Size == 64 && In.Size == 64);
+
+    Value *Src = LoadState(In);
+    Value *Negated = SUB(CONST64(0), Src);
+    Out.setData(Negated, true);
+}
+
+/*
+ * op_andc_i64()
+ *  args[0]: Out
+ *  args[1]: In1
+ *  args[2]: In2
+ *  Out = In1 & ~In2.
+ */
+void IRFactory::op_andc_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_andc_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 64 && In1.Size == 64 && In2.Size == 64);
+
+    const uint64_t AllOnes = ~(uint64_t)0;
+    Value *Lhs = LoadState(In1);
+    Value *Rhs = LoadState(In2);
+
+    Value *NotRhs = XOR(Rhs, CONST64(AllOnes));
+    Value *Result = AND(Lhs, NotRhs);
+    Out.setData(Result, true);
+}
+
+/*
+ * op_orc_i64()
+ *  args[0]: Out
+ *  args[1]: In1
+ *  args[2]: In2
+ *  Out = In1 | ~In2.
+ */
+void IRFactory::op_orc_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_orc_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 64 && In1.Size == 64 && In2.Size == 64);
+
+    const uint64_t AllOnes = ~(uint64_t)0;
+    Value *Lhs = LoadState(In1);
+    Value *Rhs = LoadState(In2);
+
+    Value *NotRhs = XOR(Rhs, CONST64(AllOnes));
+    Value *Result = OR(Lhs, NotRhs);
+    Out.setData(Result, true);
+}
+
+/*
+ * op_eqv_i64()
+ *  args[0]: Out
+ *  args[1]: In1
+ *  args[2]: In2
+ *  Out = In1 ^ ~In2.
+ */
+void IRFactory::op_eqv_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_eqv_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 64 && In1.Size == 64 && In2.Size == 64);
+
+    const uint64_t AllOnes = ~(uint64_t)0;
+    Value *Lhs = LoadState(In1);
+    Value *Rhs = LoadState(In2);
+
+    Value *NotRhs = XOR(Rhs, CONST64(AllOnes));
+    Value *Result = XOR(Lhs, NotRhs);
+    Out.setData(Result, true);
+}
+
+/*
+ * op_nand_i64()
+ *  args[0]: Out
+ *  args[1]: In1
+ *  args[2]: In2
+ *  Out = ~(In1 & In2).
+ */
+void IRFactory::op_nand_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_nand_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 64 && In1.Size == 64 && In2.Size == 64);
+
+    const uint64_t AllOnes = ~(uint64_t)0;
+    Value *Lhs = LoadState(In1);
+    Value *Rhs = LoadState(In2);
+
+    Value *Both = AND(Lhs, Rhs);
+    Value *Result = XOR(Both, CONST64(AllOnes));
+    Out.setData(Result, true);
+}
+
+/*
+ * op_nor_i64()
+ *  args[0]: Out
+ *  args[1]: In1
+ *  args[2]: In2
+ *  Out = ~(In1 | In2).
+ */
+void IRFactory::op_nor_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_nor_i64);
+
+    Register &Out = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out.Size == 64 && In1.Size == 64 && In2.Size == 64);
+
+    const uint64_t AllOnes = ~(uint64_t)0;
+    Value *Lhs = LoadState(In1);
+    Value *Rhs = LoadState(In2);
+
+    Value *Either = OR(Lhs, Rhs);
+    Value *Result = XOR(Either, CONST64(AllOnes));
+    Out.setData(Result, true);
+}
+
+/*
+ * op_add2_i64()
+ *  args[0]: Out1 (low half of the sum)
+ *  args[1]: Out2 (high half of the sum)
+ *  args[2]: In1  (low half of the first operand)
+ *  args[3]: In2  (high half of the first operand)
+ *  args[4]: In3  (low half of the second operand)
+ *  args[5]: In4  (high half of the second operand)
+ *  Builds two 128-bit values from the register pairs, adds them, and splits
+ *  the 128-bit sum back into two 64-bit halves.
+ */
+void IRFactory::op_add2_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_add2_i64);
+
+    Register &Out1 = Reg[args[0]];
+    Register &Out2 = Reg[args[1]];
+    Register &In1 = Reg[args[2]];
+    Register &In2 = Reg[args[3]];
+    Register &In3 = Reg[args[4]];
+    Register &In4 = Reg[args[5]];
+
+    AssertType(Out1.Size == 64 && Out2.Size == 64 &&
+               In1.Size == 64 && In2.Size == 64 &&
+               In3.Size == 64 && In4.Size == 64);
+
+    Value *LoA = LoadState(In1);
+    Value *HiA = LoadState(In2);
+    Value *LoB = LoadState(In3);
+    Value *HiB = LoadState(In4);
+
+    /* First operand: (HiA << 64) | LoA. */
+    LoA = ZEXT128(LoA);
+    HiA = ZEXT128(HiA);
+    HiA = SHL(HiA, CONST128(64));
+    Value *A = OR(HiA, LoA);
+
+    /* Second operand: (HiB << 64) | LoB. */
+    LoB = ZEXT128(LoB);
+    HiB = ZEXT128(HiB);
+    HiB = SHL(HiB, CONST128(64));
+    Value *B = OR(HiB, LoB);
+
+    Value *Sum = ADD(A, B);
+
+    Value *SumLo = TRUNC64(Sum);
+    Value *SumHi = LSHR(Sum, CONST128(64));
+    SumHi = TRUNC64(SumHi);
+    Out1.setData(SumLo, true);
+    Out2.setData(SumHi, true);
+}
+
+/*
+ * op_sub2_i64()
+ *  args[0]: Out1 (low half of the difference)
+ *  args[1]: Out2 (high half of the difference)
+ *  args[2]: In1  (low half of the first operand)
+ *  args[3]: In2  (high half of the first operand)
+ *  args[4]: In3  (low half of the second operand)
+ *  args[5]: In4  (high half of the second operand)
+ *  Builds two 128-bit values from the register pairs, subtracts the second
+ *  from the first, and splits the 128-bit result into two 64-bit halves.
+ */
+void IRFactory::op_sub2_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_sub2_i64);
+
+    Register &Out1 = Reg[args[0]];
+    Register &Out2 = Reg[args[1]];
+    Register &In1 = Reg[args[2]];
+    Register &In2 = Reg[args[3]];
+    Register &In3 = Reg[args[4]];
+    Register &In4 = Reg[args[5]];
+
+    AssertType(Out1.Size == 64 && Out2.Size == 64 &&
+               In1.Size == 64 && In2.Size == 64 &&
+               In3.Size == 64 && In4.Size == 64);
+
+    Value *LoA = LoadState(In1);
+    Value *HiA = LoadState(In2);
+    Value *LoB = LoadState(In3);
+    Value *HiB = LoadState(In4);
+
+    /* First operand: (HiA << 64) | LoA. */
+    LoA = ZEXT128(LoA);
+    HiA = ZEXT128(HiA);
+    HiA = SHL(HiA, CONST128(64));
+    Value *A = OR(HiA, LoA);
+
+    /* Second operand: (HiB << 64) | LoB. */
+    LoB = ZEXT128(LoB);
+    HiB = ZEXT128(HiB);
+    HiB = SHL(HiB, CONST128(64));
+    Value *B = OR(HiB, LoB);
+
+    Value *Diff = SUB(A, B);
+
+    Value *DiffLo = TRUNC64(Diff);
+    Value *DiffHi = LSHR(Diff, CONST128(64));
+    DiffHi = TRUNC64(DiffHi);
+    Out1.setData(DiffLo, true);
+    Out2.setData(DiffHi, true);
+}
+
+/*
+ * op_mulu2_i64()
+ *  args[0]: Out1 (low half of the product)
+ *  args[1]: Out2 (high half of the product)
+ *  args[2]: In1
+ *  args[3]: In2
+ *  Multiplies the ZEXT128-widened operands and splits the 128-bit product.
+ */
+void IRFactory::op_mulu2_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_mulu2_i64);
+
+    Register &Out1 = Reg[args[0]];
+    Register &Out2 = Reg[args[1]];
+    Register &In1 = Reg[args[2]];
+    Register &In2 = Reg[args[3]];
+
+    AssertType(Out1.Size == 64 && Out2.Size == 64 &&
+               In1.Size == 64 && In2.Size == 64);
+
+    Value *Lhs = LoadState(In1);
+    Value *Rhs = LoadState(In2);
+
+    Lhs = ZEXT128(Lhs);
+    Rhs = ZEXT128(Rhs);
+
+    Value *Product = MUL(Lhs, Rhs);
+    Value *ProdLo = TRUNC64(Product);
+    Value *ProdHi = LSHR(Product, CONST128(64));
+    ProdHi = TRUNC64(ProdHi);
+    Out1.setData(ProdLo, true);
+    Out2.setData(ProdHi, true);
+}
+
+/*
+ * op_muls2_i64()
+ *  args[0]: Out1 (low half of the product)
+ *  args[1]: Out2 (high half of the product)
+ *  args[2]: In1
+ *  args[3]: In2
+ *  Multiplies the SEXT128-widened operands and splits the 128-bit product.
+ */
+void IRFactory::op_muls2_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_muls2_i64);
+
+    Register &Out1 = Reg[args[0]];
+    Register &Out2 = Reg[args[1]];
+    Register &In1 = Reg[args[2]];
+    Register &In2 = Reg[args[3]];
+
+    AssertType(Out1.Size == 64 && Out2.Size == 64 &&
+               In1.Size == 64 && In2.Size == 64);
+
+    Value *Lhs = LoadState(In1);
+    Value *Rhs = LoadState(In2);
+
+    Lhs = SEXT128(Lhs);
+    Rhs = SEXT128(Rhs);
+
+    Value *Product = MUL(Lhs, Rhs);
+    Value *ProdLo = TRUNC64(Product);
+    Value *ProdHi = LSHR(Product, CONST128(64));
+    ProdHi = TRUNC64(ProdHi);
+    Out1.setData(ProdLo, true);
+    Out2.setData(ProdHi, true);
+}
+
+/*
+ * op_muluh_i64()
+ *  args[0]: Out1 (high half of the product)
+ *  args[1]: In1
+ *  args[2]: In2
+ *  Multiplies the ZEXT128-widened operands and keeps bits 64..127.
+ */
+void IRFactory::op_muluh_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_muluh_i64);
+
+    Register &Out1 = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out1.Size == 64 && In1.Size == 64 && In2.Size == 64);
+
+    Value *Lhs = LoadState(In1);
+    Value *Rhs = LoadState(In2);
+
+    Lhs = ZEXT128(Lhs);
+    Rhs = ZEXT128(Rhs);
+
+    Value *Product = MUL(Lhs, Rhs);
+    Value *ProdHi = LSHR(Product, CONST128(64));
+    ProdHi = TRUNC64(ProdHi);
+    Out1.setData(ProdHi, true);
+}
+
+/*
+ * op_mulsh_i64()
+ *  args[0]: Out1 (high half of the product)
+ *  args[1]: In1
+ *  args[2]: In2
+ *  Multiplies the SEXT128-widened operands and keeps bits 64..127.
+ */
+void IRFactory::op_mulsh_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_mulsh_i64);
+
+    Register &Out1 = Reg[args[0]];
+    Register &In1 = Reg[args[1]];
+    Register &In2 = Reg[args[2]];
+
+    AssertType(Out1.Size == 64 && In1.Size == 64 && In2.Size == 64);
+
+    Value *Lhs = LoadState(In1);
+    Value *Rhs = LoadState(In2);
+
+    Lhs = SEXT128(Lhs);
+    Rhs = SEXT128(Rhs);
+
+    Value *Product = MUL(Lhs, Rhs);
+    Value *ProdHi = LSHR(Product, CONST128(64));
+    ProdHi = TRUNC64(ProdHi);
+    Out1.setData(ProdHi, true);
+}
+
+/* QEMU specific */
+/*
+ * op_insn_start()
+ *  Emits no IR; only bumps the instruction counter NI.NumInsts.
+ */
+void IRFactory::op_insn_start(const TCGArg *args)
+{
+    IRDebug(INDEX_op_insn_start);
+
+    ++NI.NumInsts;
+}
+
+/*
+ * InsertLinkAndExit()
+ *  Reserve a chain slot, insert a patchable llvm.trap call before InsertPos
+ *  that is tagged with the slot's index and address, and then emit the exit
+ *  that returns the value paired with the slot.
+ */
+void IRFactory::InsertLinkAndExit(Instruction *InsertPos)
+{
+    auto Slot = LLEnv->getChainSlot();
+    size_t SlotKey = Slot.first;
+    uintptr_t ExitVal = Slot.second;
+    unsigned SlotIdx = NI.setChainSlot(SlotKey);
+    uintptr_t SlotAddr = NI.getChainSlotAddr(SlotIdx);
+
+    /* Here we use the llvm.trap intrinsic to notify LLVM backend to insert
+     * jump instruction for chaining. */
+    ConstantInt *Meta[] = { CONST32(PATCH_TRACE_BLOCK_CHAINING),
+                            CONSTPtr(SlotAddr) };
+    Function *Trap = Intrinsic::getDeclaration(Mod, Intrinsic::trap);
+    CallInst *TrapCall = CallInst::Create(Trap, "", InsertPos);
+
+    /* The debug location carries the patch kind and slot index. */
+    DebugLoc Loc = MF->getDebugLoc(PATCH_TRACE_BLOCK_CHAINING, SlotIdx, Func,
+                                   Meta);
+    TrapCall->setDebugLoc(Loc);
+
+    MF->setExit(TrapCall);
+
+    InsertExit(ExitVal);
+}
+
+/*
+ * InsertExit()
+ *  Insert a return of RetVal before LastInst and tag it with a PATCH_EXIT_TB
+ *  debug location. When setExit is true the return is also registered
+ *  through MF->setExit().
+ */
+void IRFactory::InsertExit(uintptr_t RetVal, bool setExit)
+{
+    ConstantInt *Meta[] = { CONST32(PATCH_EXIT_TB), ExitAddr };
+    ReturnInst *Ret = ReturnInst::Create(*Context, CONSTPtr(RetVal), LastInst);
+    DebugLoc Loc = MF->getDebugLoc(PATCH_EXIT_TB, 0, Func, Meta);
+    Ret->setDebugLoc(Loc);
+
+    if (!setExit)
+        return;
+    MF->setExit(Ret);
+}
+
+/*
+ * InsertLookupIBTC()
+ *  Branch to the shared "ibtc" block, creating it on first use. The shared
+ *  block calls helper_lookup_ibtc and jumps indirectly to the address it
+ *  returns.
+ */
+void IRFactory::InsertLookupIBTC(GraphNode *CurrNode)
+{
+    if (CommonBB.find("ibtc") == CommonBB.end()) {
+        BasicBlock *Shared = BasicBlock::Create(*Context, "ibtc", Func);
+        CommonBB["ibtc"] = Shared;
+
+        SmallVector<Value *, 4> CallArgs;
+        Function *Helper = ResolveFunction("helper_lookup_ibtc");
+        Value *Env = ConvertCPUType(Helper, 0, Shared);
+
+        CallArgs.push_back(Env);
+        CallInst *Lookup = CallInst::Create(Helper, CallArgs, "", Shared);
+        IndirectBrInst *Jump = IndirectBrInst::Create(Lookup, 1, Shared);
+        MF->setConst(Lookup);
+        MF->setExit(Lookup);
+
+        IndirectBrs.push_back(Jump);
+        toSink.push_back(Shared);
+    }
+
+    /* Jump from the current position into the shared lookup block. */
+    BasicBlock *Target = CommonBB["ibtc"];
+    BranchInst::Create(Target, LastInst);
+}
+
+/*
+ * InsertLookupCPBL()
+ *  Insert, before LastInst, a call to helper_lookup_cpbl followed by an
+ *  indirect branch to the address it returns. The current block is queued
+ *  in toSink.
+ */
+void IRFactory::InsertLookupCPBL(GraphNode *CurrNode)
+{
+    Function *Helper = ResolveFunction("helper_lookup_cpbl");
+    Value *Env = ConvertCPUType(Helper, 0, LastInst);
+
+    SmallVector<Value *, 4> CallArgs;
+    CallArgs.push_back(Env);
+
+    CallInst *Lookup = CallInst::Create(Helper, CallArgs, "", LastInst);
+    IndirectBrInst *Jump = IndirectBrInst::Create(Lookup, 1, LastInst);
+    MF->setConst(Lookup);
+    MF->setExit(Lookup);
+
+    IndirectBrs.push_back(Jump);
+    toSink.push_back(CurrBB);
+}
+
+/*
+ * TraceValidateCPBL()
+ *  Emit a call to helper_validate_cpbl for the link to NextNode and branch on
+ *  its result. When the helper returns 1 execution continues in a new
+ *  "cpbl.valid" block; otherwise a "cpbl.invalid" block re-executes a clone
+ *  of StorePC and returns 0 through InsertExit(). On return, CurrBB and
+ *  LastInst refer to the valid block.
+ */
+void IRFactory::TraceValidateCPBL(GraphNode *NextNode, StoreInst *StorePC)
+{
+    TranslationBlock *NextTB = NextNode->getTB();
+    Value *Cond;
+
+    SmallVector<Value *, 4> Params;
+    Function *F = ResolveFunction("helper_validate_cpbl");
+    Value *Env = ConvertCPUType(F, 0, LastInst);
+
+    /* Helper arguments: env, the next block's pc (typed like the value that
+     * StorePC stores) and the next block's id. */
+    Params.push_back(Env);
+    Params.push_back(ConstantInt::get(StorePC->getValueOperand()->getType(),
+                                      NextTB->pc));
+    Params.push_back(CONST32(NextTB->id));
+    CallInst *CI = CallInst::Create(F, Params, "", LastInst);
+    Cond = ICMP(CI, CONST32(1), ICmpInst::ICMP_EQ);
+
+    MF->setConst(CI);
+
+    BasicBlock *Valid = BasicBlock::Create(*Context, "cpbl.valid", Func);
+    BasicBlock *Invalid = BasicBlock::Create(*Context, "cpbl.invalid", Func);
+    toSink.push_back(Invalid);
+
+    /* Replace the current terminator with the conditional branch. */
+    BranchInst::Create(Valid, Invalid, Cond, LastInst);
+    LastInst->eraseFromParent();
+
+    /* Build the invalid path. The branch to ExitBB only serves as an
+     * insertion point for the cloned store and the return created by
+     * InsertExit(); it is erased once both are in place. */
+    LastInst = BranchInst::Create(ExitBB, Invalid);
+    Instruction *SI = StorePC->clone();
+    SI->insertBefore(LastInst);
+    InsertExit(0);
+    LastInst->eraseFromParent();
+
+    MF->setExit(SI);
+
+    /* Continue translation in the valid block, which starts out with a
+     * branch to ExitBB as its terminator. */
+    CurrBB = Valid;
+    LastInst = BranchInst::Create(ExitBB, CurrBB);
+}
+
+/*
+ * TraceLinkIndirectJump()
+ *  Implements indirect-branch inlining: the PC value stored by SI is compared
+ *  with the guest PC of NextNode. On a match control follows the intra-trace
+ *  link to NextNode; otherwise it continues in an exit stub, which becomes
+ *  the current block.
+ *  CPBL validation is not needed here because this routine is only used for
+ *  user-mode emulation.
+ */
+void IRFactory::TraceLinkIndirectJump(GraphNode *NextNode, StoreInst *SI)
+{
+    dbg() << DEBUG_LLVM << "    - Found an indirect branch. Guess pc "
+          << format("0x%" PRIx, NextNode->getGuestPC()) << "\n";
+
+    BasicBlock *MainPath = BasicBlock::Create(*Context, "main_path", Func);
+    BasicBlock *ExitStub = BasicBlock::Create(*Context, "exit_stub", Func);
+
+    /* Compare the run-time target with the guessed one. */
+    Value *ActualPC = SI->getValueOperand();
+    Value *GuessedPC = ConstantInt::get(ActualPC->getType(),
+                                        Builder->getGuestPC(NextNode));
+
+    Value *Hit = ICMP(ActualPC, GuessedPC, ICmpInst::ICMP_EQ);
+    BranchInst::Create(MainPath, ExitStub, Hit, LastInst);
+    LastInst->eraseFromParent();
+
+    CurrBB = MainPath;
+
+    /* The branch initially targets the exit BB; the real link is resolved
+     * by the trace finalization procedure. */
+    BranchInst *Link = BranchInst::Create(ExitBB, CurrBB);
+    Builder->setBranch(Link, NextNode);
+
+    CurrBB = ExitStub;
+    LastInst = BranchInst::Create(ExitBB, CurrBB);
+}
+
+/*
+ * TraceLinkDirectJump()
+ *  Link the current block to NextNode, a block inside the trace, through a
+ *  direct branch. With CONFIG_SOFTMMU the link is first guarded by
+ *  TraceValidateCPBL() when the two blocks start in different pages or the
+ *  next block has a second page.
+ *  SI is the store that writes the PC; its value operand is treated as a
+ *  ConstantInt (TraceLink() calls this routine only in that case).
+ */
+void IRFactory::TraceLinkDirectJump(GraphNode *NextNode, StoreInst *SI)
+{
+    /* The target pc is only needed for the debug message below. */
+    ConstantInt *NextPC = static_cast<ConstantInt *>(SI->getValueOperand());
+    target_ulong next_pc = NextPC->getZExtValue() +
+                           Builder->getCurrNode()->getTB()->cs_base;
+
+    dbg() << DEBUG_LLVM << "    - Found a direct branch to pc "
+          << format("0x%" PRIx, next_pc) << "\n";
+
+#if defined(CONFIG_SOFTMMU)
+    TranslationBlock *tb = Builder->getCurrNode()->getTB();
+    TranslationBlock *next_tb = NextNode->getTB();
+    /* If two blocks are not in the same page or the next block is across
+     * the page boundary, we have to handle it with CPBL.  */
+    if ((tb->pc & TARGET_PAGE_MASK) != (next_tb->pc & TARGET_PAGE_MASK) ||
+        next_tb->page_addr[1] != (tb_page_addr_t)-1)
+        TraceValidateCPBL(NextNode, SI);
+#endif
+    /* First set the branch to exit BB, and the link will be resolved
+       at the trace finalization procedure. */
+    BranchInst *BI = BranchInst::Create(ExitBB, LastInst);
+    Builder->setBranch(BI, NextNode);
+}
+
+/*
+ * TraceLinkDirectJump()
+ *  Handle a direct branch whose target is not a node of the trace. With
+ *  CONFIG_SOFTMMU a target in a different page than the current block is
+ *  handled by InsertLookupCPBL(); every other case gets a chainable exit via
+ *  InsertLinkAndExit().
+ *  SI is the store that writes the PC; its value operand is treated as a
+ *  ConstantInt (TraceLink() calls this routine only in that case).
+ */
+void IRFactory::TraceLinkDirectJump(StoreInst *SI)
+{
+    ConstantInt *NextPC = static_cast<ConstantInt *>(SI->getValueOperand());
+    target_ulong next_pc = NextPC->getZExtValue() +
+                           Builder->getCurrNode()->getTB()->cs_base;
+
+    dbg() << DEBUG_LLVM << "    - Found a direct branch to pc "
+          << format("0x%" PRIx, next_pc) << " (exit)\n";
+
+#if defined(CONFIG_SOFTMMU)
+    TranslationBlock *tb = Builder->getCurrNode()->getTB();
+    if ((tb->pc & TARGET_PAGE_MASK) != (next_pc & TARGET_PAGE_MASK)) {
+        InsertLookupCPBL(Builder->getCurrNode());
+        return;
+    }
+#endif
+    InsertLinkAndExit(SI);
+}
+
+/*
+ * findNextNode()
+ *  Return the trace node whose guest PC equals pc, or nullptr when there is
+ *  none. With USE_TRACETREE_ONLY only the children of the current node are
+ *  searched; otherwise the lookup is delegated to Builder->getNode().
+ */
+GraphNode *IRFactory::findNextNode(target_ulong pc)
+{
+#ifdef USE_TRACETREE_ONLY
+    GraphNode *Match = nullptr;
+    for (auto Candidate : Builder->getCurrNode()->getChildren()) {
+        if (Builder->getGuestPC(Candidate) == pc) {
+            Match = Candidate;
+            break;
+        }
+    }
+    return Match;
+#else
+    return Builder->getNode(pc);
+#endif
+}
+
+/*
+ * TraceLink()
+ *  Decide how the current block leaves, based on the PC value stored by SI.
+ *  A constant PC is a direct branch: it is linked to the matching trace node
+ *  when one exists, otherwise the block becomes an exit block. A non-constant
+ *  PC is an indirect branch and ends in an IBTC lookup.
+ */
+void IRFactory::TraceLink(StoreInst *SI)
+{
+    GraphNode *CurrNode = Builder->getCurrNode();
+    ConstantInt *CI = dyn_cast<ConstantInt>(SI->getValueOperand());
+
+    if (CI) {
+        /* Direct branch. */
+        target_ulong pc = CI->getZExtValue();
+        if (GraphNode *NextNode = findNextNode(pc)) {
+            TraceLinkDirectJump(NextNode, SI);
+            return;
+        }
+
+        /* No node in the trace: leave the trace and mark the block. */
+        TraceLinkDirectJump(SI);
+        std::string ExitName = CurrBB->getName().str() + ".exit";
+        CurrBB->setName(ExitName);
+        toSink.push_back(CurrBB);
+        return;
+    }
+
+    /* Indirect branch */
+    SaveGlobals(COHERENCE_ALL, LastInst);
+
+#if defined(CONFIG_USER_ONLY)
+    for (auto Child : CurrNode->getChildren())
+        TraceLinkIndirectJump(Child, SI);
+#endif
+    InsertLookupIBTC(CurrNode);
+}
+
+/*
+ * getStorePC()
+ *  Return the store instruction that writes the PC state in the block that
+ *  contains LastInst, or nullptr when none can be identified.
+ *
+ *  When TARGET_LONG_BITS > TCG_TARGET_REG_BITS the PC may be written by two
+ *  stores of half the width. In that case the last two PC stores are merged
+ *  into one new store of the full width (inserted before LastInst) and the
+ *  two original stores are queued in toErase. Otherwise the instruction
+ *  right before LastInst is returned if it is a store.
+ */
+StoreInst *IRFactory::getStorePC()
+{
+#if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
+    std::vector<std::pair<intptr_t, StoreInst *> > StorePC;
+
+    /* Search for store instructions that write value to PC in this block. */
+    bool hasAllConstantPC = true;
+    BasicBlock *BB = LastInst->getParent();
+    for (auto BI = BB->begin(), BE = BB->end(); BI != BE; ++BI) {
+        if (StoreInst *SI = dyn_cast<StoreInst>(BI)) {
+            intptr_t Off = 0;
+            Value *Base = getBaseWithConstantOffset(DL,
+                                SI->getPointerOperand(), Off);
+            if (Base == BaseReg[TCG_AREG0].Base && isStateOfPC(Off)) {
+                StorePC.push_back(std::make_pair(Off, SI));
+                if (!isa<ConstantInt>(SI->getValueOperand()))
+                    hasAllConstantPC = false;
+            }
+        }
+    }
+
+    if (StorePC.empty())
+        return nullptr;
+    if (StorePC.size() == 1)
+        return StorePC[0].second;
+
+    /* We only consider the last two stores. Order them so that A is the one
+     * at the lower offset. */
+    unsigned I1 = StorePC.size() - 2, I2 = StorePC.size() - 1;
+    if (StorePC[I1].first > StorePC[I2].first) {
+        unsigned tmp = I1;
+        I1 = I2;
+        I2 = tmp;
+    }
+
+    intptr_t OffsetA = StorePC[I1].first;
+    intptr_t OffsetB = StorePC[I2].first;
+    StoreInst *SA = StorePC[I1].second;
+    StoreInst *SB = StorePC[I2].second;
+    intptr_t SzA = DL->getTypeSizeInBits(SA->getValueOperand()->getType());
+    intptr_t SzB = DL->getTypeSizeInBits(SB->getValueOperand()->getType());
+    /* NOTE(review): SzA is a size in bits; if the offsets produced by
+     * getBaseWithConstantOffset() are byte offsets, the adjacency test
+     * 'OffsetA + SzA != OffsetB' mixes units -- confirm against that helper. */
+    if (SzA != SzB || OffsetA + SzA != OffsetB || SzA + SzB != TARGET_LONG_BITS)
+        return nullptr;
+
+    Value *NewPC;
+    Type *Ty = (TARGET_LONG_BITS == 32) ? Int32Ty : Int64Ty;
+    Type *PTy = (TARGET_LONG_BITS == 32) ? Int32PtrTy : Int64PtrTy;
+    if (hasAllConstantPC) {
+        /* Merge the two constant halves: A supplies the low bits and B, the
+         * store at the higher offset, supplies the high bits. */
+        target_ulong PCA = static_cast<ConstantInt*>(SA->getValueOperand())->getZExtValue();
+        target_ulong PCB = static_cast<ConstantInt*>(SB->getValueOperand())->getZExtValue();
+        NewPC = ConstantInt::get(Ty, PCA | (PCB << SzA));
+    } else {
+        Value *PCA = ZEXT(SA->getValueOperand(), Ty);
+        Value *PCB = ZEXT(SB->getValueOperand(), Ty);
+        PCB = SHL(PCB, ConstantInt::get(Ty, SzA));
+        NewPC = OR(PCA, PCB);
+    }
+
+    toErase.push_back(SA);
+    toErase.push_back(SB);
+
+    /* The merged store reuses the address of the lower half. */
+    Value *Addr = CAST(SA->getPointerOperand(), PTy);
+    return new StoreInst(NewPC, Addr, true, LastInst);
+
+#else
+    return dyn_cast<StoreInst>(--BasicBlock::iterator(LastInst));
+#endif
+}
+
+/*
+ * op_exit_tb()
+ *  args[0]: return value (not read by this routine)
+ *
+ *  Terminate the block being translated. Depending on what precedes LastInst
+ *  the block either exits through InsertExit(0, true) or is handed to
+ *  TraceLink() together with the store that writes the PC. LastInst is erased
+ *  and reset to nullptr before returning.
+ */
+void IRFactory::op_exit_tb(const TCGArg *args)
+{
+    IRDebug(INDEX_op_exit_tb);
+
+    /* LastInst is null once a previous call has already closed the block. */
+    if (!LastInst)
+        return;
+
+    /* Some guest architectures (e.g., ARM) do not explicitly generate a store
+     * instruction to sync the PC value to the memory before exit_tb. We
+     * generate the store PC instruction here so that the following routine can
+     * analyze the PC value it will branch to. Note that other dirty states will
+     * be synced later. */
+    CreateStorePC(LastInst);
+
+    if (LastInst == &*LastInst->getParent()->begin()) {
+        /* LastInst is the only instruction in its block. */
+        SaveGlobals(COHERENCE_ALL, LastInst);
+        InsertExit(0, true);
+    } else if (isa<CallInst>(--BasicBlock::iterator(LastInst))) {
+        /* Tail call. */
+        /* Passes are disabled when any global register is still dirty. */
+        for (int i = 0, e = tcg_ctx.nb_globals; i != e; ++i) {
+            Register &reg = Reg[i];
+            if (reg.isReg() && reg.isDirty())
+                runPasses = false;
+        }
+
+        SaveGlobals(COHERENCE_ALL, LastInst);
+        InsertExit(0, true);
+    } else if (StoreInst *SI = getStorePC()) {
+        /* A store to the PC was found: sync the globals before it and link
+         * the block according to the stored value. */
+        SaveGlobals(COHERENCE_ALL, SI);
+        TraceLink(SI);
+    } else {
+        /* No PC store could be identified: plain exit with passes disabled. */
+        runPasses = false;
+        SaveGlobals(COHERENCE_ALL, LastInst);
+        InsertExit(0, true);
+    }
+
+    LastInst->eraseFromParent();
+    LastInst = nullptr;
+}
+
+/*
+ * op_goto_tb()
+ *  args[0]: jump index (not read by this routine)
+ *  Emits no IR; only the IRDebug hook runs.
+ */
+void IRFactory::op_goto_tb(const TCGArg *args)
+{
+    (void)args;
+    IRDebug(INDEX_op_goto_tb);
+}
+
+/*
+ * op_qemu_ld_i32()
+ *  Operands, in order: data register, address register (low), address
+ *  register (high, present only when TARGET_LONG_BITS > TCG_TARGET_REG_BITS),
+ *  memory-op index.
+ *  Loads through QEMULoad() and stores the result, passed through
+ *  getExtendValue() with the data register's type, in the data register.
+ */
+void IRFactory::op_qemu_ld_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_qemu_ld_i32);
+
+    unsigned Next = 0;
+    TCGArg DataLo = args[Next++];
+    TCGArg AddrLo = args[Next++];
+    TCGArg AddrHi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) ? args[Next++] : 0;
+    TCGMemOpIdx oi = args[Next++];
+    TCGMemOp opc = get_memop(oi);
+
+    Register &Out = Reg[DataLo];
+    Register &In1 = Reg[AddrLo];
+
+    Value *AddrLoVal = LoadState(In1);
+    Value *AddrHiVal = nullptr;
+    if (AddrHi)
+        AddrHiVal = LoadState(Reg[AddrHi]);
+
+    AssertType(In1.Size == 32 || In1.Size == 64);
+
+    SaveStates();
+
+    Value *Loaded = QEMULoad(AddrLoVal, AddrHiVal, oi);
+    Loaded = getExtendValue(Loaded, Out.Ty, opc);
+    Out.setData(Loaded, true);
+}
+
+/*
+ * op_qemu_st_i32()
+ *  Operands, in order: data register, address register (low), address
+ *  register (high, present only when TARGET_LONG_BITS > TCG_TARGET_REG_BITS),
+ *  memory-op index.
+ *  Passes the data through getTruncValue() for the memory op and stores it
+ *  with QEMUStore().
+ */
+void IRFactory::op_qemu_st_i32(const TCGArg *args)
+{
+    IRDebug(INDEX_op_qemu_st_i32);
+
+    TCGArg DataLo = *args++;
+    TCGArg AddrLo = *args++;
+    TCGArg AddrHi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) ? *args++ : 0;
+    TCGMemOpIdx oi = *args++;
+    TCGMemOp opc = get_memop(oi);
+
+    Register &In1 = Reg[DataLo];
+    Register &In2 = Reg[AddrLo];
+
+    Value *InData1 = LoadState(In1);
+    Value *InData2 = LoadState(In2);
+    Value *InData3 = (AddrHi) ? LoadState(Reg[AddrHi]) : nullptr;
+
+    /* Validate the address register (In2), as the other qemu_ld/qemu_st
+     * translations do; In1 holds the data here. */
+    AssertType(In2.Size == 32 || In2.Size == 64);
+
+    SaveStates();
+
+    InData1 = getTruncValue(InData1, opc);
+    QEMUStore(InData1, InData2, InData3, oi);
+}
+
+/*
+ * op_qemu_ld_i64()
+ *  Operands, in order: data register (low), data register (high, present
+ *  only when TCG_TARGET_REG_BITS == 32), address register (low), address
+ *  register (high, present only when TARGET_LONG_BITS > TCG_TARGET_REG_BITS),
+ *  memory-op index.
+ *  Loads through QEMULoad(). When a high data register is given, the loaded
+ *  value is split into two 32-bit halves; otherwise it is stored whole.
+ */
+void IRFactory::op_qemu_ld_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_qemu_ld_i64);
+
+    unsigned Next = 0;
+    TCGArg DataLo = args[Next++];
+    TCGArg DataHi = (TCG_TARGET_REG_BITS == 32) ? args[Next++] : 0;
+    TCGArg AddrLo = args[Next++];
+    TCGArg AddrHi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) ? args[Next++] : 0;
+    TCGMemOpIdx oi = args[Next++];
+    TCGMemOp opc = get_memop(oi);
+
+    Register &Out = Reg[DataLo];
+    Register &In1 = Reg[AddrLo];
+
+    Value *AddrLoVal = LoadState(In1);
+    Value *AddrHiVal = nullptr;
+    if (AddrHi)
+        AddrHiVal = LoadState(Reg[AddrHi]);
+
+    AssertType(In1.Size == 32 || In1.Size == 64);
+
+    SaveStates();
+
+    Value *Loaded = QEMULoad(AddrLoVal, AddrHiVal, oi);
+    Loaded = getExtendValue(Loaded, Out.Ty, opc);
+
+    if (DataHi != 0) {
+        /* Split the result across the low/high data registers. */
+        Register &Out2 = Reg[DataHi];
+        Value *Low = TRUNC32(Loaded);
+        Value *High = LSHR(Loaded, CONST64(32));
+        High = TRUNC32(High);
+        Out.setData(Low, true);
+        Out2.setData(High, true);
+    } else {
+        Out.setData(Loaded, true);
+    }
+}
+
+/*
+ * op_qemu_st_i64()
+ *  Operands, in order: data register (low), data register (high, present
+ *  only when TCG_TARGET_REG_BITS == 32), address register (low), address
+ *  register (high, present only when TARGET_LONG_BITS > TCG_TARGET_REG_BITS),
+ *  memory-op index.
+ *  When a high data register is given, the value to store is assembled as
+ *  (high << 32) | low. The value is passed through getTruncValue() and
+ *  stored with QEMUStore().
+ */
+void IRFactory::op_qemu_st_i64(const TCGArg *args)
+{
+    IRDebug(INDEX_op_qemu_st_i64);
+
+    unsigned Next = 0;
+    TCGArg DataLo = args[Next++];
+    TCGArg DataHi = (TCG_TARGET_REG_BITS == 32) ? args[Next++] : 0;
+    TCGArg AddrLo = args[Next++];
+    TCGArg AddrHi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) ? args[Next++] : 0;
+    TCGMemOpIdx oi = args[Next++];
+    TCGMemOp opc = get_memop(oi);
+
+    Register &In1 = Reg[DataLo];
+    Register &In2 = Reg[AddrLo];
+
+    Value *DataLoVal = LoadState(In1);
+    Value *AddrLoVal = LoadState(In2);
+    Value *AddrHiVal = nullptr;
+    if (AddrHi)
+        AddrHiVal = LoadState(Reg[AddrHi]);
+
+    AssertType(In2.Size == 32 || In2.Size == 64);
+
+    SaveStates();
+
+    Value *Data = DataLoVal;
+    if (DataHi != 0) {
+        /* Assemble the 64-bit value from its two halves. */
+        Value *High = LoadState(Reg[DataHi]);
+        High = ZEXT64(High);
+        High = SHL(High, CONST64(32));
+        Value *Low = ZEXT64(DataLoVal);
+        Data = OR(High, Low);
+    }
+
+    Data = getTruncValue(Data, opc);
+    QEMUStore(Data, AddrLoVal, AddrHiVal, oi);
+}
+
+
+/*
+ * Metadata Factory
+ */
+/* Bind the factory to M's LLVMContext, start the UID counter at 0 and create
+ * the Dummy metadata node from a fresh UID. */
+MDFactory::MDFactory(Module *M)
+    : UID(0), Context(M->getContext())
+{
+    ConstantInt *Id = getUID();
+    Dummy = getMDNode(ArrayRef<ConstantInt*>(Id));
+}
+
+/* Nothing to release explicitly. */
+MDFactory::~MDFactory()
+{
+}
+
+/*
+ * setConstStatic() attaches the constants in V to instruction I as a
+ * META_CONST metadata node. getMDNode() builds a metadata node that starts
+ * with a fresh UID followed by the constants in V.
+ * With LLVM_V35 metadata operands are plain Values; otherwise each constant
+ * is wrapped with ConstantAsMetadata.
+ */
+#if defined(LLVM_V35)
+void MDFactory::setConstStatic(LLVMContext &Context, Instruction *I,
+                               ArrayRef<ConstantInt*> V)
+{
+    SmallVector<Value *, 4> Ops;
+    for (ConstantInt *C : V)
+        Ops.push_back(C);
+    I->setMetadata(META_CONST, MDNode::get(Context, Ops));
+}
+
+MDNode *MDFactory::getMDNode(ArrayRef<ConstantInt*> V)
+{
+    SmallVector<Value *, 4> Ops;
+    Ops.push_back(getUID());
+    for (ConstantInt *C : V)
+        Ops.push_back(C);
+    return MDNode::get(Context, Ops);
+}
+#else
+void MDFactory::setConstStatic(LLVMContext &Context, Instruction *I,
+                               ArrayRef<ConstantInt*> V)
+{
+    SmallVector<Metadata *, 4> Ops;
+    for (ConstantInt *C : V)
+        Ops.push_back(ConstantAsMetadata::get(C));
+    I->setMetadata(META_CONST, MDNode::get(Context, Ops));
+}
+
+MDNode *MDFactory::getMDNode(ArrayRef<ConstantInt*> V)
+{
+    SmallVector<Metadata *, 4> Ops;
+    Ops.push_back(ConstantAsMetadata::get(getUID()));
+    for (ConstantInt *C : V)
+        Ops.push_back(ConstantAsMetadata::get(C));
+    return MDNode::get(Context, Ops);
+}
+#endif
+
+#if defined(ENABLE_MCJIT)  /* scope = debug info created for F on each call */
+DebugLoc MDFactory::getDebugLoc(unsigned Line, unsigned Col, Function *F,
+                                ArrayRef<ConstantInt*> Meta)
+{   /* Meta is not referenced in this variant. */
+    Module *M = F->getParent();
+    DIBuilder DIB(*M);
+    auto File = DIB.createFile(F->getName(), "hqemu/");  /* file name = function name */
+#if defined(LLVM_V35)
+    auto CU = DIB.createCompileUnit(dwarf::DW_LANG_Cobol74, F->getName(),
+                                    "hqemu/", "hqemu", true, "", 0);
+    auto Type = DIB.createSubroutineType(File,
+                DIB.getOrCreateArray(ArrayRef<Value *>()));
+    auto SP = DIB.createFunction(CU, F->getName(), "", File, 1, Type, false,
+                                 true, 1, 0, true);
+    auto Scope = DIB.createLexicalBlockFile(SP, File);
+    DebugLoc DL = DebugLoc::get(Line, Col, Scope);
+    DIB.finalize();
+    SP.replaceFunction(F);
+#elif defined(LLVM_V38) || defined(LLVM_V39)
+    auto CU = DIB.createCompileUnit(dwarf::DW_LANG_Cobol74, F->getName(),
+                                    "hqemu/", "hqemu", true, "", 0);
+    auto Type = DIB.createSubroutineType(DIB.getOrCreateTypeArray(None));
+    auto SP = DIB.createFunction(CU, F->getName(), "", File, 1, Type, false,
+                                 true, 1, 0, true);
+    auto Scope = DIB.createLexicalBlockFile(SP, File, 0);
+    DebugLoc DL = DebugLoc::get(Line, Col, Scope);
+    DIB.finalize();
+    F->setSubprogram(SP);
+#else
+    auto CU = DIB.createCompileUnit(dwarf::DW_LANG_Cobol74, File,
+                                    "hqemu", true, "", 0);
+    auto Type = DIB.createSubroutineType(DIB.getOrCreateTypeArray(None));
+    auto SP = DIB.createFunction(CU, F->getName(), "", File, 1, Type, false,
+                                 true, 1, DINode::FlagZero, true);
+    auto Scope = DIB.createLexicalBlockFile(SP, File, 0);
+    DebugLoc DL = DebugLoc::get(Line, Col, Scope);
+    DIB.finalize();
+    F->setSubprogram(SP);
+#endif
+    /* NOTE(review): DW_LANG_Cobol74 looks like a placeholder language tag - confirm. */
+    return DL;
+}
+#else  /* !ENABLE_MCJIT: the scope is the node returned by getMDNode(Meta) */
+DebugLoc MDFactory::getDebugLoc(unsigned Line, unsigned Col, Function *F,
+                                ArrayRef<ConstantInt*> Meta)
+{
+    return DebugLoc::get(Line, Col, getMDNode(Meta));
+}
+#endif /* ENABLE_MCJIT */
+
+
+/*
+ * TraceBuilder()
+ */
+TraceBuilder::TraceBuilder(IRFactory *IRF, OptimizationInfo *Opt)
+    : IF(IRF), Opt(Opt), Aborted(false), Attribute(A_None), Trace(nullptr)
+{
+    GraphNode *Root = Opt->getCFG();
+    if (Root == nullptr)
+        hqemu_error("invalid optimization request.\n");
+
+    /* Stack-based walk over the CFG: each node reachable from the root
+     * is handed to setUniqueNode() once, the first time it is popped. */
+    NodeSet Seen;
+    NodeVec Pending;
+    Pending.push_back(Root);
+    while (!Pending.empty()) {
+        GraphNode *Cur = Pending.back();
+        Pending.pop_back();
+        if (Seen.find(Cur) != Seen.end())
+            continue;   /* reached before through another edge */
+        Seen.insert(Cur);
+        setUniqueNode(Cur);
+        for (auto Succ : Cur->getChildren())
+            Pending.push_back(Succ);
+    }
+
+    /* The root is the first node queued for building. */
+    NodeQueue.push_back(Root);
+
+    /* Open a session on the IR factory for this builder. */
+    IF->CreateSession(this);
+    IF->CreateFunction();
+}
+
+void TraceBuilder::ConvertToTCGIR(CPUArchState *env)
+{
+    TranslationBlock *tb = CurrNode->getTB();
+    /* Trace mode only: rebase env->image_base and copy state for this TB. */
+    if (LLEnv->isTraceMode()) {
+        env->image_base = (uintptr_t)tb->image - tb->pc;
+        tcg_copy_state(env, tb);
+    }
+    /* Generate the TCG ops of the current node's TB, then run liveness. */
+    tcg_func_start(&tcg_ctx, tb);
+    gen_intermediate_code(env, tb);
+    tcg_liveness_analysis(&tcg_ctx);
+}
+
+static inline bool isVecOp(TCGOpcode opc)
+{
+    /* Vector opcodes occupy the closed range
+     * [INDEX_op_vector_start, INDEX_op_vector_end]; a plain comparison
+     * expresses this without the GNU case-range extension. */
+    const bool AtOrAfterFirst = opc >= INDEX_op_vector_start;
+    const bool AtOrBeforeLast = opc <= INDEX_op_vector_end;
+    return AtOrAfterFirst && AtOrBeforeLast;
+}
+
+void TraceBuilder::ConvertToLLVMIR()
+{
+    IF->CreateBlock();
+    /* OpcFunc: array of IRFactory member functions indexed by TCG opcode. */
+    auto OpcFunc = (IRFactory::FuncPtr *)IF->getOpcFunc();
+    TCGArg *VecArgs = tcg_ctx.vec_opparam_buf;
+
+    IF->NI.setTB(CurrNode->getTB());
+    for (int oi = tcg_ctx.gen_first_op_idx; oi >= 0; ) {
+        TCGOp * const op = &tcg_ctx.gen_op_buf[oi];
+        TCGArg *args = &tcg_ctx.gen_opparam_buf[op->args];
+        oi = op->next;   /* advance now; a negative index ends the walk */
+        /* Vector ops take their operands from vec_opparam_buf, 3 per op. */
+        if (isVecOp(op->opc)) {
+            args = VecArgs;
+            VecArgs += 3;
+        }
+        /* Dispatch to the handler stored for this opcode. */
+        IF->NI.setOp(op);
+        (IF->*OpcFunc[op->opc])(args);
+        /* If the build was aborted, delete the session and stop. */
+        if (isAborted()) {
+            IF->DeleteSession();
+            return;
+        }
+    }
+}
+
+void TraceBuilder::Abort()
+{
+    Aborted = true;  /* only sets the flag; no cleanup is done here */
+}
+
+void TraceBuilder::Finalize()
+{
+    /* Reconnect the links between basic blocks: each recorded branch,
+       previously set to ExitBB, is retargeted to its node's block. */
+    for (unsigned i = 0, e = Branches.size(); i != e; ++i) {
+        BranchInst *BI = Branches[i].first;
+        GraphNode *Node = Branches[i].second;
+        IF->setSuccessor(BI, getBasicBlock(Node));
+    }
+    /* Create the TraceInfo, compile, then delete the session. */
+    Trace = new TraceInfo(NodeUsed, Attribute);
+    IF->Compile();
+    IF->DeleteSession();
+}
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/llvm-soft-perfmon.cpp b/llvm/llvm-soft-perfmon.cpp
new file mode 100644
index 0000000..a5f9a56
--- /dev/null
+++ b/llvm/llvm-soft-perfmon.cpp
@@ -0,0 +1,357 @@
+/*
+ *  (C) 2010 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <iostream>
+#include <sstream>
+#include "tracer.h"
+#include "utils.h"
+#include "llvm.h"
+#include "llvm-target.h"
+#include "llvm-soft-perfmon.h"
+
+
+extern LLVMEnv *LLEnv;
+extern unsigned ProfileThreshold;
+extern unsigned PredictThreshold;
+
+/*
+ * Software Performance Monitor (SPM)
+ */
+void SoftwarePerfmon::ParseProfileMode(std::string &ProfileLevel)
+{
+    /* Keyword table and the mode bit each keyword enables; the two
+     * arrays are parallel (same index = same mode). */
+    static std::string Keyword[SPM_NUM] = {
+        "none", "basic", "trace", "cache", "pass", "hpm", "exit", "hotspot", "all"
+    };
+    static uint64_t ModeBit[SPM_NUM] = {
+        SPM_NONE, SPM_BASIC, SPM_TRACE, SPM_CACHE, SPM_PASS, SPM_HPM,
+        SPM_EXIT, SPM_HOTSPOT, SPM_ALL,
+    };
+
+    if (ProfileLevel.empty())
+        return;
+    /* Split the comma-separated list; unknown keywords are ignored. */
+    std::istringstream Stream(ProfileLevel);
+    for (std::string Item; getline(Stream, Item, ','); ) {
+        int Idx = 0;
+        while (Idx < SPM_NUM && Item != Keyword[Idx])
+            ++Idx;
+        if (Idx < SPM_NUM)
+            Mode |= ModeBit[Idx];
+    }
+}
+
+void SoftwarePerfmon::printProfile()
+{
+    /* Report nothing when isEnabled() is false or when the translation
+     * mode is NONE or INVALID. */
+    const bool NoMode = LLVMEnv::TransMode == TRANS_MODE_NONE ||
+                        LLVMEnv::TransMode == TRANS_MODE_INVALID;
+    if (!isEnabled() || NoMode)
+        return;
+    /* Block mode has its own report; all other modes report traces. */
+    if (LLVMEnv::TransMode != TRANS_MODE_BLOCK)
+        printTraceProfile();
+    else
+        printBlockProfile();
+}
+
+void SoftwarePerfmon::printBlockProfile()
+{
+    LLVMEnv::TransCodeList &TransCode = LLEnv->getTransCode();
+    uint32_t GuestSize = 0, GuestICount = 0, HostSize = 0;
+    uint64_t TransTime = 0, MaxTime = 0;
+    /* Sum sizes, instruction counts and translation time over all blocks. */
+    for (auto TC : TransCode) {
+        TraceInfo *Trace = TC->Trace;
+        TranslationBlock *TB = TC->EntryTB;
+        GuestSize += TB->size;
+        GuestICount += TB->icount;
+        HostSize += TC->Size;
+        TransTime += Trace->TransTime;
+        if (Trace->TransTime > MaxTime)
+            MaxTime = Trace->TransTime;
+    }
+    /* TransTime is scaled by 1e-6 to seconds, MaxTime by 1/1000 to ms. */
+    auto &OS = DM.debug();
+    OS << "\nBlock statistic:\n"
+       << "Num of Blocks    : " << TransCode.size() << "\n"
+       << "G/H Code Size    : " << GuestSize << "/" << HostSize << " bytes\n"
+       << "Guest ICount     : " << GuestICount << "\n"
+       << "Translation Time : " << format("%.6f", (double)TransTime * 1e-6)
+                                << " seconds (max=" << MaxTime /1000 << " ms)\n";
+}
+
+static void printBasic(LLVMEnv::TransCodeList &TransCode)
+{
+    uint32_t GuestSize = 0, GuestICount = 0, HostSize = 0;
+    uint32_t NumBlock = 0, NumLoop = 0, NumExit = 0, NumIndirectBr = 0;
+    uint32_t MaxBlock = 0, MaxLoop = 0, MaxExit = 0, MaxIndirectBr = 0;
+    uint64_t TransTime = 0, MaxTime = 0;
+    unsigned NumTraces = TransCode.size();
+    std::map<unsigned, unsigned> LenDist;
+    /* Accumulate totals, per-trace maxima and the length histogram. */
+    for (auto TC : TransCode) {
+        TraceInfo *Trace = TC->Trace;
+        TBVec &TBs = Trace->TBs;
+        for (unsigned i = 0, e = TBs.size(); i != e; ++i) {
+            GuestSize += TBs[i]->size;
+            GuestICount += TBs[i]->icount;
+        }
+        HostSize += TC->Size;
+
+        NumBlock += TBs.size();
+        NumLoop += Trace->NumLoop;
+        NumExit += Trace->NumExit;
+        NumIndirectBr += Trace->NumIndirectBr;
+        TransTime += Trace->TransTime;
+        /* Track the largest value seen for each per-trace quantity. */
+        if (TBs.size() > MaxBlock)
+            MaxBlock = TBs.size();
+        if (Trace->NumLoop > MaxLoop)
+            MaxLoop = Trace->NumLoop;
+        if (Trace->NumExit > MaxExit)
+            MaxExit = Trace->NumExit;
+        if (Trace->NumIndirectBr > MaxIndirectBr)
+            MaxIndirectBr = Trace->NumIndirectBr;
+        if (Trace->TransTime > MaxTime)
+            MaxTime = Trace->TransTime;
+        LenDist[TBs.size()]++;
+    }
+    /* Averages divide by NumTraces; printTraceProfile() calls this only when it is nonzero. */
+    auto &OS = DM.debug();
+    OS << "Trace statistic:\n"
+       << "Num of Traces    : " << NumTraces << "\n"
+       << "Profile Thres.   : " << ProfileThreshold << "\n"
+       << "Predict Thres.   : " << PredictThreshold << "\n"
+       << "G/H Code Size    : " << GuestSize << "/" << HostSize << " bytes\n"
+       << "Translation Time : " << format("%.6f", (double)TransTime * 1e-6)
+                                << " seconds (max=" << MaxTime /1000 << " ms)\n"
+       << "Average # Blocks : " << format("%.1f", (double)NumBlock / NumTraces)
+                                << " (max=" << MaxBlock << ")\n"
+       << "Average # Loops  : " << format("%.1f", (double)NumLoop / NumTraces)
+                                << " (max=" << MaxLoop << ")\n"
+       << "Average # Exits  : " << format("%.1f", (double)NumExit / NumTraces)
+                                << " (max=" << MaxExit << ")\n"
+       << "Average # IBs    : " << format("%.1f", (double)NumIndirectBr / NumTraces)
+                                << " (max=" << MaxIndirectBr << ")\n"
+       << "Flush Count      : " << LLEnv->getNumFlush() << "\n";
+    /* LenDist[i] is 0 (inserted on access) for lengths that no trace has. */
+    OS << "Trace length distribution: (1-" << MaxBlock << ")\n    ";
+    for (unsigned i = 1; i <= MaxBlock; i++)
+        OS << LenDist[i] << " ";
+    OS << "\n";
+}
+
+static void printTraceExec(LLVMEnv::TransCodeList &TransCode)
+{
+    unsigned NumThread = 0;
+    for (auto next_cpu = first_cpu; next_cpu != nullptr;
+         next_cpu = CPU_NEXT(next_cpu))
+        NumThread++;
+    /* NumThread = number of CPUs on the first_cpu list. */
+    /* Detailed trace information and runtime counters. */
+    auto &OS = DM.debug();
+    OS << "----------------------------\n"
+       << "Trace execution information:\n";
+
+    unsigned NumTraces = TransCode.size();
+    for (unsigned i = 0; i != NumThread; ++i) {
+        unsigned TraceUsed = 0;
+        /* Traces whose three counters are all zero are not listed. */
+        OS << ">\n"
+           << "Thread " << i << ":\n"
+           << "                                   dynamic exec count\n"
+           << "  id      pc      #loop:#exit      loop      ibtc      exit\n";
+        for (unsigned j = 0; j != NumTraces; ++j) {
+            TraceInfo *Trace = TransCode[j]->Trace;
+            uint64_t *Counter = Trace->ExecCount[i];
+            if (Counter[0] + Counter[1] + Counter[2] == 0)
+                continue;
+            TraceUsed++;
+            OS << format("%4u", j) << ") "   /* j is unsigned */
+               << format("0x%08" PRIx, Trace->getEntryPC()) << "    "
+               << format("%2d", Trace->NumLoop)   << "    "
+               << format("%2d", Trace->NumExit)   << "   "
+               << format("%8" PRIu64, Counter[0]) << "  "   /* counters are uint64_t */
+               << format("%8" PRIu64, Counter[1]) << "  "
+               << format("%8" PRIu64, Counter[2]) << "\n";
+        }
+        OS << "Trace used: " << TraceUsed << "/" << NumTraces <<"\n";
+    }
+}
+
+static void printHPM()
+{
+    auto &Out = DM.debug();   /* one counter from SP per output line */
+    Out << "Num of Insns     : " << SP->NumInsns << "\n";
+    Out << "Num of Loads     : " << SP->NumLoads << "\n";
+    Out << "Num of Stores    : " << SP->NumStores << "\n";
+    Out << "Num of Branches  : " << SP->NumBranches << "\n";
+    const double Scaled = (double)SP->SampleTime * 1e-6;  /* printed as seconds */
+    Out << "Sample Time      : " << format("%.6f seconds", Scaled) << "\n";
+}
+
+static void printHotspot(unsigned &CoverSet,
+                         std::vector<std::vector<uint64_t> *> &SampleListVec)
+{
+    /* Report how the sampled IPs split over block cache, trace cache and
+     * other code, then the cover set. Frees and empties SampleListVec. */
+    auto &OS = DM.debug();
+    auto &TransCode = LLEnv->getTransCode();
+    auto &SortedCode = LLEnv->getSortedCode();
+    uint64_t BlockCacheStart = (uintptr_t)tcg_ctx_global.code_gen_buffer;
+    uint64_t BlockCacheEnd = BlockCacheStart + tcg_ctx_global.code_gen_buffer_size;
+    uint64_t TraceCacheStart = (uintptr_t)LLVMEnv::TraceCache;
+    uint64_t TraceCacheEnd = TraceCacheStart + LLVMEnv::TraceCacheSize;
+    uint64_t NumBlockCache = 0, NumTraceCache = 0, NumOther = 0;
+
+    for (auto *L : SampleListVec) {
+        for (uint64_t IP : *L) {
+            if (IP >= BlockCacheStart && IP < BlockCacheEnd)
+                NumBlockCache++;
+            else if (IP >= TraceCacheStart && IP < TraceCacheEnd)
+                NumTraceCache++;
+            else
+                NumOther++;
+            /* Credit the last entry keyed <= IP if IP is below its end. */
+            auto IT = SortedCode.upper_bound(IP);
+            if (IT == SortedCode.begin())
+                continue;
+            auto TC = (--IT)->second;
+            if (IP < (uint64_t)TC->Code + TC->Size)
+                TC->SampleCount++;
+        }
+        delete L;
+    }
+    SampleListVec.clear();  /* the lists were freed; drop the stale pointers */
+
+    uint64_t TotalSamples = NumBlockCache + NumTraceCache + NumOther;
+    if (TotalSamples == 0 || TransCode.empty()) {
+        OS << CoverSet << "% CoverSet     : 0\n";
+        return;
+    }
+
+    /* Print the time breakdown of block cache, trace cache and other. */
+    char buf[128] = {'\0'};
+    double RatioBlockCache = (double)NumBlockCache * 100 / TotalSamples;
+    double RatioTraceCache = (double)NumTraceCache * 100 / TotalSamples;
+    snprintf(buf, sizeof(buf), "block (%.1f%%) trace (%.1f%%) other (%.1f%%)",
+             RatioBlockCache, RatioTraceCache, 100.0f - RatioBlockCache - RatioTraceCache);
+    OS << "Breakdown        : " << buf << "\n";
+
+    /* Print the amount of traces in the cover set. */
+    std::map<TranslatedCode *, unsigned> IndexMap;
+    for (unsigned i = 0, e = TransCode.size(); i != e; ++i)
+        IndexMap[TransCode[i]] = i;
+    LLVMEnv::TransCodeList Covered(TransCode.begin(), TransCode.end());
+    std::sort(Covered.begin(), Covered.end(),
+            [](const TranslatedCode *a, const TranslatedCode *b) {
+                return a->SampleCount > b->SampleCount;
+            });
+
+    uint64_t CoverSamples = TotalSamples * CoverSet / 100;
+    uint64_t AccuSamples = 0;
+    unsigned NumTracesInCoverSet = 0;
+    for (TranslatedCode *TC : Covered) {
+        if (AccuSamples >= CoverSamples || TC->SampleCount == 0)
+            break;
+        NumTracesInCoverSet++;
+        AccuSamples += TC->SampleCount;
+    }
+
+    OS << CoverSet << "% CoverSet     : " << NumTracesInCoverSet << "\n";
+    if (NumTracesInCoverSet == 0)
+        return;
+
+    /* Print the percentage of time of the traces in the cover set. */
+    if (DM.getDebugMode() & DEBUG_IR_OPT) {
+        OS << "Traces of CoverSet:\n";
+        for (unsigned i = 0; i < NumTracesInCoverSet; ++i) {
+            TranslatedCode *TC = Covered[i];
+            snprintf(buf, sizeof(buf), "%4u (%.1f%%): ", IndexMap[TC],
+                     (double)TC->SampleCount * 100 / TotalSamples);
+            OS << buf;
+            int j = 0;
+            for (auto *TB: TC->Trace->TBs) {
+                std::stringstream ss;
+                ss << std::hex << TB->pc;
+                OS << (j++ == 0 ? "" : ",") << ss.str();
+            }
+            OS << "\n";
+        }
+    } else {
+        unsigned top = 10;
+        OS << "Percentage of CoverSet (top 10): ";
+        if (NumTracesInCoverSet < top)
+            top = NumTracesInCoverSet;
+        for (unsigned i = 0; i < top; ++i) {
+            TranslatedCode *TC = Covered[i];
+            snprintf(buf, sizeof(buf), "%.1f%%",
+                     (double)TC->SampleCount * 100 / TotalSamples);
+            OS << (i == 0 ? "" : " ") << buf;
+        }
+        OS << "\n";
+    }
+}
+
+void SoftwarePerfmon::printTraceProfile()
+{
+    auto &OS = DM.debug();
+    unsigned NumTraces = LLEnv->getTransCode().size();
+    /* No traces: short header only. Otherwise one section per bit set in Mode. */
+    OS << "\n";
+    if (NumTraces == 0) {
+        OS << "Trace statistic:\n"
+           << "Num of Traces  : " << NumTraces << "\n\n";
+        return;
+    }
+
+    /* Static information */
+    if (Mode & SPM_BASIC)
+        printBasic(LLEnv->getTransCode());
+    if (Mode & SPM_EXIT)
+        OS << "Num of TraceExit : " << NumTraceExits << "\n";
+    if (Mode & SPM_HPM)
+        printHPM();
+    if (Mode & SPM_HOTSPOT)
+        printHotspot(CoverSet, SP->SampleListVec);
+
+    /* Code cache information - start address and size */
+    if (Mode & SPM_CACHE) {
+        size_t BlockSize = (uintptr_t)tcg_ctx_global.code_gen_ptr -
+                           (uintptr_t)tcg_ctx_global.code_gen_buffer;
+        size_t TraceSize = LLEnv->getMemoryManager()->getCodeSize();
+        /* Both sizes are size_t, hence the %zu conversions below. */
+        OS << "-------------------------\n"
+           << "Block/Trace Cache information:\n";
+        OS << "Block: start=" << tcg_ctx_global.code_gen_buffer
+           << " size=" << tcg_ctx_global.code_gen_buffer_size
+           << " code=" << format("%8zu", BlockSize) << " (ratio="
+           << format("%.2f", (double)BlockSize * 100 / tcg_ctx_global.code_gen_buffer_size)
+           << "%)\n";
+        OS << "Trace: start=" << LLVMEnv::TraceCache
+           << " size=" << LLVMEnv::TraceCacheSize
+           << " code=" << format("%8zu", TraceSize) << " (ratio="
+           << format("%.2f", (double)TraceSize * 100 / LLVMEnv::TraceCacheSize)
+           << "%)\n\n";
+    }
+
+    if (Mode & SPM_TRACE)
+        printTraceExec(LLEnv->getTransCode());
+
+    if ((Mode & SPM_PASS) && !ExitFunc.empty()) {
+        OS << "\n-------------------------\n"
+           << "Pass information:\n";
+        for (unsigned i = 0, e = ExitFunc.size(); i != e; ++i)
+            (*ExitFunc[i])();
+    }
+}
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
+
diff --git a/llvm/llvm-target.cpp b/llvm/llvm-target.cpp
new file mode 100644
index 0000000..609a4ad
--- /dev/null
+++ b/llvm/llvm-target.cpp
@@ -0,0 +1,812 @@
+/*
+ *  (C) 2010 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/Object/Binary.h"
+#include "llvm/DebugInfo/DIContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm-debug.h"
+#include "llvm-opc.h"
+#include "llvm-target.h"
+
+using namespace llvm::object;
+
+extern "C" {
+#if defined(TARGET_I386)
+extern const int comis_eflags[4];
+extern const int fcom_ccval[4];
+#endif
+}
+
+
+static std::vector<TCGHelperInfo> MMUHelper = {  /* {address, name} pairs */
+#if defined(CONFIG_SOFTMMU)  /* load helpers first; the table is empty otherwise */
+    { (void *)llvm_ret_ldub_mmu, "llvm_ret_ldub_mmu", },
+    { (void *)llvm_le_lduw_mmu,  "llvm_le_lduw_mmu", },
+    { (void *)llvm_le_ldul_mmu,  "llvm_le_ldul_mmu", },
+    { (void *)llvm_le_ldq_mmu,   "llvm_le_ldq_mmu", },
+    { (void *)llvm_be_lduw_mmu,  "llvm_be_lduw_mmu", },
+    { (void *)llvm_be_ldul_mmu,  "llvm_be_ldul_mmu", },
+    { (void *)llvm_be_ldq_mmu,   "llvm_be_ldq_mmu", },
+    { (void *)llvm_ret_ldsb_mmu, "llvm_ret_ldsb_mmu", },
+    { (void *)llvm_le_ldsw_mmu,  "llvm_le_ldsw_mmu", },
+    { (void *)llvm_le_ldsl_mmu,  "llvm_le_ldsl_mmu", },
+    { (void *)llvm_be_ldsw_mmu,  "llvm_be_ldsw_mmu", },
+    { (void *)llvm_be_ldsl_mmu,  "llvm_be_ldsl_mmu", },
+    /* store helpers */
+    { (void *)llvm_ret_stb_mmu, "llvm_ret_stb_mmu", },
+    { (void *)llvm_le_stw_mmu,  "llvm_le_stw_mmu", },
+    { (void *)llvm_le_stl_mmu,  "llvm_le_stl_mmu", },
+    { (void *)llvm_le_stq_mmu,  "llvm_le_stq_mmu", },
+    { (void *)llvm_be_stw_mmu,  "llvm_be_stw_mmu", },
+    { (void *)llvm_be_stl_mmu,  "llvm_be_stl_mmu", },
+    { (void *)llvm_be_stq_mmu,  "llvm_be_stq_mmu", },
+#endif /* CONFIG_SOFTMMU */
+};
+
+
+/* Helper functions that cause side effects.
+ * For example, helpers modifying CPU states that cannot be identified,
+ * or helpers that call MMU helpers.
+ * While translating qemu_ld/st, we record MMU helper calls so that we
+ * know how to restore state when a page fault is handled. Unfortunately,
+ * we lose track of MMU helper calls made inside a helper function, so the
+ * restoration would fail. Currently, we mark such helper functions as
+ * illegal, and skip trace building when translating op_call encounters
+ * a call to one of them. */
+static std::vector<TCGHelperInfo> IllegalHelper = {
+#if defined(CONFIG_SOFTMMU)
+#  if defined(TARGET_I386)
+    { (void *)helper_cmpxchg8b, "helper_cmpxchg8b", },
+    { (void *)helper_boundw, "helper_boundw", },
+    { (void *)helper_boundl, "helper_boundl", },
+#  elif defined(TARGET_ARM)
+    { (void *)helper_dc_zva, "helper_dc_zva", },
+#  endif
+#else /* !CONFIG_SOFTMMU */
+#  if defined(TARGET_AARCH64)
+    { (void *)helper_simd_tbl, "helper_simd_tbl", },
+#  endif
+#endif /* CONFIG_SOFTMMU */
+};
+
+
+#define DEF_HELPER_FLAGS_0(name, flags, ret) { (void *)helper_##name, "helper_"#name }, 
+#define DEF_HELPER_FLAGS_1(name, flags, ret, t1) DEF_HELPER_FLAGS_0(name, flags, ret)
+#define DEF_HELPER_FLAGS_2(name, flags, ret, t1, t2) DEF_HELPER_FLAGS_0(name, flags, ret)
+#define DEF_HELPER_FLAGS_3(name, flags, ret, t1, t2, t3) DEF_HELPER_FLAGS_0(name, flags, ret)
+#define DEF_HELPER_FLAGS_4(name, flags, ret, t1, t2, t3, t4) DEF_HELPER_FLAGS_0(name, flags, ret)
+/* One {helper_<name>, "helper_<name>"} entry per DEF_HELPER_FLAGS_* use in atomic-helper.h. */
+static std::vector<TCGHelperInfo> LMTHelper = {
+#if defined(CONFIG_SOFTMMU)
+#include "atomic-helper.h"
+#endif
+};
+
+#undef DEF_HELPER_FLAGS_0
+#undef DEF_HELPER_FLAGS_1
+#undef DEF_HELPER_FLAGS_2
+#undef DEF_HELPER_FLAGS_3
+#undef DEF_HELPER_FLAGS_4
+
+
+const char *getMMUFName(const void *func)
+{
+    /* Linear scan of the MMU helper table; "" means no entry matched. */
+    for (const TCGHelperInfo &Entry : MMUHelper)
+        if (func == Entry.func)
+            return Entry.name;
+    return "";
+}
+
+bool isMMUFunction(std::string &Name)
+{
+    /* True when Name equals the name of an MMU helper table entry. */
+    for (const TCGHelperInfo &Entry : MMUHelper)
+        if (Name == Entry.name)
+            return true;
+    return false;
+}
+
+bool isLMTFunction(std::string &Name)
+{
+    /* True when Name equals the name of an LMT helper table entry. */
+    for (const TCGHelperInfo &Entry : LMTHelper)
+        if (Name == Entry.name)
+            return true;
+    return false;
+}
+
+bool isIllegalHelper(const void *func)
+{
+    /* True when func is the address stored in an IllegalHelper entry. */
+    for (const TCGHelperInfo &Entry : IllegalHelper)
+        if (func == Entry.func)
+            return true;
+    return false;
+}
+
+/* Return true when Name is one of the C library functions listed below. */
+bool isLibcall(std::string &Name)
+{
+    if (Name == "fmodf" || Name == "fmod" || Name == "fmodl" ||
+        Name == "abs" || Name == "labs" || Name == "llabs" ||
+        Name == "fabs" || Name == "fabsf" || Name == "fabsl" ||
+        Name == "sqrtf" || Name == "sqrt" || Name == "sqrtl" ||
+        Name == "logf" || Name == "log" || Name == "logl" ||
+        Name == "log2f" || Name == "log2" || Name == "log2l" ||
+        Name == "log10f" || Name == "log10" || Name == "log10l" ||
+        Name == "expf" || Name == "exp" || Name == "expl" ||
+        Name == "exp2f" || Name == "exp2" || Name == "exp2l" ||
+        Name == "ldexpf" || Name == "ldexp" || Name == "ldexpl" ||
+        Name == "sinf" || Name == "sin" || Name == "sinl" ||
+        Name == "cosf" || Name == "cos" || Name == "cosl" ||
+        Name == "tanf" || Name == "tan" || Name == "tanl" ||
+        Name == "atanf" || Name == "atan" || Name == "atanl" ||
+        Name == "atan2f" || Name == "atan2" || Name == "atan2l" ||  /* was "atanf2"/"atanl2" */
+        Name == "powf" || Name == "pow" || Name == "powl" ||
+        Name == "ceilf" || Name == "ceil" || Name == "ceill" ||
+        Name == "truncf" || Name == "trunc" || Name == "truncl" ||
+        Name == "rintf" || Name == "rint" || Name == "rintl" ||
+        Name == "lrintf" || Name == "lrint" || Name == "lrintl" ||
+        Name == "nearbyintf" || Name == "nearbyint" || Name == "nearbyintl" ||
+        Name == "floorf" || Name == "floor" || Name == "floorl" ||
+        Name == "copysignf" || Name == "copysign" || Name == "copysignl" ||
+        Name == "memcpy" || Name == "memmove" || Name == "memset" ||
+        Name == "fegetround" || Name == "fesetround" ||
+        Name == "__isinfl" || Name == "__isnanl")
+    {
+        return true;
+    }
+
+    return false;
+}
+
+/* Return true when Name is one of the softfloat routine names listed below. */
+bool isSoftFPcall(std::string &Name)
+{
+    static const char *const SoftFPName[] = {
+        "float16_to_float32",
+        "float32_add",
+        "float32_compare",
+        "float32_compare_quiet",
+        "float32_div",
+        "float32_mul",
+        "float32_scalbn",
+        "float32_sqrt",
+        "float32_sub",
+        "float32_to_float16",
+        "float32_to_float64",
+        "float32_to_int32",
+        "float32_to_int64",
+        "float32_to_uint32",
+        "float32_minnum",
+        "float32_maxnum",
+        "float64_add",
+        "float64_compare",
+        "float64_compare_quiet",
+        "float64_div",
+        "float64_mul",
+        "float64_scalbn",
+        "float64_sqrt",
+        "float64_sub",
+        "float64_to_float32",
+        "float64_to_int32",
+        "float64_to_int64",
+        "float64_to_uint32",
+        "float64_minnum",
+        "float64_maxnum",
+        "int32_to_float32",
+        "int32_to_float64",
+        "int64_to_float32",
+        "normalizeRoundAndPackFloat128",
+        "propagateFloat128NaN",
+        "propagateFloatx80NaN",
+        "roundAndPackFloat128",
+        "roundAndPackFloat32",
+        "roundAndPackFloat64",
+        "roundAndPackFloatx80",
+        "set_float_rounding_mode",
+        "subFloat128Sigs",
+        "subFloat32Sigs",
+        "subFloat64Sigs",
+        "subFloatx80Sigs",
+        "uint32_to_float32",
+        "uint32_to_float64",
+#if 0
+        /* FIXME: this function causes LLVM JIT error:
+           LLVM ERROR: Error reading function 'set_float_exception_flags' from bitcode file: Malformed block record */
+        "set_float_exception_flags",
+#endif
+        "addFloat32Sigs",
+        "addFloat64Sigs",
+
+        "float32_to_int32_round_to_zero",
+        "float64_to_int32_round_to_zero",
+
+        "int32_to_floatx80",
+        "int64_to_floatx80",
+        "float32_to_floatx80",
+        "float64_to_floatx80",
+        "floatx80_abs",
+        "floatx80_chs",
+        "floatx80_is_infinity",
+        "floatx80_is_neg",
+        "floatx80_is_zero",
+        "floatx80_is_zero_or_denormal",
+        "floatx80_is_any_nan",
+
+        "floatx80_to_int32",
+        "floatx80_to_int32_round_to_zero",
+        "floatx80_to_int64",
+        "floatx80_to_int64_round_to_zero",
+        "floatx80_to_float32",
+        "floatx80_to_float64",
+        "floatx80_to_float128",
+        "floatx80_round_to_int",
+        "floatx80_add",
+        "floatx80_sub",
+        "floatx80_mul",
+        "floatx80_div",
+        "floatx80_rem",
+        "floatx80_sqrt",
+        "floatx80_eq",
+        "floatx80_le",
+        "floatx80_lt",
+        "floatx80_unordered",
+        "floatx80_eq_quiet",
+        "floatx80_le_quiet",
+        "floatx80_lt_quiet",
+        "floatx80_unordered_quiet",
+        "floatx80_compare",
+        "floatx80_compare_quiet",
+        "floatx80_is_quiet_nan",
+        "floatx80_is_signaling_nan",
+        "floatx80_maybe_silence_nan",
+        "floatx80_scalbn",
+    };
+
+    /* Exact, case-sensitive comparison against every table entry. */
+    for (const char *Known : SoftFPName)
+        if (Name == Known)
+            return true;
+    return false;
+}
+
+/* Bind the names of the softfloat and math routines below to their addresses. */
+void AddFPUSymbols(LLVMTranslator *Translator)
+{
+#define AddSymbol(a) Translator->AddSymbol(#a, (void*)a)  /* name = stringized symbol */
+    AddSymbol(float32_add);
+    AddSymbol(float32_sub);
+    AddSymbol(float32_mul);
+    AddSymbol(float32_div);
+    AddSymbol(float32_sqrt);
+    AddSymbol(float32_scalbn);
+    AddSymbol(float32_compare);
+    AddSymbol(float32_compare_quiet);
+    AddSymbol(float32_minnum);
+    AddSymbol(float32_maxnum);
+    AddSymbol(float64_add);
+    AddSymbol(float64_sub);
+    AddSymbol(float64_mul);
+    AddSymbol(float64_div);
+    AddSymbol(float64_sqrt);
+    AddSymbol(float64_scalbn);
+    AddSymbol(float64_compare);
+    AddSymbol(float64_compare_quiet);
+    AddSymbol(float64_minnum);
+    AddSymbol(float64_maxnum);
+    AddSymbol(float16_to_float32);
+    AddSymbol(float32_to_float16);
+    AddSymbol(float32_to_float64);
+    AddSymbol(float32_to_int32);
+    AddSymbol(float32_to_int64);
+    AddSymbol(float32_to_uint32);
+    AddSymbol(float64_to_float32);
+    AddSymbol(float64_to_int32);
+    AddSymbol(float64_to_int64);
+    AddSymbol(float64_to_uint32);
+    AddSymbol(int32_to_float32);
+    AddSymbol(int32_to_float64);
+    AddSymbol(int64_to_float32);
+    AddSymbol(uint32_to_float32);
+    AddSymbol(uint32_to_float64);
+    AddSymbol(float32_to_int32_round_to_zero);
+    AddSymbol(float64_to_int32_round_to_zero);
+    /* floatx80: conversions into the format and value predicates. */
+    AddSymbol(int32_to_floatx80);
+    AddSymbol(int64_to_floatx80);
+    AddSymbol(float32_to_floatx80);
+    AddSymbol(float64_to_floatx80);
+    AddSymbol(floatx80_abs);
+    AddSymbol(floatx80_chs);
+    AddSymbol(floatx80_is_infinity);
+    AddSymbol(floatx80_is_neg);
+    AddSymbol(floatx80_is_zero);
+    AddSymbol(floatx80_is_zero_or_denormal);
+    AddSymbol(floatx80_is_any_nan);
+    /* floatx80: conversions out of the format, arithmetic and comparisons. */
+    AddSymbol(floatx80_to_int32);
+    AddSymbol(floatx80_to_int32_round_to_zero);
+    AddSymbol(floatx80_to_int64);
+    AddSymbol(floatx80_to_int64_round_to_zero);
+    AddSymbol(floatx80_to_float32);
+    AddSymbol(floatx80_to_float64);
+    AddSymbol(floatx80_to_float128);
+    AddSymbol(floatx80_round_to_int);
+    AddSymbol(floatx80_add);
+    AddSymbol(floatx80_sub);
+    AddSymbol(floatx80_mul);
+    AddSymbol(floatx80_div);
+    AddSymbol(floatx80_rem);
+    AddSymbol(floatx80_sqrt);
+    AddSymbol(floatx80_eq);
+    AddSymbol(floatx80_le);
+    AddSymbol(floatx80_lt);
+    AddSymbol(floatx80_unordered);
+    AddSymbol(floatx80_eq_quiet);
+    AddSymbol(floatx80_le_quiet);
+    AddSymbol(floatx80_lt_quiet);
+    AddSymbol(floatx80_unordered_quiet);
+    AddSymbol(floatx80_compare);
+    AddSymbol(floatx80_compare_quiet);
+    AddSymbol(floatx80_is_quiet_nan);
+    AddSymbol(floatx80_is_signaling_nan);
+    AddSymbol(floatx80_maybe_silence_nan);
+    AddSymbol(floatx80_scalbn);
+    /* Math routines taken from the C library rather than softfloat. */
+    AddSymbol(rint);
+    AddSymbol(rintf);
+    AddSymbol(lrint);
+    AddSymbol(lrintf);
+    AddSymbol(llrint);
+    AddSymbol(llrintf);
+    AddSymbol(remainder);
+    AddSymbol(remainderf);
+    AddSymbol(fabs);
+    AddSymbol(fabsf);
+    AddSymbol(sqrt);
+    AddSymbol(sqrtf);
+    AddSymbol(trunc);
+    AddSymbol(exp2);
+    AddSymbol(log);
+    AddSymbol(ldexp);
+    AddSymbol(floor);
+    AddSymbol(ceil);
+    AddSymbol(sin);
+    AddSymbol(cos);
+    AddSymbol(tan);
+    AddSymbol(atan2);
+    AddSymbol(__isinf);
+    AddSymbol(__isnan);
+#undef AddSymbol
+}
+
+/* Register the name and address of every entry of LMTHelper with the
+ * translator. */
+void AddLMTSymbols(LLVMTranslator *Translator)
+{
+    const unsigned NumHelpers = LMTHelper.size();
+    for (unsigned Idx = 0; Idx != NumHelpers; ++Idx)
+        Translator->AddSymbol(LMTHelper[Idx].name, LMTHelper[Idx].func);
+}
+
+/* Register the name and address of every entry of MMUHelper with the
+ * translator. */
+void AddMMUSymbols(LLVMTranslator *Translator)
+{
+    const unsigned NumHelpers = MMUHelper.size();
+    for (unsigned Idx = 0; Idx != NumHelpers; ++Idx)
+        Translator->AddSymbol(MMUHelper[Idx].name, MMUHelper[Idx].func);
+}
+
+/* Bind the names of the functions and global variables that the helpers refer
+ * to with their addresses in this process. The set of symbols depends on the
+ * build mode (CONFIG_USER_ONLY or not) and on the guest (TARGET_I386). The
+ * FPU symbols are always added; the LMT and MMU helper symbols are only added
+ * when CONFIG_USER_ONLY is not defined. */
+void AddDependentSymbols(LLVMTranslator *Translator)
+{
+    const struct {
+        const char *Name;
+        void *Addr;
+    } Table[] = {
+        { "helper_verify_tb",     (void*)helper_verify_tb },
+        { "helper_lookup_ibtc",   (void*)helper_lookup_ibtc },
+#if defined(CONFIG_USER_ONLY)
+        { "guest_base",           (void*)&guest_base },
+#else
+        { "helper_lookup_cpbl",   (void*)helper_lookup_cpbl },
+        { "helper_validate_cpbl", (void*)helper_validate_cpbl },
+#endif
+        { "cpu_loop_exit",        (void*)cpu_loop_exit },
+        { "qemu_logfile",         (void*)&qemu_logfile },
+        { "qemu_loglevel",        (void*)&qemu_loglevel },
+#if defined(CONFIG_USER_ONLY)
+        { "alignment_count",      (void*)alignment_count },
+        { "aligned_boundary",     (void*)&aligned_boundary },
+#else
+        { "exp2",                 (void*)exp2 },
+#endif
+#if defined(TARGET_I386)
+        { "parity_table",         (void*)parity_table },
+        { "comis_eflags",         (void*)comis_eflags },
+        { "fcom_ccval",           (void*)fcom_ccval },
+#if defined(CONFIG_USER_ONLY)
+        { "raise_exception",      (void*)raise_exception },
+        { "raise_exception_err",  (void*)raise_exception_err },
+#endif
+#endif
+    };
+
+    /* Register the entries in table order. */
+    for (const auto &Sym : Table)
+        Translator->AddSymbol(Sym.Name, Sym.Addr);
+
+    AddFPUSymbols(Translator);
+#if !defined(CONFIG_USER_ONLY)
+    AddLMTSymbols(Translator);
+    AddMMUSymbols(Translator);
+#endif
+}
+
+/* Return base address and offset of a memory access pointer.
+ *
+ * Looks through bitcast/inttoptr operations and through GEPs whose indices
+ * are all constant, recursing on the underlying operand. The byte offset
+ * contributed by each GEP is added to Offset. Offset is only accumulated
+ * into and never reset here, so the caller chooses its starting value.
+ * The returned value is the first one that cannot be looked through. */
+Value *getBaseWithConstantOffset(const DataLayout *DL, Value *Ptr,
+                                 intptr_t &Offset)
+{
+    /* A value that is not an Operator cannot be looked through. */
+    Operator *PtrOp = dyn_cast<Operator>(Ptr);
+    if (!PtrOp)
+        return Ptr;
+
+    /* Casts do not change the address; continue with the cast operand. */
+    if (PtrOp->getOpcode() == Instruction::BitCast ||
+        PtrOp->getOpcode() == Instruction::IntToPtr)
+        return getBaseWithConstantOffset(DL, PtrOp->getOperand(0), Offset);
+
+    /* If this is a GEP with constant indices, we can look through it. */
+    GEPOperator *GEP = dyn_cast<GEPOperator>(PtrOp);
+    if (!GEP || !GEP->hasAllConstantIndices())
+        return Ptr;
+
+    /* Walk the indices together with the type that each index steps into. */
+    gep_type_iterator GTI = gep_type_begin(GEP);
+    for (auto I = GEP->idx_begin(), E = GEP->idx_end(); I != E; ++I, ++GTI) {
+        ConstantInt *OpC = cast<ConstantInt>(*I);
+        if (OpC->isZero())
+            continue;
+        
+        /* Handle a struct and array indices which add their offset to the
+         * pointer. */
+#if defined(LLVM_V35) || defined(LLVM_V38) || defined(LLVM_V39)
+        if (StructType *STy = dyn_cast<StructType>(*GTI))
+#else
+        if (StructType *STy = GTI.getStructTypeOrNull())
+#endif
+            /* Struct index: add the byte offset of the selected field. */
+            Offset += DL->getStructLayout(STy)->getElementOffset(OpC->getZExtValue());
+        else {
+            /* Other index: scale the signed index by the allocation size of
+             * the indexed type. */
+            intptr_t Size = DL->getTypeAllocSize(GTI.getIndexedType());
+            Offset += OpC->getSExtValue() * Size;
+        }
+    }
+
+    return getBaseWithConstantOffset(DL, GEP->getPointerOperand(), Offset);
+}
+
+/* Add the constant byte offset contributed by the indices of GEP to Offset.
+ *
+ * Index operands that are identical to GuestBase are ignored. Returns false
+ * as soon as any other index operand is not a ConstantInt; Offset may already
+ * have been partially updated in that case. Returns true once every index has
+ * been processed. */
+static bool accumulateConstantOffset(const DataLayout *DL, GEPOperator *GEP,
+                                     APInt &Offset, Value *GuestBase)
+{
+    for (auto GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP); GTI != GTE; ++GTI) {
+        /* Skip the operand if it is from the guest base. */
+        if (GTI.getOperand() == GuestBase)
+            continue;
+        ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
+        if (!OpC)
+            return false;
+        if (OpC->isZero())
+            continue;
+
+        /* Handle a struct index, which adds its field offset to the pointer. */
+#if defined(LLVM_V35) || defined(LLVM_V38) || defined(LLVM_V39)
+        if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+#else
+        if (StructType *STy = GTI.getStructTypeOrNull()) {
+#endif
+            unsigned ElementIdx = OpC->getZExtValue();
+            const StructLayout *SL = DL->getStructLayout(STy);
+            Offset += APInt(Offset.getBitWidth(),
+                            SL->getElementOffset(ElementIdx));
+            continue;
+        }
+
+        /* For array or vector indices, scale the index by the size of the type. */
+        /* The index is sign-extended or truncated to the width of Offset
+         * before it is scaled. */
+        APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth());
+        Offset += Index * APInt(Offset.getBitWidth(),
+                                DL->getTypeAllocSize(GTI.getIndexedType()));
+    }
+    return true;
+}
+
+/* Walk up the definition chain of Ptr through bitcast, inttoptr and
+ * getelementptr operations and return the first value that is not produced by
+ * one of them. A value of non-pointer type is returned unchanged. */
+Value *StripPointer(Value *Ptr)
+{
+    if (!Ptr->getType()->isPointerTy())
+        return Ptr;
+
+    SmallPtrSet<Value *, 8> Visited;
+    Visited.insert(Ptr);
+    do {
+        /* The chain normally ends at a value that is neither an instruction
+         * nor a constant expression (e.g., a function argument or a global
+         * variable). Such a value is not an Operator, so it has to be tested
+         * with dyn_cast<>; an unconditional cast<> on it is invalid. */
+        Operator *PtrOp = dyn_cast<Operator>(Ptr);
+        if (!PtrOp)
+            return Ptr;
+
+        unsigned Opcode = PtrOp->getOpcode();
+        if (Opcode == Instruction::BitCast  ||
+            Opcode == Instruction::IntToPtr ||
+            Opcode == Instruction::GetElementPtr)
+            Ptr = PtrOp->getOperand(0);
+        else
+            return Ptr;
+
+        /* Stop if the chain loops back to a value that was already seen. */
+        if (Visited.count(Ptr))
+            break;
+        Visited.insert(Ptr);
+    } while (true);
+
+    return Ptr;
+}
+
+/* Walk up the definition chain of Ptr and return the underlying value while
+ * accumulating the constant byte offset that was applied along the way.
+ *
+ * The walk looks through:
+ *   - GEPs whose offset accumulateConstantOffset() can compute (GuestBase is
+ *     forwarded to it),
+ *   - bitcast and inttoptr,
+ *   - add/sub whose second operand is a constant integer.
+ * It stops at the first value that cannot be looked through and returns it.
+ * Offset is only accumulated into, so the caller chooses its starting value
+ * and bit width. A value of non-pointer type is returned unchanged. */
+Value *StripPointerWithConstantOffset(const DataLayout *DL, Value *Ptr,
+                                      APInt &Offset, Value *GuestBase)
+{
+    if (!Ptr->getType()->isPointerTy())
+        return Ptr;
+
+    std::set<Value *> Visited;
+    Visited.insert(Ptr);
+    Value *V = Ptr;
+    do {
+        if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+            /* Work on a copy so that Offset is left untouched if the GEP
+             * turns out to have a non-constant index. */
+            APInt GEPOffset(Offset);
+            if (!accumulateConstantOffset(DL, GEP, GEPOffset, GuestBase))
+                return V;
+            Offset = GEPOffset;
+            V = GEP->getPointerOperand();
+        } else {
+            /* The chain normally ends at a value that is not an Operator
+             * (e.g., a function argument or a global variable), so test with
+             * dyn_cast<>; an unconditional cast<> on it is invalid. */
+            Operator *PtrOp = dyn_cast<Operator>(V);
+            if (!PtrOp)
+                return V;
+
+            unsigned Opcode = PtrOp->getOpcode();
+            if (Opcode == Instruction::BitCast ||
+                Opcode == Instruction::IntToPtr) {
+                V = PtrOp->getOperand(0);
+            } else if (Opcode == Instruction::Add ||
+                       Opcode == Instruction::Sub) {
+                ConstantInt *CI = dyn_cast<ConstantInt>(PtrOp->getOperand(1));
+                if (!CI)
+                    return V;
+
+                int64_t C = CI->getSExtValue();
+                if (Opcode == Instruction::Add)
+                    Offset += APInt(Offset.getBitWidth(), C, true);
+                else
+                    Offset -= APInt(Offset.getBitWidth(), C, true);
+                V = PtrOp->getOperand(0);
+            } else
+                return V;
+        }
+
+        /* Stop if the chain loops back to a value that was already seen.
+         * This check is applied after every step, including GEPs. */
+        if (Visited.find(V) != Visited.end())
+            break;
+        Visited.insert(V);
+    } while (true);
+
+    return V;
+}
+
+/* Erase Inst from its basic block. Every operand of an erased instruction
+ * that is itself an instruction, is still attached to a basic block, and is
+ * left without any use is erased as well, transitively. */
+static void DeleteDeadInstructions(Instruction *Inst)
+{
+    SmallVector<Instruction*, 16> Worklist;
+    Worklist.push_back(Inst);
+
+    while (!Worklist.empty()) {
+        Instruction *Dead = Worklist.pop_back_val();
+
+        const unsigned NumOps = Dead->getNumOperands();
+        for (unsigned Idx = 0; Idx != NumOps; ++Idx) {
+            /* Drop the use first so that use_empty() reflects the state
+             * after this instruction is gone. */
+            Value *Op = Dead->getOperand(Idx);
+            Dead->setOperand(Idx, nullptr);
+            if (!Op->use_empty())
+                continue;
+
+            if (Instruction *OpInst = dyn_cast<Instruction>(Op)) {
+                if (OpInst->getParent())
+                    Worklist.push_back(OpInst);
+            }
+        }
+
+        Dead->eraseFromParent();
+    }
+}
+
+/* Erase every instruction queued in toErase (together with the operands that
+ * become unused), then empty the queue. */
+void ProcessErase(IVec &toErase)
+{
+    for (auto &Inst : toErase)
+        DeleteDeadInstructions(Inst);
+
+    toErase.clear();
+}
+
+
+/*
+ * JIT Event Listener
+ */
+/* Record the address and size of the emitted code in NI. Nothing is recorded
+ * when NI.Func is not set. */
+void EventListener::NotifyFunctionEmitted(const Function &F,
+                                          void *Code, size_t Size,
+                                          const EmittedFunctionDetails &Details)
+{
+    if (NI.Func) {
+        NI.Code = static_cast<uint8_t *>(Code);
+        NI.Size = Size;
+    }
+}
+
+/* Called when an object has been emitted. The object must contain exactly one
+ * function symbol: its address and size are stored in NI.Code/NI.Size, and
+ * every entry of its DWARF line table is passed to NI.addPatch() as
+ * (line, column, address). Any other number of function symbols is reported
+ * through hqemu_error().
+ *
+ * The DWARF context is owned by a std::unique_ptr so that it is released when
+ * the function returns. */
+#if defined(LLVM_V35)
+void EventListener::NotifyObjectEmitted(const ObjectImage &Obj)
+{
+    StringRef Name;
+    uint64_t Code = 0;
+    uint64_t Size = 0;
+    unsigned NumFunc = 0;
+    /* getDWARFContext() returns a newly allocated context (or null). */
+    std::unique_ptr<DIContext> Context(
+        DIContext::getDWARFContext(Obj.getObjectFile()));
+
+    for (auto I = Obj.begin_symbols(), E = Obj.end_symbols(); I != E; ++I) {
+        object::SymbolRef::Type SymType;
+        if (I->getType(SymType)) continue;
+        if (SymType == object::SymbolRef::ST_Function) {
+            if (I->getName(Name)) continue;
+            if (I->getAddress(Code)) continue;
+            if (I->getSize(Size)) continue;
+
+            NumFunc++;
+            if (!Context)
+                continue;
+
+            DILineInfoTable Lines = Context->getLineInfoForAddressRange(Code, Size);
+            DILineInfoTable::iterator Begin = Lines.begin();
+            DILineInfoTable::iterator End = Lines.end();
+            for (DILineInfoTable::iterator It = Begin; It != End; ++It)
+                NI.addPatch(It->second.Line, It->second.Column, It->first);
+        }
+    }
+    if (NumFunc != 1)
+        hqemu_error("internal error.\n");
+
+    NI.Code = (uint8_t *)Code;
+    NI.Size = Size;
+}
+#elif defined(LLVM_V38)
+void EventListener::NotifyObjectEmitted(const ObjectFile &Obj,
+                                        const RuntimeDyld::LoadedObjectInfo &L)
+{
+    OwningBinary<ObjectFile> DebugObjOwner = L.getObjectForDebug(Obj);
+    const ObjectFile &DebugObj = *DebugObjOwner.getBinary();
+    std::unique_ptr<DIContext> Context(new DWARFContextInMemory(DebugObj));
+    uint64_t Code = 0;
+    uint64_t Size = 0;
+    unsigned NumFunc = 0;
+
+    for (const std::pair<SymbolRef, uint64_t> &P : computeSymbolSizes(DebugObj)) {
+        SymbolRef Sym = P.first;
+        if (Sym.getType() != SymbolRef::ST_Function)
+            continue;
+
+        ErrorOr<StringRef> Name = Sym.getName();
+        if (!Name)
+            continue;
+
+        ErrorOr<uint64_t> AddrOrErr = Sym.getAddress();
+        if (AddrOrErr.getError())
+            continue;
+
+        Code = *AddrOrErr;
+        Size = P.second;
+        NumFunc++;
+
+        DILineInfoTable Lines = Context->getLineInfoForAddressRange(Code, Size);
+        DILineInfoTable::iterator Begin = Lines.begin();
+        DILineInfoTable::iterator End = Lines.end();
+        for (DILineInfoTable::iterator It = Begin; It != End; ++It)
+            NI.addPatch(It->second.Line, It->second.Column, It->first);
+    }
+
+    if (NumFunc != 1)
+        hqemu_error("internal error.\n");
+
+    NI.Code = (uint8_t *)Code;
+    NI.Size = Size;
+}
+#else
+void EventListener::NotifyObjectEmitted(const ObjectFile &Obj,
+                                        const RuntimeDyld::LoadedObjectInfo &L)
+{
+    OwningBinary<ObjectFile> DebugObjOwner = L.getObjectForDebug(Obj);
+    const ObjectFile &DebugObj = *DebugObjOwner.getBinary();
+#if defined(LLVM_V39) || defined(LLVM_V50)
+    std::unique_ptr<DIContext> Context(new DWARFContextInMemory(DebugObj));
+#else
+    std::unique_ptr<DIContext> Context = DWARFContext::create(DebugObj);
+#endif
+    uint64_t Code = 0;
+    uint64_t Size = 0;
+    unsigned NumFunc = 0;
+
+    for (const std::pair<SymbolRef, uint64_t> &P : computeSymbolSizes(DebugObj)) {
+        SymbolRef Sym = P.first;
+
+        /* An Expected<> that holds an error must have that error consumed
+         * before it is destroyed; symbols that fail a query are skipped. */
+        Expected<SymbolRef::Type> SymTypeOrErr = Sym.getType();
+        if (!SymTypeOrErr) {
+            consumeError(SymTypeOrErr.takeError());
+            continue;
+        }
+
+        SymbolRef::Type SymType = *SymTypeOrErr;
+        if (SymType != SymbolRef::ST_Function)
+            continue;
+
+        Expected<StringRef> Name = Sym.getName();
+        if (!Name) {
+            consumeError(Name.takeError());
+            continue;
+        }
+
+        Expected<uint64_t> AddrOrErr = Sym.getAddress();
+        if (!AddrOrErr) {
+            consumeError(AddrOrErr.takeError());
+            continue;
+        }
+
+        Code = *AddrOrErr;
+        Size = P.second;
+        NumFunc++;
+
+        DILineInfoTable Lines = Context->getLineInfoForAddressRange(Code, Size);
+        DILineInfoTable::iterator Begin = Lines.begin();
+        DILineInfoTable::iterator End = Lines.end();
+        for (DILineInfoTable::iterator It = Begin; It != End; ++It)
+            NI.addPatch(It->second.Line, It->second.Column, It->first);
+    }
+
+    if (NumFunc != 1)
+        hqemu_error("internal error.\n");
+
+    NI.Code = (uint8_t *)Code;
+    NI.Size = Size;
+}
+#endif
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/llvm-translator.cpp b/llvm/llvm-translator.cpp
new file mode 100644
index 0000000..e435b1f
--- /dev/null
+++ b/llvm/llvm-translator.cpp
@@ -0,0 +1,924 @@
+/*
+ *  (C) 2010 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "fpu/softfloat-native-def.h"
+#include "utils.h"
+#include "tracer.h"
+#include "llvm.h"
+#include "llvm-debug.h"
+#include "llvm-soft-perfmon.h"
+#include "llvm-hard-perfmon.h"
+#include "llvm-target.h"
+#include "llvm-pass.h"
+#include "llvm-opc.h"
+#include "llvm-state.h"
+#include "llvm-translator.h"
+
+
+/* Command-line switch (off by default): when set, Optimize() does not add the
+ * fast-math pass to the helper optimization pipeline. */
+static cl::opt<bool> DisableFastMath("disable-fast-math", cl::init(false),
+    cl::cat(CategoryHQEMU), cl::desc("Disable fast-math optimizations"));
+
+
+/* Names supplied by llvm-helper.h. InitializeHelpers() creates a HelperInfo
+ * for every entry and treats it as a candidate for inlining. Each name is
+ * stored in a fixed 64-byte slot. */
+static char include_helper[][64] = {
+#include "llvm-helper.h"
+};
+
+/* Objects defined in other translation units. */
+extern LLVMEnv *LLEnv;
+extern hqemu::Mutex llvm_global_lock;
+extern hqemu::Mutex llvm_debug_lock;
+
+extern bool TraceCacheFull;
+
+
+/* Select the host register name used as AREG0 for the TCG target this build
+ * is configured for. AArch64 additionally defines AREG1, which
+ * InitializeTarget() uses as the name of GuestBaseReg in user-mode builds.
+ * Any other TCG target is rejected at compile time. */
+#if defined(TCG_TARGET_I386)
+#  if defined(__i386__)
+#    define AREG0 "ebp"
+#  elif defined(__x86_64__)
+#    define AREG0 "r14"
+#  endif
+#elif defined(TCG_TARGET_PPC64)
+#  define AREG0 "r27"
+#elif defined(TCG_TARGET_ARM)
+#  define AREG0 "r7"
+#elif defined(TCG_TARGET_AARCH64)
+#  define AREG0 "x19"
+#  define AREG1 "x28"
+#else
+#  error "unsupported processor type"
+#endif
+const char *BaseRegStr = AREG0;    /* The base register name */
+
+
+
+/*
+ * LLVM Translator
+ */
+/* Construct translator number `id` for the CPU state `env`.
+ *
+ * The initialization steps run in a fixed order: the module is loaded first
+ * because InitializeType() reads the data layout (DL) that InitializeModule()
+ * sets, and the later steps look up values and functions in Mod. A null `env`
+ * is reported through hqemu_error(). */
+LLVMTranslator::LLVMTranslator(unsigned id, CPUArchState *env)
+    : MyID(id), Env(env)
+{
+    dbg() << DEBUG_LLVM << "Starting LLVM Translator " << MyID << ".\n";
+
+    if (!Env)
+        hqemu_error("internal error. LLVMEnv is not initialized.\n");
+
+    /* Create LLVM module and basic types. */
+    InitializeModule();
+    InitializeType();
+    InitializeTarget();
+    InitializeHelpers();
+    InitializeDisasm();
+
+    /* Create the TCG IR to LLVM IR conversion module. */
+    IF = new IRFactory(this);
+
+#if defined(ENABLE_MCJIT)
+    /* Only the translator with ID 0 hands the collected symbols to the
+     * memory manager. */
+    if (MyID == 0)
+        LLEnv->getMemoryManager()->AddSymbols(Symbols);
+#endif
+
+    dbg() << DEBUG_LLVM << "LLVM Translator " << MyID << " initialized.\n";
+}
+
+/* Release the disassemblers, the IR factory and the LLVM module. */
+LLVMTranslator::~LLVMTranslator()
+{
+    /* Deleting a null pointer is a no-op, so the disassemblers need no
+     * explicit null check. */
+    delete GuestDisAsm;
+    delete HostDisAsm;
+    delete IF;
+    delete Mod;
+}
+
+/* Perform the initialization of the LLVM module.
+ *
+ * The bitcode file is searched by the base name of CONFIG_LLVM_BITCODE in
+ * /etc/hqemu/, then in $HOME/.hqemu/ (if HOME is set), and finally at
+ * CONFIG_LLVM_BITCODE itself. The first candidate that exists and can be
+ * parsed becomes Mod, and DL is set from it. Failure to load any candidate
+ * is reported through hqemu_error(). */
+void LLVMTranslator::InitializeModule()
+{
+    /* CONFIG_LLVM_BITCODE must contain a '/' followed by a non-empty file
+     * name. The character after the last '/' is tested; comparing the
+     * incremented pointer itself against null can never be true. */
+    const char *p = strrchr(CONFIG_LLVM_BITCODE, '/');
+    if (!p || *++p == '\0')
+        hqemu_error("unknown bitcode file.\n");
+
+    std::string Bitcode(p);
+    std::vector<std::string> Path;
+
+    Path.push_back(std::string("/etc/hqemu/").append(Bitcode));
+    p = getenv("HOME");
+    if (p)
+        Path.push_back(std::string(p).append("/.hqemu/").append(Bitcode));
+    Path.push_back(CONFIG_LLVM_BITCODE);
+
+    unsigned i = 0, e = Path.size();
+    for (; i != e; ++i) {
+        /* Skip candidates that do not exist. */
+        struct stat buf;
+        if (stat(Path[i].c_str(), &buf) != 0)
+            continue;
+
+        SMDiagnostic Err;
+#if defined(LLVM_V35)
+        Mod = ParseIRFile(Path[i], Err, Context);
+#else
+        std::unique_ptr<Module> Owner = parseIRFile(Path[i], Err, Context);
+        Mod = Owner.release();
+#endif
+        /* A candidate that fails to parse is skipped as well. */
+        if (Mod)
+            break;
+    }
+
+    if (i == e)
+        hqemu_error("cannot find bitcode file %s.\n", Bitcode.c_str());
+
+    DL = getDataLayout(Mod);
+
+    dbg() << DEBUG_LLVM << "Use bitcode file " << Path[i] << ".\n";
+    dbg() << DEBUG_LLVM << "LLVM module initialized (" << Mod->getTargetTriple() << ").\n";
+}
+
+/* Cache the basic LLVM types used by the translator. */
+void LLVMTranslator::InitializeType()
+{
+    /* Scalar types. */
+    VoidTy   = Type::getVoidTy(Context);
+    Int8Ty   = Type::getIntNTy(Context, 8);
+    Int16Ty  = Type::getIntNTy(Context, 16);
+    Int32Ty  = Type::getIntNTy(Context, 32);
+    Int64Ty  = Type::getIntNTy(Context, 64);
+    Int128Ty = Type::getIntNTy(Context, 128);
+    FloatTy  = Type::getFloatTy(Context);
+    DoubleTy = Type::getDoubleTy(Context);
+
+    /* Pointer-sized integer, taken from the data layout of the module. */
+    IntPtrTy = DL->getIntPtrType(Context);
+
+    /* Pointers to the scalar types, all in address space 0. */
+    Int8PtrTy   = Type::getIntNPtrTy(Context, 8, 0);
+    Int16PtrTy  = Type::getIntNPtrTy(Context, 16, 0);
+    Int32PtrTy  = Type::getIntNPtrTy(Context, 32, 0);
+    Int64PtrTy  = Type::getIntNPtrTy(Context, 64, 0);
+    FloatPtrTy  = Type::getFloatTy(Context)->getPointerTo(0);
+    DoublePtrTy = Type::getDoubleTy(Context)->getPointerTo(0);
+}
+
+/* Setup guest-dependent data structures.
+ *
+ * Describes the base register (TCG_AREG0) using the type of the module value
+ * named "basereg", optionally names the guest base register, and flattens the
+ * type that "basereg" points to into StateType. */
+void LLVMTranslator::InitializeTarget()
+{
+    /* TODO: any smart way to hack into CPUArchState type? */
+    Value *Base = Mod->getNamedValue("basereg");
+    if (!Base)
+        hqemu_error("cannot resolve cpu_proto.\n");
+
+    BaseReg.resize(TCG_TARGET_NB_REGS);
+    BaseReg[TCG_AREG0].RegNo = TCG_AREG0;
+    BaseReg[TCG_AREG0].Name = BaseRegStr;
+    BaseReg[TCG_AREG0].Ty = Base->getType();
+    BaseReg[TCG_AREG0].Base = nullptr;
+
+#if defined(CONFIG_USER_ONLY) && defined(AREG1)
+    /* Only hosts that define AREG1 name a guest base register, and only when
+     * the guest base is non-zero or the guest uses 32-bit addresses. */
+    if (guest_base != 0 || TARGET_LONG_BITS == 32) {
+        GuestBaseReg.Name = AREG1;
+        GuestBaseReg.Base = nullptr;
+    }
+#endif
+
+    /* Define the new types of special registers. */
+    std::map<Type *, Type *> SpecialReg;
+    DefineSpecialReg(SpecialReg);
+
+    /* Convert the CPUArchState of aggregate type to the list of single element
+     * of primitive type. */
+    /* getContainedType(0) is the type that the type of Base refers to;
+     * flattening starts at offset 0. */
+    intptr_t Off = 0;
+    FlattenCPUState(Base->getType()->getContainedType(0), Off, SpecialReg);
+}
+
+/* Fill SpecialReg with the types of special registers, each mapped to the
+ * type that replaces it when the CPU state is flattened. Only TARGET_I386
+ * defines such a register; nothing is added for other guests. */
+void LLVMTranslator::DefineSpecialReg(std::map<Type *, Type *> &SpecialReg)
+{
+#if defined(TARGET_I386)
+    /* Remap the type behind the module value "xmm_reg" to <16 x i8>. */
+    if (Value *XMM = Mod->getNamedValue("xmm_reg")) {
+        Type *ByteTy = IntegerType::get(Context, 8);
+        Type *XMMRegTy = XMM->getType()->getContainedType(0);
+        Type *ByteVecTy = VectorType::get(ByteTy, 16);
+        SpecialReg[XMMRegTy] = ByteVecTy;
+    }
+#endif
+}
+
+/* Convert the CPUArchState of the aggregate type to a list of single element of
+ * primitive type. Each element contains a pair of offset to CPUArchState and its
+ * type. This list of flattened type will be used for the state mapping pass.
+ *
+ *   Ty          The type to flatten. Structs and arrays are flattened
+ *               recursively; any other type is recorded as is.
+ *   Off         On entry, the byte offset of Ty. On return, advanced by the
+ *               size of Ty.
+ *   SpecialReg  Struct types that are recorded as one element of the mapped
+ *               type instead of being flattened.
+ */
+void LLVMTranslator::FlattenCPUState(Type *Ty, intptr_t &Off,
+                                     std::map<Type *, Type *> &SpecialReg)
+{
+    switch (Ty->getTypeID()) {
+        default:
+        {
+            StateType[Off] = Ty;
+            Off += DL->getTypeSizeInBits(Ty) / 8;
+            break;
+        }
+        case Type::StructTyID:
+        {
+            /* Map a special register to another type with the same size as the
+             * original type. E.g., mapping a <16 * i8> type to <2 * i64>. */
+            auto Special = SpecialReg.find(Ty);
+            if (Special != SpecialReg.end()) {
+                StateType[Off] = Special->second;
+                Off += DL->getTypeSizeInBits(Ty) / 8;
+                break;
+            }
+
+            StructType *STy = cast<StructType>(Ty);
+            intptr_t Size = DL->getTypeSizeInBits(STy) / 8;
+            /* Start at Off so that the padding test below reads a defined
+             * value even if the struct has no elements. */
+            intptr_t SubOff = Off;
+
+            const StructLayout *SL = DL->getStructLayout(STy);
+            for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+                SubOff = Off + SL->getElementOffset(i);
+                FlattenCPUState(STy->getElementType(i), SubOff, SpecialReg);
+            }
+
+            Off += Size;
+
+            /* Structure could have padding at the end of the struct. Expand
+             * the size of the last struct member by adding the padding size.
+             * After the loop, SubOff is the end of the last member. */
+            if (Off != SubOff) {
+                intptr_t LastOff = StateType.rbegin()->first;
+                intptr_t NewSize = (Off - LastOff) * 8;
+                Type *NewTy = IntegerType::get(Context, NewSize);
+                StateType[LastOff] = NewTy;
+            }
+            break;
+        }
+        case Type::ArrayTyID:
+        {
+#if defined(CONFIG_SOFTMMU)
+            /* Do not flatten the SoftTLB because it could create a huge amount
+             * of flattened states. */
+            if (Off == offsetof(CPUArchState, tlb_table[0][0])) {
+                StateType[Off] = Ty;
+                Off += DL->getTypeSizeInBits(Ty) / 8;
+                break;
+            }
+#endif
+            /* Flatten each array element at its own offset. */
+            ArrayType *ATy = cast<ArrayType>(Ty);
+            intptr_t ElemSize = DL->getTypeSizeInBits(ATy->getElementType()) / 8;
+            for (unsigned i = 0, e = ATy->getNumElements(); i != e; ++i) {
+                intptr_t SubOff = Off + i * ElemSize;
+                FlattenCPUState(ATy->getElementType(), SubOff, SpecialReg);
+            }
+            Off += DL->getTypeSizeInBits(ATy) / 8;
+            break;
+        }
+    }
+}
+
+/* Materialize the function F using the API of the LLVM version in use. The
+ * result of the materialization (and, for LLVM 3.5, the error text) is not
+ * inspected. */
+static inline void Materialize(Function &F)
+{
+#if defined(LLVM_V35)
+    std::string ErrInfo;
+    F.Materialize(&ErrInfo);
+#else
+    F.materialize();
+#endif
+}
+
+/* Materialize helper functions and compute inline costs.
+ *
+ * Steps:
+ *   1. Register the symbols the helpers depend on and the const helpers.
+ *   2. Materialize the native FPU helpers and the helpers named in
+ *      include_helper; the latter each get a HelperInfo in Helpers.
+ *   3. Record the name of every TCG helper by its address in TCGHelpers.
+ *   4. For every helper in Helpers, declare a "<name>_noinline" function with
+ *      the same type and bind it to the native helper address.
+ *   5. Analyze every helper in Helpers with OptimizeHelper(). Helpers that are
+ *      rejected are removed from Helpers, and together with all helpers that
+ *      were never in Helpers they are bound to the native helper address.
+ *   6. Merge the states of nested helpers into their callers and compute the
+ *      state ranges that each helper reads and writes.
+ */
+void LLVMTranslator::InitializeHelpers()
+{
+    /* Set target-specific symbols. */
+    AddDependentSymbols(this);
+
+    /* Set const helpers. (i.e., helpers that have no side effect) */
+    InitializeConstHelpers();
+
+    /* Materialize fpu helper functions. */
+    TCGHelperInfo *FPUHelper = (TCGHelperInfo *)get_native_fpu_helpers();
+    for (int i = 0, e = num_native_fpu_helpers(); i != e; ++i) {
+        Function *Func = Mod->getFunction(FPUHelper[i].name);
+        if (Func && Func->isMaterializable())
+            Materialize(*Func);
+    }
+
+    /* Materialize defined helper functions that are allowed for inlining. */
+    for (int i = 0, e = ARRAY_SIZE(include_helper); i < e; ++i) {
+        Helpers[include_helper[i]] = new HelperInfo;
+        Function *Func = Mod->getFunction(include_helper[i]);
+        if (Func && Func->isMaterializable())
+            Materialize(*Func);
+    }
+
+    /* Initialize all TCG helper functions. */
+    const TCGHelperInfo *all_helpers = get_tcg_helpers();
+    for (int i = 0, e = tcg_num_helpers(); i != e; ++i) {
+        uintptr_t func = (uintptr_t)all_helpers[i].func;
+        const char *name = all_helpers[i].name;
+        if (!name)
+            hqemu_error("invalid helper name.\n");
+
+        TCGHelpers[func] = std::string("helper_") + std::string(name);
+    }
+
+    /* Create the non-inlined counterpart of every helper that may be inlined.
+     * Both names refer to the same HelperInfo. */
+    for (int i = 0, e = tcg_num_helpers(); i != e; ++i) {
+        std::string FName = std::string("helper_") +
+                            std::string(all_helpers[i].name);
+        std::string FNameNoInline = FName + std::string("_noinline");
+
+        auto It = Helpers.find(FName);
+        if (It == Helpers.end())
+            continue;
+
+        HelperInfo *Helper = It->second;
+        Function *F = Mod->getFunction(FName);
+        if (!F)
+            hqemu_error("fatal error - %s\n", FName.c_str());
+        Helper->Func = F;
+        Mod->getOrInsertFunction(FNameNoInline, F->getFunctionType());
+        Helper->FuncNoInline = Mod->getFunction(FNameNoInline);
+        Helpers[FNameNoInline] = Helper;
+
+        AddSymbol(FNameNoInline, all_helpers[i].func);
+    }
+
+    /* Analyze the inline cost for each helper function and make a non-inlined
+     * counterpart object in LLVM Module. For the non-inlined function, just
+     * remap the function address in LLVM module which causes the JIT to emit a
+     * call instruction to the function address. */
+    for (int i = 0, e = tcg_num_helpers(); i != e; ++i) {
+        const TCGHelperInfo *th = &all_helpers[i];
+        std::string FName = std::string("helper_") + std::string(th->name);
+
+        auto It = Helpers.find(FName);
+        if (It != Helpers.end()) {
+            HelperInfo *Helper = It->second;
+            if (OptimizeHelper(*Helper)) {
+                Helper->CalculateMetrics(Helper->Func);
+                continue;
+            }
+
+            /* The helper was rejected by OptimizeHelper() (e.g., a helper
+             * that consists of loops is not suitable to be inlined because it
+             * conflicts to the state mapping pass). Drop it and bind it to
+             * the native address below. */
+            Helpers.erase(FName);
+        }
+
+        AddSymbol(FName, th->func);
+    }
+
+    /* Add all states of the nested helpers to the calling helper.
+     * Then, calculate state boundary and determine if we can know all states
+     * (included in the nested functions) by this helper function.
+     *
+     * Note that we only allow one-level helper inlining. */
+    for (auto &I : Helpers) {
+        HelperInfo *Helper = I.second;
+        bool hasNestNestedCall = false;
+        for (CallInst *CI : Helper->NestedCalls) {
+            std::string FName = CI->getCalledFunction()->getName();
+
+            /* The callee may have been removed from Helpers after this helper
+             * was analyzed. Look it up without inserting: operator[] would
+             * add a null entry to the map that is being iterated and the null
+             * pointer would then be dereferenced.
+             * NOTE(review): such a callee is treated conservatively, i.e. its
+             * states are unknown and the call is left untouched - confirm
+             * that this is the intended policy. */
+            auto Nested = Helpers.find(FName);
+            if (Nested == Helpers.end() || !Nested->second) {
+                hasNestNestedCall = true;
+                continue;
+            }
+
+            HelperInfo *NestedHelper = Nested->second;
+            Helper->States.insert(Helper->States.begin(),
+                                  NestedHelper->States.begin(),
+                                  NestedHelper->States.end());
+
+            CI->setCalledFunction(NestedHelper->FuncNoInline);
+            if (I.first != FName && NestedHelper->hasNestedCall)
+                hasNestNestedCall = true;
+        }
+        /* Clear hasNestedCall if only one level nested functions. If the
+         * helper has only one level nested helpers, then all states are found. */
+        Helper->hasNestedCall = hasNestNestedCall;
+
+        /* Compute state boundaries. */
+        StateAnalyzer Analyzer(DL);
+        for (auto J : Helper->States)
+            Analyzer.addStateRef(J.first, J.second);
+
+        StateRange Reads, Writes;
+        Analyzer.computeStateRange(Reads, Writes);
+
+        Helper->insertState(Reads, false);
+        Helper->insertState(Writes, true);
+    }
+
+    /* The per-instruction records are no longer needed. */
+    for (auto &I : Helpers) {
+        HelperInfo *Helper = I.second;
+        Helper->States.clear();
+        Helper->NestedCalls.clear();
+    }
+}
+
+/* Create the LLVM disassemblers for the guest and the host, and log for each
+ * of them whether one is available. */
+void LLVMTranslator::InitializeDisasm()
+{
+    /* Pick the guest triple from the configured target. */
+#if defined(TARGET_I386)
+  #if defined(TARGET_X86_64)
+    std::string TargetTriple("x86_64");
+  #else
+    std::string TargetTriple("i386");
+  #endif
+#elif defined(TARGET_ARM)
+  #if defined(TARGET_AARCH64)
+    std::string TargetTriple("aarch64");
+  #else
+    std::string TargetTriple("arm");
+  #endif
+#elif defined(TARGET_PPC)
+    std::string TargetTriple("ppc");
+#else
+    std::string TargetTriple("UnknownArch");
+#endif
+
+    GuestDisAsm = MCDisasm::CreateMCDisasm(TargetTriple, false);
+    HostDisAsm = MCDisasm::CreateMCDisasm(Mod->getTargetTriple(), true);
+
+    if (!GuestDisAsm) {
+        dbg() << DEBUG_INASM << __func__
+              << ": can't find LLVM disassembler for guest ("
+              << TargetTriple << "). Use QEMU disas.\n";
+    } else {
+        dbg() << DEBUG_INASM << __func__
+              << ": use LLVM disassembler for guest (" << TargetTriple << ").\n";
+    }
+
+    if (!HostDisAsm) {
+        dbg() << DEBUG_OUTASM << __func__
+              << ": can't find LLVM disassembler for host ("
+              << Mod->getTargetTriple() << "). Use QEMU disas.\n";
+    } else {
+        dbg() << DEBUG_OUTASM << __func__
+              << ": use LLVM disassembler for host ("
+              << Mod->getTargetTriple() << ").\n";
+    }
+}
+
+/* Return false for the memset, memcpy, memmove and dbg.declare intrinsics and
+ * true for every other intrinsic. */
+static bool isLegalIntrinsic(IntrinsicInst *II)
+{
+    switch (II->getIntrinsicID()) {
+        case Intrinsic::memset:
+        case Intrinsic::memcpy:
+        case Intrinsic::memmove:
+        case Intrinsic::dbg_declare:
+            return false;
+        default:
+            return true;
+    }
+}
+
+/* Strip Ptr with StripPointer() and return the stripped value if it is one of
+ * the formal arguments of F. Return nullptr otherwise. */
+static Value *isFromFuncArgument(Function &F, Value *Ptr)
+{
+    Value *Stripped = StripPointer(Ptr);
+
+    /* An argument belongs to exactly one function, so checking its parent is
+     * the same as searching the argument list of F. */
+    Argument *Arg = dyn_cast<Argument>(Stripped);
+    if (Arg && Arg->getParent() == &F)
+        return Stripped;
+
+    return nullptr;
+}
+
+/* Run the helper optimization passes on F: the intrinsic replacement pass,
+ * followed by the fast-math pass unless it is disabled on the command line. */
+static void Optimize(Function &F)
+{
+    legacy::FunctionPassManager Passes(F.getParent());
+
+    Passes.add(createReplaceIntrinsic());
+    if (!DisableFastMath)
+        Passes.add(createFastMathPass());
+
+    Passes.run(F);
+}
+
+/* Analyze and optimize a helper function.
+ *
+ * Returns true if the helper passed all checks. On the way, the loads and
+ * stores of CPU state are recorded in Helper.States, accesses through other
+ * pointer arguments set Helper.mayConflictArg/ConflictSize, and calls to other
+ * helpers in Helpers are recorded in Helper.NestedCalls.
+ *
+ * Returns false if the helper:
+ *   - contains a back edge (i.e., a loop),
+ *   - accesses memory that is not derived from one of its arguments,
+ *   - calls memset/memcpy/memmove/dbg.declare intrinsics,
+ *   - makes an indirect call, or
+ *   - calls a function that is neither a libcall, a soft-float call nor a
+ *     helper in Helpers.
+ * Helper may have been partially updated when false is returned. */
+bool LLVMTranslator::OptimizeHelper(HelperInfo &Helper)
+{
+    Function &F = *Helper.Func;
+
+    /* We don't want to inline helper functions that contain loop. */
+    SmallVector<std::pair<const BasicBlock*,const BasicBlock*>, 32> BackEdges;
+    FindFunctionBackedges(F, BackEdges);
+    if (BackEdges.size())
+        return false;
+
+    Optimize(F);
+
+    /* Collect and analyze memory and call instructions. */
+    SmallVector<CallInst *, 16> Calls;
+    for (auto II = inst_begin(F), EE = inst_end(F); II != EE; ++II) {
+        Instruction *I = &*II;
+
+        if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
+            intptr_t Off = 0;
+            Value *Base = getBaseWithConstantOffset(DL, getPointerOperand(I), Off);
+
+            /* Accesses to global values are ignored unless the global has
+             * private linkage. */
+            if (auto GV = dyn_cast<GlobalValue>(StripPointer(Base))) {
+                if (!GV->hasPrivateLinkage())
+                    continue;
+            }
+
+            /* XXX: We assume the pointer is derived from the function argument.
+             *      Reject the helper if it is not from the function argument. */
+            Value *Arg = isFromFuncArgument(F, Base);
+            if (!Arg)
+                return false;
+
+            if (Base->getType() == BaseReg[TCG_AREG0].Ty) {
+                /* This is a load/store of CPU state plus a constant offset.
+                 * Track the state. */
+                Helper.States.push_back(std::make_pair(I, Off));
+            } else {
+                /* This is a load/store of unknown pointer.
+                 * Track the maximum access size. */
+                /* NOTE(review): the cast below requires Arg to have a pointer
+                 * type; StripPointer() also looks through inttoptr, so an
+                 * integer argument looks possible here - confirm. */
+                Type *Ty = cast<PointerType>(Arg->getType())->getElementType();
+                intptr_t Size = DL->getTypeSizeInBits(Ty) / 8;
+                Helper.mayConflictArg = true;
+                Helper.ConflictSize = std::max(Helper.ConflictSize, Size);
+            }
+        } else if (CallInst *CI = dyn_cast<CallInst>(I)) {
+            Calls.push_back(CI);
+        }
+    }
+
+    /* Analyze calls. */
+    for (CallInst *CI : Calls) {
+        if (CI->isInlineAsm())
+            continue;
+
+        if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
+            if (!isLegalIntrinsic(II))
+                return false;
+            continue;
+        }
+
+        /* An indirect call has no known callee. */
+        if (!CI->getCalledFunction())
+            return false;
+
+        std::string FName = CI->getCalledFunction()->getName();
+        if (isLibcall(FName) || isSoftFPcall(FName)) {
+            /* Libcalls/SoftFPCalls are always const function. Mark it. */
+            ConstantInt *Meta[] = { CONST32(0) };
+            MDFactory::setConstStatic(Context, CI, Meta);
+            continue;
+        }
+
+        /* Only calls to helpers that are (still) in Helpers are allowed. */
+        if (Helpers.find(FName) == Helpers.end())
+            return false;
+
+        Helper.hasNestedCall = true;
+        Helper.NestedCalls.push_back(CI);
+    }
+
+    return true;
+}
+
+/* Estimate the instruction cost saved when V is known to be a constant, by
+ * examining every user of V (and, recursively, users that would fold). */
+static unsigned CountCodeReductionForConstant(Value *V, CodeMetrics &Metrics)
+{
+    const unsigned IndirectCallBonus = -InlineConstants::IndirectCallThreshold;
+    unsigned Saved = 0;
+    for (auto UI = V->use_begin(), UE = V->use_end(); UI != UE; ++UI) {
+        User *Usr = UI->getUser();
+        if (isa<BranchInst>(Usr) || isa<SwitchInst>(Usr)) {
+            /* All successors but one disappear. Which ones is unknown, so the
+             * average successor size is charged. */
+            const TerminatorInst &Term = cast<TerminatorInst>(*Usr);
+            const unsigned NumSucc = Term.getNumSuccessors();
+            unsigned Total = 0;
+            for (unsigned Idx = 0; Idx != NumSucc; ++Idx)
+                Total += Metrics.NumBBInsts[Term.getSuccessor(Idx)];
+            Saved += InlineConstants::InstrCost*Total*(NumSucc-1)/NumSucc*2;
+            continue;
+        }
+
+        /* A call or invoke through V becomes a direct call: a big win. */
+        if (CallInst *Call = dyn_cast<CallInst>(Usr)) {
+            if (Call->getCalledValue() == V)
+                Saved += IndirectCallBonus;
+            continue;
+        }
+        if (InvokeInst *Invoke = dyn_cast<InvokeInst>(Usr)) {
+            if (Invoke->getCalledValue() == V)
+                Saved += IndirectCallBonus;
+            continue;
+        }
+
+        /* Ordinary instruction: it folds only if it is not an alloca, has no
+         * memory read or side effect, and every operand is V or a constant. */
+        Instruction &Inst = cast<Instruction>(*Usr);
+        if (Inst.mayReadFromMemory() || Inst.mayHaveSideEffects() ||
+            isa<AllocaInst>(Inst))
+            continue;
+        bool Foldable = true;
+        for (unsigned Op = 0, NumOps = Inst.getNumOperands();
+             Foldable && Op != NumOps; ++Op) {
+            Value *Operand = Inst.getOperand(Op);
+            Foldable = isa<Constant>(Operand) || Operand == V;
+        }
+        if (Foldable) /* the instruction goes away, and so may its users */
+            Saved += InlineConstants::InstrCost +
+                     CountCodeReductionForConstant(&Inst, Metrics);
+    }
+    return Saved;
+}
+
+/* Estimate the instruction cost saved when pointer V turns out to be an
+ * alloca after inlining. Any user other than a load, store, GEP or bitcast
+ * makes the estimate for V zero. */
+static unsigned CountCodeReductionForAlloca(Value *V)
+{
+    if (!V->getType()->isPointerTy())
+        return 0;
+    unsigned Saved = 0;
+    for (auto UI = V->use_begin(), UE = V->use_end(); UI != UE; ++UI) {
+        Instruction *Usr = cast<Instruction>(UI->getUser());
+        if (isa<LoadInst>(Usr) || isa<StoreInst>(Usr)) {
+            Saved += InlineConstants::InstrCost;
+        } else if (isa<BitCastInst>(Usr)) {
+            /* Follow the pointer through the cast. */
+            Saved += CountCodeReductionForAlloca(Usr);
+        } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Usr)) {
+            /* Only constant-index GEPs can be followed. */
+            if (GEP->hasAllConstantIndices())
+                Saved += CountCodeReductionForAlloca(GEP);
+        } else {
+            return 0;
+        }
+    }
+    return Saved;
+}
+
+/* Count the non-PHI instructions of every basic block in F, then compute the
+ * constant and alloca code-reduction weights of each argument. */
+void HelperInfo::CalculateMetrics(Function *F)
+{
+    Metrics.NumInsts = 0;
+
+    for (BasicBlock &BB : *F) {
+        unsigned Count = 0;
+        for (Instruction &Inst : BB) {
+            if (!isa<PHINode>(Inst)) /* PHI nodes don't count. */
+                ++Count;
+        }
+        ++Metrics.NumBlocks;
+        Metrics.NumInsts += Count;
+        Metrics.NumBBInsts[&BB] = Count;
+    }
+
+    ArgumentWeights.reserve(F->arg_size());
+    for (auto Arg = F->arg_begin(), End = F->arg_end(); Arg != End; ++Arg) {
+        Value *V = &*Arg;
+        unsigned ConstWeight = CountCodeReductionForConstant(V, Metrics);
+        unsigned AllocaWeight = CountCodeReductionForAlloca(V);
+        ArgumentWeights.push_back(ArgInfo(ConstWeight, AllocaWeight));
+    }
+}
+/* Fill ConstHelpers with the helper names selected by the target macros. */
+void LLVMTranslator::InitializeConstHelpers()
+{
+#if defined(TARGET_I386)
+    ConstHelpers.insert("helper_outb");
+    ConstHelpers.insert("helper_inb");
+    ConstHelpers.insert("helper_outw");
+    ConstHelpers.insert("helper_inw");
+    ConstHelpers.insert("helper_outl");
+    ConstHelpers.insert("helper_inl");
+#elif defined(TARGET_ARM)
+    ConstHelpers.insert("helper_vfp_tosis");
+    ConstHelpers.insert("helper_vfp_tosid");
+    ConstHelpers.insert("helper_vfp_tosizs");
+    ConstHelpers.insert("helper_vfp_tosizd");
+    ConstHelpers.insert("helper_vfp_touis");
+    ConstHelpers.insert("helper_vfp_touid");
+    ConstHelpers.insert("helper_vfp_touizs");
+    ConstHelpers.insert("helper_vfp_touizd");
+
+    ConstHelpers.insert("helper_vfp_sitos");
+    ConstHelpers.insert("helper_vfp_sitod");
+    ConstHelpers.insert("helper_vfp_uitos");
+    ConstHelpers.insert("helper_vfp_uitod");
+
+    ConstHelpers.insert("helper_vfp_fcvtds");
+    ConstHelpers.insert("helper_vfp_fcvtsd");
+
+    ConstHelpers.insert("helper_vfp_cmps");
+    ConstHelpers.insert("helper_vfp_cmpd");
+    ConstHelpers.insert("helper_vfp_cmpes");
+    ConstHelpers.insert("helper_vfp_cmped");
+
+#if defined(TARGET_AARCH64) /* only tested when TARGET_ARM is defined */
+    ConstHelpers.insert("helper_vfp_tosls");
+    ConstHelpers.insert("helper_vfp_tosld");
+    ConstHelpers.insert("helper_vfp_sqtos");
+    ConstHelpers.insert("helper_vfp_sqtod");
+    ConstHelpers.insert("helper_vfp_uqtos");
+    ConstHelpers.insert("helper_vfp_uqtod");
+
+    ConstHelpers.insert("helper_vfp_cmps_a64");
+    ConstHelpers.insert("helper_vfp_cmpd_a64");
+    ConstHelpers.insert("helper_vfp_cmpes_a64");
+    ConstHelpers.insert("helper_vfp_cmped_a64");
+    ConstHelpers.insert("helper_vfp_minnums");
+    ConstHelpers.insert("helper_vfp_maxnums");
+    ConstHelpers.insert("helper_vfp_minnumd");
+    ConstHelpers.insert("helper_vfp_maxnumd");
+
+    ConstHelpers.insert("helper_get_cp_reg64");
+    ConstHelpers.insert("helper_dc_zva");
+#endif /* TARGET_AARCH64 */
+#endif /* TARGET_I386 / TARGET_ARM; other targets register nothing */
+}
+/* Log that the trace starting at the builder's entry node was abandoned. */
+void LLVMTranslator::Abort(TraceBuilder &Builder)
+{
+    target_ulong EntryPC = Builder.getEntryNode()->getGuestPC();
+    dbg() << DEBUG_LLVM << __func__ << ": abort trace pc "
+          << format("0x%" PRIx "", EntryPC) << "\n";
+}
+
+/* Publish a finished region: register its host code, record its chain slots
+ * and redirect the entry block to the optimized code. The region is dropped
+ * instead when one of its blocks is invalid or llvm_check_cache() returns 1. */
+void LLVMTranslator::Commit(TraceBuilder &Builder)
+{
+    OptimizationInfo *Opt = Builder.getOpt();
+    TraceInfo *Trace = Builder.getTrace();
+
+    bool Stale = false;
+    for (auto TB : Trace->TBs) {
+        if (TB->mode == BLOCK_INVALID) {
+            Stale = true;
+            break;
+        }
+    }
+
+    if (Stale || llvm_check_cache() == 1) {
+        delete Trace;
+        delete Opt;
+        return;
+    }
+
+    /* From here on the region is known to be legal. */
+    TranslatedCode *TC = new TranslatedCode;
+    TranslationBlock *EntryTB = Trace->getEntryTB();
+    TC->Active = true;
+    TC->Size = NI.Size;
+    TC->Code = NI.Code;
+    TC->EntryTB = EntryTB;
+    TC->Restore = NI.Restore;
+    TC->Trace = Trace;
+
+    LLVMEnv::ChainSlot &Slots = LLEnv->getChainPoint();
+    hqemu::MutexGuard locked(llvm_global_lock);
+
+    for (unsigned Idx = 0; Idx != NI.NumChainSlot; ++Idx) {
+        const auto &Slot = NI.ChainSlot[Idx];
+        Slots[Slot.Key] = Slot.Addr;
+    }
+
+    EntryTB->tid = LLEnv->insertTransCode(TC);
+    EntryTB->mode = BLOCK_OPTIMIZED;
+    EntryTB->opt_ptr = TC->Code;
+
+    /* Redirect the block's jump entry to the optimized code. */
+    patch_jmp(tb_get_jmp_entry(EntryTB), TC->Code);
+
+    /* The trace description is kept only while SP is enabled. */
+    if (!SP->isEnabled()) {
+        delete Trace;
+        TC->Trace = nullptr;
+    }
+
+    delete Opt;
+}
+
+/* Print tb's guest assembly and/or TCG ops, as selected by the debug mode. */
+void LLVMTranslator::dump(CPUArchState *env, TranslationBlock *tb)
+{
+    auto &Mode = DM.getDebugMode();
+    if (!(Mode & (DEBUG_INASM | DEBUG_OP)))
+        return;
+
+    hqemu::MutexGuard locked(llvm_debug_lock);
+    dbg() << DEBUG_LLVM << "Translator " << MyID << " dumps asm...\n";
+    if (Mode & DEBUG_INASM) printAsm(Env, tb);
+    if (Mode & DEBUG_OP) printOp(Env, tb);
+}
+
+/* Translate the single block described by Opt and commit the result.
+ * Translation time is recorded on the trace when SP is enabled. */
+void LLVMTranslator::GenBlock(CPUArchState *env, OptimizationInfo *Opt)
+{
+    struct timeval Begin, Finish;
+    if (SP->isEnabled())
+        gettimeofday(&Begin, nullptr);
+
+    TraceBuilder Builder(IF, Opt);
+    if (Builder.getNextNode() == nullptr)
+        hqemu_error("fatal error.\n");
+
+    Builder.ConvertToTCGIR(env);
+    if (DM.getDebugMode() & (DEBUG_INASM | DEBUG_OP))
+        dump(env, Opt->getCFG()->getTB());
+
+    Builder.ConvertToLLVMIR();
+    Builder.Finalize();
+
+    if (SP->isEnabled()) {
+        gettimeofday(&Finish, nullptr);
+        Builder.getTrace()->setTransTime(&Begin, &Finish);
+    }
+
+    Commit(Builder);
+}
+
+/* Translate every node of the region described by Opt and commit it. The
+ * region is abandoned as soon as a block is invalid or the builder aborts. */
+void LLVMTranslator::GenTrace(CPUArchState *env, OptimizationInfo *Opt)
+{
+    struct timeval Begin, Finish;
+    if (SP->isEnabled())
+        gettimeofday(&Begin, nullptr);
+
+    TraceBuilder Builder(IF, Opt);
+    while (GraphNode *Node = Builder.getNextNode()) {
+        Builder.ConvertToTCGIR(Env);
+
+        if (DM.getDebugMode() & (DEBUG_INASM | DEBUG_OP))
+            dump(Env, Node->getTB());
+
+        Builder.ConvertToLLVMIR();
+
+        const bool Broken = Node->getTB()->mode == BLOCK_INVALID ||
+                            Builder.isAborted();
+        if (Broken) {
+            Abort(Builder);
+            return;
+        }
+    }
+    Builder.Finalize();
+
+    if (SP->isEnabled()) {
+        gettimeofday(&Finish, nullptr);
+        Builder.getTrace()->setTransTime(&Begin, &Finish);
+    }
+
+    Commit(Builder);
+}
+
+/* Display the guest assembly code of the given basic block, through the LLVM
+ * disassembler when GuestDisAsm is set, else (user mode) through QEMU disas. */
+void LLVMTranslator::printAsm(CPUArchState *env, TranslationBlock *tb)
+{
+    auto &OS = DM.debug();
+    if (GuestDisAsm) {
+        OS << "----------------\n"
+           << "IN: [size=" << tb->size << "]\n";
+#if defined(CONFIG_USER_ONLY)
+        GuestDisAsm->PrintInAsm((uint64_t)g2h(tb->pc), tb->size, tb->pc);
+#else
+        GuestDisAsm->PrintInAsm((uint64_t)tb->image, tb->size, tb->pc);
+#endif
+        OS << "\n";
+        return;
+    }
+
+#if defined(CONFIG_USER_ONLY)
+    /* The guest is not supported by the LLVM MCDisassembler. Use QEMU disas. */
+    int disas_flags = 0;
+#if defined(TARGET_I386)
+  #if defined(TARGET_X86_64)
+    if ((tb->flags >> HF_CS64_SHIFT) & 1)
+        disas_flags = 2;
+    else
+  #endif
+        disas_flags = !((tb->flags >> HF_CS32_SHIFT) & 1);
+#elif defined(TARGET_ARM)
+  #if defined(TARGET_AARCH64)
+    disas_flags = 4 | (0 << 1);
+  #else
+    disas_flags = env->thumb;
+  #endif
+#elif defined(TARGET_PPC)
+    int le_mode = env->hflags & (1 << MSR_LE) ? 1 : 0;
+    disas_flags = env->bfd_mach;
+    disas_flags |= le_mode << 16;
+#endif
+
+    /* Same banner as on the LLVM disassembler path above. */
+    OS << "----------------\n";
+    OS << "IN: [size=" << tb->size << "]\n";
+    target_disas(stderr, ENV_GET_CPU(env), tb->pc, tb->size, disas_flags);
+    OS << "\n";
+#endif
+}
+/* C-linkage sink that writes outbuf to the debug stream; used by printOp(). */
+extern "C" void printops(const char *outbuf) {
+    DM.debug() << outbuf;
+}
+
+/* Dump the TCG ops currently held in tcg_ctx to the debug stream, framed by
+ * an "OP:" header and a blank line. Neither env nor tb is consulted. */
+void LLVMTranslator::printOp(CPUArchState *env, TranslationBlock *tb)
+{
+    DM.debug() << "OP:\n";
+    tcg_dump_ops_fn(&tcg_ctx, printops);
+    DM.debug() << "\n";
+}
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/llvm.cpp b/llvm/llvm.cpp
new file mode 100644
index 0000000..80c8473
--- /dev/null
+++ b/llvm/llvm.cpp
@@ -0,0 +1,1251 @@
+/*
+ *  (C) 2010 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <fstream>
+#include <dlfcn.h>
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm-types.h"
+#include "llvm-annotate.h"
+#include "llvm-soft-perfmon.h"
+#include "llvm-hard-perfmon.h"
+#include "llvm-translator.h"
+#include "llvm-state.h"
+#include "llvm-opc.h"
+#include "llvm.h"
+#include "tracer.h"
+#include "optimization.h"
+
+/* Translator-count cap, search depth, and ActiveQueue table size and mask. */
+#define MAX_TRANSLATORS     8
+#define MAX_SEARCH_DEPTH    8
+#define ACTIVE_QUEUE_SIZE   (1 << 16)
+#define ACTIVE_QUEUE_MASK   (ACTIVE_QUEUE_SIZE - 1)
+
+/* Category under which the HQEMU-specific options below are registered. */
+cl::OptionCategory CategoryHQEMU("HQEMU Options");
+/* Debug level string, handed to DM.setDebugMode() by the LLVMEnv constructor. */
+static cl::opt<std::string> DebugLevel("debuglv", cl::init(""),
+    cl::cat(CategoryHQEMU), cl::desc("Set debug level"));
+/* Debug file name, handed to DM.setDebugMode() by the LLVMEnv constructor. */
+static cl::opt<std::string> DebugFile("debugfile", cl::init(""),
+    cl::cat(CategoryHQEMU), cl::desc("Set debug file (default=stderr)"));
+/* Profile level string, handed to the SoftwarePerfmon constructor. */
+static cl::opt<std::string> ProfileLevel("profile", cl::init(""),
+    cl::cat(CategoryHQEMU), cl::desc("Set profile level"));
+/* Requested number of translators in hybridm mode (clamped when parsed). */
+static cl::opt<unsigned> NumThreads("threads", cl::init(1),
+    cl::cat(CategoryHQEMU), cl::desc("Number of threads used in the hybridm mode"));
+/* Optimization requests with an ID at or above this value are skipped. */
+static cl::opt<unsigned> NumTranslations("count", cl::init(-1U),
+    cl::cat(CategoryHQEMU),
+    cl::desc("Maximum number of traces to translate (default=2^32)"));
+/* Copied into ProfileThreshold by ParseCommandLineOptions(). */
+static cl::opt<unsigned> NETProfileThreshold("net-profile",
+    cl::init(NET_PROFILE_THRESHOLD),
+    cl::cat(CategoryHQEMU),
+    cl::desc("Hot threshold value for NET trace creation (default=50)"));
+/* Copied into PredictThreshold by ParseCommandLineOptions(). */
+static cl::opt<unsigned> NETPredictThreshold("net-predict",
+    cl::init(NET_PREDICT_THRESHOLD),
+    cl::cat(CategoryHQEMU),
+    cl::desc("Maximum number of basic blocks in a NET trace (default=64)"));
+/* Switch that turns the NETPlus algorithm off (see its description). */
+static cl::opt<bool> DisableNETPlus("disable-netplus", cl::init(false),
+    cl::cat(CategoryHQEMU),
+    cl::desc("Disable NETPlus algorithm (use NET trace formation only)"));
+
+
+/* Static members of LLVMEnv. */
+bool LLVMEnv::InitOnce = false;
+int LLVMEnv::TransMode = TRANS_MODE_NONE;
+uint8_t *LLVMEnv::TraceCache = nullptr;
+size_t LLVMEnv::TraceCacheSize = 0;
+bool LLVMEnv::RunWithVTune = false;
+/* Global objects; QM, AF, SP and HP are allocated by the LLVMEnv constructor. */
+LLVMDebug DM;
+LLVMEnv *LLEnv;
+QueueManager *QM;
+AnnotationFactory *AF;
+SoftwarePerfmon *SP;
+HardwarePerfmon *HP;
+ControlFlowGraph GlobalCFG;
+/* Taken by Commit()/getChainSlot() and by dump(), respectively. */
+hqemu::Mutex llvm_global_lock;
+hqemu::Mutex llvm_debug_lock;
+/* Worker-thread control flags and counter; MonThreadID is set at start-up. */
+bool ThreadStop = false;
+bool ThreadExit = false;
+bool TraceCacheFull = false;
+unsigned NumPendingThread = 0;
+int MonThreadID;
+/* Defined elsewhere; assigned by ParseCommandLineOptions(). */
+extern unsigned ProfileThreshold;
+extern unsigned PredictThreshold;
+
+/*
+ * LLVMEnv()
+ *  Initialize LLVM translator(s) and globally shared resources. The LLVMEnv
+ *  instance must be initialized before using the underlying translation
+ *  service and should be initialized only ONCE.
+ */
+LLVMEnv::LLVMEnv() : NumTranslator(1), UseThreading(false), NumFlush(0)
+{
+    /* Set LLVMEnv pointer first so other classes can access it. */
+    LLEnv = this;
+    /* Parse the options first; this also settles NumTranslator. */
+    ParseCommandLineOptions();
+
+    /* Check if HQEMU is running in Intel VTune. */
+    ProbeIntelVTune();
+
+    /* Initialize the debugger. */
+    DM.setDebugMode(DebugLevel, DebugFile);
+
+    dbg() << DEBUG_LLVM << "Initializing LLVM Environment.\n";
+
+    /* Initialize LLVM targets. */
+    InitializeAllTargetInfos();
+    InitializeAllTargets();
+    InitializeAllAsmPrinters();
+    InitializeAllAsmParsers();
+    InitializeAllTargetMCs();
+    InitializeAllDisassemblers();
+    /* Remember the calling thread's ID; it is given to HP->Init() below. */
+    MonThreadID = gettid();
+    qemu_mutex_init(&mutex);
+    /* Create one CPU per translator; translators themselves come later. */
+    Translator.resize(NumTranslator);
+    HelperThread.resize(NumTranslator);
+    ThreadEnv.resize(NumTranslator);
+    for (unsigned i = 0; i < NumTranslator; ++i) {
+        CPUState *cpu = ThreadEnv[i] = cpu_create();
+        CPUArchState *env = (CPUArchState *)cpu->env_ptr;
+        cpu->cpu_index = -i -1;
+        env->build_mode = BUILD_LLVM;
+        Translator[i] = nullptr;
+    }
+
+    QM = new QueueManager;
+    AF = new AnnotationFactory;
+    SP = new SoftwarePerfmon(ProfileLevel);
+    HP = new HardwarePerfmon;
+    /* Only a warning is printed here. */
+    if (SP->Mode & (SPM_HPM | SPM_HOTSPOT)) {
+        if (RunWithVTune)
+            DM.debug() << "Warning: cannot profile hpm,hotspot inside VTune. Disable it.\n";
+    }
+
+    /* Create the memory manager and initialize the optimized code cache. There
+     * is only one copy of the optimized code cache and it is shared by all
+     * underlying translators. */
+    MM = std::shared_ptr<MemoryManager>(
+                MemoryManager::Create(TraceCache, TraceCacheSize));
+
+    CreateTranslator();
+
+    /* Initialize HPM after the LLVM thread is initialized. */
+    HP->Init(MonThreadID);
+
+    dbg() << DEBUG_LLVM << "LLVM Environment initialized. "
+          << format("guest_base=0x%lx.\n", GUEST_BASE)
+          << format("\tBlock code cache: addr=%p size=%zd bytes.\n",
+                    tcg_ctx_global.code_gen_buffer,
+                    tcg_ctx_global.code_gen_buffer_size)
+          << format("\tTrace code cache: addr=%p size=%zd bytes.\n",
+                    TraceCache, TraceCacheSize);
+}
+/* Report code sizes, stop the monitor and the workers, and free shared state. */
+LLVMEnv::~LLVMEnv()
+{
+    if (TransMode == TRANS_MODE_BLOCK) {
+        size_t BlockCodeSize = MM->getCodeSize();
+        dbg() << DEBUG_LLVM << "Finalizing LLVM environment."
+              << "\n\tBlock code size: " << BlockCodeSize << " bytes.\n";
+    } else {
+        size_t BlockCodeSize = (uintptr_t)tcg_ctx_global.code_gen_ptr -
+                               (uintptr_t)tcg_ctx_global.code_gen_buffer;
+        size_t TraceCodeSize = MM->getCodeSize();
+        dbg() << DEBUG_LLVM << "Finalizing LLVM environment."
+              << "\n\tBlock code size  : " << format("%8zu", BlockCodeSize) << " bytes"
+              << "\n\tTrace code size  : " << format("%8zu", TraceCodeSize) << " bytes"
+              << "\n\tTrace/Block ratio: "
+              << format("%.2f%%\n\n", (double)TraceCodeSize * 100 / BlockCodeSize);
+    }
+
+    /* Stop the HPM early so that the handling thread will no longer receive
+     * the overflow signal. */
+    delete HP;
+
+    if (UseThreading && !ThreadExit)
+        StopThread();
+
+    DeleteTranslator();
+    /* Release the image, state and chain data attached to each block. */
+    for (int i = 0, e = tcg_ctx_global.tb_ctx->nb_tbs; i != e; ++i) {
+        if (tbs[i].image) delete_image(&tbs[i]);
+        if (tbs[i].state) delete_state(&tbs[i]);
+        if (tbs[i].chain) ChainInfo::free(&tbs[i]);
+    }
+
+    SP->printProfile();
+
+    delete SP;
+    delete QM;
+    delete AF;
+
+    /* Delete all translated code. */
+    for (unsigned i = 0, e = TransCode.size(); i != e; ++i)
+        delete TransCode[i];
+
+    dbg() << DEBUG_LLVM << "LLVM environment finalized.\n";
+
+    DM.Flush();
+}
+
+/* Set RunWithVTune when an Intel VTune JIT profiling library can be loaded.
+ * The library named by INTEL_JIT_PROFILER32/64 or, failing that, VS_PROFILER
+ * is tried first; libJitPI.so is the fallback. Hosts other than x86 and
+ * x86-64 are never probed. */
+void LLVMEnv::ProbeIntelVTune()
+{
+#if defined(__i386__)
+#define NEW_DLL_ENVIRONMENT_VAR  "INTEL_JIT_PROFILER32"
+#elif defined(__x86_64__)
+#define NEW_DLL_ENVIRONMENT_VAR  "INTEL_JIT_PROFILER64"
+#else
+#define NEW_DLL_ENVIRONMENT_VAR  ""
+#endif
+#define DLL_ENVIRONMENT_VAR      "VS_PROFILER"
+#define DEFAULT_DLLNAME          "libJitPI.so"
+
+    /* No profiler variable is defined for this host architecture. */
+    if (strcmp(NEW_DLL_ENVIRONMENT_VAR, "") == 0)
+        return;
+
+    /* NEW_DLL_ENVIRONMENT_VAR takes precedence over DLL_ENVIRONMENT_VAR. */
+    const char *Name = getenv(NEW_DLL_ENVIRONMENT_VAR);
+    if (Name == nullptr)
+        Name = getenv(DLL_ENVIRONMENT_VAR);
+
+    void *Handle = Name ? dlopen(Name, RTLD_LAZY) : nullptr;
+    /* Fall back to the default library name. */
+    if (Handle == nullptr)
+        Handle = dlopen(DEFAULT_DLLNAME, RTLD_LAZY);
+    if (Handle == nullptr)
+        return;
+
+    /* Only the presence of the library matters; release it right away. */
+    dlclose(Handle);
+    RunWithVTune = true;
+}
+
+/* Write the HQEMU banner (HQEMU and QEMU versions, guest and host ISA) to OS,
+ * then let LLVM print its own version message. */
+static void PrintVersionTo(raw_ostream &OS)
+{
+    Triple HostTriple(sys::getDefaultTargetTriple());
+
+    OS << "HQEMU (http://itanium.iis.sinica.edu.tw/hqemu/):\n"
+       << "  HQEMU version: " << PACKAGE_VERSION_MAJOR << "."
+                        << PACKAGE_VERSION_MINOR << "\n"
+       << "  QEMU version: " << QEMU_VERSION << "\n"
+       << "  Guest ISA: " << TARGET_NAME << "\n"
+       << "  Host ISA: " << HostTriple.getArchName() << "\n";
+    OS << "\n";
+    cl::PrintVersionMessage();
+}
+
+/* Version-printer callback handed to cl::SetVersionPrinter(); its signature
+ * depends on the LLVM version. */
+#if defined(LLVM_V35) || defined(LLVM_V38) || defined(LLVM_V39) || defined(LLVM_V50)
+static void PrintVersion()
+{
+    PrintVersionTo(outs());
+}
+#else
+static void PrintVersion(raw_ostream &OS)
+{
+    PrintVersionTo(OS);
+}
+#endif
+
+/* Configure LLVM and HQEMU from the environment: hide LLVM's general options,
+ * read the translation mode, pass the built-in switches plus the words of
+ * LLVM_CMD to LLVM's option parser, then derive thresholds and thread count. */
+void LLVMEnv::ParseCommandLineOptions()
+{
+    /* Disable passes that would change the DebugLoc metadata which
+     * may fail our block/trace chaining. */
+    static const char *argv[] = {
+        "-disable-tail-duplicate",
+        "-disable-early-taildup",
+        "-disable-block-placement",
+#if defined(TCG_TARGET_ARM) || defined(TCG_TARGET_AARCH64)
+        "-disable-branch-fold",
+#elif defined(TCG_TARGET_PPC64)
+        "-disable-branch-fold",
+        "-ppc-asm-full-reg-names",
+#endif
+    };
+
+    cl::SetVersionPrinter(PrintVersion);
+
+    /* Hide LLVM builtin options. */
+#if defined(LLVM_V35)
+    StringMap<cl::Option*> opts;
+    cl::getRegisteredOptions(opts);
+#else
+    StringMap<cl::Option*> &opts = cl::getRegisteredOptions();
+#endif
+    for (auto &I : opts) {
+        auto opt = I.second;
+        if (opt->Category == &cl::GeneralCategory)
+            opt->setHiddenFlag(cl::Hidden);
+    }
+
+    dbg() << DEBUG_LLVM << "Parsing command line options.\n";
+
+    /* Get translation mode from LLVM_MODE. */
+    TransMode = getTransMode();
+    if (TransMode == TRANS_MODE_INVALID)
+        hqemu_error("invalid LLVM_MODE.\n");
+
+    /* Get command-line options from LLVM_CMD and update them in LLVM. The
+     * string returned by getenv() must not be modified, so it is tokenized
+     * in a private copy; the copy is static so the tokens stay valid. */
+    static std::string CmdLine;
+    std::vector<const char *> PassArgs;
+    if (const char *p = getenv("LLVM_CMD")) {
+        char *save = nullptr;
+        CmdLine = p;
+        for (char *token = strtok_r(&CmdLine[0], " ", &save); token;
+             token = strtok_r(nullptr, " ", &save))
+            PassArgs.push_back(token);
+    }
+
+    SmallVector<const char *, 16> Args;
+    Args.push_back("qemu-" TARGET_NAME);
+    for (unsigned i = 0, e = ARRAY_SIZE(argv); i < e; ++i)
+        Args.push_back(argv[i]);
+    for (const char *s : PassArgs)
+        Args.push_back(s);
+    Args.push_back(nullptr);
+    cl::ParseCommandLineOptions(Args.size() - 1,
+                                const_cast<char **>(&Args[0]));
+
+    /* Overwrite NET trace formation parameters. */
+    ProfileThreshold = NETProfileThreshold;
+    PredictThreshold = NETPredictThreshold;
+
+    /* Update threading number if hybridm is enabled. */
+    UseThreading = (TransMode == TRANS_MODE_HYBRIDM);
+    if (!UseThreading)
+        return;
+
+    if (NumThreads != 1)
+        NumTranslator = (NumThreads < 1) ? 1 : MIN(MAX_TRANSLATORS, NumThreads);
+}
+/* Microseconds slept by a worker at the end of each loop iteration. */
+#if defined(CONFIG_USER_ONLY)
+#define TIMEOUT_INTERVAL 1
+#else
+#define TIMEOUT_INTERVAL 1000
+#endif
+
+/*
+ * WorkerFunc()
+ *  Thread routine of an LLVM translation thread; argv carries the worker ID.
+ */
+void *WorkerFunc(void *argv)
+{
+    unsigned MyID = (unsigned long)argv;
+    LLVMTranslator *Translator = LLEnv->getTranslator(MyID);
+    MemoryManager *MM = LLEnv->getMemoryManager().get();
+    CPUState *cpu = LLEnv->getThreadEnv(MyID);
+    CPUArchState *env = (CPUArchState *)cpu->env_ptr;
+
+    /* Block all signals in this thread. */
+    sigset_t set;
+    sigfillset(&set);
+    pthread_sigmask(SIG_SETMASK, &set, nullptr);
+
+    copy_tcg_context();
+    optimization_init(env);
+    /* Tell StartThread() that this worker is ready. */
+    Atomic<unsigned>::inc_return(&NumPendingThread);
+
+    for (;;) {
+        /* Exit the loop once ThreadExit has been raised. */
+        if (unlikely(ThreadExit))
+            break;
+        /* Park while ThreadStop is set, then fetch the translator again. */
+        if (unlikely(ThreadStop)) {
+            Atomic<unsigned>::inc_return(&NumPendingThread);
+            while (ThreadStop)
+                usleep(100);
+
+            Translator = LLEnv->getTranslator(MyID);
+        }
+
+        /* If the trace cache is full, flag it and park via ThreadStop. */
+        if (unlikely(!MM->isSizeAvailable())) {
+            TraceCacheFull = true;
+            ThreadStop = true;
+            continue;
+        }
+
+        /* Everything is fine. Process an optimization request. */
+        OptimizationInfo *Opt = (OptimizationInfo *)QM->Dequeue();
+        if (Opt)
+            Translator->GenTrace(env, Opt);
+
+        usleep(TIMEOUT_INTERVAL);
+    }
+
+    pthread_exit(nullptr);
+    return nullptr;
+}
+
+/*
+ * CreateTranslator()
+ *  Build one LLVMTranslator per thread environment, clear the thread control
+ *  flags, and launch the worker threads when threading is in use.
+ */
+void LLVMEnv::CreateTranslator()
+{
+    dbg() << DEBUG_LLVM << "Creating " << NumTranslator << " translator(s).\n";
+
+    for (unsigned Idx = 0; Idx < NumTranslator; ++Idx) {
+        auto *ArchEnv = (CPUArchState *)ThreadEnv[Idx]->env_ptr;
+        Translator[Idx] = LLVMTranslator::CreateLLVMTranslator(Idx, ArchEnv);
+    }
+
+    /* Clear the control flags before any worker is started. */
+    ThreadStop = false;
+    ThreadExit = false;
+    TraceCacheFull = false;
+
+    if (UseThreading)
+        StartThread();
+}
+
+/*
+ * DeleteTranslator()
+ *  Destroy every LLVMTranslator instance.
+ */
+void LLVMEnv::DeleteTranslator()
+{
+    dbg() << DEBUG_LLVM << "Destroying " << NumTranslator << " translator(s).\n";
+
+    const bool WorkersRunning = UseThreading && !ThreadExit;
+    if (WorkersRunning) {
+        /* Ask the workers to park and wait until all of them have done so;
+         * then drop pending requests and flush the trace code cache. */
+        ThreadStop = true;
+        while (NumPendingThread != NumTranslator)
+            usleep(100);
+        QM->Flush();
+        MM->Flush();
+    }
+
+    for (unsigned Idx = 0; Idx < NumTranslator; ++Idx) {
+        delete Translator[Idx];
+        Translator[Idx] = nullptr;
+    }
+}
+
+/* Recreate every translator, then clear the flags that parked the workers. */
+void LLVMEnv::RestartTranslator()
+{
+    dbg() << DEBUG_LLVM << "Restarting " << NumTranslator << " translator(s).\n";
+
+    for (unsigned Idx = 0; Idx < NumTranslator; ++Idx)
+        Translator[Idx] = LLVMTranslator::CreateLLVMTranslator(
+                Idx, (CPUArchState *)ThreadEnv[Idx]->env_ptr);
+
+    TraceCacheFull = false;
+    NumPendingThread = 0;
+    ThreadStop = false;
+}
+
+/* Spawn one WorkerFunc thread per translator and wait for all to check in. */
+void LLVMEnv::StartThread()
+{
+    ThreadExit = false;
+    for (unsigned Idx = 0; Idx < NumTranslator; ++Idx) {
+        void *Arg = (void*)(long)Idx;
+        if (pthread_create(&HelperThread[Idx], nullptr, WorkerFunc, Arg) != 0)
+            hqemu_error("failed to create worker thread.\n");
+    }
+
+    /* Each worker bumps NumPendingThread once it is ready. */
+    while (NumPendingThread != NumTranslator)
+        usleep(200);
+    NumPendingThread = 0;
+}
+/* Raise ThreadExit, then join every worker thread. */
+void LLVMEnv::StopThread()
+{
+    ThreadExit = true;
+    for (unsigned Idx = 0; Idx != NumTranslator; ++Idx)
+        pthread_join(HelperThread[Idx], nullptr);
+}
+/* Lock the translator mutex and hand out translator 0; pair every call with
+ * ReleaseSingleTranslator(). */
+LLVMTranslator *LLVMEnv::AcquireSingleTranslator()
+{
+    if (Translator.empty())
+        hqemu_error("internal error.\n");
+    qemu_mutex_lock(&mutex);
+    return Translator[0];
+}
+/* Unlock the mutex taken by AcquireSingleTranslator(). */
+void LLVMEnv::ReleaseSingleTranslator()
+{
+    qemu_mutex_unlock(&mutex);
+}
+
+
+/*
+ * CreateLLVMEnv()
+ *  Construct the unique LLVMEnv. The trace cache must already be allocated.
+ */
+void LLVMEnv::CreateLLVMEnv()
+{
+    if (InitOnce)
+        hqemu_error("LLVM environment already initialized.\n");
+    if (!TraceCache)
+        hqemu_error("llvm_alloc_cache() must be called before this function.\n");
+
+    /* The constructor stores the new object in LLEnv. */
+    new LLVMEnv;
+    InitOnce = true;
+}
+
+/* Destroy the LLVMEnv held in LLEnv; its destructor stops the translation
+ * threads. */
+void LLVMEnv::DeleteLLVMEnv()
+{
+    if (!InitOnce)
+        hqemu_error("LLVM environment already destroyed.\n");
+    delete LLEnv;
+    InitOnce = false;
+}
+
+/* Register TC under the next TraceID, index it by code address, and record
+ * the entry block's id as a dependent trace on every block of the trace. */
+TraceID LLVMEnv::insertTransCode(TranslatedCode *TC)
+{
+    const TraceID NewID = TransCode.size();
+    TransCode.push_back(TC);
+    SortedCode[(uintptr_t)TC->Code] = TC;
+
+    for (auto TB : TC->Trace->TBs)
+        ChainInfo::get(TB)->insertDepTrace(TC->EntryTB->id);
+    return NewID;
+}
+
+/* Reserve a zero-filled entry in ChainPoint under llvm_global_lock and return
+ * its index together with the index encoded as (index << 2) | TB_EXIT_LLVM. */
+LLVMEnv::SlotInfo LLVMEnv::getChainSlot()
+{
+    hqemu::MutexGuard locked(llvm_global_lock);
+    const size_t Index = ChainPoint.size();
+    ChainPoint.push_back(0);
+    return SlotInfo(Index, (uintptr_t)((Index << 2) | TB_EXIT_LLVM));
+}
+
+/* Log the request ID; true means skip (the -count limit has been reached). */
+static bool OptimizeOrSkip()
+{
+    static unsigned NextID = 0;
+    const bool Skip = NextID >= NumTranslations;
+    dbg() << DEBUG_LLVM << "Received an optimization request ID=" << NextID << "."
+          << (Skip ? " (skip)\n" : "\n");
+    return NextID++ >= NumTranslations;
+}
+
+/* Translate one block synchronously; returns 0 if skipped, 1 otherwise. */
+int LLVMEnv::OptimizeBlock(CPUArchState *env, OptRequest Request)
+{
+    if (!InitOnce)
+        hqemu_error("internal error.\n");
+    if (OptimizeOrSkip())
+        return 0;
+
+    env->build_mode = BUILD_LLVM | BUILD_TCG;
+    LLVMTranslator *Worker = LLEnv->AcquireSingleTranslator();
+    Worker->GenBlock(env, Request.release());
+    LLEnv->ReleaseSingleTranslator();
+    env->build_mode = BUILD_NONE;
+    return 1;
+}
+
+/* Submit a trace for optimization. Returns 1 when the request was translated
+ * (hybrids) or queued (hybridm), 0 when it was ignored, skipped or dropped. */
+int LLVMEnv::OptimizeTrace(CPUArchState *env, OptRequest Request)
+{
+    if (!InitOnce || TransMode == TRANS_MODE_NONE || OptimizeOrSkip())
+        return 0;
+
+    OptimizationInfo *Opt = Request.release();
+    Opt->ComposeCFG();
+    if (TransMode == TRANS_MODE_HYBRIDS) {
+        bool Consumed = false;
+        if (!TraceCacheFull) {
+            if (!LLEnv->getMemoryManager()->isSizeAvailable())
+                TraceCacheFull = true;
+            else {
+                LLVMTranslator *Translator = LLEnv->AcquireSingleTranslator();
+                Translator->GenTrace(env, Opt);
+                LLEnv->ReleaseSingleTranslator();
+                Consumed = true;
+            }
+        }
+        if (TraceCacheFull) {
+            /* A request that was never handed to a translator would leak. */
+            if (!Consumed)
+                delete Opt;
+            return 0;
+        }
+    } else if (TransMode == TRANS_MODE_HYBRIDM) {
+        /* Put the optimization request into the request queue and continue. */
+        QM->Enqueue(Opt);
+    }
+    return 1;
+}
+
+#if defined(CONFIG_USER_ONLY)
+QueueManager::QueueManager()
+{
+    CurrentQueue = new Queue;   /* the one queue used in this configuration */
+}
+
+QueueManager::~QueueManager()
+{
+    delete CurrentQueue;        /* Flush() is not called from here */
+}
+
+void QueueManager::Enqueue(OptimizationInfo *Opt)
+{
+    CurrentQueue->enqueue(Opt); /* append the request to the queue */
+}
+
+void *QueueManager::Dequeue()
+{
+    return CurrentQueue->dequeue(); /* Flush() treats a null result as "empty" */
+}
+
+void QueueManager::Flush()
+{
+    /* Drain the queue: keep dequeuing until dequeue() hands back a null
+     * pointer, destroying each pending request on the way. */
+    OptimizationInfo *Pending;
+
+    while ((Pending = (OptimizationInfo *)CurrentQueue->dequeue()) != nullptr)
+        delete Pending;
+}
+
+#else
+QueueManager::QueueManager()
+{
+    ActiveQueue.resize(ACTIVE_QUEUE_SIZE);
+    for (auto &Slot : ActiveQueue)
+        Slot = nullptr;     /* empty until Enqueue() first needs this slot */
+}
+
+QueueManager::~QueueManager()
+{
+    /* Release every queue that was created; deleting an unused (null) slot
+     * is a no-op, so no per-slot check is needed. */
+    for (auto *Slot : ActiveQueue)
+        delete Slot;
+}
+
+void QueueManager::Enqueue(OptimizationInfo *Opt)
+{
+    /* The slot is chosen by pcid & ACTIVE_QUEUE_MASK; its queue is lazy. */
+    Queue *&Slot = ActiveQueue[pcid & ACTIVE_QUEUE_MASK];
+    if (unlikely(Slot == nullptr))
+        Slot = new Queue;
+    Slot->enqueue(Opt);
+}
+
+void *QueueManager::Dequeue()
+{
+    /* Look up the queue for the current pcid slot. A slot whose queue was
+     * never created has nothing pending, so report that as a null pointer. */
+    Queue *Slot = ActiveQueue[pcid & ACTIVE_QUEUE_MASK];
+    return likely(Slot) ? Slot->dequeue() : nullptr;
+}
+
+void QueueManager::Flush()
+{
+    /* Drain every queue that exists, destroying the requests still pending.
+     * The queue objects themselves stay allocated. */
+    for (auto *Slot : ActiveQueue) {
+        if (!Slot)
+            continue;
+
+        /* A null result from dequeue() ends the drain of this queue. */
+        OptimizationInfo *Pending;
+        while ((Pending = (OptimizationInfo *)Slot->dequeue()) != nullptr)
+            delete Pending;
+    }
+}
+#endif
+
+
+/*
+ * OptimizationInfo
+ */
+
+OptimizationInfo::OptimizationInfo(TranslationBlock *HeadTB, TraceEdge &Edges)
+    : isUserTrace(true), isBlock(false), CFG(nullptr)
+{
+    /* The trace is made of the source block of every edge entry. */
+    for (auto &E : Edges)
+        Trace.push_back(E.first);
+
+#if defined(CONFIG_USER_ONLY)
+    if (!llvm_has_annotation(HeadTB->pc, ANNOTATION_LOOP))
+        ExpandTrace(HeadTB, Edges);
+#endif
+
+    /* Build CFG from the edges: one GraphNode per distinct block, created
+     * the first time that block is encountered. */
+    std::map<TranslationBlock *, GraphNode *> NodeMap;
+    auto getNode = [&NodeMap](TranslationBlock *TB) -> GraphNode * {
+        GraphNode *&Node = NodeMap[TB];
+        if (!Node)
+            Node = new GraphNode(TB);
+        return Node;
+    };
+
+    /* The head node comes first, so CFG is valid even without edges. */
+    CFG = getNode(HeadTB);
+
+    for (auto &E : Edges) {
+        GraphNode *ParentNode = getNode(E.first);
+        for (auto Child : E.second)
+            ParentNode->insertChild(getNode(Child));
+    }
+}
+
+void OptimizationInfo::SearchCycle(TraceNode &SearchNodes, TraceNode &Nodes,
+                                   TraceEdge &Edges, TBVec &Visited, int Depth)
+{
+    TranslationBlock *Curr = Visited.back();
+
+    /* Stop at blocks annotated as loops, and once the region already holds
+     * PredictThreshold blocks. */
+    if (llvm_has_annotation(Curr->pc, ANNOTATION_LOOP) ||
+        Nodes.size() >= PredictThreshold)
+        return;
+
+    /* Reaching a node of the main NET trace closes a cyclic path: every link
+     * walked so far (Visited[0] -> ... -> Curr) is added to the trace edges. */
+    if (SearchNodes.find(Curr) != SearchNodes.end()) {
+        for (size_t Idx = 1; Idx < Visited.size(); ++Idx) {
+            Nodes.insert(Visited[Idx]);
+            Edges[Visited[Idx - 1]].insert(Visited[Idx]);
+        }
+        return;
+    }
+
+    /* No cycle yet: follow the successors, but never deeper than
+     * MAX_SEARCH_DEPTH levels. */
+    if (Depth == MAX_SEARCH_DEPTH)
+        return;
+
+    for (auto Succ : GlobalCFG.getSuccessor(Curr)) {
+        Visited.push_back(Succ);
+        SearchCycle(SearchNodes, Nodes, Edges, Visited, Depth + 1);
+        Visited.pop_back();
+    }
+}
+
+/*
+ * ExpandTrace()
+ *  Expand a NET trace to a bigger region with the NETPlus algorithm.
+ *  NETPlus: trace formation algorithm based on the paper published in
+ *  RESoLVE'11. D. Davis and K. Hazelwood, "Improving Region Selection Through
+ *  Loop Completion," in ASPLOS Workshop on Runtime Environments/Systems,
+ *  Layering, and Virtualized Environments, 2011.
+ */
+void OptimizationInfo::ExpandTrace(TranslationBlock *HeadTB, TraceEdge &Edges)
+{
+    if (DisableNETPlus)
+        return;
+    /* MainTraceNodes: cycle targets. NodeMap: guest pc -> its block. */
+    TraceNode Nodes;
+    TraceNode MainTraceNodes;
+    std::map<target_ulong, TranslationBlock*> NodeMap;
+#ifdef USE_TRACETREE_ONLY
+    MainTraceNodes.insert(HeadTB);
+    NodeMap[HeadTB->pc] = HeadTB;
+#else
+    for (auto &E : Edges) {
+        TranslationBlock *TB = E.first;
+        MainTraceNodes.insert(TB);
+        NodeMap[TB->pc] = TB;
+    }
+#endif
+    /* Nodes starts out as the source block of every existing edge. */
+    for (auto &E : Edges)
+        Nodes.insert(E.first);
+
+    /* Put critical section when traversing GlobalCFG. */
+    hqemu::MutexGuard locked(GlobalCFG.getLock());
+
+    for (auto TB : Trace) {
+        TBVec Visited;
+        Visited.push_back(TB);
+        if (NodeMap.find(TB->jmp_pc[0]) != NodeMap.end())
+            Edges[TB].insert(NodeMap[TB->jmp_pc[0]]);
+        if (TB->jmp_pc[1] != (target_ulong)-1 &&
+            NodeMap.find(TB->jmp_pc[1]) != NodeMap.end())
+            Edges[TB].insert(NodeMap[TB->jmp_pc[1]]);
+        /* Then search past each successor for a path back to MainTraceNodes. */
+        for (auto Succ : GlobalCFG.getSuccessor(TB)) {
+            Visited.push_back(Succ);
+            SearchCycle(MainTraceNodes, Nodes, Edges, Visited, 0);
+            Visited.pop_back();
+        }
+    }
+}
+
+/*
+ * ComposeCFG()
+ *  Compose a trace of CFG from a list of TBs.
+ */
+void OptimizationInfo::ComposeCFG()
+{
+    bool isUser = true;
+    TranslationBlock *HeadTB = Trace[0];    /* assumes Trace is non-empty — TODO confirm */
+    /* Every early return below leaves CFG and isUserTrace untouched. */
+#if defined(CONFIG_SOFTMMU)
+    isUser = isUserTB(HeadTB) ? true : false;
+    for (auto TB : Trace) {
+        if (unlikely(TB->mode == BLOCK_INVALID)) {
+            /* A NET trace may contain invalidated block because the block
+             * is invalidated during trace formation. */
+            dbg() << DEBUG_LLVM << __func__ << ": skip due to invalidated block\n";
+            return;
+        }
+        /* A trace headed by a user block must not contain non-user blocks. */
+        if (isUser && isUserTB(TB) == false) {
+            dbg() << DEBUG_LLVM << __func__ << ": skip due to mixed mode\n";
+            return;
+        }
+
+        /* Our translator assumes that component blocks have the same cs_base. */
+        if (TB->cs_base != HeadTB->cs_base) {
+            dbg() << DEBUG_LLVM << __func__ << ": skip due to inconsistent cs\n";
+            return;
+        }
+    }
+#endif
+
+    /* Check if the consecutive blocks are really connected. */
+    TraceEdge Edges;
+
+    TranslationBlock *Curr = Trace[0];
+    for (unsigned i = 1, e = Trace.size(); i != e; ++i) {
+        TranslationBlock *Pred = Trace[i - 1];
+        Curr = Trace[i];
+        if (Pred->jmp_pc[0] != (target_ulong)-1 &&
+            Pred->jmp_pc[0] != Curr->pc &&
+            Pred->jmp_pc[1] != Curr->pc) {
+            /* Disconnected. Discard the trailing blocks. */
+            Trace.resize(i);
+            LoopHeadIdx = -1;
+            break;
+        }
+
+        /* Connected. */
+        Edges[Pred].insert(Curr);
+    }
+    if (LoopHeadIdx != -1)      /* link the last block back to the loop head */
+        Edges[Curr].insert(Trace[LoopHeadIdx]);
+
+#if defined(CONFIG_USER_ONLY)
+    if (!llvm_has_annotation(Trace[0]->pc, ANNOTATION_LOOP))
+        ExpandTrace(HeadTB, Edges);
+#endif
+
+    /* Build CFG from the edges. */
+    std::map<TranslationBlock *, GraphNode *> NodeMap;
+
+    NodeMap[HeadTB] = new GraphNode(HeadTB);
+    for (auto &E : Edges) {
+        TranslationBlock *Parent = E.first;
+        if (NodeMap.find(Parent) == NodeMap.end())
+            NodeMap[Parent] = new GraphNode(Parent);
+
+        GraphNode *ParentNode = NodeMap[Parent];
+        for (auto Child : E.second) {
+            if (NodeMap.find(Child) == NodeMap.end())
+                NodeMap[Child] = new GraphNode(Child);
+
+            ParentNode->insertChild(NodeMap[Child]);
+        }
+    }
+
+    CFG = NodeMap[HeadTB];
+    isUserTrace = isUser;
+}
+
+
+/* The following implements routines of the C interfaces for QEMU. */
+extern "C" {
+
+void hqemu_help(void)
+{
+    /* Hide LLVM builtin options (those registered in cl::GeneralCategory). */
+#if defined(LLVM_V35)
+    StringMap<cl::Option*> opts;
+    cl::getRegisteredOptions(opts);
+#else
+    StringMap<cl::Option*> &opts = cl::getRegisteredOptions();
+#endif
+    for (auto &I : opts) {
+        auto opt = I.second;
+        if (opt->Category == &cl::GeneralCategory)
+            opt->setHiddenFlag(cl::Hidden);
+    }
+    /* Args[0] is the only real argument; the nullptr terminator is not counted. */
+    SmallVector<const char *, 16> Args;
+    Args.push_back("\n  export LLVM_CMD='[OPTION1] [OPTION2]'\n  qemu-" TARGET_NAME);
+    Args.push_back(nullptr);
+    cl::ParseCommandLineOptions(Args.size() - 1,
+                                const_cast<char **>(&Args[0]));
+    cl::PrintHelpMessage(false, false);
+}
+
+int llvm_init()
+{
+    LLVMEnv::CreateLLVMEnv();   /* its outcome is not checked here */
+    return 0;                   /* always 0 */
+}
+
+int llvm_finalize()
+{
+    LLVMEnv::DeleteLLVMEnv();
+#if 0
+    llvm_shutdown();            /* compiled out */
+#endif
+    return 0;                   /* always 0 */
+}
+
+int llvm_alloc_cache()
+{
+    /* Lower half (page-masked) stays the block cache; the rest holds traces. */
+    const size_t Total = tcg_ctx.code_gen_buffer_size;
+    const size_t BlockCacheSize = (Total / 2) & qemu_real_host_page_mask;
+    LLVMEnv::TraceCache = (uint8_t *)tcg_ctx.code_gen_buffer + BlockCacheSize;
+    LLVMEnv::TraceCacheSize = Total - BlockCacheSize;
+    tcg_ctx.code_gen_buffer_size = BlockCacheSize;
+    return 0;
+}
+
+int llvm_check_cache(void)
+{
+    /* Reports 1 both before initialization and once TraceCacheFull is set;
+     * 0 otherwise. */
+    return (LLVMEnv::InitOnce == false || TraceCacheFull) ? 1 : 0;
+}
+
+/*
+ * llvm_tb_flush()
+ *  Wrapper function to flush the optimized code cache.
+ */
+int llvm_tb_flush(void)
+{
+    /* Nothing to flush before initialization or when translation is off. */
+    if (LLVMEnv::InitOnce == false || LLVMEnv::TransMode == TRANS_MODE_NONE)
+        return 1;
+
+    dbg() << DEBUG_LLVM << __func__ << " entered.\n";
+
+    LLEnv->DeleteTranslator();
+
+    /* Drop the per-block data attached to every translation block. */
+    const int NumTBs = tcg_ctx_global.tb_ctx->nb_tbs;
+    for (int Idx = 0; Idx != NumTBs; ++Idx) {
+        TranslationBlock *TB = &tbs[Idx];
+        if (TB->image) delete_image(TB);
+        if (TB->state) delete_state(TB);
+        if (TB->chain) ChainInfo::free(TB);
+
+        TB->image = TB->state = TB->chain = nullptr;
+    }
+
+    /* Remove all translated code along with its lookup tables. */
+    LLVMEnv::TransCodeList &TransCode = LLEnv->getTransCode();
+    for (auto *TC : TransCode)
+        delete TC;
+    TransCode.clear();
+    LLEnv->getSortedCode().clear();
+    LLEnv->getChainPoint().clear();
+
+    /* Clear global cfg. */
+    GlobalCFG.reset();
+
+    LLEnv->RestartTranslator();
+    LLEnv->incNumFlush();
+
+    dbg() << DEBUG_LLVM << __func__ << ": trace cache flushed.\n";
+    return 0;
+}
+
+static void llvm_suppress_chaining(TranslationBlock *tb)
+{
+    /* TODO: add unlinking rule for non-x86 hosts. */
+    /* Re-aim every recorded chain site at a fixed, host-specific offset past
+     * the site itself, then forget the sites. */
+    std::vector<uintptr_t> &Chains = ChainInfo::get(tb)->Chains;
+    for (uintptr_t Site : Chains) {
+#if defined(TCG_TARGET_I386)
+        patch_jmp(Site, Site + 5);
+#elif defined(TCG_TARGET_ARM) || defined(TCG_TARGET_AARCH64)
+        patch_jmp(Site, Site + 4);
+#elif defined(TCG_TARGET_PPC64)
+        patch_jmp(Site, Site + 16);
+#endif
+        (void)Site;
+    }
+    Chains.clear();
+}
+
+/*
+ * llvm_tb_remove()
+ *  Remove the traces containing the `tb' that is invalidated by QEMU.
+ */
+int llvm_tb_remove(TranslationBlock *tb)
+{
+    if (LLVMEnv::TransMode == TRANS_MODE_NONE)
+        return 1;
+    if (!tb->chain)
+        return 1;
+    /* Returns 1 on every path except "no dependent trace" below, which is 0. */
+    /* Unlink traces that jump to this tb. */
+    llvm_suppress_chaining(tb);
+
+    if (LLVMEnv::TransMode == TRANS_MODE_BLOCK) {
+        patch_jmp(tb_get_jmp_entry(tb), tb_get_jmp_next(tb));
+        ChainInfo::free(tb);
+        return 1;
+    }
+
+    LLVMEnv::TransCodeList &TransCode = LLEnv->getTransCode();
+    LLVMEnv::TransCodeMap &SortedCode = LLEnv->getSortedCode();
+    std::vector<BlockID> &DepTraces = ChainInfo::get(tb)->DepTraces;
+
+    hqemu::MutexGuard locked(llvm_global_lock);
+
+    /* Remove traces that contain this tb. */
+    if (DepTraces.empty())
+        return 0;
+
+    for (unsigned i = 0, e = DepTraces.size(); i != e; ++i) {
+        TranslationBlock *EntryTB = &tbs[DepTraces[i]];
+        if (EntryTB->tid == -1) {
+            /* This can happen when a trace block (not head) was removed
+             * before and at that time the tid of the trace head block is
+             * set to -1. Now, the trace head block is going to be removed
+             * and we just skip it. */
+            continue;
+        }
+
+        TranslatedCode *TC = TransCode[EntryTB->tid];
+        if (!TC->Active)
+            hqemu_error("fatal error.\n");
+
+        TC->Active = false;
+        SortedCode.erase((uintptr_t)TC->Code);
+        patch_jmp(tb_get_jmp_entry(EntryTB), tb_get_jmp_next(EntryTB));
+
+        /* For system-mode emulation, since the source traces do not directly
+         * jump to the trace code, we do not need to suppress the traces
+         * chaining to the trace head block. Unlinking the jump from the
+         * trace head block to the trace code is sufficient to keep execution
+         * from going to the trace code. */
+#if defined(CONFIG_USER_ONLY)
+        llvm_suppress_chaining(EntryTB);
+#endif
+        /* The head block goes back to its own block code, with no trace. */
+        EntryTB->mode = BLOCK_ACTIVE;
+        EntryTB->exec_count = 0;
+        EntryTB->opt_ptr = EntryTB->tc_ptr;
+        EntryTB->tid = -1;
+    }
+
+    DepTraces.clear();
+    ChainInfo::free(tb);
+
+    return 1;
+}
+
+/*
+ * llvm_resolve_address()
+ *  Given the value returned when leaving the code cache, return the patch
+ *  address for the region chaining.
+ */
+static uintptr_t llvm_resolve_address(uintptr_t addr)
+{
+    if (LLVMEnv::InitOnce == false)
+        return 0;
+    hqemu::MutexGuard locked(llvm_global_lock);
+    /* The slot index sits above the two low tag bits (see getChainSlot()).
+     * An index outside the table resolves to 0, i.e. "nothing to patch". */
+    LLVMEnv::ChainSlot &ChainPoint = LLEnv->getChainPoint();
+    size_t Key = addr >> 2;
+    return Key < ChainPoint.size() ? ChainPoint[Key] : 0;
+}
+
+#if defined(CONFIG_USER_ONLY)       /* user mode: no block counts as cross-page */
+#define cross_page(blk)             (0)
+#define trace_add_jump(src, dst)    patch_jmp((src), (dst)->opt_ptr)
+#else                               /* system mode: jump to the block's tc_ptr */
+#define cross_page(blk)             ((blk)->page_addr[1] != (tb_page_addr_t)-1)
+#define trace_add_jump(src, dst)    patch_jmp((src), (dst)->tc_ptr)
+#endif
+
+void llvm_handle_chaining(uintptr_t next_tb, TranslationBlock *tb)
+{
+    if ((next_tb & TB_EXIT_MASK) != TB_EXIT_LLVM) {
+        /* Left a regular block: next_tb = predecessor pointer | jump slot. */
+        if (next_tb == 0 || cross_page(tb))
+            return;
+        TranslationBlock *pred = (TranslationBlock *)(next_tb & ~TB_EXIT_MASK);
+        int n = next_tb & TB_EXIT_MASK;
+        tb_add_jump(pred, n, tb);
+        GlobalCFG.insertLink(pred, tb);
+        return;
+    }
+    /* Left trace code: turn the exit value into its patch address. */
+    next_tb = llvm_resolve_address(next_tb);
+    if (!next_tb || cross_page(tb))
+        return;
+    /* Remember that the trace at next_tb jumps here, then link it. For
+     * system-mode emulation the jump goes to the head 'block' itself. */
+    ChainInfo::get(tb)->insertChain(next_tb);
+    trace_add_jump(next_tb, tb);
+}
+
+int llvm_locate_trace(uintptr_t searched_pc)
+{
+    /* 1 if searched_pc lies in [TraceCache, TraceCache + TraceCacheSize). */
+    const uintptr_t Base = (uintptr_t)LLVMEnv::TraceCache;
+    return (searched_pc >= Base && searched_pc < Base + LLVMEnv::TraceCacheSize);
+}
+
+TranslationBlock *llvm_find_pc(CPUState *cpu, uintptr_t searched_pc)
+{
+    /* LLEnv is only used after the InitOnce and address-range checks. */
+    if (LLVMEnv::InitOnce == false)
+        return nullptr;
+    if (!llvm_locate_trace(searched_pc))
+        return nullptr;
+    CPUArchState *env = (CPUArchState *)cpu->env_ptr;
+    hqemu::MutexGuard locked(llvm_global_lock);
+
+    /* upper_bound() == begin() means no trace starts at or below searched_pc. */
+    LLVMEnv::TransCodeMap &SortedCode = LLEnv->getSortedCode();
+    LLVMEnv::TransCodeMap::iterator I = SortedCode.upper_bound(searched_pc);
+    if (I == SortedCode.begin())
+        return nullptr;
+    TranslatedCode *TC = (--I)->second;
+    if (env->restore_val >= TC->Restore.size()) {
+        auto HostDisAsm = LLEnv->getTranslator(0)->getHostDisAsm();
+        if (HostDisAsm)
+            HostDisAsm->PrintOutAsm((uint64_t)TC->Code, TC->Size);
+        hqemu_error("got exception at 0x%zx\n", searched_pc);
+    }
+    /* Since restore_val is no longer used, we set it to the opc index so
+     * the later restore (llvm_restore_state) can quickly get it. */
+    std::pair<BlockID, uint16_t> RestoreInfo = TC->Restore[env->restore_val];
+    env->restore_val = RestoreInfo.second - 1;
+    return &tbs[RestoreInfo.first];
+}
+
+/*
+ * llvm_restore_state()
+ *  The cpu state corresponding to 'searched_pc' is restored.
+ */
+int llvm_restore_state(CPUState *cpu, TranslationBlock *tb,
+                       uintptr_t searched_pc)
+{
+    target_ulong data[TARGET_INSN_START_WORDS] = { tb->pc };
+    CPUArchState *env = (CPUArchState *)cpu->env_ptr;
+    uint8_t *p = tb->tc_search;
+
+    /* Reconstruct the stored insn data up to the instruction whose index was
+     * left in env->restore_val (llvm_find_pc() stores it there).
+     * searched_pc is not consulted here. */
+    for (unsigned i = 0, e = tb->icount; i != e; ++i) {
+        for (unsigned j = 0; j < TARGET_INSN_START_WORDS; ++j)
+            data[j] += decode_sleb128(&p);
+        /* Each record ends with a host pc delta: decode it only to step over
+         * it, since the match is made by index. */
+        (void)decode_sleb128(&p);
+        if (env->restore_val == i) {
+            restore_state_to_opc(env, tb, data);
+            return 0;
+        }
+    }
+
+    /* No instruction with that index: nothing was restored. */
+    return -1;
+}
+
+/*
+ * llvm_fork_start()
+ *  Wrapper function to stop the optimization service before performing fork.
+ */
+void llvm_fork_start(void)
+{
+    /* The thread is stopped only when LLEnv reports threading; otherwise
+     * there is nothing to do. */
+    if (LLEnv->isThreading()) {
+        dbg() << DEBUG_LLVM << __func__ << " entered.\n";
+        LLEnv->StopThread();
+    }
+}
+
+/*
+ * llvm_fork_end()
+ *  Wrapper function to restart the optimization service after performing fork.
+ */
+void llvm_fork_end(int child)
+{
+    if (!LLEnv->isThreading())
+        return;
+    dbg() << DEBUG_LLVM << __func__ << " entered.\n";
+
+    if (child != 0) {
+        /* child != 0: flag the thread as exited, switch translation off and
+         * re-initialize the environment mutex. */
+        ThreadExit = true;
+        LLVMEnv::setTransMode(TRANS_MODE_NONE);
+        qemu_mutex_init(&LLEnv->mutex);
+        return;
+    }
+    /* Now, restart the LLVM thread. */
+    LLEnv->StartThread();
+}
+
+int llvm_has_annotation(target_ulong addr, int annotation)
+{
+    /* ANNOTATION_LOOP is the only kind that is looked up (through AF);
+     * every other kind yields 0. */
+    return (annotation == ANNOTATION_LOOP) ? (AF->hasLoopAnnotation(addr) == true) : 0;
+}
+
+}
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/optimization.cpp b/llvm/optimization.cpp
new file mode 100644
index 0000000..15597bf
--- /dev/null
+++ b/llvm/optimization.cpp
@@ -0,0 +1,317 @@
+/*
+ *  (C) 2010 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *   This file implements the basic optimization schemes including
+ *   (1) instruction TLB (iTLB),
+ *   (2) indirect branch target cache (IBTC),
+ *   (3) cross-page block linking (CPBL), and
+ *   (4) large page table (LPT).
+ */
+
+#include "tracer.h"
+#include "optimization.h"
+
+
+#if defined(ENALBE_CPU_PROFILE)   /* NOTE(review): "ENALBE" looks like a typo of "ENABLE" — confirm against the define used by the build */
+#  define PROFILE(X) do { X; } while (0)   /* profiling on: evaluate X */
+#else
+#  define PROFILE(X) do { } while (0)      /* profiling off: X is dropped */
+#endif
+
+/* The following implements routines of the C interfaces for QEMU. */
+extern "C" {
+
+TranslationBlock *tbs;
+unsigned long alignment_count[2]; /* 0: misaligned, 1: aligned. */
+unsigned long aligned_boundary = 16;
+
+extern uint8_t *ibtc_ret_addr;
+
+/*
+ * iTLB (Instruction TLB)
+ */
+void itlb_update_entry(CPUArchState *env, TranslationBlock *tb)
+{
+    ITLB &Cache = cpu_get_itlb(env);
+    Cache.insert(tb->pc, tb->page_addr[0] & TARGET_PAGE_MASK);
+    if (tb->page_addr[1] != (tb_page_addr_t)-1)     /* block has a 2nd page */
+        Cache.insert(tb->pc + tb->size, tb->page_addr[1] & TARGET_PAGE_MASK);
+}
+
+int itlb_lookup(CPUArchState *env, target_ulong pc, uint64_t paddr)
+{
+    /* Hit when the entry cached for pc equals the page part of paddr. */
+    return cpu_get_itlb(env).get(pc) == (paddr & TARGET_PAGE_MASK);
+}
+
+/*
+ * IBTC (Indirect Branch Translation Cache)
+ */
+#if defined(ENABLE_IBTC)
+
+/* Update IBTC hash table.
+ * Note: we do not cache TBs that cross page boundary. */
+void ibtc_update_entry(CPUArchState *env, TranslationBlock *tb)
+{
+    IBTC &Cache = cpu_get_ibtc(env);
+    if (Cache.needUpdate()) {
+        /* The pending-update flag is consumed even if nothing is cached. */
+        Cache.resetUpdate();
+#if defined(CONFIG_SOFTMMU)
+        const bool CrossPage = tb->page_addr[1] != (tb_page_addr_t)-1;
+#else
+        const bool CrossPage = false;
+#endif
+        if (!CrossPage)
+            Cache.insert(tb->pc, tb);
+    }
+}
+
+/* Helper function to lookup the IBTC hash table. */
+void *helper_lookup_ibtc(CPUArchState *env)
+{
+    CPUState *cpu = ENV_GET_CPU(env);
+    if (unlikely(cpu->tcg_exit_req != 0)) {     /* pending exit request: clear it and leave */
+        cpu->tcg_exit_req = 0;
+        return ibtc_ret_addr;
+    }
+
+    /* A match of 'pc', 'cs_base' and 'flags' results in a IBTC hit. Since
+     * cs_base is only meaningful with x86 guest and system mode (cs_base is
+     * always 0 for user-mode emulation and non-x86 guest), we only compare
+     * cs_base with system mode emulation of x86 guest. */
+
+    target_ulong pc = cpu_get_pc(env);
+    IBTC &ibtc = cpu_get_ibtc(env);
+    TranslationBlock *next_tb = ibtc.get(pc);
+
+    PROFILE( ibtc.incTotal() );
+    /* With CONFIG_SOFTMMU the iTLB check guards the state check below. */
+    if (likely(next_tb)) {
+#if defined(CONFIG_SOFTMMU)
+        if (likely(itlb_lookup(env, pc, next_tb->page_addr[0])))
+#endif
+        if (likely(cpu_check_state(env, next_tb->cs_base, next_tb->flags))) {
+            cpu->current_tb = next_tb;
+            return next_tb->opt_ptr;
+        }
+    }
+
+    PROFILE( ibtc.incMiss() );
+    /* Miss: flag the IBTC for an update and leave through ibtc_ret_addr. */
+    ibtc.setUpdate();
+    return ibtc_ret_addr;
+}
+#else
+void ibtc_update_entry(CPUArchState *env, TranslationBlock *tb) {}
+void *helper_lookup_ibtc(CPUArchState *env) { return ibtc_ret_addr; }
+#endif /* ENABLE_IBTC */
+
+
+/*
+ * CPBL (Cross-Page Block Linking)
+ */
+#if defined(ENABLE_CPBL)
+void *helper_lookup_cpbl(CPUArchState *env)
+{
+    CPUState *cpu = ENV_GET_CPU(env);
+    if (unlikely(cpu->tcg_exit_req != 0)) {     /* pending exit request: clear it and leave */
+        cpu->tcg_exit_req = 0;
+        return ibtc_ret_addr;
+    }
+
+    /* A match of 'pc', 'cs_base' and 'flags' results in a CPBL hit. Since
+     * cs_base is only meaningful with x86 guest and system mode (cs_base is
+     * always 0 for user-mode emulation and non-x86 guest), we only compare
+     * cs_base with system mode emulation of x86 guest. */
+
+    target_ulong pc = cpu_get_pc(env);
+    TranslationBlock *next_tb = cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)];
+
+    PROFILE( cpu_get_cpbl(env).incTotal() );
+    /* A hit needs a cached block for this exact pc whose state also matches. */
+    if (likely(next_tb && next_tb->pc == pc))
+    if (likely(cpu_check_state(env, next_tb->cs_base, next_tb->flags))) {
+        cpu->current_tb = next_tb;
+        return next_tb->opt_ptr;
+    }
+
+    PROFILE( cpu_get_cpbl(env).incMiss() );
+    /* Miss: leave through ibtc_ret_addr. */
+    return ibtc_ret_addr;
+}
+
+int helper_validate_cpbl(CPUArchState *env, target_ulong pc, int id)
+{
+    TranslationBlock *tb = &tbs[id];    /* id is not range-checked here */
+    /* NOTE(review): the second lookup also runs for single-page tbs whose first lookup missed — confirm intended. */
+    PROFILE( cpu_get_cpbl(env).incValidateTotal() );
+    if (tb->page_addr[1] == (tb_page_addr_t)-1 &&
+        likely(itlb_lookup(env, pc, tb->page_addr[0])))
+        return 1;
+    if (likely(itlb_lookup(env, pc + TARGET_PAGE_SIZE, tb->page_addr[1])))
+        return 1;
+    PROFILE( cpu_get_cpbl(env).incValidateMiss() );
+    return 0;
+}
+
+#else
+void *helper_lookup_cpbl(CPUArchState *env) { return ibtc_ret_addr; }
+int helper_validate_cpbl(CPUArchState *env, target_ulong pc, int id) { return 0; }
+#endif /* ENABLE_CPBL */
+
+
+#if defined(ENABLE_LPAGE)
+int lpt_reset(CPUArchState *env)
+{
+    /* Nothing to reset (0) while no optimization object is linked to env. */
+    if (env->opt_link == nullptr)
+        return 0;
+    cpu_get_lpt(env).reset();
+    return 1;
+}
+/* Add a large page to LPT. */
+int lpt_add_page(CPUArchState *env, target_ulong addr, target_ulong size)
+{
+    /* Record the page in the large page table; always returns 1. */
+    cpu_get_lpt(env).insert(addr, size);
+    return 1;
+}
+
+/* Given an address, return 1 if this address overlaps with any tracked
+ * large page and return 0 otherwise. The large page record is NOT removed
+ * if it is found. */
+int lpt_search_page(CPUArchState *env, target_ulong addr, target_ulong *addrp,
+                    target_ulong *sizep)
+{
+    LargePageTable &Table = cpu_get_lpt(env);   /* queried in SEARCH mode */
+    return Table.search(addr, LargePageTable::SEARCH, addrp, sizep);
+}
+
+/* Given an address, return 1 if this address overlaps with any tracked
+ * large page and return 0 otherwise. If a large page is found, it is
+ * removed from the list. */
+int lpt_flush_page(CPUArchState *env, target_ulong addr, target_ulong *addrp,
+                   target_ulong *sizep)
+{
+    LargePageTable &Table = cpu_get_lpt(env);
+    PROFILE( Table.incTotal() );
+    const bool Hit = Table.search(addr, LargePageTable::FLUSH, addrp, sizep);
+    if (!Hit)
+        PROFILE( Table.incMiss() );
+    return Hit ? 1 : 0;     /* 1 when the FLUSH-mode search found a page */
+}
+#else
+int lpt_reset(CPUArchState *env) { return 0; }
+int lpt_add_page(CPUArchState *env, target_ulong addr, target_ulong size) { return 0; }
+int lpt_search_page(CPUArchState *env, target_ulong addr,
+                    target_ulong *addrp, target_ulong *sizep) { return 0; }
+int lpt_flush_page(CPUArchState *env, target_ulong addr,
+                   target_ulong *addrp, target_ulong *sizep) { return 0; }
+#endif
+
+/* Initialize the optimization schemes. */
+int optimization_init(CPUArchState *env)
+{
+    CPUState *cpu = ENV_GET_CPU(env);
+    if (cpu->cpu_index == 0) {      /* global setup and checks, CPU 0 only */
+        tbs = tcg_ctx.tb_ctx->tbs;
+        if (!tbs) {
+            std::cerr << __func__ << ": fatal error.\n";
+            exit(1);                /* fatal: do not exit with a success status */
+        }
+        if (get_cpu_size() != sizeof(CPUArchState)) {
+            std::cerr << "Inconsistent CPUArchState size in C and C++.\n"
+                         "This may be because sizeof empty struct in C is "
+                         "different with C++. Please fix this.\n";
+            exit(1);                /* fatal: do not exit with a success status */
+        }
+    }
+
+    /* Create a processor tracer for each env. */
+    BaseTracer *Tracer = BaseTracer::CreateTracer(env);
+
+    /* Create optimization facilities. */
+    CPUOptimization *Opt = new CPUOptimization(cpu, Tracer);
+
+    /* Make an uplink to the optimization facility object. */
+    env->opt_link = Opt;
+    return 1;
+}
+
+/* Finalize the optimization schemes. */
+int optimization_finalize(CPUArchState *env)
+{
+    if (env->opt_link == nullptr)
+        return 0;
+    PROFILE( cpu_get_ibtc(env).dump() );
+#if defined(CONFIG_SOFTMMU)
+    PROFILE( cpu_get_cpbl(env).dump() );
+    PROFILE( cpu_get_lpt(env).dump() );
+#endif
+    BaseTracer::DeleteTracer(env);
+    delete (CPUOptimization *)env->opt_link;
+    /* Clear the uplink so the opt_link == nullptr guards do not see a stale pointer. */
+    env->opt_link = nullptr;
+    return 1;
+}
+
+/* Reset the optimization schemes to their default values. */
+int optimization_reset(CPUArchState *env, int force_flush)
+{
+    if (env->opt_link == nullptr)
+        return 0;
+
+    /* The iTLB is always cleared; the IBTC only when force_flush is set. */
+    ITLB &InsnTLB = cpu_get_itlb(env);
+    IBTC &BranchCache = cpu_get_ibtc(env);
+    InsnTLB.reset();
+    if (force_flush)
+        BranchCache.reset();
+
+    tracer_reset(env);
+    return 1;
+}
+
+int optimization_remove_entry(CPUArchState *env, TranslationBlock *tb)
+{
+    /* Drop tb from the IBTC of env; always returns 1. */
+    cpu_get_ibtc(env).remove(tb);
+    return 1;
+}
+
+int optimization_flush_page(CPUArchState *env, target_ulong pc)
+{
+    /* CONFIG_SOFTMMU builds flush the iTLB for pc; other builds reset the
+     * whole IBTC and do not look at pc. Always returns 1. */
+#if defined(CONFIG_SOFTMMU)
+    cpu_get_itlb(env).flush(pc);
+#else
+    cpu_get_ibtc(env).reset();
+#endif
+    return 1;
+}
+
+int optimization_init_tb(TranslationBlock *tb, int id)
+{
+    /* Identity and bookkeeping: no trace id (-1) is attached yet. */
+    tb->id = id;
+    tb->tid = -1;
+    tb->mode = BLOCK_NONE;
+    tb->exec_count = 0;
+    /* Nothing is patched, linked or allocated for the block so far. */
+    tb->opt_ptr = nullptr;
+    tb->patch_jmp = 0;
+    tb->patch_next = 0;
+    tb->jmp_pc[0] = tb->jmp_pc[1] = (target_ulong)-1;
+    tb->image = tb->state = tb->chain = nullptr;
+    return 1;
+}
+
+}
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
+
diff --git a/llvm/pass/CombineCasts.cpp b/llvm/pass/CombineCasts.cpp
new file mode 100644
index 0000000..71a74ff
--- /dev/null
+++ b/llvm/pass/CombineCasts.cpp
@@ -0,0 +1,321 @@
+/*
+ *  (C) 2015 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm-target.h"
+#include "llvm-opc.h"
+#include "llvm-pass.h"
+#include "utils.h"
+
+#define PASS_NAME "CombineCasts"
+
+/*
+ * CombineCasts Pass: folds bitcasts into loads/stores and removes cast chains.
+ */
+class CombineCasts : public FunctionPass {
+    IRFactory *IF = nullptr;
+    const DataLayout *DL = nullptr;
+    MDFactory *MF = nullptr;
+    IntegerType *Int8Ty = nullptr;
+    IntegerType *Int32Ty = nullptr;
+    IntegerType *Int64Ty = nullptr;
+    IntegerType *IntPtrTy = nullptr;
+    PointerType *Int8PtrTy = nullptr;
+    PointerType *Int32PtrTy = nullptr;
+    PointerType *Int64PtrTy = nullptr;
+    Type *FloatTy = nullptr;
+    Type *DoubleTy = nullptr;
+    IVec toErase;   /* instructions handed to ProcessErase() */
+
+public:
+    static char ID;
+    explicit CombineCasts() : FunctionPass(ID) {}  /* pointers stay null */
+    explicit CombineCasts(IRFactory *IF)
+        : FunctionPass(ID), IF(IF), DL(IF->getDL()), MF(IF->getMDFactory())
+    {
+        LLVMContext &Context = IF->getContext();
+        Int8Ty      = IntegerType::get(Context, 8);
+        Int32Ty     = IntegerType::get(Context, 32);
+        Int64Ty     = IntegerType::get(Context, 64);
+        IntPtrTy    = DL->getIntPtrType(Context);
+        Int8PtrTy   = Type::getInt8PtrTy(Context, 0);
+        Int32PtrTy  = Type::getInt32PtrTy(Context, 0);
+        Int64PtrTy  = Type::getInt64PtrTy(Context, 0);
+        FloatTy     = Type::getFloatTy(Context);
+        DoubleTy    = Type::getDoubleTy(Context);
+    }
+
+    /* Return the single user of I, or nullptr when I does not have exactly
+     * one use. */
+    Instruction *getUniqueUser(Instruction *I) {
+        return I->hasOneUse() ? I->user_back() : nullptr;
+    }
+
+    bool combineLoadCast(LoadInst *LI);
+    bool combineStoreCast(StoreInst *SI);
+    bool combineCastCast(Function &F);
+    bool simplifySignChange(Function &F);
+    bool runOnFunction(Function &F);
+};
+
+char CombineCasts::ID = 0;
+INITIALIZE_PASS(CombineCasts, "combinecast",
+        "Combine bitcast with guest memory loads/stores", false, false)
+/* Factory: returns a newly allocated CombineCasts pass bound to IF. */
+FunctionPass *llvm::createCombineCasts(IRFactory *IF) 
+{
+    return new CombineCasts(IF);
+}
+
+/* Return true iff every bitcast in IL casts between the same pair of types
+ * as IL[0], i.e. all source types match and all destination types match.
+ * IL must be non-empty because IL[0] is read unconditionally. */
+static bool hasSameCastingTy(ArrayRef<BitCastInst *> IL) {
+    BitCastInst *First = IL[0];
+    for (BitCastInst *BCI : IL) {
+        if (BCI->getSrcTy() != First->getSrcTy() ||
+            BCI->getDestTy() != First->getDestTy())
+            return false;
+    }
+    return true;
+}
+
+/* Retype a load so that the value is loaded directly in the type its bitcast
+ * users want.  The rewrite is attempted only if (1) the pointer operand is an
+ * instruction, (2) every user of the load is a bitcast, phi, load or store,
+ * and (3) there is at least one bitcast user and all of them cast to the
+ * same type.
+ *
+ * For example:
+ *  %0 = load <typeA>*              %0 = load <typeB>*
+ *  %1 = bitcast %0, <typeB>        %1 = bitcast %0, <typeA>
+ *
+ *  %2 = op <typeB> %1, ...    =>   %2 = op <typeB> %0, ...
+ *
+ *  store %0, <typeA>*              store %1, <typeA>*
+ *  store %1, <typeB>*              store %0, <typeB>*
+ */
+bool CombineCasts::combineLoadCast(LoadInst *LI)
+{
+    Instruction *Addr = dyn_cast<Instruction>(LI->getPointerOperand());
+    if (!Addr)
+        return false;
+
+    /* Gather the bitcast users; give up on any user kind not listed here. */
+    SmallVector<BitCastInst *, 4> Casts;
+    for (User *U : LI->users()) {
+        Instruction *UserInst = cast<Instruction>(U);
+        unsigned Opc = UserInst->getOpcode();
+        if (Opc == Instruction::BitCast)
+            Casts.push_back(cast<BitCastInst>(UserInst));
+        else if (Opc != Instruction::PHI && Opc != Instruction::Load &&
+                 Opc != Instruction::Store)
+            return false;
+    }
+    if (Casts.empty() || !hasSameCastingTy(Casts))
+        return false;
+
+    Type *OldTy = LI->getType();
+    Type *NewTy = Casts[0]->getDestTy();
+    unsigned Alignment = LI->getAlignment();
+    unsigned Volatile = LI->isVolatile();
+    Instruction *InsertPos = LI;
+
+    /* Rebuild the address with the new pointee type, right before LI. */
+    Type *NewPtrTy = PointerType::get(NewTy, LI->getPointerAddressSpace());
+    if (isa<IntToPtrInst>(Addr))
+        Addr = new IntToPtrInst(Addr->getOperand(0), NewPtrTy, "", InsertPos);
+    else
+        Addr = new BitCastInst(Addr, NewPtrTy, "", InsertPos);
+
+    /* Load in the new type and cast back for the users that still need the
+     * old one. */
+    Instruction *NewLI = new LoadInst(Addr, "", Volatile, Alignment, InsertPos);
+    Instruction *NewBCI = new BitCastInst(NewLI, OldTy, "", InsertPos);
+    if (MF->isGuestMemory(LI))
+        MF->setGuestMemory(NewLI);
+
+    /* Former bitcast users read the new load directly; everyone else reads
+     * the cast-back value. */
+    for (BitCastInst *BCI : Casts)
+        BCI->replaceAllUsesWith(NewLI);
+    LI->replaceAllUsesWith(NewBCI);
+
+    toErase.push_back(LI);
+    for (BitCastInst *BCI : Casts)
+        toErase.push_back(BCI);
+
+    return true;
+}
+
+/* Store a value in its pre-bitcast type.  If the stored value is a bitcast
+ * and the address is an instruction, a store of the bitcast's operand through
+ * a retyped pointer is emitted before SI and SI is queued for erasure.
+ *
+ * For example:
+ *  %0 = <typeA>
+ *  %1 = bitcast %0, <typeB>   =>   store %0, <typeA>*
+ *  store %1, <typeB>*
+ */
+bool CombineCasts::combineStoreCast(StoreInst *SI)
+{
+    Instruction *Addr = dyn_cast<Instruction>(SI->getPointerOperand());
+    BitCastInst *BCI = dyn_cast<BitCastInst>(SI->getValueOperand());
+    if (!Addr || !BCI)
+        return false;
+
+    Instruction *InsertPos = SI;
+    unsigned Alignment = SI->getAlignment();
+    unsigned Volatile = SI->isVolatile();
+    Value *Src = BCI->getOperand(0);
+
+    /* Retype the address so that it points at the source type. */
+    Type *NewPtrTy = PointerType::get(Src->getType(),
+                                      SI->getPointerAddressSpace());
+    if (isa<IntToPtrInst>(Addr))
+        Addr = new IntToPtrInst(Addr->getOperand(0), NewPtrTy, "", InsertPos);
+    else
+        Addr = new BitCastInst(Addr, NewPtrTy, "", InsertPos);
+
+    Instruction *NewSI = new StoreInst(Src, Addr, Volatile, Alignment,
+                                       InsertPos);
+    if (MF->isGuestMemory(SI))
+        MF->setGuestMemory(NewSI);
+
+    toErase.push_back(SI);
+    return true;
+}
+
+/* Collapse a bitcast of a bitcast into one cast of the original value (only
+ * the outer cast is queued for erasure here).  For example:
+ *  %0 = <typeA>                   %0 = <typeA>
+ *  %1 = bitcast %0, <typeB>  =>
+ *  %2 = bitcast %1, <typeC>       %2 = bitcast %0, <typeC>
+ *     = op <typeC> %2, ...           = op <typeC> %2, ...
+ *
+ * When <typeA> and <typeC> are identical no cast is emitted at all:
+ *  %0 = <typeA>                   %0 = <typeA>
+ *  %1 = bitcast %0, <typeB>  =>
+ *  %2 = bitcast %1, <typeC>
+ *     = op <typeC> %2, ...           = op <typeA> %0, ...
+ */
+bool CombineCasts::combineCastCast(Function &F)
+{
+    /* Snapshot the bitcasts first: the loop below inserts new ones. */
+    SmallVector<BitCastInst *, 4> Casts;
+    for (auto II = inst_begin(F), EE = inst_end(F); II != EE; ++II)
+        if (BitCastInst *BCI = dyn_cast<BitCastInst>(&*II))
+            Casts.push_back(BCI);
+
+    for (BitCastInst *Outer : Casts) {
+        BitCastInst *Inner = dyn_cast<BitCastInst>(Outer->getOperand(0));
+        if (!Inner)
+            continue;
+
+        /* Bypass the inner cast; no cast at all is needed when the outer
+         * one merely restores the original type. */
+        Value *Origin = Inner->getOperand(0);
+        Value *Result = Origin;
+        if (Origin->getType() != Outer->getType())
+            Result = new BitCastInst(Origin, Outer->getType(), "", Outer);
+        Outer->replaceAllUsesWith(Result);
+        toErase.push_back(Outer);
+    }
+
+    if (toErase.empty())
+        return false;
+
+    ProcessErase(toErase);
+    return true;
+}
+
+/* This function converts sign change of float/double data (i.e., -num),
+ * which is implemented with integer operations, to use float/double ops.
+ * The negation is emitted as (-0.0 - num), not (0 - num), so that the sign
+ * of a zero input is flipped too, exactly as the xor does.  For example:
+ *  %0 = bitcast float %num to i32
+ *  %1 = xor i32 %0, 0x80000000       =>    %0 = fsub float -0.0, %num
+ *  %2 = bitcast %1, float
+ */
+bool CombineCasts::simplifySignChange(Function &F)
+{
+    SmallVector<BitCastInst*, 16> Worklist;
+
+    for (auto II = inst_begin(F), EE = inst_end(F); II != EE; II++) {
+        Instruction *I = &*II;
+        if (BitCastInst *BCI = dyn_cast<BitCastInst>(I)) {
+            Type *SrcTy = BCI->getSrcTy();
+            Type *DstTy = BCI->getDestTy();
+            if ((SrcTy == FloatTy && DstTy == Int32Ty) ||
+                (SrcTy == DoubleTy && DstTy == Int64Ty))
+                Worklist.push_back(BCI);
+        }
+    }
+
+    for (auto I : Worklist) {
+        Type *Ty = I->getSrcTy();
+        Value *C = (Ty == FloatTy) ? CONST32(0x80000000)
+                                   : CONST64(0x8000000000000000LL);
+
+        /* Check whether the single user of this bitcast is Xor. */
+        Instruction *UI = getUniqueUser(I);
+        if (UI && UI->getOpcode() == Instruction::Xor && UI->getOperand(1) == C) {
+            /* Check whether the single user of this Xor is a bitcast
+             * instruction that casts the type back to the src type. */
+            Instruction *UUI = getUniqueUser(UI);
+            if (UUI && UUI->getOpcode() == Instruction::BitCast &&
+                cast<BitCastInst>(UUI)->getDestTy() == Ty) {
+                Value *NegZero = ConstantFP::getNegativeZero(Ty);
+                Value *V = BinaryOperator::Create(Instruction::FSub, NegZero,
+                                                  I->getOperand(0), "", I);
+                UUI->replaceAllUsesWith(V);
+                toErase.push_back(UUI);
+            }
+        }
+    }
+
+    if (toErase.empty())
+        return false;
+
+    ProcessErase(toErase);
+    return true;
+}
+
+/* Run the four rewrites in order: load/cast and store/cast combining on the
+ * collected accesses, then cast-chain folding and sign-change simplification
+ * over the whole function.  Returns true if any of them changed F. */
+bool CombineCasts::runOnFunction(Function &F)
+{
+    SmallVector<LoadInst *, 16> Loads;
+    SmallVector<StoreInst *, 16> Stores;
+
+    /* Candidates: accesses tagged as guest memory, or non-volatile ones. */
+    for (auto II = inst_begin(F), EE = inst_end(F); II != EE; ++II) {
+        Instruction *Inst = &*II;
+        if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+            if (MF->isGuestMemory(LI) || !LI->isVolatile())
+                Loads.push_back(LI);
+        } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+            if (MF->isGuestMemory(SI) || !SI->isVolatile())
+                Stores.push_back(SI);
+        }
+    }
+
+    bool Modified = false;
+    for (LoadInst *LI : Loads)
+        Modified |= combineLoadCast(LI);
+    for (StoreInst *SI : Stores)
+        Modified |= combineStoreCast(SI);
+    if (!toErase.empty())
+        ProcessErase(toErase);
+
+    Modified |= combineCastCast(F);
+    Modified |= simplifySignChange(F);
+    return Modified;
+}
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
+
diff --git a/llvm/pass/CombineGuestMemory.cpp b/llvm/pass/CombineGuestMemory.cpp
new file mode 100644
index 0000000..0740a8b
--- /dev/null
+++ b/llvm/pass/CombineGuestMemory.cpp
@@ -0,0 +1,389 @@
+/*
+ *  (C) 2015 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include "llvm-debug.h"
+#include "llvm-opc.h"
+#include "llvm-target.h"
+#include "llvm-pass.h"
+#include "utils.h"
+
+#define PASS_NAME "CombineGuestMemory"
+
+/*
+ * CombineGuestMemory Pass: combines pairs of adjacent guest memory accesses.
+ */
+class CombineGuestMemory : public FunctionPass {
+    /* Base pointer, offset and total size of a pair of adjacent accesses. */
+    struct StateInfo {
+        StateInfo() : Ptr(nullptr) {}
+        StateInfo(Value *ptr, APInt &offset, APInt &size)
+            : Ptr(ptr), Offset(offset), Size(size) {}
+        Value *Ptr;
+        APInt Offset;
+        APInt Size;
+    };
+
+    typedef std::pair<Value *, Value *> ValuePair;
+    typedef std::map<size_t, size_t> StateMap;
+    typedef DenseMap<ValuePair, StateInfo> CSMap;
+
+    IRFactory *IF = nullptr;
+    const DataLayout *DL = nullptr;
+    MDFactory *MF = nullptr;
+    IntegerType *Int8Ty = nullptr;
+    IntegerType *Int32Ty = nullptr;
+    IntegerType *Int64Ty = nullptr;
+    IntegerType *IntPtrTy = nullptr;
+    PointerType *Int8PtrTy = nullptr;
+    PointerType *Int32PtrTy = nullptr;
+    PointerType *Int64PtrTy = nullptr;
+    Value *CPU = nullptr;                 /* set by runOnFunction() */
+    Value *GuestBase = nullptr;
+    Instruction *InitLastInst = nullptr;  /* set by runOnFunction() */
+    StateMap LegalStates;   /* state offset -> size, see addLegalStates() */
+    IVec toErase;
+
+public:
+    static char ID;
+    explicit CombineGuestMemory() : FunctionPass(ID) {}
+    explicit CombineGuestMemory(IRFactory *IF)
+        : FunctionPass(ID), IF(IF), DL(IF->getDL()), MF(IF->getMDFactory())
+    {
+        LLVMContext &Context = IF->getContext();
+        Int8Ty      = IntegerType::get(Context, 8);
+        Int32Ty     = IntegerType::get(Context, 32);
+        Int64Ty     = IntegerType::get(Context, 64);
+        IntPtrTy    = DL->getIntPtrType(Context);
+        Int8PtrTy   = Type::getInt8PtrTy(Context, 0);
+        Int32PtrTy  = Type::getInt32PtrTy(Context, 0);
+        Int64PtrTy  = Type::getInt64PtrTy(Context, 0);
+
+        GuestBase = IF->getGuestBase();
+
+        addLegalStates();
+    }
+
+    unsigned getAddressSpaceOperand(Value *I) {
+        if (LoadInst *LI = dyn_cast<LoadInst>(I))
+            return LI->getPointerAddressSpace();
+        if (StoreInst *SI = dyn_cast<StoreInst>(I))
+            return SI->getPointerAddressSpace();
+        return -1U;     /* neither a load nor a store */
+    }
+
+    int getNumUsers(Instruction *I) {
+        return distance(I->user_begin(), I->user_end());
+    }
+
+    void addLegalStates();
+    bool isLegalState(Value *Ptr, APInt &Offset, APInt &Size);
+    bool isConsecutiveAccess(Value *A, Value *B, Value *&Ptr, APInt &Offset, APInt &Size);
+    bool tryCombineLoad(Value *A, Value *B, CSMap &States);
+    bool tryCombineStore(Value *A, Value *B, CSMap &States);
+    bool combineMemory(SmallVector<Value *, 8> &Memory, SmallVector<Value *, 8> &States);
+    bool runOnFunction(Function &F);
+};
+
+char CombineGuestMemory::ID = 0;
+INITIALIZE_PASS(CombineGuestMemory, "combinegm",
+        "Combine guest memory loads and stores", false, false)
+/* Factory: returns a newly allocated CombineGuestMemory pass bound to IF. */
+FunctionPass *llvm::createCombineGuestMemory(IRFactory *IF) 
+{
+    return new CombineGuestMemory(IF);
+}
+
+/* Record in LegalStates the offset and size of each combinable CPU state. */
+void CombineGuestMemory::addLegalStates()
+{
+#if defined(TARGET_I386)
+    const size_t Base = offsetof(CPUArchState, xmm_regs[0]);
+    const size_t Stride = sizeof(XMMReg);
+    for (int Reg = 0; Reg < CPU_NB_REGS; ++Reg)
+        LegalStates[Base + Stride * Reg] = Stride;
+#elif defined(TARGET_ARM)
+    const size_t Base = offsetof(CPUArchState, vfp.regs[0]);
+    const size_t Stride = sizeof(float64) * 2;
+    for (int Reg = 0; Reg < 32; ++Reg)
+        LegalStates[Base + Stride * Reg] = Stride;
+#endif
+}
+
+/* Return true if B accesses the bytes right after A: same address space,
+ * same store size, same stripped base, offsets differing by that size.  On
+ * success Ptr/Offset give A's base/offset and Size is twice the store size. */
+bool CombineGuestMemory::isConsecutiveAccess(Value *A, Value *B, Value *&Ptr,
+                                             APInt &Offset, APInt &Size)
+{
+    Value *PtrA = getPointerOperand(A);
+    Value *PtrB = getPointerOperand(B);
+    unsigned ASA = getAddressSpaceOperand(A);
+    unsigned ASB = getAddressSpaceOperand(B);
+    if (!PtrA || !PtrB || (ASA != ASB))
+        return false;
+
+    Type *TyA = cast<PointerType>(PtrA->getType())->getElementType();
+    Type *TyB = cast<PointerType>(PtrB->getType())->getElementType();
+    if (DL->getTypeStoreSize(TyA) != DL->getTypeStoreSize(TyB))
+        return false;
+    /* NOTE(review): width is TyA's bit size, not a pointer width -- confirm. */
+    unsigned PtrBitWidth = DL->getTypeSizeInBits(TyA);
+    APInt Sz(PtrBitWidth, DL->getTypeStoreSize(TyA));
+    APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
+    PtrA = StripPointerWithConstantOffset(DL, PtrA, OffsetA, GuestBase);
+    PtrB = StripPointerWithConstantOffset(DL, PtrB, OffsetB, GuestBase);
+
+    APInt OffsetDelta = OffsetB - OffsetA;
+    if (PtrA == PtrB && OffsetDelta == Sz) {
+        Ptr = PtrA;
+        Offset = OffsetA;
+        Size = Sz + Sz;
+        return true;
+    }
+    return false;
+}
+
+/* A combined access is legal only when it is based on the CPU pointer,
+ * starts exactly at an offset registered in LegalStates, and is no larger
+ * than the size registered for that offset. */
+bool CombineGuestMemory::isLegalState(Value *Ptr, APInt &Offset, APInt &Size)
+{
+    if (Ptr != CPU)
+        return false;
+    auto It = LegalStates.find(Offset.getZExtValue());
+    return It != LegalStates.end() && Size.getZExtValue() <= It->second;
+}
+
+/* Scan the instruction ranges [SA, EA) and [SB, EB) and report whether
+ * merging the four given accesses could be unsafe.  A range is considered
+ * violated when it contains a call, or a load/store other than SA, SB, EA
+ * and EB themselves.  Each start must precede its end in the same basic
+ * block, since the scan simply walks forward until it reaches the end. */
+static bool hasMemoryViolation(Instruction *SA, Instruction *SB,
+                               Instruction *EA, Instruction *EB)
+{
+    std::set<Value*> Allowed;
+    Allowed.insert(SA);
+    Allowed.insert(SB);
+    Allowed.insert(EA);
+    Allowed.insert(EB);
+
+    /* True if [Begin, End) holds a call or a foreign memory access. */
+    auto Violates = [&Allowed](Instruction *Begin, Instruction *End) -> bool {
+        BasicBlock::iterator It = BasicBlock::iterator(Begin);
+        BasicBlock::iterator Stop = BasicBlock::iterator(End);
+        for (; It != Stop; ++It) {
+            Instruction *Cur = &*It;
+            if (isa<CallInst>(Cur))
+                return true;
+            bool IsMemOp = isa<LoadInst>(Cur) || isa<StoreInst>(Cur);
+            if (IsMemOp && Allowed.count(Cur) == 0)
+                return true;
+        }
+        return false;
+    };
+
+    /* The second range is only scanned when the first one is clean. */
+    if (Violates(SA, EA))
+        return true;
+    return Violates(SB, EB);
+}
+
+/* Try to replace the guest loads A and B, and their single users (expected
+ * to be the stores of a recorded consecutive CPU state pair), by one vector
+ * load and one vector store.  Returns true if the pair was combined. */
+bool CombineGuestMemory::tryCombineLoad(Value *A, Value *B, CSMap &States)
+{
+    LoadInst *LA = cast<LoadInst>(A);
+    LoadInst *LB = cast<LoadInst>(B);
+    /* Each load must have exactly one user. */
+    if (getNumUsers(LA) != 1 || getNumUsers(LB) != 1)
+        return false;
+
+    Value *VA = *LA->user_begin();
+    Value *VB = *LB->user_begin();
+    CSMap::iterator CSI = States.find(ValuePair(VA, VB));
+    if (CSI == States.end())
+        return false;
+
+    Instruction *SA = cast<Instruction>(VA);
+    Instruction *SB = cast<Instruction>(VB);
+    if (hasMemoryViolation(LA, LB, SA, SB))
+        return false;
+
+    /* No call or other load/store sits between each load and its user, so
+     * emit one volatile <Size x i8> guest load in front of SA. */
+    Instruction *InsertPos = SA;
+    StateInfo &SI = CSI->second;
+    uint64_t Size = SI.Size.getZExtValue();
+    unsigned AS = getAddressSpaceOperand(LA);
+    unsigned Align = Size / 2;
+    Type *Ty = PointerType::get(VectorType::get(Int8Ty, Size), AS);
+    Instruction *Ptr = cast<Instruction>(LA->getPointerOperand());
+    if (isa<IntToPtrInst>(Ptr))
+        Ptr = new IntToPtrInst(Ptr->getOperand(0), Ty, "", InsertPos);
+    else
+        Ptr = new BitCastInst(Ptr, Ty, "", InsertPos);
+    Instruction *NewLI = new LoadInst(Ptr, "", true, Align, InsertPos);
+    MF->setGuestMemory(NewLI);
+
+    /* State address is built before InitLastInst; the store goes before SA. */
+    Ty = PointerType::getUnqual(VectorType::get(Int8Ty, Size));
+    Value *Offset = ConstantInt::get(Ty->getContext(), SI.Offset);
+    Ptr = GetElementPtrInst::CreateInBounds(CPU, Offset, "", InitLastInst);
+    Ptr = new BitCastInst(Ptr, Ty, "", InitLastInst);
+    new StoreInst(NewLI, Ptr, false, InsertPos);
+
+    States.erase(CSI);
+    toErase.push_back(SA);
+    toErase.push_back(SB);
+    return true;
+}
+
+/* Try to replace the guest stores A and B, whose stored values must be a
+ * recorded consecutive CPU state pair with one user each, by one vector load
+ * and one vector store.  Returns true if the pair was combined. */
+bool CombineGuestMemory::tryCombineStore(Value *A, Value *B, CSMap &States)
+{
+    StoreInst *SA = cast<StoreInst>(A);
+    StoreInst *SB = cast<StoreInst>(B);
+    Instruction *LA = dyn_cast<Instruction>(SA->getOperand(0));
+    Instruction *LB = dyn_cast<Instruction>(SB->getOperand(0));
+    if (!LA || !LB)
+        return false;
+    if (getNumUsers(LA) != 1 || getNumUsers(LB) != 1)
+        return false;
+
+    CSMap::iterator CSI = States.find(ValuePair(LA, LB));
+    if (CSI == States.end())
+        return false;
+
+    if (hasMemoryViolation(LA, LB, SA, SB))
+        return false;
+
+    /* No call or other load/store sits between each state access and its
+     * store: load the state as one <Size x i8> vector in front of SA. */
+    Instruction *InsertPos = SA;
+    StateInfo &SI = CSI->second;
+    uint64_t Size = SI.Size.getZExtValue();
+    Type *Ty = PointerType::getUnqual(VectorType::get(Int8Ty, Size));
+    Value *Offset = ConstantInt::get(Ty->getContext(), SI.Offset);
+    Instruction *Ptr = GetElementPtrInst::CreateInBounds(CPU, Offset, "", InitLastInst);
+    Ptr = new BitCastInst(Ptr, Ty, "", InitLastInst);
+    Value *V = new LoadInst(Ptr, "", false, InsertPos);
+
+    /* One volatile guest store of that vector, also emitted before SA. */
+    unsigned AS = getAddressSpaceOperand(SA);
+    unsigned Align = Size / 2;
+    Ty = PointerType::get(VectorType::get(Int8Ty, Size), AS);
+    Ptr = cast<Instruction>(SA->getPointerOperand());
+    if (isa<IntToPtrInst>(Ptr))
+        Ptr = new IntToPtrInst(Ptr->getOperand(0), Ty, "", InsertPos);
+    else
+        Ptr = new BitCastInst(Ptr, Ty, "", InsertPos);
+    Instruction *NewSI = new StoreInst(V, Ptr, true, Align, InsertPos);
+    MF->setGuestMemory(NewSI);
+
+    States.erase(CSI);
+    toErase.push_back(SA);
+    toErase.push_back(SB);
+    return true;
+}
+
+/* Combine adjacent guest accesses from Memory when the CPU state accesses
+ * they feed (or are fed by) are adjacent entries of States as well.  Both
+ * vectors come from one basic block.  Returns true if a pair was combined. */
+bool CombineGuestMemory::combineMemory(SmallVector<Value *, 8> &Memory,
+                                       SmallVector<Value *, 8> &States)
+{
+    bool Changed = false;
+    SmallPtrSet<Value *, 4> Used;
+    CSMap ConsecutiveStates;
+    Value *Ptr = nullptr;
+    APInt Offset, Size;
+
+    /* Find consecutive CPU states.  `i < e` (not `i != e`) keeps the scan
+     * empty, rather than running off the end, for an empty vector. */
+    for (unsigned i = 1, e = States.size(); i < e; i++) {
+        if (!isConsecutiveAccess(States[i-1], States[i], Ptr, Offset, Size))
+            continue;
+        if (!isLegalState(Ptr, Offset, Size))
+            continue;
+        ConsecutiveStates[ValuePair(States[i-1], States[i])] =
+            StateInfo(Ptr, Offset, Size);
+    }
+    if (ConsecutiveStates.size() == 0)
+        return false;
+
+    /* Find and combine consecutive guest memory accesses if their referenced
+     * CPU states are also consecutive. */
+    for (unsigned i = 1, e = Memory.size(); i < e; i++) {
+        if (Used.count(Memory[i-1]) || Used.count(Memory[i]))
+            continue;
+        if (!isConsecutiveAccess(Memory[i-1], Memory[i], Ptr, Offset, Size))
+            continue;
+        bool ret = false;
+        if (isa<LoadInst>(Memory[i-1]) && isa<LoadInst>(Memory[i])) {
+            ret = tryCombineLoad(Memory[i-1], Memory[i], ConsecutiveStates);
+        } else if (isa<StoreInst>(Memory[i-1]) && isa<StoreInst>(Memory[i])) {
+            ret = tryCombineStore(Memory[i-1], Memory[i], ConsecutiveStates);
+        }
+        if (ret) {
+            Used.insert(Memory[i-1]);
+            Used.insert(Memory[i]);
+            Changed = true;
+        }
+    }
+    return Changed;
+}
+
+/* Combine pairs of guest memory accesses block by block.  Does nothing (and
+ * returns false) in softmmu builds, when no legal state is registered, or
+ * when the CPU pointer cannot be found.  Returns true if F was changed. */
+bool CombineGuestMemory::runOnFunction(Function &F)
+{
+#if defined(CONFIG_SOFTMMU)
+    return false;
+#endif
+
+    /* Nothing to do unless some state may be combined. */
+    if (LegalStates.empty())
+        return false;
+
+    CPU = IF->getDefaultCPU(F);
+    if (!CPU) {
+        dbg() << DEBUG_PASS << "CombineGuestMemory: Cannot find CPU pointer.\n";
+        return false;
+    }
+    InitLastInst = F.getEntryBlock().getTerminator();
+
+    bool Modified = false;
+    for (BasicBlock &BB : F) {
+        SmallVector<Value *, 8> Memory;
+        SmallVector<Value *, 8> States;
+        for (Instruction &Inst : BB) {
+            Instruction *I = &Inst;
+            if (MF->isGuestMemory(I)) {
+                Memory.push_back(I);
+            } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+                if (!LI->isVolatile())
+                    States.push_back(LI);
+            } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+                if (!SI->isVolatile())
+                    States.push_back(SI);
+            }
+        }
+        if (Memory.size() >= 2 && States.size() >= 2)
+            Modified |= combineMemory(Memory, States);
+    }
+
+    if (!toErase.empty())
+        ProcessErase(toErase);
+    return Modified;
+}
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
+
diff --git a/llvm/pass/CombineZExtTrunc.cpp b/llvm/pass/CombineZExtTrunc.cpp
new file mode 100644
index 0000000..de9a87f
--- /dev/null
+++ b/llvm/pass/CombineZExtTrunc.cpp
@@ -0,0 +1,70 @@
+/*
+ *  (C) 2015 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm-target.h"
+#include "llvm-opc.h"
+#include "llvm-pass.h"
+#include "utils.h"
+
+#define PASS_NAME "CombineZExtTrunc"
+
+/*
+ * CombineZExtTrunc Pass: replaces a trunc of a zext by the zext's operand.
+ */
+class CombineZExtTrunc : public FunctionPass {
+public:
+    static char ID;
+    explicit CombineZExtTrunc() : FunctionPass(ID) {}
+    bool runOnFunction(Function &F);
+};
+
+char CombineZExtTrunc::ID = 0;
+INITIALIZE_PASS(CombineZExtTrunc, "combinezet",
+        "Combine ZExt followed by Trunc", false, false)
+/* Factory: returns a newly allocated CombineZExtTrunc pass. */
+FunctionPass *llvm::createCombineZExtTrunc()
+{
+    return new CombineZExtTrunc;
+}
+
+/* Replace every `trunc (zext X)` whose result type equals the type of X by X
+ * itself and queue the trunc for erasure.  The zext is left untouched.
+ * Returns true if at least one trunc was replaced. */
+bool CombineZExtTrunc::runOnFunction(Function &F)
+{
+    /* Collect the truncs up front, then rewrite them. */
+    SmallVector<TruncInst *, 4> Truncs;
+    for (auto II = inst_begin(F), EE = inst_end(F); II != EE; ++II)
+        if (TruncInst *TI = dyn_cast<TruncInst>(&*II))
+            Truncs.push_back(TI);
+
+    bool Modified = false;
+    IVec toErase;
+    for (TruncInst *TI : Truncs) {
+        ZExtInst *ZI = dyn_cast<ZExtInst>(TI->getOperand(0));
+        if (!ZI)
+            continue;
+
+        Value *Narrow = ZI->getOperand(0);
+        if (Narrow->getType() != TI->getType())
+            continue;
+
+        TI->replaceAllUsesWith(Narrow);
+        if (TI->use_empty())
+            toErase.push_back(TI);
+        Modified = true;
+    }
+
+    if (!toErase.empty())
+        ProcessErase(toErase);
+
+    return Modified;
+}
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
+
diff --git a/llvm/pass/FastMathPass.cpp b/llvm/pass/FastMathPass.cpp
new file mode 100644
index 0000000..2b6a592
--- /dev/null
+++ b/llvm/pass/FastMathPass.cpp
@@ -0,0 +1,87 @@
+/*
+ *  (C) 2010 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm-target.h"
+#include "llvm-pass.h"
+#include "fpu/softfloat-native-def.h"
+
+#define PASS_DEBUG "FastMathPass"
+
+class FastMathPass : public FunctionPass {
+public:
+    static char ID;
+    /* Key: softfloat helper name; value: matching native helper name. */
+    std::map<std::string, std::string> FPUNameMap;
+
+    explicit FastMathPass() : FunctionPass(ID)
+    {
+        TCGHelperInfo *Helpers = (TCGHelperInfo *)get_native_fpu_helpers();
+        for (int Idx = 0, End = num_native_fpu_helpers(); Idx != End; ++Idx) {
+            /* The key is the native name minus its first 5 characters,
+             * ex: llvm_int32_to_float32 --> int32_to_float32 */
+            const char *Native = Helpers[Idx].name;
+            FPUNameMap[Native + 5] = Native;
+        }
+    }
+    bool runOnFunction(Function &F);
+};
+
+/* Redirect every call to a softfloat helper listed in FPUNameMap to its
+ * native counterpart and inline it.  Returns true if a call was rewritten. */
+bool FastMathPass::runOnFunction(Function &F)
+{
+    IVec toErase;
+    SmallVector<CallInst *, 16> InlineCalls;
+    Module *Mod = F.getParent();
+
+    for (auto I = inst_begin(F), E = inst_end(F); I != E; ++I) {
+        CallInst *CI = dyn_cast<CallInst>(&*I);
+        if (!CI || CI->isInlineAsm())
+            continue;
+        Function *Callee = CI->getCalledFunction();
+        if (!Callee || Callee->isIntrinsic())
+            continue;
+        auto It = FPUNameMap.find(Callee->getName().str());
+        if (It == FPUNameMap.end())
+            continue;
+
+        /* Leave the call alone if its native version is not in the module. */
+        Function *Fn = Mod->getFunction(It->second);
+        if (!Fn)
+            continue;
+        /* Only the leading arguments the native function declares are kept. */
+        unsigned NumArgs = Fn->getFunctionType()->getNumParams();
+        assert(NumArgs <= CI->getNumArgOperands());
+        SmallVector<Value *, 4> Params;
+        for (unsigned i = 0; i != NumArgs; ++i)
+            Params.push_back(CI->getArgOperand(i));
+
+        CallInst *NewCI = CallInst::Create(Fn, Params, "", CI);
+        CI->replaceAllUsesWith(NewCI);
+        InlineCalls.push_back(NewCI);
+        toErase.push_back(CI);
+    }
+
+    bool Changed = !InlineCalls.empty();
+    ProcessErase(toErase);
+    while (!InlineCalls.empty())
+        InlineFunc(InlineCalls.pop_back_val());
+    return Changed;
+}
+
+char FastMathPass::ID = 0;
+INITIALIZE_PASS(FastMathPass, "fastmath",
+        "Transform softfloat subroutines to native FP operations", false, false)
+/* Factory: returns a newly allocated FastMathPass. */
+FunctionPass *llvm::createFastMathPass()
+{
+    return new FastMathPass();
+}
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
+
diff --git a/llvm/pass/ProfileExec.cpp b/llvm/pass/ProfileExec.cpp
new file mode 100644
index 0000000..56a68e1
--- /dev/null
+++ b/llvm/pass/ProfileExec.cpp
@@ -0,0 +1,172 @@
+/*
+ *  (C) 2010 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm-debug.h"
+#include "llvm-soft-perfmon.h"
+#include "llvm-pass.h"
+#include "llvm-opc.h"
+#include "llvm.h"
+#include "utils.h"
+
+#define PASS_NAME  "ProfileExec"
+
+extern LLVMEnv *LLEnv;
+
+/*
+ * Profile Pass: instruments a trace with execution-count profiling code.
+ */
+class ProfileExec : public FunctionPass {
+    enum {  /* profiling point kinds, passed to helper_profile_exec */
+        IDX_LOOP = 0,
+        IDX_EXIT,
+        IDX_INBR,
+    };
+
+    IRFactory *IF = nullptr;
+    const DataLayout *DL = nullptr;
+    MDFactory *MF = nullptr;
+    IntegerType *Int8Ty = nullptr;
+    IntegerType *Int32Ty = nullptr;
+    IntegerType *Int64Ty = nullptr;
+    IntegerType *IntPtrTy = nullptr;
+    PointerType *Int8PtrTy = nullptr;
+    PointerType *Int32PtrTy = nullptr;
+    PointerType *Int64PtrTy = nullptr;
+
+public:
+    static char ID;
+    explicit ProfileExec() : FunctionPass(ID) {}   /* pointers stay null */
+    explicit ProfileExec(IRFactory *IF)
+        : FunctionPass(ID), IF(IF), DL(IF->getDL()), MF(IF->getMDFactory())
+    {
+        LLVMContext &Context = IF->getContext();
+        Int8Ty      = IntegerType::get(Context, 8);
+        Int32Ty     = IntegerType::get(Context, 32);
+        Int64Ty     = IntegerType::get(Context, 64);
+        IntPtrTy    = DL->getIntPtrType(Context);
+        Int8PtrTy   = Type::getInt8PtrTy(Context, 0);
+        Int32PtrTy  = Type::getInt32PtrTy(Context, 0);
+        Int64PtrTy  = Type::getInt64PtrTy(Context, 0);
+    }
+    bool runOnFunction(Function &F);
+
+    Instruction *getInsertPos(BasicBlock *BB) {
+        if (BB == &BB->getParent()->getEntryBlock())
+            return &*++BB->begin();     /* entry block: its 2nd instruction */
+        return BB->getFirstNonPHI();    /* other blocks: first non-phi */
+    }
+};
+
+char ProfileExec::ID = 0;
+INITIALIZE_PASS(ProfileExec, "profile", "Profile trace execution", false, false)
+/* Factory: returns a newly allocated ProfileExec pass bound to IF. */
+FunctionPass *llvm::createProfileExec(IRFactory *IF) 
+{
+    return new ProfileExec(IF);
+}
+
+/* Instrument F for the software perfmon (exit counter for SPM_EXIT, calls to
+ * helper_profile_exec for SPM_TRACE).  Returns false if nothing was inserted. */
+bool ProfileExec::runOnFunction(Function &F)
+{
+    if (!LLEnv->isTraceMode() || !SP->isEnabled())
+        return false;
+
+    Instruction *CPU = IF->getDefaultCPU(F);
+    if (!CPU) {
+        dbg() << DEBUG_PASS << PASS_NAME << ": Cannot find CPU pointer.\n";
+        return false;
+    }
+
+    TraceInfo *Trace = IF->getTrace();
+    for (auto FI = F.begin(), FE = F.end(); FI != FE; FI++) {
+        BasicBlock *BB = &*FI;
+        if (distance(succ_begin(BB), succ_end(BB)) != 0)
+            continue;
+        /* Find exit points and indirect branches. */
+        Trace->NumExit++;
+        if (isa<IndirectBrInst>(BB->getTerminator()))
+            Trace->NumIndirectBr++;
+    }
+
+    /* Insert code to profile trace exit counts. */
+    if (SP->Mode & SPM_EXIT) {
+        Instruction *InsertPos = &*++BasicBlock::iterator(CPU);
+        Value *NumExitPtr = GetElementPtrInst::CreateInBounds(CPU,
+                            CONSTPtr(offsetof(CPUArchState, num_trace_exits)),
+                            "", InsertPos);
+        NumExitPtr = new BitCastInst(NumExitPtr, Int64PtrTy, "", InsertPos);
+        Instruction *NumExits = new LoadInst(NumExitPtr, "", true, InsertPos);
+        NumExits = BinaryOperator::Create(Instruction::Add, NumExits,
+                                          CONST64(1), "", InsertPos);
+        new StoreInst(NumExits, NumExitPtr, true, InsertPos);
+    }
+
+    /* Without SPM_TRACE nothing more is inserted, but F was already modified
+     * if the exit counter was emitted above; report that to the caller. */
+    if (!(SP->Mode & SPM_TRACE))
+        return (SP->Mode & SPM_EXIT) != 0;
+
+    SmallVector<CallInst*, 16> InlineCalls;
+    Function *Helper = IF->ResolveFunction("helper_profile_exec");
+
+    /* Prepare counter structures. */
+    if (!Trace->ExecCount) {
+        Trace->ExecCount = new uint64_t *[MAX_SPM_THREADS];
+        for (int i = 0; i < MAX_SPM_THREADS; i++)
+            Trace->ExecCount[i] = new uint64_t[3] {0, 0, 0};
+    }
+
+    /* Find all profiling point. */
+    std::vector<std::pair<Instruction *, int> > ProfilePoint;
+
+    SmallVector<std::pair<const BasicBlock*,const BasicBlock*>, 32> BackEdges;
+    FindFunctionBackedges(F, BackEdges);
+    for (unsigned i = 0, e = BackEdges.size(); i != e; ++i) {
+        auto BackEdgeBB = const_cast<BasicBlock*>(BackEdges[i].first);
+        ProfilePoint.push_back(std::make_pair(BackEdgeBB->getTerminator(), IDX_LOOP));
+    }
+
+    for (auto FI = F.begin(), FE = F.end(); FI != FE; FI++) {
+        BasicBlock *BB = &*FI;
+        if (distance(succ_begin(BB), succ_end(BB)) != 0)
+            continue;
+        bool isIndirectBr = isa<IndirectBrInst>(BB->getTerminator());
+        ProfilePoint.push_back(std::make_pair(getInsertPos(BB),
+                                    isIndirectBr ? IDX_INBR : IDX_EXIT));
+    }
+
+    /* Insert profiling routines. */
+    for (unsigned i = 0, e = ProfilePoint.size(); i != e; ++i) {
+        Instruction *InsertPos = ProfilePoint[i].first;
+        Value *Ty = CONST32(ProfilePoint[i].second);
+
+        Value *Counter = ConstantExpr::getIntToPtr(
+                            CONSTPtr((uintptr_t)Trace->ExecCount),
+                            PointerType::getUnqual(Int8Ty));
+
+        SmallVector<Value *, 4> Params;
+        Type *ParamTy = Helper->getFunctionType()->getParamType(0);
+        Value *Env = new BitCastInst(CPU, ParamTy, "", InsertPos);
+        Params.push_back(Env);
+        Params.push_back(Counter);
+        Params.push_back(Ty);
+
+        CallInst *CI = CallInst::Create(Helper, Params, "", InsertPos);
+        MF->setConst(CI);
+        InlineCalls.push_back(CI);
+    }
+
+    while (!InlineCalls.empty())
+        InlineFunc(InlineCalls.pop_back_val());
+
+    return true;
+}
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
+
diff --git a/llvm/pass/RedundantStateElimination.cpp b/llvm/pass/RedundantStateElimination.cpp
new file mode 100644
index 0000000..2e5f715
--- /dev/null
+++ b/llvm/pass/RedundantStateElimination.cpp
@@ -0,0 +1,179 @@
+/*
+ *  (C) 2017 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include "llvm-debug.h"
+#include "llvm-opc.h"
+#include "llvm-target.h"
+#include "llvm-pass.h"
+#include "utils.h"
+
+#define PASS_NAME "RedundantStateElimination"
+
+/*
+ * The RedundantStateElimination pass aims to remove
+ * (1) redundant stores to PC, and
+ * (2) redundant loads and stores enclosed by two helper function calls.
+ */
+class RedundantStateElimination : public FunctionPass {
+    /* Null until set, so a default-constructed pass never holds garbage. */
+    IRFactory *IF = nullptr;        /* Uplink to the IRFactory */
+    MDFactory *MF = nullptr;        /* Metadata factory obtained from IF */
+    const DataLayout *DL = nullptr; /* Data layout obtained from IF */
+    Value *CPU = nullptr;           /* CPU pointer; set by runOnFunction() */
+    IVec toErase;                   /* Instructions queued for removal */
+public:
+    static char ID;
+    explicit RedundantStateElimination() : FunctionPass(ID) {}
+    explicit RedundantStateElimination(IRFactory *IF)
+        : FunctionPass(ID), IF(IF), MF(IF->getMDFactory()), DL(IF->getDL()) {}
+
+    /* Return the number of users of instruction I. */
+    int getNumUsers(Instruction *I) {
+        return distance(I->user_begin(), I->user_end());
+    }
+
+    /* True if Ptr is CPU plus an offset IRFactory::isStateOfPC() accepts. */
+    bool isStateOfPC(Value *Ptr) {
+        intptr_t Off = 0;
+        Value *Base = getBaseWithConstantOffset(DL, Ptr, Off);
+        return Base == CPU && IRFactory::isStateOfPC(Off);
+    }
+
+    /* True if SI follows LI in its block with no non-const call between. */
+    bool isDirectDominator(LoadInst *LI, StoreInst *SI) {
+        Instruction *A = LI, *B = SI;
+        if (A->getParent() != B->getParent())
+            return false;
+        for (auto II = BasicBlock::iterator(A), EE = A->getParent()->end();
+             II != EE; ++II) {
+            if (&*II == B)
+                return true;
+            /* A non-const helper call in between may have side effects, so
+             * the load no longer directly dominates the store. */
+            auto CI = dyn_cast<CallInst>(II);
+            if (CI && !MDFactory::isConst(CI))
+                return false;
+        }
+        return false;
+    }
+
+    bool removePCState(Function &F);
+    bool removeHelperState(Function &F);
+    bool runOnFunction(Function &F);
+};
+
+char RedundantStateElimination::ID = 0; /* Pass identification. */
+INITIALIZE_PASS(RedundantStateElimination, "rse",
+        "Eliminate redundant CPU state loads/stores", false, false)
+
+FunctionPass *llvm::createRedundantStateElimination(IRFactory *IF)
+{   /* Hand out a newly allocated pass constructed with IF. */
+    return new RedundantStateElimination(IF);
+}
+
+/* Drop every PC store that a later PC store in the same basic block
+ * overwrites before a PC load is met. Returns true if a store was queued. */
+bool RedundantStateElimination::removePCState(Function &F)
+{
+    for (BasicBlock &BB : F) {
+        /* True while a later PC store shadows the stores above it. */
+        bool Shadowed = false;
+
+        /* Scan bottom-up so that the last store is met first and kept. */
+        for (auto RI = BB.rbegin(), RE = BB.rend(); RI != RE; ++RI) {
+            Instruction *Inst = &*RI;
+            if (MF->isGuestMemory(Inst))
+                continue;
+            if (auto *Store = dyn_cast<StoreInst>(Inst)) {
+                if (!isStateOfPC(getPointerOperand(Store)))
+                    continue;
+                if (Shadowed)
+                    toErase.push_back(Store);
+                Shadowed = true;
+            } else if (auto *Load = dyn_cast<LoadInst>(Inst)) {
+                /* A PC load needs the store above it to survive. */
+                if (isStateOfPC(getPointerOperand(Load)))
+                    Shadowed = false;
+            }
+        }
+    }
+
+    bool Changed = !toErase.empty();
+    if (Changed)
+        ProcessErase(toErase);
+    return Changed;
+}
+
+/* Eliminate redundant loads/stores enclosed by two helper function calls.
+ * The redundant loads and stores are generated by StateMappingPass for
+ * handling synchronization of CPU states around helper function calls.
+ * A load and store can be removed if a state value is loaded and immediately
+ * stored back to the same state. For example:
+ *
+ * Before optimization:                     After optimization:
+ *   instructions to sync states              instructions to sync states
+ *   call void @helper_function1()            call void @helper_function1()
+ *
+ *   %v0 = load i32, i32* %state0
+ *   %v1 = load i32, i32* %state1
+ *   store i32 %v0, i32* %state0
+ *   store i32 %v1, i32* %state1
+ *
+ *   call void @helper_function2()            call void @helper_function2()
+ *   instructions to reload states            instructions to reload states
+ */
+bool RedundantStateElimination::removeHelperState(Function &F)
+{
+    for (BasicBlock &BB : F) {
+        for (Instruction &Inst : BB) {
+            if (MF->isGuestMemory(&Inst))
+                continue;
+            /* Only non-volatile stores are candidates. */
+            auto *Store = dyn_cast<StoreInst>(&Inst);
+            if (!Store || Store->isVolatile())
+                continue;
+
+            /* The stored value must come from a load that directly
+             * dominates the store. */
+            auto *Load = dyn_cast<LoadInst>(getValueOperand(Store));
+            if (!Load || !isDirectDominator(Load, Store))
+                continue;
+
+            /* Queue the store when it writes back to the very CPU state
+             * pointer that the value was loaded from. */
+            Value *From = getPointerOperand(Load);
+            if (StripPointer(From) == CPU && From == getPointerOperand(Store))
+                toErase.push_back(Store);
+        }
+    }
+
+    bool Changed = !toErase.empty();
+    if (Changed)
+        ProcessErase(toErase);
+    return Changed;
+}
+
+bool RedundantStateElimination::runOnFunction(Function &F)
+{
+    /* The elimination routines compare pointers against CPU; leave F
+     * untouched when the CPU pointer cannot be found. */
+    CPU = IF->getDefaultCPU(F);
+    if (CPU == nullptr) {
+        dbg() << DEBUG_PASS << "RedundantStateElimination: Cannot find CPU pointer.\n";
+        return false;
+    }
+
+    bool Modified = removePCState(F);
+#if 0
+    Modified |= removeHelperState(F);
+#endif
+
+    return Modified;
+}
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
+
diff --git a/llvm/pass/ReplaceIntrinsic.cpp b/llvm/pass/ReplaceIntrinsic.cpp
new file mode 100644
index 0000000..62505f4
--- /dev/null
+++ b/llvm/pass/ReplaceIntrinsic.cpp
@@ -0,0 +1,137 @@
+/*
+ *  (C) 2016 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include "llvm-types.h"
+#include "llvm-debug.h"
+#include "llvm-target.h"
+#include "llvm-pass.h"
+
+
+#define PASS_NAME  "ReplaceIntrinsic"
+
+/*
+ * HQEMU does not allow helpers to contain any memory or debug intrinsics.
+ * This pass substitutes memory intrinsics with load/store instructions and
+ * removes debug intrinsics (generated by Clang with the -g flag).
+ */
+class ReplaceIntrinsic : public FunctionPass {
+    IVec toErase; /* Intrinsic calls queued for removal */
+public:
+    static char ID;
+    explicit ReplaceIntrinsic() : FunctionPass(ID) {}
+
+    /* Return V itself when it already has type T; otherwise create a
+     * bitcast of V to T in front of InsertPos and return the cast. */
+    Value *ConvertType(Value *V, Type *T, Instruction *InsertPos) {
+        bool SameTy = likely(V->getType() == T);
+        return SameTy ? V : new BitCastInst(V, T, "", InsertPos);
+    }
+    bool replaceMemoryIntrinsic(IntrinsicInst *I);
+    bool runOnFunction(Function &F);
+};
+
+char ReplaceIntrinsic::ID = 0; /* Pass identification. */
+INITIALIZE_PASS(ReplaceIntrinsic, "replaceintrinsic",
+                "Replace memory and debug intrinsics generated by clang",
+                false, false)
+
+FunctionPass *llvm::createReplaceIntrinsic()
+{   /* Hand out a newly allocated, default-constructed pass. */
+    return new ReplaceIntrinsic();
+}
+
+
+/*
+ *  Transform memcpy/memmove/memset to load/store instructions.
+ *  Clang attempts to move memory data using LLVM memory intrinsic instructions.
+ *  This causes the statemapping pass to miss some guest states. (Statemapping
+ *  only considers guest states accessed by general load/store insts).
+ *  So, we simply rewrite the memory intrinsics to load/store instructions.
+ */
+bool ReplaceIntrinsic::replaceMemoryIntrinsic(IntrinsicInst *I)
+{
+    Intrinsic::ID IID = I->getIntrinsicID();
+    if (IID != Intrinsic::memset && IID != Intrinsic::memcpy &&
+        IID != Intrinsic::memmove)
+        return false;
+
+    LLVMContext &Context = I->getContext();
+    Type *Int8PtrTy = Type::getInt8PtrTy(Context);
+    CallInst *CI = cast<CallInst>(I);
+    /* Longest access that still fits one iN value; longer ones are kept. */
+    const uint64_t MaxLen = IntegerType::MAX_INT_BITS / 8;
+
+    if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I)) {
+        /* memcpy/memmove */
+        Value *Src = MTI->getSource();
+        Value *Dst = MTI->getDest();
+        Value *NumBytes = MTI->getLength();
+        if (CI->getArgOperand(0)->getType() != Int8PtrTy ||
+            CI->getArgOperand(1)->getType() != Int8PtrTy ||
+            !isa<ConstantInt>(NumBytes) ||
+            MTI->isVolatile())
+            return false;
+
+        /* Remove this instruction if the access size is zero. */
+        uint64_t Len = cast<ConstantInt>(NumBytes)->getZExtValue();
+        if (Len == 0)
+            goto done;
+        if (Len > MaxLen)
+            return false;
+
+        /* Keep the intrinsic's alignment (0 means 1), not the iN ABI one. */
+        unsigned Align = MTI->getAlignment() ? MTI->getAlignment() : 1;
+        Type *Ty = Type::getIntNPtrTy(Context, Len * 8);
+        Src = ConvertType(Src, Ty, I);
+        Dst = ConvertType(Dst, Ty, I);
+        Src = new LoadInst(Src, "", false, Align, I);
+        new StoreInst(Src, Dst, false, Align, I);
+    } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(I)) {
+        /* memset */
+        Value *Src = MSI->getValue();
+        Value *Dst = MSI->getDest();
+        Value *NumBytes = MSI->getLength();
+        if (CI->getArgOperand(0)->getType() != Int8PtrTy ||
+            !isa<ConstantInt>(Src) ||
+            !isa<ConstantInt>(NumBytes) ||
+            MSI->isVolatile())
+            return false;
+
+        /* Only zero fills are rewritten; other fill values are kept. */
+        uint64_t Len = cast<ConstantInt>(NumBytes)->getZExtValue();
+        if (cast<ConstantInt>(Src)->getZExtValue() != 0 || Len > MaxLen)
+            return false;
+        if (Len == 0)
+            goto done;
+        unsigned Align = MSI->getAlignment() ? MSI->getAlignment() : 1;
+        Type *Ty = Type::getIntNPtrTy(Context, Len * 8);
+        Src = ConstantInt::get(Type::getIntNTy(Context, Len * 8), 0);
+        Dst = ConvertType(Dst, Ty, I);
+        new StoreInst(Src, Dst, false, Align, I);
+    }
+
+done:
+    toErase.push_back(I);
+    return true;
+}
+
+bool ReplaceIntrinsic::runOnFunction(Function &F)
+{
+    for (auto I = inst_begin(&F), E = inst_end(&F); I != E; ++I) {
+        IntrinsicInst *II = dyn_cast<IntrinsicInst>(&*I);
+        if (!II || replaceMemoryIntrinsic(II))
+            continue;
+        if (isa<DbgInfoIntrinsic>(II))
+            toErase.push_back(II);
+    }
+    /* Every rewrite queues an erase, so an empty queue means F is intact. */
+    bool Changed = !toErase.empty();
+    ProcessErase(toErase);
+    return Changed;
+}
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/pass/SimplifyPointer.cpp b/llvm/pass/SimplifyPointer.cpp
new file mode 100644
index 0000000..87afbdd
--- /dev/null
+++ b/llvm/pass/SimplifyPointer.cpp
@@ -0,0 +1,334 @@
+//===- SimplifyPointer.cpp - Reassociate guest pointer arithmetic ---------===//
+//
+//           The HQEMU Dynamic Binary Translator Infrastructure
+//
+// (C) 2016 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+//     COVART Laboratory, CSIE Department, National Taiwan University, Taiwan.
+//     See COPYRIGHT in top-level directory.
+//
+//===----------------------------------------------------------------------===//
+// This pass implements a simple pointer arithmetic reassociator for easier
+// pointer stripping. It gets scalar evolution results of all guest pointers
+// which are in simplest form. Next, it inserts new instructions to evaluate the
+// simplified expressions to construct new pointers, and rewrites corresponding
+// guest load/store with new pointers.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstIterator.h"
+
+#include "llvm-opc.h"
+#include "llvm-pass.h"
+#include "llvm-target.h"
+#include "utils.h"
+
+#define PASS_NAME "SIMPTR"
+#define DEBUG_TYPE "SIMPTR"
+
+//#define VERBOSE
+
+/// \brief Debug stream for this pass; each call first emits "SIMPTR: ".
+static inline llvm::raw_ostream &pout() {
+  return dbg() << DEBUG_PASS << PASS_NAME ": ";
+}
+
+/// \returns True only if \p A and \p B are both instructions and \p A
+/// dominates \p B according to \p DT; any other kind of value yields false.
+static bool dominates(Value *A, Value *B, DominatorTree *DT) {
+  auto *Def  = dyn_cast<Instruction>(A);
+  auto *User = dyn_cast<Instruction>(B);
+  bool BothInstrs = Def && User;
+  return BothInstrs && DT->dominates(Def, User);
+}
+
+class SimplifyPointer : public FunctionPass {
+public:
+  using ValueList  = SmallVector<Value *, 32>;
+  using InstrList  = SmallVector<Instruction *, 32>;
+  using ExprValMap = DenseMap<const SCEV *, Value *>;
+
+  // Pass identification, replacement for type id.
+  static char ID;
+
+  // Constructors; only the IRFactory variant runs the pass initializer.
+  explicit SimplifyPointer() : FunctionPass(ID) {}
+  explicit SimplifyPointer(IRFactory *IF)
+      : FunctionPass(ID), IF(IF), MF(IF->getMDFactory()), DL(IF->getDL()) {
+    // Run the pass initializer against the global pass registry.
+    initializeSimplifyPointerPass(*PassRegistry::getPassRegistry());
+  }
+
+  // FunctionPass overrides.
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  bool runOnFunction(Function &F) override;
+
+private:
+  /// \returns The value cached for base expression \p S, or null on a miss.
+  Value *lookupBaseExpressionCache(const SCEV *S) const {
+    auto Hit = BaseExprVal.find(S);
+    if (Hit == BaseExprVal.end())
+      return nullptr;
+    return Hit->second;
+  }
+
+  /// \returns True if the constants scattered over the expression tree of \p S
+  /// can be gathered by reassociation; the reduced value goes to \p FoldVal.
+  ///
+  /// Constant nodes are propagated through add, multiply and recurrent add
+  /// nodes only. For instance, (%1 + %2 + 5) * (%3 - 7) folds to
+  /// 5 * -7 = -35.
+  bool foldConstantExpression(const SCEV *S, int64_t &FoldVal) const;
+
+  /// \returns The scalar operand of the IntToPtrInst found by walking up the
+  /// cast chain of pointer \p V, or null if no IntToPtrInst is met within
+  /// \p MaxLookup steps. A null or non-pointer \p V is returned as is.
+  ///
+  /// In the context of DBT, pointer type is represented and manipulated as
+  /// integer data until used as a pointer. Therefore, it follows:
+  ///
+  /// [Expression Tree]
+  ///    +   + +   +
+  ///     \ /   \ /
+  ///      - ... -
+  ///       \   /
+  ///         +
+  ///   [Scalar Root]
+  ///         |
+  ///     [Casting]
+  ///         |
+  ///    [Load/Store]
+  ///
+  /// This method targets the scalar root value.
+  Value *findPointerScalarRoot(Value *V, unsigned MaxLookup = 4);
+
+  /// \brief Simplify the pointer arithmetic of \p LSI using the scalar
+  /// evolution form of its scalar root, in which constants are already folded.
+  /// Taking the folded constant out of that expression leaves a base
+  /// expression that other memory instructions are likely to share.
+  ///
+  /// It assumes \p LSI has the following use-define chain starting from its
+  /// pointer and containing only add, multiply and recurrent add nodes.
+  ///
+  /// [Expression Tree]      [Expression Tree]      [Expression Tree]
+  ///    +   A B   +            +   + B   A            +   +
+  ///     \ /   \ /              \ /   \ /              \ /
+  ///      - ... -                - ... -                -   (B-A)
+  ///       \   /                  \   /                  \   /
+  ///         +                      +                      +
+  ///   [Scalar Root]    >>    [Scalar Root]    >>    [Scalar Root]
+  ///         |                      |                      |
+  ///     [Casting]              [Casting]              [Casting]
+  ///         |                      |                      |
+  ///       [LSI]                  [LSI]                  [LSI]
+  ///
+  /// Suppose A and B are constants, they can be folded into (B-A) with scalar
+  /// evolution results. Need to insert instructions for other operations in
+  /// tree (e.g., the new sub in the right-most figure).
+  ///
+  /// First it finds the folded constant and subtracts it from the root
+  /// expression to form the base expression. Then it generates instructions to
+  /// evaluate the base expression.
+  bool tryToSimplifyPointer(Instruction *I);
+
+  // HQEMU internal infrastructure.
+  IRFactory *IF = nullptr;
+  MDFactory *MF = nullptr;
+  // LLVM analysis and data type info.
+  const DataLayout *DL = nullptr;
+  DominatorTree *DT    = nullptr;
+  ScalarEvolution *SE  = nullptr;
+
+  /// Maps each base expression to the value that evaluates it.
+  ExprValMap BaseExprVal;
+};
+
+bool SimplifyPointer::foldConstantExpression(const SCEV *S,
+                                             int64_t &FoldVal) const {
+  // Handle expression tree of scalar root containing only add, multiply and
+  // recurrent add nodes. \p FoldVal is written only on success, and sums and
+  // products are formed in unsigned arithmetic so that overflow wraps.
+  if (auto *AddSE = dyn_cast<SCEVAddExpr>(S)) {
+    uint64_t Sum = 0;
+    for (auto Op : AddSE->operands()) {
+      int64_t Val;
+      if (foldConstantExpression(Op, Val))
+        Sum += (uint64_t)Val;
+    }
+    FoldVal = (int64_t)Sum;
+    return true;
+  } else if (auto *MulSE = dyn_cast<SCEVMulExpr>(S)) {
+    uint64_t Product = 1;
+    for (auto Op : MulSE->operands()) {
+      int64_t Val;
+      // A non-constant factor makes the whole product non-constant.
+      if (!foldConstantExpression(Op, Val))
+        return false;
+      Product *= (uint64_t)Val;
+    }
+    FoldVal = (int64_t)Product;
+    return true;
+  } else if (auto *RecSE = dyn_cast<SCEVAddRecExpr>(S)) {
+    // Trace only the start; the step scales with the loop trip count.
+    return foldConstantExpression(RecSE->getStart(), FoldVal);
+  } else if (auto *ConstSE = dyn_cast<SCEVConstant>(S)) {
+    FoldVal = ConstSE->getValue()->getValue().getSExtValue();
+    return true;
+  }
+  return false;
+}
+
+Value *SimplifyPointer::findPointerScalarRoot(Value *V, unsigned MaxLookup) {
+  if (!V || !V->getType()->isPointerTy())
+    return V;
+  for (unsigned Depth = 0; Depth != MaxLookup; ++Depth) {
+    // An inttoptr ends the chain: its operand is the scalar root.
+    if (auto *ToPtr = dyn_cast<IntToPtrInst>(V))
+      return ToPtr->getOperand(0);
+    // Only bitcasts are looked through; anything else has no root.
+    auto *Cast = dyn_cast<BitCastInst>(V);
+    if (!Cast)
+      return nullptr;
+    V = Cast->getOperand(0);
+  }
+  return nullptr;
+}
+
+bool SimplifyPointer::tryToSimplifyPointer(Instruction *LSI) {
+  // Check for null before any dereference: no scalar root means no rewrite.
+  Value *Ptr  = getPointerOperand(LSI);
+  Value *Root = findPointerScalarRoot(Ptr);
+  if (!Ptr || !Root || !Root->getType()->isIntegerTy())
+    return false;
+  Type *RootTy = Root->getType();
+  Type *PtrTy  = Ptr->getType();
+
+#ifdef VERBOSE
+  if (DM.getDebugMode() & DEBUG_PASS) {
+    pout() << "Visiting memory instruction.\n";
+    pout() << "- " << *LSI << ".\n";
+  }
+#endif
+
+  // Traverse the simplest form expression tree and collect folded constants.
+  // The folded constant is zero (base = root) if no constant is found or the
+  // fold fails; a failed fold must not contribute a partial result.
+  auto *RootSE      = SE->getSCEV(Root);
+  int64_t FoldConst = 0;
+  if (!foldConstantExpression(RootSE, FoldConst))
+    FoldConst = 0;
+
+  // Subtract offset constant from root expression to get the base expression,
+  // then query base expression cache to find whether it has been evaluated.
+  auto *BaseSE = SE->getMinusSCEV(RootSE,
+                                  SE->getConstant(RootTy, FoldConst, true));
+  Value *Base  = lookupBaseExpressionCache(BaseSE);
+
+  // Create instructions to evaluate base expression if cache miss or previously
+  // computed value doesn't dominate load/store instruction.
+  if (!Base || !dominates(Base, LSI, DT)) {
+#ifdef VERBOSE
+    pout() << "  Need to build base expression.\n";
+    pout() << "  - Base   " << *BaseSE << ".\n";
+    pout() << "  - Offset " << FoldConst << ".\n";
+#endif
+    // Drop a stale cached instruction, then expand the base if it is safe.
+    if (Base && isa<Instruction>(Base))
+      Base = nullptr;
+    if (isSafeToExpand(BaseSE, *SE)) {
+#if defined(LLVM_V35)
+      SCEVExpander Expander(*SE, "");
+#else
+      SCEVExpander Expander(*SE, *DL, "");
+#endif
+      Base = Expander.expandCodeFor(BaseSE, RootTy, LSI);
+    }
+  } else {
+#ifdef VERBOSE
+    pout() << "  Use cached base expression value.\n";
+    pout() << "  - Base   " << *BaseSE << ".\n";
+    pout() << "  - Offset " << FoldConst << ".\n";
+#endif
+  }
+
+  // Neither using cached value nor re-computing works, abort.
+  if (!Base)
+    return false;
+
+  // Add back folded constant (offset) to new root value and feed the result as
+  // new pointer to load/store instruction; the offset is a signed constant.
+  IRBuilder<> Builder(IF->getContext());
+  Builder.SetInsertPoint(LSI);
+  Value *Offset  = ConstantInt::get(RootTy, FoldConst, true);
+  Value *NewRoot = FoldConst ? Builder.CreateAdd(Base, Offset) : Base;
+  Value *NewPtr  = Builder.CreateIntToPtr(NewRoot, PtrTy);
+  LSI->replaceUsesOfWith(Ptr, NewPtr);
+
+  // Cache base expression value.
+  BaseExprVal[BaseSE] = Base;
+  return true;
+}
+
+void SimplifyPointer::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<DominatorTreeWrapperPass>(); // Read back in runOnFunction().
+#if defined(LLVM_V35)
+  AU.addRequired<ScalarEvolution>(); // LLVM_V35 builds name the pass directly.
+#else
+  AU.addRequired<ScalarEvolutionWrapperPass>();
+#endif
+}
+
+bool SimplifyPointer::runOnFunction(Function &F) {
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+#if defined(LLVM_V35)
+  SE = &getAnalysis<ScalarEvolution>();
+#else
+  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+#endif
+
+  // The cache is keyed by SCEV pointers from this run's ScalarEvolution and
+  // holds values of the function being processed, so drop whatever an earlier
+  // run left behind before the first lookup.
+  BaseExprVal.clear();
+
+  // Collect all guest memory instructions of reachable basic blocks.
+  InstrList MemoryInstrs;
+  for (BasicBlock &BB : F) {
+    // Skip dead basic blocks.
+    if (!DT->isReachableFromEntry(&BB))
+      continue;
+    for (Instruction &I : BB) {
+      if (MDFactory::isGuestMemory(&I))
+        MemoryInstrs.push_back(&I);
+    }
+  }
+
+  // Try to simplify pointers of collected load/store instructions.
+  bool Changed = false;
+  for (Instruction *I : MemoryInstrs)
+    Changed |= tryToSimplifyPointer(I);
+
+  return Changed;
+}
+
+char SimplifyPointer::ID = 0; // Pass identification, replacement for type id.
+INITIALIZE_PASS_BEGIN(SimplifyPointer, "simplifypointer",
+                      "Reassociate pointer arithmetic", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+#if defined(LLVM_V35)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+#else
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+#endif
+INITIALIZE_PASS_END(SimplifyPointer, "simplifypointer",
+                    "Reassociate pointer arithmetic", false, false)
+
+FunctionPass *llvm::createSimplifyPointer(IRFactory *IF) {
+  return new SimplifyPointer(IF); // Newly allocated pass constructed with IF.
+}
+
+/*
+ * vim: ts=2 sts=2 sw=2 expandtab
+ */
diff --git a/llvm/pass/StateMappingPass.cpp b/llvm/pass/StateMappingPass.cpp
new file mode 100644
index 0000000..0d9dd9b
--- /dev/null
+++ b/llvm/pass/StateMappingPass.cpp
@@ -0,0 +1,885 @@
+/*
+ *  (C) 2010 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include "llvm-debug.h"
+#include "llvm-opc.h"
+#include "llvm-target.h"
+#include "llvm-pass.h"
+
+#define PASS_NAME "StateMapping"
+
+
+/*
+ * StateMappingPass is used to eliminate the redundant loads and stores to the
+ * CPUArchState. The loads and stores of the guest memory operations are not
+ * removed in order not to violate the memory model of the guest architecture.
+ *
+ * The state mapping rules are:
+ * - A guest state is not overlapped: (i.e., same access size)
+ *   - Same type: map to this type.
+ *   - Different type: select type in the order: vector, float and integer;
+ *                     use bitcast to convert between different types.
+ * - A guest state is overlapped with other state(s):
+ *   - Query StateType to find state size (i.e., boundary) and type:
+ *     - Vector type: use insert/extract to manipulate a vector element.
+ *     - Other types: use shift to manipulate a vector element.
+ */
+class StateMappingPass : public FunctionPass {
+    IRFactory *IF; /* Uplink to the IRFactory; null when default-constructed */
+
+public:
+    static char ID;
+    explicit StateMappingPass() : FunctionPass(ID), IF(nullptr) {}
+    explicit StateMappingPass(IRFactory *IF) : FunctionPass(ID), IF(IF) {}
+
+    bool runOnFunction(Function &F);
+};
+
+struct StateMapping {
+    StateMapping()
+        : State(nullptr), Addr(nullptr), Ty(nullptr), AI(nullptr),
+          hasLoad(false), hasStore(false) {}
+    StateData *State;   /* Provides the Start and End offsets used below */
+    Value *Addr;
+    Type *Ty;
+    AllocaInst *AI;
+    bool hasLoad;
+    bool hasStore;
+
+    intptr_t getSize()  { return State->End - State->Start; }
+    intptr_t getStart() { return State->Start;     }
+    intptr_t getEnd()   { return State->End;       }
+    Value *getAddr()    { return Addr;             }
+    Type *getType()     { return Ty;               }
+    bool isVector()     { return Ty->isVectorTy(); }
+    /* True if an entry of Range (start -> end) intersects [Start, End). */
+    bool overlap(StateRange &Range) {
+        intptr_t Start = getStart();
+        intptr_t End = getEnd();
+        /* Step back to the entry at/before Start, if there is one; begin()
+         * is never decremented, which also covers an empty Range. */
+        auto I = Range.upper_bound(Start);
+        if (I != Range.begin())
+            --I;
+        for (; I != Range.end() && I->first < End; ++I)
+            if (I->second > Start)
+                return true;
+        return false;
+    }
+};
+
+struct ElementInfo {
+    /* Every field starts out as zero or null. */
+    ElementInfo() {}
+    intptr_t Shift = 0;
+    unsigned NumElts = 0;
+    Type *EltTy = nullptr;
+    Type *StateTy = nullptr;
+};
+
+class StateMapper {
+    using StateMapList = std::vector<StateMapping>;
+
+    IRFactory *IF;            /* Uplink to the IRFactory */
+    const DataLayout *DL;     /* Data layout obtained from IF */
+    Instruction *CPU;         /* The CPU pointer */
+    Instruction *PreCastPos;  /* The position to cast CPU states */
+    Instruction *PreLoadPos;  /* The position to preload CPU states */
+    IVec toErase;             /* The instructions to be removed */
+
+    FlatType &StateType;      /* Obtained from the translator of IF */
+    StateAnalyzer Analyzer;
+    StateMapList StateMaps;
+
+public:
+    StateMapper(IRFactory *IF)
+        : IF(IF), DL(IF->getDL()), StateType(IF->getTranslator().getStateType()),
+          Analyzer(DL) {}
+
+    /* Map the states of F. Returns false only when init() fails. */
+    bool run(Function &F) {
+        bool Ready = init(F);
+        if (Ready) {
+            AnalyzeState(F);
+            if (!StateMaps.empty())
+                PromoteState(F);
+            ProcessErase(toErase);
+        }
+        return Ready;
+    }
+
+    /* Rearrange instructions in the 'init' block. */
+    bool init(Function &F);
+
+    /* Analyze the instructions of a Function that access CPU states. */
+    void AnalyzeState(Function &F);
+
+    /* Compute state mapping information. */
+    void ComputeStateMap(StateMapping &StateMap, StateData &State);
+
+    /* Determine if the state can be operated as a vector. */
+    Type *TryVectorState(StateData &State, Type *Ty);
+
+    /* Map state references to the virtual states. */
+    void PromoteState(Function &F);
+
+    /* Rewrite state loads and stores. */
+    void RewriteLoad(StateMapping &StateMap, StateRef &Ref);
+    void RewriteStore(StateMapping &StateMap, StateRef &Ref);
+    void RewriteLoadVector(StateMapping &StateMap, StateRef &Ref);
+    void RewriteStoreVector(StateMapping &StateMap, StateRef &Ref);
+
+    /* Compute state and element types for element insertion and extraction. */
+    void getElementInfo(StateMapping &StateMap, StateRef &Ref, ElementInfo &Info);
+
+    /* Sync CPU states around helper calls. */
+    void SyncHelperState();
+
+    /* Store dirty states at the leaf blocks. */
+    void ProcessExitBB(BasicBlock *BB);
+
+    /* Collect in IV the GEP and BitCast instructions stripped from V. */
+    void StripPointer(Value *V, IVec &IV);
+
+    /* Move the GEP/BitCast chain of V before PreCastPos. */
+    void MoveStatePointer(Value *V);
+
+    /* Load state from Src and store it to Dest. */
+    void CopyState(Value *Dest, Value *Src, Instruction *InsertPos);
+
+    bool isLegalState(Value *Ptr, intptr_t &Off);
+
+    /* True if V is a bitcast chain over CPU, or a GEP based on CPU that is
+     * reached through zero or more bitcasts. */
+    bool isStatePointer(Value *V) {
+        while (auto BCI = dyn_cast<BitCastInst>(V)) {
+            V = BCI->getOperand(0);
+            if (V == CPU)
+                return true;
+        }
+        auto GEP = dyn_cast<GetElementPtrInst>(V);
+        return GEP && GEP->getOperand(0) == CPU;
+    }
+
+    /* True if F is a known helper whose hasNestedCall flag is clear. */
+    bool isSimpleFunction(Function *F) {
+        HelperMap &Helpers = IF->getHelpers();
+        auto Known = Helpers.find(F->getName());
+        return Known != Helpers.end() && !Known->second->hasNestedCall;
+    }
+
+    Value *ConvertType(Value *V, Type *Ty, Instruction *InsertPos) {
+        return V->getType() == Ty ? V : new BitCastInst(V, Ty, "", InsertPos);
+    }
+};
+
+/* Return a pre-defined state name. */
+static std::string getStateName(intptr_t Off)
+{
+#if defined(TARGET_I386)
+#define STATE_NAME_XMM(N) { offsetof(CPUArchState,xmm_regs[N]), "xmm" #N }
+    static const struct { size_t Off; const char *Name; } Names[] = {
+        STATE_NAME_XMM(0), STATE_NAME_XMM(1), STATE_NAME_XMM(2),
+        STATE_NAME_XMM(3), STATE_NAME_XMM(4), STATE_NAME_XMM(5),
+        STATE_NAME_XMM(6), STATE_NAME_XMM(7),
+        { offsetof(CPUArchState,xmm_t0), "xmm_t0" } };
+#undef STATE_NAME_XMM
+    for (const auto &Entry : Names)
+        if (Off == Entry.Off) return Entry.Name; /* First match wins. */
+#endif
+    return "";
+}
+
+/* Determine if the offset is to access the temporary state. */
+static inline bool isLocalState(intptr_t Off)
+{
+#if defined(TARGET_I386)
+    return Off == offsetof(CPUArchState, xmm_t0);
+#else
+    return false;
+#endif
+}
+
+/* Return states that should be ignored during state mapping. */
+static bool isSkipState(intptr_t Off)
+{
+    if (Off == (intptr_t)(offsetof(CPUState, tcg_exit_req) - ENV_OFFSET))
+        return true;
+    /* stateof(X): Off falls inside the bytes of CPUArchState member X. */
+#define stateof(X) \
+    (Off >= (intptr_t)offsetof(CPUArchState,X) && \
+     Off < (intptr_t)(offsetof(CPUArchState,X) + sizeof(((CPUArchState*)0)->X)))
+#define is_fpstatus(X) \
+    (stateof(X.float_detect_tininess)       || \
+     stateof(X.float_rounding_mode)         || \
+     stateof(X.float_exception_flags)       || \
+     stateof(X.floatx80_rounding_precision) || \
+     stateof(X.flush_to_zero)               || \
+     stateof(X.flush_inputs_to_zero)        || \
+     stateof(X.default_nan_mode))
+    /* Floating-point status members are skipped on ARM and i386 targets. */
+#if defined(TARGET_ARM)
+    if (is_fpstatus(vfp.fp_status) || is_fpstatus(vfp.standard_fp_status))
+        return true;
+#elif defined(TARGET_I386)
+    if (is_fpstatus(fp_status))
+        return true;
+#endif
+    return false;
+    /* The helper macros are local to this function; drop them again. */
+#undef stateof
+#undef is_fpstatus
+}
+
+/* Check if the state is legal for state mapping. A legal state must have CPU
+ * as the base pointer, plus a non-negative constant offset. */
+bool StateMapper::isLegalState(Value *Ptr, intptr_t &Off)
+{
+    Value *Base = getBaseWithConstantOffset(DL, Ptr, Off);
+    /* Negative offsets and foreign bases are rejected outright; skipped
+     * states and the PC are excluded from mapping as well. */
+    if (Off < 0 || Base != CPU)
+        return false;
+    return !isSkipState(Off) && !IRFactory::isStateOfPC(Off);
+}
+
+/* Get the pointer without GEP and BitCast. The stripped GEP and BitCast
+ * instructions are returned to the caller. */
+void StateMapper::StripPointer(Value *V, IVec &IV)
+{
+    /* Remember every value seen so that a cyclic chain terminates. */
+    std::set<Value *> Seen;
+    Seen.insert(V);
+    for (;;) {
+        /* Only GEP and bitcast instructions are peeled off. */
+        Instruction *Link = dyn_cast<GetElementPtrInst>(V);
+        if (!Link)
+            Link = dyn_cast<BitCastInst>(V);
+        if (!Link)
+            return;
+        IV.push_back(Link);
+        V = Link->getOperand(0);
+        if (!Seen.insert(V).second)
+            return;
+    }
+}
+
+/* Move the GEP/bitcast chain of V before PreCastPos, base-most first.
+ * Instructions already in the block of CPU stay where they are. */
+void StateMapper::MoveStatePointer(Value *V)
+{
+    IVec Chain;
+    StripPointer(V, Chain);
+    /* The chain was collected from V inwards, so consume it from the back. */
+    for (; !Chain.empty(); Chain.pop_back()) {
+        Instruction *Link = Chain.back();
+        if (Link->getParent() != CPU->getParent())
+            Link->moveBefore(PreCastPos);
+    }
+}
+
+/* Copy state data from src address to destination address. */
+void StateMapper::CopyState(Value *Dest, Value *Src, Instruction *InsertPos)
+{
+    /* Exactly one side is handled as the state pointer: Src unless it is
+     * an alloca, otherwise Dest. MoveStatePointer() is applied to it first. */
+    bool FromState = !isa<AllocaInst>(Src);
+    Value *StatePtr = FromState ? Src : Dest;
+    MoveStatePointer(StatePtr);
+    LoadInst *LI = new LoadInst(Src, "", false, InsertPos);
+    StoreInst *SI = new StoreInst(LI, Dest, false, InsertPos);
+
+    /* Vector states get alignment 4 on the access that uses StatePtr. */
+    if (!StatePtr->getType()->getPointerElementType()->isVectorTy())
+        return;
+    if (FromState)
+        LI->setAlignment(4);
+    else
+        SI->setAlignment(4);
+}
+
+/* Write the stored-to, non-local states back to the CPU in a leaf block. */
+void StateMapper::ProcessExitBB(BasicBlock *BB)
+{
+    /* Insert before the first instruction tagged as an exit, or before the
+     * block terminator when the block has no such instruction. */
+    Instruction *InsertPos = BB->getTerminator();
+    for (Instruction &Inst : *BB) {
+        if (MDFactory::isExit(&Inst)) {
+            InsertPos = &Inst;
+            break;
+        }
+    }
+
+    for (auto &StateMap : StateMaps) {
+        bool Dirty = StateMap.hasStore;
+        if (Dirty && !isLocalState(StateMap.getStart()))
+            CopyState(StateMap.Addr, StateMap.AI, InsertPos);
+    }
+}
+
+/* Write dirty states back before helper calls and reload states after them. */
+void StateMapper::SyncHelperState()
+{
+    CallList &Calls = Analyzer.getCalls();
+    if (Calls.empty())
+        return;
+
+    /*
+     * Rules of syncing states around calls:
+     * 1. Dirty states (i.e., stores) are written back before calls.
+     * 2. All states, including loads and stores, are read back after calls.
+     * The two maps below hold, per call, the indices into StateMaps to sync.
+     * If the helper is a simple function, only dependent states are synced.
+     * If the helper is a complicated function, all states are synced.
+     */
+    HelperMap &Helpers = IF->getHelpers();
+    DenseMap<CallInst*, std::set<unsigned> > StoreBeforeCall;
+    DenseMap<CallInst*, std::set<unsigned> > LoadAfterCall;
+
+    for (auto CI : Calls) {
+        Function *Func = CI->getCalledFunction();
+        std::string Name = Func->getName();
+
+        if (isSimpleFunction(Func)) {
+            /* A pre-defined helper without nested call: use its recorded
+             * StateUse/StateDef ranges to pick the states to sync. */
+            HelperInfo *Helper = Helpers[Name];
+            for (unsigned i = 0, e = StateMaps.size(); i != e; ++i) {
+                auto &StateMap = StateMaps[i];
+                if (StateMap.hasStore && StateMap.overlap(Helper->StateUse))
+                    StoreBeforeCall[CI].insert(i);
+
+                if (StateMap.overlap(Helper->StateDef))
+                    LoadAfterCall[CI].insert(i);
+
+                if (Helper->mayConflictArg) {
+                    unsigned NumArgs = CI->getNumArgOperands();
+                    for (unsigned j = 1; j < NumArgs; ++j) { /* skips arg 0 */
+                        intptr_t Off = 0;
+                        Value *Arg = CI->getArgOperand(j);
+                        if (!isLegalState(Arg, Off))
+                            continue;
+                        if (Off + Helper->ConflictSize <= StateMap.getStart() ||
+                            Off >= StateMap.getEnd())
+                            continue;   /* argument range misses this state */
+                        if (StateMap.hasStore)
+                            StoreBeforeCall[CI].insert(i);
+                        LoadAfterCall[CI].insert(i);
+                    }
+                }
+            }
+        } else {
+            /* Sync states for a complicated function (an unknown helper or a
+             * helper with nested calls). */
+            for (unsigned i = 0, e = StateMaps.size(); i != e; ++i) {
+                auto &StateMap = StateMaps[i];
+                if (StateMap.hasStore)
+                    StoreBeforeCall[CI].insert(i);
+                LoadAfterCall[CI].insert(i);
+            }
+        }
+    }
+
+    /* Perform state syncing: stores go before the call, loads after it. */
+    for (auto CI : Calls) {
+        Instruction *InsertPos = CI;
+
+        if (!StoreBeforeCall.empty()) {
+            for (auto i : StoreBeforeCall[CI]) {
+                auto &StateMap = StateMaps[i];
+                CopyState(StateMap.Addr, StateMap.AI, InsertPos);
+            }
+        }
+
+        InsertPos = &*std::next(BasicBlock::iterator(InsertPos));
+        if (isa<UnreachableInst>(InsertPos)) {
+            /* No read back is required after tail call. */
+            continue;
+        }
+
+        if (!LoadAfterCall.empty()) {
+            for (auto i : LoadAfterCall[CI]) {
+                auto &StateMap = StateMaps[i];
+                CopyState(StateMap.AI, StateMap.Addr, InsertPos);
+            }
+        }
+    }
+}
+
+static inline bool isSameSize(StateMapping &StateMap, StateRef &Ref)
+{
+    return Ref.getSize() == StateMap.getSize();   /* whole-state access? */
+}
+
+/* Fill Info with the types and shift used to access Ref inside StateMap. */
+void StateMapper::getElementInfo(StateMapping &StateMap, StateRef &Ref,
+                                 ElementInfo &Info)
+{
+    intptr_t StateSize = StateMap.getSize();
+    intptr_t Size = Ref.getSize();
+    intptr_t Shift = Ref.Start - StateMap.getStart();  /* offset in state */
+    Type *StateTy = StateMap.getType();
+    LLVMContext &Context = StateTy->getContext();
+
+    if (!StateMap.isVector()) {
+        /* Scalar state: view it as one int-N; Info.Shift stays in bytes. */
+        Info.NumElts = 1;
+        Info.EltTy = Type::getIntNTy(Context, Size * 8);
+        Info.StateTy = Type::getIntNTy(Context, StateSize * 8);
+        Info.Shift = Shift;
+        return;
+    }
+
+    /* Vector state: from here on Info.Shift counts elements, not bytes. */
+    if (StateSize % Size == 0 && Shift % Size == 0) {
+        Type *EltTy = Type::getIntNTy(Context, Size * 8);
+
+        Info.NumElts = 1;   /* one lane as wide as the reference itself */
+        Info.EltTy = EltTy;
+        Info.StateTy = VectorType::get(EltTy, StateSize / Size);
+        Info.Shift = Shift / Size;
+    } else {
+        VectorType *VecTy = cast<VectorType>(StateTy);
+        Type *EltTy = VecTy->getScalarType();
+        intptr_t EltSize = DL->getTypeSizeInBits(EltTy) / 8;
+
+        Info.NumElts = Size / EltSize;   /* lanes of the state's own type */
+        Info.EltTy = VectorType::get(EltTy, Info.NumElts);
+        Info.StateTy = StateTy;
+        Info.Shift = Shift / EltSize;
+    }
+}
+
+void StateMapper::RewriteLoad(StateMapping &StateMap, StateRef &Ref)
+{   /* Replace the state load Ref.I with a read of the alloca StateMap.AI. */
+    LoadInst *LI = cast<LoadInst>(Ref.I);
+    Type *Ty = LI->getType();
+    Instruction *InsertPos = LI;
+
+    /* Whole-state access: load the alloca and convert to the loaded type. */
+    if (isSameSize(StateMap, Ref)) {
+        Value *V = new LoadInst(StateMap.AI, "", false, InsertPos);
+        V = ConvertType(V, Ty, InsertPos);
+        LI->replaceAllUsesWith(V);
+        toErase.push_back(LI);
+        return;
+    }
+
+    if (StateMap.isVector()) {
+        RewriteLoadVector(StateMap, Ref);
+        return;
+    }
+
+    /* This is a non-vector state. Transform the state to the type of Int-N
+     * and use logical shift to extract/insert element data. */
+    ElementInfo Info;
+    getElementInfo(StateMap, Ref, Info);
+
+    Value *V = new LoadInst(StateMap.AI, "", false, InsertPos);
+    V = ConvertType(V, Info.StateTy, InsertPos);
+
+    /* Extract the element: shift right by Info.Shift bytes, then truncate. */
+    if (Info.Shift) {
+        Value *Shift = ConstantInt::get(V->getType(), Info.Shift * 8);
+        V = BinaryOperator::Create(Instruction::LShr, V, Shift, "", InsertPos);
+    }
+    V = new TruncInst(V, Info.EltTy, "", InsertPos);
+    V = ConvertType(V, Ty, InsertPos);
+
+    LI->replaceAllUsesWith(V);
+    toErase.push_back(LI);   /* the old load is only queued in toErase */
+}
+
+void StateMapper::RewriteStore(StateMapping &StateMap, StateRef &Ref)
+{   /* Redirect the state store Ref.I to the alloca StateMap.AI. */
+    StoreInst *SI = cast<StoreInst>(Ref.I);
+    Value *Data = SI->getValueOperand();
+    Instruction *InsertPos = SI;
+
+    /* Whole-state access: convert the value and store it to the alloca. */
+    if (isSameSize(StateMap, Ref)) {
+        Value *V = ConvertType(Data, StateMap.getType(), InsertPos);
+        new StoreInst(V, StateMap.AI, false, InsertPos);
+        toErase.push_back(SI);
+        return;
+    }
+
+    if (StateMap.isVector()) {
+        RewriteStoreVector(StateMap, Ref);
+        return;
+    }
+
+    /* This is a non-vector state. Transform the state to the type of Int-N
+     * and use logical shift to extract/insert element data. */
+    ElementInfo Info;
+    getElementInfo(StateMap, Ref, Info);
+
+    Value *V = new LoadInst(StateMap.AI, "", false, InsertPos);
+    V = ConvertType(V, Info.StateTy, InsertPos);
+
+    /* Insert the element: widen the data and shift it into position. */
+    Data = ConvertType(Data, Info.EltTy, InsertPos);
+    Data = new ZExtInst(Data, Info.StateTy, "", InsertPos);
+
+    if (Info.Shift) {
+        Value *Shift = ConstantInt::get(Data->getType(), Info.Shift * 8);
+        Data = BinaryOperator::Create(Instruction::Shl, Data, Shift, "", InsertPos);
+    }
+
+    unsigned numBits = StateMap.getSize() * 8;
+    unsigned loBit = Info.Shift * 8, hiBit = loBit + Ref.getSize() * 8;
+    APInt mask = ~APInt::getBitsSet(numBits, loBit, hiBit); /* keep the rest */
+    Value *Mask = ConstantInt::get(Data->getContext(), mask);
+
+    V = BinaryOperator::Create(Instruction::And, V, Mask, "", InsertPos);
+    V = BinaryOperator::Create(Instruction::Or, V, Data, "", InsertPos);
+    V = ConvertType(V, StateMap.getType(), InsertPos);
+
+    new StoreInst(V, StateMap.AI, false, InsertPos);
+    toErase.push_back(SI);   /* the old store is only queued in toErase */
+}
+
+void StateMapper::RewriteLoadVector(StateMapping &StateMap, StateRef &Ref)
+{   /* Replace a partial load of a vector state with lane extraction. */
+    LoadInst *LI = cast<LoadInst>(Ref.I);
+    Type *Ty = LI->getType();
+    Instruction *InsertPos = LI;
+
+    /* Compute offset, size and element type of this vector operation. */
+    ElementInfo Info;
+    getElementInfo(StateMap, Ref, Info);
+
+    Value *V = new LoadInst(StateMap.AI, "", false, InsertPos);
+    V = ConvertType(V, Info.StateTy, InsertPos);
+
+    /* Extract the element(s) from the vector value. */
+    IntegerType *I32 = IntegerType::get(V->getContext(), 32);
+
+    if (Info.EltTy->isVectorTy()) {
+        /* Shuffle out Info.NumElts consecutive lanes starting at Info.Shift. */
+        Value *UndefVal = UndefValue::get(Info.StateTy);
+        SmallVector<Constant*, 8> Indices;
+        for (unsigned i = 0, e = Info.NumElts; i != e; ++i)
+            Indices.push_back(ConstantInt::get(I32, Info.Shift + i));
+        Value *CV = ConstantVector::get(Indices);
+        V = new ShuffleVectorInst(V, UndefVal, CV, "", InsertPos);
+    } else {
+        /* Only one element. Use extractelement. */
+        V = ExtractElementInst::Create(V,
+                ConstantInt::get(I32, Info.Shift), "", InsertPos);
+    }
+
+    V = ConvertType(V, Ty, InsertPos);
+
+    LI->replaceAllUsesWith(V);
+    toErase.push_back(LI);   /* the old load is only queued in toErase */
+}
+
+void StateMapper::RewriteStoreVector(StateMapping &StateMap, StateRef &Ref)
+{   /* Replace a partial store to a vector state with lane insertion. */
+    StoreInst *SI = cast<StoreInst>(Ref.I);
+    Value *Data = SI->getValueOperand();
+    Instruction *InsertPos = SI;
+
+    /* Compute offset, size and element type of this vector operation. */
+    ElementInfo Info;
+    getElementInfo(StateMap, Ref, Info);
+
+    Value *V = new LoadInst(StateMap.AI, "", false, InsertPos);
+    V = ConvertType(V, Info.StateTy, InsertPos);
+    Data = ConvertType(Data, Info.EltTy, InsertPos);
+
+    /* Extract element(s) from data and insert it into the vector value. */
+    IntegerType *I32 = IntegerType::get(V->getContext(), 32);
+
+    if (Info.EltTy->isVectorTy()) {
+        SmallVector<Value *, 8> Partial;   /* the lanes of Data, in order */
+        for (unsigned i = 0, e = Info.NumElts; i != e; ++i) {
+            Partial.push_back(ExtractElementInst::Create(Data,
+                        ConstantInt::get(I32, i), "", InsertPos));
+        }
+        for (unsigned i = 0, e = Info.NumElts; i != e; ++i) {
+            V = InsertElementInst::Create(V, Partial[i],
+                    ConstantInt::get(I32, Info.Shift + i), "", InsertPos);
+        }
+    } else {
+        /* Only one element. Use insertelement. */
+        V = InsertElementInst::Create(V, Data,
+                ConstantInt::get(I32, Info.Shift), "", InsertPos);
+    }
+
+    V = ConvertType(V, StateMap.getType(), InsertPos);
+
+    new StoreInst(V, StateMap.AI, false, InsertPos);
+    toErase.push_back(SI);   /* the old store is only queued in toErase */
+}
+
+/* Map state references to the virtual states (one alloca per state). */
+void StateMapper::PromoteState(Function &F)
+{
+    /* Pre-load CPU states: create each state's address (if it has none) and
+     * alloca before PreCastPos, and copy the state in before PreLoadPos. */
+    Type *IntPtrTy = DL->getIntPtrType(CPU->getContext());
+    for (auto &StateMap : StateMaps) {
+        if (!StateMap.Addr) {
+            Value *Off = ConstantInt::get(IntPtrTy, StateMap.getStart());
+            Value *GEP = GetElementPtrInst::CreateInBounds(CPU, Off, "",
+                                                           PreCastPos);
+            StateMap.Addr = new BitCastInst(GEP,
+                            PointerType::getUnqual(StateMap.getType()), "",
+                            PreCastPos);
+        }
+
+        std::string StateName = StateMap.Addr->getName();
+        if (StateName == "")
+            StateName = getStateName(StateMap.getStart());
+        if (StateName == "")
+            StateName = "state";   /* last-resort name */
+        StateName.append(".a");
+
+        StateMap.AI = CreateAlloca(StateMap.getType(), 0, StateName, PreCastPos);
+        CopyState(StateMap.AI, StateMap.Addr, PreLoadPos);
+    }
+
+    /* Rewrite loads and stores so that they access the allocas instead. */
+    for (auto &StateMap : StateMaps) {
+        for (auto Ref : StateMap.State->Refs) {
+            if (isa<LoadInst>(Ref->I))
+                RewriteLoad(StateMap, *Ref);
+            else
+                RewriteStore(StateMap, *Ref);
+        }
+    }
+
+    /* Sync CPU states around helper calls. */
+    SyncHelperState();
+
+    /* Post-store dirty values back to CPU states for each exiting block. */
+    for (auto BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
+        BasicBlock *BB = &*BI;
+        if (distance(succ_begin(BB), succ_end(BB)) == 0)    /* leaf node */
+            ProcessExitBB(BB);
+    }
+}
+
+/* Return a vector type covering the state, or nullptr if it cannot be one. */
+Type *StateMapper::TryVectorState(StateData &State, Type *Ty)
+{
+    intptr_t StateStart = State.Start;
+    intptr_t StateEnd = State.End;
+    intptr_t StateSize = StateEnd - StateStart;
+
+    /* Prefer a vector type taken from the IR; else scan StateType entries from
+     * the one at or before StateStart (NOTE(review): assumed to exist). */
+    VectorType *VecTy = dyn_cast<VectorType>(Ty);
+    if (!VecTy) {
+        auto TI = --StateType.upper_bound(StateStart);
+        for (; TI != StateType.end() && TI->first < StateEnd; ++TI) {
+            if (TI->second->isVectorTy()) {
+                VecTy = cast<VectorType>(TI->second);
+                break;
+            }
+        }
+    }
+
+    if (!VecTy)
+        return nullptr;
+
+    /* This is a vector state. Now, we need to check whether all state refs can
+     * be composed by the vector element type: (a) the state size is a multiple
+     * of the vector element size, and (b) the size and shift of each state ref
+     * are both a multiple of the vector element size. */
+    Type *ElementTy = VecTy->getScalarType();
+    intptr_t ElementSize = DL->getTypeSizeInBits(ElementTy) / 8;
+    if (StateSize % ElementSize != 0)
+        return nullptr;
+
+    for (auto Ref : State.Refs) {
+        if (Ref->getSize() % ElementSize != 0 ||
+            (Ref->Start - StateStart) % ElementSize != 0)
+            return nullptr;
+    }
+    return VectorType::get(ElementTy, StateSize / ElementSize);
+}
+
+/* Compute state mapping information based on the state mapping rules. */
+void StateMapper::ComputeStateMap(StateMapping &StateMap, StateData &State)
+{
+    /* State mapping rule:
+     * - A guest state is not overlapped: (i.e., same access size)
+     *   - Same type: map to this type.
+     *   - Different type: select type in the order: vector, float and integer;
+     *                     use bitcast to convert between different types.
+     * - A guest state is overlapped with other state(s):
+     *   - Query StateType to find state size (i.e., boundary) and type:
+     *     - Vector type: use insert/extract to manipulate a vector element.
+     *     - Other types: use shift to manipulate a sub-register element. */
+    bool sameSize = true;
+    bool hasLoad = false;
+    bool hasStore = false;
+
+    for (auto Ref : State.Refs) {
+        hasLoad  |= isa<LoadInst>(Ref->I);
+        hasStore |= isa<StoreInst>(Ref->I);
+    }
+
+    StateRef *Ref = State.Refs.front();   /* initial candidate type/address */
+    Type *Ty = Ref->getType();
+    Value *Addr = getPointerOperand(Ref->I);
+    intptr_t Size = Ref->getSize();
+
+    for (unsigned i = 1, e = State.Refs.size(); i != e; ++i) {
+        StateRef *NextRef = State.Refs[i];
+        Type *NextTy = NextRef->getType();
+        Value *NextAddr = getPointerOperand(NextRef->I);
+
+        /* Check type. */
+        if (Ty != NextTy) {
+            /* Select type in the order: vector, float and integer. */
+            bool Swap = false;
+            if (Ty->isVectorTy() && NextTy->isVectorTy()) {
+                /* We prefer a vector type of small element type. */
+                Type *ATy = cast<VectorType>(Ty)->getScalarType();
+                Type *BTy = cast<VectorType>(NextTy)->getScalarType();
+                if (DL->getTypeSizeInBits(BTy) < DL->getTypeSizeInBits(ATy))
+                    Swap = true;
+            } else if (!Ty->isVectorTy() && NextTy->isVectorTy()) {
+                Swap = true;
+            } else if (Ty->isIntegerTy() && NextTy->isFloatTy()) {
+                Swap = true;
+            }
+
+            if (Swap) {   /* adopt the preferred type together with its address */
+                std::swap(Ty, NextTy);
+                std::swap(Addr, NextAddr);
+            }
+        }
+
+        /* Check size: every reference is compared with the first one. */
+        if (Size != NextRef->getSize())
+            sameSize = false;
+    }
+
+    if (sameSize) {
+        /* The same reference size as the state size. */
+        StateMap.Ty = Ty;
+        StateMap.Addr = Addr;
+    } else {
+        /* Different reference sizes: use a vector type if possible, else an
+         * int-N spanning the state. Addr is left null for PromoteState. */
+        intptr_t StateSize = State.End - State.Start;
+        Type *VecTy = TryVectorState(State, Ty);
+        StateMap.Ty = VecTy ? VecTy
+                            : Type::getIntNTy(Ty->getContext(), StateSize * 8);
+        StateMap.Addr = nullptr;
+    }
+    StateMap.State = &State;
+    StateMap.hasLoad = hasLoad;
+    StateMap.hasStore = hasStore;
+}
+
+/* Analyze instructions in a Function that access CPU states. */
+void StateMapper::AnalyzeState(Function &F)
+{
+    /* Collect instructions (load/store/call) that access CPU states.
+     * Loads/stores that access guest memory or are tagged with volatile
+     * (e.g., accessing the states: %pc and %tcg_exit_req) are ignored. */
+
+    for (auto II = inst_begin(F), EE = inst_end(F); II != EE; ++II) {
+        Instruction *I = &*II;
+        intptr_t Off = 0;   /* set by isLegalState() to the state offset */
+        if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+            if (MDFactory::isGuestMemory(I) || LI->isVolatile())
+                continue;
+
+            if (isLegalState(LI->getPointerOperand(), Off))
+                Analyzer.addStateRef(I, Off);
+        } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+            if (MDFactory::isGuestMemory(I) || SI->isVolatile())
+                continue;
+
+            if (isLegalState(SI->getPointerOperand(), Off))
+                Analyzer.addStateRef(I, Off);
+        } else if (CallInst *CI = dyn_cast<CallInst>(I)) {
+            /* Skip const helper, inlineasm and intrinsic function call. */
+            if (MDFactory::isConst(CI))
+                continue;
+            if (CI->isInlineAsm() || isa<IntrinsicInst>(CI))
+                continue;
+
+            Analyzer.addCall(CI);
+        }
+    }
+
+    /* Ask Analyzer to put state references into groups. */
+    Analyzer.computeState();
+
+    StateList &States = Analyzer.getStateList();
+    if (States.empty())
+        return;
+
+    /* Compute state mapping info: StateMaps[i] describes States[i]. */
+    StateMaps.resize(States.size());
+    for (unsigned i = 0, e = States.size(); i != e; ++i)
+        ComputeStateMap(StateMaps[i], States[i]);
+}
+
+/* Prepare the 'init' block; returns false when no usable CPU pointer. */
+bool StateMapper::init(Function &F)
+{
+    /*
+     * We would like to rearrange the instructions in the 'init' block, in which
+     * gep/cast instructions are in front of other instructions in the block.
+     * For example:
+     *   %0 = getelementptr i8* %cpu, i64 0
+     *   %1 = bitcast i8* %0 to i32*               # gep/cast insns
+     *   --------------------------------------    # precast_pos
+     *   --------------------------------------    # preload_pos
+     *   %2 = load i32, i32* %1                    # the other insns
+     *   br label %entry
+     */
+    CPU = IF->getDefaultCPU(F);
+    if (!CPU || CPU->getParent() != &F.getEntryBlock())
+        return false;
+
+    Instruction *InsertPos = &*std::next(BasicBlock::iterator(CPU));
+    PreLoadPos = new UnreachableInst(CPU->getContext(), InsertPos);
+    PreCastPos = new UnreachableInst(CPU->getContext(), PreLoadPos);
+
+    toErase.push_back(PreLoadPos);   /* both markers are position holders */
+    toErase.push_back(PreCastPos);   /* only, so they are queued in toErase */
+
+    /* Move gep/cast instructions (collected first, then moved). */
+    IVec toMove;
+    BasicBlock *BB = CPU->getParent();
+    for (auto BI = BB->begin(), BE = BB->end(); BI != BE; ++BI) {
+        Instruction *I = &*BI;
+        if (isStatePointer(I))
+            toMove.push_back(I);
+    }
+    for (auto I : toMove)
+        I->moveBefore(PreCastPos);
+
+    return true;
+}
+
+/* StateMappingPass: run a fresh StateMapper over the function and report
+ * whatever its run() returns. */
+bool StateMappingPass::runOnFunction(Function &F)
+{
+    StateMapper Mapper(IF);
+    return Mapper.run(F);
+}
+
+char StateMappingPass::ID = 0;   /* pass identifier object */
+INITIALIZE_PASS(StateMappingPass, "statemap",
+        "Eliminate redundant loads/stores by mapping CPU states", false, false)
+
+FunctionPass *llvm::createStateMappingPass(IRFactory *IF)
+{
+    return new StateMappingPass(IF);   /* allocated with new, returned raw */
+}
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/pmu/arm/arm-events.cpp b/llvm/pmu/arm/arm-events.cpp
new file mode 100644
index 0000000..3da7339
--- /dev/null
+++ b/llvm/pmu/arm/arm-events.cpp
@@ -0,0 +1,42 @@
+/*
+ *  (C) 2018 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include "pmu/pmu-global.h"
+
+namespace pmu {
+
+/* ARMv8 recommended implementation defined event types. 
+ * (copied from linux-4.x/arch/arm64/kernel/perf_event.c) */
+#define ICACHE_MISS_CONFIG (0x01)
+#define MEM_LOADS_CONFIG   (0x06)
+#define MEM_STORES_CONFIG  (0x07)
+
+
+extern EventID PreEvents[PMU_EVENT_MAX];  /* Pre-defined events.   */
+
+static void ARMSetupEventCode()
+{
+#define SetupEvent(_Event,_Config)                     \
+    do { PreEvents[_Event].Type   = PERF_TYPE_RAW;     \
+         PreEvents[_Event].Config = (_Config); } while (0)
+    /* Register each pre-defined event as a raw (PERF_TYPE_RAW) event code. */
+    SetupEvent(PMU_ICACHE_MISSES, ICACHE_MISS_CONFIG);
+    SetupEvent(PMU_MEM_LOADS, MEM_LOADS_CONFIG);
+    SetupEvent(PMU_MEM_STORES, MEM_STORES_CONFIG);
+
+#undef SetupEvent
+}
+
+int ARMInit()
+{
+    ARMSetupEventCode();   /* fills the ARM entries of PreEvents */
+    return PMU_OK;         /* unconditional: the setup returns nothing */
+}
+
+} /* namespace pmu */
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/pmu/pmu-events.cpp b/llvm/pmu/pmu-events.cpp
new file mode 100644
index 0000000..d3f2d08
--- /dev/null
+++ b/llvm/pmu/pmu-events.cpp
@@ -0,0 +1,414 @@
+/*
+ *  (C) 2018 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <algorithm>
+#include <signal.h>
+#include <sys/time.h>
+#include "llvm-soft-perfmon.h"
+#include "pmu/pmu-global.h"
+#include "pmu/pmu-events.h"
+
+
+namespace {
+
+/* Minimal wrapper around a pthread mutex with default attributes. */
+class Mutex {
+    pthread_mutex_t M;
+public:
+    Mutex() { pthread_mutex_init(&M, nullptr); }
+    inline void acquire() { pthread_mutex_lock(&M);    }
+    inline void release() { pthread_mutex_unlock(&M);  }
+    inline bool trylock()  { return pthread_mutex_trylock(&M) == 0; }
+};
+
+class MutexGuard {   /* scoped lock: acquires the mutex on construction */
+    Mutex &M;
+public:
+    MutexGuard(Mutex &M) : M(M) { M.acquire(); }
+    ~MutexGuard() { M.release(); }   /* and releases it on destruction */
+};
+
+}
+
+/*
+ * Performance Monitoring Unit (PMU).
+ */
+namespace pmu {
+
+static Mutex Lock;
+
+SampleList *ReadSampleData(PMUEvent *Event);
+
+/* The timer interrupt handler: drains every active sampling event. */
+void DefaultHandler(int signum, siginfo_t *info, void *data)
+{
+    /* If the thread is signaled while it is currently holding the lock, we
+     * might enter deadlock if we attempt to acquire the lock. Use trylock to
+     * detect such a condition and return from this handler if we cannot
+     * successfully acquire the lock. */
+    if (Lock.trylock() == false)
+        return;
+
+    /* We now hold the lock. Iterate over all sampling events and process
+     * the sample buffer. */
+
+    auto &SampleEvents = EventMgr->SampleEvents;
+    if (SampleEvents.empty()) {
+        Lock.release();
+        return;
+    }
+
+    struct timeval Start, End, Elapse;   /* used only in SPM_HPM mode */
+    if (SP->Mode & SPM_HPM)
+        gettimeofday(&Start, nullptr);
+
+    for (auto I = SampleEvents.begin(), E = SampleEvents.end(); I != E; ++I) {
+        PMUEvent *Event = *I;
+        if (Event->Mode & MODE_SAMPLE) {
+            SampleList *Data = ReadSampleData(Event);
+            if (Data)
+                Event->SampleHandler(Event->Hndl, SampleDataPtr(Data),
+                                     Event->Opaque);
+        }
+    }
+
+    auto &ChangedEvents = EventMgr->ChangedEvents;   /* deferred start/stop */
+    if (!ChangedEvents.empty()) {
+        for (auto *Event : ChangedEvents) {
+            if (Event->State == STATE_GOTO_STOP) {
+                Event->State = STATE_STOP;
+                SampleEvents.remove(Event);
+            } else if (Event->State == STATE_GOTO_START) {
+                Event->State = STATE_START;
+                SampleEvents.push_back(Event);
+            }
+        }
+        ChangedEvents.clear();
+    }
+
+    if (SP->Mode & SPM_HPM) {
+        gettimeofday(&End, nullptr);
+        timersub(&End, &Start, &Elapse);
+        SP->SampleTime += Elapse.tv_sec * 1e6 + Elapse.tv_usec; /* in usec */
+    }
+
+    if (!SampleEvents.empty())
+        EventMgr->EventTimer->Start();   /* re-arm for the next round */
+    Lock.release();
+}
+
+/*
+ * Event Manager
+ */
+EventManager::EventManager()
+{
+    for (unsigned i = 0; i < PMU_MAX_EVENTS; ++i) {
+        Events[i].Hndl = i;   /* an event's handle is its slot index */
+        FreeEvents.push_back(&Events[i]);
+    }
+
+    /* Install DefaultHandler as the handler of the timer signal. */
+    struct sigaction act;
+    memset(&act, 0, sizeof(struct sigaction));
+    act.sa_sigaction = DefaultHandler;
+    act.sa_flags = SA_SIGINFO;
+    sigaction(PMU_SIGNAL_NUM, &act, 0);   /* NOTE(review): result unchecked */
+
+    EventTimer = new Timer(PMU_SIGNAL_NUM, SysConfig.SignalReceiver);
+}
+
+EventManager::~EventManager()
+{
+    EventTimer->Stop();   /* disarm the timer before it is destroyed */
+    delete EventTimer;
+}
+
+/* Map a handle to its event slot, or nullptr when it is out of range. */
+PMUEvent *EventManager::GetEvent(Handle Hndl)
+{
+    if (Hndl < PMU_MAX_EVENTS)
+        return &Events[Hndl];
+    return nullptr;
+}
+
+/* Take a slot off the free list and set it up as a stopped counting event. */
+Handle EventManager::AddEvent(int fd)
+{
+    MutexGuard Locked(Lock);
+
+    if (FreeEvents.empty())
+        return PMU_INVALID_HNDL;
+
+    PMUEvent *Slot = FreeEvents.front();
+    FreeEvents.pop_front();
+
+    Slot->FD.push_back(fd);
+    Slot->Data.Base = nullptr;
+    Slot->Aux.Base = nullptr;
+    Slot->OverflowHandler = nullptr;
+
+    Slot->State = STATE_STOP;
+    Slot->Mode = MODE_COUNTER;
+
+    return Slot->Hndl;
+}
+
+/* Take a slot off the free list and set it up as a stopped sampling event. */
+Handle EventManager::AddSampleEvent(unsigned NumFDs, int *FD, uint64_t DataSize,
+                                    void *Data, uint32_t Mode,
+                                    SampleConfig &Config)
+{
+    MutexGuard Locked(Lock);
+
+    if (FreeEvents.empty())
+        return PMU_INVALID_HNDL;
+
+    PMUEvent *Slot = FreeEvents.front();
+    FreeEvents.pop_front();
+
+    for (int *It = FD, *End = FD + NumFDs; It != End; ++It)
+        Slot->FD.push_back(*It);
+
+    /* Sample buffer as supplied by the caller; the aux buffer is cleared. */
+    Slot->Data.Base = Data;
+    Slot->Data.Size = DataSize;
+    Slot->Data.Prev = 0;
+    Slot->Aux.Base = nullptr;
+    Slot->Aux.Size = 0;
+    Slot->Aux.Prev = 0;
+
+    Slot->Watermark = std::min(Config.Watermark, DataSize);
+    Slot->SampleHandler = Config.SampleHandler;
+    Slot->Opaque = Config.Opaque;
+    Slot->State = STATE_STOP;
+    Slot->Mode = MODE_SAMPLE | Mode;
+    return Slot->Hndl;
+}
+
+/* Notify that an event is started. */
+void EventManager::StartEvent(PMUEvent *Event, bool ShouldLock)
+{
+    if (!ShouldLock) {
+        /* Called from inside the overflow handling, where the sampling list
+         * must not be restructured. Only record the pending transition; the
+         * list itself is fixed up at the end of the overflow handling. */
+        bool Startable = Event->State == STATE_STOP && Event->OverflowHandler;
+        if (Startable) {
+            Event->State = STATE_GOTO_START;
+            ChangedEvents.push_back(Event);
+        }
+        return;
+    }
+
+    MutexGuard Locked(Lock);
+    /* A stopped event joins the sampling list (and arms the timer) only when
+     * the user supplied an overflow handler for it. */
+    if (Event->State == STATE_STOP && Event->OverflowHandler) {
+        SampleEvents.push_back(Event);
+        EventTimer->Start();
+    }
+    Event->State = STATE_START;
+}
+
+/* Notify that an event is stopped. */
+void EventManager::StopEvent(PMUEvent *Event, bool ShouldLock)
+{
+    if (!ShouldLock) {
+        /* Called from inside the overflow handling, where the sampling list
+         * must not be restructured. Only record the pending transition; the
+         * list itself is fixed up at the end of the overflow handling. */
+        bool Stoppable = Event->State == STATE_START && Event->OverflowHandler;
+        if (Stoppable) {
+            Event->State = STATE_GOTO_STOP;
+            ChangedEvents.push_back(Event);
+        }
+        return;
+    }
+
+    MutexGuard Locked(Lock);
+    /* A running sampling event leaves the sampling list; the timer is
+     * stopped once no sampling event remains. */
+    if (Event->State == STATE_START && Event->OverflowHandler) {
+        SampleEvents.remove(Event);
+        if (SampleEvents.empty())
+            EventTimer->Stop();
+    }
+    Event->State = STATE_STOP;
+}
+
+/* Delete an event: forget its descriptors and recycle its slot. */
+void EventManager::DeleteEvent(PMUEvent *Event)
+{
+    MutexGuard Locked(Lock);
+
+    Event->FD.clear();             /* the descriptors are not closed here */
+    FreeEvents.push_back(Event);   /* slot becomes available to Add*Event */
+}
+
+/* Pause sampling: stop the timer if any sampling event is listed. */
+void EventManager::Pause()
+{
+    MutexGuard Locked(Lock);
+    if (!SampleEvents.empty())
+        EventTimer->Stop();
+}
+
+/* Resume sampling: restart the timer if any sampling event is listed. */
+void EventManager::Resume()
+{
+    MutexGuard Locked(Lock);
+    if (!SampleEvents.empty())
+        EventTimer->Start();
+}
+
+/*
+ * Buffer processing
+ */
+static uint8_t *CopyData(uint8_t *Data, uint64_t DataSize, uint64_t Head, uint64_t Tail) {
+    const uint64_t Mask = DataSize - 1;   /* offsets are taken modulo Mask+1 */
+    const uint64_t Len = Head - Tail;
+    const uint64_t From = Tail & Mask;
+    const uint64_t To = Head & Mask;
+    uint8_t *Out = new uint8_t[Len];      /* released by the caller */
+    if (To <= From) {
+        /* Wrapped: copy the part up to the end, then the part from 0. */
+        const uint64_t Upper = DataSize - From;
+        memcpy(Out, Data + From, Upper);
+        memcpy(Out + Upper, Data, To);
+    } else {
+        memcpy(Out, Data + From, Len);
+    }
+    return Out;
+}
+
+/* Decode the pending records of Event's perf ring buffer. Returns nullptr
+ * below the watermark, else a new SampleList of the decoded 64-bit values. */
+SampleList *ReadSampleData(PMUEvent *Event)
+{
+    uint64_t Head = perf_read_data_head(Event->Data.Base);
+    uint64_t Old = Event->Data.Prev;
+    uint64_t Size = Head - Old;
+    uint8_t *Data = (uint8_t *)Event->Data.Base + SysConfig.PageSize;
+    uint64_t DataSize = Event->Data.Size - SysConfig.PageSize;
+    SampleList *OutData = nullptr;
+
+    if (Size < Event->Watermark)
+        return OutData;
+    OutData = new SampleList;
+    if (Size == 0)
+        return OutData;
+
+    /* We failed to keep up with the mmap data: skip ahead to the head. */
+    if (Size > DataSize) {
+        Event->Data.Prev = Head;
+        perf_write_data_tail(Event->Data.Base, Head);
+        return OutData;
+    }
+
+    /* Process a private, linearised copy of the pending bytes. */
+    uint8_t *Buf = CopyData(Data, DataSize, Head, Old);
+    uint8_t *Orig = Buf, *BufEnd = Buf + Size;
+    bool SampleIP = Event->Mode & MODE_SAMPLE_IP;
+    bool ReadFormat = Event->Mode & MODE_SAMPLE_READ;
+    bool ReadGroup = Event->FD.size() > 1;
+
+    while (1) {
+        /* Check if we have enough size for the event header. */
+        if (Buf + sizeof(struct perf_event_header) > BufEnd)
+            break;
+
+        /* Header->size is the size of the whole record, header included.
+         * Stop on a malformed record or one that runs past the buffer. */
+        auto *Header = (struct perf_event_header *)Buf;
+        if (Header->size < sizeof(struct perf_event_header) ||
+            Buf + Header->size > BufEnd)
+            break;
+
+        uint8_t *Next = Buf + Header->size;   /* start of the next record */
+        Buf += sizeof(struct perf_event_header);
+
+        /* Skip this record if it's not a PERF_RECORD_SAMPLE type. */
+        if (Header->type != PERF_RECORD_SAMPLE) {
+            Buf = Next;
+            continue;
+        }
+
+        /* Decode the requested fields in record order. Every 8-byte read is
+         * bounded by Next, so a short record is never read past its end. */
+        if (SampleIP && Buf + 8 <= Next) {     /* if PERF_SAMPLE_IP */
+            OutData->push_back(*(uint64_t *)Buf);
+            Buf += 8;
+        }
+        if (ReadFormat && Buf + 8 <= Next) {   /* if PERF_SAMPLE_READ */
+            uint64_t nr = 1;                   /* a lone event has one value */
+            if (ReadGroup) {
+                nr = *(uint64_t *)Buf;
+                Buf += 8;
+            }
+            for (; nr != 0 && Buf + 8 <= Next; --nr) {
+                OutData->push_back(*(uint64_t *)Buf);
+                Buf += 8;
+            }
+        }
+
+        /* Always resume at the record boundary given by the header. */
+        Buf = Next;
+    }
+
+    delete [] Orig;
+
+    /* We have finished the buffer. Update data tail. */
+    Event->Data.Prev = Head;
+    perf_write_data_tail(Event->Data.Base, Head);
+
+    return OutData;
+}
+
+/*
+ * Timer
+ */
+Timer::Timer(int Signum, int TID)
+{
+    struct sigevent ev;   /* deliver Signum to the thread with id TID */
+    memset(&ev, 0, sizeof(ev));
+    ev.sigev_value.sival_int = 0;
+    ev.sigev_notify = SIGEV_SIGNAL | SIGEV_THREAD_ID;
+    ev.sigev_signo = Signum;
+    ev._sigev_un._tid = TID;
+    timer_create(CLOCK_REALTIME, &ev, &T);   /* NOTE(review): unchecked */
+}
+
+Timer::~Timer()
+{
+    Stop();            /* disarm first */
+    timer_delete(T);   /* then release the timer created in the constructor */
+}
+
+/* Arm a one-shot timer; the period is split so tv_nsec stays below 1e9. */
+void Timer::Start()
+{
+    struct itimerspec Timeout;
+    Timeout.it_interval.tv_sec = 0;
+    Timeout.it_interval.tv_nsec = 0; /* 0 for one-shot timer */
+    Timeout.it_value.tv_sec =  SysConfig.Timeout / 1000000000;
+    Timeout.it_value.tv_nsec = SysConfig.Timeout % 1000000000;
+    timer_settime(T, 0 /* RELATIVE */, &Timeout, NULL);
+}
+
+void Timer::Stop()
+{
+    /* Program an all-zero expiration and interval: the same timer_settime()
+     * call that Start() uses, but with every field of the timeout cleared. */
+    struct itimerspec Disarm;
+    memset(&Disarm, 0, sizeof(Disarm));
+
+    timer_settime(T, 0 /* RELATIVE */, &Disarm, NULL);
+}
+
+} /* namespace pmu */
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/pmu/pmu.cpp b/llvm/pmu/pmu.cpp
new file mode 100644
index 0000000..640997f
--- /dev/null
+++ b/llvm/pmu/pmu.cpp
@@ -0,0 +1,491 @@
+/*
+ *  (C) 2018 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <errno.h>
+#include <sys/mman.h>
+#include "pmu/pmu-global.h"
+#include "pmu/pmu-events.h"
+
+/*
+ * Performance Monitoring Unit (PMU) tools.
+ */
+namespace pmu {
+
+static bool InitOnce;              /* Set by PMU::Init(), cleared by PMU::Finalize(). */
+
+EventManager *EventMgr;            /* Event manager. Allocated in PMU::Init(). */
+GlobalConfig SysConfig;            /* System-wide configuration. */
+EventID PreEvents[PMU_EVENT_MAX];  /* Pre-defined events. Type == -1 marks an unavailable entry. */
+
+
+/* Initialize system-wide configuration.
+ *
+ * Fills SysConfig from Config:
+ *   PageSize        system page size.
+ *   SignalReceiver  Config.SignalReceiver, or getpid() when it is not positive.
+ *   Timeout         Config.Timeout (PMU_TIMER_PERIOD when zero) multiplied by
+ *                   1000; the stored value is in nanoseconds.
+ *   PerfVersion     highest perf_event_attr layout version checked here.
+ *   OSPerfVersion   version whose attribute size matches attr.size after the
+ *                   probe below, or 0 if none matches. */
+static void SetupGlobalConfig(PMUConfig &Config)
+{
+    /* Get page size. */
+    SysConfig.PageSize = getpagesize();
+
+    /* Configure timeout and signal receiver for the timer. */
+    SysConfig.SignalReceiver = Config.SignalReceiver;
+    if (SysConfig.SignalReceiver <= 0)
+        SysConfig.SignalReceiver = getpid();
+
+    SysConfig.Timeout = Config.Timeout;
+    if (SysConfig.Timeout == 0)
+        SysConfig.Timeout = PMU_TIMER_PERIOD;
+
+    SysConfig.Timeout *= 1000; /* nanosecond */
+
+    /* Determine the Linux Perf version used by this tool and the kernel.
+     * We set the last few bytes of the perf_event_attr structure and see the
+     * size field returned from the kernel. */
+
+    SysConfig.PerfVersion = 0;
+    SysConfig.OSPerfVersion = 0;
+
+    struct perf_event_attr attr;
+    perf_attr_init(&attr, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES);
+    attr.aux_watermark = 1;
+    int fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
+
+    /* This open is only a probe and is allowed to fail. Close the descriptor
+     * only if one was actually returned, instead of calling close() on a
+     * negative value. */
+    if (fd >= 0)
+        close(fd);
+
+    /* Each invocation records _Ver as the newest version known to this tool
+     * and, if the attribute size matches that version, as the kernel's. */
+#define CheckPerfVersion(_Ver)                     \
+    do {                                           \
+        SysConfig.PerfVersion = _Ver;              \
+        if (attr.size == PERF_ATTR_SIZE_VER##_Ver) \
+            SysConfig.OSPerfVersion = _Ver;        \
+    } while(0)
+
+    CheckPerfVersion(1);
+    CheckPerfVersion(2);
+    CheckPerfVersion(3);
+    CheckPerfVersion(4);
+    CheckPerfVersion(5);
+
+#undef CheckPerfVersion
+}
+
+/* Initialize pre-defined events.
+ * Every entry of PreEvents is first marked unavailable (Type and Config set
+ * to -1); the generic hardware events are then filled in with
+ * PERF_TYPE_HARDWARE and their perf configuration value. */
+static void SetupDefaultEvent()
+{
+    for (unsigned i = 0; i < PMU_EVENT_MAX; ++i) {
+        PreEvents[i].Type = -1;
+        PreEvents[i].Config = -1;
+    }
+
+    /* Wrapped in do/while so that the two assignments always act as a single
+     * statement. */
+#define SetupEvent(_Event,_Config)                     \
+    do {                                               \
+        PreEvents[_Event].Type   = PERF_TYPE_HARDWARE; \
+        PreEvents[_Event].Config = _Config;            \
+    } while (0)
+
+    /* Basic events. */
+    SetupEvent(PMU_CPU_CYCLES, PERF_COUNT_HW_CPU_CYCLES);
+    SetupEvent(PMU_REF_CPU_CYCLES, PERF_COUNT_HW_REF_CPU_CYCLES);
+    SetupEvent(PMU_INSTRUCTIONS, PERF_COUNT_HW_INSTRUCTIONS);
+    SetupEvent(PMU_LLC_REFERENCES, PERF_COUNT_HW_CACHE_REFERENCES);
+    SetupEvent(PMU_LLC_MISSES, PERF_COUNT_HW_CACHE_MISSES);
+    SetupEvent(PMU_BRANCH_INSTRUCTIONS, PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
+    SetupEvent(PMU_BRANCH_MISSES, PERF_COUNT_HW_BRANCH_MISSES);
+
+    /* Undefine the helper under the name it was defined with so that it does
+     * not leak into the rest of the translation unit. */
+#undef SetupEvent
+}
+
+/* Initialize the PMU module.
+ * Once a call has completed, further calls return PMU_OK without doing
+ * anything until Finalize() is called. On return, Config.PerfVersion and
+ * Config.OSPerfVersion carry the values stored by SetupGlobalConfig().
+ * The return value of the target-specific initializer is ignored.
+ * Always returns PMU_OK. */
+int PMU::Init(PMUConfig &Config)
+{
+    if (InitOnce == true)
+        return PMU_OK;
+
+    /* Set the global configuration. */
+    SetupGlobalConfig(Config);
+
+    /* Initialize pre-defined event codes. */
+    SetupDefaultEvent();
+
+    /* Allocate event manager. */
+    EventMgr = new EventManager;
+
+    /* Initialize target-specific events. */
+#if defined(__i386__) || defined(__x86_64__)
+    X86Init();
+#elif defined(__arm__) || defined (__aarch64__)
+    ARMInit();
+#elif defined(_ARCH_PPC) || defined(_ARCH_PPC64)
+    PPCInit();
+#endif
+
+    Config.PerfVersion = SysConfig.PerfVersion;
+    Config.OSPerfVersion = SysConfig.OSPerfVersion;
+
+    InitOnce = true;
+    return PMU_OK;
+}
+
+/* Finalize the PMU module.
+ * Destroys the event manager allocated by Init() and clears the global
+ * pointer so that it does not keep referring to the freed object. Calling
+ * this without a preceding Init() does nothing. Always returns PMU_OK. */
+int PMU::Finalize(void)
+{
+    if (InitOnce == false)
+        return PMU_OK;
+
+    delete EventMgr;
+    EventMgr = nullptr;
+
+    InitOnce = false;
+    return PMU_OK;
+}
+
+/* Stop the PMU module.
+ * Forwards to Pause() on the global event manager, which is dereferenced
+ * without a check (it is allocated in Init()). Always returns PMU_OK. */
+int PMU::Pause(void)
+{
+    EventMgr->Pause();
+    return PMU_OK;
+}
+
+/* Restart the PMU module.
+ * Forwards to Resume() on the global event manager, which is dereferenced
+ * without a check (it is allocated in Init()). Always returns PMU_OK. */
+int PMU::Resume(void)
+{
+    EventMgr->Resume();
+    return PMU_OK;
+}
+
+/* Start a counting/sampling/tracing event.
+ * Returns PMU_EINVAL if the manager has no event for Hndl, PMU_EEVENT if
+ * perf_event_start() fails, and PMU_OK otherwise. */
+int PMU::Start(Handle Hndl)
+{
+    auto Target = EventMgr->GetEvent(Hndl);
+    if (!Target)
+        return PMU_EINVAL;
+
+    /* The manager is only notified once the perf call has succeeded. */
+    const auto RC = perf_event_start(Target->getFD());
+    if (RC != 0)
+        return PMU_EEVENT;
+
+    EventMgr->StartEvent(Target);
+    return PMU_OK;
+}
+
+/* Stop a counting/sampling/tracing event.
+ * Returns PMU_EINVAL if the manager has no event for Hndl, PMU_EEVENT if
+ * perf_event_stop() fails, and PMU_OK otherwise. */
+int PMU::Stop(Handle Hndl)
+{
+    auto Target = EventMgr->GetEvent(Hndl);
+    if (!Target)
+        return PMU_EINVAL;
+
+    /* The manager is only notified once the perf call has succeeded. */
+    const auto RC = perf_event_stop(Target->getFD());
+    if (RC != 0)
+        return PMU_EEVENT;
+
+    EventMgr->StopEvent(Target);
+    return PMU_OK;
+}
+
+/* Reset the hardware counter.
+ * Returns PMU_EINVAL if the manager has no event for Hndl, PMU_EEVENT if
+ * perf_event_reset() fails, and PMU_OK otherwise. */
+int PMU::Reset(Handle Hndl)
+{
+    auto Target = EventMgr->GetEvent(Hndl);
+    if (!Target)
+        return PMU_EINVAL;
+
+    const bool Failed = (perf_event_reset(Target->getFD()) != 0);
+    if (Failed)
+        return PMU_EEVENT;
+    return PMU_OK;
+}
+
+/* Remove an event.
+ * Stops the event if it is not already in STATE_STOP, then stops every
+ * descriptor of the group, unmaps the data and aux buffers (when present),
+ * closes all descriptors and finally deletes the event from the manager.
+ * Returns PMU_EINVAL if the manager has no event for Hndl, the error from
+ * Stop() if the event cannot be stopped (nothing is released in that case),
+ * and PMU_OK otherwise. */
+int PMU::Cleanup(Handle Hndl)
+{
+    auto Event = EventMgr->GetEvent(Hndl);
+    if (!Event)
+        return PMU_EINVAL;
+
+    /* Do stop the event if the user hasn't called it. */
+    if (Event->State != STATE_STOP) {
+        int EC = Stop(Hndl);
+        if (EC != PMU_OK)
+            return EC;
+    }
+
+    /* At this point, this event has been removed from the sampling list and we
+     * no longer get overflow handling (if this is a sampling event). We are
+     * now able to release all resources. */
+
+    /* Stop all events in a group. */
+    for (auto fd : Event->FD)
+        perf_event_stop(fd);
+
+    /* Release allocated buffers. */
+    if (Event->Data.Base)
+        munmap(Event->Data.Base, Event->Data.Size);
+    if (Event->Aux.Base)
+        munmap(Event->Aux.Base, Event->Aux.Size);
+
+    for (auto fd : Event->FD)
+        close(fd);
+
+    EventMgr->DeleteEvent(Event);
+    return PMU_OK;
+}
+
+/* Start/stop a sampling/tracing event without acquiring a lock.
+ * Note that these two functions should only be used within the overflow
+ * handler. Since the overflow handling is already in a locked section,
+ * acquiring a lock is not required.
+ * Both return PMU_EINVAL when the manager has no event for Hndl or the
+ * event's Mode has MODE_COUNTER set, PMU_EEVENT if the perf call fails, and
+ * PMU_OK otherwise. */
+int PMU::StartUnlocked(Handle Hndl)
+{
+    auto Event = EventMgr->GetEvent(Hndl);
+    if (!Event)
+        return PMU_EINVAL;
+    if (Event->Mode & MODE_COUNTER)
+        return PMU_EINVAL;
+
+    if (perf_event_start(Event->getFD()) != 0)
+        return PMU_EEVENT;
+
+    EventMgr->StartEvent(Event, false);  /* false: presumably "do not take the lock" -- TODO confirm */
+
+    return PMU_OK;
+}
+
+int PMU::StopUnlocked(Handle Hndl)
+{
+    auto Event = EventMgr->GetEvent(Hndl);
+    if (!Event)
+        return PMU_EINVAL;              /* no event for this handle */
+    if (Event->Mode & MODE_COUNTER)
+        return PMU_EINVAL;              /* events with MODE_COUNTER are rejected here */
+
+    if (perf_event_stop(Event->getFD()) != 0)
+        return PMU_EEVENT;
+
+    EventMgr->StopEvent(Event, false);  /* false: presumably "do not take the lock" -- TODO confirm */
+
+    return PMU_OK;
+}
+
+/* Open an event using the pre-defined event code.
+ * On success Hndl receives the new handle and PMU_OK is returned. On failure
+ * Hndl is PMU_INVALID_HNDL and the result is PMU_EINVAL (code out of range),
+ * PMU_ENOEVENT (no pre-defined event configured for the code), the value
+ * ErrorCode() derives from errno after a failed open, or PMU_ENOMEM (the
+ * manager returned no handle; the descriptor is closed again). */
+int PMU::CreateEvent(unsigned EventCode, Handle &Hndl)
+{
+    Hndl = PMU_INVALID_HNDL;
+
+    if (EventCode >= PMU_EVENT_MAX)
+        return PMU_EINVAL;
+    if (PreEvents[EventCode].Type == -1)
+        return PMU_ENOEVENT;
+
+    struct perf_event_attr Attr;
+    perf_attr_init(&Attr, PreEvents[EventCode].Type, PreEvents[EventCode].Config);
+
+    int fd = sys_perf_event_open(&Attr, 0, -1, -1, 0);
+    if (fd < 0)
+        return ErrorCode(errno);
+
+    Hndl = EventMgr->AddEvent(fd);
+    if (Hndl != PMU_INVALID_HNDL)
+        return PMU_OK;
+
+    /* Registration failed: do not leak the descriptor. */
+    close(fd);
+    return PMU_ENOMEM;
+}
+
+/* Open an event using the raw event number and umask value.
+ * The raw event code is computed as (RawEvent | (Umask << 8)).
+ * On success Hndl receives the new handle and PMU_OK is returned. On failure
+ * Hndl is PMU_INVALID_HNDL and the result is the value ErrorCode() derives
+ * from errno after a failed open, or PMU_ENOMEM (the manager returned no
+ * handle; the descriptor is closed again). */
+int PMU::CreateRawEvent(unsigned RawEvent, unsigned Umask, Handle &Hndl)
+{
+    Hndl = PMU_INVALID_HNDL;
+
+    struct perf_event_attr Attr;
+    perf_attr_init(&Attr, PERF_TYPE_RAW, RawEvent | (Umask << 8));
+
+    int fd = sys_perf_event_open(&Attr, 0, -1, -1, 0);
+    if (fd < 0)
+        return ErrorCode(errno);
+
+    Hndl = EventMgr->AddEvent(fd);
+    if (Hndl != PMU_INVALID_HNDL)
+        return PMU_OK;
+
+    /* Registration failed: do not leak the descriptor. */
+    close(fd);
+    return PMU_ENOMEM;
+}
+
+/* Open a sampling event, with the 1st EventCode as the interrupt event.
+ * The sample data will be recorded in a vector of type 'uint64_t'.
+ * The following vector shows the data format of sampling with N events:
+ *     { pc, val1, val2, ..., valN,      # 1st sample
+ *       ...
+ *       pc, val1, val2, ..., valN };    # nth sample
+ *
+ * Note that ownership of the output vector is transferred to the user.
+ * It is the user's responsibility to free the resource of the vector.
+ *
+ * Config.NumPages == 0 selects PMU_SAMPLE_PAGES and a Config.Period below
+ * 1000 selects PMU_SAMPLE_PERIOD.
+ *
+ * Returns PMU_OK and stores the new handle in Hndl on success. On failure
+ * Hndl is PMU_INVALID_HNDL, every descriptor opened so far is closed, and the
+ * result is PMU_EINVAL (bad event count, page count or event code),
+ * PMU_ENOEVENT (no pre-defined event configured for a code), the value
+ * ErrorCode() derives from errno after a failed open, or PMU_ENOMEM (mapping
+ * the buffer or registering the event failed). */
+int PMU::CreateSampleEvent(SampleConfig &Config, Handle &Hndl)
+{
+    unsigned i, NumEvents = Config.NumEvents;
+    unsigned NumPages = Config.NumPages;
+    uint64_t Period = Config.Period;
+    int fds[PMU_GROUP_EVENTS], EC = PMU_ENOMEM;
+    uint64_t DataSize;
+    void *Data;
+
+    Hndl = PMU_INVALID_HNDL;
+
+    if (NumPages == 0)
+        NumPages = PMU_SAMPLE_PAGES;
+    if (Period < 1e3)
+        Period = PMU_SAMPLE_PERIOD;
+
+    if (NumEvents == 0 || NumEvents > PMU_GROUP_EVENTS || !isPowerOf2(NumPages))
+        return PMU_EINVAL;
+
+    /* Check event codes. */
+    for (i = 0; i < NumEvents; ++i) {
+        unsigned EventCode = Config.EventCode[i];
+        if (EventCode >= PMU_EVENT_MAX)
+            return PMU_EINVAL;
+        if (PreEvents[EventCode].Type == -1)
+            return PMU_ENOEVENT;
+    }
+
+    /* Open the events. If more than one event is requested, set read_format
+     * to PERF_FORMAT_GROUP. The first descriptor is the group leader: it is
+     * opened with group fd -1 and passed as the group fd of all others. */
+    fds[0] = -1;
+    for (i = 0; i < NumEvents; ++i) {
+        struct perf_event_attr Attr;
+        unsigned EventCode = Config.EventCode[i];
+        perf_attr_init(&Attr, PreEvents[EventCode].Type, PreEvents[EventCode].Config);
+
+        /* Only the leader starts disabled and carries the sampling setup. */
+        Attr.disabled = !i;
+        if (i == 0) {
+            Attr.sample_period = Period;
+            Attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_READ;
+            Attr.read_format = (NumEvents > 1) ? PERF_FORMAT_GROUP : 0;
+        }
+
+        fds[i] = sys_perf_event_open(&Attr, 0, -1, fds[0], 0);
+        if (fds[i] < 0) {
+            EC = ErrorCode(errno);
+            goto failed;
+        }
+    }
+
+    /* Allocate buffer for the sampling data. */
+    DataSize = (1 + NumPages) * SysConfig.PageSize;
+    Data = mmap(nullptr, DataSize, PROT_READ|PROT_WRITE, MAP_SHARED, fds[0], 0);
+    if (Data == MAP_FAILED)
+        goto failed;
+
+    Hndl = EventMgr->AddSampleEvent(NumEvents, fds, DataSize, Data,
+                                    MODE_SAMPLE_IP | MODE_SAMPLE_READ, Config);
+    if (Hndl == PMU_INVALID_HNDL) {
+        munmap(Data, DataSize);
+        goto failed;
+    }
+    return PMU_OK;
+
+failed:
+    /* 'i' is the number of descriptors opened so far. Close all of them,
+     * including the group leader fds[0]. The former 'while (--i)' never
+     * closed fds[0] and wrapped the unsigned index around when the very
+     * first open failed. */
+    while (i-- > 0)
+        close(fds[i]);
+    return EC;
+}
+
+/* Generate an IP histogram using EventCode as the interrupt event.
+ * The IP histogram will be recorded in a vector of type 'uint64_t' with
+ * the format: { pc1, pc2, pc3, ..., pcN }.
+ * Note that ownership of the output vector is transferred to the user.
+ * It is the user's responsibility to free the resource of the vector.
+ *
+ * Config.NumPages == 0 selects PMU_SAMPLE_PAGES and a Config.Period below
+ * 1000 selects PMU_SAMPLE_PERIOD. On failure Hndl is PMU_INVALID_HNDL and
+ * the descriptor (if any) is closed before returning. */
+int PMU::CreateSampleIP(Sample1Config &Config, Handle &Hndl)
+{
+    unsigned Code = Config.EventCode;
+    unsigned Pages = Config.NumPages;
+    uint64_t SamplePeriod = Config.Period;
+
+    Hndl = PMU_INVALID_HNDL;
+
+    /* Substitute defaults for unset or too small parameters. */
+    if (Pages == 0)
+        Pages = PMU_SAMPLE_PAGES;
+    if (SamplePeriod < 1e3)
+        SamplePeriod = PMU_SAMPLE_PERIOD;
+
+    /* Validate the page count and the event. */
+    if (!isPowerOf2(Pages))
+        return PMU_EINVAL;
+    if (Code >= PMU_EVENT_MAX)
+        return PMU_EINVAL;
+    if (PreEvents[Code].Type == -1)
+        return PMU_ENOEVENT;
+
+    struct perf_event_attr Attr;
+    perf_attr_init(&Attr, PreEvents[Code].Type, PreEvents[Code].Config);
+    Attr.disabled = 1;
+    Attr.sample_period = SamplePeriod;
+    Attr.sample_type = PERF_SAMPLE_IP;
+
+    int fd = sys_perf_event_open(&Attr, 0, -1, -1, 0);
+    if (fd < 0)
+        return ErrorCode(errno);
+
+    /* Map the buffer for the sampling data: one extra page on top of the
+     * requested number of pages. */
+    uint64_t MapSize = (1 + Pages) * SysConfig.PageSize;
+    void *Map = mmap(nullptr, MapSize, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+    if (Map == MAP_FAILED) {
+        close(fd);
+        return PMU_ENOMEM;
+    }
+
+    /* Describe this single-event sampler to the manager in the general
+     * sampling configuration format. */
+    SampleConfig SConfig;
+    SConfig.NumEvents = 1;
+    SConfig.EventCode[0] = Config.EventCode;
+    SConfig.NumPages = Pages;
+    SConfig.Period = SamplePeriod;
+    SConfig.Watermark = Config.Watermark;
+    SConfig.SampleHandler = Config.SampleHandler;
+    SConfig.Opaque = Config.Opaque;
+
+    Hndl = EventMgr->AddSampleEvent(1, &fd, MapSize, Map, MODE_SAMPLE_IP, SConfig);
+    if (Hndl == PMU_INVALID_HNDL) {
+        munmap(Map, MapSize);
+        close(fd);
+        return PMU_ENOMEM;
+    }
+    return PMU_OK;
+}
+
+/* Read value from the hardware counter.
+ * Reads one 64-bit value from the event's descriptor into Value. Returns
+ * PMU_EINVAL if the manager has no event for Hndl, PMU_EEVENT if the read
+ * does not deliver exactly eight bytes, and PMU_OK otherwise. */
+int PMU::ReadEvent(Handle Hndl, uint64_t &Value)
+{
+    auto Counter = EventMgr->GetEvent(Hndl);
+    if (!Counter)
+        return PMU_EINVAL;
+
+    const ssize_t Got = read(Counter->getFD(), &Value, sizeof(uint64_t));
+    if (Got == (ssize_t)sizeof(uint64_t))
+        return PMU_OK;
+    return PMU_EEVENT;
+}
+
+/* Convert error code to string.
+ * Returns a pointer to a string literal describing ErrCode; any code not
+ * listed below maps to "Unknown error". */
+const char *PMU::strerror(int ErrCode)
+{
+    switch (ErrCode) {
+        case PMU_OK:       return "Success";
+        case PMU_EINVAL:   return "Invalid argument";
+        case PMU_ENOMEM:   return "Insufficient memory";
+        case PMU_ENOEVENT: return "Pre-defined event not available";
+        case PMU_EEVENT:   return "Hardware event error";
+        case PMU_EPERM:    return "Permission denied";
+        case PMU_EINTER:   return "Internal error";
+        case PMU_EDECODER: return "Decoder error";
+        default:           return "Unknown error";
+    }
+}
+
+} /* namespace pmu */
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/pmu/ppc/ppc-events.cpp b/llvm/pmu/ppc/ppc-events.cpp
new file mode 100644
index 0000000..249de52
--- /dev/null
+++ b/llvm/pmu/ppc/ppc-events.cpp
@@ -0,0 +1,37 @@
+/*
+ *  (C) 2018 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include "pmu/pmu-global.h"
+
+namespace pmu {
+
+#define ICACHE_MISS_CONFIG (0x200fd)   /* PERF_TYPE_RAW config used for PMU_ICACHE_MISSES */
+#define MEM_LOADS_CONFIG   (0x100fc)   /* PERF_TYPE_RAW config used for PMU_MEM_LOADS */
+
+extern EventID PreEvents[PMU_EVENT_MAX];  /* Pre-defined events.   */
+
+/* Register the PowerPC-specific pre-defined events in PreEvents as raw
+ * (PERF_TYPE_RAW) events. */
+static void PPCSetupEventCode()
+{
+    /* Wrapped in do/while so that the two assignments always act as a single
+     * statement. */
+#define SetupEvent(_Event,_Config)                \
+    do {                                          \
+        PreEvents[_Event].Type   = PERF_TYPE_RAW; \
+        PreEvents[_Event].Config = _Config;       \
+    } while (0)
+
+    SetupEvent(PMU_ICACHE_MISSES, ICACHE_MISS_CONFIG);
+    SetupEvent(PMU_MEM_LOADS, MEM_LOADS_CONFIG);
+
+    /* Undefine the helper under the name it was defined with so that it does
+     * not outlive this function. */
+#undef SetupEvent
+}
+
+int PPCInit()
+{
+    PPCSetupEventCode();    /* fill in the PowerPC entries of PreEvents */
+    return PMU_OK;          /* no failure path */
+}
+
+} /* namespace pmu */
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/pmu/x86/x86-events.cpp b/llvm/pmu/x86/x86-events.cpp
new file mode 100644
index 0000000..fe25f70
--- /dev/null
+++ b/llvm/pmu/x86/x86-events.cpp
@@ -0,0 +1,41 @@
+/*
+ *  (C) 2018 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include "pmu/pmu-global.h"
+
+namespace pmu {
+
+#define ICACHE_HIT_CONFIG  (0x83 | (0x1 << 8))   /* skylake/event=0x83,umask=0x1/ */
+#define ICACHE_MISS_CONFIG (0x83 | (0x2 << 8))   /* skylake/event=0x83,umask=0x2/ */
+#define MEM_LOADS_CONFIG   (0xd0 | (0x81 << 8 )) /* skylake/event=0xd0,umask=0x81/ */
+#define MEM_STORES_CONFIG  (0xd0 | (0x82 << 8 )) /* skylake/event=0xd0,umask=0x82/ */
+
+extern EventID PreEvents[PMU_EVENT_MAX];  /* Pre-defined events.   */
+
+/* Register the x86-specific pre-defined events in PreEvents as raw
+ * (PERF_TYPE_RAW) events. */
+static void X86SetupEventCode()
+{
+    /* Wrapped in do/while so that the two assignments always act as a single
+     * statement. */
+#define SetupEvent(_Event,_Config)                \
+    do {                                          \
+        PreEvents[_Event].Type   = PERF_TYPE_RAW; \
+        PreEvents[_Event].Config = _Config;       \
+    } while (0)
+
+    SetupEvent(PMU_ICACHE_HITS, ICACHE_HIT_CONFIG);
+    SetupEvent(PMU_ICACHE_MISSES, ICACHE_MISS_CONFIG);
+    SetupEvent(PMU_MEM_LOADS, MEM_LOADS_CONFIG);
+    SetupEvent(PMU_MEM_STORES, MEM_STORES_CONFIG);
+
+    /* Undefine the helper under the name it was defined with so that it does
+     * not outlive this function. */
+#undef SetupEvent
+}
+
+int X86Init()
+{
+    X86SetupEventCode();    /* fill in the x86 entries of PreEvents */
+    return PMU_OK;          /* no failure path */
+}
+
+} /* namespace pmu */
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/tracer.cpp b/llvm/tracer.cpp
new file mode 100644
index 0000000..9e37442
--- /dev/null
+++ b/llvm/tracer.cpp
@@ -0,0 +1,365 @@
+/*
+ *  (C) 2016 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ *
+ *   This file implements the trace/region formation algorithm.
+ */
+
+
+#include "utils.h"
+#include "tracer.h"
+#include "llvm-state.h"
+
+#define USE_RELAXED_NET   /* selects the "relaxed NET" branches in isTraceHead() and Predict() */
+
+
+unsigned ProfileThreshold = NET_PROFILE_THRESHOLD;   /* exec_count at which Profile() switches a block to prediction */
+unsigned PredictThreshold = NET_PREDICT_THRESHOLD;   /* number of recorded blocks at which Predict() builds the trace */
+
+static inline void start_trace_profiling(TranslationBlock *tb)
+{
+    /* Turn on trace profiling by jumping to the next instruction.
+     * The offset added to the jump address differs per TCG target (5 on
+     * i386, 4 on ARM/AArch64, 16 on PPC64); presumably the size of the
+     * patched jump sequence -- TODO confirm. On any other target this
+     * function only reads the jump address and patches nothing. */
+    uintptr_t jmp_addr = tb_get_jmp_entry(tb);
+#if defined(TCG_TARGET_I386)
+    patch_jmp(jmp_addr, jmp_addr + 5);
+#elif defined(TCG_TARGET_ARM) || defined(TCG_TARGET_AARCH64)
+    patch_jmp(jmp_addr, jmp_addr + 4);
+#elif defined(TCG_TARGET_PPC64)
+    patch_jmp(jmp_addr, jmp_addr + 16);
+#endif
+}
+
+static inline void copy_image(CPUArchState *env, TranslationBlock *tb)
+{
+#if defined(CONFIG_LLVM) && defined(CONFIG_SOFTMMU)
+    char *p = new char[tb->size];               /* stored in tb->image below; not freed here */
+    for (int i = 0, e = tb->size; i != e; ++i)
+        p[i] = cpu_ldub_code(env, tb->pc + i);  /* one byte at a time, tb->size bytes starting at tb->pc */
+    tb->image = (void *)p;
+#endif                                          /* otherwise this function does nothing */
+}
+
+static inline void tracer_handle_chaining(uintptr_t next_tb, TranslationBlock *tb)
+{
+#if defined(CONFIG_LLVM)
+    llvm_handle_chaining(next_tb, tb);  /* with CONFIG_LLVM, chaining is delegated entirely */
+#else
+    /* see if we can patch the calling TB. When the TB spans two pages, we
+     * cannot safely do a direct jump. */
+    if (next_tb != 0 && tb->page_addr[1] == (tb_page_addr_t)-1
+        && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
+        /* next_tb packs the calling TB pointer with an exit index in the TB_EXIT_MASK bits. */
+        tb_add_jump((TranslationBlock *)(next_tb & ~TB_EXIT_MASK),
+                    next_tb & TB_EXIT_MASK, tb);
+    }
+#endif
+}
+
+
+#if defined(CONFIG_LLVM)   /* real hooks into the LLVM translator and the perf monitors */
+#include "llvm.h"
+#include "llvm-soft-perfmon.h"
+#include "llvm-hard-perfmon.h"
+static inline void OptimizeBlock(CPUArchState *env, TranslationBlock *TB)
+{
+    auto Request = OptimizationInfo::CreateRequest(TB);     /* request for a single block */
+    LLVMEnv::OptimizeBlock(env, std::move(Request));
+}
+static inline void OptimizeTrace(CPUArchState *env, NETTracer::TBVec &TBs,
+                                 int LoopHeadIdx)
+{
+    auto Request = OptimizationInfo::CreateRequest(TBs, LoopHeadIdx);   /* request for a block sequence */
+    LLVMEnv::OptimizeTrace(env, std::move(Request));
+}
+static inline void RegisterThread(CPUArchState *env, BaseTracer *tracer)
+{
+    if (ENV_GET_CPU(env)->cpu_index < 0)    /* CPUs with a negative index are not registered */
+        return;
+    HP->RegisterThread(tracer);
+}
+static inline void UnregisterThread(CPUArchState *env, BaseTracer *tracer)
+{
+    if (ENV_GET_CPU(env)->cpu_index < 0)    /* mirrors the check in RegisterThread() */
+        return;
+    HP->UnregisterThread(tracer);
+    SP->NumTraceExits += env->num_trace_exits;  /* add this env's trace-exit count to the SP total */
+}
+static inline void NotifyCacheEnter(CPUArchState *env)
+{
+    if (ENV_GET_CPU(env)->cpu_index < 0)
+        return;
+    HP->NotifyCacheEnter(cpu_get_tracer(env));
+}
+static inline void NotifyCacheLeave(CPUArchState *env)
+{
+    if (ENV_GET_CPU(env)->cpu_index < 0)
+        return;
+    HP->NotifyCacheLeave(cpu_get_tracer(env));
+}
+#else   /* !CONFIG_LLVM: empty stand-ins with the same signatures */
+static inline void OptimizeBlock(CPUArchState *, TranslationBlock *) {}
+static inline void OptimizeTrace(CPUArchState *, NETTracer::TBVec &, int) {}
+static inline void RegisterThread(CPUArchState *, BaseTracer *) {}
+static inline void UnregisterThread(CPUArchState *, BaseTracer *) {}
+static inline void NotifyCacheEnter(CPUArchState *) {}
+static inline void NotifyCacheLeave(CPUArchState *) {}
+#endif  /* CONFIG_LLVM */
+
+
+/*
+ * BaseTracer
+ */
+BaseTracer *BaseTracer::CreateTracer(CPUArchState *env)
+{
+#if defined(CONFIG_LLVM)
+    switch (LLVMEnv::TransMode) {       /* pick the tracer class matching the translation mode */
+        case TRANS_MODE_NONE:
+            return new BaseTracer(env);
+        case TRANS_MODE_BLOCK:
+            return new SingleBlockTracer(env);
+        case TRANS_MODE_HYBRIDS:
+            return new NETTracer(env, TRANS_MODE_HYBRIDS);
+        case TRANS_MODE_HYBRIDM:
+            return new NETTracer(env, TRANS_MODE_HYBRIDM);
+        default:
+            break;                      /* any other mode: fall back to the base tracer below */
+    }
+#endif
+    return new BaseTracer(env);         /* also the only result without CONFIG_LLVM */
+}
+
+void BaseTracer::DeleteTracer(CPUArchState *env)
+{
+    auto Tracer = cpu_get_tracer(env);  /* the tracer currently attached to env */
+    if (Tracer) {
+        delete Tracer;
+        Tracer = nullptr;   /* NOTE(review): clears only this local; whatever cpu_get_tracer() reads is not reset here -- confirm callers never reuse it */
+    }
+}
+
+
+/*
+ * SingleBlockTracer
+ */
+SingleBlockTracer::SingleBlockTracer(CPUArchState *env) : BaseTracer(env)
+{
+    if (tracer_mode == TRANS_MODE_NONE)     /* only claim the global mode while it is still unset */
+        tracer_mode = TRANS_MODE_BLOCK;
+}
+
+void SingleBlockTracer::Record(uintptr_t next_tb, TranslationBlock *tb)
+{
+    /* Optimize the block if we see this block for the first time.
+     * next_tb is not used by this tracer. */
+    if (update_tb_mode(tb, BLOCK_NONE, BLOCK_ACTIVE))
+        OptimizeBlock(Env, tb);
+    TB = tb;    /* remember the block passed to the most recent call */
+}
+
+
+/*
+ * NETTracer
+ */
+NETTracer::NETTracer(CPUArchState *env, int Mode) : BaseTracer(env)
+{
+    if (tracer_mode == TRANS_MODE_NONE)     /* only claim the global mode while it is still unset */
+        tracer_mode = Mode;
+    RegisterThread(Env, this);              /* paired with UnregisterThread() in the destructor */
+}
+
+NETTracer::~NETTracer()
+{
+    UnregisterThread(Env, this);    /* undo the RegisterThread() call made by the constructor */
+}
+
+void NETTracer::Reset()
+{
+    TBs.clear();                        /* drop the blocks recorded by Predict() */
+    Env->start_trace_prediction = 0;    /* clear the flag that Profile() sets to 1 */
+}
+
+void NETTracer::Record(uintptr_t next_tb, TranslationBlock *tb)
+{
+    bool NewTB = (tb->mode == BLOCK_NONE);  /* sampled before the promotion attempted below */
+
+    /* Promote tb to the active state before any checks if it is a new tb. */
+    if (update_tb_mode(tb, BLOCK_NONE, BLOCK_ACTIVE)) {
+        tcg_save_state(Env, tb);
+        copy_image(Env, tb);
+    }
+
+    if (isTraceHead(next_tb, tb, NewTB)) {
+        /* Start profiling only if this call performed the ACTIVE -> TRACEHEAD transition. */
+        if (update_tb_mode(tb, BLOCK_ACTIVE, BLOCK_TRACEHEAD))
+            start_trace_profiling(tb);
+    }
+
+    Env->fallthrough = 0;   /* consulted by rule 2 of isTraceHead() on the next call */
+}
+
+/* Determine whether tb is a potential trace head. tb is a trace head if it is
+ * (1) a target of an existing trace exit,
+ * (2) a target of an indirect branch,
+ * (3) (relaxed  NET) a block in a cyclic path (i.e., seen more than once), or
+ *     (original NET) a target of a backward branch.
+ *
+ * next_tb carries the preceding block's pointer with an exit code in its
+ * TB_EXIT_MASK bits, or is 0. NewTB is the caller's observation that tb was
+ * still in BLOCK_NONE mode when it was handed to Record(). */
+bool NETTracer::isTraceHead(uintptr_t next_tb, TranslationBlock *tb, bool NewTB)
+{
+    /* Rule 1: a target of an existing trace exit. */
+    if ((next_tb & TB_EXIT_MASK) == TB_EXIT_LLVM)
+        return true;
+
+    /* Rule 2: a target of an indirect branch.
+     * Here we check 'next_tb == 0', which can cover the cases other than the
+     * indirect branches (e.g., system calls and exceptions). It is fine to
+     * also start trace formation from the successors of these blocks. */
+    if (next_tb == 0 && Env->fallthrough == 0)
+        return true;
+
+#ifdef USE_RELAXED_NET
+    /* Rule 3: a block in a cyclic path (i.e., seen more than once). */
+    if (!NewTB)
+        return true;
+#else
+    /* Rule 3: a target of a backward branch. */
+    if (next_tb != 0) {
+        TranslationBlock *pred = (TranslationBlock *)(next_tb & ~TB_EXIT_MASK);
+        if (tb->pc <= pred->pc)
+            return true;
+    }
+#endif
+    return false;
+}
+
+void NETTracer::Profile(TranslationBlock *tb)
+{
+    /* Count this execution; only the call whose count equals the threshold proceeds. */
+    if (Atomic<uint32_t>::inc_return(&tb->exec_count) != ProfileThreshold)
+        return;
+
+#if 0
+    /* If the execution is already in the prediction mode, process the
+     * previously recorded trace. */
+    if (Env->start_trace_prediction && !TBs.empty()) {
+        OptimizeTrace(Env, TBs, -1);
+        Reset();
+    }
+#endif
+
+    /* We reach a profile threshold, stop trace profiling and start trace tail
+     * prediction. The profiling is disabled by setting the jump directly to
+     * trace prediction stub. */
+    patch_jmp(tb_get_jmp_entry(tb), tb_get_jmp_next(tb));
+    Env->start_trace_prediction = 1;
+}
+
+void NETTracer::Predict(TranslationBlock *tb)
+{
+    /* The trace prediction will terminate if a cyclic path is detected.
+     * (i.e., current tb has existed in the tracing buffer either in the
+     * head or middle of the buffer.) */
+    int LoopHeadIdx = -1;
+
+#if defined(CONFIG_LLVM)
+    /* Skip this trace if the next block is an annotated loop head and
+     * is going to be included in the middle of a trace. */
+    if (!TBs.empty() && TBs[0] != tb &&
+        llvm_has_annotation(tb->pc, ANNOTATION_LOOP)) {
+        goto trace_building;
+    }
+#endif
+
+#if defined(USE_TRACETREE_ONLY)
+    /* We would like to have a straight-line or O-shape trace.
+     * (the 6-shape trace is excluded) */
+    if (!TBs.empty() && tb == TBs[0]) {
+        LoopHeadIdx = 0;
+        goto trace_building;
+    }
+#elif defined(USE_RELAXED_NET)
+    /* Find any cyclic path in recently recorded blocks. */
+    for (int i = 0, e = TBs.size(); i != e; ++i) {
+        if (tb == TBs[i]) {
+            LoopHeadIdx = i;
+            goto trace_building;
+        }
+    }
+#else
+    if (!TBs.empty()) {
+        if (tb == TBs[0]) {
+            /* Cyclic path. */
+            LoopHeadIdx = 0;
+            goto trace_building;
+        }
+        if (tb->pc <= TBs[TBs.size() - 1]->pc) {
+            /* Backward branch. */
+            goto trace_building;
+        }
+    }
+#endif
+
+    TBs.push_back(tb);
+
+    /* Stop if the maximum prediction length is reached. */
+    if (TBs.size() == PredictThreshold)
+        goto trace_building;
+
+    return;
+
+trace_building:
+    /* If the trace is a loop with a branch to the middle of the loop body,
+     * we form two sub-traces: (1) the loop starting from the loopback to
+     * the end of the trace and (2) the original trace. */
+    /* NOTE: We want to find more traces so the original trace is included. */
+
+    if (LoopHeadIdx > 0) {
+        /* Loopback at the middle. The sub-trace (1) is optimized first. */
+        TBVec Loop(TBs.begin() + LoopHeadIdx, TBs.end());
+        update_tb_mode(Loop[0], BLOCK_ACTIVE, BLOCK_TRACEHEAD);
+        OptimizeTrace(Env, Loop, 0);
+    }
+    OptimizeTrace(Env, TBs, LoopHeadIdx);
+
+    Reset();
+}
+
+
+/* The following routines implement the C interface for QEMU. */
+extern "C" {
+
+int tracer_mode = TRANS_MODE_NONE;  /* assigned by the tracer constructors while it is still TRANS_MODE_NONE */
+
+void tracer_reset(CPUArchState *env)
+{
+    auto Tracer = cpu_get_tracer(env);
+    Tracer->Reset();    /* NOTE(review): dereferenced without a null check */
+}
+
+/* This routine is called when QEMU is going to leave the dispatcher and enter
+ * the code cache to execute block code `tb'. Here, we determine whether tb is
+ * a potential trace head and should perform trace formation. */
+void tracer_exec_tb(CPUArchState *env, uintptr_t next_tb, TranslationBlock *tb)
+{
+    auto Tracer = cpu_get_tracer(env);
+    Tracer->Record(next_tb, tb);
+
+    tracer_handle_chaining(next_tb, tb);    /* chaining is handled after the block has been recorded */
+}
+
+
+/* Helper function to perform trace profiling. */
+void helper_NET_profile(CPUArchState *env, int id)
+{
+    auto &Tracer = getNETTracer(env);
+    Tracer.Profile(&tbs[id]);   /* id is an index into the tbs array */
+}
+
+/* Helper function to perform trace prediction. */
+void helper_NET_predict(CPUArchState *env, int id)
+{
+    auto &Tracer = getNETTracer(env);
+    Tracer.Predict(&tbs[id]);   /* id is an index into the tbs array */
+}
+
+} /* extern "C" */
+
+
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/llvm/utils.cpp b/llvm/utils.cpp
new file mode 100644
index 0000000..69e77af
--- /dev/null
+++ b/llvm/utils.cpp
@@ -0,0 +1,223 @@
+/*
+ *  (C) 2016 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <unistd.h>
+#include <sys/syscall.h>
+#include "utils.h"
+
+
+/* Remove a CFG starting from Root.
+ * Every node reachable from Root through getChildren() is collected exactly
+ * once, so nodes reached along several paths are not freed twice, and the
+ * collected nodes are then deleted. */
+void GraphNode::DeleteCFG(GraphNode *Root)
+{
+    NodeVec Worklist;
+    NodeSet Reachable;
+
+    /* Phase 1: walk the graph with an explicit stack. */
+    Worklist.push_back(Root);
+    while (!Worklist.empty()) {
+        GraphNode *Node = Worklist.back();
+        Worklist.pop_back();
+
+        if (Reachable.find(Node) != Reachable.end())
+            continue;   /* already collected */
+
+        Reachable.insert(Node);
+        for (auto Child : Node->getChildren())
+            Worklist.push_back(Child);
+    }
+
+    /* Phase 2: free each collected node once. */
+    for (auto Node : Reachable)
+        delete Node;
+}
+
+#ifdef LOCK_FREE
+/*  Lock-free FIFO queue algorithm of Michael and Scott (MS-queue).
+ *  The code is based on the paper published in PODC'96:
+ *      Maged M. Michael and Michael L. Scott, "Simple, Fast, and Practical
+ *      Non-Blocking and Blocking Concurrent Queue Algorithms," Proc. 15th ACM
+ *      Symp. on Principles of Distributed Computing, pages 267-275, 1996.
+ */
+static inline char CAS2(volatile struct pointer_t *ptr,
+                        struct pointer_t _old,
+                        struct pointer_t _new)
+{
+    char flag = 0;  /* result: non-zero if the swap happened; stays 0 on targets without a branch below */
+
+#if defined(__i386__)   /* 8-byte compare-and-exchange of the {ptr, count} pair */
+    asm volatile("lock; cmpxchg8b %0; setz %1;"
+        : "=m" (*ptr), "=q" (flag)
+        : "d" (_old.count), "a" (_old.ptr), "c" (_new.count), "b" (_new.ptr)
+        : "memory", "cc");
+#elif defined(__x86_64__)   /* 16-byte compare-and-exchange of the {ptr, count} pair */
+    asm volatile("lock; cmpxchg16b %0; setz %1;"
+        : "=m" (*ptr), "=q" (flag)
+        : "d" (_old.count), "a" (_old.ptr), "c" (_new.count), "b" (_new.ptr)
+        : "memory", "cc");
+#elif defined(__arm__)  /* NOTE(review): this path compares and stores a single word addressed through ptr->ptr and never uses count -- confirm it matches the x86 semantics */
+    unsigned long oldval, res;
+    asm volatile("ldrex  %1, [%3]\n"
+                 "mov    %0, #0\n"
+                 "teq    %1, %4\n"
+                 "strexeq %0, %5, [%3]\n"
+        : "=&r" (res), "=&r" (oldval), "+Qo" (*ptr->ptr)
+        : "r" (ptr->ptr), "Ir" (_old.ptr), "r" (_new.ptr)
+        : "cc");
+    flag = !res;    /* res stays 0 unless the conditional store ran and reported failure */
+#endif
+    return flag;
+}
+
+Queue::Queue()
+{
+    node_t *dummy = new_node(nullptr);      /* the queue always holds one dummy node */
+    Q.head.ptr = Q.tail.ptr = dummy;        /* empty queue: head and tail both point at it */
+    Q.head.count = Q.tail.count = 0;        /* version counters start at zero */
+}
+
+void Queue::enqueue(void *data)
+{
+    pointer_t tail, next, insert;
+    node_t *node = new_node(data);
+    insert.ptr = node;
+
+    for (;;) {
+        tail = Q.tail;              /* snapshot of Tail */
+        next = tail.ptr->next;      /* and of the link behind it */
+        
+        /* If Tail is consistent (addresses and versions are not changed),
+           continue to enqueue. */
+        if (CAS2(&Q.tail, tail, Q.tail)) {
+            /* If Tail is pointing to the last node, continue to enqueue.
+               Otherwise, try to advance Tail because it might be pointing
+               to the second last node. */
+            if (next.ptr == nullptr) {  /* Last node */
+                /* Try to insert node at the end of the linked list.
+                   if it succeeds, exit the loop. */
+                insert.count = next.count + 1;
+                if (CAS2(&(tail.ptr->next), next, insert))
+                    break;
+            } else {
+                next.count = tail.count + 1;
+                CAS2(&Q.tail, tail, next);
+            }
+        }
+    }
+    
+    /* Enqueue is done, try to swing Tail to the inserted node.
+       The result is ignored. */
+    insert.count = tail.count + 1;
+    CAS2(&Q.tail, tail, insert);
+}
+
+void *Queue::dequeue()
+{
+    pointer_t head, tail, next;
+    void *data;
+
+    for (;;) {
+        head = Q.head;
+        tail = Q.tail;
+        next = head.ptr->next;
+        
+        /* If Head is consistent (addresses and versions are not changed),
+           continue to dequeue. */
+        if (CAS2(&Q.head, head, Q.head)) {
+            /* If Queue is empty, stop dequeueing. If Tail is falling behind,
+               try to advance it. Otherwise, continue to dequeue. */
+            if (head.ptr == tail.ptr) {
+                if (next.ptr == nullptr) /* Queue is empty */
+                    return nullptr;
+                
+                /* Tail is falling behind, try to advance it. */
+                next.count = tail.count + 1;
+                CAS2(&Q.tail, tail, next);
+            } else {
+                /* We must read value before CAS, otherwise another dequeue
+                   might free the next node. */
+                data = next.ptr->value;
+                next.count = head.count + 1;
+                if (CAS2(&Q.head, head, next))
+                    break;
+            }
+        }
+    }
+    
+    /* Dequeue succeeded. It is safe to free the dummy node.
+       Node pointed by Head becomes the new dummy node */
+    delete_node(head.ptr);
+
+    return data;
+}
+#else
+Queue::Queue(void)
+{
+    node_t *dummy = new node_t(nullptr);    /* the queue always holds one dummy node in front */
+    Q.head = Q.tail = dummy;                /* empty queue: head and tail both point at it */
+    pthread_mutex_init(&lock, nullptr);     /* default attributes; the result is not checked */
+}
+
+void Queue::enqueue(void *data)
+{
+    node_t *node = new node_t(data);    /* allocated before taking the lock */
+
+    pthread_mutex_lock(&lock);
+    Q.tail->next = node;                /* link behind the current last node */
+    Q.tail = node;                      /* and make it the new tail */
+    pthread_mutex_unlock(&lock);
+}
+
+/* Remove the oldest element and return its data, or nullptr if the queue is
+ * empty. The node after the dummy head holds the value; it becomes the new
+ * dummy head and the old one is freed after the lock has been released. */
+void *Queue::dequeue()
+{
+    pthread_mutex_lock(&lock);
+
+    node_t *old_dummy = Q.head;
+    node_t *first = old_dummy->next;
+    if (first == nullptr) {
+        /* Only the dummy node is left: nothing to hand out. */
+        pthread_mutex_unlock(&lock);
+        return nullptr;
+    }
+
+    void *payload = first->value;
+    Q.head = first;
+    pthread_mutex_unlock(&lock);
+
+    delete old_dummy;
+    return payload;
+}
+#endif
+
+/* Get the thread ID.
+ * Issues the gettid system call directly, using whichever of SYS_gettid or
+ * __NR_gettid is defined; returns -1 when neither is.
+ * NOTE(review): glibc 2.30 and later declare their own gettid() in
+ * <unistd.h>; confirm this definition does not clash on newer toolchains. */
+pid_t gettid()
+{
+#ifdef SYS_gettid
+    return (pid_t)syscall(SYS_gettid);
+#elif defined(__NR_gettid)
+    return (pid_t)syscall(__NR_gettid);
+#else
+    return -1;
+#endif
+}
+
+
+/* Patch a direct jump from patch_addr to addr.
+ * The mechanism depends on the host architecture:
+ *   x86/x86-64:  tb_set_jmp_target1() on patch_addr + 1 (presumably skipping
+ *                the jump opcode byte -- TODO confirm);
+ *   AArch64/PPC: tb_set_jmp_target1() on patch_addr itself;
+ *   32-bit ARM:  addr is stored directly into the word at patch_addr.
+ * On any other host this function does nothing. */
+void patch_jmp(volatile uintptr_t patch_addr, volatile uintptr_t addr)
+{
+#if defined(__i386__) || defined(__x86_64__)
+    tb_set_jmp_target1(patch_addr + 1, addr);
+#elif defined(__aarch64__)
+    tb_set_jmp_target1(patch_addr, addr);
+#elif defined(__arm__)
+    *(uintptr_t *)patch_addr = addr;
+#elif defined(_ARCH_PPC) || defined(_ARCH_PPC64)
+    tb_set_jmp_target1(patch_addr, addr);
+#endif
+}
+
+void patch_jmp(volatile uintptr_t patch_addr, volatile void *addr)
+{
+    patch_jmp(patch_addr, (uintptr_t)addr);     /* pointer-target convenience overload of the uintptr_t version */
+}
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
+
diff --git a/llvm/xml/tinyxml2.cpp b/llvm/xml/tinyxml2.cpp
new file mode 100644
index 0000000..354200c
--- /dev/null
+++ b/llvm/xml/tinyxml2.cpp
@@ -0,0 +1,2013 @@
+/*
+Original code by Lee Thomason (www.grinninglizard.com)
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any
+damages arising from the use of this software.
+
+Permission is granted to anyone to use this software for any
+purpose, including commercial applications, and to alter it and
+redistribute it freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must
+not claim that you wrote the original software. If you use this
+software in a product, an acknowledgment in the product documentation
+would be appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and
+must not be misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+*/
+#include "tinyxml2.h"
+
+#include <cstdio>
+#include <cstdlib>
+#include <new>
+#include <cstddef>
+
+#include <fcntl.h>
+using namespace tinyxml2;
+
+// Character constants used by the text normalization and quoting code in
+// this file. LF and CR are short aliases for the longer names.
+static const char LINE_FEED                = (char)0x0a;            // all line endings are normalized to LF
+static const char LF = LINE_FEED;
+static const char CARRIAGE_RETURN        = (char)0x0d;            // CR gets filtered out
+static const char CR = CARRIAGE_RETURN;
+static const char SINGLE_QUOTE            = '\'';
+static const char DOUBLE_QUOTE            = '\"';
+
+// Bunch of unicode info at:
+//        http://www.unicode.org/faq/utf_bom.html
+//    ef bb bf (Microsoft "lead bytes") - designates UTF-8
+
+// The three bytes of the UTF-8 byte order mark, in order of appearance.
+static const unsigned char TIXML_UTF_LEAD_0 = 0xefU;
+static const unsigned char TIXML_UTF_LEAD_1 = 0xbbU;
+static const unsigned char TIXML_UTF_LEAD_2 = 0xbfU;
+
+
+// Destroy a pool-allocated XMLNode: if 'node' is non-null, run its destructor
+// explicitly and hand the storage back to the MemPool recorded in the node.
+// The pool pointer is read before the destructor runs. The argument is
+// expanded several times, so pass a plain variable, not an expression with
+// side effects.
+#define DELETE_NODE( node )    {            \
+    if ( node ) {                        \
+        MemPool* pool = node->memPool;    \
+        node->~XMLNode();                \
+        pool->Free( node );                \
+    }                                    \
+}
+// Same as DELETE_NODE, but for a pool-allocated XMLAttribute.
+#define DELETE_ATTRIBUTE( attrib ) {        \
+    if ( attrib ) {                            \
+        MemPool* pool = attrib->memPool;    \
+        attrib->~XMLAttribute();            \
+        pool->Free( attrib );                \
+    }                                        \
+}
+
+// One named character entity: 'pattern' is the name as it appears between
+// '&' and ';', 'length' is the number of characters in that name, and
+// 'value' is the single character the entity stands for.
+struct Entity {
+    const char* pattern;
+    int length;
+    char value;
+};
+
+// The named entities recognized when entity processing is enabled.
+static const int NUM_ENTITIES = 5;
+static const Entity entities[NUM_ENTITIES] =
+{
+    { "quot", 4,    DOUBLE_QUOTE },
+    { "amp", 3,        '&'  },
+    { "apos", 4,    SINGLE_QUOTE },
+    { "lt",    2,         '<'     },
+    { "gt",    2,        '>'     }
+};
+
+
+// Releases any owned buffer by delegating to Reset().
+StrPair::~StrPair()
+{
+    Reset();
+}
+
+
+// Return the pair to the empty state. The buffer is freed only when this
+// object owns it (NEEDS_DELETE); otherwise the pointers are simply dropped.
+void StrPair::Reset()
+{
+    const bool ownsBuffer = ( flags & NEEDS_DELETE ) != 0;
+    if ( ownsBuffer ) {
+        delete [] start;
+    }
+    start = 0;
+    end = 0;
+    flags = 0;
+}
+
+
+// Replace the contents with a private, heap-allocated copy of 'str'.
+// The given flags are stored with NEEDS_DELETE added, so the copy is freed
+// by Reset(). 'str' must be a non-null, NUL-terminated string.
+void StrPair::SetStr( const char* str, int flags )
+{
+    Reset();
+    const size_t bytes = strlen( str ) + 1;    // text plus terminator
+    char* copy = new char[ bytes ];
+    memcpy( copy, str, bytes );
+    start = copy;
+    end = copy + ( bytes - 1 );
+    this->flags = flags | NEEDS_DELETE;
+}
+
+
+// Scan forward from 'p' for the first occurrence of 'endTag'. On success the
+// pair is set to the text before the tag (with 'strFlags') and the position
+// just past the tag is returned. Returns 0 if the input ends first.
+char* StrPair::ParseText( char* p, const char* endTag, int strFlags )
+{
+    TIXMLASSERT( endTag && *endTag );
+
+    char* const textBegin = p;
+    const char firstEndChar = *endTag;
+    const size_t endTagLen = strlen( endTag );
+
+    for ( ; *p; ++p ) {
+        // Cheap first-character test before the full comparison.
+        if ( *p != firstEndChar ) {
+            continue;
+        }
+        if ( strncmp( p, endTag, endTagLen ) == 0 ) {
+            Set( textBegin, p, strFlags );
+            return p + endTagLen;
+        }
+    }
+    return 0;
+}
+
+
+// Parse a name starting at 'p'. The first character must satisfy
+// XMLUtil::IsAlpha; following characters may be alphanumeric or one of
+// '_', '-', '.', ':'. On success the pair is set to the name and the
+// position after it is returned; otherwise 0 is returned.
+char* StrPair::ParseName( char* p )
+{
+    char* const nameBegin = p;
+
+    if ( !nameBegin || !(*nameBegin) ) {
+        return 0;
+    }
+    if ( !XMLUtil::IsAlpha( *p ) ) {
+        return 0;
+    }
+
+    while ( *p ) {
+        const bool isNameChar =
+               XMLUtil::IsAlphaNum( (unsigned char) *p )
+            || *p == '_'
+            || *p == '-'
+            || *p == '.'
+            || *p == ':';
+        if ( !isNameChar ) {
+            break;
+        }
+        ++p;
+    }
+
+    if ( p > nameBegin ) {
+        Set( nameBegin, p, 0 );
+        return p;
+    }
+    return 0;
+}
+
+
+
+// Return the text as a NUL-terminated C string, finalizing it on first use.
+//
+// While NEEDS_FLUSH is set the pair still refers to raw source text. The
+// first call writes a terminator at 'end' and, depending on the remaining
+// flags, rewrites the text in place:
+//  - NEEDS_NEWLINE_NORMALIZATION: CR, CR-LF and LF-CR each become one LF.
+//  - NEEDS_ENTITY_PROCESSING: the named entities in 'entities' and numeric
+//    character references (&#NNN; / &#xHHH;) are replaced by the characters
+//    they denote.
+// The rewrite uses a read pointer 'p' and a write pointer 'q' that never
+// overtakes it. Afterwards only NEEDS_DELETE is kept in 'flags', so later
+// calls return 'start' directly.
+//
+// An '&' that does not start a recognized entity or a well-formed numeric
+// reference is copied through literally.
+const char* StrPair::GetStr()
+{
+    if ( flags & NEEDS_FLUSH ) {
+        *end = 0;
+        flags ^= NEEDS_FLUSH;
+
+        if ( flags ) {
+            char* p = start;    // the read pointer
+            char* q = start;    // the write pointer
+
+            while( p < end ) {
+                if ( (flags & NEEDS_NEWLINE_NORMALIZATION) && *p == CR ) {
+                    // CR-LF pair becomes LF
+                    // CR alone becomes LF
+                    // LF-CR becomes LF
+                    if ( *(p+1) == LF ) {
+                        p += 2;
+                    }
+                    else {
+                        ++p;
+                    }
+                    *q++ = LF;
+                }
+                else if ( (flags & NEEDS_NEWLINE_NORMALIZATION) && *p == LF ) {
+                    if ( *(p+1) == CR ) {
+                        p += 2;
+                    }
+                    else {
+                        ++p;
+                    }
+                    *q++ = LF;
+                }
+                else if ( (flags & NEEDS_ENTITY_PROCESSING) && *p == '&' ) {
+                    // Entities handled by tinyXML2:
+                    // - special entities in the entity table [in/out]
+                    // - numeric character reference [in]
+                    //   &#20013; or &#x4e2d;
+
+                    if ( *(p+1) == '#' ) {
+                        char buf[10] = { 0 };
+                        int len = 0;
+                        const char* after = XMLUtil::GetCharacterRef( p, buf, &len );
+                        if ( after == 0 ) {
+                            // Malformed reference (no ';' or a bad digit).
+                            // GetCharacterRef reports this with a null
+                            // pointer, which must not become the read
+                            // pointer. Keep the '&' as ordinary text.
+                            *q++ = *p++;
+                        }
+                        else {
+                            p = const_cast<char*>( after );
+                            for( int i=0; i<len; ++i ) {
+                                *q++ = buf[i];
+                            }
+                            TIXMLASSERT( q <= p );
+                        }
+                    }
+                    else {
+                        int i=0;
+                        for(; i<NUM_ENTITIES; ++i ) {
+                            if (    strncmp( p+1, entities[i].pattern, entities[i].length ) == 0
+                                 && *(p+entities[i].length+1) == ';' )
+                            {
+                                // Found an entity convert;
+                                *q = entities[i].value;
+                                ++q;
+                                p += entities[i].length + 2;
+                                break;
+                            }
+                        }
+                        if ( i == NUM_ENTITIES ) {
+                            // Unknown entity: pass the '&' through. It has
+                            // to be copied, not just skipped, because q may
+                            // already trail p after earlier replacements.
+                            *q++ = *p++;
+                        }
+                    }
+                }
+                else {
+                    *q = *p;
+                    ++p;
+                    ++q;
+                }
+            }
+            *q = 0;
+        }
+        flags = (flags & NEEDS_DELETE);
+    }
+    return start;
+}
+
+
+
+
+// --------- XMLUtil ----------- //
+
+// Detect a UTF-8 byte order mark at 'p'. '*bom' is set to whether the three
+// lead bytes were found; the return value points past them when present and
+// is 'p' unchanged otherwise.
+const char* XMLUtil::ReadBOM( const char* p, bool* bom )
+{
+    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(p);
+
+    // && short-circuits, so later bytes are only read when earlier ones match.
+    const bool hasBom =    bytes[0] == TIXML_UTF_LEAD_0
+                        && bytes[1] == TIXML_UTF_LEAD_1
+                        && bytes[2] == TIXML_UTF_LEAD_2;
+
+    *bom = hasBom;
+    return hasBom ? p + 3 : p;
+}
+
+
+// Encode the code point 'input' as UTF-8 into 'output' and store the number
+// of bytes written (1-4) in '*length'. Values of 0x200000 and above are not
+// encoded: '*length' is set to 0 and 'output' is left untouched. No
+// terminator is written.
+void XMLUtil::ConvertUTF32ToUTF8( unsigned long input, char* output, int* length )
+{
+    const unsigned long BYTE_MASK = 0xBF;
+    const unsigned long BYTE_MARK = 0x80;
+    const unsigned long FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
+
+    int byteCount;
+    if ( input < 0x80 ) {
+        byteCount = 1;
+    }
+    else if ( input < 0x800 ) {
+        byteCount = 2;
+    }
+    else if ( input < 0x10000 ) {
+        byteCount = 3;
+    }
+    else if ( input < 0x200000 ) {
+        byteCount = 4;
+    }
+    else {
+        // This code won't convert this correctly anyway.
+        *length = 0;
+        return;
+    }
+    *length = byteCount;
+
+    // Continuation bytes are filled from the last one backwards, each taking
+    // the low six bits of what remains of the code point.
+    for ( int idx = byteCount - 1; idx > 0; --idx ) {
+        output[idx] = (char)((input | BYTE_MARK) & BYTE_MASK);
+        input >>= 6;
+    }
+    // The lead byte carries the remaining high bits plus the length marker.
+    output[0] = (char)(input | FIRST_BYTE_MARK[byteCount]);
+}
+
+
+// Decode a numeric character reference ("&#NNN;" or "&#xHHH;") whose '&' is
+// at 'p'. The UTF-8 encoding of the referenced code point is written to
+// 'value' and its byte count to '*length'.
+//
+// Returns:
+//  - the position just past the terminating ';' on success;
+//  - 0 when the reference is malformed (nothing after "&#x", no ';' found,
+//    or a character that is not a valid digit for the base);
+//  - p+1, with '*length' == 0, when 'p' is not followed by '#' and at least
+//    one more character.
+//
+// NOTE(review): an empty digit sequence ("&#;" or "&#x;") skips both digit
+// loops, leaving ucs == 0, which is then encoded as a single zero byte.
+const char* XMLUtil::GetCharacterRef( const char* p, char* value, int* length )
+{
+    // Presume an entity, and pull it out.
+    *length = 0;
+
+    if ( *(p+1) == '#' && *(p+2) )
+    {
+        unsigned long ucs = 0;
+        ptrdiff_t delta = 0;    // distance from '&' to ';'
+        unsigned mult = 1;      // place value of the digit being read
+
+        if ( *(p+2) == 'x' )
+        {
+            // Hexadecimal.
+            if ( !*(p+3) ) return 0;
+
+            const char* q = p+3;
+            q = strchr( q, ';' );
+
+            if ( !q || !*q ) return 0;
+
+            delta = q-p;
+            --q;
+
+            // Walk backwards from the last digit to the 'x', accumulating
+            // from the least significant digit upwards.
+            while ( *q != 'x' )
+            {
+                if ( *q >= '0' && *q <= '9' )
+                    ucs += mult * (*q - '0');
+                else if ( *q >= 'a' && *q <= 'f' )
+                    ucs += mult * (*q - 'a' + 10);
+                else if ( *q >= 'A' && *q <= 'F' )
+                    ucs += mult * (*q - 'A' + 10 );
+                else
+                    return 0;
+                mult *= 16;
+                --q;
+            }
+        }
+        else
+        {
+            // Decimal.
+            if ( !*(p+2) ) return 0;
+
+            const char* q = p+2;
+            q = strchr( q, ';' );
+
+            if ( !q || !*q ) return 0;
+
+            delta = q-p;
+            --q;
+
+            // Same backwards walk as above, stopping at the '#'.
+            while ( *q != '#' )
+            {
+                if ( *q >= '0' && *q <= '9' )
+                    ucs += mult * (*q - '0');
+                else
+                    return 0;
+                mult *= 10;
+                --q;
+            }
+        }
+        // convert the UCS to UTF-8
+        ConvertUTF32ToUTF8( ucs, value, length );
+        return p + delta + 1;
+    }
+    return p+1;
+}
+
+
+// Format the int 'v' with "%d" into 'buffer', which holds 'bufferSize' bytes.
+void XMLUtil::ToStr( int v, char* buffer, int bufferSize )
+{
+    TIXML_SNPRINTF( buffer, bufferSize, "%d", v );
+}
+
+
+// Format the unsigned 'v' with "%u" into 'buffer', which holds 'bufferSize' bytes.
+void XMLUtil::ToStr( unsigned v, char* buffer, int bufferSize )
+{
+    TIXML_SNPRINTF( buffer, bufferSize, "%u", v );
+}
+
+
+// Format the bool 'v' into 'buffer' as "1" (true) or "0" (false).
+void XMLUtil::ToStr( bool v, char* buffer, int bufferSize )
+{
+    TIXML_SNPRINTF( buffer, bufferSize, "%d", v ? 1 : 0 );
+}
+
+
+// Format the float 'v' with "%f" into 'buffer', which holds 'bufferSize' bytes.
+void XMLUtil::ToStr( float v, char* buffer, int bufferSize )
+{
+    TIXML_SNPRINTF( buffer, bufferSize, "%f", v );
+}
+
+
+// Format the double 'v' with "%f" into 'buffer', which holds 'bufferSize' bytes.
+void XMLUtil::ToStr( double v, char* buffer, int bufferSize )
+{
+    TIXML_SNPRINTF( buffer, bufferSize, "%f", v );
+}
+
+
+// Parse 'str' as an int with "%d". Returns true when a value was stored
+// in '*value'.
+bool XMLUtil::ToInt( const char* str, int* value )
+{
+    const int converted = TIXML_SSCANF( str, "%d", value );
+    return converted == 1;
+}
+
+// Parse 'str' as an unsigned with "%u". Returns true when a value was
+// stored in '*value'.
+bool XMLUtil::ToUnsigned( const char* str, unsigned *value )
+{
+    const int converted = TIXML_SSCANF( str, "%u", value );
+    return converted == 1;
+}
+
+// Parse 'str' as a boolean. An integer is accepted first (zero is false,
+// anything else true); otherwise the words "true" and "false" are accepted.
+// Returns false, leaving '*value' untouched, when none of these match.
+bool XMLUtil::ToBool( const char* str, bool* value )
+{
+    int asInt = 0;
+    if ( ToInt( str, &asInt ) ) {
+        *value = ( asInt != 0 );
+        return true;
+    }
+
+    if ( StringEqual( str, "true" ) ) {
+        *value = true;
+        return true;
+    }
+    if ( StringEqual( str, "false" ) ) {
+        *value = false;
+        return true;
+    }
+    return false;
+}
+
+
+// Parse 'str' as a float with "%f". Returns true when a value was stored
+// in '*value'.
+bool XMLUtil::ToFloat( const char* str, float* value )
+{
+    const int converted = TIXML_SSCANF( str, "%f", value );
+    return converted == 1;
+}
+
+// Parse 'str' as a double with "%lf". Returns true when a value was stored
+// in '*value'.
+bool XMLUtil::ToDouble( const char* str, double* value )
+{
+    const int converted = TIXML_SSCANF( str, "%lf", value );
+    return converted == 1;
+}
+
+
+// Look at the text at 'p' (after skipping white space), decide what kind of
+// node begins there, and allocate an empty node of that kind from the
+// matching memory pool.
+//
+// On return '*node' holds the new node and the result points just past the
+// recognized header ("<?", "<!--", "<![CDATA[", "<!" or "<"). For plain text
+// the result is the original 'p', so leading white space stays part of the
+// text. If only white space (or nothing) remains, the skipped pointer is
+// returned and '*node' is not written.
+char* XMLDocument::Identify( char* p, XMLNode** node )
+{
+    XMLNode* returnNode = 0;
+    char* start = p;
+    p = XMLUtil::SkipWhiteSpace( p );
+    if( !p || !*p )
+    {
+        return p;
+    }
+
+    // What is this thing?
+    // - Elements start with a letter or underscore, but xml is reserved.
+    // - Comments: <!--
+    // - Declaration: <?
+    // - Everything else is unknown to tinyxml.
+    //
+
+    static const char* xmlHeader        = { "<?" };
+    static const char* commentHeader    = { "<!--" };
+    static const char* dtdHeader        = { "<!" };
+    static const char* cdataHeader        = { "<![CDATA[" };
+    static const char* elementHeader    = { "<" };    // and a header for everything else; check last.
+
+    static const int xmlHeaderLen        = 2;
+    static const int commentHeaderLen    = 4;
+    static const int dtdHeaderLen        = 2;
+    static const int cdataHeaderLen        = 9;
+    static const int elementHeaderLen    = 1;
+
+#if defined(_MSC_VER)
+#pragma warning ( push )
+#pragma warning ( disable : 4127 )
+#endif
+    TIXMLASSERT( sizeof( XMLComment ) == sizeof( XMLUnknown ) );        // use same memory pool
+    TIXMLASSERT( sizeof( XMLComment ) == sizeof( XMLDeclaration ) );    // use same memory pool
+#if defined(_MSC_VER)
+#pragma warning (pop)
+#endif
+    // The tests run from the most specific prefix to the least specific:
+    // "<!--" and "<![CDATA[" must be tried before "<!", and "<" comes last.
+    if ( XMLUtil::StringEqual( p, xmlHeader, xmlHeaderLen ) ) {
+        returnNode = new (commentPool.Alloc()) XMLDeclaration( this );
+        returnNode->memPool = &commentPool;
+        p += xmlHeaderLen;
+    }
+    else if ( XMLUtil::StringEqual( p, commentHeader, commentHeaderLen ) ) {
+        returnNode = new (commentPool.Alloc()) XMLComment( this );
+        returnNode->memPool = &commentPool;
+        p += commentHeaderLen;
+    }
+    else if ( XMLUtil::StringEqual( p, cdataHeader, cdataHeaderLen ) ) {
+        XMLText* text = new (textPool.Alloc()) XMLText( this );
+        returnNode = text;
+        returnNode->memPool = &textPool;
+        p += cdataHeaderLen;
+        text->SetCData( true );
+    }
+    else if ( XMLUtil::StringEqual( p, dtdHeader, dtdHeaderLen ) ) {
+        returnNode = new (commentPool.Alloc()) XMLUnknown( this );
+        returnNode->memPool = &commentPool;
+        p += dtdHeaderLen;
+    }
+    else if ( XMLUtil::StringEqual( p, elementHeader, elementHeaderLen ) ) {
+        returnNode = new (elementPool.Alloc()) XMLElement( this );
+        returnNode->memPool = &elementPool;
+        p += elementHeaderLen;
+    }
+    else {
+        returnNode = new (textPool.Alloc()) XMLText( this );
+        returnNode->memPool = &textPool;
+        p = start;    // Back it up, all the text counts.
+    }
+
+    *node = returnNode;
+    return p;
+}
+
+
+// Walk the document with 'visitor'. Children are visited only if
+// VisitEnter() returns true, and the walk stops at the first child whose
+// Accept() returns false. VisitExit() is always called and supplies the
+// return value.
+bool XMLDocument::Accept( XMLVisitor* visitor ) const
+{
+    if ( visitor->VisitEnter( *this ) ) {
+        const XMLNode* child = FirstChild();
+        while ( child && child->Accept( visitor ) ) {
+            child = child->NextSibling();
+        }
+    }
+    return visitor->VisitExit( *this );
+}
+
+
+// --------- XMLNode ----------- //
+
+// Create an unlinked node belonging to 'doc': no parent, children or siblings.
+XMLNode::XMLNode( XMLDocument* doc ) :
+    document( doc ),
+    parent( 0 ),
+    firstChild( 0 ), lastChild( 0 ),
+    prev( 0 ), next( 0 )
+{
+}
+
+
+// Destroy all children, then detach this node from its parent if it has one.
+XMLNode::~XMLNode()
+{
+    DeleteChildren();
+    if ( parent ) {
+        parent->Unlink( this );
+    }
+}
+
+
+// Set the node's value. With 'staticMem' true the string is handed to
+// SetInternedStr(); otherwise it goes through SetStr(), which makes a
+// private copy.
+void XMLNode::SetValue( const char* str, bool staticMem )
+{
+    if ( staticMem )
+        value.SetInternedStr( str );
+    else
+        value.SetStr( str );
+}
+
+
+// Destroy every child of this node, front to back.
+void XMLNode::DeleteChildren()
+{
+    while( firstChild ) {
+        XMLNode* node = firstChild;
+        // Unlink first: this advances firstChild and clears node->parent,
+        // so the node's destructor does not try to unlink it again.
+        Unlink( node );
+
+        DELETE_NODE( node );
+    }
+    firstChild = lastChild = 0;
+}
+
+
+// Detach 'child' from this node's child list without destroying it.
+// firstChild/lastChild and the neighbours' links are repaired and the
+// child's parent pointer is cleared. The child's own prev/next pointers
+// are left as they were.
+void XMLNode::Unlink( XMLNode* child )
+{
+    TIXMLASSERT( child->parent == this );
+    if ( child == firstChild )
+        firstChild = firstChild->next;
+    if ( child == lastChild )
+        lastChild = lastChild->prev;
+
+    if ( child->prev ) {
+        child->prev->next = child->next;
+    }
+    if ( child->next ) {
+        child->next->prev = child->prev;
+    }
+    child->parent = 0;
+}
+
+
+// Destroy one child of this node. The node is not unlinked here; its
+// destructor does that through parent->Unlink().
+void XMLNode::DeleteChild( XMLNode* node )
+{
+    TIXMLASSERT( node->parent == this );
+    DELETE_NODE( node );
+}
+
+
+// Append 'addThis' as the last child of this node and return it.
+XMLNode* XMLNode::InsertEndChild( XMLNode* addThis )
+{
+    if ( !lastChild ) {
+        // Empty child list: the new node is both first and last.
+        TIXMLASSERT( firstChild == 0 );
+        firstChild = lastChild = addThis;
+
+        addThis->prev = 0;
+        addThis->next = 0;
+    }
+    else {
+        // Hook the new node behind the current tail.
+        TIXMLASSERT( firstChild );
+        TIXMLASSERT( lastChild->next == 0 );
+        lastChild->next = addThis;
+        addThis->prev = lastChild;
+        lastChild = addThis;
+
+        addThis->next = 0;
+    }
+    addThis->parent = this;
+    return addThis;
+}
+
+
+// Insert 'addThis' as the first child of this node and return it.
+XMLNode* XMLNode::InsertFirstChild( XMLNode* addThis )
+{
+    if ( !firstChild ) {
+        // Empty child list: the new node is both first and last.
+        TIXMLASSERT( lastChild == 0 );
+        firstChild = lastChild = addThis;
+
+        addThis->prev = 0;
+        addThis->next = 0;
+    }
+    else {
+        // Hook the new node in front of the current head.
+        TIXMLASSERT( lastChild );
+        TIXMLASSERT( firstChild->prev == 0 );
+
+        firstChild->prev = addThis;
+        addThis->next = firstChild;
+        firstChild = addThis;
+
+        addThis->prev = 0;
+    }
+    addThis->parent = this;
+    return addThis;
+}
+
+
+// Insert 'addThis' immediately after 'afterThis', which must already be a
+// child of this node. Returns 'addThis', or 0 when 'afterThis' is not a
+// child of this node.
+XMLNode* XMLNode::InsertAfterChild( XMLNode* afterThis, XMLNode* addThis )
+{
+    TIXMLASSERT( afterThis->parent == this );
+    // Checked again at run time for builds where the assertion is disabled.
+    if ( afterThis->parent != this )
+        return 0;
+
+    if ( afterThis->next == 0 ) {
+        // The last node or the only node.
+        return InsertEndChild( addThis );
+    }
+    // Splice between afterThis and its current successor.
+    addThis->prev = afterThis;
+    addThis->next = afterThis->next;
+    afterThis->next->prev = addThis;
+    afterThis->next = addThis;
+    addThis->parent = this;
+    return addThis;
+}
+
+
+
+
+// Return the first child that is an element and, when 'value' is non-null,
+// whose name equals 'value'. Returns 0 if there is none.
+const XMLElement* XMLNode::FirstChildElement( const char* value ) const
+{
+    for ( XMLNode* child = firstChild; child; child = child->next ) {
+        XMLElement* asElement = child->ToElement();
+        if ( !asElement ) {
+            continue;
+        }
+        if ( !value || XMLUtil::StringEqual( asElement->Name(), value ) ) {
+            return asElement;
+        }
+    }
+    return 0;
+}
+
+
+// Return the last child that is an element and, when 'value' is non-null,
+// whose name equals 'value'. Returns 0 if there is none.
+const XMLElement* XMLNode::LastChildElement( const char* value ) const
+{
+    for ( XMLNode* child = lastChild; child; child = child->prev ) {
+        XMLElement* asElement = child->ToElement();
+        if ( !asElement ) {
+            continue;
+        }
+        if ( !value || XMLUtil::StringEqual( asElement->Name(), value ) ) {
+            return asElement;
+        }
+    }
+    return 0;
+}
+
+
+// Return the nearest following sibling that is an element and, when 'value'
+// is non-null, whose Value() equals 'value'. Returns 0 if there is none.
+const XMLElement* XMLNode::NextSiblingElement( const char* value ) const
+{
+    for ( XMLNode* sibling = this->next; sibling; sibling = sibling->next ) {
+        XMLElement* asElement = sibling->ToElement();
+        if ( !asElement ) {
+            continue;
+        }
+        if ( !value || XMLUtil::StringEqual( value, sibling->Value() ) ) {
+            return asElement;
+        }
+    }
+    return 0;
+}
+
+
+// Return the nearest preceding sibling that is an element and, when 'value'
+// is non-null, whose Value() equals 'value'. Returns 0 if there is none.
+const XMLElement* XMLNode::PreviousSiblingElement( const char* value ) const
+{
+    for ( XMLNode* sibling = this->prev; sibling; sibling = sibling->prev ) {
+        XMLElement* asElement = sibling->ToElement();
+        if ( !asElement ) {
+            continue;
+        }
+        if ( !value || XMLUtil::StringEqual( value, sibling->Value() ) ) {
+            return asElement;
+        }
+    }
+    return 0;
+}
+
+
+// Parse the children of this node from the text at 'p'.
+//
+// Each iteration identifies the next node, lets it parse itself, and appends
+// it as a child. When a closing element is read, its name is copied to
+// '*parentEnd' (if non-null), the temporary node is destroyed, and the
+// position after it is returned. Returns 0 when the input ends or when an
+// error is recorded on the document.
+char* XMLNode::ParseDeep( char* p, StrPair* parentEnd )
+{
+    // This is a recursive method, but thinking about it "at the current level"
+    // it is a pretty simple flat list:
+    //        <foo/>
+    //        <!-- comment -->
+    //
+    // With a special case:
+    //        <foo>
+    //        </foo>
+    //        <!-- comment -->
+    //
+    // Where the closing element (/foo) *must* be the next thing after the opening
+    // element, and the names must match. BUT the tricky bit is that the closing
+    // element will be read by the child.
+    //
+    // 'endTag' is the end tag for this node, it is returned by a call to a child.
+    // 'parentEnd' is the end tag for the parent, which is filled in and returned.
+
+    while( p && *p ) {
+        XMLNode* node = 0;
+
+        p = document->Identify( p, &node );
+        if ( p == 0 || node == 0 ) {
+            break;
+        }
+
+        StrPair endTag;
+        p = node->ParseDeep( p, &endTag );
+        if ( !p ) {
+            DELETE_NODE( node );
+            node = 0;
+            // Keep a more specific error if the child already recorded one.
+            if ( !document->Error() ) {
+                document->SetError( XML_ERROR_PARSING, 0, 0 );
+            }
+            break;
+        }
+
+        // We read the end tag. Return it to the parent.
+        if ( node->ToElement() && node->ToElement()->ClosingType() == XMLElement::CLOSING ) {
+            if ( parentEnd ) {
+                *parentEnd = static_cast<XMLElement*>(node)->value;
+            }
+            DELETE_NODE( node );
+            return p;
+        }
+
+        // Handle an end tag returned to this level.
+        // And handle a bunch of annoying errors.
+        XMLElement* ele = node->ToElement();
+        if ( ele ) {
+            // An OPEN element must have received an end tag from its children.
+            if ( endTag.Empty() && ele->ClosingType() == XMLElement::OPEN ) {
+                document->SetError( XML_ERROR_MISMATCHED_ELEMENT, node->Value(), 0 );
+                p = 0;
+            }
+            // An element that is not OPEN must not have received one.
+            else if ( !endTag.Empty() && ele->ClosingType() != XMLElement::OPEN ) {
+                document->SetError( XML_ERROR_MISMATCHED_ELEMENT, node->Value(), 0 );
+                p = 0;
+            }
+            // The end tag's name must match the element's name.
+            else if ( !endTag.Empty() ) {
+                if ( !XMLUtil::StringEqual( endTag.GetStr(), node->Value() )) {
+                    document->SetError( XML_ERROR_MISMATCHED_ELEMENT, node->Value(), 0 );
+                    p = 0;
+                }
+            }
+        }
+        if ( p == 0 ) {
+            DELETE_NODE( node );
+            node = 0;
+        }
+        if ( node ) {
+            this->InsertEndChild( node );
+        }
+    }
+    return 0;
+}
+
+// --------- XMLText ---------- //
+// Parse the body of a text node starting at 'p'.
+//
+// CDATA: reads up to "]]>" and returns the position after it, or 0 with
+// XML_ERROR_PARSING_CDATA recorded when the terminator is missing.
+// Plain text: reads up to the next '<' and returns a pointer to that '<',
+// so the caller can identify the following node. Returns 0 with
+// XML_ERROR_PARSING_TEXT recorded when no '<' is found, and 0 without a new
+// error when nothing follows the '<'.
+char* XMLText::ParseDeep( char* p, StrPair* )
+{
+    const char* start = p;
+    if ( this->CData() ) {
+        p = value.ParseText( p, "]]>", StrPair::NEEDS_NEWLINE_NORMALIZATION );
+        if ( !p ) {
+            document->SetError( XML_ERROR_PARSING_CDATA, start, 0 );
+        }
+        return p;
+    }
+    else {
+        p = value.ParseText( p, "<", document->ProcessEntities() ? StrPair::TEXT_ELEMENT : StrPair::TEXT_ELEMENT_LEAVE_ENTITIES );
+        if ( !p ) {
+            document->SetError( XML_ERROR_PARSING_TEXT, start, 0 );
+        }
+        if ( p && *p ) {
+            // ParseText stopped just past the '<'; step back onto it.
+            return p-1;
+        }
+    }
+    return 0;
+}
+
+
+// Create a text node with the same value and CDATA setting in 'doc', or in
+// this node's own document when 'doc' is null. Children and siblings are
+// not copied.
+XMLNode* XMLText::ShallowClone( XMLDocument* doc ) const
+{
+    XMLDocument* target = doc ? doc : document;
+    XMLText* copy = target->NewText( Value() );    // fixme: this will always allocate memory. Intern?
+    copy->SetCData( this->CData() );
+    return copy;
+}
+
+
+// True when 'compare' is a text node whose value equals this node's value.
+bool XMLText::ShallowEqual( const XMLNode* compare ) const
+{
+    const XMLText* other = compare->ToText();
+    return ( other && XMLUtil::StringEqual( other->Value(), Value() ));
+}
+
+
+// Present this text node to 'visitor' and return the visitor's verdict.
+bool XMLText::Accept( XMLVisitor* visitor ) const
+{
+    return visitor->Visit( *this );
+}
+
+
+// --------- XMLComment ---------- //
+
+// Create an empty comment node belonging to 'doc'.
+XMLComment::XMLComment( XMLDocument* doc ) : XMLNode( doc )
+{
+}
+
+
+// Nothing to release here; the XMLNode destructor does the cleanup.
+XMLComment::~XMLComment()
+{
+    //printf( "~XMLComment\n" );
+}
+
+
+// Parse a comment body: everything up to "-->" becomes the node's value.
+// Returns the position after the terminator, or 0 with
+// XML_ERROR_PARSING_COMMENT recorded when the terminator is missing.
+char* XMLComment::ParseDeep( char* p, StrPair* )
+{
+    const char* const begin = p;
+    char* rest = value.ParseText( p, "-->", StrPair::COMMENT );
+    if ( !rest ) {
+        document->SetError( XML_ERROR_PARSING_COMMENT, begin, 0 );
+    }
+    return rest;
+}
+
+
+// Create a comment node with the same value in 'doc', or in this node's own
+// document when 'doc' is null.
+XMLNode* XMLComment::ShallowClone( XMLDocument* doc ) const
+{
+    XMLDocument* target = doc ? doc : document;
+    XMLComment* copy = target->NewComment( Value() );    // fixme: this will always allocate memory. Intern?
+    return copy;
+}
+
+
+// True when 'compare' is a comment whose value equals this node's value.
+bool XMLComment::ShallowEqual( const XMLNode* compare ) const
+{
+    const XMLComment* other = compare->ToComment();
+    return ( other && XMLUtil::StringEqual( other->Value(), Value() ));
+}
+
+
+// Present this comment to 'visitor' and return the visitor's verdict.
+bool XMLComment::Accept( XMLVisitor* visitor ) const
+{
+    return visitor->Visit( *this );
+}
+
+
+// --------- XMLDeclaration ---------- //
+
+// Create an empty declaration node belonging to 'doc'.
+XMLDeclaration::XMLDeclaration( XMLDocument* doc ) : XMLNode( doc )
+{
+}
+
+
+// Nothing to release here; the XMLNode destructor does the cleanup.
+XMLDeclaration::~XMLDeclaration()
+{
+    //printf( "~XMLDeclaration\n" );
+}
+
+
+// Parse a declaration body: everything up to "?>" becomes the node's value.
+// Returns the position after the terminator, or 0 with
+// XML_ERROR_PARSING_DECLARATION recorded when the terminator is missing.
+char* XMLDeclaration::ParseDeep( char* p, StrPair* )
+{
+    const char* const begin = p;
+    char* rest = value.ParseText( p, "?>", StrPair::NEEDS_NEWLINE_NORMALIZATION );
+    if ( !rest ) {
+        document->SetError( XML_ERROR_PARSING_DECLARATION, begin, 0 );
+    }
+    return rest;
+}
+
+
+// Create a declaration node with the same value in 'doc', or in this node's
+// own document when 'doc' is null.
+XMLNode* XMLDeclaration::ShallowClone( XMLDocument* doc ) const
+{
+    XMLDocument* target = doc ? doc : document;
+    XMLDeclaration* copy = target->NewDeclaration( Value() );    // fixme: this will always allocate memory. Intern?
+    return copy;
+}
+
+
+// True when 'compare' is a declaration whose value equals this node's value.
+bool XMLDeclaration::ShallowEqual( const XMLNode* compare ) const
+{
+    const XMLDeclaration* other = compare->ToDeclaration();
+    return ( other && XMLUtil::StringEqual( other->Value(), Value() ));
+}
+
+
+
+// Present this declaration to 'visitor' and return the visitor's verdict.
+bool XMLDeclaration::Accept( XMLVisitor* visitor ) const
+{
+    return visitor->Visit( *this );
+}
+
+// --------- XMLUnknown ---------- //
+
+// Create an empty "unknown" node belonging to 'doc'.
+XMLUnknown::XMLUnknown( XMLDocument* doc ) : XMLNode( doc )
+{
+}
+
+
+// Nothing to release here; the XMLNode destructor does the cleanup.
+XMLUnknown::~XMLUnknown()
+{
+}
+
+
+// Parse an unknown construct: everything up to the next '>' becomes the
+// node's value. Returns the position after the '>', or 0 with
+// XML_ERROR_PARSING_UNKNOWN recorded when no '>' is found.
+char* XMLUnknown::ParseDeep( char* p, StrPair* )
+{
+    const char* const begin = p;
+    char* rest = value.ParseText( p, ">", StrPair::NEEDS_NEWLINE_NORMALIZATION );
+    if ( !rest ) {
+        document->SetError( XML_ERROR_PARSING_UNKNOWN, begin, 0 );
+    }
+    return rest;
+}
+
+
+// Create an unknown node with the same value in 'doc', or in this node's own
+// document when 'doc' is null.
+XMLNode* XMLUnknown::ShallowClone( XMLDocument* doc ) const
+{
+    XMLDocument* target = doc ? doc : document;
+    XMLUnknown* copy = target->NewUnknown( Value() );    // fixme: this will always allocate memory. Intern?
+    return copy;
+}
+
+
+// True when 'compare' is an unknown node whose value equals this node's value.
+bool XMLUnknown::ShallowEqual( const XMLNode* compare ) const
+{
+    const XMLUnknown* other = compare->ToUnknown();
+    return ( other && XMLUtil::StringEqual( other->Value(), Value() ));
+}
+
+
+// Present this unknown node to 'visitor' and return the visitor's verdict.
+bool XMLUnknown::Accept( XMLVisitor* visitor ) const
+{
+    return visitor->Visit( *this );
+}
+
+// --------- XMLAttribute ---------- //
+// Parse one attribute of the form  name = "value"  or  name = 'value'
+// starting at 'p'. White space is allowed around the '='. The value runs to
+// the next occurrence of the same quote character that opened it.
+// 'processEntities' selects whether entities in the value are decoded
+// later. Returns the position after the closing quote, or 0 on any
+// syntax error.
+char* XMLAttribute::ParseDeep( char* p, bool processEntities )
+{
+    // Parse using the name rules: bug fix, was using ParseText before
+    p = name.ParseName( p );
+    if ( !p || !*p ) return 0;
+
+    // Skip white space before =
+    p = XMLUtil::SkipWhiteSpace( p );
+    if ( !p || *p != '=' ) return 0;
+
+    ++p;    // move up to opening quote
+    p = XMLUtil::SkipWhiteSpace( p );
+    if ( *p != '\"' && *p != '\'' ) return 0;
+
+    // The closing delimiter is whichever quote character opened the value.
+    char endTag[2] = { *p, 0 };
+    ++p;    // move past opening quote
+
+    p = value.ParseText( p, endTag, processEntities ? StrPair::ATTRIBUTE_VALUE : StrPair::ATTRIBUTE_VALUE_LEAVE_ENTITIES );
+    return p;
+}
+
+
+// Set the attribute's name to a private copy of 'n'.
+void XMLAttribute::SetName( const char* n )
+{
+    name.SetStr( n );
+}
+
+
+// Convert the attribute's text to an int. Returns XML_NO_ERROR on success
+// and XML_WRONG_ATTRIBUTE_TYPE when the text is not an int.
+int XMLAttribute::QueryIntValue( int* value ) const
+{
+    const bool converted = XMLUtil::ToInt( Value(), value );
+    if ( !converted ) {
+        return XML_WRONG_ATTRIBUTE_TYPE;
+    }
+    return XML_NO_ERROR;
+}
+
+
+// Convert the attribute's text to an unsigned int. Returns XML_NO_ERROR on
+// success and XML_WRONG_ATTRIBUTE_TYPE when the conversion fails.
+int XMLAttribute::QueryUnsignedValue( unsigned int* value ) const
+{
+    const bool converted = XMLUtil::ToUnsigned( Value(), value );
+    if ( !converted ) {
+        return XML_WRONG_ATTRIBUTE_TYPE;
+    }
+    return XML_NO_ERROR;
+}
+
+
+// Convert the attribute's text to a bool. Returns XML_NO_ERROR on success
+// and XML_WRONG_ATTRIBUTE_TYPE when the conversion fails.
+int XMLAttribute::QueryBoolValue( bool* value ) const
+{
+    const bool converted = XMLUtil::ToBool( Value(), value );
+    if ( !converted ) {
+        return XML_WRONG_ATTRIBUTE_TYPE;
+    }
+    return XML_NO_ERROR;
+}
+
+
+// Convert the attribute's text to a float. Returns XML_NO_ERROR on success
+// and XML_WRONG_ATTRIBUTE_TYPE when the conversion fails.
+int XMLAttribute::QueryFloatValue( float* value ) const
+{
+    const bool converted = XMLUtil::ToFloat( Value(), value );
+    if ( !converted ) {
+        return XML_WRONG_ATTRIBUTE_TYPE;
+    }
+    return XML_NO_ERROR;
+}
+
+
+// Convert the attribute's text to a double. Returns XML_NO_ERROR on success
+// and XML_WRONG_ATTRIBUTE_TYPE when the conversion fails.
+int XMLAttribute::QueryDoubleValue( double* value ) const
+{
+    const bool converted = XMLUtil::ToDouble( Value(), value );
+    if ( !converted ) {
+        return XML_WRONG_ATTRIBUTE_TYPE;
+    }
+    return XML_NO_ERROR;
+}
+
+
+// Set the attribute's value to a private copy of the string 'v'.
+void XMLAttribute::SetAttribute( const char* v )
+{
+    value.SetStr( v );
+}
+
+
+// Set the attribute's value to the text form of the int 'v'.
+void XMLAttribute::SetAttribute( int v )
+{
+    char buf[BUF_SIZE];
+    XMLUtil::ToStr( v, buf, BUF_SIZE );
+    value.SetStr( buf );
+}
+
+
+// Set the attribute's value to the text form of the unsigned 'v'.
+void XMLAttribute::SetAttribute( unsigned v )
+{
+    char buf[BUF_SIZE];
+    XMLUtil::ToStr( v, buf, BUF_SIZE );
+    value.SetStr( buf );
+}
+
+
+// Set the attribute's value to the text form of the bool 'v'.
+void XMLAttribute::SetAttribute( bool v )
+{
+    char buf[BUF_SIZE];
+    XMLUtil::ToStr( v, buf, BUF_SIZE );
+    value.SetStr( buf );
+}
+
+// Set the attribute's value to the text form of the double 'v'.
+void XMLAttribute::SetAttribute( double v )
+{
+    char buf[BUF_SIZE];
+    XMLUtil::ToStr( v, buf, BUF_SIZE );
+    value.SetStr( buf );
+}
+
+// Set the attribute's value to the text form of the float 'v'.
+void XMLAttribute::SetAttribute( float v )
+{
+    char buf[BUF_SIZE];
+    XMLUtil::ToStr( v, buf, BUF_SIZE );
+    value.SetStr( buf );
+}
+
+
+// --------- XMLElement ---------- //
+// Create an element belonging to 'doc' with closing type 0 and no attributes.
+XMLElement::XMLElement( XMLDocument* doc ) : XMLNode( doc ),
+    closingType( 0 ),
+    rootAttribute( 0 )
+{
+}
+
+
+// Destroy every attribute in the element's attribute list.
+XMLElement::~XMLElement()
+{
+    while( rootAttribute ) {
+        // Read the link before the attribute is destroyed.
+        XMLAttribute* next = rootAttribute->next;
+        DELETE_ATTRIBUTE( rootAttribute );
+        rootAttribute = next;
+    }
+}
+
+
+// Return the first attribute whose name equals 'name', or 0 if none does.
+XMLAttribute* XMLElement::FindAttribute( const char* name )
+{
+    for ( XMLAttribute* cur = rootAttribute; cur; cur = cur->next ) {
+        if ( XMLUtil::StringEqual( cur->Name(), name ) ) {
+            return cur;
+        }
+    }
+    return 0;
+}
+
+
+// Const overload: return the first attribute whose name equals 'name', or 0
+// if none does.
+const XMLAttribute* XMLElement::FindAttribute( const char* name ) const
+{
+    for ( XMLAttribute* cur = rootAttribute; cur; cur = cur->next ) {
+        if ( XMLUtil::StringEqual( cur->Name(), name ) ) {
+            return cur;
+        }
+    }
+    return 0;
+}
+
+
+// Return the value of attribute 'name'. When 'value' is non-null the
+// attribute's value must also equal it. Returns 0 when the attribute does
+// not exist or does not match.
+const char* XMLElement::Attribute( const char* name, const char* value ) const
+{
+    const XMLAttribute* found = FindAttribute( name );
+    if ( found && ( !value || XMLUtil::StringEqual( found->Value(), value ) ) ) {
+        return found->Value();
+    }
+    return 0;
+}
+
+
+// Return the value of the first child when that child is a text node;
+// otherwise return 0.
+const char* XMLElement::GetText() const
+{
+    const XMLNode* first = FirstChild();
+    if ( !first || !first->ToText() ) {
+        return 0;
+    }
+    return first->ToText()->Value();
+}
+
+
+// Convert the element's leading text child to an int. Returns XML_SUCCESS,
+// XML_CAN_NOT_CONVERT_TEXT when the text is not an int, or XML_NO_TEXT_NODE
+// when the first child is missing or is not text.
+int XMLElement::QueryIntText( int* _value ) const
+{
+    const XMLNode* first = FirstChild();
+    if ( !first || !first->ToText() ) {
+        return XML_NO_TEXT_NODE;
+    }
+    const char* text = first->ToText()->Value();
+    if ( !XMLUtil::ToInt( text, _value ) ) {
+        return XML_CAN_NOT_CONVERT_TEXT;
+    }
+    return XML_SUCCESS;
+}
+
+
+// Convert the element's leading text child to an unsigned. Returns
+// XML_SUCCESS, XML_CAN_NOT_CONVERT_TEXT when the conversion fails, or
+// XML_NO_TEXT_NODE when the first child is missing or is not text.
+int XMLElement::QueryUnsignedText( unsigned* _value ) const
+{
+    const XMLNode* first = FirstChild();
+    if ( !first || !first->ToText() ) {
+        return XML_NO_TEXT_NODE;
+    }
+    const char* text = first->ToText()->Value();
+    if ( !XMLUtil::ToUnsigned( text, _value ) ) {
+        return XML_CAN_NOT_CONVERT_TEXT;
+    }
+    return XML_SUCCESS;
+}
+
+
+// Convert the element's leading text child to a bool. Returns XML_SUCCESS,
+// XML_CAN_NOT_CONVERT_TEXT when the conversion fails, or XML_NO_TEXT_NODE
+// when the first child is missing or is not text.
+int XMLElement::QueryBoolText( bool* _value ) const
+{
+    const XMLNode* first = FirstChild();
+    if ( !first || !first->ToText() ) {
+        return XML_NO_TEXT_NODE;
+    }
+    const char* text = first->ToText()->Value();
+    if ( !XMLUtil::ToBool( text, _value ) ) {
+        return XML_CAN_NOT_CONVERT_TEXT;
+    }
+    return XML_SUCCESS;
+}
+
+
+// Convert the element's leading text child to a double. Returns XML_SUCCESS,
+// XML_CAN_NOT_CONVERT_TEXT when the conversion fails, or XML_NO_TEXT_NODE
+// when the first child is missing or is not text.
+int XMLElement::QueryDoubleText( double* _value ) const
+{
+    const XMLNode* first = FirstChild();
+    if ( !first || !first->ToText() ) {
+        return XML_NO_TEXT_NODE;
+    }
+    const char* text = first->ToText()->Value();
+    if ( !XMLUtil::ToDouble( text, _value ) ) {
+        return XML_CAN_NOT_CONVERT_TEXT;
+    }
+    return XML_SUCCESS;
+}
+
+
+// Convert the element's leading text child to a float. Returns XML_SUCCESS,
+// XML_CAN_NOT_CONVERT_TEXT when the conversion fails, or XML_NO_TEXT_NODE
+// when the first child is missing or is not text.
+int XMLElement::QueryFloatText( float* _value ) const
+{
+    const XMLNode* first = FirstChild();
+    if ( !first || !first->ToText() ) {
+        return XML_NO_TEXT_NODE;
+    }
+    const char* text = first->ToText()->Value();
+    if ( !XMLUtil::ToFloat( text, _value ) ) {
+        return XML_CAN_NOT_CONVERT_TEXT;
+    }
+    return XML_SUCCESS;
+}
+
+
+
+// Return the attribute called 'name'. If none exists, a new attribute is
+// allocated from the document's attribute pool, appended to the end of the
+// attribute list, given the name, and returned.
+XMLAttribute* XMLElement::FindOrCreateAttribute( const char* name )
+{
+    XMLAttribute* tail = 0;
+    for ( XMLAttribute* cur = rootAttribute; cur; cur = cur->next ) {
+        if ( XMLUtil::StringEqual( cur->Name(), name ) ) {
+            return cur;
+        }
+        tail = cur;
+    }
+
+    // Not found: 'tail' is now the last attribute, or 0 for an empty list.
+    XMLAttribute* created = new (document->attributePool.Alloc() ) XMLAttribute();
+    created->memPool = &document->attributePool;
+    if ( tail ) {
+        tail->next = created;
+    }
+    else {
+        rootAttribute = created;
+    }
+    created->SetName( name );
+    return created;
+}
+
+
+// Unlinks the first attribute whose name equals 'name' from the attribute
+// list and releases it through DELETE_ATTRIBUTE. Does nothing when no
+// attribute matches.
+void XMLElement::DeleteAttribute( const char* name )
+{
+    XMLAttribute* before = 0;
+    XMLAttribute* cur = rootAttribute;
+    while ( cur && !XMLUtil::StringEqual( name, cur->Name() ) ) {
+        before = cur;
+        cur = cur->next;
+    }
+    if ( !cur ) {
+        return;
+    }
+
+    // Bypass the matched node before releasing it.
+    if ( before ) {
+        before->next = cur->next;
+    }
+    else {
+        rootAttribute = cur->next;
+    }
+    DELETE_ATTRIBUTE( cur );
+}
+
+
+// Parses the attribute list of an element's start tag, beginning at p
+// (just after the element name).
+// Returns the position after the closing '>' (or after "/>", in which case
+// closingType is set to CLOSED). On any error the document's error state
+// is set and 0 is returned.
+char* XMLElement::ParseAttributes( char* p )
+{
+    const char* start = p;
+    XMLAttribute* prevAttribute = 0;
+
+    // Read the attributes.
+    while( p ) {
+        p = XMLUtil::SkipWhiteSpace( p );
+        if ( !p || !(*p) ) {
+            // Input ended before the tag was closed.
+            document->SetError( XML_ERROR_PARSING_ELEMENT, start, Name() );
+            return 0;
+        }
+
+        // attribute.
+        if ( XMLUtil::IsAlpha( *p ) ) {
+            XMLAttribute* attrib = new (document->attributePool.Alloc() ) XMLAttribute();
+            attrib->memPool = &document->attributePool;
+
+            p = attrib->ParseDeep( p, document->ProcessEntities() );
+            // Reject the attribute if it failed to parse or if Attribute()
+            // already finds an entry with the same name on this element.
+            if ( !p || Attribute( attrib->Name() ) ) {
+                DELETE_ATTRIBUTE( attrib );
+                document->SetError( XML_ERROR_PARSING_ATTRIBUTE, start, p );
+                return 0;
+            }
+            // NOTE(review): an earlier comment here said duplicated
+            // attributes go undetected. The Attribute() lookup above
+            // appears to reject duplicates, so that note looks stale --
+            // confirm against Attribute()'s implementation.
+            // 'prevAttribute' tracks the tail so appending does not need
+            // to re-scan the attribute list.
+            if ( prevAttribute ) {
+                prevAttribute->next = attrib;
+            }
+            else {
+                rootAttribute = attrib;
+            }
+            prevAttribute = attrib;
+        }
+        // end of the tag
+        else if ( *p == '/' && *(p+1) == '>' ) {
+            closingType = CLOSED;
+            return p+2;    // done; sealed element.
+        }
+        // end of the tag
+        else if ( *p == '>' ) {
+            ++p;
+            break;
+        }
+        else {
+            // Neither an attribute name nor a tag terminator.
+            document->SetError( XML_ERROR_PARSING_ELEMENT, start, p );
+            return 0;
+        }
+    }
+    return p;
+}
+
+
+//
+//    <ele></ele>
+//    <ele>foo<b>bar</b></ele>
+//
+// Parses an element starting at p: optional leading '/', the element name,
+// the attribute list, and -- unless the tag is a closing or self-closed
+// one -- the children via XMLNode::ParseDeep.
+// Returns 0 when p is null after whitespace skipping or when no name could
+// be read; otherwise returns the position reached by the last parsing step
+// (which may itself be 0 on a nested error).
+char* XMLElement::ParseDeep( char* p, StrPair* strPair )
+{
+    // Read the element name.
+    p = XMLUtil::SkipWhiteSpace( p );
+    if ( !p ) return 0;
+
+    // The closing element is the </element> form. It is
+    // parsed just like a regular element then deleted from
+    // the DOM.
+    if ( *p == '/' ) {
+        closingType = CLOSING;
+        ++p;
+    }
+
+    p = value.ParseName( p );
+    if ( value.Empty() ) return 0;
+
+    p = ParseAttributes( p );
+    // Stop here on error, at end of input, or when the tag has no body
+    // (closingType is non-zero for CLOSING and CLOSED tags).
+    if ( !p || !*p || closingType )
+        return p;
+
+    p = XMLNode::ParseDeep( p, strPair );
+    return p;
+}
+
+
+
+// Creates a new element with the same value and the same attribute
+// name/value pairs as this one; children are not copied. The copy is
+// created through 'doc', or through this element's own document when
+// 'doc' is null.
+XMLNode* XMLElement::ShallowClone( XMLDocument* doc ) const
+{
+    XMLDocument* target = doc ? doc : document;
+
+    // fixme: both NewElement and SetAttribute always allocate memory. Intern?
+    XMLElement* copy = target->NewElement( Value() );
+    const XMLAttribute* attr = FirstAttribute();
+    while ( attr ) {
+        copy->SetAttribute( attr->Name(), attr->Value() );
+        attr = attr->Next();
+    }
+    return copy;
+}
+
+
+// Returns true when 'compare' is an element with the same value as this
+// one and both elements carry the same number of attributes whose values
+// match pairwise in list order. Children are not examined.
+// Note: only attribute *values* are compared; attribute names are not.
+bool XMLElement::ShallowEqual( const XMLNode* compare ) const
+{
+    const XMLElement* other = compare->ToElement();
+    if ( !other || !XMLUtil::StringEqual( other->Value(), Value() ) ) {
+        return false;
+    }
+
+    const XMLAttribute* mine = FirstAttribute();
+    const XMLAttribute* theirs = other->FirstAttribute();
+    for ( ; mine && theirs; mine = mine->Next(), theirs = theirs->Next() ) {
+        if ( !XMLUtil::StringEqual( mine->Value(), theirs->Value() ) ) {
+            return false;
+        }
+    }
+    // If either list still has entries, the attribute counts differ.
+    return !mine && !theirs;
+}
+
+
+// Visitor entry point for an element. Calls VisitEnter with this element
+// and its first attribute; if that returns true, each child's Accept is
+// invoked in sibling order until one returns false. VisitExit is always
+// called, and its result is the return value.
+bool XMLElement::Accept( XMLVisitor* visitor ) const
+{
+    if ( visitor->VisitEnter( *this, rootAttribute ) ) {
+        const XMLNode* child = FirstChild();
+        while ( child && child->Accept( visitor ) ) {
+            child = child->NextSibling();
+        }
+    }
+    return visitor->VisitExit( *this );
+}
+
+
+// --------- XMLDocument ----------- //
+// Constructs an empty document with no error recorded and no character
+// buffer. '_processEntities' is stored and later reported through
+// ProcessEntities().
+XMLDocument::XMLDocument( bool _processEntities ) :
+    XMLNode( 0 ),
+    writeBOM( false ),
+    processEntities( _processEntities ),
+    errorID( 0 ),
+    errorStr1( 0 ),
+    errorStr2( 0 ),
+    charBuffer( 0 )
+{
+    // The base class was constructed with a null document; point it at
+    // ourselves here rather than in the initializer list.
+    document = this;    // avoid warning about 'this' in initializer list
+}
+
+
+// Destroys all child nodes and the character buffer, then (in debug
+// builds) asserts that every pool reports zero outstanding allocations.
+XMLDocument::~XMLDocument()
+{
+    // Children must go first so the pool allocation counts below drop to 0.
+    DeleteChildren();
+    delete [] charBuffer;
+
+    // Disabled pool statistics dump; flip to 1 when investigating usage.
+#if 0
+    textPool.Trace( "text" );
+    elementPool.Trace( "element" );
+    commentPool.Trace( "comment" );
+    attributePool.Trace( "attribute" );
+#endif
+
+    TIXMLASSERT( textPool.CurrentAllocs() == 0 );
+    TIXMLASSERT( elementPool.CurrentAllocs() == 0 );
+    TIXMLASSERT( commentPool.CurrentAllocs() == 0 );
+    TIXMLASSERT( attributePool.CurrentAllocs() == 0 );
+}
+
+
+// Returns the document to its "nothing loaded" state: releases the
+// character buffer and clears the recorded error and its two strings.
+void XMLDocument::InitDocument()
+{
+    delete [] charBuffer;
+    charBuffer = 0;
+
+    errorStr1 = 0;
+    errorStr2 = 0;
+    errorID = XML_NO_ERROR;
+}
+
+
+// Placement-constructs an XMLElement in storage taken from elementPool,
+// records that pool on the node, sets its name and returns it.
+XMLElement* XMLDocument::NewElement( const char* name )
+{
+    void* slot = elementPool.Alloc();
+    XMLElement* created = new (slot) XMLElement( this );
+    created->memPool = &elementPool;
+    created->SetName( name );
+    return created;
+}
+
+
+// Placement-constructs an XMLComment in storage taken from commentPool,
+// records that pool on the node, sets its value to 'str' and returns it.
+XMLComment* XMLDocument::NewComment( const char* str )
+{
+    void* slot = commentPool.Alloc();
+    XMLComment* created = new (slot) XMLComment( this );
+    created->memPool = &commentPool;
+    created->SetValue( str );
+    return created;
+}
+
+
+// Placement-constructs an XMLText in storage taken from textPool,
+// records that pool on the node, sets its value to 'str' and returns it.
+XMLText* XMLDocument::NewText( const char* str )
+{
+    void* slot = textPool.Alloc();
+    XMLText* created = new (slot) XMLText( this );
+    created->memPool = &textPool;
+    created->SetValue( str );
+    return created;
+}
+
+
+// Placement-constructs an XMLDeclaration in storage taken from
+// commentPool (declarations share the comment pool), records that pool on
+// the node and sets its value. A null 'str' selects the default
+// version/encoding declaration text.
+XMLDeclaration* XMLDocument::NewDeclaration( const char* str )
+{
+    const char* text = str;
+    if ( !text ) {
+        text = "xml version=\"1.0\" encoding=\"UTF-8\"";
+    }
+
+    void* slot = commentPool.Alloc();
+    XMLDeclaration* created = new (slot) XMLDeclaration( this );
+    created->memPool = &commentPool;
+    created->SetValue( text );
+    return created;
+}
+
+
+// Placement-constructs an XMLUnknown in storage taken from commentPool
+// (unknown nodes share the comment pool), records that pool on the node,
+// sets its value to 'str' and returns it.
+XMLUnknown* XMLDocument::NewUnknown( const char* str )
+{
+    void* slot = commentPool.Alloc();
+    XMLUnknown* created = new (slot) XMLUnknown( this );
+    created->memPool = &commentPool;
+    created->SetValue( str );
+    return created;
+}
+
+
+// Clears the document, opens 'filename' in binary read mode and parses it
+// through the FILE* overload. Records XML_ERROR_FILE_NOT_FOUND when the
+// file cannot be opened. Returns the document's error id.
+int XMLDocument::LoadFile( const char* filename )
+{
+    DeleteChildren();
+    InitDocument();
+
+#if defined(_MSC_VER)
+#pragma warning ( push )
+#pragma warning ( disable : 4996 )        // Fail to see a compelling reason why this should be deprecated.
+#endif
+    FILE* input = fopen( filename, "rb" );
+#if defined(_MSC_VER)
+#pragma warning ( pop )
+#endif
+    if ( input ) {
+        LoadFile( input );
+        fclose( input );
+    }
+    else {
+        SetError( XML_ERROR_FILE_NOT_FOUND, filename, 0 );
+    }
+    return errorID;
+}
+
+
+// Clears the document, reads the whole of 'fp' into charBuffer and parses
+// it. The stream must be seekable: its length is obtained with
+// fseek/ftell.
+//
+// Returns the document's error id:
+//   - unchanged (no error) for a zero-length file,
+//   - XML_ERROR_FILE_READ_ERROR when seeking, telling or reading fails,
+//   - XML_ERROR_EMPTY_DOCUMENT when only whitespace/BOM is present,
+//   - otherwise whatever ParseDeep leaves in errorID.
+int XMLDocument::LoadFile( FILE* fp )
+{
+    DeleteChildren();
+    InitDocument();
+
+    // ftell() reports failure as -1L. Storing that in an unsigned made
+    // size+1 wrap to 0, so fread() then overran a zero-length buffer.
+    // Check every positioning call and keep the length signed until it is
+    // known to be valid.
+    if ( fseek( fp, 0, SEEK_END ) != 0 ) {
+        SetError( XML_ERROR_FILE_READ_ERROR, 0, 0 );
+        return errorID;
+    }
+    const long fileLength = ftell( fp );
+    if ( fileLength < 0 || fseek( fp, 0, SEEK_SET ) != 0 ) {
+        SetError( XML_ERROR_FILE_READ_ERROR, 0, 0 );
+        return errorID;
+    }
+
+    if ( fileLength == 0 ) {
+        return errorID;
+    }
+    const size_t size = static_cast<size_t>( fileLength );
+
+    charBuffer = new char[size+1];
+    size_t read = fread( charBuffer, 1, size, fp );
+    if ( read != size ) {
+        SetError( XML_ERROR_FILE_READ_ERROR, 0, 0 );
+        return errorID;
+    }
+
+    charBuffer[size] = 0;
+
+    const char* p = charBuffer;
+    p = XMLUtil::SkipWhiteSpace( p );
+    p = XMLUtil::ReadBOM( p, &writeBOM );
+    if ( !p || !*p ) {
+        SetError( XML_ERROR_EMPTY_DOCUMENT, 0, 0 );
+        return errorID;
+    }
+
+    // p is const; re-derive a mutable pointer at the same offset for the
+    // in-place parser.
+    ParseDeep( charBuffer + (p-charBuffer), 0 );
+    return errorID;
+}
+
+
+// Writes the document to 'filename', creating the file with mode 0644 if
+// it does not exist and replacing its previous contents otherwise.
+// Records XML_ERROR_FILE_COULD_NOT_BE_OPENED when the file cannot be
+// opened. Returns the document's error id.
+int XMLDocument::SaveFile( const char* filename )
+{
+#if defined(_MSC_VER)
+#pragma warning ( push )
+#pragma warning ( disable : 4996 )        // Fail to see a compelling reason why this should be deprecated.
+#endif
+    // open()+fdopen() is used instead of fopen( filename, "w" ) so the
+    // creation mode can be given explicitly. Unlike fopen "w", neither
+    // call truncates by itself, so O_TRUNC is required: without it,
+    // saving a shorter document over a longer file leaves stale bytes
+    // after the new XML.
+    int fd = open(filename, O_RDWR|O_CREAT|O_TRUNC, 0644);
+    // Only wrap a valid descriptor.
+    // NOTE(review): if fdopen() itself fails, 'fd' stays open; closing it
+    // needs close() from <unistd.h>, which may not be included here.
+    FILE* fp = ( fd >= 0 ) ? fdopen(fd, "w") : 0;
+#if defined(_MSC_VER)
+#pragma warning ( pop )
+#endif
+    if ( !fp ) {
+        SetError( XML_ERROR_FILE_COULD_NOT_BE_OPENED, filename, 0 );
+        return errorID;
+    }
+    SaveFile(fp);
+    fclose( fp );
+    return errorID;
+}
+
+
+// Prints the document to an already-open stream using a temporary
+// XMLPrinter bound to 'fp'. The stream is not closed. Returns the
+// document's error id.
+int XMLDocument::SaveFile( FILE* fp )
+{
+    XMLPrinter printer( fp );
+    Print( &printer );
+    return errorID;
+}
+
+
+// Clears the document and parses the NUL-terminated XML text at 'p'.
+// Leading whitespace and a BOM are skipped, then the remainder is copied
+// into charBuffer so the caller's string is not modified by parsing.
+// Records XML_ERROR_EMPTY_DOCUMENT for null, empty, or
+// whitespace/BOM-only input. Returns the document's error id.
+int XMLDocument::Parse( const char* p )
+{
+    DeleteChildren();
+    InitDocument();
+
+    // Only touch the input when there is something to skip over.
+    if ( p && *p ) {
+        p = XMLUtil::SkipWhiteSpace( p );
+        p = XMLUtil::ReadBOM( p, &writeBOM );
+    }
+    if ( !p || !*p ) {
+        SetError( XML_ERROR_EMPTY_DOCUMENT, 0, 0 );
+        return errorID;
+    }
+
+    // Copy the text including its terminating NUL.
+    const size_t bytes = strlen( p ) + 1;
+    charBuffer = new char[ bytes ];
+    memcpy( charBuffer, p, bytes );
+
+    ParseDeep( charBuffer, 0 );
+    return errorID;
+}
+
+
+// Sends the document through 'streamer' via Accept(). When 'streamer' is
+// null, a local printer bound to stdout is used instead.
+void XMLDocument::Print( XMLPrinter* streamer )
+{
+    XMLPrinter fallback( stdout );
+    Accept( streamer ? streamer : &fallback );
+}
+
+
+void XMLDocument::SetError( int error, const char* str1, const char* str2 )
+{
+    errorID = error;
+    errorStr1 = str1;
+    errorStr2 = str2;
+}
+
+
+void XMLDocument::PrintError() const
+{
+    if ( errorID ) {
+        static const int LEN = 20;
+        char buf1[LEN] = { 0 };
+        char buf2[LEN] = { 0 };
+
+        if ( errorStr1 ) {
+            TIXML_SNPRINTF( buf1, LEN, "%s", errorStr1 );
+        }
+        if ( errorStr2 ) {
+            TIXML_SNPRINTF( buf2, LEN, "%s", errorStr2 );
+        }
+
+        printf( "XMLDocument error id=%d str1=%s str2=%s\n",
+                errorID, buf1, buf2 );
+    }
+}
+
+
+// Creates a printer. Print() writes to 'file' when it is non-null and
+// appends to the internal buffer otherwise. 'compact' is stored in
+// compactMode, which the Push*/Open/Close methods test before emitting
+// newlines and indentation.
+XMLPrinter::XMLPrinter( FILE* file, bool compact ) :
+    elementJustOpened( false ),
+    firstElement( true ),
+    fp( file ),
+    depth( 0 ),
+    textDepth( -1 ),
+    processEntities( true ),
+    compactMode( compact )
+{
+    // Start with both lookup tables cleared.
+    int slot = 0;
+    while ( slot < ENTITY_RANGE ) {
+        entityFlag[slot] = false;
+        restrictedEntityFlag[slot] = false;
+        ++slot;
+    }
+
+    // Flag every character listed in the entity table.
+    for( int e=0; e<NUM_ENTITIES; ++e ) {
+        TIXMLASSERT( entities[e].value < ENTITY_RANGE );
+        if ( entities[e].value < ENTITY_RANGE ) {
+            entityFlag[ (int)entities[e].value ] = true;
+        }
+    }
+
+    // The restricted table flags only these three characters.
+    restrictedEntityFlag[(int)'&'] = true;
+    restrictedEntityFlag[(int)'<'] = true;
+    restrictedEntityFlag[(int)'>'] = true;    // not required, but consistency is nice
+
+    // Keep the buffer NUL-terminated from the outset.
+    buffer.Push( 0 );
+}
+
+
+// printf-style output primitive used by every Push*/Open/Close method.
+// With a FILE* the text goes straight to the stream; otherwise it is
+// appended to 'buffer', overwriting the previous terminating NUL so the
+// buffer remains one NUL-terminated string.
+void XMLPrinter::Print( const char* format, ... )
+{
+    va_list     va;
+    va_start( va, format );
+
+    if ( fp ) {
+        vfprintf( fp, format, va );
+    }
+    else {
+        // This seems brutally complex. Haven't figured out a better
+        // way on windows.
+        #ifdef _MSC_VER
+            int len = -1;
+            int expand = 1000;
+            while ( len < 0 ) {
+                len = vsnprintf_s( accumulator.Mem(), accumulator.Capacity(), _TRUNCATE, format, va );
+                if ( len < 0 ) {
+                    // Grow the step by 50% each retry. The previous
+                    // 'expand *= 3/2' used integer division (3/2 == 1),
+                    // so the step never grew.
+                    expand = expand * 3 / 2;
+                    accumulator.PushArr( expand );
+                }
+            }
+            // -1: start writing on top of the existing terminator.
+            char* p = buffer.PushArr( len ) - 1;
+            memcpy( p, accumulator.Mem(), len+1 );
+        #else
+            // First pass measures, second pass formats.
+            int len = vsnprintf( 0, 0, format, va );
+            // Close out and re-start the va-args
+            va_end( va );
+            va_start( va, format );
+            // vsnprintf returns a negative value on an output error; a
+            // negative count must not reach PushArr, which would shrink
+            // the buffer and drop its terminator.
+            if ( len >= 0 ) {
+                // -1: start writing on top of the existing terminator.
+                char* p = buffer.PushArr( len ) - 1;
+                vsnprintf( p, len+1, format, va );
+            }
+        #endif
+    }
+    va_end( va );
+}
+
+
+// Emits one four-space indent per level; nothing for depth <= 0.
+void XMLPrinter::PrintSpace( int depth )
+{
+    while ( depth-- > 0 ) {
+        Print( "    " );
+    }
+}
+
+
+// Prints the string p, replacing characters flagged in the active entity
+// table with "&name;" when processEntities is set. 'restricted' selects
+// restrictedEntityFlag instead of entityFlag.
+// Two cursors are used: q scans ahead, p marks the start of text that
+// has not been written yet.
+void XMLPrinter::PrintString( const char* p, bool restricted )
+{
+    // Look for runs of bytes between entities to print.
+    const char* q = p;
+    const bool* flag = restricted ? restrictedEntityFlag : entityFlag;
+
+    if ( processEntities ) {
+        while ( *q ) {
+            // Remember, char is sometimes signed. (How many times has that bitten me?)
+            // Bytes outside 1..ENTITY_RANGE-1 are never treated as entities.
+            if ( *q > 0 && *q < ENTITY_RANGE ) {
+                // Check for entities. If one is found, flush
+                // the stream up until the entity, write the
+                // entity, and keep looking.
+                if ( flag[(unsigned)(*q)] ) {
+                    while ( p < q ) {
+                        Print( "%c", *p );
+                        ++p;
+                    }
+                    for( int i=0; i<NUM_ENTITIES; ++i ) {
+                        if ( entities[i].value == *q ) {
+                            Print( "&%s;", entities[i].pattern );
+                            break;
+                        }
+                    }
+                    // p == q here; step past the character just handled.
+                    ++p;
+                }
+            }
+            ++q;
+        }
+    }
+    // Flush the remaining string. This will be the entire
+    // string if an entity wasn't found.
+    if ( !processEntities || (q-p > 0) ) {
+        Print( "%s", p );
+    }
+}
+
+
+// Optionally emits the three-byte UTF lead sequence (BOM) and/or a
+// default "xml version" declaration.
+void XMLPrinter::PushHeader( bool writeBOM, bool writeDec )
+{
+    if ( writeBOM ) {
+        // NUL-terminated so it can be printed with %s.
+        static const unsigned char bom[] = { TIXML_UTF_LEAD_0, TIXML_UTF_LEAD_1, TIXML_UTF_LEAD_2, 0 };
+        Print( "%s", bom );
+    }
+    if ( writeDec ) {
+        PushDeclaration( "xml version=\"1.0\"" );
+    }
+}
+
+
+// Begins a new element: seals a still-open start tag, pushes 'name' on
+// the element stack, emits newline and indentation when appropriate, and
+// writes "<name". The tag is left open so attributes can follow.
+void XMLPrinter::OpenElement( const char* name )
+{
+    if ( elementJustOpened ) {
+        SealElement();
+    }
+    stack.Push( name );
+
+    // No line break inside text content, before the very first node, or
+    // in compact mode.
+    const bool startNewLine = ( textDepth < 0 ) && !firstElement && !compactMode;
+    if ( startNewLine ) {
+        Print( "\n" );
+        PrintSpace( depth );
+    }
+
+    Print( "<%s", name );
+    ++depth;
+    firstElement = false;
+    elementJustOpened = true;
+}
+
+
+// Writes ' name="value"' into the currently open start tag. The value is
+// passed through PrintString with the unrestricted entity table.
+// Must be called while an element's start tag is still open (asserted in
+// debug builds).
+void XMLPrinter::PushAttribute( const char* name, const char* value )
+{
+    TIXMLASSERT( elementJustOpened );
+    Print( " %s=\"", name );
+    PrintString( value, false );
+    Print( "\"" );
+}
+
+
+// Formats an int with XMLUtil::ToStr and forwards to the string overload.
+void XMLPrinter::PushAttribute( const char* name, int v )
+{
+    char text[BUF_SIZE];
+    XMLUtil::ToStr( v, text, BUF_SIZE );
+    PushAttribute( name, text );
+}
+
+
+// Formats an unsigned with XMLUtil::ToStr and forwards to the string overload.
+void XMLPrinter::PushAttribute( const char* name, unsigned v )
+{
+    char text[BUF_SIZE];
+    XMLUtil::ToStr( v, text, BUF_SIZE );
+    PushAttribute( name, text );
+}
+
+
+// Formats a bool with XMLUtil::ToStr and forwards to the string overload.
+void XMLPrinter::PushAttribute( const char* name, bool v )
+{
+    char text[BUF_SIZE];
+    XMLUtil::ToStr( v, text, BUF_SIZE );
+    PushAttribute( name, text );
+}
+
+
+// Formats a double with XMLUtil::ToStr and forwards to the string overload.
+void XMLPrinter::PushAttribute( const char* name, double v )
+{
+    char text[BUF_SIZE];
+    XMLUtil::ToStr( v, text, BUF_SIZE );
+    PushAttribute( name, text );
+}
+
+
+// Ends the innermost open element. A start tag that is still open is
+// closed as "/>"; otherwise "</name>" is written, preceded by a newline
+// and indentation unless text is being printed or compact mode is on.
+// A trailing newline follows the outermost element outside compact mode.
+void XMLPrinter::CloseElement()
+{
+    --depth;
+    const char* tag = stack.Pop();
+
+    if ( !elementJustOpened ) {
+        if ( textDepth < 0 && !compactMode) {
+            Print( "\n" );
+            PrintSpace( depth );
+        }
+        Print( "</%s>", tag );
+    }
+    else {
+        Print( "/>" );
+    }
+    elementJustOpened = false;
+
+    // Leaving the element that held text: resume normal formatting.
+    if ( textDepth == depth ) {
+        textDepth = -1;
+    }
+    if ( depth == 0 && !compactMode) {
+        Print( "\n" );
+    }
+}
+
+
+// Terminates an open start tag with '>' and clears the open-tag flag.
+void XMLPrinter::SealElement()
+{
+    Print( ">" );
+    elementJustOpened = false;
+}
+
+
+// Writes text content into the current element. With 'cdata' the text is
+// wrapped verbatim in a CDATA section; otherwise it goes through
+// PrintString with the restricted entity table. Records the element's
+// depth in textDepth, which suppresses newline/indent output until that
+// element is closed.
+void XMLPrinter::PushText( const char* text, bool cdata )
+{
+    textDepth = depth-1;
+
+    if ( elementJustOpened ) {
+        SealElement();
+    }
+    if ( !cdata ) {
+        PrintString( text, true );
+        return;
+    }
+    Print( "<![CDATA[" );
+    Print( "%s", text );
+    Print( "]]>" );
+}
+
+// Formats an int with XMLUtil::ToStr and writes it as non-CDATA text.
+void XMLPrinter::PushText( int value )
+{
+    char text[BUF_SIZE];
+    XMLUtil::ToStr( value, text, BUF_SIZE );
+    PushText( text, false );
+}
+
+
+// Formats an unsigned with XMLUtil::ToStr and writes it as non-CDATA text.
+void XMLPrinter::PushText( unsigned value )
+{
+    char text[BUF_SIZE];
+    XMLUtil::ToStr( value, text, BUF_SIZE );
+    PushText( text, false );
+}
+
+
+// Formats a bool with XMLUtil::ToStr and writes it as non-CDATA text.
+void XMLPrinter::PushText( bool value )
+{
+    char text[BUF_SIZE];
+    XMLUtil::ToStr( value, text, BUF_SIZE );
+    PushText( text, false );
+}
+
+
+// Formats a float with XMLUtil::ToStr and writes it as non-CDATA text.
+void XMLPrinter::PushText( float value )
+{
+    char text[BUF_SIZE];
+    XMLUtil::ToStr( value, text, BUF_SIZE );
+    PushText( text, false );
+}
+
+
+// Formats a double with XMLUtil::ToStr and writes it as non-CDATA text.
+void XMLPrinter::PushText( double value )
+{
+    char text[BUF_SIZE];
+    XMLUtil::ToStr( value, text, BUF_SIZE );
+    PushText( text, false );
+}
+
+
+// Writes "<!--comment-->", sealing an open start tag first and emitting a
+// newline plus indentation unless inside text, at the very first node,
+// or in compact mode.
+void XMLPrinter::PushComment( const char* comment )
+{
+    if ( elementJustOpened ) {
+        SealElement();
+    }
+    const bool startNewLine = ( textDepth < 0 ) && !firstElement && !compactMode;
+    if ( startNewLine ) {
+        Print( "\n" );
+        PrintSpace( depth );
+    }
+    firstElement = false;
+    Print( "<!--%s-->", comment );
+}
+
+
+// Writes "<?value?>", sealing an open start tag first and emitting a
+// newline plus indentation unless inside text, at the very first node,
+// or in compact mode.
+void XMLPrinter::PushDeclaration( const char* value )
+{
+    if ( elementJustOpened ) {
+        SealElement();
+    }
+    const bool startNewLine = ( textDepth < 0 ) && !firstElement && !compactMode;
+    if ( startNewLine ) {
+        Print( "\n" );
+        PrintSpace( depth );
+    }
+    firstElement = false;
+    Print( "<?%s?>", value );
+}
+
+
+// Writes "<!value>", sealing an open start tag first and emitting a
+// newline plus indentation unless inside text, at the very first node,
+// or in compact mode.
+void XMLPrinter::PushUnknown( const char* value )
+{
+    if ( elementJustOpened ) {
+        SealElement();
+    }
+    const bool startNewLine = ( textDepth < 0 ) && !firstElement && !compactMode;
+    if ( startNewLine ) {
+        Print( "\n" );
+        PrintSpace( depth );
+    }
+    firstElement = false;
+    Print( "<!%s>", value );
+}
+
+
+// Document entry: adopts the document's entity-processing setting and
+// writes a BOM (no declaration) when the document reports one. Always
+// returns true so the traversal continues.
+bool XMLPrinter::VisitEnter( const XMLDocument& doc )
+{
+    processEntities = doc.ProcessEntities();
+    const bool emitBOM = doc.HasBOM();
+    if ( emitBOM ) {
+        PushHeader( true, false );
+    }
+    return true;
+}
+
+
+// Element entry: opens the element and writes each attribute in list
+// order, starting from 'attribute'. Always returns true so the children
+// are visited.
+bool XMLPrinter::VisitEnter( const XMLElement& element, const XMLAttribute* attribute )
+{
+    OpenElement( element.Name() );
+    for ( const XMLAttribute* cur = attribute; cur; cur = cur->Next() ) {
+        PushAttribute( cur->Name(), cur->Value() );
+    }
+    return true;
+}
+
+
+// Element exit: closes the element opened by the matching VisitEnter.
+// Always returns true.
+bool XMLPrinter::VisitExit( const XMLElement& )
+{
+    CloseElement();
+    return true;
+}
+
+
+// Prints a text node, forwarding its CDATA flag to PushText.
+// Always returns true.
+bool XMLPrinter::Visit( const XMLText& text )
+{
+    PushText( text.Value(), text.CData() );
+    return true;
+}
+
+
+// Prints a comment node via PushComment. Always returns true.
+bool XMLPrinter::Visit( const XMLComment& comment )
+{
+    PushComment( comment.Value() );
+    return true;
+}
+
+// Prints a declaration node via PushDeclaration. Always returns true.
+bool XMLPrinter::Visit( const XMLDeclaration& declaration )
+{
+    PushDeclaration( declaration.Value() );
+    return true;
+}
+
+
+// Prints an unknown node via PushUnknown. Always returns true.
+bool XMLPrinter::Visit( const XMLUnknown& unknown )
+{
+    PushUnknown( unknown.Value() );
+    return true;
+}
diff --git a/llvm/xml/tinyxml2.h b/llvm/xml/tinyxml2.h
new file mode 100644
index 0000000..80e076d
--- /dev/null
+++ b/llvm/xml/tinyxml2.h
@@ -0,0 +1,1480 @@
+/*
+Original code by Lee Thomason (www.grinninglizard.com)
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any
+damages arising from the use of this software.
+
+Permission is granted to anyone to use this software for any
+purpose, including commercial applications, and to alter it and
+redistribute it freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must
+not claim that you wrote the original software. If you use this
+software in a product, an acknowledgment in the product documentation
+would be appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and
+must not be misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+*/
+
+#ifndef TINYXML2_INCLUDED
+#define TINYXML2_INCLUDED
+
+#include <cctype>
+#include <climits>
+#include <cstdio>
+#include <cstring>
+//#include <cstdarg>
+#include <stdarg.h>
+/*
+   TODO: intern strings instead of allocation.
+*/
+/*
+    gcc: g++ -Wall tinyxml2.cpp xmltest.cpp -o gccxmltest.exe
+*/
+
+// Fold the common debug-build macros (_DEBUG, DEBUG, __DEBUG__) into a
+// single DEBUG symbol.
+#if defined( _DEBUG ) || defined( DEBUG ) || defined (__DEBUG__)
+    #ifndef DEBUG
+        #define DEBUG
+    #endif
+#endif
+
+
+// TIXMLASSERT( x ): in debug builds, breaks into the debugger (MSVC), logs
+// an assert (Android NDK), or falls back to assert(). In non-debug builds
+// it expands to an empty block and the argument is not evaluated.
+#if defined(DEBUG)
+        #if defined(_MSC_VER)
+                #define TIXMLASSERT( x )           if ( !(x)) { __debugbreak(); } //if ( !(x)) WinDebugBreak()
+        #elif defined (ANDROID_NDK)
+                #include <android/log.h>
+                #define TIXMLASSERT( x )           if ( !(x)) { __android_log_assert( "assert", "grinliz", "ASSERT in '%s' at %d.", __FILE__, __LINE__ ); }
+        #else
+                #include <assert.h>
+                #define TIXMLASSERT                assert
+        #endif
+#else
+        #define TIXMLASSERT( x )           {}
+#endif
+
+
+// Portability shims for bounded formatting and scanning:
+//   TIXML_SNPRINTF( buffer, size, format, ... )
+//   TIXML_SSCANF  ( str, format, ... )
+// MSVC 2005+ maps them onto the *_s "secure" variants (truncating on
+// overflow); every other compiler uses plain snprintf / sscanf.
+#if defined(_MSC_VER) && (_MSC_VER >= 1400 )
+    // Microsoft visual studio, version 2005 and higher.
+    /*int _snprintf_s(
+       char *buffer,
+       size_t sizeOfBuffer,
+       size_t count,
+       const char *format [,
+          argument] ...
+    );*/
+    inline int TIXML_SNPRINTF( char* buffer, size_t size, const char* format, ... ) {
+        va_list va;
+        va_start( va, format );
+        int result = vsnprintf_s( buffer, size, _TRUNCATE, format, va );
+        va_end( va );
+        return result;
+    }
+    #define TIXML_SSCANF   sscanf_s
+#else
+    // GCC version 3 and higher
+    //#warning( "Using sn* functions." )
+    #define TIXML_SNPRINTF snprintf
+    #define TIXML_SSCANF   sscanf
+#endif
+
+// Library version as major.minor.patch (1.0.6).
+static const int TIXML2_MAJOR_VERSION = 1;
+static const int TIXML2_MINOR_VERSION = 0;
+static const int TIXML2_PATCH_VERSION = 6;
+
+namespace tinyxml2
+{
+class XMLDocument;
+class XMLElement;
+class XMLAttribute;
+class XMLComment;
+class XMLNode;
+class XMLText;
+class XMLDeclaration;
+class XMLUnknown;
+
+class XMLPrinter;
+
+/*
+    A class that wraps strings. Normally stores the start and end
+    pointers into the XML file itself, and will apply normalization
+    and entity translation if actually read. Can also store (and memory
+    manage) a traditional char[]
+*/
+class StrPair
+{
+public:
+    // Processing flags, plus ready-made combinations for each kind of
+    // string the parser reads.
+    enum {
+        NEEDS_ENTITY_PROCESSING            = 0x01,
+        NEEDS_NEWLINE_NORMALIZATION        = 0x02,
+
+        TEXT_ELEMENT        = NEEDS_ENTITY_PROCESSING | NEEDS_NEWLINE_NORMALIZATION,
+        TEXT_ELEMENT_LEAVE_ENTITIES        = NEEDS_NEWLINE_NORMALIZATION,
+        ATTRIBUTE_NAME        = 0,
+        ATTRIBUTE_VALUE        = NEEDS_ENTITY_PROCESSING | NEEDS_NEWLINE_NORMALIZATION,
+        ATTRIBUTE_VALUE_LEAVE_ENTITIES        = NEEDS_NEWLINE_NORMALIZATION,
+        COMMENT                = NEEDS_NEWLINE_NORMALIZATION
+    };
+
+    StrPair() : flags( 0 ), start( 0 ), end( 0 ) {}
+    ~StrPair();
+
+    // Points the pair at the range [_start, _end) and marks it as needing
+    // a flush, in addition to the caller's flags.
+    void Set( char* _start, char* _end, int _flags ) {
+        Reset();
+        start = _start;
+        end = _end;
+        flags = _flags | NEEDS_FLUSH;
+    }
+    const char* GetStr();
+    // True when the start and end pointers coincide.
+    bool Empty() const {
+        return start == end;
+    }
+
+    // Stores 'str' as the start pointer after a Reset(); 'end' and
+    // 'flags' are left as Reset() set them.
+    void SetInternedStr( const char* str ) {
+        Reset();
+        start = const_cast<char*>(str);
+    }
+    void SetStr( const char* str, int flags=0 );
+
+    char* ParseText( char* in, const char* endTag, int strFlags );
+    char* ParseName( char* in );
+
+
+private:
+    void Reset();
+
+    // Internal state bits, kept above the public flag values.
+    enum {
+        NEEDS_FLUSH = 0x100,
+        NEEDS_DELETE = 0x200
+    };
+
+    // After parsing, if *end != 0, it can be set to zero.
+    int flags;
+    char* start;
+    char* end;
+};
+
+
+/*
+    A dynamic array of Plain Old Data. Doesn't support constructors, etc.
+    Has a small initial memory pool, so that low or no usage will not
+    cause a call to new/delete
+*/
+template <class T, int INIT>
+class DynArray
+{
+public:
+    // Starts out using the embedded 'pool' of INIT elements, so small
+    // arrays never touch the heap.
+    // Spelled with the injected class name: 'DynArray< T, INIT >()' (a
+    // template-id as constructor name) is rejected by C++20 compilers.
+    DynArray()
+    {
+        mem = pool;
+        allocated = INIT;
+        size = 0;
+    }
+    // Frees the heap block, if the array ever outgrew the embedded pool.
+    ~DynArray()
+    {
+        if ( mem != pool ) {
+            delete [] mem;
+        }
+    }
+    // Appends one element, growing the storage if necessary.
+    void Push( T t )
+    {
+        EnsureCapacity( size+1 );
+        mem[size++] = t;
+    }
+
+    // Appends 'count' uninitialized elements and returns a pointer to the
+    // first of them. The pointer is invalidated by any later growth.
+    T* PushArr( int count )
+    {
+        EnsureCapacity( size+count );
+        T* ret = &mem[size];
+        size += count;
+        return ret;
+    }
+    // Removes and returns the last element. The array must not be empty
+    // (asserted in debug builds, matching PopArr).
+    T Pop() {
+        TIXMLASSERT( size > 0 );
+        return mem[--size];
+    }
+    // Removes the last 'count' elements.
+    void PopArr( int count )
+    {
+        TIXMLASSERT( size >= count );
+        size -= count;
+    }
+
+    bool Empty() const                    { return size == 0; }
+    T& operator[](int i)                { TIXMLASSERT( i>= 0 && i < size ); return mem[i]; }
+    const T& operator[](int i) const    { TIXMLASSERT( i>= 0 && i < size ); return mem[i]; }
+    int Size() const                    { return size; }
+    int Capacity() const                { return allocated; }
+    const T* Mem() const                { return mem; }
+    T* Mem()                            { return mem; }
+
+
+private:
+    // Grows the storage to twice the requested capacity when 'cap'
+    // exceeds what is currently allocated. Existing elements are moved
+    // with memcpy.
+    void EnsureCapacity( int cap ) {
+        if ( cap > allocated ) {
+            int newAllocated = cap * 2;
+            T* newMem = new T[newAllocated];
+            memcpy( newMem, mem, sizeof(T)*size );    // warning: not using constructors, only works for PODs
+            if ( mem != pool ) delete [] mem;
+            mem = newMem;
+            allocated = newAllocated;
+        }
+    }
+
+    T* mem;               // current storage: 'pool' or a heap block
+    T pool[INIT];         // embedded initial storage
+    int allocated;        // objects allocated
+    int size;            // number objects in use
+};
+
+
+/*
+    Parent virtual class of a pool for fast allocation
+    and deallocation of objects.
+*/
+class MemPool
+{
+public:
+    MemPool() {}
+    virtual ~MemPool() {}
+
+    /// Size in bytes of one item handed out by Alloc().
+    virtual int ItemSize() const = 0;
+    /// Hands out storage for one item.
+    virtual void* Alloc() = 0;
+    /// Returns storage previously obtained from Alloc() to the pool.
+    virtual void Free( void* ) = 0;
+};
+
+
+/*
+    Template child class to create pools of the correct type.
+*/
+template< int SIZE >
+class MemPoolT : public MemPool
+{
+public:
+    MemPoolT() : root(0), currentAllocs(0), nAllocs(0), maxAllocs(0)    {}
+    ~MemPoolT() {
+        // Release every block that was ever allocated.
+        const int blockCount = blockPtrs.Size();
+        for( int b=0; b<blockCount; ++b ) {
+            delete blockPtrs[b];
+        }
+    }
+
+    virtual int ItemSize() const    { return SIZE; }
+    int CurrentAllocs() const        { return currentAllocs; }
+
+    // Pops one chunk off the free list, first carving a new block into
+    // COUNT chained chunks when the list is empty.
+    virtual void* Alloc() {
+        if ( !root ) {
+            Block* fresh = new Block();
+            blockPtrs.Push( fresh );
+
+            // Chain the chunks in address order; the last one ends the list.
+            Chunk* const chunks = fresh->chunk;
+            chunks[COUNT-1].next = 0;
+            for( int c=COUNT-2; c>=0; --c ) {
+                chunks[c].next = &chunks[c+1];
+            }
+            root = chunks;
+        }
+        Chunk* const taken = root;
+        root = taken->next;
+
+        // Bookkeeping: live count, high-water mark, lifetime total.
+        ++nAllocs;
+        ++currentAllocs;
+        if ( currentAllocs > maxAllocs ) {
+            maxAllocs = currentAllocs;
+        }
+        return taken;
+    }
+    // Pushes a chunk back onto the free list. The chunk is filled with
+    // 0xfe before being relinked. Null is ignored.
+    virtual void Free( void* mem ) {
+        if ( !mem ) {
+            return;
+        }
+        Chunk* released = (Chunk*)mem;
+        memset( released, 0xfe, sizeof(Chunk) );
+        released->next = root;
+        root = released;
+        --currentAllocs;
+    }
+    // Prints the pool's usage statistics to stdout.
+    void Trace( const char* name ) {
+        printf( "Mempool %s watermark=%d [%dk] current=%d size=%d nAlloc=%d blocks=%d\n",
+                 name, maxAllocs, maxAllocs*SIZE/1024, currentAllocs, SIZE, nAllocs, blockPtrs.Size() );
+    }
+
+private:
+    // Number of chunks per block.
+    enum { COUNT = 1024/SIZE };
+    // A chunk is either a free-list link or SIZE bytes of payload.
+    union Chunk {
+        Chunk* next;
+        char mem[SIZE];
+    };
+    struct Block {
+        Chunk chunk[COUNT];
+    };
+    DynArray< Block*, 10 > blockPtrs;   // every block ever allocated
+    Chunk* root;                        // head of the free list
+
+    int currentAllocs;                  // chunks currently handed out
+    int nAllocs;                        // total Alloc() calls
+    int maxAllocs;                      // high-water mark of currentAllocs
+};
+
+
+
+/**
+    Implements the interface to the "Visitor pattern" (see the Accept() method.)
+    If you call the Accept() method, it requires being passed a XMLVisitor
+    class to handle callbacks. For nodes that contain other nodes (Document, Element)
+    you will get called with a VisitEnter/VisitExit pair. Nodes that are always leafs
+    are simply called with Visit().
+
+    If you return 'true' from a Visit method, recursive parsing will continue. If you return
+    false, <b>no children of this node or its siblings</b> will be visited.
+
+    All flavors of Visit methods have a default implementation that returns 'true' (continue
+    visiting). You need to only override methods that are interesting to you.
+
+    Generally Accept() is called on the XMLDocument, although all nodes support visiting.
+
+    You should never change the document from a callback.
+
+    @sa XMLNode::Accept()
+*/
+class XMLVisitor
+{
+public:
+    virtual ~XMLVisitor() {}
+
+    // Every callback defaults to returning true; override only the ones
+    // of interest.
+
+    /// Called when entering a document.
+    virtual bool VisitEnter( const XMLDocument& /*doc*/ ) {
+        return true;
+    }
+    /// Called when leaving a document.
+    virtual bool VisitExit( const XMLDocument& /*doc*/ ) {
+        return true;
+    }
+
+    /// Called when entering an element, with its first attribute.
+    virtual bool VisitEnter( const XMLElement& /*element*/, const XMLAttribute* /*firstAttribute*/ ) {
+        return true;
+    }
+    /// Called when leaving an element.
+    virtual bool VisitExit( const XMLElement& /*element*/ ) {
+        return true;
+    }
+
+    /// Called for a declaration node.
+    virtual bool Visit( const XMLDeclaration& /*declaration*/ ) {
+        return true;
+    }
+    /// Called for a text node.
+    virtual bool Visit( const XMLText& /*text*/ ) {
+        return true;
+    }
+    /// Called for a comment node.
+    virtual bool Visit( const XMLComment& /*comment*/ ) {
+        return true;
+    }
+    /// Called for an unknown node.
+    virtual bool Visit( const XMLUnknown& /*unknown*/ ) {
+        return true;
+    }
+};
+
+
+/*
+    Utility functionality.
+*/
+class XMLUtil
+{
+public:
+    // Advances p past leading whitespace and returns the new position.
+    // Anything in the high order range of UTF-8 is assumed to not be whitespace. This isn't
+    // correct, but simple, and usually works.
+    static const char* SkipWhiteSpace( const char* p ) {
+        for ( ;; ++p ) {
+            if ( IsUTF8Continuation( *p ) ) {
+                break;
+            }
+            if ( !isspace( static_cast<unsigned char>( *p ) ) ) {
+                break;
+            }
+        }
+        return p;
+    }
+    static char* SkipWhiteSpace( char* p ) {
+        for ( ;; ++p ) {
+            if ( IsUTF8Continuation( *p ) ) {
+                break;
+            }
+            if ( !isspace( static_cast<unsigned char>( *p ) ) ) {
+                break;
+            }
+        }
+        return p;
+    }
+
+    // Compares two strings. Identical pointers are equal. Otherwise the
+    // strings are equal when they match over the first nChar characters,
+    // or when both end at the same position.
+    inline static bool StringEqual( const char* p, const char* q, int nChar=INT_MAX )  {
+        if ( p == q ) {
+            return true;
+        }
+        int matched = 0;
+        for ( ; *p && *q && *p == *q && matched<nChar; ++p, ++q ) {
+            ++matched;
+        }
+        return ( matched == nChar ) || ( *p == 0 && *q == 0 );
+    }
+    // Non-zero for any byte with the high bit set.
+    inline static int IsUTF8Continuation( const char p ) {
+        return p & 0x80;
+    }
+    // Bytes >= 128 are always accepted; ASCII bytes defer to isalnum / isalpha.
+    inline static int IsAlphaNum( unsigned char anyByte ) {
+        if ( anyByte >= 128 ) {
+            return 1;
+        }
+        return isalnum( anyByte );
+    }
+    inline static int IsAlpha( unsigned char anyByte ) {
+        if ( anyByte >= 128 ) {
+            return 1;
+        }
+        return isalpha( anyByte );
+    }
+
+    static const char* ReadBOM( const char* p, bool* hasBOM );
+    // p is the starting location,
+    // the UTF-8 value of the entity will be placed in value, and length filled in.
+    static const char* GetCharacterRef( const char* p, char* value, int* length );
+    static void ConvertUTF32ToUTF8( unsigned long input, char* output, int* length );
+
+    // converts primitive types to strings
+    static void ToStr( int v, char* buffer, int bufferSize );
+    static void ToStr( unsigned v, char* buffer, int bufferSize );
+    static void ToStr( bool v, char* buffer, int bufferSize );
+    static void ToStr( float v, char* buffer, int bufferSize );
+    static void ToStr( double v, char* buffer, int bufferSize );
+
+    // converts strings to primitive types
+    static bool    ToInt( const char* str, int* value );
+    static bool ToUnsigned( const char* str, unsigned* value );
+    static bool    ToBool( const char* str, bool* value );
+    static bool    ToFloat( const char* str, float* value );
+    static bool ToDouble( const char* str, double* value );
+};
+
+
+/** XMLNode is a base class for every object that is in the
+    XML Document Object Model (DOM), except XMLAttributes.
+    Nodes have siblings, a parent, and children which can
+    be navigated. A node is always in a XMLDocument.
+    The type of a XMLNode can be queried, and it can
+    be cast to its more defined type.
+
+    A XMLDocument allocates memory for all its Nodes.
+    When the XMLDocument gets deleted, all its Nodes
+    will also be deleted.
+
+    @verbatim
+    A Document can contain:    Element    (container or leaf)
+                            Comment (leaf)
+                            Unknown (leaf)
+                            Declaration( leaf )
+
+    An Element can contain:    Element (container or leaf)
+                            Text    (leaf)
+                            Attributes (not on tree)
+                            Comment (leaf)
+                            Unknown (leaf)
+
+    @endverbatim
+*/
+class XMLNode
+{
+    friend class XMLDocument;
+    friend class XMLElement;
+public:
+
+    /// The XMLDocument that owns this node (read-only access).
+    const XMLDocument* GetDocument() const
+    {
+        return document;
+    }
+    /// The XMLDocument that owns this node.
+    XMLDocument* GetDocument()
+    {
+        return document;
+    }
+
+    // Checked down-casts. The base class answers null for every type; a
+    // derived node class overrides the pair that matches its own type.
+    virtual XMLElement* ToElement()             { return 0; }    ///< This node as an Element, or null.
+    virtual XMLText* ToText()                   { return 0; }    ///< This node as Text, or null.
+    virtual XMLComment* ToComment()             { return 0; }    ///< This node as a Comment, or null.
+    virtual XMLDocument* ToDocument()           { return 0; }    ///< This node as a Document, or null.
+    virtual XMLDeclaration* ToDeclaration()     { return 0; }    ///< This node as a Declaration, or null.
+    virtual XMLUnknown* ToUnknown()             { return 0; }    ///< This node as an Unknown, or null.
+
+    virtual const XMLElement* ToElement() const             { return 0; }
+    virtual const XMLText* ToText() const                   { return 0; }
+    virtual const XMLComment* ToComment() const             { return 0; }
+    virtual const XMLDocument* ToDocument() const           { return 0; }
+    virtual const XMLDeclaration* ToDeclaration() const     { return 0; }
+    virtual const XMLUnknown* ToUnknown() const             { return 0; }
+
+    /** The string held by this node. What it means depends on the node type:
+        @verbatim
+        Document:    empty
+        Element:     name of the element
+        Comment:     the comment text
+        Unknown:     the tag contents
+        Text:        the text string
+        @endverbatim
+    */
+    const char* Value() const
+    {
+        return value.GetStr();
+    }
+    /** Replace the string held by this node.
+        @sa Value()
+    */
+    void SetValue( const char* val, bool staticMem=false );
+
+    /// The parent of this node in the DOM.
+    const XMLNode* Parent() const
+    {
+        return parent;
+    }
+    XMLNode* Parent()
+    {
+        return parent;
+    }
+
+    /// True when this node has no children.
+    bool NoChildren() const
+    {
+        return firstChild == 0;
+    }
+
+    /// The first child node, or null if there is none.
+    const XMLNode* FirstChild() const
+    {
+        return firstChild;
+    }
+    XMLNode* FirstChild()
+    {
+        return firstChild;
+    }
+    /** The first child element, or optionally the first child
+        element with the given name.
+    */
+    const XMLElement* FirstChildElement( const char* value=0 ) const;
+    XMLElement* FirstChildElement( const char* _value=0 )
+    {
+        // Forward to the const overload and drop the const from its result.
+        const XMLNode* self = this;
+        return const_cast<XMLElement*>( self->FirstChildElement( _value ) );
+    }
+
+    /// The last child node, or null if there is none.
+    const XMLNode* LastChild() const
+    {
+        return lastChild;
+    }
+    XMLNode* LastChild()
+    {
+        return lastChild;
+    }
+
+    /** The last child element, or optionally the last child
+        element with the given name.
+    */
+    const XMLElement* LastChildElement( const char* value=0 ) const;
+    XMLElement* LastChildElement( const char* _value=0 )
+    {
+        const XMLNode* self = this;
+        return const_cast<XMLElement*>( self->LastChildElement( _value ) );
+    }
+
+    /// The previous (left) sibling node of this node.
+    const XMLNode* PreviousSibling() const
+    {
+        return prev;
+    }
+    XMLNode* PreviousSibling()
+    {
+        return prev;
+    }
+
+    /// The previous (left) sibling element of this node, with an optionally supplied name.
+    const XMLElement* PreviousSiblingElement( const char* value=0 ) const ;
+    XMLElement* PreviousSiblingElement( const char* _value=0 )
+    {
+        const XMLNode* self = this;
+        return const_cast<XMLElement*>( self->PreviousSiblingElement( _value ) );
+    }
+
+    /// The next (right) sibling node of this node.
+    const XMLNode* NextSibling() const
+    {
+        return next;
+    }
+    XMLNode* NextSibling()
+    {
+        return next;
+    }
+
+    /// The next (right) sibling element of this node, with an optionally supplied name.
+    const XMLElement* NextSiblingElement( const char* value=0 ) const;
+    XMLElement* NextSiblingElement( const char* _value=0 )
+    {
+        const XMLNode* self = this;
+        return const_cast<XMLElement*>( self->NextSiblingElement( _value ) );
+    }
+
+    /**
+        Add a child node as the last (right) child.
+    */
+    XMLNode* InsertEndChild( XMLNode* addThis );
+
+    /// Same as InsertEndChild(); simply forwards to it.
+    XMLNode* LinkEndChild( XMLNode* addThis )
+    {
+        return InsertEndChild( addThis );
+    }
+    /**
+        Add a child node as the first (left) child.
+    */
+    XMLNode* InsertFirstChild( XMLNode* addThis );
+    /**
+        Add a node after the specified child node.
+    */
+    XMLNode* InsertAfterChild( XMLNode* afterThis, XMLNode* addThis );
+
+    /**
+        Delete all the children of this node.
+    */
+    void DeleteChildren();
+
+    /**
+        Delete a child of this node.
+    */
+    void DeleteChild( XMLNode* node );
+
+    /**
+        Make a copy of this node, but not its children.
+        You may pass in a Document pointer that will be
+        the owner of the new Node. If the 'document' is
+        null, then the node returned will be allocated
+        from the current Document. (this->GetDocument())
+
+        Note: if called on a XMLDocument, this will return null.
+    */
+    virtual XMLNode* ShallowClone( XMLDocument* document ) const = 0;
+
+    /**
+        Test if 2 nodes are the same, but don't test children.
+        The 2 nodes do not need to be in the same Document.
+
+        Note: if called on a XMLDocument, this will return false.
+    */
+    virtual bool ShallowEqual( const XMLNode* compare ) const = 0;
+
+    /** Accept a hierarchical visit of the nodes in the DOM. Every node in the
+        XML tree will be conditionally visited and the host will be called back
+        via the XMLVisitor interface.
+
+        This is essentially a SAX interface. (Note however it doesn't re-parse
+        the XML for the callbacks, so the performance is unchanged by using this
+        interface versus any other.)
+
+        The interface has been based on ideas from:
+
+        - http://www.saxproject.org/
+        - http://c2.com/cgi/wiki?HierarchicalVisitorPattern
+
+        Which are both good references for "visiting".
+
+        An example of using Accept():
+        @verbatim
+        XMLPrinter printer;
+        doc.Accept( &printer );
+        const char* xmlcstr = printer.CStr();
+        @endverbatim
+    */
+    virtual bool Accept( XMLVisitor* visitor ) const = 0;
+
+    // internal
+    virtual char* ParseDeep( char*, StrPair* );
+
+protected:
+    XMLNode( XMLDocument* );
+    virtual ~XMLNode();
+    XMLNode( const XMLNode& );                // not supported
+    XMLNode& operator=( const XMLNode& );     // not supported
+
+    XMLDocument* document;      // returned by GetDocument()
+    XMLNode* parent;            // returned by Parent()
+    mutable StrPair value;      // backing store for Value()
+
+    XMLNode* firstChild;        // returned by FirstChild()
+    XMLNode* lastChild;         // returned by LastChild()
+
+    XMLNode* prev;              // returned by PreviousSibling()
+    XMLNode* next;              // returned by NextSibling()
+
+private:
+    MemPool* memPool;
+    void Unlink( XMLNode* child );
+};
+
+
+/** XML text.
+
+    Note that a text node can have child element nodes, for example:
+    @verbatim
+    <root>This is <b>bold</b></root>
+    @endverbatim
+
+    A text node can have 2 ways to output the text: "normal" output
+    and CDATA. It will default to the mode it was parsed from the XML file and
+    you generally want to leave it alone, but you can change the output mode with
+    SetCData() and query it with CData().
+*/
+class XMLText : public XMLNode
+{
+    friend class XMLBase;
+    friend class XMLDocument;
+public:
+    virtual bool Accept( XMLVisitor* visitor ) const;
+
+    /// A text node casts to itself.
+    virtual XMLText* ToText()
+    {
+        return this;
+    }
+    virtual const XMLText* ToText() const
+    {
+        return this;
+    }
+
+    /// Choose whether this node is CDATA (true) or standard text (false).
+    void SetCData( bool _isCData )
+    {
+        isCData = _isCData;
+    }
+    /// True when this node is flagged as CDATA.
+    bool CData() const
+    {
+        return isCData;
+    }
+
+    virtual char* ParseDeep( char*, StrPair* endTag );
+    virtual XMLNode* ShallowClone( XMLDocument* document ) const;
+    virtual bool ShallowEqual( const XMLNode* compare ) const;
+
+
+protected:
+    /// New text nodes start out as standard (non-CDATA) text.
+    XMLText( XMLDocument* doc ) : XMLNode( doc ), isCData( false )
+    {
+    }
+    virtual ~XMLText()
+    {
+    }
+    XMLText( const XMLText& );                // not supported
+    XMLText& operator=( const XMLText& );     // not supported
+
+private:
+    bool isCData;    // set by SetCData(), read by CData()
+};
+
+
+/** An XML Comment. */
+class XMLComment : public XMLNode
+{
+    friend class XMLDocument;
+public:
+    /// A comment node casts to itself.
+    virtual XMLComment* ToComment()
+    {
+        return this;
+    }
+    virtual const XMLComment* ToComment() const
+    {
+        return this;
+    }
+
+    virtual bool Accept( XMLVisitor* visitor ) const;
+
+    virtual char* ParseDeep( char*, StrPair* endTag );
+    virtual XMLNode* ShallowClone( XMLDocument* document ) const;
+    virtual bool ShallowEqual( const XMLNode* compare ) const;
+
+protected:
+    XMLComment( XMLDocument* doc );
+    virtual ~XMLComment();
+    XMLComment( const XMLComment& );                // not supported
+    XMLComment& operator=( const XMLComment& );     // not supported
+};
+
+
+/** In correct XML the declaration is the first entry in the file.
+    @verbatim
+        <?xml version="1.0" standalone="yes"?>
+    @endverbatim
+
+    TinyXML2 will happily read or write files without a declaration,
+    however.
+
+    The text of the declaration isn't interpreted. It is parsed
+    and written as a string.
+*/
+class XMLDeclaration : public XMLNode
+{
+    friend class XMLDocument;
+public:
+    /// A declaration node casts to itself.
+    virtual XMLDeclaration* ToDeclaration()
+    {
+        return this;
+    }
+    virtual const XMLDeclaration* ToDeclaration() const
+    {
+        return this;
+    }
+
+    virtual bool Accept( XMLVisitor* visitor ) const;
+
+    virtual char* ParseDeep( char*, StrPair* endTag );
+    virtual XMLNode* ShallowClone( XMLDocument* document ) const;
+    virtual bool ShallowEqual( const XMLNode* compare ) const;
+
+protected:
+    XMLDeclaration( XMLDocument* doc );
+    virtual ~XMLDeclaration();
+    XMLDeclaration( const XMLDeclaration& );                // not supported
+    XMLDeclaration& operator=( const XMLDeclaration& );     // not supported
+};
+
+
+/** Any tag that tinyXml doesn't recognize is saved as an
+    unknown. It is a tag of text, but should not be modified.
+    It will be written back to the XML, unchanged, when the file
+    is saved.
+
+    DTD tags get thrown into XMLUnknowns.
+*/
+class XMLUnknown : public XMLNode
+{
+    friend class XMLDocument;
+public:
+    /// An unknown node casts to itself.
+    virtual XMLUnknown* ToUnknown()
+    {
+        return this;
+    }
+    virtual const XMLUnknown* ToUnknown() const
+    {
+        return this;
+    }
+
+    virtual bool Accept( XMLVisitor* visitor ) const;
+
+    virtual char* ParseDeep( char*, StrPair* endTag );
+    virtual XMLNode* ShallowClone( XMLDocument* document ) const;
+    virtual bool ShallowEqual( const XMLNode* compare ) const;
+
+protected:
+    XMLUnknown( XMLDocument* doc );
+    virtual ~XMLUnknown();
+    XMLUnknown( const XMLUnknown& );                // not supported
+    XMLUnknown& operator=( const XMLUnknown& );     // not supported
+};
+
+
+enum {
+    XML_NO_ERROR = 0,
+    XML_SUCCESS = 0,
+
+    XML_NO_ATTRIBUTE,
+    XML_WRONG_ATTRIBUTE_TYPE,
+
+    XML_ERROR_FILE_NOT_FOUND,
+    XML_ERROR_FILE_COULD_NOT_BE_OPENED,
+    XML_ERROR_FILE_READ_ERROR,
+    XML_ERROR_ELEMENT_MISMATCH,
+    XML_ERROR_PARSING_ELEMENT,
+    XML_ERROR_PARSING_ATTRIBUTE,
+    XML_ERROR_IDENTIFYING_TAG,
+    XML_ERROR_PARSING_TEXT,
+    XML_ERROR_PARSING_CDATA,
+    XML_ERROR_PARSING_COMMENT,
+    XML_ERROR_PARSING_DECLARATION,
+    XML_ERROR_PARSING_UNKNOWN,
+    XML_ERROR_EMPTY_DOCUMENT,
+    XML_ERROR_MISMATCHED_ELEMENT,
+    XML_ERROR_PARSING,
+
+    XML_CAN_NOT_CONVERT_TEXT,
+    XML_NO_TEXT_NODE
+};
+
+
+/** An attribute is a name-value pair. Elements have an arbitrary
+    number of attributes, each with a unique name.
+
+    @note The attributes are not XMLNodes. You may only query the
+    Next() attribute in a list.
+*/
+class XMLAttribute
+{
+    friend class XMLElement;
+public:
+    const char* Name() const { return name.GetStr(); }            ///< The name of the attribute.
+    const char* Value() const { return value.GetStr(); }        ///< The value of the attribute.
+    const XMLAttribute* Next() const { return next; }            ///< The next attribute in the list.
+
+    /** IntValue interprets the attribute as an integer, and returns the value.
+        The result starts at 0 and is whatever QueryIntValue() stores into it, so
+        there is no way to tell a failed conversion from a real 0;
+        use QueryIntValue() if you need error checking.
+    */
+    int         IntValue() const                { int i=0;        QueryIntValue( &i );        return i; }
+    /// Query as an unsigned integer. See IntValue()
+    unsigned UnsignedValue() const            { unsigned i=0; QueryUnsignedValue( &i );    return i; }
+    /// Query as a boolean (starts at false). See IntValue()
+    bool     BoolValue() const                { bool b=false; QueryBoolValue( &b );        return b; }
+    /// Query as a double. See IntValue()
+    double      DoubleValue() const            { double d=0;    QueryDoubleValue( &d );        return d; }
+    /// Query as a float. See IntValue()
+    float     FloatValue() const                { float f=0;    QueryFloatValue( &f );        return f; }
+
+    /** QueryIntValue interprets the attribute as an integer, and returns the value
+        in the provided parameter. The function will return XML_NO_ERROR on success,
+        and XML_WRONG_ATTRIBUTE_TYPE if the conversion is not successful.
+    */
+    int QueryIntValue( int* value ) const;
+    /// See QueryIntValue
+    int QueryUnsignedValue( unsigned int* value ) const;
+    /// See QueryIntValue
+    int QueryBoolValue( bool* value ) const;
+    /// See QueryIntValue
+    int QueryDoubleValue( double* value ) const;
+    /// See QueryIntValue
+    int QueryFloatValue( float* value ) const;
+
+    /// Set the attribute to a string value.
+    void SetAttribute( const char* value );
+    /// Set the attribute to value.
+    void SetAttribute( int value );
+    /// Set the attribute to value.
+    void SetAttribute( unsigned value );
+    /// Set the attribute to value.
+    void SetAttribute( bool value );
+    /// Set the attribute to value.
+    void SetAttribute( double value );
+    /// Set the attribute to value.
+    void SetAttribute( float value );
+
+private:
+    enum { BUF_SIZE = 200 };
+
+    // Both pointers start out null so that a freshly constructed attribute
+    // never carries an indeterminate memPool pointer. The initializers are
+    // listed in member declaration order.
+    XMLAttribute() : next( 0 ), memPool( 0 ) {}
+    virtual ~XMLAttribute()    {}
+    XMLAttribute( const XMLAttribute& );    // not supported
+    void operator=( const XMLAttribute& );    // not supported
+    void SetName( const char* name );
+
+    char* ParseDeep( char* p, bool processEntities );
+
+    mutable StrPair name;       // backing store for Name()
+    mutable StrPair value;      // backing store for Value()
+    XMLAttribute* next;         // returned by Next(); null for a new attribute
+    MemPool* memPool;           // null until assigned by the code that creates the attribute
+};
+
+
+/** The element is a container class. It has a value, the element name,
+    and can contain other elements, text, comments, and unknowns.
+    Elements also contain an arbitrary number of attributes.
+*/
+class XMLElement : public XMLNode
+{
+    friend class XMLBase;
+    friend class XMLDocument;
+public:
+    /// Get the name of an element (which is the Value() of the node.)
+    const char* Name() const        { return Value(); }
+    /// Set the name of the element.
+    void SetName( const char* str, bool staticMem=false )    { SetValue( str, staticMem ); }
+
+    virtual XMLElement* ToElement()                { return this; }
+    virtual const XMLElement* ToElement() const { return this; }
+    virtual bool Accept( XMLVisitor* visitor ) const;
+
+    /** Given an attribute name, Attribute() returns the value
+        for the attribute of that name, or null if none
+        exists. For example:
+
+        @verbatim
+        const char* value = ele->Attribute( "foo" );
+        @endverbatim
+
+        The 'value' parameter is normally null. However, if specified,
+        the attribute will only be returned if the 'name' and 'value'
+        match. This allows you to write code:
+
+        @verbatim
+        if ( ele->Attribute( "foo", "bar" ) ) callFooIsBar();
+        @endverbatim
+
+        rather than:
+        @verbatim
+        if ( ele->Attribute( "foo" ) ) {
+            if ( strcmp( ele->Attribute( "foo" ), "bar" ) == 0 ) callFooIsBar();
+        }
+        @endverbatim
+    */
+    const char* Attribute( const char* name, const char* value=0 ) const;
+
+    /** Given an attribute name, IntAttribute() returns the value
+        of the attribute interpreted as an integer. The result starts
+        at 0 and is whatever QueryIntAttribute() stores into it. For a
+        method with error checking, see QueryIntAttribute()
+    */
+    int         IntAttribute( const char* name ) const        { int i=0;        QueryIntAttribute( name, &i );        return i; }
+    /// See IntAttribute()
+    unsigned UnsignedAttribute( const char* name ) const{ unsigned i=0; QueryUnsignedAttribute( name, &i ); return i; }
+    /// See IntAttribute()
+    bool     BoolAttribute( const char* name ) const    { bool b=false; QueryBoolAttribute( name, &b );        return b; }
+    /// See IntAttribute()
+    double      DoubleAttribute( const char* name ) const    { double d=0;    QueryDoubleAttribute( name, &d );    return d; }
+    /// See IntAttribute()
+    float     FloatAttribute( const char* name ) const    { float f=0;    QueryFloatAttribute( name, &f );    return f; }
+
+    /** Given an attribute name, QueryIntAttribute() returns
+        XML_NO_ERROR, XML_WRONG_ATTRIBUTE_TYPE if the conversion
+        can't be performed, or XML_NO_ATTRIBUTE if the attribute
+        doesn't exist. If successful, the result of the conversion
+        will be written to 'value'. If not successful, nothing will
+        be written to 'value'. This allows you to provide default
+        value:
+
+        @verbatim
+        int value = 10;
+        QueryIntAttribute( "foo", &value );        // if "foo" isn't found, value will still be 10
+        @endverbatim
+    */
+    int QueryIntAttribute( const char* name, int* _value ) const                { const XMLAttribute* a = FindAttribute( name ); if ( !a ) return XML_NO_ATTRIBUTE; return a->QueryIntValue( _value ); }
+    /// See QueryIntAttribute()
+    int QueryUnsignedAttribute( const char* name, unsigned int* _value ) const    { const XMLAttribute* a = FindAttribute( name ); if ( !a ) return XML_NO_ATTRIBUTE; return a->QueryUnsignedValue( _value ); }
+    /// See QueryIntAttribute()
+    int QueryBoolAttribute( const char* name, bool* _value ) const                { const XMLAttribute* a = FindAttribute( name ); if ( !a ) return XML_NO_ATTRIBUTE; return a->QueryBoolValue( _value ); }
+    /// See QueryIntAttribute()
+    int QueryDoubleAttribute( const char* name, double* _value ) const            { const XMLAttribute* a = FindAttribute( name ); if ( !a ) return XML_NO_ATTRIBUTE; return a->QueryDoubleValue( _value ); }
+    /// See QueryIntAttribute()
+    int QueryFloatAttribute( const char* name, float* _value ) const            { const XMLAttribute* a = FindAttribute( name ); if ( !a ) return XML_NO_ATTRIBUTE; return a->QueryFloatValue( _value ); }
+
+    /// Sets the named attribute to value.
+    void SetAttribute( const char* name, const char* _value )    { XMLAttribute* a = FindOrCreateAttribute( name ); a->SetAttribute( _value ); }
+    /// Sets the named attribute to value.
+    void SetAttribute( const char* name, int _value )            { XMLAttribute* a = FindOrCreateAttribute( name ); a->SetAttribute( _value ); }
+    /// Sets the named attribute to value.
+    void SetAttribute( const char* name, unsigned _value )        { XMLAttribute* a = FindOrCreateAttribute( name ); a->SetAttribute( _value ); }
+    /// Sets the named attribute to value.
+    void SetAttribute( const char* name, bool _value )            { XMLAttribute* a = FindOrCreateAttribute( name ); a->SetAttribute( _value ); }
+    /// Sets the named attribute to value.
+    void SetAttribute( const char* name, double _value )        { XMLAttribute* a = FindOrCreateAttribute( name ); a->SetAttribute( _value ); }
+    /// Sets the named attribute to value. Forwards to XMLAttribute::SetAttribute( float ),
+    /// so a float argument is no longer promoted to double on the way in; this
+    /// mirrors FloatAttribute() / QueryFloatAttribute() on the read side.
+    void SetAttribute( const char* name, float _value )            { XMLAttribute* a = FindOrCreateAttribute( name ); a->SetAttribute( _value ); }
+
+    /**
+        Delete an attribute.
+    */
+    void DeleteAttribute( const char* name );
+
+    /// Return the first attribute in the list.
+    const XMLAttribute* FirstAttribute() const { return rootAttribute; }
+    /// Query a specific attribute in the list.
+    const XMLAttribute* FindAttribute( const char* name ) const;
+
+    /** Convenience function for easy access to the text inside an element. Although easy
+        and concise, GetText() is limited compared to getting the XMLText child
+        and accessing it directly.
+
+        If the first child of 'this' is a XMLText, the GetText()
+        returns the character string of the Text node, else null is returned.
+
+        This is a convenient method for getting the text of simple contained text:
+        @verbatim
+        <foo>This is text</foo>
+            const char* str = fooElement->GetText();
+        @endverbatim
+
+        'str' will be a pointer to "This is text".
+
+        Note that this function can be misleading. If the element foo was created from
+        this XML:
+        @verbatim
+            <foo><b>This is text</b></foo>
+        @endverbatim
+
+        then the value of str would be null. The first child node isn't a text node, it is
+        another element. From this XML:
+        @verbatim
+            <foo>This is <b>text</b></foo>
+        @endverbatim
+        GetText() will return "This is ".
+    */
+    const char* GetText() const;
+
+    /**
+        Convenience method to query the value of a child text node. This is probably best
+        shown by example. Given you have a document in this form:
+        @verbatim
+            <point>
+                <x>1</x>
+                <y>1.4</y>
+            </point>
+        @endverbatim
+
+        The QueryIntText() and similar functions provide a safe and easier way to get to the
+        "value" of x and y.
+
+        @verbatim
+            int x = 0;
+            float y = 0;    // types of x and y are contrived for example
+            const XMLElement* xElement = pointElement->FirstChildElement( "x" );
+            const XMLElement* yElement = pointElement->FirstChildElement( "y" );
+            xElement->QueryIntText( &x );
+            yElement->QueryFloatText( &y );
+        @endverbatim
+
+        @returns XML_SUCCESS (0) on success, XML_CAN_NOT_CONVERT_TEXT if the text cannot be converted
+                 to the requested type, and XML_NO_TEXT_NODE if there is no child text to query.
+
+    */
+    int QueryIntText( int* _value ) const;
+    /// See QueryIntText()
+    int QueryUnsignedText( unsigned* _value ) const;
+    /// See QueryIntText()
+    int QueryBoolText( bool* _value ) const;
+    /// See QueryIntText()
+    int QueryDoubleText( double* _value ) const;
+    /// See QueryIntText()
+    int QueryFloatText( float* _value ) const;
+
+    // internal:
+    enum {
+        OPEN,        // <foo>
+        CLOSED,        // <foo/>
+        CLOSING        // </foo>
+    };
+    int ClosingType() const { return closingType; }
+    char* ParseDeep( char* p, StrPair* endTag );
+    virtual XMLNode* ShallowClone( XMLDocument* document ) const;
+    virtual bool ShallowEqual( const XMLNode* compare ) const;
+
+private:
+    XMLElement( XMLDocument* doc );
+    virtual ~XMLElement();
+    XMLElement( const XMLElement& );    // not supported
+    void operator=( const XMLElement& );    // not supported
+
+    XMLAttribute* FindAttribute( const char* name );
+    XMLAttribute* FindOrCreateAttribute( const char* name );
+    //void LinkAttribute( XMLAttribute* attrib );
+    char* ParseAttributes( char* p );
+
+    // One of OPEN / CLOSED / CLOSING; reported by ClosingType().
+    int closingType;
+    // The attribute list is ordered; there is no 'lastAttribute'
+    // because the list needs to be scanned for dupes before adding
+    // a new attribute.
+    XMLAttribute* rootAttribute;
+};
+
+
+/** A Document binds together all the functionality.
+    It can be saved, loaded, and printed to the screen.
+    All Nodes are connected and allocated to a Document.
+    If the Document is deleted, all its Nodes are also deleted.
+*/
+class XMLDocument : public XMLNode
+{
+    friend class XMLElement;
+public:
+    /// constructor
+    XMLDocument( bool processEntities = true );
+    ~XMLDocument();
+
+    virtual XMLDocument* ToDocument()                { return this; }
+    virtual const XMLDocument* ToDocument() const    { return this; }
+
+    /**
+        Parse an XML file from a character string.
+        Returns XML_NO_ERROR (0) on success, or
+        an errorID.
+    */
+    int Parse( const char* xml );
+
+    /**
+        Load an XML file from disk.
+        Returns XML_NO_ERROR (0) on success, or
+        an errorID.
+    */
+    int LoadFile( const char* filename );
+
+    /**
+        Load an XML file from disk. You are responsible
+        for providing and closing the FILE*.
+
+        Returns XML_NO_ERROR (0) on success, or
+        an errorID.
+    */
+    int LoadFile( FILE* );
+
+    /**
+        Save the XML file to disk.
+        Returns XML_NO_ERROR (0) on success, or
+        an errorID.
+    */
+    int SaveFile( const char* filename );
+
+    /**
+        Save the XML file to disk. You are responsible
+        for providing and closing the FILE*.
+
+        Returns XML_NO_ERROR (0) on success, or
+        an errorID.
+    */
+    int SaveFile( FILE* );
+
+    /// Returns the processEntities flag held by this document.
+    bool ProcessEntities() const                        { return processEntities; }
+
+    /**
+        Returns true if this document has a leading Byte Order Mark of UTF8.
+    */
+    bool HasBOM() const { return writeBOM; }
+    /** Sets whether to write the BOM when writing the file.
+    */
+    void SetBOM( bool useBOM ) { writeBOM = useBOM; }
+
+    /** Return the root element of DOM. Equivalent to FirstChildElement().
+        To get the first node, use FirstChild().
+    */
+    XMLElement* RootElement()                { return FirstChildElement(); }
+    const XMLElement* RootElement() const    { return FirstChildElement(); }
+
+    /** Print the Document. If the Printer is not provided, it will
+        print to stdout. If you provide Printer, this can print to a file:
+        @verbatim
+        XMLPrinter printer( fp );
+        doc.Print( &printer );
+        @endverbatim
+
+        Or you can use a printer to print to memory:
+        @verbatim
+        XMLPrinter printer;
+        doc.Print( &printer );
+        // printer.CStr() has a const char* to the XML
+        @endverbatim
+    */
+    void Print( XMLPrinter* streamer=0 );
+    virtual bool Accept( XMLVisitor* visitor ) const;
+
+    /**
+        Create a new Element associated with
+        this Document. The memory for the Element
+        is managed by the Document.
+    */
+    XMLElement* NewElement( const char* name );
+    /**
+        Create a new Comment associated with
+        this Document. The memory for the Comment
+        is managed by the Document.
+    */
+    XMLComment* NewComment( const char* comment );
+    /**
+        Create a new Text associated with
+        this Document. The memory for the Text
+        is managed by the Document.
+    */
+    XMLText* NewText( const char* text );
+    /**
+        Create a new Declaration associated with
+        this Document. The memory for the object
+        is managed by the Document.
+
+        If the 'text' param is null, the standard
+        declaration is used.:
+        @verbatim
+            <?xml version="1.0" encoding="UTF-8"?>
+        @endverbatim
+    */
+    XMLDeclaration* NewDeclaration( const char* text=0 );
+    /**
+        Create a new Unknown associated with
+        this Document. The memory for the object
+        is managed by the Document.
+    */
+    XMLUnknown* NewUnknown( const char* text );
+
+    /**
+        Delete a node associated with this document.
+        It will be unlinked from the DOM.
+
+        The deletion is carried out by the node's parent. A null pointer, or a
+        node that has no parent (for example the document itself, or a node
+        that is not currently linked under a parent), is left untouched
+        instead of dereferencing a null parent pointer.
+    */
+    void DeleteNode( XMLNode* node )
+    {
+        if ( node && node->parent ) {
+            node->parent->DeleteChild( node );
+        }
+    }
+
+    void SetError( int error, const char* str1, const char* str2 );
+
+    /// Return true if there was an error parsing the document.
+    bool Error() const { return errorID != XML_NO_ERROR; }
+    /// Return the errorID.
+    int  ErrorID() const { return errorID; }
+    /// Return a possibly helpful diagnostic location or string.
+    const char* GetErrorStr1() const { return errorStr1; }
+    /// Return a possibly helpful secondary diagnostic location or string.
+    const char* GetErrorStr2() const { return errorStr2; }
+    /// If there is an error, print it to stdout.
+    void PrintError() const;
+
+    // internal
+    char* Identify( char* p, XMLNode** node );
+
+    /// A document cannot be cloned; always returns null.
+    virtual XMLNode* ShallowClone( XMLDocument* /*document*/ ) const    { return 0; }
+    /// A document never compares equal; always returns false.
+    virtual bool ShallowEqual( const XMLNode* /*compare*/ ) const    { return false; }
+
+private:
+    XMLDocument( const XMLDocument& );    // not supported
+    void operator=( const XMLDocument& );    // not supported
+    void InitDocument();
+
+    bool writeBOM;              // read by HasBOM(), written by SetBOM()
+    bool processEntities;       // read by ProcessEntities()
+    int errorID;                // XML_NO_ERROR when Error() is false
+    const char* errorStr1;      // returned by GetErrorStr1()
+    const char* errorStr2;      // returned by GetErrorStr2()
+    char* charBuffer;
+
+    // Fixed-size pools, one per object size listed.
+    MemPoolT< sizeof(XMLElement) >    elementPool;
+    MemPoolT< sizeof(XMLAttribute) > attributePool;
+    MemPoolT< sizeof(XMLText) >        textPool;
+    MemPoolT< sizeof(XMLComment) >    commentPool;
+};
+
+
+/**
+    A XMLHandle is a class that wraps a node pointer with null checks; this is
+    an incredibly useful thing. Note that XMLHandle is not part of the TinyXML
+    DOM structure. It is a separate utility class.
+
+    Take an example:
+    @verbatim
+    <Document>
+        <Element attributeA = "valueA">
+            <Child attributeB = "value1" />
+            <Child attributeB = "value2" />
+        </Element>
+    </Document>
+    @endverbatim
+
+    Assuming you want the value of "attributeB" in the 2nd "Child" element, it's very
+    easy to write a *lot* of code that looks like:
+
+    @verbatim
+    XMLElement* root = document.FirstChildElement( "Document" );
+    if ( root )
+    {
+        XMLElement* element = root->FirstChildElement( "Element" );
+        if ( element )
+        {
+            XMLElement* child = element->FirstChildElement( "Child" );
+            if ( child )
+            {
+                XMLElement* child2 = child->NextSiblingElement( "Child" );
+                if ( child2 )
+                {
+                    // Finally do something useful.
+    @endverbatim
+
+    And that doesn't even cover "else" cases. XMLHandle addresses the verbosity
+    of such code. An XMLHandle checks for null pointers so it is perfectly safe
+    and correct to use:
+
+    @verbatim
+    XMLHandle docHandle( &document );
+    XMLElement* child2 = docHandle.FirstChild( "Document" ).FirstChild( "Element" ).FirstChild().NextSibling().ToElement();
+    if ( child2 )
+    {
+        // do something useful
+    @endverbatim
+
+    Which is MUCH more concise and useful.
+
+    It is also safe to copy handles - internally they are nothing more than node pointers.
+    @verbatim
+    XMLHandle handleCopy = handle;
+    @endverbatim
+
+    See also XMLConstHandle, which is the same as XMLHandle, but operates on const objects.
+*/
+class XMLHandle
+{
+public:
+    /// Wrap a node found at any depth of the tree. A null pointer is accepted.
+    XMLHandle( XMLNode* _node ) : node( _node )                                {}
+    /// Wrap a node passed by reference.
+    XMLHandle( XMLNode& _node ) : node( &_node )                               {}
+    /// Copy constructor: the new handle refers to the same node as ref.
+    XMLHandle( const XMLHandle& ref ) : node( ref.node )                       {}
+    /// Assignment: this handle now refers to the same node as ref.
+    XMLHandle& operator=( const XMLHandle& ref )                               { node = ref.node; return *this; }
+
+    /// Handle to the first child. A null handle yields a null handle.
+    XMLHandle FirstChild()                                                     { if ( !node ) { return *this; } return XMLHandle( node->FirstChild() ); }
+    /// Handle to the first child element. A null handle yields a null handle.
+    XMLHandle FirstChildElement( const char* value=0 )                         { if ( !node ) { return *this; } return XMLHandle( node->FirstChildElement( value ) ); }
+    /// Handle to the last child. A null handle yields a null handle.
+    XMLHandle LastChild()                                                      { if ( !node ) { return *this; } return XMLHandle( node->LastChild() ); }
+    /// Handle to the last child element. A null handle yields a null handle.
+    XMLHandle LastChildElement( const char* _value=0 )                         { if ( !node ) { return *this; } return XMLHandle( node->LastChildElement( _value ) ); }
+    /// Handle to the previous sibling. A null handle yields a null handle.
+    XMLHandle PreviousSibling()                                                { if ( !node ) { return *this; } return XMLHandle( node->PreviousSibling() ); }
+    /// Handle to the previous sibling element. A null handle yields a null handle.
+    XMLHandle PreviousSiblingElement( const char* _value=0 )                   { if ( !node ) { return *this; } return XMLHandle( node->PreviousSiblingElement( _value ) ); }
+    /// Handle to the next sibling. A null handle yields a null handle.
+    XMLHandle NextSibling()                                                    { if ( !node ) { return *this; } return XMLHandle( node->NextSibling() ); }
+    /// Handle to the next sibling element. A null handle yields a null handle.
+    XMLHandle NextSiblingElement( const char* _value=0 )                       { if ( !node ) { return *this; } return XMLHandle( node->NextSiblingElement( _value ) ); }
+
+    /// The wrapped XMLNode. This can return null.
+    XMLNode* ToNode()                            { return node; }
+    /// Safe cast to XMLElement. This can return null.
+    XMLElement* ToElement()                      { return node ? node->ToElement() : 0; }
+    /// Safe cast to XMLText. This can return null.
+    XMLText* ToText()                            { return node ? node->ToText() : 0; }
+    /// Safe cast to XMLUnknown. This can return null.
+    XMLUnknown* ToUnknown()                      { return node ? node->ToUnknown() : 0; }
+    /// Safe cast to XMLDeclaration. This can return null.
+    XMLDeclaration* ToDeclaration()              { return node ? node->ToDeclaration() : 0; }
+
+private:
+    XMLNode* node;
+};
+
+
+/**
+    A variant of the XMLHandle class for working with const XMLNodes and Documents. It is the
+    same in all regards, except for the 'const' qualifiers. See XMLHandle for API.
+*/
+class XMLConstHandle
+{
+public:
+    XMLConstHandle( const XMLNode* _node ) : node( _node )                            {}
+    XMLConstHandle( const XMLNode& _node ) : node( &_node )                           {}
+    XMLConstHandle( const XMLConstHandle& ref ) : node( ref.node )                    {}
+
+    XMLConstHandle& operator=( const XMLConstHandle& ref )                            { node = ref.node; return *this; }
+    // Navigation: a null handle always yields another null handle.
+    const XMLConstHandle FirstChild() const                                           { if ( !node ) { return *this; } return XMLConstHandle( node->FirstChild() ); }
+    const XMLConstHandle FirstChildElement( const char* value=0 ) const               { if ( !node ) { return *this; } return XMLConstHandle( node->FirstChildElement( value ) ); }
+    const XMLConstHandle LastChild()    const                                         { if ( !node ) { return *this; } return XMLConstHandle( node->LastChild() ); }
+    const XMLConstHandle LastChildElement( const char* _value=0 ) const               { if ( !node ) { return *this; } return XMLConstHandle( node->LastChildElement( _value ) ); }
+    const XMLConstHandle PreviousSibling() const                                      { if ( !node ) { return *this; } return XMLConstHandle( node->PreviousSibling() ); }
+    const XMLConstHandle PreviousSiblingElement( const char* _value=0 ) const         { if ( !node ) { return *this; } return XMLConstHandle( node->PreviousSiblingElement( _value ) ); }
+    const XMLConstHandle NextSibling() const                                          { if ( !node ) { return *this; } return XMLConstHandle( node->NextSibling() ); }
+    const XMLConstHandle NextSiblingElement( const char* _value=0 ) const             { if ( !node ) { return *this; } return XMLConstHandle( node->NextSiblingElement( _value ) ); }
+
+    // Casts: a null handle always yields a null pointer.
+    const XMLNode* ToNode() const                { return node; }
+    const XMLElement* ToElement() const          { return node ? node->ToElement() : 0; }
+    const XMLText* ToText() const                { return node ? node->ToText() : 0; }
+    const XMLUnknown* ToUnknown() const          { return node ? node->ToUnknown() : 0; }
+    const XMLDeclaration* ToDeclaration() const  { return node ? node->ToDeclaration() : 0; }
+
+private:
+    const XMLNode* node;
+};
+
+
+/**
+    Printing functionality. The XMLPrinter gives you more
+    options than the XMLDocument::Print() method.
+
+    It can:
+    -# Print to memory.
+    -# Print to a file you provide.
+    -# Print XML without a XMLDocument.
+
+    Print to Memory
+
+    @verbatim
+    XMLPrinter printer;
+    doc->Print( &printer );
+    SomeFunction( printer.CStr() );
+    @endverbatim
+
+    Print to a File
+
+    You provide the file pointer.
+    @verbatim
+    XMLPrinter printer( fp );
+    doc.Print( &printer );
+    @endverbatim
+
+    Print without a XMLDocument
+
+    When loading, an XML parser is very useful. However, sometimes
+    when saving, it just gets in the way. The code is often set up
+    for streaming, and constructing the DOM is just overhead.
+
+    The Printer supports the streaming case. The following code
+    prints out a trivially simple XML file without ever creating
+    an XML document.
+
+    @verbatim
+    XMLPrinter printer( fp );
+    printer.OpenElement( "foo" );
+    printer.PushAttribute( "foo", "bar" );
+    printer.CloseElement();
+    @endverbatim
+*/
+class XMLPrinter : public XMLVisitor
+{
+public:
+    /** Construct the printer. If 'file' is non-null, output is
+        written to that FILE. Otherwise output is accumulated in
+        memory, and the result is available through CStr() and
+        CStrSize(). If 'compact' is set to true, then output is
+        created with only required whitespace and newlines.
+    */
+    XMLPrinter( FILE* file=0, bool compact = false );
+    ~XMLPrinter()    {}
+
+    /** If streaming, write the BOM and/or the declaration, as selected by the two flags. */
+    void PushHeader( bool writeBOM, bool writeDeclaration );
+    /** If streaming, start writing an element.
+        The element must be closed with CloseElement().
+    */
+    void OpenElement( const char* name );
+    /// If streaming, add an attribute to an open element (one overload per value type).
+    void PushAttribute( const char* name, const char* value );
+    void PushAttribute( const char* name, int value );
+    void PushAttribute( const char* name, unsigned value );
+    void PushAttribute( const char* name, bool value );
+    void PushAttribute( const char* name, double value );
+    /// If streaming, close the element started by the matching OpenElement().
+    void CloseElement();
+
+    /// Add a text node. NOTE(review): 'cdata' presumably selects CDATA output - confirm in tinyxml2.cpp.
+    void PushText( const char* text, bool cdata=false );
+    /// Add a text node from an integer.
+    void PushText( int value );
+    /// Add a text node from an unsigned.
+    void PushText( unsigned value );
+    /// Add a text node from a bool.
+    void PushText( bool value );
+    /// Add a text node from a float.
+    void PushText( float value );
+    /// Add a text node from a double.
+    void PushText( double value );
+
+    /// Add a comment.
+    void PushComment( const char* comment );
+
+    void PushDeclaration( const char* value );
+    void PushUnknown( const char* value );
+    // Visitor callbacks; XMLPrinter derives from XMLVisitor. VisitExit for a document just returns true.
+    virtual bool VisitEnter( const XMLDocument& /*doc*/ );
+    virtual bool VisitExit( const XMLDocument& /*doc*/ )            { return true; }
+
+    virtual bool VisitEnter( const XMLElement& element, const XMLAttribute* attribute );
+    virtual bool VisitExit( const XMLElement& element );
+
+    virtual bool Visit( const XMLText& text );
+    virtual bool Visit( const XMLComment& comment );
+    virtual bool Visit( const XMLDeclaration& declaration );
+    virtual bool Visit( const XMLUnknown& unknown );
+
+    /**
+        If in print to memory mode, return a pointer to
+        the XML file in memory (the start of 'buffer').
+    */
+    const char* CStr() const { return buffer.Mem(); }
+    /**
+        If in print to memory mode, return the size
+        of the XML file in memory. (Note the size returned
+        includes the terminating null.)
+    */
+      int CStrSize() const { return buffer.Size(); }
+
+private:
+    void SealElement();
+    void PrintSpace( int depth );
+    void PrintString( const char*, bool restrictedEntitySet );    // prints out, after detecting entities.
+    void Print( const char* format, ... );
+
+    bool elementJustOpened;
+    bool firstElement;
+    FILE* fp;
+    int depth;
+    int textDepth;
+    bool processEntities;
+    bool compactMode;
+
+    enum {
+        ENTITY_RANGE = 64,    // length of the two entity flag tables below
+        BUF_SIZE = 200
+    };
+    bool entityFlag[ENTITY_RANGE];
+    bool restrictedEntityFlag[ENTITY_RANGE];
+
+    DynArray< const char*, 10 > stack;
+    DynArray< char, 20 > buffer;    // in-memory output, exposed through CStr() / CStrSize()
+#ifdef _MSC_VER
+    DynArray< char, 20 > accumulator;    // only declared for MSVC builds
+#endif
+};
+
+
+}    // tinyxml2
+
+
+#endif // TINYXML2_INCLUDED
diff --git a/patch/llvm/llvm-3.5.patch b/patch/llvm/llvm-3.5.patch
new file mode 100644
index 0000000..8ade786
--- /dev/null
+++ b/patch/llvm/llvm-3.5.patch
@@ -0,0 +1,864 @@
+diff --git a/include/llvm/MC/MCInst.h b/include/llvm/MC/MCInst.h
+index 6918280..8883165 100644
+--- a/include/llvm/MC/MCInst.h
++++ b/include/llvm/MC/MCInst.h
+@@ -20,6 +20,7 @@
+ #include "llvm/ADT/StringRef.h"
+ #include "llvm/Support/DataTypes.h"
+ #include "llvm/Support/SMLoc.h"
++#include "llvm/IR/DebugLoc.h"
+ 
+ namespace llvm {
+ class raw_ostream;
+@@ -151,6 +152,7 @@ class MCInst {
+   unsigned Opcode;
+   SMLoc Loc;
+   SmallVector<MCOperand, 8> Operands;
++  DebugLoc DbgLoc;
+ public:
+   MCInst() : Opcode(0) {}
+ 
+@@ -160,6 +162,9 @@ public:
+   void setLoc(SMLoc loc) { Loc = loc; }
+   SMLoc getLoc() const { return Loc; }
+ 
++  void setDebugLoc(const DebugLoc &Loc) { DbgLoc = Loc; } // copies Loc
++  DebugLoc getDebugLoc() const { return DbgLoc; }
++
+   const MCOperand &getOperand(unsigned i) const { return Operands[i]; }
+   MCOperand &getOperand(unsigned i) { return Operands[i]; }
+   unsigned getNumOperands() const { return Operands.size(); }
+diff --git a/include/llvm/Target/TargetRegisterInfo.h b/include/llvm/Target/TargetRegisterInfo.h
+index 5dda8bd..0bbd7fb 100644
+--- a/include/llvm/Target/TargetRegisterInfo.h
++++ b/include/llvm/Target/TargetRegisterInfo.h
+@@ -238,6 +238,8 @@ protected:
+   virtual ~TargetRegisterInfo();
+ public:
+ 
++  std::vector<unsigned> HQEMUReservedRegs;
++
+   // Register numbers can represent physical registers, virtual registers, and
+   // sometimes stack slots. The unsigned values are divided into these ranges:
+   //
+@@ -452,6 +454,10 @@ public:
+   /// used by register scavenger to determine what registers are free.
+   virtual BitVector getReservedRegs(const MachineFunction &MF) const = 0;
+ 
++  /// HQEMU hooks to get/set extra reserved register(s); defaults do nothing.
++  virtual void getHQEMUReservedRegs(BitVector &Reserved) const { }
++  virtual void setHQEMUReservedRegs(std::string RegName) { }
++
+   /// getMatchingSuperReg - Return a super-register of the specified register
+   /// Reg so its sub-register of index SubIdx is Reg.
+   unsigned getMatchingSuperReg(unsigned Reg, unsigned SubIdx,
+diff --git a/lib/ExecutionEngine/JIT/JITEmitter.cpp b/lib/ExecutionEngine/JIT/JITEmitter.cpp
+index 2ba1f86..f727dd6 100644
+--- a/lib/ExecutionEngine/JIT/JITEmitter.cpp
++++ b/lib/ExecutionEngine/JIT/JITEmitter.cpp
+@@ -365,7 +365,10 @@ namespace {
+ 
+     }
+     ~JITEmitter() {
++#if 0
++      // HQEMU has the ownership of the memory manager. Do not delete it.
+       delete MemMgr;
++#endif
+     }
+ 
+     JITResolver &getJITResolver() { return Resolver; }
+diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.h b/lib/ExecutionEngine/MCJIT/MCJIT.h
+index 100e9a2..fc9fcfc 100644
+--- a/lib/ExecutionEngine/MCJIT/MCJIT.h
++++ b/lib/ExecutionEngine/MCJIT/MCJIT.h
+@@ -77,7 +77,11 @@ public:
+ 
+ private:
+   MCJIT *ParentEngine;
++#if 0
+   std::unique_ptr<RTDyldMemoryManager> ClientMM;
++#endif
++  // HQEMU has the ownership of the memory manager. Do not delete it.
++  RTDyldMemoryManager *ClientMM;
+ };
+ 
+ // About Module states: added->loaded->finalized.
+diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+index 32b5f4a..bb873a9 100644
+--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
++++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+@@ -149,9 +149,39 @@ getReservedRegs(const MachineFunction &MF) const {
+     for (MCSubRegIterator SI(*I, this); SI.isValid(); ++SI)
+       if (Reserved.test(*SI)) Reserved.set(*I);
+ 
++  getHQEMUReservedRegs(Reserved);
+   return Reserved;
+ }
+ 
++void ARMBaseRegisterInfo::getHQEMUReservedRegs(BitVector &Reserved) const {
++  for (unsigned Reg : HQEMUReservedRegs) // mark every HQEMU-recorded register
++    Reserved.set(Reg);
++}
++
++void ARMBaseRegisterInfo::setHQEMUReservedRegs(std::string RegName) {
++  // Names HQEMU may reserve (r0..r12) paired with their ARM register enums.
++  // The enum values are looked up rather than computed from the index, so
++  // no assumption is made about how ARM::R0..ARM::R12 are numbered.
++  static const struct {
++    const char *Name;
++    unsigned Reg;
++  } Candidates[] = {
++    {"r0", ARM::R0},   {"r1", ARM::R1},   {"r2", ARM::R2},
++    {"r3", ARM::R3},   {"r4", ARM::R4},   {"r5", ARM::R5},
++    {"r6", ARM::R6},   {"r7", ARM::R7},   {"r8", ARM::R8},
++    {"r9", ARM::R9},   {"r10", ARM::R10}, {"r11", ARM::R11},
++    {"r12", ARM::R12},
++  };
++
++  // Record the matching register, if any; every other name is ignored.
++  for (const auto &C : Candidates) {
++    if (RegName == C.Name) {
++      HQEMUReservedRegs.push_back(C.Reg);
++      return;
++    }
++  }
++}
++
+ const TargetRegisterClass*
+ ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC)
+                                                                          const {
+diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h
+index 833d3f2..fdcc6be 100644
+--- a/lib/Target/ARM/ARMBaseRegisterInfo.h
++++ b/lib/Target/ARM/ARMBaseRegisterInfo.h
+@@ -117,6 +117,9 @@ public:
+ 
+   BitVector getReservedRegs(const MachineFunction &MF) const override;
+ 
++  void getHQEMUReservedRegs(BitVector &Reserved) const override;
++  void setHQEMUReservedRegs(std::string RegName) override;
++
+   const TargetRegisterClass *
+   getPointerRegClass(const MachineFunction &MF,
+                      unsigned Kind = 0) const override;
+diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+index 075db11..8b469c5 100644
+--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
++++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+@@ -164,6 +164,9 @@ public:
+                         const MCInst &MI, const MCInstrDesc &Desc,
+                         const MCSubtargetInfo &STI,
+                         raw_ostream &OS) const;
++
++  bool EmitHQEMUInstruction(const MCInst &MI, raw_ostream &OS,
++                            SmallVectorImpl<MCFixup> &Fixups) const;
+ };
+ 
+ } // end anonymous namespace
+@@ -1151,6 +1154,50 @@ void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
+   }
+ }
+ 
++// HQEMU hook: if MI is a TRAP or RETQ tagged with an HQEMU patch type,
++// emit a run of NOP bytes in its place and return true. Return false,
++// emitting nothing, for every other instruction.
++bool X86MCCodeEmitter::
++EmitHQEMUInstruction(const MCInst &MI, raw_ostream &OS,
++                     SmallVectorImpl<MCFixup> &Fixups) const {
++  /* NOTE: keep these values synchronized with the ones defined in file
++   *       llvm-opc.h of the HQEMU source tree. */
++  enum {
++    PATCH_HQEMU = 0x4182U,
++    PATCH_DUMMY,
++    PATCH_EXIT_TB,
++    PATCH_DIRECT_JUMP,
++    PATCH_TRACE_BLOCK_CHAINING,
++    PATCH_QMMU
++  };
++
++  // Only TRAP and RETQ are candidates; each one is replaced by a
++  // fixed-length run of 0x90 bytes (8 for TRAP, 5 for RETQ).
++  unsigned NumNops;
++  switch (MI.getOpcode()) {
++  case X86::TRAP: NumNops = 8; break;
++  case X86::RETQ: NumNops = 5; break;
++  default:        return false;
++  }
++
++  // The patch type is read from the line number of the debug location.
++  DebugLoc Loc = MI.getDebugLoc();
++  if (Loc.isUnknown())
++    return false;
++
++  unsigned PatchType = Loc.getLine();
++  bool IsHQEMUPatch =
++      PatchType >= PATCH_HQEMU && PatchType <= PATCH_QMMU;
++  if (!IsHQEMUPatch)
++    return false;
++
++  // Returning true makes EncodeInstruction stop right after this hook.
++  unsigned CurByte = 0;
++  for (unsigned Emitted = 0; Emitted != NumNops; ++Emitted)
++    EmitByte(0x90, CurByte, OS);
++  return true;
++}
++
+ void X86MCCodeEmitter::
+ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                   SmallVectorImpl<MCFixup> &Fixups,
+@@ -1159,6 +1206,9 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+   const MCInstrDesc &Desc = MCII.get(Opcode);
+   uint64_t TSFlags = Desc.TSFlags;
+ 
++  if (EmitHQEMUInstruction(MI, OS, Fixups))
++    return;
++
+   // Pseudo instructions don't get encoded.
+   if ((TSFlags & X86II::FormMask) == X86II::Pseudo)
+     return;
+diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp
+index a3ae7ee..1555712 100644
+--- a/lib/Target/X86/X86CodeEmitter.cpp
++++ b/lib/Target/X86/X86CodeEmitter.cpp
+@@ -105,6 +105,8 @@ namespace {
+     void emitMemModRMByte(const MachineInstr &MI,
+                           unsigned Op, unsigned RegOpcodeField,
+                           intptr_t PCAdj = 0);
++    void emitQMMU(MachineInstr &MI, const MCInstrDesc *Desc);
++    bool emitHQEMUInstruction(MachineInstr &MI, const MCInstrDesc *Desc);
+ 
+     unsigned getX86RegNum(unsigned RegNo) const {
+       const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+@@ -113,6 +115,13 @@ namespace {
+ 
+     unsigned char getVEXRegisterEncoding(const MachineInstr &MI,
+                                          unsigned OpNum) const;
++    unsigned char getWriteMaskRegisterEncoding(const MachineInstr &MI,
++                                               unsigned OpNum) const {
++      // K0 is rejected as a write-mask (debug builds assert on it).
++      unsigned MaskReg = MI.getOperand(OpNum).getReg();
++      assert(X86::K0 != MaskReg && "Invalid mask register as write-mask!");
++      return getX86RegNum(MaskReg);
++    }
+   };
+ 
+ template<class CodeEmitter>
+@@ -748,9 +757,11 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
+                                                const MCInstrDesc *Desc) const {
+   unsigned char Encoding = (TSFlags & X86II::EncodingMask) >>
+                            X86II::EncodingShift;
++  bool HasEVEX_K = ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K);
+   bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
+   bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3;
+   bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
++  bool HasEVEX_RC = (TSFlags >> X86II::VEXShift) & X86II::EVEX_RC;
+ 
+   // VEX_R: opcode externsion equivalent to REX.R in
+   // 1's complement (inverted) form
+@@ -759,6 +770,7 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
+   //  0: Same as REX_R=1 (64 bit mode only)
+   //
+   unsigned char VEX_R = 0x1;
++  unsigned char EVEX_R2 = 0x1;
+ 
+   // VEX_X: equivalent to REX.X, only used when a
+   // register is used for index in SIB Byte.
+@@ -793,6 +805,7 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
+   // VEX_4V (VEX vvvv field): a register specifier
+   // (in 1's complement form) or 1111 if unused.
+   unsigned char VEX_4V = 0xf;
++  unsigned char EVEX_V2 = 0x1;
+ 
+   // VEX_L (Vector Length):
+   //
+@@ -800,6 +813,7 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
+   //  1: 256-bit vector
+   //
+   unsigned char VEX_L = 0;
++  unsigned char EVEX_L2 = 0;
+ 
+   // VEX_PP: opcode extension providing equivalent
+   // functionality of a SIMD prefix
+@@ -811,11 +825,36 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
+   //
+   unsigned char VEX_PP = 0;
+ 
++  // EVEX_U
++  unsigned char EVEX_U = 1; // Always '1' so far
++
++  // EVEX_z
++  unsigned char EVEX_z = 0;
++
++  // EVEX_b
++  unsigned char EVEX_b = 0;
++
++  // EVEX_rc
++  unsigned char EVEX_rc = 0;
++
++  // EVEX_aaa
++  unsigned char EVEX_aaa = 0;
++
++  bool EncodeRC = false;
++
+   if ((TSFlags >> X86II::VEXShift) & X86II::VEX_W)
+     VEX_W = 1;
+ 
+   if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L)
+     VEX_L = 1;
++  if (((TSFlags >> X86II::VEXShift) & X86II::EVEX_L2))
++    EVEX_L2 = 1;
++
++  if (HasEVEX_K && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_Z))
++    EVEX_z = 1;
++
++  if (((TSFlags >> X86II::VEXShift) & X86II::EVEX_B))
++    EVEX_b = 1;
+ 
+   switch (TSFlags & X86II::OpPrefixMask) {
+   default: break; // VEX_PP already correct
+@@ -836,15 +875,7 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
+ 
+   // Classify VEX_B, VEX_4V, VEX_R, VEX_X
+   unsigned NumOps = Desc->getNumOperands();
+-  unsigned CurOp = 0;
+-  if (NumOps > 1 && Desc->getOperandConstraint(1, MCOI::TIED_TO) == 0)
+-    ++CurOp;
+-  else if (NumOps > 3 && Desc->getOperandConstraint(2, MCOI::TIED_TO) == 0) {
+-    assert(Desc->getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1);
+-    // Special case for GATHER with 2 TIED_TO operands
+-    // Skip the first 2 operands: dst, mask_wb
+-    CurOp += 2;
+-  }
++  unsigned CurOp = X86II::getOperandBias(*Desc);
+ 
+   switch (TSFlags & X86II::FormMask) {
+     default: llvm_unreachable("Unexpected form in emitVEXOpcodePrefix!");
+@@ -860,14 +891,28 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
+         VEX_B = 0x0;
+       if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrIndexReg).getReg()))
+         VEX_X = 0x0;
++      if (X86II::is32ExtendedReg(MI.getOperand(X86::AddrIndexReg).getReg()))
++        EVEX_V2 = 0x0;
+ 
+       CurOp = X86::AddrNumOperands;
+-      if (HasVEX_4V)
+-        VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
++
++      if (HasEVEX_K)
++        EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
++
++      if (HasVEX_4V) {
++        VEX_4V = getVEXRegisterEncoding(MI, CurOp);
++        if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
++          EVEX_V2 = 0x0;
++        CurOp++;
++      }
+ 
+       const MachineOperand &MO = MI.getOperand(CurOp);
+-      if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg()))
+-        VEX_R = 0x0;
++      if (MO.isReg()) {
++        if (X86II::isX86_64ExtendedReg(MO.getReg()))
++          VEX_R = 0x0;
++        if (X86II::is32ExtendedReg(MO.getReg()))
++          EVEX_R2 = 0x0;
++      }
+       break;
+     }
+     case X86II::MRMSrcMem:
+@@ -882,10 +927,17 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
+       //  dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
+       if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+         VEX_R = 0x0;
++      if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
++        EVEX_R2 = 0x0;
+       CurOp++;
+ 
++      if (HasEVEX_K)
++        EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
++
+       if (HasVEX_4V) {
+         VEX_4V = getVEXRegisterEncoding(MI, CurOp);
++        if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
++          EVEX_V2 = 0x0;
+         CurOp++;
+       }
+ 
+@@ -896,6 +948,10 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
+                           MI.getOperand(MemOperand+X86::AddrIndexReg).getReg()))
+         VEX_X = 0x0;
+ 
++      if (X86II::is32ExtendedReg(
++                          MI.getOperand(MemOperand+X86::AddrIndexReg).getReg()))
++        EVEX_V2 = 0x0;
++
+       if (HasVEX_4VOp3)
+         VEX_4V = getVEXRegisterEncoding(MI, CurOp+X86::AddrNumOperands);
+       break;
+@@ -906,8 +962,15 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
+       // MRM[0-9]m instructions forms:
+       //  MemAddr
+       //  src1(VEX_4V), MemAddr
+-      if (HasVEX_4V)
+-        VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
++      if (HasVEX_4V) {
++        VEX_4V = getVEXRegisterEncoding(MI, CurOp);
++        if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
++          EVEX_V2 = 0x0;
++        CurOp++;
++      }
++
++      if (HasEVEX_K)
++        EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
+ 
+       if (X86II::isX86_64ExtendedReg(
+                           MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
+@@ -925,19 +988,38 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
+       //
+       if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+         VEX_R = 0x0;
++      if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
++        EVEX_R2 = 0x0;
+       CurOp++;
+ 
+-      if (HasVEX_4V)
+-        VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
++      if (HasEVEX_K)
++        EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
++
++      if (HasVEX_4V) {
++        VEX_4V = getVEXRegisterEncoding(MI, CurOp);
++        if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
++          EVEX_V2 = 0x0;
++        CurOp++;
++      }
+ 
+       if (HasMemOp4) // Skip second register source (encoded in I8IMM)
+         CurOp++;
+ 
+       if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+         VEX_B = 0x0;
++      if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
++        VEX_X = 0x0;
+       CurOp++;
+       if (HasVEX_4VOp3)
+         VEX_4V = getVEXRegisterEncoding(MI, CurOp);
++      if (EVEX_b) {
++        if (HasEVEX_RC) {
++          unsigned RcOperand = NumOps-1;
++          assert(RcOperand >= CurOp);
++          EVEX_rc = MI.getOperand(RcOperand).getImm() & 0x3;
++        }
++        EncodeRC = true;
++      }
+       break;
+     case X86II::MRMDestReg:
+       // MRMDestReg instructions forms:
+@@ -946,13 +1028,26 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
+       //  dst(ModR/M), src1(VEX_4V), src2(ModR/M)
+       if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+         VEX_B = 0x0;
++      if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
++        VEX_X = 0x0;
+       CurOp++;
+ 
+-      if (HasVEX_4V)
+-        VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
++      if (HasEVEX_K)
++        EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
++
++      if (HasVEX_4V) {
++        VEX_4V = getVEXRegisterEncoding(MI, CurOp);
++        if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
++          EVEX_V2 = 0x0;
++        CurOp++;
++      }
+ 
+       if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+         VEX_R = 0x0;
++      if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
++        EVEX_R2 = 0x0;
++      if (EVEX_b)
++        EncodeRC = true;
+       break;
+     case X86II::MRM0r: case X86II::MRM1r:
+     case X86II::MRM2r: case X86II::MRM3r:
+@@ -960,45 +1055,190 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
+     case X86II::MRM6r: case X86II::MRM7r:
+       // MRM0r-MRM7r instructions forms:
+       //  dst(VEX_4V), src(ModR/M), imm8
+-      VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+-      CurOp++;
++      if (HasVEX_4V) {
++        VEX_4V = getVEXRegisterEncoding(MI, CurOp);
++        if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
++          EVEX_V2 = 0x0;
++        CurOp++;
++      }
++      if (HasEVEX_K)
++        EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
+ 
+       if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+         VEX_B = 0x0;
++      if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
++        VEX_X = 0x0;
+       break;
+   }
+ 
+-  // Emit segment override opcode prefix as needed.
+-  emitSegmentOverridePrefix(TSFlags, MemOperand, MI);
++  if (Encoding == X86II::VEX || Encoding == X86II::XOP) {
++    // Emit segment override opcode prefix as needed.
++    emitSegmentOverridePrefix(TSFlags, MemOperand, MI);
++
++    // VEX opcode prefix can have 2 or 3 bytes
++    //
++    //  3 bytes:
++    //    +-----+ +--------------+ +-------------------+
++    //    | C4h | | RXB | m-mmmm | | W | vvvv | L | pp |
++    //    +-----+ +--------------+ +-------------------+
++    //  2 bytes:
++    //    +-----+ +-------------------+
++    //    | C5h | | R | vvvv | L | pp |
++    //    +-----+ +-------------------+
++    //
++    //  XOP uses a similar prefix:
++    //    +-----+ +--------------+ +-------------------+
++    //    | 8Fh | | RXB | m-mmmm | | W | vvvv | L | pp |
++    //    +-----+ +--------------+ +-------------------+
++    unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3);
++  
++    // Can this use the 2 byte VEX prefix?
++    if (Encoding == X86II::VEX && VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) {
++      MCE.emitByte(0xC5);
++      MCE.emitByte(LastByte | (VEX_R << 7));
++      return;
++    }
++  
++    // 3 byte VEX prefix
++    MCE.emitByte(Encoding == X86II::XOP ? 0x8F : 0xC4);
++    MCE.emitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M);
++    MCE.emitByte(LastByte | (VEX_W << 7));
++  } else {
++    assert(Encoding == X86II::EVEX && "unknown encoding!");
++    // EVEX opcode prefix can have 4 bytes
++    //
++    // +-----+ +--------------+ +-------------------+ +------------------------+
++    // | 62h | | RXBR' | 00mm | | W | vvvv | U | pp | | z | L'L | b | v' | aaa |
++    // +-----+ +--------------+ +-------------------+ +------------------------+
++    assert((VEX_5M & 0x3) == VEX_5M
++           && "More than 2 significant bits in VEX.m-mmmm fields for EVEX!");
++
++    VEX_5M &= 0x3;
++
++    MCE.emitByte(0x62);
++    MCE.emitByte((VEX_R   << 7) |
++                 (VEX_X   << 6) |
++                 (VEX_B   << 5) |
++                 (EVEX_R2 << 4) |
++                 VEX_5M);
++    MCE.emitByte((VEX_W   << 7) |
++                 (VEX_4V  << 3) |
++                 (EVEX_U  << 2) |
++                 VEX_PP);
++    if (EncodeRC)
++      MCE.emitByte((EVEX_z  << 7) |
++                   (EVEX_rc << 5) |
++                   (EVEX_b  << 4) |
++                   (EVEX_V2 << 3) |
++                   EVEX_aaa);
++    else
++      MCE.emitByte((EVEX_z  << 7) |
++                   (EVEX_L2 << 6) |
++                   (VEX_L   << 5) |
++                   (EVEX_b  << 4) |
++                   (EVEX_V2 << 3) |
++                   EVEX_aaa);
++  }
++}
+ 
+-  // VEX opcode prefix can have 2 or 3 bytes
+-  //
+-  //  3 bytes:
+-  //    +-----+ +--------------+ +-------------------+
+-  //    | C4h | | RXB | m-mmmm | | W | vvvv | L | pp |
+-  //    +-----+ +--------------+ +-------------------+
+-  //  2 bytes:
+-  //    +-----+ +-------------------+
+-  //    | C5h | | R | vvvv | L | pp |
+-  //    +-----+ +-------------------+
+-  //
+-  //  XOP uses a similar prefix:
+-  //    +-----+ +--------------+ +-------------------+
+-  //    | 8Fh | | RXB | m-mmmm | | W | vvvv | L | pp |
+-  //    +-----+ +--------------+ +-------------------+
+-  unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3);
+-
+-  // Can this use the 2 byte VEX prefix?
+-  if (Encoding == X86II::VEX && VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) {
+-    MCE.emitByte(0xC5);
+-    MCE.emitByte(LastByte | (VEX_R << 7));
+-    return;
++template<class CodeEmitter>
++void Emitter<CodeEmitter>::emitQMMU(MachineInstr &MI,
++                                    const MCInstrDesc *Desc) {
++  // Emit the QMMU stub in place of MI (Desc is not used here). Layout:
++  //   jmp QMMUExit   (opcode 0xE9)
++  //   nop            (0x90)
++  //   jmp QMMUMiss   (omitted when QMMUMiss directly follows MBB in layout)
++  MachineBasicBlock *MBB = MI.getParent();
++  if (MBB->succ_size() != 2)  // exactly two successors (exit and miss) are required
++    llvm_unreachable("Unhandled QMMU stub!");
++
++  MachineBasicBlock* QMMUExit = *MBB->succ_begin();      // initial guess: first successor
++  MachineBasicBlock* QMMUMiss = *(++MBB->succ_begin());  // initial guess: second successor
++  MachineInstr *MRI = &*QMMUMiss->rbegin();  // last instruction of the guessed miss block
++  if (MRI->getDesc().getOpcode() != X86::TRAP) {  // miss block must end in TRAP, else swap
++    MachineBasicBlock *tmp = QMMUExit;
++    QMMUExit = QMMUMiss;
++    QMMUMiss = tmp;
+   }
+ 
+-  // 3 byte VEX prefix
+-  MCE.emitByte(Encoding == X86II::XOP ? 0x8F : 0xC4);
+-  MCE.emitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M);
+-  MCE.emitByte(LastByte | (VEX_W << 7));
++  MRI = &*QMMUMiss->rbegin();  // re-read after the possible swap
++  if (MRI->getDesc().getOpcode() != X86::TRAP)  // neither successor ends in TRAP
++    llvm_unreachable("Unknown QMMU CFG!");
++
++  MCE.emitByte(0xE9);  // jmp QMMUExit
++  emitPCRelativeBlockAddress(QMMUExit);
++  MCE.emitByte(0x90);  // nop
++  if (QMMUMiss != ++MachineFunction::iterator(MBB)) {  // miss block is not the fall-through
++    MCE.emitByte(0xE9);  // jmp QMMUMiss
++    emitPCRelativeBlockAddress(QMMUMiss);
++  }
++}
++
++template<class CodeEmitter>
++bool Emitter<CodeEmitter>::emitHQEMUInstruction(MachineInstr &MI,
++                                                const MCInstrDesc *Desc)
++{  // Returns true when MI was handled here, false to request normal encoding.
++  /* NOTE: the following flags must be synchronized with those in file
++   *       llvm-opc.h of the HQEMU source tree. */
++  enum {
++    PATCH_HQEMU = 0x4182U,
++    PATCH_DUMMY,
++    PATCH_EXIT_TB,
++    PATCH_DIRECT_JUMP,
++    PATCH_TRACE_BLOCK_CHAINING,
++    PATCH_QMMU
++  };
++
++  unsigned Opcode = Desc->Opcode;
++
++  switch (Opcode) {  // only these four opcodes can be HQEMU patch points
++  case X86::TRAP:
++  case X86::RETQ:
++  case X86::JMP32r:
++  case X86::JMP64r:
++    break;
++  default: return false;
++  }
++
++  LLVMContext &Ctx = MI.getParent()->getParent()->getFunction()->getContext();
++  MDNode *M = MI.getDebugLoc().getScope(Ctx);  // patch info is read from the debug-loc scope node
++  if (!M || !isa<ConstantInt>(M->getOperand(1)))  // operand 1 must be an integer constant
++    return false;
++
++  uint64_t flag = cast<ConstantInt>(M->getOperand(1))->getZExtValue();  // operand 1: patch type
++  if (flag < PATCH_HQEMU || flag > PATCH_QMMU)  // outside the PATCH_* range: not a marker
++    return false;
++
++  if (Opcode == X86::TRAP) {
++    if (flag == PATCH_QMMU)  // a PATCH_QMMU trap is consumed without emitting any byte
++      return true;
++
++    unsigned NumNOP = 3 - MCE.getCurrentPCValue() % 4;  // pad so 0xE9 lands at PC % 4 == 3
++    for (unsigned i = 0; i != NumNOP; ++i)
++      MCE.emitByte(0x90);
++
++    uintptr_t *ChainPoint = (uintptr_t *)cast<ConstantInt>(M->getOperand(2))->getZExtValue();  // operand 2 is used as a host pointer
++    *ChainPoint = (uintptr_t) MCE.getCurrentPCValue();  // store the address of the 0xE9 byte emitted next
++    MCE.emitByte(0xE9);
++    emitConstant(0, 4);  // zero placeholder; presumably patched later by HQEMU -- TODO confirm
++    return true;
++  }
++
++  if (Opcode == X86::RETQ) {
++    uintptr_t ExitAddr = (uintptr_t)cast<ConstantInt>(M->getOperand(2))->getZExtValue();  // operand 2: jump target address
++    uintptr_t Disp = ExitAddr - ((uintptr_t) MCE.getCurrentPCValue() + 5);  // relative to the end of the 5-byte jmp
++    MCE.emitByte(0xE9);
++    emitConstant(Disp, 4);
++    return true;
++  }
++  
++  if (Opcode == X86::JMP32r || Opcode == X86::JMP64r) {
++    if (flag == PATCH_QMMU) {  // indirect jumps are rewritten only for PATCH_QMMU
++      emitQMMU(MI, Desc);
++      return true;
++    }
++  }
++  return false;
+ }
+ 
+ template<class CodeEmitter>
+@@ -1032,6 +1272,11 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
+ 
+   unsigned Opcode = Desc->Opcode;
+ 
++  if (emitHQEMUInstruction(MI, Desc)) {
++    MCE.processDebugLoc(MI.getDebugLoc(), false);
++    return;
++  }
++
+   // If this is a two-address instruction, skip one of the register operands.
+   unsigned NumOps = Desc->getNumOperands();
+   unsigned CurOp = 0;
+diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
+index 2bd70a9..7e83c66 100644
+--- a/lib/Target/X86/X86MCInstLower.cpp
++++ b/lib/Target/X86/X86MCInstLower.cpp
+@@ -345,6 +345,10 @@ static unsigned getRetOpcode(const X86Subtarget &Subtarget)
+ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+   OutMI.setOpcode(MI->getOpcode());
+ 
++  DebugLoc Loc = MI->getDebugLoc();
++  if (!Loc.isUnknown())
++    OutMI.setDebugLoc(Loc);
++
+   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+     const MachineOperand &MO = MI->getOperand(i);
+ 
+diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
+index e8a7e84..a0b425e 100644
+--- a/lib/Target/X86/X86RegisterInfo.cpp
++++ b/lib/Target/X86/X86RegisterInfo.cpp
+@@ -395,9 +395,65 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+     }
+   }
+ 
++  getHQEMUReservedRegs(Reserved);
+   return Reserved;
+ }
+ 
++void X86RegisterInfo::getHQEMUReservedRegs(BitVector &Reserved) const {  // in/out: Reserved
++  for (unsigned i = 0, e = HQEMUReservedRegs.size(); i != e; ++i)
++    Reserved.set(HQEMUReservedRegs[i]);  // set the bit of every register recorded in HQEMUReservedRegs
++}
++
++void X86RegisterInfo::setHQEMUReservedRegs(std::string RegName) {  // appends RegName and its aliases to HQEMUReservedRegs; unmatched names are ignored
++#define RESERVE1(x) \
++  do { \
++    HQEMUReservedRegs.push_back(X86::x ## L); \
++    HQEMUReservedRegs.push_back(X86::x ## H); \
++    HQEMUReservedRegs.push_back(X86::x ## X);\
++    HQEMUReservedRegs.push_back(X86::E ## x ## X);\
++    HQEMUReservedRegs.push_back(X86::R ## x ## X);\
++    return; \
++  } while(0)
++
++#define RESERVE2(x) \
++  do { \
++    HQEMUReservedRegs.push_back(X86::R ## x); \
++    HQEMUReservedRegs.push_back(X86::R ## x ## B);\
++    HQEMUReservedRegs.push_back(X86::R ## x ## D);\
++    HQEMUReservedRegs.push_back(X86::R ## x ## W);\
++    return; \
++  } while(0)
++
++  if (RegName == "ebp") {
++    // 32-bit register (NOTE(review): RBP itself is never added by this branch)
++    HQEMUReservedRegs.push_back(X86::EBP);
++    // 16-bit register
++    HQEMUReservedRegs.push_back(X86::BP);
++#if defined(__x86_64__)
++    // 8-bit register; added only when LLVM itself is compiled for x86-64
++    HQEMUReservedRegs.push_back(X86::BPL);
++#endif
++    return;
++  }
++#if defined(__x86_64__)
++  if (RegName == "rax") RESERVE1(A);  // AL, AH, AX, EAX, RAX; the macro returns
++  if (RegName == "rbx") RESERVE1(B);
++  if (RegName == "rcx") RESERVE1(C);
++  if (RegName == "rdx") RESERVE1(D);
++  if (RegName == "r8")  RESERVE2(8);  // R8, R8B, R8D, R8W; the macro returns
++  if (RegName == "r9")  RESERVE2(9);
++  if (RegName == "r10") RESERVE2(10);
++  if (RegName == "r11") RESERVE2(11);
++  if (RegName == "r12") RESERVE2(12);
++  if (RegName == "r13") RESERVE2(13);
++  if (RegName == "r14") RESERVE2(14);
++  if (RegName == "r15") RESERVE2(15);
++#endif
++
++#undef RESERVE1
++#undef RESERVE2
++}
++
+ //===----------------------------------------------------------------------===//
+ // Stack Frame Processing methods
+ //===----------------------------------------------------------------------===//
+diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
+index 74efd1f..d709505 100644
+--- a/lib/Target/X86/X86RegisterInfo.h
++++ b/lib/Target/X86/X86RegisterInfo.h
+@@ -107,6 +107,9 @@ public:
+   /// register scavenger to determine what registers are free.
+   BitVector getReservedRegs(const MachineFunction &MF) const override;
+ 
++  void getHQEMUReservedRegs(BitVector &Reserved) const override;
++  void setHQEMUReservedRegs(std::string RegName) override;
++
+   bool hasBasePointer(const MachineFunction &MF) const;
+ 
+   bool canRealignStack(const MachineFunction &MF) const;
+diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
+index a5e443f..cd4f57a 100644
+--- a/lib/Transforms/Utils/Local.cpp
++++ b/lib/Transforms/Utils/Local.cpp
+@@ -1188,12 +1188,15 @@ static bool markAliveBlocks(BasicBlock *BB,
+           // If we found a call to a no-return function, insert an unreachable
+           // instruction after it.  Make sure there isn't *already* one there
+           // though.
++#if 0
++          // HQEMU: do not delete instructions after llvm.trap.
+           ++BBI;
+           if (!isa<UnreachableInst>(BBI)) {
+             // Don't insert a call to llvm.trap right before the unreachable.
+             changeToUnreachable(BBI, false);
+             Changed = true;
+           }
++#endif
+           break;
+         }
+       }
+diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
+index 1c62559..8375529 100644
+--- a/lib/Transforms/Utils/SimplifyCFG.cpp
++++ b/lib/Transforms/Utils/SimplifyCFG.cpp
+@@ -1028,6 +1028,11 @@ static bool HoistThenElseCodeToIf(BranchInst *BI, const DataLayout *DL) {
+ 
+   bool Changed = false;
+   do {
++    // HQEMU: skip hoisting instructions from llvm.trap to the terminator
++    // instruction.
++    if (isa<IntrinsicInst>(I1) || I1->hasMetadata())
++      return Changed;
++
+     // If we are hoisting the terminator instruction, don't move one (making a
+     // broken BB), instead clone it, and remove BI.
+     if (isa<TerminatorInst>(I1))
+@@ -3968,6 +3973,10 @@ bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) {
+   BasicBlock *BB = IBI->getParent();
+   bool Changed = false;
+ 
++  // HQEMU: LLVM tries to remove the indirectbr with no successors.
++  // Disable it because we use indirectbr to implement IBTC.
++  return false;
++
+   // Eliminate redundant destinations.
+   SmallPtrSet<Value *, 8> Succs;
+   for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) {
diff --git a/patch/llvm/llvm-3.8.patch b/patch/llvm/llvm-3.8.patch
new file mode 100644
index 0000000..a2f8968
--- /dev/null
+++ b/patch/llvm/llvm-3.8.patch
@@ -0,0 +1,247 @@
+diff --git a/include/llvm/ExecutionEngine/ExecutionEngine.h b/include/llvm/ExecutionEngine/ExecutionEngine.h
+index a730260..5102344 100644
+--- a/include/llvm/ExecutionEngine/ExecutionEngine.h
++++ b/include/llvm/ExecutionEngine/ExecutionEngine.h
+@@ -550,6 +550,7 @@ public:
+   /// is called and is successful, the created engine takes ownership of the
+   /// memory manager. This option defaults to NULL.
+   EngineBuilder &setMCJITMemoryManager(std::unique_ptr<RTDyldMemoryManager> mcjmm);
++  EngineBuilder &setMCJITMemoryManager(std::shared_ptr<RTDyldMemoryManager> mcjmm);
+ 
+   EngineBuilder&
+   setMemoryManager(std::unique_ptr<MCJITMemoryManager> MM);
+diff --git a/include/llvm/MC/MCInst.h b/include/llvm/MC/MCInst.h
+index 4688b5f..e3124bf 100644
+--- a/include/llvm/MC/MCInst.h
++++ b/include/llvm/MC/MCInst.h
+@@ -27,6 +27,7 @@ class MCAsmInfo;
+ class MCInstPrinter;
+ class MCExpr;
+ class MCInst;
++class DebugLoc;
+ 
+ /// \brief Instances of this class represent operands of the MCInst class.
+ /// This is a simple discriminated union.
+@@ -151,9 +152,10 @@ class MCInst {
+   unsigned Opcode;
+   SMLoc Loc;
+   SmallVector<MCOperand, 8> Operands;
++  const DebugLoc *DbgLoc;
+ 
+ public:
+-  MCInst() : Opcode(0) {}
++  MCInst() : Opcode(0), DbgLoc(nullptr) {}
+ 
+   void setOpcode(unsigned Op) { Opcode = Op; }
+   unsigned getOpcode() const { return Opcode; }
+@@ -161,6 +163,9 @@ public:
+   void setLoc(SMLoc loc) { Loc = loc; }
+   SMLoc getLoc() const { return Loc; }
+ 
++  void setDebugLoc(const DebugLoc *Loc) { DbgLoc = Loc; }
++  const DebugLoc *getDebugLoc() const { return DbgLoc; }
++
+   const MCOperand &getOperand(unsigned i) const { return Operands[i]; }
+   MCOperand &getOperand(unsigned i) { return Operands[i]; }
+   unsigned getNumOperands() const { return Operands.size(); }
+diff --git a/include/llvm/MC/MCInstrInfo.h b/include/llvm/MC/MCInstrInfo.h
+index 70c8658..69a6427 100644
+--- a/include/llvm/MC/MCInstrInfo.h
++++ b/include/llvm/MC/MCInstrInfo.h
+@@ -26,6 +26,7 @@ class MCInstrInfo {
+   const unsigned *InstrNameIndices; // Array for name indices in InstrNameData
+   const char *InstrNameData;        // Instruction name string pool
+   unsigned NumOpcodes;              // Number of entries in the desc array
++  unsigned long HQEMUExitAddr;
+ 
+ public:
+   /// \brief Initialize MCInstrInfo, called by TableGen auto-generated routines.
+@@ -52,6 +53,9 @@ public:
+     assert(Opcode < NumOpcodes && "Invalid opcode!");
+     return &InstrNameData[InstrNameIndices[Opcode]];
+   }
++
++  void setHQEMUExitAddr(unsigned long Addr) { HQEMUExitAddr = Addr; }
++  unsigned long getHQEMUExitAddr() const { return HQEMUExitAddr; }
+ };
+ 
+ } // End llvm namespace
+diff --git a/lib/ExecutionEngine/ExecutionEngine.cpp b/lib/ExecutionEngine/ExecutionEngine.cpp
+index 41c8da4..ffca9ea 100644
+--- a/lib/ExecutionEngine/ExecutionEngine.cpp
++++ b/lib/ExecutionEngine/ExecutionEngine.cpp
+@@ -497,6 +497,13 @@ EngineBuilder &EngineBuilder::setMCJITMemoryManager(
+   return *this;
+ }
+ 
++EngineBuilder &EngineBuilder::setMCJITMemoryManager(
++                                   std::shared_ptr<RTDyldMemoryManager> mcjmm) {  // shared_ptr overload
++  MemMgr = mcjmm;    // the builder holds a copy of the shared_ptr
++  Resolver = mcjmm;  // the same object is also installed as Resolver
++  return *this;      // returns the builder so calls can be chained
++}
++
+ EngineBuilder&
+ EngineBuilder::setMemoryManager(std::unique_ptr<MCJITMemoryManager> MM) {
+   MemMgr = std::shared_ptr<MCJITMemoryManager>(std::move(MM));
+diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+index dfab6ec..8a9752f 100644
+--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
++++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+@@ -23,6 +23,7 @@
+ #include "llvm/MC/MCSubtargetInfo.h"
+ #include "llvm/MC/MCSymbol.h"
+ #include "llvm/Support/raw_ostream.h"
++#include "llvm/IR/DebugLoc.h"
+ 
+ using namespace llvm;
+ 
+@@ -164,6 +165,9 @@ public:
+                         const MCInst &MI, const MCInstrDesc &Desc,
+                         const MCSubtargetInfo &STI,
+                         raw_ostream &OS) const;
++
++  bool EmitHQEMUInstruction(const MCInst &MI, raw_ostream &OS,
++                            SmallVectorImpl<MCFixup> &Fixups) const;
+ };
+ 
+ } // end anonymous namespace
+@@ -1158,6 +1162,52 @@ void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
+   }
+ }
+ 
++bool X86MCCodeEmitter::
++EmitHQEMUInstruction(const MCInst &MI, raw_ostream &OS,
++                     SmallVectorImpl<MCFixup> &Fixups) const {  // true => MI was emitted here
++  /* NOTE: the following flags must be synchronized with those in file
++   *       llvm-opc.h of the HQEMU source tree. */
++  enum {
++    PATCH_HQEMU = 0x4182U,
++    PATCH_DUMMY,
++    PATCH_EXIT_TB,
++    PATCH_DIRECT_JUMP,
++    PATCH_TRACE_BLOCK_CHAINING,
++    PATCH_QMMU
++  };
++
++  unsigned Opcode = MI.getOpcode();
++  switch (Opcode) {  // only TRAP and RETQ can be HQEMU patch points here
++  case X86::TRAP:
++  case X86::RETQ:
++    break;
++  default: return false;
++  }
++
++  unsigned CurByte = 0;  // byte counter handed to EmitByte / EmitImmediate
++  const DebugLoc *Loc = MI.getDebugLoc();
++  if (!Loc)  // no debug location attached to this MCInst: encode normally
++    return false;
++
++  unsigned PatchType = Loc->getLine();  // the patch type is read from the line number
++  if (PatchType < PATCH_HQEMU || PatchType > PATCH_QMMU)  // outside the PATCH_* range
++    return false;
++
++  if (Opcode == X86::TRAP) {
++    for (unsigned i = 0; i != 8; ++i)  // a tagged TRAP is replaced by eight NOP bytes
++      EmitByte(0x90, CurByte, OS);
++    return true;
++  }
++  if (Opcode == X86::RETQ) {
++    uintptr_t ExitAddr = MCII.getHQEMUExitAddr();
++    EmitByte(0xE9, CurByte, OS);  // a tagged RETQ is replaced by jmp rel32 (opcode 0xE9)
++    EmitImmediate(MCOperand::createImm(ExitAddr), MI.getLoc(), 4, FK_PCRel_4,
++                  CurByte, OS, Fixups);  // 4-byte FK_PCRel_4 immediate built from ExitAddr
++    return true;
++  }
++  return false;
++}
++
+ void X86MCCodeEmitter::
+ encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                   SmallVectorImpl<MCFixup> &Fixups,
+@@ -1166,6 +1216,9 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
+   const MCInstrDesc &Desc = MCII.get(Opcode);
+   uint64_t TSFlags = Desc.TSFlags;
+ 
++  if (EmitHQEMUInstruction(MI, OS, Fixups))
++    return;
++
+   // Pseudo instructions don't get encoded.
+   if ((TSFlags & X86II::FormMask) == X86II::Pseudo)
+     return;
+diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
+index e1ca558..c3acaec 100644
+--- a/lib/Target/X86/X86MCInstLower.cpp
++++ b/lib/Target/X86/X86MCInstLower.cpp
+@@ -437,6 +437,9 @@ X86MCInstLower::LowerMachineOperand(const MachineInstr *MI,
+ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+   OutMI.setOpcode(MI->getOpcode());
+ 
++  if (MI->getDebugLoc())
++    OutMI.setDebugLoc(&MI->getDebugLoc());
++
+   for (const MachineOperand &MO : MI->operands())
+     if (auto MaybeMCOp = LowerMachineOperand(MI, MO))
+       OutMI.addOperand(MaybeMCOp.getValue());
+diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
+index 274b566..dbb4fec 100644
+--- a/lib/Target/X86/X86RegisterInfo.cpp
++++ b/lib/Target/X86/X86RegisterInfo.cpp
+@@ -473,6 +473,19 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+     }
+   }
+ 
++  // Reserve registers for HQEMU.
++  if (MF.getFunction()->hasFnAttribute("hqemu")) {
++    if (!Is64Bit) {
++      Reserved.set(X86::EBP);
++      Reserved.set(X86::BP);
++      Reserved.set(X86::BPL);
++    } else {
++      Reserved.set(X86::R14);
++      Reserved.set(X86::R14B);
++      Reserved.set(X86::R14D);
++      Reserved.set(X86::R14W);
++    }
++  }
+   return Reserved;
+ }
+ 
+diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
+index abc9b65..39241c2 100644
+--- a/lib/Transforms/Utils/Local.cpp
++++ b/lib/Transforms/Utils/Local.cpp
+@@ -1302,7 +1302,8 @@ static bool markAliveBlocks(Function &F,
+         }
+ 
+       if (CallInst *CI = dyn_cast<CallInst>(BBI)) {
+-        if (CI->doesNotReturn()) {
++        // HQEMU: do not delete instructions after llvm.trap.
++        if (!F.hasFnAttribute("hqemu") && CI->doesNotReturn()) {
+           // If we found a call to a no-return function, insert an unreachable
+           // instruction after it.  Make sure there isn't *already* one there
+           // though.
+diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
+index e484b69..6ac6033 100644
+--- a/lib/Transforms/Utils/SimplifyCFG.cpp
++++ b/lib/Transforms/Utils/SimplifyCFG.cpp
+@@ -1120,6 +1120,9 @@ static bool HoistThenElseCodeToIf(BranchInst *BI,
+ 
+   bool Changed = false;
+   do {
++    if (BIParent->getParent()->hasFnAttribute("hqemu"))
++      if (isa<IntrinsicInst>(I1) || I1->hasMetadata())
++        return Changed;
+     // If we are hoisting the terminator instruction, don't move one (making a
+     // broken BB), instead clone it, and remove BI.
+     if (isa<TerminatorInst>(I1))
+@@ -4898,6 +4901,9 @@ bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) {
+   BasicBlock *BB = IBI->getParent();
+   bool Changed = false;
+ 
++  if (BB->getParent()->hasFnAttribute("hqemu"))
++    return false;
++
+   // Eliminate redundant destinations.
+   SmallPtrSet<Value *, 8> Succs;
+   for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) {
diff --git a/patch/llvm/llvm-3.9.patch b/patch/llvm/llvm-3.9.patch
new file mode 100644
index 0000000..38fa566
--- /dev/null
+++ b/patch/llvm/llvm-3.9.patch
@@ -0,0 +1,404 @@
+diff --git a/include/llvm/ExecutionEngine/ExecutionEngine.h b/include/llvm/ExecutionEngine/ExecutionEngine.h
+index ab13028..810f403 100644
+--- a/include/llvm/ExecutionEngine/ExecutionEngine.h
++++ b/include/llvm/ExecutionEngine/ExecutionEngine.h
+@@ -550,6 +550,7 @@ public:
+   /// is called and is successful, the created engine takes ownership of the
+   /// memory manager. This option defaults to NULL.
+   EngineBuilder &setMCJITMemoryManager(std::unique_ptr<RTDyldMemoryManager> mcjmm);
++  EngineBuilder &setMCJITMemoryManager(std::shared_ptr<RTDyldMemoryManager> mcjmm);
+ 
+   EngineBuilder&
+   setMemoryManager(std::unique_ptr<MCJITMemoryManager> MM);
+diff --git a/include/llvm/MC/MCInst.h b/include/llvm/MC/MCInst.h
+index 4688b5f..e3124bf 100644
+--- a/include/llvm/MC/MCInst.h
++++ b/include/llvm/MC/MCInst.h
+@@ -27,6 +27,7 @@ class MCAsmInfo;
+ class MCInstPrinter;
+ class MCExpr;
+ class MCInst;
++class DebugLoc;
+ 
+ /// \brief Instances of this class represent operands of the MCInst class.
+ /// This is a simple discriminated union.
+@@ -151,9 +152,10 @@ class MCInst {
+   unsigned Opcode;
+   SMLoc Loc;
+   SmallVector<MCOperand, 8> Operands;
++  const DebugLoc *DbgLoc;
+ 
+ public:
+-  MCInst() : Opcode(0) {}
++  MCInst() : Opcode(0), DbgLoc(nullptr) {}
+ 
+   void setOpcode(unsigned Op) { Opcode = Op; }
+   unsigned getOpcode() const { return Opcode; }
+@@ -161,6 +163,9 @@ public:
+   void setLoc(SMLoc loc) { Loc = loc; }
+   SMLoc getLoc() const { return Loc; }
+ 
++  void setDebugLoc(const DebugLoc *Loc) { DbgLoc = Loc; }
++  const DebugLoc *getDebugLoc() const { return DbgLoc; }
++
+   const MCOperand &getOperand(unsigned i) const { return Operands[i]; }
+   MCOperand &getOperand(unsigned i) { return Operands[i]; }
+   unsigned getNumOperands() const { return Operands.size(); }
+diff --git a/include/llvm/MC/MCInstrInfo.h b/include/llvm/MC/MCInstrInfo.h
+index 70c8658..69a6427 100644
+--- a/include/llvm/MC/MCInstrInfo.h
++++ b/include/llvm/MC/MCInstrInfo.h
+@@ -26,6 +26,7 @@ class MCInstrInfo {
+   const unsigned *InstrNameIndices; // Array for name indices in InstrNameData
+   const char *InstrNameData;        // Instruction name string pool
+   unsigned NumOpcodes;              // Number of entries in the desc array
++  unsigned long HQEMUExitAddr;
+ 
+ public:
+   /// \brief Initialize MCInstrInfo, called by TableGen auto-generated routines.
+@@ -52,6 +53,9 @@ public:
+     assert(Opcode < NumOpcodes && "Invalid opcode!");
+     return &InstrNameData[InstrNameIndices[Opcode]];
+   }
++
++  void setHQEMUExitAddr(unsigned long Addr) { HQEMUExitAddr = Addr; }
++  unsigned long getHQEMUExitAddr() const { return HQEMUExitAddr; }
+ };
+ 
+ } // End llvm namespace
+diff --git a/lib/ExecutionEngine/ExecutionEngine.cpp b/lib/ExecutionEngine/ExecutionEngine.cpp
+index a8e68bf..a4f1d99 100644
+--- a/lib/ExecutionEngine/ExecutionEngine.cpp
++++ b/lib/ExecutionEngine/ExecutionEngine.cpp
+@@ -492,6 +492,13 @@ EngineBuilder &EngineBuilder::setMCJITMemoryManager(
+   return *this;
+ }
+ 
++EngineBuilder &EngineBuilder::setMCJITMemoryManager(
++                                   std::shared_ptr<RTDyldMemoryManager> mcjmm) {  // shared_ptr overload
++  MemMgr = mcjmm;    // the builder holds a copy of the shared_ptr
++  Resolver = mcjmm;  // the same object is also installed as Resolver
++  return *this;      // returns the builder so calls can be chained
++}
++
+ EngineBuilder&
+ EngineBuilder::setMemoryManager(std::unique_ptr<MCJITMemoryManager> MM) {
+   MemMgr = std::shared_ptr<MCJITMemoryManager>(std::move(MM));
+diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp
+index 2b4cdf1..0e09232 100644
+--- a/lib/Target/AArch64/AArch64MCInstLower.cpp
++++ b/lib/Target/AArch64/AArch64MCInstLower.cpp
+@@ -207,6 +207,9 @@ bool AArch64MCInstLower::lowerOperand(const MachineOperand &MO,
+ void AArch64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+   OutMI.setOpcode(MI->getOpcode());
+ 
++  if (MI->getDebugLoc())
++    OutMI.setDebugLoc(&MI->getDebugLoc());
++
+   for (const MachineOperand &MO : MI->operands()) {
+     MCOperand MCOp;
+     if (lowerOperand(MO, MCOp))
+diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
+index af867da..1755863 100644
+--- a/lib/Target/AArch64/AArch64RegisterInfo.cpp
++++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
+@@ -138,6 +138,14 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+     Reserved.set(AArch64::W19);
+   }
+ 
++  // Reserve registers for HQEMU.
++  if (MF.getFunction()->hasFnAttribute("hqemu")) {
++    Reserved.set(AArch64::X19);
++    Reserved.set(AArch64::W19);
++    Reserved.set(AArch64::X28);
++    Reserved.set(AArch64::W28);
++  }
++
+   return Reserved;
+ }
+ 
+diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+index 7b9ff8f..7d724cb 100644
+--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
++++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+@@ -24,6 +24,7 @@
+ #include "llvm/MC/MCSubtargetInfo.h"
+ #include "llvm/Support/EndianStream.h"
+ #include "llvm/Support/raw_ostream.h"
++#include "llvm/IR/DebugLoc.h"
+ using namespace llvm;
+ 
+ #define DEBUG_TYPE "mccodeemitter"
+@@ -35,11 +36,13 @@ namespace {
+ 
+ class AArch64MCCodeEmitter : public MCCodeEmitter {
+   MCContext &Ctx;
++  const MCInstrInfo &MCII;
+ 
+   AArch64MCCodeEmitter(const AArch64MCCodeEmitter &); // DO NOT IMPLEMENT
+   void operator=(const AArch64MCCodeEmitter &);     // DO NOT IMPLEMENT
+ public:
+-  AArch64MCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) : Ctx(ctx) {}
++  AArch64MCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
++      : Ctx(ctx), MCII(mcii) {}
+ 
+   ~AArch64MCCodeEmitter() override {}
+ 
+@@ -170,6 +173,10 @@ public:
+ 
+   unsigned fixOneOperandFPComparison(const MCInst &MI, unsigned EncodedValue,
+                                      const MCSubtargetInfo &STI) const;
++
++  bool EmitHQEMUInstruction(const MCInst &MI, raw_ostream &OS,
++                            SmallVectorImpl<MCFixup> &Fixups,
++                            const MCSubtargetInfo &STI) const;
+ };
+ 
+ } // end anonymous namespace
+@@ -536,9 +543,85 @@ unsigned AArch64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue,
+   return EncodedValue & ~(1u << 30);
+ }
+ 
++bool AArch64MCCodeEmitter::
++EmitHQEMUInstruction(const MCInst &MI, raw_ostream &OS,
++                     SmallVectorImpl<MCFixup> &Fixups,
++                     const MCSubtargetInfo &STI) const {  // true => MI was emitted here
++  /* NOTE: the following flags must be synchronized with those in file
++   *       llvm-opc.h of the HQEMU source tree. */
++  enum {
++    PATCH_HQEMU = 0x4182U,
++    PATCH_DUMMY,
++    PATCH_EXIT_TB,
++    PATCH_DIRECT_JUMP,
++    PATCH_TRACE_BLOCK_CHAINING,
++    PATCH_QMMU
++  };
++
++  unsigned Opcode = MI.getOpcode();
++  switch (Opcode) {  // only BRK and RET can be HQEMU patch points here
++  case AArch64::BRK:
++  case AArch64::RET:
++    break;
++  default: return false;
++  }
++
++  const DebugLoc *Loc = MI.getDebugLoc();
++  if (!Loc)  // no debug location attached to this MCInst: encode normally
++    return false;
++
++  unsigned PatchType = Loc->getLine();  // the patch type is read from the line number
++  if (PatchType < PATCH_HQEMU || PatchType > PATCH_QMMU)  // outside the PATCH_* range
++    return false;
++
++  if (Opcode == AArch64::BRK) {  // a tagged BRK is replaced by a single B instruction
++    uint64_t Binary = 0;
++    MCOperand Operand = MCOperand::createImm(1);  // immediate operand of the B below
++    MCInst Jump;
++
++    Jump.setOpcode(AArch64::B);
++    Jump.addOperand(Operand);
++    Binary = getBinaryCodeForInstr(Jump, Fixups, STI);
++    support::endian::Writer<support::little>(OS).write<uint32_t>(Binary);  // one 32-bit LE word
++    ++MCNumEmitted;
++    return true;
++  }
++  if (Opcode == AArch64::RET) {  // a tagged RET is replaced by movz/movk/br (three words)
++    uint64_t ExitAddr = MCII.getHQEMUExitAddr();
++    uint32_t Binary[4];  // only Binary[0..2] are written and emitted below
++    MCOperand Reg = MCOperand::createReg(AArch64::X1);
++    MCInst Jump, Mov;  // NOTE(review): Mov is never used
++
++    // movz w1, #ExitAddr[15:0]  (Rd field is 0x1, matching the BR X1 below)
++    Binary[0] = (0x2 << 29) | 0x1;
++    Binary[0] |= (0x25 << 23);
++    Binary[0] |= ((ExitAddr & 0xFFFF) << 5);
++
++    // movk w1, #ExitAddr[31:16], lsl #16  (Rd field is 0x1)
++    Binary[1] =  (0x3 << 29) | 0x1;
++    Binary[1] |= (0x25 << 23);
++    Binary[1] |= (0x1 << 21);
++    Binary[1] |= ((ExitAddr & 0xFFFF0000) >> 11);  // NOTE(review): ExitAddr bits 63:32 are not encoded
++
++    Jump.setOpcode(AArch64::BR);  // br x1
++    Jump.addOperand(Reg);
++    Binary[2] = getBinaryCodeForInstr(Jump, Fixups, STI);
++
++    for (int i = 0; i < 3; ++i) {  // write the three words in little-endian order
++      support::endian::Writer<support::little>(OS).write<uint32_t>(Binary[i]);
++      ++MCNumEmitted;
++    }
++    return true;
++  }
++  return false;
++}
++
+ void AArch64MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                                              SmallVectorImpl<MCFixup> &Fixups,
+                                              const MCSubtargetInfo &STI) const {
++  if (EmitHQEMUInstruction(MI, OS, Fixups, STI))
++    return;
++
+   if (MI.getOpcode() == AArch64::TLSDESCCALL) {
+     // This is a directive which applies an R_AARCH64_TLSDESC_CALL to the
+     // following (BLR) instruction. It doesn't emit any code itself so it
+diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+index 96c2e81..504b3eb 100644
+--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
++++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+@@ -23,6 +23,7 @@
+ #include "llvm/MC/MCSubtargetInfo.h"
+ #include "llvm/MC/MCSymbol.h"
+ #include "llvm/Support/raw_ostream.h"
++#include "llvm/IR/DebugLoc.h"
+ 
+ using namespace llvm;
+ 
+@@ -142,6 +143,9 @@ public:
+                         const MCInst &MI, const MCInstrDesc &Desc,
+                         const MCSubtargetInfo &STI, raw_ostream &OS) const;
+ 
++  bool EmitHQEMUInstruction(const MCInst &MI, raw_ostream &OS,
++                            SmallVectorImpl<MCFixup> &Fixups) const;
++
+   uint8_t DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
+                              int MemOperand, const MCInstrDesc &Desc) const;
+ };
+@@ -1110,6 +1114,52 @@ bool X86MCCodeEmitter::emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
+   return Ret;
+ }
+ 
++bool X86MCCodeEmitter::
++EmitHQEMUInstruction(const MCInst &MI, raw_ostream &OS,
++                     SmallVectorImpl<MCFixup> &Fixups) const {  // true => MI was emitted here
++  /* NOTE: the following flags must be synchronized with those in file
++   *       llvm-opc.h of the HQEMU source tree. */
++  enum {
++    PATCH_HQEMU = 0x4182U,
++    PATCH_DUMMY,
++    PATCH_EXIT_TB,
++    PATCH_DIRECT_JUMP,
++    PATCH_TRACE_BLOCK_CHAINING,
++    PATCH_QMMU
++  };
++
++  unsigned Opcode = MI.getOpcode();
++  switch (Opcode) {  // only TRAP and RETQ can be HQEMU patch points here
++  case X86::TRAP:
++  case X86::RETQ:
++    break;
++  default: return false;
++  }
++
++  unsigned CurByte = 0;  // byte counter handed to EmitByte / EmitImmediate
++  const DebugLoc *Loc = MI.getDebugLoc();
++  if (!Loc)  // no debug location attached to this MCInst: encode normally
++    return false;
++
++  unsigned PatchType = Loc->getLine();  // the patch type is read from the line number
++  if (PatchType < PATCH_HQEMU || PatchType > PATCH_QMMU)  // outside the PATCH_* range
++    return false;
++
++  if (Opcode == X86::TRAP) {
++    for (unsigned i = 0; i != 8; ++i)  // a tagged TRAP is replaced by eight NOP bytes
++      EmitByte(0x90, CurByte, OS);
++    return true;
++  }
++  if (Opcode == X86::RETQ) {
++    uintptr_t ExitAddr = MCII.getHQEMUExitAddr();
++    EmitByte(0xE9, CurByte, OS);  // a tagged RETQ is replaced by jmp rel32 (opcode 0xE9)
++    EmitImmediate(MCOperand::createImm(ExitAddr), MI.getLoc(), 4, FK_PCRel_4,
++                  CurByte, OS, Fixups);  // 4-byte FK_PCRel_4 immediate built from ExitAddr
++    return true;
++  }
++  return false;
++}
++
+ void X86MCCodeEmitter::
+ encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                   SmallVectorImpl<MCFixup> &Fixups,
+@@ -1118,6 +1168,9 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
+   const MCInstrDesc &Desc = MCII.get(Opcode);
+   uint64_t TSFlags = Desc.TSFlags;
+ 
++  if (EmitHQEMUInstruction(MI, OS, Fixups))
++    return;
++
+   // Pseudo instructions don't get encoded.
+   if ((TSFlags & X86II::FormMask) == X86II::Pseudo)
+     return;
+diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
+index 906e342..8f7db6b 100644
+--- a/lib/Target/X86/X86MCInstLower.cpp
++++ b/lib/Target/X86/X86MCInstLower.cpp
+@@ -389,6 +389,9 @@ X86MCInstLower::LowerMachineOperand(const MachineInstr *MI,
+ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+   OutMI.setOpcode(MI->getOpcode());
+ 
++  if (MI->getDebugLoc())
++    OutMI.setDebugLoc(&MI->getDebugLoc());
++
+   for (const MachineOperand &MO : MI->operands())
+     if (auto MaybeMCOp = LowerMachineOperand(MI, MO))
+       OutMI.addOperand(MaybeMCOp.getValue());
+diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
+index 8675063..e1d0e19 100644
+--- a/lib/Target/X86/X86RegisterInfo.cpp
++++ b/lib/Target/X86/X86RegisterInfo.cpp
+@@ -503,6 +503,19 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+     }
+   }
+ 
++  // Reserve registers for HQEMU.
++  if (MF.getFunction()->hasFnAttribute("hqemu")) {
++    if (!Is64Bit) {
++      Reserved.set(X86::EBP);
++      Reserved.set(X86::BP);
++      Reserved.set(X86::BPL);
++    } else {
++      Reserved.set(X86::R14);
++      Reserved.set(X86::R14B);
++      Reserved.set(X86::R14D);
++      Reserved.set(X86::R14W);
++    }
++  }
+   return Reserved;
+ }
+ 
+diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
+index f1838d8..3d4d3b9 100644
+--- a/lib/Transforms/Utils/Local.cpp
++++ b/lib/Transforms/Utils/Local.cpp
+@@ -1413,7 +1413,8 @@ static bool markAliveBlocks(Function &F,
+           Changed = true;
+           break;
+         }
+-        if (CI->doesNotReturn()) {
++        // HQEMU: do not delete instructions after llvm.trap.
++        if (!F.hasFnAttribute("hqemu") && CI->doesNotReturn()) {
+           // If we found a call to a no-return function, insert an unreachable
+           // instruction after it.  Make sure there isn't *already* one there
+           // though.
+diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
+index 0504646..92291c3 100644
+--- a/lib/Transforms/Utils/SimplifyCFG.cpp
++++ b/lib/Transforms/Utils/SimplifyCFG.cpp
+@@ -1201,6 +1201,9 @@ static bool HoistThenElseCodeToIf(BranchInst *BI,
+ 
+   bool Changed = false;
+   do {
++    if (BIParent->getParent()->hasFnAttribute("hqemu"))
++      if (isa<IntrinsicInst>(I1) || I1->hasMetadata())
++        return Changed;
+     // If we are hoisting the terminator instruction, don't move one (making a
+     // broken BB), instead clone it, and remove BI.
+     if (isa<TerminatorInst>(I1))
+@@ -5088,6 +5091,9 @@ bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) {
+   BasicBlock *BB = IBI->getParent();
+   bool Changed = false;
+ 
++  if (BB->getParent()->hasFnAttribute("hqemu"))
++    return false;
++
+   // Eliminate redundant destinations.
+   SmallPtrSet<Value *, 8> Succs;
+   for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) {
diff --git a/patch/llvm/llvm-5.0.patch b/patch/llvm/llvm-5.0.patch
new file mode 100644
index 0000000..bb89779
--- /dev/null
+++ b/patch/llvm/llvm-5.0.patch
@@ -0,0 +1,652 @@
+diff --git a/include/llvm/ExecutionEngine/ExecutionEngine.h b/include/llvm/ExecutionEngine/ExecutionEngine.h
+index 2830a26..8c9c09e 100644
+--- a/include/llvm/ExecutionEngine/ExecutionEngine.h
++++ b/include/llvm/ExecutionEngine/ExecutionEngine.h
+@@ -566,6 +566,7 @@ public:
+   /// is called and is successful, the created engine takes ownership of the
+   /// memory manager. This option defaults to NULL.
+   EngineBuilder &setMCJITMemoryManager(std::unique_ptr<RTDyldMemoryManager> mcjmm);
++  EngineBuilder &setMCJITMemoryManager(std::shared_ptr<RTDyldMemoryManager> mcjmm);
+ 
+   EngineBuilder&
+   setMemoryManager(std::unique_ptr<MCJITMemoryManager> MM);
+diff --git a/include/llvm/MC/MCInst.h b/include/llvm/MC/MCInst.h
+index 9bf440e..4f0250c 100644
+--- a/include/llvm/MC/MCInst.h
++++ b/include/llvm/MC/MCInst.h
+@@ -29,6 +29,7 @@ class MCExpr;
+ class MCInst;
+ class MCInstPrinter;
+ class raw_ostream;
++class DebugLoc;
+ 
+ /// \brief Instances of this class represent operands of the MCInst class.
+ /// This is a simple discriminated union.
+@@ -160,6 +161,7 @@ class MCInst {
+   unsigned Opcode = 0;
+   SMLoc Loc;
+   SmallVector<MCOperand, 8> Operands;
++  const DebugLoc *DbgLoc = nullptr;
+ 
+ public:
+   MCInst() = default;
+@@ -170,6 +172,9 @@ public:
+   void setLoc(SMLoc loc) { Loc = loc; }
+   SMLoc getLoc() const { return Loc; }
+ 
++  void setDebugLoc(const DebugLoc *Loc) { DbgLoc = Loc; }
++  const DebugLoc *getDebugLoc() const { return DbgLoc; }
++
+   const MCOperand &getOperand(unsigned i) const { return Operands[i]; }
+   MCOperand &getOperand(unsigned i) { return Operands[i]; }
+   unsigned getNumOperands() const { return Operands.size(); }
+diff --git a/include/llvm/MC/MCInstrInfo.h b/include/llvm/MC/MCInstrInfo.h
+index 80f1f32..e5056cb 100644
+--- a/include/llvm/MC/MCInstrInfo.h
++++ b/include/llvm/MC/MCInstrInfo.h
+@@ -26,6 +26,7 @@ class MCInstrInfo {
+   const unsigned *InstrNameIndices; // Array for name indices in InstrNameData
+   const char *InstrNameData;        // Instruction name string pool
+   unsigned NumOpcodes;              // Number of entries in the desc array
++  unsigned long HQEMUExitAddr;
+ 
+ public:
+   /// \brief Initialize MCInstrInfo, called by TableGen auto-generated routines.
+@@ -52,6 +53,9 @@ public:
+     assert(Opcode < NumOpcodes && "Invalid opcode!");
+     return StringRef(&InstrNameData[InstrNameIndices[Opcode]]);
+   }
++
++  void setHQEMUExitAddr(unsigned long Addr) { HQEMUExitAddr = Addr; }
++  unsigned long getHQEMUExitAddr() const { return HQEMUExitAddr; }
+ };
+ 
+ } // End llvm namespace
+diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp
+index 3c439e6..536e776 100644
+--- a/lib/CodeGen/BranchFolding.cpp
++++ b/lib/CodeGen/BranchFolding.cpp
+@@ -169,6 +169,12 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF,
+                                     MachineModuleInfo *mmi,
+                                     MachineLoopInfo *mli, bool AfterPlacement) {
+   if (!tii) return false;
++  if (MF.getFunction()->hasFnAttribute("hqemu")) {
++    switch (MF.getTarget().getTargetTriple().getArch()) {
++      case Triple::x86: case Triple::x86_64: break;
++      default: return false;
++    }
++  }
+ 
+   TriedMerging.clear();
+ 
+diff --git a/lib/ExecutionEngine/ExecutionEngine.cpp b/lib/ExecutionEngine/ExecutionEngine.cpp
+index 2ee72f9..701c21b 100644
+--- a/lib/ExecutionEngine/ExecutionEngine.cpp
++++ b/lib/ExecutionEngine/ExecutionEngine.cpp
+@@ -496,6 +496,13 @@ EngineBuilder &EngineBuilder::setMCJITMemoryManager(
+   return *this;
+ }
+ 
++EngineBuilder &EngineBuilder::setMCJITMemoryManager(
++                                   std::shared_ptr<RTDyldMemoryManager> mcjmm) {
++  MemMgr = mcjmm;    // Copies the shared_ptr; the caller keeps its reference.
++  Resolver = mcjmm;  // The same object is also installed as the resolver.
++  return *this;
++}
++
+ EngineBuilder&
+ EngineBuilder::setMemoryManager(std::unique_ptr<MCJITMemoryManager> MM) {
+   MemMgr = std::shared_ptr<MCJITMemoryManager>(std::move(MM));
+diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp
+index f82b9db..c42ac7f 100644
+--- a/lib/Target/AArch64/AArch64MCInstLower.cpp
++++ b/lib/Target/AArch64/AArch64MCInstLower.cpp
+@@ -219,6 +219,9 @@ bool AArch64MCInstLower::lowerOperand(const MachineOperand &MO,
+ void AArch64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+   OutMI.setOpcode(MI->getOpcode());
+ 
++  if (MI->getDebugLoc())
++    OutMI.setDebugLoc(&MI->getDebugLoc());
++
+   for (const MachineOperand &MO : MI->operands()) {
+     MCOperand MCOp;
+     if (lowerOperand(MO, MCOp))
+diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
+index 9f7dcb3..0e56bb6 100644
+--- a/lib/Target/AArch64/AArch64RegisterInfo.cpp
++++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
+@@ -130,6 +130,12 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+   if (hasBasePointer(MF))
+     markSuperRegs(Reserved, AArch64::W19);
+ 
++  // Reserve registers for HQEMU.
++  if (MF.getFunction()->hasFnAttribute("hqemu")) {
++    markSuperRegs(Reserved, AArch64::W19);
++    markSuperRegs(Reserved, AArch64::W28);
++  }
++
+   assert(checkAllSuperRegsMarked(Reserved));
+   return Reserved;
+ }
+diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+index 33698d2..9735e88 100644
+--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
++++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+@@ -29,6 +29,7 @@
+ #include "llvm/Support/EndianStream.h"
+ #include "llvm/Support/ErrorHandling.h"
+ #include "llvm/Support/raw_ostream.h"
++#include "llvm/IR/DebugLoc.h"
+ #include <cassert>
+ #include <cstdint>
+ 
+@@ -180,6 +181,10 @@ public:
+   unsigned fixOneOperandFPComparison(const MCInst &MI, unsigned EncodedValue,
+                                      const MCSubtargetInfo &STI) const;
+ 
++  bool EmitHQEMUInstruction(const MCInst &MI, raw_ostream &OS,
++                            SmallVectorImpl<MCFixup> &Fixups,
++                            const MCSubtargetInfo &STI) const;
++
+ private:
+   uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
+   void verifyInstructionPredicates(const MCInst &MI,
+@@ -552,9 +557,85 @@ unsigned AArch64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue,
+   return EncodedValue & ~(1u << 30);
+ }
+ 
++bool AArch64MCCodeEmitter::
++EmitHQEMUInstruction(const MCInst &MI, raw_ostream &OS,
++                     SmallVectorImpl<MCFixup> &Fixups,
++                     const MCSubtargetInfo &STI) const {
++  /* Emits replacement code for a BRK/RET whose DebugLoc line is a patch flag.
++   * NOTE: flags must be synchronized with llvm-opc.h in the HQEMU source tree. */
++  enum {
++    PATCH_HQEMU = 0x4182U,
++    PATCH_DUMMY,
++    PATCH_EXIT_TB,
++    PATCH_DIRECT_JUMP,
++    PATCH_TRACE_BLOCK_CHAINING,
++    PATCH_QMMU
++  };
++
++  unsigned Opcode = MI.getOpcode();
++  switch (Opcode) {
++  case AArch64::BRK:
++  case AArch64::RET:
++    break;
++  default: return false;  // Not a candidate: caller encodes MI normally.
++  }
++
++  const DebugLoc *Loc = MI.getDebugLoc();
++  if (!Loc)
++    return false;
++
++  unsigned PatchType = Loc->getLine();  // The line number carries the flag.
++  if (PatchType < PATCH_HQEMU || PatchType > PATCH_QMMU)
++    return false;
++
++  if (Opcode == AArch64::BRK) {
++    uint64_t Binary = 0;
++    MCOperand Operand = MCOperand::createImm(1);
++    MCInst Jump;
++
++    Jump.setOpcode(AArch64::B);  // BRK is replaced by 'b' with immediate 1.
++    Jump.addOperand(Operand);
++    Binary = getBinaryCodeForInstr(Jump, Fixups, STI);
++    support::endian::Writer<support::little>(OS).write<uint32_t>(Binary);
++    ++MCNumEmitted;
++    return true;
++  }
++  if (Opcode == AArch64::RET) {
++    uint64_t ExitAddr = MCII.getHQEMUExitAddr();
++    uint32_t Binary[3];
++    MCOperand Reg = MCOperand::createReg(AArch64::X1);
++    MCInst Jump;
++
++    // movz w1, #ExitAddr[15:0]  (Rd field = 1, matching the 'br x1' below)
++    Binary[0] = (0x2 << 29) | 0x1;
++    Binary[0] |= (0x25 << 23);
++    Binary[0] |= ((ExitAddr & 0xFFFF) << 5);
++
++    // movk w1, #ExitAddr[31:16], lsl #16  (bits above 31 are not encoded)
++    Binary[1] =  (0x3 << 29) | 0x1;
++    Binary[1] |= (0x25 << 23);
++    Binary[1] |= (0x1 << 21);
++    Binary[1] |= ((ExitAddr & 0xFFFF0000) >> 11);
++
++    Jump.setOpcode(AArch64::BR);
++    Jump.addOperand(Reg);
++    Binary[2] = getBinaryCodeForInstr(Jump, Fixups, STI);
++
++    for (int i = 0; i < 3; ++i) {
++      support::endian::Writer<support::little>(OS).write<uint32_t>(Binary[i]);
++      ++MCNumEmitted;
++    }
++    return true;
++  }
++  return false;
++}
++
+ void AArch64MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                                              SmallVectorImpl<MCFixup> &Fixups,
+                                              const MCSubtargetInfo &STI) const {
++  if (EmitHQEMUInstruction(MI, OS, Fixups, STI))
++    return;
++
+   verifyInstructionPredicates(MI,
+                               computeAvailableFeatures(STI.getFeatureBits()));
+ 
+diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
+index 92c8c22..befec89 100644
+--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
++++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
+@@ -30,6 +30,7 @@
+ #include "llvm/Support/ErrorHandling.h"
+ #include "llvm/Support/MathExtras.h"
+ #include "llvm/Support/raw_ostream.h"
++#include "llvm/IR/DebugLoc.h"
+ #include <cassert>
+ #include <cstdint>
+ 
+@@ -109,9 +110,16 @@ public:
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const;
+ 
++  bool EmitHQEMUInstruction(const MCInst &MI, raw_ostream &OS,
++                            SmallVectorImpl<MCFixup> &Fixups,
++                            const MCSubtargetInfo &STI) const;
++
+   void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                          SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &STI) const override {
++    if (EmitHQEMUInstruction(MI, OS, Fixups, STI))
++      return;
++
+     verifyInstructionPredicates(MI,
+                                 computeAvailableFeatures(STI.getFeatureBits()));
+ 
+@@ -386,5 +394,75 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+   return MO.getImm();
+ }
+ 
++bool PPCMCCodeEmitter::
++EmitHQEMUInstruction(const MCInst &MI, raw_ostream &OS,
++                     SmallVectorImpl<MCFixup> &Fixups,
++                     const MCSubtargetInfo &STI) const {
++  /* Emits replacement code for TRAP/BLR/BLR8 whose DebugLoc line is a patch
++   * flag. NOTE: flags must be synchronized with llvm-opc.h in the HQEMU tree. */
++  enum {
++    PATCH_HQEMU = 0x4182U,
++    PATCH_DUMMY,
++    PATCH_EXIT_TB,
++    PATCH_DIRECT_JUMP,
++    PATCH_TRACE_BLOCK_CHAINING,
++    PATCH_QMMU
++  };
++
++  unsigned Opcode = MI.getOpcode();
++  switch (Opcode) {
++  case PPC::TRAP:
++  case PPC::BLR:
++  case PPC::BLR8:
++    break;
++  default: return false;  // Not a candidate: caller encodes MI normally.
++  }
++
++  const DebugLoc *Loc = MI.getDebugLoc();
++  if (!Loc)
++    return false;
++
++  unsigned PatchType = Loc->getLine();  // The line number carries the flag.
++  if (PatchType < PATCH_HQEMU || PatchType > PATCH_QMMU)
++    return false;
++
++  if (Opcode == PPC::TRAP) {
++    uint64_t Bits = 0;
++    MCInst NopInst;
++    NopInst.setOpcode(PPC::NOP);
++    Bits = getBinaryCodeForInstr(NopInst, Fixups, STI);
++    for (unsigned i = 0; i != 5; ++i) {  // TRAP becomes five NOP words.
++      if (IsLittleEndian) {
++        support::endian::Writer<support::little>(OS).write<uint32_t>(Bits);
++      } else {
++        support::endian::Writer<support::big>(OS).write<uint32_t>(Bits);
++      }
++    }
++    MCNumEmitted += 5;
++    return true;
++  }
++  if (Opcode == PPC::BLR || Opcode == PPC::BLR8) {
++    uint64_t Bits[2];
++    MCInst Inst[2];
++    Inst[0].setOpcode(PPC::MTCTR);  // The return becomes: mtctr r31; bctr.
++    Inst[0].addOperand(MCOperand::createReg(PPC::R31));
++    Inst[1].setOpcode(PPC::BCTR);  // r31 presumably holds the target - confirm.
++    Bits[0] = getBinaryCodeForInstr(Inst[0], Fixups, STI);
++    Bits[1] = getBinaryCodeForInstr(Inst[1], Fixups, STI);
++
++    if (IsLittleEndian) {
++      support::endian::Writer<support::little>(OS).write<uint32_t>(Bits[0]);
++      support::endian::Writer<support::little>(OS).write<uint32_t>(Bits[1]);
++    } else {
++      support::endian::Writer<support::big>(OS).write<uint32_t>(Bits[0]);
++      support::endian::Writer<support::big>(OS).write<uint32_t>(Bits[1]);
++    }
++
++    MCNumEmitted += 2;
++    return true;
++  }
++  return false;
++}
++
+ #define ENABLE_INSTR_PREDICATE_VERIFIER
+ #include "PPCGenMCCodeEmitter.inc"
+diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
+index b3a3c73..05c8cac 100644
+--- a/lib/Target/PowerPC/PPCISelLowering.cpp
++++ b/lib/Target/PowerPC/PPCISelLowering.cpp
+@@ -2422,10 +2422,11 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
+   EVT PtrVT = Op.getValueType();
+   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+   const Constant *C = CP->getConstVal();
++  bool isHQEMU = DAG.getMachineFunction().getFunction()->hasFnAttribute("hqemu");
+ 
+   // 64-bit SVR4 ABI code is always position-independent.
+   // The actual address of the GlobalValue is stored in the TOC.
+-  if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
++  if (Subtarget.isSVR4ABI() && Subtarget.isPPC64() && !isHQEMU) {
+     setUsesTOCBasePtr(DAG);
+     SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0);
+     return getTOCEntry(DAG, SDLoc(CP), true, GA);
+@@ -2435,7 +2436,7 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
+   bool IsPIC = isPositionIndependent();
+   getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);
+ 
+-  if (IsPIC && Subtarget.isSVR4ABI()) {
++  if (IsPIC && Subtarget.isSVR4ABI() && !isHQEMU) {
+     SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(),
+                                            PPCII::MO_PIC_FLAG);
+     return getTOCEntry(DAG, SDLoc(CP), false, GA);
+@@ -2500,10 +2501,11 @@ PPCTargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
+ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
+   EVT PtrVT = Op.getValueType();
+   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
++  bool isHQEMU = DAG.getMachineFunction().getFunction()->hasFnAttribute("hqemu");
+ 
+   // 64-bit SVR4 ABI code is always position-independent.
+   // The actual address of the GlobalValue is stored in the TOC.
+-  if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
++  if (Subtarget.isSVR4ABI() && Subtarget.isPPC64() && !isHQEMU) {
+     setUsesTOCBasePtr(DAG);
+     SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
+     return getTOCEntry(DAG, SDLoc(JT), true, GA);
+@@ -2513,7 +2515,7 @@ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
+   bool IsPIC = isPositionIndependent();
+   getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);
+ 
+-  if (IsPIC && Subtarget.isSVR4ABI()) {
++  if (IsPIC && Subtarget.isSVR4ABI() && !isHQEMU) {
+     SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
+                                         PPCII::MO_PIC_FLAG);
+     return getTOCEntry(DAG, SDLoc(GA), false, GA);
+@@ -2529,10 +2531,11 @@ SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
+   EVT PtrVT = Op.getValueType();
+   BlockAddressSDNode *BASDN = cast<BlockAddressSDNode>(Op);
+   const BlockAddress *BA = BASDN->getBlockAddress();
++  bool isHQEMU = DAG.getMachineFunction().getFunction()->hasFnAttribute("hqemu");
+ 
+   // 64-bit SVR4 ABI code is always position-independent.
+   // The actual BlockAddress is stored in the TOC.
+-  if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
++  if (Subtarget.isSVR4ABI() && Subtarget.isPPC64() && !isHQEMU) {
+     setUsesTOCBasePtr(DAG);
+     SDValue GA = DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset());
+     return getTOCEntry(DAG, SDLoc(BASDN), true, GA);
+@@ -2642,10 +2645,11 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
+   GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
+   SDLoc DL(GSDN);
+   const GlobalValue *GV = GSDN->getGlobal();
++  bool isHQEMU = DAG.getMachineFunction().getFunction()->hasFnAttribute("hqemu");
+ 
+   // 64-bit SVR4 ABI code is always position-independent.
+   // The actual address of the GlobalValue is stored in the TOC.
+-  if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
++  if (Subtarget.isSVR4ABI() && Subtarget.isPPC64() && !isHQEMU) {
+     setUsesTOCBasePtr(DAG);
+     SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset());
+     return getTOCEntry(DAG, DL, true, GA);
+@@ -2655,7 +2659,7 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
+   bool IsPIC = isPositionIndependent();
+   getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag, GV);
+ 
+-  if (IsPIC && Subtarget.isSVR4ABI()) {
++  if (IsPIC && Subtarget.isSVR4ABI() && !isHQEMU) {
+     SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT,
+                                             GSDN->getOffset(),
+                                             PPCII::MO_PIC_FLAG);
+diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp
+index b310493..afc6c81 100644
+--- a/lib/Target/PowerPC/PPCMCInstLower.cpp
++++ b/lib/Target/PowerPC/PPCMCInstLower.cpp
+@@ -141,7 +141,10 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
+ void llvm::LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
+                                         AsmPrinter &AP, bool isDarwin) {
+   OutMI.setOpcode(MI->getOpcode());
+-  
++
++  if (MI->getDebugLoc())
++    OutMI.setDebugLoc(&MI->getDebugLoc());
++
+   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+     const MachineOperand &MO = MI->getOperand(i);
+     
+diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
+index 9207165..286c2cb 100644
+--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
++++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
+@@ -269,6 +269,13 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+          IE = PPC::VRRCRegClass.end(); I != IE; ++I)
+       markSuperRegs(Reserved, *I);
+ 
++  // Reserve registers for HQEMU.
++  if (MF.getFunction()->hasFnAttribute("hqemu")) {
++    markSuperRegs(Reserved, PPC::R27);
++    if (TM.isPPC64())
++      markSuperRegs(Reserved, PPC::R31);
++  }
++
+   assert(checkAllSuperRegsMarked(Reserved));
+   return Reserved;
+ }
+@@ -882,6 +889,11 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+   if (!MF.getFunction()->hasFnAttribute(Attribute::Naked)) {
+     if (!(hasBasePointer(MF) && FrameIndex < 0))
+       Offset += MFI.getStackSize();
++  } else {
++    if (MF.getFunction()->hasFnAttribute("hqemu") && FrameIndex >= 0) {
++      const PPCFrameLowering *TFI = getFrameLowering(MF);
++      Offset += TFI->determineFrameLayout(MF, false, false);
++    }
+   }
+ 
+   // If we can, encode the offset directly into the instruction.  If this is a
+diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+index 10e2bbc..e6e6a66 100644
+--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
++++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+@@ -27,6 +27,7 @@
+ #include "llvm/MC/MCSymbol.h"
+ #include "llvm/Support/ErrorHandling.h"
+ #include "llvm/Support/raw_ostream.h"
++#include "llvm/IR/DebugLoc.h"
+ #include <cassert>
+ #include <cstdint>
+ #include <cstdlib>
+@@ -150,6 +151,9 @@ public:
+                         const MCInst &MI, const MCInstrDesc &Desc,
+                         const MCSubtargetInfo &STI, raw_ostream &OS) const;
+ 
++  bool EmitHQEMUInstruction(const MCInst &MI, raw_ostream &OS,
++                            SmallVectorImpl<MCFixup> &Fixups) const;
++
+   uint8_t DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
+                              int MemOperand, const MCInstrDesc &Desc) const;
+ };
+@@ -1152,6 +1156,52 @@ bool X86MCCodeEmitter::emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
+   return Ret;
+ }
+ 
++bool X86MCCodeEmitter::
++EmitHQEMUInstruction(const MCInst &MI, raw_ostream &OS,
++                     SmallVectorImpl<MCFixup> &Fixups) const {
++  /* Emits replacement code for a TRAP/RETQ whose DebugLoc line is a patch flag.
++   * NOTE: flags must be synchronized with llvm-opc.h in the HQEMU source tree. */
++  enum {
++    PATCH_HQEMU = 0x4182U,
++    PATCH_DUMMY,
++    PATCH_EXIT_TB,
++    PATCH_DIRECT_JUMP,
++    PATCH_TRACE_BLOCK_CHAINING,
++    PATCH_QMMU
++  };
++
++  unsigned Opcode = MI.getOpcode();
++  switch (Opcode) {
++  case X86::TRAP:
++  case X86::RETQ:  // NOTE(review): RETL is not matched - confirm 32-bit hosts.
++    break;
++  default: return false;  // Not a candidate: caller encodes MI normally.
++  }
++
++  unsigned CurByte = 0;
++  const DebugLoc *Loc = MI.getDebugLoc();
++  if (!Loc)
++    return false;
++
++  unsigned PatchType = Loc->getLine();  // The line number carries the flag.
++  if (PatchType < PATCH_HQEMU || PatchType > PATCH_QMMU)
++    return false;
++
++  if (Opcode == X86::TRAP) {
++    for (unsigned i = 0; i != 8; ++i)  // TRAP becomes eight 1-byte NOPs.
++      EmitByte(0x90, CurByte, OS);
++    return true;
++  }
++  if (Opcode == X86::RETQ) {
++    uintptr_t ExitAddr = MCII.getHQEMUExitAddr();
++    EmitByte(0xE9, CurByte, OS);  // Opcode byte of JMP rel32.
++    EmitImmediate(MCOperand::createImm(ExitAddr), MI.getLoc(), 4, FK_PCRel_4,
++                  CurByte, OS, Fixups);  // 4-byte PC-relative operand.
++    return true;
++  }
++  return false;
++}
++
+ void X86MCCodeEmitter::
+ encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                   SmallVectorImpl<MCFixup> &Fixups,
+@@ -1160,6 +1210,9 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
+   const MCInstrDesc &Desc = MCII.get(Opcode);
+   uint64_t TSFlags = Desc.TSFlags;
+ 
++  if (EmitHQEMUInstruction(MI, OS, Fixups))
++    return;
++
+   // Pseudo instructions don't get encoded.
+   if ((TSFlags & X86II::FormMask) == X86II::Pseudo)
+     return;
+diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
+index f294e81..10a22ae 100644
+--- a/lib/Target/X86/X86FrameLowering.cpp
++++ b/lib/Target/X86/X86FrameLowering.cpp
+@@ -83,6 +83,10 @@ X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const {
+ /// or if frame pointer elimination is disabled.
+ bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
+   const MachineFrameInfo &MFI = MF.getFrameInfo();
++  // HQEMU does not use FramePtr for stack accesses, so return false when
++  // running in HQEMU mode.
++  if (MF.getFunction()->hasFnAttribute("hqemu"))
++    return false;
+   return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
+           TRI->needsStackRealignment(MF) ||
+           MFI.hasVarSizedObjects() ||
+diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
+index fd2837b..51d6e5b 100644
+--- a/lib/Target/X86/X86MCInstLower.cpp
++++ b/lib/Target/X86/X86MCInstLower.cpp
+@@ -389,6 +389,9 @@ X86MCInstLower::LowerMachineOperand(const MachineInstr *MI,
+ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+   OutMI.setOpcode(MI->getOpcode());
+ 
++  if (MI->getDebugLoc())
++    OutMI.setDebugLoc(&MI->getDebugLoc());
++
+   for (const MachineOperand &MO : MI->operands())
+     if (auto MaybeMCOp = LowerMachineOperand(MI, MO))
+       OutMI.addOperand(MaybeMCOp.getValue());
+diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
+index 343da25..72550a0 100644
+--- a/lib/Target/X86/X86RegisterInfo.cpp
++++ b/lib/Target/X86/X86RegisterInfo.cpp
+@@ -573,6 +573,20 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+     }
+   }
+ 
++  // Reserve registers for HQEMU.
++  if (MF.getFunction()->hasFnAttribute("hqemu")) {
++    if (!Is64Bit) {
++      Reserved.set(X86::EBP);
++      Reserved.set(X86::BP);
++      Reserved.set(X86::BPL);
++    } else {
++      Reserved.set(X86::R14);
++      Reserved.set(X86::R14B);
++      Reserved.set(X86::R14D);
++      Reserved.set(X86::R14W);
++    }
++  }
++
+   assert(checkAllSuperRegsMarked(Reserved,
+                                  {X86::SIL, X86::DIL, X86::BPL, X86::SPL}));
+   return Reserved;
+diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
+index 7461061..cb2e665 100644
+--- a/lib/Transforms/Utils/Local.cpp
++++ b/lib/Transforms/Utils/Local.cpp
+@@ -1531,7 +1531,8 @@ static bool markAliveBlocks(Function &F,
+           Changed = true;
+           break;
+         }
+-        if (CI->doesNotReturn()) {
++        // Do not delete instructions after llvm.trap in HQEMU mode.
++        if (!F.hasFnAttribute("hqemu") && CI->doesNotReturn()) {
+           // If we found a call to a no-return function, insert an unreachable
+           // instruction after it.  Make sure there isn't *already* one there
+           // though.
+diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
+index 8784b97..c4cf1cc 100644
+--- a/lib/Transforms/Utils/SimplifyCFG.cpp
++++ b/lib/Transforms/Utils/SimplifyCFG.cpp
+@@ -1250,6 +1250,10 @@ static bool HoistThenElseCodeToIf(BranchInst *BI,
+ 
+   bool Changed = false;
+   do {
++    // Do not hoist llvm.trap and debug instructions in HQEMU mode.
++    if (BIParent->getParent()->hasFnAttribute("hqemu"))
++      if (isa<IntrinsicInst>(I1) || I1->hasMetadata())
++        return Changed;
+     // If we are hoisting the terminator instruction, don't move one (making a
+     // broken BB), instead clone it, and remove BI.
+     if (isa<TerminatorInst>(I1))
+@@ -5542,6 +5546,10 @@ bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) {
+   BasicBlock *BB = IBI->getParent();
+   bool Changed = false;
+ 
++  // Do not delete indirectbrs with no successors in HQEMU mode.
++  if (BB->getParent()->hasFnAttribute("hqemu"))
++    return false;
++
+   // Eliminate redundant destinations.
+   SmallPtrSet<Value *, 8> Succs;
+   for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) {
diff --git a/patch/llvm/llvm-6.0.patch b/patch/llvm/llvm-6.0.patch
new file mode 100644
index 0000000..12fde6d
--- /dev/null
+++ b/patch/llvm/llvm-6.0.patch
@@ -0,0 +1,652 @@
+diff --git a/include/llvm/ExecutionEngine/ExecutionEngine.h b/include/llvm/ExecutionEngine/ExecutionEngine.h
+index 77c23b4..85fa6d4 100644
+--- a/include/llvm/ExecutionEngine/ExecutionEngine.h
++++ b/include/llvm/ExecutionEngine/ExecutionEngine.h
+@@ -567,6 +567,7 @@ public:
+   /// is called and is successful, the created engine takes ownership of the
+   /// memory manager. This option defaults to NULL.
+   EngineBuilder &setMCJITMemoryManager(std::unique_ptr<RTDyldMemoryManager> mcjmm);
++  EngineBuilder &setMCJITMemoryManager(std::shared_ptr<RTDyldMemoryManager> mcjmm);
+ 
+   EngineBuilder&
+   setMemoryManager(std::unique_ptr<MCJITMemoryManager> MM);
+diff --git a/include/llvm/MC/MCInst.h b/include/llvm/MC/MCInst.h
+index db28fd0..574b66e 100644
+--- a/include/llvm/MC/MCInst.h
++++ b/include/llvm/MC/MCInst.h
+@@ -29,6 +29,7 @@ class MCExpr;
+ class MCInst;
+ class MCInstPrinter;
+ class raw_ostream;
++class DebugLoc;
+ 
+ /// \brief Instances of this class represent operands of the MCInst class.
+ /// This is a simple discriminated union.
+@@ -160,6 +161,7 @@ class MCInst {
+   unsigned Opcode = 0;
+   SMLoc Loc;
+   SmallVector<MCOperand, 8> Operands;
++  const DebugLoc *DbgLoc = nullptr;
+   // These flags could be used to pass some info from one target subcomponent
+   // to another, for example, from disassembler to asm printer. The values of
+   // the flags have any sense on target level only (e.g. prefixes on x86).
+@@ -177,6 +179,9 @@ public:
+   void setLoc(SMLoc loc) { Loc = loc; }
+   SMLoc getLoc() const { return Loc; }
+ 
++  void setDebugLoc(const DebugLoc *Loc) { DbgLoc = Loc; }
++  const DebugLoc *getDebugLoc() const { return DbgLoc; }
++
+   const MCOperand &getOperand(unsigned i) const { return Operands[i]; }
+   MCOperand &getOperand(unsigned i) { return Operands[i]; }
+   unsigned getNumOperands() const { return Operands.size(); }
+diff --git a/include/llvm/MC/MCInstrInfo.h b/include/llvm/MC/MCInstrInfo.h
+index 80f1f32..e5056cb 100644
+--- a/include/llvm/MC/MCInstrInfo.h
++++ b/include/llvm/MC/MCInstrInfo.h
+@@ -26,6 +26,7 @@ class MCInstrInfo {
+   const unsigned *InstrNameIndices; // Array for name indices in InstrNameData
+   const char *InstrNameData;        // Instruction name string pool
+   unsigned NumOpcodes;              // Number of entries in the desc array
++  unsigned long HQEMUExitAddr;
+ 
+ public:
+   /// \brief Initialize MCInstrInfo, called by TableGen auto-generated routines.
+@@ -52,6 +53,9 @@ public:
+     assert(Opcode < NumOpcodes && "Invalid opcode!");
+     return StringRef(&InstrNameData[InstrNameIndices[Opcode]]);
+   }
++
++  void setHQEMUExitAddr(unsigned long Addr) { HQEMUExitAddr = Addr; }
++  unsigned long getHQEMUExitAddr() const { return HQEMUExitAddr; }
+ };
+ 
+ } // End llvm namespace
+diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp
+index 7f358a6..5ef804f 100644
+--- a/lib/CodeGen/BranchFolding.cpp
++++ b/lib/CodeGen/BranchFolding.cpp
+@@ -175,6 +175,12 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF,
+                                     MachineModuleInfo *mmi,
+                                     MachineLoopInfo *mli, bool AfterPlacement) {
+   if (!tii) return false;
++  if (MF.getFunction().hasFnAttribute("hqemu")) {
++    switch (MF.getTarget().getTargetTriple().getArch()) {
++      case Triple::x86: case Triple::x86_64: break;
++      default: return false;
++    }
++  }
+ 
+   TriedMerging.clear();
+ 
+diff --git a/lib/ExecutionEngine/ExecutionEngine.cpp b/lib/ExecutionEngine/ExecutionEngine.cpp
+index c598857..c7ecd12 100644
+--- a/lib/ExecutionEngine/ExecutionEngine.cpp
++++ b/lib/ExecutionEngine/ExecutionEngine.cpp
+@@ -496,6 +496,13 @@ EngineBuilder &EngineBuilder::setMCJITMemoryManager(
+   return *this;
+ }
+ 
++EngineBuilder &EngineBuilder::setMCJITMemoryManager(
++                                   std::shared_ptr<RTDyldMemoryManager> mcjmm) {
++  MemMgr = mcjmm;    // shared_ptr overload: the pointer is copied, not moved from
++  Resolver = mcjmm;  // the same object is installed as both MemMgr and Resolver
++  return *this;
++}
++
+ EngineBuilder&
+ EngineBuilder::setMemoryManager(std::unique_ptr<MCJITMemoryManager> MM) {
+   MemMgr = std::shared_ptr<MCJITMemoryManager>(std::move(MM));
+diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp
+index 65dae03..09e5858 100644
+--- a/lib/Target/AArch64/AArch64MCInstLower.cpp
++++ b/lib/Target/AArch64/AArch64MCInstLower.cpp
+@@ -239,6 +239,9 @@ bool AArch64MCInstLower::lowerOperand(const MachineOperand &MO,
+ void AArch64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+   OutMI.setOpcode(MI->getOpcode());
+ 
++  if (MI->getDebugLoc())
++    OutMI.setDebugLoc(&MI->getDebugLoc());
++
+   for (const MachineOperand &MO : MI->operands()) {
+     MCOperand MCOp;
+     if (lowerOperand(MO, MCOp))
+diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
+index 88dd297..4b2ccd8 100644
+--- a/lib/Target/AArch64/AArch64RegisterInfo.cpp
++++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
+@@ -132,6 +132,12 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+   if (hasBasePointer(MF))
+     markSuperRegs(Reserved, AArch64::W19);
+ 
++  // Reserve registers for HQEMU.
++  if (MF.getFunction().hasFnAttribute("hqemu")) {
++    markSuperRegs(Reserved, AArch64::W19);
++    markSuperRegs(Reserved, AArch64::W28);
++  }
++
+   assert(checkAllSuperRegsMarked(Reserved));
+   return Reserved;
+ }
+diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+index 33698d2..9735e88 100644
+--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
++++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+@@ -29,6 +29,7 @@
+ #include "llvm/Support/EndianStream.h"
+ #include "llvm/Support/ErrorHandling.h"
+ #include "llvm/Support/raw_ostream.h"
++#include "llvm/IR/DebugLoc.h"
+ #include <cassert>
+ #include <cstdint>
+ 
+@@ -180,6 +181,10 @@ public:
+   unsigned fixOneOperandFPComparison(const MCInst &MI, unsigned EncodedValue,
+                                      const MCSubtargetInfo &STI) const;
+ 
++  bool EmitHQEMUInstruction(const MCInst &MI, raw_ostream &OS,
++                            SmallVectorImpl<MCFixup> &Fixups,
++                            const MCSubtargetInfo &STI) const;
++
+ private:
+   uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
+   void verifyInstructionPredicates(const MCInst &MI,
+@@ -552,9 +557,85 @@ unsigned AArch64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue,
+   return EncodedValue & ~(1u << 30);
+ }
+ 
++bool AArch64MCCodeEmitter::
++EmitHQEMUInstruction(const MCInst &MI, raw_ostream &OS,
++                     SmallVectorImpl<MCFixup> &Fixups,
++                     const MCSubtargetInfo &STI) const {
++  /* Rewrites HQEMU-tagged BRK/RET; returns false to request normal encoding.
++   * NOTE: these flags must stay synchronized with llvm-opc.h in HQEMU. */
++  enum {
++    PATCH_HQEMU = 0x4182U,
++    PATCH_DUMMY,
++    PATCH_EXIT_TB,
++    PATCH_DIRECT_JUMP,
++    PATCH_TRACE_BLOCK_CHAINING,
++    PATCH_QMMU
++  };
++
++  unsigned Opcode = MI.getOpcode();
++  switch (Opcode) {
++  case AArch64::BRK:
++  case AArch64::RET:
++    break;
++  default: return false;
++  }
++
++  const DebugLoc *Loc = MI.getDebugLoc();
++  if (!Loc)
++    return false;
++  // The patch type is carried in the line number of the attached DebugLoc.
++  unsigned PatchType = Loc->getLine();
++  if (PatchType < PATCH_HQEMU || PatchType > PATCH_QMMU)
++    return false;
++
++  if (Opcode == AArch64::BRK) {
++    uint64_t Binary = 0;
++    MCOperand Operand = MCOperand::createImm(1);
++    MCInst Jump;
++    // Emit one "b" built with immediate operand 1 in place of the BRK.
++    Jump.setOpcode(AArch64::B);
++    Jump.addOperand(Operand);
++    Binary = getBinaryCodeForInstr(Jump, Fixups, STI);
++    support::endian::Writer<support::little>(OS).write<uint32_t>(Binary);
++    ++MCNumEmitted;
++    return true;
++  }
++  if (Opcode == AArch64::RET) {
++    uint64_t ExitAddr = MCII.getHQEMUExitAddr();
++    uint32_t Binary[3];
++    MCOperand Reg = MCOperand::createReg(AArch64::X1);
++    MCInst Jump;
++    // NOTE(review): only the low 32 bits of ExitAddr are materialized below.
++    // movz w1, #ExitAddr[15:0]  (Rd field is 1, i.e. the register used by BR)
++    Binary[0] = (0x2 << 29) | 0x1;
++    Binary[0] |= (0x25 << 23);
++    Binary[0] |= ((ExitAddr & 0xFFFF) << 5);
++
++    // movk w1, #ExitAddr[31:16], lsl #16
++    Binary[1] =  (0x3 << 29) | 0x1;
++    Binary[1] |= (0x25 << 23);
++    Binary[1] |= (0x1 << 21);
++    Binary[1] |= ((ExitAddr & 0xFFFF0000) >> 11);
++    // br x1
++    Jump.setOpcode(AArch64::BR);
++    Jump.addOperand(Reg);
++    Binary[2] = getBinaryCodeForInstr(Jump, Fixups, STI);
++
++    for (int i = 0; i < 3; ++i) {
++      support::endian::Writer<support::little>(OS).write<uint32_t>(Binary[i]);
++      ++MCNumEmitted;
++    }
++    return true;
++  }
++  return false;
++}
++
+ void AArch64MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                                              SmallVectorImpl<MCFixup> &Fixups,
+                                              const MCSubtargetInfo &STI) const {
++  if (EmitHQEMUInstruction(MI, OS, Fixups, STI))
++    return;
++
+   verifyInstructionPredicates(MI,
+                               computeAvailableFeatures(STI.getFeatureBits()));
+ 
+diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
+index 92c8c22..befec89 100644
+--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
++++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
+@@ -30,6 +30,7 @@
+ #include "llvm/Support/ErrorHandling.h"
+ #include "llvm/Support/MathExtras.h"
+ #include "llvm/Support/raw_ostream.h"
++#include "llvm/IR/DebugLoc.h"
+ #include <cassert>
+ #include <cstdint>
+ 
+@@ -109,9 +110,16 @@ public:
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const;
+ 
++  bool EmitHQEMUInstruction(const MCInst &MI, raw_ostream &OS,
++                            SmallVectorImpl<MCFixup> &Fixups,
++                            const MCSubtargetInfo &STI) const;
++
+   void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                          SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &STI) const override {
++    if (EmitHQEMUInstruction(MI, OS, Fixups, STI))
++      return;
++
+     verifyInstructionPredicates(MI,
+                                 computeAvailableFeatures(STI.getFeatureBits()));
+ 
+@@ -386,5 +394,75 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+   return MO.getImm();
+ }
+ 
++bool PPCMCCodeEmitter::
++EmitHQEMUInstruction(const MCInst &MI, raw_ostream &OS,
++                     SmallVectorImpl<MCFixup> &Fixups,
++                     const MCSubtargetInfo &STI) const {
++  /* Rewrites HQEMU-tagged TRAP/BLR/BLR8; returns false for normal encoding.
++   * NOTE: these flags must stay synchronized with llvm-opc.h in HQEMU. */
++  enum {
++    PATCH_HQEMU = 0x4182U,
++    PATCH_DUMMY,
++    PATCH_EXIT_TB,
++    PATCH_DIRECT_JUMP,
++    PATCH_TRACE_BLOCK_CHAINING,
++    PATCH_QMMU
++  };
++
++  unsigned Opcode = MI.getOpcode();
++  switch (Opcode) {
++  case PPC::TRAP:
++  case PPC::BLR:
++  case PPC::BLR8:
++    break;
++  default: return false;
++  }
++
++  const DebugLoc *Loc = MI.getDebugLoc();
++  if (!Loc)
++    return false;
++  // The patch type is carried in the line number of the attached DebugLoc.
++  unsigned PatchType = Loc->getLine();
++  if (PatchType < PATCH_HQEMU || PatchType > PATCH_QMMU)
++    return false;
++  // Tagged TRAP: emit five NOP words in the target's byte order instead.
++  if (Opcode == PPC::TRAP) {
++    uint64_t Bits = 0;
++    MCInst NopInst;
++    NopInst.setOpcode(PPC::NOP);
++    Bits = getBinaryCodeForInstr(NopInst, Fixups, STI);
++    for (unsigned i = 0; i != 5; ++i) {
++      if (IsLittleEndian) {
++        support::endian::Writer<support::little>(OS).write<uint32_t>(Bits);
++      } else {
++        support::endian::Writer<support::big>(OS).write<uint32_t>(Bits);
++      }
++    }
++    MCNumEmitted += 5;
++    return true;
++  }
++  if (Opcode == PPC::BLR || Opcode == PPC::BLR8) {
++    uint64_t Bits[2];
++    MCInst Inst[2];
++    Inst[0].setOpcode(PPC::MTCTR);  // tagged return becomes: mtctr r31
++    Inst[0].addOperand(MCOperand::createReg(PPC::R31));
++    Inst[1].setOpcode(PPC::BCTR);   // ... followed by: bctr
++    Bits[0] = getBinaryCodeForInstr(Inst[0], Fixups, STI);
++    Bits[1] = getBinaryCodeForInstr(Inst[1], Fixups, STI);
++
++    if (IsLittleEndian) {
++      support::endian::Writer<support::little>(OS).write<uint32_t>(Bits[0]);
++      support::endian::Writer<support::little>(OS).write<uint32_t>(Bits[1]);
++    } else {
++      support::endian::Writer<support::big>(OS).write<uint32_t>(Bits[0]);
++      support::endian::Writer<support::big>(OS).write<uint32_t>(Bits[1]);
++    }
++
++    MCNumEmitted += 2;
++    return true;
++  }
++  return false;
++}
++
+ #define ENABLE_INSTR_PREDICATE_VERIFIER
+ #include "PPCGenMCCodeEmitter.inc"
+diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
+index f0e8b11..a96a36d 100644
+--- a/lib/Target/PowerPC/PPCISelLowering.cpp
++++ b/lib/Target/PowerPC/PPCISelLowering.cpp
+@@ -2442,10 +2442,11 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
+   EVT PtrVT = Op.getValueType();
+   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+   const Constant *C = CP->getConstVal();
++  bool isHQEMU = DAG.getMachineFunction().getFunction().hasFnAttribute("hqemu");
+ 
+   // 64-bit SVR4 ABI code is always position-independent.
+   // The actual address of the GlobalValue is stored in the TOC.
+-  if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
++  if (Subtarget.isSVR4ABI() && Subtarget.isPPC64() && !isHQEMU) {
+     setUsesTOCBasePtr(DAG);
+     SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0);
+     return getTOCEntry(DAG, SDLoc(CP), true, GA);
+@@ -2455,7 +2456,7 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
+   bool IsPIC = isPositionIndependent();
+   getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);
+ 
+-  if (IsPIC && Subtarget.isSVR4ABI()) {
++  if (IsPIC && Subtarget.isSVR4ABI() && !isHQEMU) {
+     SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(),
+                                            PPCII::MO_PIC_FLAG);
+     return getTOCEntry(DAG, SDLoc(CP), false, GA);
+@@ -2518,10 +2519,11 @@ PPCTargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
+ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
+   EVT PtrVT = Op.getValueType();
+   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
++  bool isHQEMU = DAG.getMachineFunction().getFunction().hasFnAttribute("hqemu");
+ 
+   // 64-bit SVR4 ABI code is always position-independent.
+   // The actual address of the GlobalValue is stored in the TOC.
+-  if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
++  if (Subtarget.isSVR4ABI() && Subtarget.isPPC64() && !isHQEMU) {
+     setUsesTOCBasePtr(DAG);
+     SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
+     return getTOCEntry(DAG, SDLoc(JT), true, GA);
+@@ -2531,7 +2533,7 @@ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
+   bool IsPIC = isPositionIndependent();
+   getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);
+ 
+-  if (IsPIC && Subtarget.isSVR4ABI()) {
++  if (IsPIC && Subtarget.isSVR4ABI() && !isHQEMU) {
+     SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
+                                         PPCII::MO_PIC_FLAG);
+     return getTOCEntry(DAG, SDLoc(GA), false, GA);
+@@ -2547,10 +2549,11 @@ SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
+   EVT PtrVT = Op.getValueType();
+   BlockAddressSDNode *BASDN = cast<BlockAddressSDNode>(Op);
+   const BlockAddress *BA = BASDN->getBlockAddress();
++  bool isHQEMU = DAG.getMachineFunction().getFunction().hasFnAttribute("hqemu");
+ 
+   // 64-bit SVR4 ABI code is always position-independent.
+   // The actual BlockAddress is stored in the TOC.
+-  if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
++  if (Subtarget.isSVR4ABI() && Subtarget.isPPC64() && !isHQEMU) {
+     setUsesTOCBasePtr(DAG);
+     SDValue GA = DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset());
+     return getTOCEntry(DAG, SDLoc(BASDN), true, GA);
+@@ -2660,10 +2663,11 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
+   GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
+   SDLoc DL(GSDN);
+   const GlobalValue *GV = GSDN->getGlobal();
++  bool isHQEMU = DAG.getMachineFunction().getFunction().hasFnAttribute("hqemu");
+ 
+   // 64-bit SVR4 ABI code is always position-independent.
+   // The actual address of the GlobalValue is stored in the TOC.
+-  if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
++  if (Subtarget.isSVR4ABI() && Subtarget.isPPC64() && !isHQEMU) {
+     setUsesTOCBasePtr(DAG);
+     SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset());
+     return getTOCEntry(DAG, DL, true, GA);
+@@ -2673,7 +2677,7 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
+   bool IsPIC = isPositionIndependent();
+   getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag, GV);
+ 
+-  if (IsPIC && Subtarget.isSVR4ABI()) {
++  if (IsPIC && Subtarget.isSVR4ABI() && !isHQEMU) {
+     SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT,
+                                             GSDN->getOffset(),
+                                             PPCII::MO_PIC_FLAG);
+diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp
+index 1e40711..496238a 100644
+--- a/lib/Target/PowerPC/PPCMCInstLower.cpp
++++ b/lib/Target/PowerPC/PPCMCInstLower.cpp
+@@ -141,7 +141,10 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
+ void llvm::LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
+                                         AsmPrinter &AP, bool isDarwin) {
+   OutMI.setOpcode(MI->getOpcode());
+-  
++
++  if (MI->getDebugLoc())
++    OutMI.setDebugLoc(&MI->getDebugLoc());
++
+   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+     MCOperand MCOp;
+     if (LowerPPCMachineOperandToMCOperand(MI->getOperand(i), MCOp, AP,
+diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
+index 6b62a82..cc5a73b 100644
+--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
++++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
+@@ -279,6 +279,13 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+          IE = PPC::VRRCRegClass.end(); I != IE; ++I)
+       markSuperRegs(Reserved, *I);
+ 
++  // Reserve registers for HQEMU.
++  if (MF.getFunction().hasFnAttribute("hqemu")) {
++    markSuperRegs(Reserved, PPC::R27);
++    if (TM.isPPC64())
++      markSuperRegs(Reserved, PPC::R31);
++  }
++
+   assert(checkAllSuperRegsMarked(Reserved));
+   return Reserved;
+ }
+@@ -904,6 +911,11 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+   if (!MF.getFunction().hasFnAttribute(Attribute::Naked)) {
+     if (!(hasBasePointer(MF) && FrameIndex < 0))
+       Offset += MFI.getStackSize();
++  } else {
++    if (MF.getFunction().hasFnAttribute("hqemu") && FrameIndex >= 0) {
++      const PPCFrameLowering *TFI = getFrameLowering(MF);
++      Offset += TFI->determineFrameLayout(MF, false, false);
++    }
+   }
+ 
+   // If we can, encode the offset directly into the instruction.  If this is a
+diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+index 4ddc1f0..c564e71 100644
+--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
++++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+@@ -27,6 +27,7 @@
+ #include "llvm/MC/MCSymbol.h"
+ #include "llvm/Support/ErrorHandling.h"
+ #include "llvm/Support/raw_ostream.h"
++#include "llvm/IR/DebugLoc.h"
+ #include <cassert>
+ #include <cstdint>
+ #include <cstdlib>
+@@ -150,6 +151,9 @@ public:
+                         const MCInst &MI, const MCInstrDesc &Desc,
+                         const MCSubtargetInfo &STI, raw_ostream &OS) const;
+ 
++  bool EmitHQEMUInstruction(const MCInst &MI, raw_ostream &OS,
++                            SmallVectorImpl<MCFixup> &Fixups) const;
++
+   uint8_t DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
+                              int MemOperand, const MCInstrDesc &Desc) const;
+ };
+@@ -1158,6 +1162,52 @@ bool X86MCCodeEmitter::emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
+   return Ret;
+ }
+ 
++bool X86MCCodeEmitter::
++EmitHQEMUInstruction(const MCInst &MI, raw_ostream &OS,
++                     SmallVectorImpl<MCFixup> &Fixups) const {
++  /* Rewrites HQEMU-tagged TRAP/RETQ; returns false to request normal encoding.
++   * NOTE: these flags must stay synchronized with llvm-opc.h in HQEMU. */
++  enum {
++    PATCH_HQEMU = 0x4182U,
++    PATCH_DUMMY,
++    PATCH_EXIT_TB,
++    PATCH_DIRECT_JUMP,
++    PATCH_TRACE_BLOCK_CHAINING,
++    PATCH_QMMU
++  };
++  // NOTE(review): only RETQ is matched below; confirm RETL needs no handling.
++  unsigned Opcode = MI.getOpcode();
++  switch (Opcode) {
++  case X86::TRAP:
++  case X86::RETQ:
++    break;
++  default: return false;
++  }
++
++  unsigned CurByte = 0;
++  const DebugLoc *Loc = MI.getDebugLoc();
++  if (!Loc)
++    return false;
++  // The patch type is carried in the line number of the attached DebugLoc.
++  unsigned PatchType = Loc->getLine();
++  if (PatchType < PATCH_HQEMU || PatchType > PATCH_QMMU)
++    return false;
++  // Tagged TRAP: emit eight 0x90 bytes in its place.
++  if (Opcode == X86::TRAP) {
++    for (unsigned i = 0; i != 8; ++i)
++      EmitByte(0x90, CurByte, OS);
++    return true;
++  }
++  if (Opcode == X86::RETQ) {
++    uintptr_t ExitAddr = MCII.getHQEMUExitAddr();
++    EmitByte(0xE9, CurByte, OS);  // 0xE9 + 4-byte PC-relative fixup to ExitAddr
++    EmitImmediate(MCOperand::createImm(ExitAddr), MI.getLoc(), 4, FK_PCRel_4,
++                  CurByte, OS, Fixups);
++    return true;
++  }
++  return false;
++}
++
+ void X86MCCodeEmitter::
+ encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                   SmallVectorImpl<MCFixup> &Fixups,
+@@ -1167,6 +1217,9 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
+   uint64_t TSFlags = Desc.TSFlags;
+   unsigned Flags = MI.getFlags();
+ 
++  if (EmitHQEMUInstruction(MI, OS, Fixups))
++    return;
++
+   // Pseudo instructions don't get encoded.
+   if ((TSFlags & X86II::FormMask) == X86II::Pseudo)
+     return;
+diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
+index 11808f8..bd10b86 100644
+--- a/lib/Target/X86/X86FrameLowering.cpp
++++ b/lib/Target/X86/X86FrameLowering.cpp
+@@ -83,6 +83,10 @@ X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const {
+ /// or if frame pointer elimination is disabled.
+ bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
+   const MachineFrameInfo &MFI = MF.getFrameInfo();
++  // HQEMU does not use FramePtr for stack accesses, so return false when
++  // running in HQEMU mode.
++  if (MF.getFunction().hasFnAttribute("hqemu"))
++    return false;
+   return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
+           TRI->needsStackRealignment(MF) ||
+           MFI.hasVarSizedObjects() ||
+diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
+index 730ba74..c1b3ef6 100644
+--- a/lib/Target/X86/X86MCInstLower.cpp
++++ b/lib/Target/X86/X86MCInstLower.cpp
+@@ -389,6 +389,9 @@ X86MCInstLower::LowerMachineOperand(const MachineInstr *MI,
+ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+   OutMI.setOpcode(MI->getOpcode());
+ 
++  if (MI->getDebugLoc())
++    OutMI.setDebugLoc(&MI->getDebugLoc());
++
+   for (const MachineOperand &MO : MI->operands())
+     if (auto MaybeMCOp = LowerMachineOperand(MI, MO))
+       OutMI.addOperand(MaybeMCOp.getValue());
+diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
+index bc31e95..893ff41 100644
+--- a/lib/Target/X86/X86RegisterInfo.cpp
++++ b/lib/Target/X86/X86RegisterInfo.cpp
+@@ -570,6 +570,20 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+     }
+   }
+ 
++  // Reserve registers for HQEMU.
++  if (MF.getFunction().hasFnAttribute("hqemu")) {
++    if (!Is64Bit) {
++      Reserved.set(X86::EBP);
++      Reserved.set(X86::BP);
++      Reserved.set(X86::BPL);
++    } else {
++      Reserved.set(X86::R14);
++      Reserved.set(X86::R14B);
++      Reserved.set(X86::R14D);
++      Reserved.set(X86::R14W);
++    }
++  }
++
+   assert(checkAllSuperRegsMarked(Reserved,
+                                  {X86::SIL, X86::DIL, X86::BPL, X86::SPL}));
+   return Reserved;
+diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
+index acccf7a..f2ab12d 100644
+--- a/lib/Transforms/Utils/Local.cpp
++++ b/lib/Transforms/Utils/Local.cpp
+@@ -1587,7 +1587,8 @@ static bool markAliveBlocks(Function &F,
+           Changed = true;
+           break;
+         }
+-        if (CI->doesNotReturn()) {
++        // Do not delete instructions after llvm.trap in HQEMU mode.
++        if (!F.hasFnAttribute("hqemu") && CI->doesNotReturn()) {
+           // If we found a call to a no-return function, insert an unreachable
+           // instruction after it.  Make sure there isn't *already* one there
+           // though.
+diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
+index 7c19578..30f3481 100644
+--- a/lib/Transforms/Utils/SimplifyCFG.cpp
++++ b/lib/Transforms/Utils/SimplifyCFG.cpp
+@@ -1271,6 +1271,10 @@ static bool HoistThenElseCodeToIf(BranchInst *BI,
+ 
+   bool Changed = false;
+   do {
++    // Do not hoist llvm.trap and debug instructions in HQEMU mode.
++    if (BI->getParent()->getParent()->hasFnAttribute("hqemu"))
++      if (isa<IntrinsicInst>(I1) || I1->hasMetadata())
++        return Changed;
+     // If we are hoisting the terminator instruction, don't move one (making a
+     // broken BB), instead clone it, and remove BI.
+     if (isa<TerminatorInst>(I1))
+@@ -5600,6 +5604,10 @@ bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) {
+   BasicBlock *BB = IBI->getParent();
+   bool Changed = false;
+ 
++  // Do not delete indirectbr instructions with no successors in HQEMU mode.
++  if (BB->getParent()->hasFnAttribute("hqemu"))
++    return false;
++
+   // Eliminate redundant destinations.
+   SmallPtrSet<Value *, 8> Succs;
+   for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) {
diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index c2ff970..e6c9f51 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -15,6 +15,7 @@
 #include <sys/types.h>
 #include <sys/ioctl.h>
 #include <sys/wait.h>
+#include <sys/sysmacros.h>
 #include <unistd.h>
 #include <errno.h>
 #include <fcntl.h>
diff --git a/qom/object.c b/qom/object.c
index d751569..deb182f 100644
--- a/qom/object.c
+++ b/qom/object.c
@@ -28,6 +28,10 @@
 #include "qapi/qmp/qint.h"
 #include "qapi/qmp/qstring.h"
 
+#define Type     QEMUType
+#define class    QEMUclass
+#define typename QEMUtypename
+
 #define MAX_INTERFACES 32
 
 typedef struct InterfaceImpl InterfaceImpl;
@@ -2126,3 +2130,7 @@ static void register_types(void)
 }
 
 type_init(register_types)
+
+#undef Type
+#undef class
+#undef typename
diff --git a/softmmu_template.h b/softmmu_template.h
index 6803890..4574545 100644
--- a/softmmu_template.h
+++ b/softmmu_template.h
@@ -24,6 +24,7 @@
 #include "qemu/timer.h"
 #include "exec/address-spaces.h"
 #include "exec/memory.h"
+#include "hqemu-config.h"
 
 #define DATA_SIZE (1 << SHIFT)
 
@@ -116,6 +117,16 @@
 # define helper_te_st_name  helper_le_st_name
 #endif
 
+#if defined(ENABLE_TLBVERSION)  /* page tags are combined with tlb_version(env) */
+#define TLB_IO_MASK          (TLB_NOTDIRTY | TLB_MMIO)
+#define TLB_NONIO_MASK       (TARGET_PAGE_MASK | TLB_INVALID_MASK | TLB_VERSION_MASK)
+#define page_val(addr, env)  (((tlbaddr_t)(addr) & TARGET_PAGE_MASK) | tlb_version(env))
+#else  /* no TLB version: page tag is just the page-aligned address */
+#define TLB_IO_MASK          (~TARGET_PAGE_MASK)
+#define TLB_NONIO_MASK       (TARGET_PAGE_MASK | TLB_INVALID_MASK)
+#define page_val(addr, env)  ((addr) & TARGET_PAGE_MASK)
+#endif /* ENABLE_TLBVERSION */
+
 /* macro to check the victim tlb */
 #define VICTIM_TLB_HIT(ty)                                                    \
 ({                                                                            \
@@ -126,7 +137,7 @@
     CPUIOTLBEntry tmpiotlb;                                                   \
     CPUTLBEntry tmptlb;                                                       \
     for (vidx = CPU_VTLB_SIZE-1; vidx >= 0; --vidx) {                         \
-        if (env->tlb_v_table[mmu_idx][vidx].ty == (addr & TARGET_PAGE_MASK)) {\
+        if (env->tlb_v_table[mmu_idx][vidx].ty == page_val(addr, env)) {      \
             /* found entry in victim tlb, swap tlb and iotlb */               \
             tmptlb = env->tlb_table[mmu_idx][index];                          \
             env->tlb_table[mmu_idx][index] = env->tlb_v_table[mmu_idx][vidx]; \
@@ -170,7 +181,7 @@ WORD_TYPE helper_le_ld_name(CPUArchState *env, target_ulong addr,
 {
     unsigned mmu_idx = get_mmuidx(oi);
     int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
-    target_ulong tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
+    tlbaddr_t tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
     uintptr_t haddr;
     DATA_TYPE res;
 
@@ -178,8 +189,7 @@ WORD_TYPE helper_le_ld_name(CPUArchState *env, target_ulong addr,
     retaddr -= GETPC_ADJ;
 
     /* If the TLB entry is for a different page, reload and try again.  */
-    if ((addr & TARGET_PAGE_MASK)
-         != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
+    if (page_val(addr, env) != (tlb_addr & TLB_NONIO_MASK)) {
         if ((addr & (DATA_SIZE - 1)) != 0
             && (get_memop(oi) & MO_AMASK) == MO_ALIGN) {
             cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
@@ -193,7 +203,7 @@ WORD_TYPE helper_le_ld_name(CPUArchState *env, target_ulong addr,
     }
 
     /* Handle an IO access.  */
-    if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
+    if (unlikely(tlb_addr & TLB_IO_MASK)) {
         CPUIOTLBEntry *iotlbentry;
         if ((addr & (DATA_SIZE - 1)) != 0) {
             goto do_unaligned_access;
@@ -254,7 +264,7 @@ WORD_TYPE helper_be_ld_name(CPUArchState *env, target_ulong addr,
 {
     unsigned mmu_idx = get_mmuidx(oi);
     int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
-    target_ulong tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
+    tlbaddr_t tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
     uintptr_t haddr;
     DATA_TYPE res;
 
@@ -262,8 +272,7 @@ WORD_TYPE helper_be_ld_name(CPUArchState *env, target_ulong addr,
     retaddr -= GETPC_ADJ;
 
     /* If the TLB entry is for a different page, reload and try again.  */
-    if ((addr & TARGET_PAGE_MASK)
-         != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
+    if (page_val(addr, env) != (tlb_addr & TLB_NONIO_MASK)) {
         if ((addr & (DATA_SIZE - 1)) != 0
             && (get_memop(oi) & MO_AMASK) == MO_ALIGN) {
             cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
@@ -277,7 +286,7 @@ WORD_TYPE helper_be_ld_name(CPUArchState *env, target_ulong addr,
     }
 
     /* Handle an IO access.  */
-    if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
+    if (unlikely(tlb_addr & TLB_IO_MASK)) {
         CPUIOTLBEntry *iotlbentry;
         if ((addr & (DATA_SIZE - 1)) != 0) {
             goto do_unaligned_access;
@@ -375,15 +384,14 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
 {
     unsigned mmu_idx = get_mmuidx(oi);
     int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
-    target_ulong tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
+    tlbaddr_t tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
     uintptr_t haddr;
 
     /* Adjust the given return address.  */
     retaddr -= GETPC_ADJ;
 
     /* If the TLB entry is for a different page, reload and try again.  */
-    if ((addr & TARGET_PAGE_MASK)
-        != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
+    if (page_val(addr, env) != (tlb_addr & TLB_NONIO_MASK)) {
         if ((addr & (DATA_SIZE - 1)) != 0
             && (get_memop(oi) & MO_AMASK) == MO_ALIGN) {
             cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
@@ -396,7 +404,7 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
     }
 
     /* Handle an IO access.  */
-    if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
+    if (unlikely(tlb_addr & TLB_IO_MASK)) {
         CPUIOTLBEntry *iotlbentry;
         if ((addr & (DATA_SIZE - 1)) != 0) {
             goto do_unaligned_access;
@@ -455,15 +463,14 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
 {
     unsigned mmu_idx = get_mmuidx(oi);
     int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
-    target_ulong tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
+    tlbaddr_t tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
     uintptr_t haddr;
 
     /* Adjust the given return address.  */
     retaddr -= GETPC_ADJ;
 
     /* If the TLB entry is for a different page, reload and try again.  */
-    if ((addr & TARGET_PAGE_MASK)
-        != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
+    if (page_val(addr, env) != (tlb_addr & TLB_NONIO_MASK)) {
         if ((addr & (DATA_SIZE - 1)) != 0
             && (get_memop(oi) & MO_AMASK) == MO_ALIGN) {
             cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
@@ -476,7 +483,7 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
     }
 
     /* Handle an IO access.  */
-    if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
+    if (unlikely(tlb_addr & TLB_IO_MASK)) {
         CPUIOTLBEntry *iotlbentry;
         if ((addr & (DATA_SIZE - 1)) != 0) {
             goto do_unaligned_access;
@@ -537,10 +544,9 @@ void probe_write(CPUArchState *env, target_ulong addr, int mmu_idx,
                  uintptr_t retaddr)
 {
     int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
-    target_ulong tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
+    tlbaddr_t tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
 
-    if ((addr & TARGET_PAGE_MASK)
-        != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
+    if (page_val(addr, env) != (tlb_addr & TLB_NONIO_MASK)) {
         /* TLB entry is for a different page */
         if (!VICTIM_TLB_HIT(addr_write)) {
             tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_STORE, mmu_idx, retaddr);
@@ -550,6 +556,11 @@ void probe_write(CPUArchState *env, target_ulong addr, int mmu_idx,
 #endif
 #endif /* !defined(SOFTMMU_CODE_ACCESS) */
 
+#include "softmmu_template_llvm.h"
+
+#undef TLB_IO_MASK
+#undef TLB_NONIO_MASK
+#undef page_val
 #undef READ_ACCESS_TYPE
 #undef SHIFT
 #undef DATA_TYPE
diff --git a/softmmu_template_llvm.h b/softmmu_template_llvm.h
new file mode 100644
index 0000000..0a5f4bf
--- /dev/null
+++ b/softmmu_template_llvm.h
@@ -0,0 +1,384 @@
+/*
+ *  Software MMU support for LLVM
+ */
+
+#if DATA_SIZE == 1 /* one byte: the be_* names simply alias the le_* (llvm_ret_*) ones */
+# define llvm_le_ld_name  glue(glue(llvm_ret_ld, USUFFIX), MMUSUFFIX)
+# define llvm_be_ld_name  llvm_le_ld_name
+# define llvm_le_lds_name glue(glue(llvm_ret_ld, SSUFFIX), MMUSUFFIX)
+# define llvm_be_lds_name llvm_le_lds_name
+# define llvm_le_st_name  glue(glue(llvm_ret_st, SUFFIX), MMUSUFFIX)
+# define llvm_be_st_name  llvm_le_st_name
+#else /* DATA_SIZE != 1: separate llvm_le_* and llvm_be_* entry points */
+# define llvm_le_ld_name  glue(glue(llvm_le_ld, USUFFIX), MMUSUFFIX)
+# define llvm_be_ld_name  glue(glue(llvm_be_ld, USUFFIX), MMUSUFFIX)
+# define llvm_le_lds_name glue(glue(llvm_le_ld, SSUFFIX), MMUSUFFIX)
+# define llvm_be_lds_name glue(glue(llvm_be_ld, SSUFFIX), MMUSUFFIX)
+# define llvm_le_st_name  glue(glue(llvm_le_st, SUFFIX), MMUSUFFIX)
+# define llvm_be_st_name  glue(glue(llvm_be_st, SUFFIX), MMUSUFFIX)
+#endif /* DATA_SIZE == 1 */
+/* Target-endian aliases; in this header they are only #undef'ed again below. */
+#ifdef TARGET_WORDS_BIGENDIAN
+# define llvm_te_ld_name  llvm_be_ld_name
+# define llvm_te_st_name  llvm_be_st_name
+#else /* !TARGET_WORDS_BIGENDIAN */
+# define llvm_te_ld_name  llvm_le_ld_name
+# define llvm_te_st_name  llvm_le_st_name
+#endif /* TARGET_WORDS_BIGENDIAN */
+
+
+#ifndef SOFTMMU_CODE_ACCESS
+WORD_TYPE llvm_le_ld_name(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi)
+{
+    unsigned mmu_idx = get_mmuidx((uint16_t)oi);  /* only the low 16 bits of oi */
+    int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
+    tlbaddr_t tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
+    uintptr_t haddr;
+    DATA_TYPE res;
+    uintptr_t retaddr;
+    /* Little-endian guest load of DATA_SIZE bytes at 'addr' through the soft TLB. */
+    env->restore_val = oi >> 16;  /* bits 16 and up of oi are saved in env first */
+
+    /* No retaddr argument here: obtain the return address via GETPC().  */
+    retaddr = GETPC();
+
+    /* If the TLB entry is for a different page, reload and try again.  */
+    if (page_val(addr, env) != (tlb_addr & TLB_NONIO_MASK)) {
+        if ((addr & (DATA_SIZE - 1)) != 0
+            && (get_memop(oi) & MO_AMASK) == MO_ALIGN) {
+            cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
+                                 mmu_idx, retaddr);
+        }
+        if (!VICTIM_TLB_HIT(ADDR_READ)) {
+            tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
+                     mmu_idx, retaddr);
+        }
+        tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
+    }
+
+    /* Handle an IO access.  */
+    if (unlikely(tlb_addr & TLB_IO_MASK)) {
+        CPUIOTLBEntry *iotlbentry;
+        if ((addr & (DATA_SIZE - 1)) != 0) {
+            goto do_unaligned_access;
+        }
+        iotlbentry = &env->iotlb[mmu_idx][index];
+
+        /* ??? Note that the io helpers always read data in the target
+           byte ordering.  We should push the LE/BE request down into io.  */
+        res = glue(io_read, SUFFIX)(env, iotlbentry, addr, retaddr);
+        res = TGT_LE(res);
+        return res;
+    }
+
+    /* Handle slow unaligned access (it spans two pages or IO).  */
+    if (DATA_SIZE > 1
+        && unlikely((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1
+                    >= TARGET_PAGE_SIZE)) {
+        target_ulong addr1, addr2;
+        DATA_TYPE res1, res2;
+        unsigned shift;
+    do_unaligned_access:
+        if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
+            cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
+                                 mmu_idx, retaddr);
+        }
+        addr1 = addr & ~(DATA_SIZE - 1);
+        addr2 = addr1 + DATA_SIZE;
+        /* The helper_* entry points adjust retaddr themselves ("retaddr -=
+           GETPC_ADJ" in softmmu_template.h), so add GETPC_ADJ back first.  */
+        res1 = helper_le_ld_name(env, addr1, oi, retaddr + GETPC_ADJ);
+        res2 = helper_le_ld_name(env, addr2, oi, retaddr + GETPC_ADJ);
+        shift = (addr & (DATA_SIZE - 1)) * 8;
+
+        /* Little-endian combine.  */
+        res = (res1 >> shift) | (res2 << ((DATA_SIZE * 8) - shift));
+        return res;
+    }
+
+    /* Handle aligned access or unaligned access in the same page.  */
+    if ((addr & (DATA_SIZE - 1)) != 0
+        && (get_memop(oi) & MO_AMASK) == MO_ALIGN) {
+        cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
+                             mmu_idx, retaddr);
+    }
+
+    haddr = addr + env->tlb_table[mmu_idx][index].addend;  /* host address */
+#if DATA_SIZE == 1
+    res = glue(glue(ld, LSUFFIX), _p)((uint8_t *)haddr);
+#else
+    res = glue(glue(ld, LSUFFIX), _le_p)((uint8_t *)haddr);
+#endif
+    return res;
+}
+
+#if DATA_SIZE > 1
+WORD_TYPE llvm_be_ld_name(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi)
+{
+    unsigned mmu_idx = get_mmuidx((uint16_t)oi);  /* only the low 16 bits of oi */
+    int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
+    tlbaddr_t tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
+    uintptr_t haddr;
+    DATA_TYPE res;
+    uintptr_t retaddr;
+    /* Big-endian guest load of DATA_SIZE bytes at 'addr' through the soft TLB. */
+    env->restore_val = oi >> 16;  /* bits 16 and up of oi are saved in env first */
+
+    /* No retaddr argument here: obtain the return address via GETPC().  */
+    retaddr = GETPC();
+
+    /* If the TLB entry is for a different page, reload and try again.  */
+    if (page_val(addr, env) != (tlb_addr & TLB_NONIO_MASK)) {
+        if ((addr & (DATA_SIZE - 1)) != 0
+            && (get_memop(oi) & MO_AMASK) == MO_ALIGN) {
+            cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
+                                 mmu_idx, retaddr);
+        }
+        if (!VICTIM_TLB_HIT(ADDR_READ)) {
+            tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
+                     mmu_idx, retaddr);
+        }
+        tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
+    }
+
+    /* Handle an IO access.  */
+    if (unlikely(tlb_addr & TLB_IO_MASK)) {
+        CPUIOTLBEntry *iotlbentry;
+        if ((addr & (DATA_SIZE - 1)) != 0) {
+            goto do_unaligned_access;
+        }
+        iotlbentry = &env->iotlb[mmu_idx][index];
+
+        /* ??? Note that the io helpers always read data in the target
+           byte ordering.  We should push the LE/BE request down into io.  */
+        res = glue(io_read, SUFFIX)(env, iotlbentry, addr, retaddr);
+        res = TGT_BE(res);
+        return res;
+    }
+
+    /* Handle slow unaligned access (it spans two pages or IO).  */
+    if (DATA_SIZE > 1
+        && unlikely((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1
+                    >= TARGET_PAGE_SIZE)) {
+        target_ulong addr1, addr2;
+        DATA_TYPE res1, res2;
+        unsigned shift;
+    do_unaligned_access:
+        if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
+            cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
+                                 mmu_idx, retaddr);
+        }
+        addr1 = addr & ~(DATA_SIZE - 1);
+        addr2 = addr1 + DATA_SIZE;
+        /* The helper_* entry points adjust retaddr themselves ("retaddr -=
+           GETPC_ADJ" in softmmu_template.h), so add GETPC_ADJ back first.  */
+        res1 = helper_be_ld_name(env, addr1, oi, retaddr + GETPC_ADJ);
+        res2 = helper_be_ld_name(env, addr2, oi, retaddr + GETPC_ADJ);
+        shift = (addr & (DATA_SIZE - 1)) * 8;
+
+        /* Big-endian combine.  */
+        res = (res1 << shift) | (res2 >> ((DATA_SIZE * 8) - shift));
+        return res;
+    }
+
+    /* Handle aligned access or unaligned access in the same page.  */
+    if ((addr & (DATA_SIZE - 1)) != 0
+        && (get_memop(oi) & MO_AMASK) == MO_ALIGN) {
+        cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
+                             mmu_idx, retaddr);
+    }
+
+    haddr = addr + env->tlb_table[mmu_idx][index].addend;  /* host address */
+    res = glue(glue(ld, LSUFFIX), _be_p)((uint8_t *)haddr);
+    return res;
+}
+#endif /* DATA_SIZE > 1 */
+
+/* Signed loads: save oi >> 16 in env->restore_val, call the unsigned helper with
+   the low 16 bits of oi and GETRA(), and cast the result to SDATA_TYPE.  */
+#if DATA_SIZE * 8 < TCG_TARGET_REG_BITS /* avoidable for 64-bit data, or 32-bit data on a 32-bit host */
+WORD_TYPE llvm_le_lds_name(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi)
+{   /* little-endian variant */
+    env->restore_val = oi >> 16;
+    return (SDATA_TYPE)helper_le_ld_name(env, addr, (uint16_t)oi, GETRA());
+}
+
+# if DATA_SIZE > 1
+WORD_TYPE llvm_be_lds_name(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi)
+{   /* big-endian variant */
+    env->restore_val = oi >> 16;
+    return (SDATA_TYPE)helper_be_ld_name(env, addr, (uint16_t)oi, GETRA());
+}
+# endif /* DATA_SIZE > 1 */
+#endif /* DATA_SIZE * 8 < TCG_TARGET_REG_BITS */
+
+void llvm_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
+                     TCGMemOpIdx oi)
+{
+    unsigned mmu_idx = get_mmuidx((uint16_t)oi);  /* only the low 16 bits of oi */
+    int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
+    tlbaddr_t tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
+    uintptr_t haddr;
+    uintptr_t retaddr;
+    /* Little-endian guest store of 'val' (DATA_SIZE bytes) at 'addr' via the soft TLB. */
+    env->restore_val = oi >> 16;  /* bits 16 and up of oi are saved in env first */
+
+    /* No retaddr argument here: obtain the return address via GETPC().  */
+    retaddr = GETPC();
+
+    /* If the TLB entry is for a different page, reload and try again.  */
+    if (page_val(addr, env) != (tlb_addr & TLB_NONIO_MASK)) {
+        if ((addr & (DATA_SIZE - 1)) != 0
+            && (get_memop(oi) & MO_AMASK) == MO_ALIGN) {
+            cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
+                                 mmu_idx, retaddr);
+        }
+        if (!VICTIM_TLB_HIT(addr_write)) {
+            tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_STORE, mmu_idx, retaddr);
+        }
+        tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
+    }
+
+    /* Handle an IO access.  */
+    if (unlikely(tlb_addr & TLB_IO_MASK)) {
+        CPUIOTLBEntry *iotlbentry;
+        if ((addr & (DATA_SIZE - 1)) != 0) {
+            goto do_unaligned_access;
+        }
+        iotlbentry = &env->iotlb[mmu_idx][index];
+
+        /* ??? Note that the io helpers always handle data in the target
+           byte ordering.  We should push the LE/BE request down into io.  */
+        val = TGT_LE(val);
+        glue(io_write, SUFFIX)(env, iotlbentry, val, addr, retaddr);
+        return;
+    }
+
+    /* Handle slow unaligned access (it spans two pages or IO).  */
+    if (DATA_SIZE > 1
+        && unlikely((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1
+                     >= TARGET_PAGE_SIZE)) {
+        int i;
+    do_unaligned_access:
+        if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
+            cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
+                                 mmu_idx, retaddr);
+        }
+        /* XXX: not efficient, but simple */
+        /* Note: relies on the fact that tlb_fill() does not remove the
+         * previous page from the TLB cache.  */
+        for (i = DATA_SIZE - 1; i >= 0; i--) {  /* one byte store per byte of val */
+            /* Little-endian extract.  */
+            uint8_t val8 = val >> (i * 8);
+            /* The helper_* entry points adjust retaddr themselves ("retaddr -=
+               GETPC_ADJ" in softmmu_template.h), so add GETPC_ADJ back.  */
+            glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
+                                            oi, retaddr + GETPC_ADJ);
+        }
+        return;
+    }
+
+    /* Handle aligned access or unaligned access in the same page.  */
+    if ((addr & (DATA_SIZE - 1)) != 0
+        && (get_memop(oi) & MO_AMASK) == MO_ALIGN) {
+        cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
+                             mmu_idx, retaddr);
+    }
+
+    haddr = addr + env->tlb_table[mmu_idx][index].addend;  /* host address */
+#if DATA_SIZE == 1
+    glue(glue(st, SUFFIX), _p)((uint8_t *)haddr, val);
+#else
+    glue(glue(st, SUFFIX), _le_p)((uint8_t *)haddr, val);
+#endif
+}
+
+#if DATA_SIZE > 1
+void llvm_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
+                     TCGMemOpIdx oi)
+{
+    unsigned mmu_idx = get_mmuidx((uint16_t)oi);  /* only the low 16 bits of oi */
+    int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
+    tlbaddr_t tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
+    uintptr_t haddr;
+    uintptr_t retaddr;
+    /* Big-endian guest store of 'val' (DATA_SIZE bytes) at 'addr' via the soft TLB. */
+    env->restore_val = oi >> 16;  /* bits 16 and up of oi are saved in env first */
+
+    /* No retaddr argument here: obtain the return address via GETPC().  */
+    retaddr = GETPC();
+
+    /* If the TLB entry is for a different page, reload and try again.  */
+    if (page_val(addr, env) != (tlb_addr & TLB_NONIO_MASK)) {
+        if ((addr & (DATA_SIZE - 1)) != 0
+            && (get_memop(oi) & MO_AMASK) == MO_ALIGN) {
+            cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
+                                 mmu_idx, retaddr);
+        }
+        if (!VICTIM_TLB_HIT(addr_write)) {
+            tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_STORE, mmu_idx, retaddr);
+        }
+        tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
+    }
+
+    /* Handle an IO access.  */
+    if (unlikely(tlb_addr & TLB_IO_MASK)) {
+        CPUIOTLBEntry *iotlbentry;
+        if ((addr & (DATA_SIZE - 1)) != 0) {
+            goto do_unaligned_access;
+        }
+        iotlbentry = &env->iotlb[mmu_idx][index];
+
+        /* ??? Note that the io helpers always handle data in the target
+           byte ordering.  We should push the LE/BE request down into io.  */
+        val = TGT_BE(val);
+        glue(io_write, SUFFIX)(env, iotlbentry, val, addr, retaddr);
+        return;
+    }
+
+    /* Handle slow unaligned access (it spans two pages or IO).  */
+    if (DATA_SIZE > 1
+        && unlikely((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1
+                     >= TARGET_PAGE_SIZE)) {
+        int i;
+    do_unaligned_access:
+        if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
+            cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
+                                 mmu_idx, retaddr);
+        }
+        /* XXX: not efficient, but simple */
+        /* Note: relies on the fact that tlb_fill() does not remove the
+         * previous page from the TLB cache.  */
+        for (i = DATA_SIZE - 1; i >= 0; i--) {  /* one byte store per byte of val */
+            /* Big-endian extract.  */
+            uint8_t val8 = val >> (((DATA_SIZE - 1) * 8) - (i * 8));
+            /* The helper_* entry points adjust retaddr themselves ("retaddr -=
+               GETPC_ADJ" in softmmu_template.h), so add GETPC_ADJ back.  */
+            glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
+                                            oi, retaddr + GETPC_ADJ);
+        }
+        return;
+    }
+
+    /* Handle aligned access or unaligned access in the same page.  */
+    if ((addr & (DATA_SIZE - 1)) != 0
+        && (get_memop(oi) & MO_AMASK) == MO_ALIGN) {
+        cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
+                             mmu_idx, retaddr);
+    }
+
+    haddr = addr + env->tlb_table[mmu_idx][index].addend;  /* host address */
+    glue(glue(st, SUFFIX), _be_p)((uint8_t *)haddr, val);
+}
+#endif /* DATA_SIZE > 1 */
+
+#endif /* !defined(SOFTMMU_CODE_ACCESS) */
+
+#undef llvm_le_ld_name  /* presumably so this header can be included again for another DATA_SIZE - TODO confirm */
+#undef llvm_be_ld_name
+#undef llvm_le_lds_name
+#undef llvm_be_lds_name
+#undef llvm_le_st_name
+#undef llvm_be_st_name
+#undef llvm_te_ld_name
+#undef llvm_te_st_name
diff --git a/target-arm/cpu.h b/target-arm/cpu.h
index 815fef8..1087075 100644
--- a/target-arm/cpu.h
+++ b/target-arm/cpu.h
@@ -437,7 +437,7 @@ typedef struct CPUARMState {
          * the two execution states, and means we do not need to explicitly
          * map these registers when changing states.
          */
-        float64 regs[64];
+        float64 regs[64] __attribute__((aligned(16)));
 
         uint32_t xregs[16];
         /* We store these fpcsr fields separately for convenience.  */
@@ -496,6 +496,8 @@ typedef struct CPUARMState {
     /* Internal CPU feature flags.  */
     uint64_t features;
 
+    CPU_OPTIMIZATION_COMMON
+
     /* PMSAv7 MPU */
     struct {
         uint32_t *drbar;
@@ -1509,7 +1511,7 @@ bool write_cpustate_to_list(ARMCPU *cpu);
 /* The ARM MMU allows 1k pages.  */
 /* ??? Linux doesn't actually use these, and they're deprecated in recent
    architecture revisions.  Maybe a configure option to disable them.  */
-#define TARGET_PAGE_BITS 10
+#define TARGET_PAGE_BITS 12
 #endif
 
 #if defined(TARGET_AARCH64)
@@ -1523,7 +1525,7 @@ bool write_cpustate_to_list(ARMCPU *cpu);
 static inline bool arm_excp_unmasked(CPUState *cs, unsigned int excp_idx,
                                      unsigned int target_el)
 {
-    CPUARMState *env = cs->env_ptr;
+    CPUARMState *env = (CPUARMState *)cs->env_ptr;
     unsigned int cur_el = arm_current_el(env);
     bool secure = arm_is_secure(env);
     bool pstate_unmasked;
@@ -1983,6 +1985,62 @@ static inline void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc,
     *cs_base = 0;
 }
 
+static inline target_ulong cpu_get_pc(CPUARMState *env)
+{
+    /* Return the guest PC.  Select it by the CPU's current execution state
+     * (is_a64) rather than by the build target: a TARGET_AARCH64 build can
+     * still be running AArch32 code, whose PC is regs[15], not env->pc.
+     * Builds that never enter AArch64 state keep returning regs[15]. */
+    return is_a64(env) ? env->pc : env->regs[15];
+}
+
+/* Rebuild the TB flag word from the live CPU state and report whether it
+ * equals 'flags': returns 1 on a match, 0 otherwise.  'cs_base' is part of
+ * the signature but is not examined. */
+static inline int cpu_check_state(CPUARMState *env,
+                                  target_ulong cs_base, int flags)
+{
+    int cur;
+
+    if (!is_a64(env)) {
+        cur = (env->thumb << ARM_TBFLAG_THUMB_SHIFT)
+            | (env->vfp.vec_len << ARM_TBFLAG_VECLEN_SHIFT)
+            | (env->vfp.vec_stride << ARM_TBFLAG_VECSTRIDE_SHIFT)
+            | (env->condexec_bits << ARM_TBFLAG_CONDEXEC_SHIFT)
+            | (env->bswap_code << ARM_TBFLAG_BSWAP_CODE_SHIFT);
+        if (!access_secure_reg(env)) {
+            cur |= ARM_TBFLAG_NS_MASK;
+        }
+        if ((env->vfp.xregs[ARM_VFP_FPEXC] & (1 << 30))
+            || arm_el_is_aa64(env, 1)) {
+            cur |= ARM_TBFLAG_VFPEN_MASK;
+        }
+        cur |= extract32(env->cp15.c15_cpar, 0, 2)
+               << ARM_TBFLAG_XSCALE_CPAR_SHIFT;
+    } else {
+        cur = ARM_TBFLAG_AARCH64_STATE_MASK;
+    }
+    cur |= cpu_mmu_index(env, false) << ARM_TBFLAG_MMUIDX_SHIFT;
+
+    /* Software singlestep state machine, as defined in the ARM ARM:
+     *  SS_ACTIVE   PSTATE.SS   State
+     *     0            x       Inactive (the TB flag for SS is always 0)
+     *     1            0       Active-pending
+     *     1            1       Active-not-pending
+     * PSTATE.SS is read from pstate in AArch64 state and from
+     * uncached_cpsr otherwise.
+     */
+    if (arm_singlestep_active(env)) {
+        cur |= ARM_TBFLAG_SS_ACTIVE_MASK;
+        if ((is_a64(env) ? env->pstate : env->uncached_cpsr) & PSTATE_SS) {
+            cur |= ARM_TBFLAG_PSTATE_SS_MASK;
+        }
+    }
+
+    cur |= fp_exception_el(env) << ARM_TBFLAG_FPEXC_EL_SHIFT;
+    return cur == flags;
+}
+
 #include "exec/exec-all.h"
 
 enum {
diff --git a/target-arm/helper.c b/target-arm/helper.c
index 1743e37..8e862d9 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -11,6 +11,7 @@
 #include "arm_ldst.h"
 #include <zlib.h> /* For crc32 */
 #include "exec/semihost.h"
+#include "hqemu.h"
 
 #define ARM_CPU_FREQ 1000000000 /* FIXME: 1 GHz, should be configurable */
 
@@ -2225,6 +2226,8 @@ static void vmsa_ttbr_write(CPUARMState *env, const ARMCPRegInfo *ri,
         tlb_flush(CPU(cpu), 1);
     }
     raw_write(env, ri, value);
+
+    pcid = (target_ulong)value >> 12;
 }
 
 static void vttbr_write(CPUARMState *env, const ARMCPRegInfo *ri,
@@ -8091,29 +8094,23 @@ float64 VFP_HELPER(sqrt, d)(float64 a, CPUARMState *env)
 
 /* XXX: check quiet/signaling case */
 #define DO_VFP_cmp(p, type) \
-void VFP_HELPER(cmp, p)(type a, type b, CPUARMState *env)  \
+uint32_t VFP_HELPER(cmp, p)(type a, type b, CPUARMState *env)  \
 { \
-    uint32_t flags; \
-    switch(type ## _compare_quiet(a, b, &env->vfp.fp_status)) { \
-    case 0: flags = 0x6; break; \
-    case -1: flags = 0x8; break; \
-    case 1: flags = 0x2; break; \
-    default: case 2: flags = 0x3; break; \
-    } \
-    env->vfp.xregs[ARM_VFP_FPSCR] = (flags << 28) \
-        | (env->vfp.xregs[ARM_VFP_FPSCR] & 0x0fffffff); \
+    uint32_t flags = 0x3; \
+    int ret = type ## _compare_quiet(a, b, &env->vfp.fp_status); \
+    if (ret == 0) flags = 0x6; \
+    else if (ret == -1) flags = 0x8; \
+    else if (ret == 1) flags = 0x2; \
+    return flags << 28; \
 } \
-void VFP_HELPER(cmpe, p)(type a, type b, CPUARMState *env) \
+uint32_t VFP_HELPER(cmpe, p)(type a, type b, CPUARMState *env) \
 { \
-    uint32_t flags; \
-    switch(type ## _compare(a, b, &env->vfp.fp_status)) { \
-    case 0: flags = 0x6; break; \
-    case -1: flags = 0x8; break; \
-    case 1: flags = 0x2; break; \
-    default: case 2: flags = 0x3; break; \
-    } \
-    env->vfp.xregs[ARM_VFP_FPSCR] = (flags << 28) \
-        | (env->vfp.xregs[ARM_VFP_FPSCR] & 0x0fffffff); \
+    uint32_t flags = 0x3; \
+    int ret = type ## _compare(a, b, &env->vfp.fp_status); \
+    if (ret == 0) flags = 0x6; \
+    else if (ret == -1) flags = 0x8; \
+    else if (ret == 1) flags = 0x2; \
+    return flags << 28; \
 }
 DO_VFP_cmp(s, float32)
 DO_VFP_cmp(d, float64)
@@ -8891,3 +8888,12 @@ uint32_t HELPER(crc32c)(uint32_t acc, uint32_t val, uint32_t bytes)
     /* Linux crc32c converts the output to one's complement.  */
     return crc32c(acc, buf, bytes) ^ 0xffffffff;
 }
+
+CPUState *cpu_create(void)
+{
+    /* The new ARMCPU starts out as a shallow, byte-for-byte copy of first_cpu. */
+    ARMCPU *clone = g_malloc0(sizeof(ARMCPU));
+    memcpy(clone, ARM_CPU(first_cpu), sizeof(ARMCPU));
+    CPU(clone)->env_ptr = &clone->env;  /* point at the copy's own env */
+    return CPU(clone);
+}
diff --git a/target-arm/helper.h b/target-arm/helper.h
index c2a85c7..41c2c6d 100644
--- a/target-arm/helper.h
+++ b/target-arm/helper.h
@@ -56,6 +56,7 @@ DEF_HELPER_2(pre_smc, void, env, i32)
 
 DEF_HELPER_1(check_breakpoints, void, env)
 
+DEF_HELPER_3(cpsr_write_nzcv, void, env, i32, i32)
 DEF_HELPER_3(cpsr_write, void, env, i32, i32)
 DEF_HELPER_1(cpsr_read, i32, env)
 
@@ -103,10 +104,10 @@ DEF_HELPER_1(vfp_abss, f32, f32)
 DEF_HELPER_1(vfp_absd, f64, f64)
 DEF_HELPER_2(vfp_sqrts, f32, f32, env)
 DEF_HELPER_2(vfp_sqrtd, f64, f64, env)
-DEF_HELPER_3(vfp_cmps, void, f32, f32, env)
-DEF_HELPER_3(vfp_cmpd, void, f64, f64, env)
-DEF_HELPER_3(vfp_cmpes, void, f32, f32, env)
-DEF_HELPER_3(vfp_cmped, void, f64, f64, env)
+DEF_HELPER_3(vfp_cmps, i32, f32, f32, env)
+DEF_HELPER_3(vfp_cmpd, i32, f64, f64, env)
+DEF_HELPER_3(vfp_cmpes, i32, f32, f32, env)
+DEF_HELPER_3(vfp_cmped, i32, f64, f64, env)
 
 DEF_HELPER_2(vfp_fcvtds, f64, f32, env)
 DEF_HELPER_2(vfp_fcvtsd, f32, f64, env)
@@ -535,3 +536,5 @@ DEF_HELPER_FLAGS_2(neon_pmull_64_hi, TCG_CALL_NO_RWG_SE, i64, i64, i64)
 #ifdef TARGET_AARCH64
 #include "helper-a64.h"
 #endif
+
+#include "hqemu-helper.h"
diff --git a/target-arm/op_helper.c b/target-arm/op_helper.c
index 6cd54c8..fdea907 100644
--- a/target-arm/op_helper.c
+++ b/target-arm/op_helper.c
@@ -386,6 +386,16 @@ void HELPER(cpsr_write)(CPUARMState *env, uint32_t val, uint32_t mask)
     cpsr_write(env, val, mask);
 }
 
+void HELPER(cpsr_write_nzcv)(CPUARMState *env, uint32_t val, uint32_t mask)
+{
+    if (mask & CPSR_NZCV) {                /* any overlap with NZCV rewrites all four flags */
+        env->ZF = (~val) & CPSR_Z;         /* zero exactly when val has CPSR_Z set */
+        env->NF = val;                     /* val is stored unmodified */
+        env->CF = (val >> 29) & 1;         /* bit 29 of val, as 0 or 1 */
+        env->VF = (val << 3) & 0x80000000; /* bit 28 of val, moved up to bit 31 */
+    }
+}
+
 /* Access to user mode registers from privileged modes.  */
 uint32_t HELPER(get_user_reg)(CPUARMState *env, uint32_t regno)
 {
diff --git a/target-arm/simd_helper.h b/target-arm/simd_helper.h
new file mode 100644
index 0000000..186a7bd
--- /dev/null
+++ b/target-arm/simd_helper.h
@@ -0,0 +1,91 @@
+
+static inline void gen_vector_op3(TCGOpcode opc, TCGArg arg1, TCGArg arg2,
+                                  TCGArg arg3)
+{
+    tcg_emit_op(&tcg_ctx, opc, tcg_ctx.gen_next_parm_idx); /* emit opc itself */
+    tcg_ctx.vec_opparam_ptr[0] = arg1;   /* then queue its three operands, */
+    tcg_ctx.vec_opparam_ptr[1] = arg2;   /* in order, at vec_opparam_ptr   */
+    tcg_ctx.vec_opparam_ptr[2] = arg3;
+    tcg_ctx.vec_opparam_ptr += 3;        /* and advance past them */
+}
+
+/* Emit vector op 'op' (element type 'etype') on vfp.regs[rd], [rn], [rm];
+ * q == 1 picks the 128-bit opcodes.  An unsupported size is fatal (exit 1). */
+#define gen_vector_arith(op,etype,size) \
+do {                                    \
+    TCGOpcode _opc = 0;                 \
+    TCGArg _rd = offsetof(CPUARMState, vfp.regs[rd]); \
+    TCGArg _rn = offsetof(CPUARMState, vfp.regs[rn]); \
+    TCGArg _rm = offsetof(CPUARMState, vfp.regs[rm]); \
+    if (q == 1) {                       \
+        switch(size) {                  \
+        case 0: _opc = INDEX_op_##op##_##etype##8_128;  break; \
+        case 1: _opc = INDEX_op_##op##_##etype##16_128; break; \
+        case 2: _opc = INDEX_op_##op##_##etype##32_128; break; \
+        case 3: _opc = INDEX_op_##op##_##etype##64_128; break; \
+        default:                        \
+            fprintf(stderr, "%s:%d: tcg fatal error: size=%d q=%d\n", \
+                            __FILE__, __LINE__, size, q);              \
+            exit(1);                    \
+        }                               \
+    } else {                            \
+        switch(size) {                  \
+        case 0: _opc = INDEX_op_##op##_##etype##8_64;   break; \
+        case 1: _opc = INDEX_op_##op##_##etype##16_64;  break; \
+        case 2: _opc = INDEX_op_##op##_##etype##32_64;  break; \
+        default:                        \
+            fprintf(stderr, "%s:%d: tcg fatal error: size=%d q=%d\n", \
+                            __FILE__, __LINE__, size, q);             \
+            exit(1);                    \
+        }                               \
+    }                                   \
+    gen_vector_op3(_opc, _rd, _rn, _rm); \
+} while (0)
+
+/* Emit the f32 vector op 'op' on vfp.regs[rd], [rn] and [rm]: the 128-bit
+ * opcode when q == 1, the 64-bit one otherwise.  q, rd, rn and rm are taken
+ * from the scope in which the macro is expanded. */
+#define gen_vector_fop(op)             \
+do {                                   \
+    TCGArg _rd = offsetof(CPUARMState, vfp.regs[rd]); \
+    TCGArg _rn = offsetof(CPUARMState, vfp.regs[rn]); \
+    TCGArg _rm = offsetof(CPUARMState, vfp.regs[rm]); \
+    TCGOpcode _opc = (q == 1) ? INDEX_op_##op##_f32_128 \
+                              : INDEX_op_##op##_f32_64; \
+    gen_vector_op3(_opc, _rd, _rn, _rm);              \
+} while (0)
+
+/* Emit FP vector op 'op': q == 1 -> 128-bit form (f64 when 'size' is
+ * non-zero, f32 otherwise); any other q -> the 64-bit f32 form. */
+#define gen_vector_fop2(op)            \
+do {                                   \
+    TCGArg _rd = offsetof(CPUARMState, vfp.regs[rd]); \
+    TCGArg _rn = offsetof(CPUARMState, vfp.regs[rn]); \
+    TCGArg _rm = offsetof(CPUARMState, vfp.regs[rm]); \
+    TCGOpcode _opc = (q != 1) ? INDEX_op_##op##_f32_64 :  \
+                     (size)   ? INDEX_op_##op##_f64_128 : \
+                                INDEX_op_##op##_f32_128;  \
+    gen_vector_op3(_opc, _rd, _rn, _rm);              \
+} while (0)
+
+/* Emit the logical vector op 'op' on vfp.regs[rd], [rn] and [rm]: the
+ * 128-bit opcode when q == 1, the 64-bit one otherwise.  q, rd, rn and rm
+ * are taken from the scope in which the macro is expanded. */
+#define gen_vector_logical(op)         \
+do {                                   \
+    TCGArg _rd = offsetof(CPUARMState, vfp.regs[rd]); \
+    TCGArg _rn = offsetof(CPUARMState, vfp.regs[rn]); \
+    TCGArg _rm = offsetof(CPUARMState, vfp.regs[rm]); \
+    TCGOpcode _opc = (q == 1) ? INDEX_op_##op##_128   \
+                              : INDEX_op_##op##_64;   \
+    gen_vector_op3(_opc, _rd, _rn, _rm);              \
+} while (0)
+
+/* Emit the 128-bit op 'op' on vfp.regs[rd] and [rm]; 'size' is the 3rd operand. */
+#define gen_vector_cvt(op,size)        \
+do {                                   \
+    TCGArg _rd = offsetof(CPUARMState, vfp.regs[rd]); \
+    TCGArg _rm = offsetof(CPUARMState, vfp.regs[rm]); \
+    gen_vector_op3(INDEX_op_##op##_128, _rd, _rm, size); \
+} while (0)
+
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 14e8131..21cf214 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -37,10 +37,17 @@
 #include "exec/helper-gen.h"
 
 #include "trace-tcg.h"
+#include "hqemu.h"
 
 static TCGv_i64 cpu_X[32];
 static TCGv_i64 cpu_pc;
 
+#if defined(CONFIG_USER_ONLY)
+#define IS_USER(s) 1   /* user-only build: constant 1 */
+#else
+#define IS_USER(s) ((s)->user)   /* argument parenthesized so any expression is safe */
+#endif
+
 /* Load/store exclusive handling */
 static TCGv_i64 cpu_exclusive_high;
 
@@ -119,6 +126,31 @@ static inline ARMMMUIdx get_a64_user_mem_index(DisasContext *s)
     }
 }
 
+static inline void gen_ibtc_stub(DisasContext *s)
+{
+#ifdef ENABLE_IBTC
+    if (!build_llvm(s->env)) {   /* only when build_llvm() reports false */
+        TCGv_ptr target = tcg_temp_new_ptr();
+        gen_helper_lookup_ibtc(target, cpu_env);   /* helper fills in 'target' */
+        tcg_gen_op1i(INDEX_op_jmp, GET_TCGV_PTR(target)); /* jump through it */
+        s->gen_ibtc = 0;
+        tcg_temp_free_ptr(target);
+    }
+#endif
+}
+
+static inline void gen_cpbl_stub(DisasContext *s)
+{
+#ifdef ENABLE_CPBL
+    if (!build_llvm(s->env)) {   /* only when build_llvm() reports false */
+        TCGv_ptr target = tcg_temp_new_ptr();
+        gen_helper_lookup_cpbl(target, cpu_env);   /* helper fills in 'target' */
+        tcg_gen_op1i(INDEX_op_jmp, GET_TCGV_PTR(target)); /* jump through it */
+        tcg_temp_free_ptr(target);
+    }
+#endif
+}
+
 void aarch64_cpu_dump_state(CPUState *cs, FILE *f,
                             fprintf_function cpu_fprintf, int flags)
 {
@@ -285,12 +317,38 @@ static inline bool use_goto_tb(DisasContext *s, int n, uint64_t dest)
     return true;
 }
 
+#if defined(CONFIG_USER_ONLY)
+static inline void gen_goto_tb(DisasContext *s, int n, uint64_t dest)
+{
+    /* Always emit a chained jump through slot n; use_goto_tb() is not
+     * consulted here.  NOTE(review): confirm single-step needs no handling. */
+    TranslationBlock *cur_tb = s->tb;
+    tcg_gen_goto_tb(n);
+    gen_a64_set_pc_im(dest);
+    tcg_gen_exit_tb((intptr_t)cur_tb + n);
+    s->is_jmp = DISAS_TB_JUMP;
+    cur_tb->jmp_pc[n] = dest;   /* record the guest destination of slot n */
+}
+#else
+static int try_link_pages(DisasContext *s, TranslationBlock *tb, target_ulong dest)
+{
+#ifdef ENABLE_LPAGE
+    /* 1 iff lpt_search_page() returns 1 and tb->pc, rounded down to 'len', is 'base'. */
+    if (!build_llvm(s->env)) {
+        target_ulong base, len;
+        int hit = lpt_search_page(s->env, dest, &base, &len) == 1;
+        return hit && (tb->pc & ~(len - 1)) == base;
+    }
+#endif
+    return 0;
+}
+
 static inline void gen_goto_tb(DisasContext *s, int n, uint64_t dest)
 {
     TranslationBlock *tb;
 
     tb = s->tb;
-    if (use_goto_tb(s, n, dest)) {
+    if (use_goto_tb(s, n, dest) || try_link_pages(s, tb, dest) == 1) {
         tcg_gen_goto_tb(n);
         gen_a64_set_pc_im(dest);
         tcg_gen_exit_tb((intptr_t)tb + n);
@@ -302,11 +360,14 @@ static inline void gen_goto_tb(DisasContext *s, int n, uint64_t dest)
         } else if (s->singlestep_enabled) {
             gen_exception_internal(EXCP_DEBUG);
         } else {
+            gen_cpbl_stub(s);
             tcg_gen_exit_tb(0);
             s->is_jmp = DISAS_TB_JUMP;
         }
     }
+    tb->jmp_pc[n] = dest;
 }
+#endif
 
 static void unallocated_encoding(DisasContext *s)
 {
@@ -568,6 +629,7 @@ static void gen_add_CC(int sf, TCGv_i64 dest, TCGv_i64 t0, TCGv_i64 t1)
 
         tcg_gen_movi_i64(tmp, 0);
         tcg_gen_add2_i64(result, flag, t0, tmp, t1, tmp);
+        tcg_gen_annotate(A_SetCC);
 
         tcg_gen_extrl_i64_i32(cpu_CF, flag);
 
@@ -614,6 +676,7 @@ static void gen_sub_CC(int sf, TCGv_i64 dest, TCGv_i64 t0, TCGv_i64 t1)
         result = tcg_temp_new_i64();
         flag = tcg_temp_new_i64();
         tcg_gen_sub_i64(result, t0, t1);
+        tcg_gen_annotate(A_SetCC);
 
         gen_set_NZ64(result);
 
@@ -764,11 +827,51 @@ static void do_gpr_ld(DisasContext *s, TCGv_i64 dest, TCGv_i64 tcg_addr,
                      get_mem_index(s));
 }
 
+#ifdef ENABLE_TCG_VECTOR
+#include "simd_helper.h"
+
+#define VFP_DREG(reg) /* double a register index in place */ \
+do { \
+    (reg) = (reg) * 2; /* reg must be a modifiable lvalue */ \
+} while (0)
+#define tcg_vector_abort() /* report an unhandled vector op and terminate */ \
+do {\
+    fprintf(stderr, "%s:%d: tcg fatal error - unhandled vector op.\n", __FILE__, __LINE__);\
+    exit(1); /* fatal error: must not exit with a success status */\
+} while (0)
+
+/*
+ * disas_neon_ls_vector()
+ *  Returns 1 after emitting one 128-bit vector load/store op; 0 otherwise.
+ */
+static int disas_neon_ls_vector(DisasContext *s, int reg, int is_load,
+                                TCGv_i64 tcg_addr)
+{
+    TCGArg vop, alignment = 32; /* NOTE(review): unit of alignment unclear */
+
+    if (!build_llvm(s->env))    /* nothing emitted; caller takes its own path */
+        return 0;
+
+    VFP_DREG(reg);              /* reg *= 2 before indexing vfp.regs[] */
+    vop = (is_load) ? INDEX_op_vload_128 : INDEX_op_vstore_128;
+    gen_vector_op3(vop,
+                   offsetof(CPUARMState, vfp.regs[reg]),
+                   GET_TCGV_I64(tcg_addr),
+                   alignment);
+    return 1;
+}
+#endif
+
 /*
  * Store from FP register to memory
  */
 static void do_fp_st(DisasContext *s, int srcidx, TCGv_i64 tcg_addr, int size)
 {
+#ifdef ENABLE_TCG_VECTOR
+    if (size >= 4 && disas_neon_ls_vector(s, srcidx, 0, tcg_addr) == 1)
+        return;
+#endif
+
     /* This writes the bottom N bits of a 128 bit wide vector to memory */
     TCGv_i64 tmp = tcg_temp_new_i64();
     tcg_gen_ld_i64(tmp, cpu_env, fp_reg_offset(s, srcidx, MO_64));
@@ -791,6 +894,11 @@ static void do_fp_st(DisasContext *s, int srcidx, TCGv_i64 tcg_addr, int size)
  */
 static void do_fp_ld(DisasContext *s, int destidx, TCGv_i64 tcg_addr, int size)
 {
+#ifdef ENABLE_TCG_VECTOR
+    if (size >= 4 && disas_neon_ls_vector(s, destidx, 1, tcg_addr) == 1)
+        return;
+#endif
+
     /* This always zero-extends and writes to a full 128 bit wide vector */
     TCGv_i64 tmplo = tcg_temp_new_i64();
     TCGv_i64 tmphi;
@@ -1653,6 +1761,7 @@ static void disas_uncond_b_reg(DisasContext *s, uint32_t insn)
     }
 
     s->is_jmp = DISAS_JUMP;
+    s->gen_ibtc = 1;
 }
 
 /* C3.2 Branches, exception generating and system instructions */
@@ -3624,6 +3733,8 @@ static void disas_cc(DisasContext *s, uint32_t insn)
     TCGv_i64 tcg_tmp, tcg_y, tcg_rn;
     DisasCompare c;
 
+    tcg_gen_annotate(A_NoSIMDization);
+
     if (!extract32(insn, 29, 1)) {
         unallocated_encoding(s);
         return;
@@ -8854,6 +8965,153 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)
     }
 }
 
+#ifdef ENABLE_TCG_VECTOR
+static int disas_neon_misc(DisasContext *s, uint32_t insn)
+{
+    /* Try to lower an AdvSIMD int<->fp conversion to a TCG vector op.
+     * Returns 1 when the instruction was fully handled here (vector op
+     * or FP access trap emitted), 0 when the caller must decode it.
+     */
+    if (!build_llvm(s->env))
+        return 0;
+
+    int size = extract32(insn, 22, 2);
+    int opcode = extract32(insn, 12, 5);
+    bool u = extract32(insn, 29, 1);
+    bool is_q = extract32(insn, 30, 1);
+    int rm = extract32(insn, 5, 5);
+    int rd = extract32(insn, 0, 5);
+
+    VFP_DREG(rm);
+    VFP_DREG(rd);
+
+    switch (opcode) {
+    case 0xc ... 0xf:
+    case 0x16 ... 0x1d:
+    case 0x1f:
+    {
+        /* Floating point: U, size[1] and opcode indicate operation;
+         * size[0] indicates single or double precision.
+         */
+        int is_double = extract32(size, 0, 1);
+        opcode |= (extract32(size, 1, 1) << 5) | (u << 6);
+        size = is_double ? 64 : 32;
+
+        switch (opcode) {
+        case 0x1d: /* SCVTF */
+        case 0x5d: /* UCVTF */
+        case 0x3b: /* FCVTZS */
+        case 0x7b: /* FCVTZU */
+            break;
+        default:
+            /* FCVT{N,M,A,P}{S,U} carry their own rounding mode, which
+             * vfptosi/vfptoui cannot express (no rounding argument is
+             * passed), so they are left to the caller's regular decoder.
+             */
+            return 0;
+        }
+        if (is_double && !is_q) {
+            /* Unallocated encoding: emit nothing here and return 0 so the
+             * caller's decoder deals with it, instead of raising it twice.
+             */
+            return 0;
+        }
+        if (!fp_access_check(s)) {
+            /* Trap emitted: report "handled" so the caller does not run
+             * fp_access_check() a second time. */
+            return 1;
+        }
+        switch (opcode) {
+        case 0x1d:
+            gen_vector_cvt(vsitofp, size);
+            break;
+        case 0x5d:
+            gen_vector_cvt(vuitofp, size);
+            break;
+        case 0x3b:
+            gen_vector_cvt(vfptosi, size);
+            break;
+        default: /* 0x7b */
+            gen_vector_cvt(vfptoui, size);
+            break;
+        }
+        break;
+    }
+    default:
+        return 0;
+    }
+
+    return 1;
+}
+
+/* disas_neon_data_vector(): returns 1 if the instruction was translated to
+ * a TCG vector op, 0 if the caller must translate it.  q, rm, rn and rd are
+ * not named below; presumably the gen_vector_* macros read them -- TODO
+ * confirm in simd_helper.h. */
+static int disas_neon_data_vector(DisasContext *s, uint32_t insn)
+{
+    if (!build_llvm(s->env))
+        return 0;
+
+    int q = extract32(insn, 30, 1);
+    int u = extract32(insn, 29, 1);
+    int size = extract32(insn, 22, 2);
+    int op = extract32(insn, 11, 5);
+    int rm = extract32(insn, 16, 5);
+    int rn = extract32(insn, 5, 5);
+    int rd = extract32(insn, 0, 5);
+
+    VFP_DREG(rm);   /* each register index is doubled by VFP_DREG */
+    VFP_DREG(rn);
+    VFP_DREG(rd);
+
+    switch(op) {
+    case 0x10: /* ADD, SUB */
+        if(!u) /* ADD */
+            gen_vector_arith(vadd, i, size);
+        else   /* SUB */
+            gen_vector_arith(vsub, i, size);
+        break;
+    case 0x3: /* logic ops */
+        switch ((u << 2) | size) { /* 3-bit selector: U in bit 2, size below */
+        case 0: gen_vector_logical(vand); break; /* AND */
+        case 1: gen_vector_logical(vbic); break; /* BIC  rd = rn&(~rm)*/
+        case 2: gen_vector_logical(vorr); break; /* ORR */
+        case 3: gen_vector_logical(vorn); break; /* ORN */
+        case 4: gen_vector_logical(veor); break; /* EOR */
+        case 5: gen_vector_logical(vbsl); break; /* BSL */
+        case 6: gen_vector_logical(vbit); break; /* BIT */
+        case 7: gen_vector_logical(vbif); break; /* BIF */
+        default: /* not reachable: the selector above is only 3 bits wide */
+            return 0;
+        }
+        break;
+    case 0x18 ... 0x31: /* op is a 5-bit field, so only 0x18..0x1f can occur */
+    {
+        int fpopcode = extract32(insn, 11, 5)
+            | (extract32(insn, 23, 1) << 5)
+            | (extract32(insn, 29, 1) << 6);
+        int size = extract32(insn, 22, 1); /* shadows the outer 2-bit size */
+        switch (fpopcode) {
+        case 0x1a: gen_vector_fop2(vadd); break; /* FADD */
+        case 0x3a: gen_vector_fop2(vsub); break; /* FSUB */
+        case 0x5b: gen_vector_fop2(vmul); break; /* FMUL */
+        case 0x5f: gen_vector_fop2(vdiv); break; /* FDIV */
+        case 0x19: gen_vector_fop2(vmla); break; /* FMLA */
+        case 0x39: gen_vector_fop2(vmls); break; /* FMLS */
+        default:   /* any other FP opcode is left to the caller */
+            return 0;
+        }
+        break;
+    }
+    default:
+        return 0;
+    }
+
+    return 1;
+}
+#endif
+
 /* Logic op (opcode == 3) subgroup of C3.6.16. */
 static void disas_simd_3same_logic(DisasContext *s, uint32_t insn)
 {
@@ -8870,6 +9128,11 @@ static void disas_simd_3same_logic(DisasContext *s, uint32_t insn)
         return;
     }
 
+#ifdef ENABLE_TCG_VECTOR
+    if (disas_neon_data_vector(s, insn) == 1)
+        return;
+#endif
+
     tcg_op1 = tcg_temp_new_i64();
     tcg_op2 = tcg_temp_new_i64();
     tcg_res[0] = tcg_temp_new_i64();
@@ -9138,6 +9401,11 @@ static void disas_simd_3same_float(DisasContext *s, uint32_t insn)
         return;
     }
 
+#ifdef ENABLE_TCG_VECTOR
+    if (disas_neon_data_vector(s, insn) == 1)
+        return;
+#endif
+
     switch (fpopcode) {
     case 0x58: /* FMAXNMP */
     case 0x5a: /* FADDP */
@@ -9232,6 +9500,11 @@ static void disas_simd_3same_int(DisasContext *s, uint32_t insn)
         return;
     }
 
+#ifdef ENABLE_TCG_VECTOR
+    if (disas_neon_data_vector(s, insn) == 1)
+        return;
+#endif
+
     if (size == 3) {
         assert(is_q);
         for (pass = 0; pass < 2; pass++) {
@@ -9778,6 +10051,11 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
     TCGv_i32 tcg_rmode;
     TCGv_ptr tcg_fpstatus;
 
+#ifdef ENABLE_TCG_VECTOR
+    if (disas_neon_misc(s, insn) == 1)
+        return;
+#endif
+
     switch (opcode) {
     case 0x0: /* REV64, REV32 */
     case 0x1: /* REV16 */
@@ -11018,6 +11296,8 @@ void gen_intermediate_code_a64(ARMCPU *cpu, TranslationBlock *tb)
 
     pc_start = tb->pc;
 
+    dc->gen_ibtc = 0;
+    dc->env = env;
     dc->tb = tb;
 
     dc->is_jmp = DISAS_NEXT;
@@ -11078,7 +11358,12 @@ void gen_intermediate_code_a64(ARMCPU *cpu, TranslationBlock *tb)
         max_insns = TCG_MAX_INSNS;
     }
 
-    gen_tb_start(tb);
+    if (!build_llvm(env)) {
+        gen_tb_start(tb);
+        if (tracer_mode != TRANS_MODE_NONE)
+            tcg_gen_hotpatch(IS_USER(dc), tracer_mode == TRANS_MODE_HYBRIDS ||
+                                          tracer_mode == TRANS_MODE_HYBRIDM);
+    }
 
     tcg_clear_temp_count();
 
@@ -11144,6 +11429,9 @@ void gen_intermediate_code_a64(ARMCPU *cpu, TranslationBlock *tb)
          * Also stop translation when a page boundary is reached.  This
          * ensures prefetch aborts occur at the right place.
          */
+
+        if (build_llvm(env) && num_insns == tb->icount)
+            break;
     } while (!dc->is_jmp && !tcg_op_buf_full() &&
              !cs->singlestep_enabled &&
              !singlestep &&
@@ -11155,6 +11443,15 @@ void gen_intermediate_code_a64(ARMCPU *cpu, TranslationBlock *tb)
         gen_io_end();
     }
 
+    if (build_llvm(env) && tb->size != dc->pc - pc_start) {
+        /* consistency check with tb info. we must make sure
+         * guest basic blocks are the same. skip this trace if inconsistent */
+        fprintf(stderr, "inconsistent block with pc 0x"TARGET_FMT_lx" size=%d"
+                " icount=%d (error size="TARGET_FMT_ld")\n",
+                tb->pc, tb->size, tb->icount, dc->pc - pc_start);
+        exit(0);
+    }
+
     if (unlikely(cs->singlestep_enabled || dc->ss_active)
         && dc->is_jmp != DISAS_EXC) {
         /* Note that this means single stepping WFI doesn't halt the CPU.
@@ -11182,6 +11479,8 @@ void gen_intermediate_code_a64(ARMCPU *cpu, TranslationBlock *tb)
             /* fall through */
         case DISAS_JUMP:
             /* indicate that the hash table must be used to find the next TB */
+            if (dc->gen_ibtc == 1)
+                gen_ibtc_stub(dc);
             tcg_gen_exit_tb(0);
             break;
         case DISAS_TB_JUMP:
@@ -11211,10 +11510,15 @@ void gen_intermediate_code_a64(ARMCPU *cpu, TranslationBlock *tb)
     }
 
 done_generating:
-    gen_tb_end(tb, num_insns);
+    if (build_llvm(env)) {
+        /* Terminate the linked list.  */
+        tcg_ctx.gen_op_buf[tcg_ctx.gen_last_op_idx].next = -1;
+    } else {
+        gen_tb_end(tb, num_insns);
+    }
 
 #ifdef DEBUG_DISAS
-    if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)) {
+    if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM) && !build_llvm(env)) {
         qemu_log("----------------\n");
         qemu_log("IN: %s\n", lookup_symbol(pc_start));
         log_target_disas(cs, pc_start, dc->pc - pc_start,
@@ -11222,6 +11526,8 @@ done_generating:
         qemu_log("\n");
     }
 #endif
-    tb->size = dc->pc - pc_start;
-    tb->icount = num_insns;
+    if (!build_llvm(env)) {
+        tb->size = dc->pc - pc_start;
+        tb->icount = num_insns;
+    }
 }
diff --git a/target-arm/translate.c b/target-arm/translate.c
index 5d22879..256227b 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -36,6 +36,7 @@
 #include "exec/helper-gen.h"
 
 #include "trace-tcg.h"
+#include "hqemu.h"
 
 
 #define ENABLE_ARCH_4T    arm_dc_feature(s, ARM_FEATURE_V4T)
@@ -110,6 +111,33 @@ void arm_translate_init(void)
 #endif
 
     a64_translate_init();
+
+    copy_tcg_context_global();
+}
+
+static inline void gen_ibtc_stub(DisasContext *s)
+{
+#ifdef ENABLE_IBTC
+    /* Emit a lookup_ibtc helper call and an INDEX_op_jmp on its result. */
+    if (build_llvm(s->env)) return;   /* emit nothing in this mode */
+    TCGv_ptr host_pc = tcg_temp_new_ptr();
+    gen_helper_lookup_ibtc(host_pc, cpu_env);
+    tcg_gen_op1i(INDEX_op_jmp, GET_TCGV_PTR(host_pc));
+    tcg_temp_free_ptr(host_pc);
+    s->gen_ibtc = 0;                  /* clear the request flag */
+#endif
+}
+
+static inline void gen_cpbl_stub(DisasContext *s)
+{
+#ifdef ENABLE_CPBL
+    /* Emit a lookup_cpbl helper call and an INDEX_op_jmp on its result. */
+    if (build_llvm(s->env)) return;   /* emit nothing in this mode */
+    TCGv_ptr host_pc = tcg_temp_new_ptr();
+    gen_helper_lookup_cpbl(host_pc, cpu_env);
+    tcg_gen_op1i(INDEX_op_jmp, GET_TCGV_PTR(host_pc));
+    tcg_temp_free_ptr(host_pc);
+#endif
 }
 
 static inline ARMMMUIdx get_a32_user_mem_index(DisasContext *s)
@@ -201,7 +229,10 @@ static void store_reg(DisasContext *s, int reg, TCGv_i32 var)
 static inline void gen_set_cpsr(TCGv_i32 var, uint32_t mask)
 {
     TCGv_i32 tmp_mask = tcg_const_i32(mask);
-    gen_helper_cpsr_write(cpu_env, var, tmp_mask);
+    if (mask & ~CPSR_NZCV) /* mask selects at least one bit outside CPSR_NZCV */
+        gen_helper_cpsr_write(cpu_env, var, tmp_mask);
+    else                   /* mask is confined to CPSR_NZCV (or is zero) */
+        gen_helper_cpsr_write_nzcv(cpu_env, var, tmp_mask);
     tcg_temp_free_i32(tmp_mask);
 }
 /* Set NZCV flags from the high 4 bits of var.  */
@@ -493,6 +524,7 @@ static void gen_sub_CC(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1)
 {
     TCGv_i32 tmp;
     tcg_gen_sub_i32(cpu_NF, t0, t1);
+    tcg_gen_annotate(A_SetCC);
     tcg_gen_mov_i32(cpu_ZF, cpu_NF);
     tcg_gen_setcond_i32(TCG_COND_GEU, cpu_CF, t0, t1);
     tcg_gen_xor_i32(cpu_VF, cpu_NF, t0);
@@ -878,6 +910,7 @@ static inline void gen_bx_im(DisasContext *s, uint32_t addr)
         tcg_temp_free_i32(tmp);
     }
     tcg_gen_movi_i32(cpu_R[15], addr & ~1);
+    s->gen_ibtc = 1;
 }
 
 /* Set PC and Thumb state from var.  var is marked as dead.  */
@@ -887,6 +920,7 @@ static inline void gen_bx(DisasContext *s, TCGv_i32 var)
     tcg_gen_andi_i32(cpu_R[15], var, ~1);
     tcg_gen_andi_i32(var, var, 1);
     store_cpu_field(var, thumb);
+    s->gen_ibtc = 1;
 }
 
 /* Variant of store_reg which uses branch&exchange logic when storing
@@ -1199,20 +1233,38 @@ static inline void gen_vfp_sqrt(int dp)
         gen_helper_vfp_sqrts(cpu_F0s, cpu_F0s, cpu_env);
 }
 
+static inline void gen_update_fpscr(TCGv_i32 flags)
+{
+    /* Clear FPSCR[31:28] and OR in flags; flags itself is left untouched. */
+    TCGv_i32 tmp = load_cpu_field(vfp.xregs[ARM_VFP_FPSCR]);
+    tcg_gen_andi_i32(tmp, tmp, 0x0fffffff);
+    tcg_gen_or_i32(tmp, tmp, flags);
+    /* store_cpu_field() releases tmp itself; freeing it here again would
+     * be a double free of the temporary. */
+    store_cpu_field(tmp, vfp.xregs[ARM_VFP_FPSCR]);
+}
+
 static inline void gen_vfp_cmp(int dp)
 {
+    TCGv_i32 flags = tcg_temp_new_i32(); /* temp handed to the compare helper */
     if (dp)
-        gen_helper_vfp_cmpd(cpu_F0d, cpu_F1d, cpu_env);
+        gen_helper_vfp_cmpd(flags, cpu_F0d, cpu_F1d, cpu_env);
     else
-        gen_helper_vfp_cmps(cpu_F0s, cpu_F1s, cpu_env);
+        gen_helper_vfp_cmps(flags, cpu_F0s, cpu_F1s, cpu_env);
+
+    gen_update_fpscr(flags);             /* merge flags into FPSCR */
+    tcg_temp_free_i32(flags);
 }
 
 static inline void gen_vfp_cmpe(int dp)
 {
+    TCGv_i32 flags = tcg_temp_new_i32(); /* temp handed to the compare helper */
     if (dp)
-        gen_helper_vfp_cmped(cpu_F0d, cpu_F1d, cpu_env);
+        gen_helper_vfp_cmped(flags, cpu_F0d, cpu_F1d, cpu_env);
     else
-        gen_helper_vfp_cmpes(cpu_F0s, cpu_F1s, cpu_env);
+        gen_helper_vfp_cmpes(flags, cpu_F0s, cpu_F1s, cpu_env);
+
+    gen_update_fpscr(flags);             /* merge flags into FPSCR */
+    tcg_temp_free_i32(flags);
 }
 
 static inline void gen_vfp_F1_ld0(int dp)
@@ -3977,20 +4029,49 @@ static int disas_vfp_insn(DisasContext *s, uint32_t insn)
     return 0;
 }
 
+#if defined(CONFIG_USER_ONLY)
+static inline void gen_goto_tb(DisasContext *s, int n, target_ulong dest)
+{
+    /* Always chain through jump slot n, then record dest in jmp_pc[n]. */
+    TranslationBlock *cur = s->tb;
+
+    tcg_gen_goto_tb(n);
+    gen_set_pc_im(s, dest);
+    tcg_gen_exit_tb((uintptr_t)cur + n);
+    cur->jmp_pc[n] = dest;
+}
+#else
+static int try_link_pages(DisasContext *s, TranslationBlock *tb, target_ulong dest)
+{
+#ifdef ENABLE_LPAGE
+    /* 1 iff dest is found and tb->pc rounds down (by size) to its address. */
+    target_ulong base, len;
+
+    if (!build_llvm(s->env) && lpt_search_page(s->env, dest, &base, &len) == 1
+        && (tb->pc & ~(len - 1)) == base)
+        return 1;
+#endif
+    return 0;
+}
+
 static inline void gen_goto_tb(DisasContext *s, int n, target_ulong dest)
 {
     TranslationBlock *tb;
 
     tb = s->tb;
-    if ((tb->pc & TARGET_PAGE_MASK) == (dest & TARGET_PAGE_MASK)) {
+    if ((tb->pc & TARGET_PAGE_MASK) == (dest & TARGET_PAGE_MASK) ||
+        try_link_pages(s, tb, dest) == 1) { /* same page, or try_link_pages ok */
         tcg_gen_goto_tb(n);
         gen_set_pc_im(s, dest);
         tcg_gen_exit_tb((uintptr_t)tb + n);
     } else {
         gen_set_pc_im(s, dest);
+        gen_cpbl_stub(s);     /* stub is emitted ahead of the unchained exit */
         tcg_gen_exit_tb(0);
     }
+    tb->jmp_pc[n] = dest;     /* recorded on both the chained and exit paths */
 }
+#endif
 
 static inline void gen_jmp (DisasContext *s, uint32_t dest)
 {
@@ -4372,6 +4453,54 @@ static struct {
     {2, 1, 1}
 };
 
+#ifdef ENABLE_TCG_VECTOR
+#include "simd_helper.h"
+
+#define tcg_vector_abort() /* report an unhandled vector op and terminate */ \
+do {\
+    fprintf(stderr, "%s:%d: tcg fatal error - unhandled vector op.\n", __FILE__, __LINE__);\
+    exit(1); /* fatal error: must not exit with a success status */\
+} while (0)
+
+/*
+ * disas_neon_ls_vector()
+ *  Returns 1 if 128-bit vector load/store ops were emitted, else 0.
+ */
+static int disas_neon_ls_vector(DisasContext *s, uint32_t insn, TCGv_i32 addr)
+{
+    int rd, op, load;
+    int nregs, reg;
+    int interleave, spacing;
+    TCGArg vop, alignment = 32; /* NOTE(review): unit of alignment unclear */
+
+    if (!build_llvm(s->env))
+        return 0;
+
+    /* Load store all elements.  */
+    op = (insn >> 8) & 0xf;
+    nregs = neon_ls_element_type[op].nregs;
+    interleave = neon_ls_element_type[op].interleave;
+    spacing = neon_ls_element_type[op].spacing;
+
+    if (interleave != 1 || nregs % 2 != 0) /* need interleave 1, even nregs */
+        return 0;
+
+    VFP_DREG_D(rd, insn);                  /* rd is set from insn */
+    load = (insn & (1 << 21)) != 0;        /* bit 21 selects load vs. store */
+    vop = (load) ? INDEX_op_vload_128 : INDEX_op_vstore_128;
+
+    for (reg = 0; reg < nregs; reg += 2) { /* one 128-bit op per two regs */
+        gen_vector_op3(vop,
+                       offsetof(CPUARMState, vfp.regs[rd]),
+                       GET_TCGV_I32(addr),
+                       alignment);
+        rd += spacing * 2;
+        tcg_gen_addi_i32(addr, addr, 16);  /* advances the caller's addr temp */
+    }
+    return 1;
+}
+#endif
+
 /* Translate a NEON load/store element instruction.  Return nonzero if the
    instruction is invalid.  */
 static int disas_neon_ls_insn(DisasContext *s, uint32_t insn)
@@ -4438,6 +4567,11 @@ static int disas_neon_ls_insn(DisasContext *s, uint32_t insn)
         addr = tcg_temp_new_i32();
         load_reg_var(s, addr, rn);
         stride = (1 << size) * interleave;
+
+#ifdef ENABLE_TCG_VECTOR
+        if (disas_neon_ls_vector(s, insn, addr) == 1)
+            goto vector_done;
+#endif
         for (reg = 0; reg < nregs; reg++) {
             if (interleave > 2 || (interleave == 2 && nregs == 2)) {
                 load_reg_var(s, addr, rn);
@@ -4529,6 +4663,9 @@ static int disas_neon_ls_insn(DisasContext *s, uint32_t insn)
             }
             rd += spacing;
         }
+#ifdef ENABLE_TCG_VECTOR
+vector_done:
+#endif
         tcg_temp_free_i32(addr);
         stride = nregs * 8;
     } else {
@@ -5111,6 +5248,131 @@ static const uint8_t neon_2rm_sizes[] = {
     [NEON_2RM_VCVT_UF] = 0x4,
 };
 
+#ifdef ENABLE_TCG_VECTOR
+static int disas_neon_misc(DisasContext *s, uint32_t insn)
+{
+    int op, rd, rm; /* rd/rm: presumably read by gen_vector_cvt -- confirm */
+    /* Returns 1 if a VCVT was emitted as a TCG vector op, else 0. */
+    if (!build_llvm(s->env))
+        return 0;
+
+    op = ((insn >> 12) & 0x30) | ((insn >> 7) & 0xf);
+    VFP_DREG_D(rd, insn);
+    VFP_DREG_M(rm, insn);
+
+    switch (op) {
+    case NEON_2RM_VCVT_FS: /* VCVT.F32.S32 */
+        gen_vector_cvt(vsitofp, 32);
+        break;
+    case NEON_2RM_VCVT_FU: /* VCVT.F32.U32 */
+        gen_vector_cvt(vuitofp, 32);
+        break;
+    case NEON_2RM_VCVT_SF: /* VCVT.S32.F32 */
+        gen_vector_cvt(vfptosi, 32);
+        break;
+    case NEON_2RM_VCVT_UF: /* VCVT.U32.F32 */
+        gen_vector_cvt(vfptoui, 32);
+        break;
+    default:               /* every other two-reg-misc op: caller handles it */
+        return 0;
+    }
+
+    return 1;
+}
+
+/*
+ * disas_neon_data_vector()
+ *  Returns 1 if the instruction was translated to a TCG vector op, else 0.
+ */
+static int disas_neon_data_vector(DisasContext *s, uint32_t insn)
+{
+    int op, q, u, size;
+    int rd, rn, rm; /* q/rd/rn/rm: presumably read by gen_vector_* macros */
+
+    if (!build_llvm(s->env))
+        return 0;
+
+    /* Three register same length.  */
+    q = (insn & (1 << 6)) != 0;
+    u = (insn >> 24) & 1;
+    VFP_DREG_D(rd, insn);
+    VFP_DREG_N(rn, insn);
+    VFP_DREG_M(rm, insn);
+    size = (insn >> 20) & 3;
+    op = ((insn >> 7) & 0x1e) | ((insn >> 4) & 1);
+
+    switch (op) { /* NOTE(review): none of these ops has a case below */
+    case NEON_3R_VSHL:
+    case NEON_3R_VQSHL:
+    case NEON_3R_VRSHL:
+    case NEON_3R_VQRSHL:
+        {
+            int rtmp;
+            /* Shift instruction operands are reversed.  */
+            rtmp = rn;
+            rn = rm;
+            rm = rtmp;
+        }
+        break;
+    default:
+        break;
+    }
+
+    switch(op) {
+    case NEON_3R_VADD_VSUB:
+        if(!u) /* VADD */
+            gen_vector_arith(vadd, i, size);
+        else   /* VSUB */
+            gen_vector_arith(vsub, i, size);
+        break;
+    case NEON_3R_LOGIC:
+        switch ((u << 2) | size) { /* 3-bit selector, all 8 values covered */
+        case 0: gen_vector_logical(vand); break; /* VAND */
+        case 1: gen_vector_logical(vbic); break; /* BIC  rd = rn&(~rm)*/
+        case 2: gen_vector_logical(vorr); break; /* VORR */
+        case 3: gen_vector_logical(vorn); break; /* VORN OR NOT */
+        case 4: gen_vector_logical(veor); break; /* VEOR Vector Bitwise Exclusive OR*/
+        case 5: gen_vector_logical(vbsl); break; /* VBSL */
+        case 6: gen_vector_logical(vbit); break; /* VBIT */
+        case 7: gen_vector_logical(vbif); break; /* VBIF */
+        }
+        break;
+    case NEON_3R_VFM:
+        if (size) /* VFMS */
+            gen_vector_fop(vfms);
+        else      /* VFMA */
+            gen_vector_fop(vfma);
+        break;
+    case NEON_3R_FLOAT_ARITH: /* Floating point arithmetic. */
+        switch ((u << 2) | size) {
+        case 0: gen_vector_fop(vadd);  break; /* VADD */
+        case 4: gen_vector_fop(vpadd); break; /* VPADD */
+        case 2: gen_vector_fop(vsub);  break; /* VSUB */
+        case 6: gen_vector_fop(vabd);  break; /* VABD */
+        default: /* any other U/size combination terminates the process */
+            tcg_vector_abort();
+            break;
+        }
+        break;
+    case NEON_3R_FLOAT_MULTIPLY: /* float VMLA, VMLS, VMUL */
+        /* U set: VMUL.  U clear: size == 0 selects VMLA, any other size
+         * selects VMLS.  No other outcome is possible for a 0/1 value. */
+        if (u) {
+            gen_vector_fop(vmul);
+        } else if (size == 0) {
+            gen_vector_fop(vmla);
+        } else {
+            gen_vector_fop(vmls);
+        }
+        break;
+    default:
+        return 0;
+    }
+
+    return 1;
+}
+#endif
+
 /* Translate a NEON data processing instruction.  Return nonzero if the
    instruction is invalid.
    We process data in a mixture of 32-bit and 64-bit chunks.
@@ -5341,6 +5603,11 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
             return 1;
         }
 
+#ifdef ENABLE_TCG_VECTOR
+        if (!pairwise && disas_neon_data_vector(s, insn) == 1)
+            return 0;
+#endif
+
         for (pass = 0; pass < (q ? 4 : 2); pass++) {
 
         if (pairwise) {
@@ -6741,6 +7008,10 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
                     break;
                 default:
                 elementwise:
+#ifdef ENABLE_TCG_VECTOR
+                    if (disas_neon_misc(s, insn) == 1)
+                        return 0;
+#endif
                     for (pass = 0; pass < (q ? 4 : 2); pass++) {
                         if (neon_2rm_is_float_op(op)) {
                             tcg_gen_ld_f32(cpu_F0s, cpu_env,
@@ -11234,6 +11505,8 @@ void gen_intermediate_code(CPUARMState *env, TranslationBlock *tb)
 
     pc_start = tb->pc;
 
+    dc->gen_ibtc = 0;
+    dc->env = env;
     dc->tb = tb;
 
     dc->is_jmp = DISAS_NEXT;
@@ -11303,7 +11576,12 @@ void gen_intermediate_code(CPUARMState *env, TranslationBlock *tb)
         max_insns = TCG_MAX_INSNS;
     }
 
-    gen_tb_start(tb);
+    if (!build_llvm(env)) {
+        gen_tb_start(tb);
+        if (tracer_mode != TRANS_MODE_NONE)
+            tcg_gen_hotpatch(IS_USER(dc), tracer_mode == TRANS_MODE_HYBRIDS ||
+                                          tracer_mode == TRANS_MODE_HYBRIDM);
+    }
 
     tcg_clear_temp_count();
 
@@ -11460,6 +11738,12 @@ void gen_intermediate_code(CPUARMState *env, TranslationBlock *tb)
         end_of_page = (dc->pc >= next_page_start) ||
             ((dc->pc >= next_page_start - 3) && insn_crosses_page(env, dc));
 
+#if defined(CONFIG_LLVM) && defined(CONFIG_USER_ONLY)
+        if (llvm_has_annotation(dc->pc, ANNOTATION_LOOP))
+            break;
+#endif
+        if (build_llvm(env) && num_insns == tb->icount)
+            break;
     } while (!dc->is_jmp && !tcg_op_buf_full() &&
              !cs->singlestep_enabled &&
              !singlestep &&
@@ -11476,6 +11760,15 @@ void gen_intermediate_code(CPUARMState *env, TranslationBlock *tb)
         gen_io_end();
     }
 
+    if (build_llvm(env) && tb->size != dc->pc - pc_start) {
+        /* consistency check with tb info. we must make sure
+         * guest basic blocks are the same. skip this trace if inconsistent */
+        fprintf(stderr, "inconsistent block with pc 0x"TARGET_FMT_lx" size=%d"
+                " icount=%d (error size="TARGET_FMT_ld")\n",
+                tb->pc, tb->size, tb->icount, dc->pc - pc_start);
+        exit(0);
+    }
+
     /* At this stage dc->condjmp will only be set when the skipped
        instruction was a conditional branch or trap, and the PC has
        already been written.  */
@@ -11543,6 +11836,8 @@ void gen_intermediate_code(CPUARMState *env, TranslationBlock *tb)
         case DISAS_JUMP:
         default:
             /* indicate that the hash table must be used to find the next TB */
+            if (dc->gen_ibtc == 1)
+                gen_ibtc_stub(dc);
             tcg_gen_exit_tb(0);
             break;
         case DISAS_TB_JUMP:
@@ -11581,10 +11876,15 @@ void gen_intermediate_code(CPUARMState *env, TranslationBlock *tb)
     }
 
 done_generating:
-    gen_tb_end(tb, num_insns);
+    if (build_llvm(env)) {
+        /* Terminate the linked list.  */
+        tcg_ctx.gen_op_buf[tcg_ctx.gen_last_op_idx].next = -1;
+    } else {
+        gen_tb_end(tb, num_insns);
+    }
 
 #ifdef DEBUG_DISAS
-    if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)) {
+    if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM) && !build_llvm(env)) {
         qemu_log("----------------\n");
         qemu_log("IN: %s\n", lookup_symbol(pc_start));
         log_target_disas(cs, pc_start, dc->pc - pc_start,
@@ -11592,8 +11892,10 @@ done_generating:
         qemu_log("\n");
     }
 #endif
-    tb->size = dc->pc - pc_start;
-    tb->icount = num_insns;
+    if (!build_llvm(env)) {
+        tb->size = dc->pc - pc_start;
+        tb->icount = num_insns;
+    }
 }
 
 static const char *cpu_mode_names[16] = {
diff --git a/target-arm/translate.h b/target-arm/translate.h
index 53ef971..10f6a05 100644
--- a/target-arm/translate.h
+++ b/target-arm/translate.h
@@ -61,6 +61,8 @@ typedef struct DisasContext {
 #define TMP_A64_MAX 16
     int tmp_a64_count;
     TCGv_i64 tmp_a64[TMP_A64_MAX];
+    int gen_ibtc;
+    CPUArchState *env;
 } DisasContext;
 
 typedef struct DisasCompare {
diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index 84edfd0..cbd8b2a 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -845,7 +845,7 @@ typedef struct CPUX86State {
     uint64_t efer;
 
     /* Beginning of state preserved by INIT (dummy marker).  */
-    struct {} start_init_save;
+    struct { int dummy; } start_init_save;
 
     /* FPU state */
     unsigned int fpstt; /* top of stack index */
@@ -865,8 +865,8 @@ typedef struct CPUX86State {
     float_status mmx_status; /* for 3DNow! float ops */
     float_status sse_status;
     uint32_t mxcsr;
-    XMMReg xmm_regs[CPU_NB_REGS == 8 ? 8 : 32];
-    XMMReg xmm_t0;
+    XMMReg xmm_regs[CPU_NB_REGS == 8 ? 8 : 32] __attribute__((aligned(64)));
+    XMMReg xmm_t0 __attribute__((aligned(64)));
     MMXReg mmx_t0;
 
     uint64_t opmask_regs[NB_OPMASK_REGS];
@@ -906,7 +906,7 @@ typedef struct CPUX86State {
     uint32_t smbase;
 
     /* End of state preserved by INIT (dummy marker).  */
-    struct {} end_init_save;
+    struct { int dummy; } end_init_save;
 
     uint64_t system_time_msr;
     uint64_t wall_clock_msr;
@@ -966,6 +966,8 @@ typedef struct CPUX86State {
     uint64_t mtrr_deftype;
     MTRRVar mtrr_var[MSR_MTRRcap_VCNT];
 
+    CPU_OPTIMIZATION_COMMON
+
     /* For KVM */
     uint32_t mp_state;
     int32_t exception_injected;
@@ -1237,6 +1239,19 @@ static inline void cpu_get_tb_cpu_state(CPUX86State *env, target_ulong *pc,
         (env->eflags & (IOPL_MASK | TF_MASK | RF_MASK | VM_MASK | AC_MASK));
 }
 
+static inline target_ulong cpu_get_pc(CPUX86State *env)
+{
+    return env->segs[R_CS].base + env->eip; /* CS segment base plus EIP */
+}
+
+static inline int cpu_check_state(CPUX86State *env,
+                                  target_ulong cs_base, int flags)
+{
+    const int keep = IOPL_MASK | TF_MASK | RF_MASK | VM_MASK | AC_MASK;
+    return env->segs[R_CS].base == cs_base &&   /* same CS base ... */
+           (env->hflags | (env->eflags & keep)) == (uint32_t)flags;
+}
+
 void do_cpu_init(X86CPU *cpu);
 void do_cpu_sipi(X86CPU *cpu);
 
@@ -1297,7 +1312,9 @@ static inline void cpu_load_efer(CPUX86State *env, uint64_t val)
 
 static inline MemTxAttrs cpu_get_mem_attrs(CPUX86State *env)
 {
-    return ((MemTxAttrs) { .secure = (env->hflags & HF_SMM_MASK) != 0 });
+    MemTxAttrs attrs = { 0 };
+    attrs.secure = (env->hflags & HF_SMM_MASK) != 0;
+    return attrs;
 }
 
 /* fpu_helper.c */
diff --git a/target-i386/fpu_helper.c b/target-i386/fpu_helper.c
index d421a47..4f50cd9 100644
--- a/target-i386/fpu_helper.c
+++ b/target-i386/fpu_helper.c
@@ -385,7 +385,7 @@ void helper_fxchg_ST0_STN(CPUX86State *env, int st_index)
 
 /* FPU operations */
 
-static const int fcom_ccval[4] = {0x0100, 0x4000, 0x0000, 0x4500};
+const int fcom_ccval[4] = {0x0100, 0x4000, 0x0000, 0x4500};
 
 void helper_fcom_ST0_FT0(CPUX86State *env)
 {
diff --git a/target-i386/helper.c b/target-i386/helper.c
index d18be95..4bc1e13 100644
--- a/target-i386/helper.c
+++ b/target-i386/helper.c
@@ -25,6 +25,7 @@
 #include "monitor/monitor.h"
 #include "hw/i386/apic_internal.h"
 #endif
+#include "hqemu.h"
 
 static void cpu_x86_version(CPUX86State *env, int *family, int *model)
 {
@@ -641,6 +642,8 @@ void cpu_x86_update_cr3(CPUX86State *env, target_ulong new_cr3)
                         "CR3 update: CR3=" TARGET_FMT_lx "\n", new_cr3);
         tlb_flush(CPU(cpu), 0);
     }
+
+    pcid = new_cr3 >> 12;
 }
 
 void cpu_x86_update_cr4(CPUX86State *env, uint32_t new_cr4)
@@ -1432,3 +1435,12 @@ void x86_stq_phys(CPUState *cs, hwaddr addr, uint64_t val)
                       NULL);
 }
 #endif
+
+CPUState *cpu_create(void)
+{
+    /* Byte-copy first_cpu into a fresh X86CPU, then repoint its env_ptr. */
+    X86CPU *clone = g_malloc0(sizeof(*clone));
+    memcpy(clone, X86_CPU(first_cpu), sizeof(*clone));
+    CPU(clone)->env_ptr = &clone->env;
+    return CPU(clone);
+}
diff --git a/target-i386/helper.h b/target-i386/helper.h
index ecfcfd1..8fbdde6 100644
--- a/target-i386/helper.h
+++ b/target-i386/helper.h
@@ -219,3 +219,6 @@ DEF_HELPER_3(rcrl, tl, env, tl, tl)
 DEF_HELPER_3(rclq, tl, env, tl, tl)
 DEF_HELPER_3(rcrq, tl, env, tl, tl)
 #endif
+
+#include "hqemu-helper.h"
+#include "atomic-helper.h"
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 36fa3f0..2639ba5 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -2786,13 +2786,13 @@ static int kvm_handle_debug(X86CPU *cpu,
                     case 0x1:
                         ret = EXCP_DEBUG;
                         cs->watchpoint_hit = &hw_watchpoint;
-                        hw_watchpoint.vaddr = hw_breakpoint[n].addr;
+                        hw_watchpoint.addr = hw_breakpoint[n].addr;
                         hw_watchpoint.flags = BP_MEM_WRITE;
                         break;
                     case 0x3:
                         ret = EXCP_DEBUG;
                         cs->watchpoint_hit = &hw_watchpoint;
-                        hw_watchpoint.vaddr = hw_breakpoint[n].addr;
+                        hw_watchpoint.addr = hw_breakpoint[n].addr;
                         hw_watchpoint.flags = BP_MEM_ACCESS;
                         break;
                     }
diff --git a/target-i386/misc_helper.c b/target-i386/misc_helper.c
index 13bd4f5..b446daa 100644
--- a/target-i386/misc_helper.c
+++ b/target-i386/misc_helper.c
@@ -599,3 +599,7 @@ void helper_debug(CPUX86State *env)
     cs->exception_index = EXCP_DEBUG;
     cpu_loop_exit(cs);
 }
+
+#ifdef CONFIG_COREMU
+#include "atomic-x86.c"
+#endif
diff --git a/target-i386/ops_sse.h b/target-i386/ops_sse.h
index 1780d1d..4a96ed7 100644
--- a/target-i386/ops_sse.h
+++ b/target-i386/ops_sse.h
@@ -995,7 +995,7 @@ SSE_HELPER_CMP(cmpnlt, FPU_CMPNLT)
 SSE_HELPER_CMP(cmpnle, FPU_CMPNLE)
 SSE_HELPER_CMP(cmpord, FPU_CMPORD)
 
-static const int comis_eflags[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
+const int comis_eflags[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
 
 void helper_ucomiss(CPUX86State *env, Reg *d, Reg *s)
 {
diff --git a/target-i386/simd_helper.h b/target-i386/simd_helper.h
new file mode 100644
index 0000000..dce0d59
--- /dev/null
+++ b/target-i386/simd_helper.h
@@ -0,0 +1,65 @@
+
+/* Emit opc, then append its three operands at tcg_ctx.vec_opparam_ptr. */
+static inline void gen_vector_op3(TCGOpcode opc, TCGArg arg1, TCGArg arg2,
+                                  TCGArg arg3)
+{
+    tcg_emit_op(&tcg_ctx, opc, tcg_ctx.gen_next_parm_idx);
+    *tcg_ctx.vec_opparam_ptr++ = arg1;
+    *tcg_ctx.vec_opparam_ptr++ = arg2;
+    *tcg_ctx.vec_opparam_ptr++ = arg3;
+}
+
+/* Emit a 128-bit integer vector op; rd, rn, rm come from the expanding scope. */
+#define gen_vector_arith(op,etype,size) \
+do {                                    \
+    TCGOpcode _opc = 0;                 \
+    TCGArg _rd = offsetof(CPUX86State, xmm_regs[rd]);              \
+    TCGArg _rn = offsetof(CPUX86State, xmm_regs[rn]);              \
+    TCGArg _rm = (rm == -1) ? offsetof(CPUX86State, xmm_t0) :      \
+                              offsetof(CPUX86State, xmm_regs[rm]); \
+    switch (size) {                 \
+    case 0: _opc = INDEX_op_##op##_##etype##8_128;  break; \
+    case 1: _opc = INDEX_op_##op##_##etype##16_128; break; \
+    case 2: _opc = INDEX_op_##op##_##etype##32_128; break; \
+    case 3: _opc = INDEX_op_##op##_##etype##64_128; break; \
+    default: /* unsupported element size: fatal, so exit non-zero */ \
+        fprintf(stderr, "%s:%d: tcg fatal error: size=%d\n", \
+                        __FILE__, __LINE__, (size));         \
+        exit(1);                    \
+    }                               \
+    gen_vector_op3(_opc, _rd, _rn, _rm); \
+} while (0)
+
+/* Emit a 128-bit float vector op: the f32 form when size == 0, else f64.
+ * rd, rn and rm are read from the expanding scope; rm == -1 means xmm_t0.
+ */
+#define gen_vector_fop(op,size)        \
+do {                                   \
+    TCGOpcode _opc = ((size) == 0) ? INDEX_op_##op##_f32_128       \
+                                   : INDEX_op_##op##_f64_128;      \
+    TCGArg _rd = offsetof(CPUX86State, xmm_regs[rd]);              \
+    TCGArg _rn = offsetof(CPUX86State, xmm_regs[rn]);              \
+    TCGArg _rm = (rm == -1) ? offsetof(CPUX86State, xmm_t0) :      \
+                              offsetof(CPUX86State, xmm_regs[rm]); \
+    gen_vector_op3(_opc, _rd, _rn, _rm); \
+} while (0)
+
+/* Emit a 128-bit logical vector op; rn or rm == -1 selects xmm_t0. */
+#define gen_vector_logical(op)         \
+do {                                   \
+    TCGArg _rd = offsetof(CPUX86State, xmm_regs[rd]);              \
+    TCGArg _rn = (rn == -1) ? offsetof(CPUX86State, xmm_t0) :      \
+                              offsetof(CPUX86State, xmm_regs[rn]); \
+    TCGArg _rm = (rm == -1) ? offsetof(CPUX86State, xmm_t0) :      \
+                              offsetof(CPUX86State, xmm_regs[rm]); \
+    gen_vector_op3(INDEX_op_##op##_128, _rd, _rn, _rm); \
+} while (0)
+
+/* Emit a 128-bit conversion op; size is passed through as the third operand. */
+#define gen_vector_cvt(op,size)        \
+do {                                   \
+    TCGArg _rd = offsetof(CPUX86State, xmm_regs[rd]);              \
+    TCGArg _rm = (rm == -1) ? offsetof(CPUX86State, xmm_t0) :      \
+                              offsetof(CPUX86State, xmm_regs[rm]); \
+    gen_vector_op3(INDEX_op_##op##_128, _rd, _rm, (size)); \
+} while (0)
diff --git a/target-i386/translate.c b/target-i386/translate.c
index a3dd167..7204635 100644
--- a/target-i386/translate.c
+++ b/target-i386/translate.c
@@ -32,7 +32,13 @@
 #include "exec/helper-gen.h"
 
 #include "trace-tcg.h"
+#include "hqemu.h"
 
+#if defined(CONFIG_USER_ONLY)
+#define IS_USER(s) 1
+#else
+#define IS_USER(s) (s->cpl == 3)
+#endif
 
 #define PREFIX_REPZ   0x01
 #define PREFIX_REPNZ  0x02
@@ -59,26 +65,35 @@
 # define clztl  clz32
 #endif
 
+#ifdef CONFIG_COREMU
+#ifdef TARGET_X86_64
+#define X86_64_HREGS x86_64_hregs
+#else
+#define X86_64_HREGS 0
+#endif
+#endif
+
 //#define MACRO_TEST   1
 
 /* global register indexes */
 static TCGv_ptr cpu_env;
-static TCGv cpu_A0;
-static TCGv cpu_cc_dst, cpu_cc_src, cpu_cc_src2, cpu_cc_srcT;
+static TCGv cpu_cc_dst, cpu_cc_src, cpu_cc_src2;
 static TCGv_i32 cpu_cc_op;
 static TCGv cpu_regs[CPU_NB_REGS];
 /* local temps */
-static TCGv cpu_T[2];
+static __thread TCGv cpu_T[2];
 /* local register indexes (only used inside old micro ops) */
-static TCGv cpu_tmp0, cpu_tmp4;
-static TCGv_ptr cpu_ptr0, cpu_ptr1;
-static TCGv_i32 cpu_tmp2_i32, cpu_tmp3_i32;
-static TCGv_i64 cpu_tmp1_i64;
+static __thread TCGv cpu_A0;
+static __thread TCGv cpu_tmp0, cpu_tmp4;
+static __thread TCGv_ptr cpu_ptr0, cpu_ptr1;
+static __thread TCGv_i32 cpu_tmp2_i32, cpu_tmp3_i32;
+static __thread TCGv_i64 cpu_tmp1_i64;
+static __thread TCGv cpu_cc_srcT;
 
 #include "exec/gen-icount.h"
 
 #ifdef TARGET_X86_64
-static int x86_64_hregs;
+static __thread int x86_64_hregs;
 #endif
 
 typedef struct DisasContext {
@@ -123,6 +138,10 @@ typedef struct DisasContext {
     int cpuid_ext2_features;
     int cpuid_ext3_features;
     int cpuid_7_0_ebx_features;
+    int fallthrough;
+    int gen_ibtc;
+    int gen_cpbl;
+    CPUX86State *env;
 } DisasContext;
 
 static void gen_eob(DisasContext *s);
@@ -209,6 +228,36 @@ static const uint8_t cc_op_live[CC_OP_NB] = {
     [CC_OP_CLR] = 0,
 };
 
+static inline void gen_ibtc_stub(DisasContext *s)
+{
+#ifdef ENABLE_IBTC  /* stub: jump to the host pc produced by gen_helper_lookup_ibtc */
+    if (!build_llvm(s->env)) {  /* nothing is emitted when build_llvm() is true */
+        TCGv_ptr ibtc_host_pc = tcg_temp_new_ptr();
+        if (s->fallthrough) {
+            tcg_gen_st_i32(tcg_const_i32(1), cpu_env, offsetof(CPUX86State, fallthrough));
+            s->fallthrough = 0;  /* consumed: the op above stores 1 into env->fallthrough */
+        }
+        gen_helper_lookup_ibtc(ibtc_host_pc, cpu_env);
+        tcg_gen_op1i(INDEX_op_jmp, GET_TCGV_PTR(ibtc_host_pc));
+        tcg_temp_free_ptr(ibtc_host_pc);
+        s->gen_ibtc = 0;  /* request handled; gen_eob() tests this flag */
+    }
+#endif /* ENABLE_IBTC */
+}
+
+static inline void gen_cpbl_stub(DisasContext *s)
+{
+#ifdef ENABLE_CPBL  /* stub: jump to the host pc produced by gen_helper_lookup_cpbl */
+    if (!build_llvm(s->env)) {  /* nothing is emitted when build_llvm() is true */
+        TCGv_ptr cpbl_host_pc = tcg_temp_new_ptr();
+        gen_helper_lookup_cpbl(cpbl_host_pc, cpu_env);
+        tcg_gen_op1i(INDEX_op_jmp, GET_TCGV_PTR(cpbl_host_pc));
+        tcg_temp_free_ptr(cpbl_host_pc);
+        s->gen_cpbl = 0;  /* request handled; gen_eob() tests this flag */
+    }
+#endif /* ENABLE_CPBL */
+}
+
 static void set_cc_op(DisasContext *s, CCOp op)
 {
     int dead;
@@ -1312,6 +1361,30 @@ static void gen_helper_fp_arith_STN_ST0(int op, int opreg)
 /* if d == OR_TMP0, it means memory operand (address in A0) */
 static void gen_op(DisasContext *s1, int op, TCGMemOp ot, int d)
 {
+#ifdef CONFIG_COREMU
+    if (s1->prefix & PREFIX_LOCK) {
+        gen_update_cc_op(s1);
+
+        switch (ot & 3) {
+        case 0:
+            gen_helper_atomic_opb(cpu_env, cpu_A0, cpu_T[1], tcg_const_i32(op));
+            break;
+        case 1:
+            gen_helper_atomic_opw(cpu_env, cpu_A0, cpu_T[1], tcg_const_i32(op));
+            break;
+        case 2:
+            gen_helper_atomic_opl(cpu_env, cpu_A0, cpu_T[1], tcg_const_i32(op));
+            break;
+#ifdef TARGET_X86_64
+        case 3:
+            gen_helper_atomic_opq(cpu_env, cpu_A0, cpu_T[1], tcg_const_i32(op));
+#endif
+        }
+        set_cc_op(s1, CC_OP_EFLAGS);
+        return;
+    }
+#endif
+
     if (d != OR_TMP0) {
         gen_op_mov_v_reg(ot, cpu_T[0], d);
     } else {
@@ -1378,6 +1451,35 @@ static void gen_op(DisasContext *s1, int op, TCGMemOp ot, int d)
 /* if d == OR_TMP0, it means memory operand (address in A0) */
 static void gen_inc(DisasContext *s1, TCGMemOp ot, int d, int c)
 {
+#ifdef CONFIG_COREMU
+    /* with lock prefix */
+    if (s1->prefix & PREFIX_LOCK) {
+        assert(d == OR_TMP0);
+
+        /* The helper will use CAS1 as a unified way to
+           implement atomic inc (locked inc) */
+        gen_update_cc_op(s1);
+
+        switch(ot & 3) {
+        case 0:
+            gen_helper_atomic_incb(cpu_env, cpu_A0, tcg_const_i32(c));
+            break;
+        case 1:
+            gen_helper_atomic_incw(cpu_env, cpu_A0, tcg_const_i32(c));
+            break;
+        case 2:
+            gen_helper_atomic_incl(cpu_env, cpu_A0, tcg_const_i32(c));
+            break;
+#ifdef TARGET_X86_64
+        case 3:
+            gen_helper_atomic_incq(cpu_env, cpu_A0, tcg_const_i32(c));
+#endif
+        }
+        set_cc_op(s1, CC_OP_EFLAGS);
+        return;
+    }
+#endif
+
     if (d != OR_TMP0) {
         gen_op_mov_v_reg(ot, cpu_T[0], d);
     } else {
@@ -2205,6 +2307,31 @@ static inline int insn_const_size(TCGMemOp ot)
     }
 }
 
+#if defined(CONFIG_USER_ONLY)
+/* User mode: always chain directly, then record the target in tb->jmp_pc[]. */
+static inline void gen_goto_tb(DisasContext *s, int tb_num, target_ulong eip)
+{
+    TranslationBlock *tb = s->tb;
+
+    tcg_gen_goto_tb(tb_num);
+    gen_jmp_im(eip);
+    tcg_gen_exit_tb((uintptr_t)tb + tb_num);
+    tb->jmp_pc[tb_num] = tb->cs_base + eip;
+}
+#else
+/* 1 if lpt_search_page() hits for dest and (tb->pc & ~(size-1)) == addr. */
+static int try_link_pages(DisasContext *s, TranslationBlock *tb, target_ulong dest)
+{
+#ifdef ENABLE_LPAGE
+    target_ulong base, len;
+    if (!build_llvm(s->env) &&
+        lpt_search_page(s->env, dest, &base, &len) == 1 &&
+        (tb->pc & ~(len - 1)) == base)
+        return 1;
+#endif
+    return 0;
+}
+
 static inline void gen_goto_tb(DisasContext *s, int tb_num, target_ulong eip)
 {
     TranslationBlock *tb;
@@ -2214,7 +2341,8 @@ static inline void gen_goto_tb(DisasContext *s, int tb_num, target_ulong eip)
     tb = s->tb;
     /* NOTE: we handle the case where the TB spans two pages here */
     if ((pc & TARGET_PAGE_MASK) == (tb->pc & TARGET_PAGE_MASK) ||
-        (pc & TARGET_PAGE_MASK) == ((s->pc - 1) & TARGET_PAGE_MASK))  {
+        (pc & TARGET_PAGE_MASK) == ((s->pc - 1) & TARGET_PAGE_MASK) ||
+        try_link_pages(s, tb, pc) == 1)  {
         /* jump to same page: we can use a direct jump */
         tcg_gen_goto_tb(tb_num);
         gen_jmp_im(eip);
@@ -2222,9 +2350,12 @@ static inline void gen_goto_tb(DisasContext *s, int tb_num, target_ulong eip)
     } else {
         /* jump to another page: currently not optimized */
         gen_jmp_im(eip);
+        s->gen_cpbl = 1;
         gen_eob(s);
     }
+    tb->jmp_pc[tb_num] = pc;
 }
+#endif
 
 static inline void gen_jcc(DisasContext *s, int b,
                            target_ulong val, target_ulong next_eip)
@@ -2561,6 +2692,10 @@ static void gen_eob(DisasContext *s)
     } else if (s->tf) {
         gen_helper_single_step(cpu_env);
     } else {
+        if (s->gen_ibtc == 1)
+            gen_ibtc_stub(s);
+        if (s->gen_cpbl == 1)
+            gen_cpbl_stub(s);
         tcg_gen_exit_tb(0);
     }
     s->is_jmp = DISAS_TB_JUMP;
@@ -2974,6 +3109,192 @@ static const struct SSEOpHelper_eppi sse_op_table7[256] = {
     [0xdf] = AESNI_OP(aeskeygenassist),
 };
 
+#ifdef ENABLE_TCG_VECTOR
+#include "simd_helper.h"
+
+/* Report an unhandled vector op and terminate with a failure status. */
+#define tcg_vector_abort() do { \
+    fprintf(stderr, "%s:%d: tcg fatal error - unhandled vector op.\n", __FILE__, __LINE__);\
+    exit(1);\
+} while (0)
+
+/*
+ * Emit a 128-bit vector load (memory operand) or register move (mod == 3)
+ * into xmm_regs[reg].  Returns 0 without emitting anything when
+ * build_llvm(env) is false, otherwise 1 after emitting the op.
+ */
+static int gen_vload(DisasContext *s, int op, int mod, int modrm, int reg)
+{
+    CPUX86State *env = s->env;
+    TCGArg dst = offsetof(CPUX86State, xmm_regs[reg]);
+    TCGArg alignment;
+
+    if (!build_llvm(env)) {
+        return 0;
+    }
+
+    /* movups (0x010), movupd (0x110) and movdqu (0x26f) pass (TCGArg)-1. */
+    if (op == 0x010 || op == 0x110 || op == 0x26f) {
+        alignment = (TCGArg)-1;
+    } else {
+        alignment = 128;
+    }
+
+    if (mod == 3) {
+        int rm = (modrm & 7) | REX_B(s);
+
+        gen_vector_op3(INDEX_op_vmov_128, dst,
+                       offsetof(CPUX86State, xmm_regs[rm]), alignment);
+    } else {
+        gen_lea_modrm(env, s, modrm);
+        gen_vector_op3(INDEX_op_vload_128, dst,
+                       (TCGArg)cpu_A0, alignment);
+    }
+
+    return 1;
+}
+
+/*
+ * Emit a 128-bit vector store of xmm_regs[reg] to memory, or a register
+ * move into xmm_regs[rm] when mod == 3.  Returns 0 without emitting
+ * anything when build_llvm(env) is false, otherwise 1.
+ */
+static int gen_vstore(DisasContext *s, int op, int mod, int modrm, int reg)
+{
+    CPUX86State *env = s->env;
+    TCGArg src = offsetof(CPUX86State, xmm_regs[reg]);
+    TCGArg alignment;
+
+    if (!build_llvm(env)) {
+        return 0;
+    }
+
+    /* movups (0x011), movupd (0x111) and movdqu (0x27f) pass (TCGArg)-1. */
+    if (op == 0x011 || op == 0x111 || op == 0x27f) {
+        alignment = (TCGArg)-1;
+    } else {
+        alignment = 128;
+    }
+
+    if (mod == 3) {
+        int rm = (modrm & 7) | REX_B(s);
+
+        gen_vector_op3(INDEX_op_vmov_128,
+                       offsetof(CPUX86State, xmm_regs[rm]), src, alignment);
+    } else {
+        gen_lea_modrm(env, s, modrm);
+        gen_vector_op3(INDEX_op_vstore_128, src,
+                       (TCGArg)cpu_A0, alignment);
+    }
+
+    return 1;
+}
+
+static int gen_tcg_vector(DisasContext *s, int op, int b1, int mod, int modrm, int reg)
+{   /* Returns 1 once a vector op has been emitted for op; 0 means nothing was emitted. */
+    int rd, rm, rn;
+    TCGArg alignment = 128;
+    CPUX86State *env = s->env;
+
+    if (!build_llvm(env))
+        return 0;
+
+    switch(op) {    /* opcodes that have a vector implementation further down */
+    case 0x54 ... 0x59:
+    case 0x5b: /* cvtdq2ps cvtps2dq cvttps2dq */
+    case 0x5c:
+    case 0x5e:
+    case 0xd4:
+    case 0xdb:
+    case 0xdf:
+    case 0xeb:
+    case 0xef:
+    case 0xf8 ... 0xfe:
+        break;
+    default: /* unhandled op */
+        return 0;
+    }
+
+    switch (op) {
+    case 0x50 ... 0x5a:
+    case 0x5c ... 0x5f:
+    case 0xc2:
+        /* Most sse scalar operations.  */
+        if (b1 == 2 || b1 == 3)
+            return 0;
+        break;
+    }
+
+    rd = rn = reg;  /* rd, rn and rm are read by the gen_vector_* macros below */
+    if (mod != 3) {
+        gen_lea_modrm(env, s, modrm);
+        gen_vector_op3(INDEX_op_vload_128,
+                       offsetof(CPUX86State, xmm_t0),
+                       (TCGArg)cpu_A0,
+                       alignment);
+        rm = -1;    /* -1 makes the macros use xmm_t0, loaded just above */
+    } else {
+        rm = (modrm & 7) | REX_B(s);
+    }
+
+    switch(op) {
+    case 0x54: /* andps, andpd */
+    case 0xdb: /* MMX_OP2(pand) */
+        gen_vector_logical(vand); break;
+    case 0x55: /* andnps, andnpd */
+    case 0xdf: /* MMX_OP2(pandn) */
+    {
+        int rtmp = rn;  /* swap rn and rm before emitting vbic */
+        rn = rm;
+        rm = rtmp;
+        gen_vector_logical(vbic); break;
+    }
+    case 0x56: /* orps, orpd */
+    case 0xeb: /* por */
+        gen_vector_logical(vorr); break;
+    case 0x57: /* xorps, xorpd */
+    case 0xef: /* pxor */
+        gen_vector_logical(veor); break;
+    case 0x58: /* SSE_FOP(add) */
+        gen_vector_fop(vadd, b1); break;
+    case 0x59: /* SSE_FOP(mul) */
+        gen_vector_fop(vmul, b1); break;
+    case 0x5c: /* SSE_FOP(sub) */
+        gen_vector_fop(vsub, b1); break;
+    case 0x5e: /* SSE_FOP(div) */
+        gen_vector_fop(vdiv, b1); break;
+    case 0x5b: /* cvtdq2ps cvtps2dq cvttps2dq */
+        if(b1 == 0)
+            gen_vector_cvt(vsitofp, 32);
+        else if(b1 == 1)
+            gen_vector_cvt(vfptosi, 32);
+        else if(b1 == 2)    /* NOTE(review): same op as b1 == 1 - confirm rounding vs. truncation is intended */
+            gen_vector_cvt(vfptosi, 32);
+        else
+            tcg_vector_abort();
+        break;
+    case 0xd4: /* MMX_OP2(paddq) */
+        if (b1 != 1)
+            tcg_vector_abort();
+        gen_vector_arith(vadd, i, 3); break;
+    case 0xf8 ... 0xfb: /* MMX_OP2(psubb ... psubq) */
+        if (b1 != 1)
+            tcg_vector_abort();
+        gen_vector_arith(vsub, i, (op-0xf8)); break;
+    case 0xfc ... 0xfe: /* MMX_OP2(paddb ... paddl) */
+        if (b1 != 1)
+            tcg_vector_abort();
+        gen_vector_arith(vadd, i, (op-0xfc)); break;
+    default:
+        tcg_vector_abort();
+        break;
+    }
+
+    return 1;
+}
+
+#endif
+
 static void gen_sse(CPUX86State *env, DisasContext *s, int b,
                     target_ulong pc_start, int rex_r)
 {
@@ -3131,6 +3452,10 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
         case 0x128: /* movapd */
         case 0x16f: /* movdqa xmm, ea */
         case 0x26f: /* movdqu xmm, ea */
+#ifdef ENABLE_TCG_VECTOR
+            if (gen_vload(s, b, mod, modrm, reg) == 1)
+                break;
+#endif
             if (mod != 3) {
                 gen_lea_modrm(env, s, modrm);
                 gen_ldo_env_A0(s, offsetof(CPUX86State, xmm_regs[reg]));
@@ -3317,6 +3642,10 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
         case 0x129: /* movapd */
         case 0x17f: /* movdqa ea, xmm */
         case 0x27f: /* movdqu ea, xmm */
+#ifdef ENABLE_TCG_VECTOR
+            if (gen_vstore(s, b, mod, modrm, reg) == 1)
+                break;
+#endif
             if (mod != 3) {
                 gen_lea_modrm(env, s, modrm);
                 gen_sto_env_A0(s, offsetof(CPUX86State, xmm_regs[reg]));
@@ -4283,6 +4612,10 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
         default:
             break;
         }
+#ifdef ENABLE_TCG_VECTOR
+        if (is_xmm && gen_tcg_vector(s, b, b1, mod, modrm, reg) == 1)
+            return;
+#endif
         if (is_xmm) {
             op1_offset = offsetof(CPUX86State,xmm_regs[reg]);
             if (mod != 3) {
@@ -4565,9 +4898,11 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
     s->aflag = aflag;
     s->dflag = dflag;
 
+#ifndef CONFIG_COREMU
     /* lock generation */
     if (prefixes & PREFIX_LOCK)
         gen_helper_lock();
+#endif
 
     /* now check op code */
  reswitch:
@@ -4719,6 +5054,29 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             set_cc_op(s, CC_OP_LOGICB + ot);
             break;
         case 2: /* not */
+#ifdef CONFIG_COREMU
+            if (s->prefix & PREFIX_LOCK) {
+                if (mod == 3)
+                    goto illegal_op;
+
+                switch(ot & 3) {
+                case 0:
+                    gen_helper_atomic_notb(cpu_env, cpu_A0);
+                    break;
+                case 1:
+                    gen_helper_atomic_notw(cpu_env, cpu_A0);
+                    break;
+                case 2:
+                    gen_helper_atomic_notl(cpu_env, cpu_A0);
+                    break;
+#ifdef TARGET_X86_64
+                case 3:
+                    gen_helper_atomic_notq(cpu_env, cpu_A0);
+#endif
+                }
+                break;
+            }
+#endif
             tcg_gen_not_tl(cpu_T[0], cpu_T[0]);
             if (mod != 3) {
                 gen_op_st_v(s, ot, cpu_T[0], cpu_A0);
@@ -4727,6 +5085,32 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             }
             break;
         case 3: /* neg */
+#ifdef CONFIG_COREMU
+            if (s->prefix & PREFIX_LOCK) {
+                if (mod == 3)
+                    goto illegal_op;
+
+                gen_update_cc_op(s);
+
+                switch(ot & 3) {
+                case 0:
+                    gen_helper_atomic_negb(cpu_env, cpu_A0);
+                    break;
+                case 1:
+                    gen_helper_atomic_negw(cpu_env, cpu_A0);
+                    break;
+                case 2:
+                    gen_helper_atomic_negl(cpu_env, cpu_A0);
+                    break;
+#ifdef TARGET_X86_64
+                case 3:
+                    gen_helper_atomic_negq(cpu_env, cpu_A0);
+#endif
+                }
+                set_cc_op(s, CC_OP_EFLAGS);
+                break;
+            }
+#endif
             tcg_gen_neg_tl(cpu_T[0], cpu_T[0]);
             if (mod != 3) {
                 gen_op_st_v(s, ot, cpu_T[0], cpu_A0);
@@ -4936,6 +5320,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             tcg_gen_movi_tl(cpu_T[1], next_eip);
             gen_push_v(s, cpu_T[1]);
             gen_op_jmp_v(cpu_T[0]);
+            s->gen_ibtc = 1;
             gen_eob(s);
             break;
         case 3: /* lcall Ev */
@@ -4954,6 +5339,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                                       tcg_const_i32(dflag - 1),
                                       tcg_const_i32(s->pc - s->cs_base));
             }
+            s->gen_ibtc = 1;
             gen_eob(s);
             break;
         case 4: /* jmp Ev */
@@ -4961,6 +5347,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                 tcg_gen_ext16u_tl(cpu_T[0], cpu_T[0]);
             }
             gen_op_jmp_v(cpu_T[0]);
+            s->gen_ibtc = 1;
             gen_eob(s);
             break;
         case 5: /* ljmp Ev */
@@ -4976,6 +5363,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                 gen_op_movl_seg_T0_vm(R_CS);
                 gen_op_jmp_v(cpu_T[1]);
             }
+            s->gen_ibtc = 1;
             gen_eob(s);
             break;
         case 6: /* push Ev */
@@ -5124,7 +5512,36 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             tcg_gen_add_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
             gen_op_mov_reg_v(ot, reg, cpu_T[1]);
             gen_op_mov_reg_v(ot, rm, cpu_T[0]);
-        } else {
+        } else
+#ifdef CONFIG_COREMU
+        if (s->prefix & PREFIX_LOCK) {
+            gen_lea_modrm(env, s, modrm);
+            gen_update_cc_op(s);
+
+            switch (ot & 3) {
+            case 0:
+                gen_helper_atomic_xaddb(cpu_env, cpu_A0, tcg_const_i32(reg),
+                        tcg_const_i32(X86_64_HREGS));
+                break;
+            case 1:
+                gen_helper_atomic_xaddw(cpu_env, cpu_A0, tcg_const_i32(reg),
+                        tcg_const_i32(X86_64_HREGS));
+                break;
+            case 2:
+                gen_helper_atomic_xaddl(cpu_env, cpu_A0, tcg_const_i32(reg),
+                        tcg_const_i32(X86_64_HREGS));
+                break;
+#ifdef TARGET_X86_64
+            case 3:
+                gen_helper_atomic_xaddq(cpu_env, cpu_A0, tcg_const_i32(reg),
+                        tcg_const_i32(x86_64_hregs));
+#endif
+            }
+            set_cc_op(s, CC_OP_EFLAGS);
+            break;
+        } else
+#endif
+        {
             gen_lea_modrm(env, s, modrm);
             gen_op_mov_v_reg(ot, cpu_T[0], reg);
             gen_op_ld_v(s, ot, cpu_T[1], cpu_A0);
@@ -5145,6 +5562,38 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             modrm = cpu_ldub_code(env, s->pc++);
             reg = ((modrm >> 3) & 7) | rex_r;
             mod = (modrm >> 6) & 3;
+
+#ifdef CONFIG_COREMU
+            if (s->prefix & PREFIX_LOCK) {
+                if (mod == 3)
+                    goto illegal_op;
+
+                gen_lea_modrm(env, s, modrm);
+                gen_update_cc_op(s);
+
+                switch(ot & 3) {
+                case 0:
+                    gen_helper_atomic_cmpxchgb(cpu_env, cpu_A0, tcg_const_i32(reg),
+                                                    tcg_const_i32(X86_64_HREGS));
+                    break;
+                case 1:
+                    gen_helper_atomic_cmpxchgw(cpu_env, cpu_A0, tcg_const_i32(reg),
+                                                    tcg_const_i32(X86_64_HREGS));
+                    break;
+                case 2:
+                    gen_helper_atomic_cmpxchgl(cpu_env, cpu_A0, tcg_const_i32(reg),
+                                                    tcg_const_i32(X86_64_HREGS));
+                    break;
+#ifdef TARGET_X86_64
+                case 3:
+                    gen_helper_atomic_cmpxchgq(cpu_env, cpu_A0, tcg_const_i32(reg),
+                            tcg_const_i32(x86_64_hregs));
+#endif
+                }
+                set_cc_op(s, CC_OP_EFLAGS);
+                break;
+            }
+#endif
             t0 = tcg_temp_local_new();
             t1 = tcg_temp_local_new();
             t2 = tcg_temp_local_new();
@@ -5201,6 +5650,11 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             if (!(s->cpuid_ext_features & CPUID_EXT_CX16))
                 goto illegal_op;
             gen_lea_modrm(env, s, modrm);
+#ifdef CONFIG_COREMU
+            if (s->prefix & PREFIX_LOCK) { /* atomic helper only for LOCK-prefixed forms */
+                gen_helper_atomic_cmpxchg16b(cpu_env, cpu_A0);
+            } else
+#endif
             gen_helper_cmpxchg16b(cpu_env, cpu_A0);
         } else
 #endif        
@@ -5208,6 +5662,11 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             if (!(s->cpuid_features & CPUID_CX8))
                 goto illegal_op;
             gen_lea_modrm(env, s, modrm);
+#ifdef CONFIG_COREMU
+            if (s->prefix & PREFIX_LOCK) { /* atomic helper only for LOCK-prefixed forms */
+                gen_helper_atomic_cmpxchg8b(cpu_env, cpu_A0);
+            } else
+#endif
             gen_helper_cmpxchg8b(cpu_env, cpu_A0);
         }
         set_cc_op(s, CC_OP_EFLAGS);
@@ -5550,15 +6009,41 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             gen_op_mov_reg_v(ot, reg, cpu_T[1]);
         } else {
             gen_lea_modrm(env, s, modrm);
+#ifdef CONFIG_COREMU
+            /* for xchg, lock is implicit.
+               XXX: no flags are affected! */
+            switch (ot & 3) {
+            case 0:
+                gen_helper_xchgb(cpu_env, cpu_A0, tcg_const_i32(reg),
+                        tcg_const_i32(X86_64_HREGS));
+                break;
+            case 1:
+                gen_helper_xchgw(cpu_env, cpu_A0, tcg_const_i32(reg),
+                        tcg_const_i32(X86_64_HREGS));
+                break;
+            case 2:
+                gen_helper_xchgl(cpu_env, cpu_A0, tcg_const_i32(reg),
+                        tcg_const_i32(X86_64_HREGS));
+                break;
+#ifdef TARGET_X86_64
+                case 3:
+                gen_helper_xchgq(cpu_env, cpu_A0, tcg_const_i32(reg),
+                        tcg_const_i32(x86_64_hregs));
+#endif
+            }
+#else
             gen_op_mov_v_reg(ot, cpu_T[0], reg);
             /* for xchg, lock is implicit */
             if (!(prefixes & PREFIX_LOCK))
                 gen_helper_lock();
             gen_op_ld_v(s, ot, cpu_T[1], cpu_A0);
             gen_op_st_v(s, ot, cpu_T[0], cpu_A0);
+#ifndef CONFIG_COREMU
             if (!(prefixes & PREFIX_LOCK))
                 gen_helper_unlock();
+#endif
             gen_op_mov_reg_v(ot, reg, cpu_T[1]);
+#endif
         }
         break;
     case 0xc4: /* les Gv */
@@ -6360,6 +6845,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
         gen_stack_update(s, val + (1 << ot));
         /* Note that gen_pop_T0 uses a zero-extending load.  */
         gen_op_jmp_v(cpu_T[0]);
+        s->gen_ibtc = 1;
         gen_eob(s);
         break;
     case 0xc3: /* ret */
@@ -6367,6 +6853,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
         gen_pop_update(s, ot);
         /* Note that gen_pop_T0 uses a zero-extending load.  */
         gen_op_jmp_v(cpu_T[0]);
+        s->gen_ibtc = 1;
         gen_eob(s);
         break;
     case 0xca: /* lret im */
@@ -6392,6 +6879,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             /* add stack offset */
             gen_stack_update(s, val + (2 << dflag));
         }
+        s->gen_ibtc = 1;
         gen_eob(s);
         break;
     case 0xcb: /* lret */
@@ -6415,6 +6903,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                                       tcg_const_i32(s->pc - s->cs_base));
             set_cc_op(s, CC_OP_EFLAGS);
         }
+        s->gen_ibtc = 1;
         gen_eob(s);
         break;
     case 0xe8: /* call im */
@@ -6680,6 +7169,27 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
         }
     bt_op:
         tcg_gen_andi_tl(cpu_T[1], cpu_T[1], (1 << (3 + ot)) - 1);
+#ifdef CONFIG_COREMU
+        if (s->prefix & PREFIX_LOCK) {
+            gen_update_cc_op(s);
+
+            switch (op) {
+            case 0:
+                goto illegal_op;
+                break;
+            case 1:
+                gen_helper_atomic_bts(cpu_env, cpu_A0, cpu_T[1], tcg_const_i32(ot));
+                break;
+            case 2:
+                gen_helper_atomic_btr(cpu_env, cpu_A0, cpu_T[1], tcg_const_i32(ot));
+                break;
+            case 3:
+                gen_helper_atomic_btc(cpu_env, cpu_A0, cpu_T[1], tcg_const_i32(ot));
+            }
+            set_cc_op(s, CC_OP_EFLAGS);
+            break;
+        }
+#endif
         tcg_gen_shr_tl(cpu_tmp4, cpu_T[0], cpu_T[1]);
         switch(op) {
         case 0:
@@ -7818,12 +8328,16 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
         goto illegal_op;
     }
     /* lock generation */
+#ifndef CONFIG_COREMU
     if (s->prefix & PREFIX_LOCK)
         gen_helper_unlock();
+#endif
     return s->pc;
  illegal_op:
+#ifndef CONFIG_COREMU
     if (s->prefix & PREFIX_LOCK)
         gen_helper_unlock();
+#endif
     /* XXX: ensure that no lock was generated */
     gen_exception(s, EXCP06_ILLOP, pc_start - s->cs_base);
     return s->pc;
@@ -7879,6 +8393,8 @@ void optimize_flags_init(void)
     }
 
     helper_lock_init();
+
+    copy_tcg_context_global();
 }
 
 /* generate intermediate code in gen_opc_buf and gen_opparam_buf for
@@ -7900,6 +8416,10 @@ void gen_intermediate_code(CPUX86State *env, TranslationBlock *tb)
     cs_base = tb->cs_base;
     flags = tb->flags;
 
+    dc->fallthrough = 0;
+    dc->gen_ibtc = 0;
+    dc->gen_cpbl = 0;
+    dc->env = env;
     dc->pe = (flags >> HF_PE_SHIFT) & 1;
     dc->code32 = (flags >> HF_CS32_SHIFT) & 1;
     dc->ss32 = (flags >> HF_SS32_SHIFT) & 1;
@@ -7977,7 +8497,12 @@ void gen_intermediate_code(CPUX86State *env, TranslationBlock *tb)
         max_insns = TCG_MAX_INSNS;
     }
 
-    gen_tb_start(tb);
+    if (!build_llvm(env)) {
+        gen_tb_start(tb);
+        if (tracer_mode != TRANS_MODE_NONE)
+            tcg_gen_hotpatch(IS_USER(dc), tracer_mode == TRANS_MODE_HYBRIDS ||
+                                          tracer_mode == TRANS_MODE_HYBRIDM);
+    }
     for(;;) {
         tcg_gen_insn_start(pc_ptr, dc->cc_op);
         num_insns++;
@@ -8027,12 +8552,27 @@ void gen_intermediate_code(CPUX86State *env, TranslationBlock *tb)
             gen_eob(dc);
             break;
         }
+
+#if defined(CONFIG_LLVM) && defined(CONFIG_USER_ONLY)
+        if (llvm_has_annotation(pc_ptr, ANNOTATION_LOOP))
+            break;
+#endif
+        if (build_llvm(env) && num_insns == tb->icount) {
+            gen_jmp_im(pc_ptr - dc->cs_base);
+            gen_eob(dc);
+            break;
+        }
+
         /* if too long translation, stop generation too */
         if (tcg_op_buf_full() ||
             (pc_ptr - pc_start) >= (TARGET_PAGE_SIZE - 32) ||
             num_insns >= max_insns) {
             gen_jmp_im(pc_ptr - dc->cs_base);
+            dc->fallthrough = 1;
+            dc->gen_ibtc = 1;
             gen_eob(dc);
+
+            tb->jmp_pc[0] = pc_ptr;
             break;
         }
         if (singlestep) {
@@ -8041,13 +8581,28 @@ void gen_intermediate_code(CPUX86State *env, TranslationBlock *tb)
             break;
         }
     }
+    if (build_llvm(env) && tb->size != dc->pc - pc_start) {
+        /* consistency check with tb info. we must make sure
+         * guest basic blocks are the same. skip this trace if inconsistent */
+        fprintf(stderr, "inconsistent block with pc 0x"TARGET_FMT_lx" size=%d"
+                " icount=%d (error size="TARGET_FMT_ld")\n",
+                tb->pc, tb->size, tb->icount, dc->pc - pc_start);
+        exit(0);
+    }
+
     if (tb->cflags & CF_LAST_IO)
         gen_io_end();
 done_generating:
-    gen_tb_end(tb, num_insns);
+
+    if (build_llvm(env)) {
+        /* Terminate the linked list.  */
+        tcg_ctx.gen_op_buf[tcg_ctx.gen_last_op_idx].next = -1;
+    } else {
+        gen_tb_end(tb, num_insns);
+    }
 
 #ifdef DEBUG_DISAS
-    if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)) {
+    if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM) && !build_llvm(env)) {
         int disas_flags;
         qemu_log("----------------\n");
         qemu_log("IN: %s\n", lookup_symbol(pc_start));
@@ -8062,8 +8617,10 @@ done_generating:
     }
 #endif
 
-    tb->size = pc_ptr - pc_start;
-    tb->icount = num_insns;
+    if (!build_llvm(env)) {
+        tb->size = pc_ptr - pc_start;
+        tb->icount = num_insns;
+    }
 }
 
 void restore_state_to_opc(CPUX86State *env, TranslationBlock *tb,
diff --git a/target-ppc/Makefile.objs b/target-ppc/Makefile.objs
index e667e69..363a701 100644
--- a/target-ppc/Makefile.objs
+++ b/target-ppc/Makefile.objs
@@ -1,5 +1,5 @@
 obj-y += cpu-models.o
-obj-y += translate.o
+obj-y += translate.o helper.o
 ifeq ($(CONFIG_SOFTMMU),y)
 obj-y += machine.o mmu_helper.o mmu-hash32.o monitor.o
 obj-$(TARGET_PPC64) += mmu-hash64.o arch_dump.o
diff --git a/target-ppc/cpu.h b/target-ppc/cpu.h
index 9706000..bf1481a 100644
--- a/target-ppc/cpu.h
+++ b/target-ppc/cpu.h
@@ -88,7 +88,6 @@
 
 /*****************************************************************************/
 /* MMU model                                                                 */
-typedef enum powerpc_mmu_t powerpc_mmu_t;
 enum powerpc_mmu_t {
     POWERPC_MMU_UNKNOWN    = 0x00000000,
     /* Standard 32 bits PowerPC MMU                            */
@@ -133,10 +132,10 @@ enum powerpc_mmu_t {
                              | 0x00000004,
 #endif /* defined(TARGET_PPC64) */
 };
+typedef enum powerpc_mmu_t powerpc_mmu_t;
 
 /*****************************************************************************/
 /* Exception model                                                           */
-typedef enum powerpc_excp_t powerpc_excp_t;
 enum powerpc_excp_t {
     POWERPC_EXCP_UNKNOWN   = 0,
     /* Standard PowerPC exception model */
@@ -170,6 +169,7 @@ enum powerpc_excp_t {
     POWERPC_EXCP_POWER7,
 #endif /* defined(TARGET_PPC64) */
 };
+typedef enum powerpc_excp_t powerpc_excp_t;
 
 /*****************************************************************************/
 /* Exception vectors definitions                                             */
@@ -298,7 +298,6 @@ enum {
 
 /*****************************************************************************/
 /* Input pins model                                                          */
-typedef enum powerpc_input_t powerpc_input_t;
 enum powerpc_input_t {
     PPC_FLAGS_INPUT_UNKNOWN = 0,
     /* PowerPC 6xx bus                  */
@@ -316,6 +315,7 @@ enum powerpc_input_t {
     /* Freescale RCPU bus               */
     PPC_FLAGS_INPUT_RCPU,
 };
+typedef enum powerpc_input_t powerpc_input_t;
 
 #define PPC_INPUT(env) (env->bus_model)
 
@@ -1168,6 +1168,8 @@ struct CPUPPCState {
     uint32_t tm_vscr;
     uint64_t tm_dscr;
     uint64_t tm_tar;
+
+    CPU_OPTIMIZATION_COMMON
 };
 
 #define SET_FIT_PERIOD(a_, b_, c_, d_)          \
@@ -2226,6 +2228,17 @@ static inline void cpu_get_tb_cpu_state(CPUPPCState *env, target_ulong *pc,
     *flags = env->hflags;
 }
 
+static inline target_ulong cpu_get_pc(CPUPPCState *env)
+{
+    return env->nip; /* this target's pc accessor simply hands back the nip field */
+}
+
+static inline int cpu_check_state(CPUPPCState *env,
+                                  target_ulong cs_base, int flags)
+{   /* Nonzero iff cs_base is 0 and flags (viewed as uint32_t) equals env->hflags. */
+    return cs_base == 0 && (uint32_t)flags == env->hflags; /* NOTE(review): only flags is narrowed; confirm hflags never has bits above 31 set */
+}
+
 #if !defined(CONFIG_USER_ONLY)
 static inline int booke206_tlbm_id(CPUPPCState *env, ppcmas_tlb_t *tlbm)
 {
@@ -2311,7 +2324,7 @@ static inline uint32_t booke206_tlbnps(CPUPPCState *env, const int tlbn)
         uint32_t tlbncfg = env->spr[SPR_BOOKE_TLB0CFG + tlbn];
         uint32_t min = (tlbncfg & TLBnCFG_MINSIZE) >> TLBnCFG_MINSIZE_SHIFT;
         uint32_t max = (tlbncfg & TLBnCFG_MAXSIZE) >> TLBnCFG_MAXSIZE_SHIFT;
-        int i;
+        unsigned i;
         for (i = min; i <= max; i++) {
             ret |= (1 << (i << 1));
         }
diff --git a/target-ppc/helper.c b/target-ppc/helper.c
new file mode 100644
index 0000000..5ec684b
--- /dev/null
+++ b/target-ppc/helper.c
@@ -0,0 +1,9 @@
+#include "cpu.h"
+CPUState *cpu_create(void)
+{   /* Returns a newly allocated CPUState whose contents are copied from first_cpu. */
+    PowerPCCPU *cpu = g_malloc0(sizeof(PowerPCCPU));
+    CPUState *cs = CPU(cpu);
+    memcpy(cpu, POWERPC_CPU(first_cpu), sizeof(PowerPCCPU)); /* byte copy over the whole zeroed allocation */
+    cs->env_ptr = &cpu->env; /* point env_ptr at the copy's own env rather than whatever the memcpy left */
+    return cs; /* NOTE(review): no free is visible here; presumably the caller owns the result */
+}
diff --git a/target-ppc/helper.h b/target-ppc/helper.h
index 869be15..c96f51b 100644
--- a/target-ppc/helper.h
+++ b/target-ppc/helper.h
@@ -667,3 +667,5 @@ DEF_HELPER_4(dscli, void, env, fprp, fprp, i32)
 DEF_HELPER_4(dscliq, void, env, fprp, fprp, i32)
 
 DEF_HELPER_1(tbegin, void, env)
+
+#include "hqemu-helper.h"
diff --git a/target-ppc/translate.c b/target-ppc/translate.c
index 41a7258..15cedc5 100644
--- a/target-ppc/translate.c
+++ b/target-ppc/translate.c
@@ -28,7 +28,13 @@
 #include "exec/helper-gen.h"
 
 #include "trace-tcg.h"
+#include "hqemu.h"
 
+#if defined(CONFIG_USER_ONLY)
+#define IS_USER(s) 1
+#else
+#define IS_USER(s) (s->mem_idx == MMU_USER_IDX)
+#endif
 
 #define CPU_SINGLE_STEP 0x1
 #define CPU_BRANCH_STEP 0x2
@@ -180,6 +186,8 @@ void ppc_translate_init(void)
                                              offsetof(CPUPPCState, access_type), "access_type");
 
     done_init = 1;
+
+    copy_tcg_context_global();
 }
 
 /* internal defines */
@@ -11479,7 +11487,12 @@ void gen_intermediate_code(CPUPPCState *env, struct TranslationBlock *tb)
         max_insns = TCG_MAX_INSNS;
     }
 
-    gen_tb_start(tb);
+    if (!build_llvm(env)) {
+        gen_tb_start(tb);
+        if (tracer_mode != TRANS_MODE_NONE)
+            tcg_gen_hotpatch(IS_USER(ctxp), tracer_mode == TRANS_MODE_HYBRIDS ||
+                                            tracer_mode == TRANS_MODE_HYBRIDM);
+    }
     tcg_clear_temp_count();
     /* Set env in case of segfault during code fetch */
     while (ctx.exception == POWERPC_EXCP_NONE && !tcg_op_buf_full()) {
@@ -11553,6 +11566,9 @@ void gen_intermediate_code(CPUPPCState *env, struct TranslationBlock *tb)
 #if defined(DO_PPC_STATISTICS)
         handler->count++;
 #endif
+        if (build_llvm(env) && num_insns == tb->icount)
+            break;
+
         /* Check trace mode exceptions */
         if (unlikely(ctx.singlestep_enabled & CPU_SINGLE_STEP &&
                      (ctx.nip <= 0x100 || ctx.nip > 0xF00) &&
@@ -11576,6 +11592,16 @@ void gen_intermediate_code(CPUPPCState *env, struct TranslationBlock *tb)
             exit(1);
         }
     }
+
+    if (build_llvm(env) && tb->size != ctx.nip - pc_start) {
+        /* consistency check with tb info. we must make sure
+         *  guest basic blocks are the same */
+        fprintf(stderr, "inconsistant block with pc 0x"TARGET_FMT_lx" size %d"
+                " icount=%d (error size="TARGET_FMT_ld")\n",
+                tb->pc, tb->size, tb->icount, ctx.nip - pc_start);
+        exit(0);
+    }
+
     if (tb->cflags & CF_LAST_IO)
         gen_io_end();
     if (ctx.exception == POWERPC_EXCP_NONE) {
@@ -11587,13 +11613,18 @@ void gen_intermediate_code(CPUPPCState *env, struct TranslationBlock *tb)
         /* Generate the return instruction */
         tcg_gen_exit_tb(0);
     }
-    gen_tb_end(tb, num_insns);
 
-    tb->size = ctx.nip - pc_start;
-    tb->icount = num_insns;
+    if (build_llvm(env)) {
+        /* Terminate the linked list.  */
+        tcg_ctx.gen_op_buf[tcg_ctx.gen_last_op_idx].next = -1;
+    } else {
+        gen_tb_end(tb, num_insns);
+        tb->size = ctx.nip - pc_start;
+        tb->icount = num_insns;
+    }
 
 #if defined(DEBUG_DISAS)
-    if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)) {
+    if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM) && !build_llvm(env)) {
         int flags;
         flags = env->bfd_mach;
         flags |= ctx.le_mode << 16;
diff --git a/tcg/aarch64/tcg-target.c b/tcg/aarch64/tcg-target.c
index 0ed10a9..05e26af 100644
--- a/tcg/aarch64/tcg-target.c
+++ b/tcg/aarch64/tcg-target.c
@@ -1264,7 +1264,56 @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
 #endif /* CONFIG_SOFTMMU */
 }
 
-static tcg_insn_unit *tb_ret_addr;
+tcg_insn_unit *tb_ret_addr;
+tcg_insn_unit *ibtc_ret_addr;
+
+/*
+ * Emit trace profiling/prediction stubs. The code sequence is as follows:
+ *   S1: direct jump (the reloc part requires 4-byte alignment)
+ *   S2: trace profiling stub (calls helper_NET_profile)
+ *   S3: trace prediction stub (calls helper_NET_predict)
+ *   S4: beginning of QEMU emulation code
+ *
+ * The jump inst of S1 is initially set to jump to S3 (i.e. skipping S2).
+ * tb->patch_jmp holds the offset of S1 and tb->patch_next the offset of S3,
+ * which is used to turn the trace profiling off. If is_user or emit_helper
+ * is 0, only S1 is emitted. NOTE(review): patch_skip (S4) is not set here.
+ */
+static void tcg_out_hotpatch(TCGContext *s, int is_user, int emit_helper)
+{
+    tcg_insn_unit *label_ptr[2];
+    TranslationBlock *tb = s->tb;
+
+    tb->patch_jmp = (uint16_t)((intptr_t)s->code_ptr - (intptr_t)s->code_buf); /* byte offset of S1 from code_buf, truncated to 16 bits */
+
+    /* S1: Direct Jump. Without the helper stubs it simply targets the next insn. */
+    if (is_user == 0 || emit_helper == 0) {
+        tcg_out_goto(s, s->code_ptr + 1);
+        tb->patch_next = (uint16_t)((intptr_t)s->code_ptr - (intptr_t)s->code_buf); /* offset just past S1 */
+        return;
+    }
+
+    label_ptr[0] = s->code_ptr; /* remember S1 so its target can be fixed up once S2 is emitted */
+    tcg_out_goto_noaddr(s);
+    /* S2: Trace Profiling Stub: helper_NET_profile(TCG_AREG0, tb->id) */
+    tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
+    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[1], tb->id);
+    tcg_out_call(s, (tcg_insn_unit *)helper_NET_profile);
+    reloc_pc26(label_ptr[0], s->code_ptr); /* S1 now branches here, i.e. to S3 */
+
+    /* S3: Trace Prediction stub: call helper_NET_predict unless the flag compares equal to 0 */
+    tb->patch_next = (uint16_t)((intptr_t)s->code_ptr - (intptr_t)s->code_buf);
+
+    tcg_out_ld(s, TCG_TYPE_I32, tcg_target_reg_alloc_order[0],
+               TCG_AREG0, offsetof(CPUArchState, start_trace_prediction));
+    tcg_out_cmp(s, 0, tcg_target_reg_alloc_order[0], 0, 1); /* presumably compares the loaded flag with constant 0 -- confirm argument order */
+    label_ptr[1] = s->code_ptr; /* conditional branch whose target is fixed up below */
+    tcg_out_goto_cond_noaddr(s, TCG_COND_EQ);
+    tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
+    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[1], tb->id);
+    tcg_out_call(s, (tcg_insn_unit *)helper_NET_predict);
+    reloc_pc19(label_ptr[1], s->code_ptr); /* the EQ branch lands here, skipping the predict call */
+}
 
 static void tcg_out_op(TCGContext *s, TCGOpcode opc,
                        const TCGArg args[TCG_MAX_OP_ARGS],
@@ -1302,6 +1351,16 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
         s->tb_next_offset[a0] = tcg_current_code_size(s);
         break;
 
+    case INDEX_op_hotpatch:
+        tcg_out_hotpatch(s, args[0], args[1]);
+        break;
+    case INDEX_op_jmp:
+        if (const_args[0]) {
+            tcg_out_goto(s, (tcg_insn_unit *)args[0]);
+        } else {
+            tcg_out_insn(s, 3207, BR, args[0]);
+        }
+        break;
     case INDEX_op_br:
         tcg_out_goto_label(s, arg_label(a0));
         break;
@@ -1637,6 +1696,8 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
 }
 
 static const TCGTargetOpDef aarch64_op_defs[] = {
+    { INDEX_op_hotpatch, { "i", "i" } },
+    { INDEX_op_jmp, { "ri" } },
     { INDEX_op_exit_tb, { } },
     { INDEX_op_goto_tb, { } },
     { INDEX_op_br, { } },
@@ -1748,6 +1809,10 @@ static const TCGTargetOpDef aarch64_op_defs[] = {
     { INDEX_op_muluh_i64, { "r", "r", "r" } },
     { INDEX_op_mulsh_i64, { "r", "r", "r" } },
 
+#define DEF(name,a1,a2,a3,a4) { INDEX_op_##name, {} },
+#include "tcg-opc-vector.h"
+#undef DEF
+
     { -1 },
 };
 
@@ -1777,12 +1842,24 @@ static void tcg_target_init(TCGContext *s)
     tcg_add_target_add_op_defs(aarch64_op_defs);
 }
 
+static void tcg_out_epilogue(TCGContext *s)
+{
+    /* IBTC exit entry: publish its address in ibtc_ret_addr, then emit a load of 0 into X0. */
+    ibtc_ret_addr = s->code_ptr;
+    tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_X0, 0); /* no branch is emitted after this; execution continues into whatever follows */
+}
+
+#if defined(CONFIG_LLVM)
+#define STACK_SIZE 0x800
+#else
+#define STACK_SIZE TCG_STATIC_CALL_ARGS_SIZE
+#endif
 /* Saving pairs: (X19, X20) .. (X27, X28), (X29(fp), X30(lr)).  */
 #define PUSH_SIZE  ((30 - 19 + 1) * 8)
 
 #define FRAME_SIZE \
     ((PUSH_SIZE \
-      + TCG_STATIC_CALL_ARGS_SIZE \
+      + STACK_SIZE \
       + CPU_TEMP_BUF_NLONGS * sizeof(long) \
       + TCG_TARGET_STACK_ALIGN - 1) \
      & ~(TCG_TARGET_STACK_ALIGN - 1))
@@ -1828,6 +1905,7 @@ static void tcg_target_qemu_prologue(TCGContext *s)
     tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
     tcg_out_insn(s, 3207, BR, tcg_target_call_iarg_regs[1]);
 
+    tcg_out_epilogue(s);
     tb_ret_addr = s->code_ptr;
 
     /* Remove TCG locals stack space.  */
diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 9187d34..b95f5fb 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -139,7 +139,8 @@ static bool have_bmi2;
 # define have_bmi2 0
 #endif
 
-static tcg_insn_unit *tb_ret_addr;
+tcg_insn_unit *tb_ret_addr;
+tcg_insn_unit *ibtc_ret_addr;
 
 static void patch_reloc(tcg_insn_unit *code_ptr, int type,
                         intptr_t value, intptr_t addend)
@@ -323,6 +324,7 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
 #define OPC_MOVB_EvGv	(0x88)		/* stores, more or less */
 #define OPC_MOVL_EvGv	(0x89)		/* stores, more or less */
 #define OPC_MOVL_GvEv	(0x8b)		/* loads, more or less */
+#define OPC_NOP         (0x90)
 #define OPC_MOVB_EvIz   (0xc6)
 #define OPC_MOVL_EvIz	(0xc7)
 #define OPC_MOVL_Iv     (0xb8)
@@ -1150,6 +1152,62 @@ static void * const qemu_st_helpers[16] = {
     [MO_BEQ]  = helper_be_stq_mmu,
 };
 
+/* helpers for LLVM: load helpers indexed by MO_* constant; slots not listed here stay NULL */
+void * const llvm_ld_helpers[16] = {
+    [MO_UB]   = llvm_ret_ldub_mmu,
+    [MO_LEUW] = llvm_le_lduw_mmu,
+    [MO_LEUL] = llvm_le_ldul_mmu,
+    [MO_LEQ]  = llvm_le_ldq_mmu,
+    [MO_BEUW] = llvm_be_lduw_mmu,
+    [MO_BEUL] = llvm_be_ldul_mmu,
+    [MO_BEQ]  = llvm_be_ldq_mmu,
+};
+
+void * const llvm_st_helpers[16] = { /* store helpers, indexed by the same MO_* constants; unlisted slots stay NULL */
+    [MO_UB]   = llvm_ret_stb_mmu,
+    [MO_LEUW] = llvm_le_stw_mmu,
+    [MO_LEUL] = llvm_le_stl_mmu,
+    [MO_LEQ]  = llvm_le_stq_mmu,
+    [MO_BEUW] = llvm_be_stw_mmu,
+    [MO_BEUL] = llvm_be_stl_mmu,
+    [MO_BEQ]  = llvm_be_stq_mmu,
+};
+
+static inline void tcg_out_compute_gva(TCGContext *s, TCGReg addrlo,
+                                       TCGMemOp opc, int trexw, int tv_hrexw)
+{   /* Emits code that leaves in TCG_REG_L1 the page-masked form of addrlo; which form depends on the build macros below. */
+    const TCGReg r1 = TCG_REG_L1;
+    int s_mask = (1 << (opc & MO_SIZE)) - 1; /* access size in bytes, minus one */
+
+#if defined(ALIGNED_ONLY)
+    TCGType ttype = TCG_TYPE_I32;
+    bool aligned = (opc & MO_AMASK) == MO_ALIGN || s_mask == 0;
+    if (TCG_TARGET_REG_BITS == 64 && TARGET_LONG_BITS == 64)
+        ttype = TCG_TYPE_I64;
+    if (aligned) {
+        tcg_out_mov(s, ttype, r1, addrlo);
+    } else {
+        /* For unaligned access check that we don't cross pages using
+           the page address of the last byte.  */
+        tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask);
+    }
+    tgen_arithi(s, ARITH_AND + trexw, r1,
+                TARGET_PAGE_MASK | (aligned ? s_mask : 0), 0); /* the aligned case also keeps the low s_mask bits */
+#elif defined(ENABLE_TLBVERSION)
+    /* the following code is equivalent to
+     * (((addr + (size - 1)) & TARGET_PAGE_MASK) | env->tlb_version) */
+    tcg_out_modrm_sib_offset(s, OPC_LEA + trexw, r1, addrlo, -1, 0, s_mask);
+    tgen_arithi(s, ARITH_AND + trexw, r1, TARGET_PAGE_MASK, 0);
+    tcg_out_modrm_offset(s, (OPC_ARITH_GvEv | (ARITH_OR << 3)) + trexw + tv_hrexw,
+                         r1, TCG_AREG0, offsetof(CPUArchState, tlb_version)); /* the only place tv_hrexw is used in this function */
+#else
+    /* the following code is equivalent to
+     * ((addr + (size - 1)) & TARGET_PAGE_MASK) */
+    tcg_out_modrm_sib_offset(s, OPC_LEA + trexw, r1, addrlo, -1, 0, s_mask);
+    tgen_arithi(s, ARITH_AND + trexw, r1, TARGET_PAGE_MASK, 0);
+#endif
+}
+
 /* Perform the TLB load and compare.
 
    Inputs:
@@ -1179,9 +1237,7 @@ static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
     const TCGReg r1 = TCG_REG_L1;
     TCGType ttype = TCG_TYPE_I32;
     TCGType tlbtype = TCG_TYPE_I32;
-    int trexw = 0, hrexw = 0, tlbrexw = 0;
-    int s_mask = (1 << (opc & MO_SIZE)) - 1;
-    bool aligned = (opc & MO_AMASK) == MO_ALIGN || s_mask == 0;
+    int trexw = 0, hrexw = 0, tlbrexw = 0, tv_hrexw = 0;
 
     if (TCG_TARGET_REG_BITS == 64) {
         if (TARGET_LONG_BITS == 64) {
@@ -1197,20 +1253,18 @@ static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
         }
     }
 
+#if defined(ENABLE_TLBVERSION_EXT)
+    trexw = 0;
+    tv_hrexw = P_REXW;
+#endif
+
     tcg_out_mov(s, tlbtype, r0, addrlo);
-    if (aligned) {
-        tcg_out_mov(s, ttype, r1, addrlo);
-    } else {
-        /* For unaligned access check that we don't cross pages using
-           the page address of the last byte.  */
-        tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask);
-    }
 
     tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
                    TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
 
-    tgen_arithi(s, ARITH_AND + trexw, r1,
-                TARGET_PAGE_MASK | (aligned ? s_mask : 0), 0);
+    tcg_out_compute_gva(s, addrlo, opc, trexw, tv_hrexw);
+
     tgen_arithi(s, ARITH_AND + tlbrexw, r0,
                 (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS, 0);
 
@@ -1219,7 +1273,7 @@ static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
                              + which);
 
     /* cmp 0(r0), r1 */
-    tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, 0);
+    tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw + tv_hrexw, r1, r0, 0);
 
     /* Prepare for both the fast path add of the tlb addend, and the slow
        path function argument setup.  There are two cases worth note:
@@ -1754,6 +1808,73 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
 #endif
 }
 
+/*
+ * Emit trace profiling/prediction stubs. The code sequence is as follows:
+ *   S1: direct jump (the reloc part requires 4-byte alignment)
+ *   S2: trace profiling stub (calls helper_NET_profile)
+ *   S3: trace prediction stub (calls helper_NET_predict)
+ *   S4: beginning of QEMU emulation code
+ *
+ * The jump inst of S1 is initially set to jump to S3 (i.e. skipping S2).
+ * tb->patch_jmp holds the offset of S1 and tb->patch_next the offset of S3,
+ * which is used to turn the trace profiling off. If is_user or emit_helper
+ * is 0, only S1 is emitted. NOTE(review): patch_skip (S4) is not set here.
+ */
+static void tcg_out_hotpatch(TCGContext *s, uint32_t is_user, uint32_t emit_helper)
+{
+    uint8_t *label_ptr[2];
+    TranslationBlock *tb = s->tb;
+
+    /* S1: direct jump; NOP-pad until the displacement after the 1-byte opcode is 4-byte aligned */
+    while (((uintptr_t)s->code_ptr + 1) % 4)
+        tcg_out8(s, OPC_NOP);
+
+    tb->patch_jmp = (uint16_t)(s->code_ptr - s->code_buf); /* byte offset of the jmp opcode from code_buf, truncated to 16 bits */
+
+    tcg_out8(s, OPC_JMP_long);
+    label_ptr[0] = s->code_ptr; /* address of the 4-byte displacement field */
+    s->code_ptr += 4; /* reserve the field; it is filled in below */
+
+    if (is_user == 0 || emit_helper == 0) {
+        *(uint32_t *)label_ptr[0] = s->code_ptr - label_ptr[0] - 4; /* evaluates to 0 here: jump to the very next insn */
+        tb->patch_next = (uint16_t)(s->code_ptr - s->code_buf);
+        return;
+    }
+
+    /* S2: trace profiling stub; 32-bit hosts pass (TCG_AREG0, tb->id) on the stack, others in the first two arg regs */
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, 0);
+        tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, 4, tb->id);
+    } else {
+        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
+        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[1], tb->id);
+    }
+    tcg_out_call(s, (tcg_insn_unit *)helper_NET_profile);
+    *(uint32_t *)label_ptr[0] = s->code_ptr - label_ptr[0] - 4; /* S1 displacement, counted from the end of its field, now reaches S3 */
+
+    /* S3: trace prediction stub; the je below skips the helper_NET_predict call */
+    tb->patch_next = (uint16_t)(s->code_ptr - s->code_buf);
+
+    tcg_out_ld(s, TCG_TYPE_I32, tcg_target_reg_alloc_order[0], 
+               TCG_AREG0, offsetof(CPUArchState, start_trace_prediction));
+    tcg_out_cmp(s, tcg_target_reg_alloc_order[0], 0, 1, 0); /* presumably compares the loaded flag with constant 0 -- confirm argument order */
+    tcg_out_opc(s, OPC_JCC_long + JCC_JE, 0, 0, 0);
+    label_ptr[1] = s->code_ptr; /* displacement field of the je, reserved next */
+    s->code_ptr += 4;
+
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, 0);
+        tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, 4, tb->id);
+    } else {
+        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
+        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[1], tb->id);
+    }
+    tcg_out_call(s, (tcg_insn_unit *)helper_NET_predict);
+    *(uint32_t *)label_ptr[1] = s->code_ptr - label_ptr[1] - 4; /* the je lands here, i.e. at S4 */
+
+    /* S4: QEMU emulation code */
+}
+
 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
                               const TCGArg *args, const int *const_args)
 {
@@ -1777,6 +1898,10 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
     case INDEX_op_goto_tb:
         if (s->tb_jmp_offset) {
             /* direct jump method */
+#if defined(CONFIG_USER_ONLY)
+            while (((uintptr_t)s->code_ptr + 1) % 4) /* need 4-byte aligned */
+                tcg_out8(s, OPC_NOP);
+#endif
             tcg_out8(s, OPC_JMP_long); /* jmp im */
             s->tb_jmp_offset[args[0]] = tcg_current_code_size(s);
             tcg_out32(s, 0);
@@ -1787,6 +1912,17 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         }
         s->tb_next_offset[args[0]] = tcg_current_code_size(s);
         break;
+    case INDEX_op_hotpatch:
+        tcg_out_hotpatch(s, args[0], args[1]);
+        break;
+    case INDEX_op_jmp:
+        if (const_args[0]) {
+            tcg_out_jmp(s, (tcg_insn_unit *)args[0]);
+        } else {
+            /* jmp *reg */
+            tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, args[0]);
+        }
+        break;
     case INDEX_op_br:
         tcg_out_jxx(s, JCC_JMP, arg_label(args[0]), 0);
         break;
@@ -2110,6 +2246,8 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
 }
 
 static const TCGTargetOpDef x86_op_defs[] = {
+    { INDEX_op_hotpatch, { "i", "i" } },
+    { INDEX_op_jmp, { "ri" } },
     { INDEX_op_exit_tb, { } },
     { INDEX_op_goto_tb, { } },
     { INDEX_op_br, { } },
@@ -2238,6 +2376,11 @@ static const TCGTargetOpDef x86_op_defs[] = {
     { INDEX_op_qemu_ld_i64, { "r", "r", "L", "L" } },
     { INDEX_op_qemu_st_i64, { "L", "L", "L", "L" } },
 #endif
+
+#define DEF(name,a1,a2,a3,a4) { INDEX_op_##name, {} },
+#include "tcg-opc-vector.h"
+#undef DEF
+
     { -1 },
 };
 
@@ -2261,16 +2404,29 @@ static int tcg_target_callee_save_regs[] = {
 #endif
 };
 
+static void tcg_out_epilogue(TCGContext *s)
+{
+    /* IBTC exit entry: publish its address in ibtc_ret_addr, then emit a load of 0 into EAX. */
+    ibtc_ret_addr = s->code_ptr;
+    tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, 0); /* no jump is emitted after this; execution continues into whatever follows */
+}
+
 /* Compute frame size via macros, to share between tcg_target_qemu_prologue
    and tcg_register_jit.  */
 
+#if defined(CONFIG_LLVM)
+#define STACK_SIZE 0x2000
+#else
+#define STACK_SIZE TCG_STATIC_CALL_ARGS_SIZE
+#endif
+
 #define PUSH_SIZE \
     ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
      * (TCG_TARGET_REG_BITS / 8))
 
 #define FRAME_SIZE \
     ((PUSH_SIZE \
-      + TCG_STATIC_CALL_ARGS_SIZE \
+      + STACK_SIZE \
       + CPU_TEMP_BUF_NLONGS * sizeof(long) \
       + TCG_TARGET_STACK_ALIGN - 1) \
      & ~(TCG_TARGET_STACK_ALIGN - 1))
@@ -2279,10 +2435,12 @@ static int tcg_target_callee_save_regs[] = {
 static void tcg_target_qemu_prologue(TCGContext *s)
 {
     int i, stack_addend;
+    tcg_target_long stack_align_mask;
 
     /* TB prologue */
 
     /* Reserve some stack space, also for TCG temps.  */
+    stack_align_mask = ~(TCG_TARGET_STACK_ALIGN - 1);
     stack_addend = FRAME_SIZE - PUSH_SIZE;
     tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
                   CPU_TEMP_BUF_NLONGS * sizeof(long));
@@ -2296,6 +2454,9 @@ static void tcg_target_qemu_prologue(TCGContext *s)
     tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
                (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
     tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
+    tcg_out_st(s, TCG_TYPE_PTR, TCG_REG_ESP, TCG_AREG0,
+               offsetof(CPUArchState, sp));
+    tgen_arithi(s, ARITH_AND, TCG_REG_ESP, stack_align_mask, 0);
     /* jmp *tb.  */
     tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
 		         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
@@ -2303,13 +2464,19 @@ static void tcg_target_qemu_prologue(TCGContext *s)
 #else
     tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
     tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
+    tcg_out_st(s, TCG_TYPE_PTR, TCG_REG_ESP, TCG_AREG0,
+               offsetof(CPUArchState, sp));
+    tgen_arithi(s, ARITH_AND + P_REXW, TCG_REG_ESP, stack_align_mask, 0);
     /* jmp *tb.  */
     tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
 #endif
 
     /* TB epilogue */
+    tcg_out_epilogue(s);
     tb_ret_addr = s->code_ptr;
 
+    tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_ESP, TCG_AREG0,
+               offsetof(CPUArchState, sp));
     tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
 
     for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 92be341..c5715dc 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -67,7 +67,7 @@ typedef enum {
 
 /* used for function call generation */
 #define TCG_REG_CALL_STACK TCG_REG_ESP 
-#define TCG_TARGET_STACK_ALIGN 16
+#define TCG_TARGET_STACK_ALIGN 32
 #if defined(_WIN64)
 #define TCG_TARGET_CALL_STACK_OFFSET 32
 #else
diff --git a/tcg/ppc/tcg-target.c b/tcg/ppc/tcg-target.c
index 2c72565..ca5c7a4 100644
--- a/tcg/ppc/tcg-target.c
+++ b/tcg/ppc/tcg-target.c
@@ -78,7 +78,8 @@
 #define TCG_CT_CONST_ZERO 0x1000
 #define TCG_CT_CONST_MONE 0x2000
 
-static tcg_insn_unit *tb_ret_addr;
+tcg_insn_unit *tb_ret_addr;
+tcg_insn_unit *ibtc_ret_addr;
 
 #include "elf.h"
 static bool have_isa_2_06;
@@ -1785,8 +1786,14 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
 #define CPU_TEMP_BUF_SIZE  (CPU_TEMP_BUF_NLONGS * (int)sizeof(long))
 #define REG_SAVE_SIZE      ((int)ARRAY_SIZE(tcg_target_callee_save_regs) * SZR)
 
+#if defined(CONFIG_LLVM)
+#define STACK_SIZE 0x800
+#else
+#define STACK_SIZE TCG_STATIC_CALL_ARGS_SIZE
+#endif
+
 #define FRAME_SIZE ((TCG_TARGET_CALL_STACK_OFFSET   \
-                     + TCG_STATIC_CALL_ARGS_SIZE    \
+                     + STACK_SIZE                   \
                      + CPU_TEMP_BUF_SIZE            \
                      + REG_SAVE_SIZE                \
                      + TCG_TARGET_STACK_ALIGN - 1)  \
@@ -1794,6 +1801,14 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
 
 #define REG_SAVE_BOT (FRAME_SIZE - REG_SAVE_SIZE)
 
+static unsigned num_epilogue_insns = 1; /* insn count the prologue's tb_ret_addr arithmetic assumes tcg_out_epilogue() emits */
+static void tcg_out_epilogue(TCGContext *s)
+{
+    /* IBTC exit entry: publish its address in ibtc_ret_addr, then emit a load of 0 into R3. */
+    ibtc_ret_addr = s->code_ptr;
+    tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R3, 0); /* NOTE(review): presumably a single insn, matching num_epilogue_insns; callers must emit it before tb_ret_addr -- confirm the NOP-padding path does */
+}
+
 static void tcg_target_qemu_prologue(TCGContext *s)
 {
     int i;
@@ -1832,27 +1847,29 @@ static void tcg_target_qemu_prologue(TCGContext *s)
     if (USE_REG_RA) {
 #ifdef _CALL_AIX
         /* Make the caller load the value as the TOC into R2.  */
-        tb_ret_addr = s->code_ptr + 2;
+        tb_ret_addr = s->code_ptr + 2 + num_epilogue_insns;
         desc[1] = tb_ret_addr;
         tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_RA, TCG_REG_R2);
         tcg_out32(s, BCCTR | BO_ALWAYS);
 #elif defined(_CALL_ELF) && _CALL_ELF == 2
         /* Compute from the incoming R12 value.  */
-        tb_ret_addr = s->code_ptr + 2;
+        tb_ret_addr = s->code_ptr + 2 + num_epilogue_insns;
         tcg_out32(s, ADDI | TAI(TCG_REG_RA, TCG_REG_R12,
                                 tcg_ptr_byte_diff(tb_ret_addr, s->code_buf)));
         tcg_out32(s, BCCTR | BO_ALWAYS);
 #else
         /* Reserve max 5 insns for the constant load.  */
-        tb_ret_addr = s->code_ptr + 6;
+        tb_ret_addr = s->code_ptr + 6 + num_epilogue_insns;
         tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RA, (intptr_t)tb_ret_addr);
         tcg_out32(s, BCCTR | BO_ALWAYS);
         while (s->code_ptr < tb_ret_addr) {
             tcg_out32(s, NOP);
         }
 #endif
+        tcg_out_epilogue(s);
     } else {
         tcg_out32(s, BCCTR | BO_ALWAYS);
+        tcg_out_epilogue(s);
         tb_ret_addr = s->code_ptr;
     }
 
@@ -1869,6 +1886,85 @@ static void tcg_target_qemu_prologue(TCGContext *s)
     tcg_out32(s, BCLR | BO_ALWAYS);
 }
 
+static void tcg_out_jmp_short(uintptr_t jmp_addr, uintptr_t addr)
+{   /* Overwrites the 8 bytes at jmp_addr with a branch to addr followed by a NOP, using one 64-bit store. */
+    tcg_insn_unit i1, i2;
+    uint64_t pair;
+    intptr_t diff = addr - jmp_addr;
+
+    if (!in_range_b(diff))
+        tcg_abort(); /* displacement rejected by in_range_b(): abort rather than emit a bad branch */
+
+    i1 = B | (diff & 0x3fffffc); /* presumably the mask selects the branch's displacement field -- confirm */
+    i2 = NOP;
+#ifdef HOST_WORDS_BIGENDIAN
+    pair = (uint64_t)i1 << 32 | i2; /* in both layouts i1 ends up at the lower address */
+#else
+    pair = (uint64_t)i2 << 32 | i1;
+#endif
+    *(uint64_t *)jmp_addr = pair; /* NOTE(review): assumes jmp_addr is 8-byte aligned; no cache flush is done here */
+}
+
+/*
+ * Emit trace profiling/prediction stubs. The code sequence is as follows:
+ *   S1: direct jump (a two-insn patch slot, kept 8-byte aligned, then mtctr/bctr)
+ *   S2: trace profiling stub (calls helper_NET_profile)
+ *   S3: trace prediction stub (calls helper_NET_predict)
+ *   S4: beginning of QEMU emulation code
+ *
+ * The jump inst of S1 is initially set to jump to S3 (i.e. skipping S2).
+ * tb->patch_jmp holds the offset of S1 and tb->patch_next the offset of S3,
+ * which is used to turn the trace profiling off. If is_user or emit_helper
+ * is 0, only S1 is emitted. NOTE(review): patch_skip (S4) is not set here.
+ */
+static void tcg_out_hotpatch(TCGContext *s, int is_user, int emit_helper)
+{
+    tcg_insn_unit *label_ptr[2];
+    TranslationBlock *tb = s->tb;
+
+    /* S1: direct jump. Emit one NOP if needed so the patch slot starts 8-byte aligned. */
+    if ((uintptr_t)s->code_ptr & 7)
+        tcg_out32(s, NOP);
+
+    tb->patch_jmp = (uint16_t)((intptr_t)s->code_ptr - (intptr_t)s->code_buf); /* byte offset of the slot from code_buf, truncated to 16 bits */
+
+    /* S1: Direct Jump. Without the helper stubs the slot branches past all four S1 insns. */
+    if (is_user == 0 || emit_helper == 0) {
+        tcg_out_jmp_short((uintptr_t)s->code_ptr, (uintptr_t)(s->code_ptr + 4));
+        s->code_ptr += 2; /* step over the two insns tcg_out_jmp_short just wrote */
+        tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
+        tcg_out32(s, BCCTR | BO_ALWAYS);
+        tb->patch_next = (uint16_t)((intptr_t)s->code_ptr - (intptr_t)s->code_buf); /* offset just past S1 */
+        return;
+    }
+
+    label_ptr[0] = s->code_ptr; /* remember the slot; it is filled in once S2 has been emitted */
+    s->code_ptr += 2; /* reserve the two-insn slot */
+    tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
+    tcg_out32(s, BCCTR | BO_ALWAYS);
+
+    /* S2: Trace Profiling Stub: helper_NET_profile(TCG_AREG0, tb->id) */
+    tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
+    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[1], tb->id);
+    tcg_out_call(s, (tcg_insn_unit *)helper_NET_profile);
+    tcg_out_jmp_short((uintptr_t)label_ptr[0], (uintptr_t)s->code_ptr); /* the slot now branches here, i.e. to S3 */
+
+    /* S3: Trace Prediction stub: the conditional branch below skips the helper_NET_predict call */
+    tb->patch_next = (uint16_t)((intptr_t)s->code_ptr - (intptr_t)s->code_buf);
+
+    tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_TMP1, TCG_AREG0,
+               offsetof(CPUArchState, start_trace_prediction));
+    tcg_out_cmp(s, TCG_COND_EQ, TCG_REG_TMP1, 0, 1, 7, TCG_TYPE_I32); /* presumably compares the loaded flag with constant 0 -- confirm argument order */
+    label_ptr[1] = s->code_ptr; /* conditional branch whose target is fixed up below */
+    tcg_out_bc_noaddr(s, tcg_to_bc[TCG_COND_EQ]);
+    tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
+    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[1], tb->id);
+    tcg_out_call(s, (tcg_insn_unit *)helper_NET_predict);
+    reloc_pc14(label_ptr[1], s->code_ptr); /* the EQ branch lands here, i.e. at S4 */
+
+    /* S4: QEMU emulation code */
+}
+
 static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
                        const int *const_args)
 {
@@ -1906,6 +2002,17 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
         tcg_out32(s, BCCTR | BO_ALWAYS);
         s->tb_next_offset[args[0]] = tcg_current_code_size(s);
         break;
+    case INDEX_op_hotpatch:
+        tcg_out_hotpatch(s, args[0], args[1]);
+        break;
+    case INDEX_op_jmp:
+        if (const_args[0]) {
+            tcg_out_b(s, 0, (tcg_insn_unit *)args[0]);
+        } else {
+            tcg_out32(s, MTSPR | RS(args[0]) | CTR);
+            tcg_out32(s, BCCTR | BO_ALWAYS);
+        }
+        break;
     case INDEX_op_br:
         {
             TCGLabel *l = arg_label(args[0]);
@@ -2436,6 +2543,8 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
 }
 
 static const TCGTargetOpDef ppc_op_defs[] = {
+    { INDEX_op_hotpatch, { "i", "i" } },
+    { INDEX_op_jmp, { "ri" } },
     { INDEX_op_exit_tb, { } },
     { INDEX_op_goto_tb, { } },
     { INDEX_op_br, { } },
@@ -2572,6 +2681,10 @@ static const TCGTargetOpDef ppc_op_defs[] = {
     { INDEX_op_qemu_st_i64, { "S", "S", "S", "S" } },
 #endif
 
+#define DEF(name,a1,a2,a3,a4) { INDEX_op_##name, {} },
+#include "tcg-opc-vector.h"
+#undef DEF
+
     { -1 },
 };
 
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index 0b9dd8f..3773253 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -39,7 +39,7 @@ extern TCGv_i32 TCGV_HIGH_link_error(TCGv_i64);
    Up to and including filling in the forward link immediately.  We'll do
    proper termination of the end of the list after we finish translation.  */
 
-static void tcg_emit_op(TCGContext *ctx, TCGOpcode opc, int args)
+void tcg_emit_op(TCGContext *ctx, TCGOpcode opc, int args)
 {
     int oi = ctx->gen_next_op_idx;
     int ni = oi + 1;
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index 4e20dc1..17d31df 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -28,6 +28,7 @@
 
 /* Basic output routines.  Not for general consumption.  */
 
+void tcg_emit_op(TCGContext *ctx, TCGOpcode opc, int args);
 void tcg_gen_op1(TCGContext *, TCGOpcode, TCGArg);
 void tcg_gen_op2(TCGContext *, TCGOpcode, TCGArg, TCGArg);
 void tcg_gen_op3(TCGContext *, TCGOpcode, TCGArg, TCGArg, TCGArg);
@@ -311,6 +312,16 @@ void tcg_gen_ext16u_i32(TCGv_i32 ret, TCGv_i32 arg);
 void tcg_gen_bswap16_i32(TCGv_i32 ret, TCGv_i32 arg);
 void tcg_gen_bswap32_i32(TCGv_i32 ret, TCGv_i32 arg);
 
+static inline void tcg_gen_hotpatch(uint32_t arg1, uint32_t arg2)
+{ /* Emit a hotpatch op (two constant args) into the thread-local tcg_ctx. */
+    tcg_gen_op2(&tcg_ctx, INDEX_op_hotpatch, arg1, arg2);
+}
+
+static inline void tcg_gen_annotate(uint32_t arg)
+{ /* Emit an annotate op (one constant arg); tcg_gen_code() skips it. */
+    tcg_gen_op1(&tcg_ctx, INDEX_op_annotate, arg);
+}
+
 static inline void tcg_gen_discard_i32(TCGv_i32 arg)
 {
     tcg_gen_op1_i32(INDEX_op_discard, arg);
diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
index 6d0410c..5ba1e05 100644
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@@ -26,12 +26,16 @@
  * DEF(name, oargs, iargs, cargs, flags)
  */
 
+DEF(hotpatch, 0, 0, 2, 0)
+DEF(annotate, 0, 0, 1, TCG_OPF_NOT_PRESENT)
+
 /* predefined ops */
 DEF(discard, 1, 0, 0, TCG_OPF_NOT_PRESENT)
 DEF(set_label, 0, 0, 1, TCG_OPF_BB_END | TCG_OPF_NOT_PRESENT)
 
 /* variable number of parameters */
 DEF(call, 0, 0, 3, TCG_OPF_CALL_CLOBBER | TCG_OPF_NOT_PRESENT)
+DEF(jmp, 0, 1, 0, TCG_OPF_BB_END) /* fixed arity: one input, the jump target */
 
 DEF(br, 0, 0, 1, TCG_OPF_BB_END)
 
@@ -191,6 +195,8 @@ DEF(qemu_ld_i64, DATA64_ARGS, TLADDR_ARGS, 1,
 DEF(qemu_st_i64, 0, TLADDR_ARGS + DATA64_ARGS, 1,
     TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | TCG_OPF_64BIT)
 
+#include "tcg-opc-vector.h"
+
 #undef TLADDR_ARGS
 #undef DATA64_ARGS
 #undef IMPL
diff --git a/tcg/tcg.c b/tcg/tcg.c
index a163541..ea5091b 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -304,19 +304,22 @@ void tcg_pool_reset(TCGContext *s)
     s->pool_current = NULL;
 }
 
-typedef struct TCGHelperInfo {
-    void *func;
-    const char *name;
-    unsigned flags;
-    unsigned sizemask;
-} TCGHelperInfo;
-
 #include "exec/helper-proto.h"
 
-static const TCGHelperInfo all_helpers[] = {
+const TCGHelperInfo all_helpers[] = {
 #include "exec/helper-tcg.h"
 };
 
+int tcg_num_helpers(void)
+{ /* Number of entries in all_helpers[]. */
+    return ARRAY_SIZE(all_helpers);
+}
+
+const TCGHelperInfo *get_tcg_helpers(void)
+{ /* Return all_helpers[]; its length comes from tcg_num_helpers(). */
+    return all_helpers;
+}
+
 void tcg_context_init(TCGContext *s)
 {
     int op, total_args, n, i;
@@ -413,7 +416,7 @@ void tcg_set_frame(TCGContext *s, int reg, intptr_t start, intptr_t size)
     s->frame_reg = reg;
 }
 
-void tcg_func_start(TCGContext *s)
+void tcg_func_start(TCGContext *s, TranslationBlock *tb)
 {
     tcg_pool_reset(s);
     s->nb_temps = s->nb_globals;
@@ -432,8 +435,10 @@ void tcg_func_start(TCGContext *s)
     s->gen_last_op_idx = -1;
     s->gen_next_op_idx = 0;
     s->gen_next_parm_idx = 0;
+    s->vec_opparam_ptr = s->vec_opparam_buf;
 
     s->be = tcg_malloc(sizeof(TCGBackendData));
+    s->tb = tb;
 }
 
 static inline void tcg_temp_alloc(TCGContext *s, int n)
@@ -1004,6 +1009,7 @@ void tcg_dump_ops(TCGContext *s)
     char buf[128];
     TCGOp *op;
     int oi;
+    const TCGArg *vec_args = s->vec_opparam_buf;
 
     for (oi = s->gen_first_op_idx; oi >= 0; oi = op->next) {
         int i, k, nb_oargs, nb_iargs, nb_cargs;
@@ -1051,8 +1057,29 @@ void tcg_dump_ops(TCGContext *s)
                 qemu_log(",%s", t);
             }
         } else {
+            int is_vec = 0;
             qemu_log(" %s ", def->name);
 
+            /* print vector opc */
+            switch (c) {
+            case INDEX_op_vector_start ... INDEX_op_vector_end:
+                is_vec = 1;
+                break;
+            default:
+                break;
+            }
+            if (is_vec) {
+                qemu_log("$0x%" TCG_PRIlx, vec_args[0]);
+                if (c == INDEX_op_vload_128 || c == INDEX_op_vstore_128)
+                    qemu_log(",%s", tcg_get_arg_str_idx(s, buf, sizeof(buf), vec_args[1]));
+                else
+                    qemu_log(",$0x%" TCG_PRIlx, vec_args[1]);
+                qemu_log(",$0x%" TCG_PRIlx, vec_args[2]);
+                qemu_log("\n");
+                vec_args += 3;
+                continue;
+            }
+
             nb_oargs = def->nb_oargs;
             nb_iargs = def->nb_iargs;
             nb_cargs = def->nb_cargs;
@@ -1138,6 +1165,172 @@ void tcg_dump_ops(TCGContext *s)
     }
 }
 
+void tcg_dump_ops_fn(TCGContext *s, void (*fn)(const char *))
+{ /* Dump the op list of s, handing every formatted fragment to fn. */
+    char buf[128]; /* scratch for tcg_get_arg_str_idx() */
+    char outbuf[128]; /* one fragment; snprintf truncates longer text */
+    TCGOp *op;
+    int oi;
+    const TCGArg *vec_args = s->vec_opparam_buf; /* 3 args per vector op */
+    /* printops(fmt, ...): format into outbuf, then call fn(outbuf) once. */
+#define printops(args...) \
+    do { snprintf(outbuf, 128, ##args); (*fn)(outbuf); } while(0)
+
+    for (oi = s->gen_first_op_idx; oi >= 0; oi = op->next) {
+        int i, k, nb_oargs, nb_iargs, nb_cargs;
+        const TCGOpDef *def;
+        const TCGArg *args;
+        TCGOpcode c;
+
+        op = &s->gen_op_buf[oi];
+        c = op->opc;
+        def = &tcg_op_defs[c];
+        args = &s->gen_opparam_buf[op->args];
+
+        if (c == INDEX_op_insn_start) {
+            printops("%s ----", oi != s->gen_first_op_idx ? "\n" : "");
+            /* One value per start word (built from two args on narrow hosts). */
+            for (i = 0; i < TARGET_INSN_START_WORDS; ++i) {
+                target_ulong a;
+#if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
+                a = ((target_ulong)args[i * 2 + 1] << 32) | args[i * 2];
+#else
+                a = args[i];
+#endif
+                printops(" " TARGET_FMT_lx, a);
+            }
+        } else if (c == INDEX_op_call) {
+            /* variable number of arguments: counts come from the op itself */
+            nb_oargs = op->callo;
+            nb_iargs = op->calli;
+            nb_cargs = def->nb_cargs;
+
+            /* function name, flags, out args */
+            printops(" %s %s,$0x%" TCG_PRIlx ",$%d", def->name,
+                     tcg_find_helper(s, args[nb_oargs + nb_iargs]),
+                     args[nb_oargs + nb_iargs + 1], nb_oargs);
+            for (i = 0; i < nb_oargs; i++) {
+                printops(",%s", tcg_get_arg_str_idx(s, buf, sizeof(buf),
+                                                    args[i]));
+            }
+            for (i = 0; i < nb_iargs; i++) {
+                TCGArg arg = args[nb_oargs + i];
+                const char *t = "<dummy>";
+                if (arg != TCG_CALL_DUMMY_ARG) {
+                    t = tcg_get_arg_str_idx(s, buf, sizeof(buf), arg);
+                }
+                printops(",%s", t);
+            }
+        } else {
+            int is_vec = 0;
+            printops(" %s ", def->name);
+            /* Vector ops take 3 args from vec_args (not args[]) and end here. */
+            /* print vector opc: arg 1 is a temp for vload_128/vstore_128 */
+            switch (c) {
+            case INDEX_op_vector_start ... INDEX_op_vector_end:
+                is_vec = 1;
+                break;
+            default:
+                break;
+            }
+            if (is_vec) {
+                printops("$0x%" TCG_PRIlx, vec_args[0]);
+                if (c == INDEX_op_vload_128 || c == INDEX_op_vstore_128)
+                    printops(",%s", tcg_get_arg_str_idx(s, buf, sizeof(buf), vec_args[1]));
+                else
+                    printops(",$0x%" TCG_PRIlx, vec_args[1]);
+                printops(",$0x%" TCG_PRIlx, vec_args[2]);
+                printops("\n");
+                vec_args += 3;
+                continue;
+            }
+
+            nb_oargs = def->nb_oargs;
+            nb_iargs = def->nb_iargs;
+            nb_cargs = def->nb_cargs;
+            /* k indexes args[]; it is also used to decide whether to print ','. */
+            k = 0;
+            for (i = 0; i < nb_oargs; i++) {
+                if (k != 0) {
+                    printops(",");
+                }
+                printops("%s", tcg_get_arg_str_idx(s, buf, sizeof(buf),
+                                                   args[k++]));
+            }
+            for (i = 0; i < nb_iargs; i++) {
+                if (k != 0) {
+                    printops(",");
+                }
+                printops("%s", tcg_get_arg_str_idx(s, buf, sizeof(buf),
+                                                   args[k++]));
+            }
+            switch (c) {
+            case INDEX_op_brcond_i32:
+            case INDEX_op_setcond_i32:
+            case INDEX_op_movcond_i32:
+            case INDEX_op_brcond2_i32:
+            case INDEX_op_setcond2_i32:
+            case INDEX_op_brcond_i64:
+            case INDEX_op_setcond_i64:
+            case INDEX_op_movcond_i64:
+                if (args[k] < ARRAY_SIZE(cond_name) && cond_name[args[k]]) {
+                    printops(",%s", cond_name[args[k++]]);
+                } else {
+                    printops(",$0x%" TCG_PRIlx, args[k++]);
+                }
+                i = 1; /* one constant arg has been printed already */
+                break;
+            case INDEX_op_qemu_ld_i32:
+            case INDEX_op_qemu_st_i32:
+            case INDEX_op_qemu_ld_i64:
+            case INDEX_op_qemu_st_i64:
+                {
+                    TCGMemOpIdx oi = args[k++]; /* shadows the loop's oi */
+                    TCGMemOp op = get_memop(oi); /* shadows the loop's op */
+                    unsigned ix = get_mmuidx(oi);
+
+                    if (op & ~(MO_AMASK | MO_BSWAP | MO_SSIZE)) {
+                        printops(",$0x%x,%u", op, ix);
+                    } else {
+                        const char *s_al = "", *s_op;
+                        if (op & MO_AMASK) {
+                            if ((op & MO_AMASK) == MO_ALIGN) {
+                                s_al = "al+";
+                            } else {
+                                s_al = "un+";
+                            }
+                        }
+                        s_op = ldst_name[op & (MO_BSWAP | MO_SSIZE)];
+                        printops(",%s%s,%u", s_al, s_op, ix);
+                    }
+                    i = 1; /* the memop/mmu-index constant is consumed */
+                }
+                break;
+            default:
+                i = 0;
+                break;
+            }
+            switch (c) {
+            case INDEX_op_set_label:
+            case INDEX_op_br:
+            case INDEX_op_brcond_i32:
+            case INDEX_op_brcond_i64:
+            case INDEX_op_brcond2_i32:
+                printops("%s$L%d", k ? "," : "", arg_label(args[k])->id);
+                i++, k++;
+                break;
+            default:
+                break;
+            }
+            for (; i < nb_cargs; i++, k++) { /* remaining constants as hex */
+                printops("%s$0x%" TCG_PRIlx, k ? "," : "", args[k]);
+            }
+        }
+        printops("\n");
+    }
+#undef printops
+}
+
 /* we give more priority to constraints with less registers */
 static int get_constraint_priority(const TCGOpDef *def, int k)
 {
@@ -1334,10 +1527,11 @@ static inline void tcg_la_bb_end(TCGContext *s, uint8_t *dead_temps,
 /* Liveness analysis : update the opc_dead_args array to tell if a
    given input arguments is dead. Instructions updating dead
    temporaries are removed. */
-static void tcg_liveness_analysis(TCGContext *s)
+void tcg_liveness_analysis(TCGContext *s)
 {
     uint8_t *dead_temps, *mem_temps;
     int oi, oi_prev, nb_ops;
+    TCGArg *vec_args = s->vec_opparam_ptr;
 
     nb_ops = s->gen_next_op_idx;
     s->op_dead_args = tcg_malloc(nb_ops * sizeof(uint16_t));
@@ -1427,6 +1621,7 @@ static void tcg_liveness_analysis(TCGContext *s)
                 }
             }
             break;
+        case INDEX_op_annotate:
         case INDEX_op_insn_start:
             break;
         case INDEX_op_discard:
@@ -1434,7 +1629,11 @@ static void tcg_liveness_analysis(TCGContext *s)
             dead_temps[args[0]] = 1;
             mem_temps[args[0]] = 0;
             break;
-
+        case INDEX_op_vector_start ... INDEX_op_vector_end:
+            vec_args -= 3;
+            if (opc == INDEX_op_vload_128 || opc == INDEX_op_vstore_128)
+                dead_temps[vec_args[1]] = 0;
+            break;
         case INDEX_op_add2_i32:
             opc_new = INDEX_op_add_i32;
             goto do_addsub2;
@@ -1577,7 +1776,7 @@ static void tcg_liveness_analysis(TCGContext *s)
 }
 #else
 /* dummy liveness analysis */
-static void tcg_liveness_analysis(TCGContext *s)
+void tcg_liveness_analysis(TCGContext *s)
 {
     int nb_ops;
     nb_ops = s->gen_opc_ptr - s->gen_opc_buf;
@@ -2418,6 +2617,8 @@ int tcg_gen_code(TCGContext *s, tcg_insn_unit *gen_code_buf)
                 s->gen_insn_data[num_insns][i] = a;
             }
             break;
+        case INDEX_op_annotate:
+            break;
         case INDEX_op_discard:
             temp_dead(s, args[0]);
             break;
@@ -2554,15 +2755,15 @@ struct jit_descriptor {
     struct jit_code_entry *first_entry;
 };
 
-void __jit_debug_register_code(void) __attribute__((noinline));
-void __jit_debug_register_code(void)
+void qemu_jit_debug_register_code(void) __attribute__((noinline));
+void qemu_jit_debug_register_code(void) /* NOTE(review): GDB hooks __jit_debug_register_code by name - confirm */
 {
     asm("");
 }
 
 /* Must statically initialize the version, because GDB may check
    the version before we can set it.  */
-struct jit_descriptor __jit_debug_descriptor = { 1, 0, 0, 0 };
+struct jit_descriptor qemu_jit_debug_descriptor = { 1, 0, 0, 0 };
 
 /* End GDB interface.  */
 
@@ -2771,10 +2972,10 @@ static void tcg_register_jit_int(void *buf_ptr, size_t buf_size,
     one_entry.symfile_addr = img;
     one_entry.symfile_size = img_size;
 
-    __jit_debug_descriptor.action_flag = JIT_REGISTER_FN;
-    __jit_debug_descriptor.relevant_entry = &one_entry;
-    __jit_debug_descriptor.first_entry = &one_entry;
-    __jit_debug_register_code();
+    qemu_jit_debug_descriptor.action_flag = JIT_REGISTER_FN;
+    qemu_jit_debug_descriptor.relevant_entry = &one_entry;
+    qemu_jit_debug_descriptor.first_entry = &one_entry;
+    qemu_jit_debug_register_code();
 }
 #else
 /* No support for the feature.  Provide the entry point expected by exec.c,
@@ -2790,3 +2991,34 @@ void tcg_register_jit(void *buf, size_t buf_size)
 {
 }
 #endif /* ELF_HOST_MACHINE */
+
+
+/*
+ * copy_tcg_context_global()
+ *  Copy the calling thread's local TCG context to the global TCG context.
+ *
+ * The main thread first initializes its own tcg_ctx and then snapshots it
+ * into tcg_ctx_global here. Other threads later fill their local tcg_ctx
+ * from tcg_ctx_global using copy_tcg_context().
+ *
+ * Note: Call this from the main thread, and only after tcg_ctx is completely
+ *       initialized. Only the first call copies; later calls return at once.
+ */
+void copy_tcg_context_global(void)
+{
+    static int init_once = 0; /* plain flag; no locking around it */
+    if (init_once == 1)
+        return;
+    /* Shallow copy: pointer members such as tb_ctx end up shared. */
+    memcpy(&tcg_ctx_global, &tcg_ctx, sizeof(TCGContext));
+    init_once = 1;
+}
+
+/*
+ * copy_tcg_context()
+ *  Overwrite the calling thread's tcg_ctx with a copy of tcg_ctx_global.
+ */
+void copy_tcg_context(void)
+{
+    memcpy(&tcg_ctx, &tcg_ctx_global, sizeof(TCGContext));
+}
diff --git a/tcg/tcg.h b/tcg/tcg.h
index a696922..3374257 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -193,6 +193,7 @@ typedef struct TCGPool {
 
 #define TCG_POOL_CHUNK_SIZE 32768
 
+#define TCG_MAX_LABELS 512
 #define TCG_MAX_TEMPS 512
 #define TCG_MAX_INSNS 512
 
@@ -564,7 +565,7 @@ struct TCGContext {
     /* Threshold to flush the translated code buffer.  */
     void *code_gen_highwater;
 
-    TBContext tb_ctx;
+    TBContext *tb_ctx;
 
     /* The TCGBackendData structure is private to tcg-target.c.  */
     struct TCGBackendData *be;
@@ -578,12 +579,33 @@ struct TCGContext {
 
     TCGOp gen_op_buf[OPC_BUF_SIZE];
     TCGArg gen_opparam_buf[OPPARAM_BUF_SIZE];
+    TCGArg vec_opparam_buf[OPPARAM_BUF_SIZE];
+    TCGArg *vec_opparam_ptr;
 
     uint16_t gen_insn_end_off[TCG_MAX_INSNS];
     target_ulong gen_insn_data[TCG_MAX_INSNS][TARGET_INSN_START_WORDS];
+
+    TranslationBlock *tb;
 };
 
-extern TCGContext tcg_ctx;
+extern TCGContext tcg_ctx_global;
+extern __thread TCGContext tcg_ctx;
+
+typedef struct TCGHelperInfo {
+    void *func;
+    const char *name;
+    unsigned flags;
+    unsigned sizemask;
+} TCGHelperInfo;
+
+void copy_tcg_context_global(void);
+void copy_tcg_context(void);
+int tcg_num_helpers(void);
+const TCGHelperInfo *get_tcg_helpers(void);
+void tcg_liveness_analysis(TCGContext *s);
+void tcg_dump_ops_fn(TCGContext *s, void (*fn)(const char *));
+target_long decode_sleb128(uint8_t **pp);
+
 
 /* The number of opcodes emitted so far.  */
 static inline int tcg_op_buf_count(void)
@@ -624,7 +646,7 @@ static inline void *tcg_malloc(int size)
 
 void tcg_context_init(TCGContext *s);
 void tcg_prologue_init(TCGContext *s);
-void tcg_func_start(TCGContext *s);
+void tcg_func_start(TCGContext *s, TranslationBlock *tb);
 
 int tcg_gen_code(TCGContext *s, tcg_insn_unit *gen_code_buf);
 
@@ -822,7 +844,7 @@ static inline TCGLabel *arg_label(TCGArg i)
 
 static inline ptrdiff_t tcg_ptr_byte_diff(void *a, void *b)
 {
-    return a - b;
+    return (ptrdiff_t)a - (ptrdiff_t)b; /* avoids arithmetic on void * */
 }
 
 /**
@@ -876,7 +898,7 @@ static inline TCGMemOpIdx make_memop_idx(TCGMemOp op, unsigned idx)
  */
 static inline TCGMemOp get_memop(TCGMemOpIdx oi)
 {
-    return oi >> 4;
+    return (TCGMemOp)(oi >> 4); /* explicit cast; presumably for C++ users */
 }
 
 /**
@@ -939,6 +961,7 @@ static inline unsigned get_mmuidx(TCGMemOpIdx oi)
 #define TB_EXIT_IDX1 1
 #define TB_EXIT_ICOUNT_EXPIRED 2
 #define TB_EXIT_REQUESTED 3
+#define TB_EXIT_LLVM TB_EXIT_ICOUNT_EXPIRED
 
 #ifdef HAVE_TCG_QEMU_TB_EXEC
 uintptr_t tcg_qemu_tb_exec(CPUArchState *env, uint8_t *tb_ptr);
@@ -1011,6 +1034,31 @@ uint32_t helper_be_ldl_cmmu(CPUArchState *env, target_ulong addr,
 uint64_t helper_be_ldq_cmmu(CPUArchState *env, target_ulong addr,
                             TCGMemOpIdx oi, uintptr_t retaddr);
 
+
+/* Value zero-extended to tcg register size.  */
+tcg_target_ulong llvm_ret_ldub_mmu(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi);
+tcg_target_ulong llvm_le_lduw_mmu(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi);
+tcg_target_ulong llvm_le_ldul_mmu(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi);
+uint64_t llvm_le_ldq_mmu(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi);
+tcg_target_ulong llvm_be_lduw_mmu(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi);
+tcg_target_ulong llvm_be_ldul_mmu(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi);
+uint64_t llvm_be_ldq_mmu(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi);
+
+/* Value sign-extended to tcg register size.  */
+tcg_target_ulong llvm_ret_ldsb_mmu(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi);
+tcg_target_ulong llvm_le_ldsw_mmu(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi);
+tcg_target_ulong llvm_le_ldsl_mmu(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi);
+tcg_target_ulong llvm_be_ldsw_mmu(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi);
+tcg_target_ulong llvm_be_ldsl_mmu(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi);
+
+void llvm_ret_stb_mmu(CPUArchState *env, target_ulong addr, uint8_t val, TCGMemOpIdx oi);
+void llvm_le_stw_mmu(CPUArchState *env, target_ulong addr, uint16_t val, TCGMemOpIdx oi);
+void llvm_le_stl_mmu(CPUArchState *env, target_ulong addr, uint32_t val, TCGMemOpIdx oi);
+void llvm_le_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val, TCGMemOpIdx oi);
+void llvm_be_stw_mmu(CPUArchState *env, target_ulong addr, uint16_t val, TCGMemOpIdx oi);
+void llvm_be_stl_mmu(CPUArchState *env, target_ulong addr, uint32_t val, TCGMemOpIdx oi);
+void llvm_be_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val, TCGMemOpIdx oi);
+
 /* Temporary aliases until backends are converted.  */
 #ifdef TARGET_WORDS_BIGENDIAN
 # define helper_ret_ldsw_mmu  helper_be_ldsw_mmu
diff --git a/translate-all.c b/translate-all.c
index 042a857..bf05326 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -63,6 +63,10 @@
 #include "qemu/bitmap.h"
 #include "qemu/timer.h"
 
+#include "hqemu.h"
+
+size_t get_cpu_size(void) { return sizeof(CPUArchState); } /* bytes in this target's CPUArchState */
+
 //#define DEBUG_TB_INVALIDATE
 //#define DEBUG_FLUSH
 /* make various TB consistency checks */
@@ -124,7 +128,8 @@ intptr_t qemu_host_page_mask;
 static void *l1_map[V_L1_SIZE];
 
 /* code generation context */
-TCGContext tcg_ctx;
+TCGContext tcg_ctx_global;
+__thread TCGContext tcg_ctx;
 
 /* translation block context */
 #ifdef CONFIG_USER_ONLY
@@ -135,7 +140,7 @@ void tb_lock(void)
 {
 #ifdef CONFIG_USER_ONLY
     assert(!have_tb_lock);
-    qemu_mutex_lock(&tcg_ctx.tb_ctx.tb_lock);
+    qemu_mutex_lock(&tcg_ctx.tb_ctx->tb_lock);
     have_tb_lock++;
 #endif
 }
@@ -145,7 +150,7 @@ void tb_unlock(void)
 #ifdef CONFIG_USER_ONLY
     assert(have_tb_lock);
     have_tb_lock--;
-    qemu_mutex_unlock(&tcg_ctx.tb_ctx.tb_lock);
+    qemu_mutex_unlock(&tcg_ctx.tb_ctx->tb_lock);
 #endif
 }
 
@@ -153,7 +158,7 @@ void tb_lock_reset(void)
 {
 #ifdef CONFIG_USER_ONLY
     if (have_tb_lock) {
-        qemu_mutex_unlock(&tcg_ctx.tb_ctx.tb_lock);
+        qemu_mutex_unlock(&tcg_ctx.tb_ctx->tb_lock);
         have_tb_lock = 0;
     }
 #endif
@@ -161,11 +166,12 @@ void tb_lock_reset(void)
 
 static void tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
                          tb_page_addr_t phys_page2);
-static TranslationBlock *tb_find_pc(uintptr_t tc_ptr);
+static TranslationBlock *tb_find_pc(CPUState *cpu, uintptr_t tc_ptr);
 
 void cpu_gen_init(void)
 {
     tcg_context_init(&tcg_ctx); 
+    tcg_ctx.tb_ctx = g_malloc0(sizeof(TBContext)); /* tb_ctx is a pointer now */
 }
 
 /* Encode VAL as a signed leb128 sequence at P.
@@ -190,7 +196,7 @@ static uint8_t *encode_sleb128(uint8_t *p, target_long val)
 
 /* Decode a signed leb128 sequence at *PP; increment *PP past the
    decoded value.  Return the decoded value.  */
-static target_long decode_sleb128(uint8_t **pp)
+target_long decode_sleb128(uint8_t **pp)
 {
     uint8_t *p = *pp;
     target_long val = 0;
@@ -268,6 +274,11 @@ static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
     int64_t ti = profile_getclock();
 #endif
 
+#if defined(CONFIG_LLVM)
+    if (llvm_locate_trace(searched_pc))
+        return llvm_restore_state(cpu, tb, searched_pc);
+#endif
+
     if (searched_pc < host_pc) {
         return -1;
     }
@@ -297,8 +308,8 @@ static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
     restore_state_to_opc(env, tb, data);
 
 #ifdef CONFIG_PROFILER
-    tcg_ctx.restore_time += profile_getclock() - ti;
-    tcg_ctx.restore_count++;
+    tcg_ctx_global.restore_time += profile_getclock() - ti;
+    tcg_ctx_global.restore_count++;
 #endif
     return 0;
 }
@@ -307,7 +318,7 @@ bool cpu_restore_state(CPUState *cpu, uintptr_t retaddr)
 {
     TranslationBlock *tb;
 
-    tb = tb_find_pc(retaddr);
+    tb = tb_find_pc(cpu, retaddr);
     if (tb) {
         cpu_restore_state_from_tb(cpu, tb, retaddr);
         if (tb->cflags & CF_NOCACHE) {
@@ -485,7 +496,13 @@ static inline PageDesc *page_find(tb_page_addr_t index)
 # define MAX_CODE_GEN_BUFFER_SIZE  ((size_t)-1)
 #endif
 
+/* Note: With CONFIG_LLVM the default code buffer size is doubled; half of the
+ * buffer serves as the trace code cache. */
+#if defined(CONFIG_LLVM)
+#define DEFAULT_CODE_GEN_BUFFER_SIZE_1 (32u * 1024 * 1024 * 2)
+#else
 #define DEFAULT_CODE_GEN_BUFFER_SIZE_1 (32u * 1024 * 1024)
+#endif
 
 #define DEFAULT_CODE_GEN_BUFFER_SIZE \
   (DEFAULT_CODE_GEN_BUFFER_SIZE_1 < MAX_CODE_GEN_BUFFER_SIZE \
@@ -503,6 +520,9 @@ static inline size_t size_code_gen_buffer(size_t tb_size)
            static buffer, we could size this on RESERVED_VA, on the text
            segment size of the executable, or continue to use the default.  */
         tb_size = (unsigned long)(ram_size / 4);
+#if defined(CONFIG_LLVM)
+        tb_size = (unsigned long)(ram_size / 2);
+#endif
 #endif
     }
     if (tb_size < MIN_CODE_GEN_BUFFER_SIZE) {
@@ -730,15 +750,18 @@ static inline void code_gen_alloc(size_t tb_size)
         fprintf(stderr, "Could not allocate dynamic translator buffer\n");
         exit(1);
     }
+#if defined(CONFIG_LLVM)
+    llvm_alloc_cache();
+#endif
 
     /* Estimate a good size for the number of TBs we can support.  We
        still haven't deducted the prologue from the buffer size here,
        but that's minimal and won't affect the estimate much.  */
     tcg_ctx.code_gen_max_blocks
         = tcg_ctx.code_gen_buffer_size / CODE_GEN_AVG_BLOCK_SIZE;
-    tcg_ctx.tb_ctx.tbs = g_new(TranslationBlock, tcg_ctx.code_gen_max_blocks);
+    tcg_ctx.tb_ctx->tbs = g_new(TranslationBlock, tcg_ctx.code_gen_max_blocks);
 
-    qemu_mutex_init(&tcg_ctx.tb_ctx.tb_lock);
+    qemu_mutex_init(&tcg_ctx.tb_ctx->tb_lock);
 }
 
 /* Must be called before using the QEMU cpus. 'tb_size' is the size
@@ -765,26 +788,35 @@ bool tcg_enabled(void)
    too many translation blocks or too much generated code. */
 static TranslationBlock *tb_alloc(target_ulong pc)
 {
+    TCGContext *s = &tcg_ctx_global; /* not the thread-local tcg_ctx */
     TranslationBlock *tb;
 
-    if (tcg_ctx.tb_ctx.nb_tbs >= tcg_ctx.code_gen_max_blocks) {
+    if (s->tb_ctx->nb_tbs >= s->code_gen_max_blocks) {
         return NULL;
     }
-    tb = &tcg_ctx.tb_ctx.tbs[tcg_ctx.tb_ctx.nb_tbs++];
+#if defined(CONFIG_LLVM)
+    if (llvm_check_cache() == 1) /* NOTE(review): 1 presumably means cache full */
+        return NULL;
+#endif
+
+    tb = &s->tb_ctx->tbs[s->tb_ctx->nb_tbs++];
     tb->pc = pc;
     tb->cflags = 0;
+    /* nb_tbs was just incremented, so nb_tbs - 1 is this tb's index in tbs[]. */
+    optimization_init_tb(tb, s->tb_ctx->nb_tbs - 1);
     return tb;
 }
 
 void tb_free(TranslationBlock *tb)
 {
+    TCGContext *s = &tcg_ctx_global; /* not the thread-local tcg_ctx */
     /* In practice this is mostly used for single use temporary TB
        Ignore the hard cases and just back up if this TB happens to
        be the last one generated.  */
-    if (tcg_ctx.tb_ctx.nb_tbs > 0 &&
-            tb == &tcg_ctx.tb_ctx.tbs[tcg_ctx.tb_ctx.nb_tbs - 1]) {
-        tcg_ctx.code_gen_ptr = tb->tc_ptr;
-        tcg_ctx.tb_ctx.nb_tbs--;
+    if (s->tb_ctx->nb_tbs > 0 &&
+        tb == &s->tb_ctx->tbs[s->tb_ctx->nb_tbs - 1]) {
+        s->code_gen_ptr = tb->tc_ptr; /* rewind code output to this TB's start */
+        s->tb_ctx->nb_tbs--; /* and give back its slot in tbs[] */
     }
 }
 
@@ -832,42 +864,49 @@ static void page_flush_tb(void)
 /* XXX: tb_flush is currently not thread safe */
 void tb_flush(CPUState *cpu)
 {
+    TCGContext *s = &tcg_ctx_global; /* not the thread-local tcg_ctx */
 #if defined(DEBUG_FLUSH)
     printf("qemu: flush code_size=%ld nb_tbs=%d avg_tb_size=%ld\n",
-           (unsigned long)(tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer),
-           tcg_ctx.tb_ctx.nb_tbs, tcg_ctx.tb_ctx.nb_tbs > 0 ?
-           ((unsigned long)(tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer)) /
-           tcg_ctx.tb_ctx.nb_tbs : 0);
+           (unsigned long)(s->code_gen_ptr - s->code_gen_buffer),
+           s->tb_ctx->nb_tbs, s->tb_ctx->nb_tbs > 0 ?
+           ((unsigned long)(s->code_gen_ptr - s->code_gen_buffer)) /
+           s->tb_ctx->nb_tbs : 0);
 #endif
-    if ((unsigned long)(tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer)
-        > tcg_ctx.code_gen_buffer_size) {
+    if ((unsigned long)(s->code_gen_ptr - s->code_gen_buffer)
+        > s->code_gen_buffer_size) {
         cpu_abort(cpu, "Internal error: code buffer overflow\n");
     }
-    tcg_ctx.tb_ctx.nb_tbs = 0;
+#if defined(CONFIG_LLVM)
+    llvm_tb_flush();
+#endif
+    /* The LLVM-side flush above (when built in) runs before nb_tbs is reset. */
+    s->tb_ctx->nb_tbs = 0;
 
     CPU_FOREACH(cpu) {
         memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache));
+        optimization_reset(cpu->env_ptr, 1); /* NOTE(review): presumably clears per-CPU optimization state */
     }
 
-    memset(tcg_ctx.tb_ctx.tb_phys_hash, 0, sizeof(tcg_ctx.tb_ctx.tb_phys_hash));
+    memset(s->tb_ctx->tb_phys_hash, 0, sizeof(s->tb_ctx->tb_phys_hash));
     page_flush_tb();
 
-    tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer;
+    s->code_gen_ptr = s->code_gen_buffer; /* next TB is emitted at the buffer start */
     /* XXX: flush processor icache at this point if cache flush is
        expensive */
-    tcg_ctx.tb_ctx.tb_flush_count++;
+    s->tb_ctx->tb_flush_count++;
 }
 
 #ifdef DEBUG_TB_CHECK
 
 static void tb_invalidate_check(target_ulong address)
 {
+    TCGContext *s = &tcg_ctx_global;
     TranslationBlock *tb;
     int i;
 
     address &= TARGET_PAGE_MASK;
     for (i = 0; i < CODE_GEN_PHYS_HASH_SIZE; i++) {
-        for (tb = tb_ctx.tb_phys_hash[i]; tb != NULL; tb = tb->phys_hash_next) {
+        for (tb = s->tb_ctx->tb_phys_hash[i]; tb != NULL; tb = tb->phys_hash_next) {
             if (!(address + TARGET_PAGE_SIZE <= tb->pc ||
                   address >= tb->pc + tb->size)) {
                 printf("ERROR invalidate: address=" TARGET_FMT_lx
@@ -881,11 +920,12 @@ static void tb_invalidate_check(target_ulong address)
 /* verify that all the pages have correct rights for code */
 static void tb_page_check(void)
 {
+    TCGContext *s = &tcg_ctx_global;
     TranslationBlock *tb;
     int i, flags1, flags2;
 
     for (i = 0; i < CODE_GEN_PHYS_HASH_SIZE; i++) {
-        for (tb = tcg_ctx.tb_ctx.tb_phys_hash[i]; tb != NULL;
+        for (tb = s->tb_ctx->tb_phys_hash[i]; tb != NULL;
                 tb = tb->phys_hash_next) {
             flags1 = page_get_flags(tb->pc);
             flags2 = page_get_flags(tb->pc + tb->size - 1);
@@ -911,6 +951,10 @@ static inline void tb_hash_remove(TranslationBlock **ptb, TranslationBlock *tb)
         }
         ptb = &tb1->phys_hash_next;
     }
+#if defined(CONFIG_LLVM)
+    tb->mode = BLOCK_INVALID;
+    llvm_tb_remove(tb);
+#endif
 }
 
 static inline void tb_page_remove(TranslationBlock **ptb, TranslationBlock *tb)
@@ -969,16 +1013,15 @@ static inline void tb_reset_jump(TranslationBlock *tb, int n)
 /* invalidate one TB */
 void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr)
 {
+    TCGContext *s = &tcg_ctx_global;
     CPUState *cpu;
     PageDesc *p;
     unsigned int h, n1;
-    tb_page_addr_t phys_pc;
     TranslationBlock *tb1, *tb2;
 
     /* remove the TB from the hash list */
-    phys_pc = tb->page_addr[0] + (tb->pc & ~TARGET_PAGE_MASK);
-    h = tb_phys_hash_func(phys_pc);
-    tb_hash_remove(&tcg_ctx.tb_ctx.tb_phys_hash[h], tb);
+    h = tb_phys_hash_func(tb->pc);
+    tb_hash_remove(&s->tb_ctx->tb_phys_hash[h], tb);
 
     /* remove the TB from the page list */
     if (tb->page_addr[0] != page_addr) {
@@ -992,7 +1035,7 @@ void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr)
         invalidate_page_bitmap(p);
     }
 
-    tcg_ctx.tb_ctx.tb_invalidated_flag = 1;
+    s->tb_ctx->tb_invalidated_flag = 1;
 
     /* remove the TB from the hash list */
     h = tb_jmp_cache_hash_func(tb->pc);
@@ -1000,6 +1043,7 @@ void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr)
         if (cpu->tb_jmp_cache[h] == tb) {
             cpu->tb_jmp_cache[h] = NULL;
         }
+        optimization_remove_entry(cpu->env_ptr, tb);
     }
 
     /* suppress this TB from the two jump lists */
@@ -1021,7 +1065,7 @@ void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr)
     }
     tb->jmp_first = (TranslationBlock *)((uintptr_t)tb | 2); /* fail safe */
 
-    tcg_ctx.tb_ctx.tb_phys_invalidate_count++;
+    s->tb_ctx->tb_phys_invalidate_count++;
 }
 
 static void build_page_bitmap(PageDesc *p)
@@ -1058,6 +1102,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
                               target_ulong pc, target_ulong cs_base,
                               int flags, int cflags)
 {
+    TCGContext *s = &tcg_ctx_global;
     CPUArchState *env = cpu->env_ptr;
     TranslationBlock *tb;
     tb_page_addr_t phys_pc, phys_page2;
@@ -1082,22 +1127,22 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
         tb = tb_alloc(pc);
         assert(tb != NULL);
         /* Don't forget to invalidate previous TB info.  */
-        tcg_ctx.tb_ctx.tb_invalidated_flag = 1;
+        s->tb_ctx->tb_invalidated_flag = 1;
     }
 
-    gen_code_buf = tcg_ctx.code_gen_ptr;
-    tb->tc_ptr = gen_code_buf;
+    gen_code_buf = s->code_gen_ptr;
+    tb->tc_ptr = tb->opt_ptr = gen_code_buf;
     tb->cs_base = cs_base;
     tb->flags = flags;
     tb->cflags = cflags;
 
 #ifdef CONFIG_PROFILER
-    tcg_ctx.tb_count1++; /* includes aborted translations because of
+    s->tb_count1++; /* includes aborted translations because of
                        exceptions */
     ti = profile_getclock();
 #endif
 
-    tcg_func_start(&tcg_ctx);
+    tcg_func_start(&tcg_ctx, tb);
 
     gen_intermediate_code(env, tb);
 
@@ -1116,9 +1161,9 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
 #endif
 
 #ifdef CONFIG_PROFILER
-    tcg_ctx.tb_count++;
-    tcg_ctx.interm_time += profile_getclock() - ti;
-    tcg_ctx.code_time -= profile_getclock();
+    s->tb_count++;
+    s->interm_time += profile_getclock() - ti;
+    s->code_time -= profile_getclock();
 #endif
 
     /* ??? Overflow could be handled better here.  In particular, we
@@ -1136,10 +1181,10 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
     }
 
 #ifdef CONFIG_PROFILER
-    tcg_ctx.code_time += profile_getclock();
-    tcg_ctx.code_in_len += tb->size;
-    tcg_ctx.code_out_len += gen_code_size;
-    tcg_ctx.search_out_len += search_size;
+    s->code_time += profile_getclock();
+    s->code_in_len += tb->size;
+    s->code_out_len += gen_code_size;
+    s->search_out_len += search_size;
 #endif
 
 #ifdef DEBUG_DISAS
@@ -1151,7 +1196,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
     }
 #endif
 
-    tcg_ctx.code_gen_ptr = (void *)
+    s->code_gen_ptr = (void *)
         ROUND_UP((uintptr_t)gen_code_buf + gen_code_size + search_size,
                  CODE_GEN_ALIGN);
 
@@ -1247,7 +1292,7 @@ void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end,
                 current_tb = NULL;
                 if (cpu->mem_io_pc) {
                     /* now we have a real cpu fault */
-                    current_tb = tb_find_pc(cpu->mem_io_pc);
+                    current_tb = tb_find_pc(cpu, cpu->mem_io_pc);
                 }
             }
             if (current_tb == tb &&
@@ -1365,7 +1410,7 @@ static void tb_invalidate_phys_page(tb_page_addr_t addr,
     tb = p->first_tb;
 #ifdef TARGET_HAS_PRECISE_SMC
     if (tb && pc != 0) {
-        current_tb = tb_find_pc(pc);
+        current_tb = tb_find_pc(cpu, pc);
     }
     if (cpu != NULL) {
         env = cpu->env_ptr;
@@ -1475,12 +1520,13 @@ static inline void tb_alloc_page(TranslationBlock *tb,
 static void tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
                          tb_page_addr_t phys_page2)
 {
+    TCGContext *s = &tcg_ctx_global;
     unsigned int h;
     TranslationBlock **ptb;
 
     /* add in the physical hash table */
-    h = tb_phys_hash_func(phys_pc);
-    ptb = &tcg_ctx.tb_ctx.tb_phys_hash[h];
+    h = tb_phys_hash_func(tb->pc);
+    ptb = &s->tb_ctx->tb_phys_hash[h];
     tb->phys_hash_next = *ptb;
     *ptb = tb;
 
@@ -1511,25 +1557,31 @@ static void tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
 
 /* find the TB 'tb' such that tb[0].tc_ptr <= tc_ptr <
    tb[1].tc_ptr. Return NULL if not found */
-static TranslationBlock *tb_find_pc(uintptr_t tc_ptr)
+static TranslationBlock *tb_find_pc(CPUState *cpu, uintptr_t tc_ptr)
 {
+    TCGContext *s = &tcg_ctx_global;
     int m_min, m_max, m;
     uintptr_t v;
     TranslationBlock *tb;
 
-    if (tcg_ctx.tb_ctx.nb_tbs <= 0) {
+    if (s->tb_ctx->nb_tbs <= 0) {
         return NULL;
     }
-    if (tc_ptr < (uintptr_t)tcg_ctx.code_gen_buffer ||
-        tc_ptr >= (uintptr_t)tcg_ctx.code_gen_ptr) {
+#if defined(CONFIG_LLVM)
+    tb = llvm_find_pc(cpu, tc_ptr);
+    if (tb)
+        return tb;
+#endif
+    if (tc_ptr < (uintptr_t)s->code_gen_buffer ||
+        tc_ptr >= (uintptr_t)s->code_gen_ptr) {
         return NULL;
     }
     /* binary search (cf Knuth) */
     m_min = 0;
-    m_max = tcg_ctx.tb_ctx.nb_tbs - 1;
+    m_max = s->tb_ctx->nb_tbs - 1;
     while (m_min <= m_max) {
         m = (m_min + m_max) >> 1;
-        tb = &tcg_ctx.tb_ctx.tbs[m];
+        tb = &s->tb_ctx->tbs[m];
         v = (uintptr_t)tb->tc_ptr;
         if (v == tc_ptr) {
             return tb;
@@ -1539,7 +1591,7 @@ static TranslationBlock *tb_find_pc(uintptr_t tc_ptr)
             m_min = m + 1;
         }
     }
-    return &tcg_ctx.tb_ctx.tbs[m_max];
+    return &s->tb_ctx->tbs[m_max];
 }
 
 #if !defined(CONFIG_USER_ONLY)
@@ -1567,7 +1619,7 @@ void tb_check_watchpoint(CPUState *cpu)
 {
     TranslationBlock *tb;
 
-    tb = tb_find_pc(cpu->mem_io_pc);
+    tb = tb_find_pc(cpu, cpu->mem_io_pc);
     if (tb) {
         /* We can use retranslation to find the PC.  */
         cpu_restore_state_from_tb(cpu, tb, cpu->mem_io_pc);
@@ -1599,7 +1651,7 @@ void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr)
     target_ulong pc, cs_base;
     uint64_t flags;
 
-    tb = tb_find_pc(retaddr);
+    tb = tb_find_pc(cpu, retaddr);
     if (!tb) {
         cpu_abort(cpu, "cpu_io_recompile: could not find TB for pc=%p",
                   (void *)retaddr);
@@ -1675,6 +1727,7 @@ void tb_flush_jmp_cache(CPUState *cpu, target_ulong addr)
 
 void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)
 {
+    TCGContext *s = &tcg_ctx_global;
     int i, target_code_size, max_target_code_size;
     int direct_jmp_count, direct_jmp2_count, cross_page;
     TranslationBlock *tb;
@@ -1684,8 +1737,8 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)
     cross_page = 0;
     direct_jmp_count = 0;
     direct_jmp2_count = 0;
-    for (i = 0; i < tcg_ctx.tb_ctx.nb_tbs; i++) {
-        tb = &tcg_ctx.tb_ctx.tbs[i];
+    for (i = 0; i < s->tb_ctx->nb_tbs; i++) {
+        tb = &s->tb_ctx->tbs[i];
         target_code_size += tb->size;
         if (tb->size > max_target_code_size) {
             max_target_code_size = tb->size;
@@ -1703,35 +1756,35 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)
     /* XXX: avoid using doubles ? */
     cpu_fprintf(f, "Translation buffer state:\n");
     cpu_fprintf(f, "gen code size       %td/%zd\n",
-                tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer,
-                tcg_ctx.code_gen_highwater - tcg_ctx.code_gen_buffer);
+                s->code_gen_ptr - s->code_gen_buffer,
+                s->code_gen_highwater - s->code_gen_buffer);
     cpu_fprintf(f, "TB count            %d/%d\n",
-            tcg_ctx.tb_ctx.nb_tbs, tcg_ctx.code_gen_max_blocks);
+            s->tb_ctx->nb_tbs, s->code_gen_max_blocks);
     cpu_fprintf(f, "TB avg target size  %d max=%d bytes\n",
-            tcg_ctx.tb_ctx.nb_tbs ? target_code_size /
-                    tcg_ctx.tb_ctx.nb_tbs : 0,
+            s->tb_ctx->nb_tbs ? target_code_size /
+                    s->tb_ctx->nb_tbs : 0,
             max_target_code_size);
     cpu_fprintf(f, "TB avg host size    %td bytes (expansion ratio: %0.1f)\n",
-            tcg_ctx.tb_ctx.nb_tbs ? (tcg_ctx.code_gen_ptr -
-                                     tcg_ctx.code_gen_buffer) /
-                                     tcg_ctx.tb_ctx.nb_tbs : 0,
-                target_code_size ? (double) (tcg_ctx.code_gen_ptr -
-                                             tcg_ctx.code_gen_buffer) /
+            s->tb_ctx->nb_tbs ? (s->code_gen_ptr -
+                                     s->code_gen_buffer) /
+                                     s->tb_ctx->nb_tbs : 0,
+                target_code_size ? (double) (s->code_gen_ptr -
+                                             s->code_gen_buffer) /
                                              target_code_size : 0);
     cpu_fprintf(f, "cross page TB count %d (%d%%)\n", cross_page,
-            tcg_ctx.tb_ctx.nb_tbs ? (cross_page * 100) /
-                                    tcg_ctx.tb_ctx.nb_tbs : 0);
+            s->tb_ctx->nb_tbs ? (cross_page * 100) /
+                                    s->tb_ctx->nb_tbs : 0);
     cpu_fprintf(f, "direct jump count   %d (%d%%) (2 jumps=%d %d%%)\n",
                 direct_jmp_count,
-                tcg_ctx.tb_ctx.nb_tbs ? (direct_jmp_count * 100) /
-                        tcg_ctx.tb_ctx.nb_tbs : 0,
+                s->tb_ctx->nb_tbs ? (direct_jmp_count * 100) /
+                        s->tb_ctx->nb_tbs : 0,
                 direct_jmp2_count,
-                tcg_ctx.tb_ctx.nb_tbs ? (direct_jmp2_count * 100) /
-                        tcg_ctx.tb_ctx.nb_tbs : 0);
+                s->tb_ctx->nb_tbs ? (direct_jmp2_count * 100) /
+                        s->tb_ctx->nb_tbs : 0);
     cpu_fprintf(f, "\nStatistics:\n");
-    cpu_fprintf(f, "TB flush count      %d\n", tcg_ctx.tb_ctx.tb_flush_count);
+    cpu_fprintf(f, "TB flush count      %d\n", s->tb_ctx->tb_flush_count);
     cpu_fprintf(f, "TB invalidate count %d\n",
-            tcg_ctx.tb_ctx.tb_phys_invalidate_count);
+            s->tb_ctx->tb_phys_invalidate_count);
     cpu_fprintf(f, "TLB flush count     %d\n", tlb_flush_count);
     tcg_dump_info(f, cpu_fprintf);
 }
diff --git a/user-exec.c b/user-exec.c
index 8ad89a4..dbf04be 100644
--- a/user-exec.c
+++ b/user-exec.c
@@ -58,7 +58,7 @@ static void exception_action(CPUState *cpu)
 void cpu_resume_from_signal(CPUState *cpu, void *puc)
 {
 #ifdef __linux__
-    struct ucontext *uc = puc;
+    ucontext_t *uc = puc;
 #elif defined(__OpenBSD__)
     struct sigcontext *uc = puc;
 #endif
@@ -172,7 +172,7 @@ int cpu_signal_handler(int host_signum, void *pinfo,
 #elif defined(__OpenBSD__)
     struct sigcontext *uc = puc;
 #else
-    struct ucontext *uc = puc;
+    ucontext_t *uc = puc;
 #endif
     unsigned long pc;
     int trapno;
@@ -227,7 +227,7 @@ int cpu_signal_handler(int host_signum, void *pinfo,
 #elif defined(__OpenBSD__)
     struct sigcontext *uc = puc;
 #else
-    struct ucontext *uc = puc;
+    ucontext_t *uc = puc;
 #endif
 
     pc = PC_sig(uc);
@@ -332,7 +332,7 @@ int cpu_signal_handler(int host_signum, void *pinfo,
 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
     ucontext_t *uc = puc;
 #else
-    struct ucontext *uc = puc;
+    ucontext_t *uc = puc;
 #endif
     unsigned long pc;
     int is_write;
@@ -359,7 +359,7 @@ int cpu_signal_handler(int host_signum, void *pinfo,
                            void *puc)
 {
     siginfo_t *info = pinfo;
-    struct ucontext *uc = puc;
+    ucontext_t *uc = puc;
     uint32_t *pc = uc->uc_mcontext.sc_pc;
     uint32_t insn = *pc;
     int is_write = 0;
@@ -457,7 +457,7 @@ int cpu_signal_handler(int host_signum, void *pinfo,
 #if defined(__NetBSD__)
     ucontext_t *uc = puc;
 #else
-    struct ucontext *uc = puc;
+    ucontext_t *uc = puc;
 #endif
     unsigned long pc;
     int is_write;
@@ -484,7 +484,7 @@ int cpu_signal_handler(int host_signum, void *pinfo,
 int cpu_signal_handler(int host_signum, void *pinfo, void *puc)
 {
     siginfo_t *info = pinfo;
-    struct ucontext *uc = puc;
+    ucontext_t *uc = puc;
     uintptr_t pc = uc->uc_mcontext.pc;
     uint32_t insn = *(uint32_t *)pc;
     bool is_write;
@@ -513,7 +513,7 @@ int cpu_signal_handler(int host_signum, void *pinfo,
                        void *puc)
 {
     siginfo_t *info = pinfo;
-    struct ucontext *uc = puc;
+    ucontext_t *uc = puc;
     unsigned long pc;
     int is_write;
 
@@ -535,7 +535,7 @@ int cpu_signal_handler(int host_signum, void *pinfo,
 int cpu_signal_handler(int host_signum, void *pinfo, void *puc)
 {
     siginfo_t *info = pinfo;
-    struct ucontext *uc = puc;
+    ucontext_t *uc = puc;
     unsigned long ip;
     int is_write = 0;
 
@@ -566,7 +566,7 @@ int cpu_signal_handler(int host_signum, void *pinfo,
                        void *puc)
 {
     siginfo_t *info = pinfo;
-    struct ucontext *uc = puc;
+    ucontext_t *uc = puc;
     unsigned long pc;
     uint16_t *pinsn;
     int is_write = 0;
@@ -619,7 +619,7 @@ int cpu_signal_handler(int host_signum, void *pinfo,
                        void *puc)
 {
     siginfo_t *info = pinfo;
-    struct ucontext *uc = puc;
+    ucontext_t *uc = puc;
     greg_t pc = uc->uc_mcontext.pc;
     int is_write;
 
@@ -635,7 +635,7 @@ int cpu_signal_handler(int host_signum, void *pinfo,
                        void *puc)
 {
     siginfo_t *info = pinfo;
-    struct ucontext *uc = puc;
+    ucontext_t *uc = puc;
     unsigned long pc = uc->uc_mcontext.sc_iaoq[0];
     uint32_t insn = *(uint32_t *)pc;
     int is_write = 0;
diff --git a/util/memfd.c b/util/memfd.c
index 7c40691..587ef5a 100644
--- a/util/memfd.c
+++ b/util/memfd.c
@@ -40,7 +40,7 @@
 #include <sys/syscall.h>
 #include <asm/unistd.h>
 
-static int memfd_create(const char *name, unsigned int flags)
+static int qemu_memfd_create(const char *name, unsigned int flags)
 {
 #ifdef __NR_memfd_create
     return syscall(__NR_memfd_create, name, flags);
@@ -74,12 +74,12 @@ void *qemu_memfd_alloc(const char *name, size_t size, unsigned int seals,
 
 #ifdef CONFIG_LINUX
     if (seals) {
-        mfd = memfd_create(name, MFD_ALLOW_SEALING | MFD_CLOEXEC);
+        mfd = qemu_memfd_create(name, MFD_ALLOW_SEALING | MFD_CLOEXEC);
     }
 
     if (mfd == -1) {
         /* some systems have memfd without sealing */
-        mfd = memfd_create(name, MFD_CLOEXEC);
+        mfd = qemu_memfd_create(name, MFD_CLOEXEC);
         seals = 0;
     }
 #endif
diff --git a/vl.c b/vl.c
index 873d265..0fe3be8 100644
--- a/vl.c
+++ b/vl.c
@@ -4705,6 +4705,7 @@ int main(int argc, char **argv, char **envp)
 #ifdef CONFIG_TPM
     tpm_cleanup();
 #endif
+    qemu_end_cpu_loop();
 
     return 0;
 }