path: root/src/llvm/atomic
Diffstat (limited to 'src/llvm/atomic')
-rw-r--r--  src/llvm/atomic/atomic-arm.c       158
-rw-r--r--  src/llvm/atomic/atomic-helper.h     74
-rw-r--r--  src/llvm/atomic/atomic-x86.c       504
-rw-r--r--  src/llvm/atomic/coremu-atomic.h    412
-rw-r--r--  src/llvm/atomic/coremu-template.h  101
5 files changed, 1249 insertions, 0 deletions
diff --git a/src/llvm/atomic/atomic-arm.c b/src/llvm/atomic/atomic-arm.c
new file mode 100644
index 0000000..4176caa
--- /dev/null
+++ b/src/llvm/atomic/atomic-arm.c
@@ -0,0 +1,158 @@
+/*
+ * Copyright (C) 2010 Parallel Processing Institute (PPI), Fudan Univ.
+ * <http://ppi.fudan.edu.cn/system_research_group>
+ *
+ * Authors:
+ * Zhaoguo Wang <zgwang@fudan.edu.cn>
+ * Yufei Chen <chenyufei@fudan.edu.cn>
+ * Ran Liu <naruilone@gmail.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* We include this file in op_helper.c */
+
+#include <stdlib.h>
+#include <pthread.h>
+#include "coremu-atomic.h"
+
+__thread uint64_t cm_exclusive_val;
+__thread uint32_t cm_exclusive_addr = -1;
+
+#define GEN_LOAD_EXCLUSIVE(type, TYPE) \
+void HELPER(load_exclusive##type)(CPUArchState *env, uint32_t reg, \
+ uint32_t addr) \
+{ \
+ unsigned long q_addr = 0; \
+ DATA_##type val = 0; \
+ \
+ cm_exclusive_addr = addr; \
+ CM_GET_QEMU_ADDR(env, q_addr, addr); \
+ val = *(DATA_##type *)q_addr; \
+ cm_exclusive_val = val; \
+ env->regs[reg] = val; \
+}
+
+GEN_LOAD_EXCLUSIVE(b, B);
+GEN_LOAD_EXCLUSIVE(w, W);
+GEN_LOAD_EXCLUSIVE(l, L);
+//GEN_LOAD_EXCLUSIVE(q, Q);
+
+#define GEN_STORE_EXCLUSIVE(type, TYPE) \
+void HELPER(store_exclusive##type)(CPUArchState *env, uint32_t res, \
+ uint32_t reg, uint32_t addr) \
+{ \
+ unsigned long q_addr = 0; \
+ DATA_##type val = 0; \
+ DATA_##type r = 0; \
+ \
+ if(addr != cm_exclusive_addr) \
+ goto fail; \
+ \
+ CM_GET_QEMU_ADDR(env, q_addr, addr); \
+ val = (DATA_##type)env->regs[reg]; \
+ \
+ r = atomic_compare_exchange##type((DATA_##type *)q_addr, \
+ (DATA_##type)cm_exclusive_val, val); \
+ \
+ if(r == (DATA_##type)cm_exclusive_val) { \
+ env->regs[res] = 0; \
+ goto done; \
+ } else { \
+ goto fail; \
+ } \
+ \
+fail: \
+ env->regs[res] = 1; \
+ \
+done: \
+ cm_exclusive_addr = -1; \
+ return; \
+}
+
+GEN_STORE_EXCLUSIVE(b, B);
+GEN_STORE_EXCLUSIVE(w, W);
+GEN_STORE_EXCLUSIVE(l, L);
+//GEN_STORE_EXCLUSIVE(q, Q);
+
+void HELPER(load_exclusiveq)(CPUArchState *env, uint32_t reg, uint32_t addr)
+{
+ unsigned long q_addr = 0;
+ uint64_t val = 0;
+
+ cm_exclusive_addr = addr;
+ CM_GET_QEMU_ADDR(env, q_addr, addr);
+ val = *(uint64_t *)q_addr;
+ cm_exclusive_val = val;
+ env->regs[reg] = (uint32_t)val;
+ env->regs[reg + 1] = (uint32_t)(val>>32);
+}
+
+void HELPER(store_exclusiveq)(CPUArchState *env, uint32_t res, uint32_t reg, uint32_t addr)
+{
+ unsigned long q_addr = 0;
+ uint64_t val = 0;
+ uint64_t r = 0;
+
+ if(addr != cm_exclusive_addr)
+ goto fail;
+
+ CM_GET_QEMU_ADDR(env, q_addr, addr);
+ val = (uint32_t)env->regs[reg];
+ val |= ((uint64_t)env->regs[reg + 1]) << 32;
+
+ r = atomic_compare_exchangeq((uint64_t *)q_addr,
+ (uint64_t)cm_exclusive_val, val);
+
+ if(r == (uint64_t)cm_exclusive_val) {
+ env->regs[res] = 0;
+ goto done;
+ } else {
+ goto fail;
+ }
+
+fail:
+ env->regs[res] = 1;
+
+done:
+ cm_exclusive_addr = -1;
+ return;
+}
+
+void HELPER(clear_exclusive)(CPUArchState *env)
+{
+ cm_exclusive_addr = -1;
+}
+
+void HELPER(swpb)(CPUArchState *env, uint32_t dst, uint32_t src, uint32_t addr)
+{
+ uint8_t old, val;
+ unsigned long q_addr;
+ CM_GET_QEMU_ADDR(env, q_addr, env->regs[addr]);
+ val = (uint8_t)env->regs[src];
+ old = atomic_exchangeb((uint8_t *)q_addr, (uint8_t)val);
+ env->regs[dst] = old;
+ //printf("SWPB\n");
+}
+
+void HELPER(swp)(CPUArchState *env, uint32_t dst, uint32_t src, uint32_t addr)
+{
+ uint32_t old, val;
+ unsigned long q_addr;
+ CM_GET_QEMU_ADDR(env, q_addr, env->regs[addr]);
+ val = env->regs[src];
+ old = atomic_exchangel((uint32_t *)q_addr, val);
+ env->regs[dst] = old;
+ //printf("SWP\n");
+}
diff --git a/src/llvm/atomic/atomic-helper.h b/src/llvm/atomic/atomic-helper.h
new file mode 100644
index 0000000..9e3cedf
--- /dev/null
+++ b/src/llvm/atomic/atomic-helper.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (C) 2010 Parallel Processing Institute (PPI), Fudan Univ.
+ * <http://ppi.fudan.edu.cn/system_research_group>
+ *
+ * Authors:
+ * Zhaoguo Wang <zgwang@fudan.edu.cn>
+ * Yufei Chen <chenyufei@fudan.edu.cn>
+ * Ran Liu <naruilone@gmail.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "config-target.h"
+
+#ifdef CONFIG_COREMU
+
+#if defined(TARGET_I386)
+#define __GEN_HEADER(type) \
+DEF_HELPER_3(atomic_inc##type, void, env, tl, int) \
+DEF_HELPER_4(xchg##type, void, env, tl, int, int) \
+DEF_HELPER_4(atomic_op##type, void, env, tl, tl, int) \
+DEF_HELPER_4(atomic_xadd##type, void, env, tl, int, int) \
+DEF_HELPER_4(atomic_cmpxchg##type, void, env, tl, int, int) \
+DEF_HELPER_2(atomic_not##type, void, env, tl) \
+DEF_HELPER_2(atomic_neg##type, void, env, tl)
+
+__GEN_HEADER(b)
+__GEN_HEADER(w)
+__GEN_HEADER(l)
+#ifdef TARGET_X86_64
+__GEN_HEADER(q)
+#endif
+
+DEF_HELPER_2(atomic_cmpxchg8b, void, env, tl)
+DEF_HELPER_2(atomic_cmpxchg16b, void, env, tl)
+
+DEF_HELPER_4(atomic_bts, void, env, tl, tl, int)
+DEF_HELPER_4(atomic_btr, void, env, tl, tl, int)
+DEF_HELPER_4(atomic_btc, void, env, tl, tl, int)
+
+/* fence */
+DEF_HELPER_1(fence, void, env)
+
+#elif defined(TARGET_ARM)
+#define __GEN_HEADER(type) \
+DEF_HELPER_3(load_exclusive##type, void, env, i32, i32) \
+DEF_HELPER_4(store_exclusive##type, void, env, i32, i32, i32)
+
+__GEN_HEADER(b)
+__GEN_HEADER(w)
+__GEN_HEADER(l)
+__GEN_HEADER(q)
+
+DEF_HELPER_1(clear_exclusive, void, env)
+
+DEF_HELPER_4(swpb, void, env, i32, i32, i32)
+DEF_HELPER_4(swp, void, env, i32, i32, i32)
+#else
+#error "unsupported processor type"
+#endif
+
+#endif
+
diff --git a/src/llvm/atomic/atomic-x86.c b/src/llvm/atomic/atomic-x86.c
new file mode 100644
index 0000000..dc0baf0
--- /dev/null
+++ b/src/llvm/atomic/atomic-x86.c
@@ -0,0 +1,504 @@
+/*
+ * Copyright (C) 2010 Parallel Processing Institute (PPI), Fudan Univ.
+ * <http://ppi.fudan.edu.cn/system_research_group>
+ *
+ * Authors:
+ * Zhaoguo Wang <zgwang@fudan.edu.cn>
+ * Yufei Chen <chenyufei@fudan.edu.cn>
+ * Ran Liu <naruilone@gmail.com>
+ * Xi Wu <wuxi@fudan.edu.cn>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* We include this file in op_helper.c */
+
+#include <stdlib.h>
+#include <pthread.h>
+#include <assert.h>
+#include "coremu-atomic.h"
+
+#define EAX (env->regs[R_EAX])
+#define ECX (env->regs[R_ECX])
+#define EDX (env->regs[R_EDX])
+#define EBX (env->regs[R_EBX])
+
+/* These definitions are copied from translate.c */
+#if defined(WORDS_BIGENDIAN)
+#define REG_B_OFFSET (sizeof(target_ulong) - 1)
+#define REG_H_OFFSET (sizeof(target_ulong) - 2)
+#define REG_W_OFFSET (sizeof(target_ulong) - 2)
+#define REG_L_OFFSET (sizeof(target_ulong) - 4)
+#define REG_LH_OFFSET (sizeof(target_ulong) - 8)
+#else
+#define REG_B_OFFSET 0
+#define REG_H_OFFSET 1
+#define REG_W_OFFSET 0
+#define REG_L_OFFSET 0
+#define REG_LH_OFFSET 4
+#endif
+
+#ifdef TARGET_X86_64
+#define X86_64_DEF(...) __VA_ARGS__
+#else
+#define X86_64_DEF(...)
+#endif
+
+#define REG_LOW_MASK (~(uint64_t)0x0>>32)
+
+/* gen_op instructions */
+/* i386 arith/logic operations */
+enum {
+ OP_ADDL,
+ OP_ORL,
+ OP_ADCL,
+ OP_SBBL,
+ OP_ANDL,
+ OP_SUBL,
+ OP_XORL,
+ OP_CMPL,
+};
+
+/* Read a guest register value; the OT_BYTE case handles the legacy
+ * high-byte registers (AH/CH/DH/BH). */
+static target_ulong cm_get_reg_val(CPUX86State *env, int ot, int hregs, int reg)
+{
+ target_ulong val, offset;
+ CPUX86State *env1 = env;
+
+ switch(ot) {
+ case 0: /* OT_BYTE */
+ if (reg < 4 X86_64_DEF( || reg >= 8 || hregs)) {
+ goto std_case;
+ } else {
+ offset = offsetof(CPUX86State, regs[reg - 4]) + REG_H_OFFSET;
+ val = *(((uint8_t *)env1) + offset);
+ }
+ break;
+ default:
+ std_case:
+ val = env1->regs[reg];
+ break;
+ }
+
+ return val;
+}
+
+static void cm_set_reg_val(CPUX86State *env, int ot, int hregs, int reg, target_ulong val)
+{
+ target_ulong offset;
+
+ CPUX86State *env1 = env;
+
+ switch(ot) {
+ case 0: /* OT_BYTE */
+ if (reg < 4 X86_64_DEF (|| reg >= 8 || hregs)) {
+ offset = offsetof(CPUX86State, regs[reg]) + REG_B_OFFSET;
+ *(((uint8_t *) env1) + offset) = (uint8_t)val;
+ } else {
+ offset = offsetof(CPUX86State, regs[reg - 4]) + REG_H_OFFSET;
+ *(((uint8_t *) env1) + offset) = (uint8_t)val;
+ }
+ break;
+ case 1: /* OT_WORD */
+ offset = offsetof(CPUX86State, regs[reg]) + REG_W_OFFSET;
+ *((uint16_t *)((uint8_t *)env1 + offset)) = (uint16_t)val;
+ break;
+ case 2: /* OT_LONG */
+ env1->regs[reg] = REG_LOW_MASK & val;
+ break;
+ default:
+ case 3: /* OT_QUAD */
+ env1->regs[reg] = val;
+ break;
+ }
+}
+
+#define LD_b ldub_p
+#define LD_w lduw_p
+#define LD_l ldl_p
+#define LD_q ldq_p
+
+/* Lightweight transactional memory. */
+#define TX(vaddr, type, value, command) \
+ unsigned long __q_addr; \
+ DATA_##type __oldv; \
+ DATA_##type value; \
+ \
+ CM_GET_QEMU_ADDR(env, __q_addr, vaddr); \
+ do { \
+ __oldv = value = LD_##type((DATA_##type *)__q_addr); \
+ {command;}; \
+ mb(); \
+ } while (__oldv != (atomic_compare_exchange##type( \
+ (DATA_##type *)__q_addr, __oldv, value)))
+
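/* Illustrative usage sketch (hypothetical helper, not part of this patch):
 * the TX macro hides the guest-to-host translation and the CAS retry loop,
 * so an atomic read-modify-write helper only has to supply the "command"
 * body that transforms the loaded copy. */
static void example_atomic_or_low_bit(CPUX86State *env, target_ulong a0)
{
    TX(a0, l, value, {
        value |= 1;         /* modify the privately loaded copy */
    });                     /* retried until the compare-exchange succeeds */
}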
+/* Atomically emulate INC instruction using CAS1 and memory transaction. */
+
+#define GEN_ATOMIC_INC(type, TYPE) \
+void helper_atomic_inc##type(CPUX86State *env, target_ulong a0, int c) \
+{ \
+ int eflags_c, eflags; \
+ int cc_op; \
+ \
+ /* compute the previous instruction c flags */ \
+ eflags_c = helper_cc_compute_c(CC_DST, CC_SRC, CC_SRC2, CC_OP); \
+ \
+ TX(a0, type, value, { \
+ if (c > 0) { \
+ value++; \
+ cc_op = CC_OP_INC##TYPE; \
+ } else { \
+ value--; \
+ cc_op = CC_OP_DEC##TYPE; \
+ } \
+ }); \
+ \
+ CC_SRC = eflags_c; \
+ CC_DST = value; \
+ \
+ eflags = helper_cc_compute_all(CC_DST, CC_SRC, CC_SRC2, cc_op); \
+ CC_SRC = eflags; \
+} \
+
+GEN_ATOMIC_INC(b, B);
+GEN_ATOMIC_INC(w, W);
+GEN_ATOMIC_INC(l, L);
+#ifdef TARGET_X86_64
+GEN_ATOMIC_INC(q, Q);
+#endif
+
+#define OT_b 0
+#define OT_w 1
+#define OT_l 2
+#define OT_q 3
+
+#define GEN_ATOMIC_XCHG(type) \
+void helper_xchg##type(CPUX86State *env, target_ulong a0, int reg, \
+ int hreg) \
+{ \
+ DATA_##type val, out; \
+ unsigned long q_addr; \
+ \
+ CM_GET_QEMU_ADDR(env, q_addr, a0); \
+ val = (DATA_##type)cm_get_reg_val(env, OT_##type, hreg, reg); \
+ out = atomic_exchange##type((DATA_##type *)q_addr, val); \
+ mb(); \
+ \
+ cm_set_reg_val(env, OT_##type, hreg, reg, out); \
+}
+
+GEN_ATOMIC_XCHG(b);
+GEN_ATOMIC_XCHG(w);
+GEN_ATOMIC_XCHG(l);
+#ifdef TARGET_X86_64
+GEN_ATOMIC_XCHG(q);
+#endif
+
+#define GEN_ATOMIC_OP(type, TYPE) \
+void helper_atomic_op##type(CPUX86State *env, target_ulong a0, \
+ target_ulong t1, int op) \
+{ \
+ DATA_##type operand; \
+ int eflags_c, eflags; \
+ int cc_op; \
+ \
+ /* compute the previous instruction c flags */ \
+ eflags_c = helper_cc_compute_c(CC_DST, CC_SRC, CC_SRC2, CC_OP); \
+ operand = (DATA_##type)t1; \
+ \
+ TX(a0, type, value, { \
+ switch(op) { \
+ case OP_ADCL: \
+ value += operand + eflags_c; \
+ cc_op = CC_OP_ADD##TYPE + (eflags_c << 2); \
+ CC_SRC = operand; \
+ break; \
+ case OP_SBBL: \
+ value = value - operand - eflags_c; \
+ cc_op = CC_OP_SUB##TYPE + (eflags_c << 2); \
+ CC_SRC = operand; \
+ break; \
+ case OP_ADDL: \
+ value += operand; \
+ cc_op = CC_OP_ADD##TYPE; \
+ CC_SRC = operand; \
+ break; \
+ case OP_SUBL: \
+ value -= operand; \
+ cc_op = CC_OP_SUB##TYPE; \
+ CC_SRC = operand; \
+ break; \
+ default: \
+ case OP_ANDL: \
+ value &= operand; \
+ cc_op = CC_OP_LOGIC##TYPE; \
+ break; \
+ case OP_ORL: \
+ value |= operand; \
+ cc_op = CC_OP_LOGIC##TYPE; \
+ break; \
+ case OP_XORL: \
+ value ^= operand; \
+ cc_op = CC_OP_LOGIC##TYPE; \
+ break; \
+ case OP_CMPL: \
+ abort(); \
+ break; \
+ } \
+ }); \
+ CC_DST = value; \
+ /* successful transaction, compute the eflags */ \
+ eflags = helper_cc_compute_all(CC_DST, CC_SRC, CC_SRC2, cc_op); \
+ CC_SRC = eflags; \
+}
+
+GEN_ATOMIC_OP(b, B);
+GEN_ATOMIC_OP(w, W);
+GEN_ATOMIC_OP(l, L);
+#ifdef TARGET_X86_64
+GEN_ATOMIC_OP(q, Q);
+#endif
+
+/* xadd */
+#define GEN_ATOMIC_XADD(type, TYPE) \
+void helper_atomic_xadd##type(CPUX86State *env, target_ulong a0, \
+ int reg, int hreg) \
+{ \
+ DATA_##type operand, oldv; \
+ int eflags; \
+ \
+ operand = (DATA_##type)cm_get_reg_val( \
+ env, OT_##type, hreg, reg); \
+ \
+ TX(a0, type, newv, { \
+ oldv = newv; \
+ newv += operand; \
+ }); \
+ \
+ /* transaction succeeded */ \
+ /* xchg the register and compute the eflags */ \
+ cm_set_reg_val(env, OT_##type, hreg, reg, oldv); \
+ CC_SRC = oldv; \
+ CC_DST = newv; \
+ \
+ eflags = helper_cc_compute_all(CC_DST, CC_SRC, CC_SRC2, \
+ CC_OP_ADD##TYPE); \
+ CC_SRC = eflags; \
+}
+
+GEN_ATOMIC_XADD(b, B);
+GEN_ATOMIC_XADD(w, W);
+GEN_ATOMIC_XADD(l, L);
+#ifdef TARGET_X86_64
+GEN_ATOMIC_XADD(q, Q);
+#endif
+
+/* cmpxchg */
+#define GEN_ATOMIC_CMPXCHG(type, TYPE) \
+void helper_atomic_cmpxchg##type(CPUX86State *env, target_ulong a0, \
+ int reg, int hreg) \
+{ \
+ DATA_##type reg_v, eax_v, res; \
+ int eflags; \
+ unsigned long q_addr; \
+ \
+ CM_GET_QEMU_ADDR(env, q_addr, a0); \
+ reg_v = (DATA_##type)cm_get_reg_val(env, OT_##type, hreg, reg); \
+ eax_v = (DATA_##type)cm_get_reg_val(env, OT_##type, 0, R_EAX); \
+ \
+ res = atomic_compare_exchange##type( \
+ (DATA_##type *)q_addr, eax_v, reg_v); \
+ mb(); \
+ \
+ if (res != eax_v) \
+ cm_set_reg_val(env, OT_##type, 0, R_EAX, res); \
+ \
+ CC_SRC = res; \
+ CC_DST = eax_v - res; \
+ \
+ eflags = helper_cc_compute_all(CC_DST, CC_SRC, CC_SRC2, \
+ CC_OP_SUB##TYPE); \
+ CC_SRC = eflags; \
+}
+
+GEN_ATOMIC_CMPXCHG(b, B);
+GEN_ATOMIC_CMPXCHG(w, W);
+GEN_ATOMIC_CMPXCHG(l, L);
+#ifdef TARGET_X86_64
+GEN_ATOMIC_CMPXCHG(q, Q);
+#endif
+
+#if defined(_LP64)
+/* cmpxchg8b / cmpxchg16b */
+void helper_atomic_cmpxchg8b(CPUX86State *env, target_ulong a0)
+{
+ uint64_t edx_eax, ecx_ebx, res;
+ int eflags;
+ unsigned long q_addr;
+
+ eflags = helper_cc_compute_all(CC_DST, CC_SRC, CC_SRC2, CC_OP);
+ CM_GET_QEMU_ADDR(env, q_addr, a0);
+
+ edx_eax = (((uint64_t)EDX << 32) | (uint32_t)EAX);
+ ecx_ebx = (((uint64_t)ECX << 32) | (uint32_t)EBX);
+
+ res = atomic_compare_exchangeq((uint64_t *)q_addr, edx_eax, ecx_ebx);
+ mb();
+
+ if (res == edx_eax) {
+ eflags |= CC_Z;
+ } else {
+ EDX = (uint32_t)(res >> 32);
+ EAX = (uint32_t)res;
+ eflags &= ~CC_Z;
+ }
+
+ CC_SRC = eflags;
+}
+#else
+void helper_atomic_cmpxchg8b(CPUX86State *env, target_ulong a0)
+{
+ assert("helper_atomic_cmpxchg8b: not supported.\n");
+ exit(0);
+}
+#endif
+
+void helper_atomic_cmpxchg16b(CPUX86State *env, target_ulong a0)
+{
+ uint8_t res;
+ int eflags;
+ unsigned long q_addr;
+
+ eflags = helper_cc_compute_all(CC_DST, CC_SRC, CC_SRC2, CC_OP);
+ CM_GET_QEMU_ADDR(env, q_addr, a0);
+
+ uint64_t old_rax = *(uint64_t *)q_addr;
+ uint64_t old_rdx = *(uint64_t *)(q_addr + 8);
+ res = atomic_compare_exchange16b((uint64_t *)q_addr, EAX, EDX, EBX, ECX);
+ mb();
+
+ if (res) {
+ eflags |= CC_Z; /* swap success */
+ } else {
+ EDX = old_rdx;
+ EAX = old_rax;
+ eflags &= ~CC_Z; /* read the old value ! */
+ }
+
+ CC_SRC = eflags;
+}
+
+/* not */
+#define GEN_ATOMIC_NOT(type) \
+void helper_atomic_not##type(CPUX86State *env, \
+ target_ulong a0) \
+{ \
+ TX(a0, type, value, { \
+ value = ~value; \
+ }); \
+}
+
+GEN_ATOMIC_NOT(b);
+GEN_ATOMIC_NOT(w);
+GEN_ATOMIC_NOT(l);
+#ifdef TARGET_X86_64
+GEN_ATOMIC_NOT(q);
+#endif
+
+/* neg */
+#define GEN_ATOMIC_NEG(type, TYPE) \
+void helper_atomic_neg##type(CPUX86State *env, \
+ target_ulong a0) \
+{ \
+ int eflags; \
+ \
+ TX(a0, type, value, { \
+ value = -value; \
+ }); \
+ \
+ /* We should use the old value to compute CC */ \
+ CC_SRC = CC_DST = -value; \
+ \
+ eflags = helper_cc_compute_all(CC_DST, CC_SRC, CC_SRC2, \
+ CC_OP_SUB##TYPE); \
+ CC_SRC = eflags; \
+} \
+
+GEN_ATOMIC_NEG(b, B);
+GEN_ATOMIC_NEG(w, W);
+GEN_ATOMIC_NEG(l, L);
+#ifdef TARGET_X86_64
+GEN_ATOMIC_NEG(q, Q);
+#endif
+
+/* These are only used by the BTX instructions (BTS/BTR/BTC), which take an
+ * additional bit offset. Note that when a register bit offset is used, its
+ * value can be larger than operand size - 1 (the operand size can be
+ * 16/32/64); refer to Intel manual 2A, page 3-11. */
+#define TX2(vaddr, type, value, offset, command) \
+ unsigned long __q_addr; \
+ DATA_##type __oldv; \
+ DATA_##type value; \
+ \
+ CM_GET_QEMU_ADDR(env, __q_addr, vaddr); \
+ __q_addr += offset >> 3; \
+ do { \
+ __oldv = value = LD_##type((DATA_##type *)__q_addr); \
+ {command;}; \
+ mb(); \
+ } while (__oldv != (atomic_compare_exchange##type( \
+ (DATA_##type *)__q_addr, __oldv, value)))
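/* Illustrative sketch (hypothetical helper, not part of this patch): TX2 as
 * used by the BTX helpers below. The bit offset may exceed the operand
 * width, so offset >> 3 selects the byte to operate on and offset & 0x7 the
 * bit within that byte; e.g. a bit offset of 37 touches bit 5 of the byte
 * at a0 + 4. */
static void example_atomic_set_bit(CPUX86State *env, target_ulong a0,
                                   target_ulong offset)
{
    TX2(a0, b, value, offset, {
        value |= 1 << (offset & 0x7);
    });
}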
+
+#define GEN_ATOMIC_BTX(ins, command) \
+void helper_atomic_##ins(CPUX86State *env, target_ulong a0, \
+ target_ulong offset, int ot) \
+{ \
+ uint8_t old_byte; \
+ int eflags; \
+ \
+ TX2(a0, b, value, offset, { \
+ old_byte = value; \
+ {command;}; \
+ }); \
+ \
+ CC_SRC = (old_byte >> (offset & 0x7)); \
+ CC_DST = 0; \
+ eflags = helper_cc_compute_all(CC_DST, CC_SRC, CC_SRC2, \
+ CC_OP_SARB + ot); \
+ CC_SRC = eflags; \
+}
+
+/* bts */
+GEN_ATOMIC_BTX(bts, {
+ value |= (1 << (offset & 0x7));
+});
+/* btr */
+GEN_ATOMIC_BTX(btr, {
+ value &= ~(1 << (offset & 0x7));
+});
+/* btc */
+GEN_ATOMIC_BTX(btc, {
+ value ^= (1 << (offset & 0x7));
+});
+
+/* fence */
+void helper_fence(CPUX86State *env)
+{
+ mb();
+}
+
+#undef EAX
+#undef ECX
+#undef EDX
+#undef EBX
diff --git a/src/llvm/atomic/coremu-atomic.h b/src/llvm/atomic/coremu-atomic.h
new file mode 100644
index 0000000..998232b
--- /dev/null
+++ b/src/llvm/atomic/coremu-atomic.h
@@ -0,0 +1,412 @@
+/*
+ * COREMU Parallel Emulator Framework
+ *
+ * Atomic support for COREMU system.
+ * XXX: Currently only the x86-64 architecture is fully supported.
+ *
+ * Copyright (C) 2010 Parallel Processing Institute (PPI), Fudan Univ.
+ * <http://ppi.fudan.edu.cn/system_research_group>
+ *
+ * Authors:
+ * Zhaoguo Wang <zgwang@fudan.edu.cn>
+ * Yufei Chen <chenyufei@fudan.edu.cn>
+ * Ran Liu <naruilone@gmail.com>
+ * Xi Wu <wuxi@fudan.edu.cn>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _COREMU_ATOMIC_H
+#define _COREMU_ATOMIC_H
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <assert.h>
+#include "config-target.h"
+#include "hqemu.h"
+
+/* Given a guest virtual address, get the corresponding host (QEMU) address.
+ * This macro resembles the ldxxx helpers in softmmu_template.h.
+ * NOTE: This must be inlined, since GETPC needs the return address of the
+ * helper. An always_inline function would also work; we use a macro here to
+ * be more explicit. */
+#if defined(CONFIG_USER_ONLY)
+#define CM_GET_QEMU_ADDR(__env1, q_addr, v_addr) \
+do { \
+ q_addr = v_addr + GUEST_BASE; \
+} while (0)
+
+#else
+#define CM_GET_QEMU_ADDR(__env1, q_addr, v_addr) \
+do { \
+ CPUState *cpu = ENV_GET_CPU(__env1); \
+ int __mmu_idx, __index; \
+ uintptr_t __retaddr; \
+ __index = (v_addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); \
+ /* get the CPL, hence determine the MMU mode */ \
+ __mmu_idx = cpu_mmu_index(__env1, false); \
+ /* This macro is used when implementing atomic instructions, which */ \
+ /* modify memory, so we check addr_write. */ \
+ if (unlikely(__env1->tlb_table[__mmu_idx][__index].addr_write \
+ != ((v_addr & TARGET_PAGE_MASK) | tlb_version(__env1)))) { \
+ __retaddr = GETPC(); \
+ tlb_fill(cpu, v_addr, 1, __mmu_idx, __retaddr); \
+ } \
+ q_addr = v_addr + __env1->tlb_table[__mmu_idx][__index].addend; \
+} while(0)
+#endif
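/* Illustrative sketch (hypothetical helper, not part of this patch): typical
 * use of CM_GET_QEMU_ADDR inside an atomic helper. The macro fills q_addr
 * with the host address backing guest virtual address vaddr (filling the
 * TLB entry first if necessary), after which the guest memory can be
 * accessed directly through the host pointer. */
static inline uint32_t example_read_guest_u32(CPUArchState *env,
                                              target_ulong vaddr)
{
    unsigned long q_addr = 0;

    CM_GET_QEMU_ADDR(env, q_addr, vaddr);
    return *(uint32_t *)q_addr;
}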
+
+/* XXX These are also used by atomic instruction handling.
+ * Put these defines in some other files? */
+#define DATA_b uint8_t
+#define DATA_w uint16_t
+#define DATA_l uint32_t
+#define DATA_q uint64_t
+
+#define __inline__ inline __attribute__((always_inline))
+
+#if defined(__i386__) || defined(__x86_64__)
+// Is this the correct way to detect a 64-bit system?
+#if defined(_LP64)
+static __inline__ uint8_t
+atomic_compare_exchange16b(uint64_t *memp,
+ uint64_t rax, uint64_t rdx,
+ uint64_t rbx, uint64_t rcx)
+{
+ uint8_t z;
+ __asm __volatile__ ( "lock; cmpxchg16b %3\n\t"
+ "setz %2\n\t"
+ : "=a" (rax), "=d" (rdx), "=r" (z), "+m" (*memp)
+ : "a" (rax), "d" (rdx), "b" (rbx), "c" (rcx)
+ : "memory", "cc" );
+ return z;
+}
+#else
+static __inline__ uint8_t
+atomic_compare_exchange16b(uint64_t *memp,
+ uint64_t rax, uint64_t rdx,
+ uint64_t rbx, uint64_t rcx)
+{
+ assert("atomic_compare_exchange16b: not supported.\n");
+ exit(0);
+}
+
+static __inline__ uint8_t
+atomic_compare_exchangeq(uint64_t *addr,
+ uint64_t oldval, uint64_t newval)
+{
+ assert("atomic_compare_exchangeq: not supported.\n");
+ exit(0);
+}
+
+#endif
+
+/* Memory Barriers: x86-64 ONLY now */
+#define mb() asm volatile("mfence":::"memory")
+#define rmb() asm volatile("lfence":::"memory")
+#define wmb() asm volatile("sfence" ::: "memory")
+
+#define LOCK_PREFIX "lock; "
+
+#define coremu_xglue(a, b) a ## b
+// If a/b is a macro, it is expanded first and then passed to coremu_xglue
+#define coremu_glue(a, b) coremu_xglue(a, b)
+
+#define coremu_xstr(s) # s
+#define coremu_str(s) coremu_xstr(s)
+
+#define DATA_BITS 8
+#include "coremu-template.h"
+
+#define DATA_BITS 16
+#include "coremu-template.h"
+
+#define DATA_BITS 32
+#include "coremu-template.h"
+
+#if defined(_LP64)
+#define DATA_BITS 64
+#include "coremu-template.h"
+#else
+static inline uint64_t atomic_exchangeq(uint64_t *p, uint64_t val)
+{
+ assert("atomic_exchangeq: not supported.\n");
+ exit(0);
+}
+
+#endif
+
+#elif defined(__arm__)
+
+#if defined(__ARM_ARCH_7__) || \
+ defined(__ARM_ARCH_7A__) || \
+ defined(__ARM_ARCH_7EM__) || \
+ defined(__ARM_ARCH_7M__) || \
+ defined(__ARM_ARCH_7R__) || \
+ defined(__ARM_ARCH_6J__) || \
+ defined(__ARM_ARCH_6K__) || \
+ defined(__ARM_ARCH_6T2__) || \
+ defined(__ARM_ARCH_6Z__) || \
+ defined(__ARM_ARCH_6ZK__)
+#define USE_ARMV6_INSTRUCTIONS
+#endif
+
+#ifdef USE_ARMV6_INSTRUCTIONS
+#define mb() __asm__ __volatile__("dmb" : : : "memory")
+#define raw_local_irq_save(x) \
+ ({ \
+ __asm__ __volatile__( \
+ "mrs %0, cpsr @ local_irq_save\n" \
+ "cpsid i" \
+ : "=r" (x) : : "memory", "cc"); \
+ })
+#else
+#define mb() __asm__ __volatile__("":::"memory")
+#define raw_local_irq_save(x) \
+ ({ \
+ unsigned long temp; \
+ (void) (&temp == &x); \
+ __asm__ __volatile__( \
+ "mrs %0, cpsr @ local_irq_save\n" \
+" orr %1, %0, #128\n" \
+" msr cpsr_c, %1" \
+ : "=r" (x), "=r" (temp) \
+ : \
+ : "memory", "cc"); \
+ })
+#endif
+
+#define raw_local_irq_restore(x) \
+ __asm__ __volatile( \
+ "msr cpsr_c, %0 @ local_irq_restore\n" \
+ : \
+ : "r" (x) \
+ : "memory", "cc")
+
+static __inline__ uint8_t atomic_compare_exchangeb(uint8_t *addr,
+ uint8_t oldval, uint8_t newval)
+{
+ uint8_t ret;
+#ifdef USE_ARMV6_INSTRUCTIONS
+ unsigned long tmp;
+ __asm__ __volatile__("@ atomic_cmpxchgl\n"
+ "1: ldrexb %1, [%3]\n"
+ " mov %0, #0\n"
+ " teq %1, %4\n"
+ " strexbeq %0, %5, [%3]\n"
+ " teq %0, #0\n"
+ " bne 1b\n"
+ : "=&r" (tmp), "=&r" (ret), "+Qo" (*addr)
+ : "r" (addr), "Ir" (oldval), "r" (newval)
+ : "cc");
+#else
+ unsigned long flags;
+ raw_local_irq_save(flags);
+ ret = *addr;
+ if (likely(ret == oldval))
+ *addr = newval;
+ raw_local_irq_restore(flags);
+#endif
+ return ret;
+}
+
+static __inline__ uint16_t atomic_compare_exchangew(uint16_t *addr,
+ uint16_t oldval, uint16_t newval)
+{
+ uint16_t ret;
+#ifdef USE_ARMV6_INSTRUCTIONS
+ unsigned long tmp;
+ __asm__ __volatile__("@ atomic_cmpxchgl\n"
+ "1: ldrexh %1, [%3]\n"
+ " mov %0, #0\n"
+ " teq %1, %4\n"
+ " strexheq %0, %5, [%3]\n"
+ " teq %0, #0\n"
+ " bne 1b\n"
+ : "=&r" (tmp), "=&r" (ret), "+Qo" (*addr)
+ : "r" (addr), "Ir" (oldval), "r" (newval)
+ : "cc");
+#else
+ unsigned long flags;
+ raw_local_irq_save(flags);
+ ret = *addr;
+ if (likely(ret == oldval))
+ *addr = newval;
+ raw_local_irq_restore(flags);
+#endif
+ return ret;
+}
+
+static __inline__ uint32_t atomic_compare_exchangel(uint32_t *addr,
+ uint32_t oldval, uint32_t newval)
+{
+ uint32_t ret;
+#ifdef USE_ARMV6_INSTRUCTIONS
+ unsigned long tmp;
+ __asm__ __volatile__("@ atomic_cmpxchgl\n"
+ "1: ldrex %1, [%3]\n"
+ " mov %0, #0\n"
+ " teq %1, %4\n"
+ " strexeq %0, %5, [%3]\n"
+ " teq %0, #0\n"
+ " bne 1b\n"
+ : "=&r" (tmp), "=&r" (ret), "+Qo" (*addr)
+ : "r" (addr), "Ir" (oldval), "r" (newval)
+ : "cc");
+#else
+ unsigned long flags;
+ raw_local_irq_save(flags);
+ ret = *addr;
+ if (likely(ret == oldval))
+ *addr = newval;
+ raw_local_irq_restore(flags);
+#endif
+ return ret;
+}
+
+static __inline__ uint64_t atomic_compare_exchangeq(uint64_t *addr,
+ uint64_t oldval, uint64_t newval)
+{
+ uint64_t ret;
+#ifdef USE_ARMV6_INSTRUCTIONS
+ unsigned long tmp;
+ __asm__ __volatile__("@ atomic_cmpxchgl\n"
+ "1: ldrexd %1, %H1, [%3]\n"
+ " mov %0, #0\n"
+ " teq %1, %4\n"
+ " teqeq %H1, %H4\n"
+ " strexdeq %0, %5, %H5, [%3]\n"
+ " teq %0, #0\n"
+ " bne 1b\n"
+ : "=&r" (tmp), "=&r" (ret), "+Qo" (*addr)
+ : "r" (addr), "Ir" (oldval), "r" (newval)
+ : "cc");
+#else
+ unsigned long flags;
+ raw_local_irq_save(flags);
+ ret = *addr;
+ if (likely(ret == oldval))
+ *addr = newval;
+ raw_local_irq_restore(flags);
+#endif
+ return ret;
+}
+
+static __inline__ uint8_t
+atomic_compare_exchange16b(uint64_t *memp,
+ uint64_t old_less, uint64_t old_most,
+ uint64_t new_less, uint64_t new_most)
+{
+ uint8_t ret = 0;
+ unsigned long flags;
+ raw_local_irq_save(flags);
+ /* ret stays 0 unless the exchange below succeeds */
+ if (likely(*memp == old_less && *(memp+1) == old_most))
+ {
+ *memp = new_less;
+ *(memp+1) = new_most;
+ ret = 1;
+ }
+ raw_local_irq_restore(flags);
+ return ret;
+}
+
+static __inline__ unsigned long __xchg(unsigned long x, volatile void *ptr, int size)
+{
+ unsigned long ret;
+#ifdef USE_ARMV6_INSTRUCTIONS
+ unsigned int tmp;
+#endif
+
+ mb();
+
+ switch (size) {
+#ifdef USE_ARMV6_INSTRUCTIONS
+ case 1:
+ __asm __volatile("@ __xchg1\n"
+ "1: ldrexb %0, [%3]\n"
+ " strexb %1, %2, [%3]\n"
+ " teq %1, #0\n"
+ " bne 1b"
+ : "=&r" (ret), "=&r" (tmp)
+ : "r" (x), "r" (ptr)
+ : "memory", "cc");
+ break;
+ case 2:
+ __asm __volatile("@ __xchg1\n"
+ "1: ldrexh %0, [%3]\n"
+ " strexh %1, %2, [%3]\n"
+ " teq %1, #0\n"
+ " bne 1b"
+ : "=&r" (ret), "=&r" (tmp)
+ : "r" (x), "r" (ptr)
+ : "memory", "cc");
+ break;
+ case 4:
+ __asm __volatile("@ __xchg4\n"
+ "1: ldrex %0, [%3]\n"
+ " strex %1, %2, [%3]\n"
+ " teq %1, #0\n"
+ " bne 1b"
+ : "=&r" (ret), "=&r" (tmp)
+ : "r" (x), "r" (ptr)
+ : "memory", "cc");
+ break;
+#else
+ case 1:
+ __asm __volatile("@ __xchg1\n"
+ " swpb %0, %1, [%2]"
+ : "=&r" (ret)
+ : "r" (x), "r" (ptr)
+ : "memory", "cc");
+ break;
+
+ case 4:
+ __asm __volatile("@ __xchg4\n"
+ " swp %0, %1, [%2]"
+ : "=&r" (ret)
+ : "r" (x), "r" (ptr)
+ : "memory", "cc");
+ break;
+ case 2:
+ {
+ unsigned long flags = 0;
+ raw_local_irq_save(flags);
+ ret = *(volatile uint16_t *)ptr;
+ *(volatile uint16_t *)ptr = x;
+ raw_local_irq_restore(flags);
+ break;
+ }
+
+#endif
+ default:
+ exit(0);
+ }
+ mb();
+
+ return ret;
+}
+
+#define xchg(ptr,x) ((__typeof__(*(ptr)))__xchg((unsigned long)(x),(ptr),sizeof(*(ptr))))
+#define GEN_ATOMIC_XCHG_HELPER(TYPE) \
+static __inline__ DATA_##TYPE atomic_exchange##TYPE(DATA_##TYPE *p, DATA_##TYPE val) { return xchg(p, val); }
+
+GEN_ATOMIC_XCHG_HELPER(b);
+GEN_ATOMIC_XCHG_HELPER(w);
+GEN_ATOMIC_XCHG_HELPER(l);
+
+#endif
+
+#endif /* _COREMU_ATOMIC_H */
+
diff --git a/src/llvm/atomic/coremu-template.h b/src/llvm/atomic/coremu-template.h
new file mode 100644
index 0000000..66b185c
--- /dev/null
+++ b/src/llvm/atomic/coremu-template.h
@@ -0,0 +1,101 @@
+/* The following code may be included multiple times in a single file. */
+
+#if DATA_BITS == 64
+# define DATA_TYPE uint64_t
+# define SUFFIX q
+#elif DATA_BITS == 32
+# define DATA_TYPE uint32_t
+# define SUFFIX l
+#elif DATA_BITS == 16
+# define DATA_TYPE uint16_t
+# define SUFFIX w
+#elif DATA_BITS == 8
+# define DATA_TYPE uint8_t
+# define SUFFIX b
+#else
+#error unsupported data size
+#endif
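/* Inclusion sketch (mirrors what coremu-atomic.h does, shown only for
 * illustration): each inclusion of this template instantiates one operand
 * size; e.g. the following generates atomic_incl, atomic_exchangel,
 * atomic_compare_exchangel and friends.
 *
 *     #define DATA_BITS 32
 *     #include "coremu-template.h"
 */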
+
+static __inline__ void coremu_glue(atomic_inc, SUFFIX)(DATA_TYPE *p) {
+ asm volatile(
+ LOCK_PREFIX "inc"coremu_str(SUFFIX)" %0"
+ : "+m"(*p)
+ :
+ : "cc");
+}
+
+static __inline__ void coremu_glue(atomic_dec, SUFFIX)(DATA_TYPE *p) {
+ asm volatile(
+ LOCK_PREFIX "dec"coremu_str(SUFFIX)" %0"
+ : "+m"(*p)
+ :
+ : "cc");
+}
+
+static __inline__ void coremu_glue(atomic_add, SUFFIX)(DATA_TYPE* addr,
+ DATA_TYPE val) {
+ asm volatile(
+ LOCK_PREFIX "add"coremu_str(SUFFIX)" %1, %0"
+ : "+m"(*addr)
+ : "a"(val)
+ : "cc");
+}
+
+/* Swap the value VAL with *p.
+ * Return the value swapped out from memory. */
+static inline DATA_TYPE coremu_glue(atomic_exchange, SUFFIX)(
+ DATA_TYPE *p, DATA_TYPE val)
+{
+ DATA_TYPE out;
+ __asm __volatile(
+ "lock; xchg"coremu_str(SUFFIX)" %1,%2 \n\t"
+ : "=a" (out), "+m" (*p)
+ : "a" (val)
+ );
+ return out;
+}
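/* Usage sketch (hypothetical, not part of this patch): a test-and-set
 * spinlock built on the 8-bit instantiation atomic_exchangeb, which is
 * generated when this header is included with DATA_BITS == 8. The returned
 * old value tells us whether the lock was already held. */
static inline void example_spin_lock(uint8_t *lock)
{
    while (atomic_exchangeb(lock, 1))   /* returns the previous value */
        ;                               /* lock was held: retry */
}

static inline void example_spin_unlock(uint8_t *lock)
{
    mb();                               /* order the critical section before the release */
    *lock = 0;
}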
+/* Return the previous value in addr. So if the return value is the same as
+ * oldval, the swap occurred. */
+static __inline__ DATA_TYPE coremu_glue(atomic_compare_exchange, SUFFIX)(DATA_TYPE *addr,
+ DATA_TYPE oldval, DATA_TYPE newval) {
+ asm volatile(
+ LOCK_PREFIX "cmpxchg"coremu_str(SUFFIX)" %2, %1"
+ : "+a"(oldval), "+m"(*addr)
+ : "q"(newval)
+ : "cc");
+
+ return oldval;
+}
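/* Usage sketch (hypothetical, not part of this patch): the canonical CAS
 * retry loop, here adding a delta to a 32-bit counter with
 * atomic_compare_exchangel (the DATA_BITS == 32 instantiation). Because the
 * function returns the previous memory value, the update succeeded exactly
 * when that value equals the one we loaded. */
static inline void example_atomic_add_u32(uint32_t *counter, uint32_t delta)
{
    uint32_t old, seen;

    do {
        old = *counter;
        seen = atomic_compare_exchangel(counter, old, old + delta);
    } while (seen != old);              /* lost a race: reload and retry */
}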
+
+static __inline__ void coremu_glue(atomic_and, SUFFIX)(DATA_TYPE *addr,
+ DATA_TYPE mask) {
+ asm volatile(
+ LOCK_PREFIX "and"coremu_str(SUFFIX)" %1, %0"
+ : "+m"(*addr)
+ : "r"(mask)
+ : "cc");
+}
+
+static __inline__ void coremu_glue(atomic_or, SUFFIX)(DATA_TYPE *addr,
+ DATA_TYPE mask) {
+ asm volatile(
+ LOCK_PREFIX "or"coremu_str(SUFFIX)" %1, %0"
+ : "+m"(*addr)
+ : "r"(mask)
+ : "cc");
+}
+
+static __inline__ DATA_TYPE coremu_glue(atomic_xadd, SUFFIX)(
+ DATA_TYPE* addr, DATA_TYPE val) {
+ asm volatile(
+ LOCK_PREFIX "xadd"coremu_str(SUFFIX)" %0, %1"
+ : "+a"(val), "+m"(*addr)
+ :
+ : "cc");
+
+ return val;
+}
+
+#undef DATA_BITS
+#undef DATA_TYPE
+#undef SUFFIX