Diffstat (limited to 'src/llvm/pass')
-rw-r--r--  src/llvm/pass/CombineCasts.cpp               321
-rw-r--r--  src/llvm/pass/CombineGuestMemory.cpp         389
-rw-r--r--  src/llvm/pass/CombineZExtTrunc.cpp            70
-rw-r--r--  src/llvm/pass/FastMathPass.cpp                87
-rw-r--r--  src/llvm/pass/ProfileExec.cpp                172
-rw-r--r--  src/llvm/pass/RedundantStateElimination.cpp  179
-rw-r--r--  src/llvm/pass/ReplaceIntrinsic.cpp           137
-rw-r--r--  src/llvm/pass/SimplifyPointer.cpp            334
-rw-r--r--  src/llvm/pass/StateMappingPass.cpp           885
9 files changed, 2574 insertions, 0 deletions
diff --git a/src/llvm/pass/CombineCasts.cpp b/src/llvm/pass/CombineCasts.cpp
new file mode 100644
index 0000000..71a74ff
--- /dev/null
+++ b/src/llvm/pass/CombineCasts.cpp
@@ -0,0 +1,321 @@
+/*
+ * (C) 2015 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ * See COPYRIGHT in top-level directory.
+ */
+
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm-target.h"
+#include "llvm-opc.h"
+#include "llvm-pass.h"
+#include "utils.h"
+
+#define PASS_NAME "CombineCasts"
+
+/*
+ * CombineCasts Pass
+ */
+class CombineCasts : public FunctionPass {
+ IRFactory *IF;
+ const DataLayout *DL;
+ MDFactory *MF;
+ IntegerType *Int8Ty;
+ IntegerType *Int32Ty;
+ IntegerType *Int64Ty;
+ IntegerType *IntPtrTy;
+ PointerType *Int8PtrTy;
+ PointerType *Int32PtrTy;
+ PointerType *Int64PtrTy;
+ Type *FloatTy;
+ Type *DoubleTy;
+ IVec toErase;
+
+public:
+ static char ID;
+ explicit CombineCasts() : FunctionPass(ID) {}
+ explicit CombineCasts(IRFactory *IF)
+ : FunctionPass(ID), IF(IF), DL(IF->getDL()), MF(IF->getMDFactory())
+ {
+        LLVMContext &Context = IF->getContext();
+ Int8Ty = IntegerType::get(Context, 8);
+ Int32Ty = IntegerType::get(Context, 32);
+ Int64Ty = IntegerType::get(Context, 64);
+ IntPtrTy = DL->getIntPtrType(Context);
+ Int8PtrTy = Type::getInt8PtrTy(Context, 0);
+ Int32PtrTy = Type::getInt32PtrTy(Context, 0);
+ Int64PtrTy = Type::getInt64PtrTy(Context, 0);
+ FloatTy = Type::getFloatTy(Context);
+ DoubleTy = Type::getDoubleTy(Context);
+ }
+
+ Instruction *getUniqueUser(Instruction *I) {
+ if (I->hasOneUse())
+ return I->user_back();
+ return nullptr;
+    }
+
+ bool combineLoadCast(LoadInst *LI);
+ bool combineStoreCast(StoreInst *SI);
+ bool combineCastCast(Function &F);
+ bool simplifySignChange(Function &F);
+ bool runOnFunction(Function &F);
+};
+
+char CombineCasts::ID = 0;
+INITIALIZE_PASS(CombineCasts, "combinecast",
+ "Combine bitcast with guest memory loads/stores", false, false)
+
+FunctionPass *llvm::createCombineCasts(IRFactory *IF)
+{
+ return new CombineCasts(IF);
+}
+
+static bool hasSameCastingTy(ArrayRef<BitCastInst *> IL) {
+ Type *SrcTy = IL[0]->getSrcTy();
+ Type *DstTy = IL[0]->getDestTy();
+ for (BitCastInst *I : IL) {
+ if (I->getSrcTy() != SrcTy)
+ return false;
+ if (I->getDestTy() != DstTy)
+ return false;
+ }
+ return true;
+}
+
+/* This function changes the load type if (1) the loaded data is cast to
+ * another type, (2) all bitcast users of the load cast to the same type, and
+ * (3) every other user of the load is a PHI, load or store.
+ *
+ * For example:
+ * %0 = load <typeA>* %0 = load <typeB>*
+ * %1 = bitcast %0, <typeB> %1 = bitcast %0, <typeA>
+ *
+ * %2 = op <typeB> %1, ... => %2 = op <typeB> %0, ...
+ *
+ * store %0, <typeA>* store %1, <typeA>*
+ * store %1, <typeB>* store %0, <typeB>*
+ */
+bool CombineCasts::combineLoadCast(LoadInst *LI)
+{
+ Instruction *Ptr = dyn_cast<Instruction>(LI->getPointerOperand());
+
+ if (!Ptr)
+ return false;
+
+ /* Find all bitcast users of this load. */
+ SmallVector<BitCastInst *, 4> BCIs;
+ for (User *U : LI->users()) {
+ Instruction *UI = cast<Instruction>(U);
+ switch (UI->getOpcode()) {
+ default:
+ return false;
+ case Instruction::PHI:
+ case Instruction::Load:
+ case Instruction::Store:
+ break;
+ case Instruction::BitCast:
+ BCIs.push_back(cast<BitCastInst>(UI));
+ break;
+ }
+ }
+
+ if (BCIs.empty() || !hasSameCastingTy(BCIs))
+ return false;
+
+ Instruction *InsertPos = LI;
+ unsigned Alignment = LI->getAlignment();
+    bool Volatile = LI->isVolatile();
+ Type *SrcTy = LI->getType();
+ Type *DstTy = BCIs[0]->getDestTy();
+
+ Type *PtrTy = PointerType::get(DstTy, LI->getPointerAddressSpace());
+ if (isa<IntToPtrInst>(Ptr))
+ Ptr = new IntToPtrInst(Ptr->getOperand(0), PtrTy, "", InsertPos);
+ else
+ Ptr = new BitCastInst(Ptr, PtrTy, "", InsertPos);
+
+ Instruction *NewLI = new LoadInst(Ptr, "", Volatile, Alignment, InsertPos);
+ Instruction *NewBCI = new BitCastInst(NewLI, SrcTy, "", InsertPos);
+
+ if (MF->isGuestMemory(LI))
+ MF->setGuestMemory(NewLI);
+ for (BitCastInst *BCI : BCIs)
+ BCI->replaceAllUsesWith(NewLI);
+ LI->replaceAllUsesWith(NewBCI);
+
+ toErase.push_back(LI);
+ for (BitCastInst *BCI : BCIs)
+ toErase.push_back(BCI);
+
+ return true;
+}
+
+/* This function changes the store type if the stored data is cast from
+ * another type.
+ *
+ * For example:
+ * %0 = <typeA>
+ * %1 = bitcast %0, <typeB> => store %0, <typeA>*
+ * store %1, <typeB>*
+ */
+bool CombineCasts::combineStoreCast(StoreInst *SI)
+{
+ Instruction *Ptr = dyn_cast<Instruction>(SI->getPointerOperand());
+ Instruction *Data = dyn_cast<Instruction>(SI->getValueOperand());
+
+ if (!Ptr || !Data || !isa<BitCastInst>(Data))
+ return false;
+
+ Instruction *InsertPos = SI;
+ unsigned Alignment = SI->getAlignment();
+    bool Volatile = SI->isVolatile();
+ BitCastInst *BCI = cast<BitCastInst>(Data);
+ Value *V = BCI->getOperand(0);
+ Type *SrcTy = V->getType();
+
+ Type *PtrTy = PointerType::get(SrcTy, SI->getPointerAddressSpace());
+ if (isa<IntToPtrInst>(Ptr))
+ Ptr = new IntToPtrInst(Ptr->getOperand(0), PtrTy, "", InsertPos);
+ else
+ Ptr = new BitCastInst(Ptr, PtrTy, "", InsertPos);
+
+ Instruction *NewSI = new StoreInst(V, Ptr, Volatile, Alignment, InsertPos);
+
+ if (MF->isGuestMemory(SI))
+ MF->setGuestMemory(NewSI);
+
+ toErase.push_back(SI);
+ return true;
+}
+
+/* This function aims to eliminate redundant casts.
+ * For example:
+ * %0 = <typeA> %0 = <typeA>
+ * %1 = bitcast %0, <typeB> =>
+ * %2 = bitcast %1, <typeC> %2 = bitcast %0, <typeC>
+ * = op <typeC> %2, ... = op <typeC> %2, ...
+ *
+ * And if <typeA> is the same as <typeC>, the code is further optimized to
+ * %0 = <typeA> %0 = <typeA>
+ * %1 = bitcast %0, <typeB> =>
+ * %2 = bitcast %1, <typeC>
+ * = op <typeC> %2, ... = op <typeA> %0, ...
+ */
+bool CombineCasts::combineCastCast(Function &F)
+{
+ SmallVector<Instruction*, 4> Worklist;
+ for (auto II = inst_begin(F), EE = inst_end(F); II != EE; II++) {
+ Instruction *I = &*II;
+ if (isa<BitCastInst>(I))
+ Worklist.push_back(I);
+ }
+
+ for (auto I : Worklist) {
+ BitCastInst *CI = cast<BitCastInst>(I);
+ BitCastInst *CSrc = dyn_cast<BitCastInst>(CI->getOperand(0));
+ if (!CSrc)
+ continue;
+
+ Type *SrcTy = CSrc->getOperand(0)->getType();
+ Type *DstTy = CI->getType();
+ Value *Result = (SrcTy == DstTy) ? CSrc->getOperand(0) :
+ new BitCastInst(CSrc->getOperand(0), CI->getType(), "", CI);
+ I->replaceAllUsesWith(Result);
+ toErase.push_back(I);
+ }
+
+ if (toErase.empty())
+ return false;
+
+ ProcessErase(toErase);
+ return true;
+}
+
+/* This function converts the sign change of float/double data (i.e., -num)
+ * that is implemented with integer operations into float/double operations.
+ * For example:
+ * %0 = bitcast float %num to i32
+ * %1 = xor i32 %0, 0x80000000 => %0 = fsub float 0, %num
+ * %2 = bitcast %1, float
+ */
+bool CombineCasts::simplifySignChange(Function &F)
+{
+ SmallVector<BitCastInst*, 16> Worklist;
+
+ for (auto II = inst_begin(F), EE = inst_end(F); II != EE; II++) {
+ Instruction *I = &*II;
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(I)) {
+ Type *SrcTy = BCI->getSrcTy();
+ Type *DstTy = BCI->getDestTy();
+ if (SrcTy == FloatTy && DstTy == Int32Ty)
+ Worklist.push_back(BCI);
+ else if (SrcTy == DoubleTy && DstTy == Int64Ty)
+ Worklist.push_back(BCI);
+ }
+ }
+
+ for (auto I : Worklist) {
+ Type *Ty = I->getSrcTy();
+ Value *C = (Ty == FloatTy) ? CONST32(0x80000000)
+ : CONST64(0x8000000000000000LL);
+
+ /* Check whether the single user of this bitcast is Xor. */
+ Instruction *UI = getUniqueUser(I);
+ if (UI && UI->getOpcode() == Instruction::Xor && UI->getOperand(1) == C) {
+ /* Check whether the single user of this Xor is a bitcast
+ * instruction that casts the type back to the src type. */
+ Instruction *UUI = getUniqueUser(UI);
+ if (UUI && UUI->getOpcode() == Instruction::BitCast &&
+ cast<BitCastInst>(UUI)->getDestTy() == Ty) {
+ Value *V = BinaryOperator::Create(Instruction::FSub,
+ ConstantFP::get(Ty, 0),
+ I->getOperand(0), "", I);
+ UUI->replaceAllUsesWith(V);
+ toErase.push_back(UUI);
+ }
+ }
+ }
+
+ if (toErase.empty())
+ return false;
+
+ ProcessErase(toErase);
+ return true;
+}
+
+bool CombineCasts::runOnFunction(Function &F)
+{
+ bool Changed = false;
+ SmallVector<LoadInst *, 16> Loads;
+ SmallVector<StoreInst *, 16> Stores;
+
+    /* Collect all guest memory and non-volatile CPU state loads/stores. */
+ for (auto II = inst_begin(F), EE = inst_end(F); II != EE; II++) {
+ Instruction *I = &*II;
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ if (MF->isGuestMemory(LI) || !LI->isVolatile())
+ Loads.push_back(LI);
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ if (MF->isGuestMemory(SI) || !SI->isVolatile())
+ Stores.push_back(SI);
+ }
+ }
+
+ for (auto LI : Loads)
+ Changed |= combineLoadCast(LI);
+ for (auto SI : Stores)
+ Changed |= combineStoreCast(SI);
+
+    if (!toErase.empty())
+ ProcessErase(toErase);
+
+ Changed |= combineCastCast(F);
+ Changed |= simplifySignChange(F);
+
+ return Changed;
+}
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
+
diff --git a/src/llvm/pass/CombineGuestMemory.cpp b/src/llvm/pass/CombineGuestMemory.cpp
new file mode 100644
index 0000000..0740a8b
--- /dev/null
+++ b/src/llvm/pass/CombineGuestMemory.cpp
@@ -0,0 +1,389 @@
+/*
+ * (C) 2015 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ * See COPYRIGHT in top-level directory.
+ */
+
+#include "llvm-debug.h"
+#include "llvm-opc.h"
+#include "llvm-target.h"
+#include "llvm-pass.h"
+#include "utils.h"
+
+#define PASS_NAME "CombineGuestMemory"
+
+/*
+ * CombineGuestMemory Pass
+ */
+class CombineGuestMemory : public FunctionPass {
+
+ struct StateInfo {
+ StateInfo() : Ptr(nullptr) {}
+ StateInfo(Value *ptr, APInt &offset, APInt &size)
+ : Ptr(ptr), Offset(offset), Size(size) {}
+ Value *Ptr;
+ APInt Offset;
+ APInt Size;
+ };
+
+ typedef std::pair<Value *, Value *> ValuePair;
+ typedef std::map<size_t, size_t> StateMap;
+ typedef DenseMap<ValuePair, StateInfo> CSMap;
+
+ IRFactory *IF;
+ const DataLayout *DL;
+ MDFactory *MF;
+ IntegerType *Int8Ty;
+ IntegerType *Int32Ty;
+ IntegerType *Int64Ty;
+ IntegerType *IntPtrTy;
+ PointerType *Int8PtrTy;
+ PointerType *Int32PtrTy;
+ PointerType *Int64PtrTy;
+ Value *CPU;
+ Value *GuestBase;
+ Instruction *InitLastInst;
+ StateMap LegalStates;
+ IVec toErase;
+
+public:
+ static char ID;
+ explicit CombineGuestMemory() : FunctionPass(ID) {}
+ explicit CombineGuestMemory(IRFactory *IF)
+ : FunctionPass(ID), IF(IF), DL(IF->getDL()), MF(IF->getMDFactory())
+ {
+        LLVMContext &Context = IF->getContext();
+ Int8Ty = IntegerType::get(Context, 8);
+ Int32Ty = IntegerType::get(Context, 32);
+ Int64Ty = IntegerType::get(Context, 64);
+ IntPtrTy = DL->getIntPtrType(Context);
+ Int8PtrTy = Type::getInt8PtrTy(Context, 0);
+ Int32PtrTy = Type::getInt32PtrTy(Context, 0);
+ Int64PtrTy = Type::getInt64PtrTy(Context, 0);
+
+ GuestBase = IF->getGuestBase();
+
+ addLegalStates();
+ }
+
+ unsigned getAddressSpaceOperand(Value *I) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return LI->getPointerAddressSpace();
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->getPointerAddressSpace();
+ return -1U;
+ }
+
+ int getNumUsers(Instruction *I) {
+ return distance(I->user_begin(), I->user_end());
+ }
+
+ void addLegalStates();
+ bool isLegalState(Value *Ptr, APInt &Offset, APInt &Size);
+ bool isConsecutiveAccess(Value *A, Value *B, Value *&Ptr, APInt &Offset, APInt &Size);
+ bool tryCombineLoad(Value *A, Value *B, CSMap &States);
+ bool tryCombineStore(Value *A, Value *B, CSMap &States);
+ bool combineMemory(SmallVector<Value *, 8> &Memory, SmallVector<Value *, 8> &States);
+ bool runOnFunction(Function &F);
+};
+
+char CombineGuestMemory::ID = 0;
+INITIALIZE_PASS(CombineGuestMemory, "combinegm",
+ "Combine guest memory loads and stores", false, false)
+
+FunctionPass *llvm::createCombineGuestMemory(IRFactory *IF)
+{
+ return new CombineGuestMemory(IF);
+}
+
+
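+/* Register the CPU states that are allowed to be combined into one wide
+ * access: the guest SIMD registers (x86 XMM registers or ARM VFP registers). */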
+void CombineGuestMemory::addLegalStates()
+{
+#if defined(TARGET_I386)
+ size_t Start = offsetof(CPUArchState, xmm_regs[0]);
+ size_t Size = sizeof(XMMReg);
+ for (int i = 0; i < CPU_NB_REGS; ++i)
+ LegalStates[Start + Size * i] = Size;
+#elif defined(TARGET_ARM)
+ size_t Start = offsetof(CPUArchState, vfp.regs[0]);
+ size_t Size = sizeof(float64) * 2;
+ for (int i = 0; i < 32; ++i)
+ LegalStates[Start + Size * i] = Size;
+#endif
+}
+
+bool CombineGuestMemory::isConsecutiveAccess(Value *A, Value *B, Value *&Ptr,
+ APInt &Offset, APInt &Size)
+{
+ Value *PtrA = getPointerOperand(A);
+ Value *PtrB = getPointerOperand(B);
+ unsigned ASA = getAddressSpaceOperand(A);
+ unsigned ASB = getAddressSpaceOperand(B);
+
+ if (!PtrA || !PtrB || (ASA != ASB))
+ return false;
+
+ Type *TyA = cast<PointerType>(PtrA->getType())->getElementType();
+ Type *TyB = cast<PointerType>(PtrB->getType())->getElementType();
+ if (DL->getTypeStoreSize(TyA) != DL->getTypeStoreSize(TyB))
+ return false;
+
+ unsigned PtrBitWidth = DL->getTypeSizeInBits(TyA);
+ APInt Sz(PtrBitWidth, DL->getTypeStoreSize(TyA));
+
+ APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
+ PtrA = StripPointerWithConstantOffset(DL, PtrA, OffsetA, GuestBase);
+ PtrB = StripPointerWithConstantOffset(DL, PtrB, OffsetB, GuestBase);
+
+ APInt OffsetDelta = OffsetB - OffsetA;
+ if (PtrA == PtrB && OffsetDelta == Sz) {
+ Ptr = PtrA;
+ Offset = OffsetA;
+ Size = Sz + Sz;
+ return true;
+ }
+
+ return false;
+}
+
+bool CombineGuestMemory::isLegalState(Value *Ptr, APInt &Offset, APInt &Size)
+{
+ if (Ptr != CPU)
+ return false;
+ uint64_t Start = Offset.getZExtValue();
+ if (LegalStates.find(Start) == LegalStates.end() ||
+ Size.getZExtValue() > LegalStates[Start])
+ return false;
+ return true;
+}
+
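+/* Return true if any instruction within the two half-open ranges [SA, EA)
+ * and [SB, EB) could break the combination: a call may clobber memory, and
+ * any load/store other than the four queried instructions may alias them. */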
+static bool hasMemoryViolation(Instruction *SA, Instruction *SB,
+ Instruction *EA, Instruction *EB)
+{
+ std::set<Value*> Insts;
+ Insts.insert(SA);
+ Insts.insert(SB);
+ Insts.insert(EA);
+ Insts.insert(EB);
+
+ BasicBlock::iterator BI = BasicBlock::iterator(SA);
+ BasicBlock::iterator BE = BasicBlock::iterator(EA);
+ for (; BI != BE; ++BI) {
+ Instruction *I = &*BI;
+ if (isa<CallInst>(I))
+ return true;
+ if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
+ continue;
+ if (Insts.find(I) == Insts.end())
+ return true;
+ }
+
+ BI = BasicBlock::iterator(SB);
+ BE = BasicBlock::iterator(EB);
+ for (; BI != BE; ++BI) {
+ Instruction *I = &*BI;
+ if (isa<CallInst>(I))
+ return true;
+ if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
+ continue;
+ if (Insts.find(I) == Insts.end())
+ return true;
+ }
+ return false;
+}
+
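+/* A sketch of the transformation performed below (illustrative IR, not taken
+ * from the source): two consecutive guest loads whose values are immediately
+ * stored to consecutive CPU states become a single vector load/store pair.
+ * For Size = 16:
+ *
+ *   %v0 = load i64* %gva             %v = load <16 x i8>* %gva
+ *   %v1 = load i64* %gva.8      =>   store <16 x i8> %v, <16 x i8>* %state
+ *   store %v0, i64* %state
+ *   store %v1, i64* %state.8
+ */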
+bool CombineGuestMemory::tryCombineLoad(Value *A, Value *B, CSMap &States)
+{
+    /* First, check that the guest loads are 'only' used by the stores to
+     * consecutive CPU states, and that no other loads/stores occur between
+     * the queried operations. */
+ LoadInst *LA = cast<LoadInst>(A);
+ LoadInst *LB = cast<LoadInst>(B);
+ if (getNumUsers(LA) != 1 || getNumUsers(LB) != 1)
+ return false;
+
+ Value *VA = *LA->user_begin();
+ Value *VB = *LB->user_begin();
+ CSMap::iterator CSI = States.find(ValuePair(VA, VB));
+ if (CSI == States.end())
+ return false;
+
+ Instruction *SA = cast<Instruction>(VA);
+ Instruction *SB = cast<Instruction>(VB);
+
+ if (hasMemoryViolation(LA, LB, SA, SB))
+ return false;
+
+    /* At this point, the guest memory data is loaded and immediately stored
+     * to the CPU states, so the operations are safe to combine. */
+ Instruction *InsertPos = SA;
+ StateInfo &SI = CSI->second;
+ uint64_t Size = SI.Size.getZExtValue();
+ unsigned AS = getAddressSpaceOperand(LA);
+ unsigned Align = Size / 2;
+ Type *Ty = PointerType::get(VectorType::get(Int8Ty, Size), AS);
+ Instruction *Ptr = cast<Instruction>(LA->getPointerOperand());
+ if (isa<IntToPtrInst>(Ptr))
+ Ptr = new IntToPtrInst(Ptr->getOperand(0), Ty, "", InsertPos);
+ else
+ Ptr = new BitCastInst(Ptr, Ty, "", InsertPos);
+ Instruction *NewLI = new LoadInst(Ptr, "", true, Align, InsertPos);
+ MF->setGuestMemory(NewLI);
+
+ Ty = PointerType::getUnqual(VectorType::get(Int8Ty, Size));
+ Value *Offset = ConstantInt::get(Ty->getContext(), SI.Offset);
+ Ptr = GetElementPtrInst::CreateInBounds(CPU, Offset, "", InitLastInst);
+ Ptr = new BitCastInst(Ptr, Ty, "", InitLastInst);
+ new StoreInst(NewLI, Ptr, false, InsertPos);
+
+ States.erase(CSI);
+ toErase.push_back(SA);
+ toErase.push_back(SB);
+ return true;
+}
+
+bool CombineGuestMemory::tryCombineStore(Value *A, Value *B, CSMap &States)
+{
+    /* First, check that the CPU state loads are 'only' used by the guest
+     * stores, and that no other loads/stores occur between the queried
+     * operations. */
+ StoreInst *SA = cast<StoreInst>(A);
+ StoreInst *SB = cast<StoreInst>(B);
+ Instruction *LA = dyn_cast<Instruction>(SA->getOperand(0));
+ Instruction *LB = dyn_cast<Instruction>(SB->getOperand(0));
+
+ if (!LA || !LB)
+ return false;
+ if (getNumUsers(LA) != 1 || getNumUsers(LB) != 1)
+ return false;
+
+ CSMap::iterator CSI = States.find(ValuePair(LA, LB));
+ if (CSI == States.end())
+ return false;
+
+ if (hasMemoryViolation(LA, LB, SA, SB))
+ return false;
+
+    /* At this point, the CPU states are loaded and immediately stored to the
+     * guest memory, so the operations are safe to combine. */
+ Instruction *InsertPos = SA;
+ StateInfo &SI = CSI->second;
+ uint64_t Size = SI.Size.getZExtValue();
+ Type *Ty = PointerType::getUnqual(VectorType::get(Int8Ty, Size));
+ Value *Offset = ConstantInt::get(Ty->getContext(), SI.Offset);
+ Instruction *Ptr = GetElementPtrInst::CreateInBounds(CPU, Offset, "", InitLastInst);
+ Ptr = new BitCastInst(Ptr, Ty, "", InitLastInst);
+ Value *V = new LoadInst(Ptr, "", false, InsertPos);
+
+ unsigned AS = getAddressSpaceOperand(SA);
+ unsigned Align = Size / 2;
+ Ty = PointerType::get(VectorType::get(Int8Ty, Size), AS);
+ Ptr = cast<Instruction>(SA->getPointerOperand());
+ if (isa<IntToPtrInst>(Ptr))
+ Ptr = new IntToPtrInst(Ptr->getOperand(0), Ty, "", InsertPos);
+ else
+ Ptr = new BitCastInst(Ptr, Ty, "", InsertPos);
+ Instruction *NewSI = new StoreInst(V, Ptr, true, Align, InsertPos);
+ MF->setGuestMemory(NewSI);
+
+ States.erase(CSI);
+ toErase.push_back(SA);
+ toErase.push_back(SB);
+ return true;
+}
+
+bool CombineGuestMemory::combineMemory(SmallVector<Value *, 8> &Memory,
+ SmallVector<Value *, 8> &States)
+{
+ bool Changed = false;
+ SmallPtrSet<Value *, 4> Used;
+ CSMap ConsecutiveStates;
+ Value *Ptr;
+ APInt Offset, Size;
+
+ /* Find consecutive CPU states. */
+ for (unsigned i = 1, e = States.size(); i != e; i++) {
+ if (!isConsecutiveAccess(States[i-1], States[i], Ptr, Offset, Size))
+ continue;
+
+ if (!isLegalState(Ptr, Offset, Size))
+ continue;
+
+ ConsecutiveStates[ValuePair(States[i-1], States[i])] =
+ StateInfo(Ptr, Offset, Size);
+ }
+
+    if (ConsecutiveStates.empty())
+        return false;
+
+    /* Find and combine consecutive guest memory accesses if their referenced
+     * CPU states are also consecutive. */
+ for (unsigned i = 1, e = Memory.size(); i != e; i++) {
+ if (Used.count(Memory[i-1]) || Used.count(Memory[i]))
+ continue;
+ if (!isConsecutiveAccess(Memory[i-1], Memory[i], Ptr, Offset, Size))
+ continue;
+
+ bool ret = false;
+ if (isa<LoadInst>(Memory[i-1]) && isa<LoadInst>(Memory[i])) {
+ ret = tryCombineLoad(Memory[i-1], Memory[i], ConsecutiveStates);
+ } else if (isa<StoreInst>(Memory[i-1]) && isa<StoreInst>(Memory[i])) {
+ ret = tryCombineStore(Memory[i-1], Memory[i], ConsecutiveStates);
+ }
+ if (ret) {
+ Used.insert(Memory[i-1]);
+ Used.insert(Memory[i]);
+ Changed = true;
+ }
+ }
+ return Changed;
+}
+
+bool CombineGuestMemory::runOnFunction(Function &F)
+{
+ bool Changed = false;
+
+#if defined(CONFIG_SOFTMMU)
+ return Changed;
+#endif
+
+ /* Skip if no state is allowed to be combined. */
+ if (LegalStates.empty())
+ return Changed;
+
+ CPU = IF->getDefaultCPU(F);
+ if (!CPU) {
+ dbg() << DEBUG_PASS << "CombineGuestMemory: Cannot find CPU pointer.\n";
+ return false;
+ }
+
+ InitLastInst = F.getEntryBlock().getTerminator();
+
+ for (auto FI = F.begin(), FE = F.end(); FI != FE; ++FI) {
+ SmallVector<Value *, 8> Memory;
+ SmallVector<Value *, 8> States;
+ for (auto BI = FI->begin(), BE = FI->end(); BI != BE; ++BI) {
+ Instruction *I = &*BI;
+ if (MF->isGuestMemory(I)) {
+ Memory.push_back(I);
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ if (!LI->isVolatile())
+ States.push_back(LI);
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ if (!SI->isVolatile())
+ States.push_back(SI);
+ }
+ }
+ if (Memory.size() >= 2 && States.size() >= 2)
+ Changed |= combineMemory(Memory, States);
+ }
+
+ if (!toErase.empty())
+ ProcessErase(toErase);
+
+ return Changed;
+}
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
+
diff --git a/src/llvm/pass/CombineZExtTrunc.cpp b/src/llvm/pass/CombineZExtTrunc.cpp
new file mode 100644
index 0000000..de9a87f
--- /dev/null
+++ b/src/llvm/pass/CombineZExtTrunc.cpp
@@ -0,0 +1,70 @@
+/*
+ * (C) 2015 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ * See COPYRIGHT in top-level directory.
+ */
+
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm-target.h"
+#include "llvm-opc.h"
+#include "llvm-pass.h"
+#include "utils.h"
+
+#define PASS_NAME "CombineZExtTrunc"
+
+/*
+ * CombineZExtTrunc Pass
+ */
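+/* A minimal sketch of the pattern this pass folds (illustrative IR):
+ *
+ *   %1 = zext i32 %0 to i64
+ *   %2 = trunc i64 %1 to i32   =>   all uses of %2 are replaced with %0
+ */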
+class CombineZExtTrunc : public FunctionPass {
+public:
+ static char ID;
+ explicit CombineZExtTrunc() : FunctionPass(ID) {}
+ bool runOnFunction(Function &F);
+};
+
+char CombineZExtTrunc::ID = 0;
+INITIALIZE_PASS(CombineZExtTrunc, "combinezet",
+ "Combine ZExt followed by Trunc", false, false)
+
+FunctionPass *llvm::createCombineZExtTrunc()
+{
+ return new CombineZExtTrunc;
+}
+
+bool CombineZExtTrunc::runOnFunction(Function &F)
+{
+ bool Changed = false;
+ IVec toErase;
+
+ SmallVector<Instruction*, 4> Worklist;
+ for (auto II = inst_begin(F), EE = inst_end(F); II != EE; II++) {
+ Instruction *I = &*II;
+ if (isa<TruncInst>(I))
+ Worklist.push_back(I);
+ }
+
+ for (auto I : Worklist) {
+ TruncInst *TI = cast<TruncInst>(I);
+ ZExtInst *ZI = dyn_cast<ZExtInst>(TI->getOperand(0));
+ if (!ZI)
+ continue;
+
+ Type *SrcTy = ZI->getOperand(0)->getType();
+ Type *DstTy = TI->getType();
+ if (SrcTy == DstTy) {
+ I->replaceAllUsesWith(ZI->getOperand(0));
+ if (TI->use_empty())
+ toErase.push_back(TI);
+ Changed = true;
+ }
+ }
+
+    if (!toErase.empty())
+ ProcessErase(toErase);
+
+ return Changed;
+}
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
+
diff --git a/src/llvm/pass/FastMathPass.cpp b/src/llvm/pass/FastMathPass.cpp
new file mode 100644
index 0000000..2b6a592
--- /dev/null
+++ b/src/llvm/pass/FastMathPass.cpp
@@ -0,0 +1,87 @@
+/*
+ * (C) 2010 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ * See COPYRIGHT in top-level directory.
+ */
+
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm-target.h"
+#include "llvm-pass.h"
+#include "fpu/softfloat-native-def.h"
+
+#define PASS_NAME "FastMathPass"
+
+class FastMathPass : public FunctionPass {
+public:
+ static char ID;
+ std::map<std::string, std::string> FPUNameMap;
+
+ explicit FastMathPass() : FunctionPass(ID)
+ {
+ TCGHelperInfo *FPUHelper = (TCGHelperInfo *)get_native_fpu_helpers();
+ for (int i = 0, e = num_native_fpu_helpers(); i != e; ++i) {
+ /* ex: llvm_int32_to_float32 --> int32_to_float32 */
+ TCGHelperInfo &fpu = FPUHelper[i];
+ const char *native = fpu.name;
+ const char *soft = native + 5;
+ FPUNameMap[soft] = native;
+ }
+ }
+ bool runOnFunction(Function &F);
+};
+
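+/* For example (illustrative): a call to the softfloat helper
+ *
+ *   %r = call float @int32_to_float32(i32 %x, i8* %fpst)
+ *
+ * is redirected to its native counterpart @llvm_int32_to_float32, which is
+ * then inlined so it can be lowered to native FP instructions. Arguments
+ * beyond the native prototype are dropped. */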
+bool FastMathPass::runOnFunction(Function &F)
+{
+ IVec toErase;
+ SmallVector<CallInst *, 16> InlineCalls;
+ Module *Mod = F.getParent();
+
+ for (auto I = inst_begin(F), E = inst_end(F); I != E; ++I) {
+ if (CallInst *CI = dyn_cast<CallInst>(&*I)) {
+ if (CI->isInlineAsm() ||
+ CI->getCalledFunction() == nullptr ||
+ CI->getCalledFunction()->isIntrinsic())
+ continue;
+
+ std::string Fname = CI->getCalledFunction()->getName();
+ if (FPUNameMap.count(Fname) == 0)
+ continue;
+
+ Function *Fn = Mod->getFunction(FPUNameMap[Fname]);
+ FunctionType *FTy = cast<FunctionType>(
+ cast<PointerType>(Fn->getType())->getElementType());
+
+ unsigned NumArgs = FTy->getNumParams();
+ assert(NumArgs <= CI->getNumArgOperands());
+
+ SmallVector<Value *, 4> Params;
+ for (unsigned i = 0; i != NumArgs; ++i)
+ Params.push_back(CI->getArgOperand(i));
+
+ CallInst *NewCI = CallInst::Create(Fn, Params, "", CI);
+ CI->replaceAllUsesWith(NewCI);
+ InlineCalls.push_back(NewCI);
+ toErase.push_back(CI);
+ }
+ }
+
+ ProcessErase(toErase);
+
+ while (!InlineCalls.empty())
+ InlineFunc(InlineCalls.pop_back_val());
+
+ return false;
+}
+
+char FastMathPass::ID = 0;
+INITIALIZE_PASS(FastMathPass, "fastmath",
+ "Transform softfloat subroutines to native FP operations", false, false)
+
+FunctionPass *llvm::createFastMathPass()
+{
+ return new FastMathPass();
+}
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
+
diff --git a/src/llvm/pass/ProfileExec.cpp b/src/llvm/pass/ProfileExec.cpp
new file mode 100644
index 0000000..56a68e1
--- /dev/null
+++ b/src/llvm/pass/ProfileExec.cpp
@@ -0,0 +1,172 @@
+/*
+ * (C) 2010 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ * See COPYRIGHT in top-level directory.
+ */
+
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm-debug.h"
+#include "llvm-soft-perfmon.h"
+#include "llvm-pass.h"
+#include "llvm-opc.h"
+#include "llvm.h"
+#include "utils.h"
+
+#define PASS_NAME "ProfileExec"
+
+extern LLVMEnv *LLEnv;
+
+/*
+ * Profile Pass
+ */
+class ProfileExec : public FunctionPass {
+ enum {
+ IDX_LOOP = 0,
+ IDX_EXIT,
+ IDX_INBR,
+ };
+
+ IRFactory *IF;
+ const DataLayout *DL;
+ MDFactory *MF;
+ IntegerType *Int8Ty;
+ IntegerType *Int32Ty;
+ IntegerType *Int64Ty;
+ IntegerType *IntPtrTy;
+ PointerType *Int8PtrTy;
+ PointerType *Int32PtrTy;
+ PointerType *Int64PtrTy;
+
+public:
+ static char ID;
+ explicit ProfileExec() : FunctionPass(ID) {}
+ explicit ProfileExec(IRFactory *IF)
+ : FunctionPass(ID), IF(IF), DL(IF->getDL()), MF(IF->getMDFactory())
+ {
+        LLVMContext &Context = IF->getContext();
+ Int8Ty = IntegerType::get(Context, 8);
+ Int32Ty = IntegerType::get(Context, 32);
+ Int64Ty = IntegerType::get(Context, 64);
+ IntPtrTy = DL->getIntPtrType(Context);
+ Int8PtrTy = Type::getInt8PtrTy(Context, 0);
+ Int32PtrTy = Type::getInt32PtrTy(Context, 0);
+ Int64PtrTy = Type::getInt64PtrTy(Context, 0);
+ }
+ bool runOnFunction(Function &F);
+
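+    /* Return the profiling-code insertion point of BB: in the entry block,
+     * skip the first instruction (presumably the CPU pointer definition);
+     * otherwise, insert after any leading PHI nodes. */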
+ Instruction *getInsertPos(BasicBlock *BB) {
+ if (BB == &BB->getParent()->getEntryBlock())
+ return &*++BB->begin();
+ return BB->getFirstNonPHI();
+ }
+};
+
+char ProfileExec::ID = 0;
+INITIALIZE_PASS(ProfileExec, "profile", "Profile trace execution", false, false)
+
+FunctionPass *llvm::createProfileExec(IRFactory *IF)
+{
+ return new ProfileExec(IF);
+}
+
+bool ProfileExec::runOnFunction(Function &F)
+{
+ if (!LLEnv->isTraceMode())
+ return false;
+ if (!SP->isEnabled())
+ return false;
+
+ Instruction *CPU = IF->getDefaultCPU(F);
+ if (!CPU) {
+ dbg() << DEBUG_PASS << PASS_NAME << ": Cannot find CPU pointer.\n";
+ return false;
+ }
+
+ TraceInfo *Trace = IF->getTrace();
+
+ for (auto FI = F.begin(), FE = F.end(); FI != FE; FI++) {
+ BasicBlock *BB = &*FI;
+ if (distance(succ_begin(BB), succ_end(BB)) != 0)
+ continue;
+
+ /* Find exit points and indirect branches. */
+ Trace->NumExit++;
+ if (isa<IndirectBrInst>(BB->getTerminator()))
+ Trace->NumIndirectBr++;
+ }
+
+ /* Insert code to profile trace exit counts. */
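+    /* The inserted code below is equivalent to "env->num_trace_exits++"
+     * (illustrative IR):
+     *   %p = getelementptr inbounds i8* %cpu, i64 <offset>
+     *   %q = bitcast i8* %p to i64*
+     *   %n = load volatile i64* %q
+     *   %m = add i64 %n, 1
+     *   store volatile i64 %m, i64* %q
+     */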
+ if (SP->Mode & SPM_EXIT) {
+ Instruction *InsertPos = &*++BasicBlock::iterator(CPU);
+ Value *NumExitPtr = GetElementPtrInst::CreateInBounds(CPU,
+ CONSTPtr(offsetof(CPUArchState, num_trace_exits)),
+ "", InsertPos);
+ NumExitPtr = new BitCastInst(NumExitPtr, Int64PtrTy, "", InsertPos);
+ Instruction *NumExits = new LoadInst(NumExitPtr, "", true, InsertPos);
+ NumExits = BinaryOperator::Create(Instruction::Add, NumExits,
+ CONST64(1), "", InsertPos);
+ new StoreInst(NumExits, NumExitPtr, true, InsertPos);
+ }
+
+ if (!(SP->Mode & SPM_TRACE))
+ return false;
+
+ SmallVector<CallInst*, 16> InlineCalls;
+ Function *Helper = IF->ResolveFunction("helper_profile_exec");
+
+ /* Prepare counter structures. */
+ if (!Trace->ExecCount) {
+ Trace->ExecCount = new uint64_t *[MAX_SPM_THREADS];
+ for (int i = 0; i < MAX_SPM_THREADS; i++)
+ Trace->ExecCount[i] = new uint64_t[3] {0, 0, 0};
+ }
+
+    /* Find all profiling points. */
+ std::vector<std::pair<Instruction *, int> > ProfilePoint;
+
+ SmallVector<std::pair<const BasicBlock*,const BasicBlock*>, 32> BackEdges;
+ FindFunctionBackedges(F, BackEdges);
+ for (unsigned i = 0, e = BackEdges.size(); i != e; ++i) {
+ auto BackEdgeBB = const_cast<BasicBlock*>(BackEdges[i].first);
+ ProfilePoint.push_back(std::make_pair(BackEdgeBB->getTerminator(), IDX_LOOP));
+ }
+
+ for (auto FI = F.begin(), FE = F.end(); FI != FE; FI++) {
+ BasicBlock *BB = &*FI;
+ if (distance(succ_begin(BB), succ_end(BB)) != 0)
+ continue;
+ bool isIndirectBr = isa<IndirectBrInst>(BB->getTerminator());
+ ProfilePoint.push_back(std::make_pair(getInsertPos(BB),
+ isIndirectBr ? IDX_INBR : IDX_EXIT));
+ }
+
+ /* Insert profiling routines. */
+ for (unsigned i = 0, e = ProfilePoint.size(); i != e; ++i) {
+ Instruction *InsertPos = ProfilePoint[i].first;
+ Value *Ty = CONST32(ProfilePoint[i].second);
+
+ Value *Counter = ConstantExpr::getIntToPtr(
+ CONSTPtr((uintptr_t)Trace->ExecCount),
+ PointerType::getUnqual(Int8Ty));
+
+ SmallVector<Value *, 4> Params;
+ Type *ParamTy = Helper->getFunctionType()->getParamType(0);
+ Value *Env = new BitCastInst(CPU, ParamTy, "", InsertPos);
+ Params.push_back(Env);
+ Params.push_back(Counter);
+ Params.push_back(Ty);
+
+ CallInst *CI = CallInst::Create(Helper, Params, "", InsertPos);
+ MF->setConst(CI);
+ InlineCalls.push_back(CI);
+ }
+
+ while (!InlineCalls.empty())
+ InlineFunc(InlineCalls.pop_back_val());
+
+ return true;
+}
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
+
diff --git a/src/llvm/pass/RedundantStateElimination.cpp b/src/llvm/pass/RedundantStateElimination.cpp
new file mode 100644
index 0000000..2e5f715
--- /dev/null
+++ b/src/llvm/pass/RedundantStateElimination.cpp
@@ -0,0 +1,179 @@
+/*
+ * (C) 2017 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ * See COPYRIGHT in top-level directory.
+ */
+
+#include "llvm-debug.h"
+#include "llvm-opc.h"
+#include "llvm-target.h"
+#include "llvm-pass.h"
+#include "utils.h"
+
+#define PASS_NAME "RedundantStateElimination"
+
+/*
+ * The RedundantStateElimination pass aims to remove
+ * (1) redundant stores to PC, and
+ * (2) redundant loads and stores enclosed by two helper function calls.
+ */
+class RedundantStateElimination : public FunctionPass {
+ IRFactory *IF;
+ MDFactory *MF;
+ const DataLayout *DL;
+ Value *CPU;
+ IVec toErase;
+
+public:
+ static char ID;
+ explicit RedundantStateElimination() : FunctionPass(ID) {}
+ explicit RedundantStateElimination(IRFactory *IF)
+ : FunctionPass(ID), IF(IF), MF(IF->getMDFactory()), DL(IF->getDL()) {}
+
+ int getNumUsers(Instruction *I) {
+ return distance(I->user_begin(), I->user_end());
+ }
+
+ bool isStateOfPC(Value *Ptr) {
+ intptr_t Off = 0;
+ Value *Base = getBaseWithConstantOffset(DL, Ptr, Off);
+ if (Base == CPU && IRFactory::isStateOfPC(Off))
+ return true;
+ return false;
+ }
+
+ bool isDirectDominator(LoadInst *LI, StoreInst *SI) {
+ Instruction *A = LI, *B = SI;
+ if (A->getParent() != B->getParent())
+ return false;
+ for (auto II = BasicBlock::iterator(A), EE = A->getParent()->end();
+ II != EE; ++II) {
+ if (&*II == B)
+ return true;
+            /* If a non-const helper function sits between the two
+             * instructions, this is not a direct domination because the
+             * helper function could cause side effects. */
+ auto CI = dyn_cast<CallInst>(II);
+ if (CI && !MDFactory::isConst(CI))
+ return false;
+ }
+ return false;
+ }
+
+ bool removePCState(Function &F);
+ bool removeHelperState(Function &F);
+ bool runOnFunction(Function &F);
+};
+
+char RedundantStateElimination::ID = 0;
+INITIALIZE_PASS(RedundantStateElimination, "rse",
+ "Eliminate redundant CPU state loads/stores", false, false)
+
+FunctionPass *llvm::createRedundantStateElimination(IRFactory *IF)
+{
+ return new RedundantStateElimination(IF);
+}
+
+/* Eliminate redundant stores to PC for each basic block. */
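+/* For example (illustrative): within a basic block, only the last store to
+ * PC is kept, unless an intervening load of PC makes an earlier store live:
+ *
+ *   store i64 4096, i64* %pc    ; removed
+ *   ...                         ; no load of PC in between
+ *   store i64 4104, i64* %pc    ; kept
+ */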
+bool RedundantStateElimination::removePCState(Function &F)
+{
+ for (auto FI = F.begin(), FE = F.end(); FI != FE; ++FI) {
+ bool Found = false;
+
+ for (auto BI = FI->rbegin(), BE = FI->rend(); BI != BE; ++BI) {
+ Instruction *I = &*BI;
+ if (MF->isGuestMemory(I))
+ continue;
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ if (isStateOfPC(getPointerOperand(SI))) {
+ if (!Found)
+ Found = true;
+ else
+ toErase.push_back(SI);
+ }
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ if (isStateOfPC(getPointerOperand(LI)))
+ Found = false;
+ }
+ }
+ }
+
+ if (toErase.empty())
+ return false;
+
+ ProcessErase(toErase);
+ return true;
+}
+
+/* Eliminate redundant loads/stores enclosed by two helper function calls.
+ * The redundant loads and stores are generated by StateMappingPass for
+ * handling synchronization of CPU states around helper function calls.
+ * A load and store can be removed if a state value is loaded and immediately
+ * stored back to the same state. For example:
+ *
+ * Before optimization: After optimization:
+ * instructions to sync states instructions to sync states
+ * call void @helper_function1() call void @helper_function1()
+ *
+ * %v0 = load i32, i32* %state0
+ * %v1 = load i32, i32* %state1
+ * store i32 %v0, i32* %state0
+ * store i32 %v1, i32* %state1
+ *
+ * call void @helper_function2() call void @helper_function2()
+ * instructions to reload states instructions to reload states
+ */
+bool RedundantStateElimination::removeHelperState(Function &F)
+{
+ for (auto FI = F.begin(), FE = F.end(); FI != FE; ++FI) {
+ for (auto BI = FI->begin(), BE = FI->end(); BI != BE; ++BI) {
+ Instruction *I = &*BI;
+ if (MF->isGuestMemory(I))
+ continue;
+
+ StoreInst *SI = dyn_cast<StoreInst>(I);
+ if (!SI || SI->isVolatile())
+ continue;
+
+ LoadInst *LI = dyn_cast<LoadInst>(getValueOperand(SI));
+ if (LI && isDirectDominator(LI, SI)) {
+ /* We can try removing the store instruction if LI is a direct
+ * dominator of SI. */
+ Value *PtrA = getPointerOperand(LI);
+ Value *PtrB = getPointerOperand(SI);
+ if (StripPointer(PtrA) == CPU && PtrA == PtrB)
+ toErase.push_back(SI);
+ }
+ }
+ }
+
+ if (toErase.empty())
+ return false;
+
+ ProcessErase(toErase);
+ return true;
+}
+
+bool RedundantStateElimination::runOnFunction(Function &F)
+{
+ bool Changed = false;
+
+ CPU = IF->getDefaultCPU(F);
+ if (!CPU) {
+ dbg() << DEBUG_PASS << "RedundantStateElimination: Cannot find CPU pointer.\n";
+ return false;
+ }
+
+ Changed |= removePCState(F);
+#if 0
+ Changed |= removeHelperState(F);
+#endif
+
+ return Changed;
+}
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
+
diff --git a/src/llvm/pass/ReplaceIntrinsic.cpp b/src/llvm/pass/ReplaceIntrinsic.cpp
new file mode 100644
index 0000000..62505f4
--- /dev/null
+++ b/src/llvm/pass/ReplaceIntrinsic.cpp
@@ -0,0 +1,137 @@
+/*
+ * (C) 2016 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ * See COPYRIGHT in top-level directory.
+ */
+
+#include "llvm-types.h"
+#include "llvm-debug.h"
+#include "llvm-target.h"
+#include "llvm-pass.h"
+
+
+#define PASS_NAME "ReplaceIntrinsic"
+
+/*
+ * HQEMU does not allow helpers to contain any memory or debug intrinsics.
+ * This pass replaces memory intrinsics with load/store instructions and
+ * removes debug intrinsics (generated by Clang with the -g flag).
+ */
+class ReplaceIntrinsic : public FunctionPass {
+ IVec toErase;
+public:
+ static char ID;
+ explicit ReplaceIntrinsic() : FunctionPass(ID) {}
+
+ Value *ConvertType(Value *V, Type *T, Instruction *InsertPos) {
+ if (likely(V->getType() == T))
+ return V;
+ return new BitCastInst(V, T, "", InsertPos);
+ }
+
+ bool replaceMemoryIntrinsic(IntrinsicInst *I);
+ bool runOnFunction(Function &F);
+};
+
+char ReplaceIntrinsic::ID = 0;
+INITIALIZE_PASS(ReplaceIntrinsic, "replaceintrinsic",
+ "Replace memory and debug intrinsics generated by clang",
+ false, false)
+
+FunctionPass *llvm::createReplaceIntrinsic()
+{
+ return new ReplaceIntrinsic();
+}
+
+
+/*
+ * Transform memcpy/memmove/memset into load/store instructions.
+ * Clang attempts to move memory data using LLVM memory intrinsics, which
+ * causes the state mapping pass to miss some guest states (state mapping
+ * only considers guest states accessed by general load/store instructions).
+ * So, we simply rewrite the memory intrinsics as load/store instructions.
+ */
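+/* For example (illustrative, assuming a constant 4-byte memcpy):
+ *
+ *   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 4, ...)
+ *
+ * is rewritten as
+ *
+ *   %s = bitcast i8* %src to i32*
+ *   %d = bitcast i8* %dst to i32*
+ *   %v = load i32* %s
+ *   store i32 %v, i32* %d
+ */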
+bool ReplaceIntrinsic::replaceMemoryIntrinsic(IntrinsicInst *I)
+{
+ switch (I->getIntrinsicID()) {
+ case Intrinsic::memset:
+ case Intrinsic::memcpy:
+ case Intrinsic::memmove:
+ break;
+ default:
+ return false;
+ }
+
+ LLVMContext &Context = I->getContext();
+ Type *Int8PtrTy = Type::getInt8PtrTy(Context);
+ CallInst *CI = cast<CallInst>(I);
+
+ if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I)) {
+ /* memcpy/memmove */
+ Value *Src = MTI->getSource();
+ Value *Dst = MTI->getDest();
+ Value *NumBytes = MTI->getLength();
+
+ if (CI->getArgOperand(0)->getType() != Int8PtrTy ||
+ CI->getArgOperand(1)->getType() != Int8PtrTy ||
+ !isa<ConstantInt>(NumBytes) ||
+ MTI->isVolatile())
+ return false;
+
+ /* Remove this instruction if the access size is zero. */
+ size_t Len = cast<ConstantInt>(NumBytes)->getZExtValue();
+ if (Len == 0)
+ goto done;
+
+ Type *Ty = Type::getIntNPtrTy(Context, Len * 8);
+ Src = ConvertType(Src, Ty, I);
+ Dst = ConvertType(Dst, Ty, I);
+ Src = new LoadInst(Src, "", false, I);
+ new StoreInst(Src, Dst, false, I);
+ } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(I)) {
+ /* memset */
+ Value *Src = MSI->getValue();
+ Value *Dst = MSI->getDest();
+ Value *NumBytes = MSI->getLength();
+
+ if (CI->getArgOperand(0)->getType() != Int8PtrTy ||
+ !isa<ConstantInt>(Src) ||
+ !isa<ConstantInt>(NumBytes) ||
+ MSI->isVolatile())
+ return false;
+
+ size_t Val = cast<ConstantInt>(Src)->getZExtValue();
+ size_t Len = cast<ConstantInt>(NumBytes)->getZExtValue();
+ if (Val != 0)
+ return false;
+ if (Len == 0)
+ goto done;
+
+ Type *Ty = Type::getIntNPtrTy(Context, Len * 8);
+ Src = ConstantInt::get(Type::getIntNTy(Context, Len * 8), 0);
+ Dst = ConvertType(Dst, Ty, I);
+ new StoreInst(Src, Dst, false, I);
+ }
+
+done:
+ toErase.push_back(I);
+ return true;
+}
+
+bool ReplaceIntrinsic::runOnFunction(Function &F)
+{
+ for (auto I = inst_begin(&F), E = inst_end(&F); I != E; ++I) {
+ Instruction *Inst = &*I;
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+ if (replaceMemoryIntrinsic(II))
+ continue;
+ if (isa<DbgInfoIntrinsic>(II))
+ toErase.push_back(II);
+ }
+ }
+    bool Changed = !toErase.empty();
+    ProcessErase(toErase);
+    return Changed;
+}
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff --git a/src/llvm/pass/SimplifyPointer.cpp b/src/llvm/pass/SimplifyPointer.cpp
new file mode 100644
index 0000000..87afbdd
--- /dev/null
+++ b/src/llvm/pass/SimplifyPointer.cpp
@@ -0,0 +1,334 @@
+//===- SimplifyPointer.cpp - Reassociate guest pointer arithmetic ---------===//
+//
+// The HQEMU Dynamic Binary Translator Infrastructure
+//
+// (C) 2016 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+// COVART Laboratory, CSIE Department, National Taiwan University, Taiwan.
+// See COPYRIGHT in top-level directory.
+//
+//===----------------------------------------------------------------------===//
+// This pass implements a simple pointer arithmetic reassociator that makes
+// pointer stripping easier. It obtains the scalar evolution results of all
+// guest pointers, which are in simplest form. Next, it inserts new
+// instructions that evaluate the simplified expressions to construct new
+// pointers, and rewrites the corresponding guest loads/stores with the new
+// pointers.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstIterator.h"
+
+#include "llvm-opc.h"
+#include "llvm-pass.h"
+#include "llvm-target.h"
+#include "utils.h"
+
+#define PASS_NAME "SIMPTR"
+#define DEBUG_TYPE "SIMPTR"
+
+//#define VERBOSE
+
+/// \brief Dump pass debug message with pass name mark.
+static inline llvm::raw_ostream &pout() {
+ return dbg() << DEBUG_PASS << PASS_NAME ": ";
+}
+
+/// \returns True if \p A dominates \p B.
+static bool dominates(Value *A, Value *B, DominatorTree *DT) {
+ auto *AI = dyn_cast<Instruction>(A);
+ auto *BI = dyn_cast<Instruction>(B);
+ if (AI && BI)
+ return DT->dominates(AI, BI);
+ return false;
+}
+
+class SimplifyPointer : public FunctionPass {
+public:
+ using ValueList = SmallVector<Value *, 32>;
+ using InstrList = SmallVector<Instruction *, 32>;
+ using ExprValMap = DenseMap<const SCEV *, Value *>;
+
+ // Pass identification, replacement for type id.
+ static char ID;
+
+ // LLVM pass constructor and destructor.
+  explicit SimplifyPointer() : FunctionPass(ID) {}
+ explicit SimplifyPointer(IRFactory *IF)
+ : FunctionPass(ID), IF(IF), MF(IF->getMDFactory()), DL(IF->getDL()) {
+ // Initialize all.
+ initializeSimplifyPointerPass(*PassRegistry::getPassRegistry());
+ }
+
+ // LLVM pass public interfaces.
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnFunction(Function &F) override;
+
+private:
+ /// \return The evaluation result of expression \p S or null if not cached.
+ Value *lookupBaseExpressionCache(const SCEV *S) const {
+ auto V = BaseExprVal.find(S);
+ if (V != BaseExprVal.end())
+ return V->second;
+ return nullptr;
+ }
+
+ /// \returns True if spread constants in the expression tree of \p S can be
+ /// collected by reassociation and reduced to \p FoldVal.
+ ///
+ /// It traverses the expression tree of \p S and propagates constant nodes
+ /// from add, multiply and recurrent add nodes, i.e., (%1 + %2 + 5) * (%3 - 7)
+ /// should return 5 * -7 = -35.
+ bool foldConstantExpression(const SCEV *S, int64_t &FoldVal) const;
+
+ /// \returns The first non-pointer value traced along the use-define chain of
+ /// casting which starts from \p V and ends with a IntToPtrInst, or null if
+ /// the length of searching chain exceeds \p MaxLookup.
+ ///
+ /// In the context of DBT, pointer type is represented and manipulated as
+ /// integer data until used as a pointer. Therefore, it follows:
+ ///
+ /// [Expression Tree]
+ /// + + + +
+ /// \ / \ /
+ /// - ... -
+ /// \ /
+ /// +
+ /// [Scalar Root]
+ /// |
+ /// [Casting]
+ /// |
+ /// [Load/Store]
+ ///
+ /// This method targets the scalar root value.
+ Value *findPointerScalarRoot(Value *V, unsigned MaxLookup = 4);
+
+ /// \brief Simplify the pointer arithmetic of \p LSI based on scalar evolution
+ /// results which folds constants into simplest form. After extracting the
+ /// folded constant from the expression, the rest nodes can form a base
+ /// expression which is likely a common sub-expression of other \p LSI.
+ ///
+ /// It assumes \p LSI has the following use-define chain starting from its
+ /// pointer and containing only add, multiply and recurrent add nodes.
+ ///
+ /// [Expression Tree] [Expression Tree] [Expression Tree]
+ /// + A B + + + B A + +
+ /// \ / \ / \ / \ / \ /
+ /// - ... - - ... - - (B-A)
+ /// \ / \ / \ /
+ /// + + +
+ /// [Scalar Root] >> [Scalar Root] >> [Scalar Root]
+ /// | | |
+ /// [Casting] [Casting] [Casting]
+ /// | | |
+ /// [LSI] [LSI] [LSI]
+ ///
+ /// Suppose A and B are constants, they can be folded into (B-A) with scalar
+ /// evolution results. Need to insert instructions for other operations in
+ /// tree (e.g., the new sub in the right-most figure).
+ ///
+ /// First it tries to find the folded constant and substract it from root
+ /// expression to form the base expression. Then it generates instructions to
+ /// evaluate the base expression.
+ bool tryToSimplifyPointer(Instruction *I);
+
+ // HQEMU internal infrastructure.
+ IRFactory *IF = nullptr;
+ MDFactory *MF = nullptr;
+ // LLVM analysis and data type info.
+ const DataLayout *DL = nullptr;
+ DominatorTree *DT = nullptr;
+ ScalarEvolution *SE = nullptr;
+
+ /// The cache of base expression to corresponding evaluated value map.
+ ExprValMap BaseExprVal;
+};
+
+bool SimplifyPointer::foldConstantExpression(const SCEV *S,
+ int64_t &FoldVal) const {
+ // Handle expression tree of scalar root containing only add, multiply and
+ // recurrent add nodes.
+ if (auto *AddSE = dyn_cast<SCEVAddExpr>(S)) {
+ FoldVal = 0;
+ for (auto Op : AddSE->operands()) {
+ int64_t Val;
+ if (foldConstantExpression(Op, Val))
+ FoldVal += Val;
+ }
+ return true;
+ } else if (auto *MulSE = dyn_cast<SCEVMulExpr>(S)) {
+ FoldVal = 1;
+ for (auto Op : MulSE->operands()) {
+ int64_t Val;
+      // If one operand of the multiplication fails to report a constant, the
+      // entire expression becomes non-constant as well.
+ if (foldConstantExpression(Op, Val))
+ FoldVal *= Val;
+ else
+ return false;
+ }
+ return true;
+ } else if (auto *RecSE = dyn_cast<SCEVAddRecExpr>(S)) {
+ // Trace only the start expression, because the step expression must be
+ // multiplied by the loop trip count which is unlikely constant.
+ return foldConstantExpression(RecSE->getStart(), FoldVal);
+ } else if (auto *ConstSE = dyn_cast<SCEVConstant>(S)) {
+ FoldVal = ConstSE->getValue()->getValue().getSExtValue();
+ return true;
+ }
+ return false;
+}
+
+Value *SimplifyPointer::findPointerScalarRoot(Value *V, unsigned MaxLookup) {
+ if (!V || !V->getType()->isPointerTy())
+ return V;
+
+ for (unsigned i = 0; i < MaxLookup; ++i) {
+ if (BitCastInst *Cast = dyn_cast<BitCastInst>(V)) {
+ V = Cast->getOperand(0);
+ } else if (IntToPtrInst *Cast = dyn_cast<IntToPtrInst>(V)) {
+ // Found first scalar, return it.
+ V = Cast->getOperand(0);
+ return V;
+ }
+ }
+ return nullptr;
+}
+
+bool SimplifyPointer::tryToSimplifyPointer(Instruction *LSI) {
+ Value *Ptr = getPointerOperand(LSI);
+ Value *Root = findPointerScalarRoot(Ptr);
+  // Check for null before dereferencing: findPointerScalarRoot may fail.
+  if (!Ptr || !Root || !Root->getType()->isIntegerTy())
+    return false;
+  Type *RootTy = Root->getType();
+  Type *PtrTy = Ptr->getType();
+
+#ifdef VERBOSE
+ if (DM.getDebugMode() & DEBUG_PASS) {
+ pout() << "Visiting memory instruction.\n";
+ pout() << "- " << *LSI << ".\n";
+ }
+#endif
+
+ // Traverse the simplest form expression tree and collect folded constants.
+ // Note the folded constant can be zero (base = root) if no folded constant
+ // is found.
+ auto *RootSE = SE->getSCEV(Root);
+ int64_t FoldConst = 0;
+ foldConstantExpression(RootSE, FoldConst);
+
+  // Subtract the offset constant from the root expression to get the base
+  // expression, then query the base expression cache to find whether it has
+  // been evaluated.
+ auto *BaseSE = SE->getMinusSCEV(RootSE,
+ SE->getConstant(RootTy, FoldConst, true));
+ Value *Base = lookupBaseExpressionCache(BaseSE);
+
+  // Create instructions to evaluate the base expression on a cache miss, or
+  // if the previously computed value doesn't dominate the load/store
+  // instruction.
+ if (!Base || !dominates(Base, LSI, DT)) {
+#ifdef VERBOSE
+ pout() << " Need to build base expression.\n";
+ pout() << " - Base " << *BaseSE << ".\n";
+ pout() << " - Offset " << FoldConst << ".\n";
+#endif
+ // Expand the base expression if it is safe.
+ if (isSafeToExpand(BaseSE, *SE)) {
+#if defined(LLVM_V35)
+ SCEVExpander Expander(*SE, "");
+#else
+ SCEVExpander Expander(*SE, *DL, "");
+#endif
+ Base = Expander.expandCodeFor(BaseSE, RootTy, LSI);
+ }
+ } else {
+#ifdef VERBOSE
+ pout() << " Use cached base expression value.\n";
+ pout() << " - Base " << *BaseSE << ".\n";
+ pout() << " - Offset " << FoldConst << ".\n";
+#endif
+ }
+
+  // Neither the cached value nor re-computation worked, abort.
+ if (!Base)
+ return false;
+
+  // Add the folded constant (offset) back to the new root value and feed the
+  // result to the load/store instruction as its new pointer.
+ IRBuilder<> Builder(IF->getContext());
+
+ bool FoldZero = (FoldConst == 0);
+ Value *Offset = ConstantInt::get(RootTy, FoldConst);
+
+ Builder.SetInsertPoint(LSI);
+ Value *NewRoot = FoldZero ? Base : Builder.CreateAdd(Base, Offset);
+ Value *NewPtr = Builder.CreateIntToPtr(NewRoot, PtrTy);
+ LSI->replaceUsesOfWith(Ptr, NewPtr);
+
+ // Cache base expression value.
+ BaseExprVal[BaseSE] = Base;
+
+ return true;
+}
+
+void SimplifyPointer::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<DominatorTreeWrapperPass>();
+#if defined(LLVM_V35)
+ AU.addRequired<ScalarEvolution>();
+#else
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+#endif
+}
+
+bool SimplifyPointer::runOnFunction(Function &F) {
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+#if defined(LLVM_V35)
+ SE = &getAnalysis<ScalarEvolution>();
+#else
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+#endif
+
+ bool Changed = false;
+
+ InstrList MemoryInstrs;
+ for (auto FI = F.begin(), FE = F.end(); FI != FE; ++FI) {
+ BasicBlock *BB = &*FI;
+
+ // Skip dead basic blocks.
+ if (!DT->isReachableFromEntry(BB))
+ continue;
+
+ // Collect all guest memory instructions.
+ for (auto BI = BB->begin(), BE = BB->end(); BI != BE; ++BI) {
+ Instruction *I = &*BI;
+ if (MDFactory::isGuestMemory(I))
+ MemoryInstrs.push_back(I);
+ }
+ }
+
+ // Try to simplify pointers of collected load/store instructions.
+ for (Instruction *I : MemoryInstrs)
+ Changed |= tryToSimplifyPointer(I);
+
+ return Changed;
+}
+
+char SimplifyPointer::ID = 0;
+INITIALIZE_PASS_BEGIN(SimplifyPointer, "simplifypointer",
+                      "Reassociate pointer arithmetic", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+#if defined(LLVM_V35)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+#else
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+#endif
+INITIALIZE_PASS_END(SimplifyPointer, "simplifypointer",
+                    "Reassociate pointer arithmetic", false, false)
+
+FunctionPass *llvm::createSimplifyPointer(IRFactory *IF) {
+ return new SimplifyPointer(IF);
+}
+
+/*
+ * vim: ts=2 sts=2 sw=2 expandtab
+ */
diff --git a/src/llvm/pass/StateMappingPass.cpp b/src/llvm/pass/StateMappingPass.cpp
new file mode 100644
index 0000000..0d9dd9b
--- /dev/null
+++ b/src/llvm/pass/StateMappingPass.cpp
@@ -0,0 +1,885 @@
+/*
+ * (C) 2010 by Computer System Laboratory, IIS, Academia Sinica, Taiwan.
+ * See COPYRIGHT in top-level directory.
+ */
+
+#include "llvm-debug.h"
+#include "llvm-opc.h"
+#include "llvm-target.h"
+#include "llvm-pass.h"
+
+#define PASS_NAME "StateMapping"
+
+
+/*
+ * StateMappingPass is used to eliminate the redundant loads and stores to the
+ * CPUArchState. The loads and stores of the guest memory operations are not
+ * removed in order not to violate the memory model of the guest architecture.
+ *
+ * The state mapping rules are:
+ * - A guest state does not overlap other states (i.e., all accesses have the
+ *   same size):
+ *   - Same type: map to this type.
+ *   - Different types: select the type in the order vector, float, integer;
+ *     use bitcast to convert between the types.
+ * - A guest state overlaps other state(s):
+ *   - Query StateType to find the state size (i.e., boundary) and type:
+ *     - Vector type: use insert/extract to manipulate a vector element.
+ *     - Other types: use shifts to manipulate the sub-word element.
+ */
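+/* A sketch of the promotion (illustrative IR): repeated accesses to the same
+ * guest state are redirected to a stack slot, which is written back to the
+ * CPUArchState at the exit blocks:
+ *
+ *   %v0 = load i32* %reg0              %a = alloca i32          ; entry
+ *   store i32 %v1, i32* %reg0     =>   %v0 = load i32* %a
+ *   %v2 = load i32* %reg0              store i32 %v1, i32* %a
+ *                                      %v2 = load i32* %a
+ *                                      ; dirty %a copied back at trace exits
+ */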
+class StateMappingPass : public FunctionPass {
+ IRFactory *IF; /* Uplink to the IRFactory */
+
+public:
+ static char ID;
+ explicit StateMappingPass() : FunctionPass(ID) {}
+ explicit StateMappingPass(IRFactory *IF) : FunctionPass(ID), IF(IF) {}
+
+ bool runOnFunction(Function &F);
+};
+
+struct StateMapping {
+ StateMapping()
+ : State(nullptr), Addr(nullptr), Ty(nullptr), AI(nullptr),
+ hasLoad(false), hasStore(false) {}
+
+ StateData *State;
+ Value *Addr;
+ Type *Ty;
+ AllocaInst *AI;
+ bool hasLoad;
+ bool hasStore;
+
+ intptr_t getSize() { return State->End - State->Start; }
+ intptr_t getStart() { return State->Start; }
+ intptr_t getEnd() { return State->End; }
+ Value *getAddr() { return Addr; }
+ Type *getType() { return Ty; }
+ bool isVector() { return Ty->isVectorTy(); }
+
+ bool overlap(StateRange &Range) {
+ if (Range.empty())
+ return false;
+ intptr_t Start = getStart();
+ intptr_t End = getEnd();
+        /* Start from the last range beginning at or before Start; guard the
+         * decrement to avoid stepping before begin(). */
+        auto I = Range.upper_bound(Start);
+        if (I != Range.begin())
+            --I;
+ for (; I != Range.end() && I->first < End; ++I) {
+ if (I->second > Start)
+ return true;
+ }
+ return false;
+ }
+};
+
+struct ElementInfo {
+ ElementInfo() : Shift(0), NumElts(0), EltTy(nullptr), StateTy(nullptr) {}
+
+ intptr_t Shift;
+ unsigned NumElts;
+ Type *EltTy;
+ Type *StateTy;
+};
+
+class StateMapper {
+ typedef std::vector<StateMapping> StateMapList;
+
+ IRFactory *IF;
+ const DataLayout *DL;
+ Instruction *CPU; /* The CPU pointer */
+ Instruction *PreCastPos; /* The position to cast CPU states */
+ Instruction *PreLoadPos; /* The position to preload CPU states */
+ IVec toErase; /* The instructions to be removed */
+
+ FlatType &StateType;
+ StateAnalyzer Analyzer;
+ StateMapList StateMaps;
+
+public:
+ StateMapper(IRFactory *IF)
+ : IF(IF), DL(IF->getDL()), StateType(IF->getTranslator().getStateType()),
+ Analyzer(DL) {}
+
+ bool run(Function &F) {
+ if (!init(F))
+ return false;
+
+ AnalyzeState(F);
+ if (!StateMaps.empty())
+ PromoteState(F);
+
+ ProcessErase(toErase);
+ return true;
+ }
+
+ /* Rearrange instructions in the 'init' block. */
+ bool init(Function &F);
+
+ /* Analyze instructions in a Function that access CPU states. */
+ void AnalyzeState(Function &F);
+
+ /* Compute state mapping information. */
+ void ComputeStateMap(StateMapping &StateMap, StateData &State);
+
+ /* Determine if the state can be operated as a vector. */
+ Type *TryVectorState(StateData &State, Type *Ty);
+
+ /* Map state references to the virtual states. */
+ void PromoteState(Function &F);
+
+ /* Rewrite state loads and stores. */
+ void RewriteLoad(StateMapping &StateMap, StateRef &Ref);
+ void RewriteStore(StateMapping &StateMap, StateRef &Ref);
+ void RewriteLoadVector(StateMapping &StateMap, StateRef &Ref);
+ void RewriteStoreVector(StateMapping &StateMap, StateRef &Ref);
+
+ /* Compute state and element types for element insertion and extraction. */
+ void getElementInfo(StateMapping &StateMap, StateRef &Ref, ElementInfo &Info);
+
+ /* Sync CPU states around helper calls. */
+ void SyncHelperState();
+
+ /* Store dirty states at the leaf blocks. */
+ void ProcessExitBB(BasicBlock *BB);
+
+ /* Get the pointer without GEP and BitCast. */
+ void StripPointer(Value *V, IVec &IV);
+
+ /* Move the pointer before InsertPos. */
+ void MoveStatePointer(Value *V);
+
+ /* Load state from Src and store it to Dest. */
+ void CopyState(Value *Dest, Value *Src, Instruction *InsertPos);
+
+ bool isLegalState(Value *Ptr, intptr_t &Off);
+
+    /* Return true if the input is an alias of a state pointer. */
+ bool isStatePointer(Value *V) {
+ if (auto BCI = dyn_cast<BitCastInst>(V)) {
+ if (BCI->getOperand(0) == CPU)
+ return true;
+ return isStatePointer(BCI->getOperand(0));
+ } else if (auto GEP = dyn_cast<GetElementPtrInst>(V))
+ return GEP->getOperand(0) == CPU;
+ return false;
+ }
+
+ bool isSimpleFunction(Function *F) {
+ HelperMap &Helpers = IF->getHelpers();
+ if (Helpers.find(F->getName()) == Helpers.end() ||
+ Helpers[F->getName()]->hasNestedCall)
+ return false;
+ return true;
+ }
+
+ Value *ConvertType(Value *V, Type *Ty, Instruction *InsertPos) {
+ return V->getType() == Ty ? V : new BitCastInst(V, Ty, "", InsertPos);
+ }
+};
+
+/* Return a pre-defined state name. */
+static std::string getStateName(intptr_t Off)
+{
+#if defined(TARGET_I386)
+ if (Off == offsetof(CPUArchState,xmm_regs[0])) return "xmm0";
+ if (Off == offsetof(CPUArchState,xmm_regs[1])) return "xmm1";
+ if (Off == offsetof(CPUArchState,xmm_regs[2])) return "xmm2";
+ if (Off == offsetof(CPUArchState,xmm_regs[3])) return "xmm3";
+ if (Off == offsetof(CPUArchState,xmm_regs[4])) return "xmm4";
+ if (Off == offsetof(CPUArchState,xmm_regs[5])) return "xmm5";
+ if (Off == offsetof(CPUArchState,xmm_regs[6])) return "xmm6";
+ if (Off == offsetof(CPUArchState,xmm_regs[7])) return "xmm7";
+ if (Off == offsetof(CPUArchState,xmm_t0)) return "xmm_t0";
+#endif
+ return "";
+}
+
+/* Determine if the offset is to access the temporary state. */
+static inline bool isLocalState(intptr_t Off)
+{
+#if defined(TARGET_I386)
+ if (Off == offsetof(CPUArchState, xmm_t0))
+ return true;
+#endif
+ return false;
+}
+
+/* Return true for states that should be ignored during state mapping. */
+static bool isSkipState(intptr_t Off)
+{
+ if (Off == (intptr_t)(offsetof(CPUState, tcg_exit_req) - ENV_OFFSET))
+ return true;
+
+#define stateof(X) \
+ (Off >= (intptr_t)offsetof(CPUArchState,X) && \
+ Off < (intptr_t)(offsetof(CPUArchState,X) + sizeof(((CPUArchState*)0)->X)))
+#define is_fpstatus(X) \
+ (stateof(X.float_detect_tininess) || \
+ stateof(X.float_rounding_mode) || \
+ stateof(X.float_exception_flags) || \
+ stateof(X.floatx80_rounding_precision) || \
+ stateof(X.flush_to_zero) || \
+ stateof(X.flush_inputs_to_zero) || \
+ stateof(X.default_nan_mode))
+
+#if defined(TARGET_ARM)
+ if (is_fpstatus(vfp.fp_status) || is_fpstatus(vfp.standard_fp_status))
+ return true;
+#elif defined(TARGET_I386)
+ if (is_fpstatus(fp_status))
+ return true;
+#endif
+ return false;
+
+#undef stateof
+#undef is_fpstatus
+}
+
+/* Check if the state is legal for state mapping. A legal state must have CPU
+ * as the base pointer, plus a non-negative constant offset. */
+bool StateMapper::isLegalState(Value *Ptr, intptr_t &Off)
+{
+ Value *Base = getBaseWithConstantOffset(DL, Ptr, Off);
+ if (Off < 0)
+ return false;
+ if (Base == CPU && !isSkipState(Off) && !IRFactory::isStateOfPC(Off))
+ return true;
+ return false;
+}
+
+/* Strip GEP and BitCast operations from a pointer. The stripped GEP and
+ * BitCast instructions are returned to the caller in IV. */
+void StateMapper::StripPointer(Value *V, IVec &IV)
+{
+ std::set<Value *> Visited;
+ Visited.insert(V);
+ do {
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(V)) {
+ IV.push_back(GEP);
+ V = GEP->getOperand(0);
+ } else if (BitCastInst *BCI = dyn_cast<BitCastInst>(V)) {
+ IV.push_back(BCI);
+ V = BCI->getOperand(0);
+ } else
+ return;
+ if (Visited.find(V) != Visited.end())
+ break;
+ Visited.insert(V);
+ } while (true);
+}
+
+/* Move the instructions that compute the pointer before PreCastPos. */
+void StateMapper::MoveStatePointer(Value *V)
+{
+ IVec toMove;
+ StripPointer(V, toMove);
+ while (!toMove.empty()) {
+ Instruction *I = toMove.back();
+ toMove.pop_back();
+ if (I->getParent() == CPU->getParent())
+ continue;
+ I->moveBefore(PreCastPos);
+ }
+}
+
+/* Copy state data from the source address to the destination address. */
+void StateMapper::CopyState(Value *Dest, Value *Src, Instruction *InsertPos)
+{
+ if (!isa<AllocaInst>(Src)) {
+ MoveStatePointer(Src);
+ LoadInst *LI = new LoadInst(Src, "", false, InsertPos);
+ new StoreInst(LI, Dest, false, InsertPos);
+
+ if (Src->getType()->getPointerElementType()->isVectorTy())
+ LI->setAlignment(4);
+ } else {
+ MoveStatePointer(Dest);
+ LoadInst *LI = new LoadInst(Src, "", false, InsertPos);
+ StoreInst *SI = new StoreInst(LI, Dest, false, InsertPos);
+
+ if (Dest->getType()->getPointerElementType()->isVectorTy())
+ SI->setAlignment(4);
+ }
+}
+
+/* Store dirty states at the leaf blocks. */
+void StateMapper::ProcessExitBB(BasicBlock *BB)
+{
+ Instruction *InsertPos = nullptr;
+ for (auto BI = BB->begin(), BE = BB->end(); BI != BE; ++BI) {
+ if (MDFactory::isExit(&*BI)) {
+ InsertPos = &*BI;
+ break;
+ }
+ }
+ if (!InsertPos)
+ InsertPos = BB->getTerminator();
+
+ for (auto &StateMap : StateMaps) {
+ if (!StateMap.hasStore || isLocalState(StateMap.getStart()))
+ continue;
+ CopyState(StateMap.Addr, StateMap.AI, InsertPos);
+ }
+}
+
+/* Sync CPU states around helper calls. */
+void StateMapper::SyncHelperState()
+{
+ CallList &Calls = Analyzer.getCalls();
+ if (Calls.empty())
+ return;
+
+ /*
+ * Rules of syncing states around calls:
+ * 1. Dirty states (i.e., stores) are written back before calls.
+     * 2. All mapped states, loaded or stored, are read back after calls.
+ *
+ * If the helper is a simple function, only dependent states are synced.
+ * If the helper is a complicated function, all states are synced.
+ */
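+    /* An illustrative sketch (assumed value names): for a state mapped to
+     * alloca %xmm0.a whose CPU address is %xmm0.addr, syncing around a call
+     * would look like:
+     *   %d = load <16 x i8>, <16 x i8>* %xmm0.a     ; write dirty copy back
+     *   store <16 x i8> %d, <16 x i8>* %xmm0.addr
+     *   call void @helper_foo(i8* %cpu)
+     *   %r = load <16 x i8>, <16 x i8>* %xmm0.addr  ; re-read state afterwards
+     *   store <16 x i8> %r, <16 x i8>* %xmm0.a
+     */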
+ HelperMap &Helpers = IF->getHelpers();
+ DenseMap<CallInst*, std::set<unsigned> > StoreBeforeCall;
+ DenseMap<CallInst*, std::set<unsigned> > LoadAfterCall;
+
+ for (auto CI : Calls) {
+ Function *Func = CI->getCalledFunction();
+ std::string Name = Func->getName();
+
+ if (isSimpleFunction(Func)) {
+            /* A pre-defined helper without nested calls. */
+ HelperInfo *Helper = Helpers[Name];
+ for (unsigned i = 0, e = StateMaps.size(); i != e; ++i) {
+ auto &StateMap = StateMaps[i];
+ if (StateMap.hasStore && StateMap.overlap(Helper->StateUse))
+ StoreBeforeCall[CI].insert(i);
+
+ if (StateMap.overlap(Helper->StateDef))
+ LoadAfterCall[CI].insert(i);
+
+ if (Helper->mayConflictArg) {
+ unsigned NumArgs = CI->getNumArgOperands();
+ for (unsigned j = 1; j < NumArgs; ++j) {
+ intptr_t Off = 0;
+ Value *Arg = CI->getArgOperand(j);
+ if (!isLegalState(Arg, Off))
+ continue;
+ if (Off + Helper->ConflictSize <= StateMap.getStart() ||
+ Off >= StateMap.getEnd())
+ continue;
+ if (StateMap.hasStore)
+ StoreBeforeCall[CI].insert(i);
+ LoadAfterCall[CI].insert(i);
+ }
+ }
+ }
+ } else {
+ /* Sync states for a complicated function (an unknown helper or a
+ * helper with nested calls). */
+ for (unsigned i = 0, e = StateMaps.size(); i != e; ++i) {
+ auto &StateMap = StateMaps[i];
+ if (StateMap.hasStore)
+ StoreBeforeCall[CI].insert(i);
+ LoadAfterCall[CI].insert(i);
+ }
+ }
+ }
+
+ /* Perform state syncing. */
+ for (auto CI : Calls) {
+ Instruction *InsertPos = CI;
+
+ if (!StoreBeforeCall.empty()) {
+ for (auto i : StoreBeforeCall[CI]) {
+ auto &StateMap = StateMaps[i];
+ CopyState(StateMap.Addr, StateMap.AI, InsertPos);
+ }
+ }
+
+ InsertPos = &*std::next(BasicBlock::iterator(InsertPos));
+ if (isa<UnreachableInst>(InsertPos)) {
+            /* No read-back is required after a tail call. */
+ continue;
+ }
+
+ if (!LoadAfterCall.empty()) {
+ for (auto i : LoadAfterCall[CI]) {
+ auto &StateMap = StateMaps[i];
+ CopyState(StateMap.AI, StateMap.Addr, InsertPos);
+ }
+ }
+ }
+}
+
+static inline bool isSameSize(StateMapping &StateMap, StateRef &Ref)
+{
+ return StateMap.getSize() == Ref.getSize();
+}
+
+/* Compute state and element types for element insertion and extraction. */
+void StateMapper::getElementInfo(StateMapping &StateMap, StateRef &Ref,
+ ElementInfo &Info)
+{
+ intptr_t StateSize = StateMap.getSize();
+ intptr_t Size = Ref.getSize();
+ intptr_t Shift = Ref.Start - StateMap.getStart();
+ Type *StateTy = StateMap.getType();
+ LLVMContext &Context = StateTy->getContext();
+
+ if (!StateMap.isVector()) {
+ /* Use int-N to emulate the state. */
+ Info.NumElts = 1;
+ Info.EltTy = Type::getIntNTy(Context, Size * 8);
+ Info.StateTy = Type::getIntNTy(Context, StateSize * 8);
+ Info.Shift = Shift;
+ return;
+ }
+
+ /* The state is emulated as a vector. */
+ if (StateSize % Size == 0 && Shift % Size == 0) {
+ Type *EltTy = Type::getIntNTy(Context, Size * 8);
+
+ Info.NumElts = 1;
+ Info.EltTy = EltTy;
+ Info.StateTy = VectorType::get(EltTy, StateSize / Size);
+ Info.Shift = Shift / Size;
+ } else {
+ VectorType *VecTy = cast<VectorType>(StateTy);
+ Type *EltTy = VecTy->getScalarType();
+ intptr_t EltSize = DL->getTypeSizeInBits(EltTy) / 8;
+
+ Info.NumElts = Size / EltSize;
+ Info.EltTy = VectorType::get(EltTy, Info.NumElts);
+ Info.StateTy = StateTy;
+ Info.Shift = Shift / EltSize;
+ }
+}
+
+void StateMapper::RewriteLoad(StateMapping &StateMap, StateRef &Ref)
+{
+ LoadInst *LI = cast<LoadInst>(Ref.I);
+ Type *Ty = LI->getType();
+ Instruction *InsertPos = LI;
+
+    /* The reference size matches the state size. */
+ if (isSameSize(StateMap, Ref)) {
+ Value *V = new LoadInst(StateMap.AI, "", false, InsertPos);
+ V = ConvertType(V, Ty, InsertPos);
+ LI->replaceAllUsesWith(V);
+ toErase.push_back(LI);
+ return;
+ }
+
+ if (StateMap.isVector()) {
+ RewriteLoadVector(StateMap, Ref);
+ return;
+ }
+
+    /* This is a non-vector state. Emulate the state as an int-N value and
+     * use logical shifts to extract/insert the element data. */
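+    /* Illustrative example: a 2-byte load at byte offset 2 of an 8-byte
+     * state becomes trunc(lshr(%state, 16)) to i16. */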
+ ElementInfo Info;
+ getElementInfo(StateMap, Ref, Info);
+
+ Value *V = new LoadInst(StateMap.AI, "", false, InsertPos);
+ V = ConvertType(V, Info.StateTy, InsertPos);
+
+ /* Extract the element. */
+ if (Info.Shift) {
+ Value *Shift = ConstantInt::get(V->getType(), Info.Shift * 8);
+ V = BinaryOperator::Create(Instruction::LShr, V, Shift, "", InsertPos);
+ }
+ V = new TruncInst(V, Info.EltTy, "", InsertPos);
+ V = ConvertType(V, Ty, InsertPos);
+
+ LI->replaceAllUsesWith(V);
+ toErase.push_back(LI);
+}
+
+void StateMapper::RewriteStore(StateMapping &StateMap, StateRef &Ref)
+{
+ StoreInst *SI = cast<StoreInst>(Ref.I);
+ Value *Data = SI->getValueOperand();
+ Instruction *InsertPos = SI;
+
+    /* The reference size matches the state size. */
+ if (isSameSize(StateMap, Ref)) {
+ Value *V = ConvertType(Data, StateMap.getType(), InsertPos);
+ new StoreInst(V, StateMap.AI, false, InsertPos);
+ toErase.push_back(SI);
+ return;
+ }
+
+ if (StateMap.isVector()) {
+ RewriteStoreVector(StateMap, Ref);
+ return;
+ }
+
+    /* This is a non-vector state. Emulate the state as an int-N value and
+     * use logical shifts to extract/insert the element data. */
+ ElementInfo Info;
+ getElementInfo(StateMap, Ref, Info);
+
+ Value *V = new LoadInst(StateMap.AI, "", false, InsertPos);
+ V = ConvertType(V, Info.StateTy, InsertPos);
+
+ /* Insert the element. */
+ Data = ConvertType(Data, Info.EltTy, InsertPos);
+ Data = new ZExtInst(Data, Info.StateTy, "", InsertPos);
+
+ if (Info.Shift) {
+ Value *Shift = ConstantInt::get(Data->getType(), Info.Shift * 8);
+ Data = BinaryOperator::Create(Instruction::Shl, Data, Shift, "", InsertPos);
+ }
+
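+    /* Build a mask that clears the bits [loBit, hiBit) occupied by the
+     * element, so the new data can be OR'ed in. E.g., an i16 store at byte
+     * offset 2 of an i64 state uses the mask 0xFFFFFFFF0000FFFF. */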
+ unsigned numBits = StateMap.getSize() * 8;
+ unsigned loBit = Info.Shift * 8, hiBit = loBit + Ref.getSize() * 8;
+ APInt mask = ~APInt::getBitsSet(numBits, loBit, hiBit);
+ Value *Mask = ConstantInt::get(Data->getContext(), mask);
+
+ V = BinaryOperator::Create(Instruction::And, V, Mask, "", InsertPos);
+ V = BinaryOperator::Create(Instruction::Or, V, Data, "", InsertPos);
+ V = ConvertType(V, StateMap.getType(), InsertPos);
+
+ new StoreInst(V, StateMap.AI, false, InsertPos);
+ toErase.push_back(SI);
+}
+
+void StateMapper::RewriteLoadVector(StateMapping &StateMap, StateRef &Ref)
+{
+ LoadInst *LI = cast<LoadInst>(Ref.I);
+ Type *Ty = LI->getType();
+ Instruction *InsertPos = LI;
+
+ /* Compute offset, size and element type of this vector operation. */
+ ElementInfo Info;
+ getElementInfo(StateMap, Ref, Info);
+
+ Value *V = new LoadInst(StateMap.AI, "", false, InsertPos);
+ V = ConvertType(V, Info.StateTy, InsertPos);
+
+ /* Extract the element(s) from the vector value. */
+ IntegerType *I32 = IntegerType::get(V->getContext(), 32);
+
+ if (Info.EltTy->isVectorTy()) {
+ /* Multiple elements to load. Use shufflevector. */
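+        /* Illustrative example: an 8-byte load at byte offset 8 of a
+         * <4 x i32> state yields NumElts = 2 and Shift = 2, i.e. the
+         * shuffle mask <2, 3>. */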
+ Value *UndefVal = UndefValue::get(Info.StateTy);
+ SmallVector<Constant*, 8> Indices;
+        for (unsigned i = 0, e = Info.NumElts; i != e; ++i)
+ Indices.push_back(ConstantInt::get(I32, Info.Shift + i));
+ Value *CV = ConstantVector::get(Indices);
+ V = new ShuffleVectorInst(V, UndefVal, CV, "", InsertPos);
+ } else {
+ /* Only one element. Use extractelement. */
+ V = ExtractElementInst::Create(V,
+ ConstantInt::get(I32, Info.Shift), "", InsertPos);
+ }
+
+ V = ConvertType(V, Ty, InsertPos);
+
+ LI->replaceAllUsesWith(V);
+ toErase.push_back(LI);
+}
+
+void StateMapper::RewriteStoreVector(StateMapping &StateMap, StateRef &Ref)
+{
+ StoreInst *SI = cast<StoreInst>(Ref.I);
+ Value *Data = SI->getValueOperand();
+ Instruction *InsertPos = SI;
+
+ /* Compute offset, size and element type of this vector operation. */
+ ElementInfo Info;
+ getElementInfo(StateMap, Ref, Info);
+
+ Value *V = new LoadInst(StateMap.AI, "", false, InsertPos);
+ V = ConvertType(V, Info.StateTy, InsertPos);
+ Data = ConvertType(Data, Info.EltTy, InsertPos);
+
+    /* Extract the element(s) from the data and insert them into the vector
+     * value. */
+ IntegerType *I32 = IntegerType::get(V->getContext(), 32);
+
+ if (Info.EltTy->isVectorTy()) {
+ SmallVector<Value *, 8> Partial;
+ for (unsigned i = 0, e = Info.NumElts; i != e; ++i) {
+ Partial.push_back(ExtractElementInst::Create(Data,
+ ConstantInt::get(I32, i), "", InsertPos));
+ }
+ for (unsigned i = 0, e = Info.NumElts; i != e; ++i) {
+ V = InsertElementInst::Create(V, Partial[i],
+ ConstantInt::get(I32, Info.Shift + i), "", InsertPos);
+ }
+ } else {
+ /* Only one element. Use insertelement. */
+ V = InsertElementInst::Create(V, Data,
+ ConstantInt::get(I32, Info.Shift), "", InsertPos);
+ }
+
+ V = ConvertType(V, StateMap.getType(), InsertPos);
+
+ new StoreInst(V, StateMap.AI, false, InsertPos);
+ toErase.push_back(SI);
+}
+
+/* Map state references to the virtual states. */
+void StateMapper::PromoteState(Function &F)
+{
+ /* Pre-load CPU states. */
+ Type *IntPtrTy = DL->getIntPtrType(CPU->getContext());
+ for (auto &StateMap : StateMaps) {
+ if (!StateMap.Addr) {
+ Value *Off = ConstantInt::get(IntPtrTy, StateMap.getStart());
+ Value *GEP = GetElementPtrInst::CreateInBounds(CPU, Off, "",
+ PreCastPos);
+ StateMap.Addr = new BitCastInst(GEP,
+ PointerType::getUnqual(StateMap.getType()), "",
+ PreCastPos);
+ }
+
+ std::string StateName = StateMap.Addr->getName();
+ if (StateName == "")
+ StateName = getStateName(StateMap.getStart());
+ if (StateName == "")
+ StateName = "state";
+ StateName.append(".a");
+
+ StateMap.AI = CreateAlloca(StateMap.getType(), 0, StateName, PreCastPos);
+ CopyState(StateMap.AI, StateMap.Addr, PreLoadPos);
+ }
+
+ /* Rewrite loads and stores. */
+ for (auto &StateMap : StateMaps) {
+ for (auto Ref : StateMap.State->Refs) {
+ if (isa<LoadInst>(Ref->I))
+ RewriteLoad(StateMap, *Ref);
+ else
+ RewriteStore(StateMap, *Ref);
+ }
+ }
+
+ /* Sync CPU states around helper calls. */
+ SyncHelperState();
+
+ /* Post-store dirty values back to CPU states for each exiting block. */
+ for (auto BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
+ BasicBlock *BB = &*BI;
+ if (distance(succ_begin(BB), succ_end(BB)) == 0) /* leaf node */
+ ProcessExitBB(BB);
+ }
+}
+
+/* Determine if the state can be operated on as a vector. */
+Type *StateMapper::TryVectorState(StateData &State, Type *Ty)
+{
+ intptr_t StateStart = State.Start;
+ intptr_t StateEnd = State.End;
+ intptr_t StateSize = StateEnd - StateStart;
+
+ /* If the reference type (from the IR) is already a vector type, use it.
+ * Otherwise, query StateType to determine if it is a vector state. */
+ VectorType *VecTy = dyn_cast<VectorType>(Ty);
+ if (!VecTy) {
+ auto TI = --StateType.upper_bound(StateStart);
+ for (; TI != StateType.end() && TI->first < StateEnd; ++TI) {
+ if (TI->second->isVectorTy()) {
+ VecTy = cast<VectorType>(TI->second);
+ break;
+ }
+ }
+ }
+
+ if (!VecTy)
+ return nullptr;
+
+    /* This is a vector state. Now check whether all state refs can be
+     * composed from whole vector elements: (a) the state size is a multiple
+     * of the vector element size, and (b) the size and shift of each state
+     * ref are both multiples of the vector element size. */
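+    /* E.g., for a 16-byte state typed <4 x i32> (4-byte elements), refs of
+     * size 4, 8 or 16 at 4-byte-aligned offsets qualify. */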
+ Type *ElementTy = VecTy->getScalarType();
+ intptr_t ElementSize = DL->getTypeSizeInBits(ElementTy) / 8;
+ if (StateSize % ElementSize != 0)
+ return nullptr;
+
+ for (auto Ref : State.Refs) {
+ if (Ref->getSize() % ElementSize != 0 ||
+ (Ref->Start - StateStart) % ElementSize != 0)
+ return nullptr;
+ }
+ return VectorType::get(ElementTy, StateSize / ElementSize);
+}
+
+/* Compute state mapping information based on the state mapping rules. */
+void StateMapper::ComputeStateMap(StateMapping &StateMap, StateData &State)
+{
+    /* State mapping rules:
+     * - A guest state is not overlapped (i.e., all refs have the same
+     *   access size):
+     *   - Same type: map to this type.
+     *   - Different types: select the type in the order: vector, float and
+     *     integer; use bitcast to convert between the types.
+     * - A guest state overlaps with other state(s):
+     *   - Query StateType to find the state size (i.e., boundary) and type:
+     *     - Vector type: use insert/extract to manipulate a vector element.
+     *     - Other types: use shifts to manipulate a sub-register element. */
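+    /* E.g. (illustrative): if a 16-byte state is accessed both as <4 x i32>
+     * and as i32, the refs overlap with different sizes, so the state is
+     * mapped to a <4 x i32> vector and each i32 ref is rewritten with
+     * extractelement/insertelement. */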
+ bool sameSize = true;
+ bool hasLoad = false;
+ bool hasStore = false;
+
+ for (auto Ref : State.Refs) {
+ hasLoad |= isa<LoadInst>(Ref->I);
+ hasStore |= isa<StoreInst>(Ref->I);
+ }
+
+ StateRef *Ref = State.Refs.front();
+ Type *Ty = Ref->getType();
+ Value *Addr = getPointerOperand(Ref->I);
+ intptr_t Size = Ref->getSize();
+
+ for (unsigned i = 1, e = State.Refs.size(); i != e; ++i) {
+ StateRef *NextRef = State.Refs[i];
+ Type *NextTy = NextRef->getType();
+ Value *NextAddr = getPointerOperand(NextRef->I);
+
+ /* Check type. */
+ if (Ty != NextTy) {
+ /* Select type in the order: vector, float and integer. */
+ bool Swap = false;
+ if (Ty->isVectorTy() && NextTy->isVectorTy()) {
+                /* We prefer the vector type with the smaller element type. */
+ Type *ATy = cast<VectorType>(Ty)->getScalarType();
+ Type *BTy = cast<VectorType>(NextTy)->getScalarType();
+ if (DL->getTypeSizeInBits(BTy) < DL->getTypeSizeInBits(ATy))
+ Swap = true;
+ } else if (!Ty->isVectorTy() && NextTy->isVectorTy()) {
+ Swap = true;
+ } else if (Ty->isIntegerTy() && NextTy->isFloatTy()) {
+ Swap = true;
+ }
+
+ if (Swap) {
+ std::swap(Ty, NextTy);
+ std::swap(Addr, NextAddr);
+ }
+ }
+
+ /* Check size. */
+ if (Size != NextRef->getSize())
+ sameSize = false;
+ }
+
+ if (sameSize) {
+        /* All references have the same size as the state. */
+ StateMap.Ty = Ty;
+ StateMap.Addr = Addr;
+ } else {
+ /* Different reference sizes. */
+ intptr_t StateSize = State.End - State.Start;
+ Type *VecTy = TryVectorState(State, Ty);
+ StateMap.Ty = VecTy ? VecTy
+ : Type::getIntNTy(Ty->getContext(), StateSize * 8);
+ StateMap.Addr = nullptr;
+ }
+ StateMap.State = &State;
+ StateMap.hasLoad = hasLoad;
+ StateMap.hasStore = hasStore;
+}
+
+/* Analyze instructions in a Function that access CPU states. */
+void StateMapper::AnalyzeState(Function &F)
+{
+    /* Collect the instructions (load/store/call) that access CPU states.
+     * Loads/stores that access guest memory, or that are marked volatile
+     * (e.g., accesses to the %pc and %tcg_exit_req states), are ignored. */
+
+ for (auto II = inst_begin(F), EE = inst_end(F); II != EE; ++II) {
+ Instruction *I = &*II;
+ intptr_t Off = 0;
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ if (MDFactory::isGuestMemory(I) || LI->isVolatile())
+ continue;
+
+ if (isLegalState(LI->getPointerOperand(), Off))
+ Analyzer.addStateRef(I, Off);
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ if (MDFactory::isGuestMemory(I) || SI->isVolatile())
+ continue;
+
+ if (isLegalState(SI->getPointerOperand(), Off))
+ Analyzer.addStateRef(I, Off);
+ } else if (CallInst *CI = dyn_cast<CallInst>(I)) {
+            /* Skip const helpers, inline asm and intrinsic function calls. */
+ if (MDFactory::isConst(CI))
+ continue;
+ if (CI->isInlineAsm() || isa<IntrinsicInst>(CI))
+ continue;
+
+ Analyzer.addCall(CI);
+ }
+ }
+
+ /* Ask Analyzer to put state references into groups. */
+ Analyzer.computeState();
+
+ StateList &States = Analyzer.getStateList();
+ if (States.empty())
+ return;
+
+ /* Compute state mapping info. */
+ StateMaps.resize(States.size());
+ for (unsigned i = 0, e = States.size(); i != e; ++i)
+ ComputeStateMap(StateMaps[i], States[i]);
+}
+
+/* Rearrange instructions in the 'init' block. */
+bool StateMapper::init(Function &F)
+{
+ /*
+     * Rearrange the instructions in the 'init' block so that the gep/cast
+     * instructions come before all the other instructions in the block.
+ * For example:
+ * %0 = getelementptr i8* %cpu, i64 0
+ * %1 = bitcast i8* %0 to i32* # gep/cast insns
+ * -------------------------------------- # precast_pos
+ * -------------------------------------- # preload_pos
+ * %2 = load i32, i32* %1 # the other insns
+ * br label %entry
+ */
+ CPU = IF->getDefaultCPU(F);
+ if (!CPU || CPU->getParent() != &F.getEntryBlock())
+ return false;
+
+ Instruction *InsertPos = &*std::next(BasicBlock::iterator(CPU));
+ PreLoadPos = new UnreachableInst(CPU->getContext(), InsertPos);
+ PreCastPos = new UnreachableInst(CPU->getContext(), PreLoadPos);
+
+ toErase.push_back(PreLoadPos);
+ toErase.push_back(PreCastPos);
+
+ /* Move gep/cast instructions. */
+ IVec toMove;
+ BasicBlock *BB = CPU->getParent();
+ for (auto BI = BB->begin(), BE = BB->end(); BI != BE; ++BI) {
+ Instruction *I = &*BI;
+ if (isStatePointer(I))
+ toMove.push_back(I);
+ }
+ for (auto I : toMove)
+ I->moveBefore(PreCastPos);
+
+ return true;
+}
+
+/*
+ * StateMappingPass
+ */
+bool StateMappingPass::runOnFunction(Function &F)
+{
+ return StateMapper(IF).run(F);
+}
+
+char StateMappingPass::ID = 0;
+INITIALIZE_PASS(StateMappingPass, "statemap",
+ "Eliminate redundant loads/stores by mapping CPU states", false, false)
+
+FunctionPass *llvm::createStateMappingPass(IRFactory *IF)
+{
+ return new StateMappingPass(IF);
+}
+
+/*
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */