summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorneel <neel@FreeBSD.org>2012-11-28 00:02:17 +0000
committerneel <neel@FreeBSD.org>2012-11-28 00:02:17 +0000
commit36ab9a2e1ab7d2b1884270275584f989cfd65e2b (patch)
tree8cf47855850351281fdc988a8a65fb469fbfb73f
parent76430d4106c5cb90837c5f728272d9f249e085c4 (diff)
downloadFreeBSD-src-36ab9a2e1ab7d2b1884270275584f989cfd65e2b.zip
FreeBSD-src-36ab9a2e1ab7d2b1884270275584f989cfd65e2b.tar.gz
Revamp the x86 instruction emulation in bhyve.
On a nested page table fault the hypervisor will:
- fetch the instruction using the guest %rip and %cr3
- decode the instruction in 'struct vie'
- emulate the instruction in host kernel context for local apic accesses
- any other type of mmio access is punted up to user-space (e.g. ioapic)

The decoded instruction is passed as collateral to the user-space process
that is handling the PAGING exit.

The emulation code is fleshed out to include more addressing modes (e.g. SIB)
and more types of operands (e.g. imm8).

The source code is unified into a single file (vmm_instruction_emul.c) that
is compiled into vmm.ko as well as /usr/sbin/bhyve.

Reviewed by: grehan
Obtained from: NetApp
-rw-r--r--sys/amd64/include/vmm.h3
-rw-r--r--sys/amd64/include/vmm_instruction_emul.h113
-rw-r--r--sys/amd64/vmm/intel/vmcs.h1
-rw-r--r--sys/amd64/vmm/intel/vmx.c45
-rw-r--r--sys/amd64/vmm/vmm_instruction_emul.c481
-rw-r--r--sys/amd64/vmm/vmm_instruction_emul.h91
-rw-r--r--sys/amd64/vmm/vmm_lapic.c83
-rw-r--r--sys/amd64/vmm/vmm_lapic.h6
-rw-r--r--usr.sbin/bhyve/Makefile5
-rw-r--r--usr.sbin/bhyve/fbsdrun.c4
-rw-r--r--usr.sbin/bhyve/instruction_emul.c641
-rw-r--r--usr.sbin/bhyve/instruction_emul.h36
-rw-r--r--usr.sbin/bhyve/ioapic.c1
-rw-r--r--usr.sbin/bhyve/mem.c51
-rw-r--r--usr.sbin/bhyve/mem.h2
-rw-r--r--usr.sbin/bhyve/pci_passthru.c1
16 files changed, 649 insertions, 915 deletions
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index 8f78b8f..2fb2194 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -150,6 +150,8 @@ void vm_interrupt_hostcpu(struct vm *vm, int vcpu);
#endif /* KERNEL */
+#include <machine/vmm_instruction_emul.h>
+
#define VM_MAXCPU 8 /* maximum virtual cpus */
/*
@@ -268,6 +270,7 @@ struct vm_exit {
uint64_t cr3;
uint64_t gpa;
int rwx;
+ struct vie vie;
} paging;
/*
* VMX specific payload. Used when there is no "better"
diff --git a/sys/amd64/include/vmm_instruction_emul.h b/sys/amd64/include/vmm_instruction_emul.h
new file mode 100644
index 0000000..4cc494b
--- /dev/null
+++ b/sys/amd64/include/vmm_instruction_emul.h
@@ -0,0 +1,113 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_INSTRUCTION_EMUL_H_
+#define _VMM_INSTRUCTION_EMUL_H_
+
+/*
+ * The data structures 'vie' and 'vie_op' are meant to be opaque to the
+ * consumers of instruction decoding. The only reason why their contents
+ * need to be exposed is because they are part of the 'vm_exit' structure.
+ */
+struct vie_op {
+ uint8_t op_byte; /* actual opcode byte */
+ uint8_t op_type; /* type of operation (e.g. MOV) */
+ uint16_t op_flags;
+};
+
+#define VIE_INST_SIZE 15
+struct vie {
+ uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */
+ uint8_t num_valid; /* size of the instruction */
+ uint8_t num_processed;
+
+ uint8_t rex_w:1, /* REX prefix */
+ rex_r:1,
+ rex_x:1,
+ rex_b:1;
+
+ uint8_t mod:2, /* ModRM byte */
+ reg:4,
+ rm:4;
+
+ uint8_t ss:2, /* SIB byte */
+ index:4,
+ base:4;
+
+ uint8_t disp_bytes;
+ uint8_t imm_bytes;
+
+ uint8_t scale;
+ int base_register; /* VM_REG_GUEST_xyz */
+ int index_register; /* VM_REG_GUEST_xyz */
+
+ int64_t displacement; /* optional addr displacement */
+ int64_t immediate; /* optional immediate operand */
+
+ uint8_t decoded; /* set to 1 if successfully decoded */
+
+ struct vie_op op; /* opcode description */
+};
+
+/*
+ * Callback functions to read and write memory regions.
+ */
+typedef int (*mem_region_read_t)(void *vm, int cpuid, uint64_t gpa,
+ uint64_t *rval, int rsize, void *arg);
+
+typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa,
+ uint64_t wval, int wsize, void *arg);
+
+/*
+ * Emulate the decoded 'vie' instruction.
+ *
+ * The callbacks 'mrr' and 'mrw' emulate reads and writes to the memory region
+ * containing 'gpa'. 'mrarg' is an opaque argument that is passed into the
+ * callback functions.
+ *
+ * 'void *vm' should be 'struct vm *' when called from kernel context and
+ * 'struct vmctx *' when called from user context.
+ *
+ */
+int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t mrr, mem_region_write_t mrw,
+ void *mrarg);
+
+#ifdef _KERNEL
+/*
+ * APIs to fetch and decode the instruction from nested page fault handler.
+ */
+int vmm_fetch_instruction(struct vm *vm, int cpuid,
+ uint64_t rip, int inst_length, uint64_t cr3,
+ struct vie *vie);
+
+int vmm_decode_instruction(struct vm *vm, int cpuid,
+ uint64_t gla, struct vie *vie);
+#endif /* _KERNEL */
+
+#endif /* _VMM_INSTRUCTION_EMUL_H_ */
diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h
index 84532f4..f39eed2 100644
--- a/sys/amd64/vmm/intel/vmcs.h
+++ b/sys/amd64/vmm/intel/vmcs.h
@@ -67,6 +67,7 @@ uint64_t vmcs_read(uint32_t encoding);
#define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION)
#define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3)
#define vmcs_gpa() vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS)
+#define vmcs_gla() vmcs_read(VMCS_GUEST_LINEAR_ADDRESS)
#endif /* _KERNEL */
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
index 7a9cfb8..b185c57 100644
--- a/sys/amd64/vmm/intel/vmx.c
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -63,7 +63,6 @@ __FBSDID("$FreeBSD$");
#include "vmx.h"
#include "x86.h"
#include "vmx_controls.h"
-#include "vmm_instruction_emul.h"
#define PINBASED_CTLS_ONE_SETTING \
(PINBASED_EXTINT_EXITING | \
@@ -1150,23 +1149,11 @@ vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
}
static int
-vmx_lapic_fault(struct vm *vm, int cpu,
- uint64_t gpa, uint64_t rip, int inst_length,
- uint64_t cr3, uint64_t ept_qual)
+vmx_ept_fault(struct vm *vm, int cpu,
+ uint64_t gla, uint64_t gpa, uint64_t rip, int inst_length,
+ uint64_t cr3, uint64_t ept_qual, struct vie *vie)
{
- int read, write, handled;
- struct vie vie;
-
- /*
- * For this to be a legitimate access to the local apic:
- * - the GPA in the local apic page
- * - the GPA must be aligned on a 16 byte boundary
- */
- if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE)
- return (UNHANDLED);
-
- if ((gpa & 0xF) != 0)
- return (UNHANDLED);
+ int read, write, error;
/* EPT violation on an instruction fetch doesn't make sense here */
if (ept_qual & EPT_VIOLATION_INST_FETCH)
@@ -1188,15 +1175,22 @@ vmx_lapic_fault(struct vm *vm, int cpu,
}
/* Fetch, decode and emulate the faulting instruction */
- if (vmm_fetch_instruction(vm, rip, inst_length, cr3, &vie) != 0)
+ if (vmm_fetch_instruction(vm, cpu, rip, inst_length, cr3, vie) != 0)
return (UNHANDLED);
- if (vmm_decode_instruction(&vie) != 0)
+ if (vmm_decode_instruction(vm, cpu, gla, vie) != 0)
return (UNHANDLED);
- handled = lapic_mmio(vm, cpu, gpa - DEFAULT_APIC_BASE, read, &vie);
+ /*
+ * Check if this is a local apic access
+ */
+ if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE)
+ return (UNHANDLED);
- return (handled);
+ error = vmm_emulate_instruction(vm, cpu, gpa, vie,
+ lapic_mmio_read, lapic_mmio_write, 0);
+
+ return (error ? UNHANDLED : HANDLED);
}
static int
@@ -1206,7 +1200,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
struct vmcs *vmcs;
struct vmxctx *vmxctx;
uint32_t eax, ecx, edx;
- uint64_t qual, gpa, cr3, intr_info;
+ uint64_t qual, gla, gpa, cr3, intr_info;
handled = 0;
vmcs = &vmx->vmcs[vcpu];
@@ -1299,11 +1293,12 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
break;
case EXIT_REASON_EPT_FAULT:
+ gla = vmcs_gla();
gpa = vmcs_gpa();
cr3 = vmcs_guest_cr3();
- handled = vmx_lapic_fault(vmx->vm, vcpu,
- gpa, vmexit->rip, vmexit->inst_length,
- cr3, qual);
+ handled = vmx_ept_fault(vmx->vm, vcpu, gla, gpa,
+ vmexit->rip, vmexit->inst_length,
+ cr3, qual, &vmexit->u.paging.vie);
if (!handled) {
vmexit->exitcode = VM_EXITCODE_PAGING;
vmexit->u.paging.cr3 = cr3;
diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c
index 7ef4dbb..5e5399b 100644
--- a/sys/amd64/vmm/vmm_instruction_emul.c
+++ b/sys/amd64/vmm/vmm_instruction_emul.c
@@ -30,6 +30,7 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#ifdef _KERNEL
#include <sys/param.h>
#include <sys/pcpu.h>
#include <sys/systm.h>
@@ -40,10 +41,60 @@ __FBSDID("$FreeBSD$");
#include <machine/pmap.h>
#include <machine/vmparam.h>
#include <machine/vmm.h>
+#else /* !_KERNEL */
+#include <sys/types.h>
+#include <sys/errno.h>
-#include "vmm_instruction_emul.h"
+#include <machine/vmm.h>
+
+#include <vmmapi.h>
+#endif /* _KERNEL */
+
+
+
+/* struct vie_op.op_type */
+enum {
+ VIE_OP_TYPE_NONE = 0,
+ VIE_OP_TYPE_MOV,
+ VIE_OP_TYPE_AND,
+ VIE_OP_TYPE_LAST
+};
+
+/* struct vie_op.op_flags */
+#define VIE_OP_F_IMM (1 << 0) /* immediate operand present */
+#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */
+
+static const struct vie_op one_byte_opcodes[256] = {
+ [0x89] = {
+ .op_byte = 0x89,
+ .op_type = VIE_OP_TYPE_MOV,
+ },
+ [0x8B] = {
+ .op_byte = 0x8B,
+ .op_type = VIE_OP_TYPE_MOV,
+ },
+ [0xC7] = {
+ .op_byte = 0xC7,
+ .op_type = VIE_OP_TYPE_MOV,
+ .op_flags = VIE_OP_F_IMM,
+ },
+ [0x23] = {
+ .op_byte = 0x23,
+ .op_type = VIE_OP_TYPE_AND,
+ }
+};
+
+/* struct vie.mod */
+#define VIE_MOD_INDIRECT 0
+#define VIE_MOD_INDIRECT_DISP8 1
+#define VIE_MOD_INDIRECT_DISP32 2
+#define VIE_MOD_DIRECT 3
-#define GB (1024 * 1024 * 1024)
+/* struct vie.rm */
+#define VIE_RM_SIB 4
+#define VIE_RM_DISP32 5
+
+#define GB (1024 * 1024 * 1024)
static enum vm_reg_name gpr_map[16] = {
VM_REG_GUEST_RAX,
@@ -64,17 +115,232 @@ static enum vm_reg_name gpr_map[16] = {
VM_REG_GUEST_R15
};
+static uint64_t size2mask[] = {
+ [1] = 0xff,
+ [2] = 0xffff,
+ [4] = 0xffffffff,
+ [8] = 0xffffffffffffffff,
+};
+
+static int
+vie_valid_register(enum vm_reg_name reg)
+{
+#ifdef _KERNEL
+ /*
+ * XXX
+ * The operand register in which we store the result of the
+ * read must be a GPR that we can modify even if the vcpu
+ * is "running". All the GPRs qualify except for %rsp.
+ *
+ * This is a limitation of the vm_set_register() API
+ * and can be fixed if necessary.
+ */
+ if (reg == VM_REG_GUEST_RSP)
+ return (0);
+#endif
+ return (1);
+}
+
+static int
+vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
+{
+ int error;
+
+ if (!vie_valid_register(reg))
+ return (EINVAL);
+
+ error = vm_get_register(vm, vcpuid, reg, rval);
+
+ return (error);
+}
+
+static int
+vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
+ uint64_t val, int size)
+{
+ int error;
+ uint64_t origval;
+
+ if (!vie_valid_register(reg))
+ return (EINVAL);
+
+ switch (size) {
+ case 1:
+ case 2:
+ error = vie_read_register(vm, vcpuid, reg, &origval);
+ if (error)
+ return (error);
+ val &= size2mask[size];
+ val |= origval & ~size2mask[size];
+ break;
+ case 4:
+ val &= 0xffffffffUL;
+ break;
+ case 8:
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ error = vm_set_register(vm, vcpuid, reg, val);
+ return (error);
+}
+
+/*
+ * The following simplifying assumptions are made during emulation:
+ *
+ * - guest is in 64-bit mode
+ * - default address size is 64-bits
+ * - default operand size is 32-bits
+ *
+ * - operand size override is not supported
+ *
+ * - address size override is not supported
+ */
+static int
+emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
+{
+ int error, size;
+ enum vm_reg_name reg;
+ uint64_t val;
+
+ size = 4;
+ error = EINVAL;
+
+ switch (vie->op.op_byte) {
+ case 0x89:
+ /*
+ * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
+ * 89/r: mov r/m32, r32
+ * REX.W + 89/r mov r/m64, r64
+ */
+ if (vie->rex_w)
+ size = 8;
+ reg = gpr_map[vie->reg];
+ error = vie_read_register(vm, vcpuid, reg, &val);
+ if (error == 0) {
+ val &= size2mask[size];
+ error = memwrite(vm, vcpuid, gpa, val, size, arg);
+ }
+ break;
+ case 0x8B:
+ /*
+ * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
+ * 8B/r: mov r32, r/m32
+ * REX.W 8B/r: mov r64, r/m64
+ */
+ if (vie->rex_w)
+ size = 8;
+ error = memread(vm, vcpuid, gpa, &val, size, arg);
+ if (error == 0) {
+ reg = gpr_map[vie->reg];
+ error = vie_update_register(vm, vcpuid, reg, val, size);
+ }
+ break;
+ case 0xC7:
+ /*
+ * MOV from imm32 to mem (ModRM:r/m)
+ * C7/0 mov r/m32, imm32
+ * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits)
+ */
+ val = vie->immediate; /* already sign-extended */
+
+ if (vie->rex_w)
+ size = 8;
+
+ if (size != 8)
+ val &= size2mask[size];
+
+ error = memwrite(vm, vcpuid, gpa, val, size, arg);
+ break;
+ default:
+ break;
+ }
+
+ return (error);
+}
+
+static int
+emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
+{
+ int error, size;
+ enum vm_reg_name reg;
+ uint64_t val1, val2;
+
+ size = 4;
+ error = EINVAL;
+
+ switch (vie->op.op_byte) {
+ case 0x23:
+ /*
+ * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
+ * result in reg.
+ *
+ * 23/r and r32, r/m32
+ * REX.W + 23/r and r64, r/m64
+ */
+ if (vie->rex_w)
+ size = 8;
+
+ /* get the first operand */
+ reg = gpr_map[vie->reg];
+ error = vie_read_register(vm, vcpuid, reg, &val1);
+ if (error)
+ break;
+
+ /* get the second operand */
+ error = memread(vm, vcpuid, gpa, &val2, size, arg);
+ if (error)
+ break;
+
+ /* perform the operation and write the result */
+ val1 &= val2;
+ error = vie_update_register(vm, vcpuid, reg, val1, size);
+ break;
+ default:
+ break;
+ }
+ return (error);
+}
+
+int
+vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite,
+ void *memarg)
+{
+ int error;
+
+ if (!vie->decoded)
+ return (EINVAL);
+
+ switch (vie->op.op_type) {
+ case VIE_OP_TYPE_MOV:
+ error = emulate_mov(vm, vcpuid, gpa, vie,
+ memread, memwrite, memarg);
+ break;
+ case VIE_OP_TYPE_AND:
+ error = emulate_and(vm, vcpuid, gpa, vie,
+ memread, memwrite, memarg);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ return (error);
+}
+
+#ifdef _KERNEL
static void
vie_init(struct vie *vie)
{
bzero(vie, sizeof(struct vie));
- vie->op_size = VIE_OP_SIZE_32BIT;
-
vie->base_register = VM_REG_LAST;
vie->index_register = VM_REG_LAST;
- vie->operand_register = VM_REG_LAST;
}
static int
@@ -129,7 +395,7 @@ error:
}
int
-vmm_fetch_instruction(struct vm *vm, uint64_t rip, int inst_length,
+vmm_fetch_instruction(struct vm *vm, int cpuid, uint64_t rip, int inst_length,
uint64_t cr3, struct vie *vie)
{
int n, err;
@@ -172,6 +438,7 @@ vmm_fetch_instruction(struct vm *vm, uint64_t rip, int inst_length,
static int
vie_peek(struct vie *vie, uint8_t *x)
{
+
if (vie->num_processed < vie->num_valid) {
*x = vie->inst[vie->num_processed];
return (0);
@@ -182,8 +449,6 @@ vie_peek(struct vie *vie, uint8_t *x)
static void
vie_advance(struct vie *vie)
{
- if (vie->num_processed >= vie->num_valid)
- panic("vie_advance: %d/%d", vie->num_processed, vie->num_valid);
vie->num_processed++;
}
@@ -213,24 +478,16 @@ decode_opcode(struct vie *vie)
{
uint8_t x;
- static const uint8_t flags[256] = {
- [0x89] = VIE_F_HAS_MODRM | VIE_F_FROM_REG | VIE_F_TO_RM,
- [0x8B] = VIE_F_HAS_MODRM | VIE_F_FROM_RM | VIE_F_TO_REG,
- [0xC7] = VIE_F_HAS_MODRM | VIE_F_FROM_IMM | VIE_F_TO_RM,
- };
-
if (vie_peek(vie, &x))
return (-1);
- vie->opcode_byte = x;
- vie->opcode_flags = flags[x];
+ vie->op = one_byte_opcodes[x];
- vie_advance(vie);
-
- if (vie->opcode_flags == 0)
+ if (vie->op.op_type == VIE_OP_TYPE_NONE)
return (-1);
- else
- return (0);
+
+ vie_advance(vie);
+ return (0);
}
/*
@@ -241,9 +498,6 @@ decode_modrm(struct vie *vie)
{
uint8_t x;
- if ((vie->opcode_flags & VIE_F_HAS_MODRM) == 0)
- return (0);
-
if (vie_peek(vie, &x))
return (-1);
@@ -251,35 +505,40 @@ decode_modrm(struct vie *vie)
vie->rm = (x >> 0) & 0x7;
vie->reg = (x >> 3) & 0x7;
+ /*
+ * A direct addressing mode makes no sense in the context of an EPT
+ * fault. There has to be a memory access involved to cause the
+ * EPT fault.
+ */
+ if (vie->mod == VIE_MOD_DIRECT)
+ return (-1);
+
if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
(vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
- /*
- * Table 2-5: Special Cases of REX Encodings
- *
- * mod=0, r/m=5 is used in the compatibility mode to
- * indicate a disp32 without a base register.
- *
- * mod!=3, r/m=4 is used in the compatibility mode to
- * indicate that the SIB byte is present.
- *
- * The 'b' bit in the REX prefix is don't care in
- * this case.
- */
+ /*
+ * Table 2-5: Special Cases of REX Encodings
+ *
+ * mod=0, r/m=5 is used in the compatibility mode to
+ * indicate a disp32 without a base register.
+ *
+ * mod!=3, r/m=4 is used in the compatibility mode to
+ * indicate that the SIB byte is present.
+ *
+ * The 'b' bit in the REX prefix is don't care in
+ * this case.
+ */
} else {
vie->rm |= (vie->rex_b << 3);
}
vie->reg |= (vie->rex_r << 3);
- /* SIB addressing not supported yet */
+ /* SIB */
if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
- return (-1);
+ goto done;
vie->base_register = gpr_map[vie->rm];
- if (vie->opcode_flags & (VIE_F_FROM_REG | VIE_F_TO_REG))
- vie->operand_register = gpr_map[vie->reg];
-
switch (vie->mod) {
case VIE_MOD_INDIRECT_DISP8:
vie->disp_bytes = 1;
@@ -295,12 +554,76 @@ decode_modrm(struct vie *vie)
break;
}
- /* calculate the operand size */
- if (vie->rex_w)
- vie->op_size = VIE_OP_SIZE_64BIT;
-
- if (vie->opcode_flags & VIE_F_FROM_IMM)
+ /* Figure out immediate operand size (if any) */
+ if (vie->op.op_flags & VIE_OP_F_IMM)
vie->imm_bytes = 4;
+ else if (vie->op.op_flags & VIE_OP_F_IMM8)
+ vie->imm_bytes = 1;
+
+done:
+ vie_advance(vie);
+
+ return (0);
+}
+
+static int
+decode_sib(struct vie *vie)
+{
+ uint8_t x;
+
+ /* Proceed only if SIB byte is present */
+ if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
+ return (0);
+
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ /* De-construct the SIB byte */
+ vie->ss = (x >> 6) & 0x3;
+ vie->index = (x >> 3) & 0x7;
+ vie->base = (x >> 0) & 0x7;
+
+ /* Apply the REX prefix modifiers */
+ vie->index |= vie->rex_x << 3;
+ vie->base |= vie->rex_b << 3;
+
+ switch (vie->mod) {
+ case VIE_MOD_INDIRECT_DISP8:
+ vie->disp_bytes = 1;
+ break;
+ case VIE_MOD_INDIRECT_DISP32:
+ vie->disp_bytes = 4;
+ break;
+ }
+
+ if (vie->mod == VIE_MOD_INDIRECT &&
+ (vie->base == 5 || vie->base == 13)) {
+ /*
+ * Special case when base register is unused if mod = 0
+ * and base = %rbp or %r13.
+ *
+ * Documented in:
+ * Table 2-3: 32-bit Addressing Forms with the SIB Byte
+ * Table 2-5: Special Cases of REX Encodings
+ */
+ vie->disp_bytes = 4;
+ } else {
+ vie->base_register = gpr_map[vie->base];
+ }
+
+ /*
+ * All encodings of 'index' are valid except for %rsp (4).
+ *
+ * Documented in:
+ * Table 2-3: 32-bit Addressing Forms with the SIB Byte
+ * Table 2-5: Special Cases of REX Encodings
+ */
+ if (vie->index != 4)
+ vie->index_register = gpr_map[vie->index];
+
+ /* 'scale' makes sense only in the context of an index register */
+ if (vie->index_register < VM_REG_LAST)
+ vie->scale = 1 << vie->ss;
vie_advance(vie);
@@ -348,13 +671,14 @@ decode_immediate(struct vie *vie)
uint8_t x;
union {
char buf[4];
+ int8_t signed8;
int32_t signed32;
} u;
if ((n = vie->imm_bytes) == 0)
return (0);
- if (n != 4)
+ if (n != 1 && n != 4)
panic("decode_immediate: invalid imm_bytes %d", n);
for (i = 0; i < n; i++) {
@@ -365,14 +689,62 @@ decode_immediate(struct vie *vie)
vie_advance(vie);
}
- vie->immediate = u.signed32; /* sign-extended */
+ if (n == 1)
+ vie->immediate = u.signed8; /* sign-extended */
+ else
+ vie->immediate = u.signed32; /* sign-extended */
return (0);
}
+#define VERIFY_GLA
+/*
+ * Verify that the 'guest linear address' provided as collateral of the nested
+ * page table fault matches with our instruction decoding.
+ */
+#ifdef VERIFY_GLA
+static int
+verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
+{
+ int error;
+ uint64_t base, idx;
+
+ base = 0;
+ if (vie->base_register != VM_REG_LAST) {
+ error = vm_get_register(vm, cpuid, vie->base_register, &base);
+ if (error) {
+ printf("verify_gla: error %d getting base reg %d\n",
+ error, vie->base_register);
+ return (-1);
+ }
+ }
+
+ idx = 0;
+ if (vie->index_register != VM_REG_LAST) {
+ error = vm_get_register(vm, cpuid, vie->index_register, &idx);
+ if (error) {
+ printf("verify_gla: error %d getting index reg %d\n",
+ error, vie->index_register);
+ return (-1);
+ }
+ }
+
+ if (base + vie->scale * idx + vie->displacement != gla) {
+ printf("verify_gla mismatch: "
+ "base(0x%0lx), scale(%d), index(0x%0lx), "
+ "disp(0x%0lx), gla(0x%0lx)\n",
+ base, vie->scale, idx, vie->displacement, gla);
+ return (-1);
+ }
+
+ return (0);
+}
+#endif /* VERIFY_GLA */
+
int
-vmm_decode_instruction(struct vie *vie)
+vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
{
+
if (decode_rex(vie))
return (-1);
@@ -382,11 +754,22 @@ vmm_decode_instruction(struct vie *vie)
if (decode_modrm(vie))
return (-1);
+ if (decode_sib(vie))
+ return (-1);
+
if (decode_displacement(vie))
return (-1);
if (decode_immediate(vie))
return (-1);
+#ifdef VERIFY_GLA
+ if (verify_gla(vm, cpuid, gla, vie))
+ return (-1);
+#endif
+
+ vie->decoded = 1; /* success */
+
return (0);
}
+#endif /* _KERNEL */
diff --git a/sys/amd64/vmm/vmm_instruction_emul.h b/sys/amd64/vmm/vmm_instruction_emul.h
deleted file mode 100644
index 1fa9e2b..0000000
--- a/sys/amd64/vmm/vmm_instruction_emul.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/*-
- * Copyright (c) 2012 NetApp, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef _VMM_INSTRUCTION_EMUL_H_
-#define _VMM_INSTRUCTION_EMUL_H_
-
-enum vie_op_size {
- VIE_OP_SIZE_32BIT, /* default */
- VIE_OP_SIZE_64BIT,
- VIE_OP_SIZE_8BIT
-};
-
-#define VIE_INST_SIZE 15
-struct vie {
- uint8_t inst[VIE_INST_SIZE];
-
- uint8_t rex_w:1,
- rex_r:1,
- rex_x:1,
- rex_b:1;
-
- uint8_t mod:2,
- reg:4,
- rm:4;
-
-
- uint8_t opcode_byte;
- uint16_t opcode_flags;
- uint8_t disp_bytes;
- uint8_t imm_bytes;
-
- int num_valid;
- int num_processed;
-
- enum vm_reg_name base_register;
- enum vm_reg_name index_register;
- enum vm_reg_name operand_register;
-
- int op_size;
- int64_t displacement;
- int64_t immediate;
-};
-
-#define VIE_F_HAS_MODRM (1 << 0)
-#define VIE_F_FROM_RM (1 << 1)
-#define VIE_F_FROM_REG (1 << 2)
-#define VIE_F_TO_RM (1 << 3)
-#define VIE_F_TO_REG (1 << 4)
-#define VIE_F_FROM_IMM (1 << 5)
-
-#define VIE_MOD_INDIRECT 0
-#define VIE_MOD_INDIRECT_DISP8 1
-#define VIE_MOD_INDIRECT_DISP32 2
-#define VIE_MOD_DIRECT 3
-
-#define VIE_RM_SIB 4
-#define VIE_RM_DISP32 5
-
-struct vm;
-
-int vmm_fetch_instruction(struct vm *vm, uint64_t rip, int inst_length,
- uint64_t cr3, struct vie *vie);
-
-int vmm_decode_instruction(struct vie *vie);
-
-#endif
diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c
index bb22122..dabcf06 100644
--- a/sys/amd64/vmm/vmm_lapic.c
+++ b/sys/amd64/vmm/vmm_lapic.c
@@ -34,12 +34,12 @@ __FBSDID("$FreeBSD$");
#include <sys/smp.h>
#include <x86/specialreg.h>
+#include <x86/apicreg.h>
#include <machine/vmm.h>
#include "vmm_ipi.h"
#include "vmm_lapic.h"
#include "vlapic.h"
-#include "vmm_instruction_emul.h"
static int
lapic_write(struct vlapic *vlapic, u_int offset, uint64_t val)
@@ -177,64 +177,45 @@ lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t val)
}
int
-lapic_mmio(struct vm *vm, int cpu, u_int offset, int read, struct vie *vie)
+lapic_mmio_write(void *vm, int cpu, uint64_t gpa, uint64_t wval, int size,
+ void *arg)
{
- int handled, error;
- uint64_t val;
+ int error;
+ uint64_t off;
struct vlapic *vlapic;
- const int UNHANDLED = 0;
+ off = gpa - DEFAULT_APIC_BASE;
+
+ /*
+ * Memory mapped local apic accesses must be 4 bytes wide and
+ * aligned on a 16-byte boundary.
+ */
+ if (size != 4 || off & 0xf)
+ return (EINVAL);
vlapic = vm_lapic(vm, cpu);
+ error = vlapic_op_mem_write(vlapic, off, DWORD, wval);
+ return (error);
+}
+
+int
+lapic_mmio_read(void *vm, int cpu, uint64_t gpa, uint64_t *rval, int size,
+ void *arg)
+{
+ int error;
+ uint64_t off;
+ struct vlapic *vlapic;
- /* Only 32-bit accesses to local apic */
- if (vie->op_size != VIE_OP_SIZE_32BIT)
- return (UNHANDLED);
+ off = gpa - DEFAULT_APIC_BASE;
/*
- * XXX
- * The operand register in which we store the result of the
- * read must be a GPR that we can modify even if the vcpu
- * is "running". All the GPRs qualify except for %rsp.
- *
- * This is a limitation of the vm_set_register() API
- * and can be fixed if necessary.
+ * Memory mapped local apic accesses must be 4 bytes wide and
+ * aligned on a 16-byte boundary.
*/
- if (vie->operand_register == VM_REG_GUEST_RSP)
- return (UNHANDLED);
-
- if (read) {
- if ((vie->opcode_flags & VIE_F_TO_REG) == 0)
- return (UNHANDLED);
-
- if (vie->operand_register >= VM_REG_LAST)
- return (UNHANDLED);
-
- handled = lapic_read(vlapic, offset, &val);
- if (handled) {
- error = vm_set_register(vm, cpu, vie->operand_register,
- val);
- if (error)
- panic("lapic_mmio: error %d setting gpr %d",
- error, vie->operand_register);
- }
- } else {
- if ((vie->opcode_flags & VIE_F_FROM_REG) &&
- (vie->operand_register < VM_REG_LAST)) {
- error = vm_get_register(vm, cpu, vie->operand_register,
- &val);
- if (error) {
- panic("lapic_mmio: error %d getting gpr %d",
- error, vie->operand_register);
- }
- } else if (vie->opcode_flags & VIE_F_FROM_IMM) {
- val = vie->immediate;
- } else {
- return (UNHANDLED);
- }
-
- handled = lapic_write(vlapic, offset, val);
- }
+ if (size != 4 || off & 0xf)
+ return (EINVAL);
- return (handled);
+ vlapic = vm_lapic(vm, cpu);
+ error = vlapic_op_mem_read(vlapic, off, DWORD, rval);
+ return (error);
}
diff --git a/sys/amd64/vmm/vmm_lapic.h b/sys/amd64/vmm/vmm_lapic.h
index 59fc016..a79912e 100644
--- a/sys/amd64/vmm/vmm_lapic.h
+++ b/sys/amd64/vmm/vmm_lapic.h
@@ -30,13 +30,15 @@
#define _VMM_LAPIC_H_
struct vm;
-struct vie;
boolean_t lapic_msr(u_int num);
int lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval);
int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t wval);
-int lapic_mmio(struct vm *vm, int cpu, u_int offset, int rd, struct vie *);
+int lapic_mmio_read(void *vm, int cpu, uint64_t gpa,
+ uint64_t *rval, int size, void *arg);
+int lapic_mmio_write(void *vm, int cpu, uint64_t gpa,
+ uint64_t wval, int size, void *arg);
int lapic_timer_tick(struct vm *vm, int cpu);
diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile
index 9dc7a53..c45b904 100644
--- a/usr.sbin/bhyve/Makefile
+++ b/usr.sbin/bhyve/Makefile
@@ -7,11 +7,14 @@ PROG= bhyve
DEBUG_FLAGS= -g -O0
SRCS= acpi.c atpic.c consport.c dbgport.c elcr.c fbsdrun.c inout.c
-SRCS+= instruction_emul.c ioapic.c mem.c mevent.c mptbl.c
+SRCS+= ioapic.c mem.c mevent.c mptbl.c
SRCS+= pci_emul.c pci_hostbridge.c pci_passthru.c pci_virtio_block.c
SRCS+= pci_virtio_net.c pci_uart.c pit_8254.c pmtmr.c post.c rtc.c uart.c
SRCS+= xmsr.c spinup_ap.c
+.PATH: ${.CURDIR}/../../sys/amd64/vmm
+SRCS+= vmm_instruction_emul.c
+
NO_MAN=
DPADD= ${LIBVMMAPI} ${LIBMD} ${LIBPTHREAD}
diff --git a/usr.sbin/bhyve/fbsdrun.c b/usr.sbin/bhyve/fbsdrun.c
index 43fa797..b1c7098 100644
--- a/usr.sbin/bhyve/fbsdrun.c
+++ b/usr.sbin/bhyve/fbsdrun.c
@@ -57,7 +57,6 @@ __FBSDID("$FreeBSD$");
#include "mptbl.h"
#include "pci_emul.h"
#include "xmsr.h"
-#include "instruction_emul.h"
#include "ioapic.h"
#include "spinup_ap.h"
@@ -455,7 +454,8 @@ vmexit_paging(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
stats.vmexit_paging++;
err = emulate_mem(ctx, *pvcpu, vmexit->u.paging.gpa, vmexit->rip,
- vmexit->u.paging.cr3, vmexit->u.paging.rwx);
+ vmexit->u.paging.cr3, vmexit->u.paging.rwx,
+ &vmexit->u.paging.vie);
if (err) {
if (err == EINVAL) {
diff --git a/usr.sbin/bhyve/instruction_emul.c b/usr.sbin/bhyve/instruction_emul.c
deleted file mode 100644
index 78c3608..0000000
--- a/usr.sbin/bhyve/instruction_emul.c
+++ /dev/null
@@ -1,641 +0,0 @@
-/*-
- * Copyright (c) 2012 Sandvine, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#include <strings.h>
-#include <unistd.h>
-#include <assert.h>
-#include <machine/vmm.h>
-#include <vmmapi.h>
-
-#include "fbsdrun.h"
-#include "mem.h"
-#include "instruction_emul.h"
-
-#define PREFIX_LOCK 0xF0
-#define PREFIX_REPNE 0xF2
-#define PREFIX_REPE 0xF3
-#define PREFIX_CS_OVERRIDE 0x2E
-#define PREFIX_SS_OVERRIDE 0x36
-#define PREFIX_DS_OVERRIDE 0x3E
-#define PREFIX_ES_OVERRIDE 0x26
-#define PREFIX_FS_OVERRIDE 0x64
-#define PREFIX_GS_OVERRIDE 0x65
-#define PREFIX_BRANCH_NOT_TAKEN 0x2E
-#define PREFIX_BRANCH_TAKEN 0x3E
-#define PREFIX_OPSIZE 0x66
-#define is_opsz_prefix(x) ((x) == PREFIX_OPSIZE)
-#define PREFIX_ADDRSIZE 0x67
-
-#define OPCODE_2BYTE_ESCAPE 0x0F
-#define OPCODE_3BYTE_ESCAPE 0x38
-
-#define MODRM_MOD_MASK 0xC0
-#define MODRM_MOD_SHIFT 6
-#define MODRM_RM_MASK 0x07
-#define MODRM_RM_SHIFT 0
-#define MODRM_REG_MASK 0x38
-#define MODRM_REG_SHIFT 3
-
-#define MOD_INDIRECT 0x0
-#define MOD_INDIRECT_DISP8 0x1
-#define MOD_INDIRECT_DISP32 0x2
-#define MOD_DIRECT 0x3
-
-#define RM_EAX 0x0
-#define RM_ECX 0x1
-#define RM_EDX 0x2
-#define RM_EBX 0x3
-#define RM_SIB 0x4
-#define RM_DISP32 0x5
-#define RM_EBP RM_DISP32
-#define RM_ESI 0x6
-#define RM_EDI 0x7
-
-#define REG_EAX 0x0
-#define REG_ECX 0x1
-#define REG_EDX 0x2
-#define REG_EBX 0x3
-#define REG_ESP 0x4
-#define REG_EBP 0x5
-#define REG_ESI 0x6
-#define REG_EDI 0x7
-#define REG_R8 0x8
-#define REG_R9 0x9
-#define REG_R10 0xA
-#define REG_R11 0xB
-#define REG_R12 0xC
-#define REG_R13 0xD
-#define REG_R14 0xE
-#define REG_R15 0xF
-
-#define HAS_MODRM 1
-#define FROM_RM (1<<1)
-#define FROM_REG (1<<2)
-#define TO_RM (1<<3)
-#define TO_REG (1<<4)
-#define ZEXT (1<<5)
-#define FROM_8 (1<<6)
-#define FROM_16 (1<<7)
-#define TO_8 (1<<8)
-#define TO_16 (1<<9)
-
-#define REX_MASK 0xF0
-#define REX_PREFIX 0x40
-#define is_rex_prefix(x) ( ((x) & REX_MASK) == REX_PREFIX )
-#define REX_W_MASK 0x8
-#define REX_R_MASK 0x4
-#define REX_X_MASK 0x2
-#define REX_B_MASK 0x1
-
-#define is_prefix(x) ((x) == PREFIX_LOCK || (x) == PREFIX_REPNE || \
- (x) == PREFIX_REPE || (x) == PREFIX_CS_OVERRIDE || \
- (x) == PREFIX_SS_OVERRIDE || (x) == PREFIX_DS_OVERRIDE || \
- (x) == PREFIX_ES_OVERRIDE || (x) == PREFIX_FS_OVERRIDE || \
- (x) == PREFIX_GS_OVERRIDE || (x) == PREFIX_BRANCH_NOT_TAKEN || \
- (x) == PREFIX_BRANCH_TAKEN || (x) == PREFIX_OPSIZE || \
- (x) == PREFIX_ADDRSIZE || is_rex_prefix((x)))
-
-#define PAGE_FRAME_MASK 0x80
-#define PAGE_OFFSET_MASK 0xFFF
-#define PAGE_TABLE_ENTRY_MASK (~PAGE_OFFSET_MASK)
-#define PML4E_OFFSET_MASK 0x0000FF8000000000
-#define PML4E_SHIFT 39
-
-#define INSTR_VERIFY
-
-struct decoded_instruction
-{
- void *instruction;
- uint8_t *opcode;
- uint8_t *modrm;
- uint8_t *sib;
- uint8_t *displacement;
- uint8_t *immediate;
-
- uint16_t opcode_flags;
-
- uint8_t addressing_mode;
- uint8_t rm;
- uint8_t reg;
- uint8_t opsz;
- uint8_t rex_r;
- uint8_t rex_w;
- uint8_t rex_b;
- uint8_t rex_x;
-
- int32_t disp;
-};
-
-static enum vm_reg_name vm_reg_name_mappings[] = {
- [REG_EAX] = VM_REG_GUEST_RAX,
- [REG_EBX] = VM_REG_GUEST_RBX,
- [REG_ECX] = VM_REG_GUEST_RCX,
- [REG_EDX] = VM_REG_GUEST_RDX,
- [REG_ESP] = VM_REG_GUEST_RSP,
- [REG_EBP] = VM_REG_GUEST_RBP,
- [REG_ESI] = VM_REG_GUEST_RSI,
- [REG_EDI] = VM_REG_GUEST_RDI,
- [REG_R8] = VM_REG_GUEST_R8,
- [REG_R9] = VM_REG_GUEST_R9,
- [REG_R10] = VM_REG_GUEST_R10,
- [REG_R11] = VM_REG_GUEST_R11,
- [REG_R12] = VM_REG_GUEST_R12,
- [REG_R13] = VM_REG_GUEST_R13,
- [REG_R14] = VM_REG_GUEST_R14,
- [REG_R15] = VM_REG_GUEST_R15
-};
-
-uint16_t one_byte_opcodes[256] = {
- [0x88] = HAS_MODRM | FROM_REG | TO_RM | TO_8 | FROM_8,
- [0x89] = HAS_MODRM | FROM_REG | TO_RM,
- [0x8B] = HAS_MODRM | FROM_RM | TO_REG,
-};
-
-uint16_t two_byte_opcodes[256] = {
- [0xB6] = HAS_MODRM | FROM_RM | TO_REG | ZEXT | FROM_8,
- [0xB7] = HAS_MODRM | FROM_RM | TO_REG | ZEXT | FROM_16,
-};
-
-static uintptr_t
-gla2gpa(uint64_t gla, uint64_t guest_cr3)
-{
- uint64_t *table;
- uint64_t mask, entry;
- int level, shift;
- uintptr_t page_frame;
-
- table = paddr_guest2host(guest_cr3 & PAGE_TABLE_ENTRY_MASK);
- mask = PML4E_OFFSET_MASK;
- shift = PML4E_SHIFT;
- for (level = 0; level < 4; ++level)
- {
- entry = table[(gla & mask) >> shift];
- table = (uint64_t*)(entry & PAGE_TABLE_ENTRY_MASK);
-
- /* This entry does not point to another page table */
- if (entry & PAGE_FRAME_MASK || level >= 3)
- break;
-
- table = paddr_guest2host((uintptr_t)table);
- mask >>= 9;
- shift -= 9;
- }
-
- mask = (1 << shift) - 1;
- page_frame = ((uintptr_t)table & ~mask);
- return (page_frame | (gla & mask));
-}
-
-static void *
-gla2hla(uint64_t gla, uint64_t guest_cr3)
-{
- uintptr_t gpa;
-
- gpa = gla2gpa(gla, guest_cr3);
-
- return (paddr_guest2host(gpa));
-}
-
-/*
- * Decodes all of the prefixes of the instruction. Only a subset of REX
- * prefixes are currently supported. If any unsupported prefix is
- * encountered, returns -1.
- */
-static int
-decode_prefixes(struct decoded_instruction *decoded)
-{
- uint8_t *current_prefix;
-
- current_prefix = decoded->instruction;
-
- if (is_rex_prefix(*current_prefix)) {
- decoded->rex_w = *current_prefix & REX_W_MASK;
- decoded->rex_r = *current_prefix & REX_R_MASK;
- decoded->rex_x = *current_prefix & REX_X_MASK;
- decoded->rex_b = *current_prefix & REX_B_MASK;
- current_prefix++;
- } else if (is_opsz_prefix(*current_prefix)) {
- decoded->opsz = 1;
- current_prefix++;
- } else if (is_prefix(*current_prefix)) {
- return (-1);
- }
-
- decoded->opcode = current_prefix;
- return (0);
-}
-
-/*
- * Decodes the instruction's opcode. If the opcode is not understood, returns
- * -1 indicating an error. Sets the instruction's mod_rm pointer to the
- * location of the ModR/M field.
- */
-static int
-decode_opcode(struct decoded_instruction *decoded)
-{
- uint8_t opcode;
- uint16_t flags;
- int extra;
-
- opcode = *decoded->opcode;
- extra = 0;
-
- if (opcode != 0xf)
- flags = one_byte_opcodes[opcode];
- else {
- opcode = *(decoded->opcode + 1);
- flags = two_byte_opcodes[opcode];
- extra = 1;
- }
-
- if (!flags)
- return (-1);
-
- if (flags & HAS_MODRM) {
- decoded->modrm = decoded->opcode + 1 + extra;
- }
-
- decoded->opcode_flags = flags;
-
- return (0);
-}
-
-/*
- * Decodes the instruction's ModR/M field. Sets the instruction's sib pointer
- * to the location of the SIB if one is expected to be present, or 0 if not.
- */
-static int
-decode_mod_rm(struct decoded_instruction *decoded)
-{
- uint8_t modrm;
- uint8_t *extension_operands;
-
- if (decoded->modrm) {
- modrm = *decoded->modrm;
-
- decoded->addressing_mode = (modrm & MODRM_MOD_MASK) >> MODRM_MOD_SHIFT;
- decoded->rm = (modrm & MODRM_RM_MASK) >> MODRM_RM_SHIFT;
- decoded->reg = (modrm & MODRM_REG_MASK) >> MODRM_REG_SHIFT;
-
- if (decoded->rex_b)
- decoded->rm |= (1<<3);
-
- if (decoded->rex_r)
- decoded->reg |= (1<<3);
-
- extension_operands = decoded->modrm + 1;
-
- if (decoded->rm == RM_SIB) {
- decoded->sib = decoded->modrm + 1;
- extension_operands = decoded->sib + 1;
- }
-
- switch (decoded->addressing_mode) {
- case MOD_INDIRECT:
- case MOD_DIRECT:
- decoded->displacement = 0;
- break;
- case MOD_INDIRECT_DISP8:
- decoded->displacement = extension_operands;
- break;
- case MOD_INDIRECT_DISP32:
- decoded->displacement = extension_operands;
- break;
- }
- }
-
- return (0);
-}
-
-/*
- * Decodes the instruction's SIB field. No such instructions are currently
- * supported, so do nothing and return -1 if there is a SIB field, 0 otherwise.
- */
-static int
-decode_sib(struct decoded_instruction *decoded)
-{
-
- if (decoded->sib)
- return (-1);
-
- return (0);
-}
-
-/*
- * Grabs and saves the instruction's immediate operand and displacement if
- * they are present. Immediates are not currently supported, so if an
- * immediate is present it will return -1 indicating an error.
- */
-static int
-decode_extension_operands(struct decoded_instruction *decoded)
-{
-
- if (decoded->displacement) {
- if (decoded->addressing_mode == MOD_INDIRECT_DISP8) {
- decoded->disp = *((int8_t *)decoded->displacement);
- } else if (decoded->addressing_mode == MOD_INDIRECT_DISP32) {
- decoded->disp = *((int32_t *)decoded->displacement);
- }
- }
-
- if (decoded->immediate) {
- return (-1);
- }
-
- return (0);
-}
-
-static int
-decode_instruction(void *instr, struct decoded_instruction *decoded)
-{
- int error;
-
- bzero(decoded, sizeof(*decoded));
- decoded->instruction = instr;
-
- error = decode_prefixes(decoded);
- if (error)
- return (error);
-
- error = decode_opcode(decoded);
- if (error)
- return (error);
-
- error = decode_mod_rm(decoded);
- if (error)
- return (error);
-
- error = decode_sib(decoded);
- if (error)
- return (error);
-
- error = decode_extension_operands(decoded);
- if (error)
- return (error);
-
- return (0);
-}
-
-static enum vm_reg_name
-get_vm_reg_name(uint8_t reg)
-{
-
- return (vm_reg_name_mappings[reg]);
-}
-
-static uint64_t
-adjust_operand(const struct decoded_instruction *instruction, uint64_t val,
- int size)
-{
- uint64_t ret;
-
- if (instruction->opcode_flags & ZEXT) {
- switch (size) {
- case 1:
- ret = val & 0xff;
- break;
- case 2:
- ret = val & 0xffff;
- break;
- case 4:
- ret = val & 0xffffffff;
- break;
- case 8:
- ret = val;
- break;
- default:
- break;
- }
- } else {
- /*
- * Extend the sign
- */
- switch (size) {
- case 1:
- ret = (int8_t)(val & 0xff);
- break;
- case 2:
- ret = (int16_t)(val & 0xffff);
- break;
- case 4:
- ret = (int32_t)(val & 0xffffffff);
- break;
- case 8:
- ret = val;
- break;
- default:
- break;
- }
- }
-
- return (ret);
-}
-
-static int
-get_operand(struct vmctx *vm, int vcpu, uint64_t gpa, uint64_t guest_cr3,
- const struct decoded_instruction *instruction, uint64_t *operand,
- struct mem_range *mr)
-{
- enum vm_reg_name regname;
- uint64_t reg;
- int error;
- uint8_t rm, addressing_mode, size;
-
- if (instruction->opcode_flags & FROM_RM) {
- rm = instruction->rm;
- addressing_mode = instruction->addressing_mode;
- } else if (instruction->opcode_flags & FROM_REG) {
- rm = instruction->reg;
- addressing_mode = MOD_DIRECT;
- } else
- return (-1);
-
- /*
- * Determine size of operand
- */
- size = 4;
- if (instruction->opcode_flags & FROM_8) {
- size = 1;
- } else if (instruction->opcode_flags & FROM_16 ||
- instruction->opsz) {
- size = 2;
- }
-
- regname = get_vm_reg_name(rm);
- error = vm_get_register(vm, vcpu, regname, &reg);
- if (error)
- return (error);
-
- switch (addressing_mode) {
- case MOD_DIRECT:
- *operand = reg;
- error = 0;
- break;
- case MOD_INDIRECT:
- case MOD_INDIRECT_DISP8:
- case MOD_INDIRECT_DISP32:
-#ifdef INSTR_VERIFY
- {
- uintptr_t target;
-
- target = gla2gpa(reg, guest_cr3);
- target += instruction->disp;
- assert(gpa == target);
- }
-#endif
- error = (*mr->handler)(vm, vcpu, MEM_F_READ, gpa, size,
- operand, mr->arg1, mr->arg2);
- break;
- default:
- return (-1);
- }
-
- if (!error)
- *operand = adjust_operand(instruction, *operand, size);
-
- return (error);
-}
-
-static uint64_t
-adjust_write(uint64_t reg, uint64_t operand, int size)
-{
- uint64_t val;
-
- switch (size) {
- case 1:
- val = (reg & ~0xff) | (operand & 0xff);
- break;
- case 2:
- val = (reg & ~0xffff) | (operand & 0xffff);
- break;
- case 4:
- val = (reg & ~0xffffffff) | (operand & 0xffffffff);
- break;
- case 8:
- val = operand;
- default:
- break;
- }
-
- return (val);
-}
-
-static int
-perform_write(struct vmctx *vm, int vcpu, uint64_t gpa, uint64_t guest_cr3,
- const struct decoded_instruction *instruction, uint64_t operand,
- struct mem_range *mr)
-{
- enum vm_reg_name regname;
- uintptr_t target;
- int error;
- int size;
- uint64_t reg;
- uint8_t addressing_mode;
-
- if (instruction->opcode_flags & TO_RM) {
- reg = instruction->rm;
- addressing_mode = instruction->addressing_mode;
- } else if (instruction->opcode_flags & TO_REG) {
- reg = instruction->reg;
- addressing_mode = MOD_DIRECT;
- } else
- return (-1);
-
- /*
- * Determine the operand size. rex.w has priority
- */
- size = 4;
- if (instruction->rex_w) {
- size = 8;
- } else if (instruction->opcode_flags & TO_8) {
- size = 1;
- } else if (instruction->opsz) {
- size = 2;
- };
-
- switch(addressing_mode) {
- case MOD_DIRECT:
- regname = get_vm_reg_name(reg);
- error = vm_get_register(vm, vcpu, regname, &reg);
- if (error)
- return (error);
- operand = adjust_write(reg, operand, size);
-
- return (vm_set_register(vm, vcpu, regname, operand));
- case MOD_INDIRECT:
- case MOD_INDIRECT_DISP8:
- case MOD_INDIRECT_DISP32:
-#ifdef INSTR_VERIFY
- regname = get_vm_reg_name(reg);
- error = vm_get_register(vm, vcpu, regname, &reg);
- assert(!error);
- target = gla2gpa(reg, guest_cr3);
- target += instruction->disp;
- assert(gpa == target);
-#endif
- error = (*mr->handler)(vm, vcpu, MEM_F_WRITE, gpa, size,
- &operand, mr->arg1, mr->arg2);
- return (error);
- default:
- return (-1);
- }
-}
-
-static int
-emulate_decoded_instruction(struct vmctx *vm, int vcpu, uint64_t gpa,
- uint64_t cr3,
- const struct decoded_instruction *instruction,
- struct mem_range *mr)
-{
- uint64_t operand;
- int error;
-
- error = get_operand(vm, vcpu, gpa, cr3, instruction, &operand, mr);
- if (error)
- return (error);
-
- return perform_write(vm, vcpu, gpa, cr3, instruction, operand, mr);
-}
-
-int
-emulate_instruction(struct vmctx *vm, int vcpu, uint64_t rip, uint64_t cr3,
- uint64_t gpa, int flags, struct mem_range *mr)
-{
- struct decoded_instruction instr;
- int error;
- void *instruction;
-
- instruction = gla2hla(rip, cr3);
-
- error = decode_instruction(instruction, &instr);
- if (!error)
- error = emulate_decoded_instruction(vm, vcpu, gpa, cr3,
- &instr, mr);
-
- return (error);
-}
diff --git a/usr.sbin/bhyve/instruction_emul.h b/usr.sbin/bhyve/instruction_emul.h
deleted file mode 100644
index ef85796..0000000
--- a/usr.sbin/bhyve/instruction_emul.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*-
- * Copyright (c) 2012 Sandvine, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef _INSTRUCTION_EMUL_H_
-#define _INSTRUCTION_EMUL_H_
-
-int emulate_instruction(struct vmctx *vm, int vcpu, uint64_t rip,
- uint64_t cr3, uint64_t gpa, int flags,
- struct mem_range *mr);
-
-#endif
diff --git a/usr.sbin/bhyve/ioapic.c b/usr.sbin/bhyve/ioapic.c
index ea6e47c..47dd833 100644
--- a/usr.sbin/bhyve/ioapic.c
+++ b/usr.sbin/bhyve/ioapic.c
@@ -42,7 +42,6 @@ __FBSDID("$FreeBSD$");
#include "inout.h"
#include "mem.h"
-#include "instruction_emul.h"
#include "fbsdrun.h"
#include <stdio.h>
diff --git a/usr.sbin/bhyve/mem.c b/usr.sbin/bhyve/mem.c
index deb91dc..dc43ff4 100644
--- a/usr.sbin/bhyve/mem.c
+++ b/usr.sbin/bhyve/mem.c
@@ -51,7 +51,6 @@ __FBSDID("$FreeBSD$");
#include <assert.h>
#include "mem.h"
-#include "instruction_emul.h"
struct mmio_rb_range {
RB_ENTRY(mmio_rb_range) mr_link; /* RB tree links */
@@ -134,33 +133,57 @@ mmio_rb_dump(void)
RB_GENERATE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare);
+static int
+mem_read(void *ctx, int vcpu, uint64_t gpa, uint64_t *rval, int size, void *arg)
+{
+ int error;
+ struct mem_range *mr = arg;
+
+ error = (*mr->handler)(ctx, vcpu, MEM_F_READ, gpa, size,
+ rval, mr->arg1, mr->arg2);
+ return (error);
+}
+
+static int
+mem_write(void *ctx, int vcpu, uint64_t gpa, uint64_t wval, int size, void *arg)
+{
+ int error;
+ struct mem_range *mr = arg;
+
+ error = (*mr->handler)(ctx, vcpu, MEM_F_WRITE, gpa, size,
+ &wval, mr->arg1, mr->arg2);
+ return (error);
+}
+
int
emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, uint64_t rip,
- uint64_t cr3, int mode)
+ uint64_t cr3, int mode, struct vie *vie)
{
struct mmio_rb_range *entry;
int err;
- err = 0;
-
/*
* First check the per-vCPU cache
*/
if (mmio_hint[vcpu] &&
paddr >= mmio_hint[vcpu]->mr_base &&
paddr <= mmio_hint[vcpu]->mr_end) {
- err = emulate_instruction(ctx, vcpu, rip, cr3, paddr, mode,
- &mmio_hint[vcpu]->mr_param);
- } else {
- if (mmio_rb_lookup(paddr, &entry)) {
- err = ENOENT;
- } else {
- mmio_hint[vcpu] = entry;
- err = emulate_instruction(ctx, vcpu, rip, cr3, paddr,
- mode, &entry->mr_param);
- }
+ entry = mmio_hint[vcpu];
+ } else
+ entry = NULL;
+
+ if (entry == NULL) {
+ if (mmio_rb_lookup(paddr, &entry))
+ return (ESRCH);
+
+ /* Update the per-vCPU cache */
+ mmio_hint[vcpu] = entry;
}
+ assert(entry != NULL && entry == mmio_hint[vcpu]);
+
+ err = vmm_emulate_instruction(ctx, vcpu, paddr, vie,
+ mem_read, mem_write, &entry->mr_param);
return (err);
}
diff --git a/usr.sbin/bhyve/mem.h b/usr.sbin/bhyve/mem.h
index 53c4f72..b34c1fd 100644
--- a/usr.sbin/bhyve/mem.h
+++ b/usr.sbin/bhyve/mem.h
@@ -51,7 +51,7 @@ struct mem_range {
void init_mem(void);
int emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, uint64_t rip,
- uint64_t cr3, int mode);
+ uint64_t cr3, int mode, struct vie *vie);
int register_mem(struct mem_range *memp);
diff --git a/usr.sbin/bhyve/pci_passthru.c b/usr.sbin/bhyve/pci_passthru.c
index 7ac6e3d..28abb6b 100644
--- a/usr.sbin/bhyve/pci_passthru.c
+++ b/usr.sbin/bhyve/pci_passthru.c
@@ -48,7 +48,6 @@ __FBSDID("$FreeBSD$");
#include <vmmapi.h>
#include "pci_emul.h"
#include "mem.h"
-#include "instruction_emul.h"
#ifndef _PATH_DEVPCI
#define _PATH_DEVPCI "/dev/pci"
OpenPOWER on IntegriCloud