author      neel <neel@FreeBSD.org>  2012-11-28 00:02:17 +0000
committer   neel <neel@FreeBSD.org>  2012-11-28 00:02:17 +0000
commit      36ab9a2e1ab7d2b1884270275584f989cfd65e2b (patch)
tree        8cf47855850351281fdc988a8a65fb469fbfb73f /sys/amd64/vmm/vmm_instruction_emul.c
parent      76430d4106c5cb90837c5f728272d9f249e085c4 (diff)
Revamp the x86 instruction emulation in bhyve.
On a nested page table fault the hypervisor will:
- fetch the instruction using the guest %rip and %cr3
- decode the instruction in 'struct vie'
- emulate the instruction in host kernel context for local apic accesses
- any other type of mmio access is punted up to user-space (e.g. ioapic)

The decoded instruction is passed as collateral to the user-space process
that is handling the PAGING exit.

The emulation code is fleshed out to include more addressing modes (e.g. SIB)
and more types of operands (e.g. imm8). The source code is unified into a
single file (vmm_instruction_emul.c) that is compiled into vmm.ko as well as
/usr/sbin/bhyve.

Reviewed by:	grehan
Obtained from:	NetApp
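The fetch/decode/emulate flow described above can be summarized with a short
sketch. This is not part of the commit: handle_paging_exit() is a hypothetical
caller, lapic_mmio_read()/lapic_mmio_write() merely stand in for the
mem_region_read_t/mem_region_write_t callbacks, and the way rip, cr3, gla, gpa
and the instruction length are pulled out of the VM-exit state is assumed.
The three function signatures themselves are the ones added in the diff below.

/*
 * Hypothetical kernel-side caller (sketch only, not from the commit).
 */
static int
handle_paging_exit(struct vm *vm, int vcpuid, uint64_t rip, uint64_t cr3,
    uint64_t gla, uint64_t gpa, int inst_length)
{
	struct vie vie;
	int error;

	/*
	 * Fetch the instruction bytes using the guest %rip and %cr3.
	 * 'vie' is assumed to be initialized by vmm_fetch_instruction().
	 */
	error = vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3, &vie);
	if (error)
		return (error);

	/* Decode prefixes, opcode, ModRM, SIB, displacement and immediate. */
	if (vmm_decode_instruction(vm, vcpuid, gla, &vie))
		return (EINVAL);

	/*
	 * Emulate in host kernel context; the local APIC handlers stand in
	 * here for the mem_region_read_t/mem_region_write_t callbacks.
	 */
	return (vmm_emulate_instruction(vm, vcpuid, gpa, &vie,
	    lapic_mmio_read, lapic_mmio_write, NULL));
}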
Diffstat (limited to 'sys/amd64/vmm/vmm_instruction_emul.c')
-rw-r--r--  sys/amd64/vmm/vmm_instruction_emul.c  |  481
1 file changed, 432 insertions, 49 deletions
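Since the new decode_modrm()/decode_sib() paths make up the bulk of the added
decoding logic, here is a small worked example (not from the commit) showing
the fields they extract for a SIB-encoded access. The shifts and masks match
the ones used in the diff below; the instruction bytes are an illustrative
choice, not taken from the source.

#include <stdint.h>
#include <stdio.h>

/*
 * Worked example: decode the ModRM/SIB bytes of
 * "mov %ecx, 0x10(%rax,%rbx,4)", encoded as 89 4C 98 10.
 */
int
main(void)
{
	uint8_t modrm = 0x4C, sib = 0x98, disp8 = 0x10;

	int mod = (modrm >> 6) & 0x3;	/* 1 -> VIE_MOD_INDIRECT_DISP8   */
	int reg = (modrm >> 3) & 0x7;	/* 1 -> %ecx (source register)   */
	int rm  = (modrm >> 0) & 0x7;	/* 4 -> VIE_RM_SIB, SIB follows  */

	int ss    = (sib >> 6) & 0x3;	/* 2 -> scale = 1 << 2 = 4       */
	int index = (sib >> 3) & 0x7;	/* 3 -> %rbx                     */
	int base  = (sib >> 0) & 0x7;	/* 0 -> %rax                     */

	/*
	 * The guest linear address is %rax + 4 * %rbx + 0x10, i.e. the
	 * "base + scale * idx + displacement" sum that verify_gla()
	 * compares against the gla reported with the nested page fault.
	 */
	printf("mod=%d reg=%d rm=%d ss=%d index=%d base=%d scale=%d disp=%#x\n",
	    mod, reg, rm, ss, index, base, 1 << ss, disp8);
	return (0);
}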
diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c
index 7ef4dbb..5e5399b 100644
--- a/sys/amd64/vmm/vmm_instruction_emul.c
+++ b/sys/amd64/vmm/vmm_instruction_emul.c
@@ -30,6 +30,7 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#ifdef _KERNEL
#include <sys/param.h>
#include <sys/pcpu.h>
#include <sys/systm.h>
@@ -40,10 +41,60 @@ __FBSDID("$FreeBSD$");
#include <machine/pmap.h>
#include <machine/vmparam.h>
#include <machine/vmm.h>
+#else /* !_KERNEL */
+#include <sys/types.h>
+#include <sys/errno.h>
-#include "vmm_instruction_emul.h"
+#include <machine/vmm.h>
+
+#include <vmmapi.h>
+#endif /* _KERNEL */
+
+
+
+/* struct vie_op.op_type */
+enum {
+ VIE_OP_TYPE_NONE = 0,
+ VIE_OP_TYPE_MOV,
+ VIE_OP_TYPE_AND,
+ VIE_OP_TYPE_LAST
+};
+
+/* struct vie_op.op_flags */
+#define VIE_OP_F_IMM (1 << 0) /* immediate operand present */
+#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */
+
+static const struct vie_op one_byte_opcodes[256] = {
+ [0x89] = {
+ .op_byte = 0x89,
+ .op_type = VIE_OP_TYPE_MOV,
+ },
+ [0x8B] = {
+ .op_byte = 0x8B,
+ .op_type = VIE_OP_TYPE_MOV,
+ },
+ [0xC7] = {
+ .op_byte = 0xC7,
+ .op_type = VIE_OP_TYPE_MOV,
+ .op_flags = VIE_OP_F_IMM,
+ },
+ [0x23] = {
+ .op_byte = 0x23,
+ .op_type = VIE_OP_TYPE_AND,
+ }
+};
+
+/* struct vie.mod */
+#define VIE_MOD_INDIRECT 0
+#define VIE_MOD_INDIRECT_DISP8 1
+#define VIE_MOD_INDIRECT_DISP32 2
+#define VIE_MOD_DIRECT 3
-#define GB (1024 * 1024 * 1024)
+/* struct vie.rm */
+#define VIE_RM_SIB 4
+#define VIE_RM_DISP32 5
+
+#define GB (1024 * 1024 * 1024)
static enum vm_reg_name gpr_map[16] = {
VM_REG_GUEST_RAX,
@@ -64,17 +115,232 @@ static enum vm_reg_name gpr_map[16] = {
VM_REG_GUEST_R15
};
+static uint64_t size2mask[] = {
+ [1] = 0xff,
+ [2] = 0xffff,
+ [4] = 0xffffffff,
+ [8] = 0xffffffffffffffff,
+};
+
+static int
+vie_valid_register(enum vm_reg_name reg)
+{
+#ifdef _KERNEL
+ /*
+ * XXX
+ * The operand register in which we store the result of the
+ * read must be a GPR that we can modify even if the vcpu
+ * is "running". All the GPRs qualify except for %rsp.
+ *
+ * This is a limitation of the vm_set_register() API
+ * and can be fixed if necessary.
+ */
+ if (reg == VM_REG_GUEST_RSP)
+ return (0);
+#endif
+ return (1);
+}
+
+static int
+vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
+{
+ int error;
+
+ if (!vie_valid_register(reg))
+ return (EINVAL);
+
+ error = vm_get_register(vm, vcpuid, reg, rval);
+
+ return (error);
+}
+
+static int
+vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
+ uint64_t val, int size)
+{
+ int error;
+ uint64_t origval;
+
+ if (!vie_valid_register(reg))
+ return (EINVAL);
+
+ switch (size) {
+ case 1:
+ case 2:
+ error = vie_read_register(vm, vcpuid, reg, &origval);
+ if (error)
+ return (error);
+ val &= size2mask[size];
+ val |= origval & ~size2mask[size];
+ break;
+ case 4:
+ val &= 0xffffffffUL;
+ break;
+ case 8:
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ error = vm_set_register(vm, vcpuid, reg, val);
+ return (error);
+}
+
+/*
+ * The following simplifying assumptions are made during emulation:
+ *
+ * - guest is in 64-bit mode
+ * - default address size is 64-bits
+ * - default operand size is 32-bits
+ *
+ * - operand size override is not supported
+ *
+ * - address size override is not supported
+ */
+static int
+emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
+{
+ int error, size;
+ enum vm_reg_name reg;
+ uint64_t val;
+
+ size = 4;
+ error = EINVAL;
+
+ switch (vie->op.op_byte) {
+ case 0x89:
+ /*
+ * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
+ * 89/r: mov r/m32, r32
+ * REX.W + 89/r mov r/m64, r64
+ */
+ if (vie->rex_w)
+ size = 8;
+ reg = gpr_map[vie->reg];
+ error = vie_read_register(vm, vcpuid, reg, &val);
+ if (error == 0) {
+ val &= size2mask[size];
+ error = memwrite(vm, vcpuid, gpa, val, size, arg);
+ }
+ break;
+ case 0x8B:
+ /*
+ * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
+ * 8B/r: mov r32, r/m32
+ * REX.W 8B/r: mov r64, r/m64
+ */
+ if (vie->rex_w)
+ size = 8;
+ error = memread(vm, vcpuid, gpa, &val, size, arg);
+ if (error == 0) {
+ reg = gpr_map[vie->reg];
+ error = vie_update_register(vm, vcpuid, reg, val, size);
+ }
+ break;
+ case 0xC7:
+ /*
+ * MOV from imm32 to mem (ModRM:r/m)
+ * C7/0 mov r/m32, imm32
+ * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits)
+ */
+ val = vie->immediate; /* already sign-extended */
+
+ if (vie->rex_w)
+ size = 8;
+
+ if (size != 8)
+ val &= size2mask[size];
+
+ error = memwrite(vm, vcpuid, gpa, val, size, arg);
+ break;
+ default:
+ break;
+ }
+
+ return (error);
+}
+
+static int
+emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
+{
+ int error, size;
+ enum vm_reg_name reg;
+ uint64_t val1, val2;
+
+ size = 4;
+ error = EINVAL;
+
+ switch (vie->op.op_byte) {
+ case 0x23:
+ /*
+ * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
+ * result in reg.
+ *
+ * 23/r and r32, r/m32
+ * REX.W + 23/r and r64, r/m64
+ */
+ if (vie->rex_w)
+ size = 8;
+
+ /* get the first operand */
+ reg = gpr_map[vie->reg];
+ error = vie_read_register(vm, vcpuid, reg, &val1);
+ if (error)
+ break;
+
+ /* get the second operand */
+ error = memread(vm, vcpuid, gpa, &val2, size, arg);
+ if (error)
+ break;
+
+ /* perform the operation and write the result */
+ val1 &= val2;
+ error = vie_update_register(vm, vcpuid, reg, val1, size);
+ break;
+ default:
+ break;
+ }
+ return (error);
+}
+
+int
+vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite,
+ void *memarg)
+{
+ int error;
+
+ if (!vie->decoded)
+ return (EINVAL);
+
+ switch (vie->op.op_type) {
+ case VIE_OP_TYPE_MOV:
+ error = emulate_mov(vm, vcpuid, gpa, vie,
+ memread, memwrite, memarg);
+ break;
+ case VIE_OP_TYPE_AND:
+ error = emulate_and(vm, vcpuid, gpa, vie,
+ memread, memwrite, memarg);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ return (error);
+}
+
+#ifdef _KERNEL
static void
vie_init(struct vie *vie)
{
bzero(vie, sizeof(struct vie));
- vie->op_size = VIE_OP_SIZE_32BIT;
-
vie->base_register = VM_REG_LAST;
vie->index_register = VM_REG_LAST;
- vie->operand_register = VM_REG_LAST;
}
static int
@@ -129,7 +395,7 @@ error:
}
int
-vmm_fetch_instruction(struct vm *vm, uint64_t rip, int inst_length,
+vmm_fetch_instruction(struct vm *vm, int cpuid, uint64_t rip, int inst_length,
uint64_t cr3, struct vie *vie)
{
int n, err;
@@ -172,6 +438,7 @@ vmm_fetch_instruction(struct vm *vm, uint64_t rip, int inst_length,
static int
vie_peek(struct vie *vie, uint8_t *x)
{
+
if (vie->num_processed < vie->num_valid) {
*x = vie->inst[vie->num_processed];
return (0);
@@ -182,8 +449,6 @@ vie_peek(struct vie *vie, uint8_t *x)
static void
vie_advance(struct vie *vie)
{
- if (vie->num_processed >= vie->num_valid)
- panic("vie_advance: %d/%d", vie->num_processed, vie->num_valid);
vie->num_processed++;
}
@@ -213,24 +478,16 @@ decode_opcode(struct vie *vie)
{
uint8_t x;
- static const uint8_t flags[256] = {
- [0x89] = VIE_F_HAS_MODRM | VIE_F_FROM_REG | VIE_F_TO_RM,
- [0x8B] = VIE_F_HAS_MODRM | VIE_F_FROM_RM | VIE_F_TO_REG,
- [0xC7] = VIE_F_HAS_MODRM | VIE_F_FROM_IMM | VIE_F_TO_RM,
- };
-
if (vie_peek(vie, &x))
return (-1);
- vie->opcode_byte = x;
- vie->opcode_flags = flags[x];
+ vie->op = one_byte_opcodes[x];
- vie_advance(vie);
-
- if (vie->opcode_flags == 0)
+ if (vie->op.op_type == VIE_OP_TYPE_NONE)
return (-1);
- else
- return (0);
+
+ vie_advance(vie);
+ return (0);
}
/*
@@ -241,9 +498,6 @@ decode_modrm(struct vie *vie)
{
uint8_t x;
- if ((vie->opcode_flags & VIE_F_HAS_MODRM) == 0)
- return (0);
-
if (vie_peek(vie, &x))
return (-1);
@@ -251,35 +505,40 @@ decode_modrm(struct vie *vie)
vie->rm = (x >> 0) & 0x7;
vie->reg = (x >> 3) & 0x7;
+ /*
+ * A direct addressing mode makes no sense in the context of an EPT
+ * fault. There has to be a memory access involved to cause the
+ * EPT fault.
+ */
+ if (vie->mod == VIE_MOD_DIRECT)
+ return (-1);
+
if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
(vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
- /*
- * Table 2-5: Special Cases of REX Encodings
- *
- * mod=0, r/m=5 is used in the compatibility mode to
- * indicate a disp32 without a base register.
- *
- * mod!=3, r/m=4 is used in the compatibility mode to
- * indicate that the SIB byte is present.
- *
- * The 'b' bit in the REX prefix is don't care in
- * this case.
- */
+ /*
+ * Table 2-5: Special Cases of REX Encodings
+ *
+ * mod=0, r/m=5 is used in the compatibility mode to
+ * indicate a disp32 without a base register.
+ *
+ * mod!=3, r/m=4 is used in the compatibility mode to
+ * indicate that the SIB byte is present.
+ *
+ * The 'b' bit in the REX prefix is don't care in
+ * this case.
+ */
} else {
vie->rm |= (vie->rex_b << 3);
}
vie->reg |= (vie->rex_r << 3);
- /* SIB addressing not supported yet */
+ /* SIB */
if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
- return (-1);
+ goto done;
vie->base_register = gpr_map[vie->rm];
- if (vie->opcode_flags & (VIE_F_FROM_REG | VIE_F_TO_REG))
- vie->operand_register = gpr_map[vie->reg];
-
switch (vie->mod) {
case VIE_MOD_INDIRECT_DISP8:
vie->disp_bytes = 1;
@@ -295,12 +554,76 @@ decode_modrm(struct vie *vie)
break;
}
- /* calculate the operand size */
- if (vie->rex_w)
- vie->op_size = VIE_OP_SIZE_64BIT;
-
- if (vie->opcode_flags & VIE_F_FROM_IMM)
+ /* Figure out immediate operand size (if any) */
+ if (vie->op.op_flags & VIE_OP_F_IMM)
vie->imm_bytes = 4;
+ else if (vie->op.op_flags & VIE_OP_F_IMM8)
+ vie->imm_bytes = 1;
+
+done:
+ vie_advance(vie);
+
+ return (0);
+}
+
+static int
+decode_sib(struct vie *vie)
+{
+ uint8_t x;
+
+ /* Proceed only if SIB byte is present */
+ if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
+ return (0);
+
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ /* De-construct the SIB byte */
+ vie->ss = (x >> 6) & 0x3;
+ vie->index = (x >> 3) & 0x7;
+ vie->base = (x >> 0) & 0x7;
+
+ /* Apply the REX prefix modifiers */
+ vie->index |= vie->rex_x << 3;
+ vie->base |= vie->rex_b << 3;
+
+ switch (vie->mod) {
+ case VIE_MOD_INDIRECT_DISP8:
+ vie->disp_bytes = 1;
+ break;
+ case VIE_MOD_INDIRECT_DISP32:
+ vie->disp_bytes = 4;
+ break;
+ }
+
+ if (vie->mod == VIE_MOD_INDIRECT &&
+ (vie->base == 5 || vie->base == 13)) {
+ /*
+ * Special case when base register is unused if mod = 0
+ * and base = %rbp or %r13.
+ *
+ * Documented in:
+ * Table 2-3: 32-bit Addressing Forms with the SIB Byte
+ * Table 2-5: Special Cases of REX Encodings
+ */
+ vie->disp_bytes = 4;
+ } else {
+ vie->base_register = gpr_map[vie->base];
+ }
+
+ /*
+ * All encodings of 'index' are valid except for %rsp (4).
+ *
+ * Documented in:
+ * Table 2-3: 32-bit Addressing Forms with the SIB Byte
+ * Table 2-5: Special Cases of REX Encodings
+ */
+ if (vie->index != 4)
+ vie->index_register = gpr_map[vie->index];
+
+ /* 'scale' makes sense only in the context of an index register */
+ if (vie->index_register < VM_REG_LAST)
+ vie->scale = 1 << vie->ss;
vie_advance(vie);
@@ -348,13 +671,14 @@ decode_immediate(struct vie *vie)
uint8_t x;
union {
char buf[4];
+ int8_t signed8;
int32_t signed32;
} u;
if ((n = vie->imm_bytes) == 0)
return (0);
- if (n != 4)
+ if (n != 1 && n != 4)
panic("decode_immediate: invalid imm_bytes %d", n);
for (i = 0; i < n; i++) {
@@ -365,14 +689,62 @@ decode_immediate(struct vie *vie)
vie_advance(vie);
}
- vie->immediate = u.signed32; /* sign-extended */
+ if (n == 1)
+ vie->immediate = u.signed8; /* sign-extended */
+ else
+ vie->immediate = u.signed32; /* sign-extended */
return (0);
}
+#define VERIFY_GLA
+/*
+ * Verify that the 'guest linear address' provided as collateral of the nested
+ * page table fault matches with our instruction decoding.
+ */
+#ifdef VERIFY_GLA
+static int
+verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
+{
+ int error;
+ uint64_t base, idx;
+
+ base = 0;
+ if (vie->base_register != VM_REG_LAST) {
+ error = vm_get_register(vm, cpuid, vie->base_register, &base);
+ if (error) {
+ printf("verify_gla: error %d getting base reg %d\n",
+ error, vie->base_register);
+ return (-1);
+ }
+ }
+
+ idx = 0;
+ if (vie->index_register != VM_REG_LAST) {
+ error = vm_get_register(vm, cpuid, vie->index_register, &idx);
+ if (error) {
+ printf("verify_gla: error %d getting index reg %d\n",
+ error, vie->index_register);
+ return (-1);
+ }
+ }
+
+ if (base + vie->scale * idx + vie->displacement != gla) {
+ printf("verify_gla mismatch: "
+ "base(0x%0lx), scale(%d), index(0x%0lx), "
+ "disp(0x%0lx), gla(0x%0lx)\n",
+ base, vie->scale, idx, vie->displacement, gla);
+ return (-1);
+ }
+
+ return (0);
+}
+#endif /* VERIFY_GLA */
+
int
-vmm_decode_instruction(struct vie *vie)
+vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
{
+
if (decode_rex(vie))
return (-1);
@@ -382,11 +754,22 @@ vmm_decode_instruction(struct vie *vie)
if (decode_modrm(vie))
return (-1);
+ if (decode_sib(vie))
+ return (-1);
+
if (decode_displacement(vie))
return (-1);
if (decode_immediate(vie))
return (-1);
+#ifdef VERIFY_GLA
+ if (verify_gla(vm, cpuid, gla, vie))
+ return (-1);
+#endif
+
+ vie->decoded = 1; /* success */
+
return (0);
}
+#endif /* _KERNEL */
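For completeness, a sketch of the user-space half follows. It is not part of
the commit: emulated_dev_read()/emulated_dev_write() and handle_inst_emul()
are hypothetical bhyve-side helpers, and the vm_exit field names
(u.paging.gpa, u.paging.vie) are assumptions. The callback signatures are
inferred from how memread()/memwrite() are invoked in the diff above, and
vmm_emulate_instruction() is callable here because it sits outside the
#ifdef _KERNEL block, matching the commit's note that the file is compiled
into both vmm.ko and /usr/sbin/bhyve.

#include <sys/types.h>

#include <machine/vmm.h>
#include <vmmapi.h>

/* Hypothetical device-register backing store for the example. */
static uint64_t dev_reg;

static int
emulated_dev_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *rval,
    int size, void *arg)
{
	/* Return the current contents of the register at 'gpa'. */
	*rval = dev_reg;
	return (0);
}

static int
emulated_dev_write(void *vm, int vcpuid, uint64_t gpa, uint64_t wval,
    int size, void *arg)
{
	/* Latch the value written to the register at 'gpa'. */
	dev_reg = wval;
	return (0);
}

static int
handle_inst_emul(struct vmctx *ctx, int vcpuid, struct vm_exit *vmexit)
{
	/*
	 * The instruction was already fetched and decoded in the kernel;
	 * the decoded 'struct vie' travels with the PAGING exit.  The
	 * u.paging field names used here are assumptions.
	 */
	return (vmm_emulate_instruction(ctx, vcpuid, vmexit->u.paging.gpa,
	    &vmexit->u.paging.vie, emulated_dev_read, emulated_dev_write,
	    NULL));
}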