author      neel <neel@FreeBSD.org>    2012-11-28 00:02:17 +0000
committer   neel <neel@FreeBSD.org>    2012-11-28 00:02:17 +0000
commit      36ab9a2e1ab7d2b1884270275584f989cfd65e2b (patch)
tree        8cf47855850351281fdc988a8a65fb469fbfb73f /sys/amd64
parent      76430d4106c5cb90837c5f728272d9f249e085c4 (diff)
Revamp the x86 instruction emulation in bhyve.
On a nested page table fault the hypervisor will:
- fetch the instruction using the guest %rip and %cr3
- decode the instruction in 'struct vie'
- emulate the instruction in host kernel context for local apic accesses
- any other type of mmio access is punted up to user-space (e.g. ioapic)

The decoded instruction is passed as collateral to the user-space process
that is handling the PAGING exit.

The emulation code is fleshed out to include more addressing modes
(e.g. SIB) and more types of operands (e.g. imm8).

The source code is unified into a single file (vmm_instruction_emul.c)
that is compiled into vmm.ko as well as /usr/sbin/bhyve.

Reviewed by:	grehan
Obtained from:	NetApp
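For orientation, the fetch/decode/emulate flow described above maps onto the
new APIs roughly as follows. This is a minimal sketch, not code from the
commit: the helper name is invented and error handling is elided.

static int
ept_fault_flow_sketch(struct vm *vm, int vcpu, uint64_t rip, int inst_length,
    uint64_t cr3, uint64_t gla, uint64_t gpa)
{
	struct vie vie;

	/* fetch the instruction using the guest %rip and %cr3 */
	if (vmm_fetch_instruction(vm, vcpu, rip, inst_length, cr3, &vie) != 0)
		return (UNHANDLED);

	/* decode the instruction into 'struct vie' */
	if (vmm_decode_instruction(vm, vcpu, gla, &vie) != 0)
		return (UNHANDLED);

	/* local apic accesses are emulated in host kernel context */
	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
		return (vmm_emulate_instruction(vm, vcpu, gpa, &vie,
		    lapic_mmio_read, lapic_mmio_write, NULL) ?
		    UNHANDLED : HANDLED);
	}

	/* anything else is punted to user-space along with the decoded vie */
	return (UNHANDLED);
}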
Diffstat (limited to 'sys/amd64')
-rw-r--r--  sys/amd64/include/vmm.h                    3
-rw-r--r--  sys/amd64/include/vmm_instruction_emul.h   113
-rw-r--r--  sys/amd64/vmm/intel/vmcs.h                 1
-rw-r--r--  sys/amd64/vmm/intel/vmx.c                  45
-rw-r--r--  sys/amd64/vmm/vmm_instruction_emul.c       481
-rw-r--r--  sys/amd64/vmm/vmm_instruction_emul.h       91
-rw-r--r--  sys/amd64/vmm/vmm_lapic.c                  83
-rw-r--r--  sys/amd64/vmm/vmm_lapic.h                  6
8 files changed, 605 insertions(+), 218 deletions(-)
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index 8f78b8f..2fb2194 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -150,6 +150,8 @@ void vm_interrupt_hostcpu(struct vm *vm, int vcpu);
#endif /* KERNEL */
+#include <machine/vmm_instruction_emul.h>
+
#define VM_MAXCPU 8 /* maximum virtual cpus */
/*
@@ -268,6 +270,7 @@ struct vm_exit {
uint64_t cr3;
uint64_t gpa;
int rwx;
+ struct vie vie;
} paging;
/*
* VMX specific payload. Used when there is no "better"
diff --git a/sys/amd64/include/vmm_instruction_emul.h b/sys/amd64/include/vmm_instruction_emul.h
new file mode 100644
index 0000000..4cc494b
--- /dev/null
+++ b/sys/amd64/include/vmm_instruction_emul.h
@@ -0,0 +1,113 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_INSTRUCTION_EMUL_H_
+#define _VMM_INSTRUCTION_EMUL_H_
+
+/*
+ * The data structures 'vie' and 'vie_op' are meant to be opaque to the
+ * consumers of instruction decoding. The only reason why their contents
+ * need to be exposed is because they are part of the 'vm_exit' structure.
+ */
+struct vie_op {
+ uint8_t op_byte; /* actual opcode byte */
+ uint8_t op_type; /* type of operation (e.g. MOV) */
+ uint16_t op_flags;
+};
+
+#define VIE_INST_SIZE 15
+struct vie {
+ uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */
+ uint8_t num_valid; /* size of the instruction */
+ uint8_t num_processed;
+
+ uint8_t rex_w:1, /* REX prefix */
+ rex_r:1,
+ rex_x:1,
+ rex_b:1;
+
+ uint8_t mod:2, /* ModRM byte */
+ reg:4,
+ rm:4;
+
+ uint8_t ss:2, /* SIB byte */
+ index:4,
+ base:4;
+
+ uint8_t disp_bytes;
+ uint8_t imm_bytes;
+
+ uint8_t scale;
+ int base_register; /* VM_REG_GUEST_xyz */
+ int index_register; /* VM_REG_GUEST_xyz */
+
+ int64_t displacement; /* optional addr displacement */
+ int64_t immediate; /* optional immediate operand */
+
+ uint8_t decoded; /* set to 1 if successfully decoded */
+
+ struct vie_op op; /* opcode description */
+};
+
+/*
+ * Callback functions to read and write memory regions.
+ */
+typedef int (*mem_region_read_t)(void *vm, int cpuid, uint64_t gpa,
+ uint64_t *rval, int rsize, void *arg);
+
+typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa,
+ uint64_t wval, int wsize, void *arg);
+
+/*
+ * Emulate the decoded 'vie' instruction.
+ *
+ * The callbacks 'mrr' and 'mrw' emulate reads and writes to the memory region
+ * containing 'gpa'. 'mrarg' is an opaque argument that is passed into the
+ * callback functions.
+ *
+ * 'void *vm' should be 'struct vm *' when called from kernel context and
+ * 'struct vmctx *' when called from user context.
+ */
+int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t mrr, mem_region_write_t mrw,
+ void *mrarg);
+
+#ifdef _KERNEL
+/*
+ * APIs to fetch and decode the instruction from nested page fault handler.
+ */
+int vmm_fetch_instruction(struct vm *vm, int cpuid,
+ uint64_t rip, int inst_length, uint64_t cr3,
+ struct vie *vie);
+
+int vmm_decode_instruction(struct vm *vm, int cpuid,
+ uint64_t gla, struct vie *vie);
+#endif /* _KERNEL */
+
+#endif /* _VMM_INSTRUCTION_EMUL_H_ */
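As a usage illustration of the header above: any memory region can be
emulated by supplying a pair of callbacks matching the typedefs. The sketch
below is hypothetical, with the backing array and REGION_BASE invented for
the example; it is not part of the commit.

#define	REGION_BASE	0xd0000000UL	/* hypothetical MMIO region base */
static uint32_t region[PAGE_SIZE / 4];	/* toy backing store */

static int
region_read(void *vm, int cpuid, uint64_t gpa, uint64_t *rval, int rsize,
    void *arg)
{
	if (rsize != 4)
		return (EINVAL);
	*rval = region[(gpa - REGION_BASE) / 4];
	return (0);
}

static int
region_write(void *vm, int cpuid, uint64_t gpa, uint64_t wval, int wsize,
    void *arg)
{
	if (wsize != 4)
		return (EINVAL);
	region[(gpa - REGION_BASE) / 4] = wval;
	return (0);
}

/* With a fetched and decoded 'vie' in hand: */
error = vmm_emulate_instruction(vm, cpuid, gpa, &vie, region_read,
    region_write, NULL);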
diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h
index 84532f4..f39eed2 100644
--- a/sys/amd64/vmm/intel/vmcs.h
+++ b/sys/amd64/vmm/intel/vmcs.h
@@ -67,6 +67,7 @@ uint64_t vmcs_read(uint32_t encoding);
#define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION)
#define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3)
#define vmcs_gpa() vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS)
+#define vmcs_gla() vmcs_read(VMCS_GUEST_LINEAR_ADDRESS)
#endif /* _KERNEL */
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
index 7a9cfb8..b185c57 100644
--- a/sys/amd64/vmm/intel/vmx.c
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -63,7 +63,6 @@ __FBSDID("$FreeBSD$");
#include "vmx.h"
#include "x86.h"
#include "vmx_controls.h"
-#include "vmm_instruction_emul.h"
#define PINBASED_CTLS_ONE_SETTING \
(PINBASED_EXTINT_EXITING | \
@@ -1150,23 +1149,11 @@ vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
}
static int
-vmx_lapic_fault(struct vm *vm, int cpu,
- uint64_t gpa, uint64_t rip, int inst_length,
- uint64_t cr3, uint64_t ept_qual)
+vmx_ept_fault(struct vm *vm, int cpu,
+ uint64_t gla, uint64_t gpa, uint64_t rip, int inst_length,
+ uint64_t cr3, uint64_t ept_qual, struct vie *vie)
{
- int read, write, handled;
- struct vie vie;
-
- /*
- * For this to be a legitimate access to the local apic:
- * - the GPA in the local apic page
- * - the GPA must be aligned on a 16 byte boundary
- */
- if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE)
- return (UNHANDLED);
-
- if ((gpa & 0xF) != 0)
- return (UNHANDLED);
+ int read, write, error;
/* EPT violation on an instruction fetch doesn't make sense here */
if (ept_qual & EPT_VIOLATION_INST_FETCH)
@@ -1188,15 +1175,22 @@ vmx_lapic_fault(struct vm *vm, int cpu,
}
/* Fetch, decode and emulate the faulting instruction */
- if (vmm_fetch_instruction(vm, rip, inst_length, cr3, &vie) != 0)
+ if (vmm_fetch_instruction(vm, cpu, rip, inst_length, cr3, vie) != 0)
return (UNHANDLED);
- if (vmm_decode_instruction(&vie) != 0)
+ if (vmm_decode_instruction(vm, cpu, gla, vie) != 0)
return (UNHANDLED);
- handled = lapic_mmio(vm, cpu, gpa - DEFAULT_APIC_BASE, read, &vie);
+ /*
+ * Check if this is a local apic access
+ */
+ if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE)
+ return (UNHANDLED);
- return (handled);
+ error = vmm_emulate_instruction(vm, cpu, gpa, vie,
+ lapic_mmio_read, lapic_mmio_write, 0);
+
+ return (error ? UNHANDLED : HANDLED);
}
static int
@@ -1206,7 +1200,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
struct vmcs *vmcs;
struct vmxctx *vmxctx;
uint32_t eax, ecx, edx;
- uint64_t qual, gpa, cr3, intr_info;
+ uint64_t qual, gla, gpa, cr3, intr_info;
handled = 0;
vmcs = &vmx->vmcs[vcpu];
@@ -1299,11 +1293,12 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
break;
case EXIT_REASON_EPT_FAULT:
+ gla = vmcs_gla();
gpa = vmcs_gpa();
cr3 = vmcs_guest_cr3();
- handled = vmx_lapic_fault(vmx->vm, vcpu,
- gpa, vmexit->rip, vmexit->inst_length,
- cr3, qual);
+ handled = vmx_ept_fault(vmx->vm, vcpu, gla, gpa,
+ vmexit->rip, vmexit->inst_length,
+ cr3, qual, &vmexit->u.paging.vie);
if (!handled) {
vmexit->exitcode = VM_EXITCODE_PAGING;
vmexit->u.paging.cr3 = cr3;
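The read/write classification elided from the hunk above is derived from the
EPT violation exit qualification: per the Intel SDM, bit 0 of the
qualification signals a data read and bit 1 a data write, alongside the
instruction-fetch bit already tested. A sketch of that derivation follows;
the two macro names are illustrative, not quoted from the source.

#define	EPT_VIOLATION_DATA_READ		(1UL << 0)	/* illustrative */
#define	EPT_VIOLATION_DATA_WRITE	(1UL << 1)	/* illustrative */

	read = (ept_qual & EPT_VIOLATION_DATA_READ) ? 1 : 0;
	write = (ept_qual & EPT_VIOLATION_DATA_WRITE) ? 1 : 0;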
diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c
index 7ef4dbb..5e5399b 100644
--- a/sys/amd64/vmm/vmm_instruction_emul.c
+++ b/sys/amd64/vmm/vmm_instruction_emul.c
@@ -30,6 +30,7 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#ifdef _KERNEL
#include <sys/param.h>
#include <sys/pcpu.h>
#include <sys/systm.h>
@@ -40,10 +41,60 @@ __FBSDID("$FreeBSD$");
#include <machine/pmap.h>
#include <machine/vmparam.h>
#include <machine/vmm.h>
+#else /* !_KERNEL */
+#include <sys/types.h>
+#include <sys/errno.h>
-#include "vmm_instruction_emul.h"
+#include <machine/vmm.h>
+
+#include <vmmapi.h>
+#endif /* _KERNEL */
+
+
+
+/* struct vie_op.op_type */
+enum {
+ VIE_OP_TYPE_NONE = 0,
+ VIE_OP_TYPE_MOV,
+ VIE_OP_TYPE_AND,
+ VIE_OP_TYPE_LAST
+};
+
+/* struct vie_op.op_flags */
+#define VIE_OP_F_IMM (1 << 0) /* immediate operand present */
+#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */
+
+static const struct vie_op one_byte_opcodes[256] = {
+ [0x89] = {
+ .op_byte = 0x89,
+ .op_type = VIE_OP_TYPE_MOV,
+ },
+ [0x8B] = {
+ .op_byte = 0x8B,
+ .op_type = VIE_OP_TYPE_MOV,
+ },
+ [0xC7] = {
+ .op_byte = 0xC7,
+ .op_type = VIE_OP_TYPE_MOV,
+ .op_flags = VIE_OP_F_IMM,
+ },
+ [0x23] = {
+ .op_byte = 0x23,
+ .op_type = VIE_OP_TYPE_AND,
+ }
+};
+
+/* struct vie.mod */
+#define VIE_MOD_INDIRECT 0
+#define VIE_MOD_INDIRECT_DISP8 1
+#define VIE_MOD_INDIRECT_DISP32 2
+#define VIE_MOD_DIRECT 3
-#define GB (1024 * 1024 * 1024)
+/* struct vie.rm */
+#define VIE_RM_SIB 4
+#define VIE_RM_DISP32 5
+
+#define GB (1024 * 1024 * 1024)
static enum vm_reg_name gpr_map[16] = {
VM_REG_GUEST_RAX,
@@ -64,17 +115,232 @@ static enum vm_reg_name gpr_map[16] = {
VM_REG_GUEST_R15
};
+static uint64_t size2mask[] = {
+ [1] = 0xff,
+ [2] = 0xffff,
+ [4] = 0xffffffff,
+ [8] = 0xffffffffffffffff,
+};
+
+static int
+vie_valid_register(enum vm_reg_name reg)
+{
+#ifdef _KERNEL
+ /*
+ * XXX
+ * The operand register in which we store the result of the
+ * read must be a GPR that we can modify even if the vcpu
+ * is "running". All the GPRs qualify except for %rsp.
+ *
+ * This is a limitation of the vm_set_register() API
+ * and can be fixed if necessary.
+ */
+ if (reg == VM_REG_GUEST_RSP)
+ return (0);
+#endif
+ return (1);
+}
+
+static int
+vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
+{
+ int error;
+
+ if (!vie_valid_register(reg))
+ return (EINVAL);
+
+ error = vm_get_register(vm, vcpuid, reg, rval);
+
+ return (error);
+}
+
+static int
+vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
+ uint64_t val, int size)
+{
+ int error;
+ uint64_t origval;
+
+ if (!vie_valid_register(reg))
+ return (EINVAL);
+
+ switch (size) {
+ case 1:
+ case 2:
+ error = vie_read_register(vm, vcpuid, reg, &origval);
+ if (error)
+ return (error);
+ val &= size2mask[size];
+ val |= origval & ~size2mask[size];
+ break;
+ case 4:
+ val &= 0xffffffffUL;
+ break;
+ case 8:
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ error = vm_set_register(vm, vcpuid, reg, val);
+ return (error);
+}
+
+/*
+ * The following simplifying assumptions are made during emulation:
+ *
+ * - guest is in 64-bit mode
+ * - default address size is 64-bits
+ * - default operand size is 32-bits
+ *
+ * - operand size override is not supported
+ *
+ * - address size override is not supported
+ */
+static int
+emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
+{
+ int error, size;
+ enum vm_reg_name reg;
+ uint64_t val;
+
+ size = 4;
+ error = EINVAL;
+
+ switch (vie->op.op_byte) {
+ case 0x89:
+ /*
+ * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
+ * 89/r: mov r/m32, r32
+ * REX.W + 89/r mov r/m64, r64
+ */
+ if (vie->rex_w)
+ size = 8;
+ reg = gpr_map[vie->reg];
+ error = vie_read_register(vm, vcpuid, reg, &val);
+ if (error == 0) {
+ val &= size2mask[size];
+ error = memwrite(vm, vcpuid, gpa, val, size, arg);
+ }
+ break;
+ case 0x8B:
+ /*
+ * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
+ * 8B/r: mov r32, r/m32
+ * REX.W 8B/r: mov r64, r/m64
+ */
+ if (vie->rex_w)
+ size = 8;
+ error = memread(vm, vcpuid, gpa, &val, size, arg);
+ if (error == 0) {
+ reg = gpr_map[vie->reg];
+ error = vie_update_register(vm, vcpuid, reg, val, size);
+ }
+ break;
+ case 0xC7:
+ /*
+ * MOV from imm32 to mem (ModRM:r/m)
+ * C7/0 mov r/m32, imm32
+ * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits)
+ */
+ val = vie->immediate; /* already sign-extended */
+
+ if (vie->rex_w)
+ size = 8;
+
+ if (size != 8)
+ val &= size2mask[size];
+
+ error = memwrite(vm, vcpuid, gpa, val, size, arg);
+ break;
+ default:
+ break;
+ }
+
+ return (error);
+}
+
+static int
+emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
+{
+ int error, size;
+ enum vm_reg_name reg;
+ uint64_t val1, val2;
+
+ size = 4;
+ error = EINVAL;
+
+ switch (vie->op.op_byte) {
+ case 0x23:
+ /*
+ * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
+ * result in reg.
+ *
+ * 23/r and r32, r/m32
+ * REX.W + 23/r and r64, r/m64
+ */
+ if (vie->rex_w)
+ size = 8;
+
+ /* get the first operand */
+ reg = gpr_map[vie->reg];
+ error = vie_read_register(vm, vcpuid, reg, &val1);
+ if (error)
+ break;
+
+ /* get the second operand */
+ error = memread(vm, vcpuid, gpa, &val2, size, arg);
+ if (error)
+ break;
+
+ /* perform the operation and write the result */
+ val1 &= val2;
+ error = vie_update_register(vm, vcpuid, reg, val1, size);
+ break;
+ default:
+ break;
+ }
+ return (error);
+}
+
+int
+vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite,
+ void *memarg)
+{
+ int error;
+
+ if (!vie->decoded)
+ return (EINVAL);
+
+ switch (vie->op.op_type) {
+ case VIE_OP_TYPE_MOV:
+ error = emulate_mov(vm, vcpuid, gpa, vie,
+ memread, memwrite, memarg);
+ break;
+ case VIE_OP_TYPE_AND:
+ error = emulate_and(vm, vcpuid, gpa, vie,
+ memread, memwrite, memarg);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ return (error);
+}
+
+#ifdef _KERNEL
static void
vie_init(struct vie *vie)
{
bzero(vie, sizeof(struct vie));
- vie->op_size = VIE_OP_SIZE_32BIT;
-
vie->base_register = VM_REG_LAST;
vie->index_register = VM_REG_LAST;
- vie->operand_register = VM_REG_LAST;
}
static int
@@ -129,7 +395,7 @@ error:
}
int
-vmm_fetch_instruction(struct vm *vm, uint64_t rip, int inst_length,
+vmm_fetch_instruction(struct vm *vm, int cpuid, uint64_t rip, int inst_length,
uint64_t cr3, struct vie *vie)
{
int n, err;
@@ -172,6 +438,7 @@ vmm_fetch_instruction(struct vm *vm, uint64_t rip, int inst_length,
static int
vie_peek(struct vie *vie, uint8_t *x)
{
+
if (vie->num_processed < vie->num_valid) {
*x = vie->inst[vie->num_processed];
return (0);
@@ -182,8 +449,6 @@ vie_peek(struct vie *vie, uint8_t *x)
static void
vie_advance(struct vie *vie)
{
- if (vie->num_processed >= vie->num_valid)
- panic("vie_advance: %d/%d", vie->num_processed, vie->num_valid);
vie->num_processed++;
}
@@ -213,24 +478,16 @@ decode_opcode(struct vie *vie)
{
uint8_t x;
- static const uint8_t flags[256] = {
- [0x89] = VIE_F_HAS_MODRM | VIE_F_FROM_REG | VIE_F_TO_RM,
- [0x8B] = VIE_F_HAS_MODRM | VIE_F_FROM_RM | VIE_F_TO_REG,
- [0xC7] = VIE_F_HAS_MODRM | VIE_F_FROM_IMM | VIE_F_TO_RM,
- };
-
if (vie_peek(vie, &x))
return (-1);
- vie->opcode_byte = x;
- vie->opcode_flags = flags[x];
+ vie->op = one_byte_opcodes[x];
- vie_advance(vie);
-
- if (vie->opcode_flags == 0)
+ if (vie->op.op_type == VIE_OP_TYPE_NONE)
return (-1);
- else
- return (0);
+
+ vie_advance(vie);
+ return (0);
}
/*
@@ -241,9 +498,6 @@ decode_modrm(struct vie *vie)
{
uint8_t x;
- if ((vie->opcode_flags & VIE_F_HAS_MODRM) == 0)
- return (0);
-
if (vie_peek(vie, &x))
return (-1);
@@ -251,35 +505,40 @@ decode_modrm(struct vie *vie)
vie->rm = (x >> 0) & 0x7;
vie->reg = (x >> 3) & 0x7;
+ /*
+ * A direct addressing mode makes no sense in the context of an EPT
+ * fault. There has to be a memory access involved to cause the
+ * EPT fault.
+ */
+ if (vie->mod == VIE_MOD_DIRECT)
+ return (-1);
+
if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
(vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
- /*
- * Table 2-5: Special Cases of REX Encodings
- *
- * mod=0, r/m=5 is used in the compatibility mode to
- * indicate a disp32 without a base register.
- *
- * mod!=3, r/m=4 is used in the compatibility mode to
- * indicate that the SIB byte is present.
- *
- * The 'b' bit in the REX prefix is don't care in
- * this case.
- */
+ /*
+ * Table 2-5: Special Cases of REX Encodings
+ *
+ * mod=0, r/m=5 is used in the compatibility mode to
+ * indicate a disp32 without a base register.
+ *
+ * mod!=3, r/m=4 is used in the compatibility mode to
+ * indicate that the SIB byte is present.
+ *
+ * The 'b' bit in the REX prefix is don't care in
+ * this case.
+ */
} else {
vie->rm |= (vie->rex_b << 3);
}
vie->reg |= (vie->rex_r << 3);
- /* SIB addressing not supported yet */
+ /* SIB */
if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
- return (-1);
+ goto done;
vie->base_register = gpr_map[vie->rm];
- if (vie->opcode_flags & (VIE_F_FROM_REG | VIE_F_TO_REG))
- vie->operand_register = gpr_map[vie->reg];
-
switch (vie->mod) {
case VIE_MOD_INDIRECT_DISP8:
vie->disp_bytes = 1;
@@ -295,12 +554,76 @@ decode_modrm(struct vie *vie)
break;
}
- /* calculate the operand size */
- if (vie->rex_w)
- vie->op_size = VIE_OP_SIZE_64BIT;
-
- if (vie->opcode_flags & VIE_F_FROM_IMM)
+ /* Figure out immediate operand size (if any) */
+ if (vie->op.op_flags & VIE_OP_F_IMM)
vie->imm_bytes = 4;
+ else if (vie->op.op_flags & VIE_OP_F_IMM8)
+ vie->imm_bytes = 1;
+
+done:
+ vie_advance(vie);
+
+ return (0);
+}
+
+static int
+decode_sib(struct vie *vie)
+{
+ uint8_t x;
+
+ /* Proceed only if SIB byte is present */
+ if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
+ return (0);
+
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ /* De-construct the SIB byte */
+ vie->ss = (x >> 6) & 0x3;
+ vie->index = (x >> 3) & 0x7;
+ vie->base = (x >> 0) & 0x7;
+
+ /* Apply the REX prefix modifiers */
+ vie->index |= vie->rex_x << 3;
+ vie->base |= vie->rex_b << 3;
+
+ switch (vie->mod) {
+ case VIE_MOD_INDIRECT_DISP8:
+ vie->disp_bytes = 1;
+ break;
+ case VIE_MOD_INDIRECT_DISP32:
+ vie->disp_bytes = 4;
+ break;
+ }
+
+ if (vie->mod == VIE_MOD_INDIRECT &&
+ (vie->base == 5 || vie->base == 13)) {
+ /*
+ * Special case when base register is unused if mod = 0
+ * and base = %rbp or %r13.
+ *
+ * Documented in:
+ * Table 2-3: 32-bit Addressing Forms with the SIB Byte
+ * Table 2-5: Special Cases of REX Encodings
+ */
+ vie->disp_bytes = 4;
+ } else {
+ vie->base_register = gpr_map[vie->base];
+ }
+
+ /*
+ * All encodings of 'index' are valid except for %rsp (4).
+ *
+ * Documented in:
+ * Table 2-3: 32-bit Addressing Forms with the SIB Byte
+ * Table 2-5: Special Cases of REX Encodings
+ */
+ if (vie->index != 4)
+ vie->index_register = gpr_map[vie->index];
+
+ /* 'scale' makes sense only in the context of an index register */
+ if (vie->index_register < VM_REG_LAST)
+ vie->scale = 1 << vie->ss;
vie_advance(vie);
@@ -348,13 +671,14 @@ decode_immediate(struct vie *vie)
uint8_t x;
union {
char buf[4];
+ int8_t signed8;
int32_t signed32;
} u;
if ((n = vie->imm_bytes) == 0)
return (0);
- if (n != 4)
+ if (n != 1 && n != 4)
panic("decode_immediate: invalid imm_bytes %d", n);
for (i = 0; i < n; i++) {
@@ -365,14 +689,62 @@ decode_immediate(struct vie *vie)
vie_advance(vie);
}
- vie->immediate = u.signed32; /* sign-extended */
+ if (n == 1)
+ vie->immediate = u.signed8; /* sign-extended */
+ else
+ vie->immediate = u.signed32; /* sign-extended */
return (0);
}
+#define VERIFY_GLA
+/*
+ * Verify that the 'guest linear address' provided as collateral of the nested
+ * page table fault matches with our instruction decoding.
+ */
+#ifdef VERIFY_GLA
+static int
+verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
+{
+ int error;
+ uint64_t base, idx;
+
+ base = 0;
+ if (vie->base_register != VM_REG_LAST) {
+ error = vm_get_register(vm, cpuid, vie->base_register, &base);
+ if (error) {
+ printf("verify_gla: error %d getting base reg %d\n",
+ error, vie->base_register);
+ return (-1);
+ }
+ }
+
+ idx = 0;
+ if (vie->index_register != VM_REG_LAST) {
+ error = vm_get_register(vm, cpuid, vie->index_register, &idx);
+ if (error) {
+ printf("verify_gla: error %d getting index reg %d\n",
+ error, vie->index_register);
+ return (-1);
+ }
+ }
+
+ if (base + vie->scale * idx + vie->displacement != gla) {
+ printf("verify_gla mismatch: "
+ "base(0x%0lx), scale(%d), index(0x%0lx), "
+ "disp(0x%0lx), gla(0x%0lx)\n",
+ base, vie->scale, idx, vie->displacement, gla);
+ return (-1);
+ }
+
+ return (0);
+}
+#endif /* VERIFY_GLA */
+
int
-vmm_decode_instruction(struct vie *vie)
+vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
{
+
if (decode_rex(vie))
return (-1);
@@ -382,11 +754,22 @@ vmm_decode_instruction(struct vie *vie)
if (decode_modrm(vie))
return (-1);
+ if (decode_sib(vie))
+ return (-1);
+
if (decode_displacement(vie))
return (-1);
if (decode_immediate(vie))
return (-1);
+#ifdef VERIFY_GLA
+ if (verify_gla(vm, cpuid, gla, vie))
+ return (-1);
+#endif
+
+ vie->decoded = 1; /* success */
+
return (0);
}
+#endif /* _KERNEL */
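To make the decode path concrete, here is a worked example (not from the
commit) of how the stages above take apart one instruction:

/*
 * Worked example (illustrative): mov %edi,0x10(%rax,%rbx,4)
 * encodes as the four bytes 89 7C 98 10.
 *
 * decode_opcode:       0x89 -> one_byte_opcodes[0x89], VIE_OP_TYPE_MOV
 * decode_modrm:        0x7C -> mod=1 (disp8), reg=7 (%edi), rm=4 (SIB)
 * decode_sib:          0x98 -> ss=2 (scale=4), index=3 (%rbx), base=0 (%rax)
 * decode_displacement: 0x10 -> displacement = 16
 * decode_immediate:    none (op_flags has no VIE_OP_F_IMM*)
 *
 * verify_gla() then checks that %rax + 4 * %rbx + 16 equals the guest
 * linear address reported by the hardware, and emulate_mov() (opcode 0x89,
 * no REX.W, so size = 4) writes the low 32 bits of %rdi to the region via
 * the memwrite callback.
 */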
diff --git a/sys/amd64/vmm/vmm_instruction_emul.h b/sys/amd64/vmm/vmm_instruction_emul.h
deleted file mode 100644
index 1fa9e2b..0000000
--- a/sys/amd64/vmm/vmm_instruction_emul.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/*-
- * Copyright (c) 2012 NetApp, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef _VMM_INSTRUCTION_EMUL_H_
-#define _VMM_INSTRUCTION_EMUL_H_
-
-enum vie_op_size {
- VIE_OP_SIZE_32BIT, /* default */
- VIE_OP_SIZE_64BIT,
- VIE_OP_SIZE_8BIT
-};
-
-#define VIE_INST_SIZE 15
-struct vie {
- uint8_t inst[VIE_INST_SIZE];
-
- uint8_t rex_w:1,
- rex_r:1,
- rex_x:1,
- rex_b:1;
-
- uint8_t mod:2,
- reg:4,
- rm:4;
-
-
- uint8_t opcode_byte;
- uint16_t opcode_flags;
- uint8_t disp_bytes;
- uint8_t imm_bytes;
-
- int num_valid;
- int num_processed;
-
- enum vm_reg_name base_register;
- enum vm_reg_name index_register;
- enum vm_reg_name operand_register;
-
- int op_size;
- int64_t displacement;
- int64_t immediate;
-};
-
-#define VIE_F_HAS_MODRM (1 << 0)
-#define VIE_F_FROM_RM (1 << 1)
-#define VIE_F_FROM_REG (1 << 2)
-#define VIE_F_TO_RM (1 << 3)
-#define VIE_F_TO_REG (1 << 4)
-#define VIE_F_FROM_IMM (1 << 5)
-
-#define VIE_MOD_INDIRECT 0
-#define VIE_MOD_INDIRECT_DISP8 1
-#define VIE_MOD_INDIRECT_DISP32 2
-#define VIE_MOD_DIRECT 3
-
-#define VIE_RM_SIB 4
-#define VIE_RM_DISP32 5
-
-struct vm;
-
-int vmm_fetch_instruction(struct vm *vm, uint64_t rip, int inst_length,
- uint64_t cr3, struct vie *vie);
-
-int vmm_decode_instruction(struct vie *vie);
-
-#endif
diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c
index bb22122..dabcf06 100644
--- a/sys/amd64/vmm/vmm_lapic.c
+++ b/sys/amd64/vmm/vmm_lapic.c
@@ -34,12 +34,12 @@ __FBSDID("$FreeBSD$");
#include <sys/smp.h>
#include <x86/specialreg.h>
+#include <x86/apicreg.h>
#include <machine/vmm.h>
#include "vmm_ipi.h"
#include "vmm_lapic.h"
#include "vlapic.h"
-#include "vmm_instruction_emul.h"
static int
lapic_write(struct vlapic *vlapic, u_int offset, uint64_t val)
@@ -177,64 +177,45 @@ lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t val)
}
int
-lapic_mmio(struct vm *vm, int cpu, u_int offset, int read, struct vie *vie)
+lapic_mmio_write(void *vm, int cpu, uint64_t gpa, uint64_t wval, int size,
+ void *arg)
{
- int handled, error;
- uint64_t val;
+ int error;
+ uint64_t off;
struct vlapic *vlapic;
- const int UNHANDLED = 0;
+ off = gpa - DEFAULT_APIC_BASE;
+
+ /*
+ * Memory mapped local apic accesses must be 4 bytes wide and
+ * aligned on a 16-byte boundary.
+ */
+ if (size != 4 || off & 0xf)
+ return (EINVAL);
vlapic = vm_lapic(vm, cpu);
+ error = vlapic_op_mem_write(vlapic, off, DWORD, wval);
+ return (error);
+}
+
+int
+lapic_mmio_read(void *vm, int cpu, uint64_t gpa, uint64_t *rval, int size,
+ void *arg)
+{
+ int error;
+ uint64_t off;
+ struct vlapic *vlapic;
- /* Only 32-bit accesses to local apic */
- if (vie->op_size != VIE_OP_SIZE_32BIT)
- return (UNHANDLED);
+ off = gpa - DEFAULT_APIC_BASE;
/*
- * XXX
- * The operand register in which we store the result of the
- * read must be a GPR that we can modify even if the vcpu
- * is "running". All the GPRs qualify except for %rsp.
- *
- * This is a limitation of the vm_set_register() API
- * and can be fixed if necessary.
+ * Memory mapped local apic accesses must be 4 bytes wide and
+ * aligned on a 16-byte boundary.
*/
- if (vie->operand_register == VM_REG_GUEST_RSP)
- return (UNHANDLED);
-
- if (read) {
- if ((vie->opcode_flags & VIE_F_TO_REG) == 0)
- return (UNHANDLED);
-
- if (vie->operand_register >= VM_REG_LAST)
- return (UNHANDLED);
-
- handled = lapic_read(vlapic, offset, &val);
- if (handled) {
- error = vm_set_register(vm, cpu, vie->operand_register,
- val);
- if (error)
- panic("lapic_mmio: error %d setting gpr %d",
- error, vie->operand_register);
- }
- } else {
- if ((vie->opcode_flags & VIE_F_FROM_REG) &&
- (vie->operand_register < VM_REG_LAST)) {
- error = vm_get_register(vm, cpu, vie->operand_register,
- &val);
- if (error) {
- panic("lapic_mmio: error %d getting gpr %d",
- error, vie->operand_register);
- }
- } else if (vie->opcode_flags & VIE_F_FROM_IMM) {
- val = vie->immediate;
- } else {
- return (UNHANDLED);
- }
-
- handled = lapic_write(vlapic, offset, val);
- }
+ if (size != 4 || off & 0xf)
+ return (EINVAL);
- return (handled);
+ vlapic = vm_lapic(vm, cpu);
+ error = vlapic_op_mem_read(vlapic, off, DWORD, rval);
+ return (error);
}
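The validity test repeated in both callbacks above can be read standalone.
The helper below is an invented restatement for illustration, not part of
the change (the callbacks themselves rely on the caller having already
confined the GPA to the apic page):

/*
 * Sketch: an access emulated against the local apic page must be a
 * 4-byte access to a 16-byte aligned register offset.
 */
static int
lapic_access_ok(uint64_t gpa, int size)
{
	uint64_t off = gpa - DEFAULT_APIC_BASE;

	return (off < PAGE_SIZE && size == 4 && (off & 0xf) == 0);
}

/*
 * e.g. lapic_access_ok(DEFAULT_APIC_BASE + 0x20, 4) accepts the APIC ID
 * register, while any 8-byte or misaligned access is rejected.
 */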
diff --git a/sys/amd64/vmm/vmm_lapic.h b/sys/amd64/vmm/vmm_lapic.h
index 59fc016..a79912e 100644
--- a/sys/amd64/vmm/vmm_lapic.h
+++ b/sys/amd64/vmm/vmm_lapic.h
@@ -30,13 +30,15 @@
#define _VMM_LAPIC_H_
struct vm;
-struct vie;
boolean_t lapic_msr(u_int num);
int lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval);
int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t wval);
-int lapic_mmio(struct vm *vm, int cpu, u_int offset, int rd, struct vie *);
+int lapic_mmio_read(void *vm, int cpu, uint64_t gpa,
+ uint64_t *rval, int size, void *arg);
+int lapic_mmio_write(void *vm, int cpu, uint64_t gpa,
+ uint64_t wval, int size, void *arg);
int lapic_timer_tick(struct vm *vm, int cpu);