summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--lib/libvmmapi/vmmapi.c106
-rw-r--r--lib/libvmmapi/vmmapi.h13
-rw-r--r--sys/amd64/include/vmm.h113
-rw-r--r--sys/amd64/include/vmm_dev.h12
-rw-r--r--sys/amd64/include/vmm_instruction_emul.h92
-rw-r--r--sys/amd64/vmm/intel/vmx.c155
-rw-r--r--sys/amd64/vmm/vmm.c67
-rw-r--r--sys/amd64/vmm/vmm_dev.c24
-rw-r--r--sys/amd64/vmm/vmm_instruction_emul.c359
-rw-r--r--sys/amd64/vmm/vmm_ioport.c108
-rw-r--r--sys/amd64/vmm/vmm_ioport.h2
-rw-r--r--sys/amd64/vmm/vmm_ktr.h4
-rw-r--r--usr.sbin/bhyve/bhyverun.c26
-rw-r--r--usr.sbin/bhyve/inout.c151
-rw-r--r--usr.sbin/bhyve/inout.h10
-rw-r--r--usr.sbin/bhyve/mem.c1
-rw-r--r--usr.sbin/bhyve/pci_virtio_block.c6
17 files changed, 1025 insertions, 224 deletions
diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c
index 5e630f8..ba2904c 100644
--- a/lib/libvmmapi/vmmapi.c
+++ b/lib/libvmmapi/vmmapi.c
@@ -33,8 +33,10 @@ __FBSDID("$FreeBSD$");
#include <sys/sysctl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
+#include <sys/_iovec.h>
#include <machine/specialreg.h>
+#include <machine/param.h>
#include <stdio.h>
#include <stdlib.h>
@@ -937,3 +939,107 @@ vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities)
*capabilities = cap.capabilities;
return (error);
}
+
+static int
+gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+ uint64_t gla, int prot, int *fault, uint64_t *gpa)
+{
+ struct vm_gla2gpa gg;
+ int error;
+
+ bzero(&gg, sizeof(struct vm_gla2gpa));
+ gg.vcpuid = vcpu;
+ gg.prot = prot;
+ gg.gla = gla;
+ gg.paging = *paging;
+
+ error = ioctl(ctx->fd, VM_GLA2GPA, &gg);
+ if (error == 0) {
+ *fault = gg.fault;
+ *gpa = gg.gpa;
+ }
+ return (error);
+}
+
+#ifndef min
+#define min(a,b) (((a) < (b)) ? (a) : (b))
+#endif
+
+int
+vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+ uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt)
+{
+ uint64_t gpa;
+ int error, fault, i, n, off;
+
+ for (i = 0; i < iovcnt; i++) {
+ iov[i].iov_base = 0;
+ iov[i].iov_len = 0;
+ }
+
+ while (len) {
+ assert(iovcnt > 0);
+ error = gla2gpa(ctx, vcpu, paging, gla, prot, &fault, &gpa);
+ if (error)
+ return (-1);
+ if (fault)
+ return (1);
+
+ off = gpa & PAGE_MASK;
+ n = min(len, PAGE_SIZE - off);
+
+ iov->iov_base = (void *)gpa;
+ iov->iov_len = n;
+ iov++;
+ iovcnt--;
+
+ gla += n;
+ len -= n;
+ }
+ return (0);
+}
+
+void
+vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *iov, void *vp, size_t len)
+{
+ const char *src;
+ char *dst;
+ uint64_t gpa;
+ size_t n;
+
+ dst = vp;
+ while (len) {
+ assert(iov->iov_len);
+ gpa = (uint64_t)iov->iov_base;
+ n = min(len, iov->iov_len);
+ src = vm_map_gpa(ctx, gpa, n);
+ bcopy(src, dst, n);
+
+ iov++;
+ dst += n;
+ len -= n;
+ }
+}
+
+void
+vm_copyout(struct vmctx *ctx, int vcpu, const void *vp, struct iovec *iov,
+ size_t len)
+{
+ const char *src;
+ char *dst;
+ uint64_t gpa;
+ size_t n;
+
+ src = vp;
+ while (len) {
+ assert(iov->iov_len);
+ gpa = (uint64_t)iov->iov_base;
+ n = min(len, iov->iov_len);
+ dst = vm_map_gpa(ctx, gpa, n);
+ bcopy(src, dst, n);
+
+ iov++;
+ src += n;
+ len -= n;
+ }
+}
diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
index 88e9947..bab41da 100644
--- a/lib/libvmmapi/vmmapi.h
+++ b/lib/libvmmapi/vmmapi.h
@@ -29,6 +29,7 @@
#ifndef _VMMAPI_H_
#define _VMMAPI_H_
+struct iovec;
struct vmctx;
enum x2apic_state;
@@ -109,6 +110,18 @@ int vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state s);
int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities);
+/*
+ * Translate the GLA range [gla,gla+len) into GPA segments in 'iov'.
+ * The 'iovcnt' should be big enough to accomodate all GPA segments.
+ * Returns 0 on success, 1 on a guest fault condition and -1 otherwise.
+ */
+int vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+ uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt);
+void vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *guest_iov,
+ void *host_dst, size_t len);
+void vm_copyout(struct vmctx *ctx, int vcpu, const void *host_src,
+ struct iovec *guest_iov, size_t len);
+
/* Reset vcpu register state */
int vcpu_reset(struct vmctx *ctx, int vcpu);
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index 50d879b..f1902d2 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -54,6 +54,7 @@ struct vmspace;
struct vm_object;
struct pmap;
+enum vm_reg_name;
enum x2apic_state;
typedef int (*vmm_init_func_t)(int ipinum);
@@ -236,10 +237,11 @@ int vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *vme);
void vm_inject_gp(struct vm *vm, int vcpuid); /* general protection fault */
void vm_inject_ud(struct vm *vm, int vcpuid); /* undefined instruction fault */
+void vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2);
-#endif /* KERNEL */
+enum vm_reg_name vm_segment_name(int seg_encoding);
-#include <machine/vmm_instruction_emul.h>
+#endif /* KERNEL */
#define VM_MAXCPU 16 /* maximum virtual cpus */
@@ -280,6 +282,7 @@ enum vm_reg_name {
VM_REG_GUEST_IDTR,
VM_REG_GUEST_GDTR,
VM_REG_GUEST_EFER,
+ VM_REG_GUEST_CR2,
VM_REG_LAST
};
@@ -318,6 +321,76 @@ struct seg_desc {
uint32_t limit;
uint32_t access;
};
+#define SEG_DESC_TYPE(desc) ((desc)->access & 0x001f)
+#define SEG_DESC_PRESENT(desc) ((desc)->access & 0x0080)
+#define SEG_DESC_DEF32(desc) ((desc)->access & 0x4000)
+#define SEG_DESC_GRANULARITY(desc) ((desc)->access & 0x8000)
+#define SEG_DESC_UNUSABLE(desc) ((desc)->access & 0x10000)
+
+enum vm_cpu_mode {
+ CPU_MODE_COMPATIBILITY, /* IA-32E mode (CS.L = 0) */
+ CPU_MODE_64BIT, /* IA-32E mode (CS.L = 1) */
+};
+
+enum vm_paging_mode {
+ PAGING_MODE_FLAT,
+ PAGING_MODE_32,
+ PAGING_MODE_PAE,
+ PAGING_MODE_64,
+};
+
+struct vm_guest_paging {
+ uint64_t cr3;
+ int cpl;
+ enum vm_cpu_mode cpu_mode;
+ enum vm_paging_mode paging_mode;
+};
+
+/*
+ * The data structures 'vie' and 'vie_op' are meant to be opaque to the
+ * consumers of instruction decoding. The only reason why their contents
+ * need to be exposed is because they are part of the 'vm_exit' structure.
+ */
+struct vie_op {
+ uint8_t op_byte; /* actual opcode byte */
+ uint8_t op_type; /* type of operation (e.g. MOV) */
+ uint16_t op_flags;
+};
+
+#define VIE_INST_SIZE 15
+struct vie {
+ uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */
+ uint8_t num_valid; /* size of the instruction */
+ uint8_t num_processed;
+
+ uint8_t rex_w:1, /* REX prefix */
+ rex_r:1,
+ rex_x:1,
+ rex_b:1,
+ rex_present:1;
+
+ uint8_t mod:2, /* ModRM byte */
+ reg:4,
+ rm:4;
+
+ uint8_t ss:2, /* SIB byte */
+ index:4,
+ base:4;
+
+ uint8_t disp_bytes;
+ uint8_t imm_bytes;
+
+ uint8_t scale;
+ int base_register; /* VM_REG_GUEST_xyz */
+ int index_register; /* VM_REG_GUEST_xyz */
+
+ int64_t displacement; /* optional addr displacement */
+ int64_t immediate; /* optional immediate operand */
+
+ uint8_t decoded; /* set to 1 if successfully decoded */
+
+ struct vie_op op; /* opcode description */
+};
enum vm_exitcode {
VM_EXITCODE_INOUT,
@@ -335,22 +408,38 @@ enum vm_exitcode {
VM_EXITCODE_RENDEZVOUS,
VM_EXITCODE_IOAPIC_EOI,
VM_EXITCODE_SUSPENDED,
+ VM_EXITCODE_INOUT_STR,
VM_EXITCODE_MAX
};
+struct vm_inout {
+ uint16_t bytes:3; /* 1 or 2 or 4 */
+ uint16_t in:1;
+ uint16_t string:1;
+ uint16_t rep:1;
+ uint16_t port;
+ uint32_t eax; /* valid for out */
+};
+
+struct vm_inout_str {
+ struct vm_inout inout; /* must be the first element */
+ struct vm_guest_paging paging;
+ uint64_t rflags;
+ uint64_t cr0;
+ uint64_t index;
+ uint64_t count; /* rep=1 (%rcx), rep=0 (1) */
+ int addrsize;
+ enum vm_reg_name seg_name;
+ struct seg_desc seg_desc;
+};
+
struct vm_exit {
enum vm_exitcode exitcode;
int inst_length; /* 0 means unknown */
uint64_t rip;
union {
- struct {
- uint16_t bytes:3; /* 1 or 2 or 4 */
- uint16_t in:1; /* out is 0, in is 1 */
- uint16_t string:1;
- uint16_t rep:1;
- uint16_t port;
- uint32_t eax; /* valid for out */
- } inout;
+ struct vm_inout inout;
+ struct vm_inout_str inout_str;
struct {
uint64_t gpa;
int fault_type;
@@ -358,9 +447,7 @@ struct vm_exit {
struct {
uint64_t gpa;
uint64_t gla;
- uint64_t cr3;
- enum vie_cpu_mode cpu_mode;
- enum vie_paging_mode paging_mode;
+ struct vm_guest_paging paging;
struct vie vie;
} inst_emul;
/*
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h
index ecafa9c..f094d51 100644
--- a/sys/amd64/include/vmm_dev.h
+++ b/sys/amd64/include/vmm_dev.h
@@ -168,6 +168,15 @@ struct vm_suspend {
enum vm_suspend_how how;
};
+struct vm_gla2gpa {
+ int vcpuid; /* inputs */
+ int prot; /* PROT_READ or PROT_WRITE */
+ uint64_t gla;
+ struct vm_guest_paging paging;
+ int fault; /* outputs */
+ uint64_t gpa;
+};
+
enum {
/* general routines */
IOCNUM_ABIVERS = 0,
@@ -180,6 +189,7 @@ enum {
IOCNUM_MAP_MEMORY = 10,
IOCNUM_GET_MEMORY_SEG = 11,
IOCNUM_GET_GPA_PMAP = 12,
+ IOCNUM_GLA2GPA = 13,
/* register/state accessors */
IOCNUM_SET_REGISTER = 20,
@@ -289,4 +299,6 @@ enum {
_IOR('v', IOCNUM_GET_HPET_CAPABILITIES, struct vm_hpet_cap)
#define VM_GET_GPA_PMAP \
_IOWR('v', IOCNUM_GET_GPA_PMAP, struct vm_gpa_pte)
+#define VM_GLA2GPA \
+ _IOWR('v', IOCNUM_GLA2GPA, struct vm_gla2gpa)
#endif
diff --git a/sys/amd64/include/vmm_instruction_emul.h b/sys/amd64/include/vmm_instruction_emul.h
index 0901aa2..e4c408b 100644
--- a/sys/amd64/include/vmm_instruction_emul.h
+++ b/sys/amd64/include/vmm_instruction_emul.h
@@ -29,63 +29,7 @@
#ifndef _VMM_INSTRUCTION_EMUL_H_
#define _VMM_INSTRUCTION_EMUL_H_
-enum vie_cpu_mode {
- CPU_MODE_COMPATIBILITY, /* IA-32E mode (CS.L = 0) */
- CPU_MODE_64BIT, /* IA-32E mode (CS.L = 1) */
-};
-
-enum vie_paging_mode {
- PAGING_MODE_FLAT,
- PAGING_MODE_32,
- PAGING_MODE_PAE,
- PAGING_MODE_64,
-};
-
-/*
- * The data structures 'vie' and 'vie_op' are meant to be opaque to the
- * consumers of instruction decoding. The only reason why their contents
- * need to be exposed is because they are part of the 'vm_exit' structure.
- */
-struct vie_op {
- uint8_t op_byte; /* actual opcode byte */
- uint8_t op_type; /* type of operation (e.g. MOV) */
- uint16_t op_flags;
-};
-
-#define VIE_INST_SIZE 15
-struct vie {
- uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */
- uint8_t num_valid; /* size of the instruction */
- uint8_t num_processed;
-
- uint8_t rex_w:1, /* REX prefix */
- rex_r:1,
- rex_x:1,
- rex_b:1,
- rex_present:1;
-
- uint8_t mod:2, /* ModRM byte */
- reg:4,
- rm:4;
-
- uint8_t ss:2, /* SIB byte */
- index:4,
- base:4;
-
- uint8_t disp_bytes;
- uint8_t imm_bytes;
-
- uint8_t scale;
- int base_register; /* VM_REG_GUEST_xyz */
- int index_register; /* VM_REG_GUEST_xyz */
-
- int64_t displacement; /* optional addr displacement */
- int64_t immediate; /* optional immediate operand */
-
- uint8_t decoded; /* set to 1 if successfully decoded */
-
- struct vie_op op; /* opcode description */
-};
+#include <sys/mman.h>
/*
* Callback functions to read and write memory regions.
@@ -111,6 +55,24 @@ int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie,
mem_region_read_t mrr, mem_region_write_t mrw,
void *mrarg);
+int vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
+ uint64_t val, int size);
+
+/*
+ * Returns 1 if an alignment check exception should be injected and 0 otherwise.
+ */
+int vie_alignment_check(int cpl, int operand_size, uint64_t cr0,
+ uint64_t rflags, uint64_t gla);
+
+/* Returns 1 if the 'gla' is not canonical and 0 otherwise. */
+int vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla);
+
+uint64_t vie_size2mask(int size);
+
+int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
+ struct seg_desc *desc, uint64_t off, int length, int addrsize, int prot,
+ uint64_t *gla);
+
#ifdef _KERNEL
/*
* APIs to fetch and decode the instruction from nested page fault handler.
@@ -118,8 +80,18 @@ int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie,
* 'vie' must be initialized before calling 'vmm_fetch_instruction()'
*/
int vmm_fetch_instruction(struct vm *vm, int cpuid,
- uint64_t rip, int inst_length, uint64_t cr3,
- enum vie_paging_mode paging_mode, struct vie *vie);
+ struct vm_guest_paging *guest_paging,
+ uint64_t rip, int inst_length, struct vie *vie);
+
+/*
+ * Translate the guest linear address 'gla' to a guest physical address.
+ *
+ * Returns 0 on success and '*gpa' contains the result of the translation.
+ * Returns 1 if an exception was injected into the guest.
+ * Returns -1 otherwise.
+ */
+int vmm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
+ uint64_t gla, int prot, uint64_t *gpa);
void vie_init(struct vie *vie);
@@ -136,7 +108,7 @@ void vie_init(struct vie *vie);
*/
#define VIE_INVALID_GLA (1UL << 63) /* a non-canonical address */
int vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
- enum vie_cpu_mode cpu_mode, struct vie *vie);
+ enum vm_cpu_mode cpu_mode, struct vie *vie);
#endif /* _KERNEL */
#endif /* _VMM_INSTRUCTION_EMUL_H_ */
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
index e85e5e4..94b7417 100644
--- a/sys/amd64/vmm/intel/vmx.c
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$");
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
+#include <machine/vmm_instruction_emul.h>
#include "vmm_host.h"
#include "vmm_ioport.h"
#include "vmm_ipi.h"
@@ -185,6 +186,8 @@ SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
*/
#define APIC_ACCESS_ADDRESS 0xFFFFF000
+static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc);
+static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval);
static void vmx_inject_pir(struct vlapic *vlapic);
#ifdef KTR
@@ -539,7 +542,7 @@ static int
vmx_init(int ipinum)
{
int error, use_tpr_shadow;
- uint64_t fixed0, fixed1, feature_control;
+ uint64_t basic, fixed0, fixed1, feature_control;
uint32_t tmp, procbased2_vid_bits;
/* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
@@ -559,6 +562,17 @@ vmx_init(int ipinum)
return (ENXIO);
}
+ /*
+ * Verify capabilities MSR_VMX_BASIC:
+ * - bit 54 indicates support for INS/OUTS decoding
+ */
+ basic = rdmsr(MSR_VMX_BASIC);
+ if ((basic & (1UL << 54)) == 0) {
+ printf("vmx_init: processor does not support desired basic "
+ "capabilities\n");
+ return (EINVAL);
+ }
+
/* Check support for primary processor-based VM-execution controls */
error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
MSR_VMX_TRUE_PROCBASED_CTLS,
@@ -1539,7 +1553,19 @@ vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
return (HANDLED);
}
-static enum vie_cpu_mode
+/*
+ * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL
+ */
+static int
+vmx_cpl(void)
+{
+ uint32_t ssar;
+
+ ssar = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS);
+ return ((ssar >> 5) & 0x3);
+}
+
+static enum vm_cpu_mode
vmx_cpu_mode(void)
{
@@ -1549,7 +1575,7 @@ vmx_cpu_mode(void)
return (CPU_MODE_COMPATIBILITY);
}
-static enum vie_paging_mode
+static enum vm_paging_mode
vmx_paging_mode(void)
{
@@ -1563,6 +1589,89 @@ vmx_paging_mode(void)
return (PAGING_MODE_PAE);
}
+static uint64_t
+inout_str_index(struct vmx *vmx, int vcpuid, int in)
+{
+ uint64_t val;
+ int error;
+ enum vm_reg_name reg;
+
+ reg = in ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI;
+ error = vmx_getreg(vmx, vcpuid, reg, &val);
+ KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error));
+ return (val);
+}
+
+static uint64_t
+inout_str_count(struct vmx *vmx, int vcpuid, int rep)
+{
+ uint64_t val;
+ int error;
+
+ if (rep) {
+ error = vmx_getreg(vmx, vcpuid, VM_REG_GUEST_RCX, &val);
+ KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error));
+ } else {
+ val = 1;
+ }
+ return (val);
+}
+
+static int
+inout_str_addrsize(uint32_t inst_info)
+{
+ uint32_t size;
+
+ size = (inst_info >> 7) & 0x7;
+ switch (size) {
+ case 0:
+ return (2); /* 16 bit */
+ case 1:
+ return (4); /* 32 bit */
+ case 2:
+ return (8); /* 64 bit */
+ default:
+ panic("%s: invalid size encoding %d", __func__, size);
+ }
+}
+
+static void
+inout_str_seginfo(struct vmx *vmx, int vcpuid, uint32_t inst_info, int in,
+ struct vm_inout_str *vis)
+{
+ int error, s;
+
+ if (in) {
+ vis->seg_name = VM_REG_GUEST_ES;
+ } else {
+ s = (inst_info >> 15) & 0x7;
+ vis->seg_name = vm_segment_name(s);
+ }
+
+ error = vmx_getdesc(vmx, vcpuid, vis->seg_name, &vis->seg_desc);
+ KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error));
+
+ /* XXX modify svm.c to update bit 16 of seg_desc.access (unusable) */
+}
+
+static void
+vmx_paging_info(struct vm_guest_paging *paging)
+{
+ paging->cr3 = vmcs_guest_cr3();
+ paging->cpl = vmx_cpl();
+ paging->cpu_mode = vmx_cpu_mode();
+ paging->paging_mode = vmx_paging_mode();
+}
+
+static void
+vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla)
+{
+ vmexit->exitcode = VM_EXITCODE_INST_EMUL;
+ vmexit->u.inst_emul.gpa = gpa;
+ vmexit->u.inst_emul.gla = gla;
+ vmx_paging_info(&vmexit->u.inst_emul.paging);
+}
+
static int
ept_fault_type(uint64_t ept_qual)
{
@@ -1754,12 +1863,8 @@ vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
}
if (allowed) {
- vmexit->exitcode = VM_EXITCODE_INST_EMUL;
- vmexit->u.inst_emul.gpa = DEFAULT_APIC_BASE + offset;
- vmexit->u.inst_emul.gla = VIE_INVALID_GLA;
- vmexit->u.inst_emul.cr3 = vmcs_guest_cr3();
- vmexit->u.inst_emul.cpu_mode = vmx_cpu_mode();
- vmexit->u.inst_emul.paging_mode = vmx_paging_mode();
+ vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset,
+ VIE_INVALID_GLA);
}
/*
@@ -1776,10 +1881,12 @@ vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
static int
vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
{
- int error, handled;
+ int error, handled, in;
struct vmxctx *vmxctx;
struct vlapic *vlapic;
- uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, reason;
+ struct vm_inout_str *vis;
+ uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info;
+ uint32_t reason;
uint64_t qual, gpa;
bool retu;
@@ -1936,15 +2043,22 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1);
vmexit->exitcode = VM_EXITCODE_INOUT;
vmexit->u.inout.bytes = (qual & 0x7) + 1;
- vmexit->u.inout.in = (qual & 0x8) ? 1 : 0;
+ vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0;
vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
vmexit->u.inout.port = (uint16_t)(qual >> 16);
vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
- error = emulate_ioport(vmx->vm, vcpu, vmexit);
- if (error == 0) {
- handled = 1;
- vmxctx->guest_rax = vmexit->u.inout.eax;
+ if (vmexit->u.inout.string) {
+ inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO);
+ vmexit->exitcode = VM_EXITCODE_INOUT_STR;
+ vis = &vmexit->u.inout_str;
+ vmx_paging_info(&vis->paging);
+ vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS);
+ vis->cr0 = vmcs_read(VMCS_GUEST_CR0);
+ vis->index = inout_str_index(vmx, vcpu, in);
+ vis->count = inout_str_count(vmx, vcpu, vis->inout.rep);
+ vis->addrsize = inout_str_addrsize(inst_info);
+ inout_str_seginfo(vmx, vcpu, inst_info, in, vis);
}
break;
case EXIT_REASON_CPUID:
@@ -1990,12 +2104,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
vmexit->u.paging.fault_type = ept_fault_type(qual);
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
} else if (ept_emulation_fault(qual)) {
- vmexit->exitcode = VM_EXITCODE_INST_EMUL;
- vmexit->u.inst_emul.gpa = gpa;
- vmexit->u.inst_emul.gla = vmcs_gla();
- vmexit->u.inst_emul.cr3 = vmcs_guest_cr3();
- vmexit->u.inst_emul.cpu_mode = vmx_cpu_mode();
- vmexit->u.inst_emul.paging_mode = vmx_paging_mode();
+ vmexit_inst_emul(vmexit, gpa, vmcs_gla());
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1);
}
/*
@@ -2324,6 +2433,8 @@ vmxctx_regptr(struct vmxctx *vmxctx, int reg)
return (&vmxctx->guest_r14);
case VM_REG_GUEST_R15:
return (&vmxctx->guest_r15);
+ case VM_REG_GUEST_CR2:
+ return (&vmxctx->guest_cr2);
default:
break;
}
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index 9aed5f5..8ebdfd7 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -62,7 +62,9 @@ __FBSDID("$FreeBSD$");
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
+#include <machine/vmm_instruction_emul.h>
+#include "vmm_ioport.h"
#include "vmm_ktr.h"
#include "vmm_host.h"
#include "vmm_mem.h"
@@ -1131,34 +1133,33 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
struct vie *vie;
struct vcpu *vcpu;
struct vm_exit *vme;
- int error, inst_length;
- uint64_t rip, gla, gpa, cr3;
- enum vie_cpu_mode cpu_mode;
- enum vie_paging_mode paging_mode;
+ uint64_t gla, gpa;
+ struct vm_guest_paging *paging;
mem_region_read_t mread;
mem_region_write_t mwrite;
+ int error;
vcpu = &vm->vcpu[vcpuid];
vme = &vcpu->exitinfo;
- rip = vme->rip;
- inst_length = vme->inst_length;
-
gla = vme->u.inst_emul.gla;
gpa = vme->u.inst_emul.gpa;
- cr3 = vme->u.inst_emul.cr3;
- cpu_mode = vme->u.inst_emul.cpu_mode;
- paging_mode = vme->u.inst_emul.paging_mode;
vie = &vme->u.inst_emul.vie;
+ paging = &vme->u.inst_emul.paging;
vie_init(vie);
/* Fetch, decode and emulate the faulting instruction */
- if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3,
- paging_mode, vie) != 0)
+ error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip,
+ vme->inst_length, vie);
+ if (error == 1)
+ return (0); /* Resume guest to handle page fault */
+ else if (error == -1)
return (EFAULT);
+ else if (error != 0)
+ panic("%s: vmm_fetch_instruction error %d", __func__, error);
- if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, vie) != 0)
+ if (vmm_decode_instruction(vm, vcpuid, gla, paging->cpu_mode, vie) != 0)
return (EFAULT);
/* return to userland unless this is an in-kernel emulated device */
@@ -1348,6 +1349,10 @@ restart:
case VM_EXITCODE_INST_EMUL:
error = vm_handle_inst_emul(vm, vcpuid, &retu);
break;
+ case VM_EXITCODE_INOUT:
+ case VM_EXITCODE_INOUT_STR:
+ error = vm_handle_inout(vm, vcpuid, vme, &retu);
+ break;
default:
retu = true; /* handled in userland */
break;
@@ -1430,6 +1435,25 @@ vm_inject_fault(struct vm *vm, int vcpuid, struct vm_exception *exception)
}
void
+vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2)
+{
+ struct vm_exception pf = {
+ .vector = IDT_PF,
+ .error_code_valid = 1,
+ .error_code = error_code
+ };
+ int error;
+
+ VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx",
+ error_code, cr2);
+
+ error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
+ KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
+
+ vm_inject_fault(vm, vcpuid, &pf);
+}
+
+void
vm_inject_gp(struct vm *vm, int vcpuid)
{
struct vm_exception gpf = {
@@ -1856,3 +1880,20 @@ vm_atpit(struct vm *vm)
{
return (vm->vatpit);
}
+
+enum vm_reg_name
+vm_segment_name(int seg)
+{
+ static enum vm_reg_name seg_names[] = {
+ VM_REG_GUEST_ES,
+ VM_REG_GUEST_CS,
+ VM_REG_GUEST_SS,
+ VM_REG_GUEST_DS,
+ VM_REG_GUEST_FS,
+ VM_REG_GUEST_GS
+ };
+
+ KASSERT(seg >= 0 && seg < nitems(seg_names),
+ ("%s: invalid segment encoding %d", __func__, seg));
+ return (seg_names[seg]);
+}
diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c
index f1d5795..0561785 100644
--- a/sys/amd64/vmm/vmm_dev.c
+++ b/sys/amd64/vmm/vmm_dev.c
@@ -48,6 +48,7 @@ __FBSDID("$FreeBSD$");
#include <machine/vmparam.h>
#include <machine/vmm.h>
+#include <machine/vmm_instruction_emul.h>
#include <machine/vmm_dev.h>
#include "vmm_lapic.h"
@@ -168,6 +169,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
struct vm_x2apic *x2apic;
struct vm_gpa_pte *gpapte;
struct vm_suspend *vmsuspend;
+ struct vm_gla2gpa *gg;
sc = vmmdev_lookup2(cdev);
if (sc == NULL)
@@ -192,6 +194,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
case VM_PPTDEV_MSI:
case VM_PPTDEV_MSIX:
case VM_SET_X2APIC_STATE:
+ case VM_GLA2GPA:
/*
* XXX fragile, handle with care
* Assumes that the first field of the ioctl data is the vcpu.
@@ -415,6 +418,27 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
case VM_GET_HPET_CAPABILITIES:
error = vhpet_getcap((struct vm_hpet_cap *)data);
break;
+ case VM_GLA2GPA: {
+ CTASSERT(PROT_READ == VM_PROT_READ);
+ CTASSERT(PROT_WRITE == VM_PROT_WRITE);
+ CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
+ gg = (struct vm_gla2gpa *)data;
+ error = vmm_gla2gpa(sc->vm, gg->vcpuid, &gg->paging, gg->gla,
+ gg->prot, &gg->gpa);
+ KASSERT(error == 0 || error == 1 || error == -1,
+ ("%s: vmm_gla2gpa unknown error %d", __func__, error));
+ if (error >= 0) {
+ /*
+ * error = 0: the translation was successful
+ * error = 1: a fault was injected into the guest
+ */
+ gg->fault = error;
+ error = 0;
+ } else {
+ error = EFAULT;
+ }
+ break;
+ }
default:
error = ENOTTY;
break;
diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c
index f7d109c..921deb5 100644
--- a/sys/amd64/vmm/vmm_instruction_emul.c
+++ b/sys/amd64/vmm/vmm_instruction_emul.c
@@ -34,6 +34,7 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/pcpu.h>
#include <sys/systm.h>
+#include <sys/proc.h>
#include <vm/vm.h>
#include <vm/pmap.h>
@@ -46,9 +47,15 @@ __FBSDID("$FreeBSD$");
#include <machine/vmm.h>
+#include <assert.h>
#include <vmmapi.h>
+#define KASSERT(exp,msg) assert((exp))
#endif /* _KERNEL */
+#include <machine/vmm_instruction_emul.h>
+#include <x86/psl.h>
+#include <x86/specialreg.h>
+
/* struct vie_op.op_type */
enum {
VIE_OP_TYPE_NONE = 0,
@@ -205,7 +212,7 @@ vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
return (error);
}
-static int
+int
vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
uint64_t val, int size)
{
@@ -560,6 +567,155 @@ vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
return (error);
}
+int
+vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
+{
+ KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
+ ("%s: invalid size %d", __func__, size));
+ KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));
+
+ if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
+ return (0);
+
+ return ((gla & (size - 1)) ? 1 : 0);
+}
+
+int
+vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
+{
+ uint64_t mask;
+
+ if (cpu_mode != CPU_MODE_64BIT)
+ return (0);
+
+ /*
+ * The value of the bit 47 in the 'gla' should be replicated in the
+ * most significant 16 bits.
+ */
+ mask = ~((1UL << 48) - 1);
+ if (gla & (1UL << 47))
+ return ((gla & mask) != mask);
+ else
+ return ((gla & mask) != 0);
+}
+
+uint64_t
+vie_size2mask(int size)
+{
+ KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
+ ("vie_size2mask: invalid size %d", size));
+ return (size2mask[size]);
+}
+
+int
+vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
+ struct seg_desc *desc, uint64_t offset, int length, int addrsize,
+ int prot, uint64_t *gla)
+{
+ uint64_t firstoff, low_limit, high_limit, segbase;
+ int glasize, type;
+
+ KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
+ ("%s: invalid segment %d", __func__, seg));
+ KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
+ ("%s: invalid operand size %d", __func__, length));
+ KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
+ ("%s: invalid prot %#x", __func__, prot));
+
+ firstoff = offset;
+ if (cpu_mode == CPU_MODE_64BIT) {
+ KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
+ "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
+ glasize = 8;
+ } else {
+ KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
+ "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
+ glasize = 4;
+ /*
+ * If the segment selector is loaded with a NULL selector
+ * then the descriptor is unusable and attempting to use
+ * it results in a #GP(0).
+ */
+ if (SEG_DESC_UNUSABLE(desc))
+ return (-1);
+
+ /*
+ * The processor generates a #NP exception when a segment
+ * register is loaded with a selector that points to a
+ * descriptor that is not present. If this was the case then
+ * it would have been checked before the VM-exit.
+ */
+ KASSERT(SEG_DESC_PRESENT(desc), ("segment %d not present: %#x",
+ seg, desc->access));
+
+ /*
+ * The descriptor type must indicate a code/data segment.
+ */
+ type = SEG_DESC_TYPE(desc);
+ KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
+ "descriptor type %#x", seg, type));
+
+ if (prot & PROT_READ) {
+ /* #GP on a read access to a exec-only code segment */
+ if ((type & 0xA) == 0x8)
+ return (-1);
+ }
+
+ if (prot & PROT_WRITE) {
+ /*
+ * #GP on a write access to a code segment or a
+ * read-only data segment.
+ */
+ if (type & 0x8) /* code segment */
+ return (-1);
+
+ if ((type & 0xA) == 0) /* read-only data seg */
+ return (-1);
+ }
+
+ /*
+ * 'desc->limit' is fully expanded taking granularity into
+ * account.
+ */
+ if ((type & 0xC) == 0x4) {
+ /* expand-down data segment */
+ low_limit = desc->limit + 1;
+ high_limit = SEG_DESC_DEF32(desc) ? 0xffffffff : 0xffff;
+ } else {
+ /* code segment or expand-up data segment */
+ low_limit = 0;
+ high_limit = desc->limit;
+ }
+
+ while (length > 0) {
+ offset &= vie_size2mask(addrsize);
+ if (offset < low_limit || offset > high_limit)
+ return (-1);
+ offset++;
+ length--;
+ }
+ }
+
+ /*
+ * In 64-bit mode all segments except %fs and %gs have a segment
+ * base address of 0.
+ */
+ if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
+ seg != VM_REG_GUEST_GS) {
+ segbase = 0;
+ } else {
+ segbase = desc->base;
+ }
+
+ /*
+ * Truncate 'firstoff' to the effective address size before adding
+ * it to the segment base.
+ */
+ firstoff &= vie_size2mask(addrsize);
+ *gla = (segbase + firstoff) & vie_size2mask(glasize);
+ return (0);
+}
+
#ifdef _KERNEL
void
vie_init(struct vie *vie)
@@ -572,28 +728,86 @@ vie_init(struct vie *vie)
}
static int
-gla2gpa(struct vm *vm, uint64_t gla, uint64_t ptpphys,
- uint64_t *gpa, enum vie_paging_mode paging_mode)
+pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
+{
+ int error_code = 0;
+
+ if (pte & PG_V)
+ error_code |= PGEX_P;
+ if (prot & VM_PROT_WRITE)
+ error_code |= PGEX_W;
+ if (usermode)
+ error_code |= PGEX_U;
+ if (rsvd)
+ error_code |= PGEX_RSV;
+ if (prot & VM_PROT_EXECUTE)
+ error_code |= PGEX_I;
+
+ return (error_code);
+}
+
+static void
+ptp_release(void **cookie)
+{
+ if (*cookie != NULL) {
+ vm_gpa_release(*cookie);
+ *cookie = NULL;
+ }
+}
+
+static void *
+ptp_hold(struct vm *vm, vm_paddr_t ptpphys, size_t len, void **cookie)
{
- int nlevels, ptpshift, ptpindex;
- uint64_t *ptpbase, pte, pgsize;
+ void *ptr;
+
+ ptp_release(cookie);
+ ptr = vm_gpa_hold(vm, ptpphys, len, VM_PROT_RW, cookie);
+ return (ptr);
+}
+
+int
+vmm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
+ uint64_t gla, int prot, uint64_t *gpa)
+{
+ int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
+ u_int retries;
+ uint64_t *ptpbase, ptpphys, pte, pgsize;
uint32_t *ptpbase32, pte32;
void *cookie;
- if (paging_mode == PAGING_MODE_FLAT) {
+ usermode = (paging->cpl == 3 ? 1 : 0);
+ writable = prot & VM_PROT_WRITE;
+ cookie = NULL;
+ retval = 0;
+ retries = 0;
+restart:
+ ptpphys = paging->cr3; /* root of the page tables */
+ ptp_release(&cookie);
+ if (retries++ > 0)
+ maybe_yield();
+
+ if (vie_canonical_check(paging->cpu_mode, gla)) {
+ /*
+ * XXX assuming a non-stack reference otherwise a stack fault
+ * should be generated.
+ */
+ vm_inject_gp(vm, vcpuid);
+ goto fault;
+ }
+
+ if (paging->paging_mode == PAGING_MODE_FLAT) {
*gpa = gla;
- return (0);
+ goto done;
}
- if (paging_mode == PAGING_MODE_32) {
+ if (paging->paging_mode == PAGING_MODE_32) {
nlevels = 2;
while (--nlevels >= 0) {
/* Zero out the lower 12 bits. */
ptpphys &= ~0xfff;
- ptpbase32 = vm_gpa_hold(vm, ptpphys, PAGE_SIZE,
- VM_PROT_READ, &cookie);
-
+ ptpbase32 = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie);
+
if (ptpbase32 == NULL)
goto error;
@@ -603,29 +817,55 @@ gla2gpa(struct vm *vm, uint64_t gla, uint64_t ptpphys,
pte32 = ptpbase32[ptpindex];
- vm_gpa_release(cookie);
-
- if ((pte32 & PG_V) == 0)
- goto error;
+ if ((pte32 & PG_V) == 0 ||
+ (usermode && (pte32 & PG_U) == 0) ||
+ (writable && (pte32 & PG_RW) == 0)) {
+ pfcode = pf_error_code(usermode, prot, 0,
+ pte32);
+ vm_inject_pf(vm, vcpuid, pfcode, gla);
+ goto fault;
+ }
- if (pte32 & PG_PS)
+ /*
+ * Emulate the x86 MMU's management of the accessed
+ * and dirty flags. While the accessed flag is set
+ * at every level of the page table, the dirty flag
+ * is only set at the last level providing the guest
+ * physical address.
+ */
+ if ((pte32 & PG_A) == 0) {
+ if (atomic_cmpset_32(&ptpbase32[ptpindex],
+ pte32, pte32 | PG_A) == 0) {
+ goto restart;
+ }
+ }
+
+ /* XXX must be ignored if CR4.PSE=0 */
+ if (nlevels > 0 && (pte32 & PG_PS) != 0)
break;
ptpphys = pte32;
}
+ /* Set the dirty bit in the page table entry if necessary */
+ if (writable && (pte32 & PG_M) == 0) {
+ if (atomic_cmpset_32(&ptpbase32[ptpindex],
+ pte32, pte32 | PG_M) == 0) {
+ goto restart;
+ }
+ }
+
/* Zero out the lower 'ptpshift' bits */
pte32 >>= ptpshift; pte32 <<= ptpshift;
*gpa = pte32 | (gla & (pgsize - 1));
- return (0);
+ goto done;
}
- if (paging_mode == PAGING_MODE_PAE) {
- /* Zero out the lower 5 bits and the upper 12 bits */
- ptpphys >>= 5; ptpphys <<= 17; ptpphys >>= 12;
+ if (paging->paging_mode == PAGING_MODE_PAE) {
+ /* Zero out the lower 5 bits and the upper 32 bits */
+ ptpphys &= 0xffffffe0UL;
- ptpbase = vm_gpa_hold(vm, ptpphys, sizeof(*ptpbase) * 4,
- VM_PROT_READ, &cookie);
+ ptpbase = ptp_hold(vm, ptpphys, sizeof(*ptpbase) * 4, &cookie);
if (ptpbase == NULL)
goto error;
@@ -633,10 +873,11 @@ gla2gpa(struct vm *vm, uint64_t gla, uint64_t ptpphys,
pte = ptpbase[ptpindex];
- vm_gpa_release(cookie);
-
- if ((pte & PG_V) == 0)
- goto error;
+ if ((pte & PG_V) == 0) {
+ pfcode = pf_error_code(usermode, prot, 0, pte);
+ vm_inject_pf(vm, vcpuid, pfcode, gla);
+ goto fault;
+ }
ptpphys = pte;
@@ -647,8 +888,7 @@ gla2gpa(struct vm *vm, uint64_t gla, uint64_t ptpphys,
/* Zero out the lower 12 bits and the upper 12 bits */
ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
- ptpbase = vm_gpa_hold(vm, ptpphys, PAGE_SIZE, VM_PROT_READ,
- &cookie);
+ ptpbase = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie);
if (ptpbase == NULL)
goto error;
@@ -658,36 +898,59 @@ gla2gpa(struct vm *vm, uint64_t gla, uint64_t ptpphys,
pte = ptpbase[ptpindex];
- vm_gpa_release(cookie);
+ if ((pte & PG_V) == 0 ||
+ (usermode && (pte & PG_U) == 0) ||
+ (writable && (pte & PG_RW) == 0)) {
+ pfcode = pf_error_code(usermode, prot, 0, pte);
+ vm_inject_pf(vm, vcpuid, pfcode, gla);
+ goto fault;
+ }
- if ((pte & PG_V) == 0)
- goto error;
+ /* Set the accessed bit in the page table entry */
+ if ((pte & PG_A) == 0) {
+ if (atomic_cmpset_64(&ptpbase[ptpindex],
+ pte, pte | PG_A) == 0) {
+ goto restart;
+ }
+ }
- if (pte & PG_PS) {
- if (pgsize > 1 * GB)
- goto error;
- else
- break;
+ if (nlevels > 0 && (pte & PG_PS) != 0) {
+ if (pgsize > 1 * GB) {
+ pfcode = pf_error_code(usermode, prot, 1, pte);
+ vm_inject_pf(vm, vcpuid, pfcode, gla);
+ goto fault;
+ }
+ break;
}
ptpphys = pte;
}
+ /* Set the dirty bit in the page table entry if necessary */
+ if (writable && (pte & PG_M) == 0) {
+ if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
+ goto restart;
+ }
+
/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
*gpa = pte | (gla & (pgsize - 1));
- return (0);
-
+done:
+ ptp_release(&cookie);
+ return (retval);
error:
- return (-1);
+ retval = -1;
+ goto done;
+fault:
+ retval = 1;
+ goto done;
}
int
-vmm_fetch_instruction(struct vm *vm, int cpuid, uint64_t rip, int inst_length,
- uint64_t cr3, enum vie_paging_mode paging_mode,
- struct vie *vie)
+vmm_fetch_instruction(struct vm *vm, int cpuid, struct vm_guest_paging *paging,
+ uint64_t rip, int inst_length, struct vie *vie)
{
- int n, err, prot;
+ int n, error, prot;
uint64_t gpa, off;
void *hpa, *cookie;
@@ -701,9 +964,9 @@ vmm_fetch_instruction(struct vm *vm, int cpuid, uint64_t rip, int inst_length,
/* Copy the instruction into 'vie' */
while (vie->num_valid < inst_length) {
- err = gla2gpa(vm, rip, cr3, &gpa, paging_mode);
- if (err)
- break;
+ error = vmm_gla2gpa(vm, cpuid, paging, rip, prot, &gpa);
+ if (error)
+ return (error);
off = gpa & PAGE_MASK;
n = min(inst_length - vie->num_valid, PAGE_SIZE - off);
@@ -804,7 +1067,7 @@ decode_opcode(struct vie *vie)
}
static int
-decode_modrm(struct vie *vie, enum vie_cpu_mode cpu_mode)
+decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
{
uint8_t x;
@@ -1084,7 +1347,7 @@ verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
int
vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
- enum vie_cpu_mode cpu_mode, struct vie *vie)
+ enum vm_cpu_mode cpu_mode, struct vie *vie)
{
if (cpu_mode == CPU_MODE_64BIT) {
diff --git a/sys/amd64/vmm/vmm_ioport.c b/sys/amd64/vmm/vmm_ioport.c
index eae45cc..4ce1403 100644
--- a/sys/amd64/vmm/vmm_ioport.c
+++ b/sys/amd64/vmm/vmm_ioport.c
@@ -33,11 +33,16 @@ __FBSDID("$FreeBSD$");
#include <sys/cpuset.h>
#include <sys/systm.h>
+#include <vm/vm.h>
+
#include <machine/vmm.h>
+#include <machine/vmm_instruction_emul.h>
+#include <x86/psl.h>
#include "vatpic.h"
#include "vatpit.h"
#include "vmm_ioport.h"
+#include "vmm_ktr.h"
#define MAX_IOPORTS 1280
@@ -55,32 +60,64 @@ ioport_handler_func_t ioport_handler[MAX_IOPORTS] = {
[IO_ELCR2] = vatpic_elc_handler,
};
-int
-emulate_ioport(struct vm *vm, int vcpuid, struct vm_exit *vmexit)
+#ifdef KTR
+static const char *
+inout_instruction(struct vm_exit *vmexit)
{
- ioport_handler_func_t handler;
- uint32_t mask, val;
- int error;
+ int index;
- if (vmexit->u.inout.port >= MAX_IOPORTS)
- return (-1);
-
- handler = ioport_handler[vmexit->u.inout.port];
- if (handler == NULL)
- return (-1);
+ static const char *iodesc[] = {
+ "outb", "outw", "outl",
+ "inb", "inw", "inl",
+ "outsb", "outsw", "outsd"
+ "insb", "insw", "insd",
+ };
switch (vmexit->u.inout.bytes) {
case 1:
- mask = 0xff;
+ index = 0;
break;
case 2:
- mask = 0xffff;
+ index = 1;
break;
default:
- mask = 0xffffffff;
+ index = 2;
break;
}
+ if (vmexit->u.inout.in)
+ index += 3;
+
+ if (vmexit->u.inout.string)
+ index += 6;
+
+ KASSERT(index < nitems(iodesc), ("%s: invalid index %d",
+ __func__, index));
+
+ return (iodesc[index]);
+}
+#endif /* KTR */
+
+static int
+emulate_inout_port(struct vm *vm, int vcpuid, struct vm_exit *vmexit,
+ bool *retu)
+{
+ ioport_handler_func_t handler;
+ uint32_t mask, val;
+ int error;
+
+ error = 0;
+ *retu = true;
+
+ if (vmexit->u.inout.port >= MAX_IOPORTS)
+ goto done;
+
+ handler = ioport_handler[vmexit->u.inout.port];
+ if (handler == NULL)
+ goto done;
+
+ mask = vie_size2mask(vmexit->u.inout.bytes);
+
if (!vmexit->u.inout.in) {
val = vmexit->u.inout.eax & mask;
}
@@ -88,10 +125,47 @@ emulate_ioport(struct vm *vm, int vcpuid, struct vm_exit *vmexit)
error = (*handler)(vm, vcpuid, vmexit->u.inout.in,
vmexit->u.inout.port, vmexit->u.inout.bytes, &val);
- if (!error && vmexit->u.inout.in) {
- vmexit->u.inout.eax &= ~mask;
- vmexit->u.inout.eax |= val & mask;
+ if (!error) {
+ *retu = false;
+ if (vmexit->u.inout.in) {
+ vmexit->u.inout.eax &= ~mask;
+ vmexit->u.inout.eax |= val & mask;
+ error = vm_set_register(vm, vcpuid,
+ VM_REG_GUEST_RAX, vmexit->u.inout.eax);
+ KASSERT(error == 0, ("emulate_ioport: error %d "
+ "setting guest rax register", error));
+ }
}
+done:
+ return (error);
+}
+
+static int
+emulate_inout_str(struct vm *vm, int vcpuid, struct vm_exit *vmexit, bool *retu)
+{
+ *retu = true;
+ return (0); /* Return to userspace to finish emulation */
+}
+
+int
+vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vmexit, bool *retu)
+{
+ int bytes, error;
+
+ bytes = vmexit->u.inout.bytes;
+ KASSERT(bytes == 1 || bytes == 2 || bytes == 4,
+ ("vm_handle_inout: invalid operand size %d", bytes));
+
+ if (vmexit->u.inout.string)
+ error = emulate_inout_str(vm, vcpuid, vmexit, retu);
+ else
+ error = emulate_inout_port(vm, vcpuid, vmexit, retu);
+
+ VCPU_CTR4(vm, vcpuid, "%s%s 0x%04x: %s",
+ vmexit->u.inout.rep ? "rep " : "",
+ inout_instruction(vmexit),
+ vmexit->u.inout.port,
+ error ? "error" : (*retu ? "userspace" : "handled"));
return (error);
}
diff --git a/sys/amd64/vmm/vmm_ioport.h b/sys/amd64/vmm/vmm_ioport.h
index 02e543a..84a4cf1 100644
--- a/sys/amd64/vmm/vmm_ioport.h
+++ b/sys/amd64/vmm/vmm_ioport.h
@@ -32,6 +32,6 @@
typedef int (*ioport_handler_func_t)(void *vm, int vcpuid,
bool in, int port, int bytes, uint32_t *val);
-int emulate_ioport(struct vm *vm, int vcpuid, struct vm_exit *vmexit);
+int vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme, bool *retu);
#endif /* _VMM_IOPORT_H_ */
diff --git a/sys/amd64/vmm/vmm_ktr.h b/sys/amd64/vmm/vmm_ktr.h
index 9fb46d8..61ff53f 100644
--- a/sys/amd64/vmm/vmm_ktr.h
+++ b/sys/amd64/vmm/vmm_ktr.h
@@ -48,6 +48,10 @@ CTR4(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), (p1), (p2))
#define VCPU_CTR3(vm, vcpuid, format, p1, p2, p3) \
CTR5(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), (p1), (p2), (p3))
+#define VCPU_CTR4(vm, vcpuid, format, p1, p2, p3, p4) \
+CTR6(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), \
+ (p1), (p2), (p3), (p4))
+
#define VM_CTR0(vm, format) \
CTR1(KTR_VMM, "vm %s: " format, vm_name((vm)))
diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c
index d9b4418..f9a67cb 100644
--- a/usr.sbin/bhyve/bhyverun.c
+++ b/usr.sbin/bhyve/bhyverun.c
@@ -288,33 +288,34 @@ static int
vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
{
int error;
- int bytes, port, in, out;
- uint32_t eax;
+ int bytes, port, in, out, string;
int vcpu;
vcpu = *pvcpu;
port = vme->u.inout.port;
bytes = vme->u.inout.bytes;
- eax = vme->u.inout.eax;
+ string = vme->u.inout.string;
in = vme->u.inout.in;
out = !in;
- /* We don't deal with these */
- if (vme->u.inout.string || vme->u.inout.rep)
- return (VMEXIT_ABORT);
-
/* Extra-special case of host notifications */
- if (out && port == GUEST_NIO_PORT)
- return (vmexit_handle_notify(ctx, vme, pvcpu, eax));
+ if (out && port == GUEST_NIO_PORT) {
+ error = vmexit_handle_notify(ctx, vme, pvcpu, vme->u.inout.eax);
+ return (error);
+ }
- error = emulate_inout(ctx, vcpu, in, port, bytes, &eax, strictio);
- if (error == INOUT_OK && in)
- error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, eax);
+ error = emulate_inout(ctx, vcpu, vme, strictio);
+ if (error == INOUT_OK && in && !string) {
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX,
+ vme->u.inout.eax);
+ }
switch (error) {
case INOUT_OK:
return (VMEXIT_CONTINUE);
+ case INOUT_RESTART:
+ return (VMEXIT_RESTART);
case INOUT_RESET:
stats.io_reset++;
return (VMEXIT_RESET);
@@ -514,6 +515,7 @@ vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
[VM_EXITCODE_INOUT] = vmexit_inout,
+ [VM_EXITCODE_INOUT_STR] = vmexit_inout,
[VM_EXITCODE_VMX] = vmexit_vmx,
[VM_EXITCODE_BOGUS] = vmexit_bogus,
[VM_EXITCODE_RDMSR] = vmexit_rdmsr,
diff --git a/usr.sbin/bhyve/inout.c b/usr.sbin/bhyve/inout.c
index 5fbe99b..fe9e0d8 100644
--- a/usr.sbin/bhyve/inout.c
+++ b/usr.sbin/bhyve/inout.c
@@ -31,11 +31,21 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/linker_set.h>
+#include <sys/_iovec.h>
+#include <sys/mman.h>
+
+#include <x86/psl.h>
+#include <x86/segments.h>
+
+#include <machine/vmm.h>
+#include <machine/vmm_instruction_emul.h>
+#include <vmmapi.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
+#include "bhyverun.h"
#include "inout.h"
SET_DECLARE(inout_port_set, struct inout_port);
@@ -91,52 +101,135 @@ register_default_iohandler(int start, int size)
}
int
-emulate_inout(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
- uint32_t *eax, int strict)
+emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict)
{
- int flags;
- uint32_t mask, val;
+ int addrsize, bytes, flags, in, port, prot, rep;
+ uint32_t val;
inout_func_t handler;
void *arg;
- int error;
+ int error, retval;
+ enum vm_reg_name idxreg;
+ uint64_t gla, index, iterations, count;
+ struct vm_inout_str *vis;
+ struct iovec iov[2];
+
+ bytes = vmexit->u.inout.bytes;
+ in = vmexit->u.inout.in;
+ port = vmexit->u.inout.port;
assert(port < MAX_IOPORTS);
+ assert(bytes == 1 || bytes == 2 || bytes == 4);
handler = inout_handlers[port].handler;
if (strict && handler == default_inout)
return (-1);
- switch (bytes) {
- case 1:
- mask = 0xff;
- break;
- case 2:
- mask = 0xffff;
- break;
- default:
- mask = 0xffffffff;
- break;
- }
+ flags = inout_handlers[port].flags;
+ arg = inout_handlers[port].arg;
- if (!in) {
- val = *eax & mask;
+ if (in) {
+ if (!(flags & IOPORT_F_IN))
+ return (-1);
+ } else {
+ if (!(flags & IOPORT_F_OUT))
+ return (-1);
}
- flags = inout_handlers[port].flags;
- arg = inout_handlers[port].arg;
+ retval = 0;
+ if (vmexit->u.inout.string) {
+ vis = &vmexit->u.inout_str;
+ rep = vis->inout.rep;
+ addrsize = vis->addrsize;
+ prot = in ? PROT_WRITE : PROT_READ;
+ assert(addrsize == 2 || addrsize == 4 || addrsize == 8);
+
+ /* Index register */
+ idxreg = in ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI;
+ index = vis->index & vie_size2mask(addrsize);
+
+ /* Count register */
+ count = vis->count & vie_size2mask(addrsize);
+
+ /* Limit number of back-to-back in/out emulations to 16 */
+ iterations = MIN(count, 16);
+ while (iterations > 0) {
+ if (vie_calculate_gla(vis->paging.cpu_mode,
+ vis->seg_name, &vis->seg_desc, index, bytes,
+ addrsize, prot, &gla)) {
+ error = vm_inject_exception2(ctx, vcpu,
+ IDT_GP, 0);
+ assert(error == 0);
+ retval = INOUT_RESTART;
+ break;
+ }
+
+ error = vm_gla2gpa(ctx, vcpu, &vis->paging, gla, bytes,
+ prot, iov, nitems(iov));
+ assert(error == 0 || error == 1 || error == -1);
+ if (error) {
+ retval = (error == 1) ? INOUT_RESTART :
+ INOUT_ERROR;
+ break;
+ }
+
+ if (vie_alignment_check(vis->paging.cpl, bytes,
+ vis->cr0, vis->rflags, gla)) {
+ error = vm_inject_exception2(ctx, vcpu,
+ IDT_AC, 0);
+ assert(error == 0);
+ return (INOUT_RESTART);
+ }
+
+ val = 0;
+ if (!in)
+ vm_copyin(ctx, vcpu, iov, &val, bytes);
+
+ retval = handler(ctx, vcpu, in, port, bytes, &val, arg);
+ if (retval != 0)
+ break;
+
+ if (in)
+ vm_copyout(ctx, vcpu, &val, iov, bytes);
+
+ /* Update index */
+ if (vis->rflags & PSL_D)
+ index -= bytes;
+ else
+ index += bytes;
+
+ count--;
+ iterations--;
+ }
- if ((in && (flags & IOPORT_F_IN)) || (!in && (flags & IOPORT_F_OUT)))
- error = (*handler)(ctx, vcpu, in, port, bytes, &val, arg);
- else
- error = -1;
+ /* Update index register */
+ error = vie_update_register(ctx, vcpu, idxreg, index, addrsize);
+ assert(error == 0);
+
+ /*
+ * Update count register only if the instruction had a repeat
+ * prefix.
+ */
+ if (rep) {
+ error = vie_update_register(ctx, vcpu, VM_REG_GUEST_RCX,
+ count, addrsize);
+ assert(error == 0);
+ }
- if (!error && in) {
- *eax &= ~mask;
- *eax |= val & mask;
+ /* Restart the instruction if more iterations remain */
+ if (retval == INOUT_OK && count != 0)
+ retval = INOUT_RESTART;
+ } else {
+ if (!in) {
+ val = vmexit->u.inout.eax & vie_size2mask(bytes);
+ }
+ retval = handler(ctx, vcpu, in, port, bytes, &val, arg);
+ if (retval == 0 && in) {
+ vmexit->u.inout.eax &= ~vie_size2mask(bytes);
+ vmexit->u.inout.eax |= val & vie_size2mask(bytes);
+ }
}
-
- return (error);
+ return (retval);
}
void
diff --git a/usr.sbin/bhyve/inout.h b/usr.sbin/bhyve/inout.h
index 5117d79..f15a2c8 100644
--- a/usr.sbin/bhyve/inout.h
+++ b/usr.sbin/bhyve/inout.h
@@ -32,12 +32,14 @@
#include <sys/linker_set.h>
struct vmctx;
+struct vm_exit;
/* Handler return values. */
#define INOUT_ERROR -1
#define INOUT_OK 0
-#define INOUT_RESET 1
-#define INOUT_POWEROFF 2
+#define INOUT_RESTART 1
+#define INOUT_RESET 2
+#define INOUT_POWEROFF 3
typedef int (*inout_func_t)(struct vmctx *ctx, int vcpu, int in, int port,
int bytes, uint32_t *eax, void *arg);
@@ -72,8 +74,8 @@ struct inout_port {
DATA_SET(inout_port_set, __CONCAT(__inout_port, __LINE__))
void init_inout(void);
-int emulate_inout(struct vmctx *, int vcpu, int in, int port, int bytes,
- uint32_t *eax, int strict);
+int emulate_inout(struct vmctx *, int vcpu, struct vm_exit *vmexit,
+ int strict);
int register_inout(struct inout_port *iop);
int unregister_inout(struct inout_port *iop);
void init_bvmcons(void);
diff --git a/usr.sbin/bhyve/mem.c b/usr.sbin/bhyve/mem.c
index d5c7935..7ea630f 100644
--- a/usr.sbin/bhyve/mem.c
+++ b/usr.sbin/bhyve/mem.c
@@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$");
#include <sys/tree.h>
#include <sys/errno.h>
#include <machine/vmm.h>
+#include <machine/vmm_instruction_emul.h>
#include <stdio.h>
#include <stdlib.h>
diff --git a/usr.sbin/bhyve/pci_virtio_block.c b/usr.sbin/bhyve/pci_virtio_block.c
index 3474fd7..cf1c655 100644
--- a/usr.sbin/bhyve/pci_virtio_block.c
+++ b/usr.sbin/bhyve/pci_virtio_block.c
@@ -52,10 +52,6 @@ __FBSDID("$FreeBSD$");
#include "pci_emul.h"
#include "virtio.h"
-#ifndef min
-#define min(a, b) ((a) < (b) ? (a) : (b))
-#endif
-
#define VTBLK_RINGSZ 64
#define VTBLK_MAXSEGS 32
@@ -217,7 +213,7 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq)
case VBH_OP_IDENT:
/* Assume a single buffer */
strlcpy(iov[1].iov_base, sc->vbsc_ident,
- min(iov[1].iov_len, sizeof(sc->vbsc_ident)));
+ MIN(iov[1].iov_len, sizeof(sc->vbsc_ident)));
err = 0;
break;
default:
OpenPOWER on IntegriCloud