-rw-r--r--   lib/libvmmapi/vmmapi.c                    | 106
-rw-r--r--   lib/libvmmapi/vmmapi.h                    |  13
-rw-r--r--   sys/amd64/include/vmm.h                   | 113
-rw-r--r--   sys/amd64/include/vmm_dev.h               |  12
-rw-r--r--   sys/amd64/include/vmm_instruction_emul.h  |  92
-rw-r--r--   sys/amd64/vmm/intel/vmx.c                 | 155
-rw-r--r--   sys/amd64/vmm/vmm.c                       |  67
-rw-r--r--   sys/amd64/vmm/vmm_dev.c                   |  24
-rw-r--r--   sys/amd64/vmm/vmm_instruction_emul.c      | 359
-rw-r--r--   sys/amd64/vmm/vmm_ioport.c                | 108
-rw-r--r--   sys/amd64/vmm/vmm_ioport.h                |   2
-rw-r--r--   sys/amd64/vmm/vmm_ktr.h                   |   4
-rw-r--r--   usr.sbin/bhyve/bhyverun.c                 |  26
-rw-r--r--   usr.sbin/bhyve/inout.c                    | 151
-rw-r--r--   usr.sbin/bhyve/inout.h                    |  10
-rw-r--r--   usr.sbin/bhyve/mem.c                      |   1
-rw-r--r--   usr.sbin/bhyve/pci_virtio_block.c         |   6
17 files changed, 1025 insertions, 224 deletions
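
Editor's note, before the per-file hunks: the new libvmmapi entry points (vm_gla2gpa, vm_copyin, vm_copyout) are meant to compose as translate-then-copy. The sketch below is editorial and not part of the commit; read_guest_linear is a hypothetical helper, and it assumes 'len' crosses at most one page boundary so that two iovec entries suffice.

/*
 * Hypothetical helper: read 'len' bytes from guest linear address 'gla'
 * into a host buffer.  Assumes at most one page-boundary crossing.
 */
#include <sys/param.h>
#include <sys/_iovec.h>
#include <sys/mman.h>

#include <machine/vmm.h>

#include <vmmapi.h>

static int
read_guest_linear(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint64_t gla, void *buf, size_t len)
{
	struct iovec iov[2];
	int error;

	/* Translate [gla, gla+len) into guest-physical segments. */
	error = vm_gla2gpa(ctx, vcpu, paging, gla, len, PROT_READ,
	    iov, nitems(iov));
	if (error)
		return (error);	/* 1: fault injected into guest, -1: error */

	/* Copy the guest bytes through the translated segments. */
	vm_copyin(ctx, vcpu, iov, buf, len);
	return (0);
}

This is the same pattern the string-I/O loop in usr.sbin/bhyve/inout.c uses below, with vm_copyout on the outbound direction.
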
diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c index 5e630f8..ba2904c 100644 --- a/lib/libvmmapi/vmmapi.c +++ b/lib/libvmmapi/vmmapi.c @@ -33,8 +33,10 @@ __FBSDID("$FreeBSD$"); #include <sys/sysctl.h> #include <sys/ioctl.h> #include <sys/mman.h> +#include <sys/_iovec.h> #include <machine/specialreg.h> +#include <machine/param.h> #include <stdio.h> #include <stdlib.h> @@ -937,3 +939,107 @@ vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities) *capabilities = cap.capabilities; return (error); } + +static int +gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint64_t gla, int prot, int *fault, uint64_t *gpa) +{ + struct vm_gla2gpa gg; + int error; + + bzero(&gg, sizeof(struct vm_gla2gpa)); + gg.vcpuid = vcpu; + gg.prot = prot; + gg.gla = gla; + gg.paging = *paging; + + error = ioctl(ctx->fd, VM_GLA2GPA, &gg); + if (error == 0) { + *fault = gg.fault; + *gpa = gg.gpa; + } + return (error); +} + +#ifndef min +#define min(a,b) (((a) < (b)) ? (a) : (b)) +#endif + +int +vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt) +{ + uint64_t gpa; + int error, fault, i, n, off; + + for (i = 0; i < iovcnt; i++) { + iov[i].iov_base = 0; + iov[i].iov_len = 0; + } + + while (len) { + assert(iovcnt > 0); + error = gla2gpa(ctx, vcpu, paging, gla, prot, &fault, &gpa); + if (error) + return (-1); + if (fault) + return (1); + + off = gpa & PAGE_MASK; + n = min(len, PAGE_SIZE - off); + + iov->iov_base = (void *)gpa; + iov->iov_len = n; + iov++; + iovcnt--; + + gla += n; + len -= n; + } + return (0); +} + +void +vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *iov, void *vp, size_t len) +{ + const char *src; + char *dst; + uint64_t gpa; + size_t n; + + dst = vp; + while (len) { + assert(iov->iov_len); + gpa = (uint64_t)iov->iov_base; + n = min(len, iov->iov_len); + src = vm_map_gpa(ctx, gpa, n); + bcopy(src, dst, n); + + iov++; + dst += n; + len -= n; + } +} + +void +vm_copyout(struct vmctx *ctx, int vcpu, const void *vp, struct iovec *iov, + size_t len) +{ + const char *src; + char *dst; + uint64_t gpa; + size_t n; + + src = vp; + while (len) { + assert(iov->iov_len); + gpa = (uint64_t)iov->iov_base; + n = min(len, iov->iov_len); + dst = vm_map_gpa(ctx, gpa, n); + bcopy(src, dst, n); + + iov++; + src += n; + len -= n; + } +} diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h index 88e9947..bab41da 100644 --- a/lib/libvmmapi/vmmapi.h +++ b/lib/libvmmapi/vmmapi.h @@ -29,6 +29,7 @@ #ifndef _VMMAPI_H_ #define _VMMAPI_H_ +struct iovec; struct vmctx; enum x2apic_state; @@ -109,6 +110,18 @@ int vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state s); int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities); +/* + * Translate the GLA range [gla,gla+len) into GPA segments in 'iov'. + * The 'iovcnt' should be big enough to accomodate all GPA segments. + * Returns 0 on success, 1 on a guest fault condition and -1 otherwise. 
+ */ +int vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt); +void vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *guest_iov, + void *host_dst, size_t len); +void vm_copyout(struct vmctx *ctx, int vcpu, const void *host_src, + struct iovec *guest_iov, size_t len); + /* Reset vcpu register state */ int vcpu_reset(struct vmctx *ctx, int vcpu); diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 50d879b..f1902d2 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -54,6 +54,7 @@ struct vmspace; struct vm_object; struct pmap; +enum vm_reg_name; enum x2apic_state; typedef int (*vmm_init_func_t)(int ipinum); @@ -236,10 +237,11 @@ int vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *vme); void vm_inject_gp(struct vm *vm, int vcpuid); /* general protection fault */ void vm_inject_ud(struct vm *vm, int vcpuid); /* undefined instruction fault */ +void vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2); -#endif /* KERNEL */ +enum vm_reg_name vm_segment_name(int seg_encoding); -#include <machine/vmm_instruction_emul.h> +#endif /* KERNEL */ #define VM_MAXCPU 16 /* maximum virtual cpus */ @@ -280,6 +282,7 @@ enum vm_reg_name { VM_REG_GUEST_IDTR, VM_REG_GUEST_GDTR, VM_REG_GUEST_EFER, + VM_REG_GUEST_CR2, VM_REG_LAST }; @@ -318,6 +321,76 @@ struct seg_desc { uint32_t limit; uint32_t access; }; +#define SEG_DESC_TYPE(desc) ((desc)->access & 0x001f) +#define SEG_DESC_PRESENT(desc) ((desc)->access & 0x0080) +#define SEG_DESC_DEF32(desc) ((desc)->access & 0x4000) +#define SEG_DESC_GRANULARITY(desc) ((desc)->access & 0x8000) +#define SEG_DESC_UNUSABLE(desc) ((desc)->access & 0x10000) + +enum vm_cpu_mode { + CPU_MODE_COMPATIBILITY, /* IA-32E mode (CS.L = 0) */ + CPU_MODE_64BIT, /* IA-32E mode (CS.L = 1) */ +}; + +enum vm_paging_mode { + PAGING_MODE_FLAT, + PAGING_MODE_32, + PAGING_MODE_PAE, + PAGING_MODE_64, +}; + +struct vm_guest_paging { + uint64_t cr3; + int cpl; + enum vm_cpu_mode cpu_mode; + enum vm_paging_mode paging_mode; +}; + +/* + * The data structures 'vie' and 'vie_op' are meant to be opaque to the + * consumers of instruction decoding. The only reason why their contents + * need to be exposed is because they are part of the 'vm_exit' structure. + */ +struct vie_op { + uint8_t op_byte; /* actual opcode byte */ + uint8_t op_type; /* type of operation (e.g. 
MOV) */ + uint16_t op_flags; +}; + +#define VIE_INST_SIZE 15 +struct vie { + uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */ + uint8_t num_valid; /* size of the instruction */ + uint8_t num_processed; + + uint8_t rex_w:1, /* REX prefix */ + rex_r:1, + rex_x:1, + rex_b:1, + rex_present:1; + + uint8_t mod:2, /* ModRM byte */ + reg:4, + rm:4; + + uint8_t ss:2, /* SIB byte */ + index:4, + base:4; + + uint8_t disp_bytes; + uint8_t imm_bytes; + + uint8_t scale; + int base_register; /* VM_REG_GUEST_xyz */ + int index_register; /* VM_REG_GUEST_xyz */ + + int64_t displacement; /* optional addr displacement */ + int64_t immediate; /* optional immediate operand */ + + uint8_t decoded; /* set to 1 if successfully decoded */ + + struct vie_op op; /* opcode description */ +}; enum vm_exitcode { VM_EXITCODE_INOUT, @@ -335,22 +408,38 @@ enum vm_exitcode { VM_EXITCODE_RENDEZVOUS, VM_EXITCODE_IOAPIC_EOI, VM_EXITCODE_SUSPENDED, + VM_EXITCODE_INOUT_STR, VM_EXITCODE_MAX }; +struct vm_inout { + uint16_t bytes:3; /* 1 or 2 or 4 */ + uint16_t in:1; + uint16_t string:1; + uint16_t rep:1; + uint16_t port; + uint32_t eax; /* valid for out */ +}; + +struct vm_inout_str { + struct vm_inout inout; /* must be the first element */ + struct vm_guest_paging paging; + uint64_t rflags; + uint64_t cr0; + uint64_t index; + uint64_t count; /* rep=1 (%rcx), rep=0 (1) */ + int addrsize; + enum vm_reg_name seg_name; + struct seg_desc seg_desc; +}; + struct vm_exit { enum vm_exitcode exitcode; int inst_length; /* 0 means unknown */ uint64_t rip; union { - struct { - uint16_t bytes:3; /* 1 or 2 or 4 */ - uint16_t in:1; /* out is 0, in is 1 */ - uint16_t string:1; - uint16_t rep:1; - uint16_t port; - uint32_t eax; /* valid for out */ - } inout; + struct vm_inout inout; + struct vm_inout_str inout_str; struct { uint64_t gpa; int fault_type; @@ -358,9 +447,7 @@ struct vm_exit { struct { uint64_t gpa; uint64_t gla; - uint64_t cr3; - enum vie_cpu_mode cpu_mode; - enum vie_paging_mode paging_mode; + struct vm_guest_paging paging; struct vie vie; } inst_emul; /* diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h index ecafa9c..f094d51 100644 --- a/sys/amd64/include/vmm_dev.h +++ b/sys/amd64/include/vmm_dev.h @@ -168,6 +168,15 @@ struct vm_suspend { enum vm_suspend_how how; }; +struct vm_gla2gpa { + int vcpuid; /* inputs */ + int prot; /* PROT_READ or PROT_WRITE */ + uint64_t gla; + struct vm_guest_paging paging; + int fault; /* outputs */ + uint64_t gpa; +}; + enum { /* general routines */ IOCNUM_ABIVERS = 0, @@ -180,6 +189,7 @@ enum { IOCNUM_MAP_MEMORY = 10, IOCNUM_GET_MEMORY_SEG = 11, IOCNUM_GET_GPA_PMAP = 12, + IOCNUM_GLA2GPA = 13, /* register/state accessors */ IOCNUM_SET_REGISTER = 20, @@ -289,4 +299,6 @@ enum { _IOR('v', IOCNUM_GET_HPET_CAPABILITIES, struct vm_hpet_cap) #define VM_GET_GPA_PMAP \ _IOWR('v', IOCNUM_GET_GPA_PMAP, struct vm_gpa_pte) +#define VM_GLA2GPA \ + _IOWR('v', IOCNUM_GLA2GPA, struct vm_gla2gpa) #endif diff --git a/sys/amd64/include/vmm_instruction_emul.h b/sys/amd64/include/vmm_instruction_emul.h index 0901aa2..e4c408b 100644 --- a/sys/amd64/include/vmm_instruction_emul.h +++ b/sys/amd64/include/vmm_instruction_emul.h @@ -29,63 +29,7 @@ #ifndef _VMM_INSTRUCTION_EMUL_H_ #define _VMM_INSTRUCTION_EMUL_H_ -enum vie_cpu_mode { - CPU_MODE_COMPATIBILITY, /* IA-32E mode (CS.L = 0) */ - CPU_MODE_64BIT, /* IA-32E mode (CS.L = 1) */ -}; - -enum vie_paging_mode { - PAGING_MODE_FLAT, - PAGING_MODE_32, - PAGING_MODE_PAE, - PAGING_MODE_64, -}; - -/* - * The data structures 'vie' and 'vie_op' are meant to 
be opaque to the - * consumers of instruction decoding. The only reason why their contents - * need to be exposed is because they are part of the 'vm_exit' structure. - */ -struct vie_op { - uint8_t op_byte; /* actual opcode byte */ - uint8_t op_type; /* type of operation (e.g. MOV) */ - uint16_t op_flags; -}; - -#define VIE_INST_SIZE 15 -struct vie { - uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */ - uint8_t num_valid; /* size of the instruction */ - uint8_t num_processed; - - uint8_t rex_w:1, /* REX prefix */ - rex_r:1, - rex_x:1, - rex_b:1, - rex_present:1; - - uint8_t mod:2, /* ModRM byte */ - reg:4, - rm:4; - - uint8_t ss:2, /* SIB byte */ - index:4, - base:4; - - uint8_t disp_bytes; - uint8_t imm_bytes; - - uint8_t scale; - int base_register; /* VM_REG_GUEST_xyz */ - int index_register; /* VM_REG_GUEST_xyz */ - - int64_t displacement; /* optional addr displacement */ - int64_t immediate; /* optional immediate operand */ - - uint8_t decoded; /* set to 1 if successfully decoded */ - - struct vie_op op; /* opcode description */ -}; +#include <sys/mman.h> /* * Callback functions to read and write memory regions. @@ -111,6 +55,24 @@ int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie, mem_region_read_t mrr, mem_region_write_t mrw, void *mrarg); +int vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg, + uint64_t val, int size); + +/* + * Returns 1 if an alignment check exception should be injected and 0 otherwise. + */ +int vie_alignment_check(int cpl, int operand_size, uint64_t cr0, + uint64_t rflags, uint64_t gla); + +/* Returns 1 if the 'gla' is not canonical and 0 otherwise. */ +int vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla); + +uint64_t vie_size2mask(int size); + +int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, + struct seg_desc *desc, uint64_t off, int length, int addrsize, int prot, + uint64_t *gla); + #ifdef _KERNEL /* * APIs to fetch and decode the instruction from nested page fault handler. @@ -118,8 +80,18 @@ int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie, * 'vie' must be initialized before calling 'vmm_fetch_instruction()' */ int vmm_fetch_instruction(struct vm *vm, int cpuid, - uint64_t rip, int inst_length, uint64_t cr3, - enum vie_paging_mode paging_mode, struct vie *vie); + struct vm_guest_paging *guest_paging, + uint64_t rip, int inst_length, struct vie *vie); + +/* + * Translate the guest linear address 'gla' to a guest physical address. + * + * Returns 0 on success and '*gpa' contains the result of the translation. + * Returns 1 if an exception was injected into the guest. + * Returns -1 otherwise. 
+ */ +int vmm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa); void vie_init(struct vie *vie); @@ -136,7 +108,7 @@ void vie_init(struct vie *vie); */ #define VIE_INVALID_GLA (1UL << 63) /* a non-canonical address */ int vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, - enum vie_cpu_mode cpu_mode, struct vie *vie); + enum vm_cpu_mode cpu_mode, struct vie *vie); #endif /* _KERNEL */ #endif /* _VMM_INSTRUCTION_EMUL_H_ */ diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index e85e5e4..94b7417 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$"); #include <machine/vmm.h> #include <machine/vmm_dev.h> +#include <machine/vmm_instruction_emul.h> #include "vmm_host.h" #include "vmm_ioport.h" #include "vmm_ipi.h" @@ -185,6 +186,8 @@ SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD, */ #define APIC_ACCESS_ADDRESS 0xFFFFF000 +static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc); +static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval); static void vmx_inject_pir(struct vlapic *vlapic); #ifdef KTR @@ -539,7 +542,7 @@ static int vmx_init(int ipinum) { int error, use_tpr_shadow; - uint64_t fixed0, fixed1, feature_control; + uint64_t basic, fixed0, fixed1, feature_control; uint32_t tmp, procbased2_vid_bits; /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ @@ -559,6 +562,17 @@ vmx_init(int ipinum) return (ENXIO); } + /* + * Verify capabilities MSR_VMX_BASIC: + * - bit 54 indicates support for INS/OUTS decoding + */ + basic = rdmsr(MSR_VMX_BASIC); + if ((basic & (1UL << 54)) == 0) { + printf("vmx_init: processor does not support desired basic " + "capabilities\n"); + return (EINVAL); + } + /* Check support for primary processor-based VM-execution controls */ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, @@ -1539,7 +1553,19 @@ vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual) return (HANDLED); } -static enum vie_cpu_mode +/* + * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL + */ +static int +vmx_cpl(void) +{ + uint32_t ssar; + + ssar = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS); + return ((ssar >> 5) & 0x3); +} + +static enum vm_cpu_mode vmx_cpu_mode(void) { @@ -1549,7 +1575,7 @@ vmx_cpu_mode(void) return (CPU_MODE_COMPATIBILITY); } -static enum vie_paging_mode +static enum vm_paging_mode vmx_paging_mode(void) { @@ -1563,6 +1589,89 @@ vmx_paging_mode(void) return (PAGING_MODE_PAE); } +static uint64_t +inout_str_index(struct vmx *vmx, int vcpuid, int in) +{ + uint64_t val; + int error; + enum vm_reg_name reg; + + reg = in ? 
VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; + error = vmx_getreg(vmx, vcpuid, reg, &val); + KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error)); + return (val); +} + +static uint64_t +inout_str_count(struct vmx *vmx, int vcpuid, int rep) +{ + uint64_t val; + int error; + + if (rep) { + error = vmx_getreg(vmx, vcpuid, VM_REG_GUEST_RCX, &val); + KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error)); + } else { + val = 1; + } + return (val); +} + +static int +inout_str_addrsize(uint32_t inst_info) +{ + uint32_t size; + + size = (inst_info >> 7) & 0x7; + switch (size) { + case 0: + return (2); /* 16 bit */ + case 1: + return (4); /* 32 bit */ + case 2: + return (8); /* 64 bit */ + default: + panic("%s: invalid size encoding %d", __func__, size); + } +} + +static void +inout_str_seginfo(struct vmx *vmx, int vcpuid, uint32_t inst_info, int in, + struct vm_inout_str *vis) +{ + int error, s; + + if (in) { + vis->seg_name = VM_REG_GUEST_ES; + } else { + s = (inst_info >> 15) & 0x7; + vis->seg_name = vm_segment_name(s); + } + + error = vmx_getdesc(vmx, vcpuid, vis->seg_name, &vis->seg_desc); + KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error)); + + /* XXX modify svm.c to update bit 16 of seg_desc.access (unusable) */ +} + +static void +vmx_paging_info(struct vm_guest_paging *paging) +{ + paging->cr3 = vmcs_guest_cr3(); + paging->cpl = vmx_cpl(); + paging->cpu_mode = vmx_cpu_mode(); + paging->paging_mode = vmx_paging_mode(); +} + +static void +vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla) +{ + vmexit->exitcode = VM_EXITCODE_INST_EMUL; + vmexit->u.inst_emul.gpa = gpa; + vmexit->u.inst_emul.gla = gla; + vmx_paging_info(&vmexit->u.inst_emul.paging); +} + static int ept_fault_type(uint64_t ept_qual) { @@ -1754,12 +1863,8 @@ vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) } if (allowed) { - vmexit->exitcode = VM_EXITCODE_INST_EMUL; - vmexit->u.inst_emul.gpa = DEFAULT_APIC_BASE + offset; - vmexit->u.inst_emul.gla = VIE_INVALID_GLA; - vmexit->u.inst_emul.cr3 = vmcs_guest_cr3(); - vmexit->u.inst_emul.cpu_mode = vmx_cpu_mode(); - vmexit->u.inst_emul.paging_mode = vmx_paging_mode(); + vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset, + VIE_INVALID_GLA); } /* @@ -1776,10 +1881,12 @@ vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) static int vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) { - int error, handled; + int error, handled, in; struct vmxctx *vmxctx; struct vlapic *vlapic; - uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, reason; + struct vm_inout_str *vis; + uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info; + uint32_t reason; uint64_t qual, gpa; bool retu; @@ -1936,15 +2043,22 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1); vmexit->exitcode = VM_EXITCODE_INOUT; vmexit->u.inout.bytes = (qual & 0x7) + 1; - vmexit->u.inout.in = (qual & 0x8) ? 1 : 0; + vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0; vmexit->u.inout.string = (qual & 0x10) ? 1 : 0; vmexit->u.inout.rep = (qual & 0x20) ? 
1 : 0; vmexit->u.inout.port = (uint16_t)(qual >> 16); vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); - error = emulate_ioport(vmx->vm, vcpu, vmexit); - if (error == 0) { - handled = 1; - vmxctx->guest_rax = vmexit->u.inout.eax; + if (vmexit->u.inout.string) { + inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO); + vmexit->exitcode = VM_EXITCODE_INOUT_STR; + vis = &vmexit->u.inout_str; + vmx_paging_info(&vis->paging); + vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS); + vis->cr0 = vmcs_read(VMCS_GUEST_CR0); + vis->index = inout_str_index(vmx, vcpu, in); + vis->count = inout_str_count(vmx, vcpu, vis->inout.rep); + vis->addrsize = inout_str_addrsize(inst_info); + inout_str_seginfo(vmx, vcpu, inst_info, in, vis); } break; case EXIT_REASON_CPUID: @@ -1990,12 +2104,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) vmexit->u.paging.fault_type = ept_fault_type(qual); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1); } else if (ept_emulation_fault(qual)) { - vmexit->exitcode = VM_EXITCODE_INST_EMUL; - vmexit->u.inst_emul.gpa = gpa; - vmexit->u.inst_emul.gla = vmcs_gla(); - vmexit->u.inst_emul.cr3 = vmcs_guest_cr3(); - vmexit->u.inst_emul.cpu_mode = vmx_cpu_mode(); - vmexit->u.inst_emul.paging_mode = vmx_paging_mode(); + vmexit_inst_emul(vmexit, gpa, vmcs_gla()); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1); } /* @@ -2324,6 +2433,8 @@ vmxctx_regptr(struct vmxctx *vmxctx, int reg) return (&vmxctx->guest_r14); case VM_REG_GUEST_R15: return (&vmxctx->guest_r15); + case VM_REG_GUEST_CR2: + return (&vmxctx->guest_cr2); default: break; } diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 9aed5f5..8ebdfd7 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -62,7 +62,9 @@ __FBSDID("$FreeBSD$"); #include <machine/vmm.h> #include <machine/vmm_dev.h> +#include <machine/vmm_instruction_emul.h> +#include "vmm_ioport.h" #include "vmm_ktr.h" #include "vmm_host.h" #include "vmm_mem.h" @@ -1131,34 +1133,33 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) struct vie *vie; struct vcpu *vcpu; struct vm_exit *vme; - int error, inst_length; - uint64_t rip, gla, gpa, cr3; - enum vie_cpu_mode cpu_mode; - enum vie_paging_mode paging_mode; + uint64_t gla, gpa; + struct vm_guest_paging *paging; mem_region_read_t mread; mem_region_write_t mwrite; + int error; vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; - rip = vme->rip; - inst_length = vme->inst_length; - gla = vme->u.inst_emul.gla; gpa = vme->u.inst_emul.gpa; - cr3 = vme->u.inst_emul.cr3; - cpu_mode = vme->u.inst_emul.cpu_mode; - paging_mode = vme->u.inst_emul.paging_mode; vie = &vme->u.inst_emul.vie; + paging = &vme->u.inst_emul.paging; vie_init(vie); /* Fetch, decode and emulate the faulting instruction */ - if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3, - paging_mode, vie) != 0) + error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip, + vme->inst_length, vie); + if (error == 1) + return (0); /* Resume guest to handle page fault */ + else if (error == -1) return (EFAULT); + else if (error != 0) + panic("%s: vmm_fetch_instruction error %d", __func__, error); - if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, vie) != 0) + if (vmm_decode_instruction(vm, vcpuid, gla, paging->cpu_mode, vie) != 0) return (EFAULT); /* return to userland unless this is an in-kernel emulated device */ @@ -1348,6 +1349,10 @@ restart: case VM_EXITCODE_INST_EMUL: error = vm_handle_inst_emul(vm, vcpuid, &retu); break; + case VM_EXITCODE_INOUT: + case VM_EXITCODE_INOUT_STR: + error = vm_handle_inout(vm, vcpuid, 
vme, &retu); + break; default: retu = true; /* handled in userland */ break; @@ -1430,6 +1435,25 @@ vm_inject_fault(struct vm *vm, int vcpuid, struct vm_exception *exception) } void +vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2) +{ + struct vm_exception pf = { + .vector = IDT_PF, + .error_code_valid = 1, + .error_code = error_code + }; + int error; + + VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx", + error_code, cr2); + + error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2); + KASSERT(error == 0, ("vm_set_register(cr2) error %d", error)); + + vm_inject_fault(vm, vcpuid, &pf); +} + +void vm_inject_gp(struct vm *vm, int vcpuid) { struct vm_exception gpf = { @@ -1856,3 +1880,20 @@ vm_atpit(struct vm *vm) { return (vm->vatpit); } + +enum vm_reg_name +vm_segment_name(int seg) +{ + static enum vm_reg_name seg_names[] = { + VM_REG_GUEST_ES, + VM_REG_GUEST_CS, + VM_REG_GUEST_SS, + VM_REG_GUEST_DS, + VM_REG_GUEST_FS, + VM_REG_GUEST_GS + }; + + KASSERT(seg >= 0 && seg < nitems(seg_names), + ("%s: invalid segment encoding %d", __func__, seg)); + return (seg_names[seg]); +} diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c index f1d5795..0561785 100644 --- a/sys/amd64/vmm/vmm_dev.c +++ b/sys/amd64/vmm/vmm_dev.c @@ -48,6 +48,7 @@ __FBSDID("$FreeBSD$"); #include <machine/vmparam.h> #include <machine/vmm.h> +#include <machine/vmm_instruction_emul.h> #include <machine/vmm_dev.h> #include "vmm_lapic.h" @@ -168,6 +169,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct vm_x2apic *x2apic; struct vm_gpa_pte *gpapte; struct vm_suspend *vmsuspend; + struct vm_gla2gpa *gg; sc = vmmdev_lookup2(cdev); if (sc == NULL) @@ -192,6 +194,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, case VM_PPTDEV_MSI: case VM_PPTDEV_MSIX: case VM_SET_X2APIC_STATE: + case VM_GLA2GPA: /* * XXX fragile, handle with care * Assumes that the first field of the ioctl data is the vcpu. 
@@ -415,6 +418,27 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, case VM_GET_HPET_CAPABILITIES: error = vhpet_getcap((struct vm_hpet_cap *)data); break; + case VM_GLA2GPA: { + CTASSERT(PROT_READ == VM_PROT_READ); + CTASSERT(PROT_WRITE == VM_PROT_WRITE); + CTASSERT(PROT_EXEC == VM_PROT_EXECUTE); + gg = (struct vm_gla2gpa *)data; + error = vmm_gla2gpa(sc->vm, gg->vcpuid, &gg->paging, gg->gla, + gg->prot, &gg->gpa); + KASSERT(error == 0 || error == 1 || error == -1, + ("%s: vmm_gla2gpa unknown error %d", __func__, error)); + if (error >= 0) { + /* + * error = 0: the translation was successful + * error = 1: a fault was injected into the guest + */ + gg->fault = error; + error = 0; + } else { + error = EFAULT; + } + break; + } default: error = ENOTTY; break; diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c index f7d109c..921deb5 100644 --- a/sys/amd64/vmm/vmm_instruction_emul.c +++ b/sys/amd64/vmm/vmm_instruction_emul.c @@ -34,6 +34,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/pcpu.h> #include <sys/systm.h> +#include <sys/proc.h> #include <vm/vm.h> #include <vm/pmap.h> @@ -46,9 +47,15 @@ __FBSDID("$FreeBSD$"); #include <machine/vmm.h> +#include <assert.h> #include <vmmapi.h> +#define KASSERT(exp,msg) assert((exp)) #endif /* _KERNEL */ +#include <machine/vmm_instruction_emul.h> +#include <x86/psl.h> +#include <x86/specialreg.h> + /* struct vie_op.op_type */ enum { VIE_OP_TYPE_NONE = 0, @@ -205,7 +212,7 @@ vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval) return (error); } -static int +int vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t val, int size) { @@ -560,6 +567,155 @@ vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, return (error); } +int +vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla) +{ + KASSERT(size == 1 || size == 2 || size == 4 || size == 8, + ("%s: invalid size %d", __func__, size)); + KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl)); + + if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0) + return (0); + + return ((gla & (size - 1)) ? 1 : 0); +} + +int +vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla) +{ + uint64_t mask; + + if (cpu_mode != CPU_MODE_64BIT) + return (0); + + /* + * The value of the bit 47 in the 'gla' should be replicated in the + * most significant 16 bits. 
+ */ + mask = ~((1UL << 48) - 1); + if (gla & (1UL << 47)) + return ((gla & mask) != mask); + else + return ((gla & mask) != 0); +} + +uint64_t +vie_size2mask(int size) +{ + KASSERT(size == 1 || size == 2 || size == 4 || size == 8, + ("vie_size2mask: invalid size %d", size)); + return (size2mask[size]); +} + +int +vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, + struct seg_desc *desc, uint64_t offset, int length, int addrsize, + int prot, uint64_t *gla) +{ + uint64_t firstoff, low_limit, high_limit, segbase; + int glasize, type; + + KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS, + ("%s: invalid segment %d", __func__, seg)); + KASSERT(length == 1 || length == 2 || length == 4 || length == 8, + ("%s: invalid operand size %d", __func__, length)); + KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0, + ("%s: invalid prot %#x", __func__, prot)); + + firstoff = offset; + if (cpu_mode == CPU_MODE_64BIT) { + KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address " + "size %d for cpu_mode %d", __func__, addrsize, cpu_mode)); + glasize = 8; + } else { + KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address " + "size %d for cpu mode %d", __func__, addrsize, cpu_mode)); + glasize = 4; + /* + * If the segment selector is loaded with a NULL selector + * then the descriptor is unusable and attempting to use + * it results in a #GP(0). + */ + if (SEG_DESC_UNUSABLE(desc)) + return (-1); + + /* + * The processor generates a #NP exception when a segment + * register is loaded with a selector that points to a + * descriptor that is not present. If this was the case then + * it would have been checked before the VM-exit. + */ + KASSERT(SEG_DESC_PRESENT(desc), ("segment %d not present: %#x", + seg, desc->access)); + + /* + * The descriptor type must indicate a code/data segment. + */ + type = SEG_DESC_TYPE(desc); + KASSERT(type >= 16 && type <= 31, ("segment %d has invalid " + "descriptor type %#x", seg, type)); + + if (prot & PROT_READ) { + /* #GP on a read access to a exec-only code segment */ + if ((type & 0xA) == 0x8) + return (-1); + } + + if (prot & PROT_WRITE) { + /* + * #GP on a write access to a code segment or a + * read-only data segment. + */ + if (type & 0x8) /* code segment */ + return (-1); + + if ((type & 0xA) == 0) /* read-only data seg */ + return (-1); + } + + /* + * 'desc->limit' is fully expanded taking granularity into + * account. + */ + if ((type & 0xC) == 0x4) { + /* expand-down data segment */ + low_limit = desc->limit + 1; + high_limit = SEG_DESC_DEF32(desc) ? 0xffffffff : 0xffff; + } else { + /* code segment or expand-up data segment */ + low_limit = 0; + high_limit = desc->limit; + } + + while (length > 0) { + offset &= vie_size2mask(addrsize); + if (offset < low_limit || offset > high_limit) + return (-1); + offset++; + length--; + } + } + + /* + * In 64-bit mode all segments except %fs and %gs have a segment + * base address of 0. + */ + if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS && + seg != VM_REG_GUEST_GS) { + segbase = 0; + } else { + segbase = desc->base; + } + + /* + * Truncate 'firstoff' to the effective address size before adding + * it to the segment base. 
+ */ + firstoff &= vie_size2mask(addrsize); + *gla = (segbase + firstoff) & vie_size2mask(glasize); + return (0); +} + #ifdef _KERNEL void vie_init(struct vie *vie) @@ -572,28 +728,86 @@ vie_init(struct vie *vie) } static int -gla2gpa(struct vm *vm, uint64_t gla, uint64_t ptpphys, - uint64_t *gpa, enum vie_paging_mode paging_mode) +pf_error_code(int usermode, int prot, int rsvd, uint64_t pte) +{ + int error_code = 0; + + if (pte & PG_V) + error_code |= PGEX_P; + if (prot & VM_PROT_WRITE) + error_code |= PGEX_W; + if (usermode) + error_code |= PGEX_U; + if (rsvd) + error_code |= PGEX_RSV; + if (prot & VM_PROT_EXECUTE) + error_code |= PGEX_I; + + return (error_code); +} + +static void +ptp_release(void **cookie) +{ + if (*cookie != NULL) { + vm_gpa_release(*cookie); + *cookie = NULL; + } +} + +static void * +ptp_hold(struct vm *vm, vm_paddr_t ptpphys, size_t len, void **cookie) { - int nlevels, ptpshift, ptpindex; - uint64_t *ptpbase, pte, pgsize; + void *ptr; + + ptp_release(cookie); + ptr = vm_gpa_hold(vm, ptpphys, len, VM_PROT_RW, cookie); + return (ptr); +} + +int +vmm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa) +{ + int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable; + u_int retries; + uint64_t *ptpbase, ptpphys, pte, pgsize; uint32_t *ptpbase32, pte32; void *cookie; - if (paging_mode == PAGING_MODE_FLAT) { + usermode = (paging->cpl == 3 ? 1 : 0); + writable = prot & VM_PROT_WRITE; + cookie = NULL; + retval = 0; + retries = 0; +restart: + ptpphys = paging->cr3; /* root of the page tables */ + ptp_release(&cookie); + if (retries++ > 0) + maybe_yield(); + + if (vie_canonical_check(paging->cpu_mode, gla)) { + /* + * XXX assuming a non-stack reference otherwise a stack fault + * should be generated. + */ + vm_inject_gp(vm, vcpuid); + goto fault; + } + + if (paging->paging_mode == PAGING_MODE_FLAT) { *gpa = gla; - return (0); + goto done; } - if (paging_mode == PAGING_MODE_32) { + if (paging->paging_mode == PAGING_MODE_32) { nlevels = 2; while (--nlevels >= 0) { /* Zero out the lower 12 bits. */ ptpphys &= ~0xfff; - ptpbase32 = vm_gpa_hold(vm, ptpphys, PAGE_SIZE, - VM_PROT_READ, &cookie); - + ptpbase32 = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie); + if (ptpbase32 == NULL) goto error; @@ -603,29 +817,55 @@ gla2gpa(struct vm *vm, uint64_t gla, uint64_t ptpphys, pte32 = ptpbase32[ptpindex]; - vm_gpa_release(cookie); - - if ((pte32 & PG_V) == 0) - goto error; + if ((pte32 & PG_V) == 0 || + (usermode && (pte32 & PG_U) == 0) || + (writable && (pte32 & PG_RW) == 0)) { + pfcode = pf_error_code(usermode, prot, 0, + pte32); + vm_inject_pf(vm, vcpuid, pfcode, gla); + goto fault; + } - if (pte32 & PG_PS) + /* + * Emulate the x86 MMU's management of the accessed + * and dirty flags. While the accessed flag is set + * at every level of the page table, the dirty flag + * is only set at the last level providing the guest + * physical address. 
+ */ + if ((pte32 & PG_A) == 0) { + if (atomic_cmpset_32(&ptpbase32[ptpindex], + pte32, pte32 | PG_A) == 0) { + goto restart; + } + } + + /* XXX must be ignored if CR4.PSE=0 */ + if (nlevels > 0 && (pte32 & PG_PS) != 0) break; ptpphys = pte32; } + /* Set the dirty bit in the page table entry if necessary */ + if (writable && (pte32 & PG_M) == 0) { + if (atomic_cmpset_32(&ptpbase32[ptpindex], + pte32, pte32 | PG_M) == 0) { + goto restart; + } + } + /* Zero out the lower 'ptpshift' bits */ pte32 >>= ptpshift; pte32 <<= ptpshift; *gpa = pte32 | (gla & (pgsize - 1)); - return (0); + goto done; } - if (paging_mode == PAGING_MODE_PAE) { - /* Zero out the lower 5 bits and the upper 12 bits */ - ptpphys >>= 5; ptpphys <<= 17; ptpphys >>= 12; + if (paging->paging_mode == PAGING_MODE_PAE) { + /* Zero out the lower 5 bits and the upper 32 bits */ + ptpphys &= 0xffffffe0UL; - ptpbase = vm_gpa_hold(vm, ptpphys, sizeof(*ptpbase) * 4, - VM_PROT_READ, &cookie); + ptpbase = ptp_hold(vm, ptpphys, sizeof(*ptpbase) * 4, &cookie); if (ptpbase == NULL) goto error; @@ -633,10 +873,11 @@ gla2gpa(struct vm *vm, uint64_t gla, uint64_t ptpphys, pte = ptpbase[ptpindex]; - vm_gpa_release(cookie); - - if ((pte & PG_V) == 0) - goto error; + if ((pte & PG_V) == 0) { + pfcode = pf_error_code(usermode, prot, 0, pte); + vm_inject_pf(vm, vcpuid, pfcode, gla); + goto fault; + } ptpphys = pte; @@ -647,8 +888,7 @@ gla2gpa(struct vm *vm, uint64_t gla, uint64_t ptpphys, /* Zero out the lower 12 bits and the upper 12 bits */ ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12; - ptpbase = vm_gpa_hold(vm, ptpphys, PAGE_SIZE, VM_PROT_READ, - &cookie); + ptpbase = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie); if (ptpbase == NULL) goto error; @@ -658,36 +898,59 @@ gla2gpa(struct vm *vm, uint64_t gla, uint64_t ptpphys, pte = ptpbase[ptpindex]; - vm_gpa_release(cookie); + if ((pte & PG_V) == 0 || + (usermode && (pte & PG_U) == 0) || + (writable && (pte & PG_RW) == 0)) { + pfcode = pf_error_code(usermode, prot, 0, pte); + vm_inject_pf(vm, vcpuid, pfcode, gla); + goto fault; + } - if ((pte & PG_V) == 0) - goto error; + /* Set the accessed bit in the page table entry */ + if ((pte & PG_A) == 0) { + if (atomic_cmpset_64(&ptpbase[ptpindex], + pte, pte | PG_A) == 0) { + goto restart; + } + } - if (pte & PG_PS) { - if (pgsize > 1 * GB) - goto error; - else - break; + if (nlevels > 0 && (pte & PG_PS) != 0) { + if (pgsize > 1 * GB) { + pfcode = pf_error_code(usermode, prot, 1, pte); + vm_inject_pf(vm, vcpuid, pfcode, gla); + goto fault; + } + break; } ptpphys = pte; } + /* Set the dirty bit in the page table entry if necessary */ + if (writable && (pte & PG_M) == 0) { + if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0) + goto restart; + } + /* Zero out the lower 'ptpshift' bits and the upper 12 bits */ pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12; *gpa = pte | (gla & (pgsize - 1)); - return (0); - +done: + ptp_release(&cookie); + return (retval); error: - return (-1); + retval = -1; + goto done; +fault: + retval = 1; + goto done; } int -vmm_fetch_instruction(struct vm *vm, int cpuid, uint64_t rip, int inst_length, - uint64_t cr3, enum vie_paging_mode paging_mode, - struct vie *vie) +vmm_fetch_instruction(struct vm *vm, int cpuid, struct vm_guest_paging *paging, + uint64_t rip, int inst_length, struct vie *vie) { - int n, err, prot; + int n, error, prot; uint64_t gpa, off; void *hpa, *cookie; @@ -701,9 +964,9 @@ vmm_fetch_instruction(struct vm *vm, int cpuid, uint64_t rip, int inst_length, /* Copy the instruction into 'vie' */ 
while (vie->num_valid < inst_length) { - err = gla2gpa(vm, rip, cr3, &gpa, paging_mode); - if (err) - break; + error = vmm_gla2gpa(vm, cpuid, paging, rip, prot, &gpa); + if (error) + return (error); off = gpa & PAGE_MASK; n = min(inst_length - vie->num_valid, PAGE_SIZE - off); @@ -804,7 +1067,7 @@ decode_opcode(struct vie *vie) } static int -decode_modrm(struct vie *vie, enum vie_cpu_mode cpu_mode) +decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode) { uint8_t x; @@ -1084,7 +1347,7 @@ verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie) int vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, - enum vie_cpu_mode cpu_mode, struct vie *vie) + enum vm_cpu_mode cpu_mode, struct vie *vie) { if (cpu_mode == CPU_MODE_64BIT) { diff --git a/sys/amd64/vmm/vmm_ioport.c b/sys/amd64/vmm/vmm_ioport.c index eae45cc..4ce1403 100644 --- a/sys/amd64/vmm/vmm_ioport.c +++ b/sys/amd64/vmm/vmm_ioport.c @@ -33,11 +33,16 @@ __FBSDID("$FreeBSD$"); #include <sys/cpuset.h> #include <sys/systm.h> +#include <vm/vm.h> + #include <machine/vmm.h> +#include <machine/vmm_instruction_emul.h> +#include <x86/psl.h> #include "vatpic.h" #include "vatpit.h" #include "vmm_ioport.h" +#include "vmm_ktr.h" #define MAX_IOPORTS 1280 @@ -55,32 +60,64 @@ ioport_handler_func_t ioport_handler[MAX_IOPORTS] = { [IO_ELCR2] = vatpic_elc_handler, }; -int -emulate_ioport(struct vm *vm, int vcpuid, struct vm_exit *vmexit) +#ifdef KTR +static const char * +inout_instruction(struct vm_exit *vmexit) { - ioport_handler_func_t handler; - uint32_t mask, val; - int error; + int index; - if (vmexit->u.inout.port >= MAX_IOPORTS) - return (-1); - - handler = ioport_handler[vmexit->u.inout.port]; - if (handler == NULL) - return (-1); + static const char *iodesc[] = { + "outb", "outw", "outl", + "inb", "inw", "inl", + "outsb", "outsw", "outsd" + "insb", "insw", "insd", + }; switch (vmexit->u.inout.bytes) { case 1: - mask = 0xff; + index = 0; break; case 2: - mask = 0xffff; + index = 1; break; default: - mask = 0xffffffff; + index = 2; break; } + if (vmexit->u.inout.in) + index += 3; + + if (vmexit->u.inout.string) + index += 6; + + KASSERT(index < nitems(iodesc), ("%s: invalid index %d", + __func__, index)); + + return (iodesc[index]); +} +#endif /* KTR */ + +static int +emulate_inout_port(struct vm *vm, int vcpuid, struct vm_exit *vmexit, + bool *retu) +{ + ioport_handler_func_t handler; + uint32_t mask, val; + int error; + + error = 0; + *retu = true; + + if (vmexit->u.inout.port >= MAX_IOPORTS) + goto done; + + handler = ioport_handler[vmexit->u.inout.port]; + if (handler == NULL) + goto done; + + mask = vie_size2mask(vmexit->u.inout.bytes); + if (!vmexit->u.inout.in) { val = vmexit->u.inout.eax & mask; } @@ -88,10 +125,47 @@ emulate_ioport(struct vm *vm, int vcpuid, struct vm_exit *vmexit) error = (*handler)(vm, vcpuid, vmexit->u.inout.in, vmexit->u.inout.port, vmexit->u.inout.bytes, &val); - if (!error && vmexit->u.inout.in) { - vmexit->u.inout.eax &= ~mask; - vmexit->u.inout.eax |= val & mask; + if (!error) { + *retu = false; + if (vmexit->u.inout.in) { + vmexit->u.inout.eax &= ~mask; + vmexit->u.inout.eax |= val & mask; + error = vm_set_register(vm, vcpuid, + VM_REG_GUEST_RAX, vmexit->u.inout.eax); + KASSERT(error == 0, ("emulate_ioport: error %d " + "setting guest rax register", error)); + } } +done: + return (error); +} + +static int +emulate_inout_str(struct vm *vm, int vcpuid, struct vm_exit *vmexit, bool *retu) +{ + *retu = true; + return (0); /* Return to userspace to finish emulation */ +} + +int 
+vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vmexit, bool *retu) +{ + int bytes, error; + + bytes = vmexit->u.inout.bytes; + KASSERT(bytes == 1 || bytes == 2 || bytes == 4, + ("vm_handle_inout: invalid operand size %d", bytes)); + + if (vmexit->u.inout.string) + error = emulate_inout_str(vm, vcpuid, vmexit, retu); + else + error = emulate_inout_port(vm, vcpuid, vmexit, retu); + + VCPU_CTR4(vm, vcpuid, "%s%s 0x%04x: %s", + vmexit->u.inout.rep ? "rep " : "", + inout_instruction(vmexit), + vmexit->u.inout.port, + error ? "error" : (*retu ? "userspace" : "handled")); return (error); } diff --git a/sys/amd64/vmm/vmm_ioport.h b/sys/amd64/vmm/vmm_ioport.h index 02e543a..84a4cf1 100644 --- a/sys/amd64/vmm/vmm_ioport.h +++ b/sys/amd64/vmm/vmm_ioport.h @@ -32,6 +32,6 @@ typedef int (*ioport_handler_func_t)(void *vm, int vcpuid, bool in, int port, int bytes, uint32_t *val); -int emulate_ioport(struct vm *vm, int vcpuid, struct vm_exit *vmexit); +int vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme, bool *retu); #endif /* _VMM_IOPORT_H_ */ diff --git a/sys/amd64/vmm/vmm_ktr.h b/sys/amd64/vmm/vmm_ktr.h index 9fb46d8..61ff53f 100644 --- a/sys/amd64/vmm/vmm_ktr.h +++ b/sys/amd64/vmm/vmm_ktr.h @@ -48,6 +48,10 @@ CTR4(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), (p1), (p2)) #define VCPU_CTR3(vm, vcpuid, format, p1, p2, p3) \ CTR5(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), (p1), (p2), (p3)) +#define VCPU_CTR4(vm, vcpuid, format, p1, p2, p3, p4) \ +CTR6(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), \ + (p1), (p2), (p3), (p4)) + #define VM_CTR0(vm, format) \ CTR1(KTR_VMM, "vm %s: " format, vm_name((vm))) diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c index d9b4418..f9a67cb 100644 --- a/usr.sbin/bhyve/bhyverun.c +++ b/usr.sbin/bhyve/bhyverun.c @@ -288,33 +288,34 @@ static int vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { int error; - int bytes, port, in, out; - uint32_t eax; + int bytes, port, in, out, string; int vcpu; vcpu = *pvcpu; port = vme->u.inout.port; bytes = vme->u.inout.bytes; - eax = vme->u.inout.eax; + string = vme->u.inout.string; in = vme->u.inout.in; out = !in; - /* We don't deal with these */ - if (vme->u.inout.string || vme->u.inout.rep) - return (VMEXIT_ABORT); - /* Extra-special case of host notifications */ - if (out && port == GUEST_NIO_PORT) - return (vmexit_handle_notify(ctx, vme, pvcpu, eax)); + if (out && port == GUEST_NIO_PORT) { + error = vmexit_handle_notify(ctx, vme, pvcpu, vme->u.inout.eax); + return (error); + } - error = emulate_inout(ctx, vcpu, in, port, bytes, &eax, strictio); - if (error == INOUT_OK && in) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, eax); + error = emulate_inout(ctx, vcpu, vme, strictio); + if (error == INOUT_OK && in && !string) { + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, + vme->u.inout.eax); + } switch (error) { case INOUT_OK: return (VMEXIT_CONTINUE); + case INOUT_RESTART: + return (VMEXIT_RESTART); case INOUT_RESET: stats.io_reset++; return (VMEXIT_RESET); @@ -514,6 +515,7 @@ vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) static vmexit_handler_t handler[VM_EXITCODE_MAX] = { [VM_EXITCODE_INOUT] = vmexit_inout, + [VM_EXITCODE_INOUT_STR] = vmexit_inout, [VM_EXITCODE_VMX] = vmexit_vmx, [VM_EXITCODE_BOGUS] = vmexit_bogus, [VM_EXITCODE_RDMSR] = vmexit_rdmsr, diff --git a/usr.sbin/bhyve/inout.c b/usr.sbin/bhyve/inout.c index 5fbe99b..fe9e0d8 100644 --- a/usr.sbin/bhyve/inout.c +++ b/usr.sbin/bhyve/inout.c @@ 
-31,11 +31,21 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/linker_set.h> +#include <sys/_iovec.h> +#include <sys/mman.h> + +#include <x86/psl.h> +#include <x86/segments.h> + +#include <machine/vmm.h> +#include <machine/vmm_instruction_emul.h> +#include <vmmapi.h> #include <stdio.h> #include <string.h> #include <assert.h> +#include "bhyverun.h" #include "inout.h" SET_DECLARE(inout_port_set, struct inout_port); @@ -91,52 +101,135 @@ register_default_iohandler(int start, int size) } int -emulate_inout(struct vmctx *ctx, int vcpu, int in, int port, int bytes, - uint32_t *eax, int strict) +emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict) { - int flags; - uint32_t mask, val; + int addrsize, bytes, flags, in, port, prot, rep; + uint32_t val; inout_func_t handler; void *arg; - int error; + int error, retval; + enum vm_reg_name idxreg; + uint64_t gla, index, iterations, count; + struct vm_inout_str *vis; + struct iovec iov[2]; + + bytes = vmexit->u.inout.bytes; + in = vmexit->u.inout.in; + port = vmexit->u.inout.port; assert(port < MAX_IOPORTS); + assert(bytes == 1 || bytes == 2 || bytes == 4); handler = inout_handlers[port].handler; if (strict && handler == default_inout) return (-1); - switch (bytes) { - case 1: - mask = 0xff; - break; - case 2: - mask = 0xffff; - break; - default: - mask = 0xffffffff; - break; - } + flags = inout_handlers[port].flags; + arg = inout_handlers[port].arg; - if (!in) { - val = *eax & mask; + if (in) { + if (!(flags & IOPORT_F_IN)) + return (-1); + } else { + if (!(flags & IOPORT_F_OUT)) + return (-1); } - flags = inout_handlers[port].flags; - arg = inout_handlers[port].arg; + retval = 0; + if (vmexit->u.inout.string) { + vis = &vmexit->u.inout_str; + rep = vis->inout.rep; + addrsize = vis->addrsize; + prot = in ? PROT_WRITE : PROT_READ; + assert(addrsize == 2 || addrsize == 4 || addrsize == 8); + + /* Index register */ + idxreg = in ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; + index = vis->index & vie_size2mask(addrsize); + + /* Count register */ + count = vis->count & vie_size2mask(addrsize); + + /* Limit number of back-to-back in/out emulations to 16 */ + iterations = MIN(count, 16); + while (iterations > 0) { + if (vie_calculate_gla(vis->paging.cpu_mode, + vis->seg_name, &vis->seg_desc, index, bytes, + addrsize, prot, &gla)) { + error = vm_inject_exception2(ctx, vcpu, + IDT_GP, 0); + assert(error == 0); + retval = INOUT_RESTART; + break; + } + + error = vm_gla2gpa(ctx, vcpu, &vis->paging, gla, bytes, + prot, iov, nitems(iov)); + assert(error == 0 || error == 1 || error == -1); + if (error) { + retval = (error == 1) ? 
INOUT_RESTART : + INOUT_ERROR; + break; + } + + if (vie_alignment_check(vis->paging.cpl, bytes, + vis->cr0, vis->rflags, gla)) { + error = vm_inject_exception2(ctx, vcpu, + IDT_AC, 0); + assert(error == 0); + return (INOUT_RESTART); + } + + val = 0; + if (!in) + vm_copyin(ctx, vcpu, iov, &val, bytes); + + retval = handler(ctx, vcpu, in, port, bytes, &val, arg); + if (retval != 0) + break; + + if (in) + vm_copyout(ctx, vcpu, &val, iov, bytes); + + /* Update index */ + if (vis->rflags & PSL_D) + index -= bytes; + else + index += bytes; + + count--; + iterations--; + } - if ((in && (flags & IOPORT_F_IN)) || (!in && (flags & IOPORT_F_OUT))) - error = (*handler)(ctx, vcpu, in, port, bytes, &val, arg); - else - error = -1; + /* Update index register */ + error = vie_update_register(ctx, vcpu, idxreg, index, addrsize); + assert(error == 0); + + /* + * Update count register only if the instruction had a repeat + * prefix. + */ + if (rep) { + error = vie_update_register(ctx, vcpu, VM_REG_GUEST_RCX, + count, addrsize); + assert(error == 0); + } - if (!error && in) { - *eax &= ~mask; - *eax |= val & mask; + /* Restart the instruction if more iterations remain */ + if (retval == INOUT_OK && count != 0) + retval = INOUT_RESTART; + } else { + if (!in) { + val = vmexit->u.inout.eax & vie_size2mask(bytes); + } + retval = handler(ctx, vcpu, in, port, bytes, &val, arg); + if (retval == 0 && in) { + vmexit->u.inout.eax &= ~vie_size2mask(bytes); + vmexit->u.inout.eax |= val & vie_size2mask(bytes); + } } - - return (error); + return (retval); } void diff --git a/usr.sbin/bhyve/inout.h b/usr.sbin/bhyve/inout.h index 5117d79..f15a2c8 100644 --- a/usr.sbin/bhyve/inout.h +++ b/usr.sbin/bhyve/inout.h @@ -32,12 +32,14 @@ #include <sys/linker_set.h> struct vmctx; +struct vm_exit; /* Handler return values. */ #define INOUT_ERROR -1 #define INOUT_OK 0 -#define INOUT_RESET 1 -#define INOUT_POWEROFF 2 +#define INOUT_RESTART 1 +#define INOUT_RESET 2 +#define INOUT_POWEROFF 3 typedef int (*inout_func_t)(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg); @@ -72,8 +74,8 @@ struct inout_port { DATA_SET(inout_port_set, __CONCAT(__inout_port, __LINE__)) void init_inout(void); -int emulate_inout(struct vmctx *, int vcpu, int in, int port, int bytes, - uint32_t *eax, int strict); +int emulate_inout(struct vmctx *, int vcpu, struct vm_exit *vmexit, + int strict); int register_inout(struct inout_port *iop); int unregister_inout(struct inout_port *iop); void init_bvmcons(void); diff --git a/usr.sbin/bhyve/mem.c b/usr.sbin/bhyve/mem.c index d5c7935..7ea630f 100644 --- a/usr.sbin/bhyve/mem.c +++ b/usr.sbin/bhyve/mem.c @@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$"); #include <sys/tree.h> #include <sys/errno.h> #include <machine/vmm.h> +#include <machine/vmm_instruction_emul.h> #include <stdio.h> #include <stdlib.h> diff --git a/usr.sbin/bhyve/pci_virtio_block.c b/usr.sbin/bhyve/pci_virtio_block.c index 3474fd7..cf1c655 100644 --- a/usr.sbin/bhyve/pci_virtio_block.c +++ b/usr.sbin/bhyve/pci_virtio_block.c @@ -52,10 +52,6 @@ __FBSDID("$FreeBSD$"); #include "pci_emul.h" #include "virtio.h" -#ifndef min -#define min(a, b) ((a) < (b) ? (a) : (b)) -#endif - #define VTBLK_RINGSZ 64 #define VTBLK_MAXSEGS 32 @@ -217,7 +213,7 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) case VBH_OP_IDENT: /* Assume a single buffer */ strlcpy(iov[1].iov_base, sc->vbsc_ident, - min(iov[1].iov_len, sizeof(sc->vbsc_ident))); + MIN(iov[1].iov_len, sizeof(sc->vbsc_ident))); err = 0; break; default: |
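
A few editorial self-checks of the new vie_* helpers, to make their contracts concrete; this is not part of the commit. It assumes a userland build that can reach <x86/psl.h> and <x86/specialreg.h> for PSL_AC and CR0_AM, as vmm_instruction_emul.c itself now does, and check_vie_helpers is a hypothetical test harness.

#include <sys/types.h>

#include <x86/psl.h>
#include <x86/specialreg.h>

#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>

#include <assert.h>

static void
check_vie_helpers(void)
{
	/* Bit 47 set but bits 63:48 clear: non-canonical in 64-bit mode. */
	assert(vie_canonical_check(CPU_MODE_64BIT, 0x0000800000000000UL));

	/* Properly sign-extended address: canonical. */
	assert(vie_canonical_check(CPU_MODE_64BIT, 0xffff800000000000UL) == 0);

	/* #AC requires CPL 3, CR0.AM, RFLAGS.AC and a misaligned address. */
	assert(vie_alignment_check(3, 4, CR0_AM, PSL_AC, 0x1002));
	assert(vie_alignment_check(0, 4, CR0_AM, PSL_AC, 0x1002) == 0);

	/* A 2-byte operand keeps only the low 16 bits. */
	assert(vie_size2mask(2) == 0xffff);
}
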