diff options
41 files changed, 3057 insertions, 594 deletions
diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c index 9fb2308..93955c7 100644 --- a/lib/libvmmapi/vmmapi.c +++ b/lib/libvmmapi/vmmapi.c @@ -36,6 +36,7 @@ __FBSDID("$FreeBSD$"); #include <sys/_iovec.h> #include <sys/cpuset.h> +#include <x86/segments.h> #include <machine/specialreg.h> #include <machine/param.h> @@ -327,6 +328,16 @@ vm_get_desc(struct vmctx *ctx, int vcpu, int reg, } int +vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *seg_desc) +{ + int error; + + error = vm_get_desc(ctx, vcpu, reg, &seg_desc->base, &seg_desc->limit, + &seg_desc->access); + return (error); +} + +int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val) { int error; @@ -988,7 +999,7 @@ gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, #endif int -vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, +vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt) { uint64_t gpa; @@ -1106,3 +1117,32 @@ vm_activate_cpu(struct vmctx *ctx, int vcpu) error = ioctl(ctx->fd, VM_ACTIVATE_CPU, &ac); return (error); } + +int +vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *info1, uint64_t *info2) +{ + struct vm_intinfo vmii; + int error; + + bzero(&vmii, sizeof(struct vm_intinfo)); + vmii.vcpuid = vcpu; + error = ioctl(ctx->fd, VM_GET_INTINFO, &vmii); + if (error == 0) { + *info1 = vmii.info1; + *info2 = vmii.info2; + } + return (error); +} + +int +vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t info1) +{ + struct vm_intinfo vmii; + int error; + + bzero(&vmii, sizeof(struct vm_intinfo)); + vmii.vcpuid = vcpu; + vmii.info1 = info1; + error = ioctl(ctx->fd, VM_SET_INTINFO, &vmii); + return (error); +} diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h index 067eaa0..fbb6ddd 100644 --- a/lib/libvmmapi/vmmapi.h +++ b/lib/libvmmapi/vmmapi.h @@ -66,6 +66,8 @@ int vm_set_desc(struct vmctx *ctx, int vcpu, int reg, uint64_t base, uint32_t limit, uint32_t access); int vm_get_desc(struct vmctx *ctx, int vcpu, int reg, uint64_t *base, uint32_t *limit, uint32_t *access); +int vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, + struct seg_desc *seg_desc); int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val); int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval); int vm_run(struct vmctx *ctx, int vcpu, uint64_t rip, @@ -104,6 +106,9 @@ int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func, int idx, uint64_t addr, uint64_t msg, uint32_t vector_control); +int vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *i1, uint64_t *i2); +int vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t exit_intinfo); + /* * Return a pointer to the statistics buffer. Note that this is not MT-safe. */ @@ -121,7 +126,7 @@ int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities); * The 'iovcnt' should be big enough to accomodate all GPA segments. * Returns 0 on success, 1 on a guest fault condition and -1 otherwise. */ -int vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, +int vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *pg, uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt); void vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *guest_iov, void *host_dst, size_t len); diff --git a/sys/amd64/amd64/identcpu.c b/sys/amd64/amd64/identcpu.c index 957120c..74be82c 100644 --- a/sys/amd64/amd64/identcpu.c +++ b/sys/amd64/amd64/identcpu.c @@ -61,6 +61,7 @@ __FBSDID("$FreeBSD$"); #include <machine/specialreg.h> #include <machine/md_var.h> +#include <amd64/vmm/intel/vmx_controls.h> #include <x86/isa/icu.h> /* XXX - should be in header file: */ @@ -73,6 +74,7 @@ static u_int find_cpu_vendor_id(void); static void print_AMD_info(void); static void print_AMD_assoc(int i); static void print_via_padlock_info(void); +static void print_vmx_info(void); int cpu_class; char machine[] = "amd64"; @@ -428,6 +430,9 @@ printcpuinfo(void) if (via_feature_rng != 0 || via_feature_xcrypt != 0) print_via_padlock_info(); + if (cpu_feature2 & CPUID2_VMX) + print_vmx_info(); + if ((cpu_feature & CPUID_HTT) && cpu_vendor_id == CPU_VENDOR_AMD) cpu_feature &= ~CPUID_HTT; @@ -722,3 +727,197 @@ print_via_padlock_info(void) "\015RSA" /* PMM */ ); } + +static uint32_t +vmx_settable(uint64_t basic, int msr, int true_msr) +{ + uint64_t val; + + if (basic & (1UL << 55)) + val = rdmsr(true_msr); + else + val = rdmsr(msr); + + /* Just report the controls that can be set to 1. */ + return (val >> 32); +} + +static void +print_vmx_info(void) +{ + uint64_t basic, msr; + uint32_t entry, exit, mask, pin, proc, proc2; + int comma; + + printf("\n VT-x: "); + msr = rdmsr(MSR_IA32_FEATURE_CONTROL); + if (!(msr & IA32_FEATURE_CONTROL_VMX_EN)) + printf("(disabled in BIOS) "); + basic = rdmsr(MSR_VMX_BASIC); + pin = vmx_settable(basic, MSR_VMX_PINBASED_CTLS, + MSR_VMX_TRUE_PINBASED_CTLS); + proc = vmx_settable(basic, MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS); + if (proc & PROCBASED_SECONDARY_CONTROLS) + proc2 = vmx_settable(basic, MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2); + else + proc2 = 0; + exit = vmx_settable(basic, MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS); + entry = vmx_settable(basic, MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS); + + if (!bootverbose) { + comma = 0; + if (exit & VM_EXIT_SAVE_PAT && exit & VM_EXIT_LOAD_PAT && + entry & VM_ENTRY_LOAD_PAT) { + printf("%sPAT", comma ? "," : ""); + comma = 1; + } + if (proc & PROCBASED_HLT_EXITING) { + printf("%sHLT", comma ? "," : ""); + comma = 1; + } + if (proc & PROCBASED_MTF) { + printf("%sMTF", comma ? "," : ""); + comma = 1; + } + if (proc & PROCBASED_PAUSE_EXITING) { + printf("%sPAUSE", comma ? "," : ""); + comma = 1; + } + if (proc2 & PROCBASED2_ENABLE_EPT) { + printf("%sEPT", comma ? "," : ""); + comma = 1; + } + if (proc2 & PROCBASED2_UNRESTRICTED_GUEST) { + printf("%sUG", comma ? "," : ""); + comma = 1; + } + if (proc2 & PROCBASED2_ENABLE_VPID) { + printf("%sVPID", comma ? "," : ""); + comma = 1; + } + if (proc & PROCBASED_USE_TPR_SHADOW && + proc2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES && + proc2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE && + proc2 & PROCBASED2_APIC_REGISTER_VIRTUALIZATION && + proc2 & PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY) { + printf("%sVID", comma ? "," : ""); + comma = 1; + if (pin & PINBASED_POSTED_INTERRUPT) + printf(",PostIntr"); + } + return; + } + + mask = basic >> 32; + printf("Basic Features=0x%b", mask, + "\020" + "\02132PA" /* 32-bit physical addresses */ + "\022SMM" /* SMM dual-monitor */ + "\027INS/OUTS" /* VM-exit info for INS and OUTS */ + "\030TRUE" /* TRUE_CTLS MSRs */ + ); + printf("\n Pin-Based Controls=0x%b", pin, + "\020" + "\001ExtINT" /* External-interrupt exiting */ + "\004NMI" /* NMI exiting */ + "\006VNMI" /* Virtual NMIs */ + "\007PreTmr" /* Activate VMX-preemption timer */ + "\010PostIntr" /* Process posted interrupts */ + ); + printf("\n Primary Processor Controls=0x%b", proc, + "\020" + "\003INTWIN" /* Interrupt-window exiting */ + "\004TSCOff" /* Use TSC offsetting */ + "\010HLT" /* HLT exiting */ + "\012INVLPG" /* INVLPG exiting */ + "\013MWAIT" /* MWAIT exiting */ + "\014RDPMC" /* RDPMC exiting */ + "\015RDTSC" /* RDTSC exiting */ + "\020CR3-LD" /* CR3-load exiting */ + "\021CR3-ST" /* CR3-store exiting */ + "\024CR8-LD" /* CR8-load exiting */ + "\025CR8-ST" /* CR8-store exiting */ + "\026TPR" /* Use TPR shadow */ + "\027NMIWIN" /* NMI-window exiting */ + "\030MOV-DR" /* MOV-DR exiting */ + "\031IO" /* Unconditional I/O exiting */ + "\032IOmap" /* Use I/O bitmaps */ + "\034MTF" /* Monitor trap flag */ + "\035MSRmap" /* Use MSR bitmaps */ + "\036MONITOR" /* MONITOR exiting */ + "\037PAUSE" /* PAUSE exiting */ + ); + if (proc & PROCBASED_SECONDARY_CONTROLS) + printf("\n Secondary Processor Controls=0x%b", proc2, + "\020" + "\001APIC" /* Virtualize APIC accesses */ + "\002EPT" /* Enable EPT */ + "\003DT" /* Descriptor-table exiting */ + "\004RDTSCP" /* Enable RDTSCP */ + "\005x2APIC" /* Virtualize x2APIC mode */ + "\006VPID" /* Enable VPID */ + "\007WBINVD" /* WBINVD exiting */ + "\010UG" /* Unrestricted guest */ + "\011APIC-reg" /* APIC-register virtualization */ + "\012VID" /* Virtual-interrupt delivery */ + "\013PAUSE-loop" /* PAUSE-loop exiting */ + "\014RDRAND" /* RDRAND exiting */ + "\015INVPCID" /* Enable INVPCID */ + "\016VMFUNC" /* Enable VM functions */ + "\017VMCS" /* VMCS shadowing */ + "\020EPT#VE" /* EPT-violation #VE */ + "\021XSAVES" /* Enable XSAVES/XRSTORS */ + ); + printf("\n Exit Controls=0x%b", mask, + "\020" + "\003DR" /* Save debug controls */ + /* Ignore Host address-space size */ + "\015PERF" /* Load MSR_PERF_GLOBAL_CTRL */ + "\020AckInt" /* Acknowledge interrupt on exit */ + "\023PAT-SV" /* Save MSR_PAT */ + "\024PAT-LD" /* Load MSR_PAT */ + "\025EFER-SV" /* Save MSR_EFER */ + "\026EFER-LD" /* Load MSR_EFER */ + "\027PTMR-SV" /* Save VMX-preemption timer value */ + ); + printf("\n Entry Controls=0x%b", mask, + "\020" + "\003DR" /* Save debug controls */ + /* Ignore IA-32e mode guest */ + /* Ignore Entry to SMM */ + /* Ignore Deactivate dual-monitor treatment */ + "\016PERF" /* Load MSR_PERF_GLOBAL_CTRL */ + "\017PAT" /* Load MSR_PAT */ + "\020EFER" /* Load MSR_EFER */ + ); + if (proc & PROCBASED_SECONDARY_CONTROLS && + (proc2 & (PROCBASED2_ENABLE_EPT | PROCBASED2_ENABLE_VPID)) != 0) { + msr = rdmsr(MSR_VMX_EPT_VPID_CAP); + mask = msr; + printf("\n EPT Features=0x%b", mask, + "\020" + "\001XO" /* Execute-only translations */ + "\007PW4" /* Page-walk length of 4 */ + "\011UC" /* EPT paging-structure mem can be UC */ + "\017WB" /* EPT paging-structure mem can be WB */ + "\0212M" /* EPT PDE can map a 2-Mbyte page */ + "\0221G" /* EPT PDPTE can map a 1-Gbyte page */ + "\025INVEPT" /* INVEPT is supported */ + "\026AD" /* Accessed and dirty flags for EPT */ + "\032single" /* INVEPT single-context type */ + "\033all" /* INVEPT all-context type */ + ); + mask = msr >> 32; + printf("\n VPID Features=0x%b", mask, + "\020" + "\001INVVPID" /* INVVPID is supported */ + "\011individual" /* INVVPID individual-address type */ + "\012single" /* INVVPID single-context type */ + "\013all" /* INVVPID all-context type */ + /* INVVPID single-context-retaining-globals type */ + "\014single-globals" + ); + } +} diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 5a359e9..63a9b3f 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -29,11 +29,14 @@ #ifndef _VMM_H_ #define _VMM_H_ +#include <x86/segments.h> + enum vm_suspend_how { VM_SUSPEND_NONE, VM_SUSPEND_RESET, VM_SUSPEND_POWEROFF, VM_SUSPEND_HALT, + VM_SUSPEND_TRIPLEFAULT, VM_SUSPEND_LAST }; @@ -75,6 +78,10 @@ enum vm_reg_name { VM_REG_GUEST_GDTR, VM_REG_GUEST_EFER, VM_REG_GUEST_CR2, + VM_REG_GUEST_PDPTE0, + VM_REG_GUEST_PDPTE1, + VM_REG_GUEST_PDPTE2, + VM_REG_GUEST_PDPTE3, VM_REG_LAST }; @@ -84,6 +91,16 @@ enum x2apic_state { X2APIC_STATE_LAST }; +#define VM_INTINFO_VECTOR(info) ((info) & 0xff) +#define VM_INTINFO_DEL_ERRCODE 0x800 +#define VM_INTINFO_RSVD 0x7ffff000 +#define VM_INTINFO_VALID 0x80000000 +#define VM_INTINFO_TYPE 0x700 +#define VM_INTINFO_HWINTR (0 << 8) +#define VM_INTINFO_NMI (2 << 8) +#define VM_INTINFO_HWEXCEPTION (3 << 8) +#define VM_INTINFO_SWINTR (4 << 8) + #ifdef _KERNEL #define VM_MAX_NAMELEN 32 @@ -99,6 +116,7 @@ struct vioapic; struct vlapic; struct vmspace; struct vm_object; +struct vm_guest_paging; struct pmap; typedef int (*vmm_init_func_t)(int ipinum); @@ -252,6 +270,14 @@ vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu) return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING); } +#ifdef _SYS_PROC_H_ +static int __inline +vcpu_should_yield(struct vm *vm, int vcpu) +{ + return (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)); +} +#endif + void *vcpu_stats(struct vm *vm, int vcpu); void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr); struct vmspace *vm_get_vmspace(struct vm *vm); @@ -274,21 +300,63 @@ struct vatpit *vm_atpit(struct vm *vm); int vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *vme); /* - * Returns 0 if there is no exception pending for this vcpu. Returns 1 if an - * exception is pending and also updates 'vme'. The pending exception is - * cleared when this function returns. + * This function is called after a VM-exit that occurred during exception or + * interrupt delivery through the IDT. The format of 'intinfo' is described + * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2. * - * This function should only be called in the context of the thread that is - * executing this vcpu. + * If a VM-exit handler completes the event delivery successfully then it + * should call vm_exit_intinfo() to extinguish the pending event. For e.g., + * if the task switch emulation is triggered via a task gate then it should + * call this function with 'intinfo=0' to indicate that the external event + * is not pending anymore. + * + * Return value is 0 on success and non-zero on failure. */ -int vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *vme); +int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t intinfo); -void vm_inject_gp(struct vm *vm, int vcpuid); /* general protection fault */ -void vm_inject_ud(struct vm *vm, int vcpuid); /* undefined instruction fault */ -void vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2); +/* + * This function is called before every VM-entry to retrieve a pending + * event that should be injected into the guest. This function combines + * nested events into a double or triple fault. + * + * Returns 0 if there are no events that need to be injected into the guest + * and non-zero otherwise. + */ +int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info); + +int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2); enum vm_reg_name vm_segment_name(int seg_encoding); +struct vm_copyinfo { + uint64_t gpa; + size_t len; + void *hva; + void *cookie; +}; + +/* + * Set up 'copyinfo[]' to copy to/from guest linear address space starting + * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for + * a copyin or PROT_WRITE for a copyout. + * + * Returns 0 on success. + * Returns 1 if an exception was injected into the guest. + * Returns -1 otherwise. + * + * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if + * the return value is 0. The 'copyinfo[]' resources should be freed by calling + * 'vm_copy_teardown()' after the copy is done. + */ +int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, + int num_copyinfo); +void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, + int num_copyinfo); +void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, + void *kaddr, size_t len); +void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, + struct vm_copyinfo *copyinfo, size_t len); #endif /* KERNEL */ #define VM_MAXCPU 16 /* maximum virtual cpus */ @@ -322,13 +390,16 @@ struct seg_desc { uint32_t limit; uint32_t access; }; -#define SEG_DESC_TYPE(desc) ((desc)->access & 0x001f) -#define SEG_DESC_PRESENT(desc) ((desc)->access & 0x0080) -#define SEG_DESC_DEF32(desc) ((desc)->access & 0x4000) -#define SEG_DESC_GRANULARITY(desc) ((desc)->access & 0x8000) -#define SEG_DESC_UNUSABLE(desc) ((desc)->access & 0x10000) +#define SEG_DESC_TYPE(access) ((access) & 0x001f) +#define SEG_DESC_DPL(access) (((access) >> 5) & 0x3) +#define SEG_DESC_PRESENT(access) (((access) & 0x0080) ? 1 : 0) +#define SEG_DESC_DEF32(access) (((access) & 0x4000) ? 1 : 0) +#define SEG_DESC_GRANULARITY(access) (((access) & 0x8000) ? 1 : 0) +#define SEG_DESC_UNUSABLE(access) (((access) & 0x10000) ? 1 : 0) enum vm_cpu_mode { + CPU_MODE_REAL, + CPU_MODE_PROTECTED, CPU_MODE_COMPATIBILITY, /* IA-32E mode (CS.L = 0) */ CPU_MODE_64BIT, /* IA-32E mode (CS.L = 1) */ }; @@ -364,11 +435,14 @@ struct vie { uint8_t num_valid; /* size of the instruction */ uint8_t num_processed; + uint8_t addrsize:4, opsize:4; /* address and operand sizes */ uint8_t rex_w:1, /* REX prefix */ rex_r:1, rex_x:1, rex_b:1, - rex_present:1; + rex_present:1, + opsize_override:1, /* Operand size override */ + addrsize_override:1; /* Address size override */ uint8_t mod:2, /* ModRM byte */ reg:4, @@ -410,6 +484,7 @@ enum vm_exitcode { VM_EXITCODE_IOAPIC_EOI, VM_EXITCODE_SUSPENDED, VM_EXITCODE_INOUT_STR, + VM_EXITCODE_TASK_SWITCH, VM_EXITCODE_MAX }; @@ -434,6 +509,22 @@ struct vm_inout_str { struct seg_desc seg_desc; }; +enum task_switch_reason { + TSR_CALL, + TSR_IRET, + TSR_JMP, + TSR_IDT_GATE, /* task gate in IDT */ +}; + +struct vm_task_switch { + uint16_t tsssel; /* new TSS selector */ + int ext; /* task switch due to external event */ + uint32_t errcode; + int errcode_valid; /* push 'errcode' on the new stack */ + enum task_switch_reason reason; + struct vm_guest_paging paging; +}; + struct vm_exit { enum vm_exitcode exitcode; int inst_length; /* 0 means unknown */ @@ -448,6 +539,7 @@ struct vm_exit { struct { uint64_t gpa; uint64_t gla; + int cs_d; /* CS.D */ struct vm_guest_paging paging; struct vie vie; } inst_emul; @@ -487,7 +579,38 @@ struct vm_exit { struct { enum vm_suspend_how how; } suspended; + struct vm_task_switch task_switch; } u; }; +/* APIs to inject faults into the guest */ +void vm_inject_fault(void *vm, int vcpuid, int vector, int errcode_valid, + int errcode); + +static void __inline +vm_inject_ud(void *vm, int vcpuid) +{ + vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0); +} + +static void __inline +vm_inject_gp(void *vm, int vcpuid) +{ + vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0); +} + +static void __inline +vm_inject_ac(void *vm, int vcpuid, int errcode) +{ + vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode); +} + +static void __inline +vm_inject_ss(void *vm, int vcpuid, int errcode) +{ + vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode); +} + +void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2); + #endif /* _VMM_H_ */ diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h index 9b3b00d..e4d839e 100644 --- a/sys/amd64/include/vmm_dev.h +++ b/sys/amd64/include/vmm_dev.h @@ -189,6 +189,12 @@ struct vm_cpuset { #define VM_ACTIVE_CPUS 0 #define VM_SUSPENDED_CPUS 1 +struct vm_intinfo { + int vcpuid; + uint64_t info1; + uint64_t info2; +}; + enum { /* general routines */ IOCNUM_ABIVERS = 0, @@ -211,6 +217,8 @@ enum { IOCNUM_GET_SEGMENT_DESCRIPTOR = 23, /* interrupt injection */ + IOCNUM_GET_INTINFO = 28, + IOCNUM_SET_INTINFO = 29, IOCNUM_INJECT_EXCEPTION = 30, IOCNUM_LAPIC_IRQ = 31, IOCNUM_INJECT_NMI = 32, @@ -324,4 +332,8 @@ enum { _IOW('v', IOCNUM_ACTIVATE_CPU, struct vm_activate_cpu) #define VM_GET_CPUS \ _IOW('v', IOCNUM_GET_CPUSET, struct vm_cpuset) +#define VM_SET_INTINFO \ + _IOW('v', IOCNUM_SET_INTINFO, struct vm_intinfo) +#define VM_GET_INTINFO \ + _IOWR('v', IOCNUM_GET_INTINFO, struct vm_intinfo) #endif diff --git a/sys/amd64/include/vmm_instruction_emul.h b/sys/amd64/include/vmm_instruction_emul.h index e4c408b..bbd3d88 100644 --- a/sys/amd64/include/vmm_instruction_emul.h +++ b/sys/amd64/include/vmm_instruction_emul.h @@ -52,8 +52,8 @@ typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa, * s */ int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie, - mem_region_read_t mrr, mem_region_write_t mrw, - void *mrarg); + struct vm_guest_paging *paging, mem_region_read_t mrr, + mem_region_write_t mrw, void *mrarg); int vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t val, int size); @@ -108,7 +108,7 @@ void vie_init(struct vie *vie); */ #define VIE_INVALID_GLA (1UL << 63) /* a non-canonical address */ int vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, - enum vm_cpu_mode cpu_mode, struct vie *vie); + enum vm_cpu_mode cpu_mode, int csd, struct vie *vie); #endif /* _KERNEL */ #endif /* _VMM_INSTRUCTION_EMUL_H_ */ diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c index cc97d95..51e5c2c 100644 --- a/sys/amd64/vmm/intel/vmcs.c +++ b/sys/amd64/vmm/intel/vmcs.c @@ -103,6 +103,14 @@ vmcs_field_encoding(int ident) return (VMCS_GUEST_LDTR_SELECTOR); case VM_REG_GUEST_EFER: return (VMCS_GUEST_IA32_EFER); + case VM_REG_GUEST_PDPTE0: + return (VMCS_GUEST_PDPTE0); + case VM_REG_GUEST_PDPTE1: + return (VMCS_GUEST_PDPTE1); + case VM_REG_GUEST_PDPTE2: + return (VMCS_GUEST_PDPTE2); + case VM_REG_GUEST_PDPTE3: + return (VMCS_GUEST_PDPTE3); default: return (-1); } diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h index 657d5b0..4e9557c 100644 --- a/sys/amd64/vmm/intel/vmcs.h +++ b/sys/amd64/vmm/intel/vmcs.h @@ -346,6 +346,9 @@ vmcs_write(uint32_t encoding, uint64_t val) #define VMCS_INTR_T_HWINTR (0 << 8) #define VMCS_INTR_T_NMI (2 << 8) #define VMCS_INTR_T_HWEXCEPTION (3 << 8) +#define VMCS_INTR_T_SWINTR (4 << 8) +#define VMCS_INTR_T_PRIV_SWEXCEPTION (5 << 8) +#define VMCS_INTR_T_SWEXCEPTION (6 << 8) #define VMCS_INTR_DEL_ERRCODE (1 << 11) /* diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 2cbb159..b2c5702 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -149,8 +149,6 @@ SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD, SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD, &cr4_zeros_mask, 0, NULL); -static int vmx_no_patmsr; - static int vmx_initialized; SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD, &vmx_initialized, 0, "Intel VMX initialized"); @@ -158,18 +156,38 @@ SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD, /* * Optional capabilities */ +static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW, NULL, NULL); + +static int vmx_patmsr; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, patmsr, CTLFLAG_RD, &vmx_patmsr, 0, + "PAT MSR saved and restored in VCMS"); + static int cap_halt_exit; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0, + "HLT triggers a VM-exit"); + static int cap_pause_exit; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit, + 0, "PAUSE triggers a VM-exit"); + static int cap_unrestricted_guest; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD, + &cap_unrestricted_guest, 0, "Unrestricted guests"); + static int cap_monitor_trap; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD, + &cap_monitor_trap, 0, "Monitor trap flag"); + static int cap_invpcid; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid, + 0, "Guests are allowed to use INVPCID"); static int virtual_interrupt_delivery; -SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD, +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD, &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support"); static int posted_interrupts; -SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupts, CTLFLAG_RD, +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD, &posted_interrupts, 0, "APICv posted interrupt support"); static int pirvec; @@ -618,6 +636,7 @@ vmx_init(int ipinum) } /* Check support for VM-exit controls */ + vmx_patmsr = 1; error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, VM_EXIT_CTLS_ONE_SETTING, VM_EXIT_CTLS_ZERO_SETTING, @@ -637,12 +656,12 @@ vmx_init(int ipinum) if (bootverbose) printf("vmm: PAT MSR access not supported\n"); guest_msr_valid(MSR_PAT); - vmx_no_patmsr = 1; + vmx_patmsr = 0; } } /* Check support for VM-entry controls */ - if (!vmx_no_patmsr) { + if (vmx_patmsr) { error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS, VM_ENTRY_CTLS_ONE_SETTING, @@ -918,7 +937,7 @@ vmx_vminit(struct vm *vm, pmap_t pmap) * MSR_PAT save/restore support, leave access disabled so accesses * will be trapped. */ - if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT)) + if (vmx_patmsr && guest_msr_rw(vmx, MSR_PAT)) panic("vmx_vminit: error setting guest pat msr access"); vpid_alloc(vpid, VM_MAXCPU); @@ -974,7 +993,7 @@ vmx_vminit(struct vm *vm, pmap_t pmap) vmx->cap[i].proc_ctls = procbased_ctls; vmx->cap[i].proc_ctls2 = procbased_ctls2; - vmx->state[i].lastcpu = -1; + vmx->state[i].lastcpu = NOCPU; vmx->state[i].vpid = vpid[i]; msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count); @@ -1047,27 +1066,37 @@ vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip) } static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved"); +static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done"); -static void -vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap) +/* + * Invalidate guest mappings identified by its vpid from the TLB. + */ +static __inline void +vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running) { struct vmxstate *vmxstate; struct invvpid_desc invvpid_desc; vmxstate = &vmx->state[vcpu]; - if (vmxstate->lastcpu == curcpu) + if (vmxstate->vpid == 0) return; - vmxstate->lastcpu = curcpu; - - vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); + if (!running) { + /* + * Set the 'lastcpu' to an invalid host cpu. + * + * This will invalidate TLB entries tagged with the vcpu's + * vpid the next time it runs via vmx_set_pcpu_defaults(). + */ + vmxstate->lastcpu = NOCPU; + return; + } - vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); - vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); - vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); + KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside " + "critical section", __func__, vcpu)); /* - * If we are using VPIDs then invalidate all mappings tagged with 'vpid' + * Invalidate all mappings tagged with 'vpid' * * We do this because this vcpu was executing on a different host * cpu when it last ran. We do not track whether it invalidated @@ -1081,25 +1110,43 @@ vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap) * Note also that this will invalidate mappings tagged with 'vpid' * for "all" EP4TAs. */ - if (vmxstate->vpid != 0) { - if (pmap->pm_eptgen == vmx->eptgen[curcpu]) { - invvpid_desc._res1 = 0; - invvpid_desc._res2 = 0; - invvpid_desc.vpid = vmxstate->vpid; - invvpid_desc.linear_addr = 0; - invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); - } else { - /* - * The invvpid can be skipped if an invept is going to - * be performed before entering the guest. The invept - * will invalidate combined mappings tagged with - * 'vmx->eptp' for all vpids. - */ - vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1); - } + if (pmap->pm_eptgen == vmx->eptgen[curcpu]) { + invvpid_desc._res1 = 0; + invvpid_desc._res2 = 0; + invvpid_desc.vpid = vmxstate->vpid; + invvpid_desc.linear_addr = 0; + invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); + vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1); + } else { + /* + * The invvpid can be skipped if an invept is going to + * be performed before entering the guest. The invept + * will invalidate combined mappings tagged with + * 'vmx->eptp' for all vpids. + */ + vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1); } } +static void +vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap) +{ + struct vmxstate *vmxstate; + + vmxstate = &vmx->state[vcpu]; + if (vmxstate->lastcpu == curcpu) + return; + + vmxstate->lastcpu = curcpu; + + vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); + + vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); + vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); + vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); + vmx_invvpid(vmx, vcpu, pmap, 1); +} + /* * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. */ @@ -1183,24 +1230,32 @@ vmx_inject_nmi(struct vmx *vmx, int vcpu) static void vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic) { - struct vm_exception exc; int vector, need_nmi_exiting, extint_pending; - uint64_t rflags; + uint64_t rflags, entryinfo; uint32_t gi, info; - if (vm_exception_pending(vmx->vm, vcpu, &exc)) { - KASSERT(exc.vector >= 0 && exc.vector < 32, - ("%s: invalid exception vector %d", __func__, exc.vector)); + if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) { + KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry " + "intinfo is not valid: %#lx", __func__, entryinfo)); info = vmcs_read(VMCS_ENTRY_INTR_INFO); KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " - "pending exception %d: %#x", __func__, exc.vector, info)); + "pending exception: %#lx/%#x", __func__, entryinfo, info)); - info = exc.vector | VMCS_INTR_T_HWEXCEPTION | VMCS_INTR_VALID; - if (exc.error_code_valid) { - info |= VMCS_INTR_DEL_ERRCODE; - vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, exc.error_code); + info = entryinfo; + vector = info & 0xff; + if (vector == IDT_BP || vector == IDT_OF) { + /* + * VT-x requires #BP and #OF to be injected as software + * exceptions. + */ + info &= ~VMCS_INTR_T_MASK; + info |= VMCS_INTR_T_SWEXCEPTION; } + + if (info & VMCS_INTR_DEL_ERRCODE) + vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32); + vmcs_write(VMCS_ENTRY_INTR_INFO, info); } @@ -1379,6 +1434,16 @@ vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid) vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); } +static void +vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid) +{ + uint32_t gi; + + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING, + ("NMI blocking is not in effect %#x", gi)); +} + static int vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) { @@ -1659,11 +1724,19 @@ vmx_cpl(void) static enum vm_cpu_mode vmx_cpu_mode(void) { + uint32_t csar; - if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) - return (CPU_MODE_64BIT); - else - return (CPU_MODE_COMPATIBILITY); + if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) { + csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); + if (csar & 0x2000) + return (CPU_MODE_64BIT); /* CS.L = 1 */ + else + return (CPU_MODE_COMPATIBILITY); + } else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) { + return (CPU_MODE_PROTECTED); + } else { + return (CPU_MODE_REAL); + } } static enum vm_paging_mode @@ -1757,10 +1830,25 @@ vmx_paging_info(struct vm_guest_paging *paging) static void vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla) { + struct vm_guest_paging *paging; + uint32_t csar; + + paging = &vmexit->u.inst_emul.paging; + vmexit->exitcode = VM_EXITCODE_INST_EMUL; vmexit->u.inst_emul.gpa = gpa; vmexit->u.inst_emul.gla = gla; - vmx_paging_info(&vmexit->u.inst_emul.paging); + vmx_paging_info(paging); + switch (paging->cpu_mode) { + case CPU_MODE_PROTECTED: + case CPU_MODE_COMPATIBILITY: + csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); + vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar); + break; + default: + vmexit->u.inst_emul.cs_d = 0; + break; + } } static int @@ -1969,6 +2057,26 @@ vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) return (UNHANDLED); } +static enum task_switch_reason +vmx_task_switch_reason(uint64_t qual) +{ + int reason; + + reason = (qual >> 30) & 0x3; + switch (reason) { + case 0: + return (TSR_CALL); + case 1: + return (TSR_IRET); + case 2: + return (TSR_JMP); + case 3: + return (TSR_IDT_GATE); + default: + panic("%s: invalid reason %d", __func__, reason); + } +} + static int vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) { @@ -1976,9 +2084,10 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) struct vmxctx *vmxctx; struct vlapic *vlapic; struct vm_inout_str *vis; + struct vm_task_switch *ts; uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info; - uint32_t reason; - uint64_t qual, gpa; + uint32_t intr_type, reason; + uint64_t exitintinfo, qual, gpa; bool retu; CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0); @@ -1994,46 +2103,99 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1); /* - * VM exits that could be triggered during event injection on the - * previous VM entry need to be handled specially by re-injecting - * the event. + * VM exits that can be triggered during event delivery need to + * be handled specially by re-injecting the event if the IDT + * vectoring information field's valid bit is set. * * See "Information for VM Exits During Event Delivery" in Intel SDM * for details. */ - switch (reason) { - case EXIT_REASON_EPT_FAULT: - case EXIT_REASON_EPT_MISCONFIG: - case EXIT_REASON_APIC_ACCESS: - case EXIT_REASON_TASK_SWITCH: - case EXIT_REASON_EXCEPTION: - idtvec_info = vmcs_idt_vectoring_info(); - if (idtvec_info & VMCS_IDT_VEC_VALID) { - idtvec_info &= ~(1 << 12); /* clear undefined bit */ - vmcs_write(VMCS_ENTRY_INTR_INFO, idtvec_info); - if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { - idtvec_err = vmcs_idt_vectoring_err(); - vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, - idtvec_err); - } - /* - * If 'virtual NMIs' are being used and the VM-exit - * happened while injecting an NMI during the previous - * VM-entry, then clear "blocking by NMI" in the Guest - * Interruptibility-state. - */ - if ((idtvec_info & VMCS_INTR_T_MASK) == - VMCS_INTR_T_NMI) { - vmx_clear_nmi_blocking(vmx, vcpu); - } + idtvec_info = vmcs_idt_vectoring_info(); + if (idtvec_info & VMCS_IDT_VEC_VALID) { + idtvec_info &= ~(1 << 12); /* clear undefined bit */ + exitintinfo = idtvec_info; + if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { + idtvec_err = vmcs_idt_vectoring_err(); + exitintinfo |= (uint64_t)idtvec_err << 32; + } + error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo); + KASSERT(error == 0, ("%s: vm_set_intinfo error %d", + __func__, error)); + + /* + * If 'virtual NMIs' are being used and the VM-exit + * happened while injecting an NMI during the previous + * VM-entry, then clear "blocking by NMI" in the + * Guest Interruptibility-State so the NMI can be + * reinjected on the subsequent VM-entry. + * + * However, if the NMI was being delivered through a task + * gate, then the new task must start execution with NMIs + * blocked so don't clear NMI blocking in this case. + */ + intr_type = idtvec_info & VMCS_INTR_T_MASK; + if (intr_type == VMCS_INTR_T_NMI) { + if (reason != EXIT_REASON_TASK_SWITCH) + vmx_clear_nmi_blocking(vmx, vcpu); + else + vmx_assert_nmi_blocking(vmx, vcpu); + } + + /* + * Update VM-entry instruction length if the event being + * delivered was a software interrupt or software exception. + */ + if (intr_type == VMCS_INTR_T_SWINTR || + intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION || + intr_type == VMCS_INTR_T_SWEXCEPTION) { vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); } - default: - idtvec_info = 0; - break; } switch (reason) { + case EXIT_REASON_TASK_SWITCH: + ts = &vmexit->u.task_switch; + ts->tsssel = qual & 0xffff; + ts->reason = vmx_task_switch_reason(qual); + ts->ext = 0; + ts->errcode_valid = 0; + vmx_paging_info(&ts->paging); + /* + * If the task switch was due to a CALL, JMP, IRET, software + * interrupt (INT n) or software exception (INT3, INTO), + * then the saved %rip references the instruction that caused + * the task switch. The instruction length field in the VMCS + * is valid in this case. + * + * In all other cases (e.g., NMI, hardware exception) the + * saved %rip is one that would have been saved in the old TSS + * had the task switch completed normally so the instruction + * length field is not needed in this case and is explicitly + * set to 0. + */ + if (ts->reason == TSR_IDT_GATE) { + KASSERT(idtvec_info & VMCS_IDT_VEC_VALID, + ("invalid idtvec_info %#x for IDT task switch", + idtvec_info)); + intr_type = idtvec_info & VMCS_INTR_T_MASK; + if (intr_type != VMCS_INTR_T_SWINTR && + intr_type != VMCS_INTR_T_SWEXCEPTION && + intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) { + /* Task switch triggered by external event */ + ts->ext = 1; + vmexit->inst_length = 0; + if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { + ts->errcode_valid = 1; + ts->errcode = vmcs_idt_vectoring_err(); + } + } + } + vmexit->exitcode = VM_EXITCODE_TASK_SWITCH; + VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, " + "%s errcode 0x%016lx", ts->reason, ts->tsssel, + ts->ext ? "external" : "internal", + ((uint64_t)ts->errcode << 32) | ts->errcode_valid); + break; case EXIT_REASON_CR_ACCESS: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1); switch (qual & 0xf) { @@ -2179,6 +2341,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) * the guest. * * See "Resuming Guest Software after Handling an Exception". + * See "Information for VM Exits Due to Vectored Events". */ if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && (intr_info & 0xff) != IDT_DF && @@ -2396,6 +2559,13 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap, * pmap_invalidate_ept(). */ disable_intr(); + vmx_inject_interrupts(vmx, vcpu, vlapic); + + /* + * Check for vcpu suspension after injecting events because + * vmx_inject_interrupts() can suspend the vcpu due to a + * triple fault. + */ if (vcpu_suspended(suspend_cookie)) { enable_intr(); vm_exit_suspended(vmx->vm, vcpu, vmcs_guest_rip()); @@ -2408,7 +2578,7 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap, break; } - if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) { + if (vcpu_should_yield(vm, vcpu)) { enable_intr(); vm_exit_astpending(vmx->vm, vcpu, vmcs_guest_rip()); vmx_astpending_trace(vmx, vcpu, vmexit->rip); @@ -2416,7 +2586,6 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap, break; } - vmx_inject_interrupts(vmx, vcpu, vlapic); vmx_run_trace(vmx, vcpu); rc = vmx_enter_guest(vmxctx, vmx, launched); @@ -2584,6 +2753,7 @@ vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) { int error, hostcpu, running, shadow; uint64_t ctls; + pmap_t pmap; struct vmx *vmx = arg; running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); @@ -2621,6 +2791,18 @@ vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) error = vmcs_setreg(&vmx->vmcs[vcpu], running, VMCS_IDENT(shadow), val); } + + if (reg == VM_REG_GUEST_CR3) { + /* + * Invalidate the guest vcpu's TLB mappings to emulate + * the behavior of updating %cr3. + * + * XXX the processor retains global mappings when %cr3 + * is updated but vmx_invvpid() does not. + */ + pmap = vmx->ctx[vcpu].pmap; + vmx_invvpid(vmx, vcpu, pmap, running); + } } return (error); diff --git a/sys/amd64/vmm/intel/vmx_msr.c b/sys/amd64/vmm/intel/vmx_msr.c index 2aba63c..a3428db 100644 --- a/sys/amd64/vmm/intel/vmx_msr.c +++ b/sys/amd64/vmm/intel/vmx_msr.c @@ -33,6 +33,7 @@ __FBSDID("$FreeBSD$"); #include <sys/systm.h> #include <machine/cpufunc.h> +#include <machine/specialreg.h> #include "vmx_msr.h" diff --git a/sys/amd64/vmm/intel/vmx_msr.h b/sys/amd64/vmm/intel/vmx_msr.h index e6379a9..340b0f7 100644 --- a/sys/amd64/vmm/intel/vmx_msr.h +++ b/sys/amd64/vmm/intel/vmx_msr.h @@ -29,29 +29,6 @@ #ifndef _VMX_MSR_H_ #define _VMX_MSR_H_ -#define MSR_VMX_BASIC 0x480 -#define MSR_VMX_EPT_VPID_CAP 0x48C - -#define MSR_VMX_PROCBASED_CTLS 0x482 -#define MSR_VMX_TRUE_PROCBASED_CTLS 0x48E - -#define MSR_VMX_PINBASED_CTLS 0x481 -#define MSR_VMX_TRUE_PINBASED_CTLS 0x48D - -#define MSR_VMX_PROCBASED_CTLS2 0x48B - -#define MSR_VMX_EXIT_CTLS 0x483 -#define MSR_VMX_TRUE_EXIT_CTLS 0x48f - -#define MSR_VMX_ENTRY_CTLS 0x484 -#define MSR_VMX_TRUE_ENTRY_CTLS 0x490 - -#define MSR_VMX_CR0_FIXED0 0x486 -#define MSR_VMX_CR0_FIXED1 0x487 - -#define MSR_VMX_CR4_FIXED0 0x488 -#define MSR_VMX_CR4_FIXED1 0x489 - uint32_t vmx_revision(void); int vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask, diff --git a/sys/amd64/vmm/intel/vtd.c b/sys/amd64/vmm/intel/vtd.c index ca76ea8..f5ef71b 100644 --- a/sys/amd64/vmm/intel/vtd.c +++ b/sys/amd64/vmm/intel/vtd.c @@ -452,6 +452,11 @@ vtd_update_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len, ptpindex = 0; ptpshift = 0; + KASSERT(gpa + len > gpa, ("%s: invalid gpa range %#lx/%#lx", __func__, + gpa, len)); + KASSERT(gpa + len <= dom->maxaddr, ("%s: gpa range %#lx/%#lx beyond " + "domain maxaddr %#lx", __func__, gpa, len, dom->maxaddr)); + if (gpa & PAGE_MASK) panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa); diff --git a/sys/amd64/vmm/io/vatpic.c b/sys/amd64/vmm/io/vatpic.c index ee6fc84..38fc458 100644 --- a/sys/amd64/vmm/io/vatpic.c +++ b/sys/amd64/vmm/io/vatpic.c @@ -195,26 +195,29 @@ vatpic_notify_intr(struct vatpic *vatpic) atpic->mask, atpic->request, atpic->service); /* + * From Section 3.6.2, "Interrupt Modes", in the + * MPtable Specification, Version 1.4 + * * PIC interrupts are routed to both the Local APIC * and the I/O APIC to support operation in 1 of 3 * modes. * * 1. Legacy PIC Mode: the PIC effectively bypasses - * all APIC components. In mode '1' the local APIC is + * all APIC components. In this mode the local APIC is * disabled and LINT0 is reconfigured as INTR to * deliver the PIC interrupt directly to the CPU. * * 2. Virtual Wire Mode: the APIC is treated as a * virtual wire which delivers interrupts from the PIC - * to the CPU. In mode '2' LINT0 is programmed as + * to the CPU. In this mode LINT0 is programmed as * ExtINT to indicate that the PIC is the source of * the interrupt. * - * 3. Symmetric I/O Mode: PIC interrupts are fielded - * by the I/O APIC and delivered to the appropriate - * CPU. In mode '3' the I/O APIC input 0 is - * programmed as ExtINT to indicate that the PIC is - * the source of the interrupt. + * 3. Virtual Wire Mode via I/O APIC: PIC interrupts are + * fielded by the I/O APIC and delivered to the appropriate + * CPU. In this mode the I/O APIC input 0 is programmed + * as ExtINT to indicate that the PIC is the source of the + * interrupt. */ atpic->intr_raised = true; lapic_set_local_intr(vatpic->vm, -1, APIC_LVT_LINT0); diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index c2a9fd1..fa0200e 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -97,6 +97,7 @@ struct vcpu { int hostcpu; /* (o) vcpu's host cpu */ struct vlapic *vlapic; /* (i) APIC device model */ enum x2apic_state x2apic_state; /* (i) APIC mode */ + uint64_t exitintinfo; /* (i) events pending at VM exit */ int nmi_pending; /* (i) NMI pending */ int extint_pending; /* (i) INTR pending */ struct vm_exception exception; /* (x) exception collateral */ @@ -242,6 +243,7 @@ vcpu_init(struct vm *vm, int vcpu_id, bool create) vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id); vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED); + vcpu->exitintinfo = 0; vcpu->nmi_pending = 0; vcpu->extint_pending = 0; vcpu->exception_pending = 0; @@ -571,6 +573,21 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len) return (0); } +static vm_paddr_t +vm_maxmem(struct vm *vm) +{ + int i; + vm_paddr_t gpa, maxmem; + + maxmem = 0; + for (i = 0; i < vm->num_mem_segs; i++) { + gpa = vm->mem_segs[i].gpa + vm->mem_segs[i].len; + if (gpa > maxmem) + maxmem = gpa; + } + return (maxmem); +} + static void vm_gpa_unwire(struct vm *vm) { @@ -708,7 +725,7 @@ vm_assign_pptdev(struct vm *vm, int bus, int slot, int func) if (ppt_assigned_devices(vm) == 0) { KASSERT(vm->iommu == NULL, ("vm_assign_pptdev: iommu must be NULL")); - maxaddr = vmm_mem_maxaddr(); + maxaddr = vm_maxmem(vm); vm->iommu = iommu_create_domain(maxaddr); error = vm_gpa_wire(vm); @@ -1104,6 +1121,10 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu) } } + /* Don't go to sleep if the vcpu thread needs to yield */ + if (vcpu_should_yield(vm, vcpuid)) + break; + /* * Some Linux guests implement "halt" by having all vcpus * execute HLT with interrupts disabled. 'halted_cpus' keeps @@ -1127,7 +1148,11 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu) t = ticks; vcpu_require_state_locked(vcpu, VCPU_SLEEPING); - msleep_spin(vcpu, &vcpu->mtx, wmesg, 0); + /* + * XXX msleep_spin() cannot be interrupted by signals so + * wake up periodically to check pending signals. + */ + msleep_spin(vcpu, &vcpu->mtx, wmesg, hz); vcpu_require_state_locked(vcpu, VCPU_FROZEN); vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t); } @@ -1191,15 +1216,18 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) struct vm_guest_paging *paging; mem_region_read_t mread; mem_region_write_t mwrite; - int error; + enum vm_cpu_mode cpu_mode; + int cs_d, error; vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; gla = vme->u.inst_emul.gla; gpa = vme->u.inst_emul.gpa; + cs_d = vme->u.inst_emul.cs_d; vie = &vme->u.inst_emul.vie; paging = &vme->u.inst_emul.paging; + cpu_mode = paging->cpu_mode; vie_init(vie); @@ -1213,7 +1241,7 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) else if (error != 0) panic("%s: vmm_fetch_instruction error %d", __func__, error); - if (vmm_decode_instruction(vm, vcpuid, gla, paging->cpu_mode, vie) != 0) + if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) return (EFAULT); /* return to userland unless this is an in-kernel emulated device */ @@ -1231,8 +1259,8 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) return (0); } - error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite, - retu); + error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging, + mread, mwrite, retu); return (error); } @@ -1456,6 +1484,202 @@ restart: } int +vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info) +{ + struct vcpu *vcpu; + int type, vector; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + + if (info & VM_INTINFO_VALID) { + type = info & VM_INTINFO_TYPE; + vector = info & 0xff; + if (type == VM_INTINFO_NMI && vector != IDT_NMI) + return (EINVAL); + if (type == VM_INTINFO_HWEXCEPTION && vector >= 32) + return (EINVAL); + if (info & VM_INTINFO_RSVD) + return (EINVAL); + } else { + info = 0; + } + VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info); + vcpu->exitintinfo = info; + return (0); +} + +enum exc_class { + EXC_BENIGN, + EXC_CONTRIBUTORY, + EXC_PAGEFAULT +}; + +#define IDT_VE 20 /* Virtualization Exception (Intel specific) */ + +static enum exc_class +exception_class(uint64_t info) +{ + int type, vector; + + KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info)); + type = info & VM_INTINFO_TYPE; + vector = info & 0xff; + + /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ + switch (type) { + case VM_INTINFO_HWINTR: + case VM_INTINFO_SWINTR: + case VM_INTINFO_NMI: + return (EXC_BENIGN); + default: + /* + * Hardware exception. + * + * SVM and VT-x use identical type values to represent NMI, + * hardware interrupt and software interrupt. + * + * SVM uses type '3' for all exceptions. VT-x uses type '3' + * for exceptions except #BP and #OF. #BP and #OF use a type + * value of '5' or '6'. Therefore we don't check for explicit + * values of 'type' to classify 'intinfo' into a hardware + * exception. + */ + break; + } + + switch (vector) { + case IDT_PF: + case IDT_VE: + return (EXC_PAGEFAULT); + case IDT_DE: + case IDT_TS: + case IDT_NP: + case IDT_SS: + case IDT_GP: + return (EXC_CONTRIBUTORY); + default: + return (EXC_BENIGN); + } +} + +static int +nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2, + uint64_t *retinfo) +{ + enum exc_class exc1, exc2; + int type1, vector1; + + KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1)); + KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2)); + + /* + * If an exception occurs while attempting to call the double-fault + * handler the processor enters shutdown mode (aka triple fault). + */ + type1 = info1 & VM_INTINFO_TYPE; + vector1 = info1 & 0xff; + if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) { + VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)", + info1, info2); + vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT); + *retinfo = 0; + return (0); + } + + /* + * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3 + */ + exc1 = exception_class(info1); + exc2 = exception_class(info2); + if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || + (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { + /* Convert nested fault into a double fault. */ + *retinfo = IDT_DF; + *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; + *retinfo |= VM_INTINFO_DEL_ERRCODE; + } else { + /* Handle exceptions serially */ + *retinfo = info2; + } + return (1); +} + +static uint64_t +vcpu_exception_intinfo(struct vcpu *vcpu) +{ + uint64_t info = 0; + + if (vcpu->exception_pending) { + info = vcpu->exception.vector & 0xff; + info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; + if (vcpu->exception.error_code_valid) { + info |= VM_INTINFO_DEL_ERRCODE; + info |= (uint64_t)vcpu->exception.error_code << 32; + } + } + return (info); +} + +int +vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) +{ + struct vcpu *vcpu; + uint64_t info1, info2; + int valid; + + KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid)); + + vcpu = &vm->vcpu[vcpuid]; + + info1 = vcpu->exitintinfo; + vcpu->exitintinfo = 0; + + info2 = 0; + if (vcpu->exception_pending) { + info2 = vcpu_exception_intinfo(vcpu); + vcpu->exception_pending = 0; + VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx", + vcpu->exception.vector, info2); + } + + if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) { + valid = nested_fault(vm, vcpuid, info1, info2, retinfo); + } else if (info1 & VM_INTINFO_VALID) { + *retinfo = info1; + valid = 1; + } else if (info2 & VM_INTINFO_VALID) { + *retinfo = info2; + valid = 1; + } else { + valid = 0; + } + + if (valid) { + VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), " + "retinfo(%#lx)", __func__, info1, info2, *retinfo); + } + + return (valid); +} + +int +vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + *info1 = vcpu->exitintinfo; + *info2 = vcpu_exception_intinfo(vcpu); + return (0); +} + +int vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception) { struct vcpu *vcpu; @@ -1466,6 +1690,14 @@ vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception) if (exception->vector < 0 || exception->vector >= 32) return (EINVAL); + /* + * A double fault exception should never be injected directly into + * the guest. It is a derived exception that results from specific + * combinations of nested faults. + */ + if (exception->vector == IDT_DF) + return (EINVAL); + vcpu = &vm->vcpu[vcpuid]; if (vcpu->exception_pending) { @@ -1481,32 +1713,21 @@ vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception) return (0); } -int -vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *exception) -{ - struct vcpu *vcpu; - int pending; - - KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid)); - - vcpu = &vm->vcpu[vcpuid]; - pending = vcpu->exception_pending; - if (pending) { - vcpu->exception_pending = 0; - *exception = vcpu->exception; - VCPU_CTR1(vm, vcpuid, "Exception %d delivered", - exception->vector); - } - return (pending); -} - -static void -vm_inject_fault(struct vm *vm, int vcpuid, struct vm_exception *exception) +void +vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid, + int errcode) { + struct vm_exception exception; struct vm_exit *vmexit; + struct vm *vm; int error; - error = vm_inject_exception(vm, vcpuid, exception); + vm = vmarg; + + exception.vector = vector; + exception.error_code = errcode; + exception.error_code_valid = errcode_valid; + error = vm_inject_exception(vm, vcpuid, &exception); KASSERT(error == 0, ("vm_inject_exception error %d", error)); /* @@ -1521,45 +1742,19 @@ vm_inject_fault(struct vm *vm, int vcpuid, struct vm_exception *exception) } void -vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2) +vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2) { - struct vm_exception pf = { - .vector = IDT_PF, - .error_code_valid = 1, - .error_code = error_code - }; + struct vm *vm; int error; + vm = vmarg; VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx", error_code, cr2); error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2); KASSERT(error == 0, ("vm_set_register(cr2) error %d", error)); - vm_inject_fault(vm, vcpuid, &pf); -} - -void -vm_inject_gp(struct vm *vm, int vcpuid) -{ - struct vm_exception gpf = { - .vector = IDT_GP, - .error_code_valid = 1, - .error_code = 0 - }; - - vm_inject_fault(vm, vcpuid, &gpf); -} - -void -vm_inject_ud(struct vm *vm, int vcpuid) -{ - struct vm_exception udf = { - .vector = IDT_UD, - .error_code_valid = 0 - }; - - vm_inject_fault(vm, vcpuid, &udf); + vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code); } static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); @@ -1993,6 +2188,97 @@ vm_segment_name(int seg) return (seg_names[seg]); } +void +vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, + int num_copyinfo) +{ + int idx; + + for (idx = 0; idx < num_copyinfo; idx++) { + if (copyinfo[idx].cookie != NULL) + vm_gpa_release(copyinfo[idx].cookie); + } + bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo)); +} + +int +vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, + int num_copyinfo) +{ + int error, idx, nused; + size_t n, off, remaining; + void *hva, *cookie; + uint64_t gpa; + + bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo); + + nused = 0; + remaining = len; + while (remaining > 0) { + KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); + error = vmm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa); + if (error) + return (error); + off = gpa & PAGE_MASK; + n = min(remaining, PAGE_SIZE - off); + copyinfo[nused].gpa = gpa; + copyinfo[nused].len = n; + remaining -= n; + gla += n; + nused++; + } + + for (idx = 0; idx < nused; idx++) { + hva = vm_gpa_hold(vm, copyinfo[idx].gpa, copyinfo[idx].len, + prot, &cookie); + if (hva == NULL) + break; + copyinfo[idx].hva = hva; + copyinfo[idx].cookie = cookie; + } + + if (idx != nused) { + vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo); + return (-1); + } else { + return (0); + } +} + +void +vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, + size_t len) +{ + char *dst; + int idx; + + dst = kaddr; + idx = 0; + while (len > 0) { + bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len); + len -= copyinfo[idx].len; + dst += copyinfo[idx].len; + idx++; + } +} + +void +vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, + struct vm_copyinfo *copyinfo, size_t len) +{ + const char *src; + int idx; + + src = kaddr; + idx = 0; + while (len > 0) { + bcopy(src, copyinfo[idx].hva, copyinfo[idx].len); + len -= copyinfo[idx].len; + src += copyinfo[idx].len; + idx++; + } +} /* * Return the amount of in-use and wired memory for the VM. Since diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c index f3e31a3..a85109e 100644 --- a/sys/amd64/vmm/vmm_dev.c +++ b/sys/amd64/vmm/vmm_dev.c @@ -173,6 +173,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct vm_gla2gpa *gg; struct vm_activate_cpu *vac; struct vm_cpuset *vm_cpuset; + struct vm_intinfo *vmii; sc = vmmdev_lookup2(cdev); if (sc == NULL) @@ -199,6 +200,8 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, case VM_SET_X2APIC_STATE: case VM_GLA2GPA: case VM_ACTIVATE_CPU: + case VM_SET_INTINFO: + case VM_GET_INTINFO: /* * XXX fragile, handle with care * Assumes that the first field of the ioctl data is the vcpu. @@ -470,6 +473,15 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, error = copyout(cpuset, vm_cpuset->cpus, size); free(cpuset, M_TEMP); break; + case VM_SET_INTINFO: + vmii = (struct vm_intinfo *)data; + error = vm_exit_intinfo(sc->vm, vmii->vcpuid, vmii->info1); + break; + case VM_GET_INTINFO: + vmii = (struct vm_intinfo *)data; + error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1, + &vmii->info2); + break; default: error = ENOTTY; break; diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c index 921deb5..a65b125 100644 --- a/sys/amd64/vmm/vmm_instruction_emul.c +++ b/sys/amd64/vmm/vmm_instruction_emul.c @@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$"); #else /* !_KERNEL */ #include <sys/types.h> #include <sys/errno.h> +#include <sys/_iovec.h> #include <machine/vmm.h> @@ -65,18 +66,26 @@ enum { VIE_OP_TYPE_AND, VIE_OP_TYPE_OR, VIE_OP_TYPE_TWO_BYTE, + VIE_OP_TYPE_PUSH, + VIE_OP_TYPE_CMP, VIE_OP_TYPE_LAST }; /* struct vie_op.op_flags */ -#define VIE_OP_F_IMM (1 << 0) /* immediate operand present */ -#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */ +#define VIE_OP_F_IMM (1 << 0) /* 16/32-bit immediate operand */ +#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */ +#define VIE_OP_F_MOFFSET (1 << 2) /* 16/32/64-bit immediate moffset */ +#define VIE_OP_F_NO_MODRM (1 << 3) static const struct vie_op two_byte_opcodes[256] = { [0xB6] = { .op_byte = 0xB6, .op_type = VIE_OP_TYPE_MOVZX, }, + [0xB7] = { + .op_byte = 0xB7, + .op_type = VIE_OP_TYPE_MOVZX, + }, [0xBE] = { .op_byte = 0xBE, .op_type = VIE_OP_TYPE_MOVSX, @@ -88,6 +97,10 @@ static const struct vie_op one_byte_opcodes[256] = { .op_byte = 0x0F, .op_type = VIE_OP_TYPE_TWO_BYTE }, + [0x3B] = { + .op_byte = 0x3B, + .op_type = VIE_OP_TYPE_CMP, + }, [0x88] = { .op_byte = 0x88, .op_type = VIE_OP_TYPE_MOV, @@ -104,6 +117,22 @@ static const struct vie_op one_byte_opcodes[256] = { .op_byte = 0x8B, .op_type = VIE_OP_TYPE_MOV, }, + [0xA1] = { + .op_byte = 0xA1, + .op_type = VIE_OP_TYPE_MOV, + .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, + }, + [0xA3] = { + .op_byte = 0xA3, + .op_type = VIE_OP_TYPE_MOV, + .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, + }, + [0xC6] = { + /* XXX Group 11 extended opcode - not just MOV */ + .op_byte = 0xC6, + .op_type = VIE_OP_TYPE_MOV, + .op_flags = VIE_OP_F_IMM8, + }, [0xC7] = { .op_byte = 0xC7, .op_type = VIE_OP_TYPE_MOV, @@ -125,6 +154,11 @@ static const struct vie_op one_byte_opcodes[256] = { .op_type = VIE_OP_TYPE_OR, .op_flags = VIE_OP_F_IMM8, }, + [0xFF] = { + /* XXX Group 5 extended opcode - not just PUSH */ + .op_byte = 0xFF, + .op_type = VIE_OP_TYPE_PUSH, + } }; /* struct vie.mod */ @@ -175,18 +209,15 @@ vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval) return (error); } -static int -vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval) +static void +vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr) { - uint64_t val; - int error, rshift; - enum vm_reg_name reg; - - rshift = 0; - reg = gpr_map[vie->reg]; + *lhbr = 0; + *reg = gpr_map[vie->reg]; /* - * 64-bit mode imposes limitations on accessing legacy byte registers. + * 64-bit mode imposes limitations on accessing legacy high byte + * registers (lhbr). * * The legacy high-byte registers cannot be addressed if the REX * prefix is present. In this case the values 4, 5, 6 and 7 of the @@ -198,17 +229,56 @@ vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval) */ if (!vie->rex_present) { if (vie->reg & 0x4) { - /* - * Obtain the value of %ah by reading %rax and shifting - * right by 8 bits (same for %bh, %ch and %dh). - */ - rshift = 8; - reg = gpr_map[vie->reg & 0x3]; + *lhbr = 1; + *reg = gpr_map[vie->reg & 0x3]; } } +} + +static int +vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval) +{ + uint64_t val; + int error, lhbr; + enum vm_reg_name reg; + vie_calc_bytereg(vie, ®, &lhbr); error = vm_get_register(vm, vcpuid, reg, &val); - *rval = val >> rshift; + + /* + * To obtain the value of a legacy high byte register shift the + * base register right by 8 bits (%ah = %rax >> 8). + */ + if (lhbr) + *rval = val >> 8; + else + *rval = val; + return (error); +} + +static int +vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte) +{ + uint64_t origval, val, mask; + int error, lhbr; + enum vm_reg_name reg; + + vie_calc_bytereg(vie, ®, &lhbr); + error = vm_get_register(vm, vcpuid, reg, &origval); + if (error == 0) { + val = byte; + mask = 0xff; + if (lhbr) { + /* + * Shift left by 8 to store 'byte' in a legacy high + * byte register. + */ + val <<= 8; + mask <<= 8; + } + val |= origval & ~mask; + error = vm_set_register(vm, vcpuid, reg, val); + } return (error); } @@ -242,16 +312,52 @@ vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg, } /* - * The following simplifying assumptions are made during emulation: - * - * - guest is in 64-bit mode - * - default address size is 64-bits - * - default operand size is 32-bits - * - * - operand size override is not supported - * - * - address size override is not supported + * Return the status flags that would result from doing (x - y). */ +static u_long +getcc16(uint16_t x, uint16_t y) +{ + u_long rflags; + + __asm __volatile("sub %1,%2; pushfq; popq %0" : + "=r" (rflags) : "m" (y), "r" (x)); + return (rflags); +} + +static u_long +getcc32(uint32_t x, uint32_t y) +{ + u_long rflags; + + __asm __volatile("sub %1,%2; pushfq; popq %0" : + "=r" (rflags) : "m" (y), "r" (x)); + return (rflags); +} + +static u_long +getcc64(uint64_t x, uint64_t y) +{ + u_long rflags; + + __asm __volatile("sub %1,%2; pushfq; popq %0" : + "=r" (rflags) : "m" (y), "r" (x)); + return (rflags); +} + +static u_long +getcc(int opsize, uint64_t x, uint64_t y) +{ + KASSERT(opsize == 2 || opsize == 4 || opsize == 8, + ("getcc: invalid operand size %d", opsize)); + + if (opsize == 2) + return (getcc16(x, y)); + else if (opsize == 4) + return (getcc32(x, y)); + else + return (getcc64(x, y)); +} + static int emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) @@ -261,7 +367,7 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, uint8_t byte; uint64_t val; - size = 4; + size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { @@ -271,7 +377,7 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, * 88/r: mov r/m8, r8 * REX + 88/r: mov r/m8, r8 (%ah, %ch, %dh, %bh not available) */ - size = 1; + size = 1; /* override for byte operation */ error = vie_read_bytereg(vm, vcpuid, vie, &byte); if (error == 0) error = memwrite(vm, vcpuid, gpa, byte, size, arg); @@ -279,11 +385,10 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, case 0x89: /* * MOV from reg (ModRM:reg) to mem (ModRM:r/m) + * 89/r: mov r/m16, r16 * 89/r: mov r/m32, r32 * REX.W + 89/r mov r/m64, r64 */ - if (vie->rex_w) - size = 8; reg = gpr_map[vie->reg]; error = vie_read_register(vm, vcpuid, reg, &val); if (error == 0) { @@ -292,38 +397,72 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, } break; case 0x8A: + /* + * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg) + * 8A/r: mov r8, r/m8 + * REX + 8A/r: mov r8, r/m8 + */ + size = 1; /* override for byte operation */ + error = memread(vm, vcpuid, gpa, &val, size, arg); + if (error == 0) + error = vie_write_bytereg(vm, vcpuid, vie, val); + break; case 0x8B: /* * MOV from mem (ModRM:r/m) to reg (ModRM:reg) - * 8A/r: mov r/m8, r8 - * REX + 8A/r: mov r/m8, r8 + * 8B/r: mov r16, r/m16 * 8B/r: mov r32, r/m32 * REX.W 8B/r: mov r64, r/m64 */ - if (vie->op.op_byte == 0x8A) - size = 1; - else if (vie->rex_w) - size = 8; error = memread(vm, vcpuid, gpa, &val, size, arg); if (error == 0) { reg = gpr_map[vie->reg]; error = vie_update_register(vm, vcpuid, reg, val, size); } break; + case 0xA1: + /* + * MOV from seg:moffset to AX/EAX/RAX + * A1: mov AX, moffs16 + * A1: mov EAX, moffs32 + * REX.W + A1: mov RAX, moffs64 + */ + error = memread(vm, vcpuid, gpa, &val, size, arg); + if (error == 0) { + reg = VM_REG_GUEST_RAX; + error = vie_update_register(vm, vcpuid, reg, val, size); + } + break; + case 0xA3: + /* + * MOV from AX/EAX/RAX to seg:moffset + * A3: mov moffs16, AX + * A3: mov moffs32, EAX + * REX.W + A3: mov moffs64, RAX + */ + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); + if (error == 0) { + val &= size2mask[size]; + error = memwrite(vm, vcpuid, gpa, val, size, arg); + } + break; + case 0xC6: + /* + * MOV from imm8 to mem (ModRM:r/m) + * C6/0 mov r/m8, imm8 + * REX + C6/0 mov r/m8, imm8 + */ + size = 1; /* override for byte operation */ + error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg); + break; case 0xC7: /* - * MOV from imm32 to mem (ModRM:r/m) + * MOV from imm16/imm32 to mem (ModRM:r/m) + * C7/0 mov r/m16, imm16 * C7/0 mov r/m32, imm32 * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits) */ - val = vie->immediate; /* already sign-extended */ - - if (vie->rex_w) - size = 8; - - if (size != 8) - val &= size2mask[size]; - + val = vie->immediate & size2mask[size]; error = memwrite(vm, vcpuid, gpa, val, size, arg); break; default: @@ -333,17 +472,6 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, return (error); } -/* - * The following simplifying assumptions are made during emulation: - * - * - guest is in 64-bit mode - * - default address size is 64-bits - * - default operand size is 32-bits - * - * - operand size override is not supported - * - * - address size override is not supported - */ static int emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, @@ -353,7 +481,7 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, enum vm_reg_name reg; uint64_t val; - size = 4; + size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { @@ -362,8 +490,9 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, * MOV and zero extend byte from mem (ModRM:r/m) to * reg (ModRM:reg). * - * 0F B6/r movzx r/m8, r32 - * REX.W + 0F B6/r movzx r/m8, r64 + * 0F B6/r movzx r16, r/m8 + * 0F B6/r movzx r32, r/m8 + * REX.W + 0F B6/r movzx r64, r/m8 */ /* get the first operand */ @@ -374,19 +503,39 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, /* get the second operand */ reg = gpr_map[vie->reg]; - if (vie->rex_w) - size = 8; + /* zero-extend byte */ + val = (uint8_t)val; /* write the result */ error = vie_update_register(vm, vcpuid, reg, val, size); break; + case 0xB7: + /* + * MOV and zero extend word from mem (ModRM:r/m) to + * reg (ModRM:reg). + * + * 0F B7/r movzx r32, r/m16 + * REX.W + 0F B7/r movzx r64, r/m16 + */ + error = memread(vm, vcpuid, gpa, &val, 2, arg); + if (error) + return (error); + + reg = gpr_map[vie->reg]; + + /* zero-extend word */ + val = (uint16_t)val; + + error = vie_update_register(vm, vcpuid, reg, val, size); + break; case 0xBE: /* * MOV and sign extend byte from mem (ModRM:r/m) to * reg (ModRM:reg). * - * 0F BE/r movsx r/m8, r32 - * REX.W + 0F BE/r movsx r/m8, r64 + * 0F BE/r movsx r16, r/m8 + * 0F BE/r movsx r32, r/m8 + * REX.W + 0F BE/r movsx r64, r/m8 */ /* get the first operand */ @@ -397,9 +546,6 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, /* get the second operand */ reg = gpr_map[vie->reg]; - if (vie->rex_w) - size = 8; - /* sign extend byte */ val = (int8_t)val; @@ -420,7 +566,7 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, enum vm_reg_name reg; uint64_t val1, val2; - size = 4; + size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { @@ -429,11 +575,10 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the * result in reg. * + * 23/r and r16, r/m16 * 23/r and r32, r/m32 * REX.W + 23/r and r64, r/m64 */ - if (vie->rex_w) - size = 8; /* get the first operand */ reg = gpr_map[vie->reg]; @@ -455,8 +600,9 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, * AND mem (ModRM:r/m) with immediate and store the * result in mem. * - * 81/ and r/m32, imm32 - * REX.W + 81/ and r/m64, imm32 sign-extended to 64 + * 81 /4 and r/m16, imm16 + * 81 /4 and r/m32, imm32 + * REX.W + 81 /4 and r/m64, imm32 sign-extended to 64 * * Currently, only the AND operation of the 0x81 opcode * is implemented (ModRM:reg = b100). @@ -464,9 +610,6 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, if ((vie->reg & 7) != 4) break; - if (vie->rex_w) - size = 8; - /* get the first operand */ error = memread(vm, vcpuid, gpa, &val1, size, arg); if (error) @@ -492,7 +635,7 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, int error, size; uint64_t val1; - size = 4; + size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { @@ -501,8 +644,9 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, * OR mem (ModRM:r/m) with immediate and store the * result in mem. * - * 83/ OR r/m32, imm8 sign-extended to 32 - * REX.W + 83/ OR r/m64, imm8 sign-extended to 64 + * 83 /1 OR r/m16, imm8 sign-extended to 16 + * 83 /1 OR r/m32, imm8 sign-extended to 32 + * REX.W + 83/1 OR r/m64, imm8 sign-extended to 64 * * Currently, only the OR operation of the 0x83 opcode * is implemented (ModRM:reg = b001). @@ -510,9 +654,6 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, if ((vie->reg & 7) != 1) break; - if (vie->rex_w) - size = 8; - /* get the first operand */ error = memread(vm, vcpuid, gpa, &val1, size, arg); if (error) @@ -531,10 +672,167 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, return (error); } +#define RFLAGS_STATUS_BITS (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V) + +static int +emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + uint64_t op1, op2, rflags, rflags2; + enum vm_reg_name reg; + + size = vie->opsize; + switch (vie->op.op_byte) { + case 0x3B: + /* + * 3B/r CMP r16, r/m16 + * 3B/r CMP r32, r/m32 + * REX.W + 3B/r CMP r64, r/m64 + * + * Compare first operand (reg) with second operand (r/m) and + * set status flags in EFLAGS register. The comparison is + * performed by subtracting the second operand from the first + * operand and then setting the status flags. + */ + + /* Get the first operand */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vm, vcpuid, reg, &op1); + if (error) + return (error); + + /* Get the second operand */ + error = memread(vm, vcpuid, gpa, &op2, size, arg); + if (error) + return (error); + + break; + default: + return (EINVAL); + } + rflags2 = getcc(size, op1, op2); + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + if (error) + return (error); + rflags &= ~RFLAGS_STATUS_BITS; + rflags |= rflags2 & RFLAGS_STATUS_BITS; + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); + return (error); +} + +static int +emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *arg) +{ +#ifdef _KERNEL + struct vm_copyinfo copyinfo[2]; +#else + struct iovec copyinfo[2]; +#endif + struct seg_desc ss_desc; + uint64_t cr0, rflags, rsp, stack_gla, val; + int error, size, stackaddrsize; + + /* + * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. + * + * PUSH is part of the group 5 extended opcodes and is identified + * by ModRM:reg = b110. + */ + if ((vie->reg & 7) != 6) + return (EINVAL); + + size = vie->opsize; + /* + * From "Address-Size Attributes for Stack Accesses", Intel SDL, Vol 1 + */ + if (paging->cpu_mode == CPU_MODE_REAL) { + stackaddrsize = 2; + } else if (paging->cpu_mode == CPU_MODE_64BIT) { + /* + * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3 + * - Stack pointer size is always 64-bits. + * - PUSH/POP of 32-bit values is not possible in 64-bit mode. + * - 16-bit PUSH/POP is supported by using the operand size + * override prefix (66H). + */ + stackaddrsize = 8; + size = vie->opsize_override ? 2 : 8; + } else { + /* + * In protected or compability mode the 'B' flag in the + * stack-segment descriptor determines the size of the + * stack pointer. + */ + error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc); + KASSERT(error == 0, ("%s: error %d getting SS descriptor", + __func__, error)); + if (SEG_DESC_DEF32(ss_desc.access)) + stackaddrsize = 4; + else + stackaddrsize = 2; + } + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); + KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp); + KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error)); + + rsp -= size; + if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc, + rsp, size, stackaddrsize, PROT_WRITE, &stack_gla)) { + vm_inject_ss(vm, vcpuid, 0); + return (0); + } + + if (vie_canonical_check(paging->cpu_mode, stack_gla)) { + vm_inject_ss(vm, vcpuid, 0); + return (0); + } + + if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) { + vm_inject_ac(vm, vcpuid, 0); + return (0); + } + + error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size, PROT_WRITE, + copyinfo, nitems(copyinfo)); + if (error == -1) { + /* + * XXX cannot return a negative error value here because it + * ends up being the return value of the VM_RUN() ioctl and + * is interpreted as a pseudo-error (for e.g. ERESTART). + */ + return (EFAULT); + } else if (error == 1) { + /* Resume guest execution to handle page fault */ + return (0); + } + + error = memread(vm, vcpuid, mmio_gpa, &val, size, arg); + if (error == 0) { + vm_copyout(vm, vcpuid, &val, copyinfo, size); + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp, + stackaddrsize); + KASSERT(error == 0, ("error %d updating rsp", error)); + } +#ifdef _KERNEL + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); +#endif + return (error); +} + int vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, - mem_region_read_t memread, mem_region_write_t memwrite, - void *memarg) + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *memarg) { int error; @@ -542,6 +840,14 @@ vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, return (EINVAL); switch (vie->op.op_type) { + case VIE_OP_TYPE_PUSH: + error = emulate_push(vm, vcpuid, gpa, vie, paging, memread, + memwrite, memarg); + break; + case VIE_OP_TYPE_CMP: + error = emulate_cmp(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; case VIE_OP_TYPE_MOV: error = emulate_mov(vm, vcpuid, gpa, vie, memread, memwrite, memarg); @@ -636,7 +942,7 @@ vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, * then the descriptor is unusable and attempting to use * it results in a #GP(0). */ - if (SEG_DESC_UNUSABLE(desc)) + if (SEG_DESC_UNUSABLE(desc->access)) return (-1); /* @@ -645,13 +951,13 @@ vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, * descriptor that is not present. If this was the case then * it would have been checked before the VM-exit. */ - KASSERT(SEG_DESC_PRESENT(desc), ("segment %d not present: %#x", - seg, desc->access)); + KASSERT(SEG_DESC_PRESENT(desc->access), + ("segment %d not present: %#x", seg, desc->access)); /* * The descriptor type must indicate a code/data segment. */ - type = SEG_DESC_TYPE(desc); + type = SEG_DESC_TYPE(desc->access); KASSERT(type >= 16 && type <= 31, ("segment %d has invalid " "descriptor type %#x", seg, type)); @@ -680,7 +986,8 @@ vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, if ((type & 0xC) == 0x4) { /* expand-down data segment */ low_limit = desc->limit + 1; - high_limit = SEG_DESC_DEF32(desc) ? 0xffffffff : 0xffff; + high_limit = SEG_DESC_DEF32(desc->access) ? + 0xffffffff : 0xffff; } else { /* code segment or expand-up data segment */ low_limit = 0; @@ -947,45 +1254,24 @@ fault: } int -vmm_fetch_instruction(struct vm *vm, int cpuid, struct vm_guest_paging *paging, +vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t rip, int inst_length, struct vie *vie) { - int n, error, prot; - uint64_t gpa, off; - void *hpa, *cookie; - - /* - * XXX cache previously fetched instructions using 'rip' as the tag - */ + struct vm_copyinfo copyinfo[2]; + int error, prot; - prot = VM_PROT_READ | VM_PROT_EXECUTE; if (inst_length > VIE_INST_SIZE) panic("vmm_fetch_instruction: invalid length %d", inst_length); - /* Copy the instruction into 'vie' */ - while (vie->num_valid < inst_length) { - error = vmm_gla2gpa(vm, cpuid, paging, rip, prot, &gpa); - if (error) - return (error); - - off = gpa & PAGE_MASK; - n = min(inst_length - vie->num_valid, PAGE_SIZE - off); - - if ((hpa = vm_gpa_hold(vm, gpa, n, prot, &cookie)) == NULL) - break; - - bcopy(hpa, &vie->inst[vie->num_valid], n); - - vm_gpa_release(cookie); - - rip += n; - vie->num_valid += n; + prot = PROT_READ | PROT_EXEC; + error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot, + copyinfo, nitems(copyinfo)); + if (error == 0) { + vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length); + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + vie->num_valid = inst_length; } - - if (vie->num_valid == inst_length) - return (0); - else - return (-1); + return (error); } static int @@ -1007,24 +1293,65 @@ vie_advance(struct vie *vie) } static int -decode_rex(struct vie *vie) +decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d) { uint8_t x; - if (vie_peek(vie, &x)) - return (-1); + while (1) { + if (vie_peek(vie, &x)) + return (-1); - if (x >= 0x40 && x <= 0x4F) { - vie->rex_present = 1; + if (x == 0x66) + vie->opsize_override = 1; + else if (x == 0x67) + vie->addrsize_override = 1; + else + break; + + vie_advance(vie); + } + /* + * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2: + * - Only one REX prefix is allowed per instruction. + * - The REX prefix must immediately precede the opcode byte or the + * escape opcode byte. + * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3) + * the mandatory prefix must come before the REX prefix. + */ + if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) { + vie->rex_present = 1; vie->rex_w = x & 0x8 ? 1 : 0; vie->rex_r = x & 0x4 ? 1 : 0; vie->rex_x = x & 0x2 ? 1 : 0; vie->rex_b = x & 0x1 ? 1 : 0; - vie_advance(vie); } + /* + * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1 + */ + if (cpu_mode == CPU_MODE_64BIT) { + /* + * Default address size is 64-bits and default operand size + * is 32-bits. + */ + vie->addrsize = vie->addrsize_override ? 4 : 8; + if (vie->rex_w) + vie->opsize = 8; + else if (vie->opsize_override) + vie->opsize = 2; + else + vie->opsize = 4; + } else if (cs_d) { + /* Default address and operand sizes are 32-bits */ + vie->addrsize = vie->addrsize_override ? 2 : 4; + vie->opsize = vie->opsize_override ? 2 : 4; + } else { + /* Default address and operand sizes are 16-bits */ + vie->addrsize = vie->addrsize_override ? 4 : 2; + vie->opsize = vie->opsize_override ? 4 : 2; + } return (0); } @@ -1071,6 +1398,12 @@ decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode) { uint8_t x; + if (cpu_mode == CPU_MODE_REAL) + return (-1); + + if (vie->op.op_flags & VIE_OP_F_NO_MODRM) + return (0); + if (vie_peek(vie, &x)) return (-1); @@ -1249,20 +1582,32 @@ decode_immediate(struct vie *vie) union { char buf[4]; int8_t signed8; + int16_t signed16; int32_t signed32; } u; /* Figure out immediate operand size (if any) */ - if (vie->op.op_flags & VIE_OP_F_IMM) - vie->imm_bytes = 4; - else if (vie->op.op_flags & VIE_OP_F_IMM8) + if (vie->op.op_flags & VIE_OP_F_IMM) { + /* + * Section 2.2.1.5 "Immediates", Intel SDM: + * In 64-bit mode the typical size of immediate operands + * remains 32-bits. When the operand size if 64-bits, the + * processor sign-extends all immediates to 64-bits prior + * to their use. + */ + if (vie->opsize == 4 || vie->opsize == 8) + vie->imm_bytes = 4; + else + vie->imm_bytes = 2; + } else if (vie->op.op_flags & VIE_OP_F_IMM8) { vie->imm_bytes = 1; + } if ((n = vie->imm_bytes) == 0) return (0); - if (n != 1 && n != 4) - panic("decode_immediate: invalid imm_bytes %d", n); + KASSERT(n == 1 || n == 2 || n == 4, + ("%s: invalid number of immediate bytes: %d", __func__, n)); for (i = 0; i < n; i++) { if (vie_peek(vie, &x)) @@ -1271,12 +1616,47 @@ decode_immediate(struct vie *vie) u.buf[i] = x; vie_advance(vie); } - + + /* sign-extend the immediate value before use */ if (n == 1) - vie->immediate = u.signed8; /* sign-extended */ + vie->immediate = u.signed8; + else if (n == 2) + vie->immediate = u.signed16; else - vie->immediate = u.signed32; /* sign-extended */ + vie->immediate = u.signed32; + + return (0); +} + +static int +decode_moffset(struct vie *vie) +{ + int i, n; + uint8_t x; + union { + char buf[8]; + uint64_t u64; + } u; + + if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0) + return (0); + + /* + * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM: + * The memory offset size follows the address-size of the instruction. + */ + n = vie->addrsize; + KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n)); + + u.u64 = 0; + for (i = 0; i < n; i++) { + if (vie_peek(vie, &x)) + return (-1); + u.buf[i] = x; + vie_advance(vie); + } + vie->displacement = u.u64; return (0); } @@ -1301,7 +1681,7 @@ static int verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie) { int error; - uint64_t base, idx; + uint64_t base, idx, gla2; /* Skip 'gla' verification */ if (gla == VIE_INVALID_GLA) @@ -1334,11 +1714,14 @@ verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie) } } - if (base + vie->scale * idx + vie->displacement != gla) { + /* XXX assuming that the base address of the segment is 0 */ + gla2 = base + vie->scale * idx + vie->displacement; + gla2 &= size2mask[vie->addrsize]; + if (gla != gla2) { printf("verify_gla mismatch: " "base(0x%0lx), scale(%d), index(0x%0lx), " - "disp(0x%0lx), gla(0x%0lx)\n", - base, vie->scale, idx, vie->displacement, gla); + "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n", + base, vie->scale, idx, vie->displacement, gla, gla2); return (-1); } @@ -1347,13 +1730,11 @@ verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie) int vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, - enum vm_cpu_mode cpu_mode, struct vie *vie) + enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie) { - if (cpu_mode == CPU_MODE_64BIT) { - if (decode_rex(vie)) - return (-1); - } + if (decode_prefixes(vie, cpu_mode, cs_d)) + return (-1); if (decode_opcode(vie)) return (-1); @@ -1366,10 +1747,13 @@ vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, if (decode_displacement(vie)) return (-1); - + if (decode_immediate(vie)) return (-1); + if (decode_moffset(vie)) + return (-1); + if (verify_inst_length(vie)) return (-1); diff --git a/sys/x86/include/specialreg.h b/sys/x86/include/specialreg.h index c0cef2c..8610684 100644 --- a/sys/x86/include/specialreg.h +++ b/sys/x86/include/specialreg.h @@ -436,6 +436,25 @@ #define MSR_MC4_MISC 0x413 /* + * VMX MSRs + */ +#define MSR_VMX_BASIC 0x480 +#define MSR_VMX_PINBASED_CTLS 0x481 +#define MSR_VMX_PROCBASED_CTLS 0x482 +#define MSR_VMX_EXIT_CTLS 0x483 +#define MSR_VMX_ENTRY_CTLS 0x484 +#define MSR_VMX_CR0_FIXED0 0x486 +#define MSR_VMX_CR0_FIXED1 0x487 +#define MSR_VMX_CR4_FIXED0 0x488 +#define MSR_VMX_CR4_FIXED1 0x489 +#define MSR_VMX_PROCBASED_CTLS2 0x48b +#define MSR_VMX_EPT_VPID_CAP 0x48c +#define MSR_VMX_TRUE_PINBASED_CTLS 0x48d +#define MSR_VMX_TRUE_PROCBASED_CTLS 0x48e +#define MSR_VMX_TRUE_EXIT_CTLS 0x48f +#define MSR_VMX_TRUE_ENTRY_CTLS 0x490 + +/* * X2APIC MSRs */ #define MSR_APIC_ID 0x802 diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile index 23e16cb..1c95f77 100644 --- a/usr.sbin/bhyve/Makefile +++ b/usr.sbin/bhyve/Makefile @@ -35,6 +35,7 @@ SRCS= \ post.c \ rtc.c \ smbiostbl.c \ + task_switch.c \ uart_emul.c \ virtio.c \ xmsr.c \ diff --git a/usr.sbin/bhyve/acpi.c b/usr.sbin/bhyve/acpi.c index c4ec020..5dea300 100644 --- a/usr.sbin/bhyve/acpi.c +++ b/usr.sbin/bhyve/acpi.c @@ -40,12 +40,13 @@ * Layout * ------ * RSDP -> 0xf2400 (36 bytes fixed) - * RSDT -> 0xf2440 (36 bytes + 4*N table addrs, 2 used) - * XSDT -> 0xf2480 (36 bytes + 8*N table addrs, 2 used) + * RSDT -> 0xf2440 (36 bytes + 4*7 table addrs, 4 used) + * XSDT -> 0xf2480 (36 bytes + 8*7 table addrs, 4 used) * MADT -> 0xf2500 (depends on #CPUs) * FADT -> 0xf2600 (268 bytes) * HPET -> 0xf2740 (56 bytes) - * FACS -> 0xf2780 (64 bytes) + * MCFG -> 0xf2780 (60 bytes) + * FACS -> 0xf27C0 (64 bytes) * DSDT -> 0xf2800 (variable - can go up to 0x100000) */ @@ -80,7 +81,8 @@ __FBSDID("$FreeBSD$"); #define MADT_OFFSET 0x100 #define FADT_OFFSET 0x200 #define HPET_OFFSET 0x340 -#define FACS_OFFSET 0x380 +#define MCFG_OFFSET 0x380 +#define FACS_OFFSET 0x3C0 #define DSDT_OFFSET 0x400 #define BHYVE_ASL_TEMPLATE "bhyve.XXXXXXX" @@ -178,6 +180,8 @@ basl_fwrite_rsdt(FILE *fp) basl_acpi_base + FADT_OFFSET); EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : %08X\n", basl_acpi_base + HPET_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : %08X\n", + basl_acpi_base + MCFG_OFFSET); EFFLUSH(fp); @@ -216,6 +220,8 @@ basl_fwrite_xsdt(FILE *fp) basl_acpi_base + FADT_OFFSET); EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : 00000000%08X\n", basl_acpi_base + HPET_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : 00000000%08X\n", + basl_acpi_base + MCFG_OFFSET); EFFLUSH(fp); @@ -583,6 +589,39 @@ err_exit: } static int +basl_fwrite_mcfg(FILE *fp) +{ + int err = 0; + + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve MCFG template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"MCFG\"\n"); + EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVMCFG \"\n"); + EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + + /* iasl will fill in the compiler ID/revision fields */ + EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); + EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); + EFPRINTF(fp, "[0008]\t\tReserved : 0\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0008]\t\tBase Address : %016lX\n", pci_ecfg_base()); + EFPRINTF(fp, "[0002]\t\tSegment Group: 0000\n"); + EFPRINTF(fp, "[0001]\t\tStart Bus: 00\n"); + EFPRINTF(fp, "[0001]\t\tEnd Bus: FF\n"); + EFPRINTF(fp, "[0004]\t\tReserved : 0\n"); + EFFLUSH(fp); + return (0); +err_exit: + return (errno); +} + +static int basl_fwrite_facs(FILE *fp) { int err; @@ -921,6 +960,7 @@ static struct { { basl_fwrite_madt, MADT_OFFSET }, { basl_fwrite_fadt, FADT_OFFSET }, { basl_fwrite_hpet, HPET_OFFSET }, + { basl_fwrite_mcfg, MCFG_OFFSET }, { basl_fwrite_facs, FACS_OFFSET }, { basl_fwrite_dsdt, DSDT_OFFSET }, { NULL } diff --git a/usr.sbin/bhyve/atkbdc.c b/usr.sbin/bhyve/atkbdc.c index 6e13c19..930b7af 100644 --- a/usr.sbin/bhyve/atkbdc.c +++ b/usr.sbin/bhyve/atkbdc.c @@ -31,6 +31,10 @@ __FBSDID("$FreeBSD$"); #include <machine/vmm.h> +#include <vmmapi.h> + +#include <assert.h> +#include <errno.h> #include <stdio.h> #include "inout.h" @@ -48,29 +52,30 @@ atkbdc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { if (bytes != 1) - return (INOUT_ERROR); + return (-1); *eax = 0; - return (INOUT_OK); + return (0); } static int atkbdc_sts_ctl_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { - int retval; + int error, retval; if (bytes != 1) - return (INOUT_ERROR); + return (-1); - retval = INOUT_OK; + retval = 0; if (in) { *eax = KBD_SYS_FLAG; /* system passed POST */ } else { switch (*eax) { case KBDC_RESET: /* Pulse "reset" line. */ - retval = INOUT_RESET; + error = vm_suspend(ctx, VM_SUSPEND_RESET); + assert(error == 0 || errno == EALREADY); break; } } diff --git a/usr.sbin/bhyve/bhyve.8 b/usr.sbin/bhyve/bhyve.8 index aad1aef..80b814a 100644 --- a/usr.sbin/bhyve/bhyve.8 +++ b/usr.sbin/bhyve/bhyve.8 @@ -24,7 +24,7 @@ .\" .\" $FreeBSD$ .\" -.Dd April 2, 2014 +.Dd June 26, 2014 .Dt BHYVE 8 .Os .Sh NAME @@ -32,12 +32,14 @@ .Nd "run a guest operating system inside a virtual machine" .Sh SYNOPSIS .Nm -.Op Fl aehwxACHPW +.Op Fl abehwxACHPWY .Op Fl c Ar numcpus .Op Fl g Ar gdbport +.Op Fl l Ar lpcdev Ns Op , Ns Ar conf +.Op Fl m Ar size Ns Op Ar K|k|M|m|G|g|T|t .Op Fl p Ar vcpu:hostcpu .Op Fl s Ar slot,emulation Ns Op , Ns Ar conf -.Op Fl l Ar lpcdev Ns Op , Ns Ar conf +.Op Fl U Ar uuid .Ar vmname .Sh DESCRIPTION .Nm @@ -66,21 +68,49 @@ Generate ACPI tables. Required for .Fx Ns /amd64 guests. +.It Fl b +Enable a low-level console device supported by +.Fx kernels compiled with +.Cd "device bvmconsole" . +This option will be deprecated in a future version. .It Fl c Ar numcpus Number of guest virtual CPUs. The default is 1 and the maximum is 16. .It Fl C Include guest memory in core file. -.It Fl H -Yield the virtual CPU thread when a HLT instruction is detected. -If this option is not specified, virtual CPUs will use 100% of a host CPU. +.It Fl e +Force +.Nm +to exit when a guest issues an access to an I/O port that is not emulated. +This is intended for debug purposes. .It Fl g Ar gdbport For -.Fx Ns /amd64 kernels compiled with -.Cd "option bvmdebug" , +.Fx +kernels compiled with +.Cd "device bvmdebug" , allow a remote kernel kgdb to be relayed to the guest kernel gdb stub via a local IPv4 address and this port. This option will be deprecated in a future version. +.It Fl h +Print help message and exit. +.It Fl H +Yield the virtual CPU thread when a HLT instruction is detected. +If this option is not specified, virtual CPUs will use 100% of a host CPU. +.It Fl l Ar lpcdev Ns Op , Ns Ar conf +Allow devices behind the LPC PCI-ISA bridge to be configured. +The only supported devices are the TTY-class devices, +.Li com1 +and +.Li com2 . +.It Fl m Ar size Ns Op Ar K|k|M|m|G|g|T|t +Guest physical memory size in bytes. +This must be the same size that was given to +.Xr bhyveload 8 . +.Pp +The size argument may be suffixed with one of K, M, G or T (either upper +or lower case) to indicate a multiple of kilobytes, megabytes, gigabytes, +or terabytes. +If no suffix is given, the value is assumed to be in megabytes. .It Fl p Ar vcpu:hostcpu Pin guest's virtual CPU .Em vcpu @@ -88,9 +118,6 @@ to .Em hostcpu . .It Fl P Force the guest virtual CPU to exit when a PAUSE instruction is detected. -.It Fl W -Force virtio PCI device emulations to use MSI interrupts instead of MSI-X -interrupts. .It Fl s Ar slot,emulation Ns Op , Ns Ar conf Configure a virtual PCI slot and function. .Pp @@ -211,34 +238,21 @@ The host device must have been reserved at boot-time using the loader variable as described in .Xr vmm 4 . .El -.It Fl l Ar lpcdev Ns Op , Ns Ar conf -Allow devices behind the LPC PCI-ISA bridge to be configured. -The only supported devices are the TTY-class devices, -.Li com1 -and -.Li com2 . -.It Fl m Ar size Ns Op Ar K|k|M|m|G|g|T|t -Guest physical memory size in bytes. -This must be the same size that was given to -.Xr bhyveload 8 . -.Pp -The size argument may be suffixed with one of K, M, G or T (either upper -or lower case) to indicate a multiple of kilobytes, megabytes, gigabytes, -or terabytes. -If no suffix is given, the value is assumed to be in megabytes. -.It Fl e -Force -.Nm -to exit when a guest issues an access to an I/O port that is not emulated. -This is intended for debug purposes. +.It Fl U Ar uuid +Set the universally unique identifier +.Pq UUID +in the guest's System Management BIOS System Information structure. +By default a UUID is generated from the host's hostname and +.Ar vmname . .It Fl w Ignore accesses to unimplemented Model Specific Registers (MSRs). This is intended for debug purposes. +.It Fl W +Force virtio PCI device emulations to use MSI interrupts instead of MSI-X +interrupts. .It Fl x The guest's local APIC is configured in x2APIC mode. .It Fl Y Disable MPtable generation. -.It Fl h -Print help message and exit. .It Ar vmname Alphanumeric name of the guest. This should be the same as that created by diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c index 1e5d3b3..7dcf6d0 100644 --- a/usr.sbin/bhyve/bhyverun.c +++ b/usr.sbin/bhyve/bhyverun.c @@ -69,16 +69,11 @@ __FBSDID("$FreeBSD$"); #define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */ -#define VMEXIT_CONTINUE 1 /* continue from next instruction */ -#define VMEXIT_RESTART 2 /* restart current instruction */ -#define VMEXIT_ABORT 3 /* abort the vm run loop */ -#define VMEXIT_RESET 4 /* guest machine has reset */ -#define VMEXIT_POWEROFF 5 /* guest machine has powered off */ - #define MB (1024UL * 1024) #define GB (1024UL * MB) typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu); +extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu); char *vmname; @@ -101,7 +96,7 @@ static cpuset_t cpumask; static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip); -struct vm_exit vmexit[VM_MAXCPU]; +static struct vm_exit vmexit[VM_MAXCPU]; struct bhyvestats { uint64_t vmexit_bogus; @@ -112,8 +107,6 @@ struct bhyvestats { uint64_t vmexit_inst_emul; uint64_t cpu_switch_rotate; uint64_t cpu_switch_direct; - int io_reset; - int io_poweroff; } stats; struct mt_vmm_info { @@ -129,26 +122,26 @@ usage(int code) { fprintf(stderr, - "Usage: %s [-aehwAHIPW] [-g <gdb port>] [-s <pci>] [-c vcpus]\n" - " %*s [-p vcpu:hostcpu] [-m mem] [-l <lpc>] <vm>\n" + "Usage: %s [-abehwxACHPWY] [-c vcpus] [-g <gdb port>] [-l <lpc>]\n" + " %*s [-m mem] [-p vcpu:hostcpu] [-s <pci>] [-U uuid] <vm>\n" " -a: local apic is in xAPIC mode (deprecated)\n" - " -A: create an ACPI table\n" - " -g: gdb port\n" + " -A: create ACPI tables\n" " -c: # cpus (default 1)\n" " -C: include guest memory in core file\n" - " -p: pin 'vcpu' to 'hostcpu'\n" - " -H: vmexit from the guest on hlt\n" - " -P: vmexit from the guest on pause\n" - " -W: force virtio to use single-vector MSI\n" " -e: exit on unhandled I/O access\n" + " -g: gdb port\n" " -h: help\n" - " -s: <slot,driver,configinfo> PCI slot config\n" + " -H: vmexit from the guest on hlt\n" " -l: LPC device configuration\n" " -m: memory size in MB\n" + " -p: pin 'vcpu' to 'hostcpu'\n" + " -P: vmexit from the guest on pause\n" + " -s: <slot,driver,configinfo> PCI slot config\n" + " -U: uuid\n" " -w: ignore unimplemented MSRs\n" + " -W: force virtio to use single-vector MSI\n" " -x: local apic is in x2APIC mode\n" - " -Y: disable MPtable generation\n" - " -U: uuid\n", + " -Y: disable MPtable generation\n", progname, (int)strlen(progname), ""); exit(code); @@ -187,6 +180,27 @@ pincpu_parse(const char *opt) return (0); } +void +vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid, + int errcode) +{ + struct vmctx *ctx; + int error; + + ctx = arg; + if (errcode_valid) + error = vm_inject_exception2(ctx, vcpu, vector, errcode); + else + error = vm_inject_exception(ctx, vcpu, vector); + assert(error == 0); + + /* + * Set the instruction length to 0 to ensure that the instruction is + * restarted when the fault handler returns. + */ + vmexit[vcpu].inst_length = 0; +} + void * paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len) { @@ -315,27 +329,18 @@ vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) } error = emulate_inout(ctx, vcpu, vme, strictio); - if (error == INOUT_OK && in && !string) { + if (!error && in && !string) { error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, vme->u.inout.eax); + assert(error == 0); } - switch (error) { - case INOUT_OK: - return (VMEXIT_CONTINUE); - case INOUT_RESTART: - return (VMEXIT_RESTART); - case INOUT_RESET: - stats.io_reset++; - return (VMEXIT_RESET); - case INOUT_POWEROFF: - stats.io_poweroff++; - return (VMEXIT_POWEROFF); - default: - fprintf(stderr, "Unhandled %s%c 0x%04x\n", - in ? "in" : "out", - bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port); + if (error) { + fprintf(stderr, "Unhandled %s%c 0x%04x\n", in ? "in" : "out", + bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port); return (VMEXIT_ABORT); + } else { + return (VMEXIT_CONTINUE); } } @@ -352,8 +357,7 @@ vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) fprintf(stderr, "rdmsr to register %#x on vcpu %d\n", vme->u.msr.code, *pvcpu); if (strictmsr) { - error = vm_inject_exception2(ctx, *pvcpu, IDT_GP, 0); - assert(error == 0); + vm_inject_gp(ctx, *pvcpu); return (VMEXIT_RESTART); } } @@ -379,8 +383,7 @@ vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n", vme->u.msr.code, vme->u.msr.wval, *pvcpu); if (strictmsr) { - error = vm_inject_exception2(ctx, *pvcpu, IDT_GP, 0); - assert(error == 0); + vm_inject_gp(ctx, *pvcpu); return (VMEXIT_RESTART); } } @@ -399,6 +402,16 @@ vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) return (retval); } +#define DEBUG_EPT_MISCONFIG +#ifdef DEBUG_EPT_MISCONFIG +#define EXIT_REASON_EPT_MISCONFIG 49 +#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 +#define VMCS_IDENT(x) ((x) | 0x80000000) + +static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4]; +static int ept_misconfig_ptenum; +#endif + static int vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { @@ -413,7 +426,21 @@ vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) vmexit->u.vmx.exit_qualification); fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type); fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error); - +#ifdef DEBUG_EPT_MISCONFIG + if (vmexit->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) { + vm_get_register(ctx, *pvcpu, + VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS), + &ept_misconfig_gpa); + vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte, + &ept_misconfig_ptenum); + fprintf(stderr, "\tEPT misconfiguration:\n"); + fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa); + fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n", + ept_misconfig_ptenum, ept_misconfig_pte[0], + ept_misconfig_pte[1], ept_misconfig_pte[2], + ept_misconfig_pte[3]); + } +#endif /* DEBUG_EPT_MISCONFIG */ return (VMEXIT_ABORT); } @@ -465,7 +492,7 @@ vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) stats.vmexit_inst_emul++; err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa, - &vmexit->u.inst_emul.vie); + &vmexit->u.inst_emul.vie, &vmexit->u.inst_emul.paging); if (err) { if (err == EINVAL) { @@ -515,6 +542,8 @@ vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) exit(1); case VM_SUSPEND_HALT: exit(2); + case VM_SUSPEND_TRIPLEFAULT: + exit(3); default: fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how); exit(100); @@ -532,7 +561,8 @@ static vmexit_handler_t handler[VM_EXITCODE_MAX] = { [VM_EXITCODE_MTRAP] = vmexit_mtrap, [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap, - [VM_EXITCODE_SUSPENDED] = vmexit_suspend + [VM_EXITCODE_SUSPENDED] = vmexit_suspend, + [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch, }; static void @@ -540,7 +570,6 @@ vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip) { int error, rc, prevcpu; enum vm_exitcode exitcode; - enum vm_suspend_how how; cpuset_t active_cpus; if (vcpumap[vcpu] != NULL) { @@ -575,16 +604,6 @@ vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip) case VMEXIT_RESTART: rip = vmexit[vcpu].rip; break; - case VMEXIT_RESET: - case VMEXIT_POWEROFF: - if (rc == VMEXIT_RESET) - how = VM_SUSPEND_RESET; - else - how = VM_SUSPEND_POWEROFF; - error = vm_suspend(ctx, how); - assert(error == 0 || errno == EALREADY); - rip = vmexit[vcpu].rip + vmexit[vcpu].inst_length; - break; case VMEXIT_ABORT: abort(); default: diff --git a/usr.sbin/bhyve/bhyverun.h b/usr.sbin/bhyve/bhyverun.h index f18d42f..87824ef 100644 --- a/usr.sbin/bhyve/bhyverun.h +++ b/usr.sbin/bhyve/bhyverun.h @@ -35,6 +35,10 @@ #define __CTASSERT(x, y) typedef char __assert ## y[(x) ? 1 : -1] #endif +#define VMEXIT_CONTINUE 1 /* continue from next instruction */ +#define VMEXIT_RESTART 2 /* restart current instruction */ +#define VMEXIT_ABORT 3 /* abort the vm run loop */ + struct vmctx; extern int guest_ncpus; extern char *guest_uuid_str; diff --git a/usr.sbin/bhyve/block_if.c b/usr.sbin/bhyve/block_if.c index b29bc78..1ec0344 100644 --- a/usr.sbin/bhyve/block_if.c +++ b/usr.sbin/bhyve/block_if.c @@ -390,6 +390,55 @@ blockif_close(struct blockif_ctxt *bc) } /* + * Return virtual C/H/S values for a given block. Use the algorithm + * outlined in the VHD specification to calculate values. + */ +void +blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s) +{ + off_t sectors; /* total sectors of the block dev */ + off_t hcyl; /* cylinders times heads */ + uint16_t secpt; /* sectors per track */ + uint8_t heads; + + assert(bc->bc_magic == BLOCKIF_SIG); + + sectors = bc->bc_size / bc->bc_sectsz; + + /* Clamp the size to the largest possible with CHS */ + if (sectors > 65535UL*16*255) + sectors = 65535UL*16*255; + + if (sectors >= 65536UL*16*63) { + secpt = 255; + heads = 16; + hcyl = sectors / secpt; + } else { + secpt = 17; + hcyl = sectors / secpt; + heads = (hcyl + 1023) / 1024; + + if (heads < 4) + heads = 4; + + if (hcyl >= (heads * 1024) || heads > 16) { + secpt = 31; + heads = 16; + hcyl = sectors / secpt; + } + if (hcyl >= (heads * 1024)) { + secpt = 63; + heads = 16; + hcyl = sectors / secpt; + } + } + + *c = hcyl / heads; + *h = heads; + *s = secpt; +} + +/* * Accessors */ off_t diff --git a/usr.sbin/bhyve/block_if.h b/usr.sbin/bhyve/block_if.h index e0c0bb1..c2c21f6 100644 --- a/usr.sbin/bhyve/block_if.h +++ b/usr.sbin/bhyve/block_if.h @@ -52,6 +52,8 @@ struct blockif_req { struct blockif_ctxt; struct blockif_ctxt *blockif_open(const char *optstr, const char *ident); off_t blockif_size(struct blockif_ctxt *bc); +void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, + uint8_t *s); int blockif_sectsz(struct blockif_ctxt *bc); int blockif_queuesz(struct blockif_ctxt *bc); int blockif_is_ro(struct blockif_ctxt *bc); diff --git a/usr.sbin/bhyve/inout.c b/usr.sbin/bhyve/inout.c index fe9e0d8..1041a59 100644 --- a/usr.sbin/bhyve/inout.c +++ b/usr.sbin/bhyve/inout.c @@ -154,31 +154,28 @@ emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict) /* Limit number of back-to-back in/out emulations to 16 */ iterations = MIN(count, 16); while (iterations > 0) { + assert(retval == 0); if (vie_calculate_gla(vis->paging.cpu_mode, vis->seg_name, &vis->seg_desc, index, bytes, addrsize, prot, &gla)) { - error = vm_inject_exception2(ctx, vcpu, - IDT_GP, 0); - assert(error == 0); - retval = INOUT_RESTART; + vm_inject_gp(ctx, vcpu); break; } - error = vm_gla2gpa(ctx, vcpu, &vis->paging, gla, bytes, - prot, iov, nitems(iov)); - assert(error == 0 || error == 1 || error == -1); - if (error) { - retval = (error == 1) ? INOUT_RESTART : - INOUT_ERROR; + error = vm_copy_setup(ctx, vcpu, &vis->paging, gla, + bytes, prot, iov, nitems(iov)); + if (error == -1) { + retval = -1; /* Unrecoverable error */ + break; + } else if (error == 1) { + retval = 0; /* Resume guest to handle fault */ break; } if (vie_alignment_check(vis->paging.cpl, bytes, vis->cr0, vis->rflags, gla)) { - error = vm_inject_exception2(ctx, vcpu, - IDT_AC, 0); - assert(error == 0); - return (INOUT_RESTART); + vm_inject_ac(ctx, vcpu, 0); + break; } val = 0; @@ -217,8 +214,8 @@ emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict) } /* Restart the instruction if more iterations remain */ - if (retval == INOUT_OK && count != 0) - retval = INOUT_RESTART; + if (retval == 0 && count != 0) + vmexit->inst_length = 0; } else { if (!in) { val = vmexit->u.inout.eax & vie_size2mask(bytes); diff --git a/usr.sbin/bhyve/inout.h b/usr.sbin/bhyve/inout.h index f15a2c8..7f39095 100644 --- a/usr.sbin/bhyve/inout.h +++ b/usr.sbin/bhyve/inout.h @@ -34,13 +34,9 @@ struct vmctx; struct vm_exit; -/* Handler return values. */ -#define INOUT_ERROR -1 -#define INOUT_OK 0 -#define INOUT_RESTART 1 -#define INOUT_RESET 2 -#define INOUT_POWEROFF 3 - +/* + * inout emulation handlers return 0 on success and -1 on failure. + */ typedef int (*inout_func_t)(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg); diff --git a/usr.sbin/bhyve/mem.c b/usr.sbin/bhyve/mem.c index 7ea630f..2a9f430 100644 --- a/usr.sbin/bhyve/mem.c +++ b/usr.sbin/bhyve/mem.c @@ -157,10 +157,12 @@ mem_write(void *ctx, int vcpu, uint64_t gpa, uint64_t wval, int size, void *arg) } int -emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie) +emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie, + struct vm_guest_paging *paging) + { struct mmio_rb_range *entry; - int err; + int err, immutable; pthread_rwlock_rdlock(&mmio_rwlock); /* @@ -184,10 +186,28 @@ emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie) } assert(entry != NULL); - err = vmm_emulate_instruction(ctx, vcpu, paddr, vie, + + /* + * An 'immutable' memory range is guaranteed to be never removed + * so there is no need to hold 'mmio_rwlock' while calling the + * handler. + * + * XXX writes to the PCIR_COMMAND register can cause register_mem() + * to be called. If the guest is using PCI extended config space + * to modify the PCIR_COMMAND register then register_mem() can + * deadlock on 'mmio_rwlock'. However by registering the extended + * config space window as 'immutable' the deadlock can be avoided. + */ + immutable = (entry->mr_param.flags & MEM_F_IMMUTABLE); + if (immutable) + pthread_rwlock_unlock(&mmio_rwlock); + + err = vmm_emulate_instruction(ctx, vcpu, paddr, vie, paging, mem_read, mem_write, &entry->mr_param); - pthread_rwlock_unlock(&mmio_rwlock); - + + if (!immutable) + pthread_rwlock_unlock(&mmio_rwlock); + return (err); } @@ -244,6 +264,7 @@ unregister_mem(struct mem_range *memp) mr = &entry->mr_param; assert(mr->name == memp->name); assert(mr->base == memp->base && mr->size == memp->size); + assert((mr->flags & MEM_F_IMMUTABLE) == 0); RB_REMOVE(mmio_rb_tree, &mmio_rb_root, entry); /* flush Per-vCPU cache */ diff --git a/usr.sbin/bhyve/mem.h b/usr.sbin/bhyve/mem.h index 264bff9..f671eae 100644 --- a/usr.sbin/bhyve/mem.h +++ b/usr.sbin/bhyve/mem.h @@ -48,9 +48,11 @@ struct mem_range { #define MEM_F_READ 0x1 #define MEM_F_WRITE 0x2 #define MEM_F_RW 0x3 +#define MEM_F_IMMUTABLE 0x4 /* mem_range cannot be unregistered */ void init_mem(void); -int emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie); +int emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie, + struct vm_guest_paging *paging); int register_mem(struct mem_range *memp); int register_mem_fallback(struct mem_range *memp); diff --git a/usr.sbin/bhyve/pci_ahci.c b/usr.sbin/bhyve/pci_ahci.c index 9f61107..214237d 100644 --- a/usr.sbin/bhyve/pci_ahci.c +++ b/usr.sbin/bhyve/pci_ahci.c @@ -336,8 +336,9 @@ ahci_write_fis_d2h(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) fis[13] = cfis[13]; if (fis[2] & ATA_S_ERROR) p->is |= AHCI_P_IX_TFE; + else + p->ci &= ~(1 << slot); p->tfd = tfd; - p->ci &= ~(1 << slot); ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } @@ -598,10 +599,16 @@ handle_identify(struct ahci_port *p, int slot, uint8_t *cfis) } else { uint16_t buf[256]; uint64_t sectors; + uint16_t cyl; + uint8_t sech, heads; sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx); + blockif_chs(p->bctx, &cyl, &heads, &sech); memset(buf, 0, sizeof(buf)); buf[0] = 0x0040; + buf[1] = cyl; + buf[3] = heads; + buf[6] = sech; /* TODO emulate different serial? */ ata_string((uint8_t *)(buf+10), "123456", 20); ata_string((uint8_t *)(buf+23), "001", 8); @@ -645,8 +652,8 @@ handle_identify(struct ahci_port *p, int slot, uint8_t *cfis) write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); p->tfd = ATA_S_DSC | ATA_S_READY; p->is |= AHCI_P_IX_DP; + p->ci &= ~(1 << slot); } - p->ci &= ~(1 << slot); ahci_generate_intr(p->pr_sc); } @@ -688,8 +695,8 @@ handle_atapi_identify(struct ahci_port *p, int slot, uint8_t *cfis) write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); p->tfd = ATA_S_DSC | ATA_S_READY; p->is |= AHCI_P_IX_DHR; + p->ci &= ~(1 << slot); } - p->ci &= ~(1 << slot); ahci_generate_intr(p->pr_sc); } @@ -1292,7 +1299,6 @@ ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis) if (!p->atapi) { p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; p->is |= AHCI_P_IX_TFE; - p->ci &= ~(1 << slot); ahci_generate_intr(p->pr_sc); } else handle_packet_cmd(p, slot, cfis); @@ -1301,7 +1307,6 @@ ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis) WPRINTF("Unsupported cmd:%02x\n", cfis[2]); p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; p->is |= AHCI_P_IX_TFE; - p->ci &= ~(1 << slot); ahci_generate_intr(p->pr_sc); break; } @@ -1369,8 +1374,11 @@ ahci_handle_port(struct ahci_port *p) * are already in-flight. */ for (i = 0; (i < 32) && p->ci; i++) { - if ((p->ci & (1 << i)) && !(p->pending & (1 << i))) + if ((p->ci & (1 << i)) && !(p->pending & (1 << i))) { + p->cmd &= ~AHCI_P_CMD_CCS_MASK; + p->cmd |= i << AHCI_P_CMD_CCS_SHIFT; ahci_handle_slot(p, i); + } } } diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c index 458ba76..6b906ed 100644 --- a/usr.sbin/bhyve/pci_emul.c +++ b/usr.sbin/bhyve/pci_emul.c @@ -109,16 +109,20 @@ static uint64_t pci_emul_membase64; #define PCI_EMUL_IOBASE 0x2000 #define PCI_EMUL_IOLIMIT 0x10000 -#define PCI_EMUL_MEMLIMIT32 0xE0000000 /* 3.5GB */ +#define PCI_EMUL_ECFG_BASE 0xE0000000 /* 3.5GB */ +#define PCI_EMUL_ECFG_SIZE (MAXBUSES * 1024 * 1024) /* 1MB per bus */ +SYSRES_MEM(PCI_EMUL_ECFG_BASE, PCI_EMUL_ECFG_SIZE); + +#define PCI_EMUL_MEMLIMIT32 PCI_EMUL_ECFG_BASE #define PCI_EMUL_MEMBASE64 0xD000000000UL #define PCI_EMUL_MEMLIMIT64 0xFD00000000UL static struct pci_devemu *pci_emul_finddev(char *name); -static void pci_lintr_route(struct pci_devinst *pi); -static void pci_lintr_update(struct pci_devinst *pi); - -static struct mem_range pci_mem_hole; +static void pci_lintr_route(struct pci_devinst *pi); +static void pci_lintr_update(struct pci_devinst *pi); +static void pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, + int func, int coff, int bytes, uint32_t *val); /* * I/O access @@ -1023,12 +1027,37 @@ pci_emul_fallback_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, return (0); } +static int +pci_emul_ecfg_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, + int bytes, uint64_t *val, void *arg1, long arg2) +{ + int bus, slot, func, coff, in; + + coff = addr & 0xfff; + func = (addr >> 12) & 0x7; + slot = (addr >> 15) & 0x1f; + bus = (addr >> 20) & 0xff; + in = (dir == MEM_F_READ); + if (in) + *val = ~0UL; + pci_cfgrw(ctx, vcpu, in, bus, slot, func, coff, bytes, (uint32_t *)val); + return (0); +} + +uint64_t +pci_ecfg_base(void) +{ + + return (PCI_EMUL_ECFG_BASE); +} + #define BUSIO_ROUNDUP 32 #define BUSMEM_ROUNDUP (1024 * 1024) int init_pci(struct vmctx *ctx) { + struct mem_range mr; struct pci_devemu *pde; struct businfo *bi; struct slotinfo *si; @@ -1112,22 +1141,34 @@ init_pci(struct vmctx *ctx) * The guest physical memory map looks like the following: * [0, lowmem) guest system memory * [lowmem, lowmem_limit) memory hole (may be absent) - * [lowmem_limit, 4GB) PCI hole (32-bit BAR allocation) + * [lowmem_limit, 0xE0000000) PCI hole (32-bit BAR allocation) + * [0xE0000000, 0xF0000000) PCI extended config window + * [0xF0000000, 4GB) LAPIC, IOAPIC, HPET, firmware * [4GB, 4GB + highmem) - * + */ + + /* * Accesses to memory addresses that are not allocated to system * memory or PCI devices return 0xff's. */ lowmem = vm_get_lowmem_size(ctx); + bzero(&mr, sizeof(struct mem_range)); + mr.name = "PCI hole"; + mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; + mr.base = lowmem; + mr.size = (4ULL * 1024 * 1024 * 1024) - lowmem; + mr.handler = pci_emul_fallback_handler; + error = register_mem_fallback(&mr); + assert(error == 0); - memset(&pci_mem_hole, 0, sizeof(struct mem_range)); - pci_mem_hole.name = "PCI hole"; - pci_mem_hole.flags = MEM_F_RW; - pci_mem_hole.base = lowmem; - pci_mem_hole.size = (4ULL * 1024 * 1024 * 1024) - lowmem; - pci_mem_hole.handler = pci_emul_fallback_handler; - - error = register_mem_fallback(&pci_mem_hole); + /* PCI extended config space */ + bzero(&mr, sizeof(struct mem_range)); + mr.name = "PCI ECFG"; + mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; + mr.base = PCI_EMUL_ECFG_BASE; + mr.size = PCI_EMUL_ECFG_SIZE; + mr.handler = pci_emul_ecfg_handler; + error = register_mem(&mr); assert(error == 0); return (0); @@ -1612,41 +1653,6 @@ pci_emul_hdrtype_fixup(int bus, int slot, int off, int bytes, uint32_t *rv) } } -static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff; - -static int -pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes, - uint32_t *eax, void *arg) -{ - uint32_t x; - - if (bytes != 4) { - if (in) - *eax = (bytes == 2) ? 0xffff : 0xff; - return (0); - } - - if (in) { - x = (cfgbus << 16) | - (cfgslot << 11) | - (cfgfunc << 8) | - cfgoff; - if (cfgenable) - x |= CONF1_ENABLE; - *eax = x; - } else { - x = *eax; - cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE; - cfgoff = x & PCI_REGMAX; - cfgfunc = (x >> 8) & PCI_FUNCMAX; - cfgslot = (x >> 11) & PCI_SLOTMAX; - cfgbus = (x >> 16) & PCI_BUSMAX; - } - - return (0); -} -INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr); - static uint32_t bits_changed(uint32_t old, uint32_t new, uint32_t mask) { @@ -1709,41 +1715,51 @@ pci_emul_cmdwrite(struct pci_devinst *pi, uint32_t new, int bytes) pci_lintr_update(pi); } -static int -pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes, - uint32_t *eax, void *arg) +static void +pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func, + int coff, int bytes, uint32_t *eax) { struct businfo *bi; struct slotinfo *si; struct pci_devinst *pi; struct pci_devemu *pe; - int coff, idx, needcfg; + int idx, needcfg; uint64_t addr, bar, mask; - assert(bytes == 1 || bytes == 2 || bytes == 4); - - if ((bi = pci_businfo[cfgbus]) != NULL) { - si = &bi->slotinfo[cfgslot]; - pi = si->si_funcs[cfgfunc].fi_devi; + if ((bi = pci_businfo[bus]) != NULL) { + si = &bi->slotinfo[slot]; + pi = si->si_funcs[func].fi_devi; } else pi = NULL; - coff = cfgoff + (port - CONF1_DATA_PORT); - -#if 0 - printf("pcicfg-%s from 0x%0x of %d bytes (%d/%d/%d)\n\r", - in ? "read" : "write", coff, bytes, cfgbus, cfgslot, cfgfunc); -#endif - /* - * Just return if there is no device at this cfgslot:cfgfunc, - * if the guest is doing an un-aligned access, or if the config - * address word isn't enabled. + * Just return if there is no device at this slot:func or if the + * the guest is doing an un-aligned access. */ - if (!cfgenable || pi == NULL || (coff & (bytes - 1)) != 0) { + if (pi == NULL || (bytes != 1 && bytes != 2 && bytes != 4) || + (coff & (bytes - 1)) != 0) { if (in) *eax = 0xffffffff; - return (0); + return; + } + + /* + * Ignore all writes beyond the standard config space and return all + * ones on reads. + */ + if (coff >= PCI_REGMAX + 1) { + if (in) { + *eax = 0xffffffff; + /* + * Extended capabilities begin at offset 256 in config + * space. Absence of extended capabilities is signaled + * with all 0s in the extended capability header at + * offset 256. + */ + if (coff <= PCI_REGMAX + 4) + *eax = 0x00000000; + } + return; } pe = pi->pi_d; @@ -1754,8 +1770,8 @@ pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes, if (in) { /* Let the device emulation override the default handler */ if (pe->pe_cfgread != NULL) { - needcfg = pe->pe_cfgread(ctx, vcpu, pi, - coff, bytes, eax); + needcfg = pe->pe_cfgread(ctx, vcpu, pi, coff, bytes, + eax); } else { needcfg = 1; } @@ -1769,12 +1785,12 @@ pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes, *eax = pci_get_cfgdata32(pi, coff); } - pci_emul_hdrtype_fixup(cfgbus, cfgslot, coff, bytes, eax); + pci_emul_hdrtype_fixup(bus, slot, coff, bytes, eax); } else { /* Let the device emulation override the default handler */ if (pe->pe_cfgwrite != NULL && (*pe->pe_cfgwrite)(ctx, vcpu, pi, coff, bytes, *eax) == 0) - return (0); + return; /* * Special handling for write to BAR registers @@ -1785,7 +1801,7 @@ pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes, * 4-byte aligned. */ if (bytes != 4 || (coff & 0x3) != 0) - return (0); + return; idx = (coff - PCIR_BAR(0)) / 4; mask = ~(pi->pi_bar[idx].size - 1); switch (pi->pi_bar[idx].type) { @@ -1843,7 +1859,57 @@ pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes, CFGWRITE(pi, coff, *eax, bytes); } } +} + +static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff; + +static int +pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + uint32_t x; + + if (bytes != 4) { + if (in) + *eax = (bytes == 2) ? 0xffff : 0xff; + return (0); + } + + if (in) { + x = (cfgbus << 16) | (cfgslot << 11) | (cfgfunc << 8) | cfgoff; + if (cfgenable) + x |= CONF1_ENABLE; + *eax = x; + } else { + x = *eax; + cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE; + cfgoff = x & PCI_REGMAX; + cfgfunc = (x >> 8) & PCI_FUNCMAX; + cfgslot = (x >> 11) & PCI_SLOTMAX; + cfgbus = (x >> 16) & PCI_BUSMAX; + } + + return (0); +} +INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr); + +static int +pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + int coff; + assert(bytes == 1 || bytes == 2 || bytes == 4); + + coff = cfgoff + (port - CONF1_DATA_PORT); + if (cfgenable) { + pci_cfgrw(ctx, vcpu, in, cfgbus, cfgslot, cfgfunc, coff, bytes, + eax); + } else { + /* Ignore accesses to cfgdata if not enabled by cfgaddr */ + if (in) + *eax = 0xffffffff; + } return (0); } diff --git a/usr.sbin/bhyve/pci_emul.h b/usr.sbin/bhyve/pci_emul.h index 866ffc5..6b8c4e0 100644 --- a/usr.sbin/bhyve/pci_emul.h +++ b/usr.sbin/bhyve/pci_emul.h @@ -235,6 +235,7 @@ uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size); int pci_count_lintr(int bus); void pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg); void pci_write_dsdt(void); +uint64_t pci_ecfg_base(void); int pci_bus_configured(int bus); static __inline void diff --git a/usr.sbin/bhyve/pci_irq.c b/usr.sbin/bhyve/pci_irq.c index 653aeb0..20e033f 100644 --- a/usr.sbin/bhyve/pci_irq.c +++ b/usr.sbin/bhyve/pci_irq.c @@ -115,7 +115,7 @@ void pci_irq_reserve(int irq) { - assert(irq < nitems(irq_counts)); + assert(irq >= 0 && irq < nitems(irq_counts)); assert(pirq_cold); assert(irq_counts[irq] == 0 || irq_counts[irq] == IRQ_DISABLED); irq_counts[irq] = IRQ_DISABLED; @@ -125,10 +125,10 @@ void pci_irq_use(int irq) { - assert(irq < nitems(irq_counts)); + assert(irq >= 0 && irq < nitems(irq_counts)); assert(pirq_cold); - if (irq_counts[irq] != IRQ_DISABLED) - irq_counts[irq]++; + assert(irq_counts[irq] != IRQ_DISABLED); + irq_counts[irq]++; } void @@ -197,7 +197,7 @@ pirq_alloc_pin(struct vmctx *ctx) { int best_count, best_irq, best_pin, irq, pin; - pirq_cold = 1; + pirq_cold = 0; /* First, find the least-used PIRQ pin. */ best_pin = 0; @@ -222,7 +222,7 @@ pirq_alloc_pin(struct vmctx *ctx) best_count = irq_counts[irq]; } } - assert(best_irq != 0); + assert(best_irq >= 0); irq_counts[best_irq]++; pirqs[best_pin].reg = best_irq; vm_isa_set_irq_trigger(ctx, best_irq, LEVEL_TRIGGER); @@ -234,9 +234,6 @@ pirq_alloc_pin(struct vmctx *ctx) int pirq_irq(int pin) { - - if (pin == -1) - return (255); assert(pin > 0 && pin <= nitems(pirqs)); return (pirqs[pin - 1].reg & PIRQ_IRQ); } diff --git a/usr.sbin/bhyve/pm.c b/usr.sbin/bhyve/pm.c index 67126d8..f5a2d43 100644 --- a/usr.sbin/bhyve/pm.c +++ b/usr.sbin/bhyve/pm.c @@ -32,6 +32,7 @@ __FBSDID("$FreeBSD$"); #include <machine/vmm.h> #include <assert.h> +#include <errno.h> #include <pthread.h> #include <signal.h> #include <vmmapi.h> @@ -56,6 +57,8 @@ static int reset_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { + int error; + static uint8_t reset_control; if (bytes != 1) @@ -66,8 +69,10 @@ reset_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, reset_control = *eax; /* Treat hard and soft resets the same. */ - if (reset_control & 0x4) - return (INOUT_RESET); + if (reset_control & 0x4) { + error = vm_suspend(ctx, VM_SUSPEND_RESET); + assert(error == 0 || errno == EALREADY); + } } return (0); } @@ -224,6 +229,7 @@ static int pm1_control_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { + int error; if (bytes != 2) return (-1); @@ -243,8 +249,10 @@ pm1_control_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, * says that '5' should be stored in SLP_TYP for S5. */ if (*eax & PM1_SLP_EN) { - if ((pm1_control & PM1_SLP_TYP) >> 10 == 5) - return (INOUT_POWEROFF); + if ((pm1_control & PM1_SLP_TYP) >> 10 == 5) { + error = vm_suspend(ctx, VM_SUSPEND_POWEROFF); + assert(error == 0 || errno == EALREADY); + } } } return (0); diff --git a/usr.sbin/bhyve/smbiostbl.c b/usr.sbin/bhyve/smbiostbl.c index d560f02..28c7eb2 100644 --- a/usr.sbin/bhyve/smbiostbl.c +++ b/usr.sbin/bhyve/smbiostbl.c @@ -321,8 +321,8 @@ struct smbios_table_type0 smbios_type0_template = { const char *smbios_type0_strings[] = { "BHYVE", /* vendor string */ - __TIME__, /* bios version string */ - __DATE__, /* bios release date string */ + "1.00", /* bios version string */ + "03/14/2014", /* bios release date string */ NULL }; diff --git a/usr.sbin/bhyve/task_switch.c b/usr.sbin/bhyve/task_switch.c new file mode 100644 index 0000000..0002da8 --- /dev/null +++ b/usr.sbin/bhyve/task_switch.c @@ -0,0 +1,932 @@ +/*- + * Copyright (c) 2014 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/_iovec.h> +#include <sys/mman.h> + +#include <x86/psl.h> +#include <x86/segments.h> +#include <x86/specialreg.h> +#include <machine/vmm.h> +#include <machine/vmm_instruction_emul.h> + +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> +#include <errno.h> + +#include <vmmapi.h> + +#include "bhyverun.h" + +/* + * Using 'struct i386tss' is tempting but causes myriad sign extension + * issues because all of its fields are defined as signed integers. + */ +struct tss32 { + uint16_t tss_link; + uint16_t rsvd1; + uint32_t tss_esp0; + uint16_t tss_ss0; + uint16_t rsvd2; + uint32_t tss_esp1; + uint16_t tss_ss1; + uint16_t rsvd3; + uint32_t tss_esp2; + uint16_t tss_ss2; + uint16_t rsvd4; + uint32_t tss_cr3; + uint32_t tss_eip; + uint32_t tss_eflags; + uint32_t tss_eax; + uint32_t tss_ecx; + uint32_t tss_edx; + uint32_t tss_ebx; + uint32_t tss_esp; + uint32_t tss_ebp; + uint32_t tss_esi; + uint32_t tss_edi; + uint16_t tss_es; + uint16_t rsvd5; + uint16_t tss_cs; + uint16_t rsvd6; + uint16_t tss_ss; + uint16_t rsvd7; + uint16_t tss_ds; + uint16_t rsvd8; + uint16_t tss_fs; + uint16_t rsvd9; + uint16_t tss_gs; + uint16_t rsvd10; + uint16_t tss_ldt; + uint16_t rsvd11; + uint16_t tss_trap; + uint16_t tss_iomap; +}; +CTASSERT(sizeof(struct tss32) == 104); + +#define SEL_START(sel) (((sel) & ~0x7)) +#define SEL_LIMIT(sel) (((sel) | 0x7)) +#define TSS_BUSY(type) (((type) & 0x2) != 0) + +static uint64_t +GETREG(struct vmctx *ctx, int vcpu, int reg) +{ + uint64_t val; + int error; + + error = vm_get_register(ctx, vcpu, reg, &val); + assert(error == 0); + return (val); +} + +static void +SETREG(struct vmctx *ctx, int vcpu, int reg, uint64_t val) +{ + int error; + + error = vm_set_register(ctx, vcpu, reg, val); + assert(error == 0); +} + +static struct seg_desc +usd_to_seg_desc(struct user_segment_descriptor *usd) +{ + struct seg_desc seg_desc; + + seg_desc.base = (u_int)USD_GETBASE(usd); + if (usd->sd_gran) + seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff; + else + seg_desc.limit = (u_int)USD_GETLIMIT(usd); + seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7; + seg_desc.access |= usd->sd_xx << 12; + seg_desc.access |= usd->sd_def32 << 14; + seg_desc.access |= usd->sd_gran << 15; + + return (seg_desc); +} + +/* + * Inject an exception with an error code that is a segment selector. + * The format of the error code is described in section 6.13, "Error Code", + * Intel SDM volume 3. + * + * Bit 0 (EXT) denotes whether the exception occurred during delivery + * of an external event like an interrupt. + * + * Bit 1 (IDT) indicates whether the selector points to a gate descriptor + * in the IDT. + * + * Bit 2(GDT/LDT) has the usual interpretation of Table Indicator (TI). + */ +static void +sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext) +{ + /* + * Bit 2 from the selector is retained as-is in the error code. + * + * Bit 1 can be safely cleared because none of the selectors + * encountered during task switch emulation refer to a task + * gate in the IDT. + * + * Bit 0 is set depending on the value of 'ext'. + */ + sel &= ~0x3; + if (ext) + sel |= 0x1; + vm_inject_fault(ctx, vcpu, vector, 1, sel); +} + +/* + * Return 0 if the selector 'sel' in within the limits of the GDT/LDT + * and non-zero otherwise. + */ +static int +desc_table_limit_check(struct vmctx *ctx, int vcpu, uint16_t sel) +{ + uint64_t base; + uint32_t limit, access; + int error, reg; + + reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR; + error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access); + assert(error == 0); + + if (reg == VM_REG_GUEST_LDTR) { + if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access)) + return (-1); + } + + if (limit < SEL_LIMIT(sel)) + return (-1); + else + return (0); +} + +/* + * Read/write the segment descriptor 'desc' into the GDT/LDT slot referenced + * by the selector 'sel'. + * + * Returns 0 on success. + * Returns 1 if an exception was injected into the guest. + * Returns -1 otherwise. + */ +static int +desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint16_t sel, struct user_segment_descriptor *desc, bool doread) +{ + struct iovec iov[2]; + uint64_t base; + uint32_t limit, access; + int error, reg; + + reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR; + error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access); + assert(error == 0); + assert(limit >= SEL_LIMIT(sel)); + + error = vm_copy_setup(ctx, vcpu, paging, base + SEL_START(sel), + sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov)); + if (error == 0) { + if (doread) + vm_copyin(ctx, vcpu, iov, desc, sizeof(*desc)); + else + vm_copyout(ctx, vcpu, desc, iov, sizeof(*desc)); + } + return (error); +} + +static int +desc_table_read(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint16_t sel, struct user_segment_descriptor *desc) +{ + return (desc_table_rw(ctx, vcpu, paging, sel, desc, true)); +} + +static int +desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint16_t sel, struct user_segment_descriptor *desc) +{ + return (desc_table_rw(ctx, vcpu, paging, sel, desc, false)); +} + +/* + * Read the TSS descriptor referenced by 'sel' into 'desc'. + * + * Returns 0 on success. + * Returns 1 if an exception was injected into the guest. + * Returns -1 otherwise. + */ +static int +read_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, + uint16_t sel, struct user_segment_descriptor *desc) +{ + struct vm_guest_paging sup_paging; + int error; + + assert(!ISLDT(sel)); + assert(IDXSEL(sel) != 0); + + /* Fetch the new TSS descriptor */ + if (desc_table_limit_check(ctx, vcpu, sel)) { + if (ts->reason == TSR_IRET) + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + else + sel_exception(ctx, vcpu, IDT_GP, sel, ts->ext); + return (1); + } + + sup_paging = ts->paging; + sup_paging.cpl = 0; /* implicit supervisor mode */ + error = desc_table_read(ctx, vcpu, &sup_paging, sel, desc); + return (error); +} + +static bool +code_desc(int sd_type) +{ + /* code descriptor */ + return ((sd_type & 0x18) == 0x18); +} + +static bool +stack_desc(int sd_type) +{ + /* writable data descriptor */ + return ((sd_type & 0x1A) == 0x12); +} + +static bool +data_desc(int sd_type) +{ + /* data descriptor or a readable code descriptor */ + return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A); +} + +static bool +ldt_desc(int sd_type) +{ + + return (sd_type == SDT_SYSLDT); +} + +/* + * Validate the descriptor 'seg_desc' associated with 'segment'. + * + * Returns 0 on success. + * Returns 1 if an exception was injected into the guest. + * Returns -1 otherwise. + */ +static int +validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, + int segment, struct seg_desc *seg_desc) +{ + struct vm_guest_paging sup_paging; + struct user_segment_descriptor usd; + int error, idtvec; + int cpl, dpl, rpl; + uint16_t sel, cs; + bool ldtseg, codeseg, stackseg, dataseg, conforming; + + ldtseg = codeseg = stackseg = dataseg = false; + switch (segment) { + case VM_REG_GUEST_LDTR: + ldtseg = true; + break; + case VM_REG_GUEST_CS: + codeseg = true; + break; + case VM_REG_GUEST_SS: + stackseg = true; + break; + case VM_REG_GUEST_DS: + case VM_REG_GUEST_ES: + case VM_REG_GUEST_FS: + case VM_REG_GUEST_GS: + dataseg = true; + break; + default: + assert(0); + } + + /* Get the segment selector */ + sel = GETREG(ctx, vcpu, segment); + + /* LDT selector must point into the GDT */ + if (ldtseg && ISLDT(sel)) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + + /* Descriptor table limit check */ + if (desc_table_limit_check(ctx, vcpu, sel)) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + + /* NULL selector */ + if (IDXSEL(sel) == 0) { + /* Code and stack segment selectors cannot be NULL */ + if (codeseg || stackseg) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + seg_desc->base = 0; + seg_desc->limit = 0; + seg_desc->access = 0x10000; /* unusable */ + return (0); + } + + /* Read the descriptor from the GDT/LDT */ + sup_paging = ts->paging; + sup_paging.cpl = 0; /* implicit supervisor mode */ + error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd); + if (error) + return (error); + + /* Verify that the descriptor type is compatible with the segment */ + if ((ldtseg && !ldt_desc(usd.sd_type)) || + (codeseg && !code_desc(usd.sd_type)) || + (dataseg && !data_desc(usd.sd_type)) || + (stackseg && !stack_desc(usd.sd_type))) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + + /* Segment must be marked present */ + if (!usd.sd_p) { + if (ldtseg) + idtvec = IDT_TS; + else if (stackseg) + idtvec = IDT_SS; + else + idtvec = IDT_NP; + sel_exception(ctx, vcpu, idtvec, sel, ts->ext); + return (1); + } + + cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS); + cpl = cs & SEL_RPL_MASK; + rpl = sel & SEL_RPL_MASK; + dpl = usd.sd_dpl; + + if (stackseg && (rpl != cpl || dpl != cpl)) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + + if (codeseg) { + conforming = (usd.sd_type & 0x4) ? true : false; + if ((conforming && (cpl < dpl)) || + (!conforming && (cpl != dpl))) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + } + + if (dataseg) { + /* + * A data segment is always non-conforming except when it's + * descriptor is a readable, conforming code segment. + */ + if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0) + conforming = true; + else + conforming = false; + + if (!conforming && (rpl > dpl || cpl > dpl)) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + } + *seg_desc = usd_to_seg_desc(&usd); + return (0); +} + +static void +tss32_save(struct vmctx *ctx, int vcpu, struct vm_task_switch *task_switch, + uint32_t eip, struct tss32 *tss, struct iovec *iov) +{ + + /* General purpose registers */ + tss->tss_eax = GETREG(ctx, vcpu, VM_REG_GUEST_RAX); + tss->tss_ecx = GETREG(ctx, vcpu, VM_REG_GUEST_RCX); + tss->tss_edx = GETREG(ctx, vcpu, VM_REG_GUEST_RDX); + tss->tss_ebx = GETREG(ctx, vcpu, VM_REG_GUEST_RBX); + tss->tss_esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP); + tss->tss_ebp = GETREG(ctx, vcpu, VM_REG_GUEST_RBP); + tss->tss_esi = GETREG(ctx, vcpu, VM_REG_GUEST_RSI); + tss->tss_edi = GETREG(ctx, vcpu, VM_REG_GUEST_RDI); + + /* Segment selectors */ + tss->tss_es = GETREG(ctx, vcpu, VM_REG_GUEST_ES); + tss->tss_cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS); + tss->tss_ss = GETREG(ctx, vcpu, VM_REG_GUEST_SS); + tss->tss_ds = GETREG(ctx, vcpu, VM_REG_GUEST_DS); + tss->tss_fs = GETREG(ctx, vcpu, VM_REG_GUEST_FS); + tss->tss_gs = GETREG(ctx, vcpu, VM_REG_GUEST_GS); + + /* eflags and eip */ + tss->tss_eflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS); + if (task_switch->reason == TSR_IRET) + tss->tss_eflags &= ~PSL_NT; + tss->tss_eip = eip; + + /* Copy updated old TSS into guest memory */ + vm_copyout(ctx, vcpu, tss, iov, sizeof(struct tss32)); +} + +static void +update_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *sd) +{ + int error; + + error = vm_set_desc(ctx, vcpu, reg, sd->base, sd->limit, sd->access); + assert(error == 0); +} + +/* + * Update the vcpu registers to reflect the state of the new task. + * + * Returns 0 on success. + * Returns 1 if an exception was injected into the guest. + * Returns -1 otherwise. + */ +static int +tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, + uint16_t ot_sel, struct tss32 *tss, struct iovec *iov) +{ + struct seg_desc seg_desc, seg_desc2; + uint64_t *pdpte, maxphyaddr, reserved; + uint32_t eflags; + int error, i; + bool nested; + + nested = false; + if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) { + tss->tss_link = ot_sel; + nested = true; + } + + eflags = tss->tss_eflags; + if (nested) + eflags |= PSL_NT; + + /* LDTR */ + SETREG(ctx, vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt); + + /* PBDR */ + if (ts->paging.paging_mode != PAGING_MODE_FLAT) { + if (ts->paging.paging_mode == PAGING_MODE_PAE) { + /* + * XXX Assuming 36-bit MAXPHYADDR. + */ + maxphyaddr = (1UL << 36) - 1; + pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32); + for (i = 0; i < 4; i++) { + /* Check reserved bits if the PDPTE is valid */ + if (!(pdpte[i] & 0x1)) + continue; + /* + * Bits 2:1, 8:5 and bits above the processor's + * maximum physical address are reserved. + */ + reserved = ~maxphyaddr | 0x1E6; + if (pdpte[i] & reserved) { + vm_inject_gp(ctx, vcpu); + return (1); + } + } + SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]); + SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]); + SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]); + SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]); + } + SETREG(ctx, vcpu, VM_REG_GUEST_CR3, tss->tss_cr3); + ts->paging.cr3 = tss->tss_cr3; + } + + /* eflags and eip */ + SETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS, eflags); + SETREG(ctx, vcpu, VM_REG_GUEST_RIP, tss->tss_eip); + + /* General purpose registers */ + SETREG(ctx, vcpu, VM_REG_GUEST_RAX, tss->tss_eax); + SETREG(ctx, vcpu, VM_REG_GUEST_RCX, tss->tss_ecx); + SETREG(ctx, vcpu, VM_REG_GUEST_RDX, tss->tss_edx); + SETREG(ctx, vcpu, VM_REG_GUEST_RBX, tss->tss_ebx); + SETREG(ctx, vcpu, VM_REG_GUEST_RSP, tss->tss_esp); + SETREG(ctx, vcpu, VM_REG_GUEST_RBP, tss->tss_ebp); + SETREG(ctx, vcpu, VM_REG_GUEST_RSI, tss->tss_esi); + SETREG(ctx, vcpu, VM_REG_GUEST_RDI, tss->tss_edi); + + /* Segment selectors */ + SETREG(ctx, vcpu, VM_REG_GUEST_ES, tss->tss_es); + SETREG(ctx, vcpu, VM_REG_GUEST_CS, tss->tss_cs); + SETREG(ctx, vcpu, VM_REG_GUEST_SS, tss->tss_ss); + SETREG(ctx, vcpu, VM_REG_GUEST_DS, tss->tss_ds); + SETREG(ctx, vcpu, VM_REG_GUEST_FS, tss->tss_fs); + SETREG(ctx, vcpu, VM_REG_GUEST_GS, tss->tss_gs); + + /* + * If this is a nested task then write out the new TSS to update + * the previous link field. + */ + if (nested) + vm_copyout(ctx, vcpu, tss, iov, sizeof(*tss)); + + /* Validate segment descriptors */ + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc); + if (error) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &seg_desc); + + /* + * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3. + * + * The SS and CS attribute checks on VM-entry are inter-dependent so + * we need to make sure that both segments are valid before updating + * either of them. This ensures that the VMCS state can pass the + * VM-entry checks so the guest can handle any exception injected + * during task switch emulation. + */ + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc); + if (error) + return (error); + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2); + if (error) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_CS, &seg_desc); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc2); + ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK; + + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc); + if (error) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_DS, &seg_desc); + + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc); + if (error) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_ES, &seg_desc); + + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc); + if (error) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_FS, &seg_desc); + + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc); + if (error) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_GS, &seg_desc); + + return (0); +} + +/* + * Push an error code on the stack of the new task. This is needed if the + * task switch was triggered by a hardware exception that causes an error + * code to be saved (e.g. #PF). + * + * Returns 0 on success. + * Returns 1 if an exception was injected into the guest. + * Returns -1 otherwise. + */ +static int +push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + int task_type, uint32_t errcode) +{ + struct iovec iov[2]; + struct seg_desc seg_desc; + int stacksize, bytes, error; + uint64_t gla, cr0, rflags; + uint32_t esp; + uint16_t stacksel; + + cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0); + rflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS); + stacksel = GETREG(ctx, vcpu, VM_REG_GUEST_SS); + + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc.base, + &seg_desc.limit, &seg_desc.access); + assert(error == 0); + + /* + * Section "Error Code" in the Intel SDM vol 3: the error code is + * pushed on the stack as a doubleword or word (depending on the + * default interrupt, trap or task gate size). + */ + if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS) + bytes = 4; + else + bytes = 2; + + /* + * PUSH instruction from Intel SDM vol 2: the 'B' flag in the + * stack-segment descriptor determines the size of the stack + * pointer outside of 64-bit mode. + */ + if (SEG_DESC_DEF32(seg_desc.access)) + stacksize = 4; + else + stacksize = 2; + + esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP); + esp -= bytes; + + if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, + &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) { + sel_exception(ctx, vcpu, IDT_SS, stacksel, 1); + return (1); + } + + if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) { + vm_inject_ac(ctx, vcpu, 1); + return (1); + } + + error = vm_copy_setup(ctx, vcpu, paging, gla, bytes, PROT_WRITE, + iov, nitems(iov)); + if (error) + return (error); + + vm_copyout(ctx, vcpu, &errcode, iov, bytes); + SETREG(ctx, vcpu, VM_REG_GUEST_RSP, esp); + return (0); +} + +/* + * Evaluate return value from helper functions and potentially return to + * the VM run loop. + * 0: success + * +1: an exception was injected into the guest vcpu + * -1: unrecoverable/programming error + */ +#define CHKERR(x) \ + do { \ + assert(((x) == 0) || ((x) == 1) || ((x) == -1)); \ + if ((x) == -1) \ + return (VMEXIT_ABORT); \ + else if ((x) == 1) \ + return (VMEXIT_CONTINUE); \ + } while (0) + +int +vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + struct seg_desc nt; + struct tss32 oldtss, newtss; + struct vm_task_switch *task_switch; + struct vm_guest_paging *paging, sup_paging; + struct user_segment_descriptor nt_desc, ot_desc; + struct iovec nt_iov[2], ot_iov[2]; + uint64_t cr0, ot_base; + uint32_t eip, ot_lim, access; + int error, ext, minlimit, nt_type, ot_type, vcpu; + enum task_switch_reason reason; + uint16_t nt_sel, ot_sel; + + task_switch = &vmexit->u.task_switch; + nt_sel = task_switch->tsssel; + ext = vmexit->u.task_switch.ext; + reason = vmexit->u.task_switch.reason; + paging = &vmexit->u.task_switch.paging; + vcpu = *pvcpu; + + assert(paging->cpu_mode == CPU_MODE_PROTECTED); + + /* + * Section 4.6, "Access Rights" in Intel SDM Vol 3. + * The following page table accesses are implicitly supervisor mode: + * - accesses to GDT or LDT to load segment descriptors + * - accesses to the task state segment during task switch + */ + sup_paging = *paging; + sup_paging.cpl = 0; /* implicit supervisor mode */ + + /* Fetch the new TSS descriptor */ + error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc); + CHKERR(error); + + nt = usd_to_seg_desc(&nt_desc); + + /* Verify the type of the new TSS */ + nt_type = SEG_DESC_TYPE(nt.access); + if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS && + nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) { + sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext); + goto done; + } + + /* TSS descriptor must have present bit set */ + if (!SEG_DESC_PRESENT(nt.access)) { + sel_exception(ctx, vcpu, IDT_NP, nt_sel, ext); + goto done; + } + + /* + * TSS must have a minimum length of 104 bytes for a 32-bit TSS and + * 44 bytes for a 16-bit TSS. + */ + if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS) + minlimit = 104 - 1; + else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) + minlimit = 44 - 1; + else + minlimit = 0; + + assert(minlimit > 0); + if (nt.limit < minlimit) { + sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext); + goto done; + } + + /* TSS must be busy if task switch is due to IRET */ + if (reason == TSR_IRET && !TSS_BUSY(nt_type)) { + sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext); + goto done; + } + + /* + * TSS must be available (not busy) if task switch reason is + * CALL, JMP, exception or interrupt. + */ + if (reason != TSR_IRET && TSS_BUSY(nt_type)) { + sel_exception(ctx, vcpu, IDT_GP, nt_sel, ext); + goto done; + } + + /* Fetch the new TSS */ + error = vm_copy_setup(ctx, vcpu, &sup_paging, nt.base, minlimit + 1, + PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov)); + CHKERR(error); + vm_copyin(ctx, vcpu, nt_iov, &newtss, minlimit + 1); + + /* Get the old TSS selector from the guest's task register */ + ot_sel = GETREG(ctx, vcpu, VM_REG_GUEST_TR); + if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) { + /* + * This might happen if a task switch was attempted without + * ever loading the task register with LTR. In this case the + * TR would contain the values from power-on: + * (sel = 0, base = 0, limit = 0xffff). + */ + sel_exception(ctx, vcpu, IDT_TS, ot_sel, task_switch->ext); + goto done; + } + + /* Get the old TSS base and limit from the guest's task register */ + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim, + &access); + assert(error == 0); + assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access)); + ot_type = SEG_DESC_TYPE(access); + assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY); + + /* Fetch the old TSS descriptor */ + error = read_tss_descriptor(ctx, vcpu, task_switch, ot_sel, &ot_desc); + CHKERR(error); + + /* Get the old TSS */ + error = vm_copy_setup(ctx, vcpu, &sup_paging, ot_base, minlimit + 1, + PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov)); + CHKERR(error); + vm_copyin(ctx, vcpu, ot_iov, &oldtss, minlimit + 1); + + /* + * Clear the busy bit in the old TSS descriptor if the task switch + * due to an IRET or JMP instruction. + */ + if (reason == TSR_IRET || reason == TSR_JMP) { + ot_desc.sd_type &= ~0x2; + error = desc_table_write(ctx, vcpu, &sup_paging, ot_sel, + &ot_desc); + CHKERR(error); + } + + if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) { + fprintf(stderr, "Task switch to 16-bit TSS not supported\n"); + return (VMEXIT_ABORT); + } + + /* Save processor state in old TSS */ + eip = vmexit->rip + vmexit->inst_length; + tss32_save(ctx, vcpu, task_switch, eip, &oldtss, ot_iov); + + /* + * If the task switch was triggered for any reason other than IRET + * then set the busy bit in the new TSS descriptor. + */ + if (reason != TSR_IRET) { + nt_desc.sd_type |= 0x2; + error = desc_table_write(ctx, vcpu, &sup_paging, nt_sel, + &nt_desc); + CHKERR(error); + } + + /* Update task register to point at the new TSS */ + SETREG(ctx, vcpu, VM_REG_GUEST_TR, nt_sel); + + /* Update the hidden descriptor state of the task register */ + nt = usd_to_seg_desc(&nt_desc); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_TR, &nt); + + /* Set CR0.TS */ + cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0); + SETREG(ctx, vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS); + + /* + * We are now committed to the task switch. Any exceptions encountered + * after this point will be handled in the context of the new task and + * the saved instruction pointer will belong to the new task. + */ + vmexit->rip = newtss.tss_eip; + vmexit->inst_length = 0; + + /* Load processor state from new TSS */ + error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov); + CHKERR(error); + + /* + * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception + * caused an error code to be generated, this error code is copied + * to the stack of the new task. + */ + if (task_switch->errcode_valid) { + assert(task_switch->ext); + assert(task_switch->reason == TSR_IDT_GATE); + error = push_errcode(ctx, vcpu, &task_switch->paging, nt_type, + task_switch->errcode); + CHKERR(error); + } + + /* + * Treatment of virtual-NMI blocking if NMI is delivered through + * a task gate. + * + * Section "Architectural State Before A VM Exit", Intel SDM, Vol3: + * If the virtual NMIs VM-execution control is 1, VM entry injects + * an NMI, and delivery of the NMI causes a task switch that causes + * a VM exit, virtual-NMI blocking is in effect before the VM exit + * commences. + * + * Thus, virtual-NMI blocking is in effect at the time of the task + * switch VM exit. + */ + + /* + * Treatment of virtual-NMI unblocking on IRET from NMI handler task. + * + * Section "Changes to Instruction Behavior in VMX Non-Root Operation" + * If "virtual NMIs" control is 1 IRET removes any virtual-NMI blocking. + * This unblocking of virtual-NMI occurs even if IRET causes a fault. + * + * Thus, virtual-NMI blocking is cleared at the time of the task switch + * VM exit. + */ + + /* + * If the task switch was triggered by an event delivered through + * the IDT then extinguish the pending event from the vcpu's + * exitintinfo. + */ + if (task_switch->reason == TSR_IDT_GATE) { + error = vm_set_intinfo(ctx, vcpu, 0); + assert(error == 0); + } + + /* + * XXX should inject debug exception if 'T' bit is 1 + */ +done: + return (VMEXIT_CONTINUE); +} diff --git a/usr.sbin/bhyve/virtio.c b/usr.sbin/bhyve/virtio.c index 4e58dd6..1f27300 100644 --- a/usr.sbin/bhyve/virtio.c +++ b/usr.sbin/bhyve/virtio.c @@ -437,7 +437,7 @@ vq_endchains(struct vqueue_info *vq, int used_all_avail) if (used_all_avail && (vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY)) intr = 1; - else if (vs->vs_flags & VIRTIO_EVENT_IDX) { + else if (vs->vs_negotiated_caps & VIRTIO_RING_F_EVENT_IDX) { event_idx = VQ_USED_EVENT_IDX(vq); /* * This calculation is per docs and the kernel diff --git a/usr.sbin/bhyve/virtio.h b/usr.sbin/bhyve/virtio.h index 01b5f7b..1f29dfa 100644 --- a/usr.sbin/bhyve/virtio.h +++ b/usr.sbin/bhyve/virtio.h @@ -352,7 +352,7 @@ struct virtio_consts { /* called to read config regs */ int (*vc_cfgwrite)(void *, int, int, uint32_t); /* called to write config regs */ - uint32_t vc_hv_caps; /* hypervisor-provided capabilities */ + uint64_t vc_hv_caps; /* hypervisor-provided capabilities */ }; /* diff --git a/usr.sbin/bhyvectl/bhyvectl.c b/usr.sbin/bhyvectl/bhyvectl.c index e77f0d7..b6006b7 100644 --- a/usr.sbin/bhyvectl/bhyvectl.c +++ b/usr.sbin/bhyvectl/bhyvectl.c @@ -195,7 +195,8 @@ usage(void) " [--force-reset]\n" " [--force-poweroff]\n" " [--get-active-cpus]\n" - " [--get-suspended-cpus]\n", + " [--get-suspended-cpus]\n" + " [--get-intinfo]\n", progname); exit(1); } @@ -205,6 +206,7 @@ static int inject_nmi, assert_lapic_lvt; static int force_reset, force_poweroff; static const char *capname; static int create, destroy, get_lowmem, get_highmem; +static int get_intinfo; static int get_active_cpus, get_suspended_cpus; static uint64_t memsize; static int set_cr0, get_cr0, set_cr3, get_cr3, set_cr4, get_cr4; @@ -412,6 +414,37 @@ print_cpus(const char *banner, const cpuset_t *cpus) printf("\n"); } +static void +print_intinfo(const char *banner, uint64_t info) +{ + int type; + + printf("%s:\t", banner); + if (info & VM_INTINFO_VALID) { + type = info & VM_INTINFO_TYPE; + switch (type) { + case VM_INTINFO_HWINTR: + printf("extint"); + break; + case VM_INTINFO_NMI: + printf("nmi"); + break; + case VM_INTINFO_SWINTR: + printf("swint"); + break; + default: + printf("exception"); + break; + } + printf(" vector %d", (int)VM_INTINFO_VECTOR(info)); + if (info & VM_INTINFO_DEL_ERRCODE) + printf(" errcode %#x", (u_int)(info >> 32)); + } else { + printf("n/a"); + } + printf("\n"); +} + int main(int argc, char *argv[]) { @@ -420,7 +453,7 @@ main(int argc, char *argv[]) vm_paddr_t gpa, gpa_pmap; size_t len; struct vm_exit vmexit; - uint64_t ctl, eptp, bm, addr, u64, pteval[4], *pte; + uint64_t ctl, eptp, bm, addr, u64, pteval[4], *pte, info[2]; struct vmctx *ctx; int wired; cpuset_t cpus; @@ -595,6 +628,7 @@ main(int argc, char *argv[]) { "force-poweroff", NO_ARG, &force_poweroff, 1 }, { "get-active-cpus", NO_ARG, &get_active_cpus, 1 }, { "get-suspended-cpus", NO_ARG, &get_suspended_cpus, 1 }, + { "get-intinfo", NO_ARG, &get_intinfo, 1 }, { NULL, 0, NULL, 0 } }; @@ -1566,6 +1600,14 @@ main(int argc, char *argv[]) print_cpus("suspended cpus", &cpus); } + if (!error && (get_intinfo || get_all)) { + error = vm_get_intinfo(ctx, vcpu, &info[0], &info[1]); + if (!error) { + print_intinfo("pending", info[0]); + print_intinfo("current", info[1]); + } + } + if (!error && run) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip); assert(error == 0); diff --git a/usr.sbin/bhyveload/bhyveload.8 b/usr.sbin/bhyveload/bhyveload.8 index 3a300cc..0bdf151 100644 --- a/usr.sbin/bhyveload/bhyveload.8 +++ b/usr.sbin/bhyveload/bhyveload.8 @@ -35,11 +35,11 @@ guest inside a bhyve virtual machine .Sh SYNOPSIS .Nm -.Op Fl m Ar mem-size +.Op Fl c Ar cons-dev .Op Fl d Ar disk-path -.Op Fl h Ar host-path .Op Fl e Ar name=value -.Op Fl c Ar cons-dev +.Op Fl h Ar host-path +.Op Fl m Ar mem-size .Ar vmname .Sh DESCRIPTION .Nm @@ -62,6 +62,32 @@ and will be created if it does not already exist. .Sh OPTIONS The following options are available: .Bl -tag -width indent +.It Fl c Ar cons-dev +.Ar cons-dev +is a +.Xr tty 4 +device to use for +.Nm +terminal I/O. +.Pp +The text string "stdio" is also accepted and selects the use of +unbuffered standard I/O. This is the default value. +.It Fl d Ar disk-path +The +.Ar disk-path +is the pathname of the guest's boot disk image. +.It Fl e Ar name=value +Set the FreeBSD loader environment variable +.Ar name +to +.Ar value . +.Pp +The option may be used more than once to set more than one environment +variable. +.It Fl h Ar host-path +The +.Ar host-path +is the directory at the top of the guest's boot filesystem. .It Fl m Ar mem-size Xo .Sm off .Op Cm K | k | M | m | G | g | T | t @@ -85,32 +111,6 @@ respectively. The default value of .Ar mem-size is 256M. -.It Fl d Ar disk-path -The -.Ar disk-path -is the pathname of the guest's boot disk image. -.It Fl h Ar host-path -The -.Ar host-path -is the directory at the top of the guest's boot filesystem. -.It Fl e Ar name=value -Set the FreeBSD loader environment variable -.Ar name -to -.Ar value . -.Pp -The option may be used more than once to set more than one environment -variable. -.It Fl c Ar cons-dev -.Ar cons-dev -is a -.Xr tty 4 -device to use for -.Nm -terminal I/O. -.Pp -The text string "stdio" is also accepted and selects the use of -unbuffered standard I/O. This is the default value. .El .Sh EXAMPLES To create a virtual machine named diff --git a/usr.sbin/bhyveload/bhyveload.c b/usr.sbin/bhyveload/bhyveload.c index ff6b269..eaf71a8 100644 --- a/usr.sbin/bhyveload/bhyveload.c +++ b/usr.sbin/bhyveload/bhyveload.c @@ -629,8 +629,8 @@ usage(void) { fprintf(stderr, - "usage: %s [-m mem-size] [-d <disk-path>] [-h <host-path>]\n" - " %*s [-e <name=value>] [-c <console-device>] <vmname>\n", + "usage: %s [-c <console-device>] [-d <disk-path>] [-e <name=value>]\n" + " %*s [-h <host-path>] [-m mem-size] <vmname>\n", progname, (int)strlen(progname), ""); exit(1); |