-rw-r--r--  lib/libvmmapi/vmmapi.c                    |   42
-rw-r--r--  lib/libvmmapi/vmmapi.h                    |    7
-rw-r--r--  sys/amd64/amd64/identcpu.c                |  199
-rw-r--r--  sys/amd64/include/vmm.h                   |  153
-rw-r--r--  sys/amd64/include/vmm_dev.h               |   12
-rw-r--r--  sys/amd64/include/vmm_instruction_emul.h  |    6
-rw-r--r--  sys/amd64/vmm/intel/vmcs.c                |    8
-rw-r--r--  sys/amd64/vmm/intel/vmcs.h                |    3
-rw-r--r--  sys/amd64/vmm/intel/vmx.c                 |  350
-rw-r--r--  sys/amd64/vmm/intel/vmx_msr.c             |    1
-rw-r--r--  sys/amd64/vmm/intel/vmx_msr.h             |   23
-rw-r--r--  sys/amd64/vmm/intel/vtd.c                 |    5
-rw-r--r--  sys/amd64/vmm/io/vatpic.c                 |   17
-rw-r--r--  sys/amd64/vmm/vmm.c                       |  402
-rw-r--r--  sys/amd64/vmm/vmm_dev.c                   |   12
-rw-r--r--  sys/amd64/vmm/vmm_instruction_emul.c      |  672
-rw-r--r--  sys/x86/include/specialreg.h              |   19
-rw-r--r--  usr.sbin/bhyve/Makefile                   |    1
-rw-r--r--  usr.sbin/bhyve/acpi.c                     |   48
-rw-r--r--  usr.sbin/bhyve/atkbdc.c                   |   17
-rw-r--r--  usr.sbin/bhyve/bhyve.8                    |   80
-rw-r--r--  usr.sbin/bhyve/bhyverun.c                 |  127
-rw-r--r--  usr.sbin/bhyve/bhyverun.h                 |    4
-rw-r--r--  usr.sbin/bhyve/block_if.c                 |   49
-rw-r--r--  usr.sbin/bhyve/block_if.h                 |    2
-rw-r--r--  usr.sbin/bhyve/inout.c                    |   29
-rw-r--r--  usr.sbin/bhyve/inout.h                    |   10
-rw-r--r--  usr.sbin/bhyve/mem.c                      |   31
-rw-r--r--  usr.sbin/bhyve/mem.h                      |    4
-rw-r--r--  usr.sbin/bhyve/pci_ahci.c                 |   20
-rw-r--r--  usr.sbin/bhyve/pci_emul.c                 |  218
-rw-r--r--  usr.sbin/bhyve/pci_emul.h                 |    1
-rw-r--r--  usr.sbin/bhyve/pci_irq.c                  |   15
-rw-r--r--  usr.sbin/bhyve/pm.c                       |   16
-rw-r--r--  usr.sbin/bhyve/smbiostbl.c                |    4
-rw-r--r--  usr.sbin/bhyve/task_switch.c              |  932
-rw-r--r--  usr.sbin/bhyve/virtio.c                   |    2
-rw-r--r--  usr.sbin/bhyve/virtio.h                   |    2
-rw-r--r--  usr.sbin/bhyvectl/bhyvectl.c              |   46
-rw-r--r--  usr.sbin/bhyveload/bhyveload.8            |   58
-rw-r--r--  usr.sbin/bhyveload/bhyveload.c            |    4
41 files changed, 3057 insertions, 594 deletions
diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c
index 9fb2308..93955c7 100644
--- a/lib/libvmmapi/vmmapi.c
+++ b/lib/libvmmapi/vmmapi.c
@@ -36,6 +36,7 @@ __FBSDID("$FreeBSD$");
#include <sys/_iovec.h>
#include <sys/cpuset.h>
+#include <x86/segments.h>
#include <machine/specialreg.h>
#include <machine/param.h>
@@ -327,6 +328,16 @@ vm_get_desc(struct vmctx *ctx, int vcpu, int reg,
}
int
+vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *seg_desc)
+{
+ int error;
+
+ error = vm_get_desc(ctx, vcpu, reg, &seg_desc->base, &seg_desc->limit,
+ &seg_desc->access);
+ return (error);
+}
+
+int
vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
{
int error;
@@ -988,7 +999,7 @@ gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
#endif
int
-vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt)
{
uint64_t gpa;
@@ -1106,3 +1117,32 @@ vm_activate_cpu(struct vmctx *ctx, int vcpu)
error = ioctl(ctx->fd, VM_ACTIVATE_CPU, &ac);
return (error);
}
+
+int
+vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *info1, uint64_t *info2)
+{
+ struct vm_intinfo vmii;
+ int error;
+
+ bzero(&vmii, sizeof(struct vm_intinfo));
+ vmii.vcpuid = vcpu;
+ error = ioctl(ctx->fd, VM_GET_INTINFO, &vmii);
+ if (error == 0) {
+ *info1 = vmii.info1;
+ *info2 = vmii.info2;
+ }
+ return (error);
+}
+
+int
+vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t info1)
+{
+ struct vm_intinfo vmii;
+ int error;
+
+ bzero(&vmii, sizeof(struct vm_intinfo));
+ vmii.vcpuid = vcpu;
+ vmii.info1 = info1;
+ error = ioctl(ctx->fd, VM_SET_INTINFO, &vmii);
+ return (error);
+}
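A minimal usage sketch of the new accessors, assuming a VM that already exists; the VM name and vcpu number are placeholders and the calling pattern mirrors how a libvmmapi consumer such as bhyvectl(8) might use them:

#include <sys/types.h>
#include <stdio.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <vmmapi.h>

int
dump_intinfo(const char *vmname, int vcpu)
{
	struct vmctx *ctx;
	uint64_t info1, info2;
	int error;

	ctx = vm_open(vmname);		/* e.g. "testvm" */
	if (ctx == NULL)
		return (-1);
	error = vm_get_intinfo(ctx, vcpu, &info1, &info2);
	if (error == 0)
		printf("exitintinfo %#lx, pending exception %#lx\n",
		    info1, info2);
	return (error);
}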
diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
index 067eaa0..fbb6ddd 100644
--- a/lib/libvmmapi/vmmapi.h
+++ b/lib/libvmmapi/vmmapi.h
@@ -66,6 +66,8 @@ int vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
uint64_t base, uint32_t limit, uint32_t access);
int vm_get_desc(struct vmctx *ctx, int vcpu, int reg,
uint64_t *base, uint32_t *limit, uint32_t *access);
+int vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg,
+ struct seg_desc *seg_desc);
int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val);
int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval);
int vm_run(struct vmctx *ctx, int vcpu, uint64_t rip,
@@ -104,6 +106,9 @@ int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot,
int func, int idx, uint64_t addr, uint64_t msg,
uint32_t vector_control);
+int vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *i1, uint64_t *i2);
+int vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t exit_intinfo);
+
/*
* Return a pointer to the statistics buffer. Note that this is not MT-safe.
*/
@@ -121,7 +126,7 @@ int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities);
* The 'iovcnt' should be big enough to accommodate all GPA segments.
* Returns 0 on success, 1 on a guest fault condition and -1 otherwise.
*/
-int vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+int vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *pg,
uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt);
void vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *guest_iov,
void *host_dst, size_t len);
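A sketch of the intended calling pattern for the renamed vm_copy_setup(), assuming 'ctx', 'vcpu', 'paging' and 'gla' were captured from a VM-exit; two iovec entries are enough for a copy of at most a page that straddles one page boundary:

#include <sys/param.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <sys/mman.h>
#include <machine/vmm.h>
#include <vmmapi.h>

static int
read_guest_linear(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint64_t gla, void *buf, size_t len)
{
	struct iovec iov[2];
	int error;

	error = vm_copy_setup(ctx, vcpu, paging, gla, len, PROT_READ,
	    iov, nitems(iov));
	if (error == 0)			/* translation succeeded */
		vm_copyin(ctx, vcpu, iov, buf, len);
	return (error);		/* 1: fault injected into guest, -1: error */
}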
diff --git a/sys/amd64/amd64/identcpu.c b/sys/amd64/amd64/identcpu.c
index 957120c..74be82c 100644
--- a/sys/amd64/amd64/identcpu.c
+++ b/sys/amd64/amd64/identcpu.c
@@ -61,6 +61,7 @@ __FBSDID("$FreeBSD$");
#include <machine/specialreg.h>
#include <machine/md_var.h>
+#include <amd64/vmm/intel/vmx_controls.h>
#include <x86/isa/icu.h>
/* XXX - should be in header file: */
@@ -73,6 +74,7 @@ static u_int find_cpu_vendor_id(void);
static void print_AMD_info(void);
static void print_AMD_assoc(int i);
static void print_via_padlock_info(void);
+static void print_vmx_info(void);
int cpu_class;
char machine[] = "amd64";
@@ -428,6 +430,9 @@ printcpuinfo(void)
if (via_feature_rng != 0 || via_feature_xcrypt != 0)
print_via_padlock_info();
+ if (cpu_feature2 & CPUID2_VMX)
+ print_vmx_info();
+
if ((cpu_feature & CPUID_HTT) &&
cpu_vendor_id == CPU_VENDOR_AMD)
cpu_feature &= ~CPUID_HTT;
@@ -722,3 +727,197 @@ print_via_padlock_info(void)
"\015RSA" /* PMM */
);
}
+
+static uint32_t
+vmx_settable(uint64_t basic, int msr, int true_msr)
+{
+ uint64_t val;
+
+ if (basic & (1UL << 55))
+ val = rdmsr(true_msr);
+ else
+ val = rdmsr(msr);
+
+ /* Just report the controls that can be set to 1. */
+ return (val >> 32);
+}
+
+static void
+print_vmx_info(void)
+{
+ uint64_t basic, msr;
+ uint32_t entry, exit, mask, pin, proc, proc2;
+ int comma;
+
+ printf("\n VT-x: ");
+ msr = rdmsr(MSR_IA32_FEATURE_CONTROL);
+ if (!(msr & IA32_FEATURE_CONTROL_VMX_EN))
+ printf("(disabled in BIOS) ");
+ basic = rdmsr(MSR_VMX_BASIC);
+ pin = vmx_settable(basic, MSR_VMX_PINBASED_CTLS,
+ MSR_VMX_TRUE_PINBASED_CTLS);
+ proc = vmx_settable(basic, MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_TRUE_PROCBASED_CTLS);
+ if (proc & PROCBASED_SECONDARY_CONTROLS)
+ proc2 = vmx_settable(basic, MSR_VMX_PROCBASED_CTLS2,
+ MSR_VMX_PROCBASED_CTLS2);
+ else
+ proc2 = 0;
+ exit = vmx_settable(basic, MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS);
+ entry = vmx_settable(basic, MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS);
+
+ if (!bootverbose) {
+ comma = 0;
+ if (exit & VM_EXIT_SAVE_PAT && exit & VM_EXIT_LOAD_PAT &&
+ entry & VM_ENTRY_LOAD_PAT) {
+ printf("%sPAT", comma ? "," : "");
+ comma = 1;
+ }
+ if (proc & PROCBASED_HLT_EXITING) {
+ printf("%sHLT", comma ? "," : "");
+ comma = 1;
+ }
+ if (proc & PROCBASED_MTF) {
+ printf("%sMTF", comma ? "," : "");
+ comma = 1;
+ }
+ if (proc & PROCBASED_PAUSE_EXITING) {
+ printf("%sPAUSE", comma ? "," : "");
+ comma = 1;
+ }
+ if (proc2 & PROCBASED2_ENABLE_EPT) {
+ printf("%sEPT", comma ? "," : "");
+ comma = 1;
+ }
+ if (proc2 & PROCBASED2_UNRESTRICTED_GUEST) {
+ printf("%sUG", comma ? "," : "");
+ comma = 1;
+ }
+ if (proc2 & PROCBASED2_ENABLE_VPID) {
+ printf("%sVPID", comma ? "," : "");
+ comma = 1;
+ }
+ if (proc & PROCBASED_USE_TPR_SHADOW &&
+ proc2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES &&
+ proc2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE &&
+ proc2 & PROCBASED2_APIC_REGISTER_VIRTUALIZATION &&
+ proc2 & PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY) {
+ printf("%sVID", comma ? "," : "");
+ comma = 1;
+ if (pin & PINBASED_POSTED_INTERRUPT)
+ printf(",PostIntr");
+ }
+ return;
+ }
+
+ mask = basic >> 32;
+ printf("Basic Features=0x%b", mask,
+ "\020"
+ "\02132PA" /* 32-bit physical addresses */
+ "\022SMM" /* SMM dual-monitor */
+ "\027INS/OUTS" /* VM-exit info for INS and OUTS */
+ "\030TRUE" /* TRUE_CTLS MSRs */
+ );
+ printf("\n Pin-Based Controls=0x%b", pin,
+ "\020"
+ "\001ExtINT" /* External-interrupt exiting */
+ "\004NMI" /* NMI exiting */
+ "\006VNMI" /* Virtual NMIs */
+ "\007PreTmr" /* Activate VMX-preemption timer */
+ "\010PostIntr" /* Process posted interrupts */
+ );
+ printf("\n Primary Processor Controls=0x%b", proc,
+ "\020"
+ "\003INTWIN" /* Interrupt-window exiting */
+ "\004TSCOff" /* Use TSC offsetting */
+ "\010HLT" /* HLT exiting */
+ "\012INVLPG" /* INVLPG exiting */
+ "\013MWAIT" /* MWAIT exiting */
+ "\014RDPMC" /* RDPMC exiting */
+ "\015RDTSC" /* RDTSC exiting */
+ "\020CR3-LD" /* CR3-load exiting */
+ "\021CR3-ST" /* CR3-store exiting */
+ "\024CR8-LD" /* CR8-load exiting */
+ "\025CR8-ST" /* CR8-store exiting */
+ "\026TPR" /* Use TPR shadow */
+ "\027NMIWIN" /* NMI-window exiting */
+ "\030MOV-DR" /* MOV-DR exiting */
+ "\031IO" /* Unconditional I/O exiting */
+ "\032IOmap" /* Use I/O bitmaps */
+ "\034MTF" /* Monitor trap flag */
+ "\035MSRmap" /* Use MSR bitmaps */
+ "\036MONITOR" /* MONITOR exiting */
+ "\037PAUSE" /* PAUSE exiting */
+ );
+ if (proc & PROCBASED_SECONDARY_CONTROLS)
+ printf("\n Secondary Processor Controls=0x%b", proc2,
+ "\020"
+ "\001APIC" /* Virtualize APIC accesses */
+ "\002EPT" /* Enable EPT */
+ "\003DT" /* Descriptor-table exiting */
+ "\004RDTSCP" /* Enable RDTSCP */
+ "\005x2APIC" /* Virtualize x2APIC mode */
+ "\006VPID" /* Enable VPID */
+ "\007WBINVD" /* WBINVD exiting */
+ "\010UG" /* Unrestricted guest */
+ "\011APIC-reg" /* APIC-register virtualization */
+ "\012VID" /* Virtual-interrupt delivery */
+ "\013PAUSE-loop" /* PAUSE-loop exiting */
+ "\014RDRAND" /* RDRAND exiting */
+ "\015INVPCID" /* Enable INVPCID */
+ "\016VMFUNC" /* Enable VM functions */
+ "\017VMCS" /* VMCS shadowing */
+ "\020EPT#VE" /* EPT-violation #VE */
+ "\021XSAVES" /* Enable XSAVES/XRSTORS */
+ );
+ printf("\n Exit Controls=0x%b", mask,
+ "\020"
+ "\003DR" /* Save debug controls */
+ /* Ignore Host address-space size */
+ "\015PERF" /* Load MSR_PERF_GLOBAL_CTRL */
+ "\020AckInt" /* Acknowledge interrupt on exit */
+ "\023PAT-SV" /* Save MSR_PAT */
+ "\024PAT-LD" /* Load MSR_PAT */
+ "\025EFER-SV" /* Save MSR_EFER */
+ "\026EFER-LD" /* Load MSR_EFER */
+ "\027PTMR-SV" /* Save VMX-preemption timer value */
+ );
+ printf("\n Entry Controls=0x%b", mask,
+ "\020"
+ "\003DR" /* Save debug controls */
+ /* Ignore IA-32e mode guest */
+ /* Ignore Entry to SMM */
+ /* Ignore Deactivate dual-monitor treatment */
+ "\016PERF" /* Load MSR_PERF_GLOBAL_CTRL */
+ "\017PAT" /* Load MSR_PAT */
+ "\020EFER" /* Load MSR_EFER */
+ );
+ if (proc & PROCBASED_SECONDARY_CONTROLS &&
+ (proc2 & (PROCBASED2_ENABLE_EPT | PROCBASED2_ENABLE_VPID)) != 0) {
+ msr = rdmsr(MSR_VMX_EPT_VPID_CAP);
+ mask = msr;
+ printf("\n EPT Features=0x%b", mask,
+ "\020"
+ "\001XO" /* Execute-only translations */
+ "\007PW4" /* Page-walk length of 4 */
+ "\011UC" /* EPT paging-structure mem can be UC */
+ "\017WB" /* EPT paging-structure mem can be WB */
+ "\0212M" /* EPT PDE can map a 2-Mbyte page */
+ "\0221G" /* EPT PDPTE can map a 1-Gbyte page */
+ "\025INVEPT" /* INVEPT is supported */
+ "\026AD" /* Accessed and dirty flags for EPT */
+ "\032single" /* INVEPT single-context type */
+ "\033all" /* INVEPT all-context type */
+ );
+ mask = msr >> 32;
+ printf("\n VPID Features=0x%b", mask,
+ "\020"
+ "\001INVVPID" /* INVVPID is supported */
+ "\011individual" /* INVVPID individual-address type */
+ "\012single" /* INVVPID single-context type */
+ "\013all" /* INVVPID all-context type */
+ /* INVVPID single-context-retaining-globals type */
+ "\014single-globals"
+ );
+ }
+}
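The "%b" conversions above rely on the kernel printf(9) bit-field decoder: the first byte of the pattern string gives the output base (\020 = 16, i.e. hexadecimal) and each following \NNN prefixes the name of bit NNN, counting from 1. A small illustrative sketch (kernel-only; the value 0x21 sets bits 1 and 6):

/* Sketch: decodes the value 0x21 using the same pattern style as above. */
printf("Pin-Based Controls=0x%b\n", 0x21,
    "\020"		/* print the value in base 16 */
    "\001ExtINT"	/* bit 1 -> mask 0x01 */
    "\006VNMI");	/* bit 6 -> mask 0x20 */
/* Output: "Pin-Based Controls=0x21<ExtINT,VNMI>" */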
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index 5a359e9..63a9b3f 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -29,11 +29,14 @@
#ifndef _VMM_H_
#define _VMM_H_
+#include <x86/segments.h>
+
enum vm_suspend_how {
VM_SUSPEND_NONE,
VM_SUSPEND_RESET,
VM_SUSPEND_POWEROFF,
VM_SUSPEND_HALT,
+ VM_SUSPEND_TRIPLEFAULT,
VM_SUSPEND_LAST
};
@@ -75,6 +78,10 @@ enum vm_reg_name {
VM_REG_GUEST_GDTR,
VM_REG_GUEST_EFER,
VM_REG_GUEST_CR2,
+ VM_REG_GUEST_PDPTE0,
+ VM_REG_GUEST_PDPTE1,
+ VM_REG_GUEST_PDPTE2,
+ VM_REG_GUEST_PDPTE3,
VM_REG_LAST
};
@@ -84,6 +91,16 @@ enum x2apic_state {
X2APIC_STATE_LAST
};
+#define VM_INTINFO_VECTOR(info) ((info) & 0xff)
+#define VM_INTINFO_DEL_ERRCODE 0x800
+#define VM_INTINFO_RSVD 0x7ffff000
+#define VM_INTINFO_VALID 0x80000000
+#define VM_INTINFO_TYPE 0x700
+#define VM_INTINFO_HWINTR (0 << 8)
+#define VM_INTINFO_NMI (2 << 8)
+#define VM_INTINFO_HWEXCEPTION (3 << 8)
+#define VM_INTINFO_SWINTR (4 << 8)
+
#ifdef _KERNEL
#define VM_MAX_NAMELEN 32
@@ -99,6 +116,7 @@ struct vioapic;
struct vlapic;
struct vmspace;
struct vm_object;
+struct vm_guest_paging;
struct pmap;
typedef int (*vmm_init_func_t)(int ipinum);
@@ -252,6 +270,14 @@ vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu)
return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING);
}
+#ifdef _SYS_PROC_H_
+static int __inline
+vcpu_should_yield(struct vm *vm, int vcpu)
+{
+ return (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED));
+}
+#endif
+
void *vcpu_stats(struct vm *vm, int vcpu);
void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr);
struct vmspace *vm_get_vmspace(struct vm *vm);
@@ -274,21 +300,63 @@ struct vatpit *vm_atpit(struct vm *vm);
int vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *vme);
/*
- * Returns 0 if there is no exception pending for this vcpu. Returns 1 if an
- * exception is pending and also updates 'vme'. The pending exception is
- * cleared when this function returns.
+ * This function is called after a VM-exit that occurred during exception or
+ * interrupt delivery through the IDT. The format of 'intinfo' is described
+ * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2.
*
- * This function should only be called in the context of the thread that is
- * executing this vcpu.
+ * If a VM-exit handler completes the event delivery successfully then it
+ * should call vm_exit_intinfo() to extinguish the pending event. For e.g.,
+ * if the task switch emulation is triggered via a task gate then it should
+ * call this function with 'intinfo=0' to indicate that the external event
+ * is not pending anymore.
+ *
+ * Return value is 0 on success and non-zero on failure.
*/
-int vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *vme);
+int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t intinfo);
-void vm_inject_gp(struct vm *vm, int vcpuid); /* general protection fault */
-void vm_inject_ud(struct vm *vm, int vcpuid); /* undefined instruction fault */
-void vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2);
+/*
+ * This function is called before every VM-entry to retrieve a pending
+ * event that should be injected into the guest. This function combines
+ * nested events into a double or triple fault.
+ *
+ * Returns 0 if there are no events that need to be injected into the guest
+ * and non-zero otherwise.
+ */
+int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info);
+
+int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2);
enum vm_reg_name vm_segment_name(int seg_encoding);
+struct vm_copyinfo {
+ uint64_t gpa;
+ size_t len;
+ void *hva;
+ void *cookie;
+};
+
+/*
+ * Set up 'copyinfo[]' to copy to/from guest linear address space starting
+ * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for
+ * a copyin or PROT_WRITE for a copyout.
+ *
+ * Returns 0 on success.
+ * Returns 1 if an exception was injected into the guest.
+ * Returns -1 otherwise.
+ *
+ * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if
+ * the return value is 0. The 'copyinfo[]' resources should be freed by calling
+ * 'vm_copy_teardown()' after the copy is done.
+ */
+int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
+ uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
+ int num_copyinfo);
+void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
+ int num_copyinfo);
+void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
+ void *kaddr, size_t len);
+void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
+ struct vm_copyinfo *copyinfo, size_t len);
#endif /* KERNEL */
#define VM_MAXCPU 16 /* maximum virtual cpus */
@@ -322,13 +390,16 @@ struct seg_desc {
uint32_t limit;
uint32_t access;
};
-#define SEG_DESC_TYPE(desc) ((desc)->access & 0x001f)
-#define SEG_DESC_PRESENT(desc) ((desc)->access & 0x0080)
-#define SEG_DESC_DEF32(desc) ((desc)->access & 0x4000)
-#define SEG_DESC_GRANULARITY(desc) ((desc)->access & 0x8000)
-#define SEG_DESC_UNUSABLE(desc) ((desc)->access & 0x10000)
+#define SEG_DESC_TYPE(access) ((access) & 0x001f)
+#define SEG_DESC_DPL(access) (((access) >> 5) & 0x3)
+#define SEG_DESC_PRESENT(access) (((access) & 0x0080) ? 1 : 0)
+#define SEG_DESC_DEF32(access) (((access) & 0x4000) ? 1 : 0)
+#define SEG_DESC_GRANULARITY(access) (((access) & 0x8000) ? 1 : 0)
+#define SEG_DESC_UNUSABLE(access) (((access) & 0x10000) ? 1 : 0)
enum vm_cpu_mode {
+ CPU_MODE_REAL,
+ CPU_MODE_PROTECTED,
CPU_MODE_COMPATIBILITY, /* IA-32E mode (CS.L = 0) */
CPU_MODE_64BIT, /* IA-32E mode (CS.L = 1) */
};
@@ -364,11 +435,14 @@ struct vie {
uint8_t num_valid; /* size of the instruction */
uint8_t num_processed;
+ uint8_t addrsize:4, opsize:4; /* address and operand sizes */
uint8_t rex_w:1, /* REX prefix */
rex_r:1,
rex_x:1,
rex_b:1,
- rex_present:1;
+ rex_present:1,
+ opsize_override:1, /* Operand size override */
+ addrsize_override:1; /* Address size override */
uint8_t mod:2, /* ModRM byte */
reg:4,
@@ -410,6 +484,7 @@ enum vm_exitcode {
VM_EXITCODE_IOAPIC_EOI,
VM_EXITCODE_SUSPENDED,
VM_EXITCODE_INOUT_STR,
+ VM_EXITCODE_TASK_SWITCH,
VM_EXITCODE_MAX
};
@@ -434,6 +509,22 @@ struct vm_inout_str {
struct seg_desc seg_desc;
};
+enum task_switch_reason {
+ TSR_CALL,
+ TSR_IRET,
+ TSR_JMP,
+ TSR_IDT_GATE, /* task gate in IDT */
+};
+
+struct vm_task_switch {
+ uint16_t tsssel; /* new TSS selector */
+ int ext; /* task switch due to external event */
+ uint32_t errcode;
+ int errcode_valid; /* push 'errcode' on the new stack */
+ enum task_switch_reason reason;
+ struct vm_guest_paging paging;
+};
+
struct vm_exit {
enum vm_exitcode exitcode;
int inst_length; /* 0 means unknown */
@@ -448,6 +539,7 @@ struct vm_exit {
struct {
uint64_t gpa;
uint64_t gla;
+ int cs_d; /* CS.D */
struct vm_guest_paging paging;
struct vie vie;
} inst_emul;
@@ -487,7 +579,38 @@ struct vm_exit {
struct {
enum vm_suspend_how how;
} suspended;
+ struct vm_task_switch task_switch;
} u;
};
+/* APIs to inject faults into the guest */
+void vm_inject_fault(void *vm, int vcpuid, int vector, int errcode_valid,
+ int errcode);
+
+static void __inline
+vm_inject_ud(void *vm, int vcpuid)
+{
+ vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0);
+}
+
+static void __inline
+vm_inject_gp(void *vm, int vcpuid)
+{
+ vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0);
+}
+
+static void __inline
+vm_inject_ac(void *vm, int vcpuid, int errcode)
+{
+ vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode);
+}
+
+static void __inline
+vm_inject_ss(void *vm, int vcpuid, int errcode)
+{
+ vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode);
+}
+
+void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2);
+
#endif /* _VMM_H_ */
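The VM_INTINFO_* macros above imply the following 64-bit layout, mirroring the hardware IDT-vectoring/EXITINTINFO formats: bits 7:0 vector, bits 10:8 type, bit 11 error-code valid, bit 31 valid, bits 63:32 error code. A short sketch encoding a pending #GP(0) the same way vcpu_exception_intinfo() later in this change does:

uint64_t info;

info  = IDT_GP;				/* vector, bits 7:0 */
info |= VM_INTINFO_HWEXCEPTION;		/* type 3, bits 10:8 */
info |= VM_INTINFO_DEL_ERRCODE;		/* bit 11: error code is delivered */
info |= (uint64_t)0 << 32;		/* error code 0, bits 63:32 */
info |= VM_INTINFO_VALID;		/* bit 31 */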
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h
index 9b3b00d..e4d839e 100644
--- a/sys/amd64/include/vmm_dev.h
+++ b/sys/amd64/include/vmm_dev.h
@@ -189,6 +189,12 @@ struct vm_cpuset {
#define VM_ACTIVE_CPUS 0
#define VM_SUSPENDED_CPUS 1
+struct vm_intinfo {
+ int vcpuid;
+ uint64_t info1;
+ uint64_t info2;
+};
+
enum {
/* general routines */
IOCNUM_ABIVERS = 0,
@@ -211,6 +217,8 @@ enum {
IOCNUM_GET_SEGMENT_DESCRIPTOR = 23,
/* interrupt injection */
+ IOCNUM_GET_INTINFO = 28,
+ IOCNUM_SET_INTINFO = 29,
IOCNUM_INJECT_EXCEPTION = 30,
IOCNUM_LAPIC_IRQ = 31,
IOCNUM_INJECT_NMI = 32,
@@ -324,4 +332,8 @@ enum {
_IOW('v', IOCNUM_ACTIVATE_CPU, struct vm_activate_cpu)
#define VM_GET_CPUS \
_IOW('v', IOCNUM_GET_CPUSET, struct vm_cpuset)
+#define VM_SET_INTINFO \
+ _IOW('v', IOCNUM_SET_INTINFO, struct vm_intinfo)
+#define VM_GET_INTINFO \
+ _IOWR('v', IOCNUM_GET_INTINFO, struct vm_intinfo)
#endif
diff --git a/sys/amd64/include/vmm_instruction_emul.h b/sys/amd64/include/vmm_instruction_emul.h
index e4c408b..bbd3d88 100644
--- a/sys/amd64/include/vmm_instruction_emul.h
+++ b/sys/amd64/include/vmm_instruction_emul.h
@@ -52,8 +52,8 @@ typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa,
* s
*/
int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie,
- mem_region_read_t mrr, mem_region_write_t mrw,
- void *mrarg);
+ struct vm_guest_paging *paging, mem_region_read_t mrr,
+ mem_region_write_t mrw, void *mrarg);
int vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
uint64_t val, int size);
@@ -108,7 +108,7 @@ void vie_init(struct vie *vie);
*/
#define VIE_INVALID_GLA (1UL << 63) /* a non-canonical address */
int vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
- enum vm_cpu_mode cpu_mode, struct vie *vie);
+ enum vm_cpu_mode cpu_mode, int csd, struct vie *vie);
#endif /* _KERNEL */
#endif /* _VMM_INSTRUCTION_EMUL_H_ */
diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c
index cc97d95..51e5c2c 100644
--- a/sys/amd64/vmm/intel/vmcs.c
+++ b/sys/amd64/vmm/intel/vmcs.c
@@ -103,6 +103,14 @@ vmcs_field_encoding(int ident)
return (VMCS_GUEST_LDTR_SELECTOR);
case VM_REG_GUEST_EFER:
return (VMCS_GUEST_IA32_EFER);
+ case VM_REG_GUEST_PDPTE0:
+ return (VMCS_GUEST_PDPTE0);
+ case VM_REG_GUEST_PDPTE1:
+ return (VMCS_GUEST_PDPTE1);
+ case VM_REG_GUEST_PDPTE2:
+ return (VMCS_GUEST_PDPTE2);
+ case VM_REG_GUEST_PDPTE3:
+ return (VMCS_GUEST_PDPTE3);
default:
return (-1);
}
diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h
index 657d5b0..4e9557c 100644
--- a/sys/amd64/vmm/intel/vmcs.h
+++ b/sys/amd64/vmm/intel/vmcs.h
@@ -346,6 +346,9 @@ vmcs_write(uint32_t encoding, uint64_t val)
#define VMCS_INTR_T_HWINTR (0 << 8)
#define VMCS_INTR_T_NMI (2 << 8)
#define VMCS_INTR_T_HWEXCEPTION (3 << 8)
+#define VMCS_INTR_T_SWINTR (4 << 8)
+#define VMCS_INTR_T_PRIV_SWEXCEPTION (5 << 8)
+#define VMCS_INTR_T_SWEXCEPTION (6 << 8)
#define VMCS_INTR_DEL_ERRCODE (1 << 11)
/*
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
index 2cbb159..b2c5702 100644
--- a/sys/amd64/vmm/intel/vmx.c
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -149,8 +149,6 @@ SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
&cr4_zeros_mask, 0, NULL);
-static int vmx_no_patmsr;
-
static int vmx_initialized;
SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
&vmx_initialized, 0, "Intel VMX initialized");
@@ -158,18 +156,38 @@ SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
/*
* Optional capabilities
*/
+static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW, NULL, NULL);
+
+static int vmx_patmsr;
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, patmsr, CTLFLAG_RD, &vmx_patmsr, 0,
+ "PAT MSR saved and restored in VCMS");
+
static int cap_halt_exit;
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0,
+ "HLT triggers a VM-exit");
+
static int cap_pause_exit;
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit,
+ 0, "PAUSE triggers a VM-exit");
+
static int cap_unrestricted_guest;
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD,
+ &cap_unrestricted_guest, 0, "Unrestricted guests");
+
static int cap_monitor_trap;
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD,
+ &cap_monitor_trap, 0, "Monitor trap flag");
+
static int cap_invpcid;
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid,
+ 0, "Guests are allowed to use INVPCID");
static int virtual_interrupt_delivery;
-SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
&virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");
static int posted_interrupts;
-SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupts, CTLFLAG_RD,
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD,
&posted_interrupts, 0, "APICv posted interrupt support");
static int pirvec;
@@ -618,6 +636,7 @@ vmx_init(int ipinum)
}
/* Check support for VM-exit controls */
+ vmx_patmsr = 1;
error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
VM_EXIT_CTLS_ONE_SETTING,
VM_EXIT_CTLS_ZERO_SETTING,
@@ -637,12 +656,12 @@ vmx_init(int ipinum)
if (bootverbose)
printf("vmm: PAT MSR access not supported\n");
guest_msr_valid(MSR_PAT);
- vmx_no_patmsr = 1;
+ vmx_patmsr = 0;
}
}
/* Check support for VM-entry controls */
- if (!vmx_no_patmsr) {
+ if (vmx_patmsr) {
error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
MSR_VMX_TRUE_ENTRY_CTLS,
VM_ENTRY_CTLS_ONE_SETTING,
@@ -918,7 +937,7 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
* MSR_PAT save/restore support, leave access disabled so accesses
* will be trapped.
*/
- if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT))
+ if (vmx_patmsr && guest_msr_rw(vmx, MSR_PAT))
panic("vmx_vminit: error setting guest pat msr access");
vpid_alloc(vpid, VM_MAXCPU);
@@ -974,7 +993,7 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
vmx->cap[i].proc_ctls = procbased_ctls;
vmx->cap[i].proc_ctls2 = procbased_ctls2;
- vmx->state[i].lastcpu = -1;
+ vmx->state[i].lastcpu = NOCPU;
vmx->state[i].vpid = vpid[i];
msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);
@@ -1047,27 +1066,37 @@ vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
}
static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved");
+static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done");
-static void
-vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
+/*
+ * Invalidate guest mappings identified by its vpid from the TLB.
+ */
+static __inline void
+vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running)
{
struct vmxstate *vmxstate;
struct invvpid_desc invvpid_desc;
vmxstate = &vmx->state[vcpu];
- if (vmxstate->lastcpu == curcpu)
+ if (vmxstate->vpid == 0)
return;
- vmxstate->lastcpu = curcpu;
-
- vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
+ if (!running) {
+ /*
+ * Set the 'lastcpu' to an invalid host cpu.
+ *
+ * This will invalidate TLB entries tagged with the vcpu's
+ * vpid the next time it runs via vmx_set_pcpu_defaults().
+ */
+ vmxstate->lastcpu = NOCPU;
+ return;
+ }
- vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
- vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
- vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
+ KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside "
+ "critical section", __func__, vcpu));
/*
- * If we are using VPIDs then invalidate all mappings tagged with 'vpid'
+ * Invalidate all mappings tagged with 'vpid'
*
* We do this because this vcpu was executing on a different host
* cpu when it last ran. We do not track whether it invalidated
@@ -1081,25 +1110,43 @@ vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
* Note also that this will invalidate mappings tagged with 'vpid'
* for "all" EP4TAs.
*/
- if (vmxstate->vpid != 0) {
- if (pmap->pm_eptgen == vmx->eptgen[curcpu]) {
- invvpid_desc._res1 = 0;
- invvpid_desc._res2 = 0;
- invvpid_desc.vpid = vmxstate->vpid;
- invvpid_desc.linear_addr = 0;
- invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
- } else {
- /*
- * The invvpid can be skipped if an invept is going to
- * be performed before entering the guest. The invept
- * will invalidate combined mappings tagged with
- * 'vmx->eptp' for all vpids.
- */
- vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1);
- }
+ if (pmap->pm_eptgen == vmx->eptgen[curcpu]) {
+ invvpid_desc._res1 = 0;
+ invvpid_desc._res2 = 0;
+ invvpid_desc.vpid = vmxstate->vpid;
+ invvpid_desc.linear_addr = 0;
+ invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
+ vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1);
+ } else {
+ /*
+ * The invvpid can be skipped if an invept is going to
+ * be performed before entering the guest. The invept
+ * will invalidate combined mappings tagged with
+ * 'vmx->eptp' for all vpids.
+ */
+ vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1);
}
}
+static void
+vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
+{
+ struct vmxstate *vmxstate;
+
+ vmxstate = &vmx->state[vcpu];
+ if (vmxstate->lastcpu == curcpu)
+ return;
+
+ vmxstate->lastcpu = curcpu;
+
+ vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
+
+ vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
+ vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
+ vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
+ vmx_invvpid(vmx, vcpu, pmap, 1);
+}
+
/*
* We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
*/
@@ -1183,24 +1230,32 @@ vmx_inject_nmi(struct vmx *vmx, int vcpu)
static void
vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic)
{
- struct vm_exception exc;
int vector, need_nmi_exiting, extint_pending;
- uint64_t rflags;
+ uint64_t rflags, entryinfo;
uint32_t gi, info;
- if (vm_exception_pending(vmx->vm, vcpu, &exc)) {
- KASSERT(exc.vector >= 0 && exc.vector < 32,
- ("%s: invalid exception vector %d", __func__, exc.vector));
+ if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) {
+ KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry "
+ "intinfo is not valid: %#lx", __func__, entryinfo));
info = vmcs_read(VMCS_ENTRY_INTR_INFO);
KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject "
- "pending exception %d: %#x", __func__, exc.vector, info));
+ "pending exception: %#lx/%#x", __func__, entryinfo, info));
- info = exc.vector | VMCS_INTR_T_HWEXCEPTION | VMCS_INTR_VALID;
- if (exc.error_code_valid) {
- info |= VMCS_INTR_DEL_ERRCODE;
- vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, exc.error_code);
+ info = entryinfo;
+ vector = info & 0xff;
+ if (vector == IDT_BP || vector == IDT_OF) {
+ /*
+ * VT-x requires #BP and #OF to be injected as software
+ * exceptions.
+ */
+ info &= ~VMCS_INTR_T_MASK;
+ info |= VMCS_INTR_T_SWEXCEPTION;
}
+
+ if (info & VMCS_INTR_DEL_ERRCODE)
+ vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32);
+
vmcs_write(VMCS_ENTRY_INTR_INFO, info);
}
@@ -1379,6 +1434,16 @@ vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid)
vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
}
+static void
+vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid)
+{
+ uint32_t gi;
+
+ gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
+ KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING,
+ ("NMI blocking is not in effect %#x", gi));
+}
+
static int
vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
{
@@ -1659,11 +1724,19 @@ vmx_cpl(void)
static enum vm_cpu_mode
vmx_cpu_mode(void)
{
+ uint32_t csar;
- if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA)
- return (CPU_MODE_64BIT);
- else
- return (CPU_MODE_COMPATIBILITY);
+ if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) {
+ csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
+ if (csar & 0x2000)
+ return (CPU_MODE_64BIT); /* CS.L = 1 */
+ else
+ return (CPU_MODE_COMPATIBILITY);
+ } else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) {
+ return (CPU_MODE_PROTECTED);
+ } else {
+ return (CPU_MODE_REAL);
+ }
}
static enum vm_paging_mode
@@ -1757,10 +1830,25 @@ vmx_paging_info(struct vm_guest_paging *paging)
static void
vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla)
{
+ struct vm_guest_paging *paging;
+ uint32_t csar;
+
+ paging = &vmexit->u.inst_emul.paging;
+
vmexit->exitcode = VM_EXITCODE_INST_EMUL;
vmexit->u.inst_emul.gpa = gpa;
vmexit->u.inst_emul.gla = gla;
- vmx_paging_info(&vmexit->u.inst_emul.paging);
+ vmx_paging_info(paging);
+ switch (paging->cpu_mode) {
+ case CPU_MODE_PROTECTED:
+ case CPU_MODE_COMPATIBILITY:
+ csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
+ vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar);
+ break;
+ default:
+ vmexit->u.inst_emul.cs_d = 0;
+ break;
+ }
}
static int
@@ -1969,6 +2057,26 @@ vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
return (UNHANDLED);
}
+static enum task_switch_reason
+vmx_task_switch_reason(uint64_t qual)
+{
+ int reason;
+
+ reason = (qual >> 30) & 0x3;
+ switch (reason) {
+ case 0:
+ return (TSR_CALL);
+ case 1:
+ return (TSR_IRET);
+ case 2:
+ return (TSR_JMP);
+ case 3:
+ return (TSR_IDT_GATE);
+ default:
+ panic("%s: invalid reason %d", __func__, reason);
+ }
+}
+
static int
vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
{
@@ -1976,9 +2084,10 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
struct vmxctx *vmxctx;
struct vlapic *vlapic;
struct vm_inout_str *vis;
+ struct vm_task_switch *ts;
uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info;
- uint32_t reason;
- uint64_t qual, gpa;
+ uint32_t intr_type, reason;
+ uint64_t exitintinfo, qual, gpa;
bool retu;
CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0);
@@ -1994,46 +2103,99 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
/*
- * VM exits that could be triggered during event injection on the
- * previous VM entry need to be handled specially by re-injecting
- * the event.
+ * VM exits that can be triggered during event delivery need to
+ * be handled specially by re-injecting the event if the IDT
+ * vectoring information field's valid bit is set.
*
* See "Information for VM Exits During Event Delivery" in Intel SDM
* for details.
*/
- switch (reason) {
- case EXIT_REASON_EPT_FAULT:
- case EXIT_REASON_EPT_MISCONFIG:
- case EXIT_REASON_APIC_ACCESS:
- case EXIT_REASON_TASK_SWITCH:
- case EXIT_REASON_EXCEPTION:
- idtvec_info = vmcs_idt_vectoring_info();
- if (idtvec_info & VMCS_IDT_VEC_VALID) {
- idtvec_info &= ~(1 << 12); /* clear undefined bit */
- vmcs_write(VMCS_ENTRY_INTR_INFO, idtvec_info);
- if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
- idtvec_err = vmcs_idt_vectoring_err();
- vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR,
- idtvec_err);
- }
- /*
- * If 'virtual NMIs' are being used and the VM-exit
- * happened while injecting an NMI during the previous
- * VM-entry, then clear "blocking by NMI" in the Guest
- * Interruptibility-state.
- */
- if ((idtvec_info & VMCS_INTR_T_MASK) ==
- VMCS_INTR_T_NMI) {
- vmx_clear_nmi_blocking(vmx, vcpu);
- }
+ idtvec_info = vmcs_idt_vectoring_info();
+ if (idtvec_info & VMCS_IDT_VEC_VALID) {
+ idtvec_info &= ~(1 << 12); /* clear undefined bit */
+ exitintinfo = idtvec_info;
+ if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
+ idtvec_err = vmcs_idt_vectoring_err();
+ exitintinfo |= (uint64_t)idtvec_err << 32;
+ }
+ error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo);
+ KASSERT(error == 0, ("%s: vm_set_intinfo error %d",
+ __func__, error));
+
+ /*
+ * If 'virtual NMIs' are being used and the VM-exit
+ * happened while injecting an NMI during the previous
+ * VM-entry, then clear "blocking by NMI" in the
+ * Guest Interruptibility-State so the NMI can be
+ * reinjected on the subsequent VM-entry.
+ *
+ * However, if the NMI was being delivered through a task
+ * gate, then the new task must start execution with NMIs
+ * blocked so don't clear NMI blocking in this case.
+ */
+ intr_type = idtvec_info & VMCS_INTR_T_MASK;
+ if (intr_type == VMCS_INTR_T_NMI) {
+ if (reason != EXIT_REASON_TASK_SWITCH)
+ vmx_clear_nmi_blocking(vmx, vcpu);
+ else
+ vmx_assert_nmi_blocking(vmx, vcpu);
+ }
+
+ /*
+ * Update VM-entry instruction length if the event being
+ * delivered was a software interrupt or software exception.
+ */
+ if (intr_type == VMCS_INTR_T_SWINTR ||
+ intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION ||
+ intr_type == VMCS_INTR_T_SWEXCEPTION) {
vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
}
- default:
- idtvec_info = 0;
- break;
}
switch (reason) {
+ case EXIT_REASON_TASK_SWITCH:
+ ts = &vmexit->u.task_switch;
+ ts->tsssel = qual & 0xffff;
+ ts->reason = vmx_task_switch_reason(qual);
+ ts->ext = 0;
+ ts->errcode_valid = 0;
+ vmx_paging_info(&ts->paging);
+ /*
+ * If the task switch was due to a CALL, JMP, IRET, software
+ * interrupt (INT n) or software exception (INT3, INTO),
+ * then the saved %rip references the instruction that caused
+ * the task switch. The instruction length field in the VMCS
+ * is valid in this case.
+ *
+ * In all other cases (e.g., NMI, hardware exception) the
+ * saved %rip is one that would have been saved in the old TSS
+ * had the task switch completed normally so the instruction
+ * length field is not needed in this case and is explicitly
+ * set to 0.
+ */
+ if (ts->reason == TSR_IDT_GATE) {
+ KASSERT(idtvec_info & VMCS_IDT_VEC_VALID,
+ ("invalid idtvec_info %#x for IDT task switch",
+ idtvec_info));
+ intr_type = idtvec_info & VMCS_INTR_T_MASK;
+ if (intr_type != VMCS_INTR_T_SWINTR &&
+ intr_type != VMCS_INTR_T_SWEXCEPTION &&
+ intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) {
+ /* Task switch triggered by external event */
+ ts->ext = 1;
+ vmexit->inst_length = 0;
+ if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
+ ts->errcode_valid = 1;
+ ts->errcode = vmcs_idt_vectoring_err();
+ }
+ }
+ }
+ vmexit->exitcode = VM_EXITCODE_TASK_SWITCH;
+ VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, "
+ "%s errcode 0x%016lx", ts->reason, ts->tsssel,
+ ts->ext ? "external" : "internal",
+ ((uint64_t)ts->errcode << 32) | ts->errcode_valid);
+ break;
case EXIT_REASON_CR_ACCESS:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
switch (qual & 0xf) {
@@ -2179,6 +2341,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
* the guest.
*
* See "Resuming Guest Software after Handling an Exception".
+ * See "Information for VM Exits Due to Vectored Events".
*/
if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
(intr_info & 0xff) != IDT_DF &&
@@ -2396,6 +2559,13 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
* pmap_invalidate_ept().
*/
disable_intr();
+ vmx_inject_interrupts(vmx, vcpu, vlapic);
+
+ /*
+ * Check for vcpu suspension after injecting events because
+ * vmx_inject_interrupts() can suspend the vcpu due to a
+ * triple fault.
+ */
if (vcpu_suspended(suspend_cookie)) {
enable_intr();
vm_exit_suspended(vmx->vm, vcpu, vmcs_guest_rip());
@@ -2408,7 +2578,7 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
break;
}
- if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) {
+ if (vcpu_should_yield(vm, vcpu)) {
enable_intr();
vm_exit_astpending(vmx->vm, vcpu, vmcs_guest_rip());
vmx_astpending_trace(vmx, vcpu, vmexit->rip);
@@ -2416,7 +2586,6 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
break;
}
- vmx_inject_interrupts(vmx, vcpu, vlapic);
vmx_run_trace(vmx, vcpu);
rc = vmx_enter_guest(vmxctx, vmx, launched);
@@ -2584,6 +2753,7 @@ vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
{
int error, hostcpu, running, shadow;
uint64_t ctls;
+ pmap_t pmap;
struct vmx *vmx = arg;
running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
@@ -2621,6 +2791,18 @@ vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
error = vmcs_setreg(&vmx->vmcs[vcpu], running,
VMCS_IDENT(shadow), val);
}
+
+ if (reg == VM_REG_GUEST_CR3) {
+ /*
+ * Invalidate the guest vcpu's TLB mappings to emulate
+ * the behavior of updating %cr3.
+ *
+ * XXX the processor retains global mappings when %cr3
+ * is updated but vmx_invvpid() does not.
+ */
+ pmap = vmx->ctx[vcpu].pmap;
+ vmx_invvpid(vmx, vcpu, pmap, running);
+ }
}
return (error);
diff --git a/sys/amd64/vmm/intel/vmx_msr.c b/sys/amd64/vmm/intel/vmx_msr.c
index 2aba63c..a3428db 100644
--- a/sys/amd64/vmm/intel/vmx_msr.c
+++ b/sys/amd64/vmm/intel/vmx_msr.c
@@ -33,6 +33,7 @@ __FBSDID("$FreeBSD$");
#include <sys/systm.h>
#include <machine/cpufunc.h>
+#include <machine/specialreg.h>
#include "vmx_msr.h"
diff --git a/sys/amd64/vmm/intel/vmx_msr.h b/sys/amd64/vmm/intel/vmx_msr.h
index e6379a9..340b0f7 100644
--- a/sys/amd64/vmm/intel/vmx_msr.h
+++ b/sys/amd64/vmm/intel/vmx_msr.h
@@ -29,29 +29,6 @@
#ifndef _VMX_MSR_H_
#define _VMX_MSR_H_
-#define MSR_VMX_BASIC 0x480
-#define MSR_VMX_EPT_VPID_CAP 0x48C
-
-#define MSR_VMX_PROCBASED_CTLS 0x482
-#define MSR_VMX_TRUE_PROCBASED_CTLS 0x48E
-
-#define MSR_VMX_PINBASED_CTLS 0x481
-#define MSR_VMX_TRUE_PINBASED_CTLS 0x48D
-
-#define MSR_VMX_PROCBASED_CTLS2 0x48B
-
-#define MSR_VMX_EXIT_CTLS 0x483
-#define MSR_VMX_TRUE_EXIT_CTLS 0x48f
-
-#define MSR_VMX_ENTRY_CTLS 0x484
-#define MSR_VMX_TRUE_ENTRY_CTLS 0x490
-
-#define MSR_VMX_CR0_FIXED0 0x486
-#define MSR_VMX_CR0_FIXED1 0x487
-
-#define MSR_VMX_CR4_FIXED0 0x488
-#define MSR_VMX_CR4_FIXED1 0x489
-
uint32_t vmx_revision(void);
int vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
diff --git a/sys/amd64/vmm/intel/vtd.c b/sys/amd64/vmm/intel/vtd.c
index ca76ea8..f5ef71b 100644
--- a/sys/amd64/vmm/intel/vtd.c
+++ b/sys/amd64/vmm/intel/vtd.c
@@ -452,6 +452,11 @@ vtd_update_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len,
ptpindex = 0;
ptpshift = 0;
+ KASSERT(gpa + len > gpa, ("%s: invalid gpa range %#lx/%#lx", __func__,
+ gpa, len));
+ KASSERT(gpa + len <= dom->maxaddr, ("%s: gpa range %#lx/%#lx beyond "
+ "domain maxaddr %#lx", __func__, gpa, len, dom->maxaddr));
+
if (gpa & PAGE_MASK)
panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa);
diff --git a/sys/amd64/vmm/io/vatpic.c b/sys/amd64/vmm/io/vatpic.c
index ee6fc84..38fc458 100644
--- a/sys/amd64/vmm/io/vatpic.c
+++ b/sys/amd64/vmm/io/vatpic.c
@@ -195,26 +195,29 @@ vatpic_notify_intr(struct vatpic *vatpic)
atpic->mask, atpic->request, atpic->service);
/*
+ * From Section 3.6.2, "Interrupt Modes", in the
+ * MPtable Specification, Version 1.4
+ *
* PIC interrupts are routed to both the Local APIC
* and the I/O APIC to support operation in 1 of 3
* modes.
*
* 1. Legacy PIC Mode: the PIC effectively bypasses
- * all APIC components. In mode '1' the local APIC is
+ * all APIC components. In this mode the local APIC is
* disabled and LINT0 is reconfigured as INTR to
* deliver the PIC interrupt directly to the CPU.
*
* 2. Virtual Wire Mode: the APIC is treated as a
* virtual wire which delivers interrupts from the PIC
- * to the CPU. In mode '2' LINT0 is programmed as
+ * to the CPU. In this mode LINT0 is programmed as
* ExtINT to indicate that the PIC is the source of
* the interrupt.
*
- * 3. Symmetric I/O Mode: PIC interrupts are fielded
- * by the I/O APIC and delivered to the appropriate
- * CPU. In mode '3' the I/O APIC input 0 is
- * programmed as ExtINT to indicate that the PIC is
- * the source of the interrupt.
+ * 3. Virtual Wire Mode via I/O APIC: PIC interrupts are
+ * fielded by the I/O APIC and delivered to the appropriate
+ * CPU. In this mode the I/O APIC input 0 is programmed
+ * as ExtINT to indicate that the PIC is the source of the
+ * interrupt.
*/
atpic->intr_raised = true;
lapic_set_local_intr(vatpic->vm, -1, APIC_LVT_LINT0);
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index c2a9fd1..fa0200e 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -97,6 +97,7 @@ struct vcpu {
int hostcpu; /* (o) vcpu's host cpu */
struct vlapic *vlapic; /* (i) APIC device model */
enum x2apic_state x2apic_state; /* (i) APIC mode */
+ uint64_t exitintinfo; /* (i) events pending at VM exit */
int nmi_pending; /* (i) NMI pending */
int extint_pending; /* (i) INTR pending */
struct vm_exception exception; /* (x) exception collateral */
@@ -242,6 +243,7 @@ vcpu_init(struct vm *vm, int vcpu_id, bool create)
vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
+ vcpu->exitintinfo = 0;
vcpu->nmi_pending = 0;
vcpu->extint_pending = 0;
vcpu->exception_pending = 0;
@@ -571,6 +573,21 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
return (0);
}
+static vm_paddr_t
+vm_maxmem(struct vm *vm)
+{
+ int i;
+ vm_paddr_t gpa, maxmem;
+
+ maxmem = 0;
+ for (i = 0; i < vm->num_mem_segs; i++) {
+ gpa = vm->mem_segs[i].gpa + vm->mem_segs[i].len;
+ if (gpa > maxmem)
+ maxmem = gpa;
+ }
+ return (maxmem);
+}
+
static void
vm_gpa_unwire(struct vm *vm)
{
@@ -708,7 +725,7 @@ vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
if (ppt_assigned_devices(vm) == 0) {
KASSERT(vm->iommu == NULL,
("vm_assign_pptdev: iommu must be NULL"));
- maxaddr = vmm_mem_maxaddr();
+ maxaddr = vm_maxmem(vm);
vm->iommu = iommu_create_domain(maxaddr);
error = vm_gpa_wire(vm);
@@ -1104,6 +1121,10 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
}
}
+ /* Don't go to sleep if the vcpu thread needs to yield */
+ if (vcpu_should_yield(vm, vcpuid))
+ break;
+
/*
* Some Linux guests implement "halt" by having all vcpus
* execute HLT with interrupts disabled. 'halted_cpus' keeps
@@ -1127,7 +1148,11 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
t = ticks;
vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
- msleep_spin(vcpu, &vcpu->mtx, wmesg, 0);
+ /*
+ * XXX msleep_spin() cannot be interrupted by signals so
+ * wake up periodically to check pending signals.
+ */
+ msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);
vcpu_require_state_locked(vcpu, VCPU_FROZEN);
vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
}
@@ -1191,15 +1216,18 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
struct vm_guest_paging *paging;
mem_region_read_t mread;
mem_region_write_t mwrite;
- int error;
+ enum vm_cpu_mode cpu_mode;
+ int cs_d, error;
vcpu = &vm->vcpu[vcpuid];
vme = &vcpu->exitinfo;
gla = vme->u.inst_emul.gla;
gpa = vme->u.inst_emul.gpa;
+ cs_d = vme->u.inst_emul.cs_d;
vie = &vme->u.inst_emul.vie;
paging = &vme->u.inst_emul.paging;
+ cpu_mode = paging->cpu_mode;
vie_init(vie);
@@ -1213,7 +1241,7 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
else if (error != 0)
panic("%s: vmm_fetch_instruction error %d", __func__, error);
- if (vmm_decode_instruction(vm, vcpuid, gla, paging->cpu_mode, vie) != 0)
+ if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0)
return (EFAULT);
/* return to userland unless this is an in-kernel emulated device */
@@ -1231,8 +1259,8 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
return (0);
}
- error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite,
- retu);
+ error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging,
+ mread, mwrite, retu);
return (error);
}
@@ -1456,6 +1484,202 @@ restart:
}
int
+vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
+{
+ struct vcpu *vcpu;
+ int type, vector;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ if (info & VM_INTINFO_VALID) {
+ type = info & VM_INTINFO_TYPE;
+ vector = info & 0xff;
+ if (type == VM_INTINFO_NMI && vector != IDT_NMI)
+ return (EINVAL);
+ if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
+ return (EINVAL);
+ if (info & VM_INTINFO_RSVD)
+ return (EINVAL);
+ } else {
+ info = 0;
+ }
+ VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info);
+ vcpu->exitintinfo = info;
+ return (0);
+}
+
+enum exc_class {
+ EXC_BENIGN,
+ EXC_CONTRIBUTORY,
+ EXC_PAGEFAULT
+};
+
+#define IDT_VE 20 /* Virtualization Exception (Intel specific) */
+
+static enum exc_class
+exception_class(uint64_t info)
+{
+ int type, vector;
+
+ KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info));
+ type = info & VM_INTINFO_TYPE;
+ vector = info & 0xff;
+
+ /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
+ switch (type) {
+ case VM_INTINFO_HWINTR:
+ case VM_INTINFO_SWINTR:
+ case VM_INTINFO_NMI:
+ return (EXC_BENIGN);
+ default:
+ /*
+ * Hardware exception.
+ *
+ * SVM and VT-x use identical type values to represent NMI,
+ * hardware interrupt and software interrupt.
+ *
+ * SVM uses type '3' for all exceptions. VT-x uses type '3'
+ * for exceptions except #BP and #OF. #BP and #OF use a type
+ * value of '5' or '6'. Therefore we don't check for explicit
+ * values of 'type' to classify 'intinfo' into a hardware
+ * exception.
+ */
+ break;
+ }
+
+ switch (vector) {
+ case IDT_PF:
+ case IDT_VE:
+ return (EXC_PAGEFAULT);
+ case IDT_DE:
+ case IDT_TS:
+ case IDT_NP:
+ case IDT_SS:
+ case IDT_GP:
+ return (EXC_CONTRIBUTORY);
+ default:
+ return (EXC_BENIGN);
+ }
+}
+
+static int
+nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
+ uint64_t *retinfo)
+{
+ enum exc_class exc1, exc2;
+ int type1, vector1;
+
+ KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1));
+ KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2));
+
+ /*
+ * If an exception occurs while attempting to call the double-fault
+ * handler the processor enters shutdown mode (aka triple fault).
+ */
+ type1 = info1 & VM_INTINFO_TYPE;
+ vector1 = info1 & 0xff;
+ if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
+ VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)",
+ info1, info2);
+ vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
+ *retinfo = 0;
+ return (0);
+ }
+
+ /*
+ * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
+ */
+ exc1 = exception_class(info1);
+ exc2 = exception_class(info2);
+ if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
+ (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
+ /* Convert nested fault into a double fault. */
+ *retinfo = IDT_DF;
+ *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
+ *retinfo |= VM_INTINFO_DEL_ERRCODE;
+ } else {
+ /* Handle exceptions serially */
+ *retinfo = info2;
+ }
+ return (1);
+}
+
+static uint64_t
+vcpu_exception_intinfo(struct vcpu *vcpu)
+{
+ uint64_t info = 0;
+
+ if (vcpu->exception_pending) {
+ info = vcpu->exception.vector & 0xff;
+ info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
+ if (vcpu->exception.error_code_valid) {
+ info |= VM_INTINFO_DEL_ERRCODE;
+ info |= (uint64_t)vcpu->exception.error_code << 32;
+ }
+ }
+ return (info);
+}
+
+int
+vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
+{
+ struct vcpu *vcpu;
+ uint64_t info1, info2;
+ int valid;
+
+ KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ info1 = vcpu->exitintinfo;
+ vcpu->exitintinfo = 0;
+
+ info2 = 0;
+ if (vcpu->exception_pending) {
+ info2 = vcpu_exception_intinfo(vcpu);
+ vcpu->exception_pending = 0;
+ VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx",
+ vcpu->exception.vector, info2);
+ }
+
+ if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
+ valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
+ } else if (info1 & VM_INTINFO_VALID) {
+ *retinfo = info1;
+ valid = 1;
+ } else if (info2 & VM_INTINFO_VALID) {
+ *retinfo = info2;
+ valid = 1;
+ } else {
+ valid = 0;
+ }
+
+ if (valid) {
+ VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), "
+ "retinfo(%#lx)", __func__, info1, info2, *retinfo);
+ }
+
+ return (valid);
+}
+
+int
+vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
+{
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ vcpu = &vm->vcpu[vcpuid];
+ *info1 = vcpu->exitintinfo;
+ *info2 = vcpu_exception_intinfo(vcpu);
+ return (0);
+}
+
+int
vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
{
struct vcpu *vcpu;
@@ -1466,6 +1690,14 @@ vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
if (exception->vector < 0 || exception->vector >= 32)
return (EINVAL);
+ /*
+ * A double fault exception should never be injected directly into
+ * the guest. It is a derived exception that results from specific
+ * combinations of nested faults.
+ */
+ if (exception->vector == IDT_DF)
+ return (EINVAL);
+
vcpu = &vm->vcpu[vcpuid];
if (vcpu->exception_pending) {
@@ -1481,32 +1713,21 @@ vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
return (0);
}
-int
-vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *exception)
-{
- struct vcpu *vcpu;
- int pending;
-
- KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));
-
- vcpu = &vm->vcpu[vcpuid];
- pending = vcpu->exception_pending;
- if (pending) {
- vcpu->exception_pending = 0;
- *exception = vcpu->exception;
- VCPU_CTR1(vm, vcpuid, "Exception %d delivered",
- exception->vector);
- }
- return (pending);
-}
-
-static void
-vm_inject_fault(struct vm *vm, int vcpuid, struct vm_exception *exception)
+void
+vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid,
+ int errcode)
{
+ struct vm_exception exception;
struct vm_exit *vmexit;
+ struct vm *vm;
int error;
- error = vm_inject_exception(vm, vcpuid, exception);
+ vm = vmarg;
+
+ exception.vector = vector;
+ exception.error_code = errcode;
+ exception.error_code_valid = errcode_valid;
+ error = vm_inject_exception(vm, vcpuid, &exception);
KASSERT(error == 0, ("vm_inject_exception error %d", error));
/*
@@ -1521,45 +1742,19 @@ vm_inject_fault(struct vm *vm, int vcpuid, struct vm_exception *exception)
}
void
-vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2)
+vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2)
{
- struct vm_exception pf = {
- .vector = IDT_PF,
- .error_code_valid = 1,
- .error_code = error_code
- };
+ struct vm *vm;
int error;
+ vm = vmarg;
VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx",
error_code, cr2);
error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
- vm_inject_fault(vm, vcpuid, &pf);
-}
-
-void
-vm_inject_gp(struct vm *vm, int vcpuid)
-{
- struct vm_exception gpf = {
- .vector = IDT_GP,
- .error_code_valid = 1,
- .error_code = 0
- };
-
- vm_inject_fault(vm, vcpuid, &gpf);
-}
-
-void
-vm_inject_ud(struct vm *vm, int vcpuid)
-{
- struct vm_exception udf = {
- .vector = IDT_UD,
- .error_code_valid = 0
- };
-
- vm_inject_fault(vm, vcpuid, &udf);
+ vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
}
static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
@@ -1993,6 +2188,97 @@ vm_segment_name(int seg)
return (seg_names[seg]);
}
+void
+vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
+ int num_copyinfo)
+{
+ int idx;
+
+ for (idx = 0; idx < num_copyinfo; idx++) {
+ if (copyinfo[idx].cookie != NULL)
+ vm_gpa_release(copyinfo[idx].cookie);
+ }
+ bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo));
+}
+
+int
+vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
+ uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
+ int num_copyinfo)
+{
+ int error, idx, nused;
+ size_t n, off, remaining;
+ void *hva, *cookie;
+ uint64_t gpa;
+
+ bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo);
+
+ nused = 0;
+ remaining = len;
+ while (remaining > 0) {
+ KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
+ error = vmm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa);
+ if (error)
+ return (error);
+ off = gpa & PAGE_MASK;
+ n = min(remaining, PAGE_SIZE - off);
+ copyinfo[nused].gpa = gpa;
+ copyinfo[nused].len = n;
+ remaining -= n;
+ gla += n;
+ nused++;
+ }
+
+ for (idx = 0; idx < nused; idx++) {
+ hva = vm_gpa_hold(vm, copyinfo[idx].gpa, copyinfo[idx].len,
+ prot, &cookie);
+ if (hva == NULL)
+ break;
+ copyinfo[idx].hva = hva;
+ copyinfo[idx].cookie = cookie;
+ }
+
+ if (idx != nused) {
+ vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
+ return (-1);
+ } else {
+ return (0);
+ }
+}
+
+void
+vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
+ size_t len)
+{
+ char *dst;
+ int idx;
+
+ dst = kaddr;
+ idx = 0;
+ while (len > 0) {
+ bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
+ len -= copyinfo[idx].len;
+ dst += copyinfo[idx].len;
+ idx++;
+ }
+}
+
+void
+vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
+ struct vm_copyinfo *copyinfo, size_t len)
+{
+ const char *src;
+ int idx;
+
+ src = kaddr;
+ idx = 0;
+ while (len > 0) {
+ bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
+ len -= copyinfo[idx].len;
+ src += copyinfo[idx].len;
+ idx++;
+ }
+}
/*
* Return the amount of in-use and wired memory for the VM. Since
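A few example combinations of the nested-fault rules implemented by exception_class() and nested_fault() above, per Table 6-5, "Conditions for Generating a Double Fault", Intel SDM, Vol 3:

/*
 * info1 (event being delivered)   info2 (new exception)   outcome
 * ------------------------------  ----------------------  ----------------------
 * #GP  (contributory)             #NP  (contributory)     #DF injected
 * #PF  (page fault)               #GP  (contributory)     #DF injected
 * #GP  (contributory)             #PF  (page fault)       handled serially
 * INTR (benign)                   #GP  (contributory)     handled serially
 * #DF  (double fault)             any exception           VM_SUSPEND_TRIPLEFAULT
 */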
diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c
index f3e31a3..a85109e 100644
--- a/sys/amd64/vmm/vmm_dev.c
+++ b/sys/amd64/vmm/vmm_dev.c
@@ -173,6 +173,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
struct vm_gla2gpa *gg;
struct vm_activate_cpu *vac;
struct vm_cpuset *vm_cpuset;
+ struct vm_intinfo *vmii;
sc = vmmdev_lookup2(cdev);
if (sc == NULL)
@@ -199,6 +200,8 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
case VM_SET_X2APIC_STATE:
case VM_GLA2GPA:
case VM_ACTIVATE_CPU:
+ case VM_SET_INTINFO:
+ case VM_GET_INTINFO:
/*
* XXX fragile, handle with care
* Assumes that the first field of the ioctl data is the vcpu.
@@ -470,6 +473,15 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
error = copyout(cpuset, vm_cpuset->cpus, size);
free(cpuset, M_TEMP);
break;
+ case VM_SET_INTINFO:
+ vmii = (struct vm_intinfo *)data;
+ error = vm_exit_intinfo(sc->vm, vmii->vcpuid, vmii->info1);
+ break;
+ case VM_GET_INTINFO:
+ vmii = (struct vm_intinfo *)data;
+ error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1,
+ &vmii->info2);
+ break;
default:
error = ENOTTY;
break;
diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c
index 921deb5..a65b125 100644
--- a/sys/amd64/vmm/vmm_instruction_emul.c
+++ b/sys/amd64/vmm/vmm_instruction_emul.c
@@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$");
#else /* !_KERNEL */
#include <sys/types.h>
#include <sys/errno.h>
+#include <sys/_iovec.h>
#include <machine/vmm.h>
@@ -65,18 +66,26 @@ enum {
VIE_OP_TYPE_AND,
VIE_OP_TYPE_OR,
VIE_OP_TYPE_TWO_BYTE,
+ VIE_OP_TYPE_PUSH,
+ VIE_OP_TYPE_CMP,
VIE_OP_TYPE_LAST
};
/* struct vie_op.op_flags */
-#define VIE_OP_F_IMM (1 << 0) /* immediate operand present */
-#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */
+#define VIE_OP_F_IMM (1 << 0) /* 16/32-bit immediate operand */
+#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */
+#define VIE_OP_F_MOFFSET (1 << 2) /* 16/32/64-bit immediate moffset */
+#define VIE_OP_F_NO_MODRM (1 << 3)
static const struct vie_op two_byte_opcodes[256] = {
[0xB6] = {
.op_byte = 0xB6,
.op_type = VIE_OP_TYPE_MOVZX,
},
+ [0xB7] = {
+ .op_byte = 0xB7,
+ .op_type = VIE_OP_TYPE_MOVZX,
+ },
[0xBE] = {
.op_byte = 0xBE,
.op_type = VIE_OP_TYPE_MOVSX,
@@ -88,6 +97,10 @@ static const struct vie_op one_byte_opcodes[256] = {
.op_byte = 0x0F,
.op_type = VIE_OP_TYPE_TWO_BYTE
},
+ [0x3B] = {
+ .op_byte = 0x3B,
+ .op_type = VIE_OP_TYPE_CMP,
+ },
[0x88] = {
.op_byte = 0x88,
.op_type = VIE_OP_TYPE_MOV,
@@ -104,6 +117,22 @@ static const struct vie_op one_byte_opcodes[256] = {
.op_byte = 0x8B,
.op_type = VIE_OP_TYPE_MOV,
},
+ [0xA1] = {
+ .op_byte = 0xA1,
+ .op_type = VIE_OP_TYPE_MOV,
+ .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
+ },
+ [0xA3] = {
+ .op_byte = 0xA3,
+ .op_type = VIE_OP_TYPE_MOV,
+ .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
+ },
+ [0xC6] = {
+ /* XXX Group 11 extended opcode - not just MOV */
+ .op_byte = 0xC6,
+ .op_type = VIE_OP_TYPE_MOV,
+ .op_flags = VIE_OP_F_IMM8,
+ },
[0xC7] = {
.op_byte = 0xC7,
.op_type = VIE_OP_TYPE_MOV,
@@ -125,6 +154,11 @@ static const struct vie_op one_byte_opcodes[256] = {
.op_type = VIE_OP_TYPE_OR,
.op_flags = VIE_OP_F_IMM8,
},
+ [0xFF] = {
+ /* XXX Group 5 extended opcode - not just PUSH */
+ .op_byte = 0xFF,
+ .op_type = VIE_OP_TYPE_PUSH,
+ }
};
/* struct vie.mod */
@@ -175,18 +209,15 @@ vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
return (error);
}
-static int
-vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
+static void
+vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
{
- uint64_t val;
- int error, rshift;
- enum vm_reg_name reg;
-
- rshift = 0;
- reg = gpr_map[vie->reg];
+ *lhbr = 0;
+ *reg = gpr_map[vie->reg];
/*
- * 64-bit mode imposes limitations on accessing legacy byte registers.
+ * 64-bit mode imposes limitations on accessing legacy high byte
+ * registers (lhbr).
*
* The legacy high-byte registers cannot be addressed if the REX
* prefix is present. In this case the values 4, 5, 6 and 7 of the
@@ -198,17 +229,56 @@ vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
*/
if (!vie->rex_present) {
if (vie->reg & 0x4) {
- /*
- * Obtain the value of %ah by reading %rax and shifting
- * right by 8 bits (same for %bh, %ch and %dh).
- */
- rshift = 8;
- reg = gpr_map[vie->reg & 0x3];
+ *lhbr = 1;
+ *reg = gpr_map[vie->reg & 0x3];
}
}
+}
+
+static int
+vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
+{
+ uint64_t val;
+ int error, lhbr;
+ enum vm_reg_name reg;
+ vie_calc_bytereg(vie, &reg, &lhbr);
error = vm_get_register(vm, vcpuid, reg, &val);
- *rval = val >> rshift;
+
+ /*
+ * To obtain the value of a legacy high byte register shift the
+ * base register right by 8 bits (%ah = %rax >> 8).
+ */
+ if (lhbr)
+ *rval = val >> 8;
+ else
+ *rval = val;
+ return (error);
+}
+
+static int
+vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte)
+{
+ uint64_t origval, val, mask;
+ int error, lhbr;
+ enum vm_reg_name reg;
+
+ vie_calc_bytereg(vie, &reg, &lhbr);
+ error = vm_get_register(vm, vcpuid, reg, &origval);
+ if (error == 0) {
+ val = byte;
+ mask = 0xff;
+ if (lhbr) {
+ /*
+ * Shift left by 8 to store 'byte' in a legacy high
+ * byte register.
+ */
+ val <<= 8;
+ mask <<= 8;
+ }
+ val |= origval & ~mask;
+ error = vm_set_register(vm, vcpuid, reg, val);
+ }
return (error);
}
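vie_write_bytereg() has to read-modify-write the full 64-bit register because only one byte of it changes, and the extra shift by 8 is what targets %ah/%bh/%ch/%dh. The merge arithmetic, with arbitrary illustrative values (standalone, not patch code):

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

int
main(void)
{
        uint64_t origval = 0x1122334455667788ULL;       /* current %rax */
        uint64_t val = 0xab;                            /* byte to store in %ah */
        uint64_t mask = 0xff;
        int lhbr = 1;                                   /* legacy high byte register */

        if (lhbr) {
                val <<= 8;                              /* position the byte at bits 15:8 */
                mask <<= 8;
        }
        val |= origval & ~mask;
        printf("%#" PRIx64 "\n", val);                  /* 0x112233445566ab88 */
        return (0);
}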
@@ -242,16 +312,52 @@ vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
}
/*
- * The following simplifying assumptions are made during emulation:
- *
- * - guest is in 64-bit mode
- * - default address size is 64-bits
- * - default operand size is 32-bits
- *
- * - operand size override is not supported
- *
- * - address size override is not supported
+ * Return the status flags that would result from doing (x - y).
*/
+static u_long
+getcc16(uint16_t x, uint16_t y)
+{
+ u_long rflags;
+
+ __asm __volatile("sub %1,%2; pushfq; popq %0" :
+ "=r" (rflags) : "m" (y), "r" (x));
+ return (rflags);
+}
+
+static u_long
+getcc32(uint32_t x, uint32_t y)
+{
+ u_long rflags;
+
+ __asm __volatile("sub %1,%2; pushfq; popq %0" :
+ "=r" (rflags) : "m" (y), "r" (x));
+ return (rflags);
+}
+
+static u_long
+getcc64(uint64_t x, uint64_t y)
+{
+ u_long rflags;
+
+ __asm __volatile("sub %1,%2; pushfq; popq %0" :
+ "=r" (rflags) : "m" (y), "r" (x));
+ return (rflags);
+}
+
+static u_long
+getcc(int opsize, uint64_t x, uint64_t y)
+{
+ KASSERT(opsize == 2 || opsize == 4 || opsize == 8,
+ ("getcc: invalid operand size %d", opsize));
+
+ if (opsize == 2)
+ return (getcc16(x, y));
+ else if (opsize == 4)
+ return (getcc32(x, y));
+ else
+ return (getcc64(x, y));
+}
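getcc() borrows the host ALU: it performs a SUB at the guest operand width and captures the resulting RFLAGS, and emulate_cmp() below copies only the status bits (RFLAGS_STATUS_BITS) into the guest's RFLAGS. What those bits encode can be restated in portable C; the PSL_* values below are written out numerically so the sketch stands alone, and they are the standard x86 RFLAGS bit positions rather than anything new in the patch:

#include <assert.h>
#include <stdint.h>

#define PSL_C   0x001   /* carry/borrow out of the most significant bit */
#define PSL_PF  0x004   /* even parity of the low result byte */
#define PSL_AF  0x010   /* carry/borrow out of bit 3 */
#define PSL_Z   0x040   /* result is zero */
#define PSL_N   0x080   /* sign bit of the result */
#define PSL_V   0x800   /* signed overflow */

/* Status flags of (x - y) at a width of 'opsize' bytes, computed in plain C. */
static unsigned long
cc_sub(int opsize, uint64_t x, uint64_t y)
{
        uint64_t mask, res, sign;
        unsigned long rflags;
        int i, ones;

        mask = (opsize == 8) ? ~0ULL : (1ULL << (opsize * 8)) - 1;
        sign = 1ULL << (opsize * 8 - 1);
        x &= mask;
        y &= mask;
        res = (x - y) & mask;

        rflags = 0;
        if (y > x)
                rflags |= PSL_C;
        if (res == 0)
                rflags |= PSL_Z;
        if (res & sign)
                rflags |= PSL_N;
        if (((x ^ y) & (x ^ res)) & sign)
                rflags |= PSL_V;
        if ((x ^ y ^ res) & 0x10)
                rflags |= PSL_AF;
        for (i = 0, ones = 0; i < 8; i++)
                ones += (res >> i) & 1;
        if ((ones & 1) == 0)
                rflags |= PSL_PF;
        return (rflags);
}

int
main(void)
{
        assert(cc_sub(2, 5, 5) & PSL_Z);                /* equal operands set ZF */
        assert(cc_sub(4, 1, 2) & PSL_C);                /* borrow sets CF */
        assert((cc_sub(8, 7, 3) & (PSL_C | PSL_Z)) == 0);
        return (0);
}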
+
static int
emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
@@ -261,7 +367,7 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
uint8_t byte;
uint64_t val;
- size = 4;
+ size = vie->opsize;
error = EINVAL;
switch (vie->op.op_byte) {
@@ -271,7 +377,7 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
* 88/r: mov r/m8, r8
* REX + 88/r: mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
*/
- size = 1;
+ size = 1; /* override for byte operation */
error = vie_read_bytereg(vm, vcpuid, vie, &byte);
if (error == 0)
error = memwrite(vm, vcpuid, gpa, byte, size, arg);
@@ -279,11 +385,10 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
case 0x89:
/*
* MOV from reg (ModRM:reg) to mem (ModRM:r/m)
+ * 89/r: mov r/m16, r16
* 89/r: mov r/m32, r32
* REX.W + 89/r mov r/m64, r64
*/
- if (vie->rex_w)
- size = 8;
reg = gpr_map[vie->reg];
error = vie_read_register(vm, vcpuid, reg, &val);
if (error == 0) {
@@ -292,38 +397,72 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
}
break;
case 0x8A:
+ /*
+ * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
+ * 8A/r: mov r8, r/m8
+ * REX + 8A/r: mov r8, r/m8
+ */
+ size = 1; /* override for byte operation */
+ error = memread(vm, vcpuid, gpa, &val, size, arg);
+ if (error == 0)
+ error = vie_write_bytereg(vm, vcpuid, vie, val);
+ break;
case 0x8B:
/*
* MOV from mem (ModRM:r/m) to reg (ModRM:reg)
- * 8A/r: mov r/m8, r8
- * REX + 8A/r: mov r/m8, r8
+ * 8B/r: mov r16, r/m16
* 8B/r: mov r32, r/m32
* REX.W 8B/r: mov r64, r/m64
*/
- if (vie->op.op_byte == 0x8A)
- size = 1;
- else if (vie->rex_w)
- size = 8;
error = memread(vm, vcpuid, gpa, &val, size, arg);
if (error == 0) {
reg = gpr_map[vie->reg];
error = vie_update_register(vm, vcpuid, reg, val, size);
}
break;
+ case 0xA1:
+ /*
+ * MOV from seg:moffset to AX/EAX/RAX
+ * A1: mov AX, moffs16
+ * A1: mov EAX, moffs32
+ * REX.W + A1: mov RAX, moffs64
+ */
+ error = memread(vm, vcpuid, gpa, &val, size, arg);
+ if (error == 0) {
+ reg = VM_REG_GUEST_RAX;
+ error = vie_update_register(vm, vcpuid, reg, val, size);
+ }
+ break;
+ case 0xA3:
+ /*
+ * MOV from AX/EAX/RAX to seg:moffset
+ * A3: mov moffs16, AX
+ * A3: mov moffs32, EAX
+ * REX.W + A3: mov moffs64, RAX
+ */
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
+ if (error == 0) {
+ val &= size2mask[size];
+ error = memwrite(vm, vcpuid, gpa, val, size, arg);
+ }
+ break;
+ case 0xC6:
+ /*
+ * MOV from imm8 to mem (ModRM:r/m)
+ * C6/0 mov r/m8, imm8
+ * REX + C6/0 mov r/m8, imm8
+ */
+ size = 1; /* override for byte operation */
+ error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg);
+ break;
case 0xC7:
/*
- * MOV from imm32 to mem (ModRM:r/m)
+ * MOV from imm16/imm32 to mem (ModRM:r/m)
+ * C7/0 mov r/m16, imm16
* C7/0 mov r/m32, imm32
* REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits)
*/
- val = vie->immediate; /* already sign-extended */
-
- if (vie->rex_w)
- size = 8;
-
- if (size != 8)
- val &= size2mask[size];
-
+ val = vie->immediate & size2mask[size];
error = memwrite(vm, vcpuid, gpa, val, size, arg);
break;
default:
@@ -333,17 +472,6 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
return (error);
}
-/*
- * The following simplifying assumptions are made during emulation:
- *
- * - guest is in 64-bit mode
- * - default address size is 64-bits
- * - default operand size is 32-bits
- *
- * - operand size override is not supported
- *
- * - address size override is not supported
- */
static int
emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
mem_region_read_t memread, mem_region_write_t memwrite,
@@ -353,7 +481,7 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
enum vm_reg_name reg;
uint64_t val;
- size = 4;
+ size = vie->opsize;
error = EINVAL;
switch (vie->op.op_byte) {
@@ -362,8 +490,9 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
* MOV and zero extend byte from mem (ModRM:r/m) to
* reg (ModRM:reg).
*
- * 0F B6/r movzx r/m8, r32
- * REX.W + 0F B6/r movzx r/m8, r64
+ * 0F B6/r movzx r16, r/m8
+ * 0F B6/r movzx r32, r/m8
+ * REX.W + 0F B6/r movzx r64, r/m8
*/
/* get the first operand */
@@ -374,19 +503,39 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
/* get the second operand */
reg = gpr_map[vie->reg];
- if (vie->rex_w)
- size = 8;
+ /* zero-extend byte */
+ val = (uint8_t)val;
/* write the result */
error = vie_update_register(vm, vcpuid, reg, val, size);
break;
+ case 0xB7:
+ /*
+ * MOV and zero extend word from mem (ModRM:r/m) to
+ * reg (ModRM:reg).
+ *
+ * 0F B7/r movzx r32, r/m16
+ * REX.W + 0F B7/r movzx r64, r/m16
+ */
+ error = memread(vm, vcpuid, gpa, &val, 2, arg);
+ if (error)
+ return (error);
+
+ reg = gpr_map[vie->reg];
+
+ /* zero-extend word */
+ val = (uint16_t)val;
+
+ error = vie_update_register(vm, vcpuid, reg, val, size);
+ break;
case 0xBE:
/*
* MOV and sign extend byte from mem (ModRM:r/m) to
* reg (ModRM:reg).
*
- * 0F BE/r movsx r/m8, r32
- * REX.W + 0F BE/r movsx r/m8, r64
+ * 0F BE/r movsx r16, r/m8
+ * 0F BE/r movsx r32, r/m8
+ * REX.W + 0F BE/r movsx r64, r/m8
*/
/* get the first operand */
@@ -397,9 +546,6 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
/* get the second operand */
reg = gpr_map[vie->reg];
- if (vie->rex_w)
- size = 8;
-
/* sign extend byte */
val = (int8_t)val;
@@ -420,7 +566,7 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
enum vm_reg_name reg;
uint64_t val1, val2;
- size = 4;
+ size = vie->opsize;
error = EINVAL;
switch (vie->op.op_byte) {
@@ -429,11 +575,10 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
* AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
* result in reg.
*
+ * 23/r and r16, r/m16
* 23/r and r32, r/m32
* REX.W + 23/r and r64, r/m64
*/
- if (vie->rex_w)
- size = 8;
/* get the first operand */
reg = gpr_map[vie->reg];
@@ -455,8 +600,9 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
* AND mem (ModRM:r/m) with immediate and store the
* result in mem.
*
- * 81/ and r/m32, imm32
- * REX.W + 81/ and r/m64, imm32 sign-extended to 64
+ * 81 /4 and r/m16, imm16
+ * 81 /4 and r/m32, imm32
+ * REX.W + 81 /4 and r/m64, imm32 sign-extended to 64
*
* Currently, only the AND operation of the 0x81 opcode
* is implemented (ModRM:reg = b100).
@@ -464,9 +610,6 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
if ((vie->reg & 7) != 4)
break;
- if (vie->rex_w)
- size = 8;
-
/* get the first operand */
error = memread(vm, vcpuid, gpa, &val1, size, arg);
if (error)
@@ -492,7 +635,7 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
int error, size;
uint64_t val1;
- size = 4;
+ size = vie->opsize;
error = EINVAL;
switch (vie->op.op_byte) {
@@ -501,8 +644,9 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
* OR mem (ModRM:r/m) with immediate and store the
* result in mem.
*
- * 83/ OR r/m32, imm8 sign-extended to 32
- * REX.W + 83/ OR r/m64, imm8 sign-extended to 64
+ * 83 /1 OR r/m16, imm8 sign-extended to 16
+ * 83 /1 OR r/m32, imm8 sign-extended to 32
+ * REX.W + 83/1 OR r/m64, imm8 sign-extended to 64
*
* Currently, only the OR operation of the 0x83 opcode
* is implemented (ModRM:reg = b001).
@@ -510,9 +654,6 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
if ((vie->reg & 7) != 1)
break;
- if (vie->rex_w)
- size = 8;
-
/* get the first operand */
error = memread(vm, vcpuid, gpa, &val1, size, arg);
if (error)
@@ -531,10 +672,167 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
return (error);
}
+#define RFLAGS_STATUS_BITS (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)
+
+static int
+emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
+{
+ int error, size;
+ uint64_t op1, op2, rflags, rflags2;
+ enum vm_reg_name reg;
+
+ size = vie->opsize;
+ switch (vie->op.op_byte) {
+ case 0x3B:
+ /*
+ * 3B/r CMP r16, r/m16
+ * 3B/r CMP r32, r/m32
+ * REX.W + 3B/r CMP r64, r/m64
+ *
+ * Compare first operand (reg) with second operand (r/m) and
+ * set status flags in EFLAGS register. The comparison is
+ * performed by subtracting the second operand from the first
+ * operand and then setting the status flags.
+ */
+
+ /* Get the first operand */
+ reg = gpr_map[vie->reg];
+ error = vie_read_register(vm, vcpuid, reg, &op1);
+ if (error)
+ return (error);
+
+ /* Get the second operand */
+ error = memread(vm, vcpuid, gpa, &op2, size, arg);
+ if (error)
+ return (error);
+
+ break;
+ default:
+ return (EINVAL);
+ }
+ rflags2 = getcc(size, op1, op2);
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
+ if (error)
+ return (error);
+ rflags &= ~RFLAGS_STATUS_BITS;
+ rflags |= rflags2 & RFLAGS_STATUS_BITS;
+
+ error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
+ return (error);
+}
+
+static int
+emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
+ struct vm_guest_paging *paging, mem_region_read_t memread,
+ mem_region_write_t memwrite, void *arg)
+{
+#ifdef _KERNEL
+ struct vm_copyinfo copyinfo[2];
+#else
+ struct iovec copyinfo[2];
+#endif
+ struct seg_desc ss_desc;
+ uint64_t cr0, rflags, rsp, stack_gla, val;
+ int error, size, stackaddrsize;
+
+ /*
+ * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
+ *
+ * PUSH is part of the group 5 extended opcodes and is identified
+ * by ModRM:reg = b110.
+ */
+ if ((vie->reg & 7) != 6)
+ return (EINVAL);
+
+ size = vie->opsize;
+ /*
+	 * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1
+ */
+ if (paging->cpu_mode == CPU_MODE_REAL) {
+ stackaddrsize = 2;
+ } else if (paging->cpu_mode == CPU_MODE_64BIT) {
+ /*
+ * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
+ * - Stack pointer size is always 64-bits.
+ * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
+ * - 16-bit PUSH/POP is supported by using the operand size
+ * override prefix (66H).
+ */
+ stackaddrsize = 8;
+ size = vie->opsize_override ? 2 : 8;
+ } else {
+ /*
+	 * In protected or compatibility mode the 'B' flag in the
+ * stack-segment descriptor determines the size of the
+ * stack pointer.
+ */
+ error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
+ KASSERT(error == 0, ("%s: error %d getting SS descriptor",
+ __func__, error));
+ if (SEG_DESC_DEF32(ss_desc.access))
+ stackaddrsize = 4;
+ else
+ stackaddrsize = 2;
+ }
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
+ KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
+ KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
+ KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
+
+ rsp -= size;
+ if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
+ rsp, size, stackaddrsize, PROT_WRITE, &stack_gla)) {
+ vm_inject_ss(vm, vcpuid, 0);
+ return (0);
+ }
+
+ if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
+ vm_inject_ss(vm, vcpuid, 0);
+ return (0);
+ }
+
+ if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
+ vm_inject_ac(vm, vcpuid, 0);
+ return (0);
+ }
+
+ error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size, PROT_WRITE,
+ copyinfo, nitems(copyinfo));
+ if (error == -1) {
+ /*
+ * XXX cannot return a negative error value here because it
+ * ends up being the return value of the VM_RUN() ioctl and
+	 * is interpreted as a pseudo-error (e.g. ERESTART).
+ */
+ return (EFAULT);
+ } else if (error == 1) {
+ /* Resume guest execution to handle page fault */
+ return (0);
+ }
+
+ error = memread(vm, vcpuid, mmio_gpa, &val, size, arg);
+ if (error == 0) {
+ vm_copyout(vm, vcpuid, &val, copyinfo, size);
+ error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
+ stackaddrsize);
+ KASSERT(error == 0, ("error %d updating rsp", error));
+ }
+#ifdef _KERNEL
+ vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
+#endif
+ return (error);
+}
+
int
vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
- mem_region_read_t memread, mem_region_write_t memwrite,
- void *memarg)
+ struct vm_guest_paging *paging, mem_region_read_t memread,
+ mem_region_write_t memwrite, void *memarg)
{
int error;
@@ -542,6 +840,14 @@ vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
return (EINVAL);
switch (vie->op.op_type) {
+ case VIE_OP_TYPE_PUSH:
+ error = emulate_push(vm, vcpuid, gpa, vie, paging, memread,
+ memwrite, memarg);
+ break;
+ case VIE_OP_TYPE_CMP:
+ error = emulate_cmp(vm, vcpuid, gpa, vie,
+ memread, memwrite, memarg);
+ break;
case VIE_OP_TYPE_MOV:
error = emulate_mov(vm, vcpuid, gpa, vie,
memread, memwrite, memarg);
@@ -636,7 +942,7 @@ vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
* then the descriptor is unusable and attempting to use
* it results in a #GP(0).
*/
- if (SEG_DESC_UNUSABLE(desc))
+ if (SEG_DESC_UNUSABLE(desc->access))
return (-1);
/*
@@ -645,13 +951,13 @@ vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
* descriptor that is not present. If this was the case then
* it would have been checked before the VM-exit.
*/
- KASSERT(SEG_DESC_PRESENT(desc), ("segment %d not present: %#x",
- seg, desc->access));
+ KASSERT(SEG_DESC_PRESENT(desc->access),
+ ("segment %d not present: %#x", seg, desc->access));
/*
* The descriptor type must indicate a code/data segment.
*/
- type = SEG_DESC_TYPE(desc);
+ type = SEG_DESC_TYPE(desc->access);
KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
"descriptor type %#x", seg, type));
@@ -680,7 +986,8 @@ vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
if ((type & 0xC) == 0x4) {
/* expand-down data segment */
low_limit = desc->limit + 1;
- high_limit = SEG_DESC_DEF32(desc) ? 0xffffffff : 0xffff;
+ high_limit = SEG_DESC_DEF32(desc->access) ?
+ 0xffffffff : 0xffff;
} else {
/* code segment or expand-up data segment */
low_limit = 0;
@@ -947,45 +1254,24 @@ fault:
}
int
-vmm_fetch_instruction(struct vm *vm, int cpuid, struct vm_guest_paging *paging,
+vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
uint64_t rip, int inst_length, struct vie *vie)
{
- int n, error, prot;
- uint64_t gpa, off;
- void *hpa, *cookie;
-
- /*
- * XXX cache previously fetched instructions using 'rip' as the tag
- */
+ struct vm_copyinfo copyinfo[2];
+ int error, prot;
- prot = VM_PROT_READ | VM_PROT_EXECUTE;
if (inst_length > VIE_INST_SIZE)
panic("vmm_fetch_instruction: invalid length %d", inst_length);
- /* Copy the instruction into 'vie' */
- while (vie->num_valid < inst_length) {
- error = vmm_gla2gpa(vm, cpuid, paging, rip, prot, &gpa);
- if (error)
- return (error);
-
- off = gpa & PAGE_MASK;
- n = min(inst_length - vie->num_valid, PAGE_SIZE - off);
-
- if ((hpa = vm_gpa_hold(vm, gpa, n, prot, &cookie)) == NULL)
- break;
-
- bcopy(hpa, &vie->inst[vie->num_valid], n);
-
- vm_gpa_release(cookie);
-
- rip += n;
- vie->num_valid += n;
+ prot = PROT_READ | PROT_EXEC;
+ error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot,
+ copyinfo, nitems(copyinfo));
+ if (error == 0) {
+ vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length);
+ vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
+ vie->num_valid = inst_length;
}
-
- if (vie->num_valid == inst_length)
- return (0);
- else
- return (-1);
+ return (error);
}
static int
@@ -1007,24 +1293,65 @@ vie_advance(struct vie *vie)
}
static int
-decode_rex(struct vie *vie)
+decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
{
uint8_t x;
- if (vie_peek(vie, &x))
- return (-1);
+ while (1) {
+ if (vie_peek(vie, &x))
+ return (-1);
- if (x >= 0x40 && x <= 0x4F) {
- vie->rex_present = 1;
+ if (x == 0x66)
+ vie->opsize_override = 1;
+ else if (x == 0x67)
+ vie->addrsize_override = 1;
+ else
+ break;
+
+ vie_advance(vie);
+ }
+ /*
+ * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
+ * - Only one REX prefix is allowed per instruction.
+ * - The REX prefix must immediately precede the opcode byte or the
+ * escape opcode byte.
+ * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
+ * the mandatory prefix must come before the REX prefix.
+ */
+ if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
+ vie->rex_present = 1;
vie->rex_w = x & 0x8 ? 1 : 0;
vie->rex_r = x & 0x4 ? 1 : 0;
vie->rex_x = x & 0x2 ? 1 : 0;
vie->rex_b = x & 0x1 ? 1 : 0;
-
vie_advance(vie);
}
+ /*
+ * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
+ */
+ if (cpu_mode == CPU_MODE_64BIT) {
+ /*
+ * Default address size is 64-bits and default operand size
+ * is 32-bits.
+ */
+ vie->addrsize = vie->addrsize_override ? 4 : 8;
+ if (vie->rex_w)
+ vie->opsize = 8;
+ else if (vie->opsize_override)
+ vie->opsize = 2;
+ else
+ vie->opsize = 4;
+ } else if (cs_d) {
+ /* Default address and operand sizes are 32-bits */
+ vie->addrsize = vie->addrsize_override ? 2 : 4;
+ vie->opsize = vie->opsize_override ? 2 : 4;
+ } else {
+ /* Default address and operand sizes are 16-bits */
+ vie->addrsize = vie->addrsize_override ? 4 : 2;
+ vie->opsize = vie->opsize_override ? 4 : 2;
+ }
return (0);
}
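The size selection at the end of decode_prefixes() is what lets the emulator drop its earlier 64-bit-only assumptions. The operand-size rule can be restated compactly (the local enum and opsize() below are illustrative stand-ins, not patch code); the address-size rule has the same shape with the 67H override in place of 66H:

#include <assert.h>

enum cpu_mode { MODE_REAL, MODE_PROTECTED, MODE_64BIT };

/* Effective operand size in bytes, per the rules in decode_prefixes(). */
static int
opsize(enum cpu_mode mode, int cs_d, int rex_w, int override)
{
        if (mode == MODE_64BIT)
                return (rex_w ? 8 : (override ? 2 : 4));
        else if (cs_d)
                return (override ? 2 : 4);      /* 32-bit code segment */
        else
                return (override ? 4 : 2);      /* real mode or 16-bit segment */
}

int
main(void)
{
        assert(opsize(MODE_64BIT, 0, 1, 1) == 8);       /* REX.W beats 66H */
        assert(opsize(MODE_64BIT, 0, 0, 1) == 2);       /* 66H selects 16-bit */
        assert(opsize(MODE_PROTECTED, 1, 0, 0) == 4);   /* CS.D=1 default */
        assert(opsize(MODE_REAL, 0, 0, 1) == 4);        /* 66H flips 16 to 32 */
        return (0);
}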
@@ -1071,6 +1398,12 @@ decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
{
uint8_t x;
+ if (cpu_mode == CPU_MODE_REAL)
+ return (-1);
+
+ if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
+ return (0);
+
if (vie_peek(vie, &x))
return (-1);
@@ -1249,20 +1582,32 @@ decode_immediate(struct vie *vie)
union {
char buf[4];
int8_t signed8;
+ int16_t signed16;
int32_t signed32;
} u;
/* Figure out immediate operand size (if any) */
- if (vie->op.op_flags & VIE_OP_F_IMM)
- vie->imm_bytes = 4;
- else if (vie->op.op_flags & VIE_OP_F_IMM8)
+ if (vie->op.op_flags & VIE_OP_F_IMM) {
+ /*
+ * Section 2.2.1.5 "Immediates", Intel SDM:
+ * In 64-bit mode the typical size of immediate operands
+	 * remains 32-bits. When the operand size is 64-bits, the
+ * processor sign-extends all immediates to 64-bits prior
+ * to their use.
+ */
+ if (vie->opsize == 4 || vie->opsize == 8)
+ vie->imm_bytes = 4;
+ else
+ vie->imm_bytes = 2;
+ } else if (vie->op.op_flags & VIE_OP_F_IMM8) {
vie->imm_bytes = 1;
+ }
if ((n = vie->imm_bytes) == 0)
return (0);
- if (n != 1 && n != 4)
- panic("decode_immediate: invalid imm_bytes %d", n);
+ KASSERT(n == 1 || n == 2 || n == 4,
+ ("%s: invalid number of immediate bytes: %d", __func__, n));
for (i = 0; i < n; i++) {
if (vie_peek(vie, &x))
@@ -1271,12 +1616,47 @@ decode_immediate(struct vie *vie)
u.buf[i] = x;
vie_advance(vie);
}
-
+
+ /* sign-extend the immediate value before use */
if (n == 1)
- vie->immediate = u.signed8; /* sign-extended */
+ vie->immediate = u.signed8;
+ else if (n == 2)
+ vie->immediate = u.signed16;
else
- vie->immediate = u.signed32; /* sign-extended */
+ vie->immediate = u.signed32;
+
+ return (0);
+}
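The sign extension above, combined with masking at the point of use (e.g. 'val = vie->immediate & size2mask[size]' in emulate_mov()), is what makes an imm8 such as 0xF0 act as 0xFFF0 or 0xFFFFFFF0 at the larger operand widths. A two-assert illustration (standalone, not patch code):

#include <assert.h>
#include <stdint.h>

int
main(void)
{
        int8_t imm8 = (int8_t)0xf0;     /* raw immediate byte from the instruction */
        int64_t immediate = imm8;       /* sign-extended, as decode_immediate() stores it */

        assert((uint64_t)immediate == 0xfffffffffffffff0ULL);
        /* masked back down at the point of use, e.g. with a 32-bit size2mask entry */
        assert(((uint64_t)immediate & 0xffffffffULL) == 0xfffffff0ULL);
        return (0);
}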
+
+static int
+decode_moffset(struct vie *vie)
+{
+ int i, n;
+ uint8_t x;
+ union {
+ char buf[8];
+ uint64_t u64;
+ } u;
+
+ if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
+ return (0);
+
+ /*
+ * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
+ * The memory offset size follows the address-size of the instruction.
+ */
+ n = vie->addrsize;
+ KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));
+
+ u.u64 = 0;
+ for (i = 0; i < n; i++) {
+ if (vie_peek(vie, &x))
+ return (-1);
+ u.buf[i] = x;
+ vie_advance(vie);
+ }
+ vie->displacement = u.u64;
return (0);
}
@@ -1301,7 +1681,7 @@ static int
verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
{
int error;
- uint64_t base, idx;
+ uint64_t base, idx, gla2;
/* Skip 'gla' verification */
if (gla == VIE_INVALID_GLA)
@@ -1334,11 +1714,14 @@ verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
}
}
- if (base + vie->scale * idx + vie->displacement != gla) {
+ /* XXX assuming that the base address of the segment is 0 */
+ gla2 = base + vie->scale * idx + vie->displacement;
+ gla2 &= size2mask[vie->addrsize];
+ if (gla != gla2) {
printf("verify_gla mismatch: "
"base(0x%0lx), scale(%d), index(0x%0lx), "
- "disp(0x%0lx), gla(0x%0lx)\n",
- base, vie->scale, idx, vie->displacement, gla);
+ "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
+ base, vie->scale, idx, vie->displacement, gla, gla2);
return (-1);
}
@@ -1347,13 +1730,11 @@ verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
int
vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
- enum vm_cpu_mode cpu_mode, struct vie *vie)
+ enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
{
- if (cpu_mode == CPU_MODE_64BIT) {
- if (decode_rex(vie))
- return (-1);
- }
+ if (decode_prefixes(vie, cpu_mode, cs_d))
+ return (-1);
if (decode_opcode(vie))
return (-1);
@@ -1366,10 +1747,13 @@ vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
if (decode_displacement(vie))
return (-1);
-
+
if (decode_immediate(vie))
return (-1);
+ if (decode_moffset(vie))
+ return (-1);
+
if (verify_inst_length(vie))
return (-1);
diff --git a/sys/x86/include/specialreg.h b/sys/x86/include/specialreg.h
index c0cef2c..8610684 100644
--- a/sys/x86/include/specialreg.h
+++ b/sys/x86/include/specialreg.h
@@ -436,6 +436,25 @@
#define MSR_MC4_MISC 0x413
/*
+ * VMX MSRs
+ */
+#define MSR_VMX_BASIC 0x480
+#define MSR_VMX_PINBASED_CTLS 0x481
+#define MSR_VMX_PROCBASED_CTLS 0x482
+#define MSR_VMX_EXIT_CTLS 0x483
+#define MSR_VMX_ENTRY_CTLS 0x484
+#define MSR_VMX_CR0_FIXED0 0x486
+#define MSR_VMX_CR0_FIXED1 0x487
+#define MSR_VMX_CR4_FIXED0 0x488
+#define MSR_VMX_CR4_FIXED1 0x489
+#define MSR_VMX_PROCBASED_CTLS2 0x48b
+#define MSR_VMX_EPT_VPID_CAP 0x48c
+#define MSR_VMX_TRUE_PINBASED_CTLS 0x48d
+#define MSR_VMX_TRUE_PROCBASED_CTLS 0x48e
+#define MSR_VMX_TRUE_EXIT_CTLS 0x48f
+#define MSR_VMX_TRUE_ENTRY_CTLS 0x490
+
+/*
* X2APIC MSRs
*/
#define MSR_APIC_ID 0x802
diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile
index 23e16cb..1c95f77 100644
--- a/usr.sbin/bhyve/Makefile
+++ b/usr.sbin/bhyve/Makefile
@@ -35,6 +35,7 @@ SRCS= \
post.c \
rtc.c \
smbiostbl.c \
+ task_switch.c \
uart_emul.c \
virtio.c \
xmsr.c \
diff --git a/usr.sbin/bhyve/acpi.c b/usr.sbin/bhyve/acpi.c
index c4ec020..5dea300 100644
--- a/usr.sbin/bhyve/acpi.c
+++ b/usr.sbin/bhyve/acpi.c
@@ -40,12 +40,13 @@
* Layout
* ------
* RSDP -> 0xf2400 (36 bytes fixed)
- * RSDT -> 0xf2440 (36 bytes + 4*N table addrs, 2 used)
- * XSDT -> 0xf2480 (36 bytes + 8*N table addrs, 2 used)
+ * RSDT -> 0xf2440 (36 bytes + 4*7 table addrs, 4 used)
+ * XSDT -> 0xf2480 (36 bytes + 8*7 table addrs, 4 used)
* MADT -> 0xf2500 (depends on #CPUs)
* FADT -> 0xf2600 (268 bytes)
* HPET -> 0xf2740 (56 bytes)
- * FACS -> 0xf2780 (64 bytes)
+ * MCFG -> 0xf2780 (60 bytes)
+ * FACS -> 0xf27C0 (64 bytes)
* DSDT -> 0xf2800 (variable - can go up to 0x100000)
*/
@@ -80,7 +81,8 @@ __FBSDID("$FreeBSD$");
#define MADT_OFFSET 0x100
#define FADT_OFFSET 0x200
#define HPET_OFFSET 0x340
-#define FACS_OFFSET 0x380
+#define MCFG_OFFSET 0x380
+#define FACS_OFFSET 0x3C0
#define DSDT_OFFSET 0x400
#define BHYVE_ASL_TEMPLATE "bhyve.XXXXXXX"
@@ -178,6 +180,8 @@ basl_fwrite_rsdt(FILE *fp)
basl_acpi_base + FADT_OFFSET);
EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : %08X\n",
basl_acpi_base + HPET_OFFSET);
+ EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : %08X\n",
+ basl_acpi_base + MCFG_OFFSET);
EFFLUSH(fp);
@@ -216,6 +220,8 @@ basl_fwrite_xsdt(FILE *fp)
basl_acpi_base + FADT_OFFSET);
EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : 00000000%08X\n",
basl_acpi_base + HPET_OFFSET);
+ EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : 00000000%08X\n",
+ basl_acpi_base + MCFG_OFFSET);
EFFLUSH(fp);
@@ -583,6 +589,39 @@ err_exit:
}
static int
+basl_fwrite_mcfg(FILE *fp)
+{
+ int err = 0;
+
+ EFPRINTF(fp, "/*\n");
+ EFPRINTF(fp, " * bhyve MCFG template\n");
+ EFPRINTF(fp, " */\n");
+ EFPRINTF(fp, "[0004]\t\tSignature : \"MCFG\"\n");
+ EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n");
+ EFPRINTF(fp, "[0001]\t\tRevision : 01\n");
+ EFPRINTF(fp, "[0001]\t\tChecksum : 00\n");
+ EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
+ EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVMCFG \"\n");
+ EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n");
+
+ /* iasl will fill in the compiler ID/revision fields */
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n");
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n");
+ EFPRINTF(fp, "[0008]\t\tReserved : 0\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp, "[0008]\t\tBase Address : %016lX\n", pci_ecfg_base());
+ EFPRINTF(fp, "[0002]\t\tSegment Group: 0000\n");
+ EFPRINTF(fp, "[0001]\t\tStart Bus: 00\n");
+ EFPRINTF(fp, "[0001]\t\tEnd Bus: FF\n");
+ EFPRINTF(fp, "[0004]\t\tReserved : 0\n");
+ EFFLUSH(fp);
+ return (0);
+err_exit:
+ return (errno);
+}
+
+static int
basl_fwrite_facs(FILE *fp)
{
int err;
@@ -921,6 +960,7 @@ static struct {
{ basl_fwrite_madt, MADT_OFFSET },
{ basl_fwrite_fadt, FADT_OFFSET },
{ basl_fwrite_hpet, HPET_OFFSET },
+ { basl_fwrite_mcfg, MCFG_OFFSET },
{ basl_fwrite_facs, FACS_OFFSET },
{ basl_fwrite_dsdt, DSDT_OFFSET },
{ NULL }
diff --git a/usr.sbin/bhyve/atkbdc.c b/usr.sbin/bhyve/atkbdc.c
index 6e13c19..930b7af 100644
--- a/usr.sbin/bhyve/atkbdc.c
+++ b/usr.sbin/bhyve/atkbdc.c
@@ -31,6 +31,10 @@ __FBSDID("$FreeBSD$");
#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include <assert.h>
+#include <errno.h>
#include <stdio.h>
#include "inout.h"
@@ -48,29 +52,30 @@ atkbdc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
if (bytes != 1)
- return (INOUT_ERROR);
+ return (-1);
*eax = 0;
- return (INOUT_OK);
+ return (0);
}
static int
atkbdc_sts_ctl_handler(struct vmctx *ctx, int vcpu, int in, int port,
int bytes, uint32_t *eax, void *arg)
{
- int retval;
+ int error, retval;
if (bytes != 1)
- return (INOUT_ERROR);
+ return (-1);
- retval = INOUT_OK;
+ retval = 0;
if (in) {
*eax = KBD_SYS_FLAG; /* system passed POST */
} else {
switch (*eax) {
case KBDC_RESET: /* Pulse "reset" line. */
- retval = INOUT_RESET;
+ error = vm_suspend(ctx, VM_SUSPEND_RESET);
+ assert(error == 0 || errno == EALREADY);
break;
}
}
diff --git a/usr.sbin/bhyve/bhyve.8 b/usr.sbin/bhyve/bhyve.8
index aad1aef..80b814a 100644
--- a/usr.sbin/bhyve/bhyve.8
+++ b/usr.sbin/bhyve/bhyve.8
@@ -24,7 +24,7 @@
.\"
.\" $FreeBSD$
.\"
-.Dd April 2, 2014
+.Dd June 26, 2014
.Dt BHYVE 8
.Os
.Sh NAME
@@ -32,12 +32,14 @@
.Nd "run a guest operating system inside a virtual machine"
.Sh SYNOPSIS
.Nm
-.Op Fl aehwxACHPW
+.Op Fl abehwxACHPWY
.Op Fl c Ar numcpus
.Op Fl g Ar gdbport
+.Op Fl l Ar lpcdev Ns Op , Ns Ar conf
+.Op Fl m Ar size Ns Op Ar K|k|M|m|G|g|T|t
.Op Fl p Ar vcpu:hostcpu
.Op Fl s Ar slot,emulation Ns Op , Ns Ar conf
-.Op Fl l Ar lpcdev Ns Op , Ns Ar conf
+.Op Fl U Ar uuid
.Ar vmname
.Sh DESCRIPTION
.Nm
@@ -66,21 +68,49 @@ Generate ACPI tables.
Required for
.Fx Ns /amd64
guests.
+.It Fl b
+Enable a low-level console device supported by
+.Fx
+kernels compiled with
+.Cd "device bvmconsole" .
+This option will be deprecated in a future version.
.It Fl c Ar numcpus
Number of guest virtual CPUs.
The default is 1 and the maximum is 16.
.It Fl C
Include guest memory in core file.
-.It Fl H
-Yield the virtual CPU thread when a HLT instruction is detected.
-If this option is not specified, virtual CPUs will use 100% of a host CPU.
+.It Fl e
+Force
+.Nm
+to exit when a guest issues an access to an I/O port that is not emulated.
+This is intended for debug purposes.
.It Fl g Ar gdbport
For
-.Fx Ns /amd64 kernels compiled with
-.Cd "option bvmdebug" ,
+.Fx
+kernels compiled with
+.Cd "device bvmdebug" ,
allow a remote kernel kgdb to be relayed to the guest kernel gdb stub
via a local IPv4 address and this port.
This option will be deprecated in a future version.
+.It Fl h
+Print help message and exit.
+.It Fl H
+Yield the virtual CPU thread when a HLT instruction is detected.
+If this option is not specified, virtual CPUs will use 100% of a host CPU.
+.It Fl l Ar lpcdev Ns Op , Ns Ar conf
+Allow devices behind the LPC PCI-ISA bridge to be configured.
+The only supported devices are the TTY-class devices,
+.Li com1
+and
+.Li com2 .
+.It Fl m Ar size Ns Op Ar K|k|M|m|G|g|T|t
+Guest physical memory size in bytes.
+This must be the same size that was given to
+.Xr bhyveload 8 .
+.Pp
+The size argument may be suffixed with one of K, M, G or T (either upper
+or lower case) to indicate a multiple of kilobytes, megabytes, gigabytes,
+or terabytes.
+If no suffix is given, the value is assumed to be in megabytes.
.It Fl p Ar vcpu:hostcpu
Pin guest's virtual CPU
.Em vcpu
@@ -88,9 +118,6 @@ to
.Em hostcpu .
.It Fl P
Force the guest virtual CPU to exit when a PAUSE instruction is detected.
-.It Fl W
-Force virtio PCI device emulations to use MSI interrupts instead of MSI-X
-interrupts.
.It Fl s Ar slot,emulation Ns Op , Ns Ar conf
Configure a virtual PCI slot and function.
.Pp
@@ -211,34 +238,21 @@ The host device must have been reserved at boot-time using the
loader variable as described in
.Xr vmm 4 .
.El
-.It Fl l Ar lpcdev Ns Op , Ns Ar conf
-Allow devices behind the LPC PCI-ISA bridge to be configured.
-The only supported devices are the TTY-class devices,
-.Li com1
-and
-.Li com2 .
-.It Fl m Ar size Ns Op Ar K|k|M|m|G|g|T|t
-Guest physical memory size in bytes.
-This must be the same size that was given to
-.Xr bhyveload 8 .
-.Pp
-The size argument may be suffixed with one of K, M, G or T (either upper
-or lower case) to indicate a multiple of kilobytes, megabytes, gigabytes,
-or terabytes.
-If no suffix is given, the value is assumed to be in megabytes.
-.It Fl e
-Force
-.Nm
-to exit when a guest issues an access to an I/O port that is not emulated.
-This is intended for debug purposes.
+.It Fl U Ar uuid
+Set the universally unique identifier
+.Pq UUID
+in the guest's System Management BIOS System Information structure.
+By default a UUID is generated from the host's hostname and
+.Ar vmname .
.It Fl w
Ignore accesses to unimplemented Model Specific Registers (MSRs).
This is intended for debug purposes.
+.It Fl W
+Force virtio PCI device emulations to use MSI interrupts instead of MSI-X
+interrupts.
.It Fl x
The guest's local APIC is configured in x2APIC mode.
.It Fl Y
Disable MPtable generation.
-.It Fl h
-Print help message and exit.
.It Ar vmname
Alphanumeric name of the guest.
This should be the same as that created by
diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c
index 1e5d3b3..7dcf6d0 100644
--- a/usr.sbin/bhyve/bhyverun.c
+++ b/usr.sbin/bhyve/bhyverun.c
@@ -69,16 +69,11 @@ __FBSDID("$FreeBSD$");
#define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */
-#define VMEXIT_CONTINUE 1 /* continue from next instruction */
-#define VMEXIT_RESTART 2 /* restart current instruction */
-#define VMEXIT_ABORT 3 /* abort the vm run loop */
-#define VMEXIT_RESET 4 /* guest machine has reset */
-#define VMEXIT_POWEROFF 5 /* guest machine has powered off */
-
#define MB (1024UL * 1024)
#define GB (1024UL * MB)
typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu);
+extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu);
char *vmname;
@@ -101,7 +96,7 @@ static cpuset_t cpumask;
static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip);
-struct vm_exit vmexit[VM_MAXCPU];
+static struct vm_exit vmexit[VM_MAXCPU];
struct bhyvestats {
uint64_t vmexit_bogus;
@@ -112,8 +107,6 @@ struct bhyvestats {
uint64_t vmexit_inst_emul;
uint64_t cpu_switch_rotate;
uint64_t cpu_switch_direct;
- int io_reset;
- int io_poweroff;
} stats;
struct mt_vmm_info {
@@ -129,26 +122,26 @@ usage(int code)
{
fprintf(stderr,
- "Usage: %s [-aehwAHIPW] [-g <gdb port>] [-s <pci>] [-c vcpus]\n"
- " %*s [-p vcpu:hostcpu] [-m mem] [-l <lpc>] <vm>\n"
+ "Usage: %s [-abehwxACHPWY] [-c vcpus] [-g <gdb port>] [-l <lpc>]\n"
+ " %*s [-m mem] [-p vcpu:hostcpu] [-s <pci>] [-U uuid] <vm>\n"
" -a: local apic is in xAPIC mode (deprecated)\n"
- " -A: create an ACPI table\n"
- " -g: gdb port\n"
+ " -A: create ACPI tables\n"
" -c: # cpus (default 1)\n"
" -C: include guest memory in core file\n"
- " -p: pin 'vcpu' to 'hostcpu'\n"
- " -H: vmexit from the guest on hlt\n"
- " -P: vmexit from the guest on pause\n"
- " -W: force virtio to use single-vector MSI\n"
" -e: exit on unhandled I/O access\n"
+ " -g: gdb port\n"
" -h: help\n"
- " -s: <slot,driver,configinfo> PCI slot config\n"
+ " -H: vmexit from the guest on hlt\n"
" -l: LPC device configuration\n"
" -m: memory size in MB\n"
+ " -p: pin 'vcpu' to 'hostcpu'\n"
+ " -P: vmexit from the guest on pause\n"
+ " -s: <slot,driver,configinfo> PCI slot config\n"
+ " -U: uuid\n"
" -w: ignore unimplemented MSRs\n"
+ " -W: force virtio to use single-vector MSI\n"
" -x: local apic is in x2APIC mode\n"
- " -Y: disable MPtable generation\n"
- " -U: uuid\n",
+ " -Y: disable MPtable generation\n",
progname, (int)strlen(progname), "");
exit(code);
@@ -187,6 +180,27 @@ pincpu_parse(const char *opt)
return (0);
}
+void
+vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid,
+ int errcode)
+{
+ struct vmctx *ctx;
+ int error;
+
+ ctx = arg;
+ if (errcode_valid)
+ error = vm_inject_exception2(ctx, vcpu, vector, errcode);
+ else
+ error = vm_inject_exception(ctx, vcpu, vector);
+ assert(error == 0);
+
+ /*
+ * Set the instruction length to 0 to ensure that the instruction is
+ * restarted when the fault handler returns.
+ */
+ vmexit[vcpu].inst_length = 0;
+}
+
void *
paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len)
{
@@ -315,27 +329,18 @@ vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
}
error = emulate_inout(ctx, vcpu, vme, strictio);
- if (error == INOUT_OK && in && !string) {
+ if (!error && in && !string) {
error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX,
vme->u.inout.eax);
+ assert(error == 0);
}
- switch (error) {
- case INOUT_OK:
- return (VMEXIT_CONTINUE);
- case INOUT_RESTART:
- return (VMEXIT_RESTART);
- case INOUT_RESET:
- stats.io_reset++;
- return (VMEXIT_RESET);
- case INOUT_POWEROFF:
- stats.io_poweroff++;
- return (VMEXIT_POWEROFF);
- default:
- fprintf(stderr, "Unhandled %s%c 0x%04x\n",
- in ? "in" : "out",
- bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port);
+ if (error) {
+ fprintf(stderr, "Unhandled %s%c 0x%04x\n", in ? "in" : "out",
+ bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port);
return (VMEXIT_ABORT);
+ } else {
+ return (VMEXIT_CONTINUE);
}
}
@@ -352,8 +357,7 @@ vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
fprintf(stderr, "rdmsr to register %#x on vcpu %d\n",
vme->u.msr.code, *pvcpu);
if (strictmsr) {
- error = vm_inject_exception2(ctx, *pvcpu, IDT_GP, 0);
- assert(error == 0);
+ vm_inject_gp(ctx, *pvcpu);
return (VMEXIT_RESTART);
}
}
@@ -379,8 +383,7 @@ vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n",
vme->u.msr.code, vme->u.msr.wval, *pvcpu);
if (strictmsr) {
- error = vm_inject_exception2(ctx, *pvcpu, IDT_GP, 0);
- assert(error == 0);
+ vm_inject_gp(ctx, *pvcpu);
return (VMEXIT_RESTART);
}
}
@@ -399,6 +402,16 @@ vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
return (retval);
}
+#define DEBUG_EPT_MISCONFIG
+#ifdef DEBUG_EPT_MISCONFIG
+#define EXIT_REASON_EPT_MISCONFIG 49
+#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400
+#define VMCS_IDENT(x) ((x) | 0x80000000)
+
+static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4];
+static int ept_misconfig_ptenum;
+#endif
+
static int
vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
@@ -413,7 +426,21 @@ vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
vmexit->u.vmx.exit_qualification);
fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type);
fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error);
-
+#ifdef DEBUG_EPT_MISCONFIG
+ if (vmexit->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) {
+ vm_get_register(ctx, *pvcpu,
+ VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS),
+ &ept_misconfig_gpa);
+ vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte,
+ &ept_misconfig_ptenum);
+ fprintf(stderr, "\tEPT misconfiguration:\n");
+ fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa);
+ fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n",
+ ept_misconfig_ptenum, ept_misconfig_pte[0],
+ ept_misconfig_pte[1], ept_misconfig_pte[2],
+ ept_misconfig_pte[3]);
+ }
+#endif /* DEBUG_EPT_MISCONFIG */
return (VMEXIT_ABORT);
}
@@ -465,7 +492,7 @@ vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
stats.vmexit_inst_emul++;
err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa,
- &vmexit->u.inst_emul.vie);
+ &vmexit->u.inst_emul.vie, &vmexit->u.inst_emul.paging);
if (err) {
if (err == EINVAL) {
@@ -515,6 +542,8 @@ vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
exit(1);
case VM_SUSPEND_HALT:
exit(2);
+ case VM_SUSPEND_TRIPLEFAULT:
+ exit(3);
default:
fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how);
exit(100);
@@ -532,7 +561,8 @@ static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
[VM_EXITCODE_MTRAP] = vmexit_mtrap,
[VM_EXITCODE_INST_EMUL] = vmexit_inst_emul,
[VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap,
- [VM_EXITCODE_SUSPENDED] = vmexit_suspend
+ [VM_EXITCODE_SUSPENDED] = vmexit_suspend,
+ [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch,
};
static void
@@ -540,7 +570,6 @@ vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip)
{
int error, rc, prevcpu;
enum vm_exitcode exitcode;
- enum vm_suspend_how how;
cpuset_t active_cpus;
if (vcpumap[vcpu] != NULL) {
@@ -575,16 +604,6 @@ vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip)
case VMEXIT_RESTART:
rip = vmexit[vcpu].rip;
break;
- case VMEXIT_RESET:
- case VMEXIT_POWEROFF:
- if (rc == VMEXIT_RESET)
- how = VM_SUSPEND_RESET;
- else
- how = VM_SUSPEND_POWEROFF;
- error = vm_suspend(ctx, how);
- assert(error == 0 || errno == EALREADY);
- rip = vmexit[vcpu].rip + vmexit[vcpu].inst_length;
- break;
case VMEXIT_ABORT:
abort();
default:
diff --git a/usr.sbin/bhyve/bhyverun.h b/usr.sbin/bhyve/bhyverun.h
index f18d42f..87824ef 100644
--- a/usr.sbin/bhyve/bhyverun.h
+++ b/usr.sbin/bhyve/bhyverun.h
@@ -35,6 +35,10 @@
#define __CTASSERT(x, y) typedef char __assert ## y[(x) ? 1 : -1]
#endif
+#define VMEXIT_CONTINUE 1 /* continue from next instruction */
+#define VMEXIT_RESTART 2 /* restart current instruction */
+#define VMEXIT_ABORT 3 /* abort the vm run loop */
+
struct vmctx;
extern int guest_ncpus;
extern char *guest_uuid_str;
diff --git a/usr.sbin/bhyve/block_if.c b/usr.sbin/bhyve/block_if.c
index b29bc78..1ec0344 100644
--- a/usr.sbin/bhyve/block_if.c
+++ b/usr.sbin/bhyve/block_if.c
@@ -390,6 +390,55 @@ blockif_close(struct blockif_ctxt *bc)
}
/*
+ * Return virtual C/H/S values for a given block. Use the algorithm
+ * outlined in the VHD specification to calculate values.
+ */
+void
+blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
+{
+ off_t sectors; /* total sectors of the block dev */
+ off_t hcyl; /* cylinders times heads */
+ uint16_t secpt; /* sectors per track */
+ uint8_t heads;
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+
+ sectors = bc->bc_size / bc->bc_sectsz;
+
+ /* Clamp the size to the largest possible with CHS */
+ if (sectors > 65535UL*16*255)
+ sectors = 65535UL*16*255;
+
+ if (sectors >= 65536UL*16*63) {
+ secpt = 255;
+ heads = 16;
+ hcyl = sectors / secpt;
+ } else {
+ secpt = 17;
+ hcyl = sectors / secpt;
+ heads = (hcyl + 1023) / 1024;
+
+ if (heads < 4)
+ heads = 4;
+
+ if (hcyl >= (heads * 1024) || heads > 16) {
+ secpt = 31;
+ heads = 16;
+ hcyl = sectors / secpt;
+ }
+ if (hcyl >= (heads * 1024)) {
+ secpt = 63;
+ heads = 16;
+ hcyl = sectors / secpt;
+ }
+ }
+
+ *c = hcyl / heads;
+ *h = heads;
+ *s = secpt;
+}
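As a worked example, a 2 GiB image with 512-byte sectors has 4,194,304 sectors, which the algorithm above maps to 4161 cylinders, 16 heads and 63 sectors per track. The restatement below uses plain integer types only so it can be compiled and checked on its own; the branch structure is the same as blockif_chs():

#include <stdio.h>

/* Same geometry rules as blockif_chs(), for checking values by hand. */
static void
chs(long long sectors, int *c, int *h, int *s)
{
        long long hcyl;
        int secpt, heads;

        if (sectors > 65535LL * 16 * 255)
                sectors = 65535LL * 16 * 255;

        if (sectors >= 65536LL * 16 * 63) {
                secpt = 255;
                heads = 16;
                hcyl = sectors / secpt;
        } else {
                secpt = 17;
                hcyl = sectors / secpt;
                heads = (hcyl + 1023) / 1024;
                if (heads < 4)
                        heads = 4;
                if (hcyl >= (heads * 1024) || heads > 16) {
                        secpt = 31;
                        heads = 16;
                        hcyl = sectors / secpt;
                }
                if (hcyl >= (heads * 1024)) {
                        secpt = 63;
                        heads = 16;
                        hcyl = sectors / secpt;
                }
        }
        *c = hcyl / heads;
        *h = heads;
        *s = secpt;
}

int
main(void)
{
        int c, h, s;

        chs(2048LL * 1024 * 1024 / 512, &c, &h, &s);    /* 2 GiB, 512-byte sectors */
        printf("%d/%d/%d\n", c, h, s);                  /* prints 4161/16/63 */
        return (0);
}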
+
+/*
* Accessors
*/
off_t
diff --git a/usr.sbin/bhyve/block_if.h b/usr.sbin/bhyve/block_if.h
index e0c0bb1..c2c21f6 100644
--- a/usr.sbin/bhyve/block_if.h
+++ b/usr.sbin/bhyve/block_if.h
@@ -52,6 +52,8 @@ struct blockif_req {
struct blockif_ctxt;
struct blockif_ctxt *blockif_open(const char *optstr, const char *ident);
off_t blockif_size(struct blockif_ctxt *bc);
+void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h,
+ uint8_t *s);
int blockif_sectsz(struct blockif_ctxt *bc);
int blockif_queuesz(struct blockif_ctxt *bc);
int blockif_is_ro(struct blockif_ctxt *bc);
diff --git a/usr.sbin/bhyve/inout.c b/usr.sbin/bhyve/inout.c
index fe9e0d8..1041a59 100644
--- a/usr.sbin/bhyve/inout.c
+++ b/usr.sbin/bhyve/inout.c
@@ -154,31 +154,28 @@ emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict)
/* Limit number of back-to-back in/out emulations to 16 */
iterations = MIN(count, 16);
while (iterations > 0) {
+ assert(retval == 0);
if (vie_calculate_gla(vis->paging.cpu_mode,
vis->seg_name, &vis->seg_desc, index, bytes,
addrsize, prot, &gla)) {
- error = vm_inject_exception2(ctx, vcpu,
- IDT_GP, 0);
- assert(error == 0);
- retval = INOUT_RESTART;
+ vm_inject_gp(ctx, vcpu);
break;
}
- error = vm_gla2gpa(ctx, vcpu, &vis->paging, gla, bytes,
- prot, iov, nitems(iov));
- assert(error == 0 || error == 1 || error == -1);
- if (error) {
- retval = (error == 1) ? INOUT_RESTART :
- INOUT_ERROR;
+ error = vm_copy_setup(ctx, vcpu, &vis->paging, gla,
+ bytes, prot, iov, nitems(iov));
+ if (error == -1) {
+ retval = -1; /* Unrecoverable error */
+ break;
+ } else if (error == 1) {
+ retval = 0; /* Resume guest to handle fault */
break;
}
if (vie_alignment_check(vis->paging.cpl, bytes,
vis->cr0, vis->rflags, gla)) {
- error = vm_inject_exception2(ctx, vcpu,
- IDT_AC, 0);
- assert(error == 0);
- return (INOUT_RESTART);
+ vm_inject_ac(ctx, vcpu, 0);
+ break;
}
val = 0;
@@ -217,8 +214,8 @@ emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict)
}
/* Restart the instruction if more iterations remain */
- if (retval == INOUT_OK && count != 0)
- retval = INOUT_RESTART;
+ if (retval == 0 && count != 0)
+ vmexit->inst_length = 0;
} else {
if (!in) {
val = vmexit->u.inout.eax & vie_size2mask(bytes);
diff --git a/usr.sbin/bhyve/inout.h b/usr.sbin/bhyve/inout.h
index f15a2c8..7f39095 100644
--- a/usr.sbin/bhyve/inout.h
+++ b/usr.sbin/bhyve/inout.h
@@ -34,13 +34,9 @@
struct vmctx;
struct vm_exit;
-/* Handler return values. */
-#define INOUT_ERROR -1
-#define INOUT_OK 0
-#define INOUT_RESTART 1
-#define INOUT_RESET 2
-#define INOUT_POWEROFF 3
-
+/*
+ * inout emulation handlers return 0 on success and -1 on failure.
+ */
typedef int (*inout_func_t)(struct vmctx *ctx, int vcpu, int in, int port,
int bytes, uint32_t *eax, void *arg);
diff --git a/usr.sbin/bhyve/mem.c b/usr.sbin/bhyve/mem.c
index 7ea630f..2a9f430 100644
--- a/usr.sbin/bhyve/mem.c
+++ b/usr.sbin/bhyve/mem.c
@@ -157,10 +157,12 @@ mem_write(void *ctx, int vcpu, uint64_t gpa, uint64_t wval, int size, void *arg)
}
int
-emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie)
+emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie,
+ struct vm_guest_paging *paging)
+
{
struct mmio_rb_range *entry;
- int err;
+ int err, immutable;
pthread_rwlock_rdlock(&mmio_rwlock);
/*
@@ -184,10 +186,28 @@ emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie)
}
assert(entry != NULL);
- err = vmm_emulate_instruction(ctx, vcpu, paddr, vie,
+
+ /*
+ * An 'immutable' memory range is guaranteed to be never removed
+ * so there is no need to hold 'mmio_rwlock' while calling the
+ * handler.
+ *
+ * XXX writes to the PCIR_COMMAND register can cause register_mem()
+ * to be called. If the guest is using PCI extended config space
+ * to modify the PCIR_COMMAND register then register_mem() can
+	 * deadlock on 'mmio_rwlock'. However, by registering the extended
+ * config space window as 'immutable' the deadlock can be avoided.
+ */
+ immutable = (entry->mr_param.flags & MEM_F_IMMUTABLE);
+ if (immutable)
+ pthread_rwlock_unlock(&mmio_rwlock);
+
+ err = vmm_emulate_instruction(ctx, vcpu, paddr, vie, paging,
mem_read, mem_write, &entry->mr_param);
- pthread_rwlock_unlock(&mmio_rwlock);
-
+
+ if (!immutable)
+ pthread_rwlock_unlock(&mmio_rwlock);
+
return (err);
}
@@ -244,6 +264,7 @@ unregister_mem(struct mem_range *memp)
mr = &entry->mr_param;
assert(mr->name == memp->name);
assert(mr->base == memp->base && mr->size == memp->size);
+ assert((mr->flags & MEM_F_IMMUTABLE) == 0);
RB_REMOVE(mmio_rb_tree, &mmio_rb_root, entry);
/* flush Per-vCPU cache */
diff --git a/usr.sbin/bhyve/mem.h b/usr.sbin/bhyve/mem.h
index 264bff9..f671eae 100644
--- a/usr.sbin/bhyve/mem.h
+++ b/usr.sbin/bhyve/mem.h
@@ -48,9 +48,11 @@ struct mem_range {
#define MEM_F_READ 0x1
#define MEM_F_WRITE 0x2
#define MEM_F_RW 0x3
+#define MEM_F_IMMUTABLE 0x4 /* mem_range cannot be unregistered */
void init_mem(void);
-int emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie);
+int emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie,
+ struct vm_guest_paging *paging);
int register_mem(struct mem_range *memp);
int register_mem_fallback(struct mem_range *memp);
diff --git a/usr.sbin/bhyve/pci_ahci.c b/usr.sbin/bhyve/pci_ahci.c
index 9f61107..214237d 100644
--- a/usr.sbin/bhyve/pci_ahci.c
+++ b/usr.sbin/bhyve/pci_ahci.c
@@ -336,8 +336,9 @@ ahci_write_fis_d2h(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd)
fis[13] = cfis[13];
if (fis[2] & ATA_S_ERROR)
p->is |= AHCI_P_IX_TFE;
+ else
+ p->ci &= ~(1 << slot);
p->tfd = tfd;
- p->ci &= ~(1 << slot);
ahci_write_fis(p, FIS_TYPE_REGD2H, fis);
}
@@ -598,10 +599,16 @@ handle_identify(struct ahci_port *p, int slot, uint8_t *cfis)
} else {
uint16_t buf[256];
uint64_t sectors;
+ uint16_t cyl;
+ uint8_t sech, heads;
sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx);
+ blockif_chs(p->bctx, &cyl, &heads, &sech);
memset(buf, 0, sizeof(buf));
buf[0] = 0x0040;
+ buf[1] = cyl;
+ buf[3] = heads;
+ buf[6] = sech;
/* TODO emulate different serial? */
ata_string((uint8_t *)(buf+10), "123456", 20);
ata_string((uint8_t *)(buf+23), "001", 8);
@@ -645,8 +652,8 @@ handle_identify(struct ahci_port *p, int slot, uint8_t *cfis)
write_prdt(p, slot, cfis, (void *)buf, sizeof(buf));
p->tfd = ATA_S_DSC | ATA_S_READY;
p->is |= AHCI_P_IX_DP;
+ p->ci &= ~(1 << slot);
}
- p->ci &= ~(1 << slot);
ahci_generate_intr(p->pr_sc);
}
@@ -688,8 +695,8 @@ handle_atapi_identify(struct ahci_port *p, int slot, uint8_t *cfis)
write_prdt(p, slot, cfis, (void *)buf, sizeof(buf));
p->tfd = ATA_S_DSC | ATA_S_READY;
p->is |= AHCI_P_IX_DHR;
+ p->ci &= ~(1 << slot);
}
- p->ci &= ~(1 << slot);
ahci_generate_intr(p->pr_sc);
}
@@ -1292,7 +1299,6 @@ ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis)
if (!p->atapi) {
p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR;
p->is |= AHCI_P_IX_TFE;
- p->ci &= ~(1 << slot);
ahci_generate_intr(p->pr_sc);
} else
handle_packet_cmd(p, slot, cfis);
@@ -1301,7 +1307,6 @@ ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis)
WPRINTF("Unsupported cmd:%02x\n", cfis[2]);
p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR;
p->is |= AHCI_P_IX_TFE;
- p->ci &= ~(1 << slot);
ahci_generate_intr(p->pr_sc);
break;
}
@@ -1369,8 +1374,11 @@ ahci_handle_port(struct ahci_port *p)
* are already in-flight.
*/
for (i = 0; (i < 32) && p->ci; i++) {
- if ((p->ci & (1 << i)) && !(p->pending & (1 << i)))
+ if ((p->ci & (1 << i)) && !(p->pending & (1 << i))) {
+ p->cmd &= ~AHCI_P_CMD_CCS_MASK;
+ p->cmd |= i << AHCI_P_CMD_CCS_SHIFT;
ahci_handle_slot(p, i);
+ }
}
}
diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c
index 458ba76..6b906ed 100644
--- a/usr.sbin/bhyve/pci_emul.c
+++ b/usr.sbin/bhyve/pci_emul.c
@@ -109,16 +109,20 @@ static uint64_t pci_emul_membase64;
#define PCI_EMUL_IOBASE 0x2000
#define PCI_EMUL_IOLIMIT 0x10000
-#define PCI_EMUL_MEMLIMIT32 0xE0000000 /* 3.5GB */
+#define PCI_EMUL_ECFG_BASE 0xE0000000 /* 3.5GB */
+#define PCI_EMUL_ECFG_SIZE (MAXBUSES * 1024 * 1024) /* 1MB per bus */
+SYSRES_MEM(PCI_EMUL_ECFG_BASE, PCI_EMUL_ECFG_SIZE);
+
+#define PCI_EMUL_MEMLIMIT32 PCI_EMUL_ECFG_BASE
#define PCI_EMUL_MEMBASE64 0xD000000000UL
#define PCI_EMUL_MEMLIMIT64 0xFD00000000UL
static struct pci_devemu *pci_emul_finddev(char *name);
-static void pci_lintr_route(struct pci_devinst *pi);
-static void pci_lintr_update(struct pci_devinst *pi);
-
-static struct mem_range pci_mem_hole;
+static void pci_lintr_route(struct pci_devinst *pi);
+static void pci_lintr_update(struct pci_devinst *pi);
+static void pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot,
+ int func, int coff, int bytes, uint32_t *val);
/*
* I/O access
@@ -1023,12 +1027,37 @@ pci_emul_fallback_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
return (0);
}
+static int
+pci_emul_ecfg_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
+ int bytes, uint64_t *val, void *arg1, long arg2)
+{
+ int bus, slot, func, coff, in;
+
+ coff = addr & 0xfff;
+ func = (addr >> 12) & 0x7;
+ slot = (addr >> 15) & 0x1f;
+ bus = (addr >> 20) & 0xff;
+ in = (dir == MEM_F_READ);
+ if (in)
+ *val = ~0UL;
+ pci_cfgrw(ctx, vcpu, in, bus, slot, func, coff, bytes, (uint32_t *)val);
+ return (0);
+}
+
+uint64_t
+pci_ecfg_base(void)
+{
+
+ return (PCI_EMUL_ECFG_BASE);
+}
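pci_emul_ecfg_handler() recovers bus/slot/function/offset from the low bits of the faulting address, which is the standard ECAM layout: 1 MB per bus, 32 KB per slot, 4 KB per function, with each 4 KB window holding config offsets 0x000 through 0xfff. The forward mapping, as a standalone sketch (ecfg_addr() is illustrative; only the 0xE0000000 base value comes from the patch):

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define ECFG_BASE       0xE0000000UL    /* PCI_EMUL_ECFG_BASE above */

/* ECAM address of a config register: base + bus<<20 + slot<<15 + func<<12 + offset. */
static uint64_t
ecfg_addr(int bus, int slot, int func, int coff)
{
        return ((uint64_t)ECFG_BASE + ((uint64_t)bus << 20) + (slot << 15) +
            (func << 12) + coff);
}

int
main(void)
{
        /* bus 0, slot 3, function 0, register 0x10 (BAR0) -> 0xe0018010 */
        printf("%#" PRIx64 "\n", ecfg_addr(0, 3, 0, 0x10));
        return (0);
}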
+
#define BUSIO_ROUNDUP 32
#define BUSMEM_ROUNDUP (1024 * 1024)
int
init_pci(struct vmctx *ctx)
{
+ struct mem_range mr;
struct pci_devemu *pde;
struct businfo *bi;
struct slotinfo *si;
@@ -1112,22 +1141,34 @@ init_pci(struct vmctx *ctx)
* The guest physical memory map looks like the following:
* [0, lowmem) guest system memory
* [lowmem, lowmem_limit) memory hole (may be absent)
- * [lowmem_limit, 4GB) PCI hole (32-bit BAR allocation)
+ * [lowmem_limit, 0xE0000000) PCI hole (32-bit BAR allocation)
+ * [0xE0000000, 0xF0000000) PCI extended config window
+ * [0xF0000000, 4GB) LAPIC, IOAPIC, HPET, firmware
* [4GB, 4GB + highmem)
- *
+ */
+
+ /*
* Accesses to memory addresses that are not allocated to system
* memory or PCI devices return 0xff's.
*/
lowmem = vm_get_lowmem_size(ctx);
+ bzero(&mr, sizeof(struct mem_range));
+ mr.name = "PCI hole";
+ mr.flags = MEM_F_RW | MEM_F_IMMUTABLE;
+ mr.base = lowmem;
+ mr.size = (4ULL * 1024 * 1024 * 1024) - lowmem;
+ mr.handler = pci_emul_fallback_handler;
+ error = register_mem_fallback(&mr);
+ assert(error == 0);
- memset(&pci_mem_hole, 0, sizeof(struct mem_range));
- pci_mem_hole.name = "PCI hole";
- pci_mem_hole.flags = MEM_F_RW;
- pci_mem_hole.base = lowmem;
- pci_mem_hole.size = (4ULL * 1024 * 1024 * 1024) - lowmem;
- pci_mem_hole.handler = pci_emul_fallback_handler;
-
- error = register_mem_fallback(&pci_mem_hole);
+ /* PCI extended config space */
+ bzero(&mr, sizeof(struct mem_range));
+ mr.name = "PCI ECFG";
+ mr.flags = MEM_F_RW | MEM_F_IMMUTABLE;
+ mr.base = PCI_EMUL_ECFG_BASE;
+ mr.size = PCI_EMUL_ECFG_SIZE;
+ mr.handler = pci_emul_ecfg_handler;
+ error = register_mem(&mr);
assert(error == 0);
return (0);
@@ -1612,41 +1653,6 @@ pci_emul_hdrtype_fixup(int bus, int slot, int off, int bytes, uint32_t *rv)
}
}
-static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff;
-
-static int
-pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
- uint32_t *eax, void *arg)
-{
- uint32_t x;
-
- if (bytes != 4) {
- if (in)
- *eax = (bytes == 2) ? 0xffff : 0xff;
- return (0);
- }
-
- if (in) {
- x = (cfgbus << 16) |
- (cfgslot << 11) |
- (cfgfunc << 8) |
- cfgoff;
- if (cfgenable)
- x |= CONF1_ENABLE;
- *eax = x;
- } else {
- x = *eax;
- cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE;
- cfgoff = x & PCI_REGMAX;
- cfgfunc = (x >> 8) & PCI_FUNCMAX;
- cfgslot = (x >> 11) & PCI_SLOTMAX;
- cfgbus = (x >> 16) & PCI_BUSMAX;
- }
-
- return (0);
-}
-INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr);
-
static uint32_t
bits_changed(uint32_t old, uint32_t new, uint32_t mask)
{
@@ -1709,41 +1715,51 @@ pci_emul_cmdwrite(struct pci_devinst *pi, uint32_t new, int bytes)
pci_lintr_update(pi);
}
-static int
-pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
- uint32_t *eax, void *arg)
+static void
+pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func,
+ int coff, int bytes, uint32_t *eax)
{
struct businfo *bi;
struct slotinfo *si;
struct pci_devinst *pi;
struct pci_devemu *pe;
- int coff, idx, needcfg;
+ int idx, needcfg;
uint64_t addr, bar, mask;
- assert(bytes == 1 || bytes == 2 || bytes == 4);
-
- if ((bi = pci_businfo[cfgbus]) != NULL) {
- si = &bi->slotinfo[cfgslot];
- pi = si->si_funcs[cfgfunc].fi_devi;
+ if ((bi = pci_businfo[bus]) != NULL) {
+ si = &bi->slotinfo[slot];
+ pi = si->si_funcs[func].fi_devi;
} else
pi = NULL;
- coff = cfgoff + (port - CONF1_DATA_PORT);
-
-#if 0
- printf("pcicfg-%s from 0x%0x of %d bytes (%d/%d/%d)\n\r",
- in ? "read" : "write", coff, bytes, cfgbus, cfgslot, cfgfunc);
-#endif
-
/*
- * Just return if there is no device at this cfgslot:cfgfunc,
- * if the guest is doing an un-aligned access, or if the config
- * address word isn't enabled.
+	 * Just return if there is no device at this slot:func or if the
+	 * guest is doing an un-aligned access.
*/
- if (!cfgenable || pi == NULL || (coff & (bytes - 1)) != 0) {
+ if (pi == NULL || (bytes != 1 && bytes != 2 && bytes != 4) ||
+ (coff & (bytes - 1)) != 0) {
if (in)
*eax = 0xffffffff;
- return (0);
+ return;
+ }
+
+ /*
+ * Ignore all writes beyond the standard config space and return all
+ * ones on reads.
+ */
+ if (coff >= PCI_REGMAX + 1) {
+ if (in) {
+ *eax = 0xffffffff;
+ /*
+ * Extended capabilities begin at offset 256 in config
+ * space. Absence of extended capabilities is signaled
+ * with all 0s in the extended capability header at
+ * offset 256.
+ */
+ if (coff <= PCI_REGMAX + 4)
+ *eax = 0x00000000;
+ }
+ return;
}
pe = pi->pi_d;
@@ -1754,8 +1770,8 @@ pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
if (in) {
/* Let the device emulation override the default handler */
if (pe->pe_cfgread != NULL) {
- needcfg = pe->pe_cfgread(ctx, vcpu, pi,
- coff, bytes, eax);
+ needcfg = pe->pe_cfgread(ctx, vcpu, pi, coff, bytes,
+ eax);
} else {
needcfg = 1;
}
@@ -1769,12 +1785,12 @@ pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
*eax = pci_get_cfgdata32(pi, coff);
}
- pci_emul_hdrtype_fixup(cfgbus, cfgslot, coff, bytes, eax);
+ pci_emul_hdrtype_fixup(bus, slot, coff, bytes, eax);
} else {
/* Let the device emulation override the default handler */
if (pe->pe_cfgwrite != NULL &&
(*pe->pe_cfgwrite)(ctx, vcpu, pi, coff, bytes, *eax) == 0)
- return (0);
+ return;
/*
* Special handling for write to BAR registers
@@ -1785,7 +1801,7 @@ pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
* 4-byte aligned.
*/
if (bytes != 4 || (coff & 0x3) != 0)
- return (0);
+ return;
idx = (coff - PCIR_BAR(0)) / 4;
mask = ~(pi->pi_bar[idx].size - 1);
switch (pi->pi_bar[idx].type) {
@@ -1843,7 +1859,57 @@ pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
CFGWRITE(pi, coff, *eax, bytes);
}
}
+}
+
+static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff;
+
+static int
+pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ uint32_t x;
+
+ if (bytes != 4) {
+ if (in)
+ *eax = (bytes == 2) ? 0xffff : 0xff;
+ return (0);
+ }
+
+ if (in) {
+ x = (cfgbus << 16) | (cfgslot << 11) | (cfgfunc << 8) | cfgoff;
+ if (cfgenable)
+ x |= CONF1_ENABLE;
+ *eax = x;
+ } else {
+ x = *eax;
+ cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE;
+ cfgoff = x & PCI_REGMAX;
+ cfgfunc = (x >> 8) & PCI_FUNCMAX;
+ cfgslot = (x >> 11) & PCI_SLOTMAX;
+ cfgbus = (x >> 16) & PCI_BUSMAX;
+ }
+
+ return (0);
+}
+INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr);
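The legacy mechanism kept above packs the target into the 0xCF8 address word: enable in bit 31, bus in bits 23:16, slot in 15:11, function in 10:8 and register in 7:0. A small sketch of that layout (the CONF1_ENABLE value is assumed to be bit 31, as in FreeBSD's pcireg.h):

#include <stdint.h>
#include <stdio.h>

#define CONF1_ENABLE	0x80000000u	/* assumed: enable bit of the 0xCF8 word */

int
main(void)
{
	uint32_t x;

	/* select bus 0, slot 3, function 0, register 0x04 */
	x = CONF1_ENABLE | (0 << 16) | (3 << 11) | (0 << 8) | 0x04;

	printf("enable=%d bus=%u slot=%u func=%u reg=%#x\n",
	    (x & CONF1_ENABLE) != 0, (x >> 16) & 0xff, (x >> 11) & 0x1f,
	    (x >> 8) & 0x7, x & 0xff);
	return (0);
}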
+
+static int
+pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ int coff;
+ assert(bytes == 1 || bytes == 2 || bytes == 4);
+
+ coff = cfgoff + (port - CONF1_DATA_PORT);
+ if (cfgenable) {
+ pci_cfgrw(ctx, vcpu, in, cfgbus, cfgslot, cfgfunc, coff, bytes,
+ eax);
+ } else {
+ /* Ignore accesses to cfgdata if not enabled by cfgaddr */
+ if (in)
+ *eax = 0xffffffff;
+ }
return (0);
}
diff --git a/usr.sbin/bhyve/pci_emul.h b/usr.sbin/bhyve/pci_emul.h
index 866ffc5..6b8c4e0 100644
--- a/usr.sbin/bhyve/pci_emul.h
+++ b/usr.sbin/bhyve/pci_emul.h
@@ -235,6 +235,7 @@ uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size);
int pci_count_lintr(int bus);
void pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg);
void pci_write_dsdt(void);
+uint64_t pci_ecfg_base(void);
int pci_bus_configured(int bus);
static __inline void
diff --git a/usr.sbin/bhyve/pci_irq.c b/usr.sbin/bhyve/pci_irq.c
index 653aeb0..20e033f 100644
--- a/usr.sbin/bhyve/pci_irq.c
+++ b/usr.sbin/bhyve/pci_irq.c
@@ -115,7 +115,7 @@ void
pci_irq_reserve(int irq)
{
- assert(irq < nitems(irq_counts));
+ assert(irq >= 0 && irq < nitems(irq_counts));
assert(pirq_cold);
assert(irq_counts[irq] == 0 || irq_counts[irq] == IRQ_DISABLED);
irq_counts[irq] = IRQ_DISABLED;
@@ -125,10 +125,10 @@ void
pci_irq_use(int irq)
{
- assert(irq < nitems(irq_counts));
+ assert(irq >= 0 && irq < nitems(irq_counts));
assert(pirq_cold);
- if (irq_counts[irq] != IRQ_DISABLED)
- irq_counts[irq]++;
+ assert(irq_counts[irq] != IRQ_DISABLED);
+ irq_counts[irq]++;
}
void
@@ -197,7 +197,7 @@ pirq_alloc_pin(struct vmctx *ctx)
{
int best_count, best_irq, best_pin, irq, pin;
- pirq_cold = 1;
+ pirq_cold = 0;
/* First, find the least-used PIRQ pin. */
best_pin = 0;
@@ -222,7 +222,7 @@ pirq_alloc_pin(struct vmctx *ctx)
best_count = irq_counts[irq];
}
}
- assert(best_irq != 0);
+ assert(best_irq >= 0);
irq_counts[best_irq]++;
pirqs[best_pin].reg = best_irq;
vm_isa_set_irq_trigger(ctx, best_irq, LEVEL_TRIGGER);
@@ -234,9 +234,6 @@ pirq_alloc_pin(struct vmctx *ctx)
int
pirq_irq(int pin)
{
-
- if (pin == -1)
- return (255);
assert(pin > 0 && pin <= nitems(pirqs));
return (pirqs[pin - 1].reg & PIRQ_IRQ);
}
diff --git a/usr.sbin/bhyve/pm.c b/usr.sbin/bhyve/pm.c
index 67126d8..f5a2d43 100644
--- a/usr.sbin/bhyve/pm.c
+++ b/usr.sbin/bhyve/pm.c
@@ -32,6 +32,7 @@ __FBSDID("$FreeBSD$");
#include <machine/vmm.h>
#include <assert.h>
+#include <errno.h>
#include <pthread.h>
#include <signal.h>
#include <vmmapi.h>
@@ -56,6 +57,8 @@ static int
reset_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
+ int error;
+
static uint8_t reset_control;
if (bytes != 1)
@@ -66,8 +69,10 @@ reset_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
reset_control = *eax;
/* Treat hard and soft resets the same. */
- if (reset_control & 0x4)
- return (INOUT_RESET);
+ if (reset_control & 0x4) {
+ error = vm_suspend(ctx, VM_SUSPEND_RESET);
+ assert(error == 0 || errno == EALREADY);
+ }
}
return (0);
}
@@ -224,6 +229,7 @@ static int
pm1_control_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
+ int error;
if (bytes != 2)
return (-1);
@@ -243,8 +249,10 @@ pm1_control_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
* says that '5' should be stored in SLP_TYP for S5.
*/
if (*eax & PM1_SLP_EN) {
- if ((pm1_control & PM1_SLP_TYP) >> 10 == 5)
- return (INOUT_POWEROFF);
+ if ((pm1_control & PM1_SLP_TYP) >> 10 == 5) {
+ error = vm_suspend(ctx, VM_SUSPEND_POWEROFF);
+ assert(error == 0 || errno == EALREADY);
+ }
}
}
return (0);
diff --git a/usr.sbin/bhyve/smbiostbl.c b/usr.sbin/bhyve/smbiostbl.c
index d560f02..28c7eb2 100644
--- a/usr.sbin/bhyve/smbiostbl.c
+++ b/usr.sbin/bhyve/smbiostbl.c
@@ -321,8 +321,8 @@ struct smbios_table_type0 smbios_type0_template = {
const char *smbios_type0_strings[] = {
"BHYVE", /* vendor string */
- __TIME__, /* bios version string */
- __DATE__, /* bios release date string */
+ "1.00", /* bios version string */
+ "03/14/2014", /* bios release date string */
NULL
};
diff --git a/usr.sbin/bhyve/task_switch.c b/usr.sbin/bhyve/task_switch.c
new file mode 100644
index 0000000..0002da8
--- /dev/null
+++ b/usr.sbin/bhyve/task_switch.c
@@ -0,0 +1,932 @@
+/*-
+ * Copyright (c) 2014 Neel Natu <neel@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/_iovec.h>
+#include <sys/mman.h>
+
+#include <x86/psl.h>
+#include <x86/segments.h>
+#include <x86/specialreg.h>
+#include <machine/vmm.h>
+#include <machine/vmm_instruction_emul.h>
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <errno.h>
+
+#include <vmmapi.h>
+
+#include "bhyverun.h"
+
+/*
+ * Using 'struct i386tss' is tempting but causes myriad sign extension
+ * issues because all of its fields are defined as signed integers.
+ */
+struct tss32 {
+ uint16_t tss_link;
+ uint16_t rsvd1;
+ uint32_t tss_esp0;
+ uint16_t tss_ss0;
+ uint16_t rsvd2;
+ uint32_t tss_esp1;
+ uint16_t tss_ss1;
+ uint16_t rsvd3;
+ uint32_t tss_esp2;
+ uint16_t tss_ss2;
+ uint16_t rsvd4;
+ uint32_t tss_cr3;
+ uint32_t tss_eip;
+ uint32_t tss_eflags;
+ uint32_t tss_eax;
+ uint32_t tss_ecx;
+ uint32_t tss_edx;
+ uint32_t tss_ebx;
+ uint32_t tss_esp;
+ uint32_t tss_ebp;
+ uint32_t tss_esi;
+ uint32_t tss_edi;
+ uint16_t tss_es;
+ uint16_t rsvd5;
+ uint16_t tss_cs;
+ uint16_t rsvd6;
+ uint16_t tss_ss;
+ uint16_t rsvd7;
+ uint16_t tss_ds;
+ uint16_t rsvd8;
+ uint16_t tss_fs;
+ uint16_t rsvd9;
+ uint16_t tss_gs;
+ uint16_t rsvd10;
+ uint16_t tss_ldt;
+ uint16_t rsvd11;
+ uint16_t tss_trap;
+ uint16_t tss_iomap;
+};
+CTASSERT(sizeof(struct tss32) == 104);
+
+#define SEL_START(sel) (((sel) & ~0x7))
+#define SEL_LIMIT(sel) (((sel) | 0x7))
+#define TSS_BUSY(type) (((type) & 0x2) != 0)
+
+static uint64_t
+GETREG(struct vmctx *ctx, int vcpu, int reg)
+{
+ uint64_t val;
+ int error;
+
+ error = vm_get_register(ctx, vcpu, reg, &val);
+ assert(error == 0);
+ return (val);
+}
+
+static void
+SETREG(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
+{
+ int error;
+
+ error = vm_set_register(ctx, vcpu, reg, val);
+ assert(error == 0);
+}
+
+static struct seg_desc
+usd_to_seg_desc(struct user_segment_descriptor *usd)
+{
+ struct seg_desc seg_desc;
+
+ seg_desc.base = (u_int)USD_GETBASE(usd);
+ if (usd->sd_gran)
+ seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff;
+ else
+ seg_desc.limit = (u_int)USD_GETLIMIT(usd);
+ seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7;
+ seg_desc.access |= usd->sd_xx << 12;
+ seg_desc.access |= usd->sd_def32 << 14;
+ seg_desc.access |= usd->sd_gran << 15;
+
+ return (seg_desc);
+}
+
+/*
+ * Inject an exception with an error code that is a segment selector.
+ * The format of the error code is described in section 6.13, "Error Code",
+ * Intel SDM volume 3.
+ *
+ * Bit 0 (EXT) denotes whether the exception occurred during delivery
+ * of an external event like an interrupt.
+ *
+ * Bit 1 (IDT) indicates whether the selector points to a gate descriptor
+ * in the IDT.
+ *
+ * Bit 2 (GDT/LDT) has the usual interpretation of Table Indicator (TI).
+ */
+static void
+sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext)
+{
+ /*
+ * Bit 2 from the selector is retained as-is in the error code.
+ *
+ * Bit 1 can be safely cleared because none of the selectors
+ * encountered during task switch emulation refer to a task
+ * gate in the IDT.
+ *
+ * Bit 0 is set depending on the value of 'ext'.
+ */
+ sel &= ~0x3;
+ if (ext)
+ sel |= 0x1;
+ vm_inject_fault(ctx, vcpu, vector, 1, sel);
+}
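Condensed, the transformation above keeps the TI bit of the faulting selector, clears the RPL bits and folds in EXT; a minimal sketch with made-up selectors:

#include <stdint.h>
#include <stdio.h>

static uint16_t
sel_errcode(uint16_t sel, int ext)
{
	sel &= ~0x3;		/* clear bits 1:0 (IDT is never set here) */
	if (ext)
		sel |= 0x1;	/* fault occurred delivering an external event */
	return (sel);
}

int
main(void)
{
	printf("%#x\n", sel_errcode(0x2b, 1));	/* GDT selector, RPL 3, EXT -> 0x29 */
	printf("%#x\n", sel_errcode(0x2f, 0));	/* LDT selector (TI set)    -> 0x2c */
	return (0);
}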
+
+/*
+ * Return 0 if the selector 'sel' is within the limits of the GDT/LDT
+ * and non-zero otherwise.
+ */
+static int
+desc_table_limit_check(struct vmctx *ctx, int vcpu, uint16_t sel)
+{
+ uint64_t base;
+ uint32_t limit, access;
+ int error, reg;
+
+ reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
+ error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
+ assert(error == 0);
+
+ if (reg == VM_REG_GUEST_LDTR) {
+ if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access))
+ return (-1);
+ }
+
+ if (limit < SEL_LIMIT(sel))
+ return (-1);
+ else
+ return (0);
+}
+
+/*
+ * Read/write the segment descriptor 'desc' into the GDT/LDT slot referenced
+ * by the selector 'sel'.
+ *
+ * Returns 0 on success.
+ * Returns 1 if an exception was injected into the guest.
+ * Returns -1 otherwise.
+ */
+static int
+desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+ uint16_t sel, struct user_segment_descriptor *desc, bool doread)
+{
+ struct iovec iov[2];
+ uint64_t base;
+ uint32_t limit, access;
+ int error, reg;
+
+ reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
+ error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
+ assert(error == 0);
+ assert(limit >= SEL_LIMIT(sel));
+
+ error = vm_copy_setup(ctx, vcpu, paging, base + SEL_START(sel),
+ sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov));
+ if (error == 0) {
+ if (doread)
+ vm_copyin(ctx, vcpu, iov, desc, sizeof(*desc));
+ else
+ vm_copyout(ctx, vcpu, desc, iov, sizeof(*desc));
+ }
+ return (error);
+}
+
+static int
+desc_table_read(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+ uint16_t sel, struct user_segment_descriptor *desc)
+{
+ return (desc_table_rw(ctx, vcpu, paging, sel, desc, true));
+}
+
+static int
+desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+ uint16_t sel, struct user_segment_descriptor *desc)
+{
+ return (desc_table_rw(ctx, vcpu, paging, sel, desc, false));
+}
+
+/*
+ * Read the TSS descriptor referenced by 'sel' into 'desc'.
+ *
+ * Returns 0 on success.
+ * Returns 1 if an exception was injected into the guest.
+ * Returns -1 otherwise.
+ */
+static int
+read_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
+ uint16_t sel, struct user_segment_descriptor *desc)
+{
+ struct vm_guest_paging sup_paging;
+ int error;
+
+ assert(!ISLDT(sel));
+ assert(IDXSEL(sel) != 0);
+
+ /* Fetch the new TSS descriptor */
+ if (desc_table_limit_check(ctx, vcpu, sel)) {
+ if (ts->reason == TSR_IRET)
+ sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+ else
+ sel_exception(ctx, vcpu, IDT_GP, sel, ts->ext);
+ return (1);
+ }
+
+ sup_paging = ts->paging;
+ sup_paging.cpl = 0; /* implicit supervisor mode */
+ error = desc_table_read(ctx, vcpu, &sup_paging, sel, desc);
+ return (error);
+}
+
+static bool
+code_desc(int sd_type)
+{
+ /* code descriptor */
+ return ((sd_type & 0x18) == 0x18);
+}
+
+static bool
+stack_desc(int sd_type)
+{
+ /* writable data descriptor */
+ return ((sd_type & 0x1A) == 0x12);
+}
+
+static bool
+data_desc(int sd_type)
+{
+ /* data descriptor or a readable code descriptor */
+ return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A);
+}
+
+static bool
+ldt_desc(int sd_type)
+{
+
+ return (sd_type == SDT_SYSLDT);
+}
+
+/*
+ * Validate the descriptor 'seg_desc' associated with 'segment'.
+ *
+ * Returns 0 on success.
+ * Returns 1 if an exception was injected into the guest.
+ * Returns -1 otherwise.
+ */
+static int
+validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
+ int segment, struct seg_desc *seg_desc)
+{
+ struct vm_guest_paging sup_paging;
+ struct user_segment_descriptor usd;
+ int error, idtvec;
+ int cpl, dpl, rpl;
+ uint16_t sel, cs;
+ bool ldtseg, codeseg, stackseg, dataseg, conforming;
+
+ ldtseg = codeseg = stackseg = dataseg = false;
+ switch (segment) {
+ case VM_REG_GUEST_LDTR:
+ ldtseg = true;
+ break;
+ case VM_REG_GUEST_CS:
+ codeseg = true;
+ break;
+ case VM_REG_GUEST_SS:
+ stackseg = true;
+ break;
+ case VM_REG_GUEST_DS:
+ case VM_REG_GUEST_ES:
+ case VM_REG_GUEST_FS:
+ case VM_REG_GUEST_GS:
+ dataseg = true;
+ break;
+ default:
+ assert(0);
+ }
+
+ /* Get the segment selector */
+ sel = GETREG(ctx, vcpu, segment);
+
+ /* LDT selector must point into the GDT */
+ if (ldtseg && ISLDT(sel)) {
+ sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+ return (1);
+ }
+
+ /* Descriptor table limit check */
+ if (desc_table_limit_check(ctx, vcpu, sel)) {
+ sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+ return (1);
+ }
+
+ /* NULL selector */
+ if (IDXSEL(sel) == 0) {
+ /* Code and stack segment selectors cannot be NULL */
+ if (codeseg || stackseg) {
+ sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+ return (1);
+ }
+ seg_desc->base = 0;
+ seg_desc->limit = 0;
+ seg_desc->access = 0x10000; /* unusable */
+ return (0);
+ }
+
+ /* Read the descriptor from the GDT/LDT */
+ sup_paging = ts->paging;
+ sup_paging.cpl = 0; /* implicit supervisor mode */
+ error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd);
+ if (error)
+ return (error);
+
+ /* Verify that the descriptor type is compatible with the segment */
+ if ((ldtseg && !ldt_desc(usd.sd_type)) ||
+ (codeseg && !code_desc(usd.sd_type)) ||
+ (dataseg && !data_desc(usd.sd_type)) ||
+ (stackseg && !stack_desc(usd.sd_type))) {
+ sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+ return (1);
+ }
+
+ /* Segment must be marked present */
+ if (!usd.sd_p) {
+ if (ldtseg)
+ idtvec = IDT_TS;
+ else if (stackseg)
+ idtvec = IDT_SS;
+ else
+ idtvec = IDT_NP;
+ sel_exception(ctx, vcpu, idtvec, sel, ts->ext);
+ return (1);
+ }
+
+ cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
+ cpl = cs & SEL_RPL_MASK;
+ rpl = sel & SEL_RPL_MASK;
+ dpl = usd.sd_dpl;
+
+ if (stackseg && (rpl != cpl || dpl != cpl)) {
+ sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+ return (1);
+ }
+
+ if (codeseg) {
+ conforming = (usd.sd_type & 0x4) ? true : false;
+ if ((conforming && (cpl < dpl)) ||
+ (!conforming && (cpl != dpl))) {
+ sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+ return (1);
+ }
+ }
+
+ if (dataseg) {
+ /*
+	 * A data segment is always non-conforming except when its
+ * descriptor is a readable, conforming code segment.
+ */
+ if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0)
+ conforming = true;
+ else
+ conforming = false;
+
+ if (!conforming && (rpl > dpl || cpl > dpl)) {
+ sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+ return (1);
+ }
+ }
+ *seg_desc = usd_to_seg_desc(&usd);
+ return (0);
+}
+
+static void
+tss32_save(struct vmctx *ctx, int vcpu, struct vm_task_switch *task_switch,
+ uint32_t eip, struct tss32 *tss, struct iovec *iov)
+{
+
+ /* General purpose registers */
+ tss->tss_eax = GETREG(ctx, vcpu, VM_REG_GUEST_RAX);
+ tss->tss_ecx = GETREG(ctx, vcpu, VM_REG_GUEST_RCX);
+ tss->tss_edx = GETREG(ctx, vcpu, VM_REG_GUEST_RDX);
+ tss->tss_ebx = GETREG(ctx, vcpu, VM_REG_GUEST_RBX);
+ tss->tss_esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
+ tss->tss_ebp = GETREG(ctx, vcpu, VM_REG_GUEST_RBP);
+ tss->tss_esi = GETREG(ctx, vcpu, VM_REG_GUEST_RSI);
+ tss->tss_edi = GETREG(ctx, vcpu, VM_REG_GUEST_RDI);
+
+ /* Segment selectors */
+ tss->tss_es = GETREG(ctx, vcpu, VM_REG_GUEST_ES);
+ tss->tss_cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
+ tss->tss_ss = GETREG(ctx, vcpu, VM_REG_GUEST_SS);
+ tss->tss_ds = GETREG(ctx, vcpu, VM_REG_GUEST_DS);
+ tss->tss_fs = GETREG(ctx, vcpu, VM_REG_GUEST_FS);
+ tss->tss_gs = GETREG(ctx, vcpu, VM_REG_GUEST_GS);
+
+ /* eflags and eip */
+ tss->tss_eflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
+ if (task_switch->reason == TSR_IRET)
+ tss->tss_eflags &= ~PSL_NT;
+ tss->tss_eip = eip;
+
+ /* Copy updated old TSS into guest memory */
+ vm_copyout(ctx, vcpu, tss, iov, sizeof(struct tss32));
+}
+
+static void
+update_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *sd)
+{
+ int error;
+
+ error = vm_set_desc(ctx, vcpu, reg, sd->base, sd->limit, sd->access);
+ assert(error == 0);
+}
+
+/*
+ * Update the vcpu registers to reflect the state of the new task.
+ *
+ * Returns 0 on success.
+ * Returns 1 if an exception was injected into the guest.
+ * Returns -1 otherwise.
+ */
+static int
+tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
+ uint16_t ot_sel, struct tss32 *tss, struct iovec *iov)
+{
+ struct seg_desc seg_desc, seg_desc2;
+ uint64_t *pdpte, maxphyaddr, reserved;
+ uint32_t eflags;
+ int error, i;
+ bool nested;
+
+ nested = false;
+ if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) {
+ tss->tss_link = ot_sel;
+ nested = true;
+ }
+
+ eflags = tss->tss_eflags;
+ if (nested)
+ eflags |= PSL_NT;
+
+ /* LDTR */
+ SETREG(ctx, vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt);
+
+	/* PDBR */
+ if (ts->paging.paging_mode != PAGING_MODE_FLAT) {
+ if (ts->paging.paging_mode == PAGING_MODE_PAE) {
+ /*
+ * XXX Assuming 36-bit MAXPHYADDR.
+ */
+ maxphyaddr = (1UL << 36) - 1;
+ pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32);
+ for (i = 0; i < 4; i++) {
+ /* Check reserved bits if the PDPTE is valid */
+ if (!(pdpte[i] & 0x1))
+ continue;
+ /*
+ * Bits 2:1, 8:5 and bits above the processor's
+ * maximum physical address are reserved.
+ */
+ reserved = ~maxphyaddr | 0x1E6;
+ if (pdpte[i] & reserved) {
+ vm_inject_gp(ctx, vcpu);
+ return (1);
+ }
+ }
+ SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]);
+ SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]);
+ SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]);
+ SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]);
+ }
+ SETREG(ctx, vcpu, VM_REG_GUEST_CR3, tss->tss_cr3);
+ ts->paging.cr3 = tss->tss_cr3;
+ }
+
+ /* eflags and eip */
+ SETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS, eflags);
+ SETREG(ctx, vcpu, VM_REG_GUEST_RIP, tss->tss_eip);
+
+ /* General purpose registers */
+ SETREG(ctx, vcpu, VM_REG_GUEST_RAX, tss->tss_eax);
+ SETREG(ctx, vcpu, VM_REG_GUEST_RCX, tss->tss_ecx);
+ SETREG(ctx, vcpu, VM_REG_GUEST_RDX, tss->tss_edx);
+ SETREG(ctx, vcpu, VM_REG_GUEST_RBX, tss->tss_ebx);
+ SETREG(ctx, vcpu, VM_REG_GUEST_RSP, tss->tss_esp);
+ SETREG(ctx, vcpu, VM_REG_GUEST_RBP, tss->tss_ebp);
+ SETREG(ctx, vcpu, VM_REG_GUEST_RSI, tss->tss_esi);
+ SETREG(ctx, vcpu, VM_REG_GUEST_RDI, tss->tss_edi);
+
+ /* Segment selectors */
+ SETREG(ctx, vcpu, VM_REG_GUEST_ES, tss->tss_es);
+ SETREG(ctx, vcpu, VM_REG_GUEST_CS, tss->tss_cs);
+ SETREG(ctx, vcpu, VM_REG_GUEST_SS, tss->tss_ss);
+ SETREG(ctx, vcpu, VM_REG_GUEST_DS, tss->tss_ds);
+ SETREG(ctx, vcpu, VM_REG_GUEST_FS, tss->tss_fs);
+ SETREG(ctx, vcpu, VM_REG_GUEST_GS, tss->tss_gs);
+
+ /*
+ * If this is a nested task then write out the new TSS to update
+ * the previous link field.
+ */
+ if (nested)
+ vm_copyout(ctx, vcpu, tss, iov, sizeof(*tss));
+
+ /* Validate segment descriptors */
+ error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc);
+ if (error)
+ return (error);
+ update_seg_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &seg_desc);
+
+ /*
+ * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3.
+ *
+ * The SS and CS attribute checks on VM-entry are inter-dependent so
+ * we need to make sure that both segments are valid before updating
+ * either of them. This ensures that the VMCS state can pass the
+ * VM-entry checks so the guest can handle any exception injected
+ * during task switch emulation.
+ */
+ error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc);
+ if (error)
+ return (error);
+ error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2);
+ if (error)
+ return (error);
+ update_seg_desc(ctx, vcpu, VM_REG_GUEST_CS, &seg_desc);
+ update_seg_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc2);
+ ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK;
+
+ error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc);
+ if (error)
+ return (error);
+ update_seg_desc(ctx, vcpu, VM_REG_GUEST_DS, &seg_desc);
+
+ error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc);
+ if (error)
+ return (error);
+ update_seg_desc(ctx, vcpu, VM_REG_GUEST_ES, &seg_desc);
+
+ error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc);
+ if (error)
+ return (error);
+ update_seg_desc(ctx, vcpu, VM_REG_GUEST_FS, &seg_desc);
+
+ error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc);
+ if (error)
+ return (error);
+ update_seg_desc(ctx, vcpu, VM_REG_GUEST_GS, &seg_desc);
+
+ return (0);
+}
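The 0x1E6 constant in the PAE check above is just bits 2:1 (0x6) plus bits 8:5 (0x1E0); a short sketch of the resulting mask under the same 36-bit MAXPHYADDR assumption the patch hard-codes:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t maxphyaddr, reserved;

	maxphyaddr = (1ULL << 36) - 1;
	reserved = ~maxphyaddr | 0x1E6;		/* bits 2:1, 8:5 and >= 36 */

	printf("reserved mask %#" PRIx64 "\n", reserved);
	/* a valid PDPTE: present bit plus a 4K-aligned page-directory address */
	printf("0x1001 ok? %s\n", (0x1001ULL & reserved) == 0 ? "yes" : "no");
	/* bit 5 set: reserved, so the guest would get a #GP */
	printf("0x1021 ok? %s\n", (0x1021ULL & reserved) == 0 ? "yes" : "no");
	return (0);
}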
+
+/*
+ * Push an error code on the stack of the new task. This is needed if the
+ * task switch was triggered by a hardware exception that causes an error
+ * code to be saved (e.g. #PF).
+ *
+ * Returns 0 on success.
+ * Returns 1 if an exception was injected into the guest.
+ * Returns -1 otherwise.
+ */
+static int
+push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+ int task_type, uint32_t errcode)
+{
+ struct iovec iov[2];
+ struct seg_desc seg_desc;
+ int stacksize, bytes, error;
+ uint64_t gla, cr0, rflags;
+ uint32_t esp;
+ uint16_t stacksel;
+
+ cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
+ rflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
+ stacksel = GETREG(ctx, vcpu, VM_REG_GUEST_SS);
+
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc.base,
+ &seg_desc.limit, &seg_desc.access);
+ assert(error == 0);
+
+ /*
+ * Section "Error Code" in the Intel SDM vol 3: the error code is
+ * pushed on the stack as a doubleword or word (depending on the
+ * default interrupt, trap or task gate size).
+ */
+ if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS)
+ bytes = 4;
+ else
+ bytes = 2;
+
+ /*
+ * PUSH instruction from Intel SDM vol 2: the 'B' flag in the
+ * stack-segment descriptor determines the size of the stack
+ * pointer outside of 64-bit mode.
+ */
+ if (SEG_DESC_DEF32(seg_desc.access))
+ stacksize = 4;
+ else
+ stacksize = 2;
+
+ esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
+ esp -= bytes;
+
+ if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS,
+ &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) {
+ sel_exception(ctx, vcpu, IDT_SS, stacksel, 1);
+ return (1);
+ }
+
+ if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) {
+ vm_inject_ac(ctx, vcpu, 1);
+ return (1);
+ }
+
+ error = vm_copy_setup(ctx, vcpu, paging, gla, bytes, PROT_WRITE,
+ iov, nitems(iov));
+ if (error)
+ return (error);
+
+ vm_copyout(ctx, vcpu, &errcode, iov, bytes);
+ SETREG(ctx, vcpu, VM_REG_GUEST_RSP, esp);
+ return (0);
+}
+
+/*
+ * Evaluate return value from helper functions and potentially return to
+ * the VM run loop.
+ * 0: success
+ * +1: an exception was injected into the guest vcpu
+ * -1: unrecoverable/programming error
+ */
+#define CHKERR(x) \
+ do { \
+ assert(((x) == 0) || ((x) == 1) || ((x) == -1)); \
+ if ((x) == -1) \
+ return (VMEXIT_ABORT); \
+ else if ((x) == 1) \
+ return (VMEXIT_CONTINUE); \
+ } while (0)
+
+int
+vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+ struct seg_desc nt;
+ struct tss32 oldtss, newtss;
+ struct vm_task_switch *task_switch;
+ struct vm_guest_paging *paging, sup_paging;
+ struct user_segment_descriptor nt_desc, ot_desc;
+ struct iovec nt_iov[2], ot_iov[2];
+ uint64_t cr0, ot_base;
+ uint32_t eip, ot_lim, access;
+ int error, ext, minlimit, nt_type, ot_type, vcpu;
+ enum task_switch_reason reason;
+ uint16_t nt_sel, ot_sel;
+
+ task_switch = &vmexit->u.task_switch;
+ nt_sel = task_switch->tsssel;
+ ext = vmexit->u.task_switch.ext;
+ reason = vmexit->u.task_switch.reason;
+ paging = &vmexit->u.task_switch.paging;
+ vcpu = *pvcpu;
+
+ assert(paging->cpu_mode == CPU_MODE_PROTECTED);
+
+ /*
+ * Section 4.6, "Access Rights" in Intel SDM Vol 3.
+ * The following page table accesses are implicitly supervisor mode:
+ * - accesses to GDT or LDT to load segment descriptors
+ * - accesses to the task state segment during task switch
+ */
+ sup_paging = *paging;
+ sup_paging.cpl = 0; /* implicit supervisor mode */
+
+ /* Fetch the new TSS descriptor */
+ error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc);
+ CHKERR(error);
+
+ nt = usd_to_seg_desc(&nt_desc);
+
+ /* Verify the type of the new TSS */
+ nt_type = SEG_DESC_TYPE(nt.access);
+ if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS &&
+ nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) {
+ sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
+ goto done;
+ }
+
+ /* TSS descriptor must have present bit set */
+ if (!SEG_DESC_PRESENT(nt.access)) {
+ sel_exception(ctx, vcpu, IDT_NP, nt_sel, ext);
+ goto done;
+ }
+
+ /*
+ * TSS must have a minimum length of 104 bytes for a 32-bit TSS and
+ * 44 bytes for a 16-bit TSS.
+ */
+ if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS)
+ minlimit = 104 - 1;
+ else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS)
+ minlimit = 44 - 1;
+ else
+ minlimit = 0;
+
+ assert(minlimit > 0);
+ if (nt.limit < minlimit) {
+ sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
+ goto done;
+ }
+
+ /* TSS must be busy if task switch is due to IRET */
+ if (reason == TSR_IRET && !TSS_BUSY(nt_type)) {
+ sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
+ goto done;
+ }
+
+ /*
+ * TSS must be available (not busy) if task switch reason is
+ * CALL, JMP, exception or interrupt.
+ */
+ if (reason != TSR_IRET && TSS_BUSY(nt_type)) {
+ sel_exception(ctx, vcpu, IDT_GP, nt_sel, ext);
+ goto done;
+ }
+
+ /* Fetch the new TSS */
+ error = vm_copy_setup(ctx, vcpu, &sup_paging, nt.base, minlimit + 1,
+ PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov));
+ CHKERR(error);
+ vm_copyin(ctx, vcpu, nt_iov, &newtss, minlimit + 1);
+
+ /* Get the old TSS selector from the guest's task register */
+ ot_sel = GETREG(ctx, vcpu, VM_REG_GUEST_TR);
+ if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) {
+ /*
+ * This might happen if a task switch was attempted without
+ * ever loading the task register with LTR. In this case the
+ * TR would contain the values from power-on:
+ * (sel = 0, base = 0, limit = 0xffff).
+ */
+ sel_exception(ctx, vcpu, IDT_TS, ot_sel, task_switch->ext);
+ goto done;
+ }
+
+ /* Get the old TSS base and limit from the guest's task register */
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim,
+ &access);
+ assert(error == 0);
+ assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access));
+ ot_type = SEG_DESC_TYPE(access);
+ assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY);
+
+ /* Fetch the old TSS descriptor */
+ error = read_tss_descriptor(ctx, vcpu, task_switch, ot_sel, &ot_desc);
+ CHKERR(error);
+
+ /* Get the old TSS */
+ error = vm_copy_setup(ctx, vcpu, &sup_paging, ot_base, minlimit + 1,
+ PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov));
+ CHKERR(error);
+ vm_copyin(ctx, vcpu, ot_iov, &oldtss, minlimit + 1);
+
+ /*
+ * Clear the busy bit in the old TSS descriptor if the task switch
+	 * was due to an IRET or JMP instruction.
+ */
+ if (reason == TSR_IRET || reason == TSR_JMP) {
+ ot_desc.sd_type &= ~0x2;
+ error = desc_table_write(ctx, vcpu, &sup_paging, ot_sel,
+ &ot_desc);
+ CHKERR(error);
+ }
+
+ if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) {
+ fprintf(stderr, "Task switch to 16-bit TSS not supported\n");
+ return (VMEXIT_ABORT);
+ }
+
+ /* Save processor state in old TSS */
+ eip = vmexit->rip + vmexit->inst_length;
+ tss32_save(ctx, vcpu, task_switch, eip, &oldtss, ot_iov);
+
+ /*
+ * If the task switch was triggered for any reason other than IRET
+ * then set the busy bit in the new TSS descriptor.
+ */
+ if (reason != TSR_IRET) {
+ nt_desc.sd_type |= 0x2;
+ error = desc_table_write(ctx, vcpu, &sup_paging, nt_sel,
+ &nt_desc);
+ CHKERR(error);
+ }
+
+ /* Update task register to point at the new TSS */
+ SETREG(ctx, vcpu, VM_REG_GUEST_TR, nt_sel);
+
+ /* Update the hidden descriptor state of the task register */
+ nt = usd_to_seg_desc(&nt_desc);
+ update_seg_desc(ctx, vcpu, VM_REG_GUEST_TR, &nt);
+
+ /* Set CR0.TS */
+ cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
+ SETREG(ctx, vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS);
+
+ /*
+ * We are now committed to the task switch. Any exceptions encountered
+ * after this point will be handled in the context of the new task and
+ * the saved instruction pointer will belong to the new task.
+ */
+ vmexit->rip = newtss.tss_eip;
+ vmexit->inst_length = 0;
+
+ /* Load processor state from new TSS */
+ error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov);
+ CHKERR(error);
+
+ /*
+ * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception
+ * caused an error code to be generated, this error code is copied
+ * to the stack of the new task.
+ */
+ if (task_switch->errcode_valid) {
+ assert(task_switch->ext);
+ assert(task_switch->reason == TSR_IDT_GATE);
+ error = push_errcode(ctx, vcpu, &task_switch->paging, nt_type,
+ task_switch->errcode);
+ CHKERR(error);
+ }
+
+ /*
+ * Treatment of virtual-NMI blocking if NMI is delivered through
+ * a task gate.
+ *
+ * Section "Architectural State Before A VM Exit", Intel SDM, Vol3:
+ * If the virtual NMIs VM-execution control is 1, VM entry injects
+ * an NMI, and delivery of the NMI causes a task switch that causes
+ * a VM exit, virtual-NMI blocking is in effect before the VM exit
+ * commences.
+ *
+ * Thus, virtual-NMI blocking is in effect at the time of the task
+ * switch VM exit.
+ */
+
+ /*
+ * Treatment of virtual-NMI unblocking on IRET from NMI handler task.
+ *
+ * Section "Changes to Instruction Behavior in VMX Non-Root Operation"
+ * If "virtual NMIs" control is 1 IRET removes any virtual-NMI blocking.
+ * This unblocking of virtual-NMI occurs even if IRET causes a fault.
+ *
+ * Thus, virtual-NMI blocking is cleared at the time of the task switch
+ * VM exit.
+ */
+
+ /*
+ * If the task switch was triggered by an event delivered through
+ * the IDT then extinguish the pending event from the vcpu's
+ * exitintinfo.
+ */
+ if (task_switch->reason == TSR_IDT_GATE) {
+ error = vm_set_intinfo(ctx, vcpu, 0);
+ assert(error == 0);
+ }
+
+ /*
+ * XXX should inject debug exception if 'T' bit is 1
+ */
+done:
+ return (VMEXIT_CONTINUE);
+}
diff --git a/usr.sbin/bhyve/virtio.c b/usr.sbin/bhyve/virtio.c
index 4e58dd6..1f27300 100644
--- a/usr.sbin/bhyve/virtio.c
+++ b/usr.sbin/bhyve/virtio.c
@@ -437,7 +437,7 @@ vq_endchains(struct vqueue_info *vq, int used_all_avail)
if (used_all_avail &&
(vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY))
intr = 1;
- else if (vs->vs_flags & VIRTIO_EVENT_IDX) {
+ else if (vs->vs_negotiated_caps & VIRTIO_RING_F_EVENT_IDX) {
event_idx = VQ_USED_EVENT_IDX(vq);
/*
* This calculation is per docs and the kernel
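The calculation referred to in the truncated comment above is the standard event-index test from the virtio specification; shown here as a stand-alone sketch rather than bhyve's exact code:

#include <stdint.h>
#include <stdio.h>

static int
need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old_idx)
{
	/* true if the used index has just moved past the guest's event_idx */
	return ((uint16_t)(new_idx - event_idx - 1) <
	    (uint16_t)(new_idx - old_idx));
}

int
main(void)
{
	printf("%d\n", need_event(5, 6, 4));	/* crossed event_idx 5 -> 1 */
	printf("%d\n", need_event(9, 6, 4));	/* not reached yet     -> 0 */
	return (0);
}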
diff --git a/usr.sbin/bhyve/virtio.h b/usr.sbin/bhyve/virtio.h
index 01b5f7b..1f29dfa 100644
--- a/usr.sbin/bhyve/virtio.h
+++ b/usr.sbin/bhyve/virtio.h
@@ -352,7 +352,7 @@ struct virtio_consts {
/* called to read config regs */
int (*vc_cfgwrite)(void *, int, int, uint32_t);
/* called to write config regs */
- uint32_t vc_hv_caps; /* hypervisor-provided capabilities */
+ uint64_t vc_hv_caps; /* hypervisor-provided capabilities */
};
/*
diff --git a/usr.sbin/bhyvectl/bhyvectl.c b/usr.sbin/bhyvectl/bhyvectl.c
index e77f0d7..b6006b7 100644
--- a/usr.sbin/bhyvectl/bhyvectl.c
+++ b/usr.sbin/bhyvectl/bhyvectl.c
@@ -195,7 +195,8 @@ usage(void)
" [--force-reset]\n"
" [--force-poweroff]\n"
" [--get-active-cpus]\n"
- " [--get-suspended-cpus]\n",
+ " [--get-suspended-cpus]\n"
+ " [--get-intinfo]\n",
progname);
exit(1);
}
@@ -205,6 +206,7 @@ static int inject_nmi, assert_lapic_lvt;
static int force_reset, force_poweroff;
static const char *capname;
static int create, destroy, get_lowmem, get_highmem;
+static int get_intinfo;
static int get_active_cpus, get_suspended_cpus;
static uint64_t memsize;
static int set_cr0, get_cr0, set_cr3, get_cr3, set_cr4, get_cr4;
@@ -412,6 +414,37 @@ print_cpus(const char *banner, const cpuset_t *cpus)
printf("\n");
}
+static void
+print_intinfo(const char *banner, uint64_t info)
+{
+ int type;
+
+ printf("%s:\t", banner);
+ if (info & VM_INTINFO_VALID) {
+ type = info & VM_INTINFO_TYPE;
+ switch (type) {
+ case VM_INTINFO_HWINTR:
+ printf("extint");
+ break;
+ case VM_INTINFO_NMI:
+ printf("nmi");
+ break;
+ case VM_INTINFO_SWINTR:
+ printf("swint");
+ break;
+ default:
+ printf("exception");
+ break;
+ }
+ printf(" vector %d", (int)VM_INTINFO_VECTOR(info));
+ if (info & VM_INTINFO_DEL_ERRCODE)
+ printf(" errcode %#x", (u_int)(info >> 32));
+ } else {
+ printf("n/a");
+ }
+ printf("\n");
+}
+
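A sketch of the interrupt-info word decoded above. The field positions are assumed to mirror the VT-x interruption-information format (vector in bits 7:0, type in 10:8, error-code-valid in bit 11, valid in bit 31, error code in 63:32); the authoritative VM_INTINFO_* definitions live in the vmm.h changes of this commit.

#include <stdint.h>
#include <stdio.h>

#define INFO_VALID	0x80000000ULL	/* assumed bit positions, see vmm.h */
#define INFO_ERRCODE	0x00000800ULL
#define TYPE_HWEXC	(3ULL << 8)

int
main(void)
{
	/* a pending #PF (vector 14) with error code 0x2 */
	uint64_t info = (2ULL << 32) | INFO_VALID | INFO_ERRCODE | TYPE_HWEXC | 14;

	if (info & INFO_VALID)
		printf("exception vector %d errcode %#x\n",
		    (int)(info & 0xff), (unsigned int)(info >> 32));
	return (0);
}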
int
main(int argc, char *argv[])
{
@@ -420,7 +453,7 @@ main(int argc, char *argv[])
vm_paddr_t gpa, gpa_pmap;
size_t len;
struct vm_exit vmexit;
- uint64_t ctl, eptp, bm, addr, u64, pteval[4], *pte;
+ uint64_t ctl, eptp, bm, addr, u64, pteval[4], *pte, info[2];
struct vmctx *ctx;
int wired;
cpuset_t cpus;
@@ -595,6 +628,7 @@ main(int argc, char *argv[])
{ "force-poweroff", NO_ARG, &force_poweroff, 1 },
{ "get-active-cpus", NO_ARG, &get_active_cpus, 1 },
{ "get-suspended-cpus", NO_ARG, &get_suspended_cpus, 1 },
+ { "get-intinfo", NO_ARG, &get_intinfo, 1 },
{ NULL, 0, NULL, 0 }
};
@@ -1566,6 +1600,14 @@ main(int argc, char *argv[])
print_cpus("suspended cpus", &cpus);
}
+ if (!error && (get_intinfo || get_all)) {
+ error = vm_get_intinfo(ctx, vcpu, &info[0], &info[1]);
+ if (!error) {
+ print_intinfo("pending", info[0]);
+ print_intinfo("current", info[1]);
+ }
+ }
+
if (!error && run) {
error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip);
assert(error == 0);
diff --git a/usr.sbin/bhyveload/bhyveload.8 b/usr.sbin/bhyveload/bhyveload.8
index 3a300cc..0bdf151 100644
--- a/usr.sbin/bhyveload/bhyveload.8
+++ b/usr.sbin/bhyveload/bhyveload.8
@@ -35,11 +35,11 @@
guest inside a bhyve virtual machine
.Sh SYNOPSIS
.Nm
-.Op Fl m Ar mem-size
+.Op Fl c Ar cons-dev
.Op Fl d Ar disk-path
-.Op Fl h Ar host-path
.Op Fl e Ar name=value
-.Op Fl c Ar cons-dev
+.Op Fl h Ar host-path
+.Op Fl m Ar mem-size
.Ar vmname
.Sh DESCRIPTION
.Nm
@@ -62,6 +62,32 @@ and will be created if it does not already exist.
.Sh OPTIONS
The following options are available:
.Bl -tag -width indent
+.It Fl c Ar cons-dev
+.Ar cons-dev
+is a
+.Xr tty 4
+device to use for
+.Nm
+terminal I/O.
+.Pp
+The text string "stdio" is also accepted and selects the use of
+unbuffered standard I/O. This is the default value.
+.It Fl d Ar disk-path
+The
+.Ar disk-path
+is the pathname of the guest's boot disk image.
+.It Fl e Ar name=value
+Set the FreeBSD loader environment variable
+.Ar name
+to
+.Ar value .
+.Pp
+The option may be used more than once to set more than one environment
+variable.
+.It Fl h Ar host-path
+The
+.Ar host-path
+is the directory at the top of the guest's boot filesystem.
.It Fl m Ar mem-size Xo
.Sm off
.Op Cm K | k | M | m | G | g | T | t
@@ -85,32 +111,6 @@ respectively.
The default value of
.Ar mem-size
is 256M.
-.It Fl d Ar disk-path
-The
-.Ar disk-path
-is the pathname of the guest's boot disk image.
-.It Fl h Ar host-path
-The
-.Ar host-path
-is the directory at the top of the guest's boot filesystem.
-.It Fl e Ar name=value
-Set the FreeBSD loader environment variable
-.Ar name
-to
-.Ar value .
-.Pp
-The option may be used more than once to set more than one environment
-variable.
-.It Fl c Ar cons-dev
-.Ar cons-dev
-is a
-.Xr tty 4
-device to use for
-.Nm
-terminal I/O.
-.Pp
-The text string "stdio" is also accepted and selects the use of
-unbuffered standard I/O. This is the default value.
.El
.Sh EXAMPLES
To create a virtual machine named
diff --git a/usr.sbin/bhyveload/bhyveload.c b/usr.sbin/bhyveload/bhyveload.c
index ff6b269..eaf71a8 100644
--- a/usr.sbin/bhyveload/bhyveload.c
+++ b/usr.sbin/bhyveload/bhyveload.c
@@ -629,8 +629,8 @@ usage(void)
{
fprintf(stderr,
- "usage: %s [-m mem-size] [-d <disk-path>] [-h <host-path>]\n"
- " %*s [-e <name=value>] [-c <console-device>] <vmname>\n",
+ "usage: %s [-c <console-device>] [-d <disk-path>] [-e <name=value>]\n"
+ " %*s [-h <host-path>] [-m mem-size] <vmname>\n",
progname,
(int)strlen(progname), "");
exit(1);