author     jhb <jhb@FreeBSD.org>	2014-02-23 00:46:05 +0000
committer  jhb <jhb@FreeBSD.org>	2014-02-23 00:46:05 +0000
commit     69d17427cae2b573203a13c2fe8cac0865c3cfdc (patch)
tree       74c089432fac1660f52a522e3e53195374381e38 /sys/amd64
parent     04e37d68ee180962d9cdaef4ffd90789f36548ab (diff)
MFC 258859,259081,259085,259205,259213,259275,259482,259537,259702,259779:
Several changes to the local APIC support in bhyve:
- Rename 'vm_interrupt_hostcpu()' to 'vcpu_notify_event()'.
- If a vcpu disables its local apic and then executes a 'HLT' then spin down the vcpu and destroy its thread context. Also modify the 'HLT' processing to ignore pending interrupts in the IRR if interrupts have been disabled by the guest. The interrupt cannot be injected into the guest in any case so resuming it is futile.
- Use callout(9) to drive the vlapic timer instead of clocking it on each VM exit.
- When the guest is bringing up the APs in the x2APIC mode a write to the ICR register will now trigger a return to userspace with an exitcode of VM_EXITCODE_SPINUP_AP.
- Change the vlapic timer lock to be a spinlock because the vlapic can be accessed from within a critical section (vm run loop) when guest is using x2apic mode.
- Fix the vlapic version register.
- Add a command to bhyvectl to inject an NMI on a specific vcpu.
- Add an API to deliver message signalled interrupts to vcpus. This allows callers to treat the MSI 'addr' and 'data' fields as opaque and also lets bhyve implement multiple destination modes: physical, flat and clustered.
- Rename the ambiguously named 'vm_setup_msi()' and 'vm_setup_msix()' to 'vm_setup_pptdev_msi()' and 'vm_setup_pptdev_msix()' respectively.
- Consolidate the virtual apic initialization in a single function: vlapic_reset()
- Add a generic routine to trigger an LVT interrupt that supports both fixed and NMI delivery modes.
- Add an ioctl and bhyvectl command to trigger local interrupts inside a guest. In particular, a global NMI similar to that raised by SERR# or PERR# can be simulated by asserting LINT1 on all vCPUs.
- Extend the LVT table in the vCPU local APIC to support CMCI.
- Flesh out the local APIC error reporting a bit to cache errors and report them via ESR when ESR is written to. Add support for asserting the error LVT when an error occurs. Raise illegal vector errors when attempting to signal an invalid vector for an interrupt or when sending an IPI.
- Export table entries in the MADT and MP Table advertising the stock x86 config of LINT0 set to ExtInt and LINT1 wired to NMI.
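The new VM_LAPIC_MSI and VM_LAPIC_LOCAL_IRQ ioctls added below take the MSI 'addr'/'msg' pair and the LVT index directly. A minimal userspace sketch of driving them follows; the structures and ioctl names come from this diff, but the /dev/vmm/<name> device path, the header locations and the APIC_LVT_LINT1 index are assumptions about the surrounding bhyve/FreeBSD code, not part of this commit:

/*
 * Hypothetical userspace sketch; illustration only, not part of this commit.
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <machine/vmm_dev.h>	/* struct vm_lapic_msi, VM_LAPIC_MSI, ... */
#include <x86/apicvar.h>	/* assumed home of the APIC_LVT_* indices */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int
demo_lapic_ioctls(const char *vmname)
{
	struct vm_lapic_msi msi;
	struct vm_lapic_irq irq;
	char path[64];
	int fd;

	snprintf(path, sizeof(path), "/dev/vmm/%s", vmname);
	if ((fd = open(path, O_RDWR)) < 0)
		return (-1);

	/* Edge-triggered MSI: physical dest APIC ID 1, fixed mode, vector 0x30. */
	msi.addr = 0xfee00000 | (1 << 12);
	msi.msg = 0x30;
	(void) ioctl(fd, VM_LAPIC_MSI, &msi);

	/* Simulate a global NMI by asserting LINT1 on all vcpus (cpuid == -1). */
	irq.cpuid = -1;
	irq.vector = APIC_LVT_LINT1;
	(void) ioctl(fd, VM_LAPIC_LOCAL_IRQ, &irq);

	close(fd);
	return (0);
}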
Diffstat (limited to 'sys/amd64')
-rw-r--r--  sys/amd64/include/vmm.h        6
-rw-r--r--  sys/amd64/include/vmm_dev.h   17
-rw-r--r--  sys/amd64/vmm/intel/vmx.c     27
-rw-r--r--  sys/amd64/vmm/io/ppt.c        24
-rw-r--r--  sys/amd64/vmm/io/ppt.h         4
-rw-r--r--  sys/amd64/vmm/io/vhpet.c      25
-rw-r--r--  sys/amd64/vmm/io/vioapic.c    58
-rw-r--r--  sys/amd64/vmm/io/vlapic.c    812
-rw-r--r--  sys/amd64/vmm/io/vlapic.h     14
-rw-r--r--  sys/amd64/vmm/vmm.c           77
-rw-r--r--  sys/amd64/vmm/vmm_dev.c       16
-rw-r--r--  sys/amd64/vmm/vmm_lapic.c     84
-rw-r--r--  sys/amd64/vmm/vmm_lapic.h     16
-rw-r--r--  sys/amd64/vmm/vmm_msr.c        8
-rw-r--r--  sys/amd64/vmm/vmm_msr.h        5
15 files changed, 880 insertions, 313 deletions
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index f7acb62..d6f1a5a 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -158,7 +158,7 @@ vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu)
}
void *vcpu_stats(struct vm *vm, int vcpu);
-void vm_interrupt_hostcpu(struct vm *vm, int vcpu);
+void vcpu_notify_event(struct vm *vm, int vcpuid);
struct vmspace *vm_get_vmspace(struct vm *vm);
int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func);
int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func);
@@ -266,6 +266,7 @@ enum vm_exitcode {
VM_EXITCODE_PAGING,
VM_EXITCODE_INST_EMUL,
VM_EXITCODE_SPINUP_AP,
+ VM_EXITCODE_SPINDOWN_CPU,
VM_EXITCODE_MAX
};
@@ -310,6 +311,9 @@ struct vm_exit {
int vcpu;
uint64_t rip;
} spinup_ap;
+ struct {
+ uint64_t rflags;
+ } hlt;
} u;
};
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h
index 19a5b02..454c411 100644
--- a/sys/amd64/include/vmm_dev.h
+++ b/sys/amd64/include/vmm_dev.h
@@ -66,6 +66,11 @@ struct vm_event {
int error_code_valid;
};
+struct vm_lapic_msi {
+ uint64_t msg;
+ uint64_t addr;
+};
+
struct vm_lapic_irq {
int cpuid;
int vector;
@@ -103,8 +108,8 @@ struct vm_pptdev_msi {
int slot;
int func;
int numvec; /* 0 means disabled */
- int vector;
- int destcpu;
+ uint64_t msg;
+ uint64_t addr;
};
struct vm_pptdev_msix {
@@ -113,7 +118,7 @@ struct vm_pptdev_msix {
int slot;
int func;
int idx;
- uint32_t msg;
+ uint64_t msg;
uint32_t vector_control;
uint64_t addr;
};
@@ -175,6 +180,8 @@ enum {
IOCNUM_IOAPIC_ASSERT_IRQ = 33,
IOCNUM_IOAPIC_DEASSERT_IRQ = 34,
IOCNUM_IOAPIC_PULSE_IRQ = 35,
+ IOCNUM_LAPIC_MSI = 36,
+ IOCNUM_LAPIC_LOCAL_IRQ = 37,
/* PCI pass-thru */
IOCNUM_BIND_PPTDEV = 40,
@@ -211,6 +218,10 @@ enum {
_IOW('v', IOCNUM_INJECT_EVENT, struct vm_event)
#define VM_LAPIC_IRQ \
_IOW('v', IOCNUM_LAPIC_IRQ, struct vm_lapic_irq)
+#define VM_LAPIC_LOCAL_IRQ \
+ _IOW('v', IOCNUM_LAPIC_LOCAL_IRQ, struct vm_lapic_irq)
+#define VM_LAPIC_MSI \
+ _IOW('v', IOCNUM_LAPIC_MSI, struct vm_lapic_msi)
#define VM_IOAPIC_ASSERT_IRQ \
_IOW('v', IOCNUM_IOAPIC_ASSERT_IRQ, struct vm_ioapic_irq)
#define VM_IOAPIC_DEASSERT_IRQ \
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
index 9071f3e..10e83ea 100644
--- a/sys/amd64/vmm/intel/vmx.c
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -1359,7 +1359,8 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
struct vmcs *vmcs;
struct vmxctx *vmxctx;
uint32_t eax, ecx, edx, idtvec_info, idtvec_err, reason;
- uint64_t qual, gpa;
+ uint64_t qual, gpa, rflags;
+ bool retu;
handled = 0;
vmcs = &vmx->vmcs[vcpu];
@@ -1405,31 +1406,46 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
break;
case EXIT_REASON_RDMSR:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1);
+ retu = false;
ecx = vmxctx->guest_rcx;
- error = emulate_rdmsr(vmx->vm, vcpu, ecx);
+ error = emulate_rdmsr(vmx->vm, vcpu, ecx, &retu);
if (error) {
vmexit->exitcode = VM_EXITCODE_RDMSR;
vmexit->u.msr.code = ecx;
- } else
+ } else if (!retu) {
handled = 1;
+ } else {
+ /* Return to userspace with a valid exitcode */
+ KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
+ ("emulate_wrmsr retu with bogus exitcode"));
+ }
break;
case EXIT_REASON_WRMSR:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1);
+ retu = false;
eax = vmxctx->guest_rax;
ecx = vmxctx->guest_rcx;
edx = vmxctx->guest_rdx;
error = emulate_wrmsr(vmx->vm, vcpu, ecx,
- (uint64_t)edx << 32 | eax);
+ (uint64_t)edx << 32 | eax, &retu);
if (error) {
vmexit->exitcode = VM_EXITCODE_WRMSR;
vmexit->u.msr.code = ecx;
vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
- } else
+ } else if (!retu) {
handled = 1;
+ } else {
+ /* Return to userspace with a valid exitcode */
+ KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
+ ("emulate_wrmsr retu with bogus exitcode"));
+ }
break;
case EXIT_REASON_HLT:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
+ if ((error = vmread(VMCS_GUEST_RFLAGS, &rflags)) != 0)
+ panic("vmx_exit_process: vmread(rflags) %d", error);
vmexit->exitcode = VM_EXITCODE_HLT;
+ vmexit->u.hlt.rflags = rflags;
break;
case EXIT_REASON_MTF:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
@@ -1584,7 +1600,6 @@ vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap)
panic("vmx_run: error %d setting up pcpu defaults", error);
do {
- lapic_timer_tick(vmx->vm, vcpu);
vmx_inject_interrupts(vmx, vcpu);
vmx_run_trace(vmx, vcpu);
rc = vmx_setjmp(vmxctx);
diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c
index fce4bbd..32d59a0 100644
--- a/sys/amd64/vmm/io/ppt.c
+++ b/sys/amd64/vmm/io/ppt.c
@@ -72,8 +72,8 @@ MALLOC_DEFINE(M_PPTMSIX, "pptmsix", "Passthru MSI-X resources");
struct pptintr_arg { /* pptintr(pptintr_arg) */
struct pptdev *pptdev;
- int vec;
- int vcpu;
+ uint64_t addr;
+ uint64_t msg_data;
};
static struct pptdev {
@@ -412,16 +412,14 @@ ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
static int
pptintr(void *arg)
{
- int vec;
struct pptdev *ppt;
struct pptintr_arg *pptarg;
pptarg = arg;
ppt = pptarg->pptdev;
- vec = pptarg->vec;
if (ppt->vm != NULL)
- lapic_intr_edge(ppt->vm, pptarg->vcpu, vec);
+ lapic_intr_msi(ppt->vm, pptarg->addr, pptarg->msg_data);
else {
/*
* XXX
@@ -441,15 +439,13 @@ pptintr(void *arg)
int
ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
- int destcpu, int vector, int numvec)
+ uint64_t addr, uint64_t msg, int numvec)
{
int i, rid, flags;
int msi_count, startrid, error, tmp;
struct pptdev *ppt;
- if ((destcpu >= VM_MAXCPU || destcpu < 0) ||
- (vector < 0 || vector > 255) ||
- (numvec < 0 || numvec > MAX_MSIMSGS))
+ if (numvec < 0 || numvec > MAX_MSIMSGS)
return (EINVAL);
ppt = ppt_find(bus, slot, func);
@@ -513,8 +509,8 @@ ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
break;
ppt->msi.arg[i].pptdev = ppt;
- ppt->msi.arg[i].vec = vector + i;
- ppt->msi.arg[i].vcpu = destcpu;
+ ppt->msi.arg[i].addr = addr;
+ ppt->msi.arg[i].msg_data = msg + i;
error = bus_setup_intr(ppt->dev, ppt->msi.res[i],
INTR_TYPE_NET | INTR_MPSAFE,
@@ -534,7 +530,7 @@ ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
int
ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
- int idx, uint32_t msg, uint32_t vector_control, uint64_t addr)
+ int idx, uint64_t addr, uint64_t msg, uint32_t vector_control)
{
struct pptdev *ppt;
struct pci_devinfo *dinfo;
@@ -605,8 +601,8 @@ ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
return (ENXIO);
ppt->msix.arg[idx].pptdev = ppt;
- ppt->msix.arg[idx].vec = msg & 0xFF;
- ppt->msix.arg[idx].vcpu = (addr >> 12) & 0xFF;
+ ppt->msix.arg[idx].addr = addr;
+ ppt->msix.arg[idx].msg_data = msg;
/* Setup the MSI-X interrupt */
error = bus_setup_intr(ppt->dev, ppt->msix.res[idx],
diff --git a/sys/amd64/vmm/io/ppt.h b/sys/amd64/vmm/io/ppt.h
index 7670bc4..45ba323 100644
--- a/sys/amd64/vmm/io/ppt.h
+++ b/sys/amd64/vmm/io/ppt.h
@@ -33,9 +33,9 @@ int ppt_unassign_all(struct vm *vm);
int ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
- int destcpu, int vector, int numvec);
+ uint64_t addr, uint64_t msg, int numvec);
int ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
- int idx, uint32_t msg, uint32_t vector_control, uint64_t addr);
+ int idx, uint64_t addr, uint64_t msg, uint32_t vector_control);
int ppt_num_devices(struct vm *vm);
boolean_t ppt_is_mmio(struct vm *vm, vm_paddr_t gpa);
diff --git a/sys/amd64/vmm/io/vhpet.c b/sys/amd64/vmm/io/vhpet.c
index 112480ee..929b343 100644
--- a/sys/amd64/vmm/io/vhpet.c
+++ b/sys/amd64/vmm/io/vhpet.c
@@ -240,8 +240,7 @@ vhpet_timer_edge_trig(struct vhpet *vhpet, int n)
static void
vhpet_timer_interrupt(struct vhpet *vhpet, int n)
{
- int apicid, vector, vcpuid, pin;
- cpuset_t dmask;
+ int pin;
/* If interrupts are not enabled for this timer then just return. */
if (!vhpet_timer_interrupt_enabled(vhpet, n))
@@ -256,26 +255,8 @@ vhpet_timer_interrupt(struct vhpet *vhpet, int n)
}
if (vhpet_timer_msi_enabled(vhpet, n)) {
- /*
- * XXX should have an API 'vlapic_deliver_msi(vm, addr, data)'
- * - assuming physical delivery mode
- * - no need to interpret contents of 'msireg' here
- */
- vector = vhpet->timer[n].msireg & 0xff;
- apicid = (vhpet->timer[n].msireg >> (32 + 12)) & 0xff;
- if (apicid != 0xff) {
- /* unicast */
- vcpuid = vm_apicid2vcpuid(vhpet->vm, apicid);
- lapic_intr_edge(vhpet->vm, vcpuid, vector);
- } else {
- /* broadcast */
- dmask = vm_active_cpus(vhpet->vm);
- while ((vcpuid = CPU_FFS(&dmask)) != 0) {
- vcpuid--;
- CPU_CLR(vcpuid, &dmask);
- lapic_intr_edge(vhpet->vm, vcpuid, vector);
- }
- }
+ lapic_intr_msi(vhpet->vm, vhpet->timer[n].msireg >> 32,
+ vhpet->timer[n].msireg & 0xffffffff);
return;
}
diff --git a/sys/amd64/vmm/io/vioapic.c b/sys/amd64/vmm/io/vioapic.c
index 167e8ab..151065a 100644
--- a/sys/amd64/vmm/io/vioapic.c
+++ b/sys/amd64/vmm/io/vioapic.c
@@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$");
#include "vmm_ktr.h"
#include "vmm_lapic.h"
+#include "vlapic.h"
#include "vioapic.h"
#define IOREGSEL 0x00
@@ -91,25 +92,14 @@ pinstate_str(bool asserted)
else
return ("deasserted");
}
-
-static const char *
-trigger_str(bool level)
-{
-
- if (level)
- return ("level");
- else
- return ("edge");
-}
#endif
static void
vioapic_send_intr(struct vioapic *vioapic, int pin)
{
- int vector, apicid, vcpuid;
- uint32_t low, high;
- cpuset_t dmask;
- bool level;
+ int vector, delmode;
+ uint32_t low, high, dest;
+ bool level, phys;
KASSERT(pin >= 0 && pin < REDIR_ENTRIES,
("vioapic_set_pinstate: invalid pin number %d", pin));
@@ -120,52 +110,20 @@ vioapic_send_intr(struct vioapic *vioapic, int pin)
low = vioapic->rtbl[pin].reg;
high = vioapic->rtbl[pin].reg >> 32;
- /*
- * XXX We only deal with:
- * - physical destination
- * - fixed delivery mode
- */
- if ((low & IOART_DESTMOD) != IOART_DESTPHY) {
- VIOAPIC_CTR2(vioapic, "ioapic pin%d: unsupported dest mode "
- "0x%08x", pin, low);
- return;
- }
-
- if ((low & IOART_DELMOD) != IOART_DELFIXED) {
- VIOAPIC_CTR2(vioapic, "ioapic pin%d: unsupported delivery mode "
- "0x%08x", pin, low);
- return;
- }
-
if ((low & IOART_INTMASK) == IOART_INTMSET) {
VIOAPIC_CTR1(vioapic, "ioapic pin%d: masked", pin);
return;
}
+ phys = ((low & IOART_DESTMOD) == IOART_DESTPHY);
+ delmode = low & IOART_DELMOD;
level = low & IOART_TRGRLVL ? true : false;
if (level)
vioapic->rtbl[pin].reg |= IOART_REM_IRR;
vector = low & IOART_INTVEC;
- apicid = high >> APIC_ID_SHIFT;
- if (apicid != 0xff) {
- /* unicast */
- vcpuid = vm_apicid2vcpuid(vioapic->vm, apicid);
- VIOAPIC_CTR4(vioapic, "ioapic pin%d: %s triggered intr "
- "vector %d on vcpuid %d", pin, trigger_str(level),
- vector, vcpuid);
- lapic_set_intr(vioapic->vm, vcpuid, vector, level);
- } else {
- /* broadcast */
- VIOAPIC_CTR3(vioapic, "ioapic pin%d: %s triggered intr "
- "vector %d on all vcpus", pin, trigger_str(level), vector);
- dmask = vm_active_cpus(vioapic->vm);
- while ((vcpuid = CPU_FFS(&dmask)) != 0) {
- vcpuid--;
- CPU_CLR(vcpuid, &dmask);
- lapic_set_intr(vioapic->vm, vcpuid, vector, level);
- }
- }
+ dest = high >> APIC_ID_SHIFT;
+ vlapic_deliver_intr(vioapic->vm, level, dest, phys, delmode, vector);
}
static void
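To make the new vioapic_send_intr() delivery path concrete, here is a worked decode of one redirection-table entry; the entry value is invented for illustration:

/*
 * Hypothetical RTE: low = 0x00008841, high = 0x05000000.
 *
 *   (low & IOART_INTMASK) != IOART_INTMSET -> pin is not masked
 *   phys    = ((low & IOART_DESTMOD) == IOART_DESTPHY) -> bit 11 set: false
 *   delmode = low & IOART_DELMOD           -> 0x000 (fixed delivery)
 *   level   = (low & IOART_TRGRLVL) != 0   -> true, so IOART_REM_IRR is latched
 *   vector  = low & IOART_INTVEC           -> 0x41
 *   dest    = high >> APIC_ID_SHIFT        -> 0x05
 *
 * vlapic_deliver_intr(vm, true, 0x05, false, APIC_DELMODE_FIXED, 0x41)
 * then fans the interrupt out to every vlapic whose logical ID matches
 * MDA 0x05 (see vlapic_calcdest() in the vlapic.c hunk below).
 */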
diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c
index 6e5b5ea..695040d 100644
--- a/sys/amd64/vmm/io/vlapic.c
+++ b/sys/amd64/vmm/io/vlapic.c
@@ -30,8 +30,10 @@
__FBSDID("$FreeBSD$");
#include <sys/param.h>
+#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
+#include <sys/mutex.h>
#include <sys/systm.h>
#include <sys/smp.h>
@@ -53,6 +55,9 @@ __FBSDID("$FreeBSD$");
#define VLAPIC_CTR1(vlapic, format, p1) \
VCPU_CTR1((vlapic)->vm, (vlapic)->vcpuid, format, p1)
+#define VLAPIC_CTR2(vlapic, format, p1, p2) \
+ VCPU_CTR2((vlapic)->vm, (vlapic)->vcpuid, format, p1, p2)
+
#define VLAPIC_CTR_IRR(vlapic, msg) \
do { \
uint32_t *irrptr = &(vlapic)->apic.irr0; \
@@ -86,7 +91,7 @@ static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");
#define PRIO(x) ((x) >> 4)
#define VLAPIC_VERSION (16)
-#define VLAPIC_MAXLVT_ENTRIES (5)
+#define VLAPIC_MAXLVT_ENTRIES (APIC_LVT_CMCI)
#define x2apic(vlapic) (((vlapic)->msr_apicbase & APICBASE_X2APIC) ? 1 : 0)
@@ -100,12 +105,16 @@ struct vlapic {
struct vm *vm;
int vcpuid;
- struct LAPIC apic;
+ struct LAPIC apic;
- int esr_update;
+ uint32_t esr_pending;
+ int esr_firing;
- int divisor;
- int ccr_ticks;
+ struct callout callout; /* vlapic timer */
+ struct bintime timer_fire_bt; /* callout expiry time */
+ struct bintime timer_freq_bt; /* timer frequency */
+ struct bintime timer_period_bt; /* timer period */
+ struct mtx timer_mtx;
/*
* The 'isrvec_stk' is a stack of vectors injected by the local apic.
@@ -120,8 +129,101 @@ struct vlapic {
enum boot_state boot_state;
};
+/*
+ * The 'vlapic->timer_mtx' is used to provide mutual exclusion between the
+ * vlapic_callout_handler() and vcpu accesses to the following registers:
+ * - initial count register aka icr_timer
+ * - current count register aka ccr_timer
+ * - divide config register aka dcr_timer
+ * - timer LVT register
+ *
+ * Note that the vlapic_callout_handler() does not write to any of these
+ * registers so they can be safely read from the vcpu context without locking.
+ */
+#define VLAPIC_TIMER_LOCK(vlapic) mtx_lock_spin(&((vlapic)->timer_mtx))
+#define VLAPIC_TIMER_UNLOCK(vlapic) mtx_unlock_spin(&((vlapic)->timer_mtx))
+#define VLAPIC_TIMER_LOCKED(vlapic) mtx_owned(&((vlapic)->timer_mtx))
+
#define VLAPIC_BUS_FREQ tsc_freq
+static __inline uint32_t
+vlapic_get_id(struct vlapic *vlapic)
+{
+
+ if (x2apic(vlapic))
+ return (vlapic->vcpuid);
+ else
+ return (vlapic->vcpuid << 24);
+}
+
+static __inline uint32_t
+vlapic_get_ldr(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic;
+ int apicid;
+ uint32_t ldr;
+
+ lapic = &vlapic->apic;
+ if (x2apic(vlapic)) {
+ apicid = vlapic_get_id(vlapic);
+ ldr = 1 << (apicid & 0xf);
+ ldr |= (apicid & 0xffff0) << 12;
+ return (ldr);
+ } else
+ return (lapic->ldr);
+}
+
+static __inline uint32_t
+vlapic_get_dfr(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic;
+
+ lapic = &vlapic->apic;
+ if (x2apic(vlapic))
+ return (0);
+ else
+ return (lapic->dfr);
+}
+
+static void
+vlapic_set_dfr(struct vlapic *vlapic, uint32_t data)
+{
+ uint32_t dfr;
+ struct LAPIC *lapic;
+
+ if (x2apic(vlapic)) {
+ VM_CTR1(vlapic->vm, "write to DFR in x2apic mode: %#x", data);
+ return;
+ }
+
+ lapic = &vlapic->apic;
+ dfr = (lapic->dfr & APIC_DFR_RESERVED) | (data & APIC_DFR_MODEL_MASK);
+ if ((dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_FLAT)
+ VLAPIC_CTR0(vlapic, "vlapic DFR in Flat Model");
+ else if ((dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_CLUSTER)
+ VLAPIC_CTR0(vlapic, "vlapic DFR in Cluster Model");
+ else
+ VLAPIC_CTR1(vlapic, "vlapic DFR in Unknown Model %#x", dfr);
+
+ lapic->dfr = dfr;
+}
+
+static void
+vlapic_set_ldr(struct vlapic *vlapic, uint32_t data)
+{
+ struct LAPIC *lapic;
+
+ /* LDR is read-only in x2apic mode */
+ if (x2apic(vlapic)) {
+ VLAPIC_CTR1(vlapic, "write to LDR in x2apic mode: %#x", data);
+ return;
+ }
+
+ lapic = &vlapic->apic;
+ lapic->ldr = data & ~APIC_LDR_RESERVED;
+ VLAPIC_CTR1(vlapic, "vlapic LDR set to %#x", lapic->ldr);
+}
+
static int
vlapic_timer_divisor(uint32_t dcr)
{
@@ -167,48 +269,92 @@ vlapic_dump_lvt(uint32_t offset, uint32_t *lvt)
}
#endif
-static uint64_t
+static uint32_t
vlapic_get_ccr(struct vlapic *vlapic)
{
- struct LAPIC *lapic = &vlapic->apic;
- return lapic->ccr_timer;
+ struct bintime bt_now, bt_rem;
+ struct LAPIC *lapic;
+ uint32_t ccr;
+
+ ccr = 0;
+ lapic = &vlapic->apic;
+
+ VLAPIC_TIMER_LOCK(vlapic);
+ if (callout_active(&vlapic->callout)) {
+ /*
+ * If the timer is scheduled to expire in the future then
+ * compute the value of 'ccr' based on the remaining time.
+ */
+ binuptime(&bt_now);
+ if (bintime_cmp(&vlapic->timer_fire_bt, &bt_now, >)) {
+ bt_rem = vlapic->timer_fire_bt;
+ bintime_sub(&bt_rem, &bt_now);
+ ccr += bt_rem.sec * BT2FREQ(&vlapic->timer_freq_bt);
+ ccr += bt_rem.frac / vlapic->timer_freq_bt.frac;
+ }
+ }
+ KASSERT(ccr <= lapic->icr_timer, ("vlapic_get_ccr: invalid ccr %#x, "
+ "icr_timer is %#x", ccr, lapic->icr_timer));
+ VLAPIC_CTR2(vlapic, "vlapic ccr_timer = %#x, icr_timer = %#x",
+ ccr, lapic->icr_timer);
+ VLAPIC_TIMER_UNLOCK(vlapic);
+ return (ccr);
}
static void
-vlapic_update_errors(struct vlapic *vlapic)
+vlapic_set_dcr(struct vlapic *vlapic, uint32_t dcr)
{
- struct LAPIC *lapic = &vlapic->apic;
- lapic->esr = 0; // XXX
+ struct LAPIC *lapic;
+ int divisor;
+
+ lapic = &vlapic->apic;
+ VLAPIC_TIMER_LOCK(vlapic);
+
+ lapic->dcr_timer = dcr;
+ divisor = vlapic_timer_divisor(dcr);
+ VLAPIC_CTR2(vlapic, "vlapic dcr_timer=%#x, divisor=%d", dcr, divisor);
+
+ /*
+ * Update the timer frequency and the timer period.
+ *
+ * XXX changes to the frequency divider will not take effect until
+ * the timer is reloaded.
+ */
+ FREQ2BT(VLAPIC_BUS_FREQ / divisor, &vlapic->timer_freq_bt);
+ vlapic->timer_period_bt = vlapic->timer_freq_bt;
+ bintime_mul(&vlapic->timer_period_bt, lapic->icr_timer);
+
+ VLAPIC_TIMER_UNLOCK(vlapic);
}
static void
-vlapic_init_ipi(struct vlapic *vlapic)
+vlapic_update_errors(struct vlapic *vlapic)
{
struct LAPIC *lapic = &vlapic->apic;
- lapic->version = VLAPIC_VERSION;
- lapic->version |= (VLAPIC_MAXLVT_ENTRIES < MAXLVTSHIFT);
- lapic->dfr = 0xffffffff;
- lapic->svr = APIC_SVR_VECTOR;
- vlapic_mask_lvts(&lapic->lvt_timer, VLAPIC_MAXLVT_ENTRIES+1);
+ lapic->esr = vlapic->esr_pending;
+ vlapic->esr_pending = 0;
}
-static int
+static void
vlapic_reset(struct vlapic *vlapic)
{
- struct LAPIC *lapic = &vlapic->apic;
+ struct LAPIC *lapic;
+
+ lapic = &vlapic->apic;
+ bzero(lapic, sizeof(struct LAPIC));
- memset(lapic, 0, sizeof(*lapic));
- lapic->apr = vlapic->vcpuid;
- vlapic_init_ipi(vlapic);
- vlapic->divisor = vlapic_timer_divisor(lapic->dcr_timer);
+ lapic->version = VLAPIC_VERSION;
+ lapic->version |= (VLAPIC_MAXLVT_ENTRIES << MAXLVTSHIFT);
+ lapic->dfr = 0xffffffff;
+ lapic->svr = APIC_SVR_VECTOR;
+ vlapic_mask_lvts(&lapic->lvt_timer, 6);
+ vlapic_mask_lvts(&lapic->lvt_cmci, 1);
+ vlapic_set_dcr(vlapic, 0);
if (vlapic->vcpuid == 0)
vlapic->boot_state = BS_RUNNING; /* BSP */
else
vlapic->boot_state = BS_INIT; /* AP */
-
- return 0;
-
}
void
@@ -221,6 +367,17 @@ vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
if (vector < 0 || vector >= 256)
panic("vlapic_set_intr_ready: invalid vector %d\n", vector);
+ if (!(lapic->svr & APIC_SVR_ENABLE)) {
+ VLAPIC_CTR1(vlapic, "vlapic is software disabled, ignoring "
+ "interrupt %d", vector);
+ return;
+ }
+
+ if (vector < 16) {
+ vlapic_set_error(vlapic, APIC_ESR_RECEIVE_ILLEGAL_VECTOR);
+ return;
+ }
+
idx = (vector / 32) * 4;
mask = 1 << (vector % 32);
@@ -241,39 +398,93 @@ vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready");
}
-static void
-vlapic_start_timer(struct vlapic *vlapic, uint32_t elapsed)
+static __inline uint32_t *
+vlapic_get_lvtptr(struct vlapic *vlapic, uint32_t offset)
{
- uint32_t icr_timer;
-
- icr_timer = vlapic->apic.icr_timer;
+ struct LAPIC *lapic = &vlapic->apic;
+ int i;
- vlapic->ccr_ticks = ticks;
- if (elapsed < icr_timer)
- vlapic->apic.ccr_timer = icr_timer - elapsed;
- else {
- /*
- * This can happen when the guest is trying to run its local
- * apic timer higher that the setting of 'hz' in the host.
- *
- * We deal with this by running the guest local apic timer
- * at the rate of the host's 'hz' setting.
- */
- vlapic->apic.ccr_timer = 0;
+ switch (offset) {
+ case APIC_OFFSET_CMCI_LVT:
+ return (&lapic->lvt_cmci);
+ case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
+ i = (offset - APIC_OFFSET_TIMER_LVT) >> 2;
+ return ((&lapic->lvt_timer) + i);;
+ default:
+ panic("vlapic_get_lvt: invalid LVT\n");
}
}
-static __inline uint32_t *
+static __inline uint32_t
vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset)
{
- struct LAPIC *lapic = &vlapic->apic;
- int i;
- if (offset < APIC_OFFSET_TIMER_LVT || offset > APIC_OFFSET_ERROR_LVT) {
- panic("vlapic_get_lvt: invalid LVT\n");
+ return (*vlapic_get_lvtptr(vlapic, offset));
+}
+
+static void
+vlapic_set_lvt(struct vlapic *vlapic, uint32_t offset, uint32_t val)
+{
+ uint32_t *lvtptr, mask;
+ struct LAPIC *lapic;
+
+ lapic = &vlapic->apic;
+ lvtptr = vlapic_get_lvtptr(vlapic, offset);
+
+ if (offset == APIC_OFFSET_TIMER_LVT)
+ VLAPIC_TIMER_LOCK(vlapic);
+
+ if (!(lapic->svr & APIC_SVR_ENABLE))
+ val |= APIC_LVT_M;
+ mask = APIC_LVT_M | APIC_LVT_DS | APIC_LVT_VECTOR;
+ switch (offset) {
+ case APIC_OFFSET_TIMER_LVT:
+ mask |= APIC_LVTT_TM;
+ break;
+ case APIC_OFFSET_ERROR_LVT:
+ break;
+ case APIC_OFFSET_LINT0_LVT:
+ case APIC_OFFSET_LINT1_LVT:
+ mask |= APIC_LVT_TM | APIC_LVT_RIRR | APIC_LVT_IIPP;
+ /* FALLTHROUGH */
+ default:
+ mask |= APIC_LVT_DM;
+ break;
}
- i = (offset - APIC_OFFSET_TIMER_LVT) >> 2;
- return ((&lapic->lvt_timer) + i);;
+ *lvtptr = val & mask;
+
+ if (offset == APIC_OFFSET_TIMER_LVT)
+ VLAPIC_TIMER_UNLOCK(vlapic);
+}
+
+static int
+vlapic_fire_lvt(struct vlapic *vlapic, uint32_t lvt)
+{
+ uint32_t vec, mode;
+
+ if (lvt & APIC_LVT_M)
+ return (0);
+
+ vec = lvt & APIC_LVT_VECTOR;
+ mode = lvt & APIC_LVT_DM;
+
+ switch (mode) {
+ case APIC_LVT_DM_FIXED:
+ if (vec < 16) {
+ vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR);
+ return (0);
+ }
+ vlapic_set_intr_ready(vlapic, vec, false);
+ vcpu_notify_event(vlapic->vm, vlapic->vcpuid);
+ break;
+ case APIC_LVT_DM_NMI:
+ vm_inject_nmi(vlapic->vm, vlapic->vcpuid);
+ break;
+ default:
+ // Other modes ignored
+ return (0);
+ }
+ return (1);
}
#if 1
@@ -398,44 +609,314 @@ vlapic_process_eoi(struct vlapic *vlapic)
}
static __inline int
-vlapic_get_lvt_field(uint32_t *lvt, uint32_t mask)
+vlapic_get_lvt_field(uint32_t lvt, uint32_t mask)
{
- return (*lvt & mask);
+
+ return (lvt & mask);
}
static __inline int
vlapic_periodic_timer(struct vlapic *vlapic)
{
- uint32_t *lvt;
+ uint32_t lvt;
lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC));
}
+static VMM_STAT(VLAPIC_INTR_ERROR, "error interrupts generated by vlapic");
+
+void
+vlapic_set_error(struct vlapic *vlapic, uint32_t mask)
+{
+ uint32_t lvt;
+
+ vlapic->esr_pending |= mask;
+ if (vlapic->esr_firing)
+ return;
+ vlapic->esr_firing = 1;
+
+ // The error LVT always uses the fixed delivery mode.
+ lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_ERROR_LVT);
+ if (vlapic_fire_lvt(vlapic, lvt | APIC_LVT_DM_FIXED)) {
+ vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_ERROR, 1);
+ }
+ vlapic->esr_firing = 0;
+}
+
static VMM_STAT(VLAPIC_INTR_TIMER, "timer interrupts generated by vlapic");
static void
vlapic_fire_timer(struct vlapic *vlapic)
{
- int vector;
- uint32_t *lvt;
+ uint32_t lvt;
+
+ KASSERT(VLAPIC_TIMER_LOCKED(vlapic), ("vlapic_fire_timer not locked"));
+ // The timer LVT always uses the fixed delivery mode.
lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
-
- if (!vlapic_get_lvt_field(lvt, APIC_LVTT_M)) {
+ if (vlapic_fire_lvt(vlapic, lvt | APIC_LVT_DM_FIXED)) {
vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_TIMER, 1);
- vector = vlapic_get_lvt_field(lvt,APIC_LVTT_VECTOR);
- vlapic_set_intr_ready(vlapic, vector, false);
+ }
+}
+
+static VMM_STAT(VLAPIC_INTR_CMC,
+ "corrected machine check interrupts generated by vlapic");
+
+void
+vlapic_fire_cmci(struct vlapic *vlapic)
+{
+ uint32_t lvt;
+
+ lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_CMCI_LVT);
+ if (vlapic_fire_lvt(vlapic, lvt)) {
+ vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_CMC, 1);
+ }
+}
+
+static VMM_STAT_ARRAY(LVTS_TRIGGERRED, VLAPIC_MAXLVT_ENTRIES,
+ "lvts triggered");
+
+int
+vlapic_trigger_lvt(struct vlapic *vlapic, int vector)
+{
+ uint32_t lvt;
+
+ switch (vector) {
+ case APIC_LVT_LINT0:
+ lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_LINT0_LVT);
+ break;
+ case APIC_LVT_LINT1:
+ lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_LINT1_LVT);
+ break;
+ case APIC_LVT_TIMER:
+ lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
+ lvt |= APIC_LVT_DM_FIXED;
+ break;
+ case APIC_LVT_ERROR:
+ lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_ERROR_LVT);
+ lvt |= APIC_LVT_DM_FIXED;
+ break;
+ case APIC_LVT_PMC:
+ lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_PERF_LVT);
+ break;
+ case APIC_LVT_THERMAL:
+ lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_THERM_LVT);
+ break;
+ case APIC_LVT_CMCI:
+ lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_CMCI_LVT);
+ break;
+ default:
+ return (EINVAL);
+ }
+ if (vlapic_fire_lvt(vlapic, lvt)) {
+ vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid,
+ LVTS_TRIGGERRED, vector, 1);
+ }
+ return (0);
+}
+
+static void
+vlapic_callout_handler(void *arg)
+{
+ struct vlapic *vlapic;
+ struct bintime bt, btnow;
+ sbintime_t rem_sbt;
+
+ vlapic = arg;
+
+ VLAPIC_TIMER_LOCK(vlapic);
+ if (callout_pending(&vlapic->callout)) /* callout was reset */
+ goto done;
+
+ if (!callout_active(&vlapic->callout)) /* callout was stopped */
+ goto done;
+
+ callout_deactivate(&vlapic->callout);
+
+ KASSERT(vlapic->apic.icr_timer != 0, ("vlapic timer is disabled"));
+
+ vlapic_fire_timer(vlapic);
+
+ if (vlapic_periodic_timer(vlapic)) {
+ binuptime(&btnow);
+ KASSERT(bintime_cmp(&btnow, &vlapic->timer_fire_bt, >=),
+ ("vlapic callout at %#lx.%#lx, expected at %#lx.#%lx",
+ btnow.sec, btnow.frac, vlapic->timer_fire_bt.sec,
+ vlapic->timer_fire_bt.frac));
+
+ /*
+ * Compute the delta between when the timer was supposed to
+ * fire and the present time.
+ */
+ bt = btnow;
+ bintime_sub(&bt, &vlapic->timer_fire_bt);
+
+ rem_sbt = bttosbt(vlapic->timer_period_bt);
+ if (bintime_cmp(&bt, &vlapic->timer_period_bt, <)) {
+ /*
+ * Adjust the time until the next countdown downward
+ * to account for the lost time.
+ */
+ rem_sbt -= bttosbt(bt);
+ } else {
+ /*
+ * If the delta is greater than the timer period then
+ * just reset our time base instead of trying to catch
+ * up.
+ */
+ vlapic->timer_fire_bt = btnow;
+ VLAPIC_CTR2(vlapic, "vlapic timer lagging by %lu "
+ "usecs, period is %lu usecs - resetting time base",
+ bttosbt(bt) / SBT_1US,
+ bttosbt(vlapic->timer_period_bt) / SBT_1US);
+ }
+
+ bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt);
+ callout_reset_sbt(&vlapic->callout, rem_sbt, 0,
+ vlapic_callout_handler, vlapic, 0);
+ }
+done:
+ VLAPIC_TIMER_UNLOCK(vlapic);
+}
+
+static void
+vlapic_set_icr_timer(struct vlapic *vlapic, uint32_t icr_timer)
+{
+ struct LAPIC *lapic;
+ sbintime_t sbt;
+
+ VLAPIC_TIMER_LOCK(vlapic);
+
+ lapic = &vlapic->apic;
+ lapic->icr_timer = icr_timer;
+
+ vlapic->timer_period_bt = vlapic->timer_freq_bt;
+ bintime_mul(&vlapic->timer_period_bt, icr_timer);
+
+ if (icr_timer != 0) {
+ binuptime(&vlapic->timer_fire_bt);
+ bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt);
+
+ sbt = bttosbt(vlapic->timer_period_bt);
+ callout_reset_sbt(&vlapic->callout, sbt, 0,
+ vlapic_callout_handler, vlapic, 0);
+ } else
+ callout_stop(&vlapic->callout);
+
+ VLAPIC_TIMER_UNLOCK(vlapic);
+}
+
+/*
+ * This function populates 'dmask' with the set of vcpus that match the
+ * addressing specified by the (dest, phys, lowprio) tuple.
+ *
+ * 'x2apic_dest' specifies whether 'dest' is interpreted as x2APIC (32-bit)
+ * or xAPIC (8-bit) destination field.
+ */
+static void
+vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys,
+ bool lowprio, bool x2apic_dest)
+{
+ struct vlapic *vlapic;
+ uint32_t dfr, ldr, ldest, cluster;
+ uint32_t mda_flat_ldest, mda_cluster_ldest, mda_ldest, mda_cluster_id;
+ cpuset_t amask;
+ int vcpuid;
+
+ if ((x2apic_dest && dest == 0xffffffff) ||
+ (!x2apic_dest && dest == 0xff)) {
+ /*
+ * Broadcast in both logical and physical modes.
+ */
+ *dmask = vm_active_cpus(vm);
+ return;
+ }
+
+ if (phys) {
+ /*
+ * Physical mode: destination is APIC ID.
+ */
+ CPU_ZERO(dmask);
+ vcpuid = vm_apicid2vcpuid(vm, dest);
+ if (vcpuid < VM_MAXCPU)
+ CPU_SET(vcpuid, dmask);
+ } else {
+ /*
+ * In the "Flat Model" the MDA is interpreted as an 8-bit wide
+ * bitmask. This model is only avilable in the xAPIC mode.
+ */
+ mda_flat_ldest = dest & 0xff;
+
+ /*
+ * In the "Cluster Model" the MDA is used to identify a
+ * specific cluster and a set of APICs in that cluster.
+ */
+ if (x2apic_dest) {
+ mda_cluster_id = dest >> 16;
+ mda_cluster_ldest = dest & 0xffff;
+ } else {
+ mda_cluster_id = (dest >> 4) & 0xf;
+ mda_cluster_ldest = dest & 0xf;
+ }
+
+ /*
+ * Logical mode: match each APIC that has a bit set
+ * in it's LDR that matches a bit in the ldest.
+ */
+ CPU_ZERO(dmask);
+ amask = vm_active_cpus(vm);
+ while ((vcpuid = CPU_FFS(&amask)) != 0) {
+ vcpuid--;
+ CPU_CLR(vcpuid, &amask);
+
+ vlapic = vm_lapic(vm, vcpuid);
+ dfr = vlapic_get_dfr(vlapic);
+ ldr = vlapic_get_ldr(vlapic);
+
+ if ((dfr & APIC_DFR_MODEL_MASK) ==
+ APIC_DFR_MODEL_FLAT) {
+ ldest = ldr >> 24;
+ mda_ldest = mda_flat_ldest;
+ } else if ((dfr & APIC_DFR_MODEL_MASK) ==
+ APIC_DFR_MODEL_CLUSTER) {
+ if (x2apic(vlapic)) {
+ cluster = ldr >> 16;
+ ldest = ldr & 0xffff;
+ } else {
+ cluster = ldr >> 28;
+ ldest = (ldr >> 24) & 0xf;
+ }
+ if (cluster != mda_cluster_id)
+ continue;
+ mda_ldest = mda_cluster_ldest;
+ } else {
+ /*
+ * Guest has configured a bad logical
+ * model for this vcpu - skip it.
+ */
+ VLAPIC_CTR1(vlapic, "vlapic has bad logical "
+ "model %x - cannot deliver interrupt", dfr);
+ continue;
+ }
+
+ if ((mda_ldest & ldest) != 0) {
+ CPU_SET(vcpuid, dmask);
+ if (lowprio)
+ break;
+ }
+ }
}
}
static VMM_STAT_ARRAY(IPIS_SENT, VM_MAXCPU, "ipis sent to vcpu");
static int
-lapic_process_icr(struct vlapic *vlapic, uint64_t icrval)
+lapic_process_icr(struct vlapic *vlapic, uint64_t icrval, bool *retu)
{
int i;
+ bool phys;
cpuset_t dmask;
uint32_t dest, vec, mode;
struct vlapic *vlapic2;
@@ -448,10 +929,17 @@ lapic_process_icr(struct vlapic *vlapic, uint64_t icrval)
vec = icrval & APIC_VECTOR_MASK;
mode = icrval & APIC_DELMODE_MASK;
+ if (mode == APIC_DELMODE_FIXED && vec < 16) {
+ vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR);
+ return (0);
+ }
+
if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) {
switch (icrval & APIC_DEST_MASK) {
case APIC_DEST_DESTFLD:
- CPU_SETOF(dest, &dmask);
+ phys = ((icrval & APIC_DESTMODE_LOG) == 0);
+ vlapic_calcdest(vlapic->vm, &dmask, dest, phys, false,
+ x2apic(vlapic));
break;
case APIC_DEST_SELF:
CPU_SETOF(vlapic->vcpuid, &dmask);
@@ -508,17 +996,18 @@ lapic_process_icr(struct vlapic *vlapic, uint64_t icrval)
if (vlapic2->boot_state != BS_SIPI)
return (0);
- vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid);
- vmexit->exitcode = VM_EXITCODE_SPINUP_AP;
- vmexit->u.spinup_ap.vcpu = dest;
- vmexit->u.spinup_ap.rip = vec << PAGE_SHIFT;
-
/*
* XXX this assumes that the startup IPI always succeeds
*/
vlapic2->boot_state = BS_RUNNING;
vm_activate_cpu(vlapic2->vm, dest);
+ *retu = true;
+ vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid);
+ vmexit->exitcode = VM_EXITCODE_SPINUP_AP;
+ vmexit->u.spinup_ap.vcpu = dest;
+ vmexit->u.spinup_ap.rip = vec << PAGE_SHIFT;
+
return (0);
}
}
@@ -555,7 +1044,6 @@ vlapic_pending_intr(struct vlapic *vlapic)
break;
}
}
- VLAPIC_CTR0(vlapic, "no pending intr");
return (-1);
}
@@ -593,8 +1081,39 @@ vlapic_intr_accepted(struct vlapic *vlapic, int vector)
vlapic_update_ppr(vlapic);
}
+static void
+lapic_set_svr(struct vlapic *vlapic, uint32_t new)
+{
+ struct LAPIC *lapic;
+ uint32_t old, changed;
+
+ lapic = &vlapic->apic;
+ old = lapic->svr;
+ changed = old ^ new;
+ if ((changed & APIC_SVR_ENABLE) != 0) {
+ if ((new & APIC_SVR_ENABLE) == 0) {
+ /*
+ * The apic is now disabled so stop the apic timer.
+ */
+ VLAPIC_CTR0(vlapic, "vlapic is software-disabled");
+ VLAPIC_TIMER_LOCK(vlapic);
+ callout_stop(&vlapic->callout);
+ VLAPIC_TIMER_UNLOCK(vlapic);
+ } else {
+ /*
+ * The apic is now enabled so restart the apic timer
+ * if it is configured in periodic mode.
+ */
+ VLAPIC_CTR0(vlapic, "vlapic is software-enabled");
+ if (vlapic_periodic_timer(vlapic))
+ vlapic_set_icr_timer(vlapic, lapic->icr_timer);
+ }
+ }
+ lapic->svr = new;
+}
+
int
-vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data)
+vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data, bool *retu)
{
struct LAPIC *lapic = &vlapic->apic;
uint32_t *reg;
@@ -602,17 +1121,14 @@ vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data)
if (offset > sizeof(*lapic)) {
*data = 0;
- return 0;
+ goto done;
}
offset &= ~3;
switch(offset)
{
case APIC_OFFSET_ID:
- if (x2apic(vlapic))
- *data = vlapic->vcpuid;
- else
- *data = vlapic->vcpuid << 24;
+ *data = vlapic_get_id(vlapic);
break;
case APIC_OFFSET_VER:
*data = lapic->version;
@@ -630,10 +1146,10 @@ vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data)
*data = lapic->eoi;
break;
case APIC_OFFSET_LDR:
- *data = lapic->ldr;
+ *data = vlapic_get_ldr(vlapic);
break;
case APIC_OFFSET_DFR:
- *data = lapic->dfr;
+ *data = vlapic_get_dfr(vlapic);
break;
case APIC_OFFSET_SVR:
*data = lapic->svr;
@@ -662,9 +1178,9 @@ vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data)
case APIC_OFFSET_ICR_HI:
*data = lapic->icr_hi;
break;
+ case APIC_OFFSET_CMCI_LVT:
case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
- reg = vlapic_get_lvt(vlapic, offset);
- *data = *(reg);
+ *data = vlapic_get_lvt(vlapic, offset);
break;
case APIC_OFFSET_ICR:
*data = lapic->icr_timer;
@@ -680,16 +1196,19 @@ vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data)
*data = 0;
break;
}
+done:
+ VLAPIC_CTR2(vlapic, "vlapic read offset %#x, data %#lx", offset, *data);
return 0;
}
int
-vlapic_write(struct vlapic *vlapic, uint64_t offset, uint64_t data)
+vlapic_write(struct vlapic *vlapic, uint64_t offset, uint64_t data, bool *retu)
{
struct LAPIC *lapic = &vlapic->apic;
- uint32_t *reg;
int retval;
+ VLAPIC_CTR2(vlapic, "vlapic write offset %#x, data %#lx", offset, data);
+
if (offset > sizeof(*lapic)) {
return 0;
}
@@ -708,18 +1227,20 @@ vlapic_write(struct vlapic *vlapic, uint64_t offset, uint64_t data)
vlapic_process_eoi(vlapic);
break;
case APIC_OFFSET_LDR:
+ vlapic_set_ldr(vlapic, data);
break;
case APIC_OFFSET_DFR:
+ vlapic_set_dfr(vlapic, data);
break;
case APIC_OFFSET_SVR:
- lapic->svr = data;
+ lapic_set_svr(vlapic, data);
break;
case APIC_OFFSET_ICR_LOW:
if (!x2apic(vlapic)) {
data &= 0xffffffff;
data |= (uint64_t)lapic->icr_hi << 32;
}
- retval = lapic_process_icr(vlapic, data);
+ retval = lapic_process_icr(vlapic, data, retu);
break;
case APIC_OFFSET_ICR_HI:
if (!x2apic(vlapic)) {
@@ -727,22 +1248,16 @@ vlapic_write(struct vlapic *vlapic, uint64_t offset, uint64_t data)
lapic->icr_hi = data;
}
break;
+ case APIC_OFFSET_CMCI_LVT:
case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
- reg = vlapic_get_lvt(vlapic, offset);
- if (!(lapic->svr & APIC_SVR_ENABLE)) {
- data |= APIC_LVT_M;
- }
- *reg = data;
- // vlapic_dump_lvt(offset, reg);
+ vlapic_set_lvt(vlapic, offset, data);
break;
case APIC_OFFSET_ICR:
- lapic->icr_timer = data;
- vlapic_start_timer(vlapic, 0);
+ vlapic_set_icr_timer(vlapic, data);
break;
case APIC_OFFSET_DCR:
- lapic->dcr_timer = data;
- vlapic->divisor = vlapic_timer_divisor(data);
+ vlapic_set_dcr(vlapic, data);
break;
case APIC_OFFSET_ESR:
@@ -764,70 +1279,6 @@ vlapic_write(struct vlapic *vlapic, uint64_t offset, uint64_t data)
return (retval);
}
-int
-vlapic_timer_tick(struct vlapic *vlapic)
-{
- int curticks, delta, periodic, fired;
- uint32_t ccr;
- uint32_t decrement, leftover;
-
-restart:
- curticks = ticks;
- delta = curticks - vlapic->ccr_ticks;
-
- /* Local APIC timer is disabled */
- if (vlapic->apic.icr_timer == 0)
- return (-1);
-
- /* One-shot mode and timer has already counted down to zero */
- periodic = vlapic_periodic_timer(vlapic);
- if (!periodic && vlapic->apic.ccr_timer == 0)
- return (-1);
- /*
- * The 'curticks' and 'ccr_ticks' are out of sync by more than
- * 2^31 ticks. We deal with this by restarting the timer.
- */
- if (delta < 0) {
- vlapic_start_timer(vlapic, 0);
- goto restart;
- }
-
- fired = 0;
- decrement = (VLAPIC_BUS_FREQ / vlapic->divisor) / hz;
-
- vlapic->ccr_ticks = curticks;
- ccr = vlapic->apic.ccr_timer;
-
- while (delta-- > 0) {
- if (ccr > decrement) {
- ccr -= decrement;
- continue;
- }
-
- /* Trigger the local apic timer interrupt */
- vlapic_fire_timer(vlapic);
- if (periodic) {
- leftover = decrement - ccr;
- vlapic_start_timer(vlapic, leftover);
- ccr = vlapic->apic.ccr_timer;
- } else {
- /*
- * One-shot timer has counted down to zero.
- */
- ccr = 0;
- }
- fired = 1;
- break;
- }
-
- vlapic->apic.ccr_timer = ccr;
-
- if (!fired)
- return ((ccr / decrement) + 1);
- else
- return (0);
-}
-
struct vlapic *
vlapic_init(struct vm *vm, int vcpuid)
{
@@ -837,6 +1288,16 @@ vlapic_init(struct vm *vm, int vcpuid)
vlapic->vm = vm;
vlapic->vcpuid = vcpuid;
+ /*
+ * If the vlapic is configured in x2apic mode then it will be
+ * accessed in the critical section via the MSR emulation code.
+ *
+ * Therefore the timer mutex must be a spinlock because blockable
+ * mutexes cannot be acquired in a critical section.
+ */
+ mtx_init(&vlapic->timer_mtx, "vlapic timer mtx", NULL, MTX_SPIN);
+ callout_init(&vlapic->callout, 1);
+
vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED;
if (vcpuid == 0)
@@ -851,6 +1312,7 @@ void
vlapic_cleanup(struct vlapic *vlapic)
{
+ callout_drain(&vlapic->callout);
free(vlapic, M_VLAPIC);
}
@@ -887,3 +1349,43 @@ vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
if (state == X2APIC_DISABLED)
vlapic->msr_apicbase &= ~APICBASE_X2APIC;
}
+
+void
+vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys,
+ int delmode, int vec)
+{
+ bool lowprio;
+ int vcpuid;
+ cpuset_t dmask;
+
+ if (delmode != APIC_DELMODE_FIXED && delmode != APIC_DELMODE_LOWPRIO) {
+ VM_CTR1(vm, "vlapic intr invalid delmode %#x", delmode);
+ return;
+ }
+ lowprio = (delmode == APIC_DELMODE_LOWPRIO);
+
+ /*
+ * We don't provide any virtual interrupt redirection hardware so
+ * all interrupts originating from the ioapic or MSI specify the
+ * 'dest' in the legacy xAPIC format.
+ */
+ vlapic_calcdest(vm, &dmask, dest, phys, lowprio, false);
+
+ while ((vcpuid = CPU_FFS(&dmask)) != 0) {
+ vcpuid--;
+ CPU_CLR(vcpuid, &dmask);
+ lapic_set_intr(vm, vcpuid, vec, level);
+ }
+}
+
+bool
+vlapic_enabled(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+
+ if ((vlapic->msr_apicbase & APICBASE_ENABLED) != 0 &&
+ (lapic->svr & APIC_SVR_ENABLE) != 0)
+ return (true);
+ else
+ return (false);
+}
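The logical-destination matching inside vlapic_calcdest() above is dense; the following standalone sketch (illustration only, not code from this commit) restates the xAPIC "Flat Model" case with a worked example:

#include <stdbool.h>
#include <stdint.h>

/* Minimal restatement of the flat-model match done by vlapic_calcdest(). */
static bool
flat_model_match(uint32_t ldr, uint32_t mda)
{
	uint32_t ldest = ldr >> 24;		/* xAPIC LDR bits 31:24 */
	uint32_t mda_flat_ldest = mda & 0xff;	/* MDA as an 8-bit bitmask */

	return ((mda_flat_ldest & ldest) != 0);
}

/*
 * Example: MDA 0x05 addresses every vcpu whose LDR has bit 0 or bit 2 set:
 *   flat_model_match(0x01000000, 0x05) -> true   (bit 0 overlaps)
 *   flat_model_match(0x02000000, 0x05) -> false  (vcpu is skipped)
 *   flat_model_match(0x04000000, 0x05) -> true   (bit 2 overlaps)
 */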
diff --git a/sys/amd64/vmm/io/vlapic.h b/sys/amd64/vmm/io/vlapic.h
index 8ea65ee..98f377e 100644
--- a/sys/amd64/vmm/io/vlapic.h
+++ b/sys/amd64/vmm/io/vlapic.h
@@ -69,6 +69,7 @@ struct vm;
#define APIC_OFFSET_IRR6 0x260 // IRR 192-223 R
#define APIC_OFFSET_IRR7 0x270 // IRR 224-255 R
#define APIC_OFFSET_ESR 0x280 // Error Status Register R
+#define APIC_OFFSET_CMCI_LVT 0x2F0 // Local Vector Table (CMCI) R/W
#define APIC_OFFSET_ICR_LOW 0x300 // Interrupt Command Reg. (0-31) R/W
#define APIC_OFFSET_ICR_HI 0x310 // Interrupt Command Reg. (32-63) R/W
#define APIC_OFFSET_TIMER_LVT 0x320 // Local Vector Table (Timer) R/W
@@ -90,15 +91,22 @@ enum x2apic_state;
struct vlapic *vlapic_init(struct vm *vm, int vcpuid);
void vlapic_cleanup(struct vlapic *vlapic);
-int vlapic_write(struct vlapic *vlapic, uint64_t offset, uint64_t data);
-int vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data);
+int vlapic_write(struct vlapic *vlapic, uint64_t offset, uint64_t data,
+ bool *retu);
+int vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data,
+ bool *retu);
int vlapic_pending_intr(struct vlapic *vlapic);
void vlapic_intr_accepted(struct vlapic *vlapic, int vector);
void vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level);
-int vlapic_timer_tick(struct vlapic *vlapic);
+void vlapic_set_error(struct vlapic *vlapic, uint32_t mask);
+void vlapic_fire_cmci(struct vlapic *vlapic);
+int vlapic_trigger_lvt(struct vlapic *vlapic, int vector);
uint64_t vlapic_get_apicbase(struct vlapic *vlapic);
void vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val);
void vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state s);
+bool vlapic_enabled(struct vlapic *vlapic);
+void vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys,
+ int delmode, int vec);
#endif /* _VLAPIC_H_ */
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index 8cbd679..f471218b 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -56,6 +56,7 @@ __FBSDID("$FreeBSD$");
#include <machine/vm.h>
#include <machine/pcb.h>
#include <machine/smp.h>
+#include <x86/psl.h>
#include <x86/apicreg.h>
#include <machine/vmparam.h>
@@ -869,41 +870,44 @@ vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
* Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
*/
static int
-vm_handle_hlt(struct vm *vm, int vcpuid, boolean_t *retu)
+vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
{
+ struct vm_exit *vmexit;
struct vcpu *vcpu;
- int sleepticks, t;
+ int t, timo;
vcpu = &vm->vcpu[vcpuid];
vcpu_lock(vcpu);
/*
- * Figure out the number of host ticks until the next apic
- * timer interrupt in the guest.
- */
- sleepticks = lapic_timer_tick(vm, vcpuid);
-
- /*
- * If the guest local apic timer is disabled then sleep for
- * a long time but not forever.
- */
- if (sleepticks < 0)
- sleepticks = hz;
-
- /*
* Do a final check for pending NMI or interrupts before
* really putting this thread to sleep.
*
* These interrupts could have happened any time after we
* returned from VMRUN() and before we grabbed the vcpu lock.
*/
- if (!vm_nmi_pending(vm, vcpuid) && lapic_pending_intr(vm, vcpuid) < 0) {
- if (sleepticks <= 0)
- panic("invalid sleepticks %d", sleepticks);
+ if (!vm_nmi_pending(vm, vcpuid) &&
+ (intr_disabled || vlapic_pending_intr(vcpu->vlapic) < 0)) {
t = ticks;
vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
- msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks);
+ if (vlapic_enabled(vcpu->vlapic)) {
+ /*
+ * XXX msleep_spin() is not interruptible so use the
+ * 'timo' to put an upper bound on the sleep time.
+ */
+ timo = hz;
+ msleep_spin(vcpu, &vcpu->mtx, "vmidle", timo);
+ } else {
+ /*
+ * Spindown the vcpu if the apic is disabled and it
+ * had entered the halted state.
+ */
+ *retu = true;
+ vmexit = vm_exitinfo(vm, vcpuid);
+ vmexit->exitcode = VM_EXITCODE_SPINDOWN_CPU;
+ VCPU_CTR0(vm, vcpuid, "spinning down cpu");
+ }
vcpu_require_state_locked(vcpu, VCPU_FROZEN);
vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
}
@@ -913,7 +917,7 @@ vm_handle_hlt(struct vm *vm, int vcpuid, boolean_t *retu)
}
static int
-vm_handle_paging(struct vm *vm, int vcpuid, boolean_t *retu)
+vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
{
int rv, ftype;
struct vm_map *map;
@@ -951,7 +955,7 @@ done:
}
static int
-vm_handle_inst_emul(struct vm *vm, int vcpuid, boolean_t *retu)
+vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
{
struct vie *vie;
struct vcpu *vcpu;
@@ -992,15 +996,12 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, boolean_t *retu)
mread = vhpet_mmio_read;
mwrite = vhpet_mmio_write;
} else {
- *retu = TRUE;
+ *retu = true;
return (0);
}
- error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite, 0);
-
- /* return to userland to spin up the AP */
- if (error == 0 && vme->exitcode == VM_EXITCODE_SPINUP_AP)
- *retu = TRUE;
+ error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite,
+ retu);
return (error);
}
@@ -1013,7 +1014,7 @@ vm_run(struct vm *vm, struct vm_run *vmrun)
struct pcb *pcb;
uint64_t tscval, rip;
struct vm_exit *vme;
- boolean_t retu;
+ bool retu, intr_disabled;
pmap_t pmap;
vcpuid = vmrun->cpuid;
@@ -1053,10 +1054,11 @@ restart:
critical_exit();
if (error == 0) {
- retu = FALSE;
+ retu = false;
switch (vme->exitcode) {
case VM_EXITCODE_HLT:
- error = vm_handle_hlt(vm, vcpuid, &retu);
+ intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
+ error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
break;
case VM_EXITCODE_PAGING:
error = vm_handle_paging(vm, vcpuid, &retu);
@@ -1065,12 +1067,12 @@ restart:
error = vm_handle_inst_emul(vm, vcpuid, &retu);
break;
default:
- retu = TRUE; /* handled in userland */
+ retu = true; /* handled in userland */
break;
}
}
- if (error == 0 && retu == FALSE) {
+ if (error == 0 && retu == false) {
rip = vme->rip + vme->inst_length;
goto restart;
}
@@ -1109,7 +1111,7 @@ vm_inject_nmi(struct vm *vm, int vcpuid)
vcpu = &vm->vcpu[vcpuid];
vcpu->nmi_pending = 1;
- vm_interrupt_hostcpu(vm, vcpuid);
+ vcpu_notify_event(vm, vcpuid);
return (0);
}
@@ -1329,8 +1331,15 @@ vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
return (0);
}
+/*
+ * This function is called to ensure that a vcpu "sees" a pending event
+ * as soon as possible:
+ * - If the vcpu thread is sleeping then it is woken up.
+ * - If the vcpu is running on a different host_cpu then an IPI will be directed
+ * to the host_cpu to cause the vcpu to trap into the hypervisor.
+ */
void
-vm_interrupt_hostcpu(struct vm *vm, int vcpuid)
+vcpu_notify_event(struct vm *vm, int vcpuid)
{
int hostcpu;
struct vcpu *vcpu;
diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c
index f248f68..02847c2 100644
--- a/sys/amd64/vmm/vmm_dev.c
+++ b/sys/amd64/vmm/vmm_dev.c
@@ -152,6 +152,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
struct vm_run *vmrun;
struct vm_event *vmevent;
struct vm_lapic_irq *vmirq;
+ struct vm_lapic_msi *vmmsi;
struct vm_ioapic_irq *ioapic_irq;
struct vm_capability *vmcap;
struct vm_pptdev *pptdev;
@@ -254,7 +255,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
pptmsi = (struct vm_pptdev_msi *)data;
error = ppt_setup_msi(sc->vm, pptmsi->vcpu,
pptmsi->bus, pptmsi->slot, pptmsi->func,
- pptmsi->destcpu, pptmsi->vector,
+ pptmsi->addr, pptmsi->msg,
pptmsi->numvec);
break;
case VM_PPTDEV_MSIX:
@@ -262,8 +263,8 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
pptmsix->bus, pptmsix->slot,
pptmsix->func, pptmsix->idx,
- pptmsix->msg, pptmsix->vector_control,
- pptmsix->addr);
+ pptmsix->addr, pptmsix->msg,
+ pptmsix->vector_control);
break;
case VM_MAP_PPTDEV_MMIO:
pptmmio = (struct vm_pptdev_mmio *)data;
@@ -296,6 +297,15 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
vmirq = (struct vm_lapic_irq *)data;
error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector);
break;
+ case VM_LAPIC_LOCAL_IRQ:
+ vmirq = (struct vm_lapic_irq *)data;
+ error = lapic_set_local_intr(sc->vm, vmirq->cpuid,
+ vmirq->vector);
+ break;
+ case VM_LAPIC_MSI:
+ vmmsi = (struct vm_lapic_msi *)data;
+ error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg);
+ break;
case VM_IOAPIC_ASSERT_IRQ:
ioapic_irq = (struct vm_ioapic_irq *)data;
error = vioapic_assert_irq(sc->vm, ioapic_irq->irq);
diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c
index 465ce6c..8d915cd 100644
--- a/sys/amd64/vmm/vmm_lapic.c
+++ b/sys/amd64/vmm/vmm_lapic.c
@@ -38,9 +38,18 @@ __FBSDID("$FreeBSD$");
#include <machine/vmm.h>
#include "vmm_ipi.h"
+#include "vmm_ktr.h"
#include "vmm_lapic.h"
#include "vlapic.h"
+/*
+ * Some MSI message definitions
+ */
+#define MSI_X86_ADDR_MASK 0xfff00000
+#define MSI_X86_ADDR_BASE 0xfee00000
+#define MSI_X86_ADDR_RH 0x00000008 /* Redirection Hint */
+#define MSI_X86_ADDR_LOG 0x00000004 /* Destination Mode */
+
int
lapic_pending_intr(struct vm *vm, int cpu)
{
@@ -75,19 +84,74 @@ lapic_set_intr(struct vm *vm, int cpu, int vector, bool level)
vlapic = vm_lapic(vm, cpu);
vlapic_set_intr_ready(vlapic, vector, level);
- vm_interrupt_hostcpu(vm, cpu);
+ vcpu_notify_event(vm, cpu);
return (0);
}
int
-lapic_timer_tick(struct vm *vm, int cpu)
+lapic_set_local_intr(struct vm *vm, int cpu, int vector)
{
struct vlapic *vlapic;
+ cpuset_t dmask;
+ int error;
- vlapic = vm_lapic(vm, cpu);
+ if (cpu < -1 || cpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (cpu == -1)
+ dmask = vm_active_cpus(vm);
+ else
+ CPU_SETOF(cpu, &dmask);
+ error = 0;
+ while ((cpu = CPU_FFS(&dmask)) != 0) {
+ cpu--;
+ CPU_CLR(cpu, &dmask);
+ vlapic = vm_lapic(vm, cpu);
+ error = vlapic_trigger_lvt(vlapic, vector);
+ if (error)
+ break;
+ }
+
+ return (error);
+}
- return (vlapic_timer_tick(vlapic));
+int
+lapic_intr_msi(struct vm *vm, uint64_t addr, uint64_t msg)
+{
+ int delmode, vec;
+ uint32_t dest;
+ bool phys;
+
+ VM_CTR2(vm, "lapic MSI addr: %#lx msg: %#lx", addr, msg);
+
+ if ((addr & MSI_X86_ADDR_MASK) != MSI_X86_ADDR_BASE) {
+ VM_CTR1(vm, "lapic MSI invalid addr %#lx", addr);
+ return (-1);
+ }
+
+ /*
+ * Extract the x86-specific fields from the MSI addr/msg
+ * params according to the Intel Arch spec, Vol3 Ch 10.
+ *
+ * The PCI specification does not support level triggered
+ * MSI/MSI-X so ignore trigger level in 'msg'.
+ *
+ * The 'dest' is interpreted as a logical APIC ID if both
+ * the Redirection Hint and Destination Mode are '1' and
+ * physical otherwise.
+ */
+ dest = (addr >> 12) & 0xff;
+ phys = ((addr & (MSI_X86_ADDR_RH | MSI_X86_ADDR_LOG)) !=
+ (MSI_X86_ADDR_RH | MSI_X86_ADDR_LOG));
+ delmode = msg & APIC_DELMODE_MASK;
+ vec = msg & 0xff;
+
+ VM_CTR3(vm, "lapic MSI %s dest %#x, vec %d",
+ phys ? "physical" : "logical", dest, vec);
+
+ vlapic_deliver_intr(vm, LAPIC_TRIG_EDGE, dest, phys, delmode, vec);
+ return (0);
}
static boolean_t
@@ -117,7 +181,7 @@ lapic_msr(u_int msr)
}
int
-lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval)
+lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval, bool *retu)
{
int error;
u_int offset;
@@ -130,14 +194,14 @@ lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval)
error = 0;
} else {
offset = x2apic_msr_to_regoff(msr);
- error = vlapic_read(vlapic, offset, rval);
+ error = vlapic_read(vlapic, offset, rval, retu);
}
return (error);
}
int
-lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t val)
+lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t val, bool *retu)
{
int error;
u_int offset;
@@ -150,7 +214,7 @@ lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t val)
error = 0;
} else {
offset = x2apic_msr_to_regoff(msr);
- error = vlapic_write(vlapic, offset, val);
+ error = vlapic_write(vlapic, offset, val, retu);
}
return (error);
@@ -174,7 +238,7 @@ lapic_mmio_write(void *vm, int cpu, uint64_t gpa, uint64_t wval, int size,
return (EINVAL);
vlapic = vm_lapic(vm, cpu);
- error = vlapic_write(vlapic, off, wval);
+ error = vlapic_write(vlapic, off, wval, arg);
return (error);
}
@@ -196,6 +260,6 @@ lapic_mmio_read(void *vm, int cpu, uint64_t gpa, uint64_t *rval, int size,
return (EINVAL);
vlapic = vm_lapic(vm, cpu);
- error = vlapic_read(vlapic, off, rval);
+ error = vlapic_read(vlapic, off, rval, arg);
return (error);
}
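As a sanity check of the field extraction in lapic_intr_msi() above, here is a worked decode of one invented addr/msg pair:

/*
 * Hypothetical MSI: addr = 0xfee0200c, msg = 0x0030.
 *
 *   (addr & MSI_X86_ADDR_MASK) == MSI_X86_ADDR_BASE -> accepted
 *   dest    = (addr >> 12) & 0xff = 0x02
 *   addr & (RH|LOG) == (RH|LOG), i.e. 0xc == 0xc    -> phys = false (logical)
 *   delmode = msg & APIC_DELMODE_MASK               -> APIC_DELMODE_FIXED
 *   vec     = msg & 0xff                            -> 0x30
 *
 * The message is delivered edge-triggered, fixed mode, vector 0x30, to the
 * vcpus whose logical IDs match MDA 0x02 via vlapic_deliver_intr().
 */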
diff --git a/sys/amd64/vmm/vmm_lapic.h b/sys/amd64/vmm/vmm_lapic.h
index 1461185..c5c95aa 100644
--- a/sys/amd64/vmm/vmm_lapic.h
+++ b/sys/amd64/vmm/vmm_lapic.h
@@ -32,16 +32,16 @@
struct vm;
boolean_t lapic_msr(u_int num);
-int lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval);
-int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t wval);
+int lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval,
+ bool *retu);
+int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t wval,
+ bool *retu);
int lapic_mmio_read(void *vm, int cpu, uint64_t gpa,
uint64_t *rval, int size, void *arg);
int lapic_mmio_write(void *vm, int cpu, uint64_t gpa,
uint64_t wval, int size, void *arg);
-int lapic_timer_tick(struct vm *vm, int cpu);
-
/*
* Returns a vector between 32 and 255 if an interrupt is pending in the
* IRR that can be delivered based on the current state of ISR and TPR.
@@ -84,4 +84,12 @@ lapic_intr_edge(struct vm *vm, int cpu, int vector)
return (lapic_set_intr(vm, cpu, vector, LAPIC_TRIG_EDGE));
}
+/*
+ * Triggers the LAPIC local interrupt (LVT) 'vector' on 'cpu'. 'cpu' can
+ * be set to -1 to trigger the interrupt on all CPUs.
+ */
+int lapic_set_local_intr(struct vm *vm, int cpu, int vector);
+
+int lapic_intr_msi(struct vm *vm, uint64_t addr, uint64_t msg);
+
#endif
diff --git a/sys/amd64/vmm/vmm_msr.c b/sys/amd64/vmm/vmm_msr.c
index 4011bb5..03e0071 100644
--- a/sys/amd64/vmm/vmm_msr.c
+++ b/sys/amd64/vmm/vmm_msr.c
@@ -154,13 +154,13 @@ msr_num_to_idx(u_int num)
}
int
-emulate_wrmsr(struct vm *vm, int cpu, u_int num, uint64_t val)
+emulate_wrmsr(struct vm *vm, int cpu, u_int num, uint64_t val, bool *retu)
{
int idx;
uint64_t *guest_msrs;
if (lapic_msr(num))
- return (lapic_wrmsr(vm, cpu, num, val));
+ return (lapic_wrmsr(vm, cpu, num, val, retu));
idx = msr_num_to_idx(num);
if (idx < 0 || invalid_msr(idx))
@@ -181,14 +181,14 @@ emulate_wrmsr(struct vm *vm, int cpu, u_int num, uint64_t val)
}
int
-emulate_rdmsr(struct vm *vm, int cpu, u_int num)
+emulate_rdmsr(struct vm *vm, int cpu, u_int num, bool *retu)
{
int error, idx;
uint32_t eax, edx;
uint64_t result, *guest_msrs;
if (lapic_msr(num)) {
- error = lapic_rdmsr(vm, cpu, num, &result);
+ error = lapic_rdmsr(vm, cpu, num, &result, retu);
goto done;
}
diff --git a/sys/amd64/vmm/vmm_msr.h b/sys/amd64/vmm/vmm_msr.h
index 8a1fda3..e070037 100644
--- a/sys/amd64/vmm/vmm_msr.h
+++ b/sys/amd64/vmm/vmm_msr.h
@@ -33,8 +33,9 @@
struct vm;
void vmm_msr_init(void);
-int emulate_wrmsr(struct vm *vm, int vcpu, u_int msr, uint64_t val);
-int emulate_rdmsr(struct vm *vm, int vcpu, u_int msr);
+int emulate_wrmsr(struct vm *vm, int vcpu, u_int msr, uint64_t val,
+ bool *retu);
+int emulate_rdmsr(struct vm *vm, int vcpu, u_int msr, bool *retu);
void guest_msrs_init(struct vm *vm, int cpu);
void guest_msr_valid(int msr);
void restore_host_msrs(struct vm *vm, int cpu);