author		Renato Botelho <renato@netgate.com>	2016-10-06 07:51:36 -0300
committer	Renato Botelho <renato@netgate.com>	2016-10-06 07:51:36 -0300
commit		5dd7ab172435dc99e1abc1f7d73fd5e720050bbc (patch)
tree		19b1dffa928be3fe0fb3b046f47c39e471c592bc /sys/amd64
parent		32988844e41ba2dafefd4b0ca819b8c48ecbbb81 (diff)
parent		22e3935b200286024203db77aec100fca35e3e95 (diff)
Merge remote-tracking branch 'origin/stable/11' into devel-11
Diffstat (limited to 'sys/amd64')
-rw-r--r--	sys/amd64/amd64/efirt.c			609
-rw-r--r--	sys/amd64/amd64/machdep.c		  4
-rw-r--r--	sys/amd64/amd64/minidump_machdep.c	 10
-rw-r--r--	sys/amd64/amd64/pmap.c			123
-rw-r--r--	sys/amd64/conf/NOTES			  3
-rw-r--r--	sys/amd64/include/cpufunc.h		 33
-rw-r--r--	sys/amd64/include/efi.h			 59
-rw-r--r--	sys/amd64/include/pmap.h		 31
-rw-r--r--	sys/amd64/vmm/io/iommu.c		 66
-rw-r--r--	sys/amd64/vmm/io/iommu.h		  1
-rw-r--r--	sys/amd64/vmm/io/ppt.c			 13
-rw-r--r--	sys/amd64/vmm/vmm.c			 12
12 files changed, 889 insertions, 75 deletions
diff --git a/sys/amd64/amd64/efirt.c b/sys/amd64/amd64/efirt.c
new file mode 100644
index 0000000..8db768b
--- /dev/null
+++ b/sys/amd64/amd64/efirt.c
@@ -0,0 +1,609 @@
+/*-
+ * Copyright (c) 2004 Marcel Moolenaar
+ * Copyright (c) 2001 Doug Rabson
+ * Copyright (c) 2016 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * Portions of this software were developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/efi.h>
+#include <sys/kernel.h>
+#include <sys/linker.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/clock.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/sched.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <machine/fpu.h>
+#include <machine/efi.h>
+#include <machine/metadata.h>
+#include <machine/md_var.h>
+#include <machine/smp.h>
+#include <machine/vmparam.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+
+static struct efi_systbl *efi_systbl;
+static struct efi_cfgtbl *efi_cfgtbl;
+static struct efi_rt *efi_runtime;
+
+static int efi_status2err[25] = {
+ 0, /* EFI_SUCCESS */
+ ENOEXEC, /* EFI_LOAD_ERROR */
+ EINVAL, /* EFI_INVALID_PARAMETER */
+ ENOSYS, /* EFI_UNSUPPORTED */
+ EMSGSIZE, /* EFI_BAD_BUFFER_SIZE */
+ EOVERFLOW, /* EFI_BUFFER_TOO_SMALL */
+ EBUSY, /* EFI_NOT_READY */
+ EIO, /* EFI_DEVICE_ERROR */
+ EROFS, /* EFI_WRITE_PROTECTED */
+ EAGAIN, /* EFI_OUT_OF_RESOURCES */
+ EIO, /* EFI_VOLUME_CORRUPTED */
+ ENOSPC, /* EFI_VOLUME_FULL */
+ ENXIO, /* EFI_NO_MEDIA */
+ ESTALE, /* EFI_MEDIA_CHANGED */
+ ENOENT, /* EFI_NOT_FOUND */
+ EACCES, /* EFI_ACCESS_DENIED */
+ ETIMEDOUT, /* EFI_NO_RESPONSE */
+ EADDRNOTAVAIL, /* EFI_NO_MAPPING */
+ ETIMEDOUT, /* EFI_TIMEOUT */
+ EDOOFUS, /* EFI_NOT_STARTED */
+ EALREADY, /* EFI_ALREADY_STARTED */
+ ECANCELED, /* EFI_ABORTED */
+ EPROTO, /* EFI_ICMP_ERROR */
+ EPROTO, /* EFI_TFTP_ERROR */
+ EPROTO /* EFI_PROTOCOL_ERROR */
+};
+
+static int
+efi_status_to_errno(efi_status status)
+{
+ u_long code;
+
+ code = status & 0x3ffffffffffffffful;
+ return (code < nitems(efi_status2err) ? efi_status2err[code] : EDOOFUS);
+}
+
+static struct mtx efi_lock;
+static pml4_entry_t *efi_pml4;
+static vm_object_t obj_1t1_pt;
+static vm_page_t efi_pml4_page;
+
+static void
+efi_destroy_1t1_map(void)
+{
+ vm_page_t m;
+
+ if (obj_1t1_pt != NULL) {
+ VM_OBJECT_RLOCK(obj_1t1_pt);
+ TAILQ_FOREACH(m, &obj_1t1_pt->memq, listq)
+ m->wire_count = 0;
+ atomic_subtract_int(&vm_cnt.v_wire_count,
+ obj_1t1_pt->resident_page_count);
+ VM_OBJECT_RUNLOCK(obj_1t1_pt);
+ vm_object_deallocate(obj_1t1_pt);
+ }
+
+ obj_1t1_pt = NULL;
+ efi_pml4 = NULL;
+ efi_pml4_page = NULL;
+}
+
+static vm_page_t
+efi_1t1_page(vm_pindex_t idx)
+{
+
+ return (vm_page_grab(obj_1t1_pt, idx, VM_ALLOC_NOBUSY |
+ VM_ALLOC_WIRED | VM_ALLOC_ZERO));
+}
+
+static pt_entry_t *
+efi_1t1_pte(vm_offset_t va)
+{
+ pml4_entry_t *pml4e;
+ pdp_entry_t *pdpe;
+ pd_entry_t *pde;
+ pt_entry_t *pte;
+ vm_page_t m;
+ vm_pindex_t pml4_idx, pdp_idx, pd_idx;
+ vm_paddr_t mphys;
+
+ pml4_idx = pmap_pml4e_index(va);
+ pml4e = &efi_pml4[pml4_idx];
+ if (*pml4e == 0) {
+ m = efi_1t1_page(1 + pml4_idx);
+ mphys = VM_PAGE_TO_PHYS(m);
+ *pml4e = mphys | X86_PG_RW | X86_PG_V;
+ } else {
+ mphys = *pml4e & ~PAGE_MASK;
+ }
+
+ pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys);
+ pdp_idx = pmap_pdpe_index(va);
+ pdpe += pdp_idx;
+ if (*pdpe == 0) {
+ m = efi_1t1_page(1 + NPML4EPG + (pml4_idx + 1) * (pdp_idx + 1));
+ mphys = VM_PAGE_TO_PHYS(m);
+ *pdpe = mphys | X86_PG_RW | X86_PG_V;
+ } else {
+ mphys = *pdpe & ~PAGE_MASK;
+ }
+
+ pde = (pd_entry_t *)PHYS_TO_DMAP(mphys);
+ pd_idx = pmap_pde_index(va);
+ pde += pd_idx;
+ if (*pde == 0) {
+ m = efi_1t1_page(1 + NPML4EPG + NPML4EPG * NPDPEPG +
+ (pml4_idx + 1) * (pdp_idx + 1) * (pd_idx + 1));
+ mphys = VM_PAGE_TO_PHYS(m);
+ *pde = mphys | X86_PG_RW | X86_PG_V;
+ } else {
+ mphys = *pde & ~PAGE_MASK;
+ }
+
+ pte = (pt_entry_t *)PHYS_TO_DMAP(mphys);
+ pte += pmap_pte_index(va);
+ KASSERT(*pte == 0, ("va %#jx *pt %#jx", va, *pte));
+
+ return (pte);
+}
+
+static bool
+efi_create_1t1_map(struct efi_md *map, int ndesc, int descsz)
+{
+ struct efi_md *p;
+ pt_entry_t *pte;
+ vm_offset_t va;
+ uint64_t idx;
+ int bits, i, mode;
+
+ obj_1t1_pt = vm_pager_allocate(OBJT_PHYS, NULL, 1 + NPML4EPG +
+ NPML4EPG * NPDPEPG + NPML4EPG * NPDPEPG * NPDEPG,
+ VM_PROT_ALL, 0, NULL);
+ VM_OBJECT_WLOCK(obj_1t1_pt);
+ efi_pml4_page = efi_1t1_page(0);
+ VM_OBJECT_WUNLOCK(obj_1t1_pt);
+ efi_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(efi_pml4_page));
+ pmap_pinit_pml4(efi_pml4_page);
+
+ for (i = 0, p = map; i < ndesc; i++, p = efi_next_descriptor(p,
+ descsz)) {
+ if ((p->md_attr & EFI_MD_ATTR_RT) == 0)
+ continue;
+ if (p->md_virt != NULL) {
+ if (bootverbose)
+ printf("EFI Runtime entry %d is mapped\n", i);
+ goto fail;
+ }
+ if ((p->md_phys & EFI_PAGE_MASK) != 0) {
+ if (bootverbose)
+ printf("EFI Runtime entry %d is not aligned\n",
+ i);
+ goto fail;
+ }
+ if (p->md_phys + p->md_pages * EFI_PAGE_SIZE < p->md_phys ||
+ p->md_phys + p->md_pages * EFI_PAGE_SIZE >=
+ VM_MAXUSER_ADDRESS) {
+ printf("EFI Runtime entry %d is not in mappable for RT:"
+ "base %#016jx %#jx pages\n",
+ i, (uintmax_t)p->md_phys,
+ (uintmax_t)p->md_pages);
+ goto fail;
+ }
+ if ((p->md_attr & EFI_MD_ATTR_WB) != 0)
+ mode = VM_MEMATTR_WRITE_BACK;
+ else if ((p->md_attr & EFI_MD_ATTR_WT) != 0)
+ mode = VM_MEMATTR_WRITE_THROUGH;
+ else if ((p->md_attr & EFI_MD_ATTR_WC) != 0)
+ mode = VM_MEMATTR_WRITE_COMBINING;
+ else if ((p->md_attr & EFI_MD_ATTR_WP) != 0)
+ mode = VM_MEMATTR_WRITE_PROTECTED;
+ else if ((p->md_attr & EFI_MD_ATTR_UC) != 0)
+ mode = VM_MEMATTR_UNCACHEABLE;
+ else {
+ if (bootverbose)
+ printf("EFI Runtime entry %d mapping "
+ "attributes unsupported\n", i);
+ mode = VM_MEMATTR_UNCACHEABLE;
+ }
+ bits = pmap_cache_bits(kernel_pmap, mode, FALSE) | X86_PG_RW |
+ X86_PG_V;
+ VM_OBJECT_WLOCK(obj_1t1_pt);
+ for (va = p->md_phys, idx = 0; idx < p->md_pages; idx++,
+ va += PAGE_SIZE) {
+ pte = efi_1t1_pte(va);
+ pte_store(pte, va | bits);
+ }
+ VM_OBJECT_WUNLOCK(obj_1t1_pt);
+ }
+
+ return (true);
+
+fail:
+ efi_destroy_1t1_map();
+ return (false);
+}
+
+/*
+ * Create an environment for the EFI runtime code call.  The most
+ * important part is creating the required 1:1 physical->virtual
+ * mappings for the runtime segments.  To do that, we manually create
+ * a page table which unmaps userspace but provides the correct
+ * kernel mappings.  The 1:1 mappings for the runtime segments
+ * usually occupy the low 4G of the physical address space.
+ *
+ * The 1:1 mappings were chosen over the SetVirtualAddressMap() EFI RT
+ * service because some BIOSes fail to correctly relocate themselves
+ * on that call, requiring both the 1:1 and the virtual mapping.  As
+ * a result, we must provide the 1:1 mapping anyway, so there is no
+ * reason to bother with the virtual map, and no need to add
+ * complexity to the loader.
+ *
+ * The fpu_kern_enter() call allows the firmware to use the FPU, as
+ * mandated by the specification; in particular, the CR0.TS bit is
+ * cleared.  It also enters a critical section, giving us the
+ * necessary protection against context switches.
+ *
+ * There is no need to disable interrupts around the change of %cr3:
+ * the kernel mappings are correct, and we only replaced the
+ * userspace portion of the VA space.  Interrupt handlers must not
+ * access userspace.  Keeping interrupts enabled also avoids the
+ * problem where a long firmware/SMM operation would negatively
+ * affect IPIs, esp. TLB shootdown requests.
+ */
+static int
+efi_enter(void)
+{
+ pmap_t curpmap;
+ int error;
+
+ if (efi_runtime == NULL)
+ return (ENXIO);
+ curpmap = PCPU_GET(curpmap);
+ PMAP_LOCK(curpmap);
+ mtx_lock(&efi_lock);
+ error = fpu_kern_enter(curthread, NULL, FPU_KERN_NOCTX);
+ if (error != 0) {
+ PMAP_UNLOCK(curpmap);
+ return (error);
+ }
+
+ /*
+ * The IPI TLB shootdown handler invltlb_pcid_handler() reloads
+ * %cr3 from curpmap->pm_cr3, which would disable the runtime
+ * segment mappings. Block the handler's action by setting
+ * curpmap to an impossible value. See also the comment in
+ * pmap.c:pmap_activate_sw().
+ */
+ if (pmap_pcid_enabled && !invpcid_works)
+ PCPU_SET(curpmap, NULL);
+
+ load_cr3(VM_PAGE_TO_PHYS(efi_pml4_page) | (pmap_pcid_enabled ?
+ curpmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid : 0));
+ /*
+ * If PCID is enabled, the cleared CR3_PCID_SAVE bit in the
+ * loaded %cr3 causes a TLB invalidation.
+ */
+ if (!pmap_pcid_enabled)
+ invltlb();
+ return (0);
+}
+
+static void
+efi_leave(void)
+{
+ pmap_t curpmap;
+
+ curpmap = &curproc->p_vmspace->vm_pmap;
+ if (pmap_pcid_enabled && !invpcid_works)
+ PCPU_SET(curpmap, curpmap);
+ load_cr3(curpmap->pm_cr3 | (pmap_pcid_enabled ?
+ curpmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid : 0));
+ if (!pmap_pcid_enabled)
+ invltlb();
+
+ fpu_kern_leave(curthread, NULL);
+ mtx_unlock(&efi_lock);
+ PMAP_UNLOCK(curpmap);
+}
+
+static int
+efi_init(void)
+{
+ struct efi_map_header *efihdr;
+ struct efi_md *map;
+ caddr_t kmdp;
+ size_t efisz;
+
+ mtx_init(&efi_lock, "efi", NULL, MTX_DEF);
+
+ if (efi_systbl_phys == 0) {
+ if (bootverbose)
+ printf("EFI systbl not available\n");
+ return (ENXIO);
+ }
+ efi_systbl = (struct efi_systbl *)PHYS_TO_DMAP(efi_systbl_phys);
+ if (efi_systbl->st_hdr.th_sig != EFI_SYSTBL_SIG) {
+ efi_systbl = NULL;
+ if (bootverbose)
+ printf("EFI systbl signature invalid\n");
+ return (ENXIO);
+ }
+ efi_cfgtbl = (efi_systbl->st_cfgtbl == 0) ? NULL :
+ (struct efi_cfgtbl *)efi_systbl->st_cfgtbl;
+ if (efi_cfgtbl == NULL) {
+ if (bootverbose)
+ printf("EFI config table is not present\n");
+ }
+
+ kmdp = preload_search_by_type("elf kernel");
+ if (kmdp == NULL)
+ kmdp = preload_search_by_type("elf64 kernel");
+ efihdr = (struct efi_map_header *)preload_search_info(kmdp,
+ MODINFO_METADATA | MODINFOMD_EFI_MAP);
+ if (efihdr == NULL) {
+ if (bootverbose)
+ printf("EFI map is not present\n");
+ return (ENXIO);
+ }
+ efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
+ map = (struct efi_md *)((uint8_t *)efihdr + efisz);
+ if (efihdr->descriptor_size == 0)
+ return (ENOMEM);
+
+ if (!efi_create_1t1_map(map, efihdr->memory_size /
+ efihdr->descriptor_size, efihdr->descriptor_size)) {
+ if (bootverbose)
+ printf("EFI cannot create runtime map\n");
+ return (ENOMEM);
+ }
+
+ efi_runtime = (efi_systbl->st_rt == 0) ? NULL :
+ (struct efi_rt *)efi_systbl->st_rt;
+ if (efi_runtime == NULL) {
+ if (bootverbose)
+ printf("EFI runtime services table is not present\n");
+ efi_destroy_1t1_map();
+ return (ENXIO);
+ }
+
+ return (0);
+}
+
+static void
+efi_uninit(void)
+{
+
+ efi_destroy_1t1_map();
+
+ efi_systbl = NULL;
+ efi_cfgtbl = NULL;
+ efi_runtime = NULL;
+
+ mtx_destroy(&efi_lock);
+}
+
+int
+efi_get_table(struct uuid *uuid, void **ptr)
+{
+ struct efi_cfgtbl *ct;
+ u_long count;
+
+ if (efi_cfgtbl == NULL)
+ return (ENXIO);
+ count = efi_systbl->st_entries;
+ ct = efi_cfgtbl;
+ while (count--) {
+ if (!bcmp(&ct->ct_uuid, uuid, sizeof(*uuid))) {
+ *ptr = (void *)PHYS_TO_DMAP(ct->ct_data);
+ return (0);
+ }
+ ct++;
+ }
+ return (ENOENT);
+}
+
+int
+efi_get_time_locked(struct efi_tm *tm)
+{
+ efi_status status;
+ int error;
+
+ mtx_assert(&resettodr_lock, MA_OWNED);
+ error = efi_enter();
+ if (error != 0)
+ return (error);
+ status = efi_runtime->rt_gettime(tm, NULL);
+ efi_leave();
+ error = efi_status_to_errno(status);
+ return (error);
+}
+
+int
+efi_get_time(struct efi_tm *tm)
+{
+ int error;
+
+ if (efi_runtime == NULL)
+ return (ENXIO);
+ mtx_lock(&resettodr_lock);
+ error = efi_get_time_locked(tm);
+ mtx_unlock(&resettodr_lock);
+ return (error);
+}
+
+int
+efi_reset_system(void)
+{
+ int error;
+
+ error = efi_enter();
+ if (error != 0)
+ return (error);
+ efi_runtime->rt_reset(EFI_RESET_WARM, 0, 0, NULL);
+ efi_leave();
+ return (EIO);
+}
+
+int
+efi_set_time_locked(struct efi_tm *tm)
+{
+ efi_status status;
+ int error;
+
+ mtx_assert(&resettodr_lock, MA_OWNED);
+ error = efi_enter();
+ if (error != 0)
+ return (error);
+ status = efi_runtime->rt_settime(tm);
+ efi_leave();
+ error = efi_status_to_errno(status);
+ return (error);
+}
+
+int
+efi_set_time(struct efi_tm *tm)
+{
+ int error;
+
+ if (efi_runtime == NULL)
+ return (ENXIO);
+ mtx_lock(&resettodr_lock);
+ error = efi_set_time_locked(tm);
+ mtx_unlock(&resettodr_lock);
+ return (error);
+}
+
+int
+efi_var_get(efi_char *name, struct uuid *vendor, uint32_t *attrib,
+ size_t *datasize, void *data)
+{
+ efi_status status;
+ int error;
+
+ error = efi_enter();
+ if (error != 0)
+ return (error);
+ status = efi_runtime->rt_getvar(name, vendor, attrib, datasize, data);
+ efi_leave();
+ error = efi_status_to_errno(status);
+ return (error);
+}
+
+int
+efi_var_nextname(size_t *namesize, efi_char *name, struct uuid *vendor)
+{
+ efi_status status;
+ int error;
+
+ error = efi_enter();
+ if (error != 0)
+ return (error);
+ status = efi_runtime->rt_scanvar(namesize, name, vendor);
+ efi_leave();
+ error = efi_status_to_errno(status);
+ return (error);
+}
+
+int
+efi_var_set(efi_char *name, struct uuid *vendor, uint32_t attrib,
+ size_t datasize, void *data)
+{
+ efi_status status;
+ int error;
+
+ error = efi_enter();
+ if (error != 0)
+ return (error);
+ status = efi_runtime->rt_setvar(name, vendor, attrib, datasize, data);
+ efi_leave();
+ error = efi_status_to_errno(status);
+ return (error);
+}
+
+static int
+efirt_modevents(module_t m, int event, void *arg __unused)
+{
+
+ switch (event) {
+ case MOD_LOAD:
+ return (efi_init());
+ break;
+
+ case MOD_UNLOAD:
+ efi_uninit();
+ return (0);
+
+ case MOD_SHUTDOWN:
+ return (0);
+
+ default:
+ return (EOPNOTSUPP);
+ }
+}
+
+static moduledata_t efirt_moddata = {
+ .name = "efirt",
+ .evhand = efirt_modevents,
+ .priv = NULL,
+};
+DECLARE_MODULE(efirt, efirt_moddata, SI_SUB_VM_CONF, SI_ORDER_ANY);
+MODULE_VERSION(efirt, 1);
+
+/* XXX debug stuff */
+static int
+efi_time_sysctl_handler(SYSCTL_HANDLER_ARGS)
+{
+ struct efi_tm tm;
+ int error, val;
+
+ val = 0;
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ error = efi_get_time(&tm);
+ if (error == 0) {
+ uprintf("EFI reports: Year %d Month %d Day %d Hour %d Min %d "
+ "Sec %d\n", tm.tm_year, tm.tm_mon, tm.tm_mday, tm.tm_hour,
+ tm.tm_min, tm.tm_sec);
+ }
+ return (error);
+}
+
+SYSCTL_PROC(_debug, OID_AUTO, efi_time, CTLTYPE_INT | CTLFLAG_RW, NULL, 0,
+ efi_time_sysctl_handler, "I", "");
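
The file above is the whole of the new EFI Runtime Services glue; the debug sysctl handler at its end is the only in-tree consumer in this commit. A minimal sketch of how other kernel code would use the exported interface (the helper name is hypothetical; efi_get_time() and the struct efi_tm fields are taken from the diff):

	#include <sys/efi.h>		/* struct efi_tm */
	#include <machine/efi.h>	/* efi_get_time() and friends */

	/* Hypothetical consumer: log the firmware clock. */
	static int
	log_firmware_clock(void)
	{
		struct efi_tm tm;
		int error;

		/* Returns ENXIO when runtime services are unavailable. */
		error = efi_get_time(&tm);
		if (error != 0)
			return (error);
		printf("EFI time: %04d-%02d-%02d %02d:%02d:%02d\n",
		    tm.tm_year, tm.tm_mon, tm.tm_mday,
		    tm.tm_hour, tm.tm_min, tm.tm_sec);
		return (0);
	}

efi_get_time() takes resettodr_lock internally; a caller already holding that lock would use efi_get_time_locked() instead.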
diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
index 05b75dd..8aa21bb 100644
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -192,7 +192,7 @@ struct msgbuf *msgbufp;
* Physical address of the EFI System Table. Stashed from the metadata hints
* passed into the kernel and used by the EFI code to call runtime services.
*/
-vm_paddr_t efi_systbl;
+vm_paddr_t efi_systbl_phys;
/* Intel ICH registers */
#define ICH_PMBASE 0x400
@@ -1502,7 +1502,7 @@ native_parse_preload_data(u_int64_t modulep)
ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
db_fetch_ksymtab(ksym_start, ksym_end);
#endif
- efi_systbl = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
+ efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
return (kmdp);
}
diff --git a/sys/amd64/amd64/minidump_machdep.c b/sys/amd64/amd64/minidump_machdep.c
index df04f42..442819b 100644
--- a/sys/amd64/amd64/minidump_machdep.c
+++ b/sys/amd64/amd64/minidump_machdep.c
@@ -239,10 +239,10 @@ minidumpsys(struct dumperinfo *di)
* page written corresponds to 1GB of space
*/
pmapsize += PAGE_SIZE;
- ii = (va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1);
+ ii = pmap_pml4e_index(va);
pml4 = (uint64_t *)PHYS_TO_DMAP(KPML4phys) + ii;
pdp = (uint64_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
- i = (va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1);
+ i = pmap_pdpe_index(va);
if ((pdp[i] & PG_V) == 0) {
va += NBPDP;
continue;
@@ -264,7 +264,7 @@ minidumpsys(struct dumperinfo *di)
pd = (uint64_t *)PHYS_TO_DMAP(pdp[i] & PG_FRAME);
for (n = 0; n < NPDEPG; n++, va += NBPDR) {
- j = (va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1);
+ j = pmap_pde_index(va);
if ((pd[j] & PG_V) == 0)
continue;
@@ -368,10 +368,10 @@ minidumpsys(struct dumperinfo *di)
bzero(fakepd, sizeof(fakepd));
for (va = VM_MIN_KERNEL_ADDRESS; va < MAX(KERNBASE + nkpt * NBPDR,
kernel_vm_end); va += NBPDP) {
- ii = (va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1);
+ ii = pmap_pml4e_index(va);
pml4 = (uint64_t *)PHYS_TO_DMAP(KPML4phys) + ii;
pdp = (uint64_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
- i = (va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1);
+ i = pmap_pdpe_index(va);
/* We always write a page, even if it is zero */
if ((pdp[i] & PG_V) == 0) {
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index f87d3b5..ae525da 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -673,35 +673,6 @@ pmap_pde_pindex(vm_offset_t va)
}
-/* Return various clipped indexes for a given VA */
-static __inline vm_pindex_t
-pmap_pte_index(vm_offset_t va)
-{
-
- return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
-}
-
-static __inline vm_pindex_t
-pmap_pde_index(vm_offset_t va)
-{
-
- return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
-}
-
-static __inline vm_pindex_t
-pmap_pdpe_index(vm_offset_t va)
-{
-
- return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
-}
-
-static __inline vm_pindex_t
-pmap_pml4e_index(vm_offset_t va)
-{
-
- return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
-}
-
/* Return a pointer to the PML4 slot that corresponds to a VA */
static __inline pml4_entry_t *
pmap_pml4e(pmap_t pmap, vm_offset_t va)
@@ -1353,7 +1324,7 @@ pmap_swap_pat(pmap_t pmap, pt_entry_t entry)
* Determine the appropriate bits to set in a PTE or PDE for a specified
* caching mode.
*/
-static int
+int
pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
{
int cache_bits, pat_flag, pat_idx;
@@ -2374,6 +2345,29 @@ pmap_pinit0(pmap_t pmap)
CPU_FILL(&kernel_pmap->pm_active);
}
+void
+pmap_pinit_pml4(vm_page_t pml4pg)
+{
+ pml4_entry_t *pm_pml4;
+ int i;
+
+ pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
+
+ /* Wire in kernel global address entries. */
+ for (i = 0; i < NKPML4E; i++) {
+ pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW |
+ X86_PG_V | PG_U;
+ }
+ for (i = 0; i < ndmpdpphys; i++) {
+ pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW |
+ X86_PG_V | PG_U;
+ }
+
+ /* install self-referential address mapping entry(s) */
+ pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW |
+ X86_PG_A | X86_PG_M;
+}
+
/*
* Initialize a preallocated and zeroed pmap structure,
* such as one in a vmspace structure.
@@ -2410,20 +2404,7 @@ pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
*/
if ((pmap->pm_type = pm_type) == PT_X86) {
pmap->pm_cr3 = pml4phys;
-
- /* Wire in kernel global address entries. */
- for (i = 0; i < NKPML4E; i++) {
- pmap->pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) |
- X86_PG_RW | X86_PG_V | PG_U;
- }
- for (i = 0; i < ndmpdpphys; i++) {
- pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) |
- X86_PG_RW | X86_PG_V | PG_U;
- }
-
- /* install self-referential address mapping entry(s) */
- pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) |
- X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
+ pmap_pinit_pml4(pml4pg);
}
pmap->pm_root.rt_root = 0;
@@ -5850,6 +5831,14 @@ safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
* should be tested and standardized at some point in the future for
* optimal aging of shared pages.
*
+ * As an optimization, update the page's dirty field if a modified bit is
+ * found while counting reference bits. This opportunistic update can be
+ * performed at low cost and can eliminate the need for some future calls
+ * to pmap_is_modified(). However, since this function stops after
+ * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
+ * dirty pages. Those dirty pages will only be detected by a future call
+ * to pmap_is_modified().
+ *
* A DI block is not needed within this function, because
* invalidations are performed before the PV list lock is
* released.
@@ -5862,7 +5851,7 @@ pmap_ts_referenced(vm_page_t m)
pmap_t pmap;
struct rwlock *lock;
pd_entry_t oldpde, *pde;
- pt_entry_t *pte, PG_A;
+ pt_entry_t *pte, PG_A, PG_M, PG_RW;
vm_offset_t va;
vm_paddr_t pa;
int cleared, md_gen, not_cleared, pvh_gen;
@@ -5897,9 +5886,19 @@ retry:
}
}
PG_A = pmap_accessed_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
va = pv->pv_va;
pde = pmap_pde(pmap, pv->pv_va);
oldpde = *pde;
+ if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
+ /*
+ * Although "oldpde" is mapping a 2MB page, because
+ * this function is called at a 4KB page granularity,
+ * we only update the 4KB page under test.
+ */
+ vm_page_dirty(m);
+ }
if ((*pde & PG_A) != 0) {
/*
* Since this reference bit is shared by 512 4KB
@@ -5993,11 +5992,15 @@ small_mappings:
}
}
PG_A = pmap_accessed_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
pde = pmap_pde(pmap, pv->pv_va);
KASSERT((*pde & PG_PS) == 0,
("pmap_ts_referenced: found a 2mpage in page %p's pv list",
m));
pte = pmap_pde_to_pte(pde, pv->pv_va);
+ if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
+ vm_page_dirty(m);
if ((*pte & PG_A) != 0) {
if (safe_to_clear_referenced(pmap, *pte)) {
atomic_clear_long(pte, PG_A);
@@ -6865,6 +6868,7 @@ pmap_activate_sw(struct thread *td)
{
pmap_t oldpmap, pmap;
uint64_t cached, cr3;
+ register_t rflags;
u_int cpuid;
oldpmap = PCPU_GET(curpmap);
@@ -6888,16 +6892,43 @@ pmap_activate_sw(struct thread *td)
pmap == kernel_pmap,
("non-kernel pmap thread %p pmap %p cpu %d pcid %#x",
td, pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid));
+
+ /*
+ * If the INVPCID instruction is not available,
+ * invltlb_pcid_handler() is used to handle the
+ * invalidate_all IPI, which checks for curpmap ==
+ * smp_tlb_pmap. The operation sequence below has a
+ * window where %CR3 is loaded with the new pmap's
+ * PML4 address, but the curpmap value is not yet
+ * updated. This causes the invltlb IPI handler,
+ * called between the updates, to execute as a NOP,
+ * which leaves stale TLB entries.
+ *
+ * Note that the most typical use of
+ * pmap_activate_sw(), from the context switch, is
+ * immune to this race, because interrupts are
+ * disabled (while the thread lock is owned), and the
+ * IPI happens after curpmap is updated. Protect
+ * other callers in a similar way, by disabling
+ * interrupts around the %cr3 register reload and the
+ * curpmap assignment.
+ */
+ if (!invpcid_works)
+ rflags = intr_disable();
+
if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) {
load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid |
cached);
if (cached)
PCPU_INC(pm_save_cnt);
}
+ PCPU_SET(curpmap, pmap);
+ if (!invpcid_works)
+ intr_restore(rflags);
} else if (cr3 != pmap->pm_cr3) {
load_cr3(pmap->pm_cr3);
+ PCPU_SET(curpmap, pmap);
}
- PCPU_SET(curpmap, pmap);
#ifdef SMP
CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
#else
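
The pmap_activate_sw() change above is an instance of a general rule: when an IPI handler keys off a per-CPU variable (curpmap here), the hardware update and the variable update must be made atomic with respect to local interrupts. Condensed from the hunk, as a sketch (pcid_bits stands in for the pm_pcids[] lookup in the real code):

	register_t rflags;

	if (!invpcid_works)
		rflags = intr_disable();	/* close the IPI window */
	load_cr3(pmap->pm_cr3 | pcid_bits);	/* hardware state... */
	PCPU_SET(curpmap, pmap);		/* ...and bookkeeping together */
	if (!invpcid_works)
		intr_restore(rflags);

When INVPCID is available a different IPI handler is used, per the comment above, so the interrupt bracket is skipped on that hardware.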
diff --git a/sys/amd64/conf/NOTES b/sys/amd64/conf/NOTES
index 4107ca6..bf10762 100644
--- a/sys/amd64/conf/NOTES
+++ b/sys/amd64/conf/NOTES
@@ -599,6 +599,9 @@ options ENABLE_ALART # Control alarm on Intel intpm driver
#
options NKPT=31
+# EFI Runtime Services support (not functional yet).
+options EFIRT
+
#####################################################################
# ABI Emulation
diff --git a/sys/amd64/include/cpufunc.h b/sys/amd64/include/cpufunc.h
index f234873..4b7df46 100644
--- a/sys/amd64/include/cpufunc.h
+++ b/sys/amd64/include/cpufunc.h
@@ -645,12 +645,36 @@ load_gs(u_short sel)
#endif
static __inline void
+bare_lgdt(struct region_descriptor *addr)
+{
+ __asm __volatile("lgdt (%0)" : : "r" (addr));
+}
+
+static __inline void
+sgdt(struct region_descriptor *addr)
+{
+ char *loc;
+
+ loc = (char *)addr;
+ __asm __volatile("sgdt %0" : "=m" (*loc) : : "memory");
+}
+
+static __inline void
lidt(struct region_descriptor *addr)
{
__asm __volatile("lidt (%0)" : : "r" (addr));
}
static __inline void
+sidt(struct region_descriptor *addr)
+{
+ char *loc;
+
+ loc = (char *)addr;
+ __asm __volatile("sidt %0" : "=m" (*loc) : : "memory");
+}
+
+static __inline void
lldt(u_short sel)
{
__asm __volatile("lldt %0" : : "r" (sel));
@@ -662,6 +686,15 @@ ltr(u_short sel)
__asm __volatile("ltr %0" : : "r" (sel));
}
+static __inline uint32_t
+read_tr(void)
+{
+ u_short sel;
+
+ __asm __volatile("str %0" : "=r" (sel));
+ return (sel);
+}
+
static __inline uint64_t
rdr0(void)
{
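
These additions complete the set of descriptor-table accessors: lidt gets a store counterpart (sidt), sgdt and a bare lgdt appear, and the task register becomes readable via str. A plausible save/restore use, sketched under the assumption that a caller must preserve these registers across code it does not control (firmware, for instance):

	struct region_descriptor gdt, idt;

	sgdt(&gdt);		/* stash the current GDTR */
	sidt(&idt);		/* stash the current IDTR */
	/* ... run code that may reload the descriptor tables ... */
	bare_lgdt(&gdt);	/* put the GDTR back */
	lidt(&idt);		/* put the IDTR back */

The "bare" in bare_lgdt() presumably signals that it only executes the lgdt instruction, leaving the segment registers for the caller to refresh if needed.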
diff --git a/sys/amd64/include/efi.h b/sys/amd64/include/efi.h
new file mode 100644
index 0000000..272d5a8
--- /dev/null
+++ b/sys/amd64/include/efi.h
@@ -0,0 +1,59 @@
+/*-
+ * Copyright (c) 2016 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef __AMD64_INCLUDE_EFI_H_
+#define __AMD64_INCLUDE_EFI_H_
+
+/*
+ * XXX: from gcc 6.2 manual:
+ * Note, the ms_abi attribute for Microsoft Windows 64-bit targets
+ * currently requires the -maccumulate-outgoing-args option.
+ */
+#define EFIABI_ATTR __attribute__((ms_abi))
+
+#ifdef _KERNEL
+struct uuid;
+struct efi_tm;
+
+int efi_get_table(struct uuid *uuid, void **ptr);
+int efi_get_time(struct efi_tm *tm);
+int efi_get_time_locked(struct efi_tm *tm);
+int efi_reset_system(void);
+int efi_set_time(struct efi_tm *tm);
+int efi_set_time_locked(struct efi_tm *tm);
+int efi_var_get(uint16_t *name, struct uuid *vendor, uint32_t *attrib,
+ size_t *datasize, void *data);
+int efi_var_nextname(size_t *namesize, uint16_t *name, struct uuid *vendor);
+int efi_var_set(uint16_t *name, struct uuid *vendor, uint32_t attrib,
+ size_t datasize, void *data);
+#endif
+
+#endif /* __AMD64_INCLUDE_EFI_H_ */
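
EFIABI_ATTR exists because EFI runtime services follow the Microsoft x64 calling convention, while the kernel is built for the System V ABI; calling firmware through an unannotated pointer would pass arguments in the wrong registers. A sketch of how a runtime-service function pointer carries the attribute (the struct and field here are illustrative; the real struct efi_rt is declared in sys/efi.h):

	struct efi_rt_sketch {
		efi_status	(*rt_gettime)(struct efi_tm *,
				    void *) EFIABI_ATTR;
	};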
diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h
index 90546f5..4d924bd 100644
--- a/sys/amd64/include/pmap.h
+++ b/sys/amd64/include/pmap.h
@@ -391,6 +391,7 @@ struct thread;
void pmap_activate_sw(struct thread *);
void pmap_bootstrap(vm_paddr_t *);
+int pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde);
int pmap_change_attr(vm_offset_t, vm_size_t, int);
void pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate);
void pmap_init_pat(void);
@@ -403,6 +404,7 @@ void *pmap_mapdev(vm_paddr_t, vm_size_t);
void *pmap_mapdev_attr(vm_paddr_t, vm_size_t, int);
boolean_t pmap_page_is_mapped(vm_page_t m);
void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma);
+void pmap_pinit_pml4(vm_page_t);
void pmap_unmapdev(vm_offset_t, vm_size_t);
void pmap_invalidate_page(pmap_t, vm_offset_t);
void pmap_invalidate_range(pmap_t, vm_offset_t, vm_offset_t);
@@ -416,6 +418,35 @@ boolean_t pmap_map_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t);
void pmap_unmap_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t);
#endif /* _KERNEL */
+/* Return various clipped indexes for a given VA */
+static __inline vm_pindex_t
+pmap_pte_index(vm_offset_t va)
+{
+
+ return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
+}
+
+static __inline vm_pindex_t
+pmap_pde_index(vm_offset_t va)
+{
+
+ return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
+}
+
+static __inline vm_pindex_t
+pmap_pdpe_index(vm_offset_t va)
+{
+
+ return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
+}
+
+static __inline vm_pindex_t
+pmap_pml4e_index(vm_offset_t va)
+{
+
+ return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
+}
+
#endif /* !LOCORE */
#endif /* !_MACHINE_PMAP_H_ */
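
With the clipped-index helpers now in the public header, efirt.c and minidump_machdep.c can share them instead of open-coding the shifts. Each helper extracts one 9-bit slice of the virtual address (the *PGSHIFT constants are all 9 on amd64). A worked example, assuming the usual amd64 shift values PAGE_SHIFT=12, PDRSHIFT=21, PDPSHIFT=30, and PML4SHIFT=39:

	vm_offset_t va = 0xffffffff80201000ul;

	pmap_pml4e_index(va);	/* (va >> 39) & 511 == 511 */
	pmap_pdpe_index(va);	/* (va >> 30) & 511 == 510 */
	pmap_pde_index(va);	/* (va >> 21) & 511 == 1   */
	pmap_pte_index(va);	/* (va >> 12) & 511 == 1   */

That is, the address resolves through PML4 slot 511 and PDP slot 510, then the second page-directory entry and the second 4KB page within it.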
diff --git a/sys/amd64/vmm/io/iommu.c b/sys/amd64/vmm/io/iommu.c
index 9cfc4c2..75cf1ec 100644
--- a/sys/amd64/vmm/io/iommu.c
+++ b/sys/amd64/vmm/io/iommu.c
@@ -38,6 +38,7 @@ __FBSDID("$FreeBSD$");
#include <dev/pci/pcivar.h>
#include <dev/pci/pcireg.h>
+#include <machine/cpu.h>
#include <machine/md_var.h>
#include "vmm_util.h"
@@ -51,8 +52,13 @@ static int iommu_avail;
SYSCTL_INT(_hw_vmm_iommu, OID_AUTO, initialized, CTLFLAG_RD, &iommu_avail,
0, "bhyve iommu initialized?");
+static int iommu_enable = 1;
+SYSCTL_INT(_hw_vmm_iommu, OID_AUTO, enable, CTLFLAG_RDTUN, &iommu_enable, 0,
+ "Enable use of I/O MMU (required for PCI passthrough).");
+
static struct iommu_ops *ops;
static void *host_domain;
+static eventhandler_tag add_tag, delete_tag;
static __inline int
IOMMU_INIT(void)
@@ -148,14 +154,31 @@ IOMMU_DISABLE(void)
(*ops->disable)();
}
-void
+static void
+iommu_pci_add(void *arg, device_t dev)
+{
+
+ /* Add new devices to the host domain. */
+ iommu_add_device(host_domain, pci_get_rid(dev));
+}
+
+static void
+iommu_pci_delete(void *arg, device_t dev)
+{
+
+ iommu_remove_device(host_domain, pci_get_rid(dev));
+}
+
+static void
iommu_init(void)
{
int error, bus, slot, func;
vm_paddr_t maxaddr;
- const char *name;
device_t dev;
+ if (!iommu_enable)
+ return;
+
if (vmm_is_intel())
ops = &iommu_ops_intel;
else if (vmm_is_amd())
@@ -174,8 +197,13 @@ iommu_init(void)
*/
maxaddr = vmm_mem_maxaddr();
host_domain = IOMMU_CREATE_DOMAIN(maxaddr);
- if (host_domain == NULL)
- panic("iommu_init: unable to create a host domain");
+ if (host_domain == NULL) {
+ printf("iommu_init: unable to create a host domain");
+ IOMMU_CLEANUP();
+ ops = NULL;
+ iommu_avail = 0;
+ return;
+ }
/*
* Create 1:1 mappings from '0' to 'maxaddr' for devices assigned to
@@ -183,6 +211,9 @@ iommu_init(void)
*/
iommu_create_mapping(host_domain, 0, 0, maxaddr);
+ add_tag = EVENTHANDLER_REGISTER(pci_add_device, iommu_pci_add, NULL, 0);
+ delete_tag = EVENTHANDLER_REGISTER(pci_delete_device, iommu_pci_delete,
+ NULL, 0);
for (bus = 0; bus <= PCI_BUSMAX; bus++) {
for (slot = 0; slot <= PCI_SLOTMAX; slot++) {
for (func = 0; func <= PCI_FUNCMAX; func++) {
@@ -190,12 +221,7 @@ iommu_init(void)
if (dev == NULL)
continue;
- /* skip passthrough devices */
- name = device_get_name(dev);
- if (name != NULL && strcmp(name, "ppt") == 0)
- continue;
-
- /* everything else belongs to the host domain */
+ /* Everything belongs to the host domain. */
iommu_add_device(host_domain,
pci_get_rid(dev));
}
@@ -208,6 +234,15 @@ iommu_init(void)
void
iommu_cleanup(void)
{
+
+ if (add_tag != NULL) {
+ EVENTHANDLER_DEREGISTER(pci_add_device, add_tag);
+ add_tag = NULL;
+ }
+ if (delete_tag != NULL) {
+ EVENTHANDLER_DEREGISTER(pci_delete_device, delete_tag);
+ delete_tag = NULL;
+ }
IOMMU_DISABLE();
IOMMU_DESTROY_DOMAIN(host_domain);
IOMMU_CLEANUP();
@@ -216,7 +251,16 @@ iommu_cleanup(void)
void *
iommu_create_domain(vm_paddr_t maxaddr)
{
-
+ static volatile int iommu_initted;
+
+ if (iommu_initted < 2) {
+ if (atomic_cmpset_int(&iommu_initted, 0, 1)) {
+ iommu_init();
+ atomic_store_rel_int(&iommu_initted, 2);
+ } else
+ while (iommu_initted == 1)
+ cpu_spinwait();
+ }
return (IOMMU_CREATE_DOMAIN(maxaddr));
}
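
iommu_create_domain() now performs the one-time IOMMU setup lazily, on first use, rather than at module load. The three-valued flag is a double-checked initialization: 0 means untouched, 1 means a thread is initializing, 2 means done. The same pattern in isolation (do_one_time_setup() is a hypothetical stand-in for iommu_init()):

	static volatile int initted;	/* 0: no, 1: in progress, 2: done */

	static void
	ensure_initialized(void)
	{
		if (initted < 2) {
			if (atomic_cmpset_int(&initted, 0, 1)) {
				/* This thread won the race; do the work. */
				do_one_time_setup();
				atomic_store_rel_int(&initted, 2);
			} else {
				/* Another thread is working; wait it out. */
				while (initted == 1)
					cpu_spinwait();
			}
		}
	}

The release store publishes the setup's side effects before the flag becomes 2, so a waiter that observes 2 also observes an initialized IOMMU.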
diff --git a/sys/amd64/vmm/io/iommu.h b/sys/amd64/vmm/io/iommu.h
index 36b44fa..a941c77 100644
--- a/sys/amd64/vmm/io/iommu.h
+++ b/sys/amd64/vmm/io/iommu.h
@@ -61,7 +61,6 @@ struct iommu_ops {
extern struct iommu_ops iommu_ops_intel;
extern struct iommu_ops iommu_ops_amd;
-void iommu_init(void);
void iommu_cleanup(void);
void *iommu_host_domain(void);
void *iommu_create_domain(vm_paddr_t maxaddr);
diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c
index 692190a..4c9ff47 100644
--- a/sys/amd64/vmm/io/ppt.c
+++ b/sys/amd64/vmm/io/ppt.c
@@ -362,7 +362,13 @@ ppt_assign_device(struct vm *vm, int bus, int slot, int func)
if (ppt->vm != NULL && ppt->vm != vm)
return (EBUSY);
+ pci_save_state(ppt->dev);
+ pcie_flr(ppt->dev,
+ max(pcie_get_max_completion_timeout(ppt->dev) / 1000, 10),
+ true);
+ pci_restore_state(ppt->dev);
ppt->vm = vm;
+ iommu_remove_device(iommu_host_domain(), pci_get_rid(ppt->dev));
iommu_add_device(vm_iommu_domain(vm), pci_get_rid(ppt->dev));
return (0);
}
@@ -381,10 +387,17 @@ ppt_unassign_device(struct vm *vm, int bus, int slot, int func)
*/
if (ppt->vm != vm)
return (EBUSY);
+
+ pci_save_state(ppt->dev);
+ pcie_flr(ppt->dev,
+ max(pcie_get_max_completion_timeout(ppt->dev) / 1000, 10),
+ true);
+ pci_restore_state(ppt->dev);
ppt_unmap_mmio(vm, ppt);
ppt_teardown_msi(ppt);
ppt_teardown_msix(ppt);
iommu_remove_device(vm_iommu_domain(vm), pci_get_rid(ppt->dev));
+ iommu_add_device(iommu_host_domain(), pci_get_rid(ppt->dev));
ppt->vm = NULL;
return (0);
}
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index ebd6360..537454a 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -224,11 +224,6 @@ SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN,
&trace_guest_exceptions, 0,
"Trap into hypervisor on all guest exceptions and reflect them back");
-static int vmm_force_iommu = 0;
-TUNABLE_INT("hw.vmm.force_iommu", &vmm_force_iommu);
-SYSCTL_INT(_hw_vmm, OID_AUTO, force_iommu, CTLFLAG_RDTUN, &vmm_force_iommu, 0,
- "Force use of I/O MMU even if no passthrough devices were found.");
-
static void vm_free_memmap(struct vm *vm, int ident);
static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr);
@@ -358,8 +353,6 @@ vmm_handler(module_t mod, int what, void *arg)
switch (what) {
case MOD_LOAD:
vmmdev_init();
- if (vmm_force_iommu || ppt_avail_devices() > 0)
- iommu_init();
error = vmm_init();
if (error == 0)
vmm_initialized = 1;
@@ -396,9 +389,6 @@ static moduledata_t vmm_kmod = {
/*
* vmm initialization has the following dependencies:
*
- * - iommu initialization must happen after the pci passthru driver has had
- * a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
- *
* - VT-x initialization requires smp_rendezvous() and therefore must happen
* after SMP is fully functional (after SI_SUB_SMP).
*/
@@ -893,6 +883,8 @@ vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
("vm_assign_pptdev: iommu must be NULL"));
maxaddr = sysmem_maxaddr(vm);
vm->iommu = iommu_create_domain(maxaddr);
+ if (vm->iommu == NULL)
+ return (ENXIO);
vm_iommu_map(vm);
}