-rw-r--r--  lib/libvmmapi/vmmapi.c                        25
-rw-r--r--  lib/libvmmapi/vmmapi.h                         4
-rw-r--r--  sys/amd64/amd64/machdep.c                      2
-rw-r--r--  sys/amd64/amd64/pmap.c                      1053
-rw-r--r--  sys/amd64/amd64/trap.c                        11
-rw-r--r--  sys/amd64/include/pmap.h                     115
-rw-r--r--  sys/amd64/include/vmm.h                       43
-rw-r--r--  sys/amd64/include/vmm_dev.h                   12
-rw-r--r--  sys/amd64/include/vmm_instruction_emul.h       4
-rw-r--r--  sys/amd64/vmm/amd/amdv.c                      43
-rw-r--r--  sys/amd64/vmm/intel/ept.c                    304
-rw-r--r--  sys/amd64/vmm/intel/ept.h                     12
-rw-r--r--  sys/amd64/vmm/intel/vmcs.c                     5
-rw-r--r--  sys/amd64/vmm/intel/vmcs.h                    13
-rw-r--r--  sys/amd64/vmm/intel/vmx.c                    175
-rw-r--r--  sys/amd64/vmm/intel/vmx.h                     16
-rw-r--r--  sys/amd64/vmm/intel/vmx_genassym.c             9
-rw-r--r--  sys/amd64/vmm/intel/vmx_support.S             89
-rw-r--r--  sys/amd64/vmm/io/ppt.c                        46
-rw-r--r--  sys/amd64/vmm/io/ppt.h                        14
-rw-r--r--  sys/amd64/vmm/vmm.c                          664
-rw-r--r--  sys/amd64/vmm/vmm_dev.c                       60
-rw-r--r--  sys/amd64/vmm/vmm_instruction_emul.c          28
-rw-r--r--  sys/amd64/vmm/vmm_mem.c                      141
-rw-r--r--  sys/amd64/vmm/vmm_mem.h                       10
-rw-r--r--  usr.sbin/bhyve/bhyverun.c                     28
-rw-r--r--  usr.sbin/bhyve/pci_emul.c                      2
-rw-r--r--  usr.sbin/bhyve/rtc.c                           4
-rw-r--r--  usr.sbin/bhyvectl/bhyvectl.c                  39
-rw-r--r--  usr.sbin/bhyveload/bhyveload.c                 4
30 files changed, 2113 insertions, 862 deletions
diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c
index 95be94a..5559803 100644
--- a/lib/libvmmapi/vmmapi.c
+++ b/lib/libvmmapi/vmmapi.c
@@ -124,7 +124,8 @@ vm_destroy(struct vmctx *vm)
}
int
-vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len)
+vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len,
+ int *wired)
{
int error;
struct vm_memory_segment seg;
@@ -133,6 +134,8 @@ vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len)
seg.gpa = gpa;
error = ioctl(ctx->fd, VM_GET_MEMORY_SEG, &seg);
*ret_len = seg.len;
+ if (wired != NULL)
+ *wired = seg.wired;
return (error);
}
@@ -741,3 +744,23 @@ vcpu_reset(struct vmctx *vmctx, int vcpu)
done:
return (error);
}
+
+int
+vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num)
+{
+ int error, i;
+ struct vm_gpa_pte gpapte;
+
+ bzero(&gpapte, sizeof(gpapte));
+ gpapte.gpa = gpa;
+
+ error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte);
+
+ if (error == 0) {
+ *num = gpapte.ptenum;
+ for (i = 0; i < gpapte.ptenum; i++)
+ pte[i] = gpapte.pte[i];
+ }
+
+ return (error);
+}
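[Editor's note: a minimal usage sketch of the new vm_get_gpa_pmap() call, illustrative only and not part of this commit. It assumes an already-open vmctx and the usual libvmmapi headers; the four-entry array is an assumption based on an at-most-four-level page table walk reported via the 'num' out parameter.]

	static void
	print_gpa_mapping(struct vmctx *ctx, uint64_t gpa)
	{
		uint64_t pte[4];	/* assumption: one entry per level, at most 4 */
		int i, num;

		/* Fetch the nested page table entries that map 'gpa'. */
		if (vm_get_gpa_pmap(ctx, gpa, pte, &num) == 0) {
			for (i = 0; i < num; i++)
				printf("level %d entry: %#lx\n", i, pte[i]);
		}
	}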
diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
index 441e75f..8b53ae0 100644
--- a/lib/libvmmapi/vmmapi.h
+++ b/lib/libvmmapi/vmmapi.h
@@ -45,9 +45,11 @@ enum vm_mmap_style {
int vm_create(const char *name);
struct vmctx *vm_open(const char *name);
void vm_destroy(struct vmctx *ctx);
-int vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len);
+int vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len,
+ int *wired);
int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s);
void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len);
+int vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num);
uint32_t vm_get_lowmem_limit(struct vmctx *ctx);
void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit);
int vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
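[Editor's note: the widened vm_get_memory_seg() prototype adds a 'wired' out parameter; the wrapper above checks for NULL, so callers that do not care about residency can pass NULL. A hedged sketch of the updated call, assuming 'ctx' is in scope:]

	size_t len;
	int wired;

	/* Query the memory segment starting at guest physical address 0. */
	if (vm_get_memory_seg(ctx, 0, &len, &wired) == 0)
		printf("segment length %zu bytes, %s\n", len,
		    wired ? "wired" : "not wired");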
diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
index f3969d3..2b2e47f 100644
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -1574,7 +1574,7 @@ getmemsize(caddr_t kmdp, u_int64_t first)
/*
* map page into kernel: valid, read/write,non-cacheable
*/
- *pte = pa | PG_V | PG_RW | PG_N;
+ *pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
invltlb();
tmp = *(int *)ptr;
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 5cc99a5..74d3d13 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -76,6 +76,8 @@
* SUCH DAMAGE.
*/
+#define AMD64_NPT_AWARE
+
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
@@ -143,6 +145,120 @@ __FBSDID("$FreeBSD$");
#include <machine/smp.h>
#endif
+static __inline boolean_t
+pmap_emulate_ad_bits(pmap_t pmap)
+{
+
+ return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
+}
+
+static __inline pt_entry_t
+pmap_valid_bit(pmap_t pmap)
+{
+ pt_entry_t mask;
+
+ switch (pmap->pm_type) {
+ case PT_X86:
+ mask = X86_PG_V;
+ break;
+ case PT_EPT:
+ if (pmap_emulate_ad_bits(pmap))
+ mask = EPT_PG_EMUL_V;
+ else
+ mask = EPT_PG_READ;
+ break;
+ default:
+ panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
+ }
+
+ return (mask);
+}
+
+static __inline pt_entry_t
+pmap_rw_bit(pmap_t pmap)
+{
+ pt_entry_t mask;
+
+ switch (pmap->pm_type) {
+ case PT_X86:
+ mask = X86_PG_RW;
+ break;
+ case PT_EPT:
+ if (pmap_emulate_ad_bits(pmap))
+ mask = EPT_PG_EMUL_RW;
+ else
+ mask = EPT_PG_WRITE;
+ break;
+ default:
+ panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
+ }
+
+ return (mask);
+}
+
+static __inline pt_entry_t
+pmap_global_bit(pmap_t pmap)
+{
+ pt_entry_t mask;
+
+ switch (pmap->pm_type) {
+ case PT_X86:
+ mask = X86_PG_G;
+ break;
+ case PT_EPT:
+ mask = 0;
+ break;
+ default:
+ panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
+ }
+
+ return (mask);
+}
+
+static __inline pt_entry_t
+pmap_accessed_bit(pmap_t pmap)
+{
+ pt_entry_t mask;
+
+ switch (pmap->pm_type) {
+ case PT_X86:
+ mask = X86_PG_A;
+ break;
+ case PT_EPT:
+ if (pmap_emulate_ad_bits(pmap))
+ mask = EPT_PG_READ;
+ else
+ mask = EPT_PG_A;
+ break;
+ default:
+ panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
+ }
+
+ return (mask);
+}
+
+static __inline pt_entry_t
+pmap_modified_bit(pmap_t pmap)
+{
+ pt_entry_t mask;
+
+ switch (pmap->pm_type) {
+ case PT_X86:
+ mask = X86_PG_M;
+ break;
+ case PT_EPT:
+ if (pmap_emulate_ad_bits(pmap))
+ mask = EPT_PG_WRITE;
+ else
+ mask = EPT_PG_M;
+ break;
+ default:
+ panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
+ }
+
+ return (mask);
+}
+
#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define PMAP_INLINE __attribute__((__gnu_inline__)) inline
@@ -247,6 +363,8 @@ static struct md_page *pv_table;
pt_entry_t *CMAP1 = 0;
caddr_t CADDR1 = 0;
+static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */
+
static struct unrhdr pcid_unr;
static struct mtx pcid_mtx;
int pmap_pcid_enabled = 1;
@@ -306,12 +424,12 @@ static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
-static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
+static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask);
static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
struct rwlock **lockp);
static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
vm_prot_t prot);
-static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
+static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask);
static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
struct spglist *free, struct rwlock **lockp);
static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
@@ -323,7 +441,7 @@ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
vm_page_t m, struct rwlock **lockp);
static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
pd_entry_t newpde);
-static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
+static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);
static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
struct rwlock **lockp);
@@ -416,7 +534,9 @@ static __inline pdp_entry_t *
pmap_pdpe(pmap_t pmap, vm_offset_t va)
{
pml4_entry_t *pml4e;
+ pt_entry_t PG_V;
+ PG_V = pmap_valid_bit(pmap);
pml4e = pmap_pml4e(pmap, va);
if ((*pml4e & PG_V) == 0)
return (NULL);
@@ -438,7 +558,9 @@ static __inline pd_entry_t *
pmap_pde(pmap_t pmap, vm_offset_t va)
{
pdp_entry_t *pdpe;
+ pt_entry_t PG_V;
+ PG_V = pmap_valid_bit(pmap);
pdpe = pmap_pdpe(pmap, va);
if (pdpe == NULL || (*pdpe & PG_V) == 0)
return (NULL);
@@ -460,7 +582,9 @@ static __inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va)
{
pd_entry_t *pde;
+ pt_entry_t PG_V;
+ PG_V = pmap_valid_bit(pmap);
pde = pmap_pde(pmap, va);
if (pde == NULL || (*pde & PG_V) == 0)
return (NULL);
@@ -490,6 +614,8 @@ vtopte(vm_offset_t va)
{
u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
+ KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));
+
return (PTmap + ((va >> PAGE_SHIFT) & mask));
}
@@ -498,6 +624,8 @@ vtopde(vm_offset_t va)
{
u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
+ KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));
+
return (PDmap + ((va >> PDRSHIFT) & mask));
}
@@ -601,22 +729,24 @@ create_pagetables(vm_paddr_t *firstaddr)
/* XXX not fully used, underneath 2M pages */
pt_p = (pt_entry_t *)KPTphys;
for (i = 0; ptoa(i) < *firstaddr; i++)
- pt_p[i] = ptoa(i) | PG_RW | PG_V | PG_G;
+ pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | X86_PG_G;
/* Now map the page tables at their location within PTmap */
pd_p = (pd_entry_t *)KPDphys;
for (i = 0; i < nkpt; i++)
- pd_p[i] = (KPTphys + ptoa(i)) | PG_RW | PG_V;
+ pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
/* Map from zero to end of allocations under 2M pages */
/* This replaces some of the KPTphys entries above */
for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
- pd_p[i] = (i << PDRSHIFT) | PG_RW | PG_V | PG_PS | PG_G;
+ pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS |
+ X86_PG_G;
/* And connect up the PD to the PDP (leaving room for L4 pages) */
pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
for (i = 0; i < nkpdpe; i++)
- pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | PG_RW | PG_V | PG_U;
+ pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V |
+ PG_U;
/*
* Now, set up the direct map region using 2MB and/or 1GB pages. If
@@ -630,36 +760,36 @@ create_pagetables(vm_paddr_t *firstaddr)
for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
/* Preset PG_M and PG_A because demotion expects it. */
- pd_p[j] |= PG_RW | PG_V | PG_PS | PG_G |
- PG_M | PG_A;
+ pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
+ X86_PG_M | X86_PG_A;
}
pdp_p = (pdp_entry_t *)DMPDPphys;
for (i = 0; i < ndm1g; i++) {
pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
/* Preset PG_M and PG_A because demotion expects it. */
- pdp_p[i] |= PG_RW | PG_V | PG_PS | PG_G |
- PG_M | PG_A;
+ pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
+ X86_PG_M | X86_PG_A;
}
for (j = 0; i < ndmpdp; i++, j++) {
pdp_p[i] = DMPDphys + ptoa(j);
- pdp_p[i] |= PG_RW | PG_V | PG_U;
+ pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_U;
}
/* And recursively map PML4 to itself in order to get PTmap */
p4_p = (pml4_entry_t *)KPML4phys;
p4_p[PML4PML4I] = KPML4phys;
- p4_p[PML4PML4I] |= PG_RW | PG_V | PG_U;
+ p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | PG_U;
/* Connect the Direct Map slot(s) up to the PML4. */
for (i = 0; i < ndmpdpphys; i++) {
p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
- p4_p[DMPML4I + i] |= PG_RW | PG_V | PG_U;
+ p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | PG_U;
}
/* Connect the KVA slots up to the PML4 */
for (i = 0; i < NKPML4E; i++) {
p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
- p4_p[KPML4BASE + i] |= PG_RW | PG_V | PG_U;
+ p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V | PG_U;
}
}
@@ -705,6 +835,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */
CPU_ZERO(&kernel_pmap->pm_save);
TAILQ_INIT(&kernel_pmap->pm_pvchunk);
+ kernel_pmap->pm_flags = pmap_flags;
/*
* Initialize the global pv list lock.
@@ -948,35 +1079,131 @@ SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
* Low level helper routines.....
***************************************************/
+static pt_entry_t
+pmap_swap_pat(pmap_t pmap, pt_entry_t entry)
+{
+ int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT;
+
+ switch (pmap->pm_type) {
+ case PT_X86:
+ /* Verify that both PAT bits are not set at the same time */
+ KASSERT((entry & x86_pat_bits) != x86_pat_bits,
+ ("Invalid PAT bits in entry %#lx", entry));
+
+ /* Swap the PAT bits if one of them is set */
+ if ((entry & x86_pat_bits) != 0)
+ entry ^= x86_pat_bits;
+ break;
+ case PT_EPT:
+ /*
+ * Nothing to do - the memory attributes are represented
+ * the same way for regular pages and superpages.
+ */
+ break;
+ default:
+ panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type);
+ }
+
+ return (entry);
+}
+
/*
* Determine the appropriate bits to set in a PTE or PDE for a specified
* caching mode.
*/
static int
-pmap_cache_bits(int mode, boolean_t is_pde)
+pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
{
int cache_bits, pat_flag, pat_idx;
if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
panic("Unknown caching mode %d\n", mode);
- /* The PAT bit is different for PTE's and PDE's. */
- pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
-
- /* Map the caching mode to a PAT index. */
- pat_idx = pat_index[mode];
+ switch (pmap->pm_type) {
+ case PT_X86:
+ /* The PAT bit is different for PTE's and PDE's. */
+ pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;
+
+ /* Map the caching mode to a PAT index. */
+ pat_idx = pat_index[mode];
+
+ /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
+ cache_bits = 0;
+ if (pat_idx & 0x4)
+ cache_bits |= pat_flag;
+ if (pat_idx & 0x2)
+ cache_bits |= PG_NC_PCD;
+ if (pat_idx & 0x1)
+ cache_bits |= PG_NC_PWT;
+ break;
+
+ case PT_EPT:
+ cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode);
+ break;
+
+ default:
+ panic("unsupported pmap type %d", pmap->pm_type);
+ }
- /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
- cache_bits = 0;
- if (pat_idx & 0x4)
- cache_bits |= pat_flag;
- if (pat_idx & 0x2)
- cache_bits |= PG_NC_PCD;
- if (pat_idx & 0x1)
- cache_bits |= PG_NC_PWT;
return (cache_bits);
}
+static int
+pmap_cache_mask(pmap_t pmap, boolean_t is_pde)
+{
+ int mask;
+
+ switch (pmap->pm_type) {
+ case PT_X86:
+ mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE;
+ break;
+ case PT_EPT:
+ mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7);
+ break;
+ default:
+ panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type);
+ }
+
+ return (mask);
+}
+
+static __inline boolean_t
+pmap_ps_enabled(pmap_t pmap)
+{
+
+ return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
+}
+
+static void
+pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde)
+{
+
+ switch (pmap->pm_type) {
+ case PT_X86:
+ break;
+ case PT_EPT:
+ /*
+ * XXX
+ * This is a little bogus since the generation number is
+ * supposed to be bumped up when a region of the address
+ * space is invalidated in the page tables.
+ *
+ * In this case the old PDE entry is valid but yet we want
+ * to make sure that any mappings using the old entry are
+ * invalidated in the TLB.
+ *
+ * The reason this works as expected is because we rendezvous
+ * "all" host cpus and force any vcpu context to exit as a
+ * side-effect.
+ */
+ atomic_add_acq_long(&pmap->pm_eptgen, 1);
+ break;
+ default:
+ panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type);
+ }
+ pde_store(pde, newpde);
+}
+
/*
* After changing the page size for the specified virtual address in the page
* table, flush the corresponding entries from the processor's TLB. Only the
@@ -985,8 +1212,17 @@ pmap_cache_bits(int mode, boolean_t is_pde)
* The calling thread must be pinned to a processor.
*/
static void
-pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
+pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
{
+ pt_entry_t PG_G;
+
+ if (pmap->pm_type == PT_EPT)
+ return;
+
+ KASSERT(pmap->pm_type == PT_X86,
+ ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type));
+
+ PG_G = pmap_global_bit(pmap);
if ((newpde & PG_PS) == 0)
/* Demotion: flush a specific 2MB page mapping. */
@@ -1048,12 +1284,61 @@ pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va)
* immutable. The kernel page table is always active on every
* processor.
*/
+
+/*
+ * Interrupt the cpus that are executing in the guest context.
+ * This will force the vcpu to exit and the cached EPT mappings
+ * will be invalidated by the host before the next vmresume.
+ */
+static __inline void
+pmap_invalidate_ept(pmap_t pmap)
+{
+
+ sched_pin();
+ KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
+ ("pmap_invalidate_ept: absurd pm_active"));
+
+ /*
+ * The TLB mappings associated with a vcpu context are not
+ * flushed each time a different vcpu is chosen to execute.
+ *
+ * This is in contrast with a process's vtop mappings that
+ * are flushed from the TLB on each context switch.
+ *
+ * Therefore we need to do more than just a TLB shootdown on
+ * the active cpus in 'pmap->pm_active'. To do this we keep
+ * track of the number of invalidations performed on this pmap.
+ *
+ * Each vcpu keeps a cache of this counter and compares it
+ * just before a vmresume. If the counter is out-of-date an
+ * invept will be done to flush stale mappings from the TLB.
+ */
+ atomic_add_acq_long(&pmap->pm_eptgen, 1);
+
+ /*
+ * Force the vcpu to exit and trap back into the hypervisor.
+ *
+ * XXX this is not optimal because IPI_AST builds a trapframe
+ * whereas all we need is an 'eoi' followed by 'iret'.
+ */
+ ipi_selected(pmap->pm_active, IPI_AST);
+ sched_unpin();
+}
+
void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
cpuset_t other_cpus;
u_int cpuid;
+ if (pmap->pm_type == PT_EPT) {
+ pmap_invalidate_ept(pmap);
+ return;
+ }
+
+ KASSERT(pmap->pm_type == PT_X86,
+ ("pmap_invalidate_page: invalid type %d", pmap->pm_type));
+
sched_pin();
if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
if (!pmap_pcid_enabled) {
@@ -1124,6 +1409,14 @@ pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
vm_offset_t addr;
u_int cpuid;
+ if (pmap->pm_type == PT_EPT) {
+ pmap_invalidate_ept(pmap);
+ return;
+ }
+
+ KASSERT(pmap->pm_type == PT_X86,
+ ("pmap_invalidate_range: invalid type %d", pmap->pm_type));
+
sched_pin();
if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
if (!pmap_pcid_enabled) {
@@ -1175,6 +1468,14 @@ pmap_invalidate_all(pmap_t pmap)
uint64_t cr3;
u_int cpuid;
+ if (pmap->pm_type == PT_EPT) {
+ pmap_invalidate_ept(pmap);
+ return;
+ }
+
+ KASSERT(pmap->pm_type == PT_X86,
+ ("pmap_invalidate_all: invalid type %d", pmap->pm_type));
+
sched_pin();
cpuid = PCPU_GET(cpuid);
if (pmap == kernel_pmap ||
@@ -1243,6 +1544,7 @@ pmap_invalidate_cache(void)
struct pde_action {
cpuset_t invalidate; /* processors that invalidate their TLB */
+ pmap_t pmap;
vm_offset_t va;
pd_entry_t *pde;
pd_entry_t newpde;
@@ -1255,7 +1557,7 @@ pmap_update_pde_action(void *arg)
struct pde_action *act = arg;
if (act->store == PCPU_GET(cpuid))
- pde_store(act->pde, act->newpde);
+ pmap_update_pde_store(act->pmap, act->pde, act->newpde);
}
static void
@@ -1264,7 +1566,7 @@ pmap_update_pde_teardown(void *arg)
struct pde_action *act = arg;
if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
- pmap_update_pde_invalidate(act->va, act->newpde);
+ pmap_update_pde_invalidate(act->pmap, act->va, act->newpde);
}
/*
@@ -1286,7 +1588,7 @@ pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
cpuid = PCPU_GET(cpuid);
other_cpus = all_cpus;
CPU_CLR(cpuid, &other_cpus);
- if (pmap == kernel_pmap)
+ if (pmap == kernel_pmap || pmap->pm_type == PT_EPT)
active = all_cpus;
else {
active = pmap->pm_active;
@@ -1296,6 +1598,7 @@ pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
act.store = cpuid;
act.invalidate = active;
act.va = va;
+ act.pmap = pmap;
act.pde = pde;
act.newpde = newpde;
CPU_SET(cpuid, &active);
@@ -1303,9 +1606,9 @@ pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
smp_no_rendevous_barrier, pmap_update_pde_action,
pmap_update_pde_teardown, &act);
} else {
- pde_store(pde, newpde);
+ pmap_update_pde_store(pmap, pde, newpde);
if (CPU_ISSET(cpuid, &active))
- pmap_update_pde_invalidate(va, newpde);
+ pmap_update_pde_invalidate(pmap, va, newpde);
}
sched_unpin();
}
@@ -1318,8 +1621,17 @@ PMAP_INLINE void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
- if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
- invlpg(va);
+ switch (pmap->pm_type) {
+ case PT_X86:
+ if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
+ invlpg(va);
+ break;
+ case PT_EPT:
+ pmap->pm_eptgen++;
+ break;
+ default:
+ panic("pmap_invalidate_page: unknown type: %d", pmap->pm_type);
+ }
}
PMAP_INLINE void
@@ -1327,17 +1639,35 @@ pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
vm_offset_t addr;
- if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
- for (addr = sva; addr < eva; addr += PAGE_SIZE)
- invlpg(addr);
+ switch (pmap->pm_type) {
+ case PT_X86:
+ if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
+ for (addr = sva; addr < eva; addr += PAGE_SIZE)
+ invlpg(addr);
+ break;
+ case PT_EPT:
+ pmap->pm_eptgen++;
+ break;
+ default:
+ panic("pmap_invalidate_range: unknown type: %d", pmap->pm_type);
+ }
}
PMAP_INLINE void
pmap_invalidate_all(pmap_t pmap)
{
- if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
- invltlb();
+ switch (pmap->pm_type) {
+ case PT_X86:
+ if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
+ invltlb();
+ break;
+ case PT_EPT:
+ pmap->pm_eptgen++;
+ break;
+ default:
+ panic("pmap_invalidate_all: unknown type %d", pmap->pm_type);
+ }
}
PMAP_INLINE void
@@ -1351,9 +1681,9 @@ static void
pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
{
- pde_store(pde, newpde);
+ pmap_update_pde_store(pmap, pde, newpde);
if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
- pmap_update_pde_invalidate(va, newpde);
+ pmap_update_pde_invalidate(pmap, va, newpde);
else
CPU_ZERO(&pmap->pm_save);
}
@@ -1455,10 +1785,11 @@ pmap_extract(pmap_t pmap, vm_offset_t va)
{
pdp_entry_t *pdpe;
pd_entry_t *pde;
- pt_entry_t *pte;
+ pt_entry_t *pte, PG_V;
vm_paddr_t pa;
pa = 0;
+ PG_V = pmap_valid_bit(pmap);
PMAP_LOCK(pmap);
pdpe = pmap_pdpe(pmap, va);
if (pdpe != NULL && (*pdpe & PG_V) != 0) {
@@ -1493,12 +1824,14 @@ vm_page_t
pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
pd_entry_t pde, *pdep;
- pt_entry_t pte;
+ pt_entry_t pte, PG_RW, PG_V;
vm_paddr_t pa;
vm_page_t m;
pa = 0;
m = NULL;
+ PG_RW = pmap_rw_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
PMAP_LOCK(pmap);
retry:
pdep = pmap_pde(pmap, va);
@@ -1571,16 +1904,18 @@ pmap_kenter(vm_offset_t va, vm_paddr_t pa)
pt_entry_t *pte;
pte = vtopte(va);
- pte_store(pte, pa | PG_RW | PG_V | PG_G);
+ pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G);
}
static __inline void
pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
{
pt_entry_t *pte;
+ int cache_bits;
pte = vtopte(va);
- pte_store(pte, pa | PG_RW | PG_V | PG_G | pmap_cache_bits(mode, 0));
+ cache_bits = pmap_cache_bits(kernel_pmap, mode, 0);
+ pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G | cache_bits);
}
/*
@@ -1629,20 +1964,22 @@ pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
{
pt_entry_t *endpte, oldpte, pa, *pte;
vm_page_t m;
+ int cache_bits;
oldpte = 0;
pte = vtopte(sva);
endpte = pte + count;
while (pte < endpte) {
m = *ma++;
- pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
- if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) {
+ cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0);
+ pa = VM_PAGE_TO_PHYS(m) | cache_bits;
+ if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) {
oldpte |= *pte;
- pte_store(pte, pa | PG_G | PG_RW | PG_V);
+ pte_store(pte, pa | X86_PG_G | X86_PG_RW | X86_PG_V);
}
pte++;
}
- if (__predict_false((oldpte & PG_V) != 0))
+ if (__predict_false((oldpte & X86_PG_V) != 0))
pmap_invalidate_range(kernel_pmap, sva, sva + count *
PAGE_SIZE);
}
@@ -1841,6 +2178,7 @@ pmap_pinit0(pmap_t pmap)
TAILQ_INIT(&pmap->pm_pvchunk);
bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
pmap->pm_pcid = pmap_pcid_enabled ? 0 : -1;
+ pmap->pm_flags = pmap_flags;
}
/*
@@ -1848,9 +2186,10 @@ pmap_pinit0(pmap_t pmap)
* such as one in a vmspace structure.
*/
int
-pmap_pinit(pmap_t pmap)
+pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
{
vm_page_t pml4pg;
+ vm_paddr_t pml4phys;
int i;
/*
@@ -1860,41 +2199,61 @@ pmap_pinit(pmap_t pmap)
VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
VM_WAIT;
- pmap->pm_cr3 = VM_PAGE_TO_PHYS(pml4pg);
- pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pmap->pm_cr3);
+ pml4phys = VM_PAGE_TO_PHYS(pml4pg);
+ pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys);
+ pmap->pm_pcid = -1;
+ pmap->pm_cr3 = ~0; /* initialize to an invalid value */
if ((pml4pg->flags & PG_ZERO) == 0)
pagezero(pmap->pm_pml4);
- /* Wire in kernel global address entries. */
- for (i = 0; i < NKPML4E; i++) {
- pmap->pm_pml4[KPML4BASE + i] = (KPDPphys + (i << PAGE_SHIFT)) |
- PG_RW | PG_V | PG_U;
- }
- for (i = 0; i < ndmpdpphys; i++) {
- pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + (i << PAGE_SHIFT)) |
- PG_RW | PG_V | PG_U;
- }
+ /*
+ * Do not install the host kernel mappings in the nested page
+ * tables. These mappings are meaningless in the guest physical
+ * address space.
+ */
+ if ((pmap->pm_type = pm_type) == PT_X86) {
+ pmap->pm_cr3 = pml4phys;
- /* install self-referential address mapping entry(s) */
- pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M;
+ /* Wire in kernel global address entries. */
+ for (i = 0; i < NKPML4E; i++) {
+ pmap->pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) |
+ X86_PG_RW | X86_PG_V | PG_U;
+ }
+ for (i = 0; i < ndmpdpphys; i++) {
+ pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) |
+ X86_PG_RW | X86_PG_V | PG_U;
+ }
+
+ /* install self-referential address mapping entry(s) */
+ pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) |
+ X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
+
+ if (pmap_pcid_enabled) {
+ pmap->pm_pcid = alloc_unr(&pcid_unr);
+ if (pmap->pm_pcid != -1)
+ pmap->pm_cr3 |= pmap->pm_pcid;
+ }
+ }
pmap->pm_root.rt_root = 0;
CPU_ZERO(&pmap->pm_active);
TAILQ_INIT(&pmap->pm_pvchunk);
bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
- if (pmap_pcid_enabled) {
- pmap->pm_pcid = alloc_unr(&pcid_unr);
- if (pmap->pm_pcid != -1)
- pmap->pm_cr3 |= pmap->pm_pcid;
- } else {
- pmap->pm_pcid = -1;
- }
+ pmap->pm_flags = flags;
+ pmap->pm_eptgen = 0;
CPU_ZERO(&pmap->pm_save);
return (1);
}
+int
+pmap_pinit(pmap_t pmap)
+{
+
+ return (pmap_pinit_type(pmap, PT_X86, pmap_flags));
+}
+
/*
* This routine is called if the desired page table page does not exist.
*
@@ -1910,9 +2269,15 @@ static vm_page_t
_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
{
vm_page_t m, pdppg, pdpg;
+ pt_entry_t PG_A, PG_M, PG_RW, PG_V;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ PG_A = pmap_accessed_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+
/*
* Allocate a page table page.
*/
@@ -2040,9 +2405,11 @@ static vm_page_t
pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
{
vm_pindex_t pdpindex, ptepindex;
- pdp_entry_t *pdpe;
+ pdp_entry_t *pdpe, PG_V;
vm_page_t pdpg;
+ PG_V = pmap_valid_bit(pmap);
+
retry:
pdpe = pmap_pdpe(pmap, va);
if (pdpe != NULL && (*pdpe & PG_V) != 0) {
@@ -2064,9 +2431,11 @@ static vm_page_t
pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
{
vm_pindex_t ptepindex;
- pd_entry_t *pd;
+ pd_entry_t *pd, PG_V;
vm_page_t m;
+ PG_V = pmap_valid_bit(pmap);
+
/*
* Calculate pagetable page index
*/
@@ -2140,7 +2509,7 @@ pmap_release(pmap_t pmap)
pmap_invalidate_all(pmap);
}
- m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME);
+ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4));
for (i = 0; i < NKPML4E; i++) /* KVA */
pmap->pm_pml4[KPML4BASE + i] = 0;
@@ -2211,7 +2580,7 @@ pmap_growkernel(vm_offset_t addr)
addr = kernel_map->max_offset;
while (kernel_vm_end < addr) {
pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end);
- if ((*pdpe & PG_V) == 0) {
+ if ((*pdpe & X86_PG_V) == 0) {
/* We need a new PDP entry */
nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT,
VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
@@ -2221,12 +2590,12 @@ pmap_growkernel(vm_offset_t addr)
if ((nkpg->flags & PG_ZERO) == 0)
pmap_zero_page(nkpg);
paddr = VM_PAGE_TO_PHYS(nkpg);
- *pdpe = (pdp_entry_t)
- (paddr | PG_V | PG_RW | PG_A | PG_M);
+ *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW |
+ X86_PG_A | X86_PG_M);
continue; /* try again */
}
pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end);
- if ((*pde & PG_V) != 0) {
+ if ((*pde & X86_PG_V) != 0) {
kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
if (kernel_vm_end - 1 >= kernel_map->max_offset) {
kernel_vm_end = kernel_map->max_offset;
@@ -2243,7 +2612,7 @@ pmap_growkernel(vm_offset_t addr)
if ((nkpg->flags & PG_ZERO) == 0)
pmap_zero_page(nkpg);
paddr = VM_PAGE_TO_PHYS(nkpg);
- newpdir = (pd_entry_t) (paddr | PG_V | PG_RW | PG_A | PG_M);
+ newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
pde_store(pde, newpdir);
kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
@@ -2323,13 +2692,14 @@ reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
pd_entry_t *pde;
pmap_t pmap;
pt_entry_t *pte, tpte;
+ pt_entry_t PG_G, PG_A, PG_M, PG_RW;
pv_entry_t pv;
vm_offset_t va;
vm_page_t m, m_pc;
struct spglist free;
uint64_t inuse;
int bit, field, freed;
-
+
rw_assert(&pvh_global_lock, RA_LOCKED);
PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
@@ -2359,6 +2729,10 @@ reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
mtx_lock(&pv_chunks_mutex);
continue;
}
+ PG_G = pmap_global_bit(pmap);
+ PG_A = pmap_accessed_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
}
/*
@@ -2892,9 +3266,18 @@ pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
{
pd_entry_t newpde, oldpde;
pt_entry_t *firstpte, newpte;
+ pt_entry_t PG_A, PG_G, PG_M, PG_RW, PG_V;
vm_paddr_t mptepa;
vm_page_t mpte;
struct spglist free;
+ int PG_PTE_CACHE;
+
+ PG_G = pmap_global_bit(pmap);
+ PG_A = pmap_accessed_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
oldpde = *pde;
@@ -2944,8 +3327,7 @@ pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
("pmap_demote_pde: oldpde is missing PG_M"));
newpte = oldpde & ~PG_PS;
- if ((newpte & PG_PDE_PAT) != 0)
- newpte ^= PG_PDE_PAT | PG_PTE_PAT;
+ newpte = pmap_swap_pat(pmap, newpte);
/*
* If the page table page is new, initialize it.
@@ -3016,6 +3398,7 @@ pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
vm_paddr_t mptepa;
vm_page_t mpte;
+ KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
mpte = pmap_lookup_pt_page(pmap, va);
if (mpte == NULL)
@@ -3023,7 +3406,7 @@ pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
pmap_remove_pt_page(pmap, mpte);
mptepa = VM_PAGE_TO_PHYS(mpte);
- newpde = mptepa | PG_M | PG_A | PG_RW | PG_V;
+ newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V;
/*
* Initialize the page table page.
@@ -3055,6 +3438,12 @@ pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
pd_entry_t oldpde;
vm_offset_t eva, va;
vm_page_t m, mpte;
+ pt_entry_t PG_G, PG_A, PG_M, PG_RW;
+
+ PG_G = pmap_global_bit(pmap);
+ PG_A = pmap_accessed_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
KASSERT((sva & PDRMASK) == 0,
@@ -3111,9 +3500,13 @@ pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
{
struct md_page *pvh;
- pt_entry_t oldpte;
+ pt_entry_t oldpte, PG_A, PG_M, PG_RW;
vm_page_t m;
+ PG_A = pmap_accessed_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
oldpte = pte_load_clear(ptq);
if (oldpte & PG_W)
@@ -3145,8 +3538,9 @@ pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
struct spglist *free)
{
struct rwlock *lock;
- pt_entry_t *pte;
+ pt_entry_t *pte, PG_V;
+ PG_V = pmap_valid_bit(pmap);
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
if ((*pde & PG_V) == 0)
return;
@@ -3174,10 +3568,13 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
pml4_entry_t *pml4e;
pdp_entry_t *pdpe;
pd_entry_t ptpaddr, *pde;
- pt_entry_t *pte;
+ pt_entry_t *pte, PG_G, PG_V;
struct spglist free;
int anyvalid;
+ PG_G = pmap_global_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+
/*
* Perform an unsynchronized read. This is, however, safe.
*/
@@ -3326,7 +3723,7 @@ pmap_remove_all(vm_page_t m)
struct md_page *pvh;
pv_entry_t pv;
pmap_t pmap;
- pt_entry_t *pte, tpte;
+ pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW;
pd_entry_t *pde;
vm_offset_t va;
struct spglist free;
@@ -3350,6 +3747,9 @@ small_mappings:
while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
pmap = PV_PMAP(pv);
PMAP_LOCK(pmap);
+ PG_A = pmap_accessed_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
pmap_resident_count_dec(pmap, 1);
pde = pmap_pde(pmap, pv->pv_va);
KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
@@ -3388,6 +3788,11 @@ pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
vm_offset_t eva, va;
vm_page_t m;
boolean_t anychanged;
+ pt_entry_t PG_G, PG_M, PG_RW;
+
+ PG_G = pmap_global_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
KASSERT((sva & PDRMASK) == 0,
@@ -3428,7 +3833,7 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
pml4_entry_t *pml4e;
pdp_entry_t *pdpe;
pd_entry_t ptpaddr, *pde;
- pt_entry_t *pte;
+ pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V;
boolean_t anychanged, pv_lists_locked;
if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
@@ -3440,6 +3845,10 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
(VM_PROT_WRITE|VM_PROT_EXECUTE))
return;
+ PG_G = pmap_global_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
pv_lists_locked = FALSE;
resume:
anychanged = FALSE;
@@ -3568,8 +3977,17 @@ pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
{
pd_entry_t newpde;
pt_entry_t *firstpte, oldpte, pa, *pte;
+ pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V;
vm_offset_t oldpteva;
vm_page_t mpte;
+ int PG_PTE_CACHE;
+
+ PG_A = pmap_accessed_bit(pmap);
+ PG_G = pmap_global_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+ PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
@@ -3662,8 +4080,7 @@ setpte:
/*
* Propagate the PAT index to its proper position.
*/
- if ((newpde & PG_PTE_PAT) != 0)
- newpde ^= PG_PDE_PAT | PG_PTE_PAT;
+ newpde = pmap_swap_pat(pmap, newpde);
/*
* Map the superpage.
@@ -3696,12 +4113,18 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
{
struct rwlock *lock;
pd_entry_t *pde;
- pt_entry_t *pte;
+ pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V;
pt_entry_t newpte, origpte;
pv_entry_t pv;
vm_paddr_t opa, pa;
vm_page_t mpte, om;
+ PG_A = pmap_accessed_bit(pmap);
+ PG_G = pmap_global_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+
va = trunc_page(va);
KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
@@ -3728,7 +4151,17 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
newpte |= PG_U;
if (pmap == kernel_pmap)
newpte |= PG_G;
- newpte |= pmap_cache_bits(m->md.pat_mode, 0);
+ newpte |= pmap_cache_bits(pmap, m->md.pat_mode, 0);
+
+ /*
+ * Set modified bit gratuitously for writeable mappings if
+ * the page is unmanaged. We do not want to take a fault
+ * to do the dirty bit accounting for these mappings.
+ */
+ if ((m->oflags & VPO_UNMANAGED) != 0) {
+ if ((newpte & PG_RW) != 0)
+ newpte |= PG_M;
+ }
mpte = NULL;
@@ -3877,7 +4310,8 @@ unchanged:
* populated, then attempt promotion.
*/
if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
- pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 &&
+ pmap_ps_enabled(pmap) &&
+ (m->flags & PG_FICTITIOUS) == 0 &&
vm_reserv_level_iffullpop(m) == 0)
pmap_promote_pde(pmap, pde, va, &lock);
@@ -3898,11 +4332,14 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
struct rwlock **lockp)
{
pd_entry_t *pde, newpde;
+ pt_entry_t PG_V;
vm_page_t mpde;
struct spglist free;
+ PG_V = pmap_valid_bit(pmap);
rw_assert(&pvh_global_lock, RA_LOCKED);
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+
if ((mpde = pmap_allocpde(pmap, va, NULL)) == NULL) {
CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
" in pmap %p", va, pmap);
@@ -3918,7 +4355,7 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
" in pmap %p", va, pmap);
return (FALSE);
}
- newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) |
+ newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) |
PG_PS | PG_V;
if ((m->oflags & VPO_UNMANAGED) == 0) {
newpde |= PG_MANAGED;
@@ -3992,7 +4429,8 @@ pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
va = start + ptoa(diff);
if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
(VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 &&
- pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 &&
+ pmap_ps_enabled(pmap) &&
+ vm_reserv_level_iffullpop(m) == 0 &&
pmap_enter_pde(pmap, va, m, prot, &lock))
m = &m[NBPDR / PAGE_SIZE - 1];
else
@@ -4035,12 +4473,13 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
{
struct spglist free;
- pt_entry_t *pte;
+ pt_entry_t *pte, PG_V;
vm_paddr_t pa;
KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
(m->oflags & VPO_UNMANAGED) != 0,
("pmap_enter_quick_locked: managed mapping within the clean submap"));
+ PG_V = pmap_valid_bit(pmap);
rw_assert(&pvh_global_lock, RA_LOCKED);
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
@@ -4120,7 +4559,7 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
*/
pmap_resident_count_inc(pmap, 1);
- pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
+ pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 0);
if ((prot & VM_PROT_EXECUTE) == 0)
pa |= pg_nx;
@@ -4159,14 +4598,22 @@ pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
vm_pindex_t pindex, vm_size_t size)
{
pd_entry_t *pde;
+ pt_entry_t PG_A, PG_M, PG_RW, PG_V;
vm_paddr_t pa, ptepa;
vm_page_t p, pdpg;
int pat_mode;
+ PG_A = pmap_accessed_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+
VM_OBJECT_ASSERT_WLOCKED(object);
KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
("pmap_object_init_pt: non-device object"));
if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
+ if (!pmap_ps_enabled(pmap))
+ return;
if (!vm_object_populate(object, pindex, pindex + atop(size)))
return;
p = vm_page_lookup(object, pindex);
@@ -4204,8 +4651,8 @@ pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
* will not affect the termination of this loop.
*/
PMAP_LOCK(pmap);
- for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa +
- size; pa += NBPDR) {
+ for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1);
+ pa < ptepa + size; pa += NBPDR) {
pdpg = pmap_allocpde(pmap, addr, NULL);
if (pdpg == NULL) {
/*
@@ -4307,10 +4754,25 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
vm_offset_t addr;
vm_offset_t end_addr = src_addr + len;
vm_offset_t va_next;
+ pt_entry_t PG_A, PG_M, PG_V;
if (dst_addr != src_addr)
return;
+ if (dst_pmap->pm_type != src_pmap->pm_type)
+ return;
+
+ /*
+ * EPT page table entries that require emulation of A/D bits are
+ * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although
+ * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit
+ * (aka EPT_PG_EXECUTE) could still be set. Since some EPT
+ * implementations flag an EPT misconfiguration for exec-only
+ * mappings we skip this function entirely for emulated pmaps.
+ */
+ if (pmap_emulate_ad_bits(dst_pmap))
+ return;
+
lock = NULL;
rw_rlock(&pvh_global_lock);
if (dst_pmap < src_pmap) {
@@ -4320,6 +4782,11 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
PMAP_LOCK(src_pmap);
PMAP_LOCK(dst_pmap);
}
+
+ PG_A = pmap_accessed_bit(dst_pmap);
+ PG_M = pmap_modified_bit(dst_pmap);
+ PG_V = pmap_valid_bit(dst_pmap);
+
for (addr = src_addr; addr < end_addr; addr = va_next) {
pt_entry_t *src_pte, *dst_pte;
vm_page_t dstmpde, dstmpte, srcmpte;
@@ -4673,6 +5140,7 @@ pmap_remove_pages(pmap_t pmap)
{
pd_entry_t ptepde;
pt_entry_t *pte, tpte;
+ pt_entry_t PG_M, PG_RW, PG_V;
struct spglist free;
vm_page_t m, mpte, mt;
pv_entry_t pv;
@@ -4689,7 +5157,12 @@ pmap_remove_pages(pmap_t pmap)
printf("warning: pmap_remove_pages called with non-current pmap\n");
return;
}
+
lock = NULL;
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+
SLIST_INIT(&free);
rw_rlock(&pvh_global_lock);
PMAP_LOCK(pmap);
@@ -4830,12 +5303,13 @@ pmap_remove_pages(pmap_t pmap)
}
static boolean_t
-pmap_page_test_mappings(vm_page_t m, pt_entry_t mask)
+pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
{
struct rwlock *lock;
pv_entry_t pv;
struct md_page *pvh;
- pt_entry_t *pte;
+ pt_entry_t *pte, mask;
+ pt_entry_t PG_A, PG_M, PG_RW, PG_V;
pmap_t pmap;
int md_gen, pvh_gen;
boolean_t rv;
@@ -4858,6 +5332,17 @@ restart:
}
}
pte = pmap_pte(pmap, pv->pv_va);
+ mask = 0;
+ if (modified) {
+ PG_M = pmap_modified_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+ mask |= PG_RW | PG_M;
+ }
+ if (accessed) {
+ PG_A = pmap_accessed_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ mask |= PG_V | PG_A;
+ }
rv = (*pte & mask) == mask;
PMAP_UNLOCK(pmap);
if (rv)
@@ -4880,6 +5365,17 @@ restart:
}
}
pte = pmap_pde(pmap, pv->pv_va);
+ mask = 0;
+ if (modified) {
+ PG_M = pmap_modified_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+ mask |= PG_RW | PG_M;
+ }
+ if (accessed) {
+ PG_A = pmap_accessed_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ mask |= PG_V | PG_A;
+ }
rv = (*pte & mask) == mask;
PMAP_UNLOCK(pmap);
if (rv)
@@ -4913,22 +5409,23 @@ pmap_is_modified(vm_page_t m)
VM_OBJECT_ASSERT_WLOCKED(m->object);
if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
return (FALSE);
- return (pmap_page_test_mappings(m, PG_M | PG_RW));
+ return (pmap_page_test_mappings(m, FALSE, TRUE));
}
/*
* pmap_is_prefaultable:
*
- * Return whether or not the specified virtual address is elgible
+ * Return whether or not the specified virtual address is eligible
* for prefault.
*/
boolean_t
pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
{
pd_entry_t *pde;
- pt_entry_t *pte;
+ pt_entry_t *pte, PG_V;
boolean_t rv;
+ PG_V = pmap_valid_bit(pmap);
rv = FALSE;
PMAP_LOCK(pmap);
pde = pmap_pde(pmap, addr);
@@ -4952,7 +5449,7 @@ pmap_is_referenced(vm_page_t m)
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("pmap_is_referenced: page %p is not managed", m));
- return (pmap_page_test_mappings(m, PG_A | PG_V));
+ return (pmap_page_test_mappings(m, TRUE, FALSE));
}
/*
@@ -4966,7 +5463,7 @@ pmap_remove_write(vm_page_t m)
struct rwlock *lock;
pv_entry_t next_pv, pv;
pd_entry_t *pde;
- pt_entry_t oldpte, *pte;
+ pt_entry_t oldpte, *pte, PG_M, PG_RW;
vm_offset_t va;
int pvh_gen, md_gen;
@@ -5001,6 +5498,7 @@ retry_pv_loop:
goto retry_pv_loop;
}
}
+ PG_RW = pmap_rw_bit(pmap);
va = pv->pv_va;
pde = pmap_pde(pmap, va);
if ((*pde & PG_RW) != 0)
@@ -5026,6 +5524,8 @@ small_mappings:
goto retry_pv_loop;
}
}
+ PG_M = pmap_modified_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
pde = pmap_pde(pmap, pv->pv_va);
KASSERT((*pde & PG_PS) == 0,
("pmap_remove_write: found a 2mpage in page %p's pv list",
@@ -5048,6 +5548,33 @@ retry:
rw_runlock(&pvh_global_lock);
}
+static __inline boolean_t
+safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
+{
+
+ if (!pmap_emulate_ad_bits(pmap))
+ return (TRUE);
+
+ KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type));
+
+ /*
+ * RWX = 010 or 110 will cause an unconditional EPT misconfiguration
+ * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared
+ * if the EPT_PG_WRITE bit is set.
+ */
+ if ((pte & EPT_PG_WRITE) != 0)
+ return (FALSE);
+
+ /*
+ * RWX = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set.
+ */
+ if ((pte & EPT_PG_EXECUTE) == 0 ||
+ ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0))
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
#define PMAP_TS_REFERENCED_MAX 5
/*
@@ -5069,13 +5596,17 @@ pmap_ts_referenced(vm_page_t m)
pv_entry_t pv, pvf;
pmap_t pmap;
struct rwlock *lock;
- pd_entry_t *pde;
- pt_entry_t *pte;
+ pd_entry_t oldpde, *pde;
+ pt_entry_t *pte, PG_A;
+ vm_offset_t va;
vm_paddr_t pa;
int cleared, md_gen, not_cleared, pvh_gen;
+ struct spglist free;
+ boolean_t demoted;
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("pmap_ts_referenced: page %p is not managed", m));
+ SLIST_INIT(&free);
cleared = 0;
pa = VM_PAGE_TO_PHYS(m);
lock = PHYS_TO_PV_LIST_LOCK(pa);
@@ -5089,6 +5620,8 @@ retry:
goto small_mappings;
pv = pvf;
do {
+ if (pvf == NULL)
+ pvf = pv;
pmap = PV_PMAP(pv);
if (!PMAP_TRYLOCK(pmap)) {
pvh_gen = pvh->pv_gen;
@@ -5100,7 +5633,10 @@ retry:
goto retry;
}
}
+ PG_A = pmap_accessed_bit(pmap);
+ va = pv->pv_va;
pde = pmap_pde(pmap, pv->pv_va);
+ oldpde = *pde;
if ((*pde & PG_A) != 0) {
/*
* Since this reference bit is shared by 512 4KB
@@ -5123,15 +5659,50 @@ retry:
if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
(uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
(*pde & PG_W) == 0) {
- atomic_clear_long(pde, PG_A);
- pmap_invalidate_page(pmap, pv->pv_va);
+ if (safe_to_clear_referenced(pmap, oldpde)) {
+ atomic_clear_long(pde, PG_A);
+ pmap_invalidate_page(pmap, pv->pv_va);
+ demoted = FALSE;
+ } else if (pmap_demote_pde_locked(pmap, pde,
+ pv->pv_va, &lock)) {
+ /*
+ * Remove the mapping to a single page
+ * so that a subsequent access may
+ * repromote. Since the underlying
+ * page table page is fully populated,
+ * this removal never frees a page
+ * table page.
+ */
+ demoted = TRUE;
+ va += VM_PAGE_TO_PHYS(m) - (oldpde &
+ PG_PS_FRAME);
+ pte = pmap_pde_to_pte(pde, va);
+ pmap_remove_pte(pmap, pte, va, *pde,
+ NULL, &lock);
+ pmap_invalidate_page(pmap, va);
+ } else
+ demoted = TRUE;
+
+ if (demoted) {
+ /*
+ * The superpage mapping was removed
+ * entirely and therefore 'pv' is no
+ * longer valid.
+ */
+ if (pvf == pv)
+ pvf = NULL;
+ pv = NULL;
+ }
cleared++;
+ KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
+ ("inconsistent pv lock %p %p for page %p",
+ lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
} else
not_cleared++;
}
PMAP_UNLOCK(pmap);
/* Rotate the PV list if it has more than one entry. */
- if (TAILQ_NEXT(pv, pv_next) != NULL) {
+ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
pvh->pv_gen++;
@@ -5144,6 +5715,8 @@ small_mappings:
goto out;
pv = pvf;
do {
+ if (pvf == NULL)
+ pvf = pv;
pmap = PV_PMAP(pv);
if (!PMAP_TRYLOCK(pmap)) {
pvh_gen = pvh->pv_gen;
@@ -5156,19 +5729,40 @@ small_mappings:
goto retry;
}
}
+ PG_A = pmap_accessed_bit(pmap);
pde = pmap_pde(pmap, pv->pv_va);
KASSERT((*pde & PG_PS) == 0,
("pmap_ts_referenced: found a 2mpage in page %p's pv list",
m));
pte = pmap_pde_to_pte(pde, pv->pv_va);
if ((*pte & PG_A) != 0) {
- atomic_clear_long(pte, PG_A);
- pmap_invalidate_page(pmap, pv->pv_va);
- cleared++;
+ if (safe_to_clear_referenced(pmap, *pte)) {
+ atomic_clear_long(pte, PG_A);
+ pmap_invalidate_page(pmap, pv->pv_va);
+ cleared++;
+ } else if ((*pte & PG_W) == 0) {
+ /*
+ * Wired pages cannot be paged out so
+ * doing accessed bit emulation for
+ * them is wasted effort. We do the
+ * hard work for unwired pages only.
+ */
+ pmap_remove_pte(pmap, pte, pv->pv_va,
+ *pde, &free, &lock);
+ pmap_invalidate_page(pmap, pv->pv_va);
+ cleared++;
+ if (pvf == pv)
+ pvf = NULL;
+ pv = NULL;
+ KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
+ ("inconsistent pv lock %p %p for page %p",
+ lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
+ } else
+ not_cleared++;
}
PMAP_UNLOCK(pmap);
/* Rotate the PV list if it has more than one entry. */
- if (TAILQ_NEXT(pv, pv_next) != NULL) {
+ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
m->md.pv_gen++;
@@ -5178,6 +5772,7 @@ small_mappings:
out:
rw_wunlock(lock);
rw_runlock(&pvh_global_lock);
+ pmap_free_zero_pages(&free);
return (cleared + not_cleared);
}
@@ -5193,13 +5788,29 @@ pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
pml4_entry_t *pml4e;
pdp_entry_t *pdpe;
pd_entry_t oldpde, *pde;
- pt_entry_t *pte;
+ pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V;
vm_offset_t va_next;
vm_page_t m;
boolean_t anychanged, pv_lists_locked;
if (advice != MADV_DONTNEED && advice != MADV_FREE)
return;
+
+ /*
+ * A/D bit emulation requires an alternate code path when clearing
+ * the modified and accessed bits below. Since this function is
+ * advisory in nature we skip it entirely for pmaps that require
+ * A/D bit emulation.
+ */
+ if (pmap_emulate_ad_bits(pmap))
+ return;
+
+ PG_A = pmap_accessed_bit(pmap);
+ PG_G = pmap_global_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+
pv_lists_locked = FALSE;
resume:
anychanged = FALSE;
@@ -5313,7 +5924,7 @@ pmap_clear_modify(vm_page_t m)
pmap_t pmap;
pv_entry_t next_pv, pv;
pd_entry_t oldpde, *pde;
- pt_entry_t oldpte, *pte;
+ pt_entry_t oldpte, *pte, PG_M, PG_RW, PG_V;
struct rwlock *lock;
vm_offset_t va;
int md_gen, pvh_gen;
@@ -5350,6 +5961,9 @@ restart:
goto restart;
}
}
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
va = pv->pv_va;
pde = pmap_pde(pmap, va);
oldpde = *pde;
@@ -5392,6 +6006,8 @@ small_mappings:
goto restart;
}
}
+ PG_M = pmap_modified_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
pde = pmap_pde(pmap, pv->pv_va);
KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
" a 2mpage in page %p's pv list", m));
@@ -5412,7 +6028,7 @@ small_mappings:
/* Adjust the cache mode for a 4KB page mapped via a PTE. */
static __inline void
-pmap_pte_attr(pt_entry_t *pte, int cache_bits)
+pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask)
{
u_int opte, npte;
@@ -5422,14 +6038,14 @@ pmap_pte_attr(pt_entry_t *pte, int cache_bits)
*/
do {
opte = *(u_int *)pte;
- npte = opte & ~PG_PTE_CACHE;
+ npte = opte & ~mask;
npte |= cache_bits;
} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
}
/* Adjust the cache mode for a 2MB page mapped via a PDE. */
static __inline void
-pmap_pde_attr(pd_entry_t *pde, int cache_bits)
+pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask)
{
u_int opde, npde;
@@ -5439,7 +6055,7 @@ pmap_pde_attr(pd_entry_t *pde, int cache_bits)
*/
do {
opde = *(u_int *)pde;
- npde = opde & ~PG_PDE_CACHE;
+ npde = opde & ~mask;
npde |= cache_bits;
} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
}
@@ -5514,9 +6130,15 @@ pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va)
{
pdp_entry_t newpdpe, oldpdpe;
pd_entry_t *firstpde, newpde, *pde;
+ pt_entry_t PG_A, PG_M, PG_RW, PG_V;
vm_paddr_t mpdepa;
vm_page_t mpde;
+ PG_A = pmap_accessed_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
oldpdpe = *pdpe;
KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V),
@@ -5633,8 +6255,8 @@ pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
if (base < DMAP_MIN_ADDRESS)
return (EINVAL);
- cache_bits_pde = pmap_cache_bits(mode, 1);
- cache_bits_pte = pmap_cache_bits(mode, 0);
+ cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1);
+ cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0);
changed = FALSE;
/*
@@ -5651,7 +6273,7 @@ pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
* memory type, then we need not demote this page. Just
* increment tmpva to the next 1GB page frame.
*/
- if ((*pdpe & PG_PDE_CACHE) == cache_bits_pde) {
+ if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) {
tmpva = trunc_1gpage(tmpva) + NBPDP;
continue;
}
@@ -5678,7 +6300,7 @@ pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
* memory type, then we need not demote this page. Just
* increment tmpva to the next 2MB page frame.
*/
- if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
+ if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) {
tmpva = trunc_2mpage(tmpva) + NBPDR;
continue;
}
@@ -5711,8 +6333,9 @@ pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
for (tmpva = base; tmpva < base + size; ) {
pdpe = pmap_pdpe(kernel_pmap, tmpva);
if (*pdpe & PG_PS) {
- if ((*pdpe & PG_PDE_CACHE) != cache_bits_pde) {
- pmap_pde_attr(pdpe, cache_bits_pde);
+ if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) {
+ pmap_pde_attr(pdpe, cache_bits_pde,
+ X86_PG_PDE_CACHE);
changed = TRUE;
}
if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
@@ -5739,8 +6362,9 @@ pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
}
pde = pmap_pdpe_to_pde(pdpe, tmpva);
if (*pde & PG_PS) {
- if ((*pde & PG_PDE_CACHE) != cache_bits_pde) {
- pmap_pde_attr(pde, cache_bits_pde);
+ if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) {
+ pmap_pde_attr(pde, cache_bits_pde,
+ X86_PG_PDE_CACHE);
changed = TRUE;
}
if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
@@ -5765,8 +6389,9 @@ pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
tmpva = trunc_2mpage(tmpva) + NBPDR;
} else {
pte = pmap_pde_to_pte(pde, tmpva);
- if ((*pte & PG_PTE_CACHE) != cache_bits_pte) {
- pmap_pte_attr(pte, cache_bits_pte);
+ if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) {
+ pmap_pte_attr(pte, cache_bits_pte,
+ X86_PG_PTE_CACHE);
changed = TRUE;
}
if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
@@ -5831,7 +6456,7 @@ pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate)
changed = FALSE;
PMAP_LOCK(kernel_pmap);
pdpe = pmap_pdpe(kernel_pmap, va);
- if ((*pdpe & PG_V) == 0)
+ if ((*pdpe & X86_PG_V) == 0)
panic("pmap_demote_DMAP: invalid PDPE");
if ((*pdpe & PG_PS) != 0) {
if (!pmap_demote_pdpe(kernel_pmap, pdpe, va))
@@ -5840,7 +6465,7 @@ pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate)
}
if (len < NBPDR) {
pde = pmap_pdpe_to_pde(pdpe, va);
- if ((*pde & PG_V) == 0)
+ if ((*pde & X86_PG_V) == 0)
panic("pmap_demote_DMAP: invalid PDE");
if ((*pde & PG_PS) != 0) {
if (!pmap_demote_pde(kernel_pmap, pde, va))
@@ -5861,10 +6486,15 @@ int
pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
{
pd_entry_t *pdep;
- pt_entry_t pte;
+ pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V;
vm_paddr_t pa;
int val;
+ PG_A = pmap_accessed_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+
PMAP_LOCK(pmap);
retry:
pdep = pmap_pde(pmap, addr);
@@ -5958,6 +6588,154 @@ pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
}
+#ifdef INVARIANTS
+static unsigned long num_dirty_emulations;
+SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW,
+ &num_dirty_emulations, 0, NULL);
+
+static unsigned long num_accessed_emulations;
+SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW,
+ &num_accessed_emulations, 0, NULL);
+
+static unsigned long num_superpage_accessed_emulations;
+SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW,
+ &num_superpage_accessed_emulations, 0, NULL);
+
+static unsigned long ad_emulation_superpage_promotions;
+SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW,
+ &ad_emulation_superpage_promotions, 0, NULL);
+#endif /* INVARIANTS */
+
+int
+pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype)
+{
+ int rv;
+ struct rwlock *lock;
+ vm_page_t m, mpte;
+ pd_entry_t *pde;
+ pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V;
+ boolean_t pv_lists_locked;
+
+ KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE,
+ ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype));
+
+ if (!pmap_emulate_ad_bits(pmap))
+ return (-1);
+
+ PG_A = pmap_accessed_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+
+ rv = -1;
+ lock = NULL;
+ pv_lists_locked = FALSE;
+retry:
+ PMAP_LOCK(pmap);
+
+ pde = pmap_pde(pmap, va);
+ if (pde == NULL || (*pde & PG_V) == 0)
+ goto done;
+
+ if ((*pde & PG_PS) != 0) {
+ if (ftype == VM_PROT_READ) {
+#ifdef INVARIANTS
+ atomic_add_long(&num_superpage_accessed_emulations, 1);
+#endif
+ *pde |= PG_A;
+ rv = 0;
+ }
+ goto done;
+ }
+
+ pte = pmap_pde_to_pte(pde, va);
+ if ((*pte & PG_V) == 0)
+ goto done;
+
+ if (ftype == VM_PROT_WRITE) {
+ if ((*pte & PG_RW) == 0)
+ goto done;
+ *pte |= PG_M;
+ }
+ *pte |= PG_A;
+
+ /* try to promote the mapping */
+ if (va < VM_MAXUSER_ADDRESS)
+ mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
+ else
+ mpte = NULL;
+
+ m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
+
+ if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
+ pmap_ps_enabled(pmap) &&
+ (m->flags & PG_FICTITIOUS) == 0 &&
+ vm_reserv_level_iffullpop(m) == 0) {
+ if (!pv_lists_locked) {
+ pv_lists_locked = TRUE;
+ if (!rw_try_rlock(&pvh_global_lock)) {
+ PMAP_UNLOCK(pmap);
+ rw_rlock(&pvh_global_lock);
+ goto retry;
+ }
+ }
+ pmap_promote_pde(pmap, pde, va, &lock);
+#ifdef INVARIANTS
+ atomic_add_long(&ad_emulation_superpage_promotions, 1);
+#endif
+ }
+#ifdef INVARIANTS
+ if (ftype == VM_PROT_WRITE)
+ atomic_add_long(&num_dirty_emulations, 1);
+ else
+ atomic_add_long(&num_accessed_emulations, 1);
+#endif
+ rv = 0; /* success */
+done:
+ if (lock != NULL)
+ rw_wunlock(lock);
+ if (pv_lists_locked)
+ rw_runlock(&pvh_global_lock);
+ PMAP_UNLOCK(pmap);
+ return (rv);
+}
+
+void
+pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num)
+{
+ pml4_entry_t *pml4;
+ pdp_entry_t *pdp;
+ pd_entry_t *pde;
+ pt_entry_t *pte, PG_V;
+ int idx;
+
+ idx = 0;
+ PG_V = pmap_valid_bit(pmap);
+ PMAP_LOCK(pmap);
+
+ pml4 = pmap_pml4e(pmap, va);
+ ptr[idx++] = *pml4;
+ if ((*pml4 & PG_V) == 0)
+ goto done;
+
+ pdp = pmap_pml4e_to_pdpe(pml4, va);
+ ptr[idx++] = *pdp;
+ if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0)
+ goto done;
+
+ pde = pmap_pdpe_to_pde(pdp, va);
+ ptr[idx++] = *pde;
+ if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0)
+ goto done;
+
+ pte = pmap_pde_to_pte(pde, va);
+ ptr[idx++] = *pte;
+
+done:
+ PMAP_UNLOCK(pmap);
+ *num = idx;
+}
+
#include "opt_ddb.h"
#ifdef DDB
#include <ddb/ddb.h>
@@ -5968,7 +6746,7 @@ DB_SHOW_COMMAND(pte, pmap_print_pte)
pml4_entry_t *pml4;
pdp_entry_t *pdp;
pd_entry_t *pde;
- pt_entry_t *pte;
+ pt_entry_t *pte, PG_V;
vm_offset_t va;
if (have_addr) {
@@ -5978,6 +6756,7 @@ DB_SHOW_COMMAND(pte, pmap_print_pte)
db_printf("show pte addr\n");
return;
}
+ PG_V = pmap_valid_bit(pmap);
pml4 = pmap_pml4e(pmap, va);
db_printf("VA %#016lx pml4e %#016lx", va, *pml4);
if ((*pml4 & PG_V) == 0) {
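
[Editor's sketch] The pmap_emulate_accessed_dirty() function added above returns 0 when the fault is satisfied purely by setting the accessed/dirty bit, and -1 when the caller still has to resolve the fault some other way. A minimal caller sketch under that contract; handle_nested_fault() and resolve_fault() are hypothetical names used only for illustration and are not part of this change:

    static int
    handle_nested_fault(pmap_t pmap, vm_offset_t gpa, int ftype)
    {

        /* Fast path: the access only needed an A or D bit update. */
        if (pmap_emulate_accessed_dirty(pmap, gpa, ftype) == 0)
            return (0);

        /* Slow path: fall back to a full fault resolution. */
        return (resolve_fault(pmap, gpa, ftype));    /* hypothetical */
    }
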
diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c
index 3eaf3fd..4691014 100644
--- a/sys/amd64/amd64/trap.c
+++ b/sys/amd64/amd64/trap.c
@@ -734,6 +734,14 @@ trap_pfault(frame, usermode)
}
/*
+ * If the trap was caused by errant bits in the PTE then panic.
+ */
+ if (frame->tf_err & PGEX_RSV) {
+ trap_fatal(frame, eva);
+ return (-1);
+ }
+
+ /*
* PGEX_I is defined only if the execute disable bit capability is
* supported and enabled.
*/
@@ -822,10 +830,11 @@ trap_fatal(frame, eva)
#endif
if (type == T_PAGEFLT) {
printf("fault virtual address = 0x%lx\n", eva);
- printf("fault code = %s %s %s, %s\n",
+ printf("fault code = %s %s %s%s, %s\n",
code & PGEX_U ? "user" : "supervisor",
code & PGEX_W ? "write" : "read",
code & PGEX_I ? "instruction" : "data",
+ code & PGEX_RSV ? " rsv" : "",
code & PGEX_P ? "protection violation" : "page not present");
}
printf("instruction pointer = 0x%lx:0x%lx\n",
diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h
index 11e1cf2..01de629 100644
--- a/sys/amd64/include/pmap.h
+++ b/sys/amd64/include/pmap.h
@@ -50,41 +50,74 @@
* of the fields not present here and there, depending on a lot of things.
*/
/* ---- Intel Nomenclature ---- */
-#define PG_V 0x001 /* P Valid */
-#define PG_RW 0x002 /* R/W Read/Write */
-#define PG_U 0x004 /* U/S User/Supervisor */
-#define PG_NC_PWT 0x008 /* PWT Write through */
-#define PG_NC_PCD 0x010 /* PCD Cache disable */
-#define PG_A 0x020 /* A Accessed */
-#define PG_M 0x040 /* D Dirty */
-#define PG_PS 0x080 /* PS Page size (0=4k,1=2M) */
-#define PG_PTE_PAT 0x080 /* PAT PAT index */
-#define PG_G 0x100 /* G Global */
-#define PG_AVAIL1 0x200 /* / Available for system */
-#define PG_AVAIL2 0x400 /* < programmers use */
-#define PG_AVAIL3 0x800 /* \ */
-#define PG_PDE_PAT 0x1000 /* PAT PAT index */
-#define PG_NX (1ul<<63) /* No-execute */
+#define X86_PG_V 0x001 /* P Valid */
+#define X86_PG_RW 0x002 /* R/W Read/Write */
+#define X86_PG_U 0x004 /* U/S User/Supervisor */
+#define X86_PG_NC_PWT 0x008 /* PWT Write through */
+#define X86_PG_NC_PCD 0x010 /* PCD Cache disable */
+#define X86_PG_A 0x020 /* A Accessed */
+#define X86_PG_M 0x040 /* D Dirty */
+#define X86_PG_PS 0x080 /* PS Page size (0=4k,1=2M) */
+#define X86_PG_PTE_PAT 0x080 /* PAT PAT index */
+#define X86_PG_G 0x100 /* G Global */
+#define X86_PG_AVAIL1 0x200 /* / Available for system */
+#define X86_PG_AVAIL2 0x400 /* < programmers use */
+#define X86_PG_AVAIL3 0x800 /* \ */
+#define X86_PG_PDE_PAT 0x1000 /* PAT PAT index */
+#define X86_PG_NX (1ul<<63) /* No-execute */
+#define X86_PG_AVAIL(x) (1ul << (x))
+/* Page level cache control fields used to determine the PAT type */
+#define X86_PG_PDE_CACHE (X86_PG_PDE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD)
+#define X86_PG_PTE_CACHE (X86_PG_PTE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD)
+
+/*
+ * Intel extended page table (EPT) bit definitions.
+ */
+#define EPT_PG_READ 0x001 /* R Read */
+#define EPT_PG_WRITE 0x002 /* W Write */
+#define EPT_PG_EXECUTE 0x004 /* X Execute */
+#define EPT_PG_IGNORE_PAT 0x040 /* IPAT Ignore PAT */
+#define EPT_PG_PS 0x080 /* PS Page size */
+#define EPT_PG_A 0x100 /* A Accessed */
+#define EPT_PG_M 0x200 /* D Dirty */
+#define EPT_PG_MEMORY_TYPE(x) ((x) << 3) /* MT Memory Type */
+
+/*
+ * Define the PG_xx macros in terms of the bits on x86 PTEs.
+ */
+#define PG_V X86_PG_V
+#define PG_RW X86_PG_RW
+#define PG_U X86_PG_U
+#define PG_NC_PWT X86_PG_NC_PWT
+#define PG_NC_PCD X86_PG_NC_PCD
+#define PG_A X86_PG_A
+#define PG_M X86_PG_M
+#define PG_PS X86_PG_PS
+#define PG_PTE_PAT X86_PG_PTE_PAT
+#define PG_G X86_PG_G
+#define PG_AVAIL1 X86_PG_AVAIL1
+#define PG_AVAIL2 X86_PG_AVAIL2
+#define PG_AVAIL3 X86_PG_AVAIL3
+#define PG_PDE_PAT X86_PG_PDE_PAT
+#define PG_NX X86_PG_NX
+#define PG_PDE_CACHE X86_PG_PDE_CACHE
+#define PG_PTE_CACHE X86_PG_PTE_CACHE
/* Our various interpretations of the above */
-#define PG_W PG_AVAIL1 /* "Wired" pseudoflag */
-#define PG_MANAGED PG_AVAIL2
+#define PG_W X86_PG_AVAIL3 /* "Wired" pseudoflag */
+#define PG_MANAGED X86_PG_AVAIL2
+#define EPT_PG_EMUL_V X86_PG_AVAIL(52)
+#define EPT_PG_EMUL_RW X86_PG_AVAIL(53)
#define PG_FRAME (0x000ffffffffff000ul)
#define PG_PS_FRAME (0x000fffffffe00000ul)
-#define PG_PROT (PG_RW|PG_U) /* all protection bits . */
-#define PG_N (PG_NC_PWT|PG_NC_PCD) /* Non-cacheable */
-
-/* Page level cache control fields used to determine the PAT type */
-#define PG_PDE_CACHE (PG_PDE_PAT | PG_NC_PWT | PG_NC_PCD)
-#define PG_PTE_CACHE (PG_PTE_PAT | PG_NC_PWT | PG_NC_PCD)
/*
* Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB
* (PTE) page mappings have identical settings for the following fields:
*/
-#define PG_PTE_PROMOTE (PG_NX | PG_MANAGED | PG_W | PG_G | PG_PTE_PAT | \
- PG_M | PG_A | PG_NC_PCD | PG_NC_PWT | PG_U | PG_RW | PG_V)
+#define PG_PTE_PROMOTE (PG_NX | PG_MANAGED | PG_W | PG_G | PG_PTE_CACHE | \
+ PG_M | PG_A | PG_U | PG_RW | PG_V)
/*
* Page Protection Exception bits
@@ -96,6 +129,28 @@
#define PGEX_RSV 0x08 /* reserved PTE field is non-zero */
#define PGEX_I 0x10 /* during an instruction fetch */
+/*
+ * undef the PG_xx macros that define bits in the regular x86 PTEs that
+ * have a different position in nested PTEs. This is done when compiling
+ * code that needs to be aware of the differences between regular x86 and
+ * nested PTEs.
+ *
+ * The appropriate bitmask will be calculated at runtime based on the pmap
+ * type.
+ */
+#ifdef AMD64_NPT_AWARE
+#undef PG_AVAIL1 /* X86_PG_AVAIL1 aliases with EPT_PG_M */
+#undef PG_G
+#undef PG_A
+#undef PG_M
+#undef PG_PDE_PAT
+#undef PG_PDE_CACHE
+#undef PG_PTE_PAT
+#undef PG_PTE_CACHE
+#undef PG_RW
+#undef PG_V
+#endif
+
/*
* Pte related macros. This is complicated by having to deal with
* the sign extension of the 48th bit.
@@ -256,6 +311,11 @@ struct pmap {
int pm_flags;
};
+/* flags */
+#define PMAP_PDE_SUPERPAGE (1 << 0) /* supports 2MB superpages */
+#define PMAP_EMULATE_AD_BITS (1 << 1) /* needs A/D bits emulation */
+#define PMAP_SUPPORTS_EXEC_ONLY (1 << 2) /* execute only mappings ok */
+
typedef struct pmap *pmap_t;
#ifdef _KERNEL
@@ -272,6 +332,9 @@ extern struct pmap kernel_pmap_store;
#define PMAP_MTX(pmap) (&(pmap)->pm_mtx)
#define PMAP_TRYLOCK(pmap) mtx_trylock(&(pmap)->pm_mtx)
#define PMAP_UNLOCK(pmap) mtx_unlock(&(pmap)->pm_mtx)
+
+int pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags);
+int pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype);
#endif
/*
@@ -330,7 +393,7 @@ void pmap_invalidate_all(pmap_t);
void pmap_invalidate_cache(void);
void pmap_invalidate_cache_pages(vm_page_t *pages, int count);
void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva);
-
+void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num);
#endif /* _KERNEL */
#endif /* !LOCORE */
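
[Editor's sketch] With AMD64_NPT_AWARE defined, PG_V, PG_RW, PG_A and PG_M are no longer compile-time constants; code that must work on both regular x86 pmaps and EPT pmaps fetches the bit positions at runtime, exactly as pmap_mincore() and pmap_emulate_accessed_dirty() do above. A small sketch of that pattern; the pte_is_dirty() helper is illustrative only:

    static boolean_t
    pte_is_dirty(pmap_t pmap, pt_entry_t pte)
    {
        pt_entry_t PG_M, PG_RW, PG_V;

        /* Pick the bit positions that match this pmap's PTE format. */
        PG_V = pmap_valid_bit(pmap);
        PG_RW = pmap_rw_bit(pmap);
        PG_M = pmap_modified_bit(pmap);

        return ((pte & (PG_V | PG_RW | PG_M)) == (PG_V | PG_RW | PG_M));
    }
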
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index b741247..64dbc58 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -39,19 +39,18 @@ struct seg_desc;
struct vm_exit;
struct vm_run;
struct vlapic;
+struct vmspace;
+struct vm_object;
+struct pmap;
enum x2apic_state;
typedef int (*vmm_init_func_t)(void);
typedef int (*vmm_cleanup_func_t)(void);
-typedef void * (*vmi_init_func_t)(struct vm *vm); /* instance specific apis */
-typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip);
+typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap);
+typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip,
+ struct pmap *pmap);
typedef void (*vmi_cleanup_func_t)(void *vmi);
-typedef int (*vmi_mmap_set_func_t)(void *vmi, vm_paddr_t gpa,
- vm_paddr_t hpa, size_t length,
- vm_memattr_t attr, int prot,
- boolean_t superpages_ok);
-typedef vm_paddr_t (*vmi_mmap_get_func_t)(void *vmi, vm_paddr_t gpa);
typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num,
uint64_t *retval);
typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num,
@@ -65,6 +64,8 @@ typedef int (*vmi_inject_event_t)(void *vmi, int vcpu,
uint32_t code, int code_valid);
typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval);
typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val);
+typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max);
+typedef void (*vmi_vmspace_free)(struct vmspace *vmspace);
struct vmm_ops {
vmm_init_func_t init; /* module wide initialization */
@@ -73,8 +74,6 @@ struct vmm_ops {
vmi_init_func_t vminit; /* vm-specific initialization */
vmi_run_func_t vmrun;
vmi_cleanup_func_t vmcleanup;
- vmi_mmap_set_func_t vmmmap_set;
- vmi_mmap_get_func_t vmmmap_get;
vmi_get_register_t vmgetreg;
vmi_set_register_t vmsetreg;
vmi_get_desc_t vmgetdesc;
@@ -82,6 +81,8 @@ struct vmm_ops {
vmi_inject_event_t vminject;
vmi_get_cap_t vmgetcap;
vmi_set_cap_t vmsetcap;
+ vmi_vmspace_alloc vmspace_alloc;
+ vmi_vmspace_free vmspace_free;
};
extern struct vmm_ops vmm_ops_intel;
@@ -93,9 +94,14 @@ const char *vm_name(struct vm *vm);
int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len);
int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len);
-vm_paddr_t vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t size);
+void *vm_gpa_hold(struct vm *, vm_paddr_t gpa, size_t len, int prot,
+ void **cookie);
+void vm_gpa_release(void *cookie);
int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
struct vm_memory_segment *seg);
+int vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
+ vm_offset_t *offset, struct vm_object **object);
+boolean_t vm_mem_allocated(struct vm *vm, vm_paddr_t gpa);
int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval);
int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val);
int vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
@@ -130,8 +136,9 @@ void *vm_iommu_domain(struct vm *vm);
enum vcpu_state {
VCPU_IDLE,
+ VCPU_FROZEN,
VCPU_RUNNING,
- VCPU_CANNOT_RUN,
+ VCPU_SLEEPING,
};
int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state);
@@ -145,7 +152,9 @@ vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu)
void *vcpu_stats(struct vm *vm, int vcpu);
void vm_interrupt_hostcpu(struct vm *vm, int vcpu);
-
+struct vmspace *vm_get_vmspace(struct vm *vm);
+int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func);
+int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func);
#endif /* KERNEL */
#include <machine/vmm_instruction_emul.h>
@@ -247,6 +256,7 @@ enum vm_exitcode {
VM_EXITCODE_MTRAP,
VM_EXITCODE_PAUSE,
VM_EXITCODE_PAGING,
+ VM_EXITCODE_INST_EMUL,
VM_EXITCODE_SPINUP_AP,
VM_EXITCODE_MAX
};
@@ -266,8 +276,15 @@ struct vm_exit {
} inout;
struct {
uint64_t gpa;
- struct vie vie;
+ int fault_type;
+ int protection;
} paging;
+ struct {
+ uint64_t gpa;
+ uint64_t gla;
+ uint64_t cr3;
+ struct vie vie;
+ } inst_emul;
/*
* VMX specific payload. Used when there is no "better"
* exitcode to represent the VM-exit.
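
[Editor's sketch] vm_gpa2hpa() is replaced by a hold/release pair: vm_gpa_hold() returns a host mapping of at most one page of guest memory plus an opaque cookie, and vm_gpa_release() drops the reference. A usage sketch with minimal error handling; the read_guest_u32() name is made up for illustration:

    static int
    read_guest_u32(struct vm *vm, vm_paddr_t gpa, uint32_t *val)
    {
        void *cookie, *ptr;

        /* 'len' may not cross a page boundary (see vm_gpa_hold()). */
        ptr = vm_gpa_hold(vm, gpa, sizeof(*val), VM_PROT_READ, &cookie);
        if (ptr == NULL)
            return (EFAULT);
        *val = *(uint32_t *)ptr;
        vm_gpa_release(cookie);
        return (0);
    }
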
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h
index c3b47c2..bf014cc 100644
--- a/sys/amd64/include/vmm_dev.h
+++ b/sys/amd64/include/vmm_dev.h
@@ -36,7 +36,8 @@ int vmmdev_cleanup(void);
struct vm_memory_segment {
vm_paddr_t gpa; /* in */
- size_t len; /* in */
+ size_t len;
+ int wired;
};
struct vm_register {
@@ -135,6 +136,12 @@ struct vm_x2apic {
enum x2apic_state state;
};
+struct vm_gpa_pte {
+ uint64_t gpa; /* in */
+ uint64_t pte[4]; /* out */
+ int ptenum;
+};
+
enum {
/* general routines */
IOCNUM_ABIVERS = 0,
@@ -145,6 +152,7 @@ enum {
/* memory apis */
IOCNUM_MAP_MEMORY = 10,
IOCNUM_GET_MEMORY_SEG = 11,
+ IOCNUM_GET_GPA_PMAP = 12,
/* register/state accessors */
IOCNUM_SET_REGISTER = 20,
@@ -215,4 +223,6 @@ enum {
_IOW('v', IOCNUM_SET_X2APIC_STATE, struct vm_x2apic)
#define VM_GET_X2APIC_STATE \
_IOWR('v', IOCNUM_GET_X2APIC_STATE, struct vm_x2apic)
+#define VM_GET_GPA_PMAP \
+ _IOWR('v', IOCNUM_GET_GPA_PMAP, struct vm_gpa_pte)
#endif
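
[Editor's sketch] The new VM_GET_GPA_PMAP ioctl exposes the guest page-table walk to userland through struct vm_gpa_pte. A hedged userland sketch; it assumes 'fd' is an open /dev/vmm/<name> descriptor and the print_gpa_ptes() helper is illustrative only:

    #include <sys/types.h>
    #include <sys/ioctl.h>
    #include <machine/vmm_dev.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static int
    print_gpa_ptes(int fd, uint64_t gpa)
    {
        struct vm_gpa_pte gpapte;
        int i;

        memset(&gpapte, 0, sizeof(gpapte));
        gpapte.gpa = gpa;
        if (ioctl(fd, VM_GET_GPA_PMAP, &gpapte) != 0)
            return (-1);

        /* One entry per page-table level that was walked. */
        for (i = 0; i < gpapte.ptenum; i++)
            printf("level %d: pte %#lx\n", i,
                (unsigned long)gpapte.pte[i]);

        return (0);
    }
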
diff --git a/sys/amd64/include/vmm_instruction_emul.h b/sys/amd64/include/vmm_instruction_emul.h
index a812a73..a7480e7 100644
--- a/sys/amd64/include/vmm_instruction_emul.h
+++ b/sys/amd64/include/vmm_instruction_emul.h
@@ -102,11 +102,15 @@ int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie,
#ifdef _KERNEL
/*
* APIs to fetch and decode the instruction from nested page fault handler.
+ *
+ * 'vie' must be initialized before calling 'vmm_fetch_instruction()'
*/
int vmm_fetch_instruction(struct vm *vm, int cpuid,
uint64_t rip, int inst_length, uint64_t cr3,
struct vie *vie);
+void vie_init(struct vie *vie);
+
/*
* Decode the instruction fetched into 'vie' so it can be emulated.
*
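
[Editor's sketch] The new comment makes the calling convention explicit: the vie must be initialized before the fetch, and decoded before emulation. A minimal ordering sketch; the emulate_mmio_access() wrapper is hypothetical, and the mem_region_read_t/mem_region_write_t callback typedefs are assumed to come from this same header:

    static int
    emulate_mmio_access(struct vm *vm, int vcpuid, uint64_t rip,
        int inst_length, uint64_t cr3, uint64_t gla, uint64_t gpa,
        mem_region_read_t mread, mem_region_write_t mwrite, void *arg)
    {
        struct vie vie;
        int error;

        /* Required ordering: init -> fetch -> decode -> emulate. */
        vie_init(&vie);
        error = vmm_fetch_instruction(vm, vcpuid, rip, inst_length,
            cr3, &vie);
        if (error == 0)
            error = vmm_decode_instruction(vm, vcpuid, gla, &vie);
        if (error == 0)
            error = vmm_emulate_instruction(vm, vcpuid, gpa, &vie,
                mread, mwrite, arg);
        return (error);
    }
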
diff --git a/sys/amd64/vmm/amd/amdv.c b/sys/amd64/vmm/amd/amdv.c
index dc071d3..1cc130f 100644
--- a/sys/amd64/vmm/amd/amdv.c
+++ b/sys/amd64/vmm/amd/amdv.c
@@ -54,7 +54,7 @@ amdv_cleanup(void)
}
static void *
-amdv_vminit(struct vm *vm)
+amdv_vminit(struct vm *vm, struct pmap *pmap)
{
printf("amdv_vminit: not implemented\n");
@@ -62,7 +62,7 @@ amdv_vminit(struct vm *vm)
}
static int
-amdv_vmrun(void *arg, int vcpu, register_t rip)
+amdv_vmrun(void *arg, int vcpu, register_t rip, struct pmap *pmap)
{
printf("amdv_vmrun: not implemented\n");
@@ -78,23 +78,6 @@ amdv_vmcleanup(void *arg)
}
static int
-amdv_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
- vm_memattr_t attr, int prot, boolean_t spok)
-{
-
- printf("amdv_vmmmap_set: not implemented\n");
- return (EINVAL);
-}
-
-static vm_paddr_t
-amdv_vmmmap_get(void *arg, vm_paddr_t gpa)
-{
-
- printf("amdv_vmmmap_get: not implemented\n");
- return (EINVAL);
-}
-
-static int
amdv_getreg(void *arg, int vcpu, int regnum, uint64_t *retval)
{
@@ -151,21 +134,37 @@ amdv_setcap(void *arg, int vcpu, int type, int val)
return (EINVAL);
}
+static struct vmspace *
+amdv_vmspace_alloc(vm_offset_t min, vm_offset_t max)
+{
+
+ printf("amdv_vmspace_alloc: not implemented\n");
+ return (NULL);
+}
+
+static void
+amdv_vmspace_free(struct vmspace *vmspace)
+{
+
+ printf("amdv_vmspace_free: not implemented\n");
+ return;
+}
+
struct vmm_ops vmm_ops_amd = {
amdv_init,
amdv_cleanup,
amdv_vminit,
amdv_vmrun,
amdv_vmcleanup,
- amdv_vmmmap_set,
- amdv_vmmmap_get,
amdv_getreg,
amdv_setreg,
amdv_getdesc,
amdv_setdesc,
amdv_inject_event,
amdv_getcap,
- amdv_setcap
+ amdv_setcap,
+ amdv_vmspace_alloc,
+ amdv_vmspace_free,
};
static int
diff --git a/sys/amd64/vmm/intel/ept.c b/sys/amd64/vmm/intel/ept.c
index 1ff1580..18e90f3 100644
--- a/sys/amd64/vmm/intel/ept.c
+++ b/sys/amd64/vmm/intel/ept.c
@@ -29,32 +29,31 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include <sys/param.h>
+#include <sys/kernel.h>
#include <sys/types.h>
-#include <sys/errno.h>
#include <sys/systm.h>
-#include <sys/malloc.h>
#include <sys/smp.h>
+#include <sys/sysctl.h>
#include <vm/vm.h>
#include <vm/pmap.h>
-
-#include <machine/param.h>
-#include <machine/cpufunc.h>
-#include <machine/pmap.h>
-#include <machine/vmparam.h>
+#include <vm/vm_extern.h>
#include <machine/vmm.h>
+
#include "vmx_cpufunc.h"
#include "vmx_msr.h"
-#include "vmx.h"
#include "ept.h"
+#define EPT_SUPPORTS_EXEC_ONLY(cap) ((cap) & (1UL << 0))
#define EPT_PWL4(cap) ((cap) & (1UL << 6))
#define EPT_MEMORY_TYPE_WB(cap) ((cap) & (1UL << 14))
#define EPT_PDE_SUPERPAGE(cap) ((cap) & (1UL << 16)) /* 2MB pages */
#define EPT_PDPTE_SUPERPAGE(cap) ((cap) & (1UL << 17)) /* 1GB pages */
-#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32))
#define INVEPT_SUPPORTED(cap) ((cap) & (1UL << 20))
+#define AD_BITS_SUPPORTED(cap) ((cap) & (1UL << 21))
+#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32))
#define INVVPID_ALL_TYPES_MASK 0xF0000000000UL
#define INVVPID_ALL_TYPES_SUPPORTED(cap) \
@@ -64,28 +63,22 @@ __FBSDID("$FreeBSD$");
#define INVEPT_ALL_TYPES_SUPPORTED(cap) \
(((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK)
-#define EPT_PG_RD (1 << 0)
-#define EPT_PG_WR (1 << 1)
-#define EPT_PG_EX (1 << 2)
-#define EPT_PG_MEMORY_TYPE(x) ((x) << 3)
-#define EPT_PG_IGNORE_PAT (1 << 6)
-#define EPT_PG_SUPERPAGE (1 << 7)
-
-#define EPT_ADDR_MASK ((uint64_t)-1 << 12)
+#define EPT_PWLEVELS 4 /* page walk levels */
+#define EPT_ENABLE_AD_BITS (1 << 6)
-MALLOC_DECLARE(M_VMX);
+SYSCTL_DECL(_hw_vmm);
+SYSCTL_NODE(_hw_vmm, OID_AUTO, ept, CTLFLAG_RW, NULL, NULL);
-static uint64_t page_sizes_mask;
+static int ept_enable_ad_bits;
-/*
- * Set this to 1 to have the EPT tables respect the guest PAT settings
- */
-static int ept_pat_passthru;
+static int ept_pmap_flags;
+SYSCTL_INT(_hw_vmm_ept, OID_AUTO, pmap_flags, CTLFLAG_RD,
+ &ept_pmap_flags, 0, NULL);
int
ept_init(void)
{
- int page_shift;
+ int use_hw_ad_bits, use_superpages, use_exec_only;
uint64_t cap;
cap = rdmsr(MSR_VMX_EPT_VPID_CAP);
@@ -105,17 +98,22 @@ ept_init(void)
!INVEPT_ALL_TYPES_SUPPORTED(cap))
return (EINVAL);
- /* Set bits in 'page_sizes_mask' for each valid page size */
- page_shift = PAGE_SHIFT;
- page_sizes_mask = 1UL << page_shift; /* 4KB page */
+ use_superpages = 1;
+ TUNABLE_INT_FETCH("hw.vmm.ept.use_superpages", &use_superpages);
+ if (use_superpages && EPT_PDE_SUPERPAGE(cap))
+ ept_pmap_flags |= PMAP_PDE_SUPERPAGE; /* 2MB superpage */
- page_shift += 9;
- if (EPT_PDE_SUPERPAGE(cap))
- page_sizes_mask |= 1UL << page_shift; /* 2MB superpage */
+ use_hw_ad_bits = 1;
+ TUNABLE_INT_FETCH("hw.vmm.ept.use_hw_ad_bits", &use_hw_ad_bits);
+ if (use_hw_ad_bits && AD_BITS_SUPPORTED(cap))
+ ept_enable_ad_bits = 1;
+ else
+ ept_pmap_flags |= PMAP_EMULATE_AD_BITS;
- page_shift += 9;
- if (EPT_PDPTE_SUPERPAGE(cap))
- page_sizes_mask |= 1UL << page_shift; /* 1GB superpage */
+ use_exec_only = 1;
+ TUNABLE_INT_FETCH("hw.vmm.ept.use_exec_only", &use_exec_only);
+ if (use_exec_only && EPT_SUPPORTS_EXEC_ONLY(cap))
+ ept_pmap_flags |= PMAP_SUPPORTS_EXEC_ONLY;
return (0);
}
@@ -154,247 +152,53 @@ ept_dump(uint64_t *ptp, int nlevels)
}
#endif
-static size_t
-ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
- vm_memattr_t attr, vm_prot_t prot, boolean_t spok)
-{
- int spshift, ptpshift, ptpindex, nlevels;
-
- /*
- * Compute the size of the mapping that we can accomodate.
- *
- * This is based on three factors:
- * - super page sizes supported by the processor
- * - alignment of the region starting at 'gpa' and 'hpa'
- * - length of the region 'len'
- */
- spshift = PAGE_SHIFT;
- if (spok)
- spshift += (EPT_PWLEVELS - 1) * 9;
- while (spshift >= PAGE_SHIFT) {
- uint64_t spsize = 1UL << spshift;
- if ((page_sizes_mask & spsize) != 0 &&
- (gpa & (spsize - 1)) == 0 &&
- (hpa & (spsize - 1)) == 0 &&
- length >= spsize) {
- break;
- }
- spshift -= 9;
- }
-
- if (spshift < PAGE_SHIFT) {
- panic("Invalid spshift for gpa 0x%016lx, hpa 0x%016lx, "
- "length 0x%016lx, page_sizes_mask 0x%016lx",
- gpa, hpa, length, page_sizes_mask);
- }
-
- nlevels = EPT_PWLEVELS;
- while (--nlevels >= 0) {
- ptpshift = PAGE_SHIFT + nlevels * 9;
- ptpindex = (gpa >> ptpshift) & 0x1FF;
-
- /* We have reached the leaf mapping */
- if (spshift >= ptpshift)
- break;
-
- /*
- * We are working on a non-leaf page table page.
- *
- * Create the next level page table page if necessary and point
- * to it from the current page table.
- */
- if (ptp[ptpindex] == 0) {
- void *nlp = malloc(PAGE_SIZE, M_VMX, M_WAITOK | M_ZERO);
- ptp[ptpindex] = vtophys(nlp);
- ptp[ptpindex] |= EPT_PG_RD | EPT_PG_WR | EPT_PG_EX;
- }
-
- /* Work our way down to the next level page table page */
- ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & EPT_ADDR_MASK);
- }
-
- if ((gpa & ((1UL << ptpshift) - 1)) != 0) {
- panic("ept_create_mapping: gpa 0x%016lx and ptpshift %d "
- "mismatch\n", gpa, ptpshift);
- }
-
- if (prot != VM_PROT_NONE) {
- /* Do the mapping */
- ptp[ptpindex] = hpa;
-
- /* Apply the access controls */
- if (prot & VM_PROT_READ)
- ptp[ptpindex] |= EPT_PG_RD;
- if (prot & VM_PROT_WRITE)
- ptp[ptpindex] |= EPT_PG_WR;
- if (prot & VM_PROT_EXECUTE)
- ptp[ptpindex] |= EPT_PG_EX;
-
- /*
- * By default the PAT type is ignored - this appears to
- * be how other hypervisors handle EPT. Allow this to be
- * overridden.
- */
- ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr);
- if (!ept_pat_passthru)
- ptp[ptpindex] |= EPT_PG_IGNORE_PAT;
-
- if (nlevels > 0)
- ptp[ptpindex] |= EPT_PG_SUPERPAGE;
- } else {
- /* Remove the mapping */
- ptp[ptpindex] = 0;
- }
-
- return (1UL << ptpshift);
-}
-
-static vm_paddr_t
-ept_lookup_mapping(uint64_t *ptp, vm_paddr_t gpa)
-{
- int nlevels, ptpshift, ptpindex;
- uint64_t ptpval, hpabase, pgmask;
-
- nlevels = EPT_PWLEVELS;
- while (--nlevels >= 0) {
- ptpshift = PAGE_SHIFT + nlevels * 9;
- ptpindex = (gpa >> ptpshift) & 0x1FF;
-
- ptpval = ptp[ptpindex];
-
- /* Cannot make progress beyond this point */
- if ((ptpval & (EPT_PG_RD | EPT_PG_WR | EPT_PG_EX)) == 0)
- break;
-
- if (nlevels == 0 || (ptpval & EPT_PG_SUPERPAGE)) {
- pgmask = (1UL << ptpshift) - 1;
- hpabase = ptpval & ~pgmask;
- return (hpabase | (gpa & pgmask));
- }
-
- /* Work our way down to the next level page table page */
- ptp = (uint64_t *)PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK);
- }
-
- return ((vm_paddr_t)-1);
-}
-
static void
-ept_free_pt_entry(pt_entry_t pte)
+invept_single_context(void *arg)
{
- if (pte == 0)
- return;
-
- /* sanity check */
- if ((pte & EPT_PG_SUPERPAGE) != 0)
- panic("ept_free_pt_entry: pte cannot have superpage bit");
+ struct invept_desc desc = *(struct invept_desc *)arg;
- return;
+ invept(INVEPT_TYPE_SINGLE_CONTEXT, desc);
}
-static void
-ept_free_pd_entry(pd_entry_t pde)
+void
+ept_invalidate_mappings(u_long eptp)
{
- pt_entry_t *pt;
- int i;
+ struct invept_desc invept_desc = { 0 };
- if (pde == 0)
- return;
+ invept_desc.eptp = eptp;
- if ((pde & EPT_PG_SUPERPAGE) == 0) {
- pt = (pt_entry_t *)PHYS_TO_DMAP(pde & EPT_ADDR_MASK);
- for (i = 0; i < NPTEPG; i++)
- ept_free_pt_entry(pt[i]);
- free(pt, M_VMX); /* free the page table page */
- }
+ smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc);
}
-static void
-ept_free_pdp_entry(pdp_entry_t pdpe)
+static int
+ept_pinit(pmap_t pmap)
{
- pd_entry_t *pd;
- int i;
- if (pdpe == 0)
- return;
-
- if ((pdpe & EPT_PG_SUPERPAGE) == 0) {
- pd = (pd_entry_t *)PHYS_TO_DMAP(pdpe & EPT_ADDR_MASK);
- for (i = 0; i < NPDEPG; i++)
- ept_free_pd_entry(pd[i]);
- free(pd, M_VMX); /* free the page directory page */
- }
+ return (pmap_pinit_type(pmap, PT_EPT, ept_pmap_flags));
}
-static void
-ept_free_pml4_entry(pml4_entry_t pml4e)
+struct vmspace *
+ept_vmspace_alloc(vm_offset_t min, vm_offset_t max)
{
- pdp_entry_t *pdp;
- int i;
-
- if (pml4e == 0)
- return;
- if ((pml4e & EPT_PG_SUPERPAGE) == 0) {
- pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4e & EPT_ADDR_MASK);
- for (i = 0; i < NPDPEPG; i++)
- ept_free_pdp_entry(pdp[i]);
- free(pdp, M_VMX); /* free the page directory ptr page */
- }
+ return (vmspace_alloc(min, max, ept_pinit));
}
void
-ept_vmcleanup(struct vmx *vmx)
-{
- int i;
-
- for (i = 0; i < NPML4EPG; i++)
- ept_free_pml4_entry(vmx->pml4ept[i]);
-}
-
-int
-ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len,
- vm_memattr_t attr, int prot, boolean_t spok)
+ept_vmspace_free(struct vmspace *vmspace)
{
- size_t n;
- struct vmx *vmx = arg;
-
- while (len > 0) {
- n = ept_create_mapping(vmx->pml4ept, gpa, hpa, len, attr,
- prot, spok);
- len -= n;
- gpa += n;
- hpa += n;
- }
- return (0);
+ vmspace_free(vmspace);
}
-vm_paddr_t
-ept_vmmmap_get(void *arg, vm_paddr_t gpa)
+uint64_t
+eptp(uint64_t pml4)
{
- vm_paddr_t hpa;
- struct vmx *vmx;
-
- vmx = arg;
- hpa = ept_lookup_mapping(vmx->pml4ept, gpa);
- return (hpa);
-}
+ uint64_t eptp_val;
-static void
-invept_single_context(void *arg)
-{
- struct invept_desc desc = *(struct invept_desc *)arg;
+ eptp_val = pml4 | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK;
+ if (ept_enable_ad_bits)
+ eptp_val |= EPT_ENABLE_AD_BITS;
- invept(INVEPT_TYPE_SINGLE_CONTEXT, desc);
-}
-
-void
-ept_invalidate_mappings(u_long pml4ept)
-{
- struct invept_desc invept_desc = { 0 };
-
- invept_desc.eptp = EPTP(pml4ept);
-
- smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc);
+ return (eptp_val);
}
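
[Editor's sketch] Tying the pieces together: the EPT backend now hands the VM a regular vmspace whose pmap is of type PT_EPT, and the VMX code later derives the EPTP value from that pmap's top-level table. A condensed sketch of that flow, mirroring ept_vmspace_alloc(), vmspace_pmap() and eptp() as used in this change; the guest_eptp_sketch() wrapper is illustrative only:

    static uint64_t
    guest_eptp_sketch(void)
    {
        struct vmspace *vmspace;
        struct pmap *pmap;

        /* One guest address space per VM, backed by an EPT-format pmap. */
        vmspace = ept_vmspace_alloc(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
        pmap = vmspace_pmap(vmspace);

        /* EPTP = PML4 physical address | walk length | memory type. */
        return (eptp(vtophys((vm_offset_t)pmap->pm_pml4)));
    }
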
diff --git a/sys/amd64/vmm/intel/ept.h b/sys/amd64/vmm/intel/ept.h
index 2d7258d..dfd3a44 100644
--- a/sys/amd64/vmm/intel/ept.h
+++ b/sys/amd64/vmm/intel/ept.h
@@ -31,13 +31,9 @@
struct vmx;
-#define EPT_PWLEVELS 4 /* page walk levels */
-#define EPTP(pml4) ((pml4) | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK)
-
int ept_init(void);
-int ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
- vm_memattr_t attr, int prot, boolean_t allow_superpage_mappings);
-vm_paddr_t ept_vmmmap_get(void *arg, vm_paddr_t gpa);
-void ept_invalidate_mappings(u_long ept_pml4);
-void ept_vmcleanup(struct vmx *vmx);
+void ept_invalidate_mappings(u_long eptp);
+struct vmspace *ept_vmspace_alloc(vm_offset_t min, vm_offset_t max);
+void ept_vmspace_free(struct vmspace *vmspace);
+uint64_t eptp(uint64_t pml4);
#endif
diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c
index 97a7ef2..f8b3f95 100644
--- a/sys/amd64/vmm/intel/vmcs.c
+++ b/sys/amd64/vmm/intel/vmcs.c
@@ -318,14 +318,14 @@ done:
int
vmcs_set_defaults(struct vmcs *vmcs,
- u_long host_rip, u_long host_rsp, u_long ept_pml4,
+ u_long host_rip, u_long host_rsp, uint64_t eptp,
uint32_t pinbased_ctls, uint32_t procbased_ctls,
uint32_t procbased_ctls2, uint32_t exit_ctls,
uint32_t entry_ctls, u_long msr_bitmap, uint16_t vpid)
{
int error, codesel, datasel, tsssel;
u_long cr0, cr4, efer;
- uint64_t eptp, pat, fsbase, idtrbase;
+ uint64_t pat, fsbase, idtrbase;
uint32_t exc_bitmap;
codesel = vmm_get_host_codesel();
@@ -432,7 +432,6 @@ vmcs_set_defaults(struct vmcs *vmcs,
goto done;
/* eptp */
- eptp = EPTP(ept_pml4);
if ((error = vmwrite(VMCS_EPTP, eptp)) != 0)
goto done;
diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h
index 6272d21..be16b85 100644
--- a/sys/amd64/vmm/intel/vmcs.h
+++ b/sys/amd64/vmm/intel/vmcs.h
@@ -47,7 +47,7 @@ struct msr_entry {
int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count);
int vmcs_set_defaults(struct vmcs *vmcs, u_long host_rip, u_long host_rsp,
- u_long ept_pml4,
+ uint64_t eptp,
uint32_t pinbased_ctls, uint32_t procbased_ctls,
uint32_t procbased_ctls2, uint32_t exit_ctls,
uint32_t entry_ctls, u_long msr_bitmap,
@@ -68,6 +68,8 @@ uint64_t vmcs_read(uint32_t encoding);
#define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3)
#define vmcs_gpa() vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS)
#define vmcs_gla() vmcs_read(VMCS_GUEST_LINEAR_ADDRESS)
+#define vmcs_idt_vectoring_info() vmcs_read(VMCS_IDT_VECTORING_INFO)
+#define vmcs_idt_vectoring_err() vmcs_read(VMCS_IDT_VECTORING_ERROR)
#endif /* _KERNEL */
@@ -314,6 +316,12 @@ uint64_t vmcs_read(uint32_t encoding);
#define VMCS_INTERRUPTION_INFO_NMI (2 << 8)
/*
+ * VMCS IDT-Vectoring information fields
+ */
+#define VMCS_IDT_VEC_VALID (1 << 31)
+#define VMCS_IDT_VEC_ERRCODE_VALID (1 << 11)
+
+/*
* VMCS Guest interruptibility field
*/
#define VMCS_INTERRUPTIBILITY_STI_BLOCKING (1 << 0)
@@ -332,6 +340,9 @@ uint64_t vmcs_read(uint32_t encoding);
#define EPT_VIOLATION_DATA_READ (1UL << 0)
#define EPT_VIOLATION_DATA_WRITE (1UL << 1)
#define EPT_VIOLATION_INST_FETCH (1UL << 2)
+#define EPT_VIOLATION_GPA_READABLE (1UL << 3)
+#define EPT_VIOLATION_GPA_WRITEABLE (1UL << 4)
+#define EPT_VIOLATION_GPA_EXECUTABLE (1UL << 5)
#define EPT_VIOLATION_GLA_VALID (1UL << 7)
#define EPT_VIOLATION_XLAT_VALID (1UL << 8)
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
index c365111..5b65de6 100644
--- a/sys/amd64/vmm/intel/vmx.c
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -49,8 +49,6 @@ __FBSDID("$FreeBSD$");
#include <machine/specialreg.h>
#include <machine/vmparam.h>
-#include <x86/apicreg.h>
-
#include <machine/vmm.h>
#include "vmm_host.h"
#include "vmm_lapic.h"
@@ -167,9 +165,6 @@ static int cap_pause_exit;
static int cap_unrestricted_guest;
static int cap_monitor_trap;
-/* statistics */
-static VMM_STAT_INTEL(VMEXIT_HLT_IGNORED, "number of times hlt was ignored");
-
static struct unrhdr *vpid_unr;
static u_int vpid_alloc_failed;
SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
@@ -740,7 +735,7 @@ vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
#define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init))
static void *
-vmx_vminit(struct vm *vm)
+vmx_vminit(struct vm *vm, pmap_t pmap)
{
uint16_t vpid[VM_MAXCPU];
int i, error, guest_msr_count;
@@ -753,6 +748,8 @@ vmx_vminit(struct vm *vm)
}
vmx->vm = vm;
+ vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4));
+
/*
* Clean up EPTP-tagged guest physical and combined mappings
*
@@ -762,7 +759,7 @@ vmx_vminit(struct vm *vm)
*
* Combined mappings for this EP4TA are also invalidated for all VPIDs.
*/
- ept_invalidate_mappings(vtophys(vmx->pml4ept));
+ ept_invalidate_mappings(vmx->eptp);
msr_bitmap_initialize(vmx->msr_bitmap);
@@ -818,7 +815,7 @@ vmx_vminit(struct vm *vm)
error = vmcs_set_defaults(&vmx->vmcs[i],
(u_long)vmx_longjmp,
(u_long)&vmx->ctx[i],
- vtophys(vmx->pml4ept),
+ vmx->eptp,
pinbased_ctls,
procbased_ctls,
procbased_ctls2,
@@ -856,6 +853,9 @@ vmx_vminit(struct vm *vm)
error = vmx_setup_cr4_shadow(&vmx->vmcs[i], 0);
if (error != 0)
panic("vmx_setup_cr4_shadow %d", error);
+
+ vmx->ctx[i].pmap = pmap;
+ vmx->ctx[i].eptp = vmx->eptp;
}
return (vmx);
@@ -1281,21 +1281,49 @@ vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
}
static int
-vmx_ept_fault(struct vm *vm, int cpu,
- uint64_t gla, uint64_t gpa, uint64_t rip, int inst_length,
- uint64_t cr3, uint64_t ept_qual, struct vie *vie)
+ept_fault_type(uint64_t ept_qual)
+{
+ int fault_type;
+
+ if (ept_qual & EPT_VIOLATION_DATA_WRITE)
+ fault_type = VM_PROT_WRITE;
+ else if (ept_qual & EPT_VIOLATION_INST_FETCH)
+ fault_type = VM_PROT_EXECUTE;
+ else
+ fault_type = VM_PROT_READ;
+
+ return (fault_type);
+}
+
+static int
+ept_protection(uint64_t ept_qual)
+{
+ int prot = 0;
+
+ if (ept_qual & EPT_VIOLATION_GPA_READABLE)
+ prot |= VM_PROT_READ;
+ if (ept_qual & EPT_VIOLATION_GPA_WRITEABLE)
+ prot |= VM_PROT_WRITE;
+ if (ept_qual & EPT_VIOLATION_GPA_EXECUTABLE)
+ prot |= VM_PROT_EXECUTE;
+
+ return (prot);
+}
+
+static boolean_t
+ept_emulation_fault(uint64_t ept_qual)
{
- int read, write, error;
+ int read, write;
- /* EPT violation on an instruction fetch doesn't make sense here */
+ /* EPT fault on an instruction fetch doesn't make sense here */
if (ept_qual & EPT_VIOLATION_INST_FETCH)
- return (UNHANDLED);
+ return (FALSE);
- /* EPT violation must be a read fault or a write fault */
+ /* EPT fault must be a read fault or a write fault */
read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
if ((read | write) == 0)
- return (UNHANDLED);
+ return (FALSE);
/*
* The EPT violation must have been caused by accessing a
@@ -1304,26 +1332,10 @@ vmx_ept_fault(struct vm *vm, int cpu,
*/
if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
(ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
- return (UNHANDLED);
+ return (FALSE);
}
- /* Fetch, decode and emulate the faulting instruction */
- if (vmm_fetch_instruction(vm, cpu, rip, inst_length, cr3, vie) != 0)
- return (UNHANDLED);
-
- if (vmm_decode_instruction(vm, cpu, gla, vie) != 0)
- return (UNHANDLED);
-
- /*
- * Check if this is a local apic access
- */
- if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE)
- return (UNHANDLED);
-
- error = vmm_emulate_instruction(vm, cpu, gpa, vie,
- lapic_mmio_read, lapic_mmio_write, 0);
-
- return (error ? UNHANDLED : HANDLED);
+ return (TRUE);
}
static int
@@ -1332,18 +1344,47 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
int error, handled;
struct vmcs *vmcs;
struct vmxctx *vmxctx;
- uint32_t eax, ecx, edx;
- uint64_t qual, gla, gpa, cr3, intr_info;
+ uint32_t eax, ecx, edx, idtvec_info, idtvec_err, reason;
+ uint64_t qual, gpa;
handled = 0;
vmcs = &vmx->vmcs[vcpu];
vmxctx = &vmx->ctx[vcpu];
qual = vmexit->u.vmx.exit_qualification;
+ reason = vmexit->u.vmx.exit_reason;
vmexit->exitcode = VM_EXITCODE_BOGUS;
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
- switch (vmexit->u.vmx.exit_reason) {
+ /*
+ * VM exits that could be triggered during event injection on the
+ * previous VM entry need to be handled specially by re-injecting
+ * the event.
+ *
+ * See "Information for VM Exits During Event Delivery" in Intel SDM
+ * for details.
+ */
+ switch (reason) {
+ case EXIT_REASON_EPT_FAULT:
+ case EXIT_REASON_EPT_MISCONFIG:
+ case EXIT_REASON_APIC:
+ case EXIT_REASON_TASK_SWITCH:
+ case EXIT_REASON_EXCEPTION:
+ idtvec_info = vmcs_idt_vectoring_info();
+ if (idtvec_info & VMCS_IDT_VEC_VALID) {
+ idtvec_info &= ~(1 << 12); /* clear undefined bit */
+ vmwrite(VMCS_ENTRY_INTR_INFO, idtvec_info);
+ if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
+ idtvec_err = vmcs_idt_vectoring_err();
+ vmwrite(VMCS_ENTRY_EXCEPTION_ERROR, idtvec_err);
+ }
+ vmwrite(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
+ }
+ default:
+ break;
+ }
+
+ switch (reason) {
case EXIT_REASON_CR_ACCESS:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
handled = vmx_emulate_cr_access(vmx, vcpu, qual);
@@ -1374,19 +1415,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
break;
case EXIT_REASON_HLT:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
- /*
- * If there is an event waiting to be injected then there is
- * no need to 'hlt'.
- */
- error = vmread(VMCS_ENTRY_INTR_INFO, &intr_info);
- if (error)
- panic("vmx_exit_process: vmread(intrinfo) %d", error);
-
- if (intr_info & VMCS_INTERRUPTION_INFO_VALID) {
- handled = 1;
- vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT_IGNORED, 1);
- } else
- vmexit->exitcode = VM_EXITCODE_HLT;
+ vmexit->exitcode = VM_EXITCODE_HLT;
break;
case EXIT_REASON_MTF:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
@@ -1440,15 +1469,22 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
break;
case EXIT_REASON_EPT_FAULT:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EPT_FAULT, 1);
- gla = vmcs_gla();
+ /*
+ * If 'gpa' lies within the address space allocated to
+ * memory then this must be a nested page fault. Otherwise
+ * this must be an instruction that accesses MMIO space.
+ */
gpa = vmcs_gpa();
- cr3 = vmcs_guest_cr3();
- handled = vmx_ept_fault(vmx->vm, vcpu, gla, gpa,
- vmexit->rip, vmexit->inst_length,
- cr3, qual, &vmexit->u.paging.vie);
- if (!handled) {
+ if (vm_mem_allocated(vmx->vm, gpa)) {
vmexit->exitcode = VM_EXITCODE_PAGING;
vmexit->u.paging.gpa = gpa;
+ vmexit->u.paging.fault_type = ept_fault_type(qual);
+ vmexit->u.paging.protection = ept_protection(qual);
+ } else if (ept_emulation_fault(qual)) {
+ vmexit->exitcode = VM_EXITCODE_INST_EMUL;
+ vmexit->u.inst_emul.gpa = gpa;
+ vmexit->u.inst_emul.gla = vmcs_gla();
+ vmexit->u.inst_emul.cr3 = vmcs_guest_cr3();
}
break;
default:
@@ -1470,14 +1506,6 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
vm_exit_update_rip(vmexit);
vmexit->rip += vmexit->inst_length;
vmexit->inst_length = 0;
-
- /*
- * Special case for spinning up an AP - exit to userspace to
- * give the controlling process a chance to intercept and
- * spin up a thread for the AP.
- */
- if (vmexit->exitcode == VM_EXITCODE_SPINUP_AP)
- handled = 0;
} else {
if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
/*
@@ -1497,7 +1525,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
}
static int
-vmx_run(void *arg, int vcpu, register_t rip)
+vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap)
{
int error, vie, rc, handled, astpending;
uint32_t exit_reason;
@@ -1505,7 +1533,7 @@ vmx_run(void *arg, int vcpu, register_t rip)
struct vmxctx *vmxctx;
struct vmcs *vmcs;
struct vm_exit *vmexit;
-
+
vmx = arg;
vmcs = &vmx->vmcs[vcpu];
vmxctx = &vmx->ctx[vcpu];
@@ -1514,6 +1542,11 @@ vmx_run(void *arg, int vcpu, register_t rip)
astpending = 0;
vmexit = vm_exitinfo(vmx->vm, vcpu);
+ KASSERT(vmxctx->pmap == pmap,
+ ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap));
+ KASSERT(vmxctx->eptp == vmx->eptp,
+ ("eptp %p different than ctx eptp %#lx", eptp, vmxctx->eptp));
+
/*
* XXX Can we avoid doing this every time we do a vm run?
*/
@@ -1576,6 +1609,9 @@ vmx_run(void *arg, int vcpu, register_t rip)
vmxctx->launch_error, vie);
#endif
goto err_exit;
+ case VMX_RETURN_INVEPT:
+ panic("vm %s:%d invept error %d",
+ vm_name(vmx->vm), vcpu, vmxctx->launch_error);
default:
panic("vmx_setjmp returned %d", rc);
}
@@ -1654,7 +1690,6 @@ vmx_vmcleanup(void *arg)
if (error != 0)
panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error);
- ept_vmcleanup(vmx);
free(vmx, M_VMX);
return;
@@ -2000,13 +2035,13 @@ struct vmm_ops vmm_ops_intel = {
vmx_vminit,
vmx_run,
vmx_vmcleanup,
- ept_vmmmap_set,
- ept_vmmmap_get,
vmx_getreg,
vmx_setreg,
vmx_getdesc,
vmx_setdesc,
vmx_inject,
vmx_getcap,
- vmx_setcap
+ vmx_setcap,
+ ept_vmspace_alloc,
+ ept_vmspace_free,
};
diff --git a/sys/amd64/vmm/intel/vmx.h b/sys/amd64/vmm/intel/vmx.h
index c7cd567..8e57764 100644
--- a/sys/amd64/vmm/intel/vmx.h
+++ b/sys/amd64/vmm/intel/vmx.h
@@ -31,6 +31,8 @@
#include "vmcs.h"
+struct pmap;
+
#define GUEST_MSR_MAX_ENTRIES 64 /* arbitrary */
struct vmxctx {
@@ -68,6 +70,15 @@ struct vmxctx {
int launched; /* vmcs launch state */
int launch_error;
+
+ long eptgen[MAXCPU]; /* cached pmap->pm_eptgen */
+
+ /*
+ * The 'eptp' and the 'pmap' do not change during the lifetime of
+ * the VM so it is safe to keep a copy in each vcpu's vmxctx.
+ */
+ vm_paddr_t eptp;
+ struct pmap *pmap;
};
struct vmxcap {
@@ -82,16 +93,15 @@ struct vmxstate {
/* virtual machine softc */
struct vmx {
- pml4_entry_t pml4ept[NPML4EPG];
struct vmcs vmcs[VM_MAXCPU]; /* one vmcs per virtual cpu */
char msr_bitmap[PAGE_SIZE];
struct msr_entry guest_msrs[VM_MAXCPU][GUEST_MSR_MAX_ENTRIES];
struct vmxctx ctx[VM_MAXCPU];
struct vmxcap cap[VM_MAXCPU];
struct vmxstate state[VM_MAXCPU];
+ uint64_t eptp;
struct vm *vm;
};
-CTASSERT((offsetof(struct vmx, pml4ept) & PAGE_MASK) == 0);
CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0);
CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0);
CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0);
@@ -101,6 +111,7 @@ CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0);
#define VMX_RETURN_VMRESUME 2
#define VMX_RETURN_VMLAUNCH 3
#define VMX_RETURN_AST 4
+#define VMX_RETURN_INVEPT 5
/*
* vmx_setjmp() returns:
* - 0 when it returns directly
@@ -108,6 +119,7 @@ CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0);
* - 2 when it returns from vmx_resume (which would only be in the error case)
* - 3 when it returns from vmx_launch (which would only be in the error case)
* - 4 when it returns from vmx_resume or vmx_launch because of AST pending
+ * - 5 when it returns from vmx_launch/vmx_resume because of invept error
*/
int vmx_setjmp(struct vmxctx *ctx);
void vmx_longjmp(void); /* returns via vmx_setjmp */
diff --git a/sys/amd64/vmm/intel/vmx_genassym.c b/sys/amd64/vmm/intel/vmx_genassym.c
index 823a05d..9dcfcfb 100644
--- a/sys/amd64/vmm/intel/vmx_genassym.c
+++ b/sys/amd64/vmm/intel/vmx_genassym.c
@@ -72,6 +72,10 @@ ASSYM(VMXCTX_HOST_RBX, offsetof(struct vmxctx, host_rbx));
ASSYM(VMXCTX_HOST_RIP, offsetof(struct vmxctx, host_rip));
ASSYM(VMXCTX_LAUNCH_ERROR, offsetof(struct vmxctx, launch_error));
+ASSYM(VMXCTX_EPTGEN, offsetof(struct vmxctx, eptgen));
+
+ASSYM(VMXCTX_PMAP, offsetof(struct vmxctx, pmap));
+ASSYM(VMXCTX_EPTP, offsetof(struct vmxctx, eptp));
ASSYM(VM_SUCCESS, VM_SUCCESS);
ASSYM(VM_FAIL_INVALID, VM_FAIL_INVALID);
@@ -82,8 +86,13 @@ ASSYM(VMX_RETURN_LONGJMP, VMX_RETURN_LONGJMP);
ASSYM(VMX_RETURN_VMRESUME, VMX_RETURN_VMRESUME);
ASSYM(VMX_RETURN_VMLAUNCH, VMX_RETURN_VMLAUNCH);
ASSYM(VMX_RETURN_AST, VMX_RETURN_AST);
+ASSYM(VMX_RETURN_INVEPT, VMX_RETURN_INVEPT);
ASSYM(TDF_ASTPENDING, TDF_ASTPENDING);
ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED);
ASSYM(TD_FLAGS, offsetof(struct thread, td_flags));
ASSYM(PC_CURTHREAD, offsetof(struct pcpu, pc_curthread));
+ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid));
+
+ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active));
+ASSYM(PM_EPTGEN, offsetof(struct pmap, pm_eptgen));
diff --git a/sys/amd64/vmm/intel/vmx_support.S b/sys/amd64/vmm/intel/vmx_support.S
index 4ba582a..fd7b4aa 100644
--- a/sys/amd64/vmm/intel/vmx_support.S
+++ b/sys/amd64/vmm/intel/vmx_support.S
@@ -30,6 +30,12 @@
#include "vmx_assym.s"
+#ifdef SMP
+#define LK lock ;
+#else
+#define LK
+#endif
+
/*
* Disable interrupts before updating %rsp in VMX_CHECK_AST or
* VMX_GUEST_RESTORE.
@@ -86,15 +92,73 @@
movq VMXCTX_GUEST_R15(%rdi),%r15; \
movq VMXCTX_GUEST_RDI(%rdi),%rdi; /* restore rdi the last */
-#define VM_INSTRUCTION_ERROR(reg) \
+/*
+ * Check for an error after executing a VMX instruction.
+ * 'errreg' will be zero on success and non-zero otherwise.
+ * 'ctxreg' points to the 'struct vmxctx' associated with the vcpu.
+ */
+#define VM_INSTRUCTION_ERROR(errreg, ctxreg) \
jnc 1f; \
- movl $VM_FAIL_INVALID,reg; /* CF is set */ \
+ movl $VM_FAIL_INVALID,errreg; /* CF is set */ \
jmp 3f; \
1: jnz 2f; \
- movl $VM_FAIL_VALID,reg; /* ZF is set */ \
+ movl $VM_FAIL_VALID,errreg; /* ZF is set */ \
jmp 3f; \
-2: movl $VM_SUCCESS,reg; \
-3: movl reg,VMXCTX_LAUNCH_ERROR(%rsp)
+2: movl $VM_SUCCESS,errreg; \
+3: movl errreg,VMXCTX_LAUNCH_ERROR(ctxreg)
+
+/*
+ * set or clear the appropriate bit in 'pm_active'
+ * %rdi = vmxctx
+ * %rax, %r11 = scratch registers
+ */
+#define VMX_SET_PM_ACTIVE \
+ movq VMXCTX_PMAP(%rdi), %r11; \
+ movl PCPU(CPUID), %eax; \
+ LK btsl %eax, PM_ACTIVE(%r11)
+
+#define VMX_CLEAR_PM_ACTIVE \
+ movq VMXCTX_PMAP(%rdi), %r11; \
+ movl PCPU(CPUID), %eax; \
+ LK btrl %eax, PM_ACTIVE(%r11)
+
+/*
+ * If 'vmxctx->eptgen[curcpu]' is not identical to 'pmap->pm_eptgen'
+ * then we must invalidate all mappings associated with this eptp.
+ *
+ * %rdi = vmxctx
+ * %rax, %rbx, %r11 = scratch registers
+ */
+#define VMX_CHECK_EPTGEN \
+ movl PCPU(CPUID), %ebx; \
+ movq VMXCTX_PMAP(%rdi), %r11; \
+ movq PM_EPTGEN(%r11), %rax; \
+ cmpq %rax, VMXCTX_EPTGEN(%rdi, %rbx, 8); \
+ je 9f; \
+ \
+ /* Refresh 'vmxctx->eptgen[curcpu]' */ \
+ movq %rax, VMXCTX_EPTGEN(%rdi, %rbx, 8); \
+ \
+ /* Setup the invept descriptor at the top of tmpstk */ \
+ mov %rdi, %r11; \
+ addq $VMXCTX_TMPSTKTOP, %r11; \
+ movq VMXCTX_EPTP(%rdi), %rax; \
+ movq %rax, -16(%r11); \
+ movq $0x0, -8(%r11); \
+ mov $0x1, %eax; /* Single context invalidate */ \
+ invept -16(%r11), %rax; \
+ \
+ /* Check for invept error */ \
+ VM_INSTRUCTION_ERROR(%eax, %rdi); \
+ testl %eax, %eax; \
+ jz 9f; \
+ \
+ /* Return via vmx_setjmp with retval of VMX_RETURN_INVEPT */ \
+ movq $VMX_RETURN_INVEPT, %rsi; \
+ movq %rdi,%rsp; \
+ addq $VMXCTX_TMPSTKTOP, %rsp; \
+ callq vmx_return; \
+9: ;
.text
/*
@@ -129,6 +193,9 @@ END(vmx_setjmp)
* Return to vmm context through vmx_setjmp() with a value of 'retval'.
*/
ENTRY(vmx_return)
+ /* The pmap is no longer active on the host cpu */
+ VMX_CLEAR_PM_ACTIVE
+
/* Restore host context. */
movq VMXCTX_HOST_R15(%rdi),%r15
movq VMXCTX_HOST_R14(%rdi),%r14
@@ -193,6 +260,10 @@ ENTRY(vmx_resume)
VMX_CHECK_AST
+ VMX_SET_PM_ACTIVE /* This vcpu is now active on the host cpu */
+
+ VMX_CHECK_EPTGEN /* Check if we have to invalidate TLB */
+
/*
* Restore guest state that is not automatically loaded from the vmcs.
*/
@@ -203,7 +274,7 @@ ENTRY(vmx_resume)
/*
* Capture the reason why vmresume failed.
*/
- VM_INSTRUCTION_ERROR(%eax)
+ VM_INSTRUCTION_ERROR(%eax, %rsp)
/* Return via vmx_setjmp with return value of VMX_RETURN_VMRESUME */
movq %rsp,%rdi
@@ -225,6 +296,10 @@ ENTRY(vmx_launch)
VMX_CHECK_AST
+ VMX_SET_PM_ACTIVE /* This vcpu is now active on the host cpu */
+
+ VMX_CHECK_EPTGEN /* Check if we have to invalidate TLB */
+
/*
* Restore guest state that is not automatically loaded from the vmcs.
*/
@@ -235,7 +310,7 @@ ENTRY(vmx_launch)
/*
* Capture the reason why vmlaunch failed.
*/
- VM_INSTRUCTION_ERROR(%eax)
+ VM_INSTRUCTION_ERROR(%eax, %rsp)
/* Return via vmx_setjmp with return value of VMX_RETURN_VMLAUNCH */
movq %rsp,%rdi
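
[Editor's sketch] VMX_CHECK_EPTGEN implements, in assembly, a simple generation-count protocol: the pmap bumps pm_eptgen whenever EPT mappings must be invalidated, and each vcpu compares its cached copy on entry, issuing a single-context invept when it is stale. The same logic expressed as C pseudocode; invept_one_context() stands in for the inline invept sequence and is hypothetical:

    static void
    vmx_check_eptgen(struct vmxctx *vmxctx, struct pmap *pmap, int cpu)
    {
        long gen;

        gen = pmap->pm_eptgen;
        if (vmxctx->eptgen[cpu] != gen) {
            vmxctx->eptgen[cpu] = gen;        /* refresh cached value */
            invept_one_context(vmxctx->eptp); /* hypothetical helper */
        }
    }
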
diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c
index 64d9ff5..a6061e9 100644
--- a/sys/amd64/vmm/io/ppt.c
+++ b/sys/amd64/vmm/io/ppt.c
@@ -282,6 +282,43 @@ ppt_teardown_msix(struct pptdev *ppt)
}
int
+ppt_num_devices(struct vm *vm)
+{
+ int i, num;
+
+ num = 0;
+ for (i = 0; i < num_pptdevs; i++) {
+ if (pptdevs[i].vm == vm)
+ num++;
+ }
+ return (num);
+}
+
+boolean_t
+ppt_is_mmio(struct vm *vm, vm_paddr_t gpa)
+{
+ int i, n;
+ struct pptdev *ppt;
+ struct vm_memory_segment *seg;
+
+ for (n = 0; n < num_pptdevs; n++) {
+ ppt = &pptdevs[n];
+ if (ppt->vm != vm)
+ continue;
+
+ for (i = 0; i < MAX_MMIOSEGS; i++) {
+ seg = &ppt->mmio[i];
+ if (seg->len == 0)
+ continue;
+ if (gpa >= seg->gpa && gpa < seg->gpa + seg->len)
+ return (TRUE);
+ }
+ }
+
+ return (FALSE);
+}
+
+int
ppt_assign_device(struct vm *vm, int bus, int slot, int func)
{
struct pptdev *ppt;
@@ -336,7 +373,7 @@ ppt_unassign_all(struct vm *vm)
bus = pci_get_bus(dev);
slot = pci_get_slot(dev);
func = pci_get_function(dev);
- ppt_unassign_device(vm, bus, slot, func);
+ vm_unassign_pptdev(vm, bus, slot, func);
}
}
@@ -591,10 +628,3 @@ ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
return (0);
}
-
-int
-ppt_num_devices(void)
-{
-
- return (num_pptdevs);
-}
diff --git a/sys/amd64/vmm/io/ppt.h b/sys/amd64/vmm/io/ppt.h
index 931893c..7670bc4 100644
--- a/sys/amd64/vmm/io/ppt.h
+++ b/sys/amd64/vmm/io/ppt.h
@@ -29,14 +29,20 @@
#ifndef _IO_PPT_H_
#define _IO_PPT_H_
-int ppt_assign_device(struct vm *vm, int bus, int slot, int func);
-int ppt_unassign_device(struct vm *vm, int bus, int slot, int func);
int ppt_unassign_all(struct vm *vm);
int ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
int destcpu, int vector, int numvec);
int ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
- int idx, uint32_t msg, uint32_t vector_control, uint64_t addr);
-int ppt_num_devices(void);
+ int idx, uint32_t msg, uint32_t vector_control, uint64_t addr);
+int ppt_num_devices(struct vm *vm);
+boolean_t ppt_is_mmio(struct vm *vm, vm_paddr_t gpa);
+
+/*
+ * The following functions should never be called directly.
+ * Use 'vm_assign_pptdev()' and 'vm_unassign_pptdev()' instead.
+ */
+int ppt_assign_device(struct vm *vm, int bus, int slot, int func);
+int ppt_unassign_device(struct vm *vm, int bus, int slot, int func);
#endif
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index 5fc6b94..5c2f202 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -39,18 +39,28 @@ __FBSDID("$FreeBSD$");
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
+#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/systm.h>
#include <vm/vm.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_param.h>
#include <machine/vm.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <x86/apicreg.h>
+#include <machine/pmap.h>
+#include <machine/vmparam.h>
#include <machine/vmm.h>
+#include "vmm_ktr.h"
#include "vmm_host.h"
#include "vmm_mem.h"
#include "vmm_util.h"
@@ -84,15 +94,23 @@ struct vcpu {
#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
#define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
+#define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED)
+struct mem_seg {
+ vm_paddr_t gpa;
+ size_t len;
+ boolean_t wired;
+ vm_object_t object;
+};
#define VM_MAX_MEMORY_SEGMENTS 2
struct vm {
void *cookie; /* processor-specific data */
void *iommu; /* iommu-specific data */
+ struct vmspace *vmspace; /* guest's address space */
struct vcpu vcpu[VM_MAXCPU];
int num_mem_segs;
- struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS];
+ struct mem_seg mem_segs[VM_MAX_MEMORY_SEGMENTS];
char name[VM_MAX_NAMELEN];
/*
@@ -109,16 +127,14 @@ static struct vmm_ops *ops;
#define VMM_INIT() (ops != NULL ? (*ops->init)() : 0)
#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0)
-#define VMINIT(vm) (ops != NULL ? (*ops->vminit)(vm): NULL)
-#define VMRUN(vmi, vcpu, rip) \
- (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip) : ENXIO)
+#define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
+#define VMRUN(vmi, vcpu, rip, pmap) \
+ (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap) : ENXIO)
#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
-#define VMMMAP_SET(vmi, gpa, hpa, len, attr, prot, spm) \
- (ops != NULL ? \
- (*ops->vmmmap_set)(vmi, gpa, hpa, len, attr, prot, spm) : \
- ENXIO)
-#define VMMMAP_GET(vmi, gpa) \
- (ops != NULL ? (*ops->vmmmap_get)(vmi, gpa) : ENXIO)
+#define VMSPACE_ALLOC(min, max) \
+ (ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
+#define VMSPACE_FREE(vmspace) \
+ (ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
#define VMGETREG(vmi, vcpu, num, retval) \
(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define VMSETREG(vmi, vcpu, num, val) \
@@ -213,8 +229,7 @@ vmm_handler(module_t mod, int what, void *arg)
switch (what) {
case MOD_LOAD:
vmmdev_init();
- if (ppt_num_devices() > 0)
- iommu_init();
+ iommu_init();
error = vmm_init();
if (error == 0)
vmm_initialized = 1;
@@ -265,7 +280,7 @@ vm_create(const char *name, struct vm **retvm)
{
int i;
struct vm *vm;
- vm_paddr_t maxaddr;
+ struct vmspace *vmspace;
const int BSP = 0;
@@ -279,59 +294,34 @@ vm_create(const char *name, struct vm **retvm)
if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
return (EINVAL);
+ vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
+ if (vmspace == NULL)
+ return (ENOMEM);
+
vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
strcpy(vm->name, name);
- vm->cookie = VMINIT(vm);
+ vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));
for (i = 0; i < VM_MAXCPU; i++) {
vcpu_init(vm, i);
guest_msrs_init(vm, i);
}
- maxaddr = vmm_mem_maxaddr();
- vm->iommu = iommu_create_domain(maxaddr);
vm_activate_cpu(vm, BSP);
+ vm->vmspace = vmspace;
*retvm = vm;
return (0);
}
static void
-vm_free_mem_seg(struct vm *vm, struct vm_memory_segment *seg)
+vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
{
- size_t len;
- vm_paddr_t hpa;
- void *host_domain;
-
- host_domain = iommu_host_domain();
-
- len = 0;
- while (len < seg->len) {
- hpa = vm_gpa2hpa(vm, seg->gpa + len, PAGE_SIZE);
- if (hpa == (vm_paddr_t)-1) {
- panic("vm_free_mem_segs: cannot free hpa "
- "associated with gpa 0x%016lx", seg->gpa + len);
- }
-
- /*
- * Remove the 'gpa' to 'hpa' mapping in VMs domain.
- * And resurrect the 1:1 mapping for 'hpa' in 'host_domain'.
- */
- iommu_remove_mapping(vm->iommu, seg->gpa + len, PAGE_SIZE);
- iommu_create_mapping(host_domain, hpa, hpa, PAGE_SIZE);
-
- vmm_mem_free(hpa, PAGE_SIZE);
-
- len += PAGE_SIZE;
- }
- /*
- * Invalidate cached translations associated with 'vm->iommu' since
- * we have now moved some pages from it.
- */
- iommu_invalidate_tlb(vm->iommu);
+ if (seg->object != NULL)
+ vmm_mem_free(vm->vmspace, seg->gpa, seg->len);
- bzero(seg, sizeof(struct vm_memory_segment));
+ bzero(seg, sizeof(*seg));
}
void
@@ -341,6 +331,9 @@ vm_destroy(struct vm *vm)
ppt_unassign_all(vm);
+ if (vm->iommu != NULL)
+ iommu_destroy_domain(vm->iommu);
+
for (i = 0; i < vm->num_mem_segs; i++)
vm_free_mem_seg(vm, &vm->mem_segs[i]);
@@ -349,7 +342,7 @@ vm_destroy(struct vm *vm)
for (i = 0; i < VM_MAXCPU; i++)
vcpu_cleanup(&vm->vcpu[i]);
- iommu_destroy_domain(vm->iommu);
+ VMSPACE_FREE(vm->vmspace);
VMCLEANUP(vm->cookie);
@@ -365,52 +358,48 @@ vm_name(struct vm *vm)
int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
- const boolean_t spok = TRUE; /* superpage mappings are ok */
+ vm_object_t obj;
- return (VMMMAP_SET(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE,
- VM_PROT_RW, spok));
+ if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
+ return (ENOMEM);
+ else
+ return (0);
}
int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{
- const boolean_t spok = TRUE; /* superpage mappings are ok */
- return (VMMMAP_SET(vm->cookie, gpa, 0, len, 0,
- VM_PROT_NONE, spok));
+ vmm_mmio_free(vm->vmspace, gpa, len);
+ return (0);
}
-/*
- * Returns TRUE if 'gpa' is available for allocation and FALSE otherwise
- */
-static boolean_t
-vm_gpa_available(struct vm *vm, vm_paddr_t gpa)
+boolean_t
+vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
{
int i;
vm_paddr_t gpabase, gpalimit;
- if (gpa & PAGE_MASK)
- panic("vm_gpa_available: gpa (0x%016lx) not page aligned", gpa);
-
for (i = 0; i < vm->num_mem_segs; i++) {
gpabase = vm->mem_segs[i].gpa;
gpalimit = gpabase + vm->mem_segs[i].len;
if (gpa >= gpabase && gpa < gpalimit)
- return (FALSE);
+ return (TRUE); /* 'gpa' is regular memory */
}
- return (TRUE);
+ if (ppt_is_mmio(vm, gpa))
+ return (TRUE); /* 'gpa' is pci passthru mmio */
+
+ return (FALSE);
}
int
vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
{
- int error, available, allocated;
- struct vm_memory_segment *seg;
- vm_paddr_t g, hpa;
- void *host_domain;
-
- const boolean_t spok = TRUE; /* superpage mappings are ok */
+ int available, allocated;
+ struct mem_seg *seg;
+ vm_object_t object;
+ vm_paddr_t g;
if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
return (EINVAL);
@@ -418,10 +407,10 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
available = allocated = 0;
g = gpa;
while (g < gpa + len) {
- if (vm_gpa_available(vm, g))
- available++;
- else
+ if (vm_mem_allocated(vm, g))
allocated++;
+ else
+ available++;
g += PAGE_SIZE;
}
@@ -443,61 +432,203 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
return (E2BIG);
- host_domain = iommu_host_domain();
-
seg = &vm->mem_segs[vm->num_mem_segs];
- error = 0;
+ if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
+ return (ENOMEM);
+
seg->gpa = gpa;
- seg->len = 0;
- while (seg->len < len) {
- hpa = vmm_mem_alloc(PAGE_SIZE);
- if (hpa == 0) {
- error = ENOMEM;
- break;
- }
+ seg->len = len;
+ seg->object = object;
+ seg->wired = FALSE;
- error = VMMMAP_SET(vm->cookie, gpa + seg->len, hpa, PAGE_SIZE,
- VM_MEMATTR_WRITE_BACK, VM_PROT_ALL, spok);
- if (error)
+ vm->num_mem_segs++;
+
+ return (0);
+}
+
+static void
+vm_gpa_unwire(struct vm *vm)
+{
+ int i, rv;
+ struct mem_seg *seg;
+
+ for (i = 0; i < vm->num_mem_segs; i++) {
+ seg = &vm->mem_segs[i];
+ if (!seg->wired)
+ continue;
+
+ rv = vm_map_unwire(&vm->vmspace->vm_map,
+ seg->gpa, seg->gpa + seg->len,
+ VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
+ KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
+ "%#lx/%ld could not be unwired: %d",
+ vm_name(vm), seg->gpa, seg->len, rv));
+
+ seg->wired = FALSE;
+ }
+}
+
+static int
+vm_gpa_wire(struct vm *vm)
+{
+ int i, rv;
+ struct mem_seg *seg;
+
+ for (i = 0; i < vm->num_mem_segs; i++) {
+ seg = &vm->mem_segs[i];
+ if (seg->wired)
+ continue;
+
+ /* XXX rlimits? */
+ rv = vm_map_wire(&vm->vmspace->vm_map,
+ seg->gpa, seg->gpa + seg->len,
+ VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
+ if (rv != KERN_SUCCESS)
break;
+ seg->wired = TRUE;
+ }
+
+ if (i < vm->num_mem_segs) {
/*
- * Remove the 1:1 mapping for 'hpa' from the 'host_domain'.
- * Add mapping for 'gpa + seg->len' to 'hpa' in the VMs domain.
+ * Undo the wiring before returning an error.
*/
- iommu_remove_mapping(host_domain, hpa, PAGE_SIZE);
- iommu_create_mapping(vm->iommu, gpa + seg->len, hpa, PAGE_SIZE);
+ vm_gpa_unwire(vm);
+ return (EAGAIN);
+ }
+
+ return (0);
+}
+
+static void
+vm_iommu_modify(struct vm *vm, boolean_t map)
+{
+ int i, sz;
+ vm_paddr_t gpa, hpa;
+ struct mem_seg *seg;
+ void *vp, *cookie, *host_domain;
- seg->len += PAGE_SIZE;
+ sz = PAGE_SIZE;
+ host_domain = iommu_host_domain();
+
+ for (i = 0; i < vm->num_mem_segs; i++) {
+ seg = &vm->mem_segs[i];
+ KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
+ vm_name(vm), seg->gpa, seg->len));
+
+ gpa = seg->gpa;
+ while (gpa < seg->gpa + seg->len) {
+ vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
+ &cookie);
+ KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
+ vm_name(vm), gpa));
+
+ vm_gpa_release(cookie);
+
+ hpa = DMAP_TO_PHYS((uintptr_t)vp);
+ if (map) {
+ iommu_create_mapping(vm->iommu, gpa, hpa, sz);
+ iommu_remove_mapping(host_domain, hpa, sz);
+ } else {
+ iommu_remove_mapping(vm->iommu, gpa, sz);
+ iommu_create_mapping(host_domain, hpa, hpa, sz);
+ }
+
+ gpa += PAGE_SIZE;
+ }
}
- if (error) {
- vm_free_mem_seg(vm, seg);
+ /*
+ * Invalidate the cached translations associated with the domain
+ * from which pages were removed.
+ */
+ if (map)
+ iommu_invalidate_tlb(host_domain);
+ else
+ iommu_invalidate_tlb(vm->iommu);
+}
+
+#define vm_iommu_unmap(vm) vm_iommu_modify((vm), FALSE)
+#define vm_iommu_map(vm) vm_iommu_modify((vm), TRUE)
+
+int
+vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
+{
+ int error;
+
+ error = ppt_unassign_device(vm, bus, slot, func);
+ if (error)
return (error);
+
+ if (ppt_num_devices(vm) == 0) {
+ vm_iommu_unmap(vm);
+ vm_gpa_unwire(vm);
}
+ return (0);
+}
+
+int
+vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
+{
+ int error;
+ vm_paddr_t maxaddr;
/*
- * Invalidate cached translations associated with 'host_domain' since
- * we have now moved some pages from it.
+ * Virtual machines with pci passthru devices get special treatment:
+ * - the guest physical memory is wired
+ * - the iommu is programmed to do the 'gpa' to 'hpa' translation
+ *
+ * We need to do this before the first pci passthru device is attached.
*/
- iommu_invalidate_tlb(host_domain);
+ if (ppt_num_devices(vm) == 0) {
+ KASSERT(vm->iommu == NULL,
+ ("vm_assign_pptdev: iommu must be NULL"));
+ maxaddr = vmm_mem_maxaddr();
+ vm->iommu = iommu_create_domain(maxaddr);
- vm->num_mem_segs++;
+ error = vm_gpa_wire(vm);
+ if (error)
+ return (error);
- return (0);
+ vm_iommu_map(vm);
+ }
+
+ error = ppt_assign_device(vm, bus, slot, func);
+ return (error);
}
-vm_paddr_t
-vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len)
+void *
+vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
+ void **cookie)
{
- vm_paddr_t nextpage;
+ int count, pageoff;
+ vm_page_t m;
+
+ pageoff = gpa & PAGE_MASK;
+ if (len > PAGE_SIZE - pageoff)
+ panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
- nextpage = rounddown(gpa + PAGE_SIZE, PAGE_SIZE);
- if (len > nextpage - gpa)
- panic("vm_gpa2hpa: invalid gpa/len: 0x%016lx/%lu", gpa, len);
+ count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
+ trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
- return (VMMMAP_GET(vm->cookie, gpa));
+ if (count == 1) {
+ *cookie = m;
+ return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
+ } else {
+ *cookie = NULL;
+ return (NULL);
+ }
+}
+
+void
+vm_gpa_release(void *cookie)
+{
+ vm_page_t m = cookie;
+
+ vm_page_lock(m);
+ vm_page_unhold(m);
+ vm_page_unlock(m);
}
int
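The vm_gpa_hold()/vm_gpa_release() pair above replaces the old vm_gpa2hpa() lookup: a caller holds one page of guest memory, uses the returned direct-map pointer, and drops the hold when done. A minimal in-kernel sketch of that access pattern (the helper name copy_from_guest and its error handling are illustrative only, not part of this change):

/*
 * Illustrative only: copy 'len' bytes starting at guest physical address
 * 'gpa' into 'buf'.  The request must not cross a page boundary, matching
 * the single-page contract enforced by vm_gpa_hold().
 */
static int
copy_from_guest(struct vm *vm, vm_paddr_t gpa, void *buf, size_t len)
{
	void *vaddr, *cookie;

	if (len > PAGE_SIZE - (gpa & PAGE_MASK))
		return (EINVAL);	/* would span two pages */

	vaddr = vm_gpa_hold(vm, gpa, len, VM_PROT_READ, &cookie);
	if (vaddr == NULL)
		return (EFAULT);	/* gpa is not backed by a segment */

	bcopy(vaddr, buf, len);		/* pointer lies in the direct map */
	vm_gpa_release(cookie);		/* drop the page hold */

	return (0);
}

The same hold/copy/release sequence appears below in vmmdev_rw() and in the instruction fetch path of vmm_instruction_emul.c.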
@@ -508,7 +639,9 @@ vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
for (i = 0; i < vm->num_mem_segs; i++) {
if (gpabase == vm->mem_segs[i].gpa) {
- *seg = vm->mem_segs[i];
+ seg->gpa = vm->mem_segs[i].gpa;
+ seg->len = vm->mem_segs[i].len;
+ seg->wired = vm->mem_segs[i].wired;
return (0);
}
}
@@ -516,6 +649,33 @@ vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
}
int
+vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
+ vm_offset_t *offset, struct vm_object **object)
+{
+ int i;
+ size_t seg_len;
+ vm_paddr_t seg_gpa;
+ vm_object_t seg_obj;
+
+ for (i = 0; i < vm->num_mem_segs; i++) {
+ if ((seg_obj = vm->mem_segs[i].object) == NULL)
+ continue;
+
+ seg_gpa = vm->mem_segs[i].gpa;
+ seg_len = vm->mem_segs[i].len;
+
+ if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
+ *offset = gpa - seg_gpa;
+ *object = seg_obj;
+ vm_object_reference(seg_obj);
+ return (0);
+ }
+ }
+
+ return (EINVAL);
+}
+
+int
vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
{
@@ -633,26 +793,215 @@ save_guest_fpustate(struct vcpu *vcpu)
static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
+static int
+vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
+{
+ int error;
+
+ vcpu_assert_locked(vcpu);
+
+ /*
+ * The following state transitions are allowed:
+ * IDLE -> FROZEN -> IDLE
+ * FROZEN -> RUNNING -> FROZEN
+ * FROZEN -> SLEEPING -> FROZEN
+ */
+ switch (vcpu->state) {
+ case VCPU_IDLE:
+ case VCPU_RUNNING:
+ case VCPU_SLEEPING:
+ error = (newstate != VCPU_FROZEN);
+ break;
+ case VCPU_FROZEN:
+ error = (newstate == VCPU_FROZEN);
+ break;
+ default:
+ error = 1;
+ break;
+ }
+
+ if (error == 0)
+ vcpu->state = newstate;
+ else
+ error = EBUSY;
+
+ return (error);
+}
+
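The transition rules in the comment above reduce to a single invariant: every state change passes through FROZEN. A sketch of the same check as a standalone predicate (illustrative only; the driver keeps this logic inside vcpu_set_state_locked() as shown above):

static bool
vcpu_transition_ok(enum vcpu_state from, enum vcpu_state to)
{
	if (from == VCPU_FROZEN)
		return (to != VCPU_FROZEN);	/* FROZEN -> IDLE/RUNNING/SLEEPING */
	return (to == VCPU_FROZEN);		/* all other states must freeze first */
}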
+static void
+vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
+{
+ int error;
+
+ if ((error = vcpu_set_state(vm, vcpuid, newstate)) != 0)
+ panic("Error %d setting state to %d\n", error, newstate);
+}
+
+static void
+vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
+{
+ int error;
+
+ if ((error = vcpu_set_state_locked(vcpu, newstate)) != 0)
+ panic("Error %d setting state to %d", error, newstate);
+}
+
+/*
+ * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
+ */
+static int
+vm_handle_hlt(struct vm *vm, int vcpuid, boolean_t *retu)
+{
+ struct vcpu *vcpu;
+ int sleepticks, t;
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ vcpu_lock(vcpu);
+
+ /*
+ * Figure out the number of host ticks until the next apic
+ * timer interrupt in the guest.
+ */
+ sleepticks = lapic_timer_tick(vm, vcpuid);
+
+ /*
+ * If the guest local apic timer is disabled then sleep for
+ * a long time but not forever.
+ */
+ if (sleepticks < 0)
+ sleepticks = hz;
+
+ /*
+ * Do a final check for pending NMI or interrupts before
+ * really putting this thread to sleep.
+ *
+ * These interrupts could have happened any time after we
+ * returned from VMRUN() and before we grabbed the vcpu lock.
+ */
+ if (!vm_nmi_pending(vm, vcpuid) && lapic_pending_intr(vm, vcpuid) < 0) {
+ if (sleepticks <= 0)
+ panic("invalid sleepticks %d", sleepticks);
+ t = ticks;
+ vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
+ msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks);
+ vcpu_require_state_locked(vcpu, VCPU_FROZEN);
+ vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
+ }
+ vcpu_unlock(vcpu);
+
+ return (0);
+}
+
+static int
+vm_handle_paging(struct vm *vm, int vcpuid, boolean_t *retu)
+{
+ int rv, ftype;
+ struct vm_map *map;
+ struct vcpu *vcpu;
+ struct vm_exit *vme;
+
+ vcpu = &vm->vcpu[vcpuid];
+ vme = &vcpu->exitinfo;
+
+ ftype = vme->u.paging.fault_type;
+ KASSERT(ftype == VM_PROT_READ ||
+ ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
+ ("vm_handle_paging: invalid fault_type %d", ftype));
+
+ if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
+ rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
+ vme->u.paging.gpa, ftype);
+ if (rv == 0)
+ goto done;
+ }
+
+ map = &vm->vmspace->vm_map;
+ rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
+
+ VMM_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, ftype = %d",
+ rv, vme->u.paging.gpa, ftype);
+
+ if (rv != KERN_SUCCESS)
+ return (EFAULT);
+done:
+ /* restart execution at the faulting instruction */
+ vme->inst_length = 0;
+
+ return (0);
+}
+
+static int
+vm_handle_inst_emul(struct vm *vm, int vcpuid, boolean_t *retu)
+{
+ struct vie *vie;
+ struct vcpu *vcpu;
+ struct vm_exit *vme;
+ int error, inst_length;
+ uint64_t rip, gla, gpa, cr3;
+
+ vcpu = &vm->vcpu[vcpuid];
+ vme = &vcpu->exitinfo;
+
+ rip = vme->rip;
+ inst_length = vme->inst_length;
+
+ gla = vme->u.inst_emul.gla;
+ gpa = vme->u.inst_emul.gpa;
+ cr3 = vme->u.inst_emul.cr3;
+ vie = &vme->u.inst_emul.vie;
+
+ vie_init(vie);
+
+ /* Fetch, decode and emulate the faulting instruction */
+ if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3, vie) != 0)
+ return (EFAULT);
+
+ if (vmm_decode_instruction(vm, vcpuid, gla, vie) != 0)
+ return (EFAULT);
+
+ /* return to userland unless this is a local apic access */
+ if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE) {
+ *retu = TRUE;
+ return (0);
+ }
+
+ error = vmm_emulate_instruction(vm, vcpuid, gpa, vie,
+ lapic_mmio_read, lapic_mmio_write, 0);
+
+ /* return to userland to spin up the AP */
+ if (error == 0 && vme->exitcode == VM_EXITCODE_SPINUP_AP)
+ *retu = TRUE;
+
+ return (error);
+}
+
int
vm_run(struct vm *vm, struct vm_run *vmrun)
{
- int error, vcpuid, sleepticks, t;
+ int error, vcpuid;
struct vcpu *vcpu;
struct pcb *pcb;
uint64_t tscval, rip;
struct vm_exit *vme;
+ boolean_t retu;
+ pmap_t pmap;
vcpuid = vmrun->cpuid;
if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
return (EINVAL);
+ pmap = vmspace_pmap(vm->vmspace);
vcpu = &vm->vcpu[vcpuid];
- vme = &vmrun->vm_exit;
+ vme = &vcpu->exitinfo;
rip = vmrun->rip;
restart:
critical_enter();
+ KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
+ ("vm_run: absurd pm_active"));
+
tscval = rdtsc();
pcb = PCPU_GET(curpcb);
@@ -661,62 +1010,44 @@ restart:
restore_guest_msrs(vm, vcpuid);
restore_guest_fpustate(vcpu);
+ vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
vcpu->hostcpu = curcpu;
- error = VMRUN(vm->cookie, vcpuid, rip);
+ error = VMRUN(vm->cookie, vcpuid, rip, pmap);
vcpu->hostcpu = NOCPU;
+ vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
save_guest_fpustate(vcpu);
restore_host_msrs(vm, vcpuid);
vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
- /* copy the exit information */
- bcopy(&vcpu->exitinfo, vme, sizeof(struct vm_exit));
-
critical_exit();
- /*
- * Oblige the guest's desire to 'hlt' by sleeping until the vcpu
- * is ready to run.
- */
- if (error == 0 && vme->exitcode == VM_EXITCODE_HLT) {
- vcpu_lock(vcpu);
-
- /*
- * Figure out the number of host ticks until the next apic
- * timer interrupt in the guest.
- */
- sleepticks = lapic_timer_tick(vm, vcpuid);
-
- /*
- * If the guest local apic timer is disabled then sleep for
- * a long time but not forever.
- */
- if (sleepticks < 0)
- sleepticks = hz;
-
- /*
- * Do a final check for pending NMI or interrupts before
- * really putting this thread to sleep.
- *
- * These interrupts could have happened any time after we
- * returned from VMRUN() and before we grabbed the vcpu lock.
- */
- if (!vm_nmi_pending(vm, vcpuid) &&
- lapic_pending_intr(vm, vcpuid) < 0) {
- if (sleepticks <= 0)
- panic("invalid sleepticks %d", sleepticks);
- t = ticks;
- msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks);
- vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
+ if (error == 0) {
+ retu = FALSE;
+ switch (vme->exitcode) {
+ case VM_EXITCODE_HLT:
+ error = vm_handle_hlt(vm, vcpuid, &retu);
+ break;
+ case VM_EXITCODE_PAGING:
+ error = vm_handle_paging(vm, vcpuid, &retu);
+ break;
+ case VM_EXITCODE_INST_EMUL:
+ error = vm_handle_inst_emul(vm, vcpuid, &retu);
+ break;
+ default:
+ retu = TRUE; /* handled in userland */
+ break;
}
+ }
- vcpu_unlock(vcpu);
-
+ if (error == 0 && retu == FALSE) {
rip = vme->rip + vme->inst_length;
goto restart;
}
+ /* copy the exit information */
+ bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
return (error);
}
@@ -869,7 +1200,7 @@ vm_iommu_domain(struct vm *vm)
}
int
-vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state state)
+vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
int error;
struct vcpu *vcpu;
@@ -880,20 +1211,7 @@ vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state state)
vcpu = &vm->vcpu[vcpuid];
vcpu_lock(vcpu);
-
- /*
- * The following state transitions are allowed:
- * IDLE -> RUNNING -> IDLE
- * IDLE -> CANNOT_RUN -> IDLE
- */
- if ((vcpu->state == VCPU_IDLE && state != VCPU_IDLE) ||
- (vcpu->state != VCPU_IDLE && state == VCPU_IDLE)) {
- error = 0;
- vcpu->state = state;
- } else {
- error = EBUSY;
- }
-
+ error = vcpu_set_state_locked(vcpu, newstate);
vcpu_unlock(vcpu);
return (error);
@@ -979,16 +1297,7 @@ vm_interrupt_hostcpu(struct vm *vm, int vcpuid)
vcpu_lock(vcpu);
hostcpu = vcpu->hostcpu;
if (hostcpu == NOCPU) {
- /*
- * If the vcpu is 'RUNNING' but without a valid 'hostcpu' then
- * the host thread must be sleeping waiting for an event to
- * kick the vcpu out of 'hlt'.
- *
- * XXX this is racy because the condition exists right before
- * and after calling VMRUN() in vm_run(). The wakeup() is
- * benign in this case.
- */
- if (vcpu->state == VCPU_RUNNING)
+ if (vcpu->state == VCPU_SLEEPING)
wakeup_one(vcpu);
} else {
if (vcpu->state != VCPU_RUNNING)
@@ -998,3 +1307,10 @@ vm_interrupt_hostcpu(struct vm *vm, int vcpuid)
}
vcpu_unlock(vcpu);
}
+
+struct vmspace *
+vm_get_vmspace(struct vm *vm)
+{
+
+ return (vm->vmspace);
+}
diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c
index 8c03b3e..2f57f82 100644
--- a/sys/amd64/vmm/vmm_dev.c
+++ b/sys/amd64/vmm/vmm_dev.c
@@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$");
#include <vm/vm.h>
#include <vm/pmap.h>
+#include <vm/vm_map.h>
#include <machine/pmap.h>
#include <machine/vmparam.h>
@@ -95,8 +96,9 @@ vmmdev_lookup2(struct cdev *cdev)
static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
- int error, off, c;
- vm_paddr_t hpa, gpa;
+ int error, off, c, prot;
+ vm_paddr_t gpa;
+ void *hpa, *cookie;
struct vmmdev_softc *sc;
static char zerobuf[PAGE_SIZE];
@@ -107,6 +109,7 @@ vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
if (sc == NULL)
error = ENXIO;
+ prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
while (uio->uio_resid > 0 && error == 0) {
gpa = uio->uio_offset;
off = gpa & PAGE_MASK;
@@ -120,14 +123,16 @@ vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
* Since this device does not support lseek(2), dd(1) will
* read(2) blocks of data to simulate the lseek(2).
*/
- hpa = vm_gpa2hpa(sc->vm, gpa, c);
- if (hpa == (vm_paddr_t)-1) {
+ hpa = vm_gpa_hold(sc->vm, gpa, c, prot, &cookie);
+ if (hpa == NULL) {
if (uio->uio_rw == UIO_READ)
error = uiomove(zerobuf, c, uio);
else
error = EFAULT;
- } else
- error = uiomove((void *)PHYS_TO_DMAP(hpa), c, uio);
+ } else {
+ error = uiomove(hpa, c, uio);
+ vm_gpa_release(cookie);
+ }
}
mtx_unlock(&vmmdev_mtx);
@@ -139,7 +144,6 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
struct thread *td)
{
int error, vcpu, state_changed;
- enum vcpu_state new_state;
struct vmmdev_softc *sc;
struct vm_memory_segment *seg;
struct vm_register *vmreg;
@@ -156,6 +160,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
struct vm_stats *vmstats;
struct vm_stat_desc *statdesc;
struct vm_x2apic *x2apic;
+ struct vm_gpa_pte *gpapte;
sc = vmmdev_lookup2(cdev);
if (sc == NULL)
@@ -189,12 +194,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
goto done;
}
- if (cmd == VM_RUN)
- new_state = VCPU_RUNNING;
- else
- new_state = VCPU_CANNOT_RUN;
-
- error = vcpu_set_state(sc->vm, vcpu, new_state);
+ error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN);
if (error)
goto done;
@@ -211,7 +211,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
*/
error = 0;
for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) {
- error = vcpu_set_state(sc->vm, vcpu, VCPU_CANNOT_RUN);
+ error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN);
if (error)
break;
}
@@ -271,13 +271,13 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
break;
case VM_BIND_PPTDEV:
pptdev = (struct vm_pptdev *)data;
- error = ppt_assign_device(sc->vm, pptdev->bus, pptdev->slot,
- pptdev->func);
+ error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
+ pptdev->func);
break;
case VM_UNBIND_PPTDEV:
pptdev = (struct vm_pptdev *)data;
- error = ppt_unassign_device(sc->vm, pptdev->bus, pptdev->slot,
- pptdev->func);
+ error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
+ pptdev->func);
break;
case VM_INJECT_EVENT:
vmevent = (struct vm_event *)data;
@@ -348,6 +348,12 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
error = vm_get_x2apic_state(sc->vm,
x2apic->cpuid, &x2apic->state);
break;
+ case VM_GET_GPA_PMAP:
+ gpapte = (struct vm_gpa_pte *)data;
+ pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
+ gpapte->gpa, gpapte->pte, &gpapte->ptenum);
+ error = 0;
+ break;
default:
error = ENOTTY;
break;
@@ -361,25 +367,25 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
}
done:
+ /* Make sure that no handler returns a bogus value like ERESTART */
+ KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error));
return (error);
}
static int
-vmmdev_mmap(struct cdev *cdev, vm_ooffset_t offset, vm_paddr_t *paddr,
- int nprot, vm_memattr_t *memattr)
+vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset,
+ vm_size_t size, struct vm_object **object, int nprot)
{
int error;
struct vmmdev_softc *sc;
- error = -1;
mtx_lock(&vmmdev_mtx);
sc = vmmdev_lookup2(cdev);
- if (sc != NULL && (nprot & PROT_EXEC) == 0) {
- *paddr = vm_gpa2hpa(sc->vm, (vm_paddr_t)offset, PAGE_SIZE);
- if (*paddr != (vm_paddr_t)-1)
- error = 0;
- }
+ if (sc != NULL && (nprot & PROT_EXEC) == 0)
+ error = vm_get_memobj(sc->vm, *offset, size, offset, object);
+ else
+ error = EINVAL;
mtx_unlock(&vmmdev_mtx);
@@ -446,7 +452,7 @@ static struct cdevsw vmmdevsw = {
.d_name = "vmmdev",
.d_version = D_VERSION,
.d_ioctl = vmmdev_ioctl,
- .d_mmap = vmmdev_mmap,
+ .d_mmap_single = vmmdev_mmap_single,
.d_read = vmmdev_rw,
.d_write = vmmdev_rw,
};
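Switching the cdev from d_mmap to d_mmap_single means a mapping request is now satisfied by handing back the VM object that backs the guest segment, with the guest physical address doubling as the file offset. A minimal userland sketch, assuming a hypothetical descriptor 'vmfd' already opened on the VM's /dev/vmm node and a memory segment that actually covers 'gpa':

#include <sys/types.h>
#include <sys/mman.h>
#include <err.h>
#include <stdint.h>

/*
 * Illustrative only: map 'len' bytes of guest physical memory starting at
 * 'gpa' into the current process through the vmm device node.
 */
static void *
map_guest_mem(int vmfd, uint64_t gpa, size_t len)
{
	void *p;

	/* PROT_EXEC is refused by vmmdev_mmap_single(), so don't ask for it. */
	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, vmfd, gpa);
	if (p == MAP_FAILED)
		err(1, "mmap of guest memory at %#lx failed", (unsigned long)gpa);
	return (p);
}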
diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c
index 94aa492..2b4b59b 100644
--- a/sys/amd64/vmm/vmm_instruction_emul.c
+++ b/sys/amd64/vmm/vmm_instruction_emul.c
@@ -465,7 +465,7 @@ vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
}
#ifdef _KERNEL
-static void
+void
vie_init(struct vie *vie)
{
@@ -479,9 +479,9 @@ static int
gla2gpa(struct vm *vm, uint64_t gla, uint64_t ptpphys,
uint64_t *gpa, uint64_t *gpaend)
{
- vm_paddr_t hpa;
int nlevels, ptpshift, ptpindex;
uint64_t *ptpbase, pte, pgsize;
+ void *cookie;
/*
* XXX assumes 64-bit guest with 4 page walk levels
@@ -491,18 +491,19 @@ gla2gpa(struct vm *vm, uint64_t gla, uint64_t ptpphys,
/* Zero out the lower 12 bits and the upper 12 bits */
ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
- hpa = vm_gpa2hpa(vm, ptpphys, PAGE_SIZE);
- if (hpa == -1)
+ ptpbase = vm_gpa_hold(vm, ptpphys, PAGE_SIZE, VM_PROT_READ,
+ &cookie);
+ if (ptpbase == NULL)
goto error;
- ptpbase = (uint64_t *)PHYS_TO_DMAP(hpa);
-
ptpshift = PAGE_SHIFT + nlevels * 9;
ptpindex = (gla >> ptpshift) & 0x1FF;
pgsize = 1UL << ptpshift;
pte = ptpbase[ptpindex];
+ vm_gpa_release(cookie);
+
if ((pte & PG_V) == 0)
goto error;
@@ -530,18 +531,18 @@ int
vmm_fetch_instruction(struct vm *vm, int cpuid, uint64_t rip, int inst_length,
uint64_t cr3, struct vie *vie)
{
- int n, err;
- uint64_t hpa, gpa, gpaend, off;
+ int n, err, prot;
+ uint64_t gpa, gpaend, off;
+ void *hpa, *cookie;
/*
* XXX cache previously fetched instructions using 'rip' as the tag
*/
+ prot = VM_PROT_READ | VM_PROT_EXECUTE;
if (inst_length > VIE_INST_SIZE)
panic("vmm_fetch_instruction: invalid length %d", inst_length);
- vie_init(vie);
-
/* Copy the instruction into 'vie' */
while (vie->num_valid < inst_length) {
err = gla2gpa(vm, rip, cr3, &gpa, &gpaend);
@@ -551,11 +552,12 @@ vmm_fetch_instruction(struct vm *vm, int cpuid, uint64_t rip, int inst_length,
off = gpa & PAGE_MASK;
n = min(inst_length - vie->num_valid, PAGE_SIZE - off);
- hpa = vm_gpa2hpa(vm, gpa, n);
- if (hpa == -1)
+ if ((hpa = vm_gpa_hold(vm, gpa, n, prot, &cookie)) == NULL)
break;
- bcopy((void *)PHYS_TO_DMAP(hpa), &vie->inst[vie->num_valid], n);
+ bcopy(hpa, &vie->inst[vie->num_valid], n);
+
+ vm_gpa_release(cookie);
rip += n;
vie->num_valid += n;
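The guest page-table walk in gla2gpa() is driven purely by shift arithmetic: each of the four levels resolves 9 bits of the guest linear address above the 12-bit page offset. A worked sketch of the values the loop computes, assuming 'nlevels' steps down 3, 2, 1, 0 across the walk as in the surrounding code:

/*
 * ptpshift = PAGE_SHIFT + nlevels * 9   ->  39, 30, 21, 12
 * ptpindex = (gla >> ptpshift) & 0x1ff  ->  PML4, PDPT, PD, PT index
 * pgsize   = 1UL << ptpshift            ->  512 GB, 1 GB, 2 MB, 4 KB
 *
 * e.g. for gla = 0x00007f1234567000 the PML4 index is
 * (gla >> 39) & 0x1ff = 254 and the PDPT index is (gla >> 30) & 0x1ff = 72.
 */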
diff --git a/sys/amd64/vmm/vmm_mem.c b/sys/amd64/vmm/vmm_mem.c
index 04f99b1..1019f2b 100644
--- a/sys/amd64/vmm/vmm_mem.c
+++ b/sys/amd64/vmm/vmm_mem.c
@@ -30,40 +30,24 @@
__FBSDID("$FreeBSD$");
#include <sys/param.h>
-#include <sys/lock.h>
-#include <sys/mutex.h>
-#include <sys/linker.h>
#include <sys/systm.h>
#include <sys/malloc.h>
-#include <sys/kernel.h>
-#include <sys/sysctl.h>
+#include <sys/sglist.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
#include <vm/vm.h>
+#include <vm/vm_param.h>
#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
#include <vm/vm_page.h>
-#include <vm/vm_pageout.h>
+#include <vm/vm_pager.h>
#include <machine/md_var.h>
-#include <machine/metadata.h>
-#include <machine/pc/bios.h>
-#include <machine/vmparam.h>
-#include <machine/pmap.h>
-#include "vmm_util.h"
#include "vmm_mem.h"
-SYSCTL_DECL(_hw_vmm);
-
-static u_long pages_allocated;
-SYSCTL_ULONG(_hw_vmm, OID_AUTO, pages_allocated, CTLFLAG_RD,
- &pages_allocated, 0, "4KB pages allocated");
-
-static void
-update_pages_allocated(int howmany)
-{
- pages_allocated += howmany; /* XXX locking? */
-}
-
int
vmm_mem_init(void)
{
@@ -71,60 +55,95 @@ vmm_mem_init(void)
return (0);
}
-vm_paddr_t
-vmm_mem_alloc(size_t size)
+vm_object_t
+vmm_mmio_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len,
+ vm_paddr_t hpa)
{
- int flags;
- vm_page_t m;
- vm_paddr_t pa;
-
- if (size != PAGE_SIZE)
- panic("vmm_mem_alloc: invalid allocation size %lu", size);
+ int error;
+ vm_object_t obj;
+ struct sglist *sg;
- flags = VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
- VM_ALLOC_ZERO;
+ sg = sglist_alloc(1, M_WAITOK);
+ error = sglist_append_phys(sg, hpa, len);
+ KASSERT(error == 0, ("error %d appending physaddr to sglist", error));
- while (1) {
+ obj = vm_pager_allocate(OBJT_SG, sg, len, VM_PROT_RW, 0, NULL);
+ if (obj != NULL) {
/*
- * XXX need policy to determine when to back off the allocation
+ * VT-x ignores the MTRR settings when figuring out the
+ * memory type for translations obtained through EPT.
+ *
+ * Therefore we explicitly force the pages provided by
+ * this object to be mapped as uncacheable.
*/
- m = vm_page_alloc(NULL, 0, flags);
- if (m == NULL)
- VM_WAIT;
- else
- break;
+ VM_OBJECT_WLOCK(obj);
+ error = vm_object_set_memattr(obj, VM_MEMATTR_UNCACHEABLE);
+ VM_OBJECT_WUNLOCK(obj);
+ if (error != KERN_SUCCESS) {
+ panic("vmm_mmio_alloc: vm_object_set_memattr error %d",
+ error);
+ }
+ error = vm_map_find(&vmspace->vm_map, obj, 0, &gpa, len, 0,
+ VMFS_NO_SPACE, VM_PROT_RW, VM_PROT_RW, 0);
+ if (error != KERN_SUCCESS) {
+ vm_object_deallocate(obj);
+ obj = NULL;
+ }
}
- pa = VM_PAGE_TO_PHYS(m);
-
- if ((m->flags & PG_ZERO) == 0)
- pagezero((void *)PHYS_TO_DMAP(pa));
- m->valid = VM_PAGE_BITS_ALL;
-
- update_pages_allocated(1);
-
- return (pa);
+ /*
+ * Drop the reference on the sglist.
+ *
+ * If the scatter/gather object was successfully allocated then it
+ * has incremented the reference count on the sglist. Dropping the
+ * initial reference count ensures that the sglist will be freed
+ * when the object is deallocated.
+ *
+ * If the object could not be allocated then we end up freeing the
+ * sglist.
+ */
+ sglist_free(sg);
+
+ return (obj);
}
void
-vmm_mem_free(vm_paddr_t base, size_t length)
+vmm_mmio_free(struct vmspace *vmspace, vm_paddr_t gpa, size_t len)
{
- vm_page_t m;
- if (base & PAGE_MASK) {
- panic("vmm_mem_free: base 0x%0lx must be aligned on a "
- "0x%0x boundary\n", base, PAGE_SIZE);
+ vm_map_remove(&vmspace->vm_map, gpa, gpa + len);
+}
+
+vm_object_t
+vmm_mem_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len)
+{
+ int error;
+ vm_object_t obj;
+
+ if (gpa & PAGE_MASK)
+ panic("vmm_mem_alloc: invalid gpa %#lx", gpa);
+
+ if (len == 0 || (len & PAGE_MASK) != 0)
+ panic("vmm_mem_alloc: invalid allocation size %lu", len);
+
+ obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
+ if (obj != NULL) {
+ error = vm_map_find(&vmspace->vm_map, obj, 0, &gpa, len, 0,
+ VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0);
+ if (error != KERN_SUCCESS) {
+ vm_object_deallocate(obj);
+ obj = NULL;
+ }
}
- if (length != PAGE_SIZE)
- panic("vmm_mem_free: invalid length %lu", length);
+ return (obj);
+}
- m = PHYS_TO_VM_PAGE(base);
- m->wire_count--;
- vm_page_free(m);
- atomic_subtract_int(&cnt.v_wire_count, 1);
+void
+vmm_mem_free(struct vmspace *vmspace, vm_paddr_t gpa, size_t len)
+{
- update_pages_allocated(-1);
+ vm_map_remove(&vmspace->vm_map, gpa, gpa + len);
}
vm_paddr_t
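The comment about dropping the sglist reference in vmm_mmio_alloc() is easier to follow as a lifetime sketch (illustrative, assuming the usual sglist(9) convention of a reference count of one at allocation):

/*
 * sglist_alloc(1, M_WAITOK)            refcount = 1 (our reference)
 * vm_pager_allocate(OBJT_SG, sg, ...)  refcount = 2 (the object's reference)
 * sglist_free(sg)                      refcount = 1 (object keeps it alive)
 * vm_object_deallocate(obj), later     refcount = 0, sglist is freed
 *
 * If the pager allocation fails, sglist_free() releases the only reference
 * and the list is freed immediately.
 */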
diff --git a/sys/amd64/vmm/vmm_mem.h b/sys/amd64/vmm/vmm_mem.h
index 7d45c74..a375070 100644
--- a/sys/amd64/vmm/vmm_mem.h
+++ b/sys/amd64/vmm/vmm_mem.h
@@ -29,9 +29,15 @@
#ifndef _VMM_MEM_H_
#define _VMM_MEM_H_
+struct vmspace;
+struct vm_object;
+
int vmm_mem_init(void);
-vm_paddr_t vmm_mem_alloc(size_t size);
-void vmm_mem_free(vm_paddr_t start, size_t size);
+struct vm_object *vmm_mem_alloc(struct vmspace *, vm_paddr_t gpa, size_t size);
+struct vm_object *vmm_mmio_alloc(struct vmspace *, vm_paddr_t gpa, size_t len,
+ vm_paddr_t hpa);
+void vmm_mem_free(struct vmspace *, vm_paddr_t gpa, size_t size);
+void vmm_mmio_free(struct vmspace *, vm_paddr_t gpa, size_t size);
vm_paddr_t vmm_mem_maxaddr(void);
#endif
diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c
index 5550f77..f37177f 100644
--- a/usr.sbin/bhyve/bhyverun.c
+++ b/usr.sbin/bhyve/bhyverun.c
@@ -101,7 +101,7 @@ struct bhyvestats {
uint64_t vmexit_hlt;
uint64_t vmexit_pause;
uint64_t vmexit_mtrap;
- uint64_t vmexit_paging;
+ uint64_t vmexit_inst_emul;
uint64_t cpu_switch_rotate;
uint64_t cpu_switch_direct;
int io_reset;
@@ -208,14 +208,12 @@ fbsdrun_addcpu(struct vmctx *ctx, int vcpu, uint64_t rip)
vmexit[vcpu].rip = rip;
vmexit[vcpu].inst_length = 0;
- if (vcpu == BSP) {
- mt_vmm_info[vcpu].mt_ctx = ctx;
- mt_vmm_info[vcpu].mt_vcpu = vcpu;
-
- error = pthread_create(&mt_vmm_info[vcpu].mt_thr, NULL,
- fbsdrun_start_thread, &mt_vmm_info[vcpu]);
- assert(error == 0);
- }
+ mt_vmm_info[vcpu].mt_ctx = ctx;
+ mt_vmm_info[vcpu].mt_vcpu = vcpu;
+
+ error = pthread_create(&mt_vmm_info[vcpu].mt_thr, NULL,
+ fbsdrun_start_thread, &mt_vmm_info[vcpu]);
+ assert(error == 0);
}
static int
@@ -385,13 +383,13 @@ vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
}
static int
-vmexit_paging(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
int err;
- stats.vmexit_paging++;
+ stats.vmexit_inst_emul++;
- err = emulate_mem(ctx, *pvcpu, vmexit->u.paging.gpa,
- &vmexit->u.paging.vie);
+ err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa,
+ &vmexit->u.inst_emul.vie);
if (err) {
if (err == EINVAL) {
@@ -400,7 +398,7 @@ vmexit_paging(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
vmexit->rip);
} else if (err == ESRCH) {
fprintf(stderr, "Unhandled memory access to 0x%lx\n",
- vmexit->u.paging.gpa);
+ vmexit->u.inst_emul.gpa);
}
return (VMEXIT_ABORT);
@@ -416,7 +414,7 @@ static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
[VM_EXITCODE_RDMSR] = vmexit_rdmsr,
[VM_EXITCODE_WRMSR] = vmexit_wrmsr,
[VM_EXITCODE_MTRAP] = vmexit_mtrap,
- [VM_EXITCODE_PAGING] = vmexit_paging,
+ [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul,
[VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap,
};
diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c
index 626dfdf..a95f261 100644
--- a/usr.sbin/bhyve/pci_emul.c
+++ b/usr.sbin/bhyve/pci_emul.c
@@ -1048,7 +1048,7 @@ init_pci(struct vmctx *ctx)
* Accesses to memory addresses that are not allocated to system
* memory or PCI devices return 0xff's.
*/
- error = vm_get_memory_seg(ctx, 0, &lowmem);
+ error = vm_get_memory_seg(ctx, 0, &lowmem, NULL);
assert(error == 0);
memset(&memp, 0, sizeof(struct mem_range));
diff --git a/usr.sbin/bhyve/rtc.c b/usr.sbin/bhyve/rtc.c
index e2b9f30..c1a84d74 100644
--- a/usr.sbin/bhyve/rtc.c
+++ b/usr.sbin/bhyve/rtc.c
@@ -341,14 +341,14 @@ rtc_init(struct vmctx *ctx)
* 0x34/0x35 - 64KB chunks above 16MB, below 4GB
* 0x5b/0x5c/0x5d - 64KB chunks above 4GB
*/
- err = vm_get_memory_seg(ctx, 0, &lomem);
+ err = vm_get_memory_seg(ctx, 0, &lomem, NULL);
assert(err == 0);
lomem = (lomem - m_16MB) / m_64KB;
rtc_nvram[nvoff(RTC_LMEM_LSB)] = lomem;
rtc_nvram[nvoff(RTC_LMEM_MSB)] = lomem >> 8;
- if (vm_get_memory_seg(ctx, m_4GB, &himem) == 0) {
+ if (vm_get_memory_seg(ctx, m_4GB, &himem, NULL) == 0) {
himem /= m_64KB;
rtc_nvram[nvoff(RTC_HMEM_LSB)] = himem;
rtc_nvram[nvoff(RTC_HMEM_SB)] = himem >> 8;
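The NVRAM bytes above store memory sizes as a count of 64 KB chunks, with the low-memory figure measured above 16 MB. A worked example of the arithmetic, assuming a hypothetical guest with 1 GB below 4 GB and 2 GB above 4 GB (values are illustrative):

/*
 * lowmem = 1 GB:
 *   lomem = (1024 MB - 16 MB) / 64 KB = 16128 = 0x3f00
 *   RTC_LMEM_LSB = 0x00, RTC_LMEM_MSB = 0x3f
 *
 * highmem = 2 GB:
 *   himem = 2048 MB / 64 KB = 32768 = 0x8000
 *   RTC_HMEM_LSB = 0x00, RTC_HMEM_SB = 0x80
 */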
diff --git a/usr.sbin/bhyvectl/bhyvectl.c b/usr.sbin/bhyvectl/bhyvectl.c
index 438d01c..d6b32b8 100644
--- a/usr.sbin/bhyvectl/bhyvectl.c
+++ b/usr.sbin/bhyvectl/bhyvectl.c
@@ -188,12 +188,13 @@ usage(void)
" [--unassign-pptdev=<bus/slot/func>]\n"
" [--set-mem=<memory in units of MB>]\n"
" [--get-lowmem]\n"
- " [--get-highmem]\n",
+ " [--get-highmem]\n"
+ " [--get-gpa-pmap]\n",
progname);
exit(1);
}
-static int get_stats, getcap, setcap, capval;
+static int get_stats, getcap, setcap, capval, get_gpa_pmap;
static const char *capname;
static int create, destroy, get_lowmem, get_highmem;
static uint64_t memsize;
@@ -377,18 +378,20 @@ enum {
SET_CAP,
CAPNAME,
UNASSIGN_PPTDEV,
+ GET_GPA_PMAP,
};
int
main(int argc, char *argv[])
{
char *vmname;
- int error, ch, vcpu;
- vm_paddr_t gpa;
+ int error, ch, vcpu, ptenum;
+ vm_paddr_t gpa, gpa_pmap;
size_t len;
struct vm_exit vmexit;
- uint64_t ctl, eptp, bm, addr, u64;
+ uint64_t ctl, eptp, bm, addr, u64, pteval[4], *pte;
struct vmctx *ctx;
+ int wired;
uint64_t cr0, cr3, cr4, dr7, rsp, rip, rflags, efer, pat;
uint64_t rax, rbx, rcx, rdx, rsi, rdi, rbp;
@@ -427,6 +430,7 @@ main(int argc, char *argv[])
{ "capname", REQ_ARG, 0, CAPNAME },
{ "unassign-pptdev", REQ_ARG, 0, UNASSIGN_PPTDEV },
{ "setcap", REQ_ARG, 0, SET_CAP },
+ { "get-gpa-pmap", REQ_ARG, 0, GET_GPA_PMAP },
{ "getcap", NO_ARG, &getcap, 1 },
{ "get-stats", NO_ARG, &get_stats, 1 },
{ "get-desc-ds",NO_ARG, &get_desc_ds, 1 },
@@ -666,6 +670,10 @@ main(int argc, char *argv[])
capval = strtoul(optarg, NULL, 0);
setcap = 1;
break;
+ case GET_GPA_PMAP:
+ gpa_pmap = strtoul(optarg, NULL, 0);
+ get_gpa_pmap = 1;
+ break;
case CAPNAME:
capname = optarg;
break;
@@ -819,16 +827,18 @@ main(int argc, char *argv[])
if (!error && (get_lowmem || get_all)) {
gpa = 0;
- error = vm_get_memory_seg(ctx, gpa, &len);
+ error = vm_get_memory_seg(ctx, gpa, &len, &wired);
if (error == 0)
- printf("lowmem\t\t0x%016lx/%ld\n", gpa, len);
+ printf("lowmem\t\t0x%016lx/%ld%s\n", gpa, len,
+ wired ? " wired" : "");
}
if (!error && (get_highmem || get_all)) {
gpa = 4 * GB;
- error = vm_get_memory_seg(ctx, gpa, &len);
+ error = vm_get_memory_seg(ctx, gpa, &len, &wired);
if (error == 0)
- printf("highmem\t\t0x%016lx/%ld\n", gpa, len);
+ printf("highmem\t\t0x%016lx/%ld%s\n", gpa, len,
+ wired ? " wired" : "");
}
if (!error && (get_efer || get_all)) {
@@ -1457,6 +1467,17 @@ main(int argc, char *argv[])
printf("Capability \"%s\" is not available\n", capname);
}
+ if (!error && get_gpa_pmap) {
+ error = vm_get_gpa_pmap(ctx, gpa_pmap, pteval, &ptenum);
+ if (error == 0) {
+ printf("gpa %#lx:", gpa_pmap);
+ pte = &pteval[0];
+ while (ptenum-- > 0)
+ printf(" %#lx", *pte++);
+ printf("\n");
+ }
+ }
+
if (!error && (getcap || get_all)) {
int captype, val, getcaptype;
diff --git a/usr.sbin/bhyveload/bhyveload.c b/usr.sbin/bhyveload/bhyveload.c
index 4cd280c..c4bafd3 100644
--- a/usr.sbin/bhyveload/bhyveload.c
+++ b/usr.sbin/bhyveload/bhyveload.c
@@ -492,8 +492,8 @@ static void
cb_getmem(void *arg, uint64_t *ret_lowmem, uint64_t *ret_highmem)
{
- vm_get_memory_seg(ctx, 0, ret_lowmem);
- vm_get_memory_seg(ctx, 4 * GB, ret_highmem);
+ vm_get_memory_seg(ctx, 0, ret_lowmem, NULL);
+ vm_get_memory_seg(ctx, 4 * GB, ret_highmem, NULL);
}
static const char *