author     alc <alc@FreeBSD.org>    2008-03-27 04:34:17 +0000
committer  alc <alc@FreeBSD.org>    2008-03-27 04:34:17 +0000
commit     2a244be0948ebf54f47b701a69bdc0acbc452299 (patch)
tree       71fd377cd39a489531c67dbd62417bb6bf1cd6d3 /sys
parent     669aee5e4400017ab51274cecbb9a12d7913543b (diff)
MFamd64 with a few changes:
1. Add support for automatic promotion of 4KB page mappings to 2- or 4MB
   page mappings. Automatic promotion can be enabled by setting the
   tunable "vm.pmap.pg_ps_enabled" to a non-zero value; by default,
   automatic promotion is disabled. (A usage note on the tunable follows
   this list.)
   Tested by: kris
2. To date, we have assumed that the TLB will only set the PG_M bit in a
PTE if that PTE has the PG_RW bit set. However, this assumption does
not hold on recent processors from Intel. For example, consider a PTE
that has the PG_RW bit set but the PG_M bit clear. Suppose this PTE
is cached in the TLB and later the PG_RW bit is cleared in the PTE,
but the corresponding TLB entry is not (yet) invalidated.
Historically, upon a write access using this (stale) TLB entry, the
TLB would observe that the PG_RW bit had been cleared and initiate a
page fault, aborting the setting of the PG_M bit in the PTE. Now,
however, P4- and Core2-family processors will set the PG_M bit before
observing that the PG_RW bit is clear and initiating a page fault. In
other words, the write does not occur but the PG_M bit is still set.
The real impact of this difference is not that great. Specifically,
we should no longer assert that any PTE with the PG_M bit set must
also have the PG_RW bit set, and we should ignore the state of the
PG_M bit unless the PG_RW bit is set. (The old and new forms of this
test are sketched below.)
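
Usage note: the sysctl is read-only (CTLFLAG_RD) and the value is fetched
with TUNABLE_INT_FETCH(), so promotion can only be enabled at boot time,
for example from /boot/loader.conf:

    vm.pmap.pg_ps_enabled=1

After boot, the setting and the promotion/demotion counters that this
change adds can be inspected with sysctl(8):

    sysctl vm.pmap.pg_ps_enabled
    sysctl vm.pmap.pde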
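
In code, the change replaces the old assertion-based test with a combined
check of both bits. Condensed from the pmap.c diff below, where tpte is a
PTE value just loaded and cleared, and m is the corresponding vm_page_t:

	/* Before: PG_M was assumed to imply PG_RW. */
	if (tpte & PG_M) {
		KASSERT((tpte & PG_RW),
		    ("modified page not writable"));
		vm_page_dirty(m);
	}

	/* After: the PG_M bit is ignored unless PG_RW is also set. */
	if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
		vm_page_dirty(m);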
Diffstat (limited to 'sys')

 -rw-r--r--  sys/i386/i386/pmap.c     | 1183
 -rw-r--r--  sys/i386/include/pmap.h  |   16

2 files changed, 1108 insertions, 91 deletions
diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index ddcf449..98a2fdf 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -5,7 +5,7 @@ * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. - * Copyright (c) 2005 Alan L. Cox <alc@cs.rice.edu> + * Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu> * All rights reserved. * * This code is derived from software contributed to Berkeley by @@ -112,6 +112,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> +#include <sys/ktr.h> #include <sys/lock.h> #include <sys/malloc.h> #include <sys/mman.h> @@ -135,6 +136,7 @@ __FBSDID("$FreeBSD$"); #include <vm/vm_extern.h> #include <vm/vm_pageout.h> #include <vm/vm_pager.h> +#include <vm/vm_reserv.h> #include <vm/uma.h> #include <machine/cpu.h> @@ -171,6 +173,9 @@ __FBSDID("$FreeBSD$"); #define PV_STAT(x) do { } while (0) #endif +#define pa_index(pa) ((pa) >> PDRSHIFT) +#define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) + /* * Get PDEs and PTEs for user/kernel address space */ @@ -206,10 +211,17 @@ pt_entry_t pg_nx; static uma_zone_t pdptzone; #endif +SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); + +static int pg_ps_enabled; +SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RD, &pg_ps_enabled, 0, + "Are large page mappings enabled?"); + /* * Data for the pv entry allocation mechanism */ static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; +static struct md_page *pv_table; static int shpgperproc = PMAP_SHPGPERPROC; struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ @@ -259,11 +271,29 @@ static struct mtx PMAP2mutex; static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try); - +static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); +static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_page_t m); +static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); +static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); +static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, + vm_offset_t va); + +static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); +static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, + vm_prot_t prot); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte); +static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); +static boolean_t pmap_is_modified_pvh(struct md_page *pvh); +static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va); +static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); +static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, + vm_prot_t prot); +static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, + vm_page_t *free); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, vm_page_t *free); +static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte); static void pmap_remove_page(struct pmap *pmap, vm_offset_t va, vm_page_t *free); static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, @@ -351,6 +381,7 @@ pmap_bootstrap(vm_paddr_t firstaddr) #ifdef PAE kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT); #endif + kernel_pmap->pm_root = NULL; kernel_pmap->pm_active = -1; /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); LIST_INIT(&allpmaps); @@ 
-599,6 +630,22 @@ pmap_ptelist_init(vm_offset_t *head, void *base, int npages) void pmap_init(void) { + vm_page_t mpte; + vm_size_t s; + int i, pv_npg; + + /* + * Initialize the vm page array entries for the kernel pmap's + * page table pages. + */ + for (i = 0; i < nkpt; i++) { + mpte = PHYS_TO_VM_PAGE(PTD[i + KPTDI] & PG_FRAME); + KASSERT(mpte >= vm_page_array && + mpte < &vm_page_array[vm_page_array_size], + ("pmap_init: page table page is out of range")); + mpte->pindex = i + KPTDI; + mpte->phys_addr = PTD[i + KPTDI] & PG_FRAME; + } /* * Initialize the address space (zone) for the pv entries. Set a @@ -611,6 +658,26 @@ pmap_init(void) pv_entry_max = roundup(pv_entry_max, _NPCPV); pv_entry_high_water = 9 * (pv_entry_max / 10); + /* + * Are large page mappings enabled? + */ + TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); + + /* + * Calculate the size of the pv head table for superpages. + */ + for (i = 0; phys_avail[i + 1]; i += 2); + pv_npg = round_4mpage(phys_avail[(i - 2) + 1]) / NBPDR; + + /* + * Allocate memory for the pv head table for superpages. + */ + s = (vm_size_t)(pv_npg * sizeof(struct md_page)); + s = round_page(s); + pv_table = (struct md_page *)kmem_alloc(kernel_map, s); + for (i = 0; i < pv_npg; i++) + TAILQ_INIT(&pv_table[i].pv_list); + pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); pv_chunkbase = (struct pv_chunk *)kmem_alloc_nofault(kernel_map, PAGE_SIZE * pv_maxchunks); @@ -626,12 +693,30 @@ pmap_init(void) } -SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, "Max number of PV entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, "Page share factor per proc"); +SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, + "2/4MB page mapping counters"); + +static u_long pmap_pde_demotions; +SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, + &pmap_pde_demotions, 0, "2/4MB page demotions"); + +static u_long pmap_pde_mappings; +SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, + &pmap_pde_mappings, 0, "2/4MB page mappings"); + +static u_long pmap_pde_p_failures; +SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, + &pmap_pde_p_failures, 0, "2/4MB page promotion failures"); + +static u_long pmap_pde_promotions; +SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, + &pmap_pde_promotions, 0, "2/4MB page promotions"); + /*************************************************** * Low level helper routines..... ***************************************************/ @@ -1154,11 +1239,104 @@ pmap_free_zero_pages(vm_page_t free) while (free != NULL) { m = free; free = m->right; - vm_page_free_zero(m); + /* Preserve the page's PG_ZERO setting. */ + vm_page_free_toq(m); } } /* + * Schedule the specified unused page table page to be freed. Specifically, + * add the page to the specified list of pages that will be released to the + * physical memory manager after the TLB has been updated. + */ +static __inline void +pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO) +{ + + if (set_PG_ZERO) + m->flags |= PG_ZERO; + else + m->flags &= ~PG_ZERO; + m->right = *free; + *free = m; +} + +/* + * Inserts the specified page table page into the specified pmap's collection + * of idle page table pages. Each of a pmap's page table pages is responsible + * for mapping a distinct range of virtual addresses. The pmap's collection is + * ordered by this virtual address range. 
+ */ +static void +pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) +{ + vm_page_t root; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + root = pmap->pm_root; + if (root == NULL) { + mpte->left = NULL; + mpte->right = NULL; + } else { + root = vm_page_splay(mpte->pindex, root); + if (mpte->pindex < root->pindex) { + mpte->left = root->left; + mpte->right = root; + root->left = NULL; + } else if (mpte->pindex == root->pindex) + panic("pmap_insert_pt_page: pindex already inserted"); + else { + mpte->right = root->right; + mpte->left = root; + root->right = NULL; + } + } + pmap->pm_root = mpte; +} + +/* + * Looks for a page table page mapping the specified virtual address in the + * specified pmap's collection of idle page table pages. Returns NULL if there + * is no page table page corresponding to the specified virtual address. + */ +static vm_page_t +pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va) +{ + vm_page_t mpte; + vm_pindex_t pindex = va >> PDRSHIFT; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) { + mpte = vm_page_splay(pindex, mpte); + if ((pmap->pm_root = mpte)->pindex != pindex) + mpte = NULL; + } + return (mpte); +} + +/* + * Removes the specified page table page from the specified pmap's collection + * of idle page table pages. The specified page table page must be a member of + * the pmap's collection. + */ +static void +pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte) +{ + vm_page_t root; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + if (mpte != pmap->pm_root) + vm_page_splay(mpte->pindex, pmap->pm_root); + if (mpte->left == NULL) + root = mpte->right; + else { + root = vm_page_splay(mpte->pindex, mpte->left); + root->right = mpte->right; + } + pmap->pm_root = root; +} + +/* * This routine unholds page table pages, and if the hold count * drops to zero, then it decrements the wire count. */ @@ -1202,8 +1380,7 @@ _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free) * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ - m->right = *free; - *free = m; + pmap_add_delayed_free_list(m, free, TRUE); return 1; } @@ -1234,6 +1411,7 @@ pmap_pinit0(pmap_t pmap) #ifdef PAE pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT); #endif + pmap->pm_root = NULL; pmap->pm_active = 0; PCPU_SET(curpmap, pmap); TAILQ_INIT(&pmap->pm_pvchunk); @@ -1277,7 +1455,10 @@ pmap_pinit(pmap_t pmap) KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), ("pmap_pinit: pdpt above 4g")); #endif + pmap->pm_root = NULL; } + KASSERT(pmap->pm_root == NULL, + ("pmap_pinit: pmap has reserved page table page(s)")); /* * allocate the page directory page(s) @@ -1398,10 +1579,8 @@ retry: * normal 4K page. 
*/ if (ptepa & PG_PS) { - pmap->pm_pdir[ptepindex] = 0; - ptepa = 0; - pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; - pmap_invalidate_all(kernel_pmap); + (void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va); + ptepa = pmap->pm_pdir[ptepindex]; } /* @@ -1535,6 +1714,8 @@ pmap_release(pmap_t pmap) KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); + KASSERT(pmap->pm_root == NULL, + ("pmap_release: pmap has reserved page table page(s)")); pmap_lazyfix(pmap); mtx_lock_spin(&allpmaps_lock); @@ -1716,6 +1897,8 @@ SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_ac static void pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq) { + struct md_page *pvh; + pd_entry_t *pde; pmap_t pmap; pt_entry_t *pte, tpte; pv_entry_t next_pv, pv; @@ -1735,25 +1918,27 @@ pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq) else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) continue; pmap->pm_stats.resident_count--; + pde = pmap_pde(pmap, va); + KASSERT((*pde & PG_PS) == 0, ("pmap_collect: found" + " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, va); tpte = pte_load_clear(pte); KASSERT((tpte & PG_W) == 0, ("pmap_collect: wired pte %#jx", (uintmax_t)tpte)); if (tpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); - if (tpte & PG_M) { - KASSERT((tpte & PG_RW), - ("pmap_collect: modified page not writable: va: %#x, pte: %#jx", - va, (uintmax_t)tpte)); + if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); - } free = NULL; pmap_unuse_pt(pmap, va, &free); pmap_invalidate_page(pmap, va); pmap_free_zero_pages(free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); - if (TAILQ_EMPTY(&m->md.pv_list)) - vm_page_flag_clear(m, PG_WRITEABLE); + if (TAILQ_EMPTY(&m->md.pv_list)) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + if (TAILQ_EMPTY(&pvh->pv_list)) + vm_page_flag_clear(m, PG_WRITEABLE); + } free_pv_entry(pmap, pv); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); @@ -1895,24 +2080,112 @@ retry: return (pv); } -static void -pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) +static __inline pv_entry_t +pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; - PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); - TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { - if (pmap == PV_PMAP(pv) && va == pv->pv_va) + TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { + if (pmap == PV_PMAP(pv) && va == pv->pv_va) { + TAILQ_REMOVE(&pvh->pv_list, pv, pv_list); break; + } } - KASSERT(pv != NULL, ("pmap_remove_entry: pv not found")); - TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); - if (TAILQ_EMPTY(&m->md.pv_list)) - vm_page_flag_clear(m, PG_WRITEABLE); + return (pv); +} + +static void +pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) +{ + struct md_page *pvh; + pv_entry_t pv; + vm_offset_t va_last; + vm_page_t m; + + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + KASSERT((pa & PDRMASK) == 0, + ("pmap_pv_demote_pde: pa is not 4mpage aligned")); + + /* + * Transfer the 4mpage's pv entry for this mapping to the first + * page's pv list. + */ + pvh = pa_to_pvh(pa); + va = trunc_4mpage(va); + pv = pmap_pvh_remove(pvh, pmap, va); + KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); + m = PHYS_TO_VM_PAGE(pa); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); + /* Instantiate the remaining NPTEPG - 1 pv entries. 
*/ + va_last = va + NBPDR - PAGE_SIZE; + do { + m++; + KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0, + ("pmap_pv_demote_pde: page %p is not managed", m)); + va += PAGE_SIZE; + pmap_insert_entry(pmap, va, m); + } while (va < va_last); +} + +static void +pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) +{ + struct md_page *pvh; + pv_entry_t pv; + vm_offset_t va_last; + vm_page_t m; + + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + KASSERT((pa & PDRMASK) == 0, + ("pmap_pv_promote_pde: pa is not 4mpage aligned")); + + /* + * Transfer the first page's pv entry for this mapping to the + * 4mpage's pv list. Aside from avoiding the cost of a call + * to get_pv_entry(), a transfer avoids the possibility that + * get_pv_entry() calls pmap_collect() and that pmap_collect() + * removes one of the mappings that is being promoted. + */ + m = PHYS_TO_VM_PAGE(pa); + va = trunc_4mpage(va); + pv = pmap_pvh_remove(&m->md, pmap, va); + KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); + pvh = pa_to_pvh(pa); + TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list); + /* Free the remaining NPTEPG - 1 pv entries. */ + va_last = va + NBPDR - PAGE_SIZE; + do { + m++; + va += PAGE_SIZE; + pmap_pvh_free(&m->md, pmap, va); + } while (va < va_last); +} + +static void +pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) +{ + pv_entry_t pv; + + pv = pmap_pvh_remove(pvh, pmap, va); + KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } +static void +pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) +{ + struct md_page *pvh; + + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + pmap_pvh_free(&m->md, pmap, va); + if (TAILQ_EMPTY(&m->md.pv_list)) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + if (TAILQ_EMPTY(&pvh->pv_list)) + vm_page_flag_clear(m, PG_WRITEABLE); + } +} + /* * Create a pv entry for page at pa for * (pmap, va). @@ -1949,6 +2222,222 @@ pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) } /* + * Create the pv entries for each of the pages within a superpage. + */ +static boolean_t +pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_page_t m) +{ + struct md_page *pvh; + pv_entry_t pv; + + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + if (pv_entry_count < pv_entry_high_water && + (pv = get_pv_entry(pmap, TRUE)) != NULL) { + pv->pv_va = va; + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list); + return (TRUE); + } else + return (FALSE); +} + +/* + * Tries to demote a 2- or 4MB page mapping. + */ +static boolean_t +pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) +{ + pd_entry_t newpde, oldpde; + pmap_t allpmaps_entry; + pt_entry_t *firstpte, newpte, *pte; + vm_paddr_t mptepa; + vm_page_t free, mpte; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + mpte = pmap_lookup_pt_page(pmap, va); + if (mpte != NULL) + pmap_remove_pt_page(pmap, mpte); + else { + KASSERT((*pde & PG_W) == 0, + ("pmap_demote_pde: page table page for a wired mapping" + " is missing")); + free = NULL; + pmap_remove_pde(pmap, pde, trunc_4mpage(va), &free); + pmap_invalidate_page(pmap, trunc_4mpage(va)); + pmap_free_zero_pages(free); + CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x" + " in pmap %p", va, pmap); + return (FALSE); + } + mptepa = VM_PAGE_TO_PHYS(mpte); + + /* + * Temporarily map the page table page (mpte) into the kernel's + * address space at either PADDR1 or PADDR2. 
+ */ + if (curthread->td_pinned > 0 && mtx_owned(&vm_page_queue_mtx)) { + if ((*PMAP1 & PG_FRAME) != mptepa) { + *PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M; +#ifdef SMP + PMAP1cpu = PCPU_GET(cpuid); +#endif + invlcaddr(PADDR1); + PMAP1changed++; + } else +#ifdef SMP + if (PMAP1cpu != PCPU_GET(cpuid)) { + PMAP1cpu = PCPU_GET(cpuid); + invlcaddr(PADDR1); + PMAP1changedcpu++; + } else +#endif + PMAP1unchanged++; + firstpte = PADDR1; + } else { + mtx_lock(&PMAP2mutex); + if ((*PMAP2 & PG_FRAME) != mptepa) { + *PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M; + pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); + } + firstpte = PADDR2; + } + oldpde = *pde; + newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; + KASSERT((oldpde & (PG_A | PG_V)) == (PG_A | PG_V), + ("pmap_demote_pde: oldpde is missing PG_A and/or PG_V")); + KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, + ("pmap_demote_pde: oldpde is missing PG_M")); + KASSERT((oldpde & PG_PS) != 0, + ("pmap_demote_pde: oldpde is missing PG_PS")); + newpte = oldpde & ~PG_PS; + if ((newpte & PG_PDE_PAT) != 0) + newpte ^= PG_PDE_PAT | PG_PTE_PAT; + + /* + * If the mapping has changed attributes, update the page table + * entries. + */ + KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), + ("pmap_demote_pde: firstpte and newpte map different physical" + " addresses")); + if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) + for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { + *pte = newpte; + newpte += PAGE_SIZE; + } + + /* + * Demote the mapping. This pmap is locked. The old PDE has + * PG_A set. If the old PDE has PG_RW set, it also has PG_M + * set. Thus, there is no danger of a race with another + * processor changing the setting of PG_A and/or PG_M between + * the read above and the store below. + */ + if (pmap == kernel_pmap) { + /* + * A harmless race exists between this loop and the bcopy() + * in pmap_pinit() that initializes the kernel segment of + * the new page table. Specifically, that bcopy() may copy + * the new PDE from the PTD, which is first in allpmaps, to + * the new page table before this loop updates that new + * page table. + */ + mtx_lock_spin(&allpmaps_lock); + LIST_FOREACH(allpmaps_entry, &allpmaps, pm_list) { + pde = pmap_pde(allpmaps_entry, va); + KASSERT(*pde == newpde || (*pde & PG_PTE_PROMOTE) == + (oldpde & PG_PTE_PROMOTE), + ("pmap_demote_pde: pde was %#jx, expected %#jx", + (uintmax_t)*pde, (uintmax_t)oldpde)); + pde_store(pde, newpde); + } + mtx_unlock_spin(&allpmaps_lock); + } else + pde_store(pde, newpde); + if (firstpte == PADDR2) + mtx_unlock(&PMAP2mutex); + + /* + * Invalidate the recursive mapping of the page table page. + */ + pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); + + /* + * Demote the pv entry. This depends on the earlier demotion + * of the mapping. Specifically, the (re)creation of a per- + * page pv entry might trigger the execution of pmap_collect(), + * which might reclaim a newly (re)created per-page pv entry + * and destroy the associated mapping. In order to destroy + * the mapping, the PDE must have already changed from mapping + * the 2mpage to referencing the page table page. 
+ */ + if ((oldpde & PG_MANAGED) != 0) + pmap_pv_demote_pde(pmap, va, oldpde & PG_FRAME); + + pmap_pde_demotions++; + CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x" + " in pmap %p", va, pmap); + return (TRUE); +} + +/* + * pmap_remove_pde: do the things to unmap a superpage in a process + */ +static void +pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, + vm_page_t *free) +{ + struct md_page *pvh; + pd_entry_t oldpde; + vm_offset_t eva, va; + vm_page_t m, mpte; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + KASSERT((sva & PDRMASK) == 0, + ("pmap_remove_pde: sva is not 4mpage aligned")); + oldpde = pte_load_clear(pdq); + if (oldpde & PG_W) + pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; + + /* + * Machines that don't support invlpg, also don't support + * PG_G. + */ + if (oldpde & PG_G) + pmap_invalidate_page(kernel_pmap, sva); + pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; + if (oldpde & PG_MANAGED) { + pvh = pa_to_pvh(oldpde & PG_FRAME); + pmap_pvh_free(pvh, pmap, sva); + eva = sva + NBPDR; + for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_FRAME); + va < eva; va += PAGE_SIZE, m++) { + if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) + vm_page_dirty(m); + if (oldpde & PG_A) + vm_page_flag_set(m, PG_REFERENCED); + if (TAILQ_EMPTY(&m->md.pv_list) && + TAILQ_EMPTY(&pvh->pv_list)) + vm_page_flag_clear(m, PG_WRITEABLE); + } + } + if (pmap == kernel_pmap) { + if (!pmap_demote_pde(pmap, pdq, sva)) + panic("pmap_remove_pde: failed demotion"); + } else { + mpte = pmap_lookup_pt_page(pmap, sva); + if (mpte != NULL) { + pmap_remove_pt_page(pmap, mpte); + KASSERT(mpte->wire_count == NPTEPG, + ("pmap_remove_pde: pte page wire count error")); + mpte->wire_count = 0; + pmap_add_delayed_free_list(mpte, free, FALSE); + atomic_subtract_int(&cnt.v_wire_count, 1); + } + } +} + +/* * pmap_remove_pte: do the things to unmap a page in a process */ static int @@ -1971,12 +2460,8 @@ pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, vm_page_t *free) pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); - if (oldpte & PG_M) { - KASSERT((oldpte & PG_RW), - ("pmap_remove_pte: modified page not writable: va: %#x, pte: %#jx", - va, (uintmax_t)oldpte)); + if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); - } if (oldpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); pmap_remove_entry(pmap, m, va); @@ -2065,10 +2550,25 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { - pmap->pm_pdir[pdirindex] = 0; - pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; - anyvalid = 1; - continue; + /* + * Are we removing the entire large page? If not, + * demote the mapping and fall through. + */ + if (sva + NBPDR == pdnxt && eva >= pdnxt) { + /* + * The TLB entry for a PG_G mapping is + * invalidated by pmap_remove_pde(). + */ + if ((ptpaddr & PG_G) == 0) + anyvalid = 1; + pmap_remove_pde(pmap, + &pmap->pm_pdir[pdirindex], sva, &free); + continue; + } else if (!pmap_demote_pde(pmap, + &pmap->pm_pdir[pdirindex], sva)) { + /* The large page mapping was destroyed. 
*/ + continue; + } } /* @@ -2119,19 +2619,34 @@ out: void pmap_remove_all(vm_page_t m) { + struct md_page *pvh; pv_entry_t pv; pmap_t pmap; pt_entry_t *pte, tpte; + pd_entry_t *pde; + vm_offset_t va; vm_page_t free; KASSERT((m->flags & PG_FICTITIOUS) == 0, ("pmap_remove_all: page %p is fictitious", m)); mtx_assert(&vm_page_queue_mtx, MA_OWNED); sched_pin(); + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { + va = pv->pv_va; + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pde = pmap_pde(pmap, va); + (void)pmap_demote_pde(pmap, pde, va); + PMAP_UNLOCK(pmap); + } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pmap->pm_stats.resident_count--; + pde = pmap_pde(pmap, pv->pv_va); + KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" + " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); tpte = pte_load_clear(pte); if (tpte & PG_W) @@ -2142,12 +2657,8 @@ pmap_remove_all(vm_page_t m) /* * Update the vm_page_t clean and reference bits. */ - if (tpte & PG_M) { - KASSERT((tpte & PG_RW), - ("pmap_remove_all: modified page not writable: va: %#x, pte: %#jx", - pv->pv_va, (uintmax_t)tpte)); + if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); - } free = NULL; pmap_unuse_pt(pmap, pv->pv_va, &free); pmap_invalidate_page(pmap, pv->pv_va); @@ -2161,6 +2672,56 @@ pmap_remove_all(vm_page_t m) } /* + * pmap_protect_pde: do the things to protect a 4mpage in a process + */ +static boolean_t +pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) +{ + pd_entry_t newpde, oldpde; + vm_offset_t eva, va; + vm_page_t m; + boolean_t anychanged; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + KASSERT((sva & PDRMASK) == 0, + ("pmap_protect_pde: sva is not 4mpage aligned")); + anychanged = FALSE; +retry: + oldpde = newpde = *pde; + if (oldpde & PG_MANAGED) { + eva = sva + NBPDR; + for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_FRAME); + va < eva; va += PAGE_SIZE, m++) { + /* + * In contrast to the analogous operation on a 4KB page + * mapping, the mapping's PG_A flag is not cleared and + * the page's PG_REFERENCED flag is not set. The + * reason is that pmap_demote_pde() expects that a 2/4MB + * page mapping with a stored page table page has PG_A + * set. + */ + if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) + vm_page_dirty(m); + } + } + if ((prot & VM_PROT_WRITE) == 0) + newpde &= ~(PG_RW | PG_M); +#ifdef PAE + if ((prot & VM_PROT_EXECUTE) == 0) + newpde |= pg_nx; +#endif + if (newpde != oldpde) { + if (!pde_cmpset(pde, oldpde, newpde)) + goto retry; + if (oldpde & PG_G) + pmap_invalidate_page(pmap, sva); + else + anychanged = TRUE; + } + return (anychanged); +} + +/* * Set the physical protection on the * specified range of this map as requested. */ @@ -2213,14 +2774,24 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { - if ((prot & VM_PROT_WRITE) == 0) - pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW); -#ifdef PAE - if ((prot & VM_PROT_EXECUTE) == 0) - pmap->pm_pdir[pdirindex] |= pg_nx; -#endif - anychanged = 1; - continue; + /* + * Are we protecting the entire large page? If not, + * demote the mapping and fall through. + */ + if (sva + NBPDR == pdnxt && eva >= pdnxt) { + /* + * The TLB entry for a PG_G mapping is + * invalidated by pmap_protect_pde(). 
+ */ + if (pmap_protect_pde(pmap, + &pmap->pm_pdir[pdirindex], sva, prot)) + anychanged = 1; + continue; + } else if (!pmap_demote_pde(pmap, + &pmap->pm_pdir[pdirindex], sva)) { + /* The large page mapping was destroyed. */ + continue; + } } if (pdnxt > eva) @@ -2246,7 +2817,7 @@ retry: vm_page_flag_set(m, PG_REFERENCED); pbits &= ~PG_A; } - if ((pbits & PG_M) != 0) { + if ((pbits & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (m == NULL) m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_dirty(m); @@ -2284,6 +2855,127 @@ retry: } /* + * Tries to promote the 512 or 1024, contiguous 4KB page mappings that are + * within a single page table page to a single 2- or 4MB page mapping. For + * promotion to occur, two conditions must be met: (1) the 4KB page mappings + * must map aligned, contiguous physical memory and (2) the 4KB page mappings + * must have identical characteristics. + * + * Managed (PG_MANAGED) mappings within the kernel address space are not + * promoted. The reason is that kernel PDEs are replicated in each pmap but + * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel + * pmap. + */ +static void +pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) +{ + pd_entry_t newpde; + pmap_t allpmaps_entry; + pt_entry_t *firstpte, oldpte, *pte; + vm_offset_t oldpteva; + vm_paddr_t pa; + vm_page_t mpte; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + firstpte = vtopte(trunc_4mpage(va)); + KASSERT((*firstpte & PG_V) != 0, + ("pmap_promote_pde: firstpte is missing PG_V")); + if ((*firstpte & PG_A) == 0) { + pmap_pde_p_failures++; + CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" + " in pmap %p", va, pmap); + return; + } + if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) { + pmap_pde_p_failures++; + CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" + " in pmap %p", va, pmap); + return; + } + pa = *firstpte & PG_PS_FRAME; + newpde = *firstpte; + if ((newpde & (PG_M | PG_RW)) == PG_RW) + newpde &= ~PG_RW; + + /* + * Check all the ptes before promotion + */ + for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { +retry: + oldpte = *pte; + if ((oldpte & PG_FRAME) != pa) { + pmap_pde_p_failures++; + CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" + " in pmap %p", va, pmap); + return; + } + if ((oldpte & (PG_M | PG_RW)) == PG_RW) { + /* + * When PG_M is already clear, PG_RW can be cleared + * without a TLB invalidation. + */ + if (!atomic_cmpset_int((u_int *)pte, oldpte, + oldpte & ~PG_RW)) + goto retry; + oldpte &= ~PG_RW; + oldpteva = (oldpte & PG_FRAME & PDRMASK) | + (va & ~PDRMASK); + CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x" + " in pmap %p", oldpteva, pmap); + } + if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { + pmap_pde_p_failures++; + CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" + " in pmap %p", va, pmap); + return; + } + pa += PAGE_SIZE; + } + + /* + * Save the page table page in its current state until the PDE + * mapping the superpage is demoted by pmap_demote_pde() or + * destroyed by pmap_remove_pde(). + */ + mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); + KASSERT(mpte >= vm_page_array && + mpte < &vm_page_array[vm_page_array_size], + ("pmap_promote_pde: page table page is out of range")); + KASSERT(mpte->pindex == va >> PDRSHIFT, + ("pmap_promote_pde: page table page's pindex is wrong")); + pmap_insert_pt_page(pmap, mpte); + + /* + * Promote the pv entries. + */ + if ((newpde & PG_MANAGED) != 0) + pmap_pv_promote_pde(pmap, va, newpde & PG_FRAME); + + /* + * Propagate the PAT index to its proper position. 
+ */ + if ((newpde & PG_PTE_PAT) != 0) + newpde ^= PG_PDE_PAT | PG_PTE_PAT; + + /* + * Map the superpage. + */ + if (pmap == kernel_pmap) { + mtx_lock_spin(&allpmaps_lock); + LIST_FOREACH(allpmaps_entry, &allpmaps, pm_list) { + pde = pmap_pde(allpmaps_entry, va); + pde_store(pde, PG_PS | newpde); + } + mtx_unlock_spin(&allpmaps_lock); + } else + pde_store(pde, PG_PS | newpde); + + pmap_pde_promotions++; + CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x" + " in pmap %p", va, pmap); +} + +/* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. @@ -2436,9 +3128,12 @@ validate: * to update the pte. */ if ((origpte & ~(PG_M|PG_A)) != newpte) { + newpte |= PG_A; + if ((access & VM_PROT_WRITE) != 0) + newpte |= PG_M; if (origpte & PG_V) { invlva = FALSE; - origpte = pte_load_store(pte, newpte | PG_A); + origpte = pte_load_store(pte, newpte); if (origpte & PG_A) { if (origpte & PG_MANAGED) vm_page_flag_set(om, PG_REFERENCED); @@ -2450,10 +3145,7 @@ validate: invlva = TRUE; #endif } - if (origpte & PG_M) { - KASSERT((origpte & PG_RW), - ("pmap_enter: modified page not writable: va: %#x, pte: %#jx", - va, (uintmax_t)origpte)); + if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((origpte & PG_MANAGED) != 0) vm_page_dirty(om); if ((prot & VM_PROT_WRITE) == 0) @@ -2462,14 +3154,78 @@ validate: if (invlva) pmap_invalidate_page(pmap, va); } else - pte_store(pte, newpte | PG_A); + pte_store(pte, newpte); } + + /* + * If both the page table page and the reservation are fully + * populated, then attempt promotion. + */ + if ((mpte == NULL || mpte->wire_count == NPTEPG) && + pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0) + pmap_promote_pde(pmap, pde, va); + sched_unpin(); vm_page_unlock_queues(); PMAP_UNLOCK(pmap); } /* + * Tries to create a 2- or 4MB page mapping. Returns TRUE if successful and + * FALSE otherwise. Fails if (1) a page table page cannot be allocated without + * blocking, (2) a mapping already exists at the specified virtual address, or + * (3) a pv entry cannot be allocated without reclaiming another pv entry. + */ +static boolean_t +pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) +{ + pd_entry_t *pde, newpde; + + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + pde = pmap_pde(pmap, va); + if (*pde != 0) { + CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" + " in pmap %p", va, pmap); + return (FALSE); + } + newpde = VM_PAGE_TO_PHYS(m) | PG_PS | PG_V; + if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) { + newpde |= PG_MANAGED; + + /* + * Abort this mapping if its PV entry could not be created. + */ + if (!pmap_pv_insert_pde(pmap, va, m)) { + CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" + " in pmap %p", va, pmap); + return (FALSE); + } + } +#ifdef PAE + if ((prot & VM_PROT_EXECUTE) == 0) + newpde |= pg_nx; +#endif + if (va < VM_MAXUSER_ADDRESS) + newpde |= PG_U; + + /* + * Increment counters. + */ + pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; + + /* + * Map the superpage. + */ + pde_store(pde, newpde); + + pmap_pde_mappings++; + CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" + " in pmap %p", va, pmap); + return (TRUE); +} + +/* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. 
Each subsequent page is @@ -2485,6 +3241,7 @@ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { + vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; @@ -2494,8 +3251,15 @@ pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, m = m_start; PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { - mpte = pmap_enter_quick_locked(pmap, start + ptoa(diff), m, - prot, mpte); + va = start + ptoa(diff); + if ((va & PDRMASK) == 0 && va + NBPDR <= end && + (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 && + pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 && + pmap_enter_pde(pmap, va, m, prot)) + m = &m[NBPDR / PAGE_SIZE - 1]; + else + mpte = pmap_enter_quick_locked(pmap, va, m, prot, + mpte); m = TAILQ_NEXT(m, listq); } PMAP_UNLOCK(pmap); @@ -2559,7 +3323,7 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, */ if (ptepa) { if (ptepa & PG_PS) - panic("pmap_enter_quick: unexpected mapping into 4MB page"); + return (NULL); mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); mpte->wire_count++; } else { @@ -2723,9 +3487,29 @@ out: void pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) { + pd_entry_t *pde; pt_entry_t *pte; + boolean_t are_queues_locked; + are_queues_locked = FALSE; +retry: PMAP_LOCK(pmap); + pde = pmap_pde(pmap, va); + if ((*pde & PG_PS) != 0) { + if (!wired != ((*pde & PG_W) == 0)) { + if (!are_queues_locked) { + are_queues_locked = TRUE; + if (!mtx_trylock(&vm_page_queue_mtx)) { + PMAP_UNLOCK(pmap); + vm_page_lock_queues(); + goto retry; + } + } + if (!pmap_demote_pde(pmap, pde, va)) + panic("pmap_change_wiring: demotion failed"); + } else + goto out; + } pte = pmap_pte(pmap, va); if (wired && !pmap_pte_w(pte)) @@ -2739,6 +3523,9 @@ pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) */ pmap_pte_set_w(pte, wired); pmap_pte_release(pte); +out: + if (are_queues_locked) + vm_page_unlock_queues(); PMAP_UNLOCK(pmap); } @@ -2795,7 +3582,10 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, continue; if (srcptepaddr & PG_PS) { - if (dst_pmap->pm_pdir[ptepindex] == 0) { + if (dst_pmap->pm_pdir[ptepindex] == 0 && + ((srcptepaddr & PG_MANAGED) == 0 || + pmap_pv_insert_pde(dst_pmap, addr, + PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME)))) { dst_pmap->pm_pdir[ptepindex] = srcptepaddr & ~PG_W; dst_pmap->pm_stats.resident_count += @@ -2981,6 +3771,7 @@ pmap_copy_page(vm_page_t src, vm_page_t dst) boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { + struct md_page *pvh; pv_entry_t pv; int loops = 0; @@ -2996,6 +3787,16 @@ pmap_page_exists_quick(pmap_t pmap, vm_page_t m) if (loops >= 16) break; } + if (loops < 16) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { + if (PV_PMAP(pv) == pmap) + return (TRUE); + loops++; + if (loops >= 16) + break; + } + } return (FALSE); } @@ -3031,6 +3832,25 @@ pmap_page_wired_mappings(vm_page_t m) } /* + * Returns TRUE if the given page is mapped individually or as part of + * a 4mpage. Otherwise, returns FALSE. + */ +boolean_t +pmap_page_is_mapped(vm_page_t m) +{ + struct md_page *pvh; + + if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0) + return (FALSE); + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + if (TAILQ_EMPTY(&m->md.pv_list)) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + return (!TAILQ_EMPTY(&pvh->pv_list)); + } else + return (TRUE); +} + +/* * Remove all pages from specified address space * this aids process exit speeds. 
Also, this code * is special cased for current process only, but @@ -3042,8 +3862,10 @@ void pmap_remove_pages(pmap_t pmap) { pt_entry_t *pte, tpte; - vm_page_t m, free = NULL; + vm_page_t free = NULL; + vm_page_t m, mpte, mt; pv_entry_t pv; + struct md_page *pvh; struct pv_chunk *pc, *npc; int field, idx; int32_t bit; @@ -3068,8 +3890,12 @@ pmap_remove_pages(pmap_t pmap) pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; - pte = vtopte(pv->pv_va); + pte = pmap_pde(pmap, pv->pv_va); tpte = *pte; + if ((tpte & PG_PS) == 0) { + pte = vtopte(pv->pv_va); + tpte = *pte & ~PG_PTE_PAT; + } if (tpte == 0) { printf( @@ -3096,26 +3922,52 @@ pmap_remove_pages(pmap_t pmap) ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); - pmap->pm_stats.resident_count--; - pte_clear(pte); /* * Update the vm_page_t clean/reference bits. */ - if (tpte & PG_M) - vm_page_dirty(m); + if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { + if ((tpte & PG_PS) != 0) { + for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) + vm_page_dirty(mt); + } else + vm_page_dirty(m); + } /* Mark free */ PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc->pc_map[field] |= bitmask; - TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); - if (TAILQ_EMPTY(&m->md.pv_list)) - vm_page_flag_clear(m, PG_WRITEABLE); - - pmap_unuse_pt(pmap, pv->pv_va, &free); + if ((tpte & PG_PS) != 0) { + pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; + pvh = pa_to_pvh(tpte & PG_FRAME); + TAILQ_REMOVE(&pvh->pv_list, pv, pv_list); + if (TAILQ_EMPTY(&pvh->pv_list)) { + for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) + if (TAILQ_EMPTY(&mt->md.pv_list)) + vm_page_flag_clear(mt, PG_WRITEABLE); + } + mpte = pmap_lookup_pt_page(pmap, pv->pv_va); + if (mpte != NULL) { + pmap_remove_pt_page(pmap, mpte); + KASSERT(mpte->wire_count == NPTEPG, + ("pmap_remove_pages: pte page wire count error")); + mpte->wire_count = 0; + pmap_add_delayed_free_list(mpte, &free, FALSE); + atomic_subtract_int(&cnt.v_wire_count, 1); + } + } else { + pmap->pm_stats.resident_count--; + TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); + if (TAILQ_EMPTY(&m->md.pv_list)) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + if (TAILQ_EMPTY(&pvh->pv_list)) + vm_page_flag_clear(m, PG_WRITEABLE); + } + pmap_unuse_pt(pmap, pv->pv_va, &free); + } } } if (allfree) { @@ -3146,22 +3998,35 @@ pmap_remove_pages(pmap_t pmap) boolean_t pmap_is_modified(vm_page_t m) { + + if (m->flags & PG_FICTITIOUS) + return (FALSE); + if (pmap_is_modified_pvh(&m->md)) + return (TRUE); + return (pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); +} + +/* + * Returns TRUE if any of the given mappings were used to modify + * physical memory. Otherwise, returns FALSE. Both page and 2mpage + * mappings are supported. 
+ */ +static boolean_t +pmap_is_modified_pvh(struct md_page *pvh) +{ pv_entry_t pv; pt_entry_t *pte; pmap_t pmap; boolean_t rv; + mtx_assert(&vm_page_queue_mtx, MA_OWNED); rv = FALSE; - if (m->flags & PG_FICTITIOUS) - return (rv); - sched_pin(); - mtx_assert(&vm_page_queue_mtx, MA_OWNED); - TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); - rv = (*pte & PG_M) != 0; + rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW); PMAP_UNLOCK(pmap); if (rv) break; @@ -3179,12 +4044,14 @@ pmap_is_modified(vm_page_t m) boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { + pd_entry_t *pde; pt_entry_t *pte; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); - if (*pmap_pde(pmap, addr)) { + pde = pmap_pde(pmap, addr); + if (*pde != 0 && (*pde & PG_PS) == 0) { pte = vtopte(addr); rv = *pte == 0; } @@ -3198,18 +4065,34 @@ pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) void pmap_remove_write(vm_page_t m) { - pv_entry_t pv; + struct md_page *pvh; + pv_entry_t next_pv, pv; pmap_t pmap; + pd_entry_t *pde; pt_entry_t oldpte, *pte; + vm_offset_t va; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & PG_FICTITIOUS) != 0 || (m->flags & PG_WRITEABLE) == 0) return; sched_pin(); + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) { + va = pv->pv_va; + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pde = pmap_pde(pmap, va); + if ((*pde & PG_RW) != 0) + (void)pmap_demote_pde(pmap, pde, va); + PMAP_UNLOCK(pmap); + } TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); + pde = pmap_pde(pmap, pv->pv_va); + KASSERT((*pde & PG_PS) == 0, ("pmap_clear_write: found" + " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); retry: oldpte = *pte; @@ -3247,15 +4130,49 @@ retry: int pmap_ts_referenced(vm_page_t m) { + struct md_page *pvh; pv_entry_t pv, pvf, pvn; pmap_t pmap; + pd_entry_t oldpde, *pde; pt_entry_t *pte; + vm_offset_t va; int rtval = 0; if (m->flags & PG_FICTITIOUS) return (rtval); sched_pin(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, pvn) { + va = pv->pv_va; + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pde = pmap_pde(pmap, va); + oldpde = *pde; + if ((oldpde & PG_A) != 0) { + if (pmap_demote_pde(pmap, pde, va)) { + if ((oldpde & PG_W) == 0) { + /* + * Remove the mapping to a single page + * so that a subsequent access may + * repromote. Since the underlying + * page table page is fully populated, + * this removal never frees a page + * table page. 
+ */ + va += VM_PAGE_TO_PHYS(m) - (oldpde & + PG_FRAME); + pmap_remove_page(pmap, va, NULL); + rtval++; + if (rtval > 4) { + PMAP_UNLOCK(pmap); + return (rtval); + } + } + } + } + PMAP_UNLOCK(pmap); + } if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pvf = pv; do { @@ -3264,6 +4181,9 @@ pmap_ts_referenced(vm_page_t m) TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); pmap = PV_PMAP(pv); PMAP_LOCK(pmap); + pde = pmap_pde(pmap, pv->pv_va); + KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced:" + " found a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & PG_A) != 0) { atomic_clear_int((u_int *)pte, PG_A); @@ -3285,19 +4205,62 @@ pmap_ts_referenced(vm_page_t m) void pmap_clear_modify(vm_page_t m) { - pv_entry_t pv; + struct md_page *pvh; + pv_entry_t next_pv, pv; pmap_t pmap; - pt_entry_t *pte; + pd_entry_t oldpde, *pde; + pt_entry_t oldpte, *pte; + vm_offset_t va; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & PG_FICTITIOUS) != 0) return; sched_pin(); + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) { + va = pv->pv_va; + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pde = pmap_pde(pmap, va); + oldpde = *pde; + if ((oldpde & PG_RW) != 0) { + if (pmap_demote_pde(pmap, pde, va)) { + if ((oldpde & PG_W) == 0) { + /* + * Write protect the mapping to a + * single page so that a subsequent + * write access may repromote. + */ + va += VM_PAGE_TO_PHYS(m) - (oldpde & + PG_FRAME); + pte = pmap_pte_quick(pmap, va); + oldpte = *pte; + if ((oldpte & PG_V) != 0) { + /* + * Regardless of whether a pte is 32 or 64 bits + * in size, PG_RW and PG_M are among the least + * significant 32 bits. + */ + while (!atomic_cmpset_int((u_int *)pte, + oldpte, + oldpte & ~(PG_M | PG_RW))) + oldpte = *pte; + vm_page_dirty(m); + pmap_invalidate_page(pmap, va); + } + } + } + } + PMAP_UNLOCK(pmap); + } TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); + pde = pmap_pde(pmap, pv->pv_va); + KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" + " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); - if ((*pte & PG_M) != 0) { + if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_M is among the least significant @@ -3319,17 +4282,45 @@ pmap_clear_modify(vm_page_t m) void pmap_clear_reference(vm_page_t m) { - pv_entry_t pv; + struct md_page *pvh; + pv_entry_t next_pv, pv; pmap_t pmap; + pd_entry_t oldpde, *pde; pt_entry_t *pte; + vm_offset_t va; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & PG_FICTITIOUS) != 0) return; sched_pin(); + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) { + va = pv->pv_va; + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pde = pmap_pde(pmap, va); + oldpde = *pde; + if ((oldpde & PG_A) != 0) { + if (pmap_demote_pde(pmap, pde, va)) { + /* + * Remove the mapping to a single page so + * that a subsequent access may repromote. + * Since the underlying page table page is + * fully populated, this removal never frees + * a page table page. 
+ */ + va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_FRAME); + pmap_remove_page(pmap, va, NULL); + } + } + PMAP_UNLOCK(pmap); + } TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); + pde = pmap_pde(pmap, pv->pv_va); + KASSERT((*pde & PG_PS) == 0, ("pmap_clear_reference: found" + " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & PG_A) != 0) { /* @@ -3479,31 +4470,43 @@ pmap_change_attr(va, size, mode) int pmap_mincore(pmap_t pmap, vm_offset_t addr) { + pd_entry_t *pdep; pt_entry_t *ptep, pte; + vm_paddr_t pa; vm_page_t m; int val = 0; PMAP_LOCK(pmap); - ptep = pmap_pte(pmap, addr); - pte = (ptep != NULL) ? *ptep : 0; - pmap_pte_release(ptep); + pdep = pmap_pde(pmap, addr); + if (*pdep != 0) { + if (*pdep & PG_PS) { + pte = *pdep; + /* Compute the physical address of the 4KB page. */ + pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & + PG_FRAME; + } else { + ptep = pmap_pte(pmap, addr); + pte = *ptep; + pmap_pte_release(ptep); + pa = pte & PG_FRAME; + } + } else { + pte = 0; + pa = 0; + } PMAP_UNLOCK(pmap); if (pte != 0) { - vm_paddr_t pa; - val = MINCORE_INCORE; if ((pte & PG_MANAGED) == 0) return val; - pa = pte & PG_FRAME; - m = PHYS_TO_VM_PAGE(pa); /* * Modified by us */ - if (pte & PG_M) + if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; else { /* diff --git a/sys/i386/include/pmap.h b/sys/i386/include/pmap.h index 3edec14..74c7312 100644 --- a/sys/i386/include/pmap.h +++ b/sys/i386/include/pmap.h @@ -82,6 +82,13 @@ #define PG_N (PG_NC_PWT|PG_NC_PCD) /* Non-cacheable */ /* + * Promotion to a 2 or 4MB (PDE) page mapping requires that the corresponding + * 4KB (PTE) page mappings have identical settings for the following fields: + */ +#define PG_PTE_PROMOTE (PG_MANAGED | PG_W | PG_G | PG_PTE_PAT | \ + PG_M | PG_A | PG_NC_PCD | PG_NC_PWT | PG_U | PG_RW | PG_V) + +/* * Page Protection Exception bits */ @@ -213,6 +220,9 @@ pmap_kextract(vm_offset_t va) #ifdef PAE +#define pde_cmpset(pdep, old, new) \ + atomic_cmpset_64((pdep), (old), (new)) + static __inline pt_entry_t pte_load(pt_entry_t *ptep) { @@ -269,6 +279,9 @@ extern pt_entry_t pg_nx; #else /* PAE */ +#define pde_cmpset(pdep, old, new) \ + atomic_cmpset_int((pdep), (old), (new)) + static __inline pt_entry_t pte_load(pt_entry_t *ptep) { @@ -330,6 +343,7 @@ struct pmap { pdpt_entry_t *pm_pdpt; /* KVA of page director pointer table */ #endif + vm_page_t pm_root; /* spare page table pages */ }; typedef struct pmap *pmap_t; @@ -393,7 +407,6 @@ extern char *ptvmmap; /* poor name! */ extern vm_offset_t virtual_avail; extern vm_offset_t virtual_end; -#define pmap_page_is_mapped(m) (!TAILQ_EMPTY(&(m)->md.pv_list)) #define pmap_unmapbios(va, sz) pmap_unmapdev((va), (sz)) void pmap_bootstrap(vm_paddr_t); @@ -406,6 +419,7 @@ void pmap_kremove(vm_offset_t); void *pmap_mapbios(vm_paddr_t, vm_size_t); void *pmap_mapdev(vm_paddr_t, vm_size_t); void *pmap_mapdev_attr(vm_paddr_t, vm_size_t, int); +boolean_t pmap_page_is_mapped(vm_page_t m); void pmap_unmapdev(vm_offset_t, vm_size_t); pt_entry_t *pmap_pte(pmap_t, vm_offset_t) __pure2; void pmap_set_pg(void); |
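
A note on the promotion test: pmap_promote_pde() only promotes when every
PTE in the page table page maps physically contiguous memory and agrees
with the first PTE on the attribute bits collected in PG_PTE_PROMOTE (see
the pmap.h hunk above). Condensed from the loop in the diff, with the
retry logic and the clearing of PG_RW on clean, writable PTEs omitted:

	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
		if ((*pte & PG_FRAME) != pa ||
		    (*pte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
			pmap_pde_p_failures++;
			return;		/* promotion fails */
		}
		pa += PAGE_SIZE;
	}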