author | alc <alc@FreeBSD.org> | 2013-08-29 15:49:05 +0000
committer | alc <alc@FreeBSD.org> | 2013-08-29 15:49:05 +0000
commit | aa9a7bb9e6c374d0ee2f00489abd48f856d79aee (patch)
tree | 548fcd5f51d9d645c37a8a20f515a3b39c70329c /sys
parent | 4212d5ac38bc35cd7d1671a7a93ad65425dbdded (diff)
Significantly reduce the cost, i.e., run time, of calls to madvise(...,
MADV_DONTNEED) and madvise(..., MADV_FREE). Specifically, introduce a new
pmap function, pmap_advise(), that operates on a range of virtual addresses
within the specified pmap, allowing for a more efficient implementation of
MADV_DONTNEED and MADV_FREE. Previously, the implementation of
MADV_DONTNEED and MADV_FREE relied on per-page pmap operations, such as
pmap_clear_reference(). Intuitively, the problem with this implementation
is that the pmap-level locks are acquired and released and the page table
traversed repeatedly, once for each resident page in the range
that was specified to madvise(2). A more subtle flaw with the previous
implementation is that pmap_clear_reference() would clear the reference bit
on all mappings to the specified page, not just the mapping in the range
specified to madvise(2).
Since our malloc(3) makes heavy use of madvise(2), this change can have a
measurable impact. For example, the system time for completing a parallel
"buildworld" on a 6-core amd64 machine was reduced by about 1.5% to 2.0%.
Note: This change only contains pmap_advise() implementations for a subset
of our supported architectures. I will commit implementations for the
remaining architectures after further testing. For now, a stub function is
sufficient because of the advisory nature of pmap_advise().
Discussed with: jeff, jhb, kib
Tested by: pho (i386), marcel (ia64)
Sponsored by: EMC / Isilon Storage Division
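
To make the userland side of this concrete, here is a minimal sketch (not part of the commit; the buffer size and fill pattern are arbitrary) of the madvise(2) calls whose kernel-side cost this change reduces:

```c
#include <sys/mman.h>
#include <string.h>

int
main(void)
{
	size_t len = 1024 * 1024;
	char *p;

	/* Anonymous memory, as a malloc(3) arena would obtain it. */
	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE,
	    -1, 0);
	if (p == MAP_FAILED)
		return (1);
	memset(p, 0xa5, len);		/* dirty every resident page */

	/*
	 * MADV_FREE: the kernel may discard the pages' contents lazily;
	 * the mapping itself stays valid.  With this change, the whole
	 * range is handled by one pmap_advise() call rather than one
	 * per-page pmap operation.
	 */
	if (madvise(p, len, MADV_FREE) != 0)
		return (1);

	munmap(p, len);
	return (0);
}
```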
Diffstat (limited to 'sys')
-rw-r--r-- | sys/amd64/amd64/pmap.c | 122
-rw-r--r-- | sys/arm/arm/pmap-v6.c | 8
-rw-r--r-- | sys/arm/arm/pmap.c | 8
-rw-r--r-- | sys/i386/i386/pmap.c | 106
-rw-r--r-- | sys/i386/xen/pmap.c | 66
-rw-r--r-- | sys/ia64/ia64/pmap.c | 44
-rw-r--r-- | sys/mips/mips/pmap.c | 8
-rw-r--r-- | sys/powerpc/powerpc/mmu_if.m | 19
-rw-r--r-- | sys/powerpc/powerpc/pmap_dispatch.c | 9
-rw-r--r-- | sys/sparc64/sparc64/pmap.c | 8
-rw-r--r-- | sys/vm/pmap.h | 2
-rw-r--r-- | sys/vm/vm_map.c | 21
-rw-r--r-- | sys/vm/vm_page.c | 9
13 files changed, 419 insertions, 11 deletions
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 7fb1277..851f92a 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -4967,6 +4967,128 @@ out:
 }
 
 /*
+ * Apply the given advice to the specified range of addresses within the
+ * given pmap.  Depending on the advice, clear the referenced and/or
+ * modified flags in each mapping and set the mapped page's dirty field.
+ */
+void
+pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
+{
+	struct rwlock *lock;
+	pml4_entry_t *pml4e;
+	pdp_entry_t *pdpe;
+	pd_entry_t oldpde, *pde;
+	pt_entry_t *pte;
+	vm_offset_t va_next;
+	vm_page_t m;
+	boolean_t anychanged, pv_lists_locked;
+
+	if (advice != MADV_DONTNEED && advice != MADV_FREE)
+		return;
+	pv_lists_locked = FALSE;
+resume:
+	anychanged = FALSE;
+	PMAP_LOCK(pmap);
+	for (; sva < eva; sva = va_next) {
+		pml4e = pmap_pml4e(pmap, sva);
+		if ((*pml4e & PG_V) == 0) {
+			va_next = (sva + NBPML4) & ~PML4MASK;
+			if (va_next < sva)
+				va_next = eva;
+			continue;
+		}
+		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
+		if ((*pdpe & PG_V) == 0) {
+			va_next = (sva + NBPDP) & ~PDPMASK;
+			if (va_next < sva)
+				va_next = eva;
+			continue;
+		}
+		va_next = (sva + NBPDR) & ~PDRMASK;
+		if (va_next < sva)
+			va_next = eva;
+		pde = pmap_pdpe_to_pde(pdpe, sva);
+		oldpde = *pde;
+		if ((oldpde & PG_V) == 0)
+			continue;
+		else if ((oldpde & PG_PS) != 0) {
+			if ((oldpde & PG_MANAGED) == 0)
+				continue;
+			if (!pv_lists_locked) {
+				pv_lists_locked = TRUE;
+				if (!rw_try_rlock(&pvh_global_lock)) {
+					if (anychanged)
+						pmap_invalidate_all(pmap);
+					PMAP_UNLOCK(pmap);
+					rw_rlock(&pvh_global_lock);
+					goto resume;
+				}
+			}
+			lock = NULL;
+			if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) {
+				if (lock != NULL)
+					rw_wunlock(lock);
+
+				/*
+				 * The large page mapping was destroyed.
+				 */
+				continue;
+			}
+
+			/*
+			 * Unless the page mappings are wired, remove the
+			 * mapping to a single page so that a subsequent
+			 * access may repromote.  Since the underlying page
+			 * table page is fully populated, this removal never
+			 * frees a page table page.
+			 */
+			if ((oldpde & PG_W) == 0) {
+				pte = pmap_pde_to_pte(pde, sva);
+				KASSERT((*pte & PG_V) != 0,
+				    ("pmap_advise: invalid PTE"));
+				pmap_remove_pte(pmap, pte, sva, *pde, NULL,
+				    &lock);
+				anychanged = TRUE;
+			}
+			if (lock != NULL)
+				rw_wunlock(lock);
+		}
+		if (va_next > eva)
+			va_next = eva;
+		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
+		    sva += PAGE_SIZE) {
+			if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED |
+			    PG_V))
+				continue;
+			else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
+				if (advice == MADV_DONTNEED) {
+					/*
+					 * Future calls to pmap_is_modified()
+					 * can be avoided by making the page
+					 * dirty now.
+					 */
+					m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
+					vm_page_dirty(m);
+				}
+				atomic_clear_long(pte, PG_M | PG_A);
+			} else if ((*pte & PG_A) != 0)
+				atomic_clear_long(pte, PG_A);
+			else
+				continue;
+			if ((*pte & PG_G) != 0)
+				pmap_invalidate_page(pmap, sva);
+			else
+				anychanged = TRUE;
+		}
+	}
+	if (anychanged)
+		pmap_invalidate_all(pmap);
+	if (pv_lists_locked)
+		rw_runlock(&pvh_global_lock);
+	PMAP_UNLOCK(pmap);
+}
+
+/*
  * Clear the modify bits on the specified physical page.
  */
 void
diff --git a/sys/arm/arm/pmap-v6.c b/sys/arm/arm/pmap-v6.c
index 08763b8..b9447d3 100644
--- a/sys/arm/arm/pmap-v6.c
+++ b/sys/arm/arm/pmap-v6.c
@@ -4767,6 +4767,14 @@ pmap_is_modified(vm_page_t m)
 }
 
 /*
+ * This function is advisory.
+ */
+void
+pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
+{
+}
+
+/*
  * Clear the modify bits on the specified physical page.
  */
 void
diff --git a/sys/arm/arm/pmap.c b/sys/arm/arm/pmap.c
index f232b07..bc7912d 100644
--- a/sys/arm/arm/pmap.c
+++ b/sys/arm/arm/pmap.c
@@ -4516,6 +4516,14 @@ pmap_page_wired_mappings(vm_page_t m)
 }
 
 /*
+ * This function is advisory.
+ */
+void
+pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
+{
+}
+
+/*
  * pmap_ts_referenced:
  *
  *	Return the count of reference bits for a page, clearing all of them.
diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c
index 8fa9026..f09abee 100644
--- a/sys/i386/i386/pmap.c
+++ b/sys/i386/i386/pmap.c
@@ -4834,6 +4834,112 @@ out:
 }
 
 /*
+ * Apply the given advice to the specified range of addresses within the
+ * given pmap.  Depending on the advice, clear the referenced and/or
+ * modified flags in each mapping and set the mapped page's dirty field.
+ */
+void
+pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
+{
+	pd_entry_t oldpde, *pde;
+	pt_entry_t *pte;
+	vm_offset_t pdnxt;
+	vm_page_t m;
+	boolean_t anychanged, pv_lists_locked;
+
+	if (advice != MADV_DONTNEED && advice != MADV_FREE)
+		return;
+	if (pmap_is_current(pmap))
+		pv_lists_locked = FALSE;
+	else {
+		pv_lists_locked = TRUE;
+resume:
+		rw_wlock(&pvh_global_lock);
+		sched_pin();
+	}
+	anychanged = FALSE;
+	PMAP_LOCK(pmap);
+	for (; sva < eva; sva = pdnxt) {
+		pdnxt = (sva + NBPDR) & ~PDRMASK;
+		if (pdnxt < sva)
+			pdnxt = eva;
+		pde = pmap_pde(pmap, sva);
+		oldpde = *pde;
+		if ((oldpde & PG_V) == 0)
+			continue;
+		else if ((oldpde & PG_PS) != 0) {
+			if ((oldpde & PG_MANAGED) == 0)
+				continue;
+			if (!pv_lists_locked) {
+				pv_lists_locked = TRUE;
+				if (!rw_try_wlock(&pvh_global_lock)) {
+					if (anychanged)
+						pmap_invalidate_all(pmap);
+					PMAP_UNLOCK(pmap);
+					goto resume;
+				}
+				sched_pin();
+			}
+			if (!pmap_demote_pde(pmap, pde, sva)) {
+				/*
+				 * The large page mapping was destroyed.
+				 */
+				continue;
+			}
+
+			/*
+			 * Unless the page mappings are wired, remove the
+			 * mapping to a single page so that a subsequent
+			 * access may repromote.  Since the underlying page
+			 * table page is fully populated, this removal never
+			 * frees a page table page.
+			 */
+			if ((oldpde & PG_W) == 0) {
+				pte = pmap_pte_quick(pmap, sva);
+				KASSERT((*pte & PG_V) != 0,
+				    ("pmap_advise: invalid PTE"));
+				pmap_remove_pte(pmap, pte, sva, NULL);
+				anychanged = TRUE;
+			}
+		}
+		if (pdnxt > eva)
+			pdnxt = eva;
+		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
+		    sva += PAGE_SIZE) {
+			if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED |
+			    PG_V))
+				continue;
+			else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
+				if (advice == MADV_DONTNEED) {
+					/*
+					 * Future calls to pmap_is_modified()
+					 * can be avoided by making the page
+					 * dirty now.
+					 */
+					m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
+					vm_page_dirty(m);
+				}
+				atomic_clear_int((u_int *)pte, PG_M | PG_A);
+			} else if ((*pte & PG_A) != 0)
+				atomic_clear_int((u_int *)pte, PG_A);
+			else
+				continue;
+			if ((*pte & PG_G) != 0)
+				pmap_invalidate_page(pmap, sva);
+			else
+				anychanged = TRUE;
+		}
+	}
+	if (anychanged)
+		pmap_invalidate_all(pmap);
+	if (pv_lists_locked) {
+		sched_unpin();
+		rw_wunlock(&pvh_global_lock);
+	}
+	PMAP_UNLOCK(pmap);
+}
+
+/*
  * Clear the modify bits on the specified physical page.
  */
 void
diff --git a/sys/i386/xen/pmap.c b/sys/i386/xen/pmap.c
index c34fe29..3abe7ef 100644
--- a/sys/i386/xen/pmap.c
+++ b/sys/i386/xen/pmap.c
@@ -3914,6 +3914,72 @@ pmap_ts_referenced(vm_page_t m)
 }
 
 /*
+ * Apply the given advice to the specified range of addresses within the
+ * given pmap.  Depending on the advice, clear the referenced and/or
+ * modified flags in each mapping and set the mapped page's dirty field.
+ */
+void
+pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
+{
+	pd_entry_t oldpde;
+	pt_entry_t *pte;
+	vm_offset_t pdnxt;
+	vm_page_t m;
+	boolean_t anychanged;
+
+	if (advice != MADV_DONTNEED && advice != MADV_FREE)
+		return;
+	anychanged = FALSE;
+	rw_wlock(&pvh_global_lock);
+	sched_pin();
+	PMAP_LOCK(pmap);
+	for (; sva < eva; sva = pdnxt) {
+		pdnxt = (sva + NBPDR) & ~PDRMASK;
+		if (pdnxt < sva)
+			pdnxt = eva;
+		oldpde = pmap->pm_pdir[sva >> PDRSHIFT];
+		if ((oldpde & (PG_PS | PG_V)) != PG_V)
+			continue;
+		if (pdnxt > eva)
+			pdnxt = eva;
+		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
+		    sva += PAGE_SIZE) {
+			if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED |
+			    PG_V))
+				continue;
+			else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
+				if (advice == MADV_DONTNEED) {
+					/*
+					 * Future calls to pmap_is_modified()
+					 * can be avoided by making the page
+					 * dirty now.
+					 */
+					m = PHYS_TO_VM_PAGE(xpmap_mtop(*pte) &
+					    PG_FRAME);
+					vm_page_dirty(m);
+				}
+				PT_SET_VA_MA(pte, *pte & ~(PG_M | PG_A), TRUE);
+			} else if ((*pte & PG_A) != 0)
+				PT_SET_VA_MA(pte, *pte & ~PG_A, TRUE);
+			else
+				continue;
+			if ((*pte & PG_G) != 0)
+				pmap_invalidate_page(pmap, sva);
+			else
+				anychanged = TRUE;
+		}
+	}
+	PT_UPDATES_FLUSH();
+	if (*PMAP1)
+		PT_SET_VA_MA(PMAP1, 0, TRUE);
+	if (anychanged)
+		pmap_invalidate_all(pmap);
+	sched_unpin();
+	rw_wunlock(&pvh_global_lock);
+	PMAP_UNLOCK(pmap);
+}
+
+/*
  * Clear the modify bits on the specified physical page.
  */
 void
diff --git a/sys/ia64/ia64/pmap.c b/sys/ia64/ia64/pmap.c
index b23c77c..442149f 100644
--- a/sys/ia64/ia64/pmap.c
+++ b/sys/ia64/ia64/pmap.c
@@ -2310,6 +2310,50 @@ pmap_is_referenced(vm_page_t m)
 }
 
 /*
+ * Apply the given advice to the specified range of addresses within the
+ * given pmap.  Depending on the advice, clear the referenced and/or
+ * modified flags in each mapping and set the mapped page's dirty field.
+ */
+void
+pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
+{
+	struct ia64_lpte *pte;
+	pmap_t oldpmap;
+	vm_page_t m;
+
+	PMAP_LOCK(pmap);
+	oldpmap = pmap_switch(pmap);
+	for (; sva < eva; sva += PAGE_SIZE) {
+		/* If page is invalid, skip this page. */
+		pte = pmap_find_vhpt(sva);
+		if (pte == NULL)
+			continue;
+
+		/* If it isn't managed, skip it too. */
+		if (!pmap_managed(pte))
+			continue;
+
+		/* Clear its modified and referenced bits. */
+		if (pmap_dirty(pte)) {
+			if (advice == MADV_DONTNEED) {
+				/*
+				 * Future calls to pmap_is_modified() can be
+				 * avoided by making the page dirty now.
+				 */
+				m = PHYS_TO_VM_PAGE(pmap_ppn(pte));
+				vm_page_dirty(m);
+			}
+			pmap_clear_dirty(pte);
+		} else if (!pmap_accessed(pte))
+			continue;
+		pmap_clear_accessed(pte);
+		pmap_invalidate_page(sva);
+	}
+	pmap_switch(oldpmap);
+	PMAP_UNLOCK(pmap);
+}
+
+/*
  * Clear the modify bits on the specified physical page.
  */
 void
diff --git a/sys/mips/mips/pmap.c b/sys/mips/mips/pmap.c
index fb22a89..90994cc 100644
--- a/sys/mips/mips/pmap.c
+++ b/sys/mips/mips/pmap.c
@@ -2914,6 +2914,14 @@ pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
 }
 
 /*
+ * This function is advisory.
+ */
+void
+pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
+{
+}
+
+/*
  * Clear the modify bits on the specified physical page.
  */
 void
diff --git a/sys/powerpc/powerpc/mmu_if.m b/sys/powerpc/powerpc/mmu_if.m
index 0382bd8..f9f37cb 100644
--- a/sys/powerpc/powerpc/mmu_if.m
+++ b/sys/powerpc/powerpc/mmu_if.m
@@ -133,6 +133,25 @@ CODE {
 
 
 /**
+ * @brief Apply the given advice to the specified range of addresses within
+ * the given pmap.  Depending on the advice, clear the referenced and/or
+ * modified flags in each mapping and set the mapped page's dirty field.
+ *
+ * @param _pmap		physical map
+ * @param _start	virtual range start
+ * @param _end		virtual range end
+ * @param _advice	advice to apply
+ */
+METHOD void advise {
+	mmu_t		_mmu;
+	pmap_t		_pmap;
+	vm_offset_t	_start;
+	vm_offset_t	_end;
+	int		_advice;
+};
+
+
+/**
  * @brief Change the wiring attribute for the page in the given physical
  * map and virtual address.
  *
diff --git a/sys/powerpc/powerpc/pmap_dispatch.c b/sys/powerpc/powerpc/pmap_dispatch.c
index 7fd98f4..24e6076 100644
--- a/sys/powerpc/powerpc/pmap_dispatch.c
+++ b/sys/powerpc/powerpc/pmap_dispatch.c
@@ -91,6 +91,15 @@ RB_GENERATE(pvo_tree, pvo_entry, pvo_plink, pvo_vaddr_compare);
 
 
 void
+pmap_advise(pmap_t pmap, vm_offset_t start, vm_offset_t end, int advice)
+{
+
+	CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %d)", __func__, pmap, start, end,
+	    advice);
+	MMU_ADVISE(mmu_obj, pmap, start, end, advice);
+}
+
+void
 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
 {
 
diff --git a/sys/sparc64/sparc64/pmap.c b/sys/sparc64/sparc64/pmap.c
index 47f2c49..a84e4c3 100644
--- a/sys/sparc64/sparc64/pmap.c
+++ b/sys/sparc64/sparc64/pmap.c
@@ -2126,6 +2126,14 @@ pmap_is_referenced(vm_page_t m)
 	return (rv);
 }
 
+/*
+ * This function is advisory.
+ */
+void
+pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
+{
+}
+
 void
 pmap_clear_modify(vm_page_t m)
 {
diff --git a/sys/vm/pmap.h b/sys/vm/pmap.h
index c0f80a7..911298f 100644
--- a/sys/vm/pmap.h
+++ b/sys/vm/pmap.h
@@ -98,6 +98,8 @@ struct thread;
 extern vm_offset_t kernel_vm_end;
 
 void		 pmap_activate(struct thread *td);
+void		 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
+		    int advice);
 void		 pmap_align_superpage(vm_object_t, vm_ooffset_t, vm_offset_t *,
 		    vm_size_t);
 void		 pmap_change_wiring(pmap_t, vm_offset_t, boolean_t);
diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c
index 71c44ad..1be62af 100644
--- a/sys/vm/vm_map.c
+++ b/sys/vm/vm_map.c
@@ -2125,7 +2125,7 @@ vm_map_madvise(
 		    (current != &map->header) && (current->start < end);
 		    current = current->next
 		) {
-			vm_offset_t useStart;
+			vm_offset_t useEnd, useStart;
 
 			if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
 				continue;
@@ -2133,17 +2133,34 @@ vm_map_madvise(
 			pstart = OFF_TO_IDX(current->offset);
 			pend = pstart + atop(current->end - current->start);
 			useStart = current->start;
+			useEnd = current->end;
 
 			if (current->start < start) {
 				pstart += atop(start - current->start);
 				useStart = start;
 			}
-			if (current->end > end)
+			if (current->end > end) {
 				pend -= atop(current->end - end);
+				useEnd = end;
+			}
 
 			if (pstart >= pend)
 				continue;
 
+			/*
+			 * Perform the pmap_advise() before clearing
+			 * PGA_REFERENCED in vm_page_advise().  Otherwise, a
+			 * concurrent pmap operation, such as pmap_remove(),
+			 * could clear a reference in the pmap and set
+			 * PGA_REFERENCED on the page before the pmap_advise()
+			 * had completed.  Consequently, the page would appear
+			 * referenced based upon an old reference that
+			 * occurred before this pmap_advise() ran.
+			 */
+			if (behav == MADV_DONTNEED || behav == MADV_FREE)
+				pmap_advise(map->pmap, useStart, useEnd,
+				    behav);
+
 			vm_object_madvise(current->object.vm_object, pstart,
 			    pend, behav);
 			if (behav == MADV_WILLNEED) {
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index 7d8ecfa..7b4b57c 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -2634,7 +2634,6 @@ vm_page_advise(vm_page_t m, int advice)
 		 * But we do make the page is freeable as we can without
 		 * actually taking the step of unmapping it.
 		 */
-		pmap_clear_modify(m);
 		m->dirty = 0;
 		m->act_count = 0;
 	} else if (advice != MADV_DONTNEED)
@@ -2654,15 +2653,7 @@ vm_page_advise(vm_page_t m, int advice)
 
 	/*
 	 * Clear any references to the page.  Otherwise, the page daemon will
 	 * immediately reactivate the page.
-	 *
-	 * Perform the pmap_clear_reference() first.  Otherwise, a concurrent
-	 * pmap operation, such as pmap_remove(), could clear a reference in
-	 * the pmap and set PGA_REFERENCED on the page before the
-	 * pmap_clear_reference() had completed.  Consequently, the page would
-	 * appear referenced based upon an old reference that occurred before
-	 * this function ran.
 	 */
-	pmap_clear_reference(m);
 	vm_page_aflag_clear(m, PGA_REFERENCED);
 	if (advice != MADV_FREE && m->dirty == 0 && pmap_is_modified(m))
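
The buildworld figure quoted above aggregates many effects. As a rough, hypothetical way to observe the specific path this commit optimizes, a userland microbenchmark along these lines would repeatedly dirty a range and then call madvise(2) on it, which after this change is served by a single ranged pmap_advise() per call; the iteration count, range size, and printed metric are arbitrary choices, and results are machine-dependent:

```c
#include <sys/mman.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

#define	NITER	10000

int
main(void)
{
	size_t len = 64 * 4096;		/* 64 pages */
	struct timespec t0, t1;
	double ns;
	char *p;
	int i;

	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE,
	    -1, 0);
	if (p == MAP_FAILED)
		return (1);

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (i = 0; i < NITER; i++) {
		memset(p, i, len);		/* fault and dirty the range */
		madvise(p, len, MADV_FREE);	/* one ranged pmap_advise() */
	}
	clock_gettime(CLOCK_MONOTONIC, &t1);

	ns = (t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec);
	printf("%.0f ns per iteration\n", ns / NITER);
	munmap(p, len);
	return (0);
}
```

Before this commit, each such madvise(2) call cost one pmap lock acquisition and page table walk per resident page; afterwards, the locking and traversal are amortized over the whole range.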