author		attilio <attilio@FreeBSD.org>	2013-08-09 11:28:55 +0000
committer	attilio <attilio@FreeBSD.org>	2013-08-09 11:28:55 +0000
commit		e9f37cac7422f86c8a65b4c123705f5dccd43fa1 (patch)
tree		589f2433c8a0e985a4f0aeb058fbbf1b412b6f98
parent		3f74b0e634cf4f4b3796e44533e8318ef773c3e9 (diff)
On all architectures, avoid preallocating the physical memory for the
nodes used in vm_radix. On architectures supporting direct mapping,
also avoid preallocating the KVA for such nodes.

To do so, make the operations derived from vm_radix_insert() able to
fail, and handle every failure that can now arise in their callers.

On the vm_radix side, introduce a new function, vm_radix_replace(),
which can replace an already present leaf node with a new one, and
account for the possibility that operations on the radix trie recurse
during the node allocation in vm_radix_insert(). If such a recursion
occurs, vm_radix_insert() starts over from scratch.

Sponsored by:	EMC / Isilon storage division
Reviewed by:	alc (older version)
Reviewed by:	jeff
Tested by:	pho, scottl
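The central consequence of the change is that vm_page_insert() and the
operations built on it now return an error instead of relying on
preallocated radix nodes. A minimal sketch of the retry pattern most
callers in this patch adopt ("obj", "m" and "pindex" are placeholder
names, not identifiers from the patch; VM_WAIT comes from
<vm/vm_pageout.h>, as the include hunks below show):

	/*
	 * Sketch only: if the radix trie cannot allocate a node, back
	 * out, wait for the page daemon to reclaim memory, and retry.
	 */
retry:
	VM_OBJECT_WLOCK(obj);
	if (vm_page_insert(m, obj, pindex) != 0) {
		VM_OBJECT_WUNLOCK(obj);
		VM_WAIT;	/* sleep until free pages are available */
		goto retry;
	}
	/* ... operate on the now-resident page ... */
	VM_OBJECT_WUNLOCK(obj);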
-rw-r--r--	sys/amd64/amd64/pmap.c		14
-rw-r--r--	sys/dev/drm2/i915/i915_gem.c	11
-rw-r--r--	sys/dev/drm2/ttm/ttm_bo_vm.c	14
-rw-r--r--	sys/i386/i386/pmap.c		14
-rw-r--r--	sys/kern/subr_uio.c		8
-rw-r--r--	sys/vm/_vm_radix.h		4
-rw-r--r--	sys/vm/device_pager.c		3
-rw-r--r--	sys/vm/sg_pager.c		4
-rw-r--r--	sys/vm/vm_fault.c		8
-rw-r--r--	sys/vm/vm_object.c		76
-rw-r--r--	sys/vm/vm_object.h		2
-rw-r--r--	sys/vm/vm_page.c		243
-rw-r--r--	sys/vm/vm_page.h		6
-rw-r--r--	sys/vm/vm_radix.c		174
-rw-r--r--	sys/vm/vm_radix.h		4
15 files changed, 456 insertions, 129 deletions
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 32dbe8a..ef267f5 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -283,7 +283,7 @@ static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
-static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
+static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
@@ -1526,12 +1526,12 @@ pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO)
* for mapping a distinct range of virtual addresses. The pmap's collection is
* ordered by this virtual address range.
*/
-static __inline void
+static __inline int
pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
{
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
- vm_radix_insert(&pmap->pm_root, mpte);
+ return (vm_radix_insert(&pmap->pm_root, mpte));
}
/*
@@ -3439,7 +3439,13 @@ setpte:
("pmap_promote_pde: page table page is out of range"));
KASSERT(mpte->pindex == pmap_pde_pindex(va),
("pmap_promote_pde: page table page's pindex is wrong"));
- pmap_insert_pt_page(pmap, mpte);
+ if (pmap_insert_pt_page(pmap, mpte)) {
+ atomic_add_long(&pmap_pde_p_failures, 1);
+ CTR2(KTR_PMAP,
+ "pmap_promote_pde: failure for va %#lx in pmap %p", va,
+ pmap);
+ return;
+ }
/*
* Promote the pv entries.
diff --git a/sys/dev/drm2/i915/i915_gem.c b/sys/dev/drm2/i915/i915_gem.c
index a4276a2..b2edf1a 100644
--- a/sys/dev/drm2/i915/i915_gem.c
+++ b/sys/dev/drm2/i915/i915_gem.c
@@ -64,6 +64,9 @@ __FBSDID("$FreeBSD$");
#include <sys/sched.h>
#include <sys/sf_buf.h>
+#include <vm/vm.h>
+#include <vm/vm_pageout.h>
+
static void i915_gem_object_flush_cpu_write_domain(
struct drm_i915_gem_object *obj);
static uint32_t i915_gem_get_gtt_size(struct drm_device *dev, uint32_t size,
@@ -1443,8 +1446,14 @@ retry:
vm_page_busy_sleep(m, "915pbs");
goto retry;
}
+ if (vm_page_insert(m, vm_obj, OFF_TO_IDX(offset))) {
+ DRM_UNLOCK(dev);
+ VM_OBJECT_WUNLOCK(vm_obj);
+ VM_WAIT;
+ VM_OBJECT_WLOCK(vm_obj);
+ goto retry;
+ }
m->valid = VM_PAGE_BITS_ALL;
- vm_page_insert(m, vm_obj, OFF_TO_IDX(offset));
have_page:
*mres = m;
vm_page_xbusy(m);
diff --git a/sys/dev/drm2/ttm/ttm_bo_vm.c b/sys/dev/drm2/ttm/ttm_bo_vm.c
index 0faf868..366a776 100644
--- a/sys/dev/drm2/ttm/ttm_bo_vm.c
+++ b/sys/dev/drm2/ttm/ttm_bo_vm.c
@@ -47,6 +47,7 @@ __FBSDID("$FreeBSD$");
#include <vm/vm.h>
#include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
#define TTM_BO_VM_NUM_PREFAULT 16
@@ -221,16 +222,23 @@ reserve:
ttm_bo_unreserve(bo);
goto retry;
}
- m->valid = VM_PAGE_BITS_ALL;
- *mres = m;
m1 = vm_page_lookup(vm_obj, OFF_TO_IDX(offset));
if (m1 == NULL) {
- vm_page_insert(m, vm_obj, OFF_TO_IDX(offset));
+ if (vm_page_insert(m, vm_obj, OFF_TO_IDX(offset))) {
+ VM_OBJECT_WUNLOCK(vm_obj);
+ VM_WAIT;
+ VM_OBJECT_WLOCK(vm_obj);
+ ttm_mem_io_unlock(man);
+ ttm_bo_unreserve(bo);
+ goto retry;
+ }
} else {
KASSERT(m == m1,
("inconsistent insert bo %p m %p m1 %p offset %jx",
bo, m, m1, (uintmax_t)offset));
}
+ m->valid = VM_PAGE_BITS_ALL;
+ *mres = m;
vm_page_xbusy(m);
if (oldm != NULL) {
diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c
index f657eec..49f3ee7 100644
--- a/sys/i386/i386/pmap.c
+++ b/sys/i386/i386/pmap.c
@@ -304,7 +304,7 @@ static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
vm_page_t m, vm_prot_t prot, vm_page_t mpte);
static void pmap_flush_page(vm_page_t m);
-static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
+static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
@@ -1604,12 +1604,12 @@ pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO)
* for mapping a distinct range of virtual addresses. The pmap's collection is
* ordered by this virtual address range.
*/
-static __inline void
+static __inline int
pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
{
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
- vm_radix_insert(&pmap->pm_root, mpte);
+ return (vm_radix_insert(&pmap->pm_root, mpte));
}
/*
@@ -3401,7 +3401,13 @@ setpte:
("pmap_promote_pde: page table page is out of range"));
KASSERT(mpte->pindex == va >> PDRSHIFT,
("pmap_promote_pde: page table page's pindex is wrong"));
- pmap_insert_pt_page(pmap, mpte);
+ if (pmap_insert_pt_page(pmap, mpte)) {
+ pmap_pde_p_failures++;
+ CTR2(KTR_PMAP,
+ "pmap_promote_pde: failure for va %#x in pmap %p", va,
+ pmap);
+ return;
+ }
/*
* Promote the pv entries.
diff --git a/sys/kern/subr_uio.c b/sys/kern/subr_uio.c
index bc7ba98..53f87c0 100644
--- a/sys/kern/subr_uio.c
+++ b/sys/kern/subr_uio.c
@@ -56,6 +56,7 @@ __FBSDID("$FreeBSD$");
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
#include <vm/vm_map.h>
#ifdef SOCKET_SEND_COW
#include <vm/vm_object.h>
@@ -122,7 +123,12 @@ retry:
if (uobject->backing_object != NULL)
pmap_remove(map->pmap, uaddr, uaddr + PAGE_SIZE);
}
- vm_page_insert(kern_pg, uobject, upindex);
+ if (vm_page_insert(kern_pg, uobject, upindex)) {
+ VM_OBJECT_WUNLOCK(uobject);
+ VM_WAIT;
+ VM_OBJECT_WLOCK(uobject);
+ goto retry;
+ }
vm_page_dirty(kern_pg);
VM_OBJECT_WUNLOCK(uobject);
vm_map_lookup_done(map, entry);
diff --git a/sys/vm/_vm_radix.h b/sys/vm/_vm_radix.h
index f066462..1d06d0a 100644
--- a/sys/vm/_vm_radix.h
+++ b/sys/vm/_vm_radix.h
@@ -36,8 +36,12 @@
*/
struct vm_radix {
uintptr_t rt_root;
+ uint8_t rt_flags;
};
+#define RT_INSERT_INPROG 0x01
+#define RT_TRIE_MODIFIED 0x02
+
#ifdef _KERNEL
static __inline boolean_t
diff --git a/sys/vm/device_pager.c b/sys/vm/device_pager.c
index fd20664..ba193e9 100644
--- a/sys/vm/device_pager.c
+++ b/sys/vm/device_pager.c
@@ -348,11 +348,12 @@ old_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, int prot,
*/
page = vm_page_getfake(paddr, memattr);
VM_OBJECT_WLOCK(object);
+ if (vm_page_replace(page, object, (*mres)->pindex) != *mres)
+ panic("old_dev_pager_fault: invalid page replacement");
vm_page_lock(*mres);
vm_page_free(*mres);
vm_page_unlock(*mres);
*mres = page;
- vm_page_insert(page, object, pidx);
}
page->valid = VM_PAGE_BITS_ALL;
return (VM_PAGER_OK);
diff --git a/sys/vm/sg_pager.c b/sys/vm/sg_pager.c
index 76cae68..2b3f117 100644
--- a/sys/vm/sg_pager.c
+++ b/sys/vm/sg_pager.c
@@ -186,11 +186,13 @@ sg_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage)
/* Free the original pages and insert this fake page into the object. */
for (i = 0; i < count; i++) {
+ if (i == reqpage &&
+ vm_page_replace(page, object, offset) != m[i])
+ panic("sg_pager_getpages: invalid place replacement");
vm_page_lock(m[i]);
vm_page_free(m[i]);
vm_page_unlock(m[i]);
}
- vm_page_insert(page, object, offset);
m[reqpage] = page;
page->valid = VM_PAGE_BITS_ALL;
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index 341932a..85eeca1 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -752,9 +752,11 @@ vnode_locked:
* process'es object. The page is
* automatically made dirty.
*/
- vm_page_lock(fs.m);
- vm_page_rename(fs.m, fs.first_object, fs.first_pindex);
- vm_page_unlock(fs.m);
+ if (vm_page_rename(fs.m, fs.first_object,
+ fs.first_pindex)) {
+ unlock_and_deallocate(&fs);
+ goto RetryFault;
+ }
vm_page_xbusy(fs.m);
fs.first_m = fs.m;
fs.m = NULL;
diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c
index 74e580f..1e22cd3 100644
--- a/sys/vm/vm_object.c
+++ b/sys/vm/vm_object.c
@@ -201,10 +201,12 @@ vm_object_zinit(void *mem, int size, int flags)
/* These are true for any object that has been freed */
object->rtree.rt_root = 0;
+ object->rtree.rt_flags = 0;
object->paging_in_progress = 0;
object->resident_page_count = 0;
object->shadow_count = 0;
object->cache.rt_root = 0;
+ object->cache.rt_flags = 0;
return (0);
}
@@ -1351,6 +1353,16 @@ retry:
VM_OBJECT_WLOCK(new_object);
goto retry;
}
+
+ /* vm_page_rename() will handle dirty and cache. */
+ if (vm_page_rename(m, new_object, idx)) {
+ VM_OBJECT_WUNLOCK(new_object);
+ VM_OBJECT_WUNLOCK(orig_object);
+ VM_WAIT;
+ VM_OBJECT_WLOCK(orig_object);
+ VM_OBJECT_WLOCK(new_object);
+ goto retry;
+ }
#if VM_NRESERVLEVEL > 0
/*
* If some of the reservation's allocated pages remain with
@@ -1366,10 +1378,6 @@ retry:
*/
vm_reserv_rename(m, new_object, orig_object, offidxstart);
#endif
- vm_page_lock(m);
- vm_page_rename(m, new_object, idx);
- vm_page_unlock(m);
- /* page automatically made dirty by rename and cache handled */
if (orig_object->type == OBJT_SWAP)
vm_page_xbusy(m);
}
@@ -1525,21 +1533,14 @@ vm_object_backing_scan(vm_object_t object, int op)
("vm_object_backing_scan: object mismatch")
);
- /*
- * Destroy any associated swap
- */
- if (backing_object->type == OBJT_SWAP) {
- swap_pager_freespace(
- backing_object,
- p->pindex,
- 1
- );
- }
-
if (
p->pindex < backing_offset_index ||
new_pindex >= object->size
) {
+ if (backing_object->type == OBJT_SWAP)
+ swap_pager_freespace(backing_object,
+ p->pindex, 1);
+
/*
* Page is out of the parent object's range, we
* can simply destroy it.
@@ -1561,6 +1562,10 @@ vm_object_backing_scan(vm_object_t object, int op)
(op & OBSC_COLLAPSE_NOWAIT) != 0 &&
(pp != NULL && pp->valid == 0)
) {
+ if (backing_object->type == OBJT_SWAP)
+ swap_pager_freespace(backing_object,
+ p->pindex, 1);
+
/*
* The page in the parent is not (yet) valid.
* We don't know anything about the state of
@@ -1579,6 +1584,10 @@ vm_object_backing_scan(vm_object_t object, int op)
pp != NULL ||
vm_pager_has_page(object, new_pindex, NULL, NULL)
) {
+ if (backing_object->type == OBJT_SWAP)
+ swap_pager_freespace(backing_object,
+ p->pindex, 1);
+
/*
* page already exists in parent OR swap exists
* for this location in the parent. Destroy
@@ -1598,25 +1607,38 @@ vm_object_backing_scan(vm_object_t object, int op)
continue;
}
-#if VM_NRESERVLEVEL > 0
- /*
- * Rename the reservation.
- */
- vm_reserv_rename(p, object, backing_object,
- backing_offset_index);
-#endif
-
/*
* Page does not exist in parent, rename the
* page from the backing object to the main object.
*
* If the page was mapped to a process, it can remain
* mapped through the rename.
+ * vm_page_rename() will handle dirty and cache.
+ */
+ if (vm_page_rename(p, object, new_pindex)) {
+ if (op & OBSC_COLLAPSE_NOWAIT) {
+ p = next;
+ continue;
+ }
+ VM_OBJECT_WUNLOCK(backing_object);
+ VM_OBJECT_WUNLOCK(object);
+ VM_WAIT;
+ VM_OBJECT_WLOCK(object);
+ VM_OBJECT_WLOCK(backing_object);
+ p = TAILQ_FIRST(&backing_object->memq);
+ continue;
+ }
+ if (backing_object->type == OBJT_SWAP)
+ swap_pager_freespace(backing_object, p->pindex,
+ 1);
+
+#if VM_NRESERVLEVEL > 0
+ /*
+ * Rename the reservation.
*/
- vm_page_lock(p);
- vm_page_rename(p, object, new_pindex);
- vm_page_unlock(p);
- /* page automatically made dirty by rename */
+ vm_reserv_rename(p, object, backing_object,
+ backing_offset_index);
+#endif
}
p = next;
}
diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h
index e083b72..d59a9e6 100644
--- a/sys/vm/vm_object.h
+++ b/sys/vm/vm_object.h
@@ -102,7 +102,7 @@ struct vm_object {
TAILQ_ENTRY(vm_object) object_list; /* list of all objects */
LIST_HEAD(, vm_object) shadow_head; /* objects that this is a shadow for */
LIST_ENTRY(vm_object) shadow_list; /* chain of shadow objects */
- TAILQ_HEAD(, vm_page) memq; /* list of resident pages */
+ TAILQ_HEAD(respgs, vm_page) memq; /* list of resident pages */
struct vm_radix rtree; /* root of the resident page radix trie*/
vm_pindex_t size; /* Object size */
int generation; /* generation ID */
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index 6fccf45..42adc11 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -145,11 +145,14 @@ SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD,
static uma_zone_t fakepg_zone;
static struct vnode *vm_page_alloc_init(vm_page_t m);
+static void vm_page_cache_turn_free(vm_page_t m);
static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
static void vm_page_enqueue(int queue, vm_page_t m);
static void vm_page_init_fakepg(void *dummy);
-static void vm_page_insert_after(vm_page_t m, vm_object_t object,
+static int vm_page_insert_after(vm_page_t m, vm_object_t object,
vm_pindex_t pindex, vm_page_t mpred);
+static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object,
+ vm_page_t mpred);
SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init_fakepg, NULL);
@@ -930,14 +933,14 @@ vm_page_dirty_KBI(vm_page_t m)
*
* The object must be locked.
*/
-void
+int
vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
{
vm_page_t mpred;
VM_OBJECT_ASSERT_WLOCKED(object);
mpred = vm_radix_lookup_le(&object->rtree, pindex);
- vm_page_insert_after(m, object, pindex, mpred);
+ return (vm_page_insert_after(m, object, pindex, mpred));
}
/*
@@ -950,10 +953,12 @@ vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
*
* The object must be locked.
*/
-static void
+static int
vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex,
vm_page_t mpred)
{
+ vm_pindex_t sidx;
+ vm_object_t sobj;
vm_page_t msucc;
VM_OBJECT_ASSERT_WLOCKED(object);
@@ -975,17 +980,53 @@ vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex,
/*
* Record the object/offset pair in this page
*/
+ sobj = m->object;
+ sidx = m->pindex;
m->object = object;
m->pindex = pindex;
/*
* Now link into the object's ordered list of backed pages.
*/
+ if (vm_radix_insert(&object->rtree, m)) {
+ m->object = sobj;
+ m->pindex = sidx;
+ return (1);
+ }
+ vm_page_insert_radixdone(m, object, mpred);
+ return (0);
+}
+
+/*
+ * vm_page_insert_radixdone:
+ *
+ * Complete page "m" insertion into the specified object after the
+ * radix trie hooking.
+ *
+ * The page "mpred" must precede the offset "m->pindex" within the
+ * specified object.
+ *
+ * The object must be locked.
+ */
+static void
+vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred)
+{
+
+ VM_OBJECT_ASSERT_WLOCKED(object);
+ KASSERT(object != NULL && m->object == object,
+ ("vm_page_insert_radixdone: page %p has inconsistent object", m));
+ if (mpred != NULL) {
+ KASSERT(mpred->object == object ||
+ (mpred->flags & PG_SLAB) != 0,
+ ("vm_page_insert_after: object doesn't contain mpred"));
+ KASSERT(mpred->pindex < m->pindex,
+ ("vm_page_insert_after: mpred doesn't precede pindex"));
+ }
+
if (mpred != NULL)
TAILQ_INSERT_AFTER(&object->memq, mpred, m, listq);
else
TAILQ_INSERT_HEAD(&object->memq, m, listq);
- vm_radix_insert(&object->rtree, m);
/*
* Show that the object has one more resident page.
@@ -1131,6 +1172,54 @@ vm_page_prev(vm_page_t m)
}
/*
+ * Uses the page mnew as a replacement for an existing page at index
+ * pindex, which must already be present in the object.
+ */
+vm_page_t
+vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex)
+{
+ vm_page_t mold, mpred;
+
+ VM_OBJECT_ASSERT_WLOCKED(object);
+
+ /*
+ * This function mostly follows vm_page_insert() and
+ * vm_page_remove() without the radix, object count and vnode
+ * dance. Double check such functions for more comments.
+ */
+ mpred = vm_radix_lookup(&object->rtree, pindex);
+ KASSERT(mpred != NULL,
+ ("vm_page_replace: replacing page not present with pindex"));
+ mpred = TAILQ_PREV(mpred, respgs, listq);
+ if (mpred != NULL)
+ KASSERT(mpred->pindex < pindex,
+ ("vm_page_insert_after: mpred doesn't precede pindex"));
+
+ mnew->object = object;
+ mnew->pindex = pindex;
+ mold = vm_radix_replace(&object->rtree, mnew, pindex);
+
+ /* Detach the old page from the resident tailq. */
+ TAILQ_REMOVE(&object->memq, mold, listq);
+ vm_page_lock(mold);
+ if (mold->oflags & VPO_BUSY) {
+ mold->oflags &= ~VPO_BUSY;
+ vm_page_flash(mold);
+ }
+ mold->object = NULL;
+ vm_page_unlock(mold);
+
+ /* Insert the new page in the resident tailq. */
+ if (mpred != NULL)
+ TAILQ_INSERT_AFTER(&object->memq, mpred, mnew, listq);
+ else
+ TAILQ_INSERT_HEAD(&object->memq, mnew, listq);
+ if (pmap_page_is_write_mapped(mnew))
+ vm_object_set_writeable_dirty(object);
+ return (mold);
+}
+
+/*
* vm_page_rename:
*
* Move the given memory entry from its
@@ -1148,15 +1237,47 @@ vm_page_prev(vm_page_t m)
* or vm_page_dirty() will panic. Dirty pages are not allowed
* on the cache.
*
- * The objects must be locked. The page must be locked if it is managed.
+ * The objects must be locked.
*/
-void
+int
vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
{
+ vm_page_t mpred;
+ vm_pindex_t opidx;
+ VM_OBJECT_ASSERT_WLOCKED(new_object);
+
+ mpred = vm_radix_lookup_le(&new_object->rtree, new_pindex);
+ KASSERT(mpred == NULL || mpred->pindex != new_pindex,
+ ("vm_page_rename: pindex already renamed"));
+
+ /*
+ * Create a custom version of vm_page_insert() which does not depend
+ * on m_prev and can cheat on the implementation aspects of the
+ * function.
+ */
+ opidx = m->pindex;
+ m->pindex = new_pindex;
+ if (vm_radix_insert(&new_object->rtree, m)) {
+ m->pindex = opidx;
+ return (1);
+ }
+
+ /*
+ * The operation cannot fail anymore. The removal must happen before
+ * the listq iterator is tainted.
+ */
+ m->pindex = opidx;
+ vm_page_lock(m);
vm_page_remove(m);
- vm_page_insert(m, new_object, new_pindex);
+
+ /* Return back to the new pindex to complete vm_page_insert(). */
+ m->pindex = new_pindex;
+ m->object = new_object;
+ vm_page_unlock(m);
+ vm_page_insert_radixdone(m, new_object, mpred);
vm_page_dirty(m);
+ return (0);
}
/*
@@ -1182,14 +1303,7 @@ vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
if (end != 0 && m->pindex >= end)
break;
vm_radix_remove(&object->cache, m->pindex);
- m->object = NULL;
- m->valid = 0;
- /* Clear PG_CACHED and set PG_FREE. */
- m->flags ^= PG_CACHED | PG_FREE;
- KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE,
- ("vm_page_cache_free: page %p has inconsistent flags", m));
- cnt.v_cache_count--;
- vm_phys_freecnt_adj(m, 1);
+ vm_page_cache_turn_free(m);
}
empty = vm_radix_is_empty(&object->cache);
mtx_unlock(&vm_page_queue_free_mtx);
@@ -1269,7 +1383,8 @@ vm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart,
/* Update the page's object and offset. */
m->object = new_object;
m->pindex -= offidxstart;
- vm_radix_insert(&new_object->cache, m);
+ if (vm_radix_insert(&new_object->cache, m))
+ vm_page_cache_turn_free(m);
}
mtx_unlock(&vm_page_queue_free_mtx);
}
@@ -1361,7 +1476,13 @@ vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
KASSERT(mpred == NULL || mpred->pindex != pindex,
("vm_page_alloc: pindex already allocated"));
}
- mtx_lock(&vm_page_queue_free_mtx);
+
+ /*
+ * The page allocation request can come from consumers that already
+ * hold the free page queue mutex, like vm_page_insert() in
+ * vm_page_cache().
+ */
+ mtx_lock_flags(&vm_page_queue_free_mtx, MTX_RECURSE);
if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved ||
(req_class == VM_ALLOC_SYSTEM &&
cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) ||
@@ -1486,11 +1607,20 @@ vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
m->act_count = 0;
if (object != NULL) {
+ if (vm_page_insert_after(m, object, pindex, mpred)) {
+ /* See the comment below about hold count. */
+ if (vp != NULL)
+ vdrop(vp);
+ pagedaemon_wakeup();
+ m->object = NULL;
+ vm_page_free(m);
+ return (NULL);
+ }
+
/* Ignore device objects; the pager sets "memattr" for them. */
if (object->memattr != VM_MEMATTR_DEFAULT &&
(object->flags & OBJ_FICTITIOUS) == 0)
pmap_page_set_memattr(m, object->memattr);
- vm_page_insert_after(m, object, pindex, mpred);
} else
m->pindex = pindex;
@@ -1557,7 +1687,7 @@ vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
vm_paddr_t boundary, vm_memattr_t memattr)
{
struct vnode *drop;
- vm_page_t deferred_vdrop_list, m, m_ret;
+ vm_page_t deferred_vdrop_list, m, m_tmp, m_ret;
u_int flags, oflags;
int req_class;
@@ -1660,12 +1790,29 @@ retry:
m->wire_count = 1;
/* Unmanaged pages don't use "act_count". */
m->oflags = oflags;
+ if (object != NULL) {
+ if (vm_page_insert(m, object, pindex)) {
+ while (deferred_vdrop_list != NULL) {
+ vdrop((struct vnode *)deferred_vdrop_list->pageq.tqe_prev);
+ deferred_vdrop_list =
+ deferred_vdrop_list->pageq.tqe_next;
+ }
+ if (vm_paging_needed())
+ pagedaemon_wakeup();
+ for (m = m_ret, m_tmp = m_ret;
+ m < &m_ret[npages]; m++) {
+ if (m_tmp < m)
+ m_tmp++;
+ else
+ m->object = NULL;
+ vm_page_free(m);
+ }
+ return (NULL);
+ }
+ } else
+ m->pindex = pindex;
if (memattr != VM_MEMATTR_DEFAULT)
pmap_page_set_memattr(m, memattr);
- if (object != NULL)
- vm_page_insert(m, object, pindex);
- else
- m->pindex = pindex;
pindex++;
}
while (deferred_vdrop_list != NULL) {
@@ -2042,6 +2189,28 @@ vm_page_free_wakeup(void)
}
/*
+ * Turn a cached page into a free page, by changing its attributes.
+ * Keep the statistics up-to-date.
+ *
+ * The free page queue must be locked.
+ */
+static void
+vm_page_cache_turn_free(vm_page_t m)
+{
+
+ mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+
+ m->object = NULL;
+ m->valid = 0;
+ /* Clear PG_CACHED and set PG_FREE. */
+ m->flags ^= PG_CACHED | PG_FREE;
+ KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE,
+ ("vm_page_cache_free: page %p has inconsistent flags", m));
+ cnt.v_cache_count--;
+ vm_phys_freecnt_adj(m, 1);
+}
+
+/*
* vm_page_free_toq:
*
* Returns the given page to the free list,
@@ -2343,7 +2512,6 @@ vm_page_cache(vm_page_t m)
}
KASSERT((m->flags & PG_CACHED) == 0,
("vm_page_cache: page %p is already cached", m));
- PCPU_INC(cnt.v_tcached);
/*
* Remove the page from the paging queues.
@@ -2370,10 +2538,18 @@ vm_page_cache(vm_page_t m)
*/
m->flags &= ~PG_ZERO;
mtx_lock(&vm_page_queue_free_mtx);
+ cache_was_empty = vm_radix_is_empty(&object->cache);
+ if (vm_radix_insert(&object->cache, m)) {
+ mtx_unlock(&vm_page_queue_free_mtx);
+ if (object->resident_page_count == 0)
+ vdrop(object->handle);
+ m->object = NULL;
+ vm_page_free(m);
+ return;
+ }
m->flags |= PG_CACHED;
cnt.v_cache_count++;
- cache_was_empty = vm_radix_is_empty(&object->cache);
- vm_radix_insert(&object->cache, m);
+ PCPU_INC(cnt.v_tcached);
#if VM_NRESERVLEVEL > 0
if (!vm_reserv_free_page(m)) {
#else
@@ -2946,11 +3122,8 @@ vm_page_cowfault(vm_page_t m)
pindex = m->pindex;
retry_alloc:
- pmap_remove_all(m);
- vm_page_remove(m);
- mnew = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY);
+ mnew = vm_page_alloc(NULL, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ);
if (mnew == NULL) {
- vm_page_insert(m, object, pindex);
vm_page_unlock(m);
VM_OBJECT_WUNLOCK(object);
VM_WAIT;
@@ -2976,8 +3149,14 @@ vm_page_cowfault(vm_page_t m)
vm_page_lock(mnew);
vm_page_free(mnew);
vm_page_unlock(mnew);
- vm_page_insert(m, object, pindex);
} else { /* clear COW & copy page */
+ pmap_remove_all(m);
+ mnew->object = object;
+ if (object->memattr != VM_MEMATTR_DEFAULT &&
+ (object->flags & OBJ_FICTITIOUS) == 0)
+ pmap_page_set_memattr(mnew, object->memattr);
+ if (vm_page_replace(mnew, object, pindex) != m)
+ panic("vm_page_cowfault: invalid page replacement");
if (!so_zerocp_fullpage)
pmap_copy_page(m, mnew);
mnew->valid = VM_PAGE_BITS_ALL;
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index 53cf449..01c4967 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -438,7 +438,7 @@ void vm_page_dequeue_locked(vm_page_t m);
vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t);
vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr);
void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr);
-void vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t);
+int vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t);
boolean_t vm_page_is_cached(vm_object_t object, vm_pindex_t pindex);
vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t);
vm_page_t vm_page_next(vm_page_t m);
@@ -449,7 +449,9 @@ void vm_page_putfake(vm_page_t m);
void vm_page_readahead_finish(vm_page_t m);
void vm_page_reference(vm_page_t m);
void vm_page_remove (vm_page_t);
-void vm_page_rename (vm_page_t, vm_object_t, vm_pindex_t);
+int vm_page_rename (vm_page_t, vm_object_t, vm_pindex_t);
+vm_page_t vm_page_replace(vm_page_t mnew, vm_object_t object,
+ vm_pindex_t pindex);
void vm_page_requeue(vm_page_t m);
void vm_page_requeue_locked(vm_page_t m);
int vm_page_sbusied(vm_page_t m);
diff --git a/sys/vm/vm_radix.c b/sys/vm/vm_radix.c
index 113a226..8698738 100644
--- a/sys/vm/vm_radix.c
+++ b/sys/vm/vm_radix.c
@@ -103,30 +103,16 @@ struct vm_radix_node {
static uma_zone_t vm_radix_node_zone;
/*
- * Allocate a radix node. Pre-allocation should ensure that the request
- * will always be satisfied.
+ * Allocate a radix node.
*/
static __inline struct vm_radix_node *
vm_radix_node_get(vm_pindex_t owner, uint16_t count, uint16_t clevel)
{
struct vm_radix_node *rnode;
- rnode = uma_zalloc(vm_radix_node_zone, M_NOWAIT);
-
- /*
- * The required number of nodes should already be pre-allocated
- * by vm_radix_prealloc(). However, UMA can hold a few nodes
- * in per-CPU buckets, which will not be accessible by the
- * current CPU. Thus, the allocation could return NULL when
- * the pre-allocated pool is close to exhaustion. Anyway,
- * in practice this should never occur because a new node
- * is not always required for insert. Thus, the pre-allocated
- * pool should have some extra pages that prevent this from
- * becoming a problem.
- */
+ rnode = uma_zalloc(vm_radix_node_zone, M_NOWAIT | M_ZERO);
if (rnode == NULL)
- panic("%s: uma_zalloc() returned NULL for a new node",
- __func__);
+ return (NULL);
rnode->rn_owner = owner;
rnode->rn_count = count;
rnode->rn_clev = clevel;
@@ -295,39 +281,30 @@ vm_radix_node_zone_dtor(void *mem, int size __unused, void *arg __unused)
}
#endif
+#ifndef UMA_MD_SMALL_ALLOC
/*
- * Radix node zone initializer.
- */
-static int
-vm_radix_node_zone_init(void *mem, int size __unused, int flags __unused)
-{
- struct vm_radix_node *rnode;
-
- rnode = mem;
- memset(rnode->rn_child, 0, sizeof(rnode->rn_child));
- return (0);
-}
-
-/*
- * Pre-allocate intermediate nodes from the UMA slab zone.
+ * Reserve the KVA necessary to satisfy the node allocation.
+ * This is mandatory on architectures not supporting direct
+ * mapping, which would otherwise have to carve into the kernel maps
+ * for every node allocation, resulting in deadlocks for consumers
+ * already working with the kernel maps.
*/
static void
-vm_radix_prealloc(void *arg __unused)
+vm_radix_reserve_kva(void *arg __unused)
{
- int nodes;
/*
* Calculate the number of reserved nodes, discounting the pages that
* are needed to store them.
*/
- nodes = ((vm_paddr_t)cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE +
- sizeof(struct vm_radix_node));
- if (!uma_zone_reserve_kva(vm_radix_node_zone, nodes))
- panic("%s: unable to create new zone", __func__);
- uma_prealloc(vm_radix_node_zone, nodes);
+ if (!uma_zone_reserve_kva(vm_radix_node_zone,
+ ((vm_paddr_t)cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE +
+ sizeof(struct vm_radix_node))))
+ panic("%s: unable to reserve KVA", __func__);
}
-SYSINIT(vm_radix_prealloc, SI_SUB_KMEM, SI_ORDER_SECOND, vm_radix_prealloc,
- NULL);
+SYSINIT(vm_radix_reserve_kva, SI_SUB_KMEM, SI_ORDER_SECOND,
+ vm_radix_reserve_kva, NULL);
+#endif
/*
* Initialize the UMA slab zone.
@@ -345,15 +322,14 @@ vm_radix_init(void)
#else
NULL,
#endif
- vm_radix_node_zone_init, NULL, VM_RADIX_PAD, UMA_ZONE_VM |
- UMA_ZONE_NOFREE);
+ NULL, NULL, VM_RADIX_PAD, UMA_ZONE_VM);
}
/*
* Inserts the key-value pair into the trie.
* Panics if the key already exists.
*/
-void
+int
vm_radix_insert(struct vm_radix *rtree, vm_page_t page)
{
vm_pindex_t index, newind;
@@ -365,6 +341,8 @@ vm_radix_insert(struct vm_radix *rtree, vm_page_t page)
index = page->pindex;
+restart:
+
/*
* The owner of record for root is not really important because it
* will never be used.
@@ -372,7 +350,7 @@ vm_radix_insert(struct vm_radix *rtree, vm_page_t page)
rnode = vm_radix_getroot(rtree);
if (rnode == NULL) {
rtree->rt_root = (uintptr_t)page | VM_RADIX_ISLEAF;
- return;
+ return (0);
}
parentp = (void **)&rtree->rt_root;
for (;;) {
@@ -382,19 +360,43 @@ vm_radix_insert(struct vm_radix *rtree, vm_page_t page)
panic("%s: key %jx is already present",
__func__, (uintmax_t)index);
clev = vm_radix_keydiff(m->pindex, index);
+
+ /*
+ * During node allocation, the trie that is being
+ * walked can be modified by recursing radix trie
+ * operations.
+ * If this is the case, the recursing functions signal
+ * the situation and the insert operation must start
+ * from scratch again.
+ * The freed radix node will then very likely sit in
+ * the UMA caches, preventing the same situation from
+ * recurring.
+ */
+ rtree->rt_flags |= RT_INSERT_INPROG;
tmp = vm_radix_node_get(vm_radix_trimkey(index,
clev + 1), 2, clev);
+ rtree->rt_flags &= ~RT_INSERT_INPROG;
+ if (tmp == NULL) {
+ rtree->rt_flags &= ~RT_TRIE_MODIFIED;
+ return (ENOMEM);
+ }
+ if ((rtree->rt_flags & RT_TRIE_MODIFIED) != 0) {
+ rtree->rt_flags &= ~RT_TRIE_MODIFIED;
+ tmp->rn_count = 0;
+ vm_radix_node_put(tmp);
+ goto restart;
+ }
*parentp = tmp;
vm_radix_addpage(tmp, index, clev, page);
vm_radix_addpage(tmp, m->pindex, clev, m);
- return;
+ return (0);
} else if (vm_radix_keybarr(rnode, index))
break;
slot = vm_radix_slot(index, rnode->rn_clev);
if (rnode->rn_child[slot] == NULL) {
rnode->rn_count++;
vm_radix_addpage(rnode, index, rnode->rn_clev, page);
- return;
+ return (0);
}
parentp = &rnode->rn_child[slot];
rnode = rnode->rn_child[slot];
@@ -407,12 +409,26 @@ vm_radix_insert(struct vm_radix *rtree, vm_page_t page)
*/
newind = rnode->rn_owner;
clev = vm_radix_keydiff(newind, index);
- tmp = vm_radix_node_get(vm_radix_trimkey(index, clev + 1), 2,
- clev);
+
+ /* See the comments above. */
+ rtree->rt_flags |= RT_INSERT_INPROG;
+ tmp = vm_radix_node_get(vm_radix_trimkey(index, clev + 1), 2, clev);
+ rtree->rt_flags &= ~RT_INSERT_INPROG;
+ if (tmp == NULL) {
+ rtree->rt_flags &= ~RT_TRIE_MODIFIED;
+ return (ENOMEM);
+ }
+ if ((rtree->rt_flags & RT_TRIE_MODIFIED) != 0) {
+ rtree->rt_flags &= ~RT_TRIE_MODIFIED;
+ tmp->rn_count = 0;
+ vm_radix_node_put(tmp);
+ goto restart;
+ }
*parentp = tmp;
vm_radix_addpage(tmp, index, clev, page);
slot = vm_radix_slot(newind, clev);
tmp->rn_child[slot] = rnode;
+ return (0);
}
/*
@@ -677,6 +693,20 @@ vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index)
vm_page_t m;
int i, slot;
+ /*
+ * Detect whether a page is about to be removed from a trie that is
+ * already undergoing another trie operation.
+ * Right now this is only possible for vm_radix_remove() recursing
+ * within vm_radix_insert().
+ * If this is the case, the caller must be notified about the
+ * situation; it will also take care of updating the RT_TRIE_MODIFIED
+ * flag accordingly.
+ * The RT_TRIE_MODIFIED bit is set here because the remove operation
+ * will always succeed.
+ */
+ if ((rtree->rt_flags & RT_INSERT_INPROG) != 0)
+ rtree->rt_flags |= RT_TRIE_MODIFIED;
+
rnode = vm_radix_getroot(rtree);
if (vm_radix_isleaf(rnode)) {
m = vm_radix_topage(rnode);
@@ -731,6 +761,9 @@ vm_radix_reclaim_allnodes(struct vm_radix *rtree)
{
struct vm_radix_node *root;
+ KASSERT((rtree->rt_flags & RT_INSERT_INPROG) == 0,
+ ("vm_radix_reclaim_allnodes: unexpected trie recursion"));
+
root = vm_radix_getroot(rtree);
if (root == NULL)
return;
@@ -739,6 +772,51 @@ vm_radix_reclaim_allnodes(struct vm_radix *rtree)
vm_radix_reclaim_allnodes_int(root);
}
+/*
+ * Replace an existing page in the trie with another one.
+ * Panics if the page to be replaced is not present or if the new page
+ * has an invalid key.
+ */
+vm_page_t
+vm_radix_replace(struct vm_radix *rtree, vm_page_t newpage, vm_pindex_t index)
+{
+ struct vm_radix_node *rnode;
+ vm_page_t m;
+ int slot;
+
+ KASSERT(newpage->pindex == index, ("%s: newpage index invalid",
+ __func__));
+
+ rnode = vm_radix_getroot(rtree);
+ if (rnode == NULL)
+ panic("%s: replacing page on an empty trie", __func__);
+ if (vm_radix_isleaf(rnode)) {
+ m = vm_radix_topage(rnode);
+ if (m->pindex != index)
+ panic("%s: original replacing root key not found",
+ __func__);
+ rtree->rt_root = (uintptr_t)newpage | VM_RADIX_ISLEAF;
+ return (m);
+ }
+ for (;;) {
+ slot = vm_radix_slot(index, rnode->rn_clev);
+ if (vm_radix_isleaf(rnode->rn_child[slot])) {
+ m = vm_radix_topage(rnode->rn_child[slot]);
+ if (m->pindex == index) {
+ rnode->rn_child[slot] =
+ (void *)((uintptr_t)newpage |
+ VM_RADIX_ISLEAF);
+ return (m);
+ } else
+ break;
+ } else if (rnode->rn_child[slot] == NULL ||
+ vm_radix_keybarr(rnode->rn_child[slot], index))
+ break;
+ rnode = rnode->rn_child[slot];
+ }
+ panic("%s: original replacing page not found", __func__);
+}
+
#ifdef DDB
/*
* Show details about the given radix node.
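Distilled from the two vm_radix_insert() hunks and the vm_radix_remove()
hunk above, the recursion-detection protocol reduces to the following
sketch (not literal patch code; arguments elided):

	/* In vm_radix_insert(), around each node allocation: */
	rtree->rt_flags |= RT_INSERT_INPROG;	/* allocation may recurse */
	tmp = vm_radix_node_get(...);		/* may reenter this trie */
	rtree->rt_flags &= ~RT_INSERT_INPROG;
	if (tmp == NULL) {
		rtree->rt_flags &= ~RT_TRIE_MODIFIED;
		return (ENOMEM);
	}
	if ((rtree->rt_flags & RT_TRIE_MODIFIED) != 0) {
		/* A recursing remove changed the trie underneath us. */
		rtree->rt_flags &= ~RT_TRIE_MODIFIED;
		tmp->rn_count = 0;
		vm_radix_node_put(tmp);
		goto restart;
	}

	/* In vm_radix_remove(), on entry: */
	if ((rtree->rt_flags & RT_INSERT_INPROG) != 0)
		rtree->rt_flags |= RT_TRIE_MODIFIED;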
diff --git a/sys/vm/vm_radix.h b/sys/vm/vm_radix.h
index 3414253..73c9cc1 100644
--- a/sys/vm/vm_radix.h
+++ b/sys/vm/vm_radix.h
@@ -36,12 +36,14 @@
#ifdef _KERNEL
void vm_radix_init(void);
-void vm_radix_insert(struct vm_radix *rtree, vm_page_t page);
+int vm_radix_insert(struct vm_radix *rtree, vm_page_t page);
vm_page_t vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index);
vm_page_t vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index);
vm_page_t vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index);
void vm_radix_reclaim_allnodes(struct vm_radix *rtree);
void vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index);
+vm_page_t vm_radix_replace(struct vm_radix *rtree, vm_page_t newpage,
+ vm_pindex_t index);
#endif /* _KERNEL */
#endif /* !_VM_RADIX_H_ */