author    jeff <jeff@FreeBSD.org>    2011-10-30 11:11:04 +0000
committer jeff <jeff@FreeBSD.org>    2011-10-30 11:11:04 +0000
commit    9e2e6a2980c96185bbe81d081d64d0a32bd42961 (patch)
tree      e1a2a2f82a025980770697c9b5d082a9e598706c
parent    b29030ce37e1b81d952e6533c3a29e7176e3ec9e (diff)
- Support two types of nodes, red and black, within the same radix tree.
  Black nodes support standard active pages and red nodes support cached
  pages.  Red nodes may be removed without the object lock but will not
  collapse unused tree nodes.  Red nodes may not be directly inserted,
  instead a new function is supplied to convert between black and red.
- Handle cached pages and active pages in the same loop in
  vm_object_split, vm_object_backing_scan, and vm_object_terminate.
- Retire the splay page handling as the ifdefs are too difficult to
  maintain.
- Slightly optimize the vm_radix_lookupn() function.
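For context, a minimal userspace sketch of the leaf-pointer color tagging this
commit describes: each leaf pointer in the radix tree carries VM_RADIX_BLACK or
VM_RADIX_RED in its low bits, and lookups mask against the requested color.
The macro values mirror sys/vm/vm_radix.h from the patch below, but leaf_tag()
and leaf_match() are illustrative stand-ins for the in-tree helpers (the real
filter, vm_radix_match(), appears in the diff); this is not the kernel code.

#include <stdint.h>
#include <stdio.h>

#define VM_RADIX_FLAGS  0x3     /* Flag bits stored in leaf pointers. */
#define VM_RADIX_BLACK  0x1     /* Black (resident) leaf. */
#define VM_RADIX_RED    0x2     /* Red (cached) leaf. */
#define VM_RADIX_ANY    (VM_RADIX_RED | VM_RADIX_BLACK)

/* Tag a leaf pointer with a color before storing it in a tree slot. */
static inline void *
leaf_tag(void *page, int color)
{
	return ((void *)((uintptr_t)page | (uintptr_t)color));
}

/* Return the untagged pointer if its color matches, else NULL. */
static inline void *
leaf_match(void *child, int color)
{
	uintptr_t c = (uintptr_t)child;

	if ((c & color) == 0)
		return (NULL);
	return ((void *)(c & ~(uintptr_t)VM_RADIX_FLAGS));
}

int
main(void)
{
	static int page;	/* stand-in for a vm_page; 4-byte aligned */
	void *slot = leaf_tag(&page, VM_RADIX_RED);

	printf("black lookup: %p\n", leaf_match(slot, VM_RADIX_BLACK)); /* NULL */
	printf("any lookup:   %p\n", leaf_match(slot, VM_RADIX_ANY));   /* &page */
	return (0);
}

A black-only lookup (the normal object-locked path) skips red leaves, while
VM_RADIX_ANY visits both, which is how the loops in the diff below can handle
resident and cached pages in a single pass.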
-rw-r--r--  sys/vm/vm_object.c    368
-rw-r--r--  sys/vm/vm_object.h      3
-rw-r--r--  sys/vm/vm_page.c      461
-rw-r--r--  sys/vm/vm_page.h        7
-rw-r--r--  sys/vm/vm_radix.c     190
-rw-r--r--  sys/vm/vm_radix.h      34
-rw-r--r--  sys/vm/vm_reserv.c     46
-rw-r--r--  sys/vm/vnode_pager.c   47
8 files changed, 475 insertions, 681 deletions
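The new per-object loops in vm_object_terminate(), vm_object_split(),
vm_object_backing_scan(), and vm_object_page_remove() share one iteration
shape: fetch up to VM_RADIX_STACK leaves per vm_radix_lookupn() call and stop
when a short batch signals the end of the range.  A small self-contained
sketch of that control flow follows; lookupn() here is a hypothetical stub in
place of the real tree walk and exists only to make the loop shape concrete.

#include <stdint.h>
#include <stdio.h>

#define VM_RADIX_STACK	8	/* Leaves fetched per call. */

/*
 * Stand-in for vm_radix_lookupn(): report up to 'cnt' fake leaves with
 * indices in [start, total) and advance *next past the last one returned.
 */
static int
lookupn(uintptr_t total, uintptr_t start, void **out, int cnt, uintptr_t *next)
{
	int n = 0;

	while (n < cnt && start < total) {
		out[n++] = (void *)(start + 1);	/* non-NULL fake leaf */
		start++;
	}
	*next = start;
	return (n);
}

int
main(void)
{
	void *pa[VM_RADIX_STACK];
	uintptr_t start = 0;
	int i, n;

	/* Same control flow as the loops added in sys/vm/vm_object.c. */
	while ((n = lookupn(20, start, pa, VM_RADIX_STACK, &start)) != 0) {
		for (i = 0; i < n; i++)
			printf("visit leaf %p\n", pa[i]);
		if (n < VM_RADIX_STACK)	/* short batch: range exhausted */
			break;
	}
	return (0);
}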
diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c
index ed39437..3ae7f51 100644
--- a/sys/vm/vm_object.c
+++ b/sys/vm/vm_object.c
@@ -162,6 +162,9 @@ vm_object_zdtor(void *mem, int size, void *arg)
vm_object_t object;
object = (vm_object_t)mem;
+ KASSERT(object->resident_page_count == 0,
+ ("object %p resident_page_count = %d",
+ object, object->resident_page_count));
KASSERT(TAILQ_EMPTY(&object->memq),
("object %p has resident pages",
object));
@@ -170,15 +173,12 @@ vm_object_zdtor(void *mem, int size, void *arg)
("object %p has reservations",
object));
#endif
- KASSERT(object->cache == NULL,
+ KASSERT(object->cached_page_count == 0,
("object %p has cached pages",
object));
KASSERT(object->paging_in_progress == 0,
("object %p paging_in_progress = %d",
object, object->paging_in_progress));
- KASSERT(object->resident_page_count == 0,
- ("object %p resident_page_count = %d",
- object, object->resident_page_count));
KASSERT(object->shadow_count == 0,
("object %p shadow_count = %d",
object, object->shadow_count));
@@ -208,11 +208,7 @@ _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
TAILQ_INIT(&object->memq);
LIST_INIT(&object->shadow_head);
-#ifdef VM_RADIX
object->rtree.rt_root = 0;
-#else
- object->root = NULL;
-#endif
object->type = type;
object->size = size;
object->generation = 1;
@@ -230,7 +226,6 @@ _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
#if VM_NRESERVLEVEL > 0
LIST_INIT(&object->rvq);
#endif
- object->cache = NULL;
mtx_lock(&vm_object_list_mtx);
TAILQ_INSERT_TAIL(&vm_object_list, object, object_list);
@@ -307,7 +302,7 @@ vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr)
case OBJT_SG:
case OBJT_SWAP:
case OBJT_VNODE:
- if (!TAILQ_EMPTY(&object->memq))
+ if (object->resident_page_count == 0)
return (KERN_FAILURE);
break;
case OBJT_DEAD:
@@ -679,7 +674,10 @@ vm_object_destroy(vm_object_t object)
void
vm_object_terminate(vm_object_t object)
{
- vm_page_t p, p_next;
+ vm_page_t pa[VM_RADIX_STACK];
+ vm_page_t p;
+ vm_pindex_t start;
+ int n, i;
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
@@ -724,23 +722,50 @@ vm_object_terminate(vm_object_t object)
* from the object. Rather than incrementally removing each page from
* the object, the page and object are reset to any empty state.
*/
- TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) {
- KASSERT(!p->busy && (p->oflags & VPO_BUSY) == 0,
- ("vm_object_terminate: freeing busy page %p", p));
- vm_page_lock(p);
- /*
- * Optimize the page's removal from the object by resetting
- * its "object" field. Specifically, if the page is not
- * wired, then the effect of this assignment is that
- * vm_page_free()'s call to vm_page_remove() will return
- * immediately without modifying the page or the object.
- */
- p->object = NULL;
- if (p->wire_count == 0) {
- vm_page_free(p);
- PCPU_INC(cnt.v_pfree);
+ start = 0;
+ while ((n = vm_radix_lookupn(&object->rtree, start, 0, VM_RADIX_ANY,
+ (void **)pa, VM_RADIX_STACK, &start)) != 0) {
+ for (i = 0; i < n; i++) {
+ p = pa[i];
+ /*
+ * Another thread may allocate this cached page from
+ * the queue before we acquire the page queue free
+ * mtx.
+ */
+ if (p->flags & PG_CACHED) {
+ mtx_lock(&vm_page_queue_free_mtx);
+ if (p->object == object) {
+ p->object = NULL;
+ p->valid = 0;
+ /* Clear PG_CACHED and set PG_FREE. */
+ p->flags ^= PG_CACHED | PG_FREE;
+ cnt.v_cache_count--;
+ cnt.v_free_count++;
+ }
+ mtx_unlock(&vm_page_queue_free_mtx);
+ continue;
+ } else if (p->object != object)
+ continue;
+ KASSERT(!p->busy && (p->oflags & VPO_BUSY) == 0,
+ ("vm_object_terminate: freeing busy page %p", p));
+ vm_page_lock(p);
+ /*
+ * Optimize the page's removal from the object by
+ * resetting its "object" field. Specifically, if
+ * the page is not wired, then the effect of this
+ * assignment is that vm_page_free()'s call to
+ * vm_page_remove() will return immediately without
+ * modifying the page or the object.
+ */
+ p->object = NULL;
+ if (p->wire_count == 0) {
+ vm_page_free(p);
+ PCPU_INC(cnt.v_pfree);
+ }
+ vm_page_unlock(p);
}
- vm_page_unlock(p);
+ if (n < VM_RADIX_STACK)
+ break;
}
/*
* If the object contained any pages, then reset it to an empty state.
@@ -748,19 +773,20 @@ vm_object_terminate(vm_object_t object)
* modified by the preceding loop.
*/
if (object->resident_page_count != 0) {
- object->root = NULL;
TAILQ_INIT(&object->memq);
object->resident_page_count = 0;
if (object->type == OBJT_VNODE)
vdrop(object->handle);
}
+ if (object->cached_page_count != 0 && object->type == OBJT_VNODE) {
+ object->cached_page_count = 0;
+ vdrop(object->handle);
+ }
#if VM_NRESERVLEVEL > 0
if (__predict_false(!LIST_EMPTY(&object->rvq)))
vm_reserv_break_all(object);
#endif
- if (__predict_false(object->cache != NULL))
- vm_page_cache_free(object, 0, 0);
/*
* Let the pager know object is dead.
@@ -1235,10 +1261,12 @@ vm_object_shadow(
void
vm_object_split(vm_map_entry_t entry)
{
- vm_page_t m, m_next;
+ vm_page_t ma[VM_RADIX_STACK];
+ vm_page_t m;
vm_object_t orig_object, new_object, source;
- vm_pindex_t idx, offidxstart;
+ vm_pindex_t idx, offidxstart, start;
vm_size_t size;
+ int i, n;
orig_object = entry->object.vm_object;
if (orig_object->type != OBJT_DEFAULT && orig_object->type != OBJT_SWAP)
@@ -1291,31 +1319,50 @@ vm_object_split(vm_map_entry_t entry)
("orig_object->charge < 0"));
orig_object->charge -= ptoa(size);
}
+ start = offidxstart;
retry:
- m = vm_page_find_least(orig_object, offidxstart);
- for (; m != NULL && (idx = m->pindex - offidxstart) < size;
- m = m_next) {
- m_next = TAILQ_NEXT(m, listq);
-
- /*
- * We must wait for pending I/O to complete before we can
- * rename the page.
- *
- * We do not have to VM_PROT_NONE the page as mappings should
- * not be changed by this operation.
- */
- if ((m->oflags & VPO_BUSY) || m->busy) {
- VM_OBJECT_UNLOCK(new_object);
- m->oflags |= VPO_WANTED;
- msleep(m, VM_OBJECT_MTX(orig_object), PVM, "spltwt", 0);
- VM_OBJECT_LOCK(new_object);
- goto retry;
+ while ((n = vm_radix_lookupn(&orig_object->rtree, start,
+ offidxstart + size, VM_RADIX_ANY, (void **)ma, VM_RADIX_STACK,
+ &start)) != 0) {
+ for (i = 0; i < n; i++) {
+ m = ma[i];
+ idx = m->pindex - offidxstart;
+ if (m->flags & PG_CACHED) {
+ mtx_lock(&vm_page_queue_free_mtx);
+ if (m->object == orig_object)
+ vm_page_cache_rename(m, new_object,
+ idx);
+ mtx_unlock(&vm_page_queue_free_mtx);
+ continue;
+ } else if (m->object != orig_object)
+ continue;
+ /*
+ * We must wait for pending I/O to complete before
+ * we can rename the page.
+ *
+ * We do not have to VM_PROT_NONE the page as mappings
+ * should not be changed by this operation.
+ */
+ if ((m->oflags & VPO_BUSY) || m->busy) {
+ start = m->pindex;
+ VM_OBJECT_UNLOCK(new_object);
+ m->oflags |= VPO_WANTED;
+ msleep(m, VM_OBJECT_MTX(orig_object), PVM,
+ "spltwt", 0);
+ VM_OBJECT_LOCK(new_object);
+ goto retry;
+ }
+ vm_page_lock(m);
+ vm_page_rename(m, new_object, idx);
+ vm_page_unlock(m);
+ /*
+ * page automatically made dirty by rename and
+ * cache handled
+ */
+ vm_page_busy(m);
}
- vm_page_lock(m);
- vm_page_rename(m, new_object, idx);
- vm_page_unlock(m);
- /* page automatically made dirty by rename and cache handled */
- vm_page_busy(m);
+ if (n < VM_RADIX_STACK)
+ break;
}
if (orig_object->type == OBJT_SWAP) {
/*
@@ -1323,13 +1370,6 @@ retry:
* and new_object's locks are released and reacquired.
*/
swap_pager_copy(orig_object, new_object, offidxstart, 0);
-
- /*
- * Transfer any cached pages from orig_object to new_object.
- */
- if (__predict_false(orig_object->cache != NULL))
- vm_page_cache_transfer(orig_object, offidxstart,
- new_object);
}
VM_OBJECT_UNLOCK(orig_object);
TAILQ_FOREACH(m, &new_object->memq, listq)
@@ -1348,10 +1388,13 @@ retry:
static int
vm_object_backing_scan(vm_object_t object, int op)
{
- int r = 1;
+ vm_page_t pa[VM_RADIX_STACK];
vm_page_t p;
vm_object_t backing_object;
- vm_pindex_t backing_offset_index;
+ vm_pindex_t backing_offset_index, new_pindex;
+ vm_pindex_t start;
+ int color, i, n;
+ int r = 1;
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
VM_OBJECT_LOCK_ASSERT(object->backing_object, MA_OWNED);
@@ -1379,15 +1422,39 @@ vm_object_backing_scan(vm_object_t object, int op)
if (op & OBSC_COLLAPSE_WAIT) {
vm_object_set_flag(backing_object, OBJ_DEAD);
}
-
+ color = VM_RADIX_BLACK;
+ if (op & OBSC_COLLAPSE_WAIT)
+ color |= VM_RADIX_RED;
/*
* Our scan
*/
- p = TAILQ_FIRST(&backing_object->memq);
- while (p) {
- vm_page_t next = TAILQ_NEXT(p, listq);
- vm_pindex_t new_pindex = p->pindex - backing_offset_index;
+restart:
+ start = 0;
+ i = n = VM_RADIX_STACK;
+ for (;;) {
+ if (i == n) {
+ if (n < VM_RADIX_STACK)
+ break;
+ if ((n = vm_radix_lookupn(&backing_object->rtree,
+ start, 0, color, (void **)pa, VM_RADIX_STACK,
+ &start)) == 0)
+ break;
+ i = 0;
+ }
+ p = pa[i++];
+ /*
+ * Free cached pages. XXX Why? Emulating old behavior here.
+ */
+ if (p->flags & PG_CACHED) {
+ mtx_lock(&vm_page_queue_free_mtx);
+ if (p->object == backing_object)
+ vm_page_cache_free(p);
+ mtx_unlock(&vm_page_queue_free_mtx);
+ continue;
+ } else if (p->object != backing_object)
+ continue;
+ new_pindex = p->pindex - backing_offset_index;
if (op & OBSC_TEST_ALL_SHADOWED) {
vm_page_t pp;
@@ -1399,13 +1466,9 @@ vm_object_backing_scan(vm_object_t object, int op)
* note that we do not busy the backing object's
* page.
*/
- if (
- p->pindex < backing_offset_index ||
- new_pindex >= object->size
- ) {
- p = next;
+ if (p->pindex < backing_offset_index ||
+ new_pindex >= object->size)
continue;
- }
/*
* See if the parent has the page or if the parent's
@@ -1434,12 +1497,9 @@ vm_object_backing_scan(vm_object_t object, int op)
vm_page_t pp;
if (op & OBSC_COLLAPSE_NOWAIT) {
- if ((p->oflags & VPO_BUSY) ||
- !p->valid ||
- p->busy) {
- p = next;
+ if ((p->oflags & VPO_BUSY) || !p->valid ||
+ p->busy)
continue;
- }
} else if (op & OBSC_COLLAPSE_WAIT) {
if ((p->oflags & VPO_BUSY) || p->busy) {
VM_OBJECT_UNLOCK(object);
@@ -1455,8 +1515,7 @@ vm_object_backing_scan(vm_object_t object, int op)
* should not have changed so we
* just restart our scan.
*/
- p = TAILQ_FIRST(&backing_object->memq);
- continue;
+ goto restart;
}
}
@@ -1492,7 +1551,6 @@ vm_object_backing_scan(vm_object_t object, int op)
else
vm_page_remove(p);
vm_page_unlock(p);
- p = next;
continue;
}
@@ -1512,7 +1570,6 @@ vm_object_backing_scan(vm_object_t object, int op)
* page before we can (re)lock the parent.
* Hence we can get here.
*/
- p = next;
continue;
}
if (
@@ -1534,7 +1591,6 @@ vm_object_backing_scan(vm_object_t object, int op)
else
vm_page_remove(p);
vm_page_unlock(p);
- p = next;
continue;
}
@@ -1558,7 +1614,6 @@ vm_object_backing_scan(vm_object_t object, int op)
vm_page_unlock(p);
/* page automatically made dirty by rename */
}
- p = next;
}
return (r);
}
@@ -1669,12 +1724,6 @@ vm_object_collapse(vm_object_t object)
backing_object,
object,
OFF_TO_IDX(object->backing_object_offset), TRUE);
-
- /*
- * Free any cached pages from backing_object.
- */
- if (__predict_false(backing_object->cache != NULL))
- vm_page_cache_free(backing_object, 0, 0);
}
/*
* Object now shadows whatever backing_object did.
@@ -1795,75 +1844,101 @@ void
vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
int options)
{
- vm_page_t p, next;
+ struct vnode *vp;
+ vm_page_t pa[VM_RADIX_STACK];
+ vm_page_t p;
+ int i, n;
int wirings;
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
KASSERT((object->type != OBJT_DEVICE && object->type != OBJT_PHYS) ||
(options & (OBJPR_CLEANONLY | OBJPR_NOTMAPPED)) == OBJPR_NOTMAPPED,
("vm_object_page_remove: illegal options for object %p", object));
- if (object->resident_page_count == 0)
- goto skipmemq;
+ if (object->resident_page_count == 0 && object->cached_page_count == 0)
+ return;
+ vp = NULL;
vm_object_pip_add(object, 1);
-again:
- p = vm_page_find_least(object, start);
-
- /*
- * Here, the variable "p" is either (1) the page with the least pindex
- * greater than or equal to the parameter "start" or (2) NULL.
- */
- for (; p != NULL && (p->pindex < end || end == 0); p = next) {
- next = TAILQ_NEXT(p, listq);
-
- /*
- * If the page is wired for any reason besides the existence
- * of managed, wired mappings, then it cannot be freed. For
- * example, fictitious pages, which represent device memory,
- * are inherently wired and cannot be freed. They can,
- * however, be invalidated if the option OBJPR_CLEANONLY is
- * not specified.
- */
- vm_page_lock(p);
- if ((wirings = p->wire_count) != 0 &&
- (wirings = pmap_page_wired_mappings(p)) != p->wire_count) {
+restart:
+ while ((n = vm_radix_lookupn(&object->rtree, start, end, VM_RADIX_ANY,
+ (void **)pa, VM_RADIX_STACK, &start)) != 0) {
+ for (i = 0; i < n; i++) {
+ p = pa[i];
+ /*
+ * Another thread may allocate this cached page from
+ * the queue before we acquire the page queue free
+ * mtx.
+ */
+ if (p->flags & PG_CACHED) {
+ mtx_lock(&vm_page_queue_free_mtx);
+ if (p->object == object) {
+ vm_page_cache_free(p);
+ if (object->type == OBJT_VNODE &&
+ object->cached_page_count == 0)
+ vp = object->handle;
+ }
+ mtx_unlock(&vm_page_queue_free_mtx);
+ continue;
+ } else if (p->object != object)
+ continue;
+ /*
+ * If the page is wired for any reason besides
+ * the existence of managed, wired mappings, then
+ * it cannot be freed. For example, fictitious
+ * pages, which represent device memory, are
+ * inherently wired and cannot be freed. They can,
+ * however, be invalidated if the option
+ * OBJPR_CLEANONLY is not specified.
+ */
+ vm_page_lock(p);
+ if ((wirings = p->wire_count) != 0 &&
+ (wirings = pmap_page_wired_mappings(p)) !=
+ p->wire_count) {
+ if ((options & OBJPR_NOTMAPPED) == 0) {
+ pmap_remove_all(p);
+ /*
+ * Account for removal of wired
+ * mappings.
+ */
+ if (wirings != 0)
+ p->wire_count -= wirings;
+ }
+ if ((options & OBJPR_CLEANONLY) == 0) {
+ p->valid = 0;
+ vm_page_undirty(p);
+ }
+ vm_page_unlock(p);
+ continue;
+ }
+ if (vm_page_sleep_if_busy(p, TRUE, "vmopar")) {
+ start = 0;
+ goto restart;
+ }
+ KASSERT((p->flags & PG_FICTITIOUS) == 0,
+ ("vm_object_page_remove: page %p is fictitious",
+ p));
+ if ((options & OBJPR_CLEANONLY) != 0 && p->valid != 0) {
+ if ((options & OBJPR_NOTMAPPED) == 0)
+ pmap_remove_write(p);
+ if (p->dirty) {
+ vm_page_unlock(p);
+ continue;
+ }
+ }
if ((options & OBJPR_NOTMAPPED) == 0) {
pmap_remove_all(p);
/* Account for removal of wired mappings. */
if (wirings != 0)
p->wire_count -= wirings;
}
- if ((options & OBJPR_CLEANONLY) == 0) {
- p->valid = 0;
- vm_page_undirty(p);
- }
+ vm_page_free(p);
vm_page_unlock(p);
- continue;
}
- if (vm_page_sleep_if_busy(p, TRUE, "vmopar"))
- goto again;
- KASSERT((p->flags & PG_FICTITIOUS) == 0,
- ("vm_object_page_remove: page %p is fictitious", p));
- if ((options & OBJPR_CLEANONLY) != 0 && p->valid != 0) {
- if ((options & OBJPR_NOTMAPPED) == 0)
- pmap_remove_write(p);
- if (p->dirty) {
- vm_page_unlock(p);
- continue;
- }
- }
- if ((options & OBJPR_NOTMAPPED) == 0) {
- pmap_remove_all(p);
- /* Account for removal of wired mappings. */
- if (wirings != 0)
- p->wire_count -= wirings;
- }
- vm_page_free(p);
- vm_page_unlock(p);
+ if (n < VM_RADIX_STACK)
+ break;
}
vm_object_pip_wakeup(object);
-skipmemq:
- if (__predict_false(object->cache != NULL))
- vm_page_cache_free(object, start, end);
+ if (vp)
+ vdrop(vp);
}
/*
@@ -2188,8 +2263,9 @@ DB_SHOW_COMMAND(object, vm_object_print_static)
db_printf(",");
count++;
- db_printf("(off=0x%jx,page=0x%jx)",
- (uintmax_t)p->pindex, (uintmax_t)VM_PAGE_TO_PHYS(p));
+ db_printf("(off=0x%jx,page=0x%jx,obj=%p,flags=0x%X)",
+ (uintmax_t)p->pindex, (uintmax_t)VM_PAGE_TO_PHYS(p),
+ p->object, p->flags);
}
if (count != 0)
db_printf("\n");
diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h
index 25cb275..1f345e7 100644
--- a/sys/vm/vm_object.h
+++ b/sys/vm/vm_object.h
@@ -90,7 +90,6 @@ struct vm_object {
LIST_ENTRY(vm_object) shadow_list; /* chain of shadow objects */
TAILQ_HEAD(, vm_page) memq; /* list of resident pages */
struct vm_radix rtree; /* root of the resident page radix index tree */
- vm_page_t root; /* root of the resident page splay tree */
vm_pindex_t size; /* Object size */
int generation; /* generation ID */
int ref_count; /* How many refs?? */
@@ -101,11 +100,11 @@ struct vm_object {
u_short pg_color; /* (c) color of first page in obj */
u_short paging_in_progress; /* Paging (in or out) so don't collapse or destroy */
int resident_page_count; /* number of resident pages */
+ int cached_page_count; /* number of cached pages */
struct vm_object *backing_object; /* object that I'm a shadow of */
vm_ooffset_t backing_object_offset;/* Offset in backing object */
TAILQ_ENTRY(vm_object) pager_object_list; /* list of all objects of this pager type */
LIST_HEAD(, vm_reserv) rvq; /* list of reservations */
- vm_page_t cache; /* root of the cache page splay tree */
void *handle;
union {
/*
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index de1ffe1..5661db2 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -764,63 +764,6 @@ vm_page_dirty(vm_page_t m)
}
/*
- * vm_page_splay:
- *
- * Implements Sleator and Tarjan's top-down splay algorithm. Returns
- * the vm_page containing the given pindex. If, however, that
- * pindex is not found in the vm_object, returns a vm_page that is
- * adjacent to the pindex, coming before or after it.
- */
-vm_page_t
-vm_page_splay(vm_pindex_t pindex, vm_page_t root)
-{
- struct vm_page dummy;
- vm_page_t lefttreemax, righttreemin, y;
-
- if (root == NULL)
- return (root);
- lefttreemax = righttreemin = &dummy;
- for (;; root = y) {
- if (pindex < root->pindex) {
- if ((y = root->left) == NULL)
- break;
- if (pindex < y->pindex) {
- /* Rotate right. */
- root->left = y->right;
- y->right = root;
- root = y;
- if ((y = root->left) == NULL)
- break;
- }
- /* Link into the new root's right tree. */
- righttreemin->left = root;
- righttreemin = root;
- } else if (pindex > root->pindex) {
- if ((y = root->right) == NULL)
- break;
- if (pindex > y->pindex) {
- /* Rotate left. */
- root->right = y->left;
- y->left = root;
- root = y;
- if ((y = root->right) == NULL)
- break;
- }
- /* Link into the new root's left tree. */
- lefttreemax->right = root;
- lefttreemax = root;
- } else
- break;
- }
- /* Assemble the new root. */
- lefttreemax->right = root->left;
- righttreemin->left = root->right;
- root->left = dummy.right;
- root->right = dummy.left;
- return (root);
-}
-
-/*
* vm_page_insert: [ internal use only ]
*
* Inserts the given mem entry into the object and object list.
@@ -836,11 +779,7 @@ vm_page_splay(vm_pindex_t pindex, vm_page_t root)
void
vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
{
-#ifdef VM_RADIX
vm_page_t neighbor;
-#else
- vm_page_t root;
-#endif
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
if (m->object != NULL)
panic("vm_page_insert: page already inserted");
@@ -851,11 +790,11 @@ vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
m->object = object;
m->pindex = pindex;
-#ifdef VM_RADIX
if (object->resident_page_count == 0) {
TAILQ_INSERT_TAIL(&object->memq, m, listq);
} else {
- neighbor = vm_radix_lookup_ge(&object->rtree, pindex);
+ neighbor = vm_radix_lookup_ge(&object->rtree, pindex,
+ VM_RADIX_BLACK);
if (neighbor != NULL) {
KASSERT(pindex != neighbor->pindex,
("vm_page_insert: offset already allocated"));
@@ -865,33 +804,6 @@ vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
}
if (vm_radix_insert(&object->rtree, pindex, m) != 0)
panic("vm_page_insert: unable to insert the new page");
-#else
- /*
- * Now link into the object's ordered list of backed pages.
- */
- root = object->root;
- if (root == NULL) {
- m->left = NULL;
- m->right = NULL;
- TAILQ_INSERT_TAIL(&object->memq, m, listq);
- } else {
- root = vm_page_splay(pindex, root);
- if (pindex < root->pindex) {
- m->left = root->left;
- m->right = root;
- root->left = NULL;
- TAILQ_INSERT_BEFORE(root, m, listq);
- } else if (pindex == root->pindex)
- panic("vm_page_insert: offset already allocated");
- else {
- m->right = root->right;
- m->left = root;
- root->right = NULL;
- TAILQ_INSERT_AFTER(&object->memq, root, m, listq);
- }
- }
- object->root = m;
-#endif
/*
* show that the object has one more resident page.
@@ -927,9 +839,6 @@ void
vm_page_remove(vm_page_t m)
{
vm_object_t object;
-#ifndef VM_RADIX
- vm_page_t next, prev, root;
-#endif
if ((m->oflags & VPO_UNMANAGED) == 0)
vm_page_lock_assert(m, MA_OWNED);
@@ -941,50 +850,7 @@ vm_page_remove(vm_page_t m)
vm_page_flash(m);
}
-#ifdef VM_RADIX
- vm_radix_remove(&object->rtree, m->pindex);
-#else
- /*
- * Now remove from the object's list of backed pages.
- */
- if ((next = TAILQ_NEXT(m, listq)) != NULL && next->left == m) {
- /*
- * Since the page's successor in the list is also its parent
- * in the tree, its right subtree must be empty.
- */
- next->left = m->left;
- KASSERT(m->right == NULL,
- ("vm_page_remove: page %p has right child", m));
- } else if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL &&
- prev->right == m) {
- /*
- * Since the page's predecessor in the list is also its parent
- * in the tree, its left subtree must be empty.
- */
- KASSERT(m->left == NULL,
- ("vm_page_remove: page %p has left child", m));
- prev->right = m->right;
- } else {
- if (m != object->root)
- vm_page_splay(m->pindex, object->root);
- if (m->left == NULL)
- root = m->right;
- else if (m->right == NULL)
- root = m->left;
- else {
- /*
- * Move the page's successor to the root, because
- * pages are usually removed in ascending order.
- */
- if (m->right != next)
- vm_page_splay(m->pindex, m->right);
- next->left = m->left;
- root = next;
- }
- object->root = root;
- }
-#endif
-
+ vm_radix_remove(&object->rtree, m->pindex, VM_RADIX_BLACK);
TAILQ_REMOVE(&object->memq, m, listq);
/*
@@ -1013,20 +879,10 @@ vm_page_remove(vm_page_t m)
vm_page_t
vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
{
- vm_page_t m;
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
-#ifdef VM_RADIX
- m = vm_radix_lookup(&object->rtree, pindex);
-#else
- if ((m = object->root) != NULL && m->pindex != pindex) {
- m = vm_page_splay(pindex, m);
- if ((object->root = m)->pindex != pindex)
- m = NULL;
- }
-#endif
- return (m);
+ return vm_radix_lookup(&object->rtree, pindex, VM_RADIX_BLACK);
}
/*
@@ -1041,22 +897,12 @@ vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
vm_page_t
vm_page_find_least(vm_object_t object, vm_pindex_t pindex)
{
- vm_page_t m;
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
-#ifdef VM_RADIX
- if ((m = TAILQ_FIRST(&object->memq)) != NULL)
- m = vm_radix_lookup_ge(&object->rtree, pindex);
-#else
- if ((m = TAILQ_FIRST(&object->memq)) != NULL) {
- if (m->pindex < pindex) {
- m = vm_page_splay(pindex, object->root);
- if ((object->root = m)->pindex < pindex)
- m = TAILQ_NEXT(m, listq);
- }
- }
-#endif
- return (m);
+ if (object->resident_page_count)
+ return vm_radix_lookup_ge(&object->rtree, pindex,
+ VM_RADIX_BLACK);
+ return (NULL);
}
/*
@@ -1126,71 +972,6 @@ vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
}
/*
- * Convert all of the given object's cached pages that have a
- * pindex within the given range into free pages. If the value
- * zero is given for "end", then the range's upper bound is
- * infinity. If the given object is backed by a vnode and it
- * transitions from having one or more cached pages to none, the
- * vnode's hold count is reduced.
- */
-void
-vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
-{
- vm_page_t m, m_next;
- boolean_t empty;
-
- mtx_lock(&vm_page_queue_free_mtx);
- if (__predict_false(object->cache == NULL)) {
- mtx_unlock(&vm_page_queue_free_mtx);
- return;
- }
- m = object->cache = vm_page_splay(start, object->cache);
- if (m->pindex < start) {
- if (m->right == NULL)
- m = NULL;
- else {
- m_next = vm_page_splay(start, m->right);
- m_next->left = m;
- m->right = NULL;
- m = object->cache = m_next;
- }
- }
-
- /*
- * At this point, "m" is either (1) a reference to the page
- * with the least pindex that is greater than or equal to
- * "start" or (2) NULL.
- */
- for (; m != NULL && (m->pindex < end || end == 0); m = m_next) {
- /*
- * Find "m"'s successor and remove "m" from the
- * object's cache.
- */
- if (m->right == NULL) {
- object->cache = m->left;
- m_next = NULL;
- } else {
- m_next = vm_page_splay(start, m->right);
- m_next->left = m->left;
- object->cache = m_next;
- }
- /* Convert "m" to a free page. */
- m->object = NULL;
- m->valid = 0;
- /* Clear PG_CACHED and set PG_FREE. */
- m->flags ^= PG_CACHED | PG_FREE;
- KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE,
- ("vm_page_cache_free: page %p has inconsistent flags", m));
- cnt.v_cache_count--;
- cnt.v_free_count++;
- }
- empty = object->cache == NULL;
- mtx_unlock(&vm_page_queue_free_mtx);
- if (object->type == OBJT_VNODE && empty)
- vdrop(object->handle);
-}
-
-/*
* Returns the cached page that is associated with the given
* object and offset. If, however, none exists, returns NULL.
*
@@ -1199,15 +980,12 @@ vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
static inline vm_page_t
vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex)
{
- vm_page_t m;
+ VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
- if ((m = object->cache) != NULL && m->pindex != pindex) {
- m = vm_page_splay(pindex, m);
- if ((object->cache = m)->pindex != pindex)
- m = NULL;
- }
- return (m);
+ if (object->cached_page_count != 0)
+ return vm_radix_lookup(&object->rtree, pindex, VM_RADIX_RED);
+ return (NULL);
}
/*
@@ -1219,104 +997,77 @@ vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex)
void
vm_page_cache_remove(vm_page_t m)
{
- vm_object_t object;
- vm_page_t root;
mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
KASSERT((m->flags & PG_CACHED) != 0,
("vm_page_cache_remove: page %p is not cached", m));
- object = m->object;
- if (m != object->cache) {
- root = vm_page_splay(m->pindex, object->cache);
- KASSERT(root == m,
- ("vm_page_cache_remove: page %p is not cached in object %p",
- m, object));
- }
- if (m->left == NULL)
- root = m->right;
- else if (m->right == NULL)
- root = m->left;
- else {
- root = vm_page_splay(m->pindex, m->left);
- root->right = m->right;
- }
- object->cache = root;
+ vm_radix_remove(&m->object->rtree, m->pindex, VM_RADIX_RED);
+ m->object->cached_page_count--;
m->object = NULL;
cnt.v_cache_count--;
}
/*
- * Transfer all of the cached pages with offset greater than or
- * equal to 'offidxstart' from the original object's cache to the
- * new object's cache. However, any cached pages with offset
- * greater than or equal to the new object's size are kept in the
- * original object. Initially, the new object's cache must be
- * empty. Offset 'offidxstart' in the original object must
- * correspond to offset zero in the new object.
- *
- * The new object must be locked.
+ * Move a given cached page from an object's cached pages to
+ * the free list.
+ *
+ * The free page queue mtx and object lock must be locked.
*/
void
-vm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart,
- vm_object_t new_object)
+vm_page_cache_free(vm_page_t m)
{
- vm_page_t m, m_next;
+ vm_object_t object;
+
+ object = m->object;
+ VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+ KASSERT((m->flags & PG_CACHED) != 0,
+ ("vm_page_cache_free: page %p is not cached", m));
/*
- * Insertion into an object's collection of cached pages
- * requires the object to be locked. In contrast, removal does
- * not.
+ * Replicate vm_page_cache_remove with a version that can collapse
+ * internal nodes since the object lock is held.
*/
+ vm_radix_remove(&object->rtree, m->pindex, VM_RADIX_ANY);
+ object->cached_page_count--;
+ m->object = NULL;
+ m->valid = 0;
+ /* Clear PG_CACHED and set PG_FREE. */
+ m->flags ^= PG_CACHED | PG_FREE;
+ cnt.v_cache_count--;
+ cnt.v_free_count++;
+}
+
+/*
+ * Attempt to rename a cached page from one object to another. If
+ * it fails the cached page is freed.
+ */
+void
+vm_page_cache_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t idx)
+{
+ vm_object_t orig_object;
+
+ orig_object = m->object;
+ VM_OBJECT_LOCK_ASSERT(orig_object, MA_OWNED);
VM_OBJECT_LOCK_ASSERT(new_object, MA_OWNED);
- KASSERT(new_object->cache == NULL,
- ("vm_page_cache_transfer: object %p has cached pages",
- new_object));
- mtx_lock(&vm_page_queue_free_mtx);
- if ((m = orig_object->cache) != NULL) {
- /*
- * Transfer all of the pages with offset greater than or
- * equal to 'offidxstart' from the original object's
- * cache to the new object's cache.
- */
- m = vm_page_splay(offidxstart, m);
- if (m->pindex < offidxstart) {
- orig_object->cache = m;
- new_object->cache = m->right;
- m->right = NULL;
- } else {
- orig_object->cache = m->left;
- new_object->cache = m;
- m->left = NULL;
- }
- while ((m = new_object->cache) != NULL) {
- if ((m->pindex - offidxstart) >= new_object->size) {
- /*
- * Return all of the cached pages with
- * offset greater than or equal to the
- * new object's size to the original
- * object's cache.
- */
- new_object->cache = m->left;
- m->left = orig_object->cache;
- orig_object->cache = m;
- break;
- }
- m_next = vm_page_splay(m->pindex, m->right);
- /* Update the page's object and offset. */
- m->object = new_object;
- m->pindex -= offidxstart;
- if (m_next == NULL)
- break;
- m->right = NULL;
- m_next->left = m;
- new_object->cache = m_next;
- }
- KASSERT(new_object->cache == NULL ||
- new_object->type == OBJT_SWAP,
- ("vm_page_cache_transfer: object %p's type is incompatible"
- " with cached pages", new_object));
+ mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+ /*
+ * If the insert fails we simply free the cached page.
+ */
+ if (vm_radix_insert(&new_object->rtree, idx, m) != 0) {
+ vm_page_cache_free(m);
+ return;
}
- mtx_unlock(&vm_page_queue_free_mtx);
+ vm_radix_color(&new_object->rtree, idx, VM_RADIX_RED);
+ /*
+ * We use any color here though we know it's red so that tree
+ * compaction will still work.
+ */
+ vm_radix_remove(&orig_object->rtree, m->pindex, VM_RADIX_ANY);
+ m->object = new_object;
+ m->pindex = idx;
+ new_object->cached_page_count++;
+ orig_object->cached_page_count--;
}
/*
@@ -1447,7 +1198,8 @@ vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
m->valid = 0;
m_object = m->object;
vm_page_cache_remove(m);
- if (m_object->type == OBJT_VNODE && m_object->cache == NULL)
+ if (m_object->type == OBJT_VNODE &&
+ m_object->cached_page_count == 0)
vp = m_object->handle;
} else {
KASSERT(VM_PAGE_IS_FREE(m),
@@ -1547,7 +1299,7 @@ vm_page_alloc_init(vm_page_t m)
m_object = m->object;
vm_page_cache_remove(m);
if (m_object->type == OBJT_VNODE &&
- m_object->cache == NULL)
+ m_object->cached_page_count == 0)
drop = m_object->handle;
} else {
KASSERT(VM_PAGE_IS_FREE(m),
@@ -2086,10 +1838,7 @@ void
vm_page_cache(vm_page_t m)
{
vm_object_t object;
-#ifndef VM_RADIX
- vm_page_t next, prev;
-#endif
- vm_page_t root;
+ int old_cached;
vm_page_lock_assert(m, MA_OWNED);
object = m->object;
@@ -2120,50 +1869,7 @@ vm_page_cache(vm_page_t m)
*/
vm_pageq_remove(m);
-#ifdef VM_RADIX
- vm_radix_remove(&object->rtree, m->pindex);
-#else
- /*
- * Remove the page from the object's collection of resident
- * pages.
- */
- if ((next = TAILQ_NEXT(m, listq)) != NULL && next->left == m) {
- /*
- * Since the page's successor in the list is also its parent
- * in the tree, its right subtree must be empty.
- */
- next->left = m->left;
- KASSERT(m->right == NULL,
- ("vm_page_cache: page %p has right child", m));
- } else if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL &&
- prev->right == m) {
- /*
- * Since the page's predecessor in the list is also its parent
- * in the tree, its left subtree must be empty.
- */
- KASSERT(m->left == NULL,
- ("vm_page_cache: page %p has left child", m));
- prev->right = m->right;
- } else {
- if (m != object->root)
- vm_page_splay(m->pindex, object->root);
- if (m->left == NULL)
- root = m->right;
- else if (m->right == NULL)
- root = m->left;
- else {
- /*
- * Move the page's successor to the root, because
- * pages are usually removed in ascending order.
- */
- if (m->right != next)
- vm_page_splay(m->pindex, m->right);
- next->left = m->left;
- root = next;
- }
- object->root = root;
- }
-#endif
+ vm_radix_color(&object->rtree, m->pindex, VM_RADIX_RED);
TAILQ_REMOVE(&object->memq, m, listq);
object->resident_page_count--;
@@ -2180,26 +1886,9 @@ vm_page_cache(vm_page_t m)
m->flags &= ~PG_ZERO;
mtx_lock(&vm_page_queue_free_mtx);
m->flags |= PG_CACHED;
+ old_cached = object->cached_page_count;
+ object->cached_page_count++;
cnt.v_cache_count++;
- root = object->cache;
- if (root == NULL) {
- m->left = NULL;
- m->right = NULL;
- } else {
- root = vm_page_splay(m->pindex, root);
- if (m->pindex < root->pindex) {
- m->left = root->left;
- m->right = root;
- root->left = NULL;
- } else if (__predict_false(m->pindex == root->pindex))
- panic("vm_page_cache: offset already cached");
- else {
- m->right = root->right;
- m->left = root;
- root->right = NULL;
- }
- }
- object->cache = m;
#if VM_NRESERVLEVEL > 0
if (!vm_reserv_free_page(m)) {
#else
@@ -2217,9 +1906,9 @@ vm_page_cache(vm_page_t m)
* the object's only resident page.
*/
if (object->type == OBJT_VNODE) {
- if (root == NULL && object->resident_page_count != 0)
+ if (old_cached == 0 && object->resident_page_count != 0)
vhold(object->handle);
- else if (root != NULL && object->resident_page_count == 0)
+ else if (old_cached != 0 && object->resident_page_count == 0)
vdrop(object->handle);
}
}
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index 23637bb..260a27f 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -116,8 +116,6 @@ TAILQ_HEAD(pglist, vm_page);
struct vm_page {
TAILQ_ENTRY(vm_page) pageq; /* queue info for FIFO queue or free list (Q) */
TAILQ_ENTRY(vm_page) listq; /* pages in same object (O) */
- struct vm_page *left; /* splay tree link (O) */
- struct vm_page *right; /* splay tree link (O) */
vm_object_t object; /* which object am I in (O,P)*/
vm_pindex_t pindex; /* offset into object (O,P) */
@@ -371,9 +369,9 @@ vm_page_t vm_page_alloc_freelist(int, int);
struct vnode *vm_page_alloc_init(vm_page_t);
vm_page_t vm_page_grab (vm_object_t, vm_pindex_t, int);
void vm_page_cache(vm_page_t);
-void vm_page_cache_free(vm_object_t, vm_pindex_t, vm_pindex_t);
+void vm_page_cache_free(vm_page_t);
void vm_page_cache_remove(vm_page_t);
-void vm_page_cache_transfer(vm_object_t, vm_pindex_t, vm_object_t);
+void vm_page_cache_rename(vm_page_t, vm_object_t, vm_pindex_t);
int vm_page_try_to_cache (vm_page_t);
int vm_page_try_to_free (vm_page_t);
void vm_page_dontneed(vm_page_t);
@@ -392,7 +390,6 @@ void vm_page_rename (vm_page_t, vm_object_t, vm_pindex_t);
void vm_page_requeue(vm_page_t m);
void vm_page_set_valid(vm_page_t m, int base, int size);
void vm_page_sleep(vm_page_t m, const char *msg);
-vm_page_t vm_page_splay(vm_pindex_t, vm_page_t);
vm_offset_t vm_page_startup(vm_offset_t vaddr);
void vm_page_unhold_pages(vm_page_t *ma, int count);
void vm_page_unwire (vm_page_t, int);
diff --git a/sys/vm/vm_radix.c b/sys/vm/vm_radix.c
index 4f4d791..1dd3de6 100644
--- a/sys/vm/vm_radix.c
+++ b/sys/vm/vm_radix.c
@@ -1,6 +1,6 @@
/*
- * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
* Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
+ * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -211,6 +211,18 @@ vm_radix_setroot(struct vm_radix *rtree, struct vm_radix_node *rnode,
rtree->rt_root = root;
}
+static inline void *
+vm_radix_match(void *child, int color)
+{
+ uintptr_t c;
+
+ c = (uintptr_t)child;
+
+ if ((c & color) == 0)
+ return (NULL);
+ return ((void *)(c & ~VM_RADIX_FLAGS));
+}
+
/*
* Inserts the key-value pair in to the radix tree. Returns errno.
* Panics if the key already exists.
@@ -277,14 +289,15 @@ vm_radix_insert(struct vm_radix *rtree, vm_pindex_t index, void *val)
rnode = rnode->rn_child[slot];
}
- slot = vm_radix_slot(index, level);
+ slot = vm_radix_slot(index, 0);
CTR5(KTR_VM, "insert: tree %p, index %p, level %d, slot %d, child %p",
rtree, (void *)index, level, slot, rnode->rn_child[slot]);
KASSERT(rnode->rn_child[slot] == NULL,
("vm_radix_insert: Duplicate value %p at index: %lu\n",
rnode->rn_child[slot], (u_long)index));
+ val = (void *)((uintptr_t)val | VM_RADIX_BLACK);
rnode->rn_child[slot] = val;
- rnode->rn_count++;
+ atomic_add_int((volatile int *)&rnode->rn_count, 1);
return 0;
}
@@ -294,7 +307,7 @@ vm_radix_insert(struct vm_radix *rtree, vm_pindex_t index, void *val)
* NULL is returned.
*/
void *
-vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index)
+vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index, int color)
{
struct vm_radix_node *rnode;
int slot;
@@ -310,7 +323,7 @@ vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index)
"lookup: tree %p, index %p, level %d, slot %d, child %p",
rtree, (void *)index, level, slot, rnode->rn_child[slot]);
if (level == 0)
- return rnode->rn_child[slot];
+ return vm_radix_match(rnode->rn_child[slot], color);
rnode = rnode->rn_child[slot];
level--;
}
@@ -319,106 +332,121 @@ vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index)
return NULL;
}
+void *
+vm_radix_color(struct vm_radix *rtree, vm_pindex_t index, int color)
+{
+ struct vm_radix_node *rnode;
+ uintptr_t child;
+ int slot;
+ int level;
+
+ level = vm_radix_height(rtree, &rnode);
+ if (index > VM_RADIX_MAX(level))
+ return NULL;
+ level--;
+ while (rnode) {
+ slot = vm_radix_slot(index, level);
+ CTR5(KTR_VM,
+ "color: tree %p, index %p, level %d, slot %d, child %p",
+ rtree, (void *)index, level, slot, rnode->rn_child[slot]);
+ if (level == 0)
+ break;
+ rnode = rnode->rn_child[slot];
+ level--;
+ }
+ if (rnode == NULL || rnode->rn_child[slot] == NULL)
+ return (NULL);
+ child = (uintptr_t)rnode->rn_child[slot];
+ child &= ~VM_RADIX_FLAGS;
+ rnode->rn_child[slot] = (void *)(child | color);
+
+ return (void *)child;
+}
+
/*
- * Looks up as many as cnt values between start and end inclusive, and stores
- * them in the caller allocated array out. The next index can be used to
- * restart the scan. This optimizes forward scans in the tree.
+ * Looks up as many as cnt values between start and end, and stores
+ * them in the caller allocated array out. The next index can be used
+ * to restart the scan. This optimizes forward scans in the tree.
*/
int
vm_radix_lookupn(struct vm_radix *rtree, vm_pindex_t start,
- vm_pindex_t end, void **out, int cnt, vm_pindex_t *next)
+ vm_pindex_t end, int color, void **out, int cnt, vm_pindex_t *next)
{
struct vm_radix_node *rnode;
- struct vm_radix_node *child;
- vm_pindex_t max;
vm_pindex_t inc;
int slot;
int level;
void *val;
int outidx;
- int loops = 0;
CTR3(KTR_VM, "lookupn: tree %p, start %p, end %p",
rtree, (void *)start, (void *)end);
outidx = 0;
restart:
level = vm_radix_height(rtree, &rnode);
- max = VM_RADIX_MAX(level);
- if (start > max)
+ if (rnode == NULL || start > VM_RADIX_MAX(level))
+ goto out;
+ if (end && start >= end)
goto out;
- if (end > max || end == 0)
- end = max;
- loops++;
- if (loops > 1000)
- panic("vm_radix_lookupn: looping %d\n", loops);
/*
* Search the tree from the top for any leaf node holding an index
* between start and end.
*/
- level--;
- while (rnode) {
+ for (level--; level; level--) {
slot = vm_radix_slot(start, level);
CTR5(KTR_VM,
"lookupn: tree %p, index %p, level %d, slot %d, child %p",
rtree, (void *)start, level, slot, rnode->rn_child[slot]);
- if (level == 0)
- break;
+ if (rnode->rn_child[slot] != NULL) {
+ rnode = rnode->rn_child[slot];
+ continue;
+ }
/*
- * If we don't have an exact match we must start our search
- * from the next leaf and adjust our index appropriately.
- */
- if ((child = rnode->rn_child[slot]) == NULL) {
- /*
- * Calculate how much to increment our index by
- * based on the tree level. We must truncate the
- * lower bits to start from the begnning of the next
- * leaf.
- */
- inc = 1LL << (level * VM_RADIX_WIDTH);
- start &= ~VM_RADIX_MAX(level);
- start += inc;
- slot++;
- CTR5(KTR_VM,
- "lookupn: start %p end %p inc %d mask 0x%lX slot %d",
- (void *)start, (void *)end, inc, ~VM_RADIX_MAX(level), slot);
- for (; slot < VM_RADIX_COUNT && start <= end;
- slot++, start += inc) {
- child = rnode->rn_child[slot];
- if (child)
- break;
+ * Calculate how much to increment our index by
+ * based on the tree level. We must truncate the
+ * lower bits to start from the begnning of the
+ * next leaf.
+ */
+ inc = 1LL << (level * VM_RADIX_WIDTH);
+ start &= ~VM_RADIX_MAX(level);
+ start += inc;
+ slot++;
+ CTR5(KTR_VM,
+ "lookupn: start %p end %p inc %d mask 0x%lX slot %d",
+ (void *)start, (void *)end, inc,
+ ~VM_RADIX_MAX(level), slot);
+ for (; slot < VM_RADIX_COUNT; slot++, start += inc) {
+ if (end != 0 && start >= end)
+ goto out;
+ if (rnode->rn_child[slot]) {
+ rnode = rnode->rn_child[slot];
+ break;
}
}
- rnode = child;
- level--;
- }
- if (rnode == NULL) {
- /*
- * If we still have another range to search, start looking
- * from the next node. Otherwise, return what we've already
- * found. The loop above has already adjusted start to the
- * beginning of the next node.
- *
- * Detect start wrapping back to 0 and return in this case.
- */
- if (start <= end && start != 0)
+ if (slot == VM_RADIX_COUNT)
goto restart;
- goto out;
}
- for (; outidx < cnt && slot < VM_RADIX_COUNT && start <= end;
- slot++, start++) {
- val = rnode->rn_child[slot];
+ slot = vm_radix_slot(start, 0);
+ for (; slot < VM_RADIX_COUNT; slot++, start++) {
+ if (end != 0 && start >= end)
+ goto out;
+ val = vm_radix_match(rnode->rn_child[slot], color);
if (val == NULL)
continue;
+ CTR4(KTR_VM,
+ "lookupn: tree %p index %p slot %d found child %p",
+ rtree, (void *)start, slot, val);
out[outidx++] = val;
+ if (outidx == cnt)
+ goto out;
}
/*
* Go fetch the next page to fill the requested number of pages
* otherwise the caller will simply call us again to fulfill the
* same request after the structures are pushed out of cache.
*/
- if (outidx < cnt && start <= end)
+ if ((end == 0 || start < end))
goto restart;
-
out:
*next = start;
@@ -429,15 +457,15 @@ out:
* Look up any entry at a position less than or equal to index.
*/
void *
-vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index)
+vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index, int color)
{
struct vm_radix_node *rnode;
struct vm_radix_node *child;
vm_pindex_t max;
vm_pindex_t inc;
+ void *val;
int slot;
int level;
- int loops = 0;
CTR2(KTR_VM,
"lookup_le: tree %p, index %p", rtree, (void *)index);
@@ -448,9 +476,6 @@ restart:
max = VM_RADIX_MAX(level);
if (index > max || index == 0)
index = max;
- loops++;
- if (loops > 1000)
- panic("vm_radix_looku_le: looping %d\n", loops);
/*
* Search the tree from the top for any leaf node holding an index
* lower than 'index'.
@@ -492,8 +517,9 @@ restart:
}
if (rnode) {
for (; slot >= 0; slot--, index--) {
- if (rnode->rn_child[slot])
- return (rnode->rn_child[slot]);
+ val = vm_radix_match(rnode->rn_child[slot], color);
+ if (val)
+ return (val);
}
}
if (index != -1)
@@ -507,7 +533,7 @@ restart:
* panics if the key is not present.
*/
void *
-vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index)
+vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index, int color)
{
struct vm_radix_node *stack[VM_RADIX_LIMIT];
struct vm_radix_node *rnode, *root;
@@ -534,15 +560,27 @@ vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index)
rtree, (void *)index, level, slot, rnode->rn_child[slot]);
level--;
}
+ KASSERT(rnode != NULL,
+ ("vm_radix_remove: index %jd not present in the tree.\n", index));
slot = vm_radix_slot(index, 0);
- KASSERT(rnode != NULL && rnode->rn_child[slot] != NULL,
+ val = vm_radix_match(rnode->rn_child[slot], color);
+ KASSERT(val != NULL,
("vm_radix_remove: index %jd not present in the tree.\n", index));
- val = rnode->rn_child[slot];
for (;;) {
rnode->rn_child[slot] = NULL;
- rnode->rn_count--;
- if (rnode->rn_count > 0)
+ /*
+ * Use atomics for the last level since red and black
+ * will both adjust it.
+ */
+ if (level == 0)
+ atomic_add_int((volatile int *)&rnode->rn_count, -1);
+ else
+ rnode->rn_count--;
+ /*
+ * Only allow black removes to prune the tree.
+ */
+ if ((color & VM_RADIX_BLACK) == 0 || rnode->rn_count > 0)
break;
vm_radix_node_put(rnode);
if (rnode == root) {
diff --git a/sys/vm/vm_radix.h b/sys/vm/vm_radix.h
index fac839f..487d4f0 100644
--- a/sys/vm/vm_radix.h
+++ b/sys/vm/vm_radix.h
@@ -35,7 +35,13 @@
#define VM_RADIX_COUNT (1 << VM_RADIX_WIDTH)
#define VM_RADIX_MASK (VM_RADIX_COUNT - 1)
#define VM_RADIX_LIMIT howmany((sizeof(vm_pindex_t) * NBBY), VM_RADIX_WIDTH)
-#define VM_RADIX_HEIGHT 0xf /* Bits of height in root */
+#define VM_RADIX_FLAGS 0x3 /* Flag bits stored in node pointers. */
+#define VM_RADIX_BLACK 0x1 /* Black node. (leaf only) */
+#define VM_RADIX_RED 0x2 /* Red node. (leaf only) */
+#define VM_RADIX_ANY (VM_RADIX_RED | VM_RADIX_BLACK)
+#define VM_RADIX_EMPTY 0x1 /* Empty hint. (internal only) */
+#define VM_RADIX_HEIGHT 0xf /* Bits of height in root */
+#define VM_RADIX_STACK 8 /* Nodes to store on stack. */
CTASSERT(VM_RADIX_HEIGHT >= VM_RADIX_LIMIT);
@@ -45,7 +51,7 @@ CTASSERT(VM_RADIX_HEIGHT >= VM_RADIX_LIMIT);
(((vm_pindex_t)1 << ((h) * VM_RADIX_WIDTH)) - 1))
struct vm_radix_node {
- void *rn_child[VM_RADIX_COUNT]; /* child nodes. */
+ void *rn_child[VM_RADIX_COUNT]; /* child nodes. */
uint16_t rn_count; /* Valid children. */
};
@@ -58,24 +64,32 @@ struct vm_radix {
};
void vm_radix_init(void);
+
+/*
+ * Functions which only work with black nodes. (object lock)
+ */
int vm_radix_insert(struct vm_radix *, vm_pindex_t, void *);
-void *vm_radix_remove(struct vm_radix *, vm_pindex_t);
-void *vm_radix_lookup(struct vm_radix *, vm_pindex_t);
-int vm_radix_lookupn(struct vm_radix *rtree, vm_pindex_t start,
- vm_pindex_t end, void **out, int cnt, vm_pindex_t *next);
-void *vm_radix_lookup_le(struct vm_radix *, vm_pindex_t);
void vm_radix_shrink(struct vm_radix *);
/*
+ * Functions which work on specified colors. (object, vm_page_queue_free locks)
+ */
+void *vm_radix_color(struct vm_radix *, vm_pindex_t, int color);
+void *vm_radix_lookup(struct vm_radix *, vm_pindex_t, int color);
+int vm_radix_lookupn(struct vm_radix *rtree, vm_pindex_t start,
+ vm_pindex_t end, int color, void **out, int cnt, vm_pindex_t *next);
+void *vm_radix_lookup_le(struct vm_radix *, vm_pindex_t, int color);
+void *vm_radix_remove(struct vm_radix *, vm_pindex_t, int color);
+
+/*
* Look up any entry at a position greater or equal to index.
*/
static inline void *
-vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index)
+vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index, int color)
{
- vm_pindex_t unused;
void *val;
- if (vm_radix_lookupn(rtree, index, 0, &val, 1, &unused))
+ if (vm_radix_lookupn(rtree, index, 0, color, &val, 1, &index))
return (val);
return (NULL);
}
diff --git a/sys/vm/vm_reserv.c b/sys/vm/vm_reserv.c
index 376f643..8d7b0d4 100644
--- a/sys/vm/vm_reserv.c
+++ b/sys/vm/vm_reserv.c
@@ -309,8 +309,7 @@ vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex)
/*
* Look for an existing reservation.
*/
-#ifdef VM_RADIX
- mpred = vm_radix_lookup_le(&object->rtree, pindex);
+ mpred = vm_radix_lookup_le(&object->rtree, pindex, VM_RADIX_BLACK);
if (mpred != NULL) {
KASSERT(mpred->pindex != pindex,
("vm_reserv_alloc_page: pindex already allocated"));
@@ -323,7 +322,7 @@ vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex)
return (m);
}
}
- msucc = vm_radix_lookup_ge(&object->rtree, pindex);
+ msucc = vm_radix_lookup_ge(&object->rtree, pindex, VM_RADIX_BLACK);
if (msucc != NULL) {
KASSERT(msucc->pindex != pindex,
("vm_reserv_alloc_page: pindex already allocated"));
@@ -337,47 +336,6 @@ vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex)
}
}
-#else
- msucc = NULL;
- mpred = object->root;
- while (mpred != NULL) {
- KASSERT(mpred->pindex != pindex,
- ("vm_reserv_alloc_page: pindex already allocated"));
- rv = vm_reserv_from_page(mpred);
- if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) {
- m = &rv->pages[VM_RESERV_INDEX(object, pindex)];
- /* Handle vm_page_rename(m, new_object, ...). */
- if ((m->flags & (PG_CACHED | PG_FREE)) == 0)
- return (NULL);
- vm_reserv_populate(rv);
- return (m);
- } else if (mpred->pindex < pindex) {
- if (msucc != NULL ||
- (msucc = TAILQ_NEXT(mpred, listq)) == NULL)
- break;
- KASSERT(msucc->pindex != pindex,
- ("vm_reserv_alloc_page: pindex already allocated"));
- rv = vm_reserv_from_page(msucc);
- if (rv->object == object &&
- vm_reserv_has_pindex(rv, pindex)) {
- m = &rv->pages[VM_RESERV_INDEX(object, pindex)];
- /* Handle vm_page_rename(m, new_object, ...). */
- if ((m->flags & (PG_CACHED | PG_FREE)) == 0)
- return (NULL);
- vm_reserv_populate(rv);
- return (m);
- } else if (pindex < msucc->pindex)
- break;
- } else if (msucc == NULL) {
- msucc = mpred;
- mpred = TAILQ_PREV(msucc, pglist, listq);
- continue;
- }
- msucc = NULL;
- mpred = object->root = vm_page_splay(pindex, object->root);
- }
-#endif
-
/*
* Determine the first index to the left that can be used.
*/
diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c
index e3222cb..892d9a7 100644
--- a/sys/vm/vnode_pager.c
+++ b/sys/vm/vnode_pager.c
@@ -366,6 +366,7 @@ vnode_pager_setsize(vp, nsize)
vm_ooffset_t nsize;
{
vm_object_t object;
+ struct vnode *drop;
vm_page_t m;
vm_pindex_t nobjsize;
@@ -391,16 +392,41 @@ vnode_pager_setsize(vp, nsize)
/*
* this gets rid of garbage at the end of a page that is now
* only partially backed by the vnode.
- *
- * XXX for some reason (I don't know yet), if we take a
- * completely invalid page and mark it partially valid
- * it can screw up NFS reads, so we don't allow the case.
*/
if ((nsize & PAGE_MASK) &&
- (m = vm_page_lookup(object, OFF_TO_IDX(nsize))) != NULL &&
- m->valid != 0) {
- int base = (int)nsize & PAGE_MASK;
- int size = PAGE_SIZE - base;
+ (m = vm_radix_lookup(&object->rtree, OFF_TO_IDX(nsize),
+ VM_RADIX_ANY)) != NULL) {
+ int base;
+ int size;
+
+ /*
+ * Eliminate any cached page as we would have to
+ * do too much work to save it.
+ */
+ if (m->flags & PG_CACHED) {
+ drop = NULL;
+ mtx_lock(&vm_page_queue_free_mtx);
+ if (m->object == object) {
+ vm_page_cache_remove(m);
+ if (object->cached_page_count == 0)
+ drop = vp;
+ }
+ mtx_unlock(&vm_page_queue_free_mtx);
+ if (drop)
+ vdrop(drop);
+ goto out;
+ }
+ /*
+ * XXX for some reason (I don't know yet), if we take a
+ * completely invalid page and mark it partially valid
+ * it can screw up NFS reads, so we don't allow the
+ * case.
+ */
+ if (m->valid != 0 || m->object != object)
+ goto out;
+
+ base = (int)nsize & PAGE_MASK;
+ size = PAGE_SIZE - base;
/*
* Clear out partial-page garbage in case
@@ -430,12 +456,9 @@ vnode_pager_setsize(vp, nsize)
* replacement from working properly.
*/
vm_page_clear_dirty(m, base, PAGE_SIZE - base);
- } else if ((nsize & PAGE_MASK) &&
- __predict_false(object->cache != NULL)) {
- vm_page_cache_free(object, OFF_TO_IDX(nsize),
- nobjsize);
}
}
+out:
object->un_pager.vnp.vnp_size = nsize;
object->size = nobjsize;
VM_OBJECT_UNLOCK(object);