-rw-r--r--  sys/conf/files                        2
-rw-r--r--  sys/i386/i386/vm_machdep.c            7
-rw-r--r--  sys/kern/kern_malloc.c               27
-rw-r--r--  sys/kern/kern_mbuf.c                385
-rw-r--r--  sys/kern/subr_mbuf.c               1548
-rw-r--r--  sys/kern/uipc_mbuf.c                235
-rw-r--r--  sys/kern/uipc_mbuf2.c                40
-rw-r--r--  sys/kern/uipc_sockbuf.c              13
-rw-r--r--  sys/kern/uipc_socket.c               93
-rw-r--r--  sys/kern/uipc_socket2.c              13
-rw-r--r--  sys/kern/uipc_syscalls.c             16
-rw-r--r--  sys/sparc64/sparc64/vm_machdep.c      7
-rw-r--r--  sys/sys/mbuf.h                      203
-rw-r--r--  sys/vm/uma.h                         78
-rw-r--r--  sys/vm/uma_core.c                   890
-rw-r--r--  sys/vm/uma_dbg.c                     34
-rw-r--r--  sys/vm/uma_int.h                    175
-rw-r--r--  sys/vm/vm_kern.c                     10
-rw-r--r--  usr.bin/netstat/main.c                8
-rw-r--r--  usr.bin/netstat/mbuf.c              196
-rw-r--r--  usr.bin/netstat/netstat.1             4
-rw-r--r--  usr.bin/netstat/netstat.h             1
-rw-r--r--  usr.bin/systat/mbufs.c               53
23 files changed, 1701 insertions, 2337 deletions
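
Allocator consumers are unchanged by this commit: the new Mbuf, Cluster, and Packet zones (see the diagram in kern_mbuf.c below) are still reached through the ordinary mbuf(9) calls. The following consumer-side sketch is not part of the commit; it only assumes the m_getcl()/m_gethdr()/m_clget() interfaces named in that diagram.

/*
 * Hedged consumer-side sketch: exercising the three zones from the
 * kern_mbuf.c diagram through the existing mbuf(9) API.  Not part of
 * this commit; M_DONTWAIT/MT_DATA usage is assumed era-appropriate.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>

static struct mbuf *
example_alloc_packet(void)
{
	struct mbuf *m;

	/* One call hits the Packet secondary zone: mbuf + cluster together. */
	m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
	if (m != NULL)
		return (m);

	/* Fallback: Mbuf master zone first, then attach from the Cluster zone. */
	m = m_gethdr(M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return (NULL);
	m_clget(m, M_DONTWAIT);		/* sets M_EXT on success */
	if ((m->m_flags & M_EXT) == 0) {
		m_freem(m);
		return (NULL);
	}
	return (m);
}
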
diff --git a/sys/conf/files b/sys/conf/files
index c2d7e7e..0d48a92 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -1075,6 +1075,7 @@ kern/kern_lock.c standard
kern/kern_lockf.c standard
kern/kern_mac.c standard
kern/kern_malloc.c standard
+kern/kern_mbuf.c standard
kern/kern_mib.c standard
kern/kern_module.c standard
kern/kern_mutex.c standard
@@ -1116,7 +1117,6 @@ kern/subr_hints.c standard
kern/subr_kobj.c standard
kern/subr_log.c standard
kern/subr_mbpool.c optional libmbpool
-kern/subr_mbuf.c standard
kern/subr_mchain.c optional libmchain
kern/subr_module.c standard
kern/subr_msgbuf.c standard
diff --git a/sys/i386/i386/vm_machdep.c b/sys/i386/i386/vm_machdep.c
index 50fd06e..9a2f9e3 100644
--- a/sys/i386/i386/vm_machdep.c
+++ b/sys/i386/i386/vm_machdep.c
@@ -95,6 +95,10 @@ __FBSDID("$FreeBSD$");
#include <i386/isa/isa.h>
#endif
+#ifndef NSFBUFS
+#define NSFBUFS (512 + maxusers * 16)
+#endif
+
static void cpu_reset_real(void);
#ifdef SMP
static void cpu_reset_proxy(void);
@@ -584,6 +588,9 @@ sf_buf_init(void *arg)
vm_offset_t sf_base;
int i;
+ nsfbufs = NSFBUFS;
+ TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs);
+
sf_buf_active = hashinit(nsfbufs, M_TEMP, &sf_buf_hashmask);
TAILQ_INIT(&sf_buf_freelist);
sf_base = kmem_alloc_nofault(kernel_map, nsfbufs * PAGE_SIZE);
diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c
index c92e70f..4bc3348 100644
--- a/sys/kern/kern_malloc.c
+++ b/sys/kern/kern_malloc.c
@@ -191,6 +191,7 @@ malloc(size, type, flags)
int indx;
caddr_t va;
uma_zone_t zone;
+ uma_keg_t keg;
#ifdef DIAGNOSTIC
unsigned long osize = size;
#endif
@@ -235,6 +236,7 @@ malloc(size, type, flags)
size = (size & ~KMEM_ZMASK) + KMEM_ZBASE;
indx = kmemsize[size >> KMEM_ZSHIFT];
zone = kmemzones[indx].kz_zone;
+ keg = zone->uz_keg;
#ifdef MALLOC_PROFILE
krequests[size >> KMEM_ZSHIFT]++;
#endif
@@ -244,10 +246,11 @@ malloc(size, type, flags)
goto out;
ksp->ks_size |= 1 << indx;
- size = zone->uz_size;
+ size = keg->uk_size;
} else {
size = roundup(size, PAGE_SIZE);
zone = NULL;
+ keg = NULL;
va = uma_large_malloc(size, flags);
mtx_lock(&ksp->ks_mtx);
if (va == NULL)
@@ -309,7 +312,7 @@ free(addr, type)
#ifdef INVARIANTS
struct malloc_type **mtp = addr;
#endif
- size = slab->us_zone->uz_size;
+ size = slab->us_keg->uk_size;
#ifdef INVARIANTS
/*
* Cache a pointer to the malloc_type that most recently freed
@@ -325,7 +328,7 @@ free(addr, type)
sizeof(struct malloc_type *);
*mtp = type;
#endif
- uma_zfree_arg(slab->us_zone, addr, slab);
+ uma_zfree_arg(LIST_FIRST(&slab->us_keg->uk_zones), addr, slab);
} else {
size = slab->us_size;
uma_large_free(slab);
@@ -364,8 +367,8 @@ realloc(addr, size, type, flags)
("realloc: address %p out of range", (void *)addr));
/* Get the size of the original block */
- if (slab->us_zone)
- alloc = slab->us_zone->uz_size;
+ if (slab->us_keg)
+ alloc = slab->us_keg->uk_size;
else
alloc = slab->us_size;
@@ -410,7 +413,6 @@ kmeminit(dummy)
void *dummy;
{
u_int8_t indx;
- u_long npg;
u_long mem_size;
int i;
@@ -428,7 +430,7 @@ kmeminit(dummy)
* Note that the kmem_map is also used by the zone allocator,
* so make sure that there is enough space.
*/
- vm_kmem_size = VM_KMEM_SIZE;
+ vm_kmem_size = VM_KMEM_SIZE + nmbclusters * PAGE_SIZE;
mem_size = cnt.v_page_count;
#if defined(VM_KMEM_SIZE_SCALE)
@@ -462,17 +464,8 @@ kmeminit(dummy)
*/
init_param3(vm_kmem_size / PAGE_SIZE);
- /*
- * In mbuf_init(), we set up submaps for mbufs and clusters, in which
- * case we rounddown() (nmbufs * MSIZE) and (nmbclusters * MCLBYTES),
- * respectively. Mathematically, this means that what we do here may
- * amount to slightly more address space than we need for the submaps,
- * but it never hurts to have an extra page in kmem_map.
- */
- npg = (nmbufs*MSIZE + nmbclusters*MCLBYTES + vm_kmem_size) / PAGE_SIZE;
-
kmem_map = kmem_suballoc(kernel_map, (vm_offset_t *)&kmembase,
- (vm_offset_t *)&kmemlimit, (vm_size_t)(npg * PAGE_SIZE));
+ (vm_offset_t *)&kmemlimit, vm_kmem_size);
kmem_map->system_map = 1;
uma_startup2();
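
The hunk above also changes how kmem_map is sized: instead of explicitly reserving room for the old mbuf and cluster submaps (nmbufs * MSIZE + nmbclusters * MCLBYTES), it pads vm_kmem_size by nmbclusters * PAGE_SIZE, since mbufs and clusters now come out of kmem_map through UMA. A hedged back-of-the-envelope comparison, assuming maxusers = 64, PAGE_SIZE = 4096, MSIZE = 256 and MCLBYTES = 2048, with defaults taken from tunable_mbinit() (new) and NMBCLUSTERS/NMBUFS (old) elsewhere in this diff:

/*
 * Hedged sizing sketch; the constants above are assumptions, the formulas
 * are the ones visible in this diff.
 */
#include <stdio.h>

int
main(void)
{
	const unsigned long page_size = 4096, msize = 256, mclbytes = 2048;
	const unsigned long maxusers = 64;
	const unsigned long nmbclusters = 1024 + maxusers * 64;	/* 5120 */
	const unsigned long nmbufs = nmbclusters * 2;		/* old default */

	/* Old: explicit address space for the mbuf and cluster submaps. */
	unsigned long old_pad = nmbufs * msize + nmbclusters * mclbytes;
	/* New: one page per potential cluster on top of VM_KMEM_SIZE. */
	unsigned long new_pad = nmbclusters * page_size;

	printf("old pad: %lu MB, new pad: %lu MB\n",
	    old_pad >> 20, new_pad >> 20);	/* ~12 MB vs 20 MB */
	return (0);
}
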
diff --git a/sys/kern/kern_mbuf.c b/sys/kern/kern_mbuf.c
new file mode 100644
index 0000000..2bec5ad
--- /dev/null
+++ b/sys/kern/kern_mbuf.c
@@ -0,0 +1,385 @@
+/*-
+ * Copyright (c) 2004
+ * Bosko Milekic <bmilekic@FreeBSD.org>.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of contributors may be
+ * used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_mac.h"
+#include "opt_param.h"
+
+#include <sys/param.h>
+#include <sys/mac.h>
+#include <sys/malloc.h>
+#include <sys/systm.h>
+#include <sys/mbuf.h>
+#include <sys/domain.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#include <sys/protosw.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <vm/uma.h>
+
+/*
+ * In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA
+ * Zones.
+ *
+ * Mbuf Clusters (2K, contiguous) are allocated from the Cluster
+ * Zone. The Zone can be capped at kern.ipc.nmbclusters, if the
+ * administrator so desires.
+ *
+ * Mbufs are allocated from a UMA Master Zone called the Mbuf
+ * Zone.
+ *
+ * Additionally, FreeBSD provides a Packet Zone, which it
+ * configures as a Secondary Zone to the Mbuf Master Zone,
+ * thus sharing backend Slab kegs with the Mbuf Master Zone.
+ *
+ * Thus common-case allocations and locking are simplified:
+ *
+ * m_clget() m_getcl()
+ * | |
+ * | .------------>[(Packet Cache)] m_get(), m_gethdr()
+ * | | [ Packet ] |
+ * [(Cluster Cache)] [ Secondary ] [ (Mbuf Cache) ]
+ * [ Cluster Zone ] [ Zone ] [ Mbuf Master Zone ]
+ * | \________ |
+ * [ Cluster Keg ] \ /
+ * | [ Mbuf Keg ]
+ * [ Cluster Slabs ] |
+ * | [ Mbuf Slabs ]
+ * \____________(VM)_________________/
+ */
+
+int nmbclusters;
+struct mbstat mbstat;
+
+static void
+tunable_mbinit(void *dummy)
+{
+
+ /* This has to be done before VM init. */
+ nmbclusters = 1024 + maxusers * 64;
+ TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
+}
+SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL);
+
+SYSCTL_DECL(_kern_ipc);
+SYSCTL_INT(_kern_ipc, OID_AUTO, nmbclusters, CTLFLAG_RW, &nmbclusters, 0,
+ "Maximum number of mbuf clusters allowed");
+SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat,
+ "Mbuf general information and statistics");
+
+/*
+ * Zones from which we allocate.
+ */
+uma_zone_t zone_mbuf;
+uma_zone_t zone_clust;
+uma_zone_t zone_pack;
+
+/*
+ * Local prototypes.
+ */
+static void mb_ctor_mbuf(void *, int, void *);
+static void mb_ctor_clust(void *, int, void *);
+static void mb_ctor_pack(void *, int, void *);
+static void mb_dtor_mbuf(void *, int, void *);
+static void mb_dtor_clust(void *, int, void *); /* XXX */
+static void mb_dtor_pack(void *, int, void *); /* XXX */
+static void mb_init_pack(void *, int);
+static void mb_fini_pack(void *, int);
+
+static void mb_reclaim(void *);
+static void mbuf_init(void *);
+
+/*
+ * Initialize FreeBSD Network buffer allocation.
+ */
+SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL)
+static void
+mbuf_init(void *dummy)
+{
+
+ /*
+ * Configure UMA zones for Mbufs, Clusters, and Packets.
+ */
+ zone_mbuf = uma_zcreate("Mbuf", MSIZE, mb_ctor_mbuf, mb_dtor_mbuf,
+ NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_MAXBUCKET);
+ zone_clust = uma_zcreate("MbufClust", MCLBYTES, mb_ctor_clust,
+ mb_dtor_clust, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
+ if (nmbclusters > 0)
+ uma_zone_set_max(zone_clust, nmbclusters);
+ zone_pack = uma_zsecond_create("Packet", mb_ctor_pack, mb_dtor_pack,
+ mb_init_pack, mb_fini_pack, zone_mbuf);
+
+ /* uma_prealloc() goes here */
+
+ /*
+ * Hook event handler for low-memory situation, used to
+ * drain protocols and push data back to the caches (UMA
+ * later pushes it back to VM).
+ */
+ EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL,
+ EVENTHANDLER_PRI_FIRST);
+
+ /*
+ * [Re]set counters and local statistics knobs.
+ * XXX Some of these should go and be replaced, but UMA stat
+ * gathering needs to be revised.
+ */
+ mbstat.m_mbufs = 0;
+ mbstat.m_mclusts = 0;
+ mbstat.m_drain = 0;
+ mbstat.m_msize = MSIZE;
+ mbstat.m_mclbytes = MCLBYTES;
+ mbstat.m_minclsize = MINCLSIZE;
+ mbstat.m_mlen = MLEN;
+ mbstat.m_mhlen = MHLEN;
+ mbstat.m_numtypes = MT_NTYPES;
+
+ mbstat.m_mcfail = mbstat.m_mpfail = 0;
+ mbstat.sf_iocnt = 0;
+ mbstat.sf_allocwait = mbstat.sf_allocfail = 0;
+}
+
+/*
+ * Constructor for Mbuf master zone.
+ *
+ * The 'arg' pointer points to a mb_args structure which
+ * contains call-specific information required to support the
+ * mbuf allocation API.
+ */
+static void
+mb_ctor_mbuf(void *mem, int size, void *arg)
+{
+ struct mbuf *m;
+ struct mb_args *args;
+ int flags;
+ int how;
+ short type;
+
+ m = (struct mbuf *)mem;
+ args = (struct mb_args *)arg;
+ flags = args->flags;
+ how = args->how;
+ type = args->type;
+
+ m->m_type = type;
+ m->m_next = NULL;
+ m->m_nextpkt = NULL;
+ if (flags & M_PKTHDR) {
+ m->m_data = m->m_pktdat;
+ m->m_flags = M_PKTHDR;
+ m->m_pkthdr.rcvif = NULL;
+ m->m_pkthdr.csum_flags = 0;
+ SLIST_INIT(&m->m_pkthdr.tags);
+#ifdef MAC
+ /* If the label init fails, fail the alloc */
+ if (mac_init_mbuf(m, how) != 0) {
+ m_free(m);
+/* XXX*/ panic("mb_ctor_mbuf(): can't deal with failure!");
+/* return 0; */
+ }
+#endif
+ } else {
+ m->m_data = m->m_dat;
+ m->m_flags = 0;
+ }
+ mbstat.m_mbufs += 1; /* XXX */
+/* return 1;
+*/
+}
+
+/*
+ * The Mbuf master zone and Packet secondary zone destructor.
+ */
+static void
+mb_dtor_mbuf(void *mem, int size, void *arg)
+{
+ struct mbuf *m;
+
+ m = (struct mbuf *)mem;
+ if ((m->m_flags & M_PKTHDR) != 0)
+ m_tag_delete_chain(m, NULL);
+ mbstat.m_mbufs -= 1; /* XXX */
+}
+
+/* XXX Only because of stats */
+static void
+mb_dtor_pack(void *mem, int size, void *arg)
+{
+ struct mbuf *m;
+
+ m = (struct mbuf *)mem;
+ if ((m->m_flags & M_PKTHDR) != 0)
+ m_tag_delete_chain(m, NULL);
+ mbstat.m_mbufs -= 1; /* XXX */
+ mbstat.m_mclusts -= 1; /* XXX */
+}
+
+/*
+ * The Cluster zone constructor.
+ *
+ * Here the 'arg' pointer points to the Mbuf which we
+ * are configuring cluster storage for.
+ */
+static void
+mb_ctor_clust(void *mem, int size, void *arg)
+{
+ struct mbuf *m;
+
+ m = (struct mbuf *)arg;
+ m->m_ext.ext_buf = (caddr_t)mem;
+ m->m_data = m->m_ext.ext_buf;
+ m->m_flags |= M_EXT;
+ m->m_ext.ext_free = NULL;
+ m->m_ext.ext_args = NULL;
+ m->m_ext.ext_size = MCLBYTES;
+ m->m_ext.ext_type = EXT_CLUSTER;
+ m->m_ext.ref_cnt = (u_int *)uma_find_refcnt(zone_clust,
+ m->m_ext.ext_buf);
+ *(m->m_ext.ref_cnt) = 1;
+ mbstat.m_mclusts += 1; /* XXX */
+/* return 1;
+*/
+}
+
+/* XXX */
+static void
+mb_dtor_clust(void *mem, int size, void *arg)
+{
+ mbstat.m_mclusts -= 1; /* XXX */
+}
+
+/*
+ * The Packet secondary zone's init routine, executed on the
+ * object's transition from keg slab to zone cache.
+ */
+static void
+mb_init_pack(void *mem, int size)
+{
+ struct mbuf *m;
+
+ m = (struct mbuf *)mem;
+ m->m_ext.ext_buf = NULL;
+ uma_zalloc_arg(zone_clust, m, M_NOWAIT);
+ if (m->m_ext.ext_buf == NULL) /* XXX */
+ panic("mb_init_pack(): Can't deal with failure yet.");
+ mbstat.m_mclusts -= 1; /* XXX */
+}
+
+/*
+ * The Packet secondary zone's fini routine, executed on the
+ * object's transition from zone cache to keg slab.
+ */
+static void
+mb_fini_pack(void *mem, int size)
+{
+ struct mbuf *m;
+
+ m = (struct mbuf *)mem;
+ uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL);
+ m->m_ext.ext_buf = NULL;
+ mbstat.m_mclusts += 1; /* XXX */
+}
+
+/*
+ * The "packet" keg constructor.
+ */
+static void
+mb_ctor_pack(void *mem, int size, void *arg)
+{
+ struct mbuf *m;
+ struct mb_args *args;
+ int flags, how;
+ short type;
+
+ m = (struct mbuf *)mem;
+ args = (struct mb_args *)arg;
+ flags = args->flags;
+ type = args->type;
+ how = args->how;
+
+ m->m_type = type;
+ m->m_next = NULL;
+ m->m_data = m->m_ext.ext_buf;
+ m->m_flags = flags|M_EXT;
+ m->m_ext.ext_free = NULL;
+ m->m_ext.ext_args = NULL;
+ m->m_ext.ext_size = MCLBYTES;
+ m->m_ext.ext_type = EXT_PACKET;
+ *(m->m_ext.ref_cnt) = 1;
+
+ if (flags & M_PKTHDR) {
+ m->m_nextpkt = NULL;
+ m->m_pkthdr.rcvif = NULL;
+ m->m_pkthdr.csum_flags = 0;
+ SLIST_INIT(&m->m_pkthdr.tags);
+#ifdef MAC
+ /* If the label init fails, fail the alloc */
+ if (mac_init_mbuf(m, how) != 0) {
+ m_free(m);
+/* XXX*/ panic("mb_ctor_pack(): can't deal with failure!");
+/* return 0; */
+ }
+#endif
+ }
+ mbstat.m_mbufs += 1; /* XXX */
+ mbstat.m_mclusts += 1; /* XXX */
+/* return 1;
+*/
+}
+
+/*
+ * This is the protocol drain routine.
+ *
+ * No locks should be held when this is called. The drain routines have to
+ * presently acquire some locks which raises the possibility of lock order
+ * reversal.
+ */
+static void
+mb_reclaim(void *junk)
+{
+ struct domain *dp;
+ struct protosw *pr;
+
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL,
+ "mb_reclaim()");
+
+ mbstat.m_drain++;
+ for (dp = domains; dp != NULL; dp = dp->dom_next)
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
+ if (pr->pr_drain != NULL)
+ (*pr->pr_drain)();
+}
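
The master/secondary layering used above is generic UMA machinery rather than something mbuf-specific, so the same shape applies to any object that wants a second constructor/destructor pair over a shared keg. A minimal sketch following only the call signatures visible in this file (uma_zcreate(), uma_zsecond_create(), uma_zone_set_max() and three-argument ctors/dtors); the "Widget" names and sizes are illustrative, not part of the commit.

/*
 * Hedged sketch of the master/secondary zone pattern from kern_mbuf.c.
 * Signatures follow the calls above; "Widget" and its size are made up.
 */
#include <sys/param.h>
#include <vm/uma.h>

static uma_zone_t widget_zone;		/* master zone, owns the keg */
static uma_zone_t widget_fancy_zone;	/* secondary zone, shares the keg */

static void
widget_ctor(void *mem, int size, void *arg)
{
	/* Per-allocation construction, as mb_ctor_mbuf() does for mbufs. */
}

static void
widget_dtor(void *mem, int size, void *arg)
{
	/* Per-free teardown, as mb_dtor_mbuf() does for mbufs. */
}

static void
widget_zones_init(void)
{
	widget_zone = uma_zcreate("Widget", 256, widget_ctor, widget_dtor,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
	/* Cap the backing keg, as the Cluster zone is capped at nmbclusters. */
	uma_zone_set_max(widget_zone, 1024);
	/* Layer a secondary zone on the same keg, as the Packet zone does. */
	widget_fancy_zone = uma_zsecond_create("FancyWidget", widget_ctor,
	    widget_dtor, NULL, NULL, widget_zone);
}
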
diff --git a/sys/kern/subr_mbuf.c b/sys/kern/subr_mbuf.c
deleted file mode 100644
index d84ef31..0000000
--- a/sys/kern/subr_mbuf.c
+++ /dev/null
@@ -1,1548 +0,0 @@
-/*-
- * Copyright (c) 2001, 2002, 2003
- * Bosko Milekic <bmilekic@FreeBSD.org>. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include "opt_mac.h"
-#include "opt_param.h"
-
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/malloc.h>
-#include <sys/mac.h>
-#include <sys/mbuf.h>
-#include <sys/lock.h>
-#include <sys/mutex.h>
-#include <sys/condvar.h>
-#include <sys/smp.h>
-#include <sys/kernel.h>
-#include <sys/sysctl.h>
-#include <sys/domain.h>
-#include <sys/protosw.h>
-
-#include <vm/vm.h>
-#include <vm/vm_kern.h>
-#include <vm/vm_extern.h>
-#include <vm/pmap.h>
-#include <vm/vm_map.h>
-
-/*
- * mb_alloc: network buffer allocator
- *
- * XXX: currently, the "low watermark" sysctl is marked read-only as its
- * effects are not completely implemented. To be fixed soon.
- */
-
-/*
- * Maximum number of PCPU containers. If you know what you're doing you could
- * explicitly define MBALLOC_NCPU to be exactly the number of CPUs on your
- * system during compilation, and thus prevent kernel structure bloat.
- *
- * SMP and non-SMP kernels clearly have a different number of possible CPUs,
- * but because we cannot assume a dense array of CPUs, we always allocate
- * and traverse PCPU containers up to NCPU amount and merely check for
- * CPU availability.
- */
-#ifdef MBALLOC_NCPU
-#define NCPU MBALLOC_NCPU
-#else
-#define NCPU MAXCPU
-#endif
-
-/*-
- * The mbuf allocator is based on Alfred Perlstein's <alfred@FreeBSD.org>
- * "memcache" proof-of-concept allocator which was itself based on
- * several well-known SMP-friendly allocators.
- *
- * The mb_alloc mbuf allocator is a special when compared to other
- * general-purpose allocators. Some things to take note of:
- *
- * Mbufs and mbuf clusters are two different objects. Sometimes we
- * will allocate a single mbuf, other times a single cluster,
- * other times both. Further, we may sometimes wish to allocate a
- * whole chain of mbufs with clusters. This allocator will perform
- * the common case of each scenario in one function call (this
- * includes constructing or destructing the object) while only
- * locking/unlocking the cache once, if it can get away with it.
- * The caches consist of pure mbufs and pure clusters; that is
- * there are no 'zones' containing mbufs with already pre-hooked
- * clusters. Since we can allocate both objects atomically anyway,
- * we don't bother fragmenting our caches for any particular 'scenarios.'
- *
- * We allocate from seperate sub-maps of kmem_map, thus imposing
- * an ultimate upper-limit on the number of allocatable clusters
- * and mbufs and also, since the clusters all come from a
- * virtually contiguous region, we can keep reference counters
- * for them and "allocate" them purely by indexing into a
- * dense refcount vector.
- *
- * We call out to protocol drain routines (which can be hooked
- * into us) when we're low on space.
- *
- * The mbuf allocator keeps all objects that it allocates in mb_buckets.
- * The buckets keep a number of objects (an object can be an mbuf or an
- * mbuf cluster) and facilitate moving larger sets of contiguous objects
- * from the per-CPU caches to the global cache. The buckets also have
- * the added advantage that objects, when migrated from cache to cache,
- * are migrated in chunks that keep contiguous objects together,
- * minimizing TLB pollution.
- *
- * The buckets are kept on singly-linked lists called "containers." A container
- * is protected by a mutex in order to ensure consistency. The mutex
- * itself is allocated separately and attached to the container at boot time,
- * thus allowing for certain containers to share the same lock. Per-CPU
- * containers for mbufs and mbuf clusters all share the same per-CPU
- * lock whereas the global cache containers for these objects share one
- * global lock.
- */
-struct mb_bucket {
- SLIST_ENTRY(mb_bucket) mb_blist;
- int mb_owner;
- int mb_numfree;
- void *mb_free[0];
-};
-
-struct mb_container {
- SLIST_HEAD(mc_buckethd, mb_bucket) mc_bhead;
- struct mtx *mc_lock;
- int mc_numowner;
- u_int mc_starved;
- long *mc_types;
- u_long *mc_objcount;
- u_long *mc_numbucks;
-};
-
-struct mb_gen_list {
- struct mb_container mb_cont;
- struct cv mgl_mstarved;
-};
-
-struct mb_pcpu_list {
- struct mb_container mb_cont;
-};
-
-/*
- * Boot-time configurable object counts that will determine the maximum
- * number of permitted objects in the mbuf and mcluster cases. In the
- * ext counter (nmbcnt) case, it's just an indicator serving to scale
- * kmem_map size properly - in other words, we may be allowed to allocate
- * more than nmbcnt counters, whereas we will never be allowed to allocate
- * more than nmbufs mbufs or nmbclusters mclusters.
- * As for nsfbufs, it is used to indicate how many sendfile(2) buffers will be
- * allocatable by the sfbuf allocator (found in uipc_syscalls.c)
- */
-#ifndef NMBCLUSTERS
-#define NMBCLUSTERS (1024 + maxusers * 64)
-#endif
-#ifndef NMBUFS
-#define NMBUFS (nmbclusters * 2)
-#endif
-#ifndef NSFBUFS
-#define NSFBUFS (512 + maxusers * 16)
-#endif
-#ifndef NMBCNTS
-#define NMBCNTS (nmbclusters + nsfbufs)
-#endif
-int nmbufs;
-int nmbclusters;
-int nmbcnt;
-int nsfbufs;
-int nsfbufspeak;
-int nsfbufsused;
-
-/*
- * Sizes of objects per bucket. There are this size's worth of mbufs
- * or clusters in each bucket. Please keep these a power-of-2.
- */
-#define MBUF_BUCK_SZ (PAGE_SIZE * 2)
-#define CLUST_BUCK_SZ (PAGE_SIZE * 4)
-
-/*
- * Perform sanity checks of tunables declared above.
- */
-static void
-tunable_mbinit(void *dummy)
-{
-
- /*
- * This has to be done before VM init.
- */
- nmbclusters = NMBCLUSTERS;
- TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
- nmbufs = NMBUFS;
- TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
- nsfbufs = NSFBUFS;
- TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs);
- nmbcnt = NMBCNTS;
- TUNABLE_INT_FETCH("kern.ipc.nmbcnt", &nmbcnt);
- /* Sanity checks */
- if (nmbufs < nmbclusters * 2)
- nmbufs = nmbclusters * 2;
- if (nmbcnt < nmbclusters + nsfbufs)
- nmbcnt = nmbclusters + nsfbufs;
-}
-SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL);
-
-/*
- * The freelist structures and mutex locks. The number statically declared
- * here depends on the number of CPUs.
- *
- * We set up in such a way that all the objects (mbufs, clusters)
- * share the same mutex lock. It has been established that we do not benefit
- * from different locks for different objects, so we use the same lock,
- * regardless of object type. This also allows us to do optimised
- * multi-object allocations without dropping the lock in between.
- */
-struct mb_lstmngr {
- struct mb_gen_list *ml_genlist;
- struct mb_pcpu_list *ml_cntlst[NCPU];
- struct mb_bucket **ml_btable;
- vm_map_t ml_map;
- vm_offset_t ml_mapbase;
- vm_offset_t ml_maptop;
- int ml_mapfull;
- u_int ml_objsize;
- u_int ml_objbucks;
- u_int *ml_wmhigh;
- u_int *ml_wmlow;
-};
-static struct mb_lstmngr mb_list_mbuf, mb_list_clust;
-static struct mtx mbuf_gen, mbuf_pcpu[NCPU];
-static u_int *cl_refcntmap;
-
-/*
- * Local macros for internal allocator structure manipulations.
- */
-#ifdef SMP
-#define MB_GET_PCPU_LIST(mb_lst) (mb_lst)->ml_cntlst[PCPU_GET(cpuid)]
-#else
-#define MB_GET_PCPU_LIST(mb_lst) (mb_lst)->ml_cntlst[0]
-#endif
-
-#define MB_GET_GEN_LIST(mb_lst) (mb_lst)->ml_genlist
-
-#define MB_LOCK_CONT(mb_cnt) mtx_lock((mb_cnt)->mb_cont.mc_lock)
-
-#define MB_UNLOCK_CONT(mb_cnt) mtx_unlock((mb_cnt)->mb_cont.mc_lock)
-
-#define MB_GET_PCPU_LIST_NUM(mb_lst, num) \
- (mb_lst)->ml_cntlst[(num)]
-
-#define MB_BUCKET_INDX(mb_obj, mb_lst) \
- (int)(((caddr_t)(mb_obj) - (caddr_t)(mb_lst)->ml_mapbase) / \
- ((mb_lst)->ml_objbucks * (mb_lst)->ml_objsize))
-
-#define MB_GET_OBJECT(mb_objp, mb_bckt, mb_lst) \
-{ \
- struct mc_buckethd *_mchd = &((mb_lst)->mb_cont.mc_bhead); \
- \
- (mb_bckt)->mb_numfree--; \
- (mb_objp) = (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)]; \
- (*((mb_lst)->mb_cont.mc_objcount))--; \
- if ((mb_bckt)->mb_numfree == 0) { \
- SLIST_REMOVE_HEAD(_mchd, mb_blist); \
- SLIST_NEXT((mb_bckt), mb_blist) = NULL; \
- (mb_bckt)->mb_owner |= MB_BUCKET_FREE; \
- } \
-}
-
-#define MB_PUT_OBJECT(mb_objp, mb_bckt, mb_lst) \
- (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)] = (mb_objp); \
- (mb_bckt)->mb_numfree++; \
- (*((mb_lst)->mb_cont.mc_objcount))++;
-
-#define MB_MBTYPES_INC(mb_cnt, mb_type, mb_num) \
- if ((mb_type) != MT_NOTMBUF) \
- (*((mb_cnt)->mb_cont.mc_types + (mb_type))) += (mb_num)
-
-#define MB_MBTYPES_DEC(mb_cnt, mb_type, mb_num) \
- if ((mb_type) != MT_NOTMBUF) \
- (*((mb_cnt)->mb_cont.mc_types + (mb_type))) -= (mb_num)
-
-/*
- * Ownership of buckets/containers is represented by integers. The PCPU
- * lists range from 0 to NCPU-1. We need a free numerical id for the general
- * list (we use NCPU). We also need a non-conflicting free bit to indicate
- * that the bucket is free and removed from a container, while not losing
- * the bucket's originating container id. We use the highest bit
- * for the free marker.
- */
-#define MB_GENLIST_OWNER (NCPU)
-#define MB_BUCKET_FREE (1 << (sizeof(int) * 8 - 1))
-
-/* Statistics structures for allocator (per-CPU and general). */
-static struct mbpstat mb_statpcpu[NCPU + 1];
-struct mbstat mbstat;
-
-/* Sleep time for wait code (in ticks). */
-static int mbuf_wait = 64;
-
-static u_int mbuf_hiwm = 512; /* High wm on # of mbufs per cache */
-static u_int mbuf_lowm = 128; /* Low wm on # of mbufs per cache */
-static u_int clust_hiwm = 128; /* High wm on # of clusters per cache */
-static u_int clust_lowm = 16; /* Low wm on # of clusters per cache */
-
-/*
- * Objects exported by sysctl(8).
- */
-SYSCTL_DECL(_kern_ipc);
-SYSCTL_INT(_kern_ipc, OID_AUTO, nmbclusters, CTLFLAG_RDTUN, &nmbclusters, 0,
- "Maximum number of mbuf clusters available");
-SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RDTUN, &nmbufs, 0,
- "Maximum number of mbufs available");
-SYSCTL_INT(_kern_ipc, OID_AUTO, nmbcnt, CTLFLAG_RDTUN, &nmbcnt, 0,
- "Number used to scale kmem_map to ensure sufficient space for counters");
-SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
- "Maximum number of sendfile(2) sf_bufs available");
-SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0,
- "Number of sendfile(2) sf_bufs at peak usage");
-SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0,
- "Number of sendfile(2) sf_bufs in use");
-SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW, &mbuf_wait, 0,
- "Sleep time of mbuf subsystem wait allocations during exhaustion");
-SYSCTL_UINT(_kern_ipc, OID_AUTO, mbuf_hiwm, CTLFLAG_RW, &mbuf_hiwm, 0,
- "Upper limit of number of mbufs allowed in each cache");
-SYSCTL_UINT(_kern_ipc, OID_AUTO, mbuf_lowm, CTLFLAG_RD, &mbuf_lowm, 0,
- "Lower limit of number of mbufs allowed in each cache");
-SYSCTL_UINT(_kern_ipc, OID_AUTO, clust_hiwm, CTLFLAG_RW, &clust_hiwm, 0,
- "Upper limit of number of mbuf clusters allowed in each cache");
-SYSCTL_UINT(_kern_ipc, OID_AUTO, clust_lowm, CTLFLAG_RD, &clust_lowm, 0,
- "Lower limit of number of mbuf clusters allowed in each cache");
-SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat,
- "Mbuf general information and statistics");
-SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mb_statpcpu, CTLFLAG_RD, mb_statpcpu,
- sizeof(mb_statpcpu), "S,", "Mbuf allocator per CPU statistics");
-
-/*
- * Prototypes of local allocator routines.
- */
-static void *mb_alloc_wait(struct mb_lstmngr *, short);
-static struct mb_bucket *mb_pop_cont(struct mb_lstmngr *, int,
- struct mb_pcpu_list *);
-static void mb_reclaim(void);
-static void mbuf_init(void *);
-
-/*
- * Initial allocation numbers. Each parameter represents the number of buckets
- * of each object that will be placed initially in each PCPU container for
- * said object.
- */
-#define NMB_MBUF_INIT 2
-#define NMB_CLUST_INIT 8
-
-/*
- * Internal flags that allow for cache locks to remain "persistent" across
- * allocation and free calls. They may be used in combination.
- */
-#define MBP_PERSIST 0x1 /* Return with lock still held. */
-#define MBP_PERSISTENT 0x2 /* Cache lock is already held coming in. */
-
-/*
- * Initialize the mbuf subsystem.
- *
- * We sub-divide the kmem_map into several submaps; this way, we don't have
- * to worry about artificially limiting the number of mbuf or mbuf cluster
- * allocations, due to fear of one type of allocation "stealing" address
- * space initially reserved for another.
- *
- * Set up both the general containers and all the PCPU containers. Populate
- * the PCPU containers with initial numbers.
- */
-MALLOC_DEFINE(M_MBUF, "mbufmgr", "mbuf subsystem management structures");
-SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL)
-static void
-mbuf_init(void *dummy)
-{
- struct mb_pcpu_list *pcpu_cnt;
- vm_size_t mb_map_size;
- int i, j;
-
- /*
- * Set up all the submaps, for each type of object that we deal
- * with in this allocator.
- */
- mb_map_size = (vm_size_t)(nmbufs * MSIZE);
- mb_map_size = rounddown(mb_map_size, MBUF_BUCK_SZ);
- mb_list_mbuf.ml_btable = malloc((unsigned long)mb_map_size /
- MBUF_BUCK_SZ * sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
- if (mb_list_mbuf.ml_btable == NULL)
- goto bad;
- mb_list_mbuf.ml_map = kmem_suballoc(kmem_map,&(mb_list_mbuf.ml_mapbase),
- &(mb_list_mbuf.ml_maptop), mb_map_size);
- mb_list_mbuf.ml_map->system_map = 1;
- mb_list_mbuf.ml_mapfull = 0;
- mb_list_mbuf.ml_objsize = MSIZE;
- mb_list_mbuf.ml_objbucks = MBUF_BUCK_SZ / mb_list_mbuf.ml_objsize;
- mb_list_mbuf.ml_wmhigh = &mbuf_hiwm;
- mb_list_mbuf.ml_wmlow = &mbuf_lowm;
-
- mb_map_size = (vm_size_t)(nmbclusters * MCLBYTES);
- mb_map_size = rounddown(mb_map_size, CLUST_BUCK_SZ);
- mb_list_clust.ml_btable = malloc((unsigned long)mb_map_size /
- CLUST_BUCK_SZ * sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
- if (mb_list_clust.ml_btable == NULL)
- goto bad;
- mb_list_clust.ml_map = kmem_suballoc(kmem_map,
- &(mb_list_clust.ml_mapbase), &(mb_list_clust.ml_maptop),
- mb_map_size);
- mb_list_clust.ml_map->system_map = 1;
- mb_list_clust.ml_mapfull = 0;
- mb_list_clust.ml_objsize = MCLBYTES;
- mb_list_clust.ml_objbucks = CLUST_BUCK_SZ / mb_list_clust.ml_objsize;
- mb_list_clust.ml_wmhigh = &clust_hiwm;
- mb_list_clust.ml_wmlow = &clust_lowm;
-
- /*
- * Allocate required general (global) containers for each object type.
- */
- mb_list_mbuf.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF,
- M_NOWAIT);
- mb_list_clust.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF,
- M_NOWAIT);
- if ((mb_list_mbuf.ml_genlist == NULL) ||
- (mb_list_clust.ml_genlist == NULL))
- goto bad;
-
- /*
- * Initialize condition variables and general container mutex locks.
- */
- mtx_init(&mbuf_gen, "mbuf subsystem general lists lock", NULL, MTX_DEF);
- cv_init(&(mb_list_mbuf.ml_genlist->mgl_mstarved), "mbuf pool starved");
- cv_init(&(mb_list_clust.ml_genlist->mgl_mstarved),
- "mcluster pool starved");
- mb_list_mbuf.ml_genlist->mb_cont.mc_lock =
- mb_list_clust.ml_genlist->mb_cont.mc_lock = &mbuf_gen;
-
- /*
- * Set up the general containers for each object.
- */
- mb_list_mbuf.ml_genlist->mb_cont.mc_numowner =
- mb_list_clust.ml_genlist->mb_cont.mc_numowner = MB_GENLIST_OWNER;
- mb_list_mbuf.ml_genlist->mb_cont.mc_starved =
- mb_list_clust.ml_genlist->mb_cont.mc_starved = 0;
- mb_list_mbuf.ml_genlist->mb_cont.mc_objcount =
- &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbfree);
- mb_list_clust.ml_genlist->mb_cont.mc_objcount =
- &(mb_statpcpu[MB_GENLIST_OWNER].mb_clfree);
- mb_list_mbuf.ml_genlist->mb_cont.mc_numbucks =
- &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbbucks);
- mb_list_clust.ml_genlist->mb_cont.mc_numbucks =
- &(mb_statpcpu[MB_GENLIST_OWNER].mb_clbucks);
- mb_list_mbuf.ml_genlist->mb_cont.mc_types =
- &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbtypes[0]);
- mb_list_clust.ml_genlist->mb_cont.mc_types = NULL;
- SLIST_INIT(&(mb_list_mbuf.ml_genlist->mb_cont.mc_bhead));
- SLIST_INIT(&(mb_list_clust.ml_genlist->mb_cont.mc_bhead));
-
- /*
- * Allocate all the required counters for clusters. This makes
- * cluster allocations/deallocations much faster.
- */
- cl_refcntmap = malloc(nmbclusters * sizeof(u_int), M_MBUF, M_NOWAIT);
- if (cl_refcntmap == NULL)
- goto bad;
-
- /*
- * Initialize general mbuf statistics.
- */
- mbstat.m_msize = mb_list_mbuf.ml_objsize;
- mbstat.m_mclbytes = mb_list_clust.ml_objsize;
- mbstat.m_minclsize = MINCLSIZE;
- mbstat.m_mlen = MLEN;
- mbstat.m_mhlen = MHLEN;
- mbstat.m_numtypes = MT_NTYPES;
- mbstat.m_mbperbuck = mb_list_mbuf.ml_objbucks;
- mbstat.m_clperbuck = mb_list_clust.ml_objbucks;
-
- /*
- * Allocate and initialize PCPU containers.
- */
- for (i = 0; i < NCPU; i++) {
- if (CPU_ABSENT(i)) {
- mb_statpcpu[i].mb_active = 0;
- continue;
- }
-
- mb_list_mbuf.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list),
- M_MBUF, M_NOWAIT);
- mb_list_clust.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list),
- M_MBUF, M_NOWAIT);
- if ((mb_list_mbuf.ml_cntlst[i] == NULL) ||
- (mb_list_clust.ml_cntlst[i] == NULL))
- goto bad;
-
- mtx_init(&mbuf_pcpu[i], "mbuf PCPU list lock", NULL, MTX_DEF);
- mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_lock =
- mb_list_clust.ml_cntlst[i]->mb_cont.mc_lock = &mbuf_pcpu[i];
-
- mb_statpcpu[i].mb_active = 1;
- mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numowner =
- mb_list_clust.ml_cntlst[i]->mb_cont.mc_numowner = i;
- mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_starved =
- mb_list_clust.ml_cntlst[i]->mb_cont.mc_starved = 0;
- mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_objcount =
- &(mb_statpcpu[i].mb_mbfree);
- mb_list_clust.ml_cntlst[i]->mb_cont.mc_objcount =
- &(mb_statpcpu[i].mb_clfree);
- mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numbucks =
- &(mb_statpcpu[i].mb_mbbucks);
- mb_list_clust.ml_cntlst[i]->mb_cont.mc_numbucks =
- &(mb_statpcpu[i].mb_clbucks);
- mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_types =
- &(mb_statpcpu[i].mb_mbtypes[0]);
- mb_list_clust.ml_cntlst[i]->mb_cont.mc_types = NULL;
-
- SLIST_INIT(&(mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_bhead));
- SLIST_INIT(&(mb_list_clust.ml_cntlst[i]->mb_cont.mc_bhead));
-
- /*
- * Perform initial allocations.
- */
- pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_mbuf, i);
- MB_LOCK_CONT(pcpu_cnt);
- for (j = 0; j < NMB_MBUF_INIT; j++) {
- if (mb_pop_cont(&mb_list_mbuf, M_DONTWAIT, pcpu_cnt)
- == NULL)
- goto bad;
- }
- MB_UNLOCK_CONT(pcpu_cnt);
-
- pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_clust, i);
- MB_LOCK_CONT(pcpu_cnt);
- for (j = 0; j < NMB_CLUST_INIT; j++) {
- if (mb_pop_cont(&mb_list_clust, M_DONTWAIT, pcpu_cnt)
- == NULL)
- goto bad;
- }
- MB_UNLOCK_CONT(pcpu_cnt);
- }
-
- return;
-bad:
- panic("mbuf_init(): failed to initialize mbuf subsystem!");
-}
-
-/*
- * Populate a given mbuf PCPU container with a bucket full of fresh new
- * buffers. Return a pointer to the new bucket (already in the container if
- * successful), or return NULL on failure.
- *
- * LOCKING NOTES:
- * PCPU container lock must be held when this is called.
- * The lock is dropped here so that we can cleanly call the underlying VM
- * code. If we fail, we return with no locks held. If we succeed (i.e., return
- * non-NULL), we return with the PCPU lock held, ready for allocation from
- * the returned bucket.
- */
-static struct mb_bucket *
-mb_pop_cont(struct mb_lstmngr *mb_list, int how, struct mb_pcpu_list *cnt_lst)
-{
- struct mb_bucket *bucket;
- caddr_t p;
- int i;
-
- MB_UNLOCK_CONT(cnt_lst);
- /*
- * If our object's (finite) map is starved now (i.e., no more address
- * space), bail out now.
- */
- if (mb_list->ml_mapfull)
- return (NULL);
-
- bucket = malloc(sizeof(struct mb_bucket) +
- mb_list->ml_objbucks * sizeof(void *), M_MBUF, MBTOM(how));
- if (bucket == NULL)
- return (NULL);
-
- p = (caddr_t)kmem_malloc(mb_list->ml_map, mb_list->ml_objsize *
- mb_list->ml_objbucks, MBTOM(how));
- if (p == NULL) {
- free(bucket, M_MBUF);
- if (how == M_TRYWAIT)
- mb_list->ml_mapfull = 1;
- return (NULL);
- }
-
- bucket->mb_numfree = 0;
- mb_list->ml_btable[MB_BUCKET_INDX(p, mb_list)] = bucket;
- for (i = 0; i < mb_list->ml_objbucks; i++) {
- bucket->mb_free[i] = p;
- bucket->mb_numfree++;
- p += mb_list->ml_objsize;
- }
-
- MB_LOCK_CONT(cnt_lst);
- bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
- SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead), bucket, mb_blist);
- (*(cnt_lst->mb_cont.mc_numbucks))++;
- *(cnt_lst->mb_cont.mc_objcount) += bucket->mb_numfree;
-
- return (bucket);
-}
-
-/*
- * Allocate a network buffer.
- * The general case is very easy. Complications only arise if our PCPU
- * container is empty. Things get worse if the PCPU container is empty,
- * the general container is empty, and we've run out of address space
- * in our map; then we try to block if we're willing to (M_TRYWAIT).
- */
-static
-void *
-mb_alloc(struct mb_lstmngr *mb_list, int how, short type, short persist,
- int *pers_list)
-{
- static int last_report;
- struct mb_pcpu_list *cnt_lst;
- struct mb_bucket *bucket;
- void *m;
-
-#ifdef INVARIANTS
- int flags;
-
- flags = how & (M_WAITOK | M_NOWAIT | M_DONTWAIT | M_TRYWAIT);
- if (flags != M_DONTWAIT && flags != M_TRYWAIT) {
- static struct timeval lasterr;
- static int curerr;
- if (ppsratecheck(&lasterr, &curerr, 1)) {
- printf("Bad mbuf alloc flags: %x\n", flags);
- backtrace();
- how = M_TRYWAIT;
- }
- }
-#endif
-
- m = NULL;
- if ((persist & MBP_PERSISTENT) != 0) {
- /*
- * If we're a "persistent" call, then the per-CPU #(pers_list)
- * cache lock is already held, and we just need to refer to
- * the correct cache descriptor.
- */
- cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, *pers_list);
- } else {
- cnt_lst = MB_GET_PCPU_LIST(mb_list);
- MB_LOCK_CONT(cnt_lst);
- }
-
- if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) != NULL) {
- /*
- * This is the easy allocation case. We just grab an object
- * from a bucket in the PCPU container. At worst, we
- * have just emptied the bucket and so we remove it
- * from the container.
- */
- MB_GET_OBJECT(m, bucket, cnt_lst);
- MB_MBTYPES_INC(cnt_lst, type, 1);
-
- /* If asked to persist, do not drop the lock. */
- if ((persist & MBP_PERSIST) == 0)
- MB_UNLOCK_CONT(cnt_lst);
- else
- *pers_list = cnt_lst->mb_cont.mc_numowner;
- } else {
- struct mb_gen_list *gen_list;
-
- /*
- * This is the less-common more difficult case. We must
- * first verify if the general list has anything for us
- * and if that also fails, we must allocate a page from
- * the map and create a new bucket to place in our PCPU
- * container (already locked). If the map is starved then
- * we're really in for trouble, as we have to wait on
- * the general container's condition variable.
- */
- gen_list = MB_GET_GEN_LIST(mb_list);
- MB_LOCK_CONT(gen_list);
-
- if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead)))
- != NULL) {
- /*
- * Give ownership of the bucket to our CPU's
- * container, but only actually put the bucket
- * in the container if it doesn't become free
- * upon removing an mbuf from it.
- */
- SLIST_REMOVE_HEAD(&(gen_list->mb_cont.mc_bhead),
- mb_blist);
- bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
- (*(gen_list->mb_cont.mc_numbucks))--;
- (*(cnt_lst->mb_cont.mc_numbucks))++;
- *(gen_list->mb_cont.mc_objcount) -= bucket->mb_numfree;
- bucket->mb_numfree--;
- m = bucket->mb_free[(bucket->mb_numfree)];
- if (bucket->mb_numfree == 0) {
- SLIST_NEXT(bucket, mb_blist) = NULL;
- bucket->mb_owner |= MB_BUCKET_FREE;
- } else {
- SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead),
- bucket, mb_blist);
- *(cnt_lst->mb_cont.mc_objcount) +=
- bucket->mb_numfree;
- }
- MB_UNLOCK_CONT(gen_list);
- MB_MBTYPES_INC(cnt_lst, type, 1);
-
- /* If asked to persist, do not drop the lock. */
- if ((persist & MBP_PERSIST) == 0)
- MB_UNLOCK_CONT(cnt_lst);
- else
- *pers_list = cnt_lst->mb_cont.mc_numowner;
- } else {
- /*
- * We'll have to allocate a new page.
- */
- MB_UNLOCK_CONT(gen_list);
- bucket = mb_pop_cont(mb_list, how, cnt_lst);
- if (bucket != NULL) {
- MB_GET_OBJECT(m, bucket, cnt_lst);
- MB_MBTYPES_INC(cnt_lst, type, 1);
-
- /* If asked to persist, do not drop the lock. */
- if ((persist & MBP_PERSIST) == 0)
- MB_UNLOCK_CONT(cnt_lst);
- else
- *pers_list=cnt_lst->mb_cont.mc_numowner;
- } else {
- if (how == M_TRYWAIT) {
- /*
- * Absolute worst-case scenario.
- * We block if we're willing to, but
- * only after trying to steal from
- * other lists.
- */
- m = mb_alloc_wait(mb_list, type);
- } else {
- /* XXX: No consistency. */
- mbstat.m_drops++;
-
- if (ticks < last_report ||
- (ticks - last_report) >= hz) {
- last_report = ticks;
- printf(
-"All mbufs or mbuf clusters exhausted, please see tuning(7).\n");
- }
-
- }
- if (m != NULL && (persist & MBP_PERSIST) != 0) {
- cnt_lst = MB_GET_PCPU_LIST(mb_list);
- MB_LOCK_CONT(cnt_lst);
- *pers_list=cnt_lst->mb_cont.mc_numowner;
- }
- }
- }
- }
-
- return (m);
-}
-
-/*
- * This is the worst-case scenario called only if we're allocating with
- * M_TRYWAIT. We first drain all the protocols, then try to find an mbuf
- * by looking in every PCPU container. If we're still unsuccesful, we
- * try the general container one last time and possibly block on our
- * starved cv.
- */
-static void *
-mb_alloc_wait(struct mb_lstmngr *mb_list, short type)
-{
- struct mb_pcpu_list *cnt_lst;
- struct mb_gen_list *gen_list;
- struct mb_bucket *bucket;
- void *m;
- int i, cv_ret;
-
- /*
- * Try to reclaim mbuf-related objects (mbufs, clusters).
- */
- mb_reclaim();
-
- /*
- * Cycle all the PCPU containers. Increment starved counts if found
- * empty.
- */
- for (i = 0; i < NCPU; i++) {
- if (CPU_ABSENT(i))
- continue;
- cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, i);
- MB_LOCK_CONT(cnt_lst);
-
- /*
- * If container is non-empty, get a single object from it.
- * If empty, increment starved count.
- */
- if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) !=
- NULL) {
- MB_GET_OBJECT(m, bucket, cnt_lst);
- MB_MBTYPES_INC(cnt_lst, type, 1);
- MB_UNLOCK_CONT(cnt_lst);
- mbstat.m_wait++; /* XXX: No consistency. */
- return (m);
- } else
- cnt_lst->mb_cont.mc_starved++;
-
- MB_UNLOCK_CONT(cnt_lst);
- }
-
- /*
- * We're still here, so that means it's time to get the general
- * container lock, check it one more time (now that mb_reclaim()
- * has been called) and if we still get nothing, block on the cv.
- */
- gen_list = MB_GET_GEN_LIST(mb_list);
- MB_LOCK_CONT(gen_list);
- if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL) {
- MB_GET_OBJECT(m, bucket, gen_list);
- MB_MBTYPES_INC(gen_list, type, 1);
- MB_UNLOCK_CONT(gen_list);
- mbstat.m_wait++; /* XXX: No consistency. */
- return (m);
- }
-
- gen_list->mb_cont.mc_starved++;
- cv_ret = cv_timedwait(&(gen_list->mgl_mstarved),
- gen_list->mb_cont.mc_lock, mbuf_wait);
- gen_list->mb_cont.mc_starved--;
-
- if ((cv_ret == 0) &&
- ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL)) {
- MB_GET_OBJECT(m, bucket, gen_list);
- MB_MBTYPES_INC(gen_list, type, 1);
- mbstat.m_wait++; /* XXX: No consistency. */
- } else {
- mbstat.m_drops++; /* XXX: No consistency. */
- m = NULL;
- }
-
- MB_UNLOCK_CONT(gen_list);
-
- return (m);
-}
-
-/*-
- * Free an object to its rightful container.
- * In the very general case, this operation is really very easy.
- * Complications arise primarily if:
- * (a) We've hit the high limit on number of free objects allowed in
- * our PCPU container.
- * (b) We're in a critical situation where our container has been
- * marked 'starved' and we need to issue wakeups on the starved
- * condition variable.
- * (c) Minor (odd) cases: our bucket has migrated while we were
- * waiting for the lock; our bucket is in the general container;
- * our bucket is empty.
- */
-static
-void
-mb_free(struct mb_lstmngr *mb_list, void *m, short type, short persist,
- int *pers_list)
-{
- struct mb_pcpu_list *cnt_lst;
- struct mb_gen_list *gen_list;
- struct mb_bucket *bucket;
- u_int owner;
-
- bucket = mb_list->ml_btable[MB_BUCKET_INDX(m, mb_list)];
-
- /*
- * Make sure that if after we lock the bucket's present container the
- * bucket has migrated, that we drop the lock and get the new one.
- */
-retry_lock:
- owner = bucket->mb_owner & ~MB_BUCKET_FREE;
- switch (owner) {
- case MB_GENLIST_OWNER:
- gen_list = MB_GET_GEN_LIST(mb_list);
- if (((persist & MBP_PERSISTENT) != 0) && (*pers_list >= 0)) {
- if (*pers_list != MB_GENLIST_OWNER) {
- cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list,
- *pers_list);
- MB_UNLOCK_CONT(cnt_lst);
- MB_LOCK_CONT(gen_list);
- }
- } else {
- MB_LOCK_CONT(gen_list);
- }
- if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) {
- MB_UNLOCK_CONT(gen_list);
- *pers_list = -1;
- goto retry_lock;
- }
-
- /*
- * If we're intended for the general container, this is
- * real easy: no migrating required. The only `bogon'
- * is that we're now contending with all the threads
- * dealing with the general list, but this is expected.
- */
- MB_PUT_OBJECT(m, bucket, gen_list);
- MB_MBTYPES_DEC(gen_list, type, 1);
- if (bucket->mb_owner & MB_BUCKET_FREE) {
- SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead),
- bucket, mb_blist);
- bucket->mb_owner = MB_GENLIST_OWNER;
- }
- if (gen_list->mb_cont.mc_starved > 0)
- cv_signal(&(gen_list->mgl_mstarved));
- if ((persist & MBP_PERSIST) == 0)
- MB_UNLOCK_CONT(gen_list);
- else
- *pers_list = MB_GENLIST_OWNER;
- break;
-
- default:
- cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, owner);
- if (((persist & MBP_PERSISTENT) != 0) && (*pers_list >= 0)) {
- if (*pers_list == MB_GENLIST_OWNER) {
- gen_list = MB_GET_GEN_LIST(mb_list);
- MB_UNLOCK_CONT(gen_list);
- MB_LOCK_CONT(cnt_lst);
- } else {
- cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list,
- *pers_list);
- owner = *pers_list;
- }
- } else {
- MB_LOCK_CONT(cnt_lst);
- }
- if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) {
- MB_UNLOCK_CONT(cnt_lst);
- *pers_list = -1;
- goto retry_lock;
- }
-
- MB_PUT_OBJECT(m, bucket, cnt_lst);
- MB_MBTYPES_DEC(cnt_lst, type, 1);
- if ((*(cnt_lst->mb_cont.mc_objcount) > *(mb_list->ml_wmhigh)) ||
- (cnt_lst->mb_cont.mc_starved > 0)) {
- /*
- * We've hit the high limit of allowed numbers of mbufs
- * on this PCPU list or we've been flagged that we need
- * to transfer a bucket over to the general cache.
- * We must now migrate a bucket over to the general
- * container.
- */
- gen_list = MB_GET_GEN_LIST(mb_list);
- MB_LOCK_CONT(gen_list);
- if ((bucket->mb_owner & MB_BUCKET_FREE) == 0) {
- bucket =
- SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead));
- SLIST_REMOVE_HEAD(&(cnt_lst->mb_cont.mc_bhead),
- mb_blist);
- }
- SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead),
- bucket, mb_blist);
- bucket->mb_owner = MB_GENLIST_OWNER;
- *(cnt_lst->mb_cont.mc_objcount) -= bucket->mb_numfree;
- *(gen_list->mb_cont.mc_objcount) += bucket->mb_numfree;
- (*(cnt_lst->mb_cont.mc_numbucks))--;
- (*(gen_list->mb_cont.mc_numbucks))++;
-
- /*
- * While we're at it, transfer some of the mbtypes
- * "count load" onto the general list's mbtypes
- * array, seeing as how we're moving the bucket
- * there now, meaning that the freeing of objects
- * there will now decrement the _general list's_
- * mbtypes counters, and no longer our PCPU list's
- * mbtypes counters. We do this for the type presently
- * being freed in an effort to keep the mbtypes
- * counters approximately balanced across all lists.
- */
- MB_MBTYPES_DEC(cnt_lst, type,
- mb_list->ml_objbucks - bucket->mb_numfree);
- MB_MBTYPES_INC(gen_list, type,
- mb_list->ml_objbucks - bucket->mb_numfree);
-
- if (cnt_lst->mb_cont.mc_starved > 0) {
- /*
- * Determine whether or not to keep
- * transferring buckets to the general list
- * or whether we've transferred enough already.
- * The thread that is blocked may end up waking
- * up in the meantime, but transferring an
- * extra bucket in a constrained situation
- * is not so bad, as we're likely to need
- * it soon anyway.
- */
- if (gen_list->mb_cont.mc_starved > 0) {
- cnt_lst->mb_cont.mc_starved--;
- cv_signal(&(gen_list->mgl_mstarved));
- } else
- cnt_lst->mb_cont.mc_starved = 0;
- }
- MB_UNLOCK_CONT(gen_list);
- if ((persist & MBP_PERSIST) == 0)
- MB_UNLOCK_CONT(cnt_lst);
- else
- *pers_list = owner;
- break;
- }
-
- if (bucket->mb_owner & MB_BUCKET_FREE) {
- SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead),
- bucket, mb_blist);
- bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
- }
-
- if ((persist & MBP_PERSIST) == 0)
- MB_UNLOCK_CONT(cnt_lst);
- else
- *pers_list = owner;
- break;
- }
-}
-
-/*
- * Drain protocols in hopes to free up some resources.
- *
- * LOCKING NOTES:
- * No locks should be held when this is called. The drain routines have to
- * presently acquire some locks which raises the possibility of lock order
- * violation if we're holding any mutex if that mutex is acquired in reverse
- * order relative to one of the locks in the drain routines.
- */
-static void
-mb_reclaim(void)
-{
- struct domain *dp;
- struct protosw *pr;
-
- WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL,
- "mb_reclaim()");
-
- mbstat.m_drain++; /* XXX: No consistency. */
-
- for (dp = domains; dp != NULL; dp = dp->dom_next)
- for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
- if (pr->pr_drain != NULL)
- (*pr->pr_drain)();
-}
-
-/******************************************************************************
- * Internal setup macros.
- */
-
-#define _mb_setup(m, type) do { \
- (m)->m_type = (type); \
- (m)->m_next = NULL; \
- (m)->m_nextpkt = NULL; \
- (m)->m_data = (m)->m_dat; \
- (m)->m_flags = 0; \
-} while (0)
-
-#define _mbhdr_setup(m, type) do { \
- (m)->m_type = (type); \
- (m)->m_next = NULL; \
- (m)->m_nextpkt = NULL; \
- (m)->m_data = (m)->m_pktdat; \
- (m)->m_flags = M_PKTHDR; \
- (m)->m_pkthdr.rcvif = NULL; \
- (m)->m_pkthdr.csum_flags = 0; \
- SLIST_INIT(&(m)->m_pkthdr.tags); \
-} while (0)
-
-#define _mcl_setup(m) do { \
- (m)->m_data = (m)->m_ext.ext_buf; \
- (m)->m_flags |= M_EXT; \
- (m)->m_ext.ext_free = NULL; \
- (m)->m_ext.ext_args = NULL; \
- (m)->m_ext.ext_size = MCLBYTES; \
- (m)->m_ext.ext_type = EXT_CLUSTER; \
-} while (0)
-
-#define _mext_init_ref(m, ref) do { \
- (m)->m_ext.ref_cnt = ((ref) == NULL) ? \
- malloc(sizeof(u_int), M_MBUF, M_NOWAIT) : (u_int *)(ref); \
- if ((m)->m_ext.ref_cnt != NULL) { \
- *((m)->m_ext.ref_cnt) = 0; \
- MEXT_ADD_REF((m)); \
- } \
-} while (0)
-
-#define cl2ref(cl) \
- (((uintptr_t)(cl) - (uintptr_t)mb_list_clust.ml_mapbase) >> MCLSHIFT)
-
-#define _mext_dealloc_ref(m) \
- if ((m)->m_ext.ext_type != EXT_EXTREF) \
- free((m)->m_ext.ref_cnt, M_MBUF)
-
-/******************************************************************************
- * Internal routines.
- *
- * Because mb_alloc() and mb_free() are inlines (to keep the common
- * cases down to a maximum of one function call), below are a few
- * routines used only internally for the sole purpose of making certain
- * functions smaller.
- *
- * - _mext_free(): frees associated storage when the ref. count is
- * exactly one and we're freeing.
- *
- * - _mgetm_internal(): common "persistent-lock" routine that allocates
- * an mbuf and a cluster in one shot, but where the lock is already
- * held coming in (which is what makes it different from the exported
- * m_getcl()). The lock is dropped when done. This is used by m_getm()
- * and, therefore, is very m_getm()-specific.
- */
-static struct mbuf *_mgetm_internal(int, short, short, int);
-
-void
-_mext_free(struct mbuf *mb)
-{
-
- if (mb->m_ext.ext_type == EXT_CLUSTER) {
- mb_free(&mb_list_clust, (caddr_t)mb->m_ext.ext_buf, MT_NOTMBUF,
- 0, NULL);
- } else {
- (*(mb->m_ext.ext_free))(mb->m_ext.ext_buf, mb->m_ext.ext_args);
- _mext_dealloc_ref(mb);
- }
-}
-
-static struct mbuf *
-_mgetm_internal(int how, short type, short persist, int cchnum)
-{
- struct mbuf *mb;
-
- mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, persist,&cchnum);
- if (mb == NULL)
- return NULL;
- _mb_setup(mb, type);
-
- if ((persist & MBP_PERSIST) != 0) {
- mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust,
- how, MT_NOTMBUF, MBP_PERSISTENT, &cchnum);
- if (mb->m_ext.ext_buf == NULL) {
- (void)m_free(mb);
- mb = NULL;
- }
- _mcl_setup(mb);
- _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]);
- }
- return (mb);
-}
-
-/******************************************************************************
- * Exported buffer allocation and de-allocation routines.
- */
-
-/*
- * Allocate and return a single (normal) mbuf. NULL is returned on failure.
- *
- * Arguments:
- * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
- * if really starved for memory. M_DONTWAIT to never block.
- * - type: the type of the mbuf being allocated.
- */
-struct mbuf *
-m_get(int how, short type)
-{
- struct mbuf *mb;
-
- mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL);
- if (mb != NULL)
- _mb_setup(mb, type);
- return (mb);
-}
-
-/*
- * Allocate a given length worth of mbufs and/or clusters (whatever fits
- * best) and return a pointer to the top of the allocated chain. If an
- * existing mbuf chain is provided, then we will append the new chain
- * to the existing one but still return the top of the newly allocated
- * chain. NULL is returned on failure, in which case the [optional]
- * provided chain is left untouched, and any memory already allocated
- * is freed.
- *
- * Arguments:
- * - m: existing chain to which to append new chain (optional).
- * - len: total length of data to append, either in mbufs or clusters
- * (we allocate whatever combination yields the best fit).
- * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
- * if really starved for memory. M_DONTWAIT to never block.
- * - type: the type of the mbuf being allocated.
- */
-struct mbuf *
-m_getm(struct mbuf *m, int len, int how, short type)
-{
- struct mbuf *mb, *top, *cur, *mtail;
- int num, rem, cchnum;
- short persist;
- int i;
-
- KASSERT(len >= 0, ("m_getm(): len is < 0"));
-
- /* If m != NULL, we will append to the end of that chain. */
- if (m != NULL)
- for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next);
- else
- mtail = NULL;
-
- /*
- * In the best-case scenario (which should be the common case
- * unless we're in a starvation situation), we will be able to
- * go through the allocation of all the desired mbufs and clusters
- * here without dropping our per-CPU cache lock in between.
- */
- num = len / MCLBYTES;
- rem = len % MCLBYTES;
- persist = 0;
- cchnum = -1;
- top = cur = NULL;
- for (i = 0; i < num; i++) {
- mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type,
- MBP_PERSIST | persist, &cchnum);
- if (mb == NULL)
- goto failed;
- _mb_setup(mb, type);
- mb->m_len = 0;
-
- persist = (i != (num - 1) || rem > 0) ? MBP_PERSIST : 0;
- mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust,
- how, MT_NOTMBUF, persist | MBP_PERSISTENT, &cchnum);
- if (mb->m_ext.ext_buf == NULL) {
- (void)m_free(mb);
- goto failed;
- }
- _mcl_setup(mb);
- _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]);
- persist = MBP_PERSISTENT;
-
- if (cur == NULL)
- top = cur = mb;
- else
- cur = (cur->m_next = mb);
- }
- if (rem > 0) {
- if (cchnum >= 0) {
- persist = MBP_PERSISTENT;
- persist |= (rem > MINCLSIZE) ? MBP_PERSIST : 0;
- mb = _mgetm_internal(how, type, persist, cchnum);
- if (mb == NULL)
- goto failed;
- } else if (rem > MINCLSIZE) {
- mb = m_getcl(how, type, 0);
- } else {
- mb = m_get(how, type);
- }
- if (mb != NULL) {
- mb->m_len = 0;
- if (cur == NULL)
- top = mb;
- else
- cur->m_next = mb;
- } else
- goto failed;
- }
-
- if (mtail != NULL)
- mtail->m_next = top;
- return top;
-failed:
- if (top != NULL)
- m_freem(top);
- return NULL;
-}
-
-/*
- * Allocate and return a single M_PKTHDR mbuf. NULL is returned on failure.
- *
- * Arguments:
- * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
- * if really starved for memory. M_DONTWAIT to never block.
- * - type: the type of the mbuf being allocated.
- */
-struct mbuf *
-m_gethdr(int how, short type)
-{
- struct mbuf *mb;
-
- mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL);
- if (mb != NULL) {
- _mbhdr_setup(mb, type);
-#ifdef MAC
- if (mac_init_mbuf(mb, MBTOM(how)) != 0) {
- m_free(mb);
- return (NULL);
- }
-#endif
- }
- return (mb);
-}
-
-/*
- * Allocate and return a single (normal) pre-zero'd mbuf. NULL is
- * returned on failure.
- *
- * Arguments:
- * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
- * if really starved for memory. M_DONTWAIT to never block.
- * - type: the type of the mbuf being allocated.
- */
-struct mbuf *
-m_get_clrd(int how, short type)
-{
- struct mbuf *mb;
-
- mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL);
- if (mb != NULL) {
- _mb_setup(mb, type);
- bzero(mtod(mb, caddr_t), MLEN);
- }
- return (mb);
-}
-
-/*
- * Allocate and return a single M_PKTHDR pre-zero'd mbuf. NULL is
- * returned on failure.
- *
- * Arguments:
- * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
- * if really starved for memory. M_DONTWAIT to never block.
- * - type: the type of the mbuf being allocated.
- */
-struct mbuf *
-m_gethdr_clrd(int how, short type)
-{
- struct mbuf *mb;
-
- mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL);
- if (mb != NULL) {
- _mbhdr_setup(mb, type);
-#ifdef MAC
- if (mac_init_mbuf(mb, MBTOM(how)) != 0) {
- m_free(mb);
- return (NULL);
- }
-#endif
- bzero(mtod(mb, caddr_t), MHLEN);
- }
- return (mb);
-}
-
-/*
- * Free a single mbuf and any associated storage that it may have attached
- * to it. The associated storage may not be immediately freed if its
- * reference count is above 1. Returns the next mbuf in the chain following
- * the mbuf being freed.
- *
- * Arguments:
- * - mb: the mbuf to free.
- */
-struct mbuf *
-m_free(struct mbuf *mb)
-{
- struct mbuf *nb;
- int cchnum;
- short persist = 0;
-
-#ifdef INVARIANTS
- if (mb->m_flags & M_FREELIST)
- panic("m_free detected a mbuf double-free");
- mb->m_flags |= M_FREELIST;
-#endif
- if ((mb->m_flags & M_PKTHDR) != 0)
- m_tag_delete_chain(mb, NULL);
- nb = mb->m_next;
- if ((mb->m_flags & M_EXT) != 0) {
- MEXT_REM_REF(mb);
- if (atomic_cmpset_int(mb->m_ext.ref_cnt, 0, 1)) {
- if (mb->m_ext.ext_type == EXT_CLUSTER) {
- mb_free(&mb_list_clust,
- (caddr_t)mb->m_ext.ext_buf, MT_NOTMBUF,
- MBP_PERSIST, &cchnum);
- persist = MBP_PERSISTENT;
- } else {
- (*(mb->m_ext.ext_free))(mb->m_ext.ext_buf,
- mb->m_ext.ext_args);
- _mext_dealloc_ref(mb);
- persist = 0;
- }
- }
- }
- mb_free(&mb_list_mbuf, mb, mb->m_type, persist, &cchnum);
- return (nb);
-}
-
-/*
- * Free an entire chain of mbufs and associated external buffers, if
- * applicable. Right now, we only optimize a little so that the cache
- * lock may be held across a single mbuf+cluster free. Hopefully,
- * we'll eventually be holding the lock across more than merely two
- * consecutive frees but right now this is hard to implement because of
- * things like _mext_dealloc_ref (may do a free()) and atomic ops in the
- * loop.
- *
- * - mb: the mbuf chain to free.
- */
-void
-m_freem(struct mbuf *mb)
-{
-
- while (mb != NULL)
- mb = m_free(mb);
-}
-
-/*
- * Fetch an mbuf with a cluster attached to it. If one of the
- * allocations fails, the entire allocation fails. This routine is
- * the preferred way of fetching both the mbuf and cluster together,
- * as it avoids having to unlock/relock between allocations. Returns
- * NULL on failure.
- *
- * Arguments:
- * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
- * if really starved for memory. M_DONTWAIT to never block.
- * - type: the type of the mbuf being allocated.
- * - flags: any flags to pass to the mbuf being allocated; if this includes
- * the M_PKTHDR bit, then the mbuf is configured as a M_PKTHDR mbuf.
- */
-struct mbuf *
-m_getcl(int how, short type, int flags)
-{
- struct mbuf *mb;
- int cchnum;
-
- mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type,
- MBP_PERSIST, &cchnum);
- if (mb == NULL)
- return NULL;
- mb->m_type = type;
- mb->m_next = NULL;
- mb->m_flags = flags;
- if ((flags & M_PKTHDR) != 0) {
- mb->m_nextpkt = NULL;
- mb->m_pkthdr.rcvif = NULL;
- mb->m_pkthdr.csum_flags = 0;
- SLIST_INIT(&mb->m_pkthdr.tags);
- }
-
- mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust, how,
- MT_NOTMBUF, MBP_PERSISTENT, &cchnum);
- if (mb->m_ext.ext_buf == NULL) {
- (void)m_free(mb);
- mb = NULL;
- } else {
- _mcl_setup(mb);
- _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]);
-#ifdef MAC
- if (flags & M_PKTHDR) {
- if (mac_init_mbuf(mb, MBTOM(how)) != 0) {
- m_free(mb);
- return (NULL);
- }
- }
-#endif
- }
- return (mb);
-}
-
-/*
- * Fetch a single mbuf cluster and attach it to an existing mbuf. If
- * successfull, configures the provided mbuf to have mbuf->m_ext.ext_buf
- * pointing to the cluster, and sets the M_EXT bit in the mbuf's flags.
- * The M_EXT bit is not set on failure.
- *
- * Arguments:
- * - mb: the existing mbuf to which to attach the allocated cluster.
- * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
- * if really starved for memory. M_DONTWAIT to never block.
- */
-void
-m_clget(struct mbuf *mb, int how)
-{
-
- mb->m_ext.ext_buf= (caddr_t)mb_alloc(&mb_list_clust,how,MT_NOTMBUF,
- 0, NULL);
- if (mb->m_ext.ext_buf != NULL) {
- _mcl_setup(mb);
- _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]);
- }
-}
-
-/*
- * Configure a provided mbuf to refer to the provided external storage
- * buffer and setup a reference count for said buffer. If the setting
- * up of the reference count fails, the M_EXT bit will not be set. If
- * successfull, the M_EXT bit is set in the mbuf's flags.
- *
- * Arguments:
- * - mb: the existing mbuf to which to attach the provided buffer.
- * - buf: the address of the provided external storage buffer.
- * - size: the size of the provided buffer.
- * - freef: a pointer to a routine that is responsible for freeing the
- * provided external storage buffer.
- * - args: a pointer to an argument structure (of any type) to be passed
- * to the provided freef routine (may be NULL).
- * - flags: any other flags to be passed to the provided mbuf.
- * - type: the type that the external storage buffer should be labeled with.
- */
-void
-m_extadd(struct mbuf *mb, caddr_t buf, u_int size,
- void (*freef)(void *, void *), void *args, int flags, int type)
-{
- u_int *ref_cnt = NULL;
-
- if (type == EXT_CLUSTER)
- ref_cnt = &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)];
- else if (type == EXT_EXTREF)
- ref_cnt = mb->m_ext.ref_cnt;
- _mext_init_ref(mb, ref_cnt);
- if (mb->m_ext.ref_cnt != NULL) {
- mb->m_flags |= (M_EXT | flags);
- mb->m_ext.ext_buf = buf;
- mb->m_data = mb->m_ext.ext_buf;
- mb->m_ext.ext_size = size;
- mb->m_ext.ext_free = freef;
- mb->m_ext.ext_args = args;
- mb->m_ext.ext_type = type;
- }
-}
-
-/*
- * Change type of provided mbuf. This is a relatively expensive operation
- * (due to the cost of statistics manipulations) and should be avoided, where
- * possible.
- *
- * Arguments:
- * - mb: the provided mbuf for which the type needs to be changed.
- * - new_type: the new type to change the mbuf to.
- */
-void
-m_chtype(struct mbuf *mb, short new_type)
-{
- struct mb_gen_list *gen_list;
-
- gen_list = MB_GET_GEN_LIST(&mb_list_mbuf);
- MB_LOCK_CONT(gen_list);
- MB_MBTYPES_DEC(gen_list, mb->m_type, 1);
- MB_MBTYPES_INC(gen_list, new_type, 1);
- MB_UNLOCK_CONT(gen_list);
- mb->m_type = new_type;
-}
diff --git a/sys/kern/uipc_mbuf.c b/sys/kern/uipc_mbuf.c
index 5815fae..e14aba1 100644
--- a/sys/kern/uipc_mbuf.c
+++ b/sys/kern/uipc_mbuf.c
@@ -86,6 +86,161 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
#endif
/*
+ * Malloc-type for external ext_buf ref counts.
+ */
+MALLOC_DEFINE(M_MBUF, "mbextcnt", "mbuf external ref counts");
+
+/*
+ * Allocate a given length worth of mbufs and/or clusters (whatever fits
+ * best) and return a pointer to the top of the allocated chain. If an
+ * existing mbuf chain is provided, then we will append the new chain
+ * to the existing one but still return the top of the newly allocated
+ * chain.
+ */
+struct mbuf *
+m_getm(struct mbuf *m, int len, int how, short type)
+{
+ struct mbuf *mb, *top, *cur, *mtail;
+ int num, rem;
+ int i;
+
+ KASSERT(len >= 0, ("m_getm(): len is < 0"));
+
+ /* If m != NULL, we will append to the end of that chain. */
+ if (m != NULL)
+ for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next);
+ else
+ mtail = NULL;
+
+ /*
+ * Calculate how many mbufs+clusters ("packets") we need and how much
+ * leftover there is after that and allocate the first mbuf+cluster
+ * if required.
+ */
+ num = len / MCLBYTES;
+ rem = len % MCLBYTES;
+ top = cur = NULL;
+ if (num > 0) {
+ if ((top = cur = m_getcl(how, type, 0)) == NULL)
+ goto failed;
+ top->m_len = 0;
+ }
+ num--;
+
+ for (i = 0; i < num; i++) {
+ mb = m_getcl(how, type, 0);
+ if (mb == NULL)
+ goto failed;
+ mb->m_len = 0;
+ cur = (cur->m_next = mb);
+ }
+ if (rem > 0) {
+ mb = (rem > MINCLSIZE) ?
+ m_getcl(how, type, 0) : m_get(how, type);
+ if (mb == NULL)
+ goto failed;
+ mb->m_len = 0;
+ if (cur == NULL)
+ top = mb;
+ else
+ cur->m_next = mb;
+ }
+
+ if (mtail != NULL)
+ mtail->m_next = top;
+ return top;
+failed:
+ if (top != NULL)
+ m_freem(top);
+ return NULL;
+}
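/*
 * Illustrative sketch (not part of the diff): a caller can use the new
 * m_getm() either to grow an existing chain or, by passing a NULL chain,
 * to build a fresh one.  The helper name example_build_chain() is
 * hypothetical.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>

static struct mbuf *
example_build_chain(int len)
{
        struct mbuf *top;

        /* NULL first argument: allocate a brand-new chain covering len bytes. */
        top = m_getm(NULL, len, M_DONTWAIT, MT_DATA);
        if (top == NULL)
                return (NULL);          /* M_DONTWAIT may fail outright */
        /* Every mbuf in the chain starts with m_len == 0; fill as needed. */
        return (top);
}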
+
+/*
+ * Free an entire chain of mbufs and associated external buffers, if
+ * applicable.
+ */
+void
+m_freem(struct mbuf *mb)
+{
+
+ while (mb != NULL)
+ mb = m_free(mb);
+}
+
+/*-
+ * Configure a provided mbuf to refer to the provided external storage
+ * buffer and set up a reference count for said buffer. If the setting
+ * up of the reference count fails, the M_EXT bit will not be set. If
+ * successful, the M_EXT bit is set in the mbuf's flags.
+ *
+ * Arguments:
+ * mb The existing mbuf to which to attach the provided buffer.
+ * buf The address of the provided external storage buffer.
+ * size The size of the provided buffer.
+ * freef A pointer to a routine that is responsible for freeing the
+ * provided external storage buffer.
+ * args A pointer to an argument structure (of any type) to be passed
+ * to the provided freef routine (may be NULL).
+ * flags Any other flags to be passed to the provided mbuf.
+ * type The type that the external storage buffer should be
+ * labeled with.
+ *
+ * Returns:
+ * Nothing.
+ */
+void
+m_extadd(struct mbuf *mb, caddr_t buf, u_int size,
+ void (*freef)(void *, void *), void *args, int flags, int type)
+{
+ u_int *ref_cnt = NULL;
+
+ /* XXX Shouldn't be adding EXT_CLUSTER with this API */
+ if (type == EXT_CLUSTER)
+ ref_cnt = (u_int *)uma_find_refcnt(zone_clust,
+ mb->m_ext.ext_buf);
+ else if (type == EXT_EXTREF)
+ ref_cnt = mb->m_ext.ref_cnt;
+ mb->m_ext.ref_cnt = (ref_cnt == NULL) ?
+ malloc(sizeof(u_int), M_MBUF, M_NOWAIT) : (u_int *)ref_cnt;
+ if (mb->m_ext.ref_cnt != NULL) {
+ *(mb->m_ext.ref_cnt) = 1;
+ mb->m_flags |= (M_EXT | flags);
+ mb->m_ext.ext_buf = buf;
+ mb->m_data = mb->m_ext.ext_buf;
+ mb->m_ext.ext_size = size;
+ mb->m_ext.ext_free = freef;
+ mb->m_ext.ext_args = args;
+ mb->m_ext.ext_type = type;
+ }
+}
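/*
 * Illustrative sketch (not part of the diff): attaching driver-owned
 * external storage with m_extadd() (via the MEXTADD() wrapper).  The names
 * my_buf_free(), my_wrap_buffer() and the softc argument are hypothetical.
 * The M_EXT check catches the case where allocating the reference counter
 * failed.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>

static void
my_buf_free(void *buf, void *args)
{
        /* Hand "buf" back to the driver's private pool; "args" is the softc. */
}

static struct mbuf *
my_wrap_buffer(caddr_t buf, u_int size, void *softc)
{
        struct mbuf *m;

        m = m_gethdr(M_DONTWAIT, MT_DATA);
        if (m == NULL)
                return (NULL);
        MEXTADD(m, buf, size, my_buf_free, softc, 0, EXT_NET_DRV);
        if ((m->m_flags & M_EXT) == 0) {        /* ref count alloc failed */
                m_free(m);
                return (NULL);
        }
        return (m);
}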
+
+/*
+ * Non-directly-exported function to clean up after mbufs with M_EXT
+ * storage attached to them if the reference count hits 0.
+ */
+void
+mb_free_ext(struct mbuf *m)
+{
+
+ MEXT_REM_REF(m);
+ if (atomic_cmpset_int(m->m_ext.ref_cnt, 0, 1)) {
+ if (m->m_ext.ext_type == EXT_PACKET) {
+ uma_zfree(zone_pack, m);
+ return;
+ } else if (m->m_ext.ext_type == EXT_CLUSTER) {
+ uma_zfree(zone_clust, m->m_ext.ext_buf);
+ m->m_ext.ext_buf = NULL;
+ } else {
+ (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
+ m->m_ext.ext_args);
+ if (m->m_ext.ext_type != EXT_EXTREF)
+ free(m->m_ext.ref_cnt, M_MBUF);
+ }
+ }
+ uma_zfree(zone_mbuf, m);
+}
+
+/*
* "Move" mbuf pkthdr from "from" to "to".
* "from" must have M_PKTHDR set, and "to" must be empty.
*/
@@ -364,22 +519,22 @@ m_dup(struct mbuf *m, int how)
struct mbuf *n;
/* Get the next new mbuf */
- MGET(n, how, m->m_type);
+ if (remain >= MINCLSIZE) {
+ n = m_getcl(how, m->m_type, 0);
+ nsize = MCLBYTES;
+ } else {
+ n = m_get(how, m->m_type);
+ nsize = MLEN;
+ }
if (n == NULL)
goto nospace;
- if (top == NULL) { /* first one, must be PKTHDR */
- if (!m_dup_pkthdr(n, m, how))
- goto nospace;
- nsize = MHLEN;
- } else /* not the first one */
- nsize = MLEN;
- if (remain >= MINCLSIZE) {
- MCLGET(n, how);
- if ((n->m_flags & M_EXT) == 0) {
- (void)m_free(n);
+
+ if (top == NULL) { /* First one, must be PKTHDR */
+ if (!m_dup_pkthdr(n, m, how)) {
+ m_free(n);
goto nospace;
}
- nsize = MCLBYTES;
+ nsize = MHLEN;
}
n->m_len = 0;
@@ -651,39 +806,42 @@ m_devget(char *buf, int totlen, int off, struct ifnet *ifp,
void (*copy)(char *from, caddr_t to, u_int len))
{
struct mbuf *m;
- struct mbuf *top = 0, **mp = &top;
+ struct mbuf *top = NULL, **mp = &top;
int len;
if (off < 0 || off > MHLEN)
return (NULL);
- MGETHDR(m, M_DONTWAIT, MT_DATA);
- if (m == NULL)
- return (NULL);
- m->m_pkthdr.rcvif = ifp;
- m->m_pkthdr.len = totlen;
- len = MHLEN;
-
while (totlen > 0) {
- if (top) {
- MGET(m, M_DONTWAIT, MT_DATA);
- if (m == NULL) {
- m_freem(top);
- return (NULL);
- }
- len = MLEN;
- }
- if (totlen + off >= MINCLSIZE) {
- MCLGET(m, M_DONTWAIT);
- if (m->m_flags & M_EXT)
+ if (top == NULL) { /* First one, must be PKTHDR */
+ if (totlen + off >= MINCLSIZE) {
+ m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
len = MCLBYTES;
+ } else {
+ m = m_gethdr(M_DONTWAIT, MT_DATA);
+ len = MHLEN;
+
+ /* Place initial small packet/header at end of mbuf */
+ if (m && totlen + off + max_linkhdr <= MLEN) {
+ m->m_data += max_linkhdr;
+ len -= max_linkhdr;
+ }
+ }
+ if (m == NULL)
+ return NULL;
+ m->m_pkthdr.rcvif = ifp;
+ m->m_pkthdr.len = totlen;
} else {
- /*
- * Place initial small packet/header at end of mbuf.
- */
- if (top == NULL && totlen + off + max_linkhdr <= len) {
- m->m_data += max_linkhdr;
- len -= max_linkhdr;
+ if (totlen + off >= MINCLSIZE) {
+ m = m_getcl(M_DONTWAIT, MT_DATA, 0);
+ len = MCLBYTES;
+ } else {
+ m = m_get(M_DONTWAIT, MT_DATA);
+ len = MLEN;
+ }
+ if (m == NULL) {
+ m_freem(top);
+ return NULL;
}
}
if (off) {
@@ -722,9 +880,10 @@ m_copyback(struct mbuf *m0, int off, int len, c_caddr_t cp)
off -= mlen;
totlen += mlen;
if (m->m_next == NULL) {
- n = m_get_clrd(M_DONTWAIT, m->m_type);
+ n = m_get(M_DONTWAIT, m->m_type);
if (n == NULL)
goto out;
+ bzero(mtod(n, caddr_t), MLEN);
n->m_len = min(MLEN, len + off);
m->m_next = n;
}
diff --git a/sys/kern/uipc_mbuf2.c b/sys/kern/uipc_mbuf2.c
index 0d11aac..ff7944d 100644
--- a/sys/kern/uipc_mbuf2.c
+++ b/sys/kern/uipc_mbuf2.c
@@ -230,14 +230,10 @@ m_pulldown(struct mbuf *m, int off, int len, int *offp)
* now, we need to do the hard way. don't m_copy as there's no room
* on both end.
*/
- MGET(o, M_DONTWAIT, m->m_type);
- if (o && len > MLEN) {
- MCLGET(o, M_DONTWAIT);
- if ((o->m_flags & M_EXT) == 0) {
- m_free(o);
- o = NULL;
- }
- }
+ if (len > MLEN)
+ o = m_getcl(M_DONTWAIT, m->m_type, 0);
+ else
+ o = m_get(M_DONTWAIT, m->m_type);
if (!o) {
m_freem(m);
return NULL; /* ENOBUFS */
@@ -274,29 +270,27 @@ static struct mbuf *
m_dup1(struct mbuf *m, int off, int len, int wait)
{
struct mbuf *n;
- int l;
int copyhdr;
if (len > MCLBYTES)
return NULL;
- if (off == 0 && (m->m_flags & M_PKTHDR) != 0) {
+ if (off == 0 && (m->m_flags & M_PKTHDR) != 0)
copyhdr = 1;
- MGETHDR(n, wait, m->m_type);
- l = MHLEN;
- } else {
+ else
copyhdr = 0;
- MGET(n, wait, m->m_type);
- l = MLEN;
- }
- if (n && len > l) {
- MCLGET(n, wait);
- if ((n->m_flags & M_EXT) == 0) {
- m_free(n);
- n = NULL;
- }
+ if (len >= MINCLSIZE) {
+ if (copyhdr == 1)
+ n = m_getcl(wait, m->m_type, M_PKTHDR);
+ else
+ n = m_getcl(wait, m->m_type, 0);
+ } else {
+ if (copyhdr == 1)
+ n = m_gethdr(wait, m->m_type);
+ else
+ n = m_get(wait, m->m_type);
}
if (!n)
- return NULL;
+ return NULL; /* ENOBUFS */
if (copyhdr && !m_dup_pkthdr(n, m, wait)) {
m_free(n);
diff --git a/sys/kern/uipc_sockbuf.c b/sys/kern/uipc_sockbuf.c
index 3ab8f3a..a404d69 100644
--- a/sys/kern/uipc_sockbuf.c
+++ b/sys/kern/uipc_sockbuf.c
@@ -959,15 +959,12 @@ sbcreatecontrol(p, size, type, level)
if (CMSG_SPACE((u_int)size) > MCLBYTES)
return ((struct mbuf *) NULL);
- if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
+ if (CMSG_SPACE((u_int)size) > MLEN)
+ m = m_getcl(M_DONTWAIT, MT_CONTROL, 0);
+ else
+ m = m_get(M_DONTWAIT, MT_CONTROL);
+ if (m == NULL)
return ((struct mbuf *) NULL);
- if (CMSG_SPACE((u_int)size) > MLEN) {
- MCLGET(m, M_DONTWAIT);
- if ((m->m_flags & M_EXT) == 0) {
- m_free(m);
- return ((struct mbuf *) NULL);
- }
- }
cp = mtod(m, struct cmsghdr *);
m->m_len = 0;
KASSERT(CMSG_SPACE((u_int)size) <= M_TRAILINGSPACE(m),
diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c
index e07f4ef..6735e49 100644
--- a/sys/kern/uipc_socket.c
+++ b/sys/kern/uipc_socket.c
@@ -527,8 +527,8 @@ sosend(so, addr, uio, top, control, flags, td)
{
struct mbuf **mp;
struct mbuf *m;
- long space, len, resid;
- int clen = 0, error, s, dontroute, mlen;
+ long space, len = 0, resid;
+ int clen = 0, error, s, dontroute;
int atomic = sosendallatonce(so) || top;
#ifdef ZERO_COPY_SOCKETS
int cow_send;
@@ -624,25 +624,23 @@ restart:
#ifdef ZERO_COPY_SOCKETS
cow_send = 0;
#endif /* ZERO_COPY_SOCKETS */
- if (top == 0) {
- MGETHDR(m, M_TRYWAIT, MT_DATA);
- if (m == NULL) {
- error = ENOBUFS;
- goto release;
- }
- mlen = MHLEN;
- m->m_pkthdr.len = 0;
- m->m_pkthdr.rcvif = (struct ifnet *)0;
- } else {
- MGET(m, M_TRYWAIT, MT_DATA);
- if (m == NULL) {
- error = ENOBUFS;
- goto release;
- }
- mlen = MLEN;
- }
if (resid >= MINCLSIZE) {
#ifdef ZERO_COPY_SOCKETS
+ if (top == NULL) {
+ MGETHDR(m, M_TRYWAIT, MT_DATA);
+ if (m == NULL) {
+ error = ENOBUFS;
+ goto release;
+ }
+ m->m_pkthdr.len = 0;
+ m->m_pkthdr.rcvif = (struct ifnet *)0;
+ } else {
+ MGET(m, M_TRYWAIT, MT_DATA);
+ if (m == NULL) {
+ error = ENOBUFS;
+ goto release;
+ }
+ }
if (so_zero_copy_send &&
resid>=PAGE_SIZE &&
space>=PAGE_SIZE &&
@@ -654,29 +652,48 @@ restart:
cow_send = socow_setup(m, uio);
}
}
- if (!cow_send){
+ if (!cow_send) {
+ MCLGET(m, M_TRYWAIT);
+ if ((m->m_flags & M_EXT) == 0) {
+ m_free(m);
+ m = NULL;
+ } else {
+ len = min(min(MCLBYTES, resid), space);
+ }
+ } else
+ len = PAGE_SIZE;
+#else /* ZERO_COPY_SOCKETS */
+ if (top == NULL) {
+ m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
+ m->m_pkthdr.len = 0;
+ m->m_pkthdr.rcvif = (struct ifnet *)0;
+ } else
+ m = m_getcl(M_TRYWAIT, MT_DATA, 0);
+ len = min(min(MCLBYTES, resid), space);
#endif /* ZERO_COPY_SOCKETS */
- MCLGET(m, M_TRYWAIT);
- if ((m->m_flags & M_EXT) == 0)
- goto nopages;
- mlen = MCLBYTES;
- len = min(min(mlen, resid), space);
} else {
-#ifdef ZERO_COPY_SOCKETS
- len = PAGE_SIZE;
+ if (top == NULL) {
+ m = m_gethdr(M_TRYWAIT, MT_DATA);
+ m->m_pkthdr.len = 0;
+ m->m_pkthdr.rcvif = (struct ifnet *)0;
+
+ len = min(min(MHLEN, resid), space);
+ /*
+ * For datagram protocols, leave room
+ * for protocol headers in first mbuf.
+ */
+ if (atomic && m && len < MHLEN)
+ MH_ALIGN(m, len);
+ } else {
+ m = m_get(M_TRYWAIT, MT_DATA);
+ len = min(min(MLEN, resid), space);
}
-
- } else {
-#endif /* ZERO_COPY_SOCKETS */
-nopages:
- len = min(min(mlen, resid), space);
- /*
- * For datagram protocols, leave room
- * for protocol headers in first mbuf.
- */
- if (atomic && top == 0 && len < mlen)
- MH_ALIGN(m, len);
}
+ if (m == NULL) {
+ error = ENOBUFS;
+ goto release;
+ }
+
space -= len;
#ifdef ZERO_COPY_SOCKETS
if (cow_send)
diff --git a/sys/kern/uipc_socket2.c b/sys/kern/uipc_socket2.c
index 3ab8f3a..a404d69 100644
--- a/sys/kern/uipc_socket2.c
+++ b/sys/kern/uipc_socket2.c
@@ -959,15 +959,12 @@ sbcreatecontrol(p, size, type, level)
if (CMSG_SPACE((u_int)size) > MCLBYTES)
return ((struct mbuf *) NULL);
- if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
+ if (CMSG_SPACE((u_int)size) > MLEN)
+ m = m_getcl(M_DONTWAIT, MT_CONTROL, 0);
+ else
+ m = m_get(M_DONTWAIT, MT_CONTROL);
+ if (m == NULL)
return ((struct mbuf *) NULL);
- if (CMSG_SPACE((u_int)size) > MLEN) {
- MCLGET(m, M_DONTWAIT);
- if ((m->m_flags & M_EXT) == 0) {
- m_free(m);
- return ((struct mbuf *) NULL);
- }
- }
cp = mtod(m, struct cmsghdr *);
m->m_len = 0;
KASSERT(CMSG_SPACE((u_int)size) <= M_TRAILINGSPACE(m),
diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c
index 1b886f5..978c30e 100644
--- a/sys/kern/uipc_syscalls.c
+++ b/sys/kern/uipc_syscalls.c
@@ -61,6 +61,7 @@ __FBSDID("$FreeBSD$");
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#ifdef KTRACE
@@ -85,6 +86,21 @@ static int getpeername1(struct thread *td, struct getpeername_args *uap,
int compat);
/*
+ * NSFBUFS-related variables and associated sysctls
+ */
+int nsfbufs;
+int nsfbufspeak;
+int nsfbufsused;
+
+SYSCTL_DECL(_kern_ipc);
+SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
+ "Maximum number of sendfile(2) sf_bufs available");
+SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0,
+ "Number of sendfile(2) sf_bufs at peak usage");
+SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0,
+ "Number of sendfile(2) sf_bufs in use");
+
+/*
* System call interface to the socket abstraction.
*/
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
diff --git a/sys/sparc64/sparc64/vm_machdep.c b/sys/sparc64/sparc64/vm_machdep.c
index fe263f1..4a34567 100644
--- a/sys/sparc64/sparc64/vm_machdep.c
+++ b/sys/sparc64/sparc64/vm_machdep.c
@@ -86,6 +86,10 @@
#include <machine/tlb.h>
#include <machine/tstate.h>
+#ifndef NSFBUFS
+#define NSFBUFS (512 + maxusers * 16)
+#endif
+
static void sf_buf_init(void *arg);
SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL)
@@ -351,6 +355,9 @@ sf_buf_init(void *arg)
vm_offset_t sf_base;
int i;
+ nsfbufs = NSFBUFS;
+ TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs);
+
mtx_init(&sf_freelist.sf_lock, "sf_bufs list lock", NULL, MTX_DEF);
SLIST_INIT(&sf_freelist.sf_head);
sf_base = kmem_alloc_nofault(kernel_map, nsfbufs * PAGE_SIZE);
diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h
index d86c57c..2170599 100644
--- a/sys/sys/mbuf.h
+++ b/sys/sys/mbuf.h
@@ -10,7 +10,7 @@
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
- * 4. Neither the name of the University nor the names of its contributors
+ * 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
@@ -33,7 +33,12 @@
#ifndef _SYS_MBUF_H_
#define _SYS_MBUF_H_
+/* XXX: These includes suck. Sorry! */
#include <sys/queue.h>
+#ifdef _KERNEL
+#include <sys/systm.h>
+#include <vm/uma.h>
+#endif
/*
* Mbufs are of a single size, MSIZE (sys/param.h), which
@@ -57,6 +62,16 @@
*/
#define mtod(m, t) ((t)((m)->m_data))
#define dtom(x) ((struct mbuf *)((intptr_t)(x) & ~(MSIZE-1)))
+
+/*
+ * Argument structure passed to UMA routines during mbuf and packet
+ * allocations.
+ */
+struct mb_args {
+ int flags; /* Flags for mbuf being allocated */
+ int how; /* How to allocate: M_WAITOK or M_DONTWAIT */
+ short type; /* Type of mbuf being allocated */
+};
#endif /* _KERNEL */
/*
@@ -167,6 +182,7 @@ struct mbuf {
*/
#define EXT_CLUSTER 1 /* mbuf cluster */
#define EXT_SFBUF 2 /* sendfile(2)'s sf_bufs */
+#define EXT_PACKET 3 /* came out of Packet zone */
#define EXT_NET_DRV 100 /* custom ext_buf provided by net driver(s) */
#define EXT_MOD_TYPE 200 /* custom module's ext_buf type */
#define EXT_DISPOSABLE 300 /* can throw this buffer away w/page flipping */
@@ -223,28 +239,12 @@ struct mbuf {
#define MT_NTYPES 16 /* number of mbuf types for mbtypes[] */
/*
- * Mbuf and cluster allocation statistics PCPU structure.
- */
-struct mbpstat {
- u_long mb_mbfree;
- u_long mb_mbbucks;
- u_long mb_clfree;
- u_long mb_clbucks;
- long mb_mbtypes[MT_NTYPES];
- short mb_active;
-};
-
-/*
* General mbuf allocator statistics structure.
- * XXX: Modifications of these are not protected by any mutex locks nor by
- * any atomic() manipulations. As a result, we may occasionally lose
- * a count or two. Luckily, not all of these fields are modified at all
- * and remain static, and those that are manipulated are only manipulated
- * in failure situations, which do not occur (hopefully) very often.
*/
struct mbstat {
- u_long m_drops; /* times failed to allocate */
- u_long m_wait; /* times succesfully returned from wait */
+ u_long m_mbufs; /* XXX */
+ u_long m_mclusts; /* XXX */
+
u_long m_drain; /* times drained protocols for space */
u_long m_mcfail; /* XXX: times m_copym failed */
u_long m_mpfail; /* XXX: times m_pullup failed */
@@ -253,10 +253,10 @@ struct mbstat {
u_long m_minclsize; /* min length of data to allocate a cluster */
u_long m_mlen; /* length of data in an mbuf */
u_long m_mhlen; /* length of data in a header mbuf */
- u_int m_mbperbuck; /* number of mbufs per "bucket" */
- u_int m_clperbuck; /* number of clusters per "bucket" */
- /* Number of mbtypes (gives # elems in mbpstat's mb_mbtypes[] array: */
+
+ /* Number of mbtypes (gives # elems in mbtypes[] array): */
short m_numtypes;
+
/* XXX: Sendfile stats should eventually move to their own struct */
u_long sf_iocnt; /* times sendfile had to do disk I/O */
u_long sf_allocfail; /* times sfbuf allocation failed */
@@ -265,14 +265,23 @@ struct mbstat {
/*
* Flags specifying how an allocation should be made.
- * M_DONTWAIT means "don't block if nothing is available" whereas
- * M_TRYWAIT means "block for mbuf_wait ticks at most if nothing is
- * available."
+ *
+ * The flag to use is as follows:
+ * - M_DONTWAIT or M_NOWAIT from an interrupt handler to not block allocation.
+ * - M_WAIT or M_WAITOK or M_TRYWAIT from wherever it is safe to block.
+ *
+ * M_DONTWAIT/M_NOWAIT means that we will not block the thread explicitly
+ * and if we cannot allocate immediately we may return NULL,
+ * whereas M_WAIT/M_WAITOK/M_TRYWAIT means that if we cannot allocate
+ * resources we will block until they are available, and thus never
+ * return NULL.
+ *
+ * XXX Eventually just phase this out to use M_WAITOK/M_NOWAIT.
*/
-#define M_DONTWAIT 0x4 /* don't conflict with M_NOWAIT */
-#define M_TRYWAIT 0x8 /* or M_WAITOK */
-#define M_WAIT M_TRYWAIT /* XXX: deprecated */
-#define MBTOM(how) ((how) & M_TRYWAIT ? M_WAITOK : M_NOWAIT)
+#define MBTOM(how) (how)
+#define M_DONTWAIT M_NOWAIT
+#define M_TRYWAIT M_WAITOK
+#define M_WAIT M_WAITOK
#ifdef _KERNEL
/*-
@@ -296,12 +305,114 @@ struct mbstat {
#define MEXT_ADD_REF(m) atomic_add_int((m)->m_ext.ref_cnt, 1)
/*
+ * Network buffer allocation API
+ *
+ * The rest of it is defined in kern/kern_mbuf.c
+ */
+
+extern uma_zone_t zone_mbuf;
+extern uma_zone_t zone_clust;
+extern uma_zone_t zone_pack;
+
+static __inline struct mbuf *m_get(int how, short type);
+static __inline struct mbuf *m_gethdr(int how, short type);
+static __inline struct mbuf *m_getcl(int how, short type, int flags);
+static __inline struct mbuf *m_getclr(int how, short type); /* XXX */
+static __inline struct mbuf *m_free(struct mbuf *m);
+static __inline void m_clget(struct mbuf *m, int how);
+static __inline void m_chtype(struct mbuf *m, short new_type);
+void mb_free_ext(struct mbuf *);
+
+static __inline
+struct mbuf *
+m_get(int how, short type)
+{
+ struct mb_args args;
+
+ args.flags = 0;
+ args.how = how;
+ args.type = type;
+ return (uma_zalloc_arg(zone_mbuf, &args, how));
+}
+
+/* XXX This should be deprecated, very little use */
+static __inline
+struct mbuf *
+m_getclr(int how, short type)
+{
+ struct mbuf *m;
+ struct mb_args args;
+
+ args.flags = 0;
+ args.how = how;
+ args.type = type;
+ m = uma_zalloc_arg(zone_mbuf, &args, how);
+ if (m != NULL)
+ bzero(m->m_data, MLEN);
+ return m;
+}
+
+static __inline
+struct mbuf *
+m_gethdr(int how, short type)
+{
+ struct mb_args args;
+
+ args.flags = M_PKTHDR;
+ args.how = how;
+ args.type = type;
+ return (uma_zalloc_arg(zone_mbuf, &args, how));
+}
+
+static __inline
+struct mbuf *
+m_getcl(int how, short type, int flags)
+{
+ struct mb_args args;
+
+ args.flags = flags;
+ args.how = how;
+ args.type = type;
+ return (uma_zalloc_arg(zone_pack, &args, how));
+}
+
+static __inline
+struct mbuf *
+m_free(struct mbuf *m)
+{
+ struct mbuf *n = m->m_next;
+
+#ifdef INVARIANTS
+ m->m_flags |= M_FREELIST;
+#endif
+ if (m->m_flags & M_EXT)
+ mb_free_ext(m);
+ else
+ uma_zfree(zone_mbuf, m);
+ return n;
+}
+
+static __inline
+void
+m_clget(struct mbuf *m, int how)
+{
+ m->m_ext.ext_buf = NULL;
+ uma_zalloc_arg(zone_clust, m, how);
+}
+
+static __inline
+void
+m_chtype(struct mbuf *m, short new_type)
+{
+ m->m_type = new_type;
+}
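/*
 * Illustrative sketch (not part of the diff): typical use of the new inline
 * allocators.  m_getcl() returns an mbuf with a cluster already attached
 * (from zone_pack), while m_clget() attaches a bare cluster and, as before,
 * the caller must test M_EXT.  The example_* names are hypothetical.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>

static struct mbuf *
example_alloc_pkt(int len)
{
        struct mbuf *m;

        if (len > MHLEN)
                /* mbuf + cluster in one allocation, from the packet zone. */
                m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
        else
                m = m_gethdr(M_DONTWAIT, MT_DATA);
        return (m);                     /* may be NULL with M_DONTWAIT */
}

static int
example_add_cluster(struct mbuf *m)
{

        /* Attach a cluster to an existing mbuf; M_EXT signals success. */
        m_clget(m, M_DONTWAIT);
        return ((m->m_flags & M_EXT) != 0);
}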
+
+/*
* mbuf, cluster, and external object allocation macros
* (for compatibility purposes).
*/
/* NB: M_COPY_PKTHDR is deprecated. Use M_MOVE_PKTHDR or m_dup_pkthdr. */
#define M_MOVE_PKTHDR(to, from) m_move_pkthdr((to), (from))
-#define m_getclr(how, type) m_get_clrd((how), (type))
#define MGET(m, how, type) ((m) = m_get((how), (type)))
#define MGETHDR(m, how, type) ((m) = m_gethdr((how), (type)))
#define MCLGET(m, how) m_clget((m), (how))
@@ -309,23 +420,6 @@ struct mbstat {
m_extadd((m), (caddr_t)(buf), (size), (free), (args), (flags), (type))
/*
- * MEXTFREE(m): disassociate (and possibly free) an external object from (m).
- *
- * If the atomic_cmpset_int() returns 0, then we effectively do nothing
- * in terms of "cleaning up" (freeing the ext buf and ref. counter) as
- * this means that either there are still references, or another thread
- * is taking care of the clean-up.
- */
-#define MEXTFREE(m) do { \
- struct mbuf *_mb = (m); \
- \
- MEXT_REM_REF(_mb); \
- if (atomic_cmpset_int(_mb->m_ext.ref_cnt, 0, 1)) \
- _mext_free(_mb); \
- _mb->m_flags &= ~M_EXT; \
-} while (0)
-
-/*
* Evaluate TRUE if it's safe to write to the mbuf m's data region (this
* can be both the local data payload, or an external buffer area,
* depending on whether M_EXT is set).
@@ -425,18 +519,13 @@ extern int max_linkhdr; /* Largest link-level header */
extern int max_protohdr; /* Largest protocol header */
extern struct mbstat mbstat; /* General mbuf stats/infos */
extern int nmbclusters; /* Maximum number of clusters */
-extern int nmbcnt; /* Scale kmem_map for counter space */
-extern int nmbufs; /* Maximum number of mbufs */
struct uio;
-void _mext_free(struct mbuf *);
void m_adj(struct mbuf *, int);
int m_apply(struct mbuf *, int, int,
int (*)(void *, void *, u_int), void *);
void m_cat(struct mbuf *, struct mbuf *);
-void m_chtype(struct mbuf *, short);
-void m_clget(struct mbuf *, int);
void m_extadd(struct mbuf *, caddr_t, u_int,
void (*)(void *, void *), void *, int, int);
void m_copyback(struct mbuf *, int, int, c_caddr_t);
@@ -451,13 +540,7 @@ struct mbuf *m_dup(struct mbuf *, int);
int m_dup_pkthdr(struct mbuf *, struct mbuf *, int);
u_int m_fixhdr(struct mbuf *);
struct mbuf *m_fragment(struct mbuf *, int, int);
-struct mbuf *m_free(struct mbuf *);
void m_freem(struct mbuf *);
-struct mbuf *m_get(int, short);
-struct mbuf *m_get_clrd(int, short);
-struct mbuf *m_getcl(int, short, int);
-struct mbuf *m_gethdr(int, short);
-struct mbuf *m_gethdr_clrd(int, short);
struct mbuf *m_getm(struct mbuf *, int, int, short);
struct mbuf *m_getptr(struct mbuf *, int, int *);
u_int m_length(struct mbuf *, struct mbuf **);
@@ -470,7 +553,7 @@ struct mbuf *m_split(struct mbuf *, int, int);
struct mbuf *m_uiotombuf(struct uio *, int, int);
/*-
- * Packets may have annotations attached by affixing a list
+ * Network packets may have annotations attached by affixing a list
* of "packet tags" to the pkthdr structure. Packet tags are
* dynamically allocated semi-opaque data structures that have
* a fixed header (struct m_tag) that specifies the size of the
diff --git a/sys/vm/uma.h b/sys/vm/uma.h
index 4de1efa..0d34ca3 100644
--- a/sys/vm/uma.h
+++ b/sys/vm/uma.h
@@ -43,7 +43,7 @@
/* Types and type defs */
-struct uma_zone;
+struct uma_zone;
/* Opaque type used as a handle to the zone */
typedef struct uma_zone * uma_zone_t;
@@ -157,12 +157,46 @@ typedef void (*uma_fini)(void *mem, int size);
* A pointer to a structure which is intended to be opaque to users of
* the interface. The value may be null if the wait flag is not set.
*/
-
uma_zone_t uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
uma_init uminit, uma_fini fini, int align,
u_int16_t flags);
/*
+ * Create a secondary uma zone
+ *
+ * Arguments:
+ * name The text name of the zone for debugging and stats, this memory
+ * should not be freed until the zone has been deallocated.
+ * ctor The constructor that is called when the object is allocated
+ * dtor The destructor that is called when the object is freed.
+ * zinit An initializer that sets up the initial state of the memory
+ * as the object passes from the Keg's slab to the Zone's cache.
+ * zfini A discard function that undoes initialization done by init
+ * as the object passes from the Zone's cache to the Keg's slab.
+ *
+ * ctor/dtor/zinit/zfini may all be null, see notes above.
+ * Note that the zinit and zfini specified here are NOT
+ * exactly the same as the init/fini specified to uma_zcreate()
+ * when creating a master zone. These zinit/zfini are called
+ * on the TRANSITION from keg to zone (and vice-versa). Once
+ * these are set, the primary zone may alter its init/fini
+ * (which are called when the object passes from VM to keg)
+ * using uma_zone_set_init/fini()) as well as its own
+ * zinit/zfini (unset by default for master zone) with
+ * uma_zone_set_zinit/zfini() (note subtle 'z' prefix).
+ *
+ * master The master zone that supplies the backing keg; the new
+ * secondary zone shares that keg (and hence its item size,
+ * alignment and flags) with the master.
+ *
+ * Returns:
+ * A pointer to a structure which is intended to be opaque to users of
+ * the interface. The value may be null if the wait flag is not set.
+ */
+uma_zone_t uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor,
+ uma_init zinit, uma_fini zfini, uma_zone_t master);
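/*
 * Illustrative sketch (not part of the diff): the master/secondary pattern
 * used by the new mbuf allocator.  A master zone created with uma_zcreate()
 * supplies the backing keg, and uma_zsecond_create() layers a second zone
 * (with its own caches and optional zinit/zfini) on top of that same keg.
 * The names and the item size below are hypothetical.
 */
#include <sys/param.h>
#include <sys/malloc.h>
#include <vm/uma.h>

static uma_zone_t example_master;
static uma_zone_t example_secondary;

static void
example_zones_setup(void)
{

        example_master = uma_zcreate("example items", 256,
            NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
        example_secondary = uma_zsecond_create("example cached items",
            NULL, NULL, NULL, NULL, example_master);
}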
+
+/*
* Definitions for uma_zcreate flags
*
* These flags share space with UMA_ZFLAGs in uma_int.h. Be careful not to
@@ -185,6 +219,9 @@ uma_zone_t uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
* Use a hash table instead of caching
* information in the vm_page.
*/
+#define UMA_ZONE_SECONDARY 0x0200 /* Zone is a Secondary Zone */
+#define UMA_ZONE_REFCNT 0x0400 /* Allocate refcnts in slabs */
+#define UMA_ZONE_MAXBUCKET 0x0800 /* Use largest buckets */
/* Definitions for align */
#define UMA_ALIGN_PTR (sizeof(void *) - 1) /* Alignment fit for ptr */
@@ -201,7 +238,6 @@ uma_zone_t uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
* zone The zone we want to destroy.
*
*/
-
void uma_zdestroy(uma_zone_t zone);
/*
@@ -376,6 +412,28 @@ int uma_zone_set_obj(uma_zone_t zone, struct vm_object *obj, int size);
void uma_zone_set_max(uma_zone_t zone, int nitems);
/*
+ * The following two routines (uma_zone_set_init/fini)
+ * are used to set the backend init/fini pair which acts on an
+ * object as it becomes allocated and is placed in a slab within
+ * the specified zone's backing keg. They should not be changed once
+ * allocations have begun; if they are set at all, set them
+ * immediately after zone creation.
+ */
+void uma_zone_set_init(uma_zone_t zone, uma_init uminit);
+void uma_zone_set_fini(uma_zone_t zone, uma_fini fini);
+
+/*
+ * The following two routines (uma_zone_set_zinit/zfini) are
+ * used to set the zinit/zfini pair which acts on an object as
+ * it passes from the backing Keg's slab cache to the
+ * specified Zone's bucket cache. They should not be changed once
+ * allocations have begun; if they are set at all, set them
+ * immediately after zone creation.
+ */
+void uma_zone_set_zinit(uma_zone_t zone, uma_init zinit);
+void uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini);
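/*
 * Illustrative sketch (not part of the diff): wiring up both pairs right
 * after zone creation.  example_init/example_fini run as items move between
 * the VM and the keg's slabs; example_zinit/example_zfini run as items move
 * between the keg and the zone's bucket cache.  All example_* names are
 * hypothetical.
 */
#include <sys/param.h>
#include <vm/uma.h>

static void example_init(void *mem, int size)  { /* VM page -> keg slab */ }
static void example_fini(void *mem, int size)  { /* keg slab -> VM page */ }
static void example_zinit(void *mem, int size) { /* keg slab -> zone cache */ }
static void example_zfini(void *mem, int size) { /* zone cache -> keg slab */ }

static void
example_zone_tune(uma_zone_t zone)
{

        uma_zone_set_init(zone, example_init);
        uma_zone_set_fini(zone, example_fini);
        uma_zone_set_zinit(zone, example_zinit);
        uma_zone_set_zfini(zone, example_zfini);
}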
+
+/*
* Replaces the standard page_alloc or obj_alloc functions for this zone
*
* Arguments:
@@ -430,5 +488,19 @@ void uma_zone_set_freef(uma_zone_t zone, uma_free freef);
*/
void uma_prealloc(uma_zone_t zone, int itemcnt);
+/*
+ * Used to lookup the reference counter allocated for an item
+ * from a UMA_ZONE_REFCNT zone. For UMA_ZONE_REFCNT zones,
+ * reference counters are allocated for items and stored in
+ * the underlying slab header.
+ *
+ * Arguments:
+ * zone The UMA_ZONE_REFCNT zone to which the item belongs.
+ * item The address of the item for which we want a refcnt.
+ *
+ * Returns:
+ * A pointer to a u_int32_t reference counter.
+ */
+u_int32_t *uma_find_refcnt(uma_zone_t zone, void *item);
#endif
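/*
 * Illustrative sketch (not part of the diff): with UMA_ZONE_REFCNT every
 * item gets a reference counter stored in its slab header, and
 * uma_find_refcnt() returns a pointer to it (the new m_extadd() uses this
 * to locate a cluster's counter in zone_clust).  The zone name, item size
 * and refbuf_* names are hypothetical.
 */
#include <sys/param.h>
#include <sys/malloc.h>
#include <vm/uma.h>

static uma_zone_t refbuf_zone;

static void *
refbuf_alloc(void)
{
        u_int32_t *cnt;
        void *item;

        if (refbuf_zone == NULL)
                refbuf_zone = uma_zcreate("refbufs", 2048, NULL, NULL,
                    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
        item = uma_zalloc(refbuf_zone, M_WAITOK);
        cnt = uma_find_refcnt(refbuf_zone, item);
        *cnt = 1;               /* the caller starts with the only reference */
        return (item);
}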
diff --git a/sys/vm/uma_core.c b/sys/vm/uma_core.c
index f693540..82d60c6 100644
--- a/sys/vm/uma_core.c
+++ b/sys/vm/uma_core.c
@@ -84,15 +84,19 @@ __FBSDID("$FreeBSD$");
#include <machine/vmparam.h>
/*
- * This is the zone from which all zones are spawned. The idea is that even
- * the zone heads are allocated from the allocator, so we use the bss section
- * to bootstrap us.
+ * This is the zone and keg from which all zones are spawned. The idea is that
+ * even the zone & keg heads are allocated from the allocator, so we use the
+ * bss section to bootstrap us.
*/
-static struct uma_zone masterzone;
-static uma_zone_t zones = &masterzone;
+static struct uma_keg masterkeg;
+static struct uma_zone masterzone_k;
+static struct uma_zone masterzone_z;
+static uma_zone_t kegs = &masterzone_k;
+static uma_zone_t zones = &masterzone_z;
/* This is the zone from which all of uma_slab_t's are allocated. */
static uma_zone_t slabzone;
+static uma_zone_t slabrefzone; /* With refcounters (for UMA_ZONE_REFCNT) */
/*
* The initial hash tables come out of this zone so they can be allocated
@@ -107,10 +111,10 @@ static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
*/
static int bucketdisable = 1;
-/* Linked list of all zones in the system */
-static LIST_HEAD(,uma_zone) uma_zones = LIST_HEAD_INITIALIZER(&uma_zones);
+/* Linked list of all kegs in the system */
+static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(&uma_kegs);
-/* This mutex protects the zone list */
+/* This mutex protects the keg list */
static struct mtx uma_mtx;
/* These are the pcpu cache locks */
@@ -144,6 +148,16 @@ struct uma_zctor_args {
uma_dtor dtor;
uma_init uminit;
uma_fini fini;
+ uma_keg_t keg;
+ int align;
+ u_int16_t flags;
+};
+
+struct uma_kctor_args {
+ uma_zone_t zone;
+ size_t size;
+ uma_init uminit;
+ uma_fini fini;
int align;
u_int16_t flags;
};
@@ -179,6 +193,8 @@ static uma_slab_t slab_zalloc(uma_zone_t, int);
static void cache_drain(uma_zone_t);
static void bucket_drain(uma_zone_t, uma_bucket_t);
static void bucket_cache_drain(uma_zone_t zone);
+static void keg_ctor(void *, int, void *);
+static void keg_dtor(void *, int, void *);
static void zone_ctor(void *, int, void *);
static void zone_dtor(void *, int, void *);
static void zero_init(void *, int);
@@ -202,6 +218,8 @@ static int uma_zalloc_bucket(uma_zone_t zone, int flags);
static uma_slab_t uma_zone_slab(uma_zone_t zone, int flags);
static void *uma_slab_alloc(uma_zone_t zone, uma_slab_t slab);
static void zone_drain(uma_zone_t);
+static void uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
+ uma_fini fini, int align, u_int16_t flags);
void uma_print_zone(uma_zone_t);
void uma_print_stats(void);
@@ -328,10 +346,12 @@ uma_timeout(void *unused)
static void
zone_timeout(uma_zone_t zone)
{
+ uma_keg_t keg;
uma_cache_t cache;
u_int64_t alloc;
int cpu;
+ keg = zone->uz_keg;
alloc = 0;
/*
@@ -344,7 +364,7 @@ zone_timeout(uma_zone_t zone)
* to lock and do it here instead so that the statistics don't get too
* far out of sync.
*/
- if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL)) {
+ if (!(keg->uk_flags & UMA_ZFLAG_INTERNAL)) {
for (cpu = 0; cpu <= mp_maxid; cpu++) {
if (CPU_ABSENT(cpu))
continue;
@@ -369,8 +389,8 @@ zone_timeout(uma_zone_t zone)
* may be a little aggressive. Should I allow for two collisions max?
*/
- if (zone->uz_flags & UMA_ZONE_HASH &&
- zone->uz_pages / zone->uz_ppera >= zone->uz_hash.uh_hashsize) {
+ if (keg->uk_flags & UMA_ZONE_HASH &&
+ keg->uk_pages / keg->uk_ppera >= keg->uk_hash.uh_hashsize) {
struct uma_hash newhash;
struct uma_hash oldhash;
int ret;
@@ -381,14 +401,14 @@ zone_timeout(uma_zone_t zone)
* I have to do everything in stages and check for
* races.
*/
- newhash = zone->uz_hash;
+ newhash = keg->uk_hash;
ZONE_UNLOCK(zone);
ret = hash_alloc(&newhash);
ZONE_LOCK(zone);
if (ret) {
- if (hash_expand(&zone->uz_hash, &newhash)) {
- oldhash = zone->uz_hash;
- zone->uz_hash = newhash;
+ if (hash_expand(&keg->uk_hash, &newhash)) {
+ oldhash = keg->uk_hash;
+ keg->uk_hash = newhash;
} else
oldhash = newhash;
@@ -530,7 +550,7 @@ bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
mzone = 0;
/* We have to lookup the slab again for malloc.. */
- if (zone->uz_flags & UMA_ZONE_MALLOC)
+ if (zone->uz_keg->uk_flags & UMA_ZONE_MALLOC)
mzone = 1;
while (bucket->ub_cnt > 0) {
@@ -636,29 +656,32 @@ static void
zone_drain(uma_zone_t zone)
{
struct slabhead freeslabs = {};
+ uma_keg_t keg;
uma_slab_t slab;
uma_slab_t n;
u_int8_t flags;
u_int8_t *mem;
int i;
+ keg = zone->uz_keg;
+
/*
- * We don't want to take pages from staticly allocated zones at this
+ * We don't want to take pages from statically allocated zones at this
* time
*/
- if (zone->uz_flags & UMA_ZONE_NOFREE || zone->uz_freef == NULL)
+ if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL)
return;
ZONE_LOCK(zone);
#ifdef UMA_DEBUG
- printf("%s free items: %u\n", zone->uz_name, zone->uz_free);
+ printf("%s free items: %u\n", zone->uz_name, keg->uk_free);
#endif
bucket_cache_drain(zone);
- if (zone->uz_free == 0)
+ if (keg->uk_free == 0)
goto finished;
- slab = LIST_FIRST(&zone->uz_free_slab);
+ slab = LIST_FIRST(&keg->uk_free_slab);
while (slab) {
n = LIST_NEXT(slab, us_link);
@@ -669,11 +692,11 @@ zone_drain(uma_zone_t zone)
}
LIST_REMOVE(slab, us_link);
- zone->uz_pages -= zone->uz_ppera;
- zone->uz_free -= zone->uz_ipers;
+ keg->uk_pages -= keg->uk_ppera;
+ keg->uk_free -= keg->uk_ipers;
- if (zone->uz_flags & UMA_ZONE_HASH)
- UMA_HASH_REMOVE(&zone->uz_hash, slab, slab->us_data);
+ if (keg->uk_flags & UMA_ZONE_HASH)
+ UMA_HASH_REMOVE(&keg->uk_hash, slab, slab->us_data);
SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink);
@@ -684,34 +707,34 @@ finished:
while ((slab = SLIST_FIRST(&freeslabs)) != NULL) {
SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink);
- if (zone->uz_fini)
- for (i = 0; i < zone->uz_ipers; i++)
- zone->uz_fini(
- slab->us_data + (zone->uz_rsize * i),
- zone->uz_size);
+ if (keg->uk_fini)
+ for (i = 0; i < keg->uk_ipers; i++)
+ keg->uk_fini(
+ slab->us_data + (keg->uk_rsize * i),
+ keg->uk_size);
flags = slab->us_flags;
mem = slab->us_data;
- if (zone->uz_flags & UMA_ZONE_OFFPAGE)
- uma_zfree_internal(slabzone, slab, NULL, 0);
- if (zone->uz_flags & UMA_ZONE_MALLOC) {
+ if ((keg->uk_flags & UMA_ZONE_MALLOC) ||
+ (keg->uk_flags & UMA_ZONE_REFCNT)) {
vm_object_t obj;
if (flags & UMA_SLAB_KMEM)
obj = kmem_object;
else
obj = NULL;
- for (i = 0; i < zone->uz_ppera; i++)
+ for (i = 0; i < keg->uk_ppera; i++)
vsetobj((vm_offset_t)mem + (i * PAGE_SIZE),
obj);
}
+ if (keg->uk_flags & UMA_ZONE_OFFPAGE)
+ uma_zfree_internal(keg->uk_slabzone, slab, NULL, 0);
#ifdef UMA_DEBUG
printf("%s: Returning %d bytes.\n",
- zone->uz_name, UMA_SLAB_SIZE * zone->uz_ppera);
+ zone->uz_name, UMA_SLAB_SIZE * keg->uk_ppera);
#endif
- zone->uz_freef(mem, UMA_SLAB_SIZE * zone->uz_ppera, flags);
+ keg->uk_freef(mem, UMA_SLAB_SIZE * keg->uk_ppera, flags);
}
-
}
/*
@@ -728,20 +751,23 @@ finished:
static uma_slab_t
slab_zalloc(uma_zone_t zone, int wait)
{
- uma_slab_t slab; /* Starting slab */
+ uma_slabrefcnt_t slabref;
+ uma_slab_t slab;
+ uma_keg_t keg;
u_int8_t *mem;
u_int8_t flags;
int i;
slab = NULL;
+ keg = zone->uz_keg;
#ifdef UMA_DEBUG
printf("slab_zalloc: Allocating a new slab for %s\n", zone->uz_name);
#endif
ZONE_UNLOCK(zone);
- if (zone->uz_flags & UMA_ZONE_OFFPAGE) {
- slab = uma_zalloc_internal(slabzone, NULL, wait);
+ if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
+ slab = uma_zalloc_internal(keg->uk_slabzone, NULL, wait);
if (slab == NULL) {
ZONE_LOCK(zone);
return NULL;
@@ -755,12 +781,12 @@ slab_zalloc(uma_zone_t zone, int wait)
* Malloced items are zeroed in uma_zalloc.
*/
- if ((zone->uz_flags & UMA_ZONE_MALLOC) == 0)
+ if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
wait |= M_ZERO;
else
wait &= ~M_ZERO;
- mem = zone->uz_allocf(zone, zone->uz_ppera * UMA_SLAB_SIZE,
+ mem = keg->uk_allocf(zone, keg->uk_ppera * UMA_SLAB_SIZE,
&flags, wait);
if (mem == NULL) {
ZONE_LOCK(zone);
@@ -768,32 +794,39 @@ slab_zalloc(uma_zone_t zone, int wait)
}
/* Point the slab into the allocated memory */
- if (!(zone->uz_flags & UMA_ZONE_OFFPAGE))
- slab = (uma_slab_t )(mem + zone->uz_pgoff);
+ if (!(keg->uk_flags & UMA_ZONE_OFFPAGE))
+ slab = (uma_slab_t )(mem + keg->uk_pgoff);
- if (zone->uz_flags & UMA_ZONE_MALLOC)
- for (i = 0; i < zone->uz_ppera; i++)
+ if ((keg->uk_flags & UMA_ZONE_MALLOC) ||
+ (keg->uk_flags & UMA_ZONE_REFCNT))
+ for (i = 0; i < keg->uk_ppera; i++)
vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab);
- slab->us_zone = zone;
+ slab->us_keg = keg;
slab->us_data = mem;
- slab->us_freecount = zone->uz_ipers;
+ slab->us_freecount = keg->uk_ipers;
slab->us_firstfree = 0;
slab->us_flags = flags;
- for (i = 0; i < zone->uz_ipers; i++)
- slab->us_freelist[i] = i+1;
+ for (i = 0; i < keg->uk_ipers; i++)
+ slab->us_freelist[i].us_item = i+1;
- if (zone->uz_init)
- for (i = 0; i < zone->uz_ipers; i++)
- zone->uz_init(slab->us_data + (zone->uz_rsize * i),
- zone->uz_size);
+ if (keg->uk_flags & UMA_ZONE_REFCNT) {
+ slabref = (uma_slabrefcnt_t)slab;
+ for (i = 0; i < keg->uk_ipers; i++)
+ slabref->us_freelist[i].us_refcnt = 0;
+ }
+
+ if (keg->uk_init)
+ for (i = 0; i < keg->uk_ipers; i++)
+ keg->uk_init(slab->us_data + (keg->uk_rsize * i),
+ keg->uk_size);
ZONE_LOCK(zone);
- if (zone->uz_flags & UMA_ZONE_HASH)
- UMA_HASH_INSERT(&zone->uz_hash, slab, mem);
+ if (keg->uk_flags & UMA_ZONE_HASH)
+ UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
- zone->uz_pages += zone->uz_ppera;
- zone->uz_free += zone->uz_ipers;
+ keg->uk_pages += keg->uk_ppera;
+ keg->uk_free += keg->uk_ipers;
return (slab);
}
@@ -806,6 +839,10 @@ slab_zalloc(uma_zone_t zone, int wait)
static void *
startup_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait)
{
+ uma_keg_t keg;
+
+ keg = zone->uz_keg;
+
/*
* Check our small startup cache to see if it has pages remaining.
*/
@@ -827,11 +864,11 @@ startup_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait)
* Now that we've booted reset these users to their real allocator.
*/
#ifdef UMA_MD_SMALL_ALLOC
- zone->uz_allocf = uma_small_alloc;
+ keg->uk_allocf = uma_small_alloc;
#else
- zone->uz_allocf = page_alloc;
+ keg->uk_allocf = page_alloc;
#endif
- return zone->uz_allocf(zone, bytes, pflag, wait);
+ return keg->uk_allocf(zone, bytes, pflag, wait);
}
/*
@@ -877,7 +914,7 @@ obj_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
vm_page_t p;
int pages, startpages;
- object = zone->uz_obj;
+ object = zone->uz_keg->uk_obj;
retkva = 0;
/*
@@ -887,7 +924,7 @@ obj_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
p = TAILQ_LAST(&object->memq, pglist);
pages = p != NULL ? p->pindex + 1 : 0;
startpages = pages;
- zkva = zone->uz_kva + pages * PAGE_SIZE;
+ zkva = zone->uz_keg->uk_kva + pages * PAGE_SIZE;
for (; bytes > 0; bytes -= PAGE_SIZE) {
p = vm_page_alloc(object, pages,
VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED);
@@ -965,29 +1002,33 @@ zero_init(void *mem, int size)
static void
zone_small_init(uma_zone_t zone)
{
+ uma_keg_t keg;
int rsize;
int memused;
int ipers;
- rsize = zone->uz_size;
+ keg = zone->uz_keg;
+ KASSERT(keg != NULL, ("Keg is null in zone_small_init"));
+ rsize = keg->uk_size;
if (rsize < UMA_SMALLEST_UNIT)
rsize = UMA_SMALLEST_UNIT;
- if (rsize & zone->uz_align)
- rsize = (rsize & ~zone->uz_align) + (zone->uz_align + 1);
+ if (rsize & keg->uk_align)
+ rsize = (rsize & ~keg->uk_align) + (keg->uk_align + 1);
- zone->uz_rsize = rsize;
+ keg->uk_rsize = rsize;
rsize += 1; /* Account for the byte of linkage */
- zone->uz_ipers = (UMA_SLAB_SIZE - sizeof(struct uma_slab)) / rsize;
- zone->uz_ppera = 1;
+ keg->uk_ipers = (UMA_SLAB_SIZE - sizeof(struct uma_slab)) / rsize;
+ keg->uk_ppera = 1;
- KASSERT(zone->uz_ipers != 0, ("zone_small_init: ipers is 0, uh-oh!"));
- memused = zone->uz_ipers * zone->uz_rsize;
+ KASSERT(keg->uk_ipers != 0, ("zone_small_init: ipers is 0, uh-oh!"));
+ memused = keg->uk_ipers * keg->uk_rsize;
/* Can we do any better? */
- if ((UMA_SLAB_SIZE - memused) >= UMA_MAX_WASTE) {
+ if ((keg->uk_flags & UMA_ZONE_REFCNT) ||
+ ((UMA_SLAB_SIZE - memused) >= UMA_MAX_WASTE)) {
/*
* We can't do this if we're internal or if we've been
* asked to not go to the VM for buckets. If we do this we
@@ -995,15 +1036,16 @@ zone_small_init(uma_zone_t zone)
* do not want to do if we're UMA_ZFLAG_CACHEONLY as a
* result of UMA_ZONE_VM, which clearly forbids it.
*/
- if ((zone->uz_flags & UMA_ZFLAG_INTERNAL) ||
- (zone->uz_flags & UMA_ZFLAG_CACHEONLY))
+ if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) ||
+ (keg->uk_flags & UMA_ZFLAG_CACHEONLY))
return;
- ipers = UMA_SLAB_SIZE / zone->uz_rsize;
- if (ipers > zone->uz_ipers) {
- zone->uz_flags |= UMA_ZONE_OFFPAGE;
- if ((zone->uz_flags & UMA_ZONE_MALLOC) == 0)
- zone->uz_flags |= UMA_ZONE_HASH;
- zone->uz_ipers = ipers;
+ ipers = UMA_SLAB_SIZE / keg->uk_rsize;
+ if ((keg->uk_flags & UMA_ZONE_REFCNT) ||
+ (ipers > keg->uk_ipers)) {
+ keg->uk_flags |= UMA_ZONE_OFFPAGE;
+ if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
+ keg->uk_flags |= UMA_ZONE_HASH;
+ keg->uk_ipers = ipers;
}
}
}
@@ -1022,179 +1064,298 @@ zone_small_init(uma_zone_t zone)
static void
zone_large_init(uma_zone_t zone)
{
+ uma_keg_t keg;
int pages;
- KASSERT((zone->uz_flags & UMA_ZFLAG_CACHEONLY) == 0,
+ keg = zone->uz_keg;
+
+ KASSERT(keg != NULL, ("Keg is null in zone_large_init"));
+ KASSERT((keg->uk_flags & UMA_ZFLAG_CACHEONLY) == 0,
("zone_large_init: Cannot large-init a UMA_ZFLAG_CACHEONLY zone"));
- pages = zone->uz_size / UMA_SLAB_SIZE;
+ pages = keg->uk_size / UMA_SLAB_SIZE;
/* Account for remainder */
- if ((pages * UMA_SLAB_SIZE) < zone->uz_size)
+ if ((pages * UMA_SLAB_SIZE) < keg->uk_size)
pages++;
- zone->uz_ppera = pages;
- zone->uz_ipers = 1;
+ keg->uk_ppera = pages;
+ keg->uk_ipers = 1;
- zone->uz_flags |= UMA_ZONE_OFFPAGE;
- if ((zone->uz_flags & UMA_ZONE_MALLOC) == 0)
- zone->uz_flags |= UMA_ZONE_HASH;
+ keg->uk_flags |= UMA_ZONE_OFFPAGE;
+ if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
+ keg->uk_flags |= UMA_ZONE_HASH;
- zone->uz_rsize = zone->uz_size;
+ keg->uk_rsize = keg->uk_size;
}
/*
- * Zone header ctor. This initializes all fields, locks, etc. And inserts
- * the zone onto the global zone list.
+ * Keg header ctor. This initializes all fields, locks, etc. And inserts
+ * the keg onto the global keg list.
*
* Arguments/Returns follow uma_ctor specifications
- * udata Actually uma_zcreat_args
+ * udata Actually uma_kctor_args
*/
-
static void
-zone_ctor(void *mem, int size, void *udata)
+keg_ctor(void *mem, int size, void *udata)
{
- struct uma_zctor_args *arg = udata;
- uma_zone_t zone = mem;
- int privlc;
+ struct uma_kctor_args *arg = udata;
+ uma_keg_t keg = mem;
+ uma_zone_t zone;
- bzero(zone, size);
- zone->uz_name = arg->name;
- zone->uz_size = arg->size;
- zone->uz_ctor = arg->ctor;
- zone->uz_dtor = arg->dtor;
- zone->uz_init = arg->uminit;
- zone->uz_fini = arg->fini;
- zone->uz_align = arg->align;
- zone->uz_free = 0;
- zone->uz_pages = 0;
- zone->uz_flags = arg->flags;
- zone->uz_allocf = page_alloc;
- zone->uz_freef = page_free;
+ bzero(keg, size);
+ keg->uk_size = arg->size;
+ keg->uk_init = arg->uminit;
+ keg->uk_fini = arg->fini;
+ keg->uk_align = arg->align;
+ keg->uk_free = 0;
+ keg->uk_pages = 0;
+ keg->uk_flags = arg->flags;
+ keg->uk_allocf = page_alloc;
+ keg->uk_freef = page_free;
+ keg->uk_recurse = 0;
+ keg->uk_slabzone = NULL;
- if (arg->flags & UMA_ZONE_ZINIT)
- zone->uz_init = zero_init;
+ /*
+ * The master zone is passed to us at keg-creation time.
+ */
+ zone = arg->zone;
+ zone->uz_keg = keg;
if (arg->flags & UMA_ZONE_VM)
- zone->uz_flags |= UMA_ZFLAG_CACHEONLY;
+ keg->uk_flags |= UMA_ZFLAG_CACHEONLY;
+
+ if (arg->flags & UMA_ZONE_ZINIT)
+ keg->uk_init = zero_init;
/*
- * XXX:
- * The +1 byte added to uz_size is to account for the byte of
+ * The +1 byte added to uk_size is to account for the byte of
* linkage that is added to the size in zone_small_init(). If
* we don't account for this here then we may end up in
* zone_small_init() with a calculated 'ipers' of 0.
*/
- if ((zone->uz_size+1) > (UMA_SLAB_SIZE - sizeof(struct uma_slab)))
+ if ((keg->uk_size+1) > (UMA_SLAB_SIZE - sizeof(struct uma_slab)))
zone_large_init(zone);
else
zone_small_init(zone);
+
+ if (keg->uk_flags & UMA_ZONE_REFCNT)
+ keg->uk_slabzone = slabrefzone;
+ else if (keg->uk_flags & UMA_ZONE_OFFPAGE)
+ keg->uk_slabzone = slabzone;
+
/*
* If we haven't booted yet we need allocations to go through the
* startup cache until the vm is ready.
*/
- if (zone->uz_ppera == 1) {
+ if (keg->uk_ppera == 1) {
#ifdef UMA_MD_SMALL_ALLOC
- zone->uz_allocf = uma_small_alloc;
- zone->uz_freef = uma_small_free;
+ keg->uk_allocf = uma_small_alloc;
+ keg->uk_freef = uma_small_free;
#endif
if (booted == 0)
- zone->uz_allocf = startup_alloc;
+ keg->uk_allocf = startup_alloc;
}
+
+ /*
+ * Initialize keg's lock (shared among zones) through
+ * Master zone
+ */
+ zone->uz_lock = &keg->uk_lock;
if (arg->flags & UMA_ZONE_MTXCLASS)
- privlc = 1;
+ ZONE_LOCK_INIT(zone, 1);
else
- privlc = 0;
+ ZONE_LOCK_INIT(zone, 0);
/*
* If we're putting the slab header in the actual page we need to
* figure out where in each page it goes. This calculates a right
* justified offset into the memory on an ALIGN_PTR boundary.
*/
- if (!(zone->uz_flags & UMA_ZONE_OFFPAGE)) {
+ if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) {
int totsize;
/* Size of the slab struct and free list */
- totsize = sizeof(struct uma_slab) + zone->uz_ipers;
+ totsize = sizeof(struct uma_slab) + keg->uk_ipers;
if (totsize & UMA_ALIGN_PTR)
totsize = (totsize & ~UMA_ALIGN_PTR) +
(UMA_ALIGN_PTR + 1);
- zone->uz_pgoff = UMA_SLAB_SIZE - totsize;
- totsize = zone->uz_pgoff + sizeof(struct uma_slab)
- + zone->uz_ipers;
+ keg->uk_pgoff = UMA_SLAB_SIZE - totsize;
+ totsize = keg->uk_pgoff + sizeof(struct uma_slab)
+ + keg->uk_ipers;
/* I don't think it's possible, but I'll make sure anyway */
if (totsize > UMA_SLAB_SIZE) {
printf("zone %s ipers %d rsize %d size %d\n",
- zone->uz_name, zone->uz_ipers, zone->uz_rsize,
- zone->uz_size);
+ zone->uz_name, keg->uk_ipers, keg->uk_rsize,
+ keg->uk_size);
panic("UMA slab won't fit.\n");
}
}
- if (zone->uz_flags & UMA_ZONE_HASH)
- hash_alloc(&zone->uz_hash);
+ if (keg->uk_flags & UMA_ZONE_HASH)
+ hash_alloc(&keg->uk_hash);
#ifdef UMA_DEBUG
printf("%s(%p) size = %d ipers = %d ppera = %d pgoff = %d\n",
zone->uz_name, zone,
- zone->uz_size, zone->uz_ipers,
- zone->uz_ppera, zone->uz_pgoff);
+ keg->uk_size, keg->uk_ipers,
+ keg->uk_ppera, keg->uk_pgoff);
#endif
- ZONE_LOCK_INIT(zone, privlc);
+
+ LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link);
mtx_lock(&uma_mtx);
- LIST_INSERT_HEAD(&uma_zones, zone, uz_link);
+ LIST_INSERT_HEAD(&uma_kegs, keg, uk_link);
mtx_unlock(&uma_mtx);
+}
+
+/*
+ * Zone header ctor. This initializes all fields, locks, etc.
+ *
+ * Arguments/Returns follow uma_ctor specifications
+ * udata Actually uma_zctor_args
+ */
+
+static void
+zone_ctor(void *mem, int size, void *udata)
+{
+ struct uma_zctor_args *arg = udata;
+ uma_zone_t zone = mem;
+ uma_zone_t z;
+ uma_keg_t keg;
+
+ bzero(zone, size);
+ zone->uz_name = arg->name;
+ zone->uz_ctor = arg->ctor;
+ zone->uz_dtor = arg->dtor;
+ zone->uz_init = NULL;
+ zone->uz_fini = NULL;
+ zone->uz_allocs = 0;
+ zone->uz_fills = zone->uz_count = 0;
+
+ if (arg->flags & UMA_ZONE_SECONDARY) {
+ KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
+ keg = arg->keg;
+ zone->uz_keg = keg;
+ zone->uz_init = arg->uminit;
+ zone->uz_fini = arg->fini;
+ zone->uz_lock = &keg->uk_lock;
+ mtx_lock(&uma_mtx);
+ ZONE_LOCK(zone);
+ keg->uk_flags |= UMA_ZONE_SECONDARY;
+ LIST_FOREACH(z, &keg->uk_zones, uz_link) {
+ if (LIST_NEXT(z, uz_link) == NULL) {
+ LIST_INSERT_AFTER(z, zone, uz_link);
+ break;
+ }
+ }
+ ZONE_UNLOCK(zone);
+ mtx_unlock(&uma_mtx);
+ } else if (arg->keg == NULL) {
+ uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
+ arg->align, arg->flags);
+ } else {
+ struct uma_kctor_args karg;
+
+ /* We should only be here from uma_startup() */
+ karg.size = arg->size;
+ karg.uminit = arg->uminit;
+ karg.fini = arg->fini;
+ karg.align = arg->align;
+ karg.flags = arg->flags;
+ karg.zone = zone;
+ keg_ctor(arg->keg, sizeof(struct uma_keg), &karg);
+ }
+ keg = zone->uz_keg;
+ zone->uz_lock = &keg->uk_lock;
/*
* Some internal zones don't have room allocated for the per cpu
* caches. If we're internal, bail out here.
*/
- if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
+ if (keg->uk_flags & UMA_ZFLAG_INTERNAL) {
+ KASSERT((keg->uk_flags & UMA_ZONE_SECONDARY) == 0,
+ ("Secondary zone requested UMA_ZFLAG_INTERNAL"));
return;
+ }
- if (zone->uz_ipers <= BUCKET_MAX)
- zone->uz_count = zone->uz_ipers;
+ if (keg->uk_flags & UMA_ZONE_MAXBUCKET)
+ zone->uz_count = BUCKET_MAX;
+ else if (keg->uk_ipers <= BUCKET_MAX)
+ zone->uz_count = keg->uk_ipers;
else
zone->uz_count = BUCKET_MAX;
}
/*
- * Zone header dtor. This frees all data, destroys locks, frees the hash table
- * and removes the zone from the global list.
+ * Keg header dtor. This frees all data, destroys locks, frees the hash
+ * table and removes the keg from the global list.
*
* Arguments/Returns follow uma_dtor specifications
* udata unused
*/
+static void
+keg_dtor(void *arg, int size, void *udata)
+{
+ uma_keg_t keg;
+ keg = (uma_keg_t)arg;
+ mtx_lock(&keg->uk_lock);
+ if (keg->uk_free != 0) {
+		printf("Freed UMA keg was not empty (%d items). "
+		    "Lost %d pages of memory.\n",
+ keg->uk_free, keg->uk_pages);
+ }
+ mtx_unlock(&keg->uk_lock);
+
+ if (keg->uk_flags & UMA_ZONE_HASH)
+ hash_free(&keg->uk_hash);
+
+ mtx_destroy(&keg->uk_lock);
+}
+
+/*
+ * Zone header dtor.
+ *
+ * Arguments/Returns follow uma_dtor specifications
+ * udata unused
+ */
static void
zone_dtor(void *arg, int size, void *udata)
{
uma_zone_t zone;
+ uma_keg_t keg;
zone = (uma_zone_t)arg;
+ keg = zone->uz_keg;
- if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
+ if (!(keg->uk_flags & UMA_ZFLAG_INTERNAL))
cache_drain(zone);
+
mtx_lock(&uma_mtx);
- LIST_REMOVE(zone, uz_link);
zone_drain(zone);
- mtx_unlock(&uma_mtx);
-
- ZONE_LOCK(zone);
- if (zone->uz_free != 0) {
- printf("Zone %s was not empty (%d items). "
- " Lost %d pages of memory.\n",
- zone->uz_name, zone->uz_free, zone->uz_pages);
- uma_print_zone(zone);
+ if (keg->uk_flags & UMA_ZONE_SECONDARY) {
+ LIST_REMOVE(zone, uz_link);
+ /*
+ * XXX there are some races here where
+ * the zone can be drained but zone lock
+ * released and then refilled before we
+		 * remove it... we don't care for now
+ */
+ ZONE_LOCK(zone);
+ if (LIST_EMPTY(&keg->uk_zones))
+ keg->uk_flags &= ~UMA_ZONE_SECONDARY;
+ ZONE_UNLOCK(zone);
+ mtx_unlock(&uma_mtx);
+ } else {
+ LIST_REMOVE(keg, uk_link);
+ LIST_REMOVE(zone, uz_link);
+ mtx_unlock(&uma_mtx);
+ uma_zfree_internal(kegs, keg, NULL, 0);
}
-
- ZONE_UNLOCK(zone);
- if (zone->uz_flags & UMA_ZONE_HASH)
- hash_free(&zone->uz_hash);
-
- ZONE_LOCK_FINI(zone);
+ zone->uz_keg = NULL;
}
+
/*
* Traverses every zone in the system and calls a callback
*
@@ -1208,11 +1369,14 @@ zone_dtor(void *arg, int size, void *udata)
static void
zone_foreach(void (*zfunc)(uma_zone_t))
{
+ uma_keg_t keg;
uma_zone_t zone;
mtx_lock(&uma_mtx);
- LIST_FOREACH(zone, &uma_zones, uz_link)
- zfunc(zone);
+ LIST_FOREACH(keg, &uma_kegs, uk_link) {
+ LIST_FOREACH(zone, &keg->uk_zones, uz_link)
+ zfunc(zone);
+ }
mtx_unlock(&uma_mtx);
}
@@ -1227,25 +1391,23 @@ uma_startup(void *bootmem)
int i;
#ifdef UMA_DEBUG
- printf("Creating uma zone headers zone.\n");
+ printf("Creating uma keg headers zone and keg.\n");
#endif
mtx_init(&uma_mtx, "UMA lock", NULL, MTX_DEF);
- /* "manually" Create the initial zone */
- args.name = "UMA Zones";
- args.size = sizeof(struct uma_zone) +
- (sizeof(struct uma_cache) * (mp_maxid + 1));
- args.ctor = zone_ctor;
- args.dtor = zone_dtor;
+
+ /* "manually" create the initial zone */
+ args.name = "UMA Kegs";
+ args.size = sizeof(struct uma_keg);
+ args.ctor = keg_ctor;
+ args.dtor = keg_dtor;
args.uminit = zero_init;
args.fini = NULL;
+ args.keg = &masterkeg;
args.align = 32 - 1;
args.flags = UMA_ZFLAG_INTERNAL;
/* The initial zone has no Per cpu queues so it's smaller */
- zone_ctor(zones, sizeof(struct uma_zone), &args);
+ zone_ctor(kegs, sizeof(struct uma_zone), &args);
- /* Initialize the pcpu cache lock set once and for all */
- for (i = 0; i <= mp_maxid; i++)
- CPU_LOCK_INIT(i);
#ifdef UMA_DEBUG
printf("Filling boot free list.\n");
#endif
@@ -1258,7 +1420,30 @@ uma_startup(void *bootmem)
}
#ifdef UMA_DEBUG
- printf("Creating slab zone.\n");
+ printf("Creating uma zone headers zone and keg.\n");
+#endif
+ args.name = "UMA Zones";
+ args.size = sizeof(struct uma_zone) +
+ (sizeof(struct uma_cache) * (mp_maxid + 1));
+ args.ctor = zone_ctor;
+ args.dtor = zone_dtor;
+ args.uminit = zero_init;
+ args.fini = NULL;
+ args.keg = NULL;
+ args.align = 32 - 1;
+ args.flags = UMA_ZFLAG_INTERNAL;
+ /* The initial zone has no Per cpu queues so it's smaller */
+ zone_ctor(zones, sizeof(struct uma_zone), &args);
+
+#ifdef UMA_DEBUG
+ printf("Initializing pcpu cache locks.\n");
+#endif
+ /* Initialize the pcpu cache lock set once and for all */
+ for (i = 0; i <= mp_maxid; i++)
+ CPU_LOCK_INIT(i);
+
+#ifdef UMA_DEBUG
+ printf("Creating slab and hash zones.\n");
#endif
/*
@@ -1276,6 +1461,20 @@ uma_startup(void *bootmem)
NULL, NULL, NULL, NULL,
UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
+ /*
+ * We also create a zone for the bigger slabs with reference
+	 * counts in them, to accommodate UMA_ZONE_REFCNT zones.
+ */
+ slabsize = UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt);
+ slabsize /= UMA_MAX_WASTE;
+ slabsize++;
+ slabsize += 4 * slabsize;
+ slabsize += sizeof(struct uma_slab_refcnt);
+ slabrefzone = uma_zcreate("UMA RCntSlabs",
+ slabsize,
+ NULL, NULL, NULL, NULL,
+ UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
+
hashzone = uma_zcreate("UMA Hash",
sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
NULL, NULL, NULL, NULL,
@@ -1321,6 +1520,21 @@ uma_startup3(void)
#endif
}
+static void
+uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
+ int align, u_int16_t flags)
+{
+ struct uma_kctor_args args;
+
+ args.size = size;
+ args.uminit = uminit;
+ args.fini = fini;
+ args.align = align;
+ args.flags = flags;
+ args.zone = zone;
+ zone = uma_zalloc_internal(kegs, &args, M_WAITOK);
+}
+
/* See uma.h */
uma_zone_t
uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
@@ -1338,6 +1552,27 @@ uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
args.fini = fini;
args.align = align;
args.flags = flags;
+ args.keg = NULL;
+
+ return (uma_zalloc_internal(zones, &args, M_WAITOK));
+}
+
+/* See uma.h */
+uma_zone_t
+uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor,
+ uma_init zinit, uma_fini zfini, uma_zone_t master)
+{
+ struct uma_zctor_args args;
+
+ args.name = name;
+ args.size = master->uz_keg->uk_size;
+ args.ctor = ctor;
+ args.dtor = dtor;
+ args.uminit = zinit;
+ args.fini = zfini;
+ args.align = master->uz_keg->uk_align;
+ args.flags = master->uz_keg->uk_flags | UMA_ZONE_SECONDARY;
+ args.keg = master->uz_keg;
return (uma_zalloc_internal(zones, &args, M_WAITOK));
}
@@ -1357,35 +1592,25 @@ uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
uma_cache_t cache;
uma_bucket_t bucket;
int cpu;
+ int badness = 1;
/* This is the fast path allocation */
#ifdef UMA_DEBUG_ALLOC_1
printf("Allocating one item from %s(%p)\n", zone->uz_name, zone);
#endif
-#ifdef INVARIANTS
- /*
- * To make sure that WAITOK or NOWAIT is set, but not more than
- * one, and check against the API botches that are common.
- * The uma code implies M_WAITOK if M_NOWAIT is not set, so
- * we default to waiting if none of the flags is set.
- */
- cpu = flags & (M_WAITOK | M_NOWAIT | M_DONTWAIT | M_TRYWAIT);
- if (cpu != M_NOWAIT && cpu != M_WAITOK) {
- static struct timeval lasterr;
- static int curerr, once;
- if (once == 0 && ppsratecheck(&lasterr, &curerr, 1)) {
- printf("Bad uma_zalloc flags: %x\n", cpu);
- backtrace();
- once++;
- }
- }
-#endif
if (!(flags & M_NOWAIT)) {
KASSERT(curthread->td_intr_nesting_level == 0,
("malloc(M_WAITOK) in interrupt context"));
- WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
- "malloc() of \"%s\"", zone->uz_name);
+#ifdef WITNESS
+ badness = WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
+ "malloc(M_WAITOK) of \"%s\", forcing M_NOWAIT",
+ zone->uz_name);
+#endif
+ if (badness) {
+ flags &= ~M_WAITOK;
+ flags |= M_NOWAIT;
+ }
}
zalloc_restart:
@@ -1413,9 +1638,9 @@ zalloc_start:
#endif
CPU_UNLOCK(cpu);
if (zone->uz_ctor)
- zone->uz_ctor(item, zone->uz_size, udata);
+			zone->uz_ctor(item, zone->uz_keg->uk_size, udata);
if (flags & M_ZERO)
- bzero(item, zone->uz_size);
+ bzero(item, zone->uz_keg->uk_size);
return (item);
} else if (cache->uc_freebucket) {
/*
@@ -1465,6 +1690,7 @@ zalloc_start:
/* Bump up our uz_count so we get here less */
if (zone->uz_count < BUCKET_MAX)
zone->uz_count++;
+
/*
 	 * Now let's just fill a bucket and put it on the free list. If that
 	 * works we'll restart the allocation from the beginning.
@@ -1488,6 +1714,9 @@ static uma_slab_t
uma_zone_slab(uma_zone_t zone, int flags)
{
uma_slab_t slab;
+ uma_keg_t keg;
+
+ keg = zone->uz_keg;
/*
* This is to prevent us from recursively trying to allocate
@@ -1498,7 +1727,7 @@ uma_zone_slab(uma_zone_t zone, int flags)
* things happen. So instead we return a NULL bucket, and make
* the code that allocates buckets smart enough to deal with it
*/
- if (zone->uz_flags & UMA_ZFLAG_INTERNAL && zone->uz_recurse != 0)
+ if (keg->uk_flags & UMA_ZFLAG_INTERNAL && keg->uk_recurse != 0)
return (NULL);
slab = NULL;
@@ -1509,14 +1738,14 @@ uma_zone_slab(uma_zone_t zone, int flags)
* used over those that are totally full. This helps to reduce
* fragmentation.
*/
- if (zone->uz_free != 0) {
- if (!LIST_EMPTY(&zone->uz_part_slab)) {
- slab = LIST_FIRST(&zone->uz_part_slab);
+ if (keg->uk_free != 0) {
+ if (!LIST_EMPTY(&keg->uk_part_slab)) {
+ slab = LIST_FIRST(&keg->uk_part_slab);
} else {
- slab = LIST_FIRST(&zone->uz_free_slab);
+ slab = LIST_FIRST(&keg->uk_free_slab);
LIST_REMOVE(slab, us_link);
- LIST_INSERT_HEAD(&zone->uz_part_slab, slab,
- us_link);
+ LIST_INSERT_HEAD(&keg->uk_part_slab, slab,
+ us_link);
}
return (slab);
}
@@ -1527,27 +1756,28 @@ uma_zone_slab(uma_zone_t zone, int flags)
if (flags & M_NOVM)
break;
- if (zone->uz_maxpages &&
- zone->uz_pages >= zone->uz_maxpages) {
- zone->uz_flags |= UMA_ZFLAG_FULL;
+ if (keg->uk_maxpages &&
+ keg->uk_pages >= keg->uk_maxpages) {
+ keg->uk_flags |= UMA_ZFLAG_FULL;
if (flags & M_NOWAIT)
break;
else
- msleep(zone, &zone->uz_lock, PVM,
+ msleep(keg, &keg->uk_lock, PVM,
"zonelimit", 0);
continue;
}
- zone->uz_recurse++;
+ keg->uk_recurse++;
slab = slab_zalloc(zone, flags);
- zone->uz_recurse--;
+ keg->uk_recurse--;
+
/*
* If we got a slab here it's safe to mark it partially used
* and return. We assume that the caller is going to remove
* at least one item.
*/
if (slab) {
- LIST_INSERT_HEAD(&zone->uz_part_slab, slab, us_link);
+ LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);
return (slab);
}
/*
@@ -1564,22 +1794,25 @@ uma_zone_slab(uma_zone_t zone, int flags)
static void *
uma_slab_alloc(uma_zone_t zone, uma_slab_t slab)
{
+ uma_keg_t keg;
void *item;
u_int8_t freei;
+ keg = zone->uz_keg;
+
freei = slab->us_firstfree;
- slab->us_firstfree = slab->us_freelist[freei];
- item = slab->us_data + (zone->uz_rsize * freei);
+ slab->us_firstfree = slab->us_freelist[freei].us_item;
+ item = slab->us_data + (keg->uk_rsize * freei);
slab->us_freecount--;
- zone->uz_free--;
+ keg->uk_free--;
#ifdef INVARIANTS
uma_dbg_alloc(zone, slab, item);
#endif
/* Move this slab to the full list */
if (slab->us_freecount == 0) {
LIST_REMOVE(slab, us_link);
- LIST_INSERT_HEAD(&zone->uz_full_slab, slab, us_link);
+ LIST_INSERT_HEAD(&keg->uk_full_slab, slab, us_link);
}
return (item);
@@ -1590,6 +1823,7 @@ uma_zalloc_bucket(uma_zone_t zone, int flags)
{
uma_bucket_t bucket;
uma_slab_t slab;
+ int16_t saved;
int max;
/*
@@ -1603,7 +1837,7 @@ uma_zalloc_bucket(uma_zone_t zone, int flags)
int bflags;
bflags = (flags & ~M_ZERO);
- if (zone->uz_flags & UMA_ZFLAG_CACHEONLY)
+ if (zone->uz_keg->uk_flags & UMA_ZFLAG_CACHEONLY)
bflags |= M_NOVM;
ZONE_UNLOCK(zone);
@@ -1628,18 +1862,36 @@ uma_zalloc_bucket(uma_zone_t zone, int flags)
max = MIN(bucket->ub_entries, zone->uz_count);
/* Try to keep the buckets totally full */
+ saved = bucket->ub_cnt;
while (bucket->ub_cnt < max &&
(slab = uma_zone_slab(zone, flags)) != NULL) {
while (slab->us_freecount && bucket->ub_cnt < max) {
bucket->ub_bucket[bucket->ub_cnt++] =
uma_slab_alloc(zone, slab);
}
+
/* Don't block on the next fill */
flags |= M_NOWAIT;
}
- zone->uz_fills--;
+ /*
+ * We unlock here because we need to call the zone's init.
+ * It should be safe to unlock because the slab dealt with
+ * above is already on the appropriate list within the keg
+ * and the bucket we filled is not yet on any list, so we
+ * own it.
+ */
+ if (zone->uz_init != NULL) {
+ int i;
+
+ ZONE_UNLOCK(zone);
+ for (i = saved; i < bucket->ub_cnt; i++)
+ zone->uz_init(bucket->ub_bucket[i],
+ zone->uz_keg->uk_size);
+ ZONE_LOCK(zone);
+ }
+ zone->uz_fills--;
if (bucket->ub_cnt != 0) {
LIST_INSERT_HEAD(&zone->uz_full_bucket,
bucket, ub_link);
@@ -1668,10 +1920,12 @@ done:
static void *
uma_zalloc_internal(uma_zone_t zone, void *udata, int flags)
{
+ uma_keg_t keg;
uma_slab_t slab;
void *item;
item = NULL;
+ keg = zone->uz_keg;
#ifdef UMA_DEBUG_ALLOC
printf("INTERNAL: Allocating one item from %s(%p)\n", zone->uz_name, zone);
@@ -1688,10 +1942,18 @@ uma_zalloc_internal(uma_zone_t zone, void *udata, int flags)
ZONE_UNLOCK(zone);
+ /*
+ * We have to call both the zone's init (not the keg's init)
+ * and the zone's ctor. This is because the item is going from
+ * a keg slab directly to the user, and the user is expecting it
+ * to be both zone-init'd as well as zone-ctor'd.
+ */
+ if (zone->uz_init != NULL)
+ zone->uz_init(item, keg->uk_size);
if (zone->uz_ctor != NULL)
- zone->uz_ctor(item, zone->uz_size, udata);
+ zone->uz_ctor(item, keg->uk_size, udata);
if (flags & M_ZERO)
- bzero(item, zone->uz_size);
+ bzero(item, keg->uk_size);
return (item);
}
@@ -1700,6 +1962,7 @@ uma_zalloc_internal(uma_zone_t zone, void *udata, int flags)
void
uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
{
+ uma_keg_t keg;
uma_cache_t cache;
uma_bucket_t bucket;
int bflags;
@@ -1708,6 +1971,8 @@ uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
/* This is the fast path free */
skip = 0;
+ keg = zone->uz_keg;
+
#ifdef UMA_DEBUG_ALLOC_1
printf("Freeing item %p to %s(%p)\n", item, zone->uz_name, zone);
#endif
@@ -1716,11 +1981,11 @@ uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
* a little longer for the limits to be reset.
*/
- if (zone->uz_flags & UMA_ZFLAG_FULL)
+ if (keg->uk_flags & UMA_ZFLAG_FULL)
goto zfree_internal;
if (zone->uz_dtor) {
- zone->uz_dtor(item, zone->uz_size, udata);
+ zone->uz_dtor(item, keg->uk_size, udata);
skip = 1;
}
@@ -1745,7 +2010,7 @@ zfree_start:
bucket->ub_cnt++;
#ifdef INVARIANTS
ZONE_LOCK(zone);
- if (zone->uz_flags & UMA_ZONE_MALLOC)
+ if (keg->uk_flags & UMA_ZONE_MALLOC)
uma_dbg_free(zone, udata, item);
else
uma_dbg_free(zone, NULL, item);
@@ -1810,7 +2075,7 @@ zfree_start:
#endif
bflags = M_NOWAIT;
- if (zone->uz_flags & UMA_ZFLAG_CACHEONLY)
+ if (keg->uk_flags & UMA_ZFLAG_CACHEONLY)
bflags |= M_NOVM;
bucket = bucket_alloc(zone->uz_count, bflags);
if (bucket) {
@@ -1836,7 +2101,7 @@ zfree_internal:
*/
if (skip) {
ZONE_LOCK(zone);
- if (zone->uz_flags & UMA_ZONE_MALLOC)
+ if (keg->uk_flags & UMA_ZONE_MALLOC)
uma_dbg_free(zone, udata, item);
else
uma_dbg_free(zone, NULL, item);
@@ -1846,7 +2111,6 @@ zfree_internal:
uma_zfree_internal(zone, item, udata, skip);
return;
-
}
/*
@@ -1862,20 +2126,25 @@ static void
uma_zfree_internal(uma_zone_t zone, void *item, void *udata, int skip)
{
uma_slab_t slab;
+ uma_keg_t keg;
u_int8_t *mem;
u_int8_t freei;
+ keg = zone->uz_keg;
+
if (!skip && zone->uz_dtor)
- zone->uz_dtor(item, zone->uz_size, udata);
+ zone->uz_dtor(item, keg->uk_size, udata);
+ if (zone->uz_fini)
+ zone->uz_fini(item, keg->uk_size);
ZONE_LOCK(zone);
- if (!(zone->uz_flags & UMA_ZONE_MALLOC)) {
+ if (!(keg->uk_flags & UMA_ZONE_MALLOC)) {
mem = (u_int8_t *)((unsigned long)item & (~UMA_SLAB_MASK));
- if (zone->uz_flags & UMA_ZONE_HASH)
- slab = hash_sfind(&zone->uz_hash, mem);
+ if (keg->uk_flags & UMA_ZONE_HASH)
+ slab = hash_sfind(&keg->uk_hash, mem);
else {
- mem += zone->uz_pgoff;
+ mem += keg->uk_pgoff;
slab = (uma_slab_t)mem;
}
} else {
@@ -1883,36 +2152,36 @@ uma_zfree_internal(uma_zone_t zone, void *item, void *udata, int skip)
}
/* Do we need to remove from any lists? */
- if (slab->us_freecount+1 == zone->uz_ipers) {
+ if (slab->us_freecount+1 == keg->uk_ipers) {
LIST_REMOVE(slab, us_link);
- LIST_INSERT_HEAD(&zone->uz_free_slab, slab, us_link);
+ LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link);
} else if (slab->us_freecount == 0) {
LIST_REMOVE(slab, us_link);
- LIST_INSERT_HEAD(&zone->uz_part_slab, slab, us_link);
+ LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);
}
/* Slab management stuff */
freei = ((unsigned long)item - (unsigned long)slab->us_data)
- / zone->uz_rsize;
+ / keg->uk_rsize;
#ifdef INVARIANTS
if (!skip)
uma_dbg_free(zone, slab, item);
#endif
- slab->us_freelist[freei] = slab->us_firstfree;
+ slab->us_freelist[freei].us_item = slab->us_firstfree;
slab->us_firstfree = freei;
slab->us_freecount++;
/* Zone statistics */
- zone->uz_free++;
+ keg->uk_free++;
- if (zone->uz_flags & UMA_ZFLAG_FULL) {
- if (zone->uz_pages < zone->uz_maxpages)
- zone->uz_flags &= ~UMA_ZFLAG_FULL;
+ if (keg->uk_flags & UMA_ZFLAG_FULL) {
+ if (keg->uk_pages < keg->uk_maxpages)
+ keg->uk_flags &= ~UMA_ZFLAG_FULL;
/* We can handle one more allocation */
- wakeup_one(zone);
+ wakeup_one(keg);
}
ZONE_UNLOCK(zone);
@@ -1922,24 +2191,71 @@ uma_zfree_internal(uma_zone_t zone, void *item, void *udata, int skip)
void
uma_zone_set_max(uma_zone_t zone, int nitems)
{
+ uma_keg_t keg;
+
+ keg = zone->uz_keg;
ZONE_LOCK(zone);
- if (zone->uz_ppera > 1)
- zone->uz_maxpages = nitems * zone->uz_ppera;
+ if (keg->uk_ppera > 1)
+ keg->uk_maxpages = nitems * keg->uk_ppera;
else
- zone->uz_maxpages = nitems / zone->uz_ipers;
+ keg->uk_maxpages = nitems / keg->uk_ipers;
- if (zone->uz_maxpages * zone->uz_ipers < nitems)
- zone->uz_maxpages++;
+ if (keg->uk_maxpages * keg->uk_ipers < nitems)
+ keg->uk_maxpages++;
ZONE_UNLOCK(zone);
}
/* See uma.h */
void
+uma_zone_set_init(uma_zone_t zone, uma_init uminit)
+{
+ ZONE_LOCK(zone);
+ KASSERT(zone->uz_keg->uk_pages == 0,
+ ("uma_zone_set_init on non-empty keg"));
+ zone->uz_keg->uk_init = uminit;
+ ZONE_UNLOCK(zone);
+}
+
+/* See uma.h */
+void
+uma_zone_set_fini(uma_zone_t zone, uma_fini fini)
+{
+ ZONE_LOCK(zone);
+ KASSERT(zone->uz_keg->uk_pages == 0,
+ ("uma_zone_set_fini on non-empty keg"));
+ zone->uz_keg->uk_fini = fini;
+ ZONE_UNLOCK(zone);
+}
+
+/* See uma.h */
+void
+uma_zone_set_zinit(uma_zone_t zone, uma_init zinit)
+{
+ ZONE_LOCK(zone);
+ KASSERT(zone->uz_keg->uk_pages == 0,
+ ("uma_zone_set_zinit on non-empty keg"));
+ zone->uz_init = zinit;
+ ZONE_UNLOCK(zone);
+}
+
+/* See uma.h */
+void
+uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini)
+{
+ ZONE_LOCK(zone);
+ KASSERT(zone->uz_keg->uk_pages == 0,
+ ("uma_zone_set_zfini on non-empty keg"));
+ zone->uz_fini = zfini;
+ ZONE_UNLOCK(zone);
+}
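
The four setters above differ only in where the hook lands: uma_zone_set_init() and uma_zone_set_fini() store keg-level routines in the shared Keg, so every Zone backed by that Keg inherits them, while uma_zone_set_zinit() and uma_zone_set_zfini() store zone-level routines private to the Zone they are called on. As the assertions show, all four may only be used before the Keg has allocated any pages. A rough, hypothetical sketch (the foo_* names are invented here, and the callback signature follows the uma_init usage elsewhere in this patch):

#include <sys/param.h>
#include <sys/systm.h>
#include <vm/uma.h>

static void
foo_keg_init(void *mem, int size)
{

	/* Lives in the Keg, so every Zone backed by it sees this. */
	bzero(mem, size);
}

static void
foo_zone_init(void *mem, int size)
{

	/* Lives in this Zone only; other Zones on the Keg never run it. */
}

static void
foo_tune_zone(uma_zone_t zone_foo)
{

	uma_zone_set_init(zone_foo, foo_keg_init);	/* keg-level */
	uma_zone_set_zinit(zone_foo, foo_zone_init);	/* zone-level */
}
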
+
+/* See uma.h */
+void
uma_zone_set_freef(uma_zone_t zone, uma_free freef)
{
ZONE_LOCK(zone);
- zone->uz_freef = freef;
+ zone->uz_keg->uk_freef = freef;
ZONE_UNLOCK(zone);
}
@@ -1948,8 +2264,8 @@ void
uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
{
ZONE_LOCK(zone);
- zone->uz_flags |= UMA_ZFLAG_PRIVALLOC;
- zone->uz_allocf = allocf;
+ zone->uz_keg->uk_flags |= UMA_ZFLAG_PRIVALLOC;
+ zone->uz_keg->uk_allocf = allocf;
ZONE_UNLOCK(zone);
}
@@ -1957,12 +2273,14 @@ uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
int
uma_zone_set_obj(uma_zone_t zone, struct vm_object *obj, int count)
{
- int pages;
+ uma_keg_t keg;
vm_offset_t kva;
+ int pages;
- pages = count / zone->uz_ipers;
+ keg = zone->uz_keg;
+ pages = count / keg->uk_ipers;
- if (pages * zone->uz_ipers < count)
+ if (pages * keg->uk_ipers < count)
pages++;
kva = kmem_alloc_pageable(kernel_map, pages * UMA_SLAB_SIZE);
@@ -1978,11 +2296,11 @@ uma_zone_set_obj(uma_zone_t zone, struct vm_object *obj, int count)
pages, obj);
}
ZONE_LOCK(zone);
- zone->uz_kva = kva;
- zone->uz_obj = obj;
- zone->uz_maxpages = pages;
- zone->uz_allocf = obj_alloc;
- zone->uz_flags |= UMA_ZONE_NOFREE | UMA_ZFLAG_PRIVALLOC;
+ keg->uk_kva = kva;
+ keg->uk_obj = obj;
+ keg->uk_maxpages = pages;
+ keg->uk_allocf = obj_alloc;
+ keg->uk_flags |= UMA_ZONE_NOFREE | UMA_ZFLAG_PRIVALLOC;
ZONE_UNLOCK(zone);
return (1);
}
@@ -1993,20 +2311,41 @@ uma_prealloc(uma_zone_t zone, int items)
{
int slabs;
uma_slab_t slab;
+ uma_keg_t keg;
+ keg = zone->uz_keg;
ZONE_LOCK(zone);
- slabs = items / zone->uz_ipers;
- if (slabs * zone->uz_ipers < items)
+ slabs = items / keg->uk_ipers;
+ if (slabs * keg->uk_ipers < items)
slabs++;
while (slabs > 0) {
slab = slab_zalloc(zone, M_WAITOK);
- LIST_INSERT_HEAD(&zone->uz_free_slab, slab, us_link);
+ LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link);
slabs--;
}
ZONE_UNLOCK(zone);
}
/* See uma.h */
+u_int32_t *
+uma_find_refcnt(uma_zone_t zone, void *item)
+{
+ uma_slabrefcnt_t slab;
+ uma_keg_t keg;
+ u_int32_t *refcnt;
+ int idx;
+
+ keg = zone->uz_keg;
+ slab = (uma_slabrefcnt_t)vtoslab((vm_offset_t)item & (~UMA_SLAB_MASK));
+ KASSERT(slab != NULL,
+ ("uma_find_refcnt(): zone possibly not UMA_ZONE_REFCNT"));
+ idx = ((unsigned long)item - (unsigned long)slab->us_data)
+ / keg->uk_rsize;
+ refcnt = &(slab->us_freelist[idx].us_refcnt);
+ return refcnt;
+}
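
A caller of the new uma_find_refcnt() interface would look roughly like the sketch below. The zone name, item size, and helper are invented for illustration; the zone has to be created with UMA_ZONE_REFCNT so that its slabs use the refcnt layout that actually carries the per-item counters:

#include <sys/param.h>
#include <sys/malloc.h>
#include <vm/uma.h>

static uma_zone_t zone_extbuf;

static void
extbuf_zone_setup(void)
{
	/* UMA_ZONE_REFCNT selects the larger uma_slab_refcnt layout. */
	zone_extbuf = uma_zcreate("example ext bufs", 2048, NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
}

static void *
extbuf_alloc(void)
{
	u_int32_t *refcnt;
	void *buf;

	buf = uma_zalloc(zone_extbuf, M_NOWAIT);
	if (buf == NULL)
		return (NULL);
	/* The counter lives in the slab; take the first reference. */
	refcnt = uma_find_refcnt(zone_extbuf, buf);
	*refcnt = 1;
	return (buf);
}
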
+
+/* See uma.h */
void
uma_reclaim(void)
{
@@ -2021,6 +2360,7 @@ uma_reclaim(void)
* zones are drained. We have to do the same for buckets.
*/
zone_drain(slabzone);
+ zone_drain(slabrefzone);
bucket_zone_drain();
}
@@ -2044,7 +2384,6 @@ uma_large_malloc(int size, int wait)
uma_zfree_internal(slabzone, slab, NULL, 0);
}
-
return (mem);
}
@@ -2065,8 +2404,8 @@ uma_print_stats(void)
static void
slab_print(uma_slab_t slab)
{
- printf("slab: zone %p, data %p, freecount %d, firstfree %d\n",
- slab->us_zone, slab->us_data, slab->us_freecount,
+ printf("slab: keg %p, data %p, freecount %d, firstfree %d\n",
+ slab->us_keg, slab->us_data, slab->us_freecount,
slab->us_firstfree);
}
@@ -2084,21 +2423,23 @@ void
uma_print_zone(uma_zone_t zone)
{
uma_cache_t cache;
+ uma_keg_t keg;
uma_slab_t slab;
int i;
+ keg = zone->uz_keg;
printf("%s(%p) size %d(%d) flags %d ipers %d ppera %d out %d free %d\n",
- zone->uz_name, zone, zone->uz_size, zone->uz_rsize, zone->uz_flags,
- zone->uz_ipers, zone->uz_ppera,
- (zone->uz_ipers * zone->uz_pages) - zone->uz_free, zone->uz_free);
+ zone->uz_name, zone, keg->uk_size, keg->uk_rsize, keg->uk_flags,
+ keg->uk_ipers, keg->uk_ppera,
+ (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free);
printf("Part slabs:\n");
- LIST_FOREACH(slab, &zone->uz_part_slab, us_link)
+ LIST_FOREACH(slab, &keg->uk_part_slab, us_link)
slab_print(slab);
printf("Free slabs:\n");
- LIST_FOREACH(slab, &zone->uz_free_slab, us_link)
+ LIST_FOREACH(slab, &keg->uk_free_slab, us_link)
slab_print(slab);
printf("Full slabs:\n");
- LIST_FOREACH(slab, &zone->uz_full_slab, us_link)
+ LIST_FOREACH(slab, &keg->uk_full_slab, us_link)
slab_print(slab);
for (i = 0; i <= mp_maxid; i++) {
if (CPU_ABSENT(i))
@@ -2122,6 +2463,7 @@ sysctl_vm_zone(SYSCTL_HANDLER_ARGS)
int totalfree;
char *tmpbuf, *offset;
uma_zone_t z;
+ uma_keg_t zk;
char *p;
int cpu;
int cachefree;
@@ -2130,8 +2472,10 @@ sysctl_vm_zone(SYSCTL_HANDLER_ARGS)
cnt = 0;
mtx_lock(&uma_mtx);
- LIST_FOREACH(z, &uma_zones, uz_link)
- cnt++;
+ LIST_FOREACH(zk, &uma_kegs, uk_link) {
+ LIST_FOREACH(z, &zk->uk_zones, uz_link)
+ cnt++;
+ }
mtx_unlock(&uma_mtx);
MALLOC(tmpbuf, char *, (cnt == 0 ? 1 : cnt) * linesize,
M_TEMP, M_WAITOK);
@@ -2144,10 +2488,11 @@ sysctl_vm_zone(SYSCTL_HANDLER_ARGS)
goto out;
offset = tmpbuf;
mtx_lock(&uma_mtx);
- LIST_FOREACH(z, &uma_zones, uz_link) {
+ LIST_FOREACH(zk, &uma_kegs, uk_link) {
+ LIST_FOREACH(z, &zk->uk_zones, uz_link) {
if (cnt == 0) /* list may have changed size */
break;
- if (!(z->uz_flags & UMA_ZFLAG_INTERNAL)) {
+ if (!(zk->uk_flags & UMA_ZFLAG_INTERNAL)) {
for (cpu = 0; cpu <= mp_maxid; cpu++) {
if (CPU_ABSENT(cpu))
continue;
@@ -2156,7 +2501,7 @@ sysctl_vm_zone(SYSCTL_HANDLER_ARGS)
}
ZONE_LOCK(z);
cachefree = 0;
- if (!(z->uz_flags & UMA_ZFLAG_INTERNAL)) {
+ if (!(zk->uk_flags & UMA_ZFLAG_INTERNAL)) {
for (cpu = 0; cpu <= mp_maxid; cpu++) {
if (CPU_ABSENT(cpu))
continue;
@@ -2171,12 +2516,12 @@ sysctl_vm_zone(SYSCTL_HANDLER_ARGS)
LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link) {
cachefree += bucket->ub_cnt;
}
- totalfree = z->uz_free + cachefree;
+ totalfree = zk->uk_free + cachefree;
len = snprintf(offset, linesize,
"%-12.12s %6.6u, %8.8u, %6.6u, %6.6u, %8.8llu\n",
- z->uz_name, z->uz_size,
- z->uz_maxpages * z->uz_ipers,
- (z->uz_ipers * (z->uz_pages / z->uz_ppera)) - totalfree,
+ z->uz_name, zk->uk_size,
+ zk->uk_maxpages * zk->uk_ipers,
+ (zk->uk_ipers * (zk->uk_pages / zk->uk_ppera)) - totalfree,
totalfree,
(unsigned long long)z->uz_allocs);
ZONE_UNLOCK(z);
@@ -2185,6 +2530,7 @@ sysctl_vm_zone(SYSCTL_HANDLER_ARGS)
p[1] = ':';
cnt--;
offset += len;
+ }
}
mtx_unlock(&uma_mtx);
*offset++ = '\0';
diff --git a/sys/vm/uma_dbg.c b/sys/vm/uma_dbg.c
index 85d067d..0f845cf 100644
--- a/sys/vm/uma_dbg.c
+++ b/sys/vm/uma_dbg.c
@@ -192,15 +192,17 @@ static uma_slab_t
uma_dbg_getslab(uma_zone_t zone, void *item)
{
uma_slab_t slab;
+ uma_keg_t keg;
u_int8_t *mem;
+ keg = zone->uz_keg;
mem = (u_int8_t *)((unsigned long)item & (~UMA_SLAB_MASK));
- if (zone->uz_flags & UMA_ZONE_MALLOC) {
+ if (keg->uk_flags & UMA_ZONE_MALLOC) {
slab = vtoslab((vm_offset_t)mem);
- } else if (zone->uz_flags & UMA_ZONE_HASH) {
- slab = hash_sfind(&zone->uz_hash, mem);
+ } else if (keg->uk_flags & UMA_ZONE_HASH) {
+ slab = hash_sfind(&keg->uk_hash, mem);
} else {
- mem += zone->uz_pgoff;
+ mem += keg->uk_pgoff;
slab = (uma_slab_t)mem;
}
@@ -215,8 +217,10 @@ uma_dbg_getslab(uma_zone_t zone, void *item)
void
uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
{
+ uma_keg_t keg;
int freei;
+ keg = zone->uz_keg;
if (slab == NULL) {
slab = uma_dbg_getslab(zone, item);
if (slab == NULL)
@@ -225,9 +229,9 @@ uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
}
freei = ((unsigned long)item - (unsigned long)slab->us_data)
- / zone->uz_rsize;
+ / keg->uk_rsize;
- slab->us_freelist[freei] = 255;
+ slab->us_freelist[freei].us_item = 255;
return;
}
@@ -241,8 +245,10 @@ uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
void
uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
{
+ uma_keg_t keg;
int freei;
+ keg = zone->uz_keg;
if (slab == NULL) {
slab = uma_dbg_getslab(zone, item);
if (slab == NULL)
@@ -251,22 +257,22 @@ uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
}
freei = ((unsigned long)item - (unsigned long)slab->us_data)
- / zone->uz_rsize;
+ / keg->uk_rsize;
- if (freei >= zone->uz_ipers)
+ if (freei >= keg->uk_ipers)
panic("zone: %s(%p) slab %p freelist %d out of range 0-%d\n",
- zone->uz_name, zone, slab, freei, zone->uz_ipers-1);
+ zone->uz_name, zone, slab, freei, keg->uk_ipers-1);
- if (((freei * zone->uz_rsize) + slab->us_data) != item) {
+ if (((freei * keg->uk_rsize) + slab->us_data) != item) {
printf("zone: %s(%p) slab %p freed address %p unaligned.\n",
zone->uz_name, zone, slab, item);
panic("should be %p\n",
- (freei * zone->uz_rsize) + slab->us_data);
+ (freei * keg->uk_rsize) + slab->us_data);
}
- if (slab->us_freelist[freei] != 255) {
+ if (slab->us_freelist[freei].us_item != 255) {
printf("Slab at %p, freei %d = %d.\n",
- slab, freei, slab->us_freelist[freei]);
+ slab, freei, slab->us_freelist[freei].us_item);
panic("Duplicate free of item %p from zone %p(%s)\n",
item, zone, zone->uz_name);
}
@@ -276,5 +282,5 @@ uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
* Until then the count of valid slabs will make sure we don't
* accidentally follow this and assume it's a valid index.
*/
- slab->us_freelist[freei] = 0;
+ slab->us_freelist[freei].us_item = 0;
}
diff --git a/sys/vm/uma_int.h b/sys/vm/uma_int.h
index 35acfde..a4cbe5f 100644
--- a/sys/vm/uma_int.h
+++ b/sys/vm/uma_int.h
@@ -35,10 +35,10 @@
/*
* Here's a quick description of the relationship between the objects:
*
- * Zones contain lists of slabs which are stored in either the full bin, empty
+ * Kegs contain lists of slabs which are stored in either the full bin, empty
* bin, or partially allocated bin, to reduce fragmentation. They also contain
* the user supplied value for size, which is adjusted for alignment purposes
- * and rsize is the result of that. The zone also stores information for
+ * and rsize is the result of that. The Keg also stores information for
* managing a hash of page addresses that maps pages to uma_slab_t structures
* for pages that don't have embedded uma_slab_t's.
*
@@ -67,6 +67,20 @@
* so at this time it may not make sense to optimize for it. This can, of
* course, be solved with dynamic slab sizes.
*
+ * Kegs may serve multiple Zones but by far most of the time they only serve
+ * one. When a Zone is created, a Keg is allocated and setup for it. While
+ * the backing Keg stores slabs, the Zone caches Buckets of items allocated
+ * from the slabs. Each Zone is equipped with an init/fini and ctor/dtor
+ * pair, as well as with its own set of small per-CPU caches, layered above
+ * the Zone's general Bucket cache.
+ *
+ * The PCPU caches are protected by their own locks, while the Zones backed
+ * by the same Keg all share a common Keg lock (to coalesce contention on
+ * the backing slabs). The backing Keg typically only serves one Zone but
+ * in the case of multiple Zones, one of the Zones is considered the
+ * Master Zone and all Zone-related stats from the Keg are done in the
+ * Master Zone. For an example of a Multi-Zone setup, refer to the
+ * Mbuf allocation code.
*/
/*
@@ -134,28 +148,6 @@
SLIST_REMOVE(&(h)->uh_slab_hash[UMA_HASH((h), \
(mem))], (s), uma_slab, us_hlink);
-/* Page management structure */
-
-/* Sorry for the union, but space efficiency is important */
-struct uma_slab {
- uma_zone_t us_zone; /* Zone we live in */
- union {
- LIST_ENTRY(uma_slab) _us_link; /* slabs in zone */
- unsigned long _us_size; /* Size of allocation */
- } us_type;
- SLIST_ENTRY(uma_slab) us_hlink; /* Link for hash table */
- u_int8_t *us_data; /* First item */
- u_int8_t us_flags; /* Page flags see uma.h */
- u_int8_t us_freecount; /* How many are free? */
- u_int8_t us_firstfree; /* First free item index */
- u_int8_t us_freelist[1]; /* Free List (actually larger) */
-};
-
-#define us_link us_type._us_link
-#define us_size us_type._us_size
-
-typedef struct uma_slab * uma_slab_t;
-
/* Hash table for freed address -> slab translation */
SLIST_HEAD(slabhead, uma_slab);
@@ -188,6 +180,97 @@ struct uma_cache {
typedef struct uma_cache * uma_cache_t;
/*
+ * Keg management structure
+ *
+ * TODO: Optimize for cache line size
+ *
+ */
+struct uma_keg {
+ LIST_ENTRY(uma_keg) uk_link; /* List of all kegs */
+
+ struct mtx uk_lock; /* Lock for the keg */
+ struct uma_hash uk_hash;
+
+ LIST_HEAD(,uma_zone) uk_zones; /* Keg's zones */
+ LIST_HEAD(,uma_slab) uk_part_slab; /* partially allocated slabs */
+ LIST_HEAD(,uma_slab) uk_free_slab; /* empty slab list */
+ LIST_HEAD(,uma_slab) uk_full_slab; /* full slabs */
+
+ u_int32_t uk_recurse; /* Allocation recursion count */
+ u_int32_t uk_align; /* Alignment mask */
+ u_int32_t uk_pages; /* Total page count */
+ u_int32_t uk_free; /* Count of items free in slabs */
+ u_int32_t uk_size; /* Requested size of each item */
+ u_int32_t uk_rsize; /* Real size of each item */
+ u_int32_t uk_maxpages; /* Maximum number of pages to alloc */
+
+ uma_init uk_init; /* Keg's init routine */
+ uma_fini uk_fini; /* Keg's fini routine */
+ uma_alloc uk_allocf; /* Allocation function */
+ uma_free uk_freef; /* Free routine */
+
+ struct vm_object *uk_obj; /* Zone specific object */
+ vm_offset_t uk_kva; /* Base kva for zones with objs */
+ uma_zone_t uk_slabzone; /* Slab zone backing us, if OFFPAGE */
+
+ u_int16_t uk_pgoff; /* Offset to uma_slab struct */
+ u_int16_t uk_ppera; /* pages per allocation from backend */
+ u_int16_t uk_ipers; /* Items per slab */
+ u_int16_t uk_flags; /* Internal flags */
+};
+
+/* Simpler reference to uma_keg for internal use. */
+typedef struct uma_keg * uma_keg_t;
+
+/* Page management structure */
+
+/* Sorry for the union, but space efficiency is important */
+struct uma_slab_head {
+ uma_keg_t us_keg; /* Keg we live in */
+ union {
+ LIST_ENTRY(uma_slab) _us_link; /* slabs in zone */
+ unsigned long _us_size; /* Size of allocation */
+ } us_type;
+ SLIST_ENTRY(uma_slab) us_hlink; /* Link for hash table */
+ u_int8_t *us_data; /* First item */
+ u_int8_t us_flags; /* Page flags see uma.h */
+ u_int8_t us_freecount; /* How many are free? */
+ u_int8_t us_firstfree; /* First free item index */
+};
+
+/* The standard slab structure */
+struct uma_slab {
+ struct uma_slab_head us_head; /* slab header data */
+ struct {
+ u_int8_t us_item;
+ } us_freelist[1]; /* actual number bigger */
+};
+
+/*
+ * The slab structure for UMA_ZONE_REFCNT zones, whose items carry
+ * reference counters maintained in the slab itself.
+ */
+struct uma_slab_refcnt {
+ struct uma_slab_head us_head; /* slab header data */
+ struct {
+ u_int8_t us_item;
+ u_int32_t us_refcnt;
+ } us_freelist[1]; /* actual number bigger */
+};
+
+#define us_keg us_head.us_keg
+#define us_link us_head.us_type._us_link
+#define us_size us_head.us_type._us_size
+#define us_hlink us_head.us_hlink
+#define us_data us_head.us_data
+#define us_flags us_head.us_flags
+#define us_freecount us_head.us_freecount
+#define us_firstfree us_head.us_firstfree
+
+typedef struct uma_slab * uma_slab_t;
+typedef struct uma_slab_refcnt * uma_slabrefcnt_t;
+
+/*
* Zone management structure
*
* TODO: Optimize for cache line size
@@ -195,42 +278,22 @@ typedef struct uma_cache * uma_cache_t;
*/
struct uma_zone {
char *uz_name; /* Text name of the zone */
- LIST_ENTRY(uma_zone) uz_link; /* List of all zones */
- u_int32_t uz_align; /* Alignment mask */
- u_int32_t uz_pages; /* Total page count */
-
-/* Used during alloc / free */
- struct mtx uz_lock; /* Lock for the zone */
- u_int32_t uz_free; /* Count of items free in slabs */
- u_int16_t uz_ipers; /* Items per slab */
- u_int16_t uz_flags; /* Internal flags */
-
- LIST_HEAD(,uma_slab) uz_part_slab; /* partially allocated slabs */
- LIST_HEAD(,uma_slab) uz_free_slab; /* empty slab list */
- LIST_HEAD(,uma_slab) uz_full_slab; /* full slabs */
+ struct mtx *uz_lock; /* Lock for the zone (keg's lock) */
+ uma_keg_t uz_keg; /* Our underlying Keg */
+
+ LIST_ENTRY(uma_zone) uz_link; /* List of all zones in keg */
LIST_HEAD(,uma_bucket) uz_full_bucket; /* full buckets */
LIST_HEAD(,uma_bucket) uz_free_bucket; /* Buckets for frees */
- u_int32_t uz_size; /* Requested size of each item */
- u_int32_t uz_rsize; /* Real size of each item */
-
- struct uma_hash uz_hash;
- u_int16_t uz_pgoff; /* Offset to uma_slab struct */
- u_int16_t uz_ppera; /* pages per allocation from backend */
uma_ctor uz_ctor; /* Constructor for each allocation */
uma_dtor uz_dtor; /* Destructor */
- u_int64_t uz_allocs; /* Total number of allocations */
-
uma_init uz_init; /* Initializer for each item */
uma_fini uz_fini; /* Discards memory */
- uma_alloc uz_allocf; /* Allocation function */
- uma_free uz_freef; /* Free routine */
- struct vm_object *uz_obj; /* Zone specific object */
- vm_offset_t uz_kva; /* Base kva for zones with objs */
- u_int32_t uz_maxpages; /* Maximum number of pages to alloc */
- int uz_recurse; /* Allocation recursion count */
+
+ u_int64_t uz_allocs; /* Total number of allocations */
uint16_t uz_fills; /* Outstanding bucket fills */
uint16_t uz_count; /* Highest value ub_ptr can have */
+
/*
* This HAS to be the last item because we adjust the zone size
* based on NCPU and then allocate the space for the zones.
@@ -256,16 +319,16 @@ void uma_large_free(uma_slab_t slab);
#define ZONE_LOCK_INIT(z, lc) \
do { \
if ((lc)) \
- mtx_init(&(z)->uz_lock, (z)->uz_name, \
+ mtx_init((z)->uz_lock, (z)->uz_name, \
(z)->uz_name, MTX_DEF | MTX_DUPOK); \
else \
- mtx_init(&(z)->uz_lock, (z)->uz_name, \
+ mtx_init((z)->uz_lock, (z)->uz_name, \
"UMA zone", MTX_DEF | MTX_DUPOK); \
} while (0)
-#define ZONE_LOCK_FINI(z) mtx_destroy(&(z)->uz_lock)
-#define ZONE_LOCK(z) mtx_lock(&(z)->uz_lock)
-#define ZONE_UNLOCK(z) mtx_unlock(&(z)->uz_lock)
+#define ZONE_LOCK_FINI(z) mtx_destroy((z)->uz_lock)
+#define ZONE_LOCK(z) mtx_lock((z)->uz_lock)
+#define ZONE_UNLOCK(z) mtx_unlock((z)->uz_lock)
#define CPU_LOCK_INIT(cpu) \
mtx_init(&uma_pcpu_mtx[(cpu)], "UMA pcpu", "UMA pcpu", \
diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c
index 3e21a99..f71785f 100644
--- a/sys/vm/vm_kern.c
+++ b/sys/vm/vm_kern.c
@@ -320,16 +320,6 @@ kmem_malloc(map, size, flags)
vm_map_lock(map);
if (vm_map_findspace(map, vm_map_min(map), size, &addr)) {
vm_map_unlock(map);
- if (map != kmem_map) {
- static int last_report; /* when we did it (in ticks) */
- if (ticks < last_report ||
- (ticks - last_report) >= hz) {
- last_report = ticks;
- printf("Out of mbuf address space!\n");
- printf("Consider increasing NMBCLUSTERS\n");
- }
- return (0);
- }
if ((flags & M_NOWAIT) == 0)
panic("kmem_malloc(%ld): kmem_map too small: %ld total allocated",
(long)size, (long)map->size);
diff --git a/usr.bin/netstat/main.c b/usr.bin/netstat/main.c
index ac9dd26..8992599 100644
--- a/usr.bin/netstat/main.c
+++ b/usr.bin/netstat/main.c
@@ -256,7 +256,6 @@ static char *nlistf = NULL, *memf = NULL;
int Aflag; /* show addresses of protocol control block */
int aflag; /* show all sockets (including servers) */
int bflag; /* show i/f total bytes in/out */
-int cflag; /* show mbuf cache information */
int dflag; /* show i/f dropped packets */
int gflag; /* show group (multicast) routing or stats */
int iflag; /* show interfaces */
@@ -297,9 +296,6 @@ main(int argc, char *argv[])
case 'b':
bflag = 1;
break;
- case 'c':
- cflag = 1;
- break;
case 'd':
dflag = 1;
break;
@@ -425,10 +421,6 @@ main(int argc, char *argv[])
if (nlistf != NULL || memf != NULL)
setgid(getgid());
- if (cflag && !mflag) {
- (void)fprintf(stderr, "-c only valid with -m\n");
- usage();
- }
if (mflag) {
if (memf != NULL) {
if (kread(0, 0, 0) == 0)
diff --git a/usr.bin/netstat/mbuf.c b/usr.bin/netstat/mbuf.c
index aa6a8d2..98546c4 100644
--- a/usr.bin/netstat/mbuf.c
+++ b/usr.bin/netstat/mbuf.c
@@ -99,17 +99,12 @@ mbpr(u_long mbaddr, u_long mbtaddr __unused, u_long nmbcaddr, u_long nmbufaddr,
u_long mbhiaddr, u_long clhiaddr, u_long mbloaddr, u_long clloaddr,
u_long cpusaddr __unused, u_long pgsaddr, u_long mbpaddr)
{
- int i, j, nmbufs, nmbclusters, page_size, num_objs;
+ int i, nmbclusters;
int nsfbufs, nsfbufspeak, nsfbufsused;
- u_int mbuf_hiwm, clust_hiwm, mbuf_lowm, clust_lowm;
- u_long totspace[2], totused[2];
- u_long gentotnum, gentotfree, totnum, totfree;
- u_long totmem, totmemalloced, totmemused;
short nmbtypes;
size_t mlen;
long *mbtypes = NULL;
struct mbstat *mbstat = NULL;
- struct mbpstat **mbpstat = NULL;
struct mbtypenames *mp;
bool *seen = NULL;
@@ -119,50 +114,12 @@ mbpr(u_long mbaddr, u_long mbtaddr __unused, u_long nmbcaddr, u_long nmbufaddr,
goto err;
}
- /*
- * XXX: Unfortunately, for the time being, we have to fetch
- * the total length of the per-CPU stats area via sysctl
- * (regardless of whether we're looking at a core or not.
- */
- if (sysctlbyname("kern.ipc.mb_statpcpu", NULL, &mlen, NULL, 0) < 0) {
- warn("sysctl: retrieving mb_statpcpu len");
- goto err;
- }
- num_objs = (int)(mlen / sizeof(struct mbpstat));
- if ((mbpstat = calloc(num_objs, sizeof(struct mbpstat *))) == NULL) {
- warn("calloc: cannot allocate memory for mbpstats pointers");
- goto err;
- }
- if ((mbpstat[0] = calloc(num_objs, sizeof(struct mbpstat))) == NULL) {
- warn("calloc: cannot allocate memory for mbpstats");
- goto err;
- }
-
if (mbaddr) {
- if (kread(mbpaddr, (char *)mbpstat[0], mlen))
- goto err;
if (kread(mbaddr, (char *)mbstat, sizeof mbstat))
goto err;
if (kread(nmbcaddr, (char *)&nmbclusters, sizeof(int)))
goto err;
- if (kread(nmbufaddr, (char *)&nmbufs, sizeof(int)))
- goto err;
- if (kread(mbhiaddr, (char *)&mbuf_hiwm, sizeof(u_int)))
- goto err;
- if (kread(clhiaddr, (char *)&clust_hiwm, sizeof(u_int)))
- goto err;
- if (kread(mbloaddr, (char *)&mbuf_lowm, sizeof(u_int)))
- goto err;
- if (kread(clloaddr, (char *)&clust_lowm, sizeof(u_int)))
- goto err;
- if (kread(pgsaddr, (char *)&page_size, sizeof(int)))
- goto err;
} else {
- if (sysctlbyname("kern.ipc.mb_statpcpu", mbpstat[0], &mlen,
- NULL, 0) < 0) {
- warn("sysctl: retrieving mb_statpcpu");
- goto err;
- }
mlen = sizeof *mbstat;
if (sysctlbyname("kern.ipc.mbstat", mbstat, &mlen, NULL, 0)
< 0) {
@@ -175,43 +132,9 @@ mbpr(u_long mbaddr, u_long mbtaddr __unused, u_long nmbcaddr, u_long nmbufaddr,
warn("sysctl: retrieving nmbclusters");
goto err;
}
- mlen = sizeof(int);
- if (sysctlbyname("kern.ipc.nmbufs", &nmbufs, &mlen, NULL, 0)
- < 0) {
- warn("sysctl: retrieving nmbufs");
- goto err;
- }
- mlen = sizeof(u_int);
- if (sysctlbyname("kern.ipc.mbuf_hiwm", &mbuf_hiwm, &mlen,
- NULL, 0) < 0) {
- warn("sysctl: retrieving mbuf_hiwm");
- goto err;
- }
- mlen = sizeof(u_int);
- if (sysctlbyname("kern.ipc.clust_hiwm", &clust_hiwm, &mlen,
- NULL, 0) < 0) {
- warn("sysctl: retrieving clust_hiwm");
- goto err;
- }
- mlen = sizeof(u_int);
- if (sysctlbyname("kern.ipc.mbuf_lowm", &mbuf_lowm, &mlen,
- NULL, 0) < 0) {
- warn("sysctl: retrieving mbuf_lowm");
- goto err;
- }
- mlen = sizeof(u_int);
- if (sysctlbyname("kern.ipc.clust_lowm", &clust_lowm, &mlen,
- NULL, 0) < 0) {
- warn("sysctl: retrieving clust_lowm");
- goto err;
- }
- mlen = sizeof(int);
- if (sysctlbyname("hw.pagesize", &page_size, &mlen, NULL, 0)
- < 0) {
- warn("sysctl: retrieving hw.pagesize");
- goto err;
- }
}
+ if (mbstat->m_mbufs < 0) mbstat->m_mbufs = 0; /* XXX */
+ if (mbstat->m_mclusts < 0) mbstat->m_mclusts = 0; /* XXX */
nmbtypes = mbstat->m_numtypes;
if ((seen = calloc(nmbtypes, sizeof(*seen))) == NULL) {
@@ -223,59 +146,13 @@ mbpr(u_long mbaddr, u_long mbtaddr __unused, u_long nmbcaddr, u_long nmbufaddr,
goto err;
}
- for (i = 0; i < num_objs; i++)
- mbpstat[i] = mbpstat[0] + i;
-
#undef MSIZE
#define MSIZE (mbstat->m_msize)
#undef MCLBYTES
#define MCLBYTES (mbstat->m_mclbytes)
-#define GENLST (num_objs - 1)
- totnum = mbpstat[GENLST]->mb_mbbucks * mbstat->m_mbperbuck;
- totfree = mbpstat[GENLST]->mb_mbfree;
- for (j = 1; j < nmbtypes; j++)
- mbtypes[j] += mbpstat[GENLST]->mb_mbtypes[j];
- totspace[0] = mbpstat[GENLST]->mb_mbbucks * mbstat->m_mbperbuck * MSIZE;
- for (i = 0; i < (num_objs - 1); i++) {
- if (mbpstat[i]->mb_active == 0)
- continue;
- totspace[0] += mbpstat[i]->mb_mbbucks*mbstat->m_mbperbuck*MSIZE;
- totnum += mbpstat[i]->mb_mbbucks * mbstat->m_mbperbuck;
- totfree += mbpstat[i]->mb_mbfree;
- for (j = 1; j < nmbtypes; j++)
- mbtypes[j] += mbpstat[i]->mb_mbtypes[j];
- }
- totused[0] = totnum - totfree;
- if (cflag) {
- printf("mbuf usage:\n"
- "\tTotal:\t\t%lu/%lu/%d (in use/in pool/max)\n",
- totused[0], totnum, nmbufs);
- gentotnum = mbpstat[GENLST]->mb_mbbucks * mbstat->m_mbperbuck;
- gentotfree = mbpstat[GENLST]->mb_mbfree;
- printf("\tGEN cache:\t%lu/%lu (in use/in pool)\n",
- gentotnum - gentotfree, gentotnum);
- } else {
- /* XXX: peak is now wrong. */
- printf("%lu/%lu/%d mbufs in use (current/peak/max):\n",
- totused[0], totnum, nmbufs);
- }
+ printf("%lu mbufs in use\n", mbstat->m_mbufs);
- for (i = 0; cflag && i < (num_objs - 1); i++) {
- if (mbpstat[i]->mb_active == 0)
- continue;
- printf("\tCPU #%d cache:\t%lu/%lu (in use/in pool)\n",
- i,
- (mbpstat[i]->mb_mbbucks * mbstat->m_mbperbuck -
- mbpstat[i]->mb_mbfree),
- (mbpstat[i]->mb_mbbucks * mbstat->m_mbperbuck));
- }
- if (cflag) {
- printf("\tMbuf cache high watermark: %d\n", mbuf_hiwm);
-#ifdef NOTYET
- printf("\tMbuf cache low watermark: %d\n", mbuf_lowm);
-#endif
- }
for (mp = mbtypenames; mp->mt_name; mp++) {
if (mbtypes[mp->mt_type]) {
seen[mp->mt_type] = YES;
@@ -288,53 +165,10 @@ mbpr(u_long mbaddr, u_long mbtaddr __unused, u_long nmbcaddr, u_long nmbufaddr,
printf("\t %lu mbufs allocated to <mbuf type: %d>\n",
mbtypes[i], i);
}
- if (cflag)
- printf("\t%.1f%% of mbuf map consumed\n",
- totspace[0] * 100.0 / (nmbufs * MSIZE));
- totnum = mbpstat[GENLST]->mb_clbucks * mbstat->m_clperbuck;
- totfree = mbpstat[GENLST]->mb_clfree;
- totspace[1] = mbpstat[GENLST]->mb_clbucks*mbstat->m_clperbuck*MCLBYTES;
- for (i = 0; i < (num_objs - 1); i++) {
- if (mbpstat[i]->mb_active == 0)
- continue;
- totspace[1] += mbpstat[i]->mb_clbucks * mbstat->m_clperbuck
- * MCLBYTES;
- totnum += mbpstat[i]->mb_clbucks * mbstat->m_clperbuck;
- totfree += mbpstat[i]->mb_clfree;
- }
- totused[1] = totnum - totfree;
- if (cflag) {
- printf("mbuf cluster usage:\n"
- "\tTotal:\t\t%lu/%lu/%d (in use/in pool/max)\n",
- totused[1], totnum, nmbclusters);
- gentotnum = mbpstat[GENLST]->mb_clbucks * mbstat->m_clperbuck;
- gentotfree = mbpstat[GENLST]->mb_clfree;
- printf("\tGEN cache:\t%lu/%lu (in use/in pool)\n",
- gentotnum - gentotfree, gentotnum);
- } else {
- /* XXX: peak is now wrong. */
- printf("%lu/%lu/%d mbuf clusters in use (current/peak/max)\n",
- totused[1], totnum, nmbclusters);
- }
- for (i = 0; cflag && i < (num_objs - 1); i++) {
- if (mbpstat[i]->mb_active == 0)
- continue;
- printf("\tCPU #%d cache:\t%lu/%lu (in use/in pool)\n",
- i,
- (mbpstat[i]->mb_clbucks * mbstat->m_clperbuck -
- mbpstat[i]->mb_clfree),
- (mbpstat[i]->mb_clbucks * mbstat->m_clperbuck));
- }
- if (cflag) {
- printf("\tCluster cache high watermark: %d\n", clust_hiwm);
-#ifdef NOTYET
- printf("\tCluster cache low watermark: %d\n", clust_lowm);
-#endif
- }
- if (cflag)
- printf("\t%.1f%% of cluster map consumed\n",
- totspace[1] * 100.0 / (nmbclusters * MCLBYTES));
+ printf("%lu/%d mbuf clusters in use (current/max)\n",
+ mbstat->m_mclusts, nmbclusters);
+
mlen = sizeof(nsfbufs);
if (!sysctlbyname("kern.ipc.nsfbufs", &nsfbufs, &mlen, NULL, 0) &&
!sysctlbyname("kern.ipc.nsfbufsused", &nsfbufsused, &mlen, NULL,
@@ -344,15 +178,8 @@ mbpr(u_long mbaddr, u_long mbtaddr __unused, u_long nmbcaddr, u_long nmbufaddr,
printf("%d/%d/%d sfbufs in use (current/peak/max)\n",
nsfbufsused, nsfbufspeak, nsfbufs);
}
- totmem = nmbufs * MSIZE + nmbclusters * MCLBYTES;
- totmemalloced = totspace[0] + totspace[1];
- totmemused = totused[0] * MSIZE + totused[1] * MCLBYTES;
- printf(
- "%lu KBytes allocated to network (%.1f%% in use, %.1f%% wired)\n",
- totmem / 1024, totmemused * 100.0 / totmem,
- totmemalloced * 100.0 / totmem);
- printf("%lu requests for memory denied\n", mbstat->m_drops);
- printf("%lu requests for memory delayed\n", mbstat->m_wait);
+ printf("%lu KBytes allocated to network\n", (mbstat->m_mbufs * MSIZE +
+ mbstat->m_mclusts * MCLBYTES) / 1024);
printf("%lu requests for sfbufs denied\n", mbstat->sf_allocfail);
printf("%lu requests for sfbufs delayed\n", mbstat->sf_allocwait);
printf("%lu requests for I/O initiated by sendfile\n",
@@ -366,9 +193,4 @@ err:
free(seen);
if (mbstat != NULL)
free(mbstat);
- if (mbpstat != NULL) {
- if (mbpstat[0] != NULL)
- free(mbpstat[0]);
- free(mbpstat);
- }
}
diff --git a/usr.bin/netstat/netstat.1 b/usr.bin/netstat/netstat.1
index 45023fe..32edfec 100644
--- a/usr.bin/netstat/netstat.1
+++ b/usr.bin/netstat/netstat.1
@@ -181,7 +181,6 @@ or for a single
.Bk -words
.Nm
.Fl m
-.Op Fl c
.Op Fl M Ar core
.Op Fl N Ar system
.Ek
@@ -189,9 +188,6 @@ or for a single
Show statistics recorded by the memory management routines
.Pq Xr mbuf 9 .
The network manages a private pool of memory buffers.
-The
-.Fl c
-option shows per-CPU statistics for caching.
.It Xo
.Bk -words
.Nm
diff --git a/usr.bin/netstat/netstat.h b/usr.bin/netstat/netstat.h
index c59b7e8..e2b3f29 100644
--- a/usr.bin/netstat/netstat.h
+++ b/usr.bin/netstat/netstat.h
@@ -39,7 +39,6 @@
extern int Aflag; /* show addresses of protocol control block */
extern int aflag; /* show all sockets (including servers) */
extern int bflag; /* show i/f total bytes in/out */
-extern int cflag; /* show mbuf cache information */
extern int dflag; /* show i/f dropped packets */
extern int gflag; /* show group (multicast) routing or stats */
extern int iflag; /* show interfaces */
diff --git a/usr.bin/systat/mbufs.c b/usr.bin/systat/mbufs.c
index e1b665b2..1193a3e 100644
--- a/usr.bin/systat/mbufs.c
+++ b/usr.bin/systat/mbufs.c
@@ -52,12 +52,9 @@ static const char sccsid[] = "@(#)mbufs.c 8.1 (Berkeley) 6/6/93";
#include "systat.h"
#include "extern.h"
-static struct mbpstat **mbpstat;
static struct mbstat *mbstat;
-static int num_objs;
static long *m_mbtypes;
static short nmbtypes;
-#define GENLST (num_objs - 1)
static struct mtnames {
short mt_type;
@@ -101,20 +98,11 @@ void
showmbufs()
{
int i, j, max, idx;
- u_long totfree;
+ u_long totmbufs;
char buf[10];
const char *mtname;
- totfree = mbpstat[GENLST]->mb_mbfree;
- for (i = 1; i < nmbtypes; i++)
- m_mbtypes[i] += mbpstat[GENLST]->mb_mbtypes[i];
- for (i = 0; i < GENLST; i++) {
- if (mbpstat[i]->mb_active == 0)
- continue;
- totfree += mbpstat[i]->mb_mbfree;
- for (j = 1; j < nmbtypes; j++)
- m_mbtypes[j] += mbpstat[i]->mb_mbtypes[j];
- }
+ totmbufs = mbstat->m_mbufs;
/*
* Print totals for different mbuf types.
@@ -159,16 +147,16 @@ showmbufs()
/*
* Print total number of free mbufs.
*/
- if (totfree > 0) {
- mvwprintw(wnd, 1+j, 0, "%-10.10s", "free");
- if (totfree > 60) {
- snprintf(buf, sizeof(buf), " %lu", totfree);
- totfree = 60;
- while(totfree--)
+ if (totmbufs > 0) {
+ mvwprintw(wnd, 1+j, 0, "%-10.10s", "Mbufs");
+ if (totmbufs > 60) {
+ snprintf(buf, sizeof(buf), " %lu", totmbufs);
+ totmbufs = 60;
+ while(totmbufs--)
waddch(wnd, 'X');
waddstr(wnd, buf);
} else {
- while(totfree--)
+ while(totmbufs--)
waddch(wnd, 'X');
}
wclrtoeol(wnd);
@@ -198,23 +186,6 @@ initmbufs()
return 0;
}
- if (sysctlbyname("kern.ipc.mb_statpcpu", NULL, &len, NULL, 0) < 0) {
- error("sysctl getting mbpstat total size failed");
- return 0;
- }
- num_objs = (int)(len / sizeof(struct mbpstat));
- if ((mbpstat = calloc(num_objs, sizeof(struct mbpstat *))) == NULL) {
- error("calloc mbpstat pointers failed");
- return 0;
- }
- if ((mbpstat[0] = calloc(num_objs, sizeof(struct mbpstat))) == NULL) {
- error("calloc mbpstat structures failed");
- return 0;
- }
-
- for (i = 0; i < num_objs; i++)
- mbpstat[i] = mbpstat[0] + i;
-
return 1;
}
@@ -223,7 +194,7 @@ fetchmbufs()
{
size_t len;
- len = num_objs * sizeof(struct mbpstat);
- if (sysctlbyname("kern.ipc.mb_statpcpu", mbpstat[0], &len, NULL, 0) < 0)
- printw("sysctl: mbpstat: %s", strerror(errno));
+ len = sizeof *mbstat;
+ if (sysctlbyname("kern.ipc.mbstat", mbstat, &len, NULL, 0) < 0)
+ printw("sysctl: mbstat: %s", strerror(errno));
}
OpenPOWER on IntegriCloud