author     bmilekic <bmilekic@FreeBSD.org>  2004-05-31 21:46:06 +0000
committer  bmilekic <bmilekic@FreeBSD.org>  2004-05-31 21:46:06 +0000
commit     f7574a2276b935509aba6b131a39c685a68e61d2 (patch)
tree       dacbb577a5d3ed365d11df0435010eee4c5380da /sys
parent     d5d90e314729317ee9cce434f3c548b3f4aaaf04 (diff)
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.

Extensions to UMA worth noting:

- Better layering between slab <-> zone caches; introduce a Keg structure
  which splits the slab cache off from the zone structure and allows
  multiple zones to be stacked on top of a single Keg (a single type of
  slab cache); perhaps we should look into defining a subset API on top of
  the Keg for special use by malloc(9), for example.
- UMA_ZONE_REFCNT zones can now be added, and reference counters are
  automagically allocated for them within the end of the associated slab
  structures.  uma_find_refcnt() does a kextract to fetch the slab struct
  reference from the underlying page, and looks up the corresponding refcnt.

mbuma things worth noting:

- Integrates mbuf & cluster allocations with extended UMA and provides
  caches for commonly-allocated items; defines several zones (two primary,
  one secondary) and two kegs.
- Changes up certain code paths that always used to do m_get() + m_clget()
  to instead just use m_getcl() and try to take advantage of the newly
  defined secondary Packet zone (a sketch of this pattern follows below).
- netstat(1) and systat(1) have been quickly hacked up to do basic stat
  reporting, but additional stats work needs to be done once some other
  details within UMA have been taken care of and it becomes clearer how
  stats will work within the modified framework.

From the user perspective, one implication is that the NMBCLUSTERS
compile-time option is no longer used.  The maximum number of clusters is
still capped according to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.  Work should be done to
write an appropriate sysctl handler allowing dynamic tuning of
kern.ipc.nmbclusters at runtime.

Additional things worth noting/known issues (READ):

- One report of the 'ips' (ServeRAID) driver acting really slow in
  conjunction with mbuma.  Need more data.  The latest report is that ips
  is equally slow with and without mbuma.
- A Giant leak in the NFS code sometimes occurs; I can't reproduce it but
  am currently analyzing.  brueffer is able to reproduce it, but THIS IS
  NOT an mbuma-specific problem and it currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one code path in the rip
  code where one or more locks are acquired and we end up in m_prepend()
  with M_WAITOK, which causes WITNESS to whine from within UMA.  Current
  temporary solution: force all UMA allocations to be M_NOWAIT from within
  UMA for now to avoid deadlocks, unless WITNESS is defined and we can
  determine with certainty that we're not holding any locks when we're
  M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-mbuf-still-attached
  panic.  I don't believe this to be related to mbuma, but please keep your
  eyes open, turn on debugging, and capture crash dumps.

This change removes more code than it adds.

A paper detailing the change and considering various performance issues is
available; it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation details, as well
as credits.

Testing and Debugging:  rwatson, brueffer, Ketrien I. Saihr-Kesenchedra, ...
Reviewed by:            Lots of people (for different parts)
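[Editor's sketch, not part of the commit: a minimal illustration of the
caller-side change described above, assuming the standard mbuf(9)
allocation interfaces of this era (m_gethdr(), m_clget(), m_getcl()).
The rx_alloc_*() function names are hypothetical.]

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>

/* Old pattern: two allocator trips, mbuf zone then cluster zone. */
static struct mbuf *
rx_alloc_old(void)
{
	struct mbuf *m;

	m = m_gethdr(M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return (NULL);
	m_clget(m, M_DONTWAIT);
	if ((m->m_flags & M_EXT) == 0) {
		/* No cluster could be attached; free the bare mbuf. */
		m_freem(m);
		return (NULL);
	}
	return (m);
}

/*
 * New pattern: a single call; with mbuma the mbuf+cluster pair is served
 * from the secondary Packet zone, so the common case hits one cache.
 */
static struct mbuf *
rx_alloc_new(void)
{

	return (m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR));
}

[As noted in the message, the cluster cap can also be lifted at boot by
setting the kern.ipc.nmbclusters tunable to zero, e.g.
kern.ipc.nmbclusters="0" in loader.conf(5).]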
Diffstat (limited to 'sys')
-rw-r--r--  sys/conf/files                        2
-rw-r--r--  sys/i386/i386/vm_machdep.c            7
-rw-r--r--  sys/kern/kern_malloc.c               27
-rw-r--r--  sys/kern/kern_mbuf.c                385
-rw-r--r--  sys/kern/subr_mbuf.c               1548
-rw-r--r--  sys/kern/uipc_mbuf.c                235
-rw-r--r--  sys/kern/uipc_mbuf2.c                40
-rw-r--r--  sys/kern/uipc_sockbuf.c              13
-rw-r--r--  sys/kern/uipc_socket.c               93
-rw-r--r--  sys/kern/uipc_socket2.c              13
-rw-r--r--  sys/kern/uipc_syscalls.c             16
-rw-r--r--  sys/sparc64/sparc64/vm_machdep.c      7
-rw-r--r--  sys/sys/mbuf.h                      203
-rw-r--r--  sys/vm/uma.h                         78
-rw-r--r--  sys/vm/uma_core.c                   890
-rw-r--r--  sys/vm/uma_dbg.c                     34
-rw-r--r--  sys/vm/uma_int.h                    175
-rw-r--r--  sys/vm/vm_kern.c                     10
18 files changed, 1680 insertions, 2096 deletions
diff --git a/sys/conf/files b/sys/conf/files
index c2d7e7e..0d48a92 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -1075,6 +1075,7 @@ kern/kern_lock.c standard
kern/kern_lockf.c standard
kern/kern_mac.c standard
kern/kern_malloc.c standard
+kern/kern_mbuf.c standard
kern/kern_mib.c standard
kern/kern_module.c standard
kern/kern_mutex.c standard
@@ -1116,7 +1117,6 @@ kern/subr_hints.c standard
kern/subr_kobj.c standard
kern/subr_log.c standard
kern/subr_mbpool.c optional libmbpool
-kern/subr_mbuf.c standard
kern/subr_mchain.c optional libmchain
kern/subr_module.c standard
kern/subr_msgbuf.c standard
diff --git a/sys/i386/i386/vm_machdep.c b/sys/i386/i386/vm_machdep.c
index 50fd06e..9a2f9e3 100644
--- a/sys/i386/i386/vm_machdep.c
+++ b/sys/i386/i386/vm_machdep.c
@@ -95,6 +95,10 @@ __FBSDID("$FreeBSD$");
#include <i386/isa/isa.h>
#endif
+#ifndef NSFBUFS
+#define NSFBUFS (512 + maxusers * 16)
+#endif
+
static void cpu_reset_real(void);
#ifdef SMP
static void cpu_reset_proxy(void);
@@ -584,6 +588,9 @@ sf_buf_init(void *arg)
vm_offset_t sf_base;
int i;
+ nsfbufs = NSFBUFS;
+ TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs);
+
sf_buf_active = hashinit(nsfbufs, M_TEMP, &sf_buf_hashmask);
TAILQ_INIT(&sf_buf_freelist);
sf_base = kmem_alloc_nofault(kernel_map, nsfbufs * PAGE_SIZE);
diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c
index c92e70f..4bc3348 100644
--- a/sys/kern/kern_malloc.c
+++ b/sys/kern/kern_malloc.c
@@ -191,6 +191,7 @@ malloc(size, type, flags)
int indx;
caddr_t va;
uma_zone_t zone;
+ uma_keg_t keg;
#ifdef DIAGNOSTIC
unsigned long osize = size;
#endif
@@ -235,6 +236,7 @@ malloc(size, type, flags)
size = (size & ~KMEM_ZMASK) + KMEM_ZBASE;
indx = kmemsize[size >> KMEM_ZSHIFT];
zone = kmemzones[indx].kz_zone;
+ keg = zone->uz_keg;
#ifdef MALLOC_PROFILE
krequests[size >> KMEM_ZSHIFT]++;
#endif
@@ -244,10 +246,11 @@ malloc(size, type, flags)
goto out;
ksp->ks_size |= 1 << indx;
- size = zone->uz_size;
+ size = keg->uk_size;
} else {
size = roundup(size, PAGE_SIZE);
zone = NULL;
+ keg = NULL;
va = uma_large_malloc(size, flags);
mtx_lock(&ksp->ks_mtx);
if (va == NULL)
@@ -309,7 +312,7 @@ free(addr, type)
#ifdef INVARIANTS
struct malloc_type **mtp = addr;
#endif
- size = slab->us_zone->uz_size;
+ size = slab->us_keg->uk_size;
#ifdef INVARIANTS
/*
* Cache a pointer to the malloc_type that most recently freed
@@ -325,7 +328,7 @@ free(addr, type)
sizeof(struct malloc_type *);
*mtp = type;
#endif
- uma_zfree_arg(slab->us_zone, addr, slab);
+ uma_zfree_arg(LIST_FIRST(&slab->us_keg->uk_zones), addr, slab);
} else {
size = slab->us_size;
uma_large_free(slab);
@@ -364,8 +367,8 @@ realloc(addr, size, type, flags)
("realloc: address %p out of range", (void *)addr));
/* Get the size of the original block */
- if (slab->us_zone)
- alloc = slab->us_zone->uz_size;
+ if (slab->us_keg)
+ alloc = slab->us_keg->uk_size;
else
alloc = slab->us_size;
@@ -410,7 +413,6 @@ kmeminit(dummy)
void *dummy;
{
u_int8_t indx;
- u_long npg;
u_long mem_size;
int i;
@@ -428,7 +430,7 @@ kmeminit(dummy)
* Note that the kmem_map is also used by the zone allocator,
* so make sure that there is enough space.
*/
- vm_kmem_size = VM_KMEM_SIZE;
+ vm_kmem_size = VM_KMEM_SIZE + nmbclusters * PAGE_SIZE;
mem_size = cnt.v_page_count;
#if defined(VM_KMEM_SIZE_SCALE)
@@ -462,17 +464,8 @@ kmeminit(dummy)
*/
init_param3(vm_kmem_size / PAGE_SIZE);
- /*
- * In mbuf_init(), we set up submaps for mbufs and clusters, in which
- * case we rounddown() (nmbufs * MSIZE) and (nmbclusters * MCLBYTES),
- * respectively. Mathematically, this means that what we do here may
- * amount to slightly more address space than we need for the submaps,
- * but it never hurts to have an extra page in kmem_map.
- */
- npg = (nmbufs*MSIZE + nmbclusters*MCLBYTES + vm_kmem_size) / PAGE_SIZE;
-
kmem_map = kmem_suballoc(kernel_map, (vm_offset_t *)&kmembase,
- (vm_offset_t *)&kmemlimit, (vm_size_t)(npg * PAGE_SIZE));
+ (vm_offset_t *)&kmemlimit, vm_kmem_size);
kmem_map->system_map = 1;
uma_startup2();
diff --git a/sys/kern/kern_mbuf.c b/sys/kern/kern_mbuf.c
new file mode 100644
index 0000000..2bec5ad
--- /dev/null
+++ b/sys/kern/kern_mbuf.c
@@ -0,0 +1,385 @@
+/*-
+ * Copyright (c) 2004
+ * Bosko Milekic <bmilekic@FreeBSD.org>.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of contributors may be
+ * used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_mac.h"
+#include "opt_param.h"
+
+#include <sys/param.h>
+#include <sys/mac.h>
+#include <sys/malloc.h>
+#include <sys/systm.h>
+#include <sys/mbuf.h>
+#include <sys/domain.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#include <sys/protosw.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <vm/uma.h>
+
+/*
+ * In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA
+ * Zones.
+ *
+ * Mbuf Clusters (2K, contiguous) are allocated from the Cluster
+ * Zone. The Zone can be capped at kern.ipc.nmbclusters, if the
+ * administrator so desires.
+ *
+ * Mbufs are allocated from a UMA Master Zone called the Mbuf
+ * Zone.
+ *
+ * Additionally, FreeBSD provides a Packet Zone, which it
+ * configures as a Secondary Zone to the Mbuf Master Zone,
+ * thus sharing backend Slab kegs with the Mbuf Master Zone.
+ *
+ * Thus common-case allocations and locking are simplified:
+ *
+ * m_clget() m_getcl()
+ * | |
+ * | .------------>[(Packet Cache)] m_get(), m_gethdr()
+ * | | [ Packet ] |
+ * [(Cluster Cache)] [ Secondary ] [ (Mbuf Cache) ]
+ * [ Cluster Zone ] [ Zone ] [ Mbuf Master Zone ]
+ * | \________ |
+ * [ Cluster Keg ] \ /
+ * | [ Mbuf Keg ]
+ * [ Cluster Slabs ] |
+ * | [ Mbuf Slabs ]
+ * \____________(VM)_________________/
+ */
+
+int nmbclusters;
+struct mbstat mbstat;
+
+static void
+tunable_mbinit(void *dummy)
+{
+
+ /* This has to be done before VM init. */
+ nmbclusters = 1024 + maxusers * 64;
+ TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
+}
+SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL);
+
+SYSCTL_DECL(_kern_ipc);
+SYSCTL_INT(_kern_ipc, OID_AUTO, nmbclusters, CTLFLAG_RW, &nmbclusters, 0,
+ "Maximum number of mbuf clusters allowed");
+SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat,
+ "Mbuf general information and statistics");
+
+/*
+ * Zones from which we allocate.
+ */
+uma_zone_t zone_mbuf;
+uma_zone_t zone_clust;
+uma_zone_t zone_pack;
+
+/*
+ * Local prototypes.
+ */
+static void mb_ctor_mbuf(void *, int, void *);
+static void mb_ctor_clust(void *, int, void *);
+static void mb_ctor_pack(void *, int, void *);
+static void mb_dtor_mbuf(void *, int, void *);
+static void mb_dtor_clust(void *, int, void *); /* XXX */
+static void mb_dtor_pack(void *, int, void *); /* XXX */
+static void mb_init_pack(void *, int);
+static void mb_fini_pack(void *, int);
+
+static void mb_reclaim(void *);
+static void mbuf_init(void *);
+
+/*
+ * Initialize FreeBSD Network buffer allocation.
+ */
+SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL)
+static void
+mbuf_init(void *dummy)
+{
+
+ /*
+ * Configure UMA zones for Mbufs, Clusters, and Packets.
+ */
+ zone_mbuf = uma_zcreate("Mbuf", MSIZE, mb_ctor_mbuf, mb_dtor_mbuf,
+ NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_MAXBUCKET);
+ zone_clust = uma_zcreate("MbufClust", MCLBYTES, mb_ctor_clust,
+ mb_dtor_clust, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
+ if (nmbclusters > 0)
+ uma_zone_set_max(zone_clust, nmbclusters);
+ zone_pack = uma_zsecond_create("Packet", mb_ctor_pack, mb_dtor_pack,
+ mb_init_pack, mb_fini_pack, zone_mbuf);
+
+ /* uma_prealloc() goes here */
+
+ /*
+ * Hook event handler for low-memory situation, used to
+ * drain protocols and push data back to the caches (UMA
+ * later pushes it back to VM).
+ */
+ EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL,
+ EVENTHANDLER_PRI_FIRST);
+
+ /*
+ * [Re]set counters and local statistics knobs.
+ * XXX Some of these should go and be replaced, but UMA stat
+ * gathering needs to be revised.
+ */
+ mbstat.m_mbufs = 0;
+ mbstat.m_mclusts = 0;
+ mbstat.m_drain = 0;
+ mbstat.m_msize = MSIZE;
+ mbstat.m_mclbytes = MCLBYTES;
+ mbstat.m_minclsize = MINCLSIZE;
+ mbstat.m_mlen = MLEN;
+ mbstat.m_mhlen = MHLEN;
+ mbstat.m_numtypes = MT_NTYPES;
+
+ mbstat.m_mcfail = mbstat.m_mpfail = 0;
+ mbstat.sf_iocnt = 0;
+ mbstat.sf_allocwait = mbstat.sf_allocfail = 0;
+}
+
+/*
+ * Constructor for Mbuf master zone.
+ *
+ * The 'arg' pointer points to a mb_args structure which
+ * contains call-specific information required to support the
+ * mbuf allocation API.
+ */
+static void
+mb_ctor_mbuf(void *mem, int size, void *arg)
+{
+ struct mbuf *m;
+ struct mb_args *args;
+ int flags;
+ int how;
+ short type;
+
+ m = (struct mbuf *)mem;
+ args = (struct mb_args *)arg;
+ flags = args->flags;
+ how = args->how;
+ type = args->type;
+
+ m->m_type = type;
+ m->m_next = NULL;
+ m->m_nextpkt = NULL;
+ if (flags & M_PKTHDR) {
+ m->m_data = m->m_pktdat;
+ m->m_flags = M_PKTHDR;
+ m->m_pkthdr.rcvif = NULL;
+ m->m_pkthdr.csum_flags = 0;
+ SLIST_INIT(&m->m_pkthdr.tags);
+#ifdef MAC
+ /* If the label init fails, fail the alloc */
+ if (mac_init_mbuf(m, how) != 0) {
+ m_free(m);
+/* XXX*/ panic("mb_ctor_mbuf(): can't deal with failure!");
+/* return 0; */
+ }
+#endif
+ } else {
+ m->m_data = m->m_dat;
+ m->m_flags = 0;
+ }
+ mbstat.m_mbufs += 1; /* XXX */
+/* return 1;
+*/
+}
+
+/*
+ * The Mbuf master zone and Packet secondary zone destructor.
+ */
+static void
+mb_dtor_mbuf(void *mem, int size, void *arg)
+{
+ struct mbuf *m;
+
+ m = (struct mbuf *)mem;
+ if ((m->m_flags & M_PKTHDR) != 0)
+ m_tag_delete_chain(m, NULL);
+ mbstat.m_mbufs -= 1; /* XXX */
+}
+
+/* XXX Only because of stats */
+static void
+mb_dtor_pack(void *mem, int size, void *arg)
+{
+ struct mbuf *m;
+
+ m = (struct mbuf *)mem;
+ if ((m->m_flags & M_PKTHDR) != 0)
+ m_tag_delete_chain(m, NULL);
+ mbstat.m_mbufs -= 1; /* XXX */
+ mbstat.m_mclusts -= 1; /* XXX */
+}
+
+/*
+ * The Cluster zone constructor.
+ *
+ * Here the 'arg' pointer points to the Mbuf which we
+ * are configuring cluster storage for.
+ */
+static void
+mb_ctor_clust(void *mem, int size, void *arg)
+{
+ struct mbuf *m;
+
+ m = (struct mbuf *)arg;
+ m->m_ext.ext_buf = (caddr_t)mem;
+ m->m_data = m->m_ext.ext_buf;
+ m->m_flags |= M_EXT;
+ m->m_ext.ext_free = NULL;
+ m->m_ext.ext_args = NULL;
+ m->m_ext.ext_size = MCLBYTES;
+ m->m_ext.ext_type = EXT_CLUSTER;
+ m->m_ext.ref_cnt = (u_int *)uma_find_refcnt(zone_clust,
+ m->m_ext.ext_buf);
+ *(m->m_ext.ref_cnt) = 1;
+ mbstat.m_mclusts += 1; /* XXX */
+/* return 1;
+*/
+}
+
+/* XXX */
+static void
+mb_dtor_clust(void *mem, int size, void *arg)
+{
+ mbstat.m_mclusts -= 1; /* XXX */
+}
+
+/*
+ * The Packet secondary zone's init routine, executed on the
+ * object's transition from keg slab to zone cache.
+ */
+static void
+mb_init_pack(void *mem, int size)
+{
+ struct mbuf *m;
+
+ m = (struct mbuf *)mem;
+ m->m_ext.ext_buf = NULL;
+ uma_zalloc_arg(zone_clust, m, M_NOWAIT);
+ if (m->m_ext.ext_buf == NULL) /* XXX */
+ panic("mb_init_pack(): Can't deal with failure yet.");
+ mbstat.m_mclusts -= 1; /* XXX */
+}
+
+/*
+ * The Packet secondary zone's fini routine, executed on the
+ * object's transition from zone cache to keg slab.
+ */
+static void
+mb_fini_pack(void *mem, int size)
+{
+ struct mbuf *m;
+
+ m = (struct mbuf *)mem;
+ uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL);
+ m->m_ext.ext_buf = NULL;
+ mbstat.m_mclusts += 1; /* XXX */
+}
+
+/*
+ * The "packet" keg constructor.
+ */
+static void
+mb_ctor_pack(void *mem, int size, void *arg)
+{
+ struct mbuf *m;
+ struct mb_args *args;
+ int flags, how;
+ short type;
+
+ m = (struct mbuf *)mem;
+ args = (struct mb_args *)arg;
+ flags = args->flags;
+ type = args->type;
+ how = args->how;
+
+ m->m_type = type;
+ m->m_next = NULL;
+ m->m_data = m->m_ext.ext_buf;
+ m->m_flags = flags|M_EXT;
+ m->m_ext.ext_free = NULL;
+ m->m_ext.ext_args = NULL;
+ m->m_ext.ext_size = MCLBYTES;
+ m->m_ext.ext_type = EXT_PACKET;
+ *(m->m_ext.ref_cnt) = 1;
+
+ if (flags & M_PKTHDR) {
+ m->m_nextpkt = NULL;
+ m->m_pkthdr.rcvif = NULL;
+ m->m_pkthdr.csum_flags = 0;
+ SLIST_INIT(&m->m_pkthdr.tags);
+#ifdef MAC
+ /* If the label init fails, fail the alloc */
+ if (mac_init_mbuf(m, how) != 0) {
+ m_free(m);
+/* XXX*/ panic("mb_ctor_pack(): can't deal with failure!");
+/* return 0; */
+ }
+#endif
+ }
+ mbstat.m_mbufs += 1; /* XXX */
+ mbstat.m_mclusts += 1; /* XXX */
+/* return 1;
+*/
+}
+
+/*
+ * This is the protocol drain routine.
+ *
+ * No locks should be held when this is called. The drain routines have to
+ * presently acquire some locks which raises the possibility of lock order
+ * reversal.
+ */
+static void
+mb_reclaim(void *junk)
+{
+ struct domain *dp;
+ struct protosw *pr;
+
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL,
+ "mb_reclaim()");
+
+ mbstat.m_drain++;
+ for (dp = domains; dp != NULL; dp = dp->dom_next)
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
+ if (pr->pr_drain != NULL)
+ (*pr->pr_drain)();
+}
diff --git a/sys/kern/subr_mbuf.c b/sys/kern/subr_mbuf.c
deleted file mode 100644
index d84ef31..0000000
--- a/sys/kern/subr_mbuf.c
+++ /dev/null
@@ -1,1548 +0,0 @@
-/*-
- * Copyright (c) 2001, 2002, 2003
- * Bosko Milekic <bmilekic@FreeBSD.org>. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include "opt_mac.h"
-#include "opt_param.h"
-
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/malloc.h>
-#include <sys/mac.h>
-#include <sys/mbuf.h>
-#include <sys/lock.h>
-#include <sys/mutex.h>
-#include <sys/condvar.h>
-#include <sys/smp.h>
-#include <sys/kernel.h>
-#include <sys/sysctl.h>
-#include <sys/domain.h>
-#include <sys/protosw.h>
-
-#include <vm/vm.h>
-#include <vm/vm_kern.h>
-#include <vm/vm_extern.h>
-#include <vm/pmap.h>
-#include <vm/vm_map.h>
-
-/*
- * mb_alloc: network buffer allocator
- *
- * XXX: currently, the "low watermark" sysctl is marked read-only as its
- * effects are not completely implemented. To be fixed soon.
- */
-
-/*
- * Maximum number of PCPU containers. If you know what you're doing you could
- * explicitly define MBALLOC_NCPU to be exactly the number of CPUs on your
- * system during compilation, and thus prevent kernel structure bloat.
- *
- * SMP and non-SMP kernels clearly have a different number of possible CPUs,
- * but because we cannot assume a dense array of CPUs, we always allocate
- * and traverse PCPU containers up to NCPU amount and merely check for
- * CPU availability.
- */
-#ifdef MBALLOC_NCPU
-#define NCPU MBALLOC_NCPU
-#else
-#define NCPU MAXCPU
-#endif
-
-/*-
- * The mbuf allocator is based on Alfred Perlstein's <alfred@FreeBSD.org>
- * "memcache" proof-of-concept allocator which was itself based on
- * several well-known SMP-friendly allocators.
- *
- * The mb_alloc mbuf allocator is a special when compared to other
- * general-purpose allocators. Some things to take note of:
- *
- * Mbufs and mbuf clusters are two different objects. Sometimes we
- * will allocate a single mbuf, other times a single cluster,
- * other times both. Further, we may sometimes wish to allocate a
- * whole chain of mbufs with clusters. This allocator will perform
- * the common case of each scenario in one function call (this
- * includes constructing or destructing the object) while only
- * locking/unlocking the cache once, if it can get away with it.
- * The caches consist of pure mbufs and pure clusters; that is
- * there are no 'zones' containing mbufs with already pre-hooked
- * clusters. Since we can allocate both objects atomically anyway,
- * we don't bother fragmenting our caches for any particular 'scenarios.'
- *
- * We allocate from seperate sub-maps of kmem_map, thus imposing
- * an ultimate upper-limit on the number of allocatable clusters
- * and mbufs and also, since the clusters all come from a
- * virtually contiguous region, we can keep reference counters
- * for them and "allocate" them purely by indexing into a
- * dense refcount vector.
- *
- * We call out to protocol drain routines (which can be hooked
- * into us) when we're low on space.
- *
- * The mbuf allocator keeps all objects that it allocates in mb_buckets.
- * The buckets keep a number of objects (an object can be an mbuf or an
- * mbuf cluster) and facilitate moving larger sets of contiguous objects
- * from the per-CPU caches to the global cache. The buckets also have
- * the added advantage that objects, when migrated from cache to cache,
- * are migrated in chunks that keep contiguous objects together,
- * minimizing TLB pollution.
- *
- * The buckets are kept on singly-linked lists called "containers." A container
- * is protected by a mutex in order to ensure consistency. The mutex
- * itself is allocated separately and attached to the container at boot time,
- * thus allowing for certain containers to share the same lock. Per-CPU
- * containers for mbufs and mbuf clusters all share the same per-CPU
- * lock whereas the global cache containers for these objects share one
- * global lock.
- */
-struct mb_bucket {
- SLIST_ENTRY(mb_bucket) mb_blist;
- int mb_owner;
- int mb_numfree;
- void *mb_free[0];
-};
-
-struct mb_container {
- SLIST_HEAD(mc_buckethd, mb_bucket) mc_bhead;
- struct mtx *mc_lock;
- int mc_numowner;
- u_int mc_starved;
- long *mc_types;
- u_long *mc_objcount;
- u_long *mc_numbucks;
-};
-
-struct mb_gen_list {
- struct mb_container mb_cont;
- struct cv mgl_mstarved;
-};
-
-struct mb_pcpu_list {
- struct mb_container mb_cont;
-};
-
-/*
- * Boot-time configurable object counts that will determine the maximum
- * number of permitted objects in the mbuf and mcluster cases. In the
- * ext counter (nmbcnt) case, it's just an indicator serving to scale
- * kmem_map size properly - in other words, we may be allowed to allocate
- * more than nmbcnt counters, whereas we will never be allowed to allocate
- * more than nmbufs mbufs or nmbclusters mclusters.
- * As for nsfbufs, it is used to indicate how many sendfile(2) buffers will be
- * allocatable by the sfbuf allocator (found in uipc_syscalls.c)
- */
-#ifndef NMBCLUSTERS
-#define NMBCLUSTERS (1024 + maxusers * 64)
-#endif
-#ifndef NMBUFS
-#define NMBUFS (nmbclusters * 2)
-#endif
-#ifndef NSFBUFS
-#define NSFBUFS (512 + maxusers * 16)
-#endif
-#ifndef NMBCNTS
-#define NMBCNTS (nmbclusters + nsfbufs)
-#endif
-int nmbufs;
-int nmbclusters;
-int nmbcnt;
-int nsfbufs;
-int nsfbufspeak;
-int nsfbufsused;
-
-/*
- * Sizes of objects per bucket. There are this size's worth of mbufs
- * or clusters in each bucket. Please keep these a power-of-2.
- */
-#define MBUF_BUCK_SZ (PAGE_SIZE * 2)
-#define CLUST_BUCK_SZ (PAGE_SIZE * 4)
-
-/*
- * Perform sanity checks of tunables declared above.
- */
-static void
-tunable_mbinit(void *dummy)
-{
-
- /*
- * This has to be done before VM init.
- */
- nmbclusters = NMBCLUSTERS;
- TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
- nmbufs = NMBUFS;
- TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
- nsfbufs = NSFBUFS;
- TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs);
- nmbcnt = NMBCNTS;
- TUNABLE_INT_FETCH("kern.ipc.nmbcnt", &nmbcnt);
- /* Sanity checks */
- if (nmbufs < nmbclusters * 2)
- nmbufs = nmbclusters * 2;
- if (nmbcnt < nmbclusters + nsfbufs)
- nmbcnt = nmbclusters + nsfbufs;
-}
-SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL);
-
-/*
- * The freelist structures and mutex locks. The number statically declared
- * here depends on the number of CPUs.
- *
- * We set up in such a way that all the objects (mbufs, clusters)
- * share the same mutex lock. It has been established that we do not benefit
- * from different locks for different objects, so we use the same lock,
- * regardless of object type. This also allows us to do optimised
- * multi-object allocations without dropping the lock in between.
- */
-struct mb_lstmngr {
- struct mb_gen_list *ml_genlist;
- struct mb_pcpu_list *ml_cntlst[NCPU];
- struct mb_bucket **ml_btable;
- vm_map_t ml_map;
- vm_offset_t ml_mapbase;
- vm_offset_t ml_maptop;
- int ml_mapfull;
- u_int ml_objsize;
- u_int ml_objbucks;
- u_int *ml_wmhigh;
- u_int *ml_wmlow;
-};
-static struct mb_lstmngr mb_list_mbuf, mb_list_clust;
-static struct mtx mbuf_gen, mbuf_pcpu[NCPU];
-static u_int *cl_refcntmap;
-
-/*
- * Local macros for internal allocator structure manipulations.
- */
-#ifdef SMP
-#define MB_GET_PCPU_LIST(mb_lst) (mb_lst)->ml_cntlst[PCPU_GET(cpuid)]
-#else
-#define MB_GET_PCPU_LIST(mb_lst) (mb_lst)->ml_cntlst[0]
-#endif
-
-#define MB_GET_GEN_LIST(mb_lst) (mb_lst)->ml_genlist
-
-#define MB_LOCK_CONT(mb_cnt) mtx_lock((mb_cnt)->mb_cont.mc_lock)
-
-#define MB_UNLOCK_CONT(mb_cnt) mtx_unlock((mb_cnt)->mb_cont.mc_lock)
-
-#define MB_GET_PCPU_LIST_NUM(mb_lst, num) \
- (mb_lst)->ml_cntlst[(num)]
-
-#define MB_BUCKET_INDX(mb_obj, mb_lst) \
- (int)(((caddr_t)(mb_obj) - (caddr_t)(mb_lst)->ml_mapbase) / \
- ((mb_lst)->ml_objbucks * (mb_lst)->ml_objsize))
-
-#define MB_GET_OBJECT(mb_objp, mb_bckt, mb_lst) \
-{ \
- struct mc_buckethd *_mchd = &((mb_lst)->mb_cont.mc_bhead); \
- \
- (mb_bckt)->mb_numfree--; \
- (mb_objp) = (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)]; \
- (*((mb_lst)->mb_cont.mc_objcount))--; \
- if ((mb_bckt)->mb_numfree == 0) { \
- SLIST_REMOVE_HEAD(_mchd, mb_blist); \
- SLIST_NEXT((mb_bckt), mb_blist) = NULL; \
- (mb_bckt)->mb_owner |= MB_BUCKET_FREE; \
- } \
-}
-
-#define MB_PUT_OBJECT(mb_objp, mb_bckt, mb_lst) \
- (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)] = (mb_objp); \
- (mb_bckt)->mb_numfree++; \
- (*((mb_lst)->mb_cont.mc_objcount))++;
-
-#define MB_MBTYPES_INC(mb_cnt, mb_type, mb_num) \
- if ((mb_type) != MT_NOTMBUF) \
- (*((mb_cnt)->mb_cont.mc_types + (mb_type))) += (mb_num)
-
-#define MB_MBTYPES_DEC(mb_cnt, mb_type, mb_num) \
- if ((mb_type) != MT_NOTMBUF) \
- (*((mb_cnt)->mb_cont.mc_types + (mb_type))) -= (mb_num)
-
-/*
- * Ownership of buckets/containers is represented by integers. The PCPU
- * lists range from 0 to NCPU-1. We need a free numerical id for the general
- * list (we use NCPU). We also need a non-conflicting free bit to indicate
- * that the bucket is free and removed from a container, while not losing
- * the bucket's originating container id. We use the highest bit
- * for the free marker.
- */
-#define MB_GENLIST_OWNER (NCPU)
-#define MB_BUCKET_FREE (1 << (sizeof(int) * 8 - 1))
-
-/* Statistics structures for allocator (per-CPU and general). */
-static struct mbpstat mb_statpcpu[NCPU + 1];
-struct mbstat mbstat;
-
-/* Sleep time for wait code (in ticks). */
-static int mbuf_wait = 64;
-
-static u_int mbuf_hiwm = 512; /* High wm on # of mbufs per cache */
-static u_int mbuf_lowm = 128; /* Low wm on # of mbufs per cache */
-static u_int clust_hiwm = 128; /* High wm on # of clusters per cache */
-static u_int clust_lowm = 16; /* Low wm on # of clusters per cache */
-
-/*
- * Objects exported by sysctl(8).
- */
-SYSCTL_DECL(_kern_ipc);
-SYSCTL_INT(_kern_ipc, OID_AUTO, nmbclusters, CTLFLAG_RDTUN, &nmbclusters, 0,
- "Maximum number of mbuf clusters available");
-SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RDTUN, &nmbufs, 0,
- "Maximum number of mbufs available");
-SYSCTL_INT(_kern_ipc, OID_AUTO, nmbcnt, CTLFLAG_RDTUN, &nmbcnt, 0,
- "Number used to scale kmem_map to ensure sufficient space for counters");
-SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
- "Maximum number of sendfile(2) sf_bufs available");
-SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0,
- "Number of sendfile(2) sf_bufs at peak usage");
-SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0,
- "Number of sendfile(2) sf_bufs in use");
-SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW, &mbuf_wait, 0,
- "Sleep time of mbuf subsystem wait allocations during exhaustion");
-SYSCTL_UINT(_kern_ipc, OID_AUTO, mbuf_hiwm, CTLFLAG_RW, &mbuf_hiwm, 0,
- "Upper limit of number of mbufs allowed in each cache");
-SYSCTL_UINT(_kern_ipc, OID_AUTO, mbuf_lowm, CTLFLAG_RD, &mbuf_lowm, 0,
- "Lower limit of number of mbufs allowed in each cache");
-SYSCTL_UINT(_kern_ipc, OID_AUTO, clust_hiwm, CTLFLAG_RW, &clust_hiwm, 0,
- "Upper limit of number of mbuf clusters allowed in each cache");
-SYSCTL_UINT(_kern_ipc, OID_AUTO, clust_lowm, CTLFLAG_RD, &clust_lowm, 0,
- "Lower limit of number of mbuf clusters allowed in each cache");
-SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat,
- "Mbuf general information and statistics");
-SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mb_statpcpu, CTLFLAG_RD, mb_statpcpu,
- sizeof(mb_statpcpu), "S,", "Mbuf allocator per CPU statistics");
-
-/*
- * Prototypes of local allocator routines.
- */
-static void *mb_alloc_wait(struct mb_lstmngr *, short);
-static struct mb_bucket *mb_pop_cont(struct mb_lstmngr *, int,
- struct mb_pcpu_list *);
-static void mb_reclaim(void);
-static void mbuf_init(void *);
-
-/*
- * Initial allocation numbers. Each parameter represents the number of buckets
- * of each object that will be placed initially in each PCPU container for
- * said object.
- */
-#define NMB_MBUF_INIT 2
-#define NMB_CLUST_INIT 8
-
-/*
- * Internal flags that allow for cache locks to remain "persistent" across
- * allocation and free calls. They may be used in combination.
- */
-#define MBP_PERSIST 0x1 /* Return with lock still held. */
-#define MBP_PERSISTENT 0x2 /* Cache lock is already held coming in. */
-
-/*
- * Initialize the mbuf subsystem.
- *
- * We sub-divide the kmem_map into several submaps; this way, we don't have
- * to worry about artificially limiting the number of mbuf or mbuf cluster
- * allocations, due to fear of one type of allocation "stealing" address
- * space initially reserved for another.
- *
- * Set up both the general containers and all the PCPU containers. Populate
- * the PCPU containers with initial numbers.
- */
-MALLOC_DEFINE(M_MBUF, "mbufmgr", "mbuf subsystem management structures");
-SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL)
-static void
-mbuf_init(void *dummy)
-{
- struct mb_pcpu_list *pcpu_cnt;
- vm_size_t mb_map_size;
- int i, j;
-
- /*
- * Set up all the submaps, for each type of object that we deal
- * with in this allocator.
- */
- mb_map_size = (vm_size_t)(nmbufs * MSIZE);
- mb_map_size = rounddown(mb_map_size, MBUF_BUCK_SZ);
- mb_list_mbuf.ml_btable = malloc((unsigned long)mb_map_size /
- MBUF_BUCK_SZ * sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
- if (mb_list_mbuf.ml_btable == NULL)
- goto bad;
- mb_list_mbuf.ml_map = kmem_suballoc(kmem_map,&(mb_list_mbuf.ml_mapbase),
- &(mb_list_mbuf.ml_maptop), mb_map_size);
- mb_list_mbuf.ml_map->system_map = 1;
- mb_list_mbuf.ml_mapfull = 0;
- mb_list_mbuf.ml_objsize = MSIZE;
- mb_list_mbuf.ml_objbucks = MBUF_BUCK_SZ / mb_list_mbuf.ml_objsize;
- mb_list_mbuf.ml_wmhigh = &mbuf_hiwm;
- mb_list_mbuf.ml_wmlow = &mbuf_lowm;
-
- mb_map_size = (vm_size_t)(nmbclusters * MCLBYTES);
- mb_map_size = rounddown(mb_map_size, CLUST_BUCK_SZ);
- mb_list_clust.ml_btable = malloc((unsigned long)mb_map_size /
- CLUST_BUCK_SZ * sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
- if (mb_list_clust.ml_btable == NULL)
- goto bad;
- mb_list_clust.ml_map = kmem_suballoc(kmem_map,
- &(mb_list_clust.ml_mapbase), &(mb_list_clust.ml_maptop),
- mb_map_size);
- mb_list_clust.ml_map->system_map = 1;
- mb_list_clust.ml_mapfull = 0;
- mb_list_clust.ml_objsize = MCLBYTES;
- mb_list_clust.ml_objbucks = CLUST_BUCK_SZ / mb_list_clust.ml_objsize;
- mb_list_clust.ml_wmhigh = &clust_hiwm;
- mb_list_clust.ml_wmlow = &clust_lowm;
-
- /*
- * Allocate required general (global) containers for each object type.
- */
- mb_list_mbuf.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF,
- M_NOWAIT);
- mb_list_clust.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF,
- M_NOWAIT);
- if ((mb_list_mbuf.ml_genlist == NULL) ||
- (mb_list_clust.ml_genlist == NULL))
- goto bad;
-
- /*
- * Initialize condition variables and general container mutex locks.
- */
- mtx_init(&mbuf_gen, "mbuf subsystem general lists lock", NULL, MTX_DEF);
- cv_init(&(mb_list_mbuf.ml_genlist->mgl_mstarved), "mbuf pool starved");
- cv_init(&(mb_list_clust.ml_genlist->mgl_mstarved),
- "mcluster pool starved");
- mb_list_mbuf.ml_genlist->mb_cont.mc_lock =
- mb_list_clust.ml_genlist->mb_cont.mc_lock = &mbuf_gen;
-
- /*
- * Set up the general containers for each object.
- */
- mb_list_mbuf.ml_genlist->mb_cont.mc_numowner =
- mb_list_clust.ml_genlist->mb_cont.mc_numowner = MB_GENLIST_OWNER;
- mb_list_mbuf.ml_genlist->mb_cont.mc_starved =
- mb_list_clust.ml_genlist->mb_cont.mc_starved = 0;
- mb_list_mbuf.ml_genlist->mb_cont.mc_objcount =
- &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbfree);
- mb_list_clust.ml_genlist->mb_cont.mc_objcount =
- &(mb_statpcpu[MB_GENLIST_OWNER].mb_clfree);
- mb_list_mbuf.ml_genlist->mb_cont.mc_numbucks =
- &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbbucks);
- mb_list_clust.ml_genlist->mb_cont.mc_numbucks =
- &(mb_statpcpu[MB_GENLIST_OWNER].mb_clbucks);
- mb_list_mbuf.ml_genlist->mb_cont.mc_types =
- &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbtypes[0]);
- mb_list_clust.ml_genlist->mb_cont.mc_types = NULL;
- SLIST_INIT(&(mb_list_mbuf.ml_genlist->mb_cont.mc_bhead));
- SLIST_INIT(&(mb_list_clust.ml_genlist->mb_cont.mc_bhead));
-
- /*
- * Allocate all the required counters for clusters. This makes
- * cluster allocations/deallocations much faster.
- */
- cl_refcntmap = malloc(nmbclusters * sizeof(u_int), M_MBUF, M_NOWAIT);
- if (cl_refcntmap == NULL)
- goto bad;
-
- /*
- * Initialize general mbuf statistics.
- */
- mbstat.m_msize = mb_list_mbuf.ml_objsize;
- mbstat.m_mclbytes = mb_list_clust.ml_objsize;
- mbstat.m_minclsize = MINCLSIZE;
- mbstat.m_mlen = MLEN;
- mbstat.m_mhlen = MHLEN;
- mbstat.m_numtypes = MT_NTYPES;
- mbstat.m_mbperbuck = mb_list_mbuf.ml_objbucks;
- mbstat.m_clperbuck = mb_list_clust.ml_objbucks;
-
- /*
- * Allocate and initialize PCPU containers.
- */
- for (i = 0; i < NCPU; i++) {
- if (CPU_ABSENT(i)) {
- mb_statpcpu[i].mb_active = 0;
- continue;
- }
-
- mb_list_mbuf.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list),
- M_MBUF, M_NOWAIT);
- mb_list_clust.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list),
- M_MBUF, M_NOWAIT);
- if ((mb_list_mbuf.ml_cntlst[i] == NULL) ||
- (mb_list_clust.ml_cntlst[i] == NULL))
- goto bad;
-
- mtx_init(&mbuf_pcpu[i], "mbuf PCPU list lock", NULL, MTX_DEF);
- mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_lock =
- mb_list_clust.ml_cntlst[i]->mb_cont.mc_lock = &mbuf_pcpu[i];
-
- mb_statpcpu[i].mb_active = 1;
- mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numowner =
- mb_list_clust.ml_cntlst[i]->mb_cont.mc_numowner = i;
- mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_starved =
- mb_list_clust.ml_cntlst[i]->mb_cont.mc_starved = 0;
- mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_objcount =
- &(mb_statpcpu[i].mb_mbfree);
- mb_list_clust.ml_cntlst[i]->mb_cont.mc_objcount =
- &(mb_statpcpu[i].mb_clfree);
- mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numbucks =
- &(mb_statpcpu[i].mb_mbbucks);
- mb_list_clust.ml_cntlst[i]->mb_cont.mc_numbucks =
- &(mb_statpcpu[i].mb_clbucks);
- mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_types =
- &(mb_statpcpu[i].mb_mbtypes[0]);
- mb_list_clust.ml_cntlst[i]->mb_cont.mc_types = NULL;
-
- SLIST_INIT(&(mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_bhead));
- SLIST_INIT(&(mb_list_clust.ml_cntlst[i]->mb_cont.mc_bhead));
-
- /*
- * Perform initial allocations.
- */
- pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_mbuf, i);
- MB_LOCK_CONT(pcpu_cnt);
- for (j = 0; j < NMB_MBUF_INIT; j++) {
- if (mb_pop_cont(&mb_list_mbuf, M_DONTWAIT, pcpu_cnt)
- == NULL)
- goto bad;
- }
- MB_UNLOCK_CONT(pcpu_cnt);
-
- pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_clust, i);
- MB_LOCK_CONT(pcpu_cnt);
- for (j = 0; j < NMB_CLUST_INIT; j++) {
- if (mb_pop_cont(&mb_list_clust, M_DONTWAIT, pcpu_cnt)
- == NULL)
- goto bad;
- }
- MB_UNLOCK_CONT(pcpu_cnt);
- }
-
- return;
-bad:
- panic("mbuf_init(): failed to initialize mbuf subsystem!");
-}
-
-/*
- * Populate a given mbuf PCPU container with a bucket full of fresh new
- * buffers. Return a pointer to the new bucket (already in the container if
- * successful), or return NULL on failure.
- *
- * LOCKING NOTES:
- * PCPU container lock must be held when this is called.
- * The lock is dropped here so that we can cleanly call the underlying VM
- * code. If we fail, we return with no locks held. If we succeed (i.e., return
- * non-NULL), we return with the PCPU lock held, ready for allocation from
- * the returned bucket.
- */
-static struct mb_bucket *
-mb_pop_cont(struct mb_lstmngr *mb_list, int how, struct mb_pcpu_list *cnt_lst)
-{
- struct mb_bucket *bucket;
- caddr_t p;
- int i;
-
- MB_UNLOCK_CONT(cnt_lst);
- /*
- * If our object's (finite) map is starved now (i.e., no more address
- * space), bail out now.
- */
- if (mb_list->ml_mapfull)
- return (NULL);
-
- bucket = malloc(sizeof(struct mb_bucket) +
- mb_list->ml_objbucks * sizeof(void *), M_MBUF, MBTOM(how));
- if (bucket == NULL)
- return (NULL);
-
- p = (caddr_t)kmem_malloc(mb_list->ml_map, mb_list->ml_objsize *
- mb_list->ml_objbucks, MBTOM(how));
- if (p == NULL) {
- free(bucket, M_MBUF);
- if (how == M_TRYWAIT)
- mb_list->ml_mapfull = 1;
- return (NULL);
- }
-
- bucket->mb_numfree = 0;
- mb_list->ml_btable[MB_BUCKET_INDX(p, mb_list)] = bucket;
- for (i = 0; i < mb_list->ml_objbucks; i++) {
- bucket->mb_free[i] = p;
- bucket->mb_numfree++;
- p += mb_list->ml_objsize;
- }
-
- MB_LOCK_CONT(cnt_lst);
- bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
- SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead), bucket, mb_blist);
- (*(cnt_lst->mb_cont.mc_numbucks))++;
- *(cnt_lst->mb_cont.mc_objcount) += bucket->mb_numfree;
-
- return (bucket);
-}
-
-/*
- * Allocate a network buffer.
- * The general case is very easy. Complications only arise if our PCPU
- * container is empty. Things get worse if the PCPU container is empty,
- * the general container is empty, and we've run out of address space
- * in our map; then we try to block if we're willing to (M_TRYWAIT).
- */
-static
-void *
-mb_alloc(struct mb_lstmngr *mb_list, int how, short type, short persist,
- int *pers_list)
-{
- static int last_report;
- struct mb_pcpu_list *cnt_lst;
- struct mb_bucket *bucket;
- void *m;
-
-#ifdef INVARIANTS
- int flags;
-
- flags = how & (M_WAITOK | M_NOWAIT | M_DONTWAIT | M_TRYWAIT);
- if (flags != M_DONTWAIT && flags != M_TRYWAIT) {
- static struct timeval lasterr;
- static int curerr;
- if (ppsratecheck(&lasterr, &curerr, 1)) {
- printf("Bad mbuf alloc flags: %x\n", flags);
- backtrace();
- how = M_TRYWAIT;
- }
- }
-#endif
-
- m = NULL;
- if ((persist & MBP_PERSISTENT) != 0) {
- /*
- * If we're a "persistent" call, then the per-CPU #(pers_list)
- * cache lock is already held, and we just need to refer to
- * the correct cache descriptor.
- */
- cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, *pers_list);
- } else {
- cnt_lst = MB_GET_PCPU_LIST(mb_list);
- MB_LOCK_CONT(cnt_lst);
- }
-
- if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) != NULL) {
- /*
- * This is the easy allocation case. We just grab an object
- * from a bucket in the PCPU container. At worst, we
- * have just emptied the bucket and so we remove it
- * from the container.
- */
- MB_GET_OBJECT(m, bucket, cnt_lst);
- MB_MBTYPES_INC(cnt_lst, type, 1);
-
- /* If asked to persist, do not drop the lock. */
- if ((persist & MBP_PERSIST) == 0)
- MB_UNLOCK_CONT(cnt_lst);
- else
- *pers_list = cnt_lst->mb_cont.mc_numowner;
- } else {
- struct mb_gen_list *gen_list;
-
- /*
- * This is the less-common more difficult case. We must
- * first verify if the general list has anything for us
- * and if that also fails, we must allocate a page from
- * the map and create a new bucket to place in our PCPU
- * container (already locked). If the map is starved then
- * we're really in for trouble, as we have to wait on
- * the general container's condition variable.
- */
- gen_list = MB_GET_GEN_LIST(mb_list);
- MB_LOCK_CONT(gen_list);
-
- if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead)))
- != NULL) {
- /*
- * Give ownership of the bucket to our CPU's
- * container, but only actually put the bucket
- * in the container if it doesn't become free
- * upon removing an mbuf from it.
- */
- SLIST_REMOVE_HEAD(&(gen_list->mb_cont.mc_bhead),
- mb_blist);
- bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
- (*(gen_list->mb_cont.mc_numbucks))--;
- (*(cnt_lst->mb_cont.mc_numbucks))++;
- *(gen_list->mb_cont.mc_objcount) -= bucket->mb_numfree;
- bucket->mb_numfree--;
- m = bucket->mb_free[(bucket->mb_numfree)];
- if (bucket->mb_numfree == 0) {
- SLIST_NEXT(bucket, mb_blist) = NULL;
- bucket->mb_owner |= MB_BUCKET_FREE;
- } else {
- SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead),
- bucket, mb_blist);
- *(cnt_lst->mb_cont.mc_objcount) +=
- bucket->mb_numfree;
- }
- MB_UNLOCK_CONT(gen_list);
- MB_MBTYPES_INC(cnt_lst, type, 1);
-
- /* If asked to persist, do not drop the lock. */
- if ((persist & MBP_PERSIST) == 0)
- MB_UNLOCK_CONT(cnt_lst);
- else
- *pers_list = cnt_lst->mb_cont.mc_numowner;
- } else {
- /*
- * We'll have to allocate a new page.
- */
- MB_UNLOCK_CONT(gen_list);
- bucket = mb_pop_cont(mb_list, how, cnt_lst);
- if (bucket != NULL) {
- MB_GET_OBJECT(m, bucket, cnt_lst);
- MB_MBTYPES_INC(cnt_lst, type, 1);
-
- /* If asked to persist, do not drop the lock. */
- if ((persist & MBP_PERSIST) == 0)
- MB_UNLOCK_CONT(cnt_lst);
- else
- *pers_list=cnt_lst->mb_cont.mc_numowner;
- } else {
- if (how == M_TRYWAIT) {
- /*
- * Absolute worst-case scenario.
- * We block if we're willing to, but
- * only after trying to steal from
- * other lists.
- */
- m = mb_alloc_wait(mb_list, type);
- } else {
- /* XXX: No consistency. */
- mbstat.m_drops++;
-
- if (ticks < last_report ||
- (ticks - last_report) >= hz) {
- last_report = ticks;
- printf(
-"All mbufs or mbuf clusters exhausted, please see tuning(7).\n");
- }
-
- }
- if (m != NULL && (persist & MBP_PERSIST) != 0) {
- cnt_lst = MB_GET_PCPU_LIST(mb_list);
- MB_LOCK_CONT(cnt_lst);
- *pers_list=cnt_lst->mb_cont.mc_numowner;
- }
- }
- }
- }
-
- return (m);
-}
-
-/*
- * This is the worst-case scenario called only if we're allocating with
- * M_TRYWAIT. We first drain all the protocols, then try to find an mbuf
- * by looking in every PCPU container. If we're still unsuccesful, we
- * try the general container one last time and possibly block on our
- * starved cv.
- */
-static void *
-mb_alloc_wait(struct mb_lstmngr *mb_list, short type)
-{
- struct mb_pcpu_list *cnt_lst;
- struct mb_gen_list *gen_list;
- struct mb_bucket *bucket;
- void *m;
- int i, cv_ret;
-
- /*
- * Try to reclaim mbuf-related objects (mbufs, clusters).
- */
- mb_reclaim();
-
- /*
- * Cycle all the PCPU containers. Increment starved counts if found
- * empty.
- */
- for (i = 0; i < NCPU; i++) {
- if (CPU_ABSENT(i))
- continue;
- cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, i);
- MB_LOCK_CONT(cnt_lst);
-
- /*
- * If container is non-empty, get a single object from it.
- * If empty, increment starved count.
- */
- if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) !=
- NULL) {
- MB_GET_OBJECT(m, bucket, cnt_lst);
- MB_MBTYPES_INC(cnt_lst, type, 1);
- MB_UNLOCK_CONT(cnt_lst);
- mbstat.m_wait++; /* XXX: No consistency. */
- return (m);
- } else
- cnt_lst->mb_cont.mc_starved++;
-
- MB_UNLOCK_CONT(cnt_lst);
- }
-
- /*
- * We're still here, so that means it's time to get the general
- * container lock, check it one more time (now that mb_reclaim()
- * has been called) and if we still get nothing, block on the cv.
- */
- gen_list = MB_GET_GEN_LIST(mb_list);
- MB_LOCK_CONT(gen_list);
- if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL) {
- MB_GET_OBJECT(m, bucket, gen_list);
- MB_MBTYPES_INC(gen_list, type, 1);
- MB_UNLOCK_CONT(gen_list);
- mbstat.m_wait++; /* XXX: No consistency. */
- return (m);
- }
-
- gen_list->mb_cont.mc_starved++;
- cv_ret = cv_timedwait(&(gen_list->mgl_mstarved),
- gen_list->mb_cont.mc_lock, mbuf_wait);
- gen_list->mb_cont.mc_starved--;
-
- if ((cv_ret == 0) &&
- ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL)) {
- MB_GET_OBJECT(m, bucket, gen_list);
- MB_MBTYPES_INC(gen_list, type, 1);
- mbstat.m_wait++; /* XXX: No consistency. */
- } else {
- mbstat.m_drops++; /* XXX: No consistency. */
- m = NULL;
- }
-
- MB_UNLOCK_CONT(gen_list);
-
- return (m);
-}
-
-/*-
- * Free an object to its rightful container.
- * In the very general case, this operation is really very easy.
- * Complications arise primarily if:
- * (a) We've hit the high limit on number of free objects allowed in
- * our PCPU container.
- * (b) We're in a critical situation where our container has been
- * marked 'starved' and we need to issue wakeups on the starved
- * condition variable.
- * (c) Minor (odd) cases: our bucket has migrated while we were
- * waiting for the lock; our bucket is in the general container;
- * our bucket is empty.
- */
-static
-void
-mb_free(struct mb_lstmngr *mb_list, void *m, short type, short persist,
- int *pers_list)
-{
- struct mb_pcpu_list *cnt_lst;
- struct mb_gen_list *gen_list;
- struct mb_bucket *bucket;
- u_int owner;
-
- bucket = mb_list->ml_btable[MB_BUCKET_INDX(m, mb_list)];
-
- /*
- * Make sure that if after we lock the bucket's present container the
- * bucket has migrated, that we drop the lock and get the new one.
- */
-retry_lock:
- owner = bucket->mb_owner & ~MB_BUCKET_FREE;
- switch (owner) {
- case MB_GENLIST_OWNER:
- gen_list = MB_GET_GEN_LIST(mb_list);
- if (((persist & MBP_PERSISTENT) != 0) && (*pers_list >= 0)) {
- if (*pers_list != MB_GENLIST_OWNER) {
- cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list,
- *pers_list);
- MB_UNLOCK_CONT(cnt_lst);
- MB_LOCK_CONT(gen_list);
- }
- } else {
- MB_LOCK_CONT(gen_list);
- }
- if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) {
- MB_UNLOCK_CONT(gen_list);
- *pers_list = -1;
- goto retry_lock;
- }
-
- /*
- * If we're intended for the general container, this is
- * real easy: no migrating required. The only `bogon'
- * is that we're now contending with all the threads
- * dealing with the general list, but this is expected.
- */
- MB_PUT_OBJECT(m, bucket, gen_list);
- MB_MBTYPES_DEC(gen_list, type, 1);
- if (bucket->mb_owner & MB_BUCKET_FREE) {
- SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead),
- bucket, mb_blist);
- bucket->mb_owner = MB_GENLIST_OWNER;
- }
- if (gen_list->mb_cont.mc_starved > 0)
- cv_signal(&(gen_list->mgl_mstarved));
- if ((persist & MBP_PERSIST) == 0)
- MB_UNLOCK_CONT(gen_list);
- else
- *pers_list = MB_GENLIST_OWNER;
- break;
-
- default:
- cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, owner);
- if (((persist & MBP_PERSISTENT) != 0) && (*pers_list >= 0)) {
- if (*pers_list == MB_GENLIST_OWNER) {
- gen_list = MB_GET_GEN_LIST(mb_list);
- MB_UNLOCK_CONT(gen_list);
- MB_LOCK_CONT(cnt_lst);
- } else {
- cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list,
- *pers_list);
- owner = *pers_list;
- }
- } else {
- MB_LOCK_CONT(cnt_lst);
- }
- if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) {
- MB_UNLOCK_CONT(cnt_lst);
- *pers_list = -1;
- goto retry_lock;
- }
-
- MB_PUT_OBJECT(m, bucket, cnt_lst);
- MB_MBTYPES_DEC(cnt_lst, type, 1);
- if ((*(cnt_lst->mb_cont.mc_objcount) > *(mb_list->ml_wmhigh)) ||
- (cnt_lst->mb_cont.mc_starved > 0)) {
- /*
- * We've hit the high limit of allowed numbers of mbufs
- * on this PCPU list or we've been flagged that we need
- * to transfer a bucket over to the general cache.
- * We must now migrate a bucket over to the general
- * container.
- */
- gen_list = MB_GET_GEN_LIST(mb_list);
- MB_LOCK_CONT(gen_list);
- if ((bucket->mb_owner & MB_BUCKET_FREE) == 0) {
- bucket =
- SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead));
- SLIST_REMOVE_HEAD(&(cnt_lst->mb_cont.mc_bhead),
- mb_blist);
- }
- SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead),
- bucket, mb_blist);
- bucket->mb_owner = MB_GENLIST_OWNER;
- *(cnt_lst->mb_cont.mc_objcount) -= bucket->mb_numfree;
- *(gen_list->mb_cont.mc_objcount) += bucket->mb_numfree;
- (*(cnt_lst->mb_cont.mc_numbucks))--;
- (*(gen_list->mb_cont.mc_numbucks))++;
-
- /*
- * While we're at it, transfer some of the mbtypes
- * "count load" onto the general list's mbtypes
- * array, seeing as how we're moving the bucket
- * there now, meaning that the freeing of objects
- * there will now decrement the _general list's_
- * mbtypes counters, and no longer our PCPU list's
- * mbtypes counters. We do this for the type presently
- * being freed in an effort to keep the mbtypes
- * counters approximately balanced across all lists.
- */
- MB_MBTYPES_DEC(cnt_lst, type,
- mb_list->ml_objbucks - bucket->mb_numfree);
- MB_MBTYPES_INC(gen_list, type,
- mb_list->ml_objbucks - bucket->mb_numfree);
-
- if (cnt_lst->mb_cont.mc_starved > 0) {
- /*
- * Determine whether or not to keep
- * transferring buckets to the general list
- * or whether we've transferred enough already.
- * The thread that is blocked may end up waking
- * up in the meantime, but transferring an
- * extra bucket in a constrained situation
- * is not so bad, as we're likely to need
- * it soon anyway.
- */
- if (gen_list->mb_cont.mc_starved > 0) {
- cnt_lst->mb_cont.mc_starved--;
- cv_signal(&(gen_list->mgl_mstarved));
- } else
- cnt_lst->mb_cont.mc_starved = 0;
- }
- MB_UNLOCK_CONT(gen_list);
- if ((persist & MBP_PERSIST) == 0)
- MB_UNLOCK_CONT(cnt_lst);
- else
- *pers_list = owner;
- break;
- }
-
- if (bucket->mb_owner & MB_BUCKET_FREE) {
- SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead),
- bucket, mb_blist);
- bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
- }
-
- if ((persist & MBP_PERSIST) == 0)
- MB_UNLOCK_CONT(cnt_lst);
- else
- *pers_list = owner;
- break;
- }
-}
-
-/*
- * Drain protocols in hopes to free up some resources.
- *
- * LOCKING NOTES:
- * No locks should be held when this is called. The drain routines have to
- * presently acquire some locks which raises the possibility of lock order
- * violation if we're holding any mutex if that mutex is acquired in reverse
- * order relative to one of the locks in the drain routines.
- */
-static void
-mb_reclaim(void)
-{
- struct domain *dp;
- struct protosw *pr;
-
- WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL,
- "mb_reclaim()");
-
- mbstat.m_drain++; /* XXX: No consistency. */
-
- for (dp = domains; dp != NULL; dp = dp->dom_next)
- for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
- if (pr->pr_drain != NULL)
- (*pr->pr_drain)();
-}
-
-/******************************************************************************
- * Internal setup macros.
- */
-
-#define _mb_setup(m, type) do { \
- (m)->m_type = (type); \
- (m)->m_next = NULL; \
- (m)->m_nextpkt = NULL; \
- (m)->m_data = (m)->m_dat; \
- (m)->m_flags = 0; \
-} while (0)
-
-#define _mbhdr_setup(m, type) do { \
- (m)->m_type = (type); \
- (m)->m_next = NULL; \
- (m)->m_nextpkt = NULL; \
- (m)->m_data = (m)->m_pktdat; \
- (m)->m_flags = M_PKTHDR; \
- (m)->m_pkthdr.rcvif = NULL; \
- (m)->m_pkthdr.csum_flags = 0; \
- SLIST_INIT(&(m)->m_pkthdr.tags); \
-} while (0)
-
-#define _mcl_setup(m) do { \
- (m)->m_data = (m)->m_ext.ext_buf; \
- (m)->m_flags |= M_EXT; \
- (m)->m_ext.ext_free = NULL; \
- (m)->m_ext.ext_args = NULL; \
- (m)->m_ext.ext_size = MCLBYTES; \
- (m)->m_ext.ext_type = EXT_CLUSTER; \
-} while (0)
-
-#define _mext_init_ref(m, ref) do { \
- (m)->m_ext.ref_cnt = ((ref) == NULL) ? \
- malloc(sizeof(u_int), M_MBUF, M_NOWAIT) : (u_int *)(ref); \
- if ((m)->m_ext.ref_cnt != NULL) { \
- *((m)->m_ext.ref_cnt) = 0; \
- MEXT_ADD_REF((m)); \
- } \
-} while (0)
-
-#define cl2ref(cl) \
- (((uintptr_t)(cl) - (uintptr_t)mb_list_clust.ml_mapbase) >> MCLSHIFT)
-
-#define _mext_dealloc_ref(m) \
- if ((m)->m_ext.ext_type != EXT_EXTREF) \
- free((m)->m_ext.ref_cnt, M_MBUF)
-
-/******************************************************************************
- * Internal routines.
- *
- * Because mb_alloc() and mb_free() are inlines (to keep the common
- * cases down to a maximum of one function call), below are a few
- * routines used only internally for the sole purpose of making certain
- * functions smaller.
- *
- * - _mext_free(): frees associated storage when the ref. count is
- * exactly one and we're freeing.
- *
- * - _mgetm_internal(): common "persistent-lock" routine that allocates
- * an mbuf and a cluster in one shot, but where the lock is already
- * held coming in (which is what makes it different from the exported
- * m_getcl()). The lock is dropped when done. This is used by m_getm()
- * and, therefore, is very m_getm()-specific.
- */
-static struct mbuf *_mgetm_internal(int, short, short, int);
-
-void
-_mext_free(struct mbuf *mb)
-{
-
- if (mb->m_ext.ext_type == EXT_CLUSTER) {
- mb_free(&mb_list_clust, (caddr_t)mb->m_ext.ext_buf, MT_NOTMBUF,
- 0, NULL);
- } else {
- (*(mb->m_ext.ext_free))(mb->m_ext.ext_buf, mb->m_ext.ext_args);
- _mext_dealloc_ref(mb);
- }
-}
-
-static struct mbuf *
-_mgetm_internal(int how, short type, short persist, int cchnum)
-{
- struct mbuf *mb;
-
- mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, persist,&cchnum);
- if (mb == NULL)
- return NULL;
- _mb_setup(mb, type);
-
- if ((persist & MBP_PERSIST) != 0) {
- mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust,
- how, MT_NOTMBUF, MBP_PERSISTENT, &cchnum);
- if (mb->m_ext.ext_buf == NULL) {
- (void)m_free(mb);
- mb = NULL;
- }
- _mcl_setup(mb);
- _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]);
- }
- return (mb);
-}
-
-/******************************************************************************
- * Exported buffer allocation and de-allocation routines.
- */
-
-/*
- * Allocate and return a single (normal) mbuf. NULL is returned on failure.
- *
- * Arguments:
- * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
- * if really starved for memory. M_DONTWAIT to never block.
- * - type: the type of the mbuf being allocated.
- */
-struct mbuf *
-m_get(int how, short type)
-{
- struct mbuf *mb;
-
- mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL);
- if (mb != NULL)
- _mb_setup(mb, type);
- return (mb);
-}
-
-/*
- * Allocate a given length worth of mbufs and/or clusters (whatever fits
- * best) and return a pointer to the top of the allocated chain. If an
- * existing mbuf chain is provided, then we will append the new chain
- * to the existing one but still return the top of the newly allocated
- * chain. NULL is returned on failure, in which case the [optional]
- * provided chain is left untouched, and any memory already allocated
- * is freed.
- *
- * Arguments:
- * - m: existing chain to which to append new chain (optional).
- * - len: total length of data to append, either in mbufs or clusters
- * (we allocate whatever combination yields the best fit).
- * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
- * if really starved for memory. M_DONTWAIT to never block.
- * - type: the type of the mbuf being allocated.
- */
-struct mbuf *
-m_getm(struct mbuf *m, int len, int how, short type)
-{
- struct mbuf *mb, *top, *cur, *mtail;
- int num, rem, cchnum;
- short persist;
- int i;
-
- KASSERT(len >= 0, ("m_getm(): len is < 0"));
-
- /* If m != NULL, we will append to the end of that chain. */
- if (m != NULL)
- for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next);
- else
- mtail = NULL;
-
- /*
- * In the best-case scenario (which should be the common case
- * unless we're in a starvation situation), we will be able to
- * go through the allocation of all the desired mbufs and clusters
- * here without dropping our per-CPU cache lock in between.
- */
- num = len / MCLBYTES;
- rem = len % MCLBYTES;
- persist = 0;
- cchnum = -1;
- top = cur = NULL;
- for (i = 0; i < num; i++) {
- mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type,
- MBP_PERSIST | persist, &cchnum);
- if (mb == NULL)
- goto failed;
- _mb_setup(mb, type);
- mb->m_len = 0;
-
- persist = (i != (num - 1) || rem > 0) ? MBP_PERSIST : 0;
- mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust,
- how, MT_NOTMBUF, persist | MBP_PERSISTENT, &cchnum);
- if (mb->m_ext.ext_buf == NULL) {
- (void)m_free(mb);
- goto failed;
- }
- _mcl_setup(mb);
- _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]);
- persist = MBP_PERSISTENT;
-
- if (cur == NULL)
- top = cur = mb;
- else
- cur = (cur->m_next = mb);
- }
- if (rem > 0) {
- if (cchnum >= 0) {
- persist = MBP_PERSISTENT;
- persist |= (rem > MINCLSIZE) ? MBP_PERSIST : 0;
- mb = _mgetm_internal(how, type, persist, cchnum);
- if (mb == NULL)
- goto failed;
- } else if (rem > MINCLSIZE) {
- mb = m_getcl(how, type, 0);
- } else {
- mb = m_get(how, type);
- }
- if (mb != NULL) {
- mb->m_len = 0;
- if (cur == NULL)
- top = mb;
- else
- cur->m_next = mb;
- } else
- goto failed;
- }
-
- if (mtail != NULL)
- mtail->m_next = top;
- return top;
-failed:
- if (top != NULL)
- m_freem(top);
- return NULL;
-}
-
-/*
- * Allocate and return a single M_PKTHDR mbuf. NULL is returned on failure.
- *
- * Arguments:
- * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
- * if really starved for memory. M_DONTWAIT to never block.
- * - type: the type of the mbuf being allocated.
- */
-struct mbuf *
-m_gethdr(int how, short type)
-{
- struct mbuf *mb;
-
- mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL);
- if (mb != NULL) {
- _mbhdr_setup(mb, type);
-#ifdef MAC
- if (mac_init_mbuf(mb, MBTOM(how)) != 0) {
- m_free(mb);
- return (NULL);
- }
-#endif
- }
- return (mb);
-}
-
-/*
- * Allocate and return a single (normal) pre-zero'd mbuf. NULL is
- * returned on failure.
- *
- * Arguments:
- * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
- * if really starved for memory. M_DONTWAIT to never block.
- * - type: the type of the mbuf being allocated.
- */
-struct mbuf *
-m_get_clrd(int how, short type)
-{
- struct mbuf *mb;
-
- mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL);
- if (mb != NULL) {
- _mb_setup(mb, type);
- bzero(mtod(mb, caddr_t), MLEN);
- }
- return (mb);
-}
-
-/*
- * Allocate and return a single M_PKTHDR pre-zero'd mbuf. NULL is
- * returned on failure.
- *
- * Arguments:
- * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
- * if really starved for memory. M_DONTWAIT to never block.
- * - type: the type of the mbuf being allocated.
- */
-struct mbuf *
-m_gethdr_clrd(int how, short type)
-{
- struct mbuf *mb;
-
- mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL);
- if (mb != NULL) {
- _mbhdr_setup(mb, type);
-#ifdef MAC
- if (mac_init_mbuf(mb, MBTOM(how)) != 0) {
- m_free(mb);
- return (NULL);
- }
-#endif
- bzero(mtod(mb, caddr_t), MHLEN);
- }
- return (mb);
-}
-
-/*
- * Free a single mbuf and any associated storage that it may have attached
- * to it. The associated storage may not be immediately freed if its
- * reference count is above 1. Returns the next mbuf in the chain following
- * the mbuf being freed.
- *
- * Arguments:
- * - mb: the mbuf to free.
- */
-struct mbuf *
-m_free(struct mbuf *mb)
-{
- struct mbuf *nb;
- int cchnum;
- short persist = 0;
-
-#ifdef INVARIANTS
- if (mb->m_flags & M_FREELIST)
- panic("m_free detected a mbuf double-free");
- mb->m_flags |= M_FREELIST;
-#endif
- if ((mb->m_flags & M_PKTHDR) != 0)
- m_tag_delete_chain(mb, NULL);
- nb = mb->m_next;
- if ((mb->m_flags & M_EXT) != 0) {
- MEXT_REM_REF(mb);
- if (atomic_cmpset_int(mb->m_ext.ref_cnt, 0, 1)) {
- if (mb->m_ext.ext_type == EXT_CLUSTER) {
- mb_free(&mb_list_clust,
- (caddr_t)mb->m_ext.ext_buf, MT_NOTMBUF,
- MBP_PERSIST, &cchnum);
- persist = MBP_PERSISTENT;
- } else {
- (*(mb->m_ext.ext_free))(mb->m_ext.ext_buf,
- mb->m_ext.ext_args);
- _mext_dealloc_ref(mb);
- persist = 0;
- }
- }
- }
- mb_free(&mb_list_mbuf, mb, mb->m_type, persist, &cchnum);
- return (nb);
-}
-
-/*
- * Free an entire chain of mbufs and associated external buffers, if
- * applicable. Right now, we only optimize a little so that the cache
- * lock may be held across a single mbuf+cluster free. Hopefully,
- * we'll eventually be holding the lock across more than merely two
- * consecutive frees but right now this is hard to implement because of
- * things like _mext_dealloc_ref (may do a free()) and atomic ops in the
- * loop.
- *
- * - mb: the mbuf chain to free.
- */
-void
-m_freem(struct mbuf *mb)
-{
-
- while (mb != NULL)
- mb = m_free(mb);
-}
-
-/*
- * Fetch an mbuf with a cluster attached to it. If one of the
- * allocations fails, the entire allocation fails. This routine is
- * the preferred way of fetching both the mbuf and cluster together,
- * as it avoids having to unlock/relock between allocations. Returns
- * NULL on failure.
- *
- * Arguments:
- * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
- * if really starved for memory. M_DONTWAIT to never block.
- * - type: the type of the mbuf being allocated.
- * - flags: any flags to pass to the mbuf being allocated; if this includes
- * the M_PKTHDR bit, then the mbuf is configured as a M_PKTHDR mbuf.
- */
-struct mbuf *
-m_getcl(int how, short type, int flags)
-{
- struct mbuf *mb;
- int cchnum;
-
- mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type,
- MBP_PERSIST, &cchnum);
- if (mb == NULL)
- return NULL;
- mb->m_type = type;
- mb->m_next = NULL;
- mb->m_flags = flags;
- if ((flags & M_PKTHDR) != 0) {
- mb->m_nextpkt = NULL;
- mb->m_pkthdr.rcvif = NULL;
- mb->m_pkthdr.csum_flags = 0;
- SLIST_INIT(&mb->m_pkthdr.tags);
- }
-
- mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust, how,
- MT_NOTMBUF, MBP_PERSISTENT, &cchnum);
- if (mb->m_ext.ext_buf == NULL) {
- (void)m_free(mb);
- mb = NULL;
- } else {
- _mcl_setup(mb);
- _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]);
-#ifdef MAC
- if (flags & M_PKTHDR) {
- if (mac_init_mbuf(mb, MBTOM(how)) != 0) {
- m_free(mb);
- return (NULL);
- }
- }
-#endif
- }
- return (mb);
-}
-
-/*
- * Fetch a single mbuf cluster and attach it to an existing mbuf. If
- * successfull, configures the provided mbuf to have mbuf->m_ext.ext_buf
- * pointing to the cluster, and sets the M_EXT bit in the mbuf's flags.
- * The M_EXT bit is not set on failure.
- *
- * Arguments:
- * - mb: the existing mbuf to which to attach the allocated cluster.
- * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
- * if really starved for memory. M_DONTWAIT to never block.
- */
-void
-m_clget(struct mbuf *mb, int how)
-{
-
- mb->m_ext.ext_buf= (caddr_t)mb_alloc(&mb_list_clust,how,MT_NOTMBUF,
- 0, NULL);
- if (mb->m_ext.ext_buf != NULL) {
- _mcl_setup(mb);
- _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]);
- }
-}
-
-/*
- * Configure a provided mbuf to refer to the provided external storage
- * buffer and setup a reference count for said buffer. If the setting
- * up of the reference count fails, the M_EXT bit will not be set. If
- * successfull, the M_EXT bit is set in the mbuf's flags.
- *
- * Arguments:
- * - mb: the existing mbuf to which to attach the provided buffer.
- * - buf: the address of the provided external storage buffer.
- * - size: the size of the provided buffer.
- * - freef: a pointer to a routine that is responsible for freeing the
- * provided external storage buffer.
- * - args: a pointer to an argument structure (of any type) to be passed
- * to the provided freef routine (may be NULL).
- * - flags: any other flags to be passed to the provided mbuf.
- * - type: the type that the external storage buffer should be labeled with.
- */
-void
-m_extadd(struct mbuf *mb, caddr_t buf, u_int size,
- void (*freef)(void *, void *), void *args, int flags, int type)
-{
- u_int *ref_cnt = NULL;
-
- if (type == EXT_CLUSTER)
- ref_cnt = &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)];
- else if (type == EXT_EXTREF)
- ref_cnt = mb->m_ext.ref_cnt;
- _mext_init_ref(mb, ref_cnt);
- if (mb->m_ext.ref_cnt != NULL) {
- mb->m_flags |= (M_EXT | flags);
- mb->m_ext.ext_buf = buf;
- mb->m_data = mb->m_ext.ext_buf;
- mb->m_ext.ext_size = size;
- mb->m_ext.ext_free = freef;
- mb->m_ext.ext_args = args;
- mb->m_ext.ext_type = type;
- }
-}
-
-/*
- * Change type of provided mbuf. This is a relatively expensive operation
- * (due to the cost of statistics manipulations) and should be avoided, where
- * possible.
- *
- * Arguments:
- * - mb: the provided mbuf for which the type needs to be changed.
- * - new_type: the new type to change the mbuf to.
- */
-void
-m_chtype(struct mbuf *mb, short new_type)
-{
- struct mb_gen_list *gen_list;
-
- gen_list = MB_GET_GEN_LIST(&mb_list_mbuf);
- MB_LOCK_CONT(gen_list);
- MB_MBTYPES_DEC(gen_list, mb->m_type, 1);
- MB_MBTYPES_INC(gen_list, new_type, 1);
- MB_UNLOCK_CONT(gen_list);
- mb->m_type = new_type;
-}
diff --git a/sys/kern/uipc_mbuf.c b/sys/kern/uipc_mbuf.c
index 5815fae..e14aba1 100644
--- a/sys/kern/uipc_mbuf.c
+++ b/sys/kern/uipc_mbuf.c
@@ -86,6 +86,161 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
#endif
/*
+ * Malloc-type for external ext_buf ref counts.
+ */
+MALLOC_DEFINE(M_MBUF, "mbextcnt", "mbuf external ref counts");
+
+/*
+ * Allocate a given length worth of mbufs and/or clusters (whatever fits
+ * best) and return a pointer to the top of the allocated chain. If an
+ * existing mbuf chain is provided, then we will append the new chain
+ * to the existing one but still return the top of the newly allocated
+ * chain.
+ */
+struct mbuf *
+m_getm(struct mbuf *m, int len, int how, short type)
+{
+ struct mbuf *mb, *top, *cur, *mtail;
+ int num, rem;
+ int i;
+
+ KASSERT(len >= 0, ("m_getm(): len is < 0"));
+
+ /* If m != NULL, we will append to the end of that chain. */
+ if (m != NULL)
+ for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next);
+ else
+ mtail = NULL;
+
+ /*
+ * Calculate how many mbufs+clusters ("packets") we need and how much
+ * leftover there is after that and allocate the first mbuf+cluster
+ * if required.
+ */
+ num = len / MCLBYTES;
+ rem = len % MCLBYTES;
+ top = cur = NULL;
+ if (num > 0) {
+ if ((top = cur = m_getcl(how, type, 0)) == NULL)
+ goto failed;
+ top->m_len = 0;
+ }
+ num--;
+
+ for (i = 0; i < num; i++) {
+ mb = m_getcl(how, type, 0);
+ if (mb == NULL)
+ goto failed;
+ mb->m_len = 0;
+ cur = (cur->m_next = mb);
+ }
+ if (rem > 0) {
+ mb = (rem > MINCLSIZE) ?
+ m_getcl(how, type, 0) : m_get(how, type);
+ if (mb == NULL)
+ goto failed;
+ mb->m_len = 0;
+ if (cur == NULL)
+ top = mb;
+ else
+ cur->m_next = mb;
+ }
+
+ if (mtail != NULL)
+ mtail->m_next = top;
+ return top;
+failed:
+ if (top != NULL)
+ m_freem(top);
+ return NULL;
+}
+
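
As a usage sketch (illustrative only, not part of the patch): a caller that wants to grow an existing chain by a given number of bytes with the rewritten m_getm() could look like the following; the helper name grow_chain() is invented for the example.

/* Illustrative sketch only: extend an existing chain by ~9000 bytes. */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/mbuf.h>

static int
grow_chain(struct mbuf *m)
{

	/* m_getm() appends to 'm'; on failure 'm' is left untouched. */
	if (m_getm(m, 9000, M_DONTWAIT, MT_DATA) == NULL)
		return (ENOBUFS);
	return (0);
}
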
+/*
+ * Free an entire chain of mbufs and associated external buffers, if
+ * applicable.
+ */
+void
+m_freem(struct mbuf *mb)
+{
+
+ while (mb != NULL)
+ mb = m_free(mb);
+}
+
+/*-
+ * Configure a provided mbuf to refer to the provided external storage
+ * buffer and setup a reference count for said buffer. If the setting
+ * up of the reference count fails, the M_EXT bit will not be set. If
+ * successful, the M_EXT bit is set in the mbuf's flags.
+ *
+ * Arguments:
+ * mb The existing mbuf to which to attach the provided buffer.
+ * buf The address of the provided external storage buffer.
+ * size The size of the provided buffer.
+ * freef A pointer to a routine that is responsible for freeing the
+ * provided external storage buffer.
+ * args A pointer to an argument structure (of any type) to be passed
+ * to the provided freef routine (may be NULL).
+ * flags Any other flags to be passed to the provided mbuf.
+ * type The type that the external storage buffer should be
+ * labeled with.
+ *
+ * Returns:
+ * Nothing.
+ */
+void
+m_extadd(struct mbuf *mb, caddr_t buf, u_int size,
+ void (*freef)(void *, void *), void *args, int flags, int type)
+{
+ u_int *ref_cnt = NULL;
+
+ /* XXX Shouldn't be adding EXT_CLUSTER with this API */
+ if (type == EXT_CLUSTER)
+ ref_cnt = (u_int *)uma_find_refcnt(zone_clust,
+ mb->m_ext.ext_buf);
+ else if (type == EXT_EXTREF)
+ ref_cnt = mb->m_ext.ref_cnt;
+ mb->m_ext.ref_cnt = (ref_cnt == NULL) ?
+ malloc(sizeof(u_int), M_MBUF, M_NOWAIT) : (u_int *)ref_cnt;
+ if (mb->m_ext.ref_cnt != NULL) {
+ *(mb->m_ext.ref_cnt) = 1;
+ mb->m_flags |= (M_EXT | flags);
+ mb->m_ext.ext_buf = buf;
+ mb->m_data = mb->m_ext.ext_buf;
+ mb->m_ext.ext_size = size;
+ mb->m_ext.ext_free = freef;
+ mb->m_ext.ext_args = args;
+ mb->m_ext.ext_type = type;
+ }
+}
+
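
For context, here is a hedged sketch of how a consumer (a network driver, say) would attach its own external storage through this interface. All of the example_* names and the M_EXAMPLEBUF malloc type are invented for illustration; only m_extadd(), m_gethdr(), EXT_NET_DRV and the M_EXT failure check come from the real API.

/*
 * Illustrative sketch only: attach a driver-owned buffer as external
 * mbuf storage with a private free routine.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>

static MALLOC_DEFINE(M_EXAMPLEBUF, "examplebuf", "example external buffers");

static void
example_ext_free(void *buf, void *args)
{

	free(buf, M_EXAMPLEBUF);
}

static struct mbuf *
example_attach(u_int size)
{
	struct mbuf *m;
	void *buf;

	if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
		return (NULL);
	if ((buf = malloc(size, M_EXAMPLEBUF, M_NOWAIT)) == NULL) {
		m_free(m);
		return (NULL);
	}
	m_extadd(m, buf, size, example_ext_free, NULL, 0, EXT_NET_DRV);
	if ((m->m_flags & M_EXT) == 0) {
		/* Reference count setup failed; clean up ourselves. */
		free(buf, M_EXAMPLEBUF);
		m_free(m);
		return (NULL);
	}
	return (m);
}
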
+/*
+ * Non-directly-exported function to clean up after mbufs with M_EXT
+ * storage attached to them if the reference count hits 0.
+ */
+void
+mb_free_ext(struct mbuf *m)
+{
+
+ MEXT_REM_REF(m);
+ if (atomic_cmpset_int(m->m_ext.ref_cnt, 0, 1)) {
+ if (m->m_ext.ext_type == EXT_PACKET) {
+ uma_zfree(zone_pack, m);
+ return;
+ } else if (m->m_ext.ext_type == EXT_CLUSTER) {
+ uma_zfree(zone_clust, m->m_ext.ext_buf);
+ m->m_ext.ext_buf = NULL;
+ } else {
+ (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
+ m->m_ext.ext_args);
+ if (m->m_ext.ext_type != EXT_EXTREF)
+ free(m->m_ext.ref_cnt, M_MBUF);
+ }
+ }
+ uma_zfree(zone_mbuf, m);
+}
+
+/*
* "Move" mbuf pkthdr from "from" to "to".
* "from" must have M_PKTHDR set, and "to" must be empty.
*/
@@ -364,22 +519,22 @@ m_dup(struct mbuf *m, int how)
struct mbuf *n;
/* Get the next new mbuf */
- MGET(n, how, m->m_type);
+ if (remain >= MINCLSIZE) {
+ n = m_getcl(how, m->m_type, 0);
+ nsize = MCLBYTES;
+ } else {
+ n = m_get(how, m->m_type);
+ nsize = MLEN;
+ }
if (n == NULL)
goto nospace;
- if (top == NULL) { /* first one, must be PKTHDR */
- if (!m_dup_pkthdr(n, m, how))
- goto nospace;
- nsize = MHLEN;
- } else /* not the first one */
- nsize = MLEN;
- if (remain >= MINCLSIZE) {
- MCLGET(n, how);
- if ((n->m_flags & M_EXT) == 0) {
- (void)m_free(n);
+
+ if (top == NULL) { /* First one, must be PKTHDR */
+ if (!m_dup_pkthdr(n, m, how)) {
+ m_free(n);
goto nospace;
}
- nsize = MCLBYTES;
+ nsize = MHLEN;
}
n->m_len = 0;
@@ -651,39 +806,42 @@ m_devget(char *buf, int totlen, int off, struct ifnet *ifp,
void (*copy)(char *from, caddr_t to, u_int len))
{
struct mbuf *m;
- struct mbuf *top = 0, **mp = &top;
+ struct mbuf *top = NULL, **mp = &top;
int len;
if (off < 0 || off > MHLEN)
return (NULL);
- MGETHDR(m, M_DONTWAIT, MT_DATA);
- if (m == NULL)
- return (NULL);
- m->m_pkthdr.rcvif = ifp;
- m->m_pkthdr.len = totlen;
- len = MHLEN;
-
while (totlen > 0) {
- if (top) {
- MGET(m, M_DONTWAIT, MT_DATA);
- if (m == NULL) {
- m_freem(top);
- return (NULL);
- }
- len = MLEN;
- }
- if (totlen + off >= MINCLSIZE) {
- MCLGET(m, M_DONTWAIT);
- if (m->m_flags & M_EXT)
+ if (top == NULL) { /* First one, must be PKTHDR */
+ if (totlen + off >= MINCLSIZE) {
+ m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
len = MCLBYTES;
+ } else {
+ m = m_gethdr(M_DONTWAIT, MT_DATA);
+ len = MHLEN;
+
+ /* Place initial small packet/header at end of mbuf */
+ if (m && totlen + off + max_linkhdr <= MLEN) {
+ m->m_data += max_linkhdr;
+ len -= max_linkhdr;
+ }
+ }
+ if (m == NULL)
+ return NULL;
+ m->m_pkthdr.rcvif = ifp;
+ m->m_pkthdr.len = totlen;
} else {
- /*
- * Place initial small packet/header at end of mbuf.
- */
- if (top == NULL && totlen + off + max_linkhdr <= len) {
- m->m_data += max_linkhdr;
- len -= max_linkhdr;
+ if (totlen + off >= MINCLSIZE) {
+ m = m_getcl(M_DONTWAIT, MT_DATA, 0);
+ len = MCLBYTES;
+ } else {
+ m = m_get(M_DONTWAIT, MT_DATA);
+ len = MLEN;
+ }
+ if (m == NULL) {
+ m_freem(top);
+ return NULL;
}
}
if (off) {
@@ -722,9 +880,10 @@ m_copyback(struct mbuf *m0, int off, int len, c_caddr_t cp)
off -= mlen;
totlen += mlen;
if (m->m_next == NULL) {
- n = m_get_clrd(M_DONTWAIT, m->m_type);
+ n = m_get(M_DONTWAIT, m->m_type);
if (n == NULL)
goto out;
+ bzero(mtod(n, caddr_t), MLEN);
n->m_len = min(MLEN, len + off);
m->m_next = n;
}
diff --git a/sys/kern/uipc_mbuf2.c b/sys/kern/uipc_mbuf2.c
index 0d11aac..ff7944d 100644
--- a/sys/kern/uipc_mbuf2.c
+++ b/sys/kern/uipc_mbuf2.c
@@ -230,14 +230,10 @@ m_pulldown(struct mbuf *m, int off, int len, int *offp)
* now, we need to do the hard way. don't m_copy as there's no room
* on both end.
*/
- MGET(o, M_DONTWAIT, m->m_type);
- if (o && len > MLEN) {
- MCLGET(o, M_DONTWAIT);
- if ((o->m_flags & M_EXT) == 0) {
- m_free(o);
- o = NULL;
- }
- }
+ if (len > MLEN)
+ o = m_getcl(M_DONTWAIT, m->m_type, 0);
+ else
+ o = m_get(M_DONTWAIT, m->m_type);
if (!o) {
m_freem(m);
return NULL; /* ENOBUFS */
@@ -274,29 +270,27 @@ static struct mbuf *
m_dup1(struct mbuf *m, int off, int len, int wait)
{
struct mbuf *n;
- int l;
int copyhdr;
if (len > MCLBYTES)
return NULL;
- if (off == 0 && (m->m_flags & M_PKTHDR) != 0) {
+ if (off == 0 && (m->m_flags & M_PKTHDR) != 0)
copyhdr = 1;
- MGETHDR(n, wait, m->m_type);
- l = MHLEN;
- } else {
+ else
copyhdr = 0;
- MGET(n, wait, m->m_type);
- l = MLEN;
- }
- if (n && len > l) {
- MCLGET(n, wait);
- if ((n->m_flags & M_EXT) == 0) {
- m_free(n);
- n = NULL;
- }
+ if (len >= MINCLSIZE) {
+ if (copyhdr == 1)
+ n = m_getcl(wait, m->m_type, M_PKTHDR);
+ else
+ n = m_getcl(wait, m->m_type, 0);
+ } else {
+ if (copyhdr == 1)
+ n = m_gethdr(wait, m->m_type);
+ else
+ n = m_get(wait, m->m_type);
}
if (!n)
- return NULL;
+ return NULL; /* ENOBUFS */
if (copyhdr && !m_dup_pkthdr(n, m, wait)) {
m_free(n);
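
The two hunks above (m_pulldown() and m_dup1()) are representative of the mechanical conversion applied throughout this patch: the old two-step MGET/MGETHDR + MCLGET sequence, with its M_EXT check, collapses into a single m_getcl() call, which the new inline in sys/sys/mbuf.h further down serves from zone_pack. A minimal before/after sketch (illustrative only; the helper names are invented):

/* Illustrative sketch only: the allocation idiom being replaced. */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>

/* Old style: allocate the mbuf first, then try to attach a cluster. */
static struct mbuf *
alloc_packet_old(void)
{
	struct mbuf *m;

	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return (NULL);
	MCLGET(m, M_DONTWAIT);
	if ((m->m_flags & M_EXT) == 0) {	/* no cluster was attached */
		m_free(m);
		return (NULL);
	}
	return (m);
}

/* New style: one call allocates the mbuf and cluster together. */
static struct mbuf *
alloc_packet_new(void)
{

	return (m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR));
}
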
diff --git a/sys/kern/uipc_sockbuf.c b/sys/kern/uipc_sockbuf.c
index 3ab8f3a..a404d69 100644
--- a/sys/kern/uipc_sockbuf.c
+++ b/sys/kern/uipc_sockbuf.c
@@ -959,15 +959,12 @@ sbcreatecontrol(p, size, type, level)
if (CMSG_SPACE((u_int)size) > MCLBYTES)
return ((struct mbuf *) NULL);
- if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
+ if (CMSG_SPACE((u_int)size) > MLEN)
+ m = m_getcl(M_DONTWAIT, MT_CONTROL, 0);
+ else
+ m = m_get(M_DONTWAIT, MT_CONTROL);
+ if (m == NULL)
return ((struct mbuf *) NULL);
- if (CMSG_SPACE((u_int)size) > MLEN) {
- MCLGET(m, M_DONTWAIT);
- if ((m->m_flags & M_EXT) == 0) {
- m_free(m);
- return ((struct mbuf *) NULL);
- }
- }
cp = mtod(m, struct cmsghdr *);
m->m_len = 0;
KASSERT(CMSG_SPACE((u_int)size) <= M_TRAILINGSPACE(m),
diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c
index e07f4ef..6735e49 100644
--- a/sys/kern/uipc_socket.c
+++ b/sys/kern/uipc_socket.c
@@ -527,8 +527,8 @@ sosend(so, addr, uio, top, control, flags, td)
{
struct mbuf **mp;
struct mbuf *m;
- long space, len, resid;
- int clen = 0, error, s, dontroute, mlen;
+ long space, len = 0, resid;
+ int clen = 0, error, s, dontroute;
int atomic = sosendallatonce(so) || top;
#ifdef ZERO_COPY_SOCKETS
int cow_send;
@@ -624,25 +624,23 @@ restart:
#ifdef ZERO_COPY_SOCKETS
cow_send = 0;
#endif /* ZERO_COPY_SOCKETS */
- if (top == 0) {
- MGETHDR(m, M_TRYWAIT, MT_DATA);
- if (m == NULL) {
- error = ENOBUFS;
- goto release;
- }
- mlen = MHLEN;
- m->m_pkthdr.len = 0;
- m->m_pkthdr.rcvif = (struct ifnet *)0;
- } else {
- MGET(m, M_TRYWAIT, MT_DATA);
- if (m == NULL) {
- error = ENOBUFS;
- goto release;
- }
- mlen = MLEN;
- }
if (resid >= MINCLSIZE) {
#ifdef ZERO_COPY_SOCKETS
+ if (top == NULL) {
+ MGETHDR(m, M_TRYWAIT, MT_DATA);
+ if (m == NULL) {
+ error = ENOBUFS;
+ goto release;
+ }
+ m->m_pkthdr.len = 0;
+ m->m_pkthdr.rcvif = (struct ifnet *)0;
+ } else {
+ MGET(m, M_TRYWAIT, MT_DATA);
+ if (m == NULL) {
+ error = ENOBUFS;
+ goto release;
+ }
+ }
if (so_zero_copy_send &&
resid>=PAGE_SIZE &&
space>=PAGE_SIZE &&
@@ -654,29 +652,48 @@ restart:
cow_send = socow_setup(m, uio);
}
}
- if (!cow_send){
+ if (!cow_send) {
+ MCLGET(m, M_TRYWAIT);
+ if ((m->m_flags & M_EXT) == 0) {
+ m_free(m);
+ m = NULL;
+ } else {
+ len = min(min(MCLBYTES, resid), space);
+ }
+ } else
+ len = PAGE_SIZE;
+#else /* ZERO_COPY_SOCKETS */
+ if (top == NULL) {
+ m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
+ m->m_pkthdr.len = 0;
+ m->m_pkthdr.rcvif = (struct ifnet *)0;
+ } else
+ m = m_getcl(M_TRYWAIT, MT_DATA, 0);
+ len = min(min(MCLBYTES, resid), space);
#endif /* ZERO_COPY_SOCKETS */
- MCLGET(m, M_TRYWAIT);
- if ((m->m_flags & M_EXT) == 0)
- goto nopages;
- mlen = MCLBYTES;
- len = min(min(mlen, resid), space);
} else {
-#ifdef ZERO_COPY_SOCKETS
- len = PAGE_SIZE;
+ if (top == NULL) {
+ m = m_gethdr(M_TRYWAIT, MT_DATA);
+ m->m_pkthdr.len = 0;
+ m->m_pkthdr.rcvif = (struct ifnet *)0;
+
+ len = min(min(MHLEN, resid), space);
+ /*
+ * For datagram protocols, leave room
+ * for protocol headers in first mbuf.
+ */
+ if (atomic && m && len < MHLEN)
+ MH_ALIGN(m, len);
+ } else {
+ m = m_get(M_TRYWAIT, MT_DATA);
+ len = min(min(MLEN, resid), space);
}
-
- } else {
-#endif /* ZERO_COPY_SOCKETS */
-nopages:
- len = min(min(mlen, resid), space);
- /*
- * For datagram protocols, leave room
- * for protocol headers in first mbuf.
- */
- if (atomic && top == 0 && len < mlen)
- MH_ALIGN(m, len);
}
+ if (m == NULL) {
+ error = ENOBUFS;
+ goto release;
+ }
+
space -= len;
#ifdef ZERO_COPY_SOCKETS
if (cow_send)
diff --git a/sys/kern/uipc_socket2.c b/sys/kern/uipc_socket2.c
index 3ab8f3a..a404d69 100644
--- a/sys/kern/uipc_socket2.c
+++ b/sys/kern/uipc_socket2.c
@@ -959,15 +959,12 @@ sbcreatecontrol(p, size, type, level)
if (CMSG_SPACE((u_int)size) > MCLBYTES)
return ((struct mbuf *) NULL);
- if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
+ if (CMSG_SPACE((u_int)size) > MLEN)
+ m = m_getcl(M_DONTWAIT, MT_CONTROL, 0);
+ else
+ m = m_get(M_DONTWAIT, MT_CONTROL);
+ if (m == NULL)
return ((struct mbuf *) NULL);
- if (CMSG_SPACE((u_int)size) > MLEN) {
- MCLGET(m, M_DONTWAIT);
- if ((m->m_flags & M_EXT) == 0) {
- m_free(m);
- return ((struct mbuf *) NULL);
- }
- }
cp = mtod(m, struct cmsghdr *);
m->m_len = 0;
KASSERT(CMSG_SPACE((u_int)size) <= M_TRAILINGSPACE(m),
diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c
index 1b886f5..978c30e 100644
--- a/sys/kern/uipc_syscalls.c
+++ b/sys/kern/uipc_syscalls.c
@@ -61,6 +61,7 @@ __FBSDID("$FreeBSD$");
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#ifdef KTRACE
@@ -85,6 +86,21 @@ static int getpeername1(struct thread *td, struct getpeername_args *uap,
int compat);
/*
+ * NSFBUFS-related variables and associated sysctls
+ */
+int nsfbufs;
+int nsfbufspeak;
+int nsfbufsused;
+
+SYSCTL_DECL(_kern_ipc);
+SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
+ "Maximum number of sendfile(2) sf_bufs available");
+SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0,
+ "Number of sendfile(2) sf_bufs at peak usage");
+SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0,
+ "Number of sendfile(2) sf_bufs in use");
+
+/*
* System call interface to the socket abstraction.
*/
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
diff --git a/sys/sparc64/sparc64/vm_machdep.c b/sys/sparc64/sparc64/vm_machdep.c
index fe263f1..4a34567 100644
--- a/sys/sparc64/sparc64/vm_machdep.c
+++ b/sys/sparc64/sparc64/vm_machdep.c
@@ -86,6 +86,10 @@
#include <machine/tlb.h>
#include <machine/tstate.h>
+#ifndef NSFBUFS
+#define NSFBUFS (512 + maxusers * 16)
+#endif
+
static void sf_buf_init(void *arg);
SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL)
@@ -351,6 +355,9 @@ sf_buf_init(void *arg)
vm_offset_t sf_base;
int i;
+ nsfbufs = NSFBUFS;
+ TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs);
+
mtx_init(&sf_freelist.sf_lock, "sf_bufs list lock", NULL, MTX_DEF);
SLIST_INIT(&sf_freelist.sf_head);
sf_base = kmem_alloc_nofault(kernel_map, nsfbufs * PAGE_SIZE);
diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h
index d86c57c..2170599 100644
--- a/sys/sys/mbuf.h
+++ b/sys/sys/mbuf.h
@@ -10,7 +10,7 @@
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
- * 4. Neither the name of the University nor the names of its contributors
+ * 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
@@ -33,7 +33,12 @@
#ifndef _SYS_MBUF_H_
#define _SYS_MBUF_H_
+/* XXX: These includes suck. Sorry! */
#include <sys/queue.h>
+#ifdef _KERNEL
+#include <sys/systm.h>
+#include <vm/uma.h>
+#endif
/*
* Mbufs are of a single size, MSIZE (sys/param.h), which
@@ -57,6 +62,16 @@
*/
#define mtod(m, t) ((t)((m)->m_data))
#define dtom(x) ((struct mbuf *)((intptr_t)(x) & ~(MSIZE-1)))
+
+/*
+ * Argument structure passed to UMA routines during mbuf and packet
+ * allocations.
+ */
+struct mb_args {
+ int flags; /* Flags for mbuf being allocated */
+ int how; /* How to allocate: M_WAITOK or M_DONTWAIT */
+ short type; /* Type of mbuf being allocated */
+};
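
To make the flow concrete, the sketch below shows a hypothetical UMA constructor consuming this structure through its opaque argument pointer; it simply mirrors the field initialization done by the _mb_setup()/_mbhdr_setup() macros removed earlier in this patch. The real mbuf and packet constructors are not part of this excerpt, and args->how is ignored in this simplified version.

/* Hypothetical sketch of a UMA ctor consuming struct mb_args. */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>

static void
example_mbuf_ctor(void *mem, int size, void *arg)
{
	struct mbuf *m = mem;
	struct mb_args *args = arg;

	m->m_type = args->type;
	m->m_next = NULL;
	m->m_nextpkt = NULL;
	m->m_flags = args->flags;
	if (args->flags & M_PKTHDR) {
		m->m_data = m->m_pktdat;
		m->m_pkthdr.rcvif = NULL;
		m->m_pkthdr.csum_flags = 0;
		SLIST_INIT(&m->m_pkthdr.tags);
	} else
		m->m_data = m->m_dat;
}
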
#endif /* _KERNEL */
/*
@@ -167,6 +182,7 @@ struct mbuf {
*/
#define EXT_CLUSTER 1 /* mbuf cluster */
#define EXT_SFBUF 2 /* sendfile(2)'s sf_bufs */
+#define EXT_PACKET 3 /* came out of Packet zone */
#define EXT_NET_DRV 100 /* custom ext_buf provided by net driver(s) */
#define EXT_MOD_TYPE 200 /* custom module's ext_buf type */
#define EXT_DISPOSABLE 300 /* can throw this buffer away w/page flipping */
@@ -223,28 +239,12 @@ struct mbuf {
#define MT_NTYPES 16 /* number of mbuf types for mbtypes[] */
/*
- * Mbuf and cluster allocation statistics PCPU structure.
- */
-struct mbpstat {
- u_long mb_mbfree;
- u_long mb_mbbucks;
- u_long mb_clfree;
- u_long mb_clbucks;
- long mb_mbtypes[MT_NTYPES];
- short mb_active;
-};
-
-/*
* General mbuf allocator statistics structure.
- * XXX: Modifications of these are not protected by any mutex locks nor by
- * any atomic() manipulations. As a result, we may occasionally lose
- * a count or two. Luckily, not all of these fields are modified at all
- * and remain static, and those that are manipulated are only manipulated
- * in failure situations, which do not occur (hopefully) very often.
*/
struct mbstat {
- u_long m_drops; /* times failed to allocate */
- u_long m_wait; /* times succesfully returned from wait */
+ u_long m_mbufs; /* XXX */
+ u_long m_mclusts; /* XXX */
+
u_long m_drain; /* times drained protocols for space */
u_long m_mcfail; /* XXX: times m_copym failed */
u_long m_mpfail; /* XXX: times m_pullup failed */
@@ -253,10 +253,10 @@ struct mbstat {
u_long m_minclsize; /* min length of data to allocate a cluster */
u_long m_mlen; /* length of data in an mbuf */
u_long m_mhlen; /* length of data in a header mbuf */
- u_int m_mbperbuck; /* number of mbufs per "bucket" */
- u_int m_clperbuck; /* number of clusters per "bucket" */
- /* Number of mbtypes (gives # elems in mbpstat's mb_mbtypes[] array: */
+
+ /* Number of mbtypes (gives # elems in the mbtypes[] array): */
short m_numtypes;
+
/* XXX: Sendfile stats should eventually move to their own struct */
u_long sf_iocnt; /* times sendfile had to do disk I/O */
u_long sf_allocfail; /* times sfbuf allocation failed */
@@ -265,14 +265,23 @@ struct mbstat {
/*
* Flags specifying how an allocation should be made.
- * M_DONTWAIT means "don't block if nothing is available" whereas
- * M_TRYWAIT means "block for mbuf_wait ticks at most if nothing is
- * available."
+ *
+ * The flag to use is as follows:
+ * - M_DONTWAIT or M_NOWAIT from an interrupt handler to not block allocation.
+ * - M_WAIT or M_WAITOK or M_TRYWAIT from wherever it is safe to block.
+ *
+ * M_DONTWAIT/M_NOWAIT means that we will not block the thread explicitly
+ * and if we cannot allocate immediately we may return NULL,
+ * whereas M_WAIT/M_WAITOK/M_TRYWAIT means that if we cannot allocate
+ * resources we will block until they are available, and thus never
+ * return NULL.
+ *
+ * XXX Eventually just phase this out to use M_WAITOK/M_NOWAIT.
*/
-#define M_DONTWAIT 0x4 /* don't conflict with M_NOWAIT */
-#define M_TRYWAIT 0x8 /* or M_WAITOK */
-#define M_WAIT M_TRYWAIT /* XXX: deprecated */
-#define MBTOM(how) ((how) & M_TRYWAIT ? M_WAITOK : M_NOWAIT)
+#define MBTOM(how) (how)
+#define M_DONTWAIT M_NOWAIT
+#define M_TRYWAIT M_WAITOK
+#define M_WAIT M_WAITOK
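
A small illustration of the convention spelled out above (illustrative only; the function names are invented): code reachable from an interrupt handler passes M_DONTWAIT and must tolerate a NULL return, while code running where sleeping is safe may pass M_TRYWAIT and expect the allocation to succeed.

/* Illustrative sketch only: picking an allocation flag per context. */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>

/* Interrupt-time path: never block; the caller must handle NULL. */
static struct mbuf *
example_rx_alloc(void)
{

	return (m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR));
}

/* Process context where sleeping is safe: the allocation may wait. */
static struct mbuf *
example_tx_alloc(void)
{

	return (m_gethdr(M_TRYWAIT, MT_DATA));
}
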
#ifdef _KERNEL
/*-
@@ -296,12 +305,114 @@ struct mbstat {
#define MEXT_ADD_REF(m) atomic_add_int((m)->m_ext.ref_cnt, 1)
/*
+ * Network buffer allocation API
+ *
+ * The rest of it is defined in kern/subr_mbuf.c
+ */
+
+extern uma_zone_t zone_mbuf;
+extern uma_zone_t zone_clust;
+extern uma_zone_t zone_pack;
+
+static __inline struct mbuf *m_get(int how, short type);
+static __inline struct mbuf *m_gethdr(int how, short type);
+static __inline struct mbuf *m_getcl(int how, short type, int flags);
+static __inline struct mbuf *m_getclr(int how, short type); /* XXX */
+static __inline struct mbuf *m_free(struct mbuf *m);
+static __inline void m_clget(struct mbuf *m, int how);
+static __inline void m_chtype(struct mbuf *m, short new_type);
+void mb_free_ext(struct mbuf *);
+
+static __inline
+struct mbuf *
+m_get(int how, short type)
+{
+ struct mb_args args;
+
+ args.flags = 0;
+ args.how = how;
+ args.type = type;
+ return (uma_zalloc_arg(zone_mbuf, &args, how));
+}
+
+/* XXX This should be deprecated, very little use */
+static __inline
+struct mbuf *
+m_getclr(int how, short type)
+{
+ struct mbuf *m;
+ struct mb_args args;
+
+ args.flags = 0;
+ args.how = how;
+ args.type = type;
+ m = uma_zalloc_arg(zone_mbuf, &args, how);
+ if (m != NULL)
+ bzero(m->m_data, MLEN);
+ return m;
+}
+
+static __inline
+struct mbuf *
+m_gethdr(int how, short type)
+{
+ struct mb_args args;
+
+ args.flags = M_PKTHDR;
+ args.how = how;
+ args.type = type;
+ return (uma_zalloc_arg(zone_mbuf, &args, how));
+}
+
+static __inline
+struct mbuf *
+m_getcl(int how, short type, int flags)
+{
+ struct mb_args args;
+
+ args.flags = flags;
+ args.how = how;
+ args.type = type;
+ return (uma_zalloc_arg(zone_pack, &args, how));
+}
+
+static __inline
+struct mbuf *
+m_free(struct mbuf *m)
+{
+ struct mbuf *n = m->m_next;
+
+#ifdef INVARIANTS
+ m->m_flags |= M_FREELIST;
+#endif
+ if (m->m_flags & M_EXT)
+ mb_free_ext(m);
+ else
+ uma_zfree(zone_mbuf, m);
+ return n;
+}
+
+static __inline
+void
+m_clget(struct mbuf *m, int how)
+{
+ m->m_ext.ext_buf = NULL;
+ uma_zalloc_arg(zone_clust, m, how);
+}
+
+static __inline
+void
+m_chtype(struct mbuf *m, short new_type)
+{
+ m->m_type = new_type;
+}
+
+/*
* mbuf, cluster, and external object allocation macros
* (for compatibility purposes).
*/
 /* NB: M_COPY_PKTHDR is deprecated. Use M_MOVE_PKTHDR or m_dup_pkthdr. */
#define M_MOVE_PKTHDR(to, from) m_move_pkthdr((to), (from))
-#define m_getclr(how, type) m_get_clrd((how), (type))
#define MGET(m, how, type) ((m) = m_get((how), (type)))
#define MGETHDR(m, how, type) ((m) = m_gethdr((how), (type)))
#define MCLGET(m, how) m_clget((m), (how))
@@ -309,23 +420,6 @@ struct mbstat {
m_extadd((m), (caddr_t)(buf), (size), (free), (args), (flags), (type))
/*
- * MEXTFREE(m): disassociate (and possibly free) an external object from (m).
- *
- * If the atomic_cmpset_int() returns 0, then we effectively do nothing
- * in terms of "cleaning up" (freeing the ext buf and ref. counter) as
- * this means that either there are still references, or another thread
- * is taking care of the clean-up.
- */
-#define MEXTFREE(m) do { \
- struct mbuf *_mb = (m); \
- \
- MEXT_REM_REF(_mb); \
- if (atomic_cmpset_int(_mb->m_ext.ref_cnt, 0, 1)) \
- _mext_free(_mb); \
- _mb->m_flags &= ~M_EXT; \
-} while (0)
-
-/*
* Evaluate TRUE if it's safe to write to the mbuf m's data region (this
* can be both the local data payload, or an external buffer area,
* depending on whether M_EXT is set).
@@ -425,18 +519,13 @@ extern int max_linkhdr; /* Largest link-level header */
extern int max_protohdr; /* Largest protocol header */
extern struct mbstat mbstat; /* General mbuf stats/infos */
extern int nmbclusters; /* Maximum number of clusters */
-extern int nmbcnt; /* Scale kmem_map for counter space */
-extern int nmbufs; /* Maximum number of mbufs */
struct uio;
-void _mext_free(struct mbuf *);
void m_adj(struct mbuf *, int);
int m_apply(struct mbuf *, int, int,
int (*)(void *, void *, u_int), void *);
void m_cat(struct mbuf *, struct mbuf *);
-void m_chtype(struct mbuf *, short);
-void m_clget(struct mbuf *, int);
void m_extadd(struct mbuf *, caddr_t, u_int,
void (*)(void *, void *), void *, int, int);
void m_copyback(struct mbuf *, int, int, c_caddr_t);
@@ -451,13 +540,7 @@ struct mbuf *m_dup(struct mbuf *, int);
int m_dup_pkthdr(struct mbuf *, struct mbuf *, int);
u_int m_fixhdr(struct mbuf *);
struct mbuf *m_fragment(struct mbuf *, int, int);
-struct mbuf *m_free(struct mbuf *);
void m_freem(struct mbuf *);
-struct mbuf *m_get(int, short);
-struct mbuf *m_get_clrd(int, short);
-struct mbuf *m_getcl(int, short, int);
-struct mbuf *m_gethdr(int, short);
-struct mbuf *m_gethdr_clrd(int, short);
struct mbuf *m_getm(struct mbuf *, int, int, short);
struct mbuf *m_getptr(struct mbuf *, int, int *);
u_int m_length(struct mbuf *, struct mbuf **);
@@ -470,7 +553,7 @@ struct mbuf *m_split(struct mbuf *, int, int);
struct mbuf *m_uiotombuf(struct uio *, int, int);
/*-
- * Packets may have annotations attached by affixing a list
+ * Network packets may have annotations attached by affixing a list
* of "packet tags" to the pkthdr structure. Packet tags are
* dynamically allocated semi-opaque data structures that have
* a fixed header (struct m_tag) that specifies the size of the
diff --git a/sys/vm/uma.h b/sys/vm/uma.h
index 4de1efa..0d34ca3 100644
--- a/sys/vm/uma.h
+++ b/sys/vm/uma.h
@@ -43,7 +43,7 @@
/* Types and type defs */
-struct uma_zone;
+struct uma_zone;
/* Opaque type used as a handle to the zone */
typedef struct uma_zone * uma_zone_t;
@@ -157,12 +157,46 @@ typedef void (*uma_fini)(void *mem, int size);
* A pointer to a structure which is intended to be opaque to users of
* the interface. The value may be null if the wait flag is not set.
*/
-
uma_zone_t uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
uma_init uminit, uma_fini fini, int align,
u_int16_t flags);
/*
+ * Create a secondary uma zone
+ *
+ * Arguments:
+ * name The text name of the zone for debugging and stats; this memory
+ * should not be freed until the zone has been deallocated.
+ * ctor The constructor that is called when the object is allocated
+ * dtor The destructor that is called when the object is freed.
+ * zinit An initializer that sets up the initial state of the memory
+ * as the object passes from the Keg's slab to the Zone's cache.
+ * zfini A discard function that undoes initialization done by zinit
+ * as the object passes from the Zone's cache to the Keg's slab.
+ *
+ * ctor/dtor/zinit/zfini may all be null, see notes above.
+ * Note that the zinit and zfini specified here are NOT
+ * exactly the same as the init/fini specified to uma_zcreate()
+ * when creating a master zone. These zinit/zfini are called
+ * on the TRANSITION from keg to zone (and vice-versa). Once
+ * these are set, the primary zone may alter its init/fini
+ * (which are called when the object passes from VM to keg)
+ * using uma_zone_set_init/fini() as well as its own
+ * zinit/zfini (unset by default for master zone) with
+ * uma_zone_set_zinit/zfini() (note subtle 'z' prefix).
+ *
+ * align A bitmask that corresponds to the requested alignment,
+ * e.g. a request for 4-byte alignment would be 0x3
+ * flags A set of parameters that control the behavior of the zone
+ *
+ * Returns:
+ * A pointer to a structure which is intended to be opaque to users of
+ * the interface. The value may be null if the wait flag is not set.
+ */
+uma_zone_t uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor,
+ uma_init zinit, uma_fini zfini, uma_zone_t master);
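
To tie the pieces together, here is a hedged sketch of the intended usage pattern: a master zone created with uma_zcreate() and a secondary zone stacked on the same keg with uma_zsecond_create(). The struct foo, the zone names and foo_zinit() are invented for the example; the real consumers of this interface in this patch are the mbuf zones (zone_mbuf, zone_clust, zone_pack) declared in sys/sys/mbuf.h.

/* Illustrative sketch only: a master zone plus a secondary zone. */
#include <sys/param.h>
#include <sys/systm.h>
#include <vm/uma.h>

struct foo {
	int	f_cached;
	char	f_data[128];
};

static uma_zone_t zone_foo;		/* master: owns the keg and slabs */
static uma_zone_t zone_foo_cached;	/* secondary: stacked on foo's keg */

/* Runs as an item passes from the keg's slab into the secondary zone. */
static void
foo_zinit(void *mem, int size)
{
	struct foo *f = mem;

	f->f_cached = 1;
}

static void
example_zone_setup(void)
{

	zone_foo = uma_zcreate("foo", sizeof(struct foo),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	zone_foo_cached = uma_zsecond_create("foo_cached",
	    NULL, NULL, foo_zinit, NULL, zone_foo);
}
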
+
+/*
* Definitions for uma_zcreate flags
*
* These flags share space with UMA_ZFLAGs in uma_int.h. Be careful not to
@@ -185,6 +219,9 @@ uma_zone_t uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
* Use a hash table instead of caching
* information in the vm_page.
*/
+#define UMA_ZONE_SECONDARY 0x0200 /* Zone is a Secondary Zone */
+#define UMA_ZONE_REFCNT 0x0400 /* Allocate refcnts in slabs */
+#define UMA_ZONE_MAXBUCKET 0x0800 /* Use largest buckets */
/* Definitions for align */
#define UMA_ALIGN_PTR (sizeof(void *) - 1) /* Alignment fit for ptr */
@@ -201,7 +238,6 @@ uma_zone_t uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
* zone The zone we want to destroy.
*
*/
-
void uma_zdestroy(uma_zone_t zone);
/*
@@ -376,6 +412,28 @@ int uma_zone_set_obj(uma_zone_t zone, struct vm_object *obj, int size);
void uma_zone_set_max(uma_zone_t zone, int nitems);
/*
+ * The following two routines (uma_zone_set_init/fini)
+ * are used to set the backend init/fini pair which acts on an
+ * object as it becomes allocated and is placed in a slab within
+ * the specified zone's backing keg. These should be set
+ * immediately upon zone creation and should probably not be
+ * changed once allocations have already begun.
+ */
+void uma_zone_set_init(uma_zone_t zone, uma_init uminit);
+void uma_zone_set_fini(uma_zone_t zone, uma_fini fini);
+
+/*
+ * The following two routines (uma_zone_set_zinit/zfini) are
+ * used to set the zinit/zfini pair which acts on an object as
+ * it passes from the backing Keg's slab cache to the
+ * specified Zone's bucket cache. These should be set
+ * immediately upon zone creation and should probably not be
+ * changed once allocations have already begun.
+ */
+void uma_zone_set_zinit(uma_zone_t zone, uma_init zinit);
+void uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini);
+
+/*
* Replaces the standard page_alloc or obj_alloc functions for this zone
*
* Arguments:
@@ -430,5 +488,19 @@ void uma_zone_set_freef(uma_zone_t zone, uma_free freef);
*/
void uma_prealloc(uma_zone_t zone, int itemcnt);
+/*
+ * Used to look up the reference counter allocated for an item
+ * from a UMA_ZONE_REFCNT zone. For UMA_ZONE_REFCNT zones,
+ * reference counters are allocated for items and stored in
+ * the underlying slab header.
+ *
+ * Arguments:
+ * zone The UMA_ZONE_REFCNT zone to which the item belongs.
+ * item The address of the item for which we want a refcnt.
+ *
+ * Returns:
+ * A pointer to a u_int32_t reference counter.
+ */
+u_int32_t *uma_find_refcnt(uma_zone_t zone, void *item);
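
A hedged sketch of how a UMA_ZONE_REFCNT zone and uma_find_refcnt() fit together (the zone name and helpers are invented; the real user in this patch is the cluster zone, whose counter is fetched the same way in m_extadd() in kern/uipc_mbuf.c):

/* Illustrative sketch only: per-item refcounts from a REFCNT zone. */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <vm/uma.h>

static uma_zone_t zone_refbuf;

static void
example_refzone_setup(void)
{

	zone_refbuf = uma_zcreate("refbufs", 2048, NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
}

static void *
example_refbuf_get(void)
{
	u_int32_t *refcnt;
	void *item;

	item = uma_zalloc(zone_refbuf, M_NOWAIT);
	if (item == NULL)
		return (NULL);
	/* The counter lives in the slab header; look it up and take a ref. */
	refcnt = uma_find_refcnt(zone_refbuf, item);
	*refcnt = 1;
	return (item);
}
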
#endif
diff --git a/sys/vm/uma_core.c b/sys/vm/uma_core.c
index f693540..82d60c6 100644
--- a/sys/vm/uma_core.c
+++ b/sys/vm/uma_core.c
@@ -84,15 +84,19 @@ __FBSDID("$FreeBSD$");
#include <machine/vmparam.h>
/*
- * This is the zone from which all zones are spawned. The idea is that even
- * the zone heads are allocated from the allocator, so we use the bss section
- * to bootstrap us.
+ * This is the zone and keg from which all zones are spawned. The idea is that
+ * even the zone & keg heads are allocated from the allocator, so we use the
+ * bss section to bootstrap us.
*/
-static struct uma_zone masterzone;
-static uma_zone_t zones = &masterzone;
+static struct uma_keg masterkeg;
+static struct uma_zone masterzone_k;
+static struct uma_zone masterzone_z;
+static uma_zone_t kegs = &masterzone_k;
+static uma_zone_t zones = &masterzone_z;
/* This is the zone from which all of uma_slab_t's are allocated. */
static uma_zone_t slabzone;
+static uma_zone_t slabrefzone; /* With refcounters (for UMA_ZONE_REFCNT) */
/*
* The initial hash tables come out of this zone so they can be allocated
@@ -107,10 +111,10 @@ static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
*/
static int bucketdisable = 1;
-/* Linked list of all zones in the system */
-static LIST_HEAD(,uma_zone) uma_zones = LIST_HEAD_INITIALIZER(&uma_zones);
+/* Linked list of all kegs in the system */
+static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(&uma_kegs);
-/* This mutex protects the zone list */
+/* This mutex protects the keg list */
static struct mtx uma_mtx;
/* These are the pcpu cache locks */
@@ -144,6 +148,16 @@ struct uma_zctor_args {
uma_dtor dtor;
uma_init uminit;
uma_fini fini;
+ uma_keg_t keg;
+ int align;
+ u_int16_t flags;
+};
+
+struct uma_kctor_args {
+ uma_zone_t zone;
+ size_t size;
+ uma_init uminit;
+ uma_fini fini;
int align;
u_int16_t flags;
};
@@ -179,6 +193,8 @@ static uma_slab_t slab_zalloc(uma_zone_t, int);
static void cache_drain(uma_zone_t);
static void bucket_drain(uma_zone_t, uma_bucket_t);
static void bucket_cache_drain(uma_zone_t zone);
+static void keg_ctor(void *, int, void *);
+static void keg_dtor(void *, int, void *);
static void zone_ctor(void *, int, void *);
static void zone_dtor(void *, int, void *);
static void zero_init(void *, int);
@@ -202,6 +218,8 @@ static int uma_zalloc_bucket(uma_zone_t zone, int flags);
static uma_slab_t uma_zone_slab(uma_zone_t zone, int flags);
static void *uma_slab_alloc(uma_zone_t zone, uma_slab_t slab);
static void zone_drain(uma_zone_t);
+static void uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
+ uma_fini fini, int align, u_int16_t flags);
void uma_print_zone(uma_zone_t);
void uma_print_stats(void);
@@ -328,10 +346,12 @@ uma_timeout(void *unused)
static void
zone_timeout(uma_zone_t zone)
{
+ uma_keg_t keg;
uma_cache_t cache;
u_int64_t alloc;
int cpu;
+ keg = zone->uz_keg;
alloc = 0;
/*
@@ -344,7 +364,7 @@ zone_timeout(uma_zone_t zone)
* to lock and do it here instead so that the statistics don't get too
* far out of sync.
*/
- if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL)) {
+ if (!(keg->uk_flags & UMA_ZFLAG_INTERNAL)) {
for (cpu = 0; cpu <= mp_maxid; cpu++) {
if (CPU_ABSENT(cpu))
continue;
@@ -369,8 +389,8 @@ zone_timeout(uma_zone_t zone)
* may be a little aggressive. Should I allow for two collisions max?
*/
- if (zone->uz_flags & UMA_ZONE_HASH &&
- zone->uz_pages / zone->uz_ppera >= zone->uz_hash.uh_hashsize) {
+ if (keg->uk_flags & UMA_ZONE_HASH &&
+ keg->uk_pages / keg->uk_ppera >= keg->uk_hash.uh_hashsize) {
struct uma_hash newhash;
struct uma_hash oldhash;
int ret;
@@ -381,14 +401,14 @@ zone_timeout(uma_zone_t zone)
* I have to do everything in stages and check for
* races.
*/
- newhash = zone->uz_hash;
+ newhash = keg->uk_hash;
ZONE_UNLOCK(zone);
ret = hash_alloc(&newhash);
ZONE_LOCK(zone);
if (ret) {
- if (hash_expand(&zone->uz_hash, &newhash)) {
- oldhash = zone->uz_hash;
- zone->uz_hash = newhash;
+ if (hash_expand(&keg->uk_hash, &newhash)) {
+ oldhash = keg->uk_hash;
+ keg->uk_hash = newhash;
} else
oldhash = newhash;
@@ -530,7 +550,7 @@ bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
mzone = 0;
/* We have to lookup the slab again for malloc.. */
- if (zone->uz_flags & UMA_ZONE_MALLOC)
+ if (zone->uz_keg->uk_flags & UMA_ZONE_MALLOC)
mzone = 1;
while (bucket->ub_cnt > 0) {
@@ -636,29 +656,32 @@ static void
zone_drain(uma_zone_t zone)
{
struct slabhead freeslabs = {};
+ uma_keg_t keg;
uma_slab_t slab;
uma_slab_t n;
u_int8_t flags;
u_int8_t *mem;
int i;
+ keg = zone->uz_keg;
+
/*
- * We don't want to take pages from staticly allocated zones at this
+ * We don't want to take pages from statically allocated zones at this
* time
*/
- if (zone->uz_flags & UMA_ZONE_NOFREE || zone->uz_freef == NULL)
+ if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL)
return;
ZONE_LOCK(zone);
#ifdef UMA_DEBUG
- printf("%s free items: %u\n", zone->uz_name, zone->uz_free);
+ printf("%s free items: %u\n", zone->uz_name, keg->uk_free);
#endif
bucket_cache_drain(zone);
- if (zone->uz_free == 0)
+ if (keg->uk_free == 0)
goto finished;
- slab = LIST_FIRST(&zone->uz_free_slab);
+ slab = LIST_FIRST(&keg->uk_free_slab);
while (slab) {
n = LIST_NEXT(slab, us_link);
@@ -669,11 +692,11 @@ zone_drain(uma_zone_t zone)
}
LIST_REMOVE(slab, us_link);
- zone->uz_pages -= zone->uz_ppera;
- zone->uz_free -= zone->uz_ipers;
+ keg->uk_pages -= keg->uk_ppera;
+ keg->uk_free -= keg->uk_ipers;
- if (zone->uz_flags & UMA_ZONE_HASH)
- UMA_HASH_REMOVE(&zone->uz_hash, slab, slab->us_data);
+ if (keg->uk_flags & UMA_ZONE_HASH)
+ UMA_HASH_REMOVE(&keg->uk_hash, slab, slab->us_data);
SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink);
@@ -684,34 +707,34 @@ finished:
while ((slab = SLIST_FIRST(&freeslabs)) != NULL) {
SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink);
- if (zone->uz_fini)
- for (i = 0; i < zone->uz_ipers; i++)
- zone->uz_fini(
- slab->us_data + (zone->uz_rsize * i),
- zone->uz_size);
+ if (keg->uk_fini)
+ for (i = 0; i < keg->uk_ipers; i++)
+ keg->uk_fini(
+ slab->us_data + (keg->uk_rsize * i),
+ keg->uk_size);
flags = slab->us_flags;
mem = slab->us_data;
- if (zone->uz_flags & UMA_ZONE_OFFPAGE)
- uma_zfree_internal(slabzone, slab, NULL, 0);
- if (zone->uz_flags & UMA_ZONE_MALLOC) {
+ if ((keg->uk_flags & UMA_ZONE_MALLOC) ||
+ (keg->uk_flags & UMA_ZONE_REFCNT)) {
vm_object_t obj;
if (flags & UMA_SLAB_KMEM)
obj = kmem_object;
else
obj = NULL;
- for (i = 0; i < zone->uz_ppera; i++)
+ for (i = 0; i < keg->uk_ppera; i++)
vsetobj((vm_offset_t)mem + (i * PAGE_SIZE),
obj);
}
+ if (keg->uk_flags & UMA_ZONE_OFFPAGE)
+ uma_zfree_internal(keg->uk_slabzone, slab, NULL, 0);
#ifdef UMA_DEBUG
printf("%s: Returning %d bytes.\n",
- zone->uz_name, UMA_SLAB_SIZE * zone->uz_ppera);
+ zone->uz_name, UMA_SLAB_SIZE * keg->uk_ppera);
#endif
- zone->uz_freef(mem, UMA_SLAB_SIZE * zone->uz_ppera, flags);
+ keg->uk_freef(mem, UMA_SLAB_SIZE * keg->uk_ppera, flags);
}
-
}
/*
@@ -728,20 +751,23 @@ finished:
static uma_slab_t
slab_zalloc(uma_zone_t zone, int wait)
{
- uma_slab_t slab; /* Starting slab */
+ uma_slabrefcnt_t slabref;
+ uma_slab_t slab;
+ uma_keg_t keg;
u_int8_t *mem;
u_int8_t flags;
int i;
slab = NULL;
+ keg = zone->uz_keg;
#ifdef UMA_DEBUG
printf("slab_zalloc: Allocating a new slab for %s\n", zone->uz_name);
#endif
ZONE_UNLOCK(zone);
- if (zone->uz_flags & UMA_ZONE_OFFPAGE) {
- slab = uma_zalloc_internal(slabzone, NULL, wait);
+ if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
+ slab = uma_zalloc_internal(keg->uk_slabzone, NULL, wait);
if (slab == NULL) {
ZONE_LOCK(zone);
return NULL;
@@ -755,12 +781,12 @@ slab_zalloc(uma_zone_t zone, int wait)
* Malloced items are zeroed in uma_zalloc.
*/
- if ((zone->uz_flags & UMA_ZONE_MALLOC) == 0)
+ if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
wait |= M_ZERO;
else
wait &= ~M_ZERO;
- mem = zone->uz_allocf(zone, zone->uz_ppera * UMA_SLAB_SIZE,
+ mem = keg->uk_allocf(zone, keg->uk_ppera * UMA_SLAB_SIZE,
&flags, wait);
if (mem == NULL) {
ZONE_LOCK(zone);
@@ -768,32 +794,39 @@ slab_zalloc(uma_zone_t zone, int wait)
}
/* Point the slab into the allocated memory */
- if (!(zone->uz_flags & UMA_ZONE_OFFPAGE))
- slab = (uma_slab_t )(mem + zone->uz_pgoff);
+ if (!(keg->uk_flags & UMA_ZONE_OFFPAGE))
+ slab = (uma_slab_t )(mem + keg->uk_pgoff);
- if (zone->uz_flags & UMA_ZONE_MALLOC)
- for (i = 0; i < zone->uz_ppera; i++)
+ if ((keg->uk_flags & UMA_ZONE_MALLOC) ||
+ (keg->uk_flags & UMA_ZONE_REFCNT))
+ for (i = 0; i < keg->uk_ppera; i++)
vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab);
- slab->us_zone = zone;
+ slab->us_keg = keg;
slab->us_data = mem;
- slab->us_freecount = zone->uz_ipers;
+ slab->us_freecount = keg->uk_ipers;
slab->us_firstfree = 0;
slab->us_flags = flags;
- for (i = 0; i < zone->uz_ipers; i++)
- slab->us_freelist[i] = i+1;
+ for (i = 0; i < keg->uk_ipers; i++)
+ slab->us_freelist[i].us_item = i+1;
- if (zone->uz_init)
- for (i = 0; i < zone->uz_ipers; i++)
- zone->uz_init(slab->us_data + (zone->uz_rsize * i),
- zone->uz_size);
+ if (keg->uk_flags & UMA_ZONE_REFCNT) {
+ slabref = (uma_slabrefcnt_t)slab;
+ for (i = 0; i < keg->uk_ipers; i++)
+ slabref->us_freelist[i].us_refcnt = 0;
+ }
+
+ if (keg->uk_init)
+ for (i = 0; i < keg->uk_ipers; i++)
+ keg->uk_init(slab->us_data + (keg->uk_rsize * i),
+ keg->uk_size);
ZONE_LOCK(zone);
- if (zone->uz_flags & UMA_ZONE_HASH)
- UMA_HASH_INSERT(&zone->uz_hash, slab, mem);
+ if (keg->uk_flags & UMA_ZONE_HASH)
+ UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
- zone->uz_pages += zone->uz_ppera;
- zone->uz_free += zone->uz_ipers;
+ keg->uk_pages += keg->uk_ppera;
+ keg->uk_free += keg->uk_ipers;
return (slab);
}
@@ -806,6 +839,10 @@ slab_zalloc(uma_zone_t zone, int wait)
static void *
startup_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait)
{
+ uma_keg_t keg;
+
+ keg = zone->uz_keg;
+
/*
* Check our small startup cache to see if it has pages remaining.
*/
@@ -827,11 +864,11 @@ startup_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait)
* Now that we've booted reset these users to their real allocator.
*/
#ifdef UMA_MD_SMALL_ALLOC
- zone->uz_allocf = uma_small_alloc;
+ keg->uk_allocf = uma_small_alloc;
#else
- zone->uz_allocf = page_alloc;
+ keg->uk_allocf = page_alloc;
#endif
- return zone->uz_allocf(zone, bytes, pflag, wait);
+ return keg->uk_allocf(zone, bytes, pflag, wait);
}
/*
@@ -877,7 +914,7 @@ obj_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
vm_page_t p;
int pages, startpages;
- object = zone->uz_obj;
+ object = zone->uz_keg->uk_obj;
retkva = 0;
/*
@@ -887,7 +924,7 @@ obj_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
p = TAILQ_LAST(&object->memq, pglist);
pages = p != NULL ? p->pindex + 1 : 0;
startpages = pages;
- zkva = zone->uz_kva + pages * PAGE_SIZE;
+ zkva = zone->uz_keg->uk_kva + pages * PAGE_SIZE;
for (; bytes > 0; bytes -= PAGE_SIZE) {
p = vm_page_alloc(object, pages,
VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED);
@@ -965,29 +1002,33 @@ zero_init(void *mem, int size)
static void
zone_small_init(uma_zone_t zone)
{
+ uma_keg_t keg;
int rsize;
int memused;
int ipers;
- rsize = zone->uz_size;
+ keg = zone->uz_keg;
+ KASSERT(keg != NULL, ("Keg is null in zone_small_init"));
+ rsize = keg->uk_size;
if (rsize < UMA_SMALLEST_UNIT)
rsize = UMA_SMALLEST_UNIT;
- if (rsize & zone->uz_align)
- rsize = (rsize & ~zone->uz_align) + (zone->uz_align + 1);
+ if (rsize & keg->uk_align)
+ rsize = (rsize & ~keg->uk_align) + (keg->uk_align + 1);
- zone->uz_rsize = rsize;
+ keg->uk_rsize = rsize;
rsize += 1; /* Account for the byte of linkage */
- zone->uz_ipers = (UMA_SLAB_SIZE - sizeof(struct uma_slab)) / rsize;
- zone->uz_ppera = 1;
+ keg->uk_ipers = (UMA_SLAB_SIZE - sizeof(struct uma_slab)) / rsize;
+ keg->uk_ppera = 1;
- KASSERT(zone->uz_ipers != 0, ("zone_small_init: ipers is 0, uh-oh!"));
- memused = zone->uz_ipers * zone->uz_rsize;
+ KASSERT(keg->uk_ipers != 0, ("zone_small_init: ipers is 0, uh-oh!"));
+ memused = keg->uk_ipers * keg->uk_rsize;
/* Can we do any better? */
- if ((UMA_SLAB_SIZE - memused) >= UMA_MAX_WASTE) {
+ if ((keg->uk_flags & UMA_ZONE_REFCNT) ||
+ ((UMA_SLAB_SIZE - memused) >= UMA_MAX_WASTE)) {
/*
* We can't do this if we're internal or if we've been
* asked to not go to the VM for buckets. If we do this we
@@ -995,15 +1036,16 @@ zone_small_init(uma_zone_t zone)
* do not want to do if we're UMA_ZFLAG_CACHEONLY as a
* result of UMA_ZONE_VM, which clearly forbids it.
*/
- if ((zone->uz_flags & UMA_ZFLAG_INTERNAL) ||
- (zone->uz_flags & UMA_ZFLAG_CACHEONLY))
+ if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) ||
+ (keg->uk_flags & UMA_ZFLAG_CACHEONLY))
return;
- ipers = UMA_SLAB_SIZE / zone->uz_rsize;
- if (ipers > zone->uz_ipers) {
- zone->uz_flags |= UMA_ZONE_OFFPAGE;
- if ((zone->uz_flags & UMA_ZONE_MALLOC) == 0)
- zone->uz_flags |= UMA_ZONE_HASH;
- zone->uz_ipers = ipers;
+ ipers = UMA_SLAB_SIZE / keg->uk_rsize;
+ if ((keg->uk_flags & UMA_ZONE_REFCNT) ||
+ (ipers > keg->uk_ipers)) {
+ keg->uk_flags |= UMA_ZONE_OFFPAGE;
+ if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
+ keg->uk_flags |= UMA_ZONE_HASH;
+ keg->uk_ipers = ipers;
}
}
}
@@ -1022,179 +1064,298 @@ zone_small_init(uma_zone_t zone)
static void
zone_large_init(uma_zone_t zone)
{
+ uma_keg_t keg;
int pages;
- KASSERT((zone->uz_flags & UMA_ZFLAG_CACHEONLY) == 0,
+ keg = zone->uz_keg;
+
+ KASSERT(keg != NULL, ("Keg is null in zone_large_init"));
+ KASSERT((keg->uk_flags & UMA_ZFLAG_CACHEONLY) == 0,
("zone_large_init: Cannot large-init a UMA_ZFLAG_CACHEONLY zone"));
- pages = zone->uz_size / UMA_SLAB_SIZE;
+ pages = keg->uk_size / UMA_SLAB_SIZE;
/* Account for remainder */
- if ((pages * UMA_SLAB_SIZE) < zone->uz_size)
+ if ((pages * UMA_SLAB_SIZE) < keg->uk_size)
pages++;
- zone->uz_ppera = pages;
- zone->uz_ipers = 1;
+ keg->uk_ppera = pages;
+ keg->uk_ipers = 1;
- zone->uz_flags |= UMA_ZONE_OFFPAGE;
- if ((zone->uz_flags & UMA_ZONE_MALLOC) == 0)
- zone->uz_flags |= UMA_ZONE_HASH;
+ keg->uk_flags |= UMA_ZONE_OFFPAGE;
+ if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
+ keg->uk_flags |= UMA_ZONE_HASH;
- zone->uz_rsize = zone->uz_size;
+ keg->uk_rsize = keg->uk_size;
}
/*
- * Zone header ctor. This initializes all fields, locks, etc. And inserts
- * the zone onto the global zone list.
+ * Keg header ctor. This initializes all fields, locks, etc. And inserts
+ * the keg onto the global keg list.
*
* Arguments/Returns follow uma_ctor specifications
- * udata Actually uma_zcreat_args
+ * udata Actually uma_kctor_args
*/
-
static void
-zone_ctor(void *mem, int size, void *udata)
+keg_ctor(void *mem, int size, void *udata)
{
- struct uma_zctor_args *arg = udata;
- uma_zone_t zone = mem;
- int privlc;
+ struct uma_kctor_args *arg = udata;
+ uma_keg_t keg = mem;
+ uma_zone_t zone;
- bzero(zone, size);
- zone->uz_name = arg->name;
- zone->uz_size = arg->size;
- zone->uz_ctor = arg->ctor;
- zone->uz_dtor = arg->dtor;
- zone->uz_init = arg->uminit;
- zone->uz_fini = arg->fini;
- zone->uz_align = arg->align;
- zone->uz_free = 0;
- zone->uz_pages = 0;
- zone->uz_flags = arg->flags;
- zone->uz_allocf = page_alloc;
- zone->uz_freef = page_free;
+ bzero(keg, size);
+ keg->uk_size = arg->size;
+ keg->uk_init = arg->uminit;
+ keg->uk_fini = arg->fini;
+ keg->uk_align = arg->align;
+ keg->uk_free = 0;
+ keg->uk_pages = 0;
+ keg->uk_flags = arg->flags;
+ keg->uk_allocf = page_alloc;
+ keg->uk_freef = page_free;
+ keg->uk_recurse = 0;
+ keg->uk_slabzone = NULL;
- if (arg->flags & UMA_ZONE_ZINIT)
- zone->uz_init = zero_init;
+ /*
+ * The master zone is passed to us at keg-creation time.
+ */
+ zone = arg->zone;
+ zone->uz_keg = keg;
if (arg->flags & UMA_ZONE_VM)
- zone->uz_flags |= UMA_ZFLAG_CACHEONLY;
+ keg->uk_flags |= UMA_ZFLAG_CACHEONLY;
+
+ if (arg->flags & UMA_ZONE_ZINIT)
+ keg->uk_init = zero_init;
/*
- * XXX:
- * The +1 byte added to uz_size is to account for the byte of
+ * The +1 byte added to uk_size is to account for the byte of
* linkage that is added to the size in zone_small_init(). If
* we don't account for this here then we may end up in
* zone_small_init() with a calculated 'ipers' of 0.
*/
- if ((zone->uz_size+1) > (UMA_SLAB_SIZE - sizeof(struct uma_slab)))
+ if ((keg->uk_size+1) > (UMA_SLAB_SIZE - sizeof(struct uma_slab)))
zone_large_init(zone);
else
zone_small_init(zone);
+
+ if (keg->uk_flags & UMA_ZONE_REFCNT)
+ keg->uk_slabzone = slabrefzone;
+ else if (keg->uk_flags & UMA_ZONE_OFFPAGE)
+ keg->uk_slabzone = slabzone;
+
/*
* If we haven't booted yet we need allocations to go through the
* startup cache until the vm is ready.
*/
- if (zone->uz_ppera == 1) {
+ if (keg->uk_ppera == 1) {
#ifdef UMA_MD_SMALL_ALLOC
- zone->uz_allocf = uma_small_alloc;
- zone->uz_freef = uma_small_free;
+ keg->uk_allocf = uma_small_alloc;
+ keg->uk_freef = uma_small_free;
#endif
if (booted == 0)
- zone->uz_allocf = startup_alloc;
+ keg->uk_allocf = startup_alloc;
}
+
+ /*
+ * Initialize keg's lock (shared among zones) through
+ * Master zone
+ */
+ zone->uz_lock = &keg->uk_lock;
if (arg->flags & UMA_ZONE_MTXCLASS)
- privlc = 1;
+ ZONE_LOCK_INIT(zone, 1);
else
- privlc = 0;
+ ZONE_LOCK_INIT(zone, 0);
/*
* If we're putting the slab header in the actual page we need to
* figure out where in each page it goes. This calculates a right
* justified offset into the memory on an ALIGN_PTR boundary.
*/
- if (!(zone->uz_flags & UMA_ZONE_OFFPAGE)) {
+ if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) {
int totsize;
/* Size of the slab struct and free list */
- totsize = sizeof(struct uma_slab) + zone->uz_ipers;
+ totsize = sizeof(struct uma_slab) + keg->uk_ipers;
if (totsize & UMA_ALIGN_PTR)
totsize = (totsize & ~UMA_ALIGN_PTR) +
(UMA_ALIGN_PTR + 1);
- zone->uz_pgoff = UMA_SLAB_SIZE - totsize;
- totsize = zone->uz_pgoff + sizeof(struct uma_slab)
- + zone->uz_ipers;
+ keg->uk_pgoff = UMA_SLAB_SIZE - totsize;
+ totsize = keg->uk_pgoff + sizeof(struct uma_slab)
+ + keg->uk_ipers;
/* I don't think it's possible, but I'll make sure anyway */
if (totsize > UMA_SLAB_SIZE) {
printf("zone %s ipers %d rsize %d size %d\n",
- zone->uz_name, zone->uz_ipers, zone->uz_rsize,
- zone->uz_size);
+ zone->uz_name, keg->uk_ipers, keg->uk_rsize,
+ keg->uk_size);
panic("UMA slab won't fit.\n");
}
}
- if (zone->uz_flags & UMA_ZONE_HASH)
- hash_alloc(&zone->uz_hash);
+ if (keg->uk_flags & UMA_ZONE_HASH)
+ hash_alloc(&keg->uk_hash);
#ifdef UMA_DEBUG
printf("%s(%p) size = %d ipers = %d ppera = %d pgoff = %d\n",
zone->uz_name, zone,
- zone->uz_size, zone->uz_ipers,
- zone->uz_ppera, zone->uz_pgoff);
+ keg->uk_size, keg->uk_ipers,
+ keg->uk_ppera, keg->uk_pgoff);
#endif
- ZONE_LOCK_INIT(zone, privlc);
+
+ LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link);
mtx_lock(&uma_mtx);
- LIST_INSERT_HEAD(&uma_zones, zone, uz_link);
+ LIST_INSERT_HEAD(&uma_kegs, keg, uk_link);
mtx_unlock(&uma_mtx);
+}
+
+/*
+ * Zone header ctor. This initializes all fields, locks, etc.
+ *
+ * Arguments/Returns follow uma_ctor specifications
+ * udata Actually uma_zctor_args
+ */
+
+static void
+zone_ctor(void *mem, int size, void *udata)
+{
+ struct uma_zctor_args *arg = udata;
+ uma_zone_t zone = mem;
+ uma_zone_t z;
+ uma_keg_t keg;
+
+ bzero(zone, size);
+ zone->uz_name = arg->name;
+ zone->uz_ctor = arg->ctor;
+ zone->uz_dtor = arg->dtor;
+ zone->uz_init = NULL;
+ zone->uz_fini = NULL;
+ zone->uz_allocs = 0;
+ zone->uz_fills = zone->uz_count = 0;
+
+ if (arg->flags & UMA_ZONE_SECONDARY) {
+ KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
+ keg = arg->keg;
+ zone->uz_keg = keg;
+ zone->uz_init = arg->uminit;
+ zone->uz_fini = arg->fini;
+ zone->uz_lock = &keg->uk_lock;
+ mtx_lock(&uma_mtx);
+ ZONE_LOCK(zone);
+ keg->uk_flags |= UMA_ZONE_SECONDARY;
+ LIST_FOREACH(z, &keg->uk_zones, uz_link) {
+ if (LIST_NEXT(z, uz_link) == NULL) {
+ LIST_INSERT_AFTER(z, zone, uz_link);
+ break;
+ }
+ }
+ ZONE_UNLOCK(zone);
+ mtx_unlock(&uma_mtx);
+ } else if (arg->keg == NULL) {
+ uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
+ arg->align, arg->flags);
+ } else {
+ struct uma_kctor_args karg;
+
+ /* We should only be here from uma_startup() */
+ karg.size = arg->size;
+ karg.uminit = arg->uminit;
+ karg.fini = arg->fini;
+ karg.align = arg->align;
+ karg.flags = arg->flags;
+ karg.zone = zone;
+ keg_ctor(arg->keg, sizeof(struct uma_keg), &karg);
+ }
+ keg = zone->uz_keg;
+ zone->uz_lock = &keg->uk_lock;
/*
* Some internal zones don't have room allocated for the per cpu
* caches. If we're internal, bail out here.
*/
- if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
+ if (keg->uk_flags & UMA_ZFLAG_INTERNAL) {
+ KASSERT((keg->uk_flags & UMA_ZONE_SECONDARY) == 0,
+ ("Secondary zone requested UMA_ZFLAG_INTERNAL"));
return;
+ }
- if (zone->uz_ipers <= BUCKET_MAX)
- zone->uz_count = zone->uz_ipers;
+ if (keg->uk_flags & UMA_ZONE_MAXBUCKET)
+ zone->uz_count = BUCKET_MAX;
+ else if (keg->uk_ipers <= BUCKET_MAX)
+ zone->uz_count = keg->uk_ipers;
else
zone->uz_count = BUCKET_MAX;
}
/*
- * Zone header dtor. This frees all data, destroys locks, frees the hash table
- * and removes the zone from the global list.
+ * Keg header dtor. This frees all data, destroys locks, frees the hash
+ * table and removes the keg from the global list.
*
* Arguments/Returns follow uma_dtor specifications
* udata unused
*/
+static void
+keg_dtor(void *arg, int size, void *udata)
+{
+ uma_keg_t keg;
+ keg = (uma_keg_t)arg;
+ mtx_lock(&keg->uk_lock);
+ if (keg->uk_free != 0) {
+ printf("Freed UMA keg was not empty (%d items). "
+ " Lost %d pages of memory.\n",
+ keg->uk_free, keg->uk_pages);
+ }
+ mtx_unlock(&keg->uk_lock);
+
+ if (keg->uk_flags & UMA_ZONE_HASH)
+ hash_free(&keg->uk_hash);
+
+ mtx_destroy(&keg->uk_lock);
+}
+
+/*
+ * Zone header dtor.
+ *
+ * Arguments/Returns follow uma_dtor specifications
+ * udata unused
+ */
static void
zone_dtor(void *arg, int size, void *udata)
{
uma_zone_t zone;
+ uma_keg_t keg;
zone = (uma_zone_t)arg;
+ keg = zone->uz_keg;
- if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
+ if (!(keg->uk_flags & UMA_ZFLAG_INTERNAL))
cache_drain(zone);
+
mtx_lock(&uma_mtx);
- LIST_REMOVE(zone, uz_link);
zone_drain(zone);
- mtx_unlock(&uma_mtx);
-
- ZONE_LOCK(zone);
- if (zone->uz_free != 0) {
- printf("Zone %s was not empty (%d items). "
- " Lost %d pages of memory.\n",
- zone->uz_name, zone->uz_free, zone->uz_pages);
- uma_print_zone(zone);
+ if (keg->uk_flags & UMA_ZONE_SECONDARY) {
+ LIST_REMOVE(zone, uz_link);
+ /*
+ * XXX there are some races here where
+ * the zone can be drained but zone lock
+ * released and then refilled before we
+	 * remove it... we don't care for now
+ */
+ ZONE_LOCK(zone);
+ if (LIST_EMPTY(&keg->uk_zones))
+ keg->uk_flags &= ~UMA_ZONE_SECONDARY;
+ ZONE_UNLOCK(zone);
+ mtx_unlock(&uma_mtx);
+ } else {
+ LIST_REMOVE(keg, uk_link);
+ LIST_REMOVE(zone, uz_link);
+ mtx_unlock(&uma_mtx);
+ uma_zfree_internal(kegs, keg, NULL, 0);
}
-
- ZONE_UNLOCK(zone);
- if (zone->uz_flags & UMA_ZONE_HASH)
- hash_free(&zone->uz_hash);
-
- ZONE_LOCK_FINI(zone);
+ zone->uz_keg = NULL;
}
+
/*
* Traverses every zone in the system and calls a callback
*
@@ -1208,11 +1369,14 @@ zone_dtor(void *arg, int size, void *udata)
static void
zone_foreach(void (*zfunc)(uma_zone_t))
{
+ uma_keg_t keg;
uma_zone_t zone;
mtx_lock(&uma_mtx);
- LIST_FOREACH(zone, &uma_zones, uz_link)
- zfunc(zone);
+ LIST_FOREACH(keg, &uma_kegs, uk_link) {
+ LIST_FOREACH(zone, &keg->uk_zones, uz_link)
+ zfunc(zone);
+ }
mtx_unlock(&uma_mtx);
}
@@ -1227,25 +1391,23 @@ uma_startup(void *bootmem)
int i;
#ifdef UMA_DEBUG
- printf("Creating uma zone headers zone.\n");
+ printf("Creating uma keg headers zone and keg.\n");
#endif
mtx_init(&uma_mtx, "UMA lock", NULL, MTX_DEF);
- /* "manually" Create the initial zone */
- args.name = "UMA Zones";
- args.size = sizeof(struct uma_zone) +
- (sizeof(struct uma_cache) * (mp_maxid + 1));
- args.ctor = zone_ctor;
- args.dtor = zone_dtor;
+
+ /* "manually" create the initial zone */
+ args.name = "UMA Kegs";
+ args.size = sizeof(struct uma_keg);
+ args.ctor = keg_ctor;
+ args.dtor = keg_dtor;
args.uminit = zero_init;
args.fini = NULL;
+ args.keg = &masterkeg;
args.align = 32 - 1;
args.flags = UMA_ZFLAG_INTERNAL;
/* The initial zone has no Per cpu queues so it's smaller */
- zone_ctor(zones, sizeof(struct uma_zone), &args);
+ zone_ctor(kegs, sizeof(struct uma_zone), &args);
- /* Initialize the pcpu cache lock set once and for all */
- for (i = 0; i <= mp_maxid; i++)
- CPU_LOCK_INIT(i);
#ifdef UMA_DEBUG
printf("Filling boot free list.\n");
#endif
@@ -1258,7 +1420,30 @@ uma_startup(void *bootmem)
}
#ifdef UMA_DEBUG
- printf("Creating slab zone.\n");
+ printf("Creating uma zone headers zone and keg.\n");
+#endif
+ args.name = "UMA Zones";
+ args.size = sizeof(struct uma_zone) +
+ (sizeof(struct uma_cache) * (mp_maxid + 1));
+ args.ctor = zone_ctor;
+ args.dtor = zone_dtor;
+ args.uminit = zero_init;
+ args.fini = NULL;
+ args.keg = NULL;
+ args.align = 32 - 1;
+ args.flags = UMA_ZFLAG_INTERNAL;
+ /* The initial zone has no Per cpu queues so it's smaller */
+ zone_ctor(zones, sizeof(struct uma_zone), &args);
+
+#ifdef UMA_DEBUG
+ printf("Initializing pcpu cache locks.\n");
+#endif
+ /* Initialize the pcpu cache lock set once and for all */
+ for (i = 0; i <= mp_maxid; i++)
+ CPU_LOCK_INIT(i);
+
+#ifdef UMA_DEBUG
+ printf("Creating slab and hash zones.\n");
#endif
/*
@@ -1276,6 +1461,20 @@ uma_startup(void *bootmem)
NULL, NULL, NULL, NULL,
UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
+ /*
+ * We also create a zone for the bigger slabs with reference
+	 * counts in them, to accommodate UMA_ZONE_REFCNT zones.
+ */
+ slabsize = UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt);
+ slabsize /= UMA_MAX_WASTE;
+ slabsize++;
+ slabsize += 4 * slabsize;
+ slabsize += sizeof(struct uma_slab_refcnt);
+ slabrefzone = uma_zcreate("UMA RCntSlabs",
+ slabsize,
+ NULL, NULL, NULL, NULL,
+ UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
+
hashzone = uma_zcreate("UMA Hash",
sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
NULL, NULL, NULL, NULL,
@@ -1321,6 +1520,21 @@ uma_startup3(void)
#endif
}
+static void
+uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
+ int align, u_int16_t flags)
+{
+ struct uma_kctor_args args;
+
+ args.size = size;
+ args.uminit = uminit;
+ args.fini = fini;
+ args.align = align;
+ args.flags = flags;
+ args.zone = zone;
+ zone = uma_zalloc_internal(kegs, &args, M_WAITOK);
+}
+
/* See uma.h */
uma_zone_t
uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
@@ -1338,6 +1552,27 @@ uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
args.fini = fini;
args.align = align;
args.flags = flags;
+ args.keg = NULL;
+
+ return (uma_zalloc_internal(zones, &args, M_WAITOK));
+}
+
+/* See uma.h */
+uma_zone_t
+uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor,
+ uma_init zinit, uma_fini zfini, uma_zone_t master)
+{
+ struct uma_zctor_args args;
+
+ args.name = name;
+ args.size = master->uz_keg->uk_size;
+ args.ctor = ctor;
+ args.dtor = dtor;
+ args.uminit = zinit;
+ args.fini = zfini;
+ args.align = master->uz_keg->uk_align;
+ args.flags = master->uz_keg->uk_flags | UMA_ZONE_SECONDARY;
+ args.keg = master->uz_keg;
return (uma_zalloc_internal(zones, &args, M_WAITOK));
}
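
An illustrative sketch of the new two-level arrangement (the struct foo type, the foo_* callbacks and the zone names are hypothetical, loosely modelled on the mbuf/Packet setup): the master zone created by uma_zcreate() owns the keg, and uma_zsecond_create() stacks a secondary zone with its own ctor/dtor layer on that same keg.

#include <sys/param.h>
#include <vm/uma.h>

struct foo {
	int	f_dummy;			/* placeholder payload */
};

static uma_zone_t foo_zone;			/* master zone, creates the backing keg */
static uma_zone_t foo_pkt_zone;			/* secondary zone stacked on the same keg */

static void
foo_pkt_ctor(void *mem, int size, void *arg)
{
	/* attach whatever extra state a "packet"-style item carries */
}

static void
foo_pkt_dtor(void *mem, int size, void *arg)
{
	/* release that extra state before the item returns to the cache */
}

static void
foo_zones_init(void)
{
	foo_zone = uma_zcreate("foo", sizeof(struct foo),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	foo_pkt_zone = uma_zsecond_create("foo packets",
	    foo_pkt_ctor, foo_pkt_dtor, NULL, NULL, foo_zone);
}

Both zones then draw sizeof(struct foo) items from one slab cache, but each applies its own ctor/dtor layer and keeps its own buckets and per-CPU caches.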
@@ -1357,35 +1592,25 @@ uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
uma_cache_t cache;
uma_bucket_t bucket;
int cpu;
+ int badness = 1;
/* This is the fast path allocation */
#ifdef UMA_DEBUG_ALLOC_1
printf("Allocating one item from %s(%p)\n", zone->uz_name, zone);
#endif
-#ifdef INVARIANTS
- /*
- * To make sure that WAITOK or NOWAIT is set, but not more than
- * one, and check against the API botches that are common.
- * The uma code implies M_WAITOK if M_NOWAIT is not set, so
- * we default to waiting if none of the flags is set.
- */
- cpu = flags & (M_WAITOK | M_NOWAIT | M_DONTWAIT | M_TRYWAIT);
- if (cpu != M_NOWAIT && cpu != M_WAITOK) {
- static struct timeval lasterr;
- static int curerr, once;
- if (once == 0 && ppsratecheck(&lasterr, &curerr, 1)) {
- printf("Bad uma_zalloc flags: %x\n", cpu);
- backtrace();
- once++;
- }
- }
-#endif
if (!(flags & M_NOWAIT)) {
KASSERT(curthread->td_intr_nesting_level == 0,
("malloc(M_WAITOK) in interrupt context"));
- WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
- "malloc() of \"%s\"", zone->uz_name);
+#ifdef WITNESS
+ badness = WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
+ "malloc(M_WAITOK) of \"%s\", forcing M_NOWAIT",
+ zone->uz_name);
+#endif
+ if (badness) {
+ flags &= ~M_WAITOK;
+ flags |= M_NOWAIT;
+ }
}
zalloc_restart:
@@ -1413,9 +1638,9 @@ zalloc_start:
#endif
CPU_UNLOCK(cpu);
if (zone->uz_ctor)
- zone->uz_ctor(item, zone->uz_size, udata);
+ zone->uz_ctor(item,zone->uz_keg->uk_size,udata);
if (flags & M_ZERO)
- bzero(item, zone->uz_size);
+ bzero(item, zone->uz_keg->uk_size);
return (item);
} else if (cache->uc_freebucket) {
/*
@@ -1465,6 +1690,7 @@ zalloc_start:
/* Bump up our uz_count so we get here less */
if (zone->uz_count < BUCKET_MAX)
zone->uz_count++;
+
/*
	 * Now let's just fill a bucket and put it on the free list. If that
	 * works we'll restart the allocation from the beginning.
@@ -1488,6 +1714,9 @@ static uma_slab_t
uma_zone_slab(uma_zone_t zone, int flags)
{
uma_slab_t slab;
+ uma_keg_t keg;
+
+ keg = zone->uz_keg;
/*
* This is to prevent us from recursively trying to allocate
@@ -1498,7 +1727,7 @@ uma_zone_slab(uma_zone_t zone, int flags)
* things happen. So instead we return a NULL bucket, and make
* the code that allocates buckets smart enough to deal with it
*/
- if (zone->uz_flags & UMA_ZFLAG_INTERNAL && zone->uz_recurse != 0)
+ if (keg->uk_flags & UMA_ZFLAG_INTERNAL && keg->uk_recurse != 0)
return (NULL);
slab = NULL;
@@ -1509,14 +1738,14 @@ uma_zone_slab(uma_zone_t zone, int flags)
* used over those that are totally full. This helps to reduce
* fragmentation.
*/
- if (zone->uz_free != 0) {
- if (!LIST_EMPTY(&zone->uz_part_slab)) {
- slab = LIST_FIRST(&zone->uz_part_slab);
+ if (keg->uk_free != 0) {
+ if (!LIST_EMPTY(&keg->uk_part_slab)) {
+ slab = LIST_FIRST(&keg->uk_part_slab);
} else {
- slab = LIST_FIRST(&zone->uz_free_slab);
+ slab = LIST_FIRST(&keg->uk_free_slab);
LIST_REMOVE(slab, us_link);
- LIST_INSERT_HEAD(&zone->uz_part_slab, slab,
- us_link);
+ LIST_INSERT_HEAD(&keg->uk_part_slab, slab,
+ us_link);
}
return (slab);
}
@@ -1527,27 +1756,28 @@ uma_zone_slab(uma_zone_t zone, int flags)
if (flags & M_NOVM)
break;
- if (zone->uz_maxpages &&
- zone->uz_pages >= zone->uz_maxpages) {
- zone->uz_flags |= UMA_ZFLAG_FULL;
+ if (keg->uk_maxpages &&
+ keg->uk_pages >= keg->uk_maxpages) {
+ keg->uk_flags |= UMA_ZFLAG_FULL;
if (flags & M_NOWAIT)
break;
else
- msleep(zone, &zone->uz_lock, PVM,
+ msleep(keg, &keg->uk_lock, PVM,
"zonelimit", 0);
continue;
}
- zone->uz_recurse++;
+ keg->uk_recurse++;
slab = slab_zalloc(zone, flags);
- zone->uz_recurse--;
+ keg->uk_recurse--;
+
/*
* If we got a slab here it's safe to mark it partially used
* and return. We assume that the caller is going to remove
* at least one item.
*/
if (slab) {
- LIST_INSERT_HEAD(&zone->uz_part_slab, slab, us_link);
+ LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);
return (slab);
}
/*
@@ -1564,22 +1794,25 @@ uma_zone_slab(uma_zone_t zone, int flags)
static void *
uma_slab_alloc(uma_zone_t zone, uma_slab_t slab)
{
+ uma_keg_t keg;
void *item;
u_int8_t freei;
+ keg = zone->uz_keg;
+
freei = slab->us_firstfree;
- slab->us_firstfree = slab->us_freelist[freei];
- item = slab->us_data + (zone->uz_rsize * freei);
+ slab->us_firstfree = slab->us_freelist[freei].us_item;
+ item = slab->us_data + (keg->uk_rsize * freei);
slab->us_freecount--;
- zone->uz_free--;
+ keg->uk_free--;
#ifdef INVARIANTS
uma_dbg_alloc(zone, slab, item);
#endif
/* Move this slab to the full list */
if (slab->us_freecount == 0) {
LIST_REMOVE(slab, us_link);
- LIST_INSERT_HEAD(&zone->uz_full_slab, slab, us_link);
+ LIST_INSERT_HEAD(&keg->uk_full_slab, slab, us_link);
}
return (item);
@@ -1590,6 +1823,7 @@ uma_zalloc_bucket(uma_zone_t zone, int flags)
{
uma_bucket_t bucket;
uma_slab_t slab;
+ int16_t saved;
int max;
/*
@@ -1603,7 +1837,7 @@ uma_zalloc_bucket(uma_zone_t zone, int flags)
int bflags;
bflags = (flags & ~M_ZERO);
- if (zone->uz_flags & UMA_ZFLAG_CACHEONLY)
+ if (zone->uz_keg->uk_flags & UMA_ZFLAG_CACHEONLY)
bflags |= M_NOVM;
ZONE_UNLOCK(zone);
@@ -1628,18 +1862,36 @@ uma_zalloc_bucket(uma_zone_t zone, int flags)
max = MIN(bucket->ub_entries, zone->uz_count);
/* Try to keep the buckets totally full */
+ saved = bucket->ub_cnt;
while (bucket->ub_cnt < max &&
(slab = uma_zone_slab(zone, flags)) != NULL) {
while (slab->us_freecount && bucket->ub_cnt < max) {
bucket->ub_bucket[bucket->ub_cnt++] =
uma_slab_alloc(zone, slab);
}
+
/* Don't block on the next fill */
flags |= M_NOWAIT;
}
- zone->uz_fills--;
+ /*
+ * We unlock here because we need to call the zone's init.
+ * It should be safe to unlock because the slab dealt with
+ * above is already on the appropriate list within the keg
+ * and the bucket we filled is not yet on any list, so we
+ * own it.
+ */
+ if (zone->uz_init != NULL) {
+ int i;
+
+ ZONE_UNLOCK(zone);
+ for (i = saved; i < bucket->ub_cnt; i++)
+ zone->uz_init(bucket->ub_bucket[i],
+ zone->uz_keg->uk_size);
+ ZONE_LOCK(zone);
+ }
+ zone->uz_fills--;
if (bucket->ub_cnt != 0) {
LIST_INSERT_HEAD(&zone->uz_full_bucket,
bucket, ub_link);
@@ -1668,10 +1920,12 @@ done:
static void *
uma_zalloc_internal(uma_zone_t zone, void *udata, int flags)
{
+ uma_keg_t keg;
uma_slab_t slab;
void *item;
item = NULL;
+ keg = zone->uz_keg;
#ifdef UMA_DEBUG_ALLOC
printf("INTERNAL: Allocating one item from %s(%p)\n", zone->uz_name, zone);
@@ -1688,10 +1942,18 @@ uma_zalloc_internal(uma_zone_t zone, void *udata, int flags)
ZONE_UNLOCK(zone);
+ /*
+ * We have to call both the zone's init (not the keg's init)
+ * and the zone's ctor. This is because the item is going from
+ * a keg slab directly to the user, and the user is expecting it
+ * to be both zone-init'd as well as zone-ctor'd.
+ */
+ if (zone->uz_init != NULL)
+ zone->uz_init(item, keg->uk_size);
if (zone->uz_ctor != NULL)
- zone->uz_ctor(item, zone->uz_size, udata);
+ zone->uz_ctor(item, keg->uk_size, udata);
if (flags & M_ZERO)
- bzero(item, zone->uz_size);
+ bzero(item, keg->uk_size);
return (item);
}
@@ -1700,6 +1962,7 @@ uma_zalloc_internal(uma_zone_t zone, void *udata, int flags)
void
uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
{
+ uma_keg_t keg;
uma_cache_t cache;
uma_bucket_t bucket;
int bflags;
@@ -1708,6 +1971,8 @@ uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
/* This is the fast path free */
skip = 0;
+ keg = zone->uz_keg;
+
#ifdef UMA_DEBUG_ALLOC_1
printf("Freeing item %p to %s(%p)\n", item, zone->uz_name, zone);
#endif
@@ -1716,11 +1981,11 @@ uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
* a little longer for the limits to be reset.
*/
- if (zone->uz_flags & UMA_ZFLAG_FULL)
+ if (keg->uk_flags & UMA_ZFLAG_FULL)
goto zfree_internal;
if (zone->uz_dtor) {
- zone->uz_dtor(item, zone->uz_size, udata);
+ zone->uz_dtor(item, keg->uk_size, udata);
skip = 1;
}
@@ -1745,7 +2010,7 @@ zfree_start:
bucket->ub_cnt++;
#ifdef INVARIANTS
ZONE_LOCK(zone);
- if (zone->uz_flags & UMA_ZONE_MALLOC)
+ if (keg->uk_flags & UMA_ZONE_MALLOC)
uma_dbg_free(zone, udata, item);
else
uma_dbg_free(zone, NULL, item);
@@ -1810,7 +2075,7 @@ zfree_start:
#endif
bflags = M_NOWAIT;
- if (zone->uz_flags & UMA_ZFLAG_CACHEONLY)
+ if (keg->uk_flags & UMA_ZFLAG_CACHEONLY)
bflags |= M_NOVM;
bucket = bucket_alloc(zone->uz_count, bflags);
if (bucket) {
@@ -1836,7 +2101,7 @@ zfree_internal:
*/
if (skip) {
ZONE_LOCK(zone);
- if (zone->uz_flags & UMA_ZONE_MALLOC)
+ if (keg->uk_flags & UMA_ZONE_MALLOC)
uma_dbg_free(zone, udata, item);
else
uma_dbg_free(zone, NULL, item);
@@ -1846,7 +2111,6 @@ zfree_internal:
uma_zfree_internal(zone, item, udata, skip);
return;
-
}
/*
@@ -1862,20 +2126,25 @@ static void
uma_zfree_internal(uma_zone_t zone, void *item, void *udata, int skip)
{
uma_slab_t slab;
+ uma_keg_t keg;
u_int8_t *mem;
u_int8_t freei;
+ keg = zone->uz_keg;
+
if (!skip && zone->uz_dtor)
- zone->uz_dtor(item, zone->uz_size, udata);
+ zone->uz_dtor(item, keg->uk_size, udata);
+ if (zone->uz_fini)
+ zone->uz_fini(item, keg->uk_size);
ZONE_LOCK(zone);
- if (!(zone->uz_flags & UMA_ZONE_MALLOC)) {
+ if (!(keg->uk_flags & UMA_ZONE_MALLOC)) {
mem = (u_int8_t *)((unsigned long)item & (~UMA_SLAB_MASK));
- if (zone->uz_flags & UMA_ZONE_HASH)
- slab = hash_sfind(&zone->uz_hash, mem);
+ if (keg->uk_flags & UMA_ZONE_HASH)
+ slab = hash_sfind(&keg->uk_hash, mem);
else {
- mem += zone->uz_pgoff;
+ mem += keg->uk_pgoff;
slab = (uma_slab_t)mem;
}
} else {
@@ -1883,36 +2152,36 @@ uma_zfree_internal(uma_zone_t zone, void *item, void *udata, int skip)
}
/* Do we need to remove from any lists? */
- if (slab->us_freecount+1 == zone->uz_ipers) {
+ if (slab->us_freecount+1 == keg->uk_ipers) {
LIST_REMOVE(slab, us_link);
- LIST_INSERT_HEAD(&zone->uz_free_slab, slab, us_link);
+ LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link);
} else if (slab->us_freecount == 0) {
LIST_REMOVE(slab, us_link);
- LIST_INSERT_HEAD(&zone->uz_part_slab, slab, us_link);
+ LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);
}
/* Slab management stuff */
freei = ((unsigned long)item - (unsigned long)slab->us_data)
- / zone->uz_rsize;
+ / keg->uk_rsize;
#ifdef INVARIANTS
if (!skip)
uma_dbg_free(zone, slab, item);
#endif
- slab->us_freelist[freei] = slab->us_firstfree;
+ slab->us_freelist[freei].us_item = slab->us_firstfree;
slab->us_firstfree = freei;
slab->us_freecount++;
/* Zone statistics */
- zone->uz_free++;
+ keg->uk_free++;
- if (zone->uz_flags & UMA_ZFLAG_FULL) {
- if (zone->uz_pages < zone->uz_maxpages)
- zone->uz_flags &= ~UMA_ZFLAG_FULL;
+ if (keg->uk_flags & UMA_ZFLAG_FULL) {
+ if (keg->uk_pages < keg->uk_maxpages)
+ keg->uk_flags &= ~UMA_ZFLAG_FULL;
/* We can handle one more allocation */
- wakeup_one(zone);
+ wakeup_one(keg);
}
ZONE_UNLOCK(zone);
@@ -1922,24 +2191,71 @@ uma_zfree_internal(uma_zone_t zone, void *item, void *udata, int skip)
void
uma_zone_set_max(uma_zone_t zone, int nitems)
{
+ uma_keg_t keg;
+
+ keg = zone->uz_keg;
ZONE_LOCK(zone);
- if (zone->uz_ppera > 1)
- zone->uz_maxpages = nitems * zone->uz_ppera;
+ if (keg->uk_ppera > 1)
+ keg->uk_maxpages = nitems * keg->uk_ppera;
else
- zone->uz_maxpages = nitems / zone->uz_ipers;
+ keg->uk_maxpages = nitems / keg->uk_ipers;
- if (zone->uz_maxpages * zone->uz_ipers < nitems)
- zone->uz_maxpages++;
+ if (keg->uk_maxpages * keg->uk_ipers < nitems)
+ keg->uk_maxpages++;
ZONE_UNLOCK(zone);
}
/* See uma.h */
void
+uma_zone_set_init(uma_zone_t zone, uma_init uminit)
+{
+ ZONE_LOCK(zone);
+ KASSERT(zone->uz_keg->uk_pages == 0,
+ ("uma_zone_set_init on non-empty keg"));
+ zone->uz_keg->uk_init = uminit;
+ ZONE_UNLOCK(zone);
+}
+
+/* See uma.h */
+void
+uma_zone_set_fini(uma_zone_t zone, uma_fini fini)
+{
+ ZONE_LOCK(zone);
+ KASSERT(zone->uz_keg->uk_pages == 0,
+ ("uma_zone_set_fini on non-empty keg"));
+ zone->uz_keg->uk_fini = fini;
+ ZONE_UNLOCK(zone);
+}
+
+/* See uma.h */
+void
+uma_zone_set_zinit(uma_zone_t zone, uma_init zinit)
+{
+ ZONE_LOCK(zone);
+ KASSERT(zone->uz_keg->uk_pages == 0,
+ ("uma_zone_set_zinit on non-empty keg"));
+ zone->uz_init = zinit;
+ ZONE_UNLOCK(zone);
+}
+
+/* See uma.h */
+void
+uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini)
+{
+ ZONE_LOCK(zone);
+ KASSERT(zone->uz_keg->uk_pages == 0,
+ ("uma_zone_set_zfini on non-empty keg"));
+ zone->uz_fini = zfini;
+ ZONE_UNLOCK(zone);
+}
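
Taken together, the four setters above split initialization between the two layers: uma_zone_set_init()/uma_zone_set_fini() install the keg-level routines (uk_init/uk_fini, applied to the backing slab items), while uma_zone_set_zinit()/uma_zone_set_zfini() install the zone-level routines (uz_init/uz_fini, run as items move between the keg and the zone, as in uma_zalloc_bucket() and uma_zfree_internal() above). A hedged sketch with hypothetical obj_* names follows; per the KASSERTs, all four must be called before the zone has allocated any pages.

#include <sys/param.h>
#include <vm/uma.h>

static void obj_slab_init(void *mem, int size) { /* one-time setup of a keg item */ }
static void obj_slab_fini(void *mem, int size) { /* teardown before its slab is freed */ }
static void obj_zone_init(void *mem, int size) { /* setup as an item enters the zone */ }
static void obj_zone_fini(void *mem, int size) { /* teardown as an item returns to the keg */ }

static void
obj_zone_setup(uma_zone_t obj_zone)
{
	uma_zone_set_init(obj_zone, obj_slab_init);	/* -> keg->uk_init */
	uma_zone_set_fini(obj_zone, obj_slab_fini);	/* -> keg->uk_fini */
	uma_zone_set_zinit(obj_zone, obj_zone_init);	/* -> zone->uz_init */
	uma_zone_set_zfini(obj_zone, obj_zone_fini);	/* -> zone->uz_fini */
}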
+
+/* See uma.h */
+void
uma_zone_set_freef(uma_zone_t zone, uma_free freef)
{
ZONE_LOCK(zone);
- zone->uz_freef = freef;
+ zone->uz_keg->uk_freef = freef;
ZONE_UNLOCK(zone);
}
@@ -1948,8 +2264,8 @@ void
uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
{
ZONE_LOCK(zone);
- zone->uz_flags |= UMA_ZFLAG_PRIVALLOC;
- zone->uz_allocf = allocf;
+ zone->uz_keg->uk_flags |= UMA_ZFLAG_PRIVALLOC;
+ zone->uz_keg->uk_allocf = allocf;
ZONE_UNLOCK(zone);
}
@@ -1957,12 +2273,14 @@ uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
int
uma_zone_set_obj(uma_zone_t zone, struct vm_object *obj, int count)
{
- int pages;
+ uma_keg_t keg;
vm_offset_t kva;
+ int pages;
- pages = count / zone->uz_ipers;
+ keg = zone->uz_keg;
+ pages = count / keg->uk_ipers;
- if (pages * zone->uz_ipers < count)
+ if (pages * keg->uk_ipers < count)
pages++;
kva = kmem_alloc_pageable(kernel_map, pages * UMA_SLAB_SIZE);
@@ -1978,11 +2296,11 @@ uma_zone_set_obj(uma_zone_t zone, struct vm_object *obj, int count)
pages, obj);
}
ZONE_LOCK(zone);
- zone->uz_kva = kva;
- zone->uz_obj = obj;
- zone->uz_maxpages = pages;
- zone->uz_allocf = obj_alloc;
- zone->uz_flags |= UMA_ZONE_NOFREE | UMA_ZFLAG_PRIVALLOC;
+ keg->uk_kva = kva;
+ keg->uk_obj = obj;
+ keg->uk_maxpages = pages;
+ keg->uk_allocf = obj_alloc;
+ keg->uk_flags |= UMA_ZONE_NOFREE | UMA_ZFLAG_PRIVALLOC;
ZONE_UNLOCK(zone);
return (1);
}
@@ -1993,20 +2311,41 @@ uma_prealloc(uma_zone_t zone, int items)
{
int slabs;
uma_slab_t slab;
+ uma_keg_t keg;
+ keg = zone->uz_keg;
ZONE_LOCK(zone);
- slabs = items / zone->uz_ipers;
- if (slabs * zone->uz_ipers < items)
+ slabs = items / keg->uk_ipers;
+ if (slabs * keg->uk_ipers < items)
slabs++;
while (slabs > 0) {
slab = slab_zalloc(zone, M_WAITOK);
- LIST_INSERT_HEAD(&zone->uz_free_slab, slab, us_link);
+ LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link);
slabs--;
}
ZONE_UNLOCK(zone);
}
/* See uma.h */
+u_int32_t *
+uma_find_refcnt(uma_zone_t zone, void *item)
+{
+ uma_slabrefcnt_t slab;
+ uma_keg_t keg;
+ u_int32_t *refcnt;
+ int idx;
+
+ keg = zone->uz_keg;
+ slab = (uma_slabrefcnt_t)vtoslab((vm_offset_t)item & (~UMA_SLAB_MASK));
+ KASSERT(slab != NULL,
+ ("uma_find_refcnt(): zone possibly not UMA_ZONE_REFCNT"));
+ idx = ((unsigned long)item - (unsigned long)slab->us_data)
+ / keg->uk_rsize;
+ refcnt = &(slab->us_freelist[idx].us_refcnt);
+ return refcnt;
+}
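
A hedged usage sketch for the reference-count support (the zone name, item size, and M_WAITOK flag below are illustrative assumptions): creating a zone with UMA_ZONE_REFCNT makes UMA reserve a 32-bit counter for each item at the end of the slab, and uma_find_refcnt() hands back a pointer to the counter belonging to a given item, so callers need not allocate counters separately.

#include <sys/param.h>
#include <sys/malloc.h>
#include <vm/uma.h>

static void
refcnt_demo(void)
{
	uma_zone_t ref_zone;
	void *item;
	u_int32_t *cnt;

	ref_zone = uma_zcreate("ref items", 256, NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
	item = uma_zalloc(ref_zone, M_WAITOK);
	cnt = uma_find_refcnt(ref_zone, item);	/* &slab->us_freelist[idx].us_refcnt */
	*cnt = 1;				/* caller manages the count from here on */
}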
+
+/* See uma.h */
void
uma_reclaim(void)
{
@@ -2021,6 +2360,7 @@ uma_reclaim(void)
* zones are drained. We have to do the same for buckets.
*/
zone_drain(slabzone);
+ zone_drain(slabrefzone);
bucket_zone_drain();
}
@@ -2044,7 +2384,6 @@ uma_large_malloc(int size, int wait)
uma_zfree_internal(slabzone, slab, NULL, 0);
}
-
return (mem);
}
@@ -2065,8 +2404,8 @@ uma_print_stats(void)
static void
slab_print(uma_slab_t slab)
{
- printf("slab: zone %p, data %p, freecount %d, firstfree %d\n",
- slab->us_zone, slab->us_data, slab->us_freecount,
+ printf("slab: keg %p, data %p, freecount %d, firstfree %d\n",
+ slab->us_keg, slab->us_data, slab->us_freecount,
slab->us_firstfree);
}
@@ -2084,21 +2423,23 @@ void
uma_print_zone(uma_zone_t zone)
{
uma_cache_t cache;
+ uma_keg_t keg;
uma_slab_t slab;
int i;
+ keg = zone->uz_keg;
printf("%s(%p) size %d(%d) flags %d ipers %d ppera %d out %d free %d\n",
- zone->uz_name, zone, zone->uz_size, zone->uz_rsize, zone->uz_flags,
- zone->uz_ipers, zone->uz_ppera,
- (zone->uz_ipers * zone->uz_pages) - zone->uz_free, zone->uz_free);
+ zone->uz_name, zone, keg->uk_size, keg->uk_rsize, keg->uk_flags,
+ keg->uk_ipers, keg->uk_ppera,
+ (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free);
printf("Part slabs:\n");
- LIST_FOREACH(slab, &zone->uz_part_slab, us_link)
+ LIST_FOREACH(slab, &keg->uk_part_slab, us_link)
slab_print(slab);
printf("Free slabs:\n");
- LIST_FOREACH(slab, &zone->uz_free_slab, us_link)
+ LIST_FOREACH(slab, &keg->uk_free_slab, us_link)
slab_print(slab);
printf("Full slabs:\n");
- LIST_FOREACH(slab, &zone->uz_full_slab, us_link)
+ LIST_FOREACH(slab, &keg->uk_full_slab, us_link)
slab_print(slab);
for (i = 0; i <= mp_maxid; i++) {
if (CPU_ABSENT(i))
@@ -2122,6 +2463,7 @@ sysctl_vm_zone(SYSCTL_HANDLER_ARGS)
int totalfree;
char *tmpbuf, *offset;
uma_zone_t z;
+ uma_keg_t zk;
char *p;
int cpu;
int cachefree;
@@ -2130,8 +2472,10 @@ sysctl_vm_zone(SYSCTL_HANDLER_ARGS)
cnt = 0;
mtx_lock(&uma_mtx);
- LIST_FOREACH(z, &uma_zones, uz_link)
- cnt++;
+ LIST_FOREACH(zk, &uma_kegs, uk_link) {
+ LIST_FOREACH(z, &zk->uk_zones, uz_link)
+ cnt++;
+ }
mtx_unlock(&uma_mtx);
MALLOC(tmpbuf, char *, (cnt == 0 ? 1 : cnt) * linesize,
M_TEMP, M_WAITOK);
@@ -2144,10 +2488,11 @@ sysctl_vm_zone(SYSCTL_HANDLER_ARGS)
goto out;
offset = tmpbuf;
mtx_lock(&uma_mtx);
- LIST_FOREACH(z, &uma_zones, uz_link) {
+ LIST_FOREACH(zk, &uma_kegs, uk_link) {
+ LIST_FOREACH(z, &zk->uk_zones, uz_link) {
if (cnt == 0) /* list may have changed size */
break;
- if (!(z->uz_flags & UMA_ZFLAG_INTERNAL)) {
+ if (!(zk->uk_flags & UMA_ZFLAG_INTERNAL)) {
for (cpu = 0; cpu <= mp_maxid; cpu++) {
if (CPU_ABSENT(cpu))
continue;
@@ -2156,7 +2501,7 @@ sysctl_vm_zone(SYSCTL_HANDLER_ARGS)
}
ZONE_LOCK(z);
cachefree = 0;
- if (!(z->uz_flags & UMA_ZFLAG_INTERNAL)) {
+ if (!(zk->uk_flags & UMA_ZFLAG_INTERNAL)) {
for (cpu = 0; cpu <= mp_maxid; cpu++) {
if (CPU_ABSENT(cpu))
continue;
@@ -2171,12 +2516,12 @@ sysctl_vm_zone(SYSCTL_HANDLER_ARGS)
LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link) {
cachefree += bucket->ub_cnt;
}
- totalfree = z->uz_free + cachefree;
+ totalfree = zk->uk_free + cachefree;
len = snprintf(offset, linesize,
"%-12.12s %6.6u, %8.8u, %6.6u, %6.6u, %8.8llu\n",
- z->uz_name, z->uz_size,
- z->uz_maxpages * z->uz_ipers,
- (z->uz_ipers * (z->uz_pages / z->uz_ppera)) - totalfree,
+ z->uz_name, zk->uk_size,
+ zk->uk_maxpages * zk->uk_ipers,
+ (zk->uk_ipers * (zk->uk_pages / zk->uk_ppera)) - totalfree,
totalfree,
(unsigned long long)z->uz_allocs);
ZONE_UNLOCK(z);
@@ -2185,6 +2530,7 @@ sysctl_vm_zone(SYSCTL_HANDLER_ARGS)
p[1] = ':';
cnt--;
offset += len;
+ }
}
mtx_unlock(&uma_mtx);
*offset++ = '\0';
diff --git a/sys/vm/uma_dbg.c b/sys/vm/uma_dbg.c
index 85d067d..0f845cf 100644
--- a/sys/vm/uma_dbg.c
+++ b/sys/vm/uma_dbg.c
@@ -192,15 +192,17 @@ static uma_slab_t
uma_dbg_getslab(uma_zone_t zone, void *item)
{
uma_slab_t slab;
+ uma_keg_t keg;
u_int8_t *mem;
+ keg = zone->uz_keg;
mem = (u_int8_t *)((unsigned long)item & (~UMA_SLAB_MASK));
- if (zone->uz_flags & UMA_ZONE_MALLOC) {
+ if (keg->uk_flags & UMA_ZONE_MALLOC) {
slab = vtoslab((vm_offset_t)mem);
- } else if (zone->uz_flags & UMA_ZONE_HASH) {
- slab = hash_sfind(&zone->uz_hash, mem);
+ } else if (keg->uk_flags & UMA_ZONE_HASH) {
+ slab = hash_sfind(&keg->uk_hash, mem);
} else {
- mem += zone->uz_pgoff;
+ mem += keg->uk_pgoff;
slab = (uma_slab_t)mem;
}
@@ -215,8 +217,10 @@ uma_dbg_getslab(uma_zone_t zone, void *item)
void
uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
{
+ uma_keg_t keg;
int freei;
+ keg = zone->uz_keg;
if (slab == NULL) {
slab = uma_dbg_getslab(zone, item);
if (slab == NULL)
@@ -225,9 +229,9 @@ uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
}
freei = ((unsigned long)item - (unsigned long)slab->us_data)
- / zone->uz_rsize;
+ / keg->uk_rsize;
- slab->us_freelist[freei] = 255;
+ slab->us_freelist[freei].us_item = 255;
return;
}
@@ -241,8 +245,10 @@ uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
void
uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
{
+ uma_keg_t keg;
int freei;
+ keg = zone->uz_keg;
if (slab == NULL) {
slab = uma_dbg_getslab(zone, item);
if (slab == NULL)
@@ -251,22 +257,22 @@ uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
}
freei = ((unsigned long)item - (unsigned long)slab->us_data)
- / zone->uz_rsize;
+ / keg->uk_rsize;
- if (freei >= zone->uz_ipers)
+ if (freei >= keg->uk_ipers)
panic("zone: %s(%p) slab %p freelist %d out of range 0-%d\n",
- zone->uz_name, zone, slab, freei, zone->uz_ipers-1);
+ zone->uz_name, zone, slab, freei, keg->uk_ipers-1);
- if (((freei * zone->uz_rsize) + slab->us_data) != item) {
+ if (((freei * keg->uk_rsize) + slab->us_data) != item) {
printf("zone: %s(%p) slab %p freed address %p unaligned.\n",
zone->uz_name, zone, slab, item);
panic("should be %p\n",
- (freei * zone->uz_rsize) + slab->us_data);
+ (freei * keg->uk_rsize) + slab->us_data);
}
- if (slab->us_freelist[freei] != 255) {
+ if (slab->us_freelist[freei].us_item != 255) {
printf("Slab at %p, freei %d = %d.\n",
- slab, freei, slab->us_freelist[freei]);
+ slab, freei, slab->us_freelist[freei].us_item);
panic("Duplicate free of item %p from zone %p(%s)\n",
item, zone, zone->uz_name);
}
@@ -276,5 +282,5 @@ uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
* Until then the count of valid slabs will make sure we don't
* accidentally follow this and assume it's a valid index.
*/
- slab->us_freelist[freei] = 0;
+ slab->us_freelist[freei].us_item = 0;
}
diff --git a/sys/vm/uma_int.h b/sys/vm/uma_int.h
index 35acfde..a4cbe5f 100644
--- a/sys/vm/uma_int.h
+++ b/sys/vm/uma_int.h
@@ -35,10 +35,10 @@
/*
* Here's a quick description of the relationship between the objects:
*
- * Zones contain lists of slabs which are stored in either the full bin, empty
+ * Kegs contain lists of slabs which are stored in either the full bin, empty
* bin, or partially allocated bin, to reduce fragmentation. They also contain
* the user supplied value for size, which is adjusted for alignment purposes
- * and rsize is the result of that. The zone also stores information for
+ * and rsize is the result of that. The Keg also stores information for
* managing a hash of page addresses that maps pages to uma_slab_t structures
* for pages that don't have embedded uma_slab_t's.
*
@@ -67,6 +67,20 @@
* so at this time it may not make sense to optimize for it. This can, of
* course, be solved with dynamic slab sizes.
*
+ * Kegs may serve multiple Zones but by far most of the time they only serve
+ * one. When a Zone is created, a Keg is allocated and setup for it. While
+ * the backing Keg stores slabs, the Zone caches Buckets of items allocated
+ * from the slabs. Each Zone is equipped with an init/fini and ctor/dtor
+ * pair, as well as with its own set of small per-CPU caches, layered above
+ * the Zone's general Bucket cache.
+ *
+ * The PCPU caches are protected by their own locks, while the Zones backed
+ * by the same Keg all share a common Keg lock (to coalesce contention on
+ * the backing slabs). The backing Keg typically only serves one Zone but
+ * in the case of multiple Zones, one of the Zones is considered the
+ * Master Zone and all Zone-related stats from the Keg are done in the
+ * Master Zone. For an example of a Multi-Zone setup, refer to the
+ * Mbuf allocation code.
*/
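
To make the layering above concrete, a small hedged sketch (as it would look inside uma_core.c, where uma_kegs and uma_mtx live) of walking the two-level lists, much as zone_foreach() and the vm.zone sysctl handler do: each Keg on the global list carries the list of Zones stacked on it.

static void
keg_walk(void)
{
	uma_keg_t keg;
	uma_zone_t zone;

	mtx_lock(&uma_mtx);
	LIST_FOREACH(keg, &uma_kegs, uk_link) {
		LIST_FOREACH(zone, &keg->uk_zones, uz_link)
			printf("zone %s: item size %u, free in keg %u\n",
			    zone->uz_name, keg->uk_size, keg->uk_free);
	}
	mtx_unlock(&uma_mtx);
}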
/*
@@ -134,28 +148,6 @@
SLIST_REMOVE(&(h)->uh_slab_hash[UMA_HASH((h), \
(mem))], (s), uma_slab, us_hlink);
-/* Page management structure */
-
-/* Sorry for the union, but space efficiency is important */
-struct uma_slab {
- uma_zone_t us_zone; /* Zone we live in */
- union {
- LIST_ENTRY(uma_slab) _us_link; /* slabs in zone */
- unsigned long _us_size; /* Size of allocation */
- } us_type;
- SLIST_ENTRY(uma_slab) us_hlink; /* Link for hash table */
- u_int8_t *us_data; /* First item */
- u_int8_t us_flags; /* Page flags see uma.h */
- u_int8_t us_freecount; /* How many are free? */
- u_int8_t us_firstfree; /* First free item index */
- u_int8_t us_freelist[1]; /* Free List (actually larger) */
-};
-
-#define us_link us_type._us_link
-#define us_size us_type._us_size
-
-typedef struct uma_slab * uma_slab_t;
-
/* Hash table for freed address -> slab translation */
SLIST_HEAD(slabhead, uma_slab);
@@ -188,6 +180,97 @@ struct uma_cache {
typedef struct uma_cache * uma_cache_t;
/*
+ * Keg management structure
+ *
+ * TODO: Optimize for cache line size
+ *
+ */
+struct uma_keg {
+ LIST_ENTRY(uma_keg) uk_link; /* List of all kegs */
+
+ struct mtx uk_lock; /* Lock for the keg */
+ struct uma_hash uk_hash;
+
+ LIST_HEAD(,uma_zone) uk_zones; /* Keg's zones */
+ LIST_HEAD(,uma_slab) uk_part_slab; /* partially allocated slabs */
+ LIST_HEAD(,uma_slab) uk_free_slab; /* empty slab list */
+ LIST_HEAD(,uma_slab) uk_full_slab; /* full slabs */
+
+ u_int32_t uk_recurse; /* Allocation recursion count */
+ u_int32_t uk_align; /* Alignment mask */
+ u_int32_t uk_pages; /* Total page count */
+ u_int32_t uk_free; /* Count of items free in slabs */
+ u_int32_t uk_size; /* Requested size of each item */
+ u_int32_t uk_rsize; /* Real size of each item */
+ u_int32_t uk_maxpages; /* Maximum number of pages to alloc */
+
+ uma_init uk_init; /* Keg's init routine */
+ uma_fini uk_fini; /* Keg's fini routine */
+ uma_alloc uk_allocf; /* Allocation function */
+ uma_free uk_freef; /* Free routine */
+
+ struct vm_object *uk_obj; /* Zone specific object */
+ vm_offset_t uk_kva; /* Base kva for zones with objs */
+ uma_zone_t uk_slabzone; /* Slab zone backing us, if OFFPAGE */
+
+ u_int16_t uk_pgoff; /* Offset to uma_slab struct */
+ u_int16_t uk_ppera; /* pages per allocation from backend */
+ u_int16_t uk_ipers; /* Items per slab */
+ u_int16_t uk_flags; /* Internal flags */
+};
+
+/* Simpler reference to uma_keg for internal use. */
+typedef struct uma_keg * uma_keg_t;
+
+/* Page management structure */
+
+/* Sorry for the union, but space efficiency is important */
+struct uma_slab_head {
+ uma_keg_t us_keg; /* Keg we live in */
+ union {
+ LIST_ENTRY(uma_slab) _us_link; /* slabs in zone */
+ unsigned long _us_size; /* Size of allocation */
+ } us_type;
+ SLIST_ENTRY(uma_slab) us_hlink; /* Link for hash table */
+ u_int8_t *us_data; /* First item */
+ u_int8_t us_flags; /* Page flags see uma.h */
+ u_int8_t us_freecount; /* How many are free? */
+ u_int8_t us_firstfree; /* First free item index */
+};
+
+/* The standard slab structure */
+struct uma_slab {
+ struct uma_slab_head us_head; /* slab header data */
+ struct {
+ u_int8_t us_item;
+ } us_freelist[1]; /* actual number bigger */
+};
+
+/*
+ * The slab structure for UMA_ZONE_REFCNT zones, for whose items we
+ * maintain reference counters in the slab.
+ */
+struct uma_slab_refcnt {
+ struct uma_slab_head us_head; /* slab header data */
+ struct {
+ u_int8_t us_item;
+ u_int32_t us_refcnt;
+ } us_freelist[1]; /* actual number bigger */
+};
+
+#define us_keg us_head.us_keg
+#define us_link us_head.us_type._us_link
+#define us_size us_head.us_type._us_size
+#define us_hlink us_head.us_hlink
+#define us_data us_head.us_data
+#define us_flags us_head.us_flags
+#define us_freecount us_head.us_freecount
+#define us_firstfree us_head.us_firstfree
+
+typedef struct uma_slab * uma_slab_t;
+typedef struct uma_slab_refcnt * uma_slabrefcnt_t;
+
+/*
* Zone management structure
*
* TODO: Optimize for cache line size
@@ -195,42 +278,22 @@ typedef struct uma_cache * uma_cache_t;
*/
struct uma_zone {
char *uz_name; /* Text name of the zone */
- LIST_ENTRY(uma_zone) uz_link; /* List of all zones */
- u_int32_t uz_align; /* Alignment mask */
- u_int32_t uz_pages; /* Total page count */
-
-/* Used during alloc / free */
- struct mtx uz_lock; /* Lock for the zone */
- u_int32_t uz_free; /* Count of items free in slabs */
- u_int16_t uz_ipers; /* Items per slab */
- u_int16_t uz_flags; /* Internal flags */
-
- LIST_HEAD(,uma_slab) uz_part_slab; /* partially allocated slabs */
- LIST_HEAD(,uma_slab) uz_free_slab; /* empty slab list */
- LIST_HEAD(,uma_slab) uz_full_slab; /* full slabs */
+ struct mtx *uz_lock; /* Lock for the zone (keg's lock) */
+ uma_keg_t uz_keg; /* Our underlying Keg */
+
+ LIST_ENTRY(uma_zone) uz_link; /* List of all zones in keg */
LIST_HEAD(,uma_bucket) uz_full_bucket; /* full buckets */
LIST_HEAD(,uma_bucket) uz_free_bucket; /* Buckets for frees */
- u_int32_t uz_size; /* Requested size of each item */
- u_int32_t uz_rsize; /* Real size of each item */
-
- struct uma_hash uz_hash;
- u_int16_t uz_pgoff; /* Offset to uma_slab struct */
- u_int16_t uz_ppera; /* pages per allocation from backend */
uma_ctor uz_ctor; /* Constructor for each allocation */
uma_dtor uz_dtor; /* Destructor */
- u_int64_t uz_allocs; /* Total number of allocations */
-
uma_init uz_init; /* Initializer for each item */
uma_fini uz_fini; /* Discards memory */
- uma_alloc uz_allocf; /* Allocation function */
- uma_free uz_freef; /* Free routine */
- struct vm_object *uz_obj; /* Zone specific object */
- vm_offset_t uz_kva; /* Base kva for zones with objs */
- u_int32_t uz_maxpages; /* Maximum number of pages to alloc */
- int uz_recurse; /* Allocation recursion count */
+
+ u_int64_t uz_allocs; /* Total number of allocations */
uint16_t uz_fills; /* Outstanding bucket fills */
uint16_t uz_count; /* Highest value ub_ptr can have */
+
/*
* This HAS to be the last item because we adjust the zone size
* based on NCPU and then allocate the space for the zones.
@@ -256,16 +319,16 @@ void uma_large_free(uma_slab_t slab);
#define ZONE_LOCK_INIT(z, lc) \
do { \
if ((lc)) \
- mtx_init(&(z)->uz_lock, (z)->uz_name, \
+ mtx_init((z)->uz_lock, (z)->uz_name, \
(z)->uz_name, MTX_DEF | MTX_DUPOK); \
else \
- mtx_init(&(z)->uz_lock, (z)->uz_name, \
+ mtx_init((z)->uz_lock, (z)->uz_name, \
"UMA zone", MTX_DEF | MTX_DUPOK); \
} while (0)
-#define ZONE_LOCK_FINI(z) mtx_destroy(&(z)->uz_lock)
-#define ZONE_LOCK(z) mtx_lock(&(z)->uz_lock)
-#define ZONE_UNLOCK(z) mtx_unlock(&(z)->uz_lock)
+#define ZONE_LOCK_FINI(z) mtx_destroy((z)->uz_lock)
+#define ZONE_LOCK(z) mtx_lock((z)->uz_lock)
+#define ZONE_UNLOCK(z) mtx_unlock((z)->uz_lock)
#define CPU_LOCK_INIT(cpu) \
mtx_init(&uma_pcpu_mtx[(cpu)], "UMA pcpu", "UMA pcpu", \
diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c
index 3e21a99..f71785f 100644
--- a/sys/vm/vm_kern.c
+++ b/sys/vm/vm_kern.c
@@ -320,16 +320,6 @@ kmem_malloc(map, size, flags)
vm_map_lock(map);
if (vm_map_findspace(map, vm_map_min(map), size, &addr)) {
vm_map_unlock(map);
- if (map != kmem_map) {
- static int last_report; /* when we did it (in ticks) */
- if (ticks < last_report ||
- (ticks - last_report) >= hz) {
- last_report = ticks;
- printf("Out of mbuf address space!\n");
- printf("Consider increasing NMBCLUSTERS\n");
- }
- return (0);
- }
if ((flags & M_NOWAIT) == 0)
panic("kmem_malloc(%ld): kmem_map too small: %ld total allocated",
(long)size, (long)map->size);