-rw-r--r--  sys/conf/files                        2
-rw-r--r--  sys/i386/i386/vm_machdep.c            7
-rw-r--r--  sys/kern/kern_malloc.c               27
-rw-r--r--  sys/kern/kern_mbuf.c                385
-rw-r--r--  sys/kern/subr_mbuf.c               1548
-rw-r--r--  sys/kern/uipc_mbuf.c                235
-rw-r--r--  sys/kern/uipc_mbuf2.c                40
-rw-r--r--  sys/kern/uipc_sockbuf.c              13
-rw-r--r--  sys/kern/uipc_socket.c               93
-rw-r--r--  sys/kern/uipc_socket2.c              13
-rw-r--r--  sys/kern/uipc_syscalls.c             16
-rw-r--r--  sys/sparc64/sparc64/vm_machdep.c      7
-rw-r--r--  sys/sys/mbuf.h                      203
-rw-r--r--  sys/vm/uma.h                         78
-rw-r--r--  sys/vm/uma_core.c                   890
-rw-r--r--  sys/vm/uma_dbg.c                     34
-rw-r--r--  sys/vm/uma_int.h                    175
-rw-r--r--  sys/vm/vm_kern.c                     10
-rw-r--r--  usr.bin/netstat/main.c                8
-rw-r--r--  usr.bin/netstat/mbuf.c              196
-rw-r--r--  usr.bin/netstat/netstat.1             4
-rw-r--r--  usr.bin/netstat/netstat.h             1
-rw-r--r--  usr.bin/systat/mbufs.c               53
23 files changed, 1701 insertions, 2337 deletions
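
Allocator consumers are unchanged by this commit: the new Mbuf, Cluster, and Packet zones (see the diagram in kern_mbuf.c below) are still reached through the ordinary mbuf(9) calls. The following consumer-side sketch is not part of the commit; it only assumes the m_getcl()/m_gethdr()/m_clget() interfaces named in that diagram.

/*
 * Hedged consumer-side sketch: exercising the three zones from the
 * kern_mbuf.c diagram through the existing mbuf(9) API.  Not part of
 * this commit; M_DONTWAIT/MT_DATA usage is assumed era-appropriate.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>

static struct mbuf *
example_alloc_packet(void)
{
	struct mbuf *m;

	/* One call hits the Packet secondary zone: mbuf + cluster together. */
	m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
	if (m != NULL)
		return (m);

	/* Fallback: Mbuf master zone first, then attach from the Cluster zone. */
	m = m_gethdr(M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return (NULL);
	m_clget(m, M_DONTWAIT);		/* sets M_EXT on success */
	if ((m->m_flags & M_EXT) == 0) {
		m_freem(m);
		return (NULL);
	}
	return (m);
}
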
diff --git a/sys/conf/files b/sys/conf/files
index c2d7e7e..0d48a92 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -1075,6 +1075,7 @@ kern/kern_lock.c standard
kern/kern_lockf.c standard
kern/kern_mac.c standard
kern/kern_malloc.c standard
+kern/kern_mbuf.c standard
kern/kern_mib.c standard
kern/kern_module.c standard
kern/kern_mutex.c standard
@@ -1116,7 +1117,6 @@ kern/subr_hints.c standard
kern/subr_kobj.c standard
kern/subr_log.c standard
kern/subr_mbpool.c optional libmbpool
-kern/subr_mbuf.c standard
kern/subr_mchain.c optional libmchain
kern/subr_module.c standard
kern/subr_msgbuf.c standard
diff --git a/sys/i386/i386/vm_machdep.c b/sys/i386/i386/vm_machdep.c
index 50fd06e..9a2f9e3 100644
--- a/sys/i386/i386/vm_machdep.c
+++ b/sys/i386/i386/vm_machdep.c
@@ -95,6 +95,10 @@ __FBSDID("$FreeBSD$");
#include <i386/isa/isa.h>
#endif
+#ifndef NSFBUFS
+#define NSFBUFS (512 + maxusers * 16)
+#endif
+
static void cpu_reset_real(void);
#ifdef SMP
static void cpu_reset_proxy(void);
@@ -584,6 +588,9 @@ sf_buf_init(void *arg)
vm_offset_t sf_base;
int i;
+ nsfbufs = NSFBUFS;
+ TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs);
+
sf_buf_active = hashinit(nsfbufs, M_TEMP, &sf_buf_hashmask);
TAILQ_INIT(&sf_buf_freelist);
sf_base = kmem_alloc_nofault(kernel_map, nsfbufs * PAGE_SIZE);
diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c
index c92e70f..4bc3348 100644
--- a/sys/kern/kern_malloc.c
+++ b/sys/kern/kern_malloc.c
@@ -191,6 +191,7 @@ malloc(size, type, flags)
int indx;
caddr_t va;
uma_zone_t zone;
+ uma_keg_t keg;
#ifdef DIAGNOSTIC
unsigned long osize = size;
#endif
@@ -235,6 +236,7 @@ malloc(size, type, flags)
size = (size & ~KMEM_ZMASK) + KMEM_ZBASE;
indx = kmemsize[size >> KMEM_ZSHIFT];
zone = kmemzones[indx].kz_zone;
+ keg = zone->uz_keg;
#ifdef MALLOC_PROFILE
krequests[size >> KMEM_ZSHIFT]++;
#endif
@@ -244,10 +246,11 @@ malloc(size, type, flags)
goto out;
ksp->ks_size |= 1 << indx;
- size = zone->uz_size;
+ size = keg->uk_size;
} else {
size = roundup(size, PAGE_SIZE);
zone = NULL;
+ keg = NULL;
va = uma_large_malloc(size, flags);
mtx_lock(&ksp->ks_mtx);
if (va == NULL)
@@ -309,7 +312,7 @@ free(addr, type)
#ifdef INVARIANTS
struct malloc_type **mtp = addr;
#endif
- size = slab->us_zone->uz_size;
+ size = slab->us_keg->uk_size;
#ifdef INVARIANTS
/*
* Cache a pointer to the malloc_type that most recently freed
@@ -325,7 +328,7 @@ free(addr, type)
sizeof(struct malloc_type *);
*mtp = type;
#endif
- uma_zfree_arg(slab->us_zone, addr, slab);
+ uma_zfree_arg(LIST_FIRST(&slab->us_keg->uk_zones), addr, slab);
} else {
size = slab->us_size;
uma_large_free(slab);
@@ -364,8 +367,8 @@ realloc(addr, size, type, flags)
("realloc: address %p out of range", (void *)addr));
/* Get the size of the original block */
- if (slab->us_zone)
- alloc = slab->us_zone->uz_size;
+ if (slab->us_keg)
+ alloc = slab->us_keg->uk_size;
else
alloc = slab->us_size;
@@ -410,7 +413,6 @@ kmeminit(dummy)
void *dummy;
{
u_int8_t indx;
- u_long npg;
u_long mem_size;
int i;
@@ -428,7 +430,7 @@ kmeminit(dummy)
* Note that the kmem_map is also used by the zone allocator,
* so make sure that there is enough space.
*/
- vm_kmem_size = VM_KMEM_SIZE;
+ vm_kmem_size = VM_KMEM_SIZE + nmbclusters * PAGE_SIZE;
mem_size = cnt.v_page_count;
#if defined(VM_KMEM_SIZE_SCALE)
@@ -462,17 +464,8 @@ kmeminit(dummy)
*/
init_param3(vm_kmem_size / PAGE_SIZE);
- /*
- * In mbuf_init(), we set up submaps for mbufs and clusters, in which
- * case we rounddown() (nmbufs * MSIZE) and (nmbclusters * MCLBYTES),
- * respectively. Mathematically, this means that what we do here may
- * amount to slightly more address space than we need for the submaps,
- * but it never hurts to have an extra page in kmem_map.
- */
- npg = (nmbufs*MSIZE + nmbclusters*MCLBYTES + vm_kmem_size) / PAGE_SIZE;
-
kmem_map = kmem_suballoc(kernel_map, (vm_offset_t *)&kmembase,
- (vm_offset_t *)&kmemlimit, (vm_size_t)(npg * PAGE_SIZE));
+ (vm_offset_t *)&kmemlimit, vm_kmem_size);
kmem_map->system_map = 1;
uma_startup2();
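
The hunk above also changes how kmem_map is sized: instead of explicitly reserving room for the old mbuf and cluster submaps (nmbufs * MSIZE + nmbclusters * MCLBYTES), it pads vm_kmem_size by nmbclusters * PAGE_SIZE, since mbufs and clusters now come out of kmem_map through UMA. A hedged back-of-the-envelope comparison, assuming maxusers = 64, PAGE_SIZE = 4096, MSIZE = 256 and MCLBYTES = 2048, with defaults taken from tunable_mbinit() (new) and NMBCLUSTERS/NMBUFS (old) elsewhere in this diff:

/*
 * Hedged sizing sketch; the constants above are assumptions, the formulas
 * are the ones visible in this diff.
 */
#include <stdio.h>

int
main(void)
{
	const unsigned long page_size = 4096, msize = 256, mclbytes = 2048;
	const unsigned long maxusers = 64;
	const unsigned long nmbclusters = 1024 + maxusers * 64;	/* 5120 */
	const unsigned long nmbufs = nmbclusters * 2;		/* old default */

	/* Old: explicit address space for the mbuf and cluster submaps. */
	unsigned long old_pad = nmbufs * msize + nmbclusters * mclbytes;
	/* New: one page per potential cluster on top of VM_KMEM_SIZE. */
	unsigned long new_pad = nmbclusters * page_size;

	printf("old pad: %lu MB, new pad: %lu MB\n",
	    old_pad >> 20, new_pad >> 20);	/* ~12 MB vs 20 MB */
	return (0);
}
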
diff --git a/sys/kern/kern_mbuf.c b/sys/kern/kern_mbuf.c
new file mode 100644
index 0000000..2bec5ad
--- /dev/null
+++ b/sys/kern/kern_mbuf.c
@@ -0,0 +1,385 @@
+/*-
+ * Copyright (c) 2004
+ * Bosko Milekic <bmilekic@FreeBSD.org>.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of contributors may be
+ * used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_mac.h"
+#include "opt_param.h"
+
+#include <sys/param.h>
+#include <sys/mac.h>
+#include <sys/malloc.h>
+#include <sys/systm.h>
+#include <sys/mbuf.h>
+#include <sys/domain.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#include <sys/protosw.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <vm/uma.h>
+
+/*
+ * In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA
+ * Zones.
+ *
+ * Mbuf Clusters (2K, contiguous) are allocated from the Cluster
+ * Zone. The Zone can be capped at kern.ipc.nmbclusters, if the
+ * administrator so desires.
+ *
+ * Mbufs are allocated from a UMA Master Zone called the Mbuf
+ * Zone.
+ *
+ * Additionally, FreeBSD provides a Packet Zone, which it
+ * configures as a Secondary Zone to the Mbuf Master Zone,
+ * thus sharing backend Slab kegs with the Mbuf Master Zone.
+ *
+ * Thus common-case allocations and locking are simplified:
+ *
+ * m_clget() m_getcl()
+ * | |
+ * | .------------>[(Packet Cache)] m_get(), m_gethdr()
+ * | | [ Packet ] |
+ * [(Cluster Cache)] [ Secondary ] [ (Mbuf Cache) ]
+ * [ Cluster Zone ] [ Zone ] [ Mbuf Master Zone ]
+ * | \________ |
+ * [ Cluster Keg ] \ /
+ * | [ Mbuf Keg ]
+ * [ Cluster Slabs ] |
+ * | [ Mbuf Slabs ]
+ * \____________(VM)_________________/
+ */
+
+int nmbclusters;
+struct mbstat mbstat;
+
+static void
+tunable_mbinit(void *dummy)
+{
+
+ /* This has to be done before VM init. */
+ nmbclusters = 1024 + maxusers * 64;
+ TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
+}
+SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL);
+
+SYSCTL_DECL(_kern_ipc);
+SYSCTL_INT(_kern_ipc, OID_AUTO, nmbclusters, CTLFLAG_RW, &nmbclusters, 0,
+ "Maximum number of mbuf clusters allowed");
+SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat,
+ "Mbuf general information and statistics");
+
+/*
+ * Zones from which we allocate.
+ */
+uma_zone_t zone_mbuf;
+uma_zone_t zone_clust;
+uma_zone_t zone_pack;
+
+/*
+ * Local prototypes.
+ */
+static void mb_ctor_mbuf(void *, int, void *);
+static void mb_ctor_clust(void *, int, void *);
+static void mb_ctor_pack(void *, int, void *);
+static void mb_dtor_mbuf(void *, int, void *);
+static void mb_dtor_clust(void *, int, void *); /* XXX */
+static void mb_dtor_pack(void *, int, void *); /* XXX */
+static void mb_init_pack(void *, int);
+static void mb_fini_pack(void *, int);
+
+static void mb_reclaim(void *);
+static void mbuf_init(void *);
+
+/*
+ * Initialize FreeBSD Network buffer allocation.
+ */
+SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL)
+static void
+mbuf_init(void *dummy)
+{
+
+ /*
+ * Configure UMA zones for Mbufs, Clusters, and Packets.
+ */
+ zone_mbuf = uma_zcreate("Mbuf", MSIZE, mb_ctor_mbuf, mb_dtor_mbuf,
+ NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_MAXBUCKET);
+ zone_clust = uma_zcreate("MbufClust", MCLBYTES, mb_ctor_clust,
+ mb_dtor_clust, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
+ if (nmbclusters > 0)
+ uma_zone_set_max(zone_clust, nmbclusters);
+ zone_pack = uma_zsecond_create("Packet", mb_ctor_pack, mb_dtor_pack,
+ mb_init_pack, mb_fini_pack, zone_mbuf);
+
+ /* uma_prealloc() goes here */
+
+ /*
+ * Hook event handler for low-memory situation, used to
+ * drain protocols and push data back to the caches (UMA
+ * later pushes it back to VM).
+ */
+ EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL,
+ EVENTHANDLER_PRI_FIRST);
+
+ /*
+ * [Re]set counters and local statistics knobs.
+ * XXX Some of these should go and be replaced, but UMA stat
+ * gathering needs to be revised.
+ */
+ mbstat.m_mbufs = 0;
+ mbstat.m_mclusts = 0;
+ mbstat.m_drain = 0;
+ mbstat.m_msize = MSIZE;
+ mbstat.m_mclbytes = MCLBYTES;
+ mbstat.m_minclsize = MINCLSIZE;
+ mbstat.m_mlen = MLEN;
+ mbstat.m_mhlen = MHLEN;
+ mbstat.m_numtypes = MT_NTYPES;
+
+ mbstat.m_mcfail = mbstat.m_mpfail = 0;
+ mbstat.sf_iocnt = 0;
+ mbstat.sf_allocwait = mbstat.sf_allocfail = 0;
+}
+
+/*
+ * Constructor for Mbuf master zone.
+ *
+ * The 'arg' pointer points to a mb_args structure which
+ * contains call-specific information required to support the
+ * mbuf allocation API.
+ */
+static void
+mb_ctor_mbuf(void *mem, int size, void *arg)
+{
+ struct mbuf *m;
+ struct mb_args *args;
+ int flags;
+ int how;
+ short type;
+
+ m = (struct mbuf *)mem;
+ args = (struct mb_args *)arg;
+ flags = args->flags;
+ how = args->how;
+ type = args->type;
+
+ m->m_type = type;
+ m->m_next = NULL;
+ m->m_nextpkt = NULL;
+ if (flags & M_PKTHDR) {
+ m->m_data = m->m_pktdat;
+ m->m_flags = M_PKTHDR;
+ m->m_pkthdr.rcvif = NULL;
+ m->m_pkthdr.csum_flags = 0;
+ SLIST_INIT(&m->m_pkthdr.tags);
+#ifdef MAC
+ /* If the label init fails, fail the alloc */
+ if (mac_init_mbuf(m, how) != 0) {
+ m_free(m);
+/* XXX*/ panic("mb_ctor_mbuf(): can't deal with failure!");
+/* return 0; */
+ }
+#endif
+ } else {
+ m->m_data = m->m_dat;
+ m->m_flags = 0;
+ }
+ mbstat.m_mbufs += 1; /* XXX */
+/* return 1;
+*/
+}
+
+/*
+ * The Mbuf master zone and Packet secondary zone destructor.
+ */
+static void
+mb_dtor_mbuf(void *mem, int size, void *arg)
+{
+ struct mbuf *m;
+
+ m = (struct mbuf *)mem;
+ if ((m->m_flags & M_PKTHDR) != 0)
+ m_tag_delete_chain(m, NULL);
+ mbstat.m_mbufs -= 1; /* XXX */
+}
+
+/* XXX Only because of stats */
+static void
+mb_dtor_pack(void *mem, int size, void *arg)
+{
+ struct mbuf *m;
+
+ m = (struct mbuf *)mem;
+ if ((m->m_flags & M_PKTHDR) != 0)
+ m_tag_delete_chain(m, NULL);
+ mbstat.m_mbufs -= 1; /* XXX */
+ mbstat.m_mclusts -= 1; /* XXX */
+}
+
+/*
+ * The Cluster zone constructor.
+ *
+ * Here the 'arg' pointer points to the Mbuf which we
+ * are configuring cluster storage for.
+ */
+static void
+mb_ctor_clust(void *mem, int size, void *arg)
+{
+ struct mbuf *m;
+
+ m = (struct mbuf *)arg;
+ m->m_ext.ext_buf = (caddr_t)mem;
+ m->m_data = m->m_ext.ext_buf;
+ m->m_flags |= M_EXT;
+ m->m_ext.ext_free = NULL;
+ m->m_ext.ext_args = NULL;
+ m->m_ext.ext_size = MCLBYTES;
+ m->m_ext.ext_type = EXT_CLUSTER;
+ m->m_ext.ref_cnt = (u_int *)uma_find_refcnt(zone_clust,
+ m->m_ext.ext_buf);
+ *(m->m_ext.ref_cnt) = 1;
+ mbstat.m_mclusts += 1; /* XXX */
+/* return 1;
+*/
+}
+
+/* XXX */
+static void
+mb_dtor_clust(void *mem, int size, void *arg)
+{
+ mbstat.m_mclusts -= 1; /* XXX */
+}
+
+/*
+ * The Packet secondary zone's init routine, executed on the
+ * object's transition from keg slab to zone cache.
+ */
+static void
+mb_init_pack(void *mem, int size)
+{
+ struct mbuf *m;
+
+ m = (struct mbuf *)mem;
+ m->m_ext.ext_buf = NULL;
+ uma_zalloc_arg(zone_clust, m, M_NOWAIT);
+ if (m->m_ext.ext_buf == NULL) /* XXX */
+ panic("mb_init_pack(): Can't deal with failure yet.");
+ mbstat.m_mclusts -= 1; /* XXX */
+}
+
+/*
+ * The Packet secondary zone's fini routine, executed on the
+ * object's transition from zone cache to keg slab.
+ */
+static void
+mb_fini_pack(void *mem, int size)
+{
+ struct mbuf *m;
+
+ m = (struct mbuf *)mem;
+ uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL);
+ m->m_ext.ext_buf = NULL;
+ mbstat.m_mclusts += 1; /* XXX */
+}
+
+/*
+ * The "packet" keg constructor.
+ */
+static void
+mb_ctor_pack(void *mem, int size, void *arg)
+{
+ struct mbuf *m;
+ struct mb_args *args;
+ int flags, how;
+ short type;
+
+ m = (struct mbuf *)mem;
+ args = (struct mb_args *)arg;
+ flags = args->flags;
+ type = args->type;
+ how = args->how;
+
+ m->m_type = type;
+ m->m_next = NULL;
+ m->m_data = m->m_ext.ext_buf;
+ m->m_flags = flags|M_EXT;
+ m->m_ext.ext_free = NULL;
+ m->m_ext.ext_args = NULL;
+ m->m_ext.ext_size = MCLBYTES;
+ m->m_ext.ext_type = EXT_PACKET;
+ *(m->m_ext.ref_cnt) = 1;
+
+ if (flags & M_PKTHDR) {
+ m->m_nextpkt = NULL;
+ m->m_pkthdr.rcvif = NULL;
+ m->m_pkthdr.csum_flags = 0;
+ SLIST_INIT(&m->m_pkthdr.tags);
+#ifdef MAC
+ /* If the label init fails, fail the alloc */
+ if (mac_init_mbuf(m, how) != 0) {
+ m_free(m);
+/* XXX*/ panic("mb_ctor_pack(): can't deal with failure!");
+/* return 0; */
+ }
+#endif
+ }
+ mbstat.m_mbufs += 1; /* XXX */
+ mbstat.m_mclusts += 1; /* XXX */
+/* return 1;
+*/
+}
+
+/*
+ * This is the protocol drain routine.
+ *
+ * No locks should be held when this is called. The drain routines have to
+ * presently acquire some locks which raises the possibility of lock order
+ * reversal.
+ */
+static void
+mb_reclaim(void *junk)
+{
+ struct domain *dp;
+ struct protosw *pr;
+
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL,
+ "mb_reclaim()");
+
+ mbstat.m_drain++;
+ for (dp = domains; dp != NULL; dp = dp->dom_next)
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
+ if (pr->pr_drain != NULL)
+ (*pr->pr_drain)();
+}
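
The master/secondary layering used above is generic UMA machinery rather than something mbuf-specific, so the same shape applies to any object that wants a second constructor/destructor pair over a shared keg. A minimal sketch following only the call signatures visible in this file (uma_zcreate(), uma_zsecond_create(), uma_zone_set_max() and three-argument ctors/dtors); the "Widget" names and sizes are illustrative, not part of the commit.

/*
 * Hedged sketch of the master/secondary zone pattern from kern_mbuf.c.
 * Signatures follow the calls above; "Widget" and its size are made up.
 */
#include <sys/param.h>
#include <vm/uma.h>

static uma_zone_t widget_zone;		/* master zone, owns the keg */
static uma_zone_t widget_fancy_zone;	/* secondary zone, shares the keg */

static void
widget_ctor(void *mem, int size, void *arg)
{
	/* Per-allocation construction, as mb_ctor_mbuf() does for mbufs. */
}

static void
widget_dtor(void *mem, int size, void *arg)
{
	/* Per-free teardown, as mb_dtor_mbuf() does for mbufs. */
}

static void
widget_zones_init(void)
{
	widget_zone = uma_zcreate("Widget", 256, widget_ctor, widget_dtor,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
	/* Cap the backing keg, as the Cluster zone is capped at nmbclusters. */
	uma_zone_set_max(widget_zone, 1024);
	/* Layer a secondary zone on the same keg, as the Packet zone does. */
	widget_fancy_zone = uma_zsecond_create("FancyWidget", widget_ctor,
	    widget_dtor, NULL, NULL, widget_zone);
}
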
diff --git a/sys/kern/subr_mbuf.c b/sys/kern/subr_mbuf.c
deleted file mode 100644
index d84ef31..0000000
--- a/sys/kern/subr_mbuf.c
+++ /dev/null
@@ -1,1548 +0,0 @@
-/*-
- * Copyright (c) 2001, 2002, 2003
- * Bosko Milekic <bmilekic@FreeBSD.org>. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include "opt_mac.h"
-#include "opt_param.h"
-
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/malloc.h>
-#include <sys/mac.h>
-#include <sys/mbuf.h>
-#include <sys/lock.h>
-#include <sys/mutex.h>
-#include <sys/condvar.h>
-#include <sys/smp.h>
-#include <sys/kernel.h>
-#include <sys/sysctl.h>
-#include <sys/domain.h>
-#include <sys/protosw.h>
-
-#include <vm/vm.h>
-#include <vm/vm_kern.h>
-#include <vm/vm_extern.h>
-#include <vm/pmap.h>
-#include <vm/vm_map.h>
-
-/*
- * mb_alloc: network buffer allocator
- *
- * XXX: currently, the "low watermark" sysctl is marked read-only as its
- * effects are not completely implemented. To be fixed soon.
- */
-
-/*
- * Maximum number of PCPU containers. If you know what you're doing you could
- * explicitly define MBALLOC_NCPU to be exactly the number of CPUs on your
- * system during compilation, and thus prevent kernel structure bloat.
- *
- * SMP and non-SMP kernels clearly have a different number of possible CPUs,
- * but because we cannot assume a dense array of CPUs, we always allocate
- * and traverse PCPU containers up to NCPU amount and merely check for
- * CPU availability.
- */
-#ifdef MBALLOC_NCPU
-#define NCPU MBALLOC_NCPU
-#else
-#define NCPU MAXCPU
-#endif
-
-/*-
- * The mbuf allocator is based on Alfred Perlstein's <alfred@FreeBSD.org>
- * "memcache" proof-of-concept allocator which was itself based on
- * several well-known SMP-friendly allocators.
- *
- * The mb_alloc mbuf allocator is a special when compared to other
- * general-purpose allocators. Some things to take note of:
- *
- * Mbufs and mbuf clusters are two different objects. Sometimes we
- * will allocate a single mbuf, other times a single cluster,
- * other times both. Further, we may sometimes wish to allocate a
- * whole chain of mbufs with clusters. This allocator will perform
- * the common case of each scenario in one function call (this
- * includes constructing or destructing the object) while only
- * locking/unlocking the cache once, if it can get away with it.
- * The caches consist of pure mbufs and pure clusters; that is
- * there are no 'zones' containing mbufs with already pre-hooked
- * clusters. Since we can allocate both objects atomically anyway,
- * we don't bother fragmenting our caches for any particular 'scenarios.'
- *
- * We allocate from seperate sub-maps of kmem_map, thus imposing
- * an ultimate upper-limit on the number of allocatable clusters
- * and mbufs and also, since the clusters all come from a
- * virtually contiguous region, we can keep reference counters
- * for them and "allocate" them purely by indexing into a
- * dense refcount vector.
- *
- * We call out to protocol drain routines (which can be hooked
- * into us) when we're low on space.
- *
- * The mbuf allocator keeps all objects that it allocates in mb_buckets.
- * The buckets keep a number of objects (an object can be an mbuf or an
- * mbuf cluster) and facilitate moving larger sets of contiguous objects
- * from the per-CPU caches to the global cache. The buckets also have
- * the added advantage that objects, when migrated from cache to cache,
- * are migrated in chunks that keep contiguous objects together,
- * minimizing TLB pollution.
- *
- * The buckets are kept on singly-linked lists called "containers." A container
- * is protected by a mutex in order to ensure consistency. The mutex
- * itself is allocated separately and attached to the container at boot time,
- * thus allowing for certain containers to share the same lock. Per-CPU
- * containers for mbufs and mbuf clusters all share the same per-CPU
- * lock whereas the global cache containers for these objects share one
- * global lock.
- */
-struct mb_bucket {
- SLIST_ENTRY(mb_bucket) mb_blist;
- int mb_owner;
- int mb_numfree;
- void *mb_free[0];
-};
-
-struct mb_container {
- SLIST_HEAD(mc_buckethd, mb_bucket) mc_bhead;
- struct mtx *mc_lock;
- int mc_numowner;
- u_int mc_starved;
- long *mc_types;
- u_long *mc_objcount;
- u_long *mc_numbucks;
-};
-
-struct mb_gen_list {
- struct mb_container mb_cont;
- struct cv mgl_mstarved;
-};
-
-struct mb_pcpu_list {
- struct mb_container mb_cont;
-};
-
-/*
- * Boot-time configurable object counts that will determine the maximum
- * number of permitted objects in the mbuf and mcluster cases. In the
- * ext counter (nmbcnt) case, it's just an indicator serving to scale
- * kmem_map size properly - in other words, we may be allowed to allocate
- * more than nmbcnt counters, whereas we will never be allowed to allocate
- * more than nmbufs mbufs or nmbclusters mclusters.
- * As for nsfbufs, it is used to indicate how many sendfile(2) buffers will be
- * allocatable by the sfbuf allocator (found in uipc_syscalls.c)
- */
-#ifndef NMBCLUSTERS
-#define NMBCLUSTERS (1024 + maxusers * 64)
-#endif
-#ifndef NMBUFS
-#define NMBUFS (nmbclusters * 2)
-#endif
-#ifndef NSFBUFS
-#define NSFBUFS (512 + maxusers * 16)
-#endif
-#ifndef NMBCNTS
-#define NMBCNTS (nmbclusters + nsfbufs)
-#endif
-int nmbufs;
-int nmbclusters;
-int nmbcnt;
-int nsfbufs;
-int nsfbufspeak;
-int nsfbufsused;
-
-/*
- * Sizes of objects per bucket. There are this size's worth of mbufs
- * or clusters in each bucket. Please keep these a power-of-2.
- */
-#define MBUF_BUCK_SZ (PAGE_SIZE * 2)
-#define CLUST_BUCK_SZ (PAGE_SIZE * 4)
-
-/*
- * Perform sanity checks of tunables declared above.
- */
-static void
-tunable_mbinit(void *dummy)
-{
-
- /*
- * This has to be done before VM init.
- */
- nmbclusters = NMBCLUSTERS;
- TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
- nmbufs = NMBUFS;
- TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
- nsfbufs = NSFBUFS;
- TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs);
- nmbcnt = NMBCNTS;
- TUNABLE_INT_FETCH("kern.ipc.nmbcnt", &nmbcnt);
- /* Sanity checks */
- if (nmbufs < nmbclusters * 2)
- nmbufs = nmbclusters * 2;
- if (nmbcnt < nmbclusters + nsfbufs)
- nmbcnt = nmbclusters + nsfbufs;
-}
-SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL);
-
-/*
- * The freelist structures and mutex locks. The number statically declared
- * here depends on the number of CPUs.
- *
- * We set up in such a way that all the objects (mbufs, clusters)
- * share the same mutex lock. It has been established that we do not benefit
- * from different locks for different objects, so we use the same lock,
- * regardless of object type. This also allows us to do optimised
- * multi-object allocations without dropping the lock in between.
- */
-struct mb_lstmngr {
- struct mb_gen_list *ml_genlist;
- struct mb_pcpu_list *ml_cntlst[NCPU];
- struct mb_bucket **ml_btable;
- vm_map_t ml_map;
- vm_offset_t ml_mapbase;
- vm_offset_t ml_maptop;
- int ml_mapfull;
- u_int ml_objsize;
- u_int ml_objbucks;
- u_int *ml_wmhigh;
- u_int *ml_wmlow;
-};
-static struct mb_lstmngr mb_list_mbuf, mb_list_clust;
-static struct mtx mbuf_gen, mbuf_pcpu[NCPU];
-static u_int *cl_refcntmap;
-
-/*
- * Local macros for internal allocator structure manipulations.
- */
-#ifdef SMP
-#define MB_GET_PCPU_LIST(mb_lst) (mb_lst)->ml_cntlst[PCPU_GET(cpuid)]
-#else
-#define MB_GET_PCPU_LIST(mb_lst) (mb_lst)->ml_cntlst[0]
-#endif
-
-#define MB_GET_GEN_LIST(mb_lst) (mb_lst)->ml_genlist
-
-#define MB_LOCK_CONT(mb_cnt) mtx_lock((mb_cnt)->mb_cont.mc_lock)
-
-#define MB_UNLOCK_CONT(mb_cnt) mtx_unlock((mb_cnt)->mb_cont.mc_lock)
-
-#define MB_GET_PCPU_LIST_NUM(mb_lst, num) \
- (mb_lst)->ml_cntlst[(num)]
-
-#define MB_BUCKET_INDX(mb_obj, mb_lst) \
- (int)(((caddr_t)(mb_obj) - (caddr_t)(mb_lst)->ml_mapbase) / \
- ((mb_lst)->ml_objbucks * (mb_lst)->ml_objsize))
-
-#define MB_GET_OBJECT(mb_objp, mb_bckt, mb_lst) \
-{ \
- struct mc_buckethd *_mchd = &((mb_lst)->mb_cont.mc_bhead); \
- \
- (mb_bckt)->mb_numfree--; \
- (mb_objp) = (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)]; \
- (*((mb_lst)->mb_cont.mc_objcount))--; \
- if ((mb_bckt)->mb_numfree == 0) { \
- SLIST_REMOVE_HEAD(_mchd, mb_blist); \
- SLIST_NEXT((mb_bckt), mb_blist) = NULL; \
- (mb_bckt)->mb_owner |= MB_BUCKET_FREE; \
- } \
-}
-
-#define MB_PUT_OBJECT(mb_objp, mb_bckt, mb_lst) \
- (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)] = (mb_objp); \
- (mb_bckt)->mb_numfree++; \
- (*((mb_lst)->mb_cont.mc_objcount))++;
-
-#define MB_MBTYPES_INC(mb_cnt, mb_type, mb_num) \
- if ((mb_type) != MT_NOTMBUF) \
- (*((mb_cnt)->mb_cont.mc_types + (mb_type))) += (mb_num)
-
-#define MB_MBTYPES_DEC(mb_cnt, mb_type, mb_num) \
- if ((mb_type) != MT_NOTMBUF) \
- (*((mb_cnt)->mb_cont.mc_types + (mb_type))) -= (mb_num)
-
-/*
- * Ownership of buckets/containers is represented by integers. The PCPU
- * lists range from 0 to NCPU-1. We need a free numerical id for the general
- * list (we use NCPU). We also need a non-conflicting free bit to indicate
- * that the bucket is free and removed from a container, while not losing
- * the bucket's originating container id. We use the highest bit
- * for the free marker.
- */
-#define MB_GENLIST_OWNER (NCPU)
-#define MB_BUCKET_FREE (1 << (sizeof(int) * 8 - 1))
-
-/* Statistics structures for allocator (per-CPU and general). */
-static struct mbpstat mb_statpcpu[NCPU + 1];
-struct mbstat mbstat;
-
-/* Sleep time for wait code (in ticks). */
-static int mbuf_wait = 64;
-
-static u_int mbuf_hiwm = 512; /* High wm on # of mbufs per cache */
-static u_int mbuf_lowm = 128; /* Low wm on # of mbufs per cache */
-static u_int clust_hiwm = 128; /* High wm on # of clusters per cache */
-static u_int clust_lowm = 16; /* Low wm on # of clusters per cache */
-
-/*
- * Objects exported by sysctl(8).
- */
-SYSCTL_DECL(_kern_ipc);
-SYSCTL_INT(_kern_ipc, OID_AUTO, nmbclusters, CTLFLAG_RDTUN, &nmbclusters, 0,
- "Maximum number of mbuf clusters available");
-SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RDTUN, &nmbufs, 0,
- "Maximum number of mbufs available");
-SYSCTL_INT(_kern_ipc, OID_AUTO, nmbcnt, CTLFLAG_RDTUN, &nmbcnt, 0,
- "Number used to scale kmem_map to ensure sufficient space for counters");
-SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
- "Maximum number of sendfile(2) sf_bufs available");
-SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0,
- "Number of sendfile(2) sf_bufs at peak usage");
-SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0,
- "Number of sendfile(2) sf_bufs in use");
-SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW, &mbuf_wait, 0,
- "Sleep time of mbuf subsystem wait allocations during exhaustion");
-SYSCTL_UINT(_kern_ipc, OID_AUTO, mbuf_hiwm, CTLFLAG_RW, &mbuf_hiwm, 0,
- "Upper limit of number of mbufs allowed in each cache");
-SYSCTL_UINT(_kern_ipc, OID_AUTO, mbuf_lowm, CTLFLAG_RD, &mbuf_lowm, 0,
- "Lower limit of number of mbufs allowed in each cache");
-SYSCTL_UINT(_kern_ipc, OID_AUTO, clust_hiwm, CTLFLAG_RW, &clust_hiwm, 0,
- "Upper limit of number of mbuf clusters allowed in each cache");
-SYSCTL_UINT(_kern_ipc, OID_AUTO, clust_lowm, CTLFLAG_RD, &clust_lowm, 0,
- "Lower limit of number of mbuf clusters allowed in each cache");
-SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat,
- "Mbuf general information and statistics");
-SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mb_statpcpu, CTLFLAG_RD, mb_statpcpu,
- sizeof(mb_statpcpu), "S,", "Mbuf allocator per CPU statistics");
-
-/*
- * Prototypes of local allocator routines.
- */
-static void *mb_alloc_wait(struct mb_lstmngr *, short);
-static struct mb_bucket *mb_pop_cont(struct mb_lstmngr *, int,
- struct mb_pcpu_list *);
-static void mb_reclaim(void);
-static void mbuf_init(void *);
-
-/*
- * Initial allocation numbers. Each parameter represents the number of buckets
- * of each object that will be placed initially in each PCPU container for
- * said object.
- */
-#define NMB_MBUF_INIT 2
-#define NMB_CLUST_INIT 8
-
-/*
- * Internal flags that allow for cache locks to remain "persistent" across
- * allocation and free calls. They may be used in combination.
- */
-#define MBP_PERSIST 0x1 /* Return with lock still held. */
-#define MBP_PERSISTENT 0x2 /* Cache lock is already held coming in. */
-
-/*
- * Initialize the mbuf subsystem.
- *
- * We sub-divide the kmem_map into several submaps; this way, we don't have
- * to worry about artificially limiting the number of mbuf or mbuf cluster
- * allocations, due to fear of one type of allocation "stealing" address
- * space initially reserved for another.
- *
- * Set up both the general containers and all the PCPU containers. Populate
- * the PCPU containers with initial numbers.
- */
-MALLOC_DEFINE(M_MBUF, "mbufmgr", "mbuf subsystem management structures");
-SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL)
-static void
-mbuf_init(void *dummy)
-{
- struct mb_pcpu_list *pcpu_cnt;
- vm_size_t mb_map_size;
- int i, j;
-
- /*
- * Set up all the submaps, for each type of object that we deal
- * with in this allocator.
- */
- mb_map_size = (vm_size_t)(nmbufs * MSIZE);
- mb_map_size = rounddown(mb_map_size, MBUF_BUCK_SZ);
- mb_list_mbuf.ml_btable = malloc((unsigned long)mb_map_size /
- MBUF_BUCK_SZ * sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
- if (mb_list_mbuf.ml_btable == NULL)
- goto bad;
- mb_list_mbuf.ml_map = kmem_suballoc(kmem_map,&(mb_list_mbuf.ml_mapbase),
- &(mb_list_mbuf.ml_maptop), mb_map_size);
- mb_list_mbuf.ml_map->system_map = 1;
- mb_list_mbuf.ml_mapfull = 0;
- mb_list_mbuf.ml_objsize = MSIZE;
- mb_list_mbuf.ml_objbucks = MBUF_BUCK_SZ / mb_list_mbuf.ml_objsize;
- mb_list_mbuf.ml_wmhigh = &mbuf_hiwm;
- mb_list_mbuf.ml_wmlow = &mbuf_lowm;
-
- mb_map_size = (vm_size_t)(nmbclusters * MCLBYTES);
- mb_map_size = rounddown(mb_map_size, CLUST_BUCK_SZ);
- mb_list_clust.ml_btable = malloc((unsigned long)mb_map_size /
- CLUST_BUCK_SZ * sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
- if (mb_list_clust.ml_btable == NULL)
- goto bad;
- mb_list_clust.ml_map = kmem_suballoc(kmem_map,
- &(mb_list_clust.ml_mapbase), &(mb_list_clust.ml_maptop),
- mb_map_size);
- mb_list_clust.ml_map->system_map = 1;
- mb_list_clust.ml_mapfull = 0;
- mb_list_clust.ml_objsize = MCLBYTES;
- mb_list_clust.ml_objbucks = CLUST_BUCK_SZ / mb_list_clust.ml_objsize;
- mb_list_clust.ml_wmhigh = &clust_hiwm;
- mb_list_clust.ml_wmlow = &clust_lowm;
-
- /*
- * Allocate required general (global) containers for each object type.
- */
- mb_list_mbuf.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF,
- M_NOWAIT);
- mb_list_clust.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF,
- M_NOWAIT);
- if ((mb_list_mbuf.ml_genlist == NULL) ||
- (mb_list_clust.ml_genlist == NULL))
- goto bad;
-
- /*
- * Initialize condition variables and general container mutex locks.
- */
- mtx_init(&mbuf_gen, "mbuf subsystem general lists lock", NULL, MTX_DEF);
- cv_init(&(mb_list_mbuf.ml_genlist->mgl_mstarved), "mbuf pool starved");
- cv_init(&(mb_list_clust.ml_genlist->mgl_mstarved),
- "mcluster pool starved");
- mb_list_mbuf.ml_genlist->mb_cont.mc_lock =
- mb_list_clust.ml_genlist->mb_cont.mc_lock = &mbuf_gen;
-
- /*
- * Set up the general containers for each object.
- */
- mb_list_mbuf.ml_genlist->mb_cont.mc_numowner =
- mb_list_clust.ml_genlist->mb_cont.mc_numowner = MB_GENLIST_OWNER;
- mb_list_mbuf.ml_genlist->mb_cont.mc_starved =
- mb_list_clust.ml_genlist->mb_cont.mc_starved = 0;
- mb_list_mbuf.ml_genlist->mb_cont.mc_objcount =
- &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbfree);
- mb_list_clust.ml_genlist->mb_cont.mc_objcount =
- &(mb_statpcpu[MB_GENLIST_OWNER].mb_clfree);
- mb_list_mbuf.ml_genlist->mb_cont.mc_numbucks =
- &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbbucks);
- mb_list_clust.ml_genlist->mb_cont.mc_numbucks =
- &(mb_statpcpu[MB_GENLIST_OWNER].mb_clbucks);
- mb_list_mbuf.ml_genlist->mb_cont.mc_types =
- &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbtypes[0]);
- mb_list_clust.ml_genlist->mb_cont.mc_types = NULL;
- SLIST_INIT(&(mb_list_mbuf.ml_genlist->mb_cont.mc_bhead));
- SLIST_INIT(&(mb_list_clust.ml_genlist->mb_cont.mc_bhead));
-
- /*
- * Allocate all the required counters for clusters. This makes
- * cluster allocations/deallocations much faster.
- */
- cl_refcntmap = malloc(nmbclusters * sizeof(u_int), M_MBUF, M_NOWAIT);
- if (cl_refcntmap == NULL)
- goto bad;
-
- /*
- * Initialize general mbuf statistics.
- */
- mbstat.m_msize = mb_list_mbuf.ml_objsize;
- mbstat.m_mclbytes = mb_list_clust.ml_objsize;
- mbstat.m_minclsize = MINCLSIZE;
- mbstat.m_mlen = MLEN;
- mbstat.m_mhlen = MHLEN;
- mbstat.m_numtypes = MT_NTYPES;
- mbstat.m_mbperbuck = mb_list_mbuf.ml_objbucks;
- mbstat.m_clperbuck = mb_list_clust.ml_objbucks;
-
- /*
- * Allocate and initialize PCPU containers.
- */
- for (i = 0; i < NCPU; i++) {
- if (CPU_ABSENT(i)) {
- mb_statpcpu[i].mb_active = 0;
- continue;
- }
-
- mb_list_mbuf.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list),
- M_MBUF, M_NOWAIT);
- mb_list_clust.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list),
- M_MBUF, M_NOWAIT);
- if ((mb_list_mbuf.ml_cntlst[i] == NULL) ||
- (mb_list_clust.ml_cntlst[i] == NULL))
- goto bad;
-
- mtx_init(&mbuf_pcpu[i], "mbuf PCPU list lock", NULL, MTX_DEF);
- mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_lock =
- mb_list_clust.ml_cntlst[i]->mb_cont.mc_lock = &mbuf_pcpu[i];
-
- mb_statpcpu[i].mb_active = 1;
- mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numowner =
- mb_list_clust.ml_cntlst[i]->mb_cont.mc_numowner = i;
- mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_starved =
- mb_list_clust.ml_cntlst[i]->mb_cont.mc_starved = 0;
- mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_objcount =
- &(mb_statpcpu[i].mb_mbfree);
- mb_list_clust.ml_cntlst[i]->mb_cont.mc_objcount =
- &(mb_statpcpu[i].mb_clfree);
- mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numbucks =
- &(mb_statpcpu[i].mb_mbbucks);
- mb_list_clust.ml_cntlst[i]->mb_cont.mc_numbucks =
- &(mb_statpcpu[i].mb_clbucks);
- mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_types =
- &(mb_statpcpu[i].mb_mbtypes[0]);
- mb_list_clust.ml_cntlst[i]->mb_cont.mc_types = NULL;
-
- SLIST_INIT(&(mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_bhead));
- SLIST_INIT(&(mb_list_clust.ml_cntlst[i]->mb_cont.mc_bhead));
-
- /*
- * Perform initial allocations.
- */
- pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_mbuf, i);
- MB_LOCK_CONT(pcpu_cnt);
- for (j = 0; j < NMB_MBUF_INIT; j++) {
- if (mb_pop_cont(&mb_list_mbuf, M_DONTWAIT, pcpu_cnt)
- == NULL)
- goto bad;
- }
- MB_UNLOCK_CONT(pcpu_cnt);
-
- pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_clust, i);
- MB_LOCK_CONT(pcpu_cnt);
- for (j = 0; j < NMB_CLUST_INIT; j++) {
- if (mb_pop_cont(&mb_list_clust, M_DONTWAIT, pcpu_cnt)
- == NULL)
- goto bad;
- }
- MB_UNLOCK_CONT(pcpu_cnt);
- }
-
- return;
-bad:
- panic("mbuf_init(): failed to initialize mbuf subsystem!");
-}
-
-/*
- * Populate a given mbuf PCPU container with a bucket full of fresh new
- * buffers. Return a pointer to the new bucket (already in the container if
- * successful), or return NULL on failure.
- *
- * LOCKING NOTES:
- * PCPU container lock must be held when this is called.
- * The lock is dropped here so that we can cleanly call the underlying VM
- * code. If we fail, we return with no locks held. If we succeed (i.e., return
- * non-NULL), we return with the PCPU lock held, ready for allocation from
- * the returned bucket.
- */
-static struct mb_bucket *
-mb_pop_cont(struct mb_lstmngr *mb_list, int how, struct mb_pcpu_list *cnt_lst)
-{
- struct mb_bucket *bucket;
- caddr_t p;
- int i;
-
- MB_UNLOCK_CONT(cnt_lst);
- /*
- * If our object's (finite) map is starved now (i.e., no more address
- * space), bail out now.
- */
- if (mb_list->ml_mapfull)
- return (NULL);
-
- bucket = malloc(sizeof(struct mb_bucket) +
- mb_list->ml_objbucks * sizeof(void *), M_MBUF, MBTOM(how));
- if (bucket == NULL)
- return (NULL);
-
- p = (caddr_t)kmem_malloc(mb_list->ml_map, mb_list->ml_objsize *
- mb_list->ml_objbucks, MBTOM(how));
- if (p == NULL) {
- free(bucket, M_MBUF);
- if (how == M_TRYWAIT)
- mb_list->ml_mapfull = 1;
- return (NULL);
- }
-
- bucket->mb_numfree = 0;
- mb_list->ml_btable[MB_BUCKET_INDX(p, mb_list)] = bucket;
- for (i = 0; i < mb_list->ml_objbucks; i++) {
- bucket->mb_free[i] = p;
- bucket->mb_numfree++;
- p += mb_list->ml_objsize;
- }
-
- MB_LOCK_CONT(cnt_lst);
- bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
- SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead), bucket, mb_blist);
- (*(cnt_lst->mb_cont.mc_numbucks))++;
- *(cnt_lst->mb_cont.mc_objcount) += bucket->mb_numfree;
-
- return (bucket);
-}
-
-/*
- * Allocate a network buffer.
- * The general case is very easy. Complications only arise if our PCPU
- * container is empty. Things get worse if the PCPU container is empty,
- * the general container is empty, and we've run out of address space
- * in our map; then we try to block if we're willing to (M_TRYWAIT).
- */
-static
-void *
-mb_alloc(struct mb_lstmngr *mb_list, int how, short type, short persist,
- int *pers_list)
-{
- static int last_report;
- struct mb_pcpu_list *cnt_lst;
- struct mb_bucket *bucket;
- void *m;
-
-#ifdef INVARIANTS
- int flags;
-
- flags = how & (M_WAITOK | M_NOWAIT | M_DONTWAIT | M_TRYWAIT);
- if (flags != M_DONTWAIT && flags != M_TRYWAIT) {
- static struct timeval lasterr;
- static int curerr;
- if (ppsratecheck(&lasterr, &curerr, 1)) {
- printf("Bad mbuf alloc flags: %x\n", flags);
- backtrace();
- how = M_TRYWAIT;
- }
- }
-#endif
-
- m = NULL;
- if ((persist & MBP_PERSISTENT) != 0) {
- /*
- * If we're a "persistent" call, then the per-CPU #(pers_list)
- * cache lock is already held, and we just need to refer to
- * the correct cache descriptor.
- */
- cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, *pers_list);
- } else {
- cnt_lst = MB_GET_PCPU_LIST(mb_list);
- MB_LOCK_CONT(cnt_lst);
- }
-
- if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) != NULL) {
- /*
- * This is the easy allocation case. We just grab an object
- * from a bucket in the PCPU container. At worst, we
- * have just emptied the bucket and so we remove it
- * from the container.
- */
- MB_GET_OBJECT(m, bucket, cnt_lst);
- MB_MBTYPES_INC(cnt_lst, type, 1);
-
- /* If asked to persist, do not drop the lock. */
- if ((persist & MBP_PERSIST) == 0)
- MB_UNLOCK_CONT(cnt_lst);
- else
- *pers_list = cnt_lst->mb_cont.mc_numowner;
- } else {
- struct mb_gen_list *gen_list;
-
- /*
- * This is the less-common more difficult case. We must
- * first verify if the general list has anything for us
- * and if that also fails, we must allocate a page from
- * the map and create a new bucket to place in our PCPU
- * container (already locked). If the map is starved then
- * we're really in for trouble, as we have to wait on
- * the general container's condition variable.
- */
- gen_list = MB_GET_GEN_LIST(mb_list);
- MB_LOCK_CONT(gen_list);
-
- if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead)))
- != NULL) {
- /*
- * Give ownership of the bucket to our CPU's
- * container, but only actually put the bucket
- * in the container if it doesn't become free
- * upon removing an mbuf from it.
- */
- SLIST_REMOVE_HEAD(&(gen_list->mb_cont.mc_bhead),
- mb_blist);
- bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
- (*(gen_list->mb_cont.mc_numbucks))--;
- (*(cnt_lst->mb_cont.mc_numbucks))++;
- *(gen_list->mb_cont.mc_objcount) -= bucket->mb_numfree;
- bucket->mb_numfree--;
- m = bucket->mb_free[(bucket->mb_numfree)];
- if (bucket->mb_numfree == 0) {
- SLIST_NEXT(bucket, mb_blist) = NULL;
- bucket->mb_owner |= MB_BUCKET_FREE;
- } else {
- SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead),
- bucket, mb_blist);
- *(cnt_lst->mb_cont.mc_objcount) +=
- bucket->mb_numfree;
- }
- MB_UNLOCK_CONT(gen_list);
- MB_MBTYPES_INC(cnt_lst, type, 1);
-
- /* If asked to persist, do not drop the lock. */
- if ((persist & MBP_PERSIST) == 0)
- MB_UNLOCK_CONT(cnt_lst);
- else
- *pers_list = cnt_lst->mb_cont.mc_numowner;
- } else {
- /*
- * We'll have to allocate a new page.
- */
- MB_UNLOCK_CONT(gen_list);
- bucket = mb_pop_cont(mb_list, how, cnt_lst);
- if (bucket != NULL) {
- MB_GET_OBJECT(m, bucket, cnt_lst);
- MB_MBTYPES_INC(cnt_lst, type, 1);
-
- /* If asked to persist, do not drop the lock. */
- if ((persist & MBP_PERSIST) == 0)
- MB_UNLOCK_CONT(cnt_lst);
- else
- *pers_list=cnt_lst->mb_cont.mc_numowner;
- } else {
- if (how == M_TRYWAIT) {
- /*
- * Absolute worst-case scenario.
- * We block if we're willing to, but
- * only after trying to steal from
- * other lists.
- */
- m = mb_alloc_wait(mb_list, type);
- } else {
- /* XXX: No consistency. */
- mbstat.m_drops++;
-
- if (ticks < last_report ||
- (ticks - last_report) >= hz) {
- last_report = ticks;
- printf(
-"All mbufs or mbuf clusters exhausted, please see tuning(7).\n");
- }
-
- }
- if (m != NULL && (persist & MBP_PERSIST) != 0) {
- cnt_lst = MB_GET_PCPU_LIST(mb_list);
- MB_LOCK_CONT(cnt_lst);
- *pers_list=cnt_lst->mb_cont.mc_numowner;
- }
- }
- }
- }
-
- return (m);
-}
-
-/*
- * This is the worst-case scenario called only if we're allocating with
- * M_TRYWAIT. We first drain all the protocols, then try to find an mbuf
- * by looking in every PCPU container. If we're still unsuccesful, we
- * try the general container one last time and possibly block on our
- * starved cv.
- */
-static void *
-mb_alloc_wait(struct mb_lstmngr *mb_list, short type)
-{
- struct mb_pcpu_list *cnt_lst;
- struct mb_gen_list *gen_list;
- struct mb_bucket *bucket;
- void *m;
- int i, cv_ret;
-
- /*
- * Try to reclaim mbuf-related objects (mbufs, clusters).
- */
- mb_reclaim();
-
- /*
- * Cycle all the PCPU containers. Increment starved counts if found
- * empty.
- */
- for (i = 0; i < NCPU; i++) {
- if (CPU_ABSENT(i))
- continue;
- cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, i);
- MB_LOCK_CONT(cnt_lst);
-
- /*
- * If container is non-empty, get a single object from it.
- * If empty, increment starved count.
- */
- if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) !=
- NULL) {
- MB_GET_OBJECT(m, bucket, cnt_lst);
- MB_MBTYPES_INC(cnt_lst, type, 1);
- MB_UNLOCK_CONT(cnt_lst);
- mbstat.m_wait++; /* XXX: No consistency. */
- return (m);
- } else
- cnt_lst->mb_cont.mc_starved++;
-
- MB_UNLOCK_CONT(cnt_lst);
- }
-
- /*
- * We're still here, so that means it's time to get the general
- * container lock, check it one more time (now that mb_reclaim()
- * has been called) and if we still get nothing, block on the cv.
- */
- gen_list = MB_GET_GEN_LIST(mb_list);
- MB_LOCK_CONT(gen_list);
- if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL) {
- MB_GET_OBJECT(m, bucket, gen_list);
- MB_MBTYPES_INC(gen_list, type, 1);
- MB_UNLOCK_CONT(gen_list);
- mbstat.m_wait++; /* XXX: No consistency. */
- return (m);
- }
-
- gen_list->mb_cont.mc_starved++;
- cv_ret = cv_timedwait(&(gen_list->mgl_mstarved),
- gen_list->mb_cont.mc_lock, mbuf_wait);
- gen_list->mb_cont.mc_starved--;
-
- if ((cv_ret == 0) &&
- ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL)) {
- MB_GET_OBJECT(m, bucket, gen_list);
- MB_MBTYPES_INC(gen_list, type, 1);
- mbstat.m_wait++; /* XXX: No consistency. */
- } else {
- mbstat.m_drops++; /* XXX: No consistency. */
- m = NULL;
- }
-
- MB_UNLOCK_CONT(gen_list);
-
- return (m);
-}
-
-/*-
- * Free an object to its rightful container.
- * In the very general case, this operation is really very easy.
- * Complications arise primarily if:
- * (a) We've hit the high limit on number of free objects allowed in
- * our PCPU container.
- * (b) We're in a critical situation where our container has been
- * marked 'starved' and we need to issue wakeups on the starved
- * condition variable.
- * (c) Minor (odd) cases: our bucket has migrated while we were
- * waiting for the lock; our bucket is in the general container;
- * our bucket is empty.
- */
-static
-void
-mb_free(struct mb_lstmngr *mb_list, void *m, short type, short persist,
- int *pers_list)
-{
- struct mb_pcpu_list *cnt_lst;
- struct mb_gen_list *gen_list;
- struct mb_bucket *bucket;
- u_int owner;
-
- bucket = mb_list->ml_btable[MB_BUCKET_INDX(m, mb_list)];
-
- /*
- * Make sure that if after we lock the bucket's present container the
- * bucket has migrated, that we drop the lock and get the new one.
- */
-retry_lock:
- owner = bucket->mb_owner & ~MB_BUCKET_FREE;
- switch (owner) {
- case MB_GENLIST_OWNER:
- gen_list = MB_GET_GEN_LIST(mb_list);
- if (((persist & MBP_PERSISTENT) != 0) && (*pers_list >= 0)) {
- if (*pers_list != MB_GENLIST_OWNER) {
- cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list,
- *pers_list);
- MB_UNLOCK_CONT(cnt_lst);
- MB_LOCK_CONT(gen_list);
- }
- } else {
- MB_LOCK_CONT(gen_list);
- }
- if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) {
- MB_UNLOCK_CONT(gen_list);
- *pers_list = -1;
- goto retry_lock;
- }
-
- /*
- * If we're intended for the general container, this is
- * real easy: no migrating required. The only `bogon'
- * is that we're now contending with all the threads
- * dealing with the general list, but this is expected.
- */
- MB_PUT_OBJECT(m, bucket, gen_list);
- MB_MBTYPES_DEC(gen_list, type, 1);
- if (bucket->mb_owner & MB_BUCKET_FREE) {
- SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead),
- bucket, mb_blist);
- bucket->mb_owner = MB_GENLIST_OWNER;
- }
- if (gen_list->mb_cont.mc_starved > 0)
- cv_signal(&(gen_list->mgl_mstarved));
- if ((persist & MBP_PERSIST) == 0)
- MB_UNLOCK_CONT(gen_list);
- else
- *pers_list = MB_GENLIST_OWNER;
- break;
-
- default:
- cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, owner);
- if (((persist & MBP_PERSISTENT) != 0) && (*pers_list >= 0)) {
- if (*pers_list == MB_GENLIST_OWNER) {
- gen_list = MB_GET_GEN_LIST(mb_list);
- MB_UNLOCK_CONT(gen_list);
- MB_LOCK_CONT(cnt_lst);
- } else {
- cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list,
- *pers_list);
- owner = *pers_list;
- }
- } else {
- MB_LOCK_CONT(cnt_lst);
- }
- if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) {
- MB_UNLOCK_CONT(cnt_lst);
- *pers_list = -1;
- goto retry_lock;
- }
-
- MB_PUT_OBJECT(m, bucket, cnt_lst);
- MB_MBTYPES_DEC(cnt_lst, type, 1);
- if ((*(cnt_lst->mb_cont.mc_objcount) > *(mb_list->ml_wmhigh)) ||
- (cnt_lst->mb_cont.mc_starved > 0)) {
- /*
- * We've hit the high limit of allowed numbers of mbufs
- * on this PCPU list or we've been flagged that we need
- * to transfer a bucket over to the general cache.
- * We must now migrate a bucket over to the general
- * container.
- */
- gen_list = MB_GET_GEN_LIST(mb_list);
- MB_LOCK_CONT(gen_list);
- if ((bucket->mb_owner & MB_BUCKET_FREE) == 0) {
- bucket =
- SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead));
- SLIST_REMOVE_HEAD(&(cnt_lst->mb_cont.mc_bhead),
- mb_blist);
- }
- SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead),
- bucket, mb_blist);
- bucket->mb_owner = MB_GENLIST_OWNER;
- *(cnt_lst->mb_cont.mc_objcount) -= bucket->mb_numfree;
- *(gen_list->mb_cont.mc_objcount) += bucket->mb_numfree;
- (*(cnt_lst->mb_cont.mc_numbucks))--;
- (*(gen_list->mb_cont.mc_numbucks))++;
-
- /*
- * While we're at it, transfer some of the mbtypes
- * "count load" onto the general list's mbtypes
- * array, seeing as how we're moving the bucket
- * there now, meaning that the freeing of objects
- * there will now decrement the _general list's_
- * mbtypes counters, and no longer our PCPU list's
- * mbtypes counters. We do this for the type presently
- * being freed in an effort to keep the mbtypes
- * counters approximately balanced across all lists.
- */
- MB_MBTYPES_DEC(cnt_lst, type,
- mb_list->ml_objbucks - bucket->mb_numfree);
- MB_MBTYPES_INC(gen_list, type,
- mb_list->ml_objbucks - bucket->mb_numfree);
-
- if (cnt_lst->mb_cont.mc_starved > 0) {
- /*
- * Determine whether or not to keep
- * transferring buckets to the general list
- * or whether we've transferred enough already.
- * The thread that is blocked may end up waking
- * up in the meantime, but transferring an
- * extra bucket in a constrained situation
- * is not so bad, as we're likely to need
- * it soon anyway.
- */
- if (gen_list->mb_cont.mc_starved > 0) {
- cnt_lst->mb_cont.mc_starved--;
- cv_signal(&(gen_list->mgl_mstarved));
- } else
- cnt_lst->mb_cont.mc_starved = 0;
- }
- MB_UNLOCK_CONT(gen_list);
- if ((persist & MBP_PERSIST) == 0)
- MB_UNLOCK_CONT(cnt_lst);
- else
- *pers_list = owner;
- break;
- }
-
- if (bucket->mb_owner & MB_BUCKET_FREE) {
- SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead),
- bucket, mb_blist);
- bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
- }
-
- if ((persist & MBP_PERSIST) == 0)
- MB_UNLOCK_CONT(cnt_lst);
- else
- *pers_list = owner;
- break;
- }
-}
-
-/*
- * Drain protocols in hopes to free up some resources.
- *
- * LOCKING NOTES:
- * No locks should be held when this is called. The drain routines have to
- * presently acquire some locks which raises the possibility of lock order
- * violation if we're holding any mutex if that mutex is acquired in reverse
- * order relative to one of the locks in the drain routines.
- */
-static void
-mb_reclaim(void)
-{
- struct domain *dp;
- struct protosw *pr;
-
- WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL,
- "mb_reclaim()");
-
- mbstat.m_drain++; /* XXX: No consistency. */
-
- for (dp = domains; dp != NULL; dp = dp->dom_next)
- for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
- if (pr->pr_drain != NULL)
- (*pr->pr_drain)();
-}
-
-/******************************************************************************
- * Internal setup macros.
- */
-
-#define _mb_setup(m, type) do { \
- (m)->m_type = (type); \
- (m)->m_next = NULL; \
- (m)->m_nextpkt = NULL; \
- (m)->m_data = (m)->m_dat; \
- (m)->m_flags = 0; \
-} while (0)
-
-#define _mbhdr_setup(m, type) do { \
- (m)->m_type = (type); \
- (m)->m_next = NULL; \
- (m)->m_nextpkt = NULL; \
- (m)->m_data = (m)->m_pktdat; \
- (m)->m_flags = M_PKTHDR; \
- (m)->m_pkthdr.rcvif = NULL; \
- (m)->m_pkthdr.csum_flags = 0; \
- SLIST_INIT(&(m)->m_pkthdr.tags); \
-} while (0)
-
-#define _mcl_setup(m) do { \
- (m)->m_data = (m)->m_ext.ext_buf; \
- (m)->m_flags |= M_EXT; \
- (m)->m_ext.ext_free = NULL; \
- (m)->m_ext.ext_args = NULL; \
- (m)->m_ext.ext_size = MCLBYTES; \
- (m)->m_ext.ext_type = EXT_CLUSTER; \
-} while (0)
-
-#define _mext_init_ref(m, ref) do { \
- (m)->m_ext.ref_cnt = ((ref) == NULL) ? \
- malloc(sizeof(u_int), M_MBUF, M_NOWAIT) : (u_int *)(ref); \
- if ((m)->m_ext.ref_cnt != NULL) { \
- *((m)->m_ext.ref_cnt) = 0; \
- MEXT_ADD_REF((m)); \
- } \
-} while (0)
-
-#define cl2ref(cl) \
- (((uintptr_t)(cl) - (uintptr_t)mb_list_clust.ml_mapbase) >> MCLSHIFT)
-
-#define _mext_dealloc_ref(m) \
- if ((m)->m_ext.ext_type != EXT_EXTREF) \
- free((m)->m_ext.ref_cnt, M_MBUF)
-
-/******************************************************************************
- * Internal routines.
- *
- * Because mb_alloc() and mb_free() are inlines (to keep the common
- * cases down to a maximum of one function call), below are a few
- * routines used only internally for the sole purpose of making certain
- * functions smaller.
- *
- * - _mext_free(): frees associated storage when the ref. count is
- * exactly one and we're freeing.
- *
- * - _mgetm_internal(): common "persistent-lock" routine that allocates
- * an mbuf and a cluster in one shot, but where the lock is already
- * held coming in (which is what makes it different from the exported
- * m_getcl()). The lock is dropped when done. This is used by m_getm()
- * and, therefore, is very m_getm()-specific.
- */
-static struct mbuf *_mgetm_internal(int, short, short, int);
-
-void
-_mext_free(struct mbuf *mb)
-{
-
- if (mb->m_ext.ext_type == EXT_CLUSTER) {
- mb_free(&mb_list_clust, (caddr_t)mb->m_ext.ext_buf, MT_NOTMBUF,
- 0, NULL);
- } else {
- (*(mb->m_ext.ext_free))(mb->m_ext.ext_buf, mb->m_ext.ext_args);
- _mext_dealloc_ref(mb);
- }
-}
-
-static struct mbuf *
-_mgetm_internal(int how, short type, short persist, int cchnum)
-{
- struct mbuf *mb;
-
- mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, persist,&cchnum);
- if (mb == NULL)
- return NULL;
- _mb_setup(mb, type);
-
- if ((persist & MBP_PERSIST) != 0) {
- mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust,
- how, MT_NOTMBUF, MBP_PERSISTENT, &cchnum);
- if (mb->m_ext.ext_buf == NULL) {
- (void)m_free(mb);
- mb = NULL;
- }
- _mcl_setup(mb);
- _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]);
- }
- return (mb);
-}
-
-/******************************************************************************
- * Exported buffer allocation and de-allocation routines.
- */
-
-/*
- * Allocate and return a single (normal) mbuf. NULL is returned on failure.
- *
- * Arguments:
- * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
- * if really starved for memory. M_DONTWAIT to never block.
- * - type: the type of the mbuf being allocated.
- */
-struct mbuf *
-m_get(int how, short type)
-{
- struct mbuf *mb;
-
- mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL);
- if (mb != NULL)
- _mb_setup(mb, type);
- return (mb);
-}
-
-/*
- * Allocate a given length worth of mbufs and/or clusters (whatever fits
- * best) and return a pointer to the top of the allocated chain. If an
- * existing mbuf chain is provided, then we will append the new chain
- * to the existing one but still return the top of the newly allocated
- * chain. NULL is returned on failure, in which case the [optional]
- * provided chain is left untouched, and any memory already allocated
- * is freed.
- *
- * Arguments:
- * - m: existing chain to which to append new chain (optional).
- * - len: total length of data to append, either in mbufs or clusters
- * (we allocate whatever combination yields the best fit).
- * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
- * if really starved for memory. M_DONTWAIT to never block.
- * - type: the type of the mbuf being allocated.
- */
-struct mbuf *
-m_getm(struct mbuf *m, int len, int how, short type)
-{
- struct mbuf *mb, *top, *cur, *mtail;
- int num, rem, cchnum;
- short persist;
- int i;
-
- KASSERT(len >= 0, ("m_getm(): len is < 0"));
-
- /* If m != NULL, we will append to the end of that chain. */
- if (m != NULL)
- for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next);
- else
- mtail = NULL;
-
- /*
- * In the best-case scenario (which should be the common case
- * unless we're in a starvation situation), we will be able to
- * go through the allocation of all the desired mbufs and clusters
- * here without dropping our per-CPU cache lock in between.
- */
- num = len / MCLBYTES;
- rem = len % MCLBYTES;
- persist = 0;
- cchnum = -1;
- top = cur = NULL;
- for (i = 0; i < num; i++) {
- mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type,
- MBP_PERSIST | persist, &cchnum);
- if (mb == NULL)
- goto failed;
- _mb_setup(mb, type);
- mb->m_len = 0;
-
- persist = (i != (num - 1) || rem > 0) ? MBP_PERSIST : 0;
- mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust,
- how, MT_NOTMBUF, persist | MBP_PERSISTENT, &cchnum);
- if (mb->m_ext.ext_buf == NULL) {
- (void)m_free(mb);
- goto failed;
- }
- _mcl_setup(mb);
- _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]);
- persist = MBP_PERSISTENT;
-
- if (cur == NULL)
- top = cur = mb;
- else
- cur = (cur->m_next = mb);
- }
- if (rem > 0) {
- if (cchnum >= 0) {
- persist = MBP_PERSISTENT;
- persist |= (rem > MINCLSIZE) ? MBP_PERSIST : 0;
- mb = _mgetm_internal(how, type, persist, cchnum);
- if (mb == NULL)
- goto failed;
- } else if (rem > MINCLSIZE) {
- mb = m_getcl(how, type, 0);
- } else {
- mb = m_get(how, type);
- }
- if (mb != NULL) {
- mb->m_len = 0;
- if (cur == NULL)
- top = mb;
- else
- cur->m_next = mb;
- } else
- goto failed;
- }
-
- if (mtail != NULL)
- mtail->m_next = top;
- return top;
-failed:
- if (top != NULL)
- m_freem(top);
- return NULL;
-}
-
-/*
- * Allocate and return a single M_PKTHDR mbuf. NULL is returned on failure.
- *
- * Arguments:
- * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
- * if really starved for memory. M_DONTWAIT to never block.
- * - type: the type of the mbuf being allocated.
- */
-struct mbuf *
-m_gethdr(int how, short type)
-{
- struct mbuf *mb;
-
- mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL);
- if (mb != NULL) {
- _mbhdr_setup(mb, type);
-#ifdef MAC
- if (mac_init_mbuf(mb, MBTOM(how)) != 0) {
- m_free(mb);
- return (NULL);
- }
-#endif
- }
- return (mb);
-}
-
-/*
- * Allocate and return a single (normal) pre-zero'd mbuf. NULL is
- * returned on failure.
- *
- * Arguments:
- * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
- * if really starved for memory. M_DONTWAIT to never block.
- * - type: the type of the mbuf being allocated.
- */
-struct mbuf *
-m_get_clrd(int how, short type)
-{
- struct mbuf *mb;
-
- mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL);
- if (mb != NULL) {
- _mb_setup(mb, type);
- bzero(mtod(mb, caddr_t), MLEN);
- }
- return (mb);
-}
-
-/*
- * Allocate and return a single M_PKTHDR pre-zero'd mbuf. NULL is
- * returned on failure.
- *
- * Arguments:
- * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
- * if really starved for memory. M_DONTWAIT to never block.
- * - type: the type of the mbuf being allocated.
- */
-struct mbuf *
-m_gethdr_clrd(int how, short type)
-{
- struct mbuf *mb;
-
- mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL);
- if (mb != NULL) {
- _mbhdr_setup(mb, type);
-#ifdef MAC
- if (mac_init_mbuf(mb, MBTOM(how)) != 0) {
- m_free(mb);
- return (NULL);
- }
-#endif
- bzero(mtod(mb, caddr_t), MHLEN);
- }
- return (mb);
-}
-
-/*
- * Free a single mbuf and any associated storage that it may have attached
- * to it. The associated storage may not be immediately freed if its
- * reference count is above 1. Returns the next mbuf in the chain following
- * the mbuf being freed.
- *
- * Arguments:
- * - mb: the mbuf to free.
- */
-struct mbuf *
-m_free(struct mbuf *mb)
-{
- struct mbuf *nb;
- int cchnum;
- short persist = 0;
-
-#ifdef INVARIANTS
- if (mb->m_flags & M_FREELIST)
- panic("m_free detected a mbuf double-free");
- mb->m_flags |= M_FREELIST;
-#endif
- if ((mb->m_flags & M_PKTHDR) != 0)
- m_tag_delete_chain(mb, NULL);
- nb = mb->m_next;
- if ((mb->m_flags & M_EXT) != 0) {
- MEXT_REM_REF(mb);
- if (atomic_cmpset_int(mb->m_ext.ref_cnt, 0, 1)) {
- if (mb->m_ext.ext_type == EXT_CLUSTER) {
- mb_free(&mb_list_clust,
- (caddr_t)mb->m_ext.ext_buf, MT_NOTMBUF,
- MBP_PERSIST, &cchnum);
- persist = MBP_PERSISTENT;
- } else {
- (*(mb->m_ext.ext_free))(mb->m_ext.ext_buf,
- mb->m_ext.ext_args);
- _mext_dealloc_ref(mb);
- persist = 0;
- }
- }
- }
- mb_free(&mb_list_mbuf, mb, mb->m_type, persist, &cchnum);
- return (nb);
-}
-
-/*
- * Free an entire chain of mbufs and associated external buffers, if
- * applicable. Right now, we only optimize a little so that the cache
- * lock may be held across a single mbuf+cluster free. Hopefully,
- * we'll eventually be holding the lock across more than merely two
- * consecutive frees but right now this is hard to implement because of
- * things like _mext_dealloc_ref (may do a free()) and atomic ops in the
- * loop.
- *
- * - mb: the mbuf chain to free.
- */
-void
-m_freem(struct mbuf *mb)
-{
-
- while (mb != NULL)
- mb = m_free(mb);
-}
-
-/*
- * Fetch an mbuf with a cluster attached to it. If one of the
- * allocations fails, the entire allocation fails. This routine is
- * the preferred way of fetching both the mbuf and cluster together,
- * as it avoids having to unlock/relock between allocations. Returns
- * NULL on failure.
- *
- * Arguments:
- * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
- * if really starved for memory. M_DONTWAIT to never block.
- * - type: the type of the mbuf being allocated.
- * - flags: any flags to pass to the mbuf being allocated; if this includes
- * the M_PKTHDR bit, then the mbuf is configured as a M_PKTHDR mbuf.
- */
-struct mbuf *
-m_getcl(int how, short type, int flags)
-{
- struct mbuf *mb;
- int cchnum;
-
- mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type,
- MBP_PERSIST, &cchnum);
- if (mb == NULL)
- return NULL;
- mb->m_type = type;
- mb->m_next = NULL;
- mb->m_flags = flags;
- if ((flags & M_PKTHDR) != 0) {
- mb->m_nextpkt = NULL;
- mb->m_pkthdr.rcvif = NULL;
- mb->m_pkthdr.csum_flags = 0;
- SLIST_INIT(&mb->m_pkthdr.tags);
- }
-
- mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust, how,
- MT_NOTMBUF, MBP_PERSISTENT, &cchnum);
- if (mb->m_ext.ext_buf == NULL) {
- (void)m_free(mb);
- mb = NULL;
- } else {
- _mcl_setup(mb);
- _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]);
-#ifdef MAC
- if (flags & M_PKTHDR) {
- if (mac_init_mbuf(mb, MBTOM(how)) != 0) {
- m_free(mb);
- return (NULL);
- }
- }
-#endif
- }
- return (mb);
-}
-
-/*
- * Fetch a single mbuf cluster and attach it to an existing mbuf. If
- * successfull, configures the provided mbuf to have mbuf->m_ext.ext_buf
- * pointing to the cluster, and sets the M_EXT bit in the mbuf's flags.
- * The M_EXT bit is not set on failure.
- *
- * Arguments:
- * - mb: the existing mbuf to which to attach the allocated cluster.
- * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
- * if really starved for memory. M_DONTWAIT to never block.
- */
-void
-m_clget(struct mbuf *mb, int how)
-{
-
- mb->m_ext.ext_buf= (caddr_t)mb_alloc(&mb_list_clust,how,MT_NOTMBUF,
- 0, NULL);
- if (mb->m_ext.ext_buf != NULL) {
- _mcl_setup(mb);
- _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]);
- }
-}
-
-/*
- * Configure a provided mbuf to refer to the provided external storage
- * buffer and setup a reference count for said buffer. If the setting
- * up of the reference count fails, the M_EXT bit will not be set. If
- * successfull, the M_EXT bit is set in the mbuf's flags.
- *
- * Arguments:
- * - mb: the existing mbuf to which to attach the provided buffer.
- * - buf: the address of the provided external storage buffer.
- * - size: the size of the provided buffer.
- * - freef: a pointer to a routine that is responsible for freeing the
- * provided external storage buffer.
- * - args: a pointer to an argument structure (of any type) to be passed
- * to the provided freef routine (may be NULL).
- * - flags: any other flags to be passed to the provided mbuf.
- * - type: the type that the external storage buffer should be labeled with.
- */
-void
-m_extadd(struct mbuf *mb, caddr_t buf, u_int size,
- void (*freef)(void *, void *), void *args, int flags, int type)
-{
- u_int *ref_cnt = NULL;
-
- if (type == EXT_CLUSTER)
- ref_cnt = &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)];
- else if (type == EXT_EXTREF)
- ref_cnt = mb->m_ext.ref_cnt;
- _mext_init_ref(mb, ref_cnt);
- if (mb->m_ext.ref_cnt != NULL) {
- mb->m_flags |= (M_EXT | flags);
- mb->m_ext.ext_buf = buf;
- mb->m_data = mb->m_ext.ext_buf;
- mb->m_ext.ext_size = size;
- mb->m_ext.ext_free = freef;
- mb->m_ext.ext_args = args;
- mb->m_ext.ext_type = type;
- }
-}
-
-/*
- * Change type of provided mbuf. This is a relatively expensive operation
- * (due to the cost of statistics manipulations) and should be avoided, where
- * possible.
- *
- * Arguments:
- * - mb: the provided mbuf for which the type needs to be changed.
- * - new_type: the new type to change the mbuf to.
- */
-void
-m_chtype(struct mbuf *mb, short new_type)
-{
- struct mb_gen_list *gen_list;
-
- gen_list = MB_GET_GEN_LIST(&mb_list_mbuf);
- MB_LOCK_CONT(gen_list);
- MB_MBTYPES_DEC(gen_list, mb->m_type, 1);
- MB_MBTYPES_INC(gen_list, new_type, 1);
- MB_UNLOCK_CONT(gen_list);
- mb->m_type = new_type;
-}
diff --git a/sys/kern/uipc_mbuf.c b/sys/kern/uipc_mbuf.c
index 5815fae..e14aba1 100644
--- a/sys/kern/uipc_mbuf.c
+++ b/sys/kern/uipc_mbuf.c
@@ -86,6 +86,161 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
#endif
/*
+ * Malloc-type for external ext_buf ref counts.
+ */
+MALLOC_DEFINE(M_MBUF, "mbextcnt", "mbuf external ref counts");
+
+/*
+ * Allocate a given length worth of mbufs and/or clusters (whatever fits
+ * best) and return a pointer to the top of the allocated chain. If an
+ * existing mbuf chain is provided, then we will append the new chain
+ * to the existing one but still return the top of the newly allocated
+ * chain.
+ */
+struct mbuf *
+m_getm(struct mbuf *m, int len, int how, short type)
+{
+ struct mbuf *mb, *top, *cur, *mtail;
+ int num, rem;
+ int i;
+
+ KASSERT(len >= 0, ("m_getm(): len is < 0"));
+
+ /* If m != NULL, we will append to the end of that chain. */
+ if (m != NULL)
+ for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next);
+ else
+ mtail = NULL;
+
+ /*
+ * Calculate how many mbufs+clusters ("packets") we need and how much
+ * leftover there is after that and allocate the first mbuf+cluster
+ * if required.
+ */
+ num = len / MCLBYTES;
+ rem = len % MCLBYTES;
+ top = cur = NULL;
+ if (num > 0) {
+ if ((top = cur = m_getcl(how, type, 0)) == NULL)
+ goto failed;
+ top->m_len = 0;
+ }
+ num--;
+
+ for (i = 0; i < num; i++) {
+ mb = m_getcl(how, type, 0);
+ if (mb == NULL)
+ goto failed;
+ mb->m_len = 0;
+ cur = (cur->m_next = mb);
+ }
+ if (rem > 0) {
+ mb = (rem > MINCLSIZE) ?
+ m_getcl(how, type, 0) : m_get(how, type);
+ if (mb == NULL)
+ goto failed;
+ mb->m_len = 0;
+ if (cur == NULL)
+ top = mb;
+ else
+ cur->m_next = mb;
+ }
+
+ if (mtail != NULL)
+ mtail->m_next = top;
+ return top;
+failed:
+ if (top != NULL)
+ m_freem(top);
+ return NULL;
+}
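/*
 * Illustrative sketch (not part of the diff): a caller can use the new
 * m_getm() either to grow an existing chain or, by passing a NULL chain,
 * to build a fresh one.  The helper name example_build_chain() is
 * hypothetical.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>

static struct mbuf *
example_build_chain(int len)
{
        struct mbuf *top;

        /* NULL first argument: allocate a brand-new chain covering len bytes. */
        top = m_getm(NULL, len, M_DONTWAIT, MT_DATA);
        if (top == NULL)
                return (NULL);          /* M_DONTWAIT may fail outright */
        /* Every mbuf in the chain starts with m_len == 0; fill as needed. */
        return (top);
}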
+
+/*
+ * Free an entire chain of mbufs and associated external buffers, if
+ * applicable.
+ */
+void
+m_freem(struct mbuf *mb)
+{
+
+ while (mb != NULL)
+ mb = m_free(mb);
+}
+
+/*-
+ * Configure a provided mbuf to refer to the provided external storage
+ * buffer and set up a reference count for said buffer. If the setting
+ * up of the reference count fails, the M_EXT bit will not be set. If
+ * successful, the M_EXT bit is set in the mbuf's flags.
+ *
+ * Arguments:
+ * mb The existing mbuf to which to attach the provided buffer.
+ * buf The address of the provided external storage buffer.
+ * size The size of the provided buffer.
+ * freef A pointer to a routine that is responsible for freeing the
+ * provided external storage buffer.
+ * args A pointer to an argument structure (of any type) to be passed
+ * to the provided freef routine (may be NULL).
+ * flags Any other flags to be passed to the provided mbuf.
+ * type The type that the external storage buffer should be
+ * labeled with.
+ *
+ * Returns:
+ * Nothing.
+ */
+void
+m_extadd(struct mbuf *mb, caddr_t buf, u_int size,
+ void (*freef)(void *, void *), void *args, int flags, int type)
+{
+ u_int *ref_cnt = NULL;
+
+ /* XXX Shouldn't be adding EXT_CLUSTER with this API */
+ if (type == EXT_CLUSTER)
+ ref_cnt = (u_int *)uma_find_refcnt(zone_clust,
+ mb->m_ext.ext_buf);
+ else if (type == EXT_EXTREF)
+ ref_cnt = mb->m_ext.ref_cnt;
+ mb->m_ext.ref_cnt = (ref_cnt == NULL) ?
+ malloc(sizeof(u_int), M_MBUF, M_NOWAIT) : (u_int *)ref_cnt;
+ if (mb->m_ext.ref_cnt != NULL) {
+ *(mb->m_ext.ref_cnt) = 1;
+ mb->m_flags |= (M_EXT | flags);
+ mb->m_ext.ext_buf = buf;
+ mb->m_data = mb->m_ext.ext_buf;
+ mb->m_ext.ext_size = size;
+ mb->m_ext.ext_free = freef;
+ mb->m_ext.ext_args = args;
+ mb->m_ext.ext_type = type;
+ }
+}
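/*
 * Illustrative sketch (not part of the diff): attaching driver-owned
 * external storage with m_extadd() (via the MEXTADD() wrapper).  The names
 * my_buf_free(), my_wrap_buffer() and the softc argument are hypothetical.
 * The M_EXT check catches the case where allocating the reference counter
 * failed.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>

static void
my_buf_free(void *buf, void *args)
{
        /* Hand "buf" back to the driver's private pool; "args" is the softc. */
}

static struct mbuf *
my_wrap_buffer(caddr_t buf, u_int size, void *softc)
{
        struct mbuf *m;

        m = m_gethdr(M_DONTWAIT, MT_DATA);
        if (m == NULL)
                return (NULL);
        MEXTADD(m, buf, size, my_buf_free, softc, 0, EXT_NET_DRV);
        if ((m->m_flags & M_EXT) == 0) {        /* ref count alloc failed */
                m_free(m);
                return (NULL);
        }
        return (m);
}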
+
+/*
+ * Non-directly-exported function to clean up after mbufs with M_EXT
+ * storage attached to them if the reference count hits 0.
+ */
+void
+mb_free_ext(struct mbuf *m)
+{
+
+ MEXT_REM_REF(m);
+ if (atomic_cmpset_int(m->m_ext.ref_cnt, 0, 1)) {
+ if (m->m_ext.ext_type == EXT_PACKET) {
+ uma_zfree(zone_pack, m);
+ return;
+ } else if (m->m_ext.ext_type == EXT_CLUSTER) {
+ uma_zfree(zone_clust, m->m_ext.ext_buf);
+ m->m_ext.ext_buf = NULL;
+ } else {
+ (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
+ m->m_ext.ext_args);
+ if (m->m_ext.ext_type != EXT_EXTREF)
+ free(m->m_ext.ref_cnt, M_MBUF);
+ }
+ }
+ uma_zfree(zone_mbuf, m);
+}
+
+/*
* "Move" mbuf pkthdr from "from" to "to".
* "from" must have M_PKTHDR set, and "to" must be empty.
*/
@@ -364,22 +519,22 @@ m_dup(struct mbuf *m, int how)
struct mbuf *n;
/* Get the next new mbuf */
- MGET(n, how, m->m_type);
+ if (remain >= MINCLSIZE) {
+ n = m_getcl(how, m->m_type, 0);
+ nsize = MCLBYTES;
+ } else {
+ n = m_get(how, m->m_type);
+ nsize = MLEN;
+ }
if (n == NULL)
goto nospace;
- if (top == NULL) { /* first one, must be PKTHDR */
- if (!m_dup_pkthdr(n, m, how))
- goto nospace;
- nsize = MHLEN;
- } else /* not the first one */
- nsize = MLEN;
- if (remain >= MINCLSIZE) {
- MCLGET(n, how);
- if ((n->m_flags & M_EXT) == 0) {
- (void)m_free(n);
+
+ if (top == NULL) { /* First one, must be PKTHDR */
+ if (!m_dup_pkthdr(n, m, how)) {
+ m_free(n);
goto nospace;
}
- nsize = MCLBYTES;
+ nsize = MHLEN;
}
n->m_len = 0;
@@ -651,39 +806,42 @@ m_devget(char *buf, int totlen, int off, struct ifnet *ifp,
void (*copy)(char *from, caddr_t to, u_int len))
{
struct mbuf *m;
- struct mbuf *top = 0, **mp = &top;
+ struct mbuf *top = NULL, **mp = &top;
int len;
if (off < 0 || off > MHLEN)
return (NULL);
- MGETHDR(m, M_DONTWAIT, MT_DATA);
- if (m == NULL)
- return (NULL);
- m->m_pkthdr.rcvif = ifp;
- m->m_pkthdr.len = totlen;
- len = MHLEN;
-
while (totlen > 0) {
- if (top) {
- MGET(m, M_DONTWAIT, MT_DATA);
- if (m == NULL) {
- m_freem(top);
- return (NULL);
- }
- len = MLEN;
- }
- if (totlen + off >= MINCLSIZE) {
- MCLGET(m, M_DONTWAIT);
- if (m->m_flags & M_EXT)
+ if (top == NULL) { /* First one, must be PKTHDR */
+ if (totlen + off >= MINCLSIZE) {
+ m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
len = MCLBYTES;
+ } else {
+ m = m_gethdr(M_DONTWAIT, MT_DATA);
+ len = MHLEN;
+
+ /* Place initial small packet/header at end of mbuf */
+ if (m && totlen + off + max_linkhdr <= MLEN) {
+ m->m_data += max_linkhdr;
+ len -= max_linkhdr;
+ }
+ }
+ if (m == NULL)
+ return NULL;
+ m->m_pkthdr.rcvif = ifp;
+ m->m_pkthdr.len = totlen;
} else {
- /*
- * Place initial small packet/header at end of mbuf.
- */
- if (top == NULL && totlen + off + max_linkhdr <= len) {
- m->m_data += max_linkhdr;
- len -= max_linkhdr;
+ if (totlen + off >= MINCLSIZE) {
+ m = m_getcl(M_DONTWAIT, MT_DATA, 0);
+ len = MCLBYTES;
+ } else {
+ m = m_get(M_DONTWAIT, MT_DATA);
+ len = MLEN;
+ }
+ if (m == NULL) {
+ m_freem(top);
+ return NULL;
}
}
if (off) {
@@ -722,9 +880,10 @@ m_copyback(struct mbuf *m0, int off, int len, c_caddr_t cp)
off -= mlen;
totlen += mlen;
if (m->m_next == NULL) {
- n = m_get_clrd(M_DONTWAIT, m->m_type);
+ n = m_get(M_DONTWAIT, m->m_type);
if (n == NULL)
goto out;
+ bzero(mtod(n, caddr_t), MLEN);
n->m_len = min(MLEN, len + off);
m->m_next = n;
}
diff --git a/sys/kern/uipc_mbuf2.c b/sys/kern/uipc_mbuf2.c
index 0d11aac..ff7944d 100644
--- a/sys/kern/uipc_mbuf2.c
+++ b/sys/kern/uipc_mbuf2.c
@@ -230,14 +230,10 @@ m_pulldown(struct mbuf *m, int off, int len, int *offp)
* now, we need to do the hard way. don't m_copy as there's no room
* on both end.
*/
- MGET(o, M_DONTWAIT, m->m_type);
- if (o && len > MLEN) {
- MCLGET(o, M_DONTWAIT);
- if ((o->m_flags & M_EXT) == 0) {
- m_free(o);
- o = NULL;
- }
- }
+ if (len > MLEN)
+ o = m_getcl(M_DONTWAIT, m->m_type, 0);
+ else
+ o = m_get(M_DONTWAIT, m->m_type);
if (!o) {
m_freem(m);
return NULL; /* ENOBUFS */
@@ -274,29 +270,27 @@ static struct mbuf *
m_dup1(struct mbuf *m, int off, int len, int wait)
{
struct mbuf *n;
- int l;
int copyhdr;
if (len > MCLBYTES)
return NULL;
- if (off == 0 && (m->m_flags & M_PKTHDR) != 0) {
+ if (off == 0 && (m->m_flags & M_PKTHDR) != 0)
copyhdr = 1;
- MGETHDR(n, wait, m->m_type);
- l = MHLEN;
- } else {
+ else
copyhdr = 0;
- MGET(n, wait, m->m_type);
- l = MLEN;
- }
- if (n && len > l) {
- MCLGET(n, wait);
- if ((n->m_flags & M_EXT) == 0) {
- m_free(n);
- n = NULL;
- }
+ if (len >= MINCLSIZE) {
+ if (copyhdr == 1)
+ n = m_getcl(wait, m->m_type, M_PKTHDR);
+ else
+ n = m_getcl(wait, m->m_type, 0);
+ } else {
+ if (copyhdr == 1)
+ n = m_gethdr(wait, m->m_type);
+ else
+ n = m_get(wait, m->m_type);
}
if (!n)
- return NULL;
+ return NULL; /* ENOBUFS */
if (copyhdr && !m_dup_pkthdr(n, m, wait)) {
m_free(n);
diff --git a/sys/kern/uipc_sockbuf.c b/sys/kern/uipc_sockbuf.c
index 3ab8f3a..a404d69 100644
--- a/sys/kern/uipc_sockbuf.c
+++ b/sys/kern/uipc_sockbuf.c
@@ -959,15 +959,12 @@ sbcreatecontrol(p, size, type, level)
if (CMSG_SPACE((u_int)size) > MCLBYTES)
return ((struct mbuf *) NULL);
- if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
+ if (CMSG_SPACE((u_int)size) > MLEN)
+ m = m_getcl(M_DONTWAIT, MT_CONTROL, 0);
+ else
+ m = m_get(M_DONTWAIT, MT_CONTROL);
+ if (m == NULL)
return ((struct mbuf *) NULL);
- if (CMSG_SPACE((u_int)size) > MLEN) {
- MCLGET(m, M_DONTWAIT);
- if ((m->m_flags & M_EXT) == 0) {
- m_free(m);
- return ((struct mbuf *) NULL);
- }
- }
cp = mtod(m, struct cmsghdr *);
m->m_len = 0;
KASSERT(CMSG_SPACE((u_int)size) <= M_TRAILINGSPACE(m),
diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c
index e07f4ef..6735e49 100644
--- a/sys/kern/uipc_socket.c
+++ b/sys/kern/uipc_socket.c
@@ -527,8 +527,8 @@ sosend(so, addr, uio, top, control, flags, td)
{
struct mbuf **mp;
struct mbuf *m;
- long space, len, resid;
- int clen = 0, error, s, dontroute, mlen;
+ long space, len = 0, resid;
+ int clen = 0, error, s, dontroute;
int atomic = sosendallatonce(so) || top;
#ifdef ZERO_COPY_SOCKETS
int cow_send;
@@ -624,25 +624,23 @@ restart:
#ifdef ZERO_COPY_SOCKETS
cow_send = 0;
#endif /* ZERO_COPY_SOCKETS */
- if (top == 0) {
- MGETHDR(m, M_TRYWAIT, MT_DATA);
- if (m == NULL) {
- error = ENOBUFS;
- goto release;
- }
- mlen = MHLEN;
- m->m_pkthdr.len = 0;
- m->m_pkthdr.rcvif = (struct ifnet *)0;
- } else {
- MGET(m, M_TRYWAIT, MT_DATA);
- if (m == NULL) {
- error = ENOBUFS;
- goto release;
- }
- mlen = MLEN;
- }
if (resid >= MINCLSIZE) {
#ifdef ZERO_COPY_SOCKETS
+ if (top == NULL) {
+ MGETHDR(m, M_TRYWAIT, MT_DATA);
+ if (m == NULL) {
+ error = ENOBUFS;
+ goto release;
+ }
+ m->m_pkthdr.len = 0;
+ m->m_pkthdr.rcvif = (struct ifnet *)0;
+ } else {
+ MGET(m, M_TRYWAIT, MT_DATA);
+ if (m == NULL) {
+ error = ENOBUFS;
+ goto release;
+ }
+ }
if (so_zero_copy_send &&
resid>=PAGE_SIZE &&
space>=PAGE_SIZE &&
@@ -654,29 +652,48 @@ restart:
cow_send = socow_setup(m, uio);
}
}
- if (!cow_send){
+ if (!cow_send) {
+ MCLGET(m, M_TRYWAIT);
+ if ((m->m_flags & M_EXT) == 0) {
+ m_free(m);
+ m = NULL;
+ } else {
+ len = min(min(MCLBYTES, resid), space);
+ }
+ } else
+ len = PAGE_SIZE;
+#else /* ZERO_COPY_SOCKETS */
+ if (top == NULL) {
+ m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
+ m->m_pkthdr.len = 0;
+ m->m_pkthdr.rcvif = (struct ifnet *)0;
+ } else
+ m = m_getcl(M_TRYWAIT, MT_DATA, 0);
+ len = min(min(MCLBYTES, resid), space);
#endif /* ZERO_COPY_SOCKETS */
- MCLGET(m, M_TRYWAIT);
- if ((m->m_flags & M_EXT) == 0)
- goto nopages;
- mlen = MCLBYTES;
- len = min(min(mlen, resid), space);
} else {
-#ifdef ZERO_COPY_SOCKETS
- len = PAGE_SIZE;
+ if (top == NULL) {
+ m = m_gethdr(M_TRYWAIT, MT_DATA);
+ m->m_pkthdr.len = 0;
+ m->m_pkthdr.rcvif = (struct ifnet *)0;
+
+ len = min(min(MHLEN, resid), space);
+ /*
+ * For datagram protocols, leave room
+ * for protocol headers in first mbuf.
+ */
+ if (atomic && m && len < MHLEN)
+ MH_ALIGN(m, len);
+ } else {
+ m = m_get(M_TRYWAIT, MT_DATA);
+ len = min(min(MLEN, resid), space);
}
-
- } else {
-#endif /* ZERO_COPY_SOCKETS */
-nopages:
- len = min(min(mlen, resid), space);
- /*
- * For datagram protocols, leave room
- * for protocol headers in first mbuf.
- */
- if (atomic && top == 0 && len < mlen)
- MH_ALIGN(m, len);
}
+ if (m == NULL) {
+ error = ENOBUFS;
+ goto release;
+ }
+
space -= len;
#ifdef ZERO_COPY_SOCKETS
if (cow_send)
diff --git a/sys/kern/uipc_socket2.c b/sys/kern/uipc_socket2.c
index 3ab8f3a..a404d69 100644
--- a/sys/kern/uipc_socket2.c
+++ b/sys/kern/uipc_socket2.c
@@ -959,15 +959,12 @@ sbcreatecontrol(p, size, type, level)
if (CMSG_SPACE((u_int)size) > MCLBYTES)
return ((struct mbuf *) NULL);
- if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
+ if (CMSG_SPACE((u_int)size) > MLEN)
+ m = m_getcl(M_DONTWAIT, MT_CONTROL, 0);
+ else
+ m = m_get(M_DONTWAIT, MT_CONTROL);
+ if (m == NULL)
return ((struct mbuf *) NULL);
- if (CMSG_SPACE((u_int)size) > MLEN) {
- MCLGET(m, M_DONTWAIT);
- if ((m->m_flags & M_EXT) == 0) {
- m_free(m);
- return ((struct mbuf *) NULL);
- }
- }
cp = mtod(m, struct cmsghdr *);
m->m_len = 0;
KASSERT(CMSG_SPACE((u_int)size) <= M_TRAILINGSPACE(m),
diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c
index 1b886f5..978c30e 100644
--- a/sys/kern/uipc_syscalls.c
+++ b/sys/kern/uipc_syscalls.c
@@ -61,6 +61,7 @@ __FBSDID("$FreeBSD$");
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#ifdef KTRACE
@@ -85,6 +86,21 @@ static int getpeername1(struct thread *td, struct getpeername_args *uap,
int compat);
/*
+ * NSFBUFS-related variables and associated sysctls
+ */
+int nsfbufs;
+int nsfbufspeak;
+int nsfbufsused;
+
+SYSCTL_DECL(_kern_ipc);
+SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
+ "Maximum number of sendfile(2) sf_bufs available");
+SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0,
+ "Number of sendfile(2) sf_bufs at peak usage");
+SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0,
+ "Number of sendfile(2) sf_bufs in use");
+
+/*
* System call interface to the socket abstraction.
*/
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
diff --git a/sys/sparc64/sparc64/vm_machdep.c b/sys/sparc64/sparc64/vm_machdep.c
index fe263f1..4a34567 100644
--- a/sys/sparc64/sparc64/vm_machdep.c
+++ b/sys/sparc64/sparc64/vm_machdep.c
@@ -86,6 +86,10 @@
#include <machine/tlb.h>
#include <machine/tstate.h>
+#ifndef NSFBUFS
+#define NSFBUFS (512 + maxusers * 16)
+#endif
+
static void sf_buf_init(void *arg);
SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL)
@@ -351,6 +355,9 @@ sf_buf_init(void *arg)
vm_offset_t sf_base;
int i;
+ nsfbufs = NSFBUFS;
+ TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs);
+
mtx_init(&sf_freelist.sf_lock, "sf_bufs list lock", NULL, MTX_DEF);
SLIST_INIT(&sf_freelist.sf_head);
sf_base = kmem_alloc_nofault(kernel_map, nsfbufs * PAGE_SIZE);
diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h
index d86c57c..2170599 100644
--- a/sys/sys/mbuf.h
+++ b/sys/sys/mbuf.h
@@ -10,7 +10,7 @@
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
- * 4. Neither the name of the University nor the names of its contributors
+ * 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
@@ -33,7 +33,12 @@
#ifndef _SYS_MBUF_H_
#define _SYS_MBUF_H_
+/* XXX: These includes suck. Sorry! */
#include <sys/queue.h>
+#ifdef _KERNEL
+#include <sys/systm.h>
+#include <vm/uma.h>
+#endif
/*
* Mbufs are of a single size, MSIZE (sys/param.h), which
@@ -57,6 +62,16 @@
*/
#define mtod(m, t) ((t)((m)->m_data))
#define dtom(x) ((struct mbuf *)((intptr_t)(x) & ~(MSIZE-1)))
+
+/*
+ * Argument structure passed to UMA routines during mbuf and packet
+ * allocations.
+ */
+struct mb_args {
+ int flags; /* Flags for mbuf being allocated */
+ int how; /* How to allocate: M_WAITOK or M_DONTWAIT */
+ short type; /* Type of mbuf being allocated */
+};
#endif /* _KERNEL */
/*
@@ -167,6 +182,7 @@ struct mbuf {
*/
#define EXT_CLUSTER 1 /* mbuf cluster */
#define EXT_SFBUF 2 /* sendfile(2)'s sf_bufs */
+#define EXT_PACKET 3 /* came out of Packet zone */
#define EXT_NET_DRV 100 /* custom ext_buf provided by net driver(s) */
#define EXT_MOD_TYPE 200 /* custom module's ext_buf type */
#define EXT_DISPOSABLE 300 /* can throw this buffer away w/page flipping */
@@ -223,28 +239,12 @@ struct mbuf {
#define MT_NTYPES 16 /* number of mbuf types for mbtypes[] */
/*
- * Mbuf and cluster allocation statistics PCPU structure.
- */
-struct mbpstat {
- u_long mb_mbfree;
- u_long mb_mbbucks;
- u_long mb_clfree;
- u_long mb_clbucks;
- long mb_mbtypes[MT_NTYPES];
- short mb_active;
-};
-
-/*
* General mbuf allocator statistics structure.
- * XXX: Modifications of these are not protected by any mutex locks nor by
- * any atomic() manipulations. As a result, we may occasionally lose
- * a count or two. Luckily, not all of these fields are modified at all
- * and remain static, and those that are manipulated are only manipulated
- * in failure situations, which do not occur (hopefully) very often.
*/
struct mbstat {
- u_long m_drops; /* times failed to allocate */
- u_long m_wait; /* times succesfully returned from wait */
+ u_long m_mbufs; /* XXX */
+ u_long m_mclusts; /* XXX */
+
u_long m_drain; /* times drained protocols for space */
u_long m_mcfail; /* XXX: times m_copym failed */
u_long m_mpfail; /* XXX: times m_pullup failed */
@@ -253,10 +253,10 @@ struct mbstat {
u_long m_minclsize; /* min length of data to allocate a cluster */
u_long m_mlen; /* length of data in an mbuf */
u_long m_mhlen; /* length of data in a header mbuf */
- u_int m_mbperbuck; /* number of mbufs per "bucket" */
- u_int m_clperbuck; /* number of clusters per "bucket" */
- /* Number of mbtypes (gives # elems in mbpstat's mb_mbtypes[] array: */
+
+ /* Number of mbtypes (gives # elems in mbtypes[] array): */
short m_numtypes;
+
/* XXX: Sendfile stats should eventually move to their own struct */
u_long sf_iocnt; /* times sendfile had to do disk I/O */
u_long sf_allocfail; /* times sfbuf allocation failed */
@@ -265,14 +265,23 @@ struct mbstat {
/*
* Flags specifying how an allocation should be made.
- * M_DONTWAIT means "don't block if nothing is available" whereas
- * M_TRYWAIT means "block for mbuf_wait ticks at most if nothing is
- * available."
+ *
+ * The flag to use is as follows:
+ * - M_DONTWAIT or M_NOWAIT from an interrupt handler to not block allocation.
+ * - M_WAIT or M_WAITOK or M_TRYWAIT from wherever it is safe to block.
+ *
+ * M_DONTWAIT/M_NOWAIT means that we will not block the thread explicitly
+ * and if we cannot allocate immediately we may return NULL,
+ * whereas M_WAIT/M_WAITOK/M_TRYWAIT means that if we cannot allocate
+ * resources we will block until they are available, and thus never
+ * return NULL.
+ *
+ * XXX Eventually just phase this out to use M_WAITOK/M_NOWAIT.
*/
-#define M_DONTWAIT 0x4 /* don't conflict with M_NOWAIT */
-#define M_TRYWAIT 0x8 /* or M_WAITOK */
-#define M_WAIT M_TRYWAIT /* XXX: deprecated */
-#define MBTOM(how) ((how) & M_TRYWAIT ? M_WAITOK : M_NOWAIT)
+#define MBTOM(how) (how)
+#define M_DONTWAIT M_NOWAIT
+#define M_TRYWAIT M_WAITOK
+#define M_WAIT M_WAITOK
#ifdef _KERNEL
/*-
@@ -296,12 +305,114 @@ struct mbstat {
#define MEXT_ADD_REF(m) atomic_add_int((m)->m_ext.ref_cnt, 1)
/*
+ * Network buffer allocation API
+ *
+ * The rest of it is defined in kern/kern_mbuf.c
+ */
+
+extern uma_zone_t zone_mbuf;
+extern uma_zone_t zone_clust;
+extern uma_zone_t zone_pack;
+
+static __inline struct mbuf *m_get(int how, short type);
+static __inline struct mbuf *m_gethdr(int how, short type);
+static __inline struct mbuf *m_getcl(int how, short type, int flags);
+static __inline struct mbuf *m_getclr(int how, short type); /* XXX */
+static __inline struct mbuf *m_free(struct mbuf *m);
+static __inline void m_clget(struct mbuf *m, int how);
+static __inline void m_chtype(struct mbuf *m, short new_type);
+void mb_free_ext(struct mbuf *);
+
+static __inline
+struct mbuf *
+m_get(int how, short type)
+{
+ struct mb_args args;
+
+ args.flags = 0;
+ args.how = how;
+ args.type = type;
+ return (uma_zalloc_arg(zone_mbuf, &args, how));
+}
+
+/* XXX This should be deprecated, very little use */
+static __inline
+struct mbuf *
+m_getclr(int how, short type)
+{
+ struct mbuf *m;
+ struct mb_args args;
+
+ args.flags = 0;
+ args.how = how;
+ args.type = type;
+ m = uma_zalloc_arg(zone_mbuf, &args, how);
+ if (m != NULL)
+ bzero(m->m_data, MLEN);
+ return m;
+}
+
+static __inline
+struct mbuf *
+m_gethdr(int how, short type)
+{
+ struct mb_args args;
+
+ args.flags = M_PKTHDR;
+ args.how = how;
+ args.type = type;
+ return (uma_zalloc_arg(zone_mbuf, &args, how));
+}
+
+static __inline
+struct mbuf *
+m_getcl(int how, short type, int flags)
+{
+ struct mb_args args;
+
+ args.flags = flags;
+ args.how = how;
+ args.type = type;
+ return (uma_zalloc_arg(zone_pack, &args, how));
+}
+
+static __inline
+struct mbuf *
+m_free(struct mbuf *m)
+{
+ struct mbuf *n = m->m_next;
+
+#ifdef INVARIANTS
+ m->m_flags |= M_FREELIST;
+#endif
+ if (m->m_flags & M_EXT)
+ mb_free_ext(m);
+ else
+ uma_zfree(zone_mbuf, m);
+ return n;
+}
+
+static __inline
+void
+m_clget(struct mbuf *m, int how)
+{
+ m->m_ext.ext_buf = NULL;
+ uma_zalloc_arg(zone_clust, m, how);
+}
+
+static __inline
+void
+m_chtype(struct mbuf *m, short new_type)
+{
+ m->m_type = new_type;
+}
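/*
 * Illustrative sketch (not part of the diff): typical use of the new inline
 * allocators.  m_getcl() returns an mbuf with a cluster already attached
 * (from zone_pack), while m_clget() attaches a bare cluster and, as before,
 * the caller must test M_EXT.  The example_* names are hypothetical.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>

static struct mbuf *
example_alloc_pkt(int len)
{
        struct mbuf *m;

        if (len > MHLEN)
                /* mbuf + cluster in one allocation, from the packet zone. */
                m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
        else
                m = m_gethdr(M_DONTWAIT, MT_DATA);
        return (m);                     /* may be NULL with M_DONTWAIT */
}

static int
example_add_cluster(struct mbuf *m)
{

        /* Attach a cluster to an existing mbuf; M_EXT signals success. */
        m_clget(m, M_DONTWAIT);
        return ((m->m_flags & M_EXT) != 0);
}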
+
+/*
* mbuf, cluster, and external object allocation macros
* (for compatibility purposes).
*/
/* NB: M_COPY_PKTHDR is deprecated. Use M_MOVE_PKTHDR or m_dup_pkthdr. */
#define M_MOVE_PKTHDR(to, from) m_move_pkthdr((to), (from))
-#define m_getclr(how, type) m_get_clrd((how), (type))
#define MGET(m, how, type) ((m) = m_get((how), (type)))
#define MGETHDR(m, how, type) ((m) = m_gethdr((how), (type)))
#define MCLGET(m, how) m_clget((m), (how))
@@ -309,23 +420,6 @@ struct mbstat {
m_extadd((m), (caddr_t)(buf), (size), (free), (args), (flags), (type))
/*
- * MEXTFREE(m): disassociate (and possibly free) an external object from (m).
- *
- * If the atomic_cmpset_int() returns 0, then we effectively do nothing
- * in terms of "cleaning up" (freeing the ext buf and ref. counter) as
- * this means that either there are still references, or another thread
- * is taking care of the clean-up.
- */
-#define MEXTFREE(m) do { \
- struct mbuf *_mb = (m); \
- \
- MEXT_REM_REF(_mb); \
- if (atomic_cmpset_int(_mb->m_ext.ref_cnt, 0, 1)) \
- _mext_free(_mb); \
- _mb->m_flags &= ~M_EXT; \
-} while (0)
-
-/*
* Evaluate TRUE if it's safe to write to the mbuf m's data region (this
* can be both the local data payload, or an external buffer area,
* depending on whether M_EXT is set).
@@ -425,18 +519,13 @@ extern int max_linkhdr; /* Largest link-level header */
extern int max_protohdr; /* Largest protocol header */
extern struct mbstat mbstat; /* General mbuf stats/infos */
extern int nmbclusters; /* Maximum number of clusters */
-extern int nmbcnt; /* Scale kmem_map for counter space */
-extern int nmbufs; /* Maximum number of mbufs */
struct uio;
-void _mext_free(struct mbuf *);
void m_adj(struct mbuf *, int);
int m_apply(struct mbuf *, int, int,
int (*)(void *, void *, u_int), void *);
void m_cat(struct mbuf *, struct mbuf *);
-void m_chtype(struct mbuf *, short);
-void m_clget(struct mbuf *, int);
void m_extadd(struct mbuf *, caddr_t, u_int,
void (*)(void *, void *), void *, int, int);
void m_copyback(struct mbuf *, int, int, c_caddr_t);
@@ -451,13 +540,7 @@ struct mbuf *m_dup(struct mbuf *, int);
int m_dup_pkthdr(struct mbuf *, struct mbuf *, int);
u_int m_fixhdr(struct mbuf *);
struct mbuf *m_fragment(struct mbuf *, int, int);
-struct mbuf *m_free(struct mbuf *);
void m_freem(struct mbuf *);
-struct mbuf *m_get(int, short);
-struct mbuf *m_get_clrd(int, short);
-struct mbuf *m_getcl(int, short, int);
-struct mbuf *m_gethdr(int, short);
-struct mbuf *m_gethdr_clrd(int, short);
struct mbuf *m_getm(struct mbuf *, int, int, short);
struct mbuf *m_getptr(struct mbuf *, int, int *);
u_int m_length(struct mbuf *, struct mbuf **);
@@ -470,7 +553,7 @@ struct mbuf *m_split(struct mbuf *, int, int);
struct mbuf *m_uiotombuf(struct uio *, int, int);
/*-
- * Packets may have annotations attached by affixing a list
+ * Network packets may have annotations attached by affixing a list
* of "packet tags" to the pkthdr structure. Packet tags are
* dynamically allocated semi-opaque data structures that have
* a fixed header (struct m_tag) that specifies the size of the
diff --git a/sys/vm/uma.h b/sys/vm/uma.h
index 4de1efa..0d34ca3 100644
--- a/sys/vm/uma.h
+++ b/sys/vm/uma.h
@@ -43,7 +43,7 @@
/* Types and type defs */
-struct uma_zone;
+struct uma_zone;
/* Opaque type used as a handle to the zone */
typedef struct uma_zone * uma_zone_t;
@@ -157,12 +157,46 @@ typedef void (*uma_fini)(void *mem, int size);
* A pointer to a structure which is intended to be opaque to users of
* the interface. The value may be null if the wait flag is not set.
*/
-
uma_zone_t uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
uma_init uminit, uma_fini fini, int align,
u_int16_t flags);
/*
+ * Create a secondary uma zone
+ *
+ * Arguments:
+ * name The text name of the zone for debugging and stats, this memory
+ * should not be freed until the zone has been deallocated.
+ * ctor The constructor that is called when the object is allocated
+ * dtor The destructor that is called when the object is freed.
+ * zinit An initializer that sets up the initial state of the memory
+ * as the object passes from the Keg's slab to the Zone's cache.
+ * zfini A discard function that undoes initialization done by init
+ * as the object passes from the Zone's cache to the Keg's slab.
+ *
+ * ctor/dtor/zinit/zfini may all be null, see notes above.
+ * Note that the zinit and zfini specified here are NOT
+ * exactly the same as the init/fini specified to uma_zcreate()
+ * when creating a master zone. These zinit/zfini are called
+ * on the TRANSITION from keg to zone (and vice-versa). Once
+ * these are set, the primary zone may alter its init/fini
+ * (which are called when the object passes from VM to keg)
+ * using uma_zone_set_init/fini()) as well as its own
+ * zinit/zfini (unset by default for master zone) with
+ * uma_zone_set_zinit/zfini() (note subtle 'z' prefix).
+ *
+ * master The master zone that supplies the backing keg; the new
+ * secondary zone shares that keg (and hence its item size,
+ * alignment and flags) with the master.
+ *
+ * Returns:
+ * A pointer to a structure which is intended to be opaque to users of
+ * the interface. The value may be null if the wait flag is not set.
+ */
+uma_zone_t uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor,
+ uma_init zinit, uma_fini zfini, uma_zone_t master);
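/*
 * Illustrative sketch (not part of the diff): the master/secondary pattern
 * used by the new mbuf allocator.  A master zone created with uma_zcreate()
 * supplies the backing keg, and uma_zsecond_create() layers a second zone
 * (with its own caches and optional zinit/zfini) on top of that same keg.
 * The names and the item size below are hypothetical.
 */
#include <sys/param.h>
#include <sys/malloc.h>
#include <vm/uma.h>

static uma_zone_t example_master;
static uma_zone_t example_secondary;

static void
example_zones_setup(void)
{

        example_master = uma_zcreate("example items", 256,
            NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
        example_secondary = uma_zsecond_create("example cached items",
            NULL, NULL, NULL, NULL, example_master);
}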
+
+/*
* Definitions for uma_zcreate flags
*
* These flags share space with UMA_ZFLAGs in uma_int.h. Be careful not to
@@ -185,6 +219,9 @@ uma_zone_t uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
* Use a hash table instead of caching
* information in the vm_page.
*/
+#define UMA_ZONE_SECONDARY 0x0200 /* Zone is a Secondary Zone */
+#define UMA_ZONE_REFCNT 0x0400 /* Allocate refcnts in slabs */
+#define UMA_ZONE_MAXBUCKET 0x0800 /* Use largest buckets */
/* Definitions for align */
#define UMA_ALIGN_PTR (sizeof(void *) - 1) /* Alignment fit for ptr */
@@ -201,7 +238,6 @@ uma_zone_t uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
* zone The zone we want to destroy.
*
*/
-
void uma_zdestroy(uma_zone_t zone);
/*
@@ -376,6 +412,28 @@ int uma_zone_set_obj(uma_zone_t zone, struct vm_object *obj, int size);
void uma_zone_set_max(uma_zone_t zone, int nitems);
/*
+ * The following two routines (uma_zone_set_init/fini)
+ * are used to set the backend init/fini pair which acts on an
+ * object as it becomes allocated and is placed in a slab within
+ * the specified zone's backing keg. They should not be changed once
+ * allocations have begun; if they are set at all, set them
+ * immediately after zone creation.
+ */
+void uma_zone_set_init(uma_zone_t zone, uma_init uminit);
+void uma_zone_set_fini(uma_zone_t zone, uma_fini fini);
+
+/*
+ * The following two routines (uma_zone_set_zinit/zfini) are
+ * used to set the zinit/zfini pair which acts on an object as
+ * it passes from the backing Keg's slab cache to the
+ * specified Zone's bucket cache. They should not be changed once
+ * allocations have begun; if they are set at all, set them
+ * immediately after zone creation.
+ */
+void uma_zone_set_zinit(uma_zone_t zone, uma_init zinit);
+void uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini);
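/*
 * Illustrative sketch (not part of the diff): wiring up both pairs right
 * after zone creation.  example_init/example_fini run as items move between
 * the VM and the keg's slabs; example_zinit/example_zfini run as items move
 * between the keg and the zone's bucket cache.  All example_* names are
 * hypothetical.
 */
#include <sys/param.h>
#include <vm/uma.h>

static void example_init(void *mem, int size)  { /* VM page -> keg slab */ }
static void example_fini(void *mem, int size)  { /* keg slab -> VM page */ }
static void example_zinit(void *mem, int size) { /* keg slab -> zone cache */ }
static void example_zfini(void *mem, int size) { /* zone cache -> keg slab */ }

static void
example_zone_tune(uma_zone_t zone)
{

        uma_zone_set_init(zone, example_init);
        uma_zone_set_fini(zone, example_fini);
        uma_zone_set_zinit(zone, example_zinit);
        uma_zone_set_zfini(zone, example_zfini);
}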
+
+/*
* Replaces the standard page_alloc or obj_alloc functions for this zone
*
* Arguments:
@@ -430,5 +488,19 @@ void uma_zone_set_freef(uma_zone_t zone, uma_free freef);
*/
void uma_prealloc(uma_zone_t zone, int itemcnt);
+/*
+ * Used to lookup the reference counter allocated for an item
+ * from a UMA_ZONE_REFCNT zone. For UMA_ZONE_REFCNT zones,
+ * reference counters are allocated for items and stored in
+ * the underlying slab header.
+ *
+ * Arguments:
+ * zone The UMA_ZONE_REFCNT zone to which the item belongs.
+ * item The address of the item for which we want a refcnt.
+ *
+ * Returns:
+ * A pointer to a u_int32_t reference counter.
+ */
+u_int32_t *uma_find_refcnt(uma_zone_t zone, void *item);
#endif
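/*
 * Illustrative sketch (not part of the diff): with UMA_ZONE_REFCNT every
 * item gets a reference counter stored in its slab header, and
 * uma_find_refcnt() returns a pointer to it (the new m_extadd() uses this
 * to locate a cluster's counter in zone_clust).  The zone name, item size
 * and refbuf_* names are hypothetical.
 */
#include <sys/param.h>
#include <sys/malloc.h>
#include <vm/uma.h>

static uma_zone_t refbuf_zone;

static void *
refbuf_alloc(void)
{
        u_int32_t *cnt;
        void *item;

        if (refbuf_zone == NULL)
                refbuf_zone = uma_zcreate("refbufs", 2048, NULL, NULL,
                    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
        item = uma_zalloc(refbuf_zone, M_WAITOK);
        cnt = uma_find_refcnt(refbuf_zone, item);
        *cnt = 1;               /* the caller starts with the only reference */
        return (item);
}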
diff --git a/sys/vm/uma_core.c b/sys/vm/uma_core.c
index f693540..82d60c6 100644
--- a/sys/vm/uma_core.c
+++ b/sys/vm/uma_core.c
@@ -84,15 +84,19 @@ __FBSDID("$FreeBSD$");
#include <machine/vmparam.h>
/*
- * This is the zone from which all zones are spawned. The idea is that even
- * the zone heads are allocated from the allocator, so we use the bss section
- * to bootstrap us.
+ * This is the zone and keg from which all zones are spawned. The idea is that
+ * even the zone & keg heads are allocated from the allocator, so we use the
+ * bss section to bootstrap us.
*/
-static struct uma_zone masterzone;
-static uma_zone_t zones = &masterzone;
+static struct uma_keg masterkeg;
+static struct uma_zone masterzone_k;
+static struct uma_zone masterzone_z;
+static uma_zone_t kegs = &masterzone_k;
+static uma_zone_t zones = &masterzone_z;
/* This is the zone from which all of uma_slab_t's are allocated. */
static uma_zone_t slabzone;
+static uma_zone_t slabrefzone; /* With refcounters (for UMA_ZONE_REFCNT) */
/*
* The initial hash tables come out of this zone so they can be allocated
@@ -107,10 +111,10 @@ static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
*/
static int bucketdisable = 1;
-/* Linked list of all zones in the system */
-static LIST_HEAD(,uma_zone) uma_zones = LIST_HEAD_INITIALIZER(&uma_zones);
+/* Linked list of all kegs in the system */
+static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(&uma_kegs);
-/* This mutex protects the zone list */
+/* This mutex protects the keg list */
static struct mtx uma_mtx;
/* These are the pcpu cache locks */
@@ -144,6 +148,16 @@ struct uma_zctor_args {
uma_dtor dtor;
uma_init uminit;
uma_fini fini;
+ uma_keg_t keg;
+ int align;
+ u_int16_t flags;
+};
+
+struct uma_kctor_args {
+ uma_zone_t zone;
+ size_t size;
+ uma_init uminit;
+ uma_fini fini;
int align;
u_int16_t flags;
};
@@ -179,6 +193,8 @@ static uma_slab_t slab_zalloc(uma_zone_t, int);
static void cache_drain(uma_zone_t);
static void bucket_drain(uma_zone_t, uma_bucket_t);
static void bucket_cache_drain(uma_zone_t zone);
+static void keg_ctor(void *, int, void *);
+static void keg_dtor(void *, int, void *);
static void zone_ctor(void *, int, void *);
static void zone_dtor(void *, int, void *);
static void zero_init(void *, int);
@@ -202,6 +218,8 @@ static int uma_zalloc_bucket(uma_zone_t zone, int flags);
static uma_slab_t uma_zone_slab(uma_zone_t zone, int flags);
static void *uma_slab_alloc(uma_zone_t zone, uma_slab_t slab);
static void zone_drain(uma_zone_t);
+static void uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
+ uma_fini fini, int align, u_int16_t flags);
void uma_print_zone(uma_zone_t);
void uma_print_stats(void);
@@ -328,10 +346,12 @@ uma_timeout(void *unused)
static void
zone_timeout(uma_zone_t zone)
{
+ uma_keg_t keg;
uma_cache_t cache;
u_int64_t alloc;
int cpu;
+ keg = zone->uz_keg;
alloc = 0;
/*
@@ -344,7 +364,7 @@ zone_timeout(uma_zone_t zone)
* to lock and do it here instead so that the statistics don't get too
* far out of sync.
*/
- if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL)) {
+ if (!(keg->uk_flags & UMA_ZFLAG_INTERNAL)) {
for (cpu = 0; cpu <= mp_maxid; cpu++) {
if (CPU_ABSENT(cpu))
continue;
@@ -369,8 +389,8 @@ zone_timeout(uma_zone_t zone)
* may be a little aggressive. Should I allow for two collisions max?
*/
- if (zone->uz_flags & UMA_ZONE_HASH &&
- zone->uz_pages / zone->uz_ppera >= zone->uz_hash.uh_hashsize) {
+ if (keg->uk_flags & UMA_ZONE_HASH &&
+ keg->uk_pages / keg->uk_ppera >= keg->uk_hash.uh_hashsize) {
struct uma_hash newhash;
struct uma_hash oldhash;
int ret;
@@ -381,14 +401,14 @@ zone_timeout(uma_zone_t zone)
* I have to do everything in stages and check for
* races.
*/
- newhash = zone->uz_hash;
+ newhash = keg->uk_hash;
ZONE_UNLOCK(zone);
ret = hash_alloc(&newhash);
ZONE_LOCK(zone);
if (ret) {
- if (hash_expand(&zone->uz_hash, &newhash)) {
- oldhash = zone->uz_hash;
- zone->uz_hash = newhash;
+ if (hash_expand(&keg->uk_hash, &newhash)) {
+ oldhash = keg->uk_hash;
+ keg->uk_hash = newhash;
} else
oldhash = newhash;
@@ -530,7 +550,7 @@ bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
mzone = 0;
/* We have to lookup the slab again for malloc.. */
- if (zone->uz_flags & UMA_ZONE_MALLOC)
+ if (zone->uz_keg->uk_flags & UMA_ZONE_MALLOC)
mzone = 1;
while (bucket->ub_cnt > 0) {
@@ -636,29 +656,32 @@ static void
zone_drain(uma_zone_t zone)
{
struct slabhead freeslabs = {};
+ uma_keg_t keg;
uma_slab_t slab;
uma_slab_t n;
u_int8_t flags;
u_int8_t *mem;
int i;
+ keg = zone->uz_keg;
+
/*
- * We don't want to take pages from staticly allocated zones at this
+ * We don't want to take pages from statically allocated zones at this
* time
*/
- if (zone->uz_flags & UMA_ZONE_NOFREE || zone->uz_freef == NULL)
+ if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL)
return;
ZONE_LOCK(zone);
#ifdef UMA_DEBUG
- printf("%s free items: %u\n", zone->uz_name, zone->uz_free);
+ printf("%s free items: %u\n", zone->uz_name, keg->uk_free);
#endif
bucket_cache_drain(zone);
- if (zone->uz_free == 0)
+ if (keg->uk_free == 0)
goto finished;
- slab = LIST_FIRST(&zone->uz_free_slab);
+ slab = LIST_FIRST(&keg->uk_free_slab);
while (slab) {
n = LIST_NEXT(slab, us_link);
@@ -669,11 +692,11 @@ zone_drain(uma_zone_t zone)
}
LIST_REMOVE(slab, us_link);
- zone->uz_pages -= zone->uz_ppera;
- zone->uz_free -= zone->uz_ipers;
+ keg->uk_pages -= keg->uk_ppera;
+ keg->uk_free -= keg->uk_ipers;
- if (zone->uz_flags & UMA_ZONE_HASH)
- UMA_HASH_REMOVE(&zone->uz_hash, slab, slab->us_data);
+ if (keg->uk_flags & UMA_ZONE_HASH)
+ UMA_HASH_REMOVE(&keg->uk_hash, slab, slab->us_data);
SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink);
@@ -684,34 +707,34 @@ finished:
while ((slab = SLIST_FIRST(&freeslabs)) != NULL) {
SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink);
- if (zone->uz_fini)
- for (i = 0; i < zone->uz_ipers; i++)
- zone->uz_fini(
- slab->us_data + (zone->uz_rsize * i),
- zone->uz_size);
+ if (keg->uk_fini)
+ for (i = 0; i < keg->uk_ipers; i++)
+ keg->uk_fini(
+ slab->us_data + (keg->uk_rsize * i),
+ keg->uk_size);
flags = slab->us_flags;
mem = slab->us_data;
- if (zone->uz_flags & UMA_ZONE_OFFPAGE)
- uma_zfree_internal(slabzone, slab, NULL, 0);
- if (zone->uz_flags & UMA_ZONE_MALLOC) {
+ if ((keg->uk_flags & UMA_ZONE_MALLOC) ||
+ (keg->uk_flags & UMA_ZONE_REFCNT)) {
vm_object_t obj;
if (flags & UMA_SLAB_KMEM)
obj = kmem_object;
else
obj = NULL;
- for (i = 0; i < zone->uz_ppera; i++)
+ for (i = 0; i < keg->uk_ppera; i++)
vsetobj((vm_offset_t)mem + (i * PAGE_SIZE),
obj);
}
+ if (keg->uk_flags & UMA_ZONE_OFFPAGE)
+ uma_zfree_internal(keg->uk_slabzone, slab, NULL, 0);
#ifdef UMA_DEBUG
printf("%s: Returning %d bytes.\n",
- zone->uz_name, UMA_SLAB_SIZE * zone->uz_ppera);
+ zone->uz_name, UMA_SLAB_SIZE * keg->uk_ppera);
#endif
- zone->uz_freef(mem, UMA_SLAB_SIZE * zone->uz_ppera, flags);
+ keg->uk_freef(mem, UMA_SLAB_SIZE * keg->uk_ppera, flags);
}
-
}
/*
@@ -728,20 +751,23 @@ finished:
static uma_slab_t
slab_zalloc(uma_zone_t zone, int wait)
{
- uma_slab_t slab; /* Starting slab */
+ uma_slabrefcnt_t slabref;
+ uma_slab_t slab;
+ uma_keg_t keg;
u_int8_t *mem;
u_int8_t flags;
int i;
slab = NULL;
+ keg = zone->uz_keg;
#ifdef UMA_DEBUG
printf("slab_zalloc: Allocating a new slab for %s\n", zone->uz_name);
#endif
ZONE_UNLOCK(zone);
- if (zone->uz_flags & UMA_ZONE_OFFPAGE) {
- slab = uma_zalloc_internal(slabzone, NULL, wait);
+ if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
+ slab = uma_zalloc_internal(keg->uk_slabzone, NULL, wait);
if (slab == NULL) {
ZONE_LOCK(zone);
return NULL;
@@ -755,12 +781,12 @@ slab_zalloc(uma_zone_t zone, int wait)
* Malloced items are zeroed in uma_zalloc.
*/
- if ((zone->uz_flags & UMA_ZONE_MALLOC) == 0)
+ if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
wait |= M_ZERO;
else
wait &= ~M_ZERO;
- mem = zone->uz_allocf(zone, zone->uz_ppera * UMA_SLAB_SIZE,
+ mem = keg->uk_allocf(zone, keg->uk_ppera * UMA_SLAB_SIZE,
&flags, wait);
if (mem == NULL) {
ZONE_LOCK(zone);
@@ -768,32 +794,39 @@ slab_zalloc(uma_zone_t zone, int wait)
}
/* Point the slab into the allocated memory */
- if (!(zone->uz_flags & UMA_ZONE_OFFPAGE))
- slab = (uma_slab_t )(mem + zone->uz_pgoff);
+ if (!(keg->uk_flags & UMA_ZONE_OFFPAGE))
+ slab = (uma_slab_t )(mem + keg->uk_pgoff);
- if (zone->uz_flags & UMA_ZONE_MALLOC)
- for (i = 0; i < zone->uz_ppera; i++)
+ if ((keg->uk_flags & UMA_ZONE_MALLOC) ||
+ (keg->uk_flags & UMA_ZONE_REFCNT))
+ for (i = 0; i < keg->uk_ppera; i++)
vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab);
- slab->us_zone = zone;
+ slab->us_keg = keg;
slab->us_data = mem;
- slab->us_freecount = zone->uz_ipers;
+ slab->us_freecount = keg->uk_ipers;
slab->us_firstfree = 0;
slab->us_flags = flags;
- for (i = 0; i < zone->uz_ipers; i++)
- slab->us_freelist[i] = i+1;
+ for (i = 0; i < keg->uk_ipers; i++)
+ slab->us_freelist[i].us_item = i+1;
- if (zone->uz_init)
- for (i = 0; i < zone->uz_ipers; i++)
- zone->uz_init(slab->us_data + (zone->uz_rsize * i),
- zone->uz_size);
+ if (keg->uk_flags & UMA_ZONE_REFCNT) {
+ slabref = (uma_slabrefcnt_t)slab;
+ for (i = 0; i < keg->uk_ipers; i++)
+ slabref->us_freelist[i].us_refcnt = 0;
+ }
+
+ if (keg->uk_init)
+ for (i = 0; i < keg->uk_ipers; i++)
+ keg->uk_init(slab->us_data + (keg->uk_rsize * i),
+ keg->uk_size);
ZONE_LOCK(zone);
- if (zone->uz_flags & UMA_ZONE_HASH)
- UMA_HASH_INSERT(&zone->uz_hash, slab, mem);
+ if (keg->uk_flags & UMA_ZONE_HASH)
+ UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
- zone->uz_pages += zone->uz_ppera;
- zone->uz_free += zone->uz_ipers;
+ keg->uk_pages += keg->uk_ppera;
+ keg->uk_free += keg->uk_ipers;
return (slab);
}
@@ -806,6 +839,10 @@ slab_zalloc(uma_zone_t zone, int wait)
static void *
startup_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait)
{
+ uma_keg_t keg;
+
+ keg = zone->uz_keg;
+
/*
* Check our small startup cache to see if it has pages remaining.
*/
@@ -827,11 +864,11 @@ startup_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait)
* Now that we've booted reset these users to their real allocator.
*/
#ifdef UMA_MD_SMALL_ALLOC
- zone->uz_allocf = uma_small_alloc;
+ keg->uk_allocf = uma_small_alloc;
#else
- zone->uz_allocf = page_alloc;
+ keg->uk_allocf = page_alloc;
#endif
- return zone->uz_allocf(zone, bytes, pflag, wait);
+ return keg->uk_allocf(zone, bytes, pflag, wait);
}
/*
@@ -877,7 +914,7 @@ obj_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
vm_page_t p;
int pages, startpages;
- object = zone->uz_obj;
+ object = zone->uz_keg->uk_obj;
retkva = 0;
/*
@@ -887,7 +924,7 @@ obj_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
p = TAILQ_LAST(&object->memq, pglist);
pages = p != NULL ? p->pindex + 1 : 0;
startpages = pages;
- zkva = zone->uz_kva + pages * PAGE_SIZE;
+ zkva = zone->uz_keg->uk_kva + pages * PAGE_SIZE;
for (; bytes > 0; bytes -= PAGE_SIZE) {
p = vm_page_alloc(object, pages,
VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED);
@@ -965,29 +1002,33 @@ zero_init(void *mem, int size)
static void
zone_small_init(uma_zone_t zone)
{
+ uma_keg_t keg;
int rsize;
int memused;
int ipers;
- rsize = zone->uz_size;
+ keg = zone->uz_keg;
+ KASSERT(keg != NULL, ("Keg is null in zone_small_init"));
+ rsize = keg->uk_size;
if (rsize < UMA_SMALLEST_UNIT)
rsize = UMA_SMALLEST_UNIT;
- if (rsize & zone->uz_align)
- rsize = (rsize & ~zone->uz_align) + (zone->uz_align + 1);
+ if (rsize & keg->uk_align)
+ rsize = (rsize & ~keg->uk_align) + (keg->uk_align + 1);
- zone->uz_rsize = rsize;
+ keg->uk_rsize = rsize;
rsize += 1; /* Account for the byte of linkage */
- zone->uz_ipers = (UMA_SLAB_SIZE - sizeof(struct uma_slab)) / rsize;
- zone->uz_ppera = 1;
+ keg->uk_ipers = (UMA_SLAB_SIZE - sizeof(struct uma_slab)) / rsize;
+ keg->uk_ppera = 1;
- KASSERT(zone->uz_ipers != 0, ("zone_small_init: ipers is 0, uh-oh!"));
- memused = zone->uz_ipers * zone->uz_rsize;
+ KASSERT(keg->uk_ipers != 0, ("zone_small_init: ipers is 0, uh-oh!"));
+ memused = keg->uk_ipers * keg->uk_rsize;
/* Can we do any better? */
- if ((UMA_SLAB_SIZE - memused) >= UMA_MAX_WASTE) {
+ if ((keg->uk_flags & UMA_ZONE_REFCNT) ||
+ ((UMA_SLAB_SIZE - memused) >= UMA_MAX_WASTE)) {
/*
* We can't do this if we're internal or if we've been
* asked to not go to the VM for buckets. If we do this we
@@ -995,15 +1036,16 @@ zone_small_init(uma_zone_t zone)
* do not want to do if we're UMA_ZFLAG_CACHEONLY as a
* result of UMA_ZONE_VM, which clearly forbids it.
*/
- if ((zone->uz_flags & UMA_ZFLAG_INTERNAL) ||
- (zone->uz_flags & UMA_ZFLAG_CACHEONLY))
+ if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) ||
+ (keg->uk_flags & UMA_ZFLAG_CACHEONLY))
return;
- ipers = UMA_SLAB_SIZE / zone->uz_rsize;
- if (ipers > zone->uz_ipers) {
- zone->uz_flags |= UMA_ZONE_OFFPAGE;
- if ((zone->uz_flags & UMA_ZONE_MALLOC) == 0)
- zone->uz_flags |= UMA_ZONE_HASH;
- zone->uz_ipers = ipers;
+ ipers = UMA_SLAB_SIZE / keg->uk_rsize;
+ if ((keg->uk_flags & UMA_ZONE_REFCNT) ||
+ (ipers > keg->uk_ipers)) {
+ keg->uk_flags |= UMA_ZONE_OFFPAGE;
+ if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
+ keg->uk_flags |= UMA_ZONE_HASH;
+ keg->uk_ipers = ipers;
}
}
}
@@ -1022,179 +1064,298 @@ zone_small_init(uma_zone_t zone)
static void
zone_large_init(uma_zone_t zone)
{
+ uma_keg_t keg;
int pages;
- KASSERT((zone->uz_flags & UMA_ZFLAG_CACHEONLY) == 0,
+ keg = zone->uz_keg;
+
+ KASSERT(keg != NULL, ("Keg is null in zone_large_init"));
+ KASSERT((keg->uk_flags & UMA_ZFLAG_CACHEONLY) == 0,
("zone_large_init: Cannot large-init a UMA_ZFLAG_CACHEONLY zone"));
- pages = zone->uz_size / UMA_SLAB_SIZE;
+ pages = keg->uk_size / UMA_SLAB_SIZE;
/* Account for remainder */
- if ((pages * UMA_SLAB_SIZE) < zone->uz_size)
+ if ((pages * UMA_SLAB_SIZE) < keg->uk_size)
pages++;
- zone->uz_ppera = pages;
- zone->uz_ipers = 1;
+ keg->uk_ppera = pages;
+ keg->uk_ipers = 1;
- zone->uz_flags |= UMA_ZONE_OFFPAGE;
- if ((zone->uz_flags & UMA_ZONE_MALLOC) == 0)
- zone->uz_flags |= UMA_ZONE_HASH;
+ keg->uk_flags |= UMA_ZONE_OFFPAGE;
+ if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
+ keg->uk_flags |= UMA_ZONE_HASH;
- zone->uz_rsize = zone->uz_size;
+ keg->uk_rsize = keg->uk_size;
}
/*
- * Zone header ctor. This initializes all fields, locks, etc. And inserts
- * the zone onto the global zone list.
+ * Keg header ctor. This initializes all fields, locks, etc. And inserts
+ * the keg onto the global keg list.
*
* Arguments/Returns follow uma_ctor specifications
- * udata Actually uma_zcreat_args
+ * udata Actually uma_kctor_args
*/
-
static void
-zone_ctor(void *mem, int size, void *udata)
+keg_ctor(void *mem, int size, void *udata)
{
- struct uma_zctor_args *arg = udata;
- uma_zone_t zone = mem;
- int privlc;
+ struct uma_kctor_args *arg = udata;
+ uma_keg_t keg = mem;
+ uma_zone_t zone;
- bzero(zone, size);
- zone->uz_name = arg->name;
- zone->uz_size = arg->size;
- zone->uz_ctor = arg->ctor;
- zone->uz_dtor = arg->dtor;
- zone->uz_init = arg->uminit;
- zone->uz_fini = arg->fini;
- zone->uz_align = arg->align;
- zone->uz_free = 0;
- zone->uz_pages = 0;
- zone->uz_flags = arg->flags;
- zone->uz_allocf = page_alloc;
- zone->uz_freef = page_free;
+ bzero(keg, size);
+ keg->uk_size = arg->size;
+ keg->uk_init = arg->uminit;
+ keg->uk_fini = arg->fini;
+ keg->uk_align = arg->align;
+ keg->uk_free = 0;
+ keg->uk_pages = 0;
+ keg->uk_flags = arg->flags;
+ keg->uk_allocf = page_alloc;
+ keg->uk_freef = page_free;
+ keg->uk_recurse = 0;
+ keg->uk_slabzone = NULL;
- if (arg->flags & UMA_ZONE_ZINIT)
- zone->uz_init = zero_init;
+ /*
+ * The master zone is passed to us at keg-creation time.
+ */
+ zone = arg->zone;
+ zone->uz_keg = keg;
if (arg->flags & UMA_ZONE_VM)
- zone->uz_flags |= UMA_ZFLAG_CACHEONLY;
+ keg->uk_flags |= UMA_ZFLAG_CACHEONLY;
+
+ if (arg->flags & UMA_ZONE_ZINIT)
+ keg->uk_init = zero_init;
/*
- * XXX:
- * The +1 byte added to uz_size is to account for the byte of
+ * The +1 byte added to uk_size is to account for the byte of
* linkage that is added to the size in zone_small_init(). If
* we don't account for this here then we may end up in
* zone_small_init() with a calculated 'ipers' of 0.
*/
- if ((zone->uz_size+1) > (UMA_SLAB_SIZE - sizeof(struct uma_slab)))
+ if ((keg->uk_size+1) > (UMA_SLAB_SIZE - sizeof(struct uma_slab)))
zone_large_init(zone);
else
zone_small_init(zone);
+
+ if (keg->uk_flags & UMA_ZONE_REFCNT)
+ keg->uk_slabzone = slabrefzone;
+ else if (keg->uk_flags & UMA_ZONE_OFFPAGE)
+ keg->uk_slabzone = slabzone;
+
/*
* If we haven't booted yet we need allocations to go through the
* startup cache until the vm is ready.
*/
- if (zone->uz_ppera == 1) {
+ if (keg->uk_ppera == 1) {
#ifdef UMA_MD_SMALL_ALLOC
- zone->uz_allocf = uma_small_alloc;
- zone->uz_freef = uma_small_free;
+ keg->uk_allocf = uma_small_alloc;
+ keg->uk_freef = uma_small_free;
#endif
if (booted == 0)
- zone->uz_allocf = startup_alloc;
+ keg->uk_allocf = startup_alloc;
}
+
+ /*
+ * Initialize keg's lock (shared among zones) through
+ * Master zone
+ */
+ zone->uz_lock = &keg->uk_lock;
if (arg->flags & UMA_ZONE_MTXCLASS)
- privlc = 1;
+ ZONE_LOCK_INIT(zone, 1);
else
- privlc = 0;
+ ZONE_LOCK_INIT(zone, 0);
/*
* If we're putting the slab header in the actual page we need to
* figure out where in each page it goes. This calculates a right
* justified offset into the memory on an ALIGN_PTR boundary.
*/
- if (!(zone->uz_flags & UMA_ZONE_OFFPAGE)) {
+ if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) {
int totsize;
/* Size of the slab struct and free list */
- totsize = sizeof(struct uma_slab) + zone->uz_ipers;
+ totsize = sizeof(struct uma_slab) + keg->uk_ipers;
if (totsize & UMA_ALIGN_PTR)
totsize = (totsize & ~UMA_ALIGN_PTR) +
(UMA_ALIGN_PTR + 1);
- zone->uz_pgoff = UMA_SLAB_SIZE - totsize;
- totsize = zone->uz_pgoff + sizeof(struct uma_slab)
- + zone->uz_ipers;
+ keg->uk_pgoff = UMA_SLAB_SIZE - totsize;
+ totsize = keg->uk_pgoff + sizeof(struct uma_slab)
+ + keg->uk_ipers;
/* I don't think it's possible, but I'll make sure anyway */
if (totsize > UMA_SLAB_SIZE) {
printf("zone %s ipers %d rsize %d size %d\n",
- zone->uz_name, zone->uz_ipers, zone->uz_rsize,
- zone->uz_size);
+ zone->uz_name, keg->uk_ipers, keg->uk_rsize,
+ keg->uk_size);
panic("UMA slab won't fit.\n");
}
}
- if (zone->uz_flags & UMA_ZONE_HASH)
- hash_alloc(&zone->uz_hash);
+ if (keg->uk_flags & UMA_ZONE_HASH)
+ hash_alloc(&keg->uk_hash);
#ifdef UMA_DEBUG
printf("%s(%p) size = %d ipers = %d ppera = %d pgoff = %d\n",
zone->uz_name, zone,
- zone->uz_size, zone->uz_ipers,
- zone->uz_ppera, zone->uz_pgoff);
+ keg->uk_size, keg->uk_ipers,
+ keg->uk_ppera, keg->uk_pgoff);
#endif
- ZONE_LOCK_INIT(zone, privlc);
+
+ LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link);
mtx_lock(&uma_mtx);
- LIST_INSERT_HEAD(&uma_zones, zone, uz_link);
+ LIST_INSERT_HEAD(&uma_kegs, keg, uk_link);
mtx_unlock(&uma_mtx);
+}
+
+/*
+ * Zone header ctor. This initializes all fields, locks, etc.
+ *
+ * Arguments/Returns follow uma_ctor specifications
+ * udata Actually uma_zctor_args
+ */
+
+static void
+zone_ctor(void *mem, int size, void *udata)
+{
+ struct uma_zctor_args *arg = udata;
+ uma_zone_t zone = mem;
+ uma_zone_t z;
+ uma_keg_t keg;
+
+ bzero(zone, size);
+ zone->uz_name = arg->name;
+ zone->uz_ctor = arg->ctor;
+ zone->uz_dtor = arg->dtor;
+ zone->uz_init = NULL;
+ zone->uz_fini = NULL;
+ zone->uz_allocs = 0;
+ zone->uz_fills = zone->uz_count = 0;
+
+ if (arg->flags & UMA_ZONE_SECONDARY) {
+ KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
+ keg = arg->keg;
+ zone->uz_keg = keg;
+ zone->uz_init = arg->uminit;
+ zone->uz_fini = arg->fini;
+ zone->uz_lock = &keg->uk_lock;
+ mtx_lock(&uma_mtx);
+ ZONE_LOCK(zone);
+ keg->uk_flags |= UMA_ZONE_SECONDARY;
+ LIST_FOREACH(z, &keg->uk_zones, uz_link) {
+ if (LIST_NEXT(z, uz_link) == NULL) {
+ LIST_INSERT_AFTER(z, zone, uz_link);
+ break;
+ }
+ }
+ ZONE_UNLOCK(zone);
+ mtx_unlock(&uma_mtx);
+ } else if (arg->keg == NULL) {
+ uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
+ arg->align, arg->flags);
+ } else {
+ struct uma_kctor_args karg;
+
+ /* We should only be here from uma_startup() */
+ karg.size = arg->size;
+ karg.uminit = arg->uminit;
+ karg.fini = arg->fini;
+ karg.align = arg->align;
+ karg.flags = arg->flags;
+ karg.zone = zone;
+ keg_ctor(arg->keg, sizeof(struct uma_keg), &karg);
+ }
+ keg = zone->uz_keg;
+ zone->uz_lock = &keg->uk_lock;
/*
* Some internal zones don't have room allocated for the per cpu
* caches. If we're internal, bail out here.
*/
- if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
+ if (keg->uk_flags & UMA_ZFLAG_INTERNAL) {
+ KASSERT((keg->uk_flags & UMA_ZONE_SECONDARY) == 0,
+ ("Secondary zone requested UMA_ZFLAG_INTERNAL"));
return;
+ }
- if (zone->uz_ipers <= BUCKET_MAX)
- zone->uz_count = zone->uz_ipers;
+ if (keg->uk_flags & UMA_ZONE_MAXBUCKET)
+ zone->uz_count = BUCKET_MAX;
+ else if (keg->uk_ipers <= BUCKET_MAX)
+ zone->uz_count = keg->uk_ipers;
else
zone->uz_count = BUCKET_MAX;
}
/*
- * Zone header dtor. This frees all data, destroys locks, frees the hash table
- * and removes the zone from the global list.
+ * Keg header dtor. This frees all data, destroys locks, frees the hash
+ * table and removes the keg from the global list.
*
* Arguments/Returns follow uma_dtor specifications
* udata unused
*/
+static void
+keg_dtor(void *arg, int size, void *udata)
+{
+ uma_keg_t keg;
+ keg = (uma_keg_t)arg;
+ mtx_lock(&keg->uk_lock);
+ if (keg->uk_free != 0) {
+		printf("Freed UMA keg was not empty (%d items). "
+		    "Lost %d pages of memory.\n",
+ keg->uk_free, keg->uk_pages);
+ }
+ mtx_unlock(&keg->uk_lock);
+
+ if (keg->uk_flags & UMA_ZONE_HASH)
+ hash_free(&keg->uk_hash);
+
+ mtx_destroy(&keg->uk_lock);
+}
+
+/*
+ * Zone header dtor.
+ *
+ * Arguments/Returns follow uma_dtor specifications
+ * udata unused
+ */
static void
zone_dtor(void *arg, int size, void *udata)
{
uma_zone_t zone;
+ uma_keg_t keg;
zone = (uma_zone_t)arg;
+ keg = zone->uz_keg;
- if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
+ if (!(keg->uk_flags & UMA_ZFLAG_INTERNAL))
cache_drain(zone);
+
mtx_lock(&uma_mtx);
- LIST_REMOVE(zone, uz_link);
zone_drain(zone);
- mtx_unlock(&uma_mtx);
-
- ZONE_LOCK(zone);
- if (zone->uz_free != 0) {
- printf("Zone %s was not empty (%d items). "
- " Lost %d pages of memory.\n",
- zone->uz_name, zone->uz_free, zone->uz_pages);
- uma_print_zone(zone);
+ if (keg->uk_flags & UMA_ZONE_SECONDARY) {
+ LIST_REMOVE(zone, uz_link);
+ /*
+ * XXX there are some races here where
+ * the zone can be drained but zone lock
+ * released and then refilled before we
+		 * remove it... we don't care for now
+ */
+ ZONE_LOCK(zone);
+ if (LIST_EMPTY(&keg->uk_zones))
+ keg->uk_flags &= ~UMA_ZONE_SECONDARY;
+ ZONE_UNLOCK(zone);
+ mtx_unlock(&uma_mtx);
+ } else {
+ LIST_REMOVE(keg, uk_link);
+ LIST_REMOVE(zone, uz_link);
+ mtx_unlock(&uma_mtx);
+ uma_zfree_internal(kegs, keg, NULL, 0);
}
-
- ZONE_UNLOCK(zone);
- if (zone->uz_flags & UMA_ZONE_HASH)
- hash_free(&zone->uz_hash);
-
- ZONE_LOCK_FINI(zone);
+ zone->uz_keg = NULL;
}
+
/*
* Traverses every zone in the system and calls a callback
*
@@ -1208,11 +1369,14 @@ zone_dtor(void *arg, int size, void *udata)
static void
zone_foreach(void (*zfunc)(uma_zone_t))
{
+ uma_keg_t keg;
uma_zone_t zone;
mtx_lock(&uma_mtx);
- LIST_FOREACH(zone, &uma_zones, uz_link)
- zfunc(zone);
+ LIST_FOREACH(keg, &uma_kegs, uk_link) {
+ LIST_FOREACH(zone, &keg->uk_zones, uz_link)
+ zfunc(zone);
+ }
mtx_unlock(&uma_mtx);
}
@@ -1227,25 +1391,23 @@ uma_startup(void *bootmem)
int i;
#ifdef UMA_DEBUG
- printf("Creating uma zone headers zone.\n");
+ printf("Creating uma keg headers zone and keg.\n");
#endif
mtx_init(&uma_mtx, "UMA lock", NULL, MTX_DEF);
- /* "manually" Create the initial zone */
- args.name = "UMA Zones";
- args.size = sizeof(struct uma_zone) +
- (sizeof(struct uma_cache) * (mp_maxid + 1));
- args.ctor = zone_ctor;
- args.dtor = zone_dtor;
+
+ /* "manually" create the initial zone */
+ args.name = "UMA Kegs";
+ args.size = sizeof(struct uma_keg);
+ args.ctor = keg_ctor;
+ args.dtor = keg_dtor;
args.uminit = zero_init;
args.fini = NULL;
+ args.keg = &masterkeg;
args.align = 32 - 1;
args.flags = UMA_ZFLAG_INTERNAL;
/* The initial zone has no Per cpu queues so it's smaller */
- zone_ctor(zones, sizeof(struct uma_zone), &args);
+ zone_ctor(kegs, sizeof(struct uma_zone), &args);
- /* Initialize the pcpu cache lock set once and for all */
- for (i = 0; i <= mp_maxid; i++)
- CPU_LOCK_INIT(i);
#ifdef UMA_DEBUG
printf("Filling boot free list.\n");
#endif
@@ -1258,7 +1420,30 @@ uma_startup(void *bootmem)
}
#ifdef UMA_DEBUG
- printf("Creating slab zone.\n");
+ printf("Creating uma zone headers zone and keg.\n");
+#endif
+ args.name = "UMA Zones";
+ args.size = sizeof(struct uma_zone) +
+ (sizeof(struct uma_cache) * (mp_maxid + 1));
+ args.ctor = zone_ctor;
+ args.dtor = zone_dtor;
+ args.uminit = zero_init;
+ args.fini = NULL;
+ args.keg = NULL;
+ args.align = 32 - 1;
+ args.flags = UMA_ZFLAG_INTERNAL;
+ /* The initial zone has no Per cpu queues so it's smaller */
+ zone_ctor(zones, sizeof(struct uma_zone), &args);
+
+#ifdef UMA_DEBUG
+ printf("Initializing pcpu cache locks.\n");
+#endif
+ /* Initialize the pcpu cache lock set once and for all */
+ for (i = 0; i <= mp_maxid; i++)
+ CPU_LOCK_INIT(i);
+
+#ifdef UMA_DEBUG
+ printf("Creating slab and hash zones.\n");
#endif
/*
@@ -1276,6 +1461,20 @@ uma_startup(void *bootmem)
NULL, NULL, NULL, NULL,
UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
+ /*
+ * We also create a zone for the bigger slabs with reference
+	 * counts in them, to accommodate UMA_ZONE_REFCNT zones.
+ */
+ slabsize = UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt);
+ slabsize /= UMA_MAX_WASTE;
+ slabsize++;
+ slabsize += 4 * slabsize;
+ slabsize += sizeof(struct uma_slab_refcnt);
+ slabrefzone = uma_zcreate("UMA RCntSlabs",
+ slabsize,
+ NULL, NULL, NULL, NULL,
+ UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
+
hashzone = uma_zcreate("UMA Hash",
sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
NULL, NULL, NULL, NULL,
@@ -1321,6 +1520,21 @@ uma_startup3(void)
#endif
}
+static void
+uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
+ int align, u_int16_t flags)
+{
+ struct uma_kctor_args args;
+
+ args.size = size;
+ args.uminit = uminit;
+ args.fini = fini;
+ args.align = align;
+ args.flags = flags;
+ args.zone = zone;
+ zone = uma_zalloc_internal(kegs, &args, M_WAITOK);
+}
+
/* See uma.h */
uma_zone_t
uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
@@ -1338,6 +1552,27 @@ uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
args.fini = fini;
args.align = align;
args.flags = flags;
+ args.keg = NULL;
+
+ return (uma_zalloc_internal(zones, &args, M_WAITOK));
+}
+
+/* See uma.h */
+uma_zone_t
+uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor,
+ uma_init zinit, uma_fini zfini, uma_zone_t master)
+{
+ struct uma_zctor_args args;
+
+ args.name = name;
+ args.size = master->uz_keg->uk_size;
+ args.ctor = ctor;
+ args.dtor = dtor;
+ args.uminit = zinit;
+ args.fini = zfini;
+ args.align = master->uz_keg->uk_align;
+ args.flags = master->uz_keg->uk_flags | UMA_ZONE_SECONDARY;
+ args.keg = master->uz_keg;
return (uma_zalloc_internal(zones, &args, M_WAITOK));
}
@@ -1357,35 +1592,25 @@ uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
uma_cache_t cache;
uma_bucket_t bucket;
int cpu;
+ int badness = 1;
/* This is the fast path allocation */
#ifdef UMA_DEBUG_ALLOC_1
printf("Allocating one item from %s(%p)\n", zone->uz_name, zone);
#endif
-#ifdef INVARIANTS
- /*
- * To make sure that WAITOK or NOWAIT is set, but not more than
- * one, and check against the API botches that are common.
- * The uma code implies M_WAITOK if M_NOWAIT is not set, so
- * we default to waiting if none of the flags is set.
- */
- cpu = flags & (M_WAITOK | M_NOWAIT | M_DONTWAIT | M_TRYWAIT);
- if (cpu != M_NOWAIT && cpu != M_WAITOK) {
- static struct timeval lasterr;
- static int curerr, once;
- if (once == 0 && ppsratecheck(&lasterr, &curerr, 1)) {
- printf("Bad uma_zalloc flags: %x\n", cpu);
- backtrace();
- once++;
- }
- }
-#endif
if (!(flags & M_NOWAIT)) {
KASSERT(curthread->td_intr_nesting_level == 0,
("malloc(M_WAITOK) in interrupt context"));
- WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
- "malloc() of \"%s\"", zone->uz_name);
+#ifdef WITNESS
+ badness = WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
+ "malloc(M_WAITOK) of \"%s\", forcing M_NOWAIT",
+ zone->uz_name);
+#endif
+ if (badness) {
+ flags &= ~M_WAITOK;
+ flags |= M_NOWAIT;
+ }
}
zalloc_restart:
@@ -1413,9 +1638,9 @@ zalloc_start:
#endif
CPU_UNLOCK(cpu);
if (zone->uz_ctor)
- zone->uz_ctor(item, zone->uz_size, udata);
+			zone->uz_ctor(item, zone->uz_keg->uk_size, udata);
if (flags & M_ZERO)
- bzero(item, zone->uz_size);
+ bzero(item, zone->uz_keg->uk_size);
return (item);
} else if (cache->uc_freebucket) {
/*
@@ -1465,6 +1690,7 @@ zalloc_start:
/* Bump up our uz_count so we get here less */
if (zone->uz_count < BUCKET_MAX)
zone->uz_count++;
+
/*
 	 * Now let's just fill a bucket and put it on the free list. If that
 	 * works we'll restart the allocation from the beginning.
@@ -1488,6 +1714,9 @@ static uma_slab_t
uma_zone_slab(uma_zone_t zone, int flags)
{
uma_slab_t slab;
+ uma_keg_t keg;
+
+ keg = zone->uz_keg;
/*
* This is to prevent us from recursively trying to allocate
@@ -1498,7 +1727,7 @@ uma_zone_slab(uma_zone_t zone, int flags)
* things happen. So instead we return a NULL bucket, and make
* the code that allocates buckets smart enough to deal with it
*/
- if (zone->uz_flags & UMA_ZFLAG_INTERNAL && zone->uz_recurse != 0)
+ if (keg->uk_flags & UMA_ZFLAG_INTERNAL && keg->uk_recurse != 0)
return (NULL);
slab = NULL;
@@ -1509,14 +1738,14 @@ uma_zone_slab(uma_zone_t zone, int flags)
* used over those that are totally full. This helps to reduce
* fragmentation.
*/
- if (zone->uz_free != 0) {
- if (!LIST_EMPTY(&zone->uz_part_slab)) {
- slab = LIST_FIRST(&zone->uz_part_slab);
+ if (keg->uk_free != 0) {
+ if (!LIST_EMPTY(&keg->uk_part_slab)) {
+ slab = LIST_FIRST(&keg->uk_part_slab);
} else {
- slab = LIST_FIRST(&zone->uz_free_slab);
+ slab = LIST_FIRST(&keg->uk_free_slab);
LIST_REMOVE(slab, us_link);
- LIST_INSERT_HEAD(&zone->uz_part_slab, slab,
- us_link);
+ LIST_INSERT_HEAD(&keg->uk_part_slab, slab,
+ us_link);
}
return (slab);
}
@@ -1527,27 +1756,28 @@ uma_zone_slab(uma_zone_t zone, int flags)
if (flags & M_NOVM)
break;
- if (zone->uz_maxpages &&
- zone->uz_pages >= zone->uz_maxpages) {
- zone->uz_flags |= UMA_ZFLAG_FULL;
+ if (keg->uk_maxpages &&
+ keg->uk_pages >= keg->uk_maxpages) {
+ keg->uk_flags |= UMA_ZFLAG_FULL;
if (flags & M_NOWAIT)
break;
else
- msleep(zone, &zone->uz_lock, PVM,
+ msleep(keg, &keg->uk_lock, PVM,
"zonelimit", 0);
continue;
}
- zone->uz_recurse++;
+ keg->uk_recurse++;
slab = slab_zalloc(zone, flags);
- zone->uz_recurse--;
+ keg->uk_recurse--;
+
/*
* If we got a slab here it's safe to mark it partially used
* and return. We assume that the caller is going to remove
* at least one item.
*/
if (slab) {
- LIST_INSERT_HEAD(&zone->uz_part_slab, slab, us_link);
+ LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);
return (slab);
}
/*
@@ -1564,22 +1794,25 @@ uma_zone_slab(uma_zone_t zone, int flags)
static void *
uma_slab_alloc(uma_zone_t zone, uma_slab_t slab)
{
+ uma_keg_t keg;
void *item;
u_int8_t freei;
+ keg = zone->uz_keg;
+
freei = slab->us_firstfree;
- slab->us_firstfree = slab->us_freelist[freei];
- item = slab->us_data + (zone->uz_rsize * freei);
+ slab->us_firstfree = slab->us_freelist[freei].us_item;
+ item = slab->us_data + (keg->uk_rsize * freei);
slab->us_freecount--;
- zone->uz_free--;
+ keg->uk_free--;
#ifdef INVARIANTS
uma_dbg_alloc(zone, slab, item);
#endif
/* Move this slab to the full list */
if (slab->us_freecount == 0) {
LIST_REMOVE(slab, us_link);
- LIST_INSERT_HEAD(&zone->uz_full_slab, slab, us_link);
+ LIST_INSERT_HEAD(&keg->uk_full_slab, slab, us_link);
}
return (item);
@@ -1590,6 +1823,7 @@ uma_zalloc_bucket(uma_zone_t zone, int flags)
{
uma_bucket_t bucket;
uma_slab_t slab;
+ int16_t saved;
int max;
/*
@@ -1603,7 +1837,7 @@ uma_zalloc_bucket(uma_zone_t zone, int flags)
int bflags;
bflags = (flags & ~M_ZERO);
- if (zone->uz_flags & UMA_ZFLAG_CACHEONLY)
+ if (zone->uz_keg->uk_flags & UMA_ZFLAG_CACHEONLY)
bflags |= M_NOVM;
ZONE_UNLOCK(zone);
@@ -1628,18 +1862,36 @@ uma_zalloc_bucket(uma_zone_t zone, int flags)
max = MIN(bucket->ub_entries, zone->uz_count);
/* Try to keep the buckets totally full */
+ saved = bucket->ub_cnt;
while (bucket->ub_cnt < max &&
(slab = uma_zone_slab(zone, flags)) != NULL) {
while (slab->us_freecount && bucket->ub_cnt < max) {
bucket->ub_bucket[bucket->ub_cnt++] =
uma_slab_alloc(zone, slab);
}
+
/* Don't block on the next fill */
flags |= M_NOWAIT;
}
- zone->uz_fills--;
+ /*
+ * We unlock here because we need to call the zone's init.
+ * It should be safe to unlock because the slab dealt with
+ * above is already on the appropriate list within the keg
+ * and the bucket we filled is not yet on any list, so we
+ * own it.
+ */
+ if (zone->uz_init != NULL) {
+ int i;
+
+ ZONE_UNLOCK(zone);
+ for (i = saved; i < bucket->ub_cnt; i++)
+ zone->uz_init(bucket->ub_bucket[i],
+ zone->uz_keg->uk_size);
+ ZONE_LOCK(zone);
+ }
+ zone->uz_fills--;
if (bucket->ub_cnt != 0) {
LIST_INSERT_HEAD(&zone->uz_full_bucket,
bucket, ub_link);
@@ -1668,10 +1920,12 @@ done:
static void *
uma_zalloc_internal(uma_zone_t zone, void *udata, int flags)
{
+ uma_keg_t keg;
uma_slab_t slab;
void *item;
item = NULL;
+ keg = zone->uz_keg;
#ifdef UMA_DEBUG_ALLOC
printf("INTERNAL: Allocating one item from %s(%p)\n", zone->uz_name, zone);
@@ -1688,10 +1942,18 @@ uma_zalloc_internal(uma_zone_t zone, void *udata, int flags)
ZONE_UNLOCK(zone);
+ /*
+ * We have to call both the zone's init (not the keg's init)
+ * and the zone's ctor. This is because the item is going from
+ * a keg slab directly to the user, and the user is expecting it
+ * to be both zone-init'd as well as zone-ctor'd.
+ */
+ if (zone->uz_init != NULL)
+ zone->uz_init(item, keg->uk_size);
if (zone->uz_ctor != NULL)
- zone->uz_ctor(item, zone->uz_size, udata);
+ zone->uz_ctor(item, keg->uk_size, udata);
if (flags & M_ZERO)
- bzero(item, zone->uz_size);
+ bzero(item, keg->uk_size);
return (item);
}
@@ -1700,6 +1962,7 @@ uma_zalloc_internal(uma_zone_t zone, void *udata, int flags)
void
uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
{
+ uma_keg_t keg;
uma_cache_t cache;
uma_bucket_t bucket;
int bflags;
@@ -1708,6 +1971,8 @@ uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
/* This is the fast path free */
skip = 0;
+ keg = zone->uz_keg;
+
#ifdef UMA_DEBUG_ALLOC_1
printf("Freeing item %p to %s(%p)\n", item, zone->uz_name, zone);
#endif
@@ -1716,11 +1981,11 @@ uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
* a little longer for the limits to be reset.
*/
- if (zone->uz_flags & UMA_ZFLAG_FULL)
+ if (keg->uk_flags & UMA_ZFLAG_FULL)
goto zfree_internal;
if (zone->uz_dtor) {
- zone->uz_dtor(item, zone->uz_size, udata);
+ zone->uz_dtor(item, keg->uk_size, udata);
skip = 1;
}
@@ -1745,7 +2010,7 @@ zfree_start:
bucket->ub_cnt++;
#ifdef INVARIANTS
ZONE_LOCK(zone);
- if (zone->uz_flags & UMA_ZONE_MALLOC)
+ if (keg->uk_flags & UMA_ZONE_MALLOC)
uma_dbg_free(zone, udata, item);
else
uma_dbg_free(zone, NULL, item);
@@ -1810,7 +2075,7 @@ zfree_start:
#endif
bflags = M_NOWAIT;
- if (zone->uz_flags & UMA_ZFLAG_CACHEONLY)
+ if (keg->uk_flags & UMA_ZFLAG_CACHEONLY)
bflags |= M_NOVM;
bucket = bucket_alloc(zone->uz_count, bflags);
if (bucket) {
@@ -1836,7 +2101,7 @@ zfree_internal:
*/
if (skip) {
ZONE_LOCK(zone);
- if (zone->uz_flags & UMA_ZONE_MALLOC)
+ if (keg->uk_flags & UMA_ZONE_MALLOC)
uma_dbg_free(zone, udata, item);
else
uma_dbg_free(zone, NULL, item);
@@ -1846,7 +2111,6 @@ zfree_internal:
uma_zfree_internal(zone, item, udata, skip);
return;
-
}
/*
@@ -1862,20 +2126,25 @@ static void
uma_zfree_internal(uma_zone_t zone, void *item, void *udata, int skip)
{
uma_slab_t slab;
+ uma_keg_t keg;
u_int8_t *mem;
u_int8_t freei;
+ keg = zone->uz_keg;
+
if (!skip && zone->uz_dtor)
- zone->uz_dtor(item, zone->uz_size, udata);
+ zone->uz_dtor(item, keg->uk_size, udata);
+ if (zone->uz_fini)
+ zone->uz_fini(item, keg->uk_size);
ZONE_LOCK(zone);
- if (!(zone->uz_flags & UMA_ZONE_MALLOC)) {
+ if (!(keg->uk_flags & UMA_ZONE_MALLOC)) {
mem = (u_int8_t *)((unsigned long)item & (~UMA_SLAB_MASK));
- if (zone->uz_flags & UMA_ZONE_HASH)
- slab = hash_sfind(&zone->uz_hash, mem);
+ if (keg->uk_flags & UMA_ZONE_HASH)
+ slab = hash_sfind(&keg->uk_hash, mem);
else {
- mem += zone->uz_pgoff;
+ mem += keg->uk_pgoff;
slab = (uma_slab_t)mem;
}
} else {
@@ -1883,36 +2152,36 @@ uma_zfree_internal(uma_zone_t zone, void *item, void *udata, int skip)
}
/* Do we need to remove from any lists? */
- if (slab->us_freecount+1 == zone->uz_ipers) {
+ if (slab->us_freecount+1 == keg->uk_ipers) {
LIST_REMOVE(slab, us_link);
- LIST_INSERT_HEAD(&zone->uz_free_slab, slab, us_link);
+ LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link);
} else if (slab->us_freecount == 0) {
LIST_REMOVE(slab, us_link);
- LIST_INSERT_HEAD(&zone->uz_part_slab, slab, us_link);
+ LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);
}
/* Slab management stuff */
freei = ((unsigned long)item - (unsigned long)slab->us_data)
- / zone->uz_rsize;
+ / keg->uk_rsize;
#ifdef INVARIANTS
if (!skip)
uma_dbg_free(zone, slab, item);
#endif
- slab->us_freelist[freei] = slab->us_firstfree;
+ slab->us_freelist[freei].us_item = slab->us_firstfree;
slab->us_firstfree = freei;
slab->us_freecount++;
/* Zone statistics */
- zone->uz_free++;
+ keg->uk_free++;
- if (zone->uz_flags & UMA_ZFLAG_FULL) {
- if (zone->uz_pages < zone->uz_maxpages)
- zone->uz_flags &= ~UMA_ZFLAG_FULL;
+ if (keg->uk_flags & UMA_ZFLAG_FULL) {
+ if (keg->uk_pages < keg->uk_maxpages)
+ keg->uk_flags &= ~UMA_ZFLAG_FULL;
/* We can handle one more allocation */
- wakeup_one(zone);
+ wakeup_one(keg);
}
ZONE_UNLOCK(zone);
@@ -1922,24 +2191,71 @@ uma_zfree_internal(uma_zone_t zone, void *item, void *udata, int skip)
void
uma_zone_set_max(uma_zone_t zone, int nitems)
{
+ uma_keg_t keg;
+
+ keg = zone->uz_keg;
ZONE_LOCK(zone);
- if (zone->uz_ppera > 1)
- zone->uz_maxpages = nitems * zone->uz_ppera;
+ if (keg->uk_ppera > 1)
+ keg->uk_maxpages = nitems * keg->uk_ppera;
else
- zone->uz_maxpages = nitems / zone->uz_ipers;
+ keg->uk_maxpages = nitems / keg->uk_ipers;
- if (zone->uz_maxpages * zone->uz_ipers < nitems)
- zone->uz_maxpages++;
+ if (keg->uk_maxpages * keg->uk_ipers < nitems)
+ keg->uk_maxpages++;
ZONE_UNLOCK(zone);
}
/* See uma.h */
void
+uma_zone_set_init(uma_zone_t zone, uma_init uminit)
+{
+ ZONE_LOCK(zone);
+ KASSERT(zone->uz_keg->uk_pages == 0,
+ ("uma_zone_set_init on non-empty keg"));
+ zone->uz_keg->uk_init = uminit;
+ ZONE_UNLOCK(zone);
+}
+
+/* See uma.h */
+void
+uma_zone_set_fini(uma_zone_t zone, uma_fini fini)
+{
+ ZONE_LOCK(zone);
+ KASSERT(zone->uz_keg->uk_pages == 0,
+ ("uma_zone_set_fini on non-empty keg"));
+ zone->uz_keg->uk_fini = fini;
+ ZONE_UNLOCK(zone);
+}
+
+/* See uma.h */
+void
+uma_zone_set_zinit(uma_zone_t zone, uma_init zinit)
+{
+ ZONE_LOCK(zone);
+ KASSERT(zone->uz_keg->uk_pages == 0,
+ ("uma_zone_set_zinit on non-empty keg"));
+ zone->uz_init = zinit;
+ ZONE_UNLOCK(zone);
+}
+
+/* See uma.h */
+void
+uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini)
+{
+ ZONE_LOCK(zone);
+ KASSERT(zone->uz_keg->uk_pages == 0,
+ ("uma_zone_set_zfini on non-empty keg"));
+ zone->uz_fini = zfini;
+ ZONE_UNLOCK(zone);
+}
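
The four setters above differ only in where the hook lands: uma_zone_set_init() and uma_zone_set_fini() store keg-level routines in the shared Keg, so every Zone backed by that Keg inherits them, while uma_zone_set_zinit() and uma_zone_set_zfini() store zone-level routines private to the Zone they are called on. As the assertions show, all four may only be used before the Keg has allocated any pages. A rough, hypothetical sketch (the foo_* names are invented here, and the callback signature follows the uma_init usage elsewhere in this patch):

#include <sys/param.h>
#include <sys/systm.h>
#include <vm/uma.h>

static void
foo_keg_init(void *mem, int size)
{

	/* Lives in the Keg, so every Zone backed by it sees this. */
	bzero(mem, size);
}

static void
foo_zone_init(void *mem, int size)
{

	/* Lives in this Zone only; other Zones on the Keg never run it. */
}

static void
foo_tune_zone(uma_zone_t zone_foo)
{

	uma_zone_set_init(zone_foo, foo_keg_init);	/* keg-level */
	uma_zone_set_zinit(zone_foo, foo_zone_init);	/* zone-level */
}
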
+
+/* See uma.h */
+void
uma_zone_set_freef(uma_zone_t zone, uma_free freef)
{
ZONE_LOCK(zone);
- zone->uz_freef = freef;
+ zone->uz_keg->uk_freef = freef;
ZONE_UNLOCK(zone);
}
@@ -1948,8 +2264,8 @@ void
uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
{
ZONE_LOCK(zone);
- zone->uz_flags |= UMA_ZFLAG_PRIVALLOC;
- zone->uz_allocf = allocf;
+ zone->uz_keg->uk_flags |= UMA_ZFLAG_PRIVALLOC;
+ zone->uz_keg->uk_allocf = allocf;
ZONE_UNLOCK(zone);
}
@@ -1957,12 +2273,14 @@ uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
int
uma_zone_set_obj(uma_zone_t zone, struct vm_object *obj, int count)
{
- int pages;
+ uma_keg_t keg;
vm_offset_t kva;
+ int pages;
- pages = count / zone->uz_ipers;
+ keg = zone->uz_keg;
+ pages = count / keg->uk_ipers;
- if (pages * zone->uz_ipers < count)
+ if (pages * keg->uk_ipers < count)
pages++;
kva = kmem_alloc_pageable(kernel_map, pages * UMA_SLAB_SIZE);
@@ -1978,11 +2296,11 @@ uma_zone_set_obj(uma_zone_t zone, struct vm_object *obj, int count)
pages, obj);
}
ZONE_LOCK(zone);
- zone->uz_kva = kva;
- zone->uz_obj = obj;
- zone->uz_maxpages = pages;
- zone->uz_allocf = obj_alloc;
- zone->uz_flags |= UMA_ZONE_NOFREE | UMA_ZFLAG_PRIVALLOC;
+ keg->uk_kva = kva;
+ keg->uk_obj = obj;
+ keg->uk_maxpages = pages;
+ keg->uk_allocf = obj_alloc;
+ keg->uk_flags |= UMA_ZONE_NOFREE | UMA_ZFLAG_PRIVALLOC;
ZONE_UNLOCK(zone);
return (1);
}
@@ -1993,20 +2311,41 @@ uma_prealloc(uma_zone_t zone, int items)
{
int slabs;
uma_slab_t slab;
+ uma_keg_t keg;
+ keg = zone->uz_keg;
ZONE_LOCK(zone);
- slabs = items / zone->uz_ipers;
- if (slabs * zone->uz_ipers < items)
+ slabs = items / keg->uk_ipers;
+ if (slabs * keg->uk_ipers < items)
slabs++;
while (slabs > 0) {
slab = slab_zalloc(zone, M_WAITOK);
- LIST_INSERT_HEAD(&zone->uz_free_slab, slab, us_link);
+ LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link);
slabs--;
}
ZONE_UNLOCK(zone);
}
/* See uma.h */
+u_int32_t *
+uma_find_refcnt(uma_zone_t zone, void *item)
+{
+ uma_slabrefcnt_t slab;
+ uma_keg_t keg;
+ u_int32_t *refcnt;
+ int idx;
+
+ keg = zone->uz_keg;
+ slab = (uma_slabrefcnt_t)vtoslab((vm_offset_t)item & (~UMA_SLAB_MASK));
+ KASSERT(slab != NULL,
+ ("uma_find_refcnt(): zone possibly not UMA_ZONE_REFCNT"));
+ idx = ((unsigned long)item - (unsigned long)slab->us_data)
+ / keg->uk_rsize;
+ refcnt = &(slab->us_freelist[idx].us_refcnt);
+ return refcnt;
+}
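
A caller of the new uma_find_refcnt() interface would look roughly like the sketch below. The zone name, item size, and helper are invented for illustration; the zone has to be created with UMA_ZONE_REFCNT so that its slabs use the refcnt layout that actually carries the per-item counters:

#include <sys/param.h>
#include <sys/malloc.h>
#include <vm/uma.h>

static uma_zone_t zone_extbuf;

static void
extbuf_zone_setup(void)
{
	/* UMA_ZONE_REFCNT selects the larger uma_slab_refcnt layout. */
	zone_extbuf = uma_zcreate("example ext bufs", 2048, NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
}

static void *
extbuf_alloc(void)
{
	u_int32_t *refcnt;
	void *buf;

	buf = uma_zalloc(zone_extbuf, M_NOWAIT);
	if (buf == NULL)
		return (NULL);
	/* The counter lives in the slab; take the first reference. */
	refcnt = uma_find_refcnt(zone_extbuf, buf);
	*refcnt = 1;
	return (buf);
}
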
+
+/* See uma.h */
void
uma_reclaim(void)
{
@@ -2021,6 +2360,7 @@ uma_reclaim(void)
* zones are drained. We have to do the same for buckets.
*/
zone_drain(slabzone);
+ zone_drain(slabrefzone);
bucket_zone_drain();
}
@@ -2044,7 +2384,6 @@ uma_large_malloc(int size, int wait)
uma_zfree_internal(slabzone, slab, NULL, 0);
}
-
return (mem);
}
@@ -2065,8 +2404,8 @@ uma_print_stats(void)
static void
slab_print(uma_slab_t slab)
{
- printf("slab: zone %p, data %p, freecount %d, firstfree %d\n",
- slab->us_zone, slab->us_data, slab->us_freecount,
+ printf("slab: keg %p, data %p, freecount %d, firstfree %d\n",
+ slab->us_keg, slab->us_data, slab->us_freecount,
slab->us_firstfree);
}
@@ -2084,21 +2423,23 @@ void
uma_print_zone(uma_zone_t zone)
{
uma_cache_t cache;
+ uma_keg_t keg;
uma_slab_t slab;
int i;
+ keg = zone->uz_keg;
printf("%s(%p) size %d(%d) flags %d ipers %d ppera %d out %d free %d\n",
- zone->uz_name, zone, zone->uz_size, zone->uz_rsize, zone->uz_flags,
- zone->uz_ipers, zone->uz_ppera,
- (zone->uz_ipers * zone->uz_pages) - zone->uz_free, zone->uz_free);
+ zone->uz_name, zone, keg->uk_size, keg->uk_rsize, keg->uk_flags,
+ keg->uk_ipers, keg->uk_ppera,
+ (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free);
printf("Part slabs:\n");
- LIST_FOREACH(slab, &zone->uz_part_slab, us_link)
+ LIST_FOREACH(slab, &keg->uk_part_slab, us_link)
slab_print(slab);
printf("Free slabs:\n");
- LIST_FOREACH(slab, &zone->uz_free_slab, us_link)
+ LIST_FOREACH(slab, &keg->uk_free_slab, us_link)
slab_print(slab);
printf("Full slabs:\n");
- LIST_FOREACH(slab, &zone->uz_full_slab, us_link)
+ LIST_FOREACH(slab, &keg->uk_full_slab, us_link)
slab_print(slab);
for (i = 0; i <= mp_maxid; i++) {
if (CPU_ABSENT(i))
@@ -2122,6 +2463,7 @@ sysctl_vm_zone(SYSCTL_HANDLER_ARGS)
int totalfree;
char *tmpbuf, *offset;
uma_zone_t z;
+ uma_keg_t zk;
char *p;
int cpu;
int cachefree;
@@ -2130,8 +2472,10 @@ sysctl_vm_zone(SYSCTL_HANDLER_ARGS)
cnt = 0;
mtx_lock(&uma_mtx);
- LIST_FOREACH(z, &uma_zones, uz_link)
- cnt++;
+ LIST_FOREACH(zk, &uma_kegs, uk_link) {
+ LIST_FOREACH(z, &zk->uk_zones, uz_link)
+ cnt++;
+ }
mtx_unlock(&uma_mtx);
MALLOC(tmpbuf, char *, (cnt == 0 ? 1 : cnt) * linesize,
M_TEMP, M_WAITOK);
@@ -2144,10 +2488,11 @@ sysctl_vm_zone(SYSCTL_HANDLER_ARGS)
goto out;
offset = tmpbuf;
mtx_lock(&uma_mtx);
- LIST_FOREACH(z, &uma_zones, uz_link) {
+ LIST_FOREACH(zk, &uma_kegs, uk_link) {
+ LIST_FOREACH(z, &zk->uk_zones, uz_link) {
if (cnt == 0) /* list may have changed size */
break;
- if (!(z->uz_flags & UMA_ZFLAG_INTERNAL)) {
+ if (!(zk->uk_flags & UMA_ZFLAG_INTERNAL)) {
for (cpu = 0; cpu <= mp_maxid; cpu++) {
if (CPU_ABSENT(cpu))
continue;
@@ -2156,7 +2501,7 @@ sysctl_vm_zone(SYSCTL_HANDLER_ARGS)
}
ZONE_LOCK(z);
cachefree = 0;
- if (!(z->uz_flags & UMA_ZFLAG_INTERNAL)) {
+ if (!(zk->uk_flags & UMA_ZFLAG_INTERNAL)) {
for (cpu = 0; cpu <= mp_maxid; cpu++) {
if (CPU_ABSENT(cpu))
continue;
@@ -2171,12 +2516,12 @@ sysctl_vm_zone(SYSCTL_HANDLER_ARGS)
LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link) {
cachefree += bucket->ub_cnt;
}
- totalfree = z->uz_free + cachefree;
+ totalfree = zk->uk_free + cachefree;
len = snprintf(offset, linesize,
"%-12.12s %6.6u, %8.8u, %6.6u, %6.6u, %8.8llu\n",
- z->uz_name, z->uz_size,
- z->uz_maxpages * z->uz_ipers,
- (z->uz_ipers * (z->uz_pages / z->uz_ppera)) - totalfree,
+ z->uz_name, zk->uk_size,
+ zk->uk_maxpages * zk->uk_ipers,
+ (zk->uk_ipers * (zk->uk_pages / zk->uk_ppera)) - totalfree,
totalfree,
(unsigned long long)z->uz_allocs);
ZONE_UNLOCK(z);
@@ -2185,6 +2530,7 @@ sysctl_vm_zone(SYSCTL_HANDLER_ARGS)
p[1] = ':';
cnt--;
offset += len;
+ }
}
mtx_unlock(&uma_mtx);
*offset++ = '\0';
diff --git a/sys/vm/uma_dbg.c b/sys/vm/uma_dbg.c
index 85d067d..0f845cf 100644
--- a/sys/vm/uma_dbg.c
+++ b/sys/vm/uma_dbg.c
@@ -192,15 +192,17 @@ static uma_slab_t
uma_dbg_getslab(uma_zone_t zone, void *item)
{
uma_slab_t slab;
+ uma_keg_t keg;
u_int8_t *mem;
+ keg = zone->uz_keg;
mem = (u_int8_t *)((unsigned long)item & (~UMA_SLAB_MASK));
- if (zone->uz_flags & UMA_ZONE_MALLOC) {
+ if (keg->uk_flags & UMA_ZONE_MALLOC) {
slab = vtoslab((vm_offset_t)mem);
- } else if (zone->uz_flags & UMA_ZONE_HASH) {
- slab = hash_sfind(&zone->uz_hash, mem);
+ } else if (keg->uk_flags & UMA_ZONE_HASH) {
+ slab = hash_sfind(&keg->uk_hash, mem);
} else {
- mem += zone->uz_pgoff;
+ mem += keg->uk_pgoff;
slab = (uma_slab_t)mem;
}
@@ -215,8 +217,10 @@ uma_dbg_getslab(uma_zone_t zone, void *item)
void
uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
{
+ uma_keg_t keg;
int freei;
+ keg = zone->uz_keg;
if (slab == NULL) {
slab = uma_dbg_getslab(zone, item);
if (slab == NULL)
@@ -225,9 +229,9 @@ uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
}
freei = ((unsigned long)item - (unsigned long)slab->us_data)
- / zone->uz_rsize;
+ / keg->uk_rsize;
- slab->us_freelist[freei] = 255;
+ slab->us_freelist[freei].us_item = 255;
return;
}
@@ -241,8 +245,10 @@ uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
void
uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
{
+ uma_keg_t keg;
int freei;
+ keg = zone->uz_keg;
if (slab == NULL) {
slab = uma_dbg_getslab(zone, item);
if (slab == NULL)
@@ -251,22 +257,22 @@ uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
}
freei = ((unsigned long)item - (unsigned long)slab->us_data)
- / zone->uz_rsize;
+ / keg->uk_rsize;
- if (freei >= zone->uz_ipers)
+ if (freei >= keg->uk_ipers)
panic("zone: %s(%p) slab %p freelist %d out of range 0-%d\n",
- zone->uz_name, zone, slab, freei, zone->uz_ipers-1);
+ zone->uz_name, zone, slab, freei, keg->uk_ipers-1);
- if (((freei * zone->uz_rsize) + slab->us_data) != item) {
+ if (((freei * keg->uk_rsize) + slab->us_data) != item) {
printf("zone: %s(%p) slab %p freed address %p unaligned.\n",
zone->uz_name, zone, slab, item);
panic("should be %p\n",
- (freei * zone->uz_rsize) + slab->us_data);
+ (freei * keg->uk_rsize) + slab->us_data);
}
- if (slab->us_freelist[freei] != 255) {
+ if (slab->us_freelist[freei].us_item != 255) {
printf("Slab at %p, freei %d = %d.\n",
- slab, freei, slab->us_freelist[freei]);
+ slab, freei, slab->us_freelist[freei].us_item);
panic("Duplicate free of item %p from zone %p(%s)\n",
item, zone, zone->uz_name);
}
@@ -276,5 +282,5 @@ uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
* Until then the count of valid slabs will make sure we don't
* accidentally follow this and assume it's a valid index.
*/
- slab->us_freelist[freei] = 0;
+ slab->us_freelist[freei].us_item = 0;
}
diff --git a/sys/vm/uma_int.h b/sys/vm/uma_int.h
index 35acfde..a4cbe5f 100644
--- a/sys/vm/uma_int.h
+++ b/sys/vm/uma_int.h
@@ -35,10 +35,10 @@
/*
* Here's a quick description of the relationship between the objects:
*
- * Zones contain lists of slabs which are stored in either the full bin, empty
+ * Kegs contain lists of slabs which are stored in either the full bin, empty
* bin, or partially allocated bin, to reduce fragmentation. They also contain
* the user supplied value for size, which is adjusted for alignment purposes
- * and rsize is the result of that. The zone also stores information for
+ * and rsize is the result of that. The Keg also stores information for
* managing a hash of page addresses that maps pages to uma_slab_t structures
* for pages that don't have embedded uma_slab_t's.
*
@@ -67,6 +67,20 @@
* so at this time it may not make sense to optimize for it. This can, of
* course, be solved with dynamic slab sizes.
*
+ * Kegs may serve multiple Zones but by far most of the time they only serve
+ * one. When a Zone is created, a Keg is allocated and setup for it. While
+ * the backing Keg stores slabs, the Zone caches Buckets of items allocated
+ * from the slabs. Each Zone is equipped with an init/fini and ctor/dtor
+ * pair, as well as with its own set of small per-CPU caches, layered above
+ * the Zone's general Bucket cache.
+ *
+ * The PCPU caches are protected by their own locks, while the Zones backed
+ * by the same Keg all share a common Keg lock (to coalesce contention on
+ * the backing slabs). The backing Keg typically only serves one Zone but
+ * in the case of multiple Zones, one of the Zones is considered the
+ * Master Zone and all Zone-related stats from the Keg are done in the
+ * Master Zone. For an example of a Multi-Zone setup, refer to the
+ * Mbuf allocation code.
*/
/*
@@ -134,28 +148,6 @@
SLIST_REMOVE(&(h)->uh_slab_hash[UMA_HASH((h), \
(mem))], (s), uma_slab, us_hlink);
-/* Page management structure */
-
-/* Sorry for the union, but space efficiency is important */
-struct uma_slab {
- uma_zone_t us_zone; /* Zone we live in */
- union {
- LIST_ENTRY(uma_slab) _us_link; /* slabs in zone */
- unsigned long _us_size; /* Size of allocation */
- } us_type;
- SLIST_ENTRY(uma_slab) us_hlink; /* Link for hash table */
- u_int8_t *us_data; /* First item */
- u_int8_t us_flags; /* Page flags see uma.h */
- u_int8_t us_freecount; /* How many are free? */
- u_int8_t us_firstfree; /* First free item index */
- u_int8_t us_freelist[1]; /* Free List (actually larger) */
-};
-
-#define us_link us_type._us_link
-#define us_size us_type._us_size
-
-typedef struct uma_slab * uma_slab_t;
-
/* Hash table for freed address -> slab translation */
SLIST_HEAD(slabhead, uma_slab);
@@ -188,6 +180,97 @@ struct uma_cache {
typedef struct uma_cache * uma_cache_t;
/*
+ * Keg management structure
+ *
+ * TODO: Optimize for cache line size
+ *
+ */
+struct uma_keg {
+ LIST_ENTRY(uma_keg) uk_link; /* List of all kegs */
+
+ struct mtx uk_lock; /* Lock for the keg */
+ struct uma_hash uk_hash;
+
+ LIST_HEAD(,uma_zone) uk_zones; /* Keg's zones */
+ LIST_HEAD(,uma_slab) uk_part_slab; /* partially allocated slabs */
+ LIST_HEAD(,uma_slab) uk_free_slab; /* empty slab list */
+ LIST_HEAD(,uma_slab) uk_full_slab; /* full slabs */
+
+ u_int32_t uk_recurse; /* Allocation recursion count */
+ u_int32_t uk_align; /* Alignment mask */
+ u_int32_t uk_pages; /* Total page count */
+ u_int32_t uk_free; /* Count of items free in slabs */
+ u_int32_t uk_size; /* Requested size of each item */
+ u_int32_t uk_rsize; /* Real size of each item */
+ u_int32_t uk_maxpages; /* Maximum number of pages to alloc */
+
+ uma_init uk_init; /* Keg's init routine */
+ uma_fini uk_fini; /* Keg's fini routine */
+ uma_alloc uk_allocf; /* Allocation function */
+ uma_free uk_freef; /* Free routine */
+
+ struct vm_object *uk_obj; /* Zone specific object */
+ vm_offset_t uk_kva; /* Base kva for zones with objs */
+ uma_zone_t uk_slabzone; /* Slab zone backing us, if OFFPAGE */
+
+ u_int16_t uk_pgoff; /* Offset to uma_slab struct */
+ u_int16_t uk_ppera; /* pages per allocation from backend */
+ u_int16_t uk_ipers; /* Items per slab */
+ u_int16_t uk_flags; /* Internal flags */
+};
+
+/* Simpler reference to uma_keg for internal use. */
+typedef struct uma_keg * uma_keg_t;
+
+/* Page management structure */
+
+/* Sorry for the union, but space efficiency is important */
+struct uma_slab_head {
+ uma_keg_t us_keg; /* Keg we live in */
+ union {
+ LIST_ENTRY(uma_slab) _us_link; /* slabs in zone */
+ unsigned long _us_size; /* Size of allocation */
+ } us_type;
+ SLIST_ENTRY(uma_slab) us_hlink; /* Link for hash table */
+ u_int8_t *us_data; /* First item */
+ u_int8_t us_flags; /* Page flags see uma.h */
+ u_int8_t us_freecount; /* How many are free? */
+ u_int8_t us_firstfree; /* First free item index */
+};
+
+/* The standard slab structure */
+struct uma_slab {
+ struct uma_slab_head us_head; /* slab header data */
+ struct {
+ u_int8_t us_item;
+ } us_freelist[1]; /* actual number bigger */
+};
+
+/*
+ * The slab structure for UMA_ZONE_REFCNT zones, whose items carry
+ * reference counters maintained in the slab itself.
+ */
+struct uma_slab_refcnt {
+ struct uma_slab_head us_head; /* slab header data */
+ struct {
+ u_int8_t us_item;
+ u_int32_t us_refcnt;
+ } us_freelist[1]; /* actual number bigger */
+};
+
+#define us_keg us_head.us_keg
+#define us_link us_head.us_type._us_link
+#define us_size us_head.us_type._us_size
+#define us_hlink us_head.us_hlink
+#define us_data us_head.us_data
+#define us_flags us_head.us_flags
+#define us_freecount us_head.us_freecount
+#define us_firstfree us_head.us_firstfree
+
+typedef struct uma_slab * uma_slab_t;
+typedef struct uma_slab_refcnt * uma_slabrefcnt_t;
+
+/*
* Zone management structure
*
* TODO: Optimize for cache line size
@@ -195,42 +278,22 @@ typedef struct uma_cache * uma_cache_t;
*/
struct uma_zone {
char *uz_name; /* Text name of the zone */
- LIST_ENTRY(uma_zone) uz_link; /* List of all zones */
- u_int32_t uz_align; /* Alignment mask */
- u_int32_t uz_pages; /* Total page count */
-
-/* Used during alloc / free */
- struct mtx uz_lock; /* Lock for the zone */
- u_int32_t uz_free; /* Count of items free in slabs */
- u_int16_t uz_ipers; /* Items per slab */
- u_int16_t uz_flags; /* Internal flags */
-
- LIST_HEAD(,uma_slab) uz_part_slab; /* partially allocated slabs */
- LIST_HEAD(,uma_slab) uz_free_slab; /* empty slab list */
- LIST_HEAD(,uma_slab) uz_full_slab; /* full slabs */
+ struct mtx *uz_lock; /* Lock for the zone (keg's lock) */
+ uma_keg_t uz_keg; /* Our underlying Keg */
+
+ LIST_ENTRY(uma_zone) uz_link; /* List of all zones in keg */
LIST_HEAD(,uma_bucket) uz_full_bucket; /* full buckets */
LIST_HEAD(,uma_bucket) uz_free_bucket; /* Buckets for frees */
- u_int32_t uz_size; /* Requested size of each item */
- u_int32_t uz_rsize; /* Real size of each item */
-
- struct uma_hash uz_hash;
- u_int16_t uz_pgoff; /* Offset to uma_slab struct */
- u_int16_t uz_ppera; /* pages per allocation from backend */
uma_ctor uz_ctor; /* Constructor for each allocation */
uma_dtor uz_dtor; /* Destructor */
- u_int64_t uz_allocs; /* Total number of allocations */
-
uma_init uz_init; /* Initializer for each item */
uma_fini uz_fini; /* Discards memory */
- uma_alloc uz_allocf; /* Allocation function */
- uma_free uz_freef; /* Free routine */
- struct vm_object *uz_obj; /* Zone specific object */
- vm_offset_t uz_kva; /* Base kva for zones with objs */
- u_int32_t uz_maxpages; /* Maximum number of pages to alloc */
- int uz_recurse; /* Allocation recursion count */
+
+ u_int64_t uz_allocs; /* Total number of allocations */
uint16_t uz_fills; /* Outstanding bucket fills */
uint16_t uz_count; /* Highest value ub_ptr can have */
+
/*
* This HAS to be the last item because we adjust the zone size
* based on NCPU and then allocate the space for the zones.
@@ -256,16 +319,16 @@ void uma_large_free(uma_slab_t slab);
#define ZONE_LOCK_INIT(z, lc) \
do { \
if ((lc)) \
- mtx_init(&(z)->uz_lock, (z)->uz_name, \
+ mtx_init((z)->uz_lock, (z)->uz_name, \
(z)->uz_name, MTX_DEF | MTX_DUPOK); \
else \
- mtx_init(&(z)->uz_lock, (z)->uz_name, \
+ mtx_init((z)->uz_lock, (z)->uz_name, \
"UMA zone", MTX_DEF | MTX_DUPOK); \
} while (0)
-#define ZONE_LOCK_FINI(z) mtx_destroy(&(z)->uz_lock)
-#define ZONE_LOCK(z) mtx_lock(&(z)->uz_lock)
-#define ZONE_UNLOCK(z) mtx_unlock(&(z)->uz_lock)
+#define ZONE_LOCK_FINI(z) mtx_destroy((z)->uz_lock)
+#define ZONE_LOCK(z) mtx_lock((z)->uz_lock)
+#define ZONE_UNLOCK(z) mtx_unlock((z)->uz_lock)
#define CPU_LOCK_INIT(cpu) \
mtx_init(&uma_pcpu_mtx[(cpu)], "UMA pcpu", "UMA pcpu", \
diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c
index 3e21a99..f71785f 100644
--- a/sys/vm/vm_kern.c
+++ b/sys/vm/vm_kern.c
@@ -320,16 +320,6 @@ kmem_malloc(map, size, flags)
vm_map_lock(map);
if (vm_map_findspace(map, vm_map_min(map), size, &addr)) {
vm_map_unlock(map);
- if (map != kmem_map) {
- static int last_report; /* when we did it (in ticks) */
- if (ticks < last_report ||
- (ticks - last_report) >= hz) {
- last_report = ticks;
- printf("Out of mbuf address space!\n");
- printf("Consider increasing NMBCLUSTERS\n");
- }
- return (0);
- }
if ((flags & M_NOWAIT) == 0)
panic("kmem_malloc(%ld): kmem_map too small: %ld total allocated",
(long)size, (long)map->size);
diff --git a/usr.bin/netstat/main.c b/usr.bin/netstat/main.c
index ac9dd26..8992599 100644
--- a/usr.bin/netstat/main.c
+++ b/usr.bin/netstat/main.c
@@ -256,7 +256,6 @@ static char *nlistf = NULL, *memf = NULL;
int Aflag; /* show addresses of protocol control block */
int aflag; /* show all sockets (including servers) */
int bflag; /* show i/f total bytes in/out */
-int cflag; /* show mbuf cache information */
int dflag; /* show i/f dropped packets */
int gflag; /* show group (multicast) routing or stats */
int iflag; /* show interfaces */
@@ -297,9 +296,6 @@ main(int argc, char *argv[])
case 'b':
bflag = 1;
break;
- case 'c':
- cflag = 1;
- break;
case 'd':
dflag = 1;
break;
@@ -425,10 +421,6 @@ main(int argc, char *argv[])
if (nlistf != NULL || memf != NULL)
setgid(getgid());
- if (cflag && !mflag) {
- (void)fprintf(stderr, "-c only valid with -m\n");
- usage();
- }
if (mflag) {
if (memf != NULL) {
if (kread(0, 0, 0) == 0)
diff --git a/usr.bin/netstat/mbuf.c b/usr.bin/netstat/mbuf.c
index aa6a8d2..98546c4 100644
--- a/usr.bin/netstat/mbuf.c
+++ b/usr.bin/netstat/mbuf.c
@@ -99,17 +99,12 @@ mbpr(u_long mbaddr, u_long mbtaddr __unused, u_long nmbcaddr, u_long nmbufaddr,
u_long mbhiaddr, u_long clhiaddr, u_long mbloaddr, u_long clloaddr,
u_long cpusaddr __unused, u_long pgsaddr, u_long mbpaddr)
{
- int i, j, nmbufs, nmbclusters, page_size, num_objs;
+ int i, nmbclusters;
int nsfbufs, nsfbufspeak, nsfbufsused;
- u_int mbuf_hiwm, clust_hiwm, mbuf_lowm, clust_lowm;
- u_long totspace[2], totused[2];
- u_long gentotnum, gentotfree, totnum, totfree;
- u_long totmem, totmemalloced, totmemused;
short nmbtypes;
size_t mlen;
long *mbtypes = NULL;
struct mbstat *mbstat = NULL;
- struct mbpstat **mbpstat = NULL;
struct mbtypenames *mp;
bool *seen = NULL;
@@ -119,50 +114,12 @@ mbpr(u_long mbaddr, u_long mbtaddr __unused, u_long nmbcaddr, u_long nmbufaddr,
goto err;
}
- /*
- * XXX: Unfortunately, for the time being, we have to fetch
- * the total length of the per-CPU stats area via sysctl
- * (regardless of whether we're looking at a core or not.
- */
- if (sysctlbyname("kern.ipc.mb_statpcpu", NULL, &mlen, NULL, 0) < 0) {
- warn("sysctl: retrieving mb_statpcpu len");
- goto err;
- }
- num_objs = (int)(mlen / sizeof(struct mbpstat));
- if ((mbpstat = calloc(num_objs, sizeof(struct mbpstat *))) == NULL) {
- warn("calloc: cannot allocate memory for mbpstats pointers");
- goto err;
- }
- if ((mbpstat[0] = calloc(num_objs, sizeof(struct mbpstat))) == NULL) {
- warn("calloc: cannot allocate memory for mbpstats");
- goto err;
- }
-
if (mbaddr) {
- if (kread(mbpaddr, (char *)mbpstat[0], mlen))
- goto err;
if (kread(mbaddr, (char *)mbstat, sizeof mbstat))
goto err;
if (kread(nmbcaddr, (char *)&nmbclusters, sizeof(int)))
goto err;
- if (kread(nmbufaddr, (char *)&nmbufs, sizeof(int)))
- goto err;
- if (kread(mbhiaddr, (char *)&mbuf_hiwm, sizeof(u_int)))
- goto err;
- if (kread(clhiaddr, (char *)&clust_hiwm, sizeof(u_int)))
- goto err;
- if (kread(mbloaddr, (char *)&mbuf_lowm, sizeof(u_int)))
- goto err;
- if (kread(clloaddr, (char *)&clust_lowm, sizeof(u_int)))
- goto err;
- if (kread(pgsaddr, (char *)&page_size, sizeof(int)))
- goto err;
} else {
- if (sysctlbyname("kern.ipc.mb_statpcpu", mbpstat[0], &mlen,
- NULL, 0) < 0) {
- warn("sysctl: retrieving mb_statpcpu");
- goto err;
- }
mlen = sizeof *mbstat;
if (sysctlbyname("kern.ipc.mbstat", mbstat, &mlen, NULL, 0)
< 0) {
@@ -175,43 +132,9 @@ mbpr(u_long mbaddr, u_long mbtaddr __unused, u_long nmbcaddr, u_long nmbufaddr,
warn("sysctl: retrieving nmbclusters");
goto err;
}
- mlen = sizeof(int);
- if (sysctlbyname("kern.ipc.nmbufs", &nmbufs, &mlen, NULL, 0)
- < 0) {
- warn("sysctl: retrieving nmbufs");
- goto err;
- }
- mlen = sizeof(u_int);
- if (sysctlbyname("kern.ipc.mbuf_hiwm", &mbuf_hiwm, &mlen,
- NULL, 0) < 0) {
- warn("sysctl: retrieving mbuf_hiwm");
- goto err;
- }
- mlen = sizeof(u_int);
- if (sysctlbyname("kern.ipc.clust_hiwm", &clust_hiwm, &mlen,
- NULL, 0) < 0) {
- warn("sysctl: retrieving clust_hiwm");
- goto err;
- }
- mlen = sizeof(u_int);
- if (sysctlbyname("kern.ipc.mbuf_lowm", &mbuf_lowm, &mlen,
- NULL, 0) < 0) {
- warn("sysctl: retrieving mbuf_lowm");
- goto err;
- }
- mlen = sizeof(u_int);
- if (sysctlbyname("kern.ipc.clust_lowm", &clust_lowm, &mlen,
- NULL, 0) < 0) {
- warn("sysctl: retrieving clust_lowm");
- goto err;
- }
- mlen = sizeof(int);
- if (sysctlbyname("hw.pagesize", &page_size, &mlen, NULL, 0)
- < 0) {
- warn("sysctl: retrieving hw.pagesize");
- goto err;
- }
}
+ if (mbstat->m_mbufs < 0) mbstat->m_mbufs = 0; /* XXX */
+ if (mbstat->m_mclusts < 0) mbstat->m_mclusts = 0; /* XXX */
nmbtypes = mbstat->m_numtypes;
if ((seen = calloc(nmbtypes, sizeof(*seen))) == NULL) {
@@ -223,59 +146,13 @@ mbpr(u_long mbaddr, u_long mbtaddr __unused, u_long nmbcaddr, u_long nmbufaddr,
goto err;
}
- for (i = 0; i < num_objs; i++)
- mbpstat[i] = mbpstat[0] + i;
-
#undef MSIZE
#define MSIZE (mbstat->m_msize)
#undef MCLBYTES
#define MCLBYTES (mbstat->m_mclbytes)
-#define GENLST (num_objs - 1)
- totnum = mbpstat[GENLST]->mb_mbbucks * mbstat->m_mbperbuck;
- totfree = mbpstat[GENLST]->mb_mbfree;
- for (j = 1; j < nmbtypes; j++)
- mbtypes[j] += mbpstat[GENLST]->mb_mbtypes[j];
- totspace[0] = mbpstat[GENLST]->mb_mbbucks * mbstat->m_mbperbuck * MSIZE;
- for (i = 0; i < (num_objs - 1); i++) {
- if (mbpstat[i]->mb_active == 0)
- continue;
- totspace[0] += mbpstat[i]->mb_mbbucks*mbstat->m_mbperbuck*MSIZE;
- totnum += mbpstat[i]->mb_mbbucks * mbstat->m_mbperbuck;
- totfree += mbpstat[i]->mb_mbfree;
- for (j = 1; j < nmbtypes; j++)
- mbtypes[j] += mbpstat[i]->mb_mbtypes[j];
- }
- totused[0] = totnum - totfree;
- if (cflag) {
- printf("mbuf usage:\n"
- "\tTotal:\t\t%lu/%lu/%d (in use/in pool/max)\n",
- totused[0], totnum, nmbufs);
- gentotnum = mbpstat[GENLST]->mb_mbbucks * mbstat->m_mbperbuck;
- gentotfree = mbpstat[GENLST]->mb_mbfree;
- printf("\tGEN cache:\t%lu/%lu (in use/in pool)\n",
- gentotnum - gentotfree, gentotnum);
- } else {
- /* XXX: peak is now wrong. */
- printf("%lu/%lu/%d mbufs in use (current/peak/max):\n",
- totused[0], totnum, nmbufs);
- }
+ printf("%lu mbufs in use\n", mbstat->m_mbufs);
- for (i = 0; cflag && i < (num_objs - 1); i++) {
- if (mbpstat[i]->mb_active == 0)
- continue;
- printf("\tCPU #%d cache:\t%lu/%lu (in use/in pool)\n",
- i,
- (mbpstat[i]->mb_mbbucks * mbstat->m_mbperbuck -
- mbpstat[i]->mb_mbfree),
- (mbpstat[i]->mb_mbbucks * mbstat->m_mbperbuck));
- }
- if (cflag) {
- printf("\tMbuf cache high watermark: %d\n", mbuf_hiwm);
-#ifdef NOTYET
- printf("\tMbuf cache low watermark: %d\n", mbuf_lowm);
-#endif
- }
for (mp = mbtypenames; mp->mt_name; mp++) {
if (mbtypes[mp->mt_type]) {
seen[mp->mt_type] = YES;
@@ -288,53 +165,10 @@ mbpr(u_long mbaddr, u_long mbtaddr __unused, u_long nmbcaddr, u_long nmbufaddr,
printf("\t %lu mbufs allocated to <mbuf type: %d>\n",
mbtypes[i], i);
}
- if (cflag)
- printf("\t%.1f%% of mbuf map consumed\n",
- totspace[0] * 100.0 / (nmbufs * MSIZE));
- totnum = mbpstat[GENLST]->mb_clbucks * mbstat->m_clperbuck;
- totfree = mbpstat[GENLST]->mb_clfree;
- totspace[1] = mbpstat[GENLST]->mb_clbucks*mbstat->m_clperbuck*MCLBYTES;
- for (i = 0; i < (num_objs - 1); i++) {
- if (mbpstat[i]->mb_active == 0)
- continue;
- totspace[1] += mbpstat[i]->mb_clbucks * mbstat->m_clperbuck
- * MCLBYTES;
- totnum += mbpstat[i]->mb_clbucks * mbstat->m_clperbuck;
- totfree += mbpstat[i]->mb_clfree;
- }
- totused[1] = totnum - totfree;
- if (cflag) {
- printf("mbuf cluster usage:\n"
- "\tTotal:\t\t%lu/%lu/%d (in use/in pool/max)\n",
- totused[1], totnum, nmbclusters);
- gentotnum = mbpstat[GENLST]->mb_clbucks * mbstat->m_clperbuck;
- gentotfree = mbpstat[GENLST]->mb_clfree;
- printf("\tGEN cache:\t%lu/%lu (in use/in pool)\n",
- gentotnum - gentotfree, gentotnum);
- } else {
- /* XXX: peak is now wrong. */
- printf("%lu/%lu/%d mbuf clusters in use (current/peak/max)\n",
- totused[1], totnum, nmbclusters);
- }
- for (i = 0; cflag && i < (num_objs - 1); i++) {
- if (mbpstat[i]->mb_active == 0)
- continue;
- printf("\tCPU #%d cache:\t%lu/%lu (in use/in pool)\n",
- i,
- (mbpstat[i]->mb_clbucks * mbstat->m_clperbuck -
- mbpstat[i]->mb_clfree),
- (mbpstat[i]->mb_clbucks * mbstat->m_clperbuck));
- }
- if (cflag) {
- printf("\tCluster cache high watermark: %d\n", clust_hiwm);
-#ifdef NOTYET
- printf("\tCluster cache low watermark: %d\n", clust_lowm);
-#endif
- }
- if (cflag)
- printf("\t%.1f%% of cluster map consumed\n",
- totspace[1] * 100.0 / (nmbclusters * MCLBYTES));
+ printf("%lu/%d mbuf clusters in use (current/max)\n",
+ mbstat->m_mclusts, nmbclusters);
+
mlen = sizeof(nsfbufs);
if (!sysctlbyname("kern.ipc.nsfbufs", &nsfbufs, &mlen, NULL, 0) &&
!sysctlbyname("kern.ipc.nsfbufsused", &nsfbufsused, &mlen, NULL,
@@ -344,15 +178,8 @@ mbpr(u_long mbaddr, u_long mbtaddr __unused, u_long nmbcaddr, u_long nmbufaddr,
printf("%d/%d/%d sfbufs in use (current/peak/max)\n",
nsfbufsused, nsfbufspeak, nsfbufs);
}
- totmem = nmbufs * MSIZE + nmbclusters * MCLBYTES;
- totmemalloced = totspace[0] + totspace[1];
- totmemused = totused[0] * MSIZE + totused[1] * MCLBYTES;
- printf(
- "%lu KBytes allocated to network (%.1f%% in use, %.1f%% wired)\n",
- totmem / 1024, totmemused * 100.0 / totmem,
- totmemalloced * 100.0 / totmem);
- printf("%lu requests for memory denied\n", mbstat->m_drops);
- printf("%lu requests for memory delayed\n", mbstat->m_wait);
+ printf("%lu KBytes allocated to network\n", (mbstat->m_mbufs * MSIZE +
+ mbstat->m_mclusts * MCLBYTES) / 1024);
printf("%lu requests for sfbufs denied\n", mbstat->sf_allocfail);
printf("%lu requests for sfbufs delayed\n", mbstat->sf_allocwait);
printf("%lu requests for I/O initiated by sendfile\n",
@@ -366,9 +193,4 @@ err:
free(seen);
if (mbstat != NULL)
free(mbstat);
- if (mbpstat != NULL) {
- if (mbpstat[0] != NULL)
- free(mbpstat[0]);
- free(mbpstat);
- }
}
diff --git a/usr.bin/netstat/netstat.1 b/usr.bin/netstat/netstat.1
index 45023fe..32edfec 100644
--- a/usr.bin/netstat/netstat.1
+++ b/usr.bin/netstat/netstat.1
@@ -181,7 +181,6 @@ or for a single
.Bk -words
.Nm
.Fl m
-.Op Fl c
.Op Fl M Ar core
.Op Fl N Ar system
.Ek
@@ -189,9 +188,6 @@ or for a single
Show statistics recorded by the memory management routines
.Pq Xr mbuf 9 .
The network manages a private pool of memory buffers.
-The
-.Fl c
-option shows per-CPU statistics for caching.
.It Xo
.Bk -words
.Nm
diff --git a/usr.bin/netstat/netstat.h b/usr.bin/netstat/netstat.h
index c59b7e8..e2b3f29 100644
--- a/usr.bin/netstat/netstat.h
+++ b/usr.bin/netstat/netstat.h
@@ -39,7 +39,6 @@
extern int Aflag; /* show addresses of protocol control block */
extern int aflag; /* show all sockets (including servers) */
extern int bflag; /* show i/f total bytes in/out */
-extern int cflag; /* show mbuf cache information */
extern int dflag; /* show i/f dropped packets */
extern int gflag; /* show group (multicast) routing or stats */
extern int iflag; /* show interfaces */
diff --git a/usr.bin/systat/mbufs.c b/usr.bin/systat/mbufs.c
index e1b665b2..1193a3e 100644
--- a/usr.bin/systat/mbufs.c
+++ b/usr.bin/systat/mbufs.c
@@ -52,12 +52,9 @@ static const char sccsid[] = "@(#)mbufs.c 8.1 (Berkeley) 6/6/93";
#include "systat.h"
#include "extern.h"
-static struct mbpstat **mbpstat;
static struct mbstat *mbstat;
-static int num_objs;
static long *m_mbtypes;
static short nmbtypes;
-#define GENLST (num_objs - 1)
static struct mtnames {
short mt_type;
@@ -101,20 +98,11 @@ void
showmbufs()
{
int i, j, max, idx;
- u_long totfree;
+ u_long totmbufs;
char buf[10];
const char *mtname;
- totfree = mbpstat[GENLST]->mb_mbfree;
- for (i = 1; i < nmbtypes; i++)
- m_mbtypes[i] += mbpstat[GENLST]->mb_mbtypes[i];
- for (i = 0; i < GENLST; i++) {
- if (mbpstat[i]->mb_active == 0)
- continue;
- totfree += mbpstat[i]->mb_mbfree;
- for (j = 1; j < nmbtypes; j++)
- m_mbtypes[j] += mbpstat[i]->mb_mbtypes[j];
- }
+ totmbufs = mbstat->m_mbufs;
/*
* Print totals for different mbuf types.
@@ -159,16 +147,16 @@ showmbufs()
/*
* Print total number of free mbufs.
*/
- if (totfree > 0) {
- mvwprintw(wnd, 1+j, 0, "%-10.10s", "free");
- if (totfree > 60) {
- snprintf(buf, sizeof(buf), " %lu", totfree);
- totfree = 60;
- while(totfree--)
+ if (totmbufs > 0) {
+ mvwprintw(wnd, 1+j, 0, "%-10.10s", "Mbufs");
+ if (totmbufs > 60) {
+ snprintf(buf, sizeof(buf), " %lu", totmbufs);
+ totmbufs = 60;
+ while(totmbufs--)
waddch(wnd, 'X');
waddstr(wnd, buf);
} else {
- while(totfree--)
+ while(totmbufs--)
waddch(wnd, 'X');
}
wclrtoeol(wnd);
@@ -198,23 +186,6 @@ initmbufs()
return 0;
}
- if (sysctlbyname("kern.ipc.mb_statpcpu", NULL, &len, NULL, 0) < 0) {
- error("sysctl getting mbpstat total size failed");
- return 0;
- }
- num_objs = (int)(len / sizeof(struct mbpstat));
- if ((mbpstat = calloc(num_objs, sizeof(struct mbpstat *))) == NULL) {
- error("calloc mbpstat pointers failed");
- return 0;
- }
- if ((mbpstat[0] = calloc(num_objs, sizeof(struct mbpstat))) == NULL) {
- error("calloc mbpstat structures failed");
- return 0;
- }
-
- for (i = 0; i < num_objs; i++)
- mbpstat[i] = mbpstat[0] + i;
-
return 1;
}
@@ -223,7 +194,7 @@ fetchmbufs()
{
size_t len;
- len = num_objs * sizeof(struct mbpstat);
- if (sysctlbyname("kern.ipc.mb_statpcpu", mbpstat[0], &len, NULL, 0) < 0)
- printw("sysctl: mbpstat: %s", strerror(errno));
+ len = sizeof *mbstat;
+ if (sysctlbyname("kern.ipc.mbstat", mbstat, &len, NULL, 0) < 0)
+ printw("sysctl: mbstat: %s", strerror(errno));
}
OpenPOWER on IntegriCloud