summaryrefslogtreecommitdiffstats
path: root/sys/kern/subr_mbuf.c
diff options
context:
space:
mode:
authorbmilekic <bmilekic@FreeBSD.org>2001-06-22 06:35:32 +0000
committerbmilekic <bmilekic@FreeBSD.org>2001-06-22 06:35:32 +0000
commit5d710b296b16892625e76899c986dd7697754050 (patch)
treed56e06b359d94f82a565ce2544371429eb0c93ae /sys/kern/subr_mbuf.c
parent44d1723f45fa955b68fb90fc3e7604ceeea6f0d3 (diff)
downloadFreeBSD-src-5d710b296b16892625e76899c986dd7697754050.zip
FreeBSD-src-5d710b296b16892625e76899c986dd7697754050.tar.gz
Introduce numerous SMP friendly changes to the mbuf allocator. Namely,
introduce a modified allocation mechanism for mbufs and mbuf clusters; one which can scale under SMP and which offers the possibility of resource reclamation to be implemented in the future. Notable advantages: o Reduce contention for SMP by offering per-CPU pools and locks. o Better use of data cache due to per-CPU pools. o Much less code cache pollution due to excessively large allocation macros. o Framework for `grouping' objects from same page together so as to be able to possibly free wired-down pages back to the system if they are no longer needed by the network stacks. Additional things changed with this addition: - Moved some mbuf specific declarations and initializations from sys/conf/param.c into mbuf-specific code where they belong. - m_getclr() has been renamed to m_get_clrd() because the old name is really confusing. m_getclr() HAS been preserved though and is defined to the new name. No tree sweep has been done "to change the interface," as the old name will continue to be supported and is not depracated. The change was merely done because m_getclr() sounds too much like "m_get a cluster." - TEMPORARILY disabled mbtypes statistics displaying in netstat(1) and systat(1) (see TODO below). - Fixed systat(1) to display number of "free mbufs" based on new per-CPU stat structures. - Fixed netstat(1) to display new per-CPU stats based on sysctl-exported per-CPU stat structures. All infos are fetched via sysctl. TODO (in order of priority): - Re-enable mbtypes statistics in both netstat(1) and systat(1) after introducing an SMP friendly way to collect the mbtypes stats under the already introduced per-CPU locks (i.e. hopefully don't use atomic() - it seems too costly for a mere stat update, especially when other locks are already present). - Optionally have systat(1) display not only "total free mbufs" but also "total free mbufs per CPU pool." - Fix minor length-fetching issues in netstat(1) related to recently re-enabled option to read mbuf stats from a core file. - Move reference counters at least for mbuf clusters into an unused portion of the cluster itself, to save space and need to allocate a counter. - Look into introducing resource freeing possibly from a kproc. Reviewed by (in parts): jlemon, jake, silby, terry Tested by: jlemon (Intel & Alpha), mjacob (Intel & Alpha) Preliminary performance measurements: jlemon (and me, obviously) URL: http://people.freebsd.org/~bmilekic/mb_alloc/
Diffstat (limited to 'sys/kern/subr_mbuf.c')
-rw-r--r--sys/kern/subr_mbuf.c1029
1 files changed, 1029 insertions, 0 deletions
diff --git a/sys/kern/subr_mbuf.c b/sys/kern/subr_mbuf.c
new file mode 100644
index 0000000..e285484
--- /dev/null
+++ b/sys/kern/subr_mbuf.c
@@ -0,0 +1,1029 @@
+/*
+ * Copyright (c) 2001
+ * Bosko Milekic <bmilekic@FreeBSD.org>. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_param.h"
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/condvar.h>
+#include <sys/smp.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/domain.h>
+#include <sys/protosw.h>
+#include <vm/vm.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+
+/*
+ * Maximum number of PCPU containers. If you know what you're doing you could
+ * explicitly define MBALLOC_NCPU to be exactly the number of CPUs on your
+ * system during compilation, and thus prevent kernel structure bloats.
+ */
+#ifdef MBALLOC_NCPU
+#define NCPU MBALLOC_NCPU
+#else
+#define NCPU MAXCPU
+#endif
+
+/*
+ * The mbuf allocator is heavily based on Alfred Perlstein's
+ * (alfred@FreeBSD.org) "memcache" allocator which is itself based
+ * on concepts from several per-CPU memory allocators. The difference
+ * between this allocator and memcache is that, among other things:
+ *
+ * (i) We don't free back to the map from the free() routine - we leave the
+ * option of implementing lazy freeing (from a kproc) in the future.
+ *
+ * (ii) We allocate from separate sub-maps of kmem_map, thus limiting the
+ * maximum number of allocatable objects of a given type. Further,
+ * we handle blocking on a cv in the case that the map is starved and
+ * we have to rely solely on cached (circulating) objects.
+ *
+ * The mbuf allocator keeps all objects that it allocates in mb_buckets.
+ * The buckets keep a page worth of objects (an object can be an mbuf or an
+ * mbuf cluster) and facilitate moving larger sets of contiguous objects
+ * from the per-CPU lists to the main list for the given object. The buckets
+ * also have an added advantage in that after several moves from a per-CPU
+ * list to the main list and back to the per-CPU list, contiguous objects
+ * are kept together, thus trying to put the TLB cache to good use.
+ *
+ * The buckets are kept on singly-linked lists called "containers." A container
+ * is protected by a mutex lock in order to ensure consistency. The mutex lock
+ * itself is allocated seperately and attached to the container at boot time,
+ * thus allowing for certain containers to share the same mutex lock. Per-CPU
+ * containers for mbufs and mbuf clusters all share the same per-CPU
+ * lock whereas the "general system" containers (i.e. the "main lists") for
+ * these objects share one global lock.
+ *
+ */
+struct mb_bucket {
+ SLIST_ENTRY(mb_bucket) mb_blist;
+ int mb_owner;
+ int mb_numfree;
+ void *mb_free[0];
+};
+
+struct mb_container {
+ SLIST_HEAD(mc_buckethd, mb_bucket) mc_bhead;
+ struct mtx *mc_lock;
+ int mc_numowner;
+ u_int mc_starved;
+ u_long *mc_objcount;
+ u_long *mc_numpgs;
+};
+
+struct mb_gen_list {
+ struct mb_container mb_cont;
+ struct cv mgl_mstarved;
+};
+
+struct mb_pcpu_list {
+ struct mb_container mb_cont;
+};
+
+/*
+ * Boot-time configurable object counts that will determine the maximum
+ * number of permitted objects in the mbuf and mcluster cases. In the
+ * ext counter (nmbcnt) case, it's just an indicator serving to scale
+ * kmem_map size properly - in other words, we may be allowed to allocate
+ * more than nmbcnt counters, whereas we will never be allowed to allocate
+ * more than nmbufs mbufs or nmbclusters mclusters.
+ * As for nsfbufs, it is used to indicate how many sendfile(2) buffers will be
+ * allocatable by the sfbuf allocator (found in uipc_syscalls.c)
+ */
+#ifndef NMBCLUSTERS
+#define NMBCLUSTERS (1024 + MAXUSERS * 16)
+#endif
+#ifndef NMBUFS
+#define NMBUFS (NMBCLUSTERS * 2)
+#endif
+#ifndef NSFBUFS
+#define NSFBUFS (512 + MAXUSERS * 16)
+#endif
+#ifndef NMBCNTS
+#define NMBCNTS (NMBCLUSTERS + NSFBUFS)
+#endif
+int nmbufs = NMBUFS;
+int nmbclusters = NMBCLUSTERS;
+int nmbcnt = NMBCNTS;
+int nsfbufs = NSFBUFS;
+TUNABLE_INT("kern.ipc.nmbufs", &nmbufs);
+TUNABLE_INT("kern.ipc.nmbclusters", &nmbclusters);
+TUNABLE_INT("kern.ipc.nmbcnt", &nmbcnt);
+TUNABLE_INT("kern.ipc.nsfbufs", &nsfbufs);
+
+/*
+ * Perform sanity checks of tunables declared above.
+ */
+static void
+tunable_mbinit(void *dummy)
+{
+ /*
+ * This has to be done before VM init.
+ */
+ if (nmbufs < nmbclusters * 2)
+ nmbufs = nmbclusters * 2;
+ if (nmbcnt < nmbclusters + nsfbufs)
+ nmbcnt = nmbclusters + nsfbufs;
+
+ return;
+}
+SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL);
+
+/*
+ * The freelist structures and mutex locks. The number statically declared
+ * here depends on the number of CPUs.
+ *
+ * We setup in such a way that all the objects (mbufs, clusters)
+ * share the same mutex lock. It has been established that we do not benefit
+ * from different locks for different objects, so we use the same lock,
+ * regardless of object type.
+ */
+struct mb_lstmngr {
+ struct mb_gen_list *ml_genlist;
+ struct mb_pcpu_list *ml_cntlst[NCPU];
+ struct mb_bucket **ml_btable;
+ vm_map_t ml_map;
+ vm_offset_t ml_mapbase;
+ vm_offset_t ml_maptop;
+ int ml_mapfull;
+ u_int ml_objsize;
+ u_int *ml_wmhigh;
+};
+struct mb_lstmngr mb_list_mbuf, mb_list_clust;
+struct mtx mbuf_gen, mbuf_pcpu[NCPU];
+
+/*
+ * Local macros for internal allocator structure manipulations.
+ */
+#define MB_GET_PCPU_LIST(mb_lst) (mb_lst)->ml_cntlst[PCPU_GET(cpuid)]
+
+#define MB_GET_PCPU_LIST_NUM(mb_lst, num) (mb_lst)->ml_cntlst[(num)]
+
+#define MB_GET_GEN_LIST(mb_lst) (mb_lst)->ml_genlist
+
+#define MB_LOCK_CONT(mb_cnt) mtx_lock((mb_cnt)->mb_cont.mc_lock)
+
+#define MB_UNLOCK_CONT(mb_cnt) mtx_unlock((mb_cnt)->mb_cont.mc_lock)
+
+#define MB_BUCKET_INDX(mb_obj, mb_lst) \
+ (int)(((caddr_t)(mb_obj) - (caddr_t)(mb_lst)->ml_mapbase) / PAGE_SIZE)
+
+#define MB_GET_OBJECT(mb_objp, mb_bckt, mb_lst) \
+{ \
+ struct mc_buckethd *_mchd = &((mb_lst)->mb_cont.mc_bhead); \
+ \
+ (mb_bckt)->mb_numfree--; \
+ (mb_objp) = (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)]; \
+ (*((mb_lst)->mb_cont.mc_objcount))--; \
+ if ((mb_bckt)->mb_numfree == 0) { \
+ SLIST_REMOVE_HEAD(_mchd, mb_blist); \
+ SLIST_NEXT((mb_bckt), mb_blist) = NULL; \
+ (mb_bckt)->mb_owner |= MB_BUCKET_FREE; \
+ } \
+}
+
+#define MB_PUT_OBJECT(mb_objp, mb_bckt, mb_lst) \
+ (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)] = (mb_objp); \
+ (mb_bckt)->mb_numfree++; \
+ (*((mb_lst)->mb_cont.mc_objcount))++;
+
+/*
+ * Ownership of buckets/containers is represented by integers. The PCPU
+ * lists range from 0 to NCPU-1. We need a free numerical id for the general
+ * list (we use NCPU). We also need a non-conflicting free bit to indicate
+ * that the bucket is free and removed from a container, while not losing
+ * the bucket's originating container id. We use the highest bit
+ * for the free marker.
+ */
+#define MB_GENLIST_OWNER (NCPU)
+#define MB_BUCKET_FREE (1 << (sizeof(int) * 8 - 1))
+
+/*
+ * sysctl(8) exported objects
+ */
+struct mbstat mbstat; /* General stats + infos. */
+struct mbpstat mb_statpcpu[NCPU+1]; /* PCPU + Gen. container alloc stats */
+int mbuf_wait = 64; /* Sleep time for wait code (ticks) */
+u_int mbuf_limit = 512; /* Upper lim. on # of mbufs per CPU */
+u_int clust_limit = 128; /* Upper lim. on # of clusts per CPU */
+SYSCTL_DECL(_kern_ipc);
+SYSCTL_INT(_kern_ipc, OID_AUTO, nmbclusters, CTLFLAG_RD, &nmbclusters, 0,
+ "Maximum number of mbuf clusters available");
+SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RD, &nmbufs, 0,
+ "Maximum number of mbufs available");
+SYSCTL_INT(_kern_ipc, OID_AUTO, nmbcnt, CTLFLAG_RD, &nmbcnt, 0,
+ "Number used to scale kmem_map to ensure sufficient space for counters");
+SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RD, &nsfbufs, 0,
+ "Maximum number of sendfile(2) sf_bufs available");
+SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW, &mbuf_wait, 0,
+ "Sleep time of mbuf subsystem wait allocations during exhaustion");
+SYSCTL_UINT(_kern_ipc, OID_AUTO, mbuf_limit, CTLFLAG_RW, &mbuf_limit, 0,
+ "Upper limit of number of mbufs allowed on each PCPU list");
+SYSCTL_UINT(_kern_ipc, OID_AUTO, clust_limit, CTLFLAG_RW, &clust_limit, 0,
+ "Upper limit of number of mbuf clusters allowed on each PCPU list");
+SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat,
+ "Mbuf general information and statistics");
+SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mb_statpcpu, CTLFLAG_RD, mb_statpcpu,
+ sizeof(mb_statpcpu), "S,", "Mbuf allocator per CPU statistics");
+
+/*
+ * Prototypes of local allocator routines.
+ */
+static __inline void *mb_alloc(struct mb_lstmngr *, int);
+void *mb_alloc_wait(struct mb_lstmngr *);
+static __inline void mb_free(struct mb_lstmngr *, void *);
+static void mb_init(void *);
+struct mb_bucket *mb_pop_cont(struct mb_lstmngr *, int,
+ struct mb_pcpu_list *);
+void mb_reclaim(void);
+
+/*
+ * Initial allocation numbers. Each parameter represents the number of buckets
+ * of each object that will be placed initially in each PCPU container for
+ * said object.
+ */
+#define NMB_MBUF_INIT 4
+#define NMB_CLUST_INIT 16
+
+/*
+ * Initialize the mbuf subsystem.
+ *
+ * We sub-divide the kmem_map into several submaps; this way, we don't have
+ * to worry about artificially limiting the number of mbuf or mbuf cluster
+ * allocations, due to fear of one type of allocation "stealing" address
+ * space initially reserved for another.
+ *
+ * Setup both the general containers and all the PCPU containers. Populate
+ * the PCPU containers with initial numbers.
+ */
+MALLOC_DEFINE(M_MBUF, "mbufmgr", "mbuf subsystem management structures");
+SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mb_init, NULL)
+void
+mb_init(void *dummy)
+{
+ struct mb_pcpu_list *pcpu_cnt;
+ vm_size_t mb_map_size;
+ int i, j;
+
+ /*
+ * Setup all the submaps, for each type of object that we deal
+ * with in this allocator.
+ */
+ mb_map_size = (vm_size_t)(nmbufs * MSIZE);
+ mb_map_size = rounddown(mb_map_size, PAGE_SIZE);
+ mb_list_mbuf.ml_btable = malloc((unsigned long)mb_map_size / PAGE_SIZE *
+ sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
+ if (mb_list_mbuf.ml_btable == NULL)
+ goto bad;
+ mb_list_mbuf.ml_map = kmem_suballoc(kmem_map,&(mb_list_mbuf.ml_mapbase),
+ &(mb_list_mbuf.ml_maptop), mb_map_size);
+ mb_list_mbuf.ml_mapfull = 0;
+ mb_list_mbuf.ml_objsize = MSIZE;
+ mb_list_mbuf.ml_wmhigh = &mbuf_limit;
+
+ mb_map_size = (vm_size_t)(nmbclusters * MCLBYTES);
+ mb_map_size = rounddown(mb_map_size, PAGE_SIZE);
+ mb_list_clust.ml_btable = malloc((unsigned long)mb_map_size / PAGE_SIZE
+ * sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
+ if (mb_list_clust.ml_btable == NULL)
+ goto bad;
+ mb_list_clust.ml_map = kmem_suballoc(kmem_map,
+ &(mb_list_clust.ml_mapbase), &(mb_list_clust.ml_maptop),
+ mb_map_size);
+ mb_list_clust.ml_mapfull = 0;
+ mb_list_clust.ml_objsize = MCLBYTES;
+ mb_list_clust.ml_wmhigh = &clust_limit;
+
+ /* XXX XXX XXX: mbuf_map->system_map = clust_map->system_map = 1 */
+
+ /*
+ * Allocate required general (global) containers for each object type.
+ */
+ mb_list_mbuf.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF,
+ M_NOWAIT);
+ mb_list_clust.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF,
+ M_NOWAIT);
+ if ((mb_list_mbuf.ml_genlist == NULL) ||
+ (mb_list_clust.ml_genlist == NULL))
+ goto bad;
+
+ /*
+ * Initialize condition variables and general container mutex locks.
+ */
+ mtx_init(&mbuf_gen, "mbuf subsystem general lists lock", 0);
+ cv_init(&(mb_list_mbuf.ml_genlist->mgl_mstarved), "mbuf pool starved");
+ cv_init(&(mb_list_clust.ml_genlist->mgl_mstarved),
+ "mcluster pool starved");
+ mb_list_mbuf.ml_genlist->mb_cont.mc_lock =
+ mb_list_clust.ml_genlist->mb_cont.mc_lock = &mbuf_gen;
+
+ /*
+ * Setup the general containers for each object.
+ */
+ mb_list_mbuf.ml_genlist->mb_cont.mc_numowner =
+ mb_list_clust.ml_genlist->mb_cont.mc_numowner = MB_GENLIST_OWNER;
+ mb_list_mbuf.ml_genlist->mb_cont.mc_starved =
+ mb_list_clust.ml_genlist->mb_cont.mc_starved = 0;
+ mb_list_mbuf.ml_genlist->mb_cont.mc_objcount =
+ &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbfree);
+ mb_list_clust.ml_genlist->mb_cont.mc_objcount =
+ &(mb_statpcpu[MB_GENLIST_OWNER].mb_clfree);
+ mb_list_mbuf.ml_genlist->mb_cont.mc_numpgs =
+ &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbpgs);
+ mb_list_clust.ml_genlist->mb_cont.mc_numpgs =
+ &(mb_statpcpu[MB_GENLIST_OWNER].mb_clpgs);
+ SLIST_INIT(&(mb_list_mbuf.ml_genlist->mb_cont.mc_bhead));
+ SLIST_INIT(&(mb_list_clust.ml_genlist->mb_cont.mc_bhead));
+
+ /*
+ * Initialize general mbuf statistics
+ */
+ mbstat.m_msize = MSIZE;
+ mbstat.m_mclbytes = MCLBYTES;
+ mbstat.m_minclsize = MINCLSIZE;
+ mbstat.m_mlen = MLEN;
+ mbstat.m_mhlen = MHLEN;
+
+ /*
+ * Allocate and initialize PCPU containers.
+ */
+ for (i = 0; i < mp_ncpus; i++) {
+ mb_list_mbuf.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list),
+ M_MBUF, M_NOWAIT);
+ mb_list_clust.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list),
+ M_MBUF, M_NOWAIT);
+ if ((mb_list_mbuf.ml_cntlst[i] == NULL) ||
+ (mb_list_clust.ml_cntlst[i] == NULL))
+ goto bad;
+
+ mtx_init(&mbuf_pcpu[i], "mbuf PCPU list lock", 0);
+ mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_lock =
+ mb_list_clust.ml_cntlst[i]->mb_cont.mc_lock = &mbuf_pcpu[i];
+
+ mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numowner =
+ mb_list_clust.ml_cntlst[i]->mb_cont.mc_numowner = i;
+ mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_starved =
+ mb_list_clust.ml_cntlst[i]->mb_cont.mc_starved = 0;
+ mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_objcount =
+ &(mb_statpcpu[i].mb_mbfree);
+ mb_list_clust.ml_cntlst[i]->mb_cont.mc_objcount =
+ &(mb_statpcpu[i].mb_clfree);
+ mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numpgs =
+ &(mb_statpcpu[i].mb_mbpgs);
+ mb_list_clust.ml_cntlst[i]->mb_cont.mc_numpgs =
+ &(mb_statpcpu[i].mb_clpgs);
+
+ SLIST_INIT(&(mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_bhead));
+ SLIST_INIT(&(mb_list_clust.ml_cntlst[i]->mb_cont.mc_bhead));
+
+ /*
+ * Perform initial allocations.
+ */
+ pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_mbuf, i);
+ MB_LOCK_CONT(pcpu_cnt);
+ for (j = 0; j < NMB_MBUF_INIT; j++) {
+ if (mb_pop_cont(&mb_list_mbuf, M_DONTWAIT, pcpu_cnt)
+ == NULL)
+ goto bad;
+ }
+ MB_UNLOCK_CONT(pcpu_cnt);
+
+ pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_clust, i);
+ MB_LOCK_CONT(pcpu_cnt);
+ for (j = 0; j < NMB_CLUST_INIT; j++) {
+ if (mb_pop_cont(&mb_list_clust, M_DONTWAIT, pcpu_cnt)
+ == NULL)
+ goto bad;
+ }
+ MB_UNLOCK_CONT(pcpu_cnt);
+ }
+
+ return;
+bad:
+ panic("mb_init(): failed to initialize mbuf subsystem!");
+}
+
+/*
+ * Populate a given mbuf PCPU container with a bucket full of fresh new
+ * buffers. Return a pointer to the new bucket (already in the container if
+ * successful), or return NULL on failure.
+ *
+ * LOCKING NOTES:
+ * PCPU container lock must be held when this is called.
+ * The lock is dropped here so that we can cleanly call the underlying VM
+ * code. If we fail, we return with no locks held. If we succeed (i.e. return
+ * non-NULL), we return with the PCPU lock held, ready for allocation from
+ * the returned bucket.
+ */
+struct mb_bucket *
+mb_pop_cont(struct mb_lstmngr *mb_list, int how, struct mb_pcpu_list *cnt_lst)
+{
+ struct mb_bucket *bucket;
+ caddr_t p;
+ int i;
+
+ MB_UNLOCK_CONT(cnt_lst);
+ /*
+ * If our object's (finite) map is starved now (i.e. no more address
+ * space), bail out now.
+ */
+ if (mb_list->ml_mapfull)
+ return (NULL);
+
+ bucket = malloc(sizeof(struct mb_bucket) +
+ PAGE_SIZE / mb_list->ml_objsize * sizeof(void *), M_MBUF,
+ how == M_TRYWAIT ? M_WAITOK : M_NOWAIT);
+ if (bucket == NULL)
+ return (NULL);
+
+ p = (caddr_t)kmem_malloc(mb_list->ml_map, PAGE_SIZE,
+ how == M_TRYWAIT ? M_WAITOK : M_NOWAIT);
+ if (p == NULL) {
+ free(bucket, M_MBUF);
+ return (NULL);
+ }
+
+ bucket->mb_numfree = 0;
+ mb_list->ml_btable[MB_BUCKET_INDX(p, mb_list)] = bucket;
+ for (i = 0; i < (PAGE_SIZE / mb_list->ml_objsize); i++) {
+ bucket->mb_free[i] = p;
+ bucket->mb_numfree++;
+ p += mb_list->ml_objsize;
+ }
+
+ MB_LOCK_CONT(cnt_lst);
+ bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
+ SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead), bucket, mb_blist);
+ (*(cnt_lst->mb_cont.mc_numpgs))++;
+ *(cnt_lst->mb_cont.mc_objcount) += bucket->mb_numfree;
+
+ return (bucket);
+}
+
+/*
+ * Allocate an mbuf-subsystem type object.
+ * The general case is very easy. Complications only arise if our PCPU
+ * container is empty. Things get worse if the PCPU container is empty,
+ * the general container is empty, and we've run out of address space
+ * in our map; then we try to block if we're willing to (M_TRYWAIT).
+ */
+static __inline
+void *
+mb_alloc(struct mb_lstmngr *mb_list, int how)
+{
+ struct mb_pcpu_list *cnt_lst;
+ struct mb_bucket *bucket;
+ void *m;
+
+ m = NULL;
+ cnt_lst = MB_GET_PCPU_LIST(mb_list);
+ MB_LOCK_CONT(cnt_lst);
+
+ if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) != NULL) {
+ /*
+ * This is the easy allocation case. We just grab an object
+ * from a bucket in the PCPU container. At worst, we
+ * have just emptied the bucket and so we remove it
+ * from the container.
+ */
+ MB_GET_OBJECT(m, bucket, cnt_lst);
+ MB_UNLOCK_CONT(cnt_lst);
+ } else {
+ struct mb_gen_list *gen_list;
+
+ /*
+ * This is the less-common more difficult case. We must
+ * first verify if the general list has anything for us
+ * and if that also fails, we must allocate a page from
+ * the map and create a new bucket to place in our PCPU
+ * container (already locked). If the map is starved then
+ * we're really in for trouble, as we have to wait on
+ * the general container's condition variable.
+ */
+ gen_list = MB_GET_GEN_LIST(mb_list);
+ MB_LOCK_CONT(gen_list);
+
+ if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead)))
+ != NULL) {
+ /*
+ * Give ownership of the bucket to our CPU's
+ * container, but only actually put the bucket
+ * in the container if it doesn't become free
+ * upon removing an mbuf from it.
+ */
+ SLIST_REMOVE_HEAD(&(gen_list->mb_cont.mc_bhead),
+ mb_blist);
+ bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
+ (*(gen_list->mb_cont.mc_numpgs))--;
+ (*(cnt_lst->mb_cont.mc_numpgs))++;
+ *(gen_list->mb_cont.mc_objcount) -= bucket->mb_numfree;
+ bucket->mb_numfree--;
+ m = bucket->mb_free[(bucket->mb_numfree)];
+ if (bucket->mb_numfree == 0) {
+ SLIST_NEXT(bucket, mb_blist) = NULL;
+ bucket->mb_owner |= MB_BUCKET_FREE;
+ } else {
+ SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead),
+ bucket, mb_blist);
+ *(cnt_lst->mb_cont.mc_objcount) +=
+ bucket->mb_numfree;
+ }
+ MB_UNLOCK_CONT(gen_list);
+ MB_UNLOCK_CONT(cnt_lst);
+ } else {
+ /*
+ * We'll have to allocate a new page.
+ */
+ MB_UNLOCK_CONT(gen_list);
+ bucket = mb_pop_cont(mb_list, how, cnt_lst);
+ if (bucket != NULL) {
+ bucket->mb_numfree--;
+ m = bucket->mb_free[(bucket->mb_numfree)];
+ (*(cnt_lst->mb_cont.mc_objcount))--;
+ MB_UNLOCK_CONT(cnt_lst);
+ } else {
+ if (how == M_TRYWAIT) {
+ /*
+ * Absolute worst-case scenario. We block if
+ * we're willing to, but only after trying to
+ * steal from other lists.
+ */
+ mb_list->ml_mapfull = 1;
+ m = mb_alloc_wait(mb_list);
+ } else
+ /* XXX: No consistency. */
+ mbstat.m_drops++;
+ }
+ }
+ }
+
+ return (m);
+}
+
+/*
+ * This is the worst-case scenario called only if we're allocating with
+ * M_TRYWAIT. We first drain all the protocols, then try to find an mbuf
+ * by looking in every PCPU container. If we're still unsuccesful, we
+ * try the general container one last time and possibly block on our
+ * starved cv.
+ */
+void *
+mb_alloc_wait(struct mb_lstmngr *mb_list)
+{
+ struct mb_pcpu_list *cnt_lst;
+ struct mb_gen_list *gen_list;
+ struct mb_bucket *bucket;
+ void *m;
+ int i, cv_ret;
+
+ /*
+ * Try to reclaim mbuf-related objects (mbufs, clusters).
+ */
+ mb_reclaim();
+
+ /*
+ * Cycle all the PCPU containers. Increment starved counts if found
+ * empty.
+ */
+ for (i = 0; i < mp_ncpus; i++) {
+ cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, i);
+ MB_LOCK_CONT(cnt_lst);
+
+ /*
+ * If container is non-empty, get a single object from it.
+ * If empty, increment starved count.
+ */
+ if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) !=
+ NULL) {
+ MB_GET_OBJECT(m, bucket, cnt_lst);
+ MB_UNLOCK_CONT(cnt_lst);
+ mbstat.m_wait++; /* XXX: No consistency. */
+ return (m);
+ } else
+ cnt_lst->mb_cont.mc_starved++;
+
+ MB_UNLOCK_CONT(cnt_lst);
+ }
+
+ /*
+ * We're still here, so that means it's time to get the general
+ * container lock, check it one more time (now that mb_reclaim()
+ * has been called) and if we still get nothing, block on the cv.
+ */
+ gen_list = MB_GET_GEN_LIST(mb_list);
+ MB_LOCK_CONT(gen_list);
+ if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL) {
+ MB_GET_OBJECT(m, bucket, gen_list);
+ MB_UNLOCK_CONT(gen_list);
+ mbstat.m_wait++; /* XXX: No consistency. */
+ return (m);
+ }
+
+ gen_list->mb_cont.mc_starved++;
+ cv_ret = cv_timedwait(&(gen_list->mgl_mstarved),
+ gen_list->mb_cont.mc_lock, mbuf_wait);
+ gen_list->mb_cont.mc_starved--;
+
+ if ((cv_ret == 0) &&
+ ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL)) {
+ MB_GET_OBJECT(m, bucket, gen_list);
+ mbstat.m_wait++; /* XXX: No consistency. */
+ } else {
+ mbstat.m_drops++; /* XXX: No consistency. */
+ m = NULL;
+ }
+
+ MB_UNLOCK_CONT(gen_list);
+
+ return (m);
+}
+
+/*
+ * Free an object to its rightful container.
+ * In the very general case, this operation is really very easy.
+ * Complications arise primarily if:
+ * (a) We've hit the high limit on number of free objects allowed in
+ * our PCPU container.
+ * (b) We're in a critical situation where our container has been
+ * marked 'starved' and we need to issue wakeups on the starved
+ * condition variable.
+ * (c) Minor (odd) cases: our bucket has migrated while we were
+ * waiting for the lock; our bucket is in the general container;
+ * our bucket is empty.
+ */
+static __inline
+void
+mb_free(struct mb_lstmngr *mb_list, void *m)
+{
+ struct mb_pcpu_list *cnt_lst;
+ struct mb_gen_list *gen_list;
+ struct mb_bucket *bucket;
+ u_int owner;
+
+ bucket = mb_list->ml_btable[MB_BUCKET_INDX(m, mb_list)];
+
+ /*
+ * Make sure that if after we lock the bucket's present container the
+ * bucket has migrated, that we drop the lock and get the new one.
+ */
+retry_lock:
+ owner = bucket->mb_owner & ~MB_BUCKET_FREE;
+ switch (owner) {
+ case MB_GENLIST_OWNER:
+ gen_list = MB_GET_GEN_LIST(mb_list);
+ MB_LOCK_CONT(gen_list);
+ if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) {
+ MB_UNLOCK_CONT(gen_list);
+ goto retry_lock;
+ }
+
+ /*
+ * If we're intended for the general container, this is
+ * real easy: no migrating required. The only `bogon'
+ * is that we're now contending with all the threads
+ * dealing with the general list, but this is expected.
+ */
+ MB_PUT_OBJECT(m, bucket, gen_list);
+ if (gen_list->mb_cont.mc_starved > 0)
+ cv_signal(&(gen_list->mgl_mstarved));
+ MB_UNLOCK_CONT(gen_list);
+ break;
+
+ default:
+ cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, owner);
+ MB_LOCK_CONT(cnt_lst);
+ if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) {
+ MB_UNLOCK_CONT(cnt_lst);
+ goto retry_lock;
+ }
+
+ MB_PUT_OBJECT(m, bucket, cnt_lst);
+
+ if (cnt_lst->mb_cont.mc_starved > 0) {
+ /*
+ * This is a tough case. It means that we've
+ * been flagged at least once to indicate that
+ * we're empty, and that the system is in a critical
+ * situation, so we ought to migrate at least one
+ * bucket over to the general container.
+ * There may or may not be a thread blocking on
+ * the starved condition variable, but chances
+ * are that one will eventually come up soon so
+ * it's better to migrate now than never.
+ */
+ gen_list = MB_GET_GEN_LIST(mb_list);
+ MB_LOCK_CONT(gen_list);
+ KASSERT((bucket->mb_owner & MB_BUCKET_FREE) != 0,
+ ("mb_free: corrupt bucket %p\n", bucket));
+ SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead),
+ bucket, mb_blist);
+ bucket->mb_owner = MB_GENLIST_OWNER;
+ (*(cnt_lst->mb_cont.mc_objcount))--;
+ (*(gen_list->mb_cont.mc_objcount))++;
+ (*(cnt_lst->mb_cont.mc_numpgs))--;
+ (*(gen_list->mb_cont.mc_numpgs))++;
+
+ /*
+ * Determine whether or not to keep transferring
+ * buckets to the general list or whether we've
+ * transferred enough already.
+ * We realize that although we may flag another
+ * bucket to be migrated to the general container
+ * that in the meantime, the thread that was
+ * blocked on the cv is already woken up and
+ * long gone. But in that case, the worst
+ * consequence is that we will end up migrating
+ * one bucket too many, which is really not a big
+ * deal, especially if we're close to a critical
+ * situation.
+ */
+ if (gen_list->mb_cont.mc_starved > 0) {
+ cnt_lst->mb_cont.mc_starved--;
+ cv_signal(&(gen_list->mgl_mstarved));
+ } else
+ cnt_lst->mb_cont.mc_starved = 0;
+
+ MB_UNLOCK_CONT(gen_list);
+ MB_UNLOCK_CONT(cnt_lst);
+ break;
+ }
+
+ if (*(cnt_lst->mb_cont.mc_objcount) > *(mb_list->ml_wmhigh)) {
+ /*
+ * We've hit the high limit of allowed numbers of mbufs
+ * on this PCPU list. We must now migrate a bucket
+ * over to the general container.
+ */
+ gen_list = MB_GET_GEN_LIST(mb_list);
+ MB_LOCK_CONT(gen_list);
+ if ((bucket->mb_owner & MB_BUCKET_FREE) == 0) {
+ bucket =
+ SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead));
+ SLIST_REMOVE_HEAD(&(cnt_lst->mb_cont.mc_bhead),
+ mb_blist);
+ }
+ SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead),
+ bucket, mb_blist);
+ bucket->mb_owner = MB_GENLIST_OWNER;
+ *(cnt_lst->mb_cont.mc_objcount) -= bucket->mb_numfree;
+ *(gen_list->mb_cont.mc_objcount) += bucket->mb_numfree;
+ (*(cnt_lst->mb_cont.mc_numpgs))--;
+ (*(gen_list->mb_cont.mc_numpgs))++;
+
+ MB_UNLOCK_CONT(gen_list);
+ MB_UNLOCK_CONT(cnt_lst);
+ break;
+ }
+
+ if (bucket->mb_owner & MB_BUCKET_FREE) {
+ SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead),
+ bucket, mb_blist);
+ bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
+ }
+
+ MB_UNLOCK_CONT(cnt_lst);
+ break;
+ }
+
+ return;
+}
+
+/*
+ * Drain protocols in hopes to free up some resources.
+ *
+ * LOCKING NOTES:
+ * No locks should be held when this is called. The drain routines have to
+ * presently acquire some locks which raises the possibility of lock order
+ * violation if we're holding any mutex if that mutex is acquired in reverse
+ * order relative to one of the locks in the drain routines.
+ */
+void
+mb_reclaim(void)
+{
+ struct domain *dp;
+ struct protosw *pr;
+
+/*
+ * XXX: Argh, we almost always trip here with witness turned on now-a-days
+ * XXX: because we often come in with Giant held. For now, there's no way
+ * XXX: to avoid this.
+ */
+#ifdef WITNESS
+ KASSERT(witness_list(curproc) == 0,
+ ("mb_reclaim() called with locks held"));
+#endif
+
+ mbstat.m_drain++; /* XXX: No consistency. */
+
+ for (dp = domains; dp; dp = dp->dom_next)
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
+ if (pr->pr_drain)
+ (*pr->pr_drain)();
+
+}
+
+/*
+ * Local mbuf & cluster alloc macros and routines.
+ * Local macro and function names begin with an underscore ("_").
+ */
+void _mext_free(struct mbuf *);
+void _mclfree(struct mbuf *);
+
+#define _m_get(m, how, type) do { \
+ (m) = (struct mbuf *)mb_alloc(&mb_list_mbuf, (how)); \
+ if ((m) != NULL) { \
+ (m)->m_type = (type); \
+ (m)->m_next = NULL; \
+ (m)->m_nextpkt = NULL; \
+ (m)->m_data = (m)->m_dat; \
+ (m)->m_flags = 0; \
+ } \
+} while (0)
+
+#define _m_gethdr(m, how, type) do { \
+ (m) = (struct mbuf *)mb_alloc(&mb_list_mbuf, (how)); \
+ if ((m) != NULL) { \
+ (m)->m_type = (type); \
+ (m)->m_next = NULL; \
+ (m)->m_nextpkt = NULL; \
+ (m)->m_data = (m)->m_pktdat; \
+ (m)->m_flags = M_PKTHDR; \
+ (m)->m_pkthdr.rcvif = NULL; \
+ (m)->m_pkthdr.csum_flags = 0; \
+ (m)->m_pkthdr.aux = NULL; \
+ } \
+} while (0)
+
+/* XXX: Check for M_PKTHDR && m_pkthdr.aux is bogus... please fix (see KAME) */
+#define _m_free(m, n) do { \
+ (n) = (m)->m_next; \
+ if ((m)->m_flags & M_EXT) \
+ MEXTFREE((m)); \
+ if (((m)->m_flags & M_PKTHDR) != 0 && (m)->m_pkthdr.aux) { \
+ m_freem((m)->m_pkthdr.aux); \
+ (m)->m_pkthdr.aux = NULL; \
+ } \
+ mb_free(&mb_list_mbuf, (m)); \
+} while (0)
+
+#define _mext_init_ref(m) do { \
+ (m)->m_ext.ref_cnt = malloc(sizeof(u_int), M_MBUF, M_NOWAIT); \
+ if ((m)->m_ext.ref_cnt != NULL) { \
+ *((m)->m_ext.ref_cnt) = 0; \
+ MEXT_ADD_REF((m)); \
+ } \
+} while (0)
+
+#define _mext_dealloc_ref(m) \
+ free((m)->m_ext.ref_cnt, M_MBUF)
+
+void
+_mext_free(struct mbuf *mb)
+{
+
+ if (mb->m_ext.ext_type == EXT_CLUSTER)
+ mb_free(&mb_list_clust, (caddr_t)mb->m_ext.ext_buf);
+ else
+ (*(mb->m_ext.ext_free))(mb->m_ext.ext_buf, mb->m_ext.ext_args);
+
+ _mext_dealloc_ref(mb);
+ return;
+}
+
+/* We only include this here to avoid making m_clget() excessively large
+ * due to too much inlined code. */
+void
+_mclfree(struct mbuf *mb)
+{
+
+ mb_free(&mb_list_clust, (caddr_t)mb->m_ext.ext_buf);
+ mb->m_ext.ext_buf = NULL;
+ return;
+}
+
+/*
+ * Exported space allocation and de-allocation routines.
+ */
+struct mbuf *
+m_get(int how, int type)
+{
+ struct mbuf *mb;
+
+ _m_get(mb, how, type);
+ return (mb);
+}
+
+struct mbuf *
+m_gethdr(int how, int type)
+{
+ struct mbuf *mb;
+
+ _m_gethdr(mb, how, type);
+ return (mb);
+}
+
+struct mbuf *
+m_get_clrd(int how, int type)
+{
+ struct mbuf *mb;
+
+ _m_get(mb, how, type);
+
+ if (mb != NULL)
+ bzero(mtod(mb, caddr_t), MLEN);
+
+ return (mb);
+}
+
+struct mbuf *
+m_gethdr_clrd(int how, int type)
+{
+ struct mbuf *mb;
+
+ _m_gethdr(mb, how, type);
+
+ if (mb != NULL)
+ bzero(mtod(mb, caddr_t), MHLEN);
+
+ return (mb);
+}
+
+struct mbuf *
+m_free(struct mbuf *mb)
+{
+ struct mbuf *nb;
+
+ _m_free(mb, nb);
+ return (nb);
+}
+
+void
+m_clget(struct mbuf *mb, int how)
+{
+
+ mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust, how);
+ if (mb->m_ext.ext_buf != NULL) {
+ _mext_init_ref(mb);
+ if (mb->m_ext.ref_cnt == NULL)
+ _mclfree(mb);
+ else {
+ mb->m_data = mb->m_ext.ext_buf;
+ mb->m_flags |= M_EXT;
+ mb->m_ext.ext_free = NULL;
+ mb->m_ext.ext_args = NULL;
+ mb->m_ext.ext_size = MCLBYTES;
+ mb->m_ext.ext_type = EXT_CLUSTER;
+ }
+ }
+ return;
+}
+
+void
+m_extadd(struct mbuf *mb, caddr_t buf, u_int size,
+ void (*freef)(caddr_t, void *), void *args, short flags, int type)
+{
+
+ _mext_init_ref(mb);
+ if (mb->m_ext.ref_cnt != NULL) {
+ mb->m_flags |= (M_EXT | flags);
+ mb->m_ext.ext_buf = buf;
+ mb->m_data = mb->m_ext.ext_buf;
+ mb->m_ext.ext_size = size;
+ mb->m_ext.ext_free = freef;
+ mb->m_ext.ext_args = args;
+ mb->m_ext.ext_type = type;
+ }
+ return;
+}
OpenPOWER on IntegriCloud