path: root/sys/kern/subr_mbuf.c
author    bmilekic <bmilekic@FreeBSD.org>  2003-02-20 04:26:58 +0000
committer bmilekic <bmilekic@FreeBSD.org>  2003-02-20 04:26:58 +0000
commit    26ba0eb55ca13bca8cb911b471a949574a3b513b (patch)
tree      01e5c8c145370b4dd9231e04fd70b4698c7d7db5 /sys/kern/subr_mbuf.c
parent    569e9d1a868a053312375c90c3f0c7ab35d18e47 (diff)
download  FreeBSD-src-26ba0eb55ca13bca8cb911b471a949574a3b513b.zip
          FreeBSD-src-26ba0eb55ca13bca8cb911b471a949574a3b513b.tar.gz
o Allow "buckets" in mb_alloc to be differently sized (according to
compile-time constants). That is, a "bucket" now is not necessarily a page-worth of mbufs or clusters, but it is MBUF_BUCK_SZ, CLUS_BUCK_SZ worth of mbufs, clusters. o Rename {mbuf,clust}_limit to {mbuf,clust}_hiwm and introduce {mbuf,clust}_lowm, which currently has no effect but will be used to set the low watermarks. o Fix netstat so that it can deal with the differently-sized buckets and teach it about the low watermarks too. o Make sure the per-cpu stats for an absent CPU has mb_active set to 0, explicitly. o Get rid of the allocate refcounts from mbuf map mess. Instead, just malloc() the refcounts in one shot from mbuf_init() o Clean up / update comments in subr_mbuf.c
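To make the new sizing concrete, here is a minimal userland sketch of the
arithmetic involved, assuming the illustrative values PAGE_SIZE = 4096,
MSIZE = 256 and MCLBYTES = 2048 (the kernel takes the real values from its
own headers):

    #include <stdio.h>

    /* Illustrative values only; the kernel gets these from its headers. */
    #define PAGE_SIZE 4096
    #define MSIZE     256                   /* bytes per mbuf */
    #define MCLBYTES  2048                  /* bytes per mbuf cluster */

    /* Bucket sizes introduced by this commit (bytes per bucket). */
    #define MBUF_BUCK_SZ  (PAGE_SIZE * 2)
    #define CLUST_BUCK_SZ (PAGE_SIZE * 4)

    int
    main(void)
    {
            /* A bucket is no longer one page; it holds BUCK_SZ / objsize objects. */
            printf("mbufs per bucket:    %d\n", MBUF_BUCK_SZ / MSIZE);      /* 32 */
            printf("clusters per bucket: %d\n", CLUST_BUCK_SZ / MCLBYTES);  /* 8 */
            return (0);
    }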
Diffstat (limited to 'sys/kern/subr_mbuf.c')
-rw-r--r--  sys/kern/subr_mbuf.c  185
1 file changed, 112 insertions(+), 73 deletions(-)
diff --git a/sys/kern/subr_mbuf.c b/sys/kern/subr_mbuf.c
index 171ccae..4facff8 100644
--- a/sys/kern/subr_mbuf.c
+++ b/sys/kern/subr_mbuf.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2001, 2002
+ * Copyright (c) 2001, 2002, 2003
* Bosko Milekic <bmilekic@FreeBSD.org>. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -51,9 +51,11 @@
#include <vm/pmap.h>
#include <vm/vm_map.h>
-/******************************************************************************
- * mb_alloc mbuf and cluster allocator.
- *
+/*
+ * mb_alloc: network buffer allocator
+ */
+
+/*
* Maximum number of PCPU containers. If you know what you're doing you could
* explicitly define MBALLOC_NCPU to be exactly the number of CPUs on your
* system during compilation, and thus prevent kernel structure bloat.
@@ -70,34 +72,50 @@
#endif
/*-
- * The mbuf allocator is heavily based on Alfred Perlstein's
- * (alfred@FreeBSD.org) "memcache" allocator which is itself based
- * on concepts from several per-CPU memory allocators. The difference
- * between this allocator and memcache is that, among other things:
+ * The mbuf allocator is based on Alfred Perlstein's <alfred@FreeBSD.org>
+ * "memcache" proof-of-concept allocator which was itself based on
+ * several well-known SMP-friendly allocators.
+ *
+ * The mb_alloc mbuf allocator is special when compared to other
+ * general-purpose allocators. Some things to take note of:
*
- * (i) We don't free back to the map from the free() routine - we leave the
- * option of implementing lazy freeing (from a kproc) in the future.
+ * Mbufs and mbuf clusters are two different objects. Sometimes we
+ * will allocate a single mbuf, other times a single cluster,
+ * other times both. Further, we may sometimes wish to allocate a
+ * whole chain of mbufs with clusters. This allocator will perform
+ * the common case of each scenario in one function call (this
+ * includes constructing or destructing the object) while only
+ * locking/unlocking the cache once, if it can get away with it.
+ * The caches consist of pure mbufs and pure clusters; that is,
+ * there are no 'zones' containing mbufs with already pre-hooked
+ * clusters. Since we can allocate both objects atomically anyway,
+ * we don't bother fragmenting our caches for any particular 'scenarios.'
*
- * (ii) We allocate from separate sub-maps of kmem_map, thus limiting the
- * maximum number of allocatable objects of a given type. Further,
- * we handle blocking on a cv in the case that the map is starved and
- * we have to rely solely on cached (circulating) objects.
+ * We allocate from separate sub-maps of kmem_map, thus imposing
+ * an ultimate upper-limit on the number of allocatable clusters
+ * and mbufs and also, since the clusters all come from a
+ * virtually contiguous region, we can keep reference counters
+ * for them and "allocate" them purely by indexing into a
+ * dense refcount vector.
+ *
+ * We call out to protocol drain routines (which can be hooked
+ * into us) when we're low on space.
*
* The mbuf allocator keeps all objects that it allocates in mb_buckets.
- * The buckets keep a page worth of objects (an object can be an mbuf or an
+ * The buckets keep a number of objects (an object can be an mbuf or an
* mbuf cluster) and facilitate moving larger sets of contiguous objects
- * from the per-CPU lists to the main list for the given object. The buckets
- * also have an added advantage in that after several moves from a per-CPU
- * list to the main list and back to the per-CPU list, contiguous objects
- * are kept together, thus trying to put the TLB cache to good use.
+ * from the per-CPU caches to the global cache. The buckets also have
+ * the added advantage that objects, when migrated from cache to cache,
+ * are migrated in chunks that keep contiguous objects together,
+ * minimizing TLB pollution.
*
* The buckets are kept on singly-linked lists called "containers." A container
- * is protected by a mutex lock in order to ensure consistency. The mutex lock
+ * is protected by a mutex in order to ensure consistency. The mutex
* itself is allocated separately and attached to the container at boot time,
- * thus allowing for certain containers to share the same mutex lock. Per-CPU
- * containers for mbufs and mbuf clusters all share the same per-CPU
- * lock whereas the "general system" containers (i.e., the "main lists") for
- * these objects share one global lock.
+ * thus allowing for certain containers to share the same lock. Per-CPU
+ * containers for mbufs and mbuf clusters all share the same per-CPU
+ * lock whereas the global cache containers for these objects share one
+ * global lock.
*/
struct mb_bucket {
SLIST_ENTRY(mb_bucket) mb_blist;
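The "dense refcount vector" mentioned in the comment above can be
illustrated with a small standalone sketch; the array and function names
below are made up for illustration (the kernel keeps its counters in
cl_refcntmap and indexes by the cluster's offset in its sub-map):

    #include <stdio.h>

    #define MCLBYTES    2048                /* illustrative cluster size */
    #define NMBCLUSTERS 1024                /* illustrative cluster count */

    static char         clust_map[NMBCLUSTERS * MCLBYTES]; /* stands in for the cluster sub-map */
    static unsigned int cl_refcnt[NMBCLUSTERS];             /* one counter per cluster */

    /* Find a cluster's refcount purely by pointer arithmetic: no per-object header. */
    static unsigned int *
    cl_refcnt_of(void *cl)
    {
            return (&cl_refcnt[((char *)cl - clust_map) / MCLBYTES]);
    }

    int
    main(void)
    {
            void *cl = clust_map + 5 * MCLBYTES;    /* pretend this cluster was allocated */

            (*cl_refcnt_of(cl))++;
            printf("refcount of cluster 5: %u\n", cl_refcnt[5]);    /* prints 1 */
            return (0);
    }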
@@ -113,7 +131,7 @@ struct mb_container {
u_int mc_starved;
long *mc_types;
u_long *mc_objcount;
- u_long *mc_numpgs;
+ u_long *mc_numbucks;
};
struct mb_gen_list {
@@ -153,6 +171,13 @@ int nmbcnt;
int nsfbufs;
/*
+ * Bucket sizes, in bytes: each bucket holds this much worth of mbufs
+ * or clusters.  Please keep these sizes powers of 2.
+ */
+#define MBUF_BUCK_SZ (PAGE_SIZE * 2)
+#define CLUST_BUCK_SZ (PAGE_SIZE * 4)
+
+/*
* Perform sanity checks of tunables declared above.
*/
static void
@@ -197,7 +222,9 @@ struct mb_lstmngr {
vm_offset_t ml_maptop;
int ml_mapfull;
u_int ml_objsize;
+ u_int ml_objbucks;
u_int *ml_wmhigh;
+ u_int *ml_wmlow;
};
static struct mb_lstmngr mb_list_mbuf, mb_list_clust;
static struct mtx mbuf_gen, mbuf_pcpu[NCPU];
@@ -222,7 +249,8 @@ u_int *cl_refcntmap;
(mb_lst)->ml_cntlst[(num)]
#define MB_BUCKET_INDX(mb_obj, mb_lst) \
- (int)(((caddr_t)(mb_obj) - (caddr_t)(mb_lst)->ml_mapbase) / PAGE_SIZE)
+ (int)(((caddr_t)(mb_obj) - (caddr_t)(mb_lst)->ml_mapbase) / \
+ ((mb_lst)->ml_objbucks * (mb_lst)->ml_objsize))
#define MB_GET_OBJECT(mb_objp, mb_bckt, mb_lst) \
{ \
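A worked example of the revised MB_BUCKET_INDX() arithmetic, as a
standalone sketch (the values are illustrative, and OBJS_PER_BUCK stands in
for the kernel's ml_objbucks field):

    #include <stdio.h>

    #define MSIZE         256                       /* illustrative mbuf size */
    #define MBUF_BUCK_SZ  (4096 * 2)                /* bytes per mbuf bucket */
    #define OBJS_PER_BUCK (MBUF_BUCK_SZ / MSIZE)    /* plays the role of ml_objbucks */

    int
    main(void)
    {
            static char map[16 * MBUF_BUCK_SZ];     /* stands in for the mbuf sub-map */
            char *obj = map + 3 * MBUF_BUCK_SZ + 7 * MSIZE; /* 8th mbuf of the 4th bucket */

            /* Same arithmetic as MB_BUCKET_INDX(): byte offset / bucket size in bytes. */
            int indx = (int)((obj - map) / (OBJS_PER_BUCK * MSIZE));
            printf("bucket index = %d\n", indx);    /* prints 3 */
            return (0);
    }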
@@ -269,8 +297,10 @@ struct mbstat mbstat;
/* Sleep time for wait code (in ticks). */
static int mbuf_wait = 64;
-static u_int mbuf_limit = 512; /* Upper limit on # of mbufs per CPU. */
-static u_int clust_limit = 128; /* Upper limit on # of clusters per CPU. */
+static u_int mbuf_hiwm = 512; /* High wm on # of mbufs per cache */
+static u_int mbuf_lowm = 128; /* Low wm on # of mbufs per cache */
+static u_int clust_hiwm = 128; /* High wm on # of clusters per cache */
+static u_int clust_lowm = 16; /* Low wm on # of clusters per cache */
/*
* Objects exported by sysctl(8).
@@ -286,10 +316,14 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RD, &nsfbufs, 0,
"Maximum number of sendfile(2) sf_bufs available");
SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW, &mbuf_wait, 0,
"Sleep time of mbuf subsystem wait allocations during exhaustion");
-SYSCTL_UINT(_kern_ipc, OID_AUTO, mbuf_limit, CTLFLAG_RW, &mbuf_limit, 0,
- "Upper limit of number of mbufs allowed on each PCPU list");
-SYSCTL_UINT(_kern_ipc, OID_AUTO, clust_limit, CTLFLAG_RW, &clust_limit, 0,
- "Upper limit of number of mbuf clusters allowed on each PCPU list");
+SYSCTL_UINT(_kern_ipc, OID_AUTO, mbuf_hiwm, CTLFLAG_RW, &mbuf_hiwm, 0,
+ "Upper limit of number of mbufs allowed in each cache");
+SYSCTL_UINT(_kern_ipc, OID_AUTO, mbuf_lowm, CTLFLAG_RW, &mbuf_lowm, 0,
+ "Lower limit of number of mbufs allowed in each cache");
+SYSCTL_UINT(_kern_ipc, OID_AUTO, clust_hiwm, CTLFLAG_RW, &clust_hiwm, 0,
+ "Upper limit of number of mbuf clusters allowed in each cache");
+SYSCTL_UINT(_kern_ipc, OID_AUTO, clust_lowm, CTLFLAG_RW, &clust_lowm, 0,
+ "Lower limit of number of mbuf clusters allowed in each cache");
SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat,
"Mbuf general information and statistics");
SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mb_statpcpu, CTLFLAG_RD, mb_statpcpu,
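The new knobs should be reachable from userland under kern.ipc; here is a
minimal read-only sketch using the standard sysctlbyname(3) interface (the
declarations above mark them CTLFLAG_RW, so they could also be written):

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdio.h>

    int
    main(void)
    {
            u_int hiwm, lowm;
            size_t len;

            /* Read the per-cache mbuf watermarks exported by the SYSCTL_UINTs above. */
            len = sizeof(hiwm);
            if (sysctlbyname("kern.ipc.mbuf_hiwm", &hiwm, &len, NULL, 0) == 0)
                    printf("kern.ipc.mbuf_hiwm = %u\n", hiwm);
            len = sizeof(lowm);
            if (sysctlbyname("kern.ipc.mbuf_lowm", &lowm, &len, NULL, 0) == 0)
                    printf("kern.ipc.mbuf_lowm = %u\n", lowm);
            return (0);
    }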
@@ -309,8 +343,8 @@ static void mbuf_init(void *);
* of each object that will be placed initially in each PCPU container for
* said object.
*/
-#define NMB_MBUF_INIT 4
-#define NMB_CLUST_INIT 16
+#define NMB_MBUF_INIT 2
+#define NMB_CLUST_INIT 8
/*
* Internal flags that allow for cache locks to remain "persistent" across
@@ -341,14 +375,12 @@ mbuf_init(void *dummy)
/*
* Set up all the submaps, for each type of object that we deal
- * with in this allocator. We also allocate space for the cluster
- * ref. counts in the mbuf map (and not the cluster map) in order to
- * give clusters a nice contiguous address space without any holes.
+ * with in this allocator.
*/
- mb_map_size = (vm_size_t)(nmbufs * MSIZE + nmbclusters * sizeof(u_int));
- mb_map_size = rounddown(mb_map_size, PAGE_SIZE);
- mb_list_mbuf.ml_btable = malloc((unsigned long)mb_map_size / PAGE_SIZE *
- sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
+ mb_map_size = (vm_size_t)(nmbufs * MSIZE);
+ mb_map_size = rounddown(mb_map_size, MBUF_BUCK_SZ);
+ mb_list_mbuf.ml_btable = malloc((unsigned long)mb_map_size /
+ MBUF_BUCK_SZ * sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
if (mb_list_mbuf.ml_btable == NULL)
goto bad;
mb_list_mbuf.ml_map = kmem_suballoc(kmem_map,&(mb_list_mbuf.ml_mapbase),
@@ -356,12 +388,14 @@ mbuf_init(void *dummy)
mb_list_mbuf.ml_map->system_map = 1;
mb_list_mbuf.ml_mapfull = 0;
mb_list_mbuf.ml_objsize = MSIZE;
- mb_list_mbuf.ml_wmhigh = &mbuf_limit;
+ mb_list_mbuf.ml_objbucks = MBUF_BUCK_SZ / MSIZE;
+ mb_list_mbuf.ml_wmhigh = &mbuf_hiwm;
+ mb_list_mbuf.ml_wmlow = &mbuf_lowm;
mb_map_size = (vm_size_t)(nmbclusters * MCLBYTES);
- mb_map_size = rounddown(mb_map_size, PAGE_SIZE);
- mb_list_clust.ml_btable = malloc((unsigned long)mb_map_size / PAGE_SIZE
- * sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
+ mb_map_size = rounddown(mb_map_size, CLUST_BUCK_SZ);
+ mb_list_clust.ml_btable = malloc((unsigned long)mb_map_size /
+ CLUST_BUCK_SZ * sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
if (mb_list_clust.ml_btable == NULL)
goto bad;
mb_list_clust.ml_map = kmem_suballoc(kmem_map,
@@ -370,7 +404,9 @@ mbuf_init(void *dummy)
mb_list_clust.ml_map->system_map = 1;
mb_list_clust.ml_mapfull = 0;
mb_list_clust.ml_objsize = MCLBYTES;
- mb_list_clust.ml_wmhigh = &clust_limit;
+ mb_list_clust.ml_objbucks = CLUST_BUCK_SZ / MCLBYTES;
+ mb_list_clust.ml_wmhigh = &clust_hiwm;
+ mb_list_clust.ml_wmlow = &clust_lowm;
/*
* Allocate required general (global) containers for each object type.
@@ -404,10 +440,10 @@ mbuf_init(void *dummy)
&(mb_statpcpu[MB_GENLIST_OWNER].mb_mbfree);
mb_list_clust.ml_genlist->mb_cont.mc_objcount =
&(mb_statpcpu[MB_GENLIST_OWNER].mb_clfree);
- mb_list_mbuf.ml_genlist->mb_cont.mc_numpgs =
- &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbpgs);
- mb_list_clust.ml_genlist->mb_cont.mc_numpgs =
- &(mb_statpcpu[MB_GENLIST_OWNER].mb_clpgs);
+ mb_list_mbuf.ml_genlist->mb_cont.mc_numbucks =
+ &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbbucks);
+ mb_list_clust.ml_genlist->mb_cont.mc_numbucks =
+ &(mb_statpcpu[MB_GENLIST_OWNER].mb_clbucks);
mb_list_mbuf.ml_genlist->mb_cont.mc_types =
&(mb_statpcpu[MB_GENLIST_OWNER].mb_mbtypes[0]);
mb_list_clust.ml_genlist->mb_cont.mc_types = NULL;
@@ -418,8 +454,7 @@ mbuf_init(void *dummy)
* Allocate all the required counters for clusters. This makes
* cluster allocations/deallocations much faster.
*/
- cl_refcntmap = (u_int *)kmem_malloc(mb_list_mbuf.ml_map,
- roundup(nmbclusters * sizeof(u_int), MSIZE), M_NOWAIT);
+ cl_refcntmap = malloc(nmbclusters * sizeof(u_int), M_MBUF, M_NOWAIT);
if (cl_refcntmap == NULL)
goto bad;
@@ -432,13 +467,17 @@ mbuf_init(void *dummy)
mbstat.m_mlen = MLEN;
mbstat.m_mhlen = MHLEN;
mbstat.m_numtypes = MT_NTYPES;
+ mbstat.m_mbperbuck = MBUF_BUCK_SZ / MSIZE;
+ mbstat.m_clperbuck = CLUST_BUCK_SZ / MCLBYTES;
/*
* Allocate and initialize PCPU containers.
*/
for (i = 0; i < NCPU; i++) {
- if (CPU_ABSENT(i))
+ if (CPU_ABSENT(i)) {
+ mb_statpcpu[i].mb_active = 0;
continue;
+ }
mb_list_mbuf.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list),
M_MBUF, M_NOWAIT);
@@ -461,10 +500,10 @@ mbuf_init(void *dummy)
&(mb_statpcpu[i].mb_mbfree);
mb_list_clust.ml_cntlst[i]->mb_cont.mc_objcount =
&(mb_statpcpu[i].mb_clfree);
- mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numpgs =
- &(mb_statpcpu[i].mb_mbpgs);
- mb_list_clust.ml_cntlst[i]->mb_cont.mc_numpgs =
- &(mb_statpcpu[i].mb_clpgs);
+ mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numbucks =
+ &(mb_statpcpu[i].mb_mbbucks);
+ mb_list_clust.ml_cntlst[i]->mb_cont.mc_numbucks =
+ &(mb_statpcpu[i].mb_clbucks);
mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_types =
&(mb_statpcpu[i].mb_mbtypes[0]);
mb_list_clust.ml_cntlst[i]->mb_cont.mc_types = NULL;
@@ -527,13 +566,13 @@ mb_pop_cont(struct mb_lstmngr *mb_list, int how, struct mb_pcpu_list *cnt_lst)
return (NULL);
bucket = malloc(sizeof(struct mb_bucket) +
- PAGE_SIZE / mb_list->ml_objsize * sizeof(void *), M_MBUF,
+ mb_list->ml_objbucks * sizeof(void *), M_MBUF,
how == M_TRYWAIT ? M_WAITOK : M_NOWAIT);
if (bucket == NULL)
return (NULL);
- p = (caddr_t)kmem_malloc(mb_list->ml_map, PAGE_SIZE,
- how == M_TRYWAIT ? M_WAITOK : M_NOWAIT);
+ p = (caddr_t)kmem_malloc(mb_list->ml_map, mb_list->ml_objsize *
+ mb_list->ml_objbucks, how == M_TRYWAIT ? M_WAITOK : M_NOWAIT);
if (p == NULL) {
free(bucket, M_MBUF);
if (how == M_TRYWAIT)
@@ -543,7 +582,7 @@ mb_pop_cont(struct mb_lstmngr *mb_list, int how, struct mb_pcpu_list *cnt_lst)
bucket->mb_numfree = 0;
mb_list->ml_btable[MB_BUCKET_INDX(p, mb_list)] = bucket;
- for (i = 0; i < (PAGE_SIZE / mb_list->ml_objsize); i++) {
+ for (i = 0; i < mb_list->ml_objbucks; i++) {
bucket->mb_free[i] = p;
bucket->mb_numfree++;
p += mb_list->ml_objsize;
@@ -552,14 +591,14 @@ mb_pop_cont(struct mb_lstmngr *mb_list, int how, struct mb_pcpu_list *cnt_lst)
MB_LOCK_CONT(cnt_lst);
bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead), bucket, mb_blist);
- (*(cnt_lst->mb_cont.mc_numpgs))++;
+ (*(cnt_lst->mb_cont.mc_numbucks))++;
*(cnt_lst->mb_cont.mc_objcount) += bucket->mb_numfree;
return (bucket);
}
/*
- * Allocate an mbuf-subsystem type object.
+ * Allocate a network buffer.
* The general case is very easy. Complications only arise if our PCPU
* container is empty. Things get worse if the PCPU container is empty,
* the general container is empty, and we've run out of address space
@@ -629,8 +668,8 @@ mb_alloc(struct mb_lstmngr *mb_list, int how, short type, short persist,
SLIST_REMOVE_HEAD(&(gen_list->mb_cont.mc_bhead),
mb_blist);
bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
- (*(gen_list->mb_cont.mc_numpgs))--;
- (*(cnt_lst->mb_cont.mc_numpgs))++;
+ (*(gen_list->mb_cont.mc_numbucks))--;
+ (*(cnt_lst->mb_cont.mc_numbucks))++;
*(gen_list->mb_cont.mc_objcount) -= bucket->mb_numfree;
bucket->mb_numfree--;
m = bucket->mb_free[(bucket->mb_numfree)];
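The fallback order described in the comment above mb_alloc() (per-CPU cache,
then the global cache, then the map) can be caricatured as a toy model; this
is only a sketch of the ordering, not the kernel's locking or bucket
migration logic, and the counters below are invented for illustration:

    #include <stdio.h>

    /* Toy stand-ins for the per-CPU cache, the global cache and the sub-map. */
    static int pcpu_free = 2, gen_free = 1, map_buckets = 1;

    /*
     * Simplified fallback order of mb_alloc(): per-CPU cache first, then the
     * global cache, then a fresh bucket carved out of the sub-map; if all of
     * those fail the caller must wait or give up.
     */
    static int
    toy_alloc(void)
    {
            if (pcpu_free > 0) {
                    pcpu_free--;            /* common case: one cache lock, one object */
                    return (1);
            }
            if (gen_free > 0) {
                    gen_free--;             /* migrate a bucket from the global cache */
                    return (1);
            }
            if (map_buckets > 0) {
                    map_buckets--;          /* populate a brand-new bucket from the map */
                    return (1);
            }
            return (0);                     /* starved: sleep, drain protocols, or fail */
    }

    int
    main(void)
    {
            for (int i = 0; i < 5; i++)
                    printf("alloc %d -> %s\n", i, toy_alloc() ? "ok" : "starved");
            return (0);
    }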
@@ -893,8 +932,8 @@ retry_lock:
bucket->mb_owner = MB_GENLIST_OWNER;
(*(cnt_lst->mb_cont.mc_objcount))--;
(*(gen_list->mb_cont.mc_objcount))++;
- (*(cnt_lst->mb_cont.mc_numpgs))--;
- (*(gen_list->mb_cont.mc_numpgs))++;
+ (*(cnt_lst->mb_cont.mc_numbucks))--;
+ (*(gen_list->mb_cont.mc_numbucks))++;
/*
* Determine whether or not to keep transferring
@@ -943,8 +982,8 @@ retry_lock:
bucket->mb_owner = MB_GENLIST_OWNER;
*(cnt_lst->mb_cont.mc_objcount) -= bucket->mb_numfree;
*(gen_list->mb_cont.mc_objcount) += bucket->mb_numfree;
- (*(cnt_lst->mb_cont.mc_numpgs))--;
- (*(gen_list->mb_cont.mc_numpgs))++;
+ (*(cnt_lst->mb_cont.mc_numbucks))--;
+ (*(gen_list->mb_cont.mc_numbucks))++;
/*
* While we're at it, transfer some of the mbtypes
@@ -957,10 +996,10 @@ retry_lock:
* being freed in an effort to keep the mbtypes
* counters approximately balanced across all lists.
*/
- MB_MBTYPES_DEC(cnt_lst, type, (PAGE_SIZE /
- mb_list->ml_objsize) - bucket->mb_numfree);
- MB_MBTYPES_INC(gen_list, type, (PAGE_SIZE /
- mb_list->ml_objsize) - bucket->mb_numfree);
+ MB_MBTYPES_DEC(cnt_lst, type,
+ mb_list->ml_objbucks - bucket->mb_numfree);
+ MB_MBTYPES_INC(gen_list, type,
+ mb_list->ml_objbucks - bucket->mb_numfree);
MB_UNLOCK_CONT(gen_list);
if ((persist & MBP_PERSIST) == 0)