From 26ba0eb55ca13bca8cb911b471a949574a3b513b Mon Sep 17 00:00:00 2001
From: bmilekic
Date: Thu, 20 Feb 2003 04:26:58 +0000
Subject: o Allow "buckets" in mb_alloc to be differently sized (according to
  compile-time constants).  That is, a "bucket" is now not necessarily a
  page worth of mbufs or clusters, but MBUF_BUCK_SZ worth of mbufs or
  CLUST_BUCK_SZ worth of clusters.
o Rename {mbuf,clust}_limit to {mbuf,clust}_hiwm and introduce
  {mbuf,clust}_lowm, which currently has no effect but will be used
  to set the low watermarks.
o Fix netstat so that it can deal with the differently-sized buckets
  and teach it about the low watermarks too.
o Make sure the per-cpu stats for an absent CPU have mb_active set to 0,
  explicitly.
o Get rid of the allocate-refcounts-from-the-mbuf-map mess.  Instead,
  just malloc() the refcounts in one shot from mbuf_init().
o Clean up / update comments in subr_mbuf.c.
---
 sys/kern/subr_mbuf.c | 185 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 112 insertions(+), 73 deletions(-)

diff --git a/sys/kern/subr_mbuf.c b/sys/kern/subr_mbuf.c
index 171ccae..4facff8 100644
--- a/sys/kern/subr_mbuf.c
+++ b/sys/kern/subr_mbuf.c
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2001, 2002
+ * Copyright (c) 2001, 2002, 2003
  *	Bosko Milekic . All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -51,9 +51,11 @@
 #include 
 #include 
 
-/******************************************************************************
- * mb_alloc mbuf and cluster allocator.
- *
+/*
+ * mb_alloc: network buffer allocator
+ */
+
+/*
  * Maximum number of PCPU containers. If you know what you're doing you could
  * explicitly define MBALLOC_NCPU to be exactly the number of CPUs on your
  * system during compilation, and thus prevent kernel structure bloat.
@@ -70,34 +72,50 @@
 #endif
 
 /*-
- * The mbuf allocator is heavily based on Alfred Perlstein's
- * (alfred@FreeBSD.org) "memcache" allocator which is itself based
- * on concepts from several per-CPU memory allocators. The difference
- * between this allocator and memcache is that, among other things:
+ * The mbuf allocator is based on Alfred Perlstein's
+ * "memcache" proof-of-concept allocator which was itself based on
+ * several well-known SMP-friendly allocators.
+ *
+ * The mb_alloc mbuf allocator is special when compared to other
+ * general-purpose allocators.  Some things to take note of:
  *
- * (i) We don't free back to the map from the free() routine - we leave the
- *     option of implementing lazy freeing (from a kproc) in the future.
+ * Mbufs and mbuf clusters are two different objects.  Sometimes we
+ * will allocate a single mbuf, other times a single cluster,
+ * other times both.  Further, we may sometimes wish to allocate a
+ * whole chain of mbufs with clusters.  This allocator will perform
+ * the common case of each scenario in one function call (this
+ * includes constructing or destructing the object) while only
+ * locking/unlocking the cache once, if it can get away with it.
+ * The caches consist of pure mbufs and pure clusters; that is,
+ * there are no 'zones' containing mbufs with already pre-hooked
+ * clusters.  Since we can allocate both objects atomically anyway,
+ * we don't bother fragmenting our caches for any particular 'scenarios.'
 *
- * (ii) We allocate from separate sub-maps of kmem_map, thus limiting the
- *	maximum number of allocatable objects of a given type. Further,
- *	we handle blocking on a cv in the case that the map is starved and
- *	we have to rely solely on cached (circulating) objects.
+ * We allocate from separate sub-maps of kmem_map, thus imposing
+ * an ultimate upper limit on the number of allocatable clusters
+ * and mbufs and also, since the clusters all come from a
+ * virtually contiguous region, we can keep reference counters
+ * for them and "allocate" them purely by indexing into a
+ * dense refcount vector.
+ *
+ * We call out to protocol drain routines (which can be hooked
+ * into us) when we're low on space.
  *
  * The mbuf allocator keeps all objects that it allocates in mb_buckets.
- * The buckets keep a page worth of objects (an object can be an mbuf or an
+ * The buckets keep a number of objects (an object can be an mbuf or an
  * mbuf cluster) and facilitate moving larger sets of contiguous objects
- * from the per-CPU lists to the main list for the given object. The buckets
- * also have an added advantage in that after several moves from a per-CPU
- * list to the main list and back to the per-CPU list, contiguous objects
- * are kept together, thus trying to put the TLB cache to good use.
+ * from the per-CPU caches to the global cache. The buckets also have
+ * the added advantage that objects, when migrated from cache to cache,
+ * are migrated in chunks that keep contiguous objects together,
+ * minimizing TLB pollution.
  *
  * The buckets are kept on singly-linked lists called "containers." A container
- * is protected by a mutex lock in order to ensure consistency. The mutex lock
+ * is protected by a mutex in order to ensure consistency. The mutex
  * itself is allocated separately and attached to the container at boot time,
- * thus allowing for certain containers to share the same mutex lock. Per-CPU
- * containers for mbufs and mbuf clusters all share the same per-CPU
- * lock whereas the "general system" containers (i.e., the "main lists") for
- * these objects share one global lock.
+ * thus allowing for certain containers to share the same lock. Per-CPU
+ * containers for mbufs and mbuf clusters all share the same per-CPU
+ * lock whereas the global cache containers for these objects share one
+ * global lock.
  */
 struct mb_bucket {
 	SLIST_ENTRY(mb_bucket) mb_blist;
@@ -113,7 +131,7 @@ struct mb_container {
 	u_int	mc_starved;
 	long	*mc_types;
 	u_long	*mc_objcount;
-	u_long	*mc_numpgs;
+	u_long	*mc_numbucks;
 };
 
 struct mb_gen_list {
@@ -153,6 +171,13 @@ int	nmbcnt;
 int	nsfbufs;
 
 /*
+ * Sizes of objects per bucket.  There is this size's worth of mbufs
+ * or clusters in each bucket.  Please keep these a power-of-2.
+ */
+#define	MBUF_BUCK_SZ	(PAGE_SIZE * 2)
+#define	CLUST_BUCK_SZ	(PAGE_SIZE * 4)
+
+/*
  * Perform sanity checks of tunables declared above.
  */
 static void
@@ -197,7 +222,9 @@ struct mb_lstmngr {
 	vm_offset_t	ml_maptop;
 	int		ml_mapfull;
 	u_int		ml_objsize;
+	u_int		ml_objbucks;
 	u_int		*ml_wmhigh;
+	u_int		*ml_wmlow;
 };
 static struct mb_lstmngr mb_list_mbuf, mb_list_clust;
 static struct mtx mbuf_gen, mbuf_pcpu[NCPU];
@@ -222,7 +249,8 @@ u_int *cl_refcntmap;
 	(mb_lst)->ml_cntlst[(num)]
 
 #define	MB_BUCKET_INDX(mb_obj, mb_lst)					\
-	(int)(((caddr_t)(mb_obj) - (caddr_t)(mb_lst)->ml_mapbase) / PAGE_SIZE)
+	(int)(((caddr_t)(mb_obj) - (caddr_t)(mb_lst)->ml_mapbase) /	\
+	    ((mb_lst)->ml_objbucks * (mb_lst)->ml_objsize))
 
 #define	MB_GET_OBJECT(mb_objp, mb_bckt, mb_lst)				\
 {									\
@@ -269,8 +297,10 @@ struct mbstat mbstat;
 /* Sleep time for wait code (in ticks). */
 static int	mbuf_wait = 64;
 
-static u_int	mbuf_limit = 512;	/* Upper limit on # of mbufs per CPU. */
-static u_int	clust_limit = 128;	/* Upper limit on # of clusters per CPU. */
+static u_int	mbuf_hiwm = 512;	/* High wm on # of mbufs per cache */
+static u_int	mbuf_lowm = 128;	/* Low wm on # of mbufs per cache */
+static u_int	clust_hiwm = 128;	/* High wm on # of clusters per cache */
+static u_int	clust_lowm = 16;	/* Low wm on # of clusters per cache */
 
 /*
  * Objects exported by sysctl(8).
  */
@@ -286,10 +316,14 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RD, &nsfbufs, 0,
     "Maximum number of sendfile(2) sf_bufs available");
 SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW, &mbuf_wait, 0,
     "Sleep time of mbuf subsystem wait allocations during exhaustion");
-SYSCTL_UINT(_kern_ipc, OID_AUTO, mbuf_limit, CTLFLAG_RW, &mbuf_limit, 0,
-    "Upper limit of number of mbufs allowed on each PCPU list");
-SYSCTL_UINT(_kern_ipc, OID_AUTO, clust_limit, CTLFLAG_RW, &clust_limit, 0,
-    "Upper limit of number of mbuf clusters allowed on each PCPU list");
+SYSCTL_UINT(_kern_ipc, OID_AUTO, mbuf_hiwm, CTLFLAG_RW, &mbuf_hiwm, 0,
+    "Upper limit of number of mbufs allowed in each cache");
+SYSCTL_UINT(_kern_ipc, OID_AUTO, mbuf_lowm, CTLFLAG_RW, &mbuf_lowm, 0,
+    "Lower limit of number of mbufs allowed in each cache");
+SYSCTL_UINT(_kern_ipc, OID_AUTO, clust_hiwm, CTLFLAG_RW, &clust_hiwm, 0,
+    "Upper limit of number of mbuf clusters allowed in each cache");
+SYSCTL_UINT(_kern_ipc, OID_AUTO, clust_lowm, CTLFLAG_RW, &clust_lowm, 0,
+    "Lower limit of number of mbuf clusters allowed in each cache");
 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat,
     "Mbuf general information and statistics");
 SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mb_statpcpu, CTLFLAG_RD, mb_statpcpu,
@@ -309,8 +343,8 @@ static void	mbuf_init(void *);
  * of each object that will be placed initially in each PCPU container for
  * said object.
  */
-#define	NMB_MBUF_INIT	4
-#define	NMB_CLUST_INIT	16
+#define	NMB_MBUF_INIT	2
+#define	NMB_CLUST_INIT	8
 
 /*
  * Internal flags that allow for cache locks to remain "persistent" across
@@ -341,14 +375,12 @@ mbuf_init(void *dummy)
 
 	/*
 	 * Set up all the submaps, for each type of object that we deal
-	 * with in this allocator.  We also allocate space for the cluster
-	 * ref. counts in the mbuf map (and not the cluster map) in order to
-	 * give clusters a nice contiguous address space without any holes.
+	 * with in this allocator.
 	 */
-	mb_map_size = (vm_size_t)(nmbufs * MSIZE + nmbclusters * sizeof(u_int));
-	mb_map_size = rounddown(mb_map_size, PAGE_SIZE);
-	mb_list_mbuf.ml_btable = malloc((unsigned long)mb_map_size / PAGE_SIZE *
-	    sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
+	mb_map_size = (vm_size_t)(nmbufs * MSIZE);
+	mb_map_size = rounddown(mb_map_size, MBUF_BUCK_SZ);
+	mb_list_mbuf.ml_btable = malloc((unsigned long)mb_map_size /
+	    MBUF_BUCK_SZ * sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
 	if (mb_list_mbuf.ml_btable == NULL)
 		goto bad;
 	mb_list_mbuf.ml_map = kmem_suballoc(kmem_map,&(mb_list_mbuf.ml_mapbase),
@@ -356,12 +388,14 @@ mbuf_init(void *dummy)
 	mb_list_mbuf.ml_map->system_map = 1;
 	mb_list_mbuf.ml_mapfull = 0;
 	mb_list_mbuf.ml_objsize = MSIZE;
-	mb_list_mbuf.ml_wmhigh = &mbuf_limit;
+	mb_list_mbuf.ml_objbucks = MBUF_BUCK_SZ / MSIZE;
+	mb_list_mbuf.ml_wmhigh = &mbuf_hiwm;
+	mb_list_mbuf.ml_wmlow = &mbuf_lowm;
 
 	mb_map_size = (vm_size_t)(nmbclusters * MCLBYTES);
-	mb_map_size = rounddown(mb_map_size, PAGE_SIZE);
-	mb_list_clust.ml_btable = malloc((unsigned long)mb_map_size / PAGE_SIZE
-	    * sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
+	mb_map_size = rounddown(mb_map_size, CLUST_BUCK_SZ);
+	mb_list_clust.ml_btable = malloc((unsigned long)mb_map_size /
+	    CLUST_BUCK_SZ * sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
 	if (mb_list_clust.ml_btable == NULL)
 		goto bad;
 	mb_list_clust.ml_map = kmem_suballoc(kmem_map,
@@ -370,7 +404,9 @@ mbuf_init(void *dummy)
 	mb_list_clust.ml_map->system_map = 1;
 	mb_list_clust.ml_mapfull = 0;
 	mb_list_clust.ml_objsize = MCLBYTES;
-	mb_list_clust.ml_wmhigh = &clust_limit;
+	mb_list_clust.ml_objbucks = CLUST_BUCK_SZ / MCLBYTES;
+	mb_list_clust.ml_wmhigh = &clust_hiwm;
+	mb_list_clust.ml_wmlow = &clust_lowm;
 
 	/*
 	 * Allocate required general (global) containers for each object type.
@@ -404,10 +440,10 @@
 	    &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbfree);
 	mb_list_clust.ml_genlist->mb_cont.mc_objcount =
 	    &(mb_statpcpu[MB_GENLIST_OWNER].mb_clfree);
-	mb_list_mbuf.ml_genlist->mb_cont.mc_numpgs =
-	    &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbpgs);
-	mb_list_clust.ml_genlist->mb_cont.mc_numpgs =
-	    &(mb_statpcpu[MB_GENLIST_OWNER].mb_clpgs);
+	mb_list_mbuf.ml_genlist->mb_cont.mc_numbucks =
+	    &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbbucks);
+	mb_list_clust.ml_genlist->mb_cont.mc_numbucks =
+	    &(mb_statpcpu[MB_GENLIST_OWNER].mb_clbucks);
 	mb_list_mbuf.ml_genlist->mb_cont.mc_types =
 	    &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbtypes[0]);
 	mb_list_clust.ml_genlist->mb_cont.mc_types = NULL;
@@ -418,8 +454,7 @@
 	 * Allocate all the required counters for clusters.  This makes
 	 * cluster allocations/deallocations much faster.
 	 */
-	cl_refcntmap = (u_int *)kmem_malloc(mb_list_mbuf.ml_map,
-	    roundup(nmbclusters * sizeof(u_int), MSIZE), M_NOWAIT);
+	cl_refcntmap = malloc(nmbclusters * sizeof(u_int), M_MBUF, M_NOWAIT);
 	if (cl_refcntmap == NULL)
 		goto bad;
 
@@ -432,13 +467,17 @@
 	mbstat.m_mlen = MLEN;
 	mbstat.m_mhlen = MHLEN;
 	mbstat.m_numtypes = MT_NTYPES;
+	mbstat.m_mbperbuck = MBUF_BUCK_SZ / MSIZE;
+	mbstat.m_clperbuck = CLUST_BUCK_SZ / MCLBYTES;
 
 	/*
 	 * Allocate and initialize PCPU containers.
 	 */
 	for (i = 0; i < NCPU; i++) {
-		if (CPU_ABSENT(i))
+		if (CPU_ABSENT(i)) {
+			mb_statpcpu[i].mb_active = 0;
 			continue;
+		}
 
 		mb_list_mbuf.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list),
 		    M_MBUF, M_NOWAIT);
@@ -461,10 +500,10 @@
 	    &(mb_statpcpu[i].mb_mbfree);
 	mb_list_clust.ml_cntlst[i]->mb_cont.mc_objcount =
 	    &(mb_statpcpu[i].mb_clfree);
-	mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numpgs =
-	    &(mb_statpcpu[i].mb_mbpgs);
-	mb_list_clust.ml_cntlst[i]->mb_cont.mc_numpgs =
-	    &(mb_statpcpu[i].mb_clpgs);
+	mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numbucks =
+	    &(mb_statpcpu[i].mb_mbbucks);
+	mb_list_clust.ml_cntlst[i]->mb_cont.mc_numbucks =
+	    &(mb_statpcpu[i].mb_clbucks);
 	mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_types =
 	    &(mb_statpcpu[i].mb_mbtypes[0]);
 	mb_list_clust.ml_cntlst[i]->mb_cont.mc_types = NULL;
@@ -527,13 +566,13 @@ mb_pop_cont(struct mb_lstmngr *mb_list, int how, struct mb_pcpu_list *cnt_lst)
 		return (NULL);
 
 	bucket = malloc(sizeof(struct mb_bucket) +
-	    PAGE_SIZE / mb_list->ml_objsize * sizeof(void *), M_MBUF,
+	    mb_list->ml_objbucks * sizeof(void *), M_MBUF,
 	    how == M_TRYWAIT ? M_WAITOK : M_NOWAIT);
 	if (bucket == NULL)
 		return (NULL);
 
-	p = (caddr_t)kmem_malloc(mb_list->ml_map, PAGE_SIZE,
-	    how == M_TRYWAIT ? M_WAITOK : M_NOWAIT);
+	p = (caddr_t)kmem_malloc(mb_list->ml_map, mb_list->ml_objsize *
+	    mb_list->ml_objbucks, how == M_TRYWAIT ? M_WAITOK : M_NOWAIT);
 	if (p == NULL) {
 		free(bucket, M_MBUF);
 		if (how == M_TRYWAIT)
@@ -543,7 +582,7 @@ mb_pop_cont(struct mb_lstmngr *mb_list, int how, struct mb_pcpu_list *cnt_lst)
 	bucket->mb_numfree = 0;
 	mb_list->ml_btable[MB_BUCKET_INDX(p, mb_list)] = bucket;
 
-	for (i = 0; i < (PAGE_SIZE / mb_list->ml_objsize); i++) {
+	for (i = 0; i < mb_list->ml_objbucks; i++) {
 		bucket->mb_free[i] = p;
 		bucket->mb_numfree++;
 		p += mb_list->ml_objsize;
@@ -552,14 +591,14 @@ mb_pop_cont(struct mb_lstmngr *mb_list, int how, struct mb_pcpu_list *cnt_lst)
 	MB_LOCK_CONT(cnt_lst);
 	bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
 	SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead), bucket, mb_blist);
-	(*(cnt_lst->mb_cont.mc_numpgs))++;
+	(*(cnt_lst->mb_cont.mc_numbucks))++;
 	*(cnt_lst->mb_cont.mc_objcount) += bucket->mb_numfree;
 
 	return (bucket);
 }
 
 /*
- * Allocate an mbuf-subsystem type object.
+ * Allocate a network buffer.
  * The general case is very easy.  Complications only arise if our PCPU
  * container is empty.  Things get worse if the PCPU container is empty,
  * the general container is empty, and we've run out of address space
@@ -629,8 +668,8 @@ mb_alloc(struct mb_lstmngr *mb_list, int how, short type, short persist,
 			SLIST_REMOVE_HEAD(&(gen_list->mb_cont.mc_bhead),
 			    mb_blist);
 			bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
-			(*(gen_list->mb_cont.mc_numpgs))--;
-			(*(cnt_lst->mb_cont.mc_numpgs))++;
+			(*(gen_list->mb_cont.mc_numbucks))--;
+			(*(cnt_lst->mb_cont.mc_numbucks))++;
 			*(gen_list->mb_cont.mc_objcount) -= bucket->mb_numfree;
 			bucket->mb_numfree--;
 			m = bucket->mb_free[(bucket->mb_numfree)];
@@ -893,8 +932,8 @@ retry_lock:
 			bucket->mb_owner = MB_GENLIST_OWNER;
 			(*(cnt_lst->mb_cont.mc_objcount))--;
 			(*(gen_list->mb_cont.mc_objcount))++;
-			(*(cnt_lst->mb_cont.mc_numpgs))--;
-			(*(gen_list->mb_cont.mc_numpgs))++;
+			(*(cnt_lst->mb_cont.mc_numbucks))--;
+			(*(gen_list->mb_cont.mc_numbucks))++;
 
 			/*
 			 * Determine whether or not to keep transferring
@@ -943,8 +982,8 @@ retry_lock:
 		bucket->mb_owner = MB_GENLIST_OWNER;
 		*(cnt_lst->mb_cont.mc_objcount) -= bucket->mb_numfree;
 		*(gen_list->mb_cont.mc_objcount) += bucket->mb_numfree;
-		(*(cnt_lst->mb_cont.mc_numpgs))--;
-		(*(gen_list->mb_cont.mc_numpgs))++;
+		(*(cnt_lst->mb_cont.mc_numbucks))--;
+		(*(gen_list->mb_cont.mc_numbucks))++;
 
 		/*
 		 * While we're at it, transfer some of the mbtypes
@@ -957,10 +996,10 @@ retry_lock:
 		 * being freed in an effort to keep the mbtypes
 		 * counters approximately balanced across all lists.
 		 */
-		MB_MBTYPES_DEC(cnt_lst, type, (PAGE_SIZE /
-		    mb_list->ml_objsize) - bucket->mb_numfree);
-		MB_MBTYPES_INC(gen_list, type, (PAGE_SIZE /
-		    mb_list->ml_objsize) - bucket->mb_numfree);
+		MB_MBTYPES_DEC(cnt_lst, type,
+		    mb_list->ml_objbucks - bucket->mb_numfree);
+		MB_MBTYPES_INC(gen_list, type,
+		    mb_list->ml_objbucks - bucket->mb_numfree);
 
 		MB_UNLOCK_CONT(gen_list);
 
 		if ((persist & MBP_PERSIST) == 0)
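
To make the bucket arithmetic in the patch easier to follow, here is a small user-space sketch of the generalized MB_BUCKET_INDX() computation: with variable-sized buckets, an object's bucket index is its byte offset from the sub-map base divided by the bucket size in bytes (ml_objbucks * ml_objsize) rather than by PAGE_SIZE. The same offset-division idea is what lets the cluster refcounts live in a plain malloc()ed vector indexed by cluster offset / MCLBYTES. The struct below only models the fields the macro reads, and every number in main() is a made-up illustration value, not a kernel default.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Minimal stand-in for the mb_lstmngr fields that MB_BUCKET_INDX() uses. */
struct mb_lstmngr_model {
	uintptr_t	ml_mapbase;	/* base address of the object sub-map */
	unsigned int	ml_objsize;	/* size of one object (MSIZE or MCLBYTES) */
	unsigned int	ml_objbucks;	/* objects per bucket (BUCK_SZ / objsize) */
};

/* Generalized bucket index: sub-map offset divided by the bucket size. */
static int
bucket_index(const struct mb_lstmngr_model *ml, uintptr_t obj)
{

	return ((int)((obj - ml->ml_mapbase) /
	    ((uintptr_t)ml->ml_objbucks * ml->ml_objsize)));
}

int
main(void)
{
	/* Hypothetical numbers: 256-byte objects, 8 KB buckets -> 32 per bucket. */
	struct mb_lstmngr_model ml = {
		.ml_mapbase = 0x100000,
		.ml_objsize = 256,
		.ml_objbucks = 8192 / 256
	};

	/* Objects 0..31 land in bucket 0; object 32 opens bucket 1. */
	assert(bucket_index(&ml, ml.ml_mapbase + 0 * 256) == 0);
	assert(bucket_index(&ml, ml.ml_mapbase + 31 * 256) == 0);
	assert(bucket_index(&ml, ml.ml_mapbase + 32 * 256) == 1);

	printf("object 40 lives in bucket %d\n",
	    bucket_index(&ml, ml.ml_mapbase + 40 * 256));	/* prints 1 */
	return (0);
}

Because ml_btable[] is sized as map_size / BUCK_SZ, keeping the bucket sizes a power of two (as the new MBUF_BUCK_SZ and CLUST_BUCK_SZ comment asks) keeps this division cheap and the table densely indexed.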