From 57ca4583e728cab422fba8f15de10bd0b637b3dd Mon Sep 17 00:00:00 2001 From: rwatson Date: Tue, 14 Jul 2009 22:48:30 +0000 Subject: Build on Jeff Roberson's linker-set based dynamic per-CPU allocator (DPCPU), as suggested by Peter Wemm, and implement a new per-virtual network stack memory allocator. Modify vnet to use the allocator instead of monolithic global container structures (vinet, ...). This change solves many binary compatibility problems associated with VIMAGE, and restores ELF symbols for virtualized global variables. Each virtualized global variable exists as a "reference copy", and also once per virtual network stack. Virtualized global variables are tagged at compile-time, placing them in a special linker set, which is loaded into a contiguous region of kernel memory. Virtualized global variables in the base kernel are linked as normal, but those in modules are copied and relocated to a reserved portion of the kernel's vnet region with the help of the kernel linker. Virtualized global variables exist in per-vnet memory set up when the network stack instance is created, and are initialized statically from the reference copy. Run-time access occurs via an accessor macro, which converts from the current vnet and requested symbol to a per-vnet address. When "options VIMAGE" is not compiled into the kernel, normal global ELF symbols will be used instead and indirection is avoided. This change restores static initialization for network stack global variables, restores support for non-global symbols and types, eliminates the need for many subsystem constructors, eliminates large per-subsystem structures that caused many binary compatibility issues both for monitoring applications (netstat) and kernel modules, removes the per-function INIT_VNET_*() macros throughout the stack, eliminates the need for vnet_symmap ksym(2) munging, and eliminates duplicate definitions of virtualized globals under VIMAGE_GLOBALS. Bump __FreeBSD_version and update UPDATING. 
Portions submitted by: bz Reviewed by: bz, zec Discussed with: gnn, jamie, jeff, jhb, julian, sam Suggested by: peter Approved by: re (kensmith) --- sys/kern/kern_linker.c | 15 ------ sys/kern/kern_poll.c | 1 - sys/kern/kern_sysctl.c | 105 ++++-------------------------------------- sys/kern/kern_uuid.c | 1 - sys/kern/kern_vimage.c | 119 ++++++------------------------------------------ sys/kern/link_elf.c | 55 ++++++++++++++++++++++ sys/kern/link_elf_obj.c | 43 ++++++++++++++++- sys/kern/uipc_domain.c | 6 +-- 8 files changed, 119 insertions(+), 226 deletions(-) (limited to 'sys/kern') diff --git a/sys/kern/kern_linker.c b/sys/kern/kern_linker.c index 9775059..1768e69 100644 --- a/sys/kern/kern_linker.c +++ b/sys/kern/kern_linker.c @@ -1334,23 +1334,8 @@ kldsym(struct thread *td, struct kldsym_args *uap) break; } } -#ifndef VIMAGE_GLOBALS - /* - * If the symbol is not found in global namespace, - * try to look it up in the current vimage namespace. - */ - if (lf == NULL) { - CURVNET_SET(TD_TO_VNET(td)); - error = vi_symlookup(&lookup, symstr); - CURVNET_RESTORE(); - if (error == 0) - error = copyout(&lookup, uap->data, - sizeof(lookup)); - } -#else if (lf == NULL) error = ENOENT; -#endif } KLD_UNLOCK(); out: diff --git a/sys/kern/kern_poll.c b/sys/kern/kern_poll.c index d9cf49f..5396240 100644 --- a/sys/kern/kern_poll.c +++ b/sys/kern/kern_poll.c @@ -545,7 +545,6 @@ ether_poll_deregister(struct ifnet *ifp) static int poll_switch(SYSCTL_HANDLER_ARGS) { - INIT_VNET_NET(curvnet); struct ifnet *ifp; int error; int val = polling; diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c index 9e61544..94e45f1 100644 --- a/sys/kern/kern_sysctl.c +++ b/sys/kern/kern_sysctl.c @@ -59,6 +59,8 @@ __FBSDID("$FreeBSD$"); #include #endif +#include + #include #include @@ -936,33 +938,9 @@ sysctl_handle_int(SYSCTL_HANDLER_ARGS) return (error); } -#ifdef VIMAGE -int -sysctl_handle_v_int(SYSCTL_HANDLER_ARGS) -{ - int tmpout, error = 0; - - SYSCTL_RESOLVE_V_ARG1(); - - /* - * 
Attempt to get a coherent snapshot by making a copy of the data. - */ - tmpout = *(int *)arg1; - error = SYSCTL_OUT(req, &tmpout, sizeof(int)); - - if (error || !req->newptr) - return (error); - - if (!arg1) - error = EPERM; - else - error = SYSCTL_IN(req, arg1, sizeof(int)); - return (error); -} -#endif - /* * Based on on sysctl_handle_int() convert milliseconds into ticks. + * Note: this is used by TCP. */ int @@ -970,8 +948,11 @@ sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS) { int error, s, tt; - SYSCTL_RESOLVE_V_ARG1(); - +#ifdef VIMAGE + if (arg1 != NULL) + arg1 = (void *)(TD_TO_VNET(req->td)->vnet_data_base + + (uintptr_t)arg1); +#endif tt = *(int *)arg1; s = (int)((int64_t)tt * 1000 / hz); @@ -1097,47 +1078,6 @@ retry: return (error); } -#ifdef VIMAGE -int -sysctl_handle_v_string(SYSCTL_HANDLER_ARGS) -{ - int error=0; - char *tmparg; - size_t outlen; - - SYSCTL_RESOLVE_V_ARG1(); - - /* - * Attempt to get a coherent snapshot by copying to a - * temporary kernel buffer. - */ -retry: - outlen = strlen((char *)arg1)+1; - tmparg = malloc(outlen, M_SYSCTLTMP, M_WAITOK); - - if (strlcpy(tmparg, (char *)arg1, outlen) >= outlen) { - free(tmparg, M_SYSCTLTMP); - goto retry; - } - - error = SYSCTL_OUT(req, tmparg, outlen); - free(tmparg, M_SYSCTLTMP); - - if (error || !req->newptr) - return (error); - - if ((req->newlen - req->newidx) >= arg2) { - error = EINVAL; - } else { - arg2 = (req->newlen - req->newidx); - error = SYSCTL_IN(req, arg1, arg2); - ((char *)arg1)[arg2] = '\0'; - } - - return (error); -} -#endif - /* * Handle any kind of opaque data. * arg1 points to it, arg2 is the size. 
@@ -1175,35 +1115,6 @@ retry: return (error); } -#ifdef VIMAGE -int -sysctl_handle_v_opaque(SYSCTL_HANDLER_ARGS) -{ - int error, tries; - u_int generation; - struct sysctl_req req2; - - SYSCTL_RESOLVE_V_ARG1(); - - tries = 0; - req2 = *req; -retry: - generation = curthread->td_generation; - error = SYSCTL_OUT(req, arg1, arg2); - if (error) - return (error); - tries++; - if (generation != curthread->td_generation && tries < 3) { - *req = req2; - goto retry; - } - - error = SYSCTL_IN(req, arg1, arg2); - - return (error); -} -#endif - /* * Transfer functions to/from kernel space. * XXX: rather untested at this point diff --git a/sys/kern/kern_uuid.c b/sys/kern/kern_uuid.c index a0c26b9..cd88538 100644 --- a/sys/kern/kern_uuid.c +++ b/sys/kern/kern_uuid.c @@ -89,7 +89,6 @@ MTX_SYSINIT(uuid_lock, &uuid_mutex, "UUID generator mutex lock", MTX_DEF); static void uuid_node(uint16_t *node) { - INIT_VNET_NET(curvnet); struct ifnet *ifp; struct ifaddr *ifa; struct sockaddr_dl *sdl; diff --git a/sys/kern/kern_vimage.c b/sys/kern/kern_vimage.c index 9ee3e1d..daa9b9f 100644 --- a/sys/kern/kern_vimage.c +++ b/sys/kern/kern_vimage.c @@ -53,8 +53,6 @@ __FBSDID("$FreeBSD$"); #include #include -#ifndef VIMAGE_GLOBALS - MALLOC_DEFINE(M_VIMAGE, "vimage", "vimage resource container"); MALLOC_DEFINE(M_VNET, "vnet", "network stack control block"); MALLOC_DEFINE(M_VPROCG, "vprocg", "process group control block"); @@ -65,13 +63,11 @@ static void vnet_mod_complete_registration(struct vnet_modlink *); static int vnet_mod_constructor(struct vnet_modlink *); static int vnet_mod_destructor(struct vnet_modlink *); -#ifdef VIMAGE static struct vimage *vi_alloc(struct vimage *, char *); static int vi_destroy(struct vimage *); static struct vimage *vimage_get_next(struct vimage *, struct vimage *, int); static void vimage_relative_name(struct vimage *, struct vimage *, char *, int); -#endif #define VNET_LIST_WLOCK() \ mtx_lock(&vnet_list_refc_mtx); \ @@ -81,17 +77,11 @@ static void 
vimage_relative_name(struct vimage *, struct vimage *, #define VNET_LIST_WUNLOCK() \ mtx_unlock(&vnet_list_refc_mtx); -#ifdef VIMAGE struct vimage_list_head vimage_head; struct vnet_list_head vnet_head; struct vprocg_list_head vprocg_head; -#else -#ifndef VIMAGE_GLOBALS struct vprocg vprocg_0; -#endif -#endif -#ifdef VIMAGE struct cv vnet_list_condvar; struct mtx vnet_list_refc_mtx; int vnet_list_refc = 0; @@ -100,9 +90,7 @@ static u_int last_vi_id = 0; static u_int last_vprocg_id = 0; struct vnet *vnet0; -#endif -#ifdef VIMAGE /* * Move an ifnet to or from another vnet, specified by the jail id. If a @@ -396,8 +384,6 @@ vimage_get_next(struct vimage *top, struct vimage *where, int recurse) return (NULL); } -#endif /* VIMAGE */ /* User interface block */ - /* * Kernel interfaces and handlers. @@ -540,25 +526,11 @@ vnet_mod_constructor(struct vnet_modlink *vml) if (vml->vml_iarg) printf("/%s", vml->vml_iname); printf(": "); -#ifdef VIMAGE - if (vmi->vmi_size) - printf("malloc(%zu); ", vmi->vmi_size); -#endif if (vmi->vmi_iattach != NULL) printf("iattach()"); printf("\n"); #endif -#ifdef VIMAGE - if (vmi->vmi_size) { - void *mem = malloc(vmi->vmi_size, M_VNET, - M_NOWAIT | M_ZERO); - if (mem == NULL) /* XXX should return error, not panic. 
*/ - panic("malloc for %s\n", vmi->vmi_name); - curvnet->mod_data[vmi->vmi_id] = mem; - } -#endif - if (vmi->vmi_iattach != NULL) vmi->vmi_iattach(vml->vml_iarg); @@ -577,63 +549,15 @@ vnet_mod_destructor(struct vnet_modlink *vml) printf(": "); if (vmi->vmi_idetach != NULL) printf("idetach(); "); -#ifdef VIMAGE - if (vmi->vmi_size) - printf("free()"); -#endif printf("\n"); #endif if (vmi->vmi_idetach) vmi->vmi_idetach(vml->vml_iarg); -#ifdef VIMAGE - if (vmi->vmi_size) { - if (curvnet->mod_data[vmi->vmi_id] == NULL) - panic("vi_destroy: %s\n", vmi->vmi_name); - free(curvnet->mod_data[vmi->vmi_id], M_VNET); - curvnet->mod_data[vmi->vmi_id] = NULL; - } -#endif - return (0); } -/* - * vi_symlookup() attempts to resolve name to address queries for - * variables which have been moved from global namespace to virtualization - * container structures, but are still directly accessed from legacy - * userspace processes via kldsym(2) and kmem(4) interfaces. - */ -int -vi_symlookup(struct kld_sym_lookup *lookup, char *symstr) -{ - struct vnet_modlink *vml; - struct vnet_symmap *mapentry; - - TAILQ_FOREACH(vml, &vnet_modlink_head, vml_mod_le) { - if (vml->vml_modinfo->vmi_symmap == NULL) - continue; - for (mapentry = vml->vml_modinfo->vmi_symmap; - mapentry->name != NULL; mapentry++) { - if (strcmp(symstr, mapentry->name) == 0) { -#ifdef VIMAGE - lookup->symvalue = - (u_long) curvnet->mod_data[ - vml->vml_modinfo->vmi_id]; - lookup->symvalue += mapentry->offset; -#else - lookup->symvalue = (u_long) mapentry->offset; -#endif - lookup->symsize = mapentry->size; - return (0); - } - } - } - return (ENOENT); -} - -#ifdef VIMAGE struct vnet * vnet_alloc(void) { @@ -642,6 +566,7 @@ vnet_alloc(void) vnet = malloc(sizeof(struct vnet), M_VNET, M_WAITOK | M_ZERO); vnet->vnet_magic_n = VNET_MAGIC_N; + vnet_data_init(vnet); /* Initialize / attach vnet module instances. 
*/ CURVNET_SET_QUIET(vnet); @@ -669,7 +594,6 @@ vnet_destroy(struct vnet *vnet) VNET_LIST_WUNLOCK(); CURVNET_SET_QUIET(vnet); - INIT_VNET_NET(vnet); /* Return all inherited interfaces to their parent vnets. */ TAILQ_FOREACH_SAFE(ifp, &V_ifnet, if_link, nifp) { @@ -685,10 +609,22 @@ vnet_destroy(struct vnet *vnet) CURVNET_RESTORE(); /* Hopefully, we are OK to free the vnet container itself. */ + vnet_data_destroy(vnet); vnet->vnet_magic_n = 0xdeadbeef; free(vnet, M_VNET); } +void +vnet_foreach(void (*vnet_foreach_fn)(struct vnet *, void *), void *arg) +{ + struct vnet *vnet; + + VNET_LIST_RLOCK(); + LIST_FOREACH(vnet, &vnet_head, vnet_le) + vnet_foreach_fn(vnet, arg); + VNET_LIST_RUNLOCK(); +} + static struct vimage * vi_alloc(struct vimage *parent, char *name) { @@ -757,7 +693,6 @@ vi_destroy(struct vimage *vip) return (0); } -#endif /* VIMAGE */ static void vi_init(void *unused) @@ -766,7 +701,6 @@ vi_init(void *unused) TAILQ_INIT(&vnet_modlink_head); TAILQ_INIT(&vnet_modpending_head); -#ifdef VIMAGE LIST_INIT(&vimage_head); LIST_INIT(&vprocg_head); LIST_INIT(&vnet_head); @@ -783,7 +717,6 @@ vi_init(void *unused) * curvnet recursions. */ curvnet = prison0.pr_vnet = vnet0 = LIST_FIRST(&vnet_head); -#endif } static void @@ -791,9 +724,7 @@ vi_init_done(void *unused) { struct vnet_modlink *vml_iter; -#ifdef VIMAGE curvnet = NULL; -#endif if (TAILQ_EMPTY(&vnet_modpending_head)) return; @@ -809,45 +740,21 @@ vi_init_done(void *unused) SYSINIT(vimage, SI_SUB_VIMAGE, SI_ORDER_FIRST, vi_init, NULL); SYSINIT(vimage_done, SI_SUB_VIMAGE_DONE, SI_ORDER_FIRST, vi_init_done, NULL); -#endif /* !VIMAGE_GLOBALS */ -#ifdef VIMAGE #ifdef DDB -static void -db_vnet_ptr(void *arg) -{ - - if (arg) - db_printf(" %p", arg); - else -#if SIZE_MAX == UINT32_MAX /* 32-bit arch */ - db_printf(" 0"); -#else /* 64-bit arch, most probaly... 
*/ - db_printf(" 0"); -#endif -} - DB_SHOW_COMMAND(vnets, db_show_vnets) { VNET_ITERATOR_DECL(vnet_iter); #if SIZE_MAX == UINT32_MAX /* 32-bit arch */ db_printf(" vnet ifs socks"); - db_printf(" net inet inet6 ipsec netgraph\n"); #else /* 64-bit arch, most probaly... */ db_printf(" vnet ifs socks"); - db_printf(" net inet inet6 ipsec netgraph\n"); #endif VNET_FOREACH(vnet_iter) { db_printf("%p %3d %5d", vnet_iter, vnet_iter->ifcnt, vnet_iter->sockcnt); - db_vnet_ptr(vnet_iter->mod_data[VNET_MOD_NET]); - db_vnet_ptr(vnet_iter->mod_data[VNET_MOD_INET]); - db_vnet_ptr(vnet_iter->mod_data[VNET_MOD_INET6]); - db_vnet_ptr(vnet_iter->mod_data[VNET_MOD_IPSEC]); - db_vnet_ptr(vnet_iter->mod_data[VNET_MOD_NETGRAPH]); db_printf("\n"); } } #endif -#endif /* VIMAGE */ diff --git a/sys/kern/link_elf.c b/sys/kern/link_elf.c index cd0f3e9..b389ace 100644 --- a/sys/kern/link_elf.c +++ b/sys/kern/link_elf.c @@ -49,6 +49,8 @@ __FBSDID("$FreeBSD$"); #include +#include + #include #include @@ -111,6 +113,11 @@ typedef struct elf_file { Elf_Addr pcpu_start; /* Pre-relocation pcpu set start. */ Elf_Addr pcpu_stop; /* Pre-relocation pcpu set stop. */ Elf_Addr pcpu_base; /* Relocated pcpu set address. */ +#ifdef VIMAGE + Elf_Addr vnet_start; /* Pre-relocation vnet set start. */ + Elf_Addr vnet_stop; /* Pre-relocation vnet set stop. */ + Elf_Addr vnet_base; /* Relocated vnet set address. */ +#endif #ifdef GDB struct link_map gdb; /* hooks for gdb */ #endif @@ -506,6 +513,36 @@ parse_dpcpu(elf_file_t ef) return (0); } +#ifdef VIMAGE +static int +parse_vnet(elf_file_t ef) +{ + int count; + int error; + + ef->vnet_start = 0; + ef->vnet_stop = 0; + error = link_elf_lookup_set(&ef->lf, "vnet", (void ***)&ef->vnet_start, + (void ***)&ef->vnet_stop, &count); + /* Error just means there is no vnet data set to relocate. */ + if (error) + return (0); + count *= sizeof(void *); + /* + * Allocate space in the primary vnet area. 
Copy in our initialization + * from the data section and then initialize all per-vnet storage from + * that. + */ + ef->vnet_base = (Elf_Addr)(uintptr_t)vnet_data_alloc(count); + if (ef->vnet_base == (Elf_Addr)NULL) + return (ENOSPC); + memcpy((void *)ef->vnet_base, (void *)ef->vnet_start, count); + vnet_data_copy((void *)ef->vnet_base, count); + + return (0); +} +#endif + static int link_elf_link_preload(linker_class_t cls, const char* filename, linker_file_t *result) @@ -553,6 +590,10 @@ link_elf_link_preload(linker_class_t cls, error = parse_dynamic(ef); if (error == 0) error = parse_dpcpu(ef); +#ifdef VIMAGE + if (error == 0) + error = parse_vnet(ef); +#endif if (error) { linker_file_unload(lf, LINKER_UNLOAD_FORCE); return error; @@ -838,6 +879,11 @@ link_elf_load_file(linker_class_t cls, const char* filename, error = parse_dpcpu(ef); if (error) goto out; +#ifdef VIMAGE + error = parse_vnet(ef); + if (error) + goto out; +#endif link_elf_reloc_local(lf); VOP_UNLOCK(nd.ni_vp, 0); @@ -942,6 +988,10 @@ elf_relocaddr(linker_file_t lf, Elf_Addr x) ef = (elf_file_t)lf; if (x >= ef->pcpu_start && x < ef->pcpu_stop) return ((x - ef->pcpu_start) + ef->pcpu_base); +#ifdef VIMAGE + if (x >= ef->vnet_start && x < ef->vnet_stop) + return ((x - ef->vnet_start) + ef->vnet_base); +#endif return (x); } @@ -954,6 +1004,11 @@ link_elf_unload_file(linker_file_t file) if (ef->pcpu_base) { dpcpu_free((void *)ef->pcpu_base, ef->pcpu_stop - ef->pcpu_start); } +#ifdef VIMAGE + if (ef->vnet_base) { + vnet_data_free((void *)ef->vnet_base, ef->vnet_stop - ef->vnet_start); + } +#endif #ifdef GDB if (ef->gdb.l_ld) { GDB_STATE(RT_DELETE); diff --git a/sys/kern/link_elf_obj.c b/sys/kern/link_elf_obj.c index 9d4d70c..afcdd63 100644 --- a/sys/kern/link_elf_obj.c +++ b/sys/kern/link_elf_obj.c @@ -45,6 +45,8 @@ __FBSDID("$FreeBSD$"); #include +#include + #include #include @@ -346,6 +348,21 @@ link_elf_link_preload(linker_class_t cls, const char *filename, ef->progtab[pb].size); dpcpu_copy(dpcpu, 
shdr[i].sh_size); ef->progtab[pb].addr = dpcpu; +#ifdef VIMAGE + } else if (ef->progtab[pb].name != NULL && + !strcmp(ef->progtab[pb].name, "set_vnet")) { + void *vnet_data; + + vnet_data = vnet_data_alloc(shdr[i].sh_size); + if (vnet_data == NULL) { + error = ENOSPC; + goto out; + } + memcpy(vnet_data, ef->progtab[pb].addr, + ef->progtab[pb].size); + vnet_data_copy(vnet_data, shdr[i].sh_size); + ef->progtab[pb].addr = vnet_data; +#endif } /* Update all symbol values with the offset. */ @@ -737,6 +754,12 @@ link_elf_load_file(linker_class_t cls, const char *filename, !strcmp(ef->progtab[pb].name, "set_pcpu")) ef->progtab[pb].addr = dpcpu_alloc(shdr[i].sh_size); +#ifdef VIMAGE + else if (ef->progtab[pb].name != NULL && + !strcmp(ef->progtab[pb].name, "set_vnet")) + ef->progtab[pb].addr = + vnet_data_alloc(shdr[i].sh_size); +#endif else ef->progtab[pb].addr = (void *)(uintptr_t)mapbase; @@ -758,10 +781,21 @@ link_elf_load_file(linker_class_t cls, const char *filename, error = EINVAL; goto out; } - /* Initialize the per-cpu area. */ - if (ef->progtab[pb].addr != (void *)mapbase) + /* Initialize the per-cpu or vnet area. 
*/ + if (ef->progtab[pb].addr != (void *)mapbase && + !strcmp(ef->progtab[pb].name, "set_pcpu")) dpcpu_copy(ef->progtab[pb].addr, shdr[i].sh_size); +#ifdef VIMAGE + else if (ef->progtab[pb].addr != + (void *)mapbase && + !strcmp(ef->progtab[pb].name, "set_vnet")) + vnet_data_copy(ef->progtab[pb].addr, + shdr[i].sh_size); +#endif + else + panic("link_elf_load_file: unexpected " + "progbits type"); } else bzero(ef->progtab[pb].addr, shdr[i].sh_size); @@ -877,6 +911,11 @@ link_elf_unload_file(linker_file_t file) if (!strcmp(ef->progtab[i].name, "set_pcpu")) dpcpu_free(ef->progtab[i].addr, ef->progtab[i].size); +#ifdef VIMAGE + else if (!strcmp(ef->progtab[i].name, "set_vnet")) + vnet_data_free(ef->progtab[i].addr, + ef->progtab[i].size); +#endif } } if (ef->preloaded) { diff --git a/sys/kern/uipc_domain.c b/sys/kern/uipc_domain.c index 9ee6047..88322d5 100644 --- a/sys/kern/uipc_domain.c +++ b/sys/kern/uipc_domain.c @@ -106,14 +106,12 @@ struct pr_usrreqs nousrreqs = { .pru_sopoll = pru_sopoll_notsupp, }; -#ifndef VIMAGE_GLOBALS +#ifdef VIMAGE vnet_modinfo_t vnet_domain_modinfo = { .vmi_id = VNET_MOD_DOMAIN, .vmi_name = "domain", .vmi_iattach = net_init_domain, -#ifdef VIMAGE .vmi_idetach = net_detach_domain, -#endif }; #endif @@ -249,7 +247,7 @@ net_add_domain(void *data) "domainfinalize()\n", dp->dom_name); #endif mtx_unlock(&dom_mtx); -#ifndef VIMAGE_GLOBALS +#ifdef VIMAGE vnet_mod_register_multi(&vnet_domain_modinfo, dp, dp->dom_name); #else net_init_domain(dp); -- cgit v1.1