diff options
Diffstat (limited to 'sys/kern')
40 files changed, 995 insertions, 605 deletions
diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c index c30e1a2..00b1c3f 100644 --- a/sys/kern/init_main.c +++ b/sys/kern/init_main.c @@ -498,7 +498,7 @@ proc0_init(void *dummy __unused) strncpy(p->p_comm, "kernel", sizeof (p->p_comm)); strncpy(td->td_name, "swapper", sizeof (td->td_name)); - callout_init(&p->p_itcallout, CALLOUT_MPSAFE); + callout_init_mtx(&p->p_itcallout, &p->p_mtx, 0); callout_init_mtx(&p->p_limco, &p->p_mtx, 0); callout_init(&td->td_slpcallout, CALLOUT_MPSAFE); diff --git a/sys/kern/kern_conf.c b/sys/kern/kern_conf.c index 288fac5..c04d1da 100644 --- a/sys/kern/kern_conf.c +++ b/sys/kern/kern_conf.c @@ -698,6 +698,13 @@ prep_devname(struct cdev *dev, const char *fmt, va_list ap) ; for (to = dev->si_name; *from != '\0'; from++, to++) { + /* + * Spaces and double quotation marks cause + * problems for the devctl(4) protocol. + * Reject names containing those characters. + */ + if (isspace(*from) || *from == '"') + return (EINVAL); /* Treat multiple sequential slashes as single. */ while (from[0] == '/' && from[1] == '/') from++; diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c index 1eb18f1..6dae173 100644 --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -133,12 +133,26 @@ static int fill_socket_info(struct socket *so, struct kinfo_file *kif); static int fill_vnode_info(struct vnode *vp, struct kinfo_file *kif); /* - * A process is initially started out with NDFILE descriptors stored within - * this structure, selected to be enough for typical applications based on - * the historical limit of 20 open files (and the usage of descriptors by - * shells). If these descriptors are exhausted, a larger descriptor table - * may be allocated, up to a process' resource limit; the internal arrays - * are then unused. 
+ * Each process has: + * + * - An array of open file descriptors (fd_ofiles) + * - An array of file flags (fd_ofileflags) + * - A bitmap recording which descriptors are in use (fd_map) + * + * A process starts out with NDFILE descriptors. The value of NDFILE has + * been selected based on the historical limit of 20 open files, and an + * assumption that the majority of processes, especially short-lived + * processes like shells, will never need more. + * + * If this initial allocation is exhausted, a larger descriptor table and + * map are allocated dynamically, and the pointers in the process's struct + * filedesc are updated to point to those. This is repeated every time + * the process runs out of file descriptors (provided it hasn't hit its + * resource limit). + * + * Since threads may hold references to individual descriptor table + * entries, the tables are never freed. Instead, they are placed on a + * linked list and freed only when the struct filedesc is released. */ #define NDFILE 20 #define NDSLOTSIZE sizeof(NDSLOTTYPE) @@ -148,34 +162,23 @@ static int fill_vnode_info(struct vnode *vp, struct kinfo_file *kif); #define NDSLOTS(x) (((x) + NDENTRIES - 1) / NDENTRIES) /* - * Storage required per open file descriptor. - */ -#define OFILESIZE (sizeof(struct file *) + sizeof(char)) - -/* - * Storage to hold unused ofiles that need to be reclaimed. + * SLIST entry used to keep track of ofiles which must be reclaimed when + * the process exits. */ struct freetable { - struct file **ft_table; + struct file **ft_table; SLIST_ENTRY(freetable) ft_next; }; /* - * Basic allocation of descriptors: - * one of the above, plus arrays for NDFILE descriptors. + * Initial allocation: a filedesc structure + the head of SLIST used to + * keep track of old ofiles + enough space for NDFILE descriptors. */ struct filedesc0 { - struct filedesc fd_fd; - /* - * ofiles which need to be reclaimed on free. 
- */ - SLIST_HEAD(,freetable) fd_free; - /* - * These arrays are used when the number of open files is - * <= NDFILE, and are then pointed to by the pointers above. - */ - struct file *fd_dfiles[NDFILE]; - char fd_dfileflags[NDFILE]; + struct filedesc fd_fd; + SLIST_HEAD(, freetable) fd_free; + struct file *fd_dfiles[NDFILE]; + char fd_dfileflags[NDFILE]; NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)]; }; @@ -1414,58 +1417,74 @@ static void fdgrowtable(struct filedesc *fdp, int nfd) { struct filedesc0 *fdp0; - struct freetable *fo; + struct freetable *ft; struct file **ntable; struct file **otable; - char *nfileflags; + char *nfileflags, *ofileflags; int nnfiles, onfiles; - NDSLOTTYPE *nmap; + NDSLOTTYPE *nmap, *omap; FILEDESC_XLOCK_ASSERT(fdp); KASSERT(fdp->fd_nfiles > 0, ("zero-length file table")); - /* compute the size of the new table */ + /* save old values */ onfiles = fdp->fd_nfiles; + otable = fdp->fd_ofiles; + ofileflags = fdp->fd_ofileflags; + omap = fdp->fd_map; + + /* compute the size of the new table */ nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */ if (nnfiles <= onfiles) /* the table is already large enough */ return; - /* allocate a new table and (if required) new bitmaps */ - ntable = malloc((nnfiles * OFILESIZE) + sizeof(struct freetable), + /* + * Allocate a new table and map. We need enough space for a) the + * file entries themselves, b) the file flags, and c) the struct + * freetable we will use when we decommission the table and place + * it on the freelist. We place the struct freetable in the + * middle so we don't have to worry about padding. 
+ */ + ntable = malloc(nnfiles * sizeof(*ntable) + + sizeof(struct freetable) + + nnfiles * sizeof(*nfileflags), + M_FILEDESC, M_ZERO | M_WAITOK); + nfileflags = (char *)&ntable[nnfiles] + sizeof(struct freetable); + nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC, M_ZERO | M_WAITOK); - nfileflags = (char *)&ntable[nnfiles]; - if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) - nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, - M_FILEDESC, M_ZERO | M_WAITOK); - else - nmap = NULL; - bcopy(fdp->fd_ofiles, ntable, onfiles * sizeof(*ntable)); - bcopy(fdp->fd_ofileflags, nfileflags, onfiles); - otable = fdp->fd_ofiles; - fdp->fd_ofileflags = nfileflags; + /* copy the old data over and point at the new tables */ + memcpy(ntable, otable, onfiles * sizeof(*otable)); + memcpy(nfileflags, ofileflags, onfiles * sizeof(*ofileflags)); + memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap)); + + /* update the pointers and counters */ + fdp->fd_nfiles = nnfiles; fdp->fd_ofiles = ntable; + fdp->fd_ofileflags = nfileflags; + fdp->fd_map = nmap; + /* - * We must preserve ofiles until the process exits because we can't - * be certain that no threads have references to the old table via - * _fget(). + * Do not free the old file table, as some threads may still + * reference entries within it. Instead, place it on a freelist + * which will be processed when the struct filedesc is released. + * + * Do, however, free the old map. + * + * Note that if onfiles == NDFILE, we're dealing with the original + * static allocation contained within (struct filedesc0 *)fdp, + * which must not be freed. 
*/ if (onfiles > NDFILE) { - fo = (struct freetable *)&otable[onfiles]; + ft = (struct freetable *)&otable[onfiles]; fdp0 = (struct filedesc0 *)fdp; - fo->ft_table = otable; - SLIST_INSERT_HEAD(&fdp0->fd_free, fo, ft_next); - } - if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) { - bcopy(fdp->fd_map, nmap, NDSLOTS(onfiles) * sizeof(*nmap)); - if (NDSLOTS(onfiles) > NDSLOTS(NDFILE)) - free(fdp->fd_map, M_FILEDESC); - fdp->fd_map = nmap; + ft->ft_table = otable; + SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next); + free(omap, M_FILEDESC); } - fdp->fd_nfiles = nnfiles; } /* diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c index 5dc43ca..0d2709f 100644 --- a/sys/kern/kern_fork.c +++ b/sys/kern/kern_fork.c @@ -150,11 +150,7 @@ sys_vfork(struct thread *td, struct vfork_args *uap) int error, flags; struct proc *p2; -#ifdef XEN - flags = RFFDG | RFPROC; /* validate that this is still an issue */ -#else flags = RFFDG | RFPROC | RFPPWAIT | RFMEM; -#endif error = fork1(td, flags, 0, &p2, NULL, 0); if (error == 0) { td->td_retval[0] = p2->p_pid; @@ -591,7 +587,7 @@ do_fork(struct thread *td, int flags, struct proc *p2, struct thread *td2, LIST_INIT(&p2->p_children); LIST_INIT(&p2->p_orphans); - callout_init(&p2->p_itcallout, CALLOUT_MPSAFE); + callout_init_mtx(&p2->p_itcallout, &p2->p_mtx, 0); /* * If PF_FORK is set, the child process inherits the diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c index 2ff64d5..c624283 100644 --- a/sys/kern/kern_jail.c +++ b/sys/kern/kern_jail.c @@ -4518,6 +4518,8 @@ prison_racct_detach(struct prison *pr) sx_assert(&allprison_lock, SA_UNLOCKED); + if (pr->pr_prison_racct == NULL) + return; prison_racct_free(pr->pr_prison_racct); pr->pr_prison_racct = NULL; } diff --git a/sys/kern/kern_ktr.c b/sys/kern/kern_ktr.c index 6498498..a83cedf 100644 --- a/sys/kern/kern_ktr.c +++ b/sys/kern/kern_ktr.c @@ -112,7 +112,7 @@ static SYSCTL_NODE(_debug, OID_AUTO, ktr, CTLFLAG_RD, 0, "KTR options"); SYSCTL_INT(_debug_ktr, OID_AUTO, version, CTLFLAG_RD, 
&ktr_version, 0, "Version of the KTR interface"); -SYSCTL_INT(_debug_ktr, OID_AUTO, compile, CTLFLAG_RD, +SYSCTL_UINT(_debug_ktr, OID_AUTO, compile, CTLFLAG_RD, &ktr_compile, 0, "Bitmask of KTR event classes compiled into the kernel"); static void @@ -190,8 +190,8 @@ sysctl_debug_ktr_mask(SYSCTL_HANDLER_ARGS) return (error); } -SYSCTL_PROC(_debug_ktr, OID_AUTO, mask, CTLTYPE_INT|CTLFLAG_RW, 0, 0, - sysctl_debug_ktr_mask, "I", +SYSCTL_PROC(_debug_ktr, OID_AUTO, mask, CTLTYPE_UINT|CTLFLAG_RW, 0, 0, + sysctl_debug_ktr_mask, "IU", "Bitmask of KTR event classes for which logging is enabled"); static int diff --git a/sys/kern/kern_lock.c b/sys/kern/kern_lock.c index 8b428bd..98f0156 100644 --- a/sys/kern/kern_lock.c +++ b/sys/kern/kern_lock.c @@ -35,6 +35,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> +#include <sys/kdb.h> #include <sys/ktr.h> #include <sys/lock.h> #include <sys/lock_profile.h> @@ -477,7 +478,7 @@ __lockmgr_args(struct lock *lk, u_int flags, struct lock_object *ilk, KASSERT((flags & LK_INTERLOCK) == 0 || ilk != NULL, ("%s: LK_INTERLOCK passed without valid interlock @ %s:%d", __func__, file, line)); - KASSERT(!TD_IS_IDLETHREAD(curthread), + KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("%s: idle thread %p on lockmgr %s @ %s:%d", __func__, curthread, lk->lock_object.lo_name, file, line)); @@ -934,9 +935,19 @@ __lockmgr_args(struct lock *lk, u_int flags, struct lock_object *ilk, } break; case LK_DOWNGRADE: - _lockmgr_assert(lk, KA_XLOCKED | KA_NOTRECURSED, file, line); + _lockmgr_assert(lk, KA_XLOCKED, file, line); LOCK_LOG_LOCK("XDOWNGRADE", &lk->lock_object, 0, 0, file, line); WITNESS_DOWNGRADE(&lk->lock_object, 0, file, line); + + /* + * Panic if the lock is recursed. 
+ */ + if (lockmgr_xlocked(lk) && lockmgr_recursed(lk)) { + if (flags & LK_INTERLOCK) + class->lc_unlock(ilk); + panic("%s: downgrade a recursed lockmgr %s @ %s:%d\n", + __func__, iwmesg, file, line); + } TD_SLOCKS_INC(curthread); /* @@ -1254,7 +1265,14 @@ _lockmgr_disown(struct lock *lk, const char *file, int line) return; tid = (uintptr_t)curthread; - _lockmgr_assert(lk, KA_XLOCKED | KA_NOTRECURSED, file, line); + _lockmgr_assert(lk, KA_XLOCKED, file, line); + + /* + * Panic if the lock is recursed. + */ + if (lockmgr_xlocked(lk) && lockmgr_recursed(lk)) + panic("%s: disown a recursed lockmgr @ %s:%d\n", + __func__, file, line); /* * If the owner is already LK_KERNPROC just skip the whole operation. diff --git a/sys/kern/kern_mbuf.c b/sys/kern/kern_mbuf.c index fe91e3e..3bdfd88 100644 --- a/sys/kern/kern_mbuf.c +++ b/sys/kern/kern_mbuf.c @@ -1,6 +1,6 @@ /*- * Copyright (c) 2004, 2005, - * Bosko Milekic <bmilekic@FreeBSD.org>. All rights reserved. + * Bosko Milekic <bmilekic@FreeBSD.org>. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -76,7 +76,7 @@ __FBSDID("$FreeBSD$"); * [ Cluster Zone ] [ Zone ] [ Mbuf Master Zone ] * | \________ | * [ Cluster Keg ] \ / - * | [ Mbuf Keg ] + * | [ Mbuf Keg ] * [ Cluster Slabs ] | * | [ Mbuf Slabs ] * \____________(VM)_________________/ @@ -137,8 +137,7 @@ tunable_mbinit(void *dummy) TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs); if (nmbufs < nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) nmbufs = lmax(maxmbufmem / MSIZE / 5, - nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16); - + nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16); } SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_MIDDLE, tunable_mbinit, NULL); @@ -148,7 +147,7 @@ sysctl_nmbclusters(SYSCTL_HANDLER_ARGS) int error, newnmbclusters; newnmbclusters = nmbclusters; - error = sysctl_handle_int(oidp, &newnmbclusters, 0, req); + error = sysctl_handle_int(oidp, &newnmbclusters, 0, req); if (error == 0 && req->newptr) { if (newnmbclusters > nmbclusters && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { @@ -171,7 +170,7 @@ sysctl_nmbjumbop(SYSCTL_HANDLER_ARGS) int error, newnmbjumbop; newnmbjumbop = nmbjumbop; - error = sysctl_handle_int(oidp, &newnmbjumbop, 0, req); + error = sysctl_handle_int(oidp, &newnmbjumbop, 0, req); if (error == 0 && req->newptr) { if (newnmbjumbop > nmbjumbop && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { @@ -185,8 +184,7 @@ sysctl_nmbjumbop(SYSCTL_HANDLER_ARGS) } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbop, CTLTYPE_INT|CTLFLAG_RW, &nmbjumbop, 0, sysctl_nmbjumbop, "IU", - "Maximum number of mbuf page size jumbo clusters allowed"); - + "Maximum number of mbuf page size jumbo clusters allowed"); static int sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS) @@ -194,7 +192,7 @@ sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS) int error, newnmbjumbo9; newnmbjumbo9 = nmbjumbo9; - error = sysctl_handle_int(oidp, &newnmbjumbo9, 0, 
req); + error = sysctl_handle_int(oidp, &newnmbjumbo9, 0, req); if (error == 0 && req->newptr) { if (newnmbjumbo9 > nmbjumbo9&& nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { @@ -208,7 +206,7 @@ sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS) } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo9, CTLTYPE_INT|CTLFLAG_RW, &nmbjumbo9, 0, sysctl_nmbjumbo9, "IU", - "Maximum number of mbuf 9k jumbo clusters allowed"); + "Maximum number of mbuf 9k jumbo clusters allowed"); static int sysctl_nmbjumbo16(SYSCTL_HANDLER_ARGS) @@ -216,7 +214,7 @@ sysctl_nmbjumbo16(SYSCTL_HANDLER_ARGS) int error, newnmbjumbo16; newnmbjumbo16 = nmbjumbo16; - error = sysctl_handle_int(oidp, &newnmbjumbo16, 0, req); + error = sysctl_handle_int(oidp, &newnmbjumbo16, 0, req); if (error == 0 && req->newptr) { if (newnmbjumbo16 > nmbjumbo16 && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { @@ -238,7 +236,7 @@ sysctl_nmbufs(SYSCTL_HANDLER_ARGS) int error, newnmbufs; newnmbufs = nmbufs; - error = sysctl_handle_int(oidp, &newnmbufs, 0, req); + error = sysctl_handle_int(oidp, &newnmbufs, 0, req); if (error == 0 && req->newptr) { if (newnmbufs > nmbufs) { nmbufs = newnmbufs; @@ -254,7 +252,6 @@ SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbuf, CTLTYPE_INT|CTLFLAG_RW, &nmbufs, 0, sysctl_nmbufs, "IU", "Maximum number of mbufs allowed"); - SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat, "Mbuf general information and statistics"); @@ -307,10 +304,9 @@ mbuf_init(void *dummy) NULL, NULL, #endif MSIZE - 1, UMA_ZONE_MAXBUCKET); - if (nmbufs > 0) { - uma_zone_set_max(zone_mbuf, nmbufs); - nmbufs = uma_zone_get_max(zone_mbuf); - } + if (nmbufs > 0) + nmbufs = uma_zone_set_max(zone_mbuf, nmbufs); + uma_zone_set_warning(zone_mbuf, "kern.ipc.nmbufs limit reached"); zone_clust = uma_zcreate(MBUF_CLUSTER_MEM_NAME, MCLBYTES, mb_ctor_clust, mb_dtor_clust, @@ -320,10 +316,9 @@ mbuf_init(void *dummy) NULL, NULL, #endif UMA_ALIGN_PTR, UMA_ZONE_REFCNT); - if (nmbclusters > 0) { - 
uma_zone_set_max(zone_clust, nmbclusters); - nmbclusters = uma_zone_get_max(zone_clust); - } + if (nmbclusters > 0) + nmbclusters = uma_zone_set_max(zone_clust, nmbclusters); + uma_zone_set_warning(zone_clust, "kern.ipc.nmbclusters limit reached"); zone_pack = uma_zsecond_create(MBUF_PACKET_MEM_NAME, mb_ctor_pack, mb_dtor_pack, mb_zinit_pack, mb_zfini_pack, zone_mbuf); @@ -337,10 +332,9 @@ mbuf_init(void *dummy) NULL, NULL, #endif UMA_ALIGN_PTR, UMA_ZONE_REFCNT); - if (nmbjumbop > 0) { - uma_zone_set_max(zone_jumbop, nmbjumbop); - nmbjumbop = uma_zone_get_max(zone_jumbop); - } + if (nmbjumbop > 0) + nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop); + uma_zone_set_warning(zone_jumbop, "kern.ipc.nmbjumbop limit reached"); zone_jumbo9 = uma_zcreate(MBUF_JUMBO9_MEM_NAME, MJUM9BYTES, mb_ctor_clust, mb_dtor_clust, @@ -351,10 +345,9 @@ mbuf_init(void *dummy) #endif UMA_ALIGN_PTR, UMA_ZONE_REFCNT); uma_zone_set_allocf(zone_jumbo9, mbuf_jumbo_alloc); - if (nmbjumbo9 > 0) { - uma_zone_set_max(zone_jumbo9, nmbjumbo9); - nmbjumbo9 = uma_zone_get_max(zone_jumbo9); - } + if (nmbjumbo9 > 0) + nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9); + uma_zone_set_warning(zone_jumbo9, "kern.ipc.nmbjumbo9 limit reached"); zone_jumbo16 = uma_zcreate(MBUF_JUMBO16_MEM_NAME, MJUM16BYTES, mb_ctor_clust, mb_dtor_clust, @@ -365,10 +358,9 @@ mbuf_init(void *dummy) #endif UMA_ALIGN_PTR, UMA_ZONE_REFCNT); uma_zone_set_allocf(zone_jumbo16, mbuf_jumbo_alloc); - if (nmbjumbo16 > 0) { - uma_zone_set_max(zone_jumbo16, nmbjumbo16); - nmbjumbo16 = uma_zone_get_max(zone_jumbo16); - } + if (nmbjumbo16 > 0) + nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16); + uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached"); zone_ext_refcnt = uma_zcreate(MBUF_EXTREFCNT_MEM_NAME, sizeof(u_int), NULL, NULL, @@ -488,7 +480,7 @@ static void mb_dtor_mbuf(void *mem, int size, void *arg) { struct mbuf *m; - unsigned long flags; + unsigned long flags; m = (struct mbuf *)mem; flags = (unsigned 
long)arg; diff --git a/sys/kern/kern_mib.c b/sys/kern/kern_mib.c index b0bc5c8..29864a8 100644 --- a/sys/kern/kern_mib.c +++ b/sys/kern/kern_mib.c @@ -377,15 +377,8 @@ SYSCTL_PROC(_kern, KERN_SECURELVL, securelevel, /* Actual kernel configuration options. */ extern char kernconfstring[]; -static int -sysctl_kern_config(SYSCTL_HANDLER_ARGS) -{ - return (sysctl_handle_string(oidp, kernconfstring, - strlen(kernconfstring), req)); -} - -SYSCTL_PROC(_kern, OID_AUTO, conftxt, CTLTYPE_STRING|CTLFLAG_RW, - 0, 0, sysctl_kern_config, "", "Kernel configuration file"); +SYSCTL_STRING(_kern, OID_AUTO, conftxt, CTLFLAG_RD, kernconfstring, 0, + "Kernel configuration file"); #endif static int diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c index 2f13863..39f461e 100644 --- a/sys/kern/kern_mutex.c +++ b/sys/kern/kern_mutex.c @@ -210,7 +210,7 @@ __mtx_lock_flags(volatile uintptr_t *c, int opts, const char *file, int line) m = mtxlock2mtx(c); - KASSERT(!TD_IS_IDLETHREAD(curthread), + KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("mtx_lock() by idle thread %p on sleep mutex %s @ %s:%d", curthread, m->lock_object.lo_name, file, line)); KASSERT(m->mtx_lock != MTX_DESTROYED, @@ -326,7 +326,7 @@ _mtx_trylock_flags_(volatile uintptr_t *c, int opts, const char *file, int line) m = mtxlock2mtx(c); - KASSERT(!TD_IS_IDLETHREAD(curthread), + KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("mtx_trylock() by idle thread %p on sleep mutex %s @ %s:%d", curthread, m->lock_object.lo_name, file, line)); KASSERT(m->mtx_lock != MTX_DESTROYED, diff --git a/sys/kern/kern_priv.c b/sys/kern/kern_priv.c index fd3a95c..2f70c2b 100644 --- a/sys/kern/kern_priv.c +++ b/sys/kern/kern_priv.c @@ -59,6 +59,11 @@ SYSCTL_INT(_security_bsd, OID_AUTO, suser_enabled, CTLFLAG_RW, &suser_enabled, 0, "processes with uid 0 have privilege"); TUNABLE_INT("security.bsd.suser_enabled", &suser_enabled); +static int unprivileged_mlock = 1; +SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_mlock, 
CTLFLAG_RW|CTLFLAG_TUN, + &unprivileged_mlock, 0, "Allow non-root users to call mlock(2)"); +TUNABLE_INT("security.bsd.unprivileged_mlock", &unprivileged_mlock); + SDT_PROVIDER_DEFINE(priv); SDT_PROBE_DEFINE1(priv, kernel, priv_check, priv_ok, priv-ok, "int"); SDT_PROBE_DEFINE1(priv, kernel, priv_check, priv_err, priv-err, "int"); @@ -93,6 +98,19 @@ priv_check_cred(struct ucred *cred, int priv, int flags) if (error) goto out; + if (unprivileged_mlock) { + /* + * Allow unprivileged users to call mlock(2)/munlock(2) and + * mlockall(2)/munlockall(2). + */ + switch (priv) { + case PRIV_VM_MLOCK: + case PRIV_VM_MUNLOCK: + error = 0; + goto out; + } + } + /* * Having determined if privilege is restricted by various policies, * now determine if privilege is granted. At this point, any policy diff --git a/sys/kern/kern_rmlock.c b/sys/kern/kern_rmlock.c index 30400b1..ebf0f76 100644 --- a/sys/kern/kern_rmlock.c +++ b/sys/kern/kern_rmlock.c @@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$"); #include <sys/systm.h> #include <sys/kernel.h> +#include <sys/kdb.h> #include <sys/ktr.h> #include <sys/lock.h> #include <sys/mutex.h> @@ -494,7 +495,7 @@ void _rm_wlock_debug(struct rmlock *rm, const char *file, int line) if (SCHEDULER_STOPPED()) return; - KASSERT(!TD_IS_IDLETHREAD(curthread), + KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("rm_wlock() by idle thread %p on rmlock %s @ %s:%d", curthread, rm->lock_object.lo_name, file, line)); WITNESS_CHECKORDER(&rm->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, @@ -539,7 +540,7 @@ _rm_rlock_debug(struct rmlock *rm, struct rm_priotracker *tracker, if (SCHEDULER_STOPPED()) return (1); - KASSERT(!TD_IS_IDLETHREAD(curthread), + KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("rm_rlock() by idle thread %p on rmlock %s @ %s:%d", curthread, rm->lock_object.lo_name, file, line)); if (!trylock && (rm->lock_object.lo_flags & RM_SLEEPABLE)) diff --git a/sys/kern/kern_rwlock.c b/sys/kern/kern_rwlock.c index 60a7faa..38d0654 100644 --- 
a/sys/kern/kern_rwlock.c +++ b/sys/kern/kern_rwlock.c @@ -40,6 +40,7 @@ __FBSDID("$FreeBSD$"); #include "opt_no_adaptive_rwlocks.h" #include <sys/param.h> +#include <sys/kdb.h> #include <sys/ktr.h> #include <sys/kernel.h> #include <sys/lock.h> @@ -258,7 +259,7 @@ _rw_wlock_cookie(volatile uintptr_t *c, const char *file, int line) rw = rwlock2rw(c); - KASSERT(!TD_IS_IDLETHREAD(curthread), + KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("rw_wlock() by idle thread %p on rwlock %s @ %s:%d", curthread, rw->lock_object.lo_name, file, line)); KASSERT(rw->rw_lock != RW_DESTROYED, @@ -282,7 +283,7 @@ __rw_try_wlock(volatile uintptr_t *c, const char *file, int line) rw = rwlock2rw(c); - KASSERT(!TD_IS_IDLETHREAD(curthread), + KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("rw_try_wlock() by idle thread %p on rwlock %s @ %s:%d", curthread, rw->lock_object.lo_name, file, line)); KASSERT(rw->rw_lock != RW_DESTROYED, @@ -364,7 +365,7 @@ __rw_rlock(volatile uintptr_t *c, const char *file, int line) rw = rwlock2rw(c); - KASSERT(!TD_IS_IDLETHREAD(curthread), + KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("rw_rlock() by idle thread %p on rwlock %s @ %s:%d", curthread, rw->lock_object.lo_name, file, line)); KASSERT(rw->rw_lock != RW_DESTROYED, @@ -558,7 +559,7 @@ __rw_try_rlock(volatile uintptr_t *c, const char *file, int line) rw = rwlock2rw(c); - KASSERT(!TD_IS_IDLETHREAD(curthread), + KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("rw_try_rlock() by idle thread %p on rwlock %s @ %s:%d", curthread, rw->lock_object.lo_name, file, line)); diff --git a/sys/kern/kern_shutdown.c b/sys/kern/kern_shutdown.c index b0e4839..fcbae28 100644 --- a/sys/kern/kern_shutdown.c +++ b/sys/kern/kern_shutdown.c @@ -55,6 +55,7 @@ __FBSDID("$FreeBSD$"); #include <sys/kernel.h> #include <sys/kerneldump.h> #include <sys/kthread.h> +#include <sys/ktr.h> #include <sys/malloc.h> #include <sys/mount.h> #include <sys/priv.h> @@ -150,6 +151,7 @@ static void 
poweroff_wait(void *, int); static void shutdown_halt(void *junk, int howto); static void shutdown_panic(void *junk, int howto); static void shutdown_reset(void *junk, int howto); +static void vpanic(const char *fmt, va_list ap) __dead2; /* register various local shutdown events */ static void @@ -538,6 +540,134 @@ shutdown_reset(void *junk, int howto) /* NOTREACHED */ /* assuming reset worked */ } +#if defined(WITNESS) || defined(INVARIANTS) +static int kassert_warn_only = 0; +#ifdef KDB +static int kassert_do_kdb = 0; +#endif +#ifdef KTR +static int kassert_do_ktr = 0; +#endif +static int kassert_do_log = 1; +static int kassert_log_pps_limit = 4; +static int kassert_log_mute_at = 0; +static int kassert_log_panic_at = 0; +static int kassert_warnings = 0; + +SYSCTL_NODE(_debug, OID_AUTO, kassert, CTLFLAG_RW, NULL, "kassert options"); + +SYSCTL_INT(_debug_kassert, OID_AUTO, warn_only, CTLFLAG_RW | CTLFLAG_TUN, + &kassert_warn_only, 0, + "KASSERT triggers a panic (1) or just a warning (0)"); +TUNABLE_INT("debug.kassert.warn_only", &kassert_warn_only); + +#ifdef KDB +SYSCTL_INT(_debug_kassert, OID_AUTO, do_kdb, CTLFLAG_RW | CTLFLAG_TUN, + &kassert_do_kdb, 0, "KASSERT will enter the debugger"); +TUNABLE_INT("debug.kassert.do_kdb", &kassert_do_kdb); +#endif + +#ifdef KTR +SYSCTL_UINT(_debug_kassert, OID_AUTO, do_ktr, CTLFLAG_RW | CTLFLAG_TUN, + &kassert_do_ktr, 0, + "KASSERT does a KTR, set this to the KTRMASK you want"); +TUNABLE_INT("debug.kassert.do_ktr", &kassert_do_ktr); +#endif + +SYSCTL_INT(_debug_kassert, OID_AUTO, do_log, CTLFLAG_RW | CTLFLAG_TUN, + &kassert_do_log, 0, "KASSERT triggers a panic (1) or just a warning (0)"); +TUNABLE_INT("debug.kassert.do_log", &kassert_do_log); + +SYSCTL_INT(_debug_kassert, OID_AUTO, warnings, CTLFLAG_RW | CTLFLAG_TUN, + &kassert_warnings, 0, "number of KASSERTs that have been triggered"); +TUNABLE_INT("debug.kassert.warnings", &kassert_warnings); + +SYSCTL_INT(_debug_kassert, OID_AUTO, log_panic_at, CTLFLAG_RW | CTLFLAG_TUN, + 
&kassert_log_panic_at, 0, "max number of KASSERTS before we will panic"); +TUNABLE_INT("debug.kassert.log_panic_at", &kassert_log_panic_at); + +SYSCTL_INT(_debug_kassert, OID_AUTO, log_pps_limit, CTLFLAG_RW | CTLFLAG_TUN, + &kassert_log_pps_limit, 0, "limit number of log messages per second"); +TUNABLE_INT("debug.kassert.log_pps_limit", &kassert_log_pps_limit); + +SYSCTL_INT(_debug_kassert, OID_AUTO, log_mute_at, CTLFLAG_RW | CTLFLAG_TUN, + &kassert_log_mute_at, 0, "max number of KASSERTS to log"); +TUNABLE_INT("debug.kassert.log_mute_at", &kassert_log_mute_at); + +static int kassert_sysctl_kassert(SYSCTL_HANDLER_ARGS); + +SYSCTL_PROC(_debug_kassert, OID_AUTO, kassert, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, NULL, 0, + kassert_sysctl_kassert, "I", "set to trigger a test kassert"); + +static int +kassert_sysctl_kassert(SYSCTL_HANDLER_ARGS) +{ + int error, i; + + error = sysctl_wire_old_buffer(req, sizeof(int)); + if (error == 0) { + i = 0; + error = sysctl_handle_int(oidp, &i, 0, req); + } + if (error != 0 || req->newptr == NULL) + return (error); + KASSERT(0, ("kassert_sysctl_kassert triggered kassert %d", i)); + return (0); +} + +/* + * Called by KASSERT, this decides if we will panic + * or if we will log via printf and/or ktr. + */ +void +kassert_panic(const char *fmt, ...) +{ + static char buf[256]; + va_list ap; + + va_start(ap, fmt); + (void)vsnprintf(buf, sizeof(buf), fmt, ap); + va_end(ap); + + /* + * panic if we're not just warning, or if we've exceeded + * kassert_log_panic_at warnings. + */ + if (!kassert_warn_only || + (kassert_log_panic_at > 0 && + kassert_warnings >= kassert_log_panic_at)) { + va_start(ap, fmt); + vpanic(fmt, ap); + /* NORETURN */ + } +#ifdef KTR + if (kassert_do_ktr) + CTR0(ktr_mask, buf); +#endif /* KTR */ + /* + * log if we've not yet met the mute limit. 
+ */ + if (kassert_do_log && + (kassert_log_mute_at == 0 || + kassert_warnings < kassert_log_mute_at)) { + static struct timeval lasterr; + static int curerr; + + if (ppsratecheck(&lasterr, &curerr, kassert_log_pps_limit)) { + printf("KASSERT failed: %s\n", buf); + kdb_backtrace(); + } + } +#ifdef KDB + if (kassert_do_kdb) { + kdb_enter(KDB_WHY_KASSERT, buf); + } +#endif + atomic_add_int(&kassert_warnings, 1); +} +#endif + /* * Panic is called on unresolvable fatal errors. It prints "panic: mesg", * and then reboots. If we are called twice, then we avoid trying to sync @@ -546,12 +676,20 @@ shutdown_reset(void *junk, int howto) void panic(const char *fmt, ...) { + va_list ap; + + va_start(ap, fmt); + vpanic(fmt, ap); +} + +static void +vpanic(const char *fmt, va_list ap) +{ #ifdef SMP cpuset_t other_cpus; #endif struct thread *td = curthread; int bootopt, newpanic; - va_list ap; static char buf[256]; spinlock_enter(); @@ -587,7 +725,6 @@ panic(const char *fmt, ...) newpanic = 1; } - va_start(ap, fmt); if (newpanic) { (void)vsnprintf(buf, sizeof(buf), fmt, ap); panicstr = buf; @@ -598,7 +735,6 @@ panic(const char *fmt, ...) 
vprintf(fmt, ap); printf("\n"); } - va_end(ap); #ifdef SMP printf("cpuid = %d\n", PCPU_GET(cpuid)); #endif diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c index 541ea2b..9c52707 100644 --- a/sys/kern/kern_sig.c +++ b/sys/kern/kern_sig.c @@ -106,7 +106,6 @@ SDT_PROBE_ARGTYPE(proc, kernel, , signal_discard, 1, "struct proc *"); SDT_PROBE_ARGTYPE(proc, kernel, , signal_discard, 2, "int"); static int coredump(struct thread *); -static char *expand_name(const char *, uid_t, pid_t, struct thread *, int); static int killpg1(struct thread *td, int sig, int pgid, int all, ksiginfo_t *ksi); static int issignal(struct thread *td, int stop_allowed); @@ -202,37 +201,37 @@ SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag, #define SA_CANTMASK 0x40 /* non-maskable, catchable */ static int sigproptbl[NSIG] = { - SA_KILL, /* SIGHUP */ - SA_KILL, /* SIGINT */ - SA_KILL|SA_CORE, /* SIGQUIT */ - SA_KILL|SA_CORE, /* SIGILL */ - SA_KILL|SA_CORE, /* SIGTRAP */ - SA_KILL|SA_CORE, /* SIGABRT */ - SA_KILL|SA_CORE, /* SIGEMT */ - SA_KILL|SA_CORE, /* SIGFPE */ - SA_KILL, /* SIGKILL */ - SA_KILL|SA_CORE, /* SIGBUS */ - SA_KILL|SA_CORE, /* SIGSEGV */ - SA_KILL|SA_CORE, /* SIGSYS */ - SA_KILL, /* SIGPIPE */ - SA_KILL, /* SIGALRM */ - SA_KILL, /* SIGTERM */ - SA_IGNORE, /* SIGURG */ - SA_STOP, /* SIGSTOP */ - SA_STOP|SA_TTYSTOP, /* SIGTSTP */ - SA_IGNORE|SA_CONT, /* SIGCONT */ - SA_IGNORE, /* SIGCHLD */ - SA_STOP|SA_TTYSTOP, /* SIGTTIN */ - SA_STOP|SA_TTYSTOP, /* SIGTTOU */ - SA_IGNORE, /* SIGIO */ - SA_KILL, /* SIGXCPU */ - SA_KILL, /* SIGXFSZ */ - SA_KILL, /* SIGVTALRM */ - SA_KILL, /* SIGPROF */ - SA_IGNORE, /* SIGWINCH */ - SA_IGNORE, /* SIGINFO */ - SA_KILL, /* SIGUSR1 */ - SA_KILL, /* SIGUSR2 */ + SA_KILL, /* SIGHUP */ + SA_KILL, /* SIGINT */ + SA_KILL|SA_CORE, /* SIGQUIT */ + SA_KILL|SA_CORE, /* SIGILL */ + SA_KILL|SA_CORE, /* SIGTRAP */ + SA_KILL|SA_CORE, /* SIGABRT */ + SA_KILL|SA_CORE, /* SIGEMT */ + SA_KILL|SA_CORE, /* SIGFPE */ + SA_KILL, /* SIGKILL 
*/ + SA_KILL|SA_CORE, /* SIGBUS */ + SA_KILL|SA_CORE, /* SIGSEGV */ + SA_KILL|SA_CORE, /* SIGSYS */ + SA_KILL, /* SIGPIPE */ + SA_KILL, /* SIGALRM */ + SA_KILL, /* SIGTERM */ + SA_IGNORE, /* SIGURG */ + SA_STOP, /* SIGSTOP */ + SA_STOP|SA_TTYSTOP, /* SIGTSTP */ + SA_IGNORE|SA_CONT, /* SIGCONT */ + SA_IGNORE, /* SIGCHLD */ + SA_STOP|SA_TTYSTOP, /* SIGTTIN */ + SA_STOP|SA_TTYSTOP, /* SIGTTOU */ + SA_IGNORE, /* SIGIO */ + SA_KILL, /* SIGXCPU */ + SA_KILL, /* SIGXFSZ */ + SA_KILL, /* SIGVTALRM */ + SA_KILL, /* SIGPROF */ + SA_IGNORE, /* SIGWINCH */ + SA_IGNORE, /* SIGINFO */ + SA_KILL, /* SIGUSR1 */ + SA_KILL, /* SIGUSR2 */ }; static void reschedule_signals(struct proc *p, sigset_t block, int flags); @@ -3018,11 +3017,11 @@ SYSCTL_PROC(_debug, OID_AUTO, ncores, CTLTYPE_INT|CTLFLAG_RW, #if defined(COMPRESS_USER_CORES) int compress_user_cores = 1; SYSCTL_INT(_kern, OID_AUTO, compress_user_cores, CTLFLAG_RW, - &compress_user_cores, 0, ""); + &compress_user_cores, 0, "Compression of user corefiles"); int compress_user_cores_gzlevel = -1; /* default level */ SYSCTL_INT(_kern, OID_AUTO, compress_user_cores_gzlevel, CTLFLAG_RW, - &compress_user_cores_gzlevel, -1, "user core gz compression level"); + &compress_user_cores_gzlevel, -1, "Corefile gzip compression level"); #define GZ_SUFFIX ".gz" #define GZ_SUFFIX_LEN 3 @@ -3031,11 +3030,12 @@ SYSCTL_INT(_kern, OID_AUTO, compress_user_cores_gzlevel, CTLFLAG_RW, static char corefilename[MAXPATHLEN] = {"%N.core"}; TUNABLE_STR("kern.corefile", corefilename, sizeof(corefilename)); SYSCTL_STRING(_kern, OID_AUTO, corefile, CTLFLAG_RW, corefilename, - sizeof(corefilename), "process corefile name format string"); + sizeof(corefilename), "Process corefile name format string"); /* - * expand_name(name, uid, pid, td, compress) - * Expand the name described in corefilename, using name, uid, and pid. 
+ * corefile_open(comm, uid, pid, td, compress, vpp, namep) + * Expand the name described in corefilename, using name, uid, and pid + * and open/create core file. * corefilename is a printf-like string, with three format specifiers: * %N name of process ("name") * %P process id (pid) @@ -3044,25 +3044,22 @@ SYSCTL_STRING(_kern, OID_AUTO, corefile, CTLFLAG_RW, corefilename, * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P". * This is controlled by the sysctl variable kern.corefile (see above). */ -static char * -expand_name(const char *name, uid_t uid, pid_t pid, struct thread *td, - int compress) +static int +corefile_open(const char *comm, uid_t uid, pid_t pid, struct thread *td, + int compress, struct vnode **vpp, char **namep) { + struct nameidata nd; struct sbuf sb; const char *format; - char *temp; - size_t i; - int indexpos; - char *hostname; + char *hostname, *name; + int indexpos, i, error, cmode, flags, oflags; hostname = NULL; format = corefilename; - temp = malloc(MAXPATHLEN, M_TEMP, M_NOWAIT | M_ZERO); - if (temp == NULL) - return (NULL); + name = malloc(MAXPATHLEN, M_TEMP, M_WAITOK | M_ZERO); indexpos = -1; - (void)sbuf_new(&sb, temp, MAXPATHLEN, SBUF_FIXEDLEN); - for (i = 0; format[i]; i++) { + (void)sbuf_new(&sb, name, MAXPATHLEN, SBUF_FIXEDLEN); + for (i = 0; format[i] != '\0'; i++) { switch (format[i]) { case '%': /* Format character */ i++; @@ -3073,27 +3070,18 @@ expand_name(const char *name, uid_t uid, pid_t pid, struct thread *td, case 'H': /* hostname */ if (hostname == NULL) { hostname = malloc(MAXHOSTNAMELEN, - M_TEMP, M_NOWAIT); - if (hostname == NULL) { - log(LOG_ERR, - "pid %ld (%s), uid (%lu): " - "unable to alloc memory " - "for corefile hostname\n", - (long)pid, name, - (u_long)uid); - goto nomem; - } - } + M_TEMP, M_WAITOK); + } getcredhostname(td->td_ucred, hostname, MAXHOSTNAMELEN); sbuf_printf(&sb, "%s", hostname); break; - case 'I': /* autoincrementing index */ + case 'I': /* autoincrementing index */ 
sbuf_printf(&sb, "0"); indexpos = sbuf_len(&sb) - 1; break; case 'N': /* process name */ - sbuf_printf(&sb, "%s", name); + sbuf_printf(&sb, "%s", comm); break; case 'P': /* process id */ sbuf_printf(&sb, "%u", pid); @@ -3105,6 +3093,7 @@ expand_name(const char *name, uid_t uid, pid_t pid, struct thread *td, log(LOG_ERR, "Unknown format character %c in " "corename `%s'\n", format[i], format); + break; } break; default: @@ -3113,21 +3102,22 @@ expand_name(const char *name, uid_t uid, pid_t pid, struct thread *td, } free(hostname, M_TEMP); #ifdef COMPRESS_USER_CORES - if (compress) { + if (compress) sbuf_printf(&sb, GZ_SUFFIX); - } #endif if (sbuf_error(&sb) != 0) { log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too " - "long\n", (long)pid, name, (u_long)uid); -nomem: + "long\n", (long)pid, comm, (u_long)uid); sbuf_delete(&sb); - free(temp, M_TEMP); - return (NULL); + free(name, M_TEMP); + return (ENOMEM); } sbuf_finish(&sb); sbuf_delete(&sb); + cmode = S_IRUSR | S_IWUSR; + oflags = VN_OPEN_NOAUDIT | (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0); + /* * If the core format has a %I in it, then we need to check * for existing corefiles before returning a name. @@ -3135,19 +3125,10 @@ nomem: * non-existing core file name to use. 
*/ if (indexpos != -1) { - struct nameidata nd; - int error, n; - int flags = O_CREAT | O_EXCL | FWRITE | O_NOFOLLOW; - int cmode = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP; - int oflags = 0; - - if (capmode_coredump) - oflags = VN_OPEN_NOCAPCHECK; - - for (n = 0; n < num_cores; n++) { - temp[indexpos] = '0' + n; - NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, - temp, td); + for (i = 0; i < num_cores; i++) { + flags = O_CREAT | O_EXCL | FWRITE | O_NOFOLLOW; + name[indexpos] = '0' + i; + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td); error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred, NULL); if (error) { @@ -3155,27 +3136,28 @@ nomem: continue; log(LOG_ERR, "pid %d (%s), uid (%u): Path `%s' failed " - "on initial open test, error = %d\n", - pid, name, uid, temp, error); - free(temp, M_TEMP); - return (NULL); - } - NDFREE(&nd, NDF_ONLY_PNBUF); - VOP_UNLOCK(nd.ni_vp, 0); - error = vn_close(nd.ni_vp, FWRITE, td->td_ucred, td); - if (error) { - log(LOG_ERR, - "pid %d (%s), uid (%u): Path `%s' failed " - "on close after initial open test, " - "error = %d\n", - pid, name, uid, temp, error); - free(temp, M_TEMP); - return (NULL); + "on initial open test, error = %d\n", + pid, comm, uid, name, error); } - break; + goto out; } } - return (temp); + + flags = O_CREAT | FWRITE | O_NOFOLLOW; + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td); + error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred, NULL); +out: + if (error) { +#ifdef AUDIT + audit_proc_coredump(td, name, error); +#endif + free(name, M_TEMP); + return (error); + } + NDFREE(&nd, NDF_ONLY_PNBUF); + *vpp = nd.ni_vp; + *namep = name; + return (0); } /* @@ -3190,12 +3172,11 @@ static int coredump(struct thread *td) { struct proc *p = td->td_proc; - register struct vnode *vp; - register struct ucred *cred = td->td_ucred; + struct ucred *cred = td->td_ucred; + struct vnode *vp; struct flock lf; - struct nameidata nd; struct vattr vattr; - int error, error1, flags, locked; + int error, error1, 
locked; struct mount *mp; char *name; /* name of corefile */ off_t limit; @@ -3210,22 +3191,8 @@ coredump(struct thread *td) MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td); _STOPEVENT(p, S_CORE, 0); - name = expand_name(p->p_comm, td->td_ucred->cr_uid, p->p_pid, td, - compress); - if (name == NULL) { - PROC_UNLOCK(p); -#ifdef AUDIT - audit_proc_coredump(td, NULL, EINVAL); -#endif - return (EINVAL); - } - if (((sugid_coredump == 0) && p->p_flag & P_SUGID) || - do_coredump == 0) { + if (!do_coredump || (!sugid_coredump && (p->p_flag & P_SUGID) != 0)) { PROC_UNLOCK(p); -#ifdef AUDIT - audit_proc_coredump(td, name, EFAULT); -#endif - free(name, M_TEMP); return (EFAULT); } @@ -3240,33 +3207,19 @@ coredump(struct thread *td) limit = (off_t)lim_cur(p, RLIMIT_CORE); if (limit == 0 || racct_get_available(p, RACCT_CORE) == 0) { PROC_UNLOCK(p); -#ifdef AUDIT - audit_proc_coredump(td, name, EFBIG); -#endif - free(name, M_TEMP); return (EFBIG); } PROC_UNLOCK(p); restart: - NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td); - flags = O_CREAT | FWRITE | O_NOFOLLOW; - error = vn_open_cred(&nd, &flags, S_IRUSR | S_IWUSR, - VN_OPEN_NOAUDIT | (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0), - cred, NULL); - if (error) { -#ifdef AUDIT - audit_proc_coredump(td, name, error); -#endif - free(name, M_TEMP); + error = corefile_open(p->p_comm, cred->cr_uid, p->p_pid, td, compress, + &vp, &name); + if (error != 0) return (error); - } - NDFREE(&nd, NDF_ONLY_PNBUF); - vp = nd.ni_vp; /* Don't dump to non-regular files or files with links. 
*/ - if (vp->v_type != VREG || - VOP_GETATTR(vp, &vattr, cred) || vattr.va_nlink != 1) { + if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred) != 0 || + vattr.va_nlink != 1) { VOP_UNLOCK(vp, 0); error = EFAULT; goto close; diff --git a/sys/kern/kern_switch.c b/sys/kern/kern_switch.c index 885dc22..d0009b1 100644 --- a/sys/kern/kern_switch.c +++ b/sys/kern/kern_switch.c @@ -176,6 +176,12 @@ retry: /* * Kernel thread preemption implementation. Critical sections mark * regions of code in which preemptions are not allowed. + * + * It might seem a good idea to inline critical_enter() but, in order + * to prevent instructions reordering by the compiler, a __compiler_membar() + * would have to be used here (the same as sched_pin()). The performance + * penalty imposed by the membar could, then, produce slower code than + * the function call itself, for most cases. */ void critical_enter(void) diff --git a/sys/kern/kern_sx.c b/sys/kern/kern_sx.c index af2391f..e469b1a 100644 --- a/sys/kern/kern_sx.c +++ b/sys/kern/kern_sx.c @@ -46,6 +46,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> +#include <sys/kdb.h> #include <sys/ktr.h> #include <sys/lock.h> #include <sys/mutex.h> @@ -249,7 +250,7 @@ _sx_slock(struct sx *sx, int opts, const char *file, int line) if (SCHEDULER_STOPPED()) return (0); - KASSERT(!TD_IS_IDLETHREAD(curthread), + KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("sx_slock() by idle thread %p on sx %s @ %s:%d", curthread, sx->lock_object.lo_name, file, line)); KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, @@ -273,7 +274,7 @@ sx_try_slock_(struct sx *sx, const char *file, int line) if (SCHEDULER_STOPPED()) return (1); - KASSERT(!TD_IS_IDLETHREAD(curthread), + KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("sx_try_slock() by idle thread %p on sx %s @ %s:%d", curthread, sx->lock_object.lo_name, file, line)); @@ -302,7 +303,7 @@ _sx_xlock(struct sx *sx, int opts, const char *file, int line) if (SCHEDULER_STOPPED()) 
return (0); - KASSERT(!TD_IS_IDLETHREAD(curthread), + KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("sx_xlock() by idle thread %p on sx %s @ %s:%d", curthread, sx->lock_object.lo_name, file, line)); KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, @@ -328,7 +329,7 @@ sx_try_xlock_(struct sx *sx, const char *file, int line) if (SCHEDULER_STOPPED()) return (1); - KASSERT(!TD_IS_IDLETHREAD(curthread), + KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("sx_try_xlock() by idle thread %p on sx %s @ %s:%d", curthread, sx->lock_object.lo_name, file, line)); KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, diff --git a/sys/kern/kern_time.c b/sys/kern/kern_time.c index c0e7831..97c288d 100644 --- a/sys/kern/kern_time.c +++ b/sys/kern/kern_time.c @@ -788,13 +788,11 @@ realitexpire(void *arg) struct timeval ctv, ntv; p = (struct proc *)arg; - PROC_LOCK(p); kern_psignal(p, SIGALRM); if (!timevalisset(&p->p_realtimer.it_interval)) { timevalclear(&p->p_realtimer.it_value); if (p->p_flag & P_WEXIT) wakeup(&p->p_itcallout); - PROC_UNLOCK(p); return; } for (;;) { @@ -806,7 +804,6 @@ realitexpire(void *arg) timevalsub(&ntv, &ctv); callout_reset(&p->p_itcallout, tvtohz(&ntv) - 1, realitexpire, p); - PROC_UNLOCK(p); return; } } diff --git a/sys/kern/kern_timeout.c b/sys/kern/kern_timeout.c index e3e1e9d..80933fa 100644 --- a/sys/kern/kern_timeout.c +++ b/sys/kern/kern_timeout.c @@ -84,7 +84,7 @@ SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0, * TODO: * allocate more timeout table slots when table overflows. */ -int callwheelsize, callwheelbits, callwheelmask; +int callwheelsize, callwheelmask; /* * The callout cpu migration entity represents informations necessary for @@ -218,12 +218,10 @@ kern_timeout_callwheel_alloc(caddr_t v) timeout_cpu = PCPU_GET(cpuid); cc = CC_CPU(timeout_cpu); /* - * Calculate callout wheel size + * Calculate callout wheel size, should be next power of two higher + * than 'ncallout'. 
*/ - for (callwheelsize = 1, callwheelbits = 0; - callwheelsize < ncallout; - callwheelsize <<= 1, ++callwheelbits) - ; + callwheelsize = 1 << fls(ncallout); callwheelmask = callwheelsize - 1; cc->cc_callout = (struct callout *)v; @@ -441,15 +439,13 @@ static void callout_cc_del(struct callout *c, struct callout_cpu *cc) { - if (cc->cc_next == c) - cc->cc_next = TAILQ_NEXT(c, c_links.tqe); - if (c->c_flags & CALLOUT_LOCAL_ALLOC) { - c->c_func = NULL; - SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle); - } + if ((c->c_flags & CALLOUT_LOCAL_ALLOC) == 0) + return; + c->c_func = NULL; + SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle); } -static struct callout * +static void softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls, int *lockcalls, int *gcalls) { @@ -471,7 +467,9 @@ softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls, static timeout_t *lastfunc; #endif - cc->cc_next = TAILQ_NEXT(c, c_links.tqe); + KASSERT((c->c_flags & (CALLOUT_PENDING | CALLOUT_ACTIVE)) == + (CALLOUT_PENDING | CALLOUT_ACTIVE), + ("softclock_call_cc: pend|act %p %x", c, c->c_flags)); class = (c->c_lock != NULL) ? LOCK_CLASS(c->c_lock) : NULL; sharedlock = (c->c_flags & CALLOUT_SHAREDLOCK) ? 0 : 1; c_lock = c->c_lock; @@ -539,20 +537,7 @@ softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls, class->lc_unlock(c_lock); skip: CC_LOCK(cc); - /* - * If the current callout is locally allocated (from - * timeout(9)) then put it on the freelist. - * - * Note: we need to check the cached copy of c_flags because - * if it was not local, then it's not safe to deref the - * callout pointer. 
- */ - if (c_flags & CALLOUT_LOCAL_ALLOC) { - KASSERT(c->c_flags == CALLOUT_LOCAL_ALLOC, - ("corrupted callout")); - c->c_func = NULL; - SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle); - } + KASSERT(cc->cc_curr == c, ("mishandled cc_curr")); cc->cc_curr = NULL; if (cc->cc_waiting) { /* @@ -561,13 +546,22 @@ skip: * If the callout was scheduled for * migration just cancel it. */ - if (cc_cme_migrating(cc)) + if (cc_cme_migrating(cc)) { cc_cme_cleanup(cc); + + /* + * It should be assert here that the callout is not + * destroyed but that is not easy. + */ + c->c_flags &= ~CALLOUT_DFRMIGRATION; + } cc->cc_waiting = 0; CC_UNLOCK(cc); wakeup(&cc->cc_waiting); CC_LOCK(cc); } else if (cc_cme_migrating(cc)) { + KASSERT((c_flags & CALLOUT_LOCAL_ALLOC) == 0, + ("Migrating legacy callout %p", c)); #ifdef SMP /* * If the callout was scheduled for @@ -580,23 +574,20 @@ skip: cc_cme_cleanup(cc); /* - * Handle deferred callout stops + * It should be assert here that the callout is not destroyed + * but that is not easy. + * + * As first thing, handle deferred callout stops. */ if ((c->c_flags & CALLOUT_DFRMIGRATION) == 0) { CTR3(KTR_CALLOUT, "deferred cancelled %p func %p arg %p", c, new_func, new_arg); callout_cc_del(c, cc); - goto nextc; + return; } - c->c_flags &= ~CALLOUT_DFRMIGRATION; - /* - * It should be assert here that the - * callout is not destroyed but that - * is not easy. - */ new_cc = callout_cpu_switch(c, cc, new_cpu); callout_cc_add(c, new_cc, new_ticks, new_func, new_arg, new_cpu); @@ -606,10 +597,19 @@ skip: panic("migration should not happen"); #endif } -#ifdef SMP -nextc: -#endif - return (cc->cc_next); + /* + * If the current callout is locally allocated (from + * timeout(9)) then put it on the freelist. + * + * Note: we need to check the cached copy of c_flags because + * if it was not local, then it's not safe to deref the + * callout pointer. 
+ */ + KASSERT((c_flags & CALLOUT_LOCAL_ALLOC) == 0 || + c->c_flags == CALLOUT_LOCAL_ALLOC, + ("corrupted callout")); + if (c_flags & CALLOUT_LOCAL_ALLOC) + callout_cc_del(c, cc); } /* @@ -676,10 +676,12 @@ softclock(void *arg) steps = 0; } } else { + cc->cc_next = TAILQ_NEXT(c, c_links.tqe); TAILQ_REMOVE(bucket, c, c_links.tqe); - c = softclock_call_cc(c, cc, &mpcalls, + softclock_call_cc(c, cc, &mpcalls, &lockcalls, &gcalls); steps = 0; + c = cc->cc_next; } } } @@ -1024,6 +1026,8 @@ again: CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); + if (cc->cc_next == c) + cc->cc_next = TAILQ_NEXT(c, c_links.tqe); TAILQ_REMOVE(&cc->cc_callwheel[c->c_time & callwheelmask], c, c_links.tqe); callout_cc_del(c, cc); diff --git a/sys/kern/subr_busdma_bufalloc.c b/sys/kern/subr_busdma_bufalloc.c new file mode 100644 index 0000000..9406d95 --- /dev/null +++ b/sys/kern/subr_busdma_bufalloc.c @@ -0,0 +1,174 @@ +/*- + * Copyright (c) 2012 Ian Lepore + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +/* + * Buffer allocation support routines for bus_dmamem_alloc implementations. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/busdma_bufalloc.h> +#include <sys/malloc.h> + +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/vm_kern.h> +#include <vm/uma.h> + +/* + * We manage buffer zones up to a page in size. Buffers larger than a page can + * be managed by one of the kernel's page-oriented memory allocation routines as + * efficiently as what we can do here. Also, a page is the largest size for + * which we can g'tee contiguity when using uma, and contiguity is one of the + * requirements we have to fulfill. + */ +#define MIN_ZONE_BUFSIZE 32 +#define MAX_ZONE_BUFSIZE PAGE_SIZE + +/* + * The static array of 12 bufzones is big enough to handle all the zones for the + * smallest supported allocation size of 32 through the largest supported page + * size of 64K. If you up the biggest page size number, up the array size too. + * Basically the size of the array needs to be log2(maxsize)-log2(minsize)+1, + * but I don't know of an easy way to express that as a compile-time constant. 
+ */ +#if PAGE_SIZE > 65536 +#error Unsupported page size +#endif + +struct busdma_bufalloc { + bus_size_t min_size; + size_t num_zones; + struct busdma_bufzone buf_zones[12]; +}; + +busdma_bufalloc_t +busdma_bufalloc_create(const char *name, bus_size_t minimum_alignment, + uma_alloc alloc_func, uma_free free_func, u_int32_t zcreate_flags) +{ + struct busdma_bufalloc *ba; + struct busdma_bufzone *bz; + int i; + bus_size_t cursize; + + ba = malloc(sizeof(struct busdma_bufalloc), M_DEVBUF, + M_ZERO | M_WAITOK); + + ba->min_size = MAX(MIN_ZONE_BUFSIZE, minimum_alignment); + + /* + * Each uma zone is created with an alignment of size-1, meaning that + * the alignment is equal to the size (I.E., 64 byte buffers are aligned + * to 64 byte boundaries, etc). This allows for a fast efficient test + * when deciding whether a pool buffer meets the constraints of a given + * tag used for allocation: the buffer is usable if tag->alignment <= + * bufzone->size. + */ + for (i = 0, bz = ba->buf_zones, cursize = ba->min_size; + i < nitems(ba->buf_zones) && cursize <= MAX_ZONE_BUFSIZE; + ++i, ++bz, cursize <<= 1) { + snprintf(bz->name, sizeof(bz->name), "dma %.10s %lu", + name, cursize); + bz->size = cursize; + bz->umazone = uma_zcreate(bz->name, bz->size, + NULL, NULL, NULL, NULL, bz->size - 1, zcreate_flags); + if (bz->umazone == NULL) { + busdma_bufalloc_destroy(ba); + return (NULL); + } + if (alloc_func != NULL) + uma_zone_set_allocf(bz->umazone, alloc_func); + if (free_func != NULL) + uma_zone_set_freef(bz->umazone, free_func); + ++ba->num_zones; + } + + return (ba); +} + +void +busdma_bufalloc_destroy(busdma_bufalloc_t ba) +{ + struct busdma_bufzone *bz; + int i; + + if (ba == NULL) + return; + + for (i = 0, bz = ba->buf_zones; i < ba->num_zones; ++i, ++bz) { + uma_zdestroy(bz->umazone); + } + + free(ba, M_DEVBUF); +} + +struct busdma_bufzone * +busdma_bufalloc_findzone(busdma_bufalloc_t ba, bus_size_t size) +{ + struct busdma_bufzone *bz; + int i; + + if (size > 
MAX_ZONE_BUFSIZE) + return (NULL); + + for (i = 0, bz = ba->buf_zones; i < ba->num_zones; ++i, ++bz) { + if (bz->size >= size) + return (bz); + } + + panic("Didn't find a buffer zone of the right size"); +} + +void * +busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, int size, u_int8_t *pflag, + int wait) +{ +#ifdef VM_MEMATTR_UNCACHEABLE + + /* Inform UMA that this allocator uses kernel_map/object. */ + *pflag = UMA_SLAB_KERNEL; + + return ((void *)kmem_alloc_attr(kernel_map, size, wait, 0, + BUS_SPACE_MAXADDR, VM_MEMATTR_UNCACHEABLE)); + +#else + + panic("VM_MEMATTR_UNCACHEABLE unavailable"); + +#endif /* VM_MEMATTR_UNCACHEABLE */ +} + +void +busdma_bufalloc_free_uncacheable(void *item, int size, u_int8_t pflag) +{ + + kmem_free(kernel_map, (vm_offset_t)item, size); +} + diff --git a/sys/kern/subr_mchain.c b/sys/kern/subr_mchain.c index cd2a5f3..e9d7d22 100644 --- a/sys/kern/subr_mchain.c +++ b/sys/kern/subr_mchain.c @@ -59,7 +59,7 @@ mb_init(struct mbchain *mbp) { struct mbuf *m; - m = m_gethdr(M_WAIT, MT_DATA); + m = m_gethdr(M_WAITOK, MT_DATA); m->m_len = 0; mb_initm(mbp, m); return (0); @@ -114,7 +114,7 @@ mb_reserve(struct mbchain *mbp, int size) panic("mb_reserve: size = %d\n", size); m = mbp->mb_cur; if (mbp->mb_mleft < size) { - mn = m_get(M_WAIT, MT_DATA); + mn = m_get(M_WAITOK, MT_DATA); mbp->mb_cur = m->m_next = mn; m = mn; m->m_len = 0; @@ -205,7 +205,7 @@ mb_put_mem(struct mbchain *mbp, c_caddr_t source, int size, int type) while (size > 0) { if (mleft == 0) { if (m->m_next == NULL) - m = m_getm(m, size, M_WAIT, MT_DATA); + m = m_getm(m, size, M_WAITOK, MT_DATA); else m = m->m_next; mleft = M_TRAILINGSPACE(m); @@ -307,7 +307,7 @@ md_init(struct mdchain *mdp) { struct mbuf *m; - m = m_gethdr(M_WAIT, MT_DATA); + m = m_gethdr(M_WAITOK, MT_DATA); m->m_len = 0; md_initm(mdp, m); return (0); @@ -514,7 +514,7 @@ md_get_mbuf(struct mdchain *mdp, int size, struct mbuf **ret) { struct mbuf *m = mdp->md_cur, *rm; - rm = m_copym(m, mdp->md_pos - mtod(m, 
u_char*), size, M_WAIT); + rm = m_copym(m, mdp->md_pos - mtod(m, u_char*), size, M_WAITOK); md_get_mem(mdp, NULL, size, MB_MZERO); *ret = rm; return (0); diff --git a/sys/kern/subr_param.c b/sys/kern/subr_param.c index 2ca0051..510033f 100644 --- a/sys/kern/subr_param.c +++ b/sys/kern/subr_param.c @@ -333,8 +333,8 @@ init_param2(long physpages) * available kernel memory (physical or kmem). * At most it can be 3/4 of available kernel memory. */ - realmem = qmin(physpages * PAGE_SIZE, - VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS); + realmem = qmin((quad_t)physpages * PAGE_SIZE, + VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS); maxmbufmem = realmem / 2; TUNABLE_QUAD_FETCH("kern.maxmbufmem", &maxmbufmem); if (maxmbufmem > (realmem / 4) * 3) diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c index 3b27dce..3614798 100644 --- a/sys/kern/subr_smp.c +++ b/sys/kern/subr_smp.c @@ -766,8 +766,9 @@ quiesce_cpus(cpuset_t map, const char *wmesg, int prio) thread_unlock(curthread); while (gen[cpu] == pcpu->pc_idlethread->td_generation) { error = tsleep(quiesce_cpus, prio, wmesg, 1); - if (error) + if (error != EWOULDBLOCK) goto out; + error = 0; } } out: diff --git a/sys/kern/subr_syscall.c b/sys/kern/subr_syscall.c index fe5cd0e..3d6dc5a 100644 --- a/sys/kern/subr_syscall.c +++ b/sys/kern/subr_syscall.c @@ -77,13 +77,12 @@ syscallenter(struct thread *td, struct syscall_args *sa) if (KTRPOINT(td, KTR_SYSCALL)) ktrsyscall(sa->code, sa->narg, sa->args); #endif - - CTR6(KTR_SYSC, -"syscall: td=%p pid %d %s (%#lx, %#lx, %#lx)", - td, td->td_proc->p_pid, syscallname(p, sa->code), - sa->args[0], sa->args[1], sa->args[2]); + KTR_START4(KTR_SYSC, "syscall", syscallname(p, sa->code), + (uintptr_t)td, "pid:%d", td->td_proc->p_pid, "arg0:%p", sa->args[0], + "arg1:%p", sa->args[1], "arg2:%p", sa->args[2]); if (error == 0) { + STOPEVENT(p, S_SCE, sa->narg); if (p->p_flag & P_TRACED && p->p_stops & S_PT_SCE) { PROC_LOCK(p); @@ -150,10 +149,12 @@ syscallenter(struct thread *td, struct 
syscall_args *sa) sa->callp, NULL, (error) ? -1 : td->td_retval[0]); #endif syscall_thread_exit(td, sa->callp); - CTR4(KTR_SYSC, "syscall: p=%p error=%d return %#lx %#lx", - p, error, td->td_retval[0], td->td_retval[1]); } retval: + KTR_STOP4(KTR_SYSC, "syscall", syscallname(p, sa->code), + (uintptr_t)td, "pid:%d", td->td_proc->p_pid, "error:%d", error, + "retval0:%#lx", td->td_retval[0], "retval1:%#lx", + td->td_retval[1]); if (traced) { PROC_LOCK(p); td->td_dbgflags &= ~TDB_SCE; @@ -176,9 +177,6 @@ syscallret(struct thread *td, int error, struct syscall_args *sa __unused) */ userret(td, td->td_frame); - CTR4(KTR_SYSC, "syscall %s exit thread %p pid %d proc %s", - syscallname(p, sa->code), td, td->td_proc->p_pid, td->td_name); - #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) { ktrsysret(sa->code, (td->td_pflags & TDP_NERRNO) == 0 ? diff --git a/sys/kern/subr_uio.c b/sys/kern/subr_uio.c index d518cc3..7b593487 100644 --- a/sys/kern/subr_uio.c +++ b/sys/kern/subr_uio.c @@ -389,7 +389,6 @@ again: case UIO_SYSSPACE: iov_base = iov->iov_base; *iov_base = c; - iov->iov_base = iov_base; break; case UIO_NOCOPY: diff --git a/sys/kern/subr_witness.c b/sys/kern/subr_witness.c index c2aed86..3cbf51a 100644 --- a/sys/kern/subr_witness.c +++ b/sys/kern/subr_witness.c @@ -822,16 +822,16 @@ witness_init(struct lock_object *lock, const char *type) class = LOCK_CLASS(lock); if ((lock->lo_flags & LO_RECURSABLE) != 0 && (class->lc_flags & LC_RECURSABLE) == 0) - panic("%s: lock (%s) %s can not be recursable", __func__, - class->lc_name, lock->lo_name); + kassert_panic("%s: lock (%s) %s can not be recursable", + __func__, class->lc_name, lock->lo_name); if ((lock->lo_flags & LO_SLEEPABLE) != 0 && (class->lc_flags & LC_SLEEPABLE) == 0) - panic("%s: lock (%s) %s can not be sleepable", __func__, - class->lc_name, lock->lo_name); + kassert_panic("%s: lock (%s) %s can not be sleepable", + __func__, class->lc_name, lock->lo_name); if ((lock->lo_flags & LO_UPGRADABLE) != 0 && (class->lc_flags & 
LC_UPGRADABLE) == 0) - panic("%s: lock (%s) %s can not be upgradable", __func__, - class->lc_name, lock->lo_name); + kassert_panic("%s: lock (%s) %s can not be upgradable", + __func__, class->lc_name, lock->lo_name); /* * If we shouldn't watch this lock, then just clear lo_witness. @@ -847,7 +847,8 @@ witness_init(struct lock_object *lock, const char *type) pending_locks[pending_cnt].wh_lock = lock; pending_locks[pending_cnt++].wh_type = type; if (pending_cnt > WITNESS_PENDLIST) - panic("%s: pending locks list is too small, bump it\n", + panic("%s: pending locks list is too small, " + "increase WITNESS_PENDLIST\n", __func__); } else lock->lo_witness = enroll(type, class); @@ -1073,7 +1074,8 @@ witness_checkorder(struct lock_object *lock, int flags, const char *file, * all spin locks. */ if (td->td_critnest != 0 && !kdb_active) - panic("blockable sleep lock (%s) %s @ %s:%d", + kassert_panic("acquiring blockable sleep lock with " + "spinlock or critical section held (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); @@ -1117,7 +1119,7 @@ witness_checkorder(struct lock_object *lock, int flags, const char *file, fixup_filename(file), line); printf("while exclusively locked from %s:%d\n", fixup_filename(lock1->li_file), lock1->li_line); - panic("share->excl"); + kassert_panic("share->excl"); } if ((lock1->li_flags & LI_EXCLUSIVE) == 0 && (flags & LOP_EXCLUSIVE) != 0) { @@ -1126,7 +1128,7 @@ witness_checkorder(struct lock_object *lock, int flags, const char *file, fixup_filename(file), line); printf("while share locked from %s:%d\n", fixup_filename(lock1->li_file), lock1->li_line); - panic("excl->share"); + kassert_panic("excl->share"); } return; } @@ -1433,26 +1435,32 @@ witness_upgrade(struct lock_object *lock, int flags, const char *file, int line) class = LOCK_CLASS(lock); if (witness_watch) { if ((lock->lo_flags & LO_UPGRADABLE) == 0) - panic("upgrade of non-upgradable lock (%s) %s @ %s:%d", + kassert_panic( + "upgrade of non-upgradable 
lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((class->lc_flags & LC_SLEEPLOCK) == 0) - panic("upgrade of non-sleep lock (%s) %s @ %s:%d", + kassert_panic( + "upgrade of non-sleep lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); } instance = find_instance(curthread->td_sleeplocks, lock); - if (instance == NULL) - panic("upgrade of unlocked lock (%s) %s @ %s:%d", + if (instance == NULL) { + kassert_panic("upgrade of unlocked lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); + return; + } if (witness_watch) { if ((instance->li_flags & LI_EXCLUSIVE) != 0) - panic("upgrade of exclusive lock (%s) %s @ %s:%d", + kassert_panic( + "upgrade of exclusive lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((instance->li_flags & LI_RECURSEMASK) != 0) - panic("upgrade of recursed lock (%s) %s r=%d @ %s:%d", + kassert_panic( + "upgrade of recursed lock (%s) %s r=%d @ %s:%d", class->lc_name, lock->lo_name, instance->li_flags & LI_RECURSEMASK, fixup_filename(file), line); @@ -1473,26 +1481,32 @@ witness_downgrade(struct lock_object *lock, int flags, const char *file, class = LOCK_CLASS(lock); if (witness_watch) { if ((lock->lo_flags & LO_UPGRADABLE) == 0) - panic("downgrade of non-upgradable lock (%s) %s @ %s:%d", + kassert_panic( + "downgrade of non-upgradable lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((class->lc_flags & LC_SLEEPLOCK) == 0) - panic("downgrade of non-sleep lock (%s) %s @ %s:%d", + kassert_panic( + "downgrade of non-sleep lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); } instance = find_instance(curthread->td_sleeplocks, lock); - if (instance == NULL) - panic("downgrade of unlocked lock (%s) %s @ %s:%d", + if (instance == NULL) { + kassert_panic("downgrade of unlocked lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), 
line); + return; + } if (witness_watch) { if ((instance->li_flags & LI_EXCLUSIVE) == 0) - panic("downgrade of shared lock (%s) %s @ %s:%d", + kassert_panic( + "downgrade of shared lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((instance->li_flags & LI_RECURSEMASK) != 0) - panic("downgrade of recursed lock (%s) %s r=%d @ %s:%d", + kassert_panic( + "downgrade of recursed lock (%s) %s r=%d @ %s:%d", class->lc_name, lock->lo_name, instance->li_flags & LI_RECURSEMASK, fixup_filename(file), line); @@ -1534,11 +1548,13 @@ witness_unlock(struct lock_object *lock, int flags, const char *file, int line) * We have to make sure we flush these queues, so just search for * eventual register locks and remove them. */ - if (witness_watch > 0) - panic("lock (%s) %s not locked @ %s:%d", class->lc_name, + if (witness_watch > 0) { + kassert_panic("lock (%s) %s not locked @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); - else return; + } else { + return; + } found: /* First, check for shared/exclusive mismatches. */ @@ -1548,7 +1564,7 @@ found: lock->lo_name, fixup_filename(file), line); printf("while exclusively locked from %s:%d\n", fixup_filename(instance->li_file), instance->li_line); - panic("excl->ushare"); + kassert_panic("excl->ushare"); } if ((instance->li_flags & LI_EXCLUSIVE) == 0 && witness_watch > 0 && (flags & LOP_EXCLUSIVE) != 0) { @@ -1557,7 +1573,7 @@ found: printf("while share locked from %s:%d\n", fixup_filename(instance->li_file), instance->li_line); - panic("share->uexcl"); + kassert_panic("share->uexcl"); } /* If we are recursed, unrecurse. 
*/ if ((instance->li_flags & LI_RECURSEMASK) > 0) { @@ -1571,7 +1587,7 @@ found: if ((instance->li_flags & LI_NORELEASE) != 0 && witness_watch > 0) { printf("forbidden unlock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); - panic("lock marked norelease"); + kassert_panic("lock marked norelease"); } /* Otherwise, remove this item from the list. */ @@ -1626,7 +1642,8 @@ witness_thread_exit(struct thread *td) witness_list_lock(&lle->ll_children[i], printf); } - panic("Thread %p cannot exit while holding sleeplocks\n", td); + kassert_panic( + "Thread %p cannot exit while holding sleeplocks\n", td); } witness_lock_list_free(lle); } @@ -1707,7 +1724,7 @@ witness_warn(int flags, struct lock_object *lock, const char *fmt, ...) } else sched_unpin(); if (flags & WARN_PANIC && n) - panic("%s", __func__); + kassert_panic("%s", __func__); else witness_debugger(n); return (n); @@ -1750,11 +1767,13 @@ enroll(const char *description, struct lock_class *lock_class) return (NULL); else typelist = &w_spin; - } else if ((lock_class->lc_flags & LC_SLEEPLOCK)) + } else if ((lock_class->lc_flags & LC_SLEEPLOCK)) { typelist = &w_sleep; - else - panic("lock class %s is not sleep or spin", + } else { + kassert_panic("lock class %s is not sleep or spin", lock_class->lc_name); + return (NULL); + } mtx_lock_spin(&w_mtx); w = witness_hash_get(description); @@ -1784,7 +1803,7 @@ found: w->w_refcount++; mtx_unlock_spin(&w_mtx); if (lock_class != w->w_class) - panic( + kassert_panic( "lock (%s) %s does not match earlier (%s) lock", description, lock_class->lc_name, w->w_class->lc_name); @@ -1910,18 +1929,26 @@ adopt(struct witness *parent, struct witness *child) static void itismychild(struct witness *parent, struct witness *child) { + int unlocked; MPASS(child != NULL && parent != NULL); if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); if (!witness_lock_type_equal(parent, child)) { - if (witness_cold == 0) + if (witness_cold == 0) { + unlocked = 1; 
mtx_unlock_spin(&w_mtx); - panic("%s: parent \"%s\" (%s) and child \"%s\" (%s) are not " + } else { + unlocked = 0; + } + kassert_panic( + "%s: parent \"%s\" (%s) and child \"%s\" (%s) are not " "the same lock type", __func__, parent->w_name, parent->w_class->lc_name, child->w_name, child->w_class->lc_name); + if (unlocked) + mtx_lock_spin(&w_mtx); } adopt(parent, child); } @@ -2191,9 +2218,11 @@ witness_save(struct lock_object *lock, const char **filep, int *linep) lock_list = PCPU_GET(spinlocks); } instance = find_instance(lock_list, lock); - if (instance == NULL) - panic("%s: lock (%s) %s not locked", __func__, + if (instance == NULL) { + kassert_panic("%s: lock (%s) %s not locked", __func__, class->lc_name, lock->lo_name); + return; + } *filep = instance->li_file; *linep = instance->li_line; } @@ -2225,10 +2254,12 @@ witness_restore(struct lock_object *lock, const char *file, int line) } instance = find_instance(lock_list, lock); if (instance == NULL) - panic("%s: lock (%s) %s not locked", __func__, + kassert_panic("%s: lock (%s) %s not locked", __func__, class->lc_name, lock->lo_name); lock->lo_witness->w_file = file; lock->lo_witness->w_line = line; + if (instance == NULL) + return; instance->li_file = file; instance->li_line = line; } @@ -2249,13 +2280,14 @@ witness_assert(const struct lock_object *lock, int flags, const char *file, else if ((class->lc_flags & LC_SPINLOCK) != 0) instance = find_instance(PCPU_GET(spinlocks), lock); else { - panic("Lock (%s) %s is not sleep or spin!", + kassert_panic("Lock (%s) %s is not sleep or spin!", class->lc_name, lock->lo_name); + return; } switch (flags) { case LA_UNLOCKED: if (instance != NULL) - panic("Lock (%s) %s locked @ %s:%d.", + kassert_panic("Lock (%s) %s locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); break; @@ -2269,34 +2301,36 @@ witness_assert(const struct lock_object *lock, int flags, const char *file, case LA_XLOCKED | LA_RECURSED: case LA_XLOCKED | LA_NOTRECURSED: if 
(instance == NULL) { - panic("Lock (%s) %s not locked @ %s:%d.", + kassert_panic("Lock (%s) %s not locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); break; } if ((flags & LA_XLOCKED) != 0 && (instance->li_flags & LI_EXCLUSIVE) == 0) - panic("Lock (%s) %s not exclusively locked @ %s:%d.", + kassert_panic( + "Lock (%s) %s not exclusively locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((flags & LA_SLOCKED) != 0 && (instance->li_flags & LI_EXCLUSIVE) != 0) - panic("Lock (%s) %s exclusively locked @ %s:%d.", + kassert_panic( + "Lock (%s) %s exclusively locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((flags & LA_RECURSED) != 0 && (instance->li_flags & LI_RECURSEMASK) == 0) - panic("Lock (%s) %s not recursed @ %s:%d.", + kassert_panic("Lock (%s) %s not recursed @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((flags & LA_NOTRECURSED) != 0 && (instance->li_flags & LI_RECURSEMASK) != 0) - panic("Lock (%s) %s recursed @ %s:%d.", + kassert_panic("Lock (%s) %s recursed @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); break; default: - panic("Invalid lock assertion at %s:%d.", + kassert_panic("Invalid lock assertion at %s:%d.", fixup_filename(file), line); } @@ -2321,9 +2355,11 @@ witness_setflag(struct lock_object *lock, int flag, int set) lock_list = PCPU_GET(spinlocks); } instance = find_instance(lock_list, lock); - if (instance == NULL) - panic("%s: lock (%s) %s not locked", __func__, + if (instance == NULL) { + kassert_panic("%s: lock (%s) %s not locked", __func__, class->lc_name, lock->lo_name); + return; + } if (set) instance->li_flags |= flag; diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c index f47cb03..b97ff7f 100644 --- a/sys/kern/sys_generic.c +++ b/sys/kern/sys_generic.c @@ -536,7 +536,8 @@ dofilewrite(td, fd, fp, auio, offset, flags) ktruio = cloneuio(auio); #endif cnt = auio->uio_resid; - if (fp->f_type == 
DTYPE_VNODE) + if (fp->f_type == DTYPE_VNODE && + (fp->f_vnread_flags & FDEVFS_VNODE) == 0) bwillwrite(); if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) { if (auio->uio_resid != cnt && (error == ERESTART || diff --git a/sys/kern/uipc_domain.c b/sys/kern/uipc_domain.c index c133fcb..c146232 100644 --- a/sys/kern/uipc_domain.c +++ b/sys/kern/uipc_domain.c @@ -46,8 +46,6 @@ __FBSDID("$FreeBSD$"); #include <net/vnet.h> -#include <vm/uma.h> - /* * System initialization * @@ -270,21 +268,31 @@ domainfinalize(void *dummy) callout_reset(&pfslow_callout, 1, pfslowtimo, NULL); } +struct domain * +pffinddomain(int family) +{ + struct domain *dp; + + for (dp = domains; dp != NULL; dp = dp->dom_next) + if (dp->dom_family == family) + return (dp); + return (NULL); +} + struct protosw * pffindtype(int family, int type) { struct domain *dp; struct protosw *pr; - for (dp = domains; dp; dp = dp->dom_next) - if (dp->dom_family == family) - goto found; - return (0); -found: + dp = pffinddomain(family); + if (dp == NULL) + return (NULL); + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) if (pr->pr_type && pr->pr_type == type) return (pr); - return (0); + return (NULL); } struct protosw * @@ -292,21 +300,22 @@ pffindproto(int family, int protocol, int type) { struct domain *dp; struct protosw *pr; - struct protosw *maybe = 0; + struct protosw *maybe; + maybe = NULL; if (family == 0) - return (0); - for (dp = domains; dp; dp = dp->dom_next) - if (dp->dom_family == family) - goto found; - return (0); -found: + return (NULL); + + dp = pffinddomain(family); + if (dp == NULL) + return (NULL); + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) { if ((pr->pr_protocol == protocol) && (pr->pr_type == type)) return (pr); if (type == SOCK_RAW && pr->pr_type == SOCK_RAW && - pr->pr_protocol == 0 && maybe == (struct protosw *)0) + pr->pr_protocol == 0 && maybe == NULL) maybe = pr; } return (maybe); @@ -334,12 +343,10 @@ pf_proto_register(int family, struct 
protosw *npr) return (ENXIO); /* Try to find the specified domain based on the family. */ - for (dp = domains; dp; dp = dp->dom_next) - if (dp->dom_family == family) - goto found; - return (EPFNOSUPPORT); + dp = pffinddomain(family); + if (dp == NULL) + return (EPFNOSUPPORT); -found: /* Initialize backpointer to struct domain. */ npr->pr_domain = dp; fpr = NULL; @@ -405,12 +412,10 @@ pf_proto_unregister(int family, int protocol, int type) return (EPROTOTYPE); /* Try to find the specified domain based on the family type. */ - for (dp = domains; dp; dp = dp->dom_next) - if (dp->dom_family == family) - goto found; - return (EPFNOSUPPORT); + dp = pffinddomain(family); + if (dp == NULL) + return (EPFNOSUPPORT); -found: dpr = NULL; /* Lock out everyone else while we are manipulating the protosw. */ diff --git a/sys/kern/uipc_mbuf.c b/sys/kern/uipc_mbuf.c index 4130361..ab6163d 100644 --- a/sys/kern/uipc_mbuf.c +++ b/sys/kern/uipc_mbuf.c @@ -520,7 +520,7 @@ m_prepend(struct mbuf *m, int len, int how) /* * Make a copy of an mbuf chain starting "off0" bytes from the beginning, * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf. - * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller. + * The wait parameter is a choice of M_WAITOK/M_NOWAIT from caller. * Note that the copy is read-only, because clusters are not copied, * only their reference counts are incremented. 
*/ @@ -1028,7 +1028,7 @@ m_pullup(struct mbuf *n, int len) } else { if (len > MHLEN) goto bad; - MGET(m, M_DONTWAIT, n->m_type); + MGET(m, M_NOWAIT, n->m_type); if (m == NULL) goto bad; m->m_len = 0; @@ -1076,7 +1076,7 @@ m_copyup(struct mbuf *n, int len, int dstoff) if (len > (MHLEN - dstoff)) goto bad; - MGET(m, M_DONTWAIT, n->m_type); + MGET(m, M_NOWAIT, n->m_type); if (m == NULL) goto bad; m->m_len = 0; @@ -1195,10 +1195,10 @@ m_devget(char *buf, int totlen, int off, struct ifnet *ifp, while (totlen > 0) { if (top == NULL) { /* First one, must be PKTHDR */ if (totlen + off >= MINCLSIZE) { - m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR); + m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); len = MCLBYTES; } else { - m = m_gethdr(M_DONTWAIT, MT_DATA); + m = m_gethdr(M_NOWAIT, MT_DATA); len = MHLEN; /* Place initial small packet/header at end of mbuf */ @@ -1213,10 +1213,10 @@ m_devget(char *buf, int totlen, int off, struct ifnet *ifp, m->m_pkthdr.len = totlen; } else { if (totlen + off >= MINCLSIZE) { - m = m_getcl(M_DONTWAIT, MT_DATA, 0); + m = m_getcl(M_NOWAIT, MT_DATA, 0); len = MCLBYTES; } else { - m = m_get(M_DONTWAIT, MT_DATA); + m = m_get(M_NOWAIT, MT_DATA); len = MLEN; } if (m == NULL) { @@ -1260,7 +1260,7 @@ m_copyback(struct mbuf *m0, int off, int len, c_caddr_t cp) off -= mlen; totlen += mlen; if (m->m_next == NULL) { - n = m_get(M_DONTWAIT, m->m_type); + n = m_get(M_NOWAIT, m->m_type); if (n == NULL) goto out; bzero(mtod(n, caddr_t), MLEN); @@ -1284,7 +1284,7 @@ m_copyback(struct mbuf *m0, int off, int len, c_caddr_t cp) if (len == 0) break; if (m->m_next == NULL) { - n = m_get(M_DONTWAIT, m->m_type); + n = m_get(M_NOWAIT, m->m_type); if (n == NULL) break; n->m_len = min(MLEN, len); @@ -1328,7 +1328,7 @@ m_append(struct mbuf *m0, int len, c_caddr_t cp) * Allocate a new mbuf; could check space * and allocate a cluster instead. 
*/ - n = m_get(M_DONTWAIT, m->m_type); + n = m_get(M_NOWAIT, m->m_type); if (n == NULL) break; n->m_len = min(MLEN, remainder); diff --git a/sys/kern/uipc_mbuf2.c b/sys/kern/uipc_mbuf2.c index 96be658..e32e2a1 100644 --- a/sys/kern/uipc_mbuf2.c +++ b/sys/kern/uipc_mbuf2.c @@ -171,7 +171,7 @@ m_pulldown(struct mbuf *m, int off, int len, int *offp) * chop the current mbuf into two pieces, set off to 0. */ if (len <= n->m_len - off) { - o = m_dup1(n, off, n->m_len - off, M_DONTWAIT); + o = m_dup1(n, off, n->m_len - off, M_NOWAIT); if (o == NULL) { m_freem(m); return NULL; /* ENOBUFS */ @@ -231,9 +231,9 @@ m_pulldown(struct mbuf *m, int off, int len, int *offp) * on both end. */ if (len > MLEN) - o = m_getcl(M_DONTWAIT, m->m_type, 0); + o = m_getcl(M_NOWAIT, m->m_type, 0); else - o = m_get(M_DONTWAIT, m->m_type); + o = m_get(M_NOWAIT, m->m_type); if (!o) { m_freem(m); return NULL; /* ENOBUFS */ diff --git a/sys/kern/uipc_sockbuf.c b/sys/kern/uipc_sockbuf.c index 272f939..6325840 100644 --- a/sys/kern/uipc_sockbuf.c +++ b/sys/kern/uipc_sockbuf.c @@ -188,7 +188,7 @@ sowakeup(struct socket *so, struct sockbuf *sb) } KNOTE_LOCKED(&sb->sb_sel.si_note, 0); if (sb->sb_upcall != NULL) { - ret = sb->sb_upcall(so, sb->sb_upcallarg, M_DONTWAIT); + ret = sb->sb_upcall(so, sb->sb_upcallarg, M_NOWAIT); if (ret == SU_ISCONNECTED) { KASSERT(sb == &so->so_rcv, ("SO_SND upcall returned SU_ISCONNECTED")); @@ -644,7 +644,7 @@ sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa, if (asa->sa_len > MLEN) return (0); #endif - MGET(m, M_DONTWAIT, MT_SONAME); + MGET(m, M_NOWAIT, MT_SONAME); if (m == 0) return (0); m->m_len = asa->sa_len; @@ -1002,9 +1002,9 @@ sbcreatecontrol(caddr_t p, int size, int type, int level) if (CMSG_SPACE((u_int)size) > MCLBYTES) return ((struct mbuf *) NULL); if (CMSG_SPACE((u_int)size) > MLEN) - m = m_getcl(M_DONTWAIT, MT_CONTROL, 0); + m = m_getcl(M_NOWAIT, MT_CONTROL, 0); else - m = m_get(M_DONTWAIT, MT_CONTROL); + m = m_get(M_NOWAIT, MT_CONTROL); if 
(m == NULL) return ((struct mbuf *) NULL); cp = mtod(m, struct cmsghdr *); diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index 4416eca..de8ae5a 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -92,7 +92,7 @@ * from a listen queue to a file descriptor, in order to prevent garbage * collection of the socket at an untimely moment. For a number of reasons, * these interfaces are not preferred, and should be avoided. - * + * * NOTE: With regard to VNETs the general rule is that callers do not set * curvnet. Exceptions to this rule include soabort(), sodisconnect(), * sofree() (and with that sorele(), sotryfree()), as well as sonewconn() @@ -259,25 +259,26 @@ SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC"); * Initialize the socket subsystem and set up the socket * memory allocator. */ -uma_zone_t socket_zone; +static uma_zone_t socket_zone; int maxsockets; static void socket_zone_change(void *tag) { - uma_zone_set_max(socket_zone, maxsockets); + maxsockets = uma_zone_set_max(socket_zone, maxsockets); } static void socket_init(void *tag) { - socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL, - NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); - uma_zone_set_max(socket_zone, maxsockets); - EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL, - EVENTHANDLER_PRI_FIRST); + socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + maxsockets = uma_zone_set_max(socket_zone, maxsockets); + uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached"); + EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL, + EVENTHANDLER_PRI_FIRST); } SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL); @@ -425,7 +426,16 @@ socreate(int dom, struct socket **aso, int type, int proto, else prp = pffindtype(dom, type); - if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL || + if (prp == NULL) { + /* No support for 
domain. */ + if (pffinddomain(dom) == NULL) + return (EAFNOSUPPORT); + /* No support for socket type. */ + if (proto == 0 && type != 0) + return (EPROTOTYPE); + return (EPROTONOSUPPORT); + } + if (prp->pr_usrreqs->pru_attach == NULL || prp->pr_usrreqs->pru_attach == pru_attach_notsupp) return (EPROTONOSUPPORT); @@ -710,8 +720,10 @@ sofree(struct socket *so) ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)", so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP)); if (so->so_options & SO_ACCEPTCONN) { - KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated")); - KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_incomp populated")); + KASSERT((TAILQ_EMPTY(&so->so_comp)), + ("sofree: so_comp populated")); + KASSERT((TAILQ_EMPTY(&so->so_incomp)), + ("sofree: so_incomp populated")); } SOCK_UNLOCK(so); ACCEPT_UNLOCK(); @@ -777,7 +789,8 @@ soclose(struct socket *so) goto drop; while (so->so_state & SS_ISCONNECTED) { error = tsleep(&so->so_timeo, - PSOCK | PCATCH, "soclos", so->so_linger * hz); + PSOCK | PCATCH, "soclos", + so->so_linger * hz); if (error) break; } @@ -947,7 +960,7 @@ struct so_zerocopy_stats so_zerocp_stats = {0,0,0}; /* * sosend_copyin() is only used if zero copy sockets are enabled. Otherwise * sosend_dgram() and sosend_generic() use m_uiotombuf(). - * + * * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or * all of the data referenced by the uio. If desired, it uses zero-copy. * *space will be updated to reflect data copied in. 
@@ -998,7 +1011,7 @@ sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space, } } else { if (top == NULL) { - m = m_gethdr(M_WAIT, MT_DATA); + m = m_gethdr(M_WAITOK, MT_DATA); m->m_pkthdr.len = 0; m->m_pkthdr.rcvif = NULL; @@ -1010,7 +1023,7 @@ sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space, if (atomic && m && len < MHLEN) MH_ALIGN(m, len); } else { - m = m_get(M_WAIT, MT_DATA); + m = m_get(M_WAITOK, MT_DATA); len = min(min(MLEN, resid), *space); } } @@ -1433,7 +1446,7 @@ soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); VNET_SO_ASSERT(so); - m = m_get(M_WAIT, MT_DATA); + m = m_get(M_WAITOK, MT_DATA); error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); if (error) goto bad; @@ -1449,8 +1462,7 @@ soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) disposable = 0; error = uiomoveco(mtod(m, void *), - min(uio->uio_resid, m->m_len), - uio, disposable); + min(uio->uio_resid, m->m_len), uio, disposable); } else #endif /* SOCKET_RECV_PFLIP */ error = uiomove(mtod(m, void *), @@ -1485,20 +1497,19 @@ sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) else sb->sb_mb = nextrecord; - /* - * Now update any dependent socket buffer fields to reflect the new - * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the + /* + * Now update any dependent socket buffer fields to reflect the new + * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the * addition of a second clause that takes care of the case where * sb_mb has been updated, but remains the last record. 
- */ - if (sb->sb_mb == NULL) { - sb->sb_mbtail = NULL; - sb->sb_lastrecord = NULL; - } else if (sb->sb_mb->m_nextpkt == NULL) - sb->sb_lastrecord = sb->sb_mb; + */ + if (sb->sb_mb == NULL) { + sb->sb_mbtail = NULL; + sb->sb_lastrecord = NULL; + } else if (sb->sb_mb->m_nextpkt == NULL) + sb->sb_lastrecord = sb->sb_mb; } - /* * Implement receive operations on a socket. We depend on the way that * records are added to the sockbuf by sbappend. In particular, each record @@ -1748,7 +1759,7 @@ dontblock: /* * If the type of mbuf has changed since the last mbuf * examined ('type'), end the receive operation. - */ + */ SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) { if (type != m->m_type) @@ -1787,8 +1798,7 @@ dontblock: disposable = 0; error = uiomoveco(mtod(m, char *) + moff, - (int)len, uio, - disposable); + (int)len, uio, disposable); } else #endif /* SOCKET_RECV_PFLIP */ error = uiomove(mtod(m, char *) + moff, (int)len, uio); @@ -1841,26 +1851,26 @@ dontblock: int copy_flag; if (flags & MSG_DONTWAIT) - copy_flag = M_DONTWAIT; + copy_flag = M_NOWAIT; else copy_flag = M_WAIT; - if (copy_flag == M_WAIT) + if (copy_flag == M_WAITOK) SOCKBUF_UNLOCK(&so->so_rcv); *mp = m_copym(m, 0, len, copy_flag); - if (copy_flag == M_WAIT) + if (copy_flag == M_WAITOK) SOCKBUF_LOCK(&so->so_rcv); - if (*mp == NULL) { - /* - * m_copym() couldn't + if (*mp == NULL) { + /* + * m_copym() couldn't * allocate an mbuf. Adjust * uio_resid back (it was * adjusted down by len * bytes, which we didn't end * up "copying" over). 
- */ - uio->uio_resid += len; - break; - } + */ + uio->uio_resid += len; + break; + } } m->m_data += len; m->m_len -= len; @@ -1893,7 +1903,8 @@ dontblock: while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && !sosendallatonce(so) && nextrecord == NULL) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); - if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE) + if (so->so_error || + so->so_rcv.sb_state & SBS_CANTRCVMORE) break; /* * Notify the protocol that some data has been @@ -2114,7 +2125,7 @@ deliver: KASSERT(sb->sb_mb != NULL, ("%s: len > 0 && sb->sb_mb empty", __func__)); - m = m_copym(sb->sb_mb, 0, len, M_DONTWAIT); + m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); if (m == NULL) len = 0; /* Don't flush data from sockbuf. */ else @@ -2382,9 +2393,8 @@ soshutdown(struct socket *so, int how) return (EINVAL); CURVNET_SET(so->so_vnet); - if (pr->pr_usrreqs->pru_flush != NULL) { - (*pr->pr_usrreqs->pru_flush)(so, how); - } + if (pr->pr_usrreqs->pru_flush != NULL) + (*pr->pr_usrreqs->pru_flush)(so, how); if (how != SHUT_WR) sorflush(so); if (how != SHUT_RD) { @@ -2551,7 +2561,7 @@ sosetopt(struct socket *so, struct sockopt *sopt) case SO_NO_DDP: case SO_NO_OFFLOAD: error = sooptcopyin(sopt, &optval, sizeof optval, - sizeof optval); + sizeof optval); if (error) goto bad; SOCK_LOCK(so); @@ -2564,7 +2574,7 @@ sosetopt(struct socket *so, struct sockopt *sopt) case SO_SETFIB: error = sooptcopyin(sopt, &optval, sizeof optval, - sizeof optval); + sizeof optval); if (error) goto bad; @@ -2582,7 +2592,7 @@ sosetopt(struct socket *so, struct sockopt *sopt) case SO_USER_COOKIE: error = sooptcopyin(sopt, &val32, sizeof val32, - sizeof val32); + sizeof val32); if (error) goto bad; so->so_user_cookie = val32; @@ -2593,7 +2603,7 @@ sosetopt(struct socket *so, struct sockopt *sopt) case SO_SNDLOWAT: case SO_RCVLOWAT: error = sooptcopyin(sopt, &optval, sizeof optval, - sizeof optval); + sizeof optval); if (error) goto bad; @@ -2901,11 +2911,11 @@ soopt_getm(struct sockopt *sopt, struct 
mbuf **mp) struct mbuf *m, *m_prev; int sopt_size = sopt->sopt_valsize; - MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA); + MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) return ENOBUFS; if (sopt_size > MLEN) { - MCLGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT); + MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); return ENOBUFS; @@ -2919,14 +2929,14 @@ soopt_getm(struct sockopt *sopt, struct mbuf **mp) m_prev = m; while (sopt_size) { - MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA); + MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) { m_freem(*mp); return ENOBUFS; } if (sopt_size > MLEN) { - MCLGET(m, sopt->sopt_td != NULL ? M_WAIT : - M_DONTWAIT); + MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK : + M_NOWAIT); if ((m->m_flags & M_EXT) == 0) { m_freem(m); m_freem(*mp); @@ -2955,7 +2965,7 @@ soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) int error; error = copyin(sopt->sopt_val, mtod(m, char *), - m->m_len); + m->m_len); if (error != 0) { m_freem(m0); return(error); @@ -2984,17 +2994,17 @@ soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) int error; error = copyout(mtod(m, char *), sopt->sopt_val, - m->m_len); + m->m_len); if (error != 0) { m_freem(m0); return(error); } } else bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); - sopt->sopt_valsize -= m->m_len; - sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; - valsize += m->m_len; - m = m->m_next; + sopt->sopt_valsize -= m->m_len; + sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; + valsize += m->m_len; + m = m->m_next; } if (m != NULL) { /* enough soopt buffer should be given from user-land */ @@ -3322,7 +3332,7 @@ filt_solisten(struct knote *kn, long hint) struct socket *so = kn->kn_fp->f_data; kn->kn_data = so->so_qlen; - return (! 
TAILQ_EMPTY(&so->so_comp)); + return (!TAILQ_EMPTY(&so->so_comp)); } int @@ -3381,7 +3391,7 @@ soisconnecting(struct socket *so) void soisconnected(struct socket *so) { - struct socket *head; + struct socket *head; int ret; restart: @@ -3409,7 +3419,7 @@ restart: head->so_accf->so_accept_filter_arg); so->so_options &= ~SO_ACCEPTFILTER; ret = head->so_accf->so_accept_filter->accf_callback(so, - head->so_accf->so_accept_filter_arg, M_DONTWAIT); + head->so_accf->so_accept_filter_arg, M_NOWAIT); if (ret == SU_ISCONNECTED) soupcall_clear(so, SO_RCV); SOCK_UNLOCK(so); @@ -3486,7 +3496,7 @@ soupcall_set(struct socket *so, int which, int (*func)(struct socket *, void *, int), void *arg) { struct sockbuf *sb; - + switch (which) { case SO_RCV: sb = &so->so_rcv; @@ -3570,9 +3580,10 @@ sotoxsocket(struct socket *so, struct xsocket *xso) */ void -so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), void *arg) +so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), + void *arg) { - + TAILQ_FOREACH(so, &so->so_comp, so_list) func(so, arg); } @@ -3692,11 +3703,13 @@ so_sowwakeup_locked(struct socket *so) void so_lock(struct socket *so) { + SOCK_LOCK(so); } void so_unlock(struct socket *so) { + SOCK_UNLOCK(so); } diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c index 8ecaa02..b29c2c6 100644 --- a/sys/kern/uipc_syscalls.c +++ b/sys/kern/uipc_syscalls.c @@ -722,7 +722,7 @@ sendit(td, s, mp, flags) if (mp->msg_flags == MSG_COMPAT) { struct cmsghdr *cm; - M_PREPEND(control, sizeof(*cm), M_WAIT); + M_PREPEND(control, sizeof(*cm), M_WAITOK); cm = mtod(control, struct cmsghdr *); cm->cmsg_len = control->m_len; cm->cmsg_level = SOL_SOCKET; @@ -1661,9 +1661,9 @@ sockargs(mp, buf, buflen, type) if ((u_int)buflen > MCLBYTES) return (EINVAL); } - m = m_get(M_WAIT, type); + m = m_get(M_WAITOK, type); if ((u_int)buflen > MLEN) - MCLGET(m, M_WAIT); + MCLGET(m, M_WAITOK); m->m_len = buflen; error = copyin(buf, mtod(m, caddr_t), 
(u_int)buflen); if (error) diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c index c60b177..c732c70 100644 --- a/sys/kern/uipc_usrreq.c +++ b/sys/kern/uipc_usrreq.c @@ -1778,6 +1778,7 @@ unp_init(void) if (unp_zone == NULL) panic("unp_init"); uma_zone_set_max(unp_zone, maxsockets); + uma_zone_set_warning(unp_zone, "kern.ipc.maxsockets limit reached"); EVENTHANDLER_REGISTER(maxsockets_change, unp_zone_change, NULL, EVENTHANDLER_PRI_ANY); LIST_INIT(&unp_dhead); diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index 0480bd4..96c8442 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -2107,15 +2107,16 @@ restart: if (maxsize != bp->b_kvasize) { vm_offset_t addr = 0; + int rv; bfreekva(bp); vm_map_lock(buffer_map); if (vm_map_findspace(buffer_map, - vm_map_min(buffer_map), maxsize, &addr)) { + vm_map_min(buffer_map), maxsize, &addr)) { /* - * Uh oh. Buffer map is to fragmented. We - * must defragment the map. + * Buffer map is too fragmented. + * We must defragment the map. 
*/ atomic_add_int(&bufdefragcnt, 1); vm_map_unlock(buffer_map); @@ -2124,22 +2125,21 @@ restart: brelse(bp); goto restart; } - if (addr) { - vm_map_insert(buffer_map, NULL, 0, - addr, addr + maxsize, - VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); - - bp->b_kvabase = (caddr_t) addr; - bp->b_kvasize = maxsize; - atomic_add_long(&bufspace, bp->b_kvasize); - atomic_add_int(&bufreusecnt, 1); - } + rv = vm_map_insert(buffer_map, NULL, 0, addr, + addr + maxsize, VM_PROT_ALL, VM_PROT_ALL, + MAP_NOFAULT); + KASSERT(rv == KERN_SUCCESS, + ("vm_map_insert(buffer_map) rv %d", rv)); vm_map_unlock(buffer_map); + bp->b_kvabase = (caddr_t)addr; + bp->b_kvasize = maxsize; + atomic_add_long(&bufspace, bp->b_kvasize); + atomic_add_int(&bufreusecnt, 1); } bp->b_saveaddr = bp->b_kvabase; bp->b_data = bp->b_saveaddr; } - return(bp); + return (bp); } /* @@ -2209,7 +2209,7 @@ buf_daemon() while (numdirtybuffers > lodirtybuffers) { if (buf_do_flush(NULL) == 0) break; - kern_yield(PRI_UNCHANGED); + kern_yield(PRI_USER); } lodirtybuffers = lodirtysave; @@ -2605,8 +2605,6 @@ loop: * If this check ever becomes a bottleneck it may be better to * move it into the else, when gbincore() fails. At the moment * it isn't a problem. 
- * - * XXX remove if 0 sections (clean this up after its proven) */ if (numfreebuffers == 0) { if (TD_IS_IDLETHREAD(curthread)) diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c index 4185211..114c23e 100644 --- a/sys/kern/vfs_export.c +++ b/sys/kern/vfs_export.c @@ -208,7 +208,7 @@ vfs_hang_addrlist(struct mount *mp, struct netexport *nep, np->netc_anon = crget(); np->netc_anon->cr_uid = argp->ex_anon.cr_uid; crsetgroups(np->netc_anon, argp->ex_anon.cr_ngroups, - np->netc_anon->cr_groups); + argp->ex_anon.cr_groups); np->netc_anon->cr_prison = &prison0; prison_hold(np->netc_anon->cr_prison); np->netc_numsecflavors = argp->ex_numsecflavors; diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c index 25b79ae..5926e15 100644 --- a/sys/kern/vfs_mount.c +++ b/sys/kern/vfs_mount.c @@ -559,7 +559,7 @@ vfs_donmount(struct thread *td, uint64_t fsflags, struct uio *fsoptions) if (error || fstype[fstypelen - 1] != '\0') { error = EINVAL; if (errmsg != NULL) - strncpy(errmsg, "Invalid fstype", errmsg_len); + strlcpy(errmsg, "Invalid fstype", errmsg_len); goto bail; } fspathlen = 0; @@ -567,7 +567,7 @@ vfs_donmount(struct thread *td, uint64_t fsflags, struct uio *fsoptions) if (error || fspath[fspathlen - 1] != '\0') { error = EINVAL; if (errmsg != NULL) - strncpy(errmsg, "Invalid fspath", errmsg_len); + strlcpy(errmsg, "Invalid fspath", errmsg_len); goto bail; } @@ -711,7 +711,7 @@ sys_mount(td, uap) int error; /* - * Mount flags are now 64-bits. On 32-bit archtectures only + * Mount flags are now 64-bits. On 32-bit architectures only * 32-bits are passed in, but from here on everything handles * 64-bit flags correctly. 
*/ @@ -1447,7 +1447,7 @@ vfs_filteropt(struct vfsoptlist *opts, const char **legal) if (ret != 0) { TAILQ_FOREACH(opt, opts, link) { if (strcmp(opt->name, "errmsg") == 0) { - strncpy((char *)opt->value, errmsg, opt->len); + strlcpy((char *)opt->value, errmsg, opt->len); break; } } @@ -1724,7 +1724,7 @@ __mnt_vnode_next(struct vnode **mvp, struct mount *mp) KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); if (should_yield()) { MNT_IUNLOCK(mp); - kern_yield(PRI_UNCHANGED); + kern_yield(PRI_USER); MNT_ILOCK(mp); } vp = TAILQ_NEXT(*mvp, v_nmntvnodes); diff --git a/sys/kern/vfs_mountroot.c b/sys/kern/vfs_mountroot.c index 83948f2..147926e 100644 --- a/sys/kern/vfs_mountroot.c +++ b/sys/kern/vfs_mountroot.c @@ -672,10 +672,11 @@ parse_mount_dev_present(const char *dev) return (error != 0) ? 0 : 1; } +#define ERRMSGL 255 static int parse_mount(char **conf) { - char errmsg[255]; + char *errmsg; struct mntarg *ma; char *dev, *fs, *opts, *tok; int delay, error, timeout; @@ -707,7 +708,7 @@ parse_mount(char **conf) printf("Trying to mount root from %s:%s [%s]...\n", fs, dev, (opts != NULL) ? opts : ""); - bzero(errmsg, sizeof(errmsg)); + errmsg = malloc(ERRMSGL, M_TEMP, M_WAITOK | M_ZERO); if (vfs_byname(fs) == NULL) { strlcpy(errmsg, "unknown file system", sizeof(errmsg)); @@ -734,7 +735,7 @@ parse_mount(char **conf) ma = mount_arg(ma, "fstype", fs, -1); ma = mount_arg(ma, "fspath", "/", -1); ma = mount_arg(ma, "from", dev, -1); - ma = mount_arg(ma, "errmsg", errmsg, sizeof(errmsg)); + ma = mount_arg(ma, "errmsg", errmsg, ERRMSGL); ma = mount_arg(ma, "ro", NULL, 0); ma = parse_mountroot_options(ma, opts); error = kernel_mount(ma, MNT_ROOTFS); @@ -748,11 +749,13 @@ parse_mount(char **conf) printf(".\n"); } free(fs, M_TEMP); + free(errmsg, M_TEMP); if (opts != NULL) free(opts, M_TEMP); /* kernel_mount can return -1 on error. */ return ((error < 0) ? 
EDOOFUS : error); } +#undef ERRMSGL static int vfs_mountroot_parse(struct sbuf *sb, struct mount *mpdevfs) diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 2c470df..7c243b6 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -69,6 +69,7 @@ __FBSDID("$FreeBSD$"); #include <sys/reboot.h> #include <sys/sched.h> #include <sys/sleepqueue.h> +#include <sys/smp.h> #include <sys/stat.h> #include <sys/sysctl.h> #include <sys/syslog.h> @@ -740,7 +741,7 @@ next_iter: continue; MNT_IUNLOCK(mp); yield: - kern_yield(PRI_UNCHANGED); + kern_yield(PRI_USER); relock_mnt: MNT_ILOCK(mp); } @@ -852,7 +853,7 @@ vnlru_proc(void) vnlru_nowhere++; tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); } else - kern_yield(PRI_UNCHANGED); + kern_yield(PRI_USER); } } @@ -4634,7 +4635,7 @@ __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) struct vnode *vp; if (should_yield()) - kern_yield(PRI_UNCHANGED); + kern_yield(PRI_USER); MNT_ILOCK(mp); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); vp = TAILQ_NEXT(*mvp, v_nmntvnodes); @@ -4710,30 +4711,48 @@ __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) * These are helper functions for filesystems to traverse their * active vnodes. 
See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h */ -struct vnode * -__mnt_vnode_next_active(struct vnode **mvp, struct mount *mp) +static void +mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp) { - struct vnode *vp, *nvp; - if (should_yield()) - kern_yield(PRI_UNCHANGED); + KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); + MNT_ILOCK(mp); -restart: - mtx_lock(&vnode_free_list_mtx); + MNT_REL(mp); + MNT_IUNLOCK(mp); + free(*mvp, M_VNODE_MARKER); + *mvp = NULL; +} + +static struct vnode * +mnt_vnode_next_active(struct vnode **mvp, struct mount *mp) +{ + struct vnode *vp, *nvp; + + mtx_assert(&vnode_free_list_mtx, MA_OWNED); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); +restart: vp = TAILQ_NEXT(*mvp, v_actfreelist); + TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist); while (vp != NULL) { if (vp->v_type == VMARKER) { vp = TAILQ_NEXT(vp, v_actfreelist); continue; } if (!VI_TRYLOCK(vp)) { - mtx_unlock(&vnode_free_list_mtx); - kern_yield(PRI_UNCHANGED); - goto restart; + if (mp_ncpus == 1 || should_yield()) { + TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist); + mtx_unlock(&vnode_free_list_mtx); + kern_yield(PRI_USER); + mtx_lock(&vnode_free_list_mtx); + goto restart; + } + continue; } - if (vp->v_mount == mp && vp->v_type != VMARKER && - (vp->v_iflag & VI_DOOMED) == 0) + KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); + KASSERT(vp->v_mount == mp || vp->v_mount == NULL, + ("alien vnode on the active list %p %p", vp, mp)); + if (vp->v_mount == mp && (vp->v_iflag & VI_DOOMED) == 0) break; nvp = TAILQ_NEXT(vp, v_actfreelist); VI_UNLOCK(vp); @@ -4743,86 +4762,58 @@ restart: /* Check if we are done */ if (vp == NULL) { mtx_unlock(&vnode_free_list_mtx); - __mnt_vnode_markerfree_active(mvp, mp); - /* MNT_IUNLOCK(mp); -- done in above function */ - mtx_assert(MNT_MTX(mp), MA_NOTOWNED); + mnt_vnode_markerfree_active(mvp, mp); return (NULL); } - TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist); 
TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist); mtx_unlock(&vnode_free_list_mtx); - MNT_IUNLOCK(mp); ASSERT_VI_LOCKED(vp, "active iter"); KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp)); return (vp); } struct vnode * +__mnt_vnode_next_active(struct vnode **mvp, struct mount *mp) +{ + + if (should_yield()) + kern_yield(PRI_USER); + mtx_lock(&vnode_free_list_mtx); + return (mnt_vnode_next_active(mvp, mp)); +} + +struct vnode * __mnt_vnode_first_active(struct vnode **mvp, struct mount *mp) { - struct vnode *vp, *nvp; + struct vnode *vp; *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); MNT_ILOCK(mp); MNT_REF(mp); + MNT_IUNLOCK(mp); (*mvp)->v_type = VMARKER; + (*mvp)->v_mount = mp; -restart: mtx_lock(&vnode_free_list_mtx); vp = TAILQ_FIRST(&mp->mnt_activevnodelist); - while (vp != NULL) { - if (vp->v_type == VMARKER) { - vp = TAILQ_NEXT(vp, v_actfreelist); - continue; - } - if (!VI_TRYLOCK(vp)) { - mtx_unlock(&vnode_free_list_mtx); - kern_yield(PRI_UNCHANGED); - goto restart; - } - if (vp->v_mount == mp && vp->v_type != VMARKER && - (vp->v_iflag & VI_DOOMED) == 0) - break; - nvp = TAILQ_NEXT(vp, v_actfreelist); - VI_UNLOCK(vp); - vp = nvp; - } - - /* Check if we are done */ if (vp == NULL) { mtx_unlock(&vnode_free_list_mtx); - MNT_REL(mp); - MNT_IUNLOCK(mp); - free(*mvp, M_VNODE_MARKER); - *mvp = NULL; + mnt_vnode_markerfree_active(mvp, mp); return (NULL); } - (*mvp)->v_mount = mp; - TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist); - mtx_unlock(&vnode_free_list_mtx); - MNT_IUNLOCK(mp); - ASSERT_VI_LOCKED(vp, "active iter first"); - KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp)); - return (vp); + TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist); + return (mnt_vnode_next_active(mvp, mp)); } void __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp) { - if (*mvp == NULL) { - MNT_IUNLOCK(mp); + if (*mvp == NULL) return; - } - mtx_assert(MNT_MTX(mp), MA_OWNED); 
- - KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); mtx_lock(&vnode_free_list_mtx); TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist); mtx_unlock(&vnode_free_list_mtx); - MNT_REL(mp); - MNT_IUNLOCK(mp); - free(*mvp, M_VNODE_MARKER); - *mvp = NULL; + mnt_vnode_markerfree_active(mvp, mp); } diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 3f65b05..bbe837a 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -1434,6 +1434,40 @@ vn_closefile(fp, td) * proceed. If a suspend request is in progress, we wait until the * suspension is over, and then proceed. */ +static int +vn_start_write_locked(struct mount *mp, int flags) +{ + int error; + + mtx_assert(MNT_MTX(mp), MA_OWNED); + error = 0; + + /* + * Check on status of suspension. + */ + if ((curthread->td_pflags & TDP_IGNSUSP) == 0 || + mp->mnt_susp_owner != curthread) { + while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { + if (flags & V_NOWAIT) { + error = EWOULDBLOCK; + goto unlock; + } + error = msleep(&mp->mnt_flag, MNT_MTX(mp), + (PUSER - 1) | (flags & PCATCH), "suspfs", 0); + if (error) + goto unlock; + } + } + if (flags & V_XSLEEP) + goto unlock; + mp->mnt_writeopcount++; +unlock: + if (error != 0 || (flags & V_XSLEEP) != 0) + MNT_REL(mp); + MNT_IUNLOCK(mp); + return (error); +} + int vn_start_write(vp, mpp, flags) struct vnode *vp; @@ -1470,30 +1504,7 @@ vn_start_write(vp, mpp, flags) if (vp == NULL) MNT_REF(mp); - /* - * Check on status of suspension. 
- */ - if ((curthread->td_pflags & TDP_IGNSUSP) == 0 || - mp->mnt_susp_owner != curthread) { - while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { - if (flags & V_NOWAIT) { - error = EWOULDBLOCK; - goto unlock; - } - error = msleep(&mp->mnt_flag, MNT_MTX(mp), - (PUSER - 1) | (flags & PCATCH), "suspfs", 0); - if (error) - goto unlock; - } - } - if (flags & V_XSLEEP) - goto unlock; - mp->mnt_writeopcount++; -unlock: - if (error != 0 || (flags & V_XSLEEP) != 0) - MNT_REL(mp); - MNT_IUNLOCK(mp); - return (error); + return (vn_start_write_locked(mp, flags)); } /* @@ -1639,8 +1650,7 @@ vfs_write_suspend(mp) * Request a filesystem to resume write operations. */ void -vfs_write_resume(mp) - struct mount *mp; +vfs_write_resume_flags(struct mount *mp, int flags) { MNT_ILOCK(mp); @@ -1652,10 +1662,26 @@ vfs_write_resume(mp) wakeup(&mp->mnt_writeopcount); wakeup(&mp->mnt_flag); curthread->td_pflags &= ~TDP_IGNSUSP; + if ((flags & VR_START_WRITE) != 0) { + MNT_REF(mp); + mp->mnt_writeopcount++; + } MNT_IUNLOCK(mp); - VFS_SUSP_CLEAN(mp); - } else + if ((flags & VR_NO_SUSPCLR) == 0) + VFS_SUSP_CLEAN(mp); + } else if ((flags & VR_START_WRITE) != 0) { + MNT_REF(mp); + vn_start_write_locked(mp, 0); + } else { MNT_IUNLOCK(mp); + } +} + +void +vfs_write_resume(struct mount *mp) +{ + + vfs_write_resume_flags(mp, 0); } /* |