Diffstat (limited to 'sys/kern')
30 files changed, 1874 insertions, 825 deletions
diff --git a/sys/kern/imgact_binmisc.c b/sys/kern/imgact_binmisc.c index dd57717..39ca156 100644 --- a/sys/kern/imgact_binmisc.c +++ b/sys/kern/imgact_binmisc.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2013-15, Stacey D. Son + * Copyright (c) 2013-16, Stacey D. Son * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -220,16 +220,17 @@ imgact_binmisc_add_entry(ximgact_binmisc_entry_t *xbe) { imgact_binmisc_entry_t *ibe; char *p; + int cnt; if (xbe->xbe_msize > IBE_MAGIC_MAX) return (EINVAL); - for(p = xbe->xbe_name; *p != 0; p++) - if (!isascii((int)*p)) + for(cnt = 0, p = xbe->xbe_name; *p != 0; cnt++, p++) + if (cnt >= IBE_NAME_MAX || !isascii((int)*p)) return (EINVAL); - for(p = xbe->xbe_interpreter; *p != 0; p++) - if (!isascii((int)*p)) + for(cnt = 0, p = xbe->xbe_interpreter; *p != 0; cnt++, p++) + if (cnt >= IBE_INTERP_LEN_MAX || !isascii((int)*p)) return (EINVAL); /* Make sure we don't have any invalid #'s. */ @@ -266,8 +267,6 @@ imgact_binmisc_add_entry(ximgact_binmisc_entry_t *xbe) /* Preallocate a new entry. */ ibe = imgact_binmisc_new_entry(xbe); - if (!ibe) - return (ENOMEM); SLIST_INSERT_HEAD(&interpreter_list, ibe, link); interp_list_entry_count++; diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c index 43d4800..0bed714 100644 --- a/sys/kern/imgact_elf.c +++ b/sys/kern/imgact_elf.c @@ -1370,10 +1370,6 @@ __elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit, int flags) * and write it out following the notes. */ hdr = malloc(hdrsize, M_TEMP, M_WAITOK); - if (hdr == NULL) { - error = EINVAL; - goto done; - } error = __elfN(corehdr)(¶ms, seginfo.count, hdr, hdrsize, ¬elst, notesz); diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c index 96c5229..47f2f90 100644 --- a/sys/kern/init_sysent.c +++ b/sys/kern/init_sysent.c @@ -3,7 +3,7 @@ * * DO NOT EDIT-- this file is automatically generated. * $FreeBSD$ - * created from FreeBSD: head/sys/kern/syscalls.master 296572 2016-03-09 19:05:11Z jhb + * created from FreeBSD: head/sys/kern/syscalls.master 297167 2016-03-21 21:37:33Z jhb */ #include "opt_compat.h" diff --git a/sys/kern/kern_condvar.c b/sys/kern/kern_condvar.c index 8061829..6f891e0 100644 --- a/sys/kern/kern_condvar.c +++ b/sys/kern/kern_condvar.c @@ -122,15 +122,8 @@ _cv_wait(struct cv *cvp, struct lock_object *lock) "Waiting on \"%s\"", cvp->cv_description); class = LOCK_CLASS(lock); - if (cold || SCHEDULER_STOPPED()) { - /* - * During autoconfiguration, just give interrupts - * a chance, then just return. Don't run any other - * thread or panic below, in case this is the idle - * process and already asleep. - */ + if (SCHEDULER_STOPPED()) return; - } sleepq_lock(cvp); @@ -183,13 +176,7 @@ _cv_wait_unlock(struct cv *cvp, struct lock_object *lock) ("cv_wait_unlock cannot be used with Giant")); class = LOCK_CLASS(lock); - if (cold || SCHEDULER_STOPPED()) { - /* - * During autoconfiguration, just give interrupts - * a chance, then just return. Don't run any other - * thread or panic below, in case this is the idle - * process and already asleep. - */ + if (SCHEDULER_STOPPED()) { class->lc_unlock(lock); return; } @@ -240,15 +227,8 @@ _cv_wait_sig(struct cv *cvp, struct lock_object *lock) "Waiting on \"%s\"", cvp->cv_description); class = LOCK_CLASS(lock); - if (cold || SCHEDULER_STOPPED()) { - /* - * After a panic, or during autoconfiguration, just give - * interrupts a chance, then just return; don't run any other - * procs or panic below, in case this is the idle process and - * already asleep. 
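A rough sketch of how the cv(9) routines modified in the kern_condvar.c hunks here are typically consumed; the softc layout and every name below are hypothetical illustration, not part of this commit:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/condvar.h>

struct mysoftc {                        /* hypothetical */
        struct mtx      sc_mtx;
        struct cv       sc_cv;
        int             sc_ready;
};

static void
mysoftc_init(struct mysoftc *sc)
{

        mtx_init(&sc->sc_mtx, "mysoftc", NULL, MTX_DEF);
        cv_init(&sc->sc_cv, "myready");
        sc->sc_ready = 0;
}

static void
mysoftc_wait(struct mysoftc *sc)
{

        /* cv_wait() drops and reacquires sc_mtx around the sleep. */
        mtx_lock(&sc->sc_mtx);
        while (sc->sc_ready == 0)
                cv_wait(&sc->sc_cv, &sc->sc_mtx);
        mtx_unlock(&sc->sc_mtx);
}

static void
mysoftc_post(struct mysoftc *sc)
{

        mtx_lock(&sc->sc_mtx);
        sc->sc_ready = 1;
        cv_signal(&sc->sc_cv);
        mtx_unlock(&sc->sc_mtx);
}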
- */ + if (SCHEDULER_STOPPED()) return (0); - } sleepq_lock(cvp); @@ -307,15 +287,8 @@ _cv_timedwait_sbt(struct cv *cvp, struct lock_object *lock, sbintime_t sbt, "Waiting on \"%s\"", cvp->cv_description); class = LOCK_CLASS(lock); - if (cold || SCHEDULER_STOPPED()) { - /* - * After a panic, or during autoconfiguration, just give - * interrupts a chance, then just return; don't run any other - * thread or panic below, in case this is the idle process and - * already asleep. - */ - return 0; - } + if (SCHEDULER_STOPPED()) + return (0); sleepq_lock(cvp); @@ -376,15 +349,8 @@ _cv_timedwait_sig_sbt(struct cv *cvp, struct lock_object *lock, "Waiting on \"%s\"", cvp->cv_description); class = LOCK_CLASS(lock); - if (cold || SCHEDULER_STOPPED()) { - /* - * After a panic, or during autoconfiguration, just give - * interrupts a chance, then just return; don't run any other - * thread or panic below, in case this is the idle process and - * already asleep. - */ - return 0; - } + if (SCHEDULER_STOPPED()) + return (0); sleepq_lock(cvp); diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c index 00bd54b..b37adcc 100644 --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -3958,7 +3958,7 @@ badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, static int badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, - int kflags, struct thread *td) + struct thread *td) { return (EBADF); @@ -4044,7 +4044,7 @@ invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, int invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, - int kflags, struct thread *td) + struct thread *td) { return (EINVAL); diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c index 69f0774..7c88fe0 100644 --- a/sys/kern/kern_exec.c +++ b/sys/kern/kern_exec.c @@ -1570,8 +1570,6 @@ exec_register(execsw_arg) for (es = execsw; *es; es++) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); - if (newexecsw == NULL) - return (ENOMEM); xs = newexecsw; if (execsw) for (es = execsw; *es; es++) @@ -1604,8 +1602,6 @@ exec_unregister(execsw_arg) if (*es != execsw_arg) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); - if (newexecsw == NULL) - return (ENOMEM); xs = newexecsw; for (es = execsw; *es; es++) if (*es != execsw_arg) diff --git a/sys/kern/kern_fail.c b/sys/kern/kern_fail.c index 3737aa3..ec466dd 100644 --- a/sys/kern/kern_fail.c +++ b/sys/kern/kern_fail.c @@ -52,17 +52,25 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include "opt_stack.h" + #include <sys/ctype.h> #include <sys/errno.h> #include <sys/fail.h> #include <sys/kernel.h> #include <sys/libkern.h> +#include <sys/limits.h> #include <sys/lock.h> #include <sys/malloc.h> #include <sys/mutex.h> #include <sys/proc.h> #include <sys/sbuf.h> +#include <sys/sleepqueue.h> +#include <sys/sx.h> +#include <sys/sysctl.h> +#include <sys/types.h> +#include <machine/atomic.h> #include <machine/stdarg.h> #ifdef ILOG_DEFINE_FOR_FILE @@ -72,11 +80,45 @@ ILOG_DEFINE_FOR_FILE(L_ISI_FAIL_POINT, L_ILOG, fail_point); static MALLOC_DEFINE(M_FAIL_POINT, "Fail Points", "fail points system"); #define fp_free(ptr) free(ptr, M_FAIL_POINT) #define fp_malloc(size, flags) malloc((size), M_FAIL_POINT, (flags)) +#define fs_free(ptr) fp_free(ptr) +#define fs_malloc() fp_malloc(sizeof(struct fail_point_setting), \ + M_WAITOK | M_ZERO) + + /** + * These 
define the wchans that are used for sleeping, pausing respectively. + * They are chosen arbitrarily but need to be distinct to the failpoint and + * the sleep/pause distinction. + */ +#define FP_SLEEP_CHANNEL(fp) (void*)(fp) +#define FP_PAUSE_CHANNEL(fp) __DEVOLATILE(void*, &fp->fp_setting) + +/** + * Don't allow more than this many entries in a fail point set by sysctl. + * The 99.99...% case is to have 1 entry. I can't imagine having this many + * entries, so it should not limit us. Saves on re-mallocs while holding + * a non-sleepable lock. + */ +#define FP_MAX_ENTRY_COUNT 20 + +/* Used to drain sbufs to the sysctl output */ +int fail_sysctl_drain_func(void *, const char *, int); + +/* Head of tailq of struct fail_point_entry */ +TAILQ_HEAD(fail_point_entry_queue, fail_point_entry); + +/** + * fp entries garbage list; outstanding entries are cleaned up in the + * garbage collector + */ +STAILQ_HEAD(fail_point_setting_garbage, fail_point_setting); +static struct fail_point_setting_garbage fp_setting_garbage = + STAILQ_HEAD_INITIALIZER(fp_setting_garbage); +static struct mtx mtx_garbage_list; +MTX_SYSINIT(mtx_garbage_list, &mtx_garbage_list, "fail point garbage mtx", + MTX_SPIN); -static struct mtx g_fp_mtx; -MTX_SYSINIT(g_fp_mtx, &g_fp_mtx, "fail point mtx", MTX_DEF); -#define FP_LOCK() mtx_lock(&g_fp_mtx) -#define FP_UNLOCK() mtx_unlock(&g_fp_mtx) +static struct sx sx_fp_set; +SX_SYSINIT(sx_fp_set, &sx_fp_set, "fail point set sx"); /** * Failpoint types. @@ -90,7 +132,11 @@ enum fail_point_t { FAIL_POINT_BREAK, /**< break into the debugger */ FAIL_POINT_PRINT, /**< print a message */ FAIL_POINT_SLEEP, /**< sleep for some msecs */ - FAIL_POINT_NUMTYPES + FAIL_POINT_PAUSE, /**< sleep until failpoint is set to off */ + FAIL_POINT_YIELD, /**< yield the cpu */ + FAIL_POINT_DELAY, /**< busy wait the cpu */ + FAIL_POINT_NUMTYPES, + FAIL_POINT_INVALID = -1 }; static struct { @@ -104,53 +150,307 @@ static struct { [FAIL_POINT_BREAK] = FP_TYPE_NM_LEN("break"), [FAIL_POINT_PRINT] = FP_TYPE_NM_LEN("print"), [FAIL_POINT_SLEEP] = FP_TYPE_NM_LEN("sleep"), + [FAIL_POINT_PAUSE] = FP_TYPE_NM_LEN("pause"), + [FAIL_POINT_YIELD] = FP_TYPE_NM_LEN("yield"), + [FAIL_POINT_DELAY] = FP_TYPE_NM_LEN("delay"), }; +#define FE_COUNT_UNTRACKED (INT_MIN) + /** * Internal structure tracking a single term of a complete failpoint. * @ingroup failpoint_private */ struct fail_point_entry { - enum fail_point_t fe_type; /**< type of entry */ + volatile bool fe_stale; + enum fail_point_t fe_type; /**< type of entry */ int fe_arg; /**< argument to type (e.g. return value) */ int fe_prob; /**< likelihood of firing in millionths */ - int fe_count; /**< number of times to fire, 0 means always */ + int fe_count; /**< number of times to fire, -1 means infinite */ pid_t fe_pid; /**< only fail for this process */ - TAILQ_ENTRY(fail_point_entry) fe_entries; /**< next entry in fail point */ + struct fail_point *fe_parent; /**< backpointer to fp */ + TAILQ_ENTRY(fail_point_entry) fe_entries; /**< next entry ptr */ }; +struct fail_point_setting { + STAILQ_ENTRY(fail_point_setting) fs_garbage_link; + struct fail_point_entry_queue fp_entry_queue; + struct fail_point * fs_parent; + struct mtx feq_mtx; /* Gives fail_point_pause something to do. 
*/ +}; + +/** + * Defines stating the equivalent of probablilty one (100%) + */ +enum { + PROB_MAX = 1000000, /* probability between zero and this number */ + PROB_DIGITS = 6 /* number of zero's in above number */ +}; + +/* Get a ref on an fp's fp_setting */ +static inline struct fail_point_setting *fail_point_setting_get_ref( + struct fail_point *fp); +/* Release a ref on an fp_setting */ +static inline void fail_point_setting_release_ref(struct fail_point *fp); +/* Allocate and initialize a struct fail_point_setting */ +static struct fail_point_setting *fail_point_setting_new(struct + fail_point *); +/* Free a struct fail_point_setting */ +static void fail_point_setting_destroy(struct fail_point_setting *fp_setting); +/* Allocate and initialize a struct fail_point_entry */ +static struct fail_point_entry *fail_point_entry_new(struct + fail_point_setting *); +/* Free a struct fail_point_entry */ +static void fail_point_entry_destroy(struct fail_point_entry *fp_entry); +/* Append fp setting to garbage list */ +static inline void fail_point_setting_garbage_append( + struct fail_point_setting *fp_setting); +/* Swap fp's setting with fp_setting_new */ +static inline struct fail_point_setting * + fail_point_swap_settings(struct fail_point *fp, + struct fail_point_setting *fp_setting_new); +/* Free up any zero-ref setting in the garbage queue */ +static void fail_point_garbage_collect(void); +/* If this fail point's setting are empty, then swap it out to NULL. */ +static inline void fail_point_eval_swap_out(struct fail_point *fp, + struct fail_point_setting *fp_setting); + +bool +fail_point_is_off(struct fail_point *fp) +{ + bool return_val; + struct fail_point_setting *fp_setting; + struct fail_point_entry *ent; + + return_val = true; + + fp_setting = fail_point_setting_get_ref(fp); + if (fp_setting != NULL) { + TAILQ_FOREACH(ent, &fp_setting->fp_entry_queue, + fe_entries) { + if (!ent->fe_stale) { + return_val = false; + break; + } + } + } + fail_point_setting_release_ref(fp); + + return (return_val); +} + +/* Allocate and initialize a struct fail_point_setting */ +static struct fail_point_setting * +fail_point_setting_new(struct fail_point *fp) +{ + struct fail_point_setting *fs_new; + + fs_new = fs_malloc(); + fs_new->fs_parent = fp; + TAILQ_INIT(&fs_new->fp_entry_queue); + mtx_init(&fs_new->feq_mtx, "fail point entries", NULL, MTX_SPIN); + + fail_point_setting_garbage_append(fs_new); + + return (fs_new); +} + +/* Free a struct fail_point_setting */ +static void +fail_point_setting_destroy(struct fail_point_setting *fp_setting) +{ + struct fail_point_entry *ent; + + while (!TAILQ_EMPTY(&fp_setting->fp_entry_queue)) { + ent = TAILQ_FIRST(&fp_setting->fp_entry_queue); + TAILQ_REMOVE(&fp_setting->fp_entry_queue, ent, fe_entries); + fail_point_entry_destroy(ent); + } + + fs_free(fp_setting); +} + +/* Allocate and initialize a struct fail_point_entry */ +static struct fail_point_entry * +fail_point_entry_new(struct fail_point_setting *fp_setting) +{ + struct fail_point_entry *fp_entry; + + fp_entry = fp_malloc(sizeof(struct fail_point_entry), + M_WAITOK | M_ZERO); + fp_entry->fe_parent = fp_setting->fs_parent; + fp_entry->fe_prob = PROB_MAX; + fp_entry->fe_pid = NO_PID; + fp_entry->fe_count = FE_COUNT_UNTRACKED; + TAILQ_INSERT_TAIL(&fp_setting->fp_entry_queue, fp_entry, + fe_entries); + + return (fp_entry); +} + +/* Free a struct fail_point_entry */ +static void +fail_point_entry_destroy(struct fail_point_entry *fp_entry) +{ + + fp_free(fp_entry); +} + +/* Get a ref on an fp's fp_setting */ 
+static inline struct fail_point_setting * +fail_point_setting_get_ref(struct fail_point *fp) +{ + struct fail_point_setting *fp_setting; + + /* Invariant: if we have a ref, our pointer to fp_setting is safe */ + atomic_add_acq_32(&fp->fp_ref_cnt, 1); + fp_setting = fp->fp_setting; + + return (fp_setting); +} + +/* Release a ref on an fp_setting */ +static inline void +fail_point_setting_release_ref(struct fail_point *fp) +{ + + KASSERT(&fp->fp_ref_cnt > 0, ("Attempting to deref w/no refs")); + atomic_subtract_rel_32(&fp->fp_ref_cnt, 1); +} + +/* Append fp entries to fp garbage list */ +static inline void +fail_point_setting_garbage_append(struct fail_point_setting *fp_setting) +{ + + mtx_lock_spin(&mtx_garbage_list); + STAILQ_INSERT_TAIL(&fp_setting_garbage, fp_setting, + fs_garbage_link); + mtx_unlock_spin(&mtx_garbage_list); +} + +/* Swap fp's entries with fp_setting_new */ +static struct fail_point_setting * +fail_point_swap_settings(struct fail_point *fp, + struct fail_point_setting *fp_setting_new) +{ + struct fail_point_setting *fp_setting_old; + + fp_setting_old = fp->fp_setting; + fp->fp_setting = fp_setting_new; + + return (fp_setting_old); +} + +static inline void +fail_point_eval_swap_out(struct fail_point *fp, + struct fail_point_setting *fp_setting) +{ + + /* We may have already been swapped out and replaced; ignore. */ + if (fp->fp_setting == fp_setting) + fail_point_swap_settings(fp, NULL); +} + +/* Free up any zero-ref entries in the garbage queue */ +static void +fail_point_garbage_collect() +{ + struct fail_point_setting *fs_current, *fs_next; + struct fail_point_setting_garbage fp_ents_free_list; + + /** + * We will transfer the entries to free to fp_ents_free_list while holding + * the spin mutex, then free it after we drop the lock. This avoids + * triggering witness due to sleepable mutexes in the memory + * allocator. + */ + STAILQ_INIT(&fp_ents_free_list); + + mtx_lock_spin(&mtx_garbage_list); + STAILQ_FOREACH_SAFE(fs_current, &fp_setting_garbage, fs_garbage_link, + fs_next) { + if (fs_current->fs_parent->fp_setting != fs_current && + fs_current->fs_parent->fp_ref_cnt == 0) { + STAILQ_REMOVE(&fp_setting_garbage, fs_current, + fail_point_setting, fs_garbage_link); + STAILQ_INSERT_HEAD(&fp_ents_free_list, fs_current, + fs_garbage_link); + } + } + mtx_unlock_spin(&mtx_garbage_list); + + STAILQ_FOREACH_SAFE(fs_current, &fp_ents_free_list, fs_garbage_link, + fs_next) + fail_point_setting_destroy(fs_current); +} + +/* Drain out all refs from this fail point */ +static inline void +fail_point_drain(struct fail_point *fp, int expected_ref) +{ + struct fail_point_setting *entries; + + entries = fail_point_swap_settings(fp, NULL); + /** + * We have unpaused all threads; so we will wait no longer + * than the time taken for the longest remaining sleep, or + * the length of time of a long-running code block. 
+ */ + while (fp->fp_ref_cnt > expected_ref) { + wakeup(FP_PAUSE_CHANNEL(fp)); + tsleep(&fp, PWAIT, "fail_point_drain", hz / 100); + } + fail_point_swap_settings(fp, entries); +} + static inline void -fail_point_sleep(struct fail_point *fp, struct fail_point_entry *ent, - int msecs, enum fail_point_return_code *pret) +fail_point_pause(struct fail_point *fp, enum fail_point_return_code *pret, + struct mtx *mtx_sleep) { - /* convert from millisecs to ticks, rounding up */ - int timo = ((msecs * hz) + 999) / 1000; + + if (fp->fp_pre_sleep_fn) + fp->fp_pre_sleep_fn(fp->fp_pre_sleep_arg); + + msleep_spin(FP_PAUSE_CHANNEL(fp), mtx_sleep, "failpt", 0); + + if (fp->fp_post_sleep_fn) + fp->fp_post_sleep_fn(fp->fp_post_sleep_arg); +} + +static inline void +fail_point_sleep(struct fail_point *fp, int msecs, + enum fail_point_return_code *pret) +{ + int timo; + + /* Convert from millisecs to ticks, rounding up */ + timo = howmany(msecs * hz, 1000); if (timo > 0) { - if (fp->fp_sleep_fn == NULL) { - msleep(fp, &g_fp_mtx, PWAIT, "failpt", timo); + if (!(fp->fp_flags & FAIL_POINT_USE_TIMEOUT_PATH)) { + if (fp->fp_pre_sleep_fn) + fp->fp_pre_sleep_fn(fp->fp_pre_sleep_arg); + + tsleep(FP_SLEEP_CHANNEL(fp), PWAIT, "failpt", timo); + + if (fp->fp_post_sleep_fn) + fp->fp_post_sleep_fn(fp->fp_post_sleep_arg); } else { - timeout(fp->fp_sleep_fn, fp->fp_sleep_arg, timo); + if (fp->fp_pre_sleep_fn) + fp->fp_pre_sleep_fn(fp->fp_pre_sleep_arg); + + timeout(fp->fp_post_sleep_fn, fp->fp_post_sleep_arg, + timo); *pret = FAIL_POINT_RC_QUEUED; } } } - -/** - * Defines stating the equivalent of probablilty one (100%) - */ -enum { - PROB_MAX = 1000000, /* probability between zero and this number */ - PROB_DIGITS = 6, /* number of zero's in above number */ -}; - -static char *parse_fail_point(struct fail_point_entries *, char *); -static char *parse_term(struct fail_point_entries *, char *); +static char *parse_fail_point(struct fail_point_setting *, char *); +static char *parse_term(struct fail_point_setting *, char *); static char *parse_number(int *out_units, int *out_decimal, char *); static char *parse_type(struct fail_point_entry *, char *); -static void free_entry(struct fail_point_entries *, struct fail_point_entry *); -static void clear_entries(struct fail_point_entries *); /** * Initialize a fail_point. The name is formed in a printf-like fashion @@ -167,7 +467,7 @@ fail_point_init(struct fail_point *fp, const char *fmt, ...) char *name; int n; - TAILQ_INIT(&fp->fp_entries); + fp->fp_setting = NULL; fp->fp_flags = 0; /* Figure out the size of the name. */ @@ -185,25 +485,33 @@ fail_point_init(struct fail_point *fp, const char *fmt, ...) fp->fp_name = name; fp->fp_location = ""; fp->fp_flags |= FAIL_POINT_DYNAMIC_NAME; - fp->fp_sleep_fn = NULL; - fp->fp_sleep_arg = NULL; + fp->fp_pre_sleep_fn = NULL; + fp->fp_pre_sleep_arg = NULL; + fp->fp_post_sleep_fn = NULL; + fp->fp_post_sleep_arg = NULL; } /** - * Free the resources held by a fail_point. - * + * Free the resources held by a fail_point, and wake any paused threads. + * Thou shalt not allow threads to hit this fail point after you enter this + * function, nor shall you call this multiple times for a given fp. 
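The rewritten fail_point_sleep() above swaps the open-coded rounding expression for howmany() from <sys/param.h>; a small illustration of the equivalence, in a hypothetical helper:

#include <sys/param.h>          /* howmany(x, y) == ((x) + (y) - 1) / (y) */
#include <sys/kernel.h>         /* hz */

/* Hypothetical helper: convert milliseconds to ticks, rounding up. */
static int
msecs_to_ticks_roundup(int msecs)
{

        /* Same result as the old ((msecs * hz) + 999) / 1000. */
        return (howmany(msecs * hz, 1000));
}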
* @ingroup failpoint */ void fail_point_destroy(struct fail_point *fp) { + fail_point_drain(fp, 0); + if ((fp->fp_flags & FAIL_POINT_DYNAMIC_NAME) != 0) { fp_free(__DECONST(void *, fp->fp_name)); fp->fp_name = NULL; } fp->fp_flags = 0; - clear_entries(&fp->fp_entries); + + sx_xlock(&sx_fp_set); + fail_point_garbage_collect(); + sx_xunlock(&sx_fp_set); } /** @@ -216,21 +524,51 @@ fail_point_destroy(struct fail_point *fp) enum fail_point_return_code fail_point_eval_nontrivial(struct fail_point *fp, int *return_value) { - enum fail_point_return_code ret = FAIL_POINT_RC_CONTINUE; - struct fail_point_entry *ent, *next; + bool execute = false; + struct fail_point_entry *ent; + struct fail_point_setting *fp_setting; + enum fail_point_return_code ret; + int cont; + int count; int msecs; + int usecs; + + ret = FAIL_POINT_RC_CONTINUE; + cont = 0; /* don't continue by default */ + + fp_setting = fail_point_setting_get_ref(fp); + if (fp_setting == NULL) + goto abort; - FP_LOCK(); + TAILQ_FOREACH(ent, &fp_setting->fp_entry_queue, fe_entries) { - TAILQ_FOREACH_SAFE(ent, &fp->fp_entries, fe_entries, next) { - int cont = 0; /* don't continue by default */ + if (ent->fe_stale) + continue; if (ent->fe_prob < PROB_MAX && ent->fe_prob < random() % PROB_MAX) continue; + if (ent->fe_pid != NO_PID && ent->fe_pid != curproc->p_pid) continue; + if (ent->fe_count != FE_COUNT_UNTRACKED) { + count = ent->fe_count; + while (count > 0) { + if (atomic_cmpset_32(&ent->fe_count, count, count - 1)) { + count--; + execute = true; + break; + } + count = ent->fe_count; + } + if (execute == false) + /* We lost the race; consider the entry stale and bail now */ + continue; + if (count == 0) + ent->fe_stale = true; + } + switch (ent->fe_type) { case FAIL_POINT_PANIC: panic("fail point %s panicking", fp->fp_name); @@ -244,7 +582,7 @@ fail_point_eval_nontrivial(struct fail_point *fp, int *return_value) case FAIL_POINT_BREAK: printf("fail point %s breaking to debugger\n", - fp->fp_name); + fp->fp_name); breakpoint(); break; @@ -254,51 +592,95 @@ fail_point_eval_nontrivial(struct fail_point *fp, int *return_value) break; case FAIL_POINT_SLEEP: - /* - * Free the entry now if necessary, since - * we're about to drop the mutex and sleep. - */ msecs = ent->fe_arg; - if (ent->fe_count > 0 && --ent->fe_count == 0) { - free_entry(&fp->fp_entries, ent); - ent = NULL; - } - if (msecs) - fail_point_sleep(fp, ent, msecs, &ret); + fail_point_sleep(fp, msecs, &ret); + break; + + case FAIL_POINT_PAUSE: + /** + * Pausing is inherently strange with multiple + * entries given our design. That is because some + * entries could be unreachable, for instance in cases like: + * pause->return. We can never reach the return entry. + * The sysctl layer actually truncates all entries after + * a pause for this reason. + */ + mtx_lock_spin(&fp_setting->feq_mtx); + fail_point_pause(fp, &ret, &fp_setting->feq_mtx); + mtx_unlock_spin(&fp_setting->feq_mtx); + break; + + case FAIL_POINT_YIELD: + kern_yield(-1); + break; + + case FAIL_POINT_DELAY: + usecs = ent->fe_arg; + DELAY(usecs); break; default: break; } - if (ent != NULL && ent->fe_count > 0 && --ent->fe_count == 0) - free_entry(&fp->fp_entries, ent); if (cont == 0) break; } - /* Get rid of "off"s at the end. 
*/ - while ((ent = TAILQ_LAST(&fp->fp_entries, fail_point_entries)) && - ent->fe_type == FAIL_POINT_OFF) - free_entry(&fp->fp_entries, ent); + if (fail_point_is_off(fp)) + fail_point_eval_swap_out(fp, fp_setting); - FP_UNLOCK(); +abort: + fail_point_setting_release_ref(fp); return (ret); + } /** * Translate internal fail_point structure into human-readable text. */ static void -fail_point_get(struct fail_point *fp, struct sbuf *sb) +fail_point_get(struct fail_point *fp, struct sbuf *sb, + bool verbose) { struct fail_point_entry *ent; + struct fail_point_setting *fp_setting; + struct fail_point_entry *fp_entry_cpy; + int cnt_sleeping; + int idx; + int printed_entry_count; + + cnt_sleeping = 0; + idx = 0; + printed_entry_count = 0; + + fp_entry_cpy = fp_malloc(sizeof(struct fail_point_entry) * + (FP_MAX_ENTRY_COUNT + 1), M_WAITOK); + + fp_setting = fail_point_setting_get_ref(fp); - FP_LOCK(); + if (fp_setting != NULL) { + TAILQ_FOREACH(ent, &fp_setting->fp_entry_queue, fe_entries) { + if (ent->fe_stale) + continue; - TAILQ_FOREACH(ent, &fp->fp_entries, fe_entries) { + KASSERT(printed_entry_count < FP_MAX_ENTRY_COUNT, + ("FP entry list larger than allowed")); + + fp_entry_cpy[printed_entry_count] = *ent; + ++printed_entry_count; + } + } + fail_point_setting_release_ref(fp); + + /* This is our equivalent of a NULL terminator */ + fp_entry_cpy[printed_entry_count].fe_type = FAIL_POINT_INVALID; + + while (idx < printed_entry_count) { + ent = &fp_entry_cpy[idx]; + ++idx; if (ent->fe_prob < PROB_MAX) { int decimal = ent->fe_prob % (PROB_MAX / 100); int units = ent->fe_prob / (PROB_MAX / 100); @@ -313,7 +695,7 @@ fail_point_get(struct fail_point *fp, struct sbuf *sb) } sbuf_printf(sb, "%%"); } - if (ent->fe_count > 0) + if (ent->fe_count >= 0) sbuf_printf(sb, "%d*", ent->fe_count); sbuf_printf(sb, "%s", fail_type_strings[ent->fe_type].name); if (ent->fe_arg) @@ -323,10 +705,33 @@ fail_point_get(struct fail_point *fp, struct sbuf *sb) if (TAILQ_NEXT(ent, fe_entries)) sbuf_printf(sb, "->"); } - if (TAILQ_EMPTY(&fp->fp_entries)) + if (!printed_entry_count) sbuf_printf(sb, "off"); - FP_UNLOCK(); + fp_free(fp_entry_cpy); + if (verbose) { +#ifdef STACK + /* Print number of sleeping threads. queue=0 is the argument + * used by msleep when sending our threads to sleep. */ + sbuf_printf(sb, "\nsleeping_thread_stacks = {\n"); + sleepq_sbuf_print_stacks(sb, FP_SLEEP_CHANNEL(fp), 0, + &cnt_sleeping); + + sbuf_printf(sb, "},\n"); +#endif + sbuf_printf(sb, "sleeping_thread_count = %d,\n", + cnt_sleeping); + +#ifdef STACK + sbuf_printf(sb, "paused_thread_stacks = {\n"); + sleepq_sbuf_print_stacks(sb, FP_PAUSE_CHANNEL(fp), 0, + &cnt_sleeping); + + sbuf_printf(sb, "},\n"); +#endif + sbuf_printf(sb, "paused_thread_count = %d\n", + cnt_sleeping); + } } /** @@ -336,38 +741,91 @@ fail_point_get(struct fail_point *fp, struct sbuf *sb) static int fail_point_set(struct fail_point *fp, char *buf) { - int error = 0; struct fail_point_entry *ent, *ent_next; - struct fail_point_entries new_entries; + struct fail_point_setting *entries; + bool should_wake_paused; + bool should_truncate; + int error; + + error = 0; + should_wake_paused = false; + should_truncate = false; /* Parse new entries. */ - TAILQ_INIT(&new_entries); - if (!parse_fail_point(&new_entries, buf)) { - clear_entries(&new_entries); + /** + * ref protects our new malloc'd stuff from being garbage collected + * before we link it. 
+ */ + fail_point_setting_get_ref(fp); + entries = fail_point_setting_new(fp); + if (parse_fail_point(entries, buf) == NULL) { + STAILQ_REMOVE(&fp_setting_garbage, entries, + fail_point_setting, fs_garbage_link); + fail_point_setting_destroy(entries); error = EINVAL; goto end; } - FP_LOCK(); - - /* Move new entries in. */ - TAILQ_SWAP(&fp->fp_entries, &new_entries, fail_point_entry, fe_entries); - clear_entries(&new_entries); + /** + * Transfer the entries we are going to keep to a new list. + * Get rid of useless zero probability entries, and entries with hit + * count 0. + * If 'off' is present, and it has no hit count set, then all entries + * after it are discarded since they are unreachable. + */ + TAILQ_FOREACH_SAFE(ent, &entries->fp_entry_queue, fe_entries, ent_next) { + if (ent->fe_prob == 0 || ent->fe_count == 0) { + printf("Discarding entry which cannot execute %s\n", + fail_type_strings[ent->fe_type].name); + TAILQ_REMOVE(&entries->fp_entry_queue, ent, + fe_entries); + fp_free(ent); + continue; + } else if (should_truncate) { + printf("Discarding unreachable entry %s\n", + fail_type_strings[ent->fe_type].name); + TAILQ_REMOVE(&entries->fp_entry_queue, ent, + fe_entries); + fp_free(ent); + continue; + } - /* Get rid of useless zero probability entries. */ - TAILQ_FOREACH_SAFE(ent, &fp->fp_entries, fe_entries, ent_next) { - if (ent->fe_prob == 0) - free_entry(&fp->fp_entries, ent); + if (ent->fe_type == FAIL_POINT_OFF) { + should_wake_paused = true; + if (ent->fe_count == FE_COUNT_UNTRACKED) { + should_truncate = true; + TAILQ_REMOVE(&entries->fp_entry_queue, ent, + fe_entries); + fp_free(ent); + } + } else if (ent->fe_type == FAIL_POINT_PAUSE) { + should_truncate = true; + } else if (ent->fe_type == FAIL_POINT_SLEEP && (fp->fp_flags & + FAIL_POINT_NONSLEEPABLE)) { + /** + * If this fail point is annotated as being in a + * non-sleepable ctx, convert sleep to delay and + * convert the msec argument to usecs. + */ + printf("Sleep call request on fail point in " + "non-sleepable context; using delay instead " + "of sleep\n"); + ent->fe_type = FAIL_POINT_DELAY; + ent->fe_arg *= 1000; + } } - /* Get rid of "off"s at the end. */ - while ((ent = TAILQ_LAST(&fp->fp_entries, fail_point_entries)) && - ent->fe_type == FAIL_POINT_OFF) - free_entry(&fp->fp_entries, ent); - - FP_UNLOCK(); + if (TAILQ_EMPTY(&entries->fp_entry_queue)) { + entries = fail_point_swap_settings(fp, NULL); + if (entries != NULL) + wakeup(FP_PAUSE_CHANNEL(fp)); + } else { + if (should_wake_paused) + wakeup(FP_PAUSE_CHANNEL(fp)); + fail_point_swap_settings(fp, entries); + } - end: +end: #ifdef IWARNING if (error) IWARNING("Failed to set %s %s to %s", @@ -377,6 +835,7 @@ fail_point_set(struct fail_point *fp, char *buf) fp->fp_name, fp->fp_location, buf); #endif /* IWARNING */ + fail_point_setting_release_ref(fp); return (error); } @@ -385,25 +844,33 @@ fail_point_set(struct fail_point *fp, char *buf) /** * Handle kernel failpoint set/get. 
*/ + int fail_point_sysctl(SYSCTL_HANDLER_ARGS) { - struct fail_point *fp = arg1; - char *buf = NULL; + struct fail_point *fp; + char *buf; + struct sbuf *sb_check; struct sbuf sb; int error; - /* Retrieving */ - sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND | SBUF_INCLUDENUL); - fail_point_get(fp, &sb); - sbuf_trim(&sb); - error = sbuf_finish(&sb); - if (error == 0) - error = SYSCTL_OUT(req, sbuf_data(&sb), sbuf_len(&sb)); - sbuf_delete(&sb); + error = 0; + fp = arg1; + buf = NULL; + + sb_check = sbuf_new(&sb, NULL, 1024, SBUF_AUTOEXTEND); + if (sb_check != &sb) + return (ENOMEM); + + sbuf_set_drain(&sb, (sbuf_drain_func *)fail_sysctl_drain_func, req); /* Setting */ - if (!error && req->newptr) { + /** + * Lock protects any new entries from being garbage collected before we + * can link them to the fail point. + */ + sx_xlock(&sx_fp_set); + if (req->newptr) { if (req->newlen > MAX_FAIL_POINT_BUF) { error = EINVAL; goto out; @@ -417,31 +884,95 @@ fail_point_sysctl(SYSCTL_HANDLER_ARGS) buf[req->newlen] = '\0'; error = fail_point_set(fp, buf); - } + } + + fail_point_garbage_collect(); + sx_xunlock(&sx_fp_set); + + /* Retrieving. */ + fail_point_get(fp, &sb, false); out: - fp_free(buf); + sbuf_finish(&sb); + sbuf_delete(&sb); + + if (buf) + fp_free(buf); + return (error); } +int +fail_point_sysctl_status(SYSCTL_HANDLER_ARGS) +{ + struct fail_point *fp; + struct sbuf sb, *sb_check; + + fp = arg1; + + sb_check = sbuf_new(&sb, NULL, 1024, SBUF_AUTOEXTEND); + if (sb_check != &sb) + return (ENOMEM); + + sbuf_set_drain(&sb, (sbuf_drain_func *)fail_sysctl_drain_func, req); + + /* Retrieving. */ + fail_point_get(fp, &sb, true); + + sbuf_finish(&sb); + sbuf_delete(&sb); + + /** + * Lock protects any new entries from being garbage collected before we + * can link them to the fail point. + */ + sx_xlock(&sx_fp_set); + fail_point_garbage_collect(); + sx_xunlock(&sx_fp_set); + + return (0); +} + +int +fail_sysctl_drain_func(void *sysctl_args, const char *buf, int len) +{ + struct sysctl_req *sa; + int error; + + sa = sysctl_args; + + error = SYSCTL_OUT(sa, buf, len); + + if (error == ENOMEM) + return (-1); + else + return (len); +} + + /** * Internal helper function to translate a human-readable failpoint string * into a internally-parsable fail_point structure. */ static char * -parse_fail_point(struct fail_point_entries *ents, char *p) +parse_fail_point(struct fail_point_setting *ents, char *p) { /* <fail_point> :: * <term> ( "->" <term> )* */ + uint8_t term_count; + + term_count = 1; + p = parse_term(ents, p); if (p == NULL) return (NULL); + while (*p != '\0') { - if (p[0] != '-' || p[1] != '>') - return (NULL); - p = parse_term(ents, p + 2); - if (p == NULL) + term_count++; + if (p[0] != '-' || p[1] != '>' || + (p = parse_term(ents, p+2)) == NULL || + term_count > FP_MAX_ENTRY_COUNT) return (NULL); } return (p); @@ -451,14 +982,11 @@ parse_fail_point(struct fail_point_entries *ents, char *p) * Internal helper function to parse an individual term from a failpoint. 
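These handlers drive fail points that kernel code arms with the KFAIL_POINT_* macros (this commit's own sysctl_test_fail_point, near the end of kern_fail.c, uses KFAIL_POINT_RETURN). A minimal consumer sketch with a hypothetical fail point name; the strings in the comment follow the <term> ( "->" <term> )* grammar parsed here, including the delay type added by this change:

#include <sys/param.h>
#include <sys/fail.h>

/*
 * Hypothetical consumer: the fail point appears as
 * debug.fail_point.my_read_error and can be armed with, e.g.:
 *   "5%return(5)"        return 5 with 5% probability
 *   "2*sleep(100)"       sleep 100ms, but only twice
 *   "delay(500)"         busy-wait 500us (usable in non-sleepable context)
 */
static int
my_read(void)
{

        KFAIL_POINT_RETURN(DEBUG_FP, my_read_error);

        /* ... normal read path ... */
        return (0);
}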
*/ static char * -parse_term(struct fail_point_entries *ents, char *p) +parse_term(struct fail_point_setting *ents, char *p) { struct fail_point_entry *ent; - ent = fp_malloc(sizeof *ent, M_WAITOK | M_ZERO); - ent->fe_prob = PROB_MAX; - ent->fe_pid = NO_PID; - TAILQ_INSERT_TAIL(ents, ent, fe_entries); + ent = fail_point_entry_new(ents); /* * <term> :: @@ -483,7 +1011,7 @@ parse_term(struct fail_point_entries *ents, char *p) if (ent->fe_prob > PROB_MAX) ent->fe_prob = PROB_MAX; } else if (*p == '*') { - if (!units || decimal) + if (!units || units < 0 || decimal) return (NULL); ent->fe_count = units; } else @@ -500,7 +1028,7 @@ parse_term(struct fail_point_entries *ents, char *p) /* [ "(" <integer> ")" ] */ if (*p != '(') - return p; + return (p); p++; if (!isdigit(*p) && *p != '-') return (NULL); @@ -509,7 +1037,7 @@ parse_term(struct fail_point_entries *ents, char *p) return (NULL); /* [ "[pid " <integer> "]" ] */ -#define PID_STRING "[pid " +#define PID_STRING "[pid " if (strncmp(p, PID_STRING, sizeof(PID_STRING) - 1) != 0) return (p); p += sizeof(PID_STRING) - 1; @@ -530,7 +1058,7 @@ parse_number(int *out_units, int *out_decimal, char *p) { char *old_p; - /* + /** * <number> :: * <integer> [ "." <integer> ] | * "." <integer> @@ -584,29 +1112,17 @@ parse_type(struct fail_point_entry *ent, char *beg) return (NULL); } -/** - * Internal helper function to free an individual failpoint term. - */ -static void -free_entry(struct fail_point_entries *ents, struct fail_point_entry *ent) -{ - TAILQ_REMOVE(ents, ent, fe_entries); - fp_free(ent); -} +/* The fail point sysctl tree. */ +SYSCTL_NODE(_debug, OID_AUTO, fail_point, CTLFLAG_RW, 0, "fail points"); -/** - * Internal helper function to clear out all failpoint terms for a single - * failpoint. - */ -static void -clear_entries(struct fail_point_entries *ents) +/* Debugging/testing stuff for fail point */ +static int +sysctl_test_fail_point(SYSCTL_HANDLER_ARGS) { - struct fail_point_entry *ent, *ent_next; - TAILQ_FOREACH_SAFE(ent, ents, fe_entries, ent_next) - fp_free(ent); - TAILQ_INIT(ents); + KFAIL_POINT_RETURN(DEBUG_FP, test_fail_point); + return (0); } - -/* The fail point sysctl tree. 
*/ -SYSCTL_NODE(_debug, OID_AUTO, fail_point, CTLFLAG_RW, 0, "fail points"); +SYSCTL_OID(_debug_fail_point, OID_AUTO, test_trigger_fail_point, + CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_test_fail_point, "A", + "Trigger test fail points"); diff --git a/sys/kern/kern_linker.c b/sys/kern/kern_linker.c index 7454d79..e204291 100644 --- a/sys/kern/kern_linker.c +++ b/sys/kern/kern_linker.c @@ -1763,8 +1763,6 @@ linker_hints_lookup(const char *path, int pathlen, const char *modname, goto bad; } hints = malloc(vattr.va_size, M_TEMP, M_WAITOK); - if (hints == NULL) - goto bad; error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)hints, vattr.va_size, 0, UIO_SYSSPACE, IO_NODELOCKED, cred, NOCRED, &reclen, td); if (error) diff --git a/sys/kern/kern_mbuf.c b/sys/kern/kern_mbuf.c index 3074589..5e634d7 100644 --- a/sys/kern/kern_mbuf.c +++ b/sys/kern/kern_mbuf.c @@ -424,6 +424,7 @@ mb_ctor_mbuf(void *mem, int size, void *arg, int how) m = (struct mbuf *)mem; flags = args->flags; + MPASS((flags & M_NOFREE) == 0); error = m_init(m, how, type, flags); @@ -572,6 +573,7 @@ mb_ctor_pack(void *mem, int size, void *arg, int how) args = (struct mb_args *)arg; flags = args->flags; type = args->type; + MPASS((flags & M_NOFREE) == 0); #ifdef INVARIANTS trash_ctor(m->m_ext.ext_buf, MCLBYTES, arg, how); @@ -731,6 +733,7 @@ m_clget(struct mbuf *m, int how) zone_drain(zone_pack); uma_zalloc_arg(zone_clust, m, how); } + MBUF_PROBE2(m__clget, m, how); return (m->m_flags & M_EXT); } @@ -745,6 +748,7 @@ void * m_cljget(struct mbuf *m, int how, int size) { uma_zone_t zone; + void *retval; if (m != NULL) { KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT", @@ -753,7 +757,11 @@ m_cljget(struct mbuf *m, int how, int size) } zone = m_getzone(size); - return (uma_zalloc_arg(zone, m, how)); + retval = uma_zalloc_arg(zone, m, how); + + MBUF_PROBE4(m__cljget, m, how, size, retval); + + return (retval); } /* @@ -934,6 +942,7 @@ void m_freem(struct mbuf *mb) { + MBUF_PROBE1(m__freem, mb); while (mb != NULL) mb = m_free(mb); } diff --git a/sys/kern/kern_osd.c b/sys/kern/kern_osd.c index cc9bed1..41d518f 100644 --- a/sys/kern/kern_osd.c +++ b/sys/kern/kern_osd.c @@ -44,6 +44,23 @@ __FBSDID("$FreeBSD$"); /* OSD (Object Specific Data) */ +/* + * Lock key: + * (m) osd_module_lock + * (o) osd_object_lock + * (l) osd_list_lock + */ +struct osd_master { + struct sx osd_module_lock; + struct rmlock osd_object_lock; + struct mtx osd_list_lock; + LIST_HEAD(, osd) osd_list; /* (l) */ + osd_destructor_t *osd_destructors; /* (o) */ + osd_method_t *osd_methods; /* (m) */ + u_int osd_ntslots; /* (m) */ + const u_int osd_nmethods; +}; + static MALLOC_DEFINE(M_OSD, "osd", "Object Specific Data"); static int osd_debug = 0; @@ -61,25 +78,12 @@ static void do_osd_del(u_int type, struct osd *osd, u_int slot, int list_locked); /* - * Lists of objects with OSD. - * - * Lock key: - * (m) osd_module_lock - * (o) osd_object_lock - * (l) osd_list_lock + * List of objects with OSD. 
*/ -static LIST_HEAD(, osd) osd_list[OSD_LAST + 1]; /* (m) */ -static osd_method_t *osd_methods[OSD_LAST + 1]; /* (m) */ -static u_int osd_nslots[OSD_LAST + 1]; /* (m) */ -static osd_destructor_t *osd_destructors[OSD_LAST + 1]; /* (o) */ -static const u_int osd_nmethods[OSD_LAST + 1] = { - [OSD_JAIL] = PR_MAXMETHOD, +struct osd_master osdm[OSD_LAST + 1] = { + [OSD_JAIL] = { .osd_nmethods = PR_MAXMETHOD }, }; -static struct sx osd_module_lock[OSD_LAST + 1]; -static struct rmlock osd_object_lock[OSD_LAST + 1]; -static struct mtx osd_list_lock[OSD_LAST + 1]; - static void osd_default_destructor(void *value __unused) { @@ -101,12 +105,12 @@ osd_register(u_int type, osd_destructor_t destructor, osd_method_t *methods) if (destructor == NULL) destructor = osd_default_destructor; - sx_xlock(&osd_module_lock[type]); + sx_xlock(&osdm[type].osd_module_lock); /* * First, we try to find unused slot. */ - for (i = 0; i < osd_nslots[type]; i++) { - if (osd_destructors[type][i] == NULL) { + for (i = 0; i < osdm[type].osd_ntslots; i++) { + if (osdm[type].osd_destructors[i] == NULL) { OSD_DEBUG("Unused slot found (type=%u, slot=%u).", type, i); break; @@ -115,31 +119,31 @@ osd_register(u_int type, osd_destructor_t destructor, osd_method_t *methods) /* * If no unused slot was found, allocate one. */ - if (i == osd_nslots[type]) { - osd_nslots[type]++; - if (osd_nmethods[type] != 0) - osd_methods[type] = realloc(osd_methods[type], - sizeof(osd_method_t) * osd_nslots[type] * - osd_nmethods[type], M_OSD, M_WAITOK); - newptr = malloc(sizeof(osd_destructor_t) * osd_nslots[type], - M_OSD, M_WAITOK); - rm_wlock(&osd_object_lock[type]); - bcopy(osd_destructors[type], newptr, + if (i == osdm[type].osd_ntslots) { + osdm[type].osd_ntslots++; + if (osdm[type].osd_nmethods != 0) + osdm[type].osd_methods = realloc(osdm[type].osd_methods, + sizeof(osd_method_t) * osdm[type].osd_ntslots * + osdm[type].osd_nmethods, M_OSD, M_WAITOK); + newptr = malloc(sizeof(osd_destructor_t) * + osdm[type].osd_ntslots, M_OSD, M_WAITOK); + rm_wlock(&osdm[type].osd_object_lock); + bcopy(osdm[type].osd_destructors, newptr, sizeof(osd_destructor_t) * i); - free(osd_destructors[type], M_OSD); - osd_destructors[type] = newptr; - rm_wunlock(&osd_object_lock[type]); + free(osdm[type].osd_destructors, M_OSD); + osdm[type].osd_destructors = newptr; + rm_wunlock(&osdm[type].osd_object_lock); OSD_DEBUG("New slot allocated (type=%u, slot=%u).", type, i + 1); } - osd_destructors[type][i] = destructor; - if (osd_nmethods[type] != 0) { - for (m = 0; m < osd_nmethods[type]; m++) - osd_methods[type][i * osd_nmethods[type] + m] = - methods != NULL ? methods[m] : NULL; + osdm[type].osd_destructors[i] = destructor; + if (osdm[type].osd_nmethods != 0) { + for (m = 0; m < osdm[type].osd_nmethods; m++) + osdm[type].osd_methods[i * osdm[type].osd_nmethods + m] + = methods != NULL ? methods[m] : NULL; } - sx_xunlock(&osd_module_lock[type]); + sx_xunlock(&osdm[type].osd_module_lock); return (i + 1); } @@ -150,105 +154,142 @@ osd_deregister(u_int type, u_int slot) KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type.")); KASSERT(slot > 0, ("Invalid slot.")); - KASSERT(osd_destructors[type][slot - 1] != NULL, ("Unused slot.")); + KASSERT(osdm[type].osd_destructors[slot - 1] != NULL, ("Unused slot.")); - sx_xlock(&osd_module_lock[type]); - rm_wlock(&osd_object_lock[type]); + sx_xlock(&osdm[type].osd_module_lock); + rm_wlock(&osdm[type].osd_object_lock); /* * Free all OSD for the given slot. 
*/ - mtx_lock(&osd_list_lock[type]); - LIST_FOREACH_SAFE(osd, &osd_list[type], osd_next, tosd) + mtx_lock(&osdm[type].osd_list_lock); + LIST_FOREACH_SAFE(osd, &osdm[type].osd_list, osd_next, tosd) do_osd_del(type, osd, slot, 1); - mtx_unlock(&osd_list_lock[type]); + mtx_unlock(&osdm[type].osd_list_lock); /* * Set destructor to NULL to free the slot. */ - osd_destructors[type][slot - 1] = NULL; - if (slot == osd_nslots[type]) { - osd_nslots[type]--; - osd_destructors[type] = realloc(osd_destructors[type], - sizeof(osd_destructor_t) * osd_nslots[type], M_OSD, + osdm[type].osd_destructors[slot - 1] = NULL; + if (slot == osdm[type].osd_ntslots) { + osdm[type].osd_ntslots--; + osdm[type].osd_destructors = realloc(osdm[type].osd_destructors, + sizeof(osd_destructor_t) * osdm[type].osd_ntslots, M_OSD, M_NOWAIT | M_ZERO); - if (osd_nmethods[type] != 0) - osd_methods[type] = realloc(osd_methods[type], - sizeof(osd_method_t) * osd_nslots[type] * - osd_nmethods[type], M_OSD, M_NOWAIT | M_ZERO); + if (osdm[type].osd_nmethods != 0) + osdm[type].osd_methods = realloc(osdm[type].osd_methods, + sizeof(osd_method_t) * osdm[type].osd_ntslots * + osdm[type].osd_nmethods, M_OSD, M_NOWAIT | M_ZERO); /* * We always reallocate to smaller size, so we assume it will * always succeed. */ - KASSERT(osd_destructors[type] != NULL && - (osd_nmethods[type] == 0 || osd_methods[type] != NULL), - ("realloc() failed")); + KASSERT(osdm[type].osd_destructors != NULL && + (osdm[type].osd_nmethods == 0 || + osdm[type].osd_methods != NULL), ("realloc() failed")); OSD_DEBUG("Deregistration of the last slot (type=%u, slot=%u).", type, slot); } else { OSD_DEBUG("Slot deregistration (type=%u, slot=%u).", type, slot); } - rm_wunlock(&osd_object_lock[type]); - sx_xunlock(&osd_module_lock[type]); + rm_wunlock(&osdm[type].osd_object_lock); + sx_xunlock(&osdm[type].osd_module_lock); } int osd_set(u_int type, struct osd *osd, u_int slot, void *value) { + + return (osd_set_reserved(type, osd, slot, NULL, value)); +} + +void * +osd_reserve(u_int slot) +{ + + KASSERT(slot > 0, ("Invalid slot.")); + + OSD_DEBUG("Reserving slot array (slot=%u).", slot); + return (malloc(sizeof(void *) * slot, M_OSD, M_WAITOK | M_ZERO)); +} + +int +osd_set_reserved(u_int type, struct osd *osd, u_int slot, void *rsv, + void *value) +{ struct rm_priotracker tracker; KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type.")); KASSERT(slot > 0, ("Invalid slot.")); - KASSERT(osd_destructors[type][slot - 1] != NULL, ("Unused slot.")); + KASSERT(osdm[type].osd_destructors[slot - 1] != NULL, ("Unused slot.")); - rm_rlock(&osd_object_lock[type], &tracker); + rm_rlock(&osdm[type].osd_object_lock, &tracker); if (slot > osd->osd_nslots) { + void *newptr; + if (value == NULL) { OSD_DEBUG( "Not allocating null slot (type=%u, slot=%u).", type, slot); - rm_runlock(&osd_object_lock[type], &tracker); + rm_runlock(&osdm[type].osd_object_lock, &tracker); + if (rsv) + osd_free_reserved(rsv); return (0); - } else if (osd->osd_nslots == 0) { + } + + /* + * Too few slots allocated here, so we need to extend or create + * the array. + */ + if (rsv) { /* - * First OSD for this object, so we need to allocate - * space and put it onto the list. + * Use the reserve passed in (assumed to be + * the right size). 
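The osd_reserve()/osd_set_reserved() pair added above lets a caller preallocate the slot array while sleeping is still permitted and commit it later where an M_NOWAIT allocation could fail (osd_free_reserved(), added just below, discards an unused reservation). A rough sketch of the intended calling pattern, with hypothetical names:

#include <sys/param.h>
#include <sys/osd.h>

static u_int my_slot;                   /* hypothetical slot */

static void
my_value_dtor(void *value)
{

        /* hypothetical destructor for the stored value */
}

static void
my_osd_setup(void)
{

        my_slot = osd_register(OSD_JAIL, my_value_dtor, NULL);
}

/*
 * Attach 'value' to 'osd' from a context where allocation failure is not
 * acceptable: reserve up front (may sleep), then hand the reservation to
 * osd_set_reserved(), which either uses it or frees it.
 */
static void
my_osd_attach(struct osd *osd, void *value)
{
        void *rsv;

        rsv = osd_reserve(my_slot);
        (void)osd_set_reserved(OSD_JAIL, osd, my_slot, rsv, value);
}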
*/ - osd->osd_slots = malloc(sizeof(void *) * slot, M_OSD, - M_NOWAIT | M_ZERO); - if (osd->osd_slots == NULL) { - rm_runlock(&osd_object_lock[type], &tracker); - return (ENOMEM); + newptr = rsv; + if (osd->osd_nslots != 0) { + memcpy(newptr, osd->osd_slots, + sizeof(void *) * osd->osd_nslots); + free(osd->osd_slots, M_OSD); } - osd->osd_nslots = slot; - mtx_lock(&osd_list_lock[type]); - LIST_INSERT_HEAD(&osd_list[type], osd, osd_next); - mtx_unlock(&osd_list_lock[type]); - OSD_DEBUG("Setting first slot (type=%u).", type); } else { - void *newptr; - - /* - * Too few slots allocated here, needs to extend - * the array. - */ newptr = realloc(osd->osd_slots, sizeof(void *) * slot, M_OSD, M_NOWAIT | M_ZERO); if (newptr == NULL) { - rm_runlock(&osd_object_lock[type], &tracker); + rm_runlock(&osdm[type].osd_object_lock, + &tracker); return (ENOMEM); } - osd->osd_slots = newptr; - osd->osd_nslots = slot; - OSD_DEBUG("Growing slots array (type=%u).", type); } - } + if (osd->osd_nslots == 0) { + /* + * First OSD for this object, so we need to put it + * onto the list. + */ + mtx_lock(&osdm[type].osd_list_lock); + LIST_INSERT_HEAD(&osdm[type].osd_list, osd, osd_next); + mtx_unlock(&osdm[type].osd_list_lock); + OSD_DEBUG("Setting first slot (type=%u).", type); + } else + OSD_DEBUG("Growing slots array (type=%u).", type); + osd->osd_slots = newptr; + osd->osd_nslots = slot; + } else if (rsv) + osd_free_reserved(rsv); OSD_DEBUG("Setting slot value (type=%u, slot=%u, value=%p).", type, slot, value); osd->osd_slots[slot - 1] = value; - rm_runlock(&osd_object_lock[type], &tracker); + rm_runlock(&osdm[type].osd_object_lock, &tracker); return (0); } +void +osd_free_reserved(void *rsv) +{ + + OSD_DEBUG("Discarding reserved slot array."); + free(rsv, M_OSD); +} + void * osd_get(u_int type, struct osd *osd, u_int slot) { @@ -257,9 +298,9 @@ osd_get(u_int type, struct osd *osd, u_int slot) KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type.")); KASSERT(slot > 0, ("Invalid slot.")); - KASSERT(osd_destructors[type][slot - 1] != NULL, ("Unused slot.")); + KASSERT(osdm[type].osd_destructors[slot - 1] != NULL, ("Unused slot.")); - rm_rlock(&osd_object_lock[type], &tracker); + rm_rlock(&osdm[type].osd_object_lock, &tracker); if (slot > osd->osd_nslots) { value = NULL; OSD_DEBUG("Slot doesn't exist (type=%u, slot=%u).", type, slot); @@ -268,7 +309,7 @@ osd_get(u_int type, struct osd *osd, u_int slot) OSD_DEBUG("Returning slot value (type=%u, slot=%u, value=%p).", type, slot, value); } - rm_runlock(&osd_object_lock[type], &tracker); + rm_runlock(&osdm[type].osd_object_lock, &tracker); return (value); } @@ -277,9 +318,9 @@ osd_del(u_int type, struct osd *osd, u_int slot) { struct rm_priotracker tracker; - rm_rlock(&osd_object_lock[type], &tracker); + rm_rlock(&osdm[type].osd_object_lock, &tracker); do_osd_del(type, osd, slot, 0); - rm_runlock(&osd_object_lock[type], &tracker); + rm_runlock(&osdm[type].osd_object_lock, &tracker); } static void @@ -289,7 +330,7 @@ do_osd_del(u_int type, struct osd *osd, u_int slot, int list_locked) KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type.")); KASSERT(slot > 0, ("Invalid slot.")); - KASSERT(osd_destructors[type][slot - 1] != NULL, ("Unused slot.")); + KASSERT(osdm[type].osd_destructors[slot - 1] != NULL, ("Unused slot.")); OSD_DEBUG("Deleting slot (type=%u, slot=%u).", type, slot); @@ -298,7 +339,7 @@ do_osd_del(u_int type, struct osd *osd, u_int slot, int list_locked) return; } if (osd->osd_slots[slot - 1] != NULL) { - osd_destructors[type][slot - 
1](osd->osd_slots[slot - 1]); + osdm[type].osd_destructors[slot - 1](osd->osd_slots[slot - 1]); osd->osd_slots[slot - 1] = NULL; } for (i = osd->osd_nslots - 1; i >= 0; i--) { @@ -312,10 +353,10 @@ do_osd_del(u_int type, struct osd *osd, u_int slot, int list_locked) /* No values left for this object. */ OSD_DEBUG("No more slots left (type=%u).", type); if (!list_locked) - mtx_lock(&osd_list_lock[type]); + mtx_lock(&osdm[type].osd_list_lock); LIST_REMOVE(osd, osd_next); if (!list_locked) - mtx_unlock(&osd_list_lock[type]); + mtx_unlock(&osdm[type].osd_list_lock); free(osd->osd_slots, M_OSD); osd->osd_slots = NULL; osd->osd_nslots = 0; @@ -341,21 +382,21 @@ osd_call(u_int type, u_int method, void *obj, void *data) int error, i; KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type.")); - KASSERT(method < osd_nmethods[type], ("Invalid method.")); + KASSERT(method < osdm[type].osd_nmethods, ("Invalid method.")); /* * Call this method for every slot that defines it, stopping if an * error is encountered. */ error = 0; - sx_slock(&osd_module_lock[type]); - for (i = 0; i < osd_nslots[type]; i++) { - methodfun = - osd_methods[type][i * osd_nmethods[type] + method]; + sx_slock(&osdm[type].osd_module_lock); + for (i = 0; i < osdm[type].osd_ntslots; i++) { + methodfun = osdm[type].osd_methods[i * osdm[type].osd_nmethods + + method]; if (methodfun != NULL && (error = methodfun(obj, data)) != 0) break; } - sx_sunlock(&osd_module_lock[type]); + sx_sunlock(&osdm[type].osd_module_lock); return (error); } @@ -373,14 +414,14 @@ osd_exit(u_int type, struct osd *osd) return; } - rm_rlock(&osd_object_lock[type], &tracker); + rm_rlock(&osdm[type].osd_object_lock, &tracker); for (i = 1; i <= osd->osd_nslots; i++) { - if (osd_destructors[type][i - 1] != NULL) + if (osdm[type].osd_destructors[i - 1] != NULL) do_osd_del(type, osd, i, 0); else OSD_DEBUG("Unused slot (type=%u, slot=%u).", type, i); } - rm_runlock(&osd_object_lock[type], &tracker); + rm_runlock(&osdm[type].osd_object_lock, &tracker); OSD_DEBUG("Object exit (type=%u).", type); } @@ -390,13 +431,13 @@ osd_init(void *arg __unused) u_int i; for (i = OSD_FIRST; i <= OSD_LAST; i++) { - osd_nslots[i] = 0; - LIST_INIT(&osd_list[i]); - sx_init(&osd_module_lock[i], "osd_module"); - rm_init(&osd_object_lock[i], "osd_object"); - mtx_init(&osd_list_lock[i], "osd_list", NULL, MTX_DEF); - osd_destructors[i] = NULL; - osd_methods[i] = NULL; + sx_init(&osdm[i].osd_module_lock, "osd_module"); + rm_init(&osdm[i].osd_object_lock, "osd_object"); + mtx_init(&osdm[i].osd_list_lock, "osd_list", NULL, MTX_DEF); + LIST_INIT(&osdm[i].osd_list); + osdm[i].osd_destructors = NULL; + osdm[i].osd_ntslots = 0; + osdm[i].osd_methods = NULL; } } SYSINIT(osd, SI_SUB_LOCK, SI_ORDER_ANY, osd_init, NULL); diff --git a/sys/kern/kern_racct.c b/sys/kern/kern_racct.c index 1c3c9d7..419c69e 100644 --- a/sys/kern/kern_racct.c +++ b/sys/kern/kern_racct.c @@ -114,6 +114,8 @@ SDT_PROBE_DEFINE3(racct, , rusage, set, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, set__failure, "struct proc *", "int", "uint64_t"); +SDT_PROBE_DEFINE3(racct, , rusage, set__force, + "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, sub, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, sub__cred, @@ -532,7 +534,7 @@ racct_adjust_resource(struct racct *racct, int resource, } static int -racct_add_locked(struct proc *p, int resource, uint64_t amount) +racct_add_locked(struct proc *p, int resource, uint64_t amount, int force) { #ifdef RCTL int 
error; @@ -540,8 +542,6 @@ racct_add_locked(struct proc *p, int resource, uint64_t amount) ASSERT_RACCT_ENABLED(); - SDT_PROBE3(racct, , rusage, add, p, resource, amount); - /* * We need proc lock to dereference p->p_ucred. */ @@ -549,7 +549,7 @@ racct_add_locked(struct proc *p, int resource, uint64_t amount) #ifdef RCTL error = rctl_enforce(p, resource, amount); - if (error && RACCT_IS_DENIABLE(resource)) { + if (error && !force && RACCT_IS_DENIABLE(resource)) { SDT_PROBE3(racct, , rusage, add__failure, p, resource, amount); return (error); } @@ -572,12 +572,32 @@ racct_add(struct proc *p, int resource, uint64_t amount) if (!racct_enable) return (0); + SDT_PROBE3(racct, , rusage, add, p, resource, amount); + mtx_lock(&racct_lock); - error = racct_add_locked(p, resource, amount); + error = racct_add_locked(p, resource, amount, 0); mtx_unlock(&racct_lock); return (error); } +/* + * Increase allocation of 'resource' by 'amount' for process 'p'. + * Doesn't check for limits and never fails. + */ +void +racct_add_force(struct proc *p, int resource, uint64_t amount) +{ + + if (!racct_enable) + return; + + SDT_PROBE3(racct, , rusage, add__force, p, resource, amount); + + mtx_lock(&racct_lock); + racct_add_locked(p, resource, amount, 1); + mtx_unlock(&racct_lock); +} + static void racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount) { @@ -597,8 +617,6 @@ racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount) /* * Increase allocation of 'resource' by 'amount' for credential 'cred'. * Doesn't check for limits and never fails. - * - * XXX: Shouldn't this ever return an error? */ void racct_add_cred(struct ucred *cred, int resource, uint64_t amount) @@ -612,32 +630,8 @@ racct_add_cred(struct ucred *cred, int resource, uint64_t amount) mtx_unlock(&racct_lock); } -/* - * Increase allocation of 'resource' by 'amount' for process 'p'. - * Doesn't check for limits and never fails. - */ -void -racct_add_force(struct proc *p, int resource, uint64_t amount) -{ - - if (!racct_enable) - return; - - SDT_PROBE3(racct, , rusage, add__force, p, resource, amount); - - /* - * We need proc lock to dereference p->p_ucred. - */ - PROC_LOCK_ASSERT(p, MA_OWNED); - - mtx_lock(&racct_lock); - racct_adjust_resource(p->p_racct, resource, amount); - racct_add_cred_locked(p->p_ucred, resource, amount); - mtx_unlock(&racct_lock); -} - static int -racct_set_locked(struct proc *p, int resource, uint64_t amount) +racct_set_locked(struct proc *p, int resource, uint64_t amount, int force) { int64_t old_amount, decayed_amount; int64_t diff_proc, diff_cred; @@ -647,8 +641,6 @@ racct_set_locked(struct proc *p, int resource, uint64_t amount) ASSERT_RACCT_ENABLED(); - SDT_PROBE3(racct, , rusage, set, p, resource, amount); - /* * We need proc lock to dereference p->p_ucred. 
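The kern_racct.c hunks here add and relocate several statically defined tracing probes (SDT_PROBE_DEFINE3 / SDT_PROBE3). For readers unfamiliar with the sdt(9) macros, a self-contained sketch of the same pattern with a hypothetical provider and probe:

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/sdt.h>

SDT_PROVIDER_DEFINE(myprov);
SDT_PROBE_DEFINE2(myprov, , alloc, done, "int", "uint64_t");

static void
my_alloc_done(int resource, uint64_t amount)
{

        /* Has no measurable cost unless a DTrace consumer enables it. */
        SDT_PROBE2(myprov, , alloc, done, resource, amount);
}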
*/ @@ -678,7 +670,7 @@ racct_set_locked(struct proc *p, int resource, uint64_t amount) #ifdef RCTL if (diff_proc > 0) { error = rctl_enforce(p, resource, diff_proc); - if (error && RACCT_IS_DENIABLE(resource)) { + if (error && !force && RACCT_IS_DENIABLE(resource)) { SDT_PROBE3(racct, , rusage, set__failure, p, resource, amount); return (error); @@ -709,51 +701,14 @@ racct_set(struct proc *p, int resource, uint64_t amount) if (!racct_enable) return (0); + SDT_PROBE3(racct, , rusage, set__force, p, resource, amount); + mtx_lock(&racct_lock); - error = racct_set_locked(p, resource, amount); + error = racct_set_locked(p, resource, amount, 0); mtx_unlock(&racct_lock); return (error); } -static void -racct_set_force_locked(struct proc *p, int resource, uint64_t amount) -{ - int64_t old_amount, decayed_amount; - int64_t diff_proc, diff_cred; - - ASSERT_RACCT_ENABLED(); - - SDT_PROBE3(racct, , rusage, set, p, resource, amount); - - /* - * We need proc lock to dereference p->p_ucred. - */ - PROC_LOCK_ASSERT(p, MA_OWNED); - - old_amount = p->p_racct->r_resources[resource]; - /* - * The diffs may be negative. - */ - diff_proc = amount - old_amount; - if (RACCT_IS_DECAYING(resource)) { - /* - * Resources in per-credential racct containers may decay. - * If this is the case, we need to calculate the difference - * between the new amount and the proportional value of the - * old amount that has decayed in the ucred racct containers. - */ - decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE; - diff_cred = amount - decayed_amount; - } else - diff_cred = diff_proc; - - racct_adjust_resource(p->p_racct, resource, diff_proc); - if (diff_cred > 0) - racct_add_cred_locked(p->p_ucred, resource, diff_cred); - else if (diff_cred < 0) - racct_sub_cred_locked(p->p_ucred, resource, -diff_cred); -} - void racct_set_force(struct proc *p, int resource, uint64_t amount) { @@ -761,8 +716,10 @@ racct_set_force(struct proc *p, int resource, uint64_t amount) if (!racct_enable) return; + SDT_PROBE3(racct, , rusage, set, p, resource, amount); + mtx_lock(&racct_lock); - racct_set_force_locked(p, resource, amount); + racct_set_locked(p, resource, amount, 1); mtx_unlock(&racct_lock); } @@ -930,13 +887,13 @@ racct_proc_fork(struct proc *parent, struct proc *child) continue; error = racct_set_locked(child, i, - parent->p_racct->r_resources[i]); + parent->p_racct->r_resources[i], 0); if (error != 0) goto out; } - error = racct_add_locked(child, RACCT_NPROC, 1); - error += racct_add_locked(child, RACCT_NTHR, 1); + error = racct_add_locked(child, RACCT_NPROC, 1, 0); + error += racct_add_locked(child, RACCT_NTHR, 1, 0); out: mtx_unlock(&racct_lock); @@ -1002,7 +959,7 @@ racct_proc_exit(struct proc *p) pct = racct_getpcpu(p, pct_estimate); mtx_lock(&racct_lock); - racct_set_locked(p, RACCT_CPU, runtime); + racct_set_locked(p, RACCT_CPU, runtime, 0); racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct); for (i = 0; i <= RACCT_MAX; i++) { @@ -1010,7 +967,7 @@ racct_proc_exit(struct proc *p) continue; if (!RACCT_IS_RECLAIMABLE(i)) continue; - racct_set_locked(p, i, 0); + racct_set_locked(p, i, 0, 0); } mtx_unlock(&racct_lock); @@ -1150,23 +1107,21 @@ racct_proc_wakeup(struct proc *p) } static void -racct_decay_resource(struct racct *racct, void * res, void* dummy) +racct_decay_callback(struct racct *racct, void *dummy1, void *dummy2) { - int resource; int64_t r_old, r_new; ASSERT_RACCT_ENABLED(); mtx_assert(&racct_lock, MA_OWNED); - resource = *(int *)res; - r_old = racct->r_resources[resource]; + r_old = 
racct->r_resources[RACCT_PCTCPU]; /* If there is nothing to decay, just exit. */ if (r_old <= 0) return; r_new = r_old * RACCT_DECAY_FACTOR / FSCALE; - racct->r_resources[resource] = r_new; + racct->r_resources[RACCT_PCTCPU] = r_new; } static void @@ -1184,17 +1139,17 @@ racct_decay_post(void) } static void -racct_decay(int resource) +racct_decay(void) { ASSERT_RACCT_ENABLED(); - ui_racct_foreach(racct_decay_resource, racct_decay_pre, - racct_decay_post, &resource, NULL); - loginclass_racct_foreach(racct_decay_resource, racct_decay_pre, - racct_decay_post, &resource, NULL); - prison_racct_foreach(racct_decay_resource, racct_decay_pre, - racct_decay_post, &resource, NULL); + ui_racct_foreach(racct_decay_callback, racct_decay_pre, + racct_decay_post, NULL, NULL); + loginclass_racct_foreach(racct_decay_callback, racct_decay_pre, + racct_decay_post, NULL, NULL); + prison_racct_foreach(racct_decay_callback, racct_decay_pre, + racct_decay_post, NULL, NULL); } static void @@ -1209,7 +1164,7 @@ racctd(void) ASSERT_RACCT_ENABLED(); for (;;) { - racct_decay(RACCT_PCTCPU); + racct_decay(); sx_slock(&allproc_lock); @@ -1249,11 +1204,11 @@ racctd(void) pct_estimate = 0; pct = racct_getpcpu(p, pct_estimate); mtx_lock(&racct_lock); - racct_set_force_locked(p, RACCT_PCTCPU, pct); - racct_set_locked(p, RACCT_CPU, runtime); + racct_set_locked(p, RACCT_PCTCPU, pct, 1); + racct_set_locked(p, RACCT_CPU, runtime, 0); racct_set_locked(p, RACCT_WALLCLOCK, (uint64_t)wallclock.tv_sec * 1000000 + - wallclock.tv_usec); + wallclock.tv_usec, 0); mtx_unlock(&racct_lock); PROC_UNLOCK(p); } diff --git a/sys/kern/kern_rctl.c b/sys/kern/kern_rctl.c index b0d3fd6..1970ed8 100644 --- a/sys/kern/kern_rctl.c +++ b/sys/kern/kern_rctl.c @@ -78,10 +78,16 @@ FEATURE(rctl, "Resource Limits"); #define RCTL_PCPU_SHIFT (10 * 1000000) unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE; +static int rctl_log_rate_limit = 10; +static int rctl_devctl_rate_limit = 10; SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW, 0, "Resource Limits"); SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN, &rctl_maxbufsize, 0, "Maximum output buffer size"); +SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, log_rate_limit, CTLFLAG_RW, + &rctl_log_rate_limit, 0, "Maximum number of log messages per second"); +SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RW, + &rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second"); /* * 'rctl_rule_link' connects a rule with every racct it's related to. @@ -219,43 +225,43 @@ rctl_resource_name(int resource) panic("rctl_resource_name: unknown resource %d", resource); } -/* - * Return the amount of resource that can be allocated by 'p' before - * hitting 'rule'. 
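/*
 * Worked example, assumptions flagged: racct_decay_callback() above now
 * decays only RACCT_PCTCPU, scaling it by RACCT_DECAY_FACTOR/FSCALE on each
 * pass of racctd.  The stand-alone program below uses an assumed factor of
 * roughly 0.3 and FSCALE = 2048 purely to show the fixed-point arithmetic;
 * the real constants live in the kernel headers.
 */
#include <stdint.h>
#include <stdio.h>

#define FSCALE          2048                    /* assumed fixed-point scale */
#define DECAY_FACTOR    (3 * FSCALE / 10)       /* assumed ~0.3 */

int
main(void)
{
        int64_t pcpu = 90 * 1000000;            /* 90% in "millions" units */

        for (int i = 0; i < 5; i++) {
                pcpu = pcpu * DECAY_FACTOR / FSCALE;
                printf("pass %d: %jd\n", i, (intmax_t)pcpu);
        }
        return (0);
}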
- */ -static int64_t -rctl_available_resource(const struct proc *p, const struct rctl_rule *rule) +static struct racct * +rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule) { - int resource; - int64_t available = INT64_MAX; struct ucred *cred = p->p_ucred; ASSERT_RACCT_ENABLED(); rw_assert(&rctl_lock, RA_LOCKED); - resource = rule->rr_resource; switch (rule->rr_per) { case RCTL_SUBJECT_TYPE_PROCESS: - available = rule->rr_amount - - p->p_racct->r_resources[resource]; - break; + return (p->p_racct); case RCTL_SUBJECT_TYPE_USER: - available = rule->rr_amount - - cred->cr_ruidinfo->ui_racct->r_resources[resource]; - break; + return (cred->cr_ruidinfo->ui_racct); case RCTL_SUBJECT_TYPE_LOGINCLASS: - available = rule->rr_amount - - cred->cr_loginclass->lc_racct->r_resources[resource]; - break; + return (cred->cr_loginclass->lc_racct); case RCTL_SUBJECT_TYPE_JAIL: - available = rule->rr_amount - - cred->cr_prison->pr_prison_racct->prr_racct-> - r_resources[resource]; - break; + return (cred->cr_prison->pr_prison_racct->prr_racct); default: - panic("rctl_compute_available: unknown per %d", - rule->rr_per); + panic("%s: unknown per %d", __func__, rule->rr_per); } +} + +/* + * Return the amount of resource that can be allocated by 'p' before + * hitting 'rule'. + */ +static int64_t +rctl_available_resource(const struct proc *p, const struct rctl_rule *rule) +{ + int64_t available; + const struct racct *racct; + + ASSERT_RACCT_ENABLED(); + rw_assert(&rctl_lock, RA_LOCKED); + + racct = rctl_proc_rule_to_racct(p, rule); + available = rule->rr_amount - racct->r_resources[rule->rr_resource]; return (available); } @@ -336,13 +342,13 @@ rctl_pcpu_available(const struct proc *p) { int rctl_enforce(struct proc *p, int resource, uint64_t amount) { + static struct timeval log_lasttime, devctl_lasttime; + static int log_curtime = 0, devctl_curtime = 0; struct rctl_rule *rule; struct rctl_rule_link *link; struct sbuf sb; int should_deny = 0; char *buf; - static int curtime = 0; - static struct timeval lasttime; ASSERT_RACCT_ENABLED(); @@ -383,7 +389,8 @@ rctl_enforce(struct proc *p, int resource, uint64_t amount) if (p->p_state != PRS_NORMAL) continue; - if (!ppsratecheck(&lasttime, &curtime, 10)) + if (!ppsratecheck(&log_lasttime, &log_curtime, + rctl_log_rate_limit)) continue; buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT); @@ -409,6 +416,10 @@ rctl_enforce(struct proc *p, int resource, uint64_t amount) if (p->p_state != PRS_NORMAL) continue; + if (!ppsratecheck(&devctl_lasttime, &devctl_curtime, + rctl_devctl_rate_limit)) + continue; + buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT); if (buf == NULL) { printf("rctl_enforce: out of memory\n"); @@ -642,6 +653,9 @@ str2int64(const char *str, int64_t *value) if ((size_t)(end - str) != strlen(str)) return (EINVAL); + if (*value < 0) + return (ERANGE); + return (0); } @@ -1008,8 +1022,13 @@ rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep) error = str2int64(amountstr, &rule->rr_amount); if (error != 0) goto out; - if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) + if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) { + if (rule->rr_amount > INT64_MAX / 1000000) { + error = ERANGE; + goto out; + } rule->rr_amount *= 1000000; + } } if (perstr == NULL || perstr[0] == '\0') diff --git a/sys/kern/kern_sendfile.c b/sys/kern/kern_sendfile.c index a2e1311..7ee7780 100644 --- a/sys/kern/kern_sendfile.c +++ b/sys/kern/kern_sendfile.c @@ -516,7 +516,7 @@ sendfile_getsock(struct thread *td, int s, struct file **sock_fp, int vn_sendfile(struct file 
*fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, - int kflags, struct thread *td) + struct thread *td) { struct file *sock_fp; struct vnode *vp; @@ -534,7 +534,7 @@ vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, so = NULL; m = mh = NULL; sfs = NULL; - sbytes = 0; + hdrlen = sbytes = 0; softerr = 0; error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize); @@ -560,26 +560,6 @@ vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, cv_init(&sfs->cv, "sendfile"); } - /* If headers are specified copy them into mbufs. */ - if (hdr_uio != NULL && hdr_uio->uio_resid > 0) { - hdr_uio->uio_td = td; - hdr_uio->uio_rw = UIO_WRITE; - /* - * In FBSD < 5.0 the nbytes to send also included - * the header. If compat is specified subtract the - * header size from nbytes. - */ - if (kflags & SFK_COMPAT) { - if (nbytes > hdr_uio->uio_resid) - nbytes -= hdr_uio->uio_resid; - else - nbytes = 0; - } - mh = m_uiotombuf(hdr_uio, M_WAITOK, 0, 0, 0); - hdrlen = m_length(mh, &mhtail); - } else - hdrlen = 0; - rem = nbytes ? omin(nbytes, obj_size - offset) : obj_size - offset; /* @@ -668,11 +648,20 @@ retry_space: SOCKBUF_UNLOCK(&so->so_snd); /* - * Reduce space in the socket buffer by the size of - * the header mbuf chain. - * hdrlen is set to 0 after the first loop. + * At the beginning of the first loop check if any headers + * are specified and copy them into mbufs. Reduce space in + * the socket buffer by the size of the header mbuf chain. + * Clear hdr_uio here and hdrlen at the end of the first loop. */ - space -= hdrlen; + if (hdr_uio != NULL && hdr_uio->uio_resid > 0) { + hdr_uio->uio_td = td; + hdr_uio->uio_rw = UIO_WRITE; + hdr_uio->uio_resid = min(hdr_uio->uio_resid, space); + mh = m_uiotombuf(hdr_uio, M_WAITOK, 0, 0, 0); + hdrlen = m_length(mh, &mhtail); + space -= hdrlen; + hdr_uio = NULL; + } if (vp != NULL) { error = vn_lock(vp, LK_SHARED); @@ -944,6 +933,19 @@ sendfile(struct thread *td, struct sendfile_args *uap, int compat) &hdr_uio); if (error != 0) goto out; +#ifdef COMPAT_FREEBSD4 + /* + * In FreeBSD < 5.0 the nbytes to send also included + * the header. If compat is specified subtract the + * header size from nbytes. + */ + if (compat) { + if (uap->nbytes > hdr_uio->uio_resid) + uap->nbytes -= hdr_uio->uio_resid; + else + uap->nbytes = 0; + } +#endif } if (hdtr.trailers != NULL) { error = copyinuio(hdtr.trailers, hdtr.trl_cnt, @@ -965,7 +967,7 @@ sendfile(struct thread *td, struct sendfile_args *uap, int compat) } error = fo_sendfile(fp, uap->s, hdr_uio, trl_uio, uap->offset, - uap->nbytes, &sbytes, uap->flags, compat ? SFK_COMPAT : 0, td); + uap->nbytes, &sbytes, uap->flags, td); fdrop(fp, td); if (uap->sbytes != NULL) diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c index 8c84773..c9932f56 100644 --- a/sys/kern/kern_synch.c +++ b/sys/kern/kern_synch.c @@ -162,15 +162,7 @@ _sleep(void *ident, struct lock_object *lock, int priority, else class = NULL; - if (cold || SCHEDULER_STOPPED()) { - /* - * During autoconfiguration, just return; - * don't run any other threads or panic below, - * in case this is the idle thread and already asleep. - * XXX: this used to do "s = splhigh(); splx(safepri); - * splx(s);" to give interrupts a chance, but there is - * no way to give interrupts a chance now. 
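/*
 * Back-reference to kern_rctl.c above: str2int64() now rejects negative
 * amounts and rctl_string_to_rule() refuses values that would overflow when
 * scaled for RACCT_IS_IN_MILLIONS resources.  A minimal stand-alone sketch of
 * that guard; scale_to_millions() is a hypothetical name.
 */
#include <errno.h>
#include <stdint.h>

static int
scale_to_millions(int64_t *amount)
{
        if (*amount < 0)
                return (ERANGE);        /* mirrors the new str2int64 check */
        if (*amount > INT64_MAX / 1000000)
                return (ERANGE);        /* would overflow when scaled */
        *amount *= 1000000;
        return (0);
}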
- */ + if (SCHEDULER_STOPPED()) { if (lock != NULL && priority & PDROP) class->lc_unlock(lock); return (0); @@ -264,17 +256,8 @@ msleep_spin_sbt(void *ident, struct mtx *mtx, const char *wmesg, KASSERT(p != NULL, ("msleep1")); KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep")); - if (cold || SCHEDULER_STOPPED()) { - /* - * During autoconfiguration, just return; - * don't run any other threads or panic below, - * in case this is the idle thread and already asleep. - * XXX: this used to do "s = splhigh(); splx(safepri); - * splx(s);" to give interrupts a chance, but there is - * no way to give interrupts a chance now. - */ + if (SCHEDULER_STOPPED()) return (0); - } sleepq_lock(ident); CTR5(KTR_PROC, "msleep_spin: thread %ld (pid %ld, %s) on %s (%p)", @@ -438,8 +421,10 @@ mi_switch(int flags, struct thread *newtd) if (flags & SW_VOL) { td->td_ru.ru_nvcsw++; td->td_swvoltick = ticks; - } else + } else { td->td_ru.ru_nivcsw++; + td->td_swinvoltick = ticks; + } #ifdef SCHED_STATS SCHED_STAT_INC(sched_switch_stats[flags & SW_TYPE_MASK]); #endif diff --git a/sys/kern/pic_if.m b/sys/kern/pic_if.m index 2309f0c..3003c35 100644 --- a/sys/kern/pic_if.m +++ b/sys/kern/pic_if.m @@ -1,7 +1,6 @@ #- -# Copyright (c) 2012 Jakub Wojciech Klama <jceel@FreeBSD.org> -# Copyright (c) 2015 Svatopluk Kraus -# Copyright (c) 2015 Michal Meloun +# Copyright (c) 2015-2016 Svatopluk Kraus +# Copyright (c) 2015-2016 Michal Meloun # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -30,88 +29,132 @@ #include <sys/bus.h> #include <sys/cpuset.h> -#include <machine/frame.h> -#include <machine/intr.h> +#include <sys/resource.h> +#include <sys/intr.h> INTERFACE pic; CODE { - static int null_pic_bind(device_t dev, struct intr_irqsrc *isrc) + static int + dflt_pic_bind_intr(device_t dev, struct intr_irqsrc *isrc) { + return (EOPNOTSUPP); } - static void null_pic_disable_intr(device_t dev, struct intr_irqsrc *isrc) + static int + null_pic_alloc_intr(device_t dev, struct intr_irqsrc *isrc, + struct resource *res, struct intr_map_data *data) + { + + return (0); + } + + static int + null_pic_release_intr(device_t dev, struct intr_irqsrc *isrc, + struct resource *res, struct intr_map_data *data) + { + + return (0); + } + + static int + null_pic_setup_intr(device_t dev, struct intr_irqsrc *isrc, + struct resource *res, struct intr_map_data *data) { - return; + + return (0); } - static void null_pic_enable_intr(device_t dev, struct intr_irqsrc *isrc) + static int + null_pic_teardown_intr(device_t dev, struct intr_irqsrc *isrc, + struct resource *res, struct intr_map_data *data) { - return; + + return (0); } - static void null_pic_init_secondary(device_t dev) + static void + null_pic_init_secondary(device_t dev) { - return; } - static void null_pic_ipi_send(device_t dev, cpuset_t cpus, u_int ipi) + static void + null_pic_ipi_send(device_t dev, cpuset_t cpus, u_int ipi) { - return; + } + + static int + dflt_pic_ipi_setup(device_t dev, u_int ipi, struct intr_irqsrc *isrc) + { + + return (EOPNOTSUPP); } }; -METHOD int register { +METHOD int alloc_intr { device_t dev; struct intr_irqsrc *isrc; - boolean_t *is_percpu; -}; + struct resource *res; + struct intr_map_data *data; +} DEFAULT null_pic_alloc_intr; -METHOD int unregister { +METHOD int bind_intr { device_t dev; struct intr_irqsrc *isrc; -}; +} DEFAULT dflt_pic_bind_intr; METHOD void disable_intr { device_t dev; struct intr_irqsrc *isrc; -} DEFAULT null_pic_disable_intr; +}; -METHOD void disable_source { +METHOD void enable_intr { 
device_t dev; struct intr_irqsrc *isrc; }; -METHOD void enable_source { +METHOD int map_intr { device_t dev; - struct intr_irqsrc *isrc; + struct intr_map_data *data; + struct intr_irqsrc **isrcp; }; -METHOD void enable_intr { +METHOD int release_intr { device_t dev; struct intr_irqsrc *isrc; -} DEFAULT null_pic_enable_intr; + struct resource *res; + struct intr_map_data *data; +} DEFAULT null_pic_release_intr; -METHOD void pre_ithread { +METHOD int setup_intr { device_t dev; struct intr_irqsrc *isrc; -}; + struct resource *res; + struct intr_map_data *data; +} DEFAULT null_pic_setup_intr; -METHOD void post_ithread { +METHOD int teardown_intr { device_t dev; struct intr_irqsrc *isrc; -}; + struct resource *res; + struct intr_map_data *data; +} DEFAULT null_pic_teardown_intr; METHOD void post_filter { device_t dev; struct intr_irqsrc *isrc; }; -METHOD int bind { +METHOD void post_ithread { device_t dev; struct intr_irqsrc *isrc; -} DEFAULT null_pic_bind; +}; + +METHOD void pre_ithread { + device_t dev; + struct intr_irqsrc *isrc; +}; METHOD void init_secondary { device_t dev; @@ -121,4 +164,11 @@ METHOD void ipi_send { device_t dev; struct intr_irqsrc *isrc; cpuset_t cpus; + u_int ipi; } DEFAULT null_pic_ipi_send; + +METHOD int ipi_setup { + device_t dev; + u_int ipi; + struct intr_irqsrc **isrcp; +} DEFAULT dflt_pic_ipi_setup; diff --git a/sys/kern/subr_bus.c b/sys/kern/subr_bus.c index 8daa9f2..caf9202 100644 --- a/sys/kern/subr_bus.c +++ b/sys/kern/subr_bus.c @@ -839,6 +839,38 @@ sysctl_devctl_queue(SYSCTL_HANDLER_ARGS) return (0); } +/** + * @brief safely quotes strings that might have double quotes in them. + * + * The devctl protocol relies on quoted strings having matching quotes. + * This routine quotes any internal quotes so the resulting string + * is safe to pass to snprintf to construct, for example pnp info strings. + * Strings are always terminated with a NUL, but may be truncated if longer + * than @p len bytes after quotes. + * + * @param dst Buffer to hold the string. Must be at least @p len bytes long + * @param src Original buffer. + * @param len Length of buffer pointed to by @dst, including trailing NUL + */ +void +devctl_safe_quote(char *dst, const char *src, size_t len) +{ + char *walker = dst, *ep = dst + len - 1; + + if (len == 0) + return; + while (src != NULL && walker < ep) + { + if (*src == '"') { + if (ep - walker < 2) + break; + *walker++ = '\\'; + } + *walker++ = *src++; + } + *walker = '\0'; +} + /* End of /dev/devctl code */ static TAILQ_HEAD(,device) bus_data_devices; diff --git a/sys/kern/subr_counter.c b/sys/kern/subr_counter.c index ea2759c..5149f2d 100644 --- a/sys/kern/subr_counter.c +++ b/sys/kern/subr_counter.c @@ -94,3 +94,28 @@ sysctl_handle_counter_u64(SYSCTL_HANDLER_ARGS) return (0); } + +int +sysctl_handle_counter_u64_array(SYSCTL_HANDLER_ARGS) +{ + uint64_t *out; + int error; + + out = malloc(arg2 * sizeof(uint64_t), M_TEMP, M_WAITOK); + for (int i = 0; i < arg2; i++) + out[i] = counter_u64_fetch(((counter_u64_t *)arg1)[i]); + + error = SYSCTL_OUT(req, out, arg2 * sizeof(uint64_t)); + free(out, M_TEMP); + + if (error || !req->newptr) + return (error); + + /* + * Any write attempt to a counter zeroes it. + */ + for (int i = 0; i < arg2; i++) + counter_u64_zero(((counter_u64_t *)arg1)[i]); + + return (0); +} diff --git a/sys/kern/subr_intr.c b/sys/kern/subr_intr.c index 1c97fd4..96319ad 100644 --- a/sys/kern/subr_intr.c +++ b/sys/kern/subr_intr.c @@ -1,7 +1,6 @@ /*- - * Copyright (c) 2012-2014 Jakub Wojciech Klama <jceel@FreeBSD.org>. 
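/*
 * Runnable restatement of the quoting rule behind devctl_safe_quote() in
 * subr_bus.c above, for illustration only.  An explicit end-of-string test is
 * added so the demo stops at the NUL; everything else follows the kernel
 * routine.
 */
#include <stdio.h>

static void
safe_quote(char *dst, const char *src, size_t len)
{
        char *walker = dst, *ep = dst + len - 1;

        if (len == 0)
                return;
        while (src != NULL && *src != '\0' && walker < ep) {
                if (*src == '"') {
                        if (ep - walker < 2)
                                break;  /* no room for backslash and quote */
                        *walker++ = '\\';
                }
                *walker++ = *src++;
        }
        *walker = '\0';
}

int
main(void)
{
        char quoted[64], pnpinfo[96];

        safe_quote(quoted, "usb \"FooCorp\" keyboard", sizeof(quoted));
        snprintf(pnpinfo, sizeof(pnpinfo), "descr=\"%s\"", quoted);
        printf("%s\n", pnpinfo);        /* embedded quotes arrive escaped */
        return (0);
}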
- * Copyright (c) 2015 Svatopluk Kraus - * Copyright (c) 2015 Michal Meloun + * Copyright (c) 2015-2016 Svatopluk Kraus + * Copyright (c) 2015-2016 Michal Meloun * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -24,8 +23,6 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. - * - * $FreeBSD$ */ #include <sys/cdefs.h> @@ -38,6 +35,7 @@ __FBSDID("$FreeBSD$"); * - to complete things for removable PICs */ +#include "opt_acpi.h" #include "opt_ddb.h" #include "opt_platform.h" @@ -52,6 +50,7 @@ __FBSDID("$FreeBSD$"); #include <sys/interrupt.h> #include <sys/conf.h> #include <sys/cpuset.h> +#include <sys/rman.h> #include <sys/sched.h> #include <sys/smp.h> #include <machine/atomic.h> @@ -112,6 +111,35 @@ u_int irq_next_free; #define IRQ_INVALID nitems(irq_sources) +/* + * XXX - All stuff around struct intr_dev_data is considered as temporary + * until better place for storing struct intr_map_data will be find. + * + * For now, there are two global interrupt numbers spaces: + * <0, NIRQ) ... interrupts without config data + * managed in irq_sources[] + * IRQ_DDATA_BASE + <0, 2 * NIRQ) ... interrupts with config data + * managed in intr_ddata_tab[] + * + * Read intr_ddata_lookup() to see how these spaces are worked with. + * Note that each interrupt number from second space duplicates some number + * from first space at this moment. An interrupt number from first space can + * be duplicated even multiple times in second space. + */ +struct intr_dev_data { + device_t idd_dev; + intptr_t idd_xref; + u_int idd_irq; + struct intr_map_data idd_data; + struct intr_irqsrc * idd_isrc; +}; + +static struct intr_dev_data *intr_ddata_tab[2 * NIRQ]; +static u_int intr_ddata_first_unused; + +#define IRQ_DDATA_BASE 10000 +CTASSERT(IRQ_DDATA_BASE > IRQ_INVALID); + #ifdef SMP static boolean_t irq_assign_cpu = FALSE; #endif @@ -173,12 +201,10 @@ static inline void isrc_increment_count(struct intr_irqsrc *isrc) { - /* - * XXX - It should be atomic for PPI interrupts. It was proven that - * the lost is measurable easily for timer PPI interrupts. - */ - isrc->isrc_count[0]++; - /*atomic_add_long(&isrc->isrc_count[0], 1);*/ + if (isrc->isrc_flags & INTR_ISRCF_PPI) + atomic_add_long(&isrc->isrc_count[0], 1); + else + isrc->isrc_count[0]++; } /* @@ -233,6 +259,16 @@ isrc_setup_counters(struct intr_irqsrc *isrc) isrc_update_name(isrc, NULL); } +/* + * Virtualization for interrupt source interrupt counters release. + */ +static void +isrc_release_counters(struct intr_irqsrc *isrc) +{ + + panic("%s: not implemented", __func__); +} + #ifdef SMP /* * Virtualization for interrupt source IPI counters setup. @@ -279,8 +315,8 @@ intr_irq_handler(struct trapframe *tf) * be called straight from the interrupt controller, when associated interrupt * source is learned. 
*/ -void -intr_irq_dispatch(struct intr_irqsrc *isrc, struct trapframe *tf) +int +intr_isrc_dispatch(struct intr_irqsrc *isrc, struct trapframe *tf) { KASSERT(isrc != NULL, ("%s: no source", __func__)); @@ -293,57 +329,16 @@ intr_irq_dispatch(struct intr_irqsrc *isrc, struct trapframe *tf) error = isrc->isrc_filter(isrc->isrc_arg, tf); PIC_POST_FILTER(isrc->isrc_dev, isrc); if (error == FILTER_HANDLED) - return; - } else + return (0); + } else #endif if (isrc->isrc_event != NULL) { if (intr_event_handle(isrc->isrc_event, tf) == 0) - return; + return (0); } isrc_increment_straycount(isrc); - PIC_DISABLE_SOURCE(isrc->isrc_dev, isrc); - - device_printf(isrc->isrc_dev, "stray irq <%s> disabled", - isrc->isrc_name); -} - -/* - * Allocate interrupt source. - */ -static struct intr_irqsrc * -isrc_alloc(u_int type, u_int extsize) -{ - struct intr_irqsrc *isrc; - - isrc = malloc(sizeof(*isrc) + extsize, M_INTRNG, M_WAITOK | M_ZERO); - isrc->isrc_irq = IRQ_INVALID; /* just to be safe */ - isrc->isrc_type = type; - isrc->isrc_nspc_type = INTR_IRQ_NSPC_NONE; - isrc->isrc_trig = INTR_TRIGGER_CONFORM; - isrc->isrc_pol = INTR_POLARITY_CONFORM; - CPU_ZERO(&isrc->isrc_cpu); - return (isrc); -} - -/* - * Free interrupt source. - */ -static void -isrc_free(struct intr_irqsrc *isrc) -{ - - free(isrc, M_INTRNG); -} - -void -intr_irq_set_name(struct intr_irqsrc *isrc, const char *fmt, ...) -{ - va_list ap; - - va_start(ap, fmt); - vsnprintf(isrc->isrc_name, INTR_ISRC_NAMELEN, fmt, ap); - va_end(ap); + return (EINVAL); } /* @@ -356,8 +351,8 @@ intr_irq_set_name(struct intr_irqsrc *isrc, const char *fmt, ...) * immediately. However, if only one free handle left which is reused * constantly... */ -static int -isrc_alloc_irq_locked(struct intr_irqsrc *isrc) +static inline int +isrc_alloc_irq(struct intr_irqsrc *isrc) { u_int maxirqs, irq; @@ -383,46 +378,35 @@ found: isrc->isrc_irq = irq; irq_sources[irq] = isrc; - intr_irq_set_name(isrc, "irq%u", irq); - isrc_setup_counters(isrc); - irq_next_free = irq + 1; if (irq_next_free >= maxirqs) irq_next_free = 0; return (0); } -#ifdef notyet + /* * Free unique interrupt number (resource handle) from interrupt source. */ -static int +static inline int isrc_free_irq(struct intr_irqsrc *isrc) { - u_int maxirqs; - mtx_assert(&isrc_table_lock, MA_NOTOWNED); + mtx_assert(&isrc_table_lock, MA_OWNED); - maxirqs = nitems(irq_sources); - if (isrc->isrc_irq >= maxirqs) + if (isrc->isrc_irq >= nitems(irq_sources)) return (EINVAL); - - mtx_lock(&isrc_table_lock); - if (irq_sources[isrc->isrc_irq] != isrc) { - mtx_unlock(&isrc_table_lock); + if (irq_sources[isrc->isrc_irq] != isrc) return (EINVAL); - } irq_sources[isrc->isrc_irq] = NULL; isrc->isrc_irq = IRQ_INVALID; /* just to be safe */ - mtx_unlock(&isrc_table_lock); - return (0); } -#endif + /* * Lookup interrupt source by interrupt number (resource handle). */ -static struct intr_irqsrc * +static inline struct intr_irqsrc * isrc_lookup(u_int irq) { @@ -432,158 +416,159 @@ isrc_lookup(u_int irq) } /* - * Lookup interrupt source by namespace description. + * Initialize interrupt source and register it into global interrupt table. */ -static struct intr_irqsrc * -isrc_namespace_lookup(device_t dev, uint16_t type, uint16_t num) +int +intr_isrc_register(struct intr_irqsrc *isrc, device_t dev, u_int flags, + const char *fmt, ...) 
{ - u_int irq; - struct intr_irqsrc *isrc; + int error; + va_list ap; - mtx_assert(&isrc_table_lock, MA_OWNED); + bzero(isrc, sizeof(struct intr_irqsrc)); + isrc->isrc_dev = dev; + isrc->isrc_irq = IRQ_INVALID; /* just to be safe */ + isrc->isrc_flags = flags; + + va_start(ap, fmt); + vsnprintf(isrc->isrc_name, INTR_ISRC_NAMELEN, fmt, ap); + va_end(ap); - for (irq = 0; irq < nitems(irq_sources); irq++) { - isrc = irq_sources[irq]; - if (isrc != NULL && isrc->isrc_dev == dev && - isrc->isrc_nspc_type == type && isrc->isrc_nspc_num == num) - return (isrc); + mtx_lock(&isrc_table_lock); + error = isrc_alloc_irq(isrc); + if (error != 0) { + mtx_unlock(&isrc_table_lock); + return (error); } - return (NULL); + /* + * Setup interrupt counters, but not for IPI sources. Those are setup + * later and only for used ones (up to INTR_IPI_COUNT) to not exhaust + * our counter pool. + */ + if ((isrc->isrc_flags & INTR_ISRCF_IPI) == 0) + isrc_setup_counters(isrc); + mtx_unlock(&isrc_table_lock); + return (0); } /* - * Map interrupt source according to namespace into framework. If such mapping - * does not exist, create it. Return unique interrupt number (resource handle) - * associated with mapped interrupt source. + * Deregister interrupt source from global interrupt table. */ -u_int -intr_namespace_map_irq(device_t dev, uint16_t type, uint16_t num) +int +intr_isrc_deregister(struct intr_irqsrc *isrc) { - struct intr_irqsrc *isrc, *new_isrc; int error; - new_isrc = isrc_alloc(INTR_ISRCT_NAMESPACE, 0); - mtx_lock(&isrc_table_lock); - isrc = isrc_namespace_lookup(dev, type, num); - if (isrc != NULL) { - mtx_unlock(&isrc_table_lock); - isrc_free(new_isrc); - return (isrc->isrc_irq); /* already mapped */ - } + if ((isrc->isrc_flags & INTR_ISRCF_IPI) == 0) + isrc_release_counters(isrc); + error = isrc_free_irq(isrc); + mtx_unlock(&isrc_table_lock); + return (error); +} - error = isrc_alloc_irq_locked(new_isrc); - if (error != 0) { +static struct intr_dev_data * +intr_ddata_alloc(u_int extsize) +{ + struct intr_dev_data *ddata; + + ddata = malloc(sizeof(*ddata) + extsize, M_INTRNG, M_WAITOK | M_ZERO); + + mtx_lock(&isrc_table_lock); + if (intr_ddata_first_unused >= nitems(intr_ddata_tab)) { mtx_unlock(&isrc_table_lock); - isrc_free(new_isrc); - return (IRQ_INVALID); /* no space left */ + free(ddata, M_INTRNG); + return (NULL); } - - new_isrc->isrc_dev = dev; - new_isrc->isrc_nspc_type = type; - new_isrc->isrc_nspc_num = num; + intr_ddata_tab[intr_ddata_first_unused] = ddata; + ddata->idd_irq = IRQ_DDATA_BASE + intr_ddata_first_unused++; mtx_unlock(&isrc_table_lock); - - return (new_isrc->isrc_irq); + return (ddata); } -#ifdef FDT -/* - * Lookup interrupt source by FDT description. 
- */ static struct intr_irqsrc * -isrc_fdt_lookup(intptr_t xref, pcell_t *cells, u_int ncells) +intr_ddata_lookup(u_int irq, struct intr_map_data **datap) { - u_int irq, cellsize; + int error; struct intr_irqsrc *isrc; + struct intr_dev_data *ddata; - mtx_assert(&isrc_table_lock, MA_OWNED); + isrc = isrc_lookup(irq); + if (isrc != NULL) { + if (datap != NULL) + *datap = NULL; + return (isrc); + } - cellsize = ncells * sizeof(*cells); - for (irq = 0; irq < nitems(irq_sources); irq++) { - isrc = irq_sources[irq]; - if (isrc != NULL && isrc->isrc_type == INTR_ISRCT_FDT && - isrc->isrc_xref == xref && isrc->isrc_ncells == ncells && - memcmp(isrc->isrc_cells, cells, cellsize) == 0) - return (isrc); + if (irq < IRQ_DDATA_BASE) + return (NULL); + + irq -= IRQ_DDATA_BASE; + if (irq >= nitems(intr_ddata_tab)) + return (NULL); + + ddata = intr_ddata_tab[irq]; + if (ddata->idd_isrc == NULL) { + error = intr_map_irq(ddata->idd_dev, ddata->idd_xref, + &ddata->idd_data, &irq); + if (error != 0) + return (NULL); + ddata->idd_isrc = isrc_lookup(irq); } - return (NULL); + if (datap != NULL) + *datap = &ddata->idd_data; + return (ddata->idd_isrc); } +#ifdef DEV_ACPI /* - * Map interrupt source according to FDT data into framework. If such mapping + * Map interrupt source according to ACPI info into framework. If such mapping * does not exist, create it. Return unique interrupt number (resource handle) * associated with mapped interrupt source. */ u_int -intr_fdt_map_irq(phandle_t node, pcell_t *cells, u_int ncells) +intr_acpi_map_irq(device_t dev, u_int irq, enum intr_polarity pol, + enum intr_trigger trig) { - struct intr_irqsrc *isrc, *new_isrc; - u_int cellsize; - intptr_t xref; - int error; - - xref = (intptr_t)node; /* It's so simple for now. */ - - cellsize = ncells * sizeof(*cells); - new_isrc = isrc_alloc(INTR_ISRCT_FDT, cellsize); - - mtx_lock(&isrc_table_lock); - isrc = isrc_fdt_lookup(xref, cells, ncells); - if (isrc != NULL) { - mtx_unlock(&isrc_table_lock); - isrc_free(new_isrc); - return (isrc->isrc_irq); /* already mapped */ - } - - error = isrc_alloc_irq_locked(new_isrc); - if (error != 0) { - mtx_unlock(&isrc_table_lock); - isrc_free(new_isrc); - return (IRQ_INVALID); /* no space left */ - } - - new_isrc->isrc_xref = xref; - new_isrc->isrc_ncells = ncells; - memcpy(new_isrc->isrc_cells, cells, cellsize); - mtx_unlock(&isrc_table_lock); - - return (new_isrc->isrc_irq); + struct intr_dev_data *ddata; + + ddata = intr_ddata_alloc(0); + if (ddata == NULL) + return (0xFFFFFFFF); /* no space left */ + + ddata->idd_dev = dev; + ddata->idd_data.type = INTR_MAP_DATA_ACPI; + ddata->idd_data.acpi.irq = irq; + ddata->idd_data.acpi.pol = pol; + ddata->idd_data.acpi.trig = trig; + return (ddata->idd_irq); } #endif - +#ifdef FDT /* - * Register interrupt source into interrupt controller. + * Map interrupt source according to FDT data into framework. If such mapping + * does not exist, create it. Return unique interrupt number (resource handle) + * associated with mapped interrupt source. 
*/ -static int -isrc_register(struct intr_irqsrc *isrc) +u_int +intr_fdt_map_irq(phandle_t node, pcell_t *cells, u_int ncells) { - struct intr_pic *pic; - boolean_t is_percpu; - int error; - - if (isrc->isrc_flags & INTR_ISRCF_REGISTERED) - return (0); - - if (isrc->isrc_dev == NULL) { - pic = pic_lookup(NULL, isrc->isrc_xref); - if (pic == NULL || pic->pic_dev == NULL) - return (ESRCH); - isrc->isrc_dev = pic->pic_dev; - } - - error = PIC_REGISTER(isrc->isrc_dev, isrc, &is_percpu); - if (error != 0) - return (error); + struct intr_dev_data *ddata; + u_int cellsize; - mtx_lock(&isrc_table_lock); - isrc->isrc_flags |= INTR_ISRCF_REGISTERED; - if (is_percpu) - isrc->isrc_flags |= INTR_ISRCF_PERCPU; - isrc_update_name(isrc, NULL); - mtx_unlock(&isrc_table_lock); - return (0); + cellsize = ncells * sizeof(*cells); + ddata = intr_ddata_alloc(cellsize); + if (ddata == NULL) + return (0xFFFFFFFF); /* no space left */ + + ddata->idd_xref = (intptr_t)node; + ddata->idd_data.type = INTR_MAP_DATA_FDT; + ddata->idd_data.fdt.ncells = ncells; + ddata->idd_data.fdt.cells = (pcell_t *)(ddata + 1); + memcpy(ddata->idd_data.fdt.cells, cells, cellsize); + return (ddata->idd_irq); } +#endif #ifdef INTR_SOLO /* @@ -678,7 +663,7 @@ intr_isrc_assign_cpu(void *arg, int cpu) * informed if the call is successfull. */ if (irq_assign_cpu) { - error = PIC_BIND(isrc->isrc_dev, isrc); + error = PIC_BIND_INTR(isrc->isrc_dev, isrc); if (error) { CPU_ZERO(&isrc->isrc_cpu); mtx_unlock(&isrc_table_lock); @@ -774,7 +759,7 @@ isrc_add_handler(struct intr_irqsrc *isrc, const char *name, /* * Lookup interrupt controller locked. */ -static struct intr_pic * +static inline struct intr_pic * pic_lookup_locked(device_t dev, intptr_t xref) { struct intr_pic *pic; @@ -801,7 +786,6 @@ pic_lookup(device_t dev, intptr_t xref) mtx_lock(&pic_list_lock); pic = pic_lookup_locked(dev, xref); mtx_unlock(&pic_list_lock); - return (pic); } @@ -871,7 +855,7 @@ intr_pic_register(device_t dev, intptr_t xref) * Unregister interrupt controller. 
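/*
 * Hedged sketch of the producer side of the new API: an interrupt-controller
 * driver embeds struct intr_irqsrc in its per-pin state and hands each pin to
 * intr_isrc_register(), whose signature appears above.  The 'foopic' names
 * are hypothetical and error handling is reduced to the minimum.
 */
struct foopic_irqsrc {
        struct intr_irqsrc      fi_isrc;        /* must come first */
        u_int                   fi_irq;
};

static int
foopic_register_sources(device_t dev, struct foopic_irqsrc *srcs, u_int npins)
{
        u_int i;
        int error;

        for (i = 0; i < npins; i++) {
                srcs[i].fi_irq = i;
                error = intr_isrc_register(&srcs[i].fi_isrc, dev, 0,
                    "%s,%u", device_get_nameunit(dev), i);
                if (error != 0)
                        return (error);
        }
        return (0);
}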
*/ int -intr_pic_unregister(device_t dev, intptr_t xref) +intr_pic_deregister(device_t dev, intptr_t xref) { panic("%s: not implemented", __func__); @@ -923,12 +907,73 @@ intr_pic_claim_root(device_t dev, intptr_t xref, intr_irq_filter_t *filter, } int -intr_irq_add_handler(device_t dev, driver_filter_t filt, driver_intr_t hand, - void *arg, u_int irq, int flags, void **cookiep) +intr_map_irq(device_t dev, intptr_t xref, struct intr_map_data *data, + u_int *irqp) { - const char *name; + int error; + struct intr_irqsrc *isrc; + struct intr_pic *pic; + + if (data == NULL) + return (EINVAL); + + pic = pic_lookup(dev, xref); + if (pic == NULL || pic->pic_dev == NULL) + return (ESRCH); + + error = PIC_MAP_INTR(pic->pic_dev, data, &isrc); + if (error == 0) + *irqp = isrc->isrc_irq; + return (error); +} + +int +intr_alloc_irq(device_t dev, struct resource *res) +{ + struct intr_map_data *data; + struct intr_irqsrc *isrc; + + KASSERT(rman_get_start(res) == rman_get_end(res), + ("%s: more interrupts in resource", __func__)); + + isrc = intr_ddata_lookup(rman_get_start(res), &data); + if (isrc == NULL) + return (EINVAL); + + return (PIC_ALLOC_INTR(isrc->isrc_dev, isrc, res, data)); +} + +int +intr_release_irq(device_t dev, struct resource *res) +{ + struct intr_map_data *data; struct intr_irqsrc *isrc; + + KASSERT(rman_get_start(res) == rman_get_end(res), + ("%s: more interrupts in resource", __func__)); + + isrc = intr_ddata_lookup(rman_get_start(res), &data); + if (isrc == NULL) + return (EINVAL); + + return (PIC_RELEASE_INTR(isrc->isrc_dev, isrc, res, data)); +} + +int +intr_setup_irq(device_t dev, struct resource *res, driver_filter_t filt, + driver_intr_t hand, void *arg, int flags, void **cookiep) +{ int error; + struct intr_map_data *data; + struct intr_irqsrc *isrc; + const char *name; + + KASSERT(rman_get_start(res) == rman_get_end(res), + ("%s: more interrupts in resource", __func__)); + + isrc = intr_ddata_lookup(rman_get_start(res), &data); + if (isrc == NULL) + return (EINVAL); name = device_get_nameunit(dev); @@ -947,21 +992,7 @@ intr_irq_add_handler(device_t dev, driver_filter_t filt, driver_intr_t hand, debugf("irq %u cannot solo on %s\n", irq, name); return (EINVAL); } -#endif - isrc = isrc_lookup(irq); - if (isrc == NULL) { - debugf("irq %u without source on %s\n", irq, name); - return (EINVAL); - } - - error = isrc_register(isrc); - if (error != 0) { - debugf("irq %u map error %d on %s\n", irq, error, name); - return (error); - } - -#ifdef INTR_SOLO if (flags & INTR_SOLO) { error = iscr_setup_filter(isrc, name, (intr_irq_filter_t *)filt, arg, cookiep); @@ -978,24 +1009,32 @@ intr_irq_add_handler(device_t dev, driver_filter_t filt, driver_intr_t hand, return (error); mtx_lock(&isrc_table_lock); - isrc->isrc_handlers++; - if (isrc->isrc_handlers == 1) { - PIC_ENABLE_INTR(isrc->isrc_dev, isrc); - PIC_ENABLE_SOURCE(isrc->isrc_dev, isrc); + error = PIC_SETUP_INTR(isrc->isrc_dev, isrc, res, data); + if (error == 0) { + isrc->isrc_handlers++; + if (isrc->isrc_handlers == 1) + PIC_ENABLE_INTR(isrc->isrc_dev, isrc); } mtx_unlock(&isrc_table_lock); - return (0); + if (error != 0) + intr_event_remove_handler(*cookiep); + return (error); } int -intr_irq_remove_handler(device_t dev, u_int irq, void *cookie) +intr_teardown_irq(device_t dev, struct resource *res, void *cookie) { - struct intr_irqsrc *isrc; int error; + struct intr_map_data *data; + struct intr_irqsrc *isrc; - isrc = isrc_lookup(irq); + KASSERT(rman_get_start(res) == rman_get_end(res), + ("%s: more interrupts in resource", 
__func__)); + + isrc = intr_ddata_lookup(rman_get_start(res), &data); if (isrc == NULL || isrc->isrc_handlers == 0) return (EINVAL); + #ifdef INTR_SOLO if (isrc->isrc_filter != NULL) { if (isrc != cookie) @@ -1005,8 +1044,8 @@ intr_irq_remove_handler(device_t dev, u_int irq, void *cookie) isrc->isrc_filter = NULL; isrc->isrc_arg = NULL; isrc->isrc_handlers = 0; - PIC_DISABLE_SOURCE(isrc->isrc_dev, isrc); PIC_DISABLE_INTR(isrc->isrc_dev, isrc); + PIC_TEARDOWN_INTR(isrc->isrc_dev, isrc, res, data); isrc_update_name(isrc, NULL); mtx_unlock(&isrc_table_lock); return (0); @@ -1019,10 +1058,9 @@ intr_irq_remove_handler(device_t dev, u_int irq, void *cookie) if (error == 0) { mtx_lock(&isrc_table_lock); isrc->isrc_handlers--; - if (isrc->isrc_handlers == 0) { - PIC_DISABLE_SOURCE(isrc->isrc_dev, isrc); + if (isrc->isrc_handlers == 0) PIC_DISABLE_INTR(isrc->isrc_dev, isrc); - } + PIC_TEARDOWN_INTR(isrc->isrc_dev, isrc, res, data); intrcnt_updatename(isrc); mtx_unlock(&isrc_table_lock); } @@ -1030,36 +1068,16 @@ intr_irq_remove_handler(device_t dev, u_int irq, void *cookie) } int -intr_irq_config(u_int irq, enum intr_trigger trig, enum intr_polarity pol) +intr_describe_irq(device_t dev, struct resource *res, void *cookie, + const char *descr) { + int error; struct intr_irqsrc *isrc; - isrc = isrc_lookup(irq); - if (isrc == NULL) - return (EINVAL); - - if (isrc->isrc_handlers != 0) - return (EBUSY); /* interrrupt is enabled (active) */ + KASSERT(rman_get_start(res) == rman_get_end(res), + ("%s: more interrupts in resource", __func__)); - /* - * Once an interrupt is enabled, we do not change its configuration. - * A controller PIC_ENABLE_INTR() method is called when an interrupt - * is going to be enabled. In this method, a controller should setup - * the interrupt according to saved configuration parameters. - */ - isrc->isrc_trig = trig; - isrc->isrc_pol = pol; - - return (0); -} - -int -intr_irq_describe(u_int irq, void *cookie, const char *descr) -{ - struct intr_irqsrc *isrc; - int error; - - isrc = isrc_lookup(irq); + isrc = intr_ddata_lookup(rman_get_start(res), NULL); if (isrc == NULL || isrc->isrc_handlers == 0) return (EINVAL); #ifdef INTR_SOLO @@ -1084,11 +1102,14 @@ intr_irq_describe(u_int irq, void *cookie, const char *descr) #ifdef SMP int -intr_irq_bind(u_int irq, int cpu) +intr_bind_irq(device_t dev, struct resource *res, int cpu) { struct intr_irqsrc *isrc; - isrc = isrc_lookup(irq); + KASSERT(rman_get_start(res) == rman_get_end(res), + ("%s: more interrupts in resource", __func__)); + + isrc = intr_ddata_lookup(rman_get_start(res), NULL); if (isrc == NULL || isrc->isrc_handlers == 0) return (EINVAL); #ifdef INTR_SOLO @@ -1135,7 +1156,7 @@ intr_irq_shuffle(void *arg __unused) for (i = 0; i < NIRQ; i++) { isrc = irq_sources[i]; if (isrc == NULL || isrc->isrc_handlers == 0 || - isrc->isrc_flags & INTR_ISRCF_PERCPU) + isrc->isrc_flags & INTR_ISRCF_PPI) continue; if (isrc->isrc_event != NULL && @@ -1151,7 +1172,7 @@ intr_irq_shuffle(void *arg __unused) * for bound ISRC. The best thing we can do is to clear * isrc_cpu so inconsistency with ie_cpu will be detectable. 
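/*
 * Hedged sketch of the consumer side: a leaf driver keeps using the standard
 * newbus interrupt calls, and on INTRNG kernels those are expected to funnel
 * into the resource-based intr_alloc_irq()/intr_setup_irq() shown above.  The
 * foo(4) driver and its handler are hypothetical.
 */
static void
foo_intr(void *arg)
{
        /* ithread handler body for the hypothetical foo(4) device */
}

static int
foo_setup_interrupt(device_t dev, struct resource *irq_res, void **cookiep)
{
        /* bus_setup_intr() reaches intr_setup_irq() through the bus layers. */
        return (bus_setup_intr(dev, irq_res, INTR_TYPE_MISC | INTR_MPSAFE,
            NULL, foo_intr, NULL, cookiep));
}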
*/ - if (PIC_BIND(isrc->isrc_dev, isrc) != 0) + if (PIC_BIND_INTR(isrc->isrc_dev, isrc) != 0) CPU_ZERO(&isrc->isrc_cpu); } mtx_unlock(&isrc_table_lock); @@ -1196,6 +1217,7 @@ intr_pic_init_secondary(void) DB_SHOW_COMMAND(irqs, db_show_irqs) { u_int i, irqsum; + u_long num; struct intr_irqsrc *isrc; for (irqsum = 0, i = 0; i < NIRQ; i++) { @@ -1203,11 +1225,11 @@ DB_SHOW_COMMAND(irqs, db_show_irqs) if (isrc == NULL) continue; + num = isrc->isrc_count != NULL ? isrc->isrc_count[0] : 0; db_printf("irq%-3u <%s>: cpu %02lx%s cnt %lu\n", i, isrc->isrc_name, isrc->isrc_cpu.__bits[0], - isrc->isrc_flags & INTR_ISRCF_BOUND ? " (bound)" : "", - isrc->isrc_count[0]); - irqsum += isrc->isrc_count[0]; + isrc->isrc_flags & INTR_ISRCF_BOUND ? " (bound)" : "", num); + irqsum += num; } db_printf("irq total %u\n", irqsum); } diff --git a/sys/kern/subr_rman.c b/sys/kern/subr_rman.c index 41f1f34..9bbec64 100644 --- a/sys/kern/subr_rman.c +++ b/sys/kern/subr_rman.c @@ -159,7 +159,7 @@ rman_manage_region(struct rman *rm, rman_res_t start, rman_res_t end) struct resource_i *r, *s, *t; int rv = 0; - DPRINTF(("rman_manage_region: <%s> request: start %#lx, end %#lx\n", + DPRINTF(("rman_manage_region: <%s> request: start %#jx, end %#jx\n", rm->rm_descr, start, end)); if (start < rm->rm_start || end > rm->rm_end) return EINVAL; @@ -174,7 +174,7 @@ rman_manage_region(struct rman *rm, rman_res_t start, rman_res_t end) /* Skip entries before us. */ TAILQ_FOREACH(s, &rm->rm_list, r_link) { - if (s->r_end == ULONG_MAX) + if (s->r_end == ~0) break; if (s->r_end + 1 >= r->r_start) break; @@ -444,8 +444,8 @@ rman_reserve_resource_bound(struct rman *rm, rman_res_t start, rman_res_t end, rv = NULL; - DPRINTF(("rman_reserve_resource_bound: <%s> request: [%#lx, %#lx], " - "length %#lx, flags %u, device %s\n", rm->rm_descr, start, end, + DPRINTF(("rman_reserve_resource_bound: <%s> request: [%#jx, %#jx], " + "length %#jx, flags %x, device %s\n", rm->rm_descr, start, end, count, flags, dev == NULL ? "<null>" : device_get_nameunit(dev))); KASSERT((flags & RF_FIRSTSHARE) == 0, @@ -454,19 +454,29 @@ rman_reserve_resource_bound(struct rman *rm, rman_res_t start, rman_res_t end, mtx_lock(rm->rm_mtx); + r = TAILQ_FIRST(&rm->rm_list); + if (r == NULL) { + DPRINTF(("NULL list head\n")); + } else { + DPRINTF(("rman_reserve_resource_bound: trying %#jx <%#jx,%#jx>\n", + r->r_end, start, count-1)); + } for (r = TAILQ_FIRST(&rm->rm_list); r && r->r_end < start + count - 1; - r = TAILQ_NEXT(r, r_link)) + r = TAILQ_NEXT(r, r_link)) { ; + DPRINTF(("rman_reserve_resource_bound: tried %#jx <%#jx,%#jx>\n", + r->r_end, start, count-1)); + } if (r == NULL) { DPRINTF(("could not find a region\n")); goto out; } - amask = (1ul << RF_ALIGNMENT(flags)) - 1; - KASSERT(start <= ULONG_MAX - amask, - ("start (%#lx) + amask (%#lx) would wrap around", start, amask)); + amask = (1ull << RF_ALIGNMENT(flags)) - 1; + KASSERT(start <= RM_MAX_END - amask, + ("start (%#jx) + amask (%#jx) would wrap around", start, amask)); /* If bound is 0, bmask will also be 0 */ bmask = ~(bound - 1); @@ -474,18 +484,18 @@ rman_reserve_resource_bound(struct rman *rm, rman_res_t start, rman_res_t end, * First try to find an acceptable totally-unshared region. */ for (s = r; s; s = TAILQ_NEXT(s, r_link)) { - DPRINTF(("considering [%#lx, %#lx]\n", s->r_start, s->r_end)); + DPRINTF(("considering [%#jx, %#jx]\n", s->r_start, s->r_end)); /* * The resource list is sorted, so there is no point in * searching further once r_start is too large. 
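/*
 * Why the rman DPRINTF/db_printf format strings above move from %lx to %jx:
 * with rman_res_t widened beyond u_long on some platforms, only an
 * intmax-sized conversion is width-safe everywhere.  Stand-alone sketch; the
 * typedef here is an assumption made for the demo.
 */
#include <inttypes.h>
#include <stdio.h>

typedef uint64_t rman_res_t;            /* assumed 64-bit resource range type */

int
main(void)
{
        rman_res_t start = 0xfebf0000, end = 0xfebfffff;

        printf("region [%#jx, %#jx]\n", (uintmax_t)start, (uintmax_t)end);
        return (0);
}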
*/ if (s->r_start > end - (count - 1)) { - DPRINTF(("s->r_start (%#lx) + count - 1> end (%#lx)\n", + DPRINTF(("s->r_start (%#jx) + count - 1> end (%#jx)\n", s->r_start, end)); break; } - if (s->r_start > ULONG_MAX - amask) { - DPRINTF(("s->r_start (%#lx) + amask (%#lx) too large\n", + if (s->r_start > RM_MAX_END - amask) { + DPRINTF(("s->r_start (%#jx) + amask (%#jx) too large\n", s->r_start, amask)); break; } @@ -493,7 +503,7 @@ rman_reserve_resource_bound(struct rman *rm, rman_res_t start, rman_res_t end, DPRINTF(("region is allocated\n")); continue; } - rstart = ulmax(s->r_start, start); + rstart = ummax(s->r_start, start); /* * Try to find a region by adjusting to boundary and alignment * until both conditions are satisfied. This is not an optimal @@ -505,16 +515,16 @@ rman_reserve_resource_bound(struct rman *rm, rman_res_t start, rman_res_t end, rstart += bound - (rstart & ~bmask); } while ((rstart & amask) != 0 && rstart < end && rstart < s->r_end); - rend = ulmin(s->r_end, ulmax(rstart + count - 1, end)); + rend = ummin(s->r_end, ummax(rstart + count - 1, end)); if (rstart > rend) { DPRINTF(("adjusted start exceeds end\n")); continue; } - DPRINTF(("truncated region: [%#lx, %#lx]; size %#lx (requested %#lx)\n", + DPRINTF(("truncated region: [%#jx, %#jx]; size %#jx (requested %#jx)\n", rstart, rend, (rend - rstart + 1), count)); if ((rend - rstart + 1) >= count) { - DPRINTF(("candidate region: [%#lx, %#lx], size %#lx\n", + DPRINTF(("candidate region: [%#jx, %#jx], size %#jx\n", rstart, rend, (rend - rstart + 1))); if ((s->r_end - s->r_start + 1) == count) { DPRINTF(("candidate region is entire chunk\n")); @@ -545,7 +555,7 @@ rman_reserve_resource_bound(struct rman *rm, rman_res_t start, rman_res_t end, if (s->r_start < rv->r_start && s->r_end > rv->r_end) { DPRINTF(("splitting region in three parts: " - "[%#lx, %#lx]; [%#lx, %#lx]; [%#lx, %#lx]\n", + "[%#jx, %#jx]; [%#jx, %#jx]; [%#jx, %#jx]\n", s->r_start, rv->r_start - 1, rv->r_start, rv->r_end, rv->r_end + 1, s->r_end)); @@ -1032,8 +1042,8 @@ dump_rman_header(struct rman *rm) if (db_pager_quit) return; - db_printf("rman %p: %s (0x%lx-0x%lx full range)\n", - rm, rm->rm_descr, rm->rm_start, rm->rm_end); + db_printf("rman %p: %s (0x%jx-0x%jx full range)\n", + rm, rm->rm_descr, (rman_res_t)rm->rm_start, (rman_res_t)rm->rm_end); } static void @@ -1051,7 +1061,7 @@ dump_rman(struct rman *rm) devname = "nomatch"; } else devname = NULL; - db_printf(" 0x%lx-0x%lx (RID=%d) ", + db_printf(" 0x%jx-0x%jx (RID=%d) ", r->r_start, r->r_end, r->r_rid); if (devname != NULL) db_printf("(%s)\n", devname); diff --git a/sys/kern/subr_sleepqueue.c b/sys/kern/subr_sleepqueue.c index 12908f6..ef06c48 100644 --- a/sys/kern/subr_sleepqueue.c +++ b/sys/kern/subr_sleepqueue.c @@ -62,6 +62,7 @@ __FBSDID("$FreeBSD$"); #include "opt_sleepqueue_profiling.h" #include "opt_ddb.h" #include "opt_sched.h" +#include "opt_stack.h" #include <sys/param.h> #include <sys/systm.h> @@ -75,6 +76,7 @@ __FBSDID("$FreeBSD$"); #include <sys/sdt.h> #include <sys/signalvar.h> #include <sys/sleepqueue.h> +#include <sys/stack.h> #include <sys/sysctl.h> #include <vm/uma.h> @@ -83,6 +85,7 @@ __FBSDID("$FreeBSD$"); #include <ddb/ddb.h> #endif + /* * Constants for the hash table of sleep queue chains. * SC_TABLESIZE must be a power of two for SC_MASK to work properly. 
@@ -382,6 +385,8 @@ sleepq_set_timeout_sbt(void *wchan, sbintime_t sbt, sbintime_t pr, MPASS(TD_ON_SLEEPQ(td)); MPASS(td->td_sleepqueue == NULL); MPASS(wchan != NULL); + if (cold) + panic("timed sleep before timers are working"); callout_reset_sbt_on(&td->td_slpcallout, sbt, pr, sleepq_timeout, td, PCPU_GET(cpuid), flags | C_DIRECT_EXEC); } @@ -1034,6 +1039,122 @@ sleepq_abort(struct thread *td, int intrval) return (sleepq_resume_thread(sq, td, 0)); } +/* + * Prints the stacks of all threads presently sleeping on wchan/queue to + * the sbuf sb. Sets count_stacks_printed to the number of stacks actually + * printed. Typically, this will equal the number of threads sleeping on the + * queue, but may be less if sb overflowed before all stacks were printed. + */ +#ifdef STACK +int +sleepq_sbuf_print_stacks(struct sbuf *sb, void *wchan, int queue, + int *count_stacks_printed) +{ + struct thread *td, *td_next; + struct sleepqueue *sq; + struct stack **st; + struct sbuf **td_infos; + int i, stack_idx, error, stacks_to_allocate; + bool finished, partial_print; + + error = 0; + finished = false; + partial_print = false; + + KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); + MPASS((queue >= 0) && (queue < NR_SLEEPQS)); + + stacks_to_allocate = 10; + for (i = 0; i < 3 && !finished ; i++) { + /* We cannot malloc while holding the queue's spinlock, so + * we do our mallocs now, and hope it is enough. If it + * isn't, we will free these, drop the lock, malloc more, + * and try again, up to a point. After that point we will + * give up and report ENOMEM. We also cannot write to sb + * during this time since the client may have set the + * SBUF_AUTOEXTEND flag on their sbuf, which could cause a + * malloc as we print to it. So we defer actually printing + * to sb until after we drop the spinlock. + */ + + /* Where we will store the stacks. */ + st = malloc(sizeof(struct stack *) * stacks_to_allocate, + M_TEMP, M_WAITOK); + for (stack_idx = 0; stack_idx < stacks_to_allocate; + stack_idx++) + st[stack_idx] = stack_create(); + + /* Where we will store the td name, tid, etc. */ + td_infos = malloc(sizeof(struct sbuf *) * stacks_to_allocate, + M_TEMP, M_WAITOK); + for (stack_idx = 0; stack_idx < stacks_to_allocate; + stack_idx++) + td_infos[stack_idx] = sbuf_new(NULL, NULL, + MAXCOMLEN + sizeof(struct thread *) * 2 + 40, + SBUF_FIXEDLEN); + + sleepq_lock(wchan); + sq = sleepq_lookup(wchan); + if (sq == NULL) { + /* This sleepq does not exist; exit and return ENOENT. */ + error = ENOENT; + finished = true; + sleepq_release(wchan); + goto loop_end; + } + + stack_idx = 0; + /* Save thread info */ + TAILQ_FOREACH_SAFE(td, &sq->sq_blocked[queue], td_slpq, + td_next) { + if (stack_idx >= stacks_to_allocate) + goto loop_end; + + /* Note the td_lock is equal to the sleepq_lock here. 
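/*
 * Stand-alone model of the allocate-outside-the-lock pattern the comment
 * above describes: guess a size, take the lock, fill only preallocated slots,
 * and if the guess was too small drop the lock, grow the guess tenfold and
 * retry a bounded number of times.  All names below are hypothetical.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdlib.h>

static int sleepers = 37;               /* pretend 37 threads are asleep */

static bool
snapshot_under_lock(void **slots, int n)
{
        /* Stands in for the work done while holding the sleepqueue spinlock;
         * it may fill preallocated slots but must never allocate. */
        return (n >= sleepers);
}

static int
snapshot_with_retry(void)
{
        void **slots;
        bool done;
        int n = 10;

        for (int attempt = 0; attempt < 3; attempt++) {
                slots = calloc(n, sizeof(*slots));
                if (slots == NULL)
                        return (ENOMEM);
                done = snapshot_under_lock(slots, n);
                free(slots);
                if (done)
                        return (0);
                n *= 10;                /* guess was too small; grow it */
        }
        return (ENOMEM);
}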
*/ + stack_save_td(st[stack_idx], td); + + sbuf_printf(td_infos[stack_idx], "%d: %s %p", + td->td_tid, td->td_name, td); + + ++stack_idx; + } + + finished = true; + sleepq_release(wchan); + + /* Print the stacks */ + for (i = 0; i < stack_idx; i++) { + sbuf_finish(td_infos[i]); + sbuf_printf(sb, "--- thread %s: ---\n", sbuf_data(td_infos[i])); + stack_sbuf_print(sb, st[i]); + sbuf_printf(sb, "\n"); + + error = sbuf_error(sb); + if (error == 0) + *count_stacks_printed = stack_idx; + } + +loop_end: + if (!finished) + sleepq_release(wchan); + for (stack_idx = 0; stack_idx < stacks_to_allocate; + stack_idx++) + stack_destroy(st[stack_idx]); + for (stack_idx = 0; stack_idx < stacks_to_allocate; + stack_idx++) + sbuf_delete(td_infos[stack_idx]); + free(st, M_TEMP); + free(td_infos, M_TEMP); + stacks_to_allocate *= 10; + } + + if (!finished && error == 0) + error = ENOMEM; + + return (error); +} +#endif + #ifdef SLEEPQUEUE_PROFILING #define SLEEPQ_PROF_LOCATIONS 1024 #define SLEEPQ_SBUFSIZE 512 diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c index 82349f8..9848136 100644 --- a/sys/kern/subr_smp.c +++ b/sys/kern/subr_smp.c @@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$"); #include <sys/proc.h> #include <sys/bus.h> #include <sys/lock.h> +#include <sys/malloc.h> #include <sys/mutex.h> #include <sys/pcpu.h> #include <sys/sched.h> @@ -51,6 +52,10 @@ __FBSDID("$FreeBSD$"); #include "opt_sched.h" #ifdef SMP +MALLOC_DEFINE(M_TOPO, "toponodes", "SMP topology data"); +#endif + +#ifdef SMP volatile cpuset_t stopped_cpus; volatile cpuset_t started_cpus; volatile cpuset_t suspended_cpus; @@ -556,7 +561,7 @@ smp_rendezvous(void (* setup_func)(void *), smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func, arg); } -static struct cpu_group group[MAXCPU]; +static struct cpu_group group[MAXCPU * MAX_CACHE_LEVELS + 1]; struct cpu_group * smp_topo(void) @@ -616,6 +621,17 @@ smp_topo(void) } struct cpu_group * +smp_topo_alloc(u_int count) +{ + static u_int index; + u_int curr; + + curr = index; + index += count; + return (&group[curr]); +} + +struct cpu_group * smp_topo_none(void) { struct cpu_group *top; @@ -861,3 +877,233 @@ sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS) return (error); } + +#ifdef SMP +void +topo_init_node(struct topo_node *node) +{ + + bzero(node, sizeof(*node)); + TAILQ_INIT(&node->children); +} + +void +topo_init_root(struct topo_node *root) +{ + + topo_init_node(root); + root->type = TOPO_TYPE_SYSTEM; +} + +struct topo_node * +topo_add_node_by_hwid(struct topo_node *parent, int hwid, + topo_node_type type, uintptr_t subtype) +{ + struct topo_node *node; + + TAILQ_FOREACH_REVERSE(node, &parent->children, + topo_children, siblings) { + if (node->hwid == hwid + && node->type == type && node->subtype == subtype) { + return (node); + } + } + + node = malloc(sizeof(*node), M_TOPO, M_WAITOK); + topo_init_node(node); + node->parent = parent; + node->hwid = hwid; + node->type = type; + node->subtype = subtype; + TAILQ_INSERT_TAIL(&parent->children, node, siblings); + parent->nchildren++; + + return (node); +} + +struct topo_node * +topo_find_node_by_hwid(struct topo_node *parent, int hwid, + topo_node_type type, uintptr_t subtype) +{ + + struct topo_node *node; + + TAILQ_FOREACH(node, &parent->children, siblings) { + if (node->hwid == hwid + && node->type == type && node->subtype == subtype) { + return (node); + } + } + + return (NULL); +} + +void +topo_promote_child(struct topo_node *child) +{ + struct topo_node *next; + struct topo_node *node; + struct topo_node *parent; + + parent = 
child->parent; + next = TAILQ_NEXT(child, siblings); + TAILQ_REMOVE(&parent->children, child, siblings); + TAILQ_INSERT_HEAD(&parent->children, child, siblings); + + while (next != NULL) { + node = next; + next = TAILQ_NEXT(node, siblings); + TAILQ_REMOVE(&parent->children, node, siblings); + TAILQ_INSERT_AFTER(&parent->children, child, node, siblings); + child = node; + } +} + +struct topo_node * +topo_next_node(struct topo_node *top, struct topo_node *node) +{ + struct topo_node *next; + + if ((next = TAILQ_FIRST(&node->children)) != NULL) + return (next); + + if ((next = TAILQ_NEXT(node, siblings)) != NULL) + return (next); + + while ((node = node->parent) != top) + if ((next = TAILQ_NEXT(node, siblings)) != NULL) + return (next); + + return (NULL); +} + +struct topo_node * +topo_next_nonchild_node(struct topo_node *top, struct topo_node *node) +{ + struct topo_node *next; + + if ((next = TAILQ_NEXT(node, siblings)) != NULL) + return (next); + + while ((node = node->parent) != top) + if ((next = TAILQ_NEXT(node, siblings)) != NULL) + return (next); + + return (NULL); +} + +void +topo_set_pu_id(struct topo_node *node, cpuid_t id) +{ + + KASSERT(node->type == TOPO_TYPE_PU, + ("topo_set_pu_id: wrong node type: %u", node->type)); + KASSERT(CPU_EMPTY(&node->cpuset) && node->cpu_count == 0, + ("topo_set_pu_id: cpuset already not empty")); + node->id = id; + CPU_SET(id, &node->cpuset); + node->cpu_count = 1; + node->subtype = 1; + + while ((node = node->parent) != NULL) { + if (CPU_ISSET(id, &node->cpuset)) + break; + CPU_SET(id, &node->cpuset); + node->cpu_count++; + } +} + +int +topo_analyze(struct topo_node *topo_root, int all, + int *pkg_count, int *cores_per_pkg, int *thrs_per_core) +{ + struct topo_node *pkg_node; + struct topo_node *core_node; + struct topo_node *pu_node; + int thrs_per_pkg; + int cpp_counter; + int tpc_counter; + int tpp_counter; + + *pkg_count = 0; + *cores_per_pkg = -1; + *thrs_per_core = -1; + thrs_per_pkg = -1; + pkg_node = topo_root; + while (pkg_node != NULL) { + if (pkg_node->type != TOPO_TYPE_PKG) { + pkg_node = topo_next_node(topo_root, pkg_node); + continue; + } + if (!all && CPU_EMPTY(&pkg_node->cpuset)) { + pkg_node = topo_next_nonchild_node(topo_root, pkg_node); + continue; + } + + (*pkg_count)++; + + cpp_counter = 0; + tpp_counter = 0; + core_node = pkg_node; + while (core_node != NULL) { + if (core_node->type == TOPO_TYPE_CORE) { + if (!all && CPU_EMPTY(&core_node->cpuset)) { + core_node = + topo_next_nonchild_node(pkg_node, + core_node); + continue; + } + + cpp_counter++; + + tpc_counter = 0; + pu_node = core_node; + while (pu_node != NULL) { + if (pu_node->type == TOPO_TYPE_PU && + (all || !CPU_EMPTY(&pu_node->cpuset))) + tpc_counter++; + pu_node = topo_next_node(core_node, + pu_node); + } + + if (*thrs_per_core == -1) + *thrs_per_core = tpc_counter; + else if (*thrs_per_core != tpc_counter) + return (0); + + core_node = topo_next_nonchild_node(pkg_node, + core_node); + } else { + /* PU node directly under PKG. 
*/ + if (core_node->type == TOPO_TYPE_PU && + (all || !CPU_EMPTY(&core_node->cpuset))) + tpp_counter++; + core_node = topo_next_node(pkg_node, + core_node); + } + } + + if (*cores_per_pkg == -1) + *cores_per_pkg = cpp_counter; + else if (*cores_per_pkg != cpp_counter) + return (0); + if (thrs_per_pkg == -1) + thrs_per_pkg = tpp_counter; + else if (thrs_per_pkg != tpp_counter) + return (0); + + pkg_node = topo_next_nonchild_node(topo_root, pkg_node); + } + + KASSERT(*pkg_count > 0, + ("bug in topology or analysis")); + if (*cores_per_pkg == 0) { + KASSERT(*thrs_per_core == -1 && thrs_per_pkg > 0, + ("bug in topology or analysis")); + *thrs_per_core = thrs_per_pkg; + } + + return (1); +} +#endif /* SMP */ + diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c index fe79f52..75fb66e 100644 --- a/sys/kern/sys_generic.c +++ b/sys/kern/sys_generic.c @@ -89,12 +89,14 @@ __FBSDID("$FreeBSD$"); #define SYS_IOCTL_SMALL_SIZE 128 /* bytes */ #define SYS_IOCTL_SMALL_ALIGN 8 /* bytes */ -int iosize_max_clamp = 0; +#ifdef __LP64__ +static int iosize_max_clamp = 0; SYSCTL_INT(_debug, OID_AUTO, iosize_max_clamp, CTLFLAG_RW, &iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX"); -int devfs_iosize_max_clamp = 1; +static int devfs_iosize_max_clamp = 1; SYSCTL_INT(_debug, OID_AUTO, devfs_iosize_max_clamp, CTLFLAG_RW, &devfs_iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX for devices"); +#endif /* * Assert that the return value of read(2) and write(2) syscalls fits @@ -159,6 +161,24 @@ struct selfd { static uma_zone_t selfd_zone; static struct mtx_pool *mtxpool_select; +#ifdef __LP64__ +size_t +devfs_iosize_max(void) +{ + + return (devfs_iosize_max_clamp || SV_CURPROC_FLAG(SV_ILP32) ? + INT_MAX : SSIZE_MAX); +} + +size_t +iosize_max(void) +{ + + return (iosize_max_clamp || SV_CURPROC_FLAG(SV_ILP32) ? + INT_MAX : SSIZE_MAX); +} +#endif + #ifndef _SYS_SYSPROTO_H_ struct read_args { int fd; diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c index 44ddd81..3575028 100644 --- a/sys/kern/syscalls.c +++ b/sys/kern/syscalls.c @@ -3,7 +3,7 @@ * * DO NOT EDIT-- this file is automatically generated. 
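/*
 * Hypothetical consumer of the topo_analyze() helper added to subr_smp.c
 * above: it reports a symmetric package/core/thread breakdown, or notes that
 * the topology is asymmetric when the helper returns 0.  'topo_root' stands
 * for a root built with topo_init_root()/topo_add_node_by_hwid(); the printf
 * is illustrative only.
 */
static void
report_topology(struct topo_node *topo_root)
{
        int pkgs, cores_per_pkg, thrs_per_core;

        if (topo_analyze(topo_root, 1, &pkgs, &cores_per_pkg, &thrs_per_core))
                printf("%d package(s) x %d core(s) x %d thread(s)\n",
                    pkgs, cores_per_pkg, thrs_per_core);
        else
                printf("asymmetric topology\n");
}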
* $FreeBSD$ - * created from FreeBSD: head/sys/kern/syscalls.master 296572 2016-03-09 19:05:11Z jhb + * created from FreeBSD: head/sys/kern/syscalls.master 297167 2016-03-21 21:37:33Z jhb */ const char *syscallnames[] = { diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master index f08f5e3..5675425 100644 --- a/sys/kern/syscalls.master +++ b/sys/kern/syscalls.master @@ -554,7 +554,7 @@ 312 AUE_SETRESGID STD { int setresgid(gid_t rgid, gid_t egid, \ gid_t sgid); } 313 AUE_NULL OBSOL signanosleep -314 AUE_NULL STD { int aio_return(struct aiocb *aiocbp); } +314 AUE_NULL STD { ssize_t aio_return(struct aiocb *aiocbp); } 315 AUE_NULL STD { int aio_suspend( \ struct aiocb * const * aiocbp, int nent, \ const struct timespec *timeout); } @@ -643,7 +643,7 @@ 358 AUE_EXTATTR_DELETE_FILE STD { int extattr_delete_file(const char *path, \ int attrnamespace, \ const char *attrname); } -359 AUE_NULL STD { int aio_waitcomplete( \ +359 AUE_NULL STD { ssize_t aio_waitcomplete( \ struct aiocb **aiocbp, \ struct timespec *timeout); } 360 AUE_GETRESUID STD { int getresuid(uid_t *ruid, uid_t *euid, \ diff --git a/sys/kern/systrace_args.c b/sys/kern/systrace_args.c index 6fd03f1..c2ea2b4 100644 --- a/sys/kern/systrace_args.c +++ b/sys/kern/systrace_args.c @@ -9796,7 +9796,7 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) /* aio_return */ case 314: if (ndx == 0 || ndx == 1) - p = "int"; + p = "ssize_t"; break; /* aio_suspend */ case 315: @@ -9972,7 +9972,7 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) /* aio_waitcomplete */ case 359: if (ndx == 0 || ndx == 1) - p = "int"; + p = "ssize_t"; break; /* getresuid */ case 360: diff --git a/sys/kern/uipc_mbuf.c b/sys/kern/uipc_mbuf.c index 23bfbff..62dfda6 100644 --- a/sys/kern/uipc_mbuf.c +++ b/sys/kern/uipc_mbuf.c @@ -47,6 +47,49 @@ __FBSDID("$FreeBSD$"); #include <sys/domain.h> #include <sys/protosw.h> #include <sys/uio.h> +#include <sys/sdt.h> + +SDT_PROBE_DEFINE5_XLATE(sdt, , , m__init, + "struct mbuf *", "mbufinfo_t *", + "uint32_t", "uint32_t", + "uint16_t", "uint16_t", + "uint32_t", "uint32_t", + "uint32_t", "uint32_t"); + +SDT_PROBE_DEFINE3_XLATE(sdt, , , m__gethdr, + "uint32_t", "uint32_t", + "uint16_t", "uint16_t", + "struct mbuf *", "mbufinfo_t *"); + +SDT_PROBE_DEFINE3_XLATE(sdt, , , m__get, + "uint32_t", "uint32_t", + "uint16_t", "uint16_t", + "struct mbuf *", "mbufinfo_t *"); + +SDT_PROBE_DEFINE4_XLATE(sdt, , , m__getcl, + "uint32_t", "uint32_t", + "uint16_t", "uint16_t", + "uint32_t", "uint32_t", + "struct mbuf *", "mbufinfo_t *"); + +SDT_PROBE_DEFINE3_XLATE(sdt, , , m__clget, + "struct mbuf *", "mbufinfo_t *", + "uint32_t", "uint32_t", + "uint32_t", "uint32_t"); + +SDT_PROBE_DEFINE4_XLATE(sdt, , , m__cljget, + "struct mbuf *", "mbufinfo_t *", + "uint32_t", "uint32_t", + "uint32_t", "uint32_t", + "void*", "void*"); + +SDT_PROBE_DEFINE(sdt, , , m__cljset); + +SDT_PROBE_DEFINE1_XLATE(sdt, , , m__free, + "struct mbuf *", "mbufinfo_t *"); + +SDT_PROBE_DEFINE1_XLATE(sdt, , , m__freem, + "struct mbuf *", "mbufinfo_t *"); #include <security/mac/mac_framework.h> @@ -1627,7 +1670,7 @@ m_unshare(struct mbuf *m0, int how) * don't know how to break up the non-contiguous memory when * doing DMA. 
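/*
 * Userland view of the syscalls.master change above: aio_return(2) and
 * aio_waitcomplete(2) now return ssize_t, so completions larger than INT_MAX
 * stay representable.  A small POSIX AIO example; the fixed path and the
 * busy-wait are for brevity only.
 */
#include <aio.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
        static char buf[4096];
        struct aiocb cb;
        ssize_t done;

        memset(&cb, 0, sizeof(cb));
        cb.aio_fildes = open("/tmp/aio-demo", O_WRONLY | O_CREAT | O_TRUNC,
            0644);
        if (cb.aio_fildes < 0)
                return (1);
        cb.aio_buf = buf;
        cb.aio_nbytes = sizeof(buf);

        if (aio_write(&cb) != 0)
                return (1);
        while (aio_error(&cb) == EINPROGRESS)
                ;                       /* poll; a real program would wait */
        done = aio_return(&cb);         /* ssize_t, no longer int */
        printf("wrote %zd bytes\n", done);
        return (0);
}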
diff --git a/sys/kern/uipc_mbuf.c b/sys/kern/uipc_mbuf.c
index 23bfbff..62dfda6 100644
--- a/sys/kern/uipc_mbuf.c
+++ b/sys/kern/uipc_mbuf.c
@@ -47,6 +47,49 @@ __FBSDID("$FreeBSD$");
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/uio.h>
+#include <sys/sdt.h>
+
+SDT_PROBE_DEFINE5_XLATE(sdt, , , m__init,
+    "struct mbuf *", "mbufinfo_t *",
+    "uint32_t", "uint32_t",
+    "uint16_t", "uint16_t",
+    "uint32_t", "uint32_t",
+    "uint32_t", "uint32_t");
+
+SDT_PROBE_DEFINE3_XLATE(sdt, , , m__gethdr,
+    "uint32_t", "uint32_t",
+    "uint16_t", "uint16_t",
+    "struct mbuf *", "mbufinfo_t *");
+
+SDT_PROBE_DEFINE3_XLATE(sdt, , , m__get,
+    "uint32_t", "uint32_t",
+    "uint16_t", "uint16_t",
+    "struct mbuf *", "mbufinfo_t *");
+
+SDT_PROBE_DEFINE4_XLATE(sdt, , , m__getcl,
+    "uint32_t", "uint32_t",
+    "uint16_t", "uint16_t",
+    "uint32_t", "uint32_t",
+    "struct mbuf *", "mbufinfo_t *");
+
+SDT_PROBE_DEFINE3_XLATE(sdt, , , m__clget,
+    "struct mbuf *", "mbufinfo_t *",
+    "uint32_t", "uint32_t",
+    "uint32_t", "uint32_t");
+
+SDT_PROBE_DEFINE4_XLATE(sdt, , , m__cljget,
+    "struct mbuf *", "mbufinfo_t *",
+    "uint32_t", "uint32_t",
+    "uint32_t", "uint32_t",
+    "void*", "void*");
+
+SDT_PROBE_DEFINE(sdt, , , m__cljset);
+
+SDT_PROBE_DEFINE1_XLATE(sdt, , , m__free,
+    "struct mbuf *", "mbufinfo_t *");
+
+SDT_PROBE_DEFINE1_XLATE(sdt, , , m__freem,
+    "struct mbuf *", "mbufinfo_t *");
 
 #include <security/mac/mac_framework.h>
@@ -1627,7 +1670,7 @@ m_unshare(struct mbuf *m0, int how)
 	 * don't know how to break up the non-contiguous memory when
	 * doing DMA.
	 */
-		n = m_getcl(how, m->m_type, m->m_flags);
+		n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS);
 		if (n == NULL) {
 			m_freem(m0);
 			return (NULL);
@@ -1657,7 +1700,7 @@ m_unshare(struct mbuf *m0, int how)
 			break;
 		off += cc;
 
-		n = m_getcl(how, m->m_type, m->m_flags);
+		n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS);
 		if (n == NULL) {
 			m_freem(mfirst);
 			m_freem(m0);
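The uipc_mbuf.c hunk above only defines the translated SDT probes; presumably they are fired from the mbuf allocation and free paths they are named after. The fragment below is an illustrative sketch of how a probe defined with SDT_PROBE_DEFINE3_XLATE() is normally referenced and fired from another compilation unit; example_trace_get() is hypothetical and not part of this diff.

/*
 * Illustrative sketch, not part of the diff.  The call site is made up;
 * only the SDT macros themselves are real.
 */
#include <sys/param.h>
#include <sys/sdt.h>

struct mbuf;

SDT_PROBE_DECLARE(sdt, , , m__get);

static void
example_trace_get(uint32_t how, uint16_t type, struct mbuf *m)
{

	/* Argument count and types must match the probe definition. */
	SDT_PROBE3(sdt, , , m__get, how, type, m);
}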
diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c
index 35d17b1..d873217 100644
--- a/sys/kern/uipc_socket.c
+++ b/sys/kern/uipc_socket.c
@@ -358,7 +358,7 @@ sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
 }
 SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
     &maxsockets, 0, sysctl_maxsockets, "IU",
-    "Maximum number of sockets avaliable");
+    "Maximum number of sockets available");
 
 /*
  * Socket operation routines.  These routines are called by the routines in
diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c
index 27fa239..20df141 100644
--- a/sys/kern/vfs_aio.c
+++ b/sys/kern/vfs_aio.c
@@ -743,7 +743,7 @@ aio_process_rw(struct kaiocb *job)
 	struct file *fp;
 	struct uio auio;
 	struct iovec aiov;
-	int cnt;
+	ssize_t cnt;
 	int error;
 	int oublock_st, oublock_end;
 	int inblock_st, inblock_end;
@@ -1173,7 +1173,7 @@ aio_qphysio(struct proc *p, struct kaiocb *job)
 	struct cdevsw *csw;
 	struct cdev *dev;
 	struct kaioinfo *ki;
-	int error, ref, unmap, poff;
+	int error, ref, poff;
 	vm_prot_t prot;
 
 	cb = &job->uaiocb;
@@ -1206,12 +1206,13 @@ aio_qphysio(struct proc *p, struct kaiocb *job)
 
 	ki = p->p_aioinfo;
 	poff = (vm_offset_t)cb->aio_buf & PAGE_MASK;
-	unmap = ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed);
-	if (unmap) {
+	if ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed) {
 		if (cb->aio_nbytes > MAXPHYS) {
 			error = -1;
 			goto unref;
 		}
+
+		pbuf = NULL;
 	} else {
 		if (cb->aio_nbytes > MAXPHYS - poff) {
 			error = -1;
@@ -1221,17 +1222,14 @@ aio_qphysio(struct proc *p, struct kaiocb *job)
 			error = -1;
 			goto unref;
 		}
-	}
 
-	job->bp = bp = g_alloc_bio();
-	if (!unmap) {
+		job->pbuf = pbuf = (struct buf *)getpbuf(NULL);
 		BUF_KERNPROC(pbuf);
-	}
-
-	AIO_LOCK(ki);
-	if (!unmap)
+		AIO_LOCK(ki);
 		ki->kaio_buffer_count++;
-	AIO_UNLOCK(ki);
+		AIO_UNLOCK(ki);
+	}
+	job->bp = bp = g_alloc_bio();
 
 	bp->bio_length = cb->aio_nbytes;
 	bp->bio_bcount = cb->aio_nbytes;
@@ -1245,17 +1243,18 @@ aio_qphysio(struct proc *p, struct kaiocb *job)
 	prot = VM_PROT_READ;
 	if (cb->aio_lio_opcode == LIO_READ)
 		prot |= VM_PROT_WRITE;	/* Less backwards than it looks */
-	if ((job->npages = vm_fault_quick_hold_pages(
-	    &curproc->p_vmspace->vm_map,
-	    (vm_offset_t)bp->bio_data, bp->bio_length, prot, job->pages,
-	    sizeof(job->pages)/sizeof(job->pages[0]))) < 0) {
+	job->npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
+	    (vm_offset_t)bp->bio_data, bp->bio_length, prot, job->pages,
+	    nitems(job->pages));
+	if (job->npages < 0) {
 		error = EFAULT;
 		goto doerror;
 	}
-	if (!unmap) {
+	if (pbuf != NULL) {
 		pmap_qenter((vm_offset_t)pbuf->b_data, job->pages, job->npages);
 		bp->bio_data = pbuf->b_data + poff;
+		atomic_add_int(&num_buf_aio, 1);
 	} else {
 		bp->bio_ma = job->pages;
 		bp->bio_ma_n = job->npages;
@@ -1264,20 +1263,16 @@ aio_qphysio(struct proc *p, struct kaiocb *job)
 		bp->bio_flags |= BIO_UNMAPPED;
 	}
 
-	if (!unmap)
-		atomic_add_int(&num_buf_aio, 1);
-
 	/* Perform transfer. */
 	csw->d_strategy(bp);
 	dev_relthread(dev, ref);
 	return (0);
 
 doerror:
-	AIO_LOCK(ki);
-	if (!unmap)
+	if (pbuf != NULL) {
+		AIO_LOCK(ki);
 		ki->kaio_buffer_count--;
-	AIO_UNLOCK(ki);
-	if (pbuf) {
+		AIO_UNLOCK(ki);
 		relpbuf(pbuf, NULL);
 		job->pbuf = NULL;
 	}
@@ -1446,8 +1441,7 @@ aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj,
 		return (error);
 	}
 
-	/* XXX: aio_nbytes is later casted to signed types. */
-	if (job->uaiocb.aio_nbytes > INT_MAX) {
+	if (job->uaiocb.aio_nbytes > IOSIZE_MAX) {
 		uma_zfree(aiocb_zone, job);
 		return (EINVAL);
 	}
@@ -1788,7 +1782,7 @@ kern_aio_return(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops)
 	struct proc *p = td->td_proc;
 	struct kaiocb *job;
 	struct kaioinfo *ki;
-	int status, error;
+	long status, error;
 
 	ki = p->p_aioinfo;
 	if (ki == NULL)
@@ -2344,7 +2338,8 @@ kern_aio_waitcomplete(struct thread *td, struct aiocb **ujobp,
 	struct kaioinfo *ki;
 	struct kaiocb *job;
 	struct aiocb *ujob;
-	int error, status, timo;
+	long error, status;
+	int timo;
 
 	ops->store_aiocb(ujobp, NULL);
diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c
index 9be0ece..9eb523c 100644
--- a/sys/kern/vfs_export.c
+++ b/sys/kern/vfs_export.c
@@ -256,10 +256,10 @@ vfs_free_addrlist_af(struct radix_node_head **prnh)
 
 	rnh = *prnh;
 	RADIX_NODE_HEAD_LOCK(rnh);
-	(*rnh->rnh_walktree)(&rnh->rh, vfs_free_netcred, &rnh->rh);
+	(*rnh->rnh_walktree)(&rnh->rh, vfs_free_netcred, rnh);
 	RADIX_NODE_HEAD_UNLOCK(rnh);
 	RADIX_NODE_HEAD_DESTROY(rnh);
-	free(rnh, M_RTABLE);
+	rn_detachhead((void **)prnh);
 	prnh = NULL;
 }
diff --git a/sys/kern/vfs_mountroot.c b/sys/kern/vfs_mountroot.c
index cf24253..a33e376 100644
--- a/sys/kern/vfs_mountroot.c
+++ b/sys/kern/vfs_mountroot.c
@@ -89,6 +89,7 @@ __FBSDID("$FreeBSD$");
 static int parse_mount(char **);
 static struct mntarg *parse_mountroot_options(struct mntarg *, const char *);
 static int sysctl_vfs_root_mount_hold(SYSCTL_HANDLER_ARGS);
+static void vfs_mountroot_wait(void);
 static int vfs_mountroot_wait_if_neccessary(const char *fs, const char *dev);
 
 /*
@@ -488,6 +489,8 @@ parse_dir_ask(char **conf)
 	char *mnt;
 	int error;
 
+	vfs_mountroot_wait();
+
 	printf("\nLoader variables:\n");
 	parse_dir_ask_printenv("vfs.root.mountfrom");
 	parse_dir_ask_printenv("vfs.root.mountfrom.options");
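The vfs_export.c hunk above pairs the radix head's teardown with the routine that created it: a head obtained from rn_inithead() has to be released with rn_detachhead() rather than a bare free(). The fragment below is a generic sketch of that life cycle, not code from this diff; my_rnh, example_init() and example_fini() are hypothetical, and the key offset is caller-specific.

/*
 * Generic sketch of the rn_inithead()/rn_detachhead() pairing; not
 * code from this diff.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <net/radix.h>

static struct radix_node_head *my_rnh;

static int
example_init(int off)
{

	/* rn_inithead() allocates the head; "off" is the key offset in bits. */
	if (rn_inithead((void **)&my_rnh, off) == 0)
		return (ENOMEM);
	return (0);
}

static void
example_fini(void)
{

	/* Release what rn_inithead() allocated. */
	rn_detachhead((void **)&my_rnh);
	my_rnh = NULL;
}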