diff options
author | cem <cem@FreeBSD.org> | 2016-03-16 04:22:32 +0000 |
---|---|---|
committer | cem <cem@FreeBSD.org> | 2016-03-16 04:22:32 +0000 |
commit | 1cab282ecb65bc4e743d463a25435dd44ea8723b (patch) | |
tree | dfa868368b0a045b3c845eb776636c27840d6504 | |
parent | d0976574d3998776fb6647b0d464ce2e9bcc4351 (diff) | |
download | FreeBSD-src-1cab282ecb65bc4e743d463a25435dd44ea8723b.zip FreeBSD-src-1cab282ecb65bc4e743d463a25435dd44ea8723b.tar.gz |
fail(9): Upstreaming some fail point enhancements
This is several years' worth of fail point upgrades done at EMC Isilon. They
are interdependent enough that it makes sense to put a single diff up for them.
Primarily, we added:
- Changing all mainline execution paths to be lockless, which lets us use fail
points in more sleep-sensitive areas, and allows more parallel execution
- A number of additional commands, including 'pause' that lets us do some
interesting deterministic repros of race conditions
- The ability to dump the stacks of all threads sleeping on a fail point
- A number of other API changes to allow marking up the fail point's context in
the code, and firing callbacks before and after execution
- A man page update
Submitted by: Matthew Bryan <matthew.bryan@isilon.com>
Reviewed by: cem (earlier version), jhb, kib, pho
With feedback from: bdrewery
Sponsored by: EMC / Isilon Storage Division
Differential Revision: https://reviews.freebsd.org/D5427
-rw-r--r-- | share/man/man9/fail.9 | 91 | ||||
-rw-r--r-- | sys/kern/kern_fail.c | 776 | ||||
-rw-r--r-- | sys/kern/subr_sleepqueue.c | 116 | ||||
-rw-r--r-- | sys/sys/fail.h | 246 | ||||
-rw-r--r-- | sys/sys/sleepqueue.h | 4 |
5 files changed, 1025 insertions, 208 deletions
diff --git a/share/man/man9/fail.9 b/share/man/man9/fail.9 index 8dbba1b..b5f0882 100644 --- a/share/man/man9/fail.9 +++ b/share/man/man9/fail.9 @@ -26,25 +26,31 @@ .\" .\" $FreeBSD$ .\" -.Dd May 10, 2009 +.Dd February 02, 2016 .Dt FAIL 9 .Os .Sh NAME .Nm KFAIL_POINT_CODE , +.Nm KFAIL_POINT_CODE_FLAGS , +.Nm KFAIL_POINT_CODE_COND , .Nm KFAIL_POINT_RETURN , .Nm KFAIL_POINT_RETURN_VOID , .Nm KFAIL_POINT_ERROR , .Nm KFAIL_POINT_GOTO , +.Nm KFAIL_POINT_SLEEP_CALLBACKS , .Nm fail_point , .Nm DEBUG_FP .Nd fail points .Sh SYNOPSIS .In sys/fail.h .Fn KFAIL_POINT_CODE "parent" "name" "code" +.Fn KFAIL_POINT_CODE_FLAGS "parent" "name" "flags" "code" +.Fn KFAIL_POINT_CODE_COND "parent" "name" "cond" "flags" "code" .Fn KFAIL_POINT_RETURN "parent" "name" .Fn KFAIL_POINT_RETURN_VOID "parent" "name" .Fn KFAIL_POINT_ERROR "parent" "name" "error_var" .Fn KFAIL_POINT_GOTO "parent" "name" "error_var" "label" +.Fn KFAIL_POINT_SLEEP_CALLBACKS "parent" "name" "pre_func" "pre_arg" "post_func" "post_arg" "code" .Sh DESCRIPTION Fail points are used to add code points where errors may be injected in a user controlled fashion. @@ -77,6 +83,42 @@ argument, the evaluation of is derived from the .Fn return value set in the sysctl MIB. +.Pp +Additionally, +.Fn KFAIL_POINT_CODE_FLAGS +provides a +.Fa flags +argument which controls the fail point's behaviour. +This can be used to e.g., mark the fail point's context as non-sleepable, +which causes the +.Sy sleep +action to be coerced to a busy wait. +The supported flags are: +.Bl -ohang -offset indent +.It FAIL_POINT_USE_TIMEOUT_PATH +Rather than sleeping on a +.Fn sleep +call, just fire the post-sleep function after a timeout fires. +.It FAIL_POINT_NONSLEEPABLE +Mark the fail point as being in a non-sleepable context, which coerces +.Fn sleep +calls to +.Fn delay +calls. +.El +.Pp +Likewise, +.Fn KFAIL_POINT_CODE_COND +supplies a +.Fa cond +argument, which allows you to set the condition under which the fail point's +code may fire. 
+This is equivalent to: +.Bd -literal + if (cond) + KFAIL_POINT_CODE_FLAGS(...); + +.Ed See .Sx SYSCTL VARIABLES below. @@ -107,26 +149,12 @@ Many base kernel MIBs can be found in the tree (referenced in code by .Sy DEBUG_FP ) . .Pp -The sysctl variable may be set using the following grammar: +The sysctl variable may be set in a number of ways: .Bd -literal - <fail_point> :: - <term> ( "->" <term> )* - - <term> :: - ( (<float> "%") | (<integer> "*" ) )* - <type> - [ "(" <integer> ")" ] - [ "[pid " <integer> "]" ] - - <float> :: - <integer> [ "." <integer> ] | - "." <integer> - - <type> :: - "off" | "return" | "sleep" | "panic" | "break" | "print" + [<pct>%][<cnt>*]<type>[(args...)][-><more terms>] .Ed .Pp -The <type> argument specifies which action to take: +The <type> argument specifies which action to take; it can be one of: .Bl -tag -width ".Dv return" .It Sy off Take no action (does not trigger fail point code) @@ -140,13 +168,23 @@ Panic Break into the debugger, or trap if there is no debugger support .It Sy print Print that the fail point executed +.It Sy pause +Threads sleep at the fail point until the fail point is set to +.Sy off +.It Sy yield +Thread yields the cpu when the fail point is evaluated +.It Sy delay +Similar to sleep, but busy waits the cpu. +(Useful in non-sleepable contexts.) .El .Pp -The <float>% and <integer>* modifiers prior to <type> control when +The <pct>% and <cnt>* modifiers prior to <type> control when <type> is executed. -The <float>% form (e.g. "1.2%") can be used to specify a +The <pct>% form (e.g. "1.2%") can be used to specify a probability that <type> will execute. -The <integer>* form (e.g. "5*") can be used to specify the number of +This is a decimal in the range (0, 100] which can specify up to +1/10,000% precision. +The <cnt>* form (e.g. "5*") can be used to specify the number of times <type> should be executed before this <term> is disabled. 
Only the last probability and the last count are used if multiple are specified, i.e. "1.2%2%" is the same as "2%". @@ -191,6 +229,10 @@ Return 5 once, when pid 1234 executes the fail point. .Sh AUTHORS .An -nosplit This manual page was written by +.Pp +.An Matthew Bryan Aq Mt matthew.bryan@isilon.com +and +.Pp .An Zach Loafman Aq Mt zml@FreeBSD.org . .Sh CAVEATS It is easy to shoot yourself in the foot by setting fail points too @@ -206,3 +248,10 @@ Currently, .Fn fail_point_eval does not verify whether the context is appropriate for calling .Fn msleep . +You can force it to evaluate a +.Sy sleep +action as a +.Sy delay +action by specifying the +.Sy FAIL_POINT_NONSLEEPABLE +flag at the point the fail point is declared. diff --git a/sys/kern/kern_fail.c b/sys/kern/kern_fail.c index 3737aa3..b0a166d 100644 --- a/sys/kern/kern_fail.c +++ b/sys/kern/kern_fail.c @@ -57,12 +57,18 @@ __FBSDID("$FreeBSD$"); #include <sys/fail.h> #include <sys/kernel.h> #include <sys/libkern.h> +#include <sys/limits.h> #include <sys/lock.h> #include <sys/malloc.h> #include <sys/mutex.h> #include <sys/proc.h> #include <sys/sbuf.h> +#include <sys/sleepqueue.h> +#include <sys/sx.h> +#include <sys/sysctl.h> +#include <sys/types.h> +#include <machine/atomic.h> #include <machine/stdarg.h> #ifdef ILOG_DEFINE_FOR_FILE @@ -72,11 +78,45 @@ ILOG_DEFINE_FOR_FILE(L_ISI_FAIL_POINT, L_ILOG, fail_point); static MALLOC_DEFINE(M_FAIL_POINT, "Fail Points", "fail points system"); #define fp_free(ptr) free(ptr, M_FAIL_POINT) #define fp_malloc(size, flags) malloc((size), M_FAIL_POINT, (flags)) +#define fs_free(ptr) fp_free(ptr) +#define fs_malloc() fp_malloc(sizeof(struct fail_point_setting), \ + M_WAITOK | M_ZERO) + + /** + * These define the wchans that are used for sleeping, pausing respectively. + * They are chosen arbitrarily but need to be distinct to the failpoint and + * the sleep/pause distinction. 
+ */ +#define FP_SLEEP_CHANNEL(fp) (void*)(fp) +#define FP_PAUSE_CHANNEL(fp) __DEVOLATILE(void*, &fp->fp_setting) -static struct mtx g_fp_mtx; -MTX_SYSINIT(g_fp_mtx, &g_fp_mtx, "fail point mtx", MTX_DEF); -#define FP_LOCK() mtx_lock(&g_fp_mtx) -#define FP_UNLOCK() mtx_unlock(&g_fp_mtx) +/** + * Don't allow more than this many entries in a fail point set by sysctl. + * The 99.99...% case is to have 1 entry. I can't imagine having this many + * entries, so it should not limit us. Saves on re-mallocs while holding + * a non-sleepable lock. + */ +#define FP_MAX_ENTRY_COUNT 20 + +/* Used to drain sbufs to the sysctl output */ +int fail_sysctl_drain_func(void *, const char *, int); + +/* Head of tailq of struct fail_point_entry */ +TAILQ_HEAD(fail_point_entry_queue, fail_point_entry); + +/** + * fp entries garbage list; outstanding entries are cleaned up in the + * garbage collector + */ +STAILQ_HEAD(fail_point_setting_garbage, fail_point_setting); +static struct fail_point_setting_garbage fp_setting_garbage = + STAILQ_HEAD_INITIALIZER(fp_setting_garbage); +static struct mtx mtx_garbage_list; +MTX_SYSINIT(mtx_garbage_list, &mtx_garbage_list, "fail point garbage mtx", + MTX_SPIN); + +static struct sx sx_fp_set; +SX_SYSINIT(sx_fp_set, &sx_fp_set, "fail point set sx"); /** * Failpoint types. 
@@ -90,7 +130,11 @@ enum fail_point_t { FAIL_POINT_BREAK, /**< break into the debugger */ FAIL_POINT_PRINT, /**< print a message */ FAIL_POINT_SLEEP, /**< sleep for some msecs */ - FAIL_POINT_NUMTYPES + FAIL_POINT_PAUSE, /**< sleep until failpoint is set to off */ + FAIL_POINT_YIELD, /**< yield the cpu */ + FAIL_POINT_DELAY, /**< busy wait the cpu */ + FAIL_POINT_NUMTYPES, + FAIL_POINT_INVALID = -1 }; static struct { @@ -104,53 +148,307 @@ static struct { [FAIL_POINT_BREAK] = FP_TYPE_NM_LEN("break"), [FAIL_POINT_PRINT] = FP_TYPE_NM_LEN("print"), [FAIL_POINT_SLEEP] = FP_TYPE_NM_LEN("sleep"), + [FAIL_POINT_PAUSE] = FP_TYPE_NM_LEN("pause"), + [FAIL_POINT_YIELD] = FP_TYPE_NM_LEN("yield"), + [FAIL_POINT_DELAY] = FP_TYPE_NM_LEN("delay"), }; +#define FE_COUNT_UNTRACKED (INT_MIN) + /** * Internal structure tracking a single term of a complete failpoint. * @ingroup failpoint_private */ struct fail_point_entry { - enum fail_point_t fe_type; /**< type of entry */ + volatile bool fe_stale; + enum fail_point_t fe_type; /**< type of entry */ int fe_arg; /**< argument to type (e.g. return value) */ int fe_prob; /**< likelihood of firing in millionths */ - int fe_count; /**< number of times to fire, 0 means always */ + int fe_count; /**< number of times to fire, -1 means infinite */ pid_t fe_pid; /**< only fail for this process */ - TAILQ_ENTRY(fail_point_entry) fe_entries; /**< next entry in fail point */ + struct fail_point *fe_parent; /**< backpointer to fp */ + TAILQ_ENTRY(fail_point_entry) fe_entries; /**< next entry ptr */ +}; + +struct fail_point_setting { + STAILQ_ENTRY(fail_point_setting) fs_garbage_link; + struct fail_point_entry_queue fp_entry_queue; + struct fail_point * fs_parent; + struct mtx feq_mtx; /* Gives fail_point_pause something to do. 
*/ }; +/** + * Defines stating the equivalent of probablilty one (100%) + */ +enum { + PROB_MAX = 1000000, /* probability between zero and this number */ + PROB_DIGITS = 6 /* number of zero's in above number */ +}; + +/* Get a ref on an fp's fp_setting */ +static inline struct fail_point_setting *fail_point_setting_get_ref( + struct fail_point *fp); +/* Release a ref on an fp_setting */ +static inline void fail_point_setting_release_ref(struct fail_point *fp); +/* Allocate and initialize a struct fail_point_setting */ +static struct fail_point_setting *fail_point_setting_new(struct + fail_point *); +/* Free a struct fail_point_setting */ +static void fail_point_setting_destroy(struct fail_point_setting *fp_setting); +/* Allocate and initialize a struct fail_point_entry */ +static struct fail_point_entry *fail_point_entry_new(struct + fail_point_setting *); +/* Free a struct fail_point_entry */ +static void fail_point_entry_destroy(struct fail_point_entry *fp_entry); +/* Append fp setting to garbage list */ +static inline void fail_point_setting_garbage_append( + struct fail_point_setting *fp_setting); +/* Swap fp's setting with fp_setting_new */ +static inline struct fail_point_setting * + fail_point_swap_settings(struct fail_point *fp, + struct fail_point_setting *fp_setting_new); +/* Free up any zero-ref setting in the garbage queue */ +static void fail_point_garbage_collect(void); +/* If this fail point's setting are empty, then swap it out to NULL. 
*/ +static inline void fail_point_eval_swap_out(struct fail_point *fp, + struct fail_point_setting *fp_setting); + +bool +fail_point_is_off(struct fail_point *fp) +{ + bool return_val; + struct fail_point_setting *fp_setting; + struct fail_point_entry *ent; + + return_val = true; + + fp_setting = fail_point_setting_get_ref(fp); + if (fp_setting != NULL) { + TAILQ_FOREACH(ent, &fp_setting->fp_entry_queue, + fe_entries) { + if (!ent->fe_stale) { + return_val = false; + break; + } + } + } + fail_point_setting_release_ref(fp); + + return (return_val); +} + +/* Allocate and initialize a struct fail_point_setting */ +static struct fail_point_setting * +fail_point_setting_new(struct fail_point *fp) +{ + struct fail_point_setting *fs_new; + + fs_new = fs_malloc(); + fs_new->fs_parent = fp; + TAILQ_INIT(&fs_new->fp_entry_queue); + mtx_init(&fs_new->feq_mtx, "fail point entries", NULL, MTX_SPIN); + + fail_point_setting_garbage_append(fs_new); + + return (fs_new); +} + +/* Free a struct fail_point_setting */ +static void +fail_point_setting_destroy(struct fail_point_setting *fp_setting) +{ + struct fail_point_entry *ent; + + while (!TAILQ_EMPTY(&fp_setting->fp_entry_queue)) { + ent = TAILQ_FIRST(&fp_setting->fp_entry_queue); + TAILQ_REMOVE(&fp_setting->fp_entry_queue, ent, fe_entries); + fail_point_entry_destroy(ent); + } + + fs_free(fp_setting); +} + +/* Allocate and initialize a struct fail_point_entry */ +static struct fail_point_entry * +fail_point_entry_new(struct fail_point_setting *fp_setting) +{ + struct fail_point_entry *fp_entry; + + fp_entry = fp_malloc(sizeof(struct fail_point_entry), + M_WAITOK | M_ZERO); + fp_entry->fe_parent = fp_setting->fs_parent; + fp_entry->fe_prob = PROB_MAX; + fp_entry->fe_pid = NO_PID; + fp_entry->fe_count = FE_COUNT_UNTRACKED; + TAILQ_INSERT_TAIL(&fp_setting->fp_entry_queue, fp_entry, + fe_entries); + + return (fp_entry); +} + +/* Free a struct fail_point_entry */ +static void +fail_point_entry_destroy(struct fail_point_entry *fp_entry) 
+{ + + fp_free(fp_entry); +} + +/* Get a ref on an fp's fp_setting */ +static inline struct fail_point_setting * +fail_point_setting_get_ref(struct fail_point *fp) +{ + struct fail_point_setting *fp_setting; + + /* Invariant: if we have a ref, our pointer to fp_setting is safe */ + atomic_add_acq_32(&fp->fp_ref_cnt, 1); + fp_setting = fp->fp_setting; + + return (fp_setting); +} + +/* Release a ref on an fp_setting */ +static inline void +fail_point_setting_release_ref(struct fail_point *fp) +{ + + KASSERT(&fp->fp_ref_cnt > 0, ("Attempting to deref w/no refs")); + atomic_subtract_rel_32(&fp->fp_ref_cnt, 1); +} + +/* Append fp entries to fp garbage list */ +static inline void +fail_point_setting_garbage_append(struct fail_point_setting *fp_setting) +{ + + mtx_lock_spin(&mtx_garbage_list); + STAILQ_INSERT_TAIL(&fp_setting_garbage, fp_setting, + fs_garbage_link); + mtx_unlock_spin(&mtx_garbage_list); +} + +/* Swap fp's entries with fp_setting_new */ +static struct fail_point_setting * +fail_point_swap_settings(struct fail_point *fp, + struct fail_point_setting *fp_setting_new) +{ + struct fail_point_setting *fp_setting_old; + + fp_setting_old = fp->fp_setting; + fp->fp_setting = fp_setting_new; + + return (fp_setting_old); +} + +static inline void +fail_point_eval_swap_out(struct fail_point *fp, + struct fail_point_setting *fp_setting) +{ + + /* We may have already been swapped out and replaced; ignore. */ + if (fp->fp_setting == fp_setting) + fail_point_swap_settings(fp, NULL); +} + +/* Free up any zero-ref entries in the garbage queue */ +static void +fail_point_garbage_collect() +{ + struct fail_point_setting *fs_current, *fs_next; + struct fail_point_setting_garbage fp_ents_free_list; + + /** + * We will transfer the entries to free to fp_ents_free_list while holding + * the spin mutex, then free it after we drop the lock. This avoids + * triggering witness due to sleepable mutexes in the memory + * allocator. 
+ */ + STAILQ_INIT(&fp_ents_free_list); + + mtx_lock_spin(&mtx_garbage_list); + STAILQ_FOREACH_SAFE(fs_current, &fp_setting_garbage, fs_garbage_link, + fs_next) { + if (fs_current->fs_parent->fp_setting != fs_current && + fs_current->fs_parent->fp_ref_cnt == 0) { + STAILQ_REMOVE(&fp_setting_garbage, fs_current, + fail_point_setting, fs_garbage_link); + STAILQ_INSERT_HEAD(&fp_ents_free_list, fs_current, + fs_garbage_link); + } + } + mtx_unlock_spin(&mtx_garbage_list); + + STAILQ_FOREACH_SAFE(fs_current, &fp_ents_free_list, fs_garbage_link, + fs_next) + fail_point_setting_destroy(fs_current); +} + +/* Drain out all refs from this fail point */ +static inline void +fail_point_drain(struct fail_point *fp, int expected_ref) +{ + struct fail_point_setting *entries; + + entries = fail_point_swap_settings(fp, NULL); + /** + * We have unpaused all threads; so we will wait no longer + * than the time taken for the longest remaining sleep, or + * the length of time of a long-running code block. + */ + while (fp->fp_ref_cnt > expected_ref) { + wakeup(FP_PAUSE_CHANNEL(fp)); + tsleep(&fp, PWAIT, "fail_point_drain", hz / 100); + } + fail_point_swap_settings(fp, entries); +} + +static inline void +fail_point_pause(struct fail_point *fp, enum fail_point_return_code *pret, + struct mtx *mtx_sleep) +{ + + if (fp->fp_pre_sleep_fn) + fp->fp_pre_sleep_fn(fp->fp_pre_sleep_arg); + + msleep_spin(FP_PAUSE_CHANNEL(fp), mtx_sleep, "failpt", 0); + + if (fp->fp_post_sleep_fn) + fp->fp_post_sleep_fn(fp->fp_post_sleep_arg); +} + static inline void -fail_point_sleep(struct fail_point *fp, struct fail_point_entry *ent, - int msecs, enum fail_point_return_code *pret) +fail_point_sleep(struct fail_point *fp, int msecs, + enum fail_point_return_code *pret) { - /* convert from millisecs to ticks, rounding up */ - int timo = ((msecs * hz) + 999) / 1000; + int timo; + + /* Convert from millisecs to ticks, rounding up */ + timo = howmany(msecs * hz, 1000); if (timo > 0) { - if (fp->fp_sleep_fn == NULL) { 
- msleep(fp, &g_fp_mtx, PWAIT, "failpt", timo); + if (!(fp->fp_flags & FAIL_POINT_USE_TIMEOUT_PATH)) { + if (fp->fp_pre_sleep_fn) + fp->fp_pre_sleep_fn(fp->fp_pre_sleep_arg); + + tsleep(FP_SLEEP_CHANNEL(fp), PWAIT, "failpt", timo); + + if (fp->fp_post_sleep_fn) + fp->fp_post_sleep_fn(fp->fp_post_sleep_arg); } else { - timeout(fp->fp_sleep_fn, fp->fp_sleep_arg, timo); + if (fp->fp_pre_sleep_fn) + fp->fp_pre_sleep_fn(fp->fp_pre_sleep_arg); + + timeout(fp->fp_post_sleep_fn, fp->fp_post_sleep_arg, + timo); *pret = FAIL_POINT_RC_QUEUED; } } } - -/** - * Defines stating the equivalent of probablilty one (100%) - */ -enum { - PROB_MAX = 1000000, /* probability between zero and this number */ - PROB_DIGITS = 6, /* number of zero's in above number */ -}; - -static char *parse_fail_point(struct fail_point_entries *, char *); -static char *parse_term(struct fail_point_entries *, char *); +static char *parse_fail_point(struct fail_point_setting *, char *); +static char *parse_term(struct fail_point_setting *, char *); static char *parse_number(int *out_units, int *out_decimal, char *); static char *parse_type(struct fail_point_entry *, char *); -static void free_entry(struct fail_point_entries *, struct fail_point_entry *); -static void clear_entries(struct fail_point_entries *); /** * Initialize a fail_point. The name is formed in a printf-like fashion @@ -167,7 +465,7 @@ fail_point_init(struct fail_point *fp, const char *fmt, ...) char *name; int n; - TAILQ_INIT(&fp->fp_entries); + fp->fp_setting = NULL; fp->fp_flags = 0; /* Figure out the size of the name. */ @@ -185,25 +483,33 @@ fail_point_init(struct fail_point *fp, const char *fmt, ...) fp->fp_name = name; fp->fp_location = ""; fp->fp_flags |= FAIL_POINT_DYNAMIC_NAME; - fp->fp_sleep_fn = NULL; - fp->fp_sleep_arg = NULL; + fp->fp_pre_sleep_fn = NULL; + fp->fp_pre_sleep_arg = NULL; + fp->fp_post_sleep_fn = NULL; + fp->fp_post_sleep_arg = NULL; } /** - * Free the resources held by a fail_point. 
- * + * Free the resources held by a fail_point, and wake any paused threads. + * Thou shalt not allow threads to hit this fail point after you enter this + * function, nor shall you call this multiple times for a given fp. * @ingroup failpoint */ void fail_point_destroy(struct fail_point *fp) { + fail_point_drain(fp, 0); + if ((fp->fp_flags & FAIL_POINT_DYNAMIC_NAME) != 0) { fp_free(__DECONST(void *, fp->fp_name)); fp->fp_name = NULL; } fp->fp_flags = 0; - clear_entries(&fp->fp_entries); + + sx_xlock(&sx_fp_set); + fail_point_garbage_collect(); + sx_xunlock(&sx_fp_set); } /** @@ -216,21 +522,51 @@ fail_point_destroy(struct fail_point *fp) enum fail_point_return_code fail_point_eval_nontrivial(struct fail_point *fp, int *return_value) { - enum fail_point_return_code ret = FAIL_POINT_RC_CONTINUE; - struct fail_point_entry *ent, *next; + bool execute = false; + struct fail_point_entry *ent; + struct fail_point_setting *fp_setting; + enum fail_point_return_code ret; + int cont; + int count; int msecs; + int usecs; - FP_LOCK(); + ret = FAIL_POINT_RC_CONTINUE; + cont = 0; /* don't continue by default */ - TAILQ_FOREACH_SAFE(ent, &fp->fp_entries, fe_entries, next) { - int cont = 0; /* don't continue by default */ + fp_setting = fail_point_setting_get_ref(fp); + if (fp_setting == NULL) + goto abort; + + TAILQ_FOREACH(ent, &fp_setting->fp_entry_queue, fe_entries) { + + if (ent->fe_stale) + continue; if (ent->fe_prob < PROB_MAX && ent->fe_prob < random() % PROB_MAX) continue; + if (ent->fe_pid != NO_PID && ent->fe_pid != curproc->p_pid) continue; + if (ent->fe_count != FE_COUNT_UNTRACKED) { + count = ent->fe_count; + while (count > 0) { + if (atomic_cmpset_32(&ent->fe_count, count, count - 1)) { + count--; + execute = true; + break; + } + count = ent->fe_count; + } + if (execute == false) + /* We lost the race; consider the entry stale and bail now */ + continue; + if (count == 0) + ent->fe_stale = true; + } + switch (ent->fe_type) { case FAIL_POINT_PANIC: panic("fail point 
%s panicking", fp->fp_name); @@ -244,7 +580,7 @@ fail_point_eval_nontrivial(struct fail_point *fp, int *return_value) case FAIL_POINT_BREAK: printf("fail point %s breaking to debugger\n", - fp->fp_name); + fp->fp_name); breakpoint(); break; @@ -254,51 +590,95 @@ fail_point_eval_nontrivial(struct fail_point *fp, int *return_value) break; case FAIL_POINT_SLEEP: - /* - * Free the entry now if necessary, since - * we're about to drop the mutex and sleep. - */ msecs = ent->fe_arg; - if (ent->fe_count > 0 && --ent->fe_count == 0) { - free_entry(&fp->fp_entries, ent); - ent = NULL; - } - if (msecs) - fail_point_sleep(fp, ent, msecs, &ret); + fail_point_sleep(fp, msecs, &ret); + break; + + case FAIL_POINT_PAUSE: + /** + * Pausing is inherently strange with multiple + * entries given our design. That is because some + * entries could be unreachable, for instance in cases like: + * pause->return. We can never reach the return entry. + * The sysctl layer actually truncates all entries after + * a pause for this reason. + */ + mtx_lock_spin(&fp_setting->feq_mtx); + fail_point_pause(fp, &ret, &fp_setting->feq_mtx); + mtx_unlock_spin(&fp_setting->feq_mtx); + break; + + case FAIL_POINT_YIELD: + kern_yield(-1); + break; + + case FAIL_POINT_DELAY: + usecs = ent->fe_arg; + DELAY(usecs); break; default: break; } - if (ent != NULL && ent->fe_count > 0 && --ent->fe_count == 0) - free_entry(&fp->fp_entries, ent); if (cont == 0) break; } - /* Get rid of "off"s at the end. */ - while ((ent = TAILQ_LAST(&fp->fp_entries, fail_point_entries)) && - ent->fe_type == FAIL_POINT_OFF) - free_entry(&fp->fp_entries, ent); + if (fail_point_is_off(fp)) + fail_point_eval_swap_out(fp, fp_setting); - FP_UNLOCK(); +abort: + fail_point_setting_release_ref(fp); return (ret); + } /** * Translate internal fail_point structure into human-readable text. 
*/ static void -fail_point_get(struct fail_point *fp, struct sbuf *sb) +fail_point_get(struct fail_point *fp, struct sbuf *sb, + bool verbose) { struct fail_point_entry *ent; + struct fail_point_setting *fp_setting; + struct fail_point_entry *fp_entry_cpy; + int cnt_sleeping; + int idx; + int printed_entry_count; - FP_LOCK(); + cnt_sleeping = 0; + idx = 0; + printed_entry_count = 0; - TAILQ_FOREACH(ent, &fp->fp_entries, fe_entries) { + fp_entry_cpy = fp_malloc(sizeof(struct fail_point_entry) * + (FP_MAX_ENTRY_COUNT + 1), M_WAITOK); + + fp_setting = fail_point_setting_get_ref(fp); + + if (fp_setting != NULL) { + TAILQ_FOREACH(ent, &fp_setting->fp_entry_queue, fe_entries) { + if (ent->fe_stale) + continue; + + KASSERT(printed_entry_count < FP_MAX_ENTRY_COUNT, + ("FP entry list larger than allowed")); + + fp_entry_cpy[printed_entry_count] = *ent; + ++printed_entry_count; + } + } + fail_point_setting_release_ref(fp); + + /* This is our equivalent of a NULL terminator */ + fp_entry_cpy[printed_entry_count].fe_type = FAIL_POINT_INVALID; + + while (idx < printed_entry_count) { + ent = &fp_entry_cpy[idx]; + ++idx; if (ent->fe_prob < PROB_MAX) { int decimal = ent->fe_prob % (PROB_MAX / 100); int units = ent->fe_prob / (PROB_MAX / 100); @@ -313,7 +693,7 @@ fail_point_get(struct fail_point *fp, struct sbuf *sb) } sbuf_printf(sb, "%%"); } - if (ent->fe_count > 0) + if (ent->fe_count >= 0) sbuf_printf(sb, "%d*", ent->fe_count); sbuf_printf(sb, "%s", fail_type_strings[ent->fe_type].name); if (ent->fe_arg) @@ -323,10 +703,29 @@ fail_point_get(struct fail_point *fp, struct sbuf *sb) if (TAILQ_NEXT(ent, fe_entries)) sbuf_printf(sb, "->"); } - if (TAILQ_EMPTY(&fp->fp_entries)) + if (!printed_entry_count) sbuf_printf(sb, "off"); - FP_UNLOCK(); + fp_free(fp_entry_cpy); + if (verbose) { + /* Print number of sleeping threads. queue=0 is the argument + * used by msleep when sending our threads to sleep. 
*/ + sbuf_printf(sb, "\nsleeping_thread_stacks = {\n"); + sleepq_sbuf_print_stacks(sb, FP_SLEEP_CHANNEL(fp), 0, + &cnt_sleeping); + + sbuf_printf(sb, "},\n"); + sbuf_printf(sb, "sleeping_thread_count = %d,\n", + cnt_sleeping); + + sbuf_printf(sb, "paused_thread_stacks = {\n"); + sleepq_sbuf_print_stacks(sb, FP_PAUSE_CHANNEL(fp), 0, + &cnt_sleeping); + + sbuf_printf(sb, "},\n"); + sbuf_printf(sb, "paused_thread_count = %d\n", + cnt_sleeping); + } } /** @@ -336,38 +735,91 @@ fail_point_get(struct fail_point *fp, struct sbuf *sb) static int fail_point_set(struct fail_point *fp, char *buf) { - int error = 0; struct fail_point_entry *ent, *ent_next; - struct fail_point_entries new_entries; + struct fail_point_setting *entries; + bool should_wake_paused; + bool should_truncate; + int error; + + error = 0; + should_wake_paused = false; + should_truncate = false; /* Parse new entries. */ - TAILQ_INIT(&new_entries); - if (!parse_fail_point(&new_entries, buf)) { - clear_entries(&new_entries); + /** + * ref protects our new malloc'd stuff from being garbage collected + * before we link it. + */ + fail_point_setting_get_ref(fp); + entries = fail_point_setting_new(fp); + if (parse_fail_point(entries, buf) == NULL) { + STAILQ_REMOVE(&fp_setting_garbage, entries, + fail_point_setting, fs_garbage_link); + fail_point_setting_destroy(entries); error = EINVAL; goto end; } - FP_LOCK(); - - /* Move new entries in. */ - TAILQ_SWAP(&fp->fp_entries, &new_entries, fail_point_entry, fe_entries); - clear_entries(&new_entries); + /** + * Transfer the entries we are going to keep to a new list. + * Get rid of useless zero probability entries, and entries with hit + * count 0. + * If 'off' is present, and it has no hit count set, then all entries + * after it are discarded since they are unreachable. 
+ */ + TAILQ_FOREACH_SAFE(ent, &entries->fp_entry_queue, fe_entries, ent_next) { + if (ent->fe_prob == 0 || ent->fe_count == 0) { + printf("Discarding entry which cannot execute %s\n", + fail_type_strings[ent->fe_type].name); + TAILQ_REMOVE(&entries->fp_entry_queue, ent, + fe_entries); + fp_free(ent); + continue; + } else if (should_truncate) { + printf("Discarding unreachable entry %s\n", + fail_type_strings[ent->fe_type].name); + TAILQ_REMOVE(&entries->fp_entry_queue, ent, + fe_entries); + fp_free(ent); + continue; + } - /* Get rid of useless zero probability entries. */ - TAILQ_FOREACH_SAFE(ent, &fp->fp_entries, fe_entries, ent_next) { - if (ent->fe_prob == 0) - free_entry(&fp->fp_entries, ent); + if (ent->fe_type == FAIL_POINT_OFF) { + should_wake_paused = true; + if (ent->fe_count == FE_COUNT_UNTRACKED) { + should_truncate = true; + TAILQ_REMOVE(&entries->fp_entry_queue, ent, + fe_entries); + fp_free(ent); + } + } else if (ent->fe_type == FAIL_POINT_PAUSE) { + should_truncate = true; + } else if (ent->fe_type == FAIL_POINT_SLEEP && (fp->fp_flags & + FAIL_POINT_NONSLEEPABLE)) { + /** + * If this fail point is annotated as being in a + * non-sleepable ctx, convert sleep to delay and + * convert the msec argument to usecs. + */ + printf("Sleep call request on fail point in " + "non-sleepable context; using delay instead " + "of sleep\n"); + ent->fe_type = FAIL_POINT_DELAY; + ent->fe_arg *= 1000; + } } - /* Get rid of "off"s at the end. 
*/ - while ((ent = TAILQ_LAST(&fp->fp_entries, fail_point_entries)) && - ent->fe_type == FAIL_POINT_OFF) - free_entry(&fp->fp_entries, ent); - - FP_UNLOCK(); + if (TAILQ_EMPTY(&entries->fp_entry_queue)) { + entries = fail_point_swap_settings(fp, NULL); + if (entries != NULL) + wakeup(FP_PAUSE_CHANNEL(fp)); + } else { + if (should_wake_paused) + wakeup(FP_PAUSE_CHANNEL(fp)); + fail_point_swap_settings(fp, entries); + } - end: +end: #ifdef IWARNING if (error) IWARNING("Failed to set %s %s to %s", @@ -377,6 +829,7 @@ fail_point_set(struct fail_point *fp, char *buf) fp->fp_name, fp->fp_location, buf); #endif /* IWARNING */ + fail_point_setting_release_ref(fp); return (error); } @@ -385,25 +838,33 @@ fail_point_set(struct fail_point *fp, char *buf) /** * Handle kernel failpoint set/get. */ + int fail_point_sysctl(SYSCTL_HANDLER_ARGS) { - struct fail_point *fp = arg1; - char *buf = NULL; + struct fail_point *fp; + char *buf; + struct sbuf *sb_check; struct sbuf sb; int error; - /* Retrieving */ - sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND | SBUF_INCLUDENUL); - fail_point_get(fp, &sb); - sbuf_trim(&sb); - error = sbuf_finish(&sb); - if (error == 0) - error = SYSCTL_OUT(req, sbuf_data(&sb), sbuf_len(&sb)); - sbuf_delete(&sb); + error = 0; + fp = arg1; + buf = NULL; + + sb_check = sbuf_new(&sb, NULL, 1024, SBUF_AUTOEXTEND); + if (sb_check != &sb) + return (ENOMEM); + + sbuf_set_drain(&sb, (sbuf_drain_func *)fail_sysctl_drain_func, req); /* Setting */ - if (!error && req->newptr) { + /** + * Lock protects any new entries from being garbage collected before we + * can link them to the fail point. + */ + sx_xlock(&sx_fp_set); + if (req->newptr) { if (req->newlen > MAX_FAIL_POINT_BUF) { error = EINVAL; goto out; @@ -417,31 +878,95 @@ fail_point_sysctl(SYSCTL_HANDLER_ARGS) buf[req->newlen] = '\0'; error = fail_point_set(fp, buf); - } + } + + fail_point_garbage_collect(); + sx_xunlock(&sx_fp_set); + + /* Retrieving. 
*/ + fail_point_get(fp, &sb, false); out: - fp_free(buf); + sbuf_finish(&sb); + sbuf_delete(&sb); + + if (buf) + fp_free(buf); + return (error); } +int +fail_point_sysctl_status(SYSCTL_HANDLER_ARGS) +{ + struct fail_point *fp; + struct sbuf sb, *sb_check; + + fp = arg1; + + sb_check = sbuf_new(&sb, NULL, 1024, SBUF_AUTOEXTEND); + if (sb_check != &sb) + return (ENOMEM); + + sbuf_set_drain(&sb, (sbuf_drain_func *)fail_sysctl_drain_func, req); + + /* Retrieving. */ + fail_point_get(fp, &sb, true); + + sbuf_finish(&sb); + sbuf_delete(&sb); + + /** + * Lock protects any new entries from being garbage collected before we + * can link them to the fail point. + */ + sx_xlock(&sx_fp_set); + fail_point_garbage_collect(); + sx_xunlock(&sx_fp_set); + + return (0); +} + +int +fail_sysctl_drain_func(void *sysctl_args, const char *buf, int len) +{ + struct sysctl_req *sa; + int error; + + sa = sysctl_args; + + error = SYSCTL_OUT(sa, buf, len); + + if (error == ENOMEM) + return (-1); + else + return (len); +} + + /** * Internal helper function to translate a human-readable failpoint string * into a internally-parsable fail_point structure. */ static char * -parse_fail_point(struct fail_point_entries *ents, char *p) +parse_fail_point(struct fail_point_setting *ents, char *p) { /* <fail_point> :: * <term> ( "->" <term> )* */ + uint8_t term_count; + + term_count = 1; + p = parse_term(ents, p); if (p == NULL) return (NULL); + while (*p != '\0') { - if (p[0] != '-' || p[1] != '>') - return (NULL); - p = parse_term(ents, p + 2); - if (p == NULL) + term_count++; + if (p[0] != '-' || p[1] != '>' || + (p = parse_term(ents, p+2)) == NULL || + term_count > FP_MAX_ENTRY_COUNT) return (NULL); } return (p); @@ -451,14 +976,11 @@ parse_fail_point(struct fail_point_entries *ents, char *p) * Internal helper function to parse an individual term from a failpoint. 
*/ static char * -parse_term(struct fail_point_entries *ents, char *p) +parse_term(struct fail_point_setting *ents, char *p) { struct fail_point_entry *ent; - ent = fp_malloc(sizeof *ent, M_WAITOK | M_ZERO); - ent->fe_prob = PROB_MAX; - ent->fe_pid = NO_PID; - TAILQ_INSERT_TAIL(ents, ent, fe_entries); + ent = fail_point_entry_new(ents); /* * <term> :: @@ -483,7 +1005,7 @@ parse_term(struct fail_point_entries *ents, char *p) if (ent->fe_prob > PROB_MAX) ent->fe_prob = PROB_MAX; } else if (*p == '*') { - if (!units || decimal) + if (!units || units < 0 || decimal) return (NULL); ent->fe_count = units; } else @@ -500,7 +1022,7 @@ parse_term(struct fail_point_entries *ents, char *p) /* [ "(" <integer> ")" ] */ if (*p != '(') - return p; + return (p); p++; if (!isdigit(*p) && *p != '-') return (NULL); @@ -509,7 +1031,7 @@ parse_term(struct fail_point_entries *ents, char *p) return (NULL); /* [ "[pid " <integer> "]" ] */ -#define PID_STRING "[pid " +#define PID_STRING "[pid " if (strncmp(p, PID_STRING, sizeof(PID_STRING) - 1) != 0) return (p); p += sizeof(PID_STRING) - 1; @@ -530,7 +1052,7 @@ parse_number(int *out_units, int *out_decimal, char *p) { char *old_p; - /* + /** * <number> :: * <integer> [ "." <integer> ] | * "." <integer> @@ -584,29 +1106,17 @@ parse_type(struct fail_point_entry *ent, char *beg) return (NULL); } -/** - * Internal helper function to free an individual failpoint term. - */ -static void -free_entry(struct fail_point_entries *ents, struct fail_point_entry *ent) -{ - TAILQ_REMOVE(ents, ent, fe_entries); - fp_free(ent); -} +/* The fail point sysctl tree. */ +SYSCTL_NODE(_debug, OID_AUTO, fail_point, CTLFLAG_RW, 0, "fail points"); -/** - * Internal helper function to clear out all failpoint terms for a single - * failpoint. 
- */ -static void -clear_entries(struct fail_point_entries *ents) +/* Debugging/testing stuff for fail point */ +static int +sysctl_test_fail_point(SYSCTL_HANDLER_ARGS) { - struct fail_point_entry *ent, *ent_next; - TAILQ_FOREACH_SAFE(ent, ents, fe_entries, ent_next) - fp_free(ent); - TAILQ_INIT(ents); + KFAIL_POINT_RETURN(DEBUG_FP, test_fail_point); + return (0); } - -/* The fail point sysctl tree. */ -SYSCTL_NODE(_debug, OID_AUTO, fail_point, CTLFLAG_RW, 0, "fail points"); +SYSCTL_OID(_debug_fail_point, OID_AUTO, test_trigger_fail_point, + CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_test_fail_point, "A", + "Trigger test fail points"); diff --git a/sys/kern/subr_sleepqueue.c b/sys/kern/subr_sleepqueue.c index 12908f6..2c68454 100644 --- a/sys/kern/subr_sleepqueue.c +++ b/sys/kern/subr_sleepqueue.c @@ -75,6 +75,7 @@ __FBSDID("$FreeBSD$"); #include <sys/sdt.h> #include <sys/signalvar.h> #include <sys/sleepqueue.h> +#include <sys/stack.h> #include <sys/sysctl.h> #include <vm/uma.h> @@ -83,6 +84,7 @@ __FBSDID("$FreeBSD$"); #include <ddb/ddb.h> #endif + /* * Constants for the hash table of sleep queue chains. * SC_TABLESIZE must be a power of two for SC_MASK to work properly. @@ -1034,6 +1036,120 @@ sleepq_abort(struct thread *td, int intrval) return (sleepq_resume_thread(sq, td, 0)); } +/* + * Prints the stacks of all threads presently sleeping on wchan/queue to + * the sbuf sb. Sets count_stacks_printed to the number of stacks actually + * printed. Typically, this will equal the number of threads sleeping on the + * queue, but may be less if sb overflowed before all stacks were printed. 
+ */ +int +sleepq_sbuf_print_stacks(struct sbuf *sb, void *wchan, int queue, + int *count_stacks_printed) +{ + struct thread *td, *td_next; + struct sleepqueue *sq; + struct stack **st; + struct sbuf **td_infos; + int i, stack_idx, error, stacks_to_allocate; + bool finished, partial_print; + + error = 0; + finished = false; + partial_print = false; + + KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); + MPASS((queue >= 0) && (queue < NR_SLEEPQS)); + + stacks_to_allocate = 10; + for (i = 0; i < 3 && !finished ; i++) { + /* We cannot malloc while holding the queue's spinlock, so + * we do our mallocs now, and hope it is enough. If it + * isn't, we will free these, drop the lock, malloc more, + * and try again, up to a point. After that point we will + * give up and report ENOMEM. We also cannot write to sb + * during this time since the client may have set the + * SBUF_AUTOEXTEND flag on their sbuf, which could cause a + * malloc as we print to it. So we defer actually printing + * to sb until after we drop the spinlock. + */ + + /* Where we will store the stacks. */ + st = malloc(sizeof(struct stack *) * stacks_to_allocate, + M_TEMP, M_WAITOK); + for (stack_idx = 0; stack_idx < stacks_to_allocate; + stack_idx++) + st[stack_idx] = stack_create(); + + /* Where we will store the td name, tid, etc. */ + td_infos = malloc(sizeof(struct sbuf *) * stacks_to_allocate, + M_TEMP, M_WAITOK); + for (stack_idx = 0; stack_idx < stacks_to_allocate; + stack_idx++) + td_infos[stack_idx] = sbuf_new(NULL, NULL, + MAXCOMLEN + sizeof(struct thread *) * 2 + 40, + SBUF_FIXEDLEN); + + sleepq_lock(wchan); + sq = sleepq_lookup(wchan); + if (sq == NULL) { + /* This sleepq does not exist; exit and return ENOENT. 
*/ + error = ENOENT; + finished = true; + sleepq_release(wchan); + goto loop_end; + } + + stack_idx = 0; + /* Save thread info */ + TAILQ_FOREACH_SAFE(td, &sq->sq_blocked[queue], td_slpq, + td_next) { + if (stack_idx >= stacks_to_allocate) + goto loop_end; + + /* Note the td_lock is equal to the sleepq_lock here. */ + stack_save_td(st[stack_idx], td); + + sbuf_printf(td_infos[stack_idx], "%d: %s %p", + td->td_tid, td->td_name, td); + + ++stack_idx; + } + + finished = true; + sleepq_release(wchan); + + /* Print the stacks */ + for (i = 0; i < stack_idx; i++) { + sbuf_finish(td_infos[i]); + sbuf_printf(sb, "--- thread %s: ---\n", sbuf_data(td_infos[i])); + stack_sbuf_print(sb, st[i]); + sbuf_printf(sb, "\n"); + + error = sbuf_error(sb); + if (error == 0) + *count_stacks_printed = stack_idx; + } + +loop_end: + if (!finished) + sleepq_release(wchan); + for (stack_idx = 0; stack_idx < stacks_to_allocate; + stack_idx++) + stack_destroy(st[stack_idx]); + for (stack_idx = 0; stack_idx < stacks_to_allocate; + stack_idx++) + sbuf_delete(td_infos[stack_idx]); + free(st, M_TEMP); + free(td_infos, M_TEMP); + stacks_to_allocate *= 10; + } + + if (!finished && error == 0) + error = ENOMEM; + + return (error); +} + #ifdef SLEEPQUEUE_PROFILING #define SLEEPQ_PROF_LOCATIONS 1024 #define SLEEPQ_SBUFSIZE 512 diff --git a/sys/sys/fail.h b/sys/sys/fail.h index e011459..bd2eab1 100644 --- a/sys/sys/fail.h +++ b/sys/sys/fail.h @@ -37,6 +37,11 @@ #include <sys/linker_set.h> #include <sys/queue.h> #include <sys/sysctl.h> +#include <sys/condvar.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/systm.h> /** * Failpoint return codes, used internally. @@ -49,7 +54,8 @@ enum fail_point_return_code { }; struct fail_point_entry; -TAILQ_HEAD(fail_point_entries, fail_point_entry); +struct fail_point_setting; + /** * Internal failpoint structure, tracking all the current details of the * failpoint. 
This structure is the core component shared between the @@ -57,22 +63,42 @@ TAILQ_HEAD(fail_point_entries, fail_point_entry); * @ingroup failpoint_private */ struct fail_point { - const char *fp_name; /**< name of fail point */ - const char *fp_location; /**< file:line of fail point */ - struct fail_point_entries fp_entries; /**< list of entries */ + const char *fp_name; /* name of fail point */ + const char *fp_location; /* file:line of fail point */ + volatile int fp_ref_cnt; /** + * protects fp_setting: while holding + * a ref, fp_setting points to an + * unfreed fail_point_setting + */ + struct fail_point_setting * volatile fp_setting; int fp_flags; - void (*fp_sleep_fn)(void *); /**< Function to call at end of - * sleep for sleep failpoints */ - void *fp_sleep_arg; /**< Arg for sleep_fn */ + + /**< Function to call before sleep or pause */ + void (*fp_pre_sleep_fn)(void *); + /**< Arg for fp_pre_sleep_fn */ + void *fp_pre_sleep_arg; + + /**< Function to call after waking from sleep or pause */ + void (*fp_post_sleep_fn)(void *); + /**< Arg for fp_post_sleep_fn */ + void *fp_post_sleep_arg; }; #define FAIL_POINT_DYNAMIC_NAME 0x01 /**< Must free name on destroy */ +/**< Use timeout path for sleep instead of msleep */ +#define FAIL_POINT_USE_TIMEOUT_PATH 0x02 +/**< If fail point is set to sleep, replace the sleep call with delay */ +#define FAIL_POINT_NONSLEEPABLE 0x04 + +#define FAIL_POINT_CV_DESC "fp cv no iterators" +#define FAIL_POINT_IS_OFF(fp) (__predict_true((fp)->fp_setting == NULL) || \ + __predict_true(fail_point_is_off(fp))) __BEGIN_DECLS /* Private failpoint eval function -- use fail_point_eval() instead. */ enum fail_point_return_code fail_point_eval_nontrivial(struct fail_point *, - int *ret); + int *ret); /** * @addtogroup failpoint @@ -86,26 +112,62 @@ enum fail_point_return_code fail_point_eval_nontrivial(struct fail_point *, void fail_point_init(struct fail_point *, const char *fmt, ...) 
__printflike(2, 3); +/* Return true iff this fail point is set to off, false otherwise */ +bool fail_point_is_off(struct fail_point *fp); + +/** + * Set the pre-sleep function for a fail point + * If fp_pre_sleep_fn is specified, then FAIL_POINT_SLEEP will result in a + * (*fp->fp_pre_sleep_fn)(fp->fp_pre_sleep_arg) call by the thread. + */ +static inline void +fail_point_sleep_set_pre_func(struct fail_point *fp, void (*sleep_fn)(void *)) +{ + fp->fp_pre_sleep_fn = sleep_fn; +} + +static inline void +fail_point_sleep_set_pre_arg(struct fail_point *fp, void *sleep_arg) +{ + fp->fp_pre_sleep_arg = sleep_arg; +} + /** - * Set the sleep function for a fail point - * If sleep_fn is specified, then FAIL_POINT_SLEEP will result in a - * (*fp->sleep_fn)(fp->sleep_arg) call by the timer thread. Otherwise, - * if sleep_fn is NULL (default), then FAIL_POINT_SLEEP will result in the - * fail_point_eval() call sleeping. + * Set the post-sleep function. This will be passed to timeout if we take + * the timeout path. This must be set if you sleep using the timeout path. */ -static __inline void -fail_point_sleep_set_func(struct fail_point *fp, void (*sleep_fn)(void *)) +static inline void +fail_point_sleep_set_post_func(struct fail_point *fp, void (*sleep_fn)(void *)) { - fp->fp_sleep_fn = sleep_fn; + fp->fp_post_sleep_fn = sleep_fn; } +static inline void +fail_point_sleep_set_post_arg(struct fail_point *fp, void *sleep_arg) +{ + fp->fp_post_sleep_arg = sleep_arg; +} /** - * Set the argument for the sleep function for a fail point + * If the FAIL_POINT_USE_TIMEOUT_PATH flag is set on a failpoint, then + * FAIL_POINT_SLEEP will result in a call to timeout instead of + * msleep. Note that if you sleep while this flag is set, you must + * set fp_post_sleep_fn or an error will occur upon waking. 
*/ -static __inline void -fail_point_sleep_set_arg(struct fail_point *fp, void *sleep_arg) +static inline void +fail_point_use_timeout_path(struct fail_point *fp, bool use_timeout, + void (*post_sleep_fn)(void *)) { - fp->fp_sleep_arg = sleep_arg; + KASSERT(!use_timeout || post_sleep_fn != NULL || + (post_sleep_fn == NULL && fp->fp_post_sleep_fn != NULL), + ("Setting fp to use timeout, but not setting post_sleep_fn\n")); + + if (use_timeout) + fp->fp_flags |= FAIL_POINT_USE_TIMEOUT_PATH; + else + fp->fp_flags &= ~FAIL_POINT_USE_TIMEOUT_PATH; + + if (post_sleep_fn != NULL) + fp->fp_post_sleep_fn = post_sleep_fn; } /** @@ -116,33 +178,64 @@ void fail_point_destroy(struct fail_point *); /** * Evaluate a failpoint. */ -static __inline enum fail_point_return_code +static inline enum fail_point_return_code fail_point_eval(struct fail_point *fp, int *ret) { - if (TAILQ_EMPTY(&fp->fp_entries)) { + if (__predict_true(fp->fp_setting == NULL)) return (FAIL_POINT_RC_CONTINUE); - } return (fail_point_eval_nontrivial(fp, ret)); } __END_DECLS /* Declare a fail_point and its sysctl in a function. 
*/ -#define _FAIL_POINT_NAME(name) _fail_point_##name -#define _FAIL_POINT_LOCATION() "(" __FILE__ ":" __XSTRING(__LINE__) ")" +#define _FAIL_POINT_NAME(name) _fail_point_##name +#define _FAIL_POINT_LOCATION() "(" __FILE__ ":" __XSTRING(__LINE__) ")" +#define _FAIL_POINT_INIT(parent, name, flags) \ + static struct fail_point _FAIL_POINT_NAME(name) = { \ + .fp_name = #name, \ + .fp_location = _FAIL_POINT_LOCATION(), \ + .fp_ref_cnt = 0, \ + .fp_setting = NULL, \ + .fp_flags = (flags), \ + .fp_pre_sleep_fn = NULL, \ + .fp_pre_sleep_arg = NULL, \ + .fp_post_sleep_fn = NULL, \ + .fp_post_sleep_arg = NULL, \ + }; \ + SYSCTL_OID(parent, OID_AUTO, name, \ + CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, \ + &_FAIL_POINT_NAME(name), 0, fail_point_sysctl, \ + "A", ""); \ + SYSCTL_OID(parent, OID_AUTO, status_##name, \ + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, \ + &_FAIL_POINT_NAME(name), 0, \ + fail_point_sysctl_status, "A", ""); +#define _FAIL_POINT_EVAL(name, cond, code...) \ + int RETURN_VALUE; \ + \ + if (__predict_false(cond && \ + fail_point_eval(&_FAIL_POINT_NAME(name), &RETURN_VALUE))) { \ + \ + code; \ + \ + } + /** - * Instantiate a failpoint which returns "value" from the function when triggered. - * @param parent The parent sysctl under which to locate the sysctl + * Instantiate a failpoint which returns "RETURN_VALUE" from the function + * when triggered. + * @param parent The parent sysctl under which to locate the fp's sysctl * @param name The name of the failpoint in the sysctl tree (and printouts) - * @return Instantly returns the return("value") specified in the + * @return Instantly returns the RETURN_VALUE specified in the * failpoint, if triggered. */ #define KFAIL_POINT_RETURN(parent, name) \ KFAIL_POINT_CODE(parent, name, return RETURN_VALUE) /** - * Instantiate a failpoint which returns (void) from the function when triggered. + * Instantiate a failpoint which returns (void) from the function when + * triggered. 
* @param parent The parent sysctl under which to locate the sysctl * @param name The name of the failpoint in the sysctl tree (and printouts) * @return Instantly returns void, if triggered in the failpoint. @@ -153,7 +246,8 @@ __END_DECLS /** * Instantiate a failpoint which sets an error when triggered. * @param parent The parent sysctl under which to locate the sysctl - * @param name The name of the failpoint in the sysctl tree (and printouts) + * @param name The name of the failpoint in the sysctl tree (and + * printouts) * @param error_var A variable to set to the failpoint's specified * return-value when triggered */ @@ -164,7 +258,8 @@ __END_DECLS * Instantiate a failpoint which sets an error and then goes to a * specified label in the function when triggered. * @param parent The parent sysctl under which to locate the sysctl - * @param name The name of the failpoint in the sysctl tree (and printouts) + * @param name The name of the failpoint in the sysctl tree (and + * printouts) * @param error_var A variable to set to the failpoint's specified * return-value when triggered * @param label The location to goto when triggered. @@ -173,39 +268,81 @@ __END_DECLS KFAIL_POINT_CODE(parent, name, (error_var) = RETURN_VALUE; goto label) /** + * Instantiate a failpoint which sets its pre- and post-sleep callback + * mechanisms. + * @param parent The parent sysctl under which to locate the sysctl + * @param name The name of the failpoint in the sysctl tree (and + * printouts) + * @param pre_func Function pointer to the pre-sleep function, which will be + * called directly before going to sleep. + * @param pre_arg Argument to the pre-sleep function + * @param post_func Function pointer to the post-sleep function, which will be + * called directly after waking from sleep. 
+ * @param post_arg Argument to the post-sleep function + */ +#define KFAIL_POINT_SLEEP_CALLBACKS(parent, name, pre_func, pre_arg, \ + post_func, post_arg) \ + KFAIL_POINT_CODE_SLEEP_CALLBACKS(parent, name, pre_func, \ + pre_arg, post_func, post_arg, return RETURN_VALUE) + +/** + * Instantiate a failpoint which runs arbitrary code when triggered, and sets + * its pre- and post-sleep callback mechanisms + * @param parent The parent sysctl under which to locate the sysctl + * @param name The name of the failpoint in the sysctl tree (and + * printouts) + * @param pre_func Function pointer to the pre-sleep function, which will be + * called directly before going to sleep. + * @param pre_arg Argument to the pre-sleep function + * @param post_func Function pointer to the post-sleep function, which will be + * called directly after waking from sleep. + * @param post_arg Argument to the post-sleep function + * @param code The arbitrary code to run when triggered. Can reference + * "RETURN_VALUE" if desired to extract the specified + * user return-value when triggered. Note that this is + * implemented with a do-while loop so be careful of + * break and continue statements. + */ +#define KFAIL_POINT_CODE_SLEEP_CALLBACKS(parent, name, pre_func, pre_arg, \ + post_func, post_arg, code...) \ + do { \ + _FAIL_POINT_INIT(parent, name) \ + _FAIL_POINT_NAME(name).fp_pre_sleep_fn = pre_func; \ + _FAIL_POINT_NAME(name).fp_pre_sleep_arg = pre_arg; \ + _FAIL_POINT_NAME(name).fp_post_sleep_fn = post_func; \ + _FAIL_POINT_NAME(name).fp_post_sleep_arg = post_arg; \ + _FAIL_POINT_EVAL(name, true, code) \ + } while (0) + + +/** * Instantiate a failpoint which runs arbitrary code when triggered. * @param parent The parent sysctl under which to locate the sysctl * @param name The name of the failpoint in the sysctl tree - * (and printouts) + * (and printouts) * @param code The arbitrary code to run when triggered. 
Can reference * "RETURN_VALUE" if desired to extract the specified * user return-value when triggered. Note that this is * implemented with a do-while loop so be careful of * break and continue statements. */ -#define KFAIL_POINT_CODE(parent, name, code) \ -do { \ - int RETURN_VALUE; \ - static struct fail_point _FAIL_POINT_NAME(name) = { \ - #name, \ - _FAIL_POINT_LOCATION(), \ - TAILQ_HEAD_INITIALIZER(_FAIL_POINT_NAME(name).fp_entries), \ - 0, \ - NULL, NULL, \ - }; \ - SYSCTL_OID(parent, OID_AUTO, name, \ - CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, \ - &_FAIL_POINT_NAME(name), 0, fail_point_sysctl, \ - "A", ""); \ - \ - if (__predict_false( \ - fail_point_eval(&_FAIL_POINT_NAME(name), &RETURN_VALUE))) { \ - \ - code; \ - \ - } \ -} while (0) +#define KFAIL_POINT_CODE(parent, name, code...) \ + do { \ + _FAIL_POINT_INIT(parent, name, 0) \ + _FAIL_POINT_EVAL(name, true, code) \ + } while (0) + +#define KFAIL_POINT_CODE_FLAGS(parent, name, flags, code...) \ + do { \ + _FAIL_POINT_INIT(parent, name, flags) \ + _FAIL_POINT_EVAL(name, true, code) \ + } while (0) +#define KFAIL_POINT_CODE_COND(parent, name, cond, flags, code...) \ + do { \ + _FAIL_POINT_INIT(parent, name, flags) \ + _FAIL_POINT_EVAL(name, cond, code) \ + } while (0) /** * @} @@ -214,6 +351,7 @@ do { \ #ifdef _KERNEL int fail_point_sysctl(SYSCTL_HANDLER_ARGS); +int fail_point_sysctl_status(SYSCTL_HANDLER_ARGS); /* The fail point sysctl tree. */ SYSCTL_DECL(_debug_fail_point); diff --git a/sys/sys/sleepqueue.h b/sys/sys/sleepqueue.h index cdb7a39..b607a53 100644 --- a/sys/sys/sleepqueue.h +++ b/sys/sys/sleepqueue.h @@ -107,5 +107,9 @@ int sleepq_type(void *wchan); void sleepq_wait(void *wchan, int pri); int sleepq_wait_sig(void *wchan, int pri); +#include <sys/sbuf.h> +int sleepq_sbuf_print_stacks(struct sbuf *sb, void *wchan, int queue, + int *count_stacks_printed); + #endif /* _KERNEL */ #endif /* !_SYS_SLEEPQUEUE_H_ */ |