Diffstat (limited to 'sys/kern')
-rw-r--r--  sys/kern/imgact_binmisc.c     13
-rw-r--r--  sys/kern/imgact_elf.c          4
-rw-r--r--  sys/kern/init_sysent.c         2
-rw-r--r--  sys/kern/kern_condvar.c       48
-rw-r--r--  sys/kern/kern_descrip.c        4
-rw-r--r--  sys/kern/kern_exec.c           4
-rw-r--r--  sys/kern/kern_fail.c         782
-rw-r--r--  sys/kern/kern_linker.c         2
-rw-r--r--  sys/kern/kern_mbuf.c          11
-rw-r--r--  sys/kern/kern_osd.c          263
-rw-r--r--  sys/kern/kern_racct.c        149
-rw-r--r--  sys/kern/kern_rctl.c          75
-rw-r--r--  sys/kern/kern_sendfile.c      56
-rw-r--r--  sys/kern/kern_synch.c         25
-rw-r--r--  sys/kern/pic_if.m            112
-rw-r--r--  sys/kern/subr_bus.c           32
-rw-r--r--  sys/kern/subr_counter.c       25
-rw-r--r--  sys/kern/subr_intr.c         534
-rw-r--r--  sys/kern/subr_rman.c          50
-rw-r--r--  sys/kern/subr_sleepqueue.c   121
-rw-r--r--  sys/kern/subr_smp.c          248
-rw-r--r--  sys/kern/sys_generic.c        24
-rw-r--r--  sys/kern/syscalls.c            2
-rw-r--r--  sys/kern/syscalls.master       4
-rw-r--r--  sys/kern/systrace_args.c       4
-rw-r--r--  sys/kern/uipc_mbuf.c          47
-rw-r--r--  sys/kern/uipc_socket.c         2
-rw-r--r--  sys/kern/vfs_aio.c            49
-rw-r--r--  sys/kern/vfs_export.c          4
-rw-r--r--  sys/kern/vfs_mountroot.c       3
30 files changed, 1874 insertions, 825 deletions
diff --git a/sys/kern/imgact_binmisc.c b/sys/kern/imgact_binmisc.c
index dd57717..39ca156 100644
--- a/sys/kern/imgact_binmisc.c
+++ b/sys/kern/imgact_binmisc.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2013-15, Stacey D. Son
+ * Copyright (c) 2013-16, Stacey D. Son
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -220,16 +220,17 @@ imgact_binmisc_add_entry(ximgact_binmisc_entry_t *xbe)
{
imgact_binmisc_entry_t *ibe;
char *p;
+ int cnt;
if (xbe->xbe_msize > IBE_MAGIC_MAX)
return (EINVAL);
- for(p = xbe->xbe_name; *p != 0; p++)
- if (!isascii((int)*p))
+ for(cnt = 0, p = xbe->xbe_name; *p != 0; cnt++, p++)
+ if (cnt >= IBE_NAME_MAX || !isascii((int)*p))
return (EINVAL);
- for(p = xbe->xbe_interpreter; *p != 0; p++)
- if (!isascii((int)*p))
+ for(cnt = 0, p = xbe->xbe_interpreter; *p != 0; cnt++, p++)
+ if (cnt >= IBE_INTERP_LEN_MAX || !isascii((int)*p))
return (EINVAL);
/* Make sure we don't have any invalid #'s. */
@@ -266,8 +267,6 @@ imgact_binmisc_add_entry(ximgact_binmisc_entry_t *xbe)
/* Preallocate a new entry. */
ibe = imgact_binmisc_new_entry(xbe);
- if (!ibe)
- return (ENOMEM);
SLIST_INSERT_HEAD(&interpreter_list, ibe, link);
interp_list_entry_count++;
diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c
index 43d4800..0bed714 100644
--- a/sys/kern/imgact_elf.c
+++ b/sys/kern/imgact_elf.c
@@ -1370,10 +1370,6 @@ __elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit, int flags)
* and write it out following the notes.
*/
hdr = malloc(hdrsize, M_TEMP, M_WAITOK);
- if (hdr == NULL) {
- error = EINVAL;
- goto done;
- }
error = __elfN(corehdr)(&params, seginfo.count, hdr, hdrsize, &notelst,
notesz);
diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c
index 96c5229..47f2f90 100644
--- a/sys/kern/init_sysent.c
+++ b/sys/kern/init_sysent.c
@@ -3,7 +3,7 @@
*
* DO NOT EDIT-- this file is automatically generated.
* $FreeBSD$
- * created from FreeBSD: head/sys/kern/syscalls.master 296572 2016-03-09 19:05:11Z jhb
+ * created from FreeBSD: head/sys/kern/syscalls.master 297167 2016-03-21 21:37:33Z jhb
*/
#include "opt_compat.h"
diff --git a/sys/kern/kern_condvar.c b/sys/kern/kern_condvar.c
index 8061829..6f891e0 100644
--- a/sys/kern/kern_condvar.c
+++ b/sys/kern/kern_condvar.c
@@ -122,15 +122,8 @@ _cv_wait(struct cv *cvp, struct lock_object *lock)
"Waiting on \"%s\"", cvp->cv_description);
class = LOCK_CLASS(lock);
- if (cold || SCHEDULER_STOPPED()) {
- /*
- * During autoconfiguration, just give interrupts
- * a chance, then just return. Don't run any other
- * thread or panic below, in case this is the idle
- * process and already asleep.
- */
+ if (SCHEDULER_STOPPED())
return;
- }
sleepq_lock(cvp);
@@ -183,13 +176,7 @@ _cv_wait_unlock(struct cv *cvp, struct lock_object *lock)
("cv_wait_unlock cannot be used with Giant"));
class = LOCK_CLASS(lock);
- if (cold || SCHEDULER_STOPPED()) {
- /*
- * During autoconfiguration, just give interrupts
- * a chance, then just return. Don't run any other
- * thread or panic below, in case this is the idle
- * process and already asleep.
- */
+ if (SCHEDULER_STOPPED()) {
class->lc_unlock(lock);
return;
}
@@ -240,15 +227,8 @@ _cv_wait_sig(struct cv *cvp, struct lock_object *lock)
"Waiting on \"%s\"", cvp->cv_description);
class = LOCK_CLASS(lock);
- if (cold || SCHEDULER_STOPPED()) {
- /*
- * After a panic, or during autoconfiguration, just give
- * interrupts a chance, then just return; don't run any other
- * procs or panic below, in case this is the idle process and
- * already asleep.
- */
+ if (SCHEDULER_STOPPED())
return (0);
- }
sleepq_lock(cvp);
@@ -307,15 +287,8 @@ _cv_timedwait_sbt(struct cv *cvp, struct lock_object *lock, sbintime_t sbt,
"Waiting on \"%s\"", cvp->cv_description);
class = LOCK_CLASS(lock);
- if (cold || SCHEDULER_STOPPED()) {
- /*
- * After a panic, or during autoconfiguration, just give
- * interrupts a chance, then just return; don't run any other
- * thread or panic below, in case this is the idle process and
- * already asleep.
- */
- return 0;
- }
+ if (SCHEDULER_STOPPED())
+ return (0);
sleepq_lock(cvp);
@@ -376,15 +349,8 @@ _cv_timedwait_sig_sbt(struct cv *cvp, struct lock_object *lock,
"Waiting on \"%s\"", cvp->cv_description);
class = LOCK_CLASS(lock);
- if (cold || SCHEDULER_STOPPED()) {
- /*
- * After a panic, or during autoconfiguration, just give
- * interrupts a chance, then just return; don't run any other
- * thread or panic below, in case this is the idle process and
- * already asleep.
- */
- return 0;
- }
+ if (SCHEDULER_STOPPED())
+ return (0);
sleepq_lock(cvp);
diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index 00bd54b..b37adcc 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -3958,7 +3958,7 @@ badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
static int
badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
- int kflags, struct thread *td)
+ struct thread *td)
{
return (EBADF);
@@ -4044,7 +4044,7 @@ invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
int
invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
- int kflags, struct thread *td)
+ struct thread *td)
{
return (EINVAL);
diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c
index 69f0774..7c88fe0 100644
--- a/sys/kern/kern_exec.c
+++ b/sys/kern/kern_exec.c
@@ -1570,8 +1570,6 @@ exec_register(execsw_arg)
for (es = execsw; *es; es++)
count++;
newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
- if (newexecsw == NULL)
- return (ENOMEM);
xs = newexecsw;
if (execsw)
for (es = execsw; *es; es++)
@@ -1604,8 +1602,6 @@ exec_unregister(execsw_arg)
if (*es != execsw_arg)
count++;
newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
- if (newexecsw == NULL)
- return (ENOMEM);
xs = newexecsw;
for (es = execsw; *es; es++)
if (*es != execsw_arg)
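
The NULL checks dropped here, like the matching ones removed from imgact_elf.c above and kern_linker.c below, guarded allocations made with M_WAITOK, which by the malloc(9) contract sleeps until memory is available and never returns NULL. A minimal sketch of the two contracts, using hypothetical names:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/malloc.h>

/*
 * Sketch only: the M_WAITOK call cannot fail, so a NULL check after it
 * is dead code; only the M_NOWAIT variant needs an ENOMEM path.
 */
static int
copy_buf(const void *src, size_t len, int canwait, void **dstp)
{
	void *dst;

	dst = malloc(len, M_TEMP, canwait ? M_WAITOK : M_NOWAIT);
	if (dst == NULL)		/* reachable only in the M_NOWAIT case */
		return (ENOMEM);
	bcopy(src, dst, len);
	*dstp = dst;
	return (0);
}
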
diff --git a/sys/kern/kern_fail.c b/sys/kern/kern_fail.c
index 3737aa3..ec466dd 100644
--- a/sys/kern/kern_fail.c
+++ b/sys/kern/kern_fail.c
@@ -52,17 +52,25 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include "opt_stack.h"
+
#include <sys/ctype.h>
#include <sys/errno.h>
#include <sys/fail.h>
#include <sys/kernel.h>
#include <sys/libkern.h>
+#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sbuf.h>
+#include <sys/sleepqueue.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/types.h>
+#include <machine/atomic.h>
#include <machine/stdarg.h>
#ifdef ILOG_DEFINE_FOR_FILE
@@ -72,11 +80,45 @@ ILOG_DEFINE_FOR_FILE(L_ISI_FAIL_POINT, L_ILOG, fail_point);
static MALLOC_DEFINE(M_FAIL_POINT, "Fail Points", "fail points system");
#define fp_free(ptr) free(ptr, M_FAIL_POINT)
#define fp_malloc(size, flags) malloc((size), M_FAIL_POINT, (flags))
+#define fs_free(ptr) fp_free(ptr)
+#define fs_malloc() fp_malloc(sizeof(struct fail_point_setting), \
+ M_WAITOK | M_ZERO)
+
+ /**
+ * These define the wchans that are used for sleeping, pausing respectively.
+ * They are chosen arbitrarily but need to be distinct to the failpoint and
+ * the sleep/pause distinction.
+ */
+#define FP_SLEEP_CHANNEL(fp) (void*)(fp)
+#define FP_PAUSE_CHANNEL(fp) __DEVOLATILE(void*, &fp->fp_setting)
+
+/**
+ * Don't allow more than this many entries in a fail point set by sysctl.
+ * The 99.99...% case is to have 1 entry. I can't imagine having this many
+ * entries, so it should not limit us. Saves on re-mallocs while holding
+ * a non-sleepable lock.
+ */
+#define FP_MAX_ENTRY_COUNT 20
+
+/* Used to drain sbufs to the sysctl output */
+int fail_sysctl_drain_func(void *, const char *, int);
+
+/* Head of tailq of struct fail_point_entry */
+TAILQ_HEAD(fail_point_entry_queue, fail_point_entry);
+
+/**
+ * fp entries garbage list; outstanding entries are cleaned up in the
+ * garbage collector
+ */
+STAILQ_HEAD(fail_point_setting_garbage, fail_point_setting);
+static struct fail_point_setting_garbage fp_setting_garbage =
+ STAILQ_HEAD_INITIALIZER(fp_setting_garbage);
+static struct mtx mtx_garbage_list;
+MTX_SYSINIT(mtx_garbage_list, &mtx_garbage_list, "fail point garbage mtx",
+ MTX_SPIN);
-static struct mtx g_fp_mtx;
-MTX_SYSINIT(g_fp_mtx, &g_fp_mtx, "fail point mtx", MTX_DEF);
-#define FP_LOCK() mtx_lock(&g_fp_mtx)
-#define FP_UNLOCK() mtx_unlock(&g_fp_mtx)
+static struct sx sx_fp_set;
+SX_SYSINIT(sx_fp_set, &sx_fp_set, "fail point set sx");
/**
* Failpoint types.
@@ -90,7 +132,11 @@ enum fail_point_t {
FAIL_POINT_BREAK, /**< break into the debugger */
FAIL_POINT_PRINT, /**< print a message */
FAIL_POINT_SLEEP, /**< sleep for some msecs */
- FAIL_POINT_NUMTYPES
+ FAIL_POINT_PAUSE, /**< sleep until failpoint is set to off */
+ FAIL_POINT_YIELD, /**< yield the cpu */
+ FAIL_POINT_DELAY, /**< busy wait the cpu */
+ FAIL_POINT_NUMTYPES,
+ FAIL_POINT_INVALID = -1
};
static struct {
@@ -104,53 +150,307 @@ static struct {
[FAIL_POINT_BREAK] = FP_TYPE_NM_LEN("break"),
[FAIL_POINT_PRINT] = FP_TYPE_NM_LEN("print"),
[FAIL_POINT_SLEEP] = FP_TYPE_NM_LEN("sleep"),
+ [FAIL_POINT_PAUSE] = FP_TYPE_NM_LEN("pause"),
+ [FAIL_POINT_YIELD] = FP_TYPE_NM_LEN("yield"),
+ [FAIL_POINT_DELAY] = FP_TYPE_NM_LEN("delay"),
};
+#define FE_COUNT_UNTRACKED (INT_MIN)
+
/**
* Internal structure tracking a single term of a complete failpoint.
* @ingroup failpoint_private
*/
struct fail_point_entry {
- enum fail_point_t fe_type; /**< type of entry */
+ volatile bool fe_stale;
+ enum fail_point_t fe_type; /**< type of entry */
int fe_arg; /**< argument to type (e.g. return value) */
int fe_prob; /**< likelihood of firing in millionths */
- int fe_count; /**< number of times to fire, 0 means always */
+ int fe_count; /**< number of times to fire, -1 means infinite */
pid_t fe_pid; /**< only fail for this process */
- TAILQ_ENTRY(fail_point_entry) fe_entries; /**< next entry in fail point */
+ struct fail_point *fe_parent; /**< backpointer to fp */
+ TAILQ_ENTRY(fail_point_entry) fe_entries; /**< next entry ptr */
};
+struct fail_point_setting {
+ STAILQ_ENTRY(fail_point_setting) fs_garbage_link;
+ struct fail_point_entry_queue fp_entry_queue;
+ struct fail_point * fs_parent;
+ struct mtx feq_mtx; /* Gives fail_point_pause something to do. */
+};
+
+/**
+ * Defines stating the equivalent of probablilty one (100%)
+ */
+enum {
+ PROB_MAX = 1000000, /* probability between zero and this number */
+ PROB_DIGITS = 6 /* number of zero's in above number */
+};
+
+/* Get a ref on an fp's fp_setting */
+static inline struct fail_point_setting *fail_point_setting_get_ref(
+ struct fail_point *fp);
+/* Release a ref on an fp_setting */
+static inline void fail_point_setting_release_ref(struct fail_point *fp);
+/* Allocate and initialize a struct fail_point_setting */
+static struct fail_point_setting *fail_point_setting_new(struct
+ fail_point *);
+/* Free a struct fail_point_setting */
+static void fail_point_setting_destroy(struct fail_point_setting *fp_setting);
+/* Allocate and initialize a struct fail_point_entry */
+static struct fail_point_entry *fail_point_entry_new(struct
+ fail_point_setting *);
+/* Free a struct fail_point_entry */
+static void fail_point_entry_destroy(struct fail_point_entry *fp_entry);
+/* Append fp setting to garbage list */
+static inline void fail_point_setting_garbage_append(
+ struct fail_point_setting *fp_setting);
+/* Swap fp's setting with fp_setting_new */
+static inline struct fail_point_setting *
+ fail_point_swap_settings(struct fail_point *fp,
+ struct fail_point_setting *fp_setting_new);
+/* Free up any zero-ref setting in the garbage queue */
+static void fail_point_garbage_collect(void);
+/* If this fail point's setting are empty, then swap it out to NULL. */
+static inline void fail_point_eval_swap_out(struct fail_point *fp,
+ struct fail_point_setting *fp_setting);
+
+bool
+fail_point_is_off(struct fail_point *fp)
+{
+ bool return_val;
+ struct fail_point_setting *fp_setting;
+ struct fail_point_entry *ent;
+
+ return_val = true;
+
+ fp_setting = fail_point_setting_get_ref(fp);
+ if (fp_setting != NULL) {
+ TAILQ_FOREACH(ent, &fp_setting->fp_entry_queue,
+ fe_entries) {
+ if (!ent->fe_stale) {
+ return_val = false;
+ break;
+ }
+ }
+ }
+ fail_point_setting_release_ref(fp);
+
+ return (return_val);
+}
+
+/* Allocate and initialize a struct fail_point_setting */
+static struct fail_point_setting *
+fail_point_setting_new(struct fail_point *fp)
+{
+ struct fail_point_setting *fs_new;
+
+ fs_new = fs_malloc();
+ fs_new->fs_parent = fp;
+ TAILQ_INIT(&fs_new->fp_entry_queue);
+ mtx_init(&fs_new->feq_mtx, "fail point entries", NULL, MTX_SPIN);
+
+ fail_point_setting_garbage_append(fs_new);
+
+ return (fs_new);
+}
+
+/* Free a struct fail_point_setting */
+static void
+fail_point_setting_destroy(struct fail_point_setting *fp_setting)
+{
+ struct fail_point_entry *ent;
+
+ while (!TAILQ_EMPTY(&fp_setting->fp_entry_queue)) {
+ ent = TAILQ_FIRST(&fp_setting->fp_entry_queue);
+ TAILQ_REMOVE(&fp_setting->fp_entry_queue, ent, fe_entries);
+ fail_point_entry_destroy(ent);
+ }
+
+ fs_free(fp_setting);
+}
+
+/* Allocate and initialize a struct fail_point_entry */
+static struct fail_point_entry *
+fail_point_entry_new(struct fail_point_setting *fp_setting)
+{
+ struct fail_point_entry *fp_entry;
+
+ fp_entry = fp_malloc(sizeof(struct fail_point_entry),
+ M_WAITOK | M_ZERO);
+ fp_entry->fe_parent = fp_setting->fs_parent;
+ fp_entry->fe_prob = PROB_MAX;
+ fp_entry->fe_pid = NO_PID;
+ fp_entry->fe_count = FE_COUNT_UNTRACKED;
+ TAILQ_INSERT_TAIL(&fp_setting->fp_entry_queue, fp_entry,
+ fe_entries);
+
+ return (fp_entry);
+}
+
+/* Free a struct fail_point_entry */
+static void
+fail_point_entry_destroy(struct fail_point_entry *fp_entry)
+{
+
+ fp_free(fp_entry);
+}
+
+/* Get a ref on an fp's fp_setting */
+static inline struct fail_point_setting *
+fail_point_setting_get_ref(struct fail_point *fp)
+{
+ struct fail_point_setting *fp_setting;
+
+ /* Invariant: if we have a ref, our pointer to fp_setting is safe */
+ atomic_add_acq_32(&fp->fp_ref_cnt, 1);
+ fp_setting = fp->fp_setting;
+
+ return (fp_setting);
+}
+
+/* Release a ref on an fp_setting */
+static inline void
+fail_point_setting_release_ref(struct fail_point *fp)
+{
+
+ KASSERT(&fp->fp_ref_cnt > 0, ("Attempting to deref w/no refs"));
+ atomic_subtract_rel_32(&fp->fp_ref_cnt, 1);
+}
+
+/* Append fp entries to fp garbage list */
+static inline void
+fail_point_setting_garbage_append(struct fail_point_setting *fp_setting)
+{
+
+ mtx_lock_spin(&mtx_garbage_list);
+ STAILQ_INSERT_TAIL(&fp_setting_garbage, fp_setting,
+ fs_garbage_link);
+ mtx_unlock_spin(&mtx_garbage_list);
+}
+
+/* Swap fp's entries with fp_setting_new */
+static struct fail_point_setting *
+fail_point_swap_settings(struct fail_point *fp,
+ struct fail_point_setting *fp_setting_new)
+{
+ struct fail_point_setting *fp_setting_old;
+
+ fp_setting_old = fp->fp_setting;
+ fp->fp_setting = fp_setting_new;
+
+ return (fp_setting_old);
+}
+
+static inline void
+fail_point_eval_swap_out(struct fail_point *fp,
+ struct fail_point_setting *fp_setting)
+{
+
+ /* We may have already been swapped out and replaced; ignore. */
+ if (fp->fp_setting == fp_setting)
+ fail_point_swap_settings(fp, NULL);
+}
+
+/* Free up any zero-ref entries in the garbage queue */
+static void
+fail_point_garbage_collect()
+{
+ struct fail_point_setting *fs_current, *fs_next;
+ struct fail_point_setting_garbage fp_ents_free_list;
+
+ /**
+ * We will transfer the entries to free to fp_ents_free_list while holding
+ * the spin mutex, then free it after we drop the lock. This avoids
+ * triggering witness due to sleepable mutexes in the memory
+ * allocator.
+ */
+ STAILQ_INIT(&fp_ents_free_list);
+
+ mtx_lock_spin(&mtx_garbage_list);
+ STAILQ_FOREACH_SAFE(fs_current, &fp_setting_garbage, fs_garbage_link,
+ fs_next) {
+ if (fs_current->fs_parent->fp_setting != fs_current &&
+ fs_current->fs_parent->fp_ref_cnt == 0) {
+ STAILQ_REMOVE(&fp_setting_garbage, fs_current,
+ fail_point_setting, fs_garbage_link);
+ STAILQ_INSERT_HEAD(&fp_ents_free_list, fs_current,
+ fs_garbage_link);
+ }
+ }
+ mtx_unlock_spin(&mtx_garbage_list);
+
+ STAILQ_FOREACH_SAFE(fs_current, &fp_ents_free_list, fs_garbage_link,
+ fs_next)
+ fail_point_setting_destroy(fs_current);
+}
+
+/* Drain out all refs from this fail point */
+static inline void
+fail_point_drain(struct fail_point *fp, int expected_ref)
+{
+ struct fail_point_setting *entries;
+
+ entries = fail_point_swap_settings(fp, NULL);
+ /**
+ * We have unpaused all threads; so we will wait no longer
+ * than the time taken for the longest remaining sleep, or
+ * the length of time of a long-running code block.
+ */
+ while (fp->fp_ref_cnt > expected_ref) {
+ wakeup(FP_PAUSE_CHANNEL(fp));
+ tsleep(&fp, PWAIT, "fail_point_drain", hz / 100);
+ }
+ fail_point_swap_settings(fp, entries);
+}
+
static inline void
-fail_point_sleep(struct fail_point *fp, struct fail_point_entry *ent,
- int msecs, enum fail_point_return_code *pret)
+fail_point_pause(struct fail_point *fp, enum fail_point_return_code *pret,
+ struct mtx *mtx_sleep)
{
- /* convert from millisecs to ticks, rounding up */
- int timo = ((msecs * hz) + 999) / 1000;
+
+ if (fp->fp_pre_sleep_fn)
+ fp->fp_pre_sleep_fn(fp->fp_pre_sleep_arg);
+
+ msleep_spin(FP_PAUSE_CHANNEL(fp), mtx_sleep, "failpt", 0);
+
+ if (fp->fp_post_sleep_fn)
+ fp->fp_post_sleep_fn(fp->fp_post_sleep_arg);
+}
+
+static inline void
+fail_point_sleep(struct fail_point *fp, int msecs,
+ enum fail_point_return_code *pret)
+{
+ int timo;
+
+ /* Convert from millisecs to ticks, rounding up */
+ timo = howmany(msecs * hz, 1000);
if (timo > 0) {
- if (fp->fp_sleep_fn == NULL) {
- msleep(fp, &g_fp_mtx, PWAIT, "failpt", timo);
+ if (!(fp->fp_flags & FAIL_POINT_USE_TIMEOUT_PATH)) {
+ if (fp->fp_pre_sleep_fn)
+ fp->fp_pre_sleep_fn(fp->fp_pre_sleep_arg);
+
+ tsleep(FP_SLEEP_CHANNEL(fp), PWAIT, "failpt", timo);
+
+ if (fp->fp_post_sleep_fn)
+ fp->fp_post_sleep_fn(fp->fp_post_sleep_arg);
} else {
- timeout(fp->fp_sleep_fn, fp->fp_sleep_arg, timo);
+ if (fp->fp_pre_sleep_fn)
+ fp->fp_pre_sleep_fn(fp->fp_pre_sleep_arg);
+
+ timeout(fp->fp_post_sleep_fn, fp->fp_post_sleep_arg,
+ timo);
*pret = FAIL_POINT_RC_QUEUED;
}
}
}
-
-/**
- * Defines stating the equivalent of probablilty one (100%)
- */
-enum {
- PROB_MAX = 1000000, /* probability between zero and this number */
- PROB_DIGITS = 6, /* number of zero's in above number */
-};
-
-static char *parse_fail_point(struct fail_point_entries *, char *);
-static char *parse_term(struct fail_point_entries *, char *);
+static char *parse_fail_point(struct fail_point_setting *, char *);
+static char *parse_term(struct fail_point_setting *, char *);
static char *parse_number(int *out_units, int *out_decimal, char *);
static char *parse_type(struct fail_point_entry *, char *);
-static void free_entry(struct fail_point_entries *, struct fail_point_entry *);
-static void clear_entries(struct fail_point_entries *);
/**
* Initialize a fail_point. The name is formed in a printf-like fashion
@@ -167,7 +467,7 @@ fail_point_init(struct fail_point *fp, const char *fmt, ...)
char *name;
int n;
- TAILQ_INIT(&fp->fp_entries);
+ fp->fp_setting = NULL;
fp->fp_flags = 0;
/* Figure out the size of the name. */
@@ -185,25 +485,33 @@ fail_point_init(struct fail_point *fp, const char *fmt, ...)
fp->fp_name = name;
fp->fp_location = "";
fp->fp_flags |= FAIL_POINT_DYNAMIC_NAME;
- fp->fp_sleep_fn = NULL;
- fp->fp_sleep_arg = NULL;
+ fp->fp_pre_sleep_fn = NULL;
+ fp->fp_pre_sleep_arg = NULL;
+ fp->fp_post_sleep_fn = NULL;
+ fp->fp_post_sleep_arg = NULL;
}
/**
- * Free the resources held by a fail_point.
- *
+ * Free the resources held by a fail_point, and wake any paused threads.
+ * Thou shalt not allow threads to hit this fail point after you enter this
+ * function, nor shall you call this multiple times for a given fp.
* @ingroup failpoint
*/
void
fail_point_destroy(struct fail_point *fp)
{
+ fail_point_drain(fp, 0);
+
if ((fp->fp_flags & FAIL_POINT_DYNAMIC_NAME) != 0) {
fp_free(__DECONST(void *, fp->fp_name));
fp->fp_name = NULL;
}
fp->fp_flags = 0;
- clear_entries(&fp->fp_entries);
+
+ sx_xlock(&sx_fp_set);
+ fail_point_garbage_collect();
+ sx_xunlock(&sx_fp_set);
}
/**
@@ -216,21 +524,51 @@ fail_point_destroy(struct fail_point *fp)
enum fail_point_return_code
fail_point_eval_nontrivial(struct fail_point *fp, int *return_value)
{
- enum fail_point_return_code ret = FAIL_POINT_RC_CONTINUE;
- struct fail_point_entry *ent, *next;
+ bool execute = false;
+ struct fail_point_entry *ent;
+ struct fail_point_setting *fp_setting;
+ enum fail_point_return_code ret;
+ int cont;
+ int count;
int msecs;
+ int usecs;
+
+ ret = FAIL_POINT_RC_CONTINUE;
+ cont = 0; /* don't continue by default */
+
+ fp_setting = fail_point_setting_get_ref(fp);
+ if (fp_setting == NULL)
+ goto abort;
- FP_LOCK();
+ TAILQ_FOREACH(ent, &fp_setting->fp_entry_queue, fe_entries) {
- TAILQ_FOREACH_SAFE(ent, &fp->fp_entries, fe_entries, next) {
- int cont = 0; /* don't continue by default */
+ if (ent->fe_stale)
+ continue;
if (ent->fe_prob < PROB_MAX &&
ent->fe_prob < random() % PROB_MAX)
continue;
+
if (ent->fe_pid != NO_PID && ent->fe_pid != curproc->p_pid)
continue;
+ if (ent->fe_count != FE_COUNT_UNTRACKED) {
+ count = ent->fe_count;
+ while (count > 0) {
+ if (atomic_cmpset_32(&ent->fe_count, count, count - 1)) {
+ count--;
+ execute = true;
+ break;
+ }
+ count = ent->fe_count;
+ }
+ if (execute == false)
+ /* We lost the race; consider the entry stale and bail now */
+ continue;
+ if (count == 0)
+ ent->fe_stale = true;
+ }
+
switch (ent->fe_type) {
case FAIL_POINT_PANIC:
panic("fail point %s panicking", fp->fp_name);
@@ -244,7 +582,7 @@ fail_point_eval_nontrivial(struct fail_point *fp, int *return_value)
case FAIL_POINT_BREAK:
printf("fail point %s breaking to debugger\n",
- fp->fp_name);
+ fp->fp_name);
breakpoint();
break;
@@ -254,51 +592,95 @@ fail_point_eval_nontrivial(struct fail_point *fp, int *return_value)
break;
case FAIL_POINT_SLEEP:
- /*
- * Free the entry now if necessary, since
- * we're about to drop the mutex and sleep.
- */
msecs = ent->fe_arg;
- if (ent->fe_count > 0 && --ent->fe_count == 0) {
- free_entry(&fp->fp_entries, ent);
- ent = NULL;
- }
-
if (msecs)
- fail_point_sleep(fp, ent, msecs, &ret);
+ fail_point_sleep(fp, msecs, &ret);
+ break;
+
+ case FAIL_POINT_PAUSE:
+ /**
+ * Pausing is inherently strange with multiple
+ * entries given our design. That is because some
+ * entries could be unreachable, for instance in cases like:
+ * pause->return. We can never reach the return entry.
+ * The sysctl layer actually truncates all entries after
+ * a pause for this reason.
+ */
+ mtx_lock_spin(&fp_setting->feq_mtx);
+ fail_point_pause(fp, &ret, &fp_setting->feq_mtx);
+ mtx_unlock_spin(&fp_setting->feq_mtx);
+ break;
+
+ case FAIL_POINT_YIELD:
+ kern_yield(-1);
+ break;
+
+ case FAIL_POINT_DELAY:
+ usecs = ent->fe_arg;
+ DELAY(usecs);
break;
default:
break;
}
- if (ent != NULL && ent->fe_count > 0 && --ent->fe_count == 0)
- free_entry(&fp->fp_entries, ent);
if (cont == 0)
break;
}
- /* Get rid of "off"s at the end. */
- while ((ent = TAILQ_LAST(&fp->fp_entries, fail_point_entries)) &&
- ent->fe_type == FAIL_POINT_OFF)
- free_entry(&fp->fp_entries, ent);
+ if (fail_point_is_off(fp))
+ fail_point_eval_swap_out(fp, fp_setting);
- FP_UNLOCK();
+abort:
+ fail_point_setting_release_ref(fp);
return (ret);
+
}
/**
* Translate internal fail_point structure into human-readable text.
*/
static void
-fail_point_get(struct fail_point *fp, struct sbuf *sb)
+fail_point_get(struct fail_point *fp, struct sbuf *sb,
+ bool verbose)
{
struct fail_point_entry *ent;
+ struct fail_point_setting *fp_setting;
+ struct fail_point_entry *fp_entry_cpy;
+ int cnt_sleeping;
+ int idx;
+ int printed_entry_count;
+
+ cnt_sleeping = 0;
+ idx = 0;
+ printed_entry_count = 0;
+
+ fp_entry_cpy = fp_malloc(sizeof(struct fail_point_entry) *
+ (FP_MAX_ENTRY_COUNT + 1), M_WAITOK);
+
+ fp_setting = fail_point_setting_get_ref(fp);
- FP_LOCK();
+ if (fp_setting != NULL) {
+ TAILQ_FOREACH(ent, &fp_setting->fp_entry_queue, fe_entries) {
+ if (ent->fe_stale)
+ continue;
- TAILQ_FOREACH(ent, &fp->fp_entries, fe_entries) {
+ KASSERT(printed_entry_count < FP_MAX_ENTRY_COUNT,
+ ("FP entry list larger than allowed"));
+
+ fp_entry_cpy[printed_entry_count] = *ent;
+ ++printed_entry_count;
+ }
+ }
+ fail_point_setting_release_ref(fp);
+
+ /* This is our equivalent of a NULL terminator */
+ fp_entry_cpy[printed_entry_count].fe_type = FAIL_POINT_INVALID;
+
+ while (idx < printed_entry_count) {
+ ent = &fp_entry_cpy[idx];
+ ++idx;
if (ent->fe_prob < PROB_MAX) {
int decimal = ent->fe_prob % (PROB_MAX / 100);
int units = ent->fe_prob / (PROB_MAX / 100);
@@ -313,7 +695,7 @@ fail_point_get(struct fail_point *fp, struct sbuf *sb)
}
sbuf_printf(sb, "%%");
}
- if (ent->fe_count > 0)
+ if (ent->fe_count >= 0)
sbuf_printf(sb, "%d*", ent->fe_count);
sbuf_printf(sb, "%s", fail_type_strings[ent->fe_type].name);
if (ent->fe_arg)
@@ -323,10 +705,33 @@ fail_point_get(struct fail_point *fp, struct sbuf *sb)
if (TAILQ_NEXT(ent, fe_entries))
sbuf_printf(sb, "->");
}
- if (TAILQ_EMPTY(&fp->fp_entries))
+ if (!printed_entry_count)
sbuf_printf(sb, "off");
- FP_UNLOCK();
+ fp_free(fp_entry_cpy);
+ if (verbose) {
+#ifdef STACK
+ /* Print number of sleeping threads. queue=0 is the argument
+ * used by msleep when sending our threads to sleep. */
+ sbuf_printf(sb, "\nsleeping_thread_stacks = {\n");
+ sleepq_sbuf_print_stacks(sb, FP_SLEEP_CHANNEL(fp), 0,
+ &cnt_sleeping);
+
+ sbuf_printf(sb, "},\n");
+#endif
+ sbuf_printf(sb, "sleeping_thread_count = %d,\n",
+ cnt_sleeping);
+
+#ifdef STACK
+ sbuf_printf(sb, "paused_thread_stacks = {\n");
+ sleepq_sbuf_print_stacks(sb, FP_PAUSE_CHANNEL(fp), 0,
+ &cnt_sleeping);
+
+ sbuf_printf(sb, "},\n");
+#endif
+ sbuf_printf(sb, "paused_thread_count = %d\n",
+ cnt_sleeping);
+ }
}
/**
@@ -336,38 +741,91 @@ fail_point_get(struct fail_point *fp, struct sbuf *sb)
static int
fail_point_set(struct fail_point *fp, char *buf)
{
- int error = 0;
struct fail_point_entry *ent, *ent_next;
- struct fail_point_entries new_entries;
+ struct fail_point_setting *entries;
+ bool should_wake_paused;
+ bool should_truncate;
+ int error;
+
+ error = 0;
+ should_wake_paused = false;
+ should_truncate = false;
/* Parse new entries. */
- TAILQ_INIT(&new_entries);
- if (!parse_fail_point(&new_entries, buf)) {
- clear_entries(&new_entries);
+ /**
+ * ref protects our new malloc'd stuff from being garbage collected
+ * before we link it.
+ */
+ fail_point_setting_get_ref(fp);
+ entries = fail_point_setting_new(fp);
+ if (parse_fail_point(entries, buf) == NULL) {
+ STAILQ_REMOVE(&fp_setting_garbage, entries,
+ fail_point_setting, fs_garbage_link);
+ fail_point_setting_destroy(entries);
error = EINVAL;
goto end;
}
- FP_LOCK();
-
- /* Move new entries in. */
- TAILQ_SWAP(&fp->fp_entries, &new_entries, fail_point_entry, fe_entries);
- clear_entries(&new_entries);
+ /**
+ * Transfer the entries we are going to keep to a new list.
+ * Get rid of useless zero probability entries, and entries with hit
+ * count 0.
+ * If 'off' is present, and it has no hit count set, then all entries
+ * after it are discarded since they are unreachable.
+ */
+ TAILQ_FOREACH_SAFE(ent, &entries->fp_entry_queue, fe_entries, ent_next) {
+ if (ent->fe_prob == 0 || ent->fe_count == 0) {
+ printf("Discarding entry which cannot execute %s\n",
+ fail_type_strings[ent->fe_type].name);
+ TAILQ_REMOVE(&entries->fp_entry_queue, ent,
+ fe_entries);
+ fp_free(ent);
+ continue;
+ } else if (should_truncate) {
+ printf("Discarding unreachable entry %s\n",
+ fail_type_strings[ent->fe_type].name);
+ TAILQ_REMOVE(&entries->fp_entry_queue, ent,
+ fe_entries);
+ fp_free(ent);
+ continue;
+ }
- /* Get rid of useless zero probability entries. */
- TAILQ_FOREACH_SAFE(ent, &fp->fp_entries, fe_entries, ent_next) {
- if (ent->fe_prob == 0)
- free_entry(&fp->fp_entries, ent);
+ if (ent->fe_type == FAIL_POINT_OFF) {
+ should_wake_paused = true;
+ if (ent->fe_count == FE_COUNT_UNTRACKED) {
+ should_truncate = true;
+ TAILQ_REMOVE(&entries->fp_entry_queue, ent,
+ fe_entries);
+ fp_free(ent);
+ }
+ } else if (ent->fe_type == FAIL_POINT_PAUSE) {
+ should_truncate = true;
+ } else if (ent->fe_type == FAIL_POINT_SLEEP && (fp->fp_flags &
+ FAIL_POINT_NONSLEEPABLE)) {
+ /**
+ * If this fail point is annotated as being in a
+ * non-sleepable ctx, convert sleep to delay and
+ * convert the msec argument to usecs.
+ */
+ printf("Sleep call request on fail point in "
+ "non-sleepable context; using delay instead "
+ "of sleep\n");
+ ent->fe_type = FAIL_POINT_DELAY;
+ ent->fe_arg *= 1000;
+ }
}
- /* Get rid of "off"s at the end. */
- while ((ent = TAILQ_LAST(&fp->fp_entries, fail_point_entries)) &&
- ent->fe_type == FAIL_POINT_OFF)
- free_entry(&fp->fp_entries, ent);
-
- FP_UNLOCK();
+ if (TAILQ_EMPTY(&entries->fp_entry_queue)) {
+ entries = fail_point_swap_settings(fp, NULL);
+ if (entries != NULL)
+ wakeup(FP_PAUSE_CHANNEL(fp));
+ } else {
+ if (should_wake_paused)
+ wakeup(FP_PAUSE_CHANNEL(fp));
+ fail_point_swap_settings(fp, entries);
+ }
- end:
+end:
#ifdef IWARNING
if (error)
IWARNING("Failed to set %s %s to %s",
@@ -377,6 +835,7 @@ fail_point_set(struct fail_point *fp, char *buf)
fp->fp_name, fp->fp_location, buf);
#endif /* IWARNING */
+ fail_point_setting_release_ref(fp);
return (error);
}
@@ -385,25 +844,33 @@ fail_point_set(struct fail_point *fp, char *buf)
/**
* Handle kernel failpoint set/get.
*/
+
int
fail_point_sysctl(SYSCTL_HANDLER_ARGS)
{
- struct fail_point *fp = arg1;
- char *buf = NULL;
+ struct fail_point *fp;
+ char *buf;
+ struct sbuf *sb_check;
struct sbuf sb;
int error;
- /* Retrieving */
- sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND | SBUF_INCLUDENUL);
- fail_point_get(fp, &sb);
- sbuf_trim(&sb);
- error = sbuf_finish(&sb);
- if (error == 0)
- error = SYSCTL_OUT(req, sbuf_data(&sb), sbuf_len(&sb));
- sbuf_delete(&sb);
+ error = 0;
+ fp = arg1;
+ buf = NULL;
+
+ sb_check = sbuf_new(&sb, NULL, 1024, SBUF_AUTOEXTEND);
+ if (sb_check != &sb)
+ return (ENOMEM);
+
+ sbuf_set_drain(&sb, (sbuf_drain_func *)fail_sysctl_drain_func, req);
/* Setting */
- if (!error && req->newptr) {
+ /**
+ * Lock protects any new entries from being garbage collected before we
+ * can link them to the fail point.
+ */
+ sx_xlock(&sx_fp_set);
+ if (req->newptr) {
if (req->newlen > MAX_FAIL_POINT_BUF) {
error = EINVAL;
goto out;
@@ -417,31 +884,95 @@ fail_point_sysctl(SYSCTL_HANDLER_ARGS)
buf[req->newlen] = '\0';
error = fail_point_set(fp, buf);
- }
+ }
+
+ fail_point_garbage_collect();
+ sx_xunlock(&sx_fp_set);
+
+ /* Retrieving. */
+ fail_point_get(fp, &sb, false);
out:
- fp_free(buf);
+ sbuf_finish(&sb);
+ sbuf_delete(&sb);
+
+ if (buf)
+ fp_free(buf);
+
return (error);
}
+int
+fail_point_sysctl_status(SYSCTL_HANDLER_ARGS)
+{
+ struct fail_point *fp;
+ struct sbuf sb, *sb_check;
+
+ fp = arg1;
+
+ sb_check = sbuf_new(&sb, NULL, 1024, SBUF_AUTOEXTEND);
+ if (sb_check != &sb)
+ return (ENOMEM);
+
+ sbuf_set_drain(&sb, (sbuf_drain_func *)fail_sysctl_drain_func, req);
+
+ /* Retrieving. */
+ fail_point_get(fp, &sb, true);
+
+ sbuf_finish(&sb);
+ sbuf_delete(&sb);
+
+ /**
+ * Lock protects any new entries from being garbage collected before we
+ * can link them to the fail point.
+ */
+ sx_xlock(&sx_fp_set);
+ fail_point_garbage_collect();
+ sx_xunlock(&sx_fp_set);
+
+ return (0);
+}
+
+int
+fail_sysctl_drain_func(void *sysctl_args, const char *buf, int len)
+{
+ struct sysctl_req *sa;
+ int error;
+
+ sa = sysctl_args;
+
+ error = SYSCTL_OUT(sa, buf, len);
+
+ if (error == ENOMEM)
+ return (-1);
+ else
+ return (len);
+}
+
+
/**
* Internal helper function to translate a human-readable failpoint string
* into a internally-parsable fail_point structure.
*/
static char *
-parse_fail_point(struct fail_point_entries *ents, char *p)
+parse_fail_point(struct fail_point_setting *ents, char *p)
{
/* <fail_point> ::
* <term> ( "->" <term> )*
*/
+ uint8_t term_count;
+
+ term_count = 1;
+
p = parse_term(ents, p);
if (p == NULL)
return (NULL);
+
while (*p != '\0') {
- if (p[0] != '-' || p[1] != '>')
- return (NULL);
- p = parse_term(ents, p + 2);
- if (p == NULL)
+ term_count++;
+ if (p[0] != '-' || p[1] != '>' ||
+ (p = parse_term(ents, p+2)) == NULL ||
+ term_count > FP_MAX_ENTRY_COUNT)
return (NULL);
}
return (p);
@@ -451,14 +982,11 @@ parse_fail_point(struct fail_point_entries *ents, char *p)
* Internal helper function to parse an individual term from a failpoint.
*/
static char *
-parse_term(struct fail_point_entries *ents, char *p)
+parse_term(struct fail_point_setting *ents, char *p)
{
struct fail_point_entry *ent;
- ent = fp_malloc(sizeof *ent, M_WAITOK | M_ZERO);
- ent->fe_prob = PROB_MAX;
- ent->fe_pid = NO_PID;
- TAILQ_INSERT_TAIL(ents, ent, fe_entries);
+ ent = fail_point_entry_new(ents);
/*
* <term> ::
@@ -483,7 +1011,7 @@ parse_term(struct fail_point_entries *ents, char *p)
if (ent->fe_prob > PROB_MAX)
ent->fe_prob = PROB_MAX;
} else if (*p == '*') {
- if (!units || decimal)
+ if (!units || units < 0 || decimal)
return (NULL);
ent->fe_count = units;
} else
@@ -500,7 +1028,7 @@ parse_term(struct fail_point_entries *ents, char *p)
/* [ "(" <integer> ")" ] */
if (*p != '(')
- return p;
+ return (p);
p++;
if (!isdigit(*p) && *p != '-')
return (NULL);
@@ -509,7 +1037,7 @@ parse_term(struct fail_point_entries *ents, char *p)
return (NULL);
/* [ "[pid " <integer> "]" ] */
-#define PID_STRING "[pid "
+#define PID_STRING "[pid "
if (strncmp(p, PID_STRING, sizeof(PID_STRING) - 1) != 0)
return (p);
p += sizeof(PID_STRING) - 1;
@@ -530,7 +1058,7 @@ parse_number(int *out_units, int *out_decimal, char *p)
{
char *old_p;
- /*
+ /**
* <number> ::
* <integer> [ "." <integer> ] |
* "." <integer>
@@ -584,29 +1112,17 @@ parse_type(struct fail_point_entry *ent, char *beg)
return (NULL);
}
-/**
- * Internal helper function to free an individual failpoint term.
- */
-static void
-free_entry(struct fail_point_entries *ents, struct fail_point_entry *ent)
-{
- TAILQ_REMOVE(ents, ent, fe_entries);
- fp_free(ent);
-}
+/* The fail point sysctl tree. */
+SYSCTL_NODE(_debug, OID_AUTO, fail_point, CTLFLAG_RW, 0, "fail points");
-/**
- * Internal helper function to clear out all failpoint terms for a single
- * failpoint.
- */
-static void
-clear_entries(struct fail_point_entries *ents)
+/* Debugging/testing stuff for fail point */
+static int
+sysctl_test_fail_point(SYSCTL_HANDLER_ARGS)
{
- struct fail_point_entry *ent, *ent_next;
- TAILQ_FOREACH_SAFE(ent, ents, fe_entries, ent_next)
- fp_free(ent);
- TAILQ_INIT(ents);
+ KFAIL_POINT_RETURN(DEBUG_FP, test_fail_point);
+ return (0);
}
-
-/* The fail point sysctl tree. */
-SYSCTL_NODE(_debug, OID_AUTO, fail_point, CTLFLAG_RW, 0, "fail points");
+SYSCTL_OID(_debug_fail_point, OID_AUTO, test_trigger_fail_point,
+ CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_test_fail_point, "A",
+ "Trigger test fail points");
diff --git a/sys/kern/kern_linker.c b/sys/kern/kern_linker.c
index 7454d79..e204291 100644
--- a/sys/kern/kern_linker.c
+++ b/sys/kern/kern_linker.c
@@ -1763,8 +1763,6 @@ linker_hints_lookup(const char *path, int pathlen, const char *modname,
goto bad;
}
hints = malloc(vattr.va_size, M_TEMP, M_WAITOK);
- if (hints == NULL)
- goto bad;
error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)hints, vattr.va_size, 0,
UIO_SYSSPACE, IO_NODELOCKED, cred, NOCRED, &reclen, td);
if (error)
diff --git a/sys/kern/kern_mbuf.c b/sys/kern/kern_mbuf.c
index 3074589..5e634d7 100644
--- a/sys/kern/kern_mbuf.c
+++ b/sys/kern/kern_mbuf.c
@@ -424,6 +424,7 @@ mb_ctor_mbuf(void *mem, int size, void *arg, int how)
m = (struct mbuf *)mem;
flags = args->flags;
+ MPASS((flags & M_NOFREE) == 0);
error = m_init(m, how, type, flags);
@@ -572,6 +573,7 @@ mb_ctor_pack(void *mem, int size, void *arg, int how)
args = (struct mb_args *)arg;
flags = args->flags;
type = args->type;
+ MPASS((flags & M_NOFREE) == 0);
#ifdef INVARIANTS
trash_ctor(m->m_ext.ext_buf, MCLBYTES, arg, how);
@@ -731,6 +733,7 @@ m_clget(struct mbuf *m, int how)
zone_drain(zone_pack);
uma_zalloc_arg(zone_clust, m, how);
}
+ MBUF_PROBE2(m__clget, m, how);
return (m->m_flags & M_EXT);
}
@@ -745,6 +748,7 @@ void *
m_cljget(struct mbuf *m, int how, int size)
{
uma_zone_t zone;
+ void *retval;
if (m != NULL) {
KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT",
@@ -753,7 +757,11 @@ m_cljget(struct mbuf *m, int how, int size)
}
zone = m_getzone(size);
- return (uma_zalloc_arg(zone, m, how));
+ retval = uma_zalloc_arg(zone, m, how);
+
+ MBUF_PROBE4(m__cljget, m, how, size, retval);
+
+ return (retval);
}
/*
@@ -934,6 +942,7 @@ void
m_freem(struct mbuf *mb)
{
+ MBUF_PROBE1(m__freem, mb);
while (mb != NULL)
mb = m_free(mb);
}
diff --git a/sys/kern/kern_osd.c b/sys/kern/kern_osd.c
index cc9bed1..41d518f 100644
--- a/sys/kern/kern_osd.c
+++ b/sys/kern/kern_osd.c
@@ -44,6 +44,23 @@ __FBSDID("$FreeBSD$");
/* OSD (Object Specific Data) */
+/*
+ * Lock key:
+ * (m) osd_module_lock
+ * (o) osd_object_lock
+ * (l) osd_list_lock
+ */
+struct osd_master {
+ struct sx osd_module_lock;
+ struct rmlock osd_object_lock;
+ struct mtx osd_list_lock;
+ LIST_HEAD(, osd) osd_list; /* (l) */
+ osd_destructor_t *osd_destructors; /* (o) */
+ osd_method_t *osd_methods; /* (m) */
+ u_int osd_ntslots; /* (m) */
+ const u_int osd_nmethods;
+};
+
static MALLOC_DEFINE(M_OSD, "osd", "Object Specific Data");
static int osd_debug = 0;
@@ -61,25 +78,12 @@ static void do_osd_del(u_int type, struct osd *osd, u_int slot,
int list_locked);
/*
- * Lists of objects with OSD.
- *
- * Lock key:
- * (m) osd_module_lock
- * (o) osd_object_lock
- * (l) osd_list_lock
+ * List of objects with OSD.
*/
-static LIST_HEAD(, osd) osd_list[OSD_LAST + 1]; /* (m) */
-static osd_method_t *osd_methods[OSD_LAST + 1]; /* (m) */
-static u_int osd_nslots[OSD_LAST + 1]; /* (m) */
-static osd_destructor_t *osd_destructors[OSD_LAST + 1]; /* (o) */
-static const u_int osd_nmethods[OSD_LAST + 1] = {
- [OSD_JAIL] = PR_MAXMETHOD,
+struct osd_master osdm[OSD_LAST + 1] = {
+ [OSD_JAIL] = { .osd_nmethods = PR_MAXMETHOD },
};
-static struct sx osd_module_lock[OSD_LAST + 1];
-static struct rmlock osd_object_lock[OSD_LAST + 1];
-static struct mtx osd_list_lock[OSD_LAST + 1];
-
static void
osd_default_destructor(void *value __unused)
{
@@ -101,12 +105,12 @@ osd_register(u_int type, osd_destructor_t destructor, osd_method_t *methods)
if (destructor == NULL)
destructor = osd_default_destructor;
- sx_xlock(&osd_module_lock[type]);
+ sx_xlock(&osdm[type].osd_module_lock);
/*
* First, we try to find unused slot.
*/
- for (i = 0; i < osd_nslots[type]; i++) {
- if (osd_destructors[type][i] == NULL) {
+ for (i = 0; i < osdm[type].osd_ntslots; i++) {
+ if (osdm[type].osd_destructors[i] == NULL) {
OSD_DEBUG("Unused slot found (type=%u, slot=%u).",
type, i);
break;
@@ -115,31 +119,31 @@ osd_register(u_int type, osd_destructor_t destructor, osd_method_t *methods)
/*
* If no unused slot was found, allocate one.
*/
- if (i == osd_nslots[type]) {
- osd_nslots[type]++;
- if (osd_nmethods[type] != 0)
- osd_methods[type] = realloc(osd_methods[type],
- sizeof(osd_method_t) * osd_nslots[type] *
- osd_nmethods[type], M_OSD, M_WAITOK);
- newptr = malloc(sizeof(osd_destructor_t) * osd_nslots[type],
- M_OSD, M_WAITOK);
- rm_wlock(&osd_object_lock[type]);
- bcopy(osd_destructors[type], newptr,
+ if (i == osdm[type].osd_ntslots) {
+ osdm[type].osd_ntslots++;
+ if (osdm[type].osd_nmethods != 0)
+ osdm[type].osd_methods = realloc(osdm[type].osd_methods,
+ sizeof(osd_method_t) * osdm[type].osd_ntslots *
+ osdm[type].osd_nmethods, M_OSD, M_WAITOK);
+ newptr = malloc(sizeof(osd_destructor_t) *
+ osdm[type].osd_ntslots, M_OSD, M_WAITOK);
+ rm_wlock(&osdm[type].osd_object_lock);
+ bcopy(osdm[type].osd_destructors, newptr,
sizeof(osd_destructor_t) * i);
- free(osd_destructors[type], M_OSD);
- osd_destructors[type] = newptr;
- rm_wunlock(&osd_object_lock[type]);
+ free(osdm[type].osd_destructors, M_OSD);
+ osdm[type].osd_destructors = newptr;
+ rm_wunlock(&osdm[type].osd_object_lock);
OSD_DEBUG("New slot allocated (type=%u, slot=%u).",
type, i + 1);
}
- osd_destructors[type][i] = destructor;
- if (osd_nmethods[type] != 0) {
- for (m = 0; m < osd_nmethods[type]; m++)
- osd_methods[type][i * osd_nmethods[type] + m] =
- methods != NULL ? methods[m] : NULL;
+ osdm[type].osd_destructors[i] = destructor;
+ if (osdm[type].osd_nmethods != 0) {
+ for (m = 0; m < osdm[type].osd_nmethods; m++)
+ osdm[type].osd_methods[i * osdm[type].osd_nmethods + m]
+ = methods != NULL ? methods[m] : NULL;
}
- sx_xunlock(&osd_module_lock[type]);
+ sx_xunlock(&osdm[type].osd_module_lock);
return (i + 1);
}
@@ -150,105 +154,142 @@ osd_deregister(u_int type, u_int slot)
KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type."));
KASSERT(slot > 0, ("Invalid slot."));
- KASSERT(osd_destructors[type][slot - 1] != NULL, ("Unused slot."));
+ KASSERT(osdm[type].osd_destructors[slot - 1] != NULL, ("Unused slot."));
- sx_xlock(&osd_module_lock[type]);
- rm_wlock(&osd_object_lock[type]);
+ sx_xlock(&osdm[type].osd_module_lock);
+ rm_wlock(&osdm[type].osd_object_lock);
/*
* Free all OSD for the given slot.
*/
- mtx_lock(&osd_list_lock[type]);
- LIST_FOREACH_SAFE(osd, &osd_list[type], osd_next, tosd)
+ mtx_lock(&osdm[type].osd_list_lock);
+ LIST_FOREACH_SAFE(osd, &osdm[type].osd_list, osd_next, tosd)
do_osd_del(type, osd, slot, 1);
- mtx_unlock(&osd_list_lock[type]);
+ mtx_unlock(&osdm[type].osd_list_lock);
/*
* Set destructor to NULL to free the slot.
*/
- osd_destructors[type][slot - 1] = NULL;
- if (slot == osd_nslots[type]) {
- osd_nslots[type]--;
- osd_destructors[type] = realloc(osd_destructors[type],
- sizeof(osd_destructor_t) * osd_nslots[type], M_OSD,
+ osdm[type].osd_destructors[slot - 1] = NULL;
+ if (slot == osdm[type].osd_ntslots) {
+ osdm[type].osd_ntslots--;
+ osdm[type].osd_destructors = realloc(osdm[type].osd_destructors,
+ sizeof(osd_destructor_t) * osdm[type].osd_ntslots, M_OSD,
M_NOWAIT | M_ZERO);
- if (osd_nmethods[type] != 0)
- osd_methods[type] = realloc(osd_methods[type],
- sizeof(osd_method_t) * osd_nslots[type] *
- osd_nmethods[type], M_OSD, M_NOWAIT | M_ZERO);
+ if (osdm[type].osd_nmethods != 0)
+ osdm[type].osd_methods = realloc(osdm[type].osd_methods,
+ sizeof(osd_method_t) * osdm[type].osd_ntslots *
+ osdm[type].osd_nmethods, M_OSD, M_NOWAIT | M_ZERO);
/*
* We always reallocate to smaller size, so we assume it will
* always succeed.
*/
- KASSERT(osd_destructors[type] != NULL &&
- (osd_nmethods[type] == 0 || osd_methods[type] != NULL),
- ("realloc() failed"));
+ KASSERT(osdm[type].osd_destructors != NULL &&
+ (osdm[type].osd_nmethods == 0 ||
+ osdm[type].osd_methods != NULL), ("realloc() failed"));
OSD_DEBUG("Deregistration of the last slot (type=%u, slot=%u).",
type, slot);
} else {
OSD_DEBUG("Slot deregistration (type=%u, slot=%u).",
type, slot);
}
- rm_wunlock(&osd_object_lock[type]);
- sx_xunlock(&osd_module_lock[type]);
+ rm_wunlock(&osdm[type].osd_object_lock);
+ sx_xunlock(&osdm[type].osd_module_lock);
}
int
osd_set(u_int type, struct osd *osd, u_int slot, void *value)
{
+
+ return (osd_set_reserved(type, osd, slot, NULL, value));
+}
+
+void *
+osd_reserve(u_int slot)
+{
+
+ KASSERT(slot > 0, ("Invalid slot."));
+
+ OSD_DEBUG("Reserving slot array (slot=%u).", slot);
+ return (malloc(sizeof(void *) * slot, M_OSD, M_WAITOK | M_ZERO));
+}
+
+int
+osd_set_reserved(u_int type, struct osd *osd, u_int slot, void *rsv,
+ void *value)
+{
struct rm_priotracker tracker;
KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type."));
KASSERT(slot > 0, ("Invalid slot."));
- KASSERT(osd_destructors[type][slot - 1] != NULL, ("Unused slot."));
+ KASSERT(osdm[type].osd_destructors[slot - 1] != NULL, ("Unused slot."));
- rm_rlock(&osd_object_lock[type], &tracker);
+ rm_rlock(&osdm[type].osd_object_lock, &tracker);
if (slot > osd->osd_nslots) {
+ void *newptr;
+
if (value == NULL) {
OSD_DEBUG(
"Not allocating null slot (type=%u, slot=%u).",
type, slot);
- rm_runlock(&osd_object_lock[type], &tracker);
+ rm_runlock(&osdm[type].osd_object_lock, &tracker);
+ if (rsv)
+ osd_free_reserved(rsv);
return (0);
- } else if (osd->osd_nslots == 0) {
+ }
+
+ /*
+ * Too few slots allocated here, so we need to extend or create
+ * the array.
+ */
+ if (rsv) {
/*
- * First OSD for this object, so we need to allocate
- * space and put it onto the list.
+ * Use the reserve passed in (assumed to be
+ * the right size).
*/
- osd->osd_slots = malloc(sizeof(void *) * slot, M_OSD,
- M_NOWAIT | M_ZERO);
- if (osd->osd_slots == NULL) {
- rm_runlock(&osd_object_lock[type], &tracker);
- return (ENOMEM);
+ newptr = rsv;
+ if (osd->osd_nslots != 0) {
+ memcpy(newptr, osd->osd_slots,
+ sizeof(void *) * osd->osd_nslots);
+ free(osd->osd_slots, M_OSD);
}
- osd->osd_nslots = slot;
- mtx_lock(&osd_list_lock[type]);
- LIST_INSERT_HEAD(&osd_list[type], osd, osd_next);
- mtx_unlock(&osd_list_lock[type]);
- OSD_DEBUG("Setting first slot (type=%u).", type);
} else {
- void *newptr;
-
- /*
- * Too few slots allocated here, needs to extend
- * the array.
- */
newptr = realloc(osd->osd_slots, sizeof(void *) * slot,
M_OSD, M_NOWAIT | M_ZERO);
if (newptr == NULL) {
- rm_runlock(&osd_object_lock[type], &tracker);
+ rm_runlock(&osdm[type].osd_object_lock,
+ &tracker);
return (ENOMEM);
}
- osd->osd_slots = newptr;
- osd->osd_nslots = slot;
- OSD_DEBUG("Growing slots array (type=%u).", type);
}
- }
+ if (osd->osd_nslots == 0) {
+ /*
+ * First OSD for this object, so we need to put it
+ * onto the list.
+ */
+ mtx_lock(&osdm[type].osd_list_lock);
+ LIST_INSERT_HEAD(&osdm[type].osd_list, osd, osd_next);
+ mtx_unlock(&osdm[type].osd_list_lock);
+ OSD_DEBUG("Setting first slot (type=%u).", type);
+ } else
+ OSD_DEBUG("Growing slots array (type=%u).", type);
+ osd->osd_slots = newptr;
+ osd->osd_nslots = slot;
+ } else if (rsv)
+ osd_free_reserved(rsv);
OSD_DEBUG("Setting slot value (type=%u, slot=%u, value=%p).", type,
slot, value);
osd->osd_slots[slot - 1] = value;
- rm_runlock(&osd_object_lock[type], &tracker);
+ rm_runlock(&osdm[type].osd_object_lock, &tracker);
return (0);
}
+void
+osd_free_reserved(void *rsv)
+{
+
+ OSD_DEBUG("Discarding reserved slot array.");
+ free(rsv, M_OSD);
+}
+
void *
osd_get(u_int type, struct osd *osd, u_int slot)
{
@@ -257,9 +298,9 @@ osd_get(u_int type, struct osd *osd, u_int slot)
KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type."));
KASSERT(slot > 0, ("Invalid slot."));
- KASSERT(osd_destructors[type][slot - 1] != NULL, ("Unused slot."));
+ KASSERT(osdm[type].osd_destructors[slot - 1] != NULL, ("Unused slot."));
- rm_rlock(&osd_object_lock[type], &tracker);
+ rm_rlock(&osdm[type].osd_object_lock, &tracker);
if (slot > osd->osd_nslots) {
value = NULL;
OSD_DEBUG("Slot doesn't exist (type=%u, slot=%u).", type, slot);
@@ -268,7 +309,7 @@ osd_get(u_int type, struct osd *osd, u_int slot)
OSD_DEBUG("Returning slot value (type=%u, slot=%u, value=%p).",
type, slot, value);
}
- rm_runlock(&osd_object_lock[type], &tracker);
+ rm_runlock(&osdm[type].osd_object_lock, &tracker);
return (value);
}
@@ -277,9 +318,9 @@ osd_del(u_int type, struct osd *osd, u_int slot)
{
struct rm_priotracker tracker;
- rm_rlock(&osd_object_lock[type], &tracker);
+ rm_rlock(&osdm[type].osd_object_lock, &tracker);
do_osd_del(type, osd, slot, 0);
- rm_runlock(&osd_object_lock[type], &tracker);
+ rm_runlock(&osdm[type].osd_object_lock, &tracker);
}
static void
@@ -289,7 +330,7 @@ do_osd_del(u_int type, struct osd *osd, u_int slot, int list_locked)
KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type."));
KASSERT(slot > 0, ("Invalid slot."));
- KASSERT(osd_destructors[type][slot - 1] != NULL, ("Unused slot."));
+ KASSERT(osdm[type].osd_destructors[slot - 1] != NULL, ("Unused slot."));
OSD_DEBUG("Deleting slot (type=%u, slot=%u).", type, slot);
@@ -298,7 +339,7 @@ do_osd_del(u_int type, struct osd *osd, u_int slot, int list_locked)
return;
}
if (osd->osd_slots[slot - 1] != NULL) {
- osd_destructors[type][slot - 1](osd->osd_slots[slot - 1]);
+ osdm[type].osd_destructors[slot - 1](osd->osd_slots[slot - 1]);
osd->osd_slots[slot - 1] = NULL;
}
for (i = osd->osd_nslots - 1; i >= 0; i--) {
@@ -312,10 +353,10 @@ do_osd_del(u_int type, struct osd *osd, u_int slot, int list_locked)
/* No values left for this object. */
OSD_DEBUG("No more slots left (type=%u).", type);
if (!list_locked)
- mtx_lock(&osd_list_lock[type]);
+ mtx_lock(&osdm[type].osd_list_lock);
LIST_REMOVE(osd, osd_next);
if (!list_locked)
- mtx_unlock(&osd_list_lock[type]);
+ mtx_unlock(&osdm[type].osd_list_lock);
free(osd->osd_slots, M_OSD);
osd->osd_slots = NULL;
osd->osd_nslots = 0;
@@ -341,21 +382,21 @@ osd_call(u_int type, u_int method, void *obj, void *data)
int error, i;
KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type."));
- KASSERT(method < osd_nmethods[type], ("Invalid method."));
+ KASSERT(method < osdm[type].osd_nmethods, ("Invalid method."));
/*
* Call this method for every slot that defines it, stopping if an
* error is encountered.
*/
error = 0;
- sx_slock(&osd_module_lock[type]);
- for (i = 0; i < osd_nslots[type]; i++) {
- methodfun =
- osd_methods[type][i * osd_nmethods[type] + method];
+ sx_slock(&osdm[type].osd_module_lock);
+ for (i = 0; i < osdm[type].osd_ntslots; i++) {
+ methodfun = osdm[type].osd_methods[i * osdm[type].osd_nmethods +
+ method];
if (methodfun != NULL && (error = methodfun(obj, data)) != 0)
break;
}
- sx_sunlock(&osd_module_lock[type]);
+ sx_sunlock(&osdm[type].osd_module_lock);
return (error);
}
@@ -373,14 +414,14 @@ osd_exit(u_int type, struct osd *osd)
return;
}
- rm_rlock(&osd_object_lock[type], &tracker);
+ rm_rlock(&osdm[type].osd_object_lock, &tracker);
for (i = 1; i <= osd->osd_nslots; i++) {
- if (osd_destructors[type][i - 1] != NULL)
+ if (osdm[type].osd_destructors[i - 1] != NULL)
do_osd_del(type, osd, i, 0);
else
OSD_DEBUG("Unused slot (type=%u, slot=%u).", type, i);
}
- rm_runlock(&osd_object_lock[type], &tracker);
+ rm_runlock(&osdm[type].osd_object_lock, &tracker);
OSD_DEBUG("Object exit (type=%u).", type);
}
@@ -390,13 +431,13 @@ osd_init(void *arg __unused)
u_int i;
for (i = OSD_FIRST; i <= OSD_LAST; i++) {
- osd_nslots[i] = 0;
- LIST_INIT(&osd_list[i]);
- sx_init(&osd_module_lock[i], "osd_module");
- rm_init(&osd_object_lock[i], "osd_object");
- mtx_init(&osd_list_lock[i], "osd_list", NULL, MTX_DEF);
- osd_destructors[i] = NULL;
- osd_methods[i] = NULL;
+ sx_init(&osdm[i].osd_module_lock, "osd_module");
+ rm_init(&osdm[i].osd_object_lock, "osd_object");
+ mtx_init(&osdm[i].osd_list_lock, "osd_list", NULL, MTX_DEF);
+ LIST_INIT(&osdm[i].osd_list);
+ osdm[i].osd_destructors = NULL;
+ osdm[i].osd_ntslots = 0;
+ osdm[i].osd_methods = NULL;
}
}
SYSINIT(osd, SI_SUB_LOCK, SI_ORDER_ANY, osd_init, NULL);
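
A hedged sketch of the reservation pattern the new osd_reserve()/osd_set_reserved() pair is meant for; everything except those functions, osd_free_reserved(), and OSD_JAIL is hypothetical. The slot array is pre-allocated while sleeping is still allowed, and the value is installed later under a lock where the M_NOWAIT realloc() fallback used by plain osd_set() could fail:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/osd.h>

/* Hypothetical object carrying per-object data in an OSD slot. */
struct my_obj {
	struct mtx	mo_lock;
	struct osd	mo_osd;
};

static int
my_obj_attach_data(struct my_obj *obj, u_int slot, void *data)
{
	void *rsv;
	int error;

	rsv = osd_reserve(slot);		/* M_WAITOK; may sleep */
	mtx_lock(&obj->mo_lock);
	/*
	 * Consumes rsv, or frees it via osd_free_reserved() when the
	 * existing slot array is already large enough.
	 */
	error = osd_set_reserved(OSD_JAIL, &obj->mo_osd, slot, rsv, data);
	mtx_unlock(&obj->mo_lock);
	return (error);
}
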
diff --git a/sys/kern/kern_racct.c b/sys/kern/kern_racct.c
index 1c3c9d7..419c69e 100644
--- a/sys/kern/kern_racct.c
+++ b/sys/kern/kern_racct.c
@@ -114,6 +114,8 @@ SDT_PROBE_DEFINE3(racct, , rusage, set,
"struct proc *", "int", "uint64_t");
SDT_PROBE_DEFINE3(racct, , rusage, set__failure,
"struct proc *", "int", "uint64_t");
+SDT_PROBE_DEFINE3(racct, , rusage, set__force,
+ "struct proc *", "int", "uint64_t");
SDT_PROBE_DEFINE3(racct, , rusage, sub,
"struct proc *", "int", "uint64_t");
SDT_PROBE_DEFINE3(racct, , rusage, sub__cred,
@@ -532,7 +534,7 @@ racct_adjust_resource(struct racct *racct, int resource,
}
static int
-racct_add_locked(struct proc *p, int resource, uint64_t amount)
+racct_add_locked(struct proc *p, int resource, uint64_t amount, int force)
{
#ifdef RCTL
int error;
@@ -540,8 +542,6 @@ racct_add_locked(struct proc *p, int resource, uint64_t amount)
ASSERT_RACCT_ENABLED();
- SDT_PROBE3(racct, , rusage, add, p, resource, amount);
-
/*
* We need proc lock to dereference p->p_ucred.
*/
@@ -549,7 +549,7 @@ racct_add_locked(struct proc *p, int resource, uint64_t amount)
#ifdef RCTL
error = rctl_enforce(p, resource, amount);
- if (error && RACCT_IS_DENIABLE(resource)) {
+ if (error && !force && RACCT_IS_DENIABLE(resource)) {
SDT_PROBE3(racct, , rusage, add__failure, p, resource, amount);
return (error);
}
@@ -572,12 +572,32 @@ racct_add(struct proc *p, int resource, uint64_t amount)
if (!racct_enable)
return (0);
+ SDT_PROBE3(racct, , rusage, add, p, resource, amount);
+
mtx_lock(&racct_lock);
- error = racct_add_locked(p, resource, amount);
+ error = racct_add_locked(p, resource, amount, 0);
mtx_unlock(&racct_lock);
return (error);
}
+/*
+ * Increase allocation of 'resource' by 'amount' for process 'p'.
+ * Doesn't check for limits and never fails.
+ */
+void
+racct_add_force(struct proc *p, int resource, uint64_t amount)
+{
+
+ if (!racct_enable)
+ return;
+
+ SDT_PROBE3(racct, , rusage, add__force, p, resource, amount);
+
+ mtx_lock(&racct_lock);
+ racct_add_locked(p, resource, amount, 1);
+ mtx_unlock(&racct_lock);
+}
+
static void
racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount)
{
@@ -597,8 +617,6 @@ racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount)
/*
* Increase allocation of 'resource' by 'amount' for credential 'cred'.
* Doesn't check for limits and never fails.
- *
- * XXX: Shouldn't this ever return an error?
*/
void
racct_add_cred(struct ucred *cred, int resource, uint64_t amount)
@@ -612,32 +630,8 @@ racct_add_cred(struct ucred *cred, int resource, uint64_t amount)
mtx_unlock(&racct_lock);
}
-/*
- * Increase allocation of 'resource' by 'amount' for process 'p'.
- * Doesn't check for limits and never fails.
- */
-void
-racct_add_force(struct proc *p, int resource, uint64_t amount)
-{
-
- if (!racct_enable)
- return;
-
- SDT_PROBE3(racct, , rusage, add__force, p, resource, amount);
-
- /*
- * We need proc lock to dereference p->p_ucred.
- */
- PROC_LOCK_ASSERT(p, MA_OWNED);
-
- mtx_lock(&racct_lock);
- racct_adjust_resource(p->p_racct, resource, amount);
- racct_add_cred_locked(p->p_ucred, resource, amount);
- mtx_unlock(&racct_lock);
-}
-
static int
-racct_set_locked(struct proc *p, int resource, uint64_t amount)
+racct_set_locked(struct proc *p, int resource, uint64_t amount, int force)
{
int64_t old_amount, decayed_amount;
int64_t diff_proc, diff_cred;
@@ -647,8 +641,6 @@ racct_set_locked(struct proc *p, int resource, uint64_t amount)
ASSERT_RACCT_ENABLED();
- SDT_PROBE3(racct, , rusage, set, p, resource, amount);
-
/*
* We need proc lock to dereference p->p_ucred.
*/
@@ -678,7 +670,7 @@ racct_set_locked(struct proc *p, int resource, uint64_t amount)
#ifdef RCTL
if (diff_proc > 0) {
error = rctl_enforce(p, resource, diff_proc);
- if (error && RACCT_IS_DENIABLE(resource)) {
+ if (error && !force && RACCT_IS_DENIABLE(resource)) {
SDT_PROBE3(racct, , rusage, set__failure, p, resource,
amount);
return (error);
@@ -709,51 +701,14 @@ racct_set(struct proc *p, int resource, uint64_t amount)
if (!racct_enable)
return (0);
+ SDT_PROBE3(racct, , rusage, set__force, p, resource, amount);
+
mtx_lock(&racct_lock);
- error = racct_set_locked(p, resource, amount);
+ error = racct_set_locked(p, resource, amount, 0);
mtx_unlock(&racct_lock);
return (error);
}
-static void
-racct_set_force_locked(struct proc *p, int resource, uint64_t amount)
-{
- int64_t old_amount, decayed_amount;
- int64_t diff_proc, diff_cred;
-
- ASSERT_RACCT_ENABLED();
-
- SDT_PROBE3(racct, , rusage, set, p, resource, amount);
-
- /*
- * We need proc lock to dereference p->p_ucred.
- */
- PROC_LOCK_ASSERT(p, MA_OWNED);
-
- old_amount = p->p_racct->r_resources[resource];
- /*
- * The diffs may be negative.
- */
- diff_proc = amount - old_amount;
- if (RACCT_IS_DECAYING(resource)) {
- /*
- * Resources in per-credential racct containers may decay.
- * If this is the case, we need to calculate the difference
- * between the new amount and the proportional value of the
- * old amount that has decayed in the ucred racct containers.
- */
- decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE;
- diff_cred = amount - decayed_amount;
- } else
- diff_cred = diff_proc;
-
- racct_adjust_resource(p->p_racct, resource, diff_proc);
- if (diff_cred > 0)
- racct_add_cred_locked(p->p_ucred, resource, diff_cred);
- else if (diff_cred < 0)
- racct_sub_cred_locked(p->p_ucred, resource, -diff_cred);
-}
-
void
racct_set_force(struct proc *p, int resource, uint64_t amount)
{
@@ -761,8 +716,10 @@ racct_set_force(struct proc *p, int resource, uint64_t amount)
if (!racct_enable)
return;
+ SDT_PROBE3(racct, , rusage, set, p, resource, amount);
+
mtx_lock(&racct_lock);
- racct_set_force_locked(p, resource, amount);
+ racct_set_locked(p, resource, amount, 1);
mtx_unlock(&racct_lock);
}
@@ -930,13 +887,13 @@ racct_proc_fork(struct proc *parent, struct proc *child)
continue;
error = racct_set_locked(child, i,
- parent->p_racct->r_resources[i]);
+ parent->p_racct->r_resources[i], 0);
if (error != 0)
goto out;
}
- error = racct_add_locked(child, RACCT_NPROC, 1);
- error += racct_add_locked(child, RACCT_NTHR, 1);
+ error = racct_add_locked(child, RACCT_NPROC, 1, 0);
+ error += racct_add_locked(child, RACCT_NTHR, 1, 0);
out:
mtx_unlock(&racct_lock);
@@ -1002,7 +959,7 @@ racct_proc_exit(struct proc *p)
pct = racct_getpcpu(p, pct_estimate);
mtx_lock(&racct_lock);
- racct_set_locked(p, RACCT_CPU, runtime);
+ racct_set_locked(p, RACCT_CPU, runtime, 0);
racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct);
for (i = 0; i <= RACCT_MAX; i++) {
@@ -1010,7 +967,7 @@ racct_proc_exit(struct proc *p)
continue;
if (!RACCT_IS_RECLAIMABLE(i))
continue;
- racct_set_locked(p, i, 0);
+ racct_set_locked(p, i, 0, 0);
}
mtx_unlock(&racct_lock);
@@ -1150,23 +1107,21 @@ racct_proc_wakeup(struct proc *p)
}
static void
-racct_decay_resource(struct racct *racct, void * res, void* dummy)
+racct_decay_callback(struct racct *racct, void *dummy1, void *dummy2)
{
- int resource;
int64_t r_old, r_new;
ASSERT_RACCT_ENABLED();
mtx_assert(&racct_lock, MA_OWNED);
- resource = *(int *)res;
- r_old = racct->r_resources[resource];
+ r_old = racct->r_resources[RACCT_PCTCPU];
/* If there is nothing to decay, just exit. */
if (r_old <= 0)
return;
r_new = r_old * RACCT_DECAY_FACTOR / FSCALE;
- racct->r_resources[resource] = r_new;
+ racct->r_resources[RACCT_PCTCPU] = r_new;
}
static void
@@ -1184,17 +1139,17 @@ racct_decay_post(void)
}
static void
-racct_decay(int resource)
+racct_decay(void)
{
ASSERT_RACCT_ENABLED();
- ui_racct_foreach(racct_decay_resource, racct_decay_pre,
- racct_decay_post, &resource, NULL);
- loginclass_racct_foreach(racct_decay_resource, racct_decay_pre,
- racct_decay_post, &resource, NULL);
- prison_racct_foreach(racct_decay_resource, racct_decay_pre,
- racct_decay_post, &resource, NULL);
+ ui_racct_foreach(racct_decay_callback, racct_decay_pre,
+ racct_decay_post, NULL, NULL);
+ loginclass_racct_foreach(racct_decay_callback, racct_decay_pre,
+ racct_decay_post, NULL, NULL);
+ prison_racct_foreach(racct_decay_callback, racct_decay_pre,
+ racct_decay_post, NULL, NULL);
}
static void
@@ -1209,7 +1164,7 @@ racctd(void)
ASSERT_RACCT_ENABLED();
for (;;) {
- racct_decay(RACCT_PCTCPU);
+ racct_decay();
sx_slock(&allproc_lock);
@@ -1249,11 +1204,11 @@ racctd(void)
pct_estimate = 0;
pct = racct_getpcpu(p, pct_estimate);
mtx_lock(&racct_lock);
- racct_set_force_locked(p, RACCT_PCTCPU, pct);
- racct_set_locked(p, RACCT_CPU, runtime);
+ racct_set_locked(p, RACCT_PCTCPU, pct, 1);
+ racct_set_locked(p, RACCT_CPU, runtime, 0);
racct_set_locked(p, RACCT_WALLCLOCK,
(uint64_t)wallclock.tv_sec * 1000000 +
- wallclock.tv_usec);
+ wallclock.tv_usec, 0);
mtx_unlock(&racct_lock);
PROC_UNLOCK(p);
}
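For reference, the %CPU decay applied by racct_decay_callback() above is plain fixed-point arithmetic: the stored value is multiplied by RACCT_DECAY_FACTOR and divided by FSCALE on every racctd() pass. A minimal userland sketch of one decay step follows; the FSCALE and RACCT_DECAY_FACTOR values below are illustrative assumptions, not the kernel's definitions.

#include <stdint.h>
#include <stdio.h>

/* Illustrative values only; the kernel defines its own FSCALE and
 * RACCT_DECAY_FACTOR, which may differ. */
#define FSCALE			2048
#define RACCT_DECAY_FACTOR	(30 * FSCALE / 100)	/* roughly 30% */

int
main(void)
{
	int64_t r_old = 1000000;	/* pretend %CPU accounting value */
	int64_t r_new;

	/* Same shape as racct_decay_callback(): scale by the fixed-point
	 * decay factor, skipping values that are already <= 0. */
	if (r_old > 0) {
		r_new = r_old * RACCT_DECAY_FACTOR / FSCALE;
		printf("decayed from %jd to %jd\n", (intmax_t)r_old,
		    (intmax_t)r_new);
	}
	return (0);
}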
diff --git a/sys/kern/kern_rctl.c b/sys/kern/kern_rctl.c
index b0d3fd6..1970ed8 100644
--- a/sys/kern/kern_rctl.c
+++ b/sys/kern/kern_rctl.c
@@ -78,10 +78,16 @@ FEATURE(rctl, "Resource Limits");
#define RCTL_PCPU_SHIFT (10 * 1000000)
unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
+static int rctl_log_rate_limit = 10;
+static int rctl_devctl_rate_limit = 10;
SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW, 0, "Resource Limits");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN,
&rctl_maxbufsize, 0, "Maximum output buffer size");
+SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, log_rate_limit, CTLFLAG_RW,
+ &rctl_log_rate_limit, 0, "Maximum number of log messages per second");
+SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RW,
+ &rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second");
/*
* 'rctl_rule_link' connects a rule with every racct it's related to.
@@ -219,43 +225,43 @@ rctl_resource_name(int resource)
panic("rctl_resource_name: unknown resource %d", resource);
}
-/*
- * Return the amount of resource that can be allocated by 'p' before
- * hitting 'rule'.
- */
-static int64_t
-rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
+static struct racct *
+rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule)
{
- int resource;
- int64_t available = INT64_MAX;
struct ucred *cred = p->p_ucred;
ASSERT_RACCT_ENABLED();
rw_assert(&rctl_lock, RA_LOCKED);
- resource = rule->rr_resource;
switch (rule->rr_per) {
case RCTL_SUBJECT_TYPE_PROCESS:
- available = rule->rr_amount -
- p->p_racct->r_resources[resource];
- break;
+ return (p->p_racct);
case RCTL_SUBJECT_TYPE_USER:
- available = rule->rr_amount -
- cred->cr_ruidinfo->ui_racct->r_resources[resource];
- break;
+ return (cred->cr_ruidinfo->ui_racct);
case RCTL_SUBJECT_TYPE_LOGINCLASS:
- available = rule->rr_amount -
- cred->cr_loginclass->lc_racct->r_resources[resource];
- break;
+ return (cred->cr_loginclass->lc_racct);
case RCTL_SUBJECT_TYPE_JAIL:
- available = rule->rr_amount -
- cred->cr_prison->pr_prison_racct->prr_racct->
- r_resources[resource];
- break;
+ return (cred->cr_prison->pr_prison_racct->prr_racct);
default:
- panic("rctl_compute_available: unknown per %d",
- rule->rr_per);
+ panic("%s: unknown per %d", __func__, rule->rr_per);
}
+}
+
+/*
+ * Return the amount of resource that can be allocated by 'p' before
+ * hitting 'rule'.
+ */
+static int64_t
+rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
+{
+ int64_t available;
+ const struct racct *racct;
+
+ ASSERT_RACCT_ENABLED();
+ rw_assert(&rctl_lock, RA_LOCKED);
+
+ racct = rctl_proc_rule_to_racct(p, rule);
+ available = rule->rr_amount - racct->r_resources[rule->rr_resource];
return (available);
}
@@ -336,13 +342,13 @@ rctl_pcpu_available(const struct proc *p) {
int
rctl_enforce(struct proc *p, int resource, uint64_t amount)
{
+ static struct timeval log_lasttime, devctl_lasttime;
+ static int log_curtime = 0, devctl_curtime = 0;
struct rctl_rule *rule;
struct rctl_rule_link *link;
struct sbuf sb;
int should_deny = 0;
char *buf;
- static int curtime = 0;
- static struct timeval lasttime;
ASSERT_RACCT_ENABLED();
@@ -383,7 +389,8 @@ rctl_enforce(struct proc *p, int resource, uint64_t amount)
if (p->p_state != PRS_NORMAL)
continue;
- if (!ppsratecheck(&lasttime, &curtime, 10))
+ if (!ppsratecheck(&log_lasttime, &log_curtime,
+ rctl_log_rate_limit))
continue;
buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
@@ -409,6 +416,10 @@ rctl_enforce(struct proc *p, int resource, uint64_t amount)
if (p->p_state != PRS_NORMAL)
continue;
+ if (!ppsratecheck(&devctl_lasttime, &devctl_curtime,
+ rctl_devctl_rate_limit))
+ continue;
+
buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
if (buf == NULL) {
printf("rctl_enforce: out of memory\n");
@@ -642,6 +653,9 @@ str2int64(const char *str, int64_t *value)
if ((size_t)(end - str) != strlen(str))
return (EINVAL);
+ if (*value < 0)
+ return (ERANGE);
+
return (0);
}
@@ -1008,8 +1022,13 @@ rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep)
error = str2int64(amountstr, &rule->rr_amount);
if (error != 0)
goto out;
- if (RACCT_IS_IN_MILLIONS(rule->rr_resource))
+ if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) {
+ if (rule->rr_amount > INT64_MAX / 1000000) {
+ error = ERANGE;
+ goto out;
+ }
rule->rr_amount *= 1000000;
+ }
}
if (perstr == NULL || perstr[0] == '\0')
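The two range checks added above compose as follows; this is a hedged userland sketch of the same guards, not kern_rctl.c code, and scale_amount() is a hypothetical helper: negative amounts are rejected (as in str2int64()) and the multiplication by 1000000 is refused when it would overflow int64_t.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper mirroring the checks above. */
static int
scale_amount(int64_t amount, int64_t *scaled)
{
	if (amount < 0)
		return (ERANGE);		/* str2int64() rejects negatives */
	if (amount > INT64_MAX / 1000000)
		return (ERANGE);		/* scaling would overflow */
	*scaled = amount * 1000000;
	return (0);
}

int
main(void)
{
	int64_t scaled;

	printf("small ok: %d\n", scale_amount(5, &scaled) == 0);
	printf("huge rejected: %d\n",
	    scale_amount(INT64_MAX / 2, &scaled) == ERANGE);
	return (0);
}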
diff --git a/sys/kern/kern_sendfile.c b/sys/kern/kern_sendfile.c
index a2e1311..7ee7780 100644
--- a/sys/kern/kern_sendfile.c
+++ b/sys/kern/kern_sendfile.c
@@ -516,7 +516,7 @@ sendfile_getsock(struct thread *td, int s, struct file **sock_fp,
int
vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
- int kflags, struct thread *td)
+ struct thread *td)
{
struct file *sock_fp;
struct vnode *vp;
@@ -534,7 +534,7 @@ vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
so = NULL;
m = mh = NULL;
sfs = NULL;
- sbytes = 0;
+ hdrlen = sbytes = 0;
softerr = 0;
error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize);
@@ -560,26 +560,6 @@ vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
cv_init(&sfs->cv, "sendfile");
}
- /* If headers are specified copy them into mbufs. */
- if (hdr_uio != NULL && hdr_uio->uio_resid > 0) {
- hdr_uio->uio_td = td;
- hdr_uio->uio_rw = UIO_WRITE;
- /*
- * In FBSD < 5.0 the nbytes to send also included
- * the header. If compat is specified subtract the
- * header size from nbytes.
- */
- if (kflags & SFK_COMPAT) {
- if (nbytes > hdr_uio->uio_resid)
- nbytes -= hdr_uio->uio_resid;
- else
- nbytes = 0;
- }
- mh = m_uiotombuf(hdr_uio, M_WAITOK, 0, 0, 0);
- hdrlen = m_length(mh, &mhtail);
- } else
- hdrlen = 0;
-
rem = nbytes ? omin(nbytes, obj_size - offset) : obj_size - offset;
/*
@@ -668,11 +648,20 @@ retry_space:
SOCKBUF_UNLOCK(&so->so_snd);
/*
- * Reduce space in the socket buffer by the size of
- * the header mbuf chain.
- * hdrlen is set to 0 after the first loop.
+	 * At the beginning of the first loop, check whether any headers
+ * are specified and copy them into mbufs. Reduce space in
+ * the socket buffer by the size of the header mbuf chain.
+ * Clear hdr_uio here and hdrlen at the end of the first loop.
*/
- space -= hdrlen;
+ if (hdr_uio != NULL && hdr_uio->uio_resid > 0) {
+ hdr_uio->uio_td = td;
+ hdr_uio->uio_rw = UIO_WRITE;
+ hdr_uio->uio_resid = min(hdr_uio->uio_resid, space);
+ mh = m_uiotombuf(hdr_uio, M_WAITOK, 0, 0, 0);
+ hdrlen = m_length(mh, &mhtail);
+ space -= hdrlen;
+ hdr_uio = NULL;
+ }
if (vp != NULL) {
error = vn_lock(vp, LK_SHARED);
@@ -944,6 +933,19 @@ sendfile(struct thread *td, struct sendfile_args *uap, int compat)
&hdr_uio);
if (error != 0)
goto out;
+#ifdef COMPAT_FREEBSD4
+ /*
+ * In FreeBSD < 5.0 the nbytes to send also included
+ * the header. If compat is specified subtract the
+ * header size from nbytes.
+ */
+ if (compat) {
+ if (uap->nbytes > hdr_uio->uio_resid)
+ uap->nbytes -= hdr_uio->uio_resid;
+ else
+ uap->nbytes = 0;
+ }
+#endif
}
if (hdtr.trailers != NULL) {
error = copyinuio(hdtr.trailers, hdtr.trl_cnt,
@@ -965,7 +967,7 @@ sendfile(struct thread *td, struct sendfile_args *uap, int compat)
}
error = fo_sendfile(fp, uap->s, hdr_uio, trl_uio, uap->offset,
- uap->nbytes, &sbytes, uap->flags, compat ? SFK_COMPAT : 0, td);
+ uap->nbytes, &sbytes, uap->flags, td);
fdrop(fp, td);
if (uap->sbytes != NULL)
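The COMPAT_FREEBSD4 adjustment moved into sendfile() above is a clamp-at-zero subtraction: pre-5.0 callers counted the headers in nbytes, so the header residual is removed before the common code runs. A tiny standalone sketch (compat_adjust_nbytes() is an illustrative name only):

#include <stdio.h>

static size_t
compat_adjust_nbytes(size_t nbytes, size_t hdr_resid)
{
	/* Subtract the header size from nbytes, never going below zero. */
	return (nbytes > hdr_resid ? nbytes - hdr_resid : 0);
}

int
main(void)
{
	printf("%zu\n", compat_adjust_nbytes(4096, 128));	/* 3968 */
	printf("%zu\n", compat_adjust_nbytes(64, 128));		/* 0 */
	return (0);
}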
diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c
index 8c84773..c9932f56 100644
--- a/sys/kern/kern_synch.c
+++ b/sys/kern/kern_synch.c
@@ -162,15 +162,7 @@ _sleep(void *ident, struct lock_object *lock, int priority,
else
class = NULL;
- if (cold || SCHEDULER_STOPPED()) {
- /*
- * During autoconfiguration, just return;
- * don't run any other threads or panic below,
- * in case this is the idle thread and already asleep.
- * XXX: this used to do "s = splhigh(); splx(safepri);
- * splx(s);" to give interrupts a chance, but there is
- * no way to give interrupts a chance now.
- */
+ if (SCHEDULER_STOPPED()) {
if (lock != NULL && priority & PDROP)
class->lc_unlock(lock);
return (0);
@@ -264,17 +256,8 @@ msleep_spin_sbt(void *ident, struct mtx *mtx, const char *wmesg,
KASSERT(p != NULL, ("msleep1"));
KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep"));
- if (cold || SCHEDULER_STOPPED()) {
- /*
- * During autoconfiguration, just return;
- * don't run any other threads or panic below,
- * in case this is the idle thread and already asleep.
- * XXX: this used to do "s = splhigh(); splx(safepri);
- * splx(s);" to give interrupts a chance, but there is
- * no way to give interrupts a chance now.
- */
+ if (SCHEDULER_STOPPED())
return (0);
- }
sleepq_lock(ident);
CTR5(KTR_PROC, "msleep_spin: thread %ld (pid %ld, %s) on %s (%p)",
@@ -438,8 +421,10 @@ mi_switch(int flags, struct thread *newtd)
if (flags & SW_VOL) {
td->td_ru.ru_nvcsw++;
td->td_swvoltick = ticks;
- } else
+ } else {
td->td_ru.ru_nivcsw++;
+ td->td_swinvoltick = ticks;
+ }
#ifdef SCHED_STATS
SCHED_STAT_INC(sched_switch_stats[flags & SW_TYPE_MASK]);
#endif
diff --git a/sys/kern/pic_if.m b/sys/kern/pic_if.m
index 2309f0c..3003c35 100644
--- a/sys/kern/pic_if.m
+++ b/sys/kern/pic_if.m
@@ -1,7 +1,6 @@
#-
-# Copyright (c) 2012 Jakub Wojciech Klama <jceel@FreeBSD.org>
-# Copyright (c) 2015 Svatopluk Kraus
-# Copyright (c) 2015 Michal Meloun
+# Copyright (c) 2015-2016 Svatopluk Kraus
+# Copyright (c) 2015-2016 Michal Meloun
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -30,88 +29,132 @@
#include <sys/bus.h>
#include <sys/cpuset.h>
-#include <machine/frame.h>
-#include <machine/intr.h>
+#include <sys/resource.h>
+#include <sys/intr.h>
INTERFACE pic;
CODE {
- static int null_pic_bind(device_t dev, struct intr_irqsrc *isrc)
+ static int
+ dflt_pic_bind_intr(device_t dev, struct intr_irqsrc *isrc)
{
+
return (EOPNOTSUPP);
}
- static void null_pic_disable_intr(device_t dev, struct intr_irqsrc *isrc)
+ static int
+ null_pic_alloc_intr(device_t dev, struct intr_irqsrc *isrc,
+ struct resource *res, struct intr_map_data *data)
+ {
+
+ return (0);
+ }
+
+ static int
+ null_pic_release_intr(device_t dev, struct intr_irqsrc *isrc,
+ struct resource *res, struct intr_map_data *data)
+ {
+
+ return (0);
+ }
+
+ static int
+ null_pic_setup_intr(device_t dev, struct intr_irqsrc *isrc,
+ struct resource *res, struct intr_map_data *data)
{
- return;
+
+ return (0);
}
- static void null_pic_enable_intr(device_t dev, struct intr_irqsrc *isrc)
+ static int
+ null_pic_teardown_intr(device_t dev, struct intr_irqsrc *isrc,
+ struct resource *res, struct intr_map_data *data)
{
- return;
+
+ return (0);
}
- static void null_pic_init_secondary(device_t dev)
+ static void
+ null_pic_init_secondary(device_t dev)
{
- return;
}
- static void null_pic_ipi_send(device_t dev, cpuset_t cpus, u_int ipi)
+ static void
+ null_pic_ipi_send(device_t dev, cpuset_t cpus, u_int ipi)
{
- return;
+ }
+
+ static int
+ dflt_pic_ipi_setup(device_t dev, u_int ipi, struct intr_irqsrc *isrc)
+ {
+
+ return (EOPNOTSUPP);
}
};
-METHOD int register {
+METHOD int alloc_intr {
device_t dev;
struct intr_irqsrc *isrc;
- boolean_t *is_percpu;
-};
+ struct resource *res;
+ struct intr_map_data *data;
+} DEFAULT null_pic_alloc_intr;
-METHOD int unregister {
+METHOD int bind_intr {
device_t dev;
struct intr_irqsrc *isrc;
-};
+} DEFAULT dflt_pic_bind_intr;
METHOD void disable_intr {
device_t dev;
struct intr_irqsrc *isrc;
-} DEFAULT null_pic_disable_intr;
+};
-METHOD void disable_source {
+METHOD void enable_intr {
device_t dev;
struct intr_irqsrc *isrc;
};
-METHOD void enable_source {
+METHOD int map_intr {
device_t dev;
- struct intr_irqsrc *isrc;
+ struct intr_map_data *data;
+ struct intr_irqsrc **isrcp;
};
-METHOD void enable_intr {
+METHOD int release_intr {
device_t dev;
struct intr_irqsrc *isrc;
-} DEFAULT null_pic_enable_intr;
+ struct resource *res;
+ struct intr_map_data *data;
+} DEFAULT null_pic_release_intr;
-METHOD void pre_ithread {
+METHOD int setup_intr {
device_t dev;
struct intr_irqsrc *isrc;
-};
+ struct resource *res;
+ struct intr_map_data *data;
+} DEFAULT null_pic_setup_intr;
-METHOD void post_ithread {
+METHOD int teardown_intr {
device_t dev;
struct intr_irqsrc *isrc;
-};
+ struct resource *res;
+ struct intr_map_data *data;
+} DEFAULT null_pic_teardown_intr;
METHOD void post_filter {
device_t dev;
struct intr_irqsrc *isrc;
};
-METHOD int bind {
+METHOD void post_ithread {
device_t dev;
struct intr_irqsrc *isrc;
-} DEFAULT null_pic_bind;
+};
+
+METHOD void pre_ithread {
+ device_t dev;
+ struct intr_irqsrc *isrc;
+};
METHOD void init_secondary {
device_t dev;
@@ -121,4 +164,11 @@ METHOD void ipi_send {
device_t dev;
struct intr_irqsrc *isrc;
cpuset_t cpus;
+ u_int ipi;
} DEFAULT null_pic_ipi_send;
+
+METHOD int ipi_setup {
+ device_t dev;
+ u_int ipi;
+ struct intr_irqsrc **isrcp;
+} DEFAULT dflt_pic_ipi_setup;
diff --git a/sys/kern/subr_bus.c b/sys/kern/subr_bus.c
index 8daa9f2..caf9202 100644
--- a/sys/kern/subr_bus.c
+++ b/sys/kern/subr_bus.c
@@ -839,6 +839,38 @@ sysctl_devctl_queue(SYSCTL_HANDLER_ARGS)
return (0);
}
+/**
+ * @brief safely quotes strings that might have double quotes in them.
+ *
+ * The devctl protocol relies on quoted strings having matching quotes.
+ * This routine escapes any internal quotes so the resulting string
+ * is safe to pass to snprintf when constructing, for example, pnp info strings.
+ * Strings are always terminated with a NUL, but may be truncated if longer
+ * than @p len bytes after quotes.
+ *
+ * @param dst Buffer to hold the string. Must be at least @p len bytes long
+ * @param src Original buffer.
+ * @param len Length of the buffer pointed to by @p dst, including trailing NUL
+ */
+void
+devctl_safe_quote(char *dst, const char *src, size_t len)
+{
+ char *walker = dst, *ep = dst + len - 1;
+
+ if (len == 0)
+ return;
+ while (src != NULL && walker < ep)
+ {
+ if (*src == '"') {
+ if (ep - walker < 2)
+ break;
+ *walker++ = '\\';
+ }
+ *walker++ = *src++;
+ }
+ *walker = '\0';
+}
+
/* End of /dev/devctl code */
static TAILQ_HEAD(,device) bus_data_devices;
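devctl_safe_quote() backslash-escapes embedded double quotes while respecting the destination length, so quoted devctl/pnp fields stay balanced. Below is a simplified, self-contained userland rendering of the same idea; quote_escape() is an illustrative stand-in (it also stops at the source string's NUL), not the kernel routine itself.

#include <stdio.h>

static void
quote_escape(char *dst, const char *src, size_t len)
{
	char *walker = dst, *ep = dst + len - 1;

	if (len == 0)
		return;
	while (src != NULL && *src != '\0' && walker < ep) {
		if (*src == '"') {
			/* Need room for both the backslash and the quote. */
			if (ep - walker < 2)
				break;
			*walker++ = '\\';
		}
		*walker++ = *src++;
	}
	*walker = '\0';
}

int
main(void)
{
	char buf[32];

	quote_escape(buf, "disk \"ada0\"", sizeof(buf));
	printf("%s\n", buf);	/* prints: disk \"ada0\" */
	return (0);
}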
diff --git a/sys/kern/subr_counter.c b/sys/kern/subr_counter.c
index ea2759c..5149f2d 100644
--- a/sys/kern/subr_counter.c
+++ b/sys/kern/subr_counter.c
@@ -94,3 +94,28 @@ sysctl_handle_counter_u64(SYSCTL_HANDLER_ARGS)
return (0);
}
+
+int
+sysctl_handle_counter_u64_array(SYSCTL_HANDLER_ARGS)
+{
+ uint64_t *out;
+ int error;
+
+ out = malloc(arg2 * sizeof(uint64_t), M_TEMP, M_WAITOK);
+ for (int i = 0; i < arg2; i++)
+ out[i] = counter_u64_fetch(((counter_u64_t *)arg1)[i]);
+
+ error = SYSCTL_OUT(req, out, arg2 * sizeof(uint64_t));
+ free(out, M_TEMP);
+
+ if (error || !req->newptr)
+ return (error);
+
+ /*
+ * Any write attempt to a counter zeroes it.
+ */
+ for (int i = 0; i < arg2; i++)
+ counter_u64_zero(((counter_u64_t *)arg1)[i]);
+
+ return (0);
+}
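The array handler added above snapshots every counter into a temporary buffer for SYSCTL_OUT and treats any write as a request to zero the whole array. A userland sketch of that fetch-then-zero contract on a plain array (the memcpy/memset calls stand in for counter_u64_fetch() and counter_u64_zero()):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	uint64_t counters[4] = { 3, 1, 4, 1 };
	uint64_t out[4];
	int writing = 1;	/* pretend the caller supplied new data */

	/* Read path: snapshot every counter for the caller. */
	memcpy(out, counters, sizeof(out));
	for (int i = 0; i < 4; i++)
		printf("counter %d: %ju\n", i, (uintmax_t)out[i]);

	/* Write path: any write attempt zeroes the counters. */
	if (writing)
		memset(counters, 0, sizeof(counters));
	printf("after write: %ju\n", (uintmax_t)counters[0]);	/* 0 */
	return (0);
}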
diff --git a/sys/kern/subr_intr.c b/sys/kern/subr_intr.c
index 1c97fd4..96319ad 100644
--- a/sys/kern/subr_intr.c
+++ b/sys/kern/subr_intr.c
@@ -1,7 +1,6 @@
/*-
- * Copyright (c) 2012-2014 Jakub Wojciech Klama <jceel@FreeBSD.org>.
- * Copyright (c) 2015 Svatopluk Kraus
- * Copyright (c) 2015 Michal Meloun
+ * Copyright (c) 2015-2016 Svatopluk Kraus
+ * Copyright (c) 2015-2016 Michal Meloun
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -24,8 +23,6 @@
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
- *
- * $FreeBSD$
*/
#include <sys/cdefs.h>
@@ -38,6 +35,7 @@ __FBSDID("$FreeBSD$");
* - to complete things for removable PICs
*/
+#include "opt_acpi.h"
#include "opt_ddb.h"
#include "opt_platform.h"
@@ -52,6 +50,7 @@ __FBSDID("$FreeBSD$");
#include <sys/interrupt.h>
#include <sys/conf.h>
#include <sys/cpuset.h>
+#include <sys/rman.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <machine/atomic.h>
@@ -112,6 +111,35 @@ u_int irq_next_free;
#define IRQ_INVALID nitems(irq_sources)
+/*
+ * XXX - All stuff around struct intr_dev_data is considered temporary
+ * until a better place for storing struct intr_map_data is found.
+ *
+ * For now, there are two global interrupt number spaces:
+ * <0, NIRQ) ... interrupts without config data
+ * managed in irq_sources[]
+ * IRQ_DDATA_BASE + <0, 2 * NIRQ) ... interrupts with config data
+ * managed in intr_ddata_tab[]
+ *
+ * Read intr_ddata_lookup() to see how these spaces are handled.
+ * Note that, for now, each interrupt number from the second space duplicates
+ * some number from the first space; a number from the first space may be
+ * duplicated multiple times in the second space.
+ */
+struct intr_dev_data {
+ device_t idd_dev;
+ intptr_t idd_xref;
+ u_int idd_irq;
+ struct intr_map_data idd_data;
+ struct intr_irqsrc * idd_isrc;
+};
+
+static struct intr_dev_data *intr_ddata_tab[2 * NIRQ];
+static u_int intr_ddata_first_unused;
+
+#define IRQ_DDATA_BASE 10000
+CTASSERT(IRQ_DDATA_BASE > IRQ_INVALID);
+
#ifdef SMP
static boolean_t irq_assign_cpu = FALSE;
#endif
@@ -173,12 +201,10 @@ static inline void
isrc_increment_count(struct intr_irqsrc *isrc)
{
- /*
- * XXX - It should be atomic for PPI interrupts. It was proven that
- * the lost is measurable easily for timer PPI interrupts.
- */
- isrc->isrc_count[0]++;
- /*atomic_add_long(&isrc->isrc_count[0], 1);*/
+ if (isrc->isrc_flags & INTR_ISRCF_PPI)
+ atomic_add_long(&isrc->isrc_count[0], 1);
+ else
+ isrc->isrc_count[0]++;
}
/*
@@ -233,6 +259,16 @@ isrc_setup_counters(struct intr_irqsrc *isrc)
isrc_update_name(isrc, NULL);
}
+/*
+ * Virtualization for interrupt source interrupt counters release.
+ */
+static void
+isrc_release_counters(struct intr_irqsrc *isrc)
+{
+
+ panic("%s: not implemented", __func__);
+}
+
#ifdef SMP
/*
* Virtualization for interrupt source IPI counters setup.
@@ -279,8 +315,8 @@ intr_irq_handler(struct trapframe *tf)
* be called straight from the interrupt controller, when associated interrupt
* source is learned.
*/
-void
-intr_irq_dispatch(struct intr_irqsrc *isrc, struct trapframe *tf)
+int
+intr_isrc_dispatch(struct intr_irqsrc *isrc, struct trapframe *tf)
{
KASSERT(isrc != NULL, ("%s: no source", __func__));
@@ -293,57 +329,16 @@ intr_irq_dispatch(struct intr_irqsrc *isrc, struct trapframe *tf)
error = isrc->isrc_filter(isrc->isrc_arg, tf);
PIC_POST_FILTER(isrc->isrc_dev, isrc);
if (error == FILTER_HANDLED)
- return;
- } else
+ return (0);
+ } else
#endif
if (isrc->isrc_event != NULL) {
if (intr_event_handle(isrc->isrc_event, tf) == 0)
- return;
+ return (0);
}
isrc_increment_straycount(isrc);
- PIC_DISABLE_SOURCE(isrc->isrc_dev, isrc);
-
- device_printf(isrc->isrc_dev, "stray irq <%s> disabled",
- isrc->isrc_name);
-}
-
-/*
- * Allocate interrupt source.
- */
-static struct intr_irqsrc *
-isrc_alloc(u_int type, u_int extsize)
-{
- struct intr_irqsrc *isrc;
-
- isrc = malloc(sizeof(*isrc) + extsize, M_INTRNG, M_WAITOK | M_ZERO);
- isrc->isrc_irq = IRQ_INVALID; /* just to be safe */
- isrc->isrc_type = type;
- isrc->isrc_nspc_type = INTR_IRQ_NSPC_NONE;
- isrc->isrc_trig = INTR_TRIGGER_CONFORM;
- isrc->isrc_pol = INTR_POLARITY_CONFORM;
- CPU_ZERO(&isrc->isrc_cpu);
- return (isrc);
-}
-
-/*
- * Free interrupt source.
- */
-static void
-isrc_free(struct intr_irqsrc *isrc)
-{
-
- free(isrc, M_INTRNG);
-}
-
-void
-intr_irq_set_name(struct intr_irqsrc *isrc, const char *fmt, ...)
-{
- va_list ap;
-
- va_start(ap, fmt);
- vsnprintf(isrc->isrc_name, INTR_ISRC_NAMELEN, fmt, ap);
- va_end(ap);
+ return (EINVAL);
}
/*
@@ -356,8 +351,8 @@ intr_irq_set_name(struct intr_irqsrc *isrc, const char *fmt, ...)
* immediately. However, if only one free handle left which is reused
* constantly...
*/
-static int
-isrc_alloc_irq_locked(struct intr_irqsrc *isrc)
+static inline int
+isrc_alloc_irq(struct intr_irqsrc *isrc)
{
u_int maxirqs, irq;
@@ -383,46 +378,35 @@ found:
isrc->isrc_irq = irq;
irq_sources[irq] = isrc;
- intr_irq_set_name(isrc, "irq%u", irq);
- isrc_setup_counters(isrc);
-
irq_next_free = irq + 1;
if (irq_next_free >= maxirqs)
irq_next_free = 0;
return (0);
}
-#ifdef notyet
+
/*
* Free unique interrupt number (resource handle) from interrupt source.
*/
-static int
+static inline int
isrc_free_irq(struct intr_irqsrc *isrc)
{
- u_int maxirqs;
- mtx_assert(&isrc_table_lock, MA_NOTOWNED);
+ mtx_assert(&isrc_table_lock, MA_OWNED);
- maxirqs = nitems(irq_sources);
- if (isrc->isrc_irq >= maxirqs)
+ if (isrc->isrc_irq >= nitems(irq_sources))
return (EINVAL);
-
- mtx_lock(&isrc_table_lock);
- if (irq_sources[isrc->isrc_irq] != isrc) {
- mtx_unlock(&isrc_table_lock);
+ if (irq_sources[isrc->isrc_irq] != isrc)
return (EINVAL);
- }
irq_sources[isrc->isrc_irq] = NULL;
isrc->isrc_irq = IRQ_INVALID; /* just to be safe */
- mtx_unlock(&isrc_table_lock);
-
return (0);
}
-#endif
+
/*
* Lookup interrupt source by interrupt number (resource handle).
*/
-static struct intr_irqsrc *
+static inline struct intr_irqsrc *
isrc_lookup(u_int irq)
{
@@ -432,158 +416,159 @@ isrc_lookup(u_int irq)
}
/*
- * Lookup interrupt source by namespace description.
+ * Initialize interrupt source and register it into global interrupt table.
*/
-static struct intr_irqsrc *
-isrc_namespace_lookup(device_t dev, uint16_t type, uint16_t num)
+int
+intr_isrc_register(struct intr_irqsrc *isrc, device_t dev, u_int flags,
+ const char *fmt, ...)
{
- u_int irq;
- struct intr_irqsrc *isrc;
+ int error;
+ va_list ap;
- mtx_assert(&isrc_table_lock, MA_OWNED);
+ bzero(isrc, sizeof(struct intr_irqsrc));
+ isrc->isrc_dev = dev;
+ isrc->isrc_irq = IRQ_INVALID; /* just to be safe */
+ isrc->isrc_flags = flags;
+
+ va_start(ap, fmt);
+ vsnprintf(isrc->isrc_name, INTR_ISRC_NAMELEN, fmt, ap);
+ va_end(ap);
- for (irq = 0; irq < nitems(irq_sources); irq++) {
- isrc = irq_sources[irq];
- if (isrc != NULL && isrc->isrc_dev == dev &&
- isrc->isrc_nspc_type == type && isrc->isrc_nspc_num == num)
- return (isrc);
+ mtx_lock(&isrc_table_lock);
+ error = isrc_alloc_irq(isrc);
+ if (error != 0) {
+ mtx_unlock(&isrc_table_lock);
+ return (error);
}
- return (NULL);
+ /*
+	 * Set up interrupt counters, but not for IPI sources. Those are set up
+	 * later, and only for the ones actually used (up to INTR_IPI_COUNT),
+	 * so as not to exhaust our counter pool.
+ */
+ if ((isrc->isrc_flags & INTR_ISRCF_IPI) == 0)
+ isrc_setup_counters(isrc);
+ mtx_unlock(&isrc_table_lock);
+ return (0);
}
/*
- * Map interrupt source according to namespace into framework. If such mapping
- * does not exist, create it. Return unique interrupt number (resource handle)
- * associated with mapped interrupt source.
+ * Deregister interrupt source from global interrupt table.
*/
-u_int
-intr_namespace_map_irq(device_t dev, uint16_t type, uint16_t num)
+int
+intr_isrc_deregister(struct intr_irqsrc *isrc)
{
- struct intr_irqsrc *isrc, *new_isrc;
int error;
- new_isrc = isrc_alloc(INTR_ISRCT_NAMESPACE, 0);
-
mtx_lock(&isrc_table_lock);
- isrc = isrc_namespace_lookup(dev, type, num);
- if (isrc != NULL) {
- mtx_unlock(&isrc_table_lock);
- isrc_free(new_isrc);
- return (isrc->isrc_irq); /* already mapped */
- }
+ if ((isrc->isrc_flags & INTR_ISRCF_IPI) == 0)
+ isrc_release_counters(isrc);
+ error = isrc_free_irq(isrc);
+ mtx_unlock(&isrc_table_lock);
+ return (error);
+}
- error = isrc_alloc_irq_locked(new_isrc);
- if (error != 0) {
+static struct intr_dev_data *
+intr_ddata_alloc(u_int extsize)
+{
+ struct intr_dev_data *ddata;
+
+ ddata = malloc(sizeof(*ddata) + extsize, M_INTRNG, M_WAITOK | M_ZERO);
+
+ mtx_lock(&isrc_table_lock);
+ if (intr_ddata_first_unused >= nitems(intr_ddata_tab)) {
mtx_unlock(&isrc_table_lock);
- isrc_free(new_isrc);
- return (IRQ_INVALID); /* no space left */
+ free(ddata, M_INTRNG);
+ return (NULL);
}
-
- new_isrc->isrc_dev = dev;
- new_isrc->isrc_nspc_type = type;
- new_isrc->isrc_nspc_num = num;
+ intr_ddata_tab[intr_ddata_first_unused] = ddata;
+ ddata->idd_irq = IRQ_DDATA_BASE + intr_ddata_first_unused++;
mtx_unlock(&isrc_table_lock);
-
- return (new_isrc->isrc_irq);
+ return (ddata);
}
-#ifdef FDT
-/*
- * Lookup interrupt source by FDT description.
- */
static struct intr_irqsrc *
-isrc_fdt_lookup(intptr_t xref, pcell_t *cells, u_int ncells)
+intr_ddata_lookup(u_int irq, struct intr_map_data **datap)
{
- u_int irq, cellsize;
+ int error;
struct intr_irqsrc *isrc;
+ struct intr_dev_data *ddata;
- mtx_assert(&isrc_table_lock, MA_OWNED);
+ isrc = isrc_lookup(irq);
+ if (isrc != NULL) {
+ if (datap != NULL)
+ *datap = NULL;
+ return (isrc);
+ }
- cellsize = ncells * sizeof(*cells);
- for (irq = 0; irq < nitems(irq_sources); irq++) {
- isrc = irq_sources[irq];
- if (isrc != NULL && isrc->isrc_type == INTR_ISRCT_FDT &&
- isrc->isrc_xref == xref && isrc->isrc_ncells == ncells &&
- memcmp(isrc->isrc_cells, cells, cellsize) == 0)
- return (isrc);
+ if (irq < IRQ_DDATA_BASE)
+ return (NULL);
+
+ irq -= IRQ_DDATA_BASE;
+ if (irq >= nitems(intr_ddata_tab))
+ return (NULL);
+
+ ddata = intr_ddata_tab[irq];
+ if (ddata->idd_isrc == NULL) {
+ error = intr_map_irq(ddata->idd_dev, ddata->idd_xref,
+ &ddata->idd_data, &irq);
+ if (error != 0)
+ return (NULL);
+ ddata->idd_isrc = isrc_lookup(irq);
}
- return (NULL);
+ if (datap != NULL)
+ *datap = &ddata->idd_data;
+ return (ddata->idd_isrc);
}
+#ifdef DEV_ACPI
/*
- * Map interrupt source according to FDT data into framework. If such mapping
+ * Map interrupt source according to ACPI info into framework. If such mapping
* does not exist, create it. Return unique interrupt number (resource handle)
* associated with mapped interrupt source.
*/
u_int
-intr_fdt_map_irq(phandle_t node, pcell_t *cells, u_int ncells)
+intr_acpi_map_irq(device_t dev, u_int irq, enum intr_polarity pol,
+ enum intr_trigger trig)
{
- struct intr_irqsrc *isrc, *new_isrc;
- u_int cellsize;
- intptr_t xref;
- int error;
-
- xref = (intptr_t)node; /* It's so simple for now. */
-
- cellsize = ncells * sizeof(*cells);
- new_isrc = isrc_alloc(INTR_ISRCT_FDT, cellsize);
-
- mtx_lock(&isrc_table_lock);
- isrc = isrc_fdt_lookup(xref, cells, ncells);
- if (isrc != NULL) {
- mtx_unlock(&isrc_table_lock);
- isrc_free(new_isrc);
- return (isrc->isrc_irq); /* already mapped */
- }
-
- error = isrc_alloc_irq_locked(new_isrc);
- if (error != 0) {
- mtx_unlock(&isrc_table_lock);
- isrc_free(new_isrc);
- return (IRQ_INVALID); /* no space left */
- }
-
- new_isrc->isrc_xref = xref;
- new_isrc->isrc_ncells = ncells;
- memcpy(new_isrc->isrc_cells, cells, cellsize);
- mtx_unlock(&isrc_table_lock);
-
- return (new_isrc->isrc_irq);
+ struct intr_dev_data *ddata;
+
+ ddata = intr_ddata_alloc(0);
+ if (ddata == NULL)
+ return (0xFFFFFFFF); /* no space left */
+
+ ddata->idd_dev = dev;
+ ddata->idd_data.type = INTR_MAP_DATA_ACPI;
+ ddata->idd_data.acpi.irq = irq;
+ ddata->idd_data.acpi.pol = pol;
+ ddata->idd_data.acpi.trig = trig;
+ return (ddata->idd_irq);
}
#endif
-
+#ifdef FDT
/*
- * Register interrupt source into interrupt controller.
+ * Map interrupt source according to FDT data into framework. If such mapping
+ * does not exist, create it. Return unique interrupt number (resource handle)
+ * associated with mapped interrupt source.
*/
-static int
-isrc_register(struct intr_irqsrc *isrc)
+u_int
+intr_fdt_map_irq(phandle_t node, pcell_t *cells, u_int ncells)
{
- struct intr_pic *pic;
- boolean_t is_percpu;
- int error;
-
- if (isrc->isrc_flags & INTR_ISRCF_REGISTERED)
- return (0);
-
- if (isrc->isrc_dev == NULL) {
- pic = pic_lookup(NULL, isrc->isrc_xref);
- if (pic == NULL || pic->pic_dev == NULL)
- return (ESRCH);
- isrc->isrc_dev = pic->pic_dev;
- }
-
- error = PIC_REGISTER(isrc->isrc_dev, isrc, &is_percpu);
- if (error != 0)
- return (error);
+ struct intr_dev_data *ddata;
+ u_int cellsize;
- mtx_lock(&isrc_table_lock);
- isrc->isrc_flags |= INTR_ISRCF_REGISTERED;
- if (is_percpu)
- isrc->isrc_flags |= INTR_ISRCF_PERCPU;
- isrc_update_name(isrc, NULL);
- mtx_unlock(&isrc_table_lock);
- return (0);
+ cellsize = ncells * sizeof(*cells);
+ ddata = intr_ddata_alloc(cellsize);
+ if (ddata == NULL)
+ return (0xFFFFFFFF); /* no space left */
+
+ ddata->idd_xref = (intptr_t)node;
+ ddata->idd_data.type = INTR_MAP_DATA_FDT;
+ ddata->idd_data.fdt.ncells = ncells;
+ ddata->idd_data.fdt.cells = (pcell_t *)(ddata + 1);
+ memcpy(ddata->idd_data.fdt.cells, cells, cellsize);
+ return (ddata->idd_irq);
}
+#endif
#ifdef INTR_SOLO
/*
@@ -678,7 +663,7 @@ intr_isrc_assign_cpu(void *arg, int cpu)
	 * informed if the call is successful.
*/
if (irq_assign_cpu) {
- error = PIC_BIND(isrc->isrc_dev, isrc);
+ error = PIC_BIND_INTR(isrc->isrc_dev, isrc);
if (error) {
CPU_ZERO(&isrc->isrc_cpu);
mtx_unlock(&isrc_table_lock);
@@ -774,7 +759,7 @@ isrc_add_handler(struct intr_irqsrc *isrc, const char *name,
/*
* Lookup interrupt controller locked.
*/
-static struct intr_pic *
+static inline struct intr_pic *
pic_lookup_locked(device_t dev, intptr_t xref)
{
struct intr_pic *pic;
@@ -801,7 +786,6 @@ pic_lookup(device_t dev, intptr_t xref)
mtx_lock(&pic_list_lock);
pic = pic_lookup_locked(dev, xref);
mtx_unlock(&pic_list_lock);
-
return (pic);
}
@@ -871,7 +855,7 @@ intr_pic_register(device_t dev, intptr_t xref)
* Unregister interrupt controller.
*/
int
-intr_pic_unregister(device_t dev, intptr_t xref)
+intr_pic_deregister(device_t dev, intptr_t xref)
{
panic("%s: not implemented", __func__);
@@ -923,12 +907,73 @@ intr_pic_claim_root(device_t dev, intptr_t xref, intr_irq_filter_t *filter,
}
int
-intr_irq_add_handler(device_t dev, driver_filter_t filt, driver_intr_t hand,
- void *arg, u_int irq, int flags, void **cookiep)
+intr_map_irq(device_t dev, intptr_t xref, struct intr_map_data *data,
+ u_int *irqp)
{
- const char *name;
+ int error;
+ struct intr_irqsrc *isrc;
+ struct intr_pic *pic;
+
+ if (data == NULL)
+ return (EINVAL);
+
+ pic = pic_lookup(dev, xref);
+ if (pic == NULL || pic->pic_dev == NULL)
+ return (ESRCH);
+
+ error = PIC_MAP_INTR(pic->pic_dev, data, &isrc);
+ if (error == 0)
+ *irqp = isrc->isrc_irq;
+ return (error);
+}
+
+int
+intr_alloc_irq(device_t dev, struct resource *res)
+{
+ struct intr_map_data *data;
+ struct intr_irqsrc *isrc;
+
+ KASSERT(rman_get_start(res) == rman_get_end(res),
+ ("%s: more interrupts in resource", __func__));
+
+ isrc = intr_ddata_lookup(rman_get_start(res), &data);
+ if (isrc == NULL)
+ return (EINVAL);
+
+ return (PIC_ALLOC_INTR(isrc->isrc_dev, isrc, res, data));
+}
+
+int
+intr_release_irq(device_t dev, struct resource *res)
+{
+ struct intr_map_data *data;
struct intr_irqsrc *isrc;
+
+ KASSERT(rman_get_start(res) == rman_get_end(res),
+ ("%s: more interrupts in resource", __func__));
+
+ isrc = intr_ddata_lookup(rman_get_start(res), &data);
+ if (isrc == NULL)
+ return (EINVAL);
+
+ return (PIC_RELEASE_INTR(isrc->isrc_dev, isrc, res, data));
+}
+
+int
+intr_setup_irq(device_t dev, struct resource *res, driver_filter_t filt,
+ driver_intr_t hand, void *arg, int flags, void **cookiep)
+{
int error;
+ struct intr_map_data *data;
+ struct intr_irqsrc *isrc;
+ const char *name;
+
+ KASSERT(rman_get_start(res) == rman_get_end(res),
+ ("%s: more interrupts in resource", __func__));
+
+ isrc = intr_ddata_lookup(rman_get_start(res), &data);
+ if (isrc == NULL)
+ return (EINVAL);
name = device_get_nameunit(dev);
@@ -947,21 +992,7 @@ intr_irq_add_handler(device_t dev, driver_filter_t filt, driver_intr_t hand,
debugf("irq %u cannot solo on %s\n", irq, name);
return (EINVAL);
}
-#endif
- isrc = isrc_lookup(irq);
- if (isrc == NULL) {
- debugf("irq %u without source on %s\n", irq, name);
- return (EINVAL);
- }
-
- error = isrc_register(isrc);
- if (error != 0) {
- debugf("irq %u map error %d on %s\n", irq, error, name);
- return (error);
- }
-
-#ifdef INTR_SOLO
if (flags & INTR_SOLO) {
error = iscr_setup_filter(isrc, name, (intr_irq_filter_t *)filt,
arg, cookiep);
@@ -978,24 +1009,32 @@ intr_irq_add_handler(device_t dev, driver_filter_t filt, driver_intr_t hand,
return (error);
mtx_lock(&isrc_table_lock);
- isrc->isrc_handlers++;
- if (isrc->isrc_handlers == 1) {
- PIC_ENABLE_INTR(isrc->isrc_dev, isrc);
- PIC_ENABLE_SOURCE(isrc->isrc_dev, isrc);
+ error = PIC_SETUP_INTR(isrc->isrc_dev, isrc, res, data);
+ if (error == 0) {
+ isrc->isrc_handlers++;
+ if (isrc->isrc_handlers == 1)
+ PIC_ENABLE_INTR(isrc->isrc_dev, isrc);
}
mtx_unlock(&isrc_table_lock);
- return (0);
+ if (error != 0)
+ intr_event_remove_handler(*cookiep);
+ return (error);
}
int
-intr_irq_remove_handler(device_t dev, u_int irq, void *cookie)
+intr_teardown_irq(device_t dev, struct resource *res, void *cookie)
{
- struct intr_irqsrc *isrc;
int error;
+ struct intr_map_data *data;
+ struct intr_irqsrc *isrc;
- isrc = isrc_lookup(irq);
+ KASSERT(rman_get_start(res) == rman_get_end(res),
+ ("%s: more interrupts in resource", __func__));
+
+ isrc = intr_ddata_lookup(rman_get_start(res), &data);
if (isrc == NULL || isrc->isrc_handlers == 0)
return (EINVAL);
+
#ifdef INTR_SOLO
if (isrc->isrc_filter != NULL) {
if (isrc != cookie)
@@ -1005,8 +1044,8 @@ intr_irq_remove_handler(device_t dev, u_int irq, void *cookie)
isrc->isrc_filter = NULL;
isrc->isrc_arg = NULL;
isrc->isrc_handlers = 0;
- PIC_DISABLE_SOURCE(isrc->isrc_dev, isrc);
PIC_DISABLE_INTR(isrc->isrc_dev, isrc);
+ PIC_TEARDOWN_INTR(isrc->isrc_dev, isrc, res, data);
isrc_update_name(isrc, NULL);
mtx_unlock(&isrc_table_lock);
return (0);
@@ -1019,10 +1058,9 @@ intr_irq_remove_handler(device_t dev, u_int irq, void *cookie)
if (error == 0) {
mtx_lock(&isrc_table_lock);
isrc->isrc_handlers--;
- if (isrc->isrc_handlers == 0) {
- PIC_DISABLE_SOURCE(isrc->isrc_dev, isrc);
+ if (isrc->isrc_handlers == 0)
PIC_DISABLE_INTR(isrc->isrc_dev, isrc);
- }
+ PIC_TEARDOWN_INTR(isrc->isrc_dev, isrc, res, data);
intrcnt_updatename(isrc);
mtx_unlock(&isrc_table_lock);
}
@@ -1030,36 +1068,16 @@ intr_irq_remove_handler(device_t dev, u_int irq, void *cookie)
}
int
-intr_irq_config(u_int irq, enum intr_trigger trig, enum intr_polarity pol)
+intr_describe_irq(device_t dev, struct resource *res, void *cookie,
+ const char *descr)
{
+ int error;
struct intr_irqsrc *isrc;
- isrc = isrc_lookup(irq);
- if (isrc == NULL)
- return (EINVAL);
-
- if (isrc->isrc_handlers != 0)
- return (EBUSY); /* interrrupt is enabled (active) */
+ KASSERT(rman_get_start(res) == rman_get_end(res),
+ ("%s: more interrupts in resource", __func__));
- /*
- * Once an interrupt is enabled, we do not change its configuration.
- * A controller PIC_ENABLE_INTR() method is called when an interrupt
- * is going to be enabled. In this method, a controller should setup
- * the interrupt according to saved configuration parameters.
- */
- isrc->isrc_trig = trig;
- isrc->isrc_pol = pol;
-
- return (0);
-}
-
-int
-intr_irq_describe(u_int irq, void *cookie, const char *descr)
-{
- struct intr_irqsrc *isrc;
- int error;
-
- isrc = isrc_lookup(irq);
+ isrc = intr_ddata_lookup(rman_get_start(res), NULL);
if (isrc == NULL || isrc->isrc_handlers == 0)
return (EINVAL);
#ifdef INTR_SOLO
@@ -1084,11 +1102,14 @@ intr_irq_describe(u_int irq, void *cookie, const char *descr)
#ifdef SMP
int
-intr_irq_bind(u_int irq, int cpu)
+intr_bind_irq(device_t dev, struct resource *res, int cpu)
{
struct intr_irqsrc *isrc;
- isrc = isrc_lookup(irq);
+ KASSERT(rman_get_start(res) == rman_get_end(res),
+ ("%s: more interrupts in resource", __func__));
+
+ isrc = intr_ddata_lookup(rman_get_start(res), NULL);
if (isrc == NULL || isrc->isrc_handlers == 0)
return (EINVAL);
#ifdef INTR_SOLO
@@ -1135,7 +1156,7 @@ intr_irq_shuffle(void *arg __unused)
for (i = 0; i < NIRQ; i++) {
isrc = irq_sources[i];
if (isrc == NULL || isrc->isrc_handlers == 0 ||
- isrc->isrc_flags & INTR_ISRCF_PERCPU)
+ isrc->isrc_flags & INTR_ISRCF_PPI)
continue;
if (isrc->isrc_event != NULL &&
@@ -1151,7 +1172,7 @@ intr_irq_shuffle(void *arg __unused)
* for bound ISRC. The best thing we can do is to clear
* isrc_cpu so inconsistency with ie_cpu will be detectable.
*/
- if (PIC_BIND(isrc->isrc_dev, isrc) != 0)
+ if (PIC_BIND_INTR(isrc->isrc_dev, isrc) != 0)
CPU_ZERO(&isrc->isrc_cpu);
}
mtx_unlock(&isrc_table_lock);
@@ -1196,6 +1217,7 @@ intr_pic_init_secondary(void)
DB_SHOW_COMMAND(irqs, db_show_irqs)
{
u_int i, irqsum;
+ u_long num;
struct intr_irqsrc *isrc;
for (irqsum = 0, i = 0; i < NIRQ; i++) {
@@ -1203,11 +1225,11 @@ DB_SHOW_COMMAND(irqs, db_show_irqs)
if (isrc == NULL)
continue;
+ num = isrc->isrc_count != NULL ? isrc->isrc_count[0] : 0;
db_printf("irq%-3u <%s>: cpu %02lx%s cnt %lu\n", i,
isrc->isrc_name, isrc->isrc_cpu.__bits[0],
- isrc->isrc_flags & INTR_ISRCF_BOUND ? " (bound)" : "",
- isrc->isrc_count[0]);
- irqsum += isrc->isrc_count[0];
+ isrc->isrc_flags & INTR_ISRCF_BOUND ? " (bound)" : "", num);
+ irqsum += num;
}
db_printf("irq total %u\n", irqsum);
}
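The lookup introduced above splits the interrupt handle space in two: handles below NIRQ index irq_sources[] directly, while handles at or above IRQ_DDATA_BASE index intr_ddata_tab[] after subtracting the base. A minimal sketch of that dispatch with toy sizes (classify_handle() and the NIRQ value are illustrative assumptions):

#include <stdio.h>

#define NIRQ		128		/* illustrative size */
#define IRQ_DDATA_BASE	10000

/* Returns 0 for the plain irq_sources[] space, 1 for the
 * intr_ddata_tab[] space, -1 for an invalid handle. */
static int
classify_handle(unsigned int irq, unsigned int *idx)
{
	if (irq < NIRQ) {
		*idx = irq;
		return (0);
	}
	if (irq >= IRQ_DDATA_BASE && irq - IRQ_DDATA_BASE < 2 * NIRQ) {
		*idx = irq - IRQ_DDATA_BASE;
		return (1);
	}
	return (-1);
}

int
main(void)
{
	unsigned int idx;

	printf("%d\n", classify_handle(5, &idx));	/* 0, idx == 5 */
	printf("%d\n", classify_handle(10003, &idx));	/* 1, idx == 3 */
	return (0);
}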
diff --git a/sys/kern/subr_rman.c b/sys/kern/subr_rman.c
index 41f1f34..9bbec64 100644
--- a/sys/kern/subr_rman.c
+++ b/sys/kern/subr_rman.c
@@ -159,7 +159,7 @@ rman_manage_region(struct rman *rm, rman_res_t start, rman_res_t end)
struct resource_i *r, *s, *t;
int rv = 0;
- DPRINTF(("rman_manage_region: <%s> request: start %#lx, end %#lx\n",
+ DPRINTF(("rman_manage_region: <%s> request: start %#jx, end %#jx\n",
rm->rm_descr, start, end));
if (start < rm->rm_start || end > rm->rm_end)
return EINVAL;
@@ -174,7 +174,7 @@ rman_manage_region(struct rman *rm, rman_res_t start, rman_res_t end)
/* Skip entries before us. */
TAILQ_FOREACH(s, &rm->rm_list, r_link) {
- if (s->r_end == ULONG_MAX)
+ if (s->r_end == ~0)
break;
if (s->r_end + 1 >= r->r_start)
break;
@@ -444,8 +444,8 @@ rman_reserve_resource_bound(struct rman *rm, rman_res_t start, rman_res_t end,
rv = NULL;
- DPRINTF(("rman_reserve_resource_bound: <%s> request: [%#lx, %#lx], "
- "length %#lx, flags %u, device %s\n", rm->rm_descr, start, end,
+ DPRINTF(("rman_reserve_resource_bound: <%s> request: [%#jx, %#jx], "
+ "length %#jx, flags %x, device %s\n", rm->rm_descr, start, end,
count, flags,
dev == NULL ? "<null>" : device_get_nameunit(dev)));
KASSERT((flags & RF_FIRSTSHARE) == 0,
@@ -454,19 +454,29 @@ rman_reserve_resource_bound(struct rman *rm, rman_res_t start, rman_res_t end,
mtx_lock(rm->rm_mtx);
+ r = TAILQ_FIRST(&rm->rm_list);
+ if (r == NULL) {
+ DPRINTF(("NULL list head\n"));
+ } else {
+ DPRINTF(("rman_reserve_resource_bound: trying %#jx <%#jx,%#jx>\n",
+ r->r_end, start, count-1));
+ }
for (r = TAILQ_FIRST(&rm->rm_list);
r && r->r_end < start + count - 1;
- r = TAILQ_NEXT(r, r_link))
+ r = TAILQ_NEXT(r, r_link)) {
;
+ DPRINTF(("rman_reserve_resource_bound: tried %#jx <%#jx,%#jx>\n",
+ r->r_end, start, count-1));
+ }
if (r == NULL) {
DPRINTF(("could not find a region\n"));
goto out;
}
- amask = (1ul << RF_ALIGNMENT(flags)) - 1;
- KASSERT(start <= ULONG_MAX - amask,
- ("start (%#lx) + amask (%#lx) would wrap around", start, amask));
+ amask = (1ull << RF_ALIGNMENT(flags)) - 1;
+ KASSERT(start <= RM_MAX_END - amask,
+ ("start (%#jx) + amask (%#jx) would wrap around", start, amask));
/* If bound is 0, bmask will also be 0 */
bmask = ~(bound - 1);
@@ -474,18 +484,18 @@ rman_reserve_resource_bound(struct rman *rm, rman_res_t start, rman_res_t end,
* First try to find an acceptable totally-unshared region.
*/
for (s = r; s; s = TAILQ_NEXT(s, r_link)) {
- DPRINTF(("considering [%#lx, %#lx]\n", s->r_start, s->r_end));
+ DPRINTF(("considering [%#jx, %#jx]\n", s->r_start, s->r_end));
/*
* The resource list is sorted, so there is no point in
* searching further once r_start is too large.
*/
if (s->r_start > end - (count - 1)) {
- DPRINTF(("s->r_start (%#lx) + count - 1> end (%#lx)\n",
+ DPRINTF(("s->r_start (%#jx) + count - 1> end (%#jx)\n",
s->r_start, end));
break;
}
- if (s->r_start > ULONG_MAX - amask) {
- DPRINTF(("s->r_start (%#lx) + amask (%#lx) too large\n",
+ if (s->r_start > RM_MAX_END - amask) {
+ DPRINTF(("s->r_start (%#jx) + amask (%#jx) too large\n",
s->r_start, amask));
break;
}
@@ -493,7 +503,7 @@ rman_reserve_resource_bound(struct rman *rm, rman_res_t start, rman_res_t end,
DPRINTF(("region is allocated\n"));
continue;
}
- rstart = ulmax(s->r_start, start);
+ rstart = ummax(s->r_start, start);
/*
* Try to find a region by adjusting to boundary and alignment
* until both conditions are satisfied. This is not an optimal
@@ -505,16 +515,16 @@ rman_reserve_resource_bound(struct rman *rm, rman_res_t start, rman_res_t end,
rstart += bound - (rstart & ~bmask);
} while ((rstart & amask) != 0 && rstart < end &&
rstart < s->r_end);
- rend = ulmin(s->r_end, ulmax(rstart + count - 1, end));
+ rend = ummin(s->r_end, ummax(rstart + count - 1, end));
if (rstart > rend) {
DPRINTF(("adjusted start exceeds end\n"));
continue;
}
- DPRINTF(("truncated region: [%#lx, %#lx]; size %#lx (requested %#lx)\n",
+ DPRINTF(("truncated region: [%#jx, %#jx]; size %#jx (requested %#jx)\n",
rstart, rend, (rend - rstart + 1), count));
if ((rend - rstart + 1) >= count) {
- DPRINTF(("candidate region: [%#lx, %#lx], size %#lx\n",
+ DPRINTF(("candidate region: [%#jx, %#jx], size %#jx\n",
rstart, rend, (rend - rstart + 1)));
if ((s->r_end - s->r_start + 1) == count) {
DPRINTF(("candidate region is entire chunk\n"));
@@ -545,7 +555,7 @@ rman_reserve_resource_bound(struct rman *rm, rman_res_t start, rman_res_t end,
if (s->r_start < rv->r_start && s->r_end > rv->r_end) {
DPRINTF(("splitting region in three parts: "
- "[%#lx, %#lx]; [%#lx, %#lx]; [%#lx, %#lx]\n",
+ "[%#jx, %#jx]; [%#jx, %#jx]; [%#jx, %#jx]\n",
s->r_start, rv->r_start - 1,
rv->r_start, rv->r_end,
rv->r_end + 1, s->r_end));
@@ -1032,8 +1042,8 @@ dump_rman_header(struct rman *rm)
if (db_pager_quit)
return;
- db_printf("rman %p: %s (0x%lx-0x%lx full range)\n",
- rm, rm->rm_descr, rm->rm_start, rm->rm_end);
+ db_printf("rman %p: %s (0x%jx-0x%jx full range)\n",
+ rm, rm->rm_descr, (rman_res_t)rm->rm_start, (rman_res_t)rm->rm_end);
}
static void
@@ -1051,7 +1061,7 @@ dump_rman(struct rman *rm)
devname = "nomatch";
} else
devname = NULL;
- db_printf(" 0x%lx-0x%lx (RID=%d) ",
+ db_printf(" 0x%jx-0x%jx (RID=%d) ",
r->r_start, r->r_end, r->r_rid);
if (devname != NULL)
db_printf("(%s)\n", devname);
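The switch from ULONG_MAX and %#lx to RM_MAX_END and %#jx widens the arithmetic to rman_res_t; the alignment logic itself is unchanged. A small standalone sketch of the mask-based rounding the allocator performs and of the wraparound guard asserted above (the RM_MAX_END definition here is an illustrative assumption):

#include <inttypes.h>
#include <stdio.h>

typedef uint64_t rman_res_t;			/* the wider resource type */
#define RM_MAX_END	((rman_res_t)~0)	/* illustrative definition */

int
main(void)
{
	rman_res_t start = 0x1234, amask;
	unsigned int align_bits = 12;		/* 4 KiB alignment */

	amask = ((rman_res_t)1 << align_bits) - 1;
	if (start > RM_MAX_END - amask) {
		printf("start + amask would wrap\n");
		return (1);
	}
	/* Round start up to the requested alignment. */
	start = (start + amask) & ~amask;
	printf("aligned start 0x%" PRIx64 "\n", start);	/* 0x2000 */
	return (0);
}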
diff --git a/sys/kern/subr_sleepqueue.c b/sys/kern/subr_sleepqueue.c
index 12908f6..ef06c48 100644
--- a/sys/kern/subr_sleepqueue.c
+++ b/sys/kern/subr_sleepqueue.c
@@ -62,6 +62,7 @@ __FBSDID("$FreeBSD$");
#include "opt_sleepqueue_profiling.h"
#include "opt_ddb.h"
#include "opt_sched.h"
+#include "opt_stack.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -75,6 +76,7 @@ __FBSDID("$FreeBSD$");
#include <sys/sdt.h>
#include <sys/signalvar.h>
#include <sys/sleepqueue.h>
+#include <sys/stack.h>
#include <sys/sysctl.h>
#include <vm/uma.h>
@@ -83,6 +85,7 @@ __FBSDID("$FreeBSD$");
#include <ddb/ddb.h>
#endif
+
/*
* Constants for the hash table of sleep queue chains.
* SC_TABLESIZE must be a power of two for SC_MASK to work properly.
@@ -382,6 +385,8 @@ sleepq_set_timeout_sbt(void *wchan, sbintime_t sbt, sbintime_t pr,
MPASS(TD_ON_SLEEPQ(td));
MPASS(td->td_sleepqueue == NULL);
MPASS(wchan != NULL);
+ if (cold)
+ panic("timed sleep before timers are working");
callout_reset_sbt_on(&td->td_slpcallout, sbt, pr,
sleepq_timeout, td, PCPU_GET(cpuid), flags | C_DIRECT_EXEC);
}
@@ -1034,6 +1039,122 @@ sleepq_abort(struct thread *td, int intrval)
return (sleepq_resume_thread(sq, td, 0));
}
+/*
+ * Prints the stacks of all threads presently sleeping on wchan/queue to
+ * the sbuf sb. Sets count_stacks_printed to the number of stacks actually
+ * printed. Typically, this will equal the number of threads sleeping on the
+ * queue, but may be less if sb overflowed before all stacks were printed.
+ */
+#ifdef STACK
+int
+sleepq_sbuf_print_stacks(struct sbuf *sb, void *wchan, int queue,
+ int *count_stacks_printed)
+{
+ struct thread *td, *td_next;
+ struct sleepqueue *sq;
+ struct stack **st;
+ struct sbuf **td_infos;
+ int i, stack_idx, error, stacks_to_allocate;
+ bool finished, partial_print;
+
+ error = 0;
+ finished = false;
+ partial_print = false;
+
+ KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
+ MPASS((queue >= 0) && (queue < NR_SLEEPQS));
+
+ stacks_to_allocate = 10;
+ for (i = 0; i < 3 && !finished ; i++) {
+ /* We cannot malloc while holding the queue's spinlock, so
+ * we do our mallocs now, and hope it is enough. If it
+ * isn't, we will free these, drop the lock, malloc more,
+ * and try again, up to a point. After that point we will
+ * give up and report ENOMEM. We also cannot write to sb
+ * during this time since the client may have set the
+ * SBUF_AUTOEXTEND flag on their sbuf, which could cause a
+ * malloc as we print to it. So we defer actually printing
+ * to sb until after we drop the spinlock.
+ */
+
+ /* Where we will store the stacks. */
+ st = malloc(sizeof(struct stack *) * stacks_to_allocate,
+ M_TEMP, M_WAITOK);
+ for (stack_idx = 0; stack_idx < stacks_to_allocate;
+ stack_idx++)
+ st[stack_idx] = stack_create();
+
+ /* Where we will store the td name, tid, etc. */
+ td_infos = malloc(sizeof(struct sbuf *) * stacks_to_allocate,
+ M_TEMP, M_WAITOK);
+ for (stack_idx = 0; stack_idx < stacks_to_allocate;
+ stack_idx++)
+ td_infos[stack_idx] = sbuf_new(NULL, NULL,
+ MAXCOMLEN + sizeof(struct thread *) * 2 + 40,
+ SBUF_FIXEDLEN);
+
+ sleepq_lock(wchan);
+ sq = sleepq_lookup(wchan);
+ if (sq == NULL) {
+ /* This sleepq does not exist; exit and return ENOENT. */
+ error = ENOENT;
+ finished = true;
+ sleepq_release(wchan);
+ goto loop_end;
+ }
+
+ stack_idx = 0;
+ /* Save thread info */
+ TAILQ_FOREACH_SAFE(td, &sq->sq_blocked[queue], td_slpq,
+ td_next) {
+ if (stack_idx >= stacks_to_allocate)
+ goto loop_end;
+
+ /* Note the td_lock is equal to the sleepq_lock here. */
+ stack_save_td(st[stack_idx], td);
+
+ sbuf_printf(td_infos[stack_idx], "%d: %s %p",
+ td->td_tid, td->td_name, td);
+
+ ++stack_idx;
+ }
+
+ finished = true;
+ sleepq_release(wchan);
+
+ /* Print the stacks */
+ for (i = 0; i < stack_idx; i++) {
+ sbuf_finish(td_infos[i]);
+ sbuf_printf(sb, "--- thread %s: ---\n", sbuf_data(td_infos[i]));
+ stack_sbuf_print(sb, st[i]);
+ sbuf_printf(sb, "\n");
+
+ error = sbuf_error(sb);
+ if (error == 0)
+ *count_stacks_printed = stack_idx;
+ }
+
+loop_end:
+ if (!finished)
+ sleepq_release(wchan);
+ for (stack_idx = 0; stack_idx < stacks_to_allocate;
+ stack_idx++)
+ stack_destroy(st[stack_idx]);
+ for (stack_idx = 0; stack_idx < stacks_to_allocate;
+ stack_idx++)
+ sbuf_delete(td_infos[stack_idx]);
+ free(st, M_TEMP);
+ free(td_infos, M_TEMP);
+ stacks_to_allocate *= 10;
+ }
+
+ if (!finished && error == 0)
+ error = ENOMEM;
+
+ return (error);
+}
+#endif
+
#ifdef SLEEPQUEUE_PROFILING
#define SLEEPQ_PROF_LOCATIONS 1024
#define SLEEPQ_SBUFSIZE 512
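sleepq_sbuf_print_stacks() cannot allocate while holding the sleepqueue spinlock, so it preallocates, takes the lock, and, if the preallocation was too small, drops the lock, frees everything, and retries with ten times as much, giving up after three rounds. A compact userland sketch of that retry shape (the names and the fake 'need' value are illustrative):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	int allocated = 10, need = 250, error = ENOMEM;

	for (int attempt = 0; attempt < 3; attempt++) {
		void **slots = malloc((size_t)allocated * sizeof(*slots));
		if (slots == NULL)
			break;
		/* ...take the lock and try to capture up to 'allocated'
		 * items; here 'need' stands in for the real demand... */
		if (need <= allocated) {
			error = 0;
			free(slots);
			break;
		}
		/* Too small: drop the lock, free, and retry with more. */
		free(slots);
		allocated *= 10;
	}
	printf("error=%d allocated=%d\n", error, allocated);
	return (error);
}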
diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c
index 82349f8..9848136 100644
--- a/sys/kern/subr_smp.c
+++ b/sys/kern/subr_smp.c
@@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$");
#include <sys/proc.h>
#include <sys/bus.h>
#include <sys/lock.h>
+#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/sched.h>
@@ -51,6 +52,10 @@ __FBSDID("$FreeBSD$");
#include "opt_sched.h"
#ifdef SMP
+MALLOC_DEFINE(M_TOPO, "toponodes", "SMP topology data");
+#endif
+
+#ifdef SMP
volatile cpuset_t stopped_cpus;
volatile cpuset_t started_cpus;
volatile cpuset_t suspended_cpus;
@@ -556,7 +561,7 @@ smp_rendezvous(void (* setup_func)(void *),
smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func, arg);
}
-static struct cpu_group group[MAXCPU];
+static struct cpu_group group[MAXCPU * MAX_CACHE_LEVELS + 1];
struct cpu_group *
smp_topo(void)
@@ -616,6 +621,17 @@ smp_topo(void)
}
struct cpu_group *
+smp_topo_alloc(u_int count)
+{
+ static u_int index;
+ u_int curr;
+
+ curr = index;
+ index += count;
+ return (&group[curr]);
+}
+
+struct cpu_group *
smp_topo_none(void)
{
struct cpu_group *top;
@@ -861,3 +877,233 @@ sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS)
return (error);
}
+
+#ifdef SMP
+void
+topo_init_node(struct topo_node *node)
+{
+
+ bzero(node, sizeof(*node));
+ TAILQ_INIT(&node->children);
+}
+
+void
+topo_init_root(struct topo_node *root)
+{
+
+ topo_init_node(root);
+ root->type = TOPO_TYPE_SYSTEM;
+}
+
+struct topo_node *
+topo_add_node_by_hwid(struct topo_node *parent, int hwid,
+ topo_node_type type, uintptr_t subtype)
+{
+ struct topo_node *node;
+
+ TAILQ_FOREACH_REVERSE(node, &parent->children,
+ topo_children, siblings) {
+ if (node->hwid == hwid
+ && node->type == type && node->subtype == subtype) {
+ return (node);
+ }
+ }
+
+ node = malloc(sizeof(*node), M_TOPO, M_WAITOK);
+ topo_init_node(node);
+ node->parent = parent;
+ node->hwid = hwid;
+ node->type = type;
+ node->subtype = subtype;
+ TAILQ_INSERT_TAIL(&parent->children, node, siblings);
+ parent->nchildren++;
+
+ return (node);
+}
+
+struct topo_node *
+topo_find_node_by_hwid(struct topo_node *parent, int hwid,
+ topo_node_type type, uintptr_t subtype)
+{
+
+ struct topo_node *node;
+
+ TAILQ_FOREACH(node, &parent->children, siblings) {
+ if (node->hwid == hwid
+ && node->type == type && node->subtype == subtype) {
+ return (node);
+ }
+ }
+
+ return (NULL);
+}
+
+void
+topo_promote_child(struct topo_node *child)
+{
+ struct topo_node *next;
+ struct topo_node *node;
+ struct topo_node *parent;
+
+ parent = child->parent;
+ next = TAILQ_NEXT(child, siblings);
+ TAILQ_REMOVE(&parent->children, child, siblings);
+ TAILQ_INSERT_HEAD(&parent->children, child, siblings);
+
+ while (next != NULL) {
+ node = next;
+ next = TAILQ_NEXT(node, siblings);
+ TAILQ_REMOVE(&parent->children, node, siblings);
+ TAILQ_INSERT_AFTER(&parent->children, child, node, siblings);
+ child = node;
+ }
+}
+
+struct topo_node *
+topo_next_node(struct topo_node *top, struct topo_node *node)
+{
+ struct topo_node *next;
+
+ if ((next = TAILQ_FIRST(&node->children)) != NULL)
+ return (next);
+
+ if ((next = TAILQ_NEXT(node, siblings)) != NULL)
+ return (next);
+
+ while ((node = node->parent) != top)
+ if ((next = TAILQ_NEXT(node, siblings)) != NULL)
+ return (next);
+
+ return (NULL);
+}
+
+struct topo_node *
+topo_next_nonchild_node(struct topo_node *top, struct topo_node *node)
+{
+ struct topo_node *next;
+
+ if ((next = TAILQ_NEXT(node, siblings)) != NULL)
+ return (next);
+
+ while ((node = node->parent) != top)
+ if ((next = TAILQ_NEXT(node, siblings)) != NULL)
+ return (next);
+
+ return (NULL);
+}
+
+void
+topo_set_pu_id(struct topo_node *node, cpuid_t id)
+{
+
+ KASSERT(node->type == TOPO_TYPE_PU,
+ ("topo_set_pu_id: wrong node type: %u", node->type));
+ KASSERT(CPU_EMPTY(&node->cpuset) && node->cpu_count == 0,
+ ("topo_set_pu_id: cpuset already not empty"));
+ node->id = id;
+ CPU_SET(id, &node->cpuset);
+ node->cpu_count = 1;
+ node->subtype = 1;
+
+ while ((node = node->parent) != NULL) {
+ if (CPU_ISSET(id, &node->cpuset))
+ break;
+ CPU_SET(id, &node->cpuset);
+ node->cpu_count++;
+ }
+}
+
+int
+topo_analyze(struct topo_node *topo_root, int all,
+ int *pkg_count, int *cores_per_pkg, int *thrs_per_core)
+{
+ struct topo_node *pkg_node;
+ struct topo_node *core_node;
+ struct topo_node *pu_node;
+ int thrs_per_pkg;
+ int cpp_counter;
+ int tpc_counter;
+ int tpp_counter;
+
+ *pkg_count = 0;
+ *cores_per_pkg = -1;
+ *thrs_per_core = -1;
+ thrs_per_pkg = -1;
+ pkg_node = topo_root;
+ while (pkg_node != NULL) {
+ if (pkg_node->type != TOPO_TYPE_PKG) {
+ pkg_node = topo_next_node(topo_root, pkg_node);
+ continue;
+ }
+ if (!all && CPU_EMPTY(&pkg_node->cpuset)) {
+ pkg_node = topo_next_nonchild_node(topo_root, pkg_node);
+ continue;
+ }
+
+ (*pkg_count)++;
+
+ cpp_counter = 0;
+ tpp_counter = 0;
+ core_node = pkg_node;
+ while (core_node != NULL) {
+ if (core_node->type == TOPO_TYPE_CORE) {
+ if (!all && CPU_EMPTY(&core_node->cpuset)) {
+ core_node =
+ topo_next_nonchild_node(pkg_node,
+ core_node);
+ continue;
+ }
+
+ cpp_counter++;
+
+ tpc_counter = 0;
+ pu_node = core_node;
+ while (pu_node != NULL) {
+ if (pu_node->type == TOPO_TYPE_PU &&
+ (all || !CPU_EMPTY(&pu_node->cpuset)))
+ tpc_counter++;
+ pu_node = topo_next_node(core_node,
+ pu_node);
+ }
+
+ if (*thrs_per_core == -1)
+ *thrs_per_core = tpc_counter;
+ else if (*thrs_per_core != tpc_counter)
+ return (0);
+
+ core_node = topo_next_nonchild_node(pkg_node,
+ core_node);
+ } else {
+ /* PU node directly under PKG. */
+ if (core_node->type == TOPO_TYPE_PU &&
+ (all || !CPU_EMPTY(&core_node->cpuset)))
+ tpp_counter++;
+ core_node = topo_next_node(pkg_node,
+ core_node);
+ }
+ }
+
+ if (*cores_per_pkg == -1)
+ *cores_per_pkg = cpp_counter;
+ else if (*cores_per_pkg != cpp_counter)
+ return (0);
+ if (thrs_per_pkg == -1)
+ thrs_per_pkg = tpp_counter;
+ else if (thrs_per_pkg != tpp_counter)
+ return (0);
+
+ pkg_node = topo_next_nonchild_node(topo_root, pkg_node);
+ }
+
+ KASSERT(*pkg_count > 0,
+ ("bug in topology or analysis"));
+ if (*cores_per_pkg == 0) {
+ KASSERT(*thrs_per_core == -1 && thrs_per_pkg > 0,
+ ("bug in topology or analysis"));
+ *thrs_per_core = thrs_per_pkg;
+ }
+
+ return (1);
+}
+#endif /* SMP */
+
diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c
index fe79f52..75fb66e 100644
--- a/sys/kern/sys_generic.c
+++ b/sys/kern/sys_generic.c
@@ -89,12 +89,14 @@ __FBSDID("$FreeBSD$");
#define SYS_IOCTL_SMALL_SIZE 128 /* bytes */
#define SYS_IOCTL_SMALL_ALIGN 8 /* bytes */
-int iosize_max_clamp = 0;
+#ifdef __LP64__
+static int iosize_max_clamp = 0;
SYSCTL_INT(_debug, OID_AUTO, iosize_max_clamp, CTLFLAG_RW,
&iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX");
-int devfs_iosize_max_clamp = 1;
+static int devfs_iosize_max_clamp = 1;
SYSCTL_INT(_debug, OID_AUTO, devfs_iosize_max_clamp, CTLFLAG_RW,
&devfs_iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX for devices");
+#endif
/*
* Assert that the return value of read(2) and write(2) syscalls fits
@@ -159,6 +161,24 @@ struct selfd {
static uma_zone_t selfd_zone;
static struct mtx_pool *mtxpool_select;
+#ifdef __LP64__
+size_t
+devfs_iosize_max(void)
+{
+
+ return (devfs_iosize_max_clamp || SV_CURPROC_FLAG(SV_ILP32) ?
+ INT_MAX : SSIZE_MAX);
+}
+
+size_t
+iosize_max(void)
+{
+
+ return (iosize_max_clamp || SV_CURPROC_FLAG(SV_ILP32) ?
+ INT_MAX : SSIZE_MAX);
+}
+#endif
+
#ifndef _SYS_SYSPROTO_H_
struct read_args {
int fd;
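The sys_generic.c hunk makes the clamp sysctls static and LP64-only, exposing them instead through iosize_max() and devfs_iosize_max(), which return INT_MAX for clamped or 32-bit (SV_ILP32) processes and SSIZE_MAX otherwise. A minimal sketch of bounding a request with the helper (check_iosize() is hypothetical); on 32-bit kernels the helper is not compiled in, so the limit is simply INT_MAX:

static int
check_iosize(size_t nbytes)
{
#ifdef __LP64__
	/* iosize_max() yields INT_MAX or SSIZE_MAX as defined above. */
	if (nbytes > iosize_max())
		return (EINVAL);
#else
	if (nbytes > INT_MAX)
		return (EINVAL);
#endif
	return (0);
}
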
diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c
index 44ddd81..3575028 100644
--- a/sys/kern/syscalls.c
+++ b/sys/kern/syscalls.c
@@ -3,7 +3,7 @@
*
* DO NOT EDIT-- this file is automatically generated.
* $FreeBSD$
- * created from FreeBSD: head/sys/kern/syscalls.master 296572 2016-03-09 19:05:11Z jhb
+ * created from FreeBSD: head/sys/kern/syscalls.master 297167 2016-03-21 21:37:33Z jhb
*/
const char *syscallnames[] = {
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
index f08f5e3..5675425 100644
--- a/sys/kern/syscalls.master
+++ b/sys/kern/syscalls.master
@@ -554,7 +554,7 @@
312 AUE_SETRESGID STD { int setresgid(gid_t rgid, gid_t egid, \
gid_t sgid); }
313 AUE_NULL OBSOL signanosleep
-314 AUE_NULL STD { int aio_return(struct aiocb *aiocbp); }
+314 AUE_NULL STD { ssize_t aio_return(struct aiocb *aiocbp); }
315 AUE_NULL STD { int aio_suspend( \
struct aiocb * const * aiocbp, int nent, \
const struct timespec *timeout); }
@@ -643,7 +643,7 @@
358 AUE_EXTATTR_DELETE_FILE STD { int extattr_delete_file(const char *path, \
int attrnamespace, \
const char *attrname); }
-359 AUE_NULL STD { int aio_waitcomplete( \
+359 AUE_NULL STD { ssize_t aio_waitcomplete( \
struct aiocb **aiocbp, \
struct timespec *timeout); }
360 AUE_GETRESUID STD { int getresuid(uid_t *ruid, uid_t *euid, \
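Widening aio_return() and aio_waitcomplete() to ssize_t lets completed transfers larger than INT_MAX be reported without truncation. A userland usage sketch (wait_for_aio() is hypothetical and not from this commit), storing the count in a ssize_t as the widened prototype expects:

#include <aio.h>
#include <errno.h>

static ssize_t
wait_for_aio(struct aiocb *acb)
{
	const struct aiocb *list[1] = { acb };

	/* Block until the request leaves EINPROGRESS, then collect its result. */
	while (aio_error(acb) == EINPROGRESS)
		(void)aio_suspend(list, 1, NULL);
	return (aio_return(acb));	/* ssize_t after this change */
}
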
diff --git a/sys/kern/systrace_args.c b/sys/kern/systrace_args.c
index 6fd03f1..c2ea2b4 100644
--- a/sys/kern/systrace_args.c
+++ b/sys/kern/systrace_args.c
@@ -9796,7 +9796,7 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
/* aio_return */
case 314:
if (ndx == 0 || ndx == 1)
- p = "int";
+ p = "ssize_t";
break;
/* aio_suspend */
case 315:
@@ -9972,7 +9972,7 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
/* aio_waitcomplete */
case 359:
if (ndx == 0 || ndx == 1)
- p = "int";
+ p = "ssize_t";
break;
/* getresuid */
case 360:
diff --git a/sys/kern/uipc_mbuf.c b/sys/kern/uipc_mbuf.c
index 23bfbff..62dfda6 100644
--- a/sys/kern/uipc_mbuf.c
+++ b/sys/kern/uipc_mbuf.c
@@ -47,6 +47,49 @@ __FBSDID("$FreeBSD$");
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/uio.h>
+#include <sys/sdt.h>
+
+SDT_PROBE_DEFINE5_XLATE(sdt, , , m__init,
+ "struct mbuf *", "mbufinfo_t *",
+ "uint32_t", "uint32_t",
+ "uint16_t", "uint16_t",
+ "uint32_t", "uint32_t",
+ "uint32_t", "uint32_t");
+
+SDT_PROBE_DEFINE3_XLATE(sdt, , , m__gethdr,
+ "uint32_t", "uint32_t",
+ "uint16_t", "uint16_t",
+ "struct mbuf *", "mbufinfo_t *");
+
+SDT_PROBE_DEFINE3_XLATE(sdt, , , m__get,
+ "uint32_t", "uint32_t",
+ "uint16_t", "uint16_t",
+ "struct mbuf *", "mbufinfo_t *");
+
+SDT_PROBE_DEFINE4_XLATE(sdt, , , m__getcl,
+ "uint32_t", "uint32_t",
+ "uint16_t", "uint16_t",
+ "uint32_t", "uint32_t",
+ "struct mbuf *", "mbufinfo_t *");
+
+SDT_PROBE_DEFINE3_XLATE(sdt, , , m__clget,
+ "struct mbuf *", "mbufinfo_t *",
+ "uint32_t", "uint32_t",
+ "uint32_t", "uint32_t");
+
+SDT_PROBE_DEFINE4_XLATE(sdt, , , m__cljget,
+ "struct mbuf *", "mbufinfo_t *",
+ "uint32_t", "uint32_t",
+ "uint32_t", "uint32_t",
+ "void*", "void*");
+
+SDT_PROBE_DEFINE(sdt, , , m__cljset);
+
+SDT_PROBE_DEFINE1_XLATE(sdt, , , m__free,
+ "struct mbuf *", "mbufinfo_t *");
+
+SDT_PROBE_DEFINE1_XLATE(sdt, , , m__freem,
+ "struct mbuf *", "mbufinfo_t *");
#include <security/mac/mac_framework.h>
@@ -1627,7 +1670,7 @@ m_unshare(struct mbuf *m0, int how)
* don't know how to break up the non-contiguous memory when
* doing DMA.
*/
- n = m_getcl(how, m->m_type, m->m_flags);
+ n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS);
if (n == NULL) {
m_freem(m0);
return (NULL);
@@ -1657,7 +1700,7 @@ m_unshare(struct mbuf *m0, int how)
break;
off += cc;
- n = m_getcl(how, m->m_type, m->m_flags);
+ n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS);
if (n == NULL) {
m_freem(mfirst);
m_freem(m0);
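The SDT_PROBE_DEFINE*_XLATE() lines above declare statically-defined tracing probes whose struct mbuf * arguments are translated into a mbufinfo_t view for DTrace consumers. A sketch of how one of them could be fired (m_gethdr_traced() is hypothetical; the committed probe sites live in the mbuf allocation paths themselves):

static struct mbuf *
m_gethdr_traced(int how, short type)
{
	struct mbuf *m;

	m = m_gethdr(how, type);
	/* Fire the three-argument m__gethdr probe defined above. */
	SDT_PROBE3(sdt, , , m__gethdr, how, type, m);
	return (m);
}
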
diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c
index 35d17b1..d873217 100644
--- a/sys/kern/uipc_socket.c
+++ b/sys/kern/uipc_socket.c
@@ -358,7 +358,7 @@ sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
&maxsockets, 0, sysctl_maxsockets, "IU",
- "Maximum number of sockets avaliable");
+ "Maximum number of sockets available");
/*
* Socket operation routines. These routines are called by the routines in
diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c
index 27fa239..20df141 100644
--- a/sys/kern/vfs_aio.c
+++ b/sys/kern/vfs_aio.c
@@ -743,7 +743,7 @@ aio_process_rw(struct kaiocb *job)
struct file *fp;
struct uio auio;
struct iovec aiov;
- int cnt;
+ ssize_t cnt;
int error;
int oublock_st, oublock_end;
int inblock_st, inblock_end;
@@ -1173,7 +1173,7 @@ aio_qphysio(struct proc *p, struct kaiocb *job)
struct cdevsw *csw;
struct cdev *dev;
struct kaioinfo *ki;
- int error, ref, unmap, poff;
+ int error, ref, poff;
vm_prot_t prot;
cb = &job->uaiocb;
@@ -1206,12 +1206,13 @@ aio_qphysio(struct proc *p, struct kaiocb *job)
ki = p->p_aioinfo;
poff = (vm_offset_t)cb->aio_buf & PAGE_MASK;
- unmap = ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed);
- if (unmap) {
+ if ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed) {
if (cb->aio_nbytes > MAXPHYS) {
error = -1;
goto unref;
}
+
+ pbuf = NULL;
} else {
if (cb->aio_nbytes > MAXPHYS - poff) {
error = -1;
@@ -1221,17 +1222,14 @@ aio_qphysio(struct proc *p, struct kaiocb *job)
error = -1;
goto unref;
}
- }
- job->bp = bp = g_alloc_bio();
- if (!unmap) {
+
job->pbuf = pbuf = (struct buf *)getpbuf(NULL);
BUF_KERNPROC(pbuf);
- }
-
- AIO_LOCK(ki);
- if (!unmap)
+ AIO_LOCK(ki);
ki->kaio_buffer_count++;
- AIO_UNLOCK(ki);
+ AIO_UNLOCK(ki);
+ }
+ job->bp = bp = g_alloc_bio();
bp->bio_length = cb->aio_nbytes;
bp->bio_bcount = cb->aio_nbytes;
@@ -1245,17 +1243,18 @@ aio_qphysio(struct proc *p, struct kaiocb *job)
prot = VM_PROT_READ;
if (cb->aio_lio_opcode == LIO_READ)
prot |= VM_PROT_WRITE; /* Less backwards than it looks */
- if ((job->npages = vm_fault_quick_hold_pages(
- &curproc->p_vmspace->vm_map,
+ job->npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
(vm_offset_t)bp->bio_data, bp->bio_length, prot, job->pages,
- sizeof(job->pages)/sizeof(job->pages[0]))) < 0) {
+ nitems(job->pages));
+ if (job->npages < 0) {
error = EFAULT;
goto doerror;
}
- if (!unmap) {
+ if (pbuf != NULL) {
pmap_qenter((vm_offset_t)pbuf->b_data,
job->pages, job->npages);
bp->bio_data = pbuf->b_data + poff;
+ atomic_add_int(&num_buf_aio, 1);
} else {
bp->bio_ma = job->pages;
bp->bio_ma_n = job->npages;
@@ -1264,20 +1263,16 @@ aio_qphysio(struct proc *p, struct kaiocb *job)
bp->bio_flags |= BIO_UNMAPPED;
}
- if (!unmap)
- atomic_add_int(&num_buf_aio, 1);
-
/* Perform transfer. */
csw->d_strategy(bp);
dev_relthread(dev, ref);
return (0);
doerror:
- AIO_LOCK(ki);
- if (!unmap)
+ if (pbuf != NULL) {
+ AIO_LOCK(ki);
ki->kaio_buffer_count--;
- AIO_UNLOCK(ki);
- if (pbuf) {
+ AIO_UNLOCK(ki);
relpbuf(pbuf, NULL);
job->pbuf = NULL;
}
@@ -1446,8 +1441,7 @@ aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj,
return (error);
}
- /* XXX: aio_nbytes is later casted to signed types. */
- if (job->uaiocb.aio_nbytes > INT_MAX) {
+ if (job->uaiocb.aio_nbytes > IOSIZE_MAX) {
uma_zfree(aiocb_zone, job);
return (EINVAL);
}
@@ -1788,7 +1782,7 @@ kern_aio_return(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops)
struct proc *p = td->td_proc;
struct kaiocb *job;
struct kaioinfo *ki;
- int status, error;
+ long status, error;
ki = p->p_aioinfo;
if (ki == NULL)
@@ -2344,7 +2338,8 @@ kern_aio_waitcomplete(struct thread *td, struct aiocb **ujobp,
struct kaioinfo *ki;
struct kaiocb *job;
struct aiocb *ujob;
- int error, status, timo;
+ long error, status;
+ int timo;
ops->store_aiocb(ujobp, NULL);
diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c
index 9be0ece..9eb523c 100644
--- a/sys/kern/vfs_export.c
+++ b/sys/kern/vfs_export.c
@@ -256,10 +256,10 @@ vfs_free_addrlist_af(struct radix_node_head **prnh)
rnh = *prnh;
RADIX_NODE_HEAD_LOCK(rnh);
- (*rnh->rnh_walktree)(&rnh->rh, vfs_free_netcred, &rnh->rh);
+ (*rnh->rnh_walktree)(&rnh->rh, vfs_free_netcred, rnh);
RADIX_NODE_HEAD_UNLOCK(rnh);
RADIX_NODE_HEAD_DESTROY(rnh);
- free(rnh, M_RTABLE);
+ rn_detachhead((void **)prnh);
prnh = NULL;
}
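vfs_free_addrlist_af() now releases the per-address-family radix head with rn_detachhead() instead of a bare free(), pairing it with the rn_inithead() that created it. A rough sketch of that pairing (example_init()/example_fini() are hypothetical, and the bit-offset argument is purely illustrative):

static struct radix_node_head *rnh_example;

static int
example_init(void)
{
	/* rn_inithead() returns 0 on allocation failure, nonzero on success. */
	if (rn_inithead((void **)&rnh_example, 32) == 0)
		return (ENOMEM);
	return (0);
}

static void
example_fini(void)
{
	rn_detachhead((void **)&rnh_example);
	rnh_example = NULL;	/* defensive: do not reuse the stale pointer */
}
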
diff --git a/sys/kern/vfs_mountroot.c b/sys/kern/vfs_mountroot.c
index cf24253..a33e376 100644
--- a/sys/kern/vfs_mountroot.c
+++ b/sys/kern/vfs_mountroot.c
@@ -89,6 +89,7 @@ __FBSDID("$FreeBSD$");
static int parse_mount(char **);
static struct mntarg *parse_mountroot_options(struct mntarg *, const char *);
static int sysctl_vfs_root_mount_hold(SYSCTL_HANDLER_ARGS);
+static void vfs_mountroot_wait(void);
static int vfs_mountroot_wait_if_neccessary(const char *fs, const char *dev);
/*
@@ -488,6 +489,8 @@ parse_dir_ask(char **conf)
char *mnt;
int error;
+ vfs_mountroot_wait();
+
printf("\nLoader variables:\n");
parse_dir_ask_printenv("vfs.root.mountfrom");
parse_dir_ask_printenv("vfs.root.mountfrom.options");