summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c 9
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c 20
-rw-r--r-- sys/fs/ext2fs/ext2_bmap.c 8
-rw-r--r-- sys/kern/kern_physio.c 17
-rw-r--r-- sys/kern/kern_racct.c 77
-rw-r--r-- sys/kern/kern_rctl.c 214
-rw-r--r-- sys/kern/subr_trap.c 10
-rw-r--r-- sys/kern/vfs_bio.c 31
-rw-r--r-- sys/kern/vfs_cluster.c 15
-rw-r--r-- sys/sys/proc.h 2
-rw-r--r-- sys/sys/racct.h 9
-rw-r--r-- sys/sys/rctl.h 4
-rw-r--r-- sys/ufs/ffs/ffs_inode.c 8
-rw-r--r-- sys/ufs/ffs/ffs_softdep.c 8
-rw-r--r-- sys/ufs/ufs/ufs_bmap.c 8
-rw-r--r-- sys/vm/vm_fault.c 16
-rw-r--r-- usr.bin/rctl/rctl.8 39
17 files changed, 449 insertions, 46 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
index f944903..534dfb2 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -132,6 +132,7 @@
#include <sys/multilist.h>
#ifdef _KERNEL
#include <sys/dnlc.h>
+#include <sys/racct.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
@@ -4503,6 +4504,14 @@ top:
demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
data, metadata, misses);
#ifdef _KERNEL
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_force(curproc, RACCT_READBPS, size);
+ racct_add_force(curproc, RACCT_READIOPS, 1);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_inblock++;
#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
index b60236f..af8d366 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
@@ -47,6 +47,7 @@
#include <sys/sa.h>
#include <sys/zfeature.h>
#ifdef _KERNEL
+#include <sys/racct.h>
#include <sys/vm.h>
#include <sys/zfs_znode.h>
#endif
@@ -427,6 +428,15 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
}
dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
+#if defined(_KERNEL) && defined(RACCT)
+ if (racct_enable && !read) {
+ PROC_LOCK(curproc);
+ racct_add_force(curproc, RACCT_WRITEBPS, length);
+ racct_add_force(curproc, RACCT_WRITEIOPS, nblks);
+ PROC_UNLOCK(curproc);
+ }
+#endif
+
zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
blkid = dbuf_whichblock(dn, 0, offset);
for (i = 0; i < nblks; i++) {
@@ -1422,7 +1432,15 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA) {
#ifdef _KERNEL
curthread->td_ru.ru_oublock++;
-#endif
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_force(curproc, RACCT_WRITEBPS, blksz);
+ racct_add_force(curproc, RACCT_WRITEIOPS, 1);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
+#endif /* _KERNEL */
dbuf_assign_arcbuf(db, buf, tx);
dbuf_rele(db, FTAG);
} else {
diff --git a/sys/fs/ext2fs/ext2_bmap.c b/sys/fs/ext2fs/ext2_bmap.c
index 8e5e986..7966b9b 100644
--- a/sys/fs/ext2fs/ext2_bmap.c
+++ b/sys/fs/ext2fs/ext2_bmap.c
@@ -42,6 +42,7 @@
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
+#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/stat.h>
@@ -247,6 +248,13 @@ ext2_bmaparray(struct vnode *vp, daddr_t bn, daddr_t *bnp, int *runp, int *runb)
vfs_busy_pages(bp, 0);
bp->b_iooffset = dbtob(bp->b_blkno);
bstrategy(bp);
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, bp, 0);
+ PROC_UNLOCK(curproc);
+ }
+#endif
curthread->td_ru.ru_inblock++;
error = bufwait(bp);
if (error) {
diff --git a/sys/kern/kern_physio.c b/sys/kern/kern_physio.c
index a148386..31d8f16 100644
--- a/sys/kern/kern_physio.c
+++ b/sys/kern/kern_physio.c
@@ -27,6 +27,7 @@ __FBSDID("$FreeBSD$");
#include <sys/conf.h>
#include <sys/malloc.h>
#include <sys/proc.h>
+#include <sys/racct.h>
#include <sys/uio.h>
#include <geom/geom.h>
@@ -109,6 +110,22 @@ physio(struct cdev *dev, struct uio *uio, int ioflag)
prot |= VM_PROT_WRITE; /* Less backwards than it looks */
error = 0;
for (i = 0; i < uio->uio_iovcnt; i++) {
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ if (uio->uio_rw == UIO_READ) {
+ racct_add_force(curproc, RACCT_READBPS,
+ uio->uio_iov[i].iov_len);
+ racct_add_force(curproc, RACCT_READIOPS, 1);
+ } else {
+ racct_add_force(curproc, RACCT_WRITEBPS,
+ uio->uio_iov[i].iov_len);
+ racct_add_force(curproc, RACCT_WRITEIOPS, 1);
+ }
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
+
while (uio->uio_iov[i].iov_len) {
g_reset_bio(bp);
if (uio->uio_rw == UIO_READ) {
diff --git a/sys/kern/kern_racct.c b/sys/kern/kern_racct.c
index bbd50ca..438a249 100644
--- a/sys/kern/kern_racct.c
+++ b/sys/kern/kern_racct.c
@@ -35,6 +35,7 @@ __FBSDID("$FreeBSD$");
#include "opt_sched.h"
#include <sys/param.h>
+#include <sys/buf.h>
#include <sys/systm.h>
#include <sys/eventhandler.h>
#include <sys/jail.h>
@@ -177,7 +178,15 @@ int racct_types[] = {
[RACCT_WALLCLOCK] =
RACCT_IN_MILLIONS,
[RACCT_PCTCPU] =
- RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS };
+ RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS,
+ [RACCT_READBPS] =
+ RACCT_DECAYING,
+ [RACCT_WRITEBPS] =
+ RACCT_DECAYING,
+ [RACCT_READIOPS] =
+ RACCT_DECAYING,
+ [RACCT_WRITEIOPS] =
+ RACCT_DECAYING };
static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE;
@@ -634,6 +643,28 @@ racct_add_cred(struct ucred *cred, int resource, uint64_t amount)
RACCT_UNLOCK();
}
+/*
+ * Account for disk IO resource consumption by process 'p'.  Checks for
+ * limits, but never fails, due to disk limits being undeniable.
+ *
+ * 'bp' supplies the transfer size (b_bcount).  A non-zero 'is_write'
+ * charges RACCT_WRITEBPS and RACCT_WRITEIOPS; otherwise RACCT_READBPS and
+ * RACCT_READIOPS are charged.  The caller must hold the proc lock of 'p'.
+ */
+void
+racct_add_buf(struct proc *p, const struct buf *bp, int is_write)
+{
+
+	ASSERT_RACCT_ENABLED();
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+
+	/*
+	 * Charge the process whose lock was asserted above, rather than
+	 * curproc, so the lock check and the accounting target agree.
+	 */
+	RACCT_LOCK();
+	if (is_write) {
+		racct_add_locked(p, RACCT_WRITEBPS, bp->b_bcount, 1);
+		racct_add_locked(p, RACCT_WRITEIOPS, 1, 1);
+	} else {
+		racct_add_locked(p, RACCT_READBPS, bp->b_bcount, 1);
+		racct_add_locked(p, RACCT_READIOPS, 1, 1);
+	}
+	RACCT_UNLOCK();
+}
+
static int
racct_set_locked(struct proc *p, int resource, uint64_t amount, int force)
{
@@ -655,7 +686,7 @@ racct_set_locked(struct proc *p, int resource, uint64_t amount, int force)
* The diffs may be negative.
*/
diff_proc = amount - old_amount;
- if (RACCT_IS_DECAYING(resource)) {
+ if (resource == RACCT_PCTCPU) {
/*
* Resources in per-credential racct containers may decay.
* If this is the case, we need to calculate the difference
@@ -1043,14 +1074,19 @@ racct_move(struct racct *dest, struct racct *src)
RACCT_UNLOCK();
}
-static void
-racct_proc_throttle(struct proc *p)
+/*
+ * Make the process sleep in userret() for 'timeout' ticks. Setting
+ * timeout to -1 makes it sleep until woken up by racct_proc_wakeup().
+ */
+void
+racct_proc_throttle(struct proc *p, int timeout)
{
struct thread *td;
#ifdef SMP
int cpuid;
#endif
+ KASSERT(timeout != 0, ("timeout %d", timeout));
ASSERT_RACCT_ENABLED();
PROC_LOCK_ASSERT(p, MA_OWNED);
@@ -1058,10 +1094,13 @@ racct_proc_throttle(struct proc *p)
* Do not block kernel processes. Also do not block processes with
* low %cpu utilization to improve interactivity.
*/
- if (((p->p_flag & (P_SYSTEM | P_KPROC)) != 0) ||
- (p->p_racct->r_resources[RACCT_PCTCPU] <= pcpu_threshold))
+ if ((p->p_flag & (P_SYSTEM | P_KPROC)) != 0)
return;
- p->p_throttled = 1;
+
+ if (p->p_throttled < 0 || (timeout > 0 && p->p_throttled > timeout))
+ return;
+
+ p->p_throttled = timeout;
FOREACH_THREAD_IN_PROC(p, td) {
thread_lock(td);
@@ -1102,7 +1141,7 @@ racct_proc_wakeup(struct proc *p)
PROC_LOCK_ASSERT(p, MA_OWNED);
- if (p->p_throttled) {
+ if (p->p_throttled != 0) {
p->p_throttled = 0;
wakeup(p->p_racct);
}
@@ -1116,6 +1155,13 @@ racct_decay_callback(struct racct *racct, void *dummy1, void *dummy2)
ASSERT_RACCT_ENABLED();
RACCT_LOCK_ASSERT();
+#ifdef RCTL
+ rctl_throttle_decay(racct, RACCT_READBPS);
+ rctl_throttle_decay(racct, RACCT_WRITEBPS);
+ rctl_throttle_decay(racct, RACCT_READIOPS);
+ rctl_throttle_decay(racct, RACCT_WRITEIOPS);
+#endif
+
r_old = racct->r_resources[RACCT_PCTCPU];
/* If there is nothing to decay, just exit. */
@@ -1206,6 +1252,12 @@ racctd(void)
pct_estimate = 0;
pct = racct_getpcpu(p, pct_estimate);
RACCT_LOCK();
+#ifdef RCTL
+ rctl_throttle_decay(p->p_racct, RACCT_READBPS);
+ rctl_throttle_decay(p->p_racct, RACCT_WRITEBPS);
+ rctl_throttle_decay(p->p_racct, RACCT_READIOPS);
+ rctl_throttle_decay(p->p_racct, RACCT_WRITEIOPS);
+#endif
racct_set_locked(p, RACCT_PCTCPU, pct, 1);
racct_set_locked(p, RACCT_CPU, runtime, 0);
racct_set_locked(p, RACCT_WALLCLOCK,
@@ -1228,10 +1280,13 @@ racctd(void)
continue;
}
- if (racct_pcpu_available(p) <= 0)
- racct_proc_throttle(p);
- else if (p->p_throttled)
+ if (racct_pcpu_available(p) <= 0) {
+ if (p->p_racct->r_resources[RACCT_PCTCPU] >
+ pcpu_threshold)
+ racct_proc_throttle(p, -1);
+ } else if (p->p_throttled == -1) {
racct_proc_wakeup(p);
+ }
PROC_UNLOCK(p);
}
sx_sunlock(&allproc_lock);
diff --git a/sys/kern/kern_rctl.c b/sys/kern/kern_rctl.c
index 7f6a7ad..8f301b8 100644
--- a/sys/kern/kern_rctl.c
+++ b/sys/kern/kern_rctl.c
@@ -77,9 +77,13 @@ FEATURE(rctl, "Resource Limits");
#define RCTL_PCPU_SHIFT (10 * 1000000)
-unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
+static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
static int rctl_log_rate_limit = 10;
static int rctl_devctl_rate_limit = 10;
+static unsigned int rctl_throttle_min = 0;
+static unsigned int rctl_throttle_max = 0;
+static unsigned int rctl_throttle_pct = 0;
+static unsigned int rctl_throttle_pct2 = 0;
SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW, 0, "Resource Limits");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN,
@@ -88,6 +92,16 @@ SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, log_rate_limit, CTLFLAG_RW,
&rctl_log_rate_limit, 0, "Maximum number of log messages per second");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RW,
&rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second");
+SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, throttle_min, CTLFLAG_RDTUN,
+ &rctl_throttle_min, 0, "Shortest throttling duration, in hz");
+SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, throttle_max, CTLFLAG_RDTUN,
+ &rctl_throttle_max, 0, "Longest throttling duration, in hz");
+SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, throttle_pct, CTLFLAG_RDTUN,
+ &rctl_throttle_pct, 0,
+ "Throttling penalty for process consumption, in percent");
+SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, throttle_pct2, CTLFLAG_RDTUN,
+ &rctl_throttle_pct2, 0,
+ "Throttling penalty for container consumption, in percent");
/*
* 'rctl_rule_link' connects a rule with every racct it's related to.
@@ -134,6 +148,10 @@ static struct dict resourcenames[] = {
{ "shmsize", RACCT_SHMSIZE },
{ "wallclock", RACCT_WALLCLOCK },
{ "pcpu", RACCT_PCTCPU },
+ { "readbps", RACCT_READBPS },
+ { "writebps", RACCT_WRITEBPS },
+ { "readiops", RACCT_READIOPS },
+ { "writeiops", RACCT_WRITEIOPS },
{ NULL, -1 }};
static struct dict actionnames[] = {
@@ -171,6 +189,7 @@ static struct dict actionnames[] = {
{ "deny", RCTL_ACTION_DENY },
{ "log", RCTL_ACTION_LOG },
{ "devctl", RCTL_ACTION_DEVCTL },
+ { "throttle", RCTL_ACTION_THROTTLE },
{ NULL, -1 }};
static void rctl_init(void);
@@ -274,23 +293,53 @@ rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
}
/*
- * Return non-zero if allocating 'amount' by proc 'p' would exceed
- * resource limit specified by 'rule'.
+ * Called every second for proc, uidinfo, loginclass, and jail containers.
+ * If the limit isn't exceeded, it decreases the usage amount to zero.
+ * Otherwise, it decreases it by the value of the limit. This way
+ * resource consumption exceeding the limit "carries over" to the next
+ * period.
*/
-static int
-rctl_would_exceed(const struct proc *p, const struct rctl_rule *rule,
- int64_t amount)
+void
+rctl_throttle_decay(struct racct *racct, int resource)
{
- int64_t available;
+ struct rctl_rule *rule;
+ struct rctl_rule_link *link;
+ int64_t minavailable;
ASSERT_RACCT_ENABLED();
- RCTL_LOCK_ASSERT();
- available = rctl_available_resource(p, rule);
- if (available >= amount)
- return (0);
+ minavailable = INT64_MAX;
- return (1);
+ RCTL_RLOCK();
+
+ LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
+ rule = link->rrl_rule;
+
+ if (rule->rr_resource != resource)
+ continue;
+ if (rule->rr_action != RCTL_ACTION_THROTTLE)
+ continue;
+
+ if (rule->rr_amount < minavailable)
+ minavailable = rule->rr_amount;
+ }
+
+ RCTL_RUNLOCK();
+
+ if (racct->r_resources[resource] < minavailable) {
+ racct->r_resources[resource] = 0;
+ } else {
+ /*
+ * Cap utilization counter at ten times the limit. Otherwise,
+ * if we changed the rule lowering the allowed amount, it could
+ * take unreasonably long time for the accumulated resource
+ * usage to drop.
+ */
+ if (racct->r_resources[resource] > minavailable * 10)
+ racct->r_resources[resource] = minavailable * 10;
+
+ racct->r_resources[resource] -= minavailable;
+ }
}
/*
@@ -340,6 +389,38 @@ rctl_pcpu_available(const struct proc *p) {
return (minavailable);
}
+/*
+ * Saturating unsigned 64-bit addition: yields a + b, or UINT64_MAX when
+ * the sum does not fit.
+ */
+static uint64_t
+xadd(uint64_t a, uint64_t b)
+{
+	uint64_t sum = a + b;
+
+	/* Unsigned wrap-around leaves the sum below the operands. */
+	return ((sum < a || sum < b) ? UINT64_MAX : sum);
+}
+
+/*
+ * Saturating unsigned 64-bit multiplication: yields a * b, or UINT64_MAX
+ * when the product does not fit.
+ */
+static uint64_t
+xmul(uint64_t a, uint64_t b)
+{
+
+	if (a == 0 || b == 0)
+		return (0);
+
+	/*
+	 * A wrapped product is not necessarily smaller than its operands
+	 * (e.g. 3 * 0x8000000000000001 wraps to 0x8000000000000003), so
+	 * overflow has to be detected before multiplying.
+	 */
+	if (a > UINT64_MAX / b)
+		return (UINT64_MAX);
+
+	return (a * b);
+}
+
/*
* Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
* to what it keeps allocated now. Returns non-zero if the allocation should
@@ -353,9 +434,12 @@ rctl_enforce(struct proc *p, int resource, uint64_t amount)
struct rctl_rule *rule;
struct rctl_rule_link *link;
struct sbuf sb;
+ int64_t available;
+ uint64_t sleep_ms, sleep_ratio;
int should_deny = 0;
char *buf;
+
ASSERT_RACCT_ENABLED();
RCTL_RLOCK();
@@ -368,7 +452,9 @@ rctl_enforce(struct proc *p, int resource, uint64_t amount)
rule = link->rrl_rule;
if (rule->rr_resource != resource)
continue;
- if (!rctl_would_exceed(p, rule, amount)) {
+
+ available = rctl_available_resource(p, rule);
+ if (available >= (int64_t)amount) {
link->rrl_exceeded = 0;
continue;
}
@@ -421,7 +507,7 @@ rctl_enforce(struct proc *p, int resource, uint64_t amount)
if (p->p_state != PRS_NORMAL)
continue;
-
+
if (!ppsratecheck(&devctl_lasttime, &devctl_curtime,
rctl_devctl_rate_limit))
continue;
@@ -444,6 +530,69 @@ rctl_enforce(struct proc *p, int resource, uint64_t amount)
free(buf, M_RCTL);
link->rrl_exceeded = 1;
continue;
+ case RCTL_ACTION_THROTTLE:
+ if (p->p_state != PRS_NORMAL)
+ continue;
+
+ /*
+ * Make the process sleep for a fraction of a second
+ * proportional to the ratio of process' resource
+ * utilization compared to the limit. The point is
+ * to penalize resource hogs: processes that consume
+ * more of the available resources sleep for longer.
+ *
+ * We're trying to defer division until the very end,
+ * to minimize the rounding effects. The following
+ * calculation could have been written in a clearer
+ * way like this:
+ *
+ * sleep_ms = hz * p->p_racct->r_resources[resource] /
+ * rule->rr_amount;
+ * sleep_ms *= rctl_throttle_pct / 100;
+ * if (sleep_ms < rctl_throttle_min)
+ * sleep_ms = rctl_throttle_min;
+ *
+ */
+ sleep_ms = xmul(hz, p->p_racct->r_resources[resource]);
+ sleep_ms = xmul(sleep_ms, rctl_throttle_pct) / 100;
+ if (sleep_ms < rctl_throttle_min * rule->rr_amount)
+ sleep_ms = rctl_throttle_min * rule->rr_amount;
+
+ /*
+ * Multiply that by the ratio of the resource
+ * consumption for the container compared to the limit,
+ * squared. In other words, a process in a container
+ * that is two times over the limit will be throttled
+ * four times as much for hitting the same rule. The
+ * point is to penalize processes more if the container
+ * itself (eg certain UID or jail) is above the limit.
+ */
+ if (available < 0)
+ sleep_ratio = -available / rule->rr_amount;
+ else
+ sleep_ratio = 0;
+ sleep_ratio = xmul(sleep_ratio, sleep_ratio);
+ sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100;
+ sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio));
+
+ /*
+ * Finally the division.
+ */
+ sleep_ms /= rule->rr_amount;
+
+ if (sleep_ms > rctl_throttle_max)
+ sleep_ms = rctl_throttle_max;
+#if 0
+ printf("%s: pid %d (%s), %jd of %jd, will sleep for %ld ms (ratio %ld, available %ld)\n",
+ __func__, p->p_pid, p->p_comm,
+ p->p_racct->r_resources[resource],
+ rule->rr_amount, sleep_ms, sleep_ratio, available);
+#endif
+
+ KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n",
+ __func__, (uintmax_t)sleep_ms, rctl_throttle_min));
+ racct_proc_throttle(p, sleep_ms);
+ continue;
default:
if (link->rrl_exceeded != 0)
continue;
@@ -1073,20 +1222,32 @@ rctl_rule_add(struct rctl_rule *rule)
KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
/*
- * Some rules just don't make sense. Note that the one below
- * cannot be rewritten using RACCT_IS_DENIABLE(); the RACCT_PCTCPU,
- * for example, is not deniable in the racct sense, but the
- * limit is enforced in a different way, so "deny" rules for %CPU
- * do make sense.
+ * Some rules just don't make sense, like "deny" rule for an undeniable
+ * resource. The exceptions are the RSS and %CPU resources - they are
+ * not deniable in the racct sense, but the limit is enforced in
+ * a different way.
*/
if (rule->rr_action == RCTL_ACTION_DENY &&
- (rule->rr_resource == RACCT_CPU ||
- rule->rr_resource == RACCT_WALLCLOCK))
+ !RACCT_IS_DENIABLE(rule->rr_resource) &&
+ rule->rr_resource != RACCT_RSS &&
+ rule->rr_resource != RACCT_PCTCPU) {
return (EOPNOTSUPP);
+ }
+
+ if (rule->rr_action == RCTL_ACTION_THROTTLE &&
+ !RACCT_IS_DECAYING(rule->rr_resource)) {
+ return (EOPNOTSUPP);
+ }
+
+ if (rule->rr_action == RCTL_ACTION_THROTTLE &&
+ rule->rr_resource == RACCT_PCTCPU) {
+ return (EOPNOTSUPP);
+ }
if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS &&
- RACCT_IS_SLOPPY(rule->rr_resource))
+ RACCT_IS_SLOPPY(rule->rr_resource)) {
return (EOPNOTSUPP);
+ }
/*
* Make sure there are no duplicated rules. Also, for the "deny"
@@ -1960,6 +2121,15 @@ rctl_init(void)
UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+
+ if (rctl_throttle_min <= 0)
+ rctl_throttle_min = 1;
+ if (rctl_throttle_max <= 0)
+ rctl_throttle_max = 2 * hz;
+ if (rctl_throttle_pct <= 0)
+ rctl_throttle_pct = 100;
+ if (rctl_throttle_pct2 <= 0)
+ rctl_throttle_pct2 = 100;
}
#else /* !RCTL */
diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c
index a371456..6d1ac70 100644
--- a/sys/kern/subr_trap.c
+++ b/sys/kern/subr_trap.c
@@ -172,10 +172,14 @@ userret(struct thread *td, struct trapframe *frame)
(td->td_vnet_lpush != NULL) ? td->td_vnet_lpush : "N/A"));
#endif
#ifdef RACCT
- if (racct_enable && p->p_throttled == 1) {
+ if (racct_enable && p->p_throttled != 0) {
PROC_LOCK(p);
- while (p->p_throttled == 1)
- msleep(p->p_racct, &p->p_mtx, 0, "racct", 0);
+ while (p->p_throttled != 0) {
+ msleep(p->p_racct, &p->p_mtx, 0, "racct",
+ p->p_throttled < 0 ? 0 : p->p_throttled);
+ if (p->p_throttled > 0)
+ p->p_throttled = 0;
+ }
PROC_UNLOCK(p);
}
#endif
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index e272f9d..b7b9641 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -61,6 +61,7 @@ __FBSDID("$FreeBSD$");
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/proc.h>
+#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/smp.h>
@@ -1784,8 +1785,16 @@ breada(struct vnode * vp, daddr_t * rablkno, int * rabsize,
rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
if ((rabp->b_flags & B_CACHE) == 0) {
- if (!TD_IS_IDLETHREAD(curthread))
+ if (!TD_IS_IDLETHREAD(curthread)) {
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, rabp, 0);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_inblock++;
+ }
rabp->b_flags |= B_ASYNC;
rabp->b_flags &= ~B_INVAL;
rabp->b_ioflags &= ~BIO_ERROR;
@@ -1829,8 +1838,16 @@ breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno,
/* if not found in cache, do some I/O */
if ((bp->b_flags & B_CACHE) == 0) {
- if (!TD_IS_IDLETHREAD(curthread))
+ if (!TD_IS_IDLETHREAD(curthread)) {
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, bp, 0);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_inblock++;
+ }
bp->b_iocmd = BIO_READ;
bp->b_flags &= ~B_INVAL;
bp->b_ioflags &= ~BIO_ERROR;
@@ -1926,8 +1943,16 @@ bufwrite(struct buf *bp)
bp->b_runningbufspace = bp->b_bufsize;
space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace);
- if (!TD_IS_IDLETHREAD(curthread))
+ if (!TD_IS_IDLETHREAD(curthread)) {
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, bp, 1);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_oublock++;
+ }
if (oldflags & B_ASYNC)
BUF_KERNPROC(bp);
bp->b_iooffset = dbtob(bp->b_blkno);
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index 9871a50..40dc0c0 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$");
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/mount.h>
+#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/vmmeter.h>
@@ -241,6 +242,13 @@ cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
BUF_KERNPROC(bp);
bp->b_iooffset = dbtob(bp->b_blkno);
bstrategy(bp);
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, bp, 0);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_inblock++;
}
@@ -294,6 +302,13 @@ cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
BUF_KERNPROC(rbp);
rbp->b_iooffset = dbtob(rbp->b_blkno);
bstrategy(rbp);
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, rbp, 0);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_inblock++;
}
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index 2d1769e..d2b617c 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -623,7 +623,7 @@ struct proc {
after fork. */
uint64_t p_prev_runtime; /* (c) Resource usage accounting. */
struct racct *p_racct; /* (b) Resource accounting. */
- u_char p_throttled; /* (c) Flag for racct pcpu throttling */
+ int p_throttled; /* (c) Flag for racct pcpu throttling */
struct vm_domain_policy p_vm_dom_policy; /* (c) process default VM domain, or -1 */
/*
* An orphan is the child that has beed re-parented to the
diff --git a/sys/sys/racct.h b/sys/sys/racct.h
index 8d1f2fa..5330c63 100644
--- a/sys/sys/racct.h
+++ b/sys/sys/racct.h
@@ -42,6 +42,7 @@
#include <sys/stdint.h>
#include <sys/sysctl.h>
+struct buf;
struct proc;
struct rctl_rule_link;
struct ucred;
@@ -71,7 +72,11 @@ struct ucred;
#define RACCT_SHMSIZE 18
#define RACCT_WALLCLOCK 19
#define RACCT_PCTCPU 20
-#define RACCT_MAX RACCT_PCTCPU
+#define RACCT_READBPS 21
+#define RACCT_WRITEBPS 22
+#define RACCT_READIOPS 23
+#define RACCT_WRITEIOPS 24
+#define RACCT_MAX RACCT_WRITEIOPS
/*
* Resource properties.
@@ -153,6 +158,7 @@ SYSCTL_DECL(_kern_racct);
int racct_add(struct proc *p, int resource, uint64_t amount);
void racct_add_cred(struct ucred *cred, int resource, uint64_t amount);
void racct_add_force(struct proc *p, int resource, uint64_t amount);
+void racct_add_buf(struct proc *p, const struct buf *bufp, int is_write);
int racct_set(struct proc *p, int resource, uint64_t amount);
void racct_set_force(struct proc *p, int resource, uint64_t amount);
void racct_sub(struct proc *p, int resource, uint64_t amount);
@@ -170,6 +176,7 @@ void racct_proc_exit(struct proc *p);
void racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred,
struct ucred *newcred);
void racct_move(struct racct *dest, struct racct *src);
+void racct_proc_throttle(struct proc *p, int timeout);
#else
diff --git a/sys/sys/rctl.h b/sys/sys/rctl.h
index e1a45a4..b9e6cd6 100644
--- a/sys/sys/rctl.h
+++ b/sys/sys/rctl.h
@@ -129,7 +129,8 @@ struct rctl_rule {
#define RCTL_ACTION_DENY (RCTL_ACTION_SIGNAL_MAX + 1)
#define RCTL_ACTION_LOG (RCTL_ACTION_SIGNAL_MAX + 2)
#define RCTL_ACTION_DEVCTL (RCTL_ACTION_SIGNAL_MAX + 3)
-#define RCTL_ACTION_MAX RCTL_ACTION_DEVCTL
+#define RCTL_ACTION_THROTTLE (RCTL_ACTION_SIGNAL_MAX + 4)
+#define RCTL_ACTION_MAX RCTL_ACTION_THROTTLE
#define RCTL_AMOUNT_UNDEFINED -1
@@ -140,6 +141,7 @@ void rctl_rule_release(struct rctl_rule *rule);
int rctl_rule_add(struct rctl_rule *rule);
int rctl_rule_remove(struct rctl_rule *filter);
int rctl_enforce(struct proc *p, int resource, uint64_t amount);
+void rctl_throttle_decay(struct racct *racct, int resource);
int64_t rctl_pcpu_available(const struct proc *p);
uint64_t rctl_get_limit(struct proc *p, int resource);
uint64_t rctl_get_available(struct proc *p, int resource);
diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c
index c8dac1b..0202820 100644
--- a/sys/ufs/ffs/ffs_inode.c
+++ b/sys/ufs/ffs/ffs_inode.c
@@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$");
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
+#include <sys/racct.h>
#include <sys/random.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
@@ -659,6 +660,13 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
vp = ITOV(ip);
bp = getblk(vp, lbn, (int)fs->fs_bsize, 0, 0, 0);
if ((bp->b_flags & B_CACHE) == 0) {
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, bp, 0);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_inblock++; /* pay for read */
bp->b_iocmd = BIO_READ;
bp->b_flags &= ~B_INVAL;
diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c
index bedc8e1..bc0693a 100644
--- a/sys/ufs/ffs/ffs_softdep.c
+++ b/sys/ufs/ffs/ffs_softdep.c
@@ -69,6 +69,7 @@ __FBSDID("$FreeBSD$");
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
+#include <sys/racct.h>
#include <sys/rwlock.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
@@ -6229,6 +6230,13 @@ setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno)
vfs_busy_pages(bp, 0);
bp->b_iooffset = dbtob(bp->b_blkno);
bstrategy(bp);
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, bp, 0);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_inblock++;
error = bufwait(bp);
if (error) {
diff --git a/sys/ufs/ufs/ufs_bmap.c b/sys/ufs/ufs/ufs_bmap.c
index 9819ef5..768298f 100644
--- a/sys/ufs/ufs/ufs_bmap.c
+++ b/sys/ufs/ufs/ufs_bmap.c
@@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$");
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
+#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/stat.h>
@@ -223,6 +224,13 @@ ufs_bmaparray(vp, bn, bnp, nbp, runp, runb)
vfs_busy_pages(bp, 0);
bp->b_iooffset = dbtob(bp->b_blkno);
bstrategy(bp);
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, bp, 0);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_inblock++;
error = bufwait(bp);
if (error) {
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index a7e3d37..13a5757 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -83,6 +83,7 @@ __FBSDID("$FreeBSD$");
#include <sys/lock.h>
#include <sys/mman.h>
#include <sys/proc.h>
+#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>
@@ -994,6 +995,21 @@ vnode_locked:
if (hardfault) {
PCPU_INC(cnt.v_io_faults);
curthread->td_ru.ru_majflt++;
+#ifdef RACCT
+ if (racct_enable && fs.object->type == OBJT_VNODE) {
+ PROC_LOCK(curproc);
+ if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) {
+ racct_add_force(curproc, RACCT_WRITEBPS,
+ PAGE_SIZE + behind * PAGE_SIZE);
+ racct_add_force(curproc, RACCT_WRITEIOPS, 1);
+ } else {
+ racct_add_force(curproc, RACCT_READBPS,
+ PAGE_SIZE + ahead * PAGE_SIZE);
+ racct_add_force(curproc, RACCT_READIOPS, 1);
+ }
+ PROC_UNLOCK(curproc);
+ }
+#endif
} else
curthread->td_ru.ru_minflt++;
diff --git a/usr.bin/rctl/rctl.8 b/usr.bin/rctl/rctl.8
index ec97623..2d92d54 100644
--- a/usr.bin/rctl/rctl.8
+++ b/usr.bin/rctl/rctl.8
@@ -25,7 +25,7 @@
.\"
.\" $FreeBSD$
.\"
-.Dd November 29, 2015
+.Dd January 30, 2016
.Dt RCTL 8
.Os
.Sh NAME
@@ -204,14 +204,22 @@ resource would be
.It Sy shmsize Ta "SysV shared memory size, in bytes"
.It Sy wallclock Ta "wallclock time, in seconds"
.It Sy pcpu Ta "%CPU, in percents of a single CPU core"
+.It Sy readbps Ta "filesystem reads, in bytes per second"
+.It Sy writebps Ta "filesystem writes, in bytes per second"
+.It Sy readiops Ta "filesystem reads, in operations per second"
+.It Sy writeiops Ta "filesystem writes, in operations per second"
.El
.Sh ACTIONS
.Bl -column -offset 3n "pseudoterminals"
.It Em action
.It Sy deny Ta deny the allocation; not supported for
-.Sy cputime
+.Sy cputime ,
+.Sy wallclock ,
+.Sy readbps ,
+.Sy writebps ,
+.Sy readiops ,
and
-.Sy wallclock
+.Sy writeiops
.It Sy log Ta "log a warning to the console"
.It Sy devctl Ta "send notification to"
.Xr devd 8
@@ -228,6 +236,12 @@ send a signal to the offending process.
See
.Xr signal 3
for a list of supported signals
+.It Sy throttle Ta "slow down process execution"; only supported for
+.Sy readbps ,
+.Sy writebps ,
+.Sy readiops ,
+and
+.Sy writeiops .
.El
.Pp
Not all actions are supported for all resources.
@@ -287,3 +301,22 @@ under sponsorship from the FreeBSD Foundation.
Limiting
.Sy memoryuse
may kill the machine due to thrashing.
+.Pp
+The
+.Sy readiops
+and
+.Sy writeiops
+counters are only approximations.
+Like
+.Sy readbps
+and
+.Sy writebps ,
+they are calculated in the filesystem layer, where it is difficult
+or even impossible to observe actual disk device operations.
+.Pp
+The
+.Sy writebps
+and
+.Sy writeiops
+resources generally account for writes to the filesystem cache,
+not to actual devices.
OpenPOWER on IntegriCloud