summaryrefslogtreecommitdiffstats
path: root/sys
diff options
context:
space:
mode:
author: trasz <trasz@FreeBSD.org> 2016-04-07 04:23:25 +0000
committer: trasz <trasz@FreeBSD.org> 2016-04-07 04:23:25 +0000
commit: 825d80e01c65efad6bfe7302038a657bbc08e06a (patch)
tree: 38459686f8eedbb80701f99097b1b51ee3363aa8 /sys
parent: cc1aaf5a1a88524a7d9ff817b98dcfb66049f5f8 (diff)
downloadFreeBSD-src-825d80e01c65efad6bfe7302038a657bbc08e06a.zip
FreeBSD-src-825d80e01c65efad6bfe7302038a657bbc08e06a.tar.gz
Add four new RCTL resources - readbps, readiops, writebps and writeiops,
for limiting disk (actually filesystem) IO.

Note that in some cases these limits are not quite precise. It's ok,
as long as it's within some reasonable bounds.

Testing - and review of the code, in particular the VFS and VM parts - is
very welcome.

MFC after: 1 month
Relnotes: yes
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D5080
Diffstat (limited to 'sys')
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c 9
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c 20
-rw-r--r-- sys/fs/ext2fs/ext2_bmap.c 8
-rw-r--r-- sys/kern/kern_physio.c 17
-rw-r--r-- sys/kern/kern_racct.c 77
-rw-r--r-- sys/kern/kern_rctl.c 214
-rw-r--r-- sys/kern/subr_trap.c 10
-rw-r--r-- sys/kern/vfs_bio.c 31
-rw-r--r-- sys/kern/vfs_cluster.c 15
-rw-r--r-- sys/sys/proc.h 2
-rw-r--r-- sys/sys/racct.h 9
-rw-r--r-- sys/sys/rctl.h 4
-rw-r--r-- sys/ufs/ffs/ffs_inode.c 8
-rw-r--r-- sys/ufs/ffs/ffs_softdep.c 8
-rw-r--r-- sys/ufs/ufs/ufs_bmap.c 8
-rw-r--r-- sys/vm/vm_fault.c 16
16 files changed, 413 insertions, 43 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
index f944903..534dfb2 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -132,6 +132,7 @@
#include <sys/multilist.h>
#ifdef _KERNEL
#include <sys/dnlc.h>
+#include <sys/racct.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
@@ -4503,6 +4504,14 @@ top:
demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
data, metadata, misses);
#ifdef _KERNEL
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_force(curproc, RACCT_READBPS, size);
+ racct_add_force(curproc, RACCT_READIOPS, 1);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_inblock++;
#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
index b60236f..af8d366 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
@@ -47,6 +47,7 @@
#include <sys/sa.h>
#include <sys/zfeature.h>
#ifdef _KERNEL
+#include <sys/racct.h>
#include <sys/vm.h>
#include <sys/zfs_znode.h>
#endif
@@ -427,6 +428,15 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
}
dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
+#if defined(_KERNEL) && defined(RACCT)
+ if (racct_enable && !read) {
+ PROC_LOCK(curproc);
+ racct_add_force(curproc, RACCT_WRITEBPS, length);
+ racct_add_force(curproc, RACCT_WRITEIOPS, nblks);
+ PROC_UNLOCK(curproc);
+ }
+#endif
+
zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
blkid = dbuf_whichblock(dn, 0, offset);
for (i = 0; i < nblks; i++) {
@@ -1422,7 +1432,15 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA) {
#ifdef _KERNEL
curthread->td_ru.ru_oublock++;
-#endif
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_force(curproc, RACCT_WRITEBPS, blksz);
+ racct_add_force(curproc, RACCT_WRITEIOPS, 1);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
+#endif /* _KERNEL */
dbuf_assign_arcbuf(db, buf, tx);
dbuf_rele(db, FTAG);
} else {
diff --git a/sys/fs/ext2fs/ext2_bmap.c b/sys/fs/ext2fs/ext2_bmap.c
index 8e5e986..7966b9b 100644
--- a/sys/fs/ext2fs/ext2_bmap.c
+++ b/sys/fs/ext2fs/ext2_bmap.c
@@ -42,6 +42,7 @@
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
+#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/stat.h>
@@ -247,6 +248,13 @@ ext2_bmaparray(struct vnode *vp, daddr_t bn, daddr_t *bnp, int *runp, int *runb)
vfs_busy_pages(bp, 0);
bp->b_iooffset = dbtob(bp->b_blkno);
bstrategy(bp);
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, bp, 0);
+ PROC_UNLOCK(curproc);
+ }
+#endif
curthread->td_ru.ru_inblock++;
error = bufwait(bp);
if (error) {
diff --git a/sys/kern/kern_physio.c b/sys/kern/kern_physio.c
index a148386..31d8f16 100644
--- a/sys/kern/kern_physio.c
+++ b/sys/kern/kern_physio.c
@@ -27,6 +27,7 @@ __FBSDID("$FreeBSD$");
#include <sys/conf.h>
#include <sys/malloc.h>
#include <sys/proc.h>
+#include <sys/racct.h>
#include <sys/uio.h>
#include <geom/geom.h>
@@ -109,6 +110,22 @@ physio(struct cdev *dev, struct uio *uio, int ioflag)
prot |= VM_PROT_WRITE; /* Less backwards than it looks */
error = 0;
for (i = 0; i < uio->uio_iovcnt; i++) {
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ if (uio->uio_rw == UIO_READ) {
+ racct_add_force(curproc, RACCT_READBPS,
+ uio->uio_iov[i].iov_len);
+ racct_add_force(curproc, RACCT_READIOPS, 1);
+ } else {
+ racct_add_force(curproc, RACCT_WRITEBPS,
+ uio->uio_iov[i].iov_len);
+ racct_add_force(curproc, RACCT_WRITEIOPS, 1);
+ }
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
+
while (uio->uio_iov[i].iov_len) {
g_reset_bio(bp);
if (uio->uio_rw == UIO_READ) {
diff --git a/sys/kern/kern_racct.c b/sys/kern/kern_racct.c
index bbd50ca..438a249 100644
--- a/sys/kern/kern_racct.c
+++ b/sys/kern/kern_racct.c
@@ -35,6 +35,7 @@ __FBSDID("$FreeBSD$");
#include "opt_sched.h"
#include <sys/param.h>
+#include <sys/buf.h>
#include <sys/systm.h>
#include <sys/eventhandler.h>
#include <sys/jail.h>
@@ -177,7 +178,15 @@ int racct_types[] = {
[RACCT_WALLCLOCK] =
RACCT_IN_MILLIONS,
[RACCT_PCTCPU] =
- RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS };
+ RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS,
+ [RACCT_READBPS] =
+ RACCT_DECAYING,
+ [RACCT_WRITEBPS] =
+ RACCT_DECAYING,
+ [RACCT_READIOPS] =
+ RACCT_DECAYING,
+ [RACCT_WRITEIOPS] =
+ RACCT_DECAYING };
static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE;
@@ -634,6 +643,28 @@ racct_add_cred(struct ucred *cred, int resource, uint64_t amount)
RACCT_UNLOCK();
}
+/*
+ * Account for disk IO resource consumption.  Checks for limits,
+ * but never fails, due to disk limits being undeniable.
+ *
+ * 'p' is the process to charge; the caller must hold its process lock.
+ * 'bp' supplies the transfer size (b_bcount).  A non-zero 'is_write'
+ * charges RACCT_WRITEBPS/RACCT_WRITEIOPS; zero charges
+ * RACCT_READBPS/RACCT_READIOPS.  One IO operation is counted per call.
+ */
+void
+racct_add_buf(struct proc *p, const struct buf *bp, int is_write)
+{
+
+	ASSERT_RACCT_ENABLED();
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+
+	/*
+	 * Charge 'p' - the process whose lock was just asserted - rather
+	 * than curproc, so that the accounting always follows the argument.
+	 */
+	RACCT_LOCK();
+	if (is_write) {
+		racct_add_locked(p, RACCT_WRITEBPS, bp->b_bcount, 1);
+		racct_add_locked(p, RACCT_WRITEIOPS, 1, 1);
+	} else {
+		racct_add_locked(p, RACCT_READBPS, bp->b_bcount, 1);
+		racct_add_locked(p, RACCT_READIOPS, 1, 1);
+	}
+	RACCT_UNLOCK();
+}
+
static int
racct_set_locked(struct proc *p, int resource, uint64_t amount, int force)
{
@@ -655,7 +686,7 @@ racct_set_locked(struct proc *p, int resource, uint64_t amount, int force)
* The diffs may be negative.
*/
diff_proc = amount - old_amount;
- if (RACCT_IS_DECAYING(resource)) {
+ if (resource == RACCT_PCTCPU) {
/*
* Resources in per-credential racct containers may decay.
* If this is the case, we need to calculate the difference
@@ -1043,14 +1074,19 @@ racct_move(struct racct *dest, struct racct *src)
RACCT_UNLOCK();
}
-static void
-racct_proc_throttle(struct proc *p)
+/*
+ * Make the process sleep in userret() for 'timeout' ticks. Setting
+ * timeout to -1 makes it sleep until woken up by racct_proc_wakeup().
+ */
+void
+racct_proc_throttle(struct proc *p, int timeout)
{
struct thread *td;
#ifdef SMP
int cpuid;
#endif
+ KASSERT(timeout != 0, ("timeout %d", timeout));
ASSERT_RACCT_ENABLED();
PROC_LOCK_ASSERT(p, MA_OWNED);
@@ -1058,10 +1094,13 @@ racct_proc_throttle(struct proc *p)
* Do not block kernel processes. Also do not block processes with
* low %cpu utilization to improve interactivity.
*/
- if (((p->p_flag & (P_SYSTEM | P_KPROC)) != 0) ||
- (p->p_racct->r_resources[RACCT_PCTCPU] <= pcpu_threshold))
+ if ((p->p_flag & (P_SYSTEM | P_KPROC)) != 0)
return;
- p->p_throttled = 1;
+
+ if (p->p_throttled < 0 || (timeout > 0 && p->p_throttled > timeout))
+ return;
+
+ p->p_throttled = timeout;
FOREACH_THREAD_IN_PROC(p, td) {
thread_lock(td);
@@ -1102,7 +1141,7 @@ racct_proc_wakeup(struct proc *p)
PROC_LOCK_ASSERT(p, MA_OWNED);
- if (p->p_throttled) {
+ if (p->p_throttled != 0) {
p->p_throttled = 0;
wakeup(p->p_racct);
}
@@ -1116,6 +1155,13 @@ racct_decay_callback(struct racct *racct, void *dummy1, void *dummy2)
ASSERT_RACCT_ENABLED();
RACCT_LOCK_ASSERT();
+#ifdef RCTL
+ rctl_throttle_decay(racct, RACCT_READBPS);
+ rctl_throttle_decay(racct, RACCT_WRITEBPS);
+ rctl_throttle_decay(racct, RACCT_READIOPS);
+ rctl_throttle_decay(racct, RACCT_WRITEIOPS);
+#endif
+
r_old = racct->r_resources[RACCT_PCTCPU];
/* If there is nothing to decay, just exit. */
@@ -1206,6 +1252,12 @@ racctd(void)
pct_estimate = 0;
pct = racct_getpcpu(p, pct_estimate);
RACCT_LOCK();
+#ifdef RCTL
+ rctl_throttle_decay(p->p_racct, RACCT_READBPS);
+ rctl_throttle_decay(p->p_racct, RACCT_WRITEBPS);
+ rctl_throttle_decay(p->p_racct, RACCT_READIOPS);
+ rctl_throttle_decay(p->p_racct, RACCT_WRITEIOPS);
+#endif
racct_set_locked(p, RACCT_PCTCPU, pct, 1);
racct_set_locked(p, RACCT_CPU, runtime, 0);
racct_set_locked(p, RACCT_WALLCLOCK,
@@ -1228,10 +1280,13 @@ racctd(void)
continue;
}
- if (racct_pcpu_available(p) <= 0)
- racct_proc_throttle(p);
- else if (p->p_throttled)
+ if (racct_pcpu_available(p) <= 0) {
+ if (p->p_racct->r_resources[RACCT_PCTCPU] >
+ pcpu_threshold)
+ racct_proc_throttle(p, -1);
+ } else if (p->p_throttled == -1) {
racct_proc_wakeup(p);
+ }
PROC_UNLOCK(p);
}
sx_sunlock(&allproc_lock);
diff --git a/sys/kern/kern_rctl.c b/sys/kern/kern_rctl.c
index 7f6a7ad..8f301b8 100644
--- a/sys/kern/kern_rctl.c
+++ b/sys/kern/kern_rctl.c
@@ -77,9 +77,13 @@ FEATURE(rctl, "Resource Limits");
#define RCTL_PCPU_SHIFT (10 * 1000000)
-unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
+static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
static int rctl_log_rate_limit = 10;
static int rctl_devctl_rate_limit = 10;
+static unsigned int rctl_throttle_min = 0;
+static unsigned int rctl_throttle_max = 0;
+static unsigned int rctl_throttle_pct = 0;
+static unsigned int rctl_throttle_pct2 = 0;
SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW, 0, "Resource Limits");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN,
@@ -88,6 +92,16 @@ SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, log_rate_limit, CTLFLAG_RW,
&rctl_log_rate_limit, 0, "Maximum number of log messages per second");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RW,
&rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second");
+SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, throttle_min, CTLFLAG_RDTUN,
+ &rctl_throttle_min, 0, "Shortest throttling duration, in hz");
+SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, throttle_max, CTLFLAG_RDTUN,
+ &rctl_throttle_max, 0, "Longest throttling duration, in hz");
+SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, throttle_pct, CTLFLAG_RDTUN,
+ &rctl_throttle_pct, 0,
+ "Throttling penalty for process consumption, in percent");
+SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, throttle_pct2, CTLFLAG_RDTUN,
+ &rctl_throttle_pct2, 0,
+ "Throttling penalty for container consumption, in percent");
/*
* 'rctl_rule_link' connects a rule with every racct it's related to.
@@ -134,6 +148,10 @@ static struct dict resourcenames[] = {
{ "shmsize", RACCT_SHMSIZE },
{ "wallclock", RACCT_WALLCLOCK },
{ "pcpu", RACCT_PCTCPU },
+ { "readbps", RACCT_READBPS },
+ { "writebps", RACCT_WRITEBPS },
+ { "readiops", RACCT_READIOPS },
+ { "writeiops", RACCT_WRITEIOPS },
{ NULL, -1 }};
static struct dict actionnames[] = {
@@ -171,6 +189,7 @@ static struct dict actionnames[] = {
{ "deny", RCTL_ACTION_DENY },
{ "log", RCTL_ACTION_LOG },
{ "devctl", RCTL_ACTION_DEVCTL },
+ { "throttle", RCTL_ACTION_THROTTLE },
{ NULL, -1 }};
static void rctl_init(void);
@@ -274,23 +293,53 @@ rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
}
/*
- * Return non-zero if allocating 'amount' by proc 'p' would exceed
- * resource limit specified by 'rule'.
+ * Called every second for proc, uidinfo, loginclass, and jail containers.
+ * If the limit isn't exceeded, it decreases the usage amount to zero.
+ * Otherwise, it decreases it by the value of the limit. This way
+ * resource consumption exceeding the limit "carries over" to the next
+ * period.
*/
-static int
-rctl_would_exceed(const struct proc *p, const struct rctl_rule *rule,
- int64_t amount)
+void
+rctl_throttle_decay(struct racct *racct, int resource)
{
- int64_t available;
+ struct rctl_rule *rule;
+ struct rctl_rule_link *link;
+ int64_t minavailable;
ASSERT_RACCT_ENABLED();
- RCTL_LOCK_ASSERT();
- available = rctl_available_resource(p, rule);
- if (available >= amount)
- return (0);
+ minavailable = INT64_MAX;
- return (1);
+ RCTL_RLOCK();
+
+ LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
+ rule = link->rrl_rule;
+
+ if (rule->rr_resource != resource)
+ continue;
+ if (rule->rr_action != RCTL_ACTION_THROTTLE)
+ continue;
+
+ if (rule->rr_amount < minavailable)
+ minavailable = rule->rr_amount;
+ }
+
+ RCTL_RUNLOCK();
+
+ if (racct->r_resources[resource] < minavailable) {
+ racct->r_resources[resource] = 0;
+ } else {
+ /*
+ * Cap utilization counter at ten times the limit. Otherwise,
+ * if we changed the rule lowering the allowed amount, it could
+ * take unreasonably long time for the accumulated resource
+ * usage to drop.
+ */
+ if (racct->r_resources[resource] > minavailable * 10)
+ racct->r_resources[resource] = minavailable * 10;
+
+ racct->r_resources[resource] -= minavailable;
+ }
}
/*
@@ -340,6 +389,38 @@ rctl_pcpu_available(const struct proc *p) {
return (minavailable);
}
+/*
+ * Saturating addition: returns a + b, or UINT64_MAX when the sum does
+ * not fit in 64 bits.
+ */
+static uint64_t
+xadd(uint64_t a, uint64_t b)
+{
+	uint64_t sum = a + b;
+
+	/* Unsigned wraparound leaves the sum smaller than an operand. */
+	return ((sum < a || sum < b) ? UINT64_MAX : sum);
+}
+
+/*
+ * Saturating multiplication: returns a * b, or UINT64_MAX when the
+ * product does not fit in 64 bits.
+ */
+static uint64_t
+xmul(uint64_t a, uint64_t b)
+{
+
+	if (a == 0 || b == 0)
+		return (0);
+
+	/*
+	 * Detect overflow before multiplying.  Comparing the wrapped product
+	 * against the operands is not sufficient: 3 * (2^63 + 1) wraps to
+	 * 2^63 + 3, which is larger than both operands.
+	 */
+	if (a > UINT64_MAX / b)
+		return (UINT64_MAX);
+
+	return (a * b);
+}
+
/*
* Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
* to what it keeps allocated now. Returns non-zero if the allocation should
@@ -353,9 +434,12 @@ rctl_enforce(struct proc *p, int resource, uint64_t amount)
struct rctl_rule *rule;
struct rctl_rule_link *link;
struct sbuf sb;
+ int64_t available;
+ uint64_t sleep_ms, sleep_ratio;
int should_deny = 0;
char *buf;
+
ASSERT_RACCT_ENABLED();
RCTL_RLOCK();
@@ -368,7 +452,9 @@ rctl_enforce(struct proc *p, int resource, uint64_t amount)
rule = link->rrl_rule;
if (rule->rr_resource != resource)
continue;
- if (!rctl_would_exceed(p, rule, amount)) {
+
+ available = rctl_available_resource(p, rule);
+ if (available >= (int64_t)amount) {
link->rrl_exceeded = 0;
continue;
}
@@ -421,7 +507,7 @@ rctl_enforce(struct proc *p, int resource, uint64_t amount)
if (p->p_state != PRS_NORMAL)
continue;
-
+
if (!ppsratecheck(&devctl_lasttime, &devctl_curtime,
rctl_devctl_rate_limit))
continue;
@@ -444,6 +530,69 @@ rctl_enforce(struct proc *p, int resource, uint64_t amount)
free(buf, M_RCTL);
link->rrl_exceeded = 1;
continue;
+ case RCTL_ACTION_THROTTLE:
+ if (p->p_state != PRS_NORMAL)
+ continue;
+
+ /*
+ * Make the process sleep for a fraction of second
+ * proportional to the ratio of process' resource
+ * utilization compared to the limit. The point is
+ * to penalize resource hogs: processes that consume
+ * more of the available resources sleep for longer.
+ *
+ * We're trying to defer division until the very end,
+ * to minimize the rounding effects. The following
+ * calculation could have been written in a clearer
+ * way like this:
+ *
+ * sleep_ms = hz * p->p_racct->r_resources[resource] /
+ * rule->rr_amount;
+ * sleep_ms *= rctl_throttle_pct / 100;
+ * if (sleep_ms < rctl_throttle_min)
+ * sleep_ms = rctl_throttle_min;
+ *
+ */
+ sleep_ms = xmul(hz, p->p_racct->r_resources[resource]);
+ sleep_ms = xmul(sleep_ms, rctl_throttle_pct) / 100;
+ if (sleep_ms < rctl_throttle_min * rule->rr_amount)
+ sleep_ms = rctl_throttle_min * rule->rr_amount;
+
+ /*
+ * Multiply that by the ratio of the resource
+ * consumption for the container compared to the limit,
+ * squared. In other words, a process in a container
+ * that is two times over the limit will be throttled
+ * four times as much for hitting the same rule. The
+ * point is to penalize processes more if the container
+ * itself (eg certain UID or jail) is above the limit.
+ */
+ if (available < 0)
+ sleep_ratio = -available / rule->rr_amount;
+ else
+ sleep_ratio = 0;
+ sleep_ratio = xmul(sleep_ratio, sleep_ratio);
+ sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100;
+ sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio));
+
+ /*
+ * Finally the division.
+ */
+ sleep_ms /= rule->rr_amount;
+
+ if (sleep_ms > rctl_throttle_max)
+ sleep_ms = rctl_throttle_max;
+#if 0
+ printf("%s: pid %d (%s), %jd of %jd, will sleep for %ld ms (ratio %ld, available %ld)\n",
+ __func__, p->p_pid, p->p_comm,
+ p->p_racct->r_resources[resource],
+ rule->rr_amount, sleep_ms, sleep_ratio, available);
+#endif
+
+ KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n",
+ __func__, (uintmax_t)sleep_ms, rctl_throttle_min));
+ racct_proc_throttle(p, sleep_ms);
+ continue;
default:
if (link->rrl_exceeded != 0)
continue;
@@ -1073,20 +1222,32 @@ rctl_rule_add(struct rctl_rule *rule)
KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
/*
- * Some rules just don't make sense. Note that the one below
- * cannot be rewritten using RACCT_IS_DENIABLE(); the RACCT_PCTCPU,
- * for example, is not deniable in the racct sense, but the
- * limit is enforced in a different way, so "deny" rules for %CPU
- * do make sense.
+ * Some rules just don't make sense, like "deny" rule for an undeniable
+ * resource. The exception are the RSS and %CPU resources - they are
+ * not deniable in the racct sense, but the limit is enforced in
+ * a different way.
*/
if (rule->rr_action == RCTL_ACTION_DENY &&
- (rule->rr_resource == RACCT_CPU ||
- rule->rr_resource == RACCT_WALLCLOCK))
+ !RACCT_IS_DENIABLE(rule->rr_resource) &&
+ rule->rr_resource != RACCT_RSS &&
+ rule->rr_resource != RACCT_PCTCPU) {
return (EOPNOTSUPP);
+ }
+
+ if (rule->rr_action == RCTL_ACTION_THROTTLE &&
+ !RACCT_IS_DECAYING(rule->rr_resource)) {
+ return (EOPNOTSUPP);
+ }
+
+ if (rule->rr_action == RCTL_ACTION_THROTTLE &&
+ rule->rr_resource == RACCT_PCTCPU) {
+ return (EOPNOTSUPP);
+ }
if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS &&
- RACCT_IS_SLOPPY(rule->rr_resource))
+ RACCT_IS_SLOPPY(rule->rr_resource)) {
return (EOPNOTSUPP);
+ }
/*
* Make sure there are no duplicated rules. Also, for the "deny"
@@ -1960,6 +2121,15 @@ rctl_init(void)
UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+
+ if (rctl_throttle_min <= 0)
+ rctl_throttle_min = 1;
+ if (rctl_throttle_max <= 0)
+ rctl_throttle_max = 2 * hz;
+ if (rctl_throttle_pct <= 0)
+ rctl_throttle_pct = 100;
+ if (rctl_throttle_pct2 <= 0)
+ rctl_throttle_pct2 = 100;
}
#else /* !RCTL */
diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c
index a371456..6d1ac70 100644
--- a/sys/kern/subr_trap.c
+++ b/sys/kern/subr_trap.c
@@ -172,10 +172,14 @@ userret(struct thread *td, struct trapframe *frame)
(td->td_vnet_lpush != NULL) ? td->td_vnet_lpush : "N/A"));
#endif
#ifdef RACCT
- if (racct_enable && p->p_throttled == 1) {
+ if (racct_enable && p->p_throttled != 0) {
PROC_LOCK(p);
- while (p->p_throttled == 1)
- msleep(p->p_racct, &p->p_mtx, 0, "racct", 0);
+ while (p->p_throttled != 0) {
+ msleep(p->p_racct, &p->p_mtx, 0, "racct",
+ p->p_throttled < 0 ? 0 : p->p_throttled);
+ if (p->p_throttled > 0)
+ p->p_throttled = 0;
+ }
PROC_UNLOCK(p);
}
#endif
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index e272f9d..b7b9641 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -61,6 +61,7 @@ __FBSDID("$FreeBSD$");
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/proc.h>
+#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/smp.h>
@@ -1784,8 +1785,16 @@ breada(struct vnode * vp, daddr_t * rablkno, int * rabsize,
rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
if ((rabp->b_flags & B_CACHE) == 0) {
- if (!TD_IS_IDLETHREAD(curthread))
+ if (!TD_IS_IDLETHREAD(curthread)) {
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, rabp, 0);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_inblock++;
+ }
rabp->b_flags |= B_ASYNC;
rabp->b_flags &= ~B_INVAL;
rabp->b_ioflags &= ~BIO_ERROR;
@@ -1829,8 +1838,16 @@ breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno,
/* if not found in cache, do some I/O */
if ((bp->b_flags & B_CACHE) == 0) {
- if (!TD_IS_IDLETHREAD(curthread))
+ if (!TD_IS_IDLETHREAD(curthread)) {
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, bp, 0);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_inblock++;
+ }
bp->b_iocmd = BIO_READ;
bp->b_flags &= ~B_INVAL;
bp->b_ioflags &= ~BIO_ERROR;
@@ -1926,8 +1943,16 @@ bufwrite(struct buf *bp)
bp->b_runningbufspace = bp->b_bufsize;
space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace);
- if (!TD_IS_IDLETHREAD(curthread))
+ if (!TD_IS_IDLETHREAD(curthread)) {
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, bp, 1);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_oublock++;
+ }
if (oldflags & B_ASYNC)
BUF_KERNPROC(bp);
bp->b_iooffset = dbtob(bp->b_blkno);
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index 9871a50..40dc0c0 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$");
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/mount.h>
+#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/vmmeter.h>
@@ -241,6 +242,13 @@ cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
BUF_KERNPROC(bp);
bp->b_iooffset = dbtob(bp->b_blkno);
bstrategy(bp);
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, bp, 0);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_inblock++;
}
@@ -294,6 +302,13 @@ cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
BUF_KERNPROC(rbp);
rbp->b_iooffset = dbtob(rbp->b_blkno);
bstrategy(rbp);
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, rbp, 0);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_inblock++;
}
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index 2d1769e..d2b617c 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -623,7 +623,7 @@ struct proc {
after fork. */
uint64_t p_prev_runtime; /* (c) Resource usage accounting. */
struct racct *p_racct; /* (b) Resource accounting. */
- u_char p_throttled; /* (c) Flag for racct pcpu throttling */
+ int p_throttled; /* (c) Flag for racct pcpu throttling */
struct vm_domain_policy p_vm_dom_policy; /* (c) process default VM domain, or -1 */
/*
* An orphan is the child that has beed re-parented to the
diff --git a/sys/sys/racct.h b/sys/sys/racct.h
index 8d1f2fa..5330c63 100644
--- a/sys/sys/racct.h
+++ b/sys/sys/racct.h
@@ -42,6 +42,7 @@
#include <sys/stdint.h>
#include <sys/sysctl.h>
+struct buf;
struct proc;
struct rctl_rule_link;
struct ucred;
@@ -71,7 +72,11 @@ struct ucred;
#define RACCT_SHMSIZE 18
#define RACCT_WALLCLOCK 19
#define RACCT_PCTCPU 20
-#define RACCT_MAX RACCT_PCTCPU
+#define RACCT_READBPS 21
+#define RACCT_WRITEBPS 22
+#define RACCT_READIOPS 23
+#define RACCT_WRITEIOPS 24
+#define RACCT_MAX RACCT_WRITEIOPS
/*
* Resource properties.
@@ -153,6 +158,7 @@ SYSCTL_DECL(_kern_racct);
int racct_add(struct proc *p, int resource, uint64_t amount);
void racct_add_cred(struct ucred *cred, int resource, uint64_t amount);
void racct_add_force(struct proc *p, int resource, uint64_t amount);
+void racct_add_buf(struct proc *p, const struct buf *bufp, int is_write);
int racct_set(struct proc *p, int resource, uint64_t amount);
void racct_set_force(struct proc *p, int resource, uint64_t amount);
void racct_sub(struct proc *p, int resource, uint64_t amount);
@@ -170,6 +176,7 @@ void racct_proc_exit(struct proc *p);
void racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred,
struct ucred *newcred);
void racct_move(struct racct *dest, struct racct *src);
+void racct_proc_throttle(struct proc *p, int timeout);
#else
diff --git a/sys/sys/rctl.h b/sys/sys/rctl.h
index e1a45a4..b9e6cd6 100644
--- a/sys/sys/rctl.h
+++ b/sys/sys/rctl.h
@@ -129,7 +129,8 @@ struct rctl_rule {
#define RCTL_ACTION_DENY (RCTL_ACTION_SIGNAL_MAX + 1)
#define RCTL_ACTION_LOG (RCTL_ACTION_SIGNAL_MAX + 2)
#define RCTL_ACTION_DEVCTL (RCTL_ACTION_SIGNAL_MAX + 3)
-#define RCTL_ACTION_MAX RCTL_ACTION_DEVCTL
+#define RCTL_ACTION_THROTTLE (RCTL_ACTION_SIGNAL_MAX + 4)
+#define RCTL_ACTION_MAX RCTL_ACTION_THROTTLE
#define RCTL_AMOUNT_UNDEFINED -1
@@ -140,6 +141,7 @@ void rctl_rule_release(struct rctl_rule *rule);
int rctl_rule_add(struct rctl_rule *rule);
int rctl_rule_remove(struct rctl_rule *filter);
int rctl_enforce(struct proc *p, int resource, uint64_t amount);
+void rctl_throttle_decay(struct racct *racct, int resource);
int64_t rctl_pcpu_available(const struct proc *p);
uint64_t rctl_get_limit(struct proc *p, int resource);
uint64_t rctl_get_available(struct proc *p, int resource);
diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c
index c8dac1b..0202820 100644
--- a/sys/ufs/ffs/ffs_inode.c
+++ b/sys/ufs/ffs/ffs_inode.c
@@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$");
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
+#include <sys/racct.h>
#include <sys/random.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
@@ -659,6 +660,13 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
vp = ITOV(ip);
bp = getblk(vp, lbn, (int)fs->fs_bsize, 0, 0, 0);
if ((bp->b_flags & B_CACHE) == 0) {
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, bp, 0);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_inblock++; /* pay for read */
bp->b_iocmd = BIO_READ;
bp->b_flags &= ~B_INVAL;
diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c
index bedc8e1..bc0693a 100644
--- a/sys/ufs/ffs/ffs_softdep.c
+++ b/sys/ufs/ffs/ffs_softdep.c
@@ -69,6 +69,7 @@ __FBSDID("$FreeBSD$");
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
+#include <sys/racct.h>
#include <sys/rwlock.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
@@ -6229,6 +6230,13 @@ setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno)
vfs_busy_pages(bp, 0);
bp->b_iooffset = dbtob(bp->b_blkno);
bstrategy(bp);
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, bp, 0);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_inblock++;
error = bufwait(bp);
if (error) {
diff --git a/sys/ufs/ufs/ufs_bmap.c b/sys/ufs/ufs/ufs_bmap.c
index 9819ef5..768298f 100644
--- a/sys/ufs/ufs/ufs_bmap.c
+++ b/sys/ufs/ufs/ufs_bmap.c
@@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$");
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
+#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/stat.h>
@@ -223,6 +224,13 @@ ufs_bmaparray(vp, bn, bnp, nbp, runp, runb)
vfs_busy_pages(bp, 0);
bp->b_iooffset = dbtob(bp->b_blkno);
bstrategy(bp);
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, bp, 0);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_inblock++;
error = bufwait(bp);
if (error) {
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index a7e3d37..13a5757 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -83,6 +83,7 @@ __FBSDID("$FreeBSD$");
#include <sys/lock.h>
#include <sys/mman.h>
#include <sys/proc.h>
+#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>
@@ -994,6 +995,21 @@ vnode_locked:
if (hardfault) {
PCPU_INC(cnt.v_io_faults);
curthread->td_ru.ru_majflt++;
+#ifdef RACCT
+ if (racct_enable && fs.object->type == OBJT_VNODE) {
+ PROC_LOCK(curproc);
+ if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) {
+ racct_add_force(curproc, RACCT_WRITEBPS,
+ PAGE_SIZE + behind * PAGE_SIZE);
+ racct_add_force(curproc, RACCT_WRITEIOPS, 1);
+ } else {
+ racct_add_force(curproc, RACCT_READBPS,
+ PAGE_SIZE + ahead * PAGE_SIZE);
+ racct_add_force(curproc, RACCT_READIOPS, 1);
+ }
+ PROC_UNLOCK(curproc);
+ }
+#endif
} else
curthread->td_ru.ru_minflt++;
OpenPOWER on IntegriCloud