summaryrefslogtreecommitdiffstats
path: root/sys/kern/kern_rctl.c
diff options
context:
space:
mode:
Diffstat (limited to 'sys/kern/kern_rctl.c')
-rw-r--r--sys/kern/kern_rctl.c309
1 files changed, 286 insertions, 23 deletions
diff --git a/sys/kern/kern_rctl.c b/sys/kern/kern_rctl.c
index 7f6a7ad..2442852 100644
--- a/sys/kern/kern_rctl.c
+++ b/sys/kern/kern_rctl.c
@@ -77,17 +77,46 @@ FEATURE(rctl, "Resource Limits");
#define RCTL_PCPU_SHIFT (10 * 1000000)
-unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
+static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
static int rctl_log_rate_limit = 10;
static int rctl_devctl_rate_limit = 10;
+/*
+ * Values below are initialized in rctl_init().
+ */
+static int rctl_throttle_min = -1;
+static int rctl_throttle_max = -1;
+static int rctl_throttle_pct = -1;
+static int rctl_throttle_pct2 = -1;
+
+static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS);
+static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS);
+static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS);
+static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS);
+
SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW, 0, "Resource Limits");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN,
&rctl_maxbufsize, 0, "Maximum output buffer size");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, log_rate_limit, CTLFLAG_RW,
&rctl_log_rate_limit, 0, "Maximum number of log messages per second");
-SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RW,
+SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RWTUN,
&rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second");
+SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_min,
+ CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_min_sysctl, "IU",
+ "Shortest throttling duration, in hz");
+TUNABLE_INT("kern.racct.rctl.throttle_min", &rctl_throttle_min);
+SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_max,
+ CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_max_sysctl, "IU",
+ "Longest throttling duration, in hz");
+TUNABLE_INT("kern.racct.rctl.throttle_max", &rctl_throttle_max);
+SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct,
+ CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_pct_sysctl, "IU",
+ "Throttling penalty for process consumption, in percent");
+TUNABLE_INT("kern.racct.rctl.throttle_pct", &rctl_throttle_pct);
+SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct2,
+ CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_pct2_sysctl, "IU",
+ "Throttling penalty for container consumption, in percent");
+TUNABLE_INT("kern.racct.rctl.throttle_pct2", &rctl_throttle_pct2);
/*
* 'rctl_rule_link' connects a rule with every racct it's related to.
@@ -134,6 +163,10 @@ static struct dict resourcenames[] = {
{ "shmsize", RACCT_SHMSIZE },
{ "wallclock", RACCT_WALLCLOCK },
{ "pcpu", RACCT_PCTCPU },
+ { "readbps", RACCT_READBPS },
+ { "writebps", RACCT_WRITEBPS },
+ { "readiops", RACCT_READIOPS },
+ { "writeiops", RACCT_WRITEIOPS },
{ NULL, -1 }};
static struct dict actionnames[] = {
@@ -171,6 +204,7 @@ static struct dict actionnames[] = {
{ "deny", RCTL_ACTION_DENY },
{ "log", RCTL_ACTION_LOG },
{ "devctl", RCTL_ACTION_DEVCTL },
+ { "throttle", RCTL_ACTION_THROTTLE },
{ NULL, -1 }};
static void rctl_init(void);
@@ -193,6 +227,78 @@ static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule);
static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits");
+/*
+ * Sysctl handler for kern.racct.rctl.throttle_min.
+ *
+ * Passes a copy of rctl_throttle_min to sysctl_handle_int().  When the
+ * request carries a new value, it is stored only if it lies in the range
+ * [1, rctl_throttle_max]; any other value is rejected with EINVAL.
+ */
+static int
+rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	int error, val;
+
+	val = rctl_throttle_min;
+	error = sysctl_handle_int(oidp, &val, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+
+	/*
+	 * Check the bounds and store the value under the same write lock,
+	 * so that the upper bound being compared against cannot change
+	 * between the check and the assignment.
+	 */
+	RCTL_WLOCK();
+	if (val < 1 || val > rctl_throttle_max) {
+		RCTL_WUNLOCK();
+		return (EINVAL);
+	}
+	rctl_throttle_min = val;
+	RCTL_WUNLOCK();
+
+	return (0);
+}
+
+/*
+ * Sysctl handler for kern.racct.rctl.throttle_max.
+ *
+ * Passes a copy of rctl_throttle_max to sysctl_handle_int().  When the
+ * request carries a new value, it is stored only if it is not smaller
+ * than rctl_throttle_min; otherwise the request fails with EINVAL.
+ */
+static int
+rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	int error, val;
+
+	val = rctl_throttle_max;
+	error = sysctl_handle_int(oidp, &val, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+
+	/*
+	 * Check the bound and store the value under the same write lock,
+	 * so that the lower bound being compared against cannot change
+	 * between the check and the assignment.
+	 */
+	RCTL_WLOCK();
+	if (val < rctl_throttle_min) {
+		RCTL_WUNLOCK();
+		return (EINVAL);
+	}
+	rctl_throttle_max = val;
+	RCTL_WUNLOCK();
+
+	return (0);
+}
+
+/*
+ * Sysctl handler for kern.racct.rctl.throttle_pct.
+ *
+ * Passes a copy of rctl_throttle_pct to sysctl_handle_int().  A new value
+ * is stored, under the rctl write lock, only if it is non-negative;
+ * negative values are rejected with EINVAL.
+ */
+static int
+rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	int error, pct;
+
+	pct = rctl_throttle_pct;
+	error = sysctl_handle_int(oidp, &pct, 0, req);
+	if (error != 0)
+		return (error);
+
+	/* Nothing more to do unless the request supplies a new value. */
+	if (req->newptr == NULL)
+		return (0);
+
+	if (pct < 0)
+		return (EINVAL);
+
+	RCTL_WLOCK();
+	rctl_throttle_pct = pct;
+	RCTL_WUNLOCK();
+
+	return (0);
+}
+
+/*
+ * Sysctl handler for kern.racct.rctl.throttle_pct2.
+ *
+ * Passes a copy of rctl_throttle_pct2 to sysctl_handle_int().  A new
+ * value is stored, under the rctl write lock, only if it is non-negative;
+ * negative values are rejected with EINVAL.
+ */
+static int
+rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	int error, pct2;
+
+	pct2 = rctl_throttle_pct2;
+	error = sysctl_handle_int(oidp, &pct2, 0, req);
+	if (error != 0)
+		return (error);
+
+	/* Nothing more to do unless the request supplies a new value. */
+	if (req->newptr == NULL)
+		return (0);
+
+	if (pct2 < 0)
+		return (EINVAL);
+
+	RCTL_WLOCK();
+	rctl_throttle_pct2 = pct2;
+	RCTL_WUNLOCK();
+
+	return (0);
+}
+
static const char *
rctl_subject_type_name(int subject)
{
@@ -274,23 +380,53 @@ rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
}
/*
- * Return non-zero if allocating 'amount' by proc 'p' would exceed
- * resource limit specified by 'rule'.
+ * Called every second for proc, uidinfo, loginclass, and jail containers.
+ * If the limit isn't exceeded, it decreases the usage amount to zero.
+ * Otherwise, it decreases it by the value of the limit. This way
+ * resource consumption exceeding the limit "carries over" to the next
+ * period.
*/
-static int
-rctl_would_exceed(const struct proc *p, const struct rctl_rule *rule,
- int64_t amount)
+void
+rctl_throttle_decay(struct racct *racct, int resource)
{
- int64_t available;
+ struct rctl_rule *rule;
+ struct rctl_rule_link *link;
+ int64_t minavailable;
ASSERT_RACCT_ENABLED();
- RCTL_LOCK_ASSERT();
- available = rctl_available_resource(p, rule);
- if (available >= amount)
- return (0);
+ minavailable = INT64_MAX;
- return (1);
+ RCTL_RLOCK();
+
+ LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
+ rule = link->rrl_rule;
+
+ if (rule->rr_resource != resource)
+ continue;
+ if (rule->rr_action != RCTL_ACTION_THROTTLE)
+ continue;
+
+ if (rule->rr_amount < minavailable)
+ minavailable = rule->rr_amount;
+ }
+
+ RCTL_RUNLOCK();
+
+ if (racct->r_resources[resource] < minavailable) {
+ racct->r_resources[resource] = 0;
+ } else {
+ /*
+ * Cap the utilization counter at ten times the limit.
+ * Otherwise, if the rule were changed to lower the allowed
+ * amount, it could take an unreasonably long time for the
+ * accumulated resource usage to drop.
+ */
+ if (racct->r_resources[resource] > minavailable * 10)
+ racct->r_resources[resource] = minavailable * 10;
+
+ racct->r_resources[resource] -= minavailable;
+ }
}
/*
@@ -340,6 +476,38 @@ rctl_pcpu_available(const struct proc *p) {
return (minavailable);
}
+/*
+ * Saturating unsigned 64-bit addition: returns a + b, or UINT64_MAX
+ * when the sum does not fit in a uint64_t.
+ */
+static uint64_t
+xadd(uint64_t a, uint64_t b)
+{
+
+	/* The sum wraps exactly when b exceeds the headroom left above a. */
+	if (b > UINT64_MAX - a)
+		return (UINT64_MAX);
+
+	return (a + b);
+}
+
+/*
+ * Saturating unsigned 64-bit multiplication: returns a * b, or
+ * UINT64_MAX when the product does not fit in a uint64_t.
+ */
+static uint64_t
+xmul(uint64_t a, uint64_t b)
+{
+
+	if (a == 0 || b == 0)
+		return (0);
+
+	/*
+	 * Detect overflow before multiplying.  Comparing the wrapped
+	 * product against the operands is not sufficient: for example
+	 * 3 * 0x8000000000000001 wraps to 0x8000000000000003, which is
+	 * larger than both factors.
+	 */
+	if (a > UINT64_MAX / b)
+		return (UINT64_MAX);
+
+	return (a * b);
+}
+
/*
* Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
* to what it keeps allocated now. Returns non-zero if the allocation should
@@ -353,9 +521,12 @@ rctl_enforce(struct proc *p, int resource, uint64_t amount)
struct rctl_rule *rule;
struct rctl_rule_link *link;
struct sbuf sb;
+ int64_t available;
+ uint64_t sleep_ms, sleep_ratio;
int should_deny = 0;
char *buf;
+
ASSERT_RACCT_ENABLED();
RCTL_RLOCK();
@@ -368,7 +539,9 @@ rctl_enforce(struct proc *p, int resource, uint64_t amount)
rule = link->rrl_rule;
if (rule->rr_resource != resource)
continue;
- if (!rctl_would_exceed(p, rule, amount)) {
+
+ available = rctl_available_resource(p, rule);
+ if (available >= (int64_t)amount) {
link->rrl_exceeded = 0;
continue;
}
@@ -421,7 +594,7 @@ rctl_enforce(struct proc *p, int resource, uint64_t amount)
if (p->p_state != PRS_NORMAL)
continue;
-
+
if (!ppsratecheck(&devctl_lasttime, &devctl_curtime,
rctl_devctl_rate_limit))
continue;
@@ -444,6 +617,69 @@ rctl_enforce(struct proc *p, int resource, uint64_t amount)
free(buf, M_RCTL);
link->rrl_exceeded = 1;
continue;
+ case RCTL_ACTION_THROTTLE:
+ if (p->p_state != PRS_NORMAL)
+ continue;
+
+ /*
+ * Make the process sleep for a fraction of a second
+ * proportional to the ratio of the process's resource
+ * utilization to the limit. The point is to penalize
+ * resource hogs: processes that consume more of the
+ * available resources sleep for longer.
+ *
+ * We're trying to defer division until the very end,
+ * to minimize the rounding effects. The following
+ * calculation could have been written in a clearer
+ * way like this:
+ *
+ * sleep_ms = hz * p->p_racct->r_resources[resource] /
+ * rule->rr_amount;
+ * sleep_ms *= rctl_throttle_pct / 100;
+ * if (sleep_ms < rctl_throttle_min)
+ * sleep_ms = rctl_throttle_min;
+ *
+ */
+ sleep_ms = xmul(hz, p->p_racct->r_resources[resource]);
+ sleep_ms = xmul(sleep_ms, rctl_throttle_pct) / 100;
+ if (sleep_ms < rctl_throttle_min * rule->rr_amount)
+ sleep_ms = rctl_throttle_min * rule->rr_amount;
+
+ /*
+ * Multiply that by the ratio of the resource
+ * consumption for the container compared to the limit,
+ * squared. In other words, a process in a container
+ * that is two times over the limit will be throttled
+ * four times as much for hitting the same rule. The
+ * point is to penalize processes more if the container
+ * itself (e.g. a certain UID or jail) is above the limit.
+ */
+ if (available < 0)
+ sleep_ratio = -available / rule->rr_amount;
+ else
+ sleep_ratio = 0;
+ sleep_ratio = xmul(sleep_ratio, sleep_ratio);
+ sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100;
+ sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio));
+
+ /*
+ * Finally the division.
+ */
+ sleep_ms /= rule->rr_amount;
+
+ if (sleep_ms > rctl_throttle_max)
+ sleep_ms = rctl_throttle_max;
+#if 0
+ printf("%s: pid %d (%s), %jd of %jd, will sleep for %ld ms (ratio %ld, available %ld)\n",
+ __func__, p->p_pid, p->p_comm,
+ p->p_racct->r_resources[resource],
+ rule->rr_amount, sleep_ms, sleep_ratio, available);
+#endif
+
+ KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n",
+ __func__, (uintmax_t)sleep_ms, rctl_throttle_min));
+ racct_proc_throttle(p, sleep_ms);
+ continue;
default:
if (link->rrl_exceeded != 0)
continue;
@@ -1073,20 +1309,32 @@ rctl_rule_add(struct rctl_rule *rule)
KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
/*
- * Some rules just don't make sense. Note that the one below
- * cannot be rewritten using RACCT_IS_DENIABLE(); the RACCT_PCTCPU,
- * for example, is not deniable in the racct sense, but the
- * limit is enforced in a different way, so "deny" rules for %CPU
- * do make sense.
+ * Some rules just don't make sense, like a "deny" rule for an
+ * undeniable resource. The exceptions are the RSS and %CPU resources -
+ * they are not deniable in the racct sense, but the limit is enforced
+ * in a different way.
*/
if (rule->rr_action == RCTL_ACTION_DENY &&
- (rule->rr_resource == RACCT_CPU ||
- rule->rr_resource == RACCT_WALLCLOCK))
+ !RACCT_IS_DENIABLE(rule->rr_resource) &&
+ rule->rr_resource != RACCT_RSS &&
+ rule->rr_resource != RACCT_PCTCPU) {
return (EOPNOTSUPP);
+ }
+
+ if (rule->rr_action == RCTL_ACTION_THROTTLE &&
+ !RACCT_IS_DECAYING(rule->rr_resource)) {
+ return (EOPNOTSUPP);
+ }
+
+ if (rule->rr_action == RCTL_ACTION_THROTTLE &&
+ rule->rr_resource == RACCT_PCTCPU) {
+ return (EOPNOTSUPP);
+ }
if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS &&
- RACCT_IS_SLOPPY(rule->rr_resource))
+ RACCT_IS_SLOPPY(rule->rr_resource)) {
return (EOPNOTSUPP);
+ }
/*
* Make sure there are no duplicated rules. Also, for the "deny"
@@ -1960,6 +2208,21 @@ rctl_init(void)
UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+
+ /*
+ * Set default values, making sure not to overwrite the ones
+ * fetched from tunables. Most of those could be set at the
+ * declaration, except for rctl_throttle_max - we cannot set
+ * it there because hz is not a compile-time constant.
+ */
+ if (rctl_throttle_min < 1)
+ rctl_throttle_min = 1;
+ if (rctl_throttle_max < rctl_throttle_min)
+ rctl_throttle_max = 2 * hz;
+ if (rctl_throttle_pct < 0)
+ rctl_throttle_pct = 100;
+ if (rctl_throttle_pct2 < 0)
+ rctl_throttle_pct2 = 100;
}
#else /* !RCTL */
OpenPOWER on IntegriCloud