author     imp <imp@FreeBSD.org>  2015-09-02 17:29:30 +0000
committer  imp <imp@FreeBSD.org>  2015-09-02 17:29:30 +0000
commit     97bb89fb501e98f06080503f0c690f51c807d8a1
tree       9b869f554061359ce030522213b642cf2378a8ab  /sys/geom
parent     fd2a8c5a253a3142b4a1f594c4cdcae51abadf69
After the introduction of direct dispatch, the pacing code in g_down()
broke in two ways. One, the pacing variable was accessed in multiple
threads in an unsafe way. Two, since large numbers of I/O could come
down from the buf layer at one time, large numbers of allocation
failures could happen all at once, resulting in a huge pace value that
would limit I/Os to 10 IOPS for minutes (or even hours) at a time.
While a real solution to these problems requires substantial work (to
go to a no-allocation-after-the-first model, or to have some way to
wait for more memory with some kind of reserve for pager and swapper
requests), it is relatively easy to make this simplistic pacing less
pathological.

Move to using a volatile variable with loads and stores. While this is
a little racy, losing the race is safe: either you get memory and
proceed, or you don't and queue. Second, sleep for 1ms (or one tick,
whichever is larger) instead of 100ms. This removes the artificial 10
IOPS limit while still easing up on new I/Os during memory shortages.
Remove tying the amount of time we do this to the number of failed
requests and do it only as long as we keep failing requests.

Finally, to avoid needless recursion when memory is tight (start ->
g_io_deliver() -> g_io_request() -> start -> ... until we use 1/2 the
stack), don't do direct dispatch while pacing. This should be a rare
event (not steady state), so the performance hit here is worth the
extra safety of not starving g_down() with directly dispatched I/O.

Differential Revision: https://reviews.freebsd.org/D3546
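As a rough illustration of the pattern the commit describes (a volatile pace flag that allocation failures set and that a single consumer thread reads, backs off on for about a millisecond, and clears), here is a minimal userland sketch. It is not the kernel code: the thread bodies, the simulated allocation failure, and every name in it are invented for this example. Losing the set/clear race is harmless, which is the whole point: either the next allocation succeeds and I/O proceeds, or it fails again and the flag is simply set once more.

/*
 * Hypothetical userland model of the pacing hint described above.
 * None of these names exist in the FreeBSD sources; build with
 * "cc -pthread pace.c".
 */
#include <pthread.h>
#include <stdlib.h>
#include <unistd.h>

static volatile unsigned int pace;	/* racy on purpose; losing a race is safe */

/* Producer: on a (simulated) allocation failure, hint that we should back off. */
static void *
producer(void *arg)
{
	(void)arg;
	for (int i = 0; i < 100; i++) {
		void *bio = (rand() % 4 == 0) ? NULL : malloc(64);
		if (bio == NULL) {
			pace = 1;	/* set, never increment: one failure or many is the same hint */
			continue;	/* the request would stay queued for the consumer */
		}
		free(bio);
	}
	return (NULL);
}

/* Consumer: the analogue of g_down's scheduling loop. */
static void *
consumer(void *arg)
{
	(void)arg;
	for (int i = 0; i < 100; i++) {
		if (pace != 0) {
			usleep(1000);	/* back off ~1ms, not 100ms, and not scaled by failure count */
			pace = 0;	/* clear once; the next failure simply sets it again */
		}
		/* ... dequeue and start one I/O here ... */
	}
	return (NULL);
}

int
main(void)
{
	pthread_t p, c;

	pthread_create(&p, NULL, producer, NULL);
	pthread_create(&c, NULL, consumer, NULL);
	pthread_join(p, NULL);
	pthread_join(c, NULL);
	return (0);
}

Because the flag is set rather than incremented, a burst of failures cannot accumulate into a long enforced outage the way the old pace counter could.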
Diffstat (limited to 'sys/geom')
-rw-r--r--  sys/geom/geom_io.c  |  48
1 file changed, 41 insertions(+), 7 deletions(-)
diff --git a/sys/geom/geom_io.c b/sys/geom/geom_io.c
index e72e709..f1edc70 100644
--- a/sys/geom/geom_io.c
+++ b/sys/geom/geom_io.c
@@ -71,7 +71,17 @@ static struct g_bioq g_bio_run_down;
static struct g_bioq g_bio_run_up;
static struct g_bioq g_bio_run_task;
-static u_int pace;
+/*
+ * Pace is a hint that we've had some trouble recently allocating
+ * bios, so we should back off trying to send I/O down the stack
+ * a bit to let the problem resolve. When pacing, we also turn
+ * off direct dispatch to further reduce memory pressure from I/Os
+ * there, at the expense of some added latency while the memory
+ * pressures exist. See g_io_schedule_down() for more details
+ * and limitations.
+ */
+static volatile u_int pace;
+
static uma_zone_t biozone;
/*
@@ -521,7 +531,8 @@ g_io_request(struct bio *bp, struct g_consumer *cp)
(pp->flags & G_PF_DIRECT_RECEIVE) != 0 &&
!g_is_geom_thread(curthread) &&
((pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ||
- (bp->bio_flags & BIO_UNMAPPED) == 0 || THREAD_CAN_SLEEP());
+ (bp->bio_flags & BIO_UNMAPPED) == 0 || THREAD_CAN_SLEEP()) &&
+ pace == 0;
if (direct) {
/* Block direct execution if less than half of stack left. */
size_t st, su;
@@ -688,7 +699,7 @@ g_io_deliver(struct bio *bp, int error)
bp->bio_driver2 = NULL;
bp->bio_pflags = 0;
g_io_request(bp, cp);
- pace++;
+ pace = 1;
return;
}
@@ -777,10 +788,33 @@ g_io_schedule_down(struct thread *tp __unused)
}
CTR0(KTR_GEOM, "g_down has work to do");
g_bioq_unlock(&g_bio_run_down);
- if (pace > 0) {
- CTR1(KTR_GEOM, "g_down pacing self (pace %d)", pace);
- pause("g_down", hz/10);
- pace--;
+ if (pace != 0) {
+ /*
+ * There has been at least one memory allocation
+ * failure since the last I/O completed. Pause 1ms to
+ * give the system a chance to free up memory. We only
+ * do this once because a large number of allocations
+ * can fail in the direct dispatch case and there's no
+ * relationship between the number of these failures and
+ * the length of the outage. If there's still an outage,
+ * we'll pause again and again until it's
+ * resolved. Older versions paused longer and once per
+ * allocation failure. This was OK for a single-threaded
+ * g_down, but with direct dispatch it would lead to a max of
+ * 10 IOPS for minutes at a time when transient memory
+ * issues prevented allocation for a batch of requests
+ * from the upper layers.
+ *
+ * XXX This pacing is really lame. It needs to be solved
+ * by other methods. This is OK only because the worst
+ * case scenario is so rare. In the worst case scenario
+ * all memory is tied up waiting for I/O to complete
+ * which can never happen since we can't allocate bios
+ * for that I/O.
+ */
+ CTR0(KTR_GEOM, "g_down pacing self");
+ pause("g_down", min(hz/1000, 1));
+ pace = 0;
}
CTR2(KTR_GEOM, "g_down processing bp %p provider %s", bp,
bp->bio_to->name);
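
The first hunk above also shows the other half of the change: direct dispatch is only chosen when the provider supports it, enough stack headroom remains, and pace is clear. The sketch below illustrates that decision in a self-contained userland form; every helper name here (provider_can_direct(), stack_headroom_ok(), enqueue_for_down_thread(), start_io()) is invented for the example and is not part of GEOM.

/*
 * Hypothetical, self-contained sketch of the dispatch decision; every
 * name below is invented for this example and does not exist in GEOM.
 */
#include <stdbool.h>
#include <stdio.h>

static volatile unsigned int pace;	/* same hint as in the patch above */

/* Stub: the provider advertises that it can accept direct dispatch. */
static bool
provider_can_direct(void)
{
	return (true);
}

/* Stub: stand-in for the "more than half of the stack left" check. */
static bool
stack_headroom_ok(void)
{
	return (true);
}

static void
enqueue_for_down_thread(const char *bp)
{
	printf("%s: queued for the down thread\n", bp);
}

static void
start_io(const char *bp)
{
	printf("%s: dispatched directly\n", bp);
}

static void
dispatch(const char *bp)
{
	/*
	 * Direct dispatch only when the provider supports it, the stack
	 * still has headroom, and we are not pacing.  While pacing,
	 * everything funnels through the queue, so a failed allocation
	 * cannot recurse start -> deliver -> request -> start.
	 */
	bool direct = provider_can_direct() && stack_headroom_ok() && pace == 0;

	if (direct)
		start_io(bp);
	else
		enqueue_for_down_thread(bp);
}

int
main(void)
{
	dispatch("bio0");	/* pace clear: goes direct */
	pace = 1;		/* pretend an allocation just failed */
	dispatch("bio1");	/* pacing: queued instead */
	return (0);
}

Funnelling requests through the queue while pacing is what prevents the start -> g_io_deliver() -> g_io_request() -> start recursion mentioned in the commit message: a failed allocation hands the bio back to the down thread instead of re-entering the start path on the same stack.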