path: root/sys/kern/vfs_bio.c
Diffstat (limited to 'sys/kern/vfs_bio.c')
-rw-r--r--  sys/kern/vfs_bio.c  |  263
1 file changed, 128 insertions, 135 deletions
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 21d447d..9949813 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -85,22 +85,24 @@ static void buf_daemon __P((void));
* but the code is intricate enough already.
*/
vm_page_t bogus_page;
-int runningbufspace;
int vmiodirenable = FALSE;
+int runningbufspace;
static vm_offset_t bogus_offset;
-static int bufspace, maxbufspace,
+static int bufspace, maxbufspace,
bufmallocspace, maxbufmallocspace, lobufspace, hibufspace;
static int bufreusecnt, bufdefragcnt, buffreekvacnt;
-static int maxbdrun;
static int needsbuffer;
-static int numdirtybuffers, hidirtybuffers;
+static int lorunningspace, hirunningspace, runningbufreq;
+static int numdirtybuffers, lodirtybuffers, hidirtybuffers;
static int numfreebuffers, lofreebuffers, hifreebuffers;
static int getnewbufcalls;
static int getnewbufrestarts;
SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD,
&numdirtybuffers, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW,
+ &lodirtybuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW,
&hidirtybuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD,
@@ -111,6 +113,10 @@ SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW,
&hifreebuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD,
&runningbufspace, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW,
+ &lorunningspace, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW,
+ &hirunningspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD,
&maxbufspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD,
@@ -119,8 +125,6 @@ SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD,
&lobufspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
&bufspace, 0, "");
-SYSCTL_INT(_vfs, OID_AUTO, maxbdrun, CTLFLAG_RW,
- &maxbdrun, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW,
&maxbufmallocspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD,
@@ -170,9 +174,9 @@ bufhash(struct vnode *vnp, daddr_t bn)
*/
static __inline void
-numdirtywakeup(void)
+numdirtywakeup(int level)
{
- if (numdirtybuffers < hidirtybuffers) {
+ if (numdirtybuffers <= level) {
if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) {
needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH;
wakeup(&needsbuffer);
@@ -204,6 +208,23 @@ bufspacewakeup(void)
}
/*
+ * runningbufwakeup() - in-progress I/O accounting.
+ *
+ */
+static __inline void
+runningbufwakeup(struct buf *bp)
+{
+ if (bp->b_runningbufspace) {
+ runningbufspace -= bp->b_runningbufspace;
+ bp->b_runningbufspace = 0;
+ if (runningbufreq && runningbufspace <= lorunningspace) {
+ runningbufreq = 0;
+ wakeup(&runningbufreq);
+ }
+ }
+}
+
+/*
* bufcountwakeup:
*
* Called when a buffer has been added to one of the free queues to
@@ -225,6 +246,31 @@ bufcountwakeup(void)
}
/*
+ * waitrunningbufspace()
+ *
+ * runningbufspace is a measure of the amount of I/O currently
+ * running. This routine is used in async-write situations to
+ * prevent creating huge backups of pending writes to a device.
+ * Only asynchronous writes are governed by this function.
+ *
+ * Reads will adjust runningbufspace, but will not block based on it.
+ * The read load has a side effect of reducing the allowed write load.
+ *
+ * This does NOT turn an async write into a sync write. It waits
+ * for earlier writes to complete and generally returns before the
+ * caller's write has reached the device.
+ */
+static __inline void
+waitrunningbufspace(void)
+{
+ while (runningbufspace > hirunningspace) {
+ ++runningbufreq;
+ tsleep(&runningbufreq, PVM, "wdrain", 0);
+ }
+}
+
+
+/*
* vfs_buf_test_cache:
*
* Called when a buffer is extended. This function clears the B_CACHE
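Taken together, the two new primitives above form a high/low watermark throttle: vfs_busy_pages() (last hunk of this patch) charges each buffer's b_bufsize to runningbufspace when its I/O is started, waitrunningbufspace() puts asynchronous writers to sleep while the total exceeds hirunningspace (1 MB by default, tunable through the new vfs.hirunningspace sysctl), and runningbufwakeup() wakes them only once completions have drained the total back to lorunningspace (512 KB, vfs.lorunningspace). The gap between the marks is deliberate hysteresis, so writers are not woken and re-blocked on every completion. The standalone sketch below models that behavior in userland; all names are illustrative stand-ins, not part of the patch:

    #include <stdio.h>

    /* Illustrative stand-ins for the kernel globals. */
    static int runningbufspace;                    /* write I/O in flight */
    static const int lorunningspace = 512 * 1024;  /* resume mark */
    static const int hirunningspace = 1024 * 1024; /* stall mark */
    static int writer_blocked;  /* models tsleep(&runningbufreq, "wdrain") */

    /* Producer side, modeled on vfs_busy_pages() + waitrunningbufspace(). */
    static void start_write(int bytes)
    {
        runningbufspace += bytes;
        if (runningbufspace > hirunningspace)
            writer_blocked = 1;          /* writer sleeps in "wdrain" */
    }

    /* Completion side, modeled on runningbufwakeup(). */
    static void finish_write(int bytes)
    {
        runningbufspace -= bytes;
        /* Wake at the LOW mark only: completions must drain half a
         * megabyte before blocked writers run again. */
        if (writer_blocked && runningbufspace <= lorunningspace)
            writer_blocked = 0;          /* wakeup(&runningbufreq) */
    }

    int main(void)
    {
        int i;

        for (i = 0; i < 20; i++)         /* queue 20 async 64K writes */
            start_write(64 * 1024);
        printf("in flight: %d, blocked: %d\n", runningbufspace, writer_blocked);
        while (writer_blocked)           /* drain until the low mark */
            finish_write(64 * 1024);
        printf("in flight: %d, blocked: %d\n", runningbufspace, writer_blocked);
        return (0);
    }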
@@ -248,7 +294,7 @@ static __inline__
void
bd_wakeup(int dirtybuflevel)
{
- if (numdirtybuffers >= dirtybuflevel && bd_request == 0) {
+ if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) {
bd_request = 1;
wakeup(&bd_request);
}
@@ -330,6 +376,9 @@ bufinit(void)
hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10);
lobufspace = hibufspace - MAXBSIZE;
+ lorunningspace = 512 * 1024;
+ hirunningspace = 1024 * 1024;
+
/*
* Limit the amount of malloc memory since it is wired permanently into
* the kernel space. Even though this is accounted for in the buffer
@@ -354,6 +403,7 @@ bufinit(void)
while (hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
hidirtybuffers >>= 1;
}
+ lodirtybuffers = hidirtybuffers / 2;
/*
* Try to keep the number of free buffers in the specified range,
@@ -370,8 +420,6 @@ bufinit(void)
* based on the number of bytes of I/O in-transit that were initiated
* from buf_daemon.
*/
- if ((maxbdrun = nswbuf / 4) < 4)
- maxbdrun = 4;
bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
bogus_page = vm_page_alloc(kernel_object,
@@ -419,7 +467,6 @@ bremfree(struct buf * bp)
KASSERT(BUF_REFCNT(bp) == 1, ("bremfree: bp %p not locked",bp));
TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
bp->b_qindex = QUEUE_NONE;
- runningbufspace += bp->b_bufsize;
} else {
if (BUF_REFCNT(bp) <= 1)
panic("bremfree: removing a buffer not on a queue");
@@ -659,6 +706,13 @@ bwrite(struct buf * bp)
int rtval = bufwait(bp);
brelse(bp);
return (rtval);
+ } else {
+ /*
+ * don't allow the async write to saturate the I/O
+ * system. There is no chance of deadlock here because
+ * we are blocking on I/O that is already in-progress.
+ */
+ waitrunningbufspace();
}
return (0);
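The hunk above is where the throttle engages. From a caller's perspective, a loop issuing asynchronous writes used to queue buffers as fast as it could dirty them, with nothing bounding the backlog; after this change each pass may transparently pause in the "wdrain" sleep until earlier writes complete. There is no deadlock risk because the sleeper waits only on I/O already handed to the driver, and the completion path (bufdone(), later in this patch) calls runningbufwakeup() regardless of what the sleeper holds. A hypothetical writer, for illustration only:

    /*
     * Hypothetical kernel writer flooding a vnode with async writes.
     * bawrite() sets B_ASYNC and calls bwrite(), which now ends in
     * waitrunningbufspace() -- so the loop self-paces against the disk.
     */
    static void
    flood_vnode(struct vnode *vp, caddr_t src, int nblocks, int bsize)
    {
        struct buf *bp;
        int blkno;

        for (blkno = 0; blkno < nblocks; blkno++) {
            bp = getblk(vp, blkno, bsize, 0, 0);
            bcopy(src + blkno * bsize, bp->b_data, bsize);
            bawrite(bp);        /* async, but may sleep in "wdrain" */
        }
    }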
@@ -774,11 +828,11 @@ bdwrite(struct buf * bp)
bqrelse(bp);
/*
- * Wakeup the buffer flushing daemon if we have saturated the
- * buffer cache.
+ * Wakeup the buffer flushing daemon if we have a lot of dirty
+ * buffers (midpoint between our recovery point and our stall
+ * point).
*/
-
- bd_wakeup(hidirtybuffers);
+ bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
/*
* note: we cannot initiate I/O from a bdwrite even if we wanted to,
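To put illustrative numbers on the midpoint heuristic above: bufinit() now sets lodirtybuffers = hidirtybuffers / 2, so if hidirtybuffers works out to, say, 1000 at boot (the real value is derived from the buffer cache size), lodirtybuffers is 500 and bd_wakeup((500 + 1000) / 2) kicks the daemon at 750 dirty buffers. The daemon then flushes down to 500 before idling, so the steady-state dirty count oscillates between the two marks instead of pinning at the old bd_wakeup(hidirtybuffers) stall point.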
@@ -817,7 +871,7 @@ bdirty(bp)
bp->b_flags |= B_DONE | B_DELWRI;
reassignbuf(bp, bp->b_vp);
++numdirtybuffers;
- bd_wakeup(hidirtybuffers);
+ bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
}
}
@@ -843,7 +897,7 @@ bundirty(bp)
bp->b_flags &= ~B_DELWRI;
reassignbuf(bp, bp->b_vp);
--numdirtybuffers;
- numdirtywakeup();
+ numdirtywakeup(lodirtybuffers);
}
/*
* Since it is now being written, we can clear its deferred write flag.
@@ -896,14 +950,12 @@ bowrite(struct buf * bp)
void
bwillwrite(void)
{
- int slop = hidirtybuffers / 10;
-
- if (numdirtybuffers > hidirtybuffers + slop) {
+ if (numdirtybuffers >= hidirtybuffers) {
int s;
s = splbio();
- while (numdirtybuffers > hidirtybuffers) {
- bd_wakeup(hidirtybuffers);
+ while (numdirtybuffers >= hidirtybuffers) {
+ bd_wakeup(1);
needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
tsleep(&needsbuffer, (PRIBIO + 4), "flswai", 0);
}
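bwillwrite() is the hard ceiling of the scheme: it is meant to run at the top of the write path, before any vnode or buffer locks are taken, so a process about to dirty more buffers stalls at a safe point once hidirtybuffers is reached (note the threshold is now >= with no slop, and bd_wakeup(1) forces the daemon to run unconditionally). A hypothetical call site, sketched after the vn_write() pattern; the function and its locking are illustrative, not part of this patch:

    static int
    example_write(struct vnode *vp, struct uio *uio, struct ucred *cred)
    {
        int error;

        if (vp->v_type == VREG)
            bwillwrite();       /* may sleep in "flswai"; no locks held */
        vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curproc);
        error = VOP_WRITE(vp, uio, 0, cred);
        VOP_UNLOCK(vp, 0, curproc);
        return (error);
    }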
@@ -963,7 +1015,7 @@ brelse(struct buf * bp)
buf_deallocate(bp);
if (bp->b_flags & B_DELWRI) {
--numdirtybuffers;
- numdirtywakeup();
+ numdirtywakeup(lodirtybuffers);
}
bp->b_flags &= ~(B_DELWRI | B_CACHE);
if ((bp->b_flags & B_VMIO) == 0) {
@@ -1169,11 +1221,9 @@ brelse(struct buf * bp)
if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI)) {
bp->b_flags &= ~B_DELWRI;
--numdirtybuffers;
- numdirtywakeup();
+ numdirtywakeup(lodirtybuffers);
}
- runningbufspace -= bp->b_bufsize;
-
/*
* Fixup numfreebuffers count. The bp is on an appropriate queue
* unless locked. We then bump numfreebuffers if it is not B_DELWRI.
@@ -1248,8 +1298,6 @@ bqrelse(struct buf * bp)
TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
}
- runningbufspace -= bp->b_bufsize;
-
if ((bp->b_flags & B_LOCKED) == 0 &&
((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))) {
bufcountwakeup();
@@ -1309,13 +1357,13 @@ vfs_vmio_release(bp)
}
}
}
- runningbufspace -= bp->b_bufsize;
splx(s);
pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
- if (bp->b_bufsize)
+ if (bp->b_bufsize) {
bufspacewakeup();
+ bp->b_bufsize = 0;
+ }
bp->b_npages = 0;
- bp->b_bufsize = 0;
bp->b_flags &= ~B_VMIO;
if (bp->b_vp)
brelvp(bp);
@@ -1723,27 +1771,6 @@ restart:
return(bp);
}
-#if 0
-/*
- * waitfreebuffers:
- *
- * Wait for sufficient free buffers. Only called from normal processes.
- */
-
-static void
-waitfreebuffers(int slpflag, int slptimeo)
-{
- while (numfreebuffers < hifreebuffers) {
- if (numfreebuffers >= hifreebuffers)
- break;
- needsbuffer |= VFS_BIO_NEED_FREE;
- if (tsleep(&needsbuffer, (PRIBIO + 4)|slpflag, "biofre", slptimeo))
- break;
- }
-}
-
-#endif
-
/*
* buf_daemon:
*
@@ -1753,9 +1780,6 @@ waitfreebuffers(int slpflag, int slptimeo)
*/
static struct proc *bufdaemonproc;
-static int bd_interval;
-static int bd_flushto;
-static int bd_flushinc;
static struct kproc_desc buf_kp = {
"bufdaemon",
@@ -1783,65 +1807,50 @@ buf_daemon()
curproc->p_flag |= P_BUFEXHAUST;
s = splbio();
- bd_interval = 5 * hz; /* dynamically adjusted */
- bd_flushto = hidirtybuffers; /* dynamically adjusted */
- bd_flushinc = 1;
-
for (;;) {
kthread_suspend_check(bufdaemonproc);
bd_request = 0;
/*
- * Do the flush. Limit the number of buffers we flush in one
- * go. The failure condition occurs when processes are writing
- * buffers faster then we can dispose of them. In this case
- * we may be flushing so often that the previous set of flushes
- * have not had time to complete, causing us to run out of
- * physical buffers and block.
+ * Do the flush. Limit the amount of in-transit I/O we
+ * allow to build up, otherwise we would completely saturate
+ * the I/O system. Wakeup any waiting processes before we
+ * normally would so they can run in parallel with our drain.
*/
- {
- int runcount = maxbdrun;
-
- while (numdirtybuffers > bd_flushto && runcount) {
- --runcount;
- if (flushbufqueues() == 0)
- break;
- }
+ while (numdirtybuffers > lodirtybuffers) {
+ if (flushbufqueues() == 0)
+ break;
+ waitrunningbufspace();
+ numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
}
- if (bd_request ||
- tsleep(&bd_request, PVM, "psleep", bd_interval) == 0) {
+ /*
+ * Only clear bd_request if we have reached our low water
+ * mark. The buf_daemon normally waits 5 seconds and
+ * then incrementally flushes any dirty buffers that have
+ * built up, within reason.
+ *
+ * If we were unable to hit our low water mark and couldn't
+ * find any flushable buffers, we sleep half a second.
+ * Otherwise we loop immediately.
+ */
+ if (numdirtybuffers <= lodirtybuffers) {
/*
- * Another request is pending or we were woken up
- * without timing out. Flush more.
+ * We reached our low water mark, reset the
+ * request and sleep until we are needed again.
+ * The sleep is just so the suspend code works.
*/
- --bd_flushto;
- if (bd_flushto >= numdirtybuffers - 5) {
- bd_flushto = numdirtybuffers - 10;
- bd_flushinc = 1;
- }
- if (bd_flushto < 2)
- bd_flushto = 2;
+ bd_request = 0;
+ tsleep(&bd_request, PVM, "psleep", hz);
} else {
/*
- * We slept and timed out, we can slow down.
+ * We couldn't find any flushable dirty buffers but
+ * still have too many dirty buffers, we
+ * have to sleep and try again. (rare)
*/
- bd_flushto += bd_flushinc;
- if (bd_flushto > hidirtybuffers)
- bd_flushto = hidirtybuffers;
- ++bd_flushinc;
- if (bd_flushinc > hidirtybuffers / 20 + 1)
- bd_flushinc = hidirtybuffers / 20 + 1;
+ tsleep(&bd_request, PVM, "qsleep", hz / 2);
}
-
- /*
- * Set the interval on a linear scale based on hidirtybuffers
- * with a maximum frequency of 1/10 second.
- */
- bd_interval = bd_flushto * 5 * hz / hidirtybuffers;
- if (bd_interval < hz / 10)
- bd_interval = hz / 10;
}
}
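Stripped of comments, the rewritten daemon reduces to a three-state loop. The sketch below is a condensed restatement of the hunk above, not new logic:

    for (;;) {
        /* 1. Drain: flush toward the low mark, pacing on in-flight
         *    I/O and waking stalled writers once below the midpoint. */
        while (numdirtybuffers > lodirtybuffers) {
            if (flushbufqueues() == 0)
                break;                  /* nothing flushable */
            waitrunningbufspace();
            numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
        }
        if (numdirtybuffers <= lodirtybuffers) {
            /* 2. Idle: goal reached; sleep until bd_wakeup() posts
             *    a request (the timeout mainly serves suspend). */
            bd_request = 0;
            tsleep(&bd_request, PVM, "psleep", hz);
        } else {
            /* 3. Stuck: dirty buffers remain but none are flushable;
             *    back off for half a second and retry. */
            tsleep(&bd_request, PVM, "qsleep", hz / 2);
        }
    }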
@@ -2097,21 +2106,11 @@ loop:
*
* XXX remove if 0 sections (clean this up after its proven)
*/
-#if 0
- if (curproc == idleproc || (curproc->p_flag & P_BUFEXHAUST)) {
-#endif
- if (numfreebuffers == 0) {
- if (curproc == idleproc)
- return NULL;
- needsbuffer |= VFS_BIO_NEED_ANY;
- tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf",
- slptimeo);
- }
-#if 0
- } else if (numfreebuffers < lofreebuffers) {
- waitfreebuffers(slpflag, slptimeo);
+ if (numfreebuffers == 0) {
+ if (curproc == idleproc)
+ return NULL;
+ needsbuffer |= VFS_BIO_NEED_ANY;
}
-#endif
if ((bp = gbincore(vp, blkno))) {
/*
@@ -2357,12 +2356,12 @@ allocbuf(struct buf *bp, int size)
bp->b_bcount = size;
} else {
free(bp->b_data, M_BIOBUF);
- bufmallocspace -= bp->b_bufsize;
- runningbufspace -= bp->b_bufsize;
- if (bp->b_bufsize)
+ if (bp->b_bufsize) {
+ bufmallocspace -= bp->b_bufsize;
bufspacewakeup();
+ bp->b_bufsize = 0;
+ }
bp->b_data = bp->b_kvabase;
- bp->b_bufsize = 0;
bp->b_bcount = 0;
bp->b_flags &= ~B_MALLOC;
}
@@ -2389,7 +2388,6 @@ allocbuf(struct buf *bp, int size)
bp->b_bcount = size;
bp->b_flags |= B_MALLOC;
bufmallocspace += mbsize;
- runningbufspace += bp->b_bufsize;
return 1;
}
#endif
@@ -2404,11 +2402,11 @@ allocbuf(struct buf *bp, int size)
origbuf = bp->b_data;
origbufsize = bp->b_bufsize;
bp->b_data = bp->b_kvabase;
- bufmallocspace -= bp->b_bufsize;
- runningbufspace -= bp->b_bufsize;
- if (bp->b_bufsize)
+ if (bp->b_bufsize) {
+ bufmallocspace -= bp->b_bufsize;
bufspacewakeup();
- bp->b_bufsize = 0;
+ bp->b_bufsize = 0;
+ }
bp->b_flags &= ~B_MALLOC;
newbsize = round_page(newbsize);
}
@@ -2601,7 +2599,6 @@ allocbuf(struct buf *bp, int size)
(vm_offset_t)(bp->b_offset & PAGE_MASK));
}
}
- runningbufspace += (newbsize - bp->b_bufsize);
if (newbsize < bp->b_bufsize)
bufspacewakeup();
bp->b_bufsize = newbsize; /* actual buffer allocation */
@@ -2681,6 +2678,7 @@ bufdone(struct buf *bp)
KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
bp->b_flags |= B_DONE;
+ runningbufwakeup(bp);
if (bp->b_iocmd == BIO_DELETE) {
brelse(bp);
@@ -2768,18 +2766,8 @@ bufdone(struct buf *bp)
if (m == bogus_page) {
bogusflag = 1;
m = vm_page_lookup(obj, OFF_TO_IDX(foff));
- if (!m) {
+ if (m == NULL)
panic("biodone: page disappeared!");
-#if defined(VFS_BIO_DEBUG)
- printf("biodone: page disappeared\n");
-#endif
- vm_object_pip_subtract(obj, 1);
- bp->b_flags &= ~B_CACHE;
- foff = (foff + PAGE_SIZE) &
- ~(off_t)PAGE_MASK;
- iosize -= resid;
- continue;
- }
bp->b_pages[i] = m;
pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
}
@@ -2833,6 +2821,7 @@ bufdone(struct buf *bp)
if (obj)
vm_object_pip_wakeupn(obj, 0);
}
+
/*
* For asynchronous completions, release the buffer now. The brelse
* will do a wakeup there if necessary - so no need to do a wakeup
@@ -2860,6 +2849,7 @@ vfs_unbusy_pages(struct buf * bp)
{
int i;
+ runningbufwakeup(bp);
if (bp->b_flags & B_VMIO) {
struct vnode *vp = bp->b_vp;
vm_object_t obj;
@@ -2939,6 +2929,9 @@ vfs_busy_pages(struct buf * bp, int clear_modify)
{
int i, bogus;
+ bp->b_runningbufspace = bp->b_bufsize;
+ runningbufspace += bp->b_runningbufspace;
+
if (bp->b_flags & B_VMIO) {
struct vnode *vp = bp->b_vp;
vm_object_t obj;
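This final hunk opens the accounting window that runningbufwakeup() closes: every buffer passed through vfs_busy_pages() before its strategy call is charged, and the charge is released exactly once on completion. bufdone() and vfs_unbusy_pages() both call runningbufwakeup(), but the b_runningbufspace guard makes the second call a no-op, which is why the field is zeroed inside the wakeup. In sketch form, using only names from the patch:

    /*
     * vfs_busy_pages(bp):   bp->b_runningbufspace = bp->b_bufsize;
     *                       runningbufspace += bp->b_runningbufspace;
     *     ... device performs the I/O ...
     * bufdone(bp) /         runningbufwakeup(bp):
     * vfs_unbusy_pages(bp):     runningbufspace -= bp->b_runningbufspace;
     *                           bp->b_runningbufspace = 0;
     *                           wakeup writers once <= lorunningspace
     *
     * Buffers that never pass through vfs_busy_pages() carry a zero
     * b_runningbufspace, so runningbufwakeup() is a no-op for them.
     */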