Diffstat (limited to 'sys/kern/vfs_bio.c')
-rw-r--r-- | sys/kern/vfs_bio.c | 263 |
1 files changed, 128 insertions, 135 deletions
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 21d447d..9949813 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -85,22 +85,24 @@ static void buf_daemon __P((void));
  * but the code is intricate enough already.
  */
 vm_page_t bogus_page;
-int runningbufspace;
 int vmiodirenable = FALSE;
+int runningbufspace;
 static vm_offset_t bogus_offset;
 
-static int bufspace, maxbufspace,
+static int bufspace, maxbufspace,
 	bufmallocspace, maxbufmallocspace, lobufspace, hibufspace;
 static int bufreusecnt, bufdefragcnt, buffreekvacnt;
-static int maxbdrun;
 static int needsbuffer;
-static int numdirtybuffers, hidirtybuffers;
+static int lorunningspace, hirunningspace, runningbufreq;
+static int numdirtybuffers, lodirtybuffers, hidirtybuffers;
 static int numfreebuffers, lofreebuffers, hifreebuffers;
 static int getnewbufcalls;
 static int getnewbufrestarts;
 
 SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD,
 	&numdirtybuffers, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW,
+	&lodirtybuffers, 0, "");
 SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW,
 	&hidirtybuffers, 0, "");
 SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD,
@@ -111,6 +113,10 @@ SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW,
 	&hifreebuffers, 0, "");
 SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD,
 	&runningbufspace, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW,
+	&lorunningspace, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW,
+	&hirunningspace, 0, "");
 SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD,
 	&maxbufspace, 0, "");
 SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD,
@@ -119,8 +125,6 @@ SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD,
 	&lobufspace, 0, "");
 SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
 	&bufspace, 0, "");
-SYSCTL_INT(_vfs, OID_AUTO, maxbdrun, CTLFLAG_RW,
-	&maxbdrun, 0, "");
 SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW,
 	&maxbufmallocspace, 0, "");
 SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD,
@@ -170,9 +174,9 @@ bufhash(struct vnode *vnp, daddr_t bn)
  */
 
 static __inline void
-numdirtywakeup(void)
+numdirtywakeup(int level)
 {
-	if (numdirtybuffers < hidirtybuffers) {
+	if (numdirtybuffers <= level) {
 		if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) {
 			needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH;
 			wakeup(&needsbuffer);
@@ -204,6 +208,23 @@ bufspacewakeup(void)
 }
 
 /*
+ *	runningbufwakeup() - in-progress I/O accounting.
+ *
+ */
+static __inline void
+runningbufwakeup(struct buf *bp)
+{
+	if (bp->b_runningbufspace) {
+		runningbufspace -= bp->b_runningbufspace;
+		bp->b_runningbufspace = 0;
+		if (runningbufreq && runningbufspace <= lorunningspace) {
+			runningbufreq = 0;
+			wakeup(&runningbufreq);
+		}
+	}
+}
+
+/*
  *	bufcountwakeup:
  *
  *	Called when a buffer has been added to one of the free queues to
@@ -225,6 +246,31 @@ bufcountwakeup(void)
 }
 
 /*
+ *	waitrunningbufspace()
+ *
+ *	runningbufspace is a measure of the amount of I/O currently
+ *	running.  This routine is used in async-write situations to
+ *	prevent creating huge backups of pending writes to a device.
+ *	Only asynchronous writes are governed by this function.
+ *
+ *	Reads will adjust runningbufspace, but will not block based on it.
+ *	The read load has a side effect of reducing the allowed write load.
+ *
+ *	This does NOT turn an async write into a sync write.  It waits
+ *	for earlier writes to complete and generally returns before the
+ *	caller's write has reached the device.
+ */
+static __inline void
+waitrunningbufspace(void)
+{
+	while (runningbufspace > hirunningspace) {
+		++runningbufreq;
+		tsleep(&runningbufreq, PVM, "wdrain", 0);
+	}
+}
+
+/*
  *	vfs_buf_test_cache:
  *
  *	Called when a buffer is extended.  This function clears the B_CACHE
@@ -248,7 +294,7 @@
 static __inline__ void
 bd_wakeup(int dirtybuflevel)
 {
-	if (numdirtybuffers >= dirtybuflevel && bd_request == 0) {
+	if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) {
 		bd_request = 1;
 		wakeup(&bd_request);
 	}
@@ -330,6 +376,9 @@ bufinit(void)
 	hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10);
 	lobufspace = hibufspace - MAXBSIZE;
 
+	lorunningspace = 512 * 1024;
+	hirunningspace = 1024 * 1024;
+
 /*
  * Limit the amount of malloc memory since it is wired permanently into
  * the kernel space.  Even though this is accounted for in the buffer
@@ -354,6 +403,7 @@ bufinit(void)
 	while (hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
 		hidirtybuffers >>= 1;
 	}
+	lodirtybuffers = hidirtybuffers / 2;
 
 /*
  * Try to keep the number of free buffers in the specified range,
@@ -370,8 +420,6 @@ bufinit(void)
  * based on the number of bytes of I/O in-transit that were initiated
  * from buf_daemon.
  */
-	if ((maxbdrun = nswbuf / 4) < 4)
-		maxbdrun = 4;
 
 	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
 	bogus_page = vm_page_alloc(kernel_object,
@@ -419,7 +467,6 @@ bremfree(struct buf * bp)
 		KASSERT(BUF_REFCNT(bp) == 1, ("bremfree: bp %p not locked",bp));
 		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
 		bp->b_qindex = QUEUE_NONE;
-		runningbufspace += bp->b_bufsize;
 	} else {
 		if (BUF_REFCNT(bp) <= 1)
 			panic("bremfree: removing a buffer not on a queue");
@@ -659,6 +706,13 @@ bwrite(struct buf * bp)
 		int rtval = bufwait(bp);
 		brelse(bp);
 		return (rtval);
+	} else {
+		/*
+		 * don't allow the async write to saturate the I/O
+		 * system.  There is no chance of deadlock here because
+		 * we are blocking on I/O that is already in-progress.
+		 */
+		waitrunningbufspace();
 	}
 
 	return (0);
@@ -774,11 +828,11 @@ bdwrite(struct buf * bp)
 	bqrelse(bp);
 
 	/*
-	 * Wakeup the buffer flushing daemon if we have saturated the
-	 * buffer cache.
+	 * Wakeup the buffer flushing daemon if we have a lot of dirty
+	 * buffers (midpoint between our recovery point and our stall
+	 * point).
 	 */
-
-	bd_wakeup(hidirtybuffers);
+	bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
 
 	/*
 	 * note: we cannot initiate I/O from a bdwrite even if we wanted to,
@@ -817,7 +871,7 @@ bdirty(bp)
 		bp->b_flags |= B_DONE | B_DELWRI;
 		reassignbuf(bp, bp->b_vp);
 		++numdirtybuffers;
-		bd_wakeup(hidirtybuffers);
+		bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
 	}
 }
 
@@ -843,7 +897,7 @@ bundirty(bp)
 		bp->b_flags &= ~B_DELWRI;
 		reassignbuf(bp, bp->b_vp);
 		--numdirtybuffers;
-		numdirtywakeup();
+		numdirtywakeup(lodirtybuffers);
 	}
 	/*
 	 * Since it is now being written, we can clear its deferred write flag.
@@ -896,14 +950,12 @@ bowrite(struct buf * bp)
 void
 bwillwrite(void)
 {
-	int slop = hidirtybuffers / 10;
-
-	if (numdirtybuffers > hidirtybuffers + slop) {
+	if (numdirtybuffers >= hidirtybuffers) {
 		int s;
 
 		s = splbio();
-		while (numdirtybuffers > hidirtybuffers) {
-			bd_wakeup(hidirtybuffers);
+		while (numdirtybuffers >= hidirtybuffers) {
+			bd_wakeup(1);
 			needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
 			tsleep(&needsbuffer, (PRIBIO + 4), "flswai", 0);
 		}
@@ -963,7 +1015,7 @@ brelse(struct buf * bp)
 			buf_deallocate(bp);
 		if (bp->b_flags & B_DELWRI) {
 			--numdirtybuffers;
-			numdirtywakeup();
+			numdirtywakeup(lodirtybuffers);
 		}
 		bp->b_flags &= ~(B_DELWRI | B_CACHE);
 		if ((bp->b_flags & B_VMIO) == 0) {
@@ -1169,11 +1221,9 @@ brelse(struct buf * bp)
 	if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI)) {
 		bp->b_flags &= ~B_DELWRI;
 		--numdirtybuffers;
-		numdirtywakeup();
+		numdirtywakeup(lodirtybuffers);
 	}
 
-	runningbufspace -= bp->b_bufsize;
-
 	/*
 	 * Fixup numfreebuffers count.  The bp is on an appropriate queue
 	 * unless locked.  We then bump numfreebuffers if it is not B_DELWRI.
@@ -1248,8 +1298,6 @@ bqrelse(struct buf * bp)
 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
 	}
 
-	runningbufspace -= bp->b_bufsize;
-
 	if ((bp->b_flags & B_LOCKED) == 0 &&
 	    ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))) {
 		bufcountwakeup();
@@ -1309,13 +1357,13 @@ vfs_vmio_release(bp)
 			}
 		}
 	}
-	runningbufspace -= bp->b_bufsize;
 	splx(s);
 	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
-	if (bp->b_bufsize)
+	if (bp->b_bufsize) {
 		bufspacewakeup();
+		bp->b_bufsize = 0;
+	}
 	bp->b_npages = 0;
-	bp->b_bufsize = 0;
 	bp->b_flags &= ~B_VMIO;
 	if (bp->b_vp)
 		brelvp(bp);
@@ -1723,27 +1771,6 @@ restart:
 	return(bp);
 }
 
-#if 0
-/*
- * waitfreebuffers:
- *
- *	Wait for sufficient free buffers.  Only called from normal processes.
- */
-
-static void
-waitfreebuffers(int slpflag, int slptimeo)
-{
-	while (numfreebuffers < hifreebuffers) {
-		if (numfreebuffers >= hifreebuffers)
-			break;
-		needsbuffer |= VFS_BIO_NEED_FREE;
-		if (tsleep(&needsbuffer, (PRIBIO + 4)|slpflag, "biofre", slptimeo))
-			break;
-	}
-}
-
-#endif
-
 /*
  * buf_daemon:
  *
@@ -1753,9 +1780,6 @@ waitfreebuffers(int slpflag, int slptimeo)
  */
 
 static struct proc *bufdaemonproc;
-static int bd_interval;
-static int bd_flushto;
-static int bd_flushinc;
 
 static struct kproc_desc buf_kp = {
 	"bufdaemon",
@@ -1783,65 +1807,50 @@ buf_daemon()
 	curproc->p_flag |= P_BUFEXHAUST;
 	s = splbio();
 
-	bd_interval = 5 * hz;	/* dynamically adjusted */
-	bd_flushto = hidirtybuffers;	/* dynamically adjusted */
-	bd_flushinc = 1;
-
 	for (;;) {
 		kthread_suspend_check(bufdaemonproc);
 
 		bd_request = 0;
 
 		/*
-		 * Do the flush.  Limit the number of buffers we flush in one
-		 * go.  The failure condition occurs when processes are writing
-		 * buffers faster then we can dispose of them.  In this case
-		 * we may be flushing so often that the previous set of flushes
-		 * have not had time to complete, causing us to run out of
-		 * physical buffers and block.
+		 * Do the flush.  Limit the amount of in-transit I/O we
+		 * allow to build up, otherwise we would completely saturate
+		 * the I/O system.  Wakeup any waiting processes before we
+		 * normally would so they can run in parallel with our drain.
 		 */
-		{
-			int runcount = maxbdrun;
-
-			while (numdirtybuffers > bd_flushto && runcount) {
-				--runcount;
-				if (flushbufqueues() == 0)
-					break;
-			}
+		while (numdirtybuffers > lodirtybuffers) {
+			if (flushbufqueues() == 0)
+				break;
+			waitrunningbufspace();
+			numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
 		}
 
-		if (bd_request ||
-		    tsleep(&bd_request, PVM, "psleep", bd_interval) == 0) {
+		/*
+		 * Only clear bd_request if we have reached our low water
+		 * mark.  The buf_daemon normally waits 5 seconds and
+		 * then incrementally flushes any dirty buffers that have
+		 * built up, within reason.
+		 *
+		 * If we were unable to hit our low water mark and couldn't
+		 * find any flushable buffers, we sleep half a second.
+		 * Otherwise we loop immediately.
+		 */
+		if (numdirtybuffers <= lodirtybuffers) {
 			/*
-			 * Another request is pending or we were woken up
-			 * without timing out.  Flush more.
+			 * We reached our low water mark, reset the
+			 * request and sleep until we are needed again.
+			 * The sleep is just so the suspend code works.
 			 */
-			--bd_flushto;
-			if (bd_flushto >= numdirtybuffers - 5) {
-				bd_flushto = numdirtybuffers - 10;
-				bd_flushinc = 1;
-			}
-			if (bd_flushto < 2)
-				bd_flushto = 2;
+			bd_request = 0;
+			tsleep(&bd_request, PVM, "psleep", hz);
 		} else {
 			/*
-			 * We slept and timed out, we can slow down.
+			 * We couldn't find any flushable dirty buffers but
+			 * still have too many dirty buffers, we
+			 * have to sleep and try again. (rare)
 			 */
-			bd_flushto += bd_flushinc;
-			if (bd_flushto > hidirtybuffers)
-				bd_flushto = hidirtybuffers;
-			++bd_flushinc;
-			if (bd_flushinc > hidirtybuffers / 20 + 1)
-				bd_flushinc = hidirtybuffers / 20 + 1;
+			tsleep(&bd_request, PVM, "qsleep", hz / 2);
 		}
-
-		/*
-		 * Set the interval on a linear scale based on hidirtybuffers
-		 * with a maximum frequency of 1/10 second.
-		 */
-		bd_interval = bd_flushto * 5 * hz / hidirtybuffers;
-		if (bd_interval < hz / 10)
-			bd_interval = hz / 10;
 	}
 }
@@ -2097,21 +2106,11 @@ loop:
 	 *
 	 * XXX remove if 0 sections (clean this up after its proven)
 	 */
-#if 0
-	if (curproc == idleproc || (curproc->p_flag & P_BUFEXHAUST)) {
-#endif
-		if (numfreebuffers == 0) {
-			if (curproc == idleproc)
-				return NULL;
-			needsbuffer |= VFS_BIO_NEED_ANY;
-			tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf",
-			    slptimeo);
-		}
-#if 0
-	} else if (numfreebuffers < lofreebuffers) {
-		waitfreebuffers(slpflag, slptimeo);
+	if (numfreebuffers == 0) {
+		if (curproc == idleproc)
+			return NULL;
+		needsbuffer |= VFS_BIO_NEED_ANY;
 	}
-#endif
 
 	if ((bp = gbincore(vp, blkno))) {
@@ -2357,12 +2356,12 @@ allocbuf(struct buf *bp, int size)
 			bp->b_bcount = size;
 		} else {
 			free(bp->b_data, M_BIOBUF);
-			bufmallocspace -= bp->b_bufsize;
-			runningbufspace -= bp->b_bufsize;
-			if (bp->b_bufsize)
+			if (bp->b_bufsize) {
+				bufmallocspace -= bp->b_bufsize;
 				bufspacewakeup();
+				bp->b_bufsize = 0;
+			}
 			bp->b_data = bp->b_kvabase;
-			bp->b_bufsize = 0;
 			bp->b_bcount = 0;
 			bp->b_flags &= ~B_MALLOC;
 		}
@@ -2389,7 +2388,6 @@ allocbuf(struct buf *bp, int size)
 				bp->b_bcount = size;
 				bp->b_flags |= B_MALLOC;
 				bufmallocspace += mbsize;
-				runningbufspace += bp->b_bufsize;
 				return 1;
 			}
 #endif
@@ -2404,11 +2402,11 @@ allocbuf(struct buf *bp, int size)
 				origbuf = bp->b_data;
 				origbufsize = bp->b_bufsize;
 				bp->b_data = bp->b_kvabase;
-				bufmallocspace -= bp->b_bufsize;
-				runningbufspace -= bp->b_bufsize;
-				if (bp->b_bufsize)
+				if (bp->b_bufsize) {
+					bufmallocspace -= bp->b_bufsize;
 					bufspacewakeup();
-				bp->b_bufsize = 0;
+					bp->b_bufsize = 0;
+				}
 				bp->b_flags &= ~B_MALLOC;
 				newbsize = round_page(newbsize);
 			}
@@ -2601,7 +2599,6 @@ allocbuf(struct buf *bp, int size)
 			    (vm_offset_t)(bp->b_offset & PAGE_MASK));
 		}
 	}
-	runningbufspace += (newbsize - bp->b_bufsize);
 	if (newbsize < bp->b_bufsize)
 		bufspacewakeup();
 	bp->b_bufsize = newbsize;	/* actual buffer allocation */
@@ -2681,6 +2678,7 @@ bufdone(struct buf *bp)
 
 	KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
 	bp->b_flags |= B_DONE;
+	runningbufwakeup(bp);
 
 	if (bp->b_iocmd == BIO_DELETE) {
 		brelse(bp);
@@ -2768,18 +2766,8 @@ bufdone(struct buf *bp)
 			if (m == bogus_page) {
 				bogusflag = 1;
 				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
-				if (!m) {
+				if (m == NULL)
 					panic("biodone: page disappeared!");
-#if defined(VFS_BIO_DEBUG)
-					printf("biodone: page disappeared\n");
-#endif
-					vm_object_pip_subtract(obj, 1);
-					bp->b_flags &= ~B_CACHE;
-					foff = (foff + PAGE_SIZE) &
-					    ~(off_t)PAGE_MASK;
-					iosize -= resid;
-					continue;
-				}
 				bp->b_pages[i] = m;
 				pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
 			}
@@ -2833,6 +2821,7 @@ bufdone(struct buf *bp)
 		if (obj)
 			vm_object_pip_wakeupn(obj, 0);
 	}
+
 	/*
 	 * For asynchronous completions, release the buffer now. The brelse
 	 * will do a wakeup there if necessary - so no need to do a wakeup
@@ -2860,6 +2849,7 @@ vfs_unbusy_pages(struct buf * bp)
 {
 	int i;
 
+	runningbufwakeup(bp);
 	if (bp->b_flags & B_VMIO) {
 		struct vnode *vp = bp->b_vp;
 		vm_object_t obj;
@@ -2939,6 +2929,9 @@ vfs_busy_pages(struct buf * bp, int clear_modify)
 {
 	int i, bogus;
 
+	bp->b_runningbufspace = bp->b_bufsize;
+	runningbufspace += bp->b_runningbufspace;
+
 	if (bp->b_flags & B_VMIO) {
 		struct vnode *vp = bp->b_vp;
 		vm_object_t obj;
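
The heart of this change is a hysteresis throttle on in-flight write I/O: vfs_busy_pages() charges each buffer's size to runningbufspace before the transfer starts, runningbufwakeup() credits it back at completion and wakes sleepers only once the total has drained to lorunningspace, and waitrunningbufspace() blocks initiators of new async writes while the total still exceeds hirunningspace. The standalone C sketch below shows the same pattern in userland terms; pthread mutex/condvar stand in for the kernel's splbio()/tsleep()/wakeup(), and the names (io_lock, io_start, io_done, io_wait_room) are illustrative, not from the patch.

/*
 * Userland sketch of the runningbufspace throttle (illustrative only).
 */
#include <pthread.h>
#include <stddef.h>

static pthread_mutex_t io_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  io_drained = PTHREAD_COND_INITIALIZER;

static size_t inflight;                         /* ~runningbufspace */
static const size_t lo_water = 512 * 1024;      /* ~lorunningspace */
static const size_t hi_water = 1024 * 1024;     /* ~hirunningspace */

/* Analogue of vfs_busy_pages(): charge the I/O before starting it. */
static void
io_start(size_t bytes)
{
	pthread_mutex_lock(&io_lock);
	inflight += bytes;
	pthread_mutex_unlock(&io_lock);
}

/*
 * Analogue of runningbufwakeup(): credit the I/O at completion and
 * wake waiters only once the backlog drains to the low watermark.
 */
static void
io_done(size_t bytes)
{
	pthread_mutex_lock(&io_lock);
	inflight -= bytes;
	if (inflight <= lo_water)
		pthread_cond_broadcast(&io_drained);
	pthread_mutex_unlock(&io_lock);
}

/*
 * Analogue of waitrunningbufspace(): called before issuing another
 * async write; blocks while too much I/O is already in transit.
 */
static void
io_wait_room(void)
{
	pthread_mutex_lock(&io_lock);
	while (inflight > hi_water)
		pthread_cond_wait(&io_drained, &io_lock);
	pthread_mutex_unlock(&io_lock);
}

The gap between the two watermarks is the point of the design: a writer released at lo_water can queue roughly hi_water - lo_water bytes before stalling again, so the system oscillates between the marks instead of thrashing a wakeup on every completion.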
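The rewritten buf_daemon loop has a correspondingly simple shape: flush until numdirtybuffers reaches lodirtybuffers, pacing against in-flight I/O and waking blocked writers early at the lo/hi midpoint, then sleep a full interval at low water or only briefly when nothing was flushable. A minimal sketch of just that control flow, assuming hypothetical stubs (dirty_count, flush_one, wait_running_drain, wake_dirty_waiters, sleep_ticks) in place of the kernel facilities:

#include <stdbool.h>

/* Hypothetical stand-ins for numdirtybuffers, flushbufqueues(),
 * waitrunningbufspace(), numdirtywakeup(), and tsleep(&bd_request, ...). */
extern int  dirty_count(void);
extern bool flush_one(void);
extern void wait_running_drain(void);
extern void wake_dirty_waiters(int level);
extern void sleep_ticks(int ticks);

enum { LODIRTY = 64, HIDIRTY = 256, HZ = 100 };	/* illustrative values */

void
buf_daemon_loop(void)
{
	for (;;) {
		/*
		 * Flush down to the low watermark, but let in-flight I/O
		 * drain between flushes and wake writer processes at the
		 * midpoint so they run in parallel with the daemon.
		 */
		while (dirty_count() > LODIRTY) {
			if (!flush_one())
				break;
			wait_running_drain();
			wake_dirty_waiters((LODIRTY + HIDIRTY) / 2);
		}

		if (dirty_count() <= LODIRTY)
			sleep_ticks(HZ);	/* low water reached: idle */
		else
			sleep_ticks(HZ / 2);	/* nothing flushable: retry */
	}
}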