author     dillon <dillon@FreeBSD.org>  2000-12-26 19:41:38 +0000
committer  dillon <dillon@FreeBSD.org>  2000-12-26 19:41:38 +0000
commit     fd223545d4ce7c8c6fe4896ce1eb916f587f77a8 (patch)
tree       8bc9147cc365625dec8071f12bd60d2119d819e4 /sys
parent     a042274eabc95cdbaadcbde28ce1b8bdbb79d6f8 (diff)
download   FreeBSD-src-fd223545d4ce7c8c6fe4896ce1eb916f587f77a8.zip
           FreeBSD-src-fd223545d4ce7c8c6fe4896ce1eb916f587f77a8.tar.gz
This implements a better launder limiting solution. There was a solution
in 4.2-REL which I ripped out in -stable and -current when implementing the
low-memory handling solution. However, maxlaunder turns out to be the saving
grace in certain very heavily loaded systems (e.g. newsreader box). The new
algorithm limits the number of pages laundered in the first pageout daemon
pass. If that is not sufficient then successive passes will be run without
any limit.

Write I/O is now pipelined using two sysctls, vfs.lorunningspace and
vfs.hirunningspace. This prevents excessive buffered writes in the disk
queues which cause long (multi-second) delays for reads. It leads to more
stable (less jerky) and generally faster I/O streaming to disk by allowing
required read ops (e.g. for indirect blocks and such) to occur without
interrupting the write stream, among other things.

NOTE: eventually, filesystem write I/O pipelining needs to be done on a
per-device basis. At the moment it is globalized.
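The write pipelining described above is a high/low watermark throttle on the
number of bytes of write I/O in flight. The following is a simplified
userland sketch of that idea only, not the kernel code from this commit:
runningbufspace, lorunningspace and hirunningspace mirror the counter and
sysctls the patch introduces, while write_start(), write_done() and the
pthread locking are illustrative stand-ins for the kernel's bwrite()/bufdone()
paths and its splbio()/tsleep()/wakeup() synchronization.

/*
 * Simplified model of the vfs.lorunningspace / vfs.hirunningspace
 * throttle.  An async writer stalls once in-flight write bytes exceed
 * the high watermark and is woken only after completions drain the
 * total back to the low watermark, which keeps the device queues short
 * enough that reads are not delayed for multiple seconds.
 */
#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;
static long runningbufspace;                    /* async write bytes in flight */
static long lorunningspace = 512 * 1024;        /* resume writers below this */
static long hirunningspace = 1024 * 1024;       /* stall writers above this */

/* Called before issuing an asynchronous write of 'bytes' bytes. */
static void
write_start(long bytes)
{
        pthread_mutex_lock(&lock);
        while (runningbufspace > hirunningspace)        /* the "wdrain" wait */
                pthread_cond_wait(&drained, &lock);
        runningbufspace += bytes;
        pthread_mutex_unlock(&lock);
}

/* Called from I/O completion for a write of 'bytes' bytes. */
static void
write_done(long bytes)
{
        pthread_mutex_lock(&lock);
        runningbufspace -= bytes;
        if (runningbufspace <= lorunningspace)
                pthread_cond_broadcast(&drained);
        pthread_mutex_unlock(&lock);
}

Since the new knobs are plain integer sysctls marked CTLFLAG_RW, on a kernel
with this change they can be inspected or tuned with, e.g.,
sysctl vfs.lorunningspace, sysctl vfs.hirunningspace and sysctl vm.max_launder.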
Diffstat (limited to 'sys')
-rw-r--r--  sys/fs/specfs/spec_vnops.c        2
-rw-r--r--  sys/kern/vfs_bio.c              263
-rw-r--r--  sys/kern/vfs_cluster.c           12
-rw-r--r--  sys/miscfs/specfs/spec_vnops.c    2
-rw-r--r--  sys/sys/buf.h                     2
-rw-r--r--  sys/sys/vnode.h                   1
-rw-r--r--  sys/ufs/ufs/ufs_readwrite.c       4
-rw-r--r--  sys/vm/vm_page.c                  2
-rw-r--r--  sys/vm/vm_page.h                  1
-rw-r--r--  sys/vm/vm_pageout.c             228
-rw-r--r--  sys/vm/vnode_pager.c             39
11 files changed, 304 insertions, 252 deletions
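As a reading aid for the vm_pageout.c hunks below, here is a rough,
self-contained model of the new laundering policy. It is not the commit's
code: apart from the vm_max_launder default of 32 and the effectively
unlimited second pass, every name here (struct page, inactive_page_action(),
pageout_scan() and its simplified arguments) is a placeholder for
illustration.

#include <stdbool.h>

static int vm_max_launder = 32;         /* sysctl vm.max_launder */

struct page {
        bool dirty;
        bool deferred;                  /* stands in for PG_WINATCFLS */
};

/*
 * Handle one inactive page.  Returns true if it was freed/cached or a
 * flush was started (i.e. it counted against the shortage), false if it
 * was only requeued for a later pass.
 */
static bool
inactive_page_action(struct page *p, int pass, int *maxlaunder)
{
        if (!p->dirty)
                return (true);                  /* clean: free or cache it */
        if (pass == 0 && !p->deferred) {
                p->deferred = true;             /* defer the first flush */
                return (false);
        }
        if (*maxlaunder > 0) {
                --*maxlaunder;                  /* start the pageout write */
                return (true);
        }
        return (false);                         /* launder budget exhausted */
}

static void
pageout_scan(struct page *queue, int npages, int pass, int page_shortage)
{
        /* Pass 0: bounded flushing; later passes: effectively unlimited. */
        int maxlaunder = (pass == 0) ? vm_max_launder : 10000;
        int i;

        for (i = 0; i < npages && page_shortage > 0; i++) {
                if (inactive_page_action(&queue[i], pass, &maxlaunder))
                        --page_shortage;
        }
}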
diff --git a/sys/fs/specfs/spec_vnops.c b/sys/fs/specfs/spec_vnops.c
index 582bece..f3d7f11 100644
--- a/sys/fs/specfs/spec_vnops.c
+++ b/sys/fs/specfs/spec_vnops.c
@@ -684,6 +684,8 @@ spec_getpages(ap)
bp->b_bcount = size;
bp->b_bufsize = size;
bp->b_resid = 0;
+ bp->b_runningbufspace = bp->b_bufsize;
+ runningbufspace += bp->b_runningbufspace;
cnt.v_vnodein++;
cnt.v_vnodepgsin += pcount;
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 21d447d..9949813 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -85,22 +85,24 @@ static void buf_daemon __P((void));
* but the code is intricate enough already.
*/
vm_page_t bogus_page;
-int runningbufspace;
int vmiodirenable = FALSE;
+int runningbufspace;
static vm_offset_t bogus_offset;
-static int bufspace, maxbufspace,
+static int bufspace, maxbufspace,
bufmallocspace, maxbufmallocspace, lobufspace, hibufspace;
static int bufreusecnt, bufdefragcnt, buffreekvacnt;
-static int maxbdrun;
static int needsbuffer;
-static int numdirtybuffers, hidirtybuffers;
+static int lorunningspace, hirunningspace, runningbufreq;
+static int numdirtybuffers, lodirtybuffers, hidirtybuffers;
static int numfreebuffers, lofreebuffers, hifreebuffers;
static int getnewbufcalls;
static int getnewbufrestarts;
SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD,
&numdirtybuffers, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW,
+ &lodirtybuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW,
&hidirtybuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD,
@@ -111,6 +113,10 @@ SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW,
&hifreebuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD,
&runningbufspace, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW,
+ &lorunningspace, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW,
+ &hirunningspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD,
&maxbufspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD,
@@ -119,8 +125,6 @@ SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD,
&lobufspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
&bufspace, 0, "");
-SYSCTL_INT(_vfs, OID_AUTO, maxbdrun, CTLFLAG_RW,
- &maxbdrun, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW,
&maxbufmallocspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD,
@@ -170,9 +174,9 @@ bufhash(struct vnode *vnp, daddr_t bn)
*/
static __inline void
-numdirtywakeup(void)
+numdirtywakeup(int level)
{
- if (numdirtybuffers < hidirtybuffers) {
+ if (numdirtybuffers <= level) {
if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) {
needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH;
wakeup(&needsbuffer);
@@ -204,6 +208,23 @@ bufspacewakeup(void)
}
/*
+ * runningbufwakeup() - in-progress I/O accounting.
+ *
+ */
+static __inline void
+runningbufwakeup(struct buf *bp)
+{
+ if (bp->b_runningbufspace) {
+ runningbufspace -= bp->b_runningbufspace;
+ bp->b_runningbufspace = 0;
+ if (runningbufreq && runningbufspace <= lorunningspace) {
+ runningbufreq = 0;
+ wakeup(&runningbufreq);
+ }
+ }
+}
+
+/*
* bufcountwakeup:
*
* Called when a buffer has been added to one of the free queues to
@@ -225,6 +246,31 @@ bufcountwakeup(void)
}
/*
+ * waitrunningbufspace()
+ *
+ * runningbufspace is a measure of the amount of I/O currently
+ * running. This routine is used in async-write situations to
+ * prevent creating huge backups of pending writes to a device.
+ * Only asynchronous writes are governed by this function.
+ *
+ * Reads will adjust runningbufspace, but will not block based on it.
+ * The read load has a side effect of reducing the allowed write load.
+ *
+ * This does NOT turn an async write into a sync write. It waits
+ * for earlier writes to complete and generally returns before the
+ * caller's write has reached the device.
+ */
+static __inline void
+waitrunningbufspace(void)
+{
+ while (runningbufspace > hirunningspace) {
+ ++runningbufreq;
+ tsleep(&runningbufreq, PVM, "wdrain", 0);
+ }
+}
+
+
+/*
* vfs_buf_test_cache:
*
* Called when a buffer is extended. This function clears the B_CACHE
@@ -248,7 +294,7 @@ static __inline__
void
bd_wakeup(int dirtybuflevel)
{
- if (numdirtybuffers >= dirtybuflevel && bd_request == 0) {
+ if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) {
bd_request = 1;
wakeup(&bd_request);
}
@@ -330,6 +376,9 @@ bufinit(void)
hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10);
lobufspace = hibufspace - MAXBSIZE;
+ lorunningspace = 512 * 1024;
+ hirunningspace = 1024 * 1024;
+
/*
* Limit the amount of malloc memory since it is wired permanently into
* the kernel space. Even though this is accounted for in the buffer
@@ -354,6 +403,7 @@ bufinit(void)
while (hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
hidirtybuffers >>= 1;
}
+ lodirtybuffers = hidirtybuffers / 2;
/*
* Try to keep the number of free buffers in the specified range,
@@ -370,8 +420,6 @@ bufinit(void)
* based on the number of bytes of I/O in-transit that were initiated
* from buf_daemon.
*/
- if ((maxbdrun = nswbuf / 4) < 4)
- maxbdrun = 4;
bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
bogus_page = vm_page_alloc(kernel_object,
@@ -419,7 +467,6 @@ bremfree(struct buf * bp)
KASSERT(BUF_REFCNT(bp) == 1, ("bremfree: bp %p not locked",bp));
TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
bp->b_qindex = QUEUE_NONE;
- runningbufspace += bp->b_bufsize;
} else {
if (BUF_REFCNT(bp) <= 1)
panic("bremfree: removing a buffer not on a queue");
@@ -659,6 +706,13 @@ bwrite(struct buf * bp)
int rtval = bufwait(bp);
brelse(bp);
return (rtval);
+ } else {
+ /*
+ * don't allow the async write to saturate the I/O
+ * system. There is no chance of deadlock here because
+ * we are blocking on I/O that is already in-progress.
+ */
+ waitrunningbufspace();
}
return (0);
@@ -774,11 +828,11 @@ bdwrite(struct buf * bp)
bqrelse(bp);
/*
- * Wakeup the buffer flushing daemon if we have saturated the
- * buffer cache.
+ * Wakeup the buffer flushing daemon if we have a lot of dirty
+ * buffers (midpoint between our recovery point and our stall
+ * point).
*/
-
- bd_wakeup(hidirtybuffers);
+ bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
/*
* note: we cannot initiate I/O from a bdwrite even if we wanted to,
@@ -817,7 +871,7 @@ bdirty(bp)
bp->b_flags |= B_DONE | B_DELWRI;
reassignbuf(bp, bp->b_vp);
++numdirtybuffers;
- bd_wakeup(hidirtybuffers);
+ bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
}
}
@@ -843,7 +897,7 @@ bundirty(bp)
bp->b_flags &= ~B_DELWRI;
reassignbuf(bp, bp->b_vp);
--numdirtybuffers;
- numdirtywakeup();
+ numdirtywakeup(lodirtybuffers);
}
/*
* Since it is now being written, we can clear its deferred write flag.
@@ -896,14 +950,12 @@ bowrite(struct buf * bp)
void
bwillwrite(void)
{
- int slop = hidirtybuffers / 10;
-
- if (numdirtybuffers > hidirtybuffers + slop) {
+ if (numdirtybuffers >= hidirtybuffers) {
int s;
s = splbio();
- while (numdirtybuffers > hidirtybuffers) {
- bd_wakeup(hidirtybuffers);
+ while (numdirtybuffers >= hidirtybuffers) {
+ bd_wakeup(1);
needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
tsleep(&needsbuffer, (PRIBIO + 4), "flswai", 0);
}
@@ -963,7 +1015,7 @@ brelse(struct buf * bp)
buf_deallocate(bp);
if (bp->b_flags & B_DELWRI) {
--numdirtybuffers;
- numdirtywakeup();
+ numdirtywakeup(lodirtybuffers);
}
bp->b_flags &= ~(B_DELWRI | B_CACHE);
if ((bp->b_flags & B_VMIO) == 0) {
@@ -1169,11 +1221,9 @@ brelse(struct buf * bp)
if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI)) {
bp->b_flags &= ~B_DELWRI;
--numdirtybuffers;
- numdirtywakeup();
+ numdirtywakeup(lodirtybuffers);
}
- runningbufspace -= bp->b_bufsize;
-
/*
* Fixup numfreebuffers count. The bp is on an appropriate queue
* unless locked. We then bump numfreebuffers if it is not B_DELWRI.
@@ -1248,8 +1298,6 @@ bqrelse(struct buf * bp)
TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
}
- runningbufspace -= bp->b_bufsize;
-
if ((bp->b_flags & B_LOCKED) == 0 &&
((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))) {
bufcountwakeup();
@@ -1309,13 +1357,13 @@ vfs_vmio_release(bp)
}
}
}
- runningbufspace -= bp->b_bufsize;
splx(s);
pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
- if (bp->b_bufsize)
+ if (bp->b_bufsize) {
bufspacewakeup();
+ bp->b_bufsize = 0;
+ }
bp->b_npages = 0;
- bp->b_bufsize = 0;
bp->b_flags &= ~B_VMIO;
if (bp->b_vp)
brelvp(bp);
@@ -1723,27 +1771,6 @@ restart:
return(bp);
}
-#if 0
-/*
- * waitfreebuffers:
- *
- * Wait for sufficient free buffers. Only called from normal processes.
- */
-
-static void
-waitfreebuffers(int slpflag, int slptimeo)
-{
- while (numfreebuffers < hifreebuffers) {
- if (numfreebuffers >= hifreebuffers)
- break;
- needsbuffer |= VFS_BIO_NEED_FREE;
- if (tsleep(&needsbuffer, (PRIBIO + 4)|slpflag, "biofre", slptimeo))
- break;
- }
-}
-
-#endif
-
/*
* buf_daemon:
*
@@ -1753,9 +1780,6 @@ waitfreebuffers(int slpflag, int slptimeo)
*/
static struct proc *bufdaemonproc;
-static int bd_interval;
-static int bd_flushto;
-static int bd_flushinc;
static struct kproc_desc buf_kp = {
"bufdaemon",
@@ -1783,65 +1807,50 @@ buf_daemon()
curproc->p_flag |= P_BUFEXHAUST;
s = splbio();
- bd_interval = 5 * hz; /* dynamically adjusted */
- bd_flushto = hidirtybuffers; /* dynamically adjusted */
- bd_flushinc = 1;
-
for (;;) {
kthread_suspend_check(bufdaemonproc);
bd_request = 0;
/*
- * Do the flush. Limit the number of buffers we flush in one
- * go. The failure condition occurs when processes are writing
- * buffers faster then we can dispose of them. In this case
- * we may be flushing so often that the previous set of flushes
- * have not had time to complete, causing us to run out of
- * physical buffers and block.
+ * Do the flush. Limit the amount of in-transit I/O we
+ * allow to build up, otherwise we would completely saturate
+ * the I/O system. Wakeup any waiting processes before we
+ * normally would so they can run in parallel with our drain.
*/
- {
- int runcount = maxbdrun;
-
- while (numdirtybuffers > bd_flushto && runcount) {
- --runcount;
- if (flushbufqueues() == 0)
- break;
- }
+ while (numdirtybuffers > lodirtybuffers) {
+ if (flushbufqueues() == 0)
+ break;
+ waitrunningbufspace();
+ numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
}
- if (bd_request ||
- tsleep(&bd_request, PVM, "psleep", bd_interval) == 0) {
+ /*
+ * Only clear bd_request if we have reached our low water
+ * mark. The buf_daemon normally waits 5 seconds and
+ * then incrementally flushes any dirty buffers that have
+ * built up, within reason.
+ *
+ * If we were unable to hit our low water mark and couldn't
+ * find any flushable buffers, we sleep half a second.
+ * Otherwise we loop immediately.
+ */
+ if (numdirtybuffers <= lodirtybuffers) {
/*
- * Another request is pending or we were woken up
- * without timing out. Flush more.
+ * We reached our low water mark, reset the
+ * request and sleep until we are needed again.
+ * The sleep is just so the suspend code works.
*/
- --bd_flushto;
- if (bd_flushto >= numdirtybuffers - 5) {
- bd_flushto = numdirtybuffers - 10;
- bd_flushinc = 1;
- }
- if (bd_flushto < 2)
- bd_flushto = 2;
+ bd_request = 0;
+ tsleep(&bd_request, PVM, "psleep", hz);
} else {
/*
- * We slept and timed out, we can slow down.
+ * We couldn't find any flushable dirty buffers but
+ * still have too many dirty buffers, we
+ * have to sleep and try again. (rare)
*/
- bd_flushto += bd_flushinc;
- if (bd_flushto > hidirtybuffers)
- bd_flushto = hidirtybuffers;
- ++bd_flushinc;
- if (bd_flushinc > hidirtybuffers / 20 + 1)
- bd_flushinc = hidirtybuffers / 20 + 1;
+ tsleep(&bd_request, PVM, "qsleep", hz / 2);
}
-
- /*
- * Set the interval on a linear scale based on hidirtybuffers
- * with a maximum frequency of 1/10 second.
- */
- bd_interval = bd_flushto * 5 * hz / hidirtybuffers;
- if (bd_interval < hz / 10)
- bd_interval = hz / 10;
}
}
@@ -2097,21 +2106,11 @@ loop:
*
* XXX remove if 0 sections (clean this up after its proven)
*/
-#if 0
- if (curproc == idleproc || (curproc->p_flag & P_BUFEXHAUST)) {
-#endif
- if (numfreebuffers == 0) {
- if (curproc == idleproc)
- return NULL;
- needsbuffer |= VFS_BIO_NEED_ANY;
- tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf",
- slptimeo);
- }
-#if 0
- } else if (numfreebuffers < lofreebuffers) {
- waitfreebuffers(slpflag, slptimeo);
+ if (numfreebuffers == 0) {
+ if (curproc == idleproc)
+ return NULL;
+ needsbuffer |= VFS_BIO_NEED_ANY;
}
-#endif
if ((bp = gbincore(vp, blkno))) {
/*
@@ -2357,12 +2356,12 @@ allocbuf(struct buf *bp, int size)
bp->b_bcount = size;
} else {
free(bp->b_data, M_BIOBUF);
- bufmallocspace -= bp->b_bufsize;
- runningbufspace -= bp->b_bufsize;
- if (bp->b_bufsize)
+ if (bp->b_bufsize) {
+ bufmallocspace -= bp->b_bufsize;
bufspacewakeup();
+ bp->b_bufsize = 0;
+ }
bp->b_data = bp->b_kvabase;
- bp->b_bufsize = 0;
bp->b_bcount = 0;
bp->b_flags &= ~B_MALLOC;
}
@@ -2389,7 +2388,6 @@ allocbuf(struct buf *bp, int size)
bp->b_bcount = size;
bp->b_flags |= B_MALLOC;
bufmallocspace += mbsize;
- runningbufspace += bp->b_bufsize;
return 1;
}
#endif
@@ -2404,11 +2402,11 @@ allocbuf(struct buf *bp, int size)
origbuf = bp->b_data;
origbufsize = bp->b_bufsize;
bp->b_data = bp->b_kvabase;
- bufmallocspace -= bp->b_bufsize;
- runningbufspace -= bp->b_bufsize;
- if (bp->b_bufsize)
+ if (bp->b_bufsize) {
+ bufmallocspace -= bp->b_bufsize;
bufspacewakeup();
- bp->b_bufsize = 0;
+ bp->b_bufsize = 0;
+ }
bp->b_flags &= ~B_MALLOC;
newbsize = round_page(newbsize);
}
@@ -2601,7 +2599,6 @@ allocbuf(struct buf *bp, int size)
(vm_offset_t)(bp->b_offset & PAGE_MASK));
}
}
- runningbufspace += (newbsize - bp->b_bufsize);
if (newbsize < bp->b_bufsize)
bufspacewakeup();
bp->b_bufsize = newbsize; /* actual buffer allocation */
@@ -2681,6 +2678,7 @@ bufdone(struct buf *bp)
KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
bp->b_flags |= B_DONE;
+ runningbufwakeup(bp);
if (bp->b_iocmd == BIO_DELETE) {
brelse(bp);
@@ -2768,18 +2766,8 @@ bufdone(struct buf *bp)
if (m == bogus_page) {
bogusflag = 1;
m = vm_page_lookup(obj, OFF_TO_IDX(foff));
- if (!m) {
+ if (m == NULL)
panic("biodone: page disappeared!");
-#if defined(VFS_BIO_DEBUG)
- printf("biodone: page disappeared\n");
-#endif
- vm_object_pip_subtract(obj, 1);
- bp->b_flags &= ~B_CACHE;
- foff = (foff + PAGE_SIZE) &
- ~(off_t)PAGE_MASK;
- iosize -= resid;
- continue;
- }
bp->b_pages[i] = m;
pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
}
@@ -2833,6 +2821,7 @@ bufdone(struct buf *bp)
if (obj)
vm_object_pip_wakeupn(obj, 0);
}
+
/*
* For asynchronous completions, release the buffer now. The brelse
* will do a wakeup there if necessary - so no need to do a wakeup
@@ -2860,6 +2849,7 @@ vfs_unbusy_pages(struct buf * bp)
{
int i;
+ runningbufwakeup(bp);
if (bp->b_flags & B_VMIO) {
struct vnode *vp = bp->b_vp;
vm_object_t obj;
@@ -2939,6 +2929,9 @@ vfs_busy_pages(struct buf * bp, int clear_modify)
{
int i, bogus;
+ bp->b_runningbufspace = bp->b_bufsize;
+ runningbufspace += bp->b_runningbufspace;
+
if (bp->b_flags & B_VMIO) {
struct vnode *vp = bp->b_vp;
vm_object_t obj;
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index 29a1879..088dc40 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -247,8 +247,12 @@ single_block_read:
printf("S(%ld,%ld,%d) ",
(long)bp->b_lblkno, bp->b_bcount, seqcount);
#endif
- if ((bp->b_flags & B_CLUSTER) == 0)
+ if ((bp->b_flags & B_CLUSTER) == 0) {
vfs_busy_pages(bp, 0);
+ } else {
+ bp->b_runningbufspace = bp->b_bufsize;
+ runningbufspace += bp->b_runningbufspace;
+ }
bp->b_flags &= ~B_INVAL;
bp->b_ioflags &= ~BIO_ERROR;
if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL)
@@ -283,8 +287,12 @@ single_block_read:
}
#endif
- if ((rbp->b_flags & B_CLUSTER) == 0)
+ if ((rbp->b_flags & B_CLUSTER) == 0) {
vfs_busy_pages(rbp, 0);
+ } else {
+ rbp->b_runningbufspace = rbp->b_bufsize;
+ runningbufspace += rbp->b_runningbufspace;
+ }
rbp->b_flags &= ~B_INVAL;
rbp->b_ioflags &= ~BIO_ERROR;
if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL)
diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c
index 582bece..f3d7f11 100644
--- a/sys/miscfs/specfs/spec_vnops.c
+++ b/sys/miscfs/specfs/spec_vnops.c
@@ -684,6 +684,8 @@ spec_getpages(ap)
bp->b_bcount = size;
bp->b_bufsize = size;
bp->b_resid = 0;
+ bp->b_runningbufspace = bp->b_bufsize;
+ runningbufspace += bp->b_runningbufspace;
cnt.v_vnodein++;
cnt.v_vnodepgsin += pcount;
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index a10083f..223c036 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -110,6 +110,7 @@ struct buf {
unsigned char b_xflags; /* extra flags */
struct lock b_lock; /* Buffer lock */
long b_bufsize; /* Allocated buffer size. */
+ long b_runningbufspace; /* when I/O is running, pipelining */
caddr_t b_kvabase; /* base kva for buffer */
int b_kvasize; /* size of kva for buffer */
daddr_t b_lblkno; /* Logical block number. */
@@ -480,6 +481,7 @@ buf_countdeps(struct buf *bp, int i)
#ifdef _KERNEL
extern int nbuf; /* The number of buffer headers */
+extern int runningbufspace;
extern int buf_maxio; /* nominal maximum I/O for buffer */
extern struct buf *buf; /* The buffer headers. */
extern char *buffers; /* The buffer contents. */
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index 75462f6..2ab6f3f 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -213,6 +213,7 @@ struct vattr {
#define IO_NDELAY 0x10 /* FNDELAY flag set in file table */
#define IO_VMIO 0x20 /* data already in VMIO space */
#define IO_INVAL 0x40 /* invalidate after I/O */
+#define IO_ASYNC 0x80 /* bawrite rather then bdwrite */
/*
* Modes. Some values same as Ixxx entries from inode.h for now.
diff --git a/sys/ufs/ufs/ufs_readwrite.c b/sys/ufs/ufs/ufs_readwrite.c
index e1d775c..62ec9e3 100644
--- a/sys/ufs/ufs/ufs_readwrite.c
+++ b/sys/ufs/ufs/ufs_readwrite.c
@@ -504,7 +504,9 @@ WRITE(ap)
if (ioflag & IO_SYNC) {
(void)bwrite(bp);
- } else if (vm_page_count_severe() || buf_dirty_count_severe()) {
+ } else if (vm_page_count_severe() ||
+ buf_dirty_count_severe() ||
+ (ioflag & IO_ASYNC)) {
bp->b_flags |= B_CLUSTEROK;
bawrite(bp);
} else if (xfersize + blkoffset == fs->fs_bsize) {
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index 1b2db6e..7cbe750 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -1273,6 +1273,7 @@ vm_page_unwire(m, activate)
vm_page_queues[PQ_ACTIVE].lcnt++;
cnt.v_active_count++;
} else {
+ vm_page_flag_clear(m, PG_WINATCFLS);
TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
m->queue = PQ_INACTIVE;
vm_page_queues[PQ_INACTIVE].lcnt++;
@@ -1311,6 +1312,7 @@ _vm_page_deactivate(vm_page_t m, int athead)
if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
if ((m->queue - m->pc) == PQ_CACHE)
cnt.v_reactivated++;
+ vm_page_flag_clear(m, PG_WINATCFLS);
vm_page_unqueue(m);
if (athead)
TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index 4c31df9..dc8290e 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -242,6 +242,7 @@ extern struct vpgqueues vm_page_queues[PQ_COUNT];
*/
#define PG_BUSY 0x0001 /* page is in transit (O) */
#define PG_WANTED 0x0002 /* someone is waiting for page (O) */
+#define PG_WINATCFLS 0x0004 /* flush dirty page on inactive q */
#define PG_FICTITIOUS 0x0008 /* physical page doesn't exist (O) */
#define PG_WRITEABLE 0x0010 /* page is mapped writeable */
#define PG_MAPPED 0x0020 /* page is mapped */
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index dbea3d6..943fb11 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -106,7 +106,7 @@
/* the kernel process "vm_pageout"*/
static void vm_pageout __P((void));
static int vm_pageout_clean __P((vm_page_t));
-static int vm_pageout_scan __P((void));
+static void vm_pageout_scan __P((int pass));
static int vm_pageout_free_page_calc __P((vm_size_t count));
struct proc *pageproc;
@@ -140,14 +140,13 @@ static int vm_pageout_req_swapout; /* XXX */
static int vm_daemon_needed;
#endif
extern int vm_swap_size;
+static int vm_max_launder = 32;
static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0;
static int vm_pageout_full_stats_interval = 0;
-static int vm_pageout_stats_free_max=0, vm_pageout_algorithm_lru=0;
+static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0;
static int defer_swap_pageouts=0;
static int disable_swap_pageouts=0;
-static int max_page_launder=100;
-static int vm_pageout_actcmp=0;
#if defined(NO_SWAPPING)
static int vm_swap_enabled=0;
static int vm_swap_idle_enabled=0;
@@ -157,7 +156,10 @@ static int vm_swap_idle_enabled=0;
#endif
SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, pageout_algorithm,
- CTLFLAG_RW, &vm_pageout_algorithm_lru, 0, "LRU page mgmt");
+ CTLFLAG_RW, &vm_pageout_algorithm, 0, "LRU page mgmt");
+
+SYSCTL_INT(_vm, OID_AUTO, max_launder,
+ CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");
@@ -189,12 +191,6 @@ SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
-SYSCTL_INT(_vm, OID_AUTO, max_page_launder,
- CTLFLAG_RW, &max_page_launder, 0, "Maximum number of pages to clean per pass");
-SYSCTL_INT(_vm, OID_AUTO, vm_pageout_actcmp,
- CTLFLAG_RD, &vm_pageout_actcmp, 0, "pagedaemon agressiveness");
-
-
#define VM_PAGEOUT_PAGE_COUNT 16
int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;
@@ -509,7 +505,7 @@ vm_pageout_object_deactivate_pages(map, object, desired, map_remove_only)
} else if (p->queue == PQ_ACTIVE) {
if ((p->flags & PG_REFERENCED) == 0) {
p->act_count -= min(p->act_count, ACT_DECLINE);
- if (!remove_mode && (vm_pageout_algorithm_lru || (p->act_count == 0))) {
+ if (!remove_mode && (vm_pageout_algorithm || (p->act_count == 0))) {
vm_page_protect(p, VM_PROT_NONE);
vm_page_deactivate(p);
} else {
@@ -627,20 +623,21 @@ vm_pageout_page_free(vm_page_t m) {
/*
* vm_pageout_scan does the dirty work for the pageout daemon.
*/
-static int
-vm_pageout_scan()
+static void
+vm_pageout_scan(int pass)
{
vm_page_t m, next;
struct vm_page marker;
+ int save_page_shortage;
+ int save_inactive_count;
int page_shortage, maxscan, pcount;
int addl_page_shortage, addl_page_shortage_init;
- int maxlaunder;
struct proc *p, *bigproc;
vm_offset_t size, bigsize;
vm_object_t object;
- int force_wakeup = 0;
int actcount;
int vnodes_skipped = 0;
+ int maxlaunder;
int s;
/*
@@ -651,27 +648,13 @@ vm_pageout_scan()
addl_page_shortage_init = vm_pageout_deficit;
vm_pageout_deficit = 0;
- if (max_page_launder == 0)
- max_page_launder = 1;
-
/*
* Calculate the number of pages we want to either free or move
- * to the cache. Be more agressive if we aren't making our target.
+ * to the cache.
*/
-
- page_shortage = vm_paging_target() +
- addl_page_shortage_init + vm_pageout_actcmp;
-
- /*
- * Figure out how agressively we should flush dirty pages.
- */
- {
- int factor = vm_pageout_actcmp;
-
- maxlaunder = cnt.v_inactive_target / 3 + factor;
- if (maxlaunder > max_page_launder + factor)
- maxlaunder = max_page_launder + factor;
- }
+ page_shortage = vm_paging_target() + addl_page_shortage_init;
+ save_page_shortage = page_shortage;
+ save_inactive_count = cnt.v_inactive_count;
/*
* Initialize our marker
@@ -687,8 +670,22 @@ vm_pageout_scan()
* we have scanned the entire inactive queue. Note that m->act_count
* is not used to form decisions for the inactive queue, only for the
* active queue.
+ *
+ * maxlaunder limits the number of dirty pages we flush per scan.
+ * For most systems a smaller value (16 or 32) is more robust under
+ * extreme memory and disk pressure because any unnecessary writes
+ * to disk can result in extreme performance degredation. However,
+ * systems with excessive dirty pages (especially when MAP_NOSYNC is
+ * used) will die horribly with limited laundering. If the pageout
+ * daemon cannot clean enough pages in the first pass, we let it go
+ * all out in succeeding passes.
*/
+ if ((maxlaunder = vm_max_launder) <= 1)
+ maxlaunder = 1;
+ if (pass)
+ maxlaunder = 10000;
+
rescan0:
addl_page_shortage = addl_page_shortage_init;
maxscan = cnt.v_inactive_count;
@@ -792,12 +789,32 @@ rescan0:
} else if (m->dirty == 0) {
vm_page_cache(m);
--page_shortage;
-
- /*
- * Dirty pages need to be paged out. Note that we clean
- * only a limited number of pages per pagedaemon pass.
- */
+ } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
+ /*
+ * Dirty pages need to be paged out, but flushing
+ * a page is extremely expensive verses freeing
+ * a clean page. Rather then artificially limiting
+ * the number of pages we can flush, we instead give
+ * dirty pages extra priority on the inactive queue
+ * by forcing them to be cycled through the queue
+ * twice before being flushed, after which the
+ * (now clean) page will cycle through once more
+ * before being freed. This significantly extends
+ * the thrash point for a heavily loaded machine.
+ */
+ s = splvm();
+ vm_page_flag_set(m, PG_WINATCFLS);
+ TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
+ TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
+ splx(s);
} else if (maxlaunder > 0) {
+ /*
+ * We always want to try to flush some dirty pages if
+ * we encounter them, to keep the system stable.
+ * Normally this number is small, but under extreme
+ * pressure where there are insufficient clean pages
+ * on the inactive queue, we may have to go all out.
+ */
int swap_pageouts_ok;
struct vnode *vp = NULL;
struct mount *mp;
@@ -826,29 +843,24 @@ rescan0:
}
/*
- * Presumably we have sufficient free memory to do
- * the more sophisticated checks and locking required
- * for vnodes.
- *
- * The object is already known NOT to be dead. The
- * vget() may still block, though, because
- * VOP_ISLOCKED() doesn't check to see if an inode
- * (v_data) is associated with the vnode. If it isn't,
- * vget() will load in it from disk. Worse, vget()
- * may actually get stuck waiting on "inode" if another
- * process is in the process of bringing the inode in.
- * This is bad news for us either way.
+ * The object is already known NOT to be dead. It
+ * is possible for the vget() to block the whole
+ * pageout daemon, but the new low-memory handling
+ * code should prevent it.
*
- * So for the moment we check v_data == NULL as a
- * workaround. This means that vnodes which do not
- * use v_data in the way we expect probably will not
- * wind up being paged out by the pager and it will be
- * up to the syncer to get them. That's better then
- * us blocking here.
+ * The previous code skipped locked vnodes and, worse,
+ * reordered pages in the queue. This results in
+ * completely non-deterministic operation and, on a
+ * busy system, can lead to extremely non-optimal
+ * pageouts. For example, it can cause clean pages
+ * to be freed and dirty pages to be moved to the end
+ * of the queue. Since dirty pages are also moved to
+ * the end of the queue once-cleaned, this gives
+ * way too large a weighting to defering the freeing
+ * of dirty pages.
*
- * This whole code section is bogus - we need to fix
- * the vnode pager to handle vm_page_t's without us
- * having to do any sophisticated VOP tests.
+ * XXX we need to be able to apply a timeout to the
+ * vget() lock attempt.
*/
if (object->type == OBJT_VNODE) {
@@ -857,19 +869,8 @@ rescan0:
mp = NULL;
if (vp->v_type == VREG)
vn_start_write(vp, &mp, V_NOWAIT);
- if (VOP_ISLOCKED(vp, NULL) ||
- vp->v_data == NULL ||
- vget(vp, LK_EXCLUSIVE|LK_NOOBJ, curproc)) {
+ if (vget(vp, LK_EXCLUSIVE|LK_NOOBJ, curproc)) {
vn_finished_write(mp);
- if ((m->queue == PQ_INACTIVE) &&
- (m->hold_count == 0) &&
- (m->busy == 0) &&
- (m->flags & PG_BUSY) == 0) {
- s = splvm();
- TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
- TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
- splx(s);
- }
if (object->flags & OBJ_MIGHTBEDIRTY)
vnodes_skipped++;
continue;
@@ -924,18 +925,23 @@ rescan0:
* If a page is dirty, then it is either being washed
* (but not yet cleaned) or it is still in the
* laundry. If it is still in the laundry, then we
- * start the cleaning operation. maxlaunder nominally
- * counts I/O cost (seeks) rather then bytes.
+ * start the cleaning operation.
*
* This operation may cluster, invalidating the 'next'
* pointer. To prevent an inordinate number of
* restarts we use our marker to remember our place.
+ *
+ * decrement page_shortage on success to account for
+ * the (future) cleaned page. Otherwise we could wind
+ * up laundering or cleaning too many pages.
*/
s = splvm();
TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE].pl, m, &marker, pageq);
splx(s);
- if (vm_pageout_clean(m) != 0)
+ if (vm_pageout_clean(m) != 0) {
+ --page_shortage;
--maxlaunder;
+ }
s = splvm();
next = TAILQ_NEXT(&marker, pageq);
TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, &marker, pageq);
@@ -948,28 +954,12 @@ rescan0:
}
/*
- * If we were not able to meet our target, increase actcmp
- */
-
- if (vm_page_count_min()) {
- if (vm_pageout_actcmp < ACT_MAX / 2)
- vm_pageout_actcmp += ACT_ADVANCE;
- } else {
- if (vm_pageout_actcmp < ACT_DECLINE)
- vm_pageout_actcmp = 0;
- else
- vm_pageout_actcmp -= ACT_DECLINE;
- }
-
- /*
* Compute the number of pages we want to try to move from the
* active queue to the inactive queue.
*/
-
page_shortage = vm_paging_target() +
cnt.v_inactive_target - cnt.v_inactive_count;
page_shortage += addl_page_shortage;
- page_shortage += vm_pageout_actcmp;
/*
* Scan the active queue for things we can deactivate. We nominally
@@ -1043,9 +1033,9 @@ rescan0:
splx(s);
} else {
m->act_count -= min(m->act_count, ACT_DECLINE);
- if (vm_pageout_algorithm_lru ||
- (m->object->ref_count == 0) ||
- (m->act_count <= vm_pageout_actcmp)) {
+ if (vm_pageout_algorithm ||
+ m->object->ref_count == 0 ||
+ m->act_count == 0) {
page_shortage--;
if (m->object->ref_count == 0) {
vm_page_protect(m, VM_PROT_NONE);
@@ -1175,7 +1165,6 @@ rescan0:
wakeup(&cnt.v_free_count);
}
}
- return force_wakeup;
}
/*
@@ -1254,11 +1243,13 @@ vm_pageout_page_stats()
} else {
if (m->act_count == 0) {
/*
- * We turn off page access, so that we have more accurate
- * RSS stats. We don't do this in the normal page deactivation
- * when the system is loaded VM wise, because the cost of
- * the large number of page protect operations would be higher
- * than the value of doing the operation.
+ * We turn off page access, so that we have
+ * more accurate RSS stats. We don't do this
+ * in the normal page deactivation when the
+ * system is loaded VM wise, because the
+ * cost of the large number of page protect
+ * operations would be higher than the value
+ * of doing the operation.
*/
vm_page_protect(m, VM_PROT_NONE);
vm_page_deactivate(m);
@@ -1307,6 +1298,7 @@ vm_size_t count;
static void
vm_pageout()
{
+ int pass;
mtx_enter(&Giant, MTX_DEF);
@@ -1320,11 +1312,18 @@ vm_pageout()
vm_pageout_free_page_calc(cnt.v_page_count);
/*
- * free_reserved needs to include enough for the largest swap pager
- * structures plus enough for any pv_entry structs when paging.
+ * v_free_target and v_cache_min control pageout hysteresis. Note
+ * that these are more a measure of the VM cache queue hysteresis
+ * then the VM free queue. Specifically, v_free_target is the
+ * high water mark (free+cache pages).
+ *
+ * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
+ * low water mark, while v_free_min is the stop. v_cache_min must
+ * be big enough to handle memory needs while the pageout daemon
+ * is signalled and run to free more pages.
*/
if (cnt.v_free_count > 6144)
- cnt.v_free_target = 3 * cnt.v_free_min + cnt.v_free_reserved;
+ cnt.v_free_target = 4 * cnt.v_free_min + cnt.v_free_reserved;
else
cnt.v_free_target = 2 * cnt.v_free_min + cnt.v_free_reserved;
@@ -1362,10 +1361,9 @@ vm_pageout()
if (vm_pageout_stats_free_max == 0)
vm_pageout_stats_free_max = 5;
- max_page_launder = (cnt.v_page_count > 1800 ? 32 : 16);
-
curproc->p_flag |= P_BUFEXHAUST;
swap_pager_swap_init();
+ pass = 0;
/*
* The pageout daemon is never done, so loop forever.
*/
@@ -1386,19 +1384,27 @@ vm_pageout()
}
if (vm_pages_needed) {
/*
- * Still not done, sleep a bit and go again
+ * Still not done, take a second pass without waiting
+ * (unlimited dirty cleaning), otherwise sleep a bit
+ * and try again.
*/
- tsleep(&vm_pages_needed, PVM, "psleep", hz/2);
+ ++pass;
+ if (pass > 1)
+ tsleep(&vm_pages_needed, PVM, "psleep", hz/2);
} else {
/*
- * Good enough, sleep & handle stats
+ * Good enough, sleep & handle stats. Prime the pass
+ * for the next run.
*/
+ if (pass > 1)
+ pass = 1;
+ else
+ pass = 0;
error = tsleep(&vm_pages_needed,
PVM, "psleep", vm_pageout_stats_interval * hz);
if (error && !vm_pages_needed) {
- if (vm_pageout_actcmp > 0)
- --vm_pageout_actcmp;
splx(s);
+ pass = 0;
vm_pageout_page_stats();
continue;
}
@@ -1407,7 +1413,7 @@ vm_pageout()
if (vm_pages_needed)
cnt.v_pdwakeups++;
splx(s);
- vm_pageout_scan();
+ vm_pageout_scan(pass);
vm_pageout_deficit = 0;
}
}
diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c
index 3dd12ec..c79f62a 100644
--- a/sys/vm/vnode_pager.c
+++ b/sys/vm/vnode_pager.c
@@ -300,10 +300,29 @@ vnode_pager_setsize(vp, nsize)
m = vm_page_lookup(object, OFF_TO_IDX(nsize));
if (m) {
+ int base = (int)nsize & PAGE_MASK;
+ int size = PAGE_SIZE - base;
+
+ /*
+ * Clear out partial-page garbage in case
+ * the page has been mapped.
+ */
kva = vm_pager_map_page(m);
- bzero((caddr_t) kva + (nsize & PAGE_MASK),
- (int) (round_page(nsize) - nsize));
+ bzero((caddr_t)kva + base, size);
vm_pager_unmap_page(kva);
+
+ /*
+ * Clear out partial-page dirty bits. This
+ * has the side effect of setting the valid
+ * bits, but that is ok. There are a bunch
+ * of places in the VM system where we expected
+ * m->dirty == VM_PAGE_BITS_ALL. The file EOF
+ * case is one of them. If the page is still
+ * partially dirty, make it fully dirty.
+ */
+ vm_page_set_validclean(m, base, size);
+ if (m->dirty != 0)
+ m->dirty = VM_PAGE_BITS_ALL;
}
}
}
@@ -424,6 +443,8 @@ vnode_pager_input_smlfs(object, m)
pbgetvp(dp, bp);
bp->b_bcount = bsize;
bp->b_bufsize = bsize;
+ bp->b_runningbufspace = bp->b_bufsize;
+ runningbufspace += bp->b_runningbufspace;
/* do the input */
BUF_STRATEGY(bp);
@@ -742,6 +763,8 @@ vnode_pager_generic_getpages(vp, m, bytecount, reqpage)
pbgetvp(dp, bp);
bp->b_bcount = size;
bp->b_bufsize = size;
+ bp->b_runningbufspace = bp->b_bufsize;
+ runningbufspace += bp->b_runningbufspace;
cnt.v_vnodein++;
cnt.v_vnodepgsin += count;
@@ -888,6 +911,11 @@ vnode_pager_putpages(object, m, count, sync, rtvals)
/*
* This is now called from local media FS's to operate against their
* own vnodes if they fail to implement VOP_PUTPAGES.
+ *
+ * This is typically called indirectly via the pageout daemon and
+ * clustering has already typically occured, so in general we ask the
+ * underlying filesystem to write the data out asynchronously rather
+ * then delayed.
*/
int
vnode_pager_generic_putpages(vp, m, bytecount, flags, rtvals)
@@ -938,8 +966,13 @@ vnode_pager_generic_putpages(vp, m, bytecount, flags, rtvals)
}
}
+ /*
+ * pageouts are already clustered, use IO_ASYNC to force a bawrite()
+ * rather then a bdwrite() to prevent paging I/O from saturating
+ * the buffer cache.
+ */
ioflags = IO_VMIO;
- ioflags |= (flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL)) ? IO_SYNC: 0;
+ ioflags |= (flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL)) ? IO_SYNC: IO_ASYNC;
ioflags |= (flags & VM_PAGER_PUT_INVAL) ? IO_INVAL: 0;
aiov.iov_base = (caddr_t) 0;