Diffstat:
-rw-r--r--  sys/fs/specfs/spec_vnops.c          2
-rw-r--r--  sys/kern/vfs_bio.c                263
-rw-r--r--  sys/kern/vfs_cluster.c             12
-rw-r--r--  sys/miscfs/specfs/spec_vnops.c      2
-rw-r--r--  sys/sys/buf.h                       2
-rw-r--r--  sys/sys/vnode.h                     1
-rw-r--r--  sys/ufs/ufs/ufs_readwrite.c         4
-rw-r--r--  sys/vm/vm_page.c                    2
-rw-r--r--  sys/vm/vm_page.h                    1
-rw-r--r--  sys/vm/vm_pageout.c               228
-rw-r--r--  sys/vm/vnode_pager.c               39
11 files changed, 304 insertions, 252 deletions
diff --git a/sys/fs/specfs/spec_vnops.c b/sys/fs/specfs/spec_vnops.c
index 582bece..f3d7f11 100644
--- a/sys/fs/specfs/spec_vnops.c
+++ b/sys/fs/specfs/spec_vnops.c
@@ -684,6 +684,8 @@ spec_getpages(ap)
bp->b_bcount = size;
bp->b_bufsize = size;
bp->b_resid = 0;
+ bp->b_runningbufspace = bp->b_bufsize;
+ runningbufspace += bp->b_runningbufspace;
cnt.v_vnodein++;
cnt.v_vnodepgsin += pcount;
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 21d447d..9949813 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -85,22 +85,24 @@ static void buf_daemon __P((void));
* but the code is intricate enough already.
*/
vm_page_t bogus_page;
-int runningbufspace;
int vmiodirenable = FALSE;
+int runningbufspace;
static vm_offset_t bogus_offset;
-static int bufspace, maxbufspace,
+static int bufspace, maxbufspace,
bufmallocspace, maxbufmallocspace, lobufspace, hibufspace;
static int bufreusecnt, bufdefragcnt, buffreekvacnt;
-static int maxbdrun;
static int needsbuffer;
-static int numdirtybuffers, hidirtybuffers;
+static int lorunningspace, hirunningspace, runningbufreq;
+static int numdirtybuffers, lodirtybuffers, hidirtybuffers;
static int numfreebuffers, lofreebuffers, hifreebuffers;
static int getnewbufcalls;
static int getnewbufrestarts;
SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD,
&numdirtybuffers, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW,
+ &lodirtybuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW,
&hidirtybuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD,
@@ -111,6 +113,10 @@ SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW,
&hifreebuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD,
&runningbufspace, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW,
+ &lorunningspace, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW,
+ &hirunningspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD,
&maxbufspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD,
@@ -119,8 +125,6 @@ SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD,
&lobufspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
&bufspace, 0, "");
-SYSCTL_INT(_vfs, OID_AUTO, maxbdrun, CTLFLAG_RW,
- &maxbdrun, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW,
&maxbufmallocspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD,
@@ -170,9 +174,9 @@ bufhash(struct vnode *vnp, daddr_t bn)
*/
static __inline void
-numdirtywakeup(void)
+numdirtywakeup(int level)
{
- if (numdirtybuffers < hidirtybuffers) {
+ if (numdirtybuffers <= level) {
if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) {
needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH;
wakeup(&needsbuffer);
@@ -204,6 +208,23 @@ bufspacewakeup(void)
}
/*
+ * runningbufwakeup() - in-progress I/O accounting.
+ *
+ */
+static __inline void
+runningbufwakeup(struct buf *bp)
+{
+ if (bp->b_runningbufspace) {
+ runningbufspace -= bp->b_runningbufspace;
+ bp->b_runningbufspace = 0;
+ if (runningbufreq && runningbufspace <= lorunningspace) {
+ runningbufreq = 0;
+ wakeup(&runningbufreq);
+ }
+ }
+}
+
+/*
* bufcountwakeup:
*
* Called when a buffer has been added to one of the free queues to
@@ -225,6 +246,31 @@ bufcountwakeup(void)
}
/*
+ * waitrunningbufspace()
+ *
+ * runningbufspace is a measure of the amount of I/O currently
+ * running. This routine is used in async-write situations to
+ * prevent creating huge backups of pending writes to a device.
+ * Only asynchronous writes are governed by this function.
+ *
+ * Reads will adjust runningbufspace, but will not block based on it.
+ * The read load has a side effect of reducing the allowed write load.
+ *
+ * This does NOT turn an async write into a sync write. It waits
+ * for earlier writes to complete and generally returns before the
+ * caller's write has reached the device.
+ */
+static __inline void
+waitrunningbufspace(void)
+{
+ while (runningbufspace > hirunningspace) {
+ ++runningbufreq;
+ tsleep(&runningbufreq, PVM, "wdrain", 0);
+ }
+}
+
+
+/*
* vfs_buf_test_cache:
*
* Called when a buffer is extended. This function clears the B_CACHE
@@ -248,7 +294,7 @@ static __inline__
void
bd_wakeup(int dirtybuflevel)
{
- if (numdirtybuffers >= dirtybuflevel && bd_request == 0) {
+ if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) {
bd_request = 1;
wakeup(&bd_request);
}
@@ -330,6 +376,9 @@ bufinit(void)
hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10);
lobufspace = hibufspace - MAXBSIZE;
+ lorunningspace = 512 * 1024;
+ hirunningspace = 1024 * 1024;
+
/*
* Limit the amount of malloc memory since it is wired permanently into
* the kernel space. Even though this is accounted for in the buffer
@@ -354,6 +403,7 @@ bufinit(void)
while (hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
hidirtybuffers >>= 1;
}
+ lodirtybuffers = hidirtybuffers / 2;
/*
* Try to keep the number of free buffers in the specified range,
@@ -370,8 +420,6 @@ bufinit(void)
* based on the number of bytes of I/O in-transit that were initiated
* from buf_daemon.
*/
- if ((maxbdrun = nswbuf / 4) < 4)
- maxbdrun = 4;
bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
bogus_page = vm_page_alloc(kernel_object,
@@ -419,7 +467,6 @@ bremfree(struct buf * bp)
KASSERT(BUF_REFCNT(bp) == 1, ("bremfree: bp %p not locked",bp));
TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
bp->b_qindex = QUEUE_NONE;
- runningbufspace += bp->b_bufsize;
} else {
if (BUF_REFCNT(bp) <= 1)
panic("bremfree: removing a buffer not on a queue");
@@ -659,6 +706,13 @@ bwrite(struct buf * bp)
int rtval = bufwait(bp);
brelse(bp);
return (rtval);
+ } else {
+ /*
+ * don't allow the async write to saturate the I/O
+ * system. There is no chance of deadlock here because
+ * we are blocking on I/O that is already in-progress.
+ */
+ waitrunningbufspace();
}
return (0);
@@ -774,11 +828,11 @@ bdwrite(struct buf * bp)
bqrelse(bp);
/*
- * Wakeup the buffer flushing daemon if we have saturated the
- * buffer cache.
+ * Wakeup the buffer flushing daemon if we have a lot of dirty
+ * buffers (midpoint between our recovery point and our stall
+ * point).
*/
-
- bd_wakeup(hidirtybuffers);
+ bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
/*
* note: we cannot initiate I/O from a bdwrite even if we wanted to,
@@ -817,7 +871,7 @@ bdirty(bp)
bp->b_flags |= B_DONE | B_DELWRI;
reassignbuf(bp, bp->b_vp);
++numdirtybuffers;
- bd_wakeup(hidirtybuffers);
+ bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
}
}
@@ -843,7 +897,7 @@ bundirty(bp)
bp->b_flags &= ~B_DELWRI;
reassignbuf(bp, bp->b_vp);
--numdirtybuffers;
- numdirtywakeup();
+ numdirtywakeup(lodirtybuffers);
}
/*
* Since it is now being written, we can clear its deferred write flag.
@@ -896,14 +950,12 @@ bowrite(struct buf * bp)
void
bwillwrite(void)
{
- int slop = hidirtybuffers / 10;
-
- if (numdirtybuffers > hidirtybuffers + slop) {
+ if (numdirtybuffers >= hidirtybuffers) {
int s;
s = splbio();
- while (numdirtybuffers > hidirtybuffers) {
- bd_wakeup(hidirtybuffers);
+ while (numdirtybuffers >= hidirtybuffers) {
+ bd_wakeup(1);
needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
tsleep(&needsbuffer, (PRIBIO + 4), "flswai", 0);
}
@@ -963,7 +1015,7 @@ brelse(struct buf * bp)
buf_deallocate(bp);
if (bp->b_flags & B_DELWRI) {
--numdirtybuffers;
- numdirtywakeup();
+ numdirtywakeup(lodirtybuffers);
}
bp->b_flags &= ~(B_DELWRI | B_CACHE);
if ((bp->b_flags & B_VMIO) == 0) {
@@ -1169,11 +1221,9 @@ brelse(struct buf * bp)
if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI)) {
bp->b_flags &= ~B_DELWRI;
--numdirtybuffers;
- numdirtywakeup();
+ numdirtywakeup(lodirtybuffers);
}
- runningbufspace -= bp->b_bufsize;
-
/*
* Fixup numfreebuffers count. The bp is on an appropriate queue
* unless locked. We then bump numfreebuffers if it is not B_DELWRI.
@@ -1248,8 +1298,6 @@ bqrelse(struct buf * bp)
TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
}
- runningbufspace -= bp->b_bufsize;
-
if ((bp->b_flags & B_LOCKED) == 0 &&
((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))) {
bufcountwakeup();
@@ -1309,13 +1357,13 @@ vfs_vmio_release(bp)
}
}
}
- runningbufspace -= bp->b_bufsize;
splx(s);
pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
- if (bp->b_bufsize)
+ if (bp->b_bufsize) {
bufspacewakeup();
+ bp->b_bufsize = 0;
+ }
bp->b_npages = 0;
- bp->b_bufsize = 0;
bp->b_flags &= ~B_VMIO;
if (bp->b_vp)
brelvp(bp);
@@ -1723,27 +1771,6 @@ restart:
return(bp);
}
-#if 0
-/*
- * waitfreebuffers:
- *
- * Wait for sufficient free buffers. Only called from normal processes.
- */
-
-static void
-waitfreebuffers(int slpflag, int slptimeo)
-{
- while (numfreebuffers < hifreebuffers) {
- if (numfreebuffers >= hifreebuffers)
- break;
- needsbuffer |= VFS_BIO_NEED_FREE;
- if (tsleep(&needsbuffer, (PRIBIO + 4)|slpflag, "biofre", slptimeo))
- break;
- }
-}
-
-#endif
-
/*
* buf_daemon:
*
@@ -1753,9 +1780,6 @@ waitfreebuffers(int slpflag, int slptimeo)
*/
static struct proc *bufdaemonproc;
-static int bd_interval;
-static int bd_flushto;
-static int bd_flushinc;
static struct kproc_desc buf_kp = {
"bufdaemon",
@@ -1783,65 +1807,50 @@ buf_daemon()
curproc->p_flag |= P_BUFEXHAUST;
s = splbio();
- bd_interval = 5 * hz; /* dynamically adjusted */
- bd_flushto = hidirtybuffers; /* dynamically adjusted */
- bd_flushinc = 1;
-
for (;;) {
kthread_suspend_check(bufdaemonproc);
bd_request = 0;
/*
- * Do the flush. Limit the number of buffers we flush in one
- * go. The failure condition occurs when processes are writing
- * buffers faster then we can dispose of them. In this case
- * we may be flushing so often that the previous set of flushes
- * have not had time to complete, causing us to run out of
- * physical buffers and block.
+ * Do the flush. Limit the amount of in-transit I/O we
+ * allow to build up, otherwise we would completely saturate
+ * the I/O system. Wakeup any waiting processes before we
+ * normally would so they can run in parallel with our drain.
*/
- {
- int runcount = maxbdrun;
-
- while (numdirtybuffers > bd_flushto && runcount) {
- --runcount;
- if (flushbufqueues() == 0)
- break;
- }
+ while (numdirtybuffers > lodirtybuffers) {
+ if (flushbufqueues() == 0)
+ break;
+ waitrunningbufspace();
+ numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
}
- if (bd_request ||
- tsleep(&bd_request, PVM, "psleep", bd_interval) == 0) {
+ /*
+ * Only clear bd_request if we have reached our low water
+ * mark. The buf_daemon normally waits 5 seconds and
+ * then incrementally flushes any dirty buffers that have
+ * built up, within reason.
+ *
+ * If we were unable to hit our low water mark and couldn't
+ * find any flushable buffers, we sleep half a second.
+ * Otherwise we loop immediately.
+ */
+ if (numdirtybuffers <= lodirtybuffers) {
/*
- * Another request is pending or we were woken up
- * without timing out. Flush more.
+ * We reached our low water mark, reset the
+ * request and sleep until we are needed again.
+ * The sleep is just so the suspend code works.
*/
- --bd_flushto;
- if (bd_flushto >= numdirtybuffers - 5) {
- bd_flushto = numdirtybuffers - 10;
- bd_flushinc = 1;
- }
- if (bd_flushto < 2)
- bd_flushto = 2;
+ bd_request = 0;
+ tsleep(&bd_request, PVM, "psleep", hz);
} else {
/*
- * We slept and timed out, we can slow down.
+ * We couldn't find any flushable dirty buffers but
+ * still have too many dirty buffers, we
+ * have to sleep and try again. (rare)
*/
- bd_flushto += bd_flushinc;
- if (bd_flushto > hidirtybuffers)
- bd_flushto = hidirtybuffers;
- ++bd_flushinc;
- if (bd_flushinc > hidirtybuffers / 20 + 1)
- bd_flushinc = hidirtybuffers / 20 + 1;
+ tsleep(&bd_request, PVM, "qsleep", hz / 2);
}
-
- /*
- * Set the interval on a linear scale based on hidirtybuffers
- * with a maximum frequency of 1/10 second.
- */
- bd_interval = bd_flushto * 5 * hz / hidirtybuffers;
- if (bd_interval < hz / 10)
- bd_interval = hz / 10;
}
}
@@ -2097,21 +2106,11 @@ loop:
*
* XXX remove if 0 sections (clean this up after its proven)
*/
-#if 0
- if (curproc == idleproc || (curproc->p_flag & P_BUFEXHAUST)) {
-#endif
- if (numfreebuffers == 0) {
- if (curproc == idleproc)
- return NULL;
- needsbuffer |= VFS_BIO_NEED_ANY;
- tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf",
- slptimeo);
- }
-#if 0
- } else if (numfreebuffers < lofreebuffers) {
- waitfreebuffers(slpflag, slptimeo);
+ if (numfreebuffers == 0) {
+ if (curproc == idleproc)
+ return NULL;
+ needsbuffer |= VFS_BIO_NEED_ANY;
}
-#endif
if ((bp = gbincore(vp, blkno))) {
/*
@@ -2357,12 +2356,12 @@ allocbuf(struct buf *bp, int size)
bp->b_bcount = size;
} else {
free(bp->b_data, M_BIOBUF);
- bufmallocspace -= bp->b_bufsize;
- runningbufspace -= bp->b_bufsize;
- if (bp->b_bufsize)
+ if (bp->b_bufsize) {
+ bufmallocspace -= bp->b_bufsize;
bufspacewakeup();
+ bp->b_bufsize = 0;
+ }
bp->b_data = bp->b_kvabase;
- bp->b_bufsize = 0;
bp->b_bcount = 0;
bp->b_flags &= ~B_MALLOC;
}
@@ -2389,7 +2388,6 @@ allocbuf(struct buf *bp, int size)
bp->b_bcount = size;
bp->b_flags |= B_MALLOC;
bufmallocspace += mbsize;
- runningbufspace += bp->b_bufsize;
return 1;
}
#endif
@@ -2404,11 +2402,11 @@ allocbuf(struct buf *bp, int size)
origbuf = bp->b_data;
origbufsize = bp->b_bufsize;
bp->b_data = bp->b_kvabase;
- bufmallocspace -= bp->b_bufsize;
- runningbufspace -= bp->b_bufsize;
- if (bp->b_bufsize)
+ if (bp->b_bufsize) {
+ bufmallocspace -= bp->b_bufsize;
bufspacewakeup();
- bp->b_bufsize = 0;
+ bp->b_bufsize = 0;
+ }
bp->b_flags &= ~B_MALLOC;
newbsize = round_page(newbsize);
}
@@ -2601,7 +2599,6 @@ allocbuf(struct buf *bp, int size)
(vm_offset_t)(bp->b_offset & PAGE_MASK));
}
}
- runningbufspace += (newbsize - bp->b_bufsize);
if (newbsize < bp->b_bufsize)
bufspacewakeup();
bp->b_bufsize = newbsize; /* actual buffer allocation */
@@ -2681,6 +2678,7 @@ bufdone(struct buf *bp)
KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
bp->b_flags |= B_DONE;
+ runningbufwakeup(bp);
if (bp->b_iocmd == BIO_DELETE) {
brelse(bp);
@@ -2768,18 +2766,8 @@ bufdone(struct buf *bp)
if (m == bogus_page) {
bogusflag = 1;
m = vm_page_lookup(obj, OFF_TO_IDX(foff));
- if (!m) {
+ if (m == NULL)
panic("biodone: page disappeared!");
-#if defined(VFS_BIO_DEBUG)
- printf("biodone: page disappeared\n");
-#endif
- vm_object_pip_subtract(obj, 1);
- bp->b_flags &= ~B_CACHE;
- foff = (foff + PAGE_SIZE) &
- ~(off_t)PAGE_MASK;
- iosize -= resid;
- continue;
- }
bp->b_pages[i] = m;
pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
}
@@ -2833,6 +2821,7 @@ bufdone(struct buf *bp)
if (obj)
vm_object_pip_wakeupn(obj, 0);
}
+
/*
* For asynchronous completions, release the buffer now. The brelse
* will do a wakeup there if necessary - so no need to do a wakeup
@@ -2860,6 +2849,7 @@ vfs_unbusy_pages(struct buf * bp)
{
int i;
+ runningbufwakeup(bp);
if (bp->b_flags & B_VMIO) {
struct vnode *vp = bp->b_vp;
vm_object_t obj;
@@ -2939,6 +2929,9 @@ vfs_busy_pages(struct buf * bp, int clear_modify)
{
int i, bogus;
+ bp->b_runningbufspace = bp->b_bufsize;
+ runningbufspace += bp->b_runningbufspace;
+
if (bp->b_flags & B_VMIO) {
struct vnode *vp = bp->b_vp;
vm_object_t obj;
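
The vfs_bio.c changes above boil down to a write-pipelining hysteresis: vfs_busy_pages() (and the pager/cluster paths) charge b_bufsize to runningbufspace when I/O is started, bufdone()/vfs_unbusy_pages() credit it back through runningbufwakeup(), and the async branch of bwrite() stalls in waitrunningbufspace() while more than hirunningspace bytes are in flight, getting woken once the total drains back to lorunningspace. The following is a minimal userland sketch of that hysteresis only, using pthreads in place of tsleep()/wakeup(), dropping the runningbufreq optimization, and with invented names (io_start, io_done, wait_running); it is illustrative, not kernel code.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;

static long runningbufspace;                    /* bytes of I/O in flight */
static const long lorunningspace = 512 * 1024;  /* wake writers below this */
static const long hirunningspace = 1024 * 1024; /* stall writers above this */

/* vfs_busy_pages() analogue: charge the buffer when its I/O is started. */
static void
io_start(long size)
{
        pthread_mutex_lock(&lk);
        runningbufspace += size;
        pthread_mutex_unlock(&lk);
}

/* runningbufwakeup() analogue: credit on completion and wake stalled
 * writers once we have drained to the low water mark. */
static void
io_done(long size)
{
        pthread_mutex_lock(&lk);
        runningbufspace -= size;
        if (runningbufspace <= lorunningspace)
                pthread_cond_broadcast(&drained);
        pthread_mutex_unlock(&lk);
}

/* waitrunningbufspace() analogue: async writers block here, so a burst of
 * bawrite()s cannot queue an unbounded amount of I/O ahead of the disk. */
static void
wait_running(void)
{
        pthread_mutex_lock(&lk);
        while (runningbufspace > hirunningspace)
                pthread_cond_wait(&drained, &lk);
        pthread_mutex_unlock(&lk);
}

int
main(void)
{
        io_start(256 * 1024);
        wait_running();         /* returns at once: still under hirunningspace */
        io_done(256 * 1024);
        printf("in flight: %ld bytes\n", runningbufspace);
        return (0);
}
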
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index 29a1879..088dc40 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -247,8 +247,12 @@ single_block_read:
printf("S(%ld,%ld,%d) ",
(long)bp->b_lblkno, bp->b_bcount, seqcount);
#endif
- if ((bp->b_flags & B_CLUSTER) == 0)
+ if ((bp->b_flags & B_CLUSTER) == 0) {
vfs_busy_pages(bp, 0);
+ } else {
+ bp->b_runningbufspace = bp->b_bufsize;
+ runningbufspace += bp->b_runningbufspace;
+ }
bp->b_flags &= ~B_INVAL;
bp->b_ioflags &= ~BIO_ERROR;
if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL)
@@ -283,8 +287,12 @@ single_block_read:
}
#endif
- if ((rbp->b_flags & B_CLUSTER) == 0)
+ if ((rbp->b_flags & B_CLUSTER) == 0) {
vfs_busy_pages(rbp, 0);
+ } else {
+ rbp->b_runningbufspace = rbp->b_bufsize;
+ runningbufspace += rbp->b_runningbufspace;
+ }
rbp->b_flags &= ~B_INVAL;
rbp->b_ioflags &= ~BIO_ERROR;
if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL)
diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c
index 582bece..f3d7f11 100644
--- a/sys/miscfs/specfs/spec_vnops.c
+++ b/sys/miscfs/specfs/spec_vnops.c
@@ -684,6 +684,8 @@ spec_getpages(ap)
bp->b_bcount = size;
bp->b_bufsize = size;
bp->b_resid = 0;
+ bp->b_runningbufspace = bp->b_bufsize;
+ runningbufspace += bp->b_runningbufspace;
cnt.v_vnodein++;
cnt.v_vnodepgsin += pcount;
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index a10083f..223c036 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -110,6 +110,7 @@ struct buf {
unsigned char b_xflags; /* extra flags */
struct lock b_lock; /* Buffer lock */
long b_bufsize; /* Allocated buffer size. */
+ long b_runningbufspace; /* when I/O is running, pipelining */
caddr_t b_kvabase; /* base kva for buffer */
int b_kvasize; /* size of kva for buffer */
daddr_t b_lblkno; /* Logical block number. */
@@ -480,6 +481,7 @@ buf_countdeps(struct buf *bp, int i)
#ifdef _KERNEL
extern int nbuf; /* The number of buffer headers */
+extern int runningbufspace;
extern int buf_maxio; /* nominal maximum I/O for buffer */
extern struct buf *buf; /* The buffer headers. */
extern char *buffers; /* The buffer contents. */
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index 75462f6..2ab6f3f 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -213,6 +213,7 @@ struct vattr {
#define IO_NDELAY 0x10 /* FNDELAY flag set in file table */
#define IO_VMIO 0x20 /* data already in VMIO space */
#define IO_INVAL 0x40 /* invalidate after I/O */
+#define IO_ASYNC 0x80 /* bawrite rather than bdwrite */
/*
* Modes. Some values same as Ixxx entries from inode.h for now.
diff --git a/sys/ufs/ufs/ufs_readwrite.c b/sys/ufs/ufs/ufs_readwrite.c
index e1d775c..62ec9e3 100644
--- a/sys/ufs/ufs/ufs_readwrite.c
+++ b/sys/ufs/ufs/ufs_readwrite.c
@@ -504,7 +504,9 @@ WRITE(ap)
if (ioflag & IO_SYNC) {
(void)bwrite(bp);
- } else if (vm_page_count_severe() || buf_dirty_count_severe()) {
+ } else if (vm_page_count_severe() ||
+ buf_dirty_count_severe() ||
+ (ioflag & IO_ASYNC)) {
bp->b_flags |= B_CLUSTEROK;
bawrite(bp);
} else if (xfersize + blkoffset == fs->fs_bsize) {
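
Combined with the IO_ASYNC flag added to vnode.h above and set by the pager in vnode_pager.c below, this hunk changes which write primitive the UFS write path ends up using. A compressed sketch of the resulting decision ladder follows; choose_write(), the enum, and the severe_pressure/full_block parameters are invented stand-ins (the real code calls vm_page_count_severe()/buf_dirty_count_severe() and compares against fs_bsize), and IO_SYNC's value is taken from sys/vnode.h of this vintage.

#include <stdio.h>

#define IO_SYNC         0x04    /* do I/O synchronously */
#define IO_ASYNC        0x80    /* bawrite rather than bdwrite */

enum writeop { W_BWRITE, W_BAWRITE, W_CLUSTER, W_BDWRITE };

/*
 * Mirror of the ladder at the end of the UFS WRITE() routine: synchronous
 * requests are written and waited for, pressure-driven or IO_ASYNC requests
 * are started immediately without waiting, full blocks are handed to the
 * clustering code, and everything else is simply marked delayed-write.
 */
static enum writeop
choose_write(int ioflag, int severe_pressure, int full_block)
{
        if (ioflag & IO_SYNC)
                return (W_BWRITE);      /* bwrite(): write now, wait for it */
        if (severe_pressure || (ioflag & IO_ASYNC))
                return (W_BAWRITE);     /* bawrite(): write now, don't wait */
        if (full_block)
                return (W_CLUSTER);     /* cluster_write() a full block */
        return (W_BDWRITE);             /* bdwrite(): just mark it dirty */
}

int
main(void)
{
        /* a pageout-initiated write: the vnode pager now passes IO_ASYNC */
        printf("%s\n", choose_write(IO_ASYNC, 0, 1) == W_BAWRITE ?
            "bawrite" : "other");
        return (0);
}
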
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index 1b2db6e..7cbe750 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -1273,6 +1273,7 @@ vm_page_unwire(m, activate)
vm_page_queues[PQ_ACTIVE].lcnt++;
cnt.v_active_count++;
} else {
+ vm_page_flag_clear(m, PG_WINATCFLS);
TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
m->queue = PQ_INACTIVE;
vm_page_queues[PQ_INACTIVE].lcnt++;
@@ -1311,6 +1312,7 @@ _vm_page_deactivate(vm_page_t m, int athead)
if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
if ((m->queue - m->pc) == PQ_CACHE)
cnt.v_reactivated++;
+ vm_page_flag_clear(m, PG_WINATCFLS);
vm_page_unqueue(m);
if (athead)
TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index 4c31df9..dc8290e 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -242,6 +242,7 @@ extern struct vpgqueues vm_page_queues[PQ_COUNT];
*/
#define PG_BUSY 0x0001 /* page is in transit (O) */
#define PG_WANTED 0x0002 /* someone is waiting for page (O) */
+#define PG_WINATCFLS 0x0004 /* flush dirty page on inactive q */
#define PG_FICTITIOUS 0x0008 /* physical page doesn't exist (O) */
#define PG_WRITEABLE 0x0010 /* page is mapped writeable */
#define PG_MAPPED 0x0020 /* page is mapped */
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index dbea3d6..943fb11 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -106,7 +106,7 @@
/* the kernel process "vm_pageout"*/
static void vm_pageout __P((void));
static int vm_pageout_clean __P((vm_page_t));
-static int vm_pageout_scan __P((void));
+static void vm_pageout_scan __P((int pass));
static int vm_pageout_free_page_calc __P((vm_size_t count));
struct proc *pageproc;
@@ -140,14 +140,13 @@ static int vm_pageout_req_swapout; /* XXX */
static int vm_daemon_needed;
#endif
extern int vm_swap_size;
+static int vm_max_launder = 32;
static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0;
static int vm_pageout_full_stats_interval = 0;
-static int vm_pageout_stats_free_max=0, vm_pageout_algorithm_lru=0;
+static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0;
static int defer_swap_pageouts=0;
static int disable_swap_pageouts=0;
-static int max_page_launder=100;
-static int vm_pageout_actcmp=0;
#if defined(NO_SWAPPING)
static int vm_swap_enabled=0;
static int vm_swap_idle_enabled=0;
@@ -157,7 +156,10 @@ static int vm_swap_idle_enabled=0;
#endif
SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, pageout_algorithm,
- CTLFLAG_RW, &vm_pageout_algorithm_lru, 0, "LRU page mgmt");
+ CTLFLAG_RW, &vm_pageout_algorithm, 0, "LRU page mgmt");
+
+SYSCTL_INT(_vm, OID_AUTO, max_launder,
+ CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");
@@ -189,12 +191,6 @@ SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
-SYSCTL_INT(_vm, OID_AUTO, max_page_launder,
- CTLFLAG_RW, &max_page_launder, 0, "Maximum number of pages to clean per pass");
-SYSCTL_INT(_vm, OID_AUTO, vm_pageout_actcmp,
- CTLFLAG_RD, &vm_pageout_actcmp, 0, "pagedaemon agressiveness");
-
-
#define VM_PAGEOUT_PAGE_COUNT 16
int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;
@@ -509,7 +505,7 @@ vm_pageout_object_deactivate_pages(map, object, desired, map_remove_only)
} else if (p->queue == PQ_ACTIVE) {
if ((p->flags & PG_REFERENCED) == 0) {
p->act_count -= min(p->act_count, ACT_DECLINE);
- if (!remove_mode && (vm_pageout_algorithm_lru || (p->act_count == 0))) {
+ if (!remove_mode && (vm_pageout_algorithm || (p->act_count == 0))) {
vm_page_protect(p, VM_PROT_NONE);
vm_page_deactivate(p);
} else {
@@ -627,20 +623,21 @@ vm_pageout_page_free(vm_page_t m) {
/*
* vm_pageout_scan does the dirty work for the pageout daemon.
*/
-static int
-vm_pageout_scan()
+static void
+vm_pageout_scan(int pass)
{
vm_page_t m, next;
struct vm_page marker;
+ int save_page_shortage;
+ int save_inactive_count;
int page_shortage, maxscan, pcount;
int addl_page_shortage, addl_page_shortage_init;
- int maxlaunder;
struct proc *p, *bigproc;
vm_offset_t size, bigsize;
vm_object_t object;
- int force_wakeup = 0;
int actcount;
int vnodes_skipped = 0;
+ int maxlaunder;
int s;
/*
@@ -651,27 +648,13 @@ vm_pageout_scan()
addl_page_shortage_init = vm_pageout_deficit;
vm_pageout_deficit = 0;
- if (max_page_launder == 0)
- max_page_launder = 1;
-
/*
* Calculate the number of pages we want to either free or move
- * to the cache. Be more agressive if we aren't making our target.
+ * to the cache.
*/
-
- page_shortage = vm_paging_target() +
- addl_page_shortage_init + vm_pageout_actcmp;
-
- /*
- * Figure out how agressively we should flush dirty pages.
- */
- {
- int factor = vm_pageout_actcmp;
-
- maxlaunder = cnt.v_inactive_target / 3 + factor;
- if (maxlaunder > max_page_launder + factor)
- maxlaunder = max_page_launder + factor;
- }
+ page_shortage = vm_paging_target() + addl_page_shortage_init;
+ save_page_shortage = page_shortage;
+ save_inactive_count = cnt.v_inactive_count;
/*
* Initialize our marker
@@ -687,8 +670,22 @@ vm_pageout_scan()
* we have scanned the entire inactive queue. Note that m->act_count
* is not used to form decisions for the inactive queue, only for the
* active queue.
+ *
+ * maxlaunder limits the number of dirty pages we flush per scan.
+ * For most systems a smaller value (16 or 32) is more robust under
+ * extreme memory and disk pressure because any unnecessary writes
+ * to disk can result in extreme performance degradation. However,
+ * systems with excessive dirty pages (especially when MAP_NOSYNC is
+ * used) will die horribly with limited laundering. If the pageout
+ * daemon cannot clean enough pages in the first pass, we let it go
+ * all out in succeeding passes.
*/
+ if ((maxlaunder = vm_max_launder) <= 1)
+ maxlaunder = 1;
+ if (pass)
+ maxlaunder = 10000;
+
rescan0:
addl_page_shortage = addl_page_shortage_init;
maxscan = cnt.v_inactive_count;
@@ -792,12 +789,32 @@ rescan0:
} else if (m->dirty == 0) {
vm_page_cache(m);
--page_shortage;
-
- /*
- * Dirty pages need to be paged out. Note that we clean
- * only a limited number of pages per pagedaemon pass.
- */
+ } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
+ /*
+ * Dirty pages need to be paged out, but flushing
+ * a page is extremely expensive versus freeing
+ * a clean page. Rather than artificially limiting
+ * the number of pages we can flush, we instead give
+ * dirty pages extra priority on the inactive queue
+ * by forcing them to be cycled through the queue
+ * twice before being flushed, after which the
+ * (now clean) page will cycle through once more
+ * before being freed. This significantly extends
+ * the thrash point for a heavily loaded machine.
+ */
+ s = splvm();
+ vm_page_flag_set(m, PG_WINATCFLS);
+ TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
+ TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
+ splx(s);
} else if (maxlaunder > 0) {
+ /*
+ * We always want to try to flush some dirty pages if
+ * we encounter them, to keep the system stable.
+ * Normally this number is small, but under extreme
+ * pressure where there are insufficient clean pages
+ * on the inactive queue, we may have to go all out.
+ */
int swap_pageouts_ok;
struct vnode *vp = NULL;
struct mount *mp;
@@ -826,29 +843,24 @@ rescan0:
}
/*
- * Presumably we have sufficient free memory to do
- * the more sophisticated checks and locking required
- * for vnodes.
- *
- * The object is already known NOT to be dead. The
- * vget() may still block, though, because
- * VOP_ISLOCKED() doesn't check to see if an inode
- * (v_data) is associated with the vnode. If it isn't,
- * vget() will load in it from disk. Worse, vget()
- * may actually get stuck waiting on "inode" if another
- * process is in the process of bringing the inode in.
- * This is bad news for us either way.
+ * The object is already known NOT to be dead. It
+ * is possible for the vget() to block the whole
+ * pageout daemon, but the new low-memory handling
+ * code should prevent it.
*
- * So for the moment we check v_data == NULL as a
- * workaround. This means that vnodes which do not
- * use v_data in the way we expect probably will not
- * wind up being paged out by the pager and it will be
- * up to the syncer to get them. That's better then
- * us blocking here.
+ * The previous code skipped locked vnodes and, worse,
+ * reordered pages in the queue. This results in
+ * completely non-deterministic operation and, on a
+ * busy system, can lead to extremely non-optimal
+ * pageouts. For example, it can cause clean pages
+ * to be freed and dirty pages to be moved to the end
+ * of the queue. Since dirty pages are also moved to
+ * the end of the queue once-cleaned, this gives
+ * way too large a weighting to deferring the freeing
+ * of dirty pages.
*
- * This whole code section is bogus - we need to fix
- * the vnode pager to handle vm_page_t's without us
- * having to do any sophisticated VOP tests.
+ * XXX we need to be able to apply a timeout to the
+ * vget() lock attempt.
*/
if (object->type == OBJT_VNODE) {
@@ -857,19 +869,8 @@ rescan0:
mp = NULL;
if (vp->v_type == VREG)
vn_start_write(vp, &mp, V_NOWAIT);
- if (VOP_ISLOCKED(vp, NULL) ||
- vp->v_data == NULL ||
- vget(vp, LK_EXCLUSIVE|LK_NOOBJ, curproc)) {
+ if (vget(vp, LK_EXCLUSIVE|LK_NOOBJ, curproc)) {
vn_finished_write(mp);
- if ((m->queue == PQ_INACTIVE) &&
- (m->hold_count == 0) &&
- (m->busy == 0) &&
- (m->flags & PG_BUSY) == 0) {
- s = splvm();
- TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
- TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
- splx(s);
- }
if (object->flags & OBJ_MIGHTBEDIRTY)
vnodes_skipped++;
continue;
@@ -924,18 +925,23 @@ rescan0:
* If a page is dirty, then it is either being washed
* (but not yet cleaned) or it is still in the
* laundry. If it is still in the laundry, then we
- * start the cleaning operation. maxlaunder nominally
- * counts I/O cost (seeks) rather then bytes.
+ * start the cleaning operation.
*
* This operation may cluster, invalidating the 'next'
* pointer. To prevent an inordinate number of
* restarts we use our marker to remember our place.
+ *
+ * decrement page_shortage on success to account for
+ * the (future) cleaned page. Otherwise we could wind
+ * up laundering or cleaning too many pages.
*/
s = splvm();
TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE].pl, m, &marker, pageq);
splx(s);
- if (vm_pageout_clean(m) != 0)
+ if (vm_pageout_clean(m) != 0) {
+ --page_shortage;
--maxlaunder;
+ }
s = splvm();
next = TAILQ_NEXT(&marker, pageq);
TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, &marker, pageq);
@@ -948,28 +954,12 @@ rescan0:
}
/*
- * If we were not able to meet our target, increase actcmp
- */
-
- if (vm_page_count_min()) {
- if (vm_pageout_actcmp < ACT_MAX / 2)
- vm_pageout_actcmp += ACT_ADVANCE;
- } else {
- if (vm_pageout_actcmp < ACT_DECLINE)
- vm_pageout_actcmp = 0;
- else
- vm_pageout_actcmp -= ACT_DECLINE;
- }
-
- /*
* Compute the number of pages we want to try to move from the
* active queue to the inactive queue.
*/
-
page_shortage = vm_paging_target() +
cnt.v_inactive_target - cnt.v_inactive_count;
page_shortage += addl_page_shortage;
- page_shortage += vm_pageout_actcmp;
/*
* Scan the active queue for things we can deactivate. We nominally
@@ -1043,9 +1033,9 @@ rescan0:
splx(s);
} else {
m->act_count -= min(m->act_count, ACT_DECLINE);
- if (vm_pageout_algorithm_lru ||
- (m->object->ref_count == 0) ||
- (m->act_count <= vm_pageout_actcmp)) {
+ if (vm_pageout_algorithm ||
+ m->object->ref_count == 0 ||
+ m->act_count == 0) {
page_shortage--;
if (m->object->ref_count == 0) {
vm_page_protect(m, VM_PROT_NONE);
@@ -1175,7 +1165,6 @@ rescan0:
wakeup(&cnt.v_free_count);
}
}
- return force_wakeup;
}
/*
@@ -1254,11 +1243,13 @@ vm_pageout_page_stats()
} else {
if (m->act_count == 0) {
/*
- * We turn off page access, so that we have more accurate
- * RSS stats. We don't do this in the normal page deactivation
- * when the system is loaded VM wise, because the cost of
- * the large number of page protect operations would be higher
- * than the value of doing the operation.
+ * We turn off page access, so that we have
+ * more accurate RSS stats. We don't do this
+ * in the normal page deactivation when the
+ * system is loaded VM wise, because the
+ * cost of the large number of page protect
+ * operations would be higher than the value
+ * of doing the operation.
*/
vm_page_protect(m, VM_PROT_NONE);
vm_page_deactivate(m);
@@ -1307,6 +1298,7 @@ vm_size_t count;
static void
vm_pageout()
{
+ int pass;
mtx_enter(&Giant, MTX_DEF);
@@ -1320,11 +1312,18 @@ vm_pageout()
vm_pageout_free_page_calc(cnt.v_page_count);
/*
- * free_reserved needs to include enough for the largest swap pager
- * structures plus enough for any pv_entry structs when paging.
+ * v_free_target and v_cache_min control pageout hysteresis. Note
+ * that these are more a measure of the VM cache queue hysteresis
+ * than the VM free queue. Specifically, v_free_target is the
+ * high water mark (free+cache pages).
+ *
+ * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
+ * low water mark, while v_free_min is the stop. v_cache_min must
+ * be big enough to handle memory needs while the pageout daemon
+ * is signalled and run to free more pages.
*/
if (cnt.v_free_count > 6144)
- cnt.v_free_target = 3 * cnt.v_free_min + cnt.v_free_reserved;
+ cnt.v_free_target = 4 * cnt.v_free_min + cnt.v_free_reserved;
else
cnt.v_free_target = 2 * cnt.v_free_min + cnt.v_free_reserved;
@@ -1362,10 +1361,9 @@ vm_pageout()
if (vm_pageout_stats_free_max == 0)
vm_pageout_stats_free_max = 5;
- max_page_launder = (cnt.v_page_count > 1800 ? 32 : 16);
-
curproc->p_flag |= P_BUFEXHAUST;
swap_pager_swap_init();
+ pass = 0;
/*
* The pageout daemon is never done, so loop forever.
*/
@@ -1386,19 +1384,27 @@ vm_pageout()
}
if (vm_pages_needed) {
/*
- * Still not done, sleep a bit and go again
+ * Still not done, take a second pass without waiting
+ * (unlimited dirty cleaning), otherwise sleep a bit
+ * and try again.
*/
- tsleep(&vm_pages_needed, PVM, "psleep", hz/2);
+ ++pass;
+ if (pass > 1)
+ tsleep(&vm_pages_needed, PVM, "psleep", hz/2);
} else {
/*
- * Good enough, sleep & handle stats
+ * Good enough, sleep & handle stats. Prime the pass
+ * for the next run.
*/
+ if (pass > 1)
+ pass = 1;
+ else
+ pass = 0;
error = tsleep(&vm_pages_needed,
PVM, "psleep", vm_pageout_stats_interval * hz);
if (error && !vm_pages_needed) {
- if (vm_pageout_actcmp > 0)
- --vm_pageout_actcmp;
splx(s);
+ pass = 0;
vm_pageout_page_stats();
continue;
}
@@ -1407,7 +1413,7 @@ vm_pageout()
if (vm_pages_needed)
cnt.v_pdwakeups++;
splx(s);
- vm_pageout_scan();
+ vm_pageout_scan(pass);
vm_pageout_deficit = 0;
}
}
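
The PG_WINATCFLS part of vm_pageout_scan() can be read on its own: on a first (pass 0) encounter a dirty inactive page is only flagged and requeued at the tail, and only a page met again while still dirty, or any dirty page once pass > 0, is actually laundered, at most maxlaunder per scan. A stripped-down, single-threaded sketch of that policy over a toy page list follows; the struct, the queue, scan_inactive() and main() are invented for illustration, laundering is reduced to clearing the dirty bit, and only the PG_WINATCFLS value is taken from the patch.

#include <stdio.h>
#include <sys/queue.h>

#define PG_WINATCFLS    0x0004  /* dirty page has had its free ride */

struct page {
        TAILQ_ENTRY(page) q;
        int dirty;
        int flags;
};
TAILQ_HEAD(pagelist, page);

/* Returns the number of pages reclaimed from the inactive list. */
static int
scan_inactive(struct pagelist *pl, int npages, int pass, int maxlaunder)
{
        struct page *m, *next;
        int maxscan = npages;   /* look at each page at most once */
        int freed = 0;

        for (m = TAILQ_FIRST(pl); m != NULL && maxscan-- > 0; m = next) {
                next = TAILQ_NEXT(m, q);
                if (m->dirty == 0) {
                        /* clean pages are always cheap to reclaim */
                        TAILQ_REMOVE(pl, m, q);
                        freed++;
                } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
                        /* first encounter: mark it and give it one more
                         * trip through the queue instead of flushing */
                        m->flags |= PG_WINATCFLS;
                        TAILQ_REMOVE(pl, m, q);
                        TAILQ_INSERT_TAIL(pl, m, q);
                } else if (maxlaunder > 0) {
                        /* second encounter (or a later pass): launder it;
                         * the now-clean page is reclaimed on the next scan */
                        m->dirty = 0;
                        maxlaunder--;
                }
        }
        return (freed);
}

int
main(void)
{
        struct pagelist pl = TAILQ_HEAD_INITIALIZER(pl);
        struct page pg[4] = {
                { .dirty = 0 }, { .dirty = 1 }, { .dirty = 0 }, { .dirty = 1 }
        };
        int i;

        for (i = 0; i < 4; i++)
                TAILQ_INSERT_TAIL(&pl, &pg[i], q);
        printf("scan 1 freed %d\n", scan_inactive(&pl, 4, 0, 32));
        printf("scan 2 freed %d\n", scan_inactive(&pl, 2, 0, 32));
        printf("scan 3 freed %d\n", scan_inactive(&pl, 2, 0, 32));
        return (0);
}
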
diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c
index 3dd12ec..c79f62a 100644
--- a/sys/vm/vnode_pager.c
+++ b/sys/vm/vnode_pager.c
@@ -300,10 +300,29 @@ vnode_pager_setsize(vp, nsize)
m = vm_page_lookup(object, OFF_TO_IDX(nsize));
if (m) {
+ int base = (int)nsize & PAGE_MASK;
+ int size = PAGE_SIZE - base;
+
+ /*
+ * Clear out partial-page garbage in case
+ * the page has been mapped.
+ */
kva = vm_pager_map_page(m);
- bzero((caddr_t) kva + (nsize & PAGE_MASK),
- (int) (round_page(nsize) - nsize));
+ bzero((caddr_t)kva + base, size);
vm_pager_unmap_page(kva);
+
+ /*
+ * Clear out partial-page dirty bits. This
+ * has the side effect of setting the valid
+ * bits, but that is ok. There are a bunch
+ * of places in the VM system where we expected
+ * m->dirty == VM_PAGE_BITS_ALL. The file EOF
+ * case is one of them. If the page is still
+ * partially dirty, make it fully dirty.
+ */
+ vm_page_set_validclean(m, base, size);
+ if (m->dirty != 0)
+ m->dirty = VM_PAGE_BITS_ALL;
}
}
}
@@ -424,6 +443,8 @@ vnode_pager_input_smlfs(object, m)
pbgetvp(dp, bp);
bp->b_bcount = bsize;
bp->b_bufsize = bsize;
+ bp->b_runningbufspace = bp->b_bufsize;
+ runningbufspace += bp->b_runningbufspace;
/* do the input */
BUF_STRATEGY(bp);
@@ -742,6 +763,8 @@ vnode_pager_generic_getpages(vp, m, bytecount, reqpage)
pbgetvp(dp, bp);
bp->b_bcount = size;
bp->b_bufsize = size;
+ bp->b_runningbufspace = bp->b_bufsize;
+ runningbufspace += bp->b_runningbufspace;
cnt.v_vnodein++;
cnt.v_vnodepgsin += count;
@@ -888,6 +911,11 @@ vnode_pager_putpages(object, m, count, sync, rtvals)
/*
* This is now called from local media FS's to operate against their
* own vnodes if they fail to implement VOP_PUTPAGES.
+ *
+ * This is typically called indirectly via the pageout daemon and
+ * clustering has already typically occurred, so in general we ask the
+ * underlying filesystem to write the data out asynchronously rather
+ * than delayed.
*/
int
vnode_pager_generic_putpages(vp, m, bytecount, flags, rtvals)
@@ -938,8 +966,13 @@ vnode_pager_generic_putpages(vp, m, bytecount, flags, rtvals)
}
}
+ /*
+ * pageouts are already clustered, use IO_ASYNC to force a bawrite()
+ * rather than a bdwrite() to prevent paging I/O from saturating
+ * the buffer cache.
+ */
ioflags = IO_VMIO;
- ioflags |= (flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL)) ? IO_SYNC: 0;
+ ioflags |= (flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL)) ? IO_SYNC: IO_ASYNC;
ioflags |= (flags & VM_PAGER_PUT_INVAL) ? IO_INVAL: 0;
aiov.iov_base = (caddr_t) 0;