path: root/sys/kern/vfs_bio.c
author	mckusick <mckusick@FreeBSD.org>	1999-07-08 06:06:00 +0000
committer	mckusick <mckusick@FreeBSD.org>	1999-07-08 06:06:00 +0000
commit	52ea4270f31e0e98f9256262d955b9d8b54b4a00 (patch)
tree	dca69dbd5c0caa38ba5484660947559b9a6e6acc /sys/kern/vfs_bio.c
parent	28a2932015592e29bc53e27e881a9987c0020796 (diff)
These changes appear to give us benefits with both small (32MB) and
large (1G) memory machine configurations. I was able to run 'dbench 32'
on a 32MB system without bringing the machine to a grinding halt.

* buffer cache hash table now dynamically allocated (sketched below).
  This will have no effect on memory consumption for smaller systems
  and will help scale the buffer cache for larger systems.

* minor enhancement to pmap_clearbit(). I noticed that all the calls
  to it used constant arguments. Making it an inline allows the
  constants to propagate to deeper inlines and should produce better
  code.

* removal of inherent vfs_ioopt support through the emplacement of
  appropriate #ifdef's, with John's permission. If we do not find a
  use for it by the end of the year we will remove it entirely.

* removal of getnewbufloops* counters & sysctl's - no longer necessary
  for debugging, getnewbuf() is now optimal.

* buffer hash table functions removed from sys/buf.h and localized to
  vfs_bio.c

* VFS_BIO_NEED_DIRTYFLUSH flag and support code added ( bwillwrite() ),
  allowing processes to block when too many dirty buffers are present
  in the system.

* removal of a softdep test in bdwrite() that is no longer necessary
  now that bdwrite() no longer attempts to flush dirty buffers.

* slight optimization added to bqrelse() - there is no reason to test
  for available buffer space on B_DELWRI buffers.

* addition of reverse-scanning code to vfs_bio_awrite().
  vfs_bio_awrite() will attempt to locate clusterable areas in both the
  forward and reverse direction relative to the offset of the buffer
  passed to it. This will probably not make much of a difference now,
  but I believe we will start to rely on it heavily in the future if we
  decide to shift some of the burden of the clustering closer to the
  actual I/O initiation.

* Removal of the newbufcnt and lastnewbuf counters that Kirk added.
  They do not fix any race conditions that haven't already been fixed
  by the gbincore() test done after the only call to getnewbuf().
  getnewbuf() is a static, so there is no chance of it being misused by
  other modules. ( Unless Kirk can think of a specific thing that this
  code fixes. I went through it very carefully and didn't see
  anything ).

* removal of VOP_ISLOCKED() check in flushbufqueues(). I do not think
  this check is necessary, the buffer should flush properly whether the
  vnode is locked or not. ( yes? ).

* removal of extra arguments passed to getnewbuf() that are not
  necessary.

* missed cluster_wbuild() that had to be a cluster_wbuild_wb() in
  vfs_cluster.c

* vn_write() now calls bwillwrite() *PRIOR* to locking the vnode, which
  should greatly aid flushing operations in heavy load situations -
  both the pageout and update daemons will be able to operate more
  efficiently.

* removal of b_usecount. We may add it back in later but for now it is
  useless. Prior implementations of the buffer cache never had enough
  buffers for it to be useful, and current implementations which make
  more buffers available might not benefit relative to the amount of
  sophistication required to implement a b_usecount. Straight LRU
  should work just as well, especially when most things are VMIO
  backed. I expect that (even though John will not like this
  assumption) directories will become VMIO backed some point soon.

Submitted by:	Matthew Dillon <dillon@backplane.com>
Reviewed by:	Kirk McKusick <mckusick@mckusick.com>
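
The dynamically allocated hash table mentioned in the first item can be
illustrated with the minimal, user-space sketch below, adapted from the
bufhashinit()/bufhash() code in the diff that follows. The standalone
setup is illustrative only: the names bufhash_init()/bufhash_lookup()
and the use of calloc() are assumptions made for the sketch, whereas
the real kernel code carves the table out of the kernel VA handed to
bufhashinit() and hashes on a struct vnode pointer and a daddr_t.

	/*
	 * Sketch of the dynamically sized buffer hash table (simplified,
	 * user-space illustration; not the kernel code itself).
	 */
	#include <stdint.h>
	#include <stdlib.h>

	struct buf;					/* opaque here */
	struct bufhashhdr { struct buf *lh_first; };	/* like LIST_HEAD() */

	static struct bufhashhdr *bufhashtbl;
	static int bufhashmask;

	/* Size the table to a power of two near nbuf / 4; keep it as a bit mask. */
	static void
	bufhash_init(int nbuf)
	{
		int size;

		for (size = 8; size < nbuf / 4; size <<= 1)
			;
		bufhashtbl = calloc(size, sizeof(*bufhashtbl));
		bufhashmask = size - 1;
	}

	/*
	 * Hash on the vnode pointer and the logical block number.  Adjacent
	 * logical blocks of the same vnode land in adjacent buckets, which
	 * gives linear scans some L1 cache locality.
	 */
	static struct bufhashhdr *
	bufhash_lookup(const void *vp, int64_t blkno)
	{
		return (&bufhashtbl[(((uintptr_t)vp >> 7) + (int)blkno) & bufhashmask]);
	}

The shift by 7 discards the low bits of the vnode pointer (which are
largely constant due to allocator alignment) before mixing in the block
number, so buffers of one vnode spread across consecutive buckets.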
Diffstat (limited to 'sys/kern/vfs_bio.c')
-rw-r--r--	sys/kern/vfs_bio.c	381
1 files changed, 214 insertions, 167 deletions
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 5c478c6..47e8276 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -11,7 +11,7 @@
* 2. Absolutely no warranty of function or purpose is made by the author
* John S. Dyson.
*
- * $Id: vfs_bio.c,v 1.219 1999/06/29 05:59:41 peter Exp $
+ * $Id: vfs_bio.c,v 1.220 1999/07/04 00:25:27 mckusick Exp $
*/
/*
@@ -90,14 +90,11 @@ static int bufspace, maxbufspace, vmiospace,
#if 0
static int maxvmiobufspace;
#endif
+static int maxbdrun;
static int needsbuffer;
static int numdirtybuffers, lodirtybuffers, hidirtybuffers;
static int numfreebuffers, lofreebuffers, hifreebuffers;
static int getnewbufcalls;
-static int getnewbufloops;
-static int getnewbufloops1;
-static int getnewbufloops2;
-static int getnewbufloops3;
static int getnewbufrestarts;
static int kvafreespace;
@@ -121,6 +118,8 @@ SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD,
&hibufspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
&bufspace, 0, "");
+SYSCTL_INT(_vfs, OID_AUTO, maxbdrun, CTLFLAG_RW,
+ &maxbdrun, 0, "");
#if 0
SYSCTL_INT(_vfs, OID_AUTO, maxvmiobufspace, CTLFLAG_RW,
&maxvmiobufspace, 0, "");
@@ -135,18 +134,12 @@ SYSCTL_INT(_vfs, OID_AUTO, kvafreespace, CTLFLAG_RD,
&kvafreespace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW,
&getnewbufcalls, 0, "");
-SYSCTL_INT(_vfs, OID_AUTO, getnewbufloops, CTLFLAG_RW,
- &getnewbufloops, 0, "");
-SYSCTL_INT(_vfs, OID_AUTO, getnewbufloops1, CTLFLAG_RW,
- &getnewbufloops1, 0, "");
-SYSCTL_INT(_vfs, OID_AUTO, getnewbufloops2, CTLFLAG_RW,
- &getnewbufloops2, 0, "");
-SYSCTL_INT(_vfs, OID_AUTO, getnewbufloops3, CTLFLAG_RW,
- &getnewbufloops3, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW,
&getnewbufrestarts, 0, "");
-static LIST_HEAD(bufhashhdr, buf) bufhashtbl[BUFHSZ], invalhash;
+
+static int bufhashmask;
+static LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
struct bqueues bufqueues[BUFFER_QUEUES] = { { 0 } };
char *buf_wmesg = BUF_WMESG;
@@ -155,12 +148,24 @@ extern int vm_swap_size;
#define BUF_MAXUSE 24
#define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */
-#define VFS_BIO_NEED_RESERVED02 0x02 /* unused */
+#define VFS_BIO_NEED_DIRTYFLUSH 0x02 /* waiting for dirty buffer flush */
#define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */
#define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */
#define VFS_BIO_NEED_KVASPACE 0x10 /* wait for buffer_map space, emerg */
/*
+ * Buffer hash table code. Note that the logical block scans linearly, which
+ * gives us some L1 cache locality.
+ */
+
+static __inline
+struct bufhashhdr *
+bufhash(struct vnode *vnp, daddr_t bn)
+{
+ return(&bufhashtbl[(((uintptr_t)(vnp) >> 7) + (int)bn) & bufhashmask]);
+}
+
+/*
* kvaspacewakeup:
*
* Called when kva space is potential available for recovery or when
@@ -185,6 +190,24 @@ kvaspacewakeup(void)
}
/*
+ * numdirtywakeup:
+ *
+ * If someone is blocked due to there being too many dirty buffers,
+ * and numdirtybuffers is now reasonable, wake them up.
+ */
+
+static __inline void
+numdirtywakeup(void)
+{
+ if (numdirtybuffers < hidirtybuffers) {
+ if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) {
+ needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH;
+ wakeup(&needsbuffer);
+ }
+ }
+}
+
+/*
* bufspacewakeup:
*
* Called when buffer space is potentially available for recovery or when
@@ -260,10 +283,23 @@ bd_wakeup(int dirtybuflevel)
/*
- * Initialize buffer headers and related structures.
+ * Initialize buffer headers and related structures.
*/
+
+vm_offset_t
+bufhashinit(vm_offset_t vaddr)
+{
+ /* first, make a null hash table */
+ for (bufhashmask = 8; bufhashmask < nbuf / 4; bufhashmask <<= 1)
+ ;
+ bufhashtbl = (void *)vaddr;
+ vaddr = vaddr + sizeof(*bufhashtbl) * bufhashmask;
+ --bufhashmask;
+ return(vaddr);
+}
+
void
-bufinit()
+bufinit(void)
{
struct buf *bp;
int i;
@@ -272,8 +308,7 @@ bufinit()
LIST_INIT(&invalhash);
simple_lock_init(&buftimelock);
- /* first, make a null hash table */
- for (i = 0; i < BUFHSZ; i++)
+ for (i = 0; i <= bufhashmask; i++)
LIST_INIT(&bufhashtbl[i]);
/* next, make a null set of free lists */
@@ -329,8 +364,8 @@ bufinit()
* Reduce the chance of a deadlock occuring by limiting the number
* of delayed-write dirty buffers we allow to stack up.
*/
- lodirtybuffers = nbuf / 6 + 10;
- hidirtybuffers = nbuf / 3 + 20;
+ lodirtybuffers = nbuf / 7 + 10;
+ hidirtybuffers = nbuf / 4 + 20;
numdirtybuffers = 0;
/*
@@ -341,6 +376,15 @@ bufinit()
hifreebuffers = 2 * lofreebuffers;
numfreebuffers = nbuf;
+/*
+ * Maximum number of async ops initiated per buf_daemon loop. This is
+ * somewhat of a hack at the moment, we really need to limit ourselves
+ * based on the number of bytes of I/O in-transit that were initiated
+ * from buf_daemon.
+ */
+ if ((maxbdrun = nswbuf / 4) < 4)
+ maxbdrun = 4;
+
kvafreespace = 0;
bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
@@ -383,19 +427,14 @@ bremfree(struct buf * bp)
if (bp->b_qindex == QUEUE_EMPTYKVA) {
kvafreespace -= bp->b_kvasize;
}
- if (BUF_REFCNT(bp) == 1)
- TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
- else if (BUF_REFCNT(bp) == 0)
- panic("bremfree: not locked");
- else
- /* Temporary panic to verify exclusive locking */
- /* This panic goes away when we allow shared refs */
- panic("bremfree: multiple refs");
+ KASSERT(BUF_REFCNT(bp) == 0, ("bremfree: bp %p not locked",bp));
+ TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
bp->b_qindex = QUEUE_NONE;
runningbufspace += bp->b_bufsize;
} else {
#if !defined(MAX_PERF)
- panic("bremfree: removing a buffer when not on a queue");
+ if (BUF_REFCNT(bp) <= 1)
+ panic("bremfree: removing a buffer not on a queue");
#endif
}
@@ -599,7 +638,9 @@ bwrite(struct buf * bp)
void
bdwrite(struct buf * bp)
{
+#if 0
struct vnode *vp;
+#endif
#if !defined(MAX_PERF)
if (BUF_REFCNT(bp) == 0)
@@ -654,6 +695,11 @@ bdwrite(struct buf * bp)
bd_wakeup(hidirtybuffers);
/*
+ * note: we cannot initiate I/O from a bdwrite even if we wanted to,
+ * due to the softdep code.
+ */
+#if 0
+ /*
* XXX The soft dependency code is not prepared to
* have I/O done when a bdwrite is requested. For
* now we just let the write be delayed if it is
@@ -664,6 +710,7 @@ bdwrite(struct buf * bp)
(vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP)) ||
(vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP))))
return;
+#endif
}
/*
@@ -722,6 +769,7 @@ bundirty(bp)
bp->b_flags &= ~B_DELWRI;
reassignbuf(bp, bp->b_vp);
--numdirtybuffers;
+ numdirtywakeup();
}
}
@@ -757,6 +805,34 @@ bowrite(struct buf * bp)
}
/*
+ * bwillwrite:
+ *
+ * Called prior to the locking of any vnodes when we are expecting to
+ * write. We do not want to starve the buffer cache with too many
+ * dirty buffers so we block here. By blocking prior to the locking
+ * of any vnodes we attempt to avoid the situation where a locked vnode
+ * prevents the various system daemons from flushing related buffers.
+ */
+
+void
+bwillwrite(void)
+{
+ int twenty = (hidirtybuffers - lodirtybuffers) / 5;
+
+ if (numdirtybuffers > hidirtybuffers + twenty) {
+ int s;
+
+ s = splbio();
+ while (numdirtybuffers > hidirtybuffers) {
+ bd_wakeup(hidirtybuffers);
+ needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
+ tsleep(&needsbuffer, (PRIBIO + 4), "flswai", 0);
+ }
+ splx(s);
+ }
+}
+
+/*
* brelse:
*
* Release a busy buffer and, if requested, free its resources. The
@@ -799,8 +875,10 @@ brelse(struct buf * bp)
bp->b_flags |= B_INVAL;
if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
(*bioops.io_deallocate)(bp);
- if (bp->b_flags & B_DELWRI)
+ if (bp->b_flags & B_DELWRI) {
--numdirtybuffers;
+ numdirtywakeup();
+ }
bp->b_flags &= ~(B_DELWRI | B_CACHE | B_FREEBUF);
if ((bp->b_flags & B_VMIO) == 0) {
if (bp->b_bufsize)
@@ -991,6 +1069,7 @@ brelse(struct buf * bp)
if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI)) {
bp->b_flags &= ~B_DELWRI;
--numdirtybuffers;
+ numdirtywakeup();
}
runningbufspace -= bp->b_bufsize;
@@ -1070,7 +1149,7 @@ bqrelse(struct buf * bp)
/*
* Something we can maybe wakeup
*/
- if (bp->b_bufsize)
+ if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
bufspacewakeup();
/* unlock */
@@ -1139,7 +1218,7 @@ gbincore(struct vnode * vp, daddr_t blkno)
struct buf *bp;
struct bufhashhdr *bh;
- bh = BUFHASH(vp, blkno);
+ bh = bufhash(vp, blkno);
bp = bh->lh_first;
/* Search hash chain */
@@ -1155,14 +1234,18 @@ gbincore(struct vnode * vp, daddr_t blkno)
}
/*
- * this routine implements clustered async writes for
- * clearing out B_DELWRI buffers... This is much better
- * than the old way of writing only one buffer at a time.
+ * vfs_bio_awrite:
+ *
+ * Implement clustered async writes for clearing out B_DELWRI buffers.
+ * This is much better then the old way of writing only one buffer at
+ * a time. Note that we may not be presented with the buffers in the
+ * correct order, so we search for the cluster in both directions.
*/
int
vfs_bio_awrite(struct buf * bp)
{
int i;
+ int j;
daddr_t lblkno = bp->b_lblkno;
struct vnode *vp = bp->b_vp;
int s;
@@ -1174,8 +1257,9 @@ vfs_bio_awrite(struct buf * bp)
s = splbio();
/*
- * right now we support clustered writing only to regular files, and
- * then only if our I/O system is not saturated.
+ * right now we support clustered writing only to regular files. If
+ * we find a clusterable block we could be in the middle of a cluster
+ * rather then at the beginning.
*/
if ((vp->v_type == VREG) &&
(vp->v_mount != 0) && /* Only on nodes that have the size info */
@@ -1191,18 +1275,34 @@ vfs_bio_awrite(struct buf * bp)
(B_DELWRI | B_CLUSTEROK)) &&
(bpa->b_bufsize == size)) {
if ((bpa->b_blkno == bpa->b_lblkno) ||
- (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
+ (bpa->b_blkno !=
+ bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
break;
} else {
break;
}
}
- ncl = i;
+ for (j = 1; i + j <= maxcl && j <= lblkno; j++) {
+ if ((bpa = gbincore(vp, lblkno - j)) &&
+ BUF_REFCNT(bpa) == 0 &&
+ ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
+ (B_DELWRI | B_CLUSTEROK)) &&
+ (bpa->b_bufsize == size)) {
+ if ((bpa->b_blkno == bpa->b_lblkno) ||
+ (bpa->b_blkno !=
+ bp->b_blkno - ((j * size) >> DEV_BSHIFT)))
+ break;
+ } else {
+ break;
+ }
+ }
+ --j;
+ ncl = i + j;
/*
* this is a possible cluster write
*/
if (ncl != 1) {
- nwritten = cluster_wbuild(vp, size, lblkno, ncl);
+ nwritten = cluster_wbuild(vp, size, lblkno - j, ncl);
splx(s);
return nwritten;
}
@@ -1240,21 +1340,12 @@ vfs_bio_awrite(struct buf * bp)
* If we have to flush dirty buffers ( but we try to avoid this )
*
* To avoid VFS layer recursion we do not flush dirty buffers ourselves.
- * Instead we ask the pageout daemon to do it for us. We attempt to
+ * Instead we ask the buf daemon to do it for us. We attempt to
* avoid piecemeal wakeups of the pageout daemon.
*/
- /*
- * We fully expect to be able to handle any fragmentation and buffer
- * space issues by freeing QUEUE_CLEAN buffers. If this fails, we
- * have to wakeup the pageout daemon and ask it to flush some of our
- * QUEUE_DIRTY buffers. We have to be careful to prevent a deadlock.
- * XXX
- */
-
static struct buf *
-getnewbuf(struct vnode *vp, daddr_t blkno,
- int slpflag, int slptimeo, int size, int maxsize)
+getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
{
struct buf *bp;
struct buf *nbp;
@@ -1262,8 +1353,6 @@ getnewbuf(struct vnode *vp, daddr_t blkno,
int outofspace;
int nqindex;
int defrag = 0;
- static int newbufcnt = 0;
- int lastnewbuf = newbufcnt;
++getnewbufcalls;
--getnewbufrestarts;
@@ -1338,13 +1427,9 @@ restart:
* depending.
*/
- if (nbp)
- --getnewbufloops;
-
while ((bp = nbp) != NULL) {
int qindex = nqindex;
- ++getnewbufloops;
/*
* Calculate next bp ( we can only use it if we do not block
* or do other fancy things ).
@@ -1372,7 +1457,6 @@ restart:
/*
* Sanity Checks
*/
- KASSERT(BUF_REFCNT(bp) == 0, ("getnewbuf: busy buffer %p on free list", bp));
KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistant queue %d bp %p", qindex, bp));
/*
@@ -1388,14 +1472,10 @@ restart:
* buffer isn't useful for fixing that problem we continue.
*/
- if (defrag > 0 && bp->b_kvasize == 0) {
- ++getnewbufloops1;
+ if (defrag > 0 && bp->b_kvasize == 0)
continue;
- }
- if (outofspace > 0 && bp->b_bufsize == 0) {
- ++getnewbufloops2;
+ if (outofspace > 0 && bp->b_bufsize == 0)
continue;
- }
/*
* Start freeing the bp. This is somewhat involved. nbp
@@ -1433,7 +1513,6 @@ restart:
}
if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
(*bioops.io_deallocate)(bp);
-
LIST_REMOVE(bp, b_hash);
LIST_INSERT_HEAD(&invalhash, bp, b_hash);
@@ -1451,7 +1530,6 @@ restart:
bp->b_bcount = 0;
bp->b_npages = 0;
bp->b_dirtyoff = bp->b_dirtyend = 0;
- bp->b_usecount = 5;
LIST_INIT(&bp->b_dep);
@@ -1489,19 +1567,26 @@ restart:
/*
* If we exhausted our list, sleep as appropriate. We may have to
- * wakeup the pageout daemon to write out some dirty buffers.
+ * wakeup various daemons and write out some dirty buffers.
+ *
+ * Generally we are sleeping due to insufficient buffer space.
*/
if (bp == NULL) {
int flags;
+ char *waitmsg;
dosleep:
- if (defrag > 0)
+ if (defrag > 0) {
flags = VFS_BIO_NEED_KVASPACE;
- else if (outofspace > 0)
+ waitmsg = "nbufkv";
+ } else if (outofspace > 0) {
+ waitmsg = "nbufbs";
flags = VFS_BIO_NEED_BUFSPACE;
- else
+ } else {
+ waitmsg = "newbuf";
flags = VFS_BIO_NEED_ANY;
+ }
/* XXX */
@@ -1509,7 +1594,7 @@ dosleep:
needsbuffer |= flags;
while (needsbuffer & flags) {
if (tsleep(&needsbuffer, (PRIBIO + 4) | slpflag,
- "newbuf", slptimeo))
+ waitmsg, slptimeo))
return (NULL);
}
} else {
@@ -1553,42 +1638,7 @@ dosleep:
}
bp->b_data = bp->b_kvabase;
}
-
- /*
- * If we have slept at some point in this process and another
- * process has managed to allocate a new buffer while we slept,
- * we have to return NULL so that our caller can recheck to
- * ensure that the other process did not create an identically
- * identified buffer to the one we were requesting. We make this
- * check by incrementing the static int newbufcnt each time we
- * successfully allocate a new buffer. By saving the value of
- * newbufcnt in our local lastnewbuf, we can compare newbufcnt
- * with lastnewbuf to see if any other process managed to
- * allocate a buffer while we were doing so ourselves.
- *
- * Note that bp, if valid, is locked.
- */
- if (lastnewbuf == newbufcnt) {
- /*
- * No buffers allocated, so we can return one if we were
- * successful, or continue trying if we were not successful.
- */
- if (bp != NULL) {
- newbufcnt += 1;
- return (bp);
- }
- goto restart;
- }
- /*
- * Another process allocated a buffer since we were called, so
- * we have to free the one we allocated and return NULL to let
- * our caller recheck to see if a new buffer is still needed.
- */
- if (bp != NULL) {
- bp->b_flags |= B_INVAL;
- brelse(bp);
- }
- return (NULL);
+ return(bp);
}
/*
@@ -1601,7 +1651,6 @@ static void
waitfreebuffers(int slpflag, int slptimeo)
{
while (numfreebuffers < hifreebuffers) {
- bd_wakeup(0);
if (numfreebuffers >= hifreebuffers)
break;
needsbuffer |= VFS_BIO_NEED_FREE;
@@ -1646,60 +1695,72 @@ buf_daemon()
bd_request = 0;
/*
- * Do the flush.
+ * Do the flush. Limit the number of buffers we flush in one
+ * go. The failure condition occurs when processes are writing
+ * buffers faster then we can dispose of them. In this case
+ * we may be flushing so often that the previous set of flushes
+ * have not had time to complete, causing us to run out of
+ * physical buffers and block.
*/
{
- while (numdirtybuffers > bd_flushto) {
+ int runcount = maxbdrun;
+
+ while (numdirtybuffers > bd_flushto && runcount) {
+ --runcount;
if (flushbufqueues() == 0)
break;
}
}
/*
- * Whew. If nobody is requesting anything we sleep until the
- * next event. If we sleep and the sleep times out and
- * nobody is waiting for interesting things we back-off.
- * Otherwise we get more aggressive.
+ * If nobody is requesting anything we sleep
*/
+ if (bd_request == 0)
+ tsleep(&bd_request, PVM, "psleep", bd_interval);
- if (bd_request == 0 &&
- tsleep(&bd_request, PVM, "psleep", bd_interval) &&
- needsbuffer == 0) {
- /*
- * timed out and nothing serious going on,
- * increase the flushto high water mark to reduce
- * the flush rate.
- */
- bd_flushto += 10;
- } else {
- /*
- * We were woken up or hit a serious wall that needs
- * to be addressed.
- */
- bd_flushto -= 10;
- if (needsbuffer) {
- int middb = (lodirtybuffers+hidirtybuffers)/2;
- bd_interval >>= 1;
- if (bd_flushto > middb)
- bd_flushto = middb;
- }
+ /*
+ * We calculate how much to add or subtract from bd_flushto
+ * and bd_interval based on how far off we are from the
+ * optimal number of dirty buffers, which is 20% below the
+ * hidirtybuffers mark. We cannot use hidirtybuffers straight
+ * because being right on the mark will cause getnewbuf()
+ * to oscillate our wakeup.
+ *
+ * The larger the error in either direction, the more we adjust
+ * bd_flushto and bd_interval. The time interval is adjusted
+ * by 2 seconds per whole-buffer-range of error. This is an
+ * exponential convergence algorithm, with large errors
+ * producing large changes and small errors producing small
+ * changes.
+ */
+
+ {
+ int brange = hidirtybuffers - lodirtybuffers;
+ int middb = hidirtybuffers - brange / 5;
+ int deltabuf = middb - numdirtybuffers;
+
+ bd_flushto += deltabuf / 20;
+ bd_interval += deltabuf * (2 * hz) / (brange * 1);
}
- if (bd_flushto < lodirtybuffers) {
+ if (bd_flushto < lodirtybuffers)
bd_flushto = lodirtybuffers;
- bd_interval -= hz / 10;
- }
- if (bd_flushto > hidirtybuffers) {
+ if (bd_flushto > hidirtybuffers)
bd_flushto = hidirtybuffers;
- bd_interval += hz / 10;
- }
if (bd_interval < hz / 10)
bd_interval = hz / 10;
-
if (bd_interval > 5 * hz)
bd_interval = 5 * hz;
}
}
+/*
+ * flushbufqueues:
+ *
+ * Try to flush a buffer in the dirty queue. We must be careful to
+ * free up B_INVAL buffers instead of write them, which NFS is
+ * particularly sensitive to.
+ */
+
static int
flushbufqueues(void)
{
@@ -1709,15 +1770,6 @@ flushbufqueues(void)
bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]);
while (bp) {
- /*
- * Try to free up B_INVAL delayed-write buffers rather then
- * writing them out. Note also that NFS is somewhat sensitive
- * to B_INVAL buffers so it is doubly important that we do
- * this.
- *
- * We do not try to sync buffers whos vnodes are locked, we
- * cannot afford to block in this process.
- */
KASSERT((bp->b_flags & B_DELWRI), ("unexpected clean buffer %p", bp));
if ((bp->b_flags & B_DELWRI) != 0) {
if (bp->b_flags & B_INVAL) {
@@ -1728,11 +1780,9 @@ flushbufqueues(void)
++r;
break;
}
- if (!VOP_ISLOCKED(bp->b_vp)) {
- vfs_bio_awrite(bp);
- ++r;
- break;
- }
+ vfs_bio_awrite(bp);
+ ++r;
+ break;
}
bp = TAILQ_NEXT(bp, b_freelist);
}
@@ -1957,8 +2007,6 @@ loop:
*/
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
- if (bp->b_usecount < BUF_MAXUSE)
- ++bp->b_usecount;
if (BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL,
"getblk", slpflag, slptimeo) == ENOLCK)
goto loop;
@@ -2036,8 +2084,6 @@ loop:
goto loop;
}
- if (bp->b_usecount < BUF_MAXUSE)
- ++bp->b_usecount;
splx(s);
bp->b_flags &= ~B_DONE;
} else {
@@ -2063,8 +2109,7 @@ loop:
maxsize = vmio ? size + (offset & PAGE_MASK) : size;
maxsize = imax(maxsize, bsize);
- if ((bp = getnewbuf(vp, blkno,
- slpflag, slptimeo, size, maxsize)) == NULL) {
+ if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == NULL) {
if (slpflag || slptimeo) {
splx(s);
return NULL;
@@ -2079,7 +2124,8 @@ loop:
* If the buffer is created out from under us, we have to
* throw away the one we just created. There is now window
* race because we are safely running at splbio() from the
- * point of the duplicate buffer creation through to here.
+ * point of the duplicate buffer creation through to here,
+ * and we've locked the buffer.
*/
if (gbincore(vp, blkno)) {
bp->b_flags |= B_INVAL;
@@ -2096,7 +2142,7 @@ loop:
bgetvp(vp, bp);
LIST_REMOVE(bp, b_hash);
- bh = BUFHASH(vp, blkno);
+ bh = bufhash(vp, blkno);
LIST_INSERT_HEAD(bh, bp, b_hash);
/*
@@ -2135,7 +2181,7 @@ geteblk(int size)
int s;
s = splbio();
- while ((bp = getnewbuf(0, (daddr_t) 0, 0, 0, size, MAXBSIZE)) == 0);
+ while ((bp = getnewbuf(0, 0, size, MAXBSIZE)) == 0);
splx(s);
allocbuf(bp, size);
bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
@@ -2218,7 +2264,8 @@ allocbuf(struct buf *bp, int size)
#if !defined(NO_B_MALLOC)
/*
* We only use malloced memory on the first allocation.
- * and revert to page-allocated memory when the buffer grows.
+ * and revert to page-allocated memory when the buffer
+ * grows.
*/
if ( (bufmallocspace < maxbufmallocspace) &&
(bp->b_bufsize == 0) &&
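
As a supplement to the buf_daemon changes in the diff above, the
error-proportional adjustment of bd_flushto and bd_interval can be
sketched in isolation as below. This is a standalone paraphrase under
stated assumptions: the function name bd_adjust() and the parameter
passing are hypothetical, since the kernel code updates the
bd_flushto/bd_interval globals in place inside buf_daemon().

	/*
	 * Standalone sketch of the buf_daemon feedback adjustment.  The
	 * target dirty-buffer count is 20% below hidirtybuffers; the flush
	 * threshold and sleep interval are nudged in proportion to the
	 * signed error, then clamped to sane bounds.
	 */
	static void
	bd_adjust(int numdirtybuffers, int lodirtybuffers, int hidirtybuffers,
	    int hz, int *bd_flushto, int *bd_interval)
	{
		int brange = hidirtybuffers - lodirtybuffers;
		int middb = hidirtybuffers - brange / 5;	/* optimal dirty count */
		int deltabuf = middb - numdirtybuffers;		/* signed error */

		*bd_flushto += deltabuf / 20;
		*bd_interval += deltabuf * (2 * hz) / brange;

		if (*bd_flushto < lodirtybuffers)
			*bd_flushto = lodirtybuffers;
		if (*bd_flushto > hidirtybuffers)
			*bd_flushto = hidirtybuffers;
		if (*bd_interval < hz / 10)
			*bd_interval = hz / 10;
		if (*bd_interval > 5 * hz)
			*bd_interval = 5 * hz;
	}

Large errors in either direction produce large corrections and small
errors produce small ones, so the daemon converges on the target rather
than oscillating around the hidirtybuffers mark.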