Diffstat (limited to 'sys/kern/vfs_cluster.c')
-rw-r--r--  sys/kern/vfs_cluster.c  308
1 files changed, 181 insertions, 127 deletions
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index d45663a..064977b 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -33,7 +33,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
- *	$Id: vfs_cluster.c,v 1.38 1996/10/06 07:50:04 dyson Exp $
+ *	$Id: vfs_cluster.c,v 1.39 1996/11/30 22:41:41 dyson Exp $
  */
 
 #include <sys/param.h>
@@ -52,6 +52,13 @@
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 
+#if defined(CLUSTERDEBUG)
+#include <sys/sysctl.h>
+#include <sys/kernel.h>
+static int	rcluster= 0;
+SYSCTL_INT(_debug, 14, rcluster, CTLFLAG_RW, &rcluster, 0, "");
+#endif
+
 #ifdef notyet_block_reallocation_enabled
 #ifdef DEBUG
 #include <sys/sysctl.h>
@@ -70,171 +77,197 @@ static struct cluster_save *
 #endif
 
 static struct buf *
	cluster_rbuild __P((struct vnode *vp, u_quad_t filesize, daddr_t lbn,
-		daddr_t blkno, long size, int run));
+		daddr_t blkno, long size, int run, struct buf *fbp));
 
-static int	totreads;
-static int	totreadblocks;
 extern vm_page_t bogus_page;
 
-#ifdef DIAGNOSTIC
 /*
- * Set to 1 if reads of block zero should cause readahead to be done.
- * Set to 0 treats a read of block zero as a non-sequential read.
- *
- * Setting to one assumes that most reads of block zero of files are due to
- * sequential passes over the files (e.g. cat, sum) where additional blocks
- * will soon be needed.  Setting to zero assumes that the majority are
- * surgical strikes to get particular info (e.g. size, file) where readahead
- * blocks will not be used and, in fact, push out other potentially useful
- * blocks from the cache.  The former seems intuitive, but some quick tests
- * showed that the latter performed better from a system-wide point of view.
+ * Maximum number of blocks for read-ahead.
  */
-int	doclusterraz = 0;
-
-#define ISSEQREAD(vp, blk) \
-	(((blk) != 0 || doclusterraz) && \
-	 ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
-#else
-#define ISSEQREAD(vp, blk) \
-	(/* (blk) != 0 && */ ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
-#endif
+#define MAXRA 32
 
 /*
- * allow for three entire read-aheads...  The system will
- * adjust downwards rapidly if needed...
- */
-#define RA_MULTIPLE_FAST 2
-#define RA_MULTIPLE_SLOW 3
-#define RA_SHIFTDOWN 1	/* approx lg2(RA_MULTIPLE) */
-/*
- * This replaces bread.  If this is a bread at the beginning of a file and
- * lastr is 0, we assume this is the first read and we'll read up to two
- * blocks if they are sequential.  After that, we'll do regular read ahead
- * in clustered chunks.
- *	bp is the block requested.
- *	rbp is the read-ahead block.
- *	If either is NULL, then you don't have to do the I/O.
+ * This replaces bread.
  */
 int
-cluster_read(vp, filesize, lblkno, size, cred, bpp)
+cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
+	long totread;
+	int seqcount;
	struct buf **bpp;
 {
-	struct buf *bp, *rbp;
-	daddr_t blkno, rablkno, origlblkno;
-	int error, num_ra, alreadyincore;
+	struct buf *bp, *rbp, *reqbp;
+	daddr_t blkno, rablkno, origblkno;
+	int error, num_ra;
	int i;
-	int seq;
+	int maxra, racluster;
+	long origtotread;
 
	error = 0;
+
+	/*
+	 * Try to limit the amount of read-ahead by a few
+	 * ad-hoc parameters.  This needs work!!!
+	 */
+	racluster = MAXPHYS/size;
+	maxra = 2 * racluster + (totread / size);
+	if (maxra > MAXRA)
+		maxra = MAXRA;
+	if (maxra > nbuf/8)
+		maxra = nbuf/8;
+
	/*
	 * get the requested block
	 */
-	origlblkno = lblkno;
-	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
+	*bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0);
+	origblkno = lblkno;
+	origtotread = totread;
 
-	seq = ISSEQREAD(vp, lblkno);
	/*
	 * if it is in the cache, then check to see if the reads have been
	 * sequential.  If they have, then try some read-ahead, otherwise
	 * back-off on prospective read-aheads.
	 */
	if (bp->b_flags & B_CACHE) {
-		if (!seq) {
-			vp->v_maxra = bp->b_lblkno + bp->b_bcount / size;
-			vp->v_ralen >>= RA_SHIFTDOWN;
+		if (!seqcount) {
+			return 0;
+		} else if ((bp->b_flags & B_RAM) == 0) {
			return 0;
-		} else if( vp->v_maxra > lblkno) {
-			if ((vp->v_ralen + 1) < RA_MULTIPLE_FAST * (MAXPHYS / size))
-				++vp->v_ralen;
-			if ( vp->v_maxra > lblkno + vp->v_ralen ) {
+		} else {
+			int s;
+			struct buf *tbp;
+			bp->b_flags &= ~B_RAM;
+			/*
+			 * We do the spl here so that there is no window
+			 * between the incore and the b_usecount increment
+			 * below.  We opt to keep the spl out of the loop
+			 * for efficiency.
+			 */
+			s = splbio();
+			for(i=1;i<maxra;i++) {
+
+				if (!(tbp = incore(vp, lblkno+i))) {
+					break;
+				}
+
+				/*
+				 * Set another read-ahead mark so we know to check
+				 * again.
+				 */
+				if (((i % racluster) == (racluster - 1)) ||
+					(i == (maxra - 1)))
+					tbp->b_flags |= B_RAM;
+
+#if 0
+				if (tbp->b_usecount == 0) {
+					/*
+					 * Make sure that the soon-to-be used readaheads
+					 * are still there.  The getblk/bqrelse pair will
+					 * boost the priority of the buffer.
+					 */
+					tbp = getblk(vp, lblkno+i, size, 0, 0);
+					bqrelse(tbp);
+				}
+#endif
+			}
+			splx(s);
+			if (i >= maxra) {
				return 0;
			}
-			lblkno = vp->v_maxra;
+			lblkno += i;
+		}
+		reqbp = bp = NULL;
+	} else {
+		u_quad_t firstread;
+		firstread = (u_quad_t) lblkno * size;
+		if (firstread + totread > filesize)
+			totread = filesize - firstread;
+		if (totread > size) {
+			int nblks = 0;
+			int ncontigafter;
+			while (totread > 0) {
+				nblks++;
+				totread -= size;
+			}
+			if (nblks == 1)
+				goto single_block_read;
+			if (nblks > racluster)
+				nblks = racluster;
+
+			error = VOP_BMAP(vp, lblkno, NULL,
+				&blkno, &ncontigafter, NULL);
+			if (error)
+				goto single_block_read;
+			if (blkno == -1)
+				goto single_block_read;
+			if (ncontigafter == 0)
+				goto single_block_read;
+			if (ncontigafter + 1 < nblks)
+				nblks = ncontigafter + 1;
+
+			bp = cluster_rbuild(vp, filesize, lblkno,
+				blkno, size, nblks, bp);
+			lblkno += nblks;
		} else {
+single_block_read:
+			/*
+			 * if it isn't in the cache, then get a chunk from
+			 * disk if sequential, otherwise just get the block.
+			 */
+			bp->b_flags |= B_READ | B_RAM;
			lblkno += 1;
		}
-		bp = NULL;
-	} else {
-		/*
-		 * if it isn't in the cache, then get a chunk from disk if
-		 * sequential, otherwise just get the block.
-		 */
-		bp->b_flags |= B_READ;
-		lblkno += 1;
-		curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
-		vp->v_ralen = 0;
	}
-	/*
-	 * assume no read-ahead
-	 */
-	alreadyincore = 1;
-	rablkno = lblkno;
 
	/*
	 * if we have been doing sequential I/O, then do some read-ahead
	 */
-	if (seq) {
-		alreadyincore = 0;
-
-		/*
-		 * bump ralen a bit...
-		 */
-		if ((vp->v_ralen + 1) < RA_MULTIPLE_SLOW*(MAXPHYS / size))
-			++vp->v_ralen;
+	rbp = NULL;
+	/* if (seqcount && (lblkno < (origblkno + maxra))) { */
+	if (seqcount && (lblkno < (origblkno + seqcount))) {
		/*
-		 * this code makes sure that the stuff that we have read-ahead
-		 * is still in the cache.  If it isn't, we have been reading
-		 * ahead too much, and we need to back-off, otherwise we might
-		 * try to read more.
+		 * we now build the read-ahead buffer if it is desirable.
		 */
-		for (i = 0; i < vp->v_maxra - lblkno; i++) {
-			rablkno = lblkno + i;
-			alreadyincore = (int) incore(vp, rablkno);
-			if (!alreadyincore) {
-				vp->v_maxra = rablkno;
-				vp->v_ralen >>= RA_SHIFTDOWN;
-				alreadyincore = 1;
+		if (((u_quad_t)(lblkno + 1) * size) <= filesize &&
+		    !(error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_ra, NULL)) &&
+		    blkno != -1) {
+			int nblksread;
+			int ntoread = num_ra + 1;
+			nblksread = (origtotread + size - 1) / size;
+			if (seqcount < nblksread)
+				seqcount = nblksread;
+			if (seqcount < ntoread)
+				ntoread = seqcount;
+			if (num_ra) {
+				rbp = cluster_rbuild(vp, filesize, lblkno,
+					blkno, size, ntoread, NULL);
+			} else {
+				rbp = getblk(vp, lblkno, size, 0, 0);
+				rbp->b_flags |= B_READ | B_ASYNC | B_RAM;
+				rbp->b_blkno = blkno;
			}
		}
	}
-	/*
-	 * we now build the read-ahead buffer if it is desirable.
-	 */
-	rbp = NULL;
-	if (!alreadyincore &&
-	    ((u_quad_t)(rablkno + 1) * size) <= filesize &&
-	    !(error = VOP_BMAP(vp, rablkno, NULL, &blkno, &num_ra, NULL)) &&
-	    blkno != -1) {
-		if (num_ra > vp->v_ralen)
-			num_ra = vp->v_ralen;
-
-		if (num_ra) {
-			rbp = cluster_rbuild(vp, filesize, rablkno, blkno, size,
-				num_ra + 1);
-		} else {
-			rbp = getblk(vp, rablkno, size, 0, 0);
-			rbp->b_flags |= B_READ | B_ASYNC;
-			rbp->b_blkno = blkno;
-		}
-	}
+
	/*
	 * handle the synchronous read
	 */
	if (bp) {
-		if (bp->b_flags & (B_DONE | B_DELWRI))
+		if (bp->b_flags & (B_DONE | B_DELWRI)) {
			panic("cluster_read: DONE bp");
-		else {
-			vfs_busy_pages(bp, 0);
+		} else {
+#if defined(CLUSTERDEBUG)
+			if (rcluster)
+				printf("S(%d,%d,%d) ",
+					bp->b_lblkno, bp->b_bcount, seqcount);
+#endif
+			if ((bp->b_flags & B_CLUSTER) == 0)
+				vfs_busy_pages(bp, 0);
			error = VOP_STRATEGY(bp);
-			vp->v_maxra = bp->b_lblkno + bp->b_bcount / size;
-			totreads++;
-			totreadblocks += bp->b_bcount / size;
			curproc->p_stats->p_ru.ru_inblock++;
		}
	}
@@ -242,7 +275,6 @@ cluster_read(vp, filesize, lblkno, size, cred, bpp)
	 * and if we have read-aheads, do them too
	 */
	if (rbp) {
-		vp->v_maxra = rbp->b_lblkno + rbp->b_bcount / size;
		if (error) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
@@ -250,17 +282,31 @@
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			bqrelse(rbp);
		} else {
+#if defined(CLUSTERDEBUG)
+			if (rcluster) {
+				if (bp)
+					printf("A+(%d,%d,%d,%d) ",
+					    rbp->b_lblkno, rbp->b_bcount,
+					    rbp->b_lblkno - origblkno,
+					    seqcount);
+				else
+					printf("A(%d,%d,%d,%d) ",
+					    rbp->b_lblkno, rbp->b_bcount,
+					    rbp->b_lblkno - origblkno,
+					    seqcount);
+			}
+#endif
+			if ((rbp->b_flags & B_CLUSTER) == 0)
				vfs_busy_pages(rbp, 0);
			(void) VOP_STRATEGY(rbp);
-			totreads++;
-			totreadblocks += rbp->b_bcount / size;
			curproc->p_stats->p_ru.ru_inblock++;
		}
	}
-	if (bp && ((bp->b_flags & B_ASYNC) == 0))
-		return (biowait(bp));
-	return (error);
+	if (reqbp)
+		return (biowait(reqbp));
+	else
+		return (error);
 }
 
 /*
@@ -269,13 +315,14 @@ cluster_read(vp, filesize, lblkno, size, cred, bpp)
  * and then parcel them up into logical blocks in the buffer hash table.
  */
 static struct buf *
-cluster_rbuild(vp, filesize, lbn, blkno, size, run)
+cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
+	struct buf *fbp;
 {
	struct buf *bp, *tbp;
	daddr_t bn;
@@ -293,12 +340,17 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run)
		--run;
	}
 
-	tbp = getblk(vp, lbn, size, 0, 0);
-	if (tbp->b_flags & B_CACHE)
-		return tbp;
+	if (fbp) {
+		tbp = fbp;
+		tbp->b_flags |= B_READ;
+	} else {
+		tbp = getblk(vp, lbn, size, 0, 0);
+		if (tbp->b_flags & B_CACHE)
+			return tbp;
+		tbp->b_flags |= B_ASYNC | B_READ | B_RAM;
+	}
 
	tbp->b_blkno = blkno;
-	tbp->b_flags |= B_ASYNC | B_READ;
	if( (tbp->b_flags & B_MALLOC) ||
		((tbp->b_flags & B_VMIO) == 0) || (run <= 1) )
		return tbp;
@@ -353,6 +405,8 @@
				break;
			}
 
+			if ((fbp && (i == 1)) || (i == (run - 1)))
+				tbp->b_flags |= B_RAM;
			tbp->b_flags |= B_READ | B_ASYNC;
			if (tbp->b_blkno == tbp->b_lblkno) {
				tbp->b_blkno = bn;
@@ -419,9 +473,9 @@ cluster_callback(bp)
	 * Move memory from the large cluster buffer into the component
	 * buffers and mark IO as done on these.
	 */
-	for (tbp = bp->b_cluster.cluster_head.tqh_first;
+	for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head);
		tbp; tbp = nbp) {
-		nbp = tbp->b_cluster.cluster_entry.tqe_next;
+		nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry);
		if (error) {
			tbp->b_flags |= B_ERROR;
			tbp->b_error = error;
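
The heart of this revision is the replacement of the old v_ralen/ISSEQREAD heuristics with an explicit read-ahead window plus B_RAM re-check marks: cluster_read() sizes the window from the cluster size (MAXPHYS/size) and the caller-supplied totread, then flags the last buffer of each cluster-sized stripe with B_RAM so that a later cache hit on a marked buffer triggers the next round of read-ahead. The following stand-alone sketch shows the two calculations in user-space C; the MAXPHYS and nbuf values here are stand-ins picked for the example, and read_ahead_window() is an illustrative helper, not the kernel code itself.

#include <stdio.h>

#define MAXPHYS	(64 * 1024)	/* assumed maximum physical I/O size */
#define MAXRA	32		/* cap taken from the patch */

static int nbuf = 512;		/* assumed size of the buffer cache */

/*
 * Mirror of the ad-hoc window computation in cluster_read():
 * two full clusters plus however much the caller said it wants,
 * clamped by MAXRA and by an eighth of the buffer cache.
 */
static int
read_ahead_window(long size, long totread)
{
	int racluster = MAXPHYS / size;
	int maxra = 2 * racluster + (totread / size);

	if (maxra > MAXRA)
		maxra = MAXRA;
	if (maxra > nbuf / 8)
		maxra = nbuf / 8;
	return (maxra);
}

int
main(void)
{
	long size = 8192;	/* 8K file-system blocks */
	long totread = 65536;	/* caller asked for 64K */
	int racluster = MAXPHYS / size;
	int maxra = read_ahead_window(size, totread);
	int i;

	printf("window = %d blocks\n", maxra);

	/*
	 * Same marking rule as the cache-hit loop in cluster_read():
	 * the last block of each cluster-sized stripe, and the last
	 * block of the window, become B_RAM re-check points.
	 */
	for (i = 1; i < maxra; i++)
		if (((i % racluster) == (racluster - 1)) || (i == (maxra - 1)))
			printf("B_RAM mark at read-ahead block %d\n", i);
	return (0);
}

Marking only every racluster'th buffer keeps the common cache-hit path cheap: most hits return immediately, and only a hit on a B_RAM-marked buffer pays for the incore() scan that extends the window.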
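The final hunk is a mechanical modernization: cluster_callback() now walks the cluster list with the queue(3) TAILQ_FIRST/TAILQ_NEXT macros instead of dereferencing the tqh_first/tqe_next fields directly. A minimal sketch of the same safe-traversal pattern, using a hypothetical node type rather than the kernel's struct buf:

#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	int			val;
	TAILQ_ENTRY(node)	link;	/* plays the role of cluster_entry */
};

TAILQ_HEAD(nodehead, node);

int
main(void)
{
	struct nodehead head = TAILQ_HEAD_INITIALIZER(head);
	struct node *np, *nnp;
	int i;

	for (i = 0; i < 3; i++) {
		np = malloc(sizeof(*np));
		np->val = i;
		TAILQ_INSERT_TAIL(&head, np, link);
	}

	/*
	 * Same shape as the rewritten cluster_callback() loop: fetch
	 * the successor before handing off the current element, so the
	 * walk survives the element being released.
	 */
	for (np = TAILQ_FIRST(&head); np; np = nnp) {
		nnp = TAILQ_NEXT(np, link);
		printf("%d\n", np->val);
		free(np);
	}
	return (0);
}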