path: root/sys/kern/vfs_cluster.c
Diffstat (limited to 'sys/kern/vfs_cluster.c')
-rw-r--r--  sys/kern/vfs_cluster.c  308
1 file changed, 181 insertions(+), 127 deletions(-)
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index d45663a..064977b 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -33,7 +33,7 @@
* SUCH DAMAGE.
*
* @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94
- * $Id: vfs_cluster.c,v 1.38 1996/10/06 07:50:04 dyson Exp $
+ * $Id: vfs_cluster.c,v 1.39 1996/11/30 22:41:41 dyson Exp $
*/
#include <sys/param.h>
@@ -52,6 +52,13 @@
#include <vm/vm_object.h>
#include <vm/vm_page.h>
+#if defined(CLUSTERDEBUG)
+#include <sys/sysctl.h>
+#include <sys/kernel.h>
+static int rcluster = 0;
+SYSCTL_INT(_debug, 14, rcluster, CTLFLAG_RW, &rcluster, 0, "");
+#endif
+
#ifdef notyet_block_reallocation_enabled
#ifdef DEBUG
#include <sys/sysctl.h>
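With CLUSTERDEBUG compiled into the kernel, the tracing added in this commit can be toggled at run time. Judging from the SYSCTL_INT declaration above, the knob would appear as debug.rcluster (an inference from the declaration, not something the diff states explicitly; this uses the old numeric-OID sysctl form):

    # assumes a kernel built with options CLUSTERDEBUG
    sysctl -w debug.rcluster=1      # enable cluster_read tracing
    sysctl -w debug.rcluster=0      # silence it again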
@@ -70,171 +77,197 @@ static struct cluster_save *
#endif
static struct buf *
cluster_rbuild __P((struct vnode *vp, u_quad_t filesize, daddr_t lbn,
- daddr_t blkno, long size, int run));
+ daddr_t blkno, long size, int run, struct buf *fbp));
-static int totreads;
-static int totreadblocks;
extern vm_page_t bogus_page;
-#ifdef DIAGNOSTIC
/*
- * Set to 1 if reads of block zero should cause readahead to be done.
- * Set to 0 treats a read of block zero as a non-sequential read.
- *
- * Setting to one assumes that most reads of block zero of files are due to
- * sequential passes over the files (e.g. cat, sum) where additional blocks
- * will soon be needed. Setting to zero assumes that the majority are
- * surgical strikes to get particular info (e.g. size, file) where readahead
- * blocks will not be used and, in fact, push out other potentially useful
- * blocks from the cache. The former seems intuitive, but some quick tests
- * showed that the latter performed better from a system-wide point of view.
+ * Maximum number of blocks for read-ahead.
*/
- int doclusterraz = 0;
-
-#define ISSEQREAD(vp, blk) \
- (((blk) != 0 || doclusterraz) && \
- ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
-#else
-#define ISSEQREAD(vp, blk) \
- (/* (blk) != 0 && */ ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
-#endif
+#define MAXRA 32
/*
- * allow for three entire read-aheads... The system will
- * adjust downwards rapidly if needed...
- */
-#define RA_MULTIPLE_FAST 2
-#define RA_MULTIPLE_SLOW 3
-#define RA_SHIFTDOWN 1 /* approx lg2(RA_MULTIPLE) */
-/*
- * This replaces bread. If this is a bread at the beginning of a file and
- * lastr is 0, we assume this is the first read and we'll read up to two
- * blocks if they are sequential. After that, we'll do regular read ahead
- * in clustered chunks.
- * bp is the block requested.
- * rbp is the read-ahead block.
- * If either is NULL, then you don't have to do the I/O.
+ * This replaces bread.
*/
int
-cluster_read(vp, filesize, lblkno, size, cred, bpp)
+cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp)
struct vnode *vp;
u_quad_t filesize;
daddr_t lblkno;
long size;
struct ucred *cred;
+ long totread;
+ int seqcount;
struct buf **bpp;
{
- struct buf *bp, *rbp;
- daddr_t blkno, rablkno, origlblkno;
- int error, num_ra, alreadyincore;
+ struct buf *bp, *rbp, *reqbp;
+ daddr_t blkno, rablkno, origblkno;
+ int error, num_ra;
int i;
- int seq;
+ int maxra, racluster;
+ long origtotread;
error = 0;
+
+ /*
+ * Try to limit the amount of read-ahead by a few
+ * ad-hoc parameters. This needs work!!!
+ */
+ racluster = MAXPHYS/size;
+ maxra = 2 * racluster + (totread / size);
+ if (maxra > MAXRA)
+ maxra = MAXRA;
+ if (maxra > nbuf/8)
+ maxra = nbuf/8;
+
/*
* get the requested block
*/
- origlblkno = lblkno;
- *bpp = bp = getblk(vp, lblkno, size, 0, 0);
+ *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0);
+ origblkno = lblkno;
+ origtotread = totread;
- seq = ISSEQREAD(vp, lblkno);
/*
* if it is in the cache, then check to see if the reads have been
* sequential. If they have, then try some read-ahead, otherwise
* back-off on prospective read-aheads.
*/
if (bp->b_flags & B_CACHE) {
- if (!seq) {
- vp->v_maxra = bp->b_lblkno + bp->b_bcount / size;
- vp->v_ralen >>= RA_SHIFTDOWN;
+ if (!seqcount) {
+ return 0;
+ } else if ((bp->b_flags & B_RAM) == 0) {
return 0;
- } else if( vp->v_maxra > lblkno) {
- if ((vp->v_ralen + 1) < RA_MULTIPLE_FAST * (MAXPHYS / size))
- ++vp->v_ralen;
- if ( vp->v_maxra > lblkno + vp->v_ralen ) {
+ } else {
+ int s;
+ struct buf *tbp;
+ bp->b_flags &= ~B_RAM;
+ /*
+ * We do the spl here so that there is no window
+ * between the incore and the b_usecount increment
+ * below. We opt to keep the spl out of the loop
+ * for efficiency.
+ */
+ s = splbio();
+ for (i = 1; i < maxra; i++) {
+
+ if (!(tbp = incore(vp, lblkno+i))) {
+ break;
+ }
+
+ /*
+ * Set another read-ahead mark so we know to check
+ * again.
+ */
+ if (((i % racluster) == (racluster - 1)) ||
+ (i == (maxra - 1)))
+ tbp->b_flags |= B_RAM;
+
+#if 0
+ if (tbp->b_usecount == 0) {
+ /*
+ * Make sure that the soon-to-be used readaheads
+ * are still there. The getblk/bqrelse pair will
+ * boost the priority of the buffer.
+ */
+ tbp = getblk(vp, lblkno+i, size, 0, 0);
+ bqrelse(tbp);
+ }
+#endif
+ }
+ splx(s);
+ if (i >= maxra) {
return 0;
}
- lblkno = vp->v_maxra;
+ lblkno += i;
+ }
+ reqbp = bp = NULL;
+ } else {
+ u_quad_t firstread;
+ firstread = (u_quad_t) lblkno * size;
+ if (firstread + totread > filesize)
+ totread = filesize - firstread;
+ if (totread > size) {
+ int nblks = 0;
+ int ncontigafter;
+ while (totread > 0) {
+ nblks++;
+ totread -= size;
+ }
+ if (nblks == 1)
+ goto single_block_read;
+ if (nblks > racluster)
+ nblks = racluster;
+
+ error = VOP_BMAP(vp, lblkno, NULL,
+ &blkno, &ncontigafter, NULL);
+ if (error)
+ goto single_block_read;
+ if (blkno == -1)
+ goto single_block_read;
+ if (ncontigafter == 0)
+ goto single_block_read;
+ if (ncontigafter + 1 < nblks)
+ nblks = ncontigafter + 1;
+
+ bp = cluster_rbuild(vp, filesize, lblkno,
+ blkno, size, nblks, bp);
+ lblkno += nblks;
} else {
+single_block_read:
+ /*
+ * if it isn't in the cache, then get a chunk from
+ * disk if sequential, otherwise just get the block.
+ */
+ bp->b_flags |= B_READ | B_RAM;
lblkno += 1;
}
- bp = NULL;
- } else {
- /*
- * if it isn't in the cache, then get a chunk from disk if
- * sequential, otherwise just get the block.
- */
- bp->b_flags |= B_READ;
- lblkno += 1;
- curproc->p_stats->p_ru.ru_inblock++; /* XXX */
- vp->v_ralen = 0;
}
- /*
- * assume no read-ahead
- */
- alreadyincore = 1;
- rablkno = lblkno;
/*
* if we have been doing sequential I/O, then do some read-ahead
*/
- if (seq) {
- alreadyincore = 0;
-
- /*
- * bump ralen a bit...
- */
- if ((vp->v_ralen + 1) < RA_MULTIPLE_SLOW*(MAXPHYS / size))
- ++vp->v_ralen;
+ rbp = NULL;
+ /* if (seqcount && (lblkno < (origblkno + maxra))) { */
+ if (seqcount && (lblkno < (origblkno + seqcount))) {
/*
- * this code makes sure that the stuff that we have read-ahead
- * is still in the cache. If it isn't, we have been reading
- * ahead too much, and we need to back-off, otherwise we might
- * try to read more.
+ * we now build the read-ahead buffer if it is desirable.
*/
- for (i = 0; i < vp->v_maxra - lblkno; i++) {
- rablkno = lblkno + i;
- alreadyincore = (int) incore(vp, rablkno);
- if (!alreadyincore) {
- vp->v_maxra = rablkno;
- vp->v_ralen >>= RA_SHIFTDOWN;
- alreadyincore = 1;
+ if (((u_quad_t)(lblkno + 1) * size) <= filesize &&
+ !(error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_ra, NULL)) &&
+ blkno != -1) {
+ int nblksread;
+ int ntoread = num_ra + 1;
+ nblksread = (origtotread + size - 1) / size;
+ if (seqcount < nblksread)
+ seqcount = nblksread;
+ if (seqcount < ntoread)
+ ntoread = seqcount;
+ if (num_ra) {
+ rbp = cluster_rbuild(vp, filesize, lblkno,
+ blkno, size, ntoread, NULL);
+ } else {
+ rbp = getblk(vp, lblkno, size, 0, 0);
+ rbp->b_flags |= B_READ | B_ASYNC | B_RAM;
+ rbp->b_blkno = blkno;
}
}
}
- /*
- * we now build the read-ahead buffer if it is desirable.
- */
- rbp = NULL;
- if (!alreadyincore &&
- ((u_quad_t)(rablkno + 1) * size) <= filesize &&
- !(error = VOP_BMAP(vp, rablkno, NULL, &blkno, &num_ra, NULL)) &&
- blkno != -1) {
- if (num_ra > vp->v_ralen)
- num_ra = vp->v_ralen;
-
- if (num_ra) {
- rbp = cluster_rbuild(vp, filesize, rablkno, blkno, size,
- num_ra + 1);
- } else {
- rbp = getblk(vp, rablkno, size, 0, 0);
- rbp->b_flags |= B_READ | B_ASYNC;
- rbp->b_blkno = blkno;
- }
- }
/*
* handle the synchronous read
*/
if (bp) {
- if (bp->b_flags & (B_DONE | B_DELWRI))
+ if (bp->b_flags & (B_DONE | B_DELWRI)) {
panic("cluster_read: DONE bp");
- else {
- vfs_busy_pages(bp, 0);
+ } else {
+#if defined(CLUSTERDEBUG)
+ if (rcluster)
+ printf("S(%d,%d,%d) ",
+ bp->b_lblkno, bp->b_bcount, seqcount);
+#endif
+ if ((bp->b_flags & B_CLUSTER) == 0)
+ vfs_busy_pages(bp, 0);
error = VOP_STRATEGY(bp);
- vp->v_maxra = bp->b_lblkno + bp->b_bcount / size;
- totreads++;
- totreadblocks += bp->b_bcount / size;
curproc->p_stats->p_ru.ru_inblock++;
}
}
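The hunk above replaces the per-vnode v_ralen/v_maxra read-ahead accounting with B_RAM ("read-ahead mark") flags on the buffers themselves: every racluster'th buffer of an issued run is marked, and a later cache hit on a marked buffer re-arms read-ahead instead of consulting counters. For scale, assuming MAXPHYS of 64KB, an 8KB block size and a 32KB request, racluster = 64K/8K = 8 and maxra = 2*8 + 32K/8K = 20, clamped by MAXRA (32) and nbuf/8. Below is a minimal user-space sketch of just the marking rule, with a simplified, hypothetical xbuf type standing in for the kernel's struct buf:

    #include <stdio.h>

    #define B_RAM 0x1000                    /* illustrative flag value only */

    struct xbuf {                           /* hypothetical stand-in for struct buf */
            int b_flags;
    };

    /*
     * Mark every racluster'th buffer and the final one, mirroring the
     * condition ((i % racluster) == racluster - 1) || (i == maxra - 1)
     * in the incore() scan above.
     */
    static void
    mark_readahead(struct xbuf *run, int n, int racluster)
    {
            for (int i = 1; i < n; i++)
                    if ((i % racluster) == (racluster - 1) || i == (n - 1))
                            run[i].b_flags |= B_RAM;
    }

    int
    main(void)
    {
            struct xbuf run[20] = { { 0 } };

            mark_readahead(run, 20, 8);
            for (int i = 0; i < 20; i++)
                    if (run[i].b_flags & B_RAM)
                            printf("buffer %d carries B_RAM\n", i);  /* 7, 15, 19 */
            return (0);
    }

A hit on buffer 7 therefore re-triggers clustered read-ahead well before the reader reaches buffer 19, keeping the pipeline full without the old v_ralen shift-down heuristics.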
@@ -242,7 +275,6 @@ cluster_read(vp, filesize, lblkno, size, cred, bpp)
* and if we have read-aheads, do them too
*/
if (rbp) {
- vp->v_maxra = rbp->b_lblkno + rbp->b_bcount / size;
if (error) {
rbp->b_flags &= ~(B_ASYNC | B_READ);
brelse(rbp);
@@ -250,17 +282,31 @@ cluster_read(vp, filesize, lblkno, size, cred, bpp)
rbp->b_flags &= ~(B_ASYNC | B_READ);
bqrelse(rbp);
} else {
+#if defined(CLUSTERDEBUG)
+ if (rcluster) {
+ if (bp)
+ printf("A+(%d,%d,%d,%d) ",
+ rbp->b_lblkno, rbp->b_bcount,
+ rbp->b_lblkno - origblkno,
+ seqcount);
+ else
+ printf("A(%d,%d,%d,%d) ",
+ rbp->b_lblkno, rbp->b_bcount,
+ rbp->b_lblkno - origblkno,
+ seqcount);
+ }
+#endif
+
if ((rbp->b_flags & B_CLUSTER) == 0)
vfs_busy_pages(rbp, 0);
(void) VOP_STRATEGY(rbp);
- totreads++;
- totreadblocks += rbp->b_bcount / size;
curproc->p_stats->p_ru.ru_inblock++;
}
}
- if (bp && ((bp->b_flags & B_ASYNC) == 0))
- return (biowait(bp));
- return (error);
+ if (reqbp)
+ return (biowait(reqbp));
+ else
+ return (error);
}
/*
@@ -269,13 +315,14 @@ cluster_read(vp, filesize, lblkno, size, cred, bpp)
* and then parcel them up into logical blocks in the buffer hash table.
*/
static struct buf *
-cluster_rbuild(vp, filesize, lbn, blkno, size, run)
+cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
struct vnode *vp;
u_quad_t filesize;
daddr_t lbn;
daddr_t blkno;
long size;
int run;
+ struct buf *fbp;
{
struct buf *bp, *tbp;
daddr_t bn;
@@ -293,12 +340,17 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run)
--run;
}
- tbp = getblk(vp, lbn, size, 0, 0);
- if (tbp->b_flags & B_CACHE)
- return tbp;
+ if (fbp) {
+ tbp = fbp;
+ tbp->b_flags |= B_READ;
+ } else {
+ tbp = getblk(vp, lbn, size, 0, 0);
+ if (tbp->b_flags & B_CACHE)
+ return tbp;
+ tbp->b_flags |= B_ASYNC | B_READ | B_RAM;
+ }
tbp->b_blkno = blkno;
- tbp->b_flags |= B_ASYNC | B_READ;
if( (tbp->b_flags & B_MALLOC) ||
((tbp->b_flags & B_VMIO) == 0) || (run <= 1) )
return tbp;
@@ -353,6 +405,8 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run)
break;
}
+ if ((fbp && (i == 1)) || (i == (run - 1)))
+ tbp->b_flags |= B_RAM;
tbp->b_flags |= B_READ | B_ASYNC;
if (tbp->b_blkno == tbp->b_lblkno) {
tbp->b_blkno = bn;
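The new fbp argument lets cluster_rbuild absorb the caller's synchronous buffer as the first component of the cluster, so only the pure read-ahead components run asynchronously, and B_RAM lands on the second buffer when fbp is present (i == 1) or on the last one in any case (i == run - 1). A condensed, hypothetical restatement of that flag selection (simplified names and illustrative flag values, not the kernel's definitions):

    #include <stdio.h>

    #define B_READ  0x0001
    #define B_ASYNC 0x0002
    #define B_RAM   0x1000

    /*
     * Flags for the i-th component of a cluster of `run' buffers;
     * caller_owned reflects whether the first buffer came in as fbp.
     */
    static int
    component_flags(int i, int run, int caller_owned)
    {
            int flags = B_READ;

            if (i > 0 || !caller_owned)
                    flags |= B_ASYNC;       /* read-ahead pieces are asynchronous */
            if ((caller_owned ? i == 1 : i == 0) || i == run - 1)
                    flags |= B_RAM;         /* re-arm point for future read-ahead */
            return (flags);
    }

    int
    main(void)
    {
            for (int i = 0; i < 4; i++)
                    printf("i=%d flags=%#x\n", i, component_flags(i, 4, 1));
            return (0);
    }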
@@ -419,9 +473,9 @@ cluster_callback(bp)
* Move memory from the large cluster buffer into the component
* buffers and mark IO as done on these.
*/
- for (tbp = bp->b_cluster.cluster_head.tqh_first;
+ for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head);
tbp; tbp = nbp) {
- nbp = tbp->b_cluster.cluster_entry.tqe_next;
+ nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry);
if (error) {
tbp->b_flags |= B_ERROR;
tbp->b_error = error;
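The final hunk converts cluster_callback's open-coded tqh_first/tqe_next pointer chasing to the queue(3) TAILQ_FIRST/TAILQ_NEXT macros. Fetching the successor before the current element is handed back is the standard delete-safe traversal; a minimal user-space sketch with a hypothetical element type:

    #include <sys/queue.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct elem {
            int id;
            TAILQ_ENTRY(elem) link;
    };

    TAILQ_HEAD(elist, elem);

    int
    main(void)
    {
            struct elist head = TAILQ_HEAD_INITIALIZER(head);
            struct elem *e, *next;

            for (int i = 0; i < 3; i++) {
                    e = malloc(sizeof(*e));
                    e->id = i;
                    TAILQ_INSERT_TAIL(&head, e, link);
            }
            /*
             * Grab the successor before releasing the current element,
             * just as cluster_callback saves nbp before finishing tbp.
             */
            for (e = TAILQ_FIRST(&head); e != NULL; e = next) {
                    next = TAILQ_NEXT(e, link);
                    printf("elem %d\n", e->id);
                    TAILQ_REMOVE(&head, e, link);
                    free(e);
            }
            return (0);
    }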