summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authordillon <dillon@FreeBSD.org>2000-04-02 00:55:28 +0000
committerdillon <dillon@FreeBSD.org>2000-04-02 00:55:28 +0000
commit057e33d02ca1d636be1b99e212ebb7911cf4fc62 (patch)
treec8d43beb4d47e12029b09bb06c919f2a25c818e1
parent0589fa18882d56c333936da3d552f6f1fb8a3fab (diff)
downloadFreeBSD-src-057e33d02ca1d636be1b99e212ebb7911cf4fc62.zip
FreeBSD-src-057e33d02ca1d636be1b99e212ebb7911cf4fc62.tar.gz
Change the write-behind code to take more care when starting
async I/Os. The sequential read heuristic has been extended to cover writes as well. We continue to call cluster_write() normally, so blocks in the file will still be reallocated for large (but still random) I/Os, but I/O will only be initiated for truly sequential writes. This solves a number of annoying situations, especially with DBM (hash method) writes, and also has the side effect of fixing a number of (stupid) benchmarks. Reviewed-by: mckusick
-rw-r--r--sys/gnu/ext2fs/ext2_readwrite.c4
-rw-r--r--sys/gnu/fs/ext2fs/ext2_readwrite.c4
-rw-r--r--sys/kern/vfs_cluster.c37
-rw-r--r--sys/kern/vfs_vnops.c63
-rw-r--r--sys/sys/bio.h2
-rw-r--r--sys/sys/buf.h2
-rw-r--r--sys/sys/file.h4
-rw-r--r--sys/sys/vnode.h3
-rw-r--r--sys/ufs/ufs/ufs_readwrite.c4
9 files changed, 80 insertions, 43 deletions
diff --git a/sys/gnu/ext2fs/ext2_readwrite.c b/sys/gnu/ext2fs/ext2_readwrite.c
index d49466d..f054a6e 100644
--- a/sys/gnu/ext2fs/ext2_readwrite.c
+++ b/sys/gnu/ext2fs/ext2_readwrite.c
@@ -175,9 +175,11 @@ WRITE(ap)
struct proc *p;
daddr_t lbn;
off_t osize;
+ int seqcount;
int blkoffset, error, flags, ioflag, resid, size, xfersize;
ioflag = ap->a_ioflag;
+ seqcount = ap->a_ioflag >> 16;
uio = ap->a_uio;
vp = ap->a_vp;
ip = VTOI(vp);
@@ -265,7 +267,7 @@ WRITE(ap)
} else if (xfersize + blkoffset == fs->s_frag_size) {
if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
bp->b_flags |= B_CLUSTEROK;
- cluster_write(bp, ip->i_size);
+ cluster_write(bp, ip->i_size, seqcount);
} else {
bawrite(bp);
}
diff --git a/sys/gnu/fs/ext2fs/ext2_readwrite.c b/sys/gnu/fs/ext2fs/ext2_readwrite.c
index d49466d..f054a6e 100644
--- a/sys/gnu/fs/ext2fs/ext2_readwrite.c
+++ b/sys/gnu/fs/ext2fs/ext2_readwrite.c
@@ -175,9 +175,11 @@ WRITE(ap)
struct proc *p;
daddr_t lbn;
off_t osize;
+ int seqcount;
int blkoffset, error, flags, ioflag, resid, size, xfersize;
ioflag = ap->a_ioflag;
+ seqcount = ap->a_ioflag >> 16;
uio = ap->a_uio;
vp = ap->a_vp;
ip = VTOI(vp);
@@ -265,7 +267,7 @@ WRITE(ap)
} else if (xfersize + blkoffset == fs->s_frag_size) {
if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
bp->b_flags |= B_CLUSTEROK;
- cluster_write(bp, ip->i_size);
+ cluster_write(bp, ip->i_size, seqcount);
} else {
bawrite(bp);
}
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index ac14bc4..d7e6f61 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -533,9 +533,10 @@ cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len)
* 4. end of a cluster - asynchronously write cluster
*/
void
-cluster_write(bp, filesize)
+cluster_write(bp, filesize, seqcount)
struct buf *bp;
u_quad_t filesize;
+ int seqcount;
{
struct vnode *vp;
daddr_t lbn;
@@ -570,13 +571,21 @@ cluster_write(bp, filesize)
* write, or we have reached our maximum cluster size,
* then push the previous cluster. Otherwise try
* reallocating to make it sequential.
+ *
+ * Change to algorithm: only push previous cluster if
+ * it was sequential from the point of view of the
+ * seqcount heuristic, otherwise leave the buffer
+ * intact so we can potentially optimize the I/O
+ * later on in the buf_daemon or update daemon
+ * flush.
*/
cursize = vp->v_lastw - vp->v_cstart + 1;
if (((u_quad_t) bp->b_offset + lblocksize) != filesize ||
lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
- if (!async)
+ if (!async && seqcount > 0) {
cluster_wbuild_wb(vp, lblocksize,
vp->v_cstart, cursize);
+ }
} else {
struct buf **bpp, **endbp;
struct cluster_save *buflist;
@@ -586,14 +595,22 @@ cluster_write(bp, filesize)
[buflist->bs_nchildren - 1];
if (VOP_REALLOCBLKS(vp, buflist)) {
/*
- * Failed, push the previous cluster.
+ * Failed, push the previous cluster
+ * if *really* writing sequentially
+ * in the logical file (seqcount > 1),
+ * otherwise delay it in the hopes that
+ * the low level disk driver can
+ * optimize the write ordering.
*/
for (bpp = buflist->bs_children;
bpp < endbp; bpp++)
brelse(*bpp);
free(buflist, M_SEGMENT);
- cluster_wbuild_wb(vp, lblocksize,
- vp->v_cstart, cursize);
+ if (seqcount > 1) {
+ cluster_wbuild_wb(vp,
+ lblocksize, vp->v_cstart,
+ cursize);
+ }
} else {
/*
* Succeeded, keep building cluster.
@@ -635,17 +652,21 @@ cluster_write(bp, filesize)
}
} else if (lbn == vp->v_cstart + vp->v_clen) {
/*
- * At end of cluster, write it out.
+ * At end of cluster, write it out if seqcount tells us we
+ * are operating sequentially, otherwise let the buf or
+ * update daemon handle it.
*/
bdwrite(bp);
- cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
+ if (seqcount > 1)
+ cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
vp->v_clen = 0;
vp->v_cstart = lbn + 1;
- } else
+ } else {
/*
* In the middle of a cluster, so just delay the I/O for now.
*/
bdwrite(bp);
+ }
vp->v_lastw = lbn;
vp->v_lasta = bp->b_blkno;
}
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index bb390ae..9f1a387 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -233,6 +233,37 @@ vn_close(vp, flags, cred, p)
return (error);
}
+static __inline
+int
+sequential_heuristic(struct uio *uio, struct file *fp)
+{
+ /*
+ * Sequential heuristic - detect sequential operation
+ */
+ if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
+ uio->uio_offset == fp->f_nextoff) {
+ /*
+ * XXX we assume that the filesystem block size is
+ * the default. Not true, but still gives us a pretty
+ * good indicator of how sequential the read operations
+ * are.
+ */
+ fp->f_seqcount += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
+ if (fp->f_seqcount >= 127)
+ fp->f_seqcount = 127;
+ return(fp->f_seqcount << 16);
+ }
+
+ /*
+ * Not sequential, quick draw-down of seqcount
+ */
+ if (fp->f_seqcount > 1)
+ fp->f_seqcount = 1;
+ else
+ fp->f_seqcount = 0;
+ return(0);
+}
+
/*
* Package up an I/O request on a vnode into a uio and do it.
*/
@@ -304,36 +335,12 @@ vn_read(fp, uio, cred, flags, p)
if ((flags & FOF_OFFSET) == 0)
uio->uio_offset = fp->f_offset;
- /*
- * Sequential read heuristic.
- * If we have been doing sequential input,
- * a rewind operation doesn't turn off
- * sequential input mode.
- */
- if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
- uio->uio_offset == fp->f_nextread) {
- int tmpseq = fp->f_seqcount;
- /*
- * XXX we assume that the filesystem block size is
- * the default. Not true, but still gives us a pretty
- * good indicator of how sequential the read operations
- * are.
- */
- tmpseq += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
- if (tmpseq >= 127)
- tmpseq = 127;
- fp->f_seqcount = tmpseq;
- ioflag |= fp->f_seqcount << 16;
- } else {
- if (fp->f_seqcount > 1)
- fp->f_seqcount = 1;
- else
- fp->f_seqcount = 0;
- }
+ ioflag |= sequential_heuristic(uio, fp);
+
error = VOP_READ(vp, uio, ioflag, cred);
if ((flags & FOF_OFFSET) == 0)
fp->f_offset = uio->uio_offset;
- fp->f_nextread = uio->uio_offset;
+ fp->f_nextoff = uio->uio_offset;
VOP_UNLOCK(vp, 0, p);
return (error);
}
@@ -370,9 +377,11 @@ vn_write(fp, uio, cred, flags, p)
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
if ((flags & FOF_OFFSET) == 0)
uio->uio_offset = fp->f_offset;
+ ioflag |= sequential_heuristic(uio, fp);
error = VOP_WRITE(vp, uio, ioflag, cred);
if ((flags & FOF_OFFSET) == 0)
fp->f_offset = uio->uio_offset;
+ fp->f_nextoff = uio->uio_offset;
VOP_UNLOCK(vp, 0, p);
return (error);
}
diff --git a/sys/sys/bio.h b/sys/sys/bio.h
index 4d672c5..8ebd3d0 100644
--- a/sys/sys/bio.h
+++ b/sys/sys/bio.h
@@ -496,7 +496,7 @@ void cluster_callback __P((struct buf *));
int cluster_read __P((struct vnode *, u_quad_t, daddr_t, long,
struct ucred *, long, int, struct buf **));
int cluster_wbuild __P((struct vnode *, long, daddr_t, int));
-void cluster_write __P((struct buf *, u_quad_t));
+void cluster_write __P((struct buf *, u_quad_t, int));
int physio __P((dev_t dev, struct uio *uio, int ioflag));
#define physread physio
#define physwrite physio
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index 4d672c5..8ebd3d0 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -496,7 +496,7 @@ void cluster_callback __P((struct buf *));
int cluster_read __P((struct vnode *, u_quad_t, daddr_t, long,
struct ucred *, long, int, struct buf **));
int cluster_wbuild __P((struct vnode *, long, daddr_t, int));
-void cluster_write __P((struct buf *, u_quad_t));
+void cluster_write __P((struct buf *, u_quad_t, int));
int physio __P((dev_t dev, struct uio *uio, int ioflag));
#define physread physio
#define physwrite physio
diff --git a/sys/sys/file.h b/sys/sys/file.h
index 9ed6ac4..7986425 100644
--- a/sys/sys/file.h
+++ b/sys/sys/file.h
@@ -84,8 +84,8 @@ struct file {
* count of sequential accesses -- cleared
* by most seek operations.
*/
- off_t f_nextread; /*
- * offset of next expected read
+ off_t f_nextoff; /*
+ * offset of next expected read or write
*/
off_t f_offset;
caddr_t f_data; /* vnode or socket */
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index f8b4f31..4053270 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -201,7 +201,8 @@ struct vattr {
#define VA_EXCLUSIVE 0x02 /* exclusive create request */
/*
- * Flags for ioflag. (high 16 bits used to ask for read-ahead)
+ * Flags for ioflag. (high 16 bits used to ask for read-ahead and
+ * help with write clustering)
*/
#define IO_UNIT 0x01 /* do I/O as atomic unit */
#define IO_APPEND 0x02 /* append write to end */
diff --git a/sys/ufs/ufs/ufs_readwrite.c b/sys/ufs/ufs/ufs_readwrite.c
index 62b0241..48088e9 100644
--- a/sys/ufs/ufs/ufs_readwrite.c
+++ b/sys/ufs/ufs/ufs_readwrite.c
@@ -379,10 +379,12 @@ WRITE(ap)
struct proc *p;
ufs_daddr_t lbn;
off_t osize;
+ int seqcount;
int blkoffset, error, extended, flags, ioflag, resid, size, xfersize;
vm_object_t object;
extended = 0;
+ seqcount = ap->a_ioflag >> 16;
ioflag = ap->a_ioflag;
uio = ap->a_uio;
vp = ap->a_vp;
@@ -492,7 +494,7 @@ WRITE(ap)
} else if (xfersize + blkoffset == fs->fs_bsize) {
if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
bp->b_flags |= B_CLUSTEROK;
- cluster_write(bp, ip->i_size);
+ cluster_write(bp, ip->i_size, seqcount);
} else {
bawrite(bp);
}
OpenPOWER on IntegriCloud