diff options
author | dillon <dillon@FreeBSD.org> | 2000-04-02 00:55:28 +0000 |
---|---|---|
committer | dillon <dillon@FreeBSD.org> | 2000-04-02 00:55:28 +0000 |
commit | 057e33d02ca1d636be1b99e212ebb7911cf4fc62 (patch) | |
tree | c8d43beb4d47e12029b09bb06c919f2a25c818e1 /sys/kern | |
parent | 0589fa18882d56c333936da3d552f6f1fb8a3fab (diff) | |
download | FreeBSD-src-057e33d02ca1d636be1b99e212ebb7911cf4fc62.zip FreeBSD-src-057e33d02ca1d636be1b99e212ebb7911cf4fc62.tar.gz |
Change the write-behind code to take more care when starting
async I/O's. The sequential read heuristic has been extended to
cover writes as well. We continue to call cluster_write() normally,
thus blocks in the file will still be reallocated for large (but still
random) I/O's, but I/O will only be initiated for truly sequential
writes.
This solves a number of annoying situations, especially with DBM (hash
method) writes, and also has the side effect of fixing a number of
(stupid) benchmarks.
Reviewed-by: mckusick
Diffstat (limited to 'sys/kern')
-rw-r--r-- | sys/kern/vfs_cluster.c | 37 | ||||
-rw-r--r-- | sys/kern/vfs_vnops.c | 63 |
2 files changed, 65 insertions, 35 deletions
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c index ac14bc4..d7e6f61 100644 --- a/sys/kern/vfs_cluster.c +++ b/sys/kern/vfs_cluster.c @@ -533,9 +533,10 @@ cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len) * 4. end of a cluster - asynchronously write cluster */ void -cluster_write(bp, filesize) +cluster_write(bp, filesize, seqcount) struct buf *bp; u_quad_t filesize; + int seqcount; { struct vnode *vp; daddr_t lbn; @@ -570,13 +571,21 @@ cluster_write(bp, filesize) * write, or we have reached our maximum cluster size, * then push the previous cluster. Otherwise try * reallocating to make it sequential. + * + * Change to algorithm: only push previous cluster if + * it was sequential from the point of view of the + * seqcount heuristic, otherwise leave the buffer + * intact so we can potentially optimize the I/O + * later on in the buf_daemon or update daemon + * flush. */ cursize = vp->v_lastw - vp->v_cstart + 1; if (((u_quad_t) bp->b_offset + lblocksize) != filesize || lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { - if (!async) + if (!async && seqcount > 0) { cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, cursize); + } } else { struct buf **bpp, **endbp; struct cluster_save *buflist; @@ -586,14 +595,22 @@ cluster_write(bp, filesize) [buflist->bs_nchildren - 1]; if (VOP_REALLOCBLKS(vp, buflist)) { /* - * Failed, push the previous cluster. + * Failed, push the previous cluster + * if *really* writing sequentially + * in the logical file (seqcount > 1), + * otherwise delay it in the hopes that + * the low level disk driver can + * optimize the write ordering. */ for (bpp = buflist->bs_children; bpp < endbp; bpp++) brelse(*bpp); free(buflist, M_SEGMENT); - cluster_wbuild_wb(vp, lblocksize, - vp->v_cstart, cursize); + if (seqcount > 1) { + cluster_wbuild_wb(vp, + lblocksize, vp->v_cstart, + cursize); + } } else { /* * Succeeded, keep building cluster. 
@@ -635,17 +652,21 @@ cluster_write(bp, filesize) } } else if (lbn == vp->v_cstart + vp->v_clen) { /* - * At end of cluster, write it out. + * At end of cluster, write it out if seqcount tells us we + * are operating sequentially, otherwise let the buf or + * update daemon handle it. */ bdwrite(bp); - cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1); + if (seqcount > 1) + cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1); vp->v_clen = 0; vp->v_cstart = lbn + 1; - } else + } else { /* * In the middle of a cluster, so just delay the I/O for now. */ bdwrite(bp); + } vp->v_lastw = lbn; vp->v_lasta = bp->b_blkno; } diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index bb390ae..9f1a387 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -233,6 +233,37 @@ vn_close(vp, flags, cred, p) return (error); } +static __inline +int +sequential_heuristic(struct uio *uio, struct file *fp) +{ + /* + * Sequential heuristic - detect sequential operation + */ + if ((uio->uio_offset == 0 && fp->f_seqcount > 0) || + uio->uio_offset == fp->f_nextoff) { + /* + * XXX we assume that the filesystem block size is + * the default. Not true, but still gives us a pretty + * good indicator of how sequential the read operations + * are. + */ + fp->f_seqcount += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE; + if (fp->f_seqcount >= 127) + fp->f_seqcount = 127; + return(fp->f_seqcount << 16); + } + + /* + * Not sequential, quick draw-down of seqcount + */ + if (fp->f_seqcount > 1) + fp->f_seqcount = 1; + else + fp->f_seqcount = 0; + return(0); +} + /* * Package up an I/O request on a vnode into a uio and do it. */ @@ -304,36 +335,12 @@ vn_read(fp, uio, cred, flags, p) if ((flags & FOF_OFFSET) == 0) uio->uio_offset = fp->f_offset; - /* - * Sequential read heuristic. - * If we have been doing sequential input, - * a rewind operation doesn't turn off - * sequential input mode. 
- */ - if ((uio->uio_offset == 0 && fp->f_seqcount > 0) || - uio->uio_offset == fp->f_nextread) { - int tmpseq = fp->f_seqcount; - /* - * XXX we assume that the filesystem block size is - * the default. Not true, but still gives us a pretty - * good indicator of how sequential the read operations - * are. - */ - tmpseq += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE; - if (tmpseq >= 127) - tmpseq = 127; - fp->f_seqcount = tmpseq; - ioflag |= fp->f_seqcount << 16; - } else { - if (fp->f_seqcount > 1) - fp->f_seqcount = 1; - else - fp->f_seqcount = 0; - } + ioflag |= sequential_heuristic(uio, fp); + error = VOP_READ(vp, uio, ioflag, cred); if ((flags & FOF_OFFSET) == 0) fp->f_offset = uio->uio_offset; - fp->f_nextread = uio->uio_offset; + fp->f_nextoff = uio->uio_offset; VOP_UNLOCK(vp, 0, p); return (error); } @@ -370,9 +377,11 @@ vn_write(fp, uio, cred, flags, p) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); if ((flags & FOF_OFFSET) == 0) uio->uio_offset = fp->f_offset; + ioflag |= sequential_heuristic(uio, fp); error = VOP_WRITE(vp, uio, ioflag, cred); if ((flags & FOF_OFFSET) == 0) fp->f_offset = uio->uio_offset; + fp->f_nextoff = uio->uio_offset; VOP_UNLOCK(vp, 0, p); return (error); } |