From 057e33d02ca1d636be1b99e212ebb7911cf4fc62 Mon Sep 17 00:00:00 2001
From: dillon
Date: Sun, 2 Apr 2000 00:55:28 +0000
Subject: Change the write-behind code to take more care when starting async I/O's.

The sequential read heuristic has been extended to cover writes as well.
We continue to call cluster_write() normally, thus blocks in the file will
still be reallocated for large (but still random) I/O's, but I/O will only
be initiated for truly sequential writes.

This solves a number of annoying situations, especially with DBM (hash
method) writes, and also has the side effect of fixing a number of
(stupid) benchmarks.

Reviewed-by: mckusick
---
 sys/gnu/ext2fs/ext2_readwrite.c    |  4 ++-
 sys/gnu/fs/ext2fs/ext2_readwrite.c |  4 ++-
 sys/kern/vfs_cluster.c             | 37 +++++++++++++++++-----
 sys/kern/vfs_vnops.c               | 63 ++++++++++++++++++++++----------------
 sys/sys/bio.h                      |  2 +-
 sys/sys/buf.h                      |  2 +-
 sys/sys/file.h                     |  4 +--
 sys/sys/vnode.h                    |  3 +-
 sys/ufs/ufs/ufs_readwrite.c        |  4 ++-
 9 files changed, 80 insertions(+), 43 deletions(-)

diff --git a/sys/gnu/ext2fs/ext2_readwrite.c b/sys/gnu/ext2fs/ext2_readwrite.c
index d49466d..f054a6e 100644
--- a/sys/gnu/ext2fs/ext2_readwrite.c
+++ b/sys/gnu/ext2fs/ext2_readwrite.c
@@ -175,9 +175,11 @@ WRITE(ap)
         struct proc *p;
         daddr_t lbn;
         off_t osize;
+        int seqcount;
         int blkoffset, error, flags, ioflag, resid, size, xfersize;
 
         ioflag = ap->a_ioflag;
+        seqcount = ap->a_ioflag >> 16;
         uio = ap->a_uio;
         vp = ap->a_vp;
         ip = VTOI(vp);
@@ -265,7 +267,7 @@ WRITE(ap)
                 } else if (xfersize + blkoffset == fs->s_frag_size) {
                         if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
                                 bp->b_flags |= B_CLUSTEROK;
-                                cluster_write(bp, ip->i_size);
+                                cluster_write(bp, ip->i_size, seqcount);
                         } else {
                                 bawrite(bp);
                         }
diff --git a/sys/gnu/fs/ext2fs/ext2_readwrite.c b/sys/gnu/fs/ext2fs/ext2_readwrite.c
index d49466d..f054a6e 100644
--- a/sys/gnu/fs/ext2fs/ext2_readwrite.c
+++ b/sys/gnu/fs/ext2fs/ext2_readwrite.c
@@ -175,9 +175,11 @@ WRITE(ap)
         struct proc *p;
         daddr_t lbn;
         off_t osize;
+        int seqcount;
         int blkoffset, error, flags, ioflag, resid, size, xfersize;
 
         ioflag = ap->a_ioflag;
+        seqcount = ap->a_ioflag >> 16;
         uio = ap->a_uio;
         vp = ap->a_vp;
         ip = VTOI(vp);
@@ -265,7 +267,7 @@ WRITE(ap)
                 } else if (xfersize + blkoffset == fs->s_frag_size) {
                         if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
                                 bp->b_flags |= B_CLUSTEROK;
-                                cluster_write(bp, ip->i_size);
+                                cluster_write(bp, ip->i_size, seqcount);
                         } else {
                                 bawrite(bp);
                         }
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index ac14bc4..d7e6f61 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -533,9 +533,10 @@ cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len)
  *      4.      end of a cluster - asynchronously write cluster
  */
 void
-cluster_write(bp, filesize)
+cluster_write(bp, filesize, seqcount)
         struct buf *bp;
         u_quad_t filesize;
+        int seqcount;
 {
         struct vnode *vp;
         daddr_t lbn;
@@ -570,13 +571,21 @@ cluster_write(bp, filesize)
                          * write, or we have reached our maximum cluster size,
                          * then push the previous cluster.  Otherwise try
                          * reallocating to make it sequential.
+                         *
+                         * Change to algorithm: only push previous cluster if
+                         * it was sequential from the point of view of the
+                         * seqcount heuristic, otherwise leave the buffer
+                         * intact so we can potentially optimize the I/O
+                         * later on in the buf_daemon or update daemon
+                         * flush.
                          */
                         cursize = vp->v_lastw - vp->v_cstart + 1;
                         if (((u_quad_t) bp->b_offset + lblocksize) != filesize ||
                             lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
-                                if (!async)
+                                if (!async && seqcount > 0) {
                                         cluster_wbuild_wb(vp, lblocksize,
                                                 vp->v_cstart, cursize);
+                                }
                         } else {
                                 struct buf **bpp, **endbp;
                                 struct cluster_save *buflist;
@@ -586,14 +595,22 @@
                                     [buflist->bs_nchildren - 1];
                                 if (VOP_REALLOCBLKS(vp, buflist)) {
                                         /*
-                                         * Failed, push the previous cluster.
+                                         * Failed, push the previous cluster
+                                         * if *really* writing sequentially
+                                         * in the logical file (seqcount > 1),
+                                         * otherwise delay it in the hopes that
+                                         * the low level disk driver can
+                                         * optimize the write ordering.
                                          */
                                         for (bpp = buflist->bs_children;
                                              bpp < endbp; bpp++)
                                                 brelse(*bpp);
                                         free(buflist, M_SEGMENT);
-                                        cluster_wbuild_wb(vp, lblocksize,
-                                                vp->v_cstart, cursize);
+                                        if (seqcount > 1) {
+                                                cluster_wbuild_wb(vp,
+                                                    lblocksize, vp->v_cstart,
+                                                    cursize);
+                                        }
                                 } else {
                                         /*
                                          * Succeeded, keep building cluster.
@@ -635,17 +652,21 @@
                 }
         } else if (lbn == vp->v_cstart + vp->v_clen) {
                 /*
-                 * At end of cluster, write it out.
+                 * At end of cluster, write it out if seqcount tells us we
+                 * are operating sequentially, otherwise let the buf or
+                 * update daemon handle it.
                  */
                 bdwrite(bp);
-                cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
+                if (seqcount > 1)
+                        cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
                 vp->v_clen = 0;
                 vp->v_cstart = lbn + 1;
-        } else
+        } else {
                 /*
                  * In the middle of a cluster, so just delay the I/O for now.
                  */
                 bdwrite(bp);
+        }
         vp->v_lastw = lbn;
         vp->v_lasta = bp->b_blkno;
 }
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index bb390ae..9f1a387 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -233,6 +233,37 @@ vn_close(vp, flags, cred, p)
         return (error);
 }
 
+static __inline
+int
+sequential_heuristic(struct uio *uio, struct file *fp)
+{
+        /*
+         * Sequential heuristic - detect sequential operation
+         */
+        if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
+            uio->uio_offset == fp->f_nextoff) {
+                /*
+                 * XXX we assume that the filesystem block size is
+                 * the default.  Not true, but still gives us a pretty
+                 * good indicator of how sequential the read operations
+                 * are.
+                 */
+                fp->f_seqcount += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
+                if (fp->f_seqcount >= 127)
+                        fp->f_seqcount = 127;
+                return(fp->f_seqcount << 16);
+        }
+
+        /*
+         * Not sequential, quick draw-down of seqcount
+         */
+        if (fp->f_seqcount > 1)
+                fp->f_seqcount = 1;
+        else
+                fp->f_seqcount = 0;
+        return(0);
+}
+
 /*
  * Package up an I/O request on a vnode into a uio and do it.
  */
@@ -304,36 +335,12 @@ vn_read(fp, uio, cred, flags, p)
         if ((flags & FOF_OFFSET) == 0)
                 uio->uio_offset = fp->f_offset;
 
-        /*
-         * Sequential read heuristic.
-         * If we have been doing sequential input,
-         * a rewind operation doesn't turn off
-         * sequential input mode.
-         */
-        if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
-            uio->uio_offset == fp->f_nextread) {
-                int tmpseq = fp->f_seqcount;
-                /*
-                 * XXX we assume that the filesystem block size is
-                 * the default.  Not true, but still gives us a pretty
-                 * good indicator of how sequential the read operations
-                 * are.
-                 */
-                tmpseq += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
-                if (tmpseq >= 127)
-                        tmpseq = 127;
-                fp->f_seqcount = tmpseq;
-                ioflag |= fp->f_seqcount << 16;
-        } else {
-                if (fp->f_seqcount > 1)
-                        fp->f_seqcount = 1;
-                else
-                        fp->f_seqcount = 0;
-        }
+        ioflag |= sequential_heuristic(uio, fp);
+
         error = VOP_READ(vp, uio, ioflag, cred);
         if ((flags & FOF_OFFSET) == 0)
                 fp->f_offset = uio->uio_offset;
-        fp->f_nextread = uio->uio_offset;
+        fp->f_nextoff = uio->uio_offset;
         VOP_UNLOCK(vp, 0, p);
         return (error);
 }
@@ -370,9 +377,11 @@ vn_write(fp, uio, cred, flags, p)
         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
         if ((flags & FOF_OFFSET) == 0)
                 uio->uio_offset = fp->f_offset;
+        ioflag |= sequential_heuristic(uio, fp);
         error = VOP_WRITE(vp, uio, ioflag, cred);
         if ((flags & FOF_OFFSET) == 0)
                 fp->f_offset = uio->uio_offset;
+        fp->f_nextoff = uio->uio_offset;
         VOP_UNLOCK(vp, 0, p);
         return (error);
 }
diff --git a/sys/sys/bio.h b/sys/sys/bio.h
index 4d672c5..8ebd3d0 100644
--- a/sys/sys/bio.h
+++ b/sys/sys/bio.h
@@ -496,7 +496,7 @@ void cluster_callback __P((struct buf *));
 int     cluster_read __P((struct vnode *, u_quad_t, daddr_t, long,
             struct ucred *, long, int, struct buf **));
 int     cluster_wbuild __P((struct vnode *, long, daddr_t, int));
-void    cluster_write __P((struct buf *, u_quad_t));
+void    cluster_write __P((struct buf *, u_quad_t, int));
 int     physio __P((dev_t dev, struct uio *uio, int ioflag));
 #define physread physio
 #define physwrite physio
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index 4d672c5..8ebd3d0 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -496,7 +496,7 @@ void cluster_callback __P((struct buf *));
 int     cluster_read __P((struct vnode *, u_quad_t, daddr_t, long,
             struct ucred *, long, int, struct buf **));
 int     cluster_wbuild __P((struct vnode *, long, daddr_t, int));
-void    cluster_write __P((struct buf *, u_quad_t));
+void    cluster_write __P((struct buf *, u_quad_t, int));
 int     physio __P((dev_t dev, struct uio *uio, int ioflag));
 #define physread physio
 #define physwrite physio
diff --git a/sys/sys/file.h b/sys/sys/file.h
index 9ed6ac4..7986425 100644
--- a/sys/sys/file.h
+++ b/sys/sys/file.h
@@ -84,8 +84,8 @@ struct file {
                                  * count of sequential accesses -- cleared
                                  * by most seek operations.
                                  */
-        off_t   f_nextread;     /*
-                                 * offset of next expected read
+        off_t   f_nextoff;      /*
+                                 * offset of next expected read or write
                                  */
         off_t   f_offset;
         caddr_t f_data;         /* vnode or socket */
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index f8b4f31..4053270 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -201,7 +201,8 @@ struct vattr {
 #define VA_EXCLUSIVE    0x02            /* exclusive create request */
 
 /*
- * Flags for ioflag. (high 16 bits used to ask for read-ahead)
+ * Flags for ioflag.  (high 16 bits used to ask for read-ahead and
+ *  help with write clustering)
  */
 #define IO_UNIT         0x01            /* do I/O as atomic unit */
 #define IO_APPEND       0x02            /* append write to end */
diff --git a/sys/ufs/ufs/ufs_readwrite.c b/sys/ufs/ufs/ufs_readwrite.c
index 62b0241..48088e9 100644
--- a/sys/ufs/ufs/ufs_readwrite.c
+++ b/sys/ufs/ufs/ufs_readwrite.c
@@ -379,10 +379,12 @@ WRITE(ap)
         struct proc *p;
         ufs_daddr_t lbn;
         off_t osize;
+        int seqcount;
         int blkoffset, error, extended, flags, ioflag, resid, size, xfersize;
         vm_object_t object;
 
         extended = 0;
+        seqcount = ap->a_ioflag >> 16;
         ioflag = ap->a_ioflag;
         uio = ap->a_uio;
         vp = ap->a_vp;
@@ -492,7 +494,7 @@ WRITE(ap)
                 } else if (xfersize + blkoffset == fs->fs_bsize) {
                         if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
                                 bp->b_flags |= B_CLUSTEROK;
-                                cluster_write(bp, ip->i_size);
+                                cluster_write(bp, ip->i_size, seqcount);
                         } else {
                                 bawrite(bp);
                         }
--
cgit v1.1
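For reference, the heuristic added above can be exercised outside the kernel.
The stand-alone sketch below models sequential_heuristic() and the
"ap->a_ioflag >> 16" decode that WRITE() now performs.  It is only an
illustration: struct file_state, SEQ_MAX, and the hard-wired BKVASIZE value
are simplified stand-ins for the kernel's struct file and tunables, not the
real interfaces.

/*
 * Sketch only: user-space model of the seqcount write heuristic.
 * file_state, SEQ_MAX and the BKVASIZE value are illustrative stand-ins.
 */
#include <stdio.h>

#define BKVASIZE        16384   /* assumed block size, as in the XXX comment */
#define SEQ_MAX         127     /* f_seqcount saturates here */

struct file_state {             /* stand-in for struct file */
        long    f_nextoff;      /* offset of next expected read or write */
        int     f_seqcount;     /* count of sequential accesses */
};

/* Returns the seqcount hint encoded in the high 16 bits of ioflag. */
static int
sequential_heuristic(long offset, long resid, struct file_state *fp)
{
        if ((offset == 0 && fp->f_seqcount > 0) || offset == fp->f_nextoff) {
                fp->f_seqcount += (int)((resid + BKVASIZE - 1) / BKVASIZE);
                if (fp->f_seqcount >= SEQ_MAX)
                        fp->f_seqcount = SEQ_MAX;
                return (fp->f_seqcount << 16);
        }
        /* Not sequential: draw seqcount down quickly. */
        fp->f_seqcount = (fp->f_seqcount > 1) ? 1 : 0;
        return (0);
}

int
main(void)
{
        struct file_state f = { 0, 0 };
        long off = 0, len = 8192;
        int i;

        for (i = 0; i < 4; i++) {       /* four back-to-back writes */
                int ioflag = sequential_heuristic(off, len, &f);
                int seqcount = ioflag >> 16;    /* what WRITE() recovers */

                printf("write at %8ld: seqcount %d\n", off, seqcount);
                f.f_nextoff = off + len;        /* what vn_write() records */
                off += len;
        }
        /* A random offset resets the hint, so the cluster push is skipped. */
        printf("random write:     seqcount %d\n",
            sequential_heuristic(1L << 20, len, &f) >> 16);
        return (0);
}

With f_nextoff tracking the end of the last transfer, back-to-back writes
raise seqcount, and cluster_write() is then allowed to push completed
clusters (seqcount > 1); an out-of-place write drops the hint, so the
delayed buffers are left for the buf_daemon or update daemon instead.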