diff options
author | jhb <jhb@FreeBSD.org> | 2011-04-28 14:27:17 +0000 |
---|---|---|
committer | jhb <jhb@FreeBSD.org> | 2011-04-28 14:27:17 +0000 |
commit | 574178d5e64458a7d476b3ea904f91a72301bfa4 (patch) | |
tree | 5c411cd76f416c4ad8905361c5890b5ff3a6729c /sys | |
parent | bc631ee68ae9daba50a7e7580cf277699922168f (diff) | |
download | FreeBSD-src-574178d5e64458a7d476b3ea904f91a72301bfa4.zip FreeBSD-src-574178d5e64458a7d476b3ea904f91a72301bfa4.tar.gz |
Sync with several changes in UFS/FFS:
- 77115: Implement support for O_DIRECT.
- 98425: Fix a performance issue introduced in 70131 that was causing
reads before writes even when writing full blocks.
- 98658: Rename the BALLOC flags from B_* to BA_* to avoid confusion with
the struct buf B_ flags.
- 100344: Merge the BA_ and IO_ flags so that they may both be used in
the same flags word. This merger is possible by assigning the IO_ flags
to the low sixteen bits and the BA_ flags to the high sixteen bits.
- 105422: Fix a file-rewrite performance case.
- 129545: Implement IO_INVAL in VOP_WRITE() by marking the buffer as
"no cache".
- Re-add the DOINGASYNC() macro and use it to control asynchronous writes.
Change i-node updates to honor DOINGASYNC() instead of always being
synchronous.
- Use a PRIV_VFS_RETAINSUGID check instead of checking cr_uid against 0
directly when deciding whether or not to clear suid and sgid bits.
Submitted by: Pedro F. Giffuni giffunip at yahoo
Diffstat (limited to 'sys')
-rw-r--r-- | sys/fs/ext2fs/ext2_balloc.c | 21 | ||||
-rw-r--r-- | sys/fs/ext2fs/ext2_extern.h | 12 | ||||
-rw-r--r-- | sys/fs/ext2fs/ext2_inode.c | 24 | ||||
-rw-r--r-- | sys/fs/ext2fs/ext2_lookup.c | 12 | ||||
-rw-r--r-- | sys/fs/ext2fs/ext2_readwrite.c | 153 | ||||
-rw-r--r-- | sys/fs/ext2fs/ext2_vnops.c | 10 | ||||
-rw-r--r-- | sys/fs/ext2fs/inode.h | 3 | ||||
-rw-r--r-- | sys/modules/ext2fs/Makefile | 2 |
8 files changed, 168 insertions, 69 deletions
diff --git a/sys/fs/ext2fs/ext2_balloc.c b/sys/fs/ext2fs/ext2_balloc.c index 124ac32..3fb1976 100644 --- a/sys/fs/ext2fs/ext2_balloc.c +++ b/sys/fs/ext2fs/ext2_balloc.c @@ -41,7 +41,7 @@ #include <sys/bio.h> #include <sys/buf.h> #include <sys/lock.h> -#include <sys/ucred.h> +#include <sys/mount.h> #include <sys/vnode.h> #include <fs/ext2fs/inode.h> @@ -143,7 +143,7 @@ ext2_balloc(ip, lbn, size, cred, bpp, flags) return (error); bp = getblk(vp, lbn, nsize, 0, 0, 0); bp->b_blkno = fsbtodb(fs, newb); - if (flags & B_CLRBUF) + if (flags & BA_CLRBUF) vfs_bio_clrbuf(bp); } ip->i_db[lbn] = dbtofsb(fs, bp->b_blkno); @@ -235,7 +235,7 @@ ext2_balloc(ip, lbn, size, cred, bpp, flags) * If required, write synchronously, otherwise use * delayed write. */ - if (flags & B_SYNC) { + if (flags & IO_SYNC) { bwrite(bp); } else { if (bp->b_bufsize == fs->e2fs_bsize) @@ -258,14 +258,14 @@ ext2_balloc(ip, lbn, size, cred, bpp, flags) nb = newb; nbp = getblk(vp, lbn, fs->e2fs_bsize, 0, 0, 0); nbp->b_blkno = fsbtodb(fs, nb); - if (flags & B_CLRBUF) + if (flags & BA_CLRBUF) vfs_bio_clrbuf(nbp); bap[indirs[i].in_off] = nb; /* * If required, write synchronously, otherwise use * delayed write. 
*/ - if (flags & B_SYNC) { + if (flags & IO_SYNC) { bwrite(bp); } else { if (bp->b_bufsize == fs->e2fs_bsize) @@ -276,8 +276,15 @@ ext2_balloc(ip, lbn, size, cred, bpp, flags) return (0); } brelse(bp); - if (flags & B_CLRBUF) { - error = bread(vp, lbn, (int)fs->e2fs_bsize, NOCRED, &nbp); + if (flags & BA_CLRBUF) { + int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT; + if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { + error = cluster_read(vp, ip->i_size, lbn, + (int)fs->e2fs_bsize, NOCRED, + MAXBSIZE, seqcount, &nbp); + } else { + error = bread(vp, lbn, (int)fs->e2fs_bsize, NOCRED, &nbp); + } if (error) { brelse(nbp); return (error); diff --git a/sys/fs/ext2fs/ext2_extern.h b/sys/fs/ext2fs/ext2_extern.h index 60905cb..821809f 100644 --- a/sys/fs/ext2fs/ext2_extern.h +++ b/sys/fs/ext2fs/ext2_extern.h @@ -81,11 +81,13 @@ int ext2_checkpath(struct inode *, struct inode *, struct ucred *); int cg_has_sb(int i); int ext2_inactive(struct vop_inactive_args *); -/* Flags to low-level allocation routines. */ -#define B_CLRBUF 0x01 /* Request allocated buffer be cleared. */ -#define B_SYNC 0x02 /* Do all allocations synchronously. */ -#define B_METAONLY 0x04 /* Return indirect block buffer. */ -#define B_NOWAIT 0x08 /* do not sleep to await lock */ +/* Flags to low-level allocation routines. + * The low 16-bits are reserved for IO_ flags from vnode.h. + */ +#define BA_CLRBUF 0x00010000 /* Clear invalid areas of buffer. */ +#define BA_SEQMASK 0x7F000000 /* Bits holding seq heuristic. 
*/ +#define BA_SEQSHIFT 24 +#define BA_SEQMAX 0x7F extern struct vop_vector ext2_vnodeops; extern struct vop_vector ext2_fifoops; diff --git a/sys/fs/ext2fs/ext2_inode.c b/sys/fs/ext2fs/ext2_inode.c index fc65a63..2768c52 100644 --- a/sys/fs/ext2fs/ext2_inode.c +++ b/sys/fs/ext2fs/ext2_inode.c @@ -92,7 +92,7 @@ ext2_update(vp, waitfor) } ext2_i2ei(ip, (struct ext2fs_dinode *)((char *)bp->b_data + EXT2_INODE_SIZE(fs) * ino_to_fsbo(fs, ip->i_number))); - if (waitfor && (vp->v_mount->mnt_kern_flag & MNTK_ASYNC) == 0) + if (waitfor && !DOINGASYNC(vp)) return (bwrite(bp)); else { bdwrite(bp); @@ -125,7 +125,7 @@ ext2_truncate(vp, length, flags, cred, td) struct buf *bp; int offset, size, level; long count, nblocks, blocksreleased = 0; - int aflags, error, i, allerror; + int error, i, allerror; off_t osize; oip = VTOI(ovp); @@ -164,10 +164,8 @@ ext2_truncate(vp, length, flags, cred, td) vnode_pager_setsize(ovp, length); offset = blkoff(fs, length - 1); lbn = lblkno(fs, length - 1); - aflags = B_CLRBUF; - if (flags & IO_SYNC) - aflags |= B_SYNC; - error = ext2_balloc(oip, lbn, offset + 1, cred, &bp, aflags); + flags |= BA_CLRBUF; + error = ext2_balloc(oip, lbn, offset + 1, cred, &bp, flags); if (error) { vnode_pager_setsize(vp, osize); return (error); @@ -175,9 +173,9 @@ ext2_truncate(vp, length, flags, cred, td) oip->i_size = length; if (bp->b_bufsize == fs->e2fs_bsize) bp->b_flags |= B_CLUSTEROK; - if (aflags & B_SYNC) + if (flags & IO_SYNC) bwrite(bp); - else if (ovp->v_mount->mnt_flag & MNT_ASYNC) + else if (DOINGASYNC(ovp)) bdwrite(bp); else bawrite(bp); @@ -197,10 +195,8 @@ ext2_truncate(vp, length, flags, cred, td) oip->i_size = length; } else { lbn = lblkno(fs, length); - aflags = B_CLRBUF; - if (flags & IO_SYNC) - aflags |= B_SYNC; - error = ext2_balloc(oip, lbn, offset, cred, &bp, aflags); + flags |= BA_CLRBUF; + error = ext2_balloc(oip, lbn, offset, cred, &bp, flags); if (error) return (error); oip->i_size = length; @@ -209,9 +205,9 @@ ext2_truncate(vp, length, 
flags, cred, td) allocbuf(bp, size); if (bp->b_bufsize == fs->e2fs_bsize) bp->b_flags |= B_CLUSTEROK; - if (aflags & B_SYNC) + if (flags & IO_SYNC) bwrite(bp); - else if (ovp->v_mount->mnt_flag & MNT_ASYNC) + else if (DOINGASYNC(ovp)) bdwrite(bp); else bawrite(bp); diff --git a/sys/fs/ext2fs/ext2_lookup.c b/sys/fs/ext2fs/ext2_lookup.c index ca6a8d2..5e23b34 100644 --- a/sys/fs/ext2fs/ext2_lookup.c +++ b/sys/fs/ext2fs/ext2_lookup.c @@ -890,7 +890,12 @@ ext2_direnter(ip, dvp, cnp) ep = (struct ext2fs_direct_2 *)((char *)ep + dsize); } bcopy((caddr_t)&newdir, (caddr_t)ep, (u_int)newentrysize); - error = bwrite(bp); + if (DOINGASYNC(dvp)) { + bdwrite(bp); + error = 0; + } else { + error = bwrite(bp); + } dp->i_flag |= IN_CHANGE | IN_UPDATE; if (!error && dp->i_endoff && dp->i_endoff < dp->i_size) error = ext2_truncate(dvp, (off_t)dp->i_endoff, IO_SYNC, @@ -947,7 +952,10 @@ ext2_dirremove(dvp, cnp) else rep = (struct ext2fs_direct_2 *)((char *)ep + ep->e2d_reclen); ep->e2d_reclen += rep->e2d_reclen; - error = bwrite(bp); + if (DOINGASYNC(dvp) && dp->i_count != 0) + bdwrite(bp); + else + error = bwrite(bp); dp->i_flag |= IN_CHANGE | IN_UPDATE; return (error); } diff --git a/sys/fs/ext2fs/ext2_readwrite.c b/sys/fs/ext2fs/ext2_readwrite.c index 1a713ca..a68ff34 100644 --- a/sys/fs/ext2fs/ext2_readwrite.c +++ b/sys/fs/ext2fs/ext2_readwrite.c @@ -45,6 +45,15 @@ #define WRITE ext2_write #define WRITE_S "ext2_write" +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pager.h> +#include <vm/vnode_pager.h> + +#include "opt_directio.h" + /* * Vnode op for reading. 
*/ @@ -66,15 +75,16 @@ READ(ap) off_t bytesinfile; long size, xfersize, blkoffset; int error, orig_resid, seqcount; - seqcount = ap->a_ioflag >> IO_SEQSHIFT; - u_short mode; + int ioflag; vp = ap->a_vp; - ip = VTOI(vp); - mode = ip->i_mode; uio = ap->a_uio; + ioflag = ap->a_ioflag; -#ifdef DIAGNOSTIC + seqcount = ap->a_ioflag >> IO_SEQSHIFT; + ip = VTOI(vp); + +#ifdef INVARIANTS if (uio->uio_rw != UIO_READ) panic("%s: mode", READ_S); @@ -90,8 +100,10 @@ READ(ap) return (0); KASSERT(uio->uio_offset >= 0, ("ext2_read: uio->uio_offset < 0")); fs = ip->I_FS; - if (uio->uio_offset < ip->i_size && uio->uio_offset >= fs->e2fs_maxfilesize) - return (EOVERFLOW); + if (uio->uio_offset < ip->i_size && + uio->uio_offset >= fs->e2fs_maxfilesize) + return (EOVERFLOW); + for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0) break; @@ -109,8 +121,8 @@ READ(ap) if (lblktosize(fs, nextlbn) >= ip->i_size) error = bread(vp, lbn, size, NOCRED, &bp); else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) - error = cluster_read(vp, ip->i_size, lbn, size, - NOCRED, blkoffset + uio->uio_resid, seqcount, &bp); + error = cluster_read(vp, ip->i_size, lbn, size, + NOCRED, blkoffset + uio->uio_resid, seqcount, &bp); else if (seqcount > 1) { int nextsize = BLKSIZE(fs, ip, nextlbn); error = breadn(vp, lbn, @@ -124,6 +136,15 @@ READ(ap) } /* + * If IO_DIRECT then set B_DIRECT for the buffer. This + * will cause us to attempt to release the buffer later on + * and will cause the buffer cache to attempt to free the + * underlying pages. + */ + if (ioflag & IO_DIRECT) + bp->b_flags |= B_DIRECT; + + /* * We should only get non-zero b_resid when an I/O error * has occurred, which should cause us to break above. 
* However, if the short read did not cause an error, @@ -141,10 +162,42 @@ READ(ap) if (error) break; - bqrelse(bp); + if ((ioflag & (IO_VMIO|IO_DIRECT)) && + (LIST_FIRST(&bp->b_dep) == NULL)) { + /* + * If there are no dependencies, and it's VMIO, + * then we don't need the buf, mark it available + * for freeing. The VM has the data. + */ + bp->b_flags |= B_RELBUF; + brelse(bp); + } else { + /* + * Otherwise let whoever + * made the request take care of + * freeing it. We just queue + * it onto another list. + */ + bqrelse(bp); + } + } + + /* + * This can only happen in the case of an error + * because the loop above resets bp to NULL on each iteration + * and on normal completion has not set a new value into it. + * so it must have come from a 'break' statement + */ + if (bp != NULL) { + if ((ioflag & (IO_VMIO|IO_DIRECT)) && + (LIST_FIRST(&bp->b_dep) == NULL)) { + bp->b_flags |= B_RELBUF; + brelse(bp); + } else { + bqrelse(bp); + } } - if (bp != NULL) - bqrelse(bp); + if ((error == 0 || uio->uio_resid != orig_resid) && (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) ip->i_flag |= IN_ACCESS; @@ -173,12 +226,13 @@ WRITE(ap) int blkoffset, error, flags, ioflag, resid, size, seqcount, xfersize; ioflag = ap->a_ioflag; - seqcount = ioflag >> IO_SEQSHIFT; uio = ap->a_uio; vp = ap->a_vp; + + seqcount = ioflag >> IO_SEQSHIFT; ip = VTOI(vp); -#ifdef DIAGNOSTIC +#ifdef INVARIANTS if (uio->uio_rw != UIO_WRITE) panic("%s: mode", WRITE_S); #endif @@ -217,7 +271,12 @@ WRITE(ap) resid = uio->uio_resid; osize = ip->i_size; - flags = ioflag & IO_SYNC ? 
B_SYNC : 0; + if (seqcount > BA_SEQMAX) + flags = BA_SEQMAX << BA_SEQSHIFT; + else + flags = seqcount << BA_SEQSHIFT; + if ((ioflag & IO_SYNC) && !DOINGASYNC(vp)) + flags |= IO_SYNC; for (error = 0; uio->uio_resid > 0;) { lbn = lblkno(fs, uio->uio_offset); @@ -228,17 +287,30 @@ WRITE(ap) if (uio->uio_offset + xfersize > ip->i_size) vnode_pager_setsize(vp, uio->uio_offset + xfersize); - /* - * Avoid a data-consistency race between write() and mmap() - * by ensuring that newly allocated blocks are zeroed. The - * race can occur even in the case where the write covers - * the entire block. - */ - flags |= B_CLRBUF; + /* + * We must perform a read-before-write if the transfer size + * does not cover the entire buffer. + */ + if (fs->e2fs_bsize > xfersize) + flags |= BA_CLRBUF; + else + flags &= ~BA_CLRBUF; error = ext2_balloc(ip, lbn, blkoffset + xfersize, - ap->a_cred, &bp, flags); + ap->a_cred, &bp, flags); if (error != 0) break; + + /* + * If the buffer is not valid and we did not clear garbage + * out above, we have to do so here even though the write + * covers the entire buffer in order to avoid a mmap()/write + * race where another process may see the garbage prior to + * the uiomove() for a write replacing it. + */ + if ((bp->b_flags & B_CACHE) == 0 && fs->e2fs_bsize <= xfersize) + vfs_bio_clrbuf(bp); + if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL)) + bp->b_flags |= B_NOCACHE; if (uio->uio_offset + xfersize > ip->i_size) ip->i_size = uio->uio_offset + xfersize; size = BLKSIZE(fs, ip, lbn) - bp->b_resid; @@ -247,12 +319,25 @@ WRITE(ap) error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); - if ((ioflag & IO_VMIO) && - LIST_FIRST(&bp->b_dep) == NULL) /* in ext2fs? */ + if ((ioflag & (IO_VMIO|IO_DIRECT)) && + (LIST_EMPTY(&bp->b_dep))) { /* in ext2fs? */ bp->b_flags |= B_RELBUF; + } + /* + * If IO_SYNC each buffer is written synchronously. Otherwise + * if we have a severe page deficiency write the buffer + * asynchronously. 
Otherwise try to cluster, and if that + * doesn't do it then either do an async write (if O_DIRECT), + * or a delayed write (if not). + */ if (ioflag & IO_SYNC) { (void)bwrite(bp); + } else if (vm_page_count_severe() || + buf_dirty_count_severe() || + (ioflag & IO_ASYNC)) { + bp->b_flags |= B_CLUSTEROK; + bawrite(bp); } else if (xfersize + blkoffset == fs->e2fs_fsize) { if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) { bp->b_flags |= B_CLUSTEROK; @@ -260,6 +345,9 @@ WRITE(ap) } else { bawrite(bp); } + } else if (ioflag & IO_DIRECT) { + bp->b_flags |= B_CLUSTEROK; + bawrite(bp); } else { bp->b_flags |= B_CLUSTEROK; bdwrite(bp); @@ -271,18 +359,13 @@ WRITE(ap) * If we successfully wrote any data, and we are not the superuser * we clear the setuid and setgid bits as a precaution against * tampering. - * XXX too late, the tamperer may have opened the file while we - * were writing the data (or before). - * XXX too early, if (error && ioflag & IO_UNIT) then we will - * unwrite the data. */ - if (resid > uio->uio_resid && ap->a_cred && ap->a_cred->cr_uid != 0) - ip->i_mode &= ~(ISUID | ISGID); + if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && + ap->a_cred) { + if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0)) + ip->i_mode &= ~(ISUID | ISGID); + } if (error) { - /* - * XXX should truncate to the last successfully written - * data if the uiomove() failed. 
- */ if (ioflag & IO_UNIT) { (void)ext2_truncate(vp, osize, ioflag & IO_SYNC, ap->a_cred, uio->uio_td); diff --git a/sys/fs/ext2fs/ext2_vnops.c b/sys/fs/ext2fs/ext2_vnops.c index 5333785..f9da170 100644 --- a/sys/fs/ext2fs/ext2_vnops.c +++ b/sys/fs/ext2fs/ext2_vnops.c @@ -738,7 +738,7 @@ ext2_link(ap) } ip->i_nlink++; ip->i_flag |= IN_CHANGE; - error = ext2_update(vp, 1); + error = ext2_update(vp, !DOINGASYNC(vp)); if (!error) error = ext2_direnter(ip, tdvp, cnp); if (error) { @@ -884,7 +884,7 @@ abortit: */ ip->i_nlink++; ip->i_flag |= IN_CHANGE; - if ((error = ext2_update(fvp, 1)) != 0) { + if ((error = ext2_update(fvp, !DOINGASYNC(fvp))) != 0) { VOP_UNLOCK(fvp, 0); goto bad; } @@ -943,7 +943,7 @@ abortit: } dp->i_nlink++; dp->i_flag |= IN_CHANGE; - error = ext2_update(tdvp, 1); + error = ext2_update(tdvp, !DOINGASYNC(tdvp)); if (error) goto bad; } @@ -1211,7 +1211,7 @@ ext2_mkdir(ap) */ dp->i_nlink++; dp->i_flag |= IN_CHANGE; - error = ext2_update(dvp, 1); + error = ext2_update(dvp, !DOINGASYNC(dvp)); if (error) goto bad; @@ -1655,7 +1655,7 @@ ext2_makeinode(mode, dvp, vpp, cnp) /* * Make sure inode goes to disk before directory entry. */ - error = ext2_update(tvp, 1); + error = ext2_update(tvp, !DOINGASYNC(tvp)); if (error) goto bad; error = ext2_direnter(ip, dvp, cnp); diff --git a/sys/fs/ext2fs/inode.h b/sys/fs/ext2fs/inode.h index ae794d7..92a84ac 100644 --- a/sys/fs/ext2fs/inode.h +++ b/sys/fs/ext2fs/inode.h @@ -158,6 +158,9 @@ struct indir { #define VTOI(vp) ((struct inode *)(vp)->v_data) #define ITOV(ip) ((ip)->i_vnode) +/* Check whether the MNTK_ASYNC flag has been set for a mount point */ +#define DOINGASYNC(vp) ((vp)->v_mount->mnt_kern_flag & MNTK_ASYNC) + /* This overlays the fid structure (see mount.h). */ struct ufid { uint16_t ufid_len; /* Length of structure. 
*/ diff --git a/sys/modules/ext2fs/Makefile b/sys/modules/ext2fs/Makefile index d9ab969..63726b5 100644 --- a/sys/modules/ext2fs/Makefile +++ b/sys/modules/ext2fs/Makefile @@ -2,7 +2,7 @@ .PATH: ${.CURDIR}/../../fs/ext2fs KMOD= ext2fs -SRCS= opt_ddb.h opt_quota.h opt_suiddir.h vnode_if.h \ +SRCS= opt_ddb.h opt_directio.h opt_quota.h opt_suiddir.h vnode_if.h \ ext2_alloc.c ext2_balloc.c ext2_bmap.c ext2_inode.c \ ext2_inode_cnv.c ext2_lookup.c ext2_subr.c ext2_vfsops.c \ ext2_vnops.c |