summaryrefslogtreecommitdiffstats
path: root/sys
diff options
context:
space:
mode:
authorjhb <jhb@FreeBSD.org>2011-04-28 14:27:17 +0000
committerjhb <jhb@FreeBSD.org>2011-04-28 14:27:17 +0000
commit574178d5e64458a7d476b3ea904f91a72301bfa4 (patch)
tree5c411cd76f416c4ad8905361c5890b5ff3a6729c /sys
parentbc631ee68ae9daba50a7e7580cf277699922168f (diff)
downloadFreeBSD-src-574178d5e64458a7d476b3ea904f91a72301bfa4.zip
FreeBSD-src-574178d5e64458a7d476b3ea904f91a72301bfa4.tar.gz
Sync with several changes in UFS/FFS:
- 77115: Implement support for O_DIRECT.
- 98425: Fix a performance issue introduced in 70131 that was causing reads before writes even when writing full blocks.
- 98658: Rename the BALLOC flags from B_* to BA_* to avoid confusion with the struct buf B_ flags.
- 100344: Merge the BA_ and IO_ flags so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags to the high sixteen bits.
- 105422: Fix a file-rewrite performance case.
- 129545: Implement IO_INVAL in VOP_WRITE() by marking the buffer as "no cache".
- Re-add the DOINGASYNC() macro and use it to control asynchronous writes. Change i-node updates to honor DOINGASYNC() instead of always being synchronous.
- Use a PRIV_VFS_RETAINSUGID check instead of checking cr_uid against 0 directly when deciding whether or not to clear suid and sgid bits.
Submitted by: Pedro F. Giffuni giffunip at yahoo
Diffstat (limited to 'sys')
-rw-r--r--sys/fs/ext2fs/ext2_balloc.c21
-rw-r--r--sys/fs/ext2fs/ext2_extern.h12
-rw-r--r--sys/fs/ext2fs/ext2_inode.c24
-rw-r--r--sys/fs/ext2fs/ext2_lookup.c12
-rw-r--r--sys/fs/ext2fs/ext2_readwrite.c153
-rw-r--r--sys/fs/ext2fs/ext2_vnops.c10
-rw-r--r--sys/fs/ext2fs/inode.h3
-rw-r--r--sys/modules/ext2fs/Makefile2
8 files changed, 168 insertions, 69 deletions
diff --git a/sys/fs/ext2fs/ext2_balloc.c b/sys/fs/ext2fs/ext2_balloc.c
index 124ac32..3fb1976 100644
--- a/sys/fs/ext2fs/ext2_balloc.c
+++ b/sys/fs/ext2fs/ext2_balloc.c
@@ -41,7 +41,7 @@
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/lock.h>
-#include <sys/ucred.h>
+#include <sys/mount.h>
#include <sys/vnode.h>
#include <fs/ext2fs/inode.h>
@@ -143,7 +143,7 @@ ext2_balloc(ip, lbn, size, cred, bpp, flags)
return (error);
bp = getblk(vp, lbn, nsize, 0, 0, 0);
bp->b_blkno = fsbtodb(fs, newb);
- if (flags & B_CLRBUF)
+ if (flags & BA_CLRBUF)
vfs_bio_clrbuf(bp);
}
ip->i_db[lbn] = dbtofsb(fs, bp->b_blkno);
@@ -235,7 +235,7 @@ ext2_balloc(ip, lbn, size, cred, bpp, flags)
* If required, write synchronously, otherwise use
* delayed write.
*/
- if (flags & B_SYNC) {
+ if (flags & IO_SYNC) {
bwrite(bp);
} else {
if (bp->b_bufsize == fs->e2fs_bsize)
@@ -258,14 +258,14 @@ ext2_balloc(ip, lbn, size, cred, bpp, flags)
nb = newb;
nbp = getblk(vp, lbn, fs->e2fs_bsize, 0, 0, 0);
nbp->b_blkno = fsbtodb(fs, nb);
- if (flags & B_CLRBUF)
+ if (flags & BA_CLRBUF)
vfs_bio_clrbuf(nbp);
bap[indirs[i].in_off] = nb;
/*
* If required, write synchronously, otherwise use
* delayed write.
*/
- if (flags & B_SYNC) {
+ if (flags & IO_SYNC) {
bwrite(bp);
} else {
if (bp->b_bufsize == fs->e2fs_bsize)
@@ -276,8 +276,15 @@ ext2_balloc(ip, lbn, size, cred, bpp, flags)
return (0);
}
brelse(bp);
- if (flags & B_CLRBUF) {
- error = bread(vp, lbn, (int)fs->e2fs_bsize, NOCRED, &nbp);
+ if (flags & BA_CLRBUF) {
+ int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
+ if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
+ error = cluster_read(vp, ip->i_size, lbn,
+ (int)fs->e2fs_bsize, NOCRED,
+ MAXBSIZE, seqcount, &nbp);
+ } else {
+ error = bread(vp, lbn, (int)fs->e2fs_bsize, NOCRED, &nbp);
+ }
if (error) {
brelse(nbp);
return (error);
diff --git a/sys/fs/ext2fs/ext2_extern.h b/sys/fs/ext2fs/ext2_extern.h
index 60905cb..821809f 100644
--- a/sys/fs/ext2fs/ext2_extern.h
+++ b/sys/fs/ext2fs/ext2_extern.h
@@ -81,11 +81,13 @@ int ext2_checkpath(struct inode *, struct inode *, struct ucred *);
int cg_has_sb(int i);
int ext2_inactive(struct vop_inactive_args *);
-/* Flags to low-level allocation routines. */
-#define B_CLRBUF 0x01 /* Request allocated buffer be cleared. */
-#define B_SYNC 0x02 /* Do all allocations synchronously. */
-#define B_METAONLY 0x04 /* Return indirect block buffer. */
-#define B_NOWAIT 0x08 /* do not sleep to await lock */
+/* Flags to low-level allocation routines.
+ * The low 16-bits are reserved for IO_ flags from vnode.h.
+ */
+#define BA_CLRBUF 0x00010000 /* Clear invalid areas of buffer. */
+#define BA_SEQMASK 0x7F000000 /* Bits holding seq heuristic. */
+#define BA_SEQSHIFT 24
+#define BA_SEQMAX 0x7F
extern struct vop_vector ext2_vnodeops;
extern struct vop_vector ext2_fifoops;
diff --git a/sys/fs/ext2fs/ext2_inode.c b/sys/fs/ext2fs/ext2_inode.c
index fc65a63..2768c52 100644
--- a/sys/fs/ext2fs/ext2_inode.c
+++ b/sys/fs/ext2fs/ext2_inode.c
@@ -92,7 +92,7 @@ ext2_update(vp, waitfor)
}
ext2_i2ei(ip, (struct ext2fs_dinode *)((char *)bp->b_data +
EXT2_INODE_SIZE(fs) * ino_to_fsbo(fs, ip->i_number)));
- if (waitfor && (vp->v_mount->mnt_kern_flag & MNTK_ASYNC) == 0)
+ if (waitfor && !DOINGASYNC(vp))
return (bwrite(bp));
else {
bdwrite(bp);
@@ -125,7 +125,7 @@ ext2_truncate(vp, length, flags, cred, td)
struct buf *bp;
int offset, size, level;
long count, nblocks, blocksreleased = 0;
- int aflags, error, i, allerror;
+ int error, i, allerror;
off_t osize;
oip = VTOI(ovp);
@@ -164,10 +164,8 @@ ext2_truncate(vp, length, flags, cred, td)
vnode_pager_setsize(ovp, length);
offset = blkoff(fs, length - 1);
lbn = lblkno(fs, length - 1);
- aflags = B_CLRBUF;
- if (flags & IO_SYNC)
- aflags |= B_SYNC;
- error = ext2_balloc(oip, lbn, offset + 1, cred, &bp, aflags);
+ flags |= BA_CLRBUF;
+ error = ext2_balloc(oip, lbn, offset + 1, cred, &bp, flags);
if (error) {
vnode_pager_setsize(vp, osize);
return (error);
@@ -175,9 +173,9 @@ ext2_truncate(vp, length, flags, cred, td)
oip->i_size = length;
if (bp->b_bufsize == fs->e2fs_bsize)
bp->b_flags |= B_CLUSTEROK;
- if (aflags & B_SYNC)
+ if (flags & IO_SYNC)
bwrite(bp);
- else if (ovp->v_mount->mnt_flag & MNT_ASYNC)
+ else if (DOINGASYNC(ovp))
bdwrite(bp);
else
bawrite(bp);
@@ -197,10 +195,8 @@ ext2_truncate(vp, length, flags, cred, td)
oip->i_size = length;
} else {
lbn = lblkno(fs, length);
- aflags = B_CLRBUF;
- if (flags & IO_SYNC)
- aflags |= B_SYNC;
- error = ext2_balloc(oip, lbn, offset, cred, &bp, aflags);
+ flags |= BA_CLRBUF;
+ error = ext2_balloc(oip, lbn, offset, cred, &bp, flags);
if (error)
return (error);
oip->i_size = length;
@@ -209,9 +205,9 @@ ext2_truncate(vp, length, flags, cred, td)
allocbuf(bp, size);
if (bp->b_bufsize == fs->e2fs_bsize)
bp->b_flags |= B_CLUSTEROK;
- if (aflags & B_SYNC)
+ if (flags & IO_SYNC)
bwrite(bp);
- else if (ovp->v_mount->mnt_flag & MNT_ASYNC)
+ else if (DOINGASYNC(ovp))
bdwrite(bp);
else
bawrite(bp);
diff --git a/sys/fs/ext2fs/ext2_lookup.c b/sys/fs/ext2fs/ext2_lookup.c
index ca6a8d2..5e23b34 100644
--- a/sys/fs/ext2fs/ext2_lookup.c
+++ b/sys/fs/ext2fs/ext2_lookup.c
@@ -890,7 +890,12 @@ ext2_direnter(ip, dvp, cnp)
ep = (struct ext2fs_direct_2 *)((char *)ep + dsize);
}
bcopy((caddr_t)&newdir, (caddr_t)ep, (u_int)newentrysize);
- error = bwrite(bp);
+ if (DOINGASYNC(dvp)) {
+ bdwrite(bp);
+ error = 0;
+ } else {
+ error = bwrite(bp);
+ }
dp->i_flag |= IN_CHANGE | IN_UPDATE;
if (!error && dp->i_endoff && dp->i_endoff < dp->i_size)
error = ext2_truncate(dvp, (off_t)dp->i_endoff, IO_SYNC,
@@ -947,7 +952,10 @@ ext2_dirremove(dvp, cnp)
else
rep = (struct ext2fs_direct_2 *)((char *)ep + ep->e2d_reclen);
ep->e2d_reclen += rep->e2d_reclen;
- error = bwrite(bp);
+ if (DOINGASYNC(dvp) && dp->i_count != 0)
+ bdwrite(bp);
+ else
+ error = bwrite(bp);
dp->i_flag |= IN_CHANGE | IN_UPDATE;
return (error);
}
diff --git a/sys/fs/ext2fs/ext2_readwrite.c b/sys/fs/ext2fs/ext2_readwrite.c
index 1a713ca..a68ff34 100644
--- a/sys/fs/ext2fs/ext2_readwrite.c
+++ b/sys/fs/ext2fs/ext2_readwrite.c
@@ -45,6 +45,15 @@
#define WRITE ext2_write
#define WRITE_S "ext2_write"
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+#include <vm/vnode_pager.h>
+
+#include "opt_directio.h"
+
/*
* Vnode op for reading.
*/
@@ -66,15 +75,16 @@ READ(ap)
off_t bytesinfile;
long size, xfersize, blkoffset;
int error, orig_resid, seqcount;
- seqcount = ap->a_ioflag >> IO_SEQSHIFT;
- u_short mode;
+ int ioflag;
vp = ap->a_vp;
- ip = VTOI(vp);
- mode = ip->i_mode;
uio = ap->a_uio;
+ ioflag = ap->a_ioflag;
-#ifdef DIAGNOSTIC
+ seqcount = ap->a_ioflag >> IO_SEQSHIFT;
+ ip = VTOI(vp);
+
+#ifdef INVARIANTS
if (uio->uio_rw != UIO_READ)
panic("%s: mode", READ_S);
@@ -90,8 +100,10 @@ READ(ap)
return (0);
KASSERT(uio->uio_offset >= 0, ("ext2_read: uio->uio_offset < 0"));
fs = ip->I_FS;
- if (uio->uio_offset < ip->i_size && uio->uio_offset >= fs->e2fs_maxfilesize)
- return (EOVERFLOW);
+ if (uio->uio_offset < ip->i_size &&
+ uio->uio_offset >= fs->e2fs_maxfilesize)
+ return (EOVERFLOW);
+
for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
break;
@@ -109,8 +121,8 @@ READ(ap)
if (lblktosize(fs, nextlbn) >= ip->i_size)
error = bread(vp, lbn, size, NOCRED, &bp);
else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0)
- error = cluster_read(vp, ip->i_size, lbn, size,
- NOCRED, blkoffset + uio->uio_resid, seqcount, &bp);
+ error = cluster_read(vp, ip->i_size, lbn, size,
+ NOCRED, blkoffset + uio->uio_resid, seqcount, &bp);
else if (seqcount > 1) {
int nextsize = BLKSIZE(fs, ip, nextlbn);
error = breadn(vp, lbn,
@@ -124,6 +136,15 @@ READ(ap)
}
/*
+ * If IO_DIRECT then set B_DIRECT for the buffer. This
+ * will cause us to attempt to release the buffer later on
+ * and will cause the buffer cache to attempt to free the
+ * underlying pages.
+ */
+ if (ioflag & IO_DIRECT)
+ bp->b_flags |= B_DIRECT;
+
+ /*
* We should only get non-zero b_resid when an I/O error
* has occurred, which should cause us to break above.
* However, if the short read did not cause an error,
@@ -141,10 +162,42 @@ READ(ap)
if (error)
break;
- bqrelse(bp);
+ if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
+ (LIST_FIRST(&bp->b_dep) == NULL)) {
+ /*
+ * If there are no dependencies, and it's VMIO,
+ * then we don't need the buf, mark it available
+ * for freeing. The VM has the data.
+ */
+ bp->b_flags |= B_RELBUF;
+ brelse(bp);
+ } else {
+ /*
+ * Otherwise let whoever
+ * made the request take care of
+ * freeing it. We just queue
+ * it onto another list.
+ */
+ bqrelse(bp);
+ }
+ }
+
+ /*
+ * This can only happen in the case of an error
+ * because the loop above resets bp to NULL on each iteration
+ * and on normal completion has not set a new value into it.
+ * so it must have come from a 'break' statement
+ */
+ if (bp != NULL) {
+ if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
+ (LIST_FIRST(&bp->b_dep) == NULL)) {
+ bp->b_flags |= B_RELBUF;
+ brelse(bp);
+ } else {
+ bqrelse(bp);
+ }
}
- if (bp != NULL)
- bqrelse(bp);
+
if ((error == 0 || uio->uio_resid != orig_resid) &&
(vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
ip->i_flag |= IN_ACCESS;
@@ -173,12 +226,13 @@ WRITE(ap)
int blkoffset, error, flags, ioflag, resid, size, seqcount, xfersize;
ioflag = ap->a_ioflag;
- seqcount = ioflag >> IO_SEQSHIFT;
uio = ap->a_uio;
vp = ap->a_vp;
+
+ seqcount = ioflag >> IO_SEQSHIFT;
ip = VTOI(vp);
-#ifdef DIAGNOSTIC
+#ifdef INVARIANTS
if (uio->uio_rw != UIO_WRITE)
panic("%s: mode", WRITE_S);
#endif
@@ -217,7 +271,12 @@ WRITE(ap)
resid = uio->uio_resid;
osize = ip->i_size;
- flags = ioflag & IO_SYNC ? B_SYNC : 0;
+ if (seqcount > BA_SEQMAX)
+ flags = BA_SEQMAX << BA_SEQSHIFT;
+ else
+ flags = seqcount << BA_SEQSHIFT;
+ if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
+ flags |= IO_SYNC;
for (error = 0; uio->uio_resid > 0;) {
lbn = lblkno(fs, uio->uio_offset);
@@ -228,17 +287,30 @@ WRITE(ap)
if (uio->uio_offset + xfersize > ip->i_size)
vnode_pager_setsize(vp, uio->uio_offset + xfersize);
- /*
- * Avoid a data-consistency race between write() and mmap()
- * by ensuring that newly allocated blocks are zeroed. The
- * race can occur even in the case where the write covers
- * the entire block.
- */
- flags |= B_CLRBUF;
+ /*
+ * We must perform a read-before-write if the transfer size
+ * does not cover the entire buffer.
+ */
+ if (fs->e2fs_bsize > xfersize)
+ flags |= BA_CLRBUF;
+ else
+ flags &= ~BA_CLRBUF;
error = ext2_balloc(ip, lbn, blkoffset + xfersize,
- ap->a_cred, &bp, flags);
+ ap->a_cred, &bp, flags);
if (error != 0)
break;
+
+ /*
+ * If the buffer is not valid and we did not clear garbage
+ * out above, we have to do so here even though the write
+ * covers the entire buffer in order to avoid a mmap()/write
+ * race where another process may see the garbage prior to
+ * the uiomove() for a write replacing it.
+ */
+ if ((bp->b_flags & B_CACHE) == 0 && fs->e2fs_bsize <= xfersize)
+ vfs_bio_clrbuf(bp);
+ if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
+ bp->b_flags |= B_NOCACHE;
if (uio->uio_offset + xfersize > ip->i_size)
ip->i_size = uio->uio_offset + xfersize;
size = BLKSIZE(fs, ip, lbn) - bp->b_resid;
@@ -247,12 +319,25 @@ WRITE(ap)
error =
uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
- if ((ioflag & IO_VMIO) &&
- LIST_FIRST(&bp->b_dep) == NULL) /* in ext2fs? */
+ if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
+ (LIST_EMPTY(&bp->b_dep))) { /* in ext2fs? */
bp->b_flags |= B_RELBUF;
+ }
+ /*
+ * If IO_SYNC each buffer is written synchronously. Otherwise
+ * if we have a severe page deficiency write the buffer
+ * asynchronously. Otherwise try to cluster, and if that
+ * doesn't do it then either do an async write (if O_DIRECT),
+ * or a delayed write (if not).
+ */
if (ioflag & IO_SYNC) {
(void)bwrite(bp);
+ } else if (vm_page_count_severe() ||
+ buf_dirty_count_severe() ||
+ (ioflag & IO_ASYNC)) {
+ bp->b_flags |= B_CLUSTEROK;
+ bawrite(bp);
} else if (xfersize + blkoffset == fs->e2fs_fsize) {
if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
bp->b_flags |= B_CLUSTEROK;
@@ -260,6 +345,9 @@ WRITE(ap)
} else {
bawrite(bp);
}
+ } else if (ioflag & IO_DIRECT) {
+ bp->b_flags |= B_CLUSTEROK;
+ bawrite(bp);
} else {
bp->b_flags |= B_CLUSTEROK;
bdwrite(bp);
@@ -271,18 +359,13 @@ WRITE(ap)
* If we successfully wrote any data, and we are not the superuser
* we clear the setuid and setgid bits as a precaution against
* tampering.
- * XXX too late, the tamperer may have opened the file while we
- * were writing the data (or before).
- * XXX too early, if (error && ioflag & IO_UNIT) then we will
- * unwrite the data.
*/
- if (resid > uio->uio_resid && ap->a_cred && ap->a_cred->cr_uid != 0)
- ip->i_mode &= ~(ISUID | ISGID);
+ if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
+ ap->a_cred) {
+ if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0))
+ ip->i_mode &= ~(ISUID | ISGID);
+ }
if (error) {
- /*
- * XXX should truncate to the last successfully written
- * data if the uiomove() failed.
- */
if (ioflag & IO_UNIT) {
(void)ext2_truncate(vp, osize,
ioflag & IO_SYNC, ap->a_cred, uio->uio_td);
diff --git a/sys/fs/ext2fs/ext2_vnops.c b/sys/fs/ext2fs/ext2_vnops.c
index 5333785..f9da170 100644
--- a/sys/fs/ext2fs/ext2_vnops.c
+++ b/sys/fs/ext2fs/ext2_vnops.c
@@ -738,7 +738,7 @@ ext2_link(ap)
}
ip->i_nlink++;
ip->i_flag |= IN_CHANGE;
- error = ext2_update(vp, 1);
+ error = ext2_update(vp, !DOINGASYNC(vp));
if (!error)
error = ext2_direnter(ip, tdvp, cnp);
if (error) {
@@ -884,7 +884,7 @@ abortit:
*/
ip->i_nlink++;
ip->i_flag |= IN_CHANGE;
- if ((error = ext2_update(fvp, 1)) != 0) {
+ if ((error = ext2_update(fvp, !DOINGASYNC(fvp))) != 0) {
VOP_UNLOCK(fvp, 0);
goto bad;
}
@@ -943,7 +943,7 @@ abortit:
}
dp->i_nlink++;
dp->i_flag |= IN_CHANGE;
- error = ext2_update(tdvp, 1);
+ error = ext2_update(tdvp, !DOINGASYNC(tdvp));
if (error)
goto bad;
}
@@ -1211,7 +1211,7 @@ ext2_mkdir(ap)
*/
dp->i_nlink++;
dp->i_flag |= IN_CHANGE;
- error = ext2_update(dvp, 1);
+ error = ext2_update(dvp, !DOINGASYNC(dvp));
if (error)
goto bad;
@@ -1655,7 +1655,7 @@ ext2_makeinode(mode, dvp, vpp, cnp)
/*
* Make sure inode goes to disk before directory entry.
*/
- error = ext2_update(tvp, 1);
+ error = ext2_update(tvp, !DOINGASYNC(tvp));
if (error)
goto bad;
error = ext2_direnter(ip, dvp, cnp);
diff --git a/sys/fs/ext2fs/inode.h b/sys/fs/ext2fs/inode.h
index ae794d7..92a84ac 100644
--- a/sys/fs/ext2fs/inode.h
+++ b/sys/fs/ext2fs/inode.h
@@ -158,6 +158,9 @@ struct indir {
#define VTOI(vp) ((struct inode *)(vp)->v_data)
#define ITOV(ip) ((ip)->i_vnode)
+/* Check whether the MNTK_ASYNC flag has been set for a mount point */
+#define DOINGASYNC(vp) ((vp)->v_mount->mnt_kern_flag & MNTK_ASYNC)
+
/* This overlays the fid structure (see mount.h). */
struct ufid {
uint16_t ufid_len; /* Length of structure. */
diff --git a/sys/modules/ext2fs/Makefile b/sys/modules/ext2fs/Makefile
index d9ab969..63726b5 100644
--- a/sys/modules/ext2fs/Makefile
+++ b/sys/modules/ext2fs/Makefile
@@ -2,7 +2,7 @@
.PATH: ${.CURDIR}/../../fs/ext2fs
KMOD= ext2fs
-SRCS= opt_ddb.h opt_quota.h opt_suiddir.h vnode_if.h \
+SRCS= opt_ddb.h opt_directio.h opt_quota.h opt_suiddir.h vnode_if.h \
ext2_alloc.c ext2_balloc.c ext2_bmap.c ext2_inode.c \
ext2_inode_cnv.c ext2_lookup.c ext2_subr.c ext2_vfsops.c \
ext2_vnops.c
OpenPOWER on IntegriCloud