author		kib <kib@FreeBSD.org>	2012-06-21 09:19:41 +0000
committer	kib <kib@FreeBSD.org>	2012-06-21 09:19:41 +0000
commit		df9f3d2faa1df2bb13d274183876fe227e259a65 (patch)
tree		f333c89c12a01492f7498427b3750b2549eacc1b /sys/kern/vfs_vnops.c
parent		2a52af28e66573a140482815c762bae088491ab9 (diff)
Fix locking for f_offset, vn_read() and vn_write() cases only, for now.
It seems that the intended locking protocol for the struct file f_offset
field was as follows: f_offset should always be changed under the vnode
lock (except that fcntl(2) and lseek(2) did not follow the rules). Since
read(2) uses a shared vnode lock, the FOFFSET_LOCKED block is
additionally taken to serialize shared vnode lock owners.

This was broken first by enabling shared locks on writes, then by the
fadvise changes, which moved the f_offset assignment out from under the
vnode lock, and last by vn_io_fault() doing chunked i/o. Moreover,
because uio_offset is not yet valid when vn_io_fault() starts, the range
lock for reads was taken on the wrong region.

Change the locking for f_offset to always use the FOFFSET_LOCKED block,
which is placed before the rangelocks in the lock order. Extract the
foffset_lock() and foffset_unlock() functions, which implement the
FOFFSET_LOCKED lock, and consistently lock f_offset with it in
vn_io_fault() for both reads and writes, even if the MNTK_NO_IOPF flag
is not set for the vnode mount. Indicate that f_offset is already valid
for the vn_read() and vn_write() calls from vn_io_fault() with the
FOF_OFFSET flag, and assert that all callers of vn_read() and vn_write()
follow this protocol.

Extract the get_advice() function to calculate the POSIX_FADV_XXX value
for the i/o region, and use it where appropriate.

Reviewed by:	jhb
Tested by:	pho
MFC after:	2 weeks
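[Editor's note: for readers following the diff below, here is a minimal
userspace model of the FOFFSET_LOCKED hand-off that foffset_lock() and
foffset_unlock() implement. It is a sketch only: the struct file_model
type and the model_* names are invented for illustration, with a pthread
mutex and condition variable standing in for the kernel's pooled mtx and
msleep()/wakeup().]

/*
 * Minimal userspace model of the FOFFSET_LOCKED hand-off (illustrative
 * only; not kernel code).
 */
#include <pthread.h>

#define	FOFFSET_LOCKED		0x1
#define	FOFFSET_LOCK_WAITING	0x2

struct file_model {
	pthread_mutex_t	mtx;		/* stands in for the pooled mtx */
	pthread_cond_t	cv;		/* stands in for msleep()/wakeup() */
	int		vnread_flags;	/* FOFFSET_* bits, as in f_vnread_flags */
	long long	offset;		/* stands in for f_offset */
};

/* Acquire the offset lock and take a stable snapshot of the offset. */
static long long
model_foffset_lock(struct file_model *fp)
{
	long long off;

	pthread_mutex_lock(&fp->mtx);
	while (fp->vnread_flags & FOFFSET_LOCKED) {
		fp->vnread_flags |= FOFFSET_LOCK_WAITING;
		pthread_cond_wait(&fp->cv, &fp->mtx);
	}
	fp->vnread_flags |= FOFFSET_LOCKED;
	off = fp->offset;
	pthread_mutex_unlock(&fp->mtx);
	return (off);
}

/* Publish the advanced offset, then drop the lock and wake waiters. */
static void
model_foffset_unlock(struct file_model *fp, long long new_off)
{
	/*
	 * The owner may store the offset before taking the mutex:
	 * FOFFSET_LOCKED already excludes every other accessor. The
	 * kernel's foffset_unlock() stores f_offset the same way.
	 */
	fp->offset = new_off;
	pthread_mutex_lock(&fp->mtx);
	if (fp->vnread_flags & FOFFSET_LOCK_WAITING)
		pthread_cond_broadcast(&fp->cv);
	fp->vnread_flags = 0;
	pthread_mutex_unlock(&fp->mtx);
}

The point of the two-level scheme is that the pooled mutex is held only
briefly to manipulate the flag bits, while FOFFSET_LOCKED itself is held
across the whole (possibly sleeping) I/O, giving each caller a stable
offset snapshot.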
Diffstat (limited to 'sys/kern/vfs_vnops.c')
-rw-r--r--	sys/kern/vfs_vnops.c	133
1 file changed, 80 insertions(+), 53 deletions(-)
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index 75bd3df..e6eb97d 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -527,6 +527,66 @@ vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
return (error);
}
+static void
+foffset_lock(struct file *fp, struct uio *uio, int flags)
+{
+ struct mtx *mtxp;
+
+ if ((flags & FOF_OFFSET) != 0)
+ return;
+
+ /*
+ * According to McKusick the vn lock was protecting f_offset here.
+ * It is now protected by the FOFFSET_LOCKED flag.
+ */
+ mtxp = mtx_pool_find(mtxpool_sleep, fp);
+ mtx_lock(mtxp);
+ while (fp->f_vnread_flags & FOFFSET_LOCKED) {
+ fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
+ msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
+ "vnread offlock", 0);
+ }
+ fp->f_vnread_flags |= FOFFSET_LOCKED;
+ uio->uio_offset = fp->f_offset;
+ mtx_unlock(mtxp);
+}
+
+static int
+get_advice(struct file *fp, struct uio *uio)
+{
+ struct mtx *mtxp;
+ int ret;
+
+ ret = POSIX_FADV_NORMAL;
+ if (fp->f_advice == NULL)
+ return (ret);
+
+ mtxp = mtx_pool_find(mtxpool_sleep, fp);
+ mtx_lock(mtxp);
+ if (uio->uio_offset >= fp->f_advice->fa_start &&
+ uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
+ ret = fp->f_advice->fa_advice;
+ mtx_unlock(mtxp);
+ return (ret);
+}
+
+static void
+foffset_unlock(struct file *fp, struct uio *uio, int flags)
+{
+ struct mtx *mtxp;
+
+ if ((flags & FOF_OFFSET) != 0)
+ return;
+
+ fp->f_offset = uio->uio_offset;
+ mtxp = mtx_pool_find(mtxpool_sleep, fp);
+ mtx_lock(mtxp);
+ if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
+ wakeup(&fp->f_vnread_flags);
+ fp->f_vnread_flags = 0;
+ mtx_unlock(mtxp);
+}
+
/*
* File table vnode read routine.
*/
@@ -539,44 +599,22 @@ vn_read(fp, uio, active_cred, flags, td)
struct thread *td;
{
struct vnode *vp;
- int error, ioflag;
struct mtx *mtxp;
+ int error, ioflag;
int advice, vfslocked;
off_t offset, start, end;
KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
uio->uio_td, td));
- mtxp = NULL;
+ KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
vp = fp->f_vnode;
ioflag = 0;
if (fp->f_flag & FNONBLOCK)
ioflag |= IO_NDELAY;
if (fp->f_flag & O_DIRECT)
ioflag |= IO_DIRECT;
- advice = POSIX_FADV_NORMAL;
+ advice = get_advice(fp, uio);
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
- /*
- * According to McKusick the vn lock was protecting f_offset here.
- * It is now protected by the FOFFSET_LOCKED flag.
- */
- if ((flags & FOF_OFFSET) == 0 || fp->f_advice != NULL) {
- mtxp = mtx_pool_find(mtxpool_sleep, fp);
- mtx_lock(mtxp);
- if ((flags & FOF_OFFSET) == 0) {
- while (fp->f_vnread_flags & FOFFSET_LOCKED) {
- fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
- msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
- "vnread offlock", 0);
- }
- fp->f_vnread_flags |= FOFFSET_LOCKED;
- uio->uio_offset = fp->f_offset;
- }
- if (fp->f_advice != NULL &&
- uio->uio_offset >= fp->f_advice->fa_start &&
- uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
- advice = fp->f_advice->fa_advice;
- mtx_unlock(mtxp);
- }
vn_lock(vp, LK_SHARED | LK_RETRY);
switch (advice) {
@@ -596,14 +634,6 @@ vn_read(fp, uio, active_cred, flags, td)
if (error == 0)
#endif
error = VOP_READ(vp, uio, ioflag, fp->f_cred);
- if ((flags & FOF_OFFSET) == 0) {
- fp->f_offset = uio->uio_offset;
- mtx_lock(mtxp);
- if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
- wakeup(&fp->f_vnread_flags);
- fp->f_vnread_flags = 0;
- mtx_unlock(mtxp);
- }
fp->f_nextoff = uio->uio_offset;
VOP_UNLOCK(vp, 0);
if (error == 0 && advice == POSIX_FADV_NOREUSE &&
@@ -625,6 +655,7 @@ vn_read(fp, uio, active_cred, flags, td)
*/
start = offset;
end = uio->uio_offset - 1;
+ mtxp = mtx_pool_find(mtxpool_sleep, fp);
mtx_lock(mtxp);
if (fp->f_advice != NULL &&
fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
@@ -656,13 +687,14 @@ vn_write(fp, uio, active_cred, flags, td)
{
struct vnode *vp;
struct mount *mp;
- int error, ioflag, lock_flags;
struct mtx *mtxp;
+ int error, ioflag, lock_flags;
int advice, vfslocked;
off_t offset, start, end;
KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
uio->uio_td, td));
+ KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if (vp->v_type == VREG)
@@ -681,6 +713,8 @@ vn_write(fp, uio, active_cred, flags, td)
if (vp->v_type != VCHR &&
(error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
goto unlock;
+
+ advice = get_advice(fp, uio);
if ((MNT_SHARED_WRITES(mp) ||
((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) &&
@@ -691,19 +725,6 @@ vn_write(fp, uio, active_cred, flags, td)
}
vn_lock(vp, lock_flags | LK_RETRY);
- if ((flags & FOF_OFFSET) == 0)
- uio->uio_offset = fp->f_offset;
- advice = POSIX_FADV_NORMAL;
- mtxp = NULL;
- if (fp->f_advice != NULL) {
- mtxp = mtx_pool_find(mtxpool_sleep, fp);
- mtx_lock(mtxp);
- if (fp->f_advice != NULL &&
- uio->uio_offset >= fp->f_advice->fa_start &&
- uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
- advice = fp->f_advice->fa_advice;
- mtx_unlock(mtxp);
- }
switch (advice) {
case POSIX_FADV_NORMAL:
case POSIX_FADV_SEQUENTIAL:
@@ -721,8 +742,6 @@ vn_write(fp, uio, active_cred, flags, td)
if (error == 0)
#endif
error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
- if ((flags & FOF_OFFSET) == 0)
- fp->f_offset = uio->uio_offset;
fp->f_nextoff = uio->uio_offset;
VOP_UNLOCK(vp, 0);
if (vp->v_type != VCHR)
@@ -761,6 +780,7 @@ vn_write(fp, uio, active_cred, flags, td)
*/
start = offset;
end = uio->uio_offset - 1;
+ mtxp = mtx_pool_find(mtxpool_sleep, fp);
mtx_lock(mtxp);
if (fp->f_advice != NULL &&
fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
@@ -845,11 +865,15 @@ vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
else
doio = vn_write;
vp = fp->f_vnode;
+ foffset_lock(fp, uio, flags);
+
if (uio->uio_segflg != UIO_USERSPACE || vp->v_type != VREG ||
((mp = vp->v_mount) != NULL &&
(mp->mnt_kern_flag & MNTK_NO_IOPF) == 0) ||
- !vn_io_fault_enable)
- return (doio(fp, uio, active_cred, flags, td));
+ !vn_io_fault_enable) {
+ error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
+ goto out_last;
+ }
/*
* The UFS follows IO_UNIT directive and replays back both
@@ -882,7 +906,7 @@ vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
}
save = vm_fault_disable_pagefaults();
- error = doio(fp, uio, active_cred, flags, td);
+ error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
if (error != EFAULT)
goto out;
@@ -933,7 +957,8 @@ vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
td->td_ma = ma;
td->td_ma_cnt = cnt;
- error = doio(fp, &short_uio, active_cred, flags, td);
+ error = doio(fp, &short_uio, active_cred, flags | FOF_OFFSET,
+ td);
vm_page_unhold_pages(ma, cnt);
adv = len - short_uio.uio_resid;
@@ -956,6 +981,8 @@ out:
vm_fault_enable_pagefaults(save);
vn_rangelock_unlock(vp, rl_cookie);
free(uio_clone, M_IOV);
+out_last:
+ foffset_unlock(fp, uio, flags);
return (error);
}
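[Editor's note: taken together, the vn_io_fault() changes above reduce
to a simple bracket: take the offset lock (which makes uio_offset
valid), perform the chunked I/O with FOF_OFFSET set so that vn_read()
and vn_write() leave f_offset alone, then publish the new offset and
wake waiters. A condensed, compilable sketch with stand-in types
follows; the sketch_* names are invented and this is not kernel code.]

/*
 * Condensed shape of the vn_io_fault() bracket after this commit
 * (stand-in types; the real function also clones the uio, takes a
 * rangelock on the now-correct region, and retries EFAULT i/o with
 * held pages).
 */
#define	FOF_OFFSET	0x01	/* stand-in for the sys/file.h value */

struct sketch_uio { long long uio_offset; };
struct sketch_file { long long f_offset; };

/* Hypothetical stand-ins for foffset_lock()/foffset_unlock()/doio(). */
void	sketch_foffset_lock(struct sketch_file *, struct sketch_uio *, int);
void	sketch_foffset_unlock(struct sketch_file *, struct sketch_uio *, int);
int	sketch_doio(struct sketch_file *, struct sketch_uio *, int);

int
sketch_vn_io_fault(struct sketch_file *fp, struct sketch_uio *uio, int flags)
{
	int error;

	sketch_foffset_lock(fp, uio, flags);	/* uio_offset now valid */
	/* ... rangelock the region, do the chunked i/o ... */
	error = sketch_doio(fp, uio, flags | FOF_OFFSET);
	sketch_foffset_unlock(fp, uio, flags);	/* publish, wake waiters */
	return (error);
}

Because the offset lock is taken before the rangelock, the range lock
now covers the correct region, fixing the wrong-region problem described
in the commit message.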