diff options
author | kib <kib@FreeBSD.org> | 2007-01-22 11:25:22 +0000 |
---|---|---|
committer | kib <kib@FreeBSD.org> | 2007-01-22 11:25:22 +0000 |
commit | 79752b63e1e72d72a46513db9b95df486877fdd2 (patch) | |
tree | 5c7c4a1283308fecd2e3cd335a281f350ff29125 /sys | |
parent | bbae4f99499054b3fc06acf2da55f260fd8d3974 (diff) | |
download | FreeBSD-src-79752b63e1e72d72a46513db9b95df486877fdd2.zip FreeBSD-src-79752b63e1e72d72a46513db9b95df486877fdd2.tar.gz |
Below is slightly edited description of the LOR by Tor Egge:
--------------------------
[Deadlock] is caused by a lock order reversal in vfs_lookup(), where
[some] process is trying to lock a directory vnode, that is the parent
directory of covered vnode) while holding an exclusive vnode lock on
covering vnode.
A simplified scenario:
root fs var fs
/ A / (/var) D
/var B /log (/var/log) E
vfs lock C vfs lock F
Within each file system, the lock order is clear: C->A->B and F->D->E
When traversing across mounts, the system can choose between two lock orders,
but everything must then follow that lock order:
L1: C->A->B
|
+->F->D->E
L2: F->D->E
|
+->C->A->B
The lookup() process for namei("/var") mixes those two lock orders:
VOP_LOOKUP() obtains B while A is held
vfs_busy() obtains a shared lock on F while A and B are held (follows L1,
violates L2)
vput() releases lock on B
VOP_UNLOCK() releases lock on A
VFS_ROOT() obtains lock on D while shared lock on F is held
vfs_unbusy() releases shared lock on F
vn_lock() obtains lock on A while D is held (violates L1, follows L2)
dounmount() follows L1 (B is locked while F is drained).
Without unmount activity, vfs_busy() will always succeed without blocking
and the deadlock isn't triggered (the system behaves as if L2 is followed).
With unmount, you can get 4 processes in a deadlock:
p1: holds D, want A (in lookup())
p2: holds shared lock on F, want D (in VFS_ROOT())
p3: holds B, want drain lock on F (in dounmount())
p4: holds A, want B (in VOP_LOOKUP())
You can have more than one instance of p2.
The reversal was introduced in revision 1.81 of src/sys/kern/vfs_lookup.c and
MFCed to revision 1.80.2.1, probably to avoid a cascade of vnode locks when nfs
servers are dead (VFS_ROOT() just hangs) spreading to the root fs root vnode.
- Tor Egge
To fix the LOR, ups@ noted that when crossing the mount point, ni_dvp
is actually not used by the callers of namei. Thus, placeholder deadfs
vnode vp_crossmp is introduced that is filled into ni_dvp.
Idea by: ups
Reviewed by: tegge, ups, jeff, rwatson (mac interaction)
Tested by: Peter Holm
MFC after: 2 weeks
Diffstat (limited to 'sys')
-rw-r--r-- | sys/fs/deadfs/dead_vnops.c | 25 | ||||
-rw-r--r-- | sys/kern/vfs_lookup.c | 21 |
2 files changed, 41 insertions, 5 deletions
diff --git a/sys/fs/deadfs/dead_vnops.c b/sys/fs/deadfs/dead_vnops.c index 1bbf453..a5c359f 100644 --- a/sys/fs/deadfs/dead_vnops.c +++ b/sys/fs/deadfs/dead_vnops.c @@ -49,6 +49,7 @@ static vop_poll_t dead_poll; static vop_read_t dead_read; static vop_write_t dead_write; static vop_getwritemount_t dead_getwritemount; +static vop_rename_t dead_rename; struct vop_vector dead_vnodeops = { .vop_default = &default_vnodeops, @@ -73,7 +74,7 @@ struct vop_vector dead_vnodeops = { .vop_readlink = VOP_EBADF, .vop_reclaim = VOP_NULL, .vop_remove = VOP_PANIC, - .vop_rename = VOP_PANIC, + .vop_rename = dead_rename, .vop_rmdir = VOP_PANIC, .vop_setattr = VOP_EBADF, .vop_symlink = VOP_PANIC, @@ -211,3 +212,25 @@ dead_poll(ap) { return (POLLHUP); } + +static int +dead_rename(ap) + struct vop_rename_args /* { + struct vnode *a_fdvp; + struct vnode *a_fvp; + struct componentname *a_fcnp; + struct vnode *a_tdvp; + struct vnode *a_tvp; + struct componentname *a_tcnp; + } */ *ap; +{ + if (ap->a_tvp) + vput(ap->a_tvp); + if (ap->a_tdvp == ap->a_tvp) + vrele(ap->a_tdvp); + else + vput(ap->a_tdvp); + vrele(ap->a_fdvp); + vrele(ap->a_fvp); + return (EXDEV); +} diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c index a0cbc83..bf624d8 100644 --- a/sys/kern/vfs_lookup.c +++ b/sys/kern/vfs_lookup.c @@ -69,13 +69,18 @@ __FBSDID("$FreeBSD$"); * Allocation zone for namei */ uma_zone_t namei_zone; +/* + * Placeholder vnode for mp traversal + */ +static struct vnode *vp_crossmp; static void nameiinit(void *dummy __unused) { namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); - + getnewvnode("crossmp", NULL, &dead_vnodeops, &vp_crossmp); + vp_crossmp->v_vnlock->lk_flags &= ~LK_NOSHARE; } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL) @@ -559,7 +564,8 @@ unionlookup: * If we have a shared lock we may need to upgrade the lock for the * last operation. */ - if (VOP_ISLOCKED(dp, td) == LK_SHARED && + if (dp != vp_crossmp && + VOP_ISLOCKED(dp, td) == LK_SHARED && (cnp->cn_flags & ISLASTCN) && (cnp->cn_flags & LOCKPARENT)) vn_lock(dp, LK_UPGRADE|LK_RETRY, td); /* @@ -658,10 +664,17 @@ unionlookup: VFS_UNLOCK_GIANT(vfslocked); vfslocked = VFS_LOCK_GIANT(mp); if (dp != ndp->ni_dvp) - VOP_UNLOCK(ndp->ni_dvp, 0, td); + vput(ndp->ni_dvp); + else + vrele(ndp->ni_dvp); + VFS_UNLOCK_GIANT(dvfslocked); + dvfslocked = 0; + vref(vp_crossmp); + ndp->ni_dvp = vp_crossmp; error = VFS_ROOT(mp, compute_cn_lkflags(mp, cnp->cn_lkflags), &tdp, td); vfs_unbusy(mp, td); - vn_lock(ndp->ni_dvp, compute_cn_lkflags(mp, cnp->cn_lkflags | LK_RETRY), td); + if (vn_lock(vp_crossmp, LK_SHARED | LK_NOWAIT, td)) + panic("vp_crossmp exclusively locked or reclaimed"); if (error) { dpunlocked = 1; goto bad2; |