summaryrefslogtreecommitdiffstats
path: root/sys/ufs
diff options
context:
space:
mode:
authormckusick <mckusick@FreeBSD.org>2012-04-02 21:58:37 +0000
committermckusick <mckusick@FreeBSD.org>2012-04-02 21:58:37 +0000
commitdc3cb66f4c32ff1f57b40352c8a159c55df7a1ff (patch)
tree4e7b3e24f1e002b013178cefaa7ff9534e84aa2b /sys/ufs
parentff6239a557dad09f5ff2c55951b55a047430c529 (diff)
downloadFreeBSD-src-dc3cb66f4c32ff1f57b40352c8a159c55df7a1ff.zip
FreeBSD-src-dc3cb66f4c32ff1f57b40352c8a159c55df7a1ff.tar.gz
A file cannot be deallocated until its last name has been removed
and it is no longer referenced by a user process. The inode for a file whose name has been removed, but is still referenced at the time of a crash will still be allocated in the filesystem, but will have no references (e.g., they will have no names referencing them from any directory). With traditional soft updates these unreferenced inodes will be found and reclaimed when the background fsck is run. When using journaled soft updates, the kernel must keep track of these inodes so that it can find and reclaim them during the cleanup process. Their existence cannot be stored in the journal as the journal only handles short-term events, and they may persist for days. So, they are tracked by keeping them in a linked list whose head pointer is stored in the superblock. The journal tracks them only until their linked list pointers have been commited to disk. Part of the cleanup process involves traversing the list of unreferenced inodes and reclaiming them. This bug was triggered when confusion arose in the commit steps of keeping the unreferenced-inode linked list coherent on disk. Notably, a race between the link() system call adding a link-count to a file and the unlink() system call removing a link-count to the file. Here if the unlink() ran after link() had looked up the file but before link() had incremented the link-count of the file, the file's link-count would drop to zero before the link() incremented it back up to one. If the file was referenced by a user process, the first transition through zero made it appear that it should be added to the unreferenced-inode list when in fact it should not have been added. If the new name created by link() was deleted within a few seconds (with the file still referenced by a user process) it would legitimately be a candidate for addition to the unreferenced-inode list. The result was that there were two attempts to add the same inode to the unreferenced-inode list which scrambled the unreferenced-inode list's pointers leading to a panic. The fix is to detect and avoid the false attempt at adding it to the unreferenced-inode list by having the link() system call check to see if the link count is zero before it increments it. If it is, the link() fails with ENOENT (showing that it has failed the link()/unlink() race). While tracking down this bug, we have added additional assertions to detect the problem sooner and also simplified some of the code. Reported by: Kirk Russell Fix submitted by: Jeff Roberson Tested by: Peter Holm PR: kern/159971 MFC (to 9 only): 2 weeks
Diffstat (limited to 'sys/ufs')
-rw-r--r--sys/ufs/ffs/ffs_softdep.c81
-rw-r--r--sys/ufs/ufs/ufs_vnops.c8
2 files changed, 37 insertions, 52 deletions
diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c
index 34567ca..a435738 100644
--- a/sys/ufs/ffs/ffs_softdep.c
+++ b/sys/ufs/ffs/ffs_softdep.c
@@ -4323,6 +4323,7 @@ inodedep_lookup_ip(ip)
(void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags,
&inodedep);
inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
+ KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
return (inodedep);
}
@@ -8455,6 +8456,7 @@ softdep_setup_remove(bp, dp, ip, isrmdir)
if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
&inodedep) == 0)
panic("softdep_setup_remove: Lost inodedep.");
+ KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
dirrem->dm_state |= ONDEPLIST;
LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
@@ -8987,6 +8989,7 @@ first_unlinked_inodedep(ump)
struct inodedep *inodedep;
struct inodedep *idp;
+ mtx_assert(&lk, MA_OWNED);
for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst);
inodedep; inodedep = idp) {
if ((inodedep->id_state & UNLINKNEXT) == 0)
@@ -8995,11 +8998,8 @@ first_unlinked_inodedep(ump)
if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0)
break;
if ((inodedep->id_state & UNLINKPREV) == 0)
- panic("first_unlinked_inodedep: prev != next");
+ break;
}
- if (inodedep == NULL)
- return (NULL);
-
return (inodedep);
}
@@ -9038,8 +9038,12 @@ handle_written_sbdep(sbdep, bp)
struct mount *mp;
struct fs *fs;
+ mtx_assert(&lk, MA_OWNED);
fs = sbdep->sb_fs;
mp = UFSTOVFS(sbdep->sb_ump);
+ /*
+ * If the superblock doesn't match the in-memory list start over.
+ */
inodedep = first_unlinked_inodedep(sbdep->sb_ump);
if ((inodedep && fs->fs_sujfree != inodedep->id_ino) ||
(inodedep == NULL && fs->fs_sujfree != 0)) {
@@ -9049,8 +9053,6 @@ handle_written_sbdep(sbdep, bp)
WORKITEM_FREE(sbdep, D_SBDEP);
if (fs->fs_sujfree == 0)
return (0);
- if (inodedep_lookup(mp, fs->fs_sujfree, 0, &inodedep) == 0)
- panic("handle_written_sbdep: lost inodedep");
/*
* Now that we have a record of this inode in stable store allow it
* to be written to free up pending work. Inodes may see a lot of
@@ -9078,10 +9080,13 @@ unlinked_inodedep(mp, inodedep)
{
struct ufsmount *ump;
+ mtx_assert(&lk, MA_OWNED);
if (MOUNTEDSUJ(mp) == 0)
return;
ump = VFSTOUFS(mp);
ump->um_fs->fs_fmod = 1;
+ if (inodedep->id_state & UNLINKED)
+ panic("unlinked_inodedep: %p already unlinked\n", inodedep);
inodedep->id_state |= UNLINKED;
TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked);
}
@@ -9109,6 +9114,10 @@ clear_unlinked_inodedep(inodedep)
ino = inodedep->id_ino;
error = 0;
for (;;) {
+ mtx_assert(&lk, MA_OWNED);
+ KASSERT((inodedep->id_state & UNLINKED) != 0,
+ ("clear_unlinked_inodedep: inodedep %p not unlinked",
+ inodedep));
/*
* If nothing has yet been written simply remove us from
* the in memory list and return. This is the most common
@@ -9166,36 +9175,19 @@ clear_unlinked_inodedep(inodedep)
ACQUIRE_LOCK(&lk);
continue;
}
+ nino = 0;
+ idn = TAILQ_NEXT(inodedep, id_unlinked);
+ if (idn)
+ nino = idn->id_ino;
/*
* Remove us from the in memory list. After this we cannot
* access the inodedep.
*/
- idn = TAILQ_NEXT(inodedep, id_unlinked);
- inodedep->id_state &= ~(UNLINKED | UNLINKLINKS);
+ KASSERT((inodedep->id_state & UNLINKED) != 0,
+ ("clear_unlinked_inodedep: inodedep %p not unlinked",
+ inodedep));
+ inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
- /*
- * Determine the next inode number.
- */
- nino = 0;
- if (idn) {
- /*
- * If next isn't on the list we can just clear prev's
- * state and schedule it to be fixed later. No need
- * to synchronously write if we're not in the real
- * list.
- */
- if ((idn->id_state & UNLINKPREV) == 0 && pino != 0) {
- idp->id_state &= ~UNLINKNEXT;
- if ((idp->id_state & ONWORKLIST) == 0)
- WORKLIST_INSERT(&bp->b_dep,
- &idp->id_list);
- FREE_LOCK(&lk);
- bawrite(bp);
- ACQUIRE_LOCK(&lk);
- return;
- }
- nino = idn->id_ino;
- }
FREE_LOCK(&lk);
/*
* The predecessor's next pointer is manually updated here
@@ -9234,13 +9226,14 @@ clear_unlinked_inodedep(inodedep)
bwrite(bp);
ACQUIRE_LOCK(&lk);
}
+
if (fs->fs_sujfree != ino)
return;
panic("clear_unlinked_inodedep: Failed to clear free head");
}
if (inodedep->id_ino == fs->fs_sujfree)
panic("clear_unlinked_inodedep: Freeing head of free list");
- inodedep->id_state &= ~(UNLINKED | UNLINKLINKS);
+ inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
return;
}
@@ -9839,18 +9832,6 @@ initiate_write_inodeblock_ufs2(inodedep, bp)
inon = TAILQ_NEXT(inodedep, id_unlinked);
dp->di_freelink = inon ? inon->id_ino : 0;
}
- if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) ==
- (UNLINKED | UNLINKNEXT)) {
- struct inodedep *inon;
- ino_t freelink;
-
- inon = TAILQ_NEXT(inodedep, id_unlinked);
- freelink = inon ? inon->id_ino : 0;
- if (freelink != dp->di_freelink)
- panic("ino %p(0x%X) %d, %d != %d",
- inodedep, inodedep->id_state, inodedep->id_ino,
- freelink, dp->di_freelink);
- }
/*
* If the bitmap is not yet written, then the allocated
* inode cannot be written to disk.
@@ -10849,10 +10830,9 @@ handle_written_inodeblock(inodedep, bp)
freelink = dp2->di_freelink;
}
/*
- * If we wrote a valid freelink pointer during the last write
- * record it here.
+ * Leave this inodeblock dirty until it's in the list.
*/
- if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
+ if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED) {
struct inodedep *inon;
inon = TAILQ_NEXT(inodedep, id_unlinked);
@@ -10861,12 +10841,9 @@ handle_written_inodeblock(inodedep, bp)
if (inon)
inon->id_state |= UNLINKPREV;
inodedep->id_state |= UNLINKNEXT;
- } else
- hadchanges = 1;
- }
- /* Leave this inodeblock dirty until it's in the list. */
- if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED)
+ }
hadchanges = 1;
+ }
/*
* If we had to rollback the inode allocation because of
* bitmaps being incomplete, then simply restore it.
diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c
index a333c58..99bc1ea 100644
--- a/sys/ufs/ufs/ufs_vnops.c
+++ b/sys/ufs/ufs/ufs_vnops.c
@@ -1006,6 +1006,14 @@ ufs_link(ap)
error = EMLINK;
goto out;
}
+ /*
+ * The file may have been removed after namei droped the original
+ * lock.
+ */
+ if (ip->i_effnlink == 0) {
+ error = ENOENT;
+ goto out;
+ }
if (ip->i_flags & (IMMUTABLE | APPEND)) {
error = EPERM;
goto out;
OpenPOWER on IntegriCloud