summaryrefslogtreecommitdiffstats
path: root/sys/ufs/ffs/ffs_alloc.c
diff options
context:
space:
mode:
authorkib <kib@FreeBSD.org>2013-02-27 07:31:23 +0000
committerkib <kib@FreeBSD.org>2013-02-27 07:31:23 +0000
commit9e4aad0792ca9f43adad9b6e985e5e22a374dfd9 (patch)
tree74b9a7f74e7ed6db2a3fb83b5e337e0db47d4996 /sys/ufs/ffs/ffs_alloc.c
parent86e5cbf21f267d00b5b398c9c65f5c1137dbec35 (diff)
downloadFreeBSD-src-9e4aad0792ca9f43adad9b6e985e5e22a374dfd9.zip
FreeBSD-src-9e4aad0792ca9f43adad9b6e985e5e22a374dfd9.tar.gz
An inode block must not be blockingly read while cg block is owned.
The order is inode buffer lock -> snaplk -> cg buffer lock, reversing the order causes deadlocks. Inode block must not be written while cg block buffer is owned. The FFS copy on write needs to allocate a block to copy the content of the inode block, and the cylinder group selected for the allocation might be the same as the owned cg block. The reserved block detection code in the ffs_copyonwrite() and ffs_bp_snapblk() is unable to detect the situation, because the locked cg buffer is not exposed to it. In order to maintain the dependency between initialized inode block and the cg_initediblk pointer, look up the inode buffer in non-blocking mode. If succeeded, brelse cg block, initialize the inode block and write it. After the write is finished, reread cg block and update the cg_initediblk. If inode block is already locked by another thread, let the another thread initialize it. If another thread raced with us after we started writing inode block, the situation is detected by an update of cg_initediblk. Note that double-initialization of the inode block is harmless, the block cannot be used until cg_initediblk is incremented. Sponsored by: The FreeBSD Foundation In collaboration with: pho Reviewed by: mckusick MFC after: 1 month X-MFC-note: after r246877
Diffstat (limited to 'sys/ufs/ffs/ffs_alloc.c')
-rw-r--r--sys/ufs/ffs/ffs_alloc.c73
1 files changed, 67 insertions, 6 deletions
diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c
index abe4073..789a7cf 100644
--- a/sys/ufs/ffs/ffs_alloc.c
+++ b/sys/ufs/ffs/ffs_alloc.c
@@ -1790,6 +1790,17 @@ fail:
return (0);
}
+static inline struct buf *
+getinobuf(struct inode *ip, u_int cg, u_int32_t cginoblk, int gbflags)
+{
+ struct fs *fs;
+
+ fs = ip->i_fs;
+ return (getblk(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs,
+ cg * fs->fs_ipg + cginoblk)), (int)fs->fs_bsize, 0, 0,
+ gbflags));
+}
+
/*
* Determine whether an inode can be allocated.
*
@@ -1814,9 +1825,11 @@ ffs_nodealloccg(ip, cg, ipref, mode, unused)
u_int8_t *inosused, *loc;
struct ufs2_dinode *dp2;
int error, start, len, i;
+ u_int32_t old_initediblk;
fs = ip->i_fs;
ump = ip->i_ump;
+check_nifree:
if (fs->fs_cs(fs, cg).cs_nifree == 0)
return (0);
UFS_UNLOCK(ump);
@@ -1828,13 +1841,13 @@ ffs_nodealloccg(ip, cg, ipref, mode, unused)
return (0);
}
cgp = (struct cg *)bp->b_data;
+restart:
if (!cg_chkmagic(cgp) || cgp->cg_cs.cs_nifree == 0) {
brelse(bp);
UFS_LOCK(ump);
return (0);
}
bp->b_xflags |= BX_BKGRDWRITE;
- cgp->cg_old_time = cgp->cg_time = time_second;
inosused = cg_inosused(cgp);
if (ipref) {
ipref %= fs->fs_ipg;
@@ -1856,7 +1869,6 @@ ffs_nodealloccg(ip, cg, ipref, mode, unused)
}
}
ipref = (loc - inosused) * NBBY + ffs(~*loc) - 1;
- cgp->cg_irotor = ipref;
gotit:
/*
* Check to see if we need to initialize more inodes.
@@ -1864,9 +1876,37 @@ gotit:
if (fs->fs_magic == FS_UFS2_MAGIC &&
ipref + INOPB(fs) > cgp->cg_initediblk &&
cgp->cg_initediblk < cgp->cg_niblk) {
- ibp = getblk(ip->i_devvp, fsbtodb(fs,
- ino_to_fsba(fs, cg * fs->fs_ipg + cgp->cg_initediblk)),
- (int)fs->fs_bsize, 0, 0, 0);
+ old_initediblk = cgp->cg_initediblk;
+
+ /*
+ * Free the cylinder group lock before writing the
+ * initialized inode block. Entering the
+ * babarrierwrite() with the cylinder group lock
+ * causes lock order violation between the lock and
+ * snaplk.
+ *
+ * Another thread can decide to initialize the same
+ * inode block, but whichever thread first gets the
+ * cylinder group lock after writing the newly
+ * allocated inode block will update it and the other
+ * will realize that it has lost and leave the
+ * cylinder group unchanged.
+ */
+ ibp = getinobuf(ip, cg, old_initediblk, GB_LOCK_NOWAIT);
+ brelse(bp);
+ if (ibp == NULL) {
+ /*
+ * The inode block buffer is already owned by
+ * another thread, which must initialize it.
+ * Wait on the buffer to allow another thread
+ * to finish the updates, with dropped cg
+ * buffer lock, then retry.
+ */
+ ibp = getinobuf(ip, cg, old_initediblk, 0);
+ brelse(ibp);
+ UFS_LOCK(ump);
+ goto check_nifree;
+ }
bzero(ibp->b_data, (int)fs->fs_bsize);
dp2 = (struct ufs2_dinode *)(ibp->b_data);
for (i = 0; i < INOPB(fs); i++) {
@@ -1883,8 +1923,29 @@ gotit:
* loading of newly created filesystems.
*/
babarrierwrite(ibp);
- cgp->cg_initediblk += INOPB(fs);
+
+ /*
+ * After the inode block is written, try to update the
+ * cg initediblk pointer. If another thread beat us
+ * to it, then leave it unchanged as the other thread
+ * has already set it correctly.
+ */
+ error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
+ (int)fs->fs_cgsize, NOCRED, &bp);
+ UFS_LOCK(ump);
+ ACTIVECLEAR(fs, cg);
+ UFS_UNLOCK(ump);
+ if (error != 0) {
+ brelse(bp);
+ return (error);
+ }
+ cgp = (struct cg *)bp->b_data;
+ if (cgp->cg_initediblk == old_initediblk)
+ cgp->cg_initediblk += INOPB(fs);
+ goto restart;
}
+ cgp->cg_old_time = cgp->cg_time = time_second;
+ cgp->cg_irotor = ipref;
UFS_LOCK(ump);
ACTIVECLEAR(fs, cg);
setbit(inosused, ipref);
OpenPOWER on IntegriCloud