From f90e5b5b136ede1f0fd15999e95f13124d6b0dbd Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Tue, 24 May 2011 10:44:42 -0400 Subject: GFS2: Processes waiting on inode glock that no processes are holding This patch fixes a race in the GFS2 glock state machine that may result in lockups. The symptom is that all nodes but one will hang, waiting for a particular glock. All the holder records will have the "W" (Waiting) bit set. The other node will typically have the glock stuck in Exclusive mode (EX) with no holder records, but the dinode will be cached. In other words, an entry with "I:" will appear in the glock dump for that glock, but nothing else. The race has to do with the glock "Pending Demote" bit, which can be set, then immediately reset, thus losing the fact that another node needs the glock. The sequence of events is: 1. Something schedules the glock workqueue (e.g. glock request from fs) 2. The glock workqueue gets to the point between the test of the reply pending bit and the spin lock: if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) { finish_xmote(gl, gl->gl_reply); drop_ref = 1; } down_read(&gfs2_umount_flush_sem); <---- i.e. here spin_lock(&gl->gl_spin); 3. In comes (a) the reply to our EX lock request setting GLF_REPLY_PENDING and (b) the demote request which sets GLF_PENDING_DEMOTE 4. The following test is executed: if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && gl->gl_state != LM_ST_UNLOCKED && gl->gl_demote_state != LM_ST_EXCLUSIVE) { This resets the pending demote flag, and gl->gl_demote_state is not equal to exclusive, however because the reply from the dlm arrived after we checked for the GLF_REPLY_PENDING flag, gl->gl_state is still equal to unlocked, so although we reset the GLF_PENDING_DEMOTE flag, we didn't then set the GLF_DEMOTE flag or reinstate the GLF_PENDING_DEMOTE_FLAG. The patch closes the timing window by only transitioning the "Pending demote" bit to the "demote" flag once we know the other conditions (not unlocked and not exclusive) are met. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs/gfs2') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index a2a6abb..7137750 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -663,14 +663,19 @@ static void glock_work_func(struct work_struct *work) drop_ref = 1; } spin_lock(&gl->gl_spin); - if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && + if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && gl->gl_state != LM_ST_UNLOCKED && gl->gl_demote_state != LM_ST_EXCLUSIVE) { unsigned long holdtime, now = jiffies; + holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; if (time_before(now, holdtime)) delay = holdtime - now; - set_bit(delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE, &gl->gl_flags); + + if (!delay) { + clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags); + set_bit(GLF_DEMOTE, &gl->gl_flags); + } } run_queue(gl, 0); spin_unlock(&gl->gl_spin); -- cgit v1.1 From 1ce533686c7d40bf900dc346a7279c17a9ee8e0e Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Mon, 13 Jun 2011 14:27:40 -0500 Subject: GFS2: force a log flush when invalidating the rindex glock Right now, there is nothing that forces the log to get flushed when a node drops its rindex glock so that another node can grow the filesystem. If the log doesn't get flushed, GFS2 can corrupt the sd_log_le_rg list in the following way. A node puts an rgd on the list in rg_lo_add(), and then the rindex glock is dropped so the other node can grow the filesystem. When the node reacquires the rindex glock, that rgd gets deleted in clear_rgrpdi() before ever being removed from the list by gfs2_log_flush(). This code simply forces a log flush when the rindex glock is invalidated, solving the problem. Signed-off-by: Benjamin Marzinski Signed-off-by: Steven Whitehouse --- fs/gfs2/glops.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/gfs2') diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 8ef70f4..712b722 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -221,8 +221,10 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags) } } - if (ip == GFS2_I(gl->gl_sbd->sd_rindex)) + if (ip == GFS2_I(gl->gl_sbd->sd_rindex)) { + gfs2_log_flush(gl->gl_sbd, NULL); gl->gl_sbd->sd_rindex_uptodate = 0; + } if (ip && S_ISREG(ip->i_inode.i_mode)) truncate_inode_pages(ip->i_inode.i_mapping, 0); } -- cgit v1.1 From 3942ae5319640ced5844b75f44884e4bcb8a2f16 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 11 Jul 2011 08:53:30 +0100 Subject: GFS2: Fix race during filesystem mount There is a potential race during filesystem mounting which has recently been reported. It occurs when the userland gfs_controld is able to process requests fast enough that it tries to use the sysfs interface before the lock module is properly initialised. This is a pretty unusual case as normally the lock module initialisation is very quick compared with gfs_controld. This patch adds an interruptible completion which is used to ensure that userland will wait for the initialisation of the lock module to complete. There are other potential solutions to this problem, but this is the quickest at this stage and has been tested both with and without mount.gfs2 present in the system. Signed-off-by: Steven Whitehouse Reported-by: David Booher --- fs/gfs2/incore.h | 2 ++ fs/gfs2/ops_fstype.c | 3 +++ fs/gfs2/sys.c | 7 ++++++- 3 files changed, 11 insertions(+), 1 deletion(-) (limited to 'fs/gfs2') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 0a064e9..81206e7 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -17,6 +17,7 @@ #include #include #include +#include #define DIO_WAIT 0x00000010 #define DIO_METADATA 0x00000020 @@ -546,6 +547,7 @@ struct gfs2_sbd { struct gfs2_glock *sd_trans_gl; wait_queue_head_t sd_glock_wait; atomic_t sd_glock_disposal; + struct completion sd_locking_init; /* Inode Stuff */ diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 8ac9ae1..2a77071 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -72,6 +72,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) init_waitqueue_head(&sdp->sd_glock_wait); atomic_set(&sdp->sd_glock_disposal, 0); + init_completion(&sdp->sd_locking_init); spin_lock_init(&sdp->sd_statfs_spin); spin_lock_init(&sdp->sd_rindex_spin); @@ -1017,11 +1018,13 @@ hostdata_error: fsname++; if (lm->lm_mount == NULL) { fs_info(sdp, "Now mounting FS...\n"); + complete(&sdp->sd_locking_init); return 0; } ret = lm->lm_mount(sdp, fsname); if (ret == 0) fs_info(sdp, "Joined cluster. Now mounting FS...\n"); + complete(&sdp->sd_locking_init); return ret; } diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index e20eab3..443cabc 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -338,6 +338,9 @@ static ssize_t lkfirst_store(struct gfs2_sbd *sdp, const char *buf, size_t len) rv = sscanf(buf, "%u", &first); if (rv != 1 || first > 1) return -EINVAL; + rv = wait_for_completion_killable(&sdp->sd_locking_init); + if (rv) + return rv; spin_lock(&sdp->sd_jindex_spin); rv = -EBUSY; if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0) @@ -414,7 +417,9 @@ static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len) rv = sscanf(buf, "%d", &jid); if (rv != 1) return -EINVAL; - + rv = wait_for_completion_killable(&sdp->sd_locking_init); + if (rv) + return rv; spin_lock(&sdp->sd_jindex_spin); rv = -EINVAL; if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) -- cgit v1.1 From 380f7c65a7eb3288e4b6812acf3474a1de230707 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 14 Jul 2011 08:59:44 +0100 Subject: GFS2: Resolve inode eviction and ail list interaction bug This patch contains a few misc fixes which resolve a recently reported issue. This patch has been a real team effort and has received a lot of testing. The first issue is that the ail lock needs to be held over a few more operations. The lock thats added into gfs2_releasepage() may possibly be a candidate for replacing with RCU at some future point, but at this stage we've gone for the obvious fix. The second issue is that gfs2_write_inode() can end up calling a glock recursively when called from gfs2_evict_inode() via the syncing code, so it needs a guard added. The third issue is that we either need to not truncate the metadata pages of inodes which have zero link count, but which we cannot deallocate due to them still being in use by other nodes, or we need to ensure that those pages have all made it through the journal and ail lists first. This patch takes the former approach, but the latter has also been tested and there is nothing to choose between them performance-wise. So again, we could revise that decision in the future. Also, the inode eviction process is now better documented. Signed-off-by: Steven Whitehouse Tested-by: Bob Peterson Tested-by: Abhijith Das Reported-by: Barry J. Marson Reported-by: David Teigland --- fs/gfs2/aops.c | 3 +++ fs/gfs2/glops.c | 4 ++-- fs/gfs2/log.c | 1 + fs/gfs2/super.c | 36 ++++++++++++++++++++++++++++++------ 4 files changed, 36 insertions(+), 8 deletions(-) (limited to 'fs/gfs2') diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 802ac5e..f9fbbe9 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -1069,6 +1069,7 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) return 0; gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ail_lock); head = bh = page_buffers(page); do { if (atomic_read(&bh->b_count)) @@ -1080,6 +1081,7 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) goto not_possible; bh = bh->b_this_page; } while(bh != head); + spin_unlock(&sdp->sd_ail_lock); gfs2_log_unlock(sdp); head = bh = page_buffers(page); @@ -1112,6 +1114,7 @@ not_possible: /* Should never happen */ WARN_ON(buffer_dirty(bh)); WARN_ON(buffer_pinned(bh)); cannot_release: + spin_unlock(&sdp->sd_ail_lock); gfs2_log_unlock(sdp); return 0; } diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 712b722..2cca2931 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -47,10 +47,10 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl) bd_ail_gl_list); bh = bd->bd_bh; gfs2_remove_from_ail(bd); - spin_unlock(&sdp->sd_ail_lock); - bd->bd_bh = NULL; bh->b_private = NULL; + spin_unlock(&sdp->sd_ail_lock); + bd->bd_blkno = bh->b_blocknr; gfs2_log_lock(sdp); gfs2_assert_withdraw(sdp, !buffer_busy(bh)); diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 903115f..85c6292 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -903,6 +903,7 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp) if (gfs2_ail1_empty(sdp)) break; } + gfs2_log_flush(sdp, NULL); } static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp) diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index ed540e7..fb0edf7 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -757,13 +757,17 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) struct timespec atime; struct gfs2_dinode *di; int ret = -EAGAIN; + int unlock_required = 0; /* Skip timestamp update, if this is from a memalloc */ if (current->flags & PF_MEMALLOC) goto do_flush; - ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - if (ret) - goto do_flush; + if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (ret) + goto do_flush; + unlock_required = 1; + } ret = gfs2_trans_begin(sdp, RES_DINODE, 0); if (ret) goto do_unlock; @@ -780,7 +784,8 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) } gfs2_trans_end(sdp); do_unlock: - gfs2_glock_dq_uninit(&gh); + if (unlock_required) + gfs2_glock_dq_uninit(&gh); do_flush: if (wbc->sync_mode == WB_SYNC_ALL) gfs2_log_flush(GFS2_SB(inode), ip->i_gl); @@ -1427,7 +1432,20 @@ out: return error; } -/* +/** + * gfs2_evict_inode - Remove an inode from cache + * @inode: The inode to evict + * + * There are three cases to consider: + * 1. i_nlink == 0, we are final opener (and must deallocate) + * 2. i_nlink == 0, we are not the final opener (and cannot deallocate) + * 3. i_nlink > 0 + * + * If the fs is read only, then we have to treat all cases as per #3 + * since we are unable to do any deallocation. The inode will be + * deallocated by the next read/write node to attempt an allocation + * in the same resource group + * * We have to (at the moment) hold the inodes main lock to cover * the gap between unlocking the shared lock on the iopen lock and * taking the exclusive lock. I'd rather do a shared -> exclusive @@ -1470,6 +1488,8 @@ static void gfs2_evict_inode(struct inode *inode) if (error) goto out_truncate; + /* Case 1 starts here */ + if (S_ISDIR(inode->i_mode) && (ip->i_diskflags & GFS2_DIF_EXHASH)) { error = gfs2_dir_exhash_dealloc(ip); @@ -1493,13 +1513,16 @@ static void gfs2_evict_inode(struct inode *inode) goto out_unlock; out_truncate: + /* Case 2 starts here */ error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); if (error) goto out_unlock; - gfs2_final_release_pages(ip); + /* Needs to be done before glock release & also in a transaction */ + truncate_inode_pages(&inode->i_data, 0); gfs2_trans_end(sdp); out_unlock: + /* Error path for case 1 */ if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) gfs2_glock_dq(&ip->i_iopen_gh); gfs2_holder_uninit(&ip->i_iopen_gh); @@ -1507,6 +1530,7 @@ out_unlock: if (error && error != GLR_TRYFAILED && error != -EROFS) fs_warn(sdp, "gfs2_evict_inode: %d\n", error); out: + /* Case 3 starts here */ truncate_inode_pages(&inode->i_data, 0); end_writeback(inode); -- cgit v1.1