From 819ccbfa448403992ceafc05d6d7097aaa74d4c3 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 1 Apr 2010 09:33:46 -0700 Subject: ceph: fix leaked inode ref due to snap metadata writeback race We create a ceph_cap_snap if there is dirty cap metadata (for writeback to mds) OR dirty pages (for writeback to osd). It is thus possible that the metadata has been written back to the MDS but the OSD data has not when the cap_snap is created. This results in a cap_snap with dirty(caps) == 0. The problem is that cap writeback to the MDS isn't necessary, and a FLUSHSNAP cap op gets no ack from the MDS. This leaves the cap_snap attached to the inode along with its inode reference. Fix the problem by dropping the cap_snap if it becomes 'complete' (all pages written out) and dirty(caps) == 0 in ceph_put_wrbuffer_cap_refs(). Also, BUG() in __ceph_flush_snaps() if we encounter a cap_snap with dirty(caps) == 0. Signed-off-by: Sage Weil --- fs/ceph/snap.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs/ceph/snap.c') diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index df04e21..7e3e5f9 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -521,15 +521,17 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, capsnap->ctime = inode->i_ctime; capsnap->time_warp_seq = ci->i_time_warp_seq; if (capsnap->dirty_pages) { - dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu " + dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu " "still has %d dirty pages\n", inode, capsnap, capsnap->context, capsnap->context->seq, - capsnap->size, capsnap->dirty_pages); + ceph_cap_string(capsnap->dirty), capsnap->size, + capsnap->dirty_pages); return 0; } - dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu clean\n", + dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n", inode, capsnap, capsnap->context, - capsnap->context->seq, capsnap->size); + capsnap->context->seq, ceph_cap_string(capsnap->dirty), + capsnap->size); spin_lock(&mdsc->snap_flush_lock); list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list); -- cgit v1.1 From fc837c8f0446b73a1661339db406c0238dd1d184 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 13 Apr 2010 11:41:22 -0700 Subject: ceph: queue_cap_snap should always queue dirty context This simplifies the calling convention, and fixes a bug where we queue a capsnap with a context other than i_head_snapc (the one that matches the dirty pages). The result was a BUG at fs/ceph/caps.c:2178 on writeback completion when a capsnap matching the writeback snapc could not be found. Signed-off-by: Sage Weil --- fs/ceph/snap.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'fs/ceph/snap.c') diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 7e3e5f9..d1974315 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -430,8 +430,7 @@ static int dup_array(u64 **dst, __le64 *src, int num) * Caller must hold snap_rwsem for read (i.e., the realm topology won't * change). */ -void ceph_queue_cap_snap(struct ceph_inode_info *ci, - struct ceph_snap_context *snapc) +void ceph_queue_cap_snap(struct ceph_inode_info *ci) { struct inode *inode = &ci->vfs_inode; struct ceph_cap_snap *capsnap; @@ -450,10 +449,11 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci, as no new writes are allowed to start when pending, so any writes in progress now were started before the previous cap_snap. lucky us. */ - dout("queue_cap_snap %p snapc %p seq %llu used %d" - " already pending\n", inode, snapc, snapc->seq, used); + dout("queue_cap_snap %p already pending\n", inode); kfree(capsnap); } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) { + struct ceph_snap_context *snapc = ci->i_head_snapc; + igrab(inode); atomic_set(&capsnap->nref, 1); @@ -462,7 +462,6 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci, INIT_LIST_HEAD(&capsnap->flushing_item); capsnap->follows = snapc->seq - 1; - capsnap->context = ceph_get_snap_context(snapc); capsnap->issued = __ceph_caps_issued(ci, NULL); capsnap->dirty = __ceph_caps_dirty(ci); @@ -479,7 +478,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci, snapshot. */ capsnap->dirty_pages = ci->i_wrbuffer_ref_head; ci->i_wrbuffer_ref_head = 0; - ceph_put_snap_context(ci->i_head_snapc); + capsnap->context = snapc; ci->i_head_snapc = NULL; list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); @@ -603,7 +602,7 @@ more: if (lastinode) iput(lastinode); lastinode = inode; - ceph_queue_cap_snap(ci, realm->cached_context); + ceph_queue_cap_snap(ci); spin_lock(&realm->inodes_with_caps_lock); } spin_unlock(&realm->inodes_with_caps_lock); @@ -825,8 +824,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, spin_unlock(&realm->inodes_with_caps_lock); spin_unlock(&inode->i_lock); - ceph_queue_cap_snap(ci, - ci->i_snap_realm->cached_context); + ceph_queue_cap_snap(ci); iput(inode); continue; -- cgit v1.1