From 252c6728de604d6a897d85e212996811d5c8c46c Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 26 Jul 2011 11:25:27 -0700
Subject: ceph: add flags field to file_info

Reviewed-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/super.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index f5cabef..8febe6f 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -544,7 +544,8 @@ extern void ceph_reservation_status(struct ceph_fs_client *client,
  * we keep buffered readdir results attached to file->private_data
  */
 struct ceph_file_info {
-	int fmode;     /* initialized on open */
+	short fmode;     /* initialized on open */
+	short flags;     /* CEPH_F_* */
 
 	/* readdir: position within the dir */
 	u32 frag;
-- 
cgit v1.1


From 4918b6d140c4822201ebbe2f070875332aff337b Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 26 Jul 2011 11:26:07 -0700
Subject: ceph: add F_SYNC file flag to force sync (non-O_DIRECT) io

This allows us to force IO through the sync path which you normally only
get when multiple clients are reading/writing to the same file or by
mounting with -o sync.  Among other things, this lets test programs verify
correctness with a single mount.

Reviewed-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/file.c  |  6 ++++--
 fs/ceph/ioctl.c | 11 +++++++++++
 fs/ceph/ioctl.h |  1 +
 fs/ceph/super.h |  2 ++
 4 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 4698a5c..44e4fe9 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -643,7 +643,8 @@ again:
 
 	if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
 	    (iocb->ki_filp->f_flags & O_DIRECT) ||
-	    (inode->i_sb->s_flags & MS_SYNCHRONOUS))
+	    (inode->i_sb->s_flags & MS_SYNCHRONOUS) ||
+	    (fi->flags & CEPH_F_SYNC))
 		/* hmm, this isn't really async... */
 		ret = ceph_sync_read(filp, base, len, ppos, &checkeof);
 	else
@@ -720,7 +721,8 @@ retry_snap:
 
 	if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
 	    (iocb->ki_filp->f_flags & O_DIRECT) ||
-	    (inode->i_sb->s_flags & MS_SYNCHRONOUS)) {
+	    (inode->i_sb->s_flags & MS_SYNCHRONOUS) ||
+	    (fi->flags & CEPH_F_SYNC)) {
 		ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
 			&iocb->ki_pos);
 	} else {
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index ef0b5f4..a757a568 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -231,6 +231,14 @@ static long ceph_ioctl_lazyio(struct file *file)
 	return 0;
 }
 
+static long ceph_ioctl_syncio(struct file *file)
+{
+	struct ceph_file_info *fi = file->private_data;
+
+	fi->flags |= CEPH_F_SYNC;
+	return 0;
+}
+
 long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
@@ -249,6 +257,9 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 
 	case CEPH_IOC_LAZYIO:
 		return ceph_ioctl_lazyio(file);
+
+	case CEPH_IOC_SYNCIO:
+		return ceph_ioctl_syncio(file);
 	}
 
 	return -ENOTTY;
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index 52e8fd7..0c5167e 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -40,5 +40,6 @@ struct ceph_ioctl_dataloc {
 				   struct ceph_ioctl_dataloc)
 
 #define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4)
+#define CEPH_IOC_SYNCIO _IO(CEPH_IOCTL_MAGIC, 5)
 
 #endif
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 8febe6f..cdb17d3 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -543,6 +543,8 @@ extern void ceph_reservation_status(struct ceph_fs_client *client,
 /*
  * we keep buffered readdir results attached to file->private_data
  */
+#define CEPH_F_SYNC     1
+
 struct ceph_file_info {
 	short fmode;     /* initialized on open */
 	short flags;     /* CEPH_F_* */
-- 
cgit v1.1


From 9cfa1098dcfb34f71c5f3b7bcdbbb435a0cecab2 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 26 Jul 2011 11:26:18 -0700
Subject: ceph: use flag bit for at_end readdir flag

This saves us a word of memory per file.

Reviewed-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c   | 10 +++++-----
 fs/ceph/super.h |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index ef8f08c..53b441f 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -133,7 +133,7 @@ more:
 		     d_unhashed(dentry) ? "!hashed" : "hashed",
 		     parent->d_subdirs.prev, parent->d_subdirs.next);
 		if (p == &parent->d_subdirs) {
-			fi->at_end = 1;
+			fi->flags |= CEPH_F_ATEND;
 			goto out_unlock;
 		}
 		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
@@ -234,7 +234,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	const int max_bytes = fsc->mount_options->max_readdir_bytes;
 
 	dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
-	if (fi->at_end)
+	if (fi->flags & CEPH_F_ATEND)
 		return 0;
 
 	/* always start with . and .. */
@@ -403,7 +403,7 @@ more:
 		dout("readdir next frag is %x\n", frag);
 		goto more;
 	}
-	fi->at_end = 1;
+	fi->flags |= CEPH_F_ATEND;
 
 	/*
 	 * if dir_release_count still matches the dir, no dentries
@@ -435,7 +435,7 @@ static void reset_readdir(struct ceph_file_info *fi)
 		dput(fi->dentry);
 		fi->dentry = NULL;
 	}
-	fi->at_end = 0;
+	fi->flags &= ~CEPH_F_ATEND;
 }
 
 static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
@@ -458,7 +458,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
 		if (offset != file->f_pos) {
 			file->f_pos = offset;
 			file->f_version = 0;
-			fi->at_end = 0;
+			fi->flags &= ~CEPH_F_ATEND;
 		}
 		retval = offset;
 
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index cdb17d3..a8a2733 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -544,6 +544,7 @@ extern void ceph_reservation_status(struct ceph_fs_client *client,
  * we keep buffered readdir results attached to file->private_data
  */
 #define CEPH_F_SYNC     1
+#define CEPH_F_ATEND    2
 
 struct ceph_file_info {
 	short fmode;     /* initialized on open */
@@ -552,7 +553,6 @@ struct ceph_file_info {
 	/* readdir: position within the dir */
 	u32 frag;
 	struct ceph_mds_request *last_readdir;
-	int at_end;
 
 	/* readdir: position within a frag */
 	unsigned offset;       /* offset of last chunk, adjusted for . and .. */
-- 
cgit v1.1


From af0ed569d7019f1b49e9e51e77b47092e656b00e Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 26 Jul 2011 11:26:31 -0700
Subject: ceph: fix snap writeback when racing with writes

There are two problems that come up when we try to queue a capsnap while a
write is in progress:

 - The FILE_WR cap is held, but not yet dirty, so we may queue a capsnap
   with dirty == 0.  That will crash later in __ceph_flush_snaps().  Or
   on the FILE_WR cap if a write is in progress.
 - We may not have i_head_snapc set, which causes problems pretty quickly.
   Look to the snaprealm in this case.

Reviewed-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/snap.c | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 54b14de..ac030c9 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -449,6 +449,15 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
 	spin_lock(&inode->i_lock);
 	used = __ceph_caps_used(ci);
 	dirty = __ceph_caps_dirty(ci);
+
+	/*
+	 * If there is a write in progress, treat that as a dirty Fw,
+	 * even though it hasn't completed yet; by the time we finish
+	 * up this capsnap it will be.
+	 */
+	if (used & CEPH_CAP_FILE_WR)
+		dirty |= CEPH_CAP_FILE_WR;
+
 	if (__ceph_have_pending_cap_snap(ci)) {
 		/* there is no point in queuing multiple "pending" cap_snaps,
 		   as no new writes are allowed to start when pending, so any
@@ -456,14 +465,22 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
 		   cap_snap.  lucky us. */
 		dout("queue_cap_snap %p already pending\n", inode);
 		kfree(capsnap);
-	} else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR) ||
+	} else if (ci->i_wrbuffer_ref_head ||
 		   (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL|
 			     CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR))) {
 		struct ceph_snap_context *snapc = ci->i_head_snapc;
 
-		dout("queue_cap_snap %p cap_snap %p queuing under %p\n", inode,
-		     capsnap, snapc);
+		/*
+		 * if we are a sync write, we may need to go to the snaprealm
+		 * to get the current snapc.
+		 */
+		if (!snapc)
+			snapc = ci->i_snap_realm->cached_context;
+
+		dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n",
+		     inode, capsnap, snapc, ceph_cap_string(dirty));
 		ihold(inode);
+		BUG_ON(dirty == 0);
 
 		atomic_set(&capsnap->nref, 1);
 		capsnap->ci = ci;
-- 
cgit v1.1


From e77dc3e9c061e50c67e2e1a604d0a370f40db298 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 26 Jul 2011 11:26:41 -0700
Subject: ceph: only queue capsnap if caps are dirty

We used to go into this branch if i_wrbuffer_ref_head was non-zero.  This
was an ancient check from before we were careful about dealing with all
kinds of caps (and not just dirty pages).  It is cleaner to only queue a
capsnap if there is an actual dirty cap.  If we are racing with...
something...we will end up here with ci->i_wrbuffer_refs but no dirty
caps.

Reviewed-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/snap.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index ac030c9..e264371 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -465,9 +465,8 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
 		   cap_snap.  lucky us. */
 		dout("queue_cap_snap %p already pending\n", inode);
 		kfree(capsnap);
-	} else if (ci->i_wrbuffer_ref_head ||
-		   (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL|
-			     CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR))) {
+	} else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL|
+			    CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) {
 		struct ceph_snap_context *snapc = ci->i_head_snapc;
 
 		/*
@@ -480,7 +479,6 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
 		dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n",
 		     inode, capsnap, snapc, ceph_cap_string(dirty));
 		ihold(inode);
-		BUG_ON(dirty == 0);
 
 		atomic_set(&capsnap->nref, 1);
 		capsnap->ci = ci;
-- 
cgit v1.1


From 8f04d42276048b3baff5a5d8fa769f433c62b63e Mon Sep 17 00:00:00 2001
From: Greg Farnum <gregory.farnum@dreamhost.com>
Date: Tue, 26 Jul 2011 11:26:54 -0700
Subject: ceph: report f_bfree based on kb_avail rather than diffing.

Reviewed-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Greg Farnum <gregory.farnum@dreamhost.com>
---
 fs/ceph/super.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index f2f77fd..d4757c9 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -73,8 +73,7 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
 	 */
 	buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
 	buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
-	buf->f_bfree = (le64_to_cpu(st.kb) - le64_to_cpu(st.kb_used)) >>
-		(CEPH_BLOCK_SHIFT-10);
+	buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
 	buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
 
 	buf->f_files = le64_to_cpu(st.num_objects);
-- 
cgit v1.1


From d8de9ab63a57326d21154c13c365f949f53ce8e1 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 26 Jul 2011 11:27:34 -0700
Subject: ceph: avoid carrying Fw cap during write into page cache

The generic_file_aio_write call may block on balance_dirty_pages while we
flush data to the OSDs.  If we hold a reference to the FILE_WR cap during
that interval revocation by the MDS (e.g., to do a stat(2)) may be very
slow.

Reviewed-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/file.c | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 44e4fe9..6c90cf0 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -713,7 +713,7 @@ retry_snap:
 		want = CEPH_CAP_FILE_BUFFER;
 	ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff);
 	if (ret < 0)
-		goto out;
+		goto out_put;
 
 	dout("aio_write %p %llx.%llx %llu~%u  got cap refs on %s\n",
 	     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
@@ -726,8 +726,18 @@ retry_snap:
 		ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
 			&iocb->ki_pos);
 	} else {
-		ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
+		/*
+		 * buffered write; drop Fw early to avoid slow
+		 * revocation if we get stuck on balance_dirty_pages
+		 */
+		int dirty;
+
+		spin_lock(&inode->i_lock);
+		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+		spin_unlock(&inode->i_lock);
+		ceph_put_cap_refs(ci, got);
 
+		ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
 		if ((ret >= 0 || ret == -EIOCBQUEUED) &&
 		    ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
 		     || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
@@ -735,7 +745,12 @@ retry_snap:
 			if (err < 0)
 				ret = err;
 		}
+
+		if (dirty)
+			__mark_inode_dirty(inode, dirty);
+		goto out;
 	}
+
 	if (ret >= 0) {
 		int dirty;
 		spin_lock(&inode->i_lock);
@@ -745,12 +760,13 @@ retry_snap:
 			__mark_inode_dirty(inode, dirty);
 	}
 
-out:
+out_put:
 	dout("aio_write %p %llx.%llx %llu~%u  dropping cap refs on %s\n",
 	     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
 	     ceph_cap_string(got));
 	ceph_put_cap_refs(ci, got);
 
+out:
 	if (ret == -EOLDSNAPC) {
 		dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
 		     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len);
-- 
cgit v1.1


From acda76578813ef893004ecad0e5ad2bb6039e5f7 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 26 Jul 2011 11:27:48 -0700
Subject: ceph: fix bad parent_inode calc in ceph_lookup_open

We were always getting NULL here because the intent file f_dentry is always
NULL at this point, which means we were always passing NULL to
ceph_mdsc_do_request.  In reality, this was fine, since this isn't
currently ever a write operation that needs to get strung on the dir's
unsafe list.

Use the dir explicitly, and only pass it if this open has side-effects that
a dir fsync should flush.

Reviewed-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/file.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 6c90cf0..9b667e9 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -223,7 +223,6 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
 	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
 	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct file *file = nd->intent.open.file;
-	struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
 	struct ceph_mds_request *req;
 	int err;
 	int flags = nd->intent.open.flags - 1;  /* silly vfs! */
@@ -242,7 +241,9 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
 		req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
 	}
 	req->r_locked_dir = dir;           /* caller holds dir->i_mutex */
-	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+	err = ceph_mdsc_do_request(mdsc,
+				   (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
+				   req);
 	dentry = ceph_finish_lookup(req, dentry, err);
 	if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
 		err = ceph_handle_notrace_create(dir, dentry);
-- 
cgit v1.1


From 9bae113a085b790de384bf86f09e15b42a65a985 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 26 Jul 2011 11:27:59 -0700
Subject: ceph: only link open operations to directory unsafe list if
 O_CREAT|O_TRUNC

We only need to put these on the directory unsafe list if they have
side effects that fsync(2) should flush out.

Reviewed-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/file.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 9b667e9..e0115eb 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -194,7 +194,8 @@ int ceph_open(struct inode *inode, struct file *file)
 	req->r_inode = inode;
 	ihold(inode);
 	req->r_num_caps = 1;
-	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+	err = ceph_mdsc_do_request(mdsc, (flags & (O_CREAT|O_TRUNC)) ?
+				   parent_inode : NULL, req);
 	if (!err)
 		err = ceph_init_file(inode, file, req->r_fmode);
 	ceph_mdsc_put_request(req);
-- 
cgit v1.1


From 468640e32c7f6bfdaaa011095cc388786755d159 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 26 Jul 2011 11:28:11 -0700
Subject: ceph: fix ceph_lookup_open intent usage

We weren't properly calling lookup_instantiate_filp when setting up the
lookup intent, which could lead to file leakage on errors.  So:

 - use separate helper for the hidden snapdir translation, immediately
   following the mds request
 - use ceph_finish_lookup for the final dentry/return value dance in the
   exit path
 - lookup_instantiate_filp on success

Reported-by: Al Viro <viro@ZenIV.linux.org.uk>
Reviewed-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c   | 31 ++++++++++++++++++++-----------
 fs/ceph/file.c  | 23 +++++++++++++++--------
 fs/ceph/super.h |  2 ++
 3 files changed, 37 insertions(+), 19 deletions(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 53b441f..f39a409 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -482,18 +482,10 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
 }
 
 /*
- * Process result of a lookup/open request.
- *
- * Mainly, make sure we return the final req->r_dentry (if it already
- * existed) in place of the original VFS-provided dentry when they
- * differ.
- *
- * Gracefully handle the case where the MDS replies with -ENOENT and
- * no trace (which it may do, at its discretion, e.g., if it doesn't
- * care to issue a lease on the negative dentry).
+ * Handle lookups for the hidden .snap directory.
  */
-struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
-				  struct dentry *dentry, int err)
+int ceph_handle_snapdir(struct ceph_mds_request *req,
+			struct dentry *dentry, int err)
 {
 	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
 	struct inode *parent = dentry->d_parent->d_inode;
@@ -510,7 +502,23 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
 		d_add(dentry, inode);
 		err = 0;
 	}
+	return err;
+}
 
+/*
+ * Figure out final result of a lookup/open request.
+ *
+ * Mainly, make sure we return the final req->r_dentry (if it already
+ * existed) in place of the original VFS-provided dentry when they
+ * differ.
+ *
+ * Gracefully handle the case where the MDS replies with -ENOENT and
+ * no trace (which it may do, at its discretion, e.g., if it doesn't
+ * care to issue a lease on the negative dentry).
+ */
+struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
+				  struct dentry *dentry, int err)
+{
 	if (err == -ENOENT) {
 		/* no trace? */
 		err = 0;
@@ -605,6 +613,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
 	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
 	req->r_locked_dir = dir;
 	err = ceph_mdsc_do_request(mdsc, NULL, req);
+	err = ceph_handle_snapdir(req, dentry, err);
 	dentry = ceph_finish_lookup(req, dentry, err);
 	ceph_mdsc_put_request(req);  /* will dput(dentry) */
 	dout("lookup result=%p\n", dentry);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index e0115eb..f34d47d 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -223,8 +223,9 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
 {
 	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
 	struct ceph_mds_client *mdsc = fsc->mdsc;
-	struct file *file = nd->intent.open.file;
+	struct file *file;
 	struct ceph_mds_request *req;
+	struct dentry *ret;
 	int err;
 	int flags = nd->intent.open.flags - 1;  /* silly vfs! */
 
@@ -245,15 +246,21 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
 	err = ceph_mdsc_do_request(mdsc,
 				   (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
 				   req);
-	dentry = ceph_finish_lookup(req, dentry, err);
-	if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
+	err = ceph_handle_snapdir(req, dentry, err);
+	if (err)
+		goto out;
+	if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
 		err = ceph_handle_notrace_create(dir, dentry);
-	if (!err)
-		err = ceph_init_file(req->r_dentry->d_inode, file,
-				     req->r_fmode);
+	if (err)
+		goto out;
+	file = lookup_instantiate_filp(nd, req->r_dentry, ceph_open);
+	if (IS_ERR(file))
+		err = PTR_ERR(file);
+out:
+	ret = ceph_finish_lookup(req, dentry, err);
 	ceph_mdsc_put_request(req);
-	dout("ceph_lookup_open result=%p\n", dentry);
-	return dentry;
+	dout("ceph_lookup_open result=%p\n", ret);
+	return ret;
 }
 
 int ceph_release(struct inode *inode, struct file *file)
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index a8a2733..c24891a 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -791,6 +791,8 @@ extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
 	ceph_snapdir_dentry_ops;
 
 extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
+extern int ceph_handle_snapdir(struct ceph_mds_request *req,
+			       struct dentry *dentry, int err);
 extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
 					 struct dentry *dentry, int err);
 
-- 
cgit v1.1


From 2f90b852e3ae73889d7f6de6ecf429b9b6a6b103 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 26 Jul 2011 11:28:25 -0700
Subject: ceph: ignore lease mask

The lease mask is no longer used (and it changed a while back).  Instead,
use a non-zero duration to indicate that there is a lease being issued.

Reviewed-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/inode.c      | 10 ++++------
 fs/ceph/mds_client.c | 18 +++++++-----------
 fs/ceph/mds_client.h |  2 +-
 3 files changed, 12 insertions(+), 18 deletions(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index d8858e9..f51e873 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -805,14 +805,14 @@ static void update_dentry_lease(struct dentry *dentry,
 		return;
 
 	spin_lock(&dentry->d_lock);
-	dout("update_dentry_lease %p mask %d duration %lu ms ttl %lu\n",
-	     dentry, le16_to_cpu(lease->mask), duration, ttl);
+	dout("update_dentry_lease %p duration %lu ms ttl %lu\n",
+	     dentry, duration, ttl);
 
 	/* make lease_rdcache_gen match directory */
 	dir = dentry->d_parent->d_inode;
 	di->lease_shared_gen = ceph_inode(dir)->i_shared_gen;
 
-	if (lease->mask == 0)
+	if (duration == 0)
 		goto out_unlock;
 
 	if (di->lease_gen == session->s_cap_gen &&
@@ -1022,9 +1022,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 
 		/* do we have a dn lease? */
 		have_lease = have_dir_cap ||
-			(le16_to_cpu(rinfo->dlease->mask) &
-			 CEPH_LOCK_DN);
-
+			le32_to_cpu(rinfo->dlease->duration_ms);
 		if (!have_lease)
 			dout("fill_trace  no dentry lease or dir cap\n");
 
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 0c1d917..3b1e743 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2714,7 +2714,6 @@ static void handle_lease(struct ceph_mds_client *mdsc,
 	struct ceph_mds_lease *h = msg->front.iov_base;
 	u32 seq;
 	struct ceph_vino vino;
-	int mask;
 	struct qstr dname;
 	int release = 0;
 
@@ -2725,7 +2724,6 @@ static void handle_lease(struct ceph_mds_client *mdsc,
 		goto bad;
 	vino.ino = le64_to_cpu(h->ino);
 	vino.snap = CEPH_NOSNAP;
-	mask = le16_to_cpu(h->mask);
 	seq = le32_to_cpu(h->seq);
 	dname.name = (void *)h + sizeof(*h) + sizeof(u32);
 	dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
@@ -2737,8 +2735,8 @@ static void handle_lease(struct ceph_mds_client *mdsc,
 
 	/* lookup inode */
 	inode = ceph_find_inode(sb, vino);
-	dout("handle_lease %s, mask %d, ino %llx %p %.*s\n",
-	     ceph_lease_op_name(h->action), mask, vino.ino, inode,
+	dout("handle_lease %s, ino %llx %p %.*s\n",
+	     ceph_lease_op_name(h->action), vino.ino, inode,
 	     dname.len, dname.name);
 	if (inode == NULL) {
 		dout("handle_lease no inode %llx\n", vino.ino);
@@ -2828,7 +2826,6 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
 		return;
 	lease = msg->front.iov_base;
 	lease->action = action;
-	lease->mask = cpu_to_le16(1);
 	lease->ino = cpu_to_le64(ceph_vino(inode).ino);
 	lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
 	lease->seq = cpu_to_le32(seq);
@@ -2850,7 +2847,7 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
  * Pass @inode always, @dentry is optional.
  */
 void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
-			     struct dentry *dentry, int mask)
+			     struct dentry *dentry)
 {
 	struct ceph_dentry_info *di;
 	struct ceph_mds_session *session;
@@ -2858,7 +2855,6 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
 
 	BUG_ON(inode == NULL);
 	BUG_ON(dentry == NULL);
-	BUG_ON(mask == 0);
 
 	/* is dentry lease valid? */
 	spin_lock(&dentry->d_lock);
@@ -2868,8 +2864,8 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
 	    di->lease_gen != di->lease_session->s_cap_gen ||
 	    !time_before(jiffies, dentry->d_time)) {
 		dout("lease_release inode %p dentry %p -- "
-		     "no lease on %d\n",
-		     inode, dentry, mask);
+		     "no lease\n",
+		     inode, dentry);
 		spin_unlock(&dentry->d_lock);
 		return;
 	}
@@ -2880,8 +2876,8 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
 	__ceph_mdsc_drop_dentry_lease(dentry);
 	spin_unlock(&dentry->d_lock);
 
-	dout("lease_release inode %p dentry %p mask %d to mds%d\n",
-	     inode, dentry, mask, session->s_mds);
+	dout("lease_release inode %p dentry %p to mds%d\n",
+	     inode, dentry, session->s_mds);
 	ceph_mdsc_lease_send_msg(session, inode, dentry,
 				 CEPH_MDS_LEASE_RELEASE, seq);
 	ceph_put_mds_session(session);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 7d8a0d6..8a40f06 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -333,7 +333,7 @@ extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
 
 extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
 				    struct inode *inode,
-				    struct dentry *dn, int mask);
+				    struct dentry *dn);
 
 extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
 
-- 
cgit v1.1


From e9852227431a0ed6ceda064f33e4218757acab6c Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Fri, 22 Jul 2011 11:12:28 -0700
Subject: ceph: set up readahead size when rsize is not passed

This should improve the default read performance, as without it
readahead is practically disabled.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
---
 fs/ceph/super.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs/ceph')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index d4757c9..d47c5ec 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -779,6 +779,10 @@ static int ceph_register_bdi(struct super_block *sb,
 		fsc->backing_dev_info.ra_pages =
 			(fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
 			>> PAGE_SHIFT;
+	else
+		fsc->backing_dev_info.ra_pages =
+			default_backing_dev_info.ra_pages;
+
 	err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d",
 			   atomic_long_inc_return(&bdi_seq));
 	if (!err)
-- 
cgit v1.1


From dfabbed6fdd509dc2beb89c954bc36014a1bc7cb Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 26 Jul 2011 11:30:02 -0700
Subject: ceph: set dir complete frag after adding capability

Curretly ceph_add_cap clears the complete bit if we are newly issued the
FILE_SHARED cap, which is normally the case for a newly issue cap on a new
directory.  That means we clear the just-set bit.  Move the check that sets
the flag to after the cap is added/updated.

Reviewed-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/inode.c | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index f51e873..2717dc4 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -560,7 +560,8 @@ static int fill_inode(struct inode *inode,
 	struct ceph_mds_reply_inode *info = iinfo->in;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int i;
-	int issued, implemented;
+	int issued = 0, implemented;
+	int updating_inode = 0;
 	struct timespec mtime, atime, ctime;
 	u32 nsplits;
 	struct ceph_buffer *xattr_blob = NULL;
@@ -599,7 +600,8 @@ static int fill_inode(struct inode *inode,
 	if (le64_to_cpu(info->version) > 0 &&
 	    (ci->i_version & ~1) >= le64_to_cpu(info->version))
 		goto no_change;
-
+	
+	updating_inode = 1;
 	issued = __ceph_caps_issued(ci, &implemented);
 	issued |= implemented | __ceph_caps_dirty(ci);
 
@@ -707,17 +709,6 @@ static int fill_inode(struct inode *inode,
 		ci->i_rfiles = le64_to_cpu(info->rfiles);
 		ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
 		ceph_decode_timespec(&ci->i_rctime, &info->rctime);
-
-		/* set dir completion flag? */
-		if (ci->i_files == 0 && ci->i_subdirs == 0 &&
-		    ceph_snap(inode) == CEPH_NOSNAP &&
-		    (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
-		    (issued & CEPH_CAP_FILE_EXCL) == 0 &&
-		    (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
-			dout(" marking %p complete (empty)\n", inode);
-			/* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
-			ci->i_max_offset = 2;
-		}
 		break;
 	default:
 		pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
@@ -774,6 +765,19 @@ no_change:
 		__ceph_get_fmode(ci, cap_fmode);
 	}
 
+	/* set dir completion flag? */
+	if (S_ISDIR(inode->i_mode) &&
+	    updating_inode &&                 /* didn't jump to no_change */
+	    ci->i_files == 0 && ci->i_subdirs == 0 &&
+	    ceph_snap(inode) == CEPH_NOSNAP &&
+	    (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
+	    (issued & CEPH_CAP_FILE_EXCL) == 0 &&
+	    (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
+		dout(" marking %p complete (empty)\n", inode);
+		/* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
+		ci->i_max_offset = 2;
+	}
+
 	/* update delegation info? */
 	if (dirinfo)
 		ceph_fill_dirfrag(inode, dirinfo);
-- 
cgit v1.1


From 48d0cbd1242aac969560ef8b90f26ee3b09a6a5c Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 26 Jul 2011 11:30:15 -0700
Subject: ceph: handle racing calls to ceph_init_dentry

The ->lookup() and prepopulate_readdir() callers are working with unhashed
dentries, so we don't have to worry.  The export.c callers, though, need
to initialize something they got back from d_obtain_alias() and are
potentially racing with other callers.  Make sure we don't return unless
the dentry is properly initialized (by us or someone else).

Reported-by: Al Viro <viro@ZenIV.linux.org.uk>
Reviewed-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index f39a409..883c954 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -40,14 +40,6 @@ int ceph_init_dentry(struct dentry *dentry)
 	if (dentry->d_fsdata)
 		return 0;
 
-	if (dentry->d_parent == NULL ||   /* nfs fh_to_dentry */
-	    ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
-		d_set_d_op(dentry, &ceph_dentry_ops);
-	else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
-		d_set_d_op(dentry, &ceph_snapdir_dentry_ops);
-	else
-		d_set_d_op(dentry, &ceph_snap_dentry_ops);
-
 	di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
 	if (!di)
 		return -ENOMEM;          /* oh well */
@@ -58,10 +50,21 @@ int ceph_init_dentry(struct dentry *dentry)
 		kmem_cache_free(ceph_dentry_cachep, di);
 		goto out_unlock;
 	}
+
+	if (dentry->d_parent == NULL ||   /* nfs fh_to_dentry */
+	    ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
+		d_set_d_op(dentry, &ceph_dentry_ops);
+	else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
+		d_set_d_op(dentry, &ceph_snapdir_dentry_ops);
+	else
+		d_set_d_op(dentry, &ceph_snap_dentry_ops);
+
 	di->dentry = dentry;
 	di->lease_session = NULL;
-	dentry->d_fsdata = di;
 	dentry->d_time = jiffies;
+	/* avoid reordering d_fsdata setup so that the check above is safe */
+	smp_mb();
+	dentry->d_fsdata = di;
 	ceph_dentry_lru_add(dentry);
 out_unlock:
 	spin_unlock(&dentry->d_lock);
-- 
cgit v1.1


From 5f21c96dd5c615341963036ae8f5e4f5227a818d Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 26 Jul 2011 11:30:29 -0700
Subject: ceph: protect access to d_parent

d_parent is protected by d_lock: use it when looking up a dentry's parent
directory inode.  Also take a reference and drop it in the caller to avoid
a use-after-free.

Reported-by: Al Viro <viro@ZenIV.linux.org.uk>
Reviewed-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c   | 15 +++++++++++++++
 fs/ceph/file.c  |  8 +++++---
 fs/ceph/inode.c |  4 +++-
 fs/ceph/ioctl.c |  4 +++-
 fs/ceph/super.h |  9 +--------
 fs/ceph/xattr.c |  8 ++++++--
 6 files changed, 33 insertions(+), 15 deletions(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 883c954..ed296ec 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -71,6 +71,21 @@ out_unlock:
 	return 0;
 }
 
+struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry)
+{
+	struct inode *inode = NULL;
+
+	if (!dentry)
+		return NULL;
+
+	spin_lock(&dentry->d_lock);
+	if (dentry->d_parent) {
+		inode = dentry->d_parent->d_inode;
+		ihold(inode);
+	}
+	spin_unlock(&dentry->d_lock);
+	return inode;
+}
 
 
 /*
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index f34d47d..45fbd69 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -122,7 +122,7 @@ int ceph_open(struct inode *inode, struct file *file)
 	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	struct ceph_file_info *cf = file->private_data;
-	struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
+	struct inode *parent_inode = NULL;
 	int err;
 	int flags, fmode, wanted;
 
@@ -194,8 +194,10 @@ int ceph_open(struct inode *inode, struct file *file)
 	req->r_inode = inode;
 	ihold(inode);
 	req->r_num_caps = 1;
-	err = ceph_mdsc_do_request(mdsc, (flags & (O_CREAT|O_TRUNC)) ?
-				   parent_inode : NULL, req);
+	if (flags & (O_CREAT|O_TRUNC))
+		parent_inode = ceph_get_dentry_parent_inode(file->f_dentry);
+	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+	iput(parent_inode);
 	if (!err)
 		err = ceph_init_file(inode, file, req->r_fmode);
 	ceph_mdsc_put_request(req);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 2717dc4..a7db56f 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1562,7 +1562,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct inode *parent_inode = dentry->d_parent->d_inode;
+	struct inode *parent_inode;
 	const unsigned int ia_valid = attr->ia_valid;
 	struct ceph_mds_request *req;
 	struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
@@ -1745,7 +1745,9 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 		req->r_inode_drop = release;
 		req->r_args.setattr.mask = cpu_to_le32(mask);
 		req->r_num_caps = 1;
+		parent_inode = ceph_get_dentry_parent_inode(dentry);
 		err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+		iput(parent_inode);
 	}
 	dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
 	     ceph_cap_string(dirtied), mask);
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index a757a568..3b256b5 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -38,7 +38,7 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
 static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
 {
 	struct inode *inode = file->f_dentry->d_inode;
-	struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
+	struct inode *parent_inode;
 	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_mds_request *req;
 	struct ceph_ioctl_layout l;
@@ -87,7 +87,9 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
 	req->r_args.setlayout.layout.fl_pg_preferred =
 		cpu_to_le32(l.preferred_osd);
 
+	parent_inode = ceph_get_dentry_parent_inode(file->f_dentry);
 	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+	iput(parent_inode);
 	ceph_mdsc_put_request(req);
 	return err;
 }
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index c24891a..c1eb9a0 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -801,6 +801,7 @@ extern void ceph_dentry_lru_touch(struct dentry *dn);
 extern void ceph_dentry_lru_del(struct dentry *dn);
 extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
 extern unsigned ceph_dentry_hash(struct dentry *dn);
+extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry);
 
 /*
  * our d_ops vary depending on whether the inode is live,
@@ -823,14 +824,6 @@ extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p,
 			     int p_locks, int f_locks);
 extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c);
 
-static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
-{
-	if (dentry && dentry->d_parent)
-		return dentry->d_parent->d_inode;
-
-	return NULL;
-}
-
 /* debugfs.c */
 extern int ceph_fs_debugfs_init(struct ceph_fs_client *client);
 extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index f42d730..96c6739 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -629,7 +629,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
 	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
 	struct inode *inode = dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct inode *parent_inode = dentry->d_parent->d_inode;
+	struct inode *parent_inode;
 	struct ceph_mds_request *req;
 	struct ceph_mds_client *mdsc = fsc->mdsc;
 	int err;
@@ -677,7 +677,9 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
 	req->r_data_len = size;
 
 	dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
+	parent_inode = ceph_get_dentry_parent_inode(dentry);
 	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+	iput(parent_inode);
 	ceph_mdsc_put_request(req);
 	dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
 
@@ -788,7 +790,7 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)
 	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
 	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct inode *inode = dentry->d_inode;
-	struct inode *parent_inode = dentry->d_parent->d_inode;
+	struct inode *parent_inode;
 	struct ceph_mds_request *req;
 	int err;
 
@@ -802,7 +804,9 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)
 	req->r_num_caps = 1;
 	req->r_path2 = kstrdup(name, GFP_NOFS);
 
+	parent_inode = ceph_get_dentry_parent_inode(dentry);
 	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+	iput(parent_inode);
 	ceph_mdsc_put_request(req);
 	return err;
 }
-- 
cgit v1.1


From bf1c6aca96c9d2f117dc7e590c2bc2304e7febe1 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 26 Jul 2011 11:30:43 -0700
Subject: ceph: protect d_parent access in ceph_d_revalidate

Protect d_parent with d_lock.  Carry a reference.  Simplify the flow so
that there is a single exit point and cleanup.

Reviewed-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index ed296ec..31d27f8 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1024,36 +1024,38 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
  */
 static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
+	int valid = 0;
 	struct inode *dir;
 
 	if (nd && nd->flags & LOOKUP_RCU)
 		return -ECHILD;
 
-	dir = dentry->d_parent->d_inode;
-
 	dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
 	     dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
 	     ceph_dentry(dentry)->offset);
 
+	dir = ceph_get_dentry_parent_inode(dentry);
+
 	/* always trust cached snapped dentries, snapdir dentry */
 	if (ceph_snap(dir) != CEPH_NOSNAP) {
 		dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry,
 		     dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
-		goto out_touch;
+		valid = 1;
+	} else if (dentry->d_inode &&
+		   ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) {
+		valid = 1;
+	} else if (dentry_lease_is_valid(dentry) ||
+		   dir_lease_is_valid(dir, dentry)) {
+		valid = 1;
 	}
-	if (dentry->d_inode && ceph_snap(dentry->d_inode) == CEPH_SNAPDIR)
-		goto out_touch;
 
-	if (dentry_lease_is_valid(dentry) ||
-	    dir_lease_is_valid(dir, dentry))
-		goto out_touch;
-
-	dout("d_revalidate %p invalid\n", dentry);
-	d_drop(dentry);
-	return 0;
-out_touch:
-	ceph_dentry_lru_touch(dentry);
-	return 1;
+	dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
+	if (valid)
+		ceph_dentry_lru_touch(dentry);
+	else
+		d_drop(dentry);
+	iput(dir);
+	return valid;
 }
 
 /*
-- 
cgit v1.1


From e5f86dc377e7ff2b4195831153a85a3e76fefff2 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 26 Jul 2011 11:30:55 -0700
Subject: ceph: avoid d_parent in ceph_dentry_hash; fix ceph_encode_fh()
 hashing bug

Have caller pass in a safely-obtained reference to the parent directory
for calculating a dentry's hash valud.

While we're here, simpify the flow through ceph_encode_fh() so that there
is a single exit point and cleanup.

Also fix a bug with the dentry hash calculation: calculate the hash for the
dentry we were given, not its parent.

Reviewed-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c        |  3 +--
 fs/ceph/export.c     | 24 +++++++++++++++---------
 fs/ceph/mds_client.c |  2 +-
 fs/ceph/super.h      |  2 +-
 4 files changed, 18 insertions(+), 13 deletions(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 31d27f8..33a19df 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1244,9 +1244,8 @@ void ceph_dentry_lru_del(struct dentry *dn)
  * Return name hash for a given dentry.  This is dependent on
  * the parent directory's hash function.
  */
-unsigned ceph_dentry_hash(struct dentry *dn)
+unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
 {
-	struct inode *dir = dn->d_parent->d_inode;
 	struct ceph_inode_info *dci = ceph_inode(dir);
 
 	switch (dci->i_dir_layout.dl_dir_hash) {
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index f67b687..9fbcdeca 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -46,7 +46,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
 	int type;
 	struct ceph_nfs_fh *fh = (void *)rawfh;
 	struct ceph_nfs_confh *cfh = (void *)rawfh;
-	struct dentry *parent = dentry->d_parent;
+	struct dentry *parent;
 	struct inode *inode = dentry->d_inode;
 	int connected_handle_length = sizeof(*cfh)/4;
 	int handle_length = sizeof(*fh)/4;
@@ -55,26 +55,33 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
 	if (ceph_snap(inode) != CEPH_NOSNAP)
 		return -EINVAL;
 
+	spin_lock(&dentry->d_lock);
+	parent = dget(dentry->d_parent);
+	spin_unlock(&dentry->d_lock);
+
 	if (*max_len >= connected_handle_length) {
 		dout("encode_fh %p connectable\n", dentry);
 		cfh->ino = ceph_ino(dentry->d_inode);
 		cfh->parent_ino = ceph_ino(parent->d_inode);
-		cfh->parent_name_hash = ceph_dentry_hash(parent);
+		cfh->parent_name_hash = ceph_dentry_hash(parent->d_inode,
+							 dentry);
 		*max_len = connected_handle_length;
 		type = 2;
 	} else if (*max_len >= handle_length) {
 		if (connectable) {
 			*max_len = connected_handle_length;
-			return 255;
+			type = 255;
+		} else {
+			dout("encode_fh %p\n", dentry);
+			fh->ino = ceph_ino(dentry->d_inode);
+			*max_len = handle_length;
+			type = 1;
 		}
-		dout("encode_fh %p\n", dentry);
-		fh->ino = ceph_ino(dentry->d_inode);
-		*max_len = handle_length;
-		type = 1;
 	} else {
 		*max_len = handle_length;
-		return 255;
+		type = 255;
 	}
+	dput(parent);
 	return type;
 }
 
@@ -123,7 +130,6 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
 		return dentry;
 	}
 	err = ceph_init_dentry(dentry);
-
 	if (err < 0) {
 		iput(inode);
 		return ERR_PTR(err);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 3b1e743..8a09cd5 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -670,7 +670,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
 		} else {
 			/* dir + name */
 			inode = dir;
-			hash = ceph_dentry_hash(req->r_dentry);
+			hash = ceph_dentry_hash(dir, req->r_dentry);
 			is_hash = true;
 		}
 	}
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index c1eb9a0..35dc965 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -800,7 +800,7 @@ extern void ceph_dentry_lru_add(struct dentry *dn);
 extern void ceph_dentry_lru_touch(struct dentry *dn);
 extern void ceph_dentry_lru_del(struct dentry *dn);
 extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
-extern unsigned ceph_dentry_hash(struct dentry *dn);
+extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn);
 extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry);
 
 /*
-- 
cgit v1.1


From 4f1772645296a230e04f5c53e79cfb6f841ce634 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 26 Jul 2011 11:31:08 -0700
Subject: ceph: document locking for ceph_set_dentry_offset

Reviewed-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/inode.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index a7db56f..5f0013a 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -843,11 +843,13 @@ out_unlock:
 /*
  * Set dentry's directory position based on the current dir's max, and
  * order it in d_subdirs, so that dcache_readdir behaves.
+ *
+ * Always called under directory's i_mutex.
  */
 static void ceph_set_dentry_offset(struct dentry *dn)
 {
 	struct dentry *dir = dn->d_parent;
-	struct inode *inode = dn->d_parent->d_inode;
+	struct inode *inode = dir->d_inode;
 	struct ceph_dentry_info *di;
 
 	BUG_ON(!inode);
-- 
cgit v1.1


From 41b02e1f9bb87b07d792b64aaeb7af3d00d69cd2 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 26 Jul 2011 11:31:14 -0700
Subject: ceph: explicitly reference rename old_dentry parent dir in request

We carry a pin on the parent directory for the rename source and dest
dentries.  For the source it's r_locked_dir; we need to explicitly
reference the old_dentry parent as well, since the dentry's d_parent may
change between when the request was created and pinned and when it is
freed.

Reviewed-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/debugfs.c    |  2 +-
 fs/ceph/dir.c        |  2 ++
 fs/ceph/mds_client.c | 23 +++++++++++++----------
 fs/ceph/mds_client.h |  1 +
 4 files changed, 17 insertions(+), 11 deletions(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 0dba691..fb962ef 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -102,7 +102,7 @@ static int mdsc_show(struct seq_file *s, void *p)
 				path = NULL;
 			spin_lock(&req->r_old_dentry->d_lock);
 			seq_printf(s, " #%llx/%.*s (%s)",
-			   ceph_ino(req->r_old_dentry->d_parent->d_inode),
+			   ceph_ino(req->r_old_dentry_dir),
 				   req->r_old_dentry->d_name.len,
 				   req->r_old_dentry->d_name.name,
 				   path ? path : "");
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 33a19df..7263f82 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -811,6 +811,7 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
 	req->r_dentry = dget(dentry);
 	req->r_num_caps = 2;
 	req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
+	req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry);
 	req->r_locked_dir = dir;
 	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
 	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
@@ -909,6 +910,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
 	req->r_dentry = dget(new_dentry);
 	req->r_num_caps = 2;
 	req->r_old_dentry = dget(old_dentry);
+	req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry);
 	req->r_locked_dir = new_dir;
 	req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
 	req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 8a09cd5..66a8939 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -483,22 +483,26 @@ void ceph_mdsc_release_request(struct kref *kref)
 		destroy_reply_info(&req->r_reply_info);
 	}
 	if (req->r_inode) {
-		ceph_put_cap_refs(ceph_inode(req->r_inode),
-				  CEPH_CAP_PIN);
+		ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
 		iput(req->r_inode);
 	}
 	if (req->r_locked_dir)
-		ceph_put_cap_refs(ceph_inode(req->r_locked_dir),
-				  CEPH_CAP_PIN);
+		ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
 	if (req->r_target_inode)
 		iput(req->r_target_inode);
 	if (req->r_dentry)
 		dput(req->r_dentry);
 	if (req->r_old_dentry) {
-		ceph_put_cap_refs(
-			ceph_inode(req->r_old_dentry->d_parent->d_inode),
-			CEPH_CAP_PIN);
+		/*
+		 * track (and drop pins for) r_old_dentry_dir
+		 * separately, since r_old_dentry's d_parent may have
+		 * changed between the dir mutex being dropped and
+		 * this request being freed.
+		 */
+		ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
+				  CEPH_CAP_PIN);
 		dput(req->r_old_dentry);
+		iput(req->r_old_dentry_dir);
 	}
 	kfree(req->r_path1);
 	kfree(req->r_path2);
@@ -1931,9 +1935,8 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
 	if (req->r_locked_dir)
 		ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
 	if (req->r_old_dentry)
-		ceph_get_cap_refs(
-			ceph_inode(req->r_old_dentry->d_parent->d_inode),
-			CEPH_CAP_PIN);
+		ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
+				  CEPH_CAP_PIN);
 
 	/* issue */
 	mutex_lock(&mdsc->mutex);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 8a40f06..4bb2399 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -171,6 +171,7 @@ struct ceph_mds_request {
 	struct inode *r_inode;              /* arg1 */
 	struct dentry *r_dentry;            /* arg1 */
 	struct dentry *r_old_dentry;        /* arg2: rename from or link from */
+	struct inode *r_old_dentry_dir;     /* arg2: old dentry's parent dir */
 	char *r_path1, *r_path2;
 	struct ceph_vino r_ino1, r_ino2;
 
-- 
cgit v1.1


From d79698da32b317e96216236f265a9b72b78ae568 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 26 Jul 2011 11:31:26 -0700
Subject: ceph: document unlocked d_parent accesses

For the most part we don't care about racing with rename when directing
MDS requests; either the old or new parent is fine.  Document that, and
do some minor cleanup.

Reviewed-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c        |  2 +-
 fs/ceph/mds_client.c | 13 ++++++++++---
 2 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 7263f82..852ff86 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -506,7 +506,7 @@ int ceph_handle_snapdir(struct ceph_mds_request *req,
 			struct dentry *dentry, int err)
 {
 	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
-	struct inode *parent = dentry->d_parent->d_inode;
+	struct inode *parent = dentry->d_parent->d_inode; /* we hold i_mutex */
 
 	/* .snap dir? */
 	if (err == -ENOENT &&
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 66a8939..fee028b 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -621,6 +621,12 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
  */
 struct dentry *get_nonsnap_parent(struct dentry *dentry)
 {
+	/*
+	 * we don't need to worry about protecting the d_parent access
+	 * here because we never renaming inside the snapped namespace
+	 * except to resplice to another snapdir, and either the old or new
+	 * result is a valid result.
+	 */
 	while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
 		dentry = dentry->d_parent;
 	return dentry;
@@ -656,7 +662,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
 	if (req->r_inode) {
 		inode = req->r_inode;
 	} else if (req->r_dentry) {
-		struct inode *dir = req->r_dentry->d_parent->d_inode;
+		/* ignore race with rename; old or new d_parent is okay */
+		struct dentry *parent = req->r_dentry->d_parent;
+		struct inode *dir = parent->d_inode;
 
 		if (dir->i_sb != mdsc->fsc->sb) {
 			/* not this fs! */
@@ -664,8 +672,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
 		} else if (ceph_snap(dir) != CEPH_NOSNAP) {
 			/* direct snapped/virtual snapdir requests
 			 * based on parent dir inode */
-			struct dentry *dn =
-				get_nonsnap_parent(req->r_dentry->d_parent);
+			struct dentry *dn = get_nonsnap_parent(parent);
 			inode = dn->d_inode;
 			dout("__choose_mds using nonsnap parent %p\n", inode);
 		} else if (req->r_dentry->d_inode) {
-- 
cgit v1.1