summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h5
-rw-r--r--fs/ocfs2/dlmglue.c119
-rw-r--r--fs/ocfs2/dlmglue.h3
-rw-r--r--fs/ocfs2/inode.c93
-rw-r--r--fs/ocfs2/inode.h5
-rw-r--r--fs/ocfs2/journal.c3
-rw-r--r--fs/ocfs2/namei.c5
-rw-r--r--fs/ocfs2/ocfs2_fs.h4
-rw-r--r--fs/ocfs2/ocfs2_lockid.h5
-rw-r--r--fs/ocfs2/super.c1
10 files changed, 205 insertions, 38 deletions
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 4dae5df..9606111 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -38,6 +38,9 @@
* locking semantics of the file system using the protocol. It should
* be somewhere else, I'm sure, but right now it isn't.
*
+ * New in version 8:
+ * - Replace delete inode votes with a cluster lock
+ *
* New in version 7:
* - DLM join domain includes the live nodemap
*
@@ -57,7 +60,7 @@
* - full 64 bit i_size in the metadata lock lvbs
* - introduction of "rw" lock and pushing meta/data locking down
*/
-#define O2NET_PROTOCOL_VERSION 7ULL
+#define O2NET_PROTOCOL_VERSION 8ULL
struct o2net_handshake {
__be64 protocol_version;
__be64 connector_id;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 31d519a..ca4f0e0 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -225,11 +225,17 @@ static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
.flags = 0,
};
+static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
+ .get_osb = ocfs2_get_inode_osb,
+ .flags = 0,
+};
+
static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
{
return lockres->l_type == OCFS2_LOCK_TYPE_META ||
lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
- lockres->l_type == OCFS2_LOCK_TYPE_RW;
+ lockres->l_type == OCFS2_LOCK_TYPE_RW ||
+ lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
}
static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
@@ -373,6 +379,9 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
case OCFS2_LOCK_TYPE_DATA:
ops = &ocfs2_inode_data_lops;
break;
+ case OCFS2_LOCK_TYPE_OPEN:
+ ops = &ocfs2_inode_open_lops;
+ break;
default:
mlog_bug_on_msg(1, "type: %d\n", type);
ops = NULL; /* thanks, gcc */
@@ -1129,6 +1138,12 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
goto bail;
}
+ ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0);
+ if (ret) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
bail:
mlog_exit(ret);
return ret;
@@ -1182,6 +1197,99 @@ void ocfs2_rw_unlock(struct inode *inode, int write)
mlog_exit_void();
}
+/*
+ * ocfs2_open_lock always get PR mode lock.
+ */
+int ocfs2_open_lock(struct inode *inode)
+{
+ int status = 0;
+ struct ocfs2_lock_res *lockres;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ BUG_ON(!inode);
+
+ mlog_entry_void();
+
+ mlog(0, "inode %llu take PRMODE open lock\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+ if (ocfs2_mount_local(osb))
+ goto out;
+
+ lockres = &OCFS2_I(inode)->ip_open_lockres;
+
+ status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
+ LKM_PRMODE, 0, 0);
+ if (status < 0)
+ mlog_errno(status);
+
+out:
+ mlog_exit(status);
+ return status;
+}
+
+int ocfs2_try_open_lock(struct inode *inode, int write)
+{
+ int status = 0, level;
+ struct ocfs2_lock_res *lockres;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ BUG_ON(!inode);
+
+ mlog_entry_void();
+
+ mlog(0, "inode %llu try to take %s open lock\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ write ? "EXMODE" : "PRMODE");
+
+ if (ocfs2_mount_local(osb))
+ goto out;
+
+ lockres = &OCFS2_I(inode)->ip_open_lockres;
+
+ level = write ? LKM_EXMODE : LKM_PRMODE;
+
+ /*
+ * The file system may already holding a PRMODE/EXMODE open lock.
+ * Since we pass LKM_NOQUEUE, the request won't block waiting on
+ * other nodes and the -EAGAIN will indicate to the caller that
+ * this inode is still in use.
+ */
+ status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
+ level, LKM_NOQUEUE, 0);
+
+out:
+ mlog_exit(status);
+ return status;
+}
+
+/*
+ * ocfs2_open_unlock unlock PR and EX mode open locks.
+ */
+void ocfs2_open_unlock(struct inode *inode)
+{
+ struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ mlog_entry_void();
+
+ mlog(0, "inode %llu drop open lock\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+ if (ocfs2_mount_local(osb))
+ goto out;
+
+ if(lockres->l_ro_holders)
+ ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
+ LKM_PRMODE);
+ if(lockres->l_ex_holders)
+ ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
+ LKM_EXMODE);
+
+out:
+ mlog_exit_void();
+}
+
int ocfs2_data_lock_full(struct inode *inode,
int write,
int arg_flags)
@@ -2455,13 +2563,20 @@ int ocfs2_drop_inode_locks(struct inode *inode)
* ocfs2_clear_inode has done it for us. */
err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
- &OCFS2_I(inode)->ip_data_lockres);
+ &OCFS2_I(inode)->ip_open_lockres);
if (err < 0)
mlog_errno(err);
status = err;
err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
+ &OCFS2_I(inode)->ip_data_lockres);
+ if (err < 0)
+ mlog_errno(err);
+ if (err < 0 && !status)
+ status = err;
+
+ err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
&OCFS2_I(inode)->ip_meta_lockres);
if (err < 0)
mlog_errno(err);
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index c343fca..59cb566 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -80,6 +80,9 @@ void ocfs2_data_unlock(struct inode *inode,
int write);
int ocfs2_rw_lock(struct inode *inode, int write);
void ocfs2_rw_unlock(struct inode *inode, int write);
+int ocfs2_open_lock(struct inode *inode);
+int ocfs2_try_open_lock(struct inode *inode, int write);
+void ocfs2_open_unlock(struct inode *inode);
int ocfs2_meta_lock_atime(struct inode *inode,
struct vfsmount *vfsmnt,
int *level);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 28ab56f..10d16a9 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -289,7 +289,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
(unsigned long long)fe->i_blkno);
OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
- OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT;
OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
inode->i_nlink = le16_to_cpu(fe->i_links_count);
@@ -347,6 +346,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
OCFS2_LOCK_TYPE_META, 0, inode);
+
+ ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
+ OCFS2_LOCK_TYPE_OPEN, 0, inode);
}
ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres,
@@ -421,7 +423,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
* cluster lock before trusting anything anyway.
*/
can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
- && !(args->fi_flags & OCFS2_FI_FLAG_NOLOCK)
+ && !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY)
&& !ocfs2_mount_local(osb);
/*
@@ -438,7 +440,17 @@ static int ocfs2_read_locked_inode(struct inode *inode,
OCFS2_LOCK_TYPE_META,
generation, inode);
+ ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
+ OCFS2_LOCK_TYPE_OPEN,
+ 0, inode);
+
if (can_lock) {
+ status = ocfs2_open_lock(inode);
+ if (status) {
+ make_bad_inode(inode);
+ mlog_errno(status);
+ return status;
+ }
status = ocfs2_meta_lock(inode, NULL, 0);
if (status) {
make_bad_inode(inode);
@@ -447,6 +459,14 @@ static int ocfs2_read_locked_inode(struct inode *inode,
}
}
+ if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) {
+ status = ocfs2_try_open_lock(inode, 0);
+ if (status) {
+ make_bad_inode(inode);
+ return status;
+ }
+ }
+
status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0,
can_lock ? inode : NULL);
if (status < 0) {
@@ -678,10 +698,10 @@ static int ocfs2_wipe_inode(struct inode *inode,
struct inode *orphan_dir_inode = NULL;
struct buffer_head *orphan_dir_bh = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_dinode *di;
- /* We've already voted on this so it should be readonly - no
- * spinlock needed. */
- orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
+ di = (struct ocfs2_dinode *) di_bh->b_data;
+ orphaned_slot = le16_to_cpu(di->i_orphaned_slot);
status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
if (status)
@@ -787,6 +807,35 @@ bail:
return ret;
}
+static int ocfs2_request_delete(struct inode *inode)
+{
+ int status = 0;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (ocfs2_inode_is_new(inode))
+ return 0;
+
+ if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
+ osb->node_num))
+ return 0;
+ /*
+ * This is how ocfs2 determines whether an inode is still live
+ * within the cluster. Every node takes a shared read lock on
+ * the inode open lock in ocfs2_read_locked_inode(). When we
+ * get to ->delete_inode(), each node tries to convert it's
+ * lock to an exclusive. Trylocks are serialized by the inode
+ * meta data lock. If the upconvert suceeds, we know the inode
+ * is no longer live and can be deleted.
+ *
+ * Though we call this with the meta data lock held, the
+ * trylock keeps us from ABBA deadlock.
+ */
+ status = ocfs2_try_open_lock(inode, 1);
+ if (status < 0 && status != -EAGAIN)
+ mlog_errno(status);
+ return status;
+}
+
/* Query the cluster to determine whether we should wipe an inode from
* disk or not.
*
@@ -839,11 +888,11 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
goto bail;
}
- status = ocfs2_request_delete_vote(inode);
- /* -EBUSY means that other nodes are still using the
+ status = ocfs2_request_delete(inode);
+ /* -EAGAIN means that other nodes are still using the
* inode. We're done here though, so avoid doing anything on
* disk and let them worry about deleting it. */
- if (status == -EBUSY) {
+ if (status == -EAGAIN) {
status = 0;
mlog(0, "Skipping delete of %llu because it is in use on"
"other nodes\n", (unsigned long long)oi->ip_blkno);
@@ -854,21 +903,10 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
goto bail;
}
- spin_lock(&oi->ip_lock);
- if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) {
- /* Nobody knew which slot this inode was orphaned
- * into. This may happen during node death and
- * recovery knows how to clean it up so we can safely
- * ignore this inode for now on. */
- mlog(0, "Nobody knew where inode %llu was orphaned!\n",
- (unsigned long long)oi->ip_blkno);
- } else {
- *wipe = 1;
-
- mlog(0, "Inode %llu is ok to wipe from orphan dir %d\n",
- (unsigned long long)oi->ip_blkno, oi->ip_orphaned_slot);
- }
- spin_unlock(&oi->ip_lock);
+ *wipe = 1;
+ mlog(0, "Inode %llu is ok to wipe from orphan dir %u\n",
+ (unsigned long long)oi->ip_blkno,
+ le16_to_cpu(di->i_orphaned_slot));
bail:
return status;
@@ -1001,11 +1039,16 @@ void ocfs2_clear_inode(struct inode *inode)
mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
"Inode=%lu\n", inode->i_ino);
+ /* For remove delete_inode vote, we hold open lock before,
+ * now it is time to unlock PR and EX open locks. */
+ ocfs2_open_unlock(inode);
+
/* Do these before all the other work so that we don't bounce
* the vote thread while waiting to destroy the locks. */
ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
+ ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
/* We very well may get a clear_inode before all an inodes
* metadata has hit disk. Of course, we can't drop any cluster
@@ -1030,6 +1073,7 @@ void ocfs2_clear_inode(struct inode *inode)
ocfs2_lock_res_free(&oi->ip_rw_lockres);
ocfs2_lock_res_free(&oi->ip_meta_lockres);
ocfs2_lock_res_free(&oi->ip_data_lockres);
+ ocfs2_lock_res_free(&oi->ip_open_lockres);
ocfs2_metadata_cache_purge(inode);
@@ -1086,9 +1130,6 @@ void ocfs2_drop_inode(struct inode *inode)
mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n",
(unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags);
- /* Testing ip_orphaned_slot here wouldn't work because we may
- * not have gotten a delete_inode vote from any other nodes
- * yet. */
if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
generic_delete_inode(inode);
else
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 1a7dd29..92d4feb 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -34,6 +34,7 @@ struct ocfs2_inode_info
struct ocfs2_lock_res ip_rw_lockres;
struct ocfs2_lock_res ip_meta_lockres;
struct ocfs2_lock_res ip_data_lockres;
+ struct ocfs2_lock_res ip_open_lockres;
/* protects allocation changes on this inode. */
struct rw_semaphore ip_alloc_sem;
@@ -119,8 +120,8 @@ void ocfs2_drop_inode(struct inode *inode);
/* Flags for ocfs2_iget() */
#define OCFS2_FI_FLAG_NOWAIT 0x1
#define OCFS2_FI_FLAG_DELETE 0x2
-#define OCFS2_FI_FLAG_SYSFILE 0x4
-#define OCFS2_FI_FLAG_NOLOCK 0x8
+#define OCFS2_FI_FLAG_SYSFILE 0x4
+#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x8
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags);
struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
u64 blkno,
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 825cb0a..12445a3 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1306,7 +1306,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
continue;
iter = ocfs2_iget(osb, le64_to_cpu(de->inode),
- OCFS2_FI_FLAG_NOLOCK);
+ OCFS2_FI_FLAG_ORPHAN_RECOVERY);
if (IS_ERR(iter))
continue;
@@ -1418,7 +1418,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
/* Set the proper information to get us going into
* ocfs2_delete_inode. */
oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
- oi->ip_orphaned_slot = slot;
spin_unlock(&oi->ip_lock);
iput(inode);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 1fff0c0..a93c15f 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -187,7 +187,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
* unlink. */
spin_lock(&oi->ip_lock);
oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED;
- oi->ip_orphaned_slot = OCFS2_INVALID_SLOT;
spin_unlock(&oi->ip_lock);
bail_add:
@@ -2220,9 +2219,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
/* Record which orphan dir our inode now resides
* in. delete_inode will use this to determine which orphan
* dir to lock. */
- spin_lock(&OCFS2_I(inode)->ip_lock);
- OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num;
- spin_unlock(&OCFS2_I(inode)->ip_lock);
+ fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
mlog(0, "Inode %llu orphaned in slot %d\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index e61e218..a476b63 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -446,7 +446,9 @@ struct ocfs2_dinode {
__le32 i_ctime_nsec;
__le32 i_mtime_nsec;
__le32 i_attr;
- __le32 i_reserved1;
+ __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL
+ was set in i_flags */
+ __le16 i_reserved1;
/*70*/ __le64 i_reserved2[8];
/*B8*/ union {
__le64 i_pad1; /* Generic way to refer to this
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 4d5d565..4ca02b1 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -44,6 +44,7 @@ enum ocfs2_lock_type {
OCFS2_LOCK_TYPE_RENAME,
OCFS2_LOCK_TYPE_RW,
OCFS2_LOCK_TYPE_DENTRY,
+ OCFS2_LOCK_TYPE_OPEN,
OCFS2_NUM_LOCK_TYPES
};
@@ -69,6 +70,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
case OCFS2_LOCK_TYPE_DENTRY:
c = 'N';
break;
+ case OCFS2_LOCK_TYPE_OPEN:
+ c = 'O';
+ break;
default:
c = '\0';
}
@@ -85,6 +89,7 @@ static char *ocfs2_lock_type_strings[] = {
* important job it does, anyway. */
[OCFS2_LOCK_TYPE_RW] = "Write/Read",
[OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
+ [OCFS2_LOCK_TYPE_OPEN] = "Open",
};
static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 6534f92..16564ea 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -963,6 +963,7 @@ static void ocfs2_inode_init_once(void *data,
ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
ocfs2_lock_res_init_once(&oi->ip_data_lockres);
+ ocfs2_lock_res_init_once(&oi->ip_open_lockres);
ocfs2_metadata_cache_init(&oi->vfs_inode);
OpenPOWER on IntegriCloud