diff options
-rw-r--r-- | fs/ocfs2/cluster/tcp_internal.h | 5 | ||||
-rw-r--r-- | fs/ocfs2/dlmglue.c | 119 | ||||
-rw-r--r-- | fs/ocfs2/dlmglue.h | 3 | ||||
-rw-r--r-- | fs/ocfs2/inode.c | 93 | ||||
-rw-r--r-- | fs/ocfs2/inode.h | 5 | ||||
-rw-r--r-- | fs/ocfs2/journal.c | 3 | ||||
-rw-r--r-- | fs/ocfs2/namei.c | 5 | ||||
-rw-r--r-- | fs/ocfs2/ocfs2_fs.h | 4 | ||||
-rw-r--r-- | fs/ocfs2/ocfs2_lockid.h | 5 | ||||
-rw-r--r-- | fs/ocfs2/super.c | 1 |
10 files changed, 205 insertions, 38 deletions
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index 4dae5df..9606111 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -38,6 +38,9 @@ * locking semantics of the file system using the protocol. It should * be somewhere else, I'm sure, but right now it isn't. * + * New in version 8: + * - Replace delete inode votes with a cluster lock + * * New in version 7: * - DLM join domain includes the live nodemap * @@ -57,7 +60,7 @@ * - full 64 bit i_size in the metadata lock lvbs * - introduction of "rw" lock and pushing meta/data locking down */ -#define O2NET_PROTOCOL_VERSION 7ULL +#define O2NET_PROTOCOL_VERSION 8ULL struct o2net_handshake { __be64 protocol_version; __be64 connector_id; diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 31d519a..ca4f0e0 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -225,11 +225,17 @@ static struct ocfs2_lock_res_ops ocfs2_dentry_lops = { .flags = 0, }; +static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = { + .get_osb = ocfs2_get_inode_osb, + .flags = 0, +}; + static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) { return lockres->l_type == OCFS2_LOCK_TYPE_META || lockres->l_type == OCFS2_LOCK_TYPE_DATA || - lockres->l_type == OCFS2_LOCK_TYPE_RW; + lockres->l_type == OCFS2_LOCK_TYPE_RW || + lockres->l_type == OCFS2_LOCK_TYPE_OPEN; } static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) @@ -373,6 +379,9 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, case OCFS2_LOCK_TYPE_DATA: ops = &ocfs2_inode_data_lops; break; + case OCFS2_LOCK_TYPE_OPEN: + ops = &ocfs2_inode_open_lops; + break; default: mlog_bug_on_msg(1, "type: %d\n", type); ops = NULL; /* thanks, gcc */ @@ -1129,6 +1138,12 @@ int ocfs2_create_new_inode_locks(struct inode *inode) goto bail; } + ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0); + if (ret) { + mlog_errno(ret); + goto bail; + } + bail: mlog_exit(ret); return ret; @@ -1182,6 +1197,99 @@ void ocfs2_rw_unlock(struct inode *inode, int write) mlog_exit_void(); } +/* + * ocfs2_open_lock always get PR mode lock. + */ +int ocfs2_open_lock(struct inode *inode) +{ + int status = 0; + struct ocfs2_lock_res *lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + BUG_ON(!inode); + + mlog_entry_void(); + + mlog(0, "inode %llu take PRMODE open lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + if (ocfs2_mount_local(osb)) + goto out; + + lockres = &OCFS2_I(inode)->ip_open_lockres; + + status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, + LKM_PRMODE, 0, 0); + if (status < 0) + mlog_errno(status); + +out: + mlog_exit(status); + return status; +} + +int ocfs2_try_open_lock(struct inode *inode, int write) +{ + int status = 0, level; + struct ocfs2_lock_res *lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + BUG_ON(!inode); + + mlog_entry_void(); + + mlog(0, "inode %llu try to take %s open lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + write ? "EXMODE" : "PRMODE"); + + if (ocfs2_mount_local(osb)) + goto out; + + lockres = &OCFS2_I(inode)->ip_open_lockres; + + level = write ? LKM_EXMODE : LKM_PRMODE; + + /* + * The file system may already holding a PRMODE/EXMODE open lock. + * Since we pass LKM_NOQUEUE, the request won't block waiting on + * other nodes and the -EAGAIN will indicate to the caller that + * this inode is still in use. + */ + status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, + level, LKM_NOQUEUE, 0); + +out: + mlog_exit(status); + return status; +} + +/* + * ocfs2_open_unlock unlock PR and EX mode open locks. + */ +void ocfs2_open_unlock(struct inode *inode) +{ + struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + mlog_entry_void(); + + mlog(0, "inode %llu drop open lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + if (ocfs2_mount_local(osb)) + goto out; + + if(lockres->l_ro_holders) + ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, + LKM_PRMODE); + if(lockres->l_ex_holders) + ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, + LKM_EXMODE); + +out: + mlog_exit_void(); +} + int ocfs2_data_lock_full(struct inode *inode, int write, int arg_flags) @@ -2455,13 +2563,20 @@ int ocfs2_drop_inode_locks(struct inode *inode) * ocfs2_clear_inode has done it for us. */ err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), - &OCFS2_I(inode)->ip_data_lockres); + &OCFS2_I(inode)->ip_open_lockres); if (err < 0) mlog_errno(err); status = err; err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), + &OCFS2_I(inode)->ip_data_lockres); + if (err < 0) + mlog_errno(err); + if (err < 0 && !status) + status = err; + + err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), &OCFS2_I(inode)->ip_meta_lockres); if (err < 0) mlog_errno(err); diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index c343fca..59cb566 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -80,6 +80,9 @@ void ocfs2_data_unlock(struct inode *inode, int write); int ocfs2_rw_lock(struct inode *inode, int write); void ocfs2_rw_unlock(struct inode *inode, int write); +int ocfs2_open_lock(struct inode *inode); +int ocfs2_try_open_lock(struct inode *inode, int write); +void ocfs2_open_unlock(struct inode *inode); int ocfs2_meta_lock_atime(struct inode *inode, struct vfsmount *vfsmnt, int *level); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 28ab56f..10d16a9 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -289,7 +289,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, (unsigned long long)fe->i_blkno); OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); - OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT; OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); inode->i_nlink = le16_to_cpu(fe->i_links_count); @@ -347,6 +346,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, OCFS2_LOCK_TYPE_META, 0, inode); + + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres, + OCFS2_LOCK_TYPE_OPEN, 0, inode); } ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres, @@ -421,7 +423,7 @@ static int ocfs2_read_locked_inode(struct inode *inode, * cluster lock before trusting anything anyway. */ can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE) - && !(args->fi_flags & OCFS2_FI_FLAG_NOLOCK) + && !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) && !ocfs2_mount_local(osb); /* @@ -438,7 +440,17 @@ static int ocfs2_read_locked_inode(struct inode *inode, OCFS2_LOCK_TYPE_META, generation, inode); + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres, + OCFS2_LOCK_TYPE_OPEN, + 0, inode); + if (can_lock) { + status = ocfs2_open_lock(inode); + if (status) { + make_bad_inode(inode); + mlog_errno(status); + return status; + } status = ocfs2_meta_lock(inode, NULL, 0); if (status) { make_bad_inode(inode); @@ -447,6 +459,14 @@ static int ocfs2_read_locked_inode(struct inode *inode, } } + if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) { + status = ocfs2_try_open_lock(inode, 0); + if (status) { + make_bad_inode(inode); + return status; + } + } + status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, can_lock ? inode : NULL); if (status < 0) { @@ -678,10 +698,10 @@ static int ocfs2_wipe_inode(struct inode *inode, struct inode *orphan_dir_inode = NULL; struct buffer_head *orphan_dir_bh = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di; - /* We've already voted on this so it should be readonly - no - * spinlock needed. */ - orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; + di = (struct ocfs2_dinode *) di_bh->b_data; + orphaned_slot = le16_to_cpu(di->i_orphaned_slot); status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot); if (status) @@ -787,6 +807,35 @@ bail: return ret; } +static int ocfs2_request_delete(struct inode *inode) +{ + int status = 0; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (ocfs2_inode_is_new(inode)) + return 0; + + if (ocfs2_node_map_is_only(osb, &osb->mounted_map, + osb->node_num)) + return 0; + /* + * This is how ocfs2 determines whether an inode is still live + * within the cluster. Every node takes a shared read lock on + * the inode open lock in ocfs2_read_locked_inode(). When we + * get to ->delete_inode(), each node tries to convert it's + * lock to an exclusive. Trylocks are serialized by the inode + * meta data lock. If the upconvert suceeds, we know the inode + * is no longer live and can be deleted. + * + * Though we call this with the meta data lock held, the + * trylock keeps us from ABBA deadlock. + */ + status = ocfs2_try_open_lock(inode, 1); + if (status < 0 && status != -EAGAIN) + mlog_errno(status); + return status; +} + /* Query the cluster to determine whether we should wipe an inode from * disk or not. * @@ -839,11 +888,11 @@ static int ocfs2_query_inode_wipe(struct inode *inode, goto bail; } - status = ocfs2_request_delete_vote(inode); - /* -EBUSY means that other nodes are still using the + status = ocfs2_request_delete(inode); + /* -EAGAIN means that other nodes are still using the * inode. We're done here though, so avoid doing anything on * disk and let them worry about deleting it. */ - if (status == -EBUSY) { + if (status == -EAGAIN) { status = 0; mlog(0, "Skipping delete of %llu because it is in use on" "other nodes\n", (unsigned long long)oi->ip_blkno); @@ -854,21 +903,10 @@ static int ocfs2_query_inode_wipe(struct inode *inode, goto bail; } - spin_lock(&oi->ip_lock); - if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) { - /* Nobody knew which slot this inode was orphaned - * into. This may happen during node death and - * recovery knows how to clean it up so we can safely - * ignore this inode for now on. */ - mlog(0, "Nobody knew where inode %llu was orphaned!\n", - (unsigned long long)oi->ip_blkno); - } else { - *wipe = 1; - - mlog(0, "Inode %llu is ok to wipe from orphan dir %d\n", - (unsigned long long)oi->ip_blkno, oi->ip_orphaned_slot); - } - spin_unlock(&oi->ip_lock); + *wipe = 1; + mlog(0, "Inode %llu is ok to wipe from orphan dir %u\n", + (unsigned long long)oi->ip_blkno, + le16_to_cpu(di->i_orphaned_slot)); bail: return status; @@ -1001,11 +1039,16 @@ void ocfs2_clear_inode(struct inode *inode) mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, "Inode=%lu\n", inode->i_ino); + /* For remove delete_inode vote, we hold open lock before, + * now it is time to unlock PR and EX open locks. */ + ocfs2_open_unlock(inode); + /* Do these before all the other work so that we don't bounce * the vote thread while waiting to destroy the locks. */ ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres); ocfs2_mark_lockres_freeing(&oi->ip_data_lockres); + ocfs2_mark_lockres_freeing(&oi->ip_open_lockres); /* We very well may get a clear_inode before all an inodes * metadata has hit disk. Of course, we can't drop any cluster @@ -1030,6 +1073,7 @@ void ocfs2_clear_inode(struct inode *inode) ocfs2_lock_res_free(&oi->ip_rw_lockres); ocfs2_lock_res_free(&oi->ip_meta_lockres); ocfs2_lock_res_free(&oi->ip_data_lockres); + ocfs2_lock_res_free(&oi->ip_open_lockres); ocfs2_metadata_cache_purge(inode); @@ -1086,9 +1130,6 @@ void ocfs2_drop_inode(struct inode *inode) mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n", (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags); - /* Testing ip_orphaned_slot here wouldn't work because we may - * not have gotten a delete_inode vote from any other nodes - * yet. */ if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) generic_delete_inode(inode); else diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 1a7dd29..92d4feb 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -34,6 +34,7 @@ struct ocfs2_inode_info struct ocfs2_lock_res ip_rw_lockres; struct ocfs2_lock_res ip_meta_lockres; struct ocfs2_lock_res ip_data_lockres; + struct ocfs2_lock_res ip_open_lockres; /* protects allocation changes on this inode. */ struct rw_semaphore ip_alloc_sem; @@ -119,8 +120,8 @@ void ocfs2_drop_inode(struct inode *inode); /* Flags for ocfs2_iget() */ #define OCFS2_FI_FLAG_NOWAIT 0x1 #define OCFS2_FI_FLAG_DELETE 0x2 -#define OCFS2_FI_FLAG_SYSFILE 0x4 -#define OCFS2_FI_FLAG_NOLOCK 0x8 +#define OCFS2_FI_FLAG_SYSFILE 0x4 +#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x8 struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags); struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb, u64 blkno, diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 825cb0a..12445a3 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1306,7 +1306,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, continue; iter = ocfs2_iget(osb, le64_to_cpu(de->inode), - OCFS2_FI_FLAG_NOLOCK); + OCFS2_FI_FLAG_ORPHAN_RECOVERY); if (IS_ERR(iter)) continue; @@ -1418,7 +1418,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, /* Set the proper information to get us going into * ocfs2_delete_inode. */ oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; - oi->ip_orphaned_slot = slot; spin_unlock(&oi->ip_lock); iput(inode); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 1fff0c0..a93c15f 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -187,7 +187,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, * unlink. */ spin_lock(&oi->ip_lock); oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED; - oi->ip_orphaned_slot = OCFS2_INVALID_SLOT; spin_unlock(&oi->ip_lock); bail_add: @@ -2220,9 +2219,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, /* Record which orphan dir our inode now resides * in. delete_inode will use this to determine which orphan * dir to lock. */ - spin_lock(&OCFS2_I(inode)->ip_lock); - OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num; - spin_unlock(&OCFS2_I(inode)->ip_lock); + fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); mlog(0, "Inode %llu orphaned in slot %d\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index e61e218..a476b63 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -446,7 +446,9 @@ struct ocfs2_dinode { __le32 i_ctime_nsec; __le32 i_mtime_nsec; __le32 i_attr; - __le32 i_reserved1; + __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL + was set in i_flags */ + __le16 i_reserved1; /*70*/ __le64 i_reserved2[8]; /*B8*/ union { __le64 i_pad1; /* Generic way to refer to this diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index 4d5d565..4ca02b1 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h @@ -44,6 +44,7 @@ enum ocfs2_lock_type { OCFS2_LOCK_TYPE_RENAME, OCFS2_LOCK_TYPE_RW, OCFS2_LOCK_TYPE_DENTRY, + OCFS2_LOCK_TYPE_OPEN, OCFS2_NUM_LOCK_TYPES }; @@ -69,6 +70,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) case OCFS2_LOCK_TYPE_DENTRY: c = 'N'; break; + case OCFS2_LOCK_TYPE_OPEN: + c = 'O'; + break; default: c = '\0'; } @@ -85,6 +89,7 @@ static char *ocfs2_lock_type_strings[] = { * important job it does, anyway. */ [OCFS2_LOCK_TYPE_RW] = "Write/Read", [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", + [OCFS2_LOCK_TYPE_OPEN] = "Open", }; static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 6534f92..16564ea 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -963,6 +963,7 @@ static void ocfs2_inode_init_once(void *data, ocfs2_lock_res_init_once(&oi->ip_rw_lockres); ocfs2_lock_res_init_once(&oi->ip_meta_lockres); ocfs2_lock_res_init_once(&oi->ip_data_lockres); + ocfs2_lock_res_init_once(&oi->ip_open_lockres); ocfs2_metadata_cache_init(&oi->vfs_inode); |