From 02b9984d640873b7b3809e63f81a0d7e13496886 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 13 Mar 2014 10:14:33 -0400 Subject: fs: push sync_filesystem() down to the file system's remount_fs() Previously, the no-op "mount -o mount /dev/xxx" operation when the file system is already mounted read-write causes an implied, unconditional syncfs(). This seems pretty stupid, and it's certainly documented or guaraunteed to do this, nor is it particularly useful, except in the case where the file system was mounted rw and is getting remounted read-only. However, it's possible that there might be some file systems that are actually depending on this behavior. In most file systems, it's probably fine to only call sync_filesystem() when transitioning from read-write to read-only, and there are some file systems where this is not needed at all (for example, for a pseudo-filesystem or something like romfs). Signed-off-by: "Theodore Ts'o" Cc: linux-fsdevel@vger.kernel.org Cc: Christoph Hellwig Cc: Artem Bityutskiy Cc: Adrian Hunter Cc: Evgeniy Dushistov Cc: Jan Kara Cc: OGAWA Hirofumi Cc: Anders Larsen Cc: Phillip Lougher Cc: Kees Cook Cc: Mikulas Patocka Cc: Petr Vandrovec Cc: xfs@oss.sgi.com Cc: linux-btrfs@vger.kernel.org Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Cc: codalist@coda.cs.cmu.edu Cc: linux-ext4@vger.kernel.org Cc: linux-f2fs-devel@lists.sourceforge.net Cc: fuse-devel@lists.sourceforge.net Cc: cluster-devel@redhat.com Cc: linux-mtd@lists.infradead.org Cc: jfs-discussion@lists.sourceforge.net Cc: linux-nfs@vger.kernel.org Cc: linux-nilfs@vger.kernel.org Cc: linux-ntfs-dev@lists.sourceforge.net Cc: ocfs2-devel@oss.oracle.com Cc: reiserfs-devel@vger.kernel.org --- fs/ocfs2/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 49d84f8..5f9bf8f 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -631,6 +631,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) struct ocfs2_super *osb = OCFS2_SB(sb); u32 tmp; + sync_filesystem(sb); + if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || !ocfs2_check_set_options(sb, &parsed_options)) { ret = -EINVAL; -- cgit v1.1 From 58f86cc89c3372d3e61d5b71e5513ec5a0b02848 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 24 Mar 2014 12:00:34 +1030 Subject: VERIFY_OCTAL_PERMISSIONS: stricter checking for sysfs perms. Summary of http://lkml.org/lkml/2014/3/14/363 : Ted: module_param(queue_depth, int, 444) Joe: 0444! Rusty: User perms >= group perms >= other perms? Joe: CLASS_ATTR, DEVICE_ATTR, SENSOR_ATTR and SENSOR_ATTR_2? Side effect of stricter permissions means removing the unnecessary S_IFREG from several callers. Note that the BUILD_BUG_ON_ZERO((perm) & 2) test was removed: a fair number of drivers fail this test, so that will be the debate for a future patch. Suggested-by: Joe Perches Acked-by: Bjorn Helgaas for drivers/pci/slot.c Acked-by: Greg Kroah-Hartman Cc: Miklos Szeredi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Rusty Russell --- fs/ocfs2/cluster/sys.c | 2 +- fs/ocfs2/stackglue.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c index a4b0773..b7f5727 100644 --- a/fs/ocfs2/cluster/sys.c +++ b/fs/ocfs2/cluster/sys.c @@ -41,7 +41,7 @@ static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr, return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION); } static struct kobj_attribute attr_version = - __ATTR(interface_revision, S_IFREG | S_IRUGO, version_show, NULL); + __ATTR(interface_revision, S_IRUGO, version_show, NULL); static struct attribute *o2cb_attrs[] = { &attr_version.attr, diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 1324e66..25e9f7b 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -494,7 +494,7 @@ static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj, } static struct kobj_attribute ocfs2_attr_max_locking_protocol = - __ATTR(max_locking_protocol, S_IFREG | S_IRUGO, + __ATTR(max_locking_protocol, S_IRUGO, ocfs2_max_locking_protocol_show, NULL); static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj, @@ -526,7 +526,7 @@ static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj, } static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins = - __ATTR(loaded_cluster_plugins, S_IFREG | S_IRUGO, + __ATTR(loaded_cluster_plugins, S_IRUGO, ocfs2_loaded_cluster_plugins_show, NULL); static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj, @@ -548,7 +548,7 @@ static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj, } static struct kobj_attribute ocfs2_attr_active_cluster_plugin = - __ATTR(active_cluster_plugin, S_IFREG | S_IRUGO, + __ATTR(active_cluster_plugin, S_IRUGO, ocfs2_active_cluster_plugin_show, NULL); static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj, @@ -597,7 +597,7 @@ static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj, static struct kobj_attribute ocfs2_attr_cluster_stack = - __ATTR(cluster_stack, S_IFREG | S_IRUGO | S_IWUSR, + __ATTR(cluster_stack, S_IRUGO | S_IWUSR, ocfs2_cluster_stack_show, ocfs2_cluster_stack_store); -- cgit v1.1 From 66f5dcef137beef14560f1f6eba6717028b89089 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 8 Feb 2014 13:27:03 -0500 Subject: ocfs2: don't open-code kernel_sendmsg() Signed-off-by: Al Viro --- fs/ocfs2/cluster/tcp.c | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 2cd2406..68d80c3 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -940,33 +940,21 @@ static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec, size_t veclen, size_t total) { int ret; - mm_segment_t oldfs; - struct msghdr msg = { - .msg_iov = (struct iovec *)vec, - .msg_iovlen = veclen, - }; + struct msghdr msg; if (sock == NULL) { ret = -EINVAL; goto out; } - oldfs = get_fs(); - set_fs(get_ds()); - ret = sock_sendmsg(sock, &msg, total); - set_fs(oldfs); - if (ret != total) { - mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, - total); - if (ret >= 0) - ret = -EPIPE; /* should be smarter, I bet */ - goto out; - } - - ret = 0; + ret = kernel_sendmsg(sock, &msg, vec, veclen, total); + if (likely(ret == total)) + return 0; + mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, total); + if (ret >= 0) + ret = -EPIPE; /* should be smarter, I bet */ out: - if (ret < 0) - mlog(0, "returning error: %d\n", ret); + mlog(0, "returning error: %d\n", ret); return ret; } -- cgit v1.1 From 920220c11122952061f2d81a9f06ca077bb744d9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 8 Feb 2014 21:09:07 -0500 Subject: ocfs2: don't open-code kernel_recvmsg() Signed-off-by: Al Viro --- fs/ocfs2/cluster/tcp.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 68d80c3..ea63d64 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -916,24 +916,9 @@ static struct o2net_msg_handler *o2net_handler_get(u32 msg_type, u32 key) static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len) { - int ret; - mm_segment_t oldfs; - struct kvec vec = { - .iov_len = len, - .iov_base = data, - }; - struct msghdr msg = { - .msg_iovlen = 1, - .msg_iov = (struct iovec *)&vec, - .msg_flags = MSG_DONTWAIT, - }; - - oldfs = get_fs(); - set_fs(get_ds()); - ret = sock_recvmsg(sock, &msg, len, msg.msg_flags); - set_fs(oldfs); - - return ret; + struct kvec vec = { .iov_len = len, .iov_base = data, }; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT, }; + return kernel_recvmsg(sock, &msg, &vec, 1, len, msg.msg_flags); } static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec, -- cgit v1.1 From fcacafd269adc88f41b68cb77a3f957a66563428 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 9 Feb 2014 13:37:49 -0500 Subject: kill the 5th argument of generic_file_buffered_write() same story - it's &iocb->ki_pos in all cases Signed-off-by: Al Viro --- fs/ocfs2/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 51632c4..89099cc 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2383,7 +2383,7 @@ relock: } else { current->backing_dev_info = file->f_mapping->backing_dev_info; written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos, - ppos, count, 0); + count, 0); current->backing_dev_info = NULL; } -- cgit v1.1 From 5cb6c6c7eb1ed24744b41fad47d9a25b72207098 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 11 Feb 2014 20:58:20 -0500 Subject: generic_file_direct_write(): get rid of ppos argument always equal to &iocb->ki_pos. Signed-off-by: Al Viro --- fs/ocfs2/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 89099cc..77b8a74 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2375,7 +2375,7 @@ relock: if (direct_io) { written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, - ppos, count, ocount); + count, ocount); if (written < 0) { ret = written; goto out_dio; -- cgit v1.1 From 58bfab395b302306baccbd1b5f38a9b890acb4e3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 11 Feb 2014 22:34:52 -0500 Subject: ocfs2_file_aio_write(): switch to generic_perform_write() Signed-off-by: Al Viro --- fs/ocfs2/file.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 77b8a74..9c27adf 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2381,9 +2381,12 @@ relock: goto out_dio; } } else { + struct iov_iter from; + iov_iter_init(&from, iov, nr_segs, count, 0); current->backing_dev_info = file->f_mapping->backing_dev_info; - written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos, - count, 0); + written = generic_perform_write(file, &from, *ppos); + if (likely(written >= 0)) + iocb->ki_pos = *ppos + written; current->backing_dev_info = NULL; } -- cgit v1.1 From 181a9a043b5424f2e6451297bbc27b196fe88475 Mon Sep 17 00:00:00 2001 From: Zongxun Wang Date: Thu, 3 Apr 2014 14:46:45 -0700 Subject: ocfs2: fix null pointer dereference when access dlm_state before launching dlm thread When mounting an ocfs2 volume, it will firstly generate a file /sys/kernel/debug/o2dlm//dlm_state, and then launch the dlm thread. So the following situation will cause a null pointer dereference. dlm_debug_init -> access file dlm_state which will call dlm_state_print -> dlm_launch_thread Move dlm_debug_init after dlm_launch_thread and dlm_launch_recovery_thread can fix this issue. Signed-off-by: Zongxun Wang Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmdomain.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 33660a4..1307a8c 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1877,19 +1877,19 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) goto bail; } - status = dlm_debug_init(dlm); + status = dlm_launch_thread(dlm); if (status < 0) { mlog_errno(status); goto bail; } - status = dlm_launch_thread(dlm); + status = dlm_launch_recovery_thread(dlm); if (status < 0) { mlog_errno(status); goto bail; } - status = dlm_launch_recovery_thread(dlm); + status = dlm_debug_init(dlm); if (status < 0) { mlog_errno(status); goto bail; -- cgit v1.1 From c18ceab01240fd4c354b78d877571b729908e4a3 Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Thu, 3 Apr 2014 14:46:46 -0700 Subject: ocfs2: change ip_unaligned_aio to of type mutex from atomit_t There is a problem that waitqueue_active() may check stale data thus miss a wakeup of threads waiting on ip_unaligned_aio. The valid value of ip_unaligned_aio is only 0 and 1 so we can change it to be of type mutex thus the above prolem is avoid. Another benifit is that mutex which works as FIFO is fairer than wake_up_all(). Signed-off-by: Wengang Wang Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 6 +----- fs/ocfs2/aops.h | 5 ----- fs/ocfs2/file.c | 15 +++------------ fs/ocfs2/inode.h | 2 +- fs/ocfs2/super.c | 9 ++------- 5 files changed, 7 insertions(+), 30 deletions(-) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index aeb44e8..ebe44f7 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -571,7 +571,6 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, { struct inode *inode = file_inode(iocb->ki_filp); int level; - wait_queue_head_t *wq = ocfs2_ioend_wq(inode); /* this io's submitter should not have unlocked this before we could */ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); @@ -582,10 +581,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, if (ocfs2_iocb_is_unaligned_aio(iocb)) { ocfs2_iocb_clear_unaligned_aio(iocb); - if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) && - waitqueue_active(wq)) { - wake_up_all(wq); - } + mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); } ocfs2_iocb_clear_rw_locked(iocb); diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index f671e49..6cae155 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -102,9 +102,4 @@ enum ocfs2_iocb_lock_bits { #define ocfs2_iocb_is_unaligned_aio(iocb) \ test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) -#define OCFS2_IOEND_WQ_HASH_SZ 37 -#define ocfs2_ioend_wq(v) (&ocfs2__ioend_wq[((unsigned long)(v)) %\ - OCFS2_IOEND_WQ_HASH_SZ]) -extern wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ]; - #endif /* OCFS2_FILE_H */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 51632c4..1673438 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2061,13 +2061,6 @@ out: return ret; } -static void ocfs2_aiodio_wait(struct inode *inode) -{ - wait_queue_head_t *wq = ocfs2_ioend_wq(inode); - - wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0)); -} - static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos) { int blockmask = inode->i_sb->s_blocksize - 1; @@ -2345,10 +2338,8 @@ relock: * Wait on previous unaligned aio to complete before * proceeding. */ - ocfs2_aiodio_wait(inode); - - /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */ - atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio); + mutex_lock(&OCFS2_I(inode)->ip_unaligned_aio); + /* Mark the iocb as needing an unlock in ocfs2_dio_end_io */ ocfs2_iocb_set_unaligned_aio(iocb); } @@ -2428,7 +2419,7 @@ out_dio: if (unaligned_dio) { ocfs2_iocb_clear_unaligned_aio(iocb); - atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio); + mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); } out: diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 621fc73..9f1580b 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -44,7 +44,7 @@ struct ocfs2_inode_info struct rw_semaphore ip_xattr_sem; /* Number of outstanding AIO's which are not page aligned */ - atomic_t ip_unaligned_aio; + struct mutex ip_unaligned_aio; /* These fields are protected by ip_lock */ spinlock_t ip_lock; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 49d84f8..d171455 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1612,14 +1612,9 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) return 0; } -wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ]; - static int __init ocfs2_init(void) { - int status, i; - - for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++) - init_waitqueue_head(&ocfs2__ioend_wq[i]); + int status; status = init_ocfs2_uptodate_cache(); if (status < 0) @@ -1761,7 +1756,7 @@ static void ocfs2_inode_init_once(void *data) ocfs2_extent_map_init(&oi->vfs_inode); INIT_LIST_HEAD(&oi->ip_io_markers); oi->ip_dir_start_lookup = 0; - atomic_set(&oi->ip_unaligned_aio, 0); + mutex_init(&oi->ip_unaligned_aio); init_rwsem(&oi->ip_alloc_sem); init_rwsem(&oi->ip_xattr_sem); mutex_init(&oi->ip_io_mutex); -- cgit v1.1 From a75fe48cad2fb81e0e2671c73aea6c78ce5626d4 Mon Sep 17 00:00:00 2001 From: "joyce.xue" Date: Thu, 3 Apr 2014 14:46:47 -0700 Subject: ocfs2: remove unused variable uuid_net_key in ocfs2_initialize_super Variable uuid_net_key in ocfs2_initialize_super() is not used. Clean it up. Signed-off-by: joyce.xue Signed-off-by: Joseph Qi Acked-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/super.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index d171455..d7190b2 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2072,7 +2072,6 @@ static int ocfs2_initialize_super(struct super_block *sb, struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; struct inode *inode = NULL; struct ocfs2_journal *journal; - __le32 uuid_net_key; struct ocfs2_super *osb; u64 total_blocks; @@ -2306,8 +2305,6 @@ static int ocfs2_initialize_super(struct super_block *sb, goto bail; } - memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key)); - strncpy(osb->vol_label, di->id2.i_super.s_label, 63); osb->vol_label[63] = '\0'; osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno); -- cgit v1.1 From 2931cdcb49194503b19345c597b68fdcf78396f8 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 3 Apr 2014 14:46:48 -0700 Subject: ocfs2: improve fsync efficiency and fix deadlock between aio_write and sync_file Currently, ocfs2_sync_file grabs i_mutex and forces the current journal transaction to complete. This isn't terribly efficient, since sync_file really only needs to wait for the last transaction involving that inode to complete, and this doesn't require i_mutex. Therefore, implement the necessary bits to track the newest tid associated with an inode, and teach sync_file to wait for that instead of waiting for everything in the journal to commit. Furthermore, only issue the flush request to the drive if jbd2 hasn't already done so. This also eliminates the deadlock between ocfs2_file_aio_write() and ocfs2_sync_file(). aio_write takes i_mutex then calls ocfs2_aiodio_wait() to wait for unaligned dio writes to finish. However, if that dio completion involves calling fsync, then we can get into trouble when some ocfs2_sync_file tries to take i_mutex. Signed-off-by: Darrick J. Wong Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 1 + fs/ocfs2/aops.c | 1 + fs/ocfs2/dir.c | 4 ++++ fs/ocfs2/file.c | 36 +++++++++++++++--------------------- fs/ocfs2/inode.c | 28 ++++++++++++++++++++++++++++ fs/ocfs2/inode.h | 7 +++++++ fs/ocfs2/journal.h | 11 +++++++++++ fs/ocfs2/namei.c | 4 ++++ fs/ocfs2/super.c | 3 +++ 9 files changed, 74 insertions(+), 21 deletions(-) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index e2edff3..6b97d68 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6932,6 +6932,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); spin_unlock(&oi->ip_lock); + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_dinode_new_extent_list(inode, di); ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index ebe44f7..d310d12 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2039,6 +2039,7 @@ out_write_size: inode->i_mtime = inode->i_ctime = CURRENT_TIME; di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, wc->w_di_bh); ocfs2_commit_trans(osb, handle); diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 91a7e85..8b48e9b 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2957,6 +2957,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, ocfs2_init_dir_trailer(dir, dirdata_bh, i); } + ocfs2_update_inode_fsync_trans(handle, dir, 1); ocfs2_journal_dirty(handle, dirdata_bh); if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { @@ -3338,6 +3339,7 @@ do_extend: } else { de->rec_len = cpu_to_le16(sb->s_blocksize); } + ocfs2_update_inode_fsync_trans(handle, dir, 1); ocfs2_journal_dirty(handle, new_bh); dir_i_size += dir->i_sb->s_blocksize; @@ -3896,6 +3898,7 @@ out_commit: dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(dir->i_sb, 1)); + ocfs2_update_inode_fsync_trans(handle, dir, 1); ocfs2_commit_trans(osb, handle); out: @@ -4134,6 +4137,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir, mlog_errno(ret); did_quota = 0; + ocfs2_update_inode_fsync_trans(handle, dir, 1); ocfs2_journal_dirty(handle, dx_root_bh); out_commit: diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 1673438..bd94d26 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -175,9 +175,13 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { int err = 0; - journal_t *journal; struct inode *inode = file->f_mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_inode_info *oi = OCFS2_I(inode); + journal_t *journal = osb->journal->j_journal; + int ret; + tid_t commit_tid; + bool needs_barrier = false; trace_ocfs2_sync_file(inode, file, file->f_path.dentry, OCFS2_I(inode)->ip_blkno, @@ -192,29 +196,19 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end, if (err) return err; - /* - * Probably don't need the i_mutex at all in here, just putting it here - * to be consistent with how fsync used to be called, someone more - * familiar with the fs could possibly remove it. - */ - mutex_lock(&inode->i_mutex); - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { - /* - * We still have to flush drive's caches to get data to the - * platter - */ - if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) - blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); - goto bail; + commit_tid = datasync ? oi->i_datasync_tid : oi->i_sync_tid; + if (journal->j_flags & JBD2_BARRIER && + !jbd2_trans_will_send_data_barrier(journal, commit_tid)) + needs_barrier = true; + err = jbd2_complete_transaction(journal, commit_tid); + if (needs_barrier) { + ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + if (!err) + err = ret; } - journal = osb->journal->j_journal; - err = jbd2_journal_force_commit(journal); - -bail: if (err) mlog_errno(err); - mutex_unlock(&inode->i_mutex); return (err < 0) ? -EIO : 0; } @@ -650,7 +644,7 @@ restarted_transaction: mlog_errno(status); goto leave; } - + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, bh); spin_lock(&OCFS2_I(inode)->ip_lock); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index f29a90f..28ab8a9 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -130,6 +130,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, struct inode *inode = NULL; struct super_block *sb = osb->sb; struct ocfs2_find_inode_args args; + journal_t *journal = OCFS2_SB(sb)->journal->j_journal; trace_ocfs2_iget_begin((unsigned long long)blkno, flags, sysfile_type); @@ -169,6 +170,32 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, goto bail; } + /* + * Set transaction id's of transactions that have to be committed + * to finish f[data]sync. We set them to currently running transaction + * as we cannot be sure that the inode or some of its metadata isn't + * part of the transaction - the inode could have been reclaimed and + * now it is reread from disk. + */ + if (journal) { + transaction_t *transaction; + tid_t tid; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + read_lock(&journal->j_state_lock); + if (journal->j_running_transaction) + transaction = journal->j_running_transaction; + else + transaction = journal->j_committing_transaction; + if (transaction) + tid = transaction->t_tid; + else + tid = journal->j_commit_sequence; + read_unlock(&journal->j_state_lock); + oi->i_sync_tid = tid; + oi->i_datasync_tid = tid; + } + bail: if (!IS_ERR(inode)) { trace_ocfs2_iget_end(inode, @@ -1260,6 +1287,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle, fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); ocfs2_journal_dirty(handle, bh); + ocfs2_update_inode_fsync_trans(handle, inode, 1); leave: return status; } diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 9f1580b..837e5e4 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -73,6 +73,13 @@ struct ocfs2_inode_info u32 ip_dir_lock_gen; struct ocfs2_alloc_reservation ip_la_data_resv; + + /* + * Transactions that contain inode's metadata needed to complete + * fsync and fdatasync, respectively. + */ + tid_t i_sync_tid; + tid_t i_datasync_tid; }; /* diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 9ff4e8c..7f8cde9 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -626,4 +626,15 @@ static inline int ocfs2_begin_ordered_truncate(struct inode *inode, new_size); } +static inline void ocfs2_update_inode_fsync_trans(handle_t *handle, + struct inode *inode, + int datasync) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + oi->i_sync_tid = handle->h_transaction->t_tid; + if (datasync) + oi->i_datasync_tid = handle->h_transaction->t_tid; +} + #endif /* OCFS2_JOURNAL_H */ diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 3683643..e61e4c9 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -495,6 +495,7 @@ static int __ocfs2_mknod_locked(struct inode *dir, struct ocfs2_dinode *fe = NULL; struct ocfs2_extent_list *fel; u16 feat; + struct ocfs2_inode_info *oi = OCFS2_I(inode); *new_fe_bh = NULL; @@ -576,6 +577,9 @@ static int __ocfs2_mknod_locked(struct inode *dir, mlog_errno(status); } + oi->i_sync_tid = handle->h_transaction->t_tid; + oi->i_datasync_tid = handle->h_transaction->t_tid; + status = 0; /* error in ocfs2_create_new_inode_locks is not * critical */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index d7190b2..9fef73d 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -561,6 +561,9 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb) if (!oi) return NULL; + oi->i_sync_tid = 0; + oi->i_datasync_tid = 0; + jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode); return &oi->vfs_inode; } -- cgit v1.1 From 34aa8dac482f1358d59110d5e3a12f4351f6acaa Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Thu, 3 Apr 2014 14:46:49 -0700 Subject: ocfs2: dlm: fix lock migration crash This issue was introduced by commit 800deef3f6f8 ("ocfs2: use list_for_each_entry where benefical") in 2007 where it replaced list_for_each with list_for_each_entry. The variable "lock" will point to invalid data if "tmpq" list is empty and a panic will be triggered due to this. Sunil advised reverting it back, but the old version was also not right. At the end of the outer for loop, that list_for_each_entry will also set "lock" to an invalid data, then in the next loop, if the "tmpq" list is empty, "lock" will be an stale invalid data and cause the panic. So reverting the list_for_each back and reset "lock" to NULL to fix this issue. Another concern is that this seemes can not happen because the "tmpq" list should not be empty. Let me describe how. old lock resource owner(node 1): migratation target(node 2): image there's lockres with a EX lock from node 2 in granted list, a NR lock from node x with convert_type EX in converting list. dlm_empty_lockres() { dlm_pick_migration_target() { pick node 2 as target as its lock is the first one in granted list. } dlm_migrate_lockres() { dlm_mark_lockres_migrating() { res->state |= DLM_LOCK_RES_BLOCK_DIRTY; wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res)); //after the above code, we can not dirty lockres any more, // so dlm_thread shuffle list will not run downconvert lock from EX to NR upconvert lock from NR to EX <<< migration may schedule out here, then <<< node 2 send down convert request to convert type from EX to <<< NR, then send up convert request to convert type from NR to <<< EX, at this time, lockres granted list is empty, and two locks <<< in the converting list, node x up convert lock followed by <<< node 2 up convert lock. // will set lockres RES_MIGRATING flag, the following // lock/unlock can not run dlm_lockres_release_ast(dlm, res); } dlm_send_one_lockres() dlm_process_recovery_data() for (i=0; inum_locks; i++) if (ml->node == dlm->node_num) for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) { list_for_each_entry(lock, tmpq, list) if (lock) break; <<< lock is invalid as grant list is empty. } if (lock->ml.node != ml->node) BUG() >>> crash here } I see the above locks status from a vmcore of our internal bug. Signed-off-by: Junxiao Bi Reviewed-by: Wengang Wang Cc: Sunil Mushran Reviewed-by: Srinivas Eeda Cc: Joel Becker Cc: Mark Fasheh Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmrecovery.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 7035af0..c2dd258 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1750,13 +1750,13 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, struct dlm_migratable_lockres *mres) { struct dlm_migratable_lock *ml; - struct list_head *queue; + struct list_head *queue, *iter; struct list_head *tmpq = NULL; struct dlm_lock *newlock = NULL; struct dlm_lockstatus *lksb = NULL; int ret = 0; int i, j, bad; - struct dlm_lock *lock = NULL; + struct dlm_lock *lock; u8 from = O2NM_MAX_NODES; unsigned int added = 0; __be64 c; @@ -1791,14 +1791,16 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, /* MIGRATION ONLY! */ BUG_ON(!(mres->flags & DLM_MRES_MIGRATION)); + lock = NULL; spin_lock(&res->spinlock); for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) { tmpq = dlm_list_idx_to_ptr(res, j); - list_for_each_entry(lock, tmpq, list) { - if (lock->ml.cookie != ml->cookie) - lock = NULL; - else + list_for_each(iter, tmpq) { + lock = list_entry(iter, + struct dlm_lock, list); + if (lock->ml.cookie == ml->cookie) break; + lock = NULL; } if (lock) break; -- cgit v1.1 From ded2cf71419b9353060e633b59e446c42a6a2a09 Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Thu, 3 Apr 2014 14:46:51 -0700 Subject: ocfs2: dlm: fix recovery hung There is a race window in dlm_do_recovery() between dlm_remaster_locks() and dlm_reset_recovery() when the recovery master nearly finish the recovery process for a dead node. After the master sends FINALIZE_RECO message in dlm_remaster_locks(), another node may become the recovery master for another dead node, and then send the BEGIN_RECO message to all the nodes included the old master, in the handler of this message dlm_begin_reco_handler() of old master, dlm->reco.dead_node and dlm->reco.new_master will be set to the second dead node and the new master, then in dlm_reset_recovery(), these two variables will be reset to default value. This will cause new recovery master can not finish the recovery process and hung, at last the whole cluster will hung for recovery. old recovery master: new recovery master: dlm_remaster_locks() become recovery master for another dead node. dlm_send_begin_reco_message() dlm_begin_reco_handler() { if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { return -EAGAIN; } dlm_set_reco_master(dlm, br->node_idx); dlm_set_reco_dead_node(dlm, br->dead_node); } dlm_reset_recovery() { dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM); } will hang in dlm_remaster_locks() for request dlm locks info Before send FINALIZE_RECO message, recovery master should set DLM_RECO_STATE_FINALIZE for itself and clear it after the recovery done, this can break the race windows as the BEGIN_RECO messages will not be handled before DLM_RECO_STATE_FINALIZE flag is cleared. A similar race may happen between new recovery master and normal node which is in dlm_finalize_reco_handler(), also fix it. Signed-off-by: Junxiao Bi Reviewed-by: Srinivas Eeda Reviewed-by: Wengang Wang Cc: Joel Becker Cc: Mark Fasheh Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmrecovery.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index c2dd258..fe29f79 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -537,7 +537,10 @@ master_here: /* success! see if any other nodes need recovery */ mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n", dlm->name, dlm->reco.dead_node, dlm->node_num); - dlm_reset_recovery(dlm); + spin_lock(&dlm->spinlock); + __dlm_reset_recovery(dlm); + dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; + spin_unlock(&dlm->spinlock); } dlm_end_recovery(dlm); @@ -695,6 +698,14 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) if (all_nodes_done) { int ret; + /* Set this flag on recovery master to avoid + * a new recovery for another dead node start + * before the recovery is not done. That may + * cause recovery hung.*/ + spin_lock(&dlm->spinlock); + dlm->reco.state |= DLM_RECO_STATE_FINALIZE; + spin_unlock(&dlm->spinlock); + /* all nodes are now in DLM_RECO_NODE_DATA_DONE state * just send a finalize message to everyone and * clean up */ @@ -2884,8 +2895,8 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data, BUG(); } dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; + __dlm_reset_recovery(dlm); spin_unlock(&dlm->spinlock); - dlm_reset_recovery(dlm); dlm_kick_recovery_thread(dlm); break; default: -- cgit v1.1 From 765aabbbc72923bdb9116e49b1fc27ef22c6e65a Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Thu, 3 Apr 2014 14:46:52 -0700 Subject: ocfs2: add dlm_recover_callback_support in sysfs This is a part of the nocontrold feature which was incorporated sometime back. This is required for backward compatibility of the tools, specifically the scenario where the tools with recovery callback is used with a kernel not using the recovery callbacks (older kernel + newer tools). The tools look for this file to understand if the kernel supports DLM recovery callbacks. For kernels which support recovery callbacks but will miss this patch, ocfs2 will continue to use the older API and would still be able to mount the filesystem. [akpm@linux-foundation.org: simplify] [sfr@canb.auug.org.au: VERIFY_OCTAL_PERMISSIONS fix up] Signed-off-by: Goldwyn Rodrigues Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/stackglue.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index ca5ce14..5c8343f 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -603,11 +603,25 @@ static struct kobj_attribute ocfs2_attr_cluster_stack = ocfs2_cluster_stack_show, ocfs2_cluster_stack_store); + + +static ssize_t ocfs2_dlm_recover_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "1\n"); +} + +static struct kobj_attribute ocfs2_attr_dlm_recover_support = + __ATTR(dlm_recover_callback_support, S_IRUGO, + ocfs2_dlm_recover_show, NULL); + static struct attribute *ocfs2_attrs[] = { &ocfs2_attr_max_locking_protocol.attr, &ocfs2_attr_loaded_cluster_plugins.attr, &ocfs2_attr_active_cluster_plugin.attr, &ocfs2_attr_cluster_stack.attr, + &ocfs2_attr_dlm_recover_support.attr, NULL, }; -- cgit v1.1 From 7bf619c1425d5f03e33c744921f6251f4d0d745f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 3 Apr 2014 14:46:53 -0700 Subject: ocfs2: remove OCFS2_INODE_SKIP_DELETE flag The flag was never set, delete it. Signed-off-by: Jan Kara Reviewed-by: Mark Fasheh Reviewed-by: Srinivas Eeda Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/inode.c | 6 ------ fs/ocfs2/inode.h | 8 +++----- fs/ocfs2/journal.c | 6 ------ 3 files changed, 3 insertions(+), 17 deletions(-) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 28ab8a9..2f7d75d 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -849,12 +849,6 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode) goto bail_unlock; } - /* If we have allowd wipe of this inode for another node, it - * will be marked here so we can safely skip it. Recovery will - * cleanup any inodes we might inadvertently skip here. */ - if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) - goto bail_unlock; - ret = 1; bail_unlock: spin_unlock(&oi->ip_lock); diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 837e5e4..a6c991c 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -91,8 +91,6 @@ struct ocfs2_inode_info #define OCFS2_INODE_BITMAP 0x00000004 /* This inode has been wiped from disk */ #define OCFS2_INODE_DELETED 0x00000008 -/* Another node is deleting, so our delete is a nop */ -#define OCFS2_INODE_SKIP_DELETE 0x00000010 /* Has the inode been orphaned on another node? * * This hints to ocfs2_drop_inode that it should clear i_nlink before @@ -107,11 +105,11 @@ struct ocfs2_inode_info * rely on ocfs2_delete_inode to sort things out under the proper * cluster locks. */ -#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020 +#define OCFS2_INODE_MAYBE_ORPHANED 0x00000010 /* Does someone have the file open O_DIRECT */ -#define OCFS2_INODE_OPEN_DIRECT 0x00000040 +#define OCFS2_INODE_OPEN_DIRECT 0x00000020 /* Tell the inode wipe code it's not in orphan dir */ -#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000080 +#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000040 static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) { diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 44fc3e5..03ea931 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -2132,12 +2132,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, iter = oi->ip_next_orphan; spin_lock(&oi->ip_lock); - /* The remote delete code may have set these on the - * assumption that the other node would wipe them - * successfully. If they are still in the node's - * orphan dir, we need to reset that state. */ - oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE); - /* Set the proper information to get us going into * ocfs2_delete_inode. */ oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; -- cgit v1.1 From bd62ad7aebd8e8895bb7649ace948040332f27d3 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 3 Apr 2014 14:46:54 -0700 Subject: ocfs2: move dquot_initialize() in ocfs2_delete_inode() somewhat later Move dquot_initalize() call in ocfs2_delete_inode() after the moment we verify inode is actually a sane one to delete. We certainly don't want to initialize quota for system inodes etc. This also avoids calling into quota code from downconvert thread. Add more details into the comment why bailing out from ocfs2_delete_inode() when we are in downconvert thread is OK. Signed-off-by: Jan Kara Reviewed-by: Mark Fasheh Reviewed-by: Srinivas Eeda Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/inode.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 2f7d75d..809b5d5 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -831,11 +831,13 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode) goto bail; } - /* If we're coming from downconvert_thread we can't go into our own - * voting [hello, deadlock city!], so unforuntately we just - * have to skip deleting this guy. That's OK though because - * the node who's doing the actual deleting should handle it - * anyway. */ + /* + * If we're coming from downconvert_thread we can't go into our own + * voting [hello, deadlock city!] so we cannot delete the inode. But + * since we dropped last inode ref when downconverting dentry lock, + * we cannot have the file open and thus the node doing unlink will + * take care of deleting the inode. + */ if (current == osb->dc_task) goto bail; @@ -981,8 +983,6 @@ static void ocfs2_delete_inode(struct inode *inode) if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) goto bail; - dquot_initialize(inode); - if (!ocfs2_inode_is_valid_to_delete(inode)) { /* It's probably not necessary to truncate_inode_pages * here but we do it for safety anyway (it will most @@ -991,6 +991,8 @@ static void ocfs2_delete_inode(struct inode *inode) goto bail; } + dquot_initialize(inode); + /* We want to block signals in delete_inode as the lock and * messaging paths may return us -ERESTARTSYS. Which would * cause us to exit early, resulting in inodes being orphaned -- cgit v1.1 From e3a767b60fd8a9f5e133f42f4970cff77ec43173 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 3 Apr 2014 14:46:56 -0700 Subject: ocfs2: implement delayed dropping of last dquot reference We cannot drop last dquot reference from downconvert thread as that creates the following deadlock: NODE 1 NODE2 holds dentry lock for 'foo' holds inode lock for GLOBAL_BITMAP_SYSTEM_INODE dquot_initialize(bar) ocfs2_dquot_acquire() ocfs2_inode_lock(USER_QUOTA_SYSTEM_INODE) ... downconvert thread (triggered from another node or a different process from NODE2) ocfs2_dentry_post_unlock() ... iput(foo) ocfs2_evict_inode(foo) ocfs2_clear_inode(foo) dquot_drop(inode) ... ocfs2_dquot_release() ocfs2_inode_lock(USER_QUOTA_SYSTEM_INODE) - blocks finds we need more space in quota file ... ocfs2_extend_no_holes() ocfs2_inode_lock(GLOBAL_BITMAP_SYSTEM_INODE) - deadlocks waiting for downconvert thread We solve the problem by postponing dropping of the last dquot reference to a workqueue if it happens from the downconvert thread. Signed-off-by: Jan Kara Reviewed-by: Mark Fasheh Reviewed-by: Srinivas Eeda Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/ocfs2.h | 5 +++++ fs/ocfs2/quota.h | 2 ++ fs/ocfs2/quota_global.c | 35 +++++++++++++++++++++++++++++++++++ fs/ocfs2/super.c | 8 ++++++++ 4 files changed, 50 insertions(+) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 553f53c..64c0223 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -419,6 +420,10 @@ struct ocfs2_super struct ocfs2_dentry_lock *dentry_lock_list; struct work_struct dentry_lock_work; + /* List of dquot structures to drop last reference to */ + struct llist_head dquot_drop_list; + struct work_struct dquot_drop_work; + wait_queue_head_t osb_mount_event; /* Truncate log info */ diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index d5ab56c..f266d67 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h @@ -28,6 +28,7 @@ struct ocfs2_dquot { unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */ s64 dq_origspace; /* Last globally synced space usage */ s64 dq_originodes; /* Last globally synced inode usage */ + struct llist_node list; /* Member of list of dquots to drop */ }; /* Description of one chunk to recover in memory */ @@ -110,6 +111,7 @@ int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block, int ocfs2_create_local_dquot(struct dquot *dquot); int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot); int ocfs2_local_write_dquot(struct dquot *dquot); +void ocfs2_drop_dquot_refs(struct work_struct *work); extern const struct dquot_operations ocfs2_quota_operations; extern struct quota_format_type ocfs2_quota_format; diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index d7b5108..b990a62 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -10,6 +10,7 @@ #include #include #include +#include #include @@ -679,6 +680,27 @@ static int ocfs2_calc_qdel_credits(struct super_block *sb, int type) OCFS2_INODE_UPDATE_CREDITS; } +void ocfs2_drop_dquot_refs(struct work_struct *work) +{ + struct ocfs2_super *osb = container_of(work, struct ocfs2_super, + dquot_drop_work); + struct llist_node *list; + struct ocfs2_dquot *odquot, *next_odquot; + + list = llist_del_all(&osb->dquot_drop_list); + llist_for_each_entry_safe(odquot, next_odquot, list, list) { + /* Drop the reference we acquired in ocfs2_dquot_release() */ + dqput(&odquot->dq_dquot); + } +} + +/* + * Called when the last reference to dquot is dropped. If we are called from + * downconvert thread, we cannot do all the handling here because grabbing + * quota lock could deadlock (the node holding the quota lock could need some + * other cluster lock to proceed but with blocked downconvert thread we cannot + * release any lock). + */ static int ocfs2_release_dquot(struct dquot *dquot) { handle_t *handle; @@ -694,6 +716,19 @@ static int ocfs2_release_dquot(struct dquot *dquot) /* Check whether we are not racing with some other dqget() */ if (atomic_read(&dquot->dq_count) > 1) goto out; + /* Running from downconvert thread? Postpone quota processing to wq */ + if (current == osb->dc_task) { + /* + * Grab our own reference to dquot and queue it for delayed + * dropping. Quota code rechecks after calling + * ->release_dquot() and won't free dquot structure. + */ + dqgrab(dquot); + /* First entry on list -> queue work */ + if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list)) + queue_work(ocfs2_wq, &osb->dquot_drop_work); + goto out; + } status = ocfs2_lock_global_qf(oinfo, 1); if (status < 0) goto out; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 9fef73d..b800a1f 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1941,6 +1941,11 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) ocfs2_disable_quotas(osb); + /* All dquots should be freed by now */ + WARN_ON(!llist_empty(&osb->dquot_drop_list)); + /* Wait for worker to be done with the work structure in osb */ + cancel_work_sync(&osb->dquot_drop_work); + ocfs2_shutdown_local_alloc(osb); /* This will disable recovery and flush any recovery work. */ @@ -2276,6 +2281,9 @@ static int ocfs2_initialize_super(struct super_block *sb, INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes); osb->dentry_lock_list = NULL; + INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs); + init_llist_head(&osb->dquot_drop_list); + /* get some pseudo constants for clustersize bits */ osb->s_clustersize_bits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); -- cgit v1.1 From 84d86f83f9d0e8431a3c9eae4c47e9d7ff49a411 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 3 Apr 2014 14:46:57 -0700 Subject: ocfs2: avoid blocking in ocfs2_mark_lockres_freeing() in downconvert thread If we are dropping last inode reference from downconvert thread, we will end up calling ocfs2_mark_lockres_freeing() which can block if the lock we are freeing is queued thus creating an A-A deadlock. Luckily, since we are the downconvert thread, we can immediately dequeue the lock and thus avoid waiting in this case. Signed-off-by: Jan Kara Reviewed-by: Mark Fasheh Reviewed-by: Srinivas Eeda Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 44 +++++++++++++++++++++++++++++++++++++++++--- fs/ocfs2/dlmglue.h | 3 ++- fs/ocfs2/inode.c | 7 ++++--- 3 files changed, 47 insertions(+), 7 deletions(-) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 1998695..6bd690b 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3144,22 +3144,60 @@ out: return 0; } +static void ocfs2_process_blocked_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); + /* Mark the lockres as being dropped. It will no longer be * queued if blocking, but we still may have to wait on it * being dequeued from the downconvert thread before we can consider * it safe to drop. * * You can *not* attempt to call cluster_lock on this lockres anymore. */ -void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres) +void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) { int status; struct ocfs2_mask_waiter mw; - unsigned long flags; + unsigned long flags, flags2; ocfs2_init_mask_waiter(&mw); spin_lock_irqsave(&lockres->l_lock, flags); lockres->l_flags |= OCFS2_LOCK_FREEING; + if (lockres->l_flags & OCFS2_LOCK_QUEUED && current == osb->dc_task) { + /* + * We know the downconvert is queued but not in progress + * because we are the downconvert thread and processing + * different lock. So we can just remove the lock from the + * queue. This is not only an optimization but also a way + * to avoid the following deadlock: + * ocfs2_dentry_post_unlock() + * ocfs2_dentry_lock_put() + * ocfs2_drop_dentry_lock() + * iput() + * ocfs2_evict_inode() + * ocfs2_clear_inode() + * ocfs2_mark_lockres_freeing() + * ... blocks waiting for OCFS2_LOCK_QUEUED + * since we are the downconvert thread which + * should clear the flag. + */ + spin_unlock_irqrestore(&lockres->l_lock, flags); + spin_lock_irqsave(&osb->dc_task_lock, flags2); + list_del_init(&lockres->l_blocked_list); + osb->blocked_lock_count--; + spin_unlock_irqrestore(&osb->dc_task_lock, flags2); + /* + * Warn if we recurse into another post_unlock call. Strictly + * speaking it isn't a problem but we need to be careful if + * that happens (stack overflow, deadlocks, ...) so warn if + * ocfs2 grows a path for which this can happen. + */ + WARN_ON_ONCE(lockres->l_ops->post_unlock); + /* Since the lock is freeing we don't do much in the fn below */ + ocfs2_process_blocked_lock(osb, lockres); + return; + } while (lockres->l_flags & OCFS2_LOCK_QUEUED) { lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0); spin_unlock_irqrestore(&lockres->l_lock, flags); @@ -3180,7 +3218,7 @@ void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, { int ret; - ocfs2_mark_lockres_freeing(lockres); + ocfs2_mark_lockres_freeing(osb, lockres); ret = ocfs2_drop_lock(osb, lockres); if (ret) mlog_errno(ret); diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index 1d596d8..d293a22 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -157,7 +157,8 @@ int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex); void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex); -void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); +void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 809b5d5..d437f3b 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1080,6 +1080,7 @@ static void ocfs2_clear_inode(struct inode *inode) { int status; struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); clear_inode(inode); trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno, @@ -1096,9 +1097,9 @@ static void ocfs2_clear_inode(struct inode *inode) /* Do these before all the other work so that we don't bounce * the downconvert thread while waiting to destroy the locks. */ - ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); - ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres); - ocfs2_mark_lockres_freeing(&oi->ip_open_lockres); + ocfs2_mark_lockres_freeing(osb, &oi->ip_rw_lockres); + ocfs2_mark_lockres_freeing(osb, &oi->ip_inode_lockres); + ocfs2_mark_lockres_freeing(osb, &oi->ip_open_lockres); ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap, &oi->ip_la_data_resv); -- cgit v1.1 From 8ed6b23709b346f7bfc1edab47003a205a6a9f69 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Thu, 3 Apr 2014 14:46:59 -0700 Subject: ocfs2: revert iput deferring code in ocfs2_drop_dentry_lock The following patches are reverted in this patch because these patches caused performance regression in the remote unlink() calls. ea455f8ab683 - ocfs2: Push out dropping of dentry lock to ocfs2_wq f7b1aa69be13 - ocfs2: Fix deadlock on umount 5fd131893793 - ocfs2: Don't oops in ocfs2_kill_sb on a failed mount Previous patches in this series removed the possible deadlocks from downconvert thread so the above patches shouldn't be needed anymore. The regression is caused because these patches delay the iput() in case of dentry unlocks. This also delays the unlocking of the open lockres. The open lockresource is required to test if the inode can be wiped from disk or not. When the deleting node does not get the open lock, it marks it as orphan (even though it is not in use by another node/process) and causes a journal checkpoint. This delays operations following the inode eviction. This also moves the inode to the orphaned inode which further causes more I/O and a lot of unneccessary orphans. The following script can be used to generate the load causing issues: declare -a create declare -a remove declare -a iterations=(1 2 4 8 16 32 64 128 256 512 1024 2048 4096 8192 16384) unique="`mktemp -u XXXXX`" script="/tmp/idontknow-${unique}.sh" cat < "${script}" for n in {1..8}; do mkdir -p test/dir\${n} eval touch test/dir\${n}/foo{1.."\$1"} done EOF chmod 700 "${script}" function fcreate () { exec 2>&1 /usr/bin/time --format=%E "${script}" "$1" } function fremove () { exec 2>&1 /usr/bin/time --format=%E ssh node2 "cd `pwd`; rm -Rf test*" } function fcp () { exec 2>&1 /usr/bin/time --format=%E ssh node3 "cd `pwd`; cp -R test test.new" } echo ------------------------------------------------- echo "| # files | create #s | copy #s | remove #s |" echo ------------------------------------------------- for ((x=0; x < ${#iterations[*]} ; x++)) do create[$x]="`fcreate ${iterations[$x]}`" copy[$x]="`fcp ${iterations[$x]}`" remove[$x]="`fremove`" printf "| %8d | %9s | %9s | %9s |\n" ${iterations[$x]} ${create[$x]} ${copy[$x]} ${remove[$x]} done rm "${script}" echo "------------------------" Signed-off-by: Srinivas Eeda Signed-off-by: Goldwyn Rodrigues Signed-off-by: Jan Kara Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dcache.c | 61 +++---------------------------------------------------- fs/ocfs2/dcache.h | 12 +---------- fs/ocfs2/ocfs2.h | 28 ++++--------------------- fs/ocfs2/super.c | 30 +-------------------------- 4 files changed, 9 insertions(+), 122 deletions(-) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index 0d3a97d..e2e05a1 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -37,7 +37,6 @@ #include "dlmglue.h" #include "file.h" #include "inode.h" -#include "super.h" #include "ocfs2_trace.h" void ocfs2_dentry_attach_gen(struct dentry *dentry) @@ -346,52 +345,6 @@ out_attach: return ret; } -DEFINE_SPINLOCK(dentry_list_lock); - -/* We limit the number of dentry locks to drop in one go. We have - * this limit so that we don't starve other users of ocfs2_wq. */ -#define DL_INODE_DROP_COUNT 64 - -/* Drop inode references from dentry locks */ -static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count) -{ - struct ocfs2_dentry_lock *dl; - - spin_lock(&dentry_list_lock); - while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) { - dl = osb->dentry_lock_list; - osb->dentry_lock_list = dl->dl_next; - spin_unlock(&dentry_list_lock); - iput(dl->dl_inode); - kfree(dl); - spin_lock(&dentry_list_lock); - } - spin_unlock(&dentry_list_lock); -} - -void ocfs2_drop_dl_inodes(struct work_struct *work) -{ - struct ocfs2_super *osb = container_of(work, struct ocfs2_super, - dentry_lock_work); - - __ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT); - /* - * Don't queue dropping if umount is in progress. We flush the - * list in ocfs2_dismount_volume - */ - spin_lock(&dentry_list_lock); - if (osb->dentry_lock_list && - !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) - queue_work(ocfs2_wq, &osb->dentry_lock_work); - spin_unlock(&dentry_list_lock); -} - -/* Flush the whole work queue */ -void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb) -{ - __ocfs2_drop_dl_inodes(osb, -1); -} - /* * ocfs2_dentry_iput() and friends. * @@ -416,24 +369,16 @@ void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb) static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb, struct ocfs2_dentry_lock *dl) { + iput(dl->dl_inode); ocfs2_simple_drop_lockres(osb, &dl->dl_lockres); ocfs2_lock_res_free(&dl->dl_lockres); - - /* We leave dropping of inode reference to ocfs2_wq as that can - * possibly lead to inode deletion which gets tricky */ - spin_lock(&dentry_list_lock); - if (!osb->dentry_lock_list && - !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) - queue_work(ocfs2_wq, &osb->dentry_lock_work); - dl->dl_next = osb->dentry_lock_list; - osb->dentry_lock_list = dl; - spin_unlock(&dentry_list_lock); + kfree(dl); } void ocfs2_dentry_lock_put(struct ocfs2_super *osb, struct ocfs2_dentry_lock *dl) { - int unlock; + int unlock = 0; BUG_ON(dl->dl_count == 0); diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h index b79eff7..55f5889 100644 --- a/fs/ocfs2/dcache.h +++ b/fs/ocfs2/dcache.h @@ -29,13 +29,8 @@ extern const struct dentry_operations ocfs2_dentry_ops; struct ocfs2_dentry_lock { - /* Use count of dentry lock */ unsigned int dl_count; - union { - /* Linked list of dentry locks to release */ - struct ocfs2_dentry_lock *dl_next; - u64 dl_parent_blkno; - }; + u64 dl_parent_blkno; /* * The ocfs2_dentry_lock keeps an inode reference until @@ -49,14 +44,9 @@ struct ocfs2_dentry_lock { int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode, u64 parent_blkno); -extern spinlock_t dentry_list_lock; - void ocfs2_dentry_lock_put(struct ocfs2_super *osb, struct ocfs2_dentry_lock *dl); -void ocfs2_drop_dl_inodes(struct work_struct *work); -void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb); - struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno, int skip_unhashed); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 64c0223..a780e20 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -275,19 +275,16 @@ enum ocfs2_mount_options OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ }; -#define OCFS2_OSB_SOFT_RO 0x0001 -#define OCFS2_OSB_HARD_RO 0x0002 -#define OCFS2_OSB_ERROR_FS 0x0004 -#define OCFS2_OSB_DROP_DENTRY_LOCK_IMMED 0x0008 - -#define OCFS2_DEFAULT_ATIME_QUANTUM 60 +#define OCFS2_OSB_SOFT_RO 0x0001 +#define OCFS2_OSB_HARD_RO 0x0002 +#define OCFS2_OSB_ERROR_FS 0x0004 +#define OCFS2_DEFAULT_ATIME_QUANTUM 60 struct ocfs2_journal; struct ocfs2_slot_info; struct ocfs2_recovery_map; struct ocfs2_replay_map; struct ocfs2_quota_recovery; -struct ocfs2_dentry_lock; struct ocfs2_super { struct task_struct *commit_task; @@ -415,11 +412,6 @@ struct ocfs2_super struct list_head blocked_lock_list; unsigned long blocked_lock_count; - /* List of dentry locks to release. Anyone can add locks to - * the list, ocfs2_wq processes the list */ - struct ocfs2_dentry_lock *dentry_lock_list; - struct work_struct dentry_lock_work; - /* List of dquot structures to drop last reference to */ struct llist_head dquot_drop_list; struct work_struct dquot_drop_work; @@ -584,18 +576,6 @@ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb, spin_unlock(&osb->osb_lock); } - -static inline unsigned long ocfs2_test_osb_flag(struct ocfs2_super *osb, - unsigned long flag) -{ - unsigned long ret; - - spin_lock(&osb->osb_lock); - ret = osb->osb_flags & flag; - spin_unlock(&osb->osb_lock); - return ret; -} - static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, int hard) { diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index b800a1f..888a145 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1241,30 +1241,11 @@ static struct dentry *ocfs2_mount(struct file_system_type *fs_type, return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); } -static void ocfs2_kill_sb(struct super_block *sb) -{ - struct ocfs2_super *osb = OCFS2_SB(sb); - - /* Failed mount? */ - if (!osb || atomic_read(&osb->vol_state) == VOLUME_DISABLED) - goto out; - - /* Prevent further queueing of inode drop events */ - spin_lock(&dentry_list_lock); - ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED); - spin_unlock(&dentry_list_lock); - /* Wait for work to finish and/or remove it */ - cancel_work_sync(&osb->dentry_lock_work); -out: - kill_block_super(sb); -} - static struct file_system_type ocfs2_fs_type = { .owner = THIS_MODULE, .name = "ocfs2", .mount = ocfs2_mount, - .kill_sb = ocfs2_kill_sb, - + .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, .next = NULL }; @@ -1930,12 +1911,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) debugfs_remove(osb->osb_ctxt); - /* - * Flush inode dropping work queue so that deletes are - * performed while the filesystem is still working - */ - ocfs2_drop_all_dl_inodes(osb); - /* Orphan scan should be stopped as early as possible */ ocfs2_orphan_scan_stop(osb); @@ -2278,9 +2253,6 @@ static int ocfs2_initialize_super(struct super_block *sb, INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); journal->j_state = OCFS2_JOURNAL_FREE; - INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes); - osb->dentry_lock_list = NULL; - INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs); init_llist_head(&osb->dquot_drop_list); -- cgit v1.1 From 41b63efb68115eaf48197672d59005765884b078 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Thu, 3 Apr 2014 14:47:00 -0700 Subject: ocfs2: fix type conversion risk when get cluster attributes In o2nm_cluster, cl_idle_timeout_ms, cl_keepalive_delay_ms, as well as cl_reconnect_delay_ms, are defined as type of unsigned int. So we should also use unsigned int in the helper functions. Signed-off-by: Joseph Qi Cc: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/tcp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 2cd2406..b4ab371 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -262,17 +262,17 @@ static void o2net_update_recv_stats(struct o2net_sock_container *sc) #endif /* CONFIG_OCFS2_FS_STATS */ -static inline int o2net_reconnect_delay(void) +static inline unsigned int o2net_reconnect_delay(void) { return o2nm_single_cluster->cl_reconnect_delay_ms; } -static inline int o2net_keepalive_delay(void) +static inline unsigned int o2net_keepalive_delay(void) { return o2nm_single_cluster->cl_keepalive_delay_ms; } -static inline int o2net_idle_timeout(void) +static inline unsigned int o2net_idle_timeout(void) { return o2nm_single_cluster->cl_idle_timeout_ms; } -- cgit v1.1 From c8d888d9f1469806c339218fc342817eb3b628a8 Mon Sep 17 00:00:00 2001 From: Jensen Date: Thu, 3 Apr 2014 14:47:01 -0700 Subject: ocfs2: llseek requires ocfs2 inode lock for the file in SEEK_END llseek requires ocfs2 inode lock for updating the file size in SEEK_END. because the file size maybe update on another node. This bug can be reproduce the following scenario: at first, we dd a test fileA, the file size is 10k. on NodeA: --------- 1) open the test fileA, lseek the end of file. and print the position. 2) close the test fileA on NodeB: 1) open the test fileA, append the 5k data to test FileA. 2) lseek the end of file. and print the position. 3) close file. At first we run the test program1 on NodeA , the result is 10k. And then run the test program2 on NodeB, the result is 15k. At last, we run the test program1 on NodeA again, the result is 10k. After applying this patch the three step result is 15k. test result: 1000000 times lseek call; index lseek with inode lock (unit:us) lseek without inode lock (unit:us) 1 1168162 555383 2 1168011 549504 3 1170538 549396 4 1170375 551685 5 1170444 556719 6 1174364 555307 7 1163294 551552 8 1170080 549350 9 1162464 553700 10 1165441 552594 avg 1168317 552519 avg with lock - avg without lock = 615798 (avg with lock - avg without lock)/1000000=0.615798 us Signed-off-by: Jensen Cc: Jie Liu Acked-by: Joel Becker Cc: Mark Fasheh Cc: Sunil Mushran Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/file.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index bd94d26..0f14f90 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2630,7 +2630,16 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence) case SEEK_SET: break; case SEEK_END: - offset += inode->i_size; + /* SEEK_END requires the OCFS2 inode lock for the file + * because it references the file's size. + */ + ret = ocfs2_inode_lock(inode, NULL, 0); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + offset += i_size_read(inode); + ocfs2_inode_unlock(inode, 0); break; case SEEK_CUR: if (offset == 0) { -- cgit v1.1 From a35ad97cd48cfab45f4ee5eb0e6f5787a05cab25 Mon Sep 17 00:00:00 2001 From: Zhonghua Guo Date: Thu, 3 Apr 2014 14:47:02 -0700 Subject: ocfs2: fix deadlock risk when kmalloc failed in dlm_query_region_handler In dlm_query_region_handler(), once kmalloc failed, it will unlock dlm_domain_lock without lock first, then deadlock happens. Signed-off-by: Zhonghua Guo Signed-off-by: Joseph Qi Reviewed-by: Srinivas Eeda Tested-by: Joseph Qi Cc: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmdomain.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 1307a8c..c973690 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1123,7 +1123,6 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, struct dlm_ctxt *dlm = NULL; char *local = NULL; int status = 0; - int locked = 0; qr = (struct dlm_query_region *) msg->buf; @@ -1132,10 +1131,8 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, /* buffer used in dlm_mast_regions() */ local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL); - if (!local) { - status = -ENOMEM; - goto bail; - } + if (!local) + return -ENOMEM; status = -EINVAL; @@ -1144,16 +1141,15 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, if (!dlm) { mlog(ML_ERROR, "Node %d queried hb regions on domain %s " "before join domain\n", qr->qr_node, qr->qr_domain); - goto bail; + goto out_domain_lock; } spin_lock(&dlm->spinlock); - locked = 1; if (dlm->joining_node != qr->qr_node) { mlog(ML_ERROR, "Node %d queried hb regions on domain %s " "but joining node is %d\n", qr->qr_node, qr->qr_domain, dlm->joining_node); - goto bail; + goto out_dlm_lock; } /* Support for global heartbeat was added in 1.1 */ @@ -1163,14 +1159,15 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, "but active dlm protocol is %d.%d\n", qr->qr_node, qr->qr_domain, dlm->dlm_locking_proto.pv_major, dlm->dlm_locking_proto.pv_minor); - goto bail; + goto out_dlm_lock; } status = dlm_match_regions(dlm, qr, local, sizeof(qr->qr_regions)); -bail: - if (locked) - spin_unlock(&dlm->spinlock); +out_dlm_lock: + spin_unlock(&dlm->spinlock); + +out_domain_lock: spin_unlock(&dlm_domain_lock); kfree(local); -- cgit v1.1 From 3ed2be719eb9770139da791342b190b20a7a9270 Mon Sep 17 00:00:00 2001 From: Tariq Saeed Date: Thu, 3 Apr 2014 14:47:03 -0700 Subject: ocfs2: allow for more than one data extent when creating xattr Orabug: 18108070 ocfs2_xattr_extend_allocation() hits panic when creating xattr during data extent alloc phase. The problem occurs if due to local alloc fragmentation, clusters are spread over multiple extents. In this case ocfs2_add_clusters_in_btree() finds no space to store more than one extent record and therefore fails returning RESTART_META. The situation is anticipated for xattr update case but not xattr create case. This fix simply ports that code to create case. Signed-off-by: Tariq Saeed Cc: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/xattr.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 185fa3b7..4217fed 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -3200,8 +3200,15 @@ meta_guess: clusters_add += 1; } } else { - meta_add += 1; credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { + struct ocfs2_extent_list *el = &def_xv.xv.xr_list; + meta_add += ocfs2_extend_meta_needed(el); + credits += ocfs2_calc_extend_credits(inode->i_sb, + el); + } else { + meta_add += 1; + } } out: if (clusters_need) -- cgit v1.1 From 466e68c4306317e7d239e3100f612d403e3e2c3c Mon Sep 17 00:00:00 2001 From: Xue jiufei Date: Thu, 3 Apr 2014 14:47:04 -0700 Subject: ocfs2: __ocfs2_mknod_locked should return error when ocfs2_create_new_inode_locks() failed When ocfs2_create_new_inode_locks() return error, inode open lock may not be obtainted for this inode. So other nodes can remove this file and free dinode when inode still remain in memory on this node, which is not correct and may trigger BUG. So __ocfs2_mknod_locked should return error when ocfs2_create_new_inode_locks() failed. Node_1 Node_2 create fileA, call ocfs2_mknod() -> ocfs2_get_init_inode(), allocate inodeA -> ocfs2_claim_new_inode(), claim dinode(dinodeA) -> call ocfs2_create_new_inode_locks(), create open lock failed, return error -> __ocfs2_mknod_locked return success unlink fileA try open lock succeed, and free dinodeA create another file, call ocfs2_mknod() -> ocfs2_get_init_inode(), allocate inodeB -> ocfs2_claim_new_inode(), as Node_2 had freed dinodeA, so claim dinodeA and update generation for dinodeA call __ocfs2_drop_dl_inodes()->ocfs2_delete_inode() to free inodeA, and finally triggers BUG on(inode->i_generation != le32_to_cpu(fe->i_generation)) in function ocfs2_inode_lock_update(). Signed-off-by: joyce.xue Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/namei.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index e61e4c9..931fd50 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -580,9 +580,6 @@ static int __ocfs2_mknod_locked(struct inode *dir, oi->i_sync_tid = handle->h_transaction->t_tid; oi->i_datasync_tid = handle->h_transaction->t_tid; - status = 0; /* error in ocfs2_create_new_inode_locks is not - * critical */ - leave: if (status < 0) { if (*new_fe_bh) { -- cgit v1.1 From f7cf4f5bfe073ad792ab49c04f247626b3e38db6 Mon Sep 17 00:00:00 2001 From: alex chen Date: Thu, 3 Apr 2014 14:47:05 -0700 Subject: ocfs2: do not put bh when buffer_uptodate failed Do not put bh when buffer_uptodate failed in ocfs2_write_block and ocfs2_write_super_or_backup, because it will put bh in b_end_io. Otherwise it will hit a warning "VFS: brelse: Trying to free free buffer". Signed-off-by: Alex Chen Reviewed-by: Joseph Qi Reviewed-by: Srinivas Eeda Cc: Mark Fasheh Acked-by: Joel Becker Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/buffer_head_io.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 5b704c6..1edcb14 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -90,7 +90,6 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, * information for this bh as it's not marked locally * uptodate. */ ret = -EIO; - put_bh(bh); mlog_errno(ret); } @@ -420,7 +419,6 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb, if (!buffer_uptodate(bh)) { ret = -EIO; - put_bh(bh); mlog_errno(ret); } -- cgit v1.1 From f81c20158f8d5f7938d5eb86ecc42ecc09273ce6 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 3 Apr 2014 14:47:07 -0700 Subject: ocfs2: fix panic on kfree(xattr->name) Commit 9548906b2bb7 ('xattr: Constify ->name member of "struct xattr"') missed that ocfs2 is calling kfree(xattr->name). As a result, kernel panic occurs upon calling kfree(xattr->name) because xattr->name refers static constant names. This patch removes kfree(xattr->name) from ocfs2_mknod() and ocfs2_symlink(). Signed-off-by: Tetsuo Handa Reported-by: Tariq Saeed Tested-by: Tariq Saeed Reviewed-by: Srinivas Eeda Cc: Joel Becker Cc: Mark Fasheh Cc: [3.12+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/namei.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 931fd50..4a797f2 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -450,7 +450,6 @@ leave: brelse(new_fe_bh); brelse(parent_fe_bh); - kfree(si.name); kfree(si.value); ocfs2_free_dir_lookup_result(&lookup); @@ -1856,7 +1855,6 @@ bail: brelse(new_fe_bh); brelse(parent_fe_bh); - kfree(si.name); kfree(si.value); ocfs2_free_dir_lookup_result(&lookup); if (inode_ac) -- cgit v1.1 From 6fdb702d6262b18b1b41a35f1f81903b0a2bc2c9 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 3 Apr 2014 14:47:08 -0700 Subject: ocfs2: call ocfs2_update_inode_fsync_trans when updating any inode Ensure that ocfs2_update_inode_fsync_trans() is called any time we touch an inode in a given transaction. This is a follow-on to the previous patch to reduce lock contention and deadlocking during an fsync operation. Signed-off-by: Darrick J. Wong Cc: Mark Fasheh Cc: Joel Becker Cc: Wengang Cc: Greg Marsden Cc: Srinivas Eeda Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/acl.c | 1 + fs/ocfs2/alloc.c | 2 ++ fs/ocfs2/dir.c | 2 ++ fs/ocfs2/file.c | 7 +++++++ fs/ocfs2/move_extents.c | 2 ++ fs/ocfs2/namei.c | 1 + fs/ocfs2/suballoc.c | 3 ++- fs/ocfs2/xattr.c | 3 +++ 8 files changed, 20 insertions(+), 1 deletion(-) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 555f4cd..7e8282d 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -205,6 +205,7 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh, di->i_mode = cpu_to_le16(inode->i_mode); di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 6b97d68..b4deb5f 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5728,6 +5728,7 @@ int ocfs2_remove_btree_range(struct inode *inode, } ocfs2_et_update_clusters(et, -len); + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, et->et_root_bh); @@ -7209,6 +7210,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, di_bh); out_commit: diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 8b48e9b..0717662 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -3006,6 +3006,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, di->i_size = cpu_to_le64(sb->s_blocksize); di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec); di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, dir, 1); /* * This should never fail as our extent list is empty and all @@ -4405,6 +4406,7 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir, di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); spin_unlock(&OCFS2_I(dir)->ip_lock); di->i_dx_root = cpu_to_le64(0ULL); + ocfs2_update_inode_fsync_trans(handle, dir, 1); ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 0f14f90..ff33c5e 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -286,6 +286,7 @@ int ocfs2_update_inode_atime(struct inode *inode, inode->i_atime = CURRENT_TIME; di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, bh); out_commit: @@ -335,6 +336,7 @@ int ocfs2_simple_size_update(struct inode *inode, if (ret < 0) mlog_errno(ret); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_commit_trans(osb, handle); out: return ret; @@ -429,6 +431,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, di->i_size = cpu_to_le64(new_i_size); di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, fe_bh); @@ -737,6 +740,7 @@ static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) mlog_errno(ret); + ocfs2_update_inode_fsync_trans(handle, inode, 1); out: if (ret) { @@ -834,6 +838,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); di->i_mtime_nsec = di->i_ctime_nsec; ocfs2_journal_dirty(handle, di_bh); + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); } @@ -1338,6 +1343,7 @@ static int __ocfs2_write_remove_suid(struct inode *inode, di = (struct ocfs2_dinode *) bh->b_data; di->i_mode = cpu_to_le16(inode->i_mode); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, bh); @@ -1570,6 +1576,7 @@ static int ocfs2_zero_partial_clusters(struct inode *inode, if (ret) mlog_errno(ret); } + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_commit_trans(osb, handle); out: diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 64c304d..3ca9395 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -151,6 +151,7 @@ static int __ocfs2_move_extent(handle_t *handle, old_blkno, len); } + ocfs2_update_inode_fsync_trans(handle, inode, 0); out: ocfs2_free_path(path); return ret; @@ -957,6 +958,7 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) inode->i_ctime = CURRENT_TIME; di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 4a797f2..2060fc3 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2480,6 +2480,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, di->i_orphaned_slot = 0; set_nlink(inode, 1); ocfs2_set_links_count(di, inode->i_nlink); + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, di_bh); status = ocfs2_add_entry(handle, dentry, inode, diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 47ae266..482d6c2 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -771,6 +771,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); + ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0); status = 0; @@ -2091,7 +2092,7 @@ int ocfs2_find_new_inode_loc(struct inode *dir, ac->ac_find_loc_priv = res; *fe_blkno = res->sr_blkno; - + ocfs2_update_inode_fsync_trans(handle, dir, 0); out: if (handle) ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 4217fed..14b8c46 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -2602,6 +2602,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL); di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); spin_unlock(&oi->ip_lock); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, di_bh); out_commit: @@ -3621,6 +3622,7 @@ int ocfs2_xattr_set(struct inode *inode, } ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt); + ocfs2_update_inode_fsync_trans(ctxt.handle, inode, 0); ocfs2_commit_trans(osb, ctxt.handle); @@ -5483,6 +5485,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode, ret = ocfs2_truncate_log_append(osb, handle, blkno, len); if (ret) mlog_errno(ret); + ocfs2_update_inode_fsync_trans(handle, inode, 0); out_commit: ocfs2_commit_trans(osb, handle); -- cgit v1.1 From e228f6439862359f9b26f9d62634e4fce180b54a Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Thu, 3 Apr 2014 14:47:09 -0700 Subject: ocfs2: flock: drop cross-node lock when failed locally ocfs2_do_flock() calls ocfs2_file_lock() to get the cross-node clock and then call flock_lock_file_wait() to compete with local processes. In case flock_lock_file_wait() failed, say -ENOMEM, clean up work is not done. This patch adds the cleanup --drop the cross-node lock which was just granted. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Wengang Wang Cc: Joel Becker Reviewed-by: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/locks.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c index e57c804..6b6d092 100644 --- a/fs/ocfs2/locks.c +++ b/fs/ocfs2/locks.c @@ -82,6 +82,8 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode, } ret = flock_lock_file_wait(file, fl); + if (ret) + ocfs2_file_unlock(file); out: mutex_unlock(&fp->fp_mutex); -- cgit v1.1 From db66c71577d525c0cd65e66ff675747565783ba4 Mon Sep 17 00:00:00 2001 From: Younger Liu Date: Thu, 3 Apr 2014 14:47:10 -0700 Subject: ocfs2: rollback alloc_dinode counts when ocfs2_block_group_set_bits() failed After updating alloc_dinode counts in ocfs2_alloc_dinode_update_counts(), if ocfs2_alloc_dinode_update_bitmap() failed, there is a rare case that some space may be lost. So, roll back alloc_dinode counts when ocfs2_block_group_set_bits() failed. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Younger Liu Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/move_extents.c | 5 ++++- fs/ocfs2/suballoc.c | 25 ++++++++++++++++++++++++- fs/ocfs2/suballoc.h | 4 ++++ 3 files changed, 32 insertions(+), 2 deletions(-) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 3ca9395..599eb4c 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -691,8 +691,11 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh, goal_bit, len); - if (ret) + if (ret) { + ocfs2_rollback_alloc_dinode_counts(gb_inode, gb_bh, len, + le16_to_cpu(gd->bg_chain)); mlog_errno(ret); + } /* * Here we should write the new page out first if we are diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 482d6c2..94fb1f3 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1608,6 +1608,21 @@ out: return ret; } +void ocfs2_rollback_alloc_dinode_counts(struct inode *inode, + struct buffer_head *di_bh, + u32 num_bits, + u16 chain) +{ + u32 tmp_used; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; + struct ocfs2_chain_list *cl; + + cl = (struct ocfs2_chain_list *)&di->id2.i_chain; + tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); + di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits); + le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits); +} + static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res, struct ocfs2_extent_rec *rec, struct ocfs2_chain_list *cl) @@ -1708,8 +1723,12 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, res->sr_bit_offset, res->sr_bits); - if (ret < 0) + if (ret < 0) { + ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh, + res->sr_bits, + le16_to_cpu(gd->bg_chain)); mlog_errno(ret); + } out_loc_only: *bits_left = le16_to_cpu(gd->bg_free_bits_count); @@ -1839,6 +1858,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, res->sr_bit_offset, res->sr_bits); if (status < 0) { + ocfs2_rollback_alloc_dinode_counts(alloc_inode, + ac->ac_bh, res->sr_bits, chain); mlog_errno(status); goto bail; } @@ -2150,6 +2171,8 @@ int ocfs2_claim_new_inode_at_loc(handle_t *handle, res->sr_bit_offset, res->sr_bits); if (ret < 0) { + ocfs2_rollback_alloc_dinode_counts(ac->ac_inode, + ac->ac_bh, res->sr_bits, chain); mlog_errno(ret); goto out; } diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index 218d803..2d25017 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -91,6 +91,10 @@ int ocfs2_alloc_dinode_update_counts(struct inode *inode, struct buffer_head *di_bh, u32 num_bits, u16 chain); +void ocfs2_rollback_alloc_dinode_counts(struct inode *inode, + struct buffer_head *di_bh, + u32 num_bits, + u16 chain); int ocfs2_block_group_set_bits(handle_t *handle, struct inode *alloc_inode, struct ocfs2_group_desc *bg, -- cgit v1.1 From da8ded405de74de4b189d1128f4c102d06328c29 Mon Sep 17 00:00:00 2001 From: Tariq Saeed Date: Thu, 3 Apr 2014 14:47:11 -0700 Subject: ocfs2/o2net: o2net_listen_data_ready should do nothing if socket state is not TCP_LISTEN Orabug: 17330860 When accepting an incomming connection o2net_accept_one clones a child data socket from the parent listening socket. It then proceeds to setup the child with callback o2net_data_ready() and sk_user_data to NULL. If data arrives in this window, o2net_listen_data_ready will be called with some non-deterministic value in sk_user_data (not inherited). We panic when we page fault on sk_user_data -- in parent it is sock_def_readable(). The fix is to recognize that this is a data socket being set up by looking at the socket state and do nothing. Signed-off-by: Tariq Saseed Signed-off-by: Srinivas Eeda Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/tcp.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index b4ab371..eb649d2 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1964,18 +1964,30 @@ static void o2net_listen_data_ready(struct sock *sk, int bytes) goto out; } - /* ->sk_data_ready is also called for a newly established child socket - * before it has been accepted and the acceptor has set up their - * data_ready.. we only want to queue listen work for our listening - * socket */ + /* This callback may called twice when a new connection + * is being established as a child socket inherits everything + * from a parent LISTEN socket, including the data_ready cb of + * the parent. This leads to a hazard. In o2net_accept_one() + * we are still initializing the child socket but have not + * changed the inherited data_ready callback yet when + * data starts arriving. + * We avoid this hazard by checking the state. + * For the listening socket, the state will be TCP_LISTEN; for the new + * socket, will be TCP_ESTABLISHED. Also, in this case, + * sk->sk_user_data is not a valid function pointer. + */ + if (sk->sk_state == TCP_LISTEN) { mlog(ML_TCP, "bytes: %d\n", bytes); queue_work(o2net_wq, &o2net_listen_work); + } else { + ready = NULL; } out: read_unlock(&sk->sk_callback_lock); - ready(sk, bytes); + if (ready != NULL) + ready(sk, bytes); } static int o2net_open_listening_sock(__be32 addr, __be16 port) -- cgit v1.1 From 7dc3e83901b342ea7fe36262329c3784f2937361 Mon Sep 17 00:00:00 2001 From: jiangyiwen Date: Thu, 3 Apr 2014 14:47:12 -0700 Subject: ocfs2: iput inode alloc when failed locally In ocfs2_info_handle_freeinode() and ocfs2_test_inode_bit() func, after calls ocfs2_get_system_file_inode() to get inode ref, if calls ocfs2_info_scan_inode_alloc() or ocfs2_inode_lock() failed, we should iput inode alloc to avoid leaking the inode. Signed-off-by: jiangyiwen Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/ioctl.c | 5 +++-- fs/ocfs2/suballoc.c | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 8ca3c29..490229f 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -413,11 +413,12 @@ int ocfs2_info_handle_freeinode(struct inode *inode, } status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i); - if (status < 0) - goto bail; iput(inode_alloc); inode_alloc = NULL; + + if (status < 0) + goto bail; } o2info_set_request_filled(&oifi->ifi_req); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 94fb1f3..0cb889a 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -2894,6 +2894,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0); if (status < 0) { mutex_unlock(&inode_alloc_inode->i_mutex); + iput(inode_alloc_inode); mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n", (u32)suballoc_slot, status); goto bail; -- cgit v1.1 From 43b10a20372d9a1c08391f33f1c8bd86179ddc5f Mon Sep 17 00:00:00 2001 From: jiangyiwen Date: Thu, 3 Apr 2014 14:47:13 -0700 Subject: ocfs2: avoid system inode ref confusion by adding mutex lock The following case may lead to the same system inode ref in confusion. A thread B thread ocfs2_get_system_file_inode ->get_local_system_inode ->_ocfs2_get_system_file_inode because of *arr == NULL, ocfs2_get_system_file_inode ->get_local_system_inode ->_ocfs2_get_system_file_inode gets first ref thru _ocfs2_get_system_file_inode, gets second ref thru igrab and set *arr = inode at the moment, B thread also gets two refs, so lead to one more inode ref. So add mutex lock to avoid multi thread set two inode ref once at the same time. Signed-off-by: jiangyiwen Reviewed-by: Joseph Qi Cc: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/ocfs2.h | 2 ++ fs/ocfs2/super.c | 2 ++ fs/ocfs2/sysfile.c | 3 +++ 3 files changed, 7 insertions(+) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index a780e20..8d64a97 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -446,6 +446,8 @@ struct ocfs2_super /* rb tree root for refcount lock. */ struct rb_root osb_rf_lock_tree; struct ocfs2_refcount_tree *osb_ref_tree_lru; + + struct mutex system_file_mutex; }; #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 888a145..1aecd62 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2100,6 +2100,8 @@ static int ocfs2_initialize_super(struct super_block *sb, spin_lock_init(&osb->osb_xattr_lock); ocfs2_init_steal_slots(osb); + mutex_init(&osb->system_file_mutex); + atomic_set(&osb->alloc_stats.moves, 0); atomic_set(&osb->alloc_stats.local_data, 0); atomic_set(&osb->alloc_stats.bitmap_data, 0); diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c index f053688..af155c1 100644 --- a/fs/ocfs2/sysfile.c +++ b/fs/ocfs2/sysfile.c @@ -113,9 +113,11 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, } else arr = get_local_system_inode(osb, type, slot); + mutex_lock(&osb->system_file_mutex); if (arr && ((inode = *arr) != NULL)) { /* get a ref in addition to the array ref */ inode = igrab(inode); + mutex_unlock(&osb->system_file_mutex); BUG_ON(!inode); return inode; @@ -129,6 +131,7 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, *arr = igrab(inode); BUG_ON(!*arr); } + mutex_unlock(&osb->system_file_mutex); return inode; } -- cgit v1.1 From 9c339255cb01806efbaec3d296f3e44c9357d574 Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Thu, 3 Apr 2014 14:47:15 -0700 Subject: ocfs2: pass "new" parameter to ocfs2_init_xattr_bucket This patch fixes the following crash: kernel BUG at fs/ocfs2/uptodate.c:530! Modules linked in: ocfs2(F) ocfs2_dlmfs ocfs2_stack_o2cb ocfs2_dlm ocfs2_nodemanager ocfs2_stackglue configfs bridge xen_pciback xen_netback xen_blkback xen_gntalloc xen_gntdev xen_evtchn xenfs xen_privcmd sunrpc 8021q garp stp llc bonding be2iscsi iscsi_boot_sysfs bnx2i cnic uio cxgb4i cxgb4 cxgb3i libcxgbi cxgb3 mdio ib_iser rdma_cm ib_cm iw_cm ib_sa ib_mad ib_core ib_addr ipv6 iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi iTCO_wdt iTCO_vendor_support dcdbas coretemp freq_table mperf microcode pcspkr serio_raw bnx2 lpc_ich mfd_core i5k_amb i5000_edac edac_core e1000e sg shpchp ext4(F) jbd2(F) mbcache(F) dm_round_robin(F) sr_mod(F) cdrom(F) usb_storage(F) sd_mod(F) crc_t10dif(F) pata_acpi(F) ata_generic(F) ata_piix(F) mptsas(F) mptscsih(F) mptbase(F) scsi_transport_sas(F) radeon(F) ttm(F) drm_kms_helper(F) drm(F) hwmon(F) i2c_algo_bit(F) i2c_core(F) dm_multipath(F) dm_mirror(F) dm_region_hash(F) dm_log(F) dm_mod(F) CPU 5 Pid: 21303, comm: xattr-test Tainted: GF W 3.8.13-30.el6uek.x86_64 #2 Dell Inc. PowerEdge 1950/0M788G RIP: ocfs2_set_new_buffer_uptodate+0x51/0x60 [ocfs2] Process xattr-test (pid: 21303, threadinfo ffff880017aca000, task ffff880016a2c480) Call Trace: ocfs2_init_xattr_bucket+0x8a/0x120 [ocfs2] ocfs2_cp_xattr_bucket+0xbb/0x1b0 [ocfs2] ocfs2_extend_xattr_bucket+0x20a/0x2f0 [ocfs2] ocfs2_add_new_xattr_bucket+0x23e/0x4b0 [ocfs2] ocfs2_xattr_set_entry_index_block+0x13c/0x3d0 [ocfs2] ocfs2_xattr_block_set+0xf9/0x220 [ocfs2] __ocfs2_xattr_set_handle+0x118/0x710 [ocfs2] ocfs2_xattr_set+0x691/0x880 [ocfs2] ocfs2_xattr_user_set+0x46/0x50 [ocfs2] generic_setxattr+0x96/0xa0 __vfs_setxattr_noperm+0x7b/0x170 vfs_setxattr+0xbc/0xc0 setxattr+0xde/0x230 sys_fsetxattr+0xc6/0xf0 system_call_fastpath+0x16/0x1b Code: 41 80 0c 24 01 48 89 df e8 7d f0 ff ff 4c 89 e6 48 89 df e8 a2 fe ff ff 48 89 df e8 3a f0 ff ff 48 8b 1c 24 4c 8b 64 24 08 c9 c3 <0f> 0b eb fe 90 90 90 90 90 90 90 90 90 90 90 55 48 89 e5 66 66 RIP ocfs2_set_new_buffer_uptodate+0x51/0x60 [ocfs2] It hit the BUG_ON() in ocfs2_set_new_buffer_uptodate(): void ocfs2_set_new_buffer_uptodate(struct ocfs2_caching_info *ci, struct buffer_head *bh) { /* This should definitely *not* exist in our cache */ if (ocfs2_buffer_cached(ci, bh)) printk(KERN_ERR "bh->b_blocknr: %lu @ %p\n", bh->b_blocknr, bh); BUG_ON(ocfs2_buffer_cached(ci, bh)); set_buffer_uptodate(bh); ocfs2_metadata_cache_io_lock(ci); ocfs2_set_buffer_uptodate(ci, bh); ocfs2_metadata_cache_io_unlock(ci); } The problem here is: We cached a block, but the buffer_head got reused. When we are to pick up this block again, a new buffer_head created with UPTODATE flag cleared. ocfs2_buffer_uptodate() returned false since no UPTODATE is set on the buffer_head. so we set this block to cache as a NEW block, then it failed at asserting block is not in cache. The fix is to add a new parameter indicating the bucket is a new allocated or not to ocfs2_init_xattr_bucket(). ocfs2_init_xattr_bucket() assert block not cached accordingly. Signed-off-by: Wengang Wang Cc: Joel Becker Reviewed-by: Mark Fasheh Cc: Joe Jin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/xattr.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 14b8c46..016f01d 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -369,7 +369,7 @@ static void ocfs2_xattr_bucket_free(struct ocfs2_xattr_bucket *bucket) * them fully. */ static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket, - u64 xb_blkno) + u64 xb_blkno, int new) { int i, rc = 0; @@ -383,9 +383,16 @@ static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket, } if (!ocfs2_buffer_uptodate(INODE_CACHE(bucket->bu_inode), - bucket->bu_bhs[i])) - ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode), - bucket->bu_bhs[i]); + bucket->bu_bhs[i])) { + if (new) + ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode), + bucket->bu_bhs[i]); + else { + set_buffer_uptodate(bucket->bu_bhs[i]); + ocfs2_set_buffer_uptodate(INODE_CACHE(bucket->bu_inode), + bucket->bu_bhs[i]); + } + } } if (rc) @@ -4303,7 +4310,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode, trace_ocfs2_xattr_create_index_block((unsigned long long)blkno); - ret = ocfs2_init_xattr_bucket(xs->bucket, blkno); + ret = ocfs2_init_xattr_bucket(xs->bucket, blkno, 1); if (ret) { mlog_errno(ret); goto out; @@ -4647,7 +4654,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode, * Even if !new_bucket_head, we're overwriting t_bucket. Thus, * there's no need to read it. */ - ret = ocfs2_init_xattr_bucket(t_bucket, new_blk); + ret = ocfs2_init_xattr_bucket(t_bucket, new_blk, new_bucket_head); if (ret) { mlog_errno(ret); goto out; @@ -4813,7 +4820,7 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode, * Even if !t_is_new, we're overwriting t_bucket. Thus, * there's no need to read it. */ - ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno); + ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno, t_is_new); if (ret) goto out; @@ -6840,7 +6847,7 @@ static int ocfs2_reflink_xattr_bucket(handle_t *handle, break; } - ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno); + ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno, 1); if (ret) { mlog_errno(ret); break; -- cgit v1.1 From 91b0abe36a7b2b3b02d7500925a5f8455334f0e5 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 3 Apr 2014 14:47:49 -0700 Subject: mm + fs: store shadow entries in page cache Reclaim will be leaving shadow entries in the page cache radix tree upon evicting the real page. As those pages are found from the LRU, an iput() can lead to the inode being freed concurrently. At this point, reclaim must no longer install shadow pages because the inode freeing code needs to ensure the page tree is really empty. Add an address_space flag, AS_EXITING, that the inode freeing code sets under the tree lock before doing the final truncate. Reclaim will check for this flag before installing shadow pages. Signed-off-by: Johannes Weiner Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Cc: Andrea Arcangeli Cc: Bob Liu Cc: Christoph Hellwig Cc: Dave Chinner Cc: Greg Thelen Cc: Hugh Dickins Cc: Jan Kara Cc: KOSAKI Motohiro Cc: Luigi Semenzato Cc: Mel Gorman Cc: Metin Doslu Cc: Michel Lespinasse Cc: Ozgun Erdogan Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Ryan Mallon Cc: Tejun Heo Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index d437f3b..437de7f 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -964,7 +964,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode, (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data); if (sync_data) filemap_write_and_wait(inode->i_mapping); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); } static void ocfs2_delete_inode(struct inode *inode) @@ -1181,7 +1181,7 @@ void ocfs2_evict_inode(struct inode *inode) (OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) { ocfs2_delete_inode(inode); } else { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); } ocfs2_clear_inode(inode); } -- cgit v1.1 From 676d23690fb62b5d51ba5d659935e9f7d9da9f8e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 11 Apr 2014 16:15:36 -0400 Subject: net: Fix use after free by removing length arg from sk_data_ready callbacks. Several spots in the kernel perform a sequence like: skb_queue_tail(&sk->s_receive_queue, skb); sk->sk_data_ready(sk, skb->len); But at the moment we place the SKB onto the socket receive queue it can be consumed and freed up. So this skb->len access is potentially to freed up memory. Furthermore, the skb->len can be modified by the consumer so it is possible that the value isn't accurate. And finally, no actual implementation of this callback actually uses the length argument. And since nobody actually cared about it's value, lots of call sites pass arbitrary values in such as '0' and even '1'. So just remove the length argument from the callback, that way there is no confusion whatsoever and all of these use-after-free cases get fixed as a side effect. Based upon a patch by Eric Dumazet and his suggestion to audit this issue tree-wide. Signed-off-by: David S. Miller --- fs/ocfs2/cluster/tcp.c | 15 +++++++-------- fs/ocfs2/cluster/tcp_internal.h | 2 +- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'fs/ocfs2') diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index eb649d2..d857534 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -137,7 +137,7 @@ static int o2net_sys_err_translations[O2NET_ERR_MAX] = static void o2net_sc_connect_completed(struct work_struct *work); static void o2net_rx_until_empty(struct work_struct *work); static void o2net_shutdown_sc(struct work_struct *work); -static void o2net_listen_data_ready(struct sock *sk, int bytes); +static void o2net_listen_data_ready(struct sock *sk); static void o2net_sc_send_keep_req(struct work_struct *work); static void o2net_idle_timer(unsigned long data); static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); @@ -597,9 +597,9 @@ static void o2net_set_nn_state(struct o2net_node *nn, } /* see o2net_register_callbacks() */ -static void o2net_data_ready(struct sock *sk, int bytes) +static void o2net_data_ready(struct sock *sk) { - void (*ready)(struct sock *sk, int bytes); + void (*ready)(struct sock *sk); read_lock(&sk->sk_callback_lock); if (sk->sk_user_data) { @@ -613,7 +613,7 @@ static void o2net_data_ready(struct sock *sk, int bytes) } read_unlock(&sk->sk_callback_lock); - ready(sk, bytes); + ready(sk); } /* see o2net_register_callbacks() */ @@ -1953,9 +1953,9 @@ static void o2net_accept_many(struct work_struct *work) cond_resched(); } -static void o2net_listen_data_ready(struct sock *sk, int bytes) +static void o2net_listen_data_ready(struct sock *sk) { - void (*ready)(struct sock *sk, int bytes); + void (*ready)(struct sock *sk); read_lock(&sk->sk_callback_lock); ready = sk->sk_user_data; @@ -1978,7 +1978,6 @@ static void o2net_listen_data_ready(struct sock *sk, int bytes) */ if (sk->sk_state == TCP_LISTEN) { - mlog(ML_TCP, "bytes: %d\n", bytes); queue_work(o2net_wq, &o2net_listen_work); } else { ready = NULL; @@ -1987,7 +1986,7 @@ static void o2net_listen_data_ready(struct sock *sk, int bytes) out: read_unlock(&sk->sk_callback_lock); if (ready != NULL) - ready(sk, bytes); + ready(sk); } static int o2net_open_listening_sock(__be32 addr, __be16 port) diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index 4cbcb65..dc024367 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -165,7 +165,7 @@ struct o2net_sock_container { /* original handlers for the sockets */ void (*sc_state_change)(struct sock *sk); - void (*sc_data_ready)(struct sock *sk, int bytes); + void (*sc_data_ready)(struct sock *sk); u32 sc_msg_key; u16 sc_msg_type; -- cgit v1.1