diff options
Diffstat (limited to 'fs')
-rw-r--r-- | fs/ext3/super.c | 4 | ||||
-rw-r--r-- | fs/ext4/super.c | 8 | ||||
-rw-r--r-- | fs/jbd2/journal.c | 4 | ||||
-rw-r--r-- | fs/libfs.c | 29 | ||||
-rw-r--r-- | fs/ocfs2/aops.c | 9 | ||||
-rw-r--r-- | fs/ocfs2/aops.h | 3 | ||||
-rw-r--r-- | fs/ocfs2/dcache.c | 33 | ||||
-rw-r--r-- | fs/ocfs2/dcache.h | 1 | ||||
-rw-r--r-- | fs/ocfs2/dlm/dlmdebug.c | 8 | ||||
-rw-r--r-- | fs/ocfs2/dlmglue.c | 8 | ||||
-rw-r--r-- | fs/ocfs2/file.c | 73 | ||||
-rw-r--r-- | fs/ocfs2/inode.c | 1 | ||||
-rw-r--r-- | fs/ocfs2/inode.h | 12 | ||||
-rw-r--r-- | fs/ocfs2/ioctl.c | 356 | ||||
-rw-r--r-- | fs/ocfs2/journal.c | 9 | ||||
-rw-r--r-- | fs/ocfs2/journal.h | 3 | ||||
-rw-r--r-- | fs/ocfs2/mmap.c | 7 | ||||
-rw-r--r-- | fs/ocfs2/namei.c | 3 | ||||
-rw-r--r-- | fs/ocfs2/ocfs2.h | 32 | ||||
-rw-r--r-- | fs/ocfs2/ocfs2_fs.h | 5 | ||||
-rw-r--r-- | fs/ocfs2/ocfs2_ioctl.h | 95 | ||||
-rw-r--r-- | fs/ocfs2/refcounttree.c | 43 | ||||
-rw-r--r-- | fs/ocfs2/refcounttree.h | 7 | ||||
-rw-r--r-- | fs/ocfs2/slot_map.c | 2 | ||||
-rw-r--r-- | fs/ocfs2/suballoc.c | 16 | ||||
-rw-r--r-- | fs/ocfs2/super.c | 104 | ||||
-rw-r--r-- | fs/ocfs2/sysfile.c | 60 |
27 files changed, 820 insertions, 115 deletions
diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 5dbf4db..a367dd0 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1849,8 +1849,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount; } - if (le32_to_cpu(es->s_blocks_count) > - (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { + if (generic_check_addressable(sb->s_blocksize_bits, + le32_to_cpu(es->s_blocks_count))) { ext3_msg(sb, KERN_ERR, "error: filesystem is too large to mount safely"); if (sizeof(sector_t) < 8) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2614774..7f47c36 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2831,15 +2831,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * Test whether we have more sectors than will fit in sector_t, * and whether the max offset is addressable by the page cache. */ - if ((ext4_blocks_count(es) > - (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) || - (ext4_blocks_count(es) > - (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) { + ret = generic_check_addressable(sb->s_blocksize_bits, + ext4_blocks_count(es)); + if (ret) { ext4_msg(sb, KERN_ERR, "filesystem" " too large to mount safely on this system"); if (sizeof(sector_t) < 8) ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); - ret = -EFBIG; goto failed_mount; } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 0e8014e..262419f 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1371,6 +1371,10 @@ int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat, if (!compat && !ro && !incompat) return 1; + /* Load journal superblock if it is not loaded yet. */ + if (journal->j_format_version == 0 && + journal_get_superblock(journal) != 0) + return 0; if (journal->j_format_version == 1) return 0; @@ -913,6 +913,35 @@ int generic_file_fsync(struct file *file, int datasync) } EXPORT_SYMBOL(generic_file_fsync); +/** + * generic_check_addressable - Check addressability of file system + * @blocksize_bits: log of file system block size + * @num_blocks: number of blocks in file system + * + * Determine whether a file system with @num_blocks blocks (and a + * block size of 2**@blocksize_bits) is addressable by the sector_t + * and page cache of the system. Return 0 if so and -EFBIG otherwise. + */ +int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks) +{ + u64 last_fs_block = num_blocks - 1; + u64 last_fs_page = + last_fs_block >> (PAGE_CACHE_SHIFT - blocksize_bits); + + if (unlikely(num_blocks == 0)) + return 0; + + if ((blocksize_bits < 9) || (blocksize_bits > PAGE_CACHE_SHIFT)) + return -EINVAL; + + if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) || + (last_fs_page > (pgoff_t)(~0ULL))) { + return -EFBIG; + } + return 0; +} +EXPORT_SYMBOL(generic_check_addressable); + /* * No-op implementation of ->fsync for in-memory filesystems. */ diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 0de69c9..5cfeee1 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -883,8 +883,8 @@ struct ocfs2_write_ctxt { * out in so that future reads from that region will get * zero's. */ - struct page *w_pages[OCFS2_MAX_CTXT_PAGES]; unsigned int w_num_pages; + struct page *w_pages[OCFS2_MAX_CTXT_PAGES]; struct page *w_target_page; /* @@ -1642,7 +1642,8 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh, return ret; } -int ocfs2_write_begin_nolock(struct address_space *mapping, +int ocfs2_write_begin_nolock(struct file *filp, + struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, struct buffer_head *di_bh, struct page *mmap_page) @@ -1692,7 +1693,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, mlog_errno(ret); goto out; } else if (ret == 1) { - ret = ocfs2_refcount_cow(inode, di_bh, + ret = ocfs2_refcount_cow(inode, filp, di_bh, wc->w_cpos, wc->w_clen, UINT_MAX); if (ret) { mlog_errno(ret); @@ -1854,7 +1855,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping, */ down_write(&OCFS2_I(inode)->ip_alloc_sem); - ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep, + ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep, fsdata, di_bh, NULL); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index c48e93f..7606f66 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -48,7 +48,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); -int ocfs2_write_begin_nolock(struct address_space *mapping, +int ocfs2_write_begin_nolock(struct file *filp, + struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, struct buffer_head *di_bh, struct page *mmap_page); diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index b4957c7..edaded4 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -40,6 +40,14 @@ #include "inode.h" #include "super.h" +void ocfs2_dentry_attach_gen(struct dentry *dentry) +{ + unsigned long gen = + OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen; + BUG_ON(dentry->d_inode); + dentry->d_fsdata = (void *)gen; +} + static int ocfs2_dentry_revalidate(struct dentry *dentry, struct nameidata *nd) @@ -51,11 +59,20 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, mlog_entry("(0x%p, '%.*s')\n", dentry, dentry->d_name.len, dentry->d_name.name); - /* Never trust a negative dentry - force a new lookup. */ + /* For a negative dentry - + * check the generation number of the parent and compare with the + * one stored in the inode. + */ if (inode == NULL) { - mlog(0, "negative dentry: %.*s\n", dentry->d_name.len, - dentry->d_name.name); - goto bail; + unsigned long gen = (unsigned long) dentry->d_fsdata; + unsigned long pgen = + OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen; + mlog(0, "negative dentry: %.*s parent gen: %lu " + "dentry gen: %lu\n", + dentry->d_name.len, dentry->d_name.name, pgen, gen); + if (gen != pgen) + goto bail; + goto valid; } BUG_ON(!osb); @@ -96,6 +113,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, goto bail; } +valid: ret = 1; bail: @@ -227,6 +245,12 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry, if (!inode) return 0; + if (!dentry->d_inode && dentry->d_fsdata) { + /* Converting a negative dentry to positive + Clear dentry->d_fsdata */ + dentry->d_fsdata = dl = NULL; + } + if (dl) { mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno, " \"%.*s\": old parent: %llu, new: %llu\n", @@ -452,6 +476,7 @@ static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode) out: iput(inode); + ocfs2_dentry_attach_gen(dentry); } /* diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h index f5dd178..b79eff7 100644 --- a/fs/ocfs2/dcache.h +++ b/fs/ocfs2/dcache.h @@ -64,5 +64,6 @@ void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target, struct inode *old_dir, struct inode *new_dir); extern spinlock_t dentry_attach_lock; +void ocfs2_dentry_attach_gen(struct dentry *dentry); #endif /* OCFS2_DCACHE_H */ diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index f693ab8..272ec86 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -493,7 +493,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db) struct hlist_head *bucket; struct hlist_node *list; int i, out = 0; - unsigned long total = 0, longest = 0, bktcnt; + unsigned long total = 0, longest = 0, bucket_count = 0; out += snprintf(db->buf + out, db->len - out, "Dumping MLEs for Domain: %s\n", dlm->name); @@ -505,13 +505,13 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db) mle = hlist_entry(list, struct dlm_master_list_entry, master_hash_node); ++total; - ++bktcnt; + ++bucket_count; if (db->len - out < 200) continue; out += dump_mle(mle, db->buf + out, db->len - out); } - longest = max(longest, bktcnt); - bktcnt = 0; + longest = max(longest, bucket_count); + bucket_count = 0; } spin_unlock(&dlm->master_lock); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 5e02a89..e8d94d7 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3635,10 +3635,18 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, { struct inode *inode; struct address_space *mapping; + struct ocfs2_inode_info *oi; inode = ocfs2_lock_res_inode(lockres); mapping = inode->i_mapping; + if (S_ISDIR(inode->i_mode)) { + oi = OCFS2_I(inode); + oi->ip_dir_lock_gen++; + mlog(0, "generation: %u\n", oi->ip_dir_lock_gen); + goto out; + } + if (!S_ISREG(inode->i_mode)) goto out; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 9a03c15..9e8cc43 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -64,12 +64,6 @@ #include "buffer_head_io.h" -static int ocfs2_sync_inode(struct inode *inode) -{ - filemap_fdatawrite(inode->i_mapping); - return sync_mapping_buffers(inode->i_mapping); -} - static int ocfs2_init_file_private(struct inode *inode, struct file *file) { struct ocfs2_file_private *fp; @@ -180,16 +174,12 @@ static int ocfs2_sync_file(struct file *file, int datasync) { int err = 0; journal_t *journal; - struct dentry *dentry = file->f_path.dentry; struct inode *inode = file->f_mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, - dentry->d_name.len, dentry->d_name.name); - - err = ocfs2_sync_inode(dentry->d_inode); - if (err) - goto bail; + mlog_entry("(0x%p, %d, 0x%p, '%.*s')\n", file, datasync, + file->f_path.dentry, file->f_path.dentry->d_name.len, + file->f_path.dentry->d_name.name); if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { /* @@ -370,7 +360,7 @@ static int ocfs2_cow_file_pos(struct inode *inode, if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) goto out; - return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1); + return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1); out: return status; @@ -913,8 +903,8 @@ static int ocfs2_zero_extend_get_range(struct inode *inode, zero_clusters = last_cpos - zero_cpos; if (needs_cow) { - rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters, - UINT_MAX); + rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos, + zero_clusters, UINT_MAX); if (rc) { mlog_errno(rc); goto out; @@ -2062,6 +2052,7 @@ out: } static int ocfs2_prepare_inode_for_refcount(struct inode *inode, + struct file *file, loff_t pos, size_t count, int *meta_level) { @@ -2079,7 +2070,7 @@ static int ocfs2_prepare_inode_for_refcount(struct inode *inode, *meta_level = 1; - ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); + ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX); if (ret) mlog_errno(ret); out: @@ -2087,7 +2078,7 @@ out: return ret; } -static int ocfs2_prepare_inode_for_write(struct dentry *dentry, +static int ocfs2_prepare_inode_for_write(struct file *file, loff_t *ppos, size_t count, int appending, @@ -2095,6 +2086,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, int *has_refcount) { int ret = 0, meta_level = 0; + struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; loff_t saved_pos, end; @@ -2150,6 +2142,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, meta_level = -1; ret = ocfs2_prepare_inode_for_refcount(inode, + file, saved_pos, count, &meta_level); @@ -2232,6 +2225,8 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, struct file *file = iocb->ki_filp; struct inode *inode = file->f_path.dentry->d_inode; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int full_coherency = !(osb->s_mount_opt & + OCFS2_MOUNT_COHERENCY_BUFFERED); mlog_entry("(0x%p, %u, '%.*s')\n", file, (unsigned int)nr_segs, @@ -2255,16 +2250,39 @@ relock: have_alloc_sem = 1; } - /* concurrent O_DIRECT writes are allowed */ - rw_level = !direct_io; + /* + * Concurrent O_DIRECT writes are allowed with + * mount_option "coherency=buffered". + */ + rw_level = (!direct_io || full_coherency); + ret = ocfs2_rw_lock(inode, rw_level); if (ret < 0) { mlog_errno(ret); goto out_sems; } + /* + * O_DIRECT writes with "coherency=full" need to take EX cluster + * inode_lock to guarantee coherency. + */ + if (direct_io && full_coherency) { + /* + * We need to take and drop the inode lock to force + * other nodes to drop their caches. Buffered I/O + * already does this in write_begin(). + */ + ret = ocfs2_inode_lock(inode, NULL, 1); + if (ret < 0) { + mlog_errno(ret); + goto out_sems; + } + + ocfs2_inode_unlock(inode, 1); + } + can_do_direct = direct_io; - ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, + ret = ocfs2_prepare_inode_for_write(file, ppos, iocb->ki_left, appending, &can_do_direct, &has_refcount); if (ret < 0) { @@ -2312,17 +2330,6 @@ relock: written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, ppos, count, ocount); if (written < 0) { - /* - * direct write may have instantiated a few - * blocks outside i_size. Trim these off again. - * Don't need i_size_read because we hold i_mutex. - * - * XXX(truncate): this looks buggy because ocfs2 did not - * actually implement ->truncate. Take a look at - * the new truncate sequence and update this accordingly - */ - if (*ppos + count > inode->i_size) - truncate_setsize(inode, inode->i_size); ret = written; goto out_dio; } @@ -2394,7 +2401,7 @@ static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, { int ret; - ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos, + ret = ocfs2_prepare_inode_for_write(out, &sd->pos, sd->total_len, 0, NULL, NULL); if (ret < 0) { mlog_errno(ret); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index eece3e0..f935fd6 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -335,6 +335,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, else inode->i_fop = &ocfs2_dops_no_plocks; i_size_write(inode, le64_to_cpu(fe->i_size)); + OCFS2_I(inode)->ip_dir_lock_gen = 1; break; case S_IFLNK: if (ocfs2_inode_is_fast_symlink(inode)) diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 6de5a86..1c508b1 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -46,30 +46,28 @@ struct ocfs2_inode_info /* These fields are protected by ip_lock */ spinlock_t ip_lock; u32 ip_open_count; - u32 ip_clusters; struct list_head ip_io_markers; + u32 ip_clusters; + u16 ip_dyn_features; struct mutex ip_io_mutex; - u32 ip_flags; /* see below */ u32 ip_attr; /* inode attributes */ - u16 ip_dyn_features; /* protected by recovery_lock. */ struct inode *ip_next_orphan; - u32 ip_dir_start_lookup; - struct ocfs2_caching_info ip_metadata_cache; - struct ocfs2_extent_map ip_extent_map; - struct inode vfs_inode; struct jbd2_inode ip_jinode; + u32 ip_dir_start_lookup; + /* Only valid if the inode is the dir. */ u32 ip_last_used_slot; u64 ip_last_used_group; + u32 ip_dir_lock_gen; struct ocfs2_alloc_reservation ip_la_data_resv; }; diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 7d9d9c1..7a48681 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -26,6 +26,26 @@ #include <linux/ext2_fs.h> +#define o2info_from_user(a, b) \ + copy_from_user(&(a), (b), sizeof(a)) +#define o2info_to_user(a, b) \ + copy_to_user((typeof(a) __user *)b, &(a), sizeof(a)) + +/* + * This call is void because we are already reporting an error that may + * be -EFAULT. The error will be returned from the ioctl(2) call. It's + * just a best-effort to tell userspace that this request caused the error. + */ +static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq, + struct ocfs2_info_request __user *req) +{ + kreq->ir_flags |= OCFS2_INFO_FL_ERROR; + (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags)); +} + +#define o2info_set_request_error(a, b) \ + __o2info_set_request_error((struct ocfs2_info_request *)&(a), b) + static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) { int status; @@ -109,6 +129,328 @@ bail: return status; } +int ocfs2_info_handle_blocksize(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_blocksize oib; + + if (o2info_from_user(oib, req)) + goto bail; + + oib.ib_blocksize = inode->i_sb->s_blocksize; + oib.ib_req.ir_flags |= OCFS2_INFO_FL_FILLED; + + if (o2info_to_user(oib, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(oib, req); + + return status; +} + +int ocfs2_info_handle_clustersize(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_clustersize oic; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oic, req)) + goto bail; + + oic.ic_clustersize = osb->s_clustersize; + oic.ic_req.ir_flags |= OCFS2_INFO_FL_FILLED; + + if (o2info_to_user(oic, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(oic, req); + + return status; +} + +int ocfs2_info_handle_maxslots(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_maxslots oim; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oim, req)) + goto bail; + + oim.im_max_slots = osb->max_slots; + oim.im_req.ir_flags |= OCFS2_INFO_FL_FILLED; + + if (o2info_to_user(oim, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(oim, req); + + return status; +} + +int ocfs2_info_handle_label(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_label oil; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oil, req)) + goto bail; + + memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN); + oil.il_req.ir_flags |= OCFS2_INFO_FL_FILLED; + + if (o2info_to_user(oil, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(oil, req); + + return status; +} + +int ocfs2_info_handle_uuid(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_uuid oiu; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oiu, req)) + goto bail; + + memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1); + oiu.iu_req.ir_flags |= OCFS2_INFO_FL_FILLED; + + if (o2info_to_user(oiu, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(oiu, req); + + return status; +} + +int ocfs2_info_handle_fs_features(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_fs_features oif; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oif, req)) + goto bail; + + oif.if_compat_features = osb->s_feature_compat; + oif.if_incompat_features = osb->s_feature_incompat; + oif.if_ro_compat_features = osb->s_feature_ro_compat; + oif.if_req.ir_flags |= OCFS2_INFO_FL_FILLED; + + if (o2info_to_user(oif, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(oif, req); + + return status; +} + +int ocfs2_info_handle_journal_size(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_journal_size oij; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oij, req)) + goto bail; + + oij.ij_journal_size = osb->journal->j_inode->i_size; + + oij.ij_req.ir_flags |= OCFS2_INFO_FL_FILLED; + + if (o2info_to_user(oij, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(oij, req); + + return status; +} + +int ocfs2_info_handle_unknown(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_request oir; + + if (o2info_from_user(oir, req)) + goto bail; + + oir.ir_flags &= ~OCFS2_INFO_FL_FILLED; + + if (o2info_to_user(oir, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(oir, req); + + return status; +} + +/* + * Validate and distinguish OCFS2_IOC_INFO requests. + * + * - validate the magic number. + * - distinguish different requests. + * - validate size of different requests. + */ +int ocfs2_info_handle_request(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_request oir; + + if (o2info_from_user(oir, req)) + goto bail; + + status = -EINVAL; + if (oir.ir_magic != OCFS2_INFO_MAGIC) + goto bail; + + switch (oir.ir_code) { + case OCFS2_INFO_BLOCKSIZE: + if (oir.ir_size == sizeof(struct ocfs2_info_blocksize)) + status = ocfs2_info_handle_blocksize(inode, req); + break; + case OCFS2_INFO_CLUSTERSIZE: + if (oir.ir_size == sizeof(struct ocfs2_info_clustersize)) + status = ocfs2_info_handle_clustersize(inode, req); + break; + case OCFS2_INFO_MAXSLOTS: + if (oir.ir_size == sizeof(struct ocfs2_info_maxslots)) + status = ocfs2_info_handle_maxslots(inode, req); + break; + case OCFS2_INFO_LABEL: + if (oir.ir_size == sizeof(struct ocfs2_info_label)) + status = ocfs2_info_handle_label(inode, req); + break; + case OCFS2_INFO_UUID: + if (oir.ir_size == sizeof(struct ocfs2_info_uuid)) + status = ocfs2_info_handle_uuid(inode, req); + break; + case OCFS2_INFO_FS_FEATURES: + if (oir.ir_size == sizeof(struct ocfs2_info_fs_features)) + status = ocfs2_info_handle_fs_features(inode, req); + break; + case OCFS2_INFO_JOURNAL_SIZE: + if (oir.ir_size == sizeof(struct ocfs2_info_journal_size)) + status = ocfs2_info_handle_journal_size(inode, req); + break; + default: + status = ocfs2_info_handle_unknown(inode, req); + break; + } + +bail: + return status; +} + +int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx, + u64 *req_addr, int compat_flag) +{ + int status = -EFAULT; + u64 __user *bp = NULL; + + if (compat_flag) { +#ifdef CONFIG_COMPAT + /* + * pointer bp stores the base address of a pointers array, + * which collects all addresses of separate request. + */ + bp = (u64 __user *)(unsigned long)compat_ptr(info->oi_requests); +#else + BUG(); +#endif + } else + bp = (u64 __user *)(unsigned long)(info->oi_requests); + + if (o2info_from_user(*req_addr, bp + idx)) + goto bail; + + status = 0; +bail: + return status; +} + +/* + * OCFS2_IOC_INFO handles an array of requests passed from userspace. + * + * ocfs2_info_handle() recevies a large info aggregation, grab and + * validate the request count from header, then break it into small + * pieces, later specific handlers can handle them one by one. + * + * Idea here is to make each separate request small enough to ensure + * a better backward&forward compatibility, since a small piece of + * request will be less likely to be broken if disk layout get changed. + */ +int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info, + int compat_flag) +{ + int i, status = 0; + u64 req_addr; + struct ocfs2_info_request __user *reqp; + + if ((info->oi_count > OCFS2_INFO_MAX_REQUEST) || + (!info->oi_requests)) { + status = -EINVAL; + goto bail; + } + + for (i = 0; i < info->oi_count; i++) { + + status = ocfs2_get_request_ptr(info, i, &req_addr, compat_flag); + if (status) + break; + + reqp = (struct ocfs2_info_request *)(unsigned long)req_addr; + if (!reqp) { + status = -EINVAL; + goto bail; + } + + status = ocfs2_info_handle_request(inode, reqp); + if (status) + break; + } + +bail: + return status; +} + long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = filp->f_path.dentry->d_inode; @@ -120,6 +462,7 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) struct reflink_arguments args; const char *old_path, *new_path; bool preserve; + struct ocfs2_info info; switch (cmd) { case OCFS2_IOC_GETFLAGS: @@ -174,6 +517,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) preserve = (args.preserve != 0); return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve); + case OCFS2_IOC_INFO: + if (copy_from_user(&info, (struct ocfs2_info __user *)arg, + sizeof(struct ocfs2_info))) + return -EFAULT; + + return ocfs2_info_handle(inode, &info, 0); default: return -ENOTTY; } @@ -185,6 +534,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) bool preserve; struct reflink_arguments args; struct inode *inode = file->f_path.dentry->d_inode; + struct ocfs2_info info; switch (cmd) { case OCFS2_IOC32_GETFLAGS: @@ -209,6 +559,12 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path), compat_ptr(args.new_path), preserve); + case OCFS2_IOC_INFO: + if (copy_from_user(&info, (struct ocfs2_info __user *)arg, + sizeof(struct ocfs2_info))) + return -EFAULT; + + return ocfs2_info_handle(inode, &info, 1); default: return -ENOIOCTLCMD; } diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 9b57c03..faa2303 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -301,7 +301,6 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb) { int status = 0; unsigned int flushed; - unsigned long old_id; struct ocfs2_journal *journal = NULL; mlog_entry_void(); @@ -326,7 +325,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb) goto finally; } - old_id = ocfs2_inc_trans_id(journal); + ocfs2_inc_trans_id(journal); flushed = atomic_read(&journal->j_num_trans); atomic_set(&journal->j_num_trans, 0); @@ -342,9 +341,6 @@ finally: return status; } -/* pass it NULL and it will allocate a new handle object for you. If - * you pass it a handle however, it may still return error, in which - * case it has free'd the passed handle for you. */ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) { journal_t *journal = osb->journal->j_journal; @@ -1888,6 +1884,8 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) os = &osb->osb_orphan_scan; + mlog(0, "Begin orphan scan\n"); + if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE) goto out; @@ -1920,6 +1918,7 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) unlock: ocfs2_orphan_scan_unlock(osb, seqno); out: + mlog(0, "Orphan scan completed\n"); return; } diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index b5baaa8..43e56b9 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -67,11 +67,12 @@ struct ocfs2_journal { struct buffer_head *j_bh; /* Journal disk inode block */ atomic_t j_num_trans; /* Number of transactions * currently in the system. */ + spinlock_t j_lock; unsigned long j_trans_id; struct rw_semaphore j_trans_barrier; wait_queue_head_t j_checkpointed; - spinlock_t j_lock; + /* both fields protected by j_lock*/ struct list_head j_la_cleanups; struct work_struct j_recovery_work; }; diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 4c18f4a..7e32db9 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -59,10 +59,11 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf) return ret; } -static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh, +static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, struct page *page) { int ret; + struct inode *inode = file->f_path.dentry->d_inode; struct address_space *mapping = inode->i_mapping; loff_t pos = page_offset(page); unsigned int len = PAGE_CACHE_SIZE; @@ -111,7 +112,7 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh, if (page->index == last_index) len = ((size - 1) & ~PAGE_CACHE_MASK) + 1; - ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page, + ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page, &fsdata, di_bh, page); if (ret) { if (ret != -ENOSPC) @@ -159,7 +160,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) */ down_write(&OCFS2_I(inode)->ip_alloc_sem); - ret = __ocfs2_page_mkwrite(inode, di_bh, page); + ret = __ocfs2_page_mkwrite(vma->vm_file, di_bh, page); up_write(&OCFS2_I(inode)->ip_alloc_sem); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index a00dda2..e7bde21 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -171,7 +171,8 @@ bail_add: ret = ERR_PTR(status); goto bail_unlock; } - } + } else + ocfs2_dentry_attach_gen(dentry); bail_unlock: /* Don't drop the cluster lock until *after* the d_add -- diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 481387b..d840821 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -150,26 +150,33 @@ typedef void (*ocfs2_lock_callback)(int status, unsigned long data); struct ocfs2_lock_res { void *l_priv; struct ocfs2_lock_res_ops *l_ops; - spinlock_t l_lock; + struct list_head l_blocked_list; struct list_head l_mask_waiters; - enum ocfs2_lock_type l_type; unsigned long l_flags; char l_name[OCFS2_LOCK_ID_MAX_LEN]; - int l_level; unsigned int l_ro_holders; unsigned int l_ex_holders; - struct ocfs2_dlm_lksb l_lksb; + unsigned char l_level; + + /* Data packed - type enum ocfs2_lock_type */ + unsigned char l_type; /* used from AST/BAST funcs. */ - enum ocfs2_ast_action l_action; - enum ocfs2_unlock_action l_unlock_action; - int l_requested; - int l_blocking; + /* Data packed - enum type ocfs2_ast_action */ + unsigned char l_action; + /* Data packed - enum type ocfs2_unlock_action */ + unsigned char l_unlock_action; + unsigned char l_requested; + unsigned char l_blocking; unsigned int l_pending_gen; + spinlock_t l_lock; + + struct ocfs2_dlm_lksb l_lksb; + wait_queue_head_t l_event; struct list_head l_debug_list; @@ -256,8 +263,10 @@ enum ocfs2_mount_options control lists */ OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */ OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */ - OCFS2_MOUNT_HB_NONE = 1 << 12, /* No heartbeat */ - OCFS2_MOUNT_HB_GLOBAL = 1 << 13, /* Global heartbeat */ + OCFS2_MOUNT_COHERENCY_BUFFERED = 1 << 12, /* Allow concurrent O_DIRECT + writes */ + OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */ + OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ }; #define OCFS2_OSB_SOFT_RO 0x0001 @@ -279,7 +288,8 @@ struct ocfs2_super struct super_block *sb; struct inode *root_inode; struct inode *sys_root_inode; - struct inode *system_inodes[NUM_SYSTEM_INODES]; + struct inode *global_system_inodes[NUM_GLOBAL_SYSTEM_INODES]; + struct inode **local_system_inodes; struct ocfs2_slot_info *slot_info; diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 28ff536..c2e4f82 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -338,6 +338,7 @@ enum { USER_QUOTA_SYSTEM_INODE, GROUP_QUOTA_SYSTEM_INODE, #define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE +#define OCFS2_FIRST_LOCAL_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE, EXTENT_ALLOC_SYSTEM_INODE, INODE_ALLOC_SYSTEM_INODE, @@ -346,8 +347,12 @@ enum { TRUNCATE_LOG_SYSTEM_INODE, LOCAL_USER_QUOTA_SYSTEM_INODE, LOCAL_GROUP_QUOTA_SYSTEM_INODE, +#define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE NUM_SYSTEM_INODES }; +#define NUM_GLOBAL_SYSTEM_INODES OCFS2_LAST_GLOBAL_SYSTEM_INODE +#define NUM_LOCAL_SYSTEM_INODES \ + (NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE) static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { /* Global system inodes (single copy) */ diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h index 5d24150..b46f39b 100644 --- a/fs/ocfs2/ocfs2_ioctl.h +++ b/fs/ocfs2/ocfs2_ioctl.h @@ -76,4 +76,99 @@ struct reflink_arguments { }; #define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments) +/* Following definitions dedicated for ocfs2_info_request ioctls. */ +#define OCFS2_INFO_MAX_REQUEST (50) +#define OCFS2_TEXT_UUID_LEN (OCFS2_VOL_UUID_LEN * 2) + +/* Magic number of all requests */ +#define OCFS2_INFO_MAGIC (0x4F32494E) + +/* + * Always try to separate info request into small pieces to + * guarantee the backward&forward compatibility. + */ +struct ocfs2_info { + __u64 oi_requests; /* Array of __u64 pointers to requests */ + __u32 oi_count; /* Number of requests in info_requests */ + __u32 oi_pad; +}; + +struct ocfs2_info_request { +/*00*/ __u32 ir_magic; /* Magic number */ + __u32 ir_code; /* Info request code */ + __u32 ir_size; /* Size of request */ + __u32 ir_flags; /* Request flags */ +/*10*/ /* Request specific fields */ +}; + +struct ocfs2_info_clustersize { + struct ocfs2_info_request ic_req; + __u32 ic_clustersize; + __u32 ic_pad; +}; + +struct ocfs2_info_blocksize { + struct ocfs2_info_request ib_req; + __u32 ib_blocksize; + __u32 ib_pad; +}; + +struct ocfs2_info_maxslots { + struct ocfs2_info_request im_req; + __u32 im_max_slots; + __u32 im_pad; +}; + +struct ocfs2_info_label { + struct ocfs2_info_request il_req; + __u8 il_label[OCFS2_MAX_VOL_LABEL_LEN]; +} __attribute__ ((packed)); + +struct ocfs2_info_uuid { + struct ocfs2_info_request iu_req; + __u8 iu_uuid_str[OCFS2_TEXT_UUID_LEN + 1]; +} __attribute__ ((packed)); + +struct ocfs2_info_fs_features { + struct ocfs2_info_request if_req; + __u32 if_compat_features; + __u32 if_incompat_features; + __u32 if_ro_compat_features; + __u32 if_pad; +}; + +struct ocfs2_info_journal_size { + struct ocfs2_info_request ij_req; + __u64 ij_journal_size; +}; + +/* Codes for ocfs2_info_request */ +enum ocfs2_info_type { + OCFS2_INFO_CLUSTERSIZE = 1, + OCFS2_INFO_BLOCKSIZE, + OCFS2_INFO_MAXSLOTS, + OCFS2_INFO_LABEL, + OCFS2_INFO_UUID, + OCFS2_INFO_FS_FEATURES, + OCFS2_INFO_JOURNAL_SIZE, + OCFS2_INFO_NUM_TYPES +}; + +/* Flags for struct ocfs2_info_request */ +/* Filled by the caller */ +#define OCFS2_INFO_FL_NON_COHERENT (0x00000001) /* Cluster coherency not + required. This is a hint. + It is up to ocfs2 whether + the request can be fulfilled + without locking. */ +/* Filled by ocfs2 */ +#define OCFS2_INFO_FL_FILLED (0x40000000) /* Filesystem understood + this request and + filled in the answer */ + +#define OCFS2_INFO_FL_ERROR (0x80000000) /* Error happened during + request handling. */ + +#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info) + #endif /* OCFS2_IOCTL_H */ diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index efdd756..b5f9160 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -49,6 +49,7 @@ struct ocfs2_cow_context { struct inode *inode; + struct file *file; u32 cow_start; u32 cow_len; struct ocfs2_extent_tree data_et; @@ -2932,13 +2933,16 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle, u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); struct page *page; pgoff_t page_index; - unsigned int from, to; + unsigned int from, to, readahead_pages; loff_t offset, end, map_end; struct address_space *mapping = context->inode->i_mapping; mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster, new_cluster, new_len, cpos); + readahead_pages = + (ocfs2_cow_contig_clusters(sb) << + OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT; offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); /* @@ -2969,6 +2973,14 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle, if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) BUG_ON(PageDirty(page)); + if (PageReadahead(page) && context->file) { + page_cache_async_readahead(mapping, + &context->file->f_ra, + context->file, + page, page_index, + readahead_pages); + } + if (!PageUptodate(page)) { ret = block_read_full_page(page, ocfs2_get_block); if (ret) { @@ -3409,12 +3421,35 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context) return ret; } +static void ocfs2_readahead_for_cow(struct inode *inode, + struct file *file, + u32 start, u32 len) +{ + struct address_space *mapping; + pgoff_t index; + unsigned long num_pages; + int cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits; + + if (!file) + return; + + mapping = file->f_mapping; + num_pages = (len << cs_bits) >> PAGE_CACHE_SHIFT; + if (!num_pages) + num_pages = 1; + + index = ((loff_t)start << cs_bits) >> PAGE_CACHE_SHIFT; + page_cache_sync_readahead(mapping, &file->f_ra, file, + index, num_pages); +} + /* * Starting at cpos, try to CoW write_len clusters. Don't CoW * past max_cpos. This will stop when it runs into a hole or an * unrefcounted extent. */ static int ocfs2_refcount_cow_hunk(struct inode *inode, + struct file *file, struct buffer_head *di_bh, u32 cpos, u32 write_len, u32 max_cpos) { @@ -3443,6 +3478,8 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode, BUG_ON(cow_len == 0); + ocfs2_readahead_for_cow(inode, file, cow_start, cow_len); + context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); if (!context) { ret = -ENOMEM; @@ -3464,6 +3501,7 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode, context->ref_root_bh = ref_root_bh; context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page; context->get_clusters = ocfs2_di_get_clusters; + context->file = file; ocfs2_init_dinode_extent_tree(&context->data_et, INODE_CACHE(inode), di_bh); @@ -3492,6 +3530,7 @@ out: * clusters between cpos and cpos+write_len are safe to modify. */ int ocfs2_refcount_cow(struct inode *inode, + struct file *file, struct buffer_head *di_bh, u32 cpos, u32 write_len, u32 max_cpos) { @@ -3511,7 +3550,7 @@ int ocfs2_refcount_cow(struct inode *inode, num_clusters = write_len; if (ext_flags & OCFS2_EXT_REFCOUNTED) { - ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos, + ret = ocfs2_refcount_cow_hunk(inode, file, di_bh, cpos, num_clusters, max_cpos); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 9983ba1..c8ce46f7 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -21,14 +21,14 @@ struct ocfs2_refcount_tree { struct rb_node rf_node; u64 rf_blkno; u32 rf_generation; + struct kref rf_getcnt; struct rw_semaphore rf_sem; struct ocfs2_lock_res rf_lockres; - struct kref rf_getcnt; int rf_removed; /* the following 4 fields are used by caching_info. */ - struct ocfs2_caching_info rf_ci; spinlock_t rf_lock; + struct ocfs2_caching_info rf_ci; struct mutex rf_io_mutex; struct super_block *rf_sb; }; @@ -52,7 +52,8 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, u32 clusters, int *credits, int *ref_blocks); -int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh, +int ocfs2_refcount_cow(struct inode *inode, + struct file *filep, struct buffer_head *di_bh, u32 cpos, u32 write_len, u32 max_cpos); typedef int (ocfs2_post_refcount_func)(struct inode *inode, diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index bfbd7e9..ab4e017 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -357,7 +357,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb, { int status = 0; u64 blkno; - unsigned long long blocks, bytes; + unsigned long long blocks, bytes = 0; unsigned int i; struct buffer_head *bh; diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 849c2f0..5fed60d 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1380,6 +1380,14 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle, } le16_add_cpu(&bg->bg_free_bits_count, -num_bits); + if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { + ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" + " count %u but claims %u are freed. num_bits %d", + (unsigned long long)le64_to_cpu(bg->bg_blkno), + le16_to_cpu(bg->bg_bits), + le16_to_cpu(bg->bg_free_bits_count), num_bits); + return -EROFS; + } while(num_bits--) ocfs2_set_bit(bit_off++, bitmap); @@ -2419,6 +2427,14 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, (unsigned long *) undo_bg->bg_bitmap); } le16_add_cpu(&bg->bg_free_bits_count, num_bits); + if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { + ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" + " count %u but claims %u are freed. num_bits %d", + (unsigned long long)le64_to_cpu(bg->bg_blkno), + le16_to_cpu(bg->bg_bits), + le16_to_cpu(bg->bg_free_bits_count), num_bits); + return -EROFS; + } if (undo_fn) jbd_unlock_bh_state(group_bh); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 4e009ad..a8a0ca4 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -178,6 +178,8 @@ enum { Opt_noacl, Opt_usrquota, Opt_grpquota, + Opt_coherency_buffered, + Opt_coherency_full, Opt_resv_level, Opt_dir_resv_level, Opt_err, @@ -207,6 +209,8 @@ static const match_table_t tokens = { {Opt_noacl, "noacl"}, {Opt_usrquota, "usrquota"}, {Opt_grpquota, "grpquota"}, + {Opt_coherency_buffered, "coherency=buffered"}, + {Opt_coherency_full, "coherency=full"}, {Opt_resv_level, "resv_level=%u"}, {Opt_dir_resv_level, "dir_resv_level=%u"}, {Opt_err, NULL} @@ -516,11 +520,11 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb) mlog_entry_void(); - for (i = 0; i < NUM_SYSTEM_INODES; i++) { - inode = osb->system_inodes[i]; + for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) { + inode = osb->global_system_inodes[i]; if (inode) { iput(inode); - osb->system_inodes[i] = NULL; + osb->global_system_inodes[i] = NULL; } } @@ -536,6 +540,20 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb) osb->root_inode = NULL; } + if (!osb->local_system_inodes) + goto out; + + for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) { + if (osb->local_system_inodes[i]) { + iput(osb->local_system_inodes[i]); + osb->local_system_inodes[i] = NULL; + } + } + + kfree(osb->local_system_inodes); + osb->local_system_inodes = NULL; + +out: mlog_exit(0); } @@ -1452,6 +1470,12 @@ static int ocfs2_parse_options(struct super_block *sb, case Opt_grpquota: mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; break; + case Opt_coherency_buffered: + mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED; + break; + case Opt_coherency_full: + mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED; + break; case Opt_acl: mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; @@ -1563,6 +1587,11 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) if (opts & OCFS2_MOUNT_GRPQUOTA) seq_printf(s, ",grpquota"); + if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED) + seq_printf(s, ",coherency=buffered"); + else + seq_printf(s, ",coherency=full"); + if (opts & OCFS2_MOUNT_NOUSERXATTR) seq_printf(s, ",nouser_xattr"); else @@ -2017,6 +2046,36 @@ static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uu return 0; } +/* Make sure entire volume is addressable by our journal. Requires + osb_clusters_at_boot to be valid and for the journal to have been + initialized by ocfs2_journal_init(). */ +static int ocfs2_journal_addressable(struct ocfs2_super *osb) +{ + int status = 0; + u64 max_block = + ocfs2_clusters_to_blocks(osb->sb, + osb->osb_clusters_at_boot) - 1; + + /* 32-bit block number is always OK. */ + if (max_block <= (u32)~0ULL) + goto out; + + /* Volume is "huge", so see if our journal is new enough to + support it. */ + if (!(OCFS2_HAS_COMPAT_FEATURE(osb->sb, + OCFS2_FEATURE_COMPAT_JBD2_SB) && + jbd2_journal_check_used_features(osb->journal->j_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_64BIT))) { + mlog(ML_ERROR, "The journal cannot address the entire volume. " + "Enable the 'block64' journal option with tunefs.ocfs2"); + status = -EFBIG; + goto out; + } + + out: + return status; +} + static int ocfs2_initialize_super(struct super_block *sb, struct buffer_head *bh, int sector_size, @@ -2029,6 +2088,7 @@ static int ocfs2_initialize_super(struct super_block *sb, struct ocfs2_journal *journal; __le32 uuid_net_key; struct ocfs2_super *osb; + u64 total_blocks; mlog_entry_void(); @@ -2087,6 +2147,15 @@ static int ocfs2_initialize_super(struct super_block *sb, snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); + osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots); + if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) { + mlog(ML_ERROR, "Invalid number of node slots (%u)\n", + osb->max_slots); + status = -EINVAL; + goto bail; + } + mlog(0, "max_slots for this device: %u\n", osb->max_slots); + ocfs2_orphan_scan_init(osb); status = ocfs2_recovery_init(osb); @@ -2125,15 +2194,6 @@ static int ocfs2_initialize_super(struct super_block *sb, goto bail; } - osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots); - if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) { - mlog(ML_ERROR, "Invalid number of node slots (%u)\n", - osb->max_slots); - status = -EINVAL; - goto bail; - } - mlog(0, "max_slots for this device: %u\n", osb->max_slots); - osb->slot_recovery_generations = kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations), GFP_KERNEL); @@ -2243,11 +2303,15 @@ static int ocfs2_initialize_super(struct super_block *sb, goto bail; } - if (ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(di->i_clusters) - 1) - > (u32)~0UL) { - mlog(ML_ERROR, "Volume might try to write to blocks beyond " - "what jbd can address in 32 bits.\n"); - status = -EINVAL; + total_blocks = ocfs2_clusters_to_blocks(osb->sb, + le32_to_cpu(di->i_clusters)); + + status = generic_check_addressable(osb->sb->s_blocksize_bits, + total_blocks); + if (status) { + mlog(ML_ERROR, "Volume too large " + "to mount safely on this system"); + status = -EFBIG; goto bail; } @@ -2409,6 +2473,12 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) goto finally; } + /* Now that journal has been initialized, check to make sure + entire volume is addressable. */ + status = ocfs2_journal_addressable(osb); + if (status) + goto finally; + /* If the journal was unmounted cleanly then we don't want to * recover anything. Otherwise, journal_load will do that * dirty work for us :) */ diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c index bfe7190..902efb2 100644 --- a/fs/ocfs2/sysfile.c +++ b/fs/ocfs2/sysfile.c @@ -44,11 +44,6 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb, int type, u32 slot); -static inline int is_global_system_inode(int type); -static inline int is_in_system_inode_array(struct ocfs2_super *osb, - int type, - u32 slot); - #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key ocfs2_sysfile_cluster_lock_key[NUM_SYSTEM_INODES]; #endif @@ -59,11 +54,52 @@ static inline int is_global_system_inode(int type) type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; } -static inline int is_in_system_inode_array(struct ocfs2_super *osb, - int type, - u32 slot) +static struct inode **get_local_system_inode(struct ocfs2_super *osb, + int type, + u32 slot) { - return slot == osb->slot_num || is_global_system_inode(type); + int index; + struct inode **local_system_inodes, **free = NULL; + + BUG_ON(slot == OCFS2_INVALID_SLOT); + BUG_ON(type < OCFS2_FIRST_LOCAL_SYSTEM_INODE || + type > OCFS2_LAST_LOCAL_SYSTEM_INODE); + + spin_lock(&osb->osb_lock); + local_system_inodes = osb->local_system_inodes; + spin_unlock(&osb->osb_lock); + + if (unlikely(!local_system_inodes)) { + local_system_inodes = kzalloc(sizeof(struct inode *) * + NUM_LOCAL_SYSTEM_INODES * + osb->max_slots, + GFP_NOFS); + if (!local_system_inodes) { + mlog_errno(-ENOMEM); + /* + * return NULL here so that ocfs2_get_sytem_file_inodes + * will try to create an inode and use it. We will try + * to initialize local_system_inodes next time. + */ + return NULL; + } + + spin_lock(&osb->osb_lock); + if (osb->local_system_inodes) { + /* Someone has initialized it for us. */ + free = local_system_inodes; + local_system_inodes = osb->local_system_inodes; + } else + osb->local_system_inodes = local_system_inodes; + spin_unlock(&osb->osb_lock); + if (unlikely(free)) + kfree(free); + } + + index = (slot * NUM_LOCAL_SYSTEM_INODES) + + (type - OCFS2_FIRST_LOCAL_SYSTEM_INODE); + + return &local_system_inodes[index]; } struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, @@ -74,8 +110,10 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, struct inode **arr = NULL; /* avoid the lookup if cached in local system file array */ - if (is_in_system_inode_array(osb, type, slot)) - arr = &(osb->system_inodes[type]); + if (is_global_system_inode(type)) { + arr = &(osb->global_system_inodes[type]); + } else + arr = get_local_system_inode(osb, type, slot); if (arr && ((inode = *arr) != NULL)) { /* get a ref in addition to the array ref */ |