diff options
Diffstat (limited to 'fs')
80 files changed, 1363 insertions, 449 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 510040b..b1dc518 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -540,8 +540,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, unlock_new_inode(inode); return inode; error: - unlock_new_inode(inode); - iput(inode); + iget_failed(inode); return ERR_PTR(retval); } diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 09e44337..e8aa57d 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -149,8 +149,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, unlock_new_inode(inode); return inode; error: - unlock_new_inode(inode); - iput(inode); + iget_failed(inode); return ERR_PTR(retval); } diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 0ef5cc1..81220b2 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -44,6 +44,8 @@ #define BTRFS_INODE_IN_DELALLOC_LIST 9 #define BTRFS_INODE_READDIO_NEED_LOCK 10 #define BTRFS_INODE_HAS_PROPS 11 +/* DIO is ready to submit */ +#define BTRFS_INODE_DIO_READY 12 /* * The following 3 bits are meant only for the btree inode. * When any of them is set, it means an error happened while writing an diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 80a9aef..aac314e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1778,6 +1778,7 @@ struct btrfs_fs_info { spinlock_t unused_bgs_lock; struct list_head unused_bgs; struct mutex unused_bg_unpin_mutex; + struct mutex delete_unused_bgs_mutex; /* For btrfs to record security options */ struct security_mnt_opts security_opts; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 862fbc2..564a7de 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -378,7 +378,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, ret = btrfs_kobj_add_device(tgt_device->fs_devices, tgt_device); if (ret) - btrfs_error(root->fs_info, ret, "kobj add dev failed"); + btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret); printk_in_rcu(KERN_INFO "BTRFS: dev_replace from %s (devid %llu) to %s started\n", diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3f43bfea..f556c37 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1751,6 +1751,7 @@ static int cleaner_kthread(void *arg) { struct btrfs_root *root = arg; int again; + struct btrfs_trans_handle *trans; do { again = 0; @@ -1772,7 +1773,6 @@ static int cleaner_kthread(void *arg) } btrfs_run_delayed_iputs(root); - btrfs_delete_unused_bgs(root->fs_info); again = btrfs_clean_one_deleted_snapshot(root); mutex_unlock(&root->fs_info->cleaner_mutex); @@ -1781,6 +1781,16 @@ static int cleaner_kthread(void *arg) * needn't do anything special here. */ btrfs_run_defrag_inodes(root->fs_info); + + /* + * Acquires fs_info->delete_unused_bgs_mutex to avoid racing + * with relocation (btrfs_relocate_chunk) and relocation + * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group) + * after acquiring fs_info->delete_unused_bgs_mutex. So we + * can't hold, nor need to, fs_info->cleaner_mutex when deleting + * unused block groups. + */ + btrfs_delete_unused_bgs(root->fs_info); sleep: if (!try_to_freeze() && !again) { set_current_state(TASK_INTERRUPTIBLE); @@ -1789,6 +1799,34 @@ sleep: __set_current_state(TASK_RUNNING); } } while (!kthread_should_stop()); + + /* + * Transaction kthread is stopped before us and wakes us up. + * However we might have started a new transaction and COWed some + * tree blocks when deleting unused block groups for example. So + * make sure we commit the transaction we started to have a clean + * shutdown when evicting the btree inode - if it has dirty pages + * when we do the final iput() on it, eviction will trigger a + * writeback for it which will fail with null pointer dereferences + * since work queues and other resources were already released and + * destroyed by the time the iput/eviction/writeback is made. + */ + trans = btrfs_attach_transaction(root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) != -ENOENT) + btrfs_err(root->fs_info, + "cleaner transaction attach returned %ld", + PTR_ERR(trans)); + } else { + int ret; + + ret = btrfs_commit_transaction(trans, root); + if (ret) + btrfs_err(root->fs_info, + "cleaner open transaction commit returned %d", + ret); + } + return 0; } @@ -2492,6 +2530,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->unused_bgs_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->unused_bg_unpin_mutex); + mutex_init(&fs_info->delete_unused_bgs_mutex); mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); seqlock_init(&fs_info->profiles_lock); @@ -2803,6 +2842,7 @@ int open_ctree(struct super_block *sb, !extent_buffer_uptodate(chunk_root->node)) { printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n", sb->s_id); + chunk_root->node = NULL; goto fail_tree_roots; } btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); @@ -2840,7 +2880,7 @@ retry_root_backup: !extent_buffer_uptodate(tree_root->node)) { printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n", sb->s_id); - + tree_root->node = NULL; goto recovery_tree_root; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 38b76cc..07204bf 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2296,9 +2296,22 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, static inline struct btrfs_delayed_ref_node * select_delayed_ref(struct btrfs_delayed_ref_head *head) { + struct btrfs_delayed_ref_node *ref; + if (list_empty(&head->ref_list)) return NULL; + /* + * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. + * This is to prevent a ref count from going down to zero, which deletes + * the extent item from the extent tree, when there still are references + * to add, which would fail because they would not find the extent item. + */ + list_for_each_entry(ref, &head->ref_list, list) { + if (ref->action == BTRFS_ADD_DELAYED_REF) + return ref; + } + return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node, list); } @@ -4214,6 +4227,24 @@ out: space_info->chunk_alloc = 0; spin_unlock(&space_info->lock); mutex_unlock(&fs_info->chunk_mutex); + /* + * When we allocate a new chunk we reserve space in the chunk block + * reserve to make sure we can COW nodes/leafs in the chunk tree or + * add new nodes/leafs to it if we end up needing to do it when + * inserting the chunk item and updating device items as part of the + * second phase of chunk allocation, performed by + * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a + * large number of new block groups to create in our transaction + * handle's new_bgs list to avoid exhausting the chunk block reserve + * in extreme cases - like having a single transaction create many new + * block groups when starting to write out the free space caches of all + * the block groups that were made dirty during the lifetime of the + * transaction. + */ + if (trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) { + btrfs_create_pending_block_groups(trans, trans->root); + btrfs_trans_release_chunk_metadata(trans); + } return ret; } @@ -9889,6 +9920,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) } spin_unlock(&fs_info->unused_bgs_lock); + mutex_lock(&root->fs_info->delete_unused_bgs_mutex); + /* Don't want to race with allocators so take the groups_sem */ down_write(&space_info->groups_sem); spin_lock(&block_group->lock); @@ -9983,6 +10016,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) end_trans: btrfs_end_transaction(trans, root); next: + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); btrfs_put_block_group(block_group); spin_lock(&fs_info->unused_bgs_lock); } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index f6a596d..d4a582a 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -246,6 +246,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root) { struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; struct rb_root *rbroot = &root->free_ino_pinned->free_space_offset; + spinlock_t *rbroot_lock = &root->free_ino_pinned->tree_lock; struct btrfs_free_space *info; struct rb_node *n; u64 count; @@ -254,24 +255,30 @@ void btrfs_unpin_free_ino(struct btrfs_root *root) return; while (1) { + bool add_to_ctl = true; + + spin_lock(rbroot_lock); n = rb_first(rbroot); - if (!n) + if (!n) { + spin_unlock(rbroot_lock); break; + } info = rb_entry(n, struct btrfs_free_space, offset_index); BUG_ON(info->bitmap); /* Logic error */ if (info->offset > root->ino_cache_progress) - goto free; + add_to_ctl = false; else if (info->offset + info->bytes > root->ino_cache_progress) count = root->ino_cache_progress - info->offset + 1; else count = info->bytes; - __btrfs_add_free_space(ctl, info->offset, count); -free: rb_erase(&info->offset_index, rbroot); - kfree(info); + spin_unlock(rbroot_lock); + if (add_to_ctl) + __btrfs_add_free_space(ctl, info->offset, count); + kmem_cache_free(btrfs_free_space_cachep, info); } } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 855935f..e33dff3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4209,7 +4209,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u64 extent_num_bytes = 0; u64 extent_offset = 0; u64 item_end = 0; - u64 last_size = (u64)-1; + u64 last_size = new_size; u32 found_type = (u8)-1; int found_extent; int del_item; @@ -4493,8 +4493,7 @@ out: btrfs_abort_transaction(trans, root, ret); } error: - if (last_size != (u64)-1 && - root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) btrfs_ordered_update_i_size(inode, last_size, NULL); btrfs_free_path(path); @@ -4989,8 +4988,9 @@ static void evict_inode_truncate_pages(struct inode *inode) /* * Keep looping until we have no more ranges in the io tree. * We can have ongoing bios started by readpages (called from readahead) - * that didn't get their end io callbacks called yet or they are still - * in progress ((extent_io.c:end_bio_extent_readpage()). This means some + * that have their endio callback (extent_io.c:end_bio_extent_readpage) + * still in progress (unlocked the pages in the bio but did not yet + * unlocked the ranges in the io tree). Therefore this means some * ranges can still be locked and eviction started because before * submitting those bios, which are executed by a separate task (work * queue kthread), inode references (inode->i_count) were not taken @@ -7546,6 +7546,7 @@ unlock: current->journal_info = outstanding_extents; btrfs_free_reserved_data_space(inode, len); + set_bit(BTRFS_INODE_DIO_READY, &BTRFS_I(inode)->runtime_flags); } /* @@ -7871,8 +7872,6 @@ static void btrfs_endio_direct_write(struct bio *bio, int err) struct bio *dio_bio; int ret; - if (err) - goto out_done; again: ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, &ordered_offset, @@ -7895,7 +7894,6 @@ out_test: ordered = NULL; goto again; } -out_done: dio_bio = dip->dio_bio; kfree(dip); @@ -8163,9 +8161,8 @@ out_err: static void btrfs_submit_direct(int rw, struct bio *dio_bio, struct inode *inode, loff_t file_offset) { - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_dio_private *dip; - struct bio *io_bio; + struct btrfs_dio_private *dip = NULL; + struct bio *io_bio = NULL; struct btrfs_io_bio *btrfs_bio; int skip_sum; int write = rw & REQ_WRITE; @@ -8182,7 +8179,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, dip = kzalloc(sizeof(*dip), GFP_NOFS); if (!dip) { ret = -ENOMEM; - goto free_io_bio; + goto free_ordered; } dip->private = dio_bio->bi_private; @@ -8210,25 +8207,55 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, if (btrfs_bio->end_io) btrfs_bio->end_io(btrfs_bio, ret); -free_io_bio: - bio_put(io_bio); free_ordered: /* - * If this is a write, we need to clean up the reserved space and kill - * the ordered extent. + * If we arrived here it means either we failed to submit the dip + * or we either failed to clone the dio_bio or failed to allocate the + * dip. If we cloned the dio_bio and allocated the dip, we can just + * call bio_endio against our io_bio so that we get proper resource + * cleanup if we fail to submit the dip, otherwise, we must do the + * same as btrfs_endio_direct_[write|read] because we can't call these + * callbacks - they require an allocated dip and a clone of dio_bio. */ - if (write) { - struct btrfs_ordered_extent *ordered; - ordered = btrfs_lookup_ordered_extent(inode, file_offset); - if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && - !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) - btrfs_free_reserved_extent(root, ordered->start, - ordered->disk_len, 1); - btrfs_put_ordered_extent(ordered); - btrfs_put_ordered_extent(ordered); + if (io_bio && dip) { + bio_endio(io_bio, ret); + /* + * The end io callbacks free our dip, do the final put on io_bio + * and all the cleanup and final put for dio_bio (through + * dio_end_io()). + */ + dip = NULL; + io_bio = NULL; + } else { + if (write) { + struct btrfs_ordered_extent *ordered; + + ordered = btrfs_lookup_ordered_extent(inode, + file_offset); + set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); + /* + * Decrements our ref on the ordered extent and removes + * the ordered extent from the inode's ordered tree, + * doing all the proper resource cleanup such as for the + * reserved space and waking up any waiters for this + * ordered extent (through btrfs_remove_ordered_extent). + */ + btrfs_finish_ordered_io(ordered); + } else { + unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, + file_offset + dio_bio->bi_iter.bi_size - 1); + } + clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); + /* + * Releases and cleans up our dio_bio, no need to bio_put() + * nor bio_endio()/bio_io_error() against dio_bio. + */ + dio_end_io(dio_bio, ret); } - bio_endio(dio_bio, ret); + if (io_bio) + bio_put(io_bio); + kfree(dip); } static ssize_t check_direct_IO(struct btrfs_root *root, struct kiocb *iocb, @@ -8330,9 +8357,18 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, btrfs_submit_direct, flags); if (iov_iter_rw(iter) == WRITE) { current->journal_info = NULL; - if (ret < 0 && ret != -EIOCBQUEUED) - btrfs_delalloc_release_space(inode, count); - else if (ret >= 0 && (size_t)ret < count) + if (ret < 0 && ret != -EIOCBQUEUED) { + /* + * If the error comes from submitting stage, + * btrfs_get_blocsk_direct() has free'd data space, + * and metadata space will be handled by + * finish_ordered_fn, don't do that again to make + * sure bytes_may_use is correct. + */ + if (!test_and_clear_bit(BTRFS_INODE_DIO_READY, + &BTRFS_I(inode)->runtime_flags)) + btrfs_delalloc_release_space(inode, count); + } else if (ret >= 0 && (size_t)ret < count) btrfs_delalloc_release_space(inode, count - (size_t)ret); } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c86b835..0770c91 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -87,7 +87,8 @@ struct btrfs_ioctl_received_subvol_args_32 { static int btrfs_clone(struct inode *src, struct inode *inode, - u64 off, u64 olen, u64 olen_aligned, u64 destoff); + u64 off, u64 olen, u64 olen_aligned, u64 destoff, + int no_time_update); /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) @@ -2765,14 +2766,11 @@ out: return ret; } -static struct page *extent_same_get_page(struct inode *inode, u64 off) +static struct page *extent_same_get_page(struct inode *inode, pgoff_t index) { struct page *page; - pgoff_t index; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; - index = off >> PAGE_CACHE_SHIFT; - page = grab_cache_page(inode->i_mapping, index); if (!page) return NULL; @@ -2793,6 +2791,20 @@ static struct page *extent_same_get_page(struct inode *inode, u64 off) return page; } +static int gather_extent_pages(struct inode *inode, struct page **pages, + int num_pages, u64 off) +{ + int i; + pgoff_t index = off >> PAGE_CACHE_SHIFT; + + for (i = 0; i < num_pages; i++) { + pages[i] = extent_same_get_page(inode, index + i); + if (!pages[i]) + return -ENOMEM; + } + return 0; +} + static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) { /* do any pending delalloc/csum calc on src, one way or @@ -2818,52 +2830,120 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) } } -static void btrfs_double_unlock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len) +static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2) { - unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); - unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); - mutex_unlock(&inode1->i_mutex); mutex_unlock(&inode2->i_mutex); } -static void btrfs_double_lock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len) +static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) +{ + if (inode1 < inode2) + swap(inode1, inode2); + + mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); + if (inode1 != inode2) + mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); +} + +static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len) +{ + unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); + unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); +} + +static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len) { if (inode1 < inode2) { swap(inode1, inode2); swap(loff1, loff2); } - - mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); lock_extent_range(inode1, loff1, len); - if (inode1 != inode2) { - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); + if (inode1 != inode2) lock_extent_range(inode2, loff2, len); +} + +struct cmp_pages { + int num_pages; + struct page **src_pages; + struct page **dst_pages; +}; + +static void btrfs_cmp_data_free(struct cmp_pages *cmp) +{ + int i; + struct page *pg; + + for (i = 0; i < cmp->num_pages; i++) { + pg = cmp->src_pages[i]; + if (pg) + page_cache_release(pg); + pg = cmp->dst_pages[i]; + if (pg) + page_cache_release(pg); + } + kfree(cmp->src_pages); + kfree(cmp->dst_pages); +} + +static int btrfs_cmp_data_prepare(struct inode *src, u64 loff, + struct inode *dst, u64 dst_loff, + u64 len, struct cmp_pages *cmp) +{ + int ret; + int num_pages = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT; + struct page **src_pgarr, **dst_pgarr; + + /* + * We must gather up all the pages before we initiate our + * extent locking. We use an array for the page pointers. Size + * of the array is bounded by len, which is in turn bounded by + * BTRFS_MAX_DEDUPE_LEN. + */ + src_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS); + dst_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS); + if (!src_pgarr || !dst_pgarr) { + kfree(src_pgarr); + kfree(dst_pgarr); + return -ENOMEM; } + cmp->num_pages = num_pages; + cmp->src_pages = src_pgarr; + cmp->dst_pages = dst_pgarr; + + ret = gather_extent_pages(src, cmp->src_pages, cmp->num_pages, loff); + if (ret) + goto out; + + ret = gather_extent_pages(dst, cmp->dst_pages, cmp->num_pages, dst_loff); + +out: + if (ret) + btrfs_cmp_data_free(cmp); + return 0; } static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, - u64 dst_loff, u64 len) + u64 dst_loff, u64 len, struct cmp_pages *cmp) { int ret = 0; + int i; struct page *src_page, *dst_page; unsigned int cmp_len = PAGE_CACHE_SIZE; void *addr, *dst_addr; + i = 0; while (len) { if (len < PAGE_CACHE_SIZE) cmp_len = len; - src_page = extent_same_get_page(src, loff); - if (!src_page) - return -EINVAL; - dst_page = extent_same_get_page(dst, dst_loff); - if (!dst_page) { - page_cache_release(src_page); - return -EINVAL; - } + BUG_ON(i >= cmp->num_pages); + + src_page = cmp->src_pages[i]; + dst_page = cmp->dst_pages[i]; + addr = kmap_atomic(src_page); dst_addr = kmap_atomic(dst_page); @@ -2875,15 +2955,12 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, kunmap_atomic(addr); kunmap_atomic(dst_addr); - page_cache_release(src_page); - page_cache_release(dst_page); if (ret) break; - loff += cmp_len; - dst_loff += cmp_len; len -= cmp_len; + i++; } return ret; @@ -2914,27 +2991,62 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, { int ret; u64 len = olen; + struct cmp_pages cmp; + int same_inode = 0; + u64 same_lock_start = 0; + u64 same_lock_len = 0; - /* - * btrfs_clone() can't handle extents in the same file - * yet. Once that works, we can drop this check and replace it - * with a check for the same inode, but overlapping extents. - */ if (src == dst) - return -EINVAL; + same_inode = 1; if (len == 0) return 0; - btrfs_double_lock(src, loff, dst, dst_loff, len); + if (same_inode) { + mutex_lock(&src->i_mutex); - ret = extent_same_check_offsets(src, loff, &len, olen); - if (ret) - goto out_unlock; + ret = extent_same_check_offsets(src, loff, &len, olen); + if (ret) + goto out_unlock; - ret = extent_same_check_offsets(dst, dst_loff, &len, olen); - if (ret) - goto out_unlock; + /* + * Single inode case wants the same checks, except we + * don't want our length pushed out past i_size as + * comparing that data range makes no sense. + * + * extent_same_check_offsets() will do this for an + * unaligned length at i_size, so catch it here and + * reject the request. + * + * This effectively means we require aligned extents + * for the single-inode case, whereas the other cases + * allow an unaligned length so long as it ends at + * i_size. + */ + if (len != olen) { + ret = -EINVAL; + goto out_unlock; + } + + /* Check for overlapping ranges */ + if (dst_loff + len > loff && dst_loff < loff + len) { + ret = -EINVAL; + goto out_unlock; + } + + same_lock_start = min_t(u64, loff, dst_loff); + same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start; + } else { + btrfs_double_inode_lock(src, dst); + + ret = extent_same_check_offsets(src, loff, &len, olen); + if (ret) + goto out_unlock; + + ret = extent_same_check_offsets(dst, dst_loff, &len, olen); + if (ret) + goto out_unlock; + } /* don't make the dst file partly checksummed */ if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != @@ -2943,12 +3055,32 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, goto out_unlock; } - ret = btrfs_cmp_data(src, loff, dst, dst_loff, len); + ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, &cmp); + if (ret) + goto out_unlock; + + if (same_inode) + lock_extent_range(src, same_lock_start, same_lock_len); + else + btrfs_double_extent_lock(src, loff, dst, dst_loff, len); + + /* pass original length for comparison so we stay within i_size */ + ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp); if (ret == 0) - ret = btrfs_clone(src, dst, loff, olen, len, dst_loff); + ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1); + + if (same_inode) + unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start, + same_lock_start + same_lock_len - 1); + else + btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); + btrfs_cmp_data_free(&cmp); out_unlock: - btrfs_double_unlock(src, loff, dst, dst_loff, len); + if (same_inode) + mutex_unlock(&src->i_mutex); + else + btrfs_double_inode_unlock(src, dst); return ret; } @@ -2958,7 +3090,7 @@ out_unlock: static long btrfs_ioctl_file_extent_same(struct file *file, struct btrfs_ioctl_same_args __user *argp) { - struct btrfs_ioctl_same_args *same; + struct btrfs_ioctl_same_args *same = NULL; struct btrfs_ioctl_same_extent_info *info; struct inode *src = file_inode(file); u64 off; @@ -2988,6 +3120,7 @@ static long btrfs_ioctl_file_extent_same(struct file *file, if (IS_ERR(same)) { ret = PTR_ERR(same); + same = NULL; goto out; } @@ -3058,6 +3191,7 @@ static long btrfs_ioctl_file_extent_same(struct file *file, out: mnt_drop_write_file(file); + kfree(same); return ret; } @@ -3100,13 +3234,15 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, struct inode *inode, u64 endoff, const u64 destoff, - const u64 olen) + const u64 olen, + int no_time_update) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret; inode_inc_iversion(inode); - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + if (!no_time_update) + inode->i_mtime = inode->i_ctime = CURRENT_TIME; /* * We round up to the block size at eof when determining which * extents to clone above, but shouldn't round up the file size. @@ -3191,13 +3327,13 @@ static void clone_update_extent_map(struct inode *inode, * @inode: Inode to clone to * @off: Offset within source to start clone from * @olen: Original length, passed by user, of range to clone - * @olen_aligned: Block-aligned value of olen, extent_same uses - * identical values here + * @olen_aligned: Block-aligned value of olen * @destoff: Offset within @inode to start clone + * @no_time_update: Whether to update mtime/ctime on the target inode */ static int btrfs_clone(struct inode *src, struct inode *inode, const u64 off, const u64 olen, const u64 olen_aligned, - const u64 destoff) + const u64 destoff, int no_time_update) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path = NULL; @@ -3452,6 +3588,20 @@ process_slot: u64 trim = 0; u64 aligned_end = 0; + /* + * Don't copy an inline extent into an offset + * greater than zero. Having an inline extent + * at such an offset results in chaos as btrfs + * isn't prepared for such cases. Just skip + * this case for the same reasons as commented + * at btrfs_ioctl_clone(). + */ + if (last_dest_end > 0) { + ret = -EOPNOTSUPP; + btrfs_end_transaction(trans, root); + goto out; + } + if (off > key.offset) { skip = off - key.offset; new_key.offset += skip; @@ -3521,7 +3671,8 @@ process_slot: root->sectorsize); ret = clone_finish_inode_update(trans, inode, last_dest_end, - destoff, olen); + destoff, olen, + no_time_update); if (ret) goto out; if (new_key.offset + datal >= destoff + len) @@ -3559,7 +3710,7 @@ process_slot: clone_update_extent_map(inode, trans, NULL, last_dest_end, destoff + len - last_dest_end); ret = clone_finish_inode_update(trans, inode, destoff + len, - destoff, olen); + destoff, olen, no_time_update); } out: @@ -3696,7 +3847,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, lock_extent_range(inode, destoff, len); } - ret = btrfs_clone(src, inode, off, olen, len, destoff); + ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); if (same_inode) { u64 lock_start = min_t(u64, off, destoff); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 89656d7..52170cf 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -552,6 +552,10 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) trace_btrfs_ordered_extent_put(entry->inode, entry); if (atomic_dec_and_test(&entry->refs)) { + ASSERT(list_empty(&entry->log_list)); + ASSERT(list_empty(&entry->trans_list)); + ASSERT(list_empty(&entry->root_extent_list)); + ASSERT(RB_EMPTY_NODE(&entry->rb_node)); if (entry->inode) btrfs_add_delayed_iput(entry->inode); while (!list_empty(&entry->list)) { @@ -579,6 +583,7 @@ void btrfs_remove_ordered_extent(struct inode *inode, spin_lock_irq(&tree->lock); node = &entry->rb_node; rb_erase(node, &tree->tree); + RB_CLEAR_NODE(node); if (tree->last == node) tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index d5f1f03..8a82029 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1349,6 +1349,11 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; int ret = 0; + /* Sometimes we would want to clear the limit on this qgroup. + * To meet this requirement, we treat the -1 as a special value + * which tell kernel to clear the limit on this qgroup. + */ + const u64 CLEAR_VALUE = -1; mutex_lock(&fs_info->qgroup_ioctl_lock); quota_root = fs_info->quota_root; @@ -1364,14 +1369,42 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, } spin_lock(&fs_info->qgroup_lock); - if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) - qgroup->max_rfer = limit->max_rfer; - if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) - qgroup->max_excl = limit->max_excl; - if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) - qgroup->rsv_rfer = limit->rsv_rfer; - if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) - qgroup->rsv_excl = limit->rsv_excl; + if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) { + if (limit->max_rfer == CLEAR_VALUE) { + qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER; + limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER; + qgroup->max_rfer = 0; + } else { + qgroup->max_rfer = limit->max_rfer; + } + } + if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) { + if (limit->max_excl == CLEAR_VALUE) { + qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL; + limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL; + qgroup->max_excl = 0; + } else { + qgroup->max_excl = limit->max_excl; + } + } + if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) { + if (limit->rsv_rfer == CLEAR_VALUE) { + qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER; + limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER; + qgroup->rsv_rfer = 0; + } else { + qgroup->rsv_rfer = limit->rsv_rfer; + } + } + if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) { + if (limit->rsv_excl == CLEAR_VALUE) { + qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL; + limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL; + qgroup->rsv_excl = 0; + } else { + qgroup->rsv_excl = limit->rsv_excl; + } + } qgroup->lim_flags |= limit->flags; spin_unlock(&fs_info->qgroup_lock); @@ -1618,6 +1651,11 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info, /* Exclusive -> exclusive, nothing changed */ } } + + /* For exclusive extent, free its reserved bytes too */ + if (nr_old_roots == 0 && nr_new_roots == 1 && + cur_new_count == nr_new_roots) + qg->reserved -= num_bytes; if (dirty) qgroup_dirty(fs_info, qg); } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 827951f..88cbb59 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4049,7 +4049,7 @@ restart: if (trans && progress && err == -ENOSPC) { ret = btrfs_force_chunk_alloc(trans, rc->extent_root, rc->block_group->flags); - if (ret == 0) { + if (ret == 1) { err = 0; progress = 0; goto restart; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 9f2feab..94db0fa 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3571,7 +3571,6 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, int is_dev_replace) { - int ret = 0; unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; int max_active = fs_info->thread_pool_size; @@ -3584,34 +3583,36 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, fs_info->scrub_workers = btrfs_alloc_workqueue("btrfs-scrub", flags, max_active, 4); - if (!fs_info->scrub_workers) { - ret = -ENOMEM; - goto out; - } + if (!fs_info->scrub_workers) + goto fail_scrub_workers; + fs_info->scrub_wr_completion_workers = btrfs_alloc_workqueue("btrfs-scrubwrc", flags, max_active, 2); - if (!fs_info->scrub_wr_completion_workers) { - ret = -ENOMEM; - goto out; - } + if (!fs_info->scrub_wr_completion_workers) + goto fail_scrub_wr_completion_workers; + fs_info->scrub_nocow_workers = btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0); - if (!fs_info->scrub_nocow_workers) { - ret = -ENOMEM; - goto out; - } + if (!fs_info->scrub_nocow_workers) + goto fail_scrub_nocow_workers; fs_info->scrub_parity_workers = btrfs_alloc_workqueue("btrfs-scrubparity", flags, max_active, 2); - if (!fs_info->scrub_parity_workers) { - ret = -ENOMEM; - goto out; - } + if (!fs_info->scrub_parity_workers) + goto fail_scrub_parity_workers; } ++fs_info->scrub_workers_refcnt; -out: - return ret; + return 0; + +fail_scrub_parity_workers: + btrfs_destroy_workqueue(fs_info->scrub_nocow_workers); +fail_scrub_nocow_workers: + btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); +fail_scrub_wr_completion_workers: + btrfs_destroy_workqueue(fs_info->scrub_workers); +fail_scrub_workers: + return -ENOMEM; } static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c0f18e7..f5021fc 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -761,7 +761,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (!list_empty(&trans->ordered)) { spin_lock(&info->trans_lock); - list_splice(&trans->ordered, &cur_trans->pending_ordered); + list_splice_init(&trans->ordered, &cur_trans->pending_ordered); spin_unlock(&info->trans_lock); } @@ -1866,7 +1866,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, } spin_lock(&root->fs_info->trans_lock); - list_splice(&trans->ordered, &cur_trans->pending_ordered); + list_splice_init(&trans->ordered, &cur_trans->pending_ordered); if (cur_trans->state >= TRANS_STATE_COMMIT_START) { spin_unlock(&root->fs_info->trans_lock); atomic_inc(&cur_trans->use_count); @@ -2152,7 +2152,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_trans_handle_cachep, trans); - if (current != root->fs_info->transaction_kthread) + if (current != root->fs_info->transaction_kthread && + current != root->fs_info->cleaner_kthread) btrfs_run_delayed_iputs(root); return ret; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 1ce80c1..9c45431 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4117,6 +4117,187 @@ static int logged_inode_size(struct btrfs_root *log, struct inode *inode, return 0; } +/* + * At the moment we always log all xattrs. This is to figure out at log replay + * time which xattrs must have their deletion replayed. If a xattr is missing + * in the log tree and exists in the fs/subvol tree, we delete it. This is + * because if a xattr is deleted, the inode is fsynced and a power failure + * happens, causing the log to be replayed the next time the fs is mounted, + * we want the xattr to not exist anymore (same behaviour as other filesystems + * with a journal, ext3/4, xfs, f2fs, etc). + */ +static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path) +{ + int ret; + struct btrfs_key key; + const u64 ino = btrfs_ino(inode); + int ins_nr = 0; + int start_slot = 0; + + key.objectid = ino; + key.type = BTRFS_XATTR_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + + while (true) { + int slot = path->slots[0]; + struct extent_buffer *leaf = path->nodes[0]; + int nritems = btrfs_header_nritems(leaf); + + if (slot >= nritems) { + if (ins_nr > 0) { + u64 last_extent = 0; + + ret = copy_items(trans, inode, dst_path, path, + &last_extent, start_slot, + ins_nr, 1, 0); + /* can't be 1, extent items aren't processed */ + ASSERT(ret <= 0); + if (ret < 0) + return ret; + ins_nr = 0; + } + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) + break; + + if (ins_nr == 0) + start_slot = slot; + ins_nr++; + path->slots[0]++; + cond_resched(); + } + if (ins_nr > 0) { + u64 last_extent = 0; + + ret = copy_items(trans, inode, dst_path, path, + &last_extent, start_slot, + ins_nr, 1, 0); + /* can't be 1, extent items aren't processed */ + ASSERT(ret <= 0); + if (ret < 0) + return ret; + } + + return 0; +} + +/* + * If the no holes feature is enabled we need to make sure any hole between the + * last extent and the i_size of our inode is explicitly marked in the log. This + * is to make sure that doing something like: + * + * 1) create file with 128Kb of data + * 2) truncate file to 64Kb + * 3) truncate file to 256Kb + * 4) fsync file + * 5) <crash/power failure> + * 6) mount fs and trigger log replay + * + * Will give us a file with a size of 256Kb, the first 64Kb of data match what + * the file had in its first 64Kb of data at step 1 and the last 192Kb of the + * file correspond to a hole. The presence of explicit holes in a log tree is + * what guarantees that log replay will remove/adjust file extent items in the + * fs/subvol tree. + * + * Here we do not need to care about holes between extents, that is already done + * by copy_items(). We also only need to do this in the full sync path, where we + * lookup for extents from the fs/subvol tree only. In the fast path case, we + * lookup the list of modified extent maps and if any represents a hole, we + * insert a corresponding extent representing a hole in the log tree. + */ +static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + struct btrfs_path *path) +{ + int ret; + struct btrfs_key key; + u64 hole_start; + u64 hole_size; + struct extent_buffer *leaf; + struct btrfs_root *log = root->log_root; + const u64 ino = btrfs_ino(inode); + const u64 i_size = i_size_read(inode); + + if (!btrfs_fs_incompat(root->fs_info, NO_HOLES)) + return 0; + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + ASSERT(ret != 0); + if (ret < 0) + return ret; + + ASSERT(path->slots[0] > 0); + path->slots[0]--; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { + /* inode does not have any extents */ + hole_start = 0; + hole_size = i_size; + } else { + struct btrfs_file_extent_item *extent; + u64 len; + + /* + * If there's an extent beyond i_size, an explicit hole was + * already inserted by copy_items(). + */ + if (key.offset >= i_size) + return 0; + + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, extent) == + BTRFS_FILE_EXTENT_INLINE) { + len = btrfs_file_extent_inline_len(leaf, + path->slots[0], + extent); + ASSERT(len == i_size); + return 0; + } + + len = btrfs_file_extent_num_bytes(leaf, extent); + /* Last extent goes beyond i_size, no need to log a hole. */ + if (key.offset + len > i_size) + return 0; + hole_start = key.offset + len; + hole_size = i_size - hole_start; + } + btrfs_release_path(path); + + /* Last extent ends at i_size. */ + if (hole_size == 0) + return 0; + + hole_size = ALIGN(hole_size, root->sectorsize); + ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0, + hole_size, 0, hole_size, 0, 0, 0); + return ret; +} + /* log a single inode in the tree log. * At least one parent directory for this inode must exist in the tree * or be logged already. @@ -4155,6 +4336,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, u64 ino = btrfs_ino(inode); struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; u64 logged_isize = 0; + bool need_log_inode_item = true; path = btrfs_alloc_path(); if (!path) @@ -4263,11 +4445,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } else { if (inode_only == LOG_INODE_ALL) fast_search = true; - ret = log_inode_item(trans, log, dst_path, inode); - if (ret) { - err = ret; - goto out_unlock; - } goto log_extents; } @@ -4290,6 +4467,28 @@ again: if (min_key.type > max_key.type) break; + if (min_key.type == BTRFS_INODE_ITEM_KEY) + need_log_inode_item = false; + + /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ + if (min_key.type == BTRFS_XATTR_ITEM_KEY) { + if (ins_nr == 0) + goto next_slot; + ret = copy_items(trans, inode, dst_path, path, + &last_extent, ins_start_slot, + ins_nr, inode_only, logged_isize); + if (ret < 0) { + err = ret; + goto out_unlock; + } + ins_nr = 0; + if (ret) { + btrfs_release_path(path); + continue; + } + goto next_slot; + } + src = path->nodes[0]; if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { ins_nr++; @@ -4357,9 +4556,26 @@ next_slot: ins_nr = 0; } + btrfs_release_path(path); + btrfs_release_path(dst_path); + err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); + if (err) + goto out_unlock; + if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { + btrfs_release_path(path); + btrfs_release_path(dst_path); + err = btrfs_log_trailing_hole(trans, root, inode, path); + if (err) + goto out_unlock; + } log_extents: btrfs_release_path(path); btrfs_release_path(dst_path); + if (need_log_inode_item) { + err = log_inode_item(trans, log, dst_path, inode); + if (err) + goto out_unlock; + } if (fast_search) { /* * Some ordered extents started by fsync might have completed diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4b438b4..fbe7c10 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2766,6 +2766,20 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, root = root->fs_info->chunk_root; extent_root = root->fs_info->extent_root; + /* + * Prevent races with automatic removal of unused block groups. + * After we relocate and before we remove the chunk with offset + * chunk_offset, automatic removal of the block group can kick in, + * resulting in a failure when calling btrfs_remove_chunk() below. + * + * Make sure to acquire this mutex before doing a tree search (dev + * or chunk trees) to find chunks. Otherwise the cleaner kthread might + * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after + * we release the path used to search the chunk/dev tree and before + * the current task acquires this mutex and calls us. + */ + ASSERT(mutex_is_locked(&root->fs_info->delete_unused_bgs_mutex)); + ret = btrfs_can_relocate(extent_root, chunk_offset); if (ret) return -ENOSPC; @@ -2814,13 +2828,18 @@ again: key.type = BTRFS_CHUNK_ITEM_KEY; while (1) { + mutex_lock(&root->fs_info->delete_unused_bgs_mutex); ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); - if (ret < 0) + if (ret < 0) { + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); goto error; + } BUG_ON(ret == 0); /* Corruption */ ret = btrfs_previous_item(chunk_root, path, key.objectid, key.type); + if (ret) + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); if (ret < 0) goto error; if (ret > 0) @@ -2843,6 +2862,7 @@ again: else BUG_ON(ret); } + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); if (found_key.offset == 0) break; @@ -3299,9 +3319,12 @@ again: goto error; } + mutex_lock(&fs_info->delete_unused_bgs_mutex); ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); - if (ret < 0) + if (ret < 0) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); goto error; + } /* * this shouldn't happen, it means the last relocate @@ -3313,6 +3336,7 @@ again: ret = btrfs_previous_item(chunk_root, path, 0, BTRFS_CHUNK_ITEM_KEY); if (ret) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); ret = 0; break; } @@ -3321,8 +3345,10 @@ again: slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &found_key, slot); - if (found_key.objectid != key.objectid) + if (found_key.objectid != key.objectid) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); break; + } chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); @@ -3335,10 +3361,13 @@ again: ret = should_balance_chunk(chunk_root, leaf, chunk, found_key.offset); btrfs_release_path(path); - if (!ret) + if (!ret) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); goto loop; + } if (counting) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); spin_lock(&fs_info->balance_lock); bctl->stat.expected++; spin_unlock(&fs_info->balance_lock); @@ -3348,6 +3377,7 @@ again: ret = btrfs_relocate_chunk(chunk_root, found_key.objectid, found_key.offset); + mutex_unlock(&fs_info->delete_unused_bgs_mutex); if (ret && ret != -ENOSPC) goto error; if (ret == -ENOSPC) { @@ -4087,11 +4117,16 @@ again: key.type = BTRFS_DEV_EXTENT_KEY; do { + mutex_lock(&root->fs_info->delete_unused_bgs_mutex); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) + if (ret < 0) { + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); goto done; + } ret = btrfs_previous_item(root, path, 0, key.type); + if (ret) + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); if (ret < 0) goto done; if (ret) { @@ -4105,6 +4140,7 @@ again: btrfs_item_key_to_cpu(l, &key, path->slots[0]); if (key.objectid != device->devid) { + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); btrfs_release_path(path); break; } @@ -4113,6 +4149,7 @@ again: length = btrfs_dev_extent_length(l, dev_extent); if (key.offset + length <= new_size) { + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); btrfs_release_path(path); break; } @@ -4122,6 +4159,7 @@ again: btrfs_release_path(path); ret = btrfs_relocate_chunk(root, chunk_objectid, chunk_offset); + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); if (ret && ret != -ENOSPC) goto done; if (ret == -ENOSPC) @@ -5715,7 +5753,6 @@ static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int e static void btrfs_end_bio(struct bio *bio, int err) { struct btrfs_bio *bbio = bio->bi_private; - struct btrfs_device *dev = bbio->stripes[0].dev; int is_orig_bio = 0; if (err) { @@ -5723,6 +5760,7 @@ static void btrfs_end_bio(struct bio *bio, int err) if (err == -EIO || err == -EREMOTEIO) { unsigned int stripe_index = btrfs_io_bio(bio)->stripe_index; + struct btrfs_device *dev; BUG_ON(stripe_index >= bbio->num_stripes); dev = bbio->stripes[stripe_index].dev; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index dc10c9d..ddd5e94 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1506,7 +1506,6 @@ static int __mark_caps_flushing(struct inode *inode, swap(cf, ci->i_prealloc_cap_flush); cf->caps = flushing; - cf->kick = false; spin_lock(&mdsc->cap_dirty_lock); list_del_init(&ci->i_dirty_item); @@ -2123,8 +2122,7 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc, static int __kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, - struct ceph_inode_info *ci, - bool kick_all) + struct ceph_inode_info *ci) { struct inode *inode = &ci->vfs_inode; struct ceph_cap *cap; @@ -2150,9 +2148,7 @@ static int __kick_flushing_caps(struct ceph_mds_client *mdsc, for (n = rb_first(&ci->i_cap_flush_tree); n; n = rb_next(n)) { cf = rb_entry(n, struct ceph_cap_flush, i_node); - if (cf->tid < first_tid) - continue; - if (kick_all || cf->kick) + if (cf->tid >= first_tid) break; } if (!n) { @@ -2161,7 +2157,6 @@ static int __kick_flushing_caps(struct ceph_mds_client *mdsc, } cf = rb_entry(n, struct ceph_cap_flush, i_node); - cf->kick = false; first_tid = cf->tid + 1; @@ -2181,8 +2176,6 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, { struct ceph_inode_info *ci; struct ceph_cap *cap; - struct ceph_cap_flush *cf; - struct rb_node *n; dout("early_kick_flushing_caps mds%d\n", session->s_mds); list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { @@ -2205,16 +2198,11 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, if ((cap->issued & ci->i_flushing_caps) != ci->i_flushing_caps) { spin_unlock(&ci->i_ceph_lock); - if (!__kick_flushing_caps(mdsc, session, ci, true)) + if (!__kick_flushing_caps(mdsc, session, ci)) continue; spin_lock(&ci->i_ceph_lock); } - for (n = rb_first(&ci->i_cap_flush_tree); n; n = rb_next(n)) { - cf = rb_entry(n, struct ceph_cap_flush, i_node); - cf->kick = true; - } - spin_unlock(&ci->i_ceph_lock); } } @@ -2228,7 +2216,7 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, dout("kick_flushing_caps mds%d\n", session->s_mds); list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { - int delayed = __kick_flushing_caps(mdsc, session, ci, false); + int delayed = __kick_flushing_caps(mdsc, session, ci); if (delayed) { spin_lock(&ci->i_ceph_lock); __cap_delay_requeue(mdsc, ci); @@ -2261,7 +2249,7 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, spin_unlock(&ci->i_ceph_lock); - delayed = __kick_flushing_caps(mdsc, session, ci, true); + delayed = __kick_flushing_caps(mdsc, session, ci); if (delayed) { spin_lock(&ci->i_ceph_lock); __cap_delay_requeue(mdsc, ci); diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 4347039..6706bde 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -287,7 +287,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode, return 0; spin_lock(&ctx->flc_lock); - list_for_each_entry(lock, &ctx->flc_flock, fl_list) { + list_for_each_entry(lock, &ctx->flc_posix, fl_list) { ++seen_fcntl; if (seen_fcntl > num_fcntl_locks) { err = -ENOSPC; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 860cc016..2f2460d 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -189,7 +189,6 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) struct ceph_cap_flush { u64 tid; int caps; - bool kick; struct rb_node g_node; // global union { struct rb_node i_node; // inode diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 6b8e2f0..48851f6 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -896,6 +896,7 @@ COMPATIBLE_IOCTL(FIGETBSZ) /* 'X' - originally XFS but some now in the VFS */ COMPATIBLE_IOCTL(FIFREEZE) COMPATIBLE_IOCTL(FITHAW) +COMPATIBLE_IOCTL(FITRIM) COMPATIBLE_IOCTL(KDGETKEYCODE) COMPATIBLE_IOCTL(KDSETKEYCODE) COMPATIBLE_IOCTL(KDGKBTYPE) diff --git a/fs/configfs/item.c b/fs/configfs/item.c index 4d6a30e..b863a09 100644 --- a/fs/configfs/item.c +++ b/fs/configfs/item.c @@ -115,7 +115,7 @@ void config_item_init_type_name(struct config_item *item, const char *name, struct config_item_type *type) { - config_item_set_name(item, name); + config_item_set_name(item, "%s", name); item->ci_type = type; config_item_init(item); } @@ -124,7 +124,7 @@ EXPORT_SYMBOL(config_item_init_type_name); void config_group_init_type_name(struct config_group *group, const char *name, struct config_item_type *type) { - config_item_set_name(&group->cg_item, name); + config_item_set_name(&group->cg_item, "%s", name); group->cg_item.ci_type = type; config_group_init(group); } @@ -319,6 +319,12 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault * @get_block: The filesystem method used to translate file offsets to blocks + * @complete_unwritten: The filesystem method used to convert unwritten blocks + * to written so the data written to them is exposed. This is required for + * required by write faults for filesystems that will return unwritten + * extent mappings from @get_block, but it is optional for reads as + * dax_insert_mapping() will always zero unwritten blocks. If the fs does + * not support unwritten extents, the it should pass NULL. * * When a page fault occurs, filesystems may call this helper in their * fault handler for DAX files. __dax_fault() assumes the caller has done all @@ -437,8 +443,12 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, * as for normal BH based IO completions. */ error = dax_insert_mapping(inode, &bh, vma, vmf); - if (buffer_unwritten(&bh)) - complete_unwritten(&bh, !error); + if (buffer_unwritten(&bh)) { + if (complete_unwritten) + complete_unwritten(&bh, !error); + else + WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE)); + } out: if (error == -ENOMEM) diff --git a/fs/dcache.c b/fs/dcache.c index 7a3f3e5..9b5fe50 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -642,7 +642,7 @@ static inline bool fast_dput(struct dentry *dentry) /* * If we have a d_op->d_delete() operation, we sould not - * let the dentry count go to zero, so use "put__or_lock". + * let the dentry count go to zero, so use "put_or_lock". */ if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) return lockref_put_or_lock(&dentry->d_lockref); @@ -697,7 +697,7 @@ static inline bool fast_dput(struct dentry *dentry) */ smp_rmb(); d_flags = ACCESS_ONCE(dentry->d_flags); - d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST; + d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_DISCONNECTED; /* Nothing to do? Dropping the reference was all we needed? */ if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry)) @@ -776,6 +776,9 @@ repeat: if (unlikely(d_unhashed(dentry))) goto kill_it; + if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) + goto kill_it; + if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) { if (dentry->d_op->d_delete(dentry)) goto kill_it; @@ -3439,22 +3442,15 @@ void __init vfs_caches_init_early(void) inode_init_early(); } -void __init vfs_caches_init(unsigned long mempages) +void __init vfs_caches_init(void) { - unsigned long reserve; - - /* Base hash sizes on available memory, with a reserve equal to - 150% of current kernel size */ - - reserve = min((mempages - nr_free_pages()) * 3/2, mempages - 1); - mempages -= reserve; - names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); dcache_init(); inode_init(); - files_init(mempages); + files_init(); + files_maxfiles_init(); mnt_init(); bdev_cache_init(); chrdev_init(); diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 284f9aa..6c55ade 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -435,8 +435,8 @@ struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode, } EXPORT_SYMBOL_GPL(debugfs_create_atomic_t); -static ssize_t read_file_bool(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) +ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) { char buf[3]; u32 *val = file->private_data; @@ -449,9 +449,10 @@ static ssize_t read_file_bool(struct file *file, char __user *user_buf, buf[2] = 0x00; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); } +EXPORT_SYMBOL_GPL(debugfs_read_file_bool); -static ssize_t write_file_bool(struct file *file, const char __user *user_buf, - size_t count, loff_t *ppos) +ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) { char buf[32]; size_t buf_size; @@ -468,10 +469,11 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf, return count; } +EXPORT_SYMBOL_GPL(debugfs_write_file_bool); static const struct file_operations fops_bool = { - .read = read_file_bool, - .write = write_file_bool, + .read = debugfs_read_file_bool, + .write = debugfs_write_file_bool, .open = simple_open, .llseek = default_llseek, }; diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 72afcc6..feef8a9 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -325,7 +325,6 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return rc; switch (cmd) { - case FITRIM: case FS_IOC32_GETFLAGS: case FS_IOC32_SETFLAGS: case FS_IOC32_GETVERSION: diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index aadb728..2553aa8 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -504,7 +504,7 @@ __read_extent_tree_block(const char *function, unsigned int line, struct buffer_head *bh; int err; - bh = sb_getblk(inode->i_sb, pblk); + bh = sb_getblk_gfp(inode->i_sb, pblk, __GFP_MOVABLE | GFP_NOFS); if (unlikely(!bh)) return ERR_PTR(-ENOMEM); @@ -1089,7 +1089,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, err = -EIO; goto cleanup; } - bh = sb_getblk(inode->i_sb, newblock); + bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS); if (unlikely(!bh)) { err = -ENOMEM; goto cleanup; @@ -1283,7 +1283,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, if (newblock == 0) return err; - bh = sb_getblk(inode->i_sb, newblock); + bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS); if (unlikely(!bh)) return -ENOMEM; lock_buffer(bh); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 41f8e55..cecf9aa 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1323,7 +1323,7 @@ static void ext4_da_page_release_reservation(struct page *page, unsigned int offset, unsigned int length) { - int to_release = 0; + int to_release = 0, contiguous_blks = 0; struct buffer_head *head, *bh; unsigned int curr_off = 0; struct inode *inode = page->mapping->host; @@ -1344,14 +1344,23 @@ static void ext4_da_page_release_reservation(struct page *page, if ((offset <= curr_off) && (buffer_delay(bh))) { to_release++; + contiguous_blks++; clear_buffer_delay(bh); + } else if (contiguous_blks) { + lblk = page->index << + (PAGE_CACHE_SHIFT - inode->i_blkbits); + lblk += (curr_off >> inode->i_blkbits) - + contiguous_blks; + ext4_es_remove_extent(inode, lblk, contiguous_blks); + contiguous_blks = 0; } curr_off = next_off; } while ((bh = bh->b_this_page) != head); - if (to_release) { + if (contiguous_blks) { lblk = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - ext4_es_remove_extent(inode, lblk, to_release); + lblk += (curr_off >> inode->i_blkbits) - contiguous_blks; + ext4_es_remove_extent(inode, lblk, contiguous_blks); } /* If we have released all the blocks belonging to a cluster, then we @@ -4344,7 +4353,12 @@ static void ext4_update_other_inodes_time(struct super_block *sb, int inode_size = EXT4_INODE_SIZE(sb); oi.orig_ino = orig_ino; - ino = (orig_ino & ~(inodes_per_block - 1)) + 1; + /* + * Calculate the first inode in the inode table block. Inode + * numbers are one-based. That is, the first inode in a block + * (assuming 4k blocks and 256 byte inodes) is (n*16 + 1). + */ + ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1; for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) { if (ino == orig_ino) continue; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index cb84512..1346cfa 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -755,7 +755,6 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return err; } case EXT4_IOC_MOVE_EXT: - case FITRIM: case EXT4_IOC_RESIZE_FS: case EXT4_IOC_PRECACHE_EXTENTS: case EXT4_IOC_SET_ENCRYPTION_POLICY: diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index f6aedf8..34b610e 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4816,18 +4816,12 @@ do_more: /* * blocks being freed are metadata. these blocks shouldn't * be used until this transaction is committed + * + * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed + * to fail. */ - retry: - new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS); - if (!new_entry) { - /* - * We use a retry loop because - * ext4_free_blocks() is not allowed to fail. - */ - cond_resched(); - congestion_wait(BLK_RW_ASYNC, HZ/50); - goto retry; - } + new_entry = kmem_cache_alloc(ext4_free_data_cachep, + GFP_NOFS|__GFP_NOFAIL); new_entry->efd_start_cluster = bit; new_entry->efd_group = block_group; new_entry->efd_count = count_clusters; diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index b52374e..6163ad2 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -620,6 +620,7 @@ int ext4_ind_migrate(struct inode *inode) struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_extent *ex; unsigned int i, len; + ext4_lblk_t start, end; ext4_fsblk_t blk; handle_t *handle; int ret; @@ -633,6 +634,14 @@ int ext4_ind_migrate(struct inode *inode) EXT4_FEATURE_RO_COMPAT_BIGALLOC)) return -EOPNOTSUPP; + /* + * In order to get correct extent info, force all delayed allocation + * blocks to be allocated, otherwise delayed allocation blocks may not + * be reflected and bypass the checks on extent header. + */ + if (test_opt(inode->i_sb, DELALLOC)) + ext4_alloc_da_blocks(inode); + handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -650,11 +659,13 @@ int ext4_ind_migrate(struct inode *inode) goto errout; } if (eh->eh_entries == 0) - blk = len = 0; + blk = len = start = end = 0; else { len = le16_to_cpu(ex->ee_len); blk = ext4_ext_pblock(ex); - if (len > EXT4_NDIR_BLOCKS) { + start = le32_to_cpu(ex->ee_block); + end = start + len - 1; + if (end >= EXT4_NDIR_BLOCKS) { ret = -EOPNOTSUPP; goto errout; } @@ -662,7 +673,7 @@ int ext4_ind_migrate(struct inode *inode) ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); memset(ei->i_data, 0, sizeof(ei->i_data)); - for (i=0; i < len; i++) + for (i = start; i <= end; i++) ei->i_data[i] = cpu_to_le32(blk++); ext4_mark_inode_dirty(handle, inode); errout: diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9bedfa8..f71e19a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2072,8 +2072,6 @@ static int f2fs_set_data_page_dirty(struct page *page) return 1; } - mark_inode_dirty(inode); - if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); update_dirty_page(inode, page); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ada2a3d..b0f38c3 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1331,12 +1331,13 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) if (ret) return ret; - if (f2fs_is_atomic_file(inode)) + if (f2fs_is_atomic_file(inode)) { + clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); commit_inmem_pages(inode, false); + } ret = f2fs_sync_file(filp, 0, LONG_MAX, 0); mnt_drop_write_file(filp); - clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); return ret; } @@ -1387,8 +1388,8 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) f2fs_balance_fs(F2FS_I_SB(inode)); if (f2fs_is_atomic_file(inode)) { - commit_inmem_pages(inode, false); clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + commit_inmem_pages(inode, false); } if (f2fs_is_volatile_file(inode)) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index e1e7361..22fb5ef 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -556,27 +556,39 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) if (!fio.encrypted_page) goto put_out; - f2fs_submit_page_bio(&fio); + err = f2fs_submit_page_bio(&fio); + if (err) + goto put_page_out; + + /* write page */ + lock_page(fio.encrypted_page); + + if (unlikely(!PageUptodate(fio.encrypted_page))) + goto put_page_out; + if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) + goto put_page_out; + + set_page_dirty(fio.encrypted_page); + f2fs_wait_on_page_writeback(fio.encrypted_page, META); + if (clear_page_dirty_for_io(fio.encrypted_page)) + dec_page_count(fio.sbi, F2FS_DIRTY_META); + + set_page_writeback(fio.encrypted_page); /* allocate block address */ f2fs_wait_on_page_writeback(dn.node_page, NODE); - allocate_data_block(fio.sbi, NULL, fio.blk_addr, &fio.blk_addr, &sum, CURSEG_COLD_DATA); - dn.data_blkaddr = fio.blk_addr; - - /* write page */ - lock_page(fio.encrypted_page); - set_page_writeback(fio.encrypted_page); fio.rw = WRITE_SYNC; f2fs_submit_page_mbio(&fio); + dn.data_blkaddr = fio.blk_addr; set_data_blkaddr(&dn); f2fs_update_extent_cache(&dn); set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); if (page->index == 0) set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); - +put_page_out: f2fs_put_page(fio.encrypted_page, 1); put_out: f2fs_put_dnode(&dn); @@ -605,8 +617,8 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) .page = page, .encrypted_page = NULL, }; + set_page_dirty(page); f2fs_wait_on_page_writeback(page, DATA); - if (clear_page_dirty_for_io(page)) inode_dec_dirty_pages(inode); set_cold_data(page); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 38e75fb..a13ffcc 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -141,6 +141,8 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) kunmap_atomic(dst_addr); SetPageUptodate(page); no_update: + set_page_dirty(page); + /* clear dirty state */ dirty = clear_page_dirty_for_io(page); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1eb3437..61b97f9 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -257,6 +257,7 @@ void commit_inmem_pages(struct inode *inode, bool abort) if (!abort) { lock_page(cur->page); if (cur->page->mapping == inode->i_mapping) { + set_page_dirty(cur->page); f2fs_wait_on_page_writeback(cur->page, DATA); if (clear_page_dirty_for_io(cur->page)) inode_dec_dirty_pages(inode); diff --git a/fs/file_table.c b/fs/file_table.c index 7f9d407..ad17e05 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -25,6 +25,7 @@ #include <linux/hardirq.h> #include <linux/task_work.h> #include <linux/ima.h> +#include <linux/swap.h> #include <linux/atomic.h> @@ -308,19 +309,24 @@ void put_filp(struct file *file) } } -void __init files_init(unsigned long mempages) +void __init files_init(void) { - unsigned long n; - filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + percpu_counter_init(&nr_files, 0, GFP_KERNEL); +} - /* - * One file with associated inode and dcache is very roughly 1K. - * Per default don't use more than 10% of our memory for files. - */ +/* + * One file with associated inode and dcache is very roughly 1K. Per default + * do not use more than 10% of our memory for files. + */ +void __init files_maxfiles_init(void) +{ + unsigned long n; + unsigned long memreserve = (totalram_pages - nr_free_pages()) * 3/2; + + memreserve = min(memreserve, totalram_pages - 1); + n = ((totalram_pages - memreserve) * (PAGE_SIZE / 1024)) / 10; - n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = max_t(unsigned long, n, NR_FILE); - percpu_counter_init(&nr_files, 0, GFP_KERNEL); } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index f0520bc..5fa588e 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -702,6 +702,7 @@ void wbc_account_io(struct writeback_control *wbc, struct page *page, else wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes); } +EXPORT_SYMBOL_GPL(wbc_account_io); /** * inode_congested - test whether an inode is congested @@ -843,14 +844,15 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, struct wb_iter iter; might_sleep(); - - if (!bdi_has_dirty_io(bdi)) - return; restart: rcu_read_lock(); bdi_for_each_wb(wb, bdi, &iter, next_blkcg_id) { - if (!wb_has_dirty_io(wb) || - (skip_if_busy && writeback_in_progress(wb))) + /* SYNC_ALL writes out I_DIRTY_TIME too */ + if (!wb_has_dirty_io(wb) && + (base_work->sync_mode == WB_SYNC_NONE || + list_empty(&wb->b_dirty_time))) + continue; + if (skip_if_busy && writeback_in_progress(wb)) continue; base_work->nr_pages = wb_split_bdi_pages(wb, nr_pages); @@ -898,8 +900,7 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, { might_sleep(); - if (bdi_has_dirty_io(bdi) && - (!skip_if_busy || !writeback_in_progress(&bdi->wb))) { + if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) { base_work->auto_free = 0; base_work->single_wait = 0; base_work->single_done = 0; @@ -2274,8 +2275,12 @@ void sync_inodes_sb(struct super_block *sb) }; struct backing_dev_info *bdi = sb->s_bdi; - /* Nothing to do? */ - if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info) + /* + * Can't skip on !bdi_has_dirty() because we should wait for !dirty + * inodes under writeback and I_DIRTY_TIME inodes ignored by + * bdi_has_dirty() need to be written out too. + */ + if (bdi == &noop_backing_dev_info) return; WARN_ON(!rwsem_is_locked(&sb->s_umount)); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 80cc1b3..ebb5e37 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2246,7 +2246,15 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, err = -EINVAL; if (old) { - struct fuse_dev *fud = fuse_get_dev(old); + struct fuse_dev *fud = NULL; + + /* + * Check against file->f_op because CUSE + * uses the same ioctl handler. + */ + if (old->f_op == file->f_op && + old->f_cred->user_ns == file->f_cred->user_ns) + fud = fuse_get_dev(old); if (fud) { mutex_lock(&fuse_mutex); diff --git a/fs/hpfs/alloc.c b/fs/hpfs/alloc.c index f005046..d6a4b55 100644 --- a/fs/hpfs/alloc.c +++ b/fs/hpfs/alloc.c @@ -484,3 +484,98 @@ struct anode *hpfs_alloc_anode(struct super_block *s, secno near, anode_secno *a a->btree.first_free = cpu_to_le16(8); return a; } + +static unsigned find_run(__le32 *bmp, unsigned *idx) +{ + unsigned len; + while (tstbits(bmp, *idx, 1)) { + (*idx)++; + if (unlikely(*idx >= 0x4000)) + return 0; + } + len = 1; + while (!tstbits(bmp, *idx + len, 1)) + len++; + return len; +} + +static int do_trim(struct super_block *s, secno start, unsigned len, secno limit_start, secno limit_end, unsigned minlen, unsigned *result) +{ + int err; + secno end; + if (fatal_signal_pending(current)) + return -EINTR; + end = start + len; + if (start < limit_start) + start = limit_start; + if (end > limit_end) + end = limit_end; + if (start >= end) + return 0; + if (end - start < minlen) + return 0; + err = sb_issue_discard(s, start, end - start, GFP_NOFS, 0); + if (err) + return err; + *result += end - start; + return 0; +} + +int hpfs_trim_fs(struct super_block *s, u64 start, u64 end, u64 minlen, unsigned *result) +{ + int err = 0; + struct hpfs_sb_info *sbi = hpfs_sb(s); + unsigned idx, len, start_bmp, end_bmp; + __le32 *bmp; + struct quad_buffer_head qbh; + + *result = 0; + if (!end || end > sbi->sb_fs_size) + end = sbi->sb_fs_size; + if (start >= sbi->sb_fs_size) + return 0; + if (minlen > 0x4000) + return 0; + if (start < sbi->sb_dirband_start + sbi->sb_dirband_size && end > sbi->sb_dirband_start) { + hpfs_lock(s); + if (s->s_flags & MS_RDONLY) { + err = -EROFS; + goto unlock_1; + } + if (!(bmp = hpfs_map_dnode_bitmap(s, &qbh))) { + err = -EIO; + goto unlock_1; + } + idx = 0; + while ((len = find_run(bmp, &idx)) && !err) { + err = do_trim(s, sbi->sb_dirband_start + idx * 4, len * 4, start, end, minlen, result); + idx += len; + } + hpfs_brelse4(&qbh); +unlock_1: + hpfs_unlock(s); + } + start_bmp = start >> 14; + end_bmp = (end + 0x3fff) >> 14; + while (start_bmp < end_bmp && !err) { + hpfs_lock(s); + if (s->s_flags & MS_RDONLY) { + err = -EROFS; + goto unlock_2; + } + if (!(bmp = hpfs_map_bitmap(s, start_bmp, &qbh, "trim"))) { + err = -EIO; + goto unlock_2; + } + idx = 0; + while ((len = find_run(bmp, &idx)) && !err) { + err = do_trim(s, (start_bmp << 14) + idx, len, start, end, minlen, result); + idx += len; + } + hpfs_brelse4(&qbh); +unlock_2: + hpfs_unlock(s); + start_bmp++; + } + return err; +} diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 2a8e074..dc540bf 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -327,4 +327,5 @@ const struct file_operations hpfs_dir_ops = .iterate = hpfs_readdir, .release = hpfs_dir_release, .fsync = hpfs_file_fsync, + .unlocked_ioctl = hpfs_ioctl, }; diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 6d8cfe9..7ca28d6 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -203,6 +203,7 @@ const struct file_operations hpfs_file_ops = .release = hpfs_file_release, .fsync = hpfs_file_fsync, .splice_read = generic_file_splice_read, + .unlocked_ioctl = hpfs_ioctl, }; const struct inode_operations hpfs_file_iops = diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index bb04b58..c4867b5 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -18,6 +18,8 @@ #include <linux/pagemap.h> #include <linux/buffer_head.h> #include <linux/slab.h> +#include <linux/sched.h> +#include <linux/blkdev.h> #include <asm/unaligned.h> #include "hpfs.h" @@ -200,6 +202,7 @@ void hpfs_free_dnode(struct super_block *, secno); struct dnode *hpfs_alloc_dnode(struct super_block *, secno, dnode_secno *, struct quad_buffer_head *); struct fnode *hpfs_alloc_fnode(struct super_block *, secno, fnode_secno *, struct buffer_head **); struct anode *hpfs_alloc_anode(struct super_block *, secno, anode_secno *, struct buffer_head **); +int hpfs_trim_fs(struct super_block *, u64, u64, u64, unsigned *); /* anode.c */ @@ -318,6 +321,7 @@ __printf(2, 3) void hpfs_error(struct super_block *, const char *, ...); int hpfs_stop_cycles(struct super_block *, int, int *, int *, char *); unsigned hpfs_get_free_dnodes(struct super_block *); +long hpfs_ioctl(struct file *file, unsigned cmd, unsigned long arg); /* * local time (HPFS) to GMT (Unix) diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 7cd00d3..68a9bed0 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -52,17 +52,20 @@ static void unmark_dirty(struct super_block *s) } /* Filesystem error... */ -static char err_buf[1024]; - void hpfs_error(struct super_block *s, const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - vsnprintf(err_buf, sizeof(err_buf), fmt, args); + + vaf.fmt = fmt; + vaf.va = &args; + + pr_err("filesystem error: %pV", &vaf); + va_end(args); - pr_err("filesystem error: %s", err_buf); if (!hpfs_sb(s)->sb_was_error) { if (hpfs_sb(s)->sb_err == 2) { pr_cont("; crashing the system because you wanted it\n"); @@ -196,12 +199,39 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } + +long hpfs_ioctl(struct file *file, unsigned cmd, unsigned long arg) +{ + switch (cmd) { + case FITRIM: { + struct fstrim_range range; + secno n_trimmed; + int r; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range))) + return -EFAULT; + r = hpfs_trim_fs(file_inode(file)->i_sb, range.start >> 9, (range.start + range.len) >> 9, (range.minlen + 511) >> 9, &n_trimmed); + if (r) + return r; + range.len = (u64)n_trimmed << 9; + if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range))) + return -EFAULT; + return 0; + } + default: { + return -ENOIOCTLCMD; + } + } +} + + static struct kmem_cache * hpfs_inode_cachep; static struct inode *hpfs_alloc_inode(struct super_block *sb) { struct hpfs_inode_info *ei; - ei = (struct hpfs_inode_info *)kmem_cache_alloc(hpfs_inode_cachep, GFP_NOFS); + ei = kmem_cache_alloc(hpfs_inode_cachep, GFP_NOFS); if (!ei) return NULL; ei->vfs_inode.i_version = 1; @@ -424,11 +454,14 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) int o; struct hpfs_sb_info *sbi = hpfs_sb(s); char *new_opts = kstrdup(data, GFP_KERNEL); - + + if (!new_opts) + return -ENOMEM; + sync_filesystem(s); *flags |= MS_NOATIME; - + hpfs_lock(s); uid = sbi->sb_uid; gid = sbi->sb_gid; umask = 0777 & ~sbi->sb_mode; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 0cf74df..973c24c 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1010,6 +1010,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size, inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0); if (!inode) goto out_dentry; + if (creat_flags == HUGETLB_SHMFS_INODE) + inode->i_flags |= S_PRIVATE; file = ERR_PTR(-ENOMEM); if (hugetlb_reserve_pages(inode, 0, diff --git a/fs/jfs/file.c b/fs/jfs/file.c index e98d39d..b9dc23c 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -76,7 +76,7 @@ static int jfs_open(struct inode *inode, struct file *file) if (ji->active_ag == -1) { struct jfs_sb_info *jfs_sb = JFS_SBI(inode->i_sb); ji->active_ag = BLKTOAG(addressPXD(&ji->ixpxd), jfs_sb); - atomic_inc( &jfs_sb->bmap->db_active[ji->active_ag]); + atomic_inc(&jfs_sb->bmap->db_active[ji->active_ag]); } spin_unlock_irq(&ji->ag_lock); } diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 6f1cb2b..41aa3ca 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -134,11 +134,11 @@ int jfs_write_inode(struct inode *inode, struct writeback_control *wbc) * It has been committed since the last change, but was still * on the dirty inode list. */ - if (!test_cflag(COMMIT_Dirty, inode)) { + if (!test_cflag(COMMIT_Dirty, inode)) { /* Make sure committed changes hit the disk */ jfs_flush_journal(JFS_SBI(inode->i_sb)->log, wait); return 0; - } + } if (jfs_commit_inode(inode, wait)) { jfs_err("jfs_write_inode: jfs_commit_inode failed!"); diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index 93a1232..8db8b7d 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -180,9 +180,6 @@ long jfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case JFS_IOC_SETFLAGS32: cmd = JFS_IOC_SETFLAGS; break; - case FITRIM: - cmd = FITRIM; - break; } return jfs_ioctl(filp, cmd, arg); } diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index e33be92..a5ac97b 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1160,7 +1160,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, rc = dtModify(tid, new_dir, &new_dname, &ino, old_ip->i_ino, JFS_RENAME); if (rc) - goto out4; + goto out_tx; drop_nlink(new_ip); if (S_ISDIR(new_ip->i_mode)) { drop_nlink(new_ip); @@ -1185,7 +1185,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, if ((new_size = commitZeroLink(tid, new_ip)) < 0) { txAbort(tid, 1); /* Marks FS Dirty */ rc = new_size; - goto out4; + goto out_tx; } tblk = tid_to_tblock(tid); tblk->xflag |= COMMIT_DELETE; @@ -1203,7 +1203,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (rc) { jfs_err("jfs_rename didn't expect dtSearch to fail " "w/rc = %d", rc); - goto out4; + goto out_tx; } ino = old_ip->i_ino; @@ -1211,7 +1211,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (rc) { if (rc == -EIO) jfs_err("jfs_rename: dtInsert returned -EIO"); - goto out4; + goto out_tx; } if (S_ISDIR(old_ip->i_mode)) inc_nlink(new_dir); @@ -1226,7 +1226,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, jfs_err("jfs_rename did not expect dtDelete to return rc = %d", rc); txAbort(tid, 1); /* Marks Filesystem dirty */ - goto out4; + goto out_tx; } if (S_ISDIR(old_ip->i_mode)) { drop_nlink(old_dir); @@ -1285,7 +1285,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, rc = txCommit(tid, ipcount, iplist, commit_flag); - out4: + out_tx: txEnd(tid); if (new_ip) mutex_unlock(&JFS_IP(new_ip)->commit_mutex); @@ -1308,13 +1308,6 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, } if (new_ip && (new_ip->i_nlink == 0)) set_cflag(COMMIT_Nolink, new_ip); - out3: - free_UCSname(&new_dname); - out2: - free_UCSname(&old_dname); - out1: - if (new_ip && !S_ISDIR(new_ip->i_mode)) - IWRITE_UNLOCK(new_ip); /* * Truncating the directory index table is not guaranteed. It * may need to be done iteratively @@ -1325,7 +1318,13 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, clear_cflag(COMMIT_Stale, old_dir); } - + if (new_ip && !S_ISDIR(new_ip->i_mode)) + IWRITE_UNLOCK(new_ip); + out3: + free_UCSname(&new_dname); + out2: + free_UCSname(&old_dname); + out1: jfs_info("jfs_rename: returning %d", rc); return rc; } @@ -862,12 +862,11 @@ static int posix_locks_deadlock(struct file_lock *caller_fl, * whether or not a lock was successfully freed by testing the return * value for -ENOENT. */ -static int flock_lock_file(struct file *filp, struct file_lock *request) +static int flock_lock_inode(struct inode *inode, struct file_lock *request) { struct file_lock *new_fl = NULL; struct file_lock *fl; struct file_lock_context *ctx; - struct inode *inode = file_inode(filp); int error = 0; bool found = false; LIST_HEAD(dispose); @@ -890,7 +889,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) goto find_conflict; list_for_each_entry(fl, &ctx->flc_flock, fl_list) { - if (filp != fl->fl_file) + if (request->fl_file != fl->fl_file) continue; if (request->fl_type == fl->fl_type) goto out; @@ -1164,20 +1163,19 @@ int posix_lock_file(struct file *filp, struct file_lock *fl, EXPORT_SYMBOL(posix_lock_file); /** - * posix_lock_file_wait - Apply a POSIX-style lock to a file - * @filp: The file to apply the lock to + * posix_lock_inode_wait - Apply a POSIX-style lock to a file + * @inode: inode of file to which lock request should be applied * @fl: The lock to be applied * - * Add a POSIX style lock to a file. - * We merge adjacent & overlapping locks whenever possible. - * POSIX locks are sorted by owner task, then by starting address + * Variant of posix_lock_file_wait that does not take a filp, and so can be + * used after the filp has already been torn down. */ -int posix_lock_file_wait(struct file *filp, struct file_lock *fl) +int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl) { int error; might_sleep (); for (;;) { - error = posix_lock_file(filp, fl, NULL); + error = __posix_lock_file(inode, fl, NULL); if (error != FILE_LOCK_DEFERRED) break; error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); @@ -1189,7 +1187,7 @@ int posix_lock_file_wait(struct file *filp, struct file_lock *fl) } return error; } -EXPORT_SYMBOL(posix_lock_file_wait); +EXPORT_SYMBOL(posix_lock_inode_wait); /** * locks_mandatory_locked - Check for an active lock @@ -1851,18 +1849,18 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg) } /** - * flock_lock_file_wait - Apply a FLOCK-style lock to a file - * @filp: The file to apply the lock to + * flock_lock_inode_wait - Apply a FLOCK-style lock to a file + * @inode: inode of the file to apply to * @fl: The lock to be applied * - * Add a FLOCK style lock to a file. + * Apply a FLOCK style lock request to an inode. */ -int flock_lock_file_wait(struct file *filp, struct file_lock *fl) +int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl) { int error; might_sleep(); for (;;) { - error = flock_lock_file(filp, fl); + error = flock_lock_inode(inode, fl); if (error != FILE_LOCK_DEFERRED) break; error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); @@ -1874,8 +1872,7 @@ int flock_lock_file_wait(struct file *filp, struct file_lock *fl) } return error; } - -EXPORT_SYMBOL(flock_lock_file_wait); +EXPORT_SYMBOL(flock_lock_inode_wait); /** * sys_flock: - flock() system call. @@ -2401,7 +2398,8 @@ locks_remove_flock(struct file *filp) .fl_type = F_UNLCK, .fl_end = OFFSET_MAX, }; - struct file_lock_context *flctx = file_inode(filp)->i_flctx; + struct inode *inode = file_inode(filp); + struct file_lock_context *flctx = inode->i_flctx; if (list_empty(&flctx->flc_flock)) return; @@ -2409,7 +2407,7 @@ locks_remove_flock(struct file *filp) if (filp->f_op->flock) filp->f_op->flock(filp, F_SETLKW, &fl); else - flock_lock_file(filp, &fl); + flock_lock_inode(inode, &fl); if (fl.fl_ops && fl.fl_ops->fl_release_private) fl.fl_ops->fl_release_private(&fl); @@ -879,7 +879,7 @@ static inline int may_follow_link(struct nameidata *nd) return 0; /* Allowed if parent directory not sticky and world-writable. */ - parent = nd->path.dentry->d_inode; + parent = nd->inode; if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH)) return 0; @@ -1954,8 +1954,13 @@ OK: continue; } } - if (unlikely(!d_can_lookup(nd->path.dentry))) + if (unlikely(!d_can_lookup(nd->path.dentry))) { + if (nd->flags & LOOKUP_RCU) { + if (unlazy_walk(nd, NULL, 0)) + return -ECHILD; + } return -ENOTDIR; + } } } diff --git a/fs/namespace.c b/fs/namespace.c index c7cb8a5..2b8aa15 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1361,6 +1361,36 @@ enum umount_tree_flags { UMOUNT_PROPAGATE = 2, UMOUNT_CONNECTED = 4, }; + +static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how) +{ + /* Leaving mounts connected is only valid for lazy umounts */ + if (how & UMOUNT_SYNC) + return true; + + /* A mount without a parent has nothing to be connected to */ + if (!mnt_has_parent(mnt)) + return true; + + /* Because the reference counting rules change when mounts are + * unmounted and connected, umounted mounts may not be + * connected to mounted mounts. + */ + if (!(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT)) + return true; + + /* Has it been requested that the mount remain connected? */ + if (how & UMOUNT_CONNECTED) + return false; + + /* Is the mount locked such that it needs to remain connected? */ + if (IS_MNT_LOCKED(mnt)) + return false; + + /* By default disconnect the mount */ + return true; +} + /* * mount_lock must be held * namespace_sem must be held for write @@ -1398,10 +1428,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) if (how & UMOUNT_SYNC) p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; - disconnect = !(((how & UMOUNT_CONNECTED) && - mnt_has_parent(p) && - (p->mnt_parent->mnt.mnt_flags & MNT_UMOUNT)) || - IS_MNT_LOCKED_AND_LAZY(p)); + disconnect = disconnect_mount(p, how); pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt, disconnect ? &unmounted : NULL); @@ -1538,11 +1565,8 @@ void __detach_mounts(struct dentry *dentry) while (!hlist_empty(&mp->m_list)) { mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); if (mnt->mnt.mnt_flags & MNT_UMOUNT) { - struct mount *p, *tmp; - list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) { - hlist_add_head(&p->mnt_umount.s_list, &unmounted); - umount_mnt(p); - } + hlist_add_head(&mnt->mnt_umount.s_list, &unmounted); + umount_mnt(mnt); } else umount_tree(mnt, UMOUNT_CONNECTED); } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index ecebb40..4a90c9b 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -775,7 +775,7 @@ static int nfs_init_server(struct nfs_server *server, server->options = data->options; server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID| NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP| - NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME|NFS_CAP_CHANGE_ATTR; + NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME; if (data->rsize) server->rsize = nfs_block_size(data->rsize, NULL); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index c12951b..b3289d7 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1852,7 +1852,7 @@ ff_layout_mirror_prepare_stats(struct nfs42_layoutstat_args *args, struct nfs42_layoutstat_devinfo *devinfo; int i; - for (i = 0; i <= FF_LAYOUT_MIRROR_COUNT(pls); i++) { + for (i = 0; i < FF_LAYOUT_MIRROR_COUNT(pls); i++) { if (*dev_count >= dev_limit) break; mirror = FF_LAYOUT_COMP(pls, i); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index b77b328..0adc7d2 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -442,8 +442,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_CHANGE) inode->i_version = fattr->change_attr; - else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + else + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR + | NFS_INO_REVAL_PAGECACHE); if (fattr->valid & NFS_ATTR_FATTR_SIZE) inode->i_size = nfs_size_to_loff_t(fattr->size); else @@ -1244,9 +1245,11 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if (fattr->valid & NFS_ATTR_FATTR_SIZE) { cur_size = i_size_read(inode); new_isize = nfs_size_to_loff_t(fattr->size); - if (cur_size != new_isize && nfsi->nrequests == 0) + if (cur_size != new_isize) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; } + if (nfsi->nrequests != 0) + invalid &= ~NFS_INO_REVAL_PAGECACHE; /* Have any file permissions changed? */ if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) @@ -1684,13 +1687,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA | NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL - | NFS_INO_REVAL_PAGECACHE; + | NFS_INO_INVALID_ACL; if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); inode->i_version = fattr->change_attr; } - } else if (server->caps & NFS_CAP_CHANGE_ATTR) + } else nfsi->cache_validity |= save_cache_validity; if (fattr->valid & NFS_ATTR_FATTR_MTIME) { @@ -1717,7 +1719,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if ((nfsi->nrequests == 0) || new_isize > cur_isize) { i_size_write(inode, new_isize); invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; - invalid &= ~NFS_INO_REVAL_PAGECACHE; } dprintk("NFS: isize change on server for file %s/%ld " "(%Ld to %Ld)\n", diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 7e3c460..9b372b8 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -296,6 +296,22 @@ extern struct rpc_procinfo nfs4_procedures[]; #ifdef CONFIG_NFS_V4_SECURITY_LABEL extern struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags); +static inline struct nfs4_label * +nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src) +{ + if (!dst || !src) + return NULL; + + if (src->len > NFS4_MAXLABELLEN) + return NULL; + + dst->lfs = src->lfs; + dst->pi = src->pi; + dst->len = src->len; + memcpy(dst->label, src->label, src->len); + + return dst; +} static inline void nfs4_label_free(struct nfs4_label *label) { if (label) { @@ -316,6 +332,11 @@ static inline void nfs4_label_free(void *label) {} static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi) { } +static inline struct nfs4_label * +nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src) +{ + return NULL; +} #endif /* CONFIG_NFS_V4_SECURITY_LABEL */ /* proc.c */ diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index f486b80..d731bbf 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -135,7 +135,7 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) return err; } -loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) +static loff_t _nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) { struct inode *inode = file_inode(filep); struct nfs42_seek_args args = { @@ -171,6 +171,23 @@ loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes); } +loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) +{ + struct nfs_server *server = NFS_SERVER(file_inode(filep)); + struct nfs4_exception exception = { }; + int err; + + do { + err = _nfs42_proc_llseek(filep, offset, whence); + if (err == -ENOTSUPP) + return -EOPNOTSUPP; + err = nfs4_handle_exception(server, err, &exception); + } while (exception.retry); + + return err; +} + + static void nfs42_layoutstat_prepare(struct rpc_task *task, void *calldata) { diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6f228b5..3acb1eb 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -467,7 +467,10 @@ static void do_renew_lease(struct nfs_client *clp, unsigned long timestamp) static void renew_lease(const struct nfs_server *server, unsigned long timestamp) { - do_renew_lease(server->nfs_client, timestamp); + struct nfs_client *clp = server->nfs_client; + + if (!nfs4_has_session(clp)) + do_renew_lease(clp, timestamp); } struct nfs4_call_sync_data { @@ -616,8 +619,7 @@ int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) clp = session->clp; do_renew_lease(clp, res->sr_timestamp); /* Check sequence flags */ - if (res->sr_status_flags != 0) - nfs4_schedule_lease_recovery(clp); + nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); nfs41_update_target_slotid(slot->table, slot, res); break; case 1: @@ -910,6 +912,7 @@ struct nfs4_opendata { struct nfs_open_confirmres c_res; struct nfs4_string owner_name; struct nfs4_string group_name; + struct nfs4_label *a_label; struct nfs_fattr f_attr; struct nfs4_label *f_label; struct dentry *dir; @@ -1013,6 +1016,10 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, if (IS_ERR(p->f_label)) goto err_free_p; + p->a_label = nfs4_label_alloc(server, gfp_mask); + if (IS_ERR(p->a_label)) + goto err_free_f; + alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid; p->o_arg.seqid = alloc_seqid(&sp->so_seqid, gfp_mask); if (IS_ERR(p->o_arg.seqid)) @@ -1041,7 +1048,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, p->o_arg.server = server; p->o_arg.bitmask = nfs4_bitmask(server, label); p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0]; - p->o_arg.label = label; + p->o_arg.label = nfs4_label_copy(p->a_label, label); p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim); switch (p->o_arg.claim) { case NFS4_OPEN_CLAIM_NULL: @@ -1074,6 +1081,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, return p; err_free_label: + nfs4_label_free(p->a_label); +err_free_f: nfs4_label_free(p->f_label); err_free_p: kfree(p); @@ -1093,6 +1102,7 @@ static void nfs4_opendata_free(struct kref *kref) nfs4_put_open_state(p->state); nfs4_put_state_owner(p->owner); + nfs4_label_free(p->a_label); nfs4_label_free(p->f_label); dput(p->dir); @@ -1198,12 +1208,15 @@ static bool nfs_need_update_open_stateid(struct nfs4_state *state, static void nfs_resync_open_stateid_locked(struct nfs4_state *state) { + if (!(state->n_wronly || state->n_rdonly || state->n_rdwr)) + return; if (state->n_wronly) set_bit(NFS_O_WRONLY_STATE, &state->flags); if (state->n_rdonly) set_bit(NFS_O_RDONLY_STATE, &state->flags); if (state->n_rdwr) set_bit(NFS_O_RDWR_STATE, &state->flags); + set_bit(NFS_OPEN_STATE, &state->flags); } static void nfs_clear_open_stateid_locked(struct nfs4_state *state, @@ -5439,15 +5452,15 @@ static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock * return err; } -static int do_vfs_lock(struct file *file, struct file_lock *fl) +static int do_vfs_lock(struct inode *inode, struct file_lock *fl) { int res = 0; switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) { case FL_POSIX: - res = posix_lock_file_wait(file, fl); + res = posix_lock_inode_wait(inode, fl); break; case FL_FLOCK: - res = flock_lock_file_wait(file, fl); + res = flock_lock_inode_wait(inode, fl); break; default: BUG(); @@ -5484,7 +5497,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, atomic_inc(&lsp->ls_count); /* Ensure we don't close file until we're done freeing locks! */ p->ctx = get_nfs_open_context(ctx); - get_file(fl->fl_file); memcpy(&p->fl, fl, sizeof(p->fl)); p->server = NFS_SERVER(inode); return p; @@ -5496,7 +5508,6 @@ static void nfs4_locku_release_calldata(void *data) nfs_free_seqid(calldata->arg.seqid); nfs4_put_lock_state(calldata->lsp); put_nfs_open_context(calldata->ctx); - fput(calldata->fl.fl_file); kfree(calldata); } @@ -5509,7 +5520,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) switch (task->tk_status) { case 0: renew_lease(calldata->server, calldata->timestamp); - do_vfs_lock(calldata->fl.fl_file, &calldata->fl); + do_vfs_lock(calldata->lsp->ls_state->inode, &calldata->fl); if (nfs4_update_lock_stateid(calldata->lsp, &calldata->res.stateid)) break; @@ -5617,7 +5628,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * mutex_lock(&sp->so_delegreturn_mutex); /* Exclude nfs4_reclaim_open_stateid() - note nesting! */ down_read(&nfsi->rwsem); - if (do_vfs_lock(request->fl_file, request) == -ENOENT) { + if (do_vfs_lock(inode, request) == -ENOENT) { up_read(&nfsi->rwsem); mutex_unlock(&sp->so_delegreturn_mutex); goto out; @@ -5758,7 +5769,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) data->timestamp); if (data->arg.new_lock) { data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS); - if (do_vfs_lock(data->fl.fl_file, &data->fl) < 0) { + if (do_vfs_lock(lsp->ls_state->inode, &data->fl) < 0) { rpc_restart_call_prepare(task); break; } @@ -6000,7 +6011,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock if (status != 0) goto out; request->fl_flags |= FL_ACCESS; - status = do_vfs_lock(request->fl_file, request); + status = do_vfs_lock(state->inode, request); if (status < 0) goto out; down_read(&nfsi->rwsem); @@ -6008,7 +6019,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock /* Yes: cache locks! */ /* ...but avoid races with delegation recall... */ request->fl_flags = fl_flags & ~FL_SLEEP; - status = do_vfs_lock(request->fl_file, request); + status = do_vfs_lock(state->inode, request); up_read(&nfsi->rwsem); goto out; } @@ -7573,13 +7584,8 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) goto out; } ret = rpc_wait_for_completion_task(task); - if (!ret) { - struct nfs4_sequence_res *res = task->tk_msg.rpc_resp; - - if (task->tk_status == 0) - nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); + if (!ret) ret = task->tk_status; - } rpc_put_task(task); out: dprintk("<-- %s status=%d\n", __func__, ret); @@ -7967,16 +7973,17 @@ static void nfs4_layoutreturn_release(void *calldata) { struct nfs4_layoutreturn *lrp = calldata; struct pnfs_layout_hdr *lo = lrp->args.layout; + LIST_HEAD(freeme); dprintk("--> %s\n", __func__); spin_lock(&lo->plh_inode->i_lock); if (lrp->res.lrs_present) pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); + pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range); pnfs_clear_layoutreturn_waitbit(lo); - clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags); - rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); lo->plh_block_lgets--; spin_unlock(&lo->plh_inode->i_lock); + pnfs_free_lseg_list(&freeme); pnfs_put_layout_hdr(lrp->args.layout); nfs_iput_and_deactive(lrp->inode); kfree(calldata); @@ -8590,7 +8597,6 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { .minor_version = 0, .init_caps = NFS_CAP_READDIRPLUS | NFS_CAP_ATOMIC_OPEN - | NFS_CAP_CHANGE_ATTR | NFS_CAP_POSIX_LOCK, .init_client = nfs40_init_client, .shutdown_client = nfs40_shutdown_client, @@ -8616,7 +8622,6 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { .minor_version = 1, .init_caps = NFS_CAP_READDIRPLUS | NFS_CAP_ATOMIC_OPEN - | NFS_CAP_CHANGE_ATTR | NFS_CAP_POSIX_LOCK | NFS_CAP_STATEID_NFSV41 | NFS_CAP_ATOMIC_OPEN_V1, @@ -8639,7 +8644,6 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { .minor_version = 2, .init_caps = NFS_CAP_READDIRPLUS | NFS_CAP_ATOMIC_OPEN - | NFS_CAP_CHANGE_ATTR | NFS_CAP_POSIX_LOCK | NFS_CAP_STATEID_NFSV41 | NFS_CAP_ATOMIC_OPEN_V1 diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 605840d..f2e2ad8 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2191,25 +2191,35 @@ static void nfs41_handle_server_reboot(struct nfs_client *clp) } } -static void nfs41_handle_state_revoked(struct nfs_client *clp) +static void nfs41_handle_all_state_revoked(struct nfs_client *clp) { nfs4_reset_all_state(clp); dprintk("%s: state revoked on server %s\n", __func__, clp->cl_hostname); } +static void nfs41_handle_some_state_revoked(struct nfs_client *clp) +{ + nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce); + nfs4_schedule_state_manager(clp); + + dprintk("%s: state revoked on server %s\n", __func__, clp->cl_hostname); +} + static void nfs41_handle_recallable_state_revoked(struct nfs_client *clp) { - /* This will need to handle layouts too */ - nfs_expire_all_delegations(clp); + /* FIXME: For now, we destroy all layouts. */ + pnfs_destroy_all_layouts(clp); + /* FIXME: For now, we test all delegations+open state+locks. */ + nfs41_handle_some_state_revoked(clp); dprintk("%s: Recallable state revoked on server %s!\n", __func__, clp->cl_hostname); } static void nfs41_handle_backchannel_fault(struct nfs_client *clp) { - nfs_expire_all_delegations(clp); - if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0) - nfs4_schedule_state_manager(clp); + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + nfs4_schedule_state_manager(clp); + dprintk("%s: server %s declared a backchannel fault\n", __func__, clp->cl_hostname); } @@ -2231,10 +2241,11 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) nfs41_handle_server_reboot(clp); - if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED | - SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED | + if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED)) + nfs41_handle_all_state_revoked(clp); + if (flags & (SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED | SEQ4_STATUS_ADMIN_STATE_REVOKED)) - nfs41_handle_state_revoked(clp); + nfs41_handle_some_state_revoked(clp); if (flags & SEQ4_STATUS_LEASE_MOVED) nfs4_schedule_lease_moved_recovery(clp); if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 1da68d3..4984bbe 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -1100,8 +1100,6 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) mirror->pg_base = 0; mirror->pg_recoalesce = 0; - desc->pg_moreio = 0; - while (!list_empty(&head)) { struct nfs_page *req; @@ -1109,8 +1107,11 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) nfs_list_remove_request(req); if (__nfs_pageio_add_request(desc, req)) continue; - if (desc->pg_error < 0) + if (desc->pg_error < 0) { + list_splice_tail(&head, &mirror->pg_list); + mirror->pg_recoalesce = 1; return 0; + } break; } } while (mirror->pg_recoalesce); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 0ba9a02..70bf706 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -352,7 +352,7 @@ pnfs_layout_need_return(struct pnfs_layout_hdr *lo, { struct pnfs_layout_segment *s; - if (!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) + if (!test_and_clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) return false; list_for_each_entry(s, &lo->plh_segs, pls_list) @@ -362,6 +362,18 @@ pnfs_layout_need_return(struct pnfs_layout_hdr *lo, return true; } +static bool +pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo) +{ + if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) + return false; + lo->plh_return_iomode = 0; + lo->plh_block_lgets++; + pnfs_get_layout_hdr(lo); + clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags); + return true; +} + static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg, struct pnfs_layout_hdr *lo, struct inode *inode) { @@ -372,17 +384,16 @@ static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg, if (pnfs_layout_need_return(lo, lseg)) { nfs4_stateid stateid; enum pnfs_iomode iomode; + bool send; stateid = lo->plh_stateid; iomode = lo->plh_return_iomode; - /* decreased in pnfs_send_layoutreturn() */ - lo->plh_block_lgets++; - lo->plh_return_iomode = 0; + send = pnfs_prepare_layoutreturn(lo); spin_unlock(&inode->i_lock); - pnfs_get_layout_hdr(lo); - - /* Send an async layoutreturn so we dont deadlock */ - pnfs_send_layoutreturn(lo, stateid, iomode, false); + if (send) { + /* Send an async layoutreturn so we dont deadlock */ + pnfs_send_layoutreturn(lo, stateid, iomode, false); + } } else spin_unlock(&inode->i_lock); } @@ -411,6 +422,10 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg) pnfs_layoutreturn_before_put_lseg(lseg, lo, inode); if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { + if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) { + spin_unlock(&inode->i_lock); + return; + } pnfs_get_layout_hdr(lo); pnfs_layout_remove_lseg(lo, lseg); spin_unlock(&inode->i_lock); @@ -451,6 +466,8 @@ pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg) test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); if (atomic_dec_and_test(&lseg->pls_refcount)) { struct pnfs_layout_hdr *lo = lseg->pls_layout; + if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) + return; pnfs_get_layout_hdr(lo); pnfs_layout_remove_lseg(lo, lseg); pnfs_free_lseg_async(lseg); @@ -924,6 +941,7 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags); smp_mb__after_atomic(); wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN); + rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); } static int @@ -978,6 +996,7 @@ _pnfs_return_layout(struct inode *ino) LIST_HEAD(tmp_list); nfs4_stateid stateid; int status = 0, empty; + bool send; dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino); @@ -1007,17 +1026,18 @@ _pnfs_return_layout(struct inode *ino) /* Don't send a LAYOUTRETURN if list was initially empty */ if (empty) { spin_unlock(&ino->i_lock); - pnfs_put_layout_hdr(lo); dprintk("NFS: %s no layout segments to return\n", __func__); - goto out; + goto out_put_layout_hdr; } set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - lo->plh_block_lgets++; + send = pnfs_prepare_layoutreturn(lo); spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); - - status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); + if (send) + status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); +out_put_layout_hdr: + pnfs_put_layout_hdr(lo); out: dprintk("<-- %s status: %d\n", __func__, status); return status; @@ -1097,13 +1117,9 @@ bool pnfs_roc(struct inode *ino) out_noroc: if (lo) { stateid = lo->plh_stateid; - layoutreturn = - test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, - &lo->plh_flags); - if (layoutreturn) { - lo->plh_block_lgets++; - pnfs_get_layout_hdr(lo); - } + if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, + &lo->plh_flags)) + layoutreturn = pnfs_prepare_layoutreturn(lo); } spin_unlock(&ino->i_lock); if (layoutreturn) { @@ -1146,15 +1162,18 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task) struct pnfs_layout_segment *lseg; nfs4_stateid stateid; u32 current_seqid; - bool found = false, layoutreturn = false; + bool layoutreturn = false; spin_lock(&ino->i_lock); - list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) - if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { - rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); - found = true; - goto out; - } + list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) { + if (!test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) + continue; + if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) + continue; + rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); + spin_unlock(&ino->i_lock); + return true; + } lo = nfsi->layout; current_seqid = be32_to_cpu(lo->plh_stateid.seqid); @@ -1162,23 +1181,19 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task) * a barrier, we choose the worst-case barrier. */ *barrier = current_seqid + atomic_read(&lo->plh_outstanding); -out: - if (!found) { - stateid = lo->plh_stateid; - layoutreturn = - test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, - &lo->plh_flags); - if (layoutreturn) { - lo->plh_block_lgets++; - pnfs_get_layout_hdr(lo); - } - } + stateid = lo->plh_stateid; + if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, + &lo->plh_flags)) + layoutreturn = pnfs_prepare_layoutreturn(lo); + if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) + rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); + spin_unlock(&ino->i_lock); if (layoutreturn) { - rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, false); + return true; } - return found; + return false; } /* @@ -1695,7 +1710,6 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, spin_lock(&inode->i_lock); /* set failure bit so that pnfs path will be retried later */ pnfs_layout_set_fail_bit(lo, iomode); - set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); if (lo->plh_return_iomode == 0) lo->plh_return_iomode = range.iomode; else if (lo->plh_return_iomode != range.iomode) @@ -2207,13 +2221,12 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) if (ld->prepare_layoutcommit) { status = ld->prepare_layoutcommit(&data->args); if (status) { + put_rpccred(data->cred); spin_lock(&inode->i_lock); set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags); if (end_pos > nfsi->layout->plh_lwb) nfsi->layout->plh_lwb = end_pos; - spin_unlock(&inode->i_lock); - put_rpccred(data->cred); - goto clear_layoutcommitting; + goto out_unlock; } } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 65869ca..75a35a1 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1379,24 +1379,27 @@ static void nfs_writeback_check_extend(struct nfs_pgio_header *hdr, { struct nfs_pgio_args *argp = &hdr->args; struct nfs_pgio_res *resp = &hdr->res; + u64 size = argp->offset + resp->count; if (!(fattr->valid & NFS_ATTR_FATTR_SIZE)) + fattr->size = size; + if (nfs_size_to_loff_t(fattr->size) < i_size_read(hdr->inode)) { + fattr->valid &= ~NFS_ATTR_FATTR_SIZE; return; - if (argp->offset + resp->count != fattr->size) - return; - if (nfs_size_to_loff_t(fattr->size) < i_size_read(hdr->inode)) + } + if (size != fattr->size) return; /* Set attribute barrier */ nfs_fattr_set_barrier(fattr); + /* ...and update size */ + fattr->valid |= NFS_ATTR_FATTR_SIZE; } void nfs_writeback_update_inode(struct nfs_pgio_header *hdr) { - struct nfs_fattr *fattr = hdr->res.fattr; + struct nfs_fattr *fattr = &hdr->fattr; struct inode *inode = hdr->inode; - if (fattr == NULL) - return; spin_lock(&inode->i_lock); nfs_writeback_check_extend(hdr, fattr); nfs_post_op_update_inode_force_wcc_locked(inode, fattr); diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 6904213..ebf90e4 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -212,6 +212,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, BUG_ON(!ls->ls_file); if (nfsd4_layout_setlease(ls)) { + fput(ls->ls_file); put_nfs4_file(fp); kmem_cache_free(nfs4_layout_stateid_cache, ls); return NULL; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 61dfb33..9520271 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4396,9 +4396,9 @@ laundromat_main(struct work_struct *laundry) queue_delayed_work(laundry_wq, &nn->laundromat_work, t*HZ); } -static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp) +static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp) { - if (!fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle)) + if (!fh_match(&fhp->fh_handle, &stp->sc_file->fi_fhandle)) return nfserr_bad_stateid; return nfs_ok; } @@ -4601,9 +4601,6 @@ nfs4_check_olstateid(struct svc_fh *fhp, struct nfs4_ol_stateid *ols, int flags) { __be32 status; - status = nfs4_check_fh(fhp, ols); - if (status) - return status; status = nfsd4_check_openowner_confirmed(ols); if (status) return status; @@ -4690,6 +4687,9 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, status = nfserr_bad_stateid; break; } + if (status) + goto out; + status = nfs4_check_fh(fhp, s); done: if (!status && filpp) @@ -4798,7 +4798,7 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); if (status) return status; - return nfs4_check_fh(current_fh, stp); + return nfs4_check_fh(current_fh, &stp->st_stid); } /* diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 5463385..75e0563 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2143,6 +2143,7 @@ nfsd4_encode_aclname(struct xdr_stream *xdr, struct svc_rqst *rqstp, #define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \ FATTR4_WORD0_RDATTR_ERROR) #define WORD1_ABSENT_FS_ATTRS FATTR4_WORD1_MOUNTED_ON_FILEID +#define WORD2_ABSENT_FS_ATTRS 0 #ifdef CONFIG_NFSD_V4_SECURITY_LABEL static inline __be32 @@ -2171,7 +2172,7 @@ nfsd4_encode_security_label(struct xdr_stream *xdr, struct svc_rqst *rqstp, { return 0; } #endif -static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err) +static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *bmval2, u32 *rdattr_err) { /* As per referral draft: */ if (*bmval0 & ~WORD0_ABSENT_FS_ATTRS || @@ -2184,6 +2185,7 @@ static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err) } *bmval0 &= WORD0_ABSENT_FS_ATTRS; *bmval1 &= WORD1_ABSENT_FS_ATTRS; + *bmval2 &= WORD2_ABSENT_FS_ATTRS; return 0; } @@ -2246,8 +2248,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, BUG_ON(bmval2 & ~nfsd_suppattrs2(minorversion)); if (exp->ex_fslocs.migrated) { - BUG_ON(bmval[2]); - status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err); + status = fattr_handle_absent_fs(&bmval0, &bmval1, &bmval2, &rdattr_err); if (status) goto out; } @@ -2286,8 +2287,8 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, } #ifdef CONFIG_NFSD_V4_SECURITY_LABEL - if ((bmval[2] & FATTR4_WORD2_SECURITY_LABEL) || - bmval[0] & FATTR4_WORD0_SUPPORTED_ATTRS) { + if ((bmval2 & FATTR4_WORD2_SECURITY_LABEL) || + bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { err = security_inode_getsecctx(d_inode(dentry), &context, &contextlen); contextsupport = (err == 0); diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 9a20e51..aba4381 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -1369,7 +1369,6 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case NILFS_IOCTL_SYNC: case NILFS_IOCTL_RESIZE: case NILFS_IOCTL_SET_ALLOC_RANGE: - case FITRIM: break; default: return -ENOIOCTLCMD; diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 92e48c7..39ddcaf 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -412,16 +412,36 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, unsigned int flags) { struct fsnotify_mark *lmark, *mark; + LIST_HEAD(to_free); + /* + * We have to be really careful here. Anytime we drop mark_mutex, e.g. + * fsnotify_clear_marks_by_inode() can come and free marks. Even in our + * to_free list so we have to use mark_mutex even when accessing that + * list. And freeing mark requires us to drop mark_mutex. So we can + * reliably free only the first mark in the list. That's why we first + * move marks to free to to_free list in one go and then free marks in + * to_free list one by one. + */ mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) { - if (mark->flags & flags) { - fsnotify_get_mark(mark); - fsnotify_destroy_mark_locked(mark, group); - fsnotify_put_mark(mark); - } + if (mark->flags & flags) + list_move(&mark->g_list, &to_free); } mutex_unlock(&group->mark_mutex); + + while (1) { + mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); + if (list_empty(&to_free)) { + mutex_unlock(&group->mark_mutex); + break; + } + mark = list_first_entry(&to_free, struct fsnotify_mark, g_list); + fsnotify_get_mark(mark); + fsnotify_destroy_mark_locked(mark, group); + mutex_unlock(&group->mark_mutex); + fsnotify_put_mark(mark); + } } /* diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 1a35c61..0f5fd9d 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -685,7 +685,7 @@ static int ocfs2_direct_IO_zero_extend(struct ocfs2_super *osb, if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) { u64 s = i_size_read(inode); - sector_t sector = (p_cpos << (osb->s_clustersize_bits - 9)) + + sector_t sector = ((u64)p_cpos << (osb->s_clustersize_bits - 9)) + (do_div(s, osb->s_clustersize) >> 9); ret = blkdev_issue_zeroout(osb->sb->s_bdev, sector, @@ -910,7 +910,7 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN)); ret = blkdev_issue_zeroout(osb->sb->s_bdev, - p_cpos << (osb->s_clustersize_bits - 9), + (u64)p_cpos << (osb->s_clustersize_bits - 9), zero_len_head >> 9, GFP_NOFS, false); if (ret < 0) mlog_errno(ret); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 8b23aa2..23157e4 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -4025,9 +4025,13 @@ static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb) osb->dc_work_sequence = osb->dc_wake_sequence; processed = osb->blocked_lock_count; - while (processed) { - BUG_ON(list_empty(&osb->blocked_lock_list)); - + /* + * blocked lock processing in this loop might call iput which can + * remove items off osb->blocked_lock_list. Downconvert up to + * 'processed' number of locks, but stop short if we had some + * removed in ocfs2_mark_lockres_freeing when downconverting. + */ + while (processed && !list_empty(&osb->blocked_lock_list)) { lockres = list_entry(osb->blocked_lock_list.next, struct ocfs2_lock_res, l_blocked_list); list_del_init(&lockres->l_blocked_list); diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 53e6c40..3cb097c 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -980,7 +980,6 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) case OCFS2_IOC_GROUP_EXTEND: case OCFS2_IOC_GROUP_ADD: case OCFS2_IOC_GROUP_ADD64: - case FITRIM: break; case OCFS2_IOC_REFLINK: if (copy_from_user(&args, argp, sizeof(args))) diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index f140e3db..d9da5a4 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -343,6 +343,9 @@ struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags) struct path realpath; enum ovl_path_type type; + if (d_is_dir(dentry)) + return d_backing_inode(dentry); + type = ovl_path_real(dentry, &realpath); if (ovl_open_need_copy_up(file_flags, type, realpath.dentry)) { err = ovl_want_write(dentry); @@ -20,8 +20,6 @@ #define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED) #define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED) #define IS_MNT_LOCKED(m) ((m)->mnt.mnt_flags & MNT_LOCKED) -#define IS_MNT_LOCKED_AND_LAZY(m) \ - (((m)->mnt.mnt_flags & (MNT_LOCKED|MNT_SYNC_UMOUNT)) == MNT_LOCKED) #define CL_EXPIRE 0x01 #define CL_SLAVE 0x02 diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index d751fcb..1ade120 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -75,3 +75,9 @@ config PROC_PAGE_MONITOR config PROC_CHILDREN bool "Include /proc/<pid>/task/<tid>/children file" default n + help + Provides a fast way to retrieve first level children pids of a task. See + <file:Documentation/filesystems/proc.txt> for more information. + + Say Y if you are running any user-space software which takes benefit from + this interface. For example, rkt is such a piece of software. diff --git a/fs/proc/base.c b/fs/proc/base.c index 87782e8..aa50d1a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -243,6 +243,11 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, len1 = arg_end - arg_start; len2 = env_end - env_start; + /* Empty ARGV. */ + if (len1 == 0) { + rv = 0; + goto out_free_page; + } /* * Inherently racy -- command line shares address space * with code and data. diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 91a4e64..92e6726 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -92,7 +92,7 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen) roundup(sizeof(CORE_STR), 4)) + roundup(sizeof(struct elf_prstatus), 4) + roundup(sizeof(struct elf_prpsinfo), 4) + - roundup(sizeof(struct task_struct), 4); + roundup(arch_task_struct_size, 4); *elf_buflen = PAGE_ALIGN(*elf_buflen); return size + *elf_buflen; } @@ -415,7 +415,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff) /* set up the task structure */ notes[2].name = CORE_STR; notes[2].type = NT_TASKSTRUCT; - notes[2].datasz = sizeof(struct task_struct); + notes[2].datasz = arch_task_struct_size; notes[2].data = current; nhdr->p_filesz += notesize(¬es[2]); diff --git a/fs/signalfd.c b/fs/signalfd.c index 7e412ad..270221f 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -121,8 +121,9 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, * Other callers might not initialize the si_lsb field, * so check explicitly for the right codes here. */ - if (kinfo->si_code == BUS_MCEERR_AR || - kinfo->si_code == BUS_MCEERR_AO) + if (kinfo->si_signo == SIGBUS && + (kinfo->si_code == BUS_MCEERR_AR || + kinfo->si_code == BUS_MCEERR_AO)) err |= __put_user((short) kinfo->si_addr_lsb, &uinfo->ssi_addr_lsb); #endif diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 6afac3d..8d0b3ad 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1652,17 +1652,9 @@ static int udf_update_inode(struct inode *inode, int do_sync) iinfo->i_ext.i_data, inode->i_sb->s_blocksize - sizeof(struct unallocSpaceEntry)); use->descTag.tagIdent = cpu_to_le16(TAG_IDENT_USE); - use->descTag.tagLocation = - cpu_to_le32(iinfo->i_location.logicalBlockNum); - crclen = sizeof(struct unallocSpaceEntry) + - iinfo->i_lenAlloc - sizeof(struct tag); - use->descTag.descCRCLength = cpu_to_le16(crclen); - use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use + - sizeof(struct tag), - crclen)); - use->descTag.tagChecksum = udf_tag_checksum(&use->descTag); + crclen = sizeof(struct unallocSpaceEntry); - goto out; + goto finish; } if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET)) @@ -1782,6 +1774,8 @@ static int udf_update_inode(struct inode *inode, int do_sync) efe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_EFE); crclen = sizeof(struct extendedFileEntry); } + +finish: if (iinfo->i_strat4096) { fe->icbTag.strategyType = cpu_to_le16(4096); fe->icbTag.strategyParameter = cpu_to_le16(1); @@ -1791,7 +1785,9 @@ static int udf_update_inode(struct inode *inode, int do_sync) fe->icbTag.numEntries = cpu_to_le16(1); } - if (S_ISDIR(inode->i_mode)) + if (iinfo->i_use) + fe->icbTag.fileType = ICBTAG_FILE_TYPE_USE; + else if (S_ISDIR(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_DIRECTORY; else if (S_ISREG(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_REGULAR; @@ -1828,7 +1824,6 @@ static int udf_update_inode(struct inode *inode, int do_sync) crclen)); fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag); -out: set_buffer_uptodate(bh); unlock_buffer(bh); diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 20de88d..dd71403 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -159,11 +159,10 @@ xfs_attr3_rmt_write_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_buf_log_item *bip = bp->b_fspriv; + int blksize = mp->m_attr_geo->blksize; char *ptr; int len; xfs_daddr_t bno; - int blksize = mp->m_attr_geo->blksize; /* no verification of non-crc buffers */ if (!xfs_sb_version_hascrc(&mp->m_sb)) @@ -175,16 +174,22 @@ xfs_attr3_rmt_write_verify( ASSERT(len >= blksize); while (len > 0) { + struct xfs_attr3_rmt_hdr *rmt = (struct xfs_attr3_rmt_hdr *)ptr; + if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) { xfs_buf_ioerror(bp, -EFSCORRUPTED); xfs_verifier_error(bp); return; } - if (bip) { - struct xfs_attr3_rmt_hdr *rmt; - rmt = (struct xfs_attr3_rmt_hdr *)ptr; - rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn); + /* + * Ensure we aren't writing bogus LSNs to disk. See + * xfs_attr3_rmt_hdr_set() for the explanation. + */ + if (rmt->rm_lsn != cpu_to_be64(NULLCOMMITLSN)) { + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; } xfs_update_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF); @@ -221,6 +226,18 @@ xfs_attr3_rmt_hdr_set( rmt->rm_owner = cpu_to_be64(ino); rmt->rm_blkno = cpu_to_be64(bno); + /* + * Remote attribute blocks are written synchronously, so we don't + * have an LSN that we can stamp in them that makes any sense to log + * recovery. To ensure that log recovery handles overwrites of these + * blocks sanely (i.e. once they've been freed and reallocated as some + * other type of metadata) we need to ensure that the LSN has a value + * that tells log recovery to ignore the LSN and overwrite the buffer + * with whatever is in it's log. To do this, we use the magic + * NULLCOMMITLSN to indicate that the LSN is invalid. + */ + rmt->rm_lsn = cpu_to_be64(NULLCOMMITLSN); + return sizeof(struct xfs_attr3_rmt_hdr); } @@ -434,14 +451,21 @@ xfs_attr_rmtval_set( /* * Allocate a single extent, up to the size of the value. + * + * Note that we have to consider this a data allocation as we + * write the remote attribute without logging the contents. + * Hence we must ensure that we aren't using blocks that are on + * the busy list so that we don't overwrite blocks which have + * recently been freed but their transactions are not yet + * committed to disk. If we overwrite the contents of a busy + * extent and then crash then the block may not contain the + * correct metadata after log recovery occurs. */ xfs_bmap_init(args->flist, args->firstblock); nmap = 1; error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, - blkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - args->firstblock, args->total, &map, &nmap, - args->flist); + blkcnt, XFS_BMAPI_ATTRFORK, args->firstblock, + args->total, &map, &nmap, args->flist); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, &committed); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f0e8249..db4acc1 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1514,18 +1514,27 @@ xfs_filemap_fault( struct vm_area_struct *vma, struct vm_fault *vmf) { - struct xfs_inode *ip = XFS_I(file_inode(vma->vm_file)); + struct inode *inode = file_inode(vma->vm_file); int ret; - trace_xfs_filemap_fault(ip); + trace_xfs_filemap_fault(XFS_I(inode)); /* DAX can shortcut the normal fault path on write faults! */ - if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(VFS_I(ip))) + if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(inode)) return xfs_filemap_page_mkwrite(vma, vmf); - xfs_ilock(ip, XFS_MMAPLOCK_SHARED); - ret = filemap_fault(vma, vmf); - xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + if (IS_DAX(inode)) { + /* + * we do not want to trigger unwritten extent conversion on read + * faults - that is unnecessary overhead and would also require + * changes to xfs_get_blocks_direct() to map unwritten extent + * ioend for conversion on read-only mappings. + */ + ret = __dax_fault(vma, vmf, xfs_get_blocks_direct, NULL); + } else + ret = filemap_fault(vma, vmf); + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); return ret; } diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 01dd228..480ebba 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1886,9 +1886,14 @@ xlog_recover_get_buf_lsn( uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid; break; case XFS_ATTR3_RMT_MAGIC: - lsn = be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn); - uuid = &((struct xfs_attr3_rmt_hdr *)blk)->rm_uuid; - break; + /* + * Remote attr blocks are written synchronously, rather than + * being logged. That means they do not contain a valid LSN + * (i.e. transactionally ordered) in them, and hence any time we + * see a buffer to replay over the top of a remote attribute + * block we should simply do so. + */ + goto recover_immediately; case XFS_SB_MAGIC: lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn); uuid = &((struct xfs_dsb *)blk)->sb_uuid; |