diff options
author | Ingo Molnar <mingo@kernel.org> | 2016-09-15 08:24:53 +0200 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2016-09-15 08:24:53 +0200 |
commit | d4b80afbba49e968623330f1336da8c724da8aad (patch) | |
tree | a9478bd77d8b001a6a7119328d34e9666d7bfe93 /fs | |
parent | fcd709ef20a9d83bdb7524d27cd6719dac8690a0 (diff) | |
parent | 4cea8776571b18db7485930cb422faa739580c8c (diff) | |
download | op-kernel-dev-d4b80afbba49e968623330f1336da8c724da8aad.zip op-kernel-dev-d4b80afbba49e968623330f1336da8c724da8aad.tar.gz |
Merge branch 'linus' into x86/asm, to pick up recent fixes
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'fs')
79 files changed, 1286 insertions, 541 deletions
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 7f6aff3f..e5495f3 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -853,6 +853,7 @@ static int load_elf_binary(struct linux_binprm *bprm) current->flags |= PF_RANDOMIZE; setup_new_exec(bprm); + install_exec_creds(bprm); /* Do this so that we can load the interpreter, if need be. We will change some of these later */ @@ -1044,7 +1045,6 @@ static int load_elf_binary(struct linux_binprm *bprm) goto out; #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ - install_exec_creds(bprm); retval = create_elf_tables(bprm, &loc->elf_ex, load_addr, interp_load_addr); if (retval < 0) diff --git a/fs/block_dev.c b/fs/block_dev.c index c3cdde8..08ae993 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -249,7 +249,8 @@ struct super_block *freeze_bdev(struct block_device *bdev) * thaw_bdev drops it. */ sb = get_super(bdev); - drop_super(sb); + if (sb) + drop_super(sb); mutex_unlock(&bdev->bd_fsfreeze_mutex); return sb; } @@ -646,7 +647,7 @@ static struct dentry *bd_mount(struct file_system_type *fs_type, { struct dentry *dent; dent = mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC); - if (dent) + if (!IS_ERR(dent)) dent->d_sb->s_iflags |= SB_I_CGROUPWB; return dent; } diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 2b88439..455a6b2 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -589,6 +589,7 @@ static void __merge_refs(struct list_head *head, int mode) list_del(&ref2->list); kmem_cache_free(btrfs_prelim_ref_cache, ref2); + cond_resched(); } } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2fe8f89..33fe035 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -427,6 +427,7 @@ struct btrfs_space_info { struct list_head ro_bgs; struct list_head priority_tickets; struct list_head tickets; + u64 tickets_id; struct rw_semaphore groups_sem; /* for block groups in our same type */ @@ -1028,6 +1029,7 @@ struct btrfs_fs_info { struct btrfs_workqueue *qgroup_rescan_workers; struct completion qgroup_rescan_completion; struct btrfs_work qgroup_rescan_work; + bool qgroup_rescan_running; /* protected by qgroup_rescan_lock */ /* filesystem state */ unsigned long fs_state; @@ -1079,6 +1081,8 @@ struct btrfs_fs_info { struct list_head pinned_chunks; int creating_free_space_tree; + /* Used to record internally whether fs has been frozen */ + int fs_frozen; }; struct btrfs_subvolume_writers { @@ -2578,7 +2582,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 root_objectid, u64 owner, u64 offset, struct btrfs_key *ins); -int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes, +int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, int is_data, int delalloc); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index d9ddcfc..ac02e04 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -541,7 +541,6 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_head *head_ref = NULL; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_qgroup_extent_record *qexisting; int count_mod = 1; int must_insert_reserved = 0; @@ -606,10 +605,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, qrecord->num_bytes = num_bytes; qrecord->old_roots = NULL; - qexisting = btrfs_qgroup_insert_dirty_extent(fs_info, - delayed_refs, - qrecord); - if (qexisting) + if(btrfs_qgroup_insert_dirty_extent_nolock(fs_info, + delayed_refs, qrecord)) kfree(qrecord); } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 59febfb..54bc8c7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -559,8 +559,29 @@ static noinline int check_leaf(struct btrfs_root *root, u32 nritems = btrfs_header_nritems(leaf); int slot; - if (nritems == 0) + if (nritems == 0) { + struct btrfs_root *check_root; + + key.objectid = btrfs_header_owner(leaf); + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + check_root = btrfs_get_fs_root(root->fs_info, &key, false); + /* + * The only reason we also check NULL here is that during + * open_ctree() some roots has not yet been set up. + */ + if (!IS_ERR_OR_NULL(check_root)) { + /* if leaf is the root, then it's fine */ + if (leaf->start != + btrfs_root_bytenr(&check_root->root_item)) { + CORRUPT("non-root leaf's nritems is 0", + leaf, root, 0); + return -EIO; + } + } return 0; + } /* Check the 0 item */ if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) != @@ -612,6 +633,19 @@ static noinline int check_leaf(struct btrfs_root *root, return 0; } +static int check_node(struct btrfs_root *root, struct extent_buffer *node) +{ + unsigned long nr = btrfs_header_nritems(node); + + if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root)) { + btrfs_crit(root->fs_info, + "corrupt node: block %llu root %llu nritems %lu", + node->start, root->objectid, nr); + return -EIO; + } + return 0; +} + static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, u64 phy_offset, struct page *page, u64 start, u64 end, int mirror) @@ -682,6 +716,9 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, ret = -EIO; } + if (found_level > 0 && check_node(root, eb)) + ret = -EIO; + if (!ret) set_extent_buffer_uptodate(eb); err: @@ -1618,8 +1655,8 @@ fail: return ret; } -static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, - u64 root_id) +struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_id) { struct btrfs_root *root; @@ -2298,6 +2335,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) fs_info->quota_enabled = 0; fs_info->pending_quota_state = 0; fs_info->qgroup_ulist = NULL; + fs_info->qgroup_rescan_running = false; mutex_init(&fs_info->qgroup_rescan_lock); } @@ -2624,6 +2662,7 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->qgroup_op_seq, 0); atomic_set(&fs_info->reada_works_cnt, 0); atomic64_set(&fs_info->tree_mod_seq, 0); + fs_info->fs_frozen = 0; fs_info->sb = sb; fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; fs_info->metadata_ratio = 0; @@ -3739,8 +3778,15 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, if (btrfs_root_refs(&root->root_item) == 0) synchronize_srcu(&fs_info->subvol_srcu); - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { btrfs_free_log(NULL, root); + if (root->reloc_root) { + free_extent_buffer(root->reloc_root->node); + free_extent_buffer(root->reloc_root->commit_root); + btrfs_put_fs_root(root->reloc_root); + root->reloc_root = NULL; + } + } if (root->free_ino_pinned) __btrfs_remove_free_space_cache(root->free_ino_pinned); @@ -3851,7 +3897,7 @@ void close_ctree(struct btrfs_root *root) smp_mb(); /* wait for the qgroup rescan worker to stop */ - btrfs_qgroup_wait_for_completion(fs_info); + btrfs_qgroup_wait_for_completion(fs_info, false); /* wait for the uuid_scan task to finish */ down(&fs_info->uuid_tree_rescan_sem); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index b3207a0e..f19a982 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -68,6 +68,8 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info, struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, struct btrfs_key *location); int btrfs_init_fs_root(struct btrfs_root *root); +struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_id); int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 61b494e..38c2df8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -60,21 +60,6 @@ enum { CHUNK_ALLOC_FORCE = 2, }; -/* - * Control how reservations are dealt with. - * - * RESERVE_FREE - freeing a reservation. - * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for - * ENOSPC accounting - * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update - * bytes_may_use as the ENOSPC accounting is done elsewhere - */ -enum { - RESERVE_FREE = 0, - RESERVE_ALLOC = 1, - RESERVE_ALLOC_NO_ACCOUNT = 2, -}; - static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc); @@ -104,9 +89,10 @@ static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int dump_block_groups); -static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve, - int delalloc); +static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 ram_bytes, u64 num_bytes, int delalloc); +static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int delalloc); static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); int btrfs_pin_extent(struct btrfs_root *root, @@ -3501,7 +3487,6 @@ again: dcs = BTRFS_DC_SETUP; else if (ret == -ENOSPC) set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); - btrfs_free_reserved_data_space(inode, 0, num_pages); out_put: iput(inode); @@ -4472,6 +4457,15 @@ void check_system_chunk(struct btrfs_trans_handle *trans, } } +/* + * If force is CHUNK_ALLOC_FORCE: + * - return 1 if it successfully allocates a chunk, + * - return errors including -ENOSPC otherwise. + * If force is NOT CHUNK_ALLOC_FORCE: + * - return 0 if it doesn't need to allocate a new chunk, + * - return 1 if it successfully allocates a chunk, + * - return errors including -ENOSPC otherwise. + */ static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 flags, int force) { @@ -4882,7 +4876,7 @@ static int flush_space(struct btrfs_root *root, btrfs_get_alloc_profile(root, 0), CHUNK_ALLOC_NO_FORCE); btrfs_end_transaction(trans, root); - if (ret == -ENOSPC) + if (ret > 0 || ret == -ENOSPC) ret = 0; break; case COMMIT_TRANS: @@ -4907,11 +4901,6 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, u64 expected; u64 to_reclaim = 0; - to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); - if (can_overcommit(root, space_info, to_reclaim, - BTRFS_RESERVE_FLUSH_ALL)) - return 0; - list_for_each_entry(ticket, &space_info->tickets, list) to_reclaim += ticket->bytes; list_for_each_entry(ticket, &space_info->priority_tickets, list) @@ -4919,6 +4908,11 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, if (to_reclaim) return to_reclaim; + to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); + if (can_overcommit(root, space_info, to_reclaim, + BTRFS_RESERVE_FLUSH_ALL)) + return 0; + used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly + space_info->bytes_may_use; @@ -4972,12 +4966,12 @@ static void wake_all_tickets(struct list_head *head) */ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) { - struct reserve_ticket *last_ticket = NULL; struct btrfs_fs_info *fs_info; struct btrfs_space_info *space_info; u64 to_reclaim; int flush_state; int commit_cycles = 0; + u64 last_tickets_id; fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); @@ -4990,8 +4984,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) spin_unlock(&space_info->lock); return; } - last_ticket = list_first_entry(&space_info->tickets, - struct reserve_ticket, list); + last_tickets_id = space_info->tickets_id; spin_unlock(&space_info->lock); flush_state = FLUSH_DELAYED_ITEMS_NR; @@ -5011,10 +5004,10 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) space_info); ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); - if (last_ticket == ticket) { + if (last_tickets_id == space_info->tickets_id) { flush_state++; } else { - last_ticket = ticket; + last_tickets_id = space_info->tickets_id; flush_state = FLUSH_DELAYED_ITEMS_NR; if (commit_cycles) commit_cycles--; @@ -5390,6 +5383,7 @@ again: list_del_init(&ticket->list); num_bytes -= ticket->bytes; ticket->bytes = 0; + space_info->tickets_id++; wake_up(&ticket->wait); } else { ticket->bytes -= num_bytes; @@ -5432,6 +5426,7 @@ again: num_bytes -= ticket->bytes; space_info->bytes_may_use += ticket->bytes; ticket->bytes = 0; + space_info->tickets_id++; wake_up(&ticket->wait); } else { trace_btrfs_space_reservation(fs_info, "space_info", @@ -6497,19 +6492,15 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) } /** - * btrfs_update_reserved_bytes - update the block_group and space info counters + * btrfs_add_reserved_bytes - update the block_group and space info counters * @cache: The cache we are manipulating + * @ram_bytes: The number of bytes of file content, and will be same to + * @num_bytes except for the compress path. * @num_bytes: The number of bytes in question - * @reserve: One of the reservation enums * @delalloc: The blocks are allocated for the delalloc write * - * This is called by the allocator when it reserves space, or by somebody who is - * freeing space that was never actually used on disk. For example if you - * reserve some space for a new leaf in transaction A and before transaction A - * commits you free that leaf, you call this with reserve set to 0 in order to - * clear the reservation. - * - * Metadata reservations should be called with RESERVE_ALLOC so we do the proper + * This is called by the allocator when it reserves space. Metadata + * reservations should be called with RESERVE_ALLOC so we do the proper * ENOSPC accounting. For data we handle the reservation through clearing the * delalloc bits in the io_tree. We have to do this since we could end up * allocating less disk space for the amount of data we have reserved in the @@ -6519,44 +6510,63 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) * make the reservation and return -EAGAIN, otherwise this function always * succeeds. */ -static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve, int delalloc) +static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 ram_bytes, u64 num_bytes, int delalloc) { struct btrfs_space_info *space_info = cache->space_info; int ret = 0; spin_lock(&space_info->lock); spin_lock(&cache->lock); - if (reserve != RESERVE_FREE) { - if (cache->ro) { - ret = -EAGAIN; - } else { - cache->reserved += num_bytes; - space_info->bytes_reserved += num_bytes; - if (reserve == RESERVE_ALLOC) { - trace_btrfs_space_reservation(cache->fs_info, - "space_info", space_info->flags, - num_bytes, 0); - space_info->bytes_may_use -= num_bytes; - } - - if (delalloc) - cache->delalloc_bytes += num_bytes; - } + if (cache->ro) { + ret = -EAGAIN; } else { - if (cache->ro) - space_info->bytes_readonly += num_bytes; - cache->reserved -= num_bytes; - space_info->bytes_reserved -= num_bytes; + cache->reserved += num_bytes; + space_info->bytes_reserved += num_bytes; + trace_btrfs_space_reservation(cache->fs_info, + "space_info", space_info->flags, + ram_bytes, 0); + space_info->bytes_may_use -= ram_bytes; if (delalloc) - cache->delalloc_bytes -= num_bytes; + cache->delalloc_bytes += num_bytes; } spin_unlock(&cache->lock); spin_unlock(&space_info->lock); return ret; } +/** + * btrfs_free_reserved_bytes - update the block_group and space info counters + * @cache: The cache we are manipulating + * @num_bytes: The number of bytes in question + * @delalloc: The blocks are allocated for the delalloc write + * + * This is called by somebody who is freeing space that was never actually used + * on disk. For example if you reserve some space for a new leaf in transaction + * A and before transaction A commits you free that leaf, you call this with + * reserve set to 0 in order to clear the reservation. + */ + +static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int delalloc) +{ + struct btrfs_space_info *space_info = cache->space_info; + int ret = 0; + + spin_lock(&space_info->lock); + spin_lock(&cache->lock); + if (cache->ro) + space_info->bytes_readonly += num_bytes; + cache->reserved -= num_bytes; + space_info->bytes_reserved -= num_bytes; + + if (delalloc) + cache->delalloc_bytes -= num_bytes; + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); + return ret; +} void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -7191,7 +7201,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(cache, buf->start, buf->len); - btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0); + btrfs_free_reserved_bytes(cache, buf->len, 0); btrfs_put_block_group(cache); trace_btrfs_reserved_extent_free(root, buf->start, buf->len); pin = 0; @@ -7416,9 +7426,9 @@ btrfs_release_block_group(struct btrfs_block_group_cache *cache, * the free space extent currently. */ static noinline int find_free_extent(struct btrfs_root *orig_root, - u64 num_bytes, u64 empty_size, - u64 hint_byte, struct btrfs_key *ins, - u64 flags, int delalloc) + u64 ram_bytes, u64 num_bytes, u64 empty_size, + u64 hint_byte, struct btrfs_key *ins, + u64 flags, int delalloc) { int ret = 0; struct btrfs_root *root = orig_root->fs_info->extent_root; @@ -7430,8 +7440,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, struct btrfs_space_info *space_info; int loop = 0; int index = __get_raid_index(flags); - int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ? - RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; bool failed_cluster_refill = false; bool failed_alloc = false; bool use_cluster = true; @@ -7763,8 +7771,8 @@ checks: search_start - offset); BUG_ON(offset > search_start); - ret = btrfs_update_reserved_bytes(block_group, num_bytes, - alloc_type, delalloc); + ret = btrfs_add_reserved_bytes(block_group, ram_bytes, + num_bytes, delalloc); if (ret == -EAGAIN) { btrfs_add_free_space(block_group, offset, num_bytes); goto loop; @@ -7936,7 +7944,7 @@ again: up_read(&info->groups_sem); } -int btrfs_reserve_extent(struct btrfs_root *root, +int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, int is_data, int delalloc) @@ -7948,8 +7956,8 @@ int btrfs_reserve_extent(struct btrfs_root *root, flags = btrfs_get_alloc_profile(root, is_data); again: WARN_ON(num_bytes < root->sectorsize); - ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, - flags, delalloc); + ret = find_free_extent(root, ram_bytes, num_bytes, empty_size, + hint_byte, ins, flags, delalloc); if (!ret && !is_data) { btrfs_dec_block_group_reservations(root->fs_info, ins->objectid); @@ -7958,6 +7966,7 @@ again: num_bytes = min(num_bytes >> 1, ins->offset); num_bytes = round_down(num_bytes, root->sectorsize); num_bytes = max(num_bytes, min_alloc_size); + ram_bytes = num_bytes; if (num_bytes == min_alloc_size) final_tried = true; goto again; @@ -7995,7 +8004,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root, if (btrfs_test_opt(root->fs_info, DISCARD)) ret = btrfs_discard_extent(root, start, len, NULL); btrfs_add_free_space(cache, start, len); - btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); + btrfs_free_reserved_bytes(cache, len, delalloc); trace_btrfs_reserved_extent_free(root, start, len); } @@ -8208,6 +8217,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, { int ret; struct btrfs_block_group_cache *block_group; + struct btrfs_space_info *space_info; /* * Mixed block groups will exclude before processing the log so we only @@ -8223,9 +8233,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, if (!block_group) return -EINVAL; - ret = btrfs_update_reserved_bytes(block_group, ins->offset, - RESERVE_ALLOC_NO_ACCOUNT, 0); - BUG_ON(ret); /* logic error */ + space_info = block_group->space_info; + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + space_info->bytes_reserved += ins->offset; + block_group->reserved += ins->offset; + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 0, owner, offset, ins, 1); btrfs_put_block_group(block_group); @@ -8368,7 +8383,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, if (IS_ERR(block_rsv)) return ERR_CAST(block_rsv); - ret = btrfs_reserve_extent(root, blocksize, blocksize, + ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize, empty_size, hint, &ins, 0, 0); if (ret) goto out_unuse; @@ -8521,35 +8536,6 @@ reada: wc->reada_slot = slot; } -/* - * These may not be seen by the usual inc/dec ref code so we have to - * add them here. - */ -static int record_one_subtree_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes) -{ - struct btrfs_qgroup_extent_record *qrecord; - struct btrfs_delayed_ref_root *delayed_refs; - - qrecord = kmalloc(sizeof(*qrecord), GFP_NOFS); - if (!qrecord) - return -ENOMEM; - - qrecord->bytenr = bytenr; - qrecord->num_bytes = num_bytes; - qrecord->old_roots = NULL; - - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - if (btrfs_qgroup_insert_dirty_extent(trans->fs_info, - delayed_refs, qrecord)) - kfree(qrecord); - spin_unlock(&delayed_refs->lock); - - return 0; -} - static int account_leaf_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb) @@ -8583,7 +8569,8 @@ static int account_leaf_items(struct btrfs_trans_handle *trans, num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); - ret = record_one_subtree_extent(trans, root, bytenr, num_bytes); + ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info, + bytenr, num_bytes, GFP_NOFS); if (ret) return ret; } @@ -8732,8 +8719,9 @@ walk_down: btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); path->locks[level] = BTRFS_READ_LOCK_BLOCKING; - ret = record_one_subtree_extent(trans, root, child_bytenr, - root->nodesize); + ret = btrfs_qgroup_insert_dirty_extent(trans, + root->fs_info, child_bytenr, + root->nodesize, GFP_NOFS); if (ret) goto out; } @@ -9906,6 +9894,7 @@ static int find_first_block_group(struct btrfs_root *root, } else { ret = 0; } + free_extent_map(em); goto out; } path->slots[0]++; @@ -9942,6 +9931,7 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info) block_group->iref = 0; block_group->inode = NULL; spin_unlock(&block_group->lock); + ASSERT(block_group->io_ctl.inode == NULL); iput(inode); last = block_group->key.objectid + block_group->key.offset; btrfs_put_block_group(block_group); @@ -9999,6 +9989,10 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) free_excluded_extents(info->extent_root, block_group); btrfs_remove_free_space_cache(block_group); + ASSERT(list_empty(&block_group->dirty_list)); + ASSERT(list_empty(&block_group->io_list)); + ASSERT(list_empty(&block_group->bg_list)); + ASSERT(atomic_read(&block_group->count) == 1); btrfs_put_block_group(block_group); spin_lock(&info->block_group_cache_lock); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index bc2729a..28cd88f 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -20,6 +20,7 @@ #define EXTENT_DAMAGED (1U << 14) #define EXTENT_NORESERVE (1U << 15) #define EXTENT_QGROUP_RESERVED (1U << 16) +#define EXTENT_CLEAR_DATA_RESV (1U << 17) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 5842423..fea31a4 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2070,7 +2070,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } trans->sync = true; - btrfs_init_log_ctx(&ctx); + btrfs_init_log_ctx(&ctx, inode); ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx); if (ret < 0) { @@ -2675,6 +2675,7 @@ static long btrfs_fallocate(struct file *file, int mode, alloc_start = round_down(offset, blocksize); alloc_end = round_up(offset + len, blocksize); + cur_offset = alloc_start; /* Make sure we aren't being give some crap mode */ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) @@ -2767,7 +2768,6 @@ static long btrfs_fallocate(struct file *file, int mode, /* First, check if we exceed the qgroup limit */ INIT_LIST_HEAD(&reserve_list); - cur_offset = alloc_start; while (1) { em = btrfs_get_extent(inode, NULL, 0, cur_offset, alloc_end - cur_offset, 0); @@ -2794,6 +2794,14 @@ static long btrfs_fallocate(struct file *file, int mode, last_byte - cur_offset); if (ret < 0) break; + } else { + /* + * Do not need to reserve unwritten extent for this + * range, free reserved data space first, otherwise + * it'll result in false ENOSPC error. + */ + btrfs_free_reserved_data_space(inode, cur_offset, + last_byte - cur_offset); } free_extent_map(em); cur_offset = last_byte; @@ -2811,6 +2819,9 @@ static long btrfs_fallocate(struct file *file, int mode, range->start, range->len, 1 << inode->i_blkbits, offset + len, &alloc_hint); + else + btrfs_free_reserved_data_space(inode, range->start, + range->len); list_del(&range->list); kfree(range); } @@ -2845,18 +2856,11 @@ out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state, GFP_KERNEL); out: - /* - * As we waited the extent range, the data_rsv_map must be empty - * in the range, as written data range will be released from it. - * And for prealloacted extent, it will also be released when - * its metadata is written. - * So this is completely used as cleanup. - */ - btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start); inode_unlock(inode); /* Let go of our reservation. */ - btrfs_free_reserved_data_space(inode, alloc_start, - alloc_end - alloc_start); + if (ret != 0) + btrfs_free_reserved_data_space(inode, alloc_start, + alloc_end - cur_offset); return ret; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index aa6faba..359ee86 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -495,10 +495,9 @@ again: ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, prealloc, prealloc, &alloc_hint); if (ret) { - btrfs_delalloc_release_space(inode, 0, prealloc); + btrfs_delalloc_release_metadata(inode, prealloc); goto out_put; } - btrfs_free_reserved_data_space(inode, 0, prealloc); ret = btrfs_write_out_ino_cache(root, trans, path, inode); out_put: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 08dfc57..e6811c42 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -566,6 +566,8 @@ cont: PAGE_SET_WRITEBACK | page_error_op | PAGE_END_WRITEBACK); + btrfs_free_reserved_data_space_noquota(inode, start, + end - start + 1); goto free_pages_out; } } @@ -742,7 +744,7 @@ retry: lock_extent(io_tree, async_extent->start, async_extent->start + async_extent->ram_size - 1); - ret = btrfs_reserve_extent(root, + ret = btrfs_reserve_extent(root, async_extent->ram_size, async_extent->compressed_size, async_extent->compressed_size, 0, alloc_hint, &ins, 1, 1); @@ -969,7 +971,8 @@ static noinline int cow_file_range(struct inode *inode, EXTENT_DEFRAG, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); - + btrfs_free_reserved_data_space_noquota(inode, start, + end - start + 1); *nr_written = *nr_written + (end - start + PAGE_SIZE) / PAGE_SIZE; *page_started = 1; @@ -989,7 +992,7 @@ static noinline int cow_file_range(struct inode *inode, unsigned long op; cur_alloc_size = disk_num_bytes; - ret = btrfs_reserve_extent(root, cur_alloc_size, + ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, root->sectorsize, 0, alloc_hint, &ins, 1, 1); if (ret < 0) @@ -1489,8 +1492,10 @@ out_check: extent_clear_unlock_delalloc(inode, cur_offset, cur_offset + num_bytes - 1, locked_page, EXTENT_LOCKED | - EXTENT_DELALLOC, PAGE_UNLOCK | - PAGE_SET_PRIVATE2); + EXTENT_DELALLOC | + EXTENT_CLEAR_DATA_RESV, + PAGE_UNLOCK | PAGE_SET_PRIVATE2); + if (!nolock && nocow) btrfs_end_write_no_snapshoting(root); cur_offset = extent_end; @@ -1807,7 +1812,9 @@ static void btrfs_clear_bit_hook(struct inode *inode, return; if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID - && do_list && !(state->state & EXTENT_NORESERVE)) + && do_list && !(state->state & EXTENT_NORESERVE) + && (*bits & (EXTENT_DO_ACCOUNTING | + EXTENT_CLEAR_DATA_RESV))) btrfs_free_reserved_data_space_noquota(inode, state->start, len); @@ -7251,7 +7258,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, int ret; alloc_hint = get_extent_allocation_hint(inode, start, len); - ret = btrfs_reserve_extent(root, len, root->sectorsize, 0, + ret = btrfs_reserve_extent(root, len, len, root->sectorsize, 0, alloc_hint, &ins, 1, 1); if (ret) return ERR_PTR(ret); @@ -7751,6 +7758,13 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, ret = PTR_ERR(em2); goto unlock_err; } + /* + * For inode marked NODATACOW or extent marked PREALLOC, + * use the existing or preallocated extent, so does not + * need to adjust btrfs_space_info's bytes_may_use. + */ + btrfs_free_reserved_data_space_noquota(inode, + start, len); goto unlock; } } @@ -7785,7 +7799,6 @@ unlock: i_size_write(inode, start + len); adjust_dio_outstanding_extents(inode, dio_data, len); - btrfs_free_reserved_data_space(inode, start, len); WARN_ON(dio_data->reserve < len); dio_data->reserve -= len; dio_data->unsubmitted_oe_range_end = start + len; @@ -10306,6 +10319,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, u64 last_alloc = (u64)-1; int ret = 0; bool own_trans = true; + u64 end = start + num_bytes - 1; if (trans) own_trans = false; @@ -10327,8 +10341,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, * sized chunks. */ cur_bytes = min(cur_bytes, last_alloc); - ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0, - *alloc_hint, &ins, 1, 0); + ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes, + min_size, 0, *alloc_hint, &ins, 1, 0); if (ret) { if (own_trans) btrfs_end_transaction(trans, root); @@ -10414,6 +10428,9 @@ next: if (own_trans) btrfs_end_transaction(trans, root); } + if (cur_offset < end) + btrfs_free_reserved_data_space(inode, cur_offset, + end - cur_offset + 1); return ret; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 14ed1e9..b2a2da5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -5084,7 +5084,7 @@ static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - return btrfs_qgroup_wait_for_completion(root->fs_info); + return btrfs_qgroup_wait_for_completion(root->fs_info, true); } static long _btrfs_ioctl_set_received_subvol(struct file *file, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 93ee1c1..8db2e29 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -995,7 +995,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, goto out; fs_info->quota_enabled = 0; fs_info->pending_quota_state = 0; - btrfs_qgroup_wait_for_completion(fs_info); + btrfs_qgroup_wait_for_completion(fs_info, false); spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; fs_info->quota_root = NULL; @@ -1453,10 +1453,9 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, return ret; } -struct btrfs_qgroup_extent_record * -btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_qgroup_extent_record *record) +int btrfs_qgroup_insert_dirty_extent_nolock(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record) { struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node; struct rb_node *parent_node = NULL; @@ -1475,12 +1474,42 @@ btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info, else if (bytenr > entry->bytenr) p = &(*p)->rb_right; else - return entry; + return 1; } rb_link_node(&record->node, parent_node, p); rb_insert_color(&record->node, &delayed_refs->dirty_extent_root); - return NULL; + return 0; +} + +int btrfs_qgroup_insert_dirty_extent(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, + gfp_t gfp_flag) +{ + struct btrfs_qgroup_extent_record *record; + struct btrfs_delayed_ref_root *delayed_refs; + int ret; + + if (!fs_info->quota_enabled || bytenr == 0 || num_bytes == 0) + return 0; + if (WARN_ON(trans == NULL)) + return -EINVAL; + record = kmalloc(sizeof(*record), gfp_flag); + if (!record) + return -ENOMEM; + + delayed_refs = &trans->transaction->delayed_refs; + record->bytenr = bytenr; + record->num_bytes = num_bytes; + record->old_roots = NULL; + + spin_lock(&delayed_refs->lock); + ret = btrfs_qgroup_insert_dirty_extent_nolock(fs_info, delayed_refs, + record); + spin_unlock(&delayed_refs->lock); + if (ret > 0) + kfree(record); + return 0; } #define UPDATE_NEW 0 @@ -2303,6 +2332,10 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) int err = -ENOMEM; int ret = 0; + mutex_lock(&fs_info->qgroup_rescan_lock); + fs_info->qgroup_rescan_running = true; + mutex_unlock(&fs_info->qgroup_rescan_lock); + path = btrfs_alloc_path(); if (!path) goto out; @@ -2369,6 +2402,9 @@ out: } done: + mutex_lock(&fs_info->qgroup_rescan_lock); + fs_info->qgroup_rescan_running = false; + mutex_unlock(&fs_info->qgroup_rescan_lock); complete_all(&fs_info->qgroup_rescan_completion); } @@ -2487,20 +2523,26 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) return 0; } -int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info) +int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, + bool interruptible) { int running; int ret = 0; mutex_lock(&fs_info->qgroup_rescan_lock); spin_lock(&fs_info->qgroup_lock); - running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN; + running = fs_info->qgroup_rescan_running; spin_unlock(&fs_info->qgroup_lock); mutex_unlock(&fs_info->qgroup_rescan_lock); - if (running) + if (!running) + return 0; + + if (interruptible) ret = wait_for_completion_interruptible( &fs_info->qgroup_rescan_completion); + else + wait_for_completion(&fs_info->qgroup_rescan_completion); return ret; } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 710887c..1bc64c8 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -46,7 +46,8 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info); -int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info); +int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, + bool interruptible); int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 src, u64 dst); int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, @@ -63,10 +64,35 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); struct btrfs_delayed_extent_op; int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); -struct btrfs_qgroup_extent_record * -btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_qgroup_extent_record *record); +/* + * Insert one dirty extent record into @delayed_refs, informing qgroup to + * account that extent at commit trans time. + * + * No lock version, caller must acquire delayed ref lock and allocate memory. + * + * Return 0 for success insert + * Return >0 for existing record, caller can free @record safely. + * Error is not possible + */ +int btrfs_qgroup_insert_dirty_extent_nolock( + struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record); + +/* + * Insert one dirty extent record into @delayed_refs, informing qgroup to + * account that extent at commit trans time. + * + * Better encapsulated version. + * + * Return 0 if the operation is done. + * Return <0 for error, like memory allocation failure or invalid parameter + * (NULL trans) + */ +int btrfs_qgroup_insert_dirty_extent(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, + gfp_t gfp_flag); + int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index b26a5ae..c0c13dc 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -31,6 +31,7 @@ #include "async-thread.h" #include "free-space-cache.h" #include "inode-map.h" +#include "qgroup.h" /* * backref_node, mapping_node and tree_block start with this @@ -3037,15 +3038,19 @@ int prealloc_file_extent_cluster(struct inode *inode, u64 num_bytes; int nr = 0; int ret = 0; + u64 prealloc_start = cluster->start - offset; + u64 prealloc_end = cluster->end - offset; + u64 cur_offset; BUG_ON(cluster->start != cluster->boundary[0]); inode_lock(inode); - ret = btrfs_check_data_free_space(inode, cluster->start, - cluster->end + 1 - cluster->start); + ret = btrfs_check_data_free_space(inode, prealloc_start, + prealloc_end + 1 - prealloc_start); if (ret) goto out; + cur_offset = prealloc_start; while (nr < cluster->nr) { start = cluster->boundary[nr] - offset; if (nr + 1 < cluster->nr) @@ -3055,16 +3060,21 @@ int prealloc_file_extent_cluster(struct inode *inode, lock_extent(&BTRFS_I(inode)->io_tree, start, end); num_bytes = end + 1 - start; + if (cur_offset < start) + btrfs_free_reserved_data_space(inode, cur_offset, + start - cur_offset); ret = btrfs_prealloc_file_range(inode, 0, start, num_bytes, num_bytes, end + 1, &alloc_hint); + cur_offset = end + 1; unlock_extent(&BTRFS_I(inode)->io_tree, start, end); if (ret) break; nr++; } - btrfs_free_reserved_data_space(inode, cluster->start, - cluster->end + 1 - cluster->start); + if (cur_offset < prealloc_end) + btrfs_free_reserved_data_space(inode, cur_offset, + prealloc_end + 1 - cur_offset); out: inode_unlock(inode); return ret; @@ -3916,6 +3926,90 @@ int prepare_to_relocate(struct reloc_control *rc) return 0; } +/* + * Qgroup fixer for data chunk relocation. + * The data relocation is done in the following steps + * 1) Copy data extents into data reloc tree + * 2) Create tree reloc tree(special snapshot) for related subvolumes + * 3) Modify file extents in tree reloc tree + * 4) Merge tree reloc tree with original fs tree, by swapping tree blocks + * + * The problem is, data and tree reloc tree are not accounted to qgroup, + * and 4) will only info qgroup to track tree blocks change, not file extents + * in the tree blocks. + * + * The good news is, related data extents are all in data reloc tree, so we + * only need to info qgroup to track all file extents in data reloc tree + * before commit trans. + */ +static int qgroup_fix_relocated_data_extents(struct btrfs_trans_handle *trans, + struct reloc_control *rc) +{ + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; + struct inode *inode = rc->data_inode; + struct btrfs_root *data_reloc_root = BTRFS_I(inode)->root; + struct btrfs_path *path; + struct btrfs_key key; + int ret = 0; + + if (!fs_info->quota_enabled) + return 0; + + /* + * Only for stage where we update data pointers the qgroup fix is + * valid. + * For MOVING_DATA stage, we will miss the timing of swapping tree + * blocks, and won't fix it. + */ + if (!(rc->stage == UPDATE_DATA_PTRS && rc->extents_found)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, data_reloc_root, &key, path, 0, 0); + if (ret < 0) + goto out; + + lock_extent(&BTRFS_I(inode)->io_tree, 0, (u64)-1); + while (1) { + struct btrfs_file_extent_item *fi; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid > btrfs_ino(inode)) + break; + if (key.type != BTRFS_EXTENT_DATA_KEY) + goto next; + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(path->nodes[0], fi) != + BTRFS_FILE_EXTENT_REG) + goto next; + ret = btrfs_qgroup_insert_dirty_extent(trans, fs_info, + btrfs_file_extent_disk_bytenr(path->nodes[0], fi), + btrfs_file_extent_disk_num_bytes(path->nodes[0], fi), + GFP_NOFS); + if (ret < 0) + break; +next: + ret = btrfs_next_item(data_reloc_root, path); + if (ret < 0) + break; + if (ret > 0) { + ret = 0; + break; + } + } + unlock_extent(&BTRFS_I(inode)->io_tree, 0 , (u64)-1); +out: + btrfs_free_path(path); + return ret; +} + static noinline_for_stack int relocate_block_group(struct reloc_control *rc) { struct rb_root blocks = RB_ROOT; @@ -4102,10 +4196,18 @@ restart: /* get rid of pinned extents */ trans = btrfs_join_transaction(rc->extent_root); - if (IS_ERR(trans)) + if (IS_ERR(trans)) { err = PTR_ERR(trans); - else - btrfs_commit_transaction(trans, rc->extent_root); + goto out_free; + } + ret = qgroup_fix_relocated_data_extents(trans, rc); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + if (!err) + err = ret; + goto out_free; + } + btrfs_commit_transaction(trans, rc->extent_root); out_free: btrfs_free_block_rsv(rc->extent_root, rc->block_rsv); btrfs_free_path(path); @@ -4468,10 +4570,16 @@ int btrfs_recover_relocation(struct btrfs_root *root) unset_reloc_control(rc); trans = btrfs_join_transaction(rc->extent_root); - if (IS_ERR(trans)) + if (IS_ERR(trans)) { err = PTR_ERR(trans); - else - err = btrfs_commit_transaction(trans, rc->extent_root); + goto out_free; + } + err = qgroup_fix_relocated_data_extents(trans, rc); + if (err < 0) { + btrfs_abort_transaction(trans, err); + goto out_free; + } + err = btrfs_commit_transaction(trans, rc->extent_root); out_free: kfree(rc); out: diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 7fd7e18..0912960 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -272,6 +272,23 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) root_key.objectid = key.offset; key.offset++; + /* + * The root might have been inserted already, as before we look + * for orphan roots, log replay might have happened, which + * triggers a transaction commit and qgroup accounting, which + * in turn reads and inserts fs roots while doing backref + * walking. + */ + root = btrfs_lookup_fs_root(tree_root->fs_info, + root_key.objectid); + if (root) { + WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, + &root->state)); + if (btrfs_root_refs(&root->root_item) == 0) + btrfs_add_dead_root(root); + continue; + } + root = btrfs_read_fs_root(tree_root, &root_key); err = PTR_ERR_OR_ZERO(root); if (err && err != -ENOENT) { @@ -310,16 +327,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); err = btrfs_insert_fs_root(root->fs_info, root); - /* - * The root might have been inserted already, as before we look - * for orphan roots, log replay might have happened, which - * triggers a transaction commit and qgroup accounting, which - * in turn reads and inserts fs roots while doing backref - * walking. - */ - if (err == -EEXIST) - err = 0; if (err) { + BUG_ON(err == -EEXIST); btrfs_free_fs_root(root); break; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index efe129f..a87675f 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4268,10 +4268,12 @@ static int process_all_refs(struct send_ctx *sctx, } btrfs_release_path(path); + /* + * We don't actually care about pending_move as we are simply + * re-creating this inode and will be rename'ing it into place once we + * rename the parent directory. + */ ret = process_recorded_refs(sctx, &pending_move); - /* Only applicable to an incremental send. */ - ASSERT(pending_move == 0); - out: btrfs_free_path(path); return ret; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 864ce33..4071fe2 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2241,6 +2241,13 @@ static int btrfs_freeze(struct super_block *sb) struct btrfs_trans_handle *trans; struct btrfs_root *root = btrfs_sb(sb)->tree_root; + root->fs_info->fs_frozen = 1; + /* + * We don't need a barrier here, we'll wait for any transaction that + * could be in progress on other threads (and do delayed iputs that + * we want to avoid on a frozen filesystem), or do the commit + * ourselves. + */ trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { /* no transaction, don't bother */ @@ -2251,6 +2258,14 @@ static int btrfs_freeze(struct super_block *sb) return btrfs_commit_transaction(trans, root); } +static int btrfs_unfreeze(struct super_block *sb) +{ + struct btrfs_root *root = btrfs_sb(sb)->tree_root; + + root->fs_info->fs_frozen = 0; + return 0; +} + static int btrfs_show_devname(struct seq_file *m, struct dentry *root) { struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb); @@ -2299,6 +2314,7 @@ static const struct super_operations btrfs_super_ops = { .statfs = btrfs_statfs, .remount_fs = btrfs_remount, .freeze_fs = btrfs_freeze, + .unfreeze_fs = btrfs_unfreeze, }; static const struct file_operations btrfs_ctl_fops = { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 9cca0a7..95d4191 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2278,8 +2278,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_trans_handle_cachep, trans); + /* + * If fs has been frozen, we can not handle delayed iputs, otherwise + * it'll result in deadlock about SB_FREEZE_FS. + */ if (current != root->fs_info->transaction_kthread && - current != root->fs_info->cleaner_kthread) + current != root->fs_info->cleaner_kthread && + !root->fs_info->fs_frozen) btrfs_run_delayed_iputs(root); return ret; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index fff3f3e..ef9c55b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -27,6 +27,7 @@ #include "backref.h" #include "hash.h" #include "compression.h" +#include "qgroup.h" /* magic values for the inode_only field in btrfs_log_inode: * @@ -680,6 +681,21 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ins.type = BTRFS_EXTENT_ITEM_KEY; offset = key->offset - btrfs_file_extent_offset(eb, item); + /* + * Manually record dirty extent, as here we did a shallow + * file extent item copy and skip normal backref update, + * but modifying extent tree all by ourselves. + * So need to manually record dirty extent for qgroup, + * as the owner of the file extent changed from log tree + * (doesn't affect qgroup) to fs/file tree(affects qgroup) + */ + ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info, + btrfs_file_extent_disk_bytenr(eb, item), + btrfs_file_extent_disk_num_bytes(eb, item), + GFP_NOFS); + if (ret < 0) + goto out; + if (ins.objectid > 0) { u64 csum_start; u64 csum_end; @@ -2807,7 +2823,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ mutex_unlock(&root->log_mutex); - btrfs_init_log_ctx(&root_log_ctx); + btrfs_init_log_ctx(&root_log_ctx, NULL); mutex_lock(&log_root_tree->log_mutex); atomic_inc(&log_root_tree->log_batch); @@ -2851,6 +2867,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { blk_finish_plug(&plug); + list_del_init(&root_log_ctx.list); mutex_unlock(&log_root_tree->log_mutex); ret = root_log_ctx.log_ret; goto out; @@ -4741,7 +4758,8 @@ again: if (ret < 0) { err = ret; goto out_unlock; - } else if (ret > 0) { + } else if (ret > 0 && ctx && + other_ino != btrfs_ino(ctx->inode)) { struct btrfs_key inode_key; struct inode *other_inode; diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index a9f1b75..ab858e3 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -30,15 +30,18 @@ struct btrfs_log_ctx { int log_transid; int io_err; bool log_new_dentries; + struct inode *inode; struct list_head list; }; -static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx) +static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, + struct inode *inode) { ctx->log_ret = 0; ctx->log_transid = 0; ctx->io_err = 0; ctx->log_new_dentries = false; + ctx->inode = inode; INIT_LIST_HEAD(&ctx->list); } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 51f1255..035efce 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -834,10 +834,6 @@ static void __free_device(struct work_struct *work) struct btrfs_device *device; device = container_of(work, struct btrfs_device, rcu_work); - - if (device->bdev) - blkdev_put(device->bdev, device->mode); - rcu_string_free(device->name); kfree(device); } @@ -852,6 +848,17 @@ static void free_device(struct rcu_head *head) schedule_work(&device->rcu_work); } +static void btrfs_close_bdev(struct btrfs_device *device) +{ + if (device->bdev && device->writeable) { + sync_blockdev(device->bdev); + invalidate_bdev(device->bdev); + } + + if (device->bdev) + blkdev_put(device->bdev, device->mode); +} + static void btrfs_close_one_device(struct btrfs_device *device) { struct btrfs_fs_devices *fs_devices = device->fs_devices; @@ -870,10 +877,7 @@ static void btrfs_close_one_device(struct btrfs_device *device) if (device->missing) fs_devices->missing_devices--; - if (device->bdev && device->writeable) { - sync_blockdev(device->bdev); - invalidate_bdev(device->bdev); - } + btrfs_close_bdev(device); new_device = btrfs_alloc_device(NULL, &device->devid, device->uuid); @@ -1932,6 +1936,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid) btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device); } + btrfs_close_bdev(device); + call_rcu(&device->rcu, free_device); num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1; @@ -2025,6 +2031,9 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, /* zero out the old super if it is writable */ btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str); } + + btrfs_close_bdev(srcdev); + call_rcu(&srcdev->rcu, free_device); /* @@ -2080,6 +2089,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, * the device_list_mutex lock. */ btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); + + btrfs_close_bdev(tgtdev); call_rcu(&tgtdev->rcu, free_device); } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index c64a0b7..df4b3e6 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -597,7 +597,7 @@ static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos) if (is_hash_order(new_pos)) { /* no need to reset last_name for a forward seek when * dentries are sotred in hash order */ - } else if (fi->frag |= fpos_frag(new_pos)) { + } else if (fi->frag != fpos_frag(new_pos)) { return true; } rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL; diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 0f9961e..ed115ac 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -11,6 +11,7 @@ #include <linux/random.h> #include <linux/string.h> #include <linux/fscrypto.h> +#include <linux/mount.h> static int inode_has_encryption_context(struct inode *inode) { @@ -92,26 +93,42 @@ static int create_encryption_context_from_policy(struct inode *inode, return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL); } -int fscrypt_process_policy(struct inode *inode, +int fscrypt_process_policy(struct file *filp, const struct fscrypt_policy *policy) { + struct inode *inode = file_inode(filp); + int ret; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + if (policy->version != 0) return -EINVAL; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + if (!inode_has_encryption_context(inode)) { - if (!inode->i_sb->s_cop->empty_dir) - return -EOPNOTSUPP; - if (!inode->i_sb->s_cop->empty_dir(inode)) - return -ENOTEMPTY; - return create_encryption_context_from_policy(inode, policy); + if (!S_ISDIR(inode->i_mode)) + ret = -EINVAL; + else if (!inode->i_sb->s_cop->empty_dir) + ret = -EOPNOTSUPP; + else if (!inode->i_sb->s_cop->empty_dir(inode)) + ret = -ENOTEMPTY; + else + ret = create_encryption_context_from_policy(inode, + policy); + } else if (!is_encryption_context_consistent_with_policy(inode, + policy)) { + printk(KERN_WARNING + "%s: Policy inconsistent with encryption context\n", + __func__); + ret = -EINVAL; } - if (is_encryption_context_consistent_with_policy(inode, policy)) - return 0; - - printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n", - __func__); - return -EINVAL; + mnt_drop_write_file(filp); + return ret; } EXPORT_SYMBOL(fscrypt_process_policy); diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index d116453..79a5941 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -585,7 +585,8 @@ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv) */ void *devpts_get_priv(struct dentry *dentry) { - WARN_ON_ONCE(dentry->d_sb->s_magic != DEVPTS_SUPER_MAGIC); + if (dentry->d_sb->s_magic != DEVPTS_SUPER_MAGIC) + return NULL; return dentry->d_fsdata; } diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index eea6491..466f7d6 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -607,20 +607,54 @@ static const struct file_operations format2_fops; static const struct file_operations format3_fops; static const struct file_operations format4_fops; -static int table_open(struct inode *inode, struct file *file) +static int table_open1(struct inode *inode, struct file *file) { struct seq_file *seq; - int ret = -1; + int ret; - if (file->f_op == &format1_fops) - ret = seq_open(file, &format1_seq_ops); - else if (file->f_op == &format2_fops) - ret = seq_open(file, &format2_seq_ops); - else if (file->f_op == &format3_fops) - ret = seq_open(file, &format3_seq_ops); - else if (file->f_op == &format4_fops) - ret = seq_open(file, &format4_seq_ops); + ret = seq_open(file, &format1_seq_ops); + if (ret) + return ret; + + seq = file->private_data; + seq->private = inode->i_private; /* the dlm_ls */ + return 0; +} + +static int table_open2(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret; + + ret = seq_open(file, &format2_seq_ops); + if (ret) + return ret; + + seq = file->private_data; + seq->private = inode->i_private; /* the dlm_ls */ + return 0; +} + +static int table_open3(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret; + + ret = seq_open(file, &format3_seq_ops); + if (ret) + return ret; + + seq = file->private_data; + seq->private = inode->i_private; /* the dlm_ls */ + return 0; +} + +static int table_open4(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret; + ret = seq_open(file, &format4_seq_ops); if (ret) return ret; @@ -631,7 +665,7 @@ static int table_open(struct inode *inode, struct file *file) static const struct file_operations format1_fops = { .owner = THIS_MODULE, - .open = table_open, + .open = table_open1, .read = seq_read, .llseek = seq_lseek, .release = seq_release @@ -639,7 +673,7 @@ static const struct file_operations format1_fops = { static const struct file_operations format2_fops = { .owner = THIS_MODULE, - .open = table_open, + .open = table_open2, .read = seq_read, .llseek = seq_lseek, .release = seq_release @@ -647,7 +681,7 @@ static const struct file_operations format2_fops = { static const struct file_operations format3_fops = { .owner = THIS_MODULE, - .open = table_open, + .open = table_open3, .read = seq_read, .llseek = seq_lseek, .release = seq_release @@ -655,7 +689,7 @@ static const struct file_operations format3_fops = { static const struct file_operations format4_fops = { .owner = THIS_MODULE, - .open = table_open, + .open = table_open4, .read = seq_read, .llseek = seq_lseek, .release = seq_release diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3131747..c6ea25a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5466,8 +5466,6 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) sbi->s_want_extra_isize, iloc, handle); if (ret) { - ext4_set_inode_state(inode, - EXT4_STATE_NO_EXPAND); if (mnt_count != le16_to_cpu(sbi->s_es->s_mnt_count)) { ext4_warning(inode->i_sb, diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 10686fd..1bb7df5 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -776,7 +776,7 @@ resizefs_out: (struct fscrypt_policy __user *)arg, sizeof(policy))) return -EFAULT; - return fscrypt_process_policy(inode, &policy); + return fscrypt_process_policy(filp, &policy); #else return -EOPNOTSUPP; #endif diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1c593aa..3ec8708 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2211,6 +2211,7 @@ void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group, /* Called at mount-time, super-block is locked */ static int ext4_check_descriptors(struct super_block *sb, + ext4_fsblk_t sb_block, ext4_group_t *first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -2241,6 +2242,11 @@ static int ext4_check_descriptors(struct super_block *sb, grp = i; block_bitmap = ext4_block_bitmap(sb, gdp); + if (block_bitmap == sb_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Block bitmap for group %u overlaps " + "superblock", i); + } if (block_bitmap < first_block || block_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u not in group " @@ -2248,6 +2254,11 @@ static int ext4_check_descriptors(struct super_block *sb, return 0; } inode_bitmap = ext4_inode_bitmap(sb, gdp); + if (inode_bitmap == sb_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Inode bitmap for group %u overlaps " + "superblock", i); + } if (inode_bitmap < first_block || inode_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u not in group " @@ -2255,6 +2266,11 @@ static int ext4_check_descriptors(struct super_block *sb, return 0; } inode_table = ext4_inode_table(sb, gdp); + if (inode_table == sb_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Inode table for group %u overlaps " + "superblock", i); + } if (inode_table < first_block || inode_table + sbi->s_itb_per_group - 1 > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " @@ -3757,7 +3773,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount2; } } - if (!ext4_check_descriptors(sb, &first_not_zeroed)) { + if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) { ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); ret = -EFSCORRUPTED; goto failed_mount2; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 39e9cfb..2eb935c 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1353,15 +1353,19 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, size_t min_offs, free; int total_ino; void *base, *start, *end; - int extra_isize = 0, error = 0, tried_min_extra_isize = 0; + int error = 0, tried_min_extra_isize = 0; int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); + int isize_diff; /* How much do we need to grow i_extra_isize */ down_write(&EXT4_I(inode)->xattr_sem); + /* + * Set EXT4_STATE_NO_EXPAND to avoid recursion when marking inode dirty + */ + ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); retry: - if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) { - up_write(&EXT4_I(inode)->xattr_sem); - return 0; - } + isize_diff = new_extra_isize - EXT4_I(inode)->i_extra_isize; + if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) + goto out; header = IHDR(inode, raw_inode); entry = IFIRST(header); @@ -1382,7 +1386,7 @@ retry: goto cleanup; free = ext4_xattr_free_space(last, &min_offs, base, &total_ino); - if (free >= new_extra_isize) { + if (free >= isize_diff) { entry = IFIRST(header); ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize - new_extra_isize, (void *)raw_inode + @@ -1390,8 +1394,7 @@ retry: (void *)header, total_ino, inode->i_sb->s_blocksize); EXT4_I(inode)->i_extra_isize = new_extra_isize; - error = 0; - goto cleanup; + goto out; } /* @@ -1414,7 +1417,7 @@ retry: end = bh->b_data + bh->b_size; min_offs = end - base; free = ext4_xattr_free_space(first, &min_offs, base, NULL); - if (free < new_extra_isize) { + if (free < isize_diff) { if (!tried_min_extra_isize && s_min_extra_isize) { tried_min_extra_isize++; new_extra_isize = s_min_extra_isize; @@ -1428,7 +1431,7 @@ retry: free = inode->i_sb->s_blocksize; } - while (new_extra_isize > 0) { + while (isize_diff > 0) { size_t offs, size, entry_size; struct ext4_xattr_entry *small_entry = NULL; struct ext4_xattr_info i = { @@ -1459,7 +1462,7 @@ retry: EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) + EXT4_XATTR_LEN(last->e_name_len); if (total_size <= free && total_size < min_total_size) { - if (total_size < new_extra_isize) { + if (total_size < isize_diff) { small_entry = last; } else { entry = last; @@ -1514,22 +1517,22 @@ retry: error = ext4_xattr_ibody_set(handle, inode, &i, is); if (error) goto cleanup; + total_ino -= entry_size; entry = IFIRST(header); - if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize) - shift_bytes = new_extra_isize; + if (entry_size + EXT4_XATTR_SIZE(size) >= isize_diff) + shift_bytes = isize_diff; else - shift_bytes = entry_size + size; + shift_bytes = entry_size + EXT4_XATTR_SIZE(size); /* Adjust the offsets and shift the remaining entries ahead */ - ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize - - shift_bytes, (void *)raw_inode + - EXT4_GOOD_OLD_INODE_SIZE + extra_isize + shift_bytes, - (void *)header, total_ino - entry_size, - inode->i_sb->s_blocksize); + ext4_xattr_shift_entries(entry, -shift_bytes, + (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + + EXT4_I(inode)->i_extra_isize + shift_bytes, + (void *)header, total_ino, inode->i_sb->s_blocksize); - extra_isize += shift_bytes; - new_extra_isize -= shift_bytes; - EXT4_I(inode)->i_extra_isize = extra_isize; + isize_diff -= shift_bytes; + EXT4_I(inode)->i_extra_isize += shift_bytes; + header = IHDR(inode, raw_inode); i.name = b_entry_name; i.value = buffer; @@ -1551,6 +1554,8 @@ retry: kfree(bs); } brelse(bh); +out: + ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); up_write(&EXT4_I(inode)->xattr_sem); return 0; @@ -1562,6 +1567,10 @@ cleanup: kfree(is); kfree(bs); brelse(bh); + /* + * We deliberately leave EXT4_STATE_NO_EXPAND set here since inode + * size expansion failed. + */ up_write(&EXT4_I(inode)->xattr_sem); return error; } diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 69dd3e6..a92e783 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -24,6 +24,7 @@ #define EXT4_XATTR_INDEX_SYSTEM 7 #define EXT4_XATTR_INDEX_RICHACL 8 #define EXT4_XATTR_INDEX_ENCRYPTION 9 +#define EXT4_XATTR_INDEX_HURD 10 /* Reserved for Hurd */ struct ext4_xattr_header { __le32 h_magic; /* magic number for identification */ diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d64d2a5..ccb401e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1699,11 +1699,11 @@ static int f2fs_write_end(struct file *file, trace_f2fs_write_end(inode, pos, len, copied); set_page_dirty(page); - f2fs_put_page(page, 1); if (pos + copied > i_size_read(inode)) f2fs_i_size_write(inode, pos + copied); + f2fs_put_page(page, 1); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return copied; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 675fa79..14f5fe2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -538,7 +538,7 @@ struct f2fs_nm_info { /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ struct radix_tree_root nat_set_root;/* root of the nat set cache */ - struct percpu_rw_semaphore nat_tree_lock; /* protect nat_tree_lock */ + struct rw_semaphore nat_tree_lock; /* protect nat_tree_lock */ struct list_head nat_entries; /* cached nat entry list (clean) */ unsigned int nat_cnt; /* the # of cached nat entries */ unsigned int dirty_nat_cnt; /* total num of nat entries in set */ @@ -787,7 +787,7 @@ struct f2fs_sb_info { struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ struct inode *meta_inode; /* cache meta blocks */ struct mutex cp_mutex; /* checkpoint procedure lock */ - struct percpu_rw_semaphore cp_rwsem; /* blocking FS operations */ + struct rw_semaphore cp_rwsem; /* blocking FS operations */ struct rw_semaphore node_write; /* locking node writes */ wait_queue_head_t cp_wait; unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ @@ -1074,22 +1074,22 @@ static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { - percpu_down_read(&sbi->cp_rwsem); + down_read(&sbi->cp_rwsem); } static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) { - percpu_up_read(&sbi->cp_rwsem); + up_read(&sbi->cp_rwsem); } static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) { - percpu_down_write(&sbi->cp_rwsem); + down_write(&sbi->cp_rwsem); } static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) { - percpu_up_write(&sbi->cp_rwsem); + up_write(&sbi->cp_rwsem); } static inline int __get_cp_reason(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 0e493f6..28f4f4c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1757,21 +1757,14 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { struct fscrypt_policy policy; struct inode *inode = file_inode(filp); - int ret; if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg, sizeof(policy))) return -EFAULT; - ret = mnt_want_write_file(filp); - if (ret) - return ret; - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - ret = fscrypt_process_policy(inode, &policy); - mnt_drop_write_file(filp); - return ret; + return fscrypt_process_policy(filp, &policy); } static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) @@ -2086,15 +2079,19 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, if (unlikely(f2fs_readonly(src->i_sb))) return -EROFS; - if (S_ISDIR(src->i_mode) || S_ISDIR(dst->i_mode)) - return -EISDIR; + if (!S_ISREG(src->i_mode) || !S_ISREG(dst->i_mode)) + return -EINVAL; if (f2fs_encrypted_inode(src) || f2fs_encrypted_inode(dst)) return -EOPNOTSUPP; inode_lock(src); - if (src != dst) - inode_lock(dst); + if (src != dst) { + if (!inode_trylock(dst)) { + ret = -EBUSY; + goto out; + } + } ret = -EINVAL; if (pos_in + len > src->i_size || pos_in + len < pos_in) @@ -2152,6 +2149,7 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, out_unlock: if (src != dst) inode_unlock(dst); +out: inode_unlock(src); return ret; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b2fa4b6..f75d197 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -206,14 +206,14 @@ int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid) struct nat_entry *e; bool need = false; - percpu_down_read(&nm_i->nat_tree_lock); + down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (e) { if (!get_nat_flag(e, IS_CHECKPOINTED) && !get_nat_flag(e, HAS_FSYNCED_INODE)) need = true; } - percpu_up_read(&nm_i->nat_tree_lock); + up_read(&nm_i->nat_tree_lock); return need; } @@ -223,11 +223,11 @@ bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) struct nat_entry *e; bool is_cp = true; - percpu_down_read(&nm_i->nat_tree_lock); + down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (e && !get_nat_flag(e, IS_CHECKPOINTED)) is_cp = false; - percpu_up_read(&nm_i->nat_tree_lock); + up_read(&nm_i->nat_tree_lock); return is_cp; } @@ -237,13 +237,13 @@ bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) struct nat_entry *e; bool need_update = true; - percpu_down_read(&nm_i->nat_tree_lock); + down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ino); if (e && get_nat_flag(e, HAS_LAST_FSYNC) && (get_nat_flag(e, IS_CHECKPOINTED) || get_nat_flag(e, HAS_FSYNCED_INODE))) need_update = false; - percpu_up_read(&nm_i->nat_tree_lock); + up_read(&nm_i->nat_tree_lock); return need_update; } @@ -284,7 +284,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; - percpu_down_write(&nm_i->nat_tree_lock); + down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ni->nid); if (!e) { e = grab_nat_entry(nm_i, ni->nid); @@ -334,7 +334,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, set_nat_flag(e, HAS_FSYNCED_INODE, true); set_nat_flag(e, HAS_LAST_FSYNC, fsync_done); } - percpu_up_write(&nm_i->nat_tree_lock); + up_write(&nm_i->nat_tree_lock); } int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) @@ -342,7 +342,8 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) struct f2fs_nm_info *nm_i = NM_I(sbi); int nr = nr_shrink; - percpu_down_write(&nm_i->nat_tree_lock); + if (!down_write_trylock(&nm_i->nat_tree_lock)) + return 0; while (nr_shrink && !list_empty(&nm_i->nat_entries)) { struct nat_entry *ne; @@ -351,7 +352,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) __del_from_nat_cache(nm_i, ne); nr_shrink--; } - percpu_up_write(&nm_i->nat_tree_lock); + up_write(&nm_i->nat_tree_lock); return nr - nr_shrink; } @@ -373,13 +374,13 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) ni->nid = nid; /* Check nat cache */ - percpu_down_read(&nm_i->nat_tree_lock); + down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (e) { ni->ino = nat_get_ino(e); ni->blk_addr = nat_get_blkaddr(e); ni->version = nat_get_version(e); - percpu_up_read(&nm_i->nat_tree_lock); + up_read(&nm_i->nat_tree_lock); return; } @@ -403,11 +404,11 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) node_info_from_raw_nat(ni, &ne); f2fs_put_page(page, 1); cache: - percpu_up_read(&nm_i->nat_tree_lock); + up_read(&nm_i->nat_tree_lock); /* cache nat entry */ - percpu_down_write(&nm_i->nat_tree_lock); + down_write(&nm_i->nat_tree_lock); cache_nat_entry(sbi, nid, &ne); - percpu_up_write(&nm_i->nat_tree_lock); + up_write(&nm_i->nat_tree_lock); } /* @@ -1788,7 +1789,7 @@ void build_free_nids(struct f2fs_sb_info *sbi) ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); - percpu_down_read(&nm_i->nat_tree_lock); + down_read(&nm_i->nat_tree_lock); while (1) { struct page *page = get_current_nat_page(sbi, nid); @@ -1820,7 +1821,7 @@ void build_free_nids(struct f2fs_sb_info *sbi) remove_free_nid(nm_i, nid); } up_read(&curseg->journal_rwsem); - percpu_up_read(&nm_i->nat_tree_lock); + up_read(&nm_i->nat_tree_lock); ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), nm_i->ra_nid_pages, META_NAT, false); @@ -2209,7 +2210,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) if (!nm_i->dirty_nat_cnt) return; - percpu_down_write(&nm_i->nat_tree_lock); + down_write(&nm_i->nat_tree_lock); /* * if there are no enough space in journal to store dirty nat @@ -2232,7 +2233,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) list_for_each_entry_safe(set, tmp, &sets, set_list) __flush_nat_entry_set(sbi, set); - percpu_up_write(&nm_i->nat_tree_lock); + up_write(&nm_i->nat_tree_lock); f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); } @@ -2268,8 +2269,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) mutex_init(&nm_i->build_lock); spin_lock_init(&nm_i->free_nid_list_lock); - if (percpu_init_rwsem(&nm_i->nat_tree_lock)) - return -ENOMEM; + init_rwsem(&nm_i->nat_tree_lock); nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP); @@ -2326,7 +2326,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) spin_unlock(&nm_i->free_nid_list_lock); /* destroy nat cache */ - percpu_down_write(&nm_i->nat_tree_lock); + down_write(&nm_i->nat_tree_lock); while ((found = __gang_lookup_nat_cache(nm_i, nid, NATVEC_SIZE, natvec))) { unsigned idx; @@ -2351,9 +2351,8 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) kmem_cache_free(nat_entry_set_slab, setvec[idx]); } } - percpu_up_write(&nm_i->nat_tree_lock); + up_write(&nm_i->nat_tree_lock); - percpu_free_rwsem(&nm_i->nat_tree_lock); kfree(nm_i->nat_bitmap); sbi->nm_info = NULL; kfree(nm_i); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1b86d3f..7f863a6 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -706,8 +706,6 @@ static void destroy_percpu_info(struct f2fs_sb_info *sbi) percpu_counter_destroy(&sbi->nr_pages[i]); percpu_counter_destroy(&sbi->alloc_valid_block_count); percpu_counter_destroy(&sbi->total_valid_inode_count); - - percpu_free_rwsem(&sbi->cp_rwsem); } static void f2fs_put_super(struct super_block *sb) @@ -1483,9 +1481,6 @@ static int init_percpu_info(struct f2fs_sb_info *sbi) { int i, err; - if (percpu_init_rwsem(&sbi->cp_rwsem)) - return -ENOMEM; - for (i = 0; i < NR_COUNT_TYPE; i++) { err = percpu_counter_init(&sbi->nr_pages[i], 0, GFP_KERNEL); if (err) @@ -1686,6 +1681,7 @@ try_onemore: sbi->write_io[i].bio = NULL; } + init_rwsem(&sbi->cp_rwsem); init_waitqueue_head(&sbi->cp_wait); init_sb_info(sbi); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f394aff..3988b43 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -530,13 +530,13 @@ void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, req->out.args[0].size = count; } -static void fuse_release_user_pages(struct fuse_req *req, int write) +static void fuse_release_user_pages(struct fuse_req *req, bool should_dirty) { unsigned i; for (i = 0; i < req->num_pages; i++) { struct page *page = req->pages[i]; - if (write) + if (should_dirty) set_page_dirty_lock(page); put_page(page); } @@ -1320,6 +1320,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, loff_t *ppos, int flags) { int write = flags & FUSE_DIO_WRITE; + bool should_dirty = !write && iter_is_iovec(iter); int cuse = flags & FUSE_DIO_CUSE; struct file *file = io->file; struct inode *inode = file->f_mapping->host; @@ -1363,7 +1364,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, nres = fuse_send_read(req, io, pos, nbytes, owner); if (!io->async) - fuse_release_user_pages(req, !write); + fuse_release_user_pages(req, should_dirty); if (req->out.h.error) { err = req->out.h.error; break; @@ -428,9 +428,12 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, break; } + if (iomap->flags & IOMAP_F_MERGED) + flags |= FIEMAP_EXTENT_MERGED; + return fiemap_fill_next_extent(fi, iomap->offset, iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0, - iomap->length, flags | FIEMAP_EXTENT_MERGED); + iomap->length, flags); } diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index e157400..2bcb86e 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -840,21 +840,35 @@ repeat: mutex_lock(&kernfs_mutex); list_for_each_entry(info, &kernfs_root(kn)->supers, node) { + struct kernfs_node *parent; struct inode *inode; - struct dentry *dentry; + /* + * We want fsnotify_modify() on @kn but as the + * modifications aren't originating from userland don't + * have the matching @file available. Look up the inodes + * and generate the events manually. + */ inode = ilookup(info->sb, kn->ino); if (!inode) continue; - dentry = d_find_any_alias(inode); - if (dentry) { - fsnotify_parent(NULL, dentry, FS_MODIFY); - fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE, - NULL, 0); - dput(dentry); + parent = kernfs_get_parent(kn); + if (parent) { + struct inode *p_inode; + + p_inode = ilookup(info->sb, parent->ino); + if (p_inode) { + fsnotify(p_inode, FS_MODIFY | FS_EVENT_ON_CHILD, + inode, FSNOTIFY_EVENT_INODE, kn->name, 0); + iput(p_inode); + } + + kernfs_put(parent); } + fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE, + kn->name, 0); iput(inode); } diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index f55a4e7..2178476 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -346,7 +346,7 @@ static void bl_write_cleanup(struct work_struct *work) PAGE_SIZE - 1) & (loff_t)PAGE_MASK; ext_tree_mark_written(bl, start >> SECTOR_SHIFT, - (end - start) >> SECTOR_SHIFT); + (end - start) >> SECTOR_SHIFT, end); } pnfs_ld_write_done(hdr); diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index 18e6fd0..efc007f 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -141,6 +141,7 @@ struct pnfs_block_layout { struct rb_root bl_ext_ro; spinlock_t bl_ext_lock; /* Protects list manipulation */ bool bl_scsi_layout; + u64 bl_lwb; }; static inline struct pnfs_block_layout * @@ -182,7 +183,7 @@ int ext_tree_insert(struct pnfs_block_layout *bl, int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start, sector_t end); int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, - sector_t len); + sector_t len, u64 lwb); bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect, struct pnfs_block_extent *ret, bool rw); int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg); diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c index 992bcb1..c85fbfd 100644 --- a/fs/nfs/blocklayout/extent_tree.c +++ b/fs/nfs/blocklayout/extent_tree.c @@ -402,7 +402,7 @@ ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be, int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, - sector_t len) + sector_t len, u64 lwb) { struct rb_root *root = &bl->bl_ext_rw; sector_t end = start + len; @@ -471,6 +471,8 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, } } out: + if (bl->bl_lwb < lwb) + bl->bl_lwb = lwb; spin_unlock(&bl->bl_ext_lock); __ext_put_deviceids(&tmp); @@ -518,7 +520,7 @@ static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p) } static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, - size_t buffer_size, size_t *count) + size_t buffer_size, size_t *count, __u64 *lastbyte) { struct pnfs_block_extent *be; int ret = 0; @@ -542,6 +544,8 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, p = encode_block_extent(be, p); be->be_tag = EXTENT_COMMITTING; } + *lastbyte = bl->bl_lwb - 1; + bl->bl_lwb = 0; spin_unlock(&bl->bl_ext_lock); return ret; @@ -564,7 +568,7 @@ ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg) arg->layoutupdate_pages = &arg->layoutupdate_page; retry: - ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count); + ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count, &arg->lastbytewritten); if (unlikely(ret)) { ext_tree_free_commitdata(arg, buffer_size); diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index a7f2e6e..52a2831 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -275,6 +275,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, err_socks: svc_rpcb_cleanup(serv, net); err_bind: + nn->cb_users[minorversion]--; dprintk("NFS: Couldn't create callback socket: err = %d; " "net = %p\n", ret, net); return ret; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index c92a75e..f953ef6 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -454,11 +454,8 @@ static bool referring_call_exists(struct nfs_client *clp, ((u32 *)&rclist->rcl_sessionid.data)[3], ref->rc_sequenceid, ref->rc_slotid); - spin_lock(&tbl->slot_tbl_lock); - status = (test_bit(ref->rc_slotid, tbl->used_slots) && - tbl->slots[ref->rc_slotid].seq_nr == - ref->rc_sequenceid); - spin_unlock(&tbl->slot_tbl_lock); + status = nfs4_slot_wait_on_seqid(tbl, ref->rc_slotid, + ref->rc_sequenceid, HZ >> 1) < 0; if (status) goto out; } @@ -487,7 +484,6 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, goto out; tbl = &clp->cl_session->bc_slot_table; - slot = tbl->slots + args->csa_slotid; /* Set up res before grabbing the spinlock */ memcpy(&res->csr_sessionid, &args->csa_sessionid, diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 003ebce..1e10678 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -426,7 +426,7 @@ EXPORT_SYMBOL_GPL(nfs_mark_client_ready); * Initialise the timeout values for a connection */ void nfs_init_timeout_values(struct rpc_timeout *to, int proto, - unsigned int timeo, unsigned int retrans) + int timeo, int retrans) { to->to_initval = timeo * HZ / 10; to->to_retries = retrans; @@ -434,9 +434,9 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto, switch (proto) { case XPRT_TRANSPORT_TCP: case XPRT_TRANSPORT_RDMA: - if (to->to_retries == 0) + if (retrans == NFS_UNSPEC_RETRANS) to->to_retries = NFS_DEF_TCP_RETRANS; - if (to->to_initval == 0) + if (timeo == NFS_UNSPEC_TIMEO || to->to_retries == 0) to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10; if (to->to_initval > NFS_MAX_TCP_TIMEOUT) to->to_initval = NFS_MAX_TCP_TIMEOUT; @@ -449,9 +449,9 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto, to->to_exponential = 0; break; case XPRT_TRANSPORT_UDP: - if (to->to_retries == 0) + if (retrans == NFS_UNSPEC_RETRANS) to->to_retries = NFS_DEF_UDP_RETRANS; - if (!to->to_initval) + if (timeo == NFS_UNSPEC_TIMEO || to->to_initval == 0) to->to_initval = NFS_DEF_UDP_TIMEO * HZ / 10; if (to->to_initval > NFS_MAX_UDP_TIMEOUT) to->to_initval = NFS_MAX_UDP_TIMEOUT; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 7d62097..ca699dd 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -657,7 +657,10 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) if (result <= 0) goto out; - written = generic_write_sync(iocb, result); + result = generic_write_sync(iocb, result); + if (result < 0) + goto out; + written = result; iocb->ki_pos += written; /* Return error values */ diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index e6206ea..51b5136 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -37,6 +37,7 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) if (ffl) { INIT_LIST_HEAD(&ffl->error_list); INIT_LIST_HEAD(&ffl->mirrors); + ffl->last_report_time = ktime_get(); return &ffl->generic_hdr; } else return NULL; @@ -640,19 +641,18 @@ nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror, { static const ktime_t notime = {0}; s64 report_interval = FF_LAYOUTSTATS_REPORT_INTERVAL; + struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(mirror->layout); nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now); if (ktime_equal(mirror->start_time, notime)) mirror->start_time = now; - if (ktime_equal(mirror->last_report_time, notime)) - mirror->last_report_time = now; if (mirror->report_interval != 0) report_interval = (s64)mirror->report_interval * 1000LL; else if (layoutstats_timer != 0) report_interval = (s64)layoutstats_timer * 1000LL; - if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >= + if (ktime_to_ms(ktime_sub(now, ffl->last_report_time)) >= report_interval) { - mirror->last_report_time = now; + ffl->last_report_time = now; return true; } @@ -806,11 +806,14 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, { struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); struct nfs4_pnfs_ds *ds; + bool fail_return = false; int idx; /* mirrors are sorted by efficiency */ for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) { - ds = nfs4_ff_layout_prepare_ds(lseg, idx, false); + if (idx+1 == fls->mirror_array_cnt) + fail_return = true; + ds = nfs4_ff_layout_prepare_ds(lseg, idx, fail_return); if (ds) { *best_idx = idx; return ds; @@ -859,6 +862,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs4_pnfs_ds *ds; int ds_idx; +retry: /* Use full layout for now */ if (!pgio->pg_lseg) ff_layout_pg_get_read(pgio, req, false); @@ -871,10 +875,13 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, ds = ff_layout_choose_best_ds_for_read(pgio->pg_lseg, 0, &ds_idx); if (!ds) { - if (ff_layout_no_fallback_to_mds(pgio->pg_lseg)) - goto out_pnfs; - else + if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg)) goto out_mds; + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; + /* Sleep for 1 second before retrying */ + ssleep(1); + goto retry; } mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx); @@ -890,12 +897,6 @@ out_mds: pnfs_put_lseg(pgio->pg_lseg); pgio->pg_lseg = NULL; nfs_pageio_reset_read_mds(pgio); - return; - -out_pnfs: - pnfs_set_lo_fail(pgio->pg_lseg); - pnfs_put_lseg(pgio->pg_lseg); - pgio->pg_lseg = NULL; } static void @@ -909,6 +910,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, int i; int status; +retry: if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, @@ -940,10 +942,13 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, for (i = 0; i < pgio->pg_mirror_count; i++) { ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true); if (!ds) { - if (ff_layout_no_fallback_to_mds(pgio->pg_lseg)) - goto out_pnfs; - else + if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg)) goto out_mds; + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; + /* Sleep for 1 second before retrying */ + ssleep(1); + goto retry; } pgm = &pgio->pg_mirrors[i]; mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i); @@ -956,12 +961,6 @@ out_mds: pnfs_put_lseg(pgio->pg_lseg); pgio->pg_lseg = NULL; nfs_pageio_reset_write_mds(pgio); - return; - -out_pnfs: - pnfs_set_lo_fail(pgio->pg_lseg); - pnfs_put_lseg(pgio->pg_lseg); - pgio->pg_lseg = NULL; } static unsigned int diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index 1bcdb15..3ee0c9f 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -84,7 +84,6 @@ struct nfs4_ff_layout_mirror { struct nfs4_ff_layoutstat read_stat; struct nfs4_ff_layoutstat write_stat; ktime_t start_time; - ktime_t last_report_time; u32 report_interval; }; @@ -101,6 +100,7 @@ struct nfs4_flexfile_layout { struct pnfs_ds_commit_info commit_info; struct list_head mirrors; struct list_head error_list; /* nfs4_ff_layout_ds_err */ + ktime_t last_report_time; /* Layoutstat report times */ }; static inline struct nfs4_flexfile_layout * diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 0aa36be..f7a3f6b 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -17,8 +17,8 @@ #define NFSDBG_FACILITY NFSDBG_PNFS_LD -static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO; -static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS; +static unsigned int dataserver_timeo = NFS_DEF_TCP_RETRANS; +static unsigned int dataserver_retrans; void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds) { @@ -379,7 +379,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, devid = &mirror->mirror_ds->id_node; if (ff_layout_test_devid_unavailable(devid)) - goto out; + goto out_fail; ds = mirror->mirror_ds->ds; /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */ @@ -405,15 +405,16 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, mirror->mirror_ds->ds_versions[0].rsize = max_payload; if (mirror->mirror_ds->ds_versions[0].wsize > max_payload) mirror->mirror_ds->ds_versions[0].wsize = max_payload; - } else { - ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), - mirror, lseg->pls_range.offset, - lseg->pls_range.length, NFS4ERR_NXIO, - OP_ILLEGAL, GFP_NOIO); - if (fail_return || !ff_layout_has_available_ds(lseg)) - pnfs_error_mark_layout_for_return(ino, lseg); - ds = NULL; + goto out; } + ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), + mirror, lseg->pls_range.offset, + lseg->pls_range.length, NFS4ERR_NXIO, + OP_ILLEGAL, GFP_NOIO); +out_fail: + if (fail_return || !ff_layout_has_available_ds(lseg)) + pnfs_error_mark_layout_for_return(ino, lseg); + ds = NULL; out: return ds; } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 7ce5e02..74935a1 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -58,6 +58,9 @@ struct nfs_clone_mount { */ #define NFS_UNSPEC_PORT (-1) +#define NFS_UNSPEC_RETRANS (UINT_MAX) +#define NFS_UNSPEC_TIMEO (UINT_MAX) + /* * Maximum number of pages that readdir can use for creating * a vmapped array of pages. @@ -156,7 +159,7 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *, int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *); void nfs_server_insert_lists(struct nfs_server *); void nfs_server_remove_lists(struct nfs_server *); -void nfs_init_timeout_values(struct rpc_timeout *, int, unsigned int, unsigned int); +void nfs_init_timeout_values(struct rpc_timeout *to, int proto, int timeo, int retrans); int nfs_init_server_rpcclient(struct nfs_server *, const struct rpc_timeout *t, rpc_authflavor_t); struct nfs_server *nfs_alloc_server(void); diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 6f47527..64b43b4 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -318,10 +318,22 @@ static void nfs42_layoutstat_prepare(struct rpc_task *task, void *calldata) { struct nfs42_layoutstat_data *data = calldata; - struct nfs_server *server = NFS_SERVER(data->args.inode); + struct inode *inode = data->inode; + struct nfs_server *server = NFS_SERVER(inode); + struct pnfs_layout_hdr *lo; + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (!pnfs_layout_is_valid(lo)) { + spin_unlock(&inode->i_lock); + rpc_exit(task, 0); + return; + } + nfs4_stateid_copy(&data->args.stateid, &lo->plh_stateid); + spin_unlock(&inode->i_lock); nfs41_setup_sequence(nfs4_get_session(server), &data->args.seq_args, &data->res.seq_res, task); + } static void @@ -341,11 +353,11 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata) case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_STALE_STATEID: - case -NFS4ERR_OLD_STATEID: case -NFS4ERR_BAD_STATEID: spin_lock(&inode->i_lock); lo = NFS_I(inode)->layout; - if (lo && nfs4_stateid_match(&data->args.stateid, + if (pnfs_layout_is_valid(lo) && + nfs4_stateid_match(&data->args.stateid, &lo->plh_stateid)) { LIST_HEAD(head); @@ -359,11 +371,23 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata) } else spin_unlock(&inode->i_lock); break; + case -NFS4ERR_OLD_STATEID: + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (pnfs_layout_is_valid(lo) && + nfs4_stateid_match_other(&data->args.stateid, + &lo->plh_stateid)) { + /* Do we need to delay before resending? */ + if (!nfs4_stateid_is_newer(&lo->plh_stateid, + &data->args.stateid)) + rpc_delay(task, HZ); + rpc_restart_call_prepare(task); + } + spin_unlock(&inode->i_lock); + break; case -ENOTSUPP: case -EOPNOTSUPP: NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTSTATS; - default: - break; } dprintk("%s server returns %d\n", __func__, task->tk_status); diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 8d7d08d..cd3b7cf 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -817,6 +817,11 @@ static int nfs4_set_client(struct nfs_server *server, goto error; } + if (server->nfs_client == clp) { + error = -ELOOP; + goto error; + } + /* * Query for the lease time on clientid setup or renewal * diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1949bbd..a9dec32 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -634,15 +634,11 @@ out_sleep: } EXPORT_SYMBOL_GPL(nfs40_setup_sequence); -static int nfs40_sequence_done(struct rpc_task *task, - struct nfs4_sequence_res *res) +static void nfs40_sequence_free_slot(struct nfs4_sequence_res *res) { struct nfs4_slot *slot = res->sr_slot; struct nfs4_slot_table *tbl; - if (slot == NULL) - goto out; - tbl = slot->table; spin_lock(&tbl->slot_tbl_lock); if (!nfs41_wake_and_assign_slot(tbl, slot)) @@ -650,7 +646,13 @@ static int nfs40_sequence_done(struct rpc_task *task, spin_unlock(&tbl->slot_tbl_lock); res->sr_slot = NULL; -out: +} + +static int nfs40_sequence_done(struct rpc_task *task, + struct nfs4_sequence_res *res) +{ + if (res->sr_slot != NULL) + nfs40_sequence_free_slot(res); return 1; } @@ -666,6 +668,11 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) tbl = slot->table; session = tbl->session; + /* Bump the slot sequence number */ + if (slot->seq_done) + slot->seq_nr++; + slot->seq_done = 0; + spin_lock(&tbl->slot_tbl_lock); /* Be nice to the server: try to ensure that the last transmitted * value for highest_user_slotid <= target_highest_slotid @@ -686,9 +693,12 @@ out_unlock: res->sr_slot = NULL; if (send_new_highest_used_slotid) nfs41_notify_server(session->clp); + if (waitqueue_active(&tbl->slot_waitq)) + wake_up_all(&tbl->slot_waitq); } -int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) +static int nfs41_sequence_process(struct rpc_task *task, + struct nfs4_sequence_res *res) { struct nfs4_session *session; struct nfs4_slot *slot = res->sr_slot; @@ -714,7 +724,7 @@ int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) switch (res->sr_status) { case 0: /* Update the slot's sequence and clientid lease timer */ - ++slot->seq_nr; + slot->seq_done = 1; clp = session->clp; do_renew_lease(clp, res->sr_timestamp); /* Check sequence flags */ @@ -769,16 +779,16 @@ int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) goto retry_nowait; default: /* Just update the slot sequence no. */ - ++slot->seq_nr; + slot->seq_done = 1; } out: /* The session may be reset by one of the error handlers. */ dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); - nfs41_sequence_free_slot(res); out_noaction: return ret; retry_nowait: if (rpc_restart_call_prepare(task)) { + nfs41_sequence_free_slot(res); task->tk_status = 0; ret = 0; } @@ -789,8 +799,37 @@ out_retry: rpc_delay(task, NFS4_POLL_RETRY_MAX); return 0; } + +int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) +{ + if (!nfs41_sequence_process(task, res)) + return 0; + if (res->sr_slot != NULL) + nfs41_sequence_free_slot(res); + return 1; + +} EXPORT_SYMBOL_GPL(nfs41_sequence_done); +static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res) +{ + if (res->sr_slot == NULL) + return 1; + if (res->sr_slot->table->session != NULL) + return nfs41_sequence_process(task, res); + return nfs40_sequence_done(task, res); +} + +static void nfs4_sequence_free_slot(struct nfs4_sequence_res *res) +{ + if (res->sr_slot != NULL) { + if (res->sr_slot->table->session != NULL) + nfs41_sequence_free_slot(res); + else + nfs40_sequence_free_slot(res); + } +} + int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) { if (res->sr_slot == NULL) @@ -920,6 +959,17 @@ static int nfs4_setup_sequence(const struct nfs_server *server, args, res, task); } +static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res) +{ + return nfs40_sequence_done(task, res); +} + +static void nfs4_sequence_free_slot(struct nfs4_sequence_res *res) +{ + if (res->sr_slot != NULL) + nfs40_sequence_free_slot(res); +} + int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) { @@ -1197,6 +1247,7 @@ static void nfs4_opendata_free(struct kref *kref) struct super_block *sb = p->dentry->d_sb; nfs_free_seqid(p->o_arg.seqid); + nfs4_sequence_free_slot(&p->o_res.seq_res); if (p->state != NULL) nfs4_put_open_state(p->state); nfs4_put_state_owner(p->owner); @@ -1656,9 +1707,14 @@ err: static struct nfs4_state * nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) { + struct nfs4_state *ret; + if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) - return _nfs4_opendata_reclaim_to_nfs4_state(data); - return _nfs4_opendata_to_nfs4_state(data); + ret =_nfs4_opendata_reclaim_to_nfs4_state(data); + else + ret = _nfs4_opendata_to_nfs4_state(data); + nfs4_sequence_free_slot(&data->o_res.seq_res); + return ret; } static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state) @@ -2056,7 +2112,7 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata) data->rpc_status = task->tk_status; - if (!nfs4_sequence_done(task, &data->o_res.seq_res)) + if (!nfs4_sequence_process(task, &data->o_res.seq_res)) return; if (task->tk_status == 0) { @@ -7514,12 +7570,20 @@ static int _nfs4_proc_create_session(struct nfs_client *clp, status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); trace_nfs4_create_session(clp, status); + switch (status) { + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_DELAY: + case -ETIMEDOUT: + case -EACCES: + case -EAGAIN: + goto out; + }; + + clp->cl_seqid++; if (!status) { /* Verify the session's negotiated channel_attrs values */ status = nfs4_verify_channel_attrs(&args, &res); /* Increment the clientid slot sequence id */ - if (clp->cl_seqid == res.seqid) - clp->cl_seqid++; if (status) goto out; nfs4_update_session(session, &res); @@ -7864,7 +7928,7 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) struct nfs4_layoutget *lgp = calldata; dprintk("--> %s\n", __func__); - nfs41_sequence_done(task, &lgp->res.seq_res); + nfs41_sequence_process(task, &lgp->res.seq_res); dprintk("<-- %s\n", __func__); } @@ -8080,6 +8144,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags) /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ if (status == 0 && lgp->res.layoutp->len) lseg = pnfs_layout_process(lgp); + nfs4_sequence_free_slot(&lgp->res.seq_res); rpc_put_task(task); dprintk("<-- %s status=%d\n", __func__, status); if (status) @@ -8106,7 +8171,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) dprintk("--> %s\n", __func__); - if (!nfs41_sequence_done(task, &lrp->res.seq_res)) + if (!nfs41_sequence_process(task, &lrp->res.seq_res)) return; server = NFS_SERVER(lrp->args.inode); @@ -8118,6 +8183,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) case -NFS4ERR_DELAY: if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN) break; + nfs4_sequence_free_slot(&lrp->res.seq_res); rpc_restart_call_prepare(task); return; } @@ -8132,12 +8198,16 @@ static void nfs4_layoutreturn_release(void *calldata) dprintk("--> %s\n", __func__); spin_lock(&lo->plh_inode->i_lock); - pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range, - be32_to_cpu(lrp->args.stateid.seqid)); - if (lrp->res.lrs_present && pnfs_layout_is_valid(lo)) + if (lrp->res.lrs_present) { + pnfs_mark_matching_lsegs_invalid(lo, &freeme, + &lrp->args.range, + be32_to_cpu(lrp->args.stateid.seqid)); pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); + } else + pnfs_mark_layout_stateid_invalid(lo, &freeme); pnfs_clear_layoutreturn_waitbit(lo); spin_unlock(&lo->plh_inode->i_lock); + nfs4_sequence_free_slot(&lrp->res.seq_res); pnfs_free_lseg_list(&freeme); pnfs_put_layout_hdr(lrp->args.layout); nfs_iput_and_deactive(lrp->inode); diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c index 332d06e..b629730 100644 --- a/fs/nfs/nfs4session.c +++ b/fs/nfs/nfs4session.c @@ -28,6 +28,7 @@ static void nfs4_init_slot_table(struct nfs4_slot_table *tbl, const char *queue) tbl->highest_used_slotid = NFS4_NO_SLOT; spin_lock_init(&tbl->slot_tbl_lock); rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, queue); + init_waitqueue_head(&tbl->slot_waitq); init_completion(&tbl->complete); } @@ -172,6 +173,58 @@ struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid) return ERR_PTR(-E2BIG); } +static int nfs4_slot_get_seqid(struct nfs4_slot_table *tbl, u32 slotid, + u32 *seq_nr) + __must_hold(&tbl->slot_tbl_lock) +{ + struct nfs4_slot *slot; + + slot = nfs4_lookup_slot(tbl, slotid); + if (IS_ERR(slot)) + return PTR_ERR(slot); + *seq_nr = slot->seq_nr; + return 0; +} + +/* + * nfs4_slot_seqid_in_use - test if a slot sequence id is still in use + * + * Given a slot table, slot id and sequence number, determine if the + * RPC call in question is still in flight. This function is mainly + * intended for use by the callback channel. + */ +static bool nfs4_slot_seqid_in_use(struct nfs4_slot_table *tbl, + u32 slotid, u32 seq_nr) +{ + u32 cur_seq; + bool ret = false; + + spin_lock(&tbl->slot_tbl_lock); + if (nfs4_slot_get_seqid(tbl, slotid, &cur_seq) == 0 && + cur_seq == seq_nr && test_bit(slotid, tbl->used_slots)) + ret = true; + spin_unlock(&tbl->slot_tbl_lock); + return ret; +} + +/* + * nfs4_slot_wait_on_seqid - wait until a slot sequence id is complete + * + * Given a slot table, slot id and sequence number, wait until the + * corresponding RPC call completes. This function is mainly + * intended for use by the callback channel. + */ +int nfs4_slot_wait_on_seqid(struct nfs4_slot_table *tbl, + u32 slotid, u32 seq_nr, + unsigned long timeout) +{ + if (wait_event_timeout(tbl->slot_waitq, + !nfs4_slot_seqid_in_use(tbl, slotid, seq_nr), + timeout) == 0) + return -ETIMEDOUT; + return 0; +} + /* * nfs4_alloc_slot - efficiently look for a free slot * diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index 5b51298..f703b75 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -21,7 +21,8 @@ struct nfs4_slot { unsigned long generation; u32 slot_nr; u32 seq_nr; - unsigned int interrupted : 1; + unsigned int interrupted : 1, + seq_done : 1; }; /* Sessions */ @@ -36,6 +37,7 @@ struct nfs4_slot_table { unsigned long used_slots[SLOT_TABLE_SZ]; /* used/unused bitmap */ spinlock_t slot_tbl_lock; struct rpc_wait_queue slot_tbl_waitq; /* allocators may wait here */ + wait_queue_head_t slot_waitq; /* Completion wait on slot */ u32 max_slots; /* # slots in table */ u32 max_slotid; /* Max allowed slotid value */ u32 highest_used_slotid; /* sent to server on each SEQ. @@ -78,6 +80,9 @@ extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl, extern void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl); extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl); extern struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid); +extern int nfs4_slot_wait_on_seqid(struct nfs4_slot_table *tbl, + u32 slotid, u32 seq_nr, + unsigned long timeout); extern bool nfs4_try_to_lock_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot); extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot); extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 70806ca..2c93a85 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -365,7 +365,8 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo, /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */ atomic_dec(&lo->plh_refcount); if (list_empty(&lo->plh_segs)) { - set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); + if (atomic_read(&lo->plh_outstanding) == 0) + set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); } rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); @@ -768,17 +769,32 @@ pnfs_destroy_all_layouts(struct nfs_client *clp) pnfs_destroy_layouts_byclid(clp, false); } +static void +pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo) +{ + lo->plh_return_iomode = 0; + lo->plh_return_seq = 0; + clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); +} + /* update lo->plh_stateid with new if is more recent */ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, bool update_barrier) { u32 oldseq, newseq, new_barrier = 0; - bool invalid = !pnfs_layout_is_valid(lo); oldseq = be32_to_cpu(lo->plh_stateid.seqid); newseq = be32_to_cpu(new->seqid); - if (invalid || pnfs_seqid_is_newer(newseq, oldseq)) { + + if (!pnfs_layout_is_valid(lo)) { + nfs4_stateid_copy(&lo->plh_stateid, new); + lo->plh_barrier = newseq; + pnfs_clear_layoutreturn_info(lo); + clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); + return; + } + if (pnfs_seqid_is_newer(newseq, oldseq)) { nfs4_stateid_copy(&lo->plh_stateid, new); /* * Because of wraparound, we want to keep the barrier @@ -790,7 +806,7 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, new_barrier = be32_to_cpu(new->seqid); else if (new_barrier == 0) return; - if (invalid || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) + if (pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) lo->plh_barrier = new_barrier; } @@ -886,19 +902,14 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); } -static void -pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo) -{ - lo->plh_return_iomode = 0; - lo->plh_return_seq = 0; - clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); -} - static bool pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid, enum pnfs_iomode *iomode) { + /* Serialise LAYOUTGET/LAYOUTRETURN */ + if (atomic_read(&lo->plh_outstanding) != 0) + return false; if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) return false; pnfs_get_layout_hdr(lo); @@ -1555,6 +1566,7 @@ pnfs_update_layout(struct inode *ino, } lookup_again: + nfs4_client_recover_expired_lease(clp); first = false; spin_lock(&ino->i_lock); lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); @@ -1797,16 +1809,11 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) */ pnfs_mark_layout_stateid_invalid(lo, &free_me); - nfs4_stateid_copy(&lo->plh_stateid, &res->stateid); - lo->plh_barrier = be32_to_cpu(res->stateid.seqid); + pnfs_set_layout_stateid(lo, &res->stateid, true); } pnfs_get_lseg(lseg); pnfs_layout_insert_lseg(lo, lseg, &free_me); - if (!pnfs_layout_is_valid(lo)) { - pnfs_clear_layoutreturn_info(lo); - clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - } if (res->return_on_close) @@ -2510,7 +2517,6 @@ pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags) data->args.fh = NFS_FH(inode); data->args.inode = inode; - nfs4_stateid_copy(&data->args.stateid, &hdr->plh_stateid); status = ld->prepare_layoutstats(&data->args); if (status) goto out_free; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 18d446e..d396013 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -923,6 +923,8 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void) data = kzalloc(sizeof(*data), GFP_KERNEL); if (data) { + data->timeo = NFS_UNSPEC_TIMEO; + data->retrans = NFS_UNSPEC_RETRANS; data->acregmin = NFS_DEF_ACREGMIN; data->acregmax = NFS_DEF_ACREGMAX; data->acdirmin = NFS_DEF_ACDIRMIN; @@ -1189,6 +1191,19 @@ static int nfs_get_option_ul(substring_t args[], unsigned long *option) return rc; } +static int nfs_get_option_ul_bound(substring_t args[], unsigned long *option, + unsigned long l_bound, unsigned long u_bound) +{ + int ret; + + ret = nfs_get_option_ul(args, option); + if (ret != 0) + return ret; + if (*option < l_bound || *option > u_bound) + return -ERANGE; + return 0; +} + /* * Error-check and convert a string of mount options from user space into * a data structure. The whole mount string is processed; bad options are @@ -1352,12 +1367,12 @@ static int nfs_parse_mount_options(char *raw, mnt->bsize = option; break; case Opt_timeo: - if (nfs_get_option_ul(args, &option) || option == 0) + if (nfs_get_option_ul_bound(args, &option, 1, INT_MAX)) goto out_invalid_value; mnt->timeo = option; break; case Opt_retrans: - if (nfs_get_option_ul(args, &option) || option == 0) + if (nfs_get_option_ul_bound(args, &option, 0, INT_MAX)) goto out_invalid_value; mnt->retrans = option; break; diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 54e5d66..43fdc27 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -80,6 +80,8 @@ int ovl_copy_xattr(struct dentry *old, struct dentry *new) } for (name = buf; name < (buf + list_size); name += strlen(name) + 1) { + if (ovl_is_private_xattr(name)) + continue; retry: size = vfs_getxattr(old, name, value, value_size); if (size == -ERANGE) diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 12bcd07..1560fdc 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -12,6 +12,8 @@ #include <linux/xattr.h> #include <linux/security.h> #include <linux/cred.h> +#include <linux/posix_acl.h> +#include <linux/posix_acl_xattr.h> #include "overlayfs.h" void ovl_cleanup(struct inode *wdir, struct dentry *wdentry) @@ -186,6 +188,9 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, struct dentry *newdentry; int err; + if (!hardlink && !IS_POSIXACL(udir)) + stat->mode &= ~current_umask(); + inode_lock_nested(udir, I_MUTEX_PARENT); newdentry = lookup_one_len(dentry->d_name.name, upperdir, dentry->d_name.len); @@ -335,6 +340,32 @@ out_free: return ret; } +static int ovl_set_upper_acl(struct dentry *upperdentry, const char *name, + const struct posix_acl *acl) +{ + void *buffer; + size_t size; + int err; + + if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !acl) + return 0; + + size = posix_acl_to_xattr(NULL, acl, NULL, 0); + buffer = kmalloc(size, GFP_KERNEL); + if (!buffer) + return -ENOMEM; + + size = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); + err = size; + if (err < 0) + goto out_free; + + err = vfs_setxattr(upperdentry, name, buffer, size, XATTR_CREATE); +out_free: + kfree(buffer); + return err; +} + static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, struct kstat *stat, const char *link, struct dentry *hardlink) @@ -346,10 +377,18 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, struct dentry *upper; struct dentry *newdentry; int err; + struct posix_acl *acl, *default_acl; if (WARN_ON(!workdir)) return -EROFS; + if (!hardlink) { + err = posix_acl_create(dentry->d_parent->d_inode, + &stat->mode, &default_acl, &acl); + if (err) + return err; + } + err = ovl_lock_rename_workdir(workdir, upperdir); if (err) goto out; @@ -384,6 +423,17 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (err) goto out_cleanup; } + if (!hardlink) { + err = ovl_set_upper_acl(newdentry, XATTR_NAME_POSIX_ACL_ACCESS, + acl); + if (err) + goto out_cleanup; + + err = ovl_set_upper_acl(newdentry, XATTR_NAME_POSIX_ACL_DEFAULT, + default_acl); + if (err) + goto out_cleanup; + } if (!hardlink && S_ISDIR(stat->mode)) { err = ovl_set_opaque(newdentry); @@ -410,6 +460,10 @@ out_dput: out_unlock: unlock_rename(workdir, upperdir); out: + if (!hardlink) { + posix_acl_release(acl); + posix_acl_release(default_acl); + } return err; out_cleanup: @@ -950,9 +1004,9 @@ const struct inode_operations ovl_dir_inode_operations = { .permission = ovl_permission, .getattr = ovl_dir_getattr, .setxattr = generic_setxattr, - .getxattr = ovl_getxattr, + .getxattr = generic_getxattr, .listxattr = ovl_listxattr, - .removexattr = ovl_removexattr, + .removexattr = generic_removexattr, .get_acl = ovl_get_acl, .update_time = ovl_update_time, }; diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 1b885c1..c75625c 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -10,6 +10,7 @@ #include <linux/fs.h> #include <linux/slab.h> #include <linux/xattr.h> +#include <linux/posix_acl.h> #include "overlayfs.h" static int ovl_copy_up_truncate(struct dentry *dentry) @@ -191,32 +192,44 @@ static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz) return err; } -static bool ovl_is_private_xattr(const char *name) +bool ovl_is_private_xattr(const char *name) { -#define OVL_XATTR_PRE_NAME OVL_XATTR_PREFIX "." - return strncmp(name, OVL_XATTR_PRE_NAME, - sizeof(OVL_XATTR_PRE_NAME) - 1) == 0; + return strncmp(name, OVL_XATTR_PREFIX, + sizeof(OVL_XATTR_PREFIX) - 1) == 0; } -int ovl_setxattr(struct dentry *dentry, struct inode *inode, - const char *name, const void *value, - size_t size, int flags) +int ovl_xattr_set(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) { int err; - struct dentry *upperdentry; + struct path realpath; + enum ovl_path_type type = ovl_path_real(dentry, &realpath); const struct cred *old_cred; err = ovl_want_write(dentry); if (err) goto out; + if (!value && !OVL_TYPE_UPPER(type)) { + err = vfs_getxattr(realpath.dentry, name, NULL, 0); + if (err < 0) + goto out_drop_write; + } + err = ovl_copy_up(dentry); if (err) goto out_drop_write; - upperdentry = ovl_dentry_upper(dentry); + if (!OVL_TYPE_UPPER(type)) + ovl_path_upper(dentry, &realpath); + old_cred = ovl_override_creds(dentry->d_sb); - err = vfs_setxattr(upperdentry, name, value, size, flags); + if (value) + err = vfs_setxattr(realpath.dentry, name, value, size, flags); + else { + WARN_ON(flags != XATTR_REPLACE); + err = vfs_removexattr(realpath.dentry, name); + } revert_creds(old_cred); out_drop_write: @@ -225,16 +238,13 @@ out: return err; } -ssize_t ovl_getxattr(struct dentry *dentry, struct inode *inode, - const char *name, void *value, size_t size) +int ovl_xattr_get(struct dentry *dentry, const char *name, + void *value, size_t size) { struct dentry *realdentry = ovl_dentry_real(dentry); ssize_t res; const struct cred *old_cred; - if (ovl_is_private_xattr(name)) - return -ENODATA; - old_cred = ovl_override_creds(dentry->d_sb); res = vfs_getxattr(realdentry, name, value, size); revert_creds(old_cred); @@ -245,7 +255,8 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) { struct dentry *realdentry = ovl_dentry_real(dentry); ssize_t res; - int off; + size_t len; + char *s; const struct cred *old_cred; old_cred = ovl_override_creds(dentry->d_sb); @@ -255,73 +266,39 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) return res; /* filter out private xattrs */ - for (off = 0; off < res;) { - char *s = list + off; - size_t slen = strlen(s) + 1; + for (s = list, len = res; len;) { + size_t slen = strnlen(s, len) + 1; - BUG_ON(off + slen > res); + /* underlying fs providing us with an broken xattr list? */ + if (WARN_ON(slen > len)) + return -EIO; + len -= slen; if (ovl_is_private_xattr(s)) { res -= slen; - memmove(s, s + slen, res - off); + memmove(s, s + slen, len); } else { - off += slen; + s += slen; } } return res; } -int ovl_removexattr(struct dentry *dentry, const char *name) -{ - int err; - struct path realpath; - enum ovl_path_type type = ovl_path_real(dentry, &realpath); - const struct cred *old_cred; - - err = ovl_want_write(dentry); - if (err) - goto out; - - err = -ENODATA; - if (ovl_is_private_xattr(name)) - goto out_drop_write; - - if (!OVL_TYPE_UPPER(type)) { - err = vfs_getxattr(realpath.dentry, name, NULL, 0); - if (err < 0) - goto out_drop_write; - - err = ovl_copy_up(dentry); - if (err) - goto out_drop_write; - - ovl_path_upper(dentry, &realpath); - } - - old_cred = ovl_override_creds(dentry->d_sb); - err = vfs_removexattr(realpath.dentry, name); - revert_creds(old_cred); -out_drop_write: - ovl_drop_write(dentry); -out: - return err; -} - struct posix_acl *ovl_get_acl(struct inode *inode, int type) { struct inode *realinode = ovl_inode_real(inode, NULL); const struct cred *old_cred; struct posix_acl *acl; - if (!IS_POSIXACL(realinode)) + if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode)) return NULL; if (!realinode->i_op->get_acl) return NULL; old_cred = ovl_override_creds(inode->i_sb); - acl = realinode->i_op->get_acl(realinode, type); + acl = get_acl(realinode, type); revert_creds(old_cred); return acl; @@ -391,9 +368,9 @@ static const struct inode_operations ovl_file_inode_operations = { .permission = ovl_permission, .getattr = ovl_getattr, .setxattr = generic_setxattr, - .getxattr = ovl_getxattr, + .getxattr = generic_getxattr, .listxattr = ovl_listxattr, - .removexattr = ovl_removexattr, + .removexattr = generic_removexattr, .get_acl = ovl_get_acl, .update_time = ovl_update_time, }; @@ -404,9 +381,9 @@ static const struct inode_operations ovl_symlink_inode_operations = { .readlink = ovl_readlink, .getattr = ovl_getattr, .setxattr = generic_setxattr, - .getxattr = ovl_getxattr, + .getxattr = generic_getxattr, .listxattr = ovl_listxattr, - .removexattr = ovl_removexattr, + .removexattr = generic_removexattr, .update_time = ovl_update_time, }; @@ -415,6 +392,9 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode) inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_flags |= S_NOCMTIME; +#ifdef CONFIG_FS_POSIX_ACL + inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE; +#endif mode &= S_IFMT; switch (mode) { diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index e4f5c95..5813ccf 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -24,8 +24,8 @@ enum ovl_path_type { (OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type)) -#define OVL_XATTR_PREFIX XATTR_TRUSTED_PREFIX "overlay" -#define OVL_XATTR_OPAQUE OVL_XATTR_PREFIX ".opaque" +#define OVL_XATTR_PREFIX XATTR_TRUSTED_PREFIX "overlay." +#define OVL_XATTR_OPAQUE OVL_XATTR_PREFIX "opaque" #define OVL_ISUPPER_MASK 1UL @@ -179,20 +179,21 @@ int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list); void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list); void ovl_cache_free(struct list_head *list); int ovl_check_d_type_supported(struct path *realpath); +void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, + struct dentry *dentry, int level); /* inode.c */ int ovl_setattr(struct dentry *dentry, struct iattr *attr); int ovl_permission(struct inode *inode, int mask); -int ovl_setxattr(struct dentry *dentry, struct inode *inode, - const char *name, const void *value, - size_t size, int flags); -ssize_t ovl_getxattr(struct dentry *dentry, struct inode *inode, - const char *name, void *value, size_t size); +int ovl_xattr_set(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags); +int ovl_xattr_get(struct dentry *dentry, const char *name, + void *value, size_t size); ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); -int ovl_removexattr(struct dentry *dentry, const char *name); struct posix_acl *ovl_get_acl(struct inode *inode, int type); int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags); int ovl_update_time(struct inode *inode, struct timespec *ts, int flags); +bool ovl_is_private_xattr(const char *name); struct inode *ovl_new_inode(struct super_block *sb, umode_t mode); struct inode *ovl_get_inode(struct super_block *sb, struct inode *realinode); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index cf37fc7..f241b4e 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -248,7 +248,7 @@ static inline int ovl_dir_read(struct path *realpath, err = rdd->err; } while (!err && rdd->count); - if (!err && rdd->first_maybe_whiteout) + if (!err && rdd->first_maybe_whiteout && rdd->dentry) err = ovl_check_whiteouts(realpath->dentry, rdd); fput(realfile); @@ -606,3 +606,64 @@ int ovl_check_d_type_supported(struct path *realpath) return rdd.d_type_supported; } + +static void ovl_workdir_cleanup_recurse(struct path *path, int level) +{ + int err; + struct inode *dir = path->dentry->d_inode; + LIST_HEAD(list); + struct ovl_cache_entry *p; + struct ovl_readdir_data rdd = { + .ctx.actor = ovl_fill_merge, + .dentry = NULL, + .list = &list, + .root = RB_ROOT, + .is_lowest = false, + }; + + err = ovl_dir_read(path, &rdd); + if (err) + goto out; + + inode_lock_nested(dir, I_MUTEX_PARENT); + list_for_each_entry(p, &list, l_node) { + struct dentry *dentry; + + if (p->name[0] == '.') { + if (p->len == 1) + continue; + if (p->len == 2 && p->name[1] == '.') + continue; + } + dentry = lookup_one_len(p->name, path->dentry, p->len); + if (IS_ERR(dentry)) + continue; + if (dentry->d_inode) + ovl_workdir_cleanup(dir, path->mnt, dentry, level); + dput(dentry); + } + inode_unlock(dir); +out: + ovl_cache_free(&list); +} + +void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, + struct dentry *dentry, int level) +{ + int err; + + if (!d_is_dir(dentry) || level > 1) { + ovl_cleanup(dir, dentry); + return; + } + + err = ovl_do_rmdir(dir, dentry); + if (err) { + struct path path = { .mnt = mnt, .dentry = dentry }; + + inode_unlock(dir); + ovl_workdir_cleanup_recurse(&path, level + 1); + inode_lock_nested(dir, I_MUTEX_PARENT); + ovl_cleanup(dir, dentry); + } +} diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 4036132..e2a94a2 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -814,6 +814,10 @@ retry: struct kstat stat = { .mode = S_IFDIR | 0, }; + struct iattr attr = { + .ia_valid = ATTR_MODE, + .ia_mode = stat.mode, + }; if (work->d_inode) { err = -EEXIST; @@ -821,7 +825,7 @@ retry: goto out_dput; retried = true; - ovl_cleanup(dir, work); + ovl_workdir_cleanup(dir, mnt, work, 0); dput(work); goto retry; } @@ -829,6 +833,21 @@ retry: err = ovl_create_real(dir, work, &stat, NULL, NULL, true); if (err) goto out_dput; + + err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_DEFAULT); + if (err && err != -ENODATA && err != -EOPNOTSUPP) + goto out_dput; + + err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_ACCESS); + if (err && err != -ENODATA && err != -EOPNOTSUPP) + goto out_dput; + + /* Clear any inherited mode bits */ + inode_lock(work->d_inode); + err = notify_change(work, &attr, NULL); + inode_unlock(work->d_inode); + if (err) + goto out_dput; } out_unlock: inode_unlock(dir); @@ -967,10 +986,19 @@ static unsigned int ovl_split_lowerdirs(char *str) return ctr; } -static int ovl_posix_acl_xattr_set(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *inode, - const char *name, const void *value, - size_t size, int flags) +static int __maybe_unused +ovl_posix_acl_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + return ovl_xattr_get(dentry, handler->name, buffer, size); +} + +static int __maybe_unused +ovl_posix_acl_xattr_set(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { struct dentry *workdir = ovl_workdir(dentry); struct inode *realinode = ovl_inode_real(inode, NULL); @@ -998,19 +1026,22 @@ static int ovl_posix_acl_xattr_set(const struct xattr_handler *handler, posix_acl_release(acl); - return ovl_setxattr(dentry, inode, handler->name, value, size, flags); + err = ovl_xattr_set(dentry, handler->name, value, size, flags); + if (!err) + ovl_copyattr(ovl_inode_real(inode, NULL), inode); + + return err; out_acl_release: posix_acl_release(acl); return err; } -static int ovl_other_xattr_set(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *inode, - const char *name, const void *value, - size_t size, int flags) +static int ovl_own_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) { - return ovl_setxattr(dentry, inode, name, value, size, flags); + return -EPERM; } static int ovl_own_xattr_set(const struct xattr_handler *handler, @@ -1021,42 +1052,59 @@ static int ovl_own_xattr_set(const struct xattr_handler *handler, return -EPERM; } -static const struct xattr_handler ovl_posix_acl_access_xattr_handler = { +static int ovl_other_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + return ovl_xattr_get(dentry, name, buffer, size); +} + +static int ovl_other_xattr_set(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + return ovl_xattr_set(dentry, name, value, size, flags); +} + +static const struct xattr_handler __maybe_unused +ovl_posix_acl_access_xattr_handler = { .name = XATTR_NAME_POSIX_ACL_ACCESS, .flags = ACL_TYPE_ACCESS, + .get = ovl_posix_acl_xattr_get, .set = ovl_posix_acl_xattr_set, }; -static const struct xattr_handler ovl_posix_acl_default_xattr_handler = { +static const struct xattr_handler __maybe_unused +ovl_posix_acl_default_xattr_handler = { .name = XATTR_NAME_POSIX_ACL_DEFAULT, .flags = ACL_TYPE_DEFAULT, + .get = ovl_posix_acl_xattr_get, .set = ovl_posix_acl_xattr_set, }; static const struct xattr_handler ovl_own_xattr_handler = { .prefix = OVL_XATTR_PREFIX, + .get = ovl_own_xattr_get, .set = ovl_own_xattr_set, }; static const struct xattr_handler ovl_other_xattr_handler = { .prefix = "", /* catch all */ + .get = ovl_other_xattr_get, .set = ovl_other_xattr_set, }; static const struct xattr_handler *ovl_xattr_handlers[] = { +#ifdef CONFIG_FS_POSIX_ACL &ovl_posix_acl_access_xattr_handler, &ovl_posix_acl_default_xattr_handler, +#endif &ovl_own_xattr_handler, &ovl_other_xattr_handler, NULL }; -static const struct xattr_handler *ovl_xattr_noacl_handlers[] = { - &ovl_own_xattr_handler, - &ovl_other_xattr_handler, - NULL, -}; - static int ovl_fill_super(struct super_block *sb, void *data, int silent) { struct path upperpath = { NULL, NULL }; @@ -1132,7 +1180,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) err = -EINVAL; stacklen = ovl_split_lowerdirs(lowertmp); if (stacklen > OVL_MAX_STACK) { - pr_err("overlayfs: too many lower directries, limit is %d\n", + pr_err("overlayfs: too many lower directories, limit is %d\n", OVL_MAX_STACK); goto out_free_lowertmp; } else if (!ufs->config.upperdir && stacklen == 1) { @@ -1269,10 +1317,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = OVERLAYFS_SUPER_MAGIC; sb->s_op = &ovl_super_operations; - if (IS_ENABLED(CONFIG_FS_POSIX_ACL)) - sb->s_xattr = ovl_xattr_handlers; - else - sb->s_xattr = ovl_xattr_noacl_handlers; + sb->s_xattr = ovl_xattr_handlers; sb->s_root = root_dentry; sb->s_fs_info = ufs; sb->s_flags |= MS_POSIXACL; diff --git a/fs/proc/base.c b/fs/proc/base.c index e9ff186..3b792ab 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1556,18 +1556,13 @@ static const struct file_operations proc_pid_set_comm_operations = { static int proc_exe_link(struct dentry *dentry, struct path *exe_path) { struct task_struct *task; - struct mm_struct *mm; struct file *exe_file; task = get_proc_task(d_inode(dentry)); if (!task) return -ENOENT; - mm = get_task_mm(task); + exe_file = get_task_exe_file(task); put_task_struct(task); - if (!mm) - return -ENOENT; - exe_file = get_mm_exe_file(mm); - mmput(mm); if (exe_file) { *exe_path = exe_file->f_path; path_get(&exe_file->f_path); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 187d84e..f6fa99e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -581,6 +581,8 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, mss->anonymous_thp += HPAGE_PMD_SIZE; else if (PageSwapBacked(page)) mss->shmem_thp += HPAGE_PMD_SIZE; + else if (is_zone_device_page(page)) + /* pass */; else VM_BUG_ON_PAGE(1, page); smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd)); diff --git a/fs/seq_file.c b/fs/seq_file.c index 19f532e..6dc4296 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -223,8 +223,10 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) size -= n; buf += n; copied += n; - if (!m->count) + if (!m->count) { + m->from = 0; m->index++; + } if (!size) goto Done; } diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index f35523d..b803213 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -114,9 +114,15 @@ static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf, * If buf != of->prealloc_buf, we don't know how * large it is, so cannot safely pass it to ->show */ - if (pos || WARN_ON_ONCE(buf != of->prealloc_buf)) + if (WARN_ON_ONCE(buf != of->prealloc_buf)) return 0; len = ops->show(kobj, of->kn->priv, buf); + if (pos) { + if (len <= pos) + return 0; + len -= pos; + memmove(buf, buf + pos, len); + } return min(count, len); } diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index b45345d..51157da 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -370,7 +370,7 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt) p = c->gap_lebs; do { - ubifs_assert(p < c->gap_lebs + sizeof(int) * c->lst.idx_lebs); + ubifs_assert(p < c->gap_lebs + c->lst.idx_lebs); written = layout_leb_in_gaps(c, p); if (written < 0) { err = written; diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index e237811..11a0041 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -575,7 +575,8 @@ static int ubifs_xattr_get(const struct xattr_handler *handler, dbg_gen("xattr '%s', ino %lu ('%pd'), buf size %zd", name, inode->i_ino, dentry, size); - return __ubifs_getxattr(inode, name, buffer, size); + name = xattr_full_name(handler, name); + return __ubifs_getxattr(inode, name, buffer, size); } static int ubifs_xattr_set(const struct xattr_handler *handler, @@ -586,6 +587,8 @@ static int ubifs_xattr_set(const struct xattr_handler *handler, dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd", name, inode->i_ino, dentry, size); + name = xattr_full_name(handler, name); + if (value) return __ubifs_setxattr(inode, name, value, size, flags); else diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 3dd8f1d..05b5243 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2278,6 +2278,8 @@ xfs_alloc_log_agf( offsetof(xfs_agf_t, agf_btreeblks), offsetof(xfs_agf_t, agf_uuid), offsetof(xfs_agf_t, agf_rmap_blocks), + /* needed so that we don't log the whole rest of the structure: */ + offsetof(xfs_agf_t, agf_spare64), sizeof(xfs_agf_t) }; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index b5c213a..0856979 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1814,6 +1814,10 @@ xfs_btree_lookup( XFS_BTREE_STATS_INC(cur, lookup); + /* No such thing as a zero-level tree. */ + if (cur->bc_nlevels == 0) + return -EFSCORRUPTED; + block = NULL; keyno = 0; @@ -4554,15 +4558,22 @@ xfs_btree_simple_query_range( if (error) goto out; + /* Nothing? See if there's anything to the right. */ + if (!stat) { + error = xfs_btree_increment(cur, 0, &stat); + if (error) + goto out; + } + while (stat) { /* Find the record. */ error = xfs_btree_get_rec(cur, &recp, &stat); if (error || !stat) break; - cur->bc_ops->init_high_key_from_rec(&rec_key, recp); /* Skip if high_key(rec) < low_key. */ if (firstrec) { + cur->bc_ops->init_high_key_from_rec(&rec_key, recp); firstrec = false; diff = cur->bc_ops->diff_two_keys(cur, low_key, &rec_key); @@ -4571,6 +4582,7 @@ xfs_btree_simple_query_range( } /* Stop if high_key < low_key(rec). */ + cur->bc_ops->init_key_from_rec(&rec_key, recp); diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key); if (diff > 0) break; diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 054a203..c221d0e 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -194,7 +194,7 @@ xfs_defer_trans_abort( /* Abort intent items. */ list_for_each_entry(dfp, &dop->dop_pending, dfp_list) { trace_xfs_defer_pending_abort(tp->t_mountp, dfp); - if (dfp->dfp_committed) + if (!dfp->dfp_done) dfp->dfp_type->abort_intent(dfp->dfp_intent); } @@ -290,7 +290,6 @@ xfs_defer_finish( struct xfs_defer_pending *dfp; struct list_head *li; struct list_head *n; - void *done_item = NULL; void *state; int error = 0; void (*cleanup_fn)(struct xfs_trans *, void *, int); @@ -309,19 +308,11 @@ xfs_defer_finish( if (error) goto out; - /* Mark all pending intents as committed. */ - list_for_each_entry_reverse(dfp, &dop->dop_pending, dfp_list) { - if (dfp->dfp_committed) - break; - trace_xfs_defer_pending_commit((*tp)->t_mountp, dfp); - dfp->dfp_committed = true; - } - /* Log an intent-done item for the first pending item. */ dfp = list_first_entry(&dop->dop_pending, struct xfs_defer_pending, dfp_list); trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp); - done_item = dfp->dfp_type->create_done(*tp, dfp->dfp_intent, + dfp->dfp_done = dfp->dfp_type->create_done(*tp, dfp->dfp_intent, dfp->dfp_count); cleanup_fn = dfp->dfp_type->finish_cleanup; @@ -331,7 +322,7 @@ xfs_defer_finish( list_del(li); dfp->dfp_count--; error = dfp->dfp_type->finish_item(*tp, dop, li, - done_item, &state); + dfp->dfp_done, &state); if (error) { /* * Clean up after ourselves and jump out. @@ -428,8 +419,8 @@ xfs_defer_add( dfp = kmem_alloc(sizeof(struct xfs_defer_pending), KM_SLEEP | KM_NOFS); dfp->dfp_type = defer_op_types[type]; - dfp->dfp_committed = false; dfp->dfp_intent = NULL; + dfp->dfp_done = NULL; dfp->dfp_count = 0; INIT_LIST_HEAD(&dfp->dfp_work); list_add_tail(&dfp->dfp_list, &dop->dop_intake); diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index cc3981c..e96533d 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -30,8 +30,8 @@ struct xfs_defer_op_type; struct xfs_defer_pending { const struct xfs_defer_op_type *dfp_type; /* function pointers */ struct list_head dfp_list; /* pending items */ - bool dfp_committed; /* committed trans? */ void *dfp_intent; /* log intent item */ + void *dfp_done; /* log done item */ struct list_head dfp_work; /* work items */ unsigned int dfp_count; /* # extent items */ }; diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index e6a8bea..270fb5c 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -674,7 +674,8 @@ typedef struct xfs_agf { #define XFS_AGF_BTREEBLKS 0x00000800 #define XFS_AGF_UUID 0x00001000 #define XFS_AGF_RMAP_BLOCKS 0x00002000 -#define XFS_AGF_NUM_BITS 14 +#define XFS_AGF_SPARE64 0x00004000 +#define XFS_AGF_NUM_BITS 15 #define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) #define XFS_AGF_FLAGS \ @@ -691,7 +692,8 @@ typedef struct xfs_agf { { XFS_AGF_LONGEST, "LONGEST" }, \ { XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \ { XFS_AGF_UUID, "UUID" }, \ - { XFS_AGF_RMAP_BLOCKS, "RMAP_BLOCKS" } + { XFS_AGF_RMAP_BLOCKS, "RMAP_BLOCKS" }, \ + { XFS_AGF_SPARE64, "SPARE64" } /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 0e3d4f5..4aecc5f 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -583,7 +583,8 @@ xfs_sb_verify( * Only check the in progress field for the primary superblock as * mkfs.xfs doesn't clear it from secondary superblocks. */ - return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR, + return xfs_mount_validate_sb(mp, &sb, + bp->b_maps[0].bm_bn == XFS_SB_DADDR, check_version); } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 607cc29..b5b9bff 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1611,7 +1611,7 @@ xfs_wait_buftarg( */ while (percpu_counter_sum(&btp->bt_io_count)) delay(100); - drain_workqueue(btp->bt_mount->m_buf_workqueue); + flush_workqueue(btp->bt_mount->m_buf_workqueue); /* loop until there is nothing left on the lru list. */ while (list_lru_count(&btp->bt_lru)) { diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 24ef83e..fd6be45 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1574,9 +1574,16 @@ xfs_fs_fill_super( } } - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { + if (mp->m_sb.sb_rblocks) { + xfs_alert(mp, + "EXPERIMENTAL reverse mapping btree not compatible with realtime device!"); + error = -EINVAL; + goto out_filestream_unmount; + } xfs_alert(mp, "EXPERIMENTAL reverse mapping btree feature enabled. Use at your own risk!"); + } error = xfs_mountfs(mp); if (error) diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 7e88bec..d303a66 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -2295,7 +2295,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class, __entry->dev = mp ? mp->m_super->s_dev : 0; __entry->type = dfp->dfp_type->type; __entry->intent = dfp->dfp_intent; - __entry->committed = dfp->dfp_committed; + __entry->committed = dfp->dfp_done != NULL; __entry->nr = dfp->dfp_count; ), TP_printk("dev %d:%d optype %d intent %p committed %d nr %d\n", |