Diffstat (limited to 'fs'): 69 files changed, 979 insertions(+), 421 deletions(-)
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 42dd2e4..35de0c0 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -55,13 +55,13 @@ static int __init afs_get_client_UUID(void)
 	afs_uuid.time_low = uuidtime;
 	afs_uuid.time_mid = uuidtime >> 32;
 	afs_uuid.time_hi_and_version = (uuidtime >> 48) & AFS_UUID_TIMEHI_MASK;
-	afs_uuid.time_hi_and_version = AFS_UUID_VERSION_TIME;
+	afs_uuid.time_hi_and_version |= AFS_UUID_VERSION_TIME;
 	get_random_bytes(&clockseq, 2);
 	afs_uuid.clock_seq_low = clockseq;
 	afs_uuid.clock_seq_hi_and_reserved =
 		(clockseq >> 8) & AFS_UUID_CLOCKHI_MASK;
-	afs_uuid.clock_seq_hi_and_reserved = AFS_UUID_VARIANT_STD;
+	afs_uuid.clock_seq_hi_and_reserved |= AFS_UUID_VARIANT_STD;
 
 	_debug("AFS UUID: %08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x",
 	       afs_uuid.time_low,
diff --git a/fs/aio.c b/fs/aio.c
@@ -506,6 +506,8 @@ static void free_ioctx(struct work_struct *work)
 	aio_free_ring(ctx);
 	free_percpu(ctx->cpu);
+	percpu_ref_exit(&ctx->reqs);
+	percpu_ref_exit(&ctx->users);
 	kmem_cache_free(kioctx_cachep, ctx);
 }
@@ -715,8 +717,8 @@ err_ctx:
 err:
 	mutex_unlock(&ctx->ring_lock);
 	free_percpu(ctx->cpu);
-	free_percpu(ctx->reqs.pcpu_count);
-	free_percpu(ctx->users.pcpu_count);
+	percpu_ref_exit(&ctx->reqs);
+	percpu_ref_exit(&ctx->users);
 	kmem_cache_free(kioctx_cachep, ctx);
 	pr_debug("error allocating ioctx %d\n", err);
 	return ERR_PTR(err);
@@ -830,16 +832,20 @@ void exit_aio(struct mm_struct *mm)
 static void put_reqs_available(struct kioctx *ctx, unsigned nr)
 {
 	struct kioctx_cpu *kcpu;
+	unsigned long flags;
 
 	preempt_disable();
 	kcpu = this_cpu_ptr(ctx->cpu);
 
+	local_irq_save(flags);
 	kcpu->reqs_available += nr;
+
 	while (kcpu->reqs_available >= ctx->req_batch * 2) {
 		kcpu->reqs_available -= ctx->req_batch;
 		atomic_add(ctx->req_batch, &ctx->reqs_available);
 	}
 
+	local_irq_restore(flags);
 	preempt_enable();
 }
@@ -847,10 +853,12 @@ static bool get_reqs_available(struct kioctx *ctx)
 {
 	struct kioctx_cpu *kcpu;
 	bool ret = false;
+	unsigned long flags;
 
 	preempt_disable();
 	kcpu = this_cpu_ptr(ctx->cpu);
 
+	local_irq_save(flags);
 	if (!kcpu->reqs_available) {
 		int old, avail = atomic_read(&ctx->reqs_available);
@@ -869,6 +877,7 @@ static bool get_reqs_available(struct kioctx *ctx)
 	ret = true;
 	kcpu->reqs_available--;
 out:
+	local_irq_restore(flags);
 	preempt_enable();
 	return ret;
 }
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index d7bd395..1c55388 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -210,7 +210,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	int pipefd;
 	struct autofs_sb_info *sbi;
 	struct autofs_info *ino;
-	int pgrp;
+	int pgrp = 0;
 	bool pgrp_set = false;
 	int ret = -EINVAL;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 92371c4..1daea0b 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -821,7 +821,7 @@ static void free_workspace(int type, struct list_head *workspace)
 	spin_lock(workspace_lock);
 	if (*num_workspace < num_online_cpus()) {
-		list_add_tail(workspace, idle_workspace);
+		list_add(workspace, idle_workspace);
 		(*num_workspace)++;
 		spin_unlock(workspace_lock);
 		goto wake;
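The fs/aio.c hunks above make the per-CPU reqs_available counter safe against
interrupt-context callers: the plain read-modify-write on kcpu->reqs_available
could race with an IRQ handler on the same CPU, so the update is bracketed by
local_irq_save()/local_irq_restore() in addition to preempt_disable(). A minimal
sketch of that pattern, with hypothetical names (not the kernel's actual aio code):

    /* Sketch: per-CPU batch counter that IRQ context may also update. */
    struct batch_counter {
    	unsigned int pcpu;	/* per-CPU share; not IRQ-safe by itself */
    };

    static void counter_add(struct batch_counter __percpu *c, unsigned int nr,
    			    atomic_t *global, unsigned int batch)
    {
    	struct batch_counter *bc;
    	unsigned long flags;

    	preempt_disable();		/* stay on one CPU */
    	bc = this_cpu_ptr(c);
    	local_irq_save(flags);		/* an IRQ handler may run this too */
    	bc->pcpu += nr;
    	while (bc->pcpu >= batch * 2) {	/* spill excess back to global pool */
    		bc->pcpu -= batch;
    		atomic_add(batch, global);
    	}
    	local_irq_restore(flags);
    	preempt_enable();
    }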
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 2af6e66..eea26e1 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -36,6 +36,7 @@
 #include "check-integrity.h"
 #include "rcu-string.h"
 #include "dev-replace.h"
+#include "sysfs.h"
 
 static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 				       int scrub_ret);
@@ -562,6 +563,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	fs_info->fs_devices->latest_bdev = tgt_device->bdev;
 	list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
 
+	/* replace the sysfs entry */
+	btrfs_kobj_rm_device(fs_info, src_device);
+	btrfs_kobj_add_device(fs_info, tgt_device);
+
 	btrfs_rm_dev_replace_blocked(fs_info);
 
 	btrfs_rm_dev_replace_srcdev(fs_info, src_device);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8bb4aa1..08e65e9 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -369,7 +369,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
 out:
 	unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
 			     &cached_state, GFP_NOFS);
-	btrfs_tree_read_unlock_blocking(eb);
+	if (need_lock)
+		btrfs_tree_read_unlock_blocking(eb);
 	return ret;
 }
@@ -2904,7 +2905,9 @@ retry_root_backup:
 	if (ret)
 		goto fail_qgroup;
 
+	mutex_lock(&fs_info->cleaner_mutex);
 	ret = btrfs_recover_relocation(tree_root);
+	mutex_unlock(&fs_info->cleaner_mutex);
 	if (ret < 0) {
 		printk(KERN_WARNING "BTRFS: failed to recover relocation\n");
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 99c2539..813537f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5678,7 +5678,6 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
 	struct btrfs_caching_control *next;
 	struct btrfs_caching_control *caching_ctl;
 	struct btrfs_block_group_cache *cache;
-	struct btrfs_space_info *space_info;
 
 	down_write(&fs_info->commit_root_sem);
@@ -5701,9 +5700,6 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
 
 	up_write(&fs_info->commit_root_sem);
 
-	list_for_each_entry_rcu(space_info, &fs_info->space_info, list)
-		percpu_counter_set(&space_info->total_bytes_pinned, 0);
-
 	update_global_block_rsv(fs_info);
 }
@@ -5741,6 +5737,7 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
 		spin_lock(&cache->lock);
 		cache->pinned -= len;
 		space_info->bytes_pinned -= len;
+		percpu_counter_add(&space_info->total_bytes_pinned, -len);
 		if (cache->ro) {
 			space_info->bytes_readonly += len;
 			readonly = true;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0d321c2..47aceb4 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -136,19 +136,22 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
 void btrfs_update_iflags(struct inode *inode)
 {
 	struct btrfs_inode *ip = BTRFS_I(inode);
-
-	inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+	unsigned int new_fl = 0;
 
 	if (ip->flags & BTRFS_INODE_SYNC)
-		inode->i_flags |= S_SYNC;
+		new_fl |= S_SYNC;
 	if (ip->flags & BTRFS_INODE_IMMUTABLE)
-		inode->i_flags |= S_IMMUTABLE;
+		new_fl |= S_IMMUTABLE;
 	if (ip->flags & BTRFS_INODE_APPEND)
-		inode->i_flags |= S_APPEND;
+		new_fl |= S_APPEND;
 	if (ip->flags & BTRFS_INODE_NOATIME)
-		inode->i_flags |= S_NOATIME;
+		new_fl |= S_NOATIME;
 	if (ip->flags & BTRFS_INODE_DIRSYNC)
-		inode->i_flags |= S_DIRSYNC;
+		new_fl |= S_DIRSYNC;
+
+	set_mask_bits(&inode->i_flags,
+		      S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC,
+		      new_fl);
 }
 
 /*
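btrfs_update_iflags() now computes the complete new flag set first and applies
it in one shot; set_mask_bits() does a compare-and-exchange loop, so a concurrent
reader never observes the window in which all five bits were cleared but not yet
re-set. A rough sketch of what such a helper does (simplified, not the kernel's
exact implementation):

    /* Sketch: atomically replace the bits covered by @mask with @bits. */
    static void my_set_mask_bits(unsigned long *ptr, unsigned long mask,
    			         unsigned long bits)
    {
    	unsigned long old, new;

    	do {
    		old = *ptr;			/* snapshot current value */
    		new = (old & ~mask) | bits;	/* clear masked bits, set new */
    	} while (cmpxchg(ptr, old, new) != old);	/* retry if we raced */
    }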
@@ -3139,7 +3142,6 @@ out:
 static void clone_update_extent_map(struct inode *inode,
 				    const struct btrfs_trans_handle *trans,
 				    const struct btrfs_path *path,
-				    struct btrfs_file_extent_item *fi,
 				    const u64 hole_offset,
 				    const u64 hole_len)
 {
@@ -3154,7 +3156,11 @@ static void clone_update_extent_map(struct inode *inode,
 		return;
 	}
 
-	if (fi) {
+	if (path) {
+		struct btrfs_file_extent_item *fi;
+
+		fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				    struct btrfs_file_extent_item);
 		btrfs_extent_item_to_extent_map(inode, path, fi, false, em);
 		em->generation = -1;
 		if (btrfs_file_extent_type(path->nodes[0], fi) ==
@@ -3508,18 +3514,15 @@ process_slot:
 					   btrfs_item_ptr_offset(leaf, slot),
 					   size);
 			inode_add_bytes(inode, datal);
-			extent = btrfs_item_ptr(leaf, slot,
-						struct btrfs_file_extent_item);
 		}
 
 		/* If we have an implicit hole (NO_HOLES feature). */
 		if (drop_start < new_key.offset)
 			clone_update_extent_map(inode, trans,
-					path, NULL, drop_start,
+					NULL, drop_start,
 					new_key.offset - drop_start);
 
-		clone_update_extent_map(inode, trans, path,
-					extent, 0, 0);
+		clone_update_extent_map(inode, trans, path, 0, 0);
 
 		btrfs_mark_buffer_dirty(leaf);
 		btrfs_release_path(path);
@@ -3562,12 +3565,10 @@ process_slot:
 			btrfs_end_transaction(trans, root);
 			goto out;
 		}
+		clone_update_extent_map(inode, trans, NULL, last_dest_end,
+					destoff + len - last_dest_end);
 		ret = clone_finish_inode_update(trans, inode, destoff + len,
 						destoff, olen);
-		if (ret)
-			goto out;
-		clone_update_extent_map(inode, trans, path, NULL, last_dest_end,
-					destoff + len - last_dest_end);
 	}
 
 out:
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index e12441c..7187b14 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -484,8 +484,19 @@ void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
 				   log_list);
 		list_del_init(&ordered->log_list);
 		spin_unlock_irq(&log->log_extents_lock[index]);
+
+		if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
+		    !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
+			struct inode *inode = ordered->inode;
+			u64 start = ordered->file_offset;
+			u64 end = ordered->file_offset + ordered->len - 1;
+
+			WARN_ON(!inode);
+			filemap_fdatawrite_range(inode->i_mapping, start, end);
+		}
 		wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
 						   &ordered->flags));
+		btrfs_put_ordered_extent(ordered);
 		spin_lock_irq(&log->log_extents_lock[index]);
 	}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 6efd70d..9626b4a 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -54,7 +54,7 @@ static void print_extent_data_ref(struct extent_buffer *eb,
 		btrfs_extent_data_ref_count(eb, ref));
 }
 
-static void print_extent_item(struct extent_buffer *eb, int slot)
+static void print_extent_item(struct extent_buffer *eb, int slot, int type)
 {
 	struct btrfs_extent_item *ei;
 	struct btrfs_extent_inline_ref *iref;
@@ -63,7 +63,6 @@ static void print_extent_item(struct extent_buffer *eb, int slot)
 	struct btrfs_disk_key key;
 	unsigned long end;
 	unsigned long ptr;
-	int type;
 	u32 item_size = btrfs_item_size_nr(eb, slot);
 	u64 flags;
 	u64 offset;
@@ -88,7 +87,8 @@ static void print_extent_item(struct extent_buffer *eb, int slot)
 	       btrfs_extent_refs(eb, ei), btrfs_extent_generation(eb, ei),
 	       flags);
 
-	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+	if ((type == BTRFS_EXTENT_ITEM_KEY) &&
+	    flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 		struct btrfs_tree_block_info *info;
 		info = (struct btrfs_tree_block_info *)(ei + 1);
 		btrfs_tree_block_key(eb, info, &key);
@@ -223,7 +223,8 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 			       btrfs_disk_root_refs(l, ri));
 			break;
 		case BTRFS_EXTENT_ITEM_KEY:
-			print_extent_item(l, i);
+		case BTRFS_METADATA_ITEM_KEY:
+			print_extent_item(l, i, type);
 			break;
 		case BTRFS_TREE_BLOCK_REF_KEY:
 			printk(KERN_INFO "\t\ttree block backref\n");
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 4055291..4a88f07 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1956,9 +1956,10 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
 	 * pages are going to be uptodate.
 	 */
 	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
-		if (rbio->faila == stripe ||
-		    rbio->failb == stripe)
+		if (rbio->faila == stripe || rbio->failb == stripe) {
+			atomic_inc(&rbio->bbio->error);
 			continue;
+		}
 
 		for (pagenr = 0; pagenr < nr_pages; pagenr++) {
 			struct page *p;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 4662d92..8e16bca 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -522,9 +522,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 		case Opt_ssd_spread:
 			btrfs_set_and_info(root, SSD_SPREAD,
 					   "use spread ssd allocation scheme");
+			btrfs_set_opt(info->mount_opt, SSD);
 			break;
 		case Opt_nossd:
-			btrfs_clear_and_info(root, NOSSD,
+			btrfs_set_and_info(root, NOSSD,
 					     "not using ssd allocation scheme");
 			btrfs_clear_opt(info->mount_opt, SSD);
 			break;
@@ -1467,7 +1468,9 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 			goto restore;
 
 		/* recover relocation */
+		mutex_lock(&fs_info->cleaner_mutex);
 		ret = btrfs_recover_relocation(root);
+		mutex_unlock(&fs_info->cleaner_mutex);
 		if (ret)
 			goto restore;
@@ -1808,6 +1811,8 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
 	list_for_each_entry(dev, head, dev_list) {
 		if (dev->missing)
 			continue;
+		if (!dev->name)
+			continue;
 		if (!first_dev || dev->devid < first_dev->devid)
 			first_dev = dev;
 	}
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index df39458..7869936 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -605,14 +605,37 @@ static void init_feature_attrs(void)
 	}
 }
 
-static int add_device_membership(struct btrfs_fs_info *fs_info)
+int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
+			 struct btrfs_device *one_device)
+{
+	struct hd_struct *disk;
+	struct kobject *disk_kobj;
+
+	if (!fs_info->device_dir_kobj)
+		return -EINVAL;
+
+	if (one_device) {
+		disk = one_device->bdev->bd_part;
+		disk_kobj = &part_to_dev(disk)->kobj;
+
+		sysfs_remove_link(fs_info->device_dir_kobj,
+				  disk_kobj->name);
+	}
+
+	return 0;
+}
+
+int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
+			  struct btrfs_device *one_device)
 {
 	int error = 0;
 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 	struct btrfs_device *dev;
 
-	fs_info->device_dir_kobj = kobject_create_and_add("devices",
+	if (!fs_info->device_dir_kobj)
+		fs_info->device_dir_kobj = kobject_create_and_add("devices",
 						&fs_info->super_kobj);
+
 	if (!fs_info->device_dir_kobj)
 		return -ENOMEM;
@@ -623,6 +646,9 @@ static int add_device_membership(struct btrfs_fs_info *fs_info)
 		if (!dev->bdev)
 			continue;
 
+		if (one_device && one_device != dev)
+			continue;
+
 		disk = dev->bdev->bd_part;
 		disk_kobj = &part_to_dev(disk)->kobj;
@@ -666,7 +692,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
 	if (error)
 		goto failure;
 
-	error = add_device_membership(fs_info);
+	error = btrfs_kobj_add_device(fs_info, NULL);
 	if (error)
 		goto failure;
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 9ab5763..ac46df3 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -66,4 +66,8 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
 extern const char * const btrfs_feature_set_names[3];
 extern struct kobj_type space_info_ktype;
 extern struct kobj_type btrfs_raid_ktype;
+int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
+			  struct btrfs_device *one_device);
+int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
+			 struct btrfs_device *one_device);
 #endif /* _BTRFS_SYSFS_H_ */
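The new btrfs_kobj_add_device()/btrfs_kobj_rm_device() pair manages per-device
symlinks under the filesystem's sysfs directory. The underlying sysfs calls pair
up by name, roughly like this (a minimal sketch, not the btrfs code itself):

    /* Sketch: create and remove a named symlink under a parent kobject. */
    static int demo_add_link(struct kobject *dir, struct kobject *target)
    {
    	/* creates dir/<name> -> target */
    	return sysfs_create_link(dir, target, target->name);
    }

    static void demo_rm_link(struct kobject *dir, struct kobject *target)
    {
    	/* must use the same name the link was created with */
    	sysfs_remove_link(dir, target->name);
    }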
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 511839c..5f379af 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -386,11 +386,13 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
 	bool reloc_reserved = false;
 	int ret;
 
+	/* Send isn't supposed to start transactions. */
+	ASSERT(current->journal_info != (void *)BTRFS_SEND_TRANS_STUB);
+
 	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
 		return ERR_PTR(-EROFS);
 
-	if (current->journal_info &&
-	    current->journal_info != (void *)BTRFS_SEND_TRANS_STUB) {
+	if (current->journal_info) {
 		WARN_ON(type & TRANS_EXTWRITERS);
 		h = current->journal_info;
 		h->use_count++;
@@ -491,6 +493,7 @@ again:
 	smp_mb();
 	if (cur_trans->state >= TRANS_STATE_BLOCKED &&
 	    may_wait_transaction(root, type)) {
+		current->journal_info = h;
 		btrfs_commit_transaction(h, root);
 		goto again;
 	}
@@ -1615,11 +1618,6 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
 	int ret;
 
 	ret = btrfs_run_delayed_items(trans, root);
-	/*
-	 * running the delayed items may have added new refs. account
-	 * them now so that they hinder processing of more delayed refs
-	 * as little as possible.
-	 */
 	if (ret)
 		return ret;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c83b242..6cb82f6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -40,6 +40,7 @@
 #include "rcu-string.h"
 #include "math.h"
 #include "dev-replace.h"
+#include "sysfs.h"
 
 static int init_first_rw_device(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
@@ -554,12 +555,14 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 		 * This is ok to do without rcu read locked because we hold the
 		 * uuid mutex so nothing we touch in here is going to disappear.
 		 */
-		name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
-		if (!name) {
-			kfree(device);
-			goto error;
+		if (orig_dev->name) {
+			name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
+			if (!name) {
+				kfree(device);
+				goto error;
+			}
+			rcu_assign_pointer(device->name, name);
 		}
-		rcu_assign_pointer(device->name, name);
 
 		list_add(&device->dev_list, &fs_devices->devices);
 		device->fs_devices = fs_devices;
@@ -1677,8 +1680,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	if (device->bdev == root->fs_info->fs_devices->latest_bdev)
 		root->fs_info->fs_devices->latest_bdev = next_device->bdev;
 
-	if (device->bdev)
+	if (device->bdev) {
 		device->fs_devices->open_devices--;
+		/* remove sysfs entry */
+		btrfs_kobj_rm_device(root->fs_info, device);
+	}
 
 	call_rcu(&device->rcu, free_device);
@@ -2143,9 +2149,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
 	btrfs_set_super_num_devices(root->fs_info->super_copy,
 				    total_bytes + 1);
+
+	/* add sysfs device entry */
+	btrfs_kobj_add_device(root->fs_info, device);
+
 	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 
 	if (seeding_dev) {
+		char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
 		ret = init_first_rw_device(trans, root, device);
 		if (ret) {
 			btrfs_abort_transaction(trans, root, ret);
@@ -2156,6 +2167,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 			btrfs_abort_transaction(trans, root, ret);
 			goto error_trans;
 		}
+
+		/* Sprouting would change fsid of the mounted root,
+		 * so rename the fsid on the sysfs
+		 */
+		snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
+			 root->fs_info->fsid);
+		if (kobject_rename(&root->fs_info->super_kobj, fsid_buf))
+			goto error_trans;
 	} else {
 		ret = btrfs_add_device(trans, root, device);
 		if (ret) {
@@ -2205,6 +2224,7 @@ error_trans:
 	unlock_chunks(root);
 	btrfs_end_transaction(trans, root);
 	rcu_string_free(device->name);
+	btrfs_kobj_rm_device(root->fs_info, device);
 	kfree(device);
 error:
 	blkdev_put(bdev, FMODE_EXCL);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 4f19631..b67d8fc 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -136,7 +136,7 @@ static int zlib_compress_pages(struct list_head *ws,
 		if (workspace->def_strm.total_in > 8192 &&
 		    workspace->def_strm.total_in <
 		    workspace->def_strm.total_out) {
-			ret = -EIO;
+			ret = -E2BIG;
 			goto out;
 		}
 		/* we need another page for writing out.  Test this
diff --git a/fs/coredump.c b/fs/coredump.c
index 0b2528f..a93f7e6 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -306,7 +306,7 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
 	if (unlikely(nr < 0))
 		return nr;
 
-	tsk->flags = PF_DUMPCORE;
+	tsk->flags |= PF_DUMPCORE;
 	if (atomic_read(&mm->mm_users) == nr + 1)
 		goto done;
 	/*
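The coredump fix is the same bug class as the AFS UUID hunk at the top of this
diff: plain assignment into a flag word silently clobbers every other bit, while
"|=" only adds the intended one. A self-contained userspace demonstration:

    #include <stdio.h>

    #define PF_A 0x1
    #define PF_B 0x2

    int main(void)
    {
    	unsigned flags = PF_A;

    	flags = PF_B;			/* bug: PF_A is silently lost */
    	printf("%#x\n", flags);		/* prints 0x2 */

    	flags = PF_A;
    	flags |= PF_B;			/* fix: both bits survive */
    	printf("%#x\n", flags);		/* prints 0x3 */
    	return 0;
    }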
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 98040ba..17e39b0 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -71,7 +71,6 @@ struct dio_submit {
 					   been performed at the start of a
 					   write */
 	int pages_in_io;		/* approximate total IO pages */
-	size_t size;			/* total request size (doesn't change)*/
 	sector_t block_in_file;		/* Current offset into the underlying
 					   file in dio_block units. */
 	unsigned blocks_available;	/* At block_in_file.  changes */
@@ -198,9 +197,8 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
 * L1 cache.
 */
 static inline struct page *dio_get_page(struct dio *dio,
-		struct dio_submit *sdio, size_t *from, size_t *to)
+					struct dio_submit *sdio)
 {
-	int n;
 	if (dio_pages_present(sdio) == 0) {
 		int ret;
 
@@ -209,10 +207,7 @@ static inline struct page *dio_get_page(struct dio *dio,
 			return ERR_PTR(ret);
 		BUG_ON(dio_pages_present(sdio) == 0);
 	}
-	n = sdio->head++;
-	*from = n ? 0 : sdio->from;
-	*to = (n == sdio->tail - 1) ? sdio->to : PAGE_SIZE;
-	return dio->pages[n];
+	return dio->pages[sdio->head];
 }
 
 /**
@@ -911,11 +906,15 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
 	while (sdio->block_in_file < sdio->final_block_in_request) {
 		struct page *page;
 		size_t from, to;
-		page = dio_get_page(dio, sdio, &from, &to);
+
+		page = dio_get_page(dio, sdio);
 		if (IS_ERR(page)) {
 			ret = PTR_ERR(page);
 			goto out;
 		}
+		from = sdio->head ? 0 : sdio->from;
+		to = (sdio->head == sdio->tail - 1) ? sdio->to : PAGE_SIZE;
+		sdio->head++;
 
 		while (from < to) {
 			unsigned this_chunk_bytes;	/* # of bytes mapped */
@@ -1104,7 +1103,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	unsigned blkbits = i_blkbits;
 	unsigned blocksize_mask = (1 << blkbits) - 1;
 	ssize_t retval = -EINVAL;
-	loff_t end = offset + iov_iter_count(iter);
+	size_t count = iov_iter_count(iter);
+	loff_t end = offset + count;
 	struct dio *dio;
 	struct dio_submit sdio = { 0, };
 	struct buffer_head map_bh = { 0, };
@@ -1287,10 +1287,9 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	 */
 	BUG_ON(retval == -EIOCBQUEUED);
 	if (dio->is_async && retval == 0 && dio->result &&
-	    ((rw == READ) || (dio->result == sdio.size)))
+	    (rw == READ || dio->result == count))
 		retval = -EIOCBQUEUED;
-
-	if (retval != -EIOCBQUEUED)
+	else
 		dio_await_completion(dio);
 
 	if (drop_refcount(dio) == 0) {
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 0762d14..fca3820 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -194,7 +194,16 @@ static void ext4_init_block_bitmap(struct super_block *sb,
 	if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
 		ext4_error(sb, "Checksum bad for group %u", block_group);
 		grp = ext4_get_group_info(sb, block_group);
+		if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+			percpu_counter_sub(&sbi->s_freeclusters_counter,
+					   grp->bb_free);
 		set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+		if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+			int count;
+			count = ext4_free_inodes_count(sb, gdp);
+			percpu_counter_sub(&sbi->s_freeinodes_counter,
+					   count);
+		}
 		set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
 		return;
 	}
@@ -359,6 +368,7 @@ static void ext4_validate_block_bitmap(struct super_block *sb,
 {
 	ext4_fsblk_t blk;
 	struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 	if (buffer_verified(bh))
 		return;
@@ -369,6 +379,9 @@ static void ext4_validate_block_bitmap(struct super_block *sb,
 		ext4_unlock_group(sb, block_group);
 		ext4_error(sb, "bg %u: block %llu: invalid block bitmap",
 			   block_group, blk);
+		if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+			percpu_counter_sub(&sbi->s_freeclusters_counter,
+					   grp->bb_free);
 		set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
 		return;
 	}
@@ -376,6 +389,9 @@ static void ext4_validate_block_bitmap(struct super_block *sb,
 		       desc, bh))) {
 		ext4_unlock_group(sb, block_group);
 		ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
+		if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+			percpu_counter_sub(&sbi->s_freeclusters_counter,
+					   grp->bb_free);
 		set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
 		return;
 	}
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 3f5c188..0b7e28e 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -966,10 +966,10 @@ retry:
 			continue;
 		}
 
-		if (ei->i_es_lru_nr == 0 || ei == locked_ei)
+		if (ei->i_es_lru_nr == 0 || ei == locked_ei ||
+		    !write_trylock(&ei->i_es_lock))
 			continue;
 
-		write_lock(&ei->i_es_lock);
 		shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan);
 		if (ei->i_es_lru_nr == 0)
 			list_del_init(&ei->i_es_lru);
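The extents_status hunk switches the shrinker from write_lock() to
write_trylock(): a reclaim pass should skip contended objects rather than stall
on them, since there is always another candidate to scan. A userspace sketch of
the same idea with pthreads (illustrative only, hypothetical types):

    #include <pthread.h>

    /* Sketch: reclaim pass that skips objects whose lock is contended. */
    struct obj {
    	pthread_rwlock_t lock;
    	/* ... cached state ... */
    };

    static int try_reclaim(struct obj *o)
    {
    	if (pthread_rwlock_trywrlock(&o->lock) != 0)
    		return 0;	/* busy: skip it, a later pass will retry */
    	/* ... drop the cached state ... */
    	pthread_rwlock_unlock(&o->lock);
    	return 1;
    }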
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 0ee59a6..5b87fc3 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -71,6 +71,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
 				       struct ext4_group_desc *gdp)
 {
 	struct ext4_group_info *grp;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	J_ASSERT_BH(bh, buffer_locked(bh));
 
 	/* If checksum is bad mark all blocks and inodes use to prevent
@@ -78,7 +79,16 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
 	if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
 		ext4_error(sb, "Checksum bad for group %u", block_group);
 		grp = ext4_get_group_info(sb, block_group);
+		if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+			percpu_counter_sub(&sbi->s_freeclusters_counter,
+					   grp->bb_free);
 		set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+		if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+			int count;
+			count = ext4_free_inodes_count(sb, gdp);
+			percpu_counter_sub(&sbi->s_freeinodes_counter,
+					   count);
+		}
 		set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
 		return 0;
 	}
@@ -116,6 +126,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 	struct buffer_head *bh = NULL;
 	ext4_fsblk_t bitmap_blk;
 	struct ext4_group_info *grp;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 	desc = ext4_get_group_desc(sb, block_group, NULL);
 	if (!desc)
@@ -185,6 +196,12 @@ verify:
 		ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
 			   "inode_bitmap = %llu", block_group, bitmap_blk);
 		grp = ext4_get_group_info(sb, block_group);
+		if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+			int count;
+			count = ext4_free_inodes_count(sb, desc);
+			percpu_counter_sub(&sbi->s_freeinodes_counter,
+					   count);
+		}
 		set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
 		return NULL;
 	}
@@ -321,6 +338,12 @@ out:
 			fatal = err;
 	} else {
 		ext4_error(sb, "bit already cleared for inode %lu", ino);
+		if (gdp && !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+			int count;
+			count = ext4_free_inodes_count(sb, gdp);
+			percpu_counter_sub(&sbi->s_freeinodes_counter,
+					   count);
+		}
 		set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
 	}
@@ -851,6 +874,13 @@ got:
 		goto out;
 	}
 
+	BUFFER_TRACE(group_desc_bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, group_desc_bh);
+	if (err) {
+		ext4_std_error(sb, err);
+		goto out;
+	}
+
 	/* We may have to initialize the block bitmap if it isn't already */
 	if (ext4_has_group_desc_csum(sb) &&
 	    gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
@@ -887,13 +917,6 @@ got:
 		}
 	}
 
-	BUFFER_TRACE(group_desc_bh, "get_write_access");
-	err = ext4_journal_get_write_access(handle, group_desc_bh);
-	if (err) {
-		ext4_std_error(sb, err);
-		goto out;
-	}
-
 	/* Update the relevant bg descriptor fields */
 	if (ext4_has_group_desc_csum(sb)) {
 		int free;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 8a57e9f..fd69da1 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -389,7 +389,13 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
 	return 0;
 failed:
 	for (; i >= 0; i--) {
-		if (i != indirect_blks && branch[i].bh)
+		/*
+		 * We want to ext4_forget() only freshly allocated indirect
+		 * blocks. Buffer for new_blocks[i-1] is at branch[i].bh and
+		 * buffer at branch[0].bh is indirect block / inode already
+		 * existing before ext4_alloc_branch() was called.
+		 */
+		if (i > 0 && i != indirect_blks && branch[i].bh)
 			ext4_forget(handle, 1, inode, branch[i].bh,
 				    branch[i].bh->b_blocknr);
 		ext4_free_blocks(handle, inode, NULL, new_blocks[i],
@@ -1310,16 +1316,24 @@ static int free_hole_blocks(handle_t *handle, struct inode *inode,
 		blk = *i_data;
 		if (level > 0) {
 			ext4_lblk_t first2;
+			ext4_lblk_t count2;
+
 			bh = sb_bread(inode->i_sb, le32_to_cpu(blk));
 			if (!bh) {
 				EXT4_ERROR_INODE_BLOCK(inode, le32_to_cpu(blk),
 						       "Read failure");
 				return -EIO;
 			}
-			first2 = (first > offset) ? first - offset : 0;
+			if (first > offset) {
+				first2 = first - offset;
+				count2 = count;
+			} else {
+				first2 = 0;
+				count2 = count - (offset - first);
+			}
 			ret = free_hole_blocks(handle, inode, bh,
 					       (__le32 *)bh->b_data, level - 1,
-					       first2, count - offset,
+					       first2, count2,
 					       inode->i_sb->s_blocksize >> 2);
 			if (ret) {
 				brelse(bh);
@@ -1329,8 +1343,8 @@ static int free_hole_blocks(handle_t *handle, struct inode *inode,
 		if (level == 0 ||
 		    (bh && all_zeroes((__le32 *)bh->b_data,
 				      (__le32 *)bh->b_data + addr_per_block))) {
-			ext4_free_data(handle, inode, parent_bh, &blk, &blk+1);
-			*i_data = 0;
+			ext4_free_data(handle, inode, parent_bh,
+				       i_data, i_data + 1);
 		}
 		brelse(bh);
 		bh = NULL;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 59e3162..2dcb936 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -722,6 +722,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
 				void *buddy, void *bitmap, ext4_group_t group)
 {
 	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
 	ext4_grpblk_t i = 0;
 	ext4_grpblk_t first;
@@ -751,14 +752,17 @@ void ext4_mb_generate_buddy(struct super_block *sb,
 
 	if (free != grp->bb_free) {
 		ext4_grp_locked_error(sb, group, 0, 0,
-				      "%u clusters in bitmap, %u in gd; "
-				      "block bitmap corrupt.",
+				      "block bitmap and bg descriptor "
+				      "inconsistent: %u vs %u free clusters",
 				      free, grp->bb_free);
 		/*
 		 * If we intend to continue, we consider group descriptor
 		 * corrupt and update bb_free using bitmap value
 		 */
 		grp->bb_free = free;
+		if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+			percpu_counter_sub(&sbi->s_freeclusters_counter,
+					   grp->bb_free);
 		set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
 	}
 	mb_set_largest_free_order(sb, grp);
@@ -1431,6 +1435,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 	right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
 
 	if (unlikely(block != -1)) {
+		struct ext4_sb_info *sbi = EXT4_SB(sb);
 		ext4_fsblk_t blocknr;
 
 		blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
@@ -1441,6 +1446,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 				      "freeing already freed block "
 				      "(bit %u); block bitmap corrupt.",
 				      block);
+		if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))
+			percpu_counter_sub(&sbi->s_freeclusters_counter,
+					   e4b->bd_info->bb_free);
 		/* Mark the block group as corrupt. */
 		set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
 			&e4b->bd_info->bb_state);
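Several ext4 hunks in this series follow the same recipe: the moment a group is
flagged corrupt, its free-block or free-inode contribution is subtracted from
the filesystem-wide percpu counter, and the EXT4_MB_GRP_*_CORRUPT test keeps the
subtraction from ever happening twice. Condensed to its essentials (illustrative,
drawn from the hunks above):

    /* Sketch: subtract a group's share exactly once when marking it corrupt. */
    static void demo_mark_group_corrupt(struct ext4_sb_info *sbi,
    				        struct ext4_group_info *grp)
    {
    	if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))	/* first time only */
    		percpu_counter_sub(&sbi->s_freeclusters_counter,
    				   grp->bb_free);
    	set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
    }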
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index b9b9aab..6df7bc6 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1525,8 +1525,6 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 			arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
 		sbi->s_commit_interval = HZ * arg;
 	} else if (token == Opt_max_batch_time) {
-		if (arg == 0)
-			arg = EXT4_DEF_MAX_BATCH_TIME;
 		sbi->s_max_batch_time = arg;
 	} else if (token == Opt_min_batch_time) {
 		sbi->s_min_batch_time = arg;
@@ -2809,10 +2807,11 @@ static void print_daily_error_info(unsigned long arg)
 	es = sbi->s_es;
 
 	if (es->s_error_count)
-		ext4_msg(sb, KERN_NOTICE, "error count: %u",
+		/* fsck newer than v1.41.13 is needed to clean this condition. */
+		ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u",
 			 le32_to_cpu(es->s_error_count));
 	if (es->s_first_error_time) {
-		printk(KERN_NOTICE "EXT4-fs (%s): initial error at %u: %.*s:%d",
+		printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %u: %.*s:%d",
 		       sb->s_id, le32_to_cpu(es->s_first_error_time),
 		       (int) sizeof(es->s_first_error_func),
 		       es->s_first_error_func,
@@ -2826,7 +2825,7 @@ static void print_daily_error_info(unsigned long arg)
 		printk("\n");
 	}
 	if (es->s_last_error_time) {
-		printk(KERN_NOTICE "EXT4-fs (%s): last error at %u: %.*s:%d",
+		printk(KERN_NOTICE "EXT4-fs (%s): last error at time %u: %.*s:%d",
 		       sb->s_id, le32_to_cpu(es->s_last_error_time),
 		       (int) sizeof(es->s_last_error_func),
 		       es->s_last_error_func,
@@ -3880,38 +3879,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 			goto failed_mount2;
 		}
 	}
-
-	/*
-	 * set up enough so that it can read an inode,
-	 * and create new inode for buddy allocator
-	 */
-	sbi->s_gdb_count = db_count;
-	if (!test_opt(sb, NOLOAD) &&
-	    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
-		sb->s_op = &ext4_sops;
-	else
-		sb->s_op = &ext4_nojournal_sops;
-
-	ext4_ext_init(sb);
-	err = ext4_mb_init(sb);
-	if (err) {
-		ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
-			 err);
-		goto failed_mount2;
-	}
-
 	if (!ext4_check_descriptors(sb, &first_not_zeroed)) {
 		ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
-		goto failed_mount2a;
+		goto failed_mount2;
 	}
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
 		if (!ext4_fill_flex_info(sb)) {
 			ext4_msg(sb, KERN_ERR,
 				 "unable to initialize "
 				 "flex_bg meta info!");
-			goto failed_mount2a;
+			goto failed_mount2;
 		}
 
+	sbi->s_gdb_count = db_count;
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);
@@ -3946,6 +3926,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_stripe = ext4_get_stripe_size(sbi);
 	sbi->s_extent_max_zeroout_kb = 32;
 
+	/*
+	 * set up enough so that it can read an inode
+	 */
+	if (!test_opt(sb, NOLOAD) &&
+	    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
+		sb->s_op = &ext4_sops;
+	else
+		sb->s_op = &ext4_nojournal_sops;
 	sb->s_export_op = &ext4_export_ops;
 	sb->s_xattr = ext4_xattr_handlers;
 #ifdef CONFIG_QUOTA
@@ -4135,13 +4123,21 @@ no_journal:
 	if (err) {
 		ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for "
 			 "reserved pool", ext4_calculate_resv_clusters(sb));
-		goto failed_mount5;
+		goto failed_mount4a;
 	}
 
 	err = ext4_setup_system_zone(sb);
 	if (err) {
 		ext4_msg(sb, KERN_ERR, "failed to initialize system "
 			 "zone (%d)", err);
+		goto failed_mount4a;
+	}
+
+	ext4_ext_init(sb);
+	err = ext4_mb_init(sb);
+	if (err) {
+		ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
+			 err);
 		goto failed_mount5;
 	}
@@ -4218,8 +4214,11 @@ failed_mount8:
 failed_mount7:
 	ext4_unregister_li_request(sb);
 failed_mount6:
-	ext4_release_system_zone(sb);
+	ext4_mb_release(sb);
 failed_mount5:
+	ext4_ext_release(sb);
+	ext4_release_system_zone(sb);
+failed_mount4a:
 	dput(sb->s_root);
 	sb->s_root = NULL;
 failed_mount4:
@@ -4243,14 +4242,11 @@ failed_mount3:
 	percpu_counter_destroy(&sbi->s_extent_cache_cnt);
 	if (sbi->s_mmp_tsk)
 		kthread_stop(sbi->s_mmp_tsk);
-failed_mount2a:
-	ext4_mb_release(sb);
 failed_mount2:
 	for (i = 0; i < db_count; i++)
 		brelse(sbi->s_group_desc[i]);
 	ext4_kvfree(sbi->s_group_desc);
 failed_mount:
-	ext4_ext_release(sb);
 	if (sbi->s_chksum_driver)
 		crypto_free_shash(sbi->s_chksum_driver);
 	if (sbi->s_proc) {
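The ext4_fill_super() rework above is a classic goto-ladder refactor: because
ext4_mb_init() and ext4_ext_init() moved later in the sequence, their teardown
calls had to move to matching labels so that each failure exit unwinds exactly
the steps completed so far, in reverse order. The shape of the pattern, with
hypothetical steps:

    /* Sketch: error unwinding mirrors initialization order. */
    static int step_a(void) { return 0; }
    static int step_b(void) { return 0; }
    static int step_c(void) { return -1; }	/* pretend the last step fails */
    static void undo_step_a(void) { }
    static void undo_step_b(void) { }

    int demo_setup(void)
    {
    	int err;

    	err = step_a();
    	if (err)
    		goto out;
    	err = step_b();
    	if (err)
    		goto undo_a;	/* only A has succeeded so far */
    	err = step_c();
    	if (err)
    		goto undo_b;	/* B then A need undoing, in that order */
    	return 0;

    undo_b:
    	undo_step_b();
    undo_a:
    	undo_step_a();
    out:
    	return err;
    }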
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 0924521..f8cf619 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -608,8 +608,8 @@ static int __allocate_data_block(struct dnode_of_data *dn)
 * b. do not use extent cache for better performance
 * c. give the block addresses to blockdev
 */
-static int get_data_block(struct inode *inode, sector_t iblock,
-			struct buffer_head *bh_result, int create)
+static int __get_data_block(struct inode *inode, sector_t iblock,
+			struct buffer_head *bh_result, int create, bool fiemap)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	unsigned int blkbits = inode->i_sb->s_blocksize_bits;
@@ -637,7 +637,7 @@ static int get_data_block(struct inode *inode, sector_t iblock,
 			err = 0;
 		goto unlock_out;
 	}
-	if (dn.data_blkaddr == NEW_ADDR)
+	if (dn.data_blkaddr == NEW_ADDR && !fiemap)
 		goto put_out;
 
 	if (dn.data_blkaddr != NULL_ADDR) {
@@ -671,7 +671,7 @@ get_next:
 			err = 0;
 			goto unlock_out;
 		}
-		if (dn.data_blkaddr == NEW_ADDR)
+		if (dn.data_blkaddr == NEW_ADDR && !fiemap)
 			goto put_out;
 
 		end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
@@ -708,10 +708,23 @@ out:
 	return err;
 }
 
+static int get_data_block(struct inode *inode, sector_t iblock,
+			struct buffer_head *bh_result, int create)
+{
+	return __get_data_block(inode, iblock, bh_result, create, false);
+}
+
+static int get_data_block_fiemap(struct inode *inode, sector_t iblock,
+			struct buffer_head *bh_result, int create)
+{
+	return __get_data_block(inode, iblock, bh_result, create, true);
+}
+
 int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		u64 start, u64 len)
 {
-	return generic_block_fiemap(inode, fieinfo, start, len, get_data_block);
+	return generic_block_fiemap(inode, fieinfo,
+				start, len, get_data_block_fiemap);
 }
 
 static int f2fs_read_data_page(struct file *file, struct page *page)
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 966acb0..a4addd7 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -376,11 +376,11 @@ static struct page *init_inode_metadata(struct inode *inode,
 
 put_error:
 	f2fs_put_page(page, 1);
+error:
 	/* once the failed inode becomes a bad inode, i_mode is S_IFREG */
 	truncate_inode_pages(&inode->i_data, 0);
 	truncate_blocks(inode, 0);
 	remove_dirty_dir_inode(inode);
-error:
 	remove_inode_page(inode);
 	return ERR_PTR(err);
 }
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index e51c732..58df97e 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -342,9 +342,6 @@ struct f2fs_sm_info {
 	struct dirty_seglist_info *dirty_info;	/* dirty segment information */
 	struct curseg_info *curseg_array;	/* active segment information */
 
-	struct list_head wblist_head;	/* list of under-writeback pages */
-	spinlock_t wblist_lock;		/* lock for checkpoint */
-
 	block_t seg0_blkaddr;		/* block address of 0'th segment */
 	block_t main_blkaddr;		/* start block address of main area */
 	block_t ssa_blkaddr;		/* start block address of SSA area */
@@ -644,7 +641,8 @@ static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
 */
 static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
 {
-	WARN_ON((nid >= NM_I(sbi)->max_nid));
+	if (unlikely(nid < F2FS_ROOT_INO(sbi)))
+		return -EINVAL;
 	if (unlikely(nid >= NM_I(sbi)->max_nid))
 		return -EINVAL;
 	return 0;
 }
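f2fs keeps the fixed callback signature that generic_block_fiemap() expects by
pushing the behavioral knob into a bool on an internal helper and exporting two
thin wrappers. A toy version of the shape (all names hypothetical):

    /* Sketch: two fixed-signature callbacks sharing one parameterized helper. */
    enum { DEMO_NEW_ADDR = -2, DEMO_HOLE = -1 };

    static int demo_lookup(int blk)
    {
    	return blk % 2 ? DEMO_NEW_ADDR : blk;	/* toy: odd = reserved, unwritten */
    }

    static int __demo_get_block(int blk, int for_fiemap)
    {
    	int addr = demo_lookup(blk);

    	if (addr == DEMO_NEW_ADDR && !for_fiemap)
    		return DEMO_HOLE;	/* normal readers see a hole */
    	return addr;			/* fiemap still reports the extent */
    }

    static int demo_get_block(int blk)        { return __demo_get_block(blk, 0); }
    static int demo_get_block_fiemap(int blk) { return __demo_get_block(blk, 1); }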
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index c58e330..7d8b962 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -659,16 +659,19 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
 	off_start = offset & (PAGE_CACHE_SIZE - 1);
 	off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);
 
+	f2fs_lock_op(sbi);
+
 	for (index = pg_start; index <= pg_end; index++) {
 		struct dnode_of_data dn;
 
-		f2fs_lock_op(sbi);
+		if (index == pg_end && !off_end)
+			goto noalloc;
+
 		set_new_dnode(&dn, inode, NULL, NULL, 0);
 		ret = f2fs_reserve_block(&dn, index);
-		f2fs_unlock_op(sbi);
 		if (ret)
 			break;
-
+noalloc:
 		if (pg_start == pg_end)
 			new_size = offset + len;
 		else if (index == pg_start && off_start)
@@ -683,8 +686,9 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
 			i_size_read(inode) < new_size) {
 		i_size_write(inode, new_size);
 		mark_inode_dirty(inode);
-		f2fs_write_inode(inode, NULL);
+		update_inode_page(inode);
 	}
+	f2fs_unlock_op(sbi);
 
 	return ret;
 }
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index adc622c..2cf6962 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -78,6 +78,7 @@ static int do_read_inode(struct inode *inode)
 	if (check_nid_range(sbi, inode->i_ino)) {
 		f2fs_msg(inode->i_sb, KERN_ERR, "bad inode number: %lu",
 			 (unsigned long) inode->i_ino);
+		WARN_ON(1);
 		return -EINVAL;
 	}
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 9138c32..a6bdddc 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -417,9 +417,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		}
 
 		f2fs_set_link(new_dir, new_entry, new_page, old_inode);
-		down_write(&F2FS_I(old_inode)->i_sem);
-		F2FS_I(old_inode)->i_pino = new_dir->i_ino;
-		up_write(&F2FS_I(old_inode)->i_sem);
 
 		new_inode->i_ctime = CURRENT_TIME;
 		down_write(&F2FS_I(new_inode)->i_sem);
@@ -448,6 +445,10 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		}
 	}
 
+	down_write(&F2FS_I(old_inode)->i_sem);
+	file_lost_pino(old_inode);
+	up_write(&F2FS_I(old_inode)->i_sem);
+
 	old_inode->i_ctime = CURRENT_TIME;
 	mark_inode_dirty(old_inode);
 
@@ -457,9 +458,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (old_dir != new_dir) {
 		f2fs_set_link(old_inode, old_dir_entry,
 						old_dir_page, new_dir);
-		down_write(&F2FS_I(old_inode)->i_sem);
-		F2FS_I(old_inode)->i_pino = new_dir->i_ino;
-		up_write(&F2FS_I(old_inode)->i_sem);
 		update_inode_page(old_inode);
 	} else {
 		kunmap(old_dir_page);
@@ -474,7 +472,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	return 0;
 
 put_out_dir:
-	f2fs_put_page(new_page, 1);
+	kunmap(new_page);
+	f2fs_put_page(new_page, 0);
 out_dir:
 	if (old_dir_entry) {
 		kunmap(old_dir_page);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 9dfb9a0..4b697cc 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -42,6 +42,8 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
 		mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> 12;
 		res = mem_size < ((val.totalram * nm_i->ram_thresh / 100) >> 2);
 	} else if (type == DIRTY_DENTS) {
+		if (sbi->sb->s_bdi->dirty_exceeded)
+			return false;
 		mem_size = get_pages(sbi, F2FS_DIRTY_DENTS);
 		res = mem_size < ((val.totalram * nm_i->ram_thresh / 100) >> 1);
 	}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index f25f0e0..d04613d 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -272,14 +272,15 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi)
 		return -ENOMEM;
 	spin_lock_init(&fcc->issue_lock);
 	init_waitqueue_head(&fcc->flush_wait_queue);
+	sbi->sm_info->cmd_control_info = fcc;
 	fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi,
 				"f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev));
 	if (IS_ERR(fcc->f2fs_issue_flush)) {
 		err = PTR_ERR(fcc->f2fs_issue_flush);
 		kfree(fcc);
+		sbi->sm_info->cmd_control_info = NULL;
 		return err;
 	}
-	sbi->sm_info->cmd_control_info = fcc;
 
 	return err;
 }
@@ -1885,8 +1886,6 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
 
 	/* init sm info */
 	sbi->sm_info = sm_info;
-	INIT_LIST_HEAD(&sm_info->wblist_head);
-	spin_lock_init(&sm_info->wblist_lock);
 	sm_info->seg0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr);
 	sm_info->main_blkaddr = le32_to_cpu(raw_super->main_blkaddr);
 	sm_info->segment_count = le32_to_cpu(raw_super->segment_count);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index b2b1863..8f96d93 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -689,9 +689,7 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
 	struct f2fs_sb_info *sbi = F2FS_SB(sb);
 	struct inode *inode;
 
-	if (unlikely(ino < F2FS_ROOT_INO(sbi)))
-		return ERR_PTR(-ESTALE);
-	if (unlikely(ino >= NM_I(sbi)->max_nid))
+	if (check_nid_range(sbi, ino))
 		return ERR_PTR(-ESTALE);
 
 	/*
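create_flush_cmd_control() now stores the fcc pointer before kthread_run(): the
flush thread may dereference sbi->sm_info->cmd_control_info as soon as it starts
running, so the pointer must be published first, and cleared again if thread
creation fails. The ordering rule in miniature (hypothetical names):

    /* Sketch: publish shared state before starting the thread that uses it. */
    static int demo_start_worker(struct demo_ctx *c)
    {
    	c->shared = make_shared_state();	/* hypothetical helper */
    	c->task = kthread_run(worker_fn, c, "demo-worker");
    	if (IS_ERR(c->task)) {
    		int err = PTR_ERR(c->task);

    		free_shared_state(c->shared);
    		c->shared = NULL;		/* undo the publish on failure */
    		return err;
    	}
    	return 0;
    }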
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 098f97b..ca88731 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -643,9 +643,8 @@ struct fuse_copy_state {
 	unsigned long seglen;
 	unsigned long addr;
 	struct page *pg;
-	void *mapaddr;
-	void *buf;
 	unsigned len;
+	unsigned offset;
 	unsigned move_pages:1;
 };
@@ -666,23 +665,17 @@ static void fuse_copy_finish(struct fuse_copy_state *cs)
 	if (cs->currbuf) {
 		struct pipe_buffer *buf = cs->currbuf;
 
-		if (!cs->write) {
-			kunmap_atomic(cs->mapaddr);
-		} else {
-			kunmap_atomic(cs->mapaddr);
+		if (cs->write)
 			buf->len = PAGE_SIZE - cs->len;
-		}
 		cs->currbuf = NULL;
-		cs->mapaddr = NULL;
-	} else if (cs->mapaddr) {
-		kunmap_atomic(cs->mapaddr);
+	} else if (cs->pg) {
 		if (cs->write) {
 			flush_dcache_page(cs->pg);
 			set_page_dirty_lock(cs->pg);
 		}
 		put_page(cs->pg);
-		cs->mapaddr = NULL;
 	}
+	cs->pg = NULL;
 }
 
 /*
@@ -691,7 +684,7 @@ static void fuse_copy_finish(struct fuse_copy_state *cs)
 */
 static int fuse_copy_fill(struct fuse_copy_state *cs)
 {
-	unsigned long offset;
+	struct page *page;
 	int err;
 
 	unlock_request(cs->fc, cs->req);
@@ -706,14 +699,12 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
 			BUG_ON(!cs->nr_segs);
 			cs->currbuf = buf;
-			cs->mapaddr = kmap_atomic(buf->page);
+			cs->pg = buf->page;
+			cs->offset = buf->offset;
 			cs->len = buf->len;
-			cs->buf = cs->mapaddr + buf->offset;
 			cs->pipebufs++;
 			cs->nr_segs--;
 		} else {
-			struct page *page;
-
 			if (cs->nr_segs == cs->pipe->buffers)
 				return -EIO;
@@ -726,8 +717,8 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
 			buf->len = 0;
 
 			cs->currbuf = buf;
-			cs->mapaddr = kmap_atomic(page);
-			cs->buf = cs->mapaddr;
+			cs->pg = page;
+			cs->offset = 0;
 			cs->len = PAGE_SIZE;
 			cs->pipebufs++;
 			cs->nr_segs++;
@@ -740,14 +731,13 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
 			cs->iov++;
 			cs->nr_segs--;
 		}
-		err = get_user_pages_fast(cs->addr, 1, cs->write, &cs->pg);
+		err = get_user_pages_fast(cs->addr, 1, cs->write, &page);
 		if (err < 0)
 			return err;
 		BUG_ON(err != 1);
-		offset = cs->addr % PAGE_SIZE;
-		cs->mapaddr = kmap_atomic(cs->pg);
-		cs->buf = cs->mapaddr + offset;
-		cs->len = min(PAGE_SIZE - offset, cs->seglen);
+		cs->pg = page;
+		cs->offset = cs->addr % PAGE_SIZE;
+		cs->len = min(PAGE_SIZE - cs->offset, cs->seglen);
 		cs->seglen -= cs->len;
 		cs->addr += cs->len;
 	}
@@ -760,15 +750,20 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
 {
 	unsigned ncpy = min(*size, cs->len);
 	if (val) {
+		void *pgaddr = kmap_atomic(cs->pg);
+		void *buf = pgaddr + cs->offset;
+
 		if (cs->write)
-			memcpy(cs->buf, *val, ncpy);
+			memcpy(buf, *val, ncpy);
 		else
-			memcpy(*val, cs->buf, ncpy);
+			memcpy(*val, buf, ncpy);
+
+		kunmap_atomic(pgaddr);
 		*val += ncpy;
 	}
 	*size -= ncpy;
 	cs->len -= ncpy;
-	cs->buf += ncpy;
+	cs->offset += ncpy;
 	return ncpy;
 }
@@ -874,8 +869,8 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
 out_fallback_unlock:
 	unlock_page(newpage);
 out_fallback:
-	cs->mapaddr = kmap_atomic(buf->page);
-	cs->buf = cs->mapaddr + buf->offset;
+	cs->pg = buf->page;
+	cs->offset = buf->offset;
 
 	err = lock_request(cs->fc, cs->req);
 	if (err)
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 4219835..0c60482 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -198,7 +198,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
 	inode = ACCESS_ONCE(entry->d_inode);
 	if (inode && is_bad_inode(inode))
 		goto invalid;
-	else if (fuse_dentry_time(entry) < get_jiffies_64()) {
+	else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) ||
+		 (flags & LOOKUP_REVAL)) {
 		int err;
 		struct fuse_entry_out outarg;
 		struct fuse_req *req;
@@ -814,13 +815,6 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
 	return err;
 }
 
-static int fuse_rename(struct inode *olddir, struct dentry *oldent,
-		       struct inode *newdir, struct dentry *newent)
-{
-	return fuse_rename_common(olddir, oldent, newdir, newent, 0,
-				  FUSE_RENAME, sizeof(struct fuse_rename_in));
-}
-
 static int fuse_rename2(struct inode *olddir, struct dentry *oldent,
 			struct inode *newdir, struct dentry *newent,
 			unsigned int flags)
@@ -831,17 +825,30 @@ static int fuse_rename2(struct inode *olddir, struct dentry *oldent,
 	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
 		return -EINVAL;
 
-	if (fc->no_rename2 || fc->minor < 23)
-		return -EINVAL;
+	if (flags) {
+		if (fc->no_rename2 || fc->minor < 23)
+			return -EINVAL;
 
-	err = fuse_rename_common(olddir, oldent, newdir, newent, flags,
-				 FUSE_RENAME2, sizeof(struct fuse_rename2_in));
-	if (err == -ENOSYS) {
-		fc->no_rename2 = 1;
-		err = -EINVAL;
+		err = fuse_rename_common(olddir, oldent, newdir, newent, flags,
+					 FUSE_RENAME2,
+					 sizeof(struct fuse_rename2_in));
+		if (err == -ENOSYS) {
+			fc->no_rename2 = 1;
+			err = -EINVAL;
+		}
+	} else {
+		err = fuse_rename_common(olddir, oldent, newdir, newent, 0,
+					 FUSE_RENAME,
+					 sizeof(struct fuse_rename_in));
 	}
+
 	return err;
+}
 
+static int fuse_rename(struct inode *olddir, struct dentry *oldent,
+		       struct inode *newdir, struct dentry *newent)
+{
+	return fuse_rename2(olddir, oldent, newdir, newent, 0);
 }
 
 static int fuse_link(struct dentry *entry, struct inode *newdir,
@@ -985,7 +992,7 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat,
 	int err;
 	bool r;
 
-	if (fi->i_time < get_jiffies_64()) {
+	if (time_before64(fi->i_time, get_jiffies_64())) {
 		r = true;
 		err = fuse_do_getattr(inode, stat, file);
 	} else {
@@ -1171,7 +1178,7 @@ static int fuse_permission(struct inode *inode, int mask)
 	    ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) {
 		struct fuse_inode *fi = get_fuse_inode(inode);
 
-		if (fi->i_time < get_jiffies_64()) {
+		if (time_before64(fi->i_time, get_jiffies_64())) {
 			refreshed = true;
 
 			err = fuse_perm_getattr(inode, mask);
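Three fuse/dir.c hunks replace raw "<" comparisons of 64-bit jiffies timestamps
with time_before64(), which stays correct across counter wraparound by looking
at the sign of the difference. The idiom, demonstrated in userspace:

    /* Sketch: wraparound-safe time comparison (the time_before64() idea). */
    #include <stdint.h>
    #include <stdio.h>

    static int before64(uint64_t a, uint64_t b)
    {
    	return (int64_t)(a - b) < 0;	/* signed difference survives wrap */
    }

    int main(void)
    {
    	uint64_t near_wrap = UINT64_MAX - 5, after_wrap = 3;

    	printf("%d\n", before64(near_wrap, after_wrap));	/* 1: correct */
    	printf("%d\n", near_wrap < after_wrap);			/* 0: raw < is fooled */
    	return 0;
    }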
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 6e16dad..40ac262 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1687,7 +1687,7 @@ static int fuse_writepage_locked(struct page *page)
 	error = -EIO;
 	req->ff = fuse_write_file_get(fc, fi);
 	if (!req->ff)
-		goto err_free;
+		goto err_nofile;
 
 	fuse_write_fill(req, req->ff, page_offset(page), 0);
@@ -1715,6 +1715,8 @@ static int fuse_writepage_locked(struct page *page)
 
 	return 0;
 
+err_nofile:
+	__free_page(tmp_page);
 err_free:
 	fuse_request_free(req);
 err:
@@ -1955,8 +1957,8 @@ static int fuse_writepages(struct address_space *mapping,
 	data.ff = NULL;
 
 	err = -ENOMEM;
-	data.orig_pages = kzalloc(sizeof(struct page *) *
-				  FUSE_MAX_PAGES_PER_REQ,
+	data.orig_pages = kcalloc(FUSE_MAX_PAGES_PER_REQ,
+				  sizeof(struct page *),
 				  GFP_NOFS);
 	if (!data.orig_pages)
 		goto out;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 754dcf2..03246cd 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -478,6 +478,17 @@ static const match_table_t tokens = {
 	{OPT_ERR,			NULL}
 };
 
+static int fuse_match_uint(substring_t *s, unsigned int *res)
+{
+	int err = -ENOMEM;
+	char *buf = match_strdup(s);
+	if (buf) {
+		err = kstrtouint(buf, 10, res);
+		kfree(buf);
+	}
+	return err;
+}
+
 static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
 {
 	char *p;
@@ -488,6 +499,7 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
 	while ((p = strsep(&opt, ",")) != NULL) {
 		int token;
 		int value;
+		unsigned uv;
 		substring_t args[MAX_OPT_ARGS];
 		if (!*p)
 			continue;
@@ -511,18 +523,18 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
 			break;
 
 		case OPT_USER_ID:
-			if (match_int(&args[0], &value))
+			if (fuse_match_uint(&args[0], &uv))
 				return 0;
-			d->user_id = make_kuid(current_user_ns(), value);
+			d->user_id = make_kuid(current_user_ns(), uv);
 			if (!uid_valid(d->user_id))
 				return 0;
 			d->user_id_present = 1;
 			break;
 
 		case OPT_GROUP_ID:
-			if (match_int(&args[0], &value))
+			if (fuse_match_uint(&args[0], &uv))
 				return 0;
-			d->group_id = make_kgid(current_user_ns(), value);
+			d->group_id = make_kgid(current_user_ns(), uv);
 			if (!gid_valid(d->group_id))
 				return 0;
 			d->group_id_present = 1;
@@ -895,9 +907,6 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 				fc->writeback_cache = 1;
 			if (arg->time_gran && arg->time_gran <= 1000000000)
 				fc->sb->s_time_gran = arg->time_gran;
-			else
-				fc->sb->s_time_gran = 1000000000;
-
 		} else {
 			ra_pages = fc->max_read / PAGE_CACHE_SIZE;
 			fc->no_lock = 1;
@@ -926,7 +935,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
 		FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ |
 		FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA |
 		FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO |
-		FUSE_WRITEBACK_CACHE;
+		FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT;
 	req->in.h.opcode = FUSE_INIT;
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(*arg);
@@ -1006,7 +1015,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 
 	sb->s_flags &= ~(MS_NOSEC | MS_I_VERSION);
 
-	if (!parse_fuse_opt((char *) data, &d, is_bdev))
+	if (!parse_fuse_opt(data, &d, is_bdev))
 		goto err;
 
 	if (is_bdev) {
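fuse_match_uint() exists because match_int() parses into a signed int, so UID or
GID values above INT_MAX would be rejected or misread; duplicating the substring
and handing it to kstrtouint() keeps the full unsigned range. A userspace
equivalent of the same validation:

    #include <errno.h>
    #include <limits.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Sketch: parse a full-range unsigned int, rejecting junk and overflow. */
    static int parse_uint(const char *s, unsigned int *res)
    {
    	char *end;
    	unsigned long v;

    	errno = 0;
    	v = strtoul(s, &end, 10);
    	if (errno || *end != '\0' || v > UINT_MAX)
    		return -EINVAL;
    	*res = (unsigned int)v;
    	return 0;
    }

    int main(void)
    {
    	unsigned int uid;

    	if (parse_uint("4294967294", &uid) == 0)	/* > INT_MAX, still valid */
    		printf("uid=%u\n", uid);
    	return 0;
    }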
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 4fc3a30..26b3f95 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -981,7 +981,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
 	int error = 0;
 
 	state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
-	flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE;
+	flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT;
 
 	mutex_lock(&fp->f_fl_mutex);
@@ -991,7 +991,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
 			goto out;
 		flock_lock_file_wait(file,
 				     &(struct file_lock){.fl_type = F_UNLCK});
-		gfs2_glock_dq_wait(fl_gh);
+		gfs2_glock_dq(fl_gh);
 		gfs2_holder_reinit(state, flags, fl_gh);
 	} else {
 		error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr,
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index c355f73..ee4e04f 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -731,14 +731,14 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 		cachep = gfs2_glock_aspace_cachep;
 	else
 		cachep = gfs2_glock_cachep;
-	gl = kmem_cache_alloc(cachep, GFP_KERNEL);
+	gl = kmem_cache_alloc(cachep, GFP_NOFS);
 	if (!gl)
 		return -ENOMEM;
 
 	memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb));
 
 	if (glops->go_flags & GLOF_LVB) {
-		gl->gl_lksb.sb_lvbptr = kzalloc(GFS2_MIN_LVB_SIZE, GFP_KERNEL);
+		gl->gl_lksb.sb_lvbptr = kzalloc(GFS2_MIN_LVB_SIZE, GFP_NOFS);
 		if (!gl->gl_lksb.sb_lvbptr) {
 			kmem_cache_free(cachep, gl);
 			return -ENOMEM;
@@ -1404,12 +1404,16 @@ __acquires(&lru_lock)
 		gl = list_entry(list->next, struct gfs2_glock, gl_lru);
 		list_del_init(&gl->gl_lru);
 		if (!spin_trylock(&gl->gl_spin)) {
+add_back_to_lru:
 			list_add(&gl->gl_lru, &lru_list);
 			atomic_inc(&lru_count);
 			continue;
 		}
+		if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
+			spin_unlock(&gl->gl_spin);
+			goto add_back_to_lru;
+		}
 		clear_bit(GLF_LRU, &gl->gl_flags);
-		spin_unlock(&lru_lock);
 		gl->gl_lockref.count++;
 		if (demote_ok(gl))
 			handle_callback(gl, LM_ST_UNLOCKED, 0, false);
@@ -1417,7 +1421,7 @@ __acquires(&lru_lock)
 		if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
 			gl->gl_lockref.count--;
 		spin_unlock(&gl->gl_spin);
-		spin_lock(&lru_lock);
+		cond_resched_lock(&lru_lock);
 	}
 }
@@ -1442,7 +1446,7 @@ static long gfs2_scan_glock_lru(int nr)
 		gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru);
 
 		/* Test for being demotable */
-		if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
+		if (!test_bit(GLF_LOCK, &gl->gl_flags)) {
 			list_move(&gl->gl_lru, &dispose);
 			atomic_dec(&lru_count);
 			freed++;
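The gfs2_glock_get() hunk swaps GFP_KERNEL for GFP_NOFS: glocks can be allocated
on paths reachable from memory reclaim, and a GFP_KERNEL allocation there could
recurse back into filesystem code and deadlock. GFP_NOFS forbids that re-entry.
A sketch of the call site shape (demo_lock_cachep is hypothetical):

    /* Sketch: allocation on a path that reclaim can re-enter. */
    static struct demo_lock *demo_lock_alloc(void)
    {
    	/*
    	 * GFP_KERNEL may trigger direct reclaim, which may call back into
    	 * this filesystem's shrinkers or writeback while we hold fs locks.
    	 * GFP_NOFS forbids that re-entry, at the cost of trying less hard.
    	 */
    	return kmem_cache_alloc(demo_lock_cachep, GFP_NOFS);
    }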
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index fc11007..2ffc67d 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -234,8 +234,8 @@ static void inode_go_sync(struct gfs2_glock *gl)
 * inode_go_inval - prepare a inode glock to be released
 * @gl: the glock
 * @flags:
- *
- * Normally we invlidate everything, but if we are moving into
+ *
+ * Normally we invalidate everything, but if we are moving into
 * LM_ST_DEFERRED from LM_ST_SHARED or LM_ST_EXCLUSIVE then we
 * can keep hold of the metadata, since it won't have changed.
 *
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 91f274d..4fafea1 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -1036,8 +1036,8 @@ static int set_recover_size(struct gfs2_sbd *sdp, struct dlm_slot *slots,
 
 	new_size = old_size + RECOVER_SIZE_INC;
 
-	submit = kzalloc(new_size * sizeof(uint32_t), GFP_NOFS);
-	result = kzalloc(new_size * sizeof(uint32_t), GFP_NOFS);
+	submit = kcalloc(new_size, sizeof(uint32_t), GFP_NOFS);
+	result = kcalloc(new_size, sizeof(uint32_t), GFP_NOFS);
 	if (!submit || !result) {
 		kfree(submit);
 		kfree(result);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index db629d1..f4cb9c0 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -337,7 +337,7 @@ static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len)
 /**
 * gfs2_free_extlen - Return extent length of free blocks
- * @rbm: Starting position
+ * @rrbm: Starting position
 * @len: Max length to check
 *
 * Starting at the block specified by the rbm, see how many free blocks
@@ -2522,7 +2522,7 @@ void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state)
 
 /**
 * gfs2_rlist_free - free a resource group list
- * @list: the list of resource groups
+ * @rlist: the list of resource groups
 *
 */
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 38cfcf5..6f0f590 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1588,9 +1588,12 @@ int jbd2_journal_stop(handle_t *handle)
 	 * to perform a synchronous write.  We do this to detect the
 	 * case where a single process is doing a stream of sync
 	 * writes.  No point in waiting for joiners in that case.
+	 *
+	 * Setting max_batch_time to 0 disables this completely.
 	 */
 	pid = current->pid;
-	if (handle->h_sync && journal->j_last_sync_writer != pid) {
+	if (handle->h_sync && journal->j_last_sync_writer != pid &&
+	    journal->j_max_batch_time) {
 		u64 commit_time, trans_time;
 
 		journal->j_last_sync_writer = pid;
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index e3d37f6..d895b4b 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -39,6 +39,19 @@ struct kernfs_open_node {
 	struct list_head	files; /* goes through kernfs_open_file.list */
 };
 
+/*
+ * kernfs_notify() may be called from any context and bounces notifications
+ * through a work item. To minimize space overhead in kernfs_node, the
+ * pending queue is implemented as a singly linked list of kernfs_nodes.
+ * The list is terminated with the self pointer so that whether a
+ * kernfs_node is on the list or not can be determined by testing the next
+ * pointer for NULL.
+ */
+#define KERNFS_NOTIFY_EOL		((void *)&kernfs_notify_list)
+
+static DEFINE_SPINLOCK(kernfs_notify_lock);
+static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL;
+
 static struct kernfs_open_file *kernfs_of(struct file *file)
 {
 	return ((struct seq_file *)file->private_data)->private;
@@ -783,24 +796,25 @@ static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait)
 	return DEFAULT_POLLMASK|POLLERR|POLLPRI;
 }
 
-/**
- * kernfs_notify - notify a kernfs file
- * @kn: file to notify
- *
- * Notify @kn such that poll(2) on @kn wakes up.
- */
-void kernfs_notify(struct kernfs_node *kn)
+static void kernfs_notify_workfn(struct work_struct *work)
 {
-	struct kernfs_root *root = kernfs_root(kn);
+	struct kernfs_node *kn;
 	struct kernfs_open_node *on;
 	struct kernfs_super_info *info;
-	unsigned long flags;
-
-	if (WARN_ON(kernfs_type(kn) != KERNFS_FILE))
+repeat:
+	/* pop one off the notify_list */
+	spin_lock_irq(&kernfs_notify_lock);
+	kn = kernfs_notify_list;
+	if (kn == KERNFS_NOTIFY_EOL) {
+		spin_unlock_irq(&kernfs_notify_lock);
 		return;
+	}
+	kernfs_notify_list = kn->attr.notify_next;
+	kn->attr.notify_next = NULL;
+	spin_unlock_irq(&kernfs_notify_lock);
 
 	/* kick poll */
-	spin_lock_irqsave(&kernfs_open_node_lock, flags);
+	spin_lock_irq(&kernfs_open_node_lock);
 
 	on = kn->attr.open;
 	if (on) {
@@ -808,12 +822,12 @@ void kernfs_notify(struct kernfs_node *kn)
 		wake_up_interruptible(&on->poll);
 	}
 
-	spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
+	spin_unlock_irq(&kernfs_open_node_lock);
 
 	/* kick fsnotify */
 	mutex_lock(&kernfs_mutex);
 
-	list_for_each_entry(info, &root->supers, node) {
+	list_for_each_entry(info, &kernfs_root(kn)->supers, node) {
 		struct inode *inode;
 		struct dentry *dentry;
@@ -833,6 +847,33 @@ void kernfs_notify(struct kernfs_node *kn)
 	}
 
 	mutex_unlock(&kernfs_mutex);
+	kernfs_put(kn);
+	goto repeat;
+}
+
+/**
+ * kernfs_notify - notify a kernfs file
+ * @kn: file to notify
+ *
+ * Notify @kn such that poll(2) on @kn wakes up.  Maybe be called from any
+ * context.
+ */
+void kernfs_notify(struct kernfs_node *kn)
+{
+	static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn);
+	unsigned long flags;
+
+	if (WARN_ON(kernfs_type(kn) != KERNFS_FILE))
+		return;
+
+	spin_lock_irqsave(&kernfs_notify_lock, flags);
+	if (!kn->attr.notify_next) {
+		kernfs_get(kn);
+		kn->attr.notify_next = kernfs_notify_list;
+		kernfs_notify_list = kn;
+		schedule_work(&kernfs_notify_work);
+	}
+	spin_unlock_irqrestore(&kernfs_notify_lock, flags);
 }
 EXPORT_SYMBOL_GPL(kernfs_notify);
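The kernfs_notify() rework queues nodes on an intrusive singly linked list whose
end marker is a pointer to the list head itself, so "next == NULL" cheaply means
"not queued". The core push/pop, reduced to its essentials (locking omitted, a
caller would hold a spinlock as the kernfs code does):

    /* Sketch: intrusive pending list where NULL next means "not queued". */
    struct node {
    	struct node *next;
    };

    #define EOL ((struct node *)&pending)	/* self-referential terminator */
    static struct node *pending = EOL;

    static int push(struct node *n)		/* returns 1 if newly queued */
    {
    	if (n->next)				/* non-NULL: already queued */
    		return 0;
    	n->next = pending;
    	pending = n;
    	return 1;
    }

    static struct node *pop(void)
    {
    	struct node *n = pending;

    	if (n == EOL)
    		return NULL;			/* queue empty */
    	pending = n->next;
    	n->next = NULL;				/* mark "not queued" again */
    	return n;
    }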
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index d171b98..f973ae9 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -211,6 +211,36 @@ void kernfs_kill_sb(struct super_block *sb) kernfs_put(root_kn); } +/** + * kernfs_pin_sb: try to pin the superblock associated with a kernfs_root + * @root: the kernfs_root in question + * @ns: the namespace tag + * + * Pin the superblock so it won't be destroyed in subsequent operations. + * This can be used to block ->kill_sb() which may be useful for kernfs + * users which dynamically manage superblocks. + * + * Returns NULL if there's no superblock associated with this kernfs_root, + * or ERR_PTR(-EINVAL) if the superblock is being freed. + */ +struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns) +{ + struct kernfs_super_info *info; + struct super_block *sb = NULL; + + mutex_lock(&kernfs_mutex); + list_for_each_entry(info, &root->supers, node) { + if (info->ns == ns) { + sb = info->sb; + if (!atomic_inc_not_zero(&info->sb->s_active)) + sb = ERR_PTR(-EINVAL); + break; + } + } + mutex_unlock(&kernfs_mutex); + return sb; +} + void __init kernfs_init(void) { kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
diff --git a/fs/locks.c b/fs/locks.c @@ -325,7 +325,7 @@ static int flock_make_lock(struct file *filp, struct file_lock **lock, return -ENOMEM; fl->fl_file = filp; - fl->fl_owner = (fl_owner_t)filp; + fl->fl_owner = filp; fl->fl_pid = current->tgid; fl->fl_flags = FL_FLOCK; fl->fl_type = type; @@ -431,7 +431,7 @@ static int lease_init(struct file *filp, long type, struct file_lock *fl) if (assign_type(fl, type) != 0) return -EINVAL; - fl->fl_owner = (fl_owner_t)current->files; + fl->fl_owner = current->files; fl->fl_pid = current->tgid; fl->fl_file = filp; @@ -1155,7 +1155,6 @@ EXPORT_SYMBOL(posix_lock_file_wait); int locks_mandatory_locked(struct file *file) { struct inode *inode = file_inode(file); - fl_owner_t owner = current->files; struct file_lock *fl; /* @@ -1165,7 +1164,8 @@ int locks_mandatory_locked(struct file *file) for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { if (!IS_POSIX(fl)) continue; - if (fl->fl_owner != owner && fl->fl_owner != (fl_owner_t)file) + if (fl->fl_owner != current->files && + fl->fl_owner != file) break; } spin_unlock(&inode->i_lock); @@ -1205,7 +1205,7 @@ int locks_mandatory_area(int read_write, struct inode *inode, for (;;) { if (filp) { - fl.fl_owner = (fl_owner_t)filp; + fl.fl_owner = filp; fl.fl_flags &= ~FL_SLEEP; error = __posix_lock_file(inode, &fl, NULL); if (!error) @@ -1948,7 +1948,7 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l) cmd = F_GETLK; file_lock.fl_flags |= FL_OFDLCK; - file_lock.fl_owner = (fl_owner_t)filp; + file_lock.fl_owner = filp; } error = vfs_test_lock(filp, &file_lock); @@ -2103,7 +2103,7 @@ again: cmd = F_SETLK; file_lock->fl_flags |= FL_OFDLCK; - file_lock->fl_owner = (fl_owner_t)filp; + file_lock->fl_owner = filp; break; case F_OFD_SETLKW: error = -EINVAL; @@ -2112,7 +2112,7 @@ again: cmd = F_SETLKW; file_lock->fl_flags |= FL_OFDLCK; - file_lock->fl_owner = (fl_owner_t)filp; + file_lock->fl_owner = filp; /* Fallthrough */ case F_SETLKW: file_lock->fl_flags |= FL_SLEEP; @@ -2170,7 +2170,7 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l) cmd = F_GETLK64; file_lock.fl_flags |= FL_OFDLCK; - file_lock.fl_owner = (fl_owner_t)filp; + file_lock.fl_owner = filp; } error = vfs_test_lock(filp, &file_lock); @@ -2242,7 +2242,7 @@ again: cmd = F_SETLK64; file_lock->fl_flags |= FL_OFDLCK; - file_lock->fl_owner = (fl_owner_t)filp; + file_lock->fl_owner = filp; break; case F_OFD_SETLKW: error = -EINVAL; @@ -2251,7 +2251,7 @@ again: cmd = F_SETLKW64; file_lock->fl_flags |= FL_OFDLCK; - file_lock->fl_owner = (fl_owner_t)filp; + file_lock->fl_owner = filp; /* Fallthrough */ case F_SETLKW64: file_lock->fl_flags |= FL_SLEEP; @@ -2324,11 +2324,11 @@ void locks_remove_file(struct file *filp) if (!inode->i_flock) return; - locks_remove_posix(filp, (fl_owner_t)filp); + locks_remove_posix(filp, filp); if (filp->f_op->flock) { struct file_lock fl = { - .fl_owner = (fl_owner_t)filp, + .fl_owner = filp, .fl_pid = current->tgid, .fl_file = filp, .fl_flags = FL_FLOCK,
diff --git a/fs/mbcache.c b/fs/mbcache.c index bf166e3..187477d 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -73,6 +73,7 @@ #include <linux/mbcache.h> #include <linux/init.h> #include <linux/blockgroup_lock.h> +#include <linux/log2.h> #ifdef MB_CACHE_DEBUG # define mb_debug(f...) do { \ @@ -93,7 +94,7 @@ #define MB_CACHE_WRITER ((unsigned short)~0U >> 1) -#define MB_CACHE_ENTRY_LOCK_BITS __builtin_log2(NR_BG_LOCKS) +#define MB_CACHE_ENTRY_LOCK_BITS ilog2(NR_BG_LOCKS) #define MB_CACHE_ENTRY_LOCK_INDEX(ce) \ (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS))
diff --git a/fs/namei.c b/fs/namei.c @@ -2256,9 +2256,10 @@ done: goto out; } path->dentry = dentry; - path->mnt = mntget(nd->path.mnt); + path->mnt = nd->path.mnt; if (should_follow_link(dentry, nd->flags & LOOKUP_FOLLOW)) return 1; + mntget(path->mnt); follow_mount(path); error = 0; out:
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 8f98138..f11b9ee 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -756,7 +756,6 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) spin_unlock(&dreq->lock); while (!list_empty(&hdr->pages)) { - bool do_destroy = true; req = nfs_list_entry(hdr->pages.next); nfs_list_remove_request(req); @@ -765,7 +764,6 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) case NFS_IOHDR_NEED_COMMIT: kref_get(&req->wb_kref); nfs_mark_request_commit(req, hdr->lseg, &cinfo); - do_destroy = false; } nfs_unlock_and_release_request(req); }
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 82ddbf4..f415cbf 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -244,6 +244,7 @@ void nfs_pgio_data_release(struct nfs_pgio_data *); int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *); int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_data *, const struct rpc_call_ops *, int, int); +void nfs_free_request(struct nfs_page *req); static inline void nfs_iocounter_init(struct nfs_io_counter *c) {
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 871d6ed..8f854dd 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -247,3 +247,46 @@ const struct xattr_handler *nfs3_xattr_handlers[] = { &posix_acl_default_xattr_handler, NULL, }; + +static int +nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data, + size_t size, ssize_t *result) +{ + struct posix_acl *acl; + char *p = data + *result; + + acl = get_acl(inode, type); + if (!acl) + return 0; + + posix_acl_release(acl); + + *result += strlen(name); + *result += 1; + if (!size) + return 0; + if (*result > size) + return -ERANGE; + + strcpy(p, name); + return 0; +} + +ssize_t +nfs3_listxattr(struct dentry *dentry, char *data, size_t size) +{ + struct inode *inode = dentry->d_inode; + ssize_t result = 0; + int error; + + error = nfs3_list_one_acl(inode, ACL_TYPE_ACCESS, + POSIX_ACL_XATTR_ACCESS, data, size, &result); + if (error) + return error; + + error = nfs3_list_one_acl(inode, ACL_TYPE_DEFAULT, + POSIX_ACL_XATTR_DEFAULT, data, size, &result); + if (error) + return error; + return result; +}
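nfs3_list_one_acl() above implements the standard listxattr(2) sizing contract: with size == 0 it only accumulates the required length, and otherwise it returns -ERANGE when the caller's buffer is too small. From userspace that contract is consumed with the usual two-pass idiom, sketched here with plain libc calls (nothing NFS-specific):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/types.h>
    #include <sys/xattr.h>

    int main(int argc, char **argv)
    {
        const char *path = argc > 1 ? argv[1] : ".";
        ssize_t len, off;
        char *buf;

        /* pass 1: size == 0 asks only for the required length */
        len = listxattr(path, NULL, 0);
        if (len <= 0)
            return 1;
        buf = malloc(len);
        if (!buf)
            return 1;
        /* pass 2: fetch the NUL-separated name list for real */
        len = listxattr(path, buf, len);
        for (off = 0; off < len; off += strlen(buf + off) + 1)
            printf("%s\n", buf + off);
        free(buf);
        return 0;
    }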
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index e7daa42..f0afa29 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -885,7 +885,7 @@ static const struct inode_operations nfs3_dir_inode_operations = { .getattr = nfs_getattr, .setattr = nfs_setattr, #ifdef CONFIG_NFS_V3_ACL - .listxattr = generic_listxattr, + .listxattr = nfs3_listxattr, .getxattr = generic_getxattr, .setxattr = generic_setxattr, .removexattr = generic_removexattr, @@ -899,7 +899,7 @@ static const struct inode_operations nfs3_file_inode_operations = { .getattr = nfs_getattr, .setattr = nfs_setattr, #ifdef CONFIG_NFS_V3_ACL - .listxattr = generic_listxattr, + .listxattr = nfs3_listxattr, .getxattr = generic_getxattr, .setxattr = generic_setxattr, .removexattr = generic_removexattr,
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index b6ee3a6..17fab89 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -29,8 +29,6 @@ static struct kmem_cache *nfs_page_cachep; static const struct rpc_call_ops nfs_pgio_common_ops; -static void nfs_free_request(struct nfs_page *); - static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount) { p->npages = pagecount; @@ -239,20 +237,28 @@ nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev) WARN_ON_ONCE(prev == req); if (!prev) { + /* a head request */ req->wb_head = req; req->wb_this_page = req; } else { + /* a subrequest */ WARN_ON_ONCE(prev->wb_this_page != prev->wb_head); WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &prev->wb_head->wb_flags)); req->wb_head = prev->wb_head; req->wb_this_page = prev->wb_this_page; prev->wb_this_page = req; + /* All subrequests take a ref on the head request until + * nfs_page_group_destroy is called */ + kref_get(&req->wb_head->wb_kref); + /* grab extra ref if head request has extra ref from * the write/commit path to handle handoff between write * and commit lists */ - if (test_bit(PG_INODE_REF, &prev->wb_head->wb_flags)) + if (test_bit(PG_INODE_REF, &prev->wb_head->wb_flags)) { + set_bit(PG_INODE_REF, &req->wb_flags); kref_get(&req->wb_kref); + } } } @@ -269,6 +275,10 @@ nfs_page_group_destroy(struct kref *kref) struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); struct nfs_page *tmp, *next; + /* subrequests must release the ref on the head request */ + if (req->wb_head != req) + nfs_release_request(req->wb_head); + if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN)) return; @@ -394,7 +404,7 @@ static void nfs_clear_request(struct nfs_page *req) * * Note: Should never be called with the spinlock held! */ -static void nfs_free_request(struct nfs_page *req) +void nfs_free_request(struct nfs_page *req) { WARN_ON_ONCE(req->wb_this_page != req); @@ -925,7 +935,6 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, nfs_pageio_doio(desc); if (desc->pg_error < 0) return 0; - desc->pg_moreio = 0; if (desc->pg_recoalesce) return 0; /* retry add_request for this subreq */ @@ -972,6 +981,7 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) desc->pg_count = 0; desc->pg_base = 0; desc->pg_recoalesce = 0; + desc->pg_moreio = 0; while (!list_empty(&head)) { struct nfs_page *req;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 98ff061..5e2f1030 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -46,6 +46,7 @@ static const struct rpc_call_ops nfs_commit_ops; static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops; static const struct nfs_commit_completion_ops nfs_commit_completion_ops; static const struct nfs_rw_ops nfs_rw_write_ops; +static void nfs_clear_request_commit(struct nfs_page *req); static struct kmem_cache *nfs_wdata_cachep; static mempool_t *nfs_wdata_mempool; @@ -91,8 +92,15 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); } +/* + * nfs_page_find_head_request_locked - find head request associated with @page + * + * must be called while holding the inode lock. + * + * returns matching head request with reference held, or NULL if not found.
+ */ static struct nfs_page * -nfs_page_find_request_locked(struct nfs_inode *nfsi, struct page *page) +nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page) { struct nfs_page *req = NULL; @@ -104,25 +112,33 @@ nfs_page_find_request_locked(struct nfs_inode *nfsi, struct page *page) /* Linearly search the commit list for the correct req */ list_for_each_entry_safe(freq, t, &nfsi->commit_info.list, wb_list) { if (freq->wb_page == page) { - req = freq; + req = freq->wb_head; break; } } } - if (req) + if (req) { + WARN_ON_ONCE(req->wb_head != req); + kref_get(&req->wb_kref); + } return req; } -static struct nfs_page *nfs_page_find_request(struct page *page) +/* + * nfs_page_find_head_request - find head request associated with @page + * + * returns matching head request with reference held, or NULL if not found. + */ +static struct nfs_page *nfs_page_find_head_request(struct page *page) { struct inode *inode = page_file_mapping(page)->host; struct nfs_page *req = NULL; spin_lock(&inode->i_lock); - req = nfs_page_find_request_locked(NFS_I(inode), page); + req = nfs_page_find_head_request_locked(NFS_I(inode), page); spin_unlock(&inode->i_lock); return req; } @@ -274,36 +290,246 @@ static void nfs_end_page_writeback(struct nfs_page *req) clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); } -static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock) + +/* nfs_page_group_clear_bits + * @req - an nfs request + * clears all page group related bits from @req + */ +static void +nfs_page_group_clear_bits(struct nfs_page *req) +{ + clear_bit(PG_TEARDOWN, &req->wb_flags); + clear_bit(PG_UNLOCKPAGE, &req->wb_flags); + clear_bit(PG_UPTODATE, &req->wb_flags); + clear_bit(PG_WB_END, &req->wb_flags); + clear_bit(PG_REMOVE, &req->wb_flags); +} + + +/* + * nfs_unroll_locks_and_wait - unlock all newly locked reqs and wait on @req + * + * this is a helper function for nfs_lock_and_join_requests + * + * @inode - inode associated with request page group, must be holding inode lock + * @head - head request of page group, must be holding head lock + * @req - request that couldn't lock and needs to wait on the req bit lock + * @nonblock - if true, don't actually wait + * + * NOTE: this must be called holding page_group bit lock and inode spin lock + * and BOTH will be released before returning. + * + * returns 0 on success, < 0 on error. 
+ */ +static int +nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head, + struct nfs_page *req, bool nonblock) + __releases(&inode->i_lock) +{ + struct nfs_page *tmp; + int ret; + + /* relinquish all the locks successfully grabbed this run */ + for (tmp = head ; tmp != req; tmp = tmp->wb_this_page) + nfs_unlock_request(tmp); + + WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags)); + + /* grab a ref on the request that will be waited on */ + kref_get(&req->wb_kref); + + nfs_page_group_unlock(head); + spin_unlock(&inode->i_lock); + + /* release ref from nfs_page_find_head_request_locked */ + nfs_release_request(head); + + if (!nonblock) + ret = nfs_wait_on_request(req); + else + ret = -EAGAIN; + nfs_release_request(req); + + return ret; +} + +/* + * nfs_destroy_unlinked_subrequests - destroy recently unlinked subrequests + * + * @destroy_list - request list (using wb_this_page) terminated by @old_head + * @old_head - the old head of the list + * + * All subrequests must be locked and removed from all lists, so at this point + * they are only "active" in this function, and possibly in nfs_wait_on_request + * with a reference held by some other context. + */ +static void +nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, + struct nfs_page *old_head) +{ + while (destroy_list) { + struct nfs_page *subreq = destroy_list; + + destroy_list = (subreq->wb_this_page == old_head) ? + NULL : subreq->wb_this_page; + + WARN_ON_ONCE(old_head != subreq->wb_head); + + /* make sure old group is not used */ + subreq->wb_head = subreq; + subreq->wb_this_page = subreq; + + nfs_clear_request_commit(subreq); + + /* subreq is now totally disconnected from page group or any + * write / commit lists. last chance to wake any waiters */ + nfs_unlock_request(subreq); + + if (!test_bit(PG_TEARDOWN, &subreq->wb_flags)) { + /* release ref on old head request */ + nfs_release_request(old_head); + + nfs_page_group_clear_bits(subreq); + + /* release the PG_INODE_REF reference */ + if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) + nfs_release_request(subreq); + else + WARN_ON_ONCE(1); + } else { + WARN_ON_ONCE(test_bit(PG_CLEAN, &subreq->wb_flags)); + /* zombie requests have already released the last + * reference and were waiting on the rest of the + * group to complete. Since it's no longer part of a + * group, simply free the request */ + nfs_page_group_clear_bits(subreq); + nfs_free_request(subreq); + } + } +} + +/* + * nfs_lock_and_join_requests - join all subreqs to the head req and return + * a locked reference, cancelling any pending + * operations for this page. + * + * @page - the page used to lookup the "page group" of nfs_page structures + * @nonblock - if true, don't block waiting for request locks + * + * This function joins all sub requests to the head request by first + * locking all requests in the group, cancelling any pending operations + * and finally updating the head request to cover the whole range covered by + * the (former) group. All subrequests are removed from any write or commit + * lists, unlinked from the group and destroyed. + * + * Returns a locked, referenced pointer to the head request - which after + * this call is guaranteed to be the only request associated with the page. + * Returns NULL if no requests are found for @page, or an ERR_PTR if an + * error was encountered.
+ */ +static struct nfs_page * +nfs_lock_and_join_requests(struct page *page, bool nonblock) { struct inode *inode = page_file_mapping(page)->host; - struct nfs_page *req; + struct nfs_page *head, *subreq; + struct nfs_page *destroy_list = NULL; + unsigned int total_bytes; int ret; +try_again: + total_bytes = 0; + + WARN_ON_ONCE(destroy_list); + spin_lock(&inode->i_lock); - for (;;) { - req = nfs_page_find_request_locked(NFS_I(inode), page); - if (req == NULL) - break; - if (nfs_lock_request(req)) - break; - /* Note: If we hold the page lock, as is the case in nfs_writepage, - * then the call to nfs_lock_request() will always - * succeed provided that someone hasn't already marked the - * request as dirty (in which case we don't care). - */ + + /* + * A reference is taken only on the head request which acts as a + * reference to the whole page group - the group will not be destroyed + * until the head reference is released. + */ + head = nfs_page_find_head_request_locked(NFS_I(inode), page); + + if (!head) { spin_unlock(&inode->i_lock); - if (!nonblock) - ret = nfs_wait_on_request(req); - else - ret = -EAGAIN; - nfs_release_request(req); - if (ret != 0) + return NULL; + } + + /* lock each request in the page group */ + nfs_page_group_lock(head); + subreq = head; + do { + /* + * Subrequests are always contiguous, non overlapping + * and in order. If not, it's a programming error. + */ + WARN_ON_ONCE(subreq->wb_offset != + (head->wb_offset + total_bytes)); + + /* keep track of how many bytes this group covers */ + total_bytes += subreq->wb_bytes; + + if (!nfs_lock_request(subreq)) { + /* releases page group bit lock and + * inode spin lock and all references */ + ret = nfs_unroll_locks_and_wait(inode, head, + subreq, nonblock); + + if (ret == 0) + goto try_again; + return ERR_PTR(ret); - spin_lock(&inode->i_lock); + } + + subreq = subreq->wb_this_page; + } while (subreq != head); + + /* Now that all requests are locked, make sure they aren't on any list. + * Commit list removal accounting is done after locks are dropped */ + subreq = head; + do { + nfs_list_remove_request(subreq); + subreq = subreq->wb_this_page; + } while (subreq != head); + + /* unlink subrequests from head, destroy them later */ + if (head->wb_this_page != head) { + /* destroy list will be terminated by head */ + destroy_list = head->wb_this_page; + head->wb_this_page = head; + + /* change head request to cover whole range that + * the former page group covered */ + head->wb_bytes = total_bytes; } + + /* + * prepare head request to be added to new pgio descriptor + */ + nfs_page_group_clear_bits(head); + + /* + * some part of the group was still on the inode list - otherwise + * the group wouldn't be involved in async write. + * grab a reference for the head request, iff it needs one. 
+ */ + if (!test_and_set_bit(PG_INODE_REF, &head->wb_flags)) + kref_get(&head->wb_kref); + + nfs_page_group_unlock(head); + + /* drop lock to clear_request_commit the head req and clean up + * requests on destroy list */ spin_unlock(&inode->i_lock); - return req; + + nfs_destroy_unlinked_subrequests(destroy_list, head); + + /* clean up commit list state */ + nfs_clear_request_commit(head); + + /* still holds ref on head from nfs_page_find_head_request_locked + * and still has lock on head from lock loop */ + return head; } /* @@ -316,7 +542,7 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, struct nfs_page *req; int ret = 0; - req = nfs_find_and_lock_request(page, nonblock); + req = nfs_lock_and_join_requests(page, nonblock); if (!req) goto out; ret = PTR_ERR(req); @@ -448,7 +674,9 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) set_page_private(req->wb_page, (unsigned long)req); } nfsi->npages++; - set_bit(PG_INODE_REF, &req->wb_flags); + /* this is a head request for a page group - mark it as having an + * extra reference so sub groups can follow suit */ + WARN_ON(test_and_set_bit(PG_INODE_REF, &req->wb_flags)); kref_get(&req->wb_kref); spin_unlock(&inode->i_lock); } @@ -474,7 +702,9 @@ static void nfs_inode_remove_request(struct nfs_page *req) nfsi->npages--; spin_unlock(&inode->i_lock); } - nfs_release_request(req); + + if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) + nfs_release_request(req); } static void @@ -638,7 +868,6 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) { struct nfs_commit_info cinfo; unsigned long bytes = 0; - bool do_destroy; if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) goto out; @@ -668,7 +897,6 @@ remove_req: next: nfs_unlock_request(req); nfs_end_page_writeback(req); - do_destroy = !test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags); nfs_release_request(req); } out: @@ -769,7 +997,7 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, spin_lock(&inode->i_lock); for (;;) { - req = nfs_page_find_request_locked(NFS_I(inode), page); + req = nfs_page_find_head_request_locked(NFS_I(inode), page); if (req == NULL) goto out_unlock; @@ -877,7 +1105,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page) * dropped page.
*/ do { - req = nfs_page_find_request(page); + req = nfs_page_find_head_request(page); if (req == NULL) return 0; l_ctx = req->wb_lock_context; @@ -1569,27 +1797,28 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) struct nfs_page *req; int ret = 0; - for (;;) { - wait_on_page_writeback(page); - req = nfs_page_find_request(page); - if (req == NULL) - break; - if (nfs_lock_request(req)) { - nfs_clear_request_commit(req); - nfs_inode_remove_request(req); - /* - * In case nfs_inode_remove_request has marked the - * page as being dirty - */ - cancel_dirty_page(page, PAGE_CACHE_SIZE); - nfs_unlock_and_release_request(req); - break; - } - ret = nfs_wait_on_request(req); - nfs_release_request(req); - if (ret < 0) - break; + wait_on_page_writeback(page); + + /* blocking call to cancel all requests and join to a single (head) + * request */ + req = nfs_lock_and_join_requests(page, false); + + if (IS_ERR(req)) { + ret = PTR_ERR(req); + } else if (req) { + /* all requests from this page have been cancelled by + * nfs_lock_and_join_requests, so just remove the head + * request from the inode / page_private pointer and + * release it */ + nfs_inode_remove_request(req); + /* + * In case nfs_inode_remove_request has marked the + * page as being dirty + */ + cancel_dirty_page(page, PAGE_CACHE_SIZE); + nfs_unlock_and_release_request(req); } + return ret; } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 6851b00..8f029db 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -617,15 +617,6 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, switch (create->cr_type) { case NF4LNK: - /* ugh! we have to null-terminate the linktext, or - * vfs_symlink() will choke. it is always safe to - * null-terminate by brute force, since at worst we - * will overwrite the first byte of the create namelen - * in the XDR buffer, which has already been extracted - * during XDR decode. 
- */ - create->cr_linkname[create->cr_linklen] = 0; - status = nfsd_symlink(rqstp, &cstate->current_fh, create->cr_name, create->cr_namelen, create->cr_linkname, create->cr_linklen,
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 83baf2b..944275c 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -600,7 +600,18 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create READ_BUF(4); create->cr_linklen = be32_to_cpup(p++); READ_BUF(create->cr_linklen); - SAVEMEM(create->cr_linkname, create->cr_linklen); + /* + * The VFS will want a null-terminated string, and + * null-terminating in place isn't safe since this might + * end on a page boundary: + */ + create->cr_linkname = + kmalloc(create->cr_linklen + 1, GFP_KERNEL); + if (!create->cr_linkname) + return nfserr_jukebox; + memcpy(create->cr_linkname, p, create->cr_linklen); + create->cr_linkname[create->cr_linklen] = '\0'; + defer_free(argp, kfree, create->cr_linkname); break; case NF4BLK: case NF4CHR: @@ -2630,7 +2641,7 @@ nfsd4_encode_rdattr_error(struct xdr_stream *xdr, __be32 nfserr) { __be32 *p; - p = xdr_reserve_space(xdr, 6); + p = xdr_reserve_space(xdr, 20); if (!p) return NULL; *p++ = htonl(2); @@ -2868,6 +2879,7 @@ again: * return the conflicting open: */ if (conf->len) { + kfree(conf->data); conf->len = 0; conf->data = NULL; goto again; @@ -2880,6 +2892,7 @@ again: if (conf->len) { p = xdr_encode_opaque_fixed(p, &ld->ld_clientid, 8); p = xdr_encode_opaque(p, conf->data, conf->len); + kfree(conf->data); } else { /* non - nfsv4 lock in conflict, no clientid nor owner */ p = xdr_encode_hyper(p, (u64)0); /* clientid */ *p++ = cpu_to_be32(0); /* length of owner name */ @@ -2896,7 +2909,7 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo nfserr = nfsd4_encode_stateid(xdr, &lock->lk_resp_stateid); else if (nfserr == nfserr_denied) nfserr = nfsd4_encode_lock_denied(xdr, &lock->lk_denied); - kfree(lock->lk_denied.ld_owner.data); + return nfserr; } @@ -3267,7 +3280,7 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd wire_count = htonl(maxcount); write_bytes_to_xdr_buf(xdr->buf, length_offset, &wire_count, 4); - xdr_truncate_encode(xdr, length_offset + 4 + maxcount); + xdr_truncate_encode(xdr, length_offset + 4 + ALIGN(maxcount, 4)); if (maxcount & 3) write_bytes_to_xdr_buf(xdr->buf, length_offset + 4 + maxcount, &zero, 4 - (maxcount&3));
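The decode fix above is a general pattern worth naming: wire data (here the symlink text) arrives without a NUL terminator, and terminating it in place can scribble past the end of a page, so the safe move is to copy it into a buffer sized len + 1. A minimal sketch of that pattern (hypothetical helper, not the nfsd code):

    #include <stdlib.h>
    #include <string.h>

    /* Copy a non-terminated wire buffer into a string that is safe
     * to hand to interfaces expecting NUL termination. */
    static char *dup_terminated(const void *wire, size_t len)
    {
        char *s = malloc(len + 1);

        if (!s)
            return NULL;
        memcpy(s, wire, len);
        s[len] = '\0';
        return s;
    }

In the diff itself, defer_free() then ties the allocation's lifetime to the compound arguments, so error paths cannot leak it.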
diff --git a/fs/open.c b/fs/open.c @@ -263,11 +263,10 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) return -EPERM; /* - * We can not allow to do any fallocate operation on an active - * swapfile + * We cannot allow any fallocate operation on an active swapfile */ if (IS_SWAPFILE(inode)) - ret = -ETXTBSY; + return -ETXTBSY; /* * Revalidate the write permissions, in case security policy has
diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 9d231e9..bf2d03f 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -184,29 +184,11 @@ static int show_stat(struct seq_file *p, void *v) static int stat_open(struct inode *inode, struct file *file) { - size_t size = 1024 + 128 * num_possible_cpus(); - char *buf; - struct seq_file *m; - int res; + size_t size = 1024 + 128 * num_online_cpus(); /* minimum size to display an interrupt count : 2 bytes */ size += 2 * nr_irqs; - - /* don't ask for more than the kmalloc() max size */ - if (size > KMALLOC_MAX_SIZE) - size = KMALLOC_MAX_SIZE; - buf = kmalloc(size, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - res = single_open(file, show_stat, NULL); - if (!res) { - m = file->private_data; - m->buf = buf; - m->size = ksize(buf); - } else - kfree(buf); - return res; + return single_open_size(file, show_stat, NULL, size); } static const struct file_operations proc_stat_operations = {
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 9cd5f63..7f30bdc 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -702,6 +702,7 @@ dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) struct dquot *dquot; unsigned long freed = 0; + spin_lock(&dq_list_lock); head = free_dquots.prev; while (head != &free_dquots && sc->nr_to_scan) { dquot = list_entry(head, struct dquot, dq_free); @@ -713,6 +714,7 @@ dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) freed++; head = free_dquots.prev; } + spin_unlock(&dq_list_lock); return freed; }
diff --git a/fs/seq_file.c b/fs/seq_file.c index 1d641bb..3857b72 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -8,8 +8,10 @@ #include <linux/fs.h> #include <linux/export.h> #include <linux/seq_file.h> +#include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/cred.h> +#include <linux/mm.h> #include <asm/uaccess.h> #include <asm/page.h> @@ -30,6 +32,16 @@ static void seq_set_overflow(struct seq_file *m) m->count = m->size; } +static void *seq_buf_alloc(unsigned long size) +{ + void *buf; + + buf = kmalloc(size, GFP_KERNEL | __GFP_NOWARN); + if (!buf && size > PAGE_SIZE) + buf = vmalloc(size); + return buf; +} + /** * seq_open - initialize sequential file * @file: file we initialize @@ -96,7 +108,7 @@ static int traverse(struct seq_file *m, loff_t offset) return 0; } if (!m->buf) { - m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); + m->buf = seq_buf_alloc(m->size = PAGE_SIZE); if (!m->buf) return -ENOMEM; } @@ -135,9 +147,9 @@ static int traverse(struct seq_file *m, loff_t offset) Eoverflow: m->op->stop(m, p); - kfree(m->buf); + kvfree(m->buf); m->count = 0; - m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); + m->buf = seq_buf_alloc(m->size <<= 1); return !m->buf ? -ENOMEM : -EAGAIN; } @@ -192,7 +204,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) /* grab buffer if we didn't have one */ if (!m->buf) { - m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); + m->buf = seq_buf_alloc(m->size = PAGE_SIZE); if (!m->buf) goto Enomem; } @@ -232,9 +244,9 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) if (m->count < m->size) goto Fill; m->op->stop(m, p); - kfree(m->buf); + kvfree(m->buf); m->count = 0; - m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); + m->buf = seq_buf_alloc(m->size <<= 1); if (!m->buf) goto Enomem; m->version = 0; @@ -350,7 +362,7 @@ EXPORT_SYMBOL(seq_lseek); int seq_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; - kfree(m->buf); + kvfree(m->buf); kfree(m); return 0; } @@ -605,13 +617,13 @@ EXPORT_SYMBOL(single_open); int single_open_size(struct file *file, int (*show)(struct seq_file *, void *), void *data, size_t size) { - char *buf = kmalloc(size, GFP_KERNEL); + char *buf = seq_buf_alloc(size); int ret; if (!buf) return -ENOMEM; ret = single_open(file, show, data); if (ret) { - kfree(buf); + kvfree(buf); return ret; } ((struct seq_file *)file->private_data)->buf = buf;
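The stat_open() rewrite and the seq_file changes above are two halves of one fix: single_open_size() now allocates through seq_buf_alloc(), which tries kmalloc with __GFP_NOWARN and falls back to vmalloc for large buffers, and every free of m->buf becomes kvfree() so either kind of allocation is released correctly. A sketch of how a proc file adopts the pattern; foo_open/foo_show are hypothetical, while num_online_cpus() and single_open_size() are the calls used in this diff:

    /* Hypothetical /proc file using the same preallocation pattern as
     * stat_open() above: size the buffer for one show() pass and let
     * single_open_size() allocate it via seq_buf_alloc(). */
    static int foo_open(struct inode *inode, struct file *file)
    {
        size_t size = 1024 + 128 * num_online_cpus();

        return single_open_size(file, foo_show, NULL, size);
    }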
diff --git a/fs/xattr.c b/fs/xattr.c @@ -843,7 +843,7 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size) /* wrap around? */ len = sizeof(*new_xattr) + size; - if (len <= sizeof(*new_xattr)) + if (len < sizeof(*new_xattr)) return NULL; new_xattr = kmalloc(len, GFP_KERNEL);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 96175df..75c3fe5 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4298,8 +4298,8 @@ xfs_bmapi_delay( } -int -__xfs_bmapi_allocate( +static int +xfs_bmapi_allocate( struct xfs_bmalloca *bma) { struct xfs_mount *mp = bma->ip->i_mount; @@ -4578,9 +4578,6 @@ xfs_bmapi_write( bma.flist = flist; bma.firstblock = firstblock; - if (flags & XFS_BMAPI_STACK_SWITCH) - bma.stack_switch = 1; - while (bno < end && n < *nmap) { inhole = eof || bma.got.br_startoff > bno; wasdelay = !inhole && isnullstartblock(bma.got.br_startblock);
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 38ba36e..b879ca5 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -77,7 +77,6 @@ typedef struct xfs_bmap_free * from written to unwritten, otherwise convert from unwritten to written. */ #define XFS_BMAPI_CONVERT 0x040 -#define XFS_BMAPI_STACK_SWITCH 0x080 #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ @@ -86,8 +85,7 @@ typedef struct xfs_bmap_free { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ { XFS_BMAPI_CONTIG, "CONTIG" }, \ - { XFS_BMAPI_CONVERT, "CONVERT" }, \ - { XFS_BMAPI_STACK_SWITCH, "STACK_SWITCH" } + { XFS_BMAPI_CONVERT, "CONVERT" } static inline int xfs_bmapi_aflag(int w)
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 703b3ec..64731ef 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -249,59 +249,6 @@ xfs_bmap_rtalloc( } /* - * Stack switching interfaces for allocation - */ -static void -xfs_bmapi_allocate_worker( - struct work_struct *work) -{ - struct xfs_bmalloca *args = container_of(work, - struct xfs_bmalloca, work); - unsigned long pflags; - unsigned long new_pflags = PF_FSTRANS; - - /* - * we are in a transaction context here, but may also be doing work - * in kswapd context, and hence we may need to inherit that state - * temporarily to ensure that we don't block waiting for memory reclaim - * in any way. - */ - if (args->kswapd) - new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; - - current_set_flags_nested(&pflags, new_pflags); - - args->result = __xfs_bmapi_allocate(args); - complete(args->done); - - current_restore_flags_nested(&pflags, new_pflags); -} - -/* - * Some allocation requests often come in with little stack to work on. Push - * them off to a worker thread so there is lots of stack to use. Otherwise just - * call directly to avoid the context switch overhead here. - */ -int -xfs_bmapi_allocate( - struct xfs_bmalloca *args) -{ - DECLARE_COMPLETION_ONSTACK(done); - - if (!args->stack_switch) - return __xfs_bmapi_allocate(args); - - - args->done = &done; - args->kswapd = current_is_kswapd(); - INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker); - queue_work(xfs_alloc_wq, &args->work); - wait_for_completion(&done); - destroy_work_on_stack(&args->work); - return args->result; -} - -/* * Check if the endoff is outside the last extent. If so the caller will grow * the allocation to a stripe unit boundary. All offsets are considered outside * the end of file for an empty fork, so 1 is returned in *eof in that case.
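The worker hand-off removed here is reintroduced at the btree-split layer in the next two files: BMBT splits can arrive with very little kernel stack left, so the work is bounced to a workqueue thread with a fresh stack while the caller blocks on an on-stack completion. A userspace analogue of that shape, using a joinable thread in place of the workqueue/completion pair (illustrative only, not the XFS code):

    #include <pthread.h>

    struct split_args {
        int input;
        int result;
    };

    static void *split_worker(void *p)
    {
        struct split_args *args = p;

        /* stand-in for __xfs_btree_split(): runs on this thread's
         * fresh, deep stack */
        args->result = args->input + 1;
        return NULL;
    }

    static int split_on_fresh_stack(int input)
    {
        struct split_args args = { .input = input };
        pthread_t t;

        if (pthread_create(&t, NULL, split_worker, &args))
            return -1;
        pthread_join(&t, NULL);   /* plays the role of wait_for_completion() */
        return args.result;
    }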
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 075f722..2fdb72d 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -55,8 +55,6 @@ struct xfs_bmalloca { bool userdata;/* set if is user data */ bool aeof; /* allocated space at eof */ bool conv; /* overwriting unwritten extents */ - bool stack_switch; - bool kswapd; /* allocation in kswapd context */ int flags; struct completion *done; struct work_struct work; @@ -66,8 +64,6 @@ struct xfs_bmalloca { int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist, int *committed); int xfs_bmap_rtalloc(struct xfs_bmalloca *ap); -int xfs_bmapi_allocate(struct xfs_bmalloca *args); -int __xfs_bmapi_allocate(struct xfs_bmalloca *args); int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, int whichfork, int *eof); int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index bf810c6..cf893bc 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -33,6 +33,7 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_cksum.h" +#include "xfs_alloc.h" /* * Cursor allocation zone. @@ -2323,7 +2324,7 @@ error1: * record (to be inserted into parent). */ STATIC int /* error */ -xfs_btree_split( +__xfs_btree_split( struct xfs_btree_cur *cur, int level, union xfs_btree_ptr *ptrp, @@ -2503,6 +2504,85 @@ error0: return error; } +struct xfs_btree_split_args { + struct xfs_btree_cur *cur; + int level; + union xfs_btree_ptr *ptrp; + union xfs_btree_key *key; + struct xfs_btree_cur **curp; + int *stat; /* success/failure */ + int result; + bool kswapd; /* allocation in kswapd context */ + struct completion *done; + struct work_struct work; +}; + +/* + * Stack switching interfaces for allocation + */ +static void +xfs_btree_split_worker( + struct work_struct *work) +{ + struct xfs_btree_split_args *args = container_of(work, + struct xfs_btree_split_args, work); + unsigned long pflags; + unsigned long new_pflags = PF_FSTRANS; + + /* + * we are in a transaction context here, but may also be doing work + * in kswapd context, and hence we may need to inherit that state + * temporarily to ensure that we don't block waiting for memory reclaim + * in any way. + */ + if (args->kswapd) + new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; + + current_set_flags_nested(&pflags, new_pflags); + + args->result = __xfs_btree_split(args->cur, args->level, args->ptrp, + args->key, args->curp, args->stat); + complete(args->done); + + current_restore_flags_nested(&pflags, new_pflags); +} + +/* + * BMBT split requests often come in with little stack to work on. Push + * them off to a worker thread so there is lots of stack to use. For the other + * btree types, just call directly to avoid the context switch overhead here. 
+ */ +STATIC int /* error */ +xfs_btree_split( + struct xfs_btree_cur *cur, + int level, + union xfs_btree_ptr *ptrp, + union xfs_btree_key *key, + struct xfs_btree_cur **curp, + int *stat) /* success/failure */ +{ + struct xfs_btree_split_args args; + DECLARE_COMPLETION_ONSTACK(done); + + if (cur->bc_btnum != XFS_BTNUM_BMAP) + return __xfs_btree_split(cur, level, ptrp, key, curp, stat); + + args.cur = cur; + args.level = level; + args.ptrp = ptrp; + args.key = key; + args.curp = curp; + args.stat = stat; + args.done = &done; + args.kswapd = current_is_kswapd(); + INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker); + queue_work(xfs_alloc_wq, &args.work); + wait_for_completion(&done); + destroy_work_on_stack(&args.work); + return args.result; +} + + /* * Copy the old inode root contents into a real block and make the * broot point to it. diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 6c5eb4c..6d3ec2b 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -749,8 +749,7 @@ xfs_iomap_write_allocate( * pointer that the caller gave to us. */ error = xfs_bmapi_write(tp, ip, map_start_fsb, - count_fsb, - XFS_BMAPI_STACK_SWITCH, + count_fsb, 0, &first_block, 1, imap, &nimaps, &free_list); if (error) diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c index c3453b1..7703fa6 100644 --- a/fs/xfs/xfs_sb.c +++ b/fs/xfs/xfs_sb.c @@ -483,10 +483,16 @@ xfs_sb_quota_to_disk( } /* - * GQUOTINO and PQUOTINO cannot be used together in versions - * of superblock that do not have pquotino. from->sb_flags - * tells us which quota is active and should be copied to - * disk. + * GQUOTINO and PQUOTINO cannot be used together in versions of + * superblock that do not have pquotino. from->sb_flags tells us which + * quota is active and should be copied to disk. If neither are active, + * make sure we write NULLFSINO to the sb_gquotino field as a quota + * inode value of "0" is invalid when the XFS_SB_VERSION_QUOTA feature + * bit is set. + * + * Note that we don't need to handle the sb_uquotino or sb_pquotino here + * as they do not require any translation. Hence the main sb field loop + * will write them appropriately from the in-core superblock. */ if ((*fields & XFS_SB_GQUOTINO) && (from->sb_qflags & XFS_GQUOTA_ACCT)) @@ -494,6 +500,17 @@ xfs_sb_quota_to_disk( else if ((*fields & XFS_SB_PQUOTINO) && (from->sb_qflags & XFS_PQUOTA_ACCT)) to->sb_gquotino = cpu_to_be64(from->sb_pquotino); + else { + /* + * We can't rely on just the fields being logged to tell us + * that it is safe to write NULLFSINO - we should only do that + * if quotas are not actually enabled. Hence only write + * NULLFSINO if both in-core quota inodes are NULL. + */ + if (from->sb_gquotino == NULLFSINO && + from->sb_pquotino == NULLFSINO) + to->sb_gquotino = cpu_to_be64(NULLFSINO); + } *fields &= ~(XFS_SB_PQUOTINO | XFS_SB_GQUOTINO); } |
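For pre-pquotino superblocks, the three cases in the xfs_sb_quota_to_disk() hunk above collapse into a small decision: the single on-disk gquotino slot receives whichever quota inode is active, and NULLFSINO only when quotas are truly off on both. A condensed userspace sketch with stand-in types and constant (the *fields logging mask is folded into the two flags):

    #include <stdbool.h>
    #include <stdint.h>

    #define NULLFSINO ((uint64_t)-1)   /* stand-in for the XFS constant */

    static uint64_t ondisk_gquotino(bool log_gquota_active, bool log_pquota_active,
                                    uint64_t gquotino, uint64_t pquotino,
                                    uint64_t old_ondisk)
    {
        if (log_gquota_active)
            return gquotino;
        if (log_pquota_active)
            return pquotino;
        /* neither quota active: "0" would be invalid with the QUOTA
         * feature bit set, so write NULLFSINO - but only when both
         * in-core inodes really are NULL */
        if (gquotino == NULLFSINO && pquotino == NULLFSINO)
            return NULLFSINO;
        return old_ondisk;   /* otherwise leave the field untouched */
    }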