diff options
Diffstat (limited to 'fs')
103 files changed, 1398 insertions, 907 deletions
@@ -370,6 +370,9 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs) { struct bio *bio; + if (nr_iovecs > UIO_MAXIOV) + return NULL; + bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec), gfp_mask); if (unlikely(!bio)) @@ -697,8 +700,12 @@ static void bio_free_map_data(struct bio_map_data *bmd) static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count, gfp_t gfp_mask) { - struct bio_map_data *bmd = kmalloc(sizeof(*bmd), gfp_mask); + struct bio_map_data *bmd; + if (iov_count > UIO_MAXIOV) + return NULL; + + bmd = kmalloc(sizeof(*bmd), gfp_mask); if (!bmd) return NULL; @@ -827,6 +834,12 @@ struct bio *bio_copy_user_iov(struct request_queue *q, end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; start = uaddr >> PAGE_SHIFT; + /* + * Overflow, abort + */ + if (end < start) + return ERR_PTR(-EINVAL); + nr_pages += end - start; len += iov[i].iov_len; } @@ -955,6 +968,12 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; unsigned long start = uaddr >> PAGE_SHIFT; + /* + * Overflow, abort + */ + if (end < start) + return ERR_PTR(-EINVAL); + nr_pages += end - start; /* * buffer must be aligned to at least hardsector size for now @@ -982,7 +1001,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, unsigned long start = uaddr >> PAGE_SHIFT; const int local_nr_pages = end - start; const int page_limit = cur_page + local_nr_pages; - + ret = get_user_pages_fast(uaddr, local_nr_pages, write_to_vm, &pages[cur_page]); if (ret < local_nr_pages) { diff --git a/fs/block_dev.c b/fs/block_dev.c index 06e8ff1..4230252 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -11,7 +11,6 @@ #include <linux/slab.h> #include <linux/kmod.h> #include <linux/major.h> -#include <linux/smp_lock.h> #include <linux/device_cgroup.h> #include <linux/highmem.h> #include <linux/blkdev.h> diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 7845d1f..b50bc4b 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -91,23 +91,10 @@ static inline int compressed_bio_size(struct btrfs_root *root, static struct bio *compressed_bio_alloc(struct block_device *bdev, u64 first_byte, gfp_t gfp_flags) { - struct bio *bio; int nr_vecs; nr_vecs = bio_get_nr_vecs(bdev); - bio = bio_alloc(gfp_flags, nr_vecs); - - if (bio == NULL && (current->flags & PF_MEMALLOC)) { - while (!bio && (nr_vecs /= 2)) - bio = bio_alloc(gfp_flags, nr_vecs); - } - - if (bio) { - bio->bi_size = 0; - bio->bi_bdev = bdev; - bio->bi_sector = first_byte >> 9; - } - return bio; + return btrfs_bio_alloc(bdev, first_byte >> 9, nr_vecs, gfp_flags); } static int check_compressed_csum(struct inode *inode, diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8db9234..af52f6d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -808,9 +808,9 @@ struct btrfs_block_group_cache { int extents_thresh; int free_extents; int total_bitmaps; - int ro:1; - int dirty:1; - int iref:1; + unsigned int ro:1; + unsigned int dirty:1; + unsigned int iref:1; int disk_cache_state; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index fb827d0..c547cca 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -28,6 +28,7 @@ #include <linux/freezer.h> #include <linux/crc32c.h> #include <linux/slab.h> +#include <linux/migrate.h> #include "compat.h" #include "ctree.h" #include "disk-io.h" @@ -355,6 +356,8 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, btrfs_header_generation(eb)); BUG_ON(ret); + WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN)); + found_start = btrfs_header_bytenr(eb); if (found_start != start) { WARN_ON(1); @@ -693,6 +696,29 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, __btree_submit_bio_done); } +static int btree_migratepage(struct address_space *mapping, + struct page *newpage, struct page *page) +{ + /* + * we can't safely write a btree page from here, + * we haven't done the locking hook + */ + if (PageDirty(page)) + return -EAGAIN; + /* + * Buffers may be managed in a filesystem specific way. + * We must have no buffers or drop them. + */ + if (page_has_private(page) && + !try_to_release_page(page, GFP_KERNEL)) + return -EAGAIN; +#ifdef CONFIG_MIGRATION + return migrate_page(mapping, newpage, page); +#else + return -ENOSYS; +#endif +} + static int btree_writepage(struct page *page, struct writeback_control *wbc) { struct extent_io_tree *tree; @@ -707,8 +733,7 @@ static int btree_writepage(struct page *page, struct writeback_control *wbc) } redirty_page_for_writepage(wbc, page); - eb = btrfs_find_tree_block(root, page_offset(page), - PAGE_CACHE_SIZE); + eb = btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE); WARN_ON(!eb); was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); @@ -799,6 +824,9 @@ static const struct address_space_operations btree_aops = { .releasepage = btree_releasepage, .invalidatepage = btree_invalidatepage, .sync_page = block_sync_page, +#ifdef CONFIG_MIGRATION + .migratepage = btree_migratepage, +#endif }; int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, @@ -1538,10 +1566,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, GFP_NOFS); struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); - struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info), - GFP_NOFS); + struct btrfs_root *tree_root = btrfs_sb(sb); + struct btrfs_fs_info *fs_info = tree_root->fs_info; struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 951ef09..6f04444 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -232,9 +232,85 @@ fail: return ERR_PTR(ret); } +static int btrfs_get_name(struct dentry *parent, char *name, + struct dentry *child) +{ + struct inode *inode = child->d_inode; + struct inode *dir = parent->d_inode; + struct btrfs_path *path; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_inode_ref *iref; + struct btrfs_root_ref *rref; + struct extent_buffer *leaf; + unsigned long name_ptr; + struct btrfs_key key; + int name_len; + int ret; + + if (!dir || !inode) + return -EINVAL; + + if (!S_ISDIR(dir->i_mode)) + return -EINVAL; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->leave_spinning = 1; + + if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) { + key.objectid = BTRFS_I(inode)->root->root_key.objectid; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = (u64)-1; + root = root->fs_info->tree_root; + } else { + key.objectid = inode->i_ino; + key.offset = dir->i_ino; + key.type = BTRFS_INODE_REF_KEY; + } + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + btrfs_free_path(path); + return ret; + } else if (ret > 0) { + if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) { + path->slots[0]--; + } else { + btrfs_free_path(path); + return -ENOENT; + } + } + leaf = path->nodes[0]; + + if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) { + rref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_root_ref); + name_ptr = (unsigned long)(rref + 1); + name_len = btrfs_root_ref_name_len(leaf, rref); + } else { + iref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_ref); + name_ptr = (unsigned long)(iref + 1); + name_len = btrfs_inode_ref_name_len(leaf, iref); + } + + read_extent_buffer(leaf, name, name_ptr, name_len); + btrfs_free_path(path); + + /* + * have to add the null termination to make sure that reconnect_path + * gets the right len for strlen + */ + name[name_len] = '\0'; + + return 0; +} + const struct export_operations btrfs_export_ops = { .encode_fh = btrfs_encode_fh, .fh_to_dentry = btrfs_fh_to_dentry, .fh_to_parent = btrfs_fh_to_parent, .get_parent = btrfs_get_parent, + .get_name = btrfs_get_name, }; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0c097f3..bcd59c7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3412,7 +3412,7 @@ again: * our reservation. */ if (unused <= space_info->total_bytes) { - unused -= space_info->total_bytes; + unused = space_info->total_bytes - unused; if (unused >= num_bytes) { if (!reserved) space_info->bytes_reserved += orig_bytes; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index eac10e3..3e86b9f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1828,9 +1828,9 @@ static void end_bio_extent_preparewrite(struct bio *bio, int err) bio_put(bio); } -static struct bio * -extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, - gfp_t gfp_flags) +struct bio * +btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, + gfp_t gfp_flags) { struct bio *bio; @@ -1919,7 +1919,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, else nr = bio_get_nr_vecs(bdev); - bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); + bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); bio_add_page(bio, page, page_size, offset); bio->bi_end_io = end_io_func; @@ -2901,21 +2901,53 @@ out: int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, get_extent_t *get_extent) { - int ret; + int ret = 0; u64 off = start; u64 max = start + len; u32 flags = 0; + u32 found_type; + u64 last; u64 disko = 0; + struct btrfs_key found_key; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; + struct btrfs_path *path; + struct btrfs_file_extent_item *item; int end = 0; u64 em_start = 0, em_len = 0; unsigned long emflags; - ret = 0; + int hole = 0; if (len == 0) return -EINVAL; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->leave_spinning = 1; + + ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, + path, inode->i_ino, -1, 0); + if (ret < 0) { + btrfs_free_path(path); + return ret; + } + WARN_ON(!ret); + path->slots[0]--; + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); + found_type = btrfs_key_type(&found_key); + + /* No extents, just return */ + if (found_key.objectid != inode->i_ino || + found_type != BTRFS_EXTENT_DATA_KEY) { + btrfs_free_path(path); + return 0; + } + last = found_key.offset; + btrfs_free_path(path); + lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, &cached_state, GFP_NOFS); em = get_extent(inode, NULL, 0, off, max - off, 0); @@ -2925,11 +2957,18 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, ret = PTR_ERR(em); goto out; } + while (!end) { + hole = 0; off = em->start + em->len; if (off >= max) end = 1; + if (em->block_start == EXTENT_MAP_HOLE) { + hole = 1; + goto next; + } + em_start = em->start; em_len = em->len; @@ -2939,8 +2978,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (em->block_start == EXTENT_MAP_LAST_BYTE) { end = 1; flags |= FIEMAP_EXTENT_LAST; - } else if (em->block_start == EXTENT_MAP_HOLE) { - flags |= FIEMAP_EXTENT_UNWRITTEN; } else if (em->block_start == EXTENT_MAP_INLINE) { flags |= (FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED); @@ -2953,10 +2990,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) flags |= FIEMAP_EXTENT_ENCODED; +next: emflags = em->flags; free_extent_map(em); em = NULL; - if (!end) { em = get_extent(inode, NULL, 0, off, max - off, 0); if (!em) @@ -2967,15 +3004,23 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } emflags = em->flags; } + if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) { flags |= FIEMAP_EXTENT_LAST; end = 1; } - ret = fiemap_fill_next_extent(fieinfo, em_start, disko, - em_len, flags); - if (ret) - goto out_free; + if (em_start == last) { + flags |= FIEMAP_EXTENT_LAST; + end = 1; + } + + if (!hole) { + ret = fiemap_fill_next_extent(fieinfo, em_start, disko, + em_len, flags); + if (ret) + goto out_free; + } } out_free: free_extent_map(em); @@ -3836,8 +3881,10 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) spin_lock(&tree->buffer_lock); eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); - if (!eb) - goto out; + if (!eb) { + spin_unlock(&tree->buffer_lock); + return ret; + } if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { ret = 0; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 1c6d4f3..4183c81 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -310,4 +310,7 @@ int extent_clear_unlock_delalloc(struct inode *inode, struct extent_io_tree *tree, u64 start, u64 end, struct page *locked_page, unsigned long op); +struct bio * +btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, + gfp_t gfp_flags); #endif diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e354c33..c1faded 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1047,8 +1047,14 @@ out: if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + num_written = PTR_ERR(trans); + goto done; + } + mutex_lock(&inode->i_mutex); ret = btrfs_log_dentry_safe(trans, root, file->f_dentry); + mutex_unlock(&inode->i_mutex); if (ret == 0) { ret = btrfs_sync_log(trans, root); if (ret == 0) @@ -1067,6 +1073,7 @@ out: (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); } } +done: current->backing_dev_info = NULL; return num_written ? num_written : err; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 558cac2..8039390 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4501,6 +4501,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, BTRFS_I(inode)->index_cnt = 2; BTRFS_I(inode)->root = root; BTRFS_I(inode)->generation = trans->transid; + inode->i_generation = BTRFS_I(inode)->generation; btrfs_set_inode_space_info(root, inode); if (mode & S_IFDIR) @@ -4622,12 +4623,12 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, } static int btrfs_add_nondir(struct btrfs_trans_handle *trans, - struct dentry *dentry, struct inode *inode, - int backref, u64 index) + struct inode *dir, struct dentry *dentry, + struct inode *inode, int backref, u64 index) { - int err = btrfs_add_link(trans, dentry->d_parent->d_inode, - inode, dentry->d_name.name, - dentry->d_name.len, backref, index); + int err = btrfs_add_link(trans, dir, inode, + dentry->d_name.name, dentry->d_name.len, + backref, index); if (!err) { d_instantiate(dentry, inode); return 0; @@ -4668,8 +4669,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, btrfs_set_trans_block_group(trans, dir); inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, - dentry->d_parent->d_inode->i_ino, objectid, + dentry->d_name.len, dir->i_ino, objectid, BTRFS_I(dir)->block_group, mode, &index); err = PTR_ERR(inode); if (IS_ERR(inode)) @@ -4682,7 +4682,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, } btrfs_set_trans_block_group(trans, inode); - err = btrfs_add_nondir(trans, dentry, inode, 0, index); + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) drop_inode = 1; else { @@ -4730,10 +4730,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, btrfs_set_trans_block_group(trans, dir); inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, - dentry->d_parent->d_inode->i_ino, - objectid, BTRFS_I(dir)->block_group, mode, - &index); + dentry->d_name.len, dir->i_ino, objectid, + BTRFS_I(dir)->block_group, mode, &index); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_unlock; @@ -4745,7 +4743,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, } btrfs_set_trans_block_group(trans, inode); - err = btrfs_add_nondir(trans, dentry, inode, 0, index); + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) drop_inode = 1; else { @@ -4787,6 +4785,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, return -EPERM; btrfs_inc_nlink(inode); + inode->i_ctime = CURRENT_TIME; err = btrfs_set_inode_index(dir, &index); if (err) @@ -4805,15 +4804,17 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, btrfs_set_trans_block_group(trans, dir); ihold(inode); - err = btrfs_add_nondir(trans, dentry, inode, 1, index); + err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); if (err) { drop_inode = 1; } else { + struct dentry *parent = dget_parent(dentry); btrfs_update_inode_block_group(trans, dir); err = btrfs_update_inode(trans, root, inode); BUG_ON(err); - btrfs_log_new_name(trans, inode, NULL, dentry->d_parent); + btrfs_log_new_name(trans, inode, NULL, parent); + dput(parent); } nr = trans->blocks_used; @@ -4853,8 +4854,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) btrfs_set_trans_block_group(trans, dir); inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, - dentry->d_parent->d_inode->i_ino, objectid, + dentry->d_name.len, dir->i_ino, objectid, BTRFS_I(dir)->block_group, S_IFDIR | mode, &index); if (IS_ERR(inode)) { @@ -4877,9 +4877,8 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (err) goto out_fail; - err = btrfs_add_link(trans, dentry->d_parent->d_inode, - inode, dentry->d_name.name, - dentry->d_name.len, 0, index); + err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, + dentry->d_name.len, 0, index); if (err) goto out_fail; @@ -5535,13 +5534,21 @@ struct btrfs_dio_private { u64 bytes; u32 *csums; void *private; + + /* number of bios pending for this dio */ + atomic_t pending_bios; + + /* IO errors */ + int errors; + + struct bio *orig_bio; }; static void btrfs_endio_direct_read(struct bio *bio, int err) { + struct btrfs_dio_private *dip = bio->bi_private; struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; struct bio_vec *bvec = bio->bi_io_vec; - struct btrfs_dio_private *dip = bio->bi_private; struct inode *inode = dip->inode; struct btrfs_root *root = BTRFS_I(inode)->root; u64 start; @@ -5595,15 +5602,18 @@ static void btrfs_endio_direct_write(struct bio *bio, int err) struct btrfs_trans_handle *trans; struct btrfs_ordered_extent *ordered = NULL; struct extent_state *cached_state = NULL; + u64 ordered_offset = dip->logical_offset; + u64 ordered_bytes = dip->bytes; int ret; if (err) goto out_done; - - ret = btrfs_dec_test_ordered_pending(inode, &ordered, - dip->logical_offset, dip->bytes); +again: + ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, + &ordered_offset, + ordered_bytes); if (!ret) - goto out_done; + goto out_test; BUG_ON(!ordered); @@ -5663,8 +5673,20 @@ out_unlock: out: btrfs_delalloc_release_metadata(inode, ordered->len); btrfs_end_transaction(trans, root); + ordered_offset = ordered->file_offset + ordered->len; btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); + +out_test: + /* + * our bio might span multiple ordered extents. If we haven't + * completed the accounting for the whole dio, go back and try again + */ + if (ordered_offset < dip->logical_offset + dip->bytes) { + ordered_bytes = dip->logical_offset + dip->bytes - + ordered_offset; + goto again; + } out_done: bio->bi_private = dip->private; @@ -5684,6 +5706,176 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, return 0; } +static void btrfs_end_dio_bio(struct bio *bio, int err) +{ + struct btrfs_dio_private *dip = bio->bi_private; + + if (err) { + printk(KERN_ERR "btrfs direct IO failed ino %lu rw %lu " + "disk_bytenr %lu len %u err no %d\n", + dip->inode->i_ino, bio->bi_rw, bio->bi_sector, + bio->bi_size, err); + dip->errors = 1; + + /* + * before atomic variable goto zero, we must make sure + * dip->errors is perceived to be set. + */ + smp_mb__before_atomic_dec(); + } + + /* if there are more bios still pending for this dio, just exit */ + if (!atomic_dec_and_test(&dip->pending_bios)) + goto out; + + if (dip->errors) + bio_io_error(dip->orig_bio); + else { + set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags); + bio_endio(dip->orig_bio, 0); + } +out: + bio_put(bio); +} + +static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, + u64 first_sector, gfp_t gfp_flags) +{ + int nr_vecs = bio_get_nr_vecs(bdev); + return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags); +} + +static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, + int rw, u64 file_offset, int skip_sum, + u32 *csums) +{ + int write = rw & REQ_WRITE; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + bio_get(bio); + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + if (ret) + goto err; + + if (write && !skip_sum) { + ret = btrfs_wq_submit_bio(root->fs_info, + inode, rw, bio, 0, 0, + file_offset, + __btrfs_submit_bio_start_direct_io, + __btrfs_submit_bio_done); + goto err; + } else if (!skip_sum) + btrfs_lookup_bio_sums_dio(root, inode, bio, + file_offset, csums); + + ret = btrfs_map_bio(root, rw, bio, 0, 1); +err: + bio_put(bio); + return ret; +} + +static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, + int skip_sum) +{ + struct inode *inode = dip->inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + struct bio *bio; + struct bio *orig_bio = dip->orig_bio; + struct bio_vec *bvec = orig_bio->bi_io_vec; + u64 start_sector = orig_bio->bi_sector; + u64 file_offset = dip->logical_offset; + u64 submit_len = 0; + u64 map_length; + int nr_pages = 0; + u32 *csums = dip->csums; + int ret = 0; + + bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); + if (!bio) + return -ENOMEM; + bio->bi_private = dip; + bio->bi_end_io = btrfs_end_dio_bio; + atomic_inc(&dip->pending_bios); + + map_length = orig_bio->bi_size; + ret = btrfs_map_block(map_tree, READ, start_sector << 9, + &map_length, NULL, 0); + if (ret) { + bio_put(bio); + return -EIO; + } + + while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { + if (unlikely(map_length < submit_len + bvec->bv_len || + bio_add_page(bio, bvec->bv_page, bvec->bv_len, + bvec->bv_offset) < bvec->bv_len)) { + /* + * inc the count before we submit the bio so + * we know the end IO handler won't happen before + * we inc the count. Otherwise, the dip might get freed + * before we're done setting it up + */ + atomic_inc(&dip->pending_bios); + ret = __btrfs_submit_dio_bio(bio, inode, rw, + file_offset, skip_sum, + csums); + if (ret) { + bio_put(bio); + atomic_dec(&dip->pending_bios); + goto out_err; + } + + if (!skip_sum) + csums = csums + nr_pages; + start_sector += submit_len >> 9; + file_offset += submit_len; + + submit_len = 0; + nr_pages = 0; + + bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, + start_sector, GFP_NOFS); + if (!bio) + goto out_err; + bio->bi_private = dip; + bio->bi_end_io = btrfs_end_dio_bio; + + map_length = orig_bio->bi_size; + ret = btrfs_map_block(map_tree, READ, start_sector << 9, + &map_length, NULL, 0); + if (ret) { + bio_put(bio); + goto out_err; + } + } else { + submit_len += bvec->bv_len; + nr_pages ++; + bvec++; + } + } + + ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum, + csums); + if (!ret) + return 0; + + bio_put(bio); +out_err: + dip->errors = 1; + /* + * before atomic variable goto zero, we must + * make sure dip->errors is perceived to be set. + */ + smp_mb__before_atomic_dec(); + if (atomic_dec_and_test(&dip->pending_bios)) + bio_io_error(dip->orig_bio); + + /* bio_end_io() will handle error, so we needn't return it */ + return 0; +} + static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, loff_t file_offset) { @@ -5723,36 +5915,18 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, dip->disk_bytenr = (u64)bio->bi_sector << 9; bio->bi_private = dip; + dip->errors = 0; + dip->orig_bio = bio; + atomic_set(&dip->pending_bios, 0); if (write) bio->bi_end_io = btrfs_endio_direct_write; else bio->bi_end_io = btrfs_endio_direct_read; - ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); - if (ret) - goto out_err; - - if (write && !skip_sum) { - ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, - inode, rw, bio, 0, 0, - dip->logical_offset, - __btrfs_submit_bio_start_direct_io, - __btrfs_submit_bio_done); - if (ret) - goto out_err; + ret = btrfs_submit_direct_hook(rw, dip, skip_sum); + if (!ret) return; - } else if (!skip_sum) - btrfs_lookup_bio_sums_dio(root, inode, bio, - dip->logical_offset, dip->csums); - - ret = btrfs_map_bio(root, rw, bio, 0, 1); - if (ret) - goto out_err; - return; -out_err: - kfree(dip->csums); - kfree(dip); free_ordered: /* * If this is a write, we need to clean up the reserved space and kill @@ -6607,8 +6781,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, BUG_ON(ret); if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { - btrfs_log_new_name(trans, old_inode, old_dir, - new_dentry->d_parent); + struct dentry *parent = dget_parent(new_dentry); + btrfs_log_new_name(trans, old_inode, old_dir, parent); + dput(parent); btrfs_end_log_trans(root); } out_fail: @@ -6758,8 +6933,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, btrfs_set_trans_block_group(trans, dir); inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, - dentry->d_parent->d_inode->i_ino, objectid, + dentry->d_name.len, dir->i_ino, objectid, BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO, &index); err = PTR_ERR(inode); @@ -6773,7 +6947,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, } btrfs_set_trans_block_group(trans, inode); - err = btrfs_add_nondir(trans, dentry, inode, 0, index); + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) drop_inode = 1; else { @@ -6844,6 +7018,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; u64 cur_offset = start; + u64 i_size; int ret = 0; bool own_trans = true; @@ -6885,11 +7060,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, (actual_len > inode->i_size) && (cur_offset > inode->i_size)) { if (cur_offset > actual_len) - i_size_write(inode, actual_len); + i_size = actual_len; else - i_size_write(inode, cur_offset); - i_size_write(inode, cur_offset); - btrfs_ordered_update_i_size(inode, cur_offset, NULL); + i_size = cur_offset; + i_size_write(inode, i_size); + btrfs_ordered_update_i_size(inode, i_size, NULL); } ret = btrfs_update_inode(trans, root, inode); @@ -6943,6 +7118,10 @@ static long btrfs_fallocate(struct inode *inode, int mode, btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); mutex_lock(&inode->i_mutex); + ret = inode_newsize_ok(inode, alloc_end); + if (ret) + goto out; + if (alloc_start > inode->i_size) { ret = btrfs_cont_expand(inode, alloc_start); if (ret) @@ -7139,6 +7318,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, + .getattr = btrfs_getattr, .permission = btrfs_permission, .setxattr = btrfs_setxattr, .getxattr = btrfs_getxattr, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 463d91b..f1c9bb4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -233,7 +233,8 @@ static noinline int create_subvol(struct btrfs_root *root, struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; struct btrfs_root *new_root; - struct inode *dir = dentry->d_parent->d_inode; + struct dentry *parent = dget_parent(dentry); + struct inode *dir; int ret; int err; u64 objectid; @@ -242,8 +243,13 @@ static noinline int create_subvol(struct btrfs_root *root, ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root, 0, &objectid); - if (ret) + if (ret) { + dput(parent); return ret; + } + + dir = parent->d_inode; + /* * 1 - inode item * 2 - refs @@ -251,8 +257,10 @@ static noinline int create_subvol(struct btrfs_root *root, * 2 - dir items */ trans = btrfs_start_transaction(root, 6); - if (IS_ERR(trans)) + if (IS_ERR(trans)) { + dput(parent); return PTR_ERR(trans); + } leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, objectid, NULL, 0, 0, 0); @@ -339,6 +347,7 @@ static noinline int create_subvol(struct btrfs_root *root, d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); fail: + dput(parent); if (async_transid) { *async_transid = trans->transid; err = btrfs_commit_transaction_async(trans, root, 1); @@ -354,6 +363,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, char *name, int namelen, u64 *async_transid) { struct inode *inode; + struct dentry *parent; struct btrfs_pending_snapshot *pending_snapshot; struct btrfs_trans_handle *trans; int ret; @@ -396,7 +406,9 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, btrfs_orphan_cleanup(pending_snapshot->snap); - inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); + parent = dget_parent(dentry); + inode = btrfs_lookup_dentry(parent->d_inode, dentry); + dput(parent); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto fail; @@ -1669,12 +1681,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, olen = len = src->i_size - off; /* if we extend to eof, continue to block boundary */ if (off + len == src->i_size) - len = ((src->i_size + bs-1) & ~(bs-1)) - - off; + len = ALIGN(src->i_size, bs) - off; /* verify the end result is block aligned */ - if ((off & (bs-1)) || - ((off + len) & (bs-1))) + if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) || + !IS_ALIGNED(destoff, bs)) goto out_unlock; /* do any pending delalloc/csum calc on src, one way or @@ -1874,8 +1885,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, * but shouldn't round up the file size */ endoff = new_key.offset + datal; - if (endoff > off+olen) - endoff = off+olen; + if (endoff > destoff+olen) + endoff = destoff+olen; if (endoff > inode->i_size) btrfs_i_size_write(inode, endoff); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index f4621f6..ae7737e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -250,6 +250,73 @@ int btrfs_add_ordered_sum(struct inode *inode, /* * this is used to account for finished IO across a given range + * of the file. The IO may span ordered extents. If + * a given ordered_extent is completely done, 1 is returned, otherwise + * 0. + * + * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used + * to make sure this function only returns 1 once for a given ordered extent. + * + * file_offset is updated to one byte past the range that is recorded as + * complete. This allows you to walk forward in the file. + */ +int btrfs_dec_test_first_ordered_pending(struct inode *inode, + struct btrfs_ordered_extent **cached, + u64 *file_offset, u64 io_size) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + int ret; + u64 dec_end; + u64 dec_start; + u64 to_dec; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock(&tree->lock); + node = tree_search(tree, *file_offset); + if (!node) { + ret = 1; + goto out; + } + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (!offset_in_entry(entry, *file_offset)) { + ret = 1; + goto out; + } + + dec_start = max(*file_offset, entry->file_offset); + dec_end = min(*file_offset + io_size, entry->file_offset + + entry->len); + *file_offset = dec_end; + if (dec_start > dec_end) { + printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n", + (unsigned long long)dec_start, + (unsigned long long)dec_end); + } + to_dec = dec_end - dec_start; + if (to_dec > entry->bytes_left) { + printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", + (unsigned long long)entry->bytes_left, + (unsigned long long)to_dec); + } + entry->bytes_left -= to_dec; + if (entry->bytes_left == 0) + ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + else + ret = 1; +out: + if (!ret && cached && entry) { + *cached = entry; + atomic_inc(&entry->refs); + } + spin_unlock(&tree->lock); + return ret == 0; +} + +/* + * this is used to account for finished IO across a given range * of the file. The IO should not span ordered extents. If * a given ordered_extent is completely done, 1 is returned, otherwise * 0. diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 8ac3654..61dca83 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -141,6 +141,9 @@ int btrfs_remove_ordered_extent(struct inode *inode, int btrfs_dec_test_ordered_pending(struct inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size); +int btrfs_dec_test_first_ordered_pending(struct inode *inode, + struct btrfs_ordered_extent **cached, + u64 *file_offset, u64 io_size); int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type); int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 8299a25..dbb51ea 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -244,6 +244,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_space_cache: printk(KERN_INFO "btrfs: enabling disk space caching\n"); btrfs_set_opt(info->mount_opt, SPACE_CACHE); + break; case Opt_clear_cache: printk(KERN_INFO "btrfs: force clearing of disk cache\n"); btrfs_set_opt(info->mount_opt, CLEAR_CACHE); @@ -562,12 +563,26 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) static int btrfs_test_super(struct super_block *s, void *data) { - struct btrfs_fs_devices *test_fs_devices = data; + struct btrfs_root *test_root = data; struct btrfs_root *root = btrfs_sb(s); - return root->fs_info->fs_devices == test_fs_devices; + /* + * If this super block is going away, return false as it + * can't match as an existing super block. + */ + if (!atomic_read(&s->s_active)) + return 0; + return root->fs_info->fs_devices == test_root->fs_info->fs_devices; +} + +static int btrfs_set_super(struct super_block *s, void *data) +{ + s->s_fs_info = data; + + return set_anon_super(s, data); } + /* * Find a superblock for the given device / mount point. * @@ -581,6 +596,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, struct super_block *s; struct dentry *root; struct btrfs_fs_devices *fs_devices = NULL; + struct btrfs_root *tree_root = NULL; + struct btrfs_fs_info *fs_info = NULL; fmode_t mode = FMODE_READ; char *subvol_name = NULL; u64 subvol_objectid = 0; @@ -608,8 +625,24 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, goto error_close_devices; } + /* + * Setup a dummy root and fs_info for test/set super. This is because + * we don't actually fill this stuff out until open_ctree, but we need + * it for searching for existing supers, so this lets us do that and + * then open_ctree will properly initialize everything later. + */ + fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); + tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + if (!fs_info || !tree_root) { + error = -ENOMEM; + goto error_close_devices; + } + fs_info->tree_root = tree_root; + fs_info->fs_devices = fs_devices; + tree_root->fs_info = fs_info; + bdev = fs_devices->latest_bdev; - s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices); + s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); if (IS_ERR(s)) goto error_s; @@ -675,6 +708,8 @@ error_s: error = PTR_ERR(s); error_close_devices: btrfs_close_devices(fs_devices); + kfree(fs_info); + kfree(tree_root); error_free_subvol_name: kfree(subvol_name); return ERR_PTR(error); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 1fffbc0..f50e931 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -902,6 +902,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *root = pending->root; struct btrfs_root *parent_root; struct inode *parent_inode; + struct dentry *parent; struct dentry *dentry; struct extent_buffer *tmp; struct extent_buffer *old; @@ -941,7 +942,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, trans->block_rsv = &pending->block_rsv; dentry = pending->dentry; - parent_inode = dentry->d_parent->d_inode; + parent = dget_parent(dentry); + parent_inode = parent->d_inode; parent_root = BTRFS_I(parent_inode)->root; record_root_in_trans(trans, parent_root); @@ -989,6 +991,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, parent_inode->i_ino, index, dentry->d_name.name, dentry->d_name.len); BUG_ON(ret); + dput(parent); key.offset = (u64)-1; pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index a29f193..054744a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2869,6 +2869,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, { int ret = 0; struct btrfs_root *root; + struct dentry *old_parent = NULL; /* * for regular files, if its inode is already on disk, we don't @@ -2910,10 +2911,13 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, if (IS_ROOT(parent)) break; - parent = parent->d_parent; + parent = dget_parent(parent); + dput(old_parent); + old_parent = parent; inode = parent->d_inode; } + dput(old_parent); out: return ret; } @@ -2945,6 +2949,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, { int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; struct super_block *sb; + struct dentry *old_parent = NULL; int ret = 0; u64 last_committed = root->fs_info->last_trans_committed; @@ -3016,10 +3021,13 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (IS_ROOT(parent)) break; - parent = parent->d_parent; + parent = dget_parent(parent); + dput(old_parent); + old_parent = parent; } ret = 0; end_trans: + dput(old_parent); if (ret < 0) { BUG_ON(ret != -ENOSPC); root->fs_info->last_trans_log_full_commit = trans->transid; @@ -3039,8 +3047,13 @@ end_no_trans: int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct dentry *dentry) { - return btrfs_log_inode_parent(trans, root, dentry->d_inode, - dentry->d_parent, 0); + struct dentry *parent = dget_parent(dentry); + int ret; + + ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0); + dput(parent); + + return ret; } /* diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index e9c874a..561438b 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -204,7 +204,7 @@ static int readpage_nounlock(struct file *filp, struct page *page) err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, page->index << PAGE_CACHE_SHIFT, &len, ci->i_truncate_seq, ci->i_truncate_size, - &page, 1); + &page, 1, 0); if (err == -ENOENT) err = 0; if (err < 0) { @@ -287,7 +287,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, offset, &len, ci->i_truncate_seq, ci->i_truncate_size, - pages, nr_pages); + pages, nr_pages, 0); if (rc == -ENOENT) rc = 0; if (rc < 0) @@ -774,7 +774,7 @@ get_more_pages: snapc, do_sync, ci->i_truncate_seq, ci->i_truncate_size, - &inode->i_mtime, true, 1); + &inode->i_mtime, true, 1, 0); max_pages = req->r_num_pages; alloc_page_vec(fsc, req); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 98ab13e..60d27bc 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1430,8 +1430,8 @@ static int try_nonblocking_invalidate(struct inode *inode) invalidating_gen == ci->i_rdcache_gen) { /* success. */ dout("try_nonblocking_invalidate %p success\n", inode); - ci->i_rdcache_gen = 0; - ci->i_rdcache_revoking = 0; + /* save any racing async invalidate some trouble */ + ci->i_rdcache_revoking = ci->i_rdcache_gen - 1; return 0; } dout("try_nonblocking_invalidate %p failed\n", inode); @@ -2273,8 +2273,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, { struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; - unsigned seq = le32_to_cpu(grant->seq); - unsigned issue_seq = le32_to_cpu(grant->issue_seq); + int seq = le32_to_cpu(grant->seq); int newcaps = le32_to_cpu(grant->caps); int issued, implemented, used, wanted, dirty; u64 size = le64_to_cpu(grant->size); @@ -2286,8 +2285,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, int revoked_rdcache = 0; int queue_invalidate = 0; - dout("handle_cap_grant inode %p cap %p mds%d seq %u/%u %s\n", - inode, cap, mds, seq, issue_seq, ceph_cap_string(newcaps)); + dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", + inode, cap, mds, seq, ceph_cap_string(newcaps)); dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, inode->i_size); @@ -2383,7 +2382,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, } cap->seq = seq; - cap->issue_seq = issue_seq; /* file layout may have changed */ ci->i_layout = grant->layout; @@ -2691,6 +2689,11 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, NULL /* no caps context */); try_flush_caps(inode, session, NULL); up_read(&mdsc->snap_rwsem); + + /* make sure we re-request max_size, if necessary */ + spin_lock(&inode->i_lock); + ci->i_requested_max_size = 0; + spin_unlock(&inode->i_lock); } /* diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index e0a2dc6..7d447af 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -336,7 +336,10 @@ more: if (req->r_reply_info.dir_end) { kfree(fi->last_name); fi->last_name = NULL; - fi->next_offset = 2; + if (ceph_frag_is_rightmost(frag)) + fi->next_offset = 2; + else + fi->next_offset = 0; } else { rinfo = &req->r_reply_info; err = note_last_dentry(fi, @@ -355,18 +358,22 @@ more: u64 pos = ceph_make_fpos(frag, off); struct ceph_mds_reply_inode *in = rinfo->dir_in[off - fi->offset].in; + struct ceph_vino vino; + ino_t ino; + dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", off, off - fi->offset, rinfo->dir_nr, pos, rinfo->dir_dname_len[off - fi->offset], rinfo->dir_dname[off - fi->offset], in); BUG_ON(!in); ftype = le32_to_cpu(in->mode) >> 12; + vino.ino = le64_to_cpu(in->ino); + vino.snap = le64_to_cpu(in->snapid); + ino = ceph_vino_to_ino(vino); if (filldir(dirent, rinfo->dir_dname[off - fi->offset], rinfo->dir_dname_len[off - fi->offset], - pos, - le64_to_cpu(in->ino), - ftype) < 0) { + pos, ino, ftype) < 0) { dout("filldir stopping us...\n"); return 0; } @@ -414,6 +421,7 @@ static void reset_readdir(struct ceph_file_info *fi) fi->last_readdir = NULL; } kfree(fi->last_name); + fi->last_name = NULL; fi->next_offset = 2; /* compensate for . and .. */ if (fi->dentry) { dput(fi->dentry); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index e77c28c..8d79b89 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -154,11 +154,13 @@ int ceph_open(struct inode *inode, struct file *file) } /* - * No need to block if we have any caps. Update wanted set + * No need to block if we have caps on the auth MDS (for + * write) or any MDS (for read). Update wanted set * asynchronously. */ spin_lock(&inode->i_lock); - if (__ceph_is_any_real_caps(ci)) { + if (__ceph_is_any_real_caps(ci) && + (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) { int mds_wanted = __ceph_caps_mds_wanted(ci); int issued = __ceph_caps_issued(ci, NULL); @@ -280,11 +282,12 @@ int ceph_release(struct inode *inode, struct file *file) static int striped_read(struct inode *inode, u64 off, u64 len, struct page **pages, int num_pages, - int *checkeof) + int *checkeof, bool align_to_pages) { struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u64 pos, this_len; + int io_align, page_align; int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */ int left, pages_left; int read; @@ -300,14 +303,19 @@ static int striped_read(struct inode *inode, page_pos = pages; pages_left = num_pages; read = 0; + io_align = off & ~PAGE_MASK; more: + if (align_to_pages) + page_align = (pos - io_align) & ~PAGE_MASK; + else + page_align = pos & ~PAGE_MASK; this_len = left; ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), &ci->i_layout, pos, &this_len, ci->i_truncate_seq, ci->i_truncate_size, - page_pos, pages_left); + page_pos, pages_left, page_align); hit_stripe = this_len < left; was_short = ret >= 0 && ret < this_len; if (ret == -ENOENT) @@ -374,26 +382,25 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data, dout("sync_read on file %p %llu~%u %s\n", file, off, len, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); - if (file->f_flags & O_DIRECT) { - pages = ceph_get_direct_page_vector(data, num_pages, off, len); - - /* - * flush any page cache pages in this range. this - * will make concurrent normal and O_DIRECT io slow, - * but it will at least behave sensibly when they are - * in sequence. - */ - } else { + if (file->f_flags & O_DIRECT) + pages = ceph_get_direct_page_vector(data, num_pages); + else pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); - } if (IS_ERR(pages)) return PTR_ERR(pages); + /* + * flush any page cache pages in this range. this + * will make concurrent normal and sync io slow, + * but it will at least behave sensibly when they are + * in sequence. + */ ret = filemap_write_and_wait(inode->i_mapping); if (ret < 0) goto done; - ret = striped_read(inode, off, len, pages, num_pages, checkeof); + ret = striped_read(inode, off, len, pages, num_pages, checkeof, + file->f_flags & O_DIRECT); if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) ret = ceph_copy_page_vector_to_user(pages, data, off, ret); @@ -448,6 +455,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, int flags; int do_sync = 0; int check_caps = 0; + int page_align, io_align; int ret; struct timespec mtime = CURRENT_TIME; @@ -462,6 +470,8 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, else pos = *offset; + io_align = pos & ~PAGE_MASK; + ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); if (ret < 0) return ret; @@ -486,20 +496,26 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, */ more: len = left; + if (file->f_flags & O_DIRECT) + /* write from beginning of first page, regardless of + io alignment */ + page_align = (pos - io_align) & ~PAGE_MASK; + else + page_align = pos & ~PAGE_MASK; req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), pos, &len, CEPH_OSD_OP_WRITE, flags, ci->i_snap_realm->cached_context, do_sync, ci->i_truncate_seq, ci->i_truncate_size, - &mtime, false, 2); + &mtime, false, 2, page_align); if (!req) return -ENOMEM; num_pages = calc_pages_for(pos, len); if (file->f_flags & O_DIRECT) { - pages = ceph_get_direct_page_vector(data, num_pages, pos, len); + pages = ceph_get_direct_page_vector(data, num_pages); if (IS_ERR(pages)) { ret = PTR_ERR(pages); goto out; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 1d6a45b..bf12865 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2,7 +2,6 @@ #include <linux/module.h> #include <linux/fs.h> -#include <linux/smp_lock.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/uaccess.h> @@ -471,7 +470,9 @@ void ceph_fill_file_time(struct inode *inode, int issued, if (issued & (CEPH_CAP_FILE_EXCL| CEPH_CAP_FILE_WR| - CEPH_CAP_FILE_BUFFER)) { + CEPH_CAP_FILE_BUFFER| + CEPH_CAP_AUTH_EXCL| + CEPH_CAP_XATTR_EXCL)) { if (timespec_compare(ctime, &inode->i_ctime) > 0) { dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n", inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, @@ -511,7 +512,7 @@ void ceph_fill_file_time(struct inode *inode, int issued, warn = 1; } } else { - /* we have no write caps; whatever the MDS says is true */ + /* we have no write|excl caps; whatever the MDS says is true */ if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) { inode->i_ctime = *ctime; inode->i_mtime = *mtime; @@ -567,12 +568,17 @@ static int fill_inode(struct inode *inode, /* * provided version will be odd if inode value is projected, - * even if stable. skip the update if we have a newer info - * (e.g., due to inode info racing form multiple MDSs), or if - * we are getting projected (unstable) inode info. + * even if stable. skip the update if we have newer stable + * info (ours>=theirs, e.g. due to racing mds replies), unless + * we are getting projected (unstable) info (in which case the + * version is odd, and we want ours>theirs). + * us them + * 2 2 skip + * 3 2 skip + * 3 3 update */ if (le64_to_cpu(info->version) > 0 && - (ci->i_version & ~1) > le64_to_cpu(info->version)) + (ci->i_version & ~1) >= le64_to_cpu(info->version)) goto no_change; issued = __ceph_caps_issued(ci, &implemented); @@ -606,7 +612,14 @@ static int fill_inode(struct inode *inode, le32_to_cpu(info->time_warp_seq), &ctime, &mtime, &atime); - ci->i_max_size = le64_to_cpu(info->max_size); + /* only update max_size on auth cap */ + if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && + ci->i_max_size != le64_to_cpu(info->max_size)) { + dout("max_size %lld -> %llu\n", ci->i_max_size, + le64_to_cpu(info->max_size)); + ci->i_max_size = le64_to_cpu(info->max_size); + } + ci->i_layout = info->layout; inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; @@ -1055,7 +1068,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ininfo = rinfo->targeti.in; vino.ino = le64_to_cpu(ininfo->ino); vino.snap = le64_to_cpu(ininfo->snapid); - if (!dn->d_inode) { + in = dn->d_inode; + if (!in) { in = ceph_get_inode(sb, vino); if (IS_ERR(in)) { pr_err("fill_trace bad get_inode " @@ -1386,11 +1400,8 @@ static void ceph_invalidate_work(struct work_struct *work) spin_lock(&inode->i_lock); dout("invalidate_pages %p gen %d revoking %d\n", inode, ci->i_rdcache_gen, ci->i_rdcache_revoking); - if (ci->i_rdcache_gen == 0 || - ci->i_rdcache_revoking != ci->i_rdcache_gen) { - BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen); + if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { /* nevermind! */ - ci->i_rdcache_revoking = 0; spin_unlock(&inode->i_lock); goto out; } @@ -1400,15 +1411,16 @@ static void ceph_invalidate_work(struct work_struct *work) ceph_invalidate_nondirty_pages(inode->i_mapping); spin_lock(&inode->i_lock); - if (orig_gen == ci->i_rdcache_gen) { + if (orig_gen == ci->i_rdcache_gen && + orig_gen == ci->i_rdcache_revoking) { dout("invalidate_pages %p gen %d successful\n", inode, ci->i_rdcache_gen); - ci->i_rdcache_gen = 0; - ci->i_rdcache_revoking = 0; + ci->i_rdcache_revoking--; check = 1; } else { - dout("invalidate_pages %p gen %d raced, gen now %d\n", - inode, orig_gen, ci->i_rdcache_gen); + dout("invalidate_pages %p gen %d raced, now %d revoking %d\n", + inode, orig_gen, ci->i_rdcache_gen, + ci->i_rdcache_revoking); } spin_unlock(&inode->i_lock); @@ -1739,7 +1751,7 @@ int ceph_do_getattr(struct inode *inode, int mask) return 0; } - dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask)); + dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode); if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) return 0; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 3142b15..098b185 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -6,7 +6,6 @@ #include <linux/sched.h> #include <linux/debugfs.h> #include <linux/seq_file.h> -#include <linux/smp_lock.h> #include "super.h" #include "mds_client.h" @@ -529,6 +528,9 @@ static void __register_request(struct ceph_mds_client *mdsc, ceph_mdsc_get_request(req); __insert_request(mdsc, req); + req->r_uid = current_fsuid(); + req->r_gid = current_fsgid(); + if (dir) { struct ceph_inode_info *ci = ceph_inode(dir); @@ -1588,8 +1590,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); head->op = cpu_to_le32(req->r_op); - head->caller_uid = cpu_to_le32(current_fsuid()); - head->caller_gid = cpu_to_le32(current_fsgid()); + head->caller_uid = cpu_to_le32(req->r_uid); + head->caller_gid = cpu_to_le32(req->r_gid); head->args = req->r_args; ceph_encode_filepath(&p, end, ino1, path1); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index d66d63c..9341fd4 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -170,6 +170,8 @@ struct ceph_mds_request { union ceph_mds_request_args r_args; int r_fmode; /* file mode, if expecting cap */ + uid_t r_uid; + gid_t r_gid; /* for choosing which mds to send this request to */ int r_direct_mode; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 1886294..7f01728 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -293,9 +293,7 @@ struct ceph_inode_info { int i_rd_ref, i_rdcache_ref, i_wr_ref; int i_wrbuffer_ref, i_wrbuffer_ref_head; u32 i_shared_gen; /* increment each time we get FILE_SHARED */ - u32 i_rdcache_gen; /* we increment this each time we get - FILE_CACHE. If it's non-zero, we - _may_ have cached pages. */ + u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */ u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */ struct list_head i_unsafe_writes; /* uncommitted sync writes */ diff --git a/fs/cifs/TODO b/fs/cifs/TODO index 5aff46c..355abcdc 100644 --- a/fs/cifs/TODO +++ b/fs/cifs/TODO @@ -81,7 +81,7 @@ u) DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for v) mount check for unmatched uids -w) Add support for new vfs entry points for setlease and fallocate +w) Add support for new vfs entry point for fallocate x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of processes can proceed better in parallel (on the server) diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 525ba59..e9a393c 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -15,7 +15,7 @@ * the GNU Lesser General Public License for more details. * */ -#include <linux/radix-tree.h> +#include <linux/rbtree.h> #ifndef _CIFS_FS_SB_H #define _CIFS_FS_SB_H @@ -42,9 +42,9 @@ #define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */ struct cifs_sb_info { - struct radix_tree_root tlink_tree; -#define CIFS_TLINK_MASTER_TAG 0 /* is "master" (mount) tcon */ + struct rb_root tlink_tree; spinlock_t tlink_tree_lock; + struct tcon_link *master_tlink; struct nls_table *local_nls; unsigned int rsize; unsigned int wsize; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 75c4eaa..9c37897 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -116,7 +116,7 @@ cifs_read_super(struct super_block *sb, void *data, return -ENOMEM; spin_lock_init(&cifs_sb->tlink_tree_lock); - INIT_RADIX_TREE(&cifs_sb->tlink_tree, GFP_KERNEL); + cifs_sb->tlink_tree = RB_ROOT; rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY); if (rc) { @@ -321,8 +321,7 @@ cifs_alloc_inode(struct super_block *sb) /* Until the file is open and we have gotten oplock info back from the server, can not assume caching of file data or metadata */ - cifs_inode->clientCanCacheRead = false; - cifs_inode->clientCanCacheAll = false; + cifs_set_oplock_level(cifs_inode, 0); cifs_inode->delete_pending = false; cifs_inode->invalid_mapping = false; cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index f259e4d..b577bf0 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -336,7 +336,8 @@ struct cifsTconInfo { * "get" on the container. */ struct tcon_link { - unsigned long tl_index; + struct rb_node tl_rbnode; + uid_t tl_uid; unsigned long tl_flags; #define TCON_LINK_MASTER 0 #define TCON_LINK_PENDING 1 diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index edb6d90..7ed69b6b 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -104,6 +104,7 @@ extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601); extern u64 cifs_UnixTimeToNT(struct timespec); extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset); +extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock); extern struct cifsFileInfo *cifs_new_fileinfo(__u16 fileHandle, struct file *file, struct tcon_link *tlink, diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 9eb327d..251a17c 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -116,6 +116,7 @@ struct smb_vol { static int ipv4_connect(struct TCP_Server_Info *server); static int ipv6_connect(struct TCP_Server_Info *server); +static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink); static void cifs_prune_tlinks(struct work_struct *work); /* @@ -2900,24 +2901,16 @@ remote_path_check: goto mount_fail_check; } - tlink->tl_index = pSesInfo->linux_uid; + tlink->tl_uid = pSesInfo->linux_uid; tlink->tl_tcon = tcon; tlink->tl_time = jiffies; set_bit(TCON_LINK_MASTER, &tlink->tl_flags); set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags); - rc = radix_tree_preload(GFP_KERNEL); - if (rc == -ENOMEM) { - kfree(tlink); - goto mount_fail_check; - } - + cifs_sb->master_tlink = tlink; spin_lock(&cifs_sb->tlink_tree_lock); - radix_tree_insert(&cifs_sb->tlink_tree, pSesInfo->linux_uid, tlink); - radix_tree_tag_set(&cifs_sb->tlink_tree, pSesInfo->linux_uid, - CIFS_TLINK_MASTER_TAG); + tlink_rb_insert(&cifs_sb->tlink_tree, tlink); spin_unlock(&cifs_sb->tlink_tree_lock); - radix_tree_preload_end(); queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks, TLINK_IDLE_EXPIRE); @@ -3107,32 +3100,25 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, int cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb) { - int i, ret; + struct rb_root *root = &cifs_sb->tlink_tree; + struct rb_node *node; + struct tcon_link *tlink; char *tmp; - struct tcon_link *tlink[8]; - unsigned long index = 0; cancel_delayed_work_sync(&cifs_sb->prune_tlinks); - do { - spin_lock(&cifs_sb->tlink_tree_lock); - ret = radix_tree_gang_lookup(&cifs_sb->tlink_tree, - (void **)tlink, index, - ARRAY_SIZE(tlink)); - /* increment index for next pass */ - if (ret > 0) - index = tlink[ret - 1]->tl_index + 1; - for (i = 0; i < ret; i++) { - cifs_get_tlink(tlink[i]); - clear_bit(TCON_LINK_IN_TREE, &tlink[i]->tl_flags); - radix_tree_delete(&cifs_sb->tlink_tree, - tlink[i]->tl_index); - } - spin_unlock(&cifs_sb->tlink_tree_lock); + spin_lock(&cifs_sb->tlink_tree_lock); + while ((node = rb_first(root))) { + tlink = rb_entry(node, struct tcon_link, tl_rbnode); + cifs_get_tlink(tlink); + clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags); + rb_erase(node, root); - for (i = 0; i < ret; i++) - cifs_put_tlink(tlink[i]); - } while (ret != 0); + spin_unlock(&cifs_sb->tlink_tree_lock); + cifs_put_tlink(tlink); + spin_lock(&cifs_sb->tlink_tree_lock); + } + spin_unlock(&cifs_sb->tlink_tree_lock); tmp = cifs_sb->prepath; cifs_sb->prepathlen = 0; @@ -3271,22 +3257,10 @@ out: return tcon; } -static struct tcon_link * +static inline struct tcon_link * cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb) { - struct tcon_link *tlink; - unsigned int ret; - - spin_lock(&cifs_sb->tlink_tree_lock); - ret = radix_tree_gang_lookup_tag(&cifs_sb->tlink_tree, (void **)&tlink, - 0, 1, CIFS_TLINK_MASTER_TAG); - spin_unlock(&cifs_sb->tlink_tree_lock); - - /* the master tcon should always be present */ - if (ret == 0) - BUG(); - - return tlink; + return cifs_sb->master_tlink; } struct cifsTconInfo * @@ -3302,6 +3276,47 @@ cifs_sb_tcon_pending_wait(void *unused) return signal_pending(current) ? -ERESTARTSYS : 0; } +/* find and return a tlink with given uid */ +static struct tcon_link * +tlink_rb_search(struct rb_root *root, uid_t uid) +{ + struct rb_node *node = root->rb_node; + struct tcon_link *tlink; + + while (node) { + tlink = rb_entry(node, struct tcon_link, tl_rbnode); + + if (tlink->tl_uid > uid) + node = node->rb_left; + else if (tlink->tl_uid < uid) + node = node->rb_right; + else + return tlink; + } + return NULL; +} + +/* insert a tcon_link into the tree */ +static void +tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + struct tcon_link *tlink; + + while (*new) { + tlink = rb_entry(*new, struct tcon_link, tl_rbnode); + parent = *new; + + if (tlink->tl_uid > new_tlink->tl_uid) + new = &((*new)->rb_left); + else + new = &((*new)->rb_right); + } + + rb_link_node(&new_tlink->tl_rbnode, parent, new); + rb_insert_color(&new_tlink->tl_rbnode, root); +} + /* * Find or construct an appropriate tcon given a cifs_sb and the fsuid of the * current task. @@ -3309,7 +3324,7 @@ cifs_sb_tcon_pending_wait(void *unused) * If the superblock doesn't refer to a multiuser mount, then just return * the master tcon for the mount. * - * First, search the radix tree for an existing tcon for this fsuid. If one + * First, search the rbtree for an existing tcon for this fsuid. If one * exists, then check to see if it's pending construction. If it is then wait * for construction to complete. Once it's no longer pending, check to see if * it failed and either return an error or retry construction, depending on @@ -3322,14 +3337,14 @@ struct tcon_link * cifs_sb_tlink(struct cifs_sb_info *cifs_sb) { int ret; - unsigned long fsuid = (unsigned long) current_fsuid(); + uid_t fsuid = current_fsuid(); struct tcon_link *tlink, *newtlink; if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) return cifs_get_tlink(cifs_sb_master_tlink(cifs_sb)); spin_lock(&cifs_sb->tlink_tree_lock); - tlink = radix_tree_lookup(&cifs_sb->tlink_tree, fsuid); + tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid); if (tlink) cifs_get_tlink(tlink); spin_unlock(&cifs_sb->tlink_tree_lock); @@ -3338,36 +3353,24 @@ cifs_sb_tlink(struct cifs_sb_info *cifs_sb) newtlink = kzalloc(sizeof(*tlink), GFP_KERNEL); if (newtlink == NULL) return ERR_PTR(-ENOMEM); - newtlink->tl_index = fsuid; + newtlink->tl_uid = fsuid; newtlink->tl_tcon = ERR_PTR(-EACCES); set_bit(TCON_LINK_PENDING, &newtlink->tl_flags); set_bit(TCON_LINK_IN_TREE, &newtlink->tl_flags); cifs_get_tlink(newtlink); - ret = radix_tree_preload(GFP_KERNEL); - if (ret != 0) { - kfree(newtlink); - return ERR_PTR(ret); - } - spin_lock(&cifs_sb->tlink_tree_lock); /* was one inserted after previous search? */ - tlink = radix_tree_lookup(&cifs_sb->tlink_tree, fsuid); + tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid); if (tlink) { cifs_get_tlink(tlink); spin_unlock(&cifs_sb->tlink_tree_lock); - radix_tree_preload_end(); kfree(newtlink); goto wait_for_construction; } - ret = radix_tree_insert(&cifs_sb->tlink_tree, fsuid, newtlink); - spin_unlock(&cifs_sb->tlink_tree_lock); - radix_tree_preload_end(); - if (ret) { - kfree(newtlink); - return ERR_PTR(ret); - } tlink = newtlink; + tlink_rb_insert(&cifs_sb->tlink_tree, tlink); + spin_unlock(&cifs_sb->tlink_tree_lock); } else { wait_for_construction: ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING, @@ -3413,39 +3416,39 @@ cifs_prune_tlinks(struct work_struct *work) { struct cifs_sb_info *cifs_sb = container_of(work, struct cifs_sb_info, prune_tlinks.work); - struct tcon_link *tlink[8]; - unsigned long now = jiffies; - unsigned long index = 0; - int i, ret; + struct rb_root *root = &cifs_sb->tlink_tree; + struct rb_node *node = rb_first(root); + struct rb_node *tmp; + struct tcon_link *tlink; - do { - spin_lock(&cifs_sb->tlink_tree_lock); - ret = radix_tree_gang_lookup(&cifs_sb->tlink_tree, - (void **)tlink, index, - ARRAY_SIZE(tlink)); - /* increment index for next pass */ - if (ret > 0) - index = tlink[ret - 1]->tl_index + 1; - for (i = 0; i < ret; i++) { - if (test_bit(TCON_LINK_MASTER, &tlink[i]->tl_flags) || - atomic_read(&tlink[i]->tl_count) != 0 || - time_after(tlink[i]->tl_time + TLINK_IDLE_EXPIRE, - now)) { - tlink[i] = NULL; - continue; - } - cifs_get_tlink(tlink[i]); - clear_bit(TCON_LINK_IN_TREE, &tlink[i]->tl_flags); - radix_tree_delete(&cifs_sb->tlink_tree, - tlink[i]->tl_index); - } - spin_unlock(&cifs_sb->tlink_tree_lock); + /* + * Because we drop the spinlock in the loop in order to put the tlink + * it's not guarded against removal of links from the tree. The only + * places that remove entries from the tree are this function and + * umounts. Because this function is non-reentrant and is canceled + * before umount can proceed, this is safe. + */ + spin_lock(&cifs_sb->tlink_tree_lock); + node = rb_first(root); + while (node != NULL) { + tmp = node; + node = rb_next(tmp); + tlink = rb_entry(tmp, struct tcon_link, tl_rbnode); + + if (test_bit(TCON_LINK_MASTER, &tlink->tl_flags) || + atomic_read(&tlink->tl_count) != 0 || + time_after(tlink->tl_time + TLINK_IDLE_EXPIRE, jiffies)) + continue; - for (i = 0; i < ret; i++) { - if (tlink[i] != NULL) - cifs_put_tlink(tlink[i]); - } - } while (ret != 0); + cifs_get_tlink(tlink); + clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags); + rb_erase(tmp, root); + + spin_unlock(&cifs_sb->tlink_tree_lock); + cifs_put_tlink(tlink); + spin_lock(&cifs_sb->tlink_tree_lock); + } + spin_unlock(&cifs_sb->tlink_tree_lock); queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks, TLINK_IDLE_EXPIRE); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index ae82159..06c3e83 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -146,12 +146,7 @@ client_can_cache: rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb, xid, NULL); - if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { - pCifsInode->clientCanCacheAll = true; - pCifsInode->clientCanCacheRead = true; - cFYI(1, "Exclusive Oplock granted on inode %p", inode); - } else if ((oplock & 0xF) == OPLOCK_READ) - pCifsInode->clientCanCacheRead = true; + cifs_set_oplock_level(pCifsInode, oplock); return rc; } @@ -253,12 +248,7 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file, list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList); spin_unlock(&cifs_file_list_lock); - if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { - pCifsInode->clientCanCacheAll = true; - pCifsInode->clientCanCacheRead = true; - cFYI(1, "Exclusive Oplock inode %p", inode); - } else if ((oplock & 0xF) == OPLOCK_READ) - pCifsInode->clientCanCacheRead = true; + cifs_set_oplock_level(pCifsInode, oplock); file->private_data = pCifsFile; return pCifsFile; @@ -271,8 +261,9 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file, */ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) { + struct inode *inode = cifs_file->dentry->d_inode; struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink); - struct cifsInodeInfo *cifsi = CIFS_I(cifs_file->dentry->d_inode); + struct cifsInodeInfo *cifsi = CIFS_I(inode); struct cifsLockInfo *li, *tmp; spin_lock(&cifs_file_list_lock); @@ -288,8 +279,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) if (list_empty(&cifsi->openFileList)) { cFYI(1, "closing last open instance for inode %p", cifs_file->dentry->d_inode); - cifsi->clientCanCacheRead = false; - cifsi->clientCanCacheAll = false; + cifs_set_oplock_level(cifsi, 0); } spin_unlock(&cifs_file_list_lock); @@ -607,8 +597,6 @@ reopen_success: rc = filemap_write_and_wait(inode->i_mapping); mapping_set_error(inode->i_mapping, rc); - pCifsInode->clientCanCacheAll = false; - pCifsInode->clientCanCacheRead = false; if (tcon->unix_ext) rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb, xid); @@ -622,18 +610,9 @@ reopen_success: invalidate the current end of file on the server we can not go to the server to get the new inod info */ - if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { - pCifsInode->clientCanCacheAll = true; - pCifsInode->clientCanCacheRead = true; - cFYI(1, "Exclusive Oplock granted on inode %p", - pCifsFile->dentry->d_inode); - } else if ((oplock & 0xF) == OPLOCK_READ) { - pCifsInode->clientCanCacheRead = true; - pCifsInode->clientCanCacheAll = false; - } else { - pCifsInode->clientCanCacheRead = false; - pCifsInode->clientCanCacheAll = false; - } + + cifs_set_oplock_level(pCifsInode, oplock); + cifs_relock_file(pCifsFile); reopen_error_exit: @@ -775,12 +754,6 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink); - - if (file->private_data == NULL) { - rc = -EBADF; - FreeXid(xid); - return rc; - } netfid = ((struct cifsFileInfo *)file->private_data)->netfid; if ((tcon->ses->capabilities & CAP_UNIX) && @@ -956,6 +929,7 @@ cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, ssize_t cifs_user_write(struct file *file, const char __user *write_data, size_t write_size, loff_t *poffset) { + struct inode *inode = file->f_path.dentry->d_inode; int rc = 0; unsigned int bytes_written = 0; unsigned int total_written; @@ -963,7 +937,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, struct cifsTconInfo *pTcon; int xid, long_op; struct cifsFileInfo *open_file; - struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode); + struct cifsInodeInfo *cifsi = CIFS_I(inode); cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); @@ -1029,21 +1003,17 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, cifs_stats_bytes_written(pTcon, total_written); - /* since the write may have blocked check these pointers again */ - if ((file->f_path.dentry) && (file->f_path.dentry->d_inode)) { - struct inode *inode = file->f_path.dentry->d_inode; /* Do not update local mtime - server will set its actual value on write - * inode->i_ctime = inode->i_mtime = - * current_fs_time(inode->i_sb);*/ - if (total_written > 0) { - spin_lock(&inode->i_lock); - if (*poffset > file->f_path.dentry->d_inode->i_size) - i_size_write(file->f_path.dentry->d_inode, - *poffset); - spin_unlock(&inode->i_lock); - } - mark_inode_dirty_sync(file->f_path.dentry->d_inode); + * inode->i_ctime = inode->i_mtime = + * current_fs_time(inode->i_sb);*/ + if (total_written > 0) { + spin_lock(&inode->i_lock); + if (*poffset > inode->i_size) + i_size_write(inode, *poffset); + spin_unlock(&inode->i_lock); } + mark_inode_dirty_sync(inode); + FreeXid(xid); return total_written; } @@ -1178,7 +1148,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only) { struct cifsFileInfo *open_file; - struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); + struct cifs_sb_info *cifs_sb; bool any_available = false; int rc; @@ -1192,6 +1162,8 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, return NULL; } + cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); + /* only filter by fsuid on multiuser mounts */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) fsuid_only = false; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 39869c3c..ef3a55b 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -2177,7 +2177,6 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) setattr_copy(inode, attrs); mark_inode_dirty(inode); - return 0; cifs_setattr_exit: kfree(full_path); diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 077bf75..0c98672 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -38,10 +38,10 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) struct cifs_sb_info *cifs_sb; #ifdef CONFIG_CIFS_POSIX struct cifsFileInfo *pSMBFile = filep->private_data; - struct cifsTconInfo *tcon = tlink_tcon(pSMBFile->tlink); + struct cifsTconInfo *tcon; __u64 ExtAttrBits = 0; __u64 ExtAttrMask = 0; - __u64 caps = le64_to_cpu(tcon->fsUnixInfo.Capability); + __u64 caps; #endif /* CONFIG_CIFS_POSIX */ xid = GetXid(); @@ -62,9 +62,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) break; #ifdef CONFIG_CIFS_POSIX case FS_IOC_GETFLAGS: + if (pSMBFile == NULL) + break; + tcon = tlink_tcon(pSMBFile->tlink); + caps = le64_to_cpu(tcon->fsUnixInfo.Capability); if (CIFS_UNIX_EXTATTR_CAP & caps) { - if (pSMBFile == NULL) - break; rc = CIFSGetExtAttr(xid, tcon, pSMBFile->netfid, &ExtAttrBits, &ExtAttrMask); if (rc == 0) @@ -75,13 +77,15 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) break; case FS_IOC_SETFLAGS: + if (pSMBFile == NULL) + break; + tcon = tlink_tcon(pSMBFile->tlink); + caps = le64_to_cpu(tcon->fsUnixInfo.Capability); if (CIFS_UNIX_EXTATTR_CAP & caps) { if (get_user(ExtAttrBits, (int __user *)arg)) { rc = -EFAULT; break; } - if (pSMBFile == NULL) - break; /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid, extAttrBits, &ExtAttrMask);*/ } diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index c4e296f..43f1028 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -569,10 +569,9 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) cFYI(1, "file id match, oplock break"); pCifsInode = CIFS_I(netfile->dentry->d_inode); - pCifsInode->clientCanCacheAll = false; - if (pSMB->OplockLevel == 0) - pCifsInode->clientCanCacheRead = false; + cifs_set_oplock_level(pCifsInode, + pSMB->OplockLevel); /* * cifs_oplock_break_put() can't be called * from here. Get reference after queueing @@ -722,3 +721,23 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb) cifs_sb_master_tcon(cifs_sb)->treeName); } } + +void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) +{ + oplock &= 0xF; + + if (oplock == OPLOCK_EXCLUSIVE) { + cinode->clientCanCacheAll = true; + cinode->clientCanCacheRead = true; + cFYI(1, "Exclusive Oplock granted on inode %p", + &cinode->vfs_inode); + } else if (oplock == OPLOCK_READ) { + cinode->clientCanCacheAll = false; + cinode->clientCanCacheRead = true; + cFYI(1, "Level II Oplock granted on inode %p", + &cinode->vfs_inode); + } else { + cinode->clientCanCacheAll = false; + cinode->clientCanCacheRead = false; + } +} diff --git a/fs/compat.c b/fs/compat.c index c580c32..eb1740a 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1350,6 +1350,10 @@ static int compat_count(compat_uptr_t __user *argv, int max) argv++; if (i++ >= max) return -E2BIG; + + if (fatal_signal_pending(current)) + return -ERESTARTNOHAND; + cond_resched(); } } return i; @@ -1391,6 +1395,12 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv, while (len > 0) { int offset, bytes_to_copy; + if (fatal_signal_pending(current)) { + ret = -ERESTARTNOHAND; + goto out; + } + cond_resched(); + offset = pos % PAGE_SIZE; if (offset == 0) offset = PAGE_SIZE; @@ -1407,18 +1417,8 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv, if (!kmapped_page || kpos != (pos & PAGE_MASK)) { struct page *page; -#ifdef CONFIG_STACK_GROWSUP - ret = expand_stack_downwards(bprm->vma, pos); - if (ret < 0) { - /* We've exceed the stack rlimit. */ - ret = -E2BIG; - goto out; - } -#endif - ret = get_user_pages(current, bprm->mm, pos, - 1, 1, 1, &page, NULL); - if (ret <= 0) { - /* We've exceed the stack rlimit. */ + page = get_arg_page(bprm, pos, 1); + if (!page) { ret = -E2BIG; goto out; } @@ -1539,8 +1539,10 @@ int compat_do_execve(char * filename, return retval; out: - if (bprm->mm) + if (bprm->mm) { + acct_arg_size(bprm, 0); mmput(bprm->mm); + } out_file: if (bprm->file) { diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 410ed18..a60579b 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -19,7 +19,6 @@ #include <linux/compiler.h> #include <linux/sched.h> #include <linux/smp.h> -#include <linux/smp_lock.h> #include <linux/ioctl.h> #include <linux/if.h> #include <linux/if_bridge.h> diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index 2537323..2720178 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -28,7 +28,6 @@ #include <linux/key.h> #include <linux/slab.h> #include <linux/seq_file.h> -#include <linux/smp_lock.h> #include <linux/file.h> #include <linux/crypto.h> #include "ecryptfs_kernel.h" @@ -164,7 +164,26 @@ out: #ifdef CONFIG_MMU -static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, +void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) +{ + struct mm_struct *mm = current->mm; + long diff = (long)(pages - bprm->vma_pages); + + if (!mm || !diff) + return; + + bprm->vma_pages = pages; + +#ifdef SPLIT_RSS_COUNTING + add_mm_counter(mm, MM_ANONPAGES, diff); +#else + spin_lock(&mm->page_table_lock); + add_mm_counter(mm, MM_ANONPAGES, diff); + spin_unlock(&mm->page_table_lock); +#endif +} + +struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; @@ -186,6 +205,8 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start; struct rlimit *rlim; + acct_arg_size(bprm, size / PAGE_SIZE); + /* * We've historically supported up to 32 pages (ARG_MAX) * of argument strings even with small stacks @@ -276,7 +297,11 @@ static bool valid_arg_len(struct linux_binprm *bprm, long len) #else -static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, +void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) +{ +} + +struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; @@ -1003,6 +1028,7 @@ int flush_old_exec(struct linux_binprm * bprm) /* * Release all of the old mmap stuff */ + acct_arg_size(bprm, 0); retval = exec_mmap(bprm->mm); if (retval) goto out; @@ -1426,8 +1452,10 @@ int do_execve(const char * filename, return retval; out: - if (bprm->mm) - mmput (bprm->mm); + if (bprm->mm) { + acct_arg_size(bprm, 0); + mmput(bprm->mm); + } out_file: if (bprm->file) { diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 2fedaf8..acf8695 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -27,7 +27,6 @@ #include <linux/init.h> #include <linux/blkdev.h> #include <linux/parser.h> -#include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> #include <linux/vfs.h> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8b5dd636..6a5edea 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -177,7 +177,7 @@ struct mpage_da_data { struct ext4_io_page { struct page *p_page; - int p_count; + atomic_t p_count; }; #define MAX_IO_PAGES 128 @@ -858,6 +858,7 @@ struct ext4_inode_info { spinlock_t i_completed_io_lock; /* current io_end structure for async DIO write*/ ext4_io_end_t *cur_aio_dio; + atomic_t i_ioend_count; /* Number of outstanding io_end structs */ /* * Transactions that contain inode's metadata needed to complete @@ -2060,6 +2061,7 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, /* page-io.c */ extern int __init ext4_init_pageio(void); extern void ext4_exit_pageio(void); +extern void ext4_ioend_wait(struct inode *); extern void ext4_free_io_end(ext4_io_end_t *io); extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); extern int ext4_end_io_nolock(ext4_io_end_t *io); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4d78342..bdbe699 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -53,6 +53,7 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { + trace_ext4_begin_ordered_truncate(inode, new_size); return jbd2_journal_begin_ordered_truncate( EXT4_SB(inode->i_sb)->s_journal, &EXT4_I(inode)->jinode, @@ -178,6 +179,7 @@ void ext4_evict_inode(struct inode *inode) handle_t *handle; int err; + trace_ext4_evict_inode(inode); if (inode->i_nlink) { truncate_inode_pages(&inode->i_data, 0); goto no_delete; @@ -5647,6 +5649,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) int err, ret; might_sleep(); + trace_ext4_mark_inode_dirty(inode, _RET_IP_); err = ext4_reserve_inode_write(handle, inode, &iloc); if (ext4_handle_valid(handle) && EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index bf5ae88..eb3bc2f 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -331,6 +331,30 @@ mext_out: return err; } + case FITRIM: + { + struct super_block *sb = inode->i_sb; + struct fstrim_range range; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&range, (struct fstrim_range *)arg, + sizeof(range))) + return -EFAULT; + + ret = ext4_trim_fs(sb, &range); + if (ret < 0) + return ret; + + if (copy_to_user((struct fstrim_range *)arg, &range, + sizeof(range))) + return -EFAULT; + + return 0; + } + default: return -ENOTTY; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c58eba34..5b4d4e3 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4640,8 +4640,6 @@ do_more: * with group lock held. generate_buddy look at * them with group lock_held */ - if (test_opt(sb, DISCARD)) - ext4_issue_discard(sb, block_group, bit, count); ext4_lock_group(sb, block_group); mb_clear_bits(bitmap_bh->b_data, bit, count); mb_free_blocks(inode, &e4b, bit, count); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 46a7d6a..beacce1 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -32,8 +32,14 @@ static struct kmem_cache *io_page_cachep, *io_end_cachep; +#define WQ_HASH_SZ 37 +#define to_ioend_wq(v) (&ioend_wq[((unsigned long)v) % WQ_HASH_SZ]) +static wait_queue_head_t ioend_wq[WQ_HASH_SZ]; + int __init ext4_init_pageio(void) { + int i; + io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT); if (io_page_cachep == NULL) return -ENOMEM; @@ -42,6 +48,8 @@ int __init ext4_init_pageio(void) kmem_cache_destroy(io_page_cachep); return -ENOMEM; } + for (i = 0; i < WQ_HASH_SZ; i++) + init_waitqueue_head(&ioend_wq[i]); return 0; } @@ -52,24 +60,37 @@ void ext4_exit_pageio(void) kmem_cache_destroy(io_page_cachep); } +void ext4_ioend_wait(struct inode *inode) +{ + wait_queue_head_t *wq = to_ioend_wq(inode); + + wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); +} + +static void put_io_page(struct ext4_io_page *io_page) +{ + if (atomic_dec_and_test(&io_page->p_count)) { + end_page_writeback(io_page->p_page); + put_page(io_page->p_page); + kmem_cache_free(io_page_cachep, io_page); + } +} + void ext4_free_io_end(ext4_io_end_t *io) { int i; + wait_queue_head_t *wq; BUG_ON(!io); if (io->page) put_page(io->page); - for (i = 0; i < io->num_io_pages; i++) { - if (--io->pages[i]->p_count == 0) { - struct page *page = io->pages[i]->p_page; - - end_page_writeback(page); - put_page(page); - kmem_cache_free(io_page_cachep, io->pages[i]); - } - } + for (i = 0; i < io->num_io_pages; i++) + put_io_page(io->pages[i]); io->num_io_pages = 0; - iput(io->inode); + wq = to_ioend_wq(io->inode); + if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) && + waitqueue_active(wq)) + wake_up_all(wq); kmem_cache_free(io_end_cachep, io); } @@ -142,8 +163,8 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) io = kmem_cache_alloc(io_end_cachep, flags); if (io) { memset(io, 0, sizeof(*io)); - io->inode = igrab(inode); - BUG_ON(!io->inode); + atomic_inc(&EXT4_I(inode)->i_ioend_count); + io->inode = inode; INIT_WORK(&io->work, ext4_end_io_work); INIT_LIST_HEAD(&io->list); } @@ -171,35 +192,15 @@ static void ext4_end_bio(struct bio *bio, int error) struct workqueue_struct *wq; struct inode *inode; unsigned long flags; - ext4_fsblk_t err_block; int i; BUG_ON(!io_end); - inode = io_end->inode; bio->bi_private = NULL; bio->bi_end_io = NULL; if (test_bit(BIO_UPTODATE, &bio->bi_flags)) error = 0; - err_block = bio->bi_sector >> (inode->i_blkbits - 9); bio_put(bio); - if (!(inode->i_sb->s_flags & MS_ACTIVE)) { - pr_err("sb umounted, discard end_io request for inode %lu\n", - io_end->inode->i_ino); - ext4_free_io_end(io_end); - return; - } - - if (error) { - io_end->flag |= EXT4_IO_END_ERROR; - ext4_warning(inode->i_sb, "I/O error writing to inode %lu " - "(offset %llu size %ld starting block %llu)", - inode->i_ino, - (unsigned long long) io_end->offset, - (long) io_end->size, - (unsigned long long) err_block); - } - for (i = 0; i < io_end->num_io_pages; i++) { struct page *page = io_end->pages[i]->p_page; struct buffer_head *bh, *head; @@ -236,14 +237,6 @@ static void ext4_end_bio(struct bio *bio, int error) } while (bh != head); } - if (--io_end->pages[i]->p_count == 0) { - struct page *page = io_end->pages[i]->p_page; - - end_page_writeback(page); - put_page(page); - kmem_cache_free(io_page_cachep, io_end->pages[i]); - } - /* * If this is a partial write which happened to make * all buffers uptodate then we can optimize away a @@ -253,9 +246,22 @@ static void ext4_end_bio(struct bio *bio, int error) */ if (!partial_write) SetPageUptodate(page); - } + put_io_page(io_end->pages[i]); + } io_end->num_io_pages = 0; + inode = io_end->inode; + + if (error) { + io_end->flag |= EXT4_IO_END_ERROR; + ext4_warning(inode->i_sb, "I/O error writing to inode %lu " + "(offset %llu size %ld starting block %llu)", + inode->i_ino, + (unsigned long long) io_end->offset, + (long) io_end->size, + (unsigned long long) + bio->bi_sector >> (inode->i_blkbits - 9)); + } /* Add the io_end to per-inode completed io list*/ spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); @@ -305,7 +311,6 @@ static int io_submit_init(struct ext4_io_submit *io, bio->bi_private = io->io_end = io_end; bio->bi_end_io = ext4_end_bio; - io_end->inode = inode; io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); io->io_bio = bio; @@ -360,7 +365,7 @@ submit_and_retry: if ((io_end->num_io_pages == 0) || (io_end->pages[io_end->num_io_pages-1] != io_page)) { io_end->pages[io_end->num_io_pages++] = io_page; - io_page->p_count++; + atomic_inc(&io_page->p_count); } return 0; } @@ -389,7 +394,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, return -ENOMEM; } io_page->p_page = page; - io_page->p_count = 0; + atomic_set(&io_page->p_count, 1); get_page(page); for (bh = head = page_buffers(page), block_start = 0; @@ -421,10 +426,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io, * PageWriteback bit from the page to prevent the system from * wedging later on. */ - if (io_page->p_count == 0) { - put_page(page); - end_page_writeback(page); - kmem_cache_free(io_page_cachep, io_page); - } + put_io_page(io_page); return ret; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 40131b7..e32195d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -828,12 +828,22 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->cur_aio_dio = NULL; ei->i_sync_tid = 0; ei->i_datasync_tid = 0; + atomic_set(&ei->i_ioend_count, 0); return &ei->vfs_inode; } +static int ext4_drop_inode(struct inode *inode) +{ + int drop = generic_drop_inode(inode); + + trace_ext4_drop_inode(inode, drop); + return drop; +} + static void ext4_destroy_inode(struct inode *inode) { + ext4_ioend_wait(inode); if (!list_empty(&(EXT4_I(inode)->i_orphan))) { ext4_msg(inode->i_sb, KERN_ERR, "Inode %lu (%p): orphan list check failed!", @@ -1173,6 +1183,7 @@ static const struct super_operations ext4_sops = { .destroy_inode = ext4_destroy_inode, .write_inode = ext4_write_inode, .dirty_inode = ext4_dirty_inode, + .drop_inode = ext4_drop_inode, .evict_inode = ext4_evict_inode, .put_super = ext4_put_super, .sync_fs = ext4_sync_fs, @@ -1186,7 +1197,6 @@ static const struct super_operations ext4_sops = { .quota_write = ext4_quota_write, #endif .bdev_try_to_free_page = bdev_try_to_free_page, - .trim_fs = ext4_trim_fs }; static const struct super_operations ext4_nojournal_sops = { @@ -1194,6 +1204,7 @@ static const struct super_operations ext4_nojournal_sops = { .destroy_inode = ext4_destroy_inode, .write_inode = ext4_write_inode, .dirty_inode = ext4_dirty_inode, + .drop_inode = ext4_drop_inode, .evict_inode = ext4_evict_inode, .write_super = ext4_write_super, .put_super = ext4_put_super, @@ -2699,7 +2710,6 @@ static int ext4_lazyinit_thread(void *arg) struct ext4_li_request *elr; unsigned long next_wakeup; DEFINE_WAIT(wait); - int ret; BUG_ON(NULL == eli); @@ -2723,13 +2733,12 @@ cont_thread: elr = list_entry(pos, struct ext4_li_request, lr_request); - if (time_after_eq(jiffies, elr->lr_next_sched)) - ret = ext4_run_li_request(elr); - - if (ret) { - ret = 0; - ext4_remove_li_request(elr); - continue; + if (time_after_eq(jiffies, elr->lr_next_sched)) { + if (ext4_run_li_request(elr) != 0) { + /* error, remove the lazy_init job */ + ext4_remove_li_request(elr); + continue; + } } if (time_before(elr->lr_next_sched, next_wakeup)) @@ -2740,7 +2749,8 @@ cont_thread: if (freezing(current)) refrigerator(); - if (time_after_eq(jiffies, next_wakeup)) { + if ((time_after_eq(jiffies, next_wakeup)) || + (MAX_JIFFY_OFFSET == next_wakeup)) { cond_resched(); continue; } @@ -2788,9 +2798,6 @@ static void ext4_clear_request_list(void) struct ext4_li_request *elr; mutex_lock(&ext4_li_info->li_list_mtx); - if (list_empty(&ext4_li_info->li_request_list)) - return; - list_for_each_safe(pos, n, &ext4_li_info->li_request_list) { elr = list_entry(pos, struct ext4_li_request, lr_request); @@ -3257,13 +3264,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * Test whether we have more sectors than will fit in sector_t, * and whether the max offset is addressable by the page cache. */ - ret = generic_check_addressable(sb->s_blocksize_bits, + err = generic_check_addressable(sb->s_blocksize_bits, ext4_blocks_count(es)); - if (ret) { + if (err) { ext4_msg(sb, KERN_ERR, "filesystem" " too large to mount safely on this system"); if (sizeof(sector_t) < 8) ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); + ret = err; goto failed_mount; } @@ -3348,6 +3356,24 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); + err = percpu_counter_init(&sbi->s_freeblocks_counter, + ext4_count_free_blocks(sb)); + if (!err) { + err = percpu_counter_init(&sbi->s_freeinodes_counter, + ext4_count_free_inodes(sb)); + } + if (!err) { + err = percpu_counter_init(&sbi->s_dirs_counter, + ext4_count_dirs(sb)); + } + if (!err) { + err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); + } + if (err) { + ext4_msg(sb, KERN_ERR, "insufficient memory"); + goto failed_mount3; + } + sbi->s_stripe = ext4_get_stripe_size(sbi); sbi->s_max_writeback_mb_bump = 128; @@ -3446,22 +3472,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); -no_journal: - err = percpu_counter_init(&sbi->s_freeblocks_counter, - ext4_count_free_blocks(sb)); - if (!err) - err = percpu_counter_init(&sbi->s_freeinodes_counter, - ext4_count_free_inodes(sb)); - if (!err) - err = percpu_counter_init(&sbi->s_dirs_counter, - ext4_count_dirs(sb)); - if (!err) - err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); - if (err) { - ext4_msg(sb, KERN_ERR, "insufficient memory"); - goto failed_mount_wq; - } + /* + * The journal may have updated the bg summary counts, so we + * need to update the global counters. + */ + percpu_counter_set(&sbi->s_freeblocks_counter, + ext4_count_free_blocks(sb)); + percpu_counter_set(&sbi->s_freeinodes_counter, + ext4_count_free_inodes(sb)); + percpu_counter_set(&sbi->s_dirs_counter, + ext4_count_dirs(sb)); + percpu_counter_set(&sbi->s_dirtyblocks_counter, 0); +no_journal: EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); if (!EXT4_SB(sb)->dio_unwritten_wq) { printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); @@ -3611,10 +3634,6 @@ failed_mount_wq: jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } - percpu_counter_destroy(&sbi->s_freeblocks_counter); - percpu_counter_destroy(&sbi->s_freeinodes_counter); - percpu_counter_destroy(&sbi->s_dirs_counter); - percpu_counter_destroy(&sbi->s_dirtyblocks_counter); failed_mount3: if (sbi->s_flex_groups) { if (is_vmalloc_addr(sbi->s_flex_groups)) @@ -3622,6 +3641,10 @@ failed_mount3: else kfree(sbi->s_flex_groups); } + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyblocks_counter); failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); @@ -3949,13 +3972,11 @@ static int ext4_commit_super(struct super_block *sb, int sync) else es->s_kbytes_written = cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); - if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeblocks_counter)) - ext4_free_blocks_count_set(es, percpu_counter_sum_positive( - &EXT4_SB(sb)->s_freeblocks_counter)); - if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter)) - es->s_free_inodes_count = - cpu_to_le32(percpu_counter_sum_positive( - &EXT4_SB(sb)->s_freeinodes_counter)); + ext4_free_blocks_count_set(es, percpu_counter_sum_positive( + &EXT4_SB(sb)->s_freeblocks_counter)); + es->s_free_inodes_count = + cpu_to_le32(percpu_counter_sum_positive( + &EXT4_SB(sb)->s_freeinodes_counter)); sb->s_dirt = 0; BUFFER_TRACE(sbh, "marking dirty"); mark_buffer_dirty(sbh); @@ -4556,12 +4577,10 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, static int ext4_quota_off(struct super_block *sb, int type) { - /* Force all delayed allocation blocks to be allocated */ - if (test_opt(sb, DELALLOC)) { - down_read(&sb->s_umount); + /* Force all delayed allocation blocks to be allocated. + * Caller already holds s_umount sem */ + if (test_opt(sb, DELALLOC)) sync_filesystem(sb); - up_read(&sb->s_umount); - } return dquot_quota_off(sb, type); } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index c822458..9242d29 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -134,6 +134,7 @@ EXPORT_SYMBOL_GPL(fuse_do_open); void fuse_finish_open(struct inode *inode, struct file *file) { struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = get_fuse_conn(inode); if (ff->open_flags & FOPEN_DIRECT_IO) file->f_op = &fuse_direct_io_file_operations; @@ -141,6 +142,15 @@ void fuse_finish_open(struct inode *inode, struct file *file) invalidate_inode_pages2(inode->i_mapping); if (ff->open_flags & FOPEN_NONSEEKABLE) nonseekable_open(inode, file); + if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fc->lock); + fi->attr_version = ++fc->attr_version; + i_size_write(inode, 0); + spin_unlock(&fc->lock); + fuse_invalidate_attr(inode); + } } int fuse_open_common(struct inode *inode, struct file *file, bool isdir) diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index 06d5827..5ab3839 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -138,10 +138,8 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, struct gfs2_inum_host *inum) { struct gfs2_sbd *sdp = sb->s_fs_info; - struct gfs2_holder i_gh; struct inode *inode; struct dentry *dentry; - int error; inode = gfs2_ilookup(sb, inum->no_addr); if (inode) { @@ -152,52 +150,16 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, goto out_inode; } - error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops, - LM_ST_SHARED, LM_FLAG_ANY, &i_gh); - if (error) - return ERR_PTR(error); - - error = gfs2_check_blk_type(sdp, inum->no_addr, GFS2_BLKST_DINODE); - if (error) - goto fail; - - inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0); - if (IS_ERR(inode)) { - error = PTR_ERR(inode); - goto fail; - } - - error = gfs2_inode_refresh(GFS2_I(inode)); - if (error) { - iput(inode); - goto fail; - } - - /* Pick up the works we bypass in gfs2_inode_lookup */ - if (inode->i_state & I_NEW) - gfs2_set_iop(inode); - - if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { - iput(inode); - goto fail; - } - - error = -EIO; - if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) { - iput(inode); - goto fail; - } - - gfs2_glock_dq_uninit(&i_gh); + inode = gfs2_lookup_by_inum(sdp, inum->no_addr, &inum->no_formal_ino, + GFS2_BLKST_DINODE); + if (IS_ERR(inode)) + return ERR_CAST(inode); out_inode: dentry = d_obtain_alias(inode); if (!IS_ERR(dentry)) dentry->d_op = &gfs2_dops; return dentry; -fail: - gfs2_glock_dq_uninit(&i_gh); - return ERR_PTR(error); } static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid, diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 8777885..f92c177 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -686,21 +686,20 @@ static void delete_work_func(struct work_struct *work) { struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete); struct gfs2_sbd *sdp = gl->gl_sbd; - struct gfs2_inode *ip = NULL; + struct gfs2_inode *ip; struct inode *inode; - u64 no_addr = 0; + u64 no_addr = gl->gl_name.ln_number; + + ip = gl->gl_object; + /* Note: Unsafe to dereference ip as we don't hold right refs/locks */ - spin_lock(&gl->gl_spin); - ip = (struct gfs2_inode *)gl->gl_object; if (ip) - no_addr = ip->i_no_addr; - spin_unlock(&gl->gl_spin); - if (ip) { inode = gfs2_ilookup(sdp->sd_vfs, no_addr); - if (inode) { - d_prune_aliases(inode); - iput(inode); - } + else + inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED); + if (inode && !IS_ERR(inode)) { + d_prune_aliases(inode); + iput(inode); } gfs2_glock_put(gl); } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 06370f8..e1213f7 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -73,49 +73,6 @@ static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr) return iget5_locked(sb, hash, iget_test, iget_set, &no_addr); } -struct gfs2_skip_data { - u64 no_addr; - int skipped; -}; - -static int iget_skip_test(struct inode *inode, void *opaque) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_skip_data *data = opaque; - - if (ip->i_no_addr == data->no_addr) { - if (inode->i_state & (I_FREEING|I_WILL_FREE)){ - data->skipped = 1; - return 0; - } - return 1; - } - return 0; -} - -static int iget_skip_set(struct inode *inode, void *opaque) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_skip_data *data = opaque; - - if (data->skipped) - return 1; - inode->i_ino = (unsigned long)(data->no_addr); - ip->i_no_addr = data->no_addr; - return 0; -} - -static struct inode *gfs2_iget_skip(struct super_block *sb, - u64 no_addr) -{ - struct gfs2_skip_data data; - unsigned long hash = (unsigned long)no_addr; - - data.no_addr = no_addr; - data.skipped = 0; - return iget5_locked(sb, hash, iget_skip_test, iget_skip_set, &data); -} - /** * GFS2 lookup code fills in vfs inode contents based on info obtained * from directory entry inside gfs2_inode_lookup(). This has caused issues @@ -243,93 +200,54 @@ fail: return ERR_PTR(error); } -/** - * gfs2_process_unlinked_inode - Lookup an unlinked inode for reclamation - * and try to reclaim it by doing iput. - * - * This function assumes no rgrp locks are currently held. - * - * @sb: The super block - * no_addr: The inode number - * - */ - -void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr) +struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, + u64 *no_formal_ino, unsigned int blktype) { - struct gfs2_sbd *sdp; - struct gfs2_inode *ip; - struct gfs2_glock *io_gl = NULL; - int error; - struct gfs2_holder gh; + struct super_block *sb = sdp->sd_vfs; + struct gfs2_holder i_gh; struct inode *inode; + int error; - inode = gfs2_iget_skip(sb, no_addr); - - if (!inode) - return; - - /* If it's not a new inode, someone's using it, so leave it alone. */ - if (!(inode->i_state & I_NEW)) { - iput(inode); - return; - } - - ip = GFS2_I(inode); - sdp = GFS2_SB(inode); - ip->i_no_formal_ino = -1; + error = gfs2_glock_nq_num(sdp, no_addr, &gfs2_inode_glops, + LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (error) + return ERR_PTR(error); - error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); - if (unlikely(error)) + error = gfs2_check_blk_type(sdp, no_addr, blktype); + if (error) goto fail; - ip->i_gl->gl_object = ip; - error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl); - if (unlikely(error)) - goto fail_put; - - set_bit(GIF_INVALID, &ip->i_flags); - error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT, - &ip->i_iopen_gh); - if (unlikely(error)) - goto fail_iopen; + inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0); + if (IS_ERR(inode)) + goto fail; - ip->i_iopen_gh.gh_gl->gl_object = ip; - gfs2_glock_put(io_gl); - io_gl = NULL; + error = gfs2_inode_refresh(GFS2_I(inode)); + if (error) + goto fail_iput; - inode->i_mode = DT2IF(DT_UNKNOWN); + /* Pick up the works we bypass in gfs2_inode_lookup */ + if (inode->i_state & I_NEW) + gfs2_set_iop(inode); - /* - * We must read the inode in order to work out its type in - * this case. Note that this doesn't happen often as we normally - * know the type beforehand. This code path only occurs during - * unlinked inode recovery (where it is safe to do this glock, - * which is not true in the general case). - */ - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY, - &gh); - if (unlikely(error)) - goto fail_glock; + /* Two extra checks for NFS only */ + if (no_formal_ino) { + error = -ESTALE; + if (GFS2_I(inode)->i_no_formal_ino != *no_formal_ino) + goto fail_iput; - /* Inode is now uptodate */ - gfs2_glock_dq_uninit(&gh); - gfs2_set_iop(inode); + error = -EIO; + if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) + goto fail_iput; - /* The iput will cause it to be deleted. */ - iput(inode); - return; + error = 0; + } -fail_glock: - gfs2_glock_dq(&ip->i_iopen_gh); -fail_iopen: - if (io_gl) - gfs2_glock_put(io_gl); -fail_put: - ip->i_gl->gl_object = NULL; - gfs2_glock_put(ip->i_gl); fail: - iget_failed(inode); - return; + gfs2_glock_dq_uninit(&i_gh); + return error ? ERR_PTR(error) : inode; +fail_iput: + iput(inode); + goto fail; } static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 6720d7d..d8499fa 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -99,7 +99,9 @@ err: extern void gfs2_set_iop(struct inode *inode); extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, u64 no_addr, u64 no_formal_ino); -extern void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr); +extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, + u64 *no_formal_ino, + unsigned int blktype); extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); extern int gfs2_inode_refresh(struct gfs2_inode *ip); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 58a9b99..f606baf 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -631,6 +631,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, struct fs_disk_quota *fdq) { struct inode *inode = &ip->i_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); struct address_space *mapping = inode->i_mapping; unsigned long index = loc >> PAGE_CACHE_SHIFT; unsigned offset = loc & (PAGE_CACHE_SIZE - 1); @@ -658,11 +659,11 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, qd->qd_qb.qb_value = qp->qu_value; if (fdq) { if (fdq->d_fieldmask & FS_DQ_BSOFT) { - qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit); + qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift); qd->qd_qb.qb_warn = qp->qu_warn; } if (fdq->d_fieldmask & FS_DQ_BHARD) { - qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit); + qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift); qd->qd_qb.qb_limit = qp->qu_limit; } } @@ -1497,9 +1498,9 @@ static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id, fdq->d_version = FS_DQUOT_VERSION; fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA; fdq->d_id = id; - fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit); - fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn); - fdq->d_bcount = be64_to_cpu(qlvb->qb_value); + fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_fsb2bb_shift; + fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_fsb2bb_shift; + fdq->d_bcount = be64_to_cpu(qlvb->qb_value) << sdp->sd_fsb2bb_shift; gfs2_glock_dq_uninit(&q_gh); out: @@ -1566,10 +1567,10 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, /* If nothing has changed, this is a no-op */ if ((fdq->d_fieldmask & FS_DQ_BSOFT) && - (fdq->d_blk_softlimit == be64_to_cpu(qd->qd_qb.qb_warn))) + ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn))) fdq->d_fieldmask ^= FS_DQ_BSOFT; if ((fdq->d_fieldmask & FS_DQ_BHARD) && - (fdq->d_blk_hardlimit == be64_to_cpu(qd->qd_qb.qb_limit))) + ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit))) fdq->d_fieldmask ^= FS_DQ_BHARD; if (fdq->d_fieldmask == 0) goto out_i; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index bef3ab6..33c8407 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -963,17 +963,18 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al) * The inode, if one has been found, in inode. */ -static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, - u64 skip) +static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip) { u32 goal = 0, block; u64 no_addr; struct gfs2_sbd *sdp = rgd->rd_sbd; unsigned int n; + struct gfs2_glock *gl; + struct gfs2_inode *ip; + int error; + int found = 0; - for(;;) { - if (goal >= rgd->rd_data) - break; + while (goal < rgd->rd_data) { down_write(&sdp->sd_log_flush_lock); n = 1; block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED, @@ -990,11 +991,32 @@ static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, if (no_addr == skip) continue; *last_unlinked = no_addr; - return no_addr; + + error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &gl); + if (error) + continue; + + /* If the inode is already in cache, we can ignore it here + * because the existing inode disposal code will deal with + * it when all refs have gone away. Accessing gl_object like + * this is not safe in general. Here it is ok because we do + * not dereference the pointer, and we only need an approx + * answer to whether it is NULL or not. + */ + ip = gl->gl_object; + + if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) + gfs2_glock_put(gl); + else + found++; + + /* Limit reclaim to sensible number of tasks */ + if (found > 2*NR_CPUS) + return; } rgd->rd_flags &= ~GFS2_RDF_CHECK; - return 0; + return; } /** @@ -1075,11 +1097,9 @@ static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd) * Try to acquire rgrp in way which avoids contending with others. * * Returns: errno - * unlinked: the block address of an unlinked block to be reclaimed */ -static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked, - u64 *last_unlinked) +static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd, *begin = NULL; @@ -1089,7 +1109,6 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked, int loops = 0; int error, rg_locked; - *unlinked = 0; rgd = gfs2_blk2rgrpd(sdp, ip->i_goal); while (rgd) { @@ -1106,17 +1125,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked, case 0: if (try_rgrp_fit(rgd, al)) goto out; - /* If the rg came in already locked, there's no - way we can recover from a failed try_rgrp_unlink - because that would require an iput which can only - happen after the rgrp is unlocked. */ - if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK) - *unlinked = try_rgrp_unlink(rgd, last_unlinked, - ip->i_no_addr); + if (rgd->rd_flags & GFS2_RDF_CHECK) + try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); if (!rg_locked) gfs2_glock_dq_uninit(&al->al_rgd_gh); - if (*unlinked) - return -EAGAIN; /* fall through */ case GLR_TRYFAILED: rgd = recent_rgrp_next(rgd); @@ -1145,13 +1157,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked, case 0: if (try_rgrp_fit(rgd, al)) goto out; - if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK) - *unlinked = try_rgrp_unlink(rgd, last_unlinked, - ip->i_no_addr); + if (rgd->rd_flags & GFS2_RDF_CHECK) + try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); if (!rg_locked) gfs2_glock_dq_uninit(&al->al_rgd_gh); - if (*unlinked) - return -EAGAIN; break; case GLR_TRYFAILED: @@ -1204,12 +1213,12 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_alloc *al = ip->i_alloc; int error = 0; - u64 last_unlinked = NO_BLOCK, unlinked; + u64 last_unlinked = NO_BLOCK; + int tries = 0; if (gfs2_assert_warn(sdp, al->al_requested)) return -EINVAL; -try_again: if (hold_rindex) { /* We need to hold the rindex unless the inode we're using is the rindex itself, in which case it's already held. */ @@ -1218,31 +1227,23 @@ try_again: else if (!sdp->sd_rgrps) /* We may not have the rindex read in, so: */ error = gfs2_ri_update_special(ip); + if (error) + return error; } - if (error) - return error; + do { + error = get_local_rgrp(ip, &last_unlinked); + /* If there is no space, flushing the log may release some */ + if (error) + gfs2_log_flush(sdp, NULL); + } while (error && tries++ < 3); - /* Find an rgrp suitable for allocation. If it encounters any unlinked - dinodes along the way, error will equal -EAGAIN and unlinked will - contains it block address. We then need to look up that inode and - try to free it, and try the allocation again. */ - error = get_local_rgrp(ip, &unlinked, &last_unlinked); if (error) { if (hold_rindex && ip != GFS2_I(sdp->sd_rindex)) gfs2_glock_dq_uninit(&al->al_ri_gh); - if (error != -EAGAIN) - return error; - - gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked); - /* regardless of whether or not gfs2_process_unlinked_inode - was successful, we don't want to repeat it again. */ - last_unlinked = unlinked; - gfs2_log_flush(sdp, NULL); - error = 0; - - goto try_again; + return error; } + /* no error, so we have the rgrp set in the inode's allocation. */ al->al_file = file; al->al_line = line; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index d6cfac1..a5fe681 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -932,8 +932,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { *user = current_user(); if (user_shm_lock(size, *user)) { - WARN_ONCE(1, - "Using mlock ulimits for SHM_HUGETLB deprecated\n"); + printk_once(KERN_WARNING "Using mlock ulimits for SHM_HUGETLB is deprecated\n"); } else { *user = NULL; return ERR_PTR(-EPERM); @@ -6,7 +6,6 @@ #include <linux/syscalls.h> #include <linux/mm.h> -#include <linux/smp_lock.h> #include <linux/capability.h> #include <linux/file.h> #include <linux/fs.h> @@ -530,41 +529,6 @@ static int ioctl_fsthaw(struct file *filp) return thaw_super(sb); } -static int ioctl_fstrim(struct file *filp, void __user *argp) -{ - struct super_block *sb = filp->f_path.dentry->d_inode->i_sb; - struct fstrim_range range; - int ret = 0; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - /* If filesystem doesn't support trim feature, return. */ - if (sb->s_op->trim_fs == NULL) - return -EOPNOTSUPP; - - /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */ - if (sb->s_bdev == NULL) - return -EINVAL; - - if (argp == NULL) { - range.start = 0; - range.len = ULLONG_MAX; - range.minlen = 0; - } else if (copy_from_user(&range, argp, sizeof(range))) - return -EFAULT; - - ret = sb->s_op->trim_fs(sb, &range); - if (ret < 0) - return ret; - - if ((argp != NULL) && - (copy_to_user(argp, &range, sizeof(range)))) - return -EFAULT; - - return 0; -} - /* * When you add any new common ioctls to the switches above and below * please update compat_sys_ioctl() too. @@ -615,10 +579,6 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, error = ioctl_fsthaw(filp); break; - case FITRIM: - error = ioctl_fstrim(filp, argp); - break; - case FS_IOC_FIEMAP: return ioctl_fiemap(filp, arg); diff --git a/fs/ioprio.c b/fs/ioprio.c index 748cfb9..7da2a06 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -103,12 +103,7 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) } ret = -ESRCH; - /* - * We want IOPRIO_WHO_PGRP/IOPRIO_WHO_USER to be "atomic", - * so we can't use rcu_read_lock(). See re-copy of ->ioprio - * in copy_process(). - */ - read_lock(&tasklist_lock); + rcu_read_lock(); switch (which) { case IOPRIO_WHO_PROCESS: if (!who) @@ -153,7 +148,7 @@ free_uid: ret = -EINVAL; } - read_unlock(&tasklist_lock); + rcu_read_unlock(); return ret; } @@ -197,7 +192,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) int ret = -ESRCH; int tmpio; - read_lock(&tasklist_lock); + rcu_read_lock(); switch (which) { case IOPRIO_WHO_PROCESS: if (!who) @@ -250,6 +245,6 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) ret = -EINVAL; } - read_unlock(&tasklist_lock); + rcu_read_unlock(); return ret; } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index c590d15..f837ba9 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -899,6 +899,14 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, /* journal descriptor can store up to n blocks -bzzz */ journal->j_blocksize = blocksize; + journal->j_dev = bdev; + journal->j_fs_dev = fs_dev; + journal->j_blk_offset = start; + journal->j_maxlen = len; + bdevname(journal->j_dev, journal->j_devname); + p = journal->j_devname; + while ((p = strchr(p, '/'))) + *p = '!'; jbd2_stats_proc_init(journal); n = journal->j_blocksize / sizeof(journal_block_tag_t); journal->j_wbufsize = n; @@ -908,14 +916,6 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, __func__); goto out_err; } - journal->j_dev = bdev; - journal->j_fs_dev = fs_dev; - journal->j_blk_offset = start; - journal->j_maxlen = len; - bdevname(journal->j_dev, journal->j_devname); - p = journal->j_devname; - while ((p = strchr(p, '/'))) - *p = '!'; bh = __getblk(journal->j_dev, start, journal->j_blocksize); if (!bh) { diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index d5bb868..25509eb 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -14,7 +14,6 @@ #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svc.h> #include <linux/lockd/lockd.h> -#include <linux/smp_lock.h> #include <linux/kthread.h> #define NLMDBG_FACILITY NLMDBG_CLIENT diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 47ea1e1..332c54c 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -7,7 +7,6 @@ */ #include <linux/module.h> -#include <linux/smp_lock.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/errno.h> diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 25e21e4..ed0c59f 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -124,7 +124,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) continue; if (host->h_server != ni->server) continue; - if (ni->server && + if (ni->server && ni->src_len != 0 && !rpc_cmp_addr(nlm_srcaddr(host), ni->src_sap)) continue; @@ -167,6 +167,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) host->h_addrlen = ni->salen; rpc_set_port(nlm_addr(host), 0); memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len); + host->h_srcaddrlen = ni->src_len; host->h_version = ni->version; host->h_proto = ni->protocol; host->h_rpcclnt = NULL; @@ -238,9 +239,6 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, const char *hostname, int noresvport) { - const struct sockaddr source = { - .sa_family = AF_UNSPEC, - }; struct nlm_lookup_host_info ni = { .server = 0, .sap = sap, @@ -249,8 +247,6 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, .version = version, .hostname = hostname, .hostname_len = strlen(hostname), - .src_sap = &source, - .src_len = sizeof(source), .noresvport = noresvport, }; @@ -357,7 +353,6 @@ nlm_bind_host(struct nlm_host *host) .protocol = host->h_proto, .address = nlm_addr(host), .addrsize = host->h_addrlen, - .saddress = nlm_srcaddr(host), .timeout = &timeparms, .servername = host->h_name, .program = &nlm_program, @@ -376,6 +371,8 @@ nlm_bind_host(struct nlm_host *host) args.flags |= RPC_CLNT_CREATE_HARDRTRY; if (host->h_noresvport) args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; + if (host->h_srcaddrlen) + args.saddress = nlm_srcaddr(host); clnt = rpc_create(&args); if (!IS_ERR(clnt)) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index a336e83..38d2611 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -9,7 +9,6 @@ #include <linux/types.h> #include <linux/time.h> -#include <linux/smp_lock.h> #include <linux/lockd/lockd.h> #include <linux/lockd/share.h> diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index c462d34..ef5659b 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -25,7 +25,6 @@ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched.h> -#include <linux/smp_lock.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svc.h> #include <linux/lockd/nlm.h> diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index c3069f3..0caea53 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -9,7 +9,6 @@ #include <linux/types.h> #include <linux/time.h> -#include <linux/smp_lock.h> #include <linux/lockd/lockd.h> #include <linux/lockd/share.h> @@ -122,7 +122,6 @@ #include <linux/module.h> #include <linux/security.h> #include <linux/slab.h> -#include <linux/smp_lock.h> #include <linux/syscalls.h> #include <linux/time.h> #include <linux/rcupdate.h> @@ -1504,9 +1503,8 @@ static int do_fcntl_delete_lease(struct file *filp) static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) { - struct file_lock *fl; + struct file_lock *fl, *ret; struct fasync_struct *new; - struct inode *inode = filp->f_path.dentry->d_inode; int error; fl = lease_alloc(filp, arg); @@ -1518,13 +1516,16 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) locks_free_lock(fl); return -ENOMEM; } + ret = fl; lock_flocks(); - error = __vfs_setlease(filp, arg, &fl); + error = __vfs_setlease(filp, arg, &ret); if (error) { unlock_flocks(); locks_free_lock(fl); goto out_free_fasync; } + if (ret != fl) + locks_free_lock(fl); /* * fasync_insert_entry() returns the old entry if any. @@ -1532,17 +1533,10 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) * inserted it into the fasync list. Clear new so that * we don't release it here. */ - if (!fasync_insert_entry(fd, filp, &fl->fl_fasync, new)) + if (!fasync_insert_entry(fd, filp, &ret->fl_fasync, new)) new = NULL; - if (error < 0) { - /* remove lease just inserted by setlease */ - fl->fl_type = F_UNLCK | F_INPROGRESS; - fl->fl_break_time = jiffies - 10; - time_out_leases(inode); - } else { - error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); - } + error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); unlock_flocks(); out_free_fasync: diff --git a/fs/namespace.c b/fs/namespace.c index 8a415c9..3dbfc07 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -13,7 +13,6 @@ #include <linux/sched.h> #include <linux/spinlock.h> #include <linux/percpu.h> -#include <linux/smp_lock.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/acct.h> diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index aac8832e..f22b12e7 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -19,7 +19,6 @@ #include <linux/mm.h> #include <asm/uaccess.h> #include <asm/byteorder.h> -#include <linux/smp_lock.h> #include <linux/ncp_fs.h> diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index 6c754f7..cb50aaf 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -17,7 +17,6 @@ #include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/sched.h> -#include <linux/smp_lock.h> #include <linux/ncp_fs.h> #include "ncplib_kernel.h" diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index d290545..8fb93b6 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -26,7 +26,6 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/init.h> -#include <linux/smp_lock.h> #include <linux/vfs.h> #include <linux/mount.h> #include <linux/seq_file.h> diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index c2a1f9a..d40a547 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -17,7 +17,6 @@ #include <linux/mount.h> #include <linux/slab.h> #include <linux/highuid.h> -#include <linux/smp_lock.h> #include <linux/vmalloc.h> #include <linux/sched.h> diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index aeec017..93a8b3b 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -9,7 +9,6 @@ #include <linux/completion.h> #include <linux/ip.h> #include <linux/module.h> -#include <linux/smp_lock.h> #include <linux/sunrpc/svc.h> #include <linux/sunrpc/svcsock.h> #include <linux/nfs_fs.h> diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 232a7ee..1fd62fc 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -11,7 +11,6 @@ #include <linux/module.h> #include <linux/sched.h> #include <linux/slab.h> -#include <linux/smp_lock.h> #include <linux/spinlock.h> #include <linux/nfs4.h> diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 07ac384..f0a384e 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -34,6 +34,7 @@ #include <linux/mount.h> #include <linux/sched.h> #include <linux/vmalloc.h> +#include <linux/kmemleak.h> #include "delegation.h" #include "iostat.h" @@ -161,6 +162,7 @@ struct nfs_cache_array_entry { u64 cookie; u64 ino; struct qstr string; + unsigned char d_type; }; struct nfs_cache_array { @@ -170,8 +172,6 @@ struct nfs_cache_array { struct nfs_cache_array_entry array[0]; }; -#define MAX_READDIR_ARRAY ((PAGE_SIZE - sizeof(struct nfs_cache_array)) / sizeof(struct nfs_cache_array_entry)) - typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); typedef struct { struct file *file; @@ -194,9 +194,13 @@ typedef struct { static struct nfs_cache_array *nfs_readdir_get_array(struct page *page) { + void *ptr; if (page == NULL) return ERR_PTR(-EIO); - return (struct nfs_cache_array *)kmap(page); + ptr = kmap(page); + if (ptr == NULL) + return ERR_PTR(-ENOMEM); + return ptr; } static @@ -213,6 +217,9 @@ int nfs_readdir_clear_array(struct page *page, gfp_t mask) { struct nfs_cache_array *array = nfs_readdir_get_array(page); int i; + + if (IS_ERR(array)) + return PTR_ERR(array); for (i = 0; i < array->size; i++) kfree(array->array[i].string.name); nfs_readdir_release_array(page); @@ -231,6 +238,11 @@ int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int le string->name = kmemdup(name, len, GFP_KERNEL); if (string->name == NULL) return -ENOMEM; + /* + * Avoid a kmemleak false positive. The pointer to the name is stored + * in a page cache page which kmemleak does not scan. + */ + kmemleak_not_leak(string->name); string->hash = full_name_hash(name, len); return 0; } @@ -244,20 +256,24 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) if (IS_ERR(array)) return PTR_ERR(array); - ret = -EIO; - if (array->size >= MAX_READDIR_ARRAY) - goto out; cache_entry = &array->array[array->size]; + + /* Check that this entry lies within the page bounds */ + ret = -ENOSPC; + if ((char *)&cache_entry[1] - (char *)page_address(page) > PAGE_SIZE) + goto out; + cache_entry->cookie = entry->prev_cookie; cache_entry->ino = entry->ino; + cache_entry->d_type = entry->d_type; ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len); if (ret) goto out; array->last_cookie = entry->cookie; + array->size++; if (entry->eof == 1) array->eof_index = array->size; - array->size++; out: nfs_readdir_release_array(page); return ret; @@ -272,7 +288,7 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri if (diff < 0) goto out_eof; if (diff >= array->size) { - if (array->eof_index > 0) + if (array->eof_index >= 0) goto out_eof; desc->current_index += array->size; return -EAGAIN; @@ -281,8 +297,6 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri index = (unsigned int)diff; *desc->dir_cookie = array->array[index].cookie; desc->cache_entry_index = index; - if (index == array->eof_index) - desc->eof = 1; return 0; out_eof: desc->eof = 1; @@ -296,17 +310,17 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des int status = -EAGAIN; for (i = 0; i < array->size; i++) { - if (i == array->eof_index) { - desc->eof = 1; - status = -EBADCOOKIE; - } if (array->array[i].cookie == *desc->dir_cookie) { desc->cache_entry_index = i; status = 0; - break; + goto out; } } - + if (i == array->eof_index) { + desc->eof = 1; + status = -EBADCOOKIE; + } +out: return status; } @@ -381,13 +395,9 @@ int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct x static int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) { - struct nfs_inode *node; if (dentry->d_inode == NULL) goto different; - node = NFS_I(dentry->d_inode); - if (node->fh.size != entry->fh->size) - goto different; - if (strncmp(node->fh.data, entry->fh->data, node->fh.size) != 0) + if (nfs_compare_fh(entry->fh, NFS_FH(dentry->d_inode)) != 0) goto different; return 1; different: @@ -449,14 +459,15 @@ out: /* Perform conversion from xdr to cache array */ static -void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, +int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, void *xdr_page, struct page *page, unsigned int buflen) { struct xdr_stream stream; struct xdr_buf buf; __be32 *ptr = xdr_page; - int status; struct nfs_cache_array *array; + unsigned int count = 0; + int status; buf.head->iov_base = xdr_page; buf.head->iov_len = buflen; @@ -471,21 +482,32 @@ void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *e do { status = xdr_decode(desc, entry, &stream); - if (status != 0) + if (status != 0) { + if (status == -EAGAIN) + status = 0; break; + } + + count++; - if (nfs_readdir_add_to_array(entry, page) == -1) - break; if (desc->plus == 1) nfs_prime_dcache(desc->file->f_path.dentry, entry); + + status = nfs_readdir_add_to_array(entry, page); + if (status != 0) + break; } while (!entry->eof); - if (status == -EBADCOOKIE && entry->eof) { + if (count == 0 || (status == -EBADCOOKIE && entry->eof == 1)) { array = nfs_readdir_get_array(page); - array->eof_index = array->size - 1; - status = 0; - nfs_readdir_release_array(page); + if (!IS_ERR(array)) { + array->eof_index = array->size; + status = 0; + nfs_readdir_release_array(page); + } else + status = PTR_ERR(array); } + return status; } static @@ -537,7 +559,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct nfs_entry entry; struct file *file = desc->file; struct nfs_cache_array *array; - int status = 0; + int status = -ENOMEM; unsigned int array_size = ARRAY_SIZE(pages); entry.prev_cookie = 0; @@ -549,6 +571,10 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, goto out; array = nfs_readdir_get_array(page); + if (IS_ERR(array)) { + status = PTR_ERR(array); + goto out; + } memset(array, 0, sizeof(struct nfs_cache_array)); array->eof_index = -1; @@ -556,12 +582,19 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, if (!pages_ptr) goto out_release_array; do { + unsigned int pglen; status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode); if (status < 0) break; - nfs_readdir_page_filler(desc, &entry, pages_ptr, page, array_size * PAGE_SIZE); - } while (array->eof_index < 0 && array->size < MAX_READDIR_ARRAY); + pglen = status; + status = nfs_readdir_page_filler(desc, &entry, pages_ptr, page, pglen); + if (status < 0) { + if (status == -ENOSPC) + status = 0; + break; + } + } while (array->eof_index < 0); nfs_readdir_free_large_page(pages_ptr, pages, array_size); out_release_array: @@ -582,8 +615,10 @@ static int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) { struct inode *inode = desc->file->f_path.dentry->d_inode; + int ret; - if (nfs_readdir_xdr_to_array(desc, page, inode) < 0) + ret = nfs_readdir_xdr_to_array(desc, page, inode); + if (ret < 0) goto error; SetPageUptodate(page); @@ -595,7 +630,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) return 0; error: unlock_page(page); - return -EIO; + return ret; } static @@ -608,12 +643,8 @@ void cache_page_release(nfs_readdir_descriptor_t *desc) static struct page *get_cache_page(nfs_readdir_descriptor_t *desc) { - struct page *page; - page = read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping, + return read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping, desc->page_index, (filler_t *)nfs_readdir_filler, desc); - if (IS_ERR(page)) - desc->eof = 1; - return page; } /* @@ -639,8 +670,10 @@ int find_cache_page(nfs_readdir_descriptor_t *desc) static inline int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) { - int res = -EAGAIN; + int res; + if (desc->page_index == 0) + desc->current_index = 0; while (1) { res = find_cache_page(desc); if (res != -EAGAIN) @@ -666,35 +699,36 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, int i = 0; int res = 0; struct nfs_cache_array *array = NULL; - unsigned int d_type = DT_UNKNOWN; - struct dentry *dentry = NULL; array = nfs_readdir_get_array(desc->page); + if (IS_ERR(array)) { + res = PTR_ERR(array); + goto out; + } for (i = desc->cache_entry_index; i < array->size; i++) { - d_type = DT_UNKNOWN; + struct nfs_cache_array_entry *ent; - res = filldir(dirent, array->array[i].string.name, - array->array[i].string.len, file->f_pos, - nfs_compat_user_ino64(array->array[i].ino), d_type); - if (res < 0) + ent = &array->array[i]; + if (filldir(dirent, ent->string.name, ent->string.len, + file->f_pos, nfs_compat_user_ino64(ent->ino), + ent->d_type) < 0) { + desc->eof = 1; break; + } file->f_pos++; desc->cache_entry_index = i; if (i < (array->size-1)) *desc->dir_cookie = array->array[i+1].cookie; else *desc->dir_cookie = array->last_cookie; - if (i == array->eof_index) { - desc->eof = 1; - break; - } } + if (i == array->eof_index) + desc->eof = 1; nfs_readdir_release_array(desc->page); +out: cache_page_release(desc); - if (dentry != NULL) - dput(dentry); dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", (unsigned long long)*desc->dir_cookie, res); return res; @@ -729,13 +763,13 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, goto out; } - if (nfs_readdir_xdr_to_array(desc, page, inode) == -1) { - status = -EIO; - goto out_release; - } - desc->page_index = 0; desc->page = page; + + status = nfs_readdir_xdr_to_array(desc, page, inode); + if (status < 0) + goto out_release; + status = nfs_do_filldir(desc, dirent, filldir); out: @@ -786,14 +820,14 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) res = readdir_search_pagecache(desc); if (res == -EBADCOOKIE) { + res = 0; /* This means either end of directory */ if (*desc->dir_cookie && desc->eof == 0) { /* Or that the server has 'lost' a cookie */ res = uncached_readdir(desc, dirent, filldir); - if (res >= 0) + if (res == 0) continue; } - res = 0; break; } if (res == -ETOOSMALL && desc->plus) { @@ -808,10 +842,8 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) break; res = nfs_do_filldir(desc, dirent, filldir); - if (res < 0) { - res = 0; + if (res < 0) break; - } } out: nfs_unblock_sillyrename(dentry); @@ -1345,12 +1377,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry res = NULL; goto out; /* This turned out not to be a regular file */ - case -EISDIR: case -ENOTDIR: goto no_open; case -ELOOP: if (!(nd->intent.open.flags & O_NOFOLLOW)) goto no_open; + /* case -EISDIR: */ /* case -EINVAL: */ default: res = ERR_CAST(inode); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 84d3c8b..e6ace0d 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -867,7 +867,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, goto out; nfs_alloc_commit_data(dreq); - if (dreq->commit_data == NULL || count < wsize) + if (dreq->commit_data == NULL || count <= wsize) sync = NFS_FILE_SYNC; dreq->inode = inode; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index db08ff3..e6356b7 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -362,6 +362,15 @@ unsigned int nfs_page_length(struct page *page) } /* + * Convert a umode to a dirent->d_type + */ +static inline +unsigned char nfs_umode_to_dtype(umode_t mode) +{ + return (mode >> 12) & 15; +} + +/* * Determine the number of pages in an array of length 'len' and * with a base offset of 'base' */ diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index e6bf457..5914a19 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -423,7 +423,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy) struct page **page; size_t hdrlen; unsigned int pglen, recvd; - int status, nr = 0; + int status; if ((status = ntohl(*p++))) return nfs_stat_to_errno(status); @@ -443,7 +443,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy) if (pglen > recvd) pglen = recvd; page = rcvbuf->pages; - return nr; + return pglen; } static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) @@ -485,6 +485,8 @@ nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_se entry->prev_cookie = entry->cookie; entry->cookie = ntohl(*p++); + entry->d_type = DT_UNKNOWN; + p = xdr_inline_peek(xdr, 8); if (p != NULL) entry->eof = !p[0] && p[1]; @@ -495,7 +497,7 @@ nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_se out_overflow: print_overflow_msg(__func__, xdr); - return ERR_PTR(-EIO); + return ERR_PTR(-EAGAIN); } /* diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index d9a5e83..f6cc60f 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -555,7 +555,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res struct page **page; size_t hdrlen; u32 recvd, pglen; - int status, nr = 0; + int status; status = ntohl(*p++); /* Decode post_op_attrs */ @@ -586,7 +586,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res pglen = recvd; page = rcvbuf->pages; - return nr; + return pglen; } __be32 * @@ -622,11 +622,13 @@ nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_s entry->prev_cookie = entry->cookie; p = xdr_decode_hyper(p, &entry->cookie); + entry->d_type = DT_UNKNOWN; if (plus) { entry->fattr->valid = 0; p = xdr_decode_post_op_attr_stream(xdr, entry->fattr); if (IS_ERR(p)) goto out_overflow_exit; + entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); /* In fact, a post_op_fh3: */ p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) @@ -656,7 +658,7 @@ nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_s out_overflow: print_overflow_msg(__func__, xdr); out_overflow_exit: - return ERR_PTR(-EIO); + return ERR_PTR(-EAGAIN); } /* diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 0f24cdf..6a653ff 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2852,8 +2852,10 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args); res.pgbase = args.pgbase; status = nfs4_call_sync(NFS_SERVER(dir), &msg, &args, &res, 0); - if (status == 0) + if (status >= 0) { memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE); + status += args.pgbase; + } nfs_invalidate_atime(dir); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index f313c4c..9f1826b 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4518,7 +4518,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n xdr_read_pages(xdr, pglen); - return 0; + return pglen; } static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) @@ -6208,6 +6208,10 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID) entry->ino = entry->fattr->fileid; + entry->d_type = DT_UNKNOWN; + if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE) + entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); + if (verify_attr_len(xdr, p, len) < 0) goto out_overflow; @@ -6221,7 +6225,7 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, out_overflow: print_overflow_msg(__func__, xdr); - return ERR_PTR(-EIO); + return ERR_PTR(-EAGAIN); } /* diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 0a42e8f..3c04504 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -39,7 +39,6 @@ #include <linux/nfs_mount.h> #include <linux/nfs4_mount.h> #include <linux/lockd/bind.h> -#include <linux/smp_lock.h> #include <linux/seq_file.h> #include <linux/mount.h> #include <linux/mnt_namespace.h> @@ -67,6 +66,12 @@ #define NFSDBG_FACILITY NFSDBG_VFS +#ifdef CONFIG_NFS_V3 +#define NFS_DEFAULT_VERSION 3 +#else +#define NFS_DEFAULT_VERSION 2 +#endif + enum { /* Mount options that take no arguments */ Opt_soft, Opt_hard, @@ -2277,7 +2282,7 @@ static int nfs_get_sb(struct file_system_type *fs_type, }; int error = -ENOMEM; - data = nfs_alloc_parsed_mount_data(3); + data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION); mntfh = nfs_alloc_fhandle(); if (data == NULL || mntfh == NULL) goto out_free_fh; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index f1e5ec6..116cab9 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -673,16 +673,17 @@ static void nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses) spin_unlock(&clp->cl_lock); } -static void nfsd4_register_conn(struct nfsd4_conn *conn) +static int nfsd4_register_conn(struct nfsd4_conn *conn) { conn->cn_xpt_user.callback = nfsd4_conn_lost; - register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user); + return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user); } static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses) { struct nfsd4_conn *conn; u32 flags = NFS4_CDFC4_FORE; + int ret; if (ses->se_flags & SESSION4_BACK_CHAN) flags |= NFS4_CDFC4_BACK; @@ -690,7 +691,10 @@ static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses) if (!conn) return nfserr_jukebox; nfsd4_hash_conn(conn, ses); - nfsd4_register_conn(conn); + ret = nfsd4_register_conn(conn); + if (ret) + /* oops; xprt is already down: */ + nfsd4_conn_lost(&conn->cn_xpt_user); return nfs_ok; } @@ -1644,6 +1648,7 @@ static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_sessi { struct nfs4_client *clp = ses->se_client; struct nfsd4_conn *c; + int ret; spin_lock(&clp->cl_lock); c = __nfsd4_find_conn(new->cn_xprt, ses); @@ -1654,7 +1659,10 @@ static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_sessi } __nfsd4_hash_conn(new, ses); spin_unlock(&clp->cl_lock); - nfsd4_register_conn(new); + ret = nfsd4_register_conn(new); + if (ret) + /* oops; xprt is already down: */ + nfsd4_conn_lost(&new->cn_xpt_user); return; } @@ -2254,7 +2262,7 @@ nfs4_file_downgrade(struct nfs4_file *fp, unsigned int share_access) * Spawn a thread to perform a recall on the delegation represented * by the lease (file_lock) * - * Called from break_lease() with lock_kernel() held. + * Called from break_lease() with lock_flocks() held. * Note: we assume break_lease will only call this *once* for any given * lease. */ @@ -2278,7 +2286,7 @@ void nfsd_break_deleg_cb(struct file_lock *fl) list_add_tail(&dp->dl_recall_lru, &del_recall_lru); spin_unlock(&recall_lock); - /* only place dl_time is set. protected by lock_kernel*/ + /* only place dl_time is set. protected by lock_flocks*/ dp->dl_time = get_seconds(); /* @@ -2295,7 +2303,7 @@ void nfsd_break_deleg_cb(struct file_lock *fl) /* * The file_lock is being reapd. * - * Called by locks_free_lock() with lock_kernel() held. + * Called by locks_free_lock() with lock_flocks() held. */ static void nfsd_release_deleg_cb(struct file_lock *fl) @@ -2310,7 +2318,7 @@ void nfsd_release_deleg_cb(struct file_lock *fl) } /* - * Called from setlease() with lock_kernel() held + * Called from setlease() with lock_flocks() held */ static int nfsd_same_client_deleg_cb(struct file_lock *onlist, struct file_lock *try) diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 49c844d..59e5fe7 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -335,7 +335,7 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr) * the device at this point. * * To prevent nilfs_dat_translate() from returning the - * uncommited block number, this makes a copy of the entry + * uncommitted block number, this makes a copy of the entry * buffer and redirects nilfs_dat_translate() to the copy. */ if (!buffer_nilfs_redirected(entry_bh)) { diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 3e90f86..e00d945 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -349,8 +349,8 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb, ino = vdesc->vd_ino; cno = vdesc->vd_cno; inode = nilfs_iget_for_gc(sb, ino, cno); - if (unlikely(inode == NULL)) { - ret = -ENOMEM; + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); goto failed; } do { diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index d840821..1efea36 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -159,7 +159,9 @@ struct ocfs2_lock_res { char l_name[OCFS2_LOCK_ID_MAX_LEN]; unsigned int l_ro_holders; unsigned int l_ex_holders; - unsigned char l_level; + char l_level; + char l_requested; + char l_blocking; /* Data packed - type enum ocfs2_lock_type */ unsigned char l_type; @@ -169,8 +171,6 @@ struct ocfs2_lock_res { unsigned char l_action; /* Data packed - enum type ocfs2_unlock_action */ unsigned char l_unlock_action; - unsigned char l_requested; - unsigned char l_blocking; unsigned int l_pending_gen; spinlock_t l_lock; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index f02c0ef..cfeab7c 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -41,7 +41,6 @@ #include <linux/mount.h> #include <linux/seq_file.h> #include <linux/quotaops.h> -#include <linux/smp_lock.h> #define MLOG_MASK_PREFIX ML_SUPER #include <cluster/masklog.h> diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index ddb1f41..911e61f 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -418,7 +418,7 @@ out_no_root: static struct dentry *openprom_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_single(fs_type, flags, data, openprom_fill_super) + return mount_single(fs_type, flags, data, openprom_fill_super); } static struct file_system_type openprom_fs_type = { @@ -1199,12 +1199,24 @@ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, return ret; } +/* + * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same + * location, so checking ->i_pipe is not enough to verify that this is a + * pipe. + */ +struct pipe_inode_info *get_pipe_info(struct file *file) +{ + struct inode *i = file->f_path.dentry->d_inode; + + return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL; +} + long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) { struct pipe_inode_info *pipe; long ret; - pipe = file->f_path.dentry->d_inode->i_pipe; + pipe = get_pipe_info(file); if (!pipe) return -EBADF; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 9c2b5f4..3ddb606 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -16,7 +16,6 @@ #include <linux/limits.h> #include <linux/init.h> #include <linux/module.h> -#include <linux/smp_lock.h> #include <linux/sysctl.h> #include <linux/slab.h> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index da6b01d..c126c83 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -706,6 +706,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, * skip over unmapped regions. */ #define PAGEMAP_WALK_SIZE (PMD_SIZE) +#define PAGEMAP_WALK_MASK (PMD_MASK) static ssize_t pagemap_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -776,7 +777,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, unsigned long end; pm.pos = 0; - end = start_vaddr + PAGEMAP_WALK_SIZE; + end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; /* overflow ? */ if (end < start_vaddr || end > end_vaddr) end = end_vaddr; diff --git a/fs/read_write.c b/fs/read_write.c index 431a0ed..5d431ba 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -9,7 +9,6 @@ #include <linux/fcntl.h> #include <linux/file.h> #include <linux/uio.h> -#include <linux/smp_lock.h> #include <linux/fsnotify.h> #include <linux/security.h> #include <linux/module.h> diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 41656d4..0bae036 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -8,7 +8,6 @@ #include <linux/reiserfs_acl.h> #include <linux/reiserfs_xattr.h> #include <linux/exportfs.h> -#include <linux/smp_lock.h> #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/slab.h> diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index adf22b4..79265fd 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -9,7 +9,6 @@ #include <linux/time.h> #include <asm/uaccess.h> #include <linux/pagemap.h> -#include <linux/smp_lock.h> #include <linux/compat.h> /* @@ -184,12 +183,11 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) return 0; } - /* we need to make sure nobody is changing the file size beneath - ** us - */ - reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb); depth = reiserfs_write_lock_once(inode->i_sb); + /* we need to make sure nobody is changing the file size beneath us */ + reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb); + write_from = inode->i_size & (blocksize - 1); /* if we are on a block boundary, we are already unpacked. */ if (write_from == 0) { diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 076c8b1..d31bce1 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -43,7 +43,6 @@ #include <linux/fcntl.h> #include <linux/stat.h> #include <linux/string.h> -#include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/workqueue.h> #include <linux/writeback.h> diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 3bf7a64..b243117 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -28,7 +28,6 @@ #include <linux/mount.h> #include <linux/namei.h> #include <linux/crc32.h> -#include <linux/smp_lock.h> struct file_system_type reiserfs_fs_type; diff --git a/fs/splice.c b/fs/splice.c index 8f1dfae..ce2f025 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1311,18 +1311,6 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, size_t len, unsigned int flags); -/* - * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same - * location, so checking ->i_pipe is not enough to verify that this is a - * pipe. - */ -static inline struct pipe_inode_info *pipe_info(struct inode *inode) -{ - if (S_ISFIFO(inode->i_mode)) - return inode->i_pipe; - - return NULL; -} /* * Determine where to splice to/from. @@ -1336,8 +1324,8 @@ static long do_splice(struct file *in, loff_t __user *off_in, loff_t offset, *off; long ret; - ipipe = pipe_info(in->f_path.dentry->d_inode); - opipe = pipe_info(out->f_path.dentry->d_inode); + ipipe = get_pipe_info(in); + opipe = get_pipe_info(out); if (ipipe && opipe) { if (off_in || off_out) @@ -1555,7 +1543,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, int error; long ret; - pipe = pipe_info(file->f_path.dentry->d_inode); + pipe = get_pipe_info(file); if (!pipe) return -EBADF; @@ -1642,7 +1630,7 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, }; long ret; - pipe = pipe_info(file->f_path.dentry->d_inode); + pipe = get_pipe_info(file); if (!pipe) return -EBADF; @@ -2022,8 +2010,8 @@ static int link_pipe(struct pipe_inode_info *ipipe, static long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags) { - struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode); - struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode); + struct pipe_inode_info *ipipe = get_pipe_info(in); + struct pipe_inode_info *opipe = get_pipe_info(out); int ret = -EINVAL; /* diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index c9af48f..7d287af 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1111,11 +1111,12 @@ xfs_vm_writepage( uptodate = 0; /* - * A hole may still be marked uptodate because discard_buffer - * leaves the flag set. + * set_page_dirty dirties all buffers in a page, independent + * of their state. The dirty state however is entirely + * meaningless for holes (!mapped && uptodate), so skip + * buffers covering holes here. */ if (!buffer_mapped(bh) && buffer_uptodate(bh)) { - ASSERT(!buffer_dirty(bh)); imap_valid = 0; continue; } diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 63fd2c0..aa1d353 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1781,7 +1781,6 @@ xfs_buf_delwri_split( INIT_LIST_HEAD(list); spin_lock(dwlk); list_for_each_entry_safe(bp, n, dwq, b_list) { - trace_xfs_buf_delwri_split(bp, _RET_IP_); ASSERT(bp->b_flags & XBF_DELWRI); if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) { @@ -1795,6 +1794,7 @@ xfs_buf_delwri_split( _XBF_RUN_QUEUES); bp->b_flags |= XBF_WRITE; list_move_tail(&bp->b_list, list); + trace_xfs_buf_delwri_split(bp, _RET_IP_); } else skipped++; } diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 2ea238f6..ad442d9 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -416,7 +416,7 @@ xfs_attrlist_by_handle( if (IS_ERR(dentry)) return PTR_ERR(dentry); - kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL); + kbuf = kzalloc(al_hreq.buflen, GFP_KERNEL); if (!kbuf) goto out_dput; diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 96107ef..94d5fd6 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -762,7 +762,8 @@ xfs_setup_inode( inode->i_state = I_NEW; inode_sb_list_add(inode); - insert_inode_hash(inode); + /* make the inode look hashed for the writeback code */ + hlist_add_fake(&inode->i_hash); inode->i_mode = ip->i_d.di_mode; inode->i_nlink = ip->i_d.di_nlink; diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 9f3a78f..064f964 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -353,9 +353,6 @@ xfs_parseargs( mp->m_qflags &= ~XFS_OQUOTA_ENFD; } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { mp->m_flags |= XFS_MOUNT_DELAYLOG; - cmn_err(CE_WARN, - "Enabling EXPERIMENTAL delayed logging feature " - "- use at your own risk.\n"); } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { mp->m_flags &= ~XFS_MOUNT_DELAYLOG; } else if (!strcmp(this_char, "ihashsize")) { diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 37d3325..afb0d7c 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -853,6 +853,7 @@ restart: if (trylock) { if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) { skipped++; + xfs_perag_put(pag); continue; } first_index = pag->pag_ici_reclaim_cursor; diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 9b715dc..9124425 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -744,9 +744,15 @@ xfs_filestream_new_ag( * If the file's parent directory is known, take its iolock in exclusive * mode to prevent two sibling files from racing each other to migrate * themselves and their parent to different AGs. + * + * Note that we lock the parent directory iolock inside the child + * iolock here. That's fine as we never hold both parent and child + * iolock in any other place. This is different from the ilock, + * which requires locking of the child after the parent for namespace + * operations. */ if (pip) - xfs_ilock(pip, XFS_IOLOCK_EXCL); + xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); /* * A new AG needs to be found for the file. If the file's parent diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b1498ab..19e9dfa 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -275,6 +275,7 @@ xfs_free_perag( pag = radix_tree_delete(&mp->m_perag_tree, agno); spin_unlock(&mp->m_perag_lock); ASSERT(pag); + ASSERT(atomic_read(&pag->pag_ref) == 0); call_rcu(&pag->rcu_head, __xfs_free_perag); } } diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index e0e64b1..9bb6eda 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -346,8 +346,17 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid, #define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) #define xfs_trans_apply_dquot_deltas(tp) #define xfs_trans_unreserve_and_mod_dquots(tp) -#define xfs_trans_reserve_quota_nblks(tp, ip, nblks, ninos, flags) (0) -#define xfs_trans_reserve_quota_bydquots(tp, mp, u, g, nb, ni, fl) (0) +static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, + struct xfs_inode *ip, long nblks, long ninos, uint flags) +{ + return 0; +} +static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, + struct xfs_mount *mp, struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, long nblks, long nions, uint flags) +{ + return 0; +} #define xfs_qm_vop_create_dqattach(tp, ip, u, g) #define xfs_qm_vop_rename_dqattach(it) (0) #define xfs_qm_vop_chown(tp, ip, old, new) (NULL) @@ -357,11 +366,14 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid, #define xfs_qm_dqdetach(ip) #define xfs_qm_dqrele(d) #define xfs_qm_statvfs(ip, s) -#define xfs_qm_sync(mp, fl) (0) +static inline int xfs_qm_sync(struct xfs_mount *mp, int flags) +{ + return 0; +} #define xfs_qm_newmount(mp, a, b) (0) #define xfs_qm_mount_quotas(mp) #define xfs_qm_unmount(mp) -#define xfs_qm_unmount_quotas(mp) (0) +#define xfs_qm_unmount_quotas(mp) #endif /* CONFIG_XFS_QUOTA */ #define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ |