diff options
Diffstat (limited to 'fs')
175 files changed, 8447 insertions, 2495 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 616abaf..d51ec9f 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -72,9 +72,8 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any) /* we'll recheck under lock if there's anything to look in */ if (dentry->d_fsdata) { struct hlist_head *h = (struct hlist_head *)&dentry->d_fsdata; - struct hlist_node *n; spin_lock(&dentry->d_lock); - hlist_for_each_entry(fid, n, h, dlist) { + hlist_for_each_entry(fid, h, dlist) { if (any || uid_eq(fid->uid, uid)) { ret = fid; break; diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index eb82ee5..d9a4367 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -125,9 +125,8 @@ static void affs_fix_dcache(struct inode *inode, u32 entry_ino) { struct dentry *dentry; - struct hlist_node *p; spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) { + hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) { if (entry_ino == (u32)(long)dentry->d_fsdata) { dentry->d_fsdata = (void *)inode->i_ino; break; @@ -591,11 +591,10 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) { struct mm_struct *mm = current->mm; struct kioctx *ctx, *ret = NULL; - struct hlist_node *n; rcu_read_lock(); - hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) { + hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) { /* * RCU protects us against accessing freed memory but * we have to be careful not to get a reference when the diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 230bd2a..9bd1625 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -383,8 +383,10 @@ static struct vfsmount *autofs4_d_automount(struct path *path) goto done; } } else { - if (!simple_empty(dentry)) + if (!simple_empty(dentry)) { + spin_unlock(&sbi->fs_lock); goto done; + } } ino->flags |= AUTOFS_INF_PENDING; spin_unlock(&sbi->fs_lock); diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 03bc1d3..3db70da 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -42,10 +42,8 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi) while (wq) { nwq = wq->next; wq->status = -ENOENT; /* Magic is gone - report failure */ - if (wq->name.name) { - kfree(wq->name.name); - wq->name.name = NULL; - } + kfree(wq->name.name); + wq->name.name = NULL; wq->wait_ctr--; wake_up_interruptible(&wq->queue); wq = nwq; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index a5702d7..3939829 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -322,6 +322,8 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, return 0; } +#ifndef elf_map + static unsigned long elf_map(struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type, unsigned long total_size) @@ -356,6 +358,8 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, return(map_addr); } +#endif /* !elf_map */ + static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr) { int i, first_idx = -1, last_idx = -1; @@ -1428,6 +1428,8 @@ void bio_endio(struct bio *bio, int error) else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) error = -EIO; + trace_block_bio_complete(bio, error); + if (bio->bi_end_io) bio->bi_end_io(bio, error); } diff --git a/fs/block_dev.c b/fs/block_dev.c index 53f5fae..aea605c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1033,7 +1033,9 @@ void bd_set_size(struct block_device *bdev, loff_t size) { unsigned bsize = bdev_logical_block_size(bdev); - bdev->bd_inode->i_size = size; + mutex_lock(&bdev->bd_inode->i_mutex); + i_size_write(bdev->bd_inode, size); + mutex_unlock(&bdev->bd_inode->i_mutex); while (bsize < PAGE_CACHE_SIZE) { if (size & bsize) break; @@ -1118,7 +1120,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } } - if (!ret && !bdev->bd_openers) { + if (!ret) { bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); bdi = blk_get_backing_dev_info(bdev); if (bdi == NULL) diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index ccd25ba..9a8622a 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -5,6 +5,9 @@ config BTRFS_FS select ZLIB_DEFLATE select LZO_COMPRESS select LZO_DECOMPRESS + select RAID6_PQ + select XOR_BLOCKS + help Btrfs is a new filesystem with extents, writable snapshotting, support for multiple devices and many more features. diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 7df3e0f..3932224 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ - reada.o backref.o ulist.o qgroup.o send.o dev-replace.o + reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 04edf69..bd605c8 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -352,11 +352,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, err = __resolve_indirect_ref(fs_info, search_commit_root, time_seq, ref, parents, extent_item_pos); - if (err) { - if (ret == 0) - ret = err; + if (err) continue; - } /* we put the first parent into the ref at hand */ ULIST_ITER_INIT(&uiter); diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index d61feca7..310a7f6 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -19,7 +19,7 @@ #ifndef __BTRFS_BACKREF__ #define __BTRFS_BACKREF__ -#include "ioctl.h" +#include <linux/btrfs.h> #include "ulist.h" #include "extent_io.h" diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 2a8c242..d9b97d4 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -40,6 +40,8 @@ #define BTRFS_INODE_HAS_ASYNC_EXTENT 6 #define BTRFS_INODE_NEEDS_FULL_SYNC 7 #define BTRFS_INODE_COPY_EVERYTHING 8 +#define BTRFS_INODE_IN_DELALLOC_LIST 9 +#define BTRFS_INODE_READDIO_NEED_LOCK 10 /* in memory btrfs inode */ struct btrfs_inode { @@ -216,4 +218,22 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) return 0; } +/* + * Disable DIO read nolock optimization, so new dio readers will be forced + * to grab i_mutex. It is used to avoid the endless truncate due to + * nonlocked dio read. + */ +static inline void btrfs_inode_block_unlocked_dio(struct inode *inode) +{ + set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags); + smp_mb(); +} + +static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode) +{ + smp_mb__before_clear_bit(); + clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, + &BTRFS_I(inode)->runtime_flags); +} + #endif diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 11d47bf..18af6f4 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -813,8 +813,7 @@ static int btrfsic_process_superblock_dev_mirror( (bh->b_data + (dev_bytenr & 4095)); if (btrfs_super_bytenr(super_tmp) != dev_bytenr || - strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC, - sizeof(super_tmp->magic)) || + super_tmp->magic != cpu_to_le64(BTRFS_MAGIC) || memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) || btrfs_super_nodesize(super_tmp) != state->metablock_size || btrfs_super_leafsize(super_tmp) != state->metablock_size || diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 94ab2f8..15b9408 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -372,7 +372,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, page = compressed_pages[pg_index]; page->mapping = inode->i_mapping; if (bio->bi_size) - ret = io_tree->ops->merge_bio_hook(page, 0, + ret = io_tree->ops->merge_bio_hook(WRITE, page, 0, PAGE_CACHE_SIZE, bio, 0); else @@ -655,7 +655,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, page->index = em_start >> PAGE_CACHE_SHIFT; if (comp_bio->bi_size) - ret = tree->ops->merge_bio_hook(page, 0, + ret = tree->ops->merge_bio_hook(READ, page, 0, PAGE_CACHE_SIZE, comp_bio, 0); else diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index eea5da7..ecd25a1 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1138,6 +1138,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, switch (tm->op) { case MOD_LOG_KEY_REMOVE_WHILE_FREEING: BUG_ON(tm->slot < n); + /* Fallthrough */ case MOD_LOG_KEY_REMOVE_WHILE_MOVING: case MOD_LOG_KEY_REMOVE: btrfs_set_node_key(eb, &tm->key, tm->slot); @@ -1222,7 +1223,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, __tree_mod_log_rewind(eb_rewin, time_seq, tm); WARN_ON(btrfs_header_nritems(eb_rewin) > - BTRFS_NODEPTRS_PER_BLOCK(fs_info->fs_root)); + BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root)); return eb_rewin; } @@ -1441,7 +1442,7 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) */ int btrfs_realloc_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *parent, - int start_slot, int cache_only, u64 *last_ret, + int start_slot, u64 *last_ret, struct btrfs_key *progress) { struct extent_buffer *cur; @@ -1461,8 +1462,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; parent_level = btrfs_header_level(parent); - if (cache_only && parent_level != 1) - return 0; WARN_ON(trans->transaction != root->fs_info->running_transaction); WARN_ON(trans->transid != root->fs_info->generation); @@ -1508,10 +1507,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, else uptodate = 0; if (!cur || !uptodate) { - if (cache_only) { - free_extent_buffer(cur); - continue; - } if (!cur) { cur = read_tree_block(root, blocknr, blocksize, gen); @@ -4825,8 +4820,8 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) /* * A helper function to walk down the tree starting at min_key, and looking - * for nodes or leaves that are either in cache or have a minimum - * transaction id. This is used by the btree defrag code, and tree logging + * for nodes or leaves that are have a minimum transaction id. + * This is used by the btree defrag code, and tree logging * * This does not cow, but it does stuff the starting key it finds back * into min_key, so you can call btrfs_search_slot with cow=1 on the @@ -4847,7 +4842,7 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) */ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, struct btrfs_key *max_key, - struct btrfs_path *path, int cache_only, + struct btrfs_path *path, u64 min_trans) { struct extent_buffer *cur; @@ -4887,15 +4882,12 @@ again: if (sret && slot > 0) slot--; /* - * check this node pointer against the cache_only and - * min_trans parameters. If it isn't in cache or is too - * old, skip to the next one. + * check this node pointer against the min_trans parameters. + * If it is too old, old, skip to the next one. */ while (slot < nritems) { u64 blockptr; u64 gen; - struct extent_buffer *tmp; - struct btrfs_disk_key disk_key; blockptr = btrfs_node_blockptr(cur, slot); gen = btrfs_node_ptr_generation(cur, slot); @@ -4903,27 +4895,7 @@ again: slot++; continue; } - if (!cache_only) - break; - - if (max_key) { - btrfs_node_key(cur, &disk_key, slot); - if (comp_keys(&disk_key, max_key) >= 0) { - ret = 1; - goto out; - } - } - - tmp = btrfs_find_tree_block(root, blockptr, - btrfs_level_size(root, level - 1)); - - if (tmp && btrfs_buffer_uptodate(tmp, gen, 1) > 0) { - free_extent_buffer(tmp); - break; - } - if (tmp) - free_extent_buffer(tmp); - slot++; + break; } find_next_key: /* @@ -4934,7 +4906,7 @@ find_next_key: path->slots[level] = slot; btrfs_set_path_blocking(path); sret = btrfs_find_next_key(root, path, min_key, level, - cache_only, min_trans); + min_trans); if (sret == 0) { btrfs_release_path(path); goto again; @@ -5399,8 +5371,7 @@ out: /* * this is similar to btrfs_next_leaf, but does not try to preserve * and fixup the path. It looks for and returns the next key in the - * tree based on the current path and the cache_only and min_trans - * parameters. + * tree based on the current path and the min_trans parameters. * * 0 is returned if another key is found, < 0 if there are any errors * and 1 is returned if there are no higher keys in the tree @@ -5409,8 +5380,7 @@ out: * calling this function. */ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *key, int level, - int cache_only, u64 min_trans) + struct btrfs_key *key, int level, u64 min_trans) { int slot; struct extent_buffer *c; @@ -5461,22 +5431,8 @@ next: if (level == 0) btrfs_item_key_to_cpu(c, key, slot); else { - u64 blockptr = btrfs_node_blockptr(c, slot); u64 gen = btrfs_node_ptr_generation(c, slot); - if (cache_only) { - struct extent_buffer *cur; - cur = btrfs_find_tree_block(root, blockptr, - btrfs_level_size(root, level - 1)); - if (!cur || - btrfs_buffer_uptodate(cur, gen, 1) <= 0) { - slot++; - if (cur) - free_extent_buffer(cur); - goto next; - } - free_extent_buffer(cur); - } if (gen < min_trans) { slot++; goto next; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 547b7b0..0d82922 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -31,10 +31,10 @@ #include <trace/events/btrfs.h> #include <asm/kmap_types.h> #include <linux/pagemap.h> +#include <linux/btrfs.h> #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" -#include "ioctl.h" struct btrfs_trans_handle; struct btrfs_transaction; @@ -46,7 +46,7 @@ extern struct kmem_cache *btrfs_path_cachep; extern struct kmem_cache *btrfs_free_space_cachep; struct btrfs_ordered_sum; -#define BTRFS_MAGIC "_BHRfS_M" +#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ #define BTRFS_MAX_MIRRORS 3 @@ -191,6 +191,8 @@ static int btrfs_csum_sizes[] = { 4, 0 }; /* ioprio of readahead is set to idle */ #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) +#define BTRFS_DIRTY_METADATA_THRESH (32 * 1024 * 1024) + /* * The key defines the order in the tree, and so it also defines (optimal) * block layout. @@ -336,7 +338,10 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) /* * File system states */ +#define BTRFS_FS_STATE_ERROR 0 +#define BTRFS_FS_STATE_REMOUNTING 1 +/* Super block flags */ /* Errors detected */ #define BTRFS_SUPER_FLAG_ERROR (1ULL << 2) @@ -502,6 +507,7 @@ struct btrfs_super_block { #define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5) #define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6) +#define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7) #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL @@ -511,6 +517,7 @@ struct btrfs_super_block { BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ + BTRFS_FEATURE_INCOMPAT_RAID56 | \ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) /* @@ -952,8 +959,20 @@ struct btrfs_dev_replace_item { #define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) #define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) #define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) +#define BTRFS_BLOCK_GROUP_RAID5 (1 << 7) +#define BTRFS_BLOCK_GROUP_RAID6 (1 << 8) #define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE -#define BTRFS_NR_RAID_TYPES 5 + +enum btrfs_raid_types { + BTRFS_RAID_RAID10, + BTRFS_RAID_RAID1, + BTRFS_RAID_DUP, + BTRFS_RAID_RAID0, + BTRFS_RAID_SINGLE, + BTRFS_RAID_RAID5, + BTRFS_RAID_RAID6, + BTRFS_NR_RAID_TYPES +}; #define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ BTRFS_BLOCK_GROUP_SYSTEM | \ @@ -961,6 +980,8 @@ struct btrfs_dev_replace_item { #define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ BTRFS_BLOCK_GROUP_RAID1 | \ + BTRFS_BLOCK_GROUP_RAID5 | \ + BTRFS_BLOCK_GROUP_RAID6 | \ BTRFS_BLOCK_GROUP_DUP | \ BTRFS_BLOCK_GROUP_RAID10) /* @@ -1185,6 +1206,10 @@ struct btrfs_block_group_cache { u64 flags; u64 sectorsize; u64 cache_generation; + + /* for raid56, this is a full stripe, without parity */ + unsigned long full_stripe_len; + unsigned int ro:1; unsigned int dirty:1; unsigned int iref:1; @@ -1225,6 +1250,28 @@ struct seq_list { u64 seq; }; +enum btrfs_orphan_cleanup_state { + ORPHAN_CLEANUP_STARTED = 1, + ORPHAN_CLEANUP_DONE = 2, +}; + +/* used by the raid56 code to lock stripes for read/modify/write */ +struct btrfs_stripe_hash { + struct list_head hash_list; + wait_queue_head_t wait; + spinlock_t lock; +}; + +/* used by the raid56 code to lock stripes for read/modify/write */ +struct btrfs_stripe_hash_table { + struct list_head stripe_cache; + spinlock_t cache_lock; + int cache_size; + struct btrfs_stripe_hash table[]; +}; + +#define BTRFS_STRIPE_HASH_TABLE_BITS 11 + /* fs_info */ struct reloc_control; struct btrfs_device; @@ -1250,6 +1297,7 @@ struct btrfs_fs_info { /* block group cache stuff */ spinlock_t block_group_cache_lock; + u64 first_logical_byte; struct rb_root block_group_cache_tree; /* keep track of unallocated space */ @@ -1288,7 +1336,23 @@ struct btrfs_fs_info { u64 last_trans_log_full_commit; unsigned long mount_opt; unsigned long compress_type:4; + /* + * It is a suggestive number, the read side is safe even it gets a + * wrong number because we will write out the data into a regular + * extent. The write side(mount/remount) is under ->s_umount lock, + * so it is also safe. + */ u64 max_inline; + /* + * Protected by ->chunk_mutex and sb->s_umount. + * + * The reason that we use two lock to protect it is because only + * remount and mount operations can change it and these two operations + * are under sb->s_umount, but the read side (chunk allocation) can not + * acquire sb->s_umount or the deadlock would happen. So we use two + * locks to protect it. On the write side, we must acquire two locks, + * and on the read side, we just need acquire one of them. + */ u64 alloc_start; struct btrfs_transaction *running_transaction; wait_queue_head_t transaction_throttle; @@ -1307,6 +1371,13 @@ struct btrfs_fs_info { struct mutex cleaner_mutex; struct mutex chunk_mutex; struct mutex volume_mutex; + + /* this is used during read/modify/write to make sure + * no two ios are trying to mod the same stripe at the same + * time + */ + struct btrfs_stripe_hash_table *stripe_hash_table; + /* * this protects the ordered operations list only while we are * processing all of the entries on it. This way we make @@ -1365,6 +1436,7 @@ struct btrfs_fs_info { */ struct list_head ordered_extents; + spinlock_t delalloc_lock; /* * all of the inodes that have delalloc bytes. It is possible for * this list to be empty even when there is still dirty data=ordered @@ -1373,13 +1445,6 @@ struct btrfs_fs_info { struct list_head delalloc_inodes; /* - * special rename and truncate targets that must be on disk before - * we're allowed to commit. This is basically the ext3 style - * data=ordered list. - */ - struct list_head ordered_operations; - - /* * there is a pool of worker threads for checksumming during writes * and a pool for checksumming after reads. This is because readers * can run with FS locks held, and the writers may be waiting for @@ -1395,6 +1460,8 @@ struct btrfs_fs_info { struct btrfs_workers flush_workers; struct btrfs_workers endio_workers; struct btrfs_workers endio_meta_workers; + struct btrfs_workers endio_raid56_workers; + struct btrfs_workers rmw_workers; struct btrfs_workers endio_meta_write_workers; struct btrfs_workers endio_write_workers; struct btrfs_workers endio_freespace_worker; @@ -1423,10 +1490,12 @@ struct btrfs_fs_info { u64 total_pinned; - /* protected by the delalloc lock, used to keep from writing - * metadata until there is a nice batch - */ - u64 dirty_metadata_bytes; + /* used to keep from writing metadata until there is a nice batch */ + struct percpu_counter dirty_metadata_bytes; + struct percpu_counter delalloc_bytes; + s32 dirty_metadata_batch; + s32 delalloc_batch; + struct list_head dirty_cowonly_roots; struct btrfs_fs_devices *fs_devices; @@ -1442,9 +1511,6 @@ struct btrfs_fs_info { struct reloc_control *reloc_ctl; - spinlock_t delalloc_lock; - u64 delalloc_bytes; - /* data_alloc_cluster is only used in ssd mode */ struct btrfs_free_cluster data_alloc_cluster; @@ -1456,6 +1522,8 @@ struct btrfs_fs_info { struct rb_root defrag_inodes; atomic_t defrag_running; + /* Used to protect avail_{data, metadata, system}_alloc_bits */ + seqlock_t profiles_lock; /* * these three are in extended format (availability of single * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other @@ -1520,7 +1588,7 @@ struct btrfs_fs_info { u64 qgroup_seq; /* filesystem state */ - u64 fs_state; + unsigned long fs_state; struct btrfs_delayed_root *delayed_root; @@ -1623,6 +1691,9 @@ struct btrfs_root { struct list_head root_list; + spinlock_t log_extents_lock[2]; + struct list_head logged_list[2]; + spinlock_t orphan_lock; atomic_t orphan_inodes; struct btrfs_block_rsv *orphan_block_rsv; @@ -1832,6 +1903,7 @@ struct btrfs_ioctl_defrag_range_args { #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) +#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) #define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ BTRFS_MOUNT_##opt) /* @@ -2936,8 +3008,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, u64 num_bytes, u64 *refs, u64 *flags); int btrfs_pin_extent(struct btrfs_root *root, u64 bytenr, u64 num, int reserved); -int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, u64 bytenr, u64 num_bytes); int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -3035,8 +3106,13 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, struct inode *inode); void btrfs_orphan_release_metadata(struct inode *inode); -int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending); +int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + int nitems, + u64 *qgroup_reserved); +void btrfs_subvolume_release_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + u64 qgroup_reserved); int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes); void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes); int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes); @@ -3092,10 +3168,10 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, int lowest_level, - int cache_only, u64 min_trans); + u64 min_trans); int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, struct btrfs_key *max_key, - struct btrfs_path *path, int cache_only, + struct btrfs_path *path, u64 min_trans); enum btrfs_compare_tree_result { BTRFS_COMPARE_TREE_NEW, @@ -3148,7 +3224,7 @@ int btrfs_search_slot_for_read(struct btrfs_root *root, int find_higher, int return_any); int btrfs_realloc_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *parent, - int start_slot, int cache_only, u64 *last_ret, + int start_slot, u64 *last_ret, struct btrfs_key *progress); void btrfs_release_path(struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); @@ -3459,9 +3535,9 @@ int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, u64 new_dirid); -int btrfs_merge_bio_hook(struct page *page, unsigned long offset, - size_t size, struct bio *bio, unsigned long bio_flags); - +int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, + size_t size, struct bio *bio, + unsigned long bio_flags); int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_evict_inode(struct inode *inode); @@ -3543,7 +3619,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int cache_only); + struct btrfs_root *root); /* sysfs.c */ int btrfs_init_sysfs(void); @@ -3620,11 +3696,14 @@ __printf(5, 6) void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); +/* + * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic + * will panic(). Otherwise we BUG() here. + */ #define btrfs_panic(fs_info, errno, fmt, args...) \ do { \ - struct btrfs_fs_info *_i = (fs_info); \ - __btrfs_panic(_i, __func__, __LINE__, errno, fmt, ##args); \ - BUG_ON(!(_i->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)); \ + __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \ + BUG(); \ } while (0) /* acl.c */ @@ -3745,4 +3824,11 @@ static inline int is_fstree(u64 rootid) return 1; return 0; } + +static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) +{ + return signal_pending(current); +} + + #endif diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 3483603..0b278b1 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -875,7 +875,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, struct btrfs_delayed_item *delayed_item) { struct extent_buffer *leaf; - struct btrfs_item *item; char *ptr; int ret; @@ -886,7 +885,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; - item = btrfs_item_nr(leaf, path->slots[0]); ptr = btrfs_item_ptr(leaf, path->slots[0], char); write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr, @@ -1065,32 +1063,25 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) } } -static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_delayed_node *node) +static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_delayed_node *node) { struct btrfs_key key; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; int ret; - mutex_lock(&node->mutex); - if (!node->inode_dirty) { - mutex_unlock(&node->mutex); - return 0; - } - key.objectid = node->inode_id; btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); key.offset = 0; + ret = btrfs_lookup_inode(trans, root, path, &key, 1); if (ret > 0) { btrfs_release_path(path); - mutex_unlock(&node->mutex); return -ENOENT; } else if (ret < 0) { - mutex_unlock(&node->mutex); return ret; } @@ -1105,11 +1096,47 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, btrfs_delayed_inode_release_metadata(root, node); btrfs_release_delayed_inode(node); - mutex_unlock(&node->mutex); return 0; } +static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_delayed_node *node) +{ + int ret; + + mutex_lock(&node->mutex); + if (!node->inode_dirty) { + mutex_unlock(&node->mutex); + return 0; + } + + ret = __btrfs_update_delayed_inode(trans, root, path, node); + mutex_unlock(&node->mutex); + return ret; +} + +static inline int +__btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_delayed_node *node) +{ + int ret; + + ret = btrfs_insert_delayed_items(trans, path, node->root, node); + if (ret) + return ret; + + ret = btrfs_delete_delayed_items(trans, path, node->root, node); + if (ret) + return ret; + + ret = btrfs_update_delayed_inode(trans, node->root, path, node); + return ret; +} + /* * Called when committing the transaction. * Returns 0 on success. @@ -1119,7 +1146,6 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, int nr) { - struct btrfs_root *curr_root = root; struct btrfs_delayed_root *delayed_root; struct btrfs_delayed_node *curr_node, *prev_node; struct btrfs_path *path; @@ -1142,15 +1168,8 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, curr_node = btrfs_first_delayed_node(delayed_root); while (curr_node && (!count || (count && nr--))) { - curr_root = curr_node->root; - ret = btrfs_insert_delayed_items(trans, path, curr_root, - curr_node); - if (!ret) - ret = btrfs_delete_delayed_items(trans, path, - curr_root, curr_node); - if (!ret) - ret = btrfs_update_delayed_inode(trans, curr_root, - path, curr_node); + ret = __btrfs_commit_inode_delayed_items(trans, path, + curr_node); if (ret) { btrfs_release_delayed_node(curr_node); curr_node = NULL; @@ -1183,51 +1202,93 @@ int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, return __btrfs_run_delayed_items(trans, root, nr); } -static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, - struct btrfs_delayed_node *node) +int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, + struct inode *inode) { + struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); struct btrfs_path *path; struct btrfs_block_rsv *block_rsv; int ret; + if (!delayed_node) + return 0; + + mutex_lock(&delayed_node->mutex); + if (!delayed_node->count) { + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return 0; + } + mutex_unlock(&delayed_node->mutex); + path = btrfs_alloc_path(); if (!path) return -ENOMEM; path->leave_spinning = 1; block_rsv = trans->block_rsv; - trans->block_rsv = &node->root->fs_info->delayed_block_rsv; + trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv; - ret = btrfs_insert_delayed_items(trans, path, node->root, node); - if (!ret) - ret = btrfs_delete_delayed_items(trans, path, node->root, node); - if (!ret) - ret = btrfs_update_delayed_inode(trans, node->root, path, node); - btrfs_free_path(path); + ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node); + btrfs_release_delayed_node(delayed_node); + btrfs_free_path(path); trans->block_rsv = block_rsv; + return ret; } -int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, - struct inode *inode) +int btrfs_commit_inode_delayed_inode(struct inode *inode) { + struct btrfs_trans_handle *trans; struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); + struct btrfs_path *path; + struct btrfs_block_rsv *block_rsv; int ret; if (!delayed_node) return 0; mutex_lock(&delayed_node->mutex); - if (!delayed_node->count) { + if (!delayed_node->inode_dirty) { mutex_unlock(&delayed_node->mutex); btrfs_release_delayed_node(delayed_node); return 0; } mutex_unlock(&delayed_node->mutex); - ret = __btrfs_commit_inode_delayed_items(trans, delayed_node); + trans = btrfs_join_transaction(delayed_node->root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto trans_out; + } + path->leave_spinning = 1; + + block_rsv = trans->block_rsv; + trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv; + + mutex_lock(&delayed_node->mutex); + if (delayed_node->inode_dirty) + ret = __btrfs_update_delayed_inode(trans, delayed_node->root, + path, delayed_node); + else + ret = 0; + mutex_unlock(&delayed_node->mutex); + + btrfs_free_path(path); + trans->block_rsv = block_rsv; +trans_out: + btrfs_end_transaction(trans, delayed_node->root); + btrfs_btree_balance_dirty(delayed_node->root); +out: btrfs_release_delayed_node(delayed_node); + return ret; } @@ -1258,7 +1319,6 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) struct btrfs_root *root; struct btrfs_block_rsv *block_rsv; int need_requeue = 0; - int ret; async_node = container_of(work, struct btrfs_async_delayed_node, work); @@ -1277,14 +1337,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) block_rsv = trans->block_rsv; trans->block_rsv = &root->fs_info->delayed_block_rsv; - ret = btrfs_insert_delayed_items(trans, path, root, delayed_node); - if (!ret) - ret = btrfs_delete_delayed_items(trans, path, root, - delayed_node); - - if (!ret) - btrfs_update_delayed_inode(trans, root, path, delayed_node); - + __btrfs_commit_inode_delayed_items(trans, path, delayed_node); /* * Maybe new delayed items have been inserted, so we need requeue * the work. Besides that, we must dequeue the empty delayed nodes diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 4f808e1..78b6ad0 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -117,6 +117,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, /* Used for evicting the inode. */ void btrfs_remove_delayed_node(struct inode *inode); void btrfs_kill_delayed_inode_items(struct inode *inode); +int btrfs_commit_inode_delayed_inode(struct inode *inode); int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index ae94117..b7a0641 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -23,6 +23,10 @@ #include "delayed-ref.h" #include "transaction.h" +struct kmem_cache *btrfs_delayed_ref_head_cachep; +struct kmem_cache *btrfs_delayed_tree_ref_cachep; +struct kmem_cache *btrfs_delayed_data_ref_cachep; +struct kmem_cache *btrfs_delayed_extent_op_cachep; /* * delayed back reference update tracking. For subvolume trees * we queue up extent allocations and backref maintenance for @@ -422,6 +426,14 @@ again: return 1; } +void btrfs_release_ref_cluster(struct list_head *cluster) +{ + struct list_head *pos, *q; + + list_for_each_safe(pos, q, cluster) + list_del_init(pos); +} + /* * helper function to update an extent delayed ref in the * rbtree. existing and update must both have the same @@ -511,7 +523,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, ref->extent_op->flags_to_set; existing_ref->extent_op->update_flags = 1; } - kfree(ref->extent_op); + btrfs_free_delayed_extent_op(ref->extent_op); } } /* @@ -592,7 +604,7 @@ static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info, * we've updated the existing ref, free the newly * allocated ref */ - kfree(head_ref); + kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); } else { delayed_refs->num_heads++; delayed_refs->num_heads_ready++; @@ -653,7 +665,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * we've updated the existing ref, free the newly * allocated ref */ - kfree(full_ref); + kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref); } else { delayed_refs->num_entries++; trans->delayed_ref_updates++; @@ -714,7 +726,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info, * we've updated the existing ref, free the newly * allocated ref */ - kfree(full_ref); + kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref); } else { delayed_refs->num_entries++; trans->delayed_ref_updates++; @@ -738,13 +750,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs; BUG_ON(extent_op && extent_op->is_data); - ref = kmalloc(sizeof(*ref), GFP_NOFS); + ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); if (!ref) return -ENOMEM; - head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); if (!head_ref) { - kfree(ref); + kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); return -ENOMEM; } @@ -786,13 +798,13 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs; BUG_ON(extent_op && !extent_op->is_data); - ref = kmalloc(sizeof(*ref), GFP_NOFS); + ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS); if (!ref) return -ENOMEM; - head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); if (!head_ref) { - kfree(ref); + kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); return -ENOMEM; } @@ -826,7 +838,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; - head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); if (!head_ref) return -ENOMEM; @@ -860,3 +872,51 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) return btrfs_delayed_node_to_head(ref); return NULL; } + +void btrfs_delayed_ref_exit(void) +{ + if (btrfs_delayed_ref_head_cachep) + kmem_cache_destroy(btrfs_delayed_ref_head_cachep); + if (btrfs_delayed_tree_ref_cachep) + kmem_cache_destroy(btrfs_delayed_tree_ref_cachep); + if (btrfs_delayed_data_ref_cachep) + kmem_cache_destroy(btrfs_delayed_data_ref_cachep); + if (btrfs_delayed_extent_op_cachep) + kmem_cache_destroy(btrfs_delayed_extent_op_cachep); +} + +int btrfs_delayed_ref_init(void) +{ + btrfs_delayed_ref_head_cachep = kmem_cache_create( + "btrfs_delayed_ref_head", + sizeof(struct btrfs_delayed_ref_head), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_delayed_ref_head_cachep) + goto fail; + + btrfs_delayed_tree_ref_cachep = kmem_cache_create( + "btrfs_delayed_tree_ref", + sizeof(struct btrfs_delayed_tree_ref), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_delayed_tree_ref_cachep) + goto fail; + + btrfs_delayed_data_ref_cachep = kmem_cache_create( + "btrfs_delayed_data_ref", + sizeof(struct btrfs_delayed_data_ref), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_delayed_data_ref_cachep) + goto fail; + + btrfs_delayed_extent_op_cachep = kmem_cache_create( + "btrfs_delayed_extent_op", + sizeof(struct btrfs_delayed_extent_op), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_delayed_extent_op_cachep) + goto fail; + + return 0; +fail: + btrfs_delayed_ref_exit(); + return -ENOMEM; +} diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index c9d7036..f75fcaf 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -132,6 +132,15 @@ struct btrfs_delayed_ref_root { unsigned long num_heads_ready; /* + * bumped when someone is making progress on the delayed + * refs, so that other procs know they are just adding to + * contention intead of helping + */ + atomic_t procs_running_refs; + atomic_t ref_seq; + wait_queue_head_t wait; + + /* * set when the tree is flushing before a transaction commit, * used by the throttling code to decide if new updates need * to be run right away @@ -141,12 +150,47 @@ struct btrfs_delayed_ref_root { u64 run_delayed_start; }; +extern struct kmem_cache *btrfs_delayed_ref_head_cachep; +extern struct kmem_cache *btrfs_delayed_tree_ref_cachep; +extern struct kmem_cache *btrfs_delayed_data_ref_cachep; +extern struct kmem_cache *btrfs_delayed_extent_op_cachep; + +int btrfs_delayed_ref_init(void); +void btrfs_delayed_ref_exit(void); + +static inline struct btrfs_delayed_extent_op * +btrfs_alloc_delayed_extent_op(void) +{ + return kmem_cache_alloc(btrfs_delayed_extent_op_cachep, GFP_NOFS); +} + +static inline void +btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op) +{ + if (op) + kmem_cache_free(btrfs_delayed_extent_op_cachep, op); +} + static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) { WARN_ON(atomic_read(&ref->refs) == 0); if (atomic_dec_and_test(&ref->refs)) { WARN_ON(ref->in_tree); - kfree(ref); + switch (ref->type) { + case BTRFS_TREE_BLOCK_REF_KEY: + case BTRFS_SHARED_BLOCK_REF_KEY: + kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + case BTRFS_SHARED_DATA_REF_KEY: + kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); + break; + case 0: + kmem_cache_free(btrfs_delayed_ref_head_cachep, ref); + break; + default: + BUG(); + } } } @@ -176,8 +220,14 @@ struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head); +static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) +{ + mutex_unlock(&head->mutex); +} + int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, struct list_head *cluster, u64 search_start); +void btrfs_release_ref_cluster(struct list_head *cluster); int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 66dbc8d..7ba7b39 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -465,7 +465,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, * flush all outstanding I/O and inode extent mappings before the * copy operation is declared as being finished */ - btrfs_start_delalloc_inodes(root, 0); + ret = btrfs_start_delalloc_inodes(root, 0); + if (ret) { + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return ret; + } btrfs_wait_ordered_extents(root, 0); trans = btrfs_start_transaction(root, 0); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a8f652d..02369a3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -46,6 +46,7 @@ #include "check-integrity.h" #include "rcu-string.h" #include "dev-replace.h" +#include "raid56.h" #ifdef CONFIG_X86 #include <asm/cpufeature.h> @@ -56,7 +57,8 @@ static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int read_only); -static void btrfs_destroy_ordered_operations(struct btrfs_root *root); +static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, + struct btrfs_root *root); static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_root *root); @@ -420,7 +422,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) { struct extent_io_tree *tree; - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 start = page_offset(page); u64 found_start; struct extent_buffer *eb; @@ -639,8 +641,15 @@ err: btree_readahead_hook(root, eb, eb->start, ret); } - if (ret) + if (ret) { + /* + * our io error hook is going to dec the io pages + * again, we have to make sure it has something + * to decrement + */ + atomic_inc(&eb->io_pages); clear_extent_buffer_uptodate(eb); + } free_extent_buffer(eb); out: return ret; @@ -654,6 +663,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror) eb = (struct extent_buffer *)page->private; set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); eb->read_mirror = failed_mirror; + atomic_dec(&eb->io_pages); if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) btree_readahead_hook(root, eb, eb->start, -EIO); return -EIO; /* we fixed nothing */ @@ -670,17 +680,23 @@ static void end_workqueue_bio(struct bio *bio, int err) end_io_wq->work.flags = 0; if (bio->bi_rw & REQ_WRITE) { - if (end_io_wq->metadata == 1) + if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) btrfs_queue_worker(&fs_info->endio_meta_write_workers, &end_io_wq->work); - else if (end_io_wq->metadata == 2) + else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) btrfs_queue_worker(&fs_info->endio_freespace_worker, &end_io_wq->work); + else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) + btrfs_queue_worker(&fs_info->endio_raid56_workers, + &end_io_wq->work); else btrfs_queue_worker(&fs_info->endio_write_workers, &end_io_wq->work); } else { - if (end_io_wq->metadata) + if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) + btrfs_queue_worker(&fs_info->endio_raid56_workers, + &end_io_wq->work); + else if (end_io_wq->metadata) btrfs_queue_worker(&fs_info->endio_meta_workers, &end_io_wq->work); else @@ -695,6 +711,7 @@ static void end_workqueue_bio(struct bio *bio, int err) * 0 - if data * 1 - if normal metadta * 2 - if writing to the free space cache area + * 3 - raid parity work */ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, int metadata) @@ -946,18 +963,20 @@ static int btree_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct extent_io_tree *tree; + struct btrfs_fs_info *fs_info; + int ret; + tree = &BTRFS_I(mapping->host)->io_tree; if (wbc->sync_mode == WB_SYNC_NONE) { - struct btrfs_root *root = BTRFS_I(mapping->host)->root; - u64 num_dirty; - unsigned long thresh = 32 * 1024 * 1024; if (wbc->for_kupdate) return 0; + fs_info = BTRFS_I(mapping->host)->root->fs_info; /* this is a bit racy, but that's ok */ - num_dirty = root->fs_info->dirty_metadata_bytes; - if (num_dirty < thresh) + ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes, + BTRFS_DIRTY_METADATA_THRESH); + if (ret < 0) return 0; } return btree_write_cache_pages(mapping, wbc); @@ -1125,24 +1144,16 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) { + struct btrfs_fs_info *fs_info = root->fs_info; + if (btrfs_header_generation(buf) == - root->fs_info->running_transaction->transid) { + fs_info->running_transaction->transid) { btrfs_assert_tree_locked(buf); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { - spin_lock(&root->fs_info->delalloc_lock); - if (root->fs_info->dirty_metadata_bytes >= buf->len) - root->fs_info->dirty_metadata_bytes -= buf->len; - else { - spin_unlock(&root->fs_info->delalloc_lock); - btrfs_panic(root->fs_info, -EOVERFLOW, - "Can't clear %lu bytes from " - " dirty_mdatadata_bytes (%llu)", - buf->len, - root->fs_info->dirty_metadata_bytes); - } - spin_unlock(&root->fs_info->delalloc_lock); - + __percpu_counter_add(&fs_info->dirty_metadata_bytes, + -buf->len, + fs_info->dirty_metadata_batch); /* ugh, clear_extent_buffer_dirty needs to lock the page */ btrfs_set_lock_blocking(buf); clear_extent_buffer_dirty(buf); @@ -1178,9 +1189,13 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, INIT_LIST_HEAD(&root->dirty_list); INIT_LIST_HEAD(&root->root_list); + INIT_LIST_HEAD(&root->logged_list[0]); + INIT_LIST_HEAD(&root->logged_list[1]); spin_lock_init(&root->orphan_lock); spin_lock_init(&root->inode_lock); spin_lock_init(&root->accounting_lock); + spin_lock_init(&root->log_extents_lock[0]); + spin_lock_init(&root->log_extents_lock[1]); mutex_init(&root->objectid_mutex); mutex_init(&root->log_mutex); init_waitqueue_head(&root->log_writer_wait); @@ -2004,10 +2019,24 @@ int open_ctree(struct super_block *sb, goto fail_srcu; } + ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0); + if (ret) { + err = ret; + goto fail_bdi; + } + fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE * + (1 + ilog2(nr_cpu_ids)); + + ret = percpu_counter_init(&fs_info->delalloc_bytes, 0); + if (ret) { + err = ret; + goto fail_dirty_metadata_bytes; + } + fs_info->btree_inode = new_inode(sb); if (!fs_info->btree_inode) { err = -ENOMEM; - goto fail_bdi; + goto fail_delalloc_bytes; } mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); @@ -2017,7 +2046,6 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); INIT_LIST_HEAD(&fs_info->delalloc_inodes); - INIT_LIST_HEAD(&fs_info->ordered_operations); INIT_LIST_HEAD(&fs_info->caching_block_groups); spin_lock_init(&fs_info->delalloc_lock); spin_lock_init(&fs_info->trans_lock); @@ -2028,6 +2056,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->tree_mod_seq_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->reloc_mutex); + seqlock_init(&fs_info->profiles_lock); init_completion(&fs_info->kobj_unregister); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); @@ -2126,6 +2155,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT; + fs_info->first_logical_byte = (u64)-1; extent_io_tree_init(&fs_info->freed_extents[0], fs_info->btree_inode->i_mapping); @@ -2165,6 +2195,12 @@ int open_ctree(struct super_block *sb, init_waitqueue_head(&fs_info->transaction_blocked_wait); init_waitqueue_head(&fs_info->async_submit_wait); + ret = btrfs_alloc_stripe_hash_table(fs_info); + if (ret) { + err = ret; + goto fail_alloc; + } + __setup_root(4096, 4096, 4096, 4096, tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID); @@ -2187,7 +2223,8 @@ int open_ctree(struct super_block *sb, goto fail_alloc; /* check FS state, whether FS is broken. */ - fs_info->fs_state |= btrfs_super_flags(disk_super); + if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) + set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); if (ret) { @@ -2261,6 +2298,8 @@ int open_ctree(struct super_block *sb, leafsize = btrfs_super_leafsize(disk_super); sectorsize = btrfs_super_sectorsize(disk_super); stripesize = btrfs_super_stripesize(disk_super); + fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids)); + fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); /* * mixed block groups end up with duplicate but slightly offset @@ -2332,6 +2371,12 @@ int open_ctree(struct super_block *sb, btrfs_init_workers(&fs_info->endio_meta_write_workers, "endio-meta-write", fs_info->thread_pool_size, &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_raid56_workers, + "endio-raid56", fs_info->thread_pool_size, + &fs_info->generic_worker); + btrfs_init_workers(&fs_info->rmw_workers, + "rmw", fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", fs_info->thread_pool_size, &fs_info->generic_worker); @@ -2350,6 +2395,8 @@ int open_ctree(struct super_block *sb, */ fs_info->endio_workers.idle_thresh = 4; fs_info->endio_meta_workers.idle_thresh = 4; + fs_info->endio_raid56_workers.idle_thresh = 4; + fs_info->rmw_workers.idle_thresh = 2; fs_info->endio_write_workers.idle_thresh = 2; fs_info->endio_meta_write_workers.idle_thresh = 2; @@ -2366,6 +2413,8 @@ int open_ctree(struct super_block *sb, ret |= btrfs_start_workers(&fs_info->fixup_workers); ret |= btrfs_start_workers(&fs_info->endio_workers); ret |= btrfs_start_workers(&fs_info->endio_meta_workers); + ret |= btrfs_start_workers(&fs_info->rmw_workers); + ret |= btrfs_start_workers(&fs_info->endio_raid56_workers); ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers); ret |= btrfs_start_workers(&fs_info->endio_write_workers); ret |= btrfs_start_workers(&fs_info->endio_freespace_worker); @@ -2390,8 +2439,7 @@ int open_ctree(struct super_block *sb, sb->s_blocksize = sectorsize; sb->s_blocksize_bits = blksize_bits(sectorsize); - if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC, - sizeof(disk_super->magic))) { + if (disk_super->magic != cpu_to_le64(BTRFS_MAGIC)) { printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id); goto fail_sb_buffer; } @@ -2694,13 +2742,13 @@ fail_cleaner: * kthreads */ filemap_write_and_wait(fs_info->btree_inode->i_mapping); - invalidate_inode_pages2(fs_info->btree_inode->i_mapping); fail_block_groups: btrfs_free_block_groups(fs_info); fail_tree_roots: free_root_pointers(fs_info, 1); + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); fail_sb_buffer: btrfs_stop_workers(&fs_info->generic_worker); @@ -2710,6 +2758,8 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->workers); btrfs_stop_workers(&fs_info->endio_workers); btrfs_stop_workers(&fs_info->endio_meta_workers); + btrfs_stop_workers(&fs_info->endio_raid56_workers); + btrfs_stop_workers(&fs_info->rmw_workers); btrfs_stop_workers(&fs_info->endio_meta_write_workers); btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->endio_freespace_worker); @@ -2721,13 +2771,17 @@ fail_alloc: fail_iput: btrfs_mapping_tree_free(&fs_info->mapping_tree); - invalidate_inode_pages2(fs_info->btree_inode->i_mapping); iput(fs_info->btree_inode); +fail_delalloc_bytes: + percpu_counter_destroy(&fs_info->delalloc_bytes); +fail_dirty_metadata_bytes: + percpu_counter_destroy(&fs_info->dirty_metadata_bytes); fail_bdi: bdi_destroy(&fs_info->bdi); fail_srcu: cleanup_srcu_struct(&fs_info->subvol_srcu); fail: + btrfs_free_stripe_hash_table(fs_info); btrfs_close_devices(fs_info->fs_devices); return err; @@ -2795,8 +2849,7 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) super = (struct btrfs_super_block *)bh->b_data; if (btrfs_super_bytenr(super) != bytenr || - strncmp((char *)(&super->magic), BTRFS_MAGIC, - sizeof(super->magic))) { + super->magic != cpu_to_le64(BTRFS_MAGIC)) { brelse(bh); continue; } @@ -3076,11 +3129,16 @@ int btrfs_calc_num_tolerated_disk_barrier_failures( ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0))) num_tolerated_disk_barrier_failures = 0; - else if (num_tolerated_disk_barrier_failures > 1 - && - (flags & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10))) - num_tolerated_disk_barrier_failures = 1; + else if (num_tolerated_disk_barrier_failures > 1) { + if (flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID10)) { + num_tolerated_disk_barrier_failures = 1; + } else if (flags & + BTRFS_BLOCK_GROUP_RAID5) { + num_tolerated_disk_barrier_failures = 2; + } + } } } up_read(&sinfo->groups_sem); @@ -3195,6 +3253,11 @@ void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) if (btrfs_root_refs(&root->root_item) == 0) synchronize_srcu(&fs_info->subvol_srcu); + if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + btrfs_free_log(NULL, root); + btrfs_free_log_root_tree(NULL, fs_info); + } + __btrfs_remove_free_space_cache(root->free_ino_pinned); __btrfs_remove_free_space_cache(root->free_ino_ctl); free_fs_root(root); @@ -3339,7 +3402,7 @@ int close_ctree(struct btrfs_root *root) printk(KERN_ERR "btrfs: commit super ret %d\n", ret); } - if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) btrfs_error_commit_super(root); btrfs_put_block_group_cache(fs_info); @@ -3352,9 +3415,9 @@ int close_ctree(struct btrfs_root *root) btrfs_free_qgroup_config(root->fs_info); - if (fs_info->delalloc_bytes) { - printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", - (unsigned long long)fs_info->delalloc_bytes); + if (percpu_counter_sum(&fs_info->delalloc_bytes)) { + printk(KERN_INFO "btrfs: at unmount delalloc count %lld\n", + percpu_counter_sum(&fs_info->delalloc_bytes)); } free_extent_buffer(fs_info->extent_root->node); @@ -3384,6 +3447,8 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->workers); btrfs_stop_workers(&fs_info->endio_workers); btrfs_stop_workers(&fs_info->endio_meta_workers); + btrfs_stop_workers(&fs_info->endio_raid56_workers); + btrfs_stop_workers(&fs_info->rmw_workers); btrfs_stop_workers(&fs_info->endio_meta_write_workers); btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->endio_freespace_worker); @@ -3401,9 +3466,13 @@ int close_ctree(struct btrfs_root *root) btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); + percpu_counter_destroy(&fs_info->dirty_metadata_bytes); + percpu_counter_destroy(&fs_info->delalloc_bytes); bdi_destroy(&fs_info->bdi); cleanup_srcu_struct(&fs_info->subvol_srcu); + btrfs_free_stripe_hash_table(fs_info); + return 0; } @@ -3443,11 +3512,10 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) (unsigned long long)transid, (unsigned long long)root->fs_info->generation); was_dirty = set_extent_buffer_dirty(buf); - if (!was_dirty) { - spin_lock(&root->fs_info->delalloc_lock); - root->fs_info->dirty_metadata_bytes += buf->len; - spin_unlock(&root->fs_info->delalloc_lock); - } + if (!was_dirty) + __percpu_counter_add(&root->fs_info->dirty_metadata_bytes, + buf->len, + root->fs_info->dirty_metadata_batch); } static void __btrfs_btree_balance_dirty(struct btrfs_root *root, @@ -3457,8 +3525,7 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root, * looks as though older kernels can get into trouble with * this code, they end up stuck in balance_dirty_pages forever */ - u64 num_dirty; - unsigned long thresh = 32 * 1024 * 1024; + int ret; if (current->flags & PF_MEMALLOC) return; @@ -3466,9 +3533,9 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root, if (flush_delayed) btrfs_balance_delayed_items(root); - num_dirty = root->fs_info->dirty_metadata_bytes; - - if (num_dirty > thresh) { + ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes, + BTRFS_DIRTY_METADATA_THRESH); + if (ret > 0) { balance_dirty_pages_ratelimited( root->fs_info->btree_inode->i_mapping); } @@ -3518,7 +3585,8 @@ void btrfs_error_commit_super(struct btrfs_root *root) btrfs_cleanup_transaction(root); } -static void btrfs_destroy_ordered_operations(struct btrfs_root *root) +static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, + struct btrfs_root *root) { struct btrfs_inode *btrfs_inode; struct list_head splice; @@ -3528,7 +3596,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root) mutex_lock(&root->fs_info->ordered_operations_mutex); spin_lock(&root->fs_info->ordered_extent_lock); - list_splice_init(&root->fs_info->ordered_operations, &splice); + list_splice_init(&t->ordered_operations, &splice); while (!list_empty(&splice)) { btrfs_inode = list_entry(splice.next, struct btrfs_inode, ordered_operations); @@ -3544,35 +3612,16 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root) static void btrfs_destroy_ordered_extents(struct btrfs_root *root) { - struct list_head splice; struct btrfs_ordered_extent *ordered; - struct inode *inode; - - INIT_LIST_HEAD(&splice); spin_lock(&root->fs_info->ordered_extent_lock); - - list_splice_init(&root->fs_info->ordered_extents, &splice); - while (!list_empty(&splice)) { - ordered = list_entry(splice.next, struct btrfs_ordered_extent, - root_extent_list); - - list_del_init(&ordered->root_extent_list); - atomic_inc(&ordered->refs); - - /* the inode may be getting freed (in sys_unlink path). */ - inode = igrab(ordered->inode); - - spin_unlock(&root->fs_info->ordered_extent_lock); - if (inode) - iput(inode); - - atomic_set(&ordered->refs, 1); - btrfs_put_ordered_extent(ordered); - - spin_lock(&root->fs_info->ordered_extent_lock); - } - + /* + * This will just short circuit the ordered completion stuff which will + * make sure the ordered extent gets properly cleaned up. + */ + list_for_each_entry(ordered, &root->fs_info->ordered_extents, + root_extent_list) + set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); spin_unlock(&root->fs_info->ordered_extent_lock); } @@ -3594,11 +3643,11 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, } while ((node = rb_first(&delayed_refs->root)) != NULL) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + struct btrfs_delayed_ref_head *head = NULL; + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); atomic_set(&ref->refs, 1); if (btrfs_delayed_ref_is_head(ref)) { - struct btrfs_delayed_ref_head *head; head = btrfs_delayed_node_to_head(ref); if (!mutex_trylock(&head->mutex)) { @@ -3614,16 +3663,18 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, continue; } - kfree(head->extent_op); + btrfs_free_delayed_extent_op(head->extent_op); delayed_refs->num_heads--; if (list_empty(&head->cluster)) delayed_refs->num_heads_ready--; list_del_init(&head->cluster); } + ref->in_tree = 0; rb_erase(&ref->rb_node, &delayed_refs->root); delayed_refs->num_entries--; - + if (head) + mutex_unlock(&head->mutex); spin_unlock(&delayed_refs->lock); btrfs_put_delayed_ref(ref); @@ -3671,6 +3722,8 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) delalloc_inodes); list_del_init(&btrfs_inode->delalloc_inodes); + clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &btrfs_inode->runtime_flags); btrfs_invalidate_inodes(btrfs_inode->root); } @@ -3823,10 +3876,8 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) while (!list_empty(&list)) { t = list_entry(list.next, struct btrfs_transaction, list); - if (!t) - break; - btrfs_destroy_ordered_operations(root); + btrfs_destroy_ordered_operations(t, root); btrfs_destroy_ordered_extents(root); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 305c33e..034d7dc 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -25,6 +25,13 @@ #define BTRFS_SUPER_MIRROR_MAX 3 #define BTRFS_SUPER_MIRROR_SHIFT 12 +enum { + BTRFS_WQ_ENDIO_DATA = 0, + BTRFS_WQ_ENDIO_METADATA = 1, + BTRFS_WQ_ENDIO_FREE_SPACE = 2, + BTRFS_WQ_ENDIO_RAID56 = 3, +}; + static inline u64 btrfs_sb_offset(int mirror) { u64 start = 16 * 1024; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1e59ed5..3e074da 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -31,6 +31,7 @@ #include "print-tree.h" #include "transaction.h" #include "volumes.h" +#include "raid56.h" #include "locking.h" #include "free-space-cache.h" #include "math.h" @@ -72,8 +73,7 @@ enum { RESERVE_ALLOC_NO_ACCOUNT = 2, }; -static int update_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static int update_block_group(struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -103,6 +103,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int dump_block_groups); static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, u64 num_bytes, int reserve); +static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, + u64 num_bytes); static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) @@ -162,6 +164,10 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, rb_link_node(&block_group->cache_node, parent, p); rb_insert_color(&block_group->cache_node, &info->block_group_cache_tree); + + if (info->first_logical_byte > block_group->key.objectid) + info->first_logical_byte = block_group->key.objectid; + spin_unlock(&info->block_group_cache_lock); return 0; @@ -203,8 +209,11 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, break; } } - if (ret) + if (ret) { btrfs_get_block_group(ret); + if (bytenr == 0 && info->first_logical_byte > ret->key.objectid) + info->first_logical_byte = ret->key.objectid; + } spin_unlock(&info->block_group_cache_lock); return ret; @@ -468,8 +477,6 @@ out: } static int cache_block_group(struct btrfs_block_group_cache *cache, - struct btrfs_trans_handle *trans, - struct btrfs_root *root, int load_cache_only) { DEFINE_WAIT(wait); @@ -527,12 +534,6 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, cache->cached = BTRFS_CACHE_FAST; spin_unlock(&cache->lock); - /* - * We can't do the read from on-disk cache during a commit since we need - * to have the normal tree locking. Also if we are currently trying to - * allocate blocks for the tree root we can't do the fast caching since - * we likely hold important locks. - */ if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { ret = load_free_space_cache(fs_info, cache); @@ -1852,6 +1853,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, *actual_bytes = discarded_bytes; + if (ret == -EOPNOTSUPP) + ret = 0; return ret; } @@ -2143,7 +2146,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, node->num_bytes); } } - mutex_unlock(&head->mutex); return ret; } @@ -2258,7 +2260,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, * process of being added. Don't run this ref yet. */ list_del_init(&locked_ref->cluster); - mutex_unlock(&locked_ref->mutex); + btrfs_delayed_ref_unlock(locked_ref); locked_ref = NULL; delayed_refs->num_heads_ready++; spin_unlock(&delayed_refs->lock); @@ -2285,7 +2287,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ref = &locked_ref->node; if (extent_op && must_insert_reserved) { - kfree(extent_op); + btrfs_free_delayed_extent_op(extent_op); extent_op = NULL; } @@ -2294,28 +2296,25 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ret = run_delayed_extent_op(trans, root, ref, extent_op); - kfree(extent_op); + btrfs_free_delayed_extent_op(extent_op); if (ret) { - list_del_init(&locked_ref->cluster); - mutex_unlock(&locked_ref->mutex); - - printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret); + printk(KERN_DEBUG + "btrfs: run_delayed_extent_op " + "returned %d\n", ret); spin_lock(&delayed_refs->lock); + btrfs_delayed_ref_unlock(locked_ref); return ret; } goto next; } - - list_del_init(&locked_ref->cluster); - locked_ref = NULL; } ref->in_tree = 0; rb_erase(&ref->rb_node, &delayed_refs->root); delayed_refs->num_entries--; - if (locked_ref) { + if (!btrfs_delayed_ref_is_head(ref)) { /* * when we play the delayed ref, also correct the * ref_mod on head @@ -2337,20 +2336,29 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ret = run_one_delayed_ref(trans, root, ref, extent_op, must_insert_reserved); - btrfs_put_delayed_ref(ref); - kfree(extent_op); - count++; - + btrfs_free_delayed_extent_op(extent_op); if (ret) { - if (locked_ref) { - list_del_init(&locked_ref->cluster); - mutex_unlock(&locked_ref->mutex); - } - printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret); + btrfs_delayed_ref_unlock(locked_ref); + btrfs_put_delayed_ref(ref); + printk(KERN_DEBUG + "btrfs: run_one_delayed_ref returned %d\n", ret); spin_lock(&delayed_refs->lock); return ret; } + /* + * If this node is a head, that means all the refs in this head + * have been dealt with, and we will pick the next head to deal + * with, so we must unlock the head and drop it from the cluster + * list before we release it. + */ + if (btrfs_delayed_ref_is_head(ref)) { + list_del_init(&locked_ref->cluster); + btrfs_delayed_ref_unlock(locked_ref); + locked_ref = NULL; + } + btrfs_put_delayed_ref(ref); + count++; next: cond_resched(); spin_lock(&delayed_refs->lock); @@ -2435,6 +2443,16 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, return ret; } +static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq, + int count) +{ + int val = atomic_read(&delayed_refs->ref_seq); + + if (val < seq || val >= seq + count) + return 1; + return 0; +} + /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. count can be @@ -2469,6 +2487,44 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; INIT_LIST_HEAD(&cluster); + if (count == 0) { + count = delayed_refs->num_entries * 2; + run_most = 1; + } + + if (!run_all && !run_most) { + int old; + int seq = atomic_read(&delayed_refs->ref_seq); + +progress: + old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); + if (old) { + DEFINE_WAIT(__wait); + if (delayed_refs->num_entries < 16348) + return 0; + + prepare_to_wait(&delayed_refs->wait, &__wait, + TASK_UNINTERRUPTIBLE); + + old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); + if (old) { + schedule(); + finish_wait(&delayed_refs->wait, &__wait); + + if (!refs_newer(delayed_refs, seq, 256)) + goto progress; + else + return 0; + } else { + finish_wait(&delayed_refs->wait, &__wait); + goto again; + } + } + + } else { + atomic_inc(&delayed_refs->procs_running_refs); + } + again: loops = 0; spin_lock(&delayed_refs->lock); @@ -2477,10 +2533,6 @@ again: delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); #endif - if (count == 0) { - count = delayed_refs->num_entries * 2; - run_most = 1; - } while (1) { if (!(run_all || run_most) && delayed_refs->num_heads_ready < 64) @@ -2500,11 +2552,15 @@ again: ret = run_clustered_refs(trans, root, &cluster); if (ret < 0) { + btrfs_release_ref_cluster(&cluster); spin_unlock(&delayed_refs->lock); btrfs_abort_transaction(trans, root, ret); + atomic_dec(&delayed_refs->procs_running_refs); return ret; } + atomic_add(ret, &delayed_refs->ref_seq); + count -= min_t(unsigned long, ret, count); if (count == 0) @@ -2573,6 +2629,11 @@ again: goto again; } out: + atomic_dec(&delayed_refs->procs_running_refs); + smp_mb(); + if (waitqueue_active(&delayed_refs->wait)) + wake_up(&delayed_refs->wait); + spin_unlock(&delayed_refs->lock); assert_qgroups_uptodate(trans); return 0; @@ -2586,7 +2647,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct btrfs_delayed_extent_op *extent_op; int ret; - extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + extent_op = btrfs_alloc_delayed_extent_op(); if (!extent_op) return -ENOMEM; @@ -2598,7 +2659,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, num_bytes, extent_op); if (ret) - kfree(extent_op); + btrfs_free_delayed_extent_op(extent_op); return ret; } @@ -3223,12 +3284,14 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) u64 extra_flags = chunk_to_extended(flags) & BTRFS_EXTENDED_PROFILE_MASK; + write_seqlock(&fs_info->profiles_lock); if (flags & BTRFS_BLOCK_GROUP_DATA) fs_info->avail_data_alloc_bits |= extra_flags; if (flags & BTRFS_BLOCK_GROUP_METADATA) fs_info->avail_metadata_alloc_bits |= extra_flags; if (flags & BTRFS_BLOCK_GROUP_SYSTEM) fs_info->avail_system_alloc_bits |= extra_flags; + write_sequnlock(&fs_info->profiles_lock); } /* @@ -3276,6 +3339,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) u64 num_devices = root->fs_info->fs_devices->rw_devices + root->fs_info->fs_devices->missing_devices; u64 target; + u64 tmp; /* * see if restripe for this chunk_type is in progress, if so @@ -3292,40 +3356,48 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) } spin_unlock(&root->fs_info->balance_lock); + /* First, mask out the RAID levels which aren't possible */ if (num_devices == 1) - flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); + flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID5); + if (num_devices < 3) + flags &= ~BTRFS_BLOCK_GROUP_RAID6; if (num_devices < 4) flags &= ~BTRFS_BLOCK_GROUP_RAID10; - if ((flags & BTRFS_BLOCK_GROUP_DUP) && - (flags & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10))) { - flags &= ~BTRFS_BLOCK_GROUP_DUP; - } - - if ((flags & BTRFS_BLOCK_GROUP_RAID1) && - (flags & BTRFS_BLOCK_GROUP_RAID10)) { - flags &= ~BTRFS_BLOCK_GROUP_RAID1; - } + tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); + flags &= ~tmp; - if ((flags & BTRFS_BLOCK_GROUP_RAID0) && - ((flags & BTRFS_BLOCK_GROUP_RAID1) | - (flags & BTRFS_BLOCK_GROUP_RAID10) | - (flags & BTRFS_BLOCK_GROUP_DUP))) { - flags &= ~BTRFS_BLOCK_GROUP_RAID0; - } + if (tmp & BTRFS_BLOCK_GROUP_RAID6) + tmp = BTRFS_BLOCK_GROUP_RAID6; + else if (tmp & BTRFS_BLOCK_GROUP_RAID5) + tmp = BTRFS_BLOCK_GROUP_RAID5; + else if (tmp & BTRFS_BLOCK_GROUP_RAID10) + tmp = BTRFS_BLOCK_GROUP_RAID10; + else if (tmp & BTRFS_BLOCK_GROUP_RAID1) + tmp = BTRFS_BLOCK_GROUP_RAID1; + else if (tmp & BTRFS_BLOCK_GROUP_RAID0) + tmp = BTRFS_BLOCK_GROUP_RAID0; - return extended_to_chunk(flags); + return extended_to_chunk(flags | tmp); } static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) { - if (flags & BTRFS_BLOCK_GROUP_DATA) - flags |= root->fs_info->avail_data_alloc_bits; - else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - flags |= root->fs_info->avail_system_alloc_bits; - else if (flags & BTRFS_BLOCK_GROUP_METADATA) - flags |= root->fs_info->avail_metadata_alloc_bits; + unsigned seq; + + do { + seq = read_seqbegin(&root->fs_info->profiles_lock); + + if (flags & BTRFS_BLOCK_GROUP_DATA) + flags |= root->fs_info->avail_data_alloc_bits; + else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + flags |= root->fs_info->avail_system_alloc_bits; + else if (flags & BTRFS_BLOCK_GROUP_METADATA) + flags |= root->fs_info->avail_metadata_alloc_bits; + } while (read_seqretry(&root->fs_info->profiles_lock, seq)); return btrfs_reduce_alloc_profile(root, flags); } @@ -3333,6 +3405,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) { u64 flags; + u64 ret; if (data) flags = BTRFS_BLOCK_GROUP_DATA; @@ -3341,7 +3414,8 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) else flags = BTRFS_BLOCK_GROUP_METADATA; - return get_alloc_profile(root, flags); + ret = get_alloc_profile(root, flags); + return ret; } /* @@ -3357,7 +3431,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes) int ret = 0, committed = 0, alloc_chunk = 1; /* make sure bytes are sectorsize aligned */ - bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + bytes = ALIGN(bytes, root->sectorsize); if (root == root->fs_info->tree_root || BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) { @@ -3452,7 +3526,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) struct btrfs_space_info *data_sinfo; /* make sure bytes are sectorsize aligned */ - bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + bytes = ALIGN(bytes, root->sectorsize); data_sinfo = root->fs_info->data_sinfo; spin_lock(&data_sinfo->lock); @@ -3516,8 +3590,10 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) { u64 num_dev; - if (type & BTRFS_BLOCK_GROUP_RAID10 || - type & BTRFS_BLOCK_GROUP_RAID0) + if (type & (BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) num_dev = root->fs_info->fs_devices->rw_devices; else if (type & BTRFS_BLOCK_GROUP_RAID1) num_dev = 2; @@ -3564,6 +3640,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, int wait_for_alloc = 0; int ret = 0; + /* Don't re-enter if we're already allocating a chunk */ + if (trans->allocating_chunk) + return -ENOSPC; + space_info = __find_space_info(extent_root->fs_info, flags); if (!space_info) { ret = update_space_info(extent_root->fs_info, flags, @@ -3606,6 +3686,8 @@ again: goto again; } + trans->allocating_chunk = true; + /* * If we have mixed data/metadata chunks we want to make sure we keep * allocating mixed chunks instead of individual chunks. @@ -3632,19 +3714,20 @@ again: check_system_chunk(trans, extent_root, flags); ret = btrfs_alloc_chunk(trans, extent_root, flags); - if (ret < 0 && ret != -ENOSPC) - goto out; + trans->allocating_chunk = false; spin_lock(&space_info->lock); + if (ret < 0 && ret != -ENOSPC) + goto out; if (ret) space_info->full = 1; else ret = 1; space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; +out: space_info->chunk_alloc = 0; spin_unlock(&space_info->lock); -out: mutex_unlock(&fs_info->chunk_mutex); return ret; } @@ -3653,13 +3736,31 @@ static int can_overcommit(struct btrfs_root *root, struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush) { + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; u64 profile = btrfs_get_alloc_profile(root, 0); + u64 rsv_size = 0; u64 avail; u64 used; + u64 to_add; used = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use; + space_info->bytes_pinned + space_info->bytes_readonly; + + spin_lock(&global_rsv->lock); + rsv_size = global_rsv->size; + spin_unlock(&global_rsv->lock); + + /* + * We only want to allow over committing if we have lots of actual space + * free, but if we don't have enough space to handle the global reserve + * space then we could end up having a real enospc problem when trying + * to allocate a chunk or some other such important allocation. + */ + rsv_size <<= 1; + if (used + rsv_size >= space_info->total_bytes) + return 0; + + used += space_info->bytes_may_use; spin_lock(&root->fs_info->free_chunk_lock); avail = root->fs_info->free_chunk_space; @@ -3667,40 +3768,58 @@ static int can_overcommit(struct btrfs_root *root, /* * If we have dup, raid1 or raid10 then only half of the free - * space is actually useable. + * space is actually useable. For raid56, the space info used + * doesn't include the parity drive, so we don't have to + * change the math */ if (profile & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) avail >>= 1; + to_add = space_info->total_bytes; + /* * If we aren't flushing all things, let us overcommit up to * 1/2th of the space. If we can flush, don't let us overcommit * too much, let it overcommit up to 1/8 of the space. */ if (flush == BTRFS_RESERVE_FLUSH_ALL) - avail >>= 3; + to_add >>= 3; else - avail >>= 1; + to_add >>= 1; + + /* + * Limit the overcommit to the amount of free space we could possibly + * allocate for chunks. + */ + to_add = min(avail, to_add); - if (used + bytes < space_info->total_bytes + avail) + if (used + bytes < space_info->total_bytes + to_add) return 1; return 0; } -static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb, - unsigned long nr_pages, - enum wb_reason reason) +void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, + unsigned long nr_pages) { - if (!writeback_in_progress(sb->s_bdi) && - down_read_trylock(&sb->s_umount)) { - writeback_inodes_sb_nr(sb, nr_pages, reason); - up_read(&sb->s_umount); - return 1; - } + struct super_block *sb = root->fs_info->sb; + int started; - return 0; + /* If we can not start writeback, just sync all the delalloc file. */ + started = try_to_writeback_inodes_sb_nr(sb, nr_pages, + WB_REASON_FS_FREE_SPACE); + if (!started) { + /* + * We needn't worry the filesystem going from r/w to r/o though + * we don't acquire ->s_umount mutex, because the filesystem + * should guarantee the delalloc inodes list be empty after + * the filesystem is readonly(all dirty pages are written to + * the disk). + */ + btrfs_start_delalloc_inodes(root, 0); + btrfs_wait_ordered_extents(root, 0); + } } /* @@ -3724,7 +3843,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, space_info = block_rsv->space_info; smp_mb(); - delalloc_bytes = root->fs_info->delalloc_bytes; + delalloc_bytes = percpu_counter_sum_positive( + &root->fs_info->delalloc_bytes); if (delalloc_bytes == 0) { if (trans) return; @@ -3735,10 +3855,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, while (delalloc_bytes && loops < 3) { max_reclaim = min(delalloc_bytes, to_reclaim); nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; - writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb, - nr_pages, - WB_REASON_FS_FREE_SPACE); - + btrfs_writeback_inodes_sb_nr(root, nr_pages); /* * We need to wait for the async pages to actually start before * we do anything. @@ -3766,7 +3883,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, break; } smp_mb(); - delalloc_bytes = root->fs_info->delalloc_bytes; + delalloc_bytes = percpu_counter_sum_positive( + &root->fs_info->delalloc_bytes); } } @@ -4030,6 +4148,15 @@ again: goto again; out: + if (ret == -ENOSPC && + unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { + struct btrfs_block_rsv *global_rsv = + &root->fs_info->global_block_rsv; + + if (block_rsv != global_rsv && + !block_rsv_use_bytes(global_rsv, orig_bytes)) + ret = 0; + } if (flushing) { spin_lock(&space_info->lock); space_info->flush = 0; @@ -4416,19 +4543,60 @@ void btrfs_orphan_release_metadata(struct inode *inode) btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); } -int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending) +/* + * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation + * root: the root of the parent directory + * rsv: block reservation + * items: the number of items that we need do reservation + * qgroup_reserved: used to return the reserved size in qgroup + * + * This function is used to reserve the space for snapshot/subvolume + * creation and deletion. Those operations are different with the + * common file/directory operations, they change two fs/file trees + * and root tree, the number of items that the qgroup reserves is + * different with the free space reservation. So we can not use + * the space reseravtion mechanism in start_transaction(). + */ +int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + int items, + u64 *qgroup_reserved) { - struct btrfs_root *root = pending->root; - struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); - struct btrfs_block_rsv *dst_rsv = &pending->block_rsv; - /* - * two for root back/forward refs, two for directory entries, - * one for root of the snapshot and one for parent inode. - */ - u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6); - dst_rsv->space_info = src_rsv->space_info; - return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); + u64 num_bytes; + int ret; + + if (root->fs_info->quota_enabled) { + /* One for parent inode, two for dir entries */ + num_bytes = 3 * root->leafsize; + ret = btrfs_qgroup_reserve(root, num_bytes); + if (ret) + return ret; + } else { + num_bytes = 0; + } + + *qgroup_reserved = num_bytes; + + num_bytes = btrfs_calc_trans_metadata_size(root, items); + rsv->space_info = __find_space_info(root->fs_info, + BTRFS_BLOCK_GROUP_METADATA); + ret = btrfs_block_rsv_add(root, rsv, num_bytes, + BTRFS_RESERVE_FLUSH_ALL); + if (ret) { + if (*qgroup_reserved) + btrfs_qgroup_free(root, *qgroup_reserved); + } + + return ret; +} + +void btrfs_subvolume_release_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + u64 qgroup_reserved) +{ + btrfs_block_rsv_release(root, rsv, (u64)-1); + if (qgroup_reserved) + btrfs_qgroup_free(root, qgroup_reserved); } /** @@ -4536,6 +4704,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; int ret = 0; bool delalloc_lock = true; + u64 to_free = 0; + unsigned dropped; /* If we are a free space inode we need to not flush since we will be in * the middle of a transaction commit. We also don't need the delalloc @@ -4579,54 +4749,19 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) csum_bytes = BTRFS_I(inode)->csum_bytes; spin_unlock(&BTRFS_I(inode)->lock); - if (root->fs_info->quota_enabled) + if (root->fs_info->quota_enabled) { ret = btrfs_qgroup_reserve(root, num_bytes + nr_extents * root->leafsize); + if (ret) + goto out_fail; + } - /* - * ret != 0 here means the qgroup reservation failed, we go straight to - * the shared error handling then. - */ - if (ret == 0) - ret = reserve_metadata_bytes(root, block_rsv, - to_reserve, flush); - - if (ret) { - u64 to_free = 0; - unsigned dropped; - - spin_lock(&BTRFS_I(inode)->lock); - dropped = drop_outstanding_extent(inode); - /* - * If the inodes csum_bytes is the same as the original - * csum_bytes then we know we haven't raced with any free()ers - * so we can just reduce our inodes csum bytes and carry on. - * Otherwise we have to do the normal free thing to account for - * the case that the free side didn't free up its reserve - * because of this outstanding reservation. - */ - if (BTRFS_I(inode)->csum_bytes == csum_bytes) - calc_csum_metadata_size(inode, num_bytes, 0); - else - to_free = calc_csum_metadata_size(inode, num_bytes, 0); - spin_unlock(&BTRFS_I(inode)->lock); - if (dropped) - to_free += btrfs_calc_trans_metadata_size(root, dropped); - - if (to_free) { - btrfs_block_rsv_release(root, block_rsv, to_free); - trace_btrfs_space_reservation(root->fs_info, - "delalloc", - btrfs_ino(inode), - to_free, 0); - } - if (root->fs_info->quota_enabled) { + ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); + if (unlikely(ret)) { + if (root->fs_info->quota_enabled) btrfs_qgroup_free(root, num_bytes + nr_extents * root->leafsize); - } - if (delalloc_lock) - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); - return ret; + goto out_fail; } spin_lock(&BTRFS_I(inode)->lock); @@ -4647,6 +4782,34 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) block_rsv_add_bytes(block_rsv, to_reserve, 1); return 0; + +out_fail: + spin_lock(&BTRFS_I(inode)->lock); + dropped = drop_outstanding_extent(inode); + /* + * If the inodes csum_bytes is the same as the original + * csum_bytes then we know we haven't raced with any free()ers + * so we can just reduce our inodes csum bytes and carry on. + * Otherwise we have to do the normal free thing to account for + * the case that the free side didn't free up its reserve + * because of this outstanding reservation. + */ + if (BTRFS_I(inode)->csum_bytes == csum_bytes) + calc_csum_metadata_size(inode, num_bytes, 0); + else + to_free = calc_csum_metadata_size(inode, num_bytes, 0); + spin_unlock(&BTRFS_I(inode)->lock); + if (dropped) + to_free += btrfs_calc_trans_metadata_size(root, dropped); + + if (to_free) { + btrfs_block_rsv_release(root, block_rsv, to_free); + trace_btrfs_space_reservation(root->fs_info, "delalloc", + btrfs_ino(inode), to_free, 0); + } + if (delalloc_lock) + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); + return ret; } /** @@ -4668,7 +4831,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) spin_lock(&BTRFS_I(inode)->lock); dropped = drop_outstanding_extent(inode); - to_free = calc_csum_metadata_size(inode, num_bytes, 0); + if (num_bytes) + to_free = calc_csum_metadata_size(inode, num_bytes, 0); spin_unlock(&BTRFS_I(inode)->lock); if (dropped > 0) to_free += btrfs_calc_trans_metadata_size(root, dropped); @@ -4735,8 +4899,7 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) btrfs_free_reserved_data_space(inode, num_bytes); } -static int update_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static int update_block_group(struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc) { struct btrfs_block_group_cache *cache = NULL; @@ -4773,7 +4936,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, * space back to the block group, otherwise we will leak space. */ if (!alloc && cache->cached == BTRFS_CACHE_NO) - cache_block_group(cache, trans, NULL, 1); + cache_block_group(cache, 1); byte_in_group = bytenr - cache->key.objectid; WARN_ON(byte_in_group > cache->key.offset); @@ -4823,6 +4986,13 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) struct btrfs_block_group_cache *cache; u64 bytenr; + spin_lock(&root->fs_info->block_group_cache_lock); + bytenr = root->fs_info->first_logical_byte; + spin_unlock(&root->fs_info->block_group_cache_lock); + + if (bytenr < (u64)-1) + return bytenr; + cache = btrfs_lookup_first_block_group(root->fs_info, search_start); if (!cache) return 0; @@ -4873,8 +5043,7 @@ int btrfs_pin_extent(struct btrfs_root *root, /* * this function must be called within transaction */ -int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, u64 bytenr, u64 num_bytes) { struct btrfs_block_group_cache *cache; @@ -4888,7 +5057,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, * to one because the slow code to read in the free extents does check * the pinned extents. */ - cache_block_group(cache, trans, root, 1); + cache_block_group(cache, 1); pin_down_extent(root, cache, bytenr, num_bytes, 0); @@ -5285,7 +5454,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } - ret = update_block_group(trans, root, bytenr, num_bytes, 0); + ret = update_block_group(root, bytenr, num_bytes, 0); if (ret) { btrfs_abort_transaction(trans, extent_root, ret); goto out; @@ -5330,7 +5499,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (head->extent_op) { if (!head->must_insert_reserved) goto out; - kfree(head->extent_op); + btrfs_free_delayed_extent_op(head->extent_op); head->extent_op = NULL; } @@ -5453,10 +5622,11 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, return ret; } -static u64 stripe_align(struct btrfs_root *root, u64 val) +static u64 stripe_align(struct btrfs_root *root, + struct btrfs_block_group_cache *cache, + u64 val, u64 num_bytes) { - u64 mask = ((u64)root->stripesize - 1); - u64 ret = (val + mask) & ~mask; + u64 ret = ALIGN(val, root->stripesize); return ret; } @@ -5476,7 +5646,6 @@ wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, u64 num_bytes) { struct btrfs_caching_control *caching_ctl; - DEFINE_WAIT(wait); caching_ctl = get_caching_control(cache); if (!caching_ctl) @@ -5493,7 +5662,6 @@ static noinline int wait_block_group_cache_done(struct btrfs_block_group_cache *cache) { struct btrfs_caching_control *caching_ctl; - DEFINE_WAIT(wait); caching_ctl = get_caching_control(cache); if (!caching_ctl) @@ -5507,20 +5675,20 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache) int __get_raid_index(u64 flags) { - int index; - if (flags & BTRFS_BLOCK_GROUP_RAID10) - index = 0; + return BTRFS_RAID_RAID10; else if (flags & BTRFS_BLOCK_GROUP_RAID1) - index = 1; + return BTRFS_RAID_RAID1; else if (flags & BTRFS_BLOCK_GROUP_DUP) - index = 2; + return BTRFS_RAID_DUP; else if (flags & BTRFS_BLOCK_GROUP_RAID0) - index = 3; - else - index = 4; + return BTRFS_RAID_RAID0; + else if (flags & BTRFS_BLOCK_GROUP_RAID5) + return BTRFS_RAID_RAID5; + else if (flags & BTRFS_BLOCK_GROUP_RAID6) + return BTRFS_RAID_RAID6; - return index; + return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ } static int get_block_group_index(struct btrfs_block_group_cache *cache) @@ -5663,6 +5831,8 @@ search: if (!block_group_bits(block_group, data)) { u64 extra = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10; /* @@ -5678,8 +5848,7 @@ have_block_group: cached = block_group_cache_done(block_group); if (unlikely(!cached)) { found_uncached_bg = true; - ret = cache_block_group(block_group, trans, - orig_root, 0); + ret = cache_block_group(block_group, 0); BUG_ON(ret < 0); ret = 0; } @@ -5692,6 +5861,7 @@ have_block_group: * lets look there */ if (last_ptr) { + unsigned long aligned_cluster; /* * the refill lock keeps out other * people trying to start a new cluster @@ -5758,11 +5928,15 @@ refill_cluster: goto unclustered_alloc; } + aligned_cluster = max_t(unsigned long, + empty_cluster + empty_size, + block_group->full_stripe_len); + /* allocate a cluster in this block group */ ret = btrfs_find_space_cluster(trans, root, block_group, last_ptr, search_start, num_bytes, - empty_cluster + empty_size); + aligned_cluster); if (ret == 0) { /* * now pull our allocation out of this @@ -5833,7 +6007,8 @@ unclustered_alloc: goto loop; } checks: - search_start = stripe_align(root, offset); + search_start = stripe_align(root, used_block_group, + offset, num_bytes); /* move on to the next group */ if (search_start + num_bytes > @@ -5984,7 +6159,7 @@ again: if (ret == -ENOSPC) { if (!final_tried) { num_bytes = num_bytes >> 1; - num_bytes = num_bytes & ~(root->sectorsize - 1); + num_bytes = round_down(num_bytes, root->sectorsize); num_bytes = max(num_bytes, min_alloc_size); if (num_bytes == min_alloc_size) final_tried = true; @@ -6108,7 +6283,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); + ret = update_block_group(root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ printk(KERN_ERR "btrfs update block group failed for %llu " "%llu\n", (unsigned long long)ins->objectid, @@ -6172,7 +6347,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); - ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); + ret = update_block_group(root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ printk(KERN_ERR "btrfs update block group failed for %llu " "%llu\n", (unsigned long long)ins->objectid, @@ -6215,7 +6390,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, u64 num_bytes = ins->offset; block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); - cache_block_group(block_group, trans, NULL, 0); + cache_block_group(block_group, 0); caching_ctl = get_caching_control(block_group); if (!caching_ctl) { @@ -6329,12 +6504,14 @@ use_block_rsv(struct btrfs_trans_handle *trans, if (!ret) return block_rsv; if (ret && !block_rsv->failfast) { - static DEFINE_RATELIMIT_STATE(_rs, - DEFAULT_RATELIMIT_INTERVAL, - /*DEFAULT_RATELIMIT_BURST*/ 2); - if (__ratelimit(&_rs)) - WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n", - ret); + if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + static DEFINE_RATELIMIT_STATE(_rs, + DEFAULT_RATELIMIT_INTERVAL * 10, + /*DEFAULT_RATELIMIT_BURST*/ 1); + if (__ratelimit(&_rs)) + WARN(1, KERN_DEBUG + "btrfs: block rsv returned %d\n", ret); + } ret = reserve_metadata_bytes(root, block_rsv, blocksize, BTRFS_RESERVE_NO_FLUSH); if (!ret) { @@ -6400,7 +6577,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { struct btrfs_delayed_extent_op *extent_op; - extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + extent_op = btrfs_alloc_delayed_extent_op(); BUG_ON(!extent_op); /* -ENOMEM */ if (key) memcpy(&extent_op->key, key, sizeof(extent_op->key)); @@ -7203,6 +7380,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) root->fs_info->fs_devices->missing_devices; stripped = BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; if (num_devices == 1) { @@ -7481,16 +7659,16 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) index = get_block_group_index(block_group); } - if (index == 0) { + if (index == BTRFS_RAID_RAID10) { dev_min = 4; /* Divide by 2 */ min_free >>= 1; - } else if (index == 1) { + } else if (index == BTRFS_RAID_RAID1) { dev_min = 2; - } else if (index == 2) { + } else if (index == BTRFS_RAID_DUP) { /* Multiply by 2 */ min_free <<= 1; - } else if (index == 3) { + } else if (index == BTRFS_RAID_RAID0) { dev_min = fs_devices->rw_devices; do_div(min_free, dev_min); } @@ -7651,11 +7829,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) space_info = list_entry(info->space_info.next, struct btrfs_space_info, list); - if (space_info->bytes_pinned > 0 || - space_info->bytes_reserved > 0 || - space_info->bytes_may_use > 0) { - WARN_ON(1); - dump_space_info(space_info, 0, 0); + if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) { + if (space_info->bytes_pinned > 0 || + space_info->bytes_reserved > 0 || + space_info->bytes_may_use > 0) { + WARN_ON(1); + dump_space_info(space_info, 0, 0); + } } list_del(&space_info->list); kfree(space_info); @@ -7754,7 +7934,9 @@ int btrfs_read_block_groups(struct btrfs_root *root) btrfs_release_path(path); cache->flags = btrfs_block_group_flags(&cache->item); cache->sectorsize = root->sectorsize; - + cache->full_stripe_len = btrfs_full_stripe_len(root, + &root->fs_info->mapping_tree, + found_key.objectid); btrfs_init_free_space_ctl(cache); /* @@ -7808,6 +7990,8 @@ int btrfs_read_block_groups(struct btrfs_root *root) if (!(get_alloc_profile(root, space_info->flags) & (BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_DUP))) continue; /* @@ -7883,6 +8067,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; cache->sectorsize = root->sectorsize; cache->fs_info = root->fs_info; + cache->full_stripe_len = btrfs_full_stripe_len(root, + &root->fs_info->mapping_tree, + chunk_offset); atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); @@ -7932,12 +8119,14 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) u64 extra_flags = chunk_to_extended(flags) & BTRFS_EXTENDED_PROFILE_MASK; + write_seqlock(&fs_info->profiles_lock); if (flags & BTRFS_BLOCK_GROUP_DATA) fs_info->avail_data_alloc_bits &= ~extra_flags; if (flags & BTRFS_BLOCK_GROUP_METADATA) fs_info->avail_metadata_alloc_bits &= ~extra_flags; if (flags & BTRFS_BLOCK_GROUP_SYSTEM) fs_info->avail_system_alloc_bits &= ~extra_flags; + write_sequnlock(&fs_info->profiles_lock); } int btrfs_remove_block_group(struct btrfs_trans_handle *trans, @@ -8036,6 +8225,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_lock(&root->fs_info->block_group_cache_lock); rb_erase(&block_group->cache_node, &root->fs_info->block_group_cache_tree); + + if (root->fs_info->first_logical_byte == block_group->key.objectid) + root->fs_info->first_logical_byte = (u64)-1; spin_unlock(&root->fs_info->block_group_cache_lock); down_write(&block_group->space_info->groups_sem); @@ -8158,7 +8350,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) if (end - start >= range->minlen) { if (!block_group_cache_done(cache)) { - ret = cache_block_group(cache, NULL, root, 0); + ret = cache_block_group(cache, 0); if (!ret) wait_block_group_cache_done(cache); } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1b319df..f173c5a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4,7 +4,6 @@ #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/page-flags.h> -#include <linux/module.h> #include <linux/spinlock.h> #include <linux/blkdev.h> #include <linux/swap.h> @@ -1834,7 +1833,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, */ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) { - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 start = page_offset(page); u64 end = start + PAGE_CACHE_SIZE - 1; if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) SetPageUptodate(page); @@ -1846,7 +1845,7 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) */ static void check_page_locked(struct extent_io_tree *tree, struct page *page) { - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 start = page_offset(page); u64 end = start + PAGE_CACHE_SIZE - 1; if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) unlock_page(page); @@ -1895,13 +1894,11 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec, if (ret) err = ret; - if (did_repair) { - ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, - rec->start + rec->len - 1, - EXTENT_DAMAGED, GFP_NOFS); - if (ret && !err) - err = ret; - } + ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, + rec->start + rec->len - 1, + EXTENT_DAMAGED, GFP_NOFS); + if (ret && !err) + err = ret; kfree(rec); return err; @@ -1932,10 +1929,15 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, u64 map_length = 0; u64 sector; struct btrfs_bio *bbio = NULL; + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; int ret; BUG_ON(!mirror_num); + /* we can't repair anything in raid56 yet */ + if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num)) + return 0; + bio = bio_alloc(GFP_NOFS, 1); if (!bio) return -EIO; @@ -1960,7 +1962,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, return -EIO; } bio->bi_bdev = dev->bdev; - bio_add_page(bio, page, length, start-page_offset(page)); + bio_add_page(bio, page, length, start - page_offset(page)); btrfsic_submit_bio(WRITE_SYNC, bio); wait_for_completion(&compl); @@ -2052,6 +2054,7 @@ static int clean_io_failure(u64 start, struct page *page) failrec->failed_mirror); did_repair = !ret; } + ret = 0; } out: @@ -2293,8 +2296,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err) struct page *page = bvec->bv_page; tree = &BTRFS_I(page->mapping->host)->io_tree; - start = ((u64)page->index << PAGE_CACHE_SHIFT) + - bvec->bv_offset; + start = page_offset(page) + bvec->bv_offset; end = start + bvec->bv_len - 1; if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) @@ -2353,8 +2355,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) (long int)bio->bi_bdev); tree = &BTRFS_I(page->mapping->host)->io_tree; - start = ((u64)page->index << PAGE_CACHE_SHIFT) + - bvec->bv_offset; + start = page_offset(page) + bvec->bv_offset; end = start + bvec->bv_len - 1; if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) @@ -2471,7 +2472,7 @@ static int __must_check submit_one_bio(int rw, struct bio *bio, struct extent_io_tree *tree = bio->bi_private; u64 start; - start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; + start = page_offset(page) + bvec->bv_offset; bio->bi_private = NULL; @@ -2489,13 +2490,13 @@ static int __must_check submit_one_bio(int rw, struct bio *bio, return ret; } -static int merge_bio(struct extent_io_tree *tree, struct page *page, +static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags) { int ret = 0; if (tree->ops && tree->ops->merge_bio_hook) - ret = tree->ops->merge_bio_hook(page, offset, size, bio, + ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio, bio_flags); BUG_ON(ret < 0); return ret; @@ -2530,7 +2531,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, sector; if (prev_bio_flags != bio_flags || !contig || - merge_bio(tree, page, offset, page_size, bio, bio_flags) || + merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) || bio_add_page(bio, page, page_size, offset) < page_size) { ret = submit_one_bio(rw, bio, mirror_num, prev_bio_flags); @@ -2595,7 +2596,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, unsigned long *bio_flags) { struct inode *inode = page->mapping->host; - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 start = page_offset(page); u64 page_end = start + PAGE_CACHE_SIZE - 1; u64 end; u64 cur = start; @@ -2648,6 +2649,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree, } } while (cur <= end) { + unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; + if (cur >= last_byte) { char *userpage; struct extent_state *cached = NULL; @@ -2682,7 +2685,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, iosize = min(extent_map_end(em) - cur, end - cur + 1); cur_end = min(extent_map_end(em) - 1, end); - iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); + iosize = ALIGN(iosize, blocksize); if (this_bio_flag & EXTENT_BIO_COMPRESSED) { disk_io_size = em->block_len; sector = em->block_start >> 9; @@ -2735,26 +2738,17 @@ static int __extent_read_full_page(struct extent_io_tree *tree, continue; } - ret = 0; - if (tree->ops && tree->ops->readpage_io_hook) { - ret = tree->ops->readpage_io_hook(page, cur, - cur + iosize - 1); - } - if (!ret) { - unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; - pnr -= page->index; - ret = submit_extent_page(READ, tree, page, + pnr -= page->index; + ret = submit_extent_page(READ, tree, page, sector, disk_io_size, pg_offset, bdev, bio, pnr, end_bio_extent_readpage, mirror_num, *bio_flags, this_bio_flag); - if (!ret) { - nr++; - *bio_flags = this_bio_flag; - } - } - if (ret) { + if (!ret) { + nr++; + *bio_flags = this_bio_flag; + } else { SetPageError(page); unlock_extent(tree, cur, cur + iosize - 1); } @@ -2806,7 +2800,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, struct inode *inode = page->mapping->host; struct extent_page_data *epd = data; struct extent_io_tree *tree = epd->tree; - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 start = page_offset(page); u64 delalloc_start; u64 page_end = start + PAGE_CACHE_SIZE - 1; u64 end; @@ -2982,7 +2976,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, BUG_ON(extent_map_end(em) <= cur); BUG_ON(end < cur); iosize = min(extent_map_end(em) - cur, end - cur + 1); - iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); + iosize = ALIGN(iosize, blocksize); sector = (em->block_start + extent_offset) >> 9; bdev = em->bdev; block_start = em->block_start; @@ -3124,12 +3118,9 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb, set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); spin_unlock(&eb->refs_lock); btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); - spin_lock(&fs_info->delalloc_lock); - if (fs_info->dirty_metadata_bytes >= eb->len) - fs_info->dirty_metadata_bytes -= eb->len; - else - WARN_ON(1); - spin_unlock(&fs_info->delalloc_lock); + __percpu_counter_add(&fs_info->dirty_metadata_bytes, + -eb->len, + fs_info->dirty_metadata_batch); ret = 1; } else { spin_unlock(&eb->refs_lock); @@ -3446,15 +3437,9 @@ retry: * swizzled back from swapper_space to tmpfs file * mapping */ - if (tree->ops && - tree->ops->write_cache_pages_lock_hook) { - tree->ops->write_cache_pages_lock_hook(page, - data, flush_fn); - } else { - if (!trylock_page(page)) { - flush_fn(data); - lock_page(page); - } + if (!trylock_page(page)) { + flush_fn(data); + lock_page(page); } if (unlikely(page->mapping != mapping)) { @@ -3674,11 +3659,11 @@ int extent_invalidatepage(struct extent_io_tree *tree, struct page *page, unsigned long offset) { struct extent_state *cached_state = NULL; - u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); + u64 start = page_offset(page); u64 end = start + PAGE_CACHE_SIZE - 1; size_t blocksize = page->mapping->host->i_sb->s_blocksize; - start += (offset + blocksize - 1) & ~(blocksize - 1); + start += ALIGN(offset, blocksize); if (start > end) return 0; @@ -3700,7 +3685,7 @@ int try_release_extent_state(struct extent_map_tree *map, struct extent_io_tree *tree, struct page *page, gfp_t mask) { - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 start = page_offset(page); u64 end = start + PAGE_CACHE_SIZE - 1; int ret = 1; @@ -3739,7 +3724,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, gfp_t mask) { struct extent_map *em; - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 start = page_offset(page); u64 end = start + PAGE_CACHE_SIZE - 1; if ((mask & __GFP_WAIT) && @@ -3797,7 +3782,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, len = last - offset; if (len == 0) break; - len = (len + sectorsize - 1) & ~(sectorsize - 1); + len = ALIGN(len, sectorsize); em = get_extent(inode, NULL, 0, offset, len, 0); if (IS_ERR_OR_NULL(em)) return em; @@ -3995,8 +3980,6 @@ static void __free_extent_buffer(struct extent_buffer *eb) list_del(&eb->leak_list); spin_unlock_irqrestore(&leak_lock, flags); #endif - if (eb->pages && eb->pages != eb->inline_pages) - kfree(eb->pages); kmem_cache_free(extent_buffer_cache, eb); } @@ -4037,19 +4020,12 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, atomic_set(&eb->refs, 1); atomic_set(&eb->io_pages, 0); - if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) { - struct page **pages; - int num_pages = (len + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; - pages = kzalloc(num_pages, mask); - if (!pages) { - __free_extent_buffer(eb); - return NULL; - } - eb->pages = pages; - } else { - eb->pages = eb->inline_pages; - } + /* + * Sanity checks, currently the maximum is 64k covered by 16x 4k pages + */ + BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE + > MAX_INLINE_EXTENT_BUFFER_SIZE); + BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); return eb; } @@ -4180,6 +4156,7 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) static void check_buffer_tree_ref(struct extent_buffer *eb) { + int refs; /* the ref bit is tricky. We have to make sure it is set * if we have the buffer dirty. Otherwise the * code to free a buffer can end up dropping a dirty @@ -4200,6 +4177,10 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) * So bump the ref count first, then set the bit. If someone * beat us to it, drop the ref we added. */ + refs = atomic_read(&eb->refs); + if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + return; + spin_lock(&eb->refs_lock); if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) atomic_inc(&eb->refs); @@ -4401,9 +4382,20 @@ static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask) void free_extent_buffer(struct extent_buffer *eb) { + int refs; + int old; if (!eb) return; + while (1) { + refs = atomic_read(&eb->refs); + if (refs <= 3) + break; + old = atomic_cmpxchg(&eb->refs, refs, refs - 1); + if (old == refs) + return; + } + spin_lock(&eb->refs_lock); if (atomic_read(&eb->refs) == 2 && test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 2eacfab..6068a19 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -72,10 +72,9 @@ struct extent_io_ops { int (*writepage_start_hook)(struct page *page, u64 start, u64 end); int (*writepage_io_hook)(struct page *page, u64 start, u64 end); extent_submit_bio_hook_t *submit_bio_hook; - int (*merge_bio_hook)(struct page *page, unsigned long offset, + int (*merge_bio_hook)(int rw, struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); - int (*readpage_io_hook)(struct page *page, u64 start, u64 end); int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, struct extent_state *state, int mirror); @@ -90,8 +89,6 @@ struct extent_io_ops { struct extent_state *other); void (*split_extent_hook)(struct inode *inode, struct extent_state *orig, u64 split); - int (*write_cache_pages_lock_hook)(struct page *page, void *data, - void (*flush_fn)(void *)); }; struct extent_io_tree { @@ -161,8 +158,7 @@ struct extent_buffer { */ wait_queue_head_t read_lock_wq; wait_queue_head_t lock_wq; - struct page *inline_pages[INLINE_EXTENT_BUFFER_PAGES]; - struct page **pages; + struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; }; static inline void extent_set_compress_type(unsigned long *bio_flags, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index fdb7a8d..2834ca57 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1,6 +1,5 @@ #include <linux/err.h> #include <linux/slab.h> -#include <linux/module.h> #include <linux/spinlock.h> #include <linux/hardirq.h> #include "ctree.h" diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 94aa53b..ec16020 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -684,6 +684,24 @@ out: return ret; } +static u64 btrfs_sector_sum_left(struct btrfs_ordered_sum *sums, + struct btrfs_sector_sum *sector_sum, + u64 total_bytes, u64 sectorsize) +{ + u64 tmp = sectorsize; + u64 next_sector = sector_sum->bytenr; + struct btrfs_sector_sum *next = sector_sum + 1; + + while ((tmp + total_bytes) < sums->len) { + if (next_sector + sectorsize != next->bytenr) + break; + tmp += sectorsize; + next_sector = next->bytenr; + next++; + } + return tmp; +} + int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums) @@ -789,20 +807,32 @@ again: goto insert; } - if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) / + if (csum_offset == btrfs_item_size_nr(leaf, path->slots[0]) / csum_size) { - u32 diff = (csum_offset + 1) * csum_size; + int extend_nr; + u64 tmp; + u32 diff; + u32 free_space; - /* - * is the item big enough already? we dropped our lock - * before and need to recheck - */ - if (diff < btrfs_item_size_nr(leaf, path->slots[0])) - goto csum; + if (btrfs_leaf_free_space(root, leaf) < + sizeof(struct btrfs_item) + csum_size * 2) + goto insert; + + free_space = btrfs_leaf_free_space(root, leaf) - + sizeof(struct btrfs_item) - csum_size; + tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes, + root->sectorsize); + tmp >>= root->fs_info->sb->s_blocksize_bits; + WARN_ON(tmp < 1); + + extend_nr = max_t(int, 1, (int)tmp); + diff = (csum_offset + extend_nr) * csum_size; + diff = min(diff, MAX_CSUM_ITEMS(root, csum_size) * csum_size); diff = diff - btrfs_item_size_nr(leaf, path->slots[0]); - if (diff != csum_size) - goto insert; + diff = min(free_space, diff); + diff /= csum_size; + diff *= csum_size; btrfs_extend_item(trans, root, path, diff); goto csum; @@ -812,19 +842,14 @@ insert: btrfs_release_path(path); csum_offset = 0; if (found_next) { - u64 tmp = total_bytes + root->sectorsize; - u64 next_sector = sector_sum->bytenr; - struct btrfs_sector_sum *next = sector_sum + 1; + u64 tmp; - while (tmp < sums->len) { - if (next_sector + root->sectorsize != next->bytenr) - break; - tmp += root->sectorsize; - next_sector = next->bytenr; - next++; - } - tmp = min(tmp, next_offset - file_key.offset); + tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes, + root->sectorsize); tmp >>= root->fs_info->sb->s_blocksize_bits; + tmp = min(tmp, (next_offset - file_key.offset) >> + root->fs_info->sb->s_blocksize_bits); + tmp = max((u64)1, tmp); tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size)); ins_size = csum_size * tmp; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4b241fe..af1d060 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -30,11 +30,11 @@ #include <linux/statfs.h> #include <linux/compat.h> #include <linux/slab.h> +#include <linux/btrfs.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "ioctl.h" #include "print-tree.h" #include "tree-log.h" #include "locking.h" @@ -374,6 +374,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) atomic_inc(&fs_info->defrag_running); while(1) { + /* Pause the auto defragger. */ + if (test_bit(BTRFS_FS_STATE_REMOUNTING, + &fs_info->fs_state)) + break; + if (!__need_auto_defrag(fs_info->tree_root)) break; @@ -505,8 +510,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, loff_t isize = i_size_read(inode); start_pos = pos & ~((u64)root->sectorsize - 1); - num_bytes = (write_bytes + pos - start_pos + - root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize); end_of_last_block = start_pos + num_bytes - 1; err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, @@ -1544,7 +1548,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, * although we have opened a file as writable, we have * to stop this write operation to ensure FS consistency. */ - if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) { mutex_unlock(&inode->i_mutex); err = -EROFS; goto out; @@ -1627,7 +1631,20 @@ int btrfs_release_file(struct inode *inode, struct file *filp) */ if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, &BTRFS_I(inode)->runtime_flags)) { - btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode); + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + + /* + * We need to block on a committing transaction to keep us from + * throwing a ordered operation on to the list and causing + * something like sync to deadlock trying to flush out this + * inode. + */ + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode); + btrfs_end_transaction(trans, root); if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) filemap_flush(inode->i_mapping); } @@ -1654,16 +1671,21 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; struct btrfs_trans_handle *trans; + bool full_sync = 0; trace_btrfs_sync_file(file, datasync); /* * We write the dirty pages in the range and wait until they complete * out of the ->i_mutex. If so, we can flush the dirty pages by - * multi-task, and make the performance up. + * multi-task, and make the performance up. See + * btrfs_wait_ordered_range for an explanation of the ASYNC check. */ atomic_inc(&BTRFS_I(inode)->sync_writers); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + ret = filemap_fdatawrite_range(inode->i_mapping, start, end); + if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + ret = filemap_fdatawrite_range(inode->i_mapping, start, end); atomic_dec(&BTRFS_I(inode)->sync_writers); if (ret) return ret; @@ -1675,7 +1697,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * range being left. */ atomic_inc(&root->log_batch); - btrfs_wait_ordered_range(inode, start, end - start + 1); + full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + if (full_sync) + btrfs_wait_ordered_range(inode, start, end - start + 1); atomic_inc(&root->log_batch); /* @@ -1742,13 +1767,25 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret != BTRFS_NO_LOG_SYNC) { if (ret > 0) { + /* + * If we didn't already wait for ordered extents we need + * to do that now. + */ + if (!full_sync) + btrfs_wait_ordered_range(inode, start, + end - start + 1); ret = btrfs_commit_transaction(trans, root); } else { ret = btrfs_sync_log(trans, root); - if (ret == 0) + if (ret == 0) { ret = btrfs_end_transaction(trans, root); - else + } else { + if (!full_sync) + btrfs_wait_ordered_range(inode, start, + end - + start + 1); ret = btrfs_commit_transaction(trans, root); + } } } else { ret = btrfs_end_transaction(trans, root); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 0be7a87..1f84fc0 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1356,6 +1356,8 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); + max_bitmaps = max(max_bitmaps, 1); + BUG_ON(ctl->total_bitmaps > max_bitmaps); /* @@ -1463,10 +1465,14 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, } static struct btrfs_free_space * -find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes) +find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, + unsigned long align) { struct btrfs_free_space *entry; struct rb_node *node; + u64 ctl_off; + u64 tmp; + u64 align_off; int ret; if (!ctl->free_space_offset.rb_node) @@ -1481,15 +1487,34 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes) if (entry->bytes < *bytes) continue; + /* make sure the space returned is big enough + * to match our requested alignment + */ + if (*bytes >= align) { + ctl_off = entry->offset - ctl->start; + tmp = ctl_off + align - 1;; + do_div(tmp, align); + tmp = tmp * align + ctl->start; + align_off = tmp - entry->offset; + } else { + align_off = 0; + tmp = entry->offset; + } + + if (entry->bytes < *bytes + align_off) + continue; + if (entry->bitmap) { - ret = search_bitmap(ctl, entry, offset, bytes); - if (!ret) + ret = search_bitmap(ctl, entry, &tmp, bytes); + if (!ret) { + *offset = tmp; return entry; + } continue; } - *offset = entry->offset; - *bytes = entry->bytes; + *offset = tmp; + *bytes = entry->bytes - align_off; return entry; } @@ -1636,10 +1661,14 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, } /* - * some block groups are so tiny they can't be enveloped by a bitmap, so - * don't even bother to create a bitmap for this + * The original block groups from mkfs can be really small, like 8 + * megabytes, so don't bother with a bitmap for those entries. However + * some block groups can be smaller than what a bitmap would cover but + * are still large enough that they could overflow the 32k memory limit, + * so allow those block groups to still be allowed to have a bitmap + * entry. */ - if (BITS_PER_BITMAP * ctl->unit > block_group->key.offset) + if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset) return false; return true; @@ -2095,9 +2124,12 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, struct btrfs_free_space *entry = NULL; u64 bytes_search = bytes + empty_size; u64 ret = 0; + u64 align_gap = 0; + u64 align_gap_len = 0; spin_lock(&ctl->tree_lock); - entry = find_free_space(ctl, &offset, &bytes_search); + entry = find_free_space(ctl, &offset, &bytes_search, + block_group->full_stripe_len); if (!entry) goto out; @@ -2107,9 +2139,15 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, if (!entry->bytes) free_bitmap(ctl, entry); } else { + unlink_free_space(ctl, entry); - entry->offset += bytes; - entry->bytes -= bytes; + align_gap_len = offset - entry->offset; + align_gap = entry->offset; + + entry->offset = offset + bytes; + WARN_ON(entry->bytes < bytes + align_gap_len); + + entry->bytes -= bytes + align_gap_len; if (!entry->bytes) kmem_cache_free(btrfs_free_space_cachep, entry); else @@ -2119,6 +2157,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, out: spin_unlock(&ctl->tree_lock); + if (align_gap_len) + __btrfs_add_free_space(ctl, align_gap, align_gap_len); return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 55c07b6..c226dae 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -39,12 +39,13 @@ #include <linux/slab.h> #include <linux/ratelimit.h> #include <linux/mount.h> +#include <linux/btrfs.h> +#include <linux/blkdev.h> #include "compat.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "ioctl.h" #include "print-tree.h" #include "ordered-data.h" #include "xattr.h" @@ -54,6 +55,7 @@ #include "locking.h" #include "free-space-cache.h" #include "inode-map.h" +#include "backref.h" struct btrfs_iget_args { u64 ino; @@ -231,8 +233,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, u64 isize = i_size_read(inode); u64 actual_end = min(end + 1, isize); u64 inline_len = actual_end - start; - u64 aligned_end = (end + root->sectorsize - 1) & - ~((u64)root->sectorsize - 1); + u64 aligned_end = ALIGN(end, root->sectorsize); u64 data_len = inline_len; int ret; @@ -265,6 +266,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, return 1; } + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); btrfs_delalloc_release_metadata(inode, end + 1 - start); btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); return 0; @@ -389,7 +391,7 @@ again: * a compressed extent to 128k. */ total_compressed = min(total_compressed, max_uncompressed); - num_bytes = (end - start + blocksize) & ~(blocksize - 1); + num_bytes = ALIGN(end - start + 1, blocksize); num_bytes = max(blocksize, num_bytes); total_in = 0; ret = 0; @@ -488,15 +490,13 @@ cont: * up to a block size boundary so the allocator does sane * things */ - total_compressed = (total_compressed + blocksize - 1) & - ~(blocksize - 1); + total_compressed = ALIGN(total_compressed, blocksize); /* * one last check to make sure the compression is really a * win, compare the page count read with the blocks on disk */ - total_in = (total_in + PAGE_CACHE_SIZE - 1) & - ~(PAGE_CACHE_SIZE - 1); + total_in = ALIGN(total_in, PAGE_CACHE_SIZE); if (total_compressed >= total_in) { will_compress = 0; } else { @@ -608,7 +608,7 @@ static noinline int submit_compressed_extents(struct inode *inode, if (list_empty(&async_cow->extents)) return 0; - +again: while (!list_empty(&async_cow->extents)) { async_extent = list_entry(async_cow->extents.next, struct async_extent, list); @@ -648,6 +648,8 @@ retry: async_extent->ram_size - 1, btrfs_get_extent, WB_SYNC_ALL); + else if (ret) + unlock_page(async_cow->locked_page); kfree(async_extent); cond_resched(); continue; @@ -672,6 +674,7 @@ retry: if (ret) { int i; + for (i = 0; i < async_extent->nr_pages; i++) { WARN_ON(async_extent->pages[i]->mapping); page_cache_release(async_extent->pages[i]); @@ -679,12 +682,10 @@ retry: kfree(async_extent->pages); async_extent->nr_pages = 0; async_extent->pages = NULL; - unlock_extent(io_tree, async_extent->start, - async_extent->start + - async_extent->ram_size - 1); + if (ret == -ENOSPC) goto retry; - goto out_free; /* JDM: Requeue? */ + goto out_free; } /* @@ -696,10 +697,13 @@ retry: async_extent->ram_size - 1, 0); em = alloc_extent_map(); - BUG_ON(!em); /* -ENOMEM */ + if (!em) + goto out_free_reserve; em->start = async_extent->start; em->len = async_extent->ram_size; em->orig_start = em->start; + em->mod_start = em->start; + em->mod_len = em->len; em->block_start = ins.objectid; em->block_len = ins.offset; @@ -726,6 +730,9 @@ retry: async_extent->ram_size - 1, 0); } + if (ret) + goto out_free_reserve; + ret = btrfs_add_ordered_extent_compress(inode, async_extent->start, ins.objectid, @@ -733,7 +740,8 @@ retry: ins.offset, BTRFS_ORDERED_COMPRESSED, async_extent->compress_type); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + goto out_free_reserve; /* * clear dirty, set writeback and unlock the pages. @@ -754,18 +762,30 @@ retry: ins.objectid, ins.offset, async_extent->pages, async_extent->nr_pages); - - BUG_ON(ret); /* -ENOMEM */ alloc_hint = ins.objectid + ins.offset; kfree(async_extent); + if (ret) + goto out; cond_resched(); } ret = 0; out: return ret; +out_free_reserve: + btrfs_free_reserved_extent(root, ins.objectid, ins.offset); out_free: + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, + async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + NULL, EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); kfree(async_extent); - goto out; + goto again; } static u64 get_extent_allocation_hint(struct inode *inode, u64 start, @@ -834,7 +854,7 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, BUG_ON(btrfs_is_free_space_inode(inode)); - num_bytes = (end - start + blocksize) & ~(blocksize - 1); + num_bytes = ALIGN(end - start + 1, blocksize); num_bytes = max(blocksize, num_bytes); disk_num_bytes = num_bytes; @@ -892,6 +912,8 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, em->orig_start = em->start; ram_size = ins.offset; em->len = ins.offset; + em->mod_start = em->start; + em->mod_len = em->len; em->block_start = ins.objectid; em->block_len = ins.offset; @@ -1338,6 +1360,8 @@ out_check: em->block_start = disk_bytenr; em->orig_block_len = disk_num_bytes; em->bdev = root->fs_info->fs_devices->latest_bdev; + em->mod_start = em->start; + em->mod_len = em->len; set_bit(EXTENT_FLAG_PINNED, &em->flags); set_bit(EXTENT_FLAG_FILLING, &em->flags); em->generation = -1; @@ -1508,14 +1532,22 @@ static void btrfs_set_bit_hook(struct inode *inode, spin_unlock(&BTRFS_I(inode)->lock); } - spin_lock(&root->fs_info->delalloc_lock); + __percpu_counter_add(&root->fs_info->delalloc_bytes, len, + root->fs_info->delalloc_batch); + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->delalloc_bytes += len; - root->fs_info->delalloc_bytes += len; - if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) { - list_add_tail(&BTRFS_I(inode)->delalloc_inodes, - &root->fs_info->delalloc_inodes); + if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &BTRFS_I(inode)->runtime_flags)) { + spin_lock(&root->fs_info->delalloc_lock); + if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + list_add_tail(&BTRFS_I(inode)->delalloc_inodes, + &root->fs_info->delalloc_inodes); + set_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &BTRFS_I(inode)->runtime_flags); + } + spin_unlock(&root->fs_info->delalloc_lock); } - spin_unlock(&root->fs_info->delalloc_lock); + spin_unlock(&BTRFS_I(inode)->lock); } } @@ -1550,15 +1582,22 @@ static void btrfs_clear_bit_hook(struct inode *inode, && do_list) btrfs_free_reserved_data_space(inode, len); - spin_lock(&root->fs_info->delalloc_lock); - root->fs_info->delalloc_bytes -= len; + __percpu_counter_add(&root->fs_info->delalloc_bytes, -len, + root->fs_info->delalloc_batch); + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->delalloc_bytes -= len; - if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && - !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { - list_del_init(&BTRFS_I(inode)->delalloc_inodes); + test_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &BTRFS_I(inode)->runtime_flags)) { + spin_lock(&root->fs_info->delalloc_lock); + if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + list_del_init(&BTRFS_I(inode)->delalloc_inodes); + clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &BTRFS_I(inode)->runtime_flags); + } + spin_unlock(&root->fs_info->delalloc_lock); } - spin_unlock(&root->fs_info->delalloc_lock); + spin_unlock(&BTRFS_I(inode)->lock); } } @@ -1566,7 +1605,7 @@ static void btrfs_clear_bit_hook(struct inode *inode, * extent_io.c merge_bio_hook, this must check the chunk tree to make sure * we don't create bios that span stripes or chunks */ -int btrfs_merge_bio_hook(struct page *page, unsigned long offset, +int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags) { @@ -1581,7 +1620,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, length = bio->bi_size; map_length = length; - ret = btrfs_map_block(root->fs_info, READ, logical, + ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, NULL, 0); /* Will always return 0 with map_multi == NULL */ BUG_ON(ret < 0); @@ -1892,6 +1931,640 @@ out: return ret; } +/* snapshot-aware defrag */ +struct sa_defrag_extent_backref { + struct rb_node node; + struct old_sa_defrag_extent *old; + u64 root_id; + u64 inum; + u64 file_pos; + u64 extent_offset; + u64 num_bytes; + u64 generation; +}; + +struct old_sa_defrag_extent { + struct list_head list; + struct new_sa_defrag_extent *new; + + u64 extent_offset; + u64 bytenr; + u64 offset; + u64 len; + int count; +}; + +struct new_sa_defrag_extent { + struct rb_root root; + struct list_head head; + struct btrfs_path *path; + struct inode *inode; + u64 file_pos; + u64 len; + u64 bytenr; + u64 disk_len; + u8 compress_type; +}; + +static int backref_comp(struct sa_defrag_extent_backref *b1, + struct sa_defrag_extent_backref *b2) +{ + if (b1->root_id < b2->root_id) + return -1; + else if (b1->root_id > b2->root_id) + return 1; + + if (b1->inum < b2->inum) + return -1; + else if (b1->inum > b2->inum) + return 1; + + if (b1->file_pos < b2->file_pos) + return -1; + else if (b1->file_pos > b2->file_pos) + return 1; + + /* + * [------------------------------] ===> (a range of space) + * |<--->| |<---->| =============> (fs/file tree A) + * |<---------------------------->| ===> (fs/file tree B) + * + * A range of space can refer to two file extents in one tree while + * refer to only one file extent in another tree. + * + * So we may process a disk offset more than one time(two extents in A) + * and locate at the same extent(one extent in B), then insert two same + * backrefs(both refer to the extent in B). + */ + return 0; +} + +static void backref_insert(struct rb_root *root, + struct sa_defrag_extent_backref *backref) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct sa_defrag_extent_backref *entry; + int ret; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct sa_defrag_extent_backref, node); + + ret = backref_comp(backref, entry); + if (ret < 0) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + rb_link_node(&backref->node, parent, p); + rb_insert_color(&backref->node, root); +} + +/* + * Note the backref might has changed, and in this case we just return 0. + */ +static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, + void *ctx) +{ + struct btrfs_file_extent_item *extent; + struct btrfs_fs_info *fs_info; + struct old_sa_defrag_extent *old = ctx; + struct new_sa_defrag_extent *new = old->new; + struct btrfs_path *path = new->path; + struct btrfs_key key; + struct btrfs_root *root; + struct sa_defrag_extent_backref *backref; + struct extent_buffer *leaf; + struct inode *inode = new->inode; + int slot; + int ret; + u64 extent_offset; + u64 num_bytes; + + if (BTRFS_I(inode)->root->root_key.objectid == root_id && + inum == btrfs_ino(inode)) + return 0; + + key.objectid = root_id; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + fs_info = BTRFS_I(inode)->root->fs_info; + root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(root)) { + if (PTR_ERR(root) == -ENOENT) + return 0; + WARN_ON(1); + pr_debug("inum=%llu, offset=%llu, root_id=%llu\n", + inum, offset, root_id); + return PTR_ERR(root); + } + + key.objectid = inum; + key.type = BTRFS_EXTENT_DATA_KEY; + if (offset > (u64)-1 << 32) + key.offset = 0; + else + key.offset = offset; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + WARN_ON(1); + return ret; + } + + while (1) { + cond_resched(); + + leaf = path->nodes[0]; + slot = path->slots[0]; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + goto out; + } + continue; + } + + path->slots[0]++; + + btrfs_item_key_to_cpu(leaf, &key, slot); + + if (key.objectid > inum) + goto out; + + if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY) + continue; + + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + + if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr) + continue; + + extent_offset = btrfs_file_extent_offset(leaf, extent); + if (key.offset - extent_offset != offset) + continue; + + num_bytes = btrfs_file_extent_num_bytes(leaf, extent); + if (extent_offset >= old->extent_offset + old->offset + + old->len || extent_offset + num_bytes <= + old->extent_offset + old->offset) + continue; + + break; + } + + backref = kmalloc(sizeof(*backref), GFP_NOFS); + if (!backref) { + ret = -ENOENT; + goto out; + } + + backref->root_id = root_id; + backref->inum = inum; + backref->file_pos = offset + extent_offset; + backref->num_bytes = num_bytes; + backref->extent_offset = extent_offset; + backref->generation = btrfs_file_extent_generation(leaf, extent); + backref->old = old; + backref_insert(&new->root, backref); + old->count++; +out: + btrfs_release_path(path); + WARN_ON(ret); + return ret; +} + +static noinline bool record_extent_backrefs(struct btrfs_path *path, + struct new_sa_defrag_extent *new) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info; + struct old_sa_defrag_extent *old, *tmp; + int ret; + + new->path = path; + + list_for_each_entry_safe(old, tmp, &new->head, list) { + ret = iterate_inodes_from_logical(old->bytenr, fs_info, + path, record_one_backref, + old); + BUG_ON(ret < 0 && ret != -ENOENT); + + /* no backref to be processed for this extent */ + if (!old->count) { + list_del(&old->list); + kfree(old); + } + } + + if (list_empty(&new->head)) + return false; + + return true; +} + +static int relink_is_mergable(struct extent_buffer *leaf, + struct btrfs_file_extent_item *fi, + u64 disk_bytenr) +{ + if (btrfs_file_extent_disk_bytenr(leaf, fi) != disk_bytenr) + return 0; + + if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) + return 0; + + if (btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + return 0; + + return 1; +} + +/* + * Note the backref might has changed, and in this case we just return 0. + */ +static noinline int relink_extent_backref(struct btrfs_path *path, + struct sa_defrag_extent_backref *prev, + struct sa_defrag_extent_backref *backref) +{ + struct btrfs_file_extent_item *extent; + struct btrfs_file_extent_item *item; + struct btrfs_ordered_extent *ordered; + struct btrfs_trans_handle *trans; + struct btrfs_fs_info *fs_info; + struct btrfs_root *root; + struct btrfs_key key; + struct extent_buffer *leaf; + struct old_sa_defrag_extent *old = backref->old; + struct new_sa_defrag_extent *new = old->new; + struct inode *src_inode = new->inode; + struct inode *inode; + struct extent_state *cached = NULL; + int ret = 0; + u64 start; + u64 len; + u64 lock_start; + u64 lock_end; + bool merge = false; + int index; + + if (prev && prev->root_id == backref->root_id && + prev->inum == backref->inum && + prev->file_pos + prev->num_bytes == backref->file_pos) + merge = true; + + /* step 1: get root */ + key.objectid = backref->root_id; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + fs_info = BTRFS_I(src_inode)->root->fs_info; + index = srcu_read_lock(&fs_info->subvol_srcu); + + root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + if (PTR_ERR(root) == -ENOENT) + return 0; + return PTR_ERR(root); + } + if (btrfs_root_refs(&root->root_item) == 0) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + /* parse ENOENT to 0 */ + return 0; + } + + /* step 2: get inode */ + key.objectid = backref->inum; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + inode = btrfs_iget(fs_info->sb, &key, root, NULL); + if (IS_ERR(inode)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + return 0; + } + + srcu_read_unlock(&fs_info->subvol_srcu, index); + + /* step 3: relink backref */ + lock_start = backref->file_pos; + lock_end = backref->file_pos + backref->num_bytes - 1; + lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end, + 0, &cached); + + ordered = btrfs_lookup_first_ordered_extent(inode, lock_end); + if (ordered) { + btrfs_put_ordered_extent(ordered); + goto out_unlock; + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_unlock; + } + + key.objectid = backref->inum; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = backref->file_pos; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto out_free_path; + } else if (ret > 0) { + ret = 0; + goto out_free_path; + } + + extent = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + + if (btrfs_file_extent_generation(path->nodes[0], extent) != + backref->generation) + goto out_free_path; + + btrfs_release_path(path); + + start = backref->file_pos; + if (backref->extent_offset < old->extent_offset + old->offset) + start += old->extent_offset + old->offset - + backref->extent_offset; + + len = min(backref->extent_offset + backref->num_bytes, + old->extent_offset + old->offset + old->len); + len -= max(backref->extent_offset, old->extent_offset + old->offset); + + ret = btrfs_drop_extents(trans, root, inode, start, + start + len, 1); + if (ret) + goto out_free_path; +again: + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + + if (merge) { + struct btrfs_file_extent_item *fi; + u64 extent_len; + struct btrfs_key found_key; + + ret = btrfs_search_slot(trans, root, &key, path, 1, 1); + if (ret < 0) + goto out_free_path; + + path->slots[0]--; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_len = btrfs_file_extent_num_bytes(leaf, fi); + + if (relink_is_mergable(leaf, fi, new->bytenr) && + extent_len + found_key.offset == start) { + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_len + len); + btrfs_mark_buffer_dirty(leaf); + inode_add_bytes(inode, len); + + ret = 1; + goto out_free_path; + } else { + merge = false; + btrfs_release_path(path); + goto again; + } + } + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(*extent)); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_free_path; + } + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len); + btrfs_set_file_extent_offset(leaf, item, start - new->file_pos); + btrfs_set_file_extent_num_bytes(leaf, item, len); + btrfs_set_file_extent_ram_bytes(leaf, item, new->len); + btrfs_set_file_extent_generation(leaf, item, trans->transid); + btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_compression(leaf, item, new->compress_type); + btrfs_set_file_extent_encryption(leaf, item, 0); + btrfs_set_file_extent_other_encoding(leaf, item, 0); + + btrfs_mark_buffer_dirty(leaf); + inode_add_bytes(inode, len); + + ret = btrfs_inc_extent_ref(trans, root, new->bytenr, + new->disk_len, 0, + backref->root_id, backref->inum, + new->file_pos, 0); /* start - extent_offset */ + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_free_path; + } + + ret = 1; +out_free_path: + btrfs_release_path(path); + btrfs_end_transaction(trans, root); +out_unlock: + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end, + &cached, GFP_NOFS); + iput(inode); + return ret; +} + +static void relink_file_extents(struct new_sa_defrag_extent *new) +{ + struct btrfs_path *path; + struct old_sa_defrag_extent *old, *tmp; + struct sa_defrag_extent_backref *backref; + struct sa_defrag_extent_backref *prev = NULL; + struct inode *inode; + struct btrfs_root *root; + struct rb_node *node; + int ret; + + inode = new->inode; + root = BTRFS_I(inode)->root; + + path = btrfs_alloc_path(); + if (!path) + return; + + if (!record_extent_backrefs(path, new)) { + btrfs_free_path(path); + goto out; + } + btrfs_release_path(path); + + while (1) { + node = rb_first(&new->root); + if (!node) + break; + rb_erase(node, &new->root); + + backref = rb_entry(node, struct sa_defrag_extent_backref, node); + + ret = relink_extent_backref(path, prev, backref); + WARN_ON(ret < 0); + + kfree(prev); + + if (ret == 1) + prev = backref; + else + prev = NULL; + cond_resched(); + } + kfree(prev); + + btrfs_free_path(path); + + list_for_each_entry_safe(old, tmp, &new->head, list) { + list_del(&old->list); + kfree(old); + } +out: + atomic_dec(&root->fs_info->defrag_running); + wake_up(&root->fs_info->transaction_wait); + + kfree(new); +} + +static struct new_sa_defrag_extent * +record_old_file_extents(struct inode *inode, + struct btrfs_ordered_extent *ordered) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_path *path; + struct btrfs_key key; + struct old_sa_defrag_extent *old, *tmp; + struct new_sa_defrag_extent *new; + int ret; + + new = kmalloc(sizeof(*new), GFP_NOFS); + if (!new) + return NULL; + + new->inode = inode; + new->file_pos = ordered->file_offset; + new->len = ordered->len; + new->bytenr = ordered->start; + new->disk_len = ordered->disk_len; + new->compress_type = ordered->compress_type; + new->root = RB_ROOT; + INIT_LIST_HEAD(&new->head); + + path = btrfs_alloc_path(); + if (!path) + goto out_kfree; + + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = new->file_pos; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out_free_path; + if (ret > 0 && path->slots[0] > 0) + path->slots[0]--; + + /* find out all the old extents for the file range */ + while (1) { + struct btrfs_file_extent_item *extent; + struct extent_buffer *l; + int slot; + u64 num_bytes; + u64 offset; + u64 end; + u64 disk_bytenr; + u64 extent_offset; + + l = path->nodes[0]; + slot = path->slots[0]; + + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out_free_list; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.objectid != btrfs_ino(inode)) + break; + if (key.type != BTRFS_EXTENT_DATA_KEY) + break; + if (key.offset >= new->file_pos + new->len) + break; + + extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item); + + num_bytes = btrfs_file_extent_num_bytes(l, extent); + if (key.offset + num_bytes < new->file_pos) + goto next; + + disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent); + if (!disk_bytenr) + goto next; + + extent_offset = btrfs_file_extent_offset(l, extent); + + old = kmalloc(sizeof(*old), GFP_NOFS); + if (!old) + goto out_free_list; + + offset = max(new->file_pos, key.offset); + end = min(new->file_pos + new->len, key.offset + num_bytes); + + old->bytenr = disk_bytenr; + old->extent_offset = extent_offset; + old->offset = offset - key.offset; + old->len = end - offset; + old->new = new; + old->count = 0; + list_add_tail(&old->list, &new->head); +next: + path->slots[0]++; + cond_resched(); + } + + btrfs_free_path(path); + atomic_inc(&root->fs_info->defrag_running); + + return new; + +out_free_list: + list_for_each_entry_safe(old, tmp, &new->head, list) { + list_del(&old->list); + kfree(old); + } +out_free_path: + btrfs_free_path(path); +out_kfree: + kfree(new); + return NULL; +} + /* * helper function for btrfs_finish_ordered_io, this * just reads in some of the csum leaves to prime them into ram @@ -1909,6 +2582,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) struct btrfs_trans_handle *trans = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; + struct new_sa_defrag_extent *new = NULL; int compress_type = 0; int ret; bool nolock; @@ -1943,6 +2617,20 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ordered_extent->file_offset + ordered_extent->len - 1, 0, &cached_state); + ret = test_range_bit(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + ordered_extent->len - 1, + EXTENT_DEFRAG, 1, cached_state); + if (ret) { + u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item); + if (last_snapshot >= BTRFS_I(inode)->generation) + /* the inode is shared */ + new = record_old_file_extents(inode, ordered_extent); + + clear_extent_bit(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + ordered_extent->len - 1, + EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); + } + if (nolock) trans = btrfs_join_transaction_nolock(root); else @@ -2001,17 +2689,33 @@ out: if (trans) btrfs_end_transaction(trans, root); - if (ret) + if (ret) { clear_extent_uptodate(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, NULL, GFP_NOFS); + /* + * If the ordered extent had an IOERR or something else went + * wrong we need to return the space for this ordered extent + * back to the allocator. + */ + if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && + !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) + btrfs_free_reserved_extent(root, ordered_extent->start, + ordered_extent->disk_len); + } + + /* * This needs to be done to make sure anybody waiting knows we are done * updating everything for this ordered extent. */ btrfs_remove_ordered_extent(inode, ordered_extent); + /* for snapshot-aware defrag */ + if (new) + relink_file_extents(new); + /* once for us */ btrfs_put_ordered_extent(ordered_extent); /* once for the tree */ @@ -2062,7 +2766,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state, int mirror) { - size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT); + size_t offset = start - page_offset(page); struct inode *inode = page->mapping->host; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; char *kaddr; @@ -2167,11 +2871,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) } } -enum btrfs_orphan_cleanup_state { - ORPHAN_CLEANUP_STARTED = 1, - ORPHAN_CLEANUP_DONE = 2, -}; - /* * This is called in transaction commit time. If there are no orphan * files in the subvolume, it removes orphan item and frees block_rsv @@ -2469,6 +3168,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) */ set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, &BTRFS_I(inode)->runtime_flags); + atomic_inc(&root->orphan_inodes); /* if we have links, this was a truncate, lets do that */ if (inode->i_nlink) { @@ -2491,6 +3191,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) goto out; ret = btrfs_truncate(inode); + if (ret) + btrfs_orphan_del(NULL, inode); } else { nr_unlink++; } @@ -2709,34 +3411,41 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *item, struct inode *inode) { - btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); - btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); - btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); - btrfs_set_inode_mode(leaf, item, inode->i_mode); - btrfs_set_inode_nlink(leaf, item, inode->i_nlink); + struct btrfs_map_token token; + + btrfs_init_map_token(&token); + + btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); + btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); + btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size, + &token); + btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); + btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); - btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), - inode->i_atime.tv_sec); - btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), - inode->i_atime.tv_nsec); + btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_nsec, &token); - btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), - inode->i_mtime.tv_sec); - btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), - inode->i_mtime.tv_nsec); + btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_nsec, &token); - btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), - inode->i_ctime.tv_sec); - btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), - inode->i_ctime.tv_nsec); + btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_nsec, &token); - btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); - btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); - btrfs_set_inode_sequence(leaf, item, inode->i_version); - btrfs_set_inode_transid(leaf, item, trans->transid); - btrfs_set_inode_rdev(leaf, item, inode->i_rdev); - btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); - btrfs_set_inode_block_group(leaf, item, 0); + btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), + &token); + btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation, + &token); + btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); + btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); + btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); + btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); + btrfs_set_token_inode_block_group(leaf, item, 0, &token); } /* @@ -3304,7 +4013,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u64 extent_num_bytes = 0; u64 extent_offset = 0; u64 item_end = 0; - u64 mask = root->sectorsize - 1; u32 found_type = (u8)-1; int found_extent; int del_item; @@ -3328,7 +4036,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, * extent just the way it is. */ if (root->ref_cows || root == root->fs_info->tree_root) - btrfs_drop_extent_cache(inode, (new_size + mask) & (~mask), (u64)-1, 0); + btrfs_drop_extent_cache(inode, ALIGN(new_size, + root->sectorsize), (u64)-1, 0); /* * This function is also used to drop the items in the log tree before @@ -3407,10 +4116,9 @@ search_again: if (!del_item) { u64 orig_num_bytes = btrfs_file_extent_num_bytes(leaf, fi); - extent_num_bytes = new_size - - found_key.offset + root->sectorsize - 1; - extent_num_bytes = extent_num_bytes & - ~((u64)root->sectorsize - 1); + extent_num_bytes = ALIGN(new_size - + found_key.offset, + root->sectorsize); btrfs_set_file_extent_num_bytes(leaf, fi, extent_num_bytes); num_dec = (orig_num_bytes - @@ -3646,9 +4354,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) struct extent_map *em = NULL; struct extent_state *cached_state = NULL; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - u64 mask = root->sectorsize - 1; - u64 hole_start = (oldsize + mask) & ~mask; - u64 block_end = (size + mask) & ~mask; + u64 hole_start = ALIGN(oldsize, root->sectorsize); + u64 block_end = ALIGN(size, root->sectorsize); u64 last_byte; u64 cur_offset; u64 hole_size; @@ -3681,7 +4388,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) break; } last_byte = min(extent_map_end(em), block_end); - last_byte = (last_byte + mask) & ~mask; + last_byte = ALIGN(last_byte , root->sectorsize); if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { struct extent_map *hole_em; hole_size = last_byte - cur_offset; @@ -3832,6 +4539,12 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) /* we don't support swapfiles, so vmtruncate shouldn't fail */ truncate_setsize(inode, newsize); + + /* Disable nonlocked read DIO to avoid the end less truncate */ + btrfs_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + btrfs_inode_resume_unlocked_dio(inode); + ret = btrfs_truncate(inode); if (ret && inode->i_nlink) btrfs_orphan_del(NULL, inode); @@ -3904,6 +4617,12 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } + ret = btrfs_commit_inode_delayed_inode(inode); + if (ret) { + btrfs_orphan_del(NULL, inode); + goto no_delete; + } + rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); if (!rsv) { btrfs_orphan_del(NULL, inode); @@ -3941,7 +4660,7 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } - trans = btrfs_start_transaction_lflush(root, 1); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { btrfs_orphan_del(NULL, inode); btrfs_free_block_rsv(root, rsv); @@ -3955,9 +4674,6 @@ void btrfs_evict_inode(struct inode *inode) break; trans->block_rsv = &root->fs_info->trans_block_rsv; - ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); - btrfs_end_transaction(trans, root); trans = NULL; btrfs_btree_balance_dirty(root); @@ -4854,7 +5570,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, if (btrfs_test_opt(root, NODATASUM)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; if (btrfs_test_opt(root, NODATACOW)) - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | + BTRFS_INODE_NODATASUM; } insert_inode_hash(inode); @@ -5006,12 +5723,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, goto out_unlock; } - err = btrfs_update_inode(trans, root, inode); - if (err) { - drop_inode = 1; - goto out_unlock; - } - /* * If the active LSM wants to access the inode during * d_instantiate it needs these. Smack checks to see @@ -5396,8 +6107,7 @@ again: } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { size_t size; size = btrfs_file_extent_inline_len(leaf, item); - extent_end = (extent_start + size + root->sectorsize - 1) & - ~((u64)root->sectorsize - 1); + extent_end = ALIGN(extent_start + size, root->sectorsize); } if (start >= extent_end) { @@ -5469,8 +6179,7 @@ again: copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, size - extent_offset); em->start = extent_start + extent_offset; - em->len = (copy_size + root->sectorsize - 1) & - ~((u64)root->sectorsize - 1); + em->len = ALIGN(copy_size, root->sectorsize); em->orig_block_len = em->len; em->orig_start = em->start; if (compress_type) { @@ -5949,6 +6658,8 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, em->start = start; em->orig_start = orig_start; + em->mod_start = start; + em->mod_len = len; em->len = len; em->block_len = block_len; em->block_start = block_start; @@ -5990,16 +6701,12 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, u64 len = bh_result->b_size; struct btrfs_trans_handle *trans; int unlock_bits = EXTENT_LOCKED; - int ret; + int ret = 0; - if (create) { - ret = btrfs_delalloc_reserve_space(inode, len); - if (ret) - return ret; + if (create) unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; - } else { + else len = min_t(u64, len, root->sectorsize); - } lockstart = start; lockend = start + len - 1; @@ -6011,14 +6718,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create)) return -ENOTBLK; - if (create) { - ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockend, EXTENT_DELALLOC, NULL, - &cached_state, GFP_NOFS); - if (ret) - goto unlock_err; - } - em = btrfs_get_extent(inode, NULL, 0, start, len, 0); if (IS_ERR(em)) { ret = PTR_ERR(em); @@ -6050,7 +6749,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (!create && (em->block_start == EXTENT_MAP_HOLE || test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { free_extent_map(em); - ret = 0; goto unlock_err; } @@ -6148,6 +6846,15 @@ unlock: */ if (start + len > i_size_read(inode)) i_size_write(inode, start + len); + + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); + + ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockstart + len - 1, EXTENT_DELALLOC, NULL, + &cached_state, GFP_NOFS); + BUG_ON(ret); } /* @@ -6156,24 +6863,9 @@ unlock: * aren't using if there is any left over space. */ if (lockstart < lockend) { - if (create && len < lockend - lockstart) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockstart + len - 1, - unlock_bits | EXTENT_DEFRAG, 1, 0, - &cached_state, GFP_NOFS); - /* - * Beside unlock, we also need to cleanup reserved space - * for the left range by attaching EXTENT_DO_ACCOUNTING. - */ - clear_extent_bit(&BTRFS_I(inode)->io_tree, - lockstart + len, lockend, - unlock_bits | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 1, 0, NULL, GFP_NOFS); - } else { - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockend, unlock_bits, 1, 0, - &cached_state, GFP_NOFS); - } + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockend, unlock_bits, 1, 0, + &cached_state, GFP_NOFS); } else { free_extent_state(cached_state); } @@ -6183,9 +6875,6 @@ unlock: return 0; unlock_err: - if (create) - unlock_bits |= EXTENT_DO_ACCOUNTING; - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, unlock_bits, 1, 0, &cached_state, GFP_NOFS); return ret; @@ -6426,19 +7115,24 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, int async_submit = 0; map_length = orig_bio->bi_size; - ret = btrfs_map_block(root->fs_info, READ, start_sector << 9, + ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, &map_length, NULL, 0); if (ret) { bio_put(orig_bio); return -EIO; } - if (map_length >= orig_bio->bi_size) { bio = orig_bio; goto submit; } - async_submit = 1; + /* async crcs make it difficult to collect full stripe writes. */ + if (btrfs_get_alloc_profile(root, 1) & + (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) + async_submit = 0; + else + async_submit = 1; + bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); if (!bio) return -ENOMEM; @@ -6480,7 +7174,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, bio->bi_end_io = btrfs_end_dio_bio; map_length = orig_bio->bi_size; - ret = btrfs_map_block(root->fs_info, READ, + ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, &map_length, NULL, 0); if (ret) { @@ -6623,15 +7317,60 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + size_t count = 0; + int flags = 0; + bool wakeup = true; + bool relock = false; + ssize_t ret; if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, offset, nr_segs)) return 0; - return __blockdev_direct_IO(rw, iocb, inode, - BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, - iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, - btrfs_submit_direct, 0); + atomic_inc(&inode->i_dio_count); + smp_mb__after_atomic_inc(); + + if (rw & WRITE) { + count = iov_length(iov, nr_segs); + /* + * If the write DIO is beyond the EOF, we need update + * the isize, but it is protected by i_mutex. So we can + * not unlock the i_mutex at this case. + */ + if (offset + count <= inode->i_size) { + mutex_unlock(&inode->i_mutex); + relock = true; + } + ret = btrfs_delalloc_reserve_space(inode, count); + if (ret) + goto out; + } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, + &BTRFS_I(inode)->runtime_flags))) { + inode_dio_done(inode); + flags = DIO_LOCKING | DIO_SKIP_HOLES; + wakeup = false; + } + + ret = __blockdev_direct_IO(rw, iocb, inode, + BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, + iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, + btrfs_submit_direct, flags); + if (rw & WRITE) { + if (ret < 0 && ret != -EIOCBQUEUED) + btrfs_delalloc_release_space(inode, count); + else if (ret >= 0 && (size_t)ret < count) + btrfs_delalloc_release_space(inode, + count - (size_t)ret); + else + btrfs_delalloc_release_metadata(inode, 0); + } +out: + if (wakeup) + inode_dio_done(inode); + if (relock) + mutex_lock(&inode->i_mutex); + + return ret; } #define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) @@ -6735,8 +7474,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) return; } lock_extent_bits(tree, page_start, page_end, 0, &cached_state); - ordered = btrfs_lookup_ordered_extent(inode, - page_offset(page)); + ordered = btrfs_lookup_ordered_extent(inode, page_offset(page)); if (ordered) { /* * IO on this page will never be started, so we need @@ -7216,8 +7954,9 @@ int btrfs_drop_inode(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; + /* the snap/subvol tree is on deleting */ if (btrfs_root_refs(&root->root_item) == 0 && - !btrfs_is_free_space_inode(inode)) + root != root->fs_info->tree_root) return 1; else return generic_drop_inode(inode); @@ -7299,40 +8038,22 @@ fail: static int btrfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { + u64 delalloc_bytes; struct inode *inode = dentry->d_inode; u32 blocksize = inode->i_sb->s_blocksize; generic_fillattr(inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; stat->blksize = PAGE_CACHE_SIZE; + + spin_lock(&BTRFS_I(inode)->lock); + delalloc_bytes = BTRFS_I(inode)->delalloc_bytes; + spin_unlock(&BTRFS_I(inode)->lock); stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + - ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9; + ALIGN(delalloc_bytes, blocksize)) >> 9; return 0; } -/* - * If a file is moved, it will inherit the cow and compression flags of the new - * directory. - */ -static void fixup_inode_flags(struct inode *dir, struct inode *inode) -{ - struct btrfs_inode *b_dir = BTRFS_I(dir); - struct btrfs_inode *b_inode = BTRFS_I(inode); - - if (b_dir->flags & BTRFS_INODE_NODATACOW) - b_inode->flags |= BTRFS_INODE_NODATACOW; - else - b_inode->flags &= ~BTRFS_INODE_NODATACOW; - - if (b_dir->flags & BTRFS_INODE_COMPRESS) { - b_inode->flags |= BTRFS_INODE_COMPRESS; - b_inode->flags &= ~BTRFS_INODE_NOCOMPRESS; - } else { - b_inode->flags &= ~(BTRFS_INODE_COMPRESS | - BTRFS_INODE_NOCOMPRESS); - } -} - static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { @@ -7498,8 +8219,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, } } - fixup_inode_flags(new_dir, old_inode); - ret = btrfs_add_link(trans, new_dir, old_inode, new_dentry->d_name.name, new_dentry->d_name.len, 0, index); @@ -7583,7 +8302,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) INIT_LIST_HEAD(&works); INIT_LIST_HEAD(&splice); -again: + spin_lock(&root->fs_info->delalloc_lock); list_splice_init(&root->fs_info->delalloc_inodes, &splice); while (!list_empty(&splice)) { @@ -7593,8 +8312,11 @@ again: list_del_init(&binode->delalloc_inodes); inode = igrab(&binode->vfs_inode); - if (!inode) + if (!inode) { + clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &binode->runtime_flags); continue; + } list_add_tail(&binode->delalloc_inodes, &root->fs_info->delalloc_inodes); @@ -7619,13 +8341,6 @@ again: btrfs_wait_and_free_delalloc_work(work); } - spin_lock(&root->fs_info->delalloc_lock); - if (!list_empty(&root->fs_info->delalloc_inodes)) { - spin_unlock(&root->fs_info->delalloc_lock); - goto again; - } - spin_unlock(&root->fs_info->delalloc_lock); - /* the filemap_flush will queue IO into the worker threads, but * we have to make sure the IO is actually started and that * ordered extents get created before we return @@ -7801,8 +8516,9 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, } } - ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, - 0, *alloc_hint, &ins, 1); + ret = btrfs_reserve_extent(trans, root, + min(num_bytes, 256ULL * 1024 * 1024), + min_size, 0, *alloc_hint, &ins, 1); if (ret) { if (own_trans) btrfs_end_transaction(trans, root); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c3f09f7..c83086fd 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -42,12 +42,12 @@ #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/uuid.h> +#include <linux/btrfs.h> #include "compat.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "ioctl.h" #include "print-tree.h" #include "volumes.h" #include "locking.h" @@ -363,46 +363,52 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) return 0; } -static noinline int create_subvol(struct btrfs_root *root, +static noinline int create_subvol(struct inode *dir, struct dentry *dentry, char *name, int namelen, u64 *async_transid, - struct btrfs_qgroup_inherit **inherit) + struct btrfs_qgroup_inherit *inherit) { struct btrfs_trans_handle *trans; struct btrfs_key key; struct btrfs_root_item root_item; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; + struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *new_root; - struct dentry *parent = dentry->d_parent; - struct inode *dir; + struct btrfs_block_rsv block_rsv; struct timespec cur_time = CURRENT_TIME; int ret; int err; u64 objectid; u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; u64 index = 0; + u64 qgroup_reserved; uuid_le new_uuid; ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid); if (ret) return ret; - dir = parent->d_inode; - + btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); /* - * 1 - inode item - * 2 - refs - * 1 - root item - * 2 - dir items + * The same as the snapshot creation, please see the comment + * of create_snapshot(). */ - trans = btrfs_start_transaction(root, 6); - if (IS_ERR(trans)) - return PTR_ERR(trans); + ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, + 7, &qgroup_reserved); + if (ret) + return ret; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + trans->block_rsv = &block_rsv; + trans->bytes_reserved = block_rsv.size; - ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, - inherit ? *inherit : NULL); + ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, inherit); if (ret) goto fail; @@ -516,6 +522,8 @@ static noinline int create_subvol(struct btrfs_root *root, BUG_ON(ret); fail: + trans->block_rsv = NULL; + trans->bytes_reserved = 0; if (async_transid) { *async_transid = trans->transid; err = btrfs_commit_transaction_async(trans, root, 1); @@ -527,13 +535,15 @@ fail: if (!ret) d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); - +out: + btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); return ret; } -static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, - char *name, int namelen, u64 *async_transid, - bool readonly, struct btrfs_qgroup_inherit **inherit) +static int create_snapshot(struct btrfs_root *root, struct inode *dir, + struct dentry *dentry, char *name, int namelen, + u64 *async_transid, bool readonly, + struct btrfs_qgroup_inherit *inherit) { struct inode *inode; struct btrfs_pending_snapshot *pending_snapshot; @@ -549,23 +559,31 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, btrfs_init_block_rsv(&pending_snapshot->block_rsv, BTRFS_BLOCK_RSV_TEMP); + /* + * 1 - parent dir inode + * 2 - dir entries + * 1 - root item + * 2 - root ref/backref + * 1 - root of snapshot + */ + ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, + &pending_snapshot->block_rsv, 7, + &pending_snapshot->qgroup_reserved); + if (ret) + goto out; + pending_snapshot->dentry = dentry; pending_snapshot->root = root; pending_snapshot->readonly = readonly; - if (inherit) { - pending_snapshot->inherit = *inherit; - *inherit = NULL; /* take responsibility to free it */ - } + pending_snapshot->dir = dir; + pending_snapshot->inherit = inherit; - trans = btrfs_start_transaction(root->fs_info->extent_root, 6); + trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto fail; } - ret = btrfs_snap_reserve_metadata(trans, pending_snapshot); - BUG_ON(ret); - spin_lock(&root->fs_info->trans_lock); list_add(&pending_snapshot->list, &trans->transaction->pending_snapshots); @@ -602,6 +620,10 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, d_instantiate(dentry, inode); ret = 0; fail: + btrfs_subvolume_release_metadata(BTRFS_I(dir)->root, + &pending_snapshot->block_rsv, + pending_snapshot->qgroup_reserved); +out: kfree(pending_snapshot); return ret; } @@ -695,7 +717,7 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name, int namelen, struct btrfs_root *snap_src, u64 *async_transid, bool readonly, - struct btrfs_qgroup_inherit **inherit) + struct btrfs_qgroup_inherit *inherit) { struct inode *dir = parent->dentry->d_inode; struct dentry *dentry; @@ -732,11 +754,11 @@ static noinline int btrfs_mksubvol(struct path *parent, goto out_up_read; if (snap_src) { - error = create_snapshot(snap_src, dentry, name, namelen, + error = create_snapshot(snap_src, dir, dentry, name, namelen, async_transid, readonly, inherit); } else { - error = create_subvol(BTRFS_I(dir)->root, dentry, - name, namelen, async_transid, inherit); + error = create_subvol(dir, dentry, name, namelen, + async_transid, inherit); } if (!error) fsnotify_mkdir(dir, dentry); @@ -818,7 +840,7 @@ static int find_new_extents(struct btrfs_root *root, while(1) { ret = btrfs_search_forward(root, &min_key, &max_key, - path, 0, newer_than); + path, newer_than); if (ret != 0) goto none; if (min_key.objectid != ino) @@ -1206,6 +1228,12 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (!(inode->i_sb->s_flags & MS_ACTIVE)) break; + if (btrfs_defrag_cancelled(root->fs_info)) { + printk(KERN_DEBUG "btrfs: defrag_file cancelled\n"); + ret = -EAGAIN; + break; + } + if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, extent_thresh, &last_len, &skip, &defrag_end, range->flags & @@ -1329,9 +1357,6 @@ static noinline int btrfs_ioctl_resize(struct file *file, int ret = 0; int mod = 0; - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; - if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1363,6 +1388,10 @@ static noinline int btrfs_ioctl_resize(struct file *file, *devstr = '\0'; devstr = vol_args->name; devid = simple_strtoull(devstr, &end, 10); + if (!devid) { + ret = -EINVAL; + goto out_free; + } printk(KERN_INFO "btrfs: resizing devid %llu\n", (unsigned long long)devid); } @@ -1371,7 +1400,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (!device) { printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", (unsigned long long)devid); - ret = -EINVAL; + ret = -ENODEV; goto out_free; } @@ -1379,7 +1408,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, printk(KERN_INFO "btrfs: resizer unable to apply on " "readonly device %llu\n", (unsigned long long)devid); - ret = -EINVAL; + ret = -EPERM; goto out_free; } @@ -1401,7 +1430,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, } if (device->is_tgtdev_for_dev_replace) { - ret = -EINVAL; + ret = -EPERM; goto out_free; } @@ -1457,7 +1486,7 @@ out: static noinline int btrfs_ioctl_snap_create_transid(struct file *file, char *name, unsigned long fd, int subvol, u64 *transid, bool readonly, - struct btrfs_qgroup_inherit **inherit) + struct btrfs_qgroup_inherit *inherit) { int namelen; int ret = 0; @@ -1566,7 +1595,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, vol_args->fd, subvol, ptr, - readonly, &inherit); + readonly, inherit); if (ret == 0 && ptr && copy_to_user(arg + @@ -1863,7 +1892,7 @@ static noinline int search_ioctl(struct inode *inode, path->keep_locks = 1; while(1) { - ret = btrfs_search_forward(root, &key, &max_key, path, 0, + ret = btrfs_search_forward(root, &key, &max_key, path, sk->min_transid); if (ret != 0) { if (ret > 0) @@ -2035,6 +2064,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, struct btrfs_root *dest = NULL; struct btrfs_ioctl_vol_args *vol_args; struct btrfs_trans_handle *trans; + struct btrfs_block_rsv block_rsv; + u64 qgroup_reserved; int namelen; int ret; int err = 0; @@ -2124,12 +2155,23 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (err) goto out_up_write; + btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); + /* + * One for dir inode, two for dir entries, two for root + * ref/backref. + */ + err = btrfs_subvolume_reserve_metadata(root, &block_rsv, + 5, &qgroup_reserved); + if (err) + goto out_up_write; + trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { err = PTR_ERR(trans); - goto out_up_write; + goto out_release; } - trans->block_rsv = &root->fs_info->global_block_rsv; + trans->block_rsv = &block_rsv; + trans->bytes_reserved = block_rsv.size; ret = btrfs_unlink_subvol(trans, root, dir, dest->root_key.objectid, @@ -2159,10 +2201,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, } } out_end_trans: + trans->block_rsv = NULL; + trans->bytes_reserved = 0; ret = btrfs_end_transaction(trans, root); if (ret && !err) err = ret; inode->i_flags |= S_DEAD; +out_release: + btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); out_up_write: up_write(&root->fs_info->subvol_sem); out_unlock: @@ -2171,6 +2217,12 @@ out_unlock: shrink_dcache_sb(root->fs_info->sb); btrfs_invalidate_inodes(dest); d_delete(dentry); + + /* the last ref */ + if (dest->cache_inode) { + iput(dest->cache_inode); + dest->cache_inode = NULL; + } } out_dput: dput(dentry); @@ -2211,10 +2263,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) ret = -EPERM; goto out; } - ret = btrfs_defrag_root(root, 0); + ret = btrfs_defrag_root(root); if (ret) goto out; - ret = btrfs_defrag_root(root->fs_info->extent_root, 0); + ret = btrfs_defrag_root(root->fs_info->extent_root); break; case S_IFREG: if (!(file->f_mode & FMODE_WRITE)) { @@ -3111,7 +3163,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, u64 transid; int ret; - trans = btrfs_attach_transaction(root); + trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT) return PTR_ERR(trans); @@ -3289,7 +3341,7 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) struct inode_fs_paths *ipath = NULL; struct btrfs_path *path; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_DAC_READ_SEARCH)) return -EPERM; path = btrfs_alloc_path(); @@ -3914,6 +3966,65 @@ out: return ret; } +static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + const char *label = root->fs_info->super_copy->label; + size_t len = strnlen(label, BTRFS_LABEL_SIZE); + int ret; + + if (len == BTRFS_LABEL_SIZE) { + pr_warn("btrfs: label is too long, return the first %zu bytes\n", + --len); + } + + mutex_lock(&root->fs_info->volume_mutex); + ret = copy_to_user(arg, label, len); + mutex_unlock(&root->fs_info->volume_mutex); + + return ret ? -EFAULT : 0; +} + +static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_super_block *super_block = root->fs_info->super_copy; + struct btrfs_trans_handle *trans; + char label[BTRFS_LABEL_SIZE]; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(label, arg, sizeof(label))) + return -EFAULT; + + if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) { + pr_err("btrfs: unable to set label with more than %d bytes\n", + BTRFS_LABEL_SIZE - 1); + return -EINVAL; + } + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + mutex_lock(&root->fs_info->volume_mutex); + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_unlock; + } + + strcpy(super_block->label, label); + ret = btrfs_end_transaction(trans, root); + +out_unlock: + mutex_unlock(&root->fs_info->volume_mutex); + mnt_drop_write_file(file); + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -4014,6 +4125,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_qgroup_limit(file, argp); case BTRFS_IOC_DEV_REPLACE: return btrfs_ioctl_dev_replace(root, argp); + case BTRFS_IOC_GET_FSLABEL: + return btrfs_ioctl_get_fslabel(file, argp); + case BTRFS_IOC_SET_FSLABEL: + return btrfs_ioctl_set_fslabel(file, argp); } return -ENOTTY; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h deleted file mode 100644 index dabca9c..0000000 --- a/fs/btrfs/ioctl.h +++ /dev/null @@ -1,502 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef __IOCTL_ -#define __IOCTL_ -#include <linux/ioctl.h> - -#define BTRFS_IOCTL_MAGIC 0x94 -#define BTRFS_VOL_NAME_MAX 255 - -/* this should be 4k */ -#define BTRFS_PATH_NAME_MAX 4087 -struct btrfs_ioctl_vol_args { - __s64 fd; - char name[BTRFS_PATH_NAME_MAX + 1]; -}; - -#define BTRFS_DEVICE_PATH_NAME_MAX 1024 - -#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) -#define BTRFS_SUBVOL_RDONLY (1ULL << 1) -#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2) -#define BTRFS_FSID_SIZE 16 -#define BTRFS_UUID_SIZE 16 - -#define BTRFS_QGROUP_INHERIT_SET_LIMITS (1ULL << 0) - -struct btrfs_qgroup_limit { - __u64 flags; - __u64 max_rfer; - __u64 max_excl; - __u64 rsv_rfer; - __u64 rsv_excl; -}; - -struct btrfs_qgroup_inherit { - __u64 flags; - __u64 num_qgroups; - __u64 num_ref_copies; - __u64 num_excl_copies; - struct btrfs_qgroup_limit lim; - __u64 qgroups[0]; -}; - -struct btrfs_ioctl_qgroup_limit_args { - __u64 qgroupid; - struct btrfs_qgroup_limit lim; -}; - -#define BTRFS_SUBVOL_NAME_MAX 4039 -struct btrfs_ioctl_vol_args_v2 { - __s64 fd; - __u64 transid; - __u64 flags; - union { - struct { - __u64 size; - struct btrfs_qgroup_inherit __user *qgroup_inherit; - }; - __u64 unused[4]; - }; - char name[BTRFS_SUBVOL_NAME_MAX + 1]; -}; - -/* - * structure to report errors and progress to userspace, either as a - * result of a finished scrub, a canceled scrub or a progress inquiry - */ -struct btrfs_scrub_progress { - __u64 data_extents_scrubbed; /* # of data extents scrubbed */ - __u64 tree_extents_scrubbed; /* # of tree extents scrubbed */ - __u64 data_bytes_scrubbed; /* # of data bytes scrubbed */ - __u64 tree_bytes_scrubbed; /* # of tree bytes scrubbed */ - __u64 read_errors; /* # of read errors encountered (EIO) */ - __u64 csum_errors; /* # of failed csum checks */ - __u64 verify_errors; /* # of occurences, where the metadata - * of a tree block did not match the - * expected values, like generation or - * logical */ - __u64 no_csum; /* # of 4k data block for which no csum - * is present, probably the result of - * data written with nodatasum */ - __u64 csum_discards; /* # of csum for which no data was found - * in the extent tree. */ - __u64 super_errors; /* # of bad super blocks encountered */ - __u64 malloc_errors; /* # of internal kmalloc errors. These - * will likely cause an incomplete - * scrub */ - __u64 uncorrectable_errors; /* # of errors where either no intact - * copy was found or the writeback - * failed */ - __u64 corrected_errors; /* # of errors corrected */ - __u64 last_physical; /* last physical address scrubbed. In - * case a scrub was aborted, this can - * be used to restart the scrub */ - __u64 unverified_errors; /* # of occurences where a read for a - * full (64k) bio failed, but the re- - * check succeeded for each 4k piece. - * Intermittent error. */ -}; - -#define BTRFS_SCRUB_READONLY 1 -struct btrfs_ioctl_scrub_args { - __u64 devid; /* in */ - __u64 start; /* in */ - __u64 end; /* in */ - __u64 flags; /* in */ - struct btrfs_scrub_progress progress; /* out */ - /* pad to 1k */ - __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8]; -}; - -#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0 -#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 -struct btrfs_ioctl_dev_replace_start_params { - __u64 srcdevid; /* in, if 0, use srcdev_name instead */ - __u64 cont_reading_from_srcdev_mode; /* in, see #define - * above */ - __u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */ - __u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */ -}; - -#define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED 0 -#define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED 1 -#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED 2 -#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 3 -#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED 4 -struct btrfs_ioctl_dev_replace_status_params { - __u64 replace_state; /* out, see #define above */ - __u64 progress_1000; /* out, 0 <= x <= 1000 */ - __u64 time_started; /* out, seconds since 1-Jan-1970 */ - __u64 time_stopped; /* out, seconds since 1-Jan-1970 */ - __u64 num_write_errors; /* out */ - __u64 num_uncorrectable_read_errors; /* out */ -}; - -#define BTRFS_IOCTL_DEV_REPLACE_CMD_START 0 -#define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS 1 -#define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL 2 -#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR 0 -#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED 1 -#define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED 2 -struct btrfs_ioctl_dev_replace_args { - __u64 cmd; /* in */ - __u64 result; /* out */ - - union { - struct btrfs_ioctl_dev_replace_start_params start; - struct btrfs_ioctl_dev_replace_status_params status; - }; /* in/out */ - - __u64 spare[64]; -}; - -struct btrfs_ioctl_dev_info_args { - __u64 devid; /* in/out */ - __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */ - __u64 bytes_used; /* out */ - __u64 total_bytes; /* out */ - __u64 unused[379]; /* pad to 4k */ - __u8 path[BTRFS_DEVICE_PATH_NAME_MAX]; /* out */ -}; - -struct btrfs_ioctl_fs_info_args { - __u64 max_id; /* out */ - __u64 num_devices; /* out */ - __u8 fsid[BTRFS_FSID_SIZE]; /* out */ - __u64 reserved[124]; /* pad to 1k */ -}; - -/* balance control ioctl modes */ -#define BTRFS_BALANCE_CTL_PAUSE 1 -#define BTRFS_BALANCE_CTL_CANCEL 2 - -/* - * this is packed, because it should be exactly the same as its disk - * byte order counterpart (struct btrfs_disk_balance_args) - */ -struct btrfs_balance_args { - __u64 profiles; - __u64 usage; - __u64 devid; - __u64 pstart; - __u64 pend; - __u64 vstart; - __u64 vend; - - __u64 target; - - __u64 flags; - - __u64 unused[8]; -} __attribute__ ((__packed__)); - -/* report balance progress to userspace */ -struct btrfs_balance_progress { - __u64 expected; /* estimated # of chunks that will be - * relocated to fulfill the request */ - __u64 considered; /* # of chunks we have considered so far */ - __u64 completed; /* # of chunks relocated so far */ -}; - -#define BTRFS_BALANCE_STATE_RUNNING (1ULL << 0) -#define BTRFS_BALANCE_STATE_PAUSE_REQ (1ULL << 1) -#define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2) - -struct btrfs_ioctl_balance_args { - __u64 flags; /* in/out */ - __u64 state; /* out */ - - struct btrfs_balance_args data; /* in/out */ - struct btrfs_balance_args meta; /* in/out */ - struct btrfs_balance_args sys; /* in/out */ - - struct btrfs_balance_progress stat; /* out */ - - __u64 unused[72]; /* pad to 1k */ -}; - -#define BTRFS_INO_LOOKUP_PATH_MAX 4080 -struct btrfs_ioctl_ino_lookup_args { - __u64 treeid; - __u64 objectid; - char name[BTRFS_INO_LOOKUP_PATH_MAX]; -}; - -struct btrfs_ioctl_search_key { - /* which root are we searching. 0 is the tree of tree roots */ - __u64 tree_id; - - /* keys returned will be >= min and <= max */ - __u64 min_objectid; - __u64 max_objectid; - - /* keys returned will be >= min and <= max */ - __u64 min_offset; - __u64 max_offset; - - /* max and min transids to search for */ - __u64 min_transid; - __u64 max_transid; - - /* keys returned will be >= min and <= max */ - __u32 min_type; - __u32 max_type; - - /* - * how many items did userland ask for, and how many are we - * returning - */ - __u32 nr_items; - - /* align to 64 bits */ - __u32 unused; - - /* some extra for later */ - __u64 unused1; - __u64 unused2; - __u64 unused3; - __u64 unused4; -}; - -struct btrfs_ioctl_search_header { - __u64 transid; - __u64 objectid; - __u64 offset; - __u32 type; - __u32 len; -}; - -#define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key)) -/* - * the buf is an array of search headers where - * each header is followed by the actual item - * the type field is expanded to 32 bits for alignment - */ -struct btrfs_ioctl_search_args { - struct btrfs_ioctl_search_key key; - char buf[BTRFS_SEARCH_ARGS_BUFSIZE]; -}; - -struct btrfs_ioctl_clone_range_args { - __s64 src_fd; - __u64 src_offset, src_length; - __u64 dest_offset; -}; - -/* flags for the defrag range ioctl */ -#define BTRFS_DEFRAG_RANGE_COMPRESS 1 -#define BTRFS_DEFRAG_RANGE_START_IO 2 - -struct btrfs_ioctl_space_info { - __u64 flags; - __u64 total_bytes; - __u64 used_bytes; -}; - -struct btrfs_ioctl_space_args { - __u64 space_slots; - __u64 total_spaces; - struct btrfs_ioctl_space_info spaces[0]; -}; - -struct btrfs_data_container { - __u32 bytes_left; /* out -- bytes not needed to deliver output */ - __u32 bytes_missing; /* out -- additional bytes needed for result */ - __u32 elem_cnt; /* out */ - __u32 elem_missed; /* out */ - __u64 val[0]; /* out */ -}; - -struct btrfs_ioctl_ino_path_args { - __u64 inum; /* in */ - __u64 size; /* in */ - __u64 reserved[4]; - /* struct btrfs_data_container *fspath; out */ - __u64 fspath; /* out */ -}; - -struct btrfs_ioctl_logical_ino_args { - __u64 logical; /* in */ - __u64 size; /* in */ - __u64 reserved[4]; - /* struct btrfs_data_container *inodes; out */ - __u64 inodes; -}; - -enum btrfs_dev_stat_values { - /* disk I/O failure stats */ - BTRFS_DEV_STAT_WRITE_ERRS, /* EIO or EREMOTEIO from lower layers */ - BTRFS_DEV_STAT_READ_ERRS, /* EIO or EREMOTEIO from lower layers */ - BTRFS_DEV_STAT_FLUSH_ERRS, /* EIO or EREMOTEIO from lower layers */ - - /* stats for indirect indications for I/O failures */ - BTRFS_DEV_STAT_CORRUPTION_ERRS, /* checksum error, bytenr error or - * contents is illegal: this is an - * indication that the block was damaged - * during read or write, or written to - * wrong location or read from wrong - * location */ - BTRFS_DEV_STAT_GENERATION_ERRS, /* an indication that blocks have not - * been written */ - - BTRFS_DEV_STAT_VALUES_MAX -}; - -/* Reset statistics after reading; needs SYS_ADMIN capability */ -#define BTRFS_DEV_STATS_RESET (1ULL << 0) - -struct btrfs_ioctl_get_dev_stats { - __u64 devid; /* in */ - __u64 nr_items; /* in/out */ - __u64 flags; /* in/out */ - - /* out values: */ - __u64 values[BTRFS_DEV_STAT_VALUES_MAX]; - - __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */ -}; - -#define BTRFS_QUOTA_CTL_ENABLE 1 -#define BTRFS_QUOTA_CTL_DISABLE 2 -#define BTRFS_QUOTA_CTL_RESCAN 3 -struct btrfs_ioctl_quota_ctl_args { - __u64 cmd; - __u64 status; -}; - -struct btrfs_ioctl_qgroup_assign_args { - __u64 assign; - __u64 src; - __u64 dst; -}; - -struct btrfs_ioctl_qgroup_create_args { - __u64 create; - __u64 qgroupid; -}; -struct btrfs_ioctl_timespec { - __u64 sec; - __u32 nsec; -}; - -struct btrfs_ioctl_received_subvol_args { - char uuid[BTRFS_UUID_SIZE]; /* in */ - __u64 stransid; /* in */ - __u64 rtransid; /* out */ - struct btrfs_ioctl_timespec stime; /* in */ - struct btrfs_ioctl_timespec rtime; /* out */ - __u64 flags; /* in */ - __u64 reserved[16]; /* in */ -}; - -struct btrfs_ioctl_send_args { - __s64 send_fd; /* in */ - __u64 clone_sources_count; /* in */ - __u64 __user *clone_sources; /* in */ - __u64 parent_root; /* in */ - __u64 flags; /* in */ - __u64 reserved[4]; /* in */ -}; - -#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ - struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ - struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \ - struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \ - struct btrfs_ioctl_vol_args) -/* trans start and trans end are dangerous, and only for - * use by applications that know how to avoid the - * resulting deadlocks - */ -#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6) -#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7) -#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8) - -#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int) -#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \ - struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \ - struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \ - struct btrfs_ioctl_vol_args) - -#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \ - struct btrfs_ioctl_clone_range_args) - -#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \ - struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \ - struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \ - struct btrfs_ioctl_defrag_range_args) -#define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \ - struct btrfs_ioctl_search_args) -#define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \ - struct btrfs_ioctl_ino_lookup_args) -#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64) -#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \ - struct btrfs_ioctl_space_args) -#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64) -#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) -#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \ - struct btrfs_ioctl_vol_args_v2) -#define BTRFS_IOC_SUBVOL_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 24, \ - struct btrfs_ioctl_vol_args_v2) -#define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64) -#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64) -#define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \ - struct btrfs_ioctl_scrub_args) -#define BTRFS_IOC_SCRUB_CANCEL _IO(BTRFS_IOCTL_MAGIC, 28) -#define BTRFS_IOC_SCRUB_PROGRESS _IOWR(BTRFS_IOCTL_MAGIC, 29, \ - struct btrfs_ioctl_scrub_args) -#define BTRFS_IOC_DEV_INFO _IOWR(BTRFS_IOCTL_MAGIC, 30, \ - struct btrfs_ioctl_dev_info_args) -#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ - struct btrfs_ioctl_fs_info_args) -#define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \ - struct btrfs_ioctl_balance_args) -#define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int) -#define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \ - struct btrfs_ioctl_balance_args) -#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ - struct btrfs_ioctl_ino_path_args) -#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ - struct btrfs_ioctl_ino_path_args) -#define BTRFS_IOC_SET_RECEIVED_SUBVOL _IOWR(BTRFS_IOCTL_MAGIC, 37, \ - struct btrfs_ioctl_received_subvol_args) -#define BTRFS_IOC_SEND _IOW(BTRFS_IOCTL_MAGIC, 38, struct btrfs_ioctl_send_args) -#define BTRFS_IOC_DEVICES_READY _IOR(BTRFS_IOCTL_MAGIC, 39, \ - struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_QUOTA_CTL _IOWR(BTRFS_IOCTL_MAGIC, 40, \ - struct btrfs_ioctl_quota_ctl_args) -#define BTRFS_IOC_QGROUP_ASSIGN _IOW(BTRFS_IOCTL_MAGIC, 41, \ - struct btrfs_ioctl_qgroup_assign_args) -#define BTRFS_IOC_QGROUP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 42, \ - struct btrfs_ioctl_qgroup_create_args) -#define BTRFS_IOC_QGROUP_LIMIT _IOR(BTRFS_IOCTL_MAGIC, 43, \ - struct btrfs_ioctl_qgroup_limit_args) -#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ - struct btrfs_ioctl_get_dev_stats) -#define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \ - struct btrfs_ioctl_dev_replace_args) - -#endif diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 2a1762c..e95df43 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -113,11 +113,10 @@ again: read_unlock(&eb->lock); return; } - read_unlock(&eb->lock); - wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); - read_lock(&eb->lock); if (atomic_read(&eb->blocking_writers)) { read_unlock(&eb->lock); + wait_event(eb->write_lock_wq, + atomic_read(&eb->blocking_writers) == 0); goto again; } atomic_inc(&eb->read_locks); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index e5ed567..dc08d77 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -196,6 +196,9 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry->file_offset = file_offset; entry->start = start; entry->len = len; + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) && + !(type == BTRFS_ORDERED_NOCOW)) + entry->csum_bytes_left = disk_len; entry->disk_len = disk_len; entry->bytes_left = len; entry->inode = igrab(inode); @@ -213,6 +216,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, INIT_LIST_HEAD(&entry->root_extent_list); INIT_LIST_HEAD(&entry->work_list); init_completion(&entry->completion); + INIT_LIST_HEAD(&entry->log_list); trace_btrfs_ordered_extent_add(inode, entry); @@ -270,6 +274,10 @@ void btrfs_add_ordered_sum(struct inode *inode, tree = &BTRFS_I(inode)->ordered_tree; spin_lock_irq(&tree->lock); list_add_tail(&sum->list, &entry->list); + WARN_ON(entry->csum_bytes_left < sum->len); + entry->csum_bytes_left -= sum->len; + if (entry->csum_bytes_left == 0) + wake_up(&entry->wait); spin_unlock_irq(&tree->lock); } @@ -405,6 +413,66 @@ out: return ret == 0; } +/* Needs to either be called under a log transaction or the log_mutex */ +void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode) +{ + struct btrfs_ordered_inode_tree *tree; + struct btrfs_ordered_extent *ordered; + struct rb_node *n; + int index = log->log_transid % 2; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock_irq(&tree->lock); + for (n = rb_first(&tree->tree); n; n = rb_next(n)) { + ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); + spin_lock(&log->log_extents_lock[index]); + if (list_empty(&ordered->log_list)) { + list_add_tail(&ordered->log_list, &log->logged_list[index]); + atomic_inc(&ordered->refs); + } + spin_unlock(&log->log_extents_lock[index]); + } + spin_unlock_irq(&tree->lock); +} + +void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) +{ + struct btrfs_ordered_extent *ordered; + int index = transid % 2; + + spin_lock_irq(&log->log_extents_lock[index]); + while (!list_empty(&log->logged_list[index])) { + ordered = list_first_entry(&log->logged_list[index], + struct btrfs_ordered_extent, + log_list); + list_del_init(&ordered->log_list); + spin_unlock_irq(&log->log_extents_lock[index]); + wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, + &ordered->flags)); + btrfs_put_ordered_extent(ordered); + spin_lock_irq(&log->log_extents_lock[index]); + } + spin_unlock_irq(&log->log_extents_lock[index]); +} + +void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid) +{ + struct btrfs_ordered_extent *ordered; + int index = transid % 2; + + spin_lock_irq(&log->log_extents_lock[index]); + while (!list_empty(&log->logged_list[index])) { + ordered = list_first_entry(&log->logged_list[index], + struct btrfs_ordered_extent, + log_list); + list_del_init(&ordered->log_list); + spin_unlock_irq(&log->log_extents_lock[index]); + btrfs_put_ordered_extent(ordered); + spin_lock_irq(&log->log_extents_lock[index]); + } + spin_unlock_irq(&log->log_extents_lock[index]); +} + /* * used to drop a reference on an ordered extent. This will free * the extent if the last reference is dropped @@ -544,10 +612,12 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) * extra check to make sure the ordered operation list really is empty * before we return */ -int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) +int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int wait) { struct btrfs_inode *btrfs_inode; struct inode *inode; + struct btrfs_transaction *cur_trans = trans->transaction; struct list_head splice; struct list_head works; struct btrfs_delalloc_work *work, *next; @@ -558,14 +628,10 @@ int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) mutex_lock(&root->fs_info->ordered_operations_mutex); spin_lock(&root->fs_info->ordered_extent_lock); -again: - list_splice_init(&root->fs_info->ordered_operations, &splice); - + list_splice_init(&cur_trans->ordered_operations, &splice); while (!list_empty(&splice)) { - btrfs_inode = list_entry(splice.next, struct btrfs_inode, ordered_operations); - inode = &btrfs_inode->vfs_inode; list_del_init(&btrfs_inode->ordered_operations); @@ -574,24 +640,22 @@ again: * the inode may be getting freed (in sys_unlink path). */ inode = igrab(inode); - - if (!wait && inode) { - list_add_tail(&BTRFS_I(inode)->ordered_operations, - &root->fs_info->ordered_operations); - } - if (!inode) continue; + + if (!wait) + list_add_tail(&BTRFS_I(inode)->ordered_operations, + &cur_trans->ordered_operations); spin_unlock(&root->fs_info->ordered_extent_lock); work = btrfs_alloc_delalloc_work(inode, wait, 1); if (!work) { + spin_lock(&root->fs_info->ordered_extent_lock); if (list_empty(&BTRFS_I(inode)->ordered_operations)) list_add_tail(&btrfs_inode->ordered_operations, &splice); - spin_lock(&root->fs_info->ordered_extent_lock); list_splice_tail(&splice, - &root->fs_info->ordered_operations); + &cur_trans->ordered_operations); spin_unlock(&root->fs_info->ordered_extent_lock); ret = -ENOMEM; goto out; @@ -603,9 +667,6 @@ again: cond_resched(); spin_lock(&root->fs_info->ordered_extent_lock); } - if (wait && !list_empty(&root->fs_info->ordered_operations)) - goto again; - spin_unlock(&root->fs_info->ordered_extent_lock); out: list_for_each_entry_safe(work, next, &works, list) { @@ -974,6 +1035,7 @@ out: void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode) { + struct btrfs_transaction *cur_trans = trans->transaction; u64 last_mod; last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans); @@ -988,7 +1050,7 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, spin_lock(&root->fs_info->ordered_extent_lock); if (list_empty(&BTRFS_I(inode)->ordered_operations)) { list_add_tail(&BTRFS_I(inode)->ordered_operations, - &root->fs_info->ordered_operations); + &cur_trans->ordered_operations); } spin_unlock(&root->fs_info->ordered_extent_lock); } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index f29d4bf5..8eadfe4 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -79,6 +79,8 @@ struct btrfs_ordered_sum { #define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent * has done its due diligence in updating * the isize. */ +#define BTRFS_ORDERED_LOGGED_CSUM 8 /* We've logged the csums on this ordered + ordered extent */ struct btrfs_ordered_extent { /* logical offset in the file */ @@ -96,6 +98,9 @@ struct btrfs_ordered_extent { /* number of bytes that still need writing */ u64 bytes_left; + /* number of bytes that still need csumming */ + u64 csum_bytes_left; + /* * the end of the ordered extent which is behind it but * didn't update disk_i_size. Please see the comment of @@ -118,6 +123,9 @@ struct btrfs_ordered_extent { /* list of checksums for insertion when the extent io is done */ struct list_head list; + /* If we need to wait on this to be done */ + struct list_head log_list; + /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ wait_queue_head_t wait; @@ -189,11 +197,15 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); -int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); +int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int wait); void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput); +void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode); +void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); +void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); int __init ordered_data_init(void); void ordered_data_exit(void); #endif diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 50d95fd..920957e 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -294,6 +294,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) btrfs_dev_extent_chunk_offset(l, dev_extent), (unsigned long long) btrfs_dev_extent_length(l, dev_extent)); + break; case BTRFS_DEV_STATS_KEY: printk(KERN_INFO "\t\tdevice stats\n"); break; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index a5c8562..aee4b1c 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -23,13 +23,13 @@ #include <linux/rbtree.h> #include <linux/slab.h> #include <linux/workqueue.h> +#include <linux/btrfs.h> #include "ctree.h" #include "transaction.h" #include "disk-io.h" #include "locking.h" #include "ulist.h" -#include "ioctl.h" #include "backref.h" /* TODO XXX FIXME @@ -620,7 +620,9 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, key.offset = qgroupid; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret > 0) ret = -ENOENT; @@ -661,7 +663,9 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, key.offset = qgroup->qgroupid; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret > 0) ret = -ENOENT; @@ -702,7 +706,9 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans, key.offset = 0; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret > 0) ret = -ENOENT; @@ -732,33 +738,38 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, { struct btrfs_path *path; struct btrfs_key key; + struct extent_buffer *leaf = NULL; int ret; - - if (!root) - return -EINVAL; + int nr = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - while (1) { - key.objectid = 0; - key.offset = 0; - key.type = 0; + path->leave_spinning = 1; - path->leave_spinning = 1; + key.objectid = 0; + key.offset = 0; + key.type = 0; + + while (1) { ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret > 0) { - if (path->slots[0] == 0) - break; - path->slots[0]--; - } else if (ret < 0) { + if (ret < 0) + goto out; + leaf = path->nodes[0]; + nr = btrfs_header_nritems(leaf); + if (!nr) break; - } - - ret = btrfs_del_item(trans, root, path); + /* + * delete the leaf one by one + * since the whole tree is going + * to be deleted. + */ + path->slots[0] = 0; + ret = btrfs_del_items(trans, root, path, 0, nr); if (ret) goto out; + btrfs_release_path(path); } ret = 0; @@ -847,6 +858,10 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, int ret = 0; spin_lock(&fs_info->qgroup_lock); + if (!fs_info->quota_root) { + spin_unlock(&fs_info->qgroup_lock); + return 0; + } fs_info->quota_enabled = 0; fs_info->pending_quota_state = 0; quota_root = fs_info->quota_root; diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c new file mode 100644 index 0000000..9a79fb7 --- /dev/null +++ b/fs/btrfs/raid56.c @@ -0,0 +1,2100 @@ +/* + * Copyright (C) 2012 Fusion-io All rights reserved. + * Copyright (C) 2012 Intel Corp. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include <linux/sched.h> +#include <linux/wait.h> +#include <linux/bio.h> +#include <linux/slab.h> +#include <linux/buffer_head.h> +#include <linux/blkdev.h> +#include <linux/random.h> +#include <linux/iocontext.h> +#include <linux/capability.h> +#include <linux/ratelimit.h> +#include <linux/kthread.h> +#include <linux/raid/pq.h> +#include <linux/hash.h> +#include <linux/list_sort.h> +#include <linux/raid/xor.h> +#include <linux/vmalloc.h> +#include <asm/div64.h> +#include "compat.h" +#include "ctree.h" +#include "extent_map.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "volumes.h" +#include "raid56.h" +#include "async-thread.h" +#include "check-integrity.h" +#include "rcu-string.h" + +/* set when additional merges to this rbio are not allowed */ +#define RBIO_RMW_LOCKED_BIT 1 + +/* + * set when this rbio is sitting in the hash, but it is just a cache + * of past RMW + */ +#define RBIO_CACHE_BIT 2 + +/* + * set when it is safe to trust the stripe_pages for caching + */ +#define RBIO_CACHE_READY_BIT 3 + + +#define RBIO_CACHE_SIZE 1024 + +struct btrfs_raid_bio { + struct btrfs_fs_info *fs_info; + struct btrfs_bio *bbio; + + /* + * logical block numbers for the start of each stripe + * The last one or two are p/q. These are sorted, + * so raid_map[0] is the start of our full stripe + */ + u64 *raid_map; + + /* while we're doing rmw on a stripe + * we put it into a hash table so we can + * lock the stripe and merge more rbios + * into it. + */ + struct list_head hash_list; + + /* + * LRU list for the stripe cache + */ + struct list_head stripe_cache; + + /* + * for scheduling work in the helper threads + */ + struct btrfs_work work; + + /* + * bio list and bio_list_lock are used + * to add more bios into the stripe + * in hopes of avoiding the full rmw + */ + struct bio_list bio_list; + spinlock_t bio_list_lock; + + /* also protected by the bio_list_lock, the + * plug list is used by the plugging code + * to collect partial bios while plugged. The + * stripe locking code also uses it to hand off + * the stripe lock to the next pending IO + */ + struct list_head plug_list; + + /* + * flags that tell us if it is safe to + * merge with this bio + */ + unsigned long flags; + + /* size of each individual stripe on disk */ + int stripe_len; + + /* number of data stripes (no p/q) */ + int nr_data; + + /* + * set if we're doing a parity rebuild + * for a read from higher up, which is handled + * differently from a parity rebuild as part of + * rmw + */ + int read_rebuild; + + /* first bad stripe */ + int faila; + + /* second bad stripe (for raid6 use) */ + int failb; + + /* + * number of pages needed to represent the full + * stripe + */ + int nr_pages; + + /* + * size of all the bios in the bio_list. This + * helps us decide if the rbio maps to a full + * stripe or not + */ + int bio_list_bytes; + + atomic_t refs; + + /* + * these are two arrays of pointers. We allocate the + * rbio big enough to hold them both and setup their + * locations when the rbio is allocated + */ + + /* pointers to pages that we allocated for + * reading/writing stripes directly from the disk (including P/Q) + */ + struct page **stripe_pages; + + /* + * pointers to the pages in the bio_list. Stored + * here for faster lookup + */ + struct page **bio_pages; +}; + +static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); +static noinline void finish_rmw(struct btrfs_raid_bio *rbio); +static void rmw_work(struct btrfs_work *work); +static void read_rebuild_work(struct btrfs_work *work); +static void async_rmw_stripe(struct btrfs_raid_bio *rbio); +static void async_read_rebuild(struct btrfs_raid_bio *rbio); +static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); +static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); +static void __free_raid_bio(struct btrfs_raid_bio *rbio); +static void index_rbio_pages(struct btrfs_raid_bio *rbio); +static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); + +/* + * the stripe hash table is used for locking, and to collect + * bios in hopes of making a full stripe + */ +int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) +{ + struct btrfs_stripe_hash_table *table; + struct btrfs_stripe_hash_table *x; + struct btrfs_stripe_hash *cur; + struct btrfs_stripe_hash *h; + int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; + int i; + int table_size; + + if (info->stripe_hash_table) + return 0; + + /* + * The table is large, starting with order 4 and can go as high as + * order 7 in case lock debugging is turned on. + * + * Try harder to allocate and fallback to vmalloc to lower the chance + * of a failing mount. + */ + table_size = sizeof(*table) + sizeof(*h) * num_entries; + table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); + if (!table) { + table = vzalloc(table_size); + if (!table) + return -ENOMEM; + } + + spin_lock_init(&table->cache_lock); + INIT_LIST_HEAD(&table->stripe_cache); + + h = table->table; + + for (i = 0; i < num_entries; i++) { + cur = h + i; + INIT_LIST_HEAD(&cur->hash_list); + spin_lock_init(&cur->lock); + init_waitqueue_head(&cur->wait); + } + + x = cmpxchg(&info->stripe_hash_table, NULL, table); + if (x) { + if (is_vmalloc_addr(x)) + vfree(x); + else + kfree(x); + } + return 0; +} + +/* + * caching an rbio means to copy anything from the + * bio_pages array into the stripe_pages array. We + * use the page uptodate bit in the stripe cache array + * to indicate if it has valid data + * + * once the caching is done, we set the cache ready + * bit. + */ +static void cache_rbio_pages(struct btrfs_raid_bio *rbio) +{ + int i; + char *s; + char *d; + int ret; + + ret = alloc_rbio_pages(rbio); + if (ret) + return; + + for (i = 0; i < rbio->nr_pages; i++) { + if (!rbio->bio_pages[i]) + continue; + + s = kmap(rbio->bio_pages[i]); + d = kmap(rbio->stripe_pages[i]); + + memcpy(d, s, PAGE_CACHE_SIZE); + + kunmap(rbio->bio_pages[i]); + kunmap(rbio->stripe_pages[i]); + SetPageUptodate(rbio->stripe_pages[i]); + } + set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); +} + +/* + * we hash on the first logical address of the stripe + */ +static int rbio_bucket(struct btrfs_raid_bio *rbio) +{ + u64 num = rbio->raid_map[0]; + + /* + * we shift down quite a bit. We're using byte + * addressing, and most of the lower bits are zeros. + * This tends to upset hash_64, and it consistently + * returns just one or two different values. + * + * shifting off the lower bits fixes things. + */ + return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); +} + +/* + * stealing an rbio means taking all the uptodate pages from the stripe + * array in the source rbio and putting them into the destination rbio + */ +static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) +{ + int i; + struct page *s; + struct page *d; + + if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) + return; + + for (i = 0; i < dest->nr_pages; i++) { + s = src->stripe_pages[i]; + if (!s || !PageUptodate(s)) { + continue; + } + + d = dest->stripe_pages[i]; + if (d) + __free_page(d); + + dest->stripe_pages[i] = s; + src->stripe_pages[i] = NULL; + } +} + +/* + * merging means we take the bio_list from the victim and + * splice it into the destination. The victim should + * be discarded afterwards. + * + * must be called with dest->rbio_list_lock held + */ +static void merge_rbio(struct btrfs_raid_bio *dest, + struct btrfs_raid_bio *victim) +{ + bio_list_merge(&dest->bio_list, &victim->bio_list); + dest->bio_list_bytes += victim->bio_list_bytes; + bio_list_init(&victim->bio_list); +} + +/* + * used to prune items that are in the cache. The caller + * must hold the hash table lock. + */ +static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) +{ + int bucket = rbio_bucket(rbio); + struct btrfs_stripe_hash_table *table; + struct btrfs_stripe_hash *h; + int freeit = 0; + + /* + * check the bit again under the hash table lock. + */ + if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) + return; + + table = rbio->fs_info->stripe_hash_table; + h = table->table + bucket; + + /* hold the lock for the bucket because we may be + * removing it from the hash table + */ + spin_lock(&h->lock); + + /* + * hold the lock for the bio list because we need + * to make sure the bio list is empty + */ + spin_lock(&rbio->bio_list_lock); + + if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) { + list_del_init(&rbio->stripe_cache); + table->cache_size -= 1; + freeit = 1; + + /* if the bio list isn't empty, this rbio is + * still involved in an IO. We take it out + * of the cache list, and drop the ref that + * was held for the list. + * + * If the bio_list was empty, we also remove + * the rbio from the hash_table, and drop + * the corresponding ref + */ + if (bio_list_empty(&rbio->bio_list)) { + if (!list_empty(&rbio->hash_list)) { + list_del_init(&rbio->hash_list); + atomic_dec(&rbio->refs); + BUG_ON(!list_empty(&rbio->plug_list)); + } + } + } + + spin_unlock(&rbio->bio_list_lock); + spin_unlock(&h->lock); + + if (freeit) + __free_raid_bio(rbio); +} + +/* + * prune a given rbio from the cache + */ +static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) +{ + struct btrfs_stripe_hash_table *table; + unsigned long flags; + + if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) + return; + + table = rbio->fs_info->stripe_hash_table; + + spin_lock_irqsave(&table->cache_lock, flags); + __remove_rbio_from_cache(rbio); + spin_unlock_irqrestore(&table->cache_lock, flags); +} + +/* + * remove everything in the cache + */ +void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) +{ + struct btrfs_stripe_hash_table *table; + unsigned long flags; + struct btrfs_raid_bio *rbio; + + table = info->stripe_hash_table; + + spin_lock_irqsave(&table->cache_lock, flags); + while (!list_empty(&table->stripe_cache)) { + rbio = list_entry(table->stripe_cache.next, + struct btrfs_raid_bio, + stripe_cache); + __remove_rbio_from_cache(rbio); + } + spin_unlock_irqrestore(&table->cache_lock, flags); +} + +/* + * remove all cached entries and free the hash table + * used by unmount + */ +void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) +{ + if (!info->stripe_hash_table) + return; + btrfs_clear_rbio_cache(info); + if (is_vmalloc_addr(info->stripe_hash_table)) + vfree(info->stripe_hash_table); + else + kfree(info->stripe_hash_table); + info->stripe_hash_table = NULL; +} + +/* + * insert an rbio into the stripe cache. It + * must have already been prepared by calling + * cache_rbio_pages + * + * If this rbio was already cached, it gets + * moved to the front of the lru. + * + * If the size of the rbio cache is too big, we + * prune an item. + */ +static void cache_rbio(struct btrfs_raid_bio *rbio) +{ + struct btrfs_stripe_hash_table *table; + unsigned long flags; + + if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) + return; + + table = rbio->fs_info->stripe_hash_table; + + spin_lock_irqsave(&table->cache_lock, flags); + spin_lock(&rbio->bio_list_lock); + + /* bump our ref if we were not in the list before */ + if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags)) + atomic_inc(&rbio->refs); + + if (!list_empty(&rbio->stripe_cache)){ + list_move(&rbio->stripe_cache, &table->stripe_cache); + } else { + list_add(&rbio->stripe_cache, &table->stripe_cache); + table->cache_size += 1; + } + + spin_unlock(&rbio->bio_list_lock); + + if (table->cache_size > RBIO_CACHE_SIZE) { + struct btrfs_raid_bio *found; + + found = list_entry(table->stripe_cache.prev, + struct btrfs_raid_bio, + stripe_cache); + + if (found != rbio) + __remove_rbio_from_cache(found); + } + + spin_unlock_irqrestore(&table->cache_lock, flags); + return; +} + +/* + * helper function to run the xor_blocks api. It is only + * able to do MAX_XOR_BLOCKS at a time, so we need to + * loop through. + */ +static void run_xor(void **pages, int src_cnt, ssize_t len) +{ + int src_off = 0; + int xor_src_cnt = 0; + void *dest = pages[src_cnt]; + + while(src_cnt > 0) { + xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS); + xor_blocks(xor_src_cnt, len, dest, pages + src_off); + + src_cnt -= xor_src_cnt; + src_off += xor_src_cnt; + } +} + +/* + * returns true if the bio list inside this rbio + * covers an entire stripe (no rmw required). + * Must be called with the bio list lock held, or + * at a time when you know it is impossible to add + * new bios into the list + */ +static int __rbio_is_full(struct btrfs_raid_bio *rbio) +{ + unsigned long size = rbio->bio_list_bytes; + int ret = 1; + + if (size != rbio->nr_data * rbio->stripe_len) + ret = 0; + + BUG_ON(size > rbio->nr_data * rbio->stripe_len); + return ret; +} + +static int rbio_is_full(struct btrfs_raid_bio *rbio) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&rbio->bio_list_lock, flags); + ret = __rbio_is_full(rbio); + spin_unlock_irqrestore(&rbio->bio_list_lock, flags); + return ret; +} + +/* + * returns 1 if it is safe to merge two rbios together. + * The merging is safe if the two rbios correspond to + * the same stripe and if they are both going in the same + * direction (read vs write), and if neither one is + * locked for final IO + * + * The caller is responsible for locking such that + * rmw_locked is safe to test + */ +static int rbio_can_merge(struct btrfs_raid_bio *last, + struct btrfs_raid_bio *cur) +{ + if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) || + test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) + return 0; + + /* + * we can't merge with cached rbios, since the + * idea is that when we merge the destination + * rbio is going to run our IO for us. We can + * steal from cached rbio's though, other functions + * handle that. + */ + if (test_bit(RBIO_CACHE_BIT, &last->flags) || + test_bit(RBIO_CACHE_BIT, &cur->flags)) + return 0; + + if (last->raid_map[0] != + cur->raid_map[0]) + return 0; + + /* reads can't merge with writes */ + if (last->read_rebuild != + cur->read_rebuild) { + return 0; + } + + return 1; +} + +/* + * helper to index into the pstripe + */ +static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) +{ + index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; + return rbio->stripe_pages[index]; +} + +/* + * helper to index into the qstripe, returns null + * if there is no qstripe + */ +static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) +{ + if (rbio->nr_data + 1 == rbio->bbio->num_stripes) + return NULL; + + index += ((rbio->nr_data + 1) * rbio->stripe_len) >> + PAGE_CACHE_SHIFT; + return rbio->stripe_pages[index]; +} + +/* + * The first stripe in the table for a logical address + * has the lock. rbios are added in one of three ways: + * + * 1) Nobody has the stripe locked yet. The rbio is given + * the lock and 0 is returned. The caller must start the IO + * themselves. + * + * 2) Someone has the stripe locked, but we're able to merge + * with the lock owner. The rbio is freed and the IO will + * start automatically along with the existing rbio. 1 is returned. + * + * 3) Someone has the stripe locked, but we're not able to merge. + * The rbio is added to the lock owner's plug list, or merged into + * an rbio already on the plug list. When the lock owner unlocks, + * the next rbio on the list is run and the IO is started automatically. + * 1 is returned + * + * If we return 0, the caller still owns the rbio and must continue with + * IO submission. If we return 1, the caller must assume the rbio has + * already been freed. + */ +static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) +{ + int bucket = rbio_bucket(rbio); + struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket; + struct btrfs_raid_bio *cur; + struct btrfs_raid_bio *pending; + unsigned long flags; + DEFINE_WAIT(wait); + struct btrfs_raid_bio *freeit = NULL; + struct btrfs_raid_bio *cache_drop = NULL; + int ret = 0; + int walk = 0; + + spin_lock_irqsave(&h->lock, flags); + list_for_each_entry(cur, &h->hash_list, hash_list) { + walk++; + if (cur->raid_map[0] == rbio->raid_map[0]) { + spin_lock(&cur->bio_list_lock); + + /* can we steal this cached rbio's pages? */ + if (bio_list_empty(&cur->bio_list) && + list_empty(&cur->plug_list) && + test_bit(RBIO_CACHE_BIT, &cur->flags) && + !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { + list_del_init(&cur->hash_list); + atomic_dec(&cur->refs); + + steal_rbio(cur, rbio); + cache_drop = cur; + spin_unlock(&cur->bio_list_lock); + + goto lockit; + } + + /* can we merge into the lock owner? */ + if (rbio_can_merge(cur, rbio)) { + merge_rbio(cur, rbio); + spin_unlock(&cur->bio_list_lock); + freeit = rbio; + ret = 1; + goto out; + } + + + /* + * we couldn't merge with the running + * rbio, see if we can merge with the + * pending ones. We don't have to + * check for rmw_locked because there + * is no way they are inside finish_rmw + * right now + */ + list_for_each_entry(pending, &cur->plug_list, + plug_list) { + if (rbio_can_merge(pending, rbio)) { + merge_rbio(pending, rbio); + spin_unlock(&cur->bio_list_lock); + freeit = rbio; + ret = 1; + goto out; + } + } + + /* no merging, put us on the tail of the plug list, + * our rbio will be started with the currently + * running rbio unlocks + */ + list_add_tail(&rbio->plug_list, &cur->plug_list); + spin_unlock(&cur->bio_list_lock); + ret = 1; + goto out; + } + } +lockit: + atomic_inc(&rbio->refs); + list_add(&rbio->hash_list, &h->hash_list); +out: + spin_unlock_irqrestore(&h->lock, flags); + if (cache_drop) + remove_rbio_from_cache(cache_drop); + if (freeit) + __free_raid_bio(freeit); + return ret; +} + +/* + * called as rmw or parity rebuild is completed. If the plug list has more + * rbios waiting for this stripe, the next one on the list will be started + */ +static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) +{ + int bucket; + struct btrfs_stripe_hash *h; + unsigned long flags; + int keep_cache = 0; + + bucket = rbio_bucket(rbio); + h = rbio->fs_info->stripe_hash_table->table + bucket; + + if (list_empty(&rbio->plug_list)) + cache_rbio(rbio); + + spin_lock_irqsave(&h->lock, flags); + spin_lock(&rbio->bio_list_lock); + + if (!list_empty(&rbio->hash_list)) { + /* + * if we're still cached and there is no other IO + * to perform, just leave this rbio here for others + * to steal from later + */ + if (list_empty(&rbio->plug_list) && + test_bit(RBIO_CACHE_BIT, &rbio->flags)) { + keep_cache = 1; + clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); + BUG_ON(!bio_list_empty(&rbio->bio_list)); + goto done; + } + + list_del_init(&rbio->hash_list); + atomic_dec(&rbio->refs); + + /* + * we use the plug list to hold all the rbios + * waiting for the chance to lock this stripe. + * hand the lock over to one of them. + */ + if (!list_empty(&rbio->plug_list)) { + struct btrfs_raid_bio *next; + struct list_head *head = rbio->plug_list.next; + + next = list_entry(head, struct btrfs_raid_bio, + plug_list); + + list_del_init(&rbio->plug_list); + + list_add(&next->hash_list, &h->hash_list); + atomic_inc(&next->refs); + spin_unlock(&rbio->bio_list_lock); + spin_unlock_irqrestore(&h->lock, flags); + + if (next->read_rebuild) + async_read_rebuild(next); + else { + steal_rbio(rbio, next); + async_rmw_stripe(next); + } + + goto done_nolock; + } else if (waitqueue_active(&h->wait)) { + spin_unlock(&rbio->bio_list_lock); + spin_unlock_irqrestore(&h->lock, flags); + wake_up(&h->wait); + goto done_nolock; + } + } +done: + spin_unlock(&rbio->bio_list_lock); + spin_unlock_irqrestore(&h->lock, flags); + +done_nolock: + if (!keep_cache) + remove_rbio_from_cache(rbio); +} + +static void __free_raid_bio(struct btrfs_raid_bio *rbio) +{ + int i; + + WARN_ON(atomic_read(&rbio->refs) < 0); + if (!atomic_dec_and_test(&rbio->refs)) + return; + + WARN_ON(!list_empty(&rbio->stripe_cache)); + WARN_ON(!list_empty(&rbio->hash_list)); + WARN_ON(!bio_list_empty(&rbio->bio_list)); + + for (i = 0; i < rbio->nr_pages; i++) { + if (rbio->stripe_pages[i]) { + __free_page(rbio->stripe_pages[i]); + rbio->stripe_pages[i] = NULL; + } + } + kfree(rbio->raid_map); + kfree(rbio->bbio); + kfree(rbio); +} + +static void free_raid_bio(struct btrfs_raid_bio *rbio) +{ + unlock_stripe(rbio); + __free_raid_bio(rbio); +} + +/* + * this frees the rbio and runs through all the bios in the + * bio_list and calls end_io on them + */ +static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate) +{ + struct bio *cur = bio_list_get(&rbio->bio_list); + struct bio *next; + free_raid_bio(rbio); + + while (cur) { + next = cur->bi_next; + cur->bi_next = NULL; + if (uptodate) + set_bit(BIO_UPTODATE, &cur->bi_flags); + bio_endio(cur, err); + cur = next; + } +} + +/* + * end io function used by finish_rmw. When we finally + * get here, we've written a full stripe + */ +static void raid_write_end_io(struct bio *bio, int err) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + + if (err) + fail_bio_stripe(rbio, bio); + + bio_put(bio); + + if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) + return; + + err = 0; + + /* OK, we have read all the stripes we need to. */ + if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) + err = -EIO; + + rbio_orig_end_io(rbio, err, 0); + return; +} + +/* + * the read/modify/write code wants to use the original bio for + * any pages it included, and then use the rbio for everything + * else. This function decides if a given index (stripe number) + * and page number in that stripe fall inside the original bio + * or the rbio. + * + * if you set bio_list_only, you'll get a NULL back for any ranges + * that are outside the bio_list + * + * This doesn't take any refs on anything, you get a bare page pointer + * and the caller must bump refs as required. + * + * You must call index_rbio_pages once before you can trust + * the answers from this function. + */ +static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, + int index, int pagenr, int bio_list_only) +{ + int chunk_page; + struct page *p = NULL; + + chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr; + + spin_lock_irq(&rbio->bio_list_lock); + p = rbio->bio_pages[chunk_page]; + spin_unlock_irq(&rbio->bio_list_lock); + + if (p || bio_list_only) + return p; + + return rbio->stripe_pages[chunk_page]; +} + +/* + * number of pages we need for the entire stripe across all the + * drives + */ +static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) +{ + unsigned long nr = stripe_len * nr_stripes; + return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; +} + +/* + * allocation and initial setup for the btrfs_raid_bio. Not + * this does not allocate any pages for rbio->pages. + */ +static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, + struct btrfs_bio *bbio, u64 *raid_map, + u64 stripe_len) +{ + struct btrfs_raid_bio *rbio; + int nr_data = 0; + int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes); + void *p; + + rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2, + GFP_NOFS); + if (!rbio) { + kfree(raid_map); + kfree(bbio); + return ERR_PTR(-ENOMEM); + } + + bio_list_init(&rbio->bio_list); + INIT_LIST_HEAD(&rbio->plug_list); + spin_lock_init(&rbio->bio_list_lock); + INIT_LIST_HEAD(&rbio->stripe_cache); + INIT_LIST_HEAD(&rbio->hash_list); + rbio->bbio = bbio; + rbio->raid_map = raid_map; + rbio->fs_info = root->fs_info; + rbio->stripe_len = stripe_len; + rbio->nr_pages = num_pages; + rbio->faila = -1; + rbio->failb = -1; + atomic_set(&rbio->refs, 1); + + /* + * the stripe_pages and bio_pages array point to the extra + * memory we allocated past the end of the rbio + */ + p = rbio + 1; + rbio->stripe_pages = p; + rbio->bio_pages = p + sizeof(struct page *) * num_pages; + + if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE) + nr_data = bbio->num_stripes - 2; + else + nr_data = bbio->num_stripes - 1; + + rbio->nr_data = nr_data; + return rbio; +} + +/* allocate pages for all the stripes in the bio, including parity */ +static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) +{ + int i; + struct page *page; + + for (i = 0; i < rbio->nr_pages; i++) { + if (rbio->stripe_pages[i]) + continue; + page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!page) + return -ENOMEM; + rbio->stripe_pages[i] = page; + ClearPageUptodate(page); + } + return 0; +} + +/* allocate pages for just the p/q stripes */ +static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) +{ + int i; + struct page *page; + + i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; + + for (; i < rbio->nr_pages; i++) { + if (rbio->stripe_pages[i]) + continue; + page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!page) + return -ENOMEM; + rbio->stripe_pages[i] = page; + } + return 0; +} + +/* + * add a single page from a specific stripe into our list of bios for IO + * this will try to merge into existing bios if possible, and returns + * zero if all went well. + */ +int rbio_add_io_page(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list, + struct page *page, + int stripe_nr, + unsigned long page_index, + unsigned long bio_max_len) +{ + struct bio *last = bio_list->tail; + u64 last_end = 0; + int ret; + struct bio *bio; + struct btrfs_bio_stripe *stripe; + u64 disk_start; + + stripe = &rbio->bbio->stripes[stripe_nr]; + disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT); + + /* if the device is missing, just fail this stripe */ + if (!stripe->dev->bdev) + return fail_rbio_index(rbio, stripe_nr); + + /* see if we can add this page onto our existing bio */ + if (last) { + last_end = (u64)last->bi_sector << 9; + last_end += last->bi_size; + + /* + * we can't merge these if they are from different + * devices or if they are not contiguous + */ + if (last_end == disk_start && stripe->dev->bdev && + test_bit(BIO_UPTODATE, &last->bi_flags) && + last->bi_bdev == stripe->dev->bdev) { + ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0); + if (ret == PAGE_CACHE_SIZE) + return 0; + } + } + + /* put a new bio on the list */ + bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1); + if (!bio) + return -ENOMEM; + + bio->bi_size = 0; + bio->bi_bdev = stripe->dev->bdev; + bio->bi_sector = disk_start >> 9; + set_bit(BIO_UPTODATE, &bio->bi_flags); + + bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); + bio_list_add(bio_list, bio); + return 0; +} + +/* + * while we're doing the read/modify/write cycle, we could + * have errors in reading pages off the disk. This checks + * for errors and if we're not able to read the page it'll + * trigger parity reconstruction. The rmw will be finished + * after we've reconstructed the failed stripes + */ +static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) +{ + if (rbio->faila >= 0 || rbio->failb >= 0) { + BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1); + __raid56_parity_recover(rbio); + } else { + finish_rmw(rbio); + } +} + +/* + * these are just the pages from the rbio array, not from anything + * the FS sent down to us + */ +static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page) +{ + int index; + index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT); + index += page; + return rbio->stripe_pages[index]; +} + +/* + * helper function to walk our bio list and populate the bio_pages array with + * the result. This seems expensive, but it is faster than constantly + * searching through the bio list as we setup the IO in finish_rmw or stripe + * reconstruction. + * + * This must be called before you trust the answers from page_in_rbio + */ +static void index_rbio_pages(struct btrfs_raid_bio *rbio) +{ + struct bio *bio; + u64 start; + unsigned long stripe_offset; + unsigned long page_index; + struct page *p; + int i; + + spin_lock_irq(&rbio->bio_list_lock); + bio_list_for_each(bio, &rbio->bio_list) { + start = (u64)bio->bi_sector << 9; + stripe_offset = start - rbio->raid_map[0]; + page_index = stripe_offset >> PAGE_CACHE_SHIFT; + + for (i = 0; i < bio->bi_vcnt; i++) { + p = bio->bi_io_vec[i].bv_page; + rbio->bio_pages[page_index + i] = p; + } + } + spin_unlock_irq(&rbio->bio_list_lock); +} + +/* + * this is called from one of two situations. We either + * have a full stripe from the higher layers, or we've read all + * the missing bits off disk. + * + * This will calculate the parity and then send down any + * changed blocks. + */ +static noinline void finish_rmw(struct btrfs_raid_bio *rbio) +{ + struct btrfs_bio *bbio = rbio->bbio; + void *pointers[bbio->num_stripes]; + int stripe_len = rbio->stripe_len; + int nr_data = rbio->nr_data; + int stripe; + int pagenr; + int p_stripe = -1; + int q_stripe = -1; + struct bio_list bio_list; + struct bio *bio; + int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT; + int ret; + + bio_list_init(&bio_list); + + if (bbio->num_stripes - rbio->nr_data == 1) { + p_stripe = bbio->num_stripes - 1; + } else if (bbio->num_stripes - rbio->nr_data == 2) { + p_stripe = bbio->num_stripes - 2; + q_stripe = bbio->num_stripes - 1; + } else { + BUG(); + } + + /* at this point we either have a full stripe, + * or we've read the full stripe from the drive. + * recalculate the parity and write the new results. + * + * We're not allowed to add any new bios to the + * bio list here, anyone else that wants to + * change this stripe needs to do their own rmw. + */ + spin_lock_irq(&rbio->bio_list_lock); + set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); + spin_unlock_irq(&rbio->bio_list_lock); + + atomic_set(&rbio->bbio->error, 0); + + /* + * now that we've set rmw_locked, run through the + * bio list one last time and map the page pointers + * + * We don't cache full rbios because we're assuming + * the higher layers are unlikely to use this area of + * the disk again soon. If they do use it again, + * hopefully they will send another full bio. + */ + index_rbio_pages(rbio); + if (!rbio_is_full(rbio)) + cache_rbio_pages(rbio); + else + clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); + + for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { + struct page *p; + /* first collect one page from each data stripe */ + for (stripe = 0; stripe < nr_data; stripe++) { + p = page_in_rbio(rbio, stripe, pagenr, 0); + pointers[stripe] = kmap(p); + } + + /* then add the parity stripe */ + p = rbio_pstripe_page(rbio, pagenr); + SetPageUptodate(p); + pointers[stripe++] = kmap(p); + + if (q_stripe != -1) { + + /* + * raid6, add the qstripe and call the + * library function to fill in our p/q + */ + p = rbio_qstripe_page(rbio, pagenr); + SetPageUptodate(p); + pointers[stripe++] = kmap(p); + + raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE, + pointers); + } else { + /* raid5 */ + memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); + run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE); + } + + + for (stripe = 0; stripe < bbio->num_stripes; stripe++) + kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); + } + + /* + * time to start writing. Make bios for everything from the + * higher layers (the bio_list in our rbio) and our p/q. Ignore + * everything else. + */ + for (stripe = 0; stripe < bbio->num_stripes; stripe++) { + for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { + struct page *page; + if (stripe < rbio->nr_data) { + page = page_in_rbio(rbio, stripe, pagenr, 1); + if (!page) + continue; + } else { + page = rbio_stripe_page(rbio, stripe, pagenr); + } + + ret = rbio_add_io_page(rbio, &bio_list, + page, stripe, pagenr, rbio->stripe_len); + if (ret) + goto cleanup; + } + } + + atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list)); + BUG_ON(atomic_read(&bbio->stripes_pending) == 0); + + while (1) { + bio = bio_list_pop(&bio_list); + if (!bio) + break; + + bio->bi_private = rbio; + bio->bi_end_io = raid_write_end_io; + BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); + submit_bio(WRITE, bio); + } + return; + +cleanup: + rbio_orig_end_io(rbio, -EIO, 0); +} + +/* + * helper to find the stripe number for a given bio. Used to figure out which + * stripe has failed. This expects the bio to correspond to a physical disk, + * so it looks up based on physical sector numbers. + */ +static int find_bio_stripe(struct btrfs_raid_bio *rbio, + struct bio *bio) +{ + u64 physical = bio->bi_sector; + u64 stripe_start; + int i; + struct btrfs_bio_stripe *stripe; + + physical <<= 9; + + for (i = 0; i < rbio->bbio->num_stripes; i++) { + stripe = &rbio->bbio->stripes[i]; + stripe_start = stripe->physical; + if (physical >= stripe_start && + physical < stripe_start + rbio->stripe_len) { + return i; + } + } + return -1; +} + +/* + * helper to find the stripe number for a given + * bio (before mapping). Used to figure out which stripe has + * failed. This looks up based on logical block numbers. + */ +static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, + struct bio *bio) +{ + u64 logical = bio->bi_sector; + u64 stripe_start; + int i; + + logical <<= 9; + + for (i = 0; i < rbio->nr_data; i++) { + stripe_start = rbio->raid_map[i]; + if (logical >= stripe_start && + logical < stripe_start + rbio->stripe_len) { + return i; + } + } + return -1; +} + +/* + * returns -EIO if we had too many failures + */ +static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&rbio->bio_list_lock, flags); + + /* we already know this stripe is bad, move on */ + if (rbio->faila == failed || rbio->failb == failed) + goto out; + + if (rbio->faila == -1) { + /* first failure on this rbio */ + rbio->faila = failed; + atomic_inc(&rbio->bbio->error); + } else if (rbio->failb == -1) { + /* second failure on this rbio */ + rbio->failb = failed; + atomic_inc(&rbio->bbio->error); + } else { + ret = -EIO; + } +out: + spin_unlock_irqrestore(&rbio->bio_list_lock, flags); + + return ret; +} + +/* + * helper to fail a stripe based on a physical disk + * bio. + */ +static int fail_bio_stripe(struct btrfs_raid_bio *rbio, + struct bio *bio) +{ + int failed = find_bio_stripe(rbio, bio); + + if (failed < 0) + return -EIO; + + return fail_rbio_index(rbio, failed); +} + +/* + * this sets each page in the bio uptodate. It should only be used on private + * rbio pages, nothing that comes in from the higher layers + */ +static void set_bio_pages_uptodate(struct bio *bio) +{ + int i; + struct page *p; + + for (i = 0; i < bio->bi_vcnt; i++) { + p = bio->bi_io_vec[i].bv_page; + SetPageUptodate(p); + } +} + +/* + * end io for the read phase of the rmw cycle. All the bios here are physical + * stripe bios we've read from the disk so we can recalculate the parity of the + * stripe. + * + * This will usually kick off finish_rmw once all the bios are read in, but it + * may trigger parity reconstruction if we had any errors along the way + */ +static void raid_rmw_end_io(struct bio *bio, int err) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + + if (err) + fail_bio_stripe(rbio, bio); + else + set_bio_pages_uptodate(bio); + + bio_put(bio); + + if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) + return; + + err = 0; + if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) + goto cleanup; + + /* + * this will normally call finish_rmw to start our write + * but if there are any failed stripes we'll reconstruct + * from parity first + */ + validate_rbio_for_rmw(rbio); + return; + +cleanup: + + rbio_orig_end_io(rbio, -EIO, 0); +} + +static void async_rmw_stripe(struct btrfs_raid_bio *rbio) +{ + rbio->work.flags = 0; + rbio->work.func = rmw_work; + + btrfs_queue_worker(&rbio->fs_info->rmw_workers, + &rbio->work); +} + +static void async_read_rebuild(struct btrfs_raid_bio *rbio) +{ + rbio->work.flags = 0; + rbio->work.func = read_rebuild_work; + + btrfs_queue_worker(&rbio->fs_info->rmw_workers, + &rbio->work); +} + +/* + * the stripe must be locked by the caller. It will + * unlock after all the writes are done + */ +static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) +{ + int bios_to_read = 0; + struct btrfs_bio *bbio = rbio->bbio; + struct bio_list bio_list; + int ret; + int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + int pagenr; + int stripe; + struct bio *bio; + + bio_list_init(&bio_list); + + ret = alloc_rbio_pages(rbio); + if (ret) + goto cleanup; + + index_rbio_pages(rbio); + + atomic_set(&rbio->bbio->error, 0); + /* + * build a list of bios to read all the missing parts of this + * stripe + */ + for (stripe = 0; stripe < rbio->nr_data; stripe++) { + for (pagenr = 0; pagenr < nr_pages; pagenr++) { + struct page *page; + /* + * we want to find all the pages missing from + * the rbio and read them from the disk. If + * page_in_rbio finds a page in the bio list + * we don't need to read it off the stripe. + */ + page = page_in_rbio(rbio, stripe, pagenr, 1); + if (page) + continue; + + page = rbio_stripe_page(rbio, stripe, pagenr); + /* + * the bio cache may have handed us an uptodate + * page. If so, be happy and use it + */ + if (PageUptodate(page)) + continue; + + ret = rbio_add_io_page(rbio, &bio_list, page, + stripe, pagenr, rbio->stripe_len); + if (ret) + goto cleanup; + } + } + + bios_to_read = bio_list_size(&bio_list); + if (!bios_to_read) { + /* + * this can happen if others have merged with + * us, it means there is nothing left to read. + * But if there are missing devices it may not be + * safe to do the full stripe write yet. + */ + goto finish; + } + + /* + * the bbio may be freed once we submit the last bio. Make sure + * not to touch it after that + */ + atomic_set(&bbio->stripes_pending, bios_to_read); + while (1) { + bio = bio_list_pop(&bio_list); + if (!bio) + break; + + bio->bi_private = rbio; + bio->bi_end_io = raid_rmw_end_io; + + btrfs_bio_wq_end_io(rbio->fs_info, bio, + BTRFS_WQ_ENDIO_RAID56); + + BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); + submit_bio(READ, bio); + } + /* the actual write will happen once the reads are done */ + return 0; + +cleanup: + rbio_orig_end_io(rbio, -EIO, 0); + return -EIO; + +finish: + validate_rbio_for_rmw(rbio); + return 0; +} + +/* + * if the upper layers pass in a full stripe, we thank them by only allocating + * enough pages to hold the parity, and sending it all down quickly. + */ +static int full_stripe_write(struct btrfs_raid_bio *rbio) +{ + int ret; + + ret = alloc_rbio_parity_pages(rbio); + if (ret) + return ret; + + ret = lock_stripe_add(rbio); + if (ret == 0) + finish_rmw(rbio); + return 0; +} + +/* + * partial stripe writes get handed over to async helpers. + * We're really hoping to merge a few more writes into this + * rbio before calculating new parity + */ +static int partial_stripe_write(struct btrfs_raid_bio *rbio) +{ + int ret; + + ret = lock_stripe_add(rbio); + if (ret == 0) + async_rmw_stripe(rbio); + return 0; +} + +/* + * sometimes while we were reading from the drive to + * recalculate parity, enough new bios come into create + * a full stripe. So we do a check here to see if we can + * go directly to finish_rmw + */ +static int __raid56_parity_write(struct btrfs_raid_bio *rbio) +{ + /* head off into rmw land if we don't have a full stripe */ + if (!rbio_is_full(rbio)) + return partial_stripe_write(rbio); + return full_stripe_write(rbio); +} + +/* + * We use plugging call backs to collect full stripes. + * Any time we get a partial stripe write while plugged + * we collect it into a list. When the unplug comes down, + * we sort the list by logical block number and merge + * everything we can into the same rbios + */ +struct btrfs_plug_cb { + struct blk_plug_cb cb; + struct btrfs_fs_info *info; + struct list_head rbio_list; + struct btrfs_work work; +}; + +/* + * rbios on the plug list are sorted for easier merging. + */ +static int plug_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, + plug_list); + struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, + plug_list); + u64 a_sector = ra->bio_list.head->bi_sector; + u64 b_sector = rb->bio_list.head->bi_sector; + + if (a_sector < b_sector) + return -1; + if (a_sector > b_sector) + return 1; + return 0; +} + +static void run_plug(struct btrfs_plug_cb *plug) +{ + struct btrfs_raid_bio *cur; + struct btrfs_raid_bio *last = NULL; + + /* + * sort our plug list then try to merge + * everything we can in hopes of creating full + * stripes. + */ + list_sort(NULL, &plug->rbio_list, plug_cmp); + while (!list_empty(&plug->rbio_list)) { + cur = list_entry(plug->rbio_list.next, + struct btrfs_raid_bio, plug_list); + list_del_init(&cur->plug_list); + + if (rbio_is_full(cur)) { + /* we have a full stripe, send it down */ + full_stripe_write(cur); + continue; + } + if (last) { + if (rbio_can_merge(last, cur)) { + merge_rbio(last, cur); + __free_raid_bio(cur); + continue; + + } + __raid56_parity_write(last); + } + last = cur; + } + if (last) { + __raid56_parity_write(last); + } + kfree(plug); +} + +/* + * if the unplug comes from schedule, we have to push the + * work off to a helper thread + */ +static void unplug_work(struct btrfs_work *work) +{ + struct btrfs_plug_cb *plug; + plug = container_of(work, struct btrfs_plug_cb, work); + run_plug(plug); +} + +static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) +{ + struct btrfs_plug_cb *plug; + plug = container_of(cb, struct btrfs_plug_cb, cb); + + if (from_schedule) { + plug->work.flags = 0; + plug->work.func = unplug_work; + btrfs_queue_worker(&plug->info->rmw_workers, + &plug->work); + return; + } + run_plug(plug); +} + +/* + * our main entry point for writes from the rest of the FS. + */ +int raid56_parity_write(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 *raid_map, + u64 stripe_len) +{ + struct btrfs_raid_bio *rbio; + struct btrfs_plug_cb *plug = NULL; + struct blk_plug_cb *cb; + + rbio = alloc_rbio(root, bbio, raid_map, stripe_len); + if (IS_ERR(rbio)) { + kfree(raid_map); + kfree(bbio); + return PTR_ERR(rbio); + } + bio_list_add(&rbio->bio_list, bio); + rbio->bio_list_bytes = bio->bi_size; + + /* + * don't plug on full rbios, just get them out the door + * as quickly as we can + */ + if (rbio_is_full(rbio)) + return full_stripe_write(rbio); + + cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info, + sizeof(*plug)); + if (cb) { + plug = container_of(cb, struct btrfs_plug_cb, cb); + if (!plug->info) { + plug->info = root->fs_info; + INIT_LIST_HEAD(&plug->rbio_list); + } + list_add_tail(&rbio->plug_list, &plug->rbio_list); + } else { + return __raid56_parity_write(rbio); + } + return 0; +} + +/* + * all parity reconstruction happens here. We've read in everything + * we can find from the drives and this does the heavy lifting of + * sorting the good from the bad. + */ +static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) +{ + int pagenr, stripe; + void **pointers; + int faila = -1, failb = -1; + int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + struct page *page; + int err; + int i; + + pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *), + GFP_NOFS); + if (!pointers) { + err = -ENOMEM; + goto cleanup_io; + } + + faila = rbio->faila; + failb = rbio->failb; + + if (rbio->read_rebuild) { + spin_lock_irq(&rbio->bio_list_lock); + set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); + spin_unlock_irq(&rbio->bio_list_lock); + } + + index_rbio_pages(rbio); + + for (pagenr = 0; pagenr < nr_pages; pagenr++) { + /* setup our array of pointers with pages + * from each stripe + */ + for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { + /* + * if we're rebuilding a read, we have to use + * pages from the bio list + */ + if (rbio->read_rebuild && + (stripe == faila || stripe == failb)) { + page = page_in_rbio(rbio, stripe, pagenr, 0); + } else { + page = rbio_stripe_page(rbio, stripe, pagenr); + } + pointers[stripe] = kmap(page); + } + + /* all raid6 handling here */ + if (rbio->raid_map[rbio->bbio->num_stripes - 1] == + RAID6_Q_STRIPE) { + + /* + * single failure, rebuild from parity raid5 + * style + */ + if (failb < 0) { + if (faila == rbio->nr_data) { + /* + * Just the P stripe has failed, without + * a bad data or Q stripe. + * TODO, we should redo the xor here. + */ + err = -EIO; + goto cleanup; + } + /* + * a single failure in raid6 is rebuilt + * in the pstripe code below + */ + goto pstripe; + } + + /* make sure our ps and qs are in order */ + if (faila > failb) { + int tmp = failb; + failb = faila; + faila = tmp; + } + + /* if the q stripe is failed, do a pstripe reconstruction + * from the xors. + * If both the q stripe and the P stripe are failed, we're + * here due to a crc mismatch and we can't give them the + * data they want + */ + if (rbio->raid_map[failb] == RAID6_Q_STRIPE) { + if (rbio->raid_map[faila] == RAID5_P_STRIPE) { + err = -EIO; + goto cleanup; + } + /* + * otherwise we have one bad data stripe and + * a good P stripe. raid5! + */ + goto pstripe; + } + + if (rbio->raid_map[failb] == RAID5_P_STRIPE) { + raid6_datap_recov(rbio->bbio->num_stripes, + PAGE_SIZE, faila, pointers); + } else { + raid6_2data_recov(rbio->bbio->num_stripes, + PAGE_SIZE, faila, failb, + pointers); + } + } else { + void *p; + + /* rebuild from P stripe here (raid5 or raid6) */ + BUG_ON(failb != -1); +pstripe: + /* Copy parity block into failed block to start with */ + memcpy(pointers[faila], + pointers[rbio->nr_data], + PAGE_CACHE_SIZE); + + /* rearrange the pointer array */ + p = pointers[faila]; + for (stripe = faila; stripe < rbio->nr_data - 1; stripe++) + pointers[stripe] = pointers[stripe + 1]; + pointers[rbio->nr_data - 1] = p; + + /* xor in the rest */ + run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE); + } + /* if we're doing this rebuild as part of an rmw, go through + * and set all of our private rbio pages in the + * failed stripes as uptodate. This way finish_rmw will + * know they can be trusted. If this was a read reconstruction, + * other endio functions will fiddle the uptodate bits + */ + if (!rbio->read_rebuild) { + for (i = 0; i < nr_pages; i++) { + if (faila != -1) { + page = rbio_stripe_page(rbio, faila, i); + SetPageUptodate(page); + } + if (failb != -1) { + page = rbio_stripe_page(rbio, failb, i); + SetPageUptodate(page); + } + } + } + for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { + /* + * if we're rebuilding a read, we have to use + * pages from the bio list + */ + if (rbio->read_rebuild && + (stripe == faila || stripe == failb)) { + page = page_in_rbio(rbio, stripe, pagenr, 0); + } else { + page = rbio_stripe_page(rbio, stripe, pagenr); + } + kunmap(page); + } + } + + err = 0; +cleanup: + kfree(pointers); + +cleanup_io: + + if (rbio->read_rebuild) { + if (err == 0) + cache_rbio_pages(rbio); + else + clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); + + rbio_orig_end_io(rbio, err, err == 0); + } else if (err == 0) { + rbio->faila = -1; + rbio->failb = -1; + finish_rmw(rbio); + } else { + rbio_orig_end_io(rbio, err, 0); + } +} + +/* + * This is called only for stripes we've read from disk to + * reconstruct the parity. + */ +static void raid_recover_end_io(struct bio *bio, int err) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + + /* + * we only read stripe pages off the disk, set them + * up to date if there were no errors + */ + if (err) + fail_bio_stripe(rbio, bio); + else + set_bio_pages_uptodate(bio); + bio_put(bio); + + if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) + return; + + if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) + rbio_orig_end_io(rbio, -EIO, 0); + else + __raid_recover_end_io(rbio); +} + +/* + * reads everything we need off the disk to reconstruct + * the parity. endio handlers trigger final reconstruction + * when the IO is done. + * + * This is used both for reads from the higher layers and for + * parity construction required to finish a rmw cycle. + */ +static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) +{ + int bios_to_read = 0; + struct btrfs_bio *bbio = rbio->bbio; + struct bio_list bio_list; + int ret; + int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + int pagenr; + int stripe; + struct bio *bio; + + bio_list_init(&bio_list); + + ret = alloc_rbio_pages(rbio); + if (ret) + goto cleanup; + + atomic_set(&rbio->bbio->error, 0); + + /* + * read everything that hasn't failed. Thanks to the + * stripe cache, it is possible that some or all of these + * pages are going to be uptodate. + */ + for (stripe = 0; stripe < bbio->num_stripes; stripe++) { + if (rbio->faila == stripe || + rbio->failb == stripe) + continue; + + for (pagenr = 0; pagenr < nr_pages; pagenr++) { + struct page *p; + + /* + * the rmw code may have already read this + * page in + */ + p = rbio_stripe_page(rbio, stripe, pagenr); + if (PageUptodate(p)) + continue; + + ret = rbio_add_io_page(rbio, &bio_list, + rbio_stripe_page(rbio, stripe, pagenr), + stripe, pagenr, rbio->stripe_len); + if (ret < 0) + goto cleanup; + } + } + + bios_to_read = bio_list_size(&bio_list); + if (!bios_to_read) { + /* + * we might have no bios to read just because the pages + * were up to date, or we might have no bios to read because + * the devices were gone. + */ + if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) { + __raid_recover_end_io(rbio); + goto out; + } else { + goto cleanup; + } + } + + /* + * the bbio may be freed once we submit the last bio. Make sure + * not to touch it after that + */ + atomic_set(&bbio->stripes_pending, bios_to_read); + while (1) { + bio = bio_list_pop(&bio_list); + if (!bio) + break; + + bio->bi_private = rbio; + bio->bi_end_io = raid_recover_end_io; + + btrfs_bio_wq_end_io(rbio->fs_info, bio, + BTRFS_WQ_ENDIO_RAID56); + + BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); + submit_bio(READ, bio); + } +out: + return 0; + +cleanup: + if (rbio->read_rebuild) + rbio_orig_end_io(rbio, -EIO, 0); + return -EIO; +} + +/* + * the main entry point for reads from the higher layers. This + * is really only called when the normal read path had a failure, + * so we assume the bio they send down corresponds to a failed part + * of the drive. + */ +int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 *raid_map, + u64 stripe_len, int mirror_num) +{ + struct btrfs_raid_bio *rbio; + int ret; + + rbio = alloc_rbio(root, bbio, raid_map, stripe_len); + if (IS_ERR(rbio)) { + return PTR_ERR(rbio); + } + + rbio->read_rebuild = 1; + bio_list_add(&rbio->bio_list, bio); + rbio->bio_list_bytes = bio->bi_size; + + rbio->faila = find_logical_bio_stripe(rbio, bio); + if (rbio->faila == -1) { + BUG(); + kfree(rbio); + return -EIO; + } + + /* + * reconstruct from the q stripe if they are + * asking for mirror 3 + */ + if (mirror_num == 3) + rbio->failb = bbio->num_stripes - 2; + + ret = lock_stripe_add(rbio); + + /* + * __raid56_parity_recover will end the bio with + * any errors it hits. We don't want to return + * its error value up the stack because our caller + * will end up calling bio_endio with any nonzero + * return + */ + if (ret == 0) + __raid56_parity_recover(rbio); + /* + * our rbio has been added to the list of + * rbios that will be handled after the + * currently lock owner is done + */ + return 0; + +} + +static void rmw_work(struct btrfs_work *work) +{ + struct btrfs_raid_bio *rbio; + + rbio = container_of(work, struct btrfs_raid_bio, work); + raid56_rmw_stripe(rbio); +} + +static void read_rebuild_work(struct btrfs_work *work) +{ + struct btrfs_raid_bio *rbio; + + rbio = container_of(work, struct btrfs_raid_bio, work); + __raid56_parity_recover(rbio); +} diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h new file mode 100644 index 0000000..ea5d73b --- /dev/null +++ b/fs/btrfs/raid56.h @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2012 Fusion-io All rights reserved. + * Copyright (C) 2012 Intel Corp. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_RAID56__ +#define __BTRFS_RAID56__ +static inline int nr_parity_stripes(struct map_lookup *map) +{ + if (map->type & BTRFS_BLOCK_GROUP_RAID5) + return 1; + else if (map->type & BTRFS_BLOCK_GROUP_RAID6) + return 2; + else + return 0; +} + +static inline int nr_data_stripes(struct map_lookup *map) +{ + return map->num_stripes - nr_parity_stripes(map); +} +#define RAID5_P_STRIPE ((u64)-2) +#define RAID6_Q_STRIPE ((u64)-1) + +#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \ + ((x) == RAID6_Q_STRIPE)) + +int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 *raid_map, + u64 stripe_len, int mirror_num); +int raid56_parity_write(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 *raid_map, + u64 stripe_len); + +int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); +void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); +#endif diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 17c306b..50695dc 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3017,7 +3017,7 @@ static int relocate_file_extent_cluster(struct inode *inode, } } - page_start = (u64)page->index << PAGE_CACHE_SHIFT; + page_start = page_offset(page); page_end = page_start + PAGE_CACHE_SIZE - 1; lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 67783e0..53c3501 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -28,6 +28,7 @@ #include "dev-replace.h" #include "check-integrity.h" #include "rcu-string.h" +#include "raid56.h" /* * This is only the first step towards a full-features scrub. It reads all @@ -2254,6 +2255,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct btrfs_device *extent_dev; int extent_mirror_num; + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + if (num >= nr_data_stripes(map)) { + return 0; + } + } + nstripes = length; offset = 0; do_div(nstripes, map->stripe_len); @@ -2708,7 +2716,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, int ret; struct btrfs_root *root = sctx->dev_root; - if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) return -EIO; gen = root->fs_info->last_trans_committed; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f4ab7a9..f7a8b86 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -85,6 +85,7 @@ struct send_ctx { u32 send_max_size; u64 total_send_size; u64 cmd_send_size[BTRFS_SEND_C_MAX + 1]; + u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */ struct vfsmount *mnt; @@ -3709,6 +3710,39 @@ out: return ret; } +/* + * Send an update extent command to user space. + */ +static int send_update_extent(struct send_ctx *sctx, + u64 offset, u32 len) +{ + int ret = 0; + struct fs_path *p; + + p = fs_path_alloc(sctx); + if (!p) + return -ENOMEM; + + ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(sctx, p); + return ret; +} + static int send_write_or_clone(struct send_ctx *sctx, struct btrfs_path *path, struct btrfs_key *key, @@ -3744,7 +3778,11 @@ static int send_write_or_clone(struct send_ctx *sctx, goto out; } - if (!clone_root) { + if (clone_root) { + ret = send_clone(sctx, offset, len, clone_root); + } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) { + ret = send_update_extent(sctx, offset, len); + } else { while (pos < len) { l = len - pos; if (l > BTRFS_SEND_READ_SIZE) @@ -3757,10 +3795,7 @@ static int send_write_or_clone(struct send_ctx *sctx, pos += ret; } ret = 0; - } else { - ret = send_clone(sctx, offset, len, clone_root); } - out: return ret; } @@ -4536,7 +4571,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) struct btrfs_fs_info *fs_info; struct btrfs_ioctl_send_args *arg = NULL; struct btrfs_key key; - struct file *filp = NULL; struct send_ctx *sctx = NULL; u32 i; u64 *clone_sources_tmp = NULL; @@ -4561,6 +4595,11 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) goto out; } + if (arg->flags & ~BTRFS_SEND_FLAG_NO_FILE_DATA) { + ret = -EINVAL; + goto out; + } + sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS); if (!sctx) { ret = -ENOMEM; @@ -4572,6 +4611,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS); INIT_LIST_HEAD(&sctx->name_cache_list); + sctx->flags = arg->flags; + sctx->send_filp = fget(arg->send_fd); if (IS_ERR(sctx->send_filp)) { ret = PTR_ERR(sctx->send_filp); @@ -4673,8 +4714,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) goto out; out: - if (filp) - fput(filp); kfree(arg); vfree(clone_sources_tmp); diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 1bf4f32..8bb18f7 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -86,6 +86,7 @@ enum btrfs_send_cmd { BTRFS_SEND_C_UTIMES, BTRFS_SEND_C_END, + BTRFS_SEND_C_UPDATE_EXTENT, __BTRFS_SEND_C_MAX, }; #define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d8982e9..68a29a1 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -41,13 +41,13 @@ #include <linux/slab.h> #include <linux/cleancache.h> #include <linux/ratelimit.h> +#include <linux/btrfs.h> #include "compat.h" #include "delayed-inode.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "ioctl.h" #include "print-tree.h" #include "xattr.h" #include "volumes.h" @@ -63,8 +63,7 @@ static const struct super_operations btrfs_super_ops; static struct file_system_type btrfs_fs_type; -static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, - char nbuf[16]) +static const char *btrfs_decode_error(int errno, char nbuf[16]) { char *errstr = NULL; @@ -98,7 +97,7 @@ static void __save_error_info(struct btrfs_fs_info *fs_info) * today we only save the error info into ram. Long term we'll * also send it down to the disk */ - fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR; + set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); } static void save_error_info(struct btrfs_fs_info *fs_info) @@ -114,7 +113,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) if (sb->s_flags & MS_RDONLY) return; - if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { sb->s_flags |= MS_RDONLY; printk(KERN_INFO "btrfs is forced readonly\n"); /* @@ -142,8 +141,6 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, struct super_block *sb = fs_info->sb; char nbuf[16]; const char *errstr; - va_list args; - va_start(args, fmt); /* * Special case: if the error is EROFS, and we're already @@ -152,15 +149,18 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) return; - errstr = btrfs_decode_error(fs_info, errno, nbuf); + errstr = btrfs_decode_error(errno, nbuf); if (fmt) { - struct va_format vaf = { - .fmt = fmt, - .va = &args, - }; + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s (%pV)\n", sb->s_id, function, line, errstr, &vaf); + va_end(args); } else { printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n", sb->s_id, function, line, errstr); @@ -171,7 +171,6 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, save_error_info(fs_info); btrfs_handle_error(fs_info); } - va_end(args); } static const char * const logtypes[] = { @@ -261,7 +260,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, char nbuf[16]; const char *errstr; - errstr = btrfs_decode_error(root->fs_info, errno, nbuf); + errstr = btrfs_decode_error(errno, nbuf); btrfs_printk(root->fs_info, "%s:%d: Aborting unused transaction(%s).\n", function, line, errstr); @@ -289,8 +288,8 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, va_start(args, fmt); vaf.va = &args; - errstr = btrfs_decode_error(fs_info, errno, nbuf); - if (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR) + errstr = btrfs_decode_error(errno, nbuf); + if (fs_info && (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)) panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n", s_id, function, line, &vaf, errstr); @@ -438,6 +437,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_compress_force: case Opt_compress_force_type: compress_force = true; + /* Fallthrough */ case Opt_compress: case Opt_compress_type: if (token == Opt_compress || @@ -519,7 +519,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_alloc_start: num = match_strdup(&args[0]); if (num) { + mutex_lock(&info->chunk_mutex); info->alloc_start = memparse(num, NULL); + mutex_unlock(&info->chunk_mutex); kfree(num); printk(KERN_INFO "btrfs: allocations start at %llu\n", @@ -876,7 +878,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) btrfs_wait_ordered_extents(root, 0); - trans = btrfs_attach_transaction(root); + trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { /* no transaction, don't bother */ if (PTR_ERR(trans) == -ENOENT) @@ -1200,6 +1202,38 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, new_pool_size); } +static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info, + unsigned long old_opts, int flags) +{ + set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); + + if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && + (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || + (flags & MS_RDONLY))) { + /* wait for any defraggers to finish */ + wait_event(fs_info->transaction_wait, + (atomic_read(&fs_info->defrag_running) == 0)); + if (flags & MS_RDONLY) + sync_filesystem(fs_info->sb); + } +} + +static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, + unsigned long old_opts) +{ + /* + * We need cleanup all defragable inodes if the autodefragment is + * close or the fs is R/O. + */ + if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && + (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || + (fs_info->sb->s_flags & MS_RDONLY))) { + btrfs_cleanup_defrag_inodes(fs_info); + } + + clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); +} + static int btrfs_remount(struct super_block *sb, int *flags, char *data) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); @@ -1213,6 +1247,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) unsigned int old_metadata_ratio = fs_info->metadata_ratio; int ret; + btrfs_remount_prepare(fs_info, old_opts, *flags); + ret = btrfs_parse_options(root, data); if (ret) { ret = -EINVAL; @@ -1223,7 +1259,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) fs_info->thread_pool_size, old_thread_pool_size); if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) - return 0; + goto out; if (*flags & MS_RDONLY) { /* @@ -1278,7 +1314,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) } sb->s_flags &= ~MS_RDONLY; } - +out: + btrfs_remount_cleanup(fs_info, old_opts); return 0; restore: @@ -1289,10 +1326,13 @@ restore: fs_info->mount_opt = old_opts; fs_info->compress_type = old_compress_type; fs_info->max_inline = old_max_inline; + mutex_lock(&fs_info->chunk_mutex); fs_info->alloc_start = old_alloc_start; + mutex_unlock(&fs_info->chunk_mutex); btrfs_resize_thread_pool(fs_info, old_thread_pool_size, fs_info->thread_pool_size); fs_info->metadata_ratio = old_metadata_ratio; + btrfs_remount_cleanup(fs_info, old_opts); return ret; } @@ -1559,7 +1599,7 @@ static int btrfs_freeze(struct super_block *sb) struct btrfs_trans_handle *trans; struct btrfs_root *root = btrfs_sb(sb)->tree_root; - trans = btrfs_attach_transaction(root); + trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { /* no transaction, don't bother */ if (PTR_ERR(trans) == -ENOENT) @@ -1684,10 +1724,14 @@ static int __init init_btrfs_fs(void) if (err) goto free_delayed_inode; - err = btrfs_interface_init(); + err = btrfs_delayed_ref_init(); if (err) goto free_auto_defrag; + err = btrfs_interface_init(); + if (err) + goto free_delayed_ref; + err = register_filesystem(&btrfs_fs_type); if (err) goto unregister_ioctl; @@ -1699,6 +1743,8 @@ static int __init init_btrfs_fs(void) unregister_ioctl: btrfs_interface_exit(); +free_delayed_ref: + btrfs_delayed_ref_exit(); free_auto_defrag: btrfs_auto_defrag_exit(); free_delayed_inode: @@ -1720,6 +1766,7 @@ free_compress: static void __exit exit_btrfs_fs(void) { btrfs_destroy_cachep(); + btrfs_delayed_ref_exit(); btrfs_auto_defrag_exit(); btrfs_delayed_inode_exit(); ordered_data_exit(); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index daac9ae..5b326cd 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -21,7 +21,6 @@ #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> -#include <linux/module.h> #include <linux/kobject.h> #include "ctree.h" diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 4c0067c..e52da6f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -40,7 +40,6 @@ void put_transaction(struct btrfs_transaction *transaction) if (atomic_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); WARN_ON(transaction->delayed_refs.root.rb_node); - memset(transaction, 0, sizeof(*transaction)); kmem_cache_free(btrfs_transaction_cachep, transaction); } } @@ -51,6 +50,14 @@ static noinline void switch_commit_root(struct btrfs_root *root) root->commit_root = btrfs_root_node(root); } +static inline int can_join_transaction(struct btrfs_transaction *trans, + int type) +{ + return !(trans->in_commit && + type != TRANS_JOIN && + type != TRANS_JOIN_NOLOCK); +} + /* * either allocate a new transaction or hop into the existing one */ @@ -62,7 +69,7 @@ static noinline int join_transaction(struct btrfs_root *root, int type) spin_lock(&fs_info->trans_lock); loop: /* The file system has been taken offline. No new transactions. */ - if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { spin_unlock(&fs_info->trans_lock); return -EROFS; } @@ -86,6 +93,10 @@ loop: spin_unlock(&fs_info->trans_lock); return cur_trans->aborted; } + if (!can_join_transaction(cur_trans, type)) { + spin_unlock(&fs_info->trans_lock); + return -EBUSY; + } atomic_inc(&cur_trans->use_count); atomic_inc(&cur_trans->num_writers); cur_trans->num_joined++; @@ -113,7 +124,7 @@ loop: */ kmem_cache_free(btrfs_transaction_cachep, cur_trans); goto loop; - } else if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { spin_unlock(&fs_info->trans_lock); kmem_cache_free(btrfs_transaction_cachep, cur_trans); return -EROFS; @@ -155,8 +166,12 @@ loop: spin_lock_init(&cur_trans->commit_lock); spin_lock_init(&cur_trans->delayed_refs.lock); + atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0); + atomic_set(&cur_trans->delayed_refs.ref_seq, 0); + init_waitqueue_head(&cur_trans->delayed_refs.wait); INIT_LIST_HEAD(&cur_trans->pending_snapshots); + INIT_LIST_HEAD(&cur_trans->ordered_operations); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, fs_info->btree_inode->i_mapping); @@ -301,7 +316,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type, int ret; u64 qgroup_reserved = 0; - if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) return ERR_PTR(-EROFS); if (current->journal_info) { @@ -359,8 +374,11 @@ again: do { ret = join_transaction(root, type); - if (ret == -EBUSY) + if (ret == -EBUSY) { wait_current_trans(root); + if (unlikely(type == TRANS_ATTACH)) + ret = -ENOENT; + } } while (ret == -EBUSY); if (ret < 0) { @@ -382,9 +400,10 @@ again: h->block_rsv = NULL; h->orig_rsv = NULL; h->aborted = 0; - h->qgroup_reserved = qgroup_reserved; + h->qgroup_reserved = 0; h->delayed_ref_elem.seq = 0; h->type = type; + h->allocating_chunk = false; INIT_LIST_HEAD(&h->qgroup_ref_list); INIT_LIST_HEAD(&h->new_bgs); @@ -400,6 +419,7 @@ again: h->block_rsv = &root->fs_info->trans_block_rsv; h->bytes_reserved = num_bytes; } + h->qgroup_reserved = qgroup_reserved; got_it: btrfs_record_root_in_trans(h, root); @@ -451,11 +471,43 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root return start_transaction(root, 0, TRANS_USERSPACE, 0); } +/* + * btrfs_attach_transaction() - catch the running transaction + * + * It is used when we want to commit the current the transaction, but + * don't want to start a new one. + * + * Note: If this function return -ENOENT, it just means there is no + * running transaction. But it is possible that the inactive transaction + * is still in the memory, not fully on disk. If you hope there is no + * inactive transaction in the fs when -ENOENT is returned, you should + * invoke + * btrfs_attach_transaction_barrier() + */ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) { return start_transaction(root, 0, TRANS_ATTACH, 0); } +/* + * btrfs_attach_transaction() - catch the running transaction + * + * It is similar to the above function, the differentia is this one + * will wait for all the inactive transactions until they fully + * complete. + */ +struct btrfs_trans_handle * +btrfs_attach_transaction_barrier(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + + trans = start_transaction(root, 0, TRANS_ATTACH, 0); + if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT) + btrfs_wait_for_commit(root, 0); + + return trans; +} + /* wait for a transaction commit to be fully complete */ static noinline void wait_for_commit(struct btrfs_root *root, struct btrfs_transaction *commit) @@ -587,7 +639,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); - while (count < 2) { + while (count < 1) { unsigned long cur = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; if (cur && @@ -599,6 +651,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, } count++; } + btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; @@ -644,12 +697,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_run_delayed_iputs(root); if (trans->aborted || - root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) err = -EIO; - } assert_qgroups_uptodate(trans); - memset(trans, 0, sizeof(*trans)); kmem_cache_free(btrfs_trans_handle_cachep, trans); return err; } @@ -696,7 +747,9 @@ int btrfs_write_marked_extents(struct btrfs_root *root, struct extent_state *cached_state = NULL; u64 start = 0; u64 end; + struct blk_plug plug; + blk_start_plug(&plug); while (!find_first_extent_bit(dirty_pages, start, &start, &end, mark, &cached_state)) { convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, @@ -710,6 +763,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root, } if (err) werr = err; + blk_finish_plug(&plug); return werr; } @@ -960,10 +1014,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, } /* - * defrag a given btree. If cacheonly == 1, this won't read from the disk, - * otherwise every leaf in the btree is read and defragged. + * defrag a given btree. + * Every leaf in the btree is read and defragged. */ -int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) +int btrfs_defrag_root(struct btrfs_root *root) { struct btrfs_fs_info *info = root->fs_info; struct btrfs_trans_handle *trans; @@ -977,7 +1031,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_defrag_leaves(trans, root, cacheonly); + ret = btrfs_defrag_leaves(trans, root); btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(info->tree_root); @@ -985,6 +1039,12 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) break; + + if (btrfs_defrag_cancelled(root->fs_info)) { + printk(KERN_DEBUG "btrfs: defrag_root cancelled\n"); + ret = -EAGAIN; + break; + } } root->defrag_running = 0; return ret; @@ -1007,7 +1067,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct inode *parent_inode; struct btrfs_path *path; struct btrfs_dir_item *dir_item; - struct dentry *parent; struct dentry *dentry; struct extent_buffer *tmp; struct extent_buffer *old; @@ -1022,7 +1081,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { ret = pending->error = -ENOMEM; - goto path_alloc_fail; + return ret; } new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); @@ -1062,10 +1121,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, rsv = trans->block_rsv; trans->block_rsv = &pending->block_rsv; + trans->bytes_reserved = trans->block_rsv->reserved; dentry = pending->dentry; - parent = dget_parent(dentry); - parent_inode = parent->d_inode; + parent_inode = pending->dir; parent_root = BTRFS_I(parent_inode)->root; record_root_in_trans(trans, parent_root); @@ -1213,14 +1272,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret) btrfs_abort_transaction(trans, root, ret); fail: - dput(parent); trans->block_rsv = rsv; + trans->bytes_reserved = 0; no_free_objectid: kfree(new_root_item); root_item_alloc_fail: btrfs_free_path(path); -path_alloc_fail: - btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1); return ret; } @@ -1306,13 +1363,13 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root, struct btrfs_async_commit { struct btrfs_trans_handle *newtrans; struct btrfs_root *root; - struct delayed_work work; + struct work_struct work; }; static void do_async_commit(struct work_struct *work) { struct btrfs_async_commit *ac = - container_of(work, struct btrfs_async_commit, work.work); + container_of(work, struct btrfs_async_commit, work); /* * We've got freeze protection passed with the transaction. @@ -1340,7 +1397,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, if (!ac) return -ENOMEM; - INIT_DELAYED_WORK(&ac->work, do_async_commit); + INIT_WORK(&ac->work, do_async_commit); ac->root = root; ac->newtrans = btrfs_join_transaction(root); if (IS_ERR(ac->newtrans)) { @@ -1364,7 +1421,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1, _THIS_IP_); - schedule_delayed_work(&ac->work, 0); + schedule_work(&ac->work); /* wait for transaction to start and unblock */ if (wait_for_unblock) @@ -1384,6 +1441,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, int err) { struct btrfs_transaction *cur_trans = trans->transaction; + DEFINE_WAIT(wait); WARN_ON(trans->use_count > 1); @@ -1392,8 +1450,13 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, spin_lock(&root->fs_info->trans_lock); list_del_init(&cur_trans->list); if (cur_trans == root->fs_info->running_transaction) { + root->fs_info->trans_no_join = 1; + spin_unlock(&root->fs_info->trans_lock); + wait_event(cur_trans->writer_wait, + atomic_read(&cur_trans->num_writers) == 1); + + spin_lock(&root->fs_info->trans_lock); root->fs_info->running_transaction = NULL; - root->fs_info->trans_no_join = 0; } spin_unlock(&root->fs_info->trans_lock); @@ -1427,7 +1490,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, } if (flush_on_commit || snap_pending) { - btrfs_start_delalloc_inodes(root, 1); + ret = btrfs_start_delalloc_inodes(root, 1); + if (ret) + return ret; btrfs_wait_ordered_extents(root, 1); } @@ -1449,9 +1514,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, * it here and no for sure that nothing new will be added * to the list */ - btrfs_run_ordered_operations(root, 1); + ret = btrfs_run_ordered_operations(trans, root, 1); - return 0; + return ret; } /* @@ -1472,27 +1537,35 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, int should_grow = 0; unsigned long now = get_seconds(); - ret = btrfs_run_ordered_operations(root, 0); + ret = btrfs_run_ordered_operations(trans, root, 0); if (ret) { btrfs_abort_transaction(trans, root, ret); - goto cleanup_transaction; + btrfs_end_transaction(trans, root); + return ret; } /* Stop the commit early if ->aborted is set */ if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { ret = cur_trans->aborted; - goto cleanup_transaction; + btrfs_end_transaction(trans, root); + return ret; } /* make a pass through all the delayed refs we have so far * any runnings procs may add more while we are here */ ret = btrfs_run_delayed_refs(trans, root, 0); - if (ret) - goto cleanup_transaction; + if (ret) { + btrfs_end_transaction(trans, root); + return ret; + } btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; + if (trans->qgroup_reserved) { + btrfs_qgroup_free(root, trans->qgroup_reserved); + trans->qgroup_reserved = 0; + } cur_trans = trans->transaction; @@ -1506,8 +1579,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_create_pending_block_groups(trans, root); ret = btrfs_run_delayed_refs(trans, root, 0); - if (ret) - goto cleanup_transaction; + if (ret) { + btrfs_end_transaction(trans, root); + return ret; + } spin_lock(&cur_trans->commit_lock); if (cur_trans->in_commit) { @@ -1771,6 +1846,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, cleanup_transaction: btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; + if (trans->qgroup_reserved) { + btrfs_qgroup_free(root, trans->qgroup_reserved); + trans->qgroup_reserved = 0; + } btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n"); // WARN_ON(1); if (current->journal_info == trans) diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 0e8aa1e..3c8e0d2 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -43,6 +43,7 @@ struct btrfs_transaction { wait_queue_head_t writer_wait; wait_queue_head_t commit_wait; struct list_head pending_snapshots; + struct list_head ordered_operations; struct btrfs_delayed_ref_root delayed_refs; int aborted; }; @@ -68,6 +69,7 @@ struct btrfs_trans_handle { struct btrfs_block_rsv *orig_rsv; short aborted; short adding_csums; + bool allocating_chunk; enum btrfs_trans_type type; /* * this root is only needed to validate that the root passed to @@ -82,11 +84,13 @@ struct btrfs_trans_handle { struct btrfs_pending_snapshot { struct dentry *dentry; + struct inode *dir; struct btrfs_root *root; struct btrfs_root *snap; struct btrfs_qgroup_inherit *inherit; /* block reservation for the operation */ struct btrfs_block_rsv block_rsv; + u64 qgroup_reserved; /* extra metadata reseration for relocation */ int error; bool readonly; @@ -110,13 +114,15 @@ struct btrfs_trans_handle *btrfs_start_transaction_lflush( struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_attach_transaction_barrier( + struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_add_dead_root(struct btrfs_root *root); -int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); +int btrfs_defrag_root(struct btrfs_root *root); int btrfs_clean_old_snapshots(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index 3b580ee..94e05c1 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -23,13 +23,14 @@ #include "transaction.h" #include "locking.h" -/* defrag all the leaves in a given btree. If cache_only == 1, don't read - * things from disk, otherwise read all the leaves and try to get key order to +/* + * Defrag all the leaves in a given btree. + * Read all the leaves and try to get key order to * better reflect disk order */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int cache_only) + struct btrfs_root *root) { struct btrfs_path *path = NULL; struct btrfs_key key; @@ -41,9 +42,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, u64 last_ret = 0; u64 min_trans = 0; - if (cache_only) - goto out; - if (root->fs_info->extent_root == root) { /* * there's recursion here right now in the tree locking, @@ -86,11 +84,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, } path->keep_locks = 1; - if (cache_only) - min_trans = root->defrag_trans_start; - ret = btrfs_search_forward(root, &key, NULL, path, - cache_only, min_trans); + ret = btrfs_search_forward(root, &key, NULL, path, min_trans); if (ret < 0) goto out; if (ret > 0) { @@ -109,11 +104,11 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, goto out; } path->slots[1] = btrfs_header_nritems(path->nodes[1]); - next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only, + next_key_ret = btrfs_find_next_key(root, path, &key, 1, min_trans); ret = btrfs_realloc_node(trans, root, path->nodes[1], 0, - cache_only, &last_ret, + &last_ret, &root->defrag_progress); if (ret) { WARN_ON(ret == -EAGAIN); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9027bb1..c7ef569 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -278,8 +278,7 @@ static int process_one_buffer(struct btrfs_root *log, struct walk_control *wc, u64 gen) { if (wc->pin) - btrfs_pin_extent_for_log_replay(wc->trans, - log->fs_info->extent_root, + btrfs_pin_extent_for_log_replay(log->fs_info->extent_root, eb->start, eb->len); if (btrfs_buffer_uptodate(eb, gen, 0)) { @@ -485,7 +484,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, struct btrfs_key *key) { int found_type; - u64 mask = root->sectorsize - 1; u64 extent_end; u64 start = key->offset; u64 saved_nbytes; @@ -502,7 +500,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, extent_end = start + btrfs_file_extent_num_bytes(eb, item); else if (found_type == BTRFS_FILE_EXTENT_INLINE) { size = btrfs_file_extent_inline_len(eb, item); - extent_end = (start + size + mask) & ~mask; + extent_end = ALIGN(start + size, root->sectorsize); } else { ret = 0; goto out; @@ -2281,6 +2279,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, unsigned long log_transid = 0; mutex_lock(&root->log_mutex); + log_transid = root->log_transid; index1 = root->log_transid % 2; if (atomic_read(&root->log_commit[index1])) { wait_log_commit(trans, root, root->log_transid); @@ -2308,11 +2307,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, /* bail out if we need to do a full commit */ if (root->fs_info->last_trans_log_full_commit == trans->transid) { ret = -EAGAIN; + btrfs_free_logged_extents(log, log_transid); mutex_unlock(&root->log_mutex); goto out; } - log_transid = root->log_transid; if (log_transid % 2 == 0) mark = EXTENT_DIRTY; else @@ -2324,6 +2323,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); if (ret) { btrfs_abort_transaction(trans, root, ret); + btrfs_free_logged_extents(log, log_transid); mutex_unlock(&root->log_mutex); goto out; } @@ -2363,6 +2363,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } root->fs_info->last_trans_log_full_commit = trans->transid; btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); ret = -EAGAIN; goto out; @@ -2373,6 +2374,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); wait_log_commit(trans, log_root_tree, log_root_tree->log_transid); + btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); ret = 0; goto out; @@ -2392,6 +2394,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ if (root->fs_info->last_trans_log_full_commit == trans->transid) { btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); ret = -EAGAIN; goto out_wake_log_root; @@ -2402,10 +2405,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, EXTENT_DIRTY | EXTENT_NEW); if (ret) { btrfs_abort_transaction(trans, root, ret); + btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); goto out_wake_log_root; } btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + btrfs_wait_logged_extents(log, log_transid); btrfs_set_super_log_root(root->fs_info->super_for_commit, log_root_tree->node->start); @@ -2461,8 +2466,10 @@ static void free_log_tree(struct btrfs_trans_handle *trans, .process_func = process_one_buffer }; - ret = walk_log_tree(trans, log, &wc); - BUG_ON(ret); + if (trans) { + ret = walk_log_tree(trans, log, &wc); + BUG_ON(ret); + } while (1) { ret = find_first_extent_bit(&log->dirty_log_pages, @@ -2475,6 +2482,14 @@ static void free_log_tree(struct btrfs_trans_handle *trans, EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); } + /* + * We may have short-circuited the log tree with the full commit logic + * and left ordered extents on our list, so clear these out to keep us + * from leaking inodes and memory. + */ + btrfs_free_logged_extents(log, 0); + btrfs_free_logged_extents(log, 1); + free_extent_buffer(log->node); kfree(log); } @@ -2724,7 +2739,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, path->keep_locks = 1; ret = btrfs_search_forward(root, &min_key, &max_key, - path, 0, trans->transid); + path, trans->transid); /* * we didn't find anything from this transaction, see if there @@ -3271,16 +3286,21 @@ static int log_one_extent(struct btrfs_trans_handle *trans, struct btrfs_root *log = root->log_root; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; + struct btrfs_ordered_extent *ordered; struct list_head ordered_sums; struct btrfs_map_token token; struct btrfs_key key; - u64 csum_offset = em->mod_start - em->start; - u64 csum_len = em->mod_len; + u64 mod_start = em->mod_start; + u64 mod_len = em->mod_len; + u64 csum_offset; + u64 csum_len; u64 extent_offset = em->start - em->orig_start; u64 block_len; int ret; + int index = log->log_transid % 2; bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; +insert: INIT_LIST_HEAD(&ordered_sums); btrfs_init_map_token(&token); key.objectid = btrfs_ino(inode); @@ -3296,6 +3316,23 @@ static int log_one_extent(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + + /* + * If we are overwriting an inline extent with a real one then we need + * to just delete the inline extent as it may not be large enough to + * have the entire file_extent_item. + */ + if (ret && btrfs_token_file_extent_type(leaf, fi, &token) == + BTRFS_FILE_EXTENT_INLINE) { + ret = btrfs_del_item(trans, log, path); + btrfs_release_path(path); + if (ret) { + path->really_keep_locks = 0; + return ret; + } + goto insert; + } + btrfs_set_token_file_extent_generation(leaf, fi, em->generation, &token); if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { @@ -3362,6 +3399,92 @@ static int log_one_extent(struct btrfs_trans_handle *trans, csum_len = block_len; } + /* + * First check and see if our csums are on our outstanding ordered + * extents. + */ +again: + spin_lock_irq(&log->log_extents_lock[index]); + list_for_each_entry(ordered, &log->logged_list[index], log_list) { + struct btrfs_ordered_sum *sum; + + if (!mod_len) + break; + + if (ordered->inode != inode) + continue; + + if (ordered->file_offset + ordered->len <= mod_start || + mod_start + mod_len <= ordered->file_offset) + continue; + + /* + * We are going to copy all the csums on this ordered extent, so + * go ahead and adjust mod_start and mod_len in case this + * ordered extent has already been logged. + */ + if (ordered->file_offset > mod_start) { + if (ordered->file_offset + ordered->len >= + mod_start + mod_len) + mod_len = ordered->file_offset - mod_start; + /* + * If we have this case + * + * |--------- logged extent ---------| + * |----- ordered extent ----| + * + * Just don't mess with mod_start and mod_len, we'll + * just end up logging more csums than we need and it + * will be ok. + */ + } else { + if (ordered->file_offset + ordered->len < + mod_start + mod_len) { + mod_len = (mod_start + mod_len) - + (ordered->file_offset + ordered->len); + mod_start = ordered->file_offset + + ordered->len; + } else { + mod_len = 0; + } + } + + /* + * To keep us from looping for the above case of an ordered + * extent that falls inside of the logged extent. + */ + if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, + &ordered->flags)) + continue; + atomic_inc(&ordered->refs); + spin_unlock_irq(&log->log_extents_lock[index]); + /* + * we've dropped the lock, we must either break or + * start over after this. + */ + + wait_event(ordered->wait, ordered->csum_bytes_left == 0); + + list_for_each_entry(sum, &ordered->list, list) { + ret = btrfs_csum_file_blocks(trans, log, sum); + if (ret) { + btrfs_put_ordered_extent(ordered); + goto unlocked; + } + } + btrfs_put_ordered_extent(ordered); + goto again; + + } + spin_unlock_irq(&log->log_extents_lock[index]); +unlocked: + + if (!mod_len || ret) + return ret; + + csum_offset = mod_start - em->start; + csum_len = mod_len; + /* block start is already adjusted for the file extent offset. */ ret = btrfs_lookup_csums_range(log->fs_info->csum_root, em->block_start + csum_offset, @@ -3393,6 +3516,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; u64 test_gen; int ret = 0; + int num = 0; INIT_LIST_HEAD(&extents); @@ -3401,16 +3525,31 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, list_for_each_entry_safe(em, n, &tree->modified_extents, list) { list_del_init(&em->list); + + /* + * Just an arbitrary number, this can be really CPU intensive + * once we start getting a lot of extents, and really once we + * have a bunch of extents we just want to commit since it will + * be faster. + */ + if (++num > 32768) { + list_del_init(&tree->modified_extents); + ret = -EFBIG; + goto process; + } + if (em->generation <= test_gen) continue; /* Need a ref to keep it from getting evicted from cache */ atomic_inc(&em->refs); set_bit(EXTENT_FLAG_LOGGING, &em->flags); list_add_tail(&em->list, &extents); + num++; } list_sort(NULL, &extents, extent_cmp); +process: while (!list_empty(&extents)) { em = list_entry(extents.next, struct extent_map, list); @@ -3513,6 +3652,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, mutex_lock(&BTRFS_I(inode)->log_mutex); + btrfs_get_logged_extents(log, inode); + /* * a brute force approach to making sure we get the most uptodate * copies of everything. @@ -3558,7 +3699,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, while (1) { ins_nr = 0; ret = btrfs_search_forward(root, &min_key, &max_key, - path, 0, trans->transid); + path, trans->transid); if (ret != 0) break; again: @@ -3656,6 +3797,8 @@ log_extents: BTRFS_I(inode)->logged_trans = trans->transid; BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; out_unlock: + if (err) + btrfs_free_logged_extents(log, log->log_transid); mutex_unlock(&BTRFS_I(inode)->log_mutex); btrfs_free_path(path); @@ -3822,7 +3965,6 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, end_trans: dput(old_parent); if (ret < 0) { - WARN_ON(ret != -ENOSPC); root->fs_info->last_trans_log_full_commit = trans->transid; ret = 1; } diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 99be4c1..ddc61ca 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -5,7 +5,7 @@ */ #include <linux/slab.h> -#include <linux/module.h> +#include <linux/export.h> #include "ulist.h" /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5cbb7f4..35bb2d4 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -25,6 +25,8 @@ #include <linux/capability.h> #include <linux/ratelimit.h> #include <linux/kthread.h> +#include <linux/raid/pq.h> +#include <asm/div64.h> #include "compat.h" #include "ctree.h" #include "extent_map.h" @@ -32,6 +34,7 @@ #include "transaction.h" #include "print-tree.h" #include "volumes.h" +#include "raid56.h" #include "async-thread.h" #include "check-integrity.h" #include "rcu-string.h" @@ -647,6 +650,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) new_device->writeable = 0; new_device->in_fs_metadata = 0; new_device->can_discard = 0; + spin_lock_init(&new_device->io_lock); list_replace_rcu(&device->dev_list, &new_device->dev_list); call_rcu(&device->rcu, free_device); @@ -792,26 +796,75 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, return ret; } +/* + * Look for a btrfs signature on a device. This may be called out of the mount path + * and we are not allowed to call set_blocksize during the scan. The superblock + * is read via pagecache + */ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, struct btrfs_fs_devices **fs_devices_ret) { struct btrfs_super_block *disk_super; struct block_device *bdev; - struct buffer_head *bh; - int ret; + struct page *page; + void *p; + int ret = -EINVAL; u64 devid; u64 transid; u64 total_devices; + u64 bytenr; + pgoff_t index; + /* + * we would like to check all the supers, but that would make + * a btrfs mount succeed after a mkfs from a different FS. + * So, we need to add a special mount option to scan for + * later supers, using BTRFS_SUPER_MIRROR_MAX instead + */ + bytenr = btrfs_sb_offset(0); flags |= FMODE_EXCL; mutex_lock(&uuid_mutex); - ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh); - if (ret) + + bdev = blkdev_get_by_path(path, flags, holder); + + if (IS_ERR(bdev)) { + ret = PTR_ERR(bdev); goto error; - disk_super = (struct btrfs_super_block *)bh->b_data; + } + + /* make sure our super fits in the device */ + if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode)) + goto error_bdev_put; + + /* make sure our super fits in the page */ + if (sizeof(*disk_super) > PAGE_CACHE_SIZE) + goto error_bdev_put; + + /* make sure our super doesn't straddle pages on disk */ + index = bytenr >> PAGE_CACHE_SHIFT; + if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index) + goto error_bdev_put; + + /* pull in the page with our super */ + page = read_cache_page_gfp(bdev->bd_inode->i_mapping, + index, GFP_NOFS); + + if (IS_ERR_OR_NULL(page)) + goto error_bdev_put; + + p = kmap(page); + + /* align our pointer to the offset of the super block */ + disk_super = p + (bytenr & ~PAGE_CACHE_MASK); + + if (btrfs_super_bytenr(disk_super) != bytenr || + disk_super->magic != cpu_to_le64(BTRFS_MAGIC)) + goto error_unmap; + devid = btrfs_stack_device_id(&disk_super->dev_item); transid = btrfs_super_generation(disk_super); total_devices = btrfs_super_num_devices(disk_super); + if (disk_super->label[0]) { if (disk_super->label[BTRFS_LABEL_SIZE - 1]) disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; @@ -819,12 +872,19 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, } else { printk(KERN_INFO "device fsid %pU ", disk_super->fsid); } + printk(KERN_CONT "devid %llu transid %llu %s\n", (unsigned long long)devid, (unsigned long long)transid, path); + ret = device_list_add(path, disk_super, devid, fs_devices_ret); if (!ret && fs_devices_ret) (*fs_devices_ret)->total_devices = total_devices; - brelse(bh); + +error_unmap: + kunmap(page); + page_cache_release(page); + +error_bdev_put: blkdev_put(bdev, flags); error: mutex_unlock(&uuid_mutex); @@ -1372,14 +1432,19 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) u64 devid; u64 num_devices; u8 *dev_uuid; + unsigned seq; int ret = 0; bool clear_super = false; mutex_lock(&uuid_mutex); - all_avail = root->fs_info->avail_data_alloc_bits | - root->fs_info->avail_system_alloc_bits | - root->fs_info->avail_metadata_alloc_bits; + do { + seq = read_seqbegin(&root->fs_info->profiles_lock); + + all_avail = root->fs_info->avail_data_alloc_bits | + root->fs_info->avail_system_alloc_bits | + root->fs_info->avail_metadata_alloc_bits; + } while (read_seqretry(&root->fs_info->profiles_lock, seq)); num_devices = root->fs_info->fs_devices->num_devices; btrfs_dev_replace_lock(&root->fs_info->dev_replace); @@ -1403,6 +1468,21 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) goto out; } + if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && + root->fs_info->fs_devices->rw_devices <= 2) { + printk(KERN_ERR "btrfs: unable to go below two " + "devices on raid5\n"); + ret = -EINVAL; + goto out; + } + if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && + root->fs_info->fs_devices->rw_devices <= 3) { + printk(KERN_ERR "btrfs: unable to go below three " + "devices on raid6\n"); + ret = -EINVAL; + goto out; + } + if (strcmp(device_path, "missing") == 0) { struct list_head *devices; struct btrfs_device *tmp; @@ -2616,7 +2696,7 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, chunk_used = btrfs_block_group_used(&cache->item); if (bargs->usage == 0) - user_thresh = 0; + user_thresh = 1; else if (bargs->usage > 100) user_thresh = cache->key.offset; else @@ -2664,11 +2744,15 @@ static int chunk_drange_filter(struct extent_buffer *leaf, return 0; if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) - factor = 2; - else - factor = 1; - factor = num_stripes / factor; + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { + factor = num_stripes / 2; + } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { + factor = num_stripes - 1; + } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { + factor = num_stripes - 2; + } else { + factor = num_stripes; + } for (i = 0; i < num_stripes; i++) { stripe = btrfs_stripe_nr(chunk, i); @@ -2985,6 +3069,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, int mixed = 0; int ret; u64 num_devices; + unsigned seq; if (btrfs_fs_closing(fs_info) || atomic_read(&fs_info->balance_pause_req) || @@ -3027,7 +3112,9 @@ int btrfs_balance(struct btrfs_balance_control *bctl, allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); else allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10); + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6); if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && (!alloc_profile_is_valid(bctl->data.target, 1) || @@ -3067,23 +3154,29 @@ int btrfs_balance(struct btrfs_balance_control *bctl, /* allow to reduce meta or sys integrity only if force set */ allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10; - if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (fs_info->avail_system_alloc_bits & allowed) && - !(bctl->sys.target & allowed)) || - ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (fs_info->avail_metadata_alloc_bits & allowed) && - !(bctl->meta.target & allowed))) { - if (bctl->flags & BTRFS_BALANCE_FORCE) { - printk(KERN_INFO "btrfs: force reducing metadata " - "integrity\n"); - } else { - printk(KERN_ERR "btrfs: balance will reduce metadata " - "integrity, use force if you want this\n"); - ret = -EINVAL; - goto out; + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6; + do { + seq = read_seqbegin(&fs_info->profiles_lock); + + if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (fs_info->avail_system_alloc_bits & allowed) && + !(bctl->sys.target & allowed)) || + ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (fs_info->avail_metadata_alloc_bits & allowed) && + !(bctl->meta.target & allowed))) { + if (bctl->flags & BTRFS_BALANCE_FORCE) { + printk(KERN_INFO "btrfs: force reducing metadata " + "integrity\n"); + } else { + printk(KERN_ERR "btrfs: balance will reduce metadata " + "integrity, use force if you want this\n"); + ret = -EINVAL; + goto out; + } } - } + } while (read_seqretry(&fs_info->profiles_lock, seq)); if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { int num_tolerated_disk_barrier_failures; @@ -3127,21 +3220,16 @@ int btrfs_balance(struct btrfs_balance_control *bctl, mutex_lock(&fs_info->balance_mutex); atomic_dec(&fs_info->balance_running); - if (bargs) { - memset(bargs, 0, sizeof(*bargs)); - update_ioctl_balance_args(fs_info, 0, bargs); - } - - if ((ret && ret != -ECANCELED && ret != -ENOSPC) || - balance_need_close(fs_info)) { - __cancel_balance(fs_info); - } - if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { fs_info->num_tolerated_disk_barrier_failures = btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); } + if (bargs) { + memset(bargs, 0, sizeof(*bargs)); + update_ioctl_balance_args(fs_info, 0, bargs); + } + wake_up(&fs_info->balance_wait_q); return ret; @@ -3504,13 +3592,86 @@ static int btrfs_cmp_device_info(const void *a, const void *b) } struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { - { 2, 1, 0, 4, 2, 2 /* raid10 */ }, - { 1, 1, 2, 2, 2, 2 /* raid1 */ }, - { 1, 2, 1, 1, 1, 2 /* dup */ }, - { 1, 1, 0, 2, 1, 1 /* raid0 */ }, - { 1, 1, 1, 1, 1, 1 /* single */ }, + [BTRFS_RAID_RAID10] = { + .sub_stripes = 2, + .dev_stripes = 1, + .devs_max = 0, /* 0 == as many as possible */ + .devs_min = 4, + .devs_increment = 2, + .ncopies = 2, + }, + [BTRFS_RAID_RAID1] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 2, + .devs_min = 2, + .devs_increment = 2, + .ncopies = 2, + }, + [BTRFS_RAID_DUP] = { + .sub_stripes = 1, + .dev_stripes = 2, + .devs_max = 1, + .devs_min = 1, + .devs_increment = 1, + .ncopies = 2, + }, + [BTRFS_RAID_RAID0] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 2, + .devs_increment = 1, + .ncopies = 1, + }, + [BTRFS_RAID_SINGLE] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 1, + .devs_min = 1, + .devs_increment = 1, + .ncopies = 1, + }, + [BTRFS_RAID_RAID5] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 2, + .devs_increment = 1, + .ncopies = 2, + }, + [BTRFS_RAID_RAID6] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 3, + .devs_increment = 1, + .ncopies = 3, + }, }; +static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) +{ + /* TODO allow them to set a preferred stripe size */ + return 64 * 1024; +} + +static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) +{ + u64 features; + + if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))) + return; + + features = btrfs_super_incompat_flags(info->super_copy); + if (features & BTRFS_FEATURE_INCOMPAT_RAID56) + return; + + features |= BTRFS_FEATURE_INCOMPAT_RAID56; + btrfs_set_super_incompat_flags(info->super_copy, features); + printk(KERN_INFO "btrfs: setting RAID5/6 feature flag\n"); +} + static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, struct map_lookup **map_ret, @@ -3526,6 +3687,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct btrfs_device_info *devices_info = NULL; u64 total_avail; int num_stripes; /* total number of stripes to allocate */ + int data_stripes; /* number of stripes that count for + block group size */ int sub_stripes; /* sub_stripes info for map */ int dev_stripes; /* stripes per dev */ int devs_max; /* max devs to use */ @@ -3537,6 +3700,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 max_chunk_size; u64 stripe_size; u64 num_bytes; + u64 raid_stripe_len = BTRFS_STRIPE_LEN; int ndevs; int i; int j; @@ -3631,12 +3795,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) continue; + if (ndevs == fs_devices->rw_devices) { + WARN(1, "%s: found more than %llu devices\n", + __func__, fs_devices->rw_devices); + break; + } devices_info[ndevs].dev_offset = dev_offset; devices_info[ndevs].max_avail = max_avail; devices_info[ndevs].total_avail = total_avail; devices_info[ndevs].dev = device; ++ndevs; - WARN_ON(ndevs > fs_devices->rw_devices); } /* @@ -3662,16 +3830,48 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, stripe_size = devices_info[ndevs-1].max_avail; num_stripes = ndevs * dev_stripes; - if (stripe_size * ndevs > max_chunk_size * ncopies) { - stripe_size = max_chunk_size * ncopies; - do_div(stripe_size, ndevs); + /* + * this will have to be fixed for RAID1 and RAID10 over + * more drives + */ + data_stripes = num_stripes / ncopies; + + if (type & BTRFS_BLOCK_GROUP_RAID5) { + raid_stripe_len = find_raid56_stripe_len(ndevs - 1, + btrfs_super_stripesize(info->super_copy)); + data_stripes = num_stripes - 1; + } + if (type & BTRFS_BLOCK_GROUP_RAID6) { + raid_stripe_len = find_raid56_stripe_len(ndevs - 2, + btrfs_super_stripesize(info->super_copy)); + data_stripes = num_stripes - 2; + } + + /* + * Use the number of data stripes to figure out how big this chunk + * is really going to be in terms of logical address space, + * and compare that answer with the max chunk size + */ + if (stripe_size * data_stripes > max_chunk_size) { + u64 mask = (1ULL << 24) - 1; + stripe_size = max_chunk_size; + do_div(stripe_size, data_stripes); + + /* bump the answer up to a 16MB boundary */ + stripe_size = (stripe_size + mask) & ~mask; + + /* but don't go higher than the limits we found + * while searching for free extents + */ + if (stripe_size > devices_info[ndevs-1].max_avail) + stripe_size = devices_info[ndevs-1].max_avail; } do_div(stripe_size, dev_stripes); /* align to BTRFS_STRIPE_LEN */ - do_div(stripe_size, BTRFS_STRIPE_LEN); - stripe_size *= BTRFS_STRIPE_LEN; + do_div(stripe_size, raid_stripe_len); + stripe_size *= raid_stripe_len; map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); if (!map) { @@ -3689,14 +3889,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, } } map->sector_size = extent_root->sectorsize; - map->stripe_len = BTRFS_STRIPE_LEN; - map->io_align = BTRFS_STRIPE_LEN; - map->io_width = BTRFS_STRIPE_LEN; + map->stripe_len = raid_stripe_len; + map->io_align = raid_stripe_len; + map->io_width = raid_stripe_len; map->type = type; map->sub_stripes = sub_stripes; *map_ret = map; - num_bytes = stripe_size * (num_stripes / ncopies); + num_bytes = stripe_size * data_stripes; *stripe_size_out = stripe_size; *num_bytes_out = num_bytes; @@ -3718,15 +3918,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); write_unlock(&em_tree->lock); - free_extent_map(em); - if (ret) - goto error; - - ret = btrfs_make_block_group(trans, extent_root, 0, type, - BTRFS_FIRST_CHUNK_TREE_OBJECTID, - start, num_bytes); - if (ret) + if (ret) { + free_extent_map(em); goto error; + } for (i = 0; i < map->num_stripes; ++i) { struct btrfs_device *device; @@ -3739,15 +3934,44 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, info->chunk_root->root_key.objectid, BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, dev_offset, stripe_size); - if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); - goto error; - } + if (ret) + goto error_dev_extent; + } + + ret = btrfs_make_block_group(trans, extent_root, 0, type, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, + start, num_bytes); + if (ret) { + i = map->num_stripes - 1; + goto error_dev_extent; } + free_extent_map(em); + check_raid56_incompat_flag(extent_root->fs_info, type); + kfree(devices_info); return 0; +error_dev_extent: + for (; i >= 0; i--) { + struct btrfs_device *device; + int err; + + device = map->stripes[i].dev; + err = btrfs_free_dev_extent(trans, device, start); + if (err) { + btrfs_abort_transaction(trans, extent_root, err); + break; + } + } + write_lock(&em_tree->lock); + remove_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + + /* One for our allocation */ + free_extent_map(em); + /* One for the tree reference */ + free_extent_map(em); error: kfree(map); kfree(devices_info); @@ -3887,10 +4111,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, if (ret) return ret; - alloc_profile = BTRFS_BLOCK_GROUP_METADATA | - fs_info->avail_metadata_alloc_bits; - alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); - + alloc_profile = btrfs_get_alloc_profile(extent_root, 0); ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, &stripe_size, chunk_offset, alloc_profile); if (ret) @@ -3898,10 +4119,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, sys_chunk_offset = chunk_offset + chunk_size; - alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | - fs_info->avail_system_alloc_bits; - alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); - + alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, &sys_chunk_size, &sys_stripe_size, sys_chunk_offset, alloc_profile); @@ -4014,6 +4232,10 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) ret = map->num_stripes; else if (map->type & BTRFS_BLOCK_GROUP_RAID10) ret = map->sub_stripes; + else if (map->type & BTRFS_BLOCK_GROUP_RAID5) + ret = 2; + else if (map->type & BTRFS_BLOCK_GROUP_RAID6) + ret = 3; else ret = 1; free_extent_map(em); @@ -4026,6 +4248,52 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) return ret; } +unsigned long btrfs_full_stripe_len(struct btrfs_root *root, + struct btrfs_mapping_tree *map_tree, + u64 logical) +{ + struct extent_map *em; + struct map_lookup *map; + struct extent_map_tree *em_tree = &map_tree->map_tree; + unsigned long len = root->sectorsize; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, len); + read_unlock(&em_tree->lock); + BUG_ON(!em); + + BUG_ON(em->start > logical || em->start + em->len < logical); + map = (struct map_lookup *)em->bdev; + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + len = map->stripe_len * nr_data_stripes(map); + } + free_extent_map(em); + return len; +} + +int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, + u64 logical, u64 len, int mirror_num) +{ + struct extent_map *em; + struct map_lookup *map; + struct extent_map_tree *em_tree = &map_tree->map_tree; + int ret = 0; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, len); + read_unlock(&em_tree->lock); + BUG_ON(!em); + + BUG_ON(em->start > logical || em->start + em->len < logical); + map = (struct map_lookup *)em->bdev; + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) + ret = 1; + free_extent_map(em); + return ret; +} + static int find_live_mirror(struct btrfs_fs_info *fs_info, struct map_lookup *map, int first, int num, int optimal, int dev_replace_is_ongoing) @@ -4063,10 +4331,39 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, return optimal; } +static inline int parity_smaller(u64 a, u64 b) +{ + return a > b; +} + +/* Bubble-sort the stripe set to put the parity/syndrome stripes last */ +static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) +{ + struct btrfs_bio_stripe s; + int i; + u64 l; + int again = 1; + + while (again) { + again = 0; + for (i = 0; i < bbio->num_stripes - 1; i++) { + if (parity_smaller(raid_map[i], raid_map[i+1])) { + s = bbio->stripes[i]; + l = raid_map[i]; + bbio->stripes[i] = bbio->stripes[i+1]; + raid_map[i] = raid_map[i+1]; + bbio->stripes[i+1] = s; + raid_map[i+1] = l; + again = 1; + } + } + } +} + static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, - int mirror_num) + int mirror_num, u64 **raid_map_ret) { struct extent_map *em; struct map_lookup *map; @@ -4078,6 +4375,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 stripe_nr; u64 stripe_nr_orig; u64 stripe_nr_end; + u64 stripe_len; + u64 *raid_map = NULL; int stripe_index; int i; int ret = 0; @@ -4089,6 +4388,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, int num_alloc_stripes; int patch_the_first_stripe_for_dev_replace = 0; u64 physical_to_patch_in_first_stripe = 0; + u64 raid56_full_stripe_start = (u64)-1; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, *length); @@ -4105,29 +4405,63 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, map = (struct map_lookup *)em->bdev; offset = logical - em->start; + if (mirror_num > map->num_stripes) + mirror_num = 0; + + stripe_len = map->stripe_len; stripe_nr = offset; /* * stripe_nr counts the total number of stripes we have to stride * to get to this block */ - do_div(stripe_nr, map->stripe_len); + do_div(stripe_nr, stripe_len); - stripe_offset = stripe_nr * map->stripe_len; + stripe_offset = stripe_nr * stripe_len; BUG_ON(offset < stripe_offset); /* stripe_offset is the offset of this block in its stripe*/ stripe_offset = offset - stripe_offset; - if (rw & REQ_DISCARD) + /* if we're here for raid56, we need to know the stripe aligned start */ + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { + unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); + raid56_full_stripe_start = offset; + + /* allow a write of a full stripe, but make sure we don't + * allow straddling of stripes + */ + do_div(raid56_full_stripe_start, full_stripe_len); + raid56_full_stripe_start *= full_stripe_len; + } + + if (rw & REQ_DISCARD) { + /* we don't discard raid56 yet */ + if (map->type & + (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { + ret = -EOPNOTSUPP; + goto out; + } *length = min_t(u64, em->len - offset, *length); - else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { - /* we limit the length of each bio to what fits in a stripe */ - *length = min_t(u64, em->len - offset, - map->stripe_len - stripe_offset); + } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + u64 max_len; + /* For writes to RAID[56], allow a full stripeset across all disks. + For other RAID types and for RAID[56] reads, just allow a single + stripe (on a single disk). */ + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) && + (rw & REQ_WRITE)) { + max_len = stripe_len * nr_data_stripes(map) - + (offset - raid56_full_stripe_start); + } else { + /* we limit the length of each bio to what fits in a stripe */ + max_len = stripe_len - stripe_offset; + } + *length = min_t(u64, em->len - offset, max_len); } else { *length = em->len - offset; } + /* This is for when we're called from btrfs_merge_bio_hook() and all + it cares about is the length */ if (!bbio_ret) goto out; @@ -4160,7 +4494,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 physical_of_found = 0; ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, - logical, &tmp_length, &tmp_bbio, 0); + logical, &tmp_length, &tmp_bbio, 0, NULL); if (ret) { WARN_ON(tmp_bbio != NULL); goto out; @@ -4221,11 +4555,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, num_stripes = 1; stripe_index = 0; stripe_nr_orig = stripe_nr; - stripe_nr_end = (offset + *length + map->stripe_len - 1) & - (~(map->stripe_len - 1)); + stripe_nr_end = ALIGN(offset + *length, map->stripe_len); do_div(stripe_nr_end, map->stripe_len); stripe_end_offset = stripe_nr_end * map->stripe_len - (offset + *length); + if (map->type & BTRFS_BLOCK_GROUP_RAID0) { if (rw & REQ_DISCARD) num_stripes = min_t(u64, map->num_stripes, @@ -4276,6 +4610,65 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, dev_replace_is_ongoing); mirror_num = stripe_index - old_stripe_index + 1; } + + } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + u64 tmp; + + if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1) + && raid_map_ret) { + int i, rot; + + /* push stripe_nr back to the start of the full stripe */ + stripe_nr = raid56_full_stripe_start; + do_div(stripe_nr, stripe_len); + + stripe_index = do_div(stripe_nr, nr_data_stripes(map)); + + /* RAID[56] write or recovery. Return all stripes */ + num_stripes = map->num_stripes; + max_errors = nr_parity_stripes(map); + + raid_map = kmalloc(sizeof(u64) * num_stripes, + GFP_NOFS); + if (!raid_map) { + ret = -ENOMEM; + goto out; + } + + /* Work out the disk rotation on this stripe-set */ + tmp = stripe_nr; + rot = do_div(tmp, num_stripes); + + /* Fill in the logical address of each stripe */ + tmp = stripe_nr * nr_data_stripes(map); + for (i = 0; i < nr_data_stripes(map); i++) + raid_map[(i+rot) % num_stripes] = + em->start + (tmp + i) * map->stripe_len; + + raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; + if (map->type & BTRFS_BLOCK_GROUP_RAID6) + raid_map[(i+rot+1) % num_stripes] = + RAID6_Q_STRIPE; + + *length = map->stripe_len; + stripe_index = 0; + stripe_offset = 0; + } else { + /* + * Mirror #0 or #1 means the original data block. + * Mirror #2 is RAID5 parity block. + * Mirror #3 is RAID6 Q block. + */ + stripe_index = do_div(stripe_nr, nr_data_stripes(map)); + if (mirror_num > 1) + stripe_index = nr_data_stripes(map) + + mirror_num - 2; + + /* We distribute the parity blocks across stripes */ + tmp = stripe_nr + stripe_index; + stripe_index = do_div(tmp, map->num_stripes); + } } else { /* * after this do_div call, stripe_nr is the number of stripes @@ -4384,8 +4777,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_DUP)) { max_errors = 1; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { + max_errors = 2; } } @@ -4486,6 +4882,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, bbio->stripes[0].physical = physical_to_patch_in_first_stripe; bbio->mirror_num = map->num_stripes + 1; } + if (raid_map) { + sort_parity_stripes(bbio, raid_map); + *raid_map_ret = raid_map; + } out: if (dev_replace_is_ongoing) btrfs_dev_replace_unlock(dev_replace); @@ -4498,7 +4898,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, struct btrfs_bio **bbio_ret, int mirror_num) { return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, - mirror_num); + mirror_num, NULL); } int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, @@ -4512,6 +4912,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, u64 bytenr; u64 length; u64 stripe_nr; + u64 rmap_len; int i, j, nr = 0; read_lock(&em_tree->lock); @@ -4522,10 +4923,17 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, map = (struct map_lookup *)em->bdev; length = em->len; + rmap_len = map->stripe_len; + if (map->type & BTRFS_BLOCK_GROUP_RAID10) do_div(length, map->num_stripes / map->sub_stripes); else if (map->type & BTRFS_BLOCK_GROUP_RAID0) do_div(length, map->num_stripes); + else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + do_div(length, nr_data_stripes(map)); + rmap_len = map->stripe_len * nr_data_stripes(map); + } buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); BUG_ON(!buf); /* -ENOMEM */ @@ -4545,8 +4953,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, do_div(stripe_nr, map->sub_stripes); } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { stripe_nr = stripe_nr * map->num_stripes + i; - } - bytenr = chunk_start + stripe_nr * map->stripe_len; + } /* else if RAID[56], multiply by nr_data_stripes(). + * Alternatively, just use rmap_len below instead of + * map->stripe_len */ + + bytenr = chunk_start + stripe_nr * rmap_len; WARN_ON(nr >= map->num_stripes); for (j = 0; j < nr; j++) { if (buf[j] == bytenr) @@ -4560,7 +4971,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, *logical = buf; *naddrs = nr; - *stripe_len = map->stripe_len; + *stripe_len = rmap_len; free_extent_map(em); return 0; @@ -4634,7 +5045,7 @@ static void btrfs_end_bio(struct bio *bio, int err) bio->bi_bdev = (struct block_device *) (unsigned long)bbio->mirror_num; /* only send an error to the higher layers if it is - * beyond the tolerance of the multi-bio + * beyond the tolerance of the btrfs bio */ if (atomic_read(&bbio->error) > bbio->max_errors) { err = -EIO; @@ -4668,13 +5079,18 @@ struct async_sched { * This will add one bio to the pending list for a device and make sure * the work struct is scheduled. */ -static noinline void schedule_bio(struct btrfs_root *root, +noinline void btrfs_schedule_bio(struct btrfs_root *root, struct btrfs_device *device, int rw, struct bio *bio) { int should_queue = 1; struct btrfs_pending_bios *pending_bios; + if (device->missing || !device->bdev) { + bio_endio(bio, -EIO); + return; + } + /* don't bother with additional async steps for reads, right now */ if (!(rw & REQ_WRITE)) { bio_get(bio); @@ -4772,7 +5188,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, #endif bio->bi_bdev = dev->bdev; if (async) - schedule_bio(root, dev, rw, bio); + btrfs_schedule_bio(root, dev, rw, bio); else btrfsic_submit_bio(rw, bio); } @@ -4831,6 +5247,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, u64 logical = (u64)bio->bi_sector << 9; u64 length = 0; u64 map_length; + u64 *raid_map = NULL; int ret; int dev_nr = 0; int total_devs = 1; @@ -4839,12 +5256,30 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, length = bio->bi_size; map_length = length; - ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, - mirror_num); - if (ret) + ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, + mirror_num, &raid_map); + if (ret) /* -ENOMEM */ return ret; total_devs = bbio->num_stripes; + bbio->orig_bio = first_bio; + bbio->private = first_bio->bi_private; + bbio->end_io = first_bio->bi_end_io; + atomic_set(&bbio->stripes_pending, bbio->num_stripes); + + if (raid_map) { + /* In this case, map_length has been set to the length of + a single stripe; not the whole write */ + if (rw & WRITE) { + return raid56_parity_write(root, bio, bbio, + raid_map, map_length); + } else { + return raid56_parity_recover(root, bio, bbio, + raid_map, map_length, + mirror_num); + } + } + if (map_length < length) { printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu " "len %llu\n", (unsigned long long)logical, @@ -4853,11 +5288,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, BUG(); } - bbio->orig_bio = first_bio; - bbio->private = first_bio->bi_private; - bbio->end_io = first_bio->bi_end_io; - atomic_set(&bbio->stripes_pending, bbio->num_stripes); - while (dev_nr < total_devs) { dev = bbio->stripes[dev_nr].dev; if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index d3c3939..062d860 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -21,8 +21,8 @@ #include <linux/bio.h> #include <linux/sort.h> +#include <linux/btrfs.h> #include "async-thread.h" -#include "ioctl.h" #define BTRFS_STRIPE_LEN (64 * 1024) @@ -321,7 +321,14 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev); int btrfs_scratch_superblock(struct btrfs_device *device); - +void btrfs_schedule_bio(struct btrfs_root *root, + struct btrfs_device *device, + int rw, struct bio *bio); +int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, + u64 logical, u64 len, int mirror_num); +unsigned long btrfs_full_stripe_len(struct btrfs_root *root, + struct btrfs_mapping_tree *map_tree, + u64 logical); static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, int index) { diff --git a/fs/buffer.c b/fs/buffer.c index 8e18281..b4dcb34 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -41,6 +41,7 @@ #include <linux/bitops.h> #include <linux/mpage.h> #include <linux/bit_spinlock.h> +#include <trace/events/block.h> static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); @@ -53,6 +54,13 @@ void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) } EXPORT_SYMBOL(init_buffer); +inline void touch_buffer(struct buffer_head *bh) +{ + trace_block_touch_buffer(bh); + mark_page_accessed(bh->b_page); +} +EXPORT_SYMBOL(touch_buffer); + static int sleep_on_buffer(void *word) { io_schedule(); @@ -1113,6 +1121,8 @@ void mark_buffer_dirty(struct buffer_head *bh) { WARN_ON_ONCE(!buffer_uptodate(bh)); + trace_block_dirty_buffer(bh); + /* * Very *carefully* optimize the it-is-already-dirty case. * diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index d4f81ed..a60ea97 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -236,16 +236,10 @@ static int ceph_readpage(struct file *filp, struct page *page) static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) { struct inode *inode = req->r_inode; - struct ceph_osd_reply_head *replyhead; - int rc, bytes; + int rc = req->r_result; + int bytes = le32_to_cpu(msg->hdr.data_len); int i; - /* parse reply */ - replyhead = msg->front.iov_base; - WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); - rc = le32_to_cpu(replyhead->result); - bytes = le32_to_cpu(msg->hdr.data_len); - dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); /* unlock all pages, zeroing any data we didn't read */ @@ -315,7 +309,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL, 0, ci->i_truncate_seq, ci->i_truncate_size, - NULL, false, 1, 0); + NULL, false, 0); if (IS_ERR(req)) return PTR_ERR(req); @@ -492,8 +486,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) &ci->i_layout, snapc, page_off, len, ci->i_truncate_seq, ci->i_truncate_size, - &inode->i_mtime, - &page, 1, 0, 0, true); + &inode->i_mtime, &page, 1); if (err < 0) { dout("writepage setting page/mapping error %d %p\n", err, page); SetPageError(page); @@ -554,27 +547,18 @@ static void writepages_finish(struct ceph_osd_request *req, struct ceph_msg *msg) { struct inode *inode = req->r_inode; - struct ceph_osd_reply_head *replyhead; - struct ceph_osd_op *op; struct ceph_inode_info *ci = ceph_inode(inode); unsigned wrote; struct page *page; int i; struct ceph_snap_context *snapc = req->r_snapc; struct address_space *mapping = inode->i_mapping; - __s32 rc = -EIO; - u64 bytes = 0; + int rc = req->r_result; + u64 bytes = le64_to_cpu(req->r_request_ops[0].extent.length); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); long writeback_stat; unsigned issued = ceph_caps_issued(ci); - /* parse reply */ - replyhead = msg->front.iov_base; - WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); - op = (void *)(replyhead + 1); - rc = le32_to_cpu(replyhead->result); - bytes = le64_to_cpu(op->extent.length); - if (rc >= 0) { /* * Assume we wrote the pages we originally sent. The @@ -741,8 +725,6 @@ retry: struct page *page; int want; u64 offset, len; - struct ceph_osd_request_head *reqhead; - struct ceph_osd_op *op; long writeback_stat; next = 0; @@ -838,7 +820,7 @@ get_more_pages: snapc, do_sync, ci->i_truncate_seq, ci->i_truncate_size, - &inode->i_mtime, true, 1, 0); + &inode->i_mtime, true, 0); if (IS_ERR(req)) { rc = PTR_ERR(req); @@ -906,10 +888,8 @@ get_more_pages: /* revise final length, page count */ req->r_num_pages = locked_pages; - reqhead = req->r_request->front.iov_base; - op = (void *)(reqhead + 1); - op->extent.length = cpu_to_le64(len); - op->payload_len = cpu_to_le32(len); + req->r_request_ops[0].extent.length = cpu_to_le64(len); + req->r_request_ops[0].payload_len = cpu_to_le32(len); req->r_request->hdr.data_len = cpu_to_le32(len); rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index ae2be69..78e2f57 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -611,8 +611,16 @@ retry: if (flags & CEPH_CAP_FLAG_AUTH) ci->i_auth_cap = cap; - else if (ci->i_auth_cap == cap) + else if (ci->i_auth_cap == cap) { ci->i_auth_cap = NULL; + spin_lock(&mdsc->cap_dirty_lock); + if (!list_empty(&ci->i_dirty_item)) { + dout(" moving %p to cap_dirty_migrating\n", inode); + list_move(&ci->i_dirty_item, + &mdsc->cap_dirty_migrating); + } + spin_unlock(&mdsc->cap_dirty_lock); + } dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", inode, ceph_vinop(inode), cap, ceph_cap_string(issued), @@ -1460,7 +1468,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = &ci->vfs_inode; struct ceph_cap *cap; - int file_wanted, used; + int file_wanted, used, cap_used; int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ int issued, implemented, want, retain, revoking, flushing = 0; int mds = -1; /* keep track of how far we've gone through i_caps list @@ -1563,9 +1571,14 @@ retry_locked: /* NOTE: no side-effects allowed, until we take s_mutex */ + cap_used = used; + if (ci->i_auth_cap && cap != ci->i_auth_cap) + cap_used &= ~ci->i_auth_cap->issued; + revoking = cap->implemented & ~cap->issued; - dout(" mds%d cap %p issued %s implemented %s revoking %s\n", + dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n", cap->mds, cap, ceph_cap_string(cap->issued), + ceph_cap_string(cap_used), ceph_cap_string(cap->implemented), ceph_cap_string(revoking)); @@ -1593,7 +1606,7 @@ retry_locked: } /* completed revocation? going down and there are no caps? */ - if (revoking && (revoking & used) == 0) { + if (revoking && (revoking & cap_used) == 0) { dout("completed revocation of %s\n", ceph_cap_string(cap->implemented & ~cap->issued)); goto ack; @@ -1670,8 +1683,8 @@ ack: sent++; /* __send_cap drops i_ceph_lock */ - delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want, - retain, flushing, NULL); + delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used, + want, retain, flushing, NULL); goto retry; /* retake i_ceph_lock and restart our cap scan. */ } @@ -2417,7 +2430,9 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, dout("mds wanted %s -> %s\n", ceph_cap_string(le32_to_cpu(grant->wanted)), ceph_cap_string(wanted)); - grant->wanted = cpu_to_le32(wanted); + /* imported cap may not have correct mds_wanted */ + if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) + check_caps = 1; } cap->seq = seq; @@ -2821,6 +2836,9 @@ void ceph_handle_caps(struct ceph_mds_session *session, dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, (unsigned)seq); + if (op == CEPH_CAP_OP_IMPORT) + ceph_add_cap_releases(mdsc, session); + /* lookup ino */ inode = ceph_find_inode(sb, vino); ci = ceph_inode(inode); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 11b57c2..bf338d9 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -243,6 +243,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, err = ceph_mdsc_do_request(mdsc, (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, req); + if (err) + goto out_err; + err = ceph_handle_snapdir(req, dentry, err); if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); @@ -263,6 +266,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, err = finish_no_open(file, dn); } else { dout("atomic_open finish_open on dn %p\n", dn); + if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { + *opened |= FILE_CREATED; + } err = finish_open(file, dentry, ceph_open, opened); } @@ -535,7 +541,7 @@ more: ci->i_snap_realm->cached_context, do_sync, ci->i_truncate_seq, ci->i_truncate_size, - &mtime, false, 2, page_align); + &mtime, false, page_align); if (IS_ERR(req)) return PTR_ERR(req); diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index f5ed767..4a98934 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -185,7 +185,6 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) &ceph_sb_to_client(inode->i_sb)->client->osdc; u64 len = 1, olen; u64 tmp; - struct ceph_object_layout ol; struct ceph_pg pgid; int r; @@ -194,7 +193,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) return -EFAULT; down_read(&osdc->map_sem); - r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len, + r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len, &dl.object_no, &dl.object_offset, &olen); if (r < 0) @@ -209,10 +208,9 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", ceph_ino(inode), dl.object_no); - ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout, + ceph_calc_object_layout(&pgid, dl.object_name, &ci->i_layout, osdc->osdmap); - pgid = ol.ol_pgid; dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); if (dl.osd >= 0) { struct ceph_entity_addr *a = diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 7a3dfe0..442880d 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -233,6 +233,30 @@ bad: } /* + * parse create results + */ +static int parse_reply_info_create(void **p, void *end, + struct ceph_mds_reply_info_parsed *info, + int features) +{ + if (features & CEPH_FEATURE_REPLY_CREATE_INODE) { + if (*p == end) { + info->has_create_ino = false; + } else { + info->has_create_ino = true; + info->ino = ceph_decode_64(p); + } + } + + if (unlikely(*p != end)) + goto bad; + return 0; + +bad: + return -EIO; +} + +/* * parse extra results */ static int parse_reply_info_extra(void **p, void *end, @@ -241,8 +265,12 @@ static int parse_reply_info_extra(void **p, void *end, { if (info->head->op == CEPH_MDS_OP_GETFILELOCK) return parse_reply_info_filelock(p, end, info, features); - else + else if (info->head->op == CEPH_MDS_OP_READDIR) return parse_reply_info_dir(p, end, info, features); + else if (info->head->op == CEPH_MDS_OP_CREATE) + return parse_reply_info_create(p, end, info, features); + else + return -EIO; } /* @@ -2170,7 +2198,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) mutex_lock(&req->r_fill_mutex); err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); if (err == 0) { - if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK && + if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || + req->r_op == CEPH_MDS_OP_LSSNAP) && rinfo->dir_nr) ceph_readdir_prepopulate(req, req->r_session); ceph_unreserve_caps(mdsc, &req->r_caps_reservation); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index ff4188b..c2a19fb 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -74,6 +74,12 @@ struct ceph_mds_reply_info_parsed { struct ceph_mds_reply_info_in *dir_in; u8 dir_complete, dir_end; }; + + /* for create results */ + struct { + bool has_create_ino; + u64 ino; + }; }; /* encoded blob describing snapshot contexts for certain diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 73b7d44..0d3c924 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -59,6 +59,10 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) return ERR_PTR(-ENOMEM); ceph_decode_16_safe(p, end, version, bad); + if (version > 3) { + pr_warning("got mdsmap version %d > 3, failing", version); + goto bad; + } ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad); m->m_epoch = ceph_decode_32(p); @@ -144,13 +148,13 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) /* pg_pools */ ceph_decode_32_safe(p, end, n, bad); m->m_num_data_pg_pools = n; - m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS); + m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS); if (!m->m_data_pg_pools) goto badmem; - ceph_decode_need(p, end, sizeof(u32)*(n+1), bad); + ceph_decode_need(p, end, sizeof(u64)*(n+1), bad); for (i = 0; i < n; i++) - m->m_data_pg_pools[i] = ceph_decode_32(p); - m->m_cas_pg_pool = ceph_decode_32(p); + m->m_data_pg_pools[i] = ceph_decode_64(p); + m->m_cas_pg_pool = ceph_decode_64(p); /* ok, we don't care about the rest. */ dout("mdsmap_decode success epoch %u\n", m->m_epoch); diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c index cd5097d..89fa4a9 100644 --- a/fs/ceph/strings.c +++ b/fs/ceph/strings.c @@ -15,6 +15,7 @@ const char *ceph_mds_state_name(int s) case CEPH_MDS_STATE_BOOT: return "up:boot"; case CEPH_MDS_STATE_STANDBY: return "up:standby"; case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay"; + case CEPH_MDS_STATE_REPLAYONCE: return "up:oneshot-replay"; case CEPH_MDS_STATE_CREATING: return "up:creating"; case CEPH_MDS_STATE_STARTING: return "up:starting"; /* up and in */ @@ -50,10 +51,13 @@ const char *ceph_mds_op_name(int op) case CEPH_MDS_OP_LOOKUP: return "lookup"; case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash"; case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent"; + case CEPH_MDS_OP_LOOKUPINO: return "lookupino"; case CEPH_MDS_OP_GETATTR: return "getattr"; case CEPH_MDS_OP_SETXATTR: return "setxattr"; case CEPH_MDS_OP_SETATTR: return "setattr"; case CEPH_MDS_OP_RMXATTR: return "rmxattr"; + case CEPH_MDS_OP_SETLAYOUT: return "setlayou"; + case CEPH_MDS_OP_SETDIRLAYOUT: return "setdirlayout"; case CEPH_MDS_OP_READDIR: return "readdir"; case CEPH_MDS_OP_MKNOD: return "mknod"; case CEPH_MDS_OP_LINK: return "link"; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index e86aa994..9fe17c6c 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -71,8 +71,14 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) /* * express utilization in terms of large blocks to avoid * overflow on 32-bit machines. + * + * NOTE: for the time being, we make bsize == frsize to humor + * not-yet-ancient versions of glibc that are broken. + * Someday, we will probably want to report a real block + * size... whatever that may mean for a network file system! */ buf->f_bsize = 1 << CEPH_BLOCK_SHIFT; + buf->f_frsize = 1 << CEPH_BLOCK_SHIFT; buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10); buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); @@ -80,7 +86,6 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = le64_to_cpu(st.num_objects); buf->f_ffree = -1; buf->f_namelen = NAME_MAX; - buf->f_frsize = PAGE_CACHE_SIZE; /* leave fsid little-endian, regardless of host endianness */ fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index f053bbd..c7b3097 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -21,7 +21,7 @@ /* large granularity for statfs utilization stats to facilitate * large volume sizes on 32-bit machines. */ -#define CEPH_BLOCK_SHIFT 20 /* 1 MB */ +#define CEPH_BLOCK_SHIFT 22 /* 4 MB */ #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) #define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ @@ -798,13 +798,7 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); /* file.c */ extern const struct file_operations ceph_file_fops; extern const struct address_space_operations ceph_aops; -extern int ceph_copy_to_page_vector(struct page **pages, - const char *data, - loff_t off, size_t len); -extern int ceph_copy_from_page_vector(struct page **pages, - char *data, - loff_t off, size_t len); -extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); + extern int ceph_open(struct inode *inode, struct file *file); extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned flags, umode_t mode, diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 2c2ae5b..9b6b2b6d 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -29,9 +29,94 @@ struct ceph_vxattr { size_t name_size; /* strlen(name) + 1 (for '\0') */ size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val, size_t size); - bool readonly; + bool readonly, hidden; + bool (*exists_cb)(struct ceph_inode_info *ci); }; +/* layouts */ + +static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci) +{ + size_t s; + char *p = (char *)&ci->i_layout; + + for (s = 0; s < sizeof(ci->i_layout); s++, p++) + if (*p) + return true; + return false; +} + +static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, + size_t size) +{ + int ret; + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + struct ceph_osd_client *osdc = &fsc->client->osdc; + s64 pool = ceph_file_layout_pg_pool(ci->i_layout); + const char *pool_name; + + dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); + down_read(&osdc->map_sem); + pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); + if (pool_name) + ret = snprintf(val, size, + "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%s", + (unsigned long long)ceph_file_layout_su(ci->i_layout), + (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), + (unsigned long long)ceph_file_layout_object_size(ci->i_layout), + pool_name); + else + ret = snprintf(val, size, + "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld", + (unsigned long long)ceph_file_layout_su(ci->i_layout), + (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), + (unsigned long long)ceph_file_layout_object_size(ci->i_layout), + (unsigned long long)pool); + + up_read(&osdc->map_sem); + return ret; +} + +static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci, + char *val, size_t size) +{ + return snprintf(val, size, "%lld", + (unsigned long long)ceph_file_layout_su(ci->i_layout)); +} + +static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci, + char *val, size_t size) +{ + return snprintf(val, size, "%lld", + (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout)); +} + +static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci, + char *val, size_t size) +{ + return snprintf(val, size, "%lld", + (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); +} + +static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, + char *val, size_t size) +{ + int ret; + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + struct ceph_osd_client *osdc = &fsc->client->osdc; + s64 pool = ceph_file_layout_pg_pool(ci->i_layout); + const char *pool_name; + + down_read(&osdc->map_sem); + pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); + if (pool_name) + ret = snprintf(val, size, "%s", pool_name); + else + ret = snprintf(val, size, "%lld", (unsigned long long)pool); + up_read(&osdc->map_sem); + return ret; +} + /* directories */ static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val, @@ -83,17 +168,43 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val, (long)ci->i_rctime.tv_nsec); } -#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name -#define XATTR_NAME_CEPH(_type, _name) \ - { \ - .name = CEPH_XATTR_NAME(_type, _name), \ - .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \ - .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ - .readonly = true, \ - } +#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name +#define CEPH_XATTR_NAME2(_type, _name, _name2) \ + XATTR_CEPH_PREFIX #_type "." #_name "." #_name2 + +#define XATTR_NAME_CEPH(_type, _name) \ + { \ + .name = CEPH_XATTR_NAME(_type, _name), \ + .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \ + .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ + .readonly = true, \ + .hidden = false, \ + .exists_cb = NULL, \ + } +#define XATTR_LAYOUT_FIELD(_type, _name, _field) \ + { \ + .name = CEPH_XATTR_NAME2(_type, _name, _field), \ + .name_size = sizeof (CEPH_XATTR_NAME2(_type, _name, _field)), \ + .getxattr_cb = ceph_vxattrcb_ ## _name ## _ ## _field, \ + .readonly = false, \ + .hidden = true, \ + .exists_cb = ceph_vxattrcb_layout_exists, \ + } static struct ceph_vxattr ceph_dir_vxattrs[] = { + { + .name = "ceph.dir.layout", + .name_size = sizeof("ceph.dir.layout"), + .getxattr_cb = ceph_vxattrcb_layout, + .readonly = false, + .hidden = false, + .exists_cb = ceph_vxattrcb_layout_exists, + }, + XATTR_LAYOUT_FIELD(dir, layout, stripe_unit), + XATTR_LAYOUT_FIELD(dir, layout, stripe_count), + XATTR_LAYOUT_FIELD(dir, layout, object_size), + XATTR_LAYOUT_FIELD(dir, layout, pool), XATTR_NAME_CEPH(dir, entries), XATTR_NAME_CEPH(dir, files), XATTR_NAME_CEPH(dir, subdirs), @@ -102,35 +213,26 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = { XATTR_NAME_CEPH(dir, rsubdirs), XATTR_NAME_CEPH(dir, rbytes), XATTR_NAME_CEPH(dir, rctime), - { 0 } /* Required table terminator */ + { .name = NULL, 0 } /* Required table terminator */ }; static size_t ceph_dir_vxattrs_name_size; /* total size of all names */ /* files */ -static size_t ceph_vxattrcb_file_layout(struct ceph_inode_info *ci, char *val, - size_t size) -{ - int ret; - - ret = snprintf(val, size, - "chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n", - (unsigned long long)ceph_file_layout_su(ci->i_layout), - (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), - (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); - return ret; -} - static struct ceph_vxattr ceph_file_vxattrs[] = { - XATTR_NAME_CEPH(file, layout), - /* The following extended attribute name is deprecated */ { - .name = XATTR_CEPH_PREFIX "layout", - .name_size = sizeof (XATTR_CEPH_PREFIX "layout"), - .getxattr_cb = ceph_vxattrcb_file_layout, - .readonly = true, + .name = "ceph.file.layout", + .name_size = sizeof("ceph.file.layout"), + .getxattr_cb = ceph_vxattrcb_layout, + .readonly = false, + .hidden = false, + .exists_cb = ceph_vxattrcb_layout_exists, }, - { 0 } /* Required table terminator */ + XATTR_LAYOUT_FIELD(file, layout, stripe_unit), + XATTR_LAYOUT_FIELD(file, layout, stripe_count), + XATTR_LAYOUT_FIELD(file, layout, object_size), + XATTR_LAYOUT_FIELD(file, layout, pool), + { .name = NULL, 0 } /* Required table terminator */ }; static size_t ceph_file_vxattrs_name_size; /* total size of all names */ @@ -164,7 +266,8 @@ static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs) size_t size = 0; for (vxattr = vxattrs; vxattr->name; vxattr++) - size += vxattr->name_size; + if (!vxattr->hidden) + size += vxattr->name_size; return size; } @@ -572,13 +675,17 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, if (!ceph_is_valid_xattr(name)) return -ENODATA; - /* let's see if a virtual xattr was requested */ - vxattr = ceph_match_vxattr(inode, name); - spin_lock(&ci->i_ceph_lock); dout("getxattr %p ver=%lld index_ver=%lld\n", inode, ci->i_xattrs.version, ci->i_xattrs.index_version); + /* let's see if a virtual xattr was requested */ + vxattr = ceph_match_vxattr(inode, name); + if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) { + err = vxattr->getxattr_cb(ci, value, size); + goto out; + } + if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { goto get_xattr; @@ -592,11 +699,6 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, spin_lock(&ci->i_ceph_lock); - if (vxattr && vxattr->readonly) { - err = vxattr->getxattr_cb(ci, value, size); - goto out; - } - err = __build_xattrs(inode); if (err < 0) goto out; @@ -604,11 +706,8 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, get_xattr: err = -ENODATA; /* == ENOATTR */ xattr = __get_xattr(ci, name); - if (!xattr) { - if (vxattr) - err = vxattr->getxattr_cb(ci, value, size); + if (!xattr) goto out; - } err = -ERANGE; if (size && size < xattr->val_len) @@ -664,23 +763,30 @@ list_xattr: vir_namelen = ceph_vxattrs_name_size(vxattrs); /* adding 1 byte per each variable due to the null termination */ - namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count; + namelen = ci->i_xattrs.names_size + ci->i_xattrs.count; err = -ERANGE; - if (size && namelen > size) + if (size && vir_namelen + namelen > size) goto out; - err = namelen; + err = namelen + vir_namelen; if (size == 0) goto out; names = __copy_xattr_names(ci, names); /* virtual xattr names, too */ - if (vxattrs) + err = namelen; + if (vxattrs) { for (i = 0; vxattrs[i].name; i++) { - len = sprintf(names, "%s", vxattrs[i].name); - names += len + 1; + if (!vxattrs[i].hidden && + !(vxattrs[i].exists_cb && + !vxattrs[i].exists_cb(ci))) { + len = sprintf(names, "%s", vxattrs[i].name); + names += len + 1; + err += len + 1; + } } + } out: spin_unlock(&ci->i_ceph_lock); @@ -782,6 +888,10 @@ int ceph_setxattr(struct dentry *dentry, const char *name, if (vxattr && vxattr->readonly) return -EOPNOTSUPP; + /* pass any unhandled ceph.* xattrs through to the MDS */ + if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) + goto do_sync_unlocked; + /* preallocate memory for xattr name, value, index node */ err = -ENOMEM; newname = kmemdup(name, name_len + 1, GFP_NOFS); @@ -838,6 +948,7 @@ retry: do_sync: spin_unlock(&ci->i_ceph_lock); +do_sync_unlocked: err = ceph_sync_setxattr(dentry, name, value, size, flags); out: kfree(newname); @@ -892,6 +1003,10 @@ int ceph_removexattr(struct dentry *dentry, const char *name) if (vxattr && vxattr->readonly) return -EOPNOTSUPP; + /* pass any unhandled ceph.* xattrs through to the MDS */ + if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) + goto do_sync_unlocked; + err = -ENOMEM; spin_lock(&ci->i_ceph_lock); retry: @@ -931,6 +1046,7 @@ retry: return err; do_sync: spin_unlock(&ci->i_ceph_lock); +do_sync_unlocked: err = ceph_send_removexattr(dentry, name); out: return err; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 4bad7b1..1a052c0 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -564,6 +564,11 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) dentry = ERR_PTR(-ENOENT); break; } + if (!S_ISDIR(dir->i_mode)) { + dput(dentry); + dentry = ERR_PTR(-ENOTDIR); + break; + } /* skip separators */ while (*s == sep) diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 00e12f2..7353bc5 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1909,8 +1909,11 @@ cifs_writev_requeue(struct cifs_writedata *wdata) } while (rc == -EAGAIN); for (i = 0; i < wdata->nr_pages; i++) { - if (rc != 0) + if (rc != 0) { SetPageError(wdata->pages[i]); + end_page_writeback(wdata->pages[i]); + page_cache_release(wdata->pages[i]); + } unlock_page(wdata->pages[i]); } diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 4474a57..54125e0 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1031,7 +1031,7 @@ static int cifs_parse_security_flavors(char *value, switch (match_token(value, cifs_secflavor_tokens, args)) { case Opt_sec_krb5: - vol->secFlg |= CIFSSEC_MAY_KRB5; + vol->secFlg |= CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_SIGN; break; case Opt_sec_krb5i: vol->secFlg |= CIFSSEC_MAY_KRB5 | CIFSSEC_MUST_SIGN; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index c16d2a0..8c0d855 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -43,6 +43,7 @@ #include "cifs_fs_sb.h" #include "fscache.h" + static inline int cifs_convert_flags(unsigned int flags) { if ((flags & O_ACCMODE) == O_RDONLY) @@ -72,10 +73,15 @@ static u32 cifs_posix_convert_flags(unsigned int flags) else if ((flags & O_ACCMODE) == O_RDWR) posix_flags = SMB_O_RDWR; - if (flags & O_CREAT) + if (flags & O_CREAT) { posix_flags |= SMB_O_CREAT; - if (flags & O_EXCL) - posix_flags |= SMB_O_EXCL; + if (flags & O_EXCL) + posix_flags |= SMB_O_EXCL; + } else if (flags & O_EXCL) + cFYI(1, "Application %s pid %d has incorrectly set O_EXCL flag" + "but not O_CREAT on file open. Ignoring O_EXCL", + current->comm, current->tgid); + if (flags & O_TRUNC) posix_flags |= SMB_O_TRUNC; /* be safe and imply O_SYNC for O_DSYNC */ diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index d2a8339..83f2606 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -816,10 +816,9 @@ static bool inode_has_hashed_dentries(struct inode *inode) { struct dentry *dentry; - struct hlist_node *p; spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) { + hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) { if (!d_unhashed(dentry) || IS_ROOT(dentry)) { spin_unlock(&inode->i_lock); return true; diff --git a/fs/coredump.c b/fs/coredump.c index 69baf90..c647965 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -501,7 +501,7 @@ void do_coredump(siginfo_t *siginfo) * so we dump it as root in mode 2, and only into a controlled * environment (pipe handler or fully qualified path). */ - if (__get_dumpable(cprm.mm_flags) == SUID_DUMPABLE_SAFE) { + if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) { /* Setuid core dump mode */ flag = O_EXCL; /* Stop rewrite attacks */ cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */ diff --git a/fs/dcache.c b/fs/dcache.c index 68220dd..fbfae008 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -675,11 +675,10 @@ EXPORT_SYMBOL(dget_parent); static struct dentry *__d_find_alias(struct inode *inode, int want_discon) { struct dentry *alias, *discon_alias; - struct hlist_node *p; again: discon_alias = NULL; - hlist_for_each_entry(alias, p, &inode->i_dentry, d_alias) { + hlist_for_each_entry(alias, &inode->i_dentry, d_alias) { spin_lock(&alias->d_lock); if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { if (IS_ROOT(alias) && @@ -730,10 +729,9 @@ EXPORT_SYMBOL(d_find_alias); void d_prune_aliases(struct inode *inode) { struct dentry *dentry; - struct hlist_node *p; restart: spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) { + hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) { spin_lock(&dentry->d_lock); if (!dentry->d_count) { __dget_dlock(dentry); @@ -1443,14 +1441,13 @@ static struct dentry *__d_instantiate_unique(struct dentry *entry, int len = entry->d_name.len; const char *name = entry->d_name.name; unsigned int hash = entry->d_name.hash; - struct hlist_node *p; if (!inode) { __d_instantiate(entry, NULL); return NULL; } - hlist_for_each_entry(alias, p, &inode->i_dentry, d_alias) { + hlist_for_each_entry(alias, &inode->i_dentry, d_alias) { /* * Don't need alias->d_lock here, because aliases with * d_parent == entry->d_parent are not subject to name or diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index f750165..1b11466 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1183,7 +1183,7 @@ static void detach_lkb(struct dlm_lkb *lkb) static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) { struct dlm_lkb *lkb; - int rv, id; + int rv; lkb = dlm_allocate_lkb(ls); if (!lkb) @@ -1199,19 +1199,13 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) mutex_init(&lkb->lkb_cb_mutex); INIT_WORK(&lkb->lkb_cb_work, dlm_callback_work); - retry: - rv = idr_pre_get(&ls->ls_lkbidr, GFP_NOFS); - if (!rv) - return -ENOMEM; - + idr_preload(GFP_NOFS); spin_lock(&ls->ls_lkbidr_spin); - rv = idr_get_new_above(&ls->ls_lkbidr, lkb, 1, &id); - if (!rv) - lkb->lkb_id = id; + rv = idr_alloc(&ls->ls_lkbidr, lkb, 1, 0, GFP_NOWAIT); + if (rv >= 0) + lkb->lkb_id = rv; spin_unlock(&ls->ls_lkbidr_spin); - - if (rv == -EAGAIN) - goto retry; + idr_preload_end(); if (rv < 0) { log_error(ls, "create_lkb idr error %d", rv); diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 2e99fb0..3ca79d3 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -796,7 +796,6 @@ static int release_lockspace(struct dlm_ls *ls, int force) */ idr_for_each(&ls->ls_lkbidr, lkb_idr_free, ls); - idr_remove_all(&ls->ls_lkbidr); idr_destroy(&ls->ls_lkbidr); /* diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index dd87a31..4f5ad24 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -177,12 +177,11 @@ static inline int nodeid_hash(int nodeid) static struct connection *__find_con(int nodeid) { int r; - struct hlist_node *h; struct connection *con; r = nodeid_hash(nodeid); - hlist_for_each_entry(con, h, &connection_hash[r], list) { + hlist_for_each_entry(con, &connection_hash[r], list) { if (con->nodeid == nodeid) return con; } @@ -232,13 +231,12 @@ static struct connection *__nodeid2con(int nodeid, gfp_t alloc) static void foreach_conn(void (*conn_func)(struct connection *c)) { int i; - struct hlist_node *h, *n; + struct hlist_node *n; struct connection *con; for (i = 0; i < CONN_HASH_SIZE; i++) { - hlist_for_each_entry_safe(con, h, n, &connection_hash[i], list){ + hlist_for_each_entry_safe(con, n, &connection_hash[i], list) conn_func(con); - } } } @@ -257,13 +255,12 @@ static struct connection *nodeid2con(int nodeid, gfp_t allocation) static struct connection *assoc2con(int assoc_id) { int i; - struct hlist_node *h; struct connection *con; mutex_lock(&connections_lock); for (i = 0 ; i < CONN_HASH_SIZE; i++) { - hlist_for_each_entry(con, h, &connection_hash[i], list) { + hlist_for_each_entry(con, &connection_hash[i], list) { if (con->sctp_assoc == assoc_id) { mutex_unlock(&connections_lock); return con; diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index aedea28..a6bc63f 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c @@ -305,27 +305,26 @@ static int recover_idr_empty(struct dlm_ls *ls) static int recover_idr_add(struct dlm_rsb *r) { struct dlm_ls *ls = r->res_ls; - int rv, id; - - rv = idr_pre_get(&ls->ls_recover_idr, GFP_NOFS); - if (!rv) - return -ENOMEM; + int rv; + idr_preload(GFP_NOFS); spin_lock(&ls->ls_recover_idr_lock); if (r->res_id) { - spin_unlock(&ls->ls_recover_idr_lock); - return -1; - } - rv = idr_get_new_above(&ls->ls_recover_idr, r, 1, &id); - if (rv) { - spin_unlock(&ls->ls_recover_idr_lock); - return rv; + rv = -1; + goto out_unlock; } - r->res_id = id; + rv = idr_alloc(&ls->ls_recover_idr, r, 1, 0, GFP_NOWAIT); + if (rv < 0) + goto out_unlock; + + r->res_id = rv; ls->ls_recover_list_count++; dlm_hold_rsb(r); + rv = 0; +out_unlock: spin_unlock(&ls->ls_recover_idr_lock); - return 0; + idr_preload_end(); + return rv; } static void recover_idr_del(struct dlm_rsb *r) @@ -351,24 +350,21 @@ static struct dlm_rsb *recover_idr_find(struct dlm_ls *ls, uint64_t id) return r; } -static int recover_idr_clear_rsb(int id, void *p, void *data) +static void recover_idr_clear(struct dlm_ls *ls) { - struct dlm_ls *ls = data; - struct dlm_rsb *r = p; + struct dlm_rsb *r; + int id; - r->res_id = 0; - r->res_recover_locks_count = 0; - ls->ls_recover_list_count--; + spin_lock(&ls->ls_recover_idr_lock); - dlm_put_rsb(r); - return 0; -} + idr_for_each_entry(&ls->ls_recover_idr, r, id) { + idr_remove(&ls->ls_recover_idr, id); + r->res_id = 0; + r->res_recover_locks_count = 0; + ls->ls_recover_list_count--; -static void recover_idr_clear(struct dlm_ls *ls) -{ - spin_lock(&ls->ls_recover_idr_lock); - idr_for_each(&ls->ls_recover_idr, recover_idr_clear_rsb, ls); - idr_remove_all(&ls->ls_recover_idr); + dlm_put_rsb(r); + } if (ls->ls_recover_list_count != 0) { log_error(ls, "warning: recover_list_count %d", diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 5fa2471..8d7a577 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -115,10 +115,9 @@ void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx) */ int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon) { - struct hlist_node *elem; int rc; - hlist_for_each_entry(*daemon, elem, + hlist_for_each_entry(*daemon, &ecryptfs_daemon_hash[ecryptfs_current_euid_hash()], euid_chain) { if (uid_eq((*daemon)->file->f_cred->euid, current_euid())) { @@ -445,7 +444,6 @@ void ecryptfs_release_messaging(void) mutex_unlock(&ecryptfs_msg_ctx_lists_mux); } if (ecryptfs_daemon_hash) { - struct hlist_node *elem; struct ecryptfs_daemon *daemon; int i; @@ -453,7 +451,7 @@ void ecryptfs_release_messaging(void) for (i = 0; i < (1 << ecryptfs_hash_bits); i++) { int rc; - hlist_for_each_entry(daemon, elem, + hlist_for_each_entry(daemon, &ecryptfs_daemon_hash[i], euid_chain) { rc = ecryptfs_exorcise_daemon(daemon); @@ -1111,7 +1111,7 @@ void setup_new_exec(struct linux_binprm * bprm) current->sas_ss_sp = current->sas_ss_size = 0; if (uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid())) - set_dumpable(current->mm, SUID_DUMPABLE_ENABLED); + set_dumpable(current->mm, SUID_DUMP_USER); else set_dumpable(current->mm, suid_dumpable); @@ -1639,17 +1639,17 @@ EXPORT_SYMBOL(set_binfmt); void set_dumpable(struct mm_struct *mm, int value) { switch (value) { - case SUID_DUMPABLE_DISABLED: + case SUID_DUMP_DISABLE: clear_bit(MMF_DUMPABLE, &mm->flags); smp_wmb(); clear_bit(MMF_DUMP_SECURELY, &mm->flags); break; - case SUID_DUMPABLE_ENABLED: + case SUID_DUMP_USER: set_bit(MMF_DUMPABLE, &mm->flags); smp_wmb(); clear_bit(MMF_DUMP_SECURELY, &mm->flags); break; - case SUID_DUMPABLE_SAFE: + case SUID_DUMP_ROOT: set_bit(MMF_DUMP_SECURELY, &mm->flags); smp_wmb(); set_bit(MMF_DUMPABLE, &mm->flags); @@ -1662,7 +1662,7 @@ int __get_dumpable(unsigned long mm_flags) int ret; ret = mm_flags & MMF_DUMPABLE_MASK; - return (ret > SUID_DUMPABLE_ENABLED) ? SUID_DUMPABLE_SAFE : ret; + return (ret > SUID_DUMP_USER) ? SUID_DUMP_ROOT : ret; } int get_dumpable(struct mm_struct *mm) diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 5df4bb4..262fc99 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -44,14 +44,13 @@ find_acceptable_alias(struct dentry *result, { struct dentry *dentry, *toput = NULL; struct inode *inode; - struct hlist_node *p; if (acceptable(context, result)) return result; inode = result->d_inode; spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) { + hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) { dget(dentry); spin_unlock(&inode->i_lock); if (toput) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 2f2e0da..92e68b3 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -635,7 +635,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) brelse(bitmap_bh); printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu" ", computed = %llu, %llu\n", - EXT4_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)), + EXT4_NUM_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)), desc_count, bitmap_count); return bitmap_count; #else diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 6dda04f..d8cd1f0 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -334,7 +334,7 @@ static inline loff_t ext4_get_htree_eof(struct file *filp) * * For non-htree, ext4_llseek already chooses the proper max offset. */ -loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence) +static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; int dx_dir = is_dx_dir(inode); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6e16c18..4a01ba3 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1309,6 +1309,7 @@ struct ext4_sb_info { /* Reclaim extents from extent status tree */ struct shrinker s_es_shrinker; struct list_head s_es_lru; + struct percpu_counter s_extent_cache_cnt; spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; }; diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index f768f4a..95796a1 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -147,11 +147,12 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t end); static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, int nr_to_scan); -static int ext4_es_reclaim_extents_count(struct super_block *sb); int __init ext4_init_es(void) { - ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT); + ext4_es_cachep = kmem_cache_create("ext4_extent_status", + sizeof(struct extent_status), + 0, (SLAB_RECLAIM_ACCOUNT), NULL); if (ext4_es_cachep == NULL) return -ENOMEM; return 0; @@ -302,8 +303,10 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, /* * We don't count delayed extent because we never try to reclaim them */ - if (!ext4_es_is_delayed(es)) + if (!ext4_es_is_delayed(es)) { EXT4_I(inode)->i_es_lru_nr++; + percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt); + } return es; } @@ -314,6 +317,7 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) if (!ext4_es_is_delayed(es)) { BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0); EXT4_I(inode)->i_es_lru_nr--; + percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt); } kmem_cache_free(ext4_es_cachep, es); @@ -674,10 +678,11 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) int nr_to_scan = sc->nr_to_scan; int ret, nr_shrunk = 0; - trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan); + ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); + trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret); if (!nr_to_scan) - return ext4_es_reclaim_extents_count(sbi->s_sb); + return ret; INIT_LIST_HEAD(&scanned); @@ -705,9 +710,10 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) } list_splice_tail(&scanned, &sbi->s_es_lru); spin_unlock(&sbi->s_es_lru_lock); - trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk); - return ext4_es_reclaim_extents_count(sbi->s_sb); + ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); + trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret); + return ret; } void ext4_es_register_shrinker(struct super_block *sb) @@ -751,25 +757,6 @@ void ext4_es_lru_del(struct inode *inode) spin_unlock(&sbi->s_es_lru_lock); } -static int ext4_es_reclaim_extents_count(struct super_block *sb) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_inode_info *ei; - struct list_head *cur; - int nr_cached = 0; - - spin_lock(&sbi->s_es_lru_lock); - list_for_each(cur, &sbi->s_es_lru) { - ei = list_entry(cur, struct ext4_inode_info, i_es_lru); - read_lock(&ei->i_es_lock); - nr_cached += ei->i_es_lru_nr; - read_unlock(&ei->i_es_lock); - } - spin_unlock(&sbi->s_es_lru_lock); - trace_ext4_es_reclaim_extents_count(sb, nr_cached); - return nr_cached; -} - static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, int nr_to_scan) { diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index cf83e77..f190dfe 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -20,10 +20,13 @@ #define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif -#define EXTENT_STATUS_WRITTEN 0x80000000 /* written extent */ -#define EXTENT_STATUS_UNWRITTEN 0x40000000 /* unwritten extent */ -#define EXTENT_STATUS_DELAYED 0x20000000 /* delayed extent */ -#define EXTENT_STATUS_HOLE 0x10000000 /* hole */ +/* + * These flags live in the high bits of extent_status.es_pblk + */ +#define EXTENT_STATUS_WRITTEN (1ULL << 63) +#define EXTENT_STATUS_UNWRITTEN (1ULL << 62) +#define EXTENT_STATUS_DELAYED (1ULL << 61) +#define EXTENT_STATUS_HOLE (1ULL << 60) #define EXTENT_STATUS_FLAGS (EXTENT_STATUS_WRITTEN | \ EXTENT_STATUS_UNWRITTEN | \ @@ -58,22 +61,22 @@ extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, static inline int ext4_es_is_written(struct extent_status *es) { - return (es->es_pblk & EXTENT_STATUS_WRITTEN); + return (es->es_pblk & EXTENT_STATUS_WRITTEN) != 0; } static inline int ext4_es_is_unwritten(struct extent_status *es) { - return (es->es_pblk & EXTENT_STATUS_UNWRITTEN); + return (es->es_pblk & EXTENT_STATUS_UNWRITTEN) != 0; } static inline int ext4_es_is_delayed(struct extent_status *es) { - return (es->es_pblk & EXTENT_STATUS_DELAYED); + return (es->es_pblk & EXTENT_STATUS_DELAYED) != 0; } static inline int ext4_es_is_hole(struct extent_status *es) { - return (es->es_pblk & EXTENT_STATUS_HOLE); + return (es->es_pblk & EXTENT_STATUS_HOLE) != 0; } static inline ext4_fsblk_t ext4_es_status(struct extent_status *es) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9c4f4b1..9ea0cde 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2512,12 +2512,8 @@ static int ext4_nonda_switch(struct super_block *sb) /* * Start pushing delalloc when 1/2 of free blocks are dirty. */ - if (dirty_blocks && (free_blocks < 2 * dirty_blocks) && - !writeback_in_progress(sb->s_bdi) && - down_read_trylock(&sb->s_umount)) { - writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); - up_read(&sb->s_umount); - } + if (dirty_blocks && (free_blocks < 2 * dirty_blocks)) + try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); if (2 * free_blocks < 3 * dirty_blocks || free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) { diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 6540ebe..7bb713a 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3419,7 +3419,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) win = offs; ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - - EXT4_B2C(sbi, win); + EXT4_NUM_B2C(sbi, win); BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); } @@ -4565,7 +4565,7 @@ do_more: EXT4_BLOCKS_PER_GROUP(sb); count -= overflow; } - count_clusters = EXT4_B2C(sbi, count); + count_clusters = EXT4_NUM_B2C(sbi, count); bitmap_bh = ext4_read_block_bitmap(sb, block_group); if (!bitmap_bh) { err = -EIO; @@ -4807,11 +4807,11 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_group_desc_csum_set(sb, block_group, desc); ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeclusters_counter, - EXT4_B2C(sbi, blocks_freed)); + EXT4_NUM_B2C(sbi, blocks_freed)); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(EXT4_B2C(sbi, blocks_freed), + atomic_add(EXT4_NUM_B2C(sbi, blocks_freed), &sbi->s_flex_groups[flex_group].free_clusters); } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index c7f4d75..b2c8ee5 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1247,7 +1247,7 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb, ext4_inode_table_set(sb, gdp, group_data->inode_table); ext4_free_group_clusters_set(sb, gdp, - EXT4_B2C(sbi, group_data->free_blocks_count)); + EXT4_NUM_B2C(sbi, group_data->free_blocks_count)); ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); if (ext4_has_group_desc_csum(sb)) ext4_itable_unused_set(sb, gdp, @@ -1349,7 +1349,7 @@ static void ext4_update_super(struct super_block *sb, /* Update the free space counts */ percpu_counter_add(&sbi->s_freeclusters_counter, - EXT4_B2C(sbi, free_blocks)); + EXT4_NUM_B2C(sbi, free_blocks)); percpu_counter_add(&sbi->s_freeinodes_counter, EXT4_INODES_PER_GROUP(sb) * flex_gd->count); @@ -1360,7 +1360,7 @@ static void ext4_update_super(struct super_block *sb, sbi->s_log_groups_per_flex) { ext4_group_t flex_group; flex_group = ext4_flex_group(sbi, group_data[0].group); - atomic_add(EXT4_B2C(sbi, free_blocks), + atomic_add(EXT4_NUM_B2C(sbi, free_blocks), &sbi->s_flex_groups[flex_group].free_clusters); atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count, &sbi->s_flex_groups[flex_group].free_inodes); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 620cf561..5e6c878 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -783,6 +783,7 @@ static void ext4_put_super(struct super_block *sb) percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); + percpu_counter_destroy(&sbi->s_extent_cache_cnt); brelse(sbi->s_sbh); #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) @@ -1247,6 +1248,11 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) "quota options when quota turned on"); return -1; } + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) { + ext4_msg(sb, KERN_ERR, "Cannot set journaled quota options " + "when QUOTA feature is enabled"); + return -1; + } qname = match_strdup(args); if (!qname) { ext4_msg(sb, KERN_ERR, @@ -1544,6 +1550,13 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, "quota options when quota turned on"); return -1; } + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_QUOTA)) { + ext4_msg(sb, KERN_ERR, + "Cannot set journaled quota options " + "when QUOTA feature is enabled"); + return -1; + } sbi->s_jquota_fmt = m->mount_opt; #endif } else { @@ -1592,6 +1605,12 @@ static int parse_options(char *options, struct super_block *sb, return 0; } #ifdef CONFIG_QUOTA + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && + (test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) { + ext4_msg(sb, KERN_ERR, "Cannot set quota options when QUOTA " + "feature is enabled"); + return 0; + } if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) clear_opt(sb, USRQUOTA); @@ -3161,7 +3180,7 @@ int ext4_calculate_overhead(struct super_block *sb) } /* Add the journal blocks as well */ if (sbi->s_journal) - overhead += EXT4_B2C(sbi, sbi->s_journal->j_maxlen); + overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen); sbi->s_overhead = overhead; smp_wmb(); @@ -3688,6 +3707,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (!err) { err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0); } + if (!err) { + err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0); + } if (err) { ext4_msg(sb, KERN_ERR, "insufficient memory"); goto failed_mount3; @@ -3711,13 +3733,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; #ifdef CONFIG_QUOTA - sb->s_qcop = &ext4_qctl_operations; sb->dq_op = &ext4_quota_operations; - - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) { - /* Use qctl operations for hidden quota files. */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) sb->s_qcop = &ext4_qctl_sysfile_operations; - } + else + sb->s_qcop = &ext4_qctl_operations; #endif memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); @@ -3913,6 +3933,16 @@ no_journal: if (err) goto failed_mount7; +#ifdef CONFIG_QUOTA + /* Enable quota usage during mount. */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && + !(sb->s_flags & MS_RDONLY)) { + err = ext4_enable_quotas(sb); + if (err) + goto failed_mount8; + } +#endif /* CONFIG_QUOTA */ + EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; @@ -3930,16 +3960,6 @@ no_journal: } else descr = "out journal"; -#ifdef CONFIG_QUOTA - /* Enable quota usage during mount. */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && - !(sb->s_flags & MS_RDONLY)) { - err = ext4_enable_quotas(sb); - if (err) - goto failed_mount8; - } -#endif /* CONFIG_QUOTA */ - if (test_opt(sb, DISCARD)) { struct request_queue *q = bdev_get_queue(sb->s_bdev); if (!blk_queue_discard(q)) @@ -3993,6 +4013,7 @@ failed_mount3: percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); + percpu_counter_destroy(&sbi->s_extent_cache_cnt); if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); failed_mount2: @@ -4538,6 +4559,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (!old_opts.s_qf_names[i]) { for (j = 0; j < i; j++) kfree(old_opts.s_qf_names[j]); + kfree(orig_data); return -ENOMEM; } } else @@ -4816,9 +4838,12 @@ static int ext4_release_dquot(struct dquot *dquot) static int ext4_mark_dquot_dirty(struct dquot *dquot) { + struct super_block *sb = dquot->dq_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + /* Are we journaling quotas? */ - if (EXT4_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || - EXT4_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) || + sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { dquot_mark_dquot_dirty(dquot); return ext4_write_dquot(dquot); } else { diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 12701a5..e9cc3f0 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -95,6 +95,8 @@ struct msdos_sb_info { spinlock_t dir_hash_lock; struct hlist_head dir_hashtable[FAT_HASH_SIZE]; + + unsigned int dirty; /* fs state before mount */ }; #define FAT_CACHE_VALID 0 /* special case for valid cache */ diff --git a/fs/fat/inode.c b/fs/fat/inode.c index f8f4916..acf6e47 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -341,12 +341,11 @@ struct inode *fat_iget(struct super_block *sb, loff_t i_pos) { struct msdos_sb_info *sbi = MSDOS_SB(sb); struct hlist_head *head = sbi->inode_hashtable + fat_hash(i_pos); - struct hlist_node *_p; struct msdos_inode_info *i; struct inode *inode = NULL; spin_lock(&sbi->inode_hash_lock); - hlist_for_each_entry(i, _p, head, i_fat_hash) { + hlist_for_each_entry(i, head, i_fat_hash) { BUG_ON(i->vfs_inode.i_sb != sb); if (i->i_pos != i_pos) continue; @@ -488,10 +487,59 @@ static void fat_evict_inode(struct inode *inode) fat_detach(inode); } +static void fat_set_state(struct super_block *sb, + unsigned int set, unsigned int force) +{ + struct buffer_head *bh; + struct fat_boot_sector *b; + struct msdos_sb_info *sbi = sb->s_fs_info; + + /* do not change any thing if mounted read only */ + if ((sb->s_flags & MS_RDONLY) && !force) + return; + + /* do not change state if fs was dirty */ + if (sbi->dirty) { + /* warn only on set (mount). */ + if (set) + fat_msg(sb, KERN_WARNING, "Volume was not properly " + "unmounted. Some data may be corrupt. " + "Please run fsck."); + return; + } + + bh = sb_bread(sb, 0); + if (bh == NULL) { + fat_msg(sb, KERN_ERR, "unable to read boot sector " + "to mark fs as dirty"); + return; + } + + b = (struct fat_boot_sector *) bh->b_data; + + if (sbi->fat_bits == 32) { + if (set) + b->fat32.state |= FAT_STATE_DIRTY; + else + b->fat32.state &= ~FAT_STATE_DIRTY; + } else /* fat 16 and 12 */ { + if (set) + b->fat16.state |= FAT_STATE_DIRTY; + else + b->fat16.state &= ~FAT_STATE_DIRTY; + } + + mark_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); +} + static void fat_put_super(struct super_block *sb) { struct msdos_sb_info *sbi = MSDOS_SB(sb); + fat_set_state(sb, 0, 0); + iput(sbi->fsinfo_inode); iput(sbi->fat_inode); @@ -566,8 +614,18 @@ static void __exit fat_destroy_inodecache(void) static int fat_remount(struct super_block *sb, int *flags, char *data) { + int new_rdonly; struct msdos_sb_info *sbi = MSDOS_SB(sb); *flags |= MS_NODIRATIME | (sbi->options.isvfat ? 0 : MS_NOATIME); + + /* make sure we update state on remount. */ + new_rdonly = *flags & MS_RDONLY; + if (new_rdonly != (sb->s_flags & MS_RDONLY)) { + if (new_rdonly) + fat_set_state(sb, 0, 0); + else + fat_set_state(sb, 1, 1); + } return 0; } @@ -1298,17 +1356,17 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, sbi->prev_free = FAT_START_ENT; sb->s_maxbytes = 0xffffffff; - if (!sbi->fat_length && b->fat32_length) { + if (!sbi->fat_length && b->fat32.length) { struct fat_boot_fsinfo *fsinfo; struct buffer_head *fsinfo_bh; /* Must be FAT32 */ sbi->fat_bits = 32; - sbi->fat_length = le32_to_cpu(b->fat32_length); - sbi->root_cluster = le32_to_cpu(b->root_cluster); + sbi->fat_length = le32_to_cpu(b->fat32.length); + sbi->root_cluster = le32_to_cpu(b->fat32.root_cluster); /* MC - if info_sector is 0, don't multiply by 0 */ - sbi->fsinfo_sector = le16_to_cpu(b->info_sector); + sbi->fsinfo_sector = le16_to_cpu(b->fat32.info_sector); if (sbi->fsinfo_sector == 0) sbi->fsinfo_sector = 1; @@ -1362,6 +1420,12 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, if (sbi->fat_bits != 32) sbi->fat_bits = (total_clusters > MAX_FAT12) ? 16 : 12; + /* some OSes set FAT_STATE_DIRTY and clean it on unmount. */ + if (sbi->fat_bits == 32) + sbi->dirty = b->fat32.state & FAT_STATE_DIRTY; + else /* fat 16 or 12 */ + sbi->dirty = b->fat16.state & FAT_STATE_DIRTY; + /* check that FAT table does not overflow */ fat_clusters = sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits; total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT); @@ -1456,6 +1520,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, "the device does not support discard"); } + fat_set_state(sb, 1, 0); return 0; out_invalid: diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c index ef4b5fa..499c104 100644 --- a/fs/fat/nfs.c +++ b/fs/fat/nfs.c @@ -21,13 +21,12 @@ static struct inode *fat_dget(struct super_block *sb, int i_logstart) { struct msdos_sb_info *sbi = MSDOS_SB(sb); struct hlist_head *head; - struct hlist_node *_p; struct msdos_inode_info *i; struct inode *inode = NULL; head = sbi->dir_hashtable + fat_dir_hash(i_logstart); spin_lock(&sbi->dir_hash_lock); - hlist_for_each_entry(i, _p, head, i_dir_hash) { + hlist_for_each_entry(i, head, i_dir_hash) { BUG_ON(i->vfs_inode.i_sb != sb); if (i->i_logstart != i_logstart) continue; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 310972b..21f46fb 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -318,8 +318,14 @@ static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) static int write_inode(struct inode *inode, struct writeback_control *wbc) { - if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) - return inode->i_sb->s_op->write_inode(inode, wbc); + int ret; + + if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) { + trace_writeback_write_inode_start(inode, wbc); + ret = inode->i_sb->s_op->write_inode(inode, wbc); + trace_writeback_write_inode(inode, wbc); + return ret; + } return 0; } @@ -450,6 +456,8 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) WARN_ON(!(inode->i_state & I_SYNC)); + trace_writeback_single_inode_start(inode, wbc, nr_to_write); + ret = do_writepages(mapping, wbc); /* @@ -1150,8 +1158,12 @@ void __mark_inode_dirty(struct inode *inode, int flags) * dirty the inode itself */ if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { + trace_writeback_dirty_inode_start(inode, flags); + if (sb->s_op->dirty_inode) sb->s_op->dirty_inode(inode, flags); + + trace_writeback_dirty_inode(inode, flags); } /* @@ -1332,47 +1344,43 @@ void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) EXPORT_SYMBOL(writeback_inodes_sb); /** - * writeback_inodes_sb_if_idle - start writeback if none underway + * try_to_writeback_inodes_sb_nr - try to start writeback if none underway * @sb: the superblock - * @reason: reason why some writeback work was initiated + * @nr: the number of pages to write + * @reason: the reason of writeback * - * Invoke writeback_inodes_sb if no writeback is currently underway. + * Invoke writeback_inodes_sb_nr if no writeback is currently underway. * Returns 1 if writeback was started, 0 if not. */ -int writeback_inodes_sb_if_idle(struct super_block *sb, enum wb_reason reason) +int try_to_writeback_inodes_sb_nr(struct super_block *sb, + unsigned long nr, + enum wb_reason reason) { - if (!writeback_in_progress(sb->s_bdi)) { - down_read(&sb->s_umount); - writeback_inodes_sb(sb, reason); - up_read(&sb->s_umount); + if (writeback_in_progress(sb->s_bdi)) return 1; - } else + + if (!down_read_trylock(&sb->s_umount)) return 0; + + writeback_inodes_sb_nr(sb, nr, reason); + up_read(&sb->s_umount); + return 1; } -EXPORT_SYMBOL(writeback_inodes_sb_if_idle); +EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr); /** - * writeback_inodes_sb_nr_if_idle - start writeback if none underway + * try_to_writeback_inodes_sb - try to start writeback if none underway * @sb: the superblock - * @nr: the number of pages to write * @reason: reason why some writeback work was initiated * - * Invoke writeback_inodes_sb if no writeback is currently underway. + * Implement by try_to_writeback_inodes_sb_nr() * Returns 1 if writeback was started, 0 if not. */ -int writeback_inodes_sb_nr_if_idle(struct super_block *sb, - unsigned long nr, - enum wb_reason reason) +int try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) { - if (!writeback_in_progress(sb->s_bdi)) { - down_read(&sb->s_umount); - writeback_inodes_sb_nr(sb, nr, reason); - up_read(&sb->s_umount); - return 1; - } else - return 0; + return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason); } -EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle); +EXPORT_SYMBOL(try_to_writeback_inodes_sb); /** * sync_inodes_sb - sync sb inode pages diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 8dcb114..e2cba1f 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -237,13 +237,12 @@ static int fscache_alloc_object(struct fscache_cache *cache, struct fscache_cookie *cookie) { struct fscache_object *object; - struct hlist_node *_n; int ret; _enter("%p,%p{%s}", cache, cookie, cookie->def->name); spin_lock(&cookie->lock); - hlist_for_each_entry(object, _n, &cookie->backing_objects, + hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) { if (object->cache == cache) goto object_already_extant; @@ -311,7 +310,6 @@ static int fscache_attach_object(struct fscache_cookie *cookie, { struct fscache_object *p; struct fscache_cache *cache = object->cache; - struct hlist_node *_n; int ret; _enter("{%s},{OBJ%x}", cookie->def->name, object->debug_id); @@ -321,7 +319,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie, /* there may be multiple initial creations of this object, but we only * want one */ ret = -EEXIST; - hlist_for_each_entry(p, _n, &cookie->backing_objects, cookie_link) { + hlist_for_each_entry(p, &cookie->backing_objects, cookie_link) { if (p->cache == object->cache) { if (p->state >= FSCACHE_OBJECT_DYING) ret = -ENOBUFS; @@ -331,7 +329,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie, /* pin the parent object */ spin_lock_nested(&cookie->parent->lock, 1); - hlist_for_each_entry(p, _n, &cookie->parent->backing_objects, + hlist_for_each_entry(p, &cookie->parent->backing_objects, cookie_link) { if (p->cache == object->cache) { if (p->state >= FSCACHE_OBJECT_DYING) { @@ -435,7 +433,6 @@ EXPORT_SYMBOL(__fscache_wait_on_invalidate); void __fscache_update_cookie(struct fscache_cookie *cookie) { struct fscache_object *object; - struct hlist_node *_p; fscache_stat(&fscache_n_updates); @@ -452,7 +449,7 @@ void __fscache_update_cookie(struct fscache_cookie *cookie) spin_lock(&cookie->lock); /* update the index entry on disk in each cache backing this cookie */ - hlist_for_each_entry(object, _p, + hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) { fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE); } diff --git a/fs/hfsplus/Makefile b/fs/hfsplus/Makefile index 3cc0df7..09d278b 100644 --- a/fs/hfsplus/Makefile +++ b/fs/hfsplus/Makefile @@ -5,5 +5,5 @@ obj-$(CONFIG_HFSPLUS_FS) += hfsplus.o hfsplus-objs := super.o options.o inode.o ioctl.o extents.o catalog.o dir.o btree.o \ - bnode.o brec.o bfind.o tables.o unicode.o wrapper.o bitmap.o part_tbl.o - + bnode.o brec.o bfind.o tables.o unicode.o wrapper.o bitmap.o part_tbl.o \ + attributes.o xattr.o xattr_user.o xattr_security.o xattr_trusted.o diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c new file mode 100644 index 0000000..8d691f1 --- /dev/null +++ b/fs/hfsplus/attributes.c @@ -0,0 +1,399 @@ +/* + * linux/fs/hfsplus/attributes.c + * + * Vyacheslav Dubeyko <slava@dubeyko.com> + * + * Handling of records in attributes tree + */ + +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" + +static struct kmem_cache *hfsplus_attr_tree_cachep; + +int hfsplus_create_attr_tree_cache(void) +{ + if (hfsplus_attr_tree_cachep) + return -EEXIST; + + hfsplus_attr_tree_cachep = + kmem_cache_create("hfsplus_attr_cache", + sizeof(hfsplus_attr_entry), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!hfsplus_attr_tree_cachep) + return -ENOMEM; + + return 0; +} + +void hfsplus_destroy_attr_tree_cache(void) +{ + kmem_cache_destroy(hfsplus_attr_tree_cachep); +} + +int hfsplus_attr_bin_cmp_key(const hfsplus_btree_key *k1, + const hfsplus_btree_key *k2) +{ + __be32 k1_cnid, k2_cnid; + + k1_cnid = k1->attr.cnid; + k2_cnid = k2->attr.cnid; + if (k1_cnid != k2_cnid) + return be32_to_cpu(k1_cnid) < be32_to_cpu(k2_cnid) ? -1 : 1; + + return hfsplus_strcmp( + (const struct hfsplus_unistr *)&k1->attr.key_name, + (const struct hfsplus_unistr *)&k2->attr.key_name); +} + +int hfsplus_attr_build_key(struct super_block *sb, hfsplus_btree_key *key, + u32 cnid, const char *name) +{ + int len; + + memset(key, 0, sizeof(struct hfsplus_attr_key)); + key->attr.cnid = cpu_to_be32(cnid); + if (name) { + len = strlen(name); + if (len > HFSPLUS_ATTR_MAX_STRLEN) { + printk(KERN_ERR "hfs: invalid xattr name's length\n"); + return -EINVAL; + } + hfsplus_asc2uni(sb, + (struct hfsplus_unistr *)&key->attr.key_name, + HFSPLUS_ATTR_MAX_STRLEN, name, len); + len = be16_to_cpu(key->attr.key_name.length); + } else { + key->attr.key_name.length = 0; + len = 0; + } + + /* The length of the key, as stored in key_len field, does not include + * the size of the key_len field itself. + * So, offsetof(hfsplus_attr_key, key_name) is a trick because + * it takes into consideration key_len field (__be16) of + * hfsplus_attr_key structure instead of length field (__be16) of + * hfsplus_attr_unistr structure. + */ + key->key_len = + cpu_to_be16(offsetof(struct hfsplus_attr_key, key_name) + + 2 * len); + + return 0; +} + +void hfsplus_attr_build_key_uni(hfsplus_btree_key *key, + u32 cnid, + struct hfsplus_attr_unistr *name) +{ + int ustrlen; + + memset(key, 0, sizeof(struct hfsplus_attr_key)); + ustrlen = be16_to_cpu(name->length); + key->attr.cnid = cpu_to_be32(cnid); + key->attr.key_name.length = cpu_to_be16(ustrlen); + ustrlen *= 2; + memcpy(key->attr.key_name.unicode, name->unicode, ustrlen); + + /* The length of the key, as stored in key_len field, does not include + * the size of the key_len field itself. + * So, offsetof(hfsplus_attr_key, key_name) is a trick because + * it takes into consideration key_len field (__be16) of + * hfsplus_attr_key structure instead of length field (__be16) of + * hfsplus_attr_unistr structure. + */ + key->key_len = + cpu_to_be16(offsetof(struct hfsplus_attr_key, key_name) + + ustrlen); +} + +hfsplus_attr_entry *hfsplus_alloc_attr_entry(void) +{ + return kmem_cache_alloc(hfsplus_attr_tree_cachep, GFP_KERNEL); +} + +void hfsplus_destroy_attr_entry(hfsplus_attr_entry *entry) +{ + if (entry) + kmem_cache_free(hfsplus_attr_tree_cachep, entry); +} + +#define HFSPLUS_INVALID_ATTR_RECORD -1 + +static int hfsplus_attr_build_record(hfsplus_attr_entry *entry, int record_type, + u32 cnid, const void *value, size_t size) +{ + if (record_type == HFSPLUS_ATTR_FORK_DATA) { + /* + * Mac OS X supports only inline data attributes. + * Do nothing + */ + memset(entry, 0, sizeof(*entry)); + return sizeof(struct hfsplus_attr_fork_data); + } else if (record_type == HFSPLUS_ATTR_EXTENTS) { + /* + * Mac OS X supports only inline data attributes. + * Do nothing. + */ + memset(entry, 0, sizeof(*entry)); + return sizeof(struct hfsplus_attr_extents); + } else if (record_type == HFSPLUS_ATTR_INLINE_DATA) { + u16 len; + + memset(entry, 0, sizeof(struct hfsplus_attr_inline_data)); + entry->inline_data.record_type = cpu_to_be32(record_type); + if (size <= HFSPLUS_MAX_INLINE_DATA_SIZE) + len = size; + else + return HFSPLUS_INVALID_ATTR_RECORD; + entry->inline_data.length = cpu_to_be16(len); + memcpy(entry->inline_data.raw_bytes, value, len); + /* + * Align len on two-byte boundary. + * It needs to add pad byte if we have odd len. + */ + len = round_up(len, 2); + return offsetof(struct hfsplus_attr_inline_data, raw_bytes) + + len; + } else /* invalid input */ + memset(entry, 0, sizeof(*entry)); + + return HFSPLUS_INVALID_ATTR_RECORD; +} + +int hfsplus_find_attr(struct super_block *sb, u32 cnid, + const char *name, struct hfs_find_data *fd) +{ + int err = 0; + + dprint(DBG_ATTR_MOD, "find_attr: %s,%d\n", name ? name : NULL, cnid); + + if (!HFSPLUS_SB(sb)->attr_tree) { + printk(KERN_ERR "hfs: attributes file doesn't exist\n"); + return -EINVAL; + } + + if (name) { + err = hfsplus_attr_build_key(sb, fd->search_key, cnid, name); + if (err) + goto failed_find_attr; + err = hfs_brec_find(fd, hfs_find_rec_by_key); + if (err) + goto failed_find_attr; + } else { + err = hfsplus_attr_build_key(sb, fd->search_key, cnid, NULL); + if (err) + goto failed_find_attr; + err = hfs_brec_find(fd, hfs_find_1st_rec_by_cnid); + if (err) + goto failed_find_attr; + } + +failed_find_attr: + return err; +} + +int hfsplus_attr_exists(struct inode *inode, const char *name) +{ + int err = 0; + struct super_block *sb = inode->i_sb; + struct hfs_find_data fd; + + if (!HFSPLUS_SB(sb)->attr_tree) + return 0; + + err = hfs_find_init(HFSPLUS_SB(sb)->attr_tree, &fd); + if (err) + return 0; + + err = hfsplus_find_attr(sb, inode->i_ino, name, &fd); + if (err) + goto attr_not_found; + + hfs_find_exit(&fd); + return 1; + +attr_not_found: + hfs_find_exit(&fd); + return 0; +} + +int hfsplus_create_attr(struct inode *inode, + const char *name, + const void *value, size_t size) +{ + struct super_block *sb = inode->i_sb; + struct hfs_find_data fd; + hfsplus_attr_entry *entry_ptr; + int entry_size; + int err; + + dprint(DBG_ATTR_MOD, "create_attr: %s,%ld\n", + name ? name : NULL, inode->i_ino); + + if (!HFSPLUS_SB(sb)->attr_tree) { + printk(KERN_ERR "hfs: attributes file doesn't exist\n"); + return -EINVAL; + } + + entry_ptr = hfsplus_alloc_attr_entry(); + if (!entry_ptr) + return -ENOMEM; + + err = hfs_find_init(HFSPLUS_SB(sb)->attr_tree, &fd); + if (err) + goto failed_init_create_attr; + + if (name) { + err = hfsplus_attr_build_key(sb, fd.search_key, + inode->i_ino, name); + if (err) + goto failed_create_attr; + } else { + err = -EINVAL; + goto failed_create_attr; + } + + /* Mac OS X supports only inline data attributes. */ + entry_size = hfsplus_attr_build_record(entry_ptr, + HFSPLUS_ATTR_INLINE_DATA, + inode->i_ino, + value, size); + if (entry_size == HFSPLUS_INVALID_ATTR_RECORD) { + err = -EINVAL; + goto failed_create_attr; + } + + err = hfs_brec_find(&fd, hfs_find_rec_by_key); + if (err != -ENOENT) { + if (!err) + err = -EEXIST; + goto failed_create_attr; + } + + err = hfs_brec_insert(&fd, entry_ptr, entry_size); + if (err) + goto failed_create_attr; + + hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ATTR_DIRTY); + +failed_create_attr: + hfs_find_exit(&fd); + +failed_init_create_attr: + hfsplus_destroy_attr_entry(entry_ptr); + return err; +} + +static int __hfsplus_delete_attr(struct inode *inode, u32 cnid, + struct hfs_find_data *fd) +{ + int err = 0; + __be32 found_cnid, record_type; + + hfs_bnode_read(fd->bnode, &found_cnid, + fd->keyoffset + + offsetof(struct hfsplus_attr_key, cnid), + sizeof(__be32)); + if (cnid != be32_to_cpu(found_cnid)) + return -ENOENT; + + hfs_bnode_read(fd->bnode, &record_type, + fd->entryoffset, sizeof(record_type)); + + switch (be32_to_cpu(record_type)) { + case HFSPLUS_ATTR_INLINE_DATA: + /* All is OK. Do nothing. */ + break; + case HFSPLUS_ATTR_FORK_DATA: + case HFSPLUS_ATTR_EXTENTS: + printk(KERN_ERR "hfs: only inline data xattr are supported\n"); + return -EOPNOTSUPP; + default: + printk(KERN_ERR "hfs: invalid extended attribute record\n"); + return -ENOENT; + } + + err = hfs_brec_remove(fd); + if (err) + return err; + + hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ATTR_DIRTY); + return err; +} + +int hfsplus_delete_attr(struct inode *inode, const char *name) +{ + int err = 0; + struct super_block *sb = inode->i_sb; + struct hfs_find_data fd; + + dprint(DBG_ATTR_MOD, "delete_attr: %s,%ld\n", + name ? name : NULL, inode->i_ino); + + if (!HFSPLUS_SB(sb)->attr_tree) { + printk(KERN_ERR "hfs: attributes file doesn't exist\n"); + return -EINVAL; + } + + err = hfs_find_init(HFSPLUS_SB(sb)->attr_tree, &fd); + if (err) + return err; + + if (name) { + err = hfsplus_attr_build_key(sb, fd.search_key, + inode->i_ino, name); + if (err) + goto out; + } else { + printk(KERN_ERR "hfs: invalid extended attribute name\n"); + err = -EINVAL; + goto out; + } + + err = hfs_brec_find(&fd, hfs_find_rec_by_key); + if (err) + goto out; + + err = __hfsplus_delete_attr(inode, inode->i_ino, &fd); + if (err) + goto out; + +out: + hfs_find_exit(&fd); + return err; +} + +int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid) +{ + int err = 0; + struct hfs_find_data fd; + + dprint(DBG_ATTR_MOD, "delete_all_attrs: %d\n", cnid); + + if (!HFSPLUS_SB(dir->i_sb)->attr_tree) { + printk(KERN_ERR "hfs: attributes file doesn't exist\n"); + return -EINVAL; + } + + err = hfs_find_init(HFSPLUS_SB(dir->i_sb)->attr_tree, &fd); + if (err) + return err; + + for (;;) { + err = hfsplus_find_attr(dir->i_sb, cnid, NULL, &fd); + if (err) { + if (err != -ENOENT) + printk(KERN_ERR "hfs: xattr search failed.\n"); + goto end_delete_all; + } + + err = __hfsplus_delete_attr(dir, cnid, &fd); + if (err) + goto end_delete_all; + } + +end_delete_all: + hfs_find_exit(&fd); + return err; +} diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c index 5d799c1..d73c98d 100644 --- a/fs/hfsplus/bfind.c +++ b/fs/hfsplus/bfind.c @@ -24,7 +24,19 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) fd->key = ptr + tree->max_key_len + 2; dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); - mutex_lock(&tree->tree_lock); + switch (tree->cnid) { + case HFSPLUS_CAT_CNID: + mutex_lock_nested(&tree->tree_lock, CATALOG_BTREE_MUTEX); + break; + case HFSPLUS_EXT_CNID: + mutex_lock_nested(&tree->tree_lock, EXTENTS_BTREE_MUTEX); + break; + case HFSPLUS_ATTR_CNID: + mutex_lock_nested(&tree->tree_lock, ATTR_BTREE_MUTEX); + break; + default: + BUG(); + } return 0; } @@ -38,15 +50,73 @@ void hfs_find_exit(struct hfs_find_data *fd) fd->tree = NULL; } -/* Find the record in bnode that best matches key (not greater than...)*/ -int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd) +int hfs_find_1st_rec_by_cnid(struct hfs_bnode *bnode, + struct hfs_find_data *fd, + int *begin, + int *end, + int *cur_rec) +{ + __be32 cur_cnid, search_cnid; + + if (bnode->tree->cnid == HFSPLUS_EXT_CNID) { + cur_cnid = fd->key->ext.cnid; + search_cnid = fd->search_key->ext.cnid; + } else if (bnode->tree->cnid == HFSPLUS_CAT_CNID) { + cur_cnid = fd->key->cat.parent; + search_cnid = fd->search_key->cat.parent; + } else if (bnode->tree->cnid == HFSPLUS_ATTR_CNID) { + cur_cnid = fd->key->attr.cnid; + search_cnid = fd->search_key->attr.cnid; + } else + BUG(); + + if (cur_cnid == search_cnid) { + (*end) = (*cur_rec); + if ((*begin) == (*end)) + return 1; + } else { + if (be32_to_cpu(cur_cnid) < be32_to_cpu(search_cnid)) + (*begin) = (*cur_rec) + 1; + else + (*end) = (*cur_rec) - 1; + } + + return 0; +} + +int hfs_find_rec_by_key(struct hfs_bnode *bnode, + struct hfs_find_data *fd, + int *begin, + int *end, + int *cur_rec) { int cmpval; + + cmpval = bnode->tree->keycmp(fd->key, fd->search_key); + if (!cmpval) { + (*end) = (*cur_rec); + return 1; + } + if (cmpval < 0) + (*begin) = (*cur_rec) + 1; + else + *(end) = (*cur_rec) - 1; + + return 0; +} + +/* Find the record in bnode that best matches key (not greater than...)*/ +int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd, + search_strategy_t rec_found) +{ u16 off, len, keylen; int rec; int b, e; int res; + if (!rec_found) + BUG(); + b = 0; e = bnode->num_recs - 1; res = -ENOENT; @@ -59,17 +129,12 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd) goto fail; } hfs_bnode_read(bnode, fd->key, off, keylen); - cmpval = bnode->tree->keycmp(fd->key, fd->search_key); - if (!cmpval) { - e = rec; + if (rec_found(bnode, fd, &b, &e, &rec)) { res = 0; goto done; } - if (cmpval < 0) - b = rec + 1; - else - e = rec - 1; } while (b <= e); + if (rec != e && e >= 0) { len = hfs_brec_lenoff(bnode, e, &off); keylen = hfs_brec_keylen(bnode, e); @@ -79,19 +144,21 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd) } hfs_bnode_read(bnode, fd->key, off, keylen); } + done: fd->record = e; fd->keyoffset = off; fd->keylength = keylen; fd->entryoffset = off + keylen; fd->entrylength = len - keylen; + fail: return res; } /* Traverse a B*Tree from the root to a leaf finding best fit to key */ /* Return allocated copy of node found, set recnum to best record */ -int hfs_brec_find(struct hfs_find_data *fd) +int hfs_brec_find(struct hfs_find_data *fd, search_strategy_t do_key_compare) { struct hfs_btree *tree; struct hfs_bnode *bnode; @@ -122,7 +189,7 @@ int hfs_brec_find(struct hfs_find_data *fd) goto invalid; bnode->parent = parent; - res = __hfs_brec_find(bnode, fd); + res = __hfs_brec_find(bnode, fd, do_key_compare); if (!height) break; if (fd->record < 0) @@ -149,7 +216,7 @@ int hfs_brec_read(struct hfs_find_data *fd, void *rec, int rec_len) { int res; - res = hfs_brec_find(fd); + res = hfs_brec_find(fd, hfs_find_rec_by_key); if (res) return res; if (fd->entrylength > rec_len) diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 1c42cc5..f31ac6f 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -62,7 +62,8 @@ void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off) tree = node->tree; if (node->type == HFS_NODE_LEAF || - tree->attributes & HFS_TREE_VARIDXKEYS) + tree->attributes & HFS_TREE_VARIDXKEYS || + node->tree->cnid == HFSPLUS_ATTR_CNID) key_len = hfs_bnode_read_u16(node, off) + 2; else key_len = tree->max_key_len + 2; @@ -314,7 +315,8 @@ void hfs_bnode_dump(struct hfs_bnode *node) if (i && node->type == HFS_NODE_INDEX) { int tmp; - if (node->tree->attributes & HFS_TREE_VARIDXKEYS) + if (node->tree->attributes & HFS_TREE_VARIDXKEYS || + node->tree->cnid == HFSPLUS_ATTR_CNID) tmp = hfs_bnode_read_u16(node, key_off) + 2; else tmp = node->tree->max_key_len + 2; @@ -646,6 +648,8 @@ void hfs_bnode_put(struct hfs_bnode *node) if (test_bit(HFS_BNODE_DELETED, &node->flags)) { hfs_bnode_unhash(node); spin_unlock(&tree->hash_lock); + hfs_bnode_clear(node, 0, + PAGE_CACHE_SIZE * tree->pages_per_bnode); hfs_bmap_free(node); hfs_bnode_free(node); return; diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c index 2a734cf..298d4e4 100644 --- a/fs/hfsplus/brec.c +++ b/fs/hfsplus/brec.c @@ -36,7 +36,8 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec) return 0; if ((node->type == HFS_NODE_INDEX) && - !(node->tree->attributes & HFS_TREE_VARIDXKEYS)) { + !(node->tree->attributes & HFS_TREE_VARIDXKEYS) && + (node->tree->cnid != HFSPLUS_ATTR_CNID)) { retval = node->tree->max_key_len + 2; } else { recoff = hfs_bnode_read_u16(node, @@ -151,12 +152,13 @@ skip: /* get index key */ hfs_bnode_read_key(new_node, fd->search_key, 14); - __hfs_brec_find(fd->bnode, fd); + __hfs_brec_find(fd->bnode, fd, hfs_find_rec_by_key); hfs_bnode_put(new_node); new_node = NULL; - if (tree->attributes & HFS_TREE_VARIDXKEYS) + if ((tree->attributes & HFS_TREE_VARIDXKEYS) || + (tree->cnid == HFSPLUS_ATTR_CNID)) key_len = be16_to_cpu(fd->search_key->key_len) + 2; else { fd->search_key->key_len = @@ -201,7 +203,7 @@ again: hfs_bnode_put(node); node = fd->bnode = parent; - __hfs_brec_find(node, fd); + __hfs_brec_find(node, fd, hfs_find_rec_by_key); goto again; } hfs_bnode_write_u16(node, @@ -367,12 +369,13 @@ again: parent = hfs_bnode_find(tree, node->parent); if (IS_ERR(parent)) return PTR_ERR(parent); - __hfs_brec_find(parent, fd); + __hfs_brec_find(parent, fd, hfs_find_rec_by_key); hfs_bnode_dump(parent); rec = fd->record; /* size difference between old and new key */ - if (tree->attributes & HFS_TREE_VARIDXKEYS) + if ((tree->attributes & HFS_TREE_VARIDXKEYS) || + (tree->cnid == HFSPLUS_ATTR_CNID)) newkeylen = hfs_bnode_read_u16(node, 14) + 2; else fd->keylength = newkeylen = tree->max_key_len + 2; @@ -427,7 +430,7 @@ skip: hfs_bnode_read_key(new_node, fd->search_key, 14); cnid = cpu_to_be32(new_node->this); - __hfs_brec_find(fd->bnode, fd); + __hfs_brec_find(fd->bnode, fd, hfs_find_rec_by_key); hfs_brec_insert(fd, &cnid, sizeof(cnid)); hfs_bnode_put(fd->bnode); hfs_bnode_put(new_node); @@ -495,13 +498,15 @@ static int hfs_btree_inc_height(struct hfs_btree *tree) /* insert old root idx into new root */ node->parent = tree->root; if (node->type == HFS_NODE_LEAF || - tree->attributes & HFS_TREE_VARIDXKEYS) + tree->attributes & HFS_TREE_VARIDXKEYS || + tree->cnid == HFSPLUS_ATTR_CNID) key_size = hfs_bnode_read_u16(node, 14) + 2; else key_size = tree->max_key_len + 2; hfs_bnode_copy(new_node, 14, node, 14, key_size); - if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) { + if (!(tree->attributes & HFS_TREE_VARIDXKEYS) && + (tree->cnid != HFSPLUS_ATTR_CNID)) { key_size = tree->max_key_len + 2; hfs_bnode_write_u16(new_node, 14, tree->max_key_len); } diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index 685d07d..efb689c 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -98,6 +98,14 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) set_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); } break; + case HFSPLUS_ATTR_CNID: + if (tree->max_key_len != HFSPLUS_ATTR_KEYLEN - sizeof(u16)) { + printk(KERN_ERR "hfs: invalid attributes max_key_len %d\n", + tree->max_key_len); + goto fail_page; + } + tree->keycmp = hfsplus_attr_bin_cmp_key; + break; default: printk(KERN_ERR "hfs: unknown B*Tree requested\n"); goto fail_page; diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index 798d9c4..840d71e 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -45,7 +45,8 @@ void hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *key, key->cat.parent = cpu_to_be32(parent); if (str) { - hfsplus_asc2uni(sb, &key->cat.name, str->name, str->len); + hfsplus_asc2uni(sb, &key->cat.name, HFSPLUS_MAX_STRLEN, + str->name, str->len); len = be16_to_cpu(key->cat.name.length); } else { key->cat.name.length = 0; @@ -167,7 +168,8 @@ static int hfsplus_fill_cat_thread(struct super_block *sb, entry->type = cpu_to_be16(type); entry->thread.reserved = 0; entry->thread.parentID = cpu_to_be32(parentid); - hfsplus_asc2uni(sb, &entry->thread.nodeName, str->name, str->len); + hfsplus_asc2uni(sb, &entry->thread.nodeName, HFSPLUS_MAX_STRLEN, + str->name, str->len); return 10 + be16_to_cpu(entry->thread.nodeName.length) * 2; } @@ -198,7 +200,7 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid, hfsplus_cat_build_key_uni(fd->search_key, be32_to_cpu(tmp.thread.parentID), &tmp.thread.nodeName); - return hfs_brec_find(fd); + return hfs_brec_find(fd, hfs_find_rec_by_key); } int hfsplus_create_cat(u32 cnid, struct inode *dir, @@ -221,7 +223,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, S_ISDIR(inode->i_mode) ? HFSPLUS_FOLDER_THREAD : HFSPLUS_FILE_THREAD, dir->i_ino, str); - err = hfs_brec_find(&fd); + err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err != -ENOENT) { if (!err) err = -EEXIST; @@ -233,7 +235,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str); entry_size = hfsplus_cat_build_record(&entry, cnid, inode); - err = hfs_brec_find(&fd); + err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err != -ENOENT) { /* panic? */ if (!err) @@ -253,7 +255,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, err1: hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); - if (!hfs_brec_find(&fd)) + if (!hfs_brec_find(&fd, hfs_find_rec_by_key)) hfs_brec_remove(&fd); err2: hfs_find_exit(&fd); @@ -279,7 +281,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) int len; hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); - err = hfs_brec_find(&fd); + err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err) goto out; @@ -296,7 +298,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) } else hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str); - err = hfs_brec_find(&fd); + err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err) goto out; @@ -326,7 +328,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) goto out; hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); - err = hfs_brec_find(&fd); + err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err) goto out; @@ -337,6 +339,12 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) dir->i_size--; dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY); + + if (type == HFSPLUS_FILE || type == HFSPLUS_FOLDER) { + if (HFSPLUS_SB(sb)->attr_tree) + hfsplus_delete_all_attrs(dir, cnid); + } + out: hfs_find_exit(&fd); @@ -363,7 +371,7 @@ int hfsplus_rename_cat(u32 cnid, /* find the old dir entry and read the data */ hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name); - err = hfs_brec_find(&src_fd); + err = hfs_brec_find(&src_fd, hfs_find_rec_by_key); if (err) goto out; if (src_fd.entrylength > sizeof(entry) || src_fd.entrylength < 0) { @@ -376,7 +384,7 @@ int hfsplus_rename_cat(u32 cnid, /* create new dir entry with the data from the old entry */ hfsplus_cat_build_key(sb, dst_fd.search_key, dst_dir->i_ino, dst_name); - err = hfs_brec_find(&dst_fd); + err = hfs_brec_find(&dst_fd, hfs_find_rec_by_key); if (err != -ENOENT) { if (!err) err = -EEXIST; @@ -391,7 +399,7 @@ int hfsplus_rename_cat(u32 cnid, /* finally remove the old entry */ hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name); - err = hfs_brec_find(&src_fd); + err = hfs_brec_find(&src_fd, hfs_find_rec_by_key); if (err) goto out; err = hfs_brec_remove(&src_fd); @@ -402,7 +410,7 @@ int hfsplus_rename_cat(u32 cnid, /* remove old thread entry */ hfsplus_cat_build_key(sb, src_fd.search_key, cnid, NULL); - err = hfs_brec_find(&src_fd); + err = hfs_brec_find(&src_fd, hfs_find_rec_by_key); if (err) goto out; type = hfs_bnode_read_u16(src_fd.bnode, src_fd.entryoffset); @@ -414,7 +422,7 @@ int hfsplus_rename_cat(u32 cnid, hfsplus_cat_build_key(sb, dst_fd.search_key, cnid, NULL); entry_size = hfsplus_fill_cat_thread(sb, &entry, type, dst_dir->i_ino, dst_name); - err = hfs_brec_find(&dst_fd); + err = hfs_brec_find(&dst_fd, hfs_find_rec_by_key); if (err != -ENOENT) { if (!err) err = -EEXIST; diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 074e045..031c24e 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -15,6 +15,7 @@ #include "hfsplus_fs.h" #include "hfsplus_raw.h" +#include "xattr.h" static inline void hfsplus_instantiate(struct dentry *dentry, struct inode *inode, u32 cnid) @@ -138,7 +139,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) if (err) return err; hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL); - err = hfs_brec_find(&fd); + err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err) goto out; @@ -421,6 +422,15 @@ static int hfsplus_symlink(struct inode *dir, struct dentry *dentry, if (res) goto out_err; + res = hfsplus_init_inode_security(inode, dir, &dentry->d_name); + if (res == -EOPNOTSUPP) + res = 0; /* Operation is not supported. */ + else if (res) { + /* Try to delete anyway without error analysis. */ + hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name); + goto out_err; + } + hfsplus_instantiate(dentry, inode, inode->i_ino); mark_inode_dirty(inode); goto out; @@ -450,15 +460,26 @@ static int hfsplus_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, mode, rdev); res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); - if (res) { - clear_nlink(inode); - hfsplus_delete_inode(inode); - iput(inode); - goto out; + if (res) + goto failed_mknod; + + res = hfsplus_init_inode_security(inode, dir, &dentry->d_name); + if (res == -EOPNOTSUPP) + res = 0; /* Operation is not supported. */ + else if (res) { + /* Try to delete anyway without error analysis. */ + hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name); + goto failed_mknod; } hfsplus_instantiate(dentry, inode, inode->i_ino); mark_inode_dirty(inode); + goto out; + +failed_mknod: + clear_nlink(inode); + hfsplus_delete_inode(inode); + iput(inode); out: mutex_unlock(&sbi->vh_mutex); return res; @@ -499,15 +520,19 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry, } const struct inode_operations hfsplus_dir_inode_operations = { - .lookup = hfsplus_lookup, - .create = hfsplus_create, - .link = hfsplus_link, - .unlink = hfsplus_unlink, - .mkdir = hfsplus_mkdir, - .rmdir = hfsplus_rmdir, - .symlink = hfsplus_symlink, - .mknod = hfsplus_mknod, - .rename = hfsplus_rename, + .lookup = hfsplus_lookup, + .create = hfsplus_create, + .link = hfsplus_link, + .unlink = hfsplus_unlink, + .mkdir = hfsplus_mkdir, + .rmdir = hfsplus_rmdir, + .symlink = hfsplus_symlink, + .mknod = hfsplus_mknod, + .rename = hfsplus_rename, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = hfsplus_listxattr, + .removexattr = hfsplus_removexattr, }; const struct file_operations hfsplus_dir_operations = { diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index eba76ea..a94f0f7 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -95,7 +95,7 @@ static void __hfsplus_ext_write_extent(struct inode *inode, HFSPLUS_IS_RSRC(inode) ? HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA); - res = hfs_brec_find(fd); + res = hfs_brec_find(fd, hfs_find_rec_by_key); if (hip->extent_state & HFSPLUS_EXT_NEW) { if (res != -ENOENT) return; @@ -154,7 +154,7 @@ static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd, hfsplus_ext_build_key(fd->search_key, cnid, block, type); fd->key->ext.cnid = 0; - res = hfs_brec_find(fd); + res = hfs_brec_find(fd, hfs_find_rec_by_key); if (res && res != -ENOENT) return res; if (fd->key->ext.cnid != fd->search_key->ext.cnid || diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index a6da86b..05b11f3 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -23,6 +23,7 @@ #define DBG_SUPER 0x00000010 #define DBG_EXTENT 0x00000020 #define DBG_BITMAP 0x00000040 +#define DBG_ATTR_MOD 0x00000080 #if 0 #define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD) @@ -46,6 +47,13 @@ typedef int (*btree_keycmp)(const hfsplus_btree_key *, #define NODE_HASH_SIZE 256 +/* B-tree mutex nested subclasses */ +enum hfsplus_btree_mutex_classes { + CATALOG_BTREE_MUTEX, + EXTENTS_BTREE_MUTEX, + ATTR_BTREE_MUTEX, +}; + /* An HFS+ BTree held in memory */ struct hfs_btree { struct super_block *sb; @@ -223,6 +231,7 @@ struct hfsplus_inode_info { #define HFSPLUS_I_CAT_DIRTY 1 /* has changes in the catalog tree */ #define HFSPLUS_I_EXT_DIRTY 2 /* has changes in the extent tree */ #define HFSPLUS_I_ALLOC_DIRTY 3 /* has changes in the allocation file */ +#define HFSPLUS_I_ATTR_DIRTY 4 /* has changes in the attributes tree */ #define HFSPLUS_IS_RSRC(inode) \ test_bit(HFSPLUS_I_RSRC, &HFSPLUS_I(inode)->flags) @@ -302,7 +311,7 @@ static inline unsigned short hfsplus_min_io_size(struct super_block *sb) #define hfs_brec_remove hfsplus_brec_remove #define hfs_find_init hfsplus_find_init #define hfs_find_exit hfsplus_find_exit -#define __hfs_brec_find __hplusfs_brec_find +#define __hfs_brec_find __hfsplus_brec_find #define hfs_brec_find hfsplus_brec_find #define hfs_brec_read hfsplus_brec_read #define hfs_brec_goto hfsplus_brec_goto @@ -324,10 +333,33 @@ static inline unsigned short hfsplus_min_io_size(struct super_block *sb) */ #define HFSPLUS_IOC_BLESS _IO('h', 0x80) +typedef int (*search_strategy_t)(struct hfs_bnode *, + struct hfs_find_data *, + int *, int *, int *); + /* * Functions in any *.c used in other files */ +/* attributes.c */ +int hfsplus_create_attr_tree_cache(void); +void hfsplus_destroy_attr_tree_cache(void); +hfsplus_attr_entry *hfsplus_alloc_attr_entry(void); +void hfsplus_destroy_attr_entry(hfsplus_attr_entry *entry_p); +int hfsplus_attr_bin_cmp_key(const hfsplus_btree_key *, + const hfsplus_btree_key *); +int hfsplus_attr_build_key(struct super_block *, hfsplus_btree_key *, + u32, const char *); +void hfsplus_attr_build_key_uni(hfsplus_btree_key *key, + u32 cnid, + struct hfsplus_attr_unistr *name); +int hfsplus_find_attr(struct super_block *, u32, + const char *, struct hfs_find_data *); +int hfsplus_attr_exists(struct inode *inode, const char *name); +int hfsplus_create_attr(struct inode *, const char *, const void *, size_t); +int hfsplus_delete_attr(struct inode *, const char *); +int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid); + /* bitmap.c */ int hfsplus_block_allocate(struct super_block *, u32, u32, u32 *); int hfsplus_block_free(struct super_block *, u32, u32); @@ -369,8 +401,15 @@ int hfs_brec_remove(struct hfs_find_data *); /* bfind.c */ int hfs_find_init(struct hfs_btree *, struct hfs_find_data *); void hfs_find_exit(struct hfs_find_data *); -int __hfs_brec_find(struct hfs_bnode *, struct hfs_find_data *); -int hfs_brec_find(struct hfs_find_data *); +int hfs_find_1st_rec_by_cnid(struct hfs_bnode *, + struct hfs_find_data *, + int *, int *, int *); +int hfs_find_rec_by_key(struct hfs_bnode *, + struct hfs_find_data *, + int *, int *, int *); +int __hfs_brec_find(struct hfs_bnode *, struct hfs_find_data *, + search_strategy_t); +int hfs_brec_find(struct hfs_find_data *, search_strategy_t); int hfs_brec_read(struct hfs_find_data *, void *, int); int hfs_brec_goto(struct hfs_find_data *, int); @@ -417,11 +456,6 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, /* ioctl.c */ long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); -int hfsplus_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags); -ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, - void *value, size_t size); -ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size); /* options.c */ int hfsplus_parse_options(char *, struct hfsplus_sb_info *); @@ -446,7 +480,7 @@ int hfsplus_strcmp(const struct hfsplus_unistr *, int hfsplus_uni2asc(struct super_block *, const struct hfsplus_unistr *, char *, int *); int hfsplus_asc2uni(struct super_block *, - struct hfsplus_unistr *, const char *, int); + struct hfsplus_unistr *, int, const char *, int); int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode, struct qstr *str); int hfsplus_compare_dentry(const struct dentry *parent, diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h index 921967e..452ede0 100644 --- a/fs/hfsplus/hfsplus_raw.h +++ b/fs/hfsplus/hfsplus_raw.h @@ -52,13 +52,23 @@ typedef __be32 hfsplus_cnid; typedef __be16 hfsplus_unichr; +#define HFSPLUS_MAX_STRLEN 255 +#define HFSPLUS_ATTR_MAX_STRLEN 127 + /* A "string" as used in filenames, etc. */ struct hfsplus_unistr { __be16 length; - hfsplus_unichr unicode[255]; + hfsplus_unichr unicode[HFSPLUS_MAX_STRLEN]; } __packed; -#define HFSPLUS_MAX_STRLEN 255 +/* + * A "string" is used in attributes file + * for name of extended attribute + */ +struct hfsplus_attr_unistr { + __be16 length; + hfsplus_unichr unicode[HFSPLUS_ATTR_MAX_STRLEN]; +} __packed; /* POSIX permissions */ struct hfsplus_perm { @@ -291,6 +301,8 @@ struct hfsplus_cat_file { /* File attribute bits */ #define HFSPLUS_FILE_LOCKED 0x0001 #define HFSPLUS_FILE_THREAD_EXISTS 0x0002 +#define HFSPLUS_XATTR_EXISTS 0x0004 +#define HFSPLUS_ACL_EXISTS 0x0008 /* HFS+ catalog thread (part of a cat_entry) */ struct hfsplus_cat_thread { @@ -327,11 +339,63 @@ struct hfsplus_ext_key { #define HFSPLUS_EXT_KEYLEN sizeof(struct hfsplus_ext_key) +#define HFSPLUS_XATTR_FINDER_INFO_NAME "com.apple.FinderInfo" +#define HFSPLUS_XATTR_ACL_NAME "com.apple.system.Security" + +#define HFSPLUS_ATTR_INLINE_DATA 0x10 +#define HFSPLUS_ATTR_FORK_DATA 0x20 +#define HFSPLUS_ATTR_EXTENTS 0x30 + +/* HFS+ attributes tree key */ +struct hfsplus_attr_key { + __be16 key_len; + __be16 pad; + hfsplus_cnid cnid; + __be32 start_block; + struct hfsplus_attr_unistr key_name; +} __packed; + +#define HFSPLUS_ATTR_KEYLEN sizeof(struct hfsplus_attr_key) + +/* HFS+ fork data attribute */ +struct hfsplus_attr_fork_data { + __be32 record_type; + __be32 reserved; + struct hfsplus_fork_raw the_fork; +} __packed; + +/* HFS+ extension attribute */ +struct hfsplus_attr_extents { + __be32 record_type; + __be32 reserved; + struct hfsplus_extent extents; +} __packed; + +#define HFSPLUS_MAX_INLINE_DATA_SIZE 3802 + +/* HFS+ attribute inline data */ +struct hfsplus_attr_inline_data { + __be32 record_type; + __be32 reserved1; + u8 reserved2[6]; + __be16 length; + u8 raw_bytes[HFSPLUS_MAX_INLINE_DATA_SIZE]; +} __packed; + +/* A data record in the attributes tree */ +typedef union { + __be32 record_type; + struct hfsplus_attr_fork_data fork_data; + struct hfsplus_attr_extents extents; + struct hfsplus_attr_inline_data inline_data; +} __packed hfsplus_attr_entry; + /* HFS+ generic BTree key */ typedef union { __be16 key_len; struct hfsplus_cat_key cat; struct hfsplus_ext_key ext; + struct hfsplus_attr_key attr; } __packed hfsplus_btree_key; #endif diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index dcd05be..160ccc9 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -17,6 +17,7 @@ #include "hfsplus_fs.h" #include "hfsplus_raw.h" +#include "xattr.h" static int hfsplus_readpage(struct file *file, struct page *page) { @@ -348,6 +349,18 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, error = error2; } + if (test_and_clear_bit(HFSPLUS_I_ATTR_DIRTY, &hip->flags)) { + if (sbi->attr_tree) { + error2 = + filemap_write_and_wait( + sbi->attr_tree->inode->i_mapping); + if (!error) + error = error2; + } else { + printk(KERN_ERR "hfs: sync non-existent attributes tree\n"); + } + } + if (test_and_clear_bit(HFSPLUS_I_ALLOC_DIRTY, &hip->flags)) { error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping); if (!error) @@ -365,9 +378,10 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, static const struct inode_operations hfsplus_file_inode_operations = { .lookup = hfsplus_file_lookup, .setattr = hfsplus_setattr, - .setxattr = hfsplus_setxattr, - .getxattr = hfsplus_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = hfsplus_listxattr, + .removexattr = hfsplus_removexattr, }; static const struct file_operations hfsplus_file_operations = { diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index e3c4c42..d3ff5cc 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -16,7 +16,6 @@ #include <linux/fs.h> #include <linux/mount.h> #include <linux/sched.h> -#include <linux/xattr.h> #include <asm/uaccess.h> #include "hfsplus_fs.h" @@ -151,110 +150,3 @@ long hfsplus_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return -ENOTTY; } } - -int hfsplus_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) -{ - struct inode *inode = dentry->d_inode; - struct hfs_find_data fd; - hfsplus_cat_entry entry; - struct hfsplus_cat_file *file; - int res; - - if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode)) - return -EOPNOTSUPP; - - res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); - if (res) - return res; - res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); - if (res) - goto out; - hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, - sizeof(struct hfsplus_cat_file)); - file = &entry.file; - - if (!strcmp(name, "hfs.type")) { - if (size == 4) - memcpy(&file->user_info.fdType, value, 4); - else - res = -ERANGE; - } else if (!strcmp(name, "hfs.creator")) { - if (size == 4) - memcpy(&file->user_info.fdCreator, value, 4); - else - res = -ERANGE; - } else - res = -EOPNOTSUPP; - if (!res) { - hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, - sizeof(struct hfsplus_cat_file)); - hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); - } -out: - hfs_find_exit(&fd); - return res; -} - -ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, - void *value, size_t size) -{ - struct inode *inode = dentry->d_inode; - struct hfs_find_data fd; - hfsplus_cat_entry entry; - struct hfsplus_cat_file *file; - ssize_t res = 0; - - if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode)) - return -EOPNOTSUPP; - - if (size) { - res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); - if (res) - return res; - res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); - if (res) - goto out; - hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, - sizeof(struct hfsplus_cat_file)); - } - file = &entry.file; - - if (!strcmp(name, "hfs.type")) { - if (size >= 4) { - memcpy(value, &file->user_info.fdType, 4); - res = 4; - } else - res = size ? -ERANGE : 4; - } else if (!strcmp(name, "hfs.creator")) { - if (size >= 4) { - memcpy(value, &file->user_info.fdCreator, 4); - res = 4; - } else - res = size ? -ERANGE : 4; - } else - res = -EOPNOTSUPP; -out: - if (size) - hfs_find_exit(&fd); - return res; -} - -#define HFSPLUS_ATTRLIST_SIZE (sizeof("hfs.creator")+sizeof("hfs.type")) - -ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) -{ - struct inode *inode = dentry->d_inode; - - if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode)) - return -EOPNOTSUPP; - - if (!buffer || !size) - return HFSPLUS_ATTRLIST_SIZE; - if (size < HFSPLUS_ATTRLIST_SIZE) - return -ERANGE; - strcpy(buffer, "hfs.type"); - strcpy(buffer + sizeof("hfs.type"), "hfs.creator"); - - return HFSPLUS_ATTRLIST_SIZE; -} diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 796198d..974c26f 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -20,6 +20,7 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb); static void hfsplus_destroy_inode(struct inode *inode); #include "hfsplus_fs.h" +#include "xattr.h" static int hfsplus_system_read_inode(struct inode *inode) { @@ -118,6 +119,7 @@ static int hfsplus_system_write_inode(struct inode *inode) case HFSPLUS_ATTR_CNID: fork = &vhdr->attr_file; tree = sbi->attr_tree; + break; default: return -EIO; } @@ -191,6 +193,12 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) error2 = filemap_write_and_wait(sbi->ext_tree->inode->i_mapping); if (!error) error = error2; + if (sbi->attr_tree) { + error2 = + filemap_write_and_wait(sbi->attr_tree->inode->i_mapping); + if (!error) + error = error2; + } error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping); if (!error) error = error2; @@ -281,6 +289,7 @@ static void hfsplus_put_super(struct super_block *sb) hfsplus_sync_fs(sb, 1); } + hfs_btree_close(sbi->attr_tree); hfs_btree_close(sbi->cat_tree); hfs_btree_close(sbi->ext_tree); iput(sbi->alloc_file); @@ -477,12 +486,20 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) printk(KERN_ERR "hfs: failed to load catalog file\n"); goto out_close_ext_tree; } + if (vhdr->attr_file.total_blocks != 0) { + sbi->attr_tree = hfs_btree_open(sb, HFSPLUS_ATTR_CNID); + if (!sbi->attr_tree) { + printk(KERN_ERR "hfs: failed to load attributes file\n"); + goto out_close_cat_tree; + } + } + sb->s_xattr = hfsplus_xattr_handlers; inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID); if (IS_ERR(inode)) { printk(KERN_ERR "hfs: failed to load allocation file\n"); err = PTR_ERR(inode); - goto out_close_cat_tree; + goto out_close_attr_tree; } sbi->alloc_file = inode; @@ -542,10 +559,27 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) } err = hfsplus_create_cat(sbi->hidden_dir->i_ino, root, &str, sbi->hidden_dir); - mutex_unlock(&sbi->vh_mutex); - if (err) + if (err) { + mutex_unlock(&sbi->vh_mutex); + goto out_put_hidden_dir; + } + + err = hfsplus_init_inode_security(sbi->hidden_dir, + root, &str); + if (err == -EOPNOTSUPP) + err = 0; /* Operation is not supported. */ + else if (err) { + /* + * Try to delete anyway without + * error analysis. + */ + hfsplus_delete_cat(sbi->hidden_dir->i_ino, + root, &str); + mutex_unlock(&sbi->vh_mutex); goto out_put_hidden_dir; + } + mutex_unlock(&sbi->vh_mutex); hfsplus_mark_inode_dirty(sbi->hidden_dir, HFSPLUS_I_CAT_DIRTY); } @@ -562,6 +596,8 @@ out_put_root: sb->s_root = NULL; out_put_alloc_file: iput(sbi->alloc_file); +out_close_attr_tree: + hfs_btree_close(sbi->attr_tree); out_close_cat_tree: hfs_btree_close(sbi->cat_tree); out_close_ext_tree: @@ -635,9 +671,20 @@ static int __init init_hfsplus_fs(void) hfsplus_init_once); if (!hfsplus_inode_cachep) return -ENOMEM; + err = hfsplus_create_attr_tree_cache(); + if (err) + goto destroy_inode_cache; err = register_filesystem(&hfsplus_fs_type); if (err) - kmem_cache_destroy(hfsplus_inode_cachep); + goto destroy_attr_tree_cache; + return 0; + +destroy_attr_tree_cache: + hfsplus_destroy_attr_tree_cache(); + +destroy_inode_cache: + kmem_cache_destroy(hfsplus_inode_cachep); + return err; } @@ -650,6 +697,7 @@ static void __exit exit_hfsplus_fs(void) * destroy cache. */ rcu_barrier(); + hfsplus_destroy_attr_tree_cache(); kmem_cache_destroy(hfsplus_inode_cachep); } diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c index a32998f2..2c2e47d 100644 --- a/fs/hfsplus/unicode.c +++ b/fs/hfsplus/unicode.c @@ -295,7 +295,8 @@ static inline u16 *decompose_unichar(wchar_t uc, int *size) return hfsplus_decompose_table + (off / 4); } -int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, +int hfsplus_asc2uni(struct super_block *sb, + struct hfsplus_unistr *ustr, int max_unistr_len, const char *astr, int len) { int size, dsize, decompose; @@ -303,7 +304,7 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, wchar_t c; decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); - while (outlen < HFSPLUS_MAX_STRLEN && len > 0) { + while (outlen < max_unistr_len && len > 0) { size = asc2unichar(sb, astr, len, &c); if (decompose) @@ -311,7 +312,7 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, else dstr = NULL; if (dstr) { - if (outlen + dsize > HFSPLUS_MAX_STRLEN) + if (outlen + dsize > max_unistr_len) break; do { ustr->unicode[outlen++] = cpu_to_be16(*dstr++); diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c new file mode 100644 index 0000000..e8a4b08 --- /dev/null +++ b/fs/hfsplus/xattr.c @@ -0,0 +1,709 @@ +/* + * linux/fs/hfsplus/xattr.c + * + * Vyacheslav Dubeyko <slava@dubeyko.com> + * + * Logic of processing extended attributes + */ + +#include "hfsplus_fs.h" +#include "xattr.h" + +const struct xattr_handler *hfsplus_xattr_handlers[] = { + &hfsplus_xattr_osx_handler, + &hfsplus_xattr_user_handler, + &hfsplus_xattr_trusted_handler, + &hfsplus_xattr_security_handler, + NULL +}; + +static int strcmp_xattr_finder_info(const char *name) +{ + if (name) { + return strncmp(name, HFSPLUS_XATTR_FINDER_INFO_NAME, + sizeof(HFSPLUS_XATTR_FINDER_INFO_NAME)); + } + return -1; +} + +static int strcmp_xattr_acl(const char *name) +{ + if (name) { + return strncmp(name, HFSPLUS_XATTR_ACL_NAME, + sizeof(HFSPLUS_XATTR_ACL_NAME)); + } + return -1; +} + +static inline int is_known_namespace(const char *name) +{ + if (strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) && + strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && + strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && + strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) + return false; + + return true; +} + +static int can_set_xattr(struct inode *inode, const char *name, + const void *value, size_t value_len) +{ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return -EOPNOTSUPP; /* TODO: implement ACL support */ + + if (!strncmp(name, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN)) { + /* + * This makes sure that we aren't trying to set an + * attribute in a different namespace by prefixing it + * with "osx." + */ + if (is_known_namespace(name + XATTR_MAC_OSX_PREFIX_LEN)) + return -EOPNOTSUPP; + + return 0; + } + + /* + * Don't allow setting an attribute in an unknown namespace. + */ + if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) && + strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && + strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) + return -EOPNOTSUPP; + + return 0; +} + +int __hfsplus_setxattr(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + int err = 0; + struct hfs_find_data cat_fd; + hfsplus_cat_entry entry; + u16 cat_entry_flags, cat_entry_type; + u16 folder_finderinfo_len = sizeof(struct DInfo) + + sizeof(struct DXInfo); + u16 file_finderinfo_len = sizeof(struct FInfo) + + sizeof(struct FXInfo); + + if ((!S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode)) || + HFSPLUS_IS_RSRC(inode)) + return -EOPNOTSUPP; + + err = can_set_xattr(inode, name, value, size); + if (err) + return err; + + if (strncmp(name, XATTR_MAC_OSX_PREFIX, + XATTR_MAC_OSX_PREFIX_LEN) == 0) + name += XATTR_MAC_OSX_PREFIX_LEN; + + if (value == NULL) { + value = ""; + size = 0; + } + + err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &cat_fd); + if (err) { + printk(KERN_ERR "hfs: can't init xattr find struct\n"); + return err; + } + + err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &cat_fd); + if (err) { + printk(KERN_ERR "hfs: catalog searching failed\n"); + goto end_setxattr; + } + + if (!strcmp_xattr_finder_info(name)) { + if (flags & XATTR_CREATE) { + printk(KERN_ERR "hfs: xattr exists yet\n"); + err = -EOPNOTSUPP; + goto end_setxattr; + } + hfs_bnode_read(cat_fd.bnode, &entry, cat_fd.entryoffset, + sizeof(hfsplus_cat_entry)); + if (be16_to_cpu(entry.type) == HFSPLUS_FOLDER) { + if (size == folder_finderinfo_len) { + memcpy(&entry.folder.user_info, value, + folder_finderinfo_len); + hfs_bnode_write(cat_fd.bnode, &entry, + cat_fd.entryoffset, + sizeof(struct hfsplus_cat_folder)); + hfsplus_mark_inode_dirty(inode, + HFSPLUS_I_CAT_DIRTY); + } else { + err = -ERANGE; + goto end_setxattr; + } + } else if (be16_to_cpu(entry.type) == HFSPLUS_FILE) { + if (size == file_finderinfo_len) { + memcpy(&entry.file.user_info, value, + file_finderinfo_len); + hfs_bnode_write(cat_fd.bnode, &entry, + cat_fd.entryoffset, + sizeof(struct hfsplus_cat_file)); + hfsplus_mark_inode_dirty(inode, + HFSPLUS_I_CAT_DIRTY); + } else { + err = -ERANGE; + goto end_setxattr; + } + } else { + err = -EOPNOTSUPP; + goto end_setxattr; + } + goto end_setxattr; + } + + if (!HFSPLUS_SB(inode->i_sb)->attr_tree) { + err = -EOPNOTSUPP; + goto end_setxattr; + } + + if (hfsplus_attr_exists(inode, name)) { + if (flags & XATTR_CREATE) { + printk(KERN_ERR "hfs: xattr exists yet\n"); + err = -EOPNOTSUPP; + goto end_setxattr; + } + err = hfsplus_delete_attr(inode, name); + if (err) + goto end_setxattr; + err = hfsplus_create_attr(inode, name, value, size); + if (err) + goto end_setxattr; + } else { + if (flags & XATTR_REPLACE) { + printk(KERN_ERR "hfs: cannot replace xattr\n"); + err = -EOPNOTSUPP; + goto end_setxattr; + } + err = hfsplus_create_attr(inode, name, value, size); + if (err) + goto end_setxattr; + } + + cat_entry_type = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset); + if (cat_entry_type == HFSPLUS_FOLDER) { + cat_entry_flags = hfs_bnode_read_u16(cat_fd.bnode, + cat_fd.entryoffset + + offsetof(struct hfsplus_cat_folder, flags)); + cat_entry_flags |= HFSPLUS_XATTR_EXISTS; + if (!strcmp_xattr_acl(name)) + cat_entry_flags |= HFSPLUS_ACL_EXISTS; + hfs_bnode_write_u16(cat_fd.bnode, cat_fd.entryoffset + + offsetof(struct hfsplus_cat_folder, flags), + cat_entry_flags); + hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); + } else if (cat_entry_type == HFSPLUS_FILE) { + cat_entry_flags = hfs_bnode_read_u16(cat_fd.bnode, + cat_fd.entryoffset + + offsetof(struct hfsplus_cat_file, flags)); + cat_entry_flags |= HFSPLUS_XATTR_EXISTS; + if (!strcmp_xattr_acl(name)) + cat_entry_flags |= HFSPLUS_ACL_EXISTS; + hfs_bnode_write_u16(cat_fd.bnode, cat_fd.entryoffset + + offsetof(struct hfsplus_cat_file, flags), + cat_entry_flags); + hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); + } else { + printk(KERN_ERR "hfs: invalid catalog entry type\n"); + err = -EIO; + goto end_setxattr; + } + +end_setxattr: + hfs_find_exit(&cat_fd); + return err; +} + +static inline int is_osx_xattr(const char *xattr_name) +{ + return !is_known_namespace(xattr_name); +} + +static int name_len(const char *xattr_name, int xattr_name_len) +{ + int len = xattr_name_len + 1; + + if (is_osx_xattr(xattr_name)) + len += XATTR_MAC_OSX_PREFIX_LEN; + + return len; +} + +static int copy_name(char *buffer, const char *xattr_name, int name_len) +{ + int len = name_len; + int offset = 0; + + if (is_osx_xattr(xattr_name)) { + strncpy(buffer, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN); + offset += XATTR_MAC_OSX_PREFIX_LEN; + len += XATTR_MAC_OSX_PREFIX_LEN; + } + + strncpy(buffer + offset, xattr_name, name_len); + memset(buffer + offset + name_len, 0, 1); + len += 1; + + return len; +} + +static ssize_t hfsplus_getxattr_finder_info(struct dentry *dentry, + void *value, size_t size) +{ + ssize_t res = 0; + struct inode *inode = dentry->d_inode; + struct hfs_find_data fd; + u16 entry_type; + u16 folder_rec_len = sizeof(struct DInfo) + sizeof(struct DXInfo); + u16 file_rec_len = sizeof(struct FInfo) + sizeof(struct FXInfo); + u16 record_len = max(folder_rec_len, file_rec_len); + u8 folder_finder_info[sizeof(struct DInfo) + sizeof(struct DXInfo)]; + u8 file_finder_info[sizeof(struct FInfo) + sizeof(struct FXInfo)]; + + if (size >= record_len) { + res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); + if (res) { + printk(KERN_ERR "hfs: can't init xattr find struct\n"); + return res; + } + res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); + if (res) + goto end_getxattr_finder_info; + entry_type = hfs_bnode_read_u16(fd.bnode, fd.entryoffset); + + if (entry_type == HFSPLUS_FOLDER) { + hfs_bnode_read(fd.bnode, folder_finder_info, + fd.entryoffset + + offsetof(struct hfsplus_cat_folder, user_info), + folder_rec_len); + memcpy(value, folder_finder_info, folder_rec_len); + res = folder_rec_len; + } else if (entry_type == HFSPLUS_FILE) { + hfs_bnode_read(fd.bnode, file_finder_info, + fd.entryoffset + + offsetof(struct hfsplus_cat_file, user_info), + file_rec_len); + memcpy(value, file_finder_info, file_rec_len); + res = file_rec_len; + } else { + res = -EOPNOTSUPP; + goto end_getxattr_finder_info; + } + } else + res = size ? -ERANGE : record_len; + +end_getxattr_finder_info: + if (size >= record_len) + hfs_find_exit(&fd); + return res; +} + +ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size) +{ + struct inode *inode = dentry->d_inode; + struct hfs_find_data fd; + hfsplus_attr_entry *entry; + __be32 xattr_record_type; + u32 record_type; + u16 record_length = 0; + ssize_t res = 0; + + if ((!S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode)) || + HFSPLUS_IS_RSRC(inode)) + return -EOPNOTSUPP; + + if (strncmp(name, XATTR_MAC_OSX_PREFIX, + XATTR_MAC_OSX_PREFIX_LEN) == 0) { + /* skip "osx." prefix */ + name += XATTR_MAC_OSX_PREFIX_LEN; + /* + * Don't allow retrieving properly prefixed attributes + * by prepending them with "osx." + */ + if (is_known_namespace(name)) + return -EOPNOTSUPP; + } + + if (!strcmp_xattr_finder_info(name)) + return hfsplus_getxattr_finder_info(dentry, value, size); + + if (!HFSPLUS_SB(inode->i_sb)->attr_tree) + return -EOPNOTSUPP; + + entry = hfsplus_alloc_attr_entry(); + if (!entry) { + printk(KERN_ERR "hfs: can't allocate xattr entry\n"); + return -ENOMEM; + } + + res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->attr_tree, &fd); + if (res) { + printk(KERN_ERR "hfs: can't init xattr find struct\n"); + goto failed_getxattr_init; + } + + res = hfsplus_find_attr(inode->i_sb, inode->i_ino, name, &fd); + if (res) { + if (res == -ENOENT) + res = -ENODATA; + else + printk(KERN_ERR "hfs: xattr searching failed\n"); + goto out; + } + + hfs_bnode_read(fd.bnode, &xattr_record_type, + fd.entryoffset, sizeof(xattr_record_type)); + record_type = be32_to_cpu(xattr_record_type); + if (record_type == HFSPLUS_ATTR_INLINE_DATA) { + record_length = hfs_bnode_read_u16(fd.bnode, + fd.entryoffset + + offsetof(struct hfsplus_attr_inline_data, + length)); + if (record_length > HFSPLUS_MAX_INLINE_DATA_SIZE) { + printk(KERN_ERR "hfs: invalid xattr record size\n"); + res = -EIO; + goto out; + } + } else if (record_type == HFSPLUS_ATTR_FORK_DATA || + record_type == HFSPLUS_ATTR_EXTENTS) { + printk(KERN_ERR "hfs: only inline data xattr are supported\n"); + res = -EOPNOTSUPP; + goto out; + } else { + printk(KERN_ERR "hfs: invalid xattr record\n"); + res = -EIO; + goto out; + } + + if (size) { + hfs_bnode_read(fd.bnode, entry, fd.entryoffset, + offsetof(struct hfsplus_attr_inline_data, + raw_bytes) + record_length); + } + + if (size >= record_length) { + memcpy(value, entry->inline_data.raw_bytes, record_length); + res = record_length; + } else + res = size ? -ERANGE : record_length; + +out: + hfs_find_exit(&fd); + +failed_getxattr_init: + hfsplus_destroy_attr_entry(entry); + return res; +} + +static inline int can_list(const char *xattr_name) +{ + if (!xattr_name) + return 0; + + return strncmp(xattr_name, XATTR_TRUSTED_PREFIX, + XATTR_TRUSTED_PREFIX_LEN) || + capable(CAP_SYS_ADMIN); +} + +static ssize_t hfsplus_listxattr_finder_info(struct dentry *dentry, + char *buffer, size_t size) +{ + ssize_t res = 0; + struct inode *inode = dentry->d_inode; + struct hfs_find_data fd; + u16 entry_type; + u8 folder_finder_info[sizeof(struct DInfo) + sizeof(struct DXInfo)]; + u8 file_finder_info[sizeof(struct FInfo) + sizeof(struct FXInfo)]; + unsigned long len, found_bit; + int xattr_name_len, symbols_count; + + res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); + if (res) { + printk(KERN_ERR "hfs: can't init xattr find struct\n"); + return res; + } + + res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); + if (res) + goto end_listxattr_finder_info; + + entry_type = hfs_bnode_read_u16(fd.bnode, fd.entryoffset); + if (entry_type == HFSPLUS_FOLDER) { + len = sizeof(struct DInfo) + sizeof(struct DXInfo); + hfs_bnode_read(fd.bnode, folder_finder_info, + fd.entryoffset + + offsetof(struct hfsplus_cat_folder, user_info), + len); + found_bit = find_first_bit((void *)folder_finder_info, len*8); + } else if (entry_type == HFSPLUS_FILE) { + len = sizeof(struct FInfo) + sizeof(struct FXInfo); + hfs_bnode_read(fd.bnode, file_finder_info, + fd.entryoffset + + offsetof(struct hfsplus_cat_file, user_info), + len); + found_bit = find_first_bit((void *)file_finder_info, len*8); + } else { + res = -EOPNOTSUPP; + goto end_listxattr_finder_info; + } + + if (found_bit >= (len*8)) + res = 0; + else { + symbols_count = sizeof(HFSPLUS_XATTR_FINDER_INFO_NAME) - 1; + xattr_name_len = + name_len(HFSPLUS_XATTR_FINDER_INFO_NAME, symbols_count); + if (!buffer || !size) { + if (can_list(HFSPLUS_XATTR_FINDER_INFO_NAME)) + res = xattr_name_len; + } else if (can_list(HFSPLUS_XATTR_FINDER_INFO_NAME)) { + if (size < xattr_name_len) + res = -ERANGE; + else { + res = copy_name(buffer, + HFSPLUS_XATTR_FINDER_INFO_NAME, + symbols_count); + } + } + } + +end_listxattr_finder_info: + hfs_find_exit(&fd); + + return res; +} + +ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + ssize_t err; + ssize_t res = 0; + struct inode *inode = dentry->d_inode; + struct hfs_find_data fd; + u16 key_len = 0; + struct hfsplus_attr_key attr_key; + char strbuf[HFSPLUS_ATTR_MAX_STRLEN + + XATTR_MAC_OSX_PREFIX_LEN + 1] = {0}; + int xattr_name_len; + + if ((!S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode)) || + HFSPLUS_IS_RSRC(inode)) + return -EOPNOTSUPP; + + res = hfsplus_listxattr_finder_info(dentry, buffer, size); + if (res < 0) + return res; + else if (!HFSPLUS_SB(inode->i_sb)->attr_tree) + return (res == 0) ? -EOPNOTSUPP : res; + + err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->attr_tree, &fd); + if (err) { + printk(KERN_ERR "hfs: can't init xattr find struct\n"); + return err; + } + + err = hfsplus_find_attr(inode->i_sb, inode->i_ino, NULL, &fd); + if (err) { + if (err == -ENOENT) { + if (res == 0) + res = -ENODATA; + goto end_listxattr; + } else { + res = err; + goto end_listxattr; + } + } + + for (;;) { + key_len = hfs_bnode_read_u16(fd.bnode, fd.keyoffset); + if (key_len == 0 || key_len > fd.tree->max_key_len) { + printk(KERN_ERR "hfs: invalid xattr key length: %d\n", + key_len); + res = -EIO; + goto end_listxattr; + } + + hfs_bnode_read(fd.bnode, &attr_key, + fd.keyoffset, key_len + sizeof(key_len)); + + if (be32_to_cpu(attr_key.cnid) != inode->i_ino) + goto end_listxattr; + + xattr_name_len = HFSPLUS_ATTR_MAX_STRLEN; + if (hfsplus_uni2asc(inode->i_sb, + (const struct hfsplus_unistr *)&fd.key->attr.key_name, + strbuf, &xattr_name_len)) { + printk(KERN_ERR "hfs: unicode conversion failed\n"); + res = -EIO; + goto end_listxattr; + } + + if (!buffer || !size) { + if (can_list(strbuf)) + res += name_len(strbuf, xattr_name_len); + } else if (can_list(strbuf)) { + if (size < (res + name_len(strbuf, xattr_name_len))) { + res = -ERANGE; + goto end_listxattr; + } else + res += copy_name(buffer + res, + strbuf, xattr_name_len); + } + + if (hfs_brec_goto(&fd, 1)) + goto end_listxattr; + } + +end_listxattr: + hfs_find_exit(&fd); + return res; +} + +int hfsplus_removexattr(struct dentry *dentry, const char *name) +{ + int err = 0; + struct inode *inode = dentry->d_inode; + struct hfs_find_data cat_fd; + u16 flags; + u16 cat_entry_type; + int is_xattr_acl_deleted = 0; + int is_all_xattrs_deleted = 0; + + if ((!S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode)) || + HFSPLUS_IS_RSRC(inode)) + return -EOPNOTSUPP; + + if (!HFSPLUS_SB(inode->i_sb)->attr_tree) + return -EOPNOTSUPP; + + err = can_set_xattr(inode, name, NULL, 0); + if (err) + return err; + + if (strncmp(name, XATTR_MAC_OSX_PREFIX, + XATTR_MAC_OSX_PREFIX_LEN) == 0) + name += XATTR_MAC_OSX_PREFIX_LEN; + + if (!strcmp_xattr_finder_info(name)) + return -EOPNOTSUPP; + + err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &cat_fd); + if (err) { + printk(KERN_ERR "hfs: can't init xattr find struct\n"); + return err; + } + + err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &cat_fd); + if (err) { + printk(KERN_ERR "hfs: catalog searching failed\n"); + goto end_removexattr; + } + + err = hfsplus_delete_attr(inode, name); + if (err) + goto end_removexattr; + + is_xattr_acl_deleted = !strcmp_xattr_acl(name); + is_all_xattrs_deleted = !hfsplus_attr_exists(inode, NULL); + + if (!is_xattr_acl_deleted && !is_all_xattrs_deleted) + goto end_removexattr; + + cat_entry_type = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset); + + if (cat_entry_type == HFSPLUS_FOLDER) { + flags = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset + + offsetof(struct hfsplus_cat_folder, flags)); + if (is_xattr_acl_deleted) + flags &= ~HFSPLUS_ACL_EXISTS; + if (is_all_xattrs_deleted) + flags &= ~HFSPLUS_XATTR_EXISTS; + hfs_bnode_write_u16(cat_fd.bnode, cat_fd.entryoffset + + offsetof(struct hfsplus_cat_folder, flags), + flags); + hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); + } else if (cat_entry_type == HFSPLUS_FILE) { + flags = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset + + offsetof(struct hfsplus_cat_file, flags)); + if (is_xattr_acl_deleted) + flags &= ~HFSPLUS_ACL_EXISTS; + if (is_all_xattrs_deleted) + flags &= ~HFSPLUS_XATTR_EXISTS; + hfs_bnode_write_u16(cat_fd.bnode, cat_fd.entryoffset + + offsetof(struct hfsplus_cat_file, flags), + flags); + hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); + } else { + printk(KERN_ERR "hfs: invalid catalog entry type\n"); + err = -EIO; + goto end_removexattr; + } + +end_removexattr: + hfs_find_exit(&cat_fd); + return err; +} + +static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + + XATTR_MAC_OSX_PREFIX_LEN + 1] = {0}; + size_t len = strlen(name); + + if (!strcmp(name, "")) + return -EINVAL; + + if (len > HFSPLUS_ATTR_MAX_STRLEN) + return -EOPNOTSUPP; + + strcpy(xattr_name, XATTR_MAC_OSX_PREFIX); + strcpy(xattr_name + XATTR_MAC_OSX_PREFIX_LEN, name); + + return hfsplus_getxattr(dentry, xattr_name, buffer, size); +} + +static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags, int type) +{ + char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + + XATTR_MAC_OSX_PREFIX_LEN + 1] = {0}; + size_t len = strlen(name); + + if (!strcmp(name, "")) + return -EINVAL; + + if (len > HFSPLUS_ATTR_MAX_STRLEN) + return -EOPNOTSUPP; + + strcpy(xattr_name, XATTR_MAC_OSX_PREFIX); + strcpy(xattr_name + XATTR_MAC_OSX_PREFIX_LEN, name); + + return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); +} + +static size_t hfsplus_osx_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + /* + * This method is not used. + * It is used hfsplus_listxattr() instead of generic_listxattr(). + */ + return -EOPNOTSUPP; +} + +const struct xattr_handler hfsplus_xattr_osx_handler = { + .prefix = XATTR_MAC_OSX_PREFIX, + .list = hfsplus_osx_listxattr, + .get = hfsplus_osx_getxattr, + .set = hfsplus_osx_setxattr, +}; diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h new file mode 100644 index 0000000..847b695 --- /dev/null +++ b/fs/hfsplus/xattr.h @@ -0,0 +1,60 @@ +/* + * linux/fs/hfsplus/xattr.h + * + * Vyacheslav Dubeyko <slava@dubeyko.com> + * + * Logic of processing extended attributes + */ + +#ifndef _LINUX_HFSPLUS_XATTR_H +#define _LINUX_HFSPLUS_XATTR_H + +#include <linux/xattr.h> + +extern const struct xattr_handler hfsplus_xattr_osx_handler; +extern const struct xattr_handler hfsplus_xattr_user_handler; +extern const struct xattr_handler hfsplus_xattr_trusted_handler; +/*extern const struct xattr_handler hfsplus_xattr_acl_access_handler;*/ +/*extern const struct xattr_handler hfsplus_xattr_acl_default_handler;*/ +extern const struct xattr_handler hfsplus_xattr_security_handler; + +extern const struct xattr_handler *hfsplus_xattr_handlers[]; + +int __hfsplus_setxattr(struct inode *inode, const char *name, + const void *value, size_t size, int flags); + +static inline int hfsplus_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + return __hfsplus_setxattr(dentry->d_inode, name, value, size, flags); +} + +ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size); + +ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size); + +int hfsplus_removexattr(struct dentry *dentry, const char *name); + +int hfsplus_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr); + +static inline int hfsplus_init_acl(struct inode *inode, struct inode *dir) +{ + /*TODO: implement*/ + return 0; +} + +static inline int hfsplus_init_inode_security(struct inode *inode, + struct inode *dir, + const struct qstr *qstr) +{ + int err; + + err = hfsplus_init_acl(inode, dir); + if (!err) + err = hfsplus_init_security(inode, dir, qstr); + return err; +} + +#endif diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c new file mode 100644 index 0000000..83b842f --- /dev/null +++ b/fs/hfsplus/xattr_security.c @@ -0,0 +1,104 @@ +/* + * linux/fs/hfsplus/xattr_trusted.c + * + * Vyacheslav Dubeyko <slava@dubeyko.com> + * + * Handler for storing security labels as extended attributes. + */ + +#include <linux/security.h> +#include "hfsplus_fs.h" +#include "xattr.h" + +static int hfsplus_security_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; + size_t len = strlen(name); + + if (!strcmp(name, "")) + return -EINVAL; + + if (len + XATTR_SECURITY_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN) + return -EOPNOTSUPP; + + strcpy(xattr_name, XATTR_SECURITY_PREFIX); + strcpy(xattr_name + XATTR_SECURITY_PREFIX_LEN, name); + + return hfsplus_getxattr(dentry, xattr_name, buffer, size); +} + +static int hfsplus_security_setxattr(struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags, int type) +{ + char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; + size_t len = strlen(name); + + if (!strcmp(name, "")) + return -EINVAL; + + if (len + XATTR_SECURITY_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN) + return -EOPNOTSUPP; + + strcpy(xattr_name, XATTR_SECURITY_PREFIX); + strcpy(xattr_name + XATTR_SECURITY_PREFIX_LEN, name); + + return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); +} + +static size_t hfsplus_security_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + /* + * This method is not used. + * It is used hfsplus_listxattr() instead of generic_listxattr(). + */ + return -EOPNOTSUPP; +} + +static int hfsplus_initxattrs(struct inode *inode, + const struct xattr *xattr_array, + void *fs_info) +{ + const struct xattr *xattr; + char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; + size_t xattr_name_len; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + xattr_name_len = strlen(xattr->name); + + if (xattr_name_len == 0) + continue; + + if (xattr_name_len + XATTR_SECURITY_PREFIX_LEN > + HFSPLUS_ATTR_MAX_STRLEN) + return -EOPNOTSUPP; + + strcpy(xattr_name, XATTR_SECURITY_PREFIX); + strcpy(xattr_name + + XATTR_SECURITY_PREFIX_LEN, xattr->name); + memset(xattr_name + + XATTR_SECURITY_PREFIX_LEN + xattr_name_len, 0, 1); + + err = __hfsplus_setxattr(inode, xattr_name, + xattr->value, xattr->value_len, 0); + if (err) + break; + } + return err; +} + +int hfsplus_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &hfsplus_initxattrs, NULL); +} + +const struct xattr_handler hfsplus_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = hfsplus_security_listxattr, + .get = hfsplus_security_getxattr, + .set = hfsplus_security_setxattr, +}; diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c new file mode 100644 index 0000000..426cee2 --- /dev/null +++ b/fs/hfsplus/xattr_trusted.c @@ -0,0 +1,63 @@ +/* + * linux/fs/hfsplus/xattr_trusted.c + * + * Vyacheslav Dubeyko <slava@dubeyko.com> + * + * Handler for trusted extended attributes. + */ + +#include "hfsplus_fs.h" +#include "xattr.h" + +static int hfsplus_trusted_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; + size_t len = strlen(name); + + if (!strcmp(name, "")) + return -EINVAL; + + if (len + XATTR_TRUSTED_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN) + return -EOPNOTSUPP; + + strcpy(xattr_name, XATTR_TRUSTED_PREFIX); + strcpy(xattr_name + XATTR_TRUSTED_PREFIX_LEN, name); + + return hfsplus_getxattr(dentry, xattr_name, buffer, size); +} + +static int hfsplus_trusted_setxattr(struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags, int type) +{ + char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; + size_t len = strlen(name); + + if (!strcmp(name, "")) + return -EINVAL; + + if (len + XATTR_TRUSTED_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN) + return -EOPNOTSUPP; + + strcpy(xattr_name, XATTR_TRUSTED_PREFIX); + strcpy(xattr_name + XATTR_TRUSTED_PREFIX_LEN, name); + + return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); +} + +static size_t hfsplus_trusted_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + /* + * This method is not used. + * It is used hfsplus_listxattr() instead of generic_listxattr(). + */ + return -EOPNOTSUPP; +} + +const struct xattr_handler hfsplus_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = hfsplus_trusted_listxattr, + .get = hfsplus_trusted_getxattr, + .set = hfsplus_trusted_setxattr, +}; diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c new file mode 100644 index 0000000..e340165 --- /dev/null +++ b/fs/hfsplus/xattr_user.c @@ -0,0 +1,63 @@ +/* + * linux/fs/hfsplus/xattr_user.c + * + * Vyacheslav Dubeyko <slava@dubeyko.com> + * + * Handler for user extended attributes. + */ + +#include "hfsplus_fs.h" +#include "xattr.h" + +static int hfsplus_user_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; + size_t len = strlen(name); + + if (!strcmp(name, "")) + return -EINVAL; + + if (len + XATTR_USER_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN) + return -EOPNOTSUPP; + + strcpy(xattr_name, XATTR_USER_PREFIX); + strcpy(xattr_name + XATTR_USER_PREFIX_LEN, name); + + return hfsplus_getxattr(dentry, xattr_name, buffer, size); +} + +static int hfsplus_user_setxattr(struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags, int type) +{ + char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; + size_t len = strlen(name); + + if (!strcmp(name, "")) + return -EINVAL; + + if (len + XATTR_USER_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN) + return -EOPNOTSUPP; + + strcpy(xattr_name, XATTR_USER_PREFIX); + strcpy(xattr_name + XATTR_USER_PREFIX_LEN, name); + + return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); +} + +static size_t hfsplus_user_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + /* + * This method is not used. + * It is used hfsplus_listxattr() instead of generic_listxattr(). + */ + return -EOPNOTSUPP; +} + +const struct xattr_handler hfsplus_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .list = hfsplus_user_listxattr, + .get = hfsplus_user_getxattr, + .set = hfsplus_user_setxattr, +}; @@ -798,11 +798,10 @@ static struct inode *find_inode(struct super_block *sb, int (*test)(struct inode *, void *), void *data) { - struct hlist_node *node; struct inode *inode = NULL; repeat: - hlist_for_each_entry(inode, node, head, i_hash) { + hlist_for_each_entry(inode, head, i_hash) { spin_lock(&inode->i_lock); if (inode->i_sb != sb) { spin_unlock(&inode->i_lock); @@ -830,11 +829,10 @@ repeat: static struct inode *find_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino) { - struct hlist_node *node; struct inode *inode = NULL; repeat: - hlist_for_each_entry(inode, node, head, i_hash) { + hlist_for_each_entry(inode, head, i_hash) { spin_lock(&inode->i_lock); if (inode->i_ino != ino) { spin_unlock(&inode->i_lock); @@ -1132,11 +1130,10 @@ EXPORT_SYMBOL(iget_locked); static int test_inode_iunique(struct super_block *sb, unsigned long ino) { struct hlist_head *b = inode_hashtable + hash(sb, ino); - struct hlist_node *node; struct inode *inode; spin_lock(&inode_hash_lock); - hlist_for_each_entry(inode, node, b, i_hash) { + hlist_for_each_entry(inode, b, i_hash) { if (inode->i_ino == ino && inode->i_sb == sb) { spin_unlock(&inode_hash_lock); return 0; @@ -1291,10 +1288,9 @@ int insert_inode_locked(struct inode *inode) struct hlist_head *head = inode_hashtable + hash(sb, ino); while (1) { - struct hlist_node *node; struct inode *old = NULL; spin_lock(&inode_hash_lock); - hlist_for_each_entry(old, node, head, i_hash) { + hlist_for_each_entry(old, head, i_hash) { if (old->i_ino != ino) continue; if (old->i_sb != sb) @@ -1306,7 +1302,7 @@ int insert_inode_locked(struct inode *inode) } break; } - if (likely(!node)) { + if (likely(!old)) { spin_lock(&inode->i_lock); inode->i_state |= I_NEW; hlist_add_head(&inode->i_hash, head); @@ -1334,11 +1330,10 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, struct hlist_head *head = inode_hashtable + hash(sb, hashval); while (1) { - struct hlist_node *node; struct inode *old = NULL; spin_lock(&inode_hash_lock); - hlist_for_each_entry(old, node, head, i_hash) { + hlist_for_each_entry(old, head, i_hash) { if (old->i_sb != sb) continue; if (!test(old, data)) @@ -1350,7 +1345,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, } break; } - if (likely(!node)) { + if (likely(!old)) { spin_lock(&inode->i_lock); inode->i_state |= I_NEW; hlist_add_head(&inode->i_hash, head); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index b7e2385..d6ee5ae 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -382,7 +382,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask, if (err < 0) { jbd2_free_handle(handle); current->journal_info = NULL; - handle = ERR_PTR(err); + return ERR_PTR(err); } handle->h_type = type; handle->h_line_no = line_no; diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index a271740..0796c45 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -11,7 +11,7 @@ #include <linux/slab.h> #include <linux/time.h> #include <linux/nfs_fs.h> -#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h> #include <linux/sunrpc/svc.h> #include <linux/lockd/lockd.h> #include <linux/kthread.h> @@ -220,10 +220,19 @@ reclaimer(void *ptr) { struct nlm_host *host = (struct nlm_host *) ptr; struct nlm_wait *block; + struct nlm_rqst *req; struct file_lock *fl, *next; u32 nsmstate; struct net *net = host->net; + req = kmalloc(sizeof(*req), GFP_KERNEL); + if (!req) { + printk(KERN_ERR "lockd: reclaimer unable to alloc memory." + " Locks for %s won't be reclaimed!\n", + host->h_name); + return 0; + } + allow_signal(SIGKILL); down_write(&host->h_rwsem); @@ -253,7 +262,7 @@ restart: */ if (signalled()) continue; - if (nlmclnt_reclaim(host, fl) != 0) + if (nlmclnt_reclaim(host, fl, req) != 0) continue; list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted); if (host->h_nsmstate != nsmstate) { @@ -279,5 +288,6 @@ restart: /* Release host handle after use */ nlmclnt_release_host(host); lockd_down(net); + kfree(req); return 0; } diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 3662771..7e529c3 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -618,17 +618,15 @@ out_unlock: * RECLAIM: Try to reclaim a lock */ int -nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl) +nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl, + struct nlm_rqst *req) { - struct nlm_rqst reqst, *req; int status; - req = &reqst; memset(req, 0, sizeof(*req)); locks_init_lock(&req->a_args.lock.fl); locks_init_lock(&req->a_res.lock.fl); req->a_host = host; - req->a_flags = 0; /* Set up the argument struct */ nlmclnt_setlockargs(req, fl); diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 0e17090..969d589 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -13,6 +13,7 @@ #include <linux/in.h> #include <linux/in6.h> #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h> #include <linux/sunrpc/svc.h> #include <linux/lockd/lockd.h> #include <linux/mutex.h> @@ -32,15 +33,15 @@ static struct hlist_head nlm_server_hosts[NLM_HOST_NRHASH]; static struct hlist_head nlm_client_hosts[NLM_HOST_NRHASH]; -#define for_each_host(host, pos, chain, table) \ +#define for_each_host(host, chain, table) \ for ((chain) = (table); \ (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \ - hlist_for_each_entry((host), (pos), (chain), h_hash) + hlist_for_each_entry((host), (chain), h_hash) -#define for_each_host_safe(host, pos, next, chain, table) \ +#define for_each_host_safe(host, next, chain, table) \ for ((chain) = (table); \ (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \ - hlist_for_each_entry_safe((host), (pos), (next), \ + hlist_for_each_entry_safe((host), (next), \ (chain), h_hash) static unsigned long nrhosts; @@ -225,7 +226,6 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, .net = net, }; struct hlist_head *chain; - struct hlist_node *pos; struct nlm_host *host; struct nsm_handle *nsm = NULL; struct lockd_net *ln = net_generic(net, lockd_net_id); @@ -237,7 +237,7 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, mutex_lock(&nlm_host_mutex); chain = &nlm_client_hosts[nlm_hash_address(sap)]; - hlist_for_each_entry(host, pos, chain, h_hash) { + hlist_for_each_entry(host, chain, h_hash) { if (host->net != net) continue; if (!rpc_cmp_addr(nlm_addr(host), sap)) @@ -322,7 +322,6 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, const size_t hostname_len) { struct hlist_head *chain; - struct hlist_node *pos; struct nlm_host *host = NULL; struct nsm_handle *nsm = NULL; struct sockaddr *src_sap = svc_daddr(rqstp); @@ -350,7 +349,7 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, nlm_gc_hosts(net); chain = &nlm_server_hosts[nlm_hash_address(ni.sap)]; - hlist_for_each_entry(host, pos, chain, h_hash) { + hlist_for_each_entry(host, chain, h_hash) { if (host->net != net) continue; if (!rpc_cmp_addr(nlm_addr(host), ni.sap)) @@ -515,10 +514,9 @@ static struct nlm_host *next_host_state(struct hlist_head *cache, { struct nlm_host *host; struct hlist_head *chain; - struct hlist_node *pos; mutex_lock(&nlm_host_mutex); - for_each_host(host, pos, chain, cache) { + for_each_host(host, chain, cache) { if (host->h_nsmhandle == nsm && host->h_nsmstate != info->state) { host->h_nsmstate = info->state; @@ -570,7 +568,6 @@ void nlm_host_rebooted(const struct nlm_reboot *info) static void nlm_complain_hosts(struct net *net) { struct hlist_head *chain; - struct hlist_node *pos; struct nlm_host *host; if (net) { @@ -587,7 +584,7 @@ static void nlm_complain_hosts(struct net *net) dprintk("lockd: %lu hosts left:\n", nrhosts); } - for_each_host(host, pos, chain, nlm_server_hosts) { + for_each_host(host, chain, nlm_server_hosts) { if (net && host->net != net) continue; dprintk(" %s (cnt %d use %d exp %ld net %p)\n", @@ -600,14 +597,13 @@ void nlm_shutdown_hosts_net(struct net *net) { struct hlist_head *chain; - struct hlist_node *pos; struct nlm_host *host; mutex_lock(&nlm_host_mutex); /* First, make all hosts eligible for gc */ dprintk("lockd: nuking all hosts in net %p...\n", net); - for_each_host(host, pos, chain, nlm_server_hosts) { + for_each_host(host, chain, nlm_server_hosts) { if (net && host->net != net) continue; host->h_expires = jiffies - 1; @@ -644,11 +640,11 @@ static void nlm_gc_hosts(struct net *net) { struct hlist_head *chain; - struct hlist_node *pos, *next; + struct hlist_node *next; struct nlm_host *host; dprintk("lockd: host garbage collection for net %p\n", net); - for_each_host(host, pos, chain, nlm_server_hosts) { + for_each_host(host, chain, nlm_server_hosts) { if (net && host->net != net) continue; host->h_inuse = 0; @@ -657,7 +653,7 @@ nlm_gc_hosts(struct net *net) /* Mark all hosts that hold locks, blocks or shares */ nlmsvc_mark_resources(net); - for_each_host_safe(host, pos, next, chain, nlm_server_hosts) { + for_each_host_safe(host, next, chain, nlm_server_hosts) { if (net && host->net != net) continue; if (atomic_read(&host->h_count) || host->h_inuse diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 3c2cfc6..1812f02 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -12,6 +12,7 @@ #include <linux/slab.h> #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h> #include <linux/sunrpc/xprtsock.h> #include <linux/sunrpc/svc.h> #include <linux/lockd/lockd.h> diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index b3a24b0..97e8741 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -13,7 +13,7 @@ #include <linux/slab.h> #include <linux/mutex.h> #include <linux/sunrpc/svc.h> -#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h> #include <linux/nfsd/nfsfh.h> #include <linux/nfsd/export.h> #include <linux/lockd/lockd.h> @@ -84,7 +84,6 @@ __be32 nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result, struct nfs_fh *f) { - struct hlist_node *pos; struct nlm_file *file; unsigned int hash; __be32 nfserr; @@ -96,7 +95,7 @@ nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result, /* Lock file table */ mutex_lock(&nlm_file_mutex); - hlist_for_each_entry(file, pos, &nlm_files[hash], f_list) + hlist_for_each_entry(file, &nlm_files[hash], f_list) if (!nfs_compare_fh(&file->f_handle, f)) goto found; @@ -248,13 +247,13 @@ static int nlm_traverse_files(void *data, nlm_host_match_fn_t match, int (*is_failover_file)(void *data, struct nlm_file *file)) { - struct hlist_node *pos, *next; + struct hlist_node *next; struct nlm_file *file; int i, ret = 0; mutex_lock(&nlm_file_mutex); for (i = 0; i < FILE_NRHASH; i++) { - hlist_for_each_entry_safe(file, pos, next, &nlm_files[i], f_list) { + hlist_for_each_entry_safe(file, next, &nlm_files[i], f_list) { if (is_failover_file && !is_failover_file(data, file)) continue; file->f_count++; diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c index 862a2f1..5f7b053 100644 --- a/fs/nfs/cache_lib.c +++ b/fs/nfs/cache_lib.c @@ -128,10 +128,13 @@ int nfs_cache_register_net(struct net *net, struct cache_detail *cd) struct super_block *pipefs_sb; int ret = 0; + sunrpc_init_cache_detail(cd); pipefs_sb = rpc_get_sb_net(net); if (pipefs_sb) { ret = nfs_cache_register_sb(pipefs_sb, cd); rpc_put_sb_net(net); + if (ret) + sunrpc_destroy_cache_detail(cd); } return ret; } @@ -151,14 +154,5 @@ void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd) nfs_cache_unregister_sb(pipefs_sb, cd); rpc_put_sb_net(net); } -} - -void nfs_cache_init(struct cache_detail *cd) -{ - sunrpc_init_cache_detail(cd); -} - -void nfs_cache_destroy(struct cache_detail *cd) -{ sunrpc_destroy_cache_detail(cd); } diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h index 317db95..4116d2c 100644 --- a/fs/nfs/cache_lib.h +++ b/fs/nfs/cache_lib.h @@ -23,8 +23,6 @@ extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void); extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq); extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq); -extern void nfs_cache_init(struct cache_detail *cd); -extern void nfs_cache_destroy(struct cache_detail *cd); extern int nfs_cache_register_net(struct net *net, struct cache_detail *cd); extern void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd); extern int nfs_cache_register_sb(struct super_block *sb, diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 9f3c664..84d8eae 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -197,7 +197,6 @@ error_0: EXPORT_SYMBOL_GPL(nfs_alloc_client); #if IS_ENABLED(CONFIG_NFS_V4) -/* idr_remove_all is not needed as all id's are removed by nfs_put_client */ void nfs_cleanup_cb_ident_idr(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index ca4b11e..9455270 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -10,6 +10,7 @@ #include <linux/module.h> #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h> #include <linux/dns_resolver.h> #include "dns_resolve.h" @@ -42,6 +43,7 @@ EXPORT_SYMBOL_GPL(nfs_dns_resolve_name); #include <linux/seq_file.h> #include <linux/inet.h> #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h> #include <linux/sunrpc/cache.h> #include <linux/sunrpc/svcauth.h> #include <linux/sunrpc/rpc_pipe_fs.h> @@ -142,7 +144,7 @@ static int nfs_dns_upcall(struct cache_detail *cd, ret = nfs_cache_upcall(cd, key->hostname); if (ret) - ret = sunrpc_cache_pipe_upcall(cd, ch, nfs_dns_request); + ret = sunrpc_cache_pipe_upcall(cd, ch); return ret; } @@ -351,60 +353,47 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, } EXPORT_SYMBOL_GPL(nfs_dns_resolve_name); +static struct cache_detail nfs_dns_resolve_template = { + .owner = THIS_MODULE, + .hash_size = NFS_DNS_HASHTBL_SIZE, + .name = "dns_resolve", + .cache_put = nfs_dns_ent_put, + .cache_upcall = nfs_dns_upcall, + .cache_request = nfs_dns_request, + .cache_parse = nfs_dns_parse, + .cache_show = nfs_dns_show, + .match = nfs_dns_match, + .init = nfs_dns_ent_init, + .update = nfs_dns_ent_update, + .alloc = nfs_dns_ent_alloc, +}; + + int nfs_dns_resolver_cache_init(struct net *net) { - int err = -ENOMEM; + int err; struct nfs_net *nn = net_generic(net, nfs_net_id); - struct cache_detail *cd; - struct cache_head **tbl; - cd = kzalloc(sizeof(struct cache_detail), GFP_KERNEL); - if (cd == NULL) - goto err_cd; - - tbl = kzalloc(NFS_DNS_HASHTBL_SIZE * sizeof(struct cache_head *), - GFP_KERNEL); - if (tbl == NULL) - goto err_tbl; - - cd->owner = THIS_MODULE, - cd->hash_size = NFS_DNS_HASHTBL_SIZE, - cd->hash_table = tbl, - cd->name = "dns_resolve", - cd->cache_put = nfs_dns_ent_put, - cd->cache_upcall = nfs_dns_upcall, - cd->cache_parse = nfs_dns_parse, - cd->cache_show = nfs_dns_show, - cd->match = nfs_dns_match, - cd->init = nfs_dns_ent_init, - cd->update = nfs_dns_ent_update, - cd->alloc = nfs_dns_ent_alloc, - - nfs_cache_init(cd); - err = nfs_cache_register_net(net, cd); + nn->nfs_dns_resolve = cache_create_net(&nfs_dns_resolve_template, net); + if (IS_ERR(nn->nfs_dns_resolve)) + return PTR_ERR(nn->nfs_dns_resolve); + + err = nfs_cache_register_net(net, nn->nfs_dns_resolve); if (err) goto err_reg; - nn->nfs_dns_resolve = cd; return 0; err_reg: - nfs_cache_destroy(cd); - kfree(cd->hash_table); -err_tbl: - kfree(cd); -err_cd: + cache_destroy_net(nn->nfs_dns_resolve, net); return err; } void nfs_dns_resolver_cache_destroy(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); - struct cache_detail *cd = nn->nfs_dns_resolve; - nfs_cache_unregister_net(net, cd); - nfs_cache_destroy(cd); - kfree(cd->hash_table); - kfree(cd); + nfs_cache_unregister_net(net, nn->nfs_dns_resolve); + cache_destroy_net(nn->nfs_dns_resolve, net); } static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index b586fe9..1f94167 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -237,6 +237,8 @@ nfs_find_actor(struct inode *inode, void *opaque) if (NFS_FILEID(inode) != fattr->fileid) return 0; + if ((S_IFMT & inode->i_mode) != (S_IFMT & fattr->mode)) + return 0; if (nfs_compare_fh(NFS_FH(inode), fh)) return 0; if (is_bad_inode(inode) || NFS_STALE(inode)) diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 2e9779b..ac4fc9a 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -6,6 +6,7 @@ #include <linux/nfs_fs.h> #include <linux/nfs_idmap.h> #include <linux/nfs_mount.h> +#include <linux/sunrpc/addr.h> #include <linux/sunrpc/auth.h> #include <linux/sunrpc/xprt.h> #include <linux/sunrpc/bc_xprt.h> @@ -29,15 +30,14 @@ static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion) if (clp->rpc_ops->version != 4 || minorversion != 0) return ret; -retry: - if (!idr_pre_get(&nn->cb_ident_idr, GFP_KERNEL)) - return -ENOMEM; + idr_preload(GFP_KERNEL); spin_lock(&nn->nfs_client_lock); - ret = idr_get_new(&nn->cb_ident_idr, clp, &clp->cl_cb_ident); + ret = idr_alloc(&nn->cb_ident_idr, clp, 0, 0, GFP_NOWAIT); + if (ret >= 0) + clp->cl_cb_ident = ret; spin_unlock(&nn->nfs_client_lock); - if (ret == -EAGAIN) - goto retry; - return ret; + idr_preload_end(); + return ret < 0 ? ret : 0; } #ifdef CONFIG_NFS_V4_1 diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 194c484..49eeb04 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -99,7 +99,8 @@ static void filelayout_reset_write(struct nfs_write_data *data) task->tk_status = pnfs_write_done_resend_to_mds(hdr->inode, &hdr->pages, - hdr->completion_ops); + hdr->completion_ops, + hdr->dreq); } } @@ -119,7 +120,8 @@ static void filelayout_reset_read(struct nfs_read_data *data) task->tk_status = pnfs_read_done_resend_to_mds(hdr->inode, &hdr->pages, - hdr->completion_ops); + hdr->completion_ops, + hdr->dreq); } } diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h index 8c07241..b8da955 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/nfs4filelayout.h @@ -36,7 +36,7 @@ * Default data server connection timeout and retrans vaules. * Set by module paramters dataserver_timeo and dataserver_retrans. */ -#define NFS4_DEF_DS_TIMEO 60 +#define NFS4_DEF_DS_TIMEO 600 /* in tenths of a second */ #define NFS4_DEF_DS_RETRANS 5 /* diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index b720064..1fe284f 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -31,6 +31,7 @@ #include <linux/nfs_fs.h> #include <linux/vmalloc.h> #include <linux/module.h> +#include <linux/sunrpc/addr.h> #include "internal.h" #include "nfs4session.h" diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 1e09eb7..0dd7660 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -14,6 +14,7 @@ #include <linux/slab.h> #include <linux/string.h> #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h> #include <linux/vfs.h> #include <linux/inet.h> #include "internal.h" diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index eae83bf..b2671cb 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -93,6 +93,8 @@ static int nfs4_map_errors(int err) return err; switch (err) { case -NFS4ERR_RESOURCE: + case -NFS4ERR_LAYOUTTRYLATER: + case -NFS4ERR_RECALLCONFLICT: return -EREMOTEIO; case -NFS4ERR_WRONGSEC: return -EPERM; @@ -1158,6 +1160,7 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) data->o_arg.fmode); iput(inode); out: + nfs_release_seqid(data->o_arg.seqid); return state; err_put_inode: iput(inode); @@ -6045,6 +6048,7 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) struct nfs_server *server = NFS_SERVER(inode); struct pnfs_layout_hdr *lo; struct nfs4_state *state = NULL; + unsigned long timeo, giveup; dprintk("--> %s\n", __func__); @@ -6056,7 +6060,10 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) goto out; case -NFS4ERR_LAYOUTTRYLATER: case -NFS4ERR_RECALLCONFLICT: - task->tk_status = -NFS4ERR_DELAY; + timeo = rpc_get_timeout(task->tk_client); + giveup = lgp->args.timestamp + timeo; + if (time_after(giveup, jiffies)) + task->tk_status = -NFS4ERR_DELAY; break; case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: @@ -6129,11 +6136,13 @@ static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags) static void nfs4_layoutget_release(void *calldata) { struct nfs4_layoutget *lgp = calldata; - struct nfs_server *server = NFS_SERVER(lgp->args.inode); + struct inode *inode = lgp->args.inode; + struct nfs_server *server = NFS_SERVER(inode); size_t max_pages = max_response_pages(server); dprintk("--> %s\n", __func__); nfs4_free_pages(lgp->args.layout.pages, max_pages); + pnfs_put_layout_hdr(NFS_I(inode)->layout); put_nfs_open_context(lgp->args.ctx); kfree(calldata); dprintk("<-- %s\n", __func__); @@ -6148,7 +6157,8 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = { struct pnfs_layout_segment * nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) { - struct nfs_server *server = NFS_SERVER(lgp->args.inode); + struct inode *inode = lgp->args.inode; + struct nfs_server *server = NFS_SERVER(inode); size_t max_pages = max_response_pages(server); struct rpc_task *task; struct rpc_message msg = { @@ -6174,10 +6184,15 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) return ERR_PTR(-ENOMEM); } lgp->args.layout.pglen = max_pages * PAGE_SIZE; + lgp->args.timestamp = jiffies; lgp->res.layoutp = &lgp->args.layout; lgp->res.seq_res.sr_slot = NULL; nfs41_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0); + + /* nfs4_layoutget_release calls pnfs_put_layout_hdr */ + pnfs_get_layout_hdr(NFS_I(inode)->layout); + task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return ERR_CAST(task); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 6be70f6..48ac5aa 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1181,7 +1181,7 @@ pnfs_update_layout(struct inode *ino, struct nfs_client *clp = server->nfs_client; struct pnfs_layout_hdr *lo; struct pnfs_layout_segment *lseg = NULL; - bool first = false; + bool first; if (!pnfs_enabled_sb(NFS_SERVER(ino))) goto out; @@ -1215,10 +1215,9 @@ pnfs_update_layout(struct inode *ino, goto out_unlock; atomic_inc(&lo->plh_outstanding); - if (list_empty(&lo->plh_segs)) - first = true; - + first = list_empty(&lo->plh_layouts) ? true : false; spin_unlock(&ino->i_lock); + if (first) { /* The lo must be on the clp list if there is any * chance of a CB_LAYOUTRECALL(FILE) coming in. @@ -1422,13 +1421,15 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head, - const struct nfs_pgio_completion_ops *compl_ops) + const struct nfs_pgio_completion_ops *compl_ops, + struct nfs_direct_req *dreq) { struct nfs_pageio_descriptor pgio; LIST_HEAD(failed); /* Resend all requests through the MDS */ nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, compl_ops); + pgio.pg_dreq = dreq; while (!list_empty(head)) { struct nfs_page *req = nfs_list_entry(head->next); @@ -1463,7 +1464,8 @@ static void pnfs_ld_handle_write_error(struct nfs_write_data *data) if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) data->task.tk_status = pnfs_write_done_resend_to_mds(hdr->inode, &hdr->pages, - hdr->completion_ops); + hdr->completion_ops, + hdr->dreq); } /* @@ -1578,13 +1580,15 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); int pnfs_read_done_resend_to_mds(struct inode *inode, struct list_head *head, - const struct nfs_pgio_completion_ops *compl_ops) + const struct nfs_pgio_completion_ops *compl_ops, + struct nfs_direct_req *dreq) { struct nfs_pageio_descriptor pgio; LIST_HEAD(failed); /* Resend all requests through the MDS */ nfs_pageio_init_read(&pgio, inode, compl_ops); + pgio.pg_dreq = dreq; while (!list_empty(head)) { struct nfs_page *req = nfs_list_entry(head->next); @@ -1615,7 +1619,8 @@ static void pnfs_ld_handle_read_error(struct nfs_read_data *data) if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) data->task.tk_status = pnfs_read_done_resend_to_mds(hdr->inode, &hdr->pages, - hdr->completion_ops); + hdr->completion_ops, + hdr->dreq); } /* diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 97cb358..94ba804 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -230,9 +230,11 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp); int pnfs_read_done_resend_to_mds(struct inode *inode, struct list_head *head, - const struct nfs_pgio_completion_ops *compl_ops); + const struct nfs_pgio_completion_ops *compl_ops, + struct nfs_direct_req *dreq); int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head, - const struct nfs_pgio_completion_ops *compl_ops); + const struct nfs_pgio_completion_ops *compl_ops, + struct nfs_direct_req *dreq); struct nfs4_threshold *pnfs_mdsthreshold_alloc(void); /* nfs4_deviceid_flags */ diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index d35b62e..6da209b 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -77,9 +77,8 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld, long hash) { struct nfs4_deviceid_node *d; - struct hlist_node *n; - hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node) + hlist_for_each_entry_rcu(d, &nfs4_deviceid_cache[hash], node) if (d->ld == ld && d->nfs_client == clp && !memcmp(&d->deviceid, id, sizeof(*id))) { if (atomic_read(&d->ref)) @@ -248,12 +247,11 @@ static void _deviceid_purge_client(const struct nfs_client *clp, long hash) { struct nfs4_deviceid_node *d; - struct hlist_node *n; HLIST_HEAD(tmp); spin_lock(&nfs4_deviceid_lock); rcu_read_lock(); - hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node) + hlist_for_each_entry_rcu(d, &nfs4_deviceid_cache[hash], node) if (d->nfs_client == clp && atomic_read(&d->ref)) { hlist_del_init_rcu(&d->node); hlist_add_head(&d->tmpnode, &tmp); @@ -291,12 +289,11 @@ void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp) { struct nfs4_deviceid_node *d; - struct hlist_node *n; int i; rcu_read_lock(); for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i ++){ - hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[i], node) + hlist_for_each_entry_rcu(d, &nfs4_deviceid_cache[i], node) if (d->nfs_client == clp) set_bit(NFS_DEVICEID_INVALID, &d->flags); } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index a9dc5fc..17b32b7 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -31,6 +31,7 @@ #include <linux/errno.h> #include <linux/unistd.h> #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/metrics.h> #include <linux/sunrpc/xprtsock.h> diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index d26a32f..1f1f38f 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -335,20 +335,14 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata) struct inode *old_dir = data->old_dir; struct inode *new_dir = data->new_dir; struct dentry *old_dentry = data->old_dentry; - struct dentry *new_dentry = data->new_dentry; if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) { rpc_restart_call_prepare(task); return; } - if (task->tk_status != 0) { + if (task->tk_status != 0) nfs_cancel_async_unlink(old_dentry); - return; - } - - d_drop(old_dentry); - d_drop(new_dentry); } /** @@ -549,6 +543,18 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) error = rpc_wait_for_completion_task(task); if (error == 0) error = task->tk_status; + switch (error) { + case 0: + /* The rename succeeded */ + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + d_move(dentry, sdentry); + break; + case -ERESTARTSYS: + /* The result of the rename is unknown. Play it safe by + * forcing a new lookup */ + d_drop(dentry); + d_drop(sdentry); + } rpc_put_task(task); out_dput: dput(sdentry); diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index 93cc9d3..87fd141 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -12,6 +12,10 @@ /* * Representation of a reply cache entry. + * + * Note that we use a sockaddr_in6 to hold the address instead of the more + * typical sockaddr_storage. This is for space reasons, since sockaddr_storage + * is much larger than a sockaddr_in6. */ struct svc_cacherep { struct hlist_node c_hash; @@ -20,11 +24,13 @@ struct svc_cacherep { unsigned char c_state, /* unused, inprog, done */ c_type, /* status, buffer */ c_secure : 1; /* req came from port < 1024 */ - struct sockaddr_in c_addr; + struct sockaddr_in6 c_addr; __be32 c_xid; u32 c_prot; u32 c_proc; u32 c_vers; + unsigned int c_len; + __wsum c_csum; unsigned long c_timestamp; union { struct kvec u_vec; @@ -46,8 +52,7 @@ enum { enum { RC_DROPIT, RC_REPLY, - RC_DOIT, - RC_INTR + RC_DOIT }; /* @@ -67,6 +72,12 @@ enum { */ #define RC_DELAY (HZ/5) +/* Cache entries expire after this time period */ +#define RC_EXPIRE (120 * HZ) + +/* Checksum this amount of the request */ +#define RC_CSUMLEN (256U) + int nfsd_reply_cache_init(void); void nfsd_reply_cache_shutdown(void); int nfsd_cache_lookup(struct svc_rqst *); diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 5681c59..5f38ea3 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -67,11 +67,6 @@ static void expkey_request(struct cache_detail *cd, (*bpp)[-1] = '\n'; } -static int expkey_upcall(struct cache_detail *cd, struct cache_head *h) -{ - return sunrpc_cache_pipe_upcall(cd, h, expkey_request); -} - static struct svc_expkey *svc_expkey_update(struct cache_detail *cd, struct svc_expkey *new, struct svc_expkey *old); static struct svc_expkey *svc_expkey_lookup(struct cache_detail *cd, struct svc_expkey *); @@ -245,7 +240,7 @@ static struct cache_detail svc_expkey_cache_template = { .hash_size = EXPKEY_HASHMAX, .name = "nfsd.fh", .cache_put = expkey_put, - .cache_upcall = expkey_upcall, + .cache_request = expkey_request, .cache_parse = expkey_parse, .cache_show = expkey_show, .match = expkey_match, @@ -315,6 +310,7 @@ static void svc_export_put(struct kref *ref) path_put(&exp->ex_path); auth_domain_put(exp->ex_client); nfsd4_fslocs_free(&exp->ex_fslocs); + kfree(exp->ex_uuid); kfree(exp); } @@ -337,11 +333,6 @@ static void svc_export_request(struct cache_detail *cd, (*bpp)[-1] = '\n'; } -static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h) -{ - return sunrpc_cache_pipe_upcall(cd, h, svc_export_request); -} - static struct svc_export *svc_export_update(struct svc_export *new, struct svc_export *old); static struct svc_export *svc_export_lookup(struct svc_export *); @@ -674,6 +665,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) new->ex_fslocs.locations = NULL; new->ex_fslocs.locations_count = 0; new->ex_fslocs.migrated = 0; + new->ex_uuid = NULL; new->cd = item->cd; } @@ -715,7 +707,7 @@ static struct cache_detail svc_export_cache_template = { .hash_size = EXPORT_HASHMAX, .name = "nfsd.export", .cache_put = svc_export_put, - .cache_upcall = svc_export_upcall, + .cache_request = svc_export_request, .cache_parse = svc_export_parse, .cache_show = svc_export_show, .match = svc_export_match, diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c index 497584c..d620e7f 100644 --- a/fs/nfsd/fault_inject.c +++ b/fs/nfsd/fault_inject.c @@ -9,7 +9,7 @@ #include <linux/debugfs.h> #include <linux/module.h> #include <linux/nsproxy.h> -#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h> #include <asm/uaccess.h> #include "state.h" diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 0ce1234..4832fd8 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -140,12 +140,6 @@ idtoname_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, } static int -idtoname_upcall(struct cache_detail *cd, struct cache_head *ch) -{ - return sunrpc_cache_pipe_upcall(cd, ch, idtoname_request); -} - -static int idtoname_match(struct cache_head *ca, struct cache_head *cb) { struct ent *a = container_of(ca, struct ent, h); @@ -192,7 +186,7 @@ static struct cache_detail idtoname_cache_template = { .hash_size = ENT_HASHMAX, .name = "nfs4.idtoname", .cache_put = ent_put, - .cache_upcall = idtoname_upcall, + .cache_request = idtoname_request, .cache_parse = idtoname_parse, .cache_show = idtoname_show, .warn_no_listener = warn_no_idmapd, @@ -321,12 +315,6 @@ nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, } static int -nametoid_upcall(struct cache_detail *cd, struct cache_head *ch) -{ - return sunrpc_cache_pipe_upcall(cd, ch, nametoid_request); -} - -static int nametoid_match(struct cache_head *ca, struct cache_head *cb) { struct ent *a = container_of(ca, struct ent, h); @@ -365,7 +353,7 @@ static struct cache_detail nametoid_cache_template = { .hash_size = ENT_HASHMAX, .name = "nfs4.nametoid", .cache_put = ent_put, - .cache_upcall = nametoid_upcall, + .cache_request = nametoid_request, .cache_parse = nametoid_parse, .cache_show = nametoid_show, .warn_no_listener = warn_no_idmapd, diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 9d1c5db..ae73175e 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -993,14 +993,15 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!buf) return nfserr_jukebox; + p = buf; status = nfsd4_encode_fattr(&cstate->current_fh, cstate->current_fh.fh_export, - cstate->current_fh.fh_dentry, buf, - &count, verify->ve_bmval, + cstate->current_fh.fh_dentry, &p, + count, verify->ve_bmval, rqstp, 0); /* this means that nfsd4_encode_fattr() ran out of space */ - if (status == nfserr_resource && count == 0) + if (status == nfserr_resource) status = nfserr_not_same; if (status) goto out_kfree; diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 4914af4..899ca26 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -1185,6 +1185,12 @@ bin_to_hex_dup(const unsigned char *src, int srclen) static int nfsd4_umh_cltrack_init(struct net __attribute__((unused)) *net) { + /* XXX: The usermode helper s not working in container yet. */ + if (net != &init_net) { + WARN(1, KERN_ERR "NFSD: attempt to initialize umh client " + "tracking in a container!\n"); + return -EINVAL; + } return nfsd4_umh_cltrack_upcall("init", NULL, NULL); } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 9e7103b..16d39c6 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -40,7 +40,7 @@ #include <linux/pagemap.h> #include <linux/ratelimit.h> #include <linux/sunrpc/svcauth_gss.h> -#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h> #include "xdr4.h" #include "vfs.h" #include "current_stateid.h" @@ -261,33 +261,46 @@ static inline int get_new_stid(struct nfs4_stid *stid) return new_stid; } -static void init_stid(struct nfs4_stid *stid, struct nfs4_client *cl, unsigned char type) +static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct +kmem_cache *slab) { - stateid_t *s = &stid->sc_stateid; + struct idr *stateids = &cl->cl_stateids; + static int min_stateid = 0; + struct nfs4_stid *stid; int new_id; - stid->sc_type = type; + stid = kmem_cache_alloc(slab, GFP_KERNEL); + if (!stid) + return NULL; + + if (!idr_pre_get(stateids, GFP_KERNEL)) + goto out_free; + if (idr_get_new_above(stateids, stid, min_stateid, &new_id)) + goto out_free; stid->sc_client = cl; - s->si_opaque.so_clid = cl->cl_clientid; - new_id = get_new_stid(stid); - s->si_opaque.so_id = (u32)new_id; + stid->sc_type = 0; + stid->sc_stateid.si_opaque.so_id = new_id; + stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid; /* Will be incremented before return to client: */ - s->si_generation = 0; -} - -static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab) -{ - struct idr *stateids = &cl->cl_stateids; + stid->sc_stateid.si_generation = 0; - if (!idr_pre_get(stateids, GFP_KERNEL)) - return NULL; /* - * Note: if we fail here (or any time between now and the time - * we actually get the new idr), we won't need to undo the idr - * preallocation, since the idr code caps the number of - * preallocated entries. + * It shouldn't be a problem to reuse an opaque stateid value. + * I don't think it is for 4.1. But with 4.0 I worry that, for + * example, a stray write retransmission could be accepted by + * the server when it should have been rejected. Therefore, + * adopt a trick from the sctp code to attempt to maximize the + * amount of time until an id is reused, by ensuring they always + * "increase" (mod INT_MAX): */ - return kmem_cache_alloc(slab, GFP_KERNEL); + + min_stateid = new_id+1; + if (min_stateid == INT_MAX) + min_stateid = 0; + return stid; +out_free: + kfree(stid); + return NULL; } static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp) @@ -316,7 +329,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab)); if (dp == NULL) return dp; - init_stid(&dp->dl_stid, clp, NFS4_DELEG_STID); + dp->dl_stid.sc_type = NFS4_DELEG_STID; /* * delegation seqid's are never incremented. The 4.1 special * meaning of seqid 0 isn't meaningful, really, but let's avoid @@ -337,13 +350,21 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv return dp; } +static void free_stid(struct nfs4_stid *s, struct kmem_cache *slab) +{ + struct idr *stateids = &s->sc_client->cl_stateids; + + idr_remove(stateids, s->sc_stateid.si_opaque.so_id); + kmem_cache_free(slab, s); +} + void nfs4_put_delegation(struct nfs4_delegation *dp) { if (atomic_dec_and_test(&dp->dl_count)) { dprintk("NFSD: freeing dp %p\n",dp); put_nfs4_file(dp->dl_file); - kmem_cache_free(deleg_slab, dp); + free_stid(&dp->dl_stid, deleg_slab); num_delegations--; } } @@ -360,9 +381,7 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp) static void unhash_stid(struct nfs4_stid *s) { - struct idr *stateids = &s->sc_client->cl_stateids; - - idr_remove(stateids, s->sc_stateid.si_opaque.so_id); + s->sc_type = 0; } /* Called under the state lock. */ @@ -519,7 +538,7 @@ static void close_generic_stateid(struct nfs4_ol_stateid *stp) static void free_generic_stateid(struct nfs4_ol_stateid *stp) { - kmem_cache_free(stateid_slab, stp); + free_stid(&stp->st_stid, stateid_slab); } static void release_lock_stateid(struct nfs4_ol_stateid *stp) @@ -905,7 +924,7 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan, new = __alloc_session(slotsize, numslots); if (!new) { - nfsd4_put_drc_mem(slotsize, fchan->maxreqs); + nfsd4_put_drc_mem(slotsize, numslots); return NULL; } init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize, nn); @@ -1048,7 +1067,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) static inline void free_client(struct nfs4_client *clp) { - struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + struct nfsd_net __maybe_unused *nn = net_generic(clp->net, nfsd_net_id); lockdep_assert_held(&nn->client_lock); while (!list_empty(&clp->cl_sessions)) { @@ -1060,6 +1079,7 @@ free_client(struct nfs4_client *clp) } free_svc_cred(&clp->cl_cred); kfree(clp->cl_name.data); + idr_destroy(&clp->cl_stateids); kfree(clp); } @@ -1258,7 +1278,12 @@ static void gen_confirm(struct nfs4_client *clp) static struct nfs4_stid *find_stateid(struct nfs4_client *cl, stateid_t *t) { - return idr_find(&cl->cl_stateids, t->si_opaque.so_id); + struct nfs4_stid *ret; + + ret = idr_find(&cl->cl_stateids, t->si_opaque.so_id); + if (!ret || !ret->sc_type) + return NULL; + return ret; } static struct nfs4_stid *find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask) @@ -1844,11 +1869,12 @@ nfsd4_create_session(struct svc_rqst *rqstp, /* cache solo and embedded create sessions under the state lock */ nfsd4_cache_create_session(cr_ses, cs_slot, status); -out: nfs4_unlock_state(); +out: dprintk("%s returns %d\n", __func__, ntohl(status)); return status; out_free_conn: + nfs4_unlock_state(); free_conn(conn); out_free_session: __free_session(new); @@ -2443,9 +2469,8 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { struct nfs4_openowner *oo = open->op_openowner; - struct nfs4_client *clp = oo->oo_owner.so_client; - init_stid(&stp->st_stid, clp, NFS4_OPEN_STID); + stp->st_stid.sc_type = NFS4_OPEN_STID; INIT_LIST_HEAD(&stp->st_lockowners); list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); list_add(&stp->st_perfile, &fp->fi_stateids); @@ -4031,7 +4056,7 @@ alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct stp = nfs4_alloc_stateid(clp); if (stp == NULL) return NULL; - init_stid(&stp->st_stid, clp, NFS4_LOCK_STID); + stp->st_stid.sc_type = NFS4_LOCK_STID; list_add(&stp->st_perfile, &fp->fi_stateids); list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); stp->st_stateowner = &lo->lo_owner; @@ -4913,16 +4938,6 @@ nfs4_state_start_net(struct net *net) struct nfsd_net *nn = net_generic(net, nfsd_net_id); int ret; - /* - * FIXME: For now, we hang most of the pernet global stuff off of - * init_net until nfsd is fully containerized. Eventually, we'll - * need to pass a net pointer into this function, take a reference - * to that instead and then do most of the rest of this on a per-net - * basis. - */ - if (net != &init_net) - return -EINVAL; - ret = nfs4_state_create_net(net); if (ret) return ret; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 8ca6d17..0116886 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2024,12 +2024,11 @@ static int get_parent_attributes(struct svc_export *exp, struct kstat *stat) * Note: @fhp can be NULL; in this case, we might have to compose the filehandle * ourselves. * - * @countp is the buffer size in _words_; upon successful return this becomes - * replaced with the number of words written. + * countp is the buffer size in _words_ */ __be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, - struct dentry *dentry, __be32 *buffer, int *countp, u32 *bmval, + struct dentry *dentry, __be32 **buffer, int count, u32 *bmval, struct svc_rqst *rqstp, int ignore_crossmnt) { u32 bmval0 = bmval[0]; @@ -2038,12 +2037,12 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, struct kstat stat; struct svc_fh tempfh; struct kstatfs statfs; - int buflen = *countp << 2; + int buflen = count << 2; __be32 *attrlenp; u32 dummy; u64 dummy64; u32 rdattr_err = 0; - __be32 *p = buffer; + __be32 *p = *buffer; __be32 status; int err; int aclsupport = 0; @@ -2447,7 +2446,7 @@ out_acl: } *attrlenp = htonl((char *)p - (char *)attrlenp - 4); - *countp = p - buffer; + *buffer = p; status = nfs_ok; out: @@ -2459,7 +2458,6 @@ out_nfserr: status = nfserrno(err); goto out; out_resource: - *countp = 0; status = nfserr_resource; goto out; out_serverfault: @@ -2478,7 +2476,7 @@ static inline int attributes_need_mount(u32 *bmval) static __be32 nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd, - const char *name, int namlen, __be32 *p, int *buflen) + const char *name, int namlen, __be32 **p, int buflen) { struct svc_export *exp = cd->rd_fhp->fh_export; struct dentry *dentry; @@ -2584,10 +2582,9 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, p = xdr_encode_hyper(p, NFS_OFFSET_MAX); /* offset of next entry */ p = xdr_encode_array(p, name, namlen); /* name length & name */ - nfserr = nfsd4_encode_dirent_fattr(cd, name, namlen, p, &buflen); + nfserr = nfsd4_encode_dirent_fattr(cd, name, namlen, &p, buflen); switch (nfserr) { case nfs_ok: - p += buflen; break; case nfserr_resource: nfserr = nfserr_toosmall; @@ -2714,10 +2711,8 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 buflen = resp->end - resp->p - (COMPOUND_ERR_SLACK_SPACE >> 2); nfserr = nfsd4_encode_fattr(fhp, fhp->fh_export, fhp->fh_dentry, - resp->p, &buflen, getattr->ga_bmval, + &resp->p, buflen, getattr->ga_bmval, resp->rqstp, 0); - if (!nfserr) - resp->p += buflen; return nfserr; } diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 2cbac34..62c1ee1 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -9,22 +9,22 @@ */ #include <linux/slab.h> +#include <linux/sunrpc/addr.h> +#include <linux/highmem.h> +#include <net/checksum.h> #include "nfsd.h" #include "cache.h" -/* Size of reply cache. Common values are: - * 4.3BSD: 128 - * 4.4BSD: 256 - * Solaris2: 1024 - * DEC Unix: 512-4096 - */ -#define CACHESIZE 1024 +#define NFSDDBG_FACILITY NFSDDBG_REPCACHE + #define HASHSIZE 64 static struct hlist_head * cache_hash; static struct list_head lru_head; -static int cache_disabled = 1; +static struct kmem_cache *drc_slab; +static unsigned int num_drc_entries; +static unsigned int max_drc_entries; /* * Calculate the hash index from an XID. @@ -37,6 +37,14 @@ static inline u32 request_hash(u32 xid) } static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec); +static void cache_cleaner_func(struct work_struct *unused); +static int nfsd_reply_cache_shrink(struct shrinker *shrink, + struct shrink_control *sc); + +struct shrinker nfsd_reply_cache_shrinker = { + .shrink = nfsd_reply_cache_shrink, + .seeks = 1, +}; /* * locking for the reply cache: @@ -44,30 +52,86 @@ static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec); * Otherwise, it when accessing _prev or _next, the lock must be held. */ static DEFINE_SPINLOCK(cache_lock); +static DECLARE_DELAYED_WORK(cache_cleaner, cache_cleaner_func); -int nfsd_reply_cache_init(void) +/* + * Put a cap on the size of the DRC based on the amount of available + * low memory in the machine. + * + * 64MB: 8192 + * 128MB: 11585 + * 256MB: 16384 + * 512MB: 23170 + * 1GB: 32768 + * 2GB: 46340 + * 4GB: 65536 + * 8GB: 92681 + * 16GB: 131072 + * + * ...with a hard cap of 256k entries. In the worst case, each entry will be + * ~1k, so the above numbers should give a rough max of the amount of memory + * used in k. + */ +static unsigned int +nfsd_cache_size_limit(void) +{ + unsigned int limit; + unsigned long low_pages = totalram_pages - totalhigh_pages; + + limit = (16 * int_sqrt(low_pages)) << (PAGE_SHIFT-10); + return min_t(unsigned int, limit, 256*1024); +} + +static struct svc_cacherep * +nfsd_reply_cache_alloc(void) { struct svc_cacherep *rp; - int i; - INIT_LIST_HEAD(&lru_head); - i = CACHESIZE; - while (i) { - rp = kmalloc(sizeof(*rp), GFP_KERNEL); - if (!rp) - goto out_nomem; - list_add(&rp->c_lru, &lru_head); + rp = kmem_cache_alloc(drc_slab, GFP_KERNEL); + if (rp) { rp->c_state = RC_UNUSED; rp->c_type = RC_NOCACHE; + INIT_LIST_HEAD(&rp->c_lru); INIT_HLIST_NODE(&rp->c_hash); - i--; } + return rp; +} + +static void +nfsd_reply_cache_free_locked(struct svc_cacherep *rp) +{ + if (rp->c_type == RC_REPLBUFF) + kfree(rp->c_replvec.iov_base); + hlist_del(&rp->c_hash); + list_del(&rp->c_lru); + --num_drc_entries; + kmem_cache_free(drc_slab, rp); +} + +static void +nfsd_reply_cache_free(struct svc_cacherep *rp) +{ + spin_lock(&cache_lock); + nfsd_reply_cache_free_locked(rp); + spin_unlock(&cache_lock); +} + +int nfsd_reply_cache_init(void) +{ + register_shrinker(&nfsd_reply_cache_shrinker); + drc_slab = kmem_cache_create("nfsd_drc", sizeof(struct svc_cacherep), + 0, 0, NULL); + if (!drc_slab) + goto out_nomem; - cache_hash = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL); + cache_hash = kcalloc(HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL); if (!cache_hash) goto out_nomem; - cache_disabled = 0; + INIT_LIST_HEAD(&lru_head); + max_drc_entries = nfsd_cache_size_limit(); + num_drc_entries = 0; + return 0; out_nomem: printk(KERN_ERR "nfsd: failed to allocate reply cache\n"); @@ -79,27 +143,33 @@ void nfsd_reply_cache_shutdown(void) { struct svc_cacherep *rp; + unregister_shrinker(&nfsd_reply_cache_shrinker); + cancel_delayed_work_sync(&cache_cleaner); + while (!list_empty(&lru_head)) { rp = list_entry(lru_head.next, struct svc_cacherep, c_lru); - if (rp->c_state == RC_DONE && rp->c_type == RC_REPLBUFF) - kfree(rp->c_replvec.iov_base); - list_del(&rp->c_lru); - kfree(rp); + nfsd_reply_cache_free_locked(rp); } - cache_disabled = 1; - kfree (cache_hash); cache_hash = NULL; + + if (drc_slab) { + kmem_cache_destroy(drc_slab); + drc_slab = NULL; + } } /* - * Move cache entry to end of LRU list + * Move cache entry to end of LRU list, and queue the cleaner to run if it's + * not already scheduled. */ static void lru_put_end(struct svc_cacherep *rp) { + rp->c_timestamp = jiffies; list_move_tail(&rp->c_lru, &lru_head); + schedule_delayed_work(&cache_cleaner, RC_EXPIRE); } /* @@ -112,83 +182,214 @@ hash_refile(struct svc_cacherep *rp) hlist_add_head(&rp->c_hash, cache_hash + request_hash(rp->c_xid)); } +static inline bool +nfsd_cache_entry_expired(struct svc_cacherep *rp) +{ + return rp->c_state != RC_INPROG && + time_after(jiffies, rp->c_timestamp + RC_EXPIRE); +} + +/* + * Walk the LRU list and prune off entries that are older than RC_EXPIRE. + * Also prune the oldest ones when the total exceeds the max number of entries. + */ +static void +prune_cache_entries(void) +{ + struct svc_cacherep *rp, *tmp; + + list_for_each_entry_safe(rp, tmp, &lru_head, c_lru) { + if (!nfsd_cache_entry_expired(rp) && + num_drc_entries <= max_drc_entries) + break; + nfsd_reply_cache_free_locked(rp); + } + + /* + * Conditionally rearm the job. If we cleaned out the list, then + * cancel any pending run (since there won't be any work to do). + * Otherwise, we rearm the job or modify the existing one to run in + * RC_EXPIRE since we just ran the pruner. + */ + if (list_empty(&lru_head)) + cancel_delayed_work(&cache_cleaner); + else + mod_delayed_work(system_wq, &cache_cleaner, RC_EXPIRE); +} + +static void +cache_cleaner_func(struct work_struct *unused) +{ + spin_lock(&cache_lock); + prune_cache_entries(); + spin_unlock(&cache_lock); +} + +static int +nfsd_reply_cache_shrink(struct shrinker *shrink, struct shrink_control *sc) +{ + unsigned int num; + + spin_lock(&cache_lock); + if (sc->nr_to_scan) + prune_cache_entries(); + num = num_drc_entries; + spin_unlock(&cache_lock); + + return num; +} + +/* + * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes + */ +static __wsum +nfsd_cache_csum(struct svc_rqst *rqstp) +{ + int idx; + unsigned int base; + __wsum csum; + struct xdr_buf *buf = &rqstp->rq_arg; + const unsigned char *p = buf->head[0].iov_base; + size_t csum_len = min_t(size_t, buf->head[0].iov_len + buf->page_len, + RC_CSUMLEN); + size_t len = min(buf->head[0].iov_len, csum_len); + + /* rq_arg.head first */ + csum = csum_partial(p, len, 0); + csum_len -= len; + + /* Continue into page array */ + idx = buf->page_base / PAGE_SIZE; + base = buf->page_base & ~PAGE_MASK; + while (csum_len) { + p = page_address(buf->pages[idx]) + base; + len = min_t(size_t, PAGE_SIZE - base, csum_len); + csum = csum_partial(p, len, csum); + csum_len -= len; + base = 0; + ++idx; + } + return csum; +} + +/* + * Search the request hash for an entry that matches the given rqstp. + * Must be called with cache_lock held. Returns the found entry or + * NULL on failure. + */ +static struct svc_cacherep * +nfsd_cache_search(struct svc_rqst *rqstp, __wsum csum) +{ + struct svc_cacherep *rp; + struct hlist_head *rh; + __be32 xid = rqstp->rq_xid; + u32 proto = rqstp->rq_prot, + vers = rqstp->rq_vers, + proc = rqstp->rq_proc; + + rh = &cache_hash[request_hash(xid)]; + hlist_for_each_entry(rp, rh, c_hash) { + if (xid == rp->c_xid && proc == rp->c_proc && + proto == rp->c_prot && vers == rp->c_vers && + rqstp->rq_arg.len == rp->c_len && csum == rp->c_csum && + rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) && + rpc_get_port(svc_addr(rqstp)) == rpc_get_port((struct sockaddr *)&rp->c_addr)) + return rp; + } + return NULL; +} + /* * Try to find an entry matching the current call in the cache. When none - * is found, we grab the oldest unlocked entry off the LRU list. - * Note that no operation within the loop may sleep. + * is found, we try to grab the oldest expired entry off the LRU list. If + * a suitable one isn't there, then drop the cache_lock and allocate a + * new one, then search again in case one got inserted while this thread + * didn't hold the lock. */ int nfsd_cache_lookup(struct svc_rqst *rqstp) { - struct hlist_node *hn; - struct hlist_head *rh; - struct svc_cacherep *rp; + struct svc_cacherep *rp, *found; __be32 xid = rqstp->rq_xid; u32 proto = rqstp->rq_prot, vers = rqstp->rq_vers, proc = rqstp->rq_proc; + __wsum csum; unsigned long age; int type = rqstp->rq_cachetype; int rtn; rqstp->rq_cacherep = NULL; - if (cache_disabled || type == RC_NOCACHE) { + if (type == RC_NOCACHE) { nfsdstats.rcnocache++; return RC_DOIT; } + csum = nfsd_cache_csum(rqstp); + spin_lock(&cache_lock); rtn = RC_DOIT; - rh = &cache_hash[request_hash(xid)]; - hlist_for_each_entry(rp, hn, rh, c_hash) { - if (rp->c_state != RC_UNUSED && - xid == rp->c_xid && proc == rp->c_proc && - proto == rp->c_prot && vers == rp->c_vers && - time_before(jiffies, rp->c_timestamp + 120*HZ) && - memcmp((char*)&rqstp->rq_addr, (char*)&rp->c_addr, sizeof(rp->c_addr))==0) { - nfsdstats.rchits++; - goto found_entry; + rp = nfsd_cache_search(rqstp, csum); + if (rp) + goto found_entry; + + /* Try to use the first entry on the LRU */ + if (!list_empty(&lru_head)) { + rp = list_first_entry(&lru_head, struct svc_cacherep, c_lru); + if (nfsd_cache_entry_expired(rp) || + num_drc_entries >= max_drc_entries) { + lru_put_end(rp); + prune_cache_entries(); + goto setup_entry; } } - nfsdstats.rcmisses++; - /* This loop shouldn't take more than a few iterations normally */ - { - int safe = 0; - list_for_each_entry(rp, &lru_head, c_lru) { - if (rp->c_state != RC_INPROG) - break; - if (safe++ > CACHESIZE) { - printk("nfsd: loop in repcache LRU list\n"); - cache_disabled = 1; - goto out; - } + /* Drop the lock and allocate a new entry */ + spin_unlock(&cache_lock); + rp = nfsd_reply_cache_alloc(); + if (!rp) { + dprintk("nfsd: unable to allocate DRC entry!\n"); + return RC_DOIT; } + spin_lock(&cache_lock); + ++num_drc_entries; + + /* + * Must search again just in case someone inserted one + * after we dropped the lock above. + */ + found = nfsd_cache_search(rqstp, csum); + if (found) { + nfsd_reply_cache_free_locked(rp); + rp = found; + goto found_entry; } - /* All entries on the LRU are in-progress. This should not happen */ - if (&rp->c_lru == &lru_head) { - static int complaints; - - printk(KERN_WARNING "nfsd: all repcache entries locked!\n"); - if (++complaints > 5) { - printk(KERN_WARNING "nfsd: disabling repcache.\n"); - cache_disabled = 1; - } - goto out; - } + /* + * We're keeping the one we just allocated. Are we now over the + * limit? Prune one off the tip of the LRU in trade for the one we + * just allocated if so. + */ + if (num_drc_entries >= max_drc_entries) + nfsd_reply_cache_free_locked(list_first_entry(&lru_head, + struct svc_cacherep, c_lru)); +setup_entry: + nfsdstats.rcmisses++; rqstp->rq_cacherep = rp; rp->c_state = RC_INPROG; rp->c_xid = xid; rp->c_proc = proc; - memcpy(&rp->c_addr, svc_addr_in(rqstp), sizeof(rp->c_addr)); + rpc_copy_addr((struct sockaddr *)&rp->c_addr, svc_addr(rqstp)); + rpc_set_port((struct sockaddr *)&rp->c_addr, rpc_get_port(svc_addr(rqstp))); rp->c_prot = proto; rp->c_vers = vers; - rp->c_timestamp = jiffies; + rp->c_len = rqstp->rq_arg.len; + rp->c_csum = csum; hash_refile(rp); + lru_put_end(rp); /* release any buffer */ if (rp->c_type == RC_REPLBUFF) { @@ -201,9 +402,9 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) return rtn; found_entry: + nfsdstats.rchits++; /* We found a matching entry which is either in progress or done. */ age = jiffies - rp->c_timestamp; - rp->c_timestamp = jiffies; lru_put_end(rp); rtn = RC_DROPIT; @@ -232,7 +433,7 @@ found_entry: break; default: printk(KERN_WARNING "nfsd: bad repcache type %d\n", rp->c_type); - rp->c_state = RC_UNUSED; + nfsd_reply_cache_free_locked(rp); } goto out; @@ -257,11 +458,11 @@ found_entry: void nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) { - struct svc_cacherep *rp; + struct svc_cacherep *rp = rqstp->rq_cacherep; struct kvec *resv = &rqstp->rq_res.head[0], *cachv; int len; - if (!(rp = rqstp->rq_cacherep) || cache_disabled) + if (!rp) return; len = resv->iov_len - ((char*)statp - (char*)resv->iov_base); @@ -269,7 +470,7 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) /* Don't cache excessive amounts of data and XDR failures */ if (!statp || len > (256 >> 2)) { - rp->c_state = RC_UNUSED; + nfsd_reply_cache_free(rp); return; } @@ -283,21 +484,21 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) cachv = &rp->c_replvec; cachv->iov_base = kmalloc(len << 2, GFP_KERNEL); if (!cachv->iov_base) { - spin_lock(&cache_lock); - rp->c_state = RC_UNUSED; - spin_unlock(&cache_lock); + nfsd_reply_cache_free(rp); return; } cachv->iov_len = len << 2; memcpy(cachv->iov_base, statp, len << 2); break; + case RC_NOCACHE: + nfsd_reply_cache_free(rp); + return; } spin_lock(&cache_lock); lru_put_end(rp); rp->c_secure = rqstp->rq_secure; rp->c_type = cachetype; rp->c_state = RC_DONE; - rp->c_timestamp = jiffies; spin_unlock(&cache_lock); return; } diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 2db7021..13a21c8 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -10,7 +10,7 @@ #include <linux/sunrpc/svcsock.h> #include <linux/lockd/lockd.h> -#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h> #include <linux/sunrpc/gss_api.h> #include <linux/sunrpc/gss_krb5_enctypes.h> #include <linux/sunrpc/rpc_pipe_fs.h> @@ -125,11 +125,11 @@ static const struct file_operations transaction_ops = { .llseek = default_llseek, }; -static int exports_open(struct inode *inode, struct file *file) +static int exports_net_open(struct net *net, struct file *file) { int err; struct seq_file *seq; - struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); err = seq_open(file, &nfs_exports_op); if (err) @@ -140,8 +140,26 @@ static int exports_open(struct inode *inode, struct file *file) return 0; } -static const struct file_operations exports_operations = { - .open = exports_open, +static int exports_proc_open(struct inode *inode, struct file *file) +{ + return exports_net_open(current->nsproxy->net_ns, file); +} + +static const struct file_operations exports_proc_operations = { + .open = exports_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .owner = THIS_MODULE, +}; + +static int exports_nfsd_open(struct inode *inode, struct file *file) +{ + return exports_net_open(inode->i_sb->s_fs_info, file); +} + +static const struct file_operations exports_nfsd_operations = { + .open = exports_nfsd_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, @@ -220,6 +238,7 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size) struct sockaddr *sap = (struct sockaddr *)&address; size_t salen = sizeof(address); char *fo_path; + struct net *net = file->f_dentry->d_sb->s_fs_info; /* sanity check */ if (size == 0) @@ -232,7 +251,7 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size) if (qword_get(&buf, fo_path, size) < 0) return -EINVAL; - if (rpc_pton(&init_net, fo_path, size, sap, salen) == 0) + if (rpc_pton(net, fo_path, size, sap, salen) == 0) return -EINVAL; return nlmsvc_unlock_all_by_ip(sap); @@ -317,6 +336,7 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size) int len; struct auth_domain *dom; struct knfsd_fh fh; + struct net *net = file->f_dentry->d_sb->s_fs_info; if (size == 0) return -EINVAL; @@ -352,7 +372,7 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size) if (!dom) return -ENOMEM; - len = exp_rootfh(&init_net, dom, path, &fh, maxsize); + len = exp_rootfh(net, dom, path, &fh, maxsize); auth_domain_put(dom); if (len) return len; @@ -396,7 +416,7 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size) { char *mesg = buf; int rv; - struct net *net = &init_net; + struct net *net = file->f_dentry->d_sb->s_fs_info; if (size > 0) { int newthreads; @@ -447,7 +467,7 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) int len; int npools; int *nthreads; - struct net *net = &init_net; + struct net *net = file->f_dentry->d_sb->s_fs_info; mutex_lock(&nfsd_mutex); npools = nfsd_nrpools(net); @@ -510,7 +530,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) unsigned minor; ssize_t tlen = 0; char *sep; - struct net *net = &init_net; + struct net *net = file->f_dentry->d_sb->s_fs_info; struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (size>0) { @@ -534,7 +554,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) else num = simple_strtol(vers, &minorp, 0); if (*minorp == '.') { - if (num < 4) + if (num != 4) return -EINVAL; minor = simple_strtoul(minorp+1, NULL, 0); if (minor == 0) @@ -792,7 +812,7 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size, static ssize_t write_ports(struct file *file, char *buf, size_t size) { ssize_t rv; - struct net *net = &init_net; + struct net *net = file->f_dentry->d_sb->s_fs_info; mutex_lock(&nfsd_mutex); rv = __write_ports(file, buf, size, net); @@ -827,7 +847,7 @@ int nfsd_max_blksize; static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) { char *mesg = buf; - struct net *net = &init_net; + struct net *net = file->f_dentry->d_sb->s_fs_info; struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (size > 0) { @@ -923,7 +943,8 @@ static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, */ static ssize_t write_leasetime(struct file *file, char *buf, size_t size) { - struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); + struct net *net = file->f_dentry->d_sb->s_fs_info; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); return nfsd4_write_time(file, buf, size, &nn->nfsd4_lease, nn); } @@ -939,7 +960,8 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size) */ static ssize_t write_gracetime(struct file *file, char *buf, size_t size) { - struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); + struct net *net = file->f_dentry->d_sb->s_fs_info; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); return nfsd4_write_time(file, buf, size, &nn->nfsd4_grace, nn); } @@ -995,7 +1017,8 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size, static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) { ssize_t rv; - struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); + struct net *net = file->f_dentry->d_sb->s_fs_info; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); mutex_lock(&nfsd_mutex); rv = __write_recoverydir(file, buf, size, nn); @@ -1013,7 +1036,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) static int nfsd_fill_super(struct super_block * sb, void * data, int silent) { static struct tree_descr nfsd_files[] = { - [NFSD_List] = {"exports", &exports_operations, S_IRUGO}, + [NFSD_List] = {"exports", &exports_nfsd_operations, S_IRUGO}, [NFSD_Export_features] = {"export_features", &export_features_operations, S_IRUGO}, [NFSD_FO_UnlockIP] = {"unlock_ip", @@ -1037,20 +1060,35 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) #endif /* last one */ {""} }; - return simple_fill_super(sb, 0x6e667364, nfsd_files); + struct net *net = data; + int ret; + + ret = simple_fill_super(sb, 0x6e667364, nfsd_files); + if (ret) + return ret; + sb->s_fs_info = get_net(net); + return 0; } static struct dentry *nfsd_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_single(fs_type, flags, data, nfsd_fill_super); + return mount_ns(fs_type, flags, current->nsproxy->net_ns, nfsd_fill_super); +} + +static void nfsd_umount(struct super_block *sb) +{ + struct net *net = sb->s_fs_info; + + kill_litter_super(sb); + put_net(net); } static struct file_system_type nfsd_fs_type = { .owner = THIS_MODULE, .name = "nfsd", .mount = nfsd_mount, - .kill_sb = kill_litter_super, + .kill_sb = nfsd_umount, }; #ifdef CONFIG_PROC_FS @@ -1061,7 +1099,8 @@ static int create_proc_exports_entry(void) entry = proc_mkdir("fs/nfs", NULL); if (!entry) return -ENOMEM; - entry = proc_create("exports", 0, entry, &exports_operations); + entry = proc_create("exports", 0, entry, + &exports_proc_operations); if (!entry) return -ENOMEM; return 0; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index be7af50..262df5c 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -652,7 +652,6 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) /* Check whether we have this call in the cache. */ switch (nfsd_cache_lookup(rqstp)) { - case RC_INTR: case RC_DROPIT: return 0; case RC_REPLY: @@ -703,8 +702,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) int nfsd_pool_stats_open(struct inode *inode, struct file *file) { int ret; - struct net *net = &init_net; - struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct nfsd_net *nn = net_generic(inode->i_sb->s_fs_info, nfsd_net_id); mutex_lock(&nfsd_mutex); if (nn->nfsd_serv == NULL) { @@ -721,7 +719,7 @@ int nfsd_pool_stats_open(struct inode *inode, struct file *file) int nfsd_pool_stats_release(struct inode *inode, struct file *file) { int ret = seq_release(inode, file); - struct net *net = &init_net; + struct net *net = inode->i_sb->s_fs_info; mutex_lock(&nfsd_mutex); /* this function really, really should have been called svc_put() */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 0889bfb..546f898 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -563,7 +563,7 @@ __be32 nfsd4_check_resp_size(struct nfsd4_compoundres *, u32); void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *); void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op); __be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, - struct dentry *dentry, __be32 *buffer, int *countp, + struct dentry *dentry, __be32 **buffer, int countp, u32 *bmval, struct svc_rqst *, int ignore_crossmnt); extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *, diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 6baadb5..4bb21d6 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -52,7 +52,6 @@ void __fsnotify_vfsmount_delete(struct vfsmount *mnt) void __fsnotify_update_child_dentry_flags(struct inode *inode) { struct dentry *alias; - struct hlist_node *p; int watched; if (!S_ISDIR(inode->i_mode)) @@ -64,7 +63,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode) spin_lock(&inode->i_lock); /* run all of the dentries associated with this inode. Since this is a * directory, there damn well better only be one item on this list */ - hlist_for_each_entry(alias, p, &inode->i_dentry, d_alias) { + hlist_for_each_entry(alias, &inode->i_dentry, d_alias) { struct dentry *child; /* run all of the children of the original inode and fix their diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index f31e90f..74825be 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -36,12 +36,11 @@ static void fsnotify_recalc_inode_mask_locked(struct inode *inode) { struct fsnotify_mark *mark; - struct hlist_node *pos; __u32 new_mask = 0; assert_spin_locked(&inode->i_lock); - hlist_for_each_entry(mark, pos, &inode->i_fsnotify_marks, i.i_list) + hlist_for_each_entry(mark, &inode->i_fsnotify_marks, i.i_list) new_mask |= mark->mask; inode->i_fsnotify_mask = new_mask; } @@ -87,11 +86,11 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) void fsnotify_clear_marks_by_inode(struct inode *inode) { struct fsnotify_mark *mark, *lmark; - struct hlist_node *pos, *n; + struct hlist_node *n; LIST_HEAD(free_list); spin_lock(&inode->i_lock); - hlist_for_each_entry_safe(mark, pos, n, &inode->i_fsnotify_marks, i.i_list) { + hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify_marks, i.i_list) { list_add(&mark->i.free_i_list, &free_list); hlist_del_init_rcu(&mark->i.i_list); fsnotify_get_mark(mark); @@ -129,11 +128,10 @@ static struct fsnotify_mark *fsnotify_find_inode_mark_locked( struct inode *inode) { struct fsnotify_mark *mark; - struct hlist_node *pos; assert_spin_locked(&inode->i_lock); - hlist_for_each_entry(mark, pos, &inode->i_fsnotify_marks, i.i_list) { + hlist_for_each_entry(mark, &inode->i_fsnotify_marks, i.i_list) { if (mark->group == group) { fsnotify_get_mark(mark); return mark; @@ -194,8 +192,7 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, struct inode *inode, int allow_dups) { - struct fsnotify_mark *lmark; - struct hlist_node *node, *last = NULL; + struct fsnotify_mark *lmark, *last = NULL; int ret = 0; mark->flags |= FSNOTIFY_MARK_FLAG_INODE; @@ -214,8 +211,8 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, } /* should mark be in the middle of the current list? */ - hlist_for_each_entry(lmark, node, &inode->i_fsnotify_marks, i.i_list) { - last = node; + hlist_for_each_entry(lmark, &inode->i_fsnotify_marks, i.i_list) { + last = lmark; if ((lmark->group == group) && !allow_dups) { ret = -EEXIST; @@ -235,7 +232,7 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, BUG_ON(last == NULL); /* mark should be the last entry. last is the current last entry */ - hlist_add_after_rcu(last, &mark->i.i_list); + hlist_add_after_rcu(&last->i.i_list, &mark->i.i_list); out: fsnotify_recalc_inode_mask_locked(inode); spin_unlock(&inode->i_lock); diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 871569c..4216308 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -197,7 +197,6 @@ static void inotify_free_group_priv(struct fsnotify_group *group) { /* ideally the idr is empty and we won't hit the BUG in the callback */ idr_for_each(&group->inotify_data.idr, idr_callback, group); - idr_remove_all(&group->inotify_data.idr); idr_destroy(&group->inotify_data.idr); atomic_dec(&group->inotify_data.user->inotify_devs); free_uid(group->inotify_data.user); diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 07f7a92..e0f7c12 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -364,22 +364,20 @@ static int inotify_add_to_idr(struct idr *idr, spinlock_t *idr_lock, { int ret; - do { - if (unlikely(!idr_pre_get(idr, GFP_KERNEL))) - return -ENOMEM; + idr_preload(GFP_KERNEL); + spin_lock(idr_lock); - spin_lock(idr_lock); - ret = idr_get_new_above(idr, i_mark, *last_wd + 1, - &i_mark->wd); + ret = idr_alloc(idr, i_mark, *last_wd + 1, 0, GFP_NOWAIT); + if (ret >= 0) { /* we added the mark to the idr, take a reference */ - if (!ret) { - *last_wd = i_mark->wd; - fsnotify_get_mark(&i_mark->fsn_mark); - } - spin_unlock(idr_lock); - } while (ret == -EAGAIN); + i_mark->wd = ret; + *last_wd = i_mark->wd; + fsnotify_get_mark(&i_mark->fsn_mark); + } - return ret; + spin_unlock(idr_lock); + idr_preload_end(); + return ret < 0 ? ret : 0; } static struct inotify_inode_mark *inotify_idr_find_locked(struct fsnotify_group *group, diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index 4df58b8..68ca5a8 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -33,12 +33,12 @@ void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) { struct fsnotify_mark *mark, *lmark; - struct hlist_node *pos, *n; + struct hlist_node *n; struct mount *m = real_mount(mnt); LIST_HEAD(free_list); spin_lock(&mnt->mnt_root->d_lock); - hlist_for_each_entry_safe(mark, pos, n, &m->mnt_fsnotify_marks, m.m_list) { + hlist_for_each_entry_safe(mark, n, &m->mnt_fsnotify_marks, m.m_list) { list_add(&mark->m.free_m_list, &free_list); hlist_del_init_rcu(&mark->m.m_list); fsnotify_get_mark(mark); @@ -71,12 +71,11 @@ static void fsnotify_recalc_vfsmount_mask_locked(struct vfsmount *mnt) { struct mount *m = real_mount(mnt); struct fsnotify_mark *mark; - struct hlist_node *pos; __u32 new_mask = 0; assert_spin_locked(&mnt->mnt_root->d_lock); - hlist_for_each_entry(mark, pos, &m->mnt_fsnotify_marks, m.m_list) + hlist_for_each_entry(mark, &m->mnt_fsnotify_marks, m.m_list) new_mask |= mark->mask; m->mnt_fsnotify_mask = new_mask; } @@ -114,11 +113,10 @@ static struct fsnotify_mark *fsnotify_find_vfsmount_mark_locked(struct fsnotify_ { struct mount *m = real_mount(mnt); struct fsnotify_mark *mark; - struct hlist_node *pos; assert_spin_locked(&mnt->mnt_root->d_lock); - hlist_for_each_entry(mark, pos, &m->mnt_fsnotify_marks, m.m_list) { + hlist_for_each_entry(mark, &m->mnt_fsnotify_marks, m.m_list) { if (mark->group == group) { fsnotify_get_mark(mark); return mark; @@ -153,8 +151,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, int allow_dups) { struct mount *m = real_mount(mnt); - struct fsnotify_mark *lmark; - struct hlist_node *node, *last = NULL; + struct fsnotify_mark *lmark, *last = NULL; int ret = 0; mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT; @@ -173,8 +170,8 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, } /* should mark be in the middle of the current list? */ - hlist_for_each_entry(lmark, node, &m->mnt_fsnotify_marks, m.m_list) { - last = node; + hlist_for_each_entry(lmark, &m->mnt_fsnotify_marks, m.m_list) { + last = lmark; if ((lmark->group == group) && !allow_dups) { ret = -EEXIST; @@ -194,7 +191,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, BUG_ON(last == NULL); /* mark should be the last entry. last is the current last entry */ - hlist_add_after_rcu(last, &mark->m.m_list); + hlist_add_after_rcu(&last->m.m_list, &mark->m.m_list); out: fsnotify_recalc_vfsmount_mask_locked(mnt); spin_unlock(&mnt->mnt_root->d_lock); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 0d2bf56..aa88bd8 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -304,28 +304,22 @@ static u8 o2net_num_from_nn(struct o2net_node *nn) static int o2net_prep_nsw(struct o2net_node *nn, struct o2net_status_wait *nsw) { - int ret = 0; - - do { - if (!idr_pre_get(&nn->nn_status_idr, GFP_ATOMIC)) { - ret = -EAGAIN; - break; - } - spin_lock(&nn->nn_lock); - ret = idr_get_new(&nn->nn_status_idr, nsw, &nsw->ns_id); - if (ret == 0) - list_add_tail(&nsw->ns_node_item, - &nn->nn_status_list); - spin_unlock(&nn->nn_lock); - } while (ret == -EAGAIN); + int ret; - if (ret == 0) { - init_waitqueue_head(&nsw->ns_wq); - nsw->ns_sys_status = O2NET_ERR_NONE; - nsw->ns_status = 0; + spin_lock(&nn->nn_lock); + ret = idr_alloc(&nn->nn_status_idr, nsw, 0, 0, GFP_ATOMIC); + if (ret >= 0) { + nsw->ns_id = ret; + list_add_tail(&nsw->ns_node_item, &nn->nn_status_list); } + spin_unlock(&nn->nn_lock); + if (ret < 0) + return ret; - return ret; + init_waitqueue_head(&nsw->ns_wq); + nsw->ns_sys_status = O2NET_ERR_NONE; + nsw->ns_status = 0; + return 0; } static void o2net_complete_nsw_locked(struct o2net_node *nn, diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index 8db4b58..ef99972 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -169,11 +169,10 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno, int skip_unhashed) { - struct hlist_node *p; struct dentry *dentry; spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) { + hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) { spin_lock(&dentry->d_lock); if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) { trace_ocfs2_find_local_alias(dentry->d_name.len, diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 01ebfd0..eeac97b 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2083,7 +2083,6 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, u8 dead_node, u8 new_master) { int i; - struct hlist_node *hash_iter; struct hlist_head *bucket; struct dlm_lock_resource *res, *next; @@ -2114,7 +2113,7 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, * if necessary */ for (i = 0; i < DLM_HASH_BUCKETS; i++) { bucket = dlm_lockres_hash(dlm, i); - hlist_for_each_entry(res, hash_iter, bucket, hash_node) { + hlist_for_each_entry(res, bucket, hash_node) { if (!(res->state & DLM_LOCK_RES_RECOVERING)) continue; @@ -2273,7 +2272,6 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) { - struct hlist_node *iter; struct dlm_lock_resource *res; int i; struct hlist_head *bucket; @@ -2299,7 +2297,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) */ for (i = 0; i < DLM_HASH_BUCKETS; i++) { bucket = dlm_lockres_hash(dlm, i); - hlist_for_each_entry(res, iter, bucket, hash_node) { + hlist_for_each_entry(res, bucket, hash_node) { /* always prune any $RECOVERY entries for dead nodes, * otherwise hangs can occur during later recovery */ if (dlm_is_recovery_lock(res->lockname.name, diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index f169da4..b7e74b5 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -642,7 +642,7 @@ ocfs2_block_group_alloc_discontig(handle_t *handle, * cluster groups will be staying in cache for the duration of * this operation. */ - ac->ac_allow_chain_relink = 0; + ac->ac_disable_chain_relink = 1; /* Claim the first region */ status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits, @@ -1823,7 +1823,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, * Do this *after* figuring out how many bits we're taking out * of our target group. */ - if (ac->ac_allow_chain_relink && + if (!ac->ac_disable_chain_relink && (prev_group_bh) && (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) { status = ocfs2_relink_block_group(handle, alloc_inode, @@ -1928,7 +1928,6 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, victim = ocfs2_find_victim_chain(cl); ac->ac_chain = victim; - ac->ac_allow_chain_relink = 1; status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, res, &bits_left); @@ -1947,7 +1946,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, * searching each chain in order. Don't allow chain relinking * because we only calculate enough journal credits for one * relink per alloc. */ - ac->ac_allow_chain_relink = 0; + ac->ac_disable_chain_relink = 1; for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) { if (i == victim) continue; diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index b8afabf..a36d0aa 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -49,7 +49,7 @@ struct ocfs2_alloc_context { /* these are used by the chain search */ u16 ac_chain; - int ac_allow_chain_relink; + int ac_disable_chain_relink; group_search_t *ac_group_search; u64 ac_last_group; diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 0ba9ea1..2e3ea30 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7189,7 +7189,7 @@ int ocfs2_init_security_and_acl(struct inode *dir, struct buffer_head *dir_bh = NULL; ret = ocfs2_init_security_get(inode, dir, qstr, NULL); - if (!ret) { + if (ret) { mlog_errno(ret); goto leave; } @@ -30,6 +30,7 @@ #include <linux/fs_struct.h> #include <linux/ima.h> #include <linux/dnotify.h> +#include <linux/compat.h> #include "internal.h" @@ -140,6 +141,13 @@ SYSCALL_DEFINE2(truncate, const char __user *, path, long, length) return do_sys_truncate(path, length); } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length) +{ + return do_sys_truncate(path, length); +} +#endif + static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) { struct inode *inode; @@ -195,6 +203,13 @@ SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length) return ret; } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length) +{ + return do_sys_ftruncate(fd, length, 1); +} +#endif + /* LFS versions of truncate are only needed on 32 bit machines */ #if BITS_PER_LONG == 32 SYSCALL_DEFINE(truncate64)(const char __user * path, loff_t length) diff --git a/fs/proc/base.c b/fs/proc/base.c index f3b133d..69078c7 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -73,6 +73,7 @@ #include <linux/security.h> #include <linux/ptrace.h> #include <linux/tracehook.h> +#include <linux/printk.h> #include <linux/cgroup.h> #include <linux/cpuset.h> #include <linux/audit.h> @@ -952,7 +953,7 @@ static ssize_t oom_adj_write(struct file *file, const char __user *buf, * /proc/pid/oom_adj is provided for legacy purposes, ask users to use * /proc/pid/oom_score_adj instead. */ - printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", + pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", current->comm, task_pid_nr(current), task_pid_nr(task), task_pid_nr(task)); diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 2983dc5..4b3b3ff 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -15,6 +15,7 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/slab.h> +#include <linux/printk.h> #include <linux/mount.h> #include <linux/init.h> #include <linux/idr.h> @@ -132,11 +133,8 @@ __proc_file_read(struct file *file, char __user *buf, size_t nbytes, } if (start == NULL) { - if (n > PAGE_SIZE) { - printk(KERN_ERR - "proc_file_read: Apparent buffer overflow!\n"); + if (n > PAGE_SIZE) /* Apparent buffer overflow */ n = PAGE_SIZE; - } n -= *ppos; if (n <= 0) break; @@ -144,26 +142,19 @@ __proc_file_read(struct file *file, char __user *buf, size_t nbytes, n = count; start = page + *ppos; } else if (start < page) { - if (n > PAGE_SIZE) { - printk(KERN_ERR - "proc_file_read: Apparent buffer overflow!\n"); + if (n > PAGE_SIZE) /* Apparent buffer overflow */ n = PAGE_SIZE; - } if (n > count) { /* * Don't reduce n because doing so might * cut off part of a data block. */ - printk(KERN_WARNING - "proc_file_read: Read count exceeded\n"); + pr_warn("proc_file_read: count exceeded\n"); } } else /* start >= page */ { unsigned long startoff = (unsigned long)(start - page); - if (n > (PAGE_SIZE - startoff)) { - printk(KERN_ERR - "proc_file_read: Apparent buffer overflow!\n"); + if (n > (PAGE_SIZE - startoff)) /* buffer overflow? */ n = PAGE_SIZE - startoff; - } if (n > count) n = count; } @@ -569,7 +560,7 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp for (tmp = dir->subdir; tmp; tmp = tmp->next) if (strcmp(tmp->name, dp->name) == 0) { - WARN(1, KERN_WARNING "proc_dir_entry '%s/%s' already registered\n", + WARN(1, "proc_dir_entry '%s/%s' already registered\n", dir->name, dp->name); break; } @@ -830,9 +821,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) if (S_ISDIR(de->mode)) parent->nlink--; de->nlink = 0; - WARN(de->subdir, KERN_WARNING "%s: removing non-empty directory " - "'%s/%s', leaking at least '%s'\n", __func__, - de->parent->name, de->name, de->subdir->name); + WARN(de->subdir, "%s: removing non-empty directory " + "'%s/%s', leaking at least '%s'\n", __func__, + de->parent->name, de->name, de->subdir->name); pde_put(de); } EXPORT_SYMBOL(remove_proc_entry); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 70322e1..a86aebc 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -13,6 +13,7 @@ #include <linux/stat.h> #include <linux/completion.h> #include <linux/poll.h> +#include <linux/printk.h> #include <linux/file.h> #include <linux/limits.h> #include <linux/init.h> @@ -495,13 +496,13 @@ int proc_fill_super(struct super_block *s) pde_get(&proc_root); root_inode = proc_get_inode(s, &proc_root); if (!root_inode) { - printk(KERN_ERR "proc_fill_super: get root inode failed\n"); + pr_err("proc_fill_super: get root inode failed\n"); return -ENOMEM; } s->s_root = d_make_root(root_inode); if (!s->s_root) { - printk(KERN_ERR "proc_fill_super: allocate dentry failed\n"); + pr_err("proc_fill_super: allocate dentry failed\n"); return -ENOMEM; } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 252544c..85ff3a4 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -11,6 +11,7 @@ #include <linux/sched.h> #include <linux/proc_fs.h> +#include <linux/binfmts.h> struct ctl_table_header; struct mempolicy; @@ -108,7 +109,7 @@ static inline int task_dumpable(struct task_struct *task) if (mm) dumpable = get_dumpable(mm); task_unlock(task); - if (dumpable == SUID_DUMPABLE_ENABLED) + if (dumpable == SUID_DUMP_USER) return 1; return 0; } diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index e96d4f1..eda6f01 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -17,6 +17,7 @@ #include <linux/elfcore.h> #include <linux/vmalloc.h> #include <linux/highmem.h> +#include <linux/printk.h> #include <linux/bootmem.h> #include <linux/init.h> #include <linux/slab.h> @@ -619,7 +620,7 @@ static int __init proc_kcore_init(void) proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, &proc_kcore_operations); if (!proc_root_kcore) { - printk(KERN_ERR "couldn't create /proc/kcore\n"); + pr_err("couldn't create /proc/kcore\n"); return 0; /* Always returns 0. */ } /* Store text area if it's special */ diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c index de20ec4..30b590f 100644 --- a/fs/proc/proc_devtree.c +++ b/fs/proc/proc_devtree.c @@ -8,6 +8,7 @@ #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/printk.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/of.h> @@ -110,8 +111,8 @@ void proc_device_tree_update_prop(struct proc_dir_entry *pde, if (ent->data == oldprop) break; if (ent == NULL) { - printk(KERN_WARNING "device-tree: property \"%s\" " - " does not exist\n", oldprop->name); + pr_warn("device-tree: property \"%s\" does not exist\n", + oldprop->name); } else { ent->data = newprop; ent->size = newprop->length; @@ -153,8 +154,8 @@ static const char *fixup_name(struct device_node *np, struct proc_dir_entry *de, realloc: fixed_name = kmalloc(fixup_len, GFP_KERNEL); if (fixed_name == NULL) { - printk(KERN_ERR "device-tree: Out of memory trying to fixup " - "name \"%s\"\n", name); + pr_err("device-tree: Out of memory trying to fixup " + "name \"%s\"\n", name); return name; } @@ -175,8 +176,8 @@ retry: goto retry; } - printk(KERN_WARNING "device-tree: Duplicate name in %s, " - "renamed to \"%s\"\n", np->full_name, fixed_name); + pr_warn("device-tree: Duplicate name in %s, renamed to \"%s\"\n", + np->full_name, fixed_name); return fixed_name; } diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 612df79..ac05f33 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -5,6 +5,7 @@ #include <linux/sysctl.h> #include <linux/poll.h> #include <linux/proc_fs.h> +#include <linux/printk.h> #include <linux/security.h> #include <linux/sched.h> #include <linux/namei.h> @@ -57,7 +58,7 @@ static void sysctl_print_dir(struct ctl_dir *dir) { if (dir->header.parent) sysctl_print_dir(dir->header.parent); - printk(KERN_CONT "%s/", dir->header.ctl_table[0].procname); + pr_cont("%s/", dir->header.ctl_table[0].procname); } static int namecmp(const char *name1, int len1, const char *name2, int len2) @@ -134,9 +135,9 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry) else if (cmp > 0) p = &(*p)->rb_right; else { - printk(KERN_ERR "sysctl duplicate entry: "); + pr_err("sysctl duplicate entry: "); sysctl_print_dir(head->parent); - printk(KERN_CONT "/%s\n", entry->procname); + pr_cont("/%s\n", entry->procname); return -EEXIST; } } @@ -927,9 +928,9 @@ found: subdir->header.nreg++; failed: if (unlikely(IS_ERR(subdir))) { - printk(KERN_ERR "sysctl could not get directory: "); + pr_err("sysctl could not get directory: "); sysctl_print_dir(dir); - printk(KERN_CONT "/%*.*s %ld\n", + pr_cont("/%*.*s %ld\n", namelen, namelen, name, PTR_ERR(subdir)); } drop_sysctl_table(&dir->header); @@ -995,8 +996,8 @@ static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...) vaf.fmt = fmt; vaf.va = &args; - printk(KERN_ERR "sysctl table check failed: %s/%s %pV\n", - path, table->procname, &vaf); + pr_err("sysctl table check failed: %s/%s %pV\n", + path, table->procname, &vaf); va_end(args); return -EINVAL; @@ -1510,9 +1511,9 @@ static void put_links(struct ctl_table_header *header) drop_sysctl_table(link_head); } else { - printk(KERN_ERR "sysctl link missing during unregister: "); + pr_err("sysctl link missing during unregister: "); sysctl_print_dir(parent); - printk(KERN_CONT "/%s\n", name); + pr_cont("/%s\n", name); } } } diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 0d5071d..b870f74 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -15,6 +15,7 @@ #include <linux/export.h> #include <linux/slab.h> #include <linux/highmem.h> +#include <linux/printk.h> #include <linux/bootmem.h> #include <linux/init.h> #include <linux/crash_dump.h> @@ -175,15 +176,15 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, start = map_offset_to_paddr(*fpos, &vmcore_list, &curr_m); if (!curr_m) return -EINVAL; - if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen) - tsz = buflen; - - /* Calculate left bytes in current memory segment. */ - nr_bytes = (curr_m->size - (start - curr_m->paddr)); - if (tsz > nr_bytes) - tsz = nr_bytes; while (buflen) { + tsz = min_t(size_t, buflen, PAGE_SIZE - (start & ~PAGE_MASK)); + + /* Calculate left bytes in current memory segment. */ + nr_bytes = (curr_m->size - (start - curr_m->paddr)); + if (tsz > nr_bytes) + tsz = nr_bytes; + tmp = read_from_oldmem(buffer, tsz, &start, 1); if (tmp < 0) return tmp; @@ -198,12 +199,6 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, struct vmcore, list); start = curr_m->paddr; } - if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen) - tsz = buflen; - /* Calculate left bytes in current memory segment. */ - nr_bytes = (curr_m->size - (start - curr_m->paddr)); - if (tsz > nr_bytes) - tsz = nr_bytes; } return acc; } @@ -553,8 +548,7 @@ static int __init parse_crash_elf64_headers(void) ehdr.e_ehsize != sizeof(Elf64_Ehdr) || ehdr.e_phentsize != sizeof(Elf64_Phdr) || ehdr.e_phnum == 0) { - printk(KERN_WARNING "Warning: Core image elf header is not" - "sane\n"); + pr_warn("Warning: Core image elf header is not sane\n"); return -EINVAL; } @@ -609,8 +603,7 @@ static int __init parse_crash_elf32_headers(void) ehdr.e_ehsize != sizeof(Elf32_Ehdr) || ehdr.e_phentsize != sizeof(Elf32_Phdr) || ehdr.e_phnum == 0) { - printk(KERN_WARNING "Warning: Core image elf header is not" - "sane\n"); + pr_warn("Warning: Core image elf header is not sane\n"); return -EINVAL; } @@ -653,8 +646,7 @@ static int __init parse_crash_elf_headers(void) if (rc < 0) return rc; if (memcmp(e_ident, ELFMAG, SELFMAG) != 0) { - printk(KERN_WARNING "Warning: Core image elf header" - " not found\n"); + pr_warn("Warning: Core image elf header not found\n"); return -EINVAL; } @@ -673,8 +665,7 @@ static int __init parse_crash_elf_headers(void) /* Determine vmcore size. */ vmcore_size = get_vmcore_size_elf32(elfcorebuf); } else { - printk(KERN_WARNING "Warning: Core image elf header is not" - " sane\n"); + pr_warn("Warning: Core image elf header is not sane\n"); return -EINVAL; } return 0; @@ -690,7 +681,7 @@ static int __init vmcore_init(void) return rc; rc = parse_crash_elf_headers(); if (rc) { - printk(KERN_WARNING "Kdump: vmcore not initialized\n"); + pr_warn("Kdump: vmcore not initialized\n"); return rc; } diff --git a/fs/read_write.c b/fs/read_write.c index 3ae6dbe..a698eff 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -15,6 +15,7 @@ #include <linux/syscalls.h> #include <linux/pagemap.h> #include <linux/splice.h> +#include <linux/compat.h> #include "read_write.h" #include <asm/uaccess.h> @@ -247,6 +248,13 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) return retval; } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence) +{ + return sys_lseek(fd, offset, whence); +} +#endif + #ifdef __ARCH_WANT_SYS_LLSEEK SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, unsigned long, offset_low, loff_t __user *, result, @@ -278,7 +286,6 @@ out_putf: } #endif - /* * rw_verify_area doesn't like huge counts. We limit * them to something that fits in "int" so that others diff --git a/fs/seq_file.c b/fs/seq_file.c index f9538ea..38bb59f 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -308,27 +308,27 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence) mutex_lock(&m->lock); m->version = file->f_version; switch (whence) { - case 1: - offset += file->f_pos; - case 0: - if (offset < 0) - break; - retval = offset; - if (offset != m->read_pos) { - while ((retval=traverse(m, offset)) == -EAGAIN) - ; - if (retval) { - /* with extreme prejudice... */ - file->f_pos = 0; - m->read_pos = 0; - m->version = 0; - m->index = 0; - m->count = 0; - } else { - m->read_pos = offset; - retval = file->f_pos = offset; - } + case SEEK_CUR: + offset += file->f_pos; + case SEEK_SET: + if (offset < 0) + break; + retval = offset; + if (offset != m->read_pos) { + while ((retval = traverse(m, offset)) == -EAGAIN) + ; + if (retval) { + /* with extreme prejudice... */ + file->f_pos = 0; + m->read_pos = 0; + m->version = 0; + m->index = 0; + m->count = 0; + } else { + m->read_pos = offset; + retval = file->f_pos = offset; } + } } file->f_version = m->version; mutex_unlock(&m->lock); @@ -447,14 +447,13 @@ struct super_block *sget(struct file_system_type *type, void *data) { struct super_block *s = NULL; - struct hlist_node *node; struct super_block *old; int err; retry: spin_lock(&sb_lock); if (test) { - hlist_for_each_entry(old, node, &type->fs_supers, s_instances) { + hlist_for_each_entry(old, &type->fs_supers, s_instances) { if (!test(old, data)) continue; if (!grab_super(old)) @@ -554,10 +553,9 @@ void iterate_supers_type(struct file_system_type *type, void (*f)(struct super_block *, void *), void *arg) { struct super_block *sb, *p = NULL; - struct hlist_node *node; spin_lock(&sb_lock); - hlist_for_each_entry(sb, node, &type->fs_supers, s_instances) { + hlist_for_each_entry(sb, &type->fs_supers, s_instances) { sb->s_count++; spin_unlock(&sb_lock); @@ -842,7 +840,7 @@ int get_anon_bdev(dev_t *p) else if (error) return -EAGAIN; - if ((dev & MAX_IDR_MASK) == (1 << MINORBITS)) { + if (dev == (1 << MINORBITS)) { spin_lock(&unnamed_dev_lock); ida_remove(&unnamed_dev_ida, dev); if (unnamed_dev_start > dev) diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index 2ce9a5d..15c68f9 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -461,14 +461,13 @@ const struct file_operations bin_fops = { void unmap_bin_file(struct sysfs_dirent *attr_sd) { struct bin_buffer *bb; - struct hlist_node *tmp; if (sysfs_type(attr_sd) != SYSFS_KOBJ_BIN_ATTR) return; mutex_lock(&sysfs_bin_lock); - hlist_for_each_entry(bb, tmp, &attr_sd->s_bin_attr.buffers, list) { + hlist_for_each_entry(bb, &attr_sd->s_bin_attr.buffers, list) { struct inode *inode = file_inode(bb->file); unmap_mapping_range(inode->i_mapping, 0, 0, 1); diff --git a/fs/timerfd.c b/fs/timerfd.c index 0e606b1..32b644f 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -383,10 +383,10 @@ SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr) return copy_to_user(otmr, &kotmr, sizeof(kotmr)) ? -EFAULT: 0; } -#ifdef COMPAT +#ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, - const struct itimerspec __user *, utmr, - struct itimerspec __user *, otmr) + const struct compat_itimerspec __user *, utmr, + struct compat_itimerspec __user *, otmr) { struct itimerspec new, old; int ret; @@ -402,12 +402,12 @@ COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, } COMPAT_SYSCALL_DEFINE2(timerfd_gettime, int, ufd, - struct itimerspec __user *, otmr) + struct compat_itimerspec __user *, otmr) { struct itimerspec kotmr; int ret = do_timerfd_gettime(ufd, &kotmr); if (ret) return ret; - return put_compat_itimerspec(otmr, &t) ? -EFAULT: 0; + return put_compat_itimerspec(otmr, &kotmr) ? -EFAULT: 0; } #endif diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 96fcbb8..d1dba7c 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1442,9 +1442,8 @@ xlog_recover_find_tid( xlog_tid_t tid) { xlog_recover_t *trans; - struct hlist_node *n; - hlist_for_each_entry(trans, n, head, r_list) { + hlist_for_each_entry(trans, head, r_list) { if (trans->r_log_tid == tid) return trans; } |