diff options
Diffstat (limited to 'fs')
427 files changed, 18940 insertions, 9634 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 9a1d426..15b6791 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -37,7 +37,7 @@ static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name) return ERR_PTR(-ENOMEM); size = v9fs_fid_xattr_get(fid, name, value, size); if (size > 0) { - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (IS_ERR(acl)) goto err_out; } @@ -131,7 +131,7 @@ static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl) buffer = kmalloc(size, GFP_KERNEL); if (!buffer) return -ENOMEM; - retval = posix_acl_to_xattr(acl, buffer, size); + retval = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); if (retval < 0) goto err_free_out; switch (type) { @@ -251,7 +251,7 @@ static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name, return PTR_ERR(acl); if (acl == NULL) return -ENODATA; - error = posix_acl_to_xattr(acl, buffer, size); + error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); posix_acl_release(acl); return error; @@ -304,7 +304,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name, return -EPERM; if (value) { /* update the cached acl value */ - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (IS_ERR(acl)) return PTR_ERR(acl); else if (acl) { diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index b85efa7..392c5da 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -560,6 +560,11 @@ static int v9fs_init_inode_cache(void) */ static void v9fs_destroy_inode_cache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(v9fs_inode_cache); } diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index dd6f7ee..c2483e9 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -738,6 +738,7 @@ v9fs_cached_file_write(struct file *filp, const char __user * data, static const struct vm_operations_struct v9fs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = v9fs_vm_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 0225742..0efd152 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -164,3 +164,11 @@ config BINFMT_MISC You may say M here for module support and later load the module when you have use for it; the module is called binfmt_misc. If you don't know what to answer at this point, say Y. + +config COREDUMP + bool "Enable core dump support" if EXPERT + default y + help + This option enables support for performing core dumps. You almost + certainly want to say Y here. Not necessary on systems that never + need debugging or only ever run flawless code. diff --git a/fs/Makefile b/fs/Makefile index 2fb9779..1d7af79 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_FS_MBCACHE) += mbcache.o obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o obj-$(CONFIG_NFS_COMMON) += nfs_common/ obj-$(CONFIG_GENERIC_ACL) += generic_acl.o +obj-$(CONFIG_COREDUMP) += coredump.o obj-$(CONFIG_FHANDLE) += fhandle.o diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h index 718ac1f..585adaf 100644 --- a/fs/adfs/adfs.h +++ b/fs/adfs/adfs.h @@ -46,8 +46,8 @@ struct adfs_sb_info { struct adfs_discmap *s_map; /* bh list containing map */ struct adfs_dir_ops *s_dir; /* directory operations */ - uid_t s_uid; /* owner uid */ - gid_t s_gid; /* owner gid */ + kuid_t s_uid; /* owner uid */ + kgid_t s_gid; /* owner gid */ umode_t s_owner_mask; /* ADFS owner perm -> unix perm */ umode_t s_other_mask; /* ADFS other perm -> unix perm */ int s_ftsuffix; /* ,xyz hex filetype suffix option */ diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 1dab6a1..e9bad50 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -304,8 +304,8 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr) * we can't change the UID or GID of any file - * we have a global UID/GID in the superblock */ - if ((ia_valid & ATTR_UID && attr->ia_uid != ADFS_SB(sb)->s_uid) || - (ia_valid & ATTR_GID && attr->ia_gid != ADFS_SB(sb)->s_gid)) + if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, ADFS_SB(sb)->s_uid)) || + (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, ADFS_SB(sb)->s_gid))) error = -EPERM; if (error) diff --git a/fs/adfs/super.c b/fs/adfs/super.c index bdaec92..d571229 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -15,6 +15,7 @@ #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/statfs.h> +#include <linux/user_namespace.h> #include "adfs.h" #include "dir_f.h" #include "dir_fplus.h" @@ -130,10 +131,10 @@ static int adfs_show_options(struct seq_file *seq, struct dentry *root) { struct adfs_sb_info *asb = ADFS_SB(root->d_sb); - if (asb->s_uid != 0) - seq_printf(seq, ",uid=%u", asb->s_uid); - if (asb->s_gid != 0) - seq_printf(seq, ",gid=%u", asb->s_gid); + if (!uid_eq(asb->s_uid, GLOBAL_ROOT_UID)) + seq_printf(seq, ",uid=%u", from_kuid_munged(&init_user_ns, asb->s_uid)); + if (!gid_eq(asb->s_gid, GLOBAL_ROOT_GID)) + seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, asb->s_gid)); if (asb->s_owner_mask != ADFS_DEFAULT_OWNER_MASK) seq_printf(seq, ",ownmask=%o", asb->s_owner_mask); if (asb->s_other_mask != ADFS_DEFAULT_OTHER_MASK) @@ -175,12 +176,16 @@ static int parse_options(struct super_block *sb, char *options) case Opt_uid: if (match_int(args, &option)) return -EINVAL; - asb->s_uid = option; + asb->s_uid = make_kuid(current_user_ns(), option); + if (!uid_valid(asb->s_uid)) + return -EINVAL; break; case Opt_gid: if (match_int(args, &option)) return -EINVAL; - asb->s_gid = option; + asb->s_gid = make_kgid(current_user_ns(), option); + if (!gid_valid(asb->s_gid)) + return -EINVAL; break; case Opt_ownmask: if (match_octal(args, &option)) @@ -275,6 +280,11 @@ static int init_inodecache(void) static void destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(adfs_inode_cachep); } @@ -369,8 +379,8 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_fs_info = asb; /* set default options */ - asb->s_uid = 0; - asb->s_gid = 0; + asb->s_uid = GLOBAL_ROOT_UID; + asb->s_gid = GLOBAL_ROOT_GID; asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK; asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK; asb->s_ftsuffix = 0; diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 6e21641..3952121 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -88,8 +88,8 @@ struct affs_sb_info { u32 s_root_block; /* FFS root block number. */ int s_hashsize; /* Size of hash table. */ unsigned long s_flags; /* See below. */ - uid_t s_uid; /* uid to override */ - gid_t s_gid; /* gid to override */ + kuid_t s_uid; /* uid to override */ + kgid_t s_gid; /* gid to override */ umode_t s_mode; /* mode to override */ struct buffer_head *s_root_bh; /* Cached root block. */ struct mutex s_bmlock; /* Protects bitmap access. */ diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 8bc4a59..15c4842 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -80,17 +80,17 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) if (id == 0 || sbi->s_flags & SF_SETUID) inode->i_uid = sbi->s_uid; else if (id == 0xFFFF && sbi->s_flags & SF_MUFS) - inode->i_uid = 0; + i_uid_write(inode, 0); else - inode->i_uid = id; + i_uid_write(inode, id); id = be16_to_cpu(tail->gid); if (id == 0 || sbi->s_flags & SF_SETGID) inode->i_gid = sbi->s_gid; else if (id == 0xFFFF && sbi->s_flags & SF_MUFS) - inode->i_gid = 0; + i_gid_write(inode, 0); else - inode->i_gid = id; + i_gid_write(inode, id); switch (be32_to_cpu(tail->stype)) { case ST_ROOT: @@ -193,13 +193,13 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc) tail->size = cpu_to_be32(inode->i_size); secs_to_datestamp(inode->i_mtime.tv_sec,&tail->change); if (!(inode->i_ino == AFFS_SB(sb)->s_root_block)) { - uid = inode->i_uid; - gid = inode->i_gid; + uid = i_uid_read(inode); + gid = i_gid_read(inode); if (AFFS_SB(sb)->s_flags & SF_MUFS) { - if (inode->i_uid == 0 || inode->i_uid == 0xFFFF) - uid = inode->i_uid ^ ~0; - if (inode->i_gid == 0 || inode->i_gid == 0xFFFF) - gid = inode->i_gid ^ ~0; + if (uid == 0 || uid == 0xFFFF) + uid = uid ^ ~0; + if (gid == 0 || gid == 0xFFFF) + gid = gid ^ ~0; } if (!(AFFS_SB(sb)->s_flags & SF_SETUID)) tail->uid = cpu_to_be16(uid); diff --git a/fs/affs/super.c b/fs/affs/super.c index c70f1e5..b84dc73 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -147,6 +147,11 @@ static int init_inodecache(void) static void destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(affs_inode_cachep); } @@ -188,7 +193,7 @@ static const match_table_t tokens = { }; static int -parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s32 *root, +parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved, s32 *root, int *blocksize, char **prefix, char *volume, unsigned long *mount_opts) { char *p; @@ -253,13 +258,17 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s case Opt_setgid: if (match_int(&args[0], &option)) return 0; - *gid = option; + *gid = make_kgid(current_user_ns(), option); + if (!gid_valid(*gid)) + return 0; *mount_opts |= SF_SETGID; break; case Opt_setuid: if (match_int(&args[0], &option)) return 0; - *uid = option; + *uid = make_kuid(current_user_ns(), option); + if (!uid_valid(*uid)) + return 0; *mount_opts |= SF_SETUID; break; case Opt_verbose: @@ -301,8 +310,8 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) int num_bm; int i, j; s32 key; - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; int reserved; unsigned long mount_flags; int tmp_flags; /* fix remount prototype... */ @@ -527,8 +536,8 @@ affs_remount(struct super_block *sb, int *flags, char *data) { struct affs_sb_info *sbi = AFFS_SB(sb); int blocksize; - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; int mode; int reserved; int root_block; @@ -551,7 +560,7 @@ affs_remount(struct super_block *sb, int *flags, char *data) return -EINVAL; } - flush_delayed_work_sync(&sbi->sb_work); + flush_delayed_work(&sbi->sb_work); replace_mount_options(sb, new_opts); sbi->s_flags = mount_flags; diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 587ef51..7ef637d 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -351,9 +351,7 @@ void afs_dispatch_give_up_callbacks(struct work_struct *work) */ void afs_flush_callback_breaks(struct afs_server *server) { - cancel_delayed_work(&server->cb_break_work); - queue_delayed_work(afs_callback_update_worker, - &server->cb_break_work, 0); + mod_delayed_work(afs_callback_update_worker, &server->cb_break_work, 0); } #if 0 diff --git a/fs/afs/server.c b/fs/afs/server.c index d59b751..f342acf 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -285,12 +285,7 @@ static void afs_reap_server(struct work_struct *work) expiry = server->time_of_death + afs_server_timeout; if (expiry > now) { delay = (expiry - now) * HZ; - if (!queue_delayed_work(afs_wq, &afs_server_reaper, - delay)) { - cancel_delayed_work(&afs_server_reaper); - queue_delayed_work(afs_wq, &afs_server_reaper, - delay); - } + mod_delayed_work(afs_wq, &afs_server_reaper, delay); break; } @@ -323,6 +318,5 @@ static void afs_reap_server(struct work_struct *work) void __exit afs_purge_servers(void) { afs_server_timeout = 0; - cancel_delayed_work(&afs_server_reaper); - queue_delayed_work(afs_wq, &afs_server_reaper, 0); + mod_delayed_work(afs_wq, &afs_server_reaper, 0); } diff --git a/fs/afs/super.c b/fs/afs/super.c index df8c604..4316500 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -123,6 +123,11 @@ void __exit afs_fs_exit(void) BUG(); } + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(afs_inode_cachep); _leave(""); } diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c index 431984d..57bcb15 100644 --- a/fs/afs/vlocation.c +++ b/fs/afs/vlocation.c @@ -561,12 +561,7 @@ static void afs_vlocation_reaper(struct work_struct *work) if (expiry > now) { delay = (expiry - now) * HZ; _debug("delay %lu", delay); - if (!queue_delayed_work(afs_wq, &afs_vlocation_reap, - delay)) { - cancel_delayed_work(&afs_vlocation_reap); - queue_delayed_work(afs_wq, &afs_vlocation_reap, - delay); - } + mod_delayed_work(afs_wq, &afs_vlocation_reap, delay); break; } @@ -614,13 +609,10 @@ void afs_vlocation_purge(void) spin_lock(&afs_vlocation_updates_lock); list_del_init(&afs_vlocation_updates); spin_unlock(&afs_vlocation_updates_lock); - cancel_delayed_work(&afs_vlocation_update); - queue_delayed_work(afs_vlocation_update_worker, - &afs_vlocation_update, 0); + mod_delayed_work(afs_vlocation_update_worker, &afs_vlocation_update, 0); destroy_workqueue(afs_vlocation_update_worker); - cancel_delayed_work(&afs_vlocation_reap); - queue_delayed_work(afs_wq, &afs_vlocation_reap, 0); + mod_delayed_work(afs_wq, &afs_vlocation_reap, 0); } /* @@ -14,6 +14,7 @@ #include <linux/fcntl.h> #include <linux/security.h> #include <linux/evm.h> +#include <linux/ima.h> /** * inode_change_ok - check if attribute changes to an inode are allowed @@ -247,6 +248,7 @@ int notify_change(struct dentry * dentry, struct iattr * attr) if (!error) { fsnotify_change(dentry, ia_valid); + ima_inode_post_setattr(dentry); evm_inode_post_setattr(dentry, ia_valid); } diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index abf645c..a162141 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -221,20 +221,6 @@ static int test_by_type(struct path *path, void *p) return ino && ino->sbi->type & *(unsigned *)p; } -static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file) -{ - struct files_struct *files = current->files; - struct fdtable *fdt; - - spin_lock(&files->file_lock); - fdt = files_fdtable(files); - BUG_ON(fdt->fd[fd] != NULL); - rcu_assign_pointer(fdt->fd[fd], file); - __set_close_on_exec(fd, fdt); - spin_unlock(&files->file_lock); -} - - /* * Open a file descriptor on the autofs mount point corresponding * to the given path and device number (aka. new_encode_dev(sb->s_dev)). @@ -243,7 +229,7 @@ static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid) { int err, fd; - fd = get_unused_fd(); + fd = get_unused_fd_flags(O_CLOEXEC); if (likely(fd >= 0)) { struct file *filp; struct path path; @@ -264,7 +250,7 @@ static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid) goto out; } - autofs_dev_ioctl_fd_install(fd, filp); + fd_install(fd, filp); } return fd; diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index da8876d..dce436e 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -175,8 +175,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, return; } - pipe = sbi->pipe; - get_file(pipe); + pipe = get_file(sbi->pipe); mutex_unlock(&sbi->wq_mutex); diff --git a/fs/befs/befs.h b/fs/befs/befs.h index d9a40ab..b266428 100644 --- a/fs/befs/befs.h +++ b/fs/befs/befs.h @@ -20,8 +20,8 @@ typedef u64 befs_blocknr_t; */ typedef struct befs_mount_options { - gid_t gid; - uid_t uid; + kgid_t gid; + kuid_t uid; int use_gid; int use_uid; int debug; diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index cf7f3c6..2b3bda8 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -15,6 +15,7 @@ #include <linux/vfs.h> #include <linux/parser.h> #include <linux/namei.h> +#include <linux/sched.h> #include "befs.h" #include "btree.h" @@ -352,9 +353,11 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) */ inode->i_uid = befs_sb->mount_opts.use_uid ? - befs_sb->mount_opts.uid : (uid_t) fs32_to_cpu(sb, raw_inode->uid); + befs_sb->mount_opts.uid : + make_kuid(&init_user_ns, fs32_to_cpu(sb, raw_inode->uid)); inode->i_gid = befs_sb->mount_opts.use_gid ? - befs_sb->mount_opts.gid : (gid_t) fs32_to_cpu(sb, raw_inode->gid); + befs_sb->mount_opts.gid : + make_kgid(&init_user_ns, fs32_to_cpu(sb, raw_inode->gid)); set_nlink(inode, 1); @@ -454,6 +457,11 @@ befs_init_inodecache(void) static void befs_destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(befs_inode_cachep); } @@ -674,10 +682,12 @@ parse_options(char *options, befs_mount_options * opts) char *p; substring_t args[MAX_OPT_ARGS]; int option; + kuid_t uid; + kgid_t gid; /* Initialize options */ - opts->uid = 0; - opts->gid = 0; + opts->uid = GLOBAL_ROOT_UID; + opts->gid = GLOBAL_ROOT_GID; opts->use_uid = 0; opts->use_gid = 0; opts->iocharset = NULL; @@ -696,23 +706,29 @@ parse_options(char *options, befs_mount_options * opts) case Opt_uid: if (match_int(&args[0], &option)) return 0; - if (option < 0) { + uid = INVALID_UID; + if (option >= 0) + uid = make_kuid(current_user_ns(), option); + if (!uid_valid(uid)) { printk(KERN_ERR "BeFS: Invalid uid %d, " "using default\n", option); break; } - opts->uid = option; + opts->uid = uid; opts->use_uid = 1; break; case Opt_gid: if (match_int(&args[0], &option)) return 0; - if (option < 0) { + gid = INVALID_GID; + if (option >= 0) + gid = make_kgid(current_user_ns(), option); + if (!gid_valid(gid)) { printk(KERN_ERR "BeFS: Invalid gid %d, " "using default\n", option); break; } - opts->gid = option; + opts->gid = gid; opts->use_gid = 1; break; case Opt_charset: diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 9870417..737aaa3 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -76,8 +76,8 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino) BFS_I(inode)->i_sblock = le32_to_cpu(di->i_sblock); BFS_I(inode)->i_eblock = le32_to_cpu(di->i_eblock); BFS_I(inode)->i_dsk_ino = le16_to_cpu(di->i_ino); - inode->i_uid = le32_to_cpu(di->i_uid); - inode->i_gid = le32_to_cpu(di->i_gid); + i_uid_write(inode, le32_to_cpu(di->i_uid)); + i_gid_write(inode, le32_to_cpu(di->i_gid)); set_nlink(inode, le32_to_cpu(di->i_nlink)); inode->i_size = BFS_FILESIZE(di); inode->i_blocks = BFS_FILEBLOCKS(di); @@ -139,8 +139,8 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc) di->i_ino = cpu_to_le16(ino); di->i_mode = cpu_to_le32(inode->i_mode); - di->i_uid = cpu_to_le32(inode->i_uid); - di->i_gid = cpu_to_le32(inode->i_gid); + di->i_uid = cpu_to_le32(i_uid_read(inode)); + di->i_gid = cpu_to_le32(i_gid_read(inode)); di->i_nlink = cpu_to_le32(inode->i_nlink); di->i_atime = cpu_to_le32(inode->i_atime.tv_sec); di->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); @@ -280,6 +280,11 @@ static int init_inodecache(void) static void destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(bfs_inode_cachep); } diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index d146e18..0e7a6f8 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -32,31 +32,8 @@ static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs); static int load_aout_library(struct file*); -static int aout_core_dump(struct coredump_params *cprm); - -static struct linux_binfmt aout_format = { - .module = THIS_MODULE, - .load_binary = load_aout_binary, - .load_shlib = load_aout_library, - .core_dump = aout_core_dump, - .min_coredump = PAGE_SIZE -}; - -#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) - -static int set_brk(unsigned long start, unsigned long end) -{ - start = PAGE_ALIGN(start); - end = PAGE_ALIGN(end); - if (end > start) { - unsigned long addr; - addr = vm_brk(start, end - start); - if (BAD_ADDR(addr)) - return addr; - } - return 0; -} +#ifdef CONFIG_COREDUMP /* * Routine writes a core dump image in the current directory. * Currently only a stub-function. @@ -66,7 +43,6 @@ static int set_brk(unsigned long start, unsigned long end) * field, which also makes sure the core-dumps won't be recursive if the * dumping of the process results in another error.. */ - static int aout_core_dump(struct coredump_params *cprm) { struct file *file = cprm->file; @@ -89,7 +65,7 @@ static int aout_core_dump(struct coredump_params *cprm) current->flags |= PF_DUMPCORE; strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm)); dump.u_ar0 = offsetof(struct user, regs); - dump.signal = cprm->signr; + dump.signal = cprm->siginfo->si_signo; aout_dump_thread(cprm->regs, &dump); /* If the size of the dump file exceeds the rlimit, then see what would happen @@ -135,6 +111,32 @@ end_coredump: set_fs(fs); return has_dumped; } +#else +#define aout_core_dump NULL +#endif + +static struct linux_binfmt aout_format = { + .module = THIS_MODULE, + .load_binary = load_aout_binary, + .load_shlib = load_aout_library, + .core_dump = aout_core_dump, + .min_coredump = PAGE_SIZE +}; + +#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) + +static int set_brk(unsigned long start, unsigned long end) +{ + start = PAGE_ALIGN(start); + end = PAGE_ALIGN(end); + if (end > start) { + unsigned long addr; + addr = vm_brk(start, end - start); + if (BAD_ADDR(addr)) + return addr; + } + return 0; +} /* * create_aout_tables() parses the env- and arg-strings in new user diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 1b52956..fbd9f60 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -27,6 +27,7 @@ #include <linux/compiler.h> #include <linux/highmem.h> #include <linux/pagemap.h> +#include <linux/vmalloc.h> #include <linux/security.h> #include <linux/random.h> #include <linux/elf.h> @@ -35,7 +36,13 @@ #include <asm/uaccess.h> #include <asm/param.h> #include <asm/page.h> -#include <asm/exec.h> + +#ifndef user_long_t +#define user_long_t long +#endif +#ifndef user_siginfo_t +#define user_siginfo_t siginfo_t +#endif static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs); static int load_elf_library(struct file *); @@ -881,7 +888,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) } if (elf_interpreter) { - unsigned long uninitialized_var(interp_map_addr); + unsigned long interp_map_addr = 0; elf_entry = load_elf_interp(&loc->interp_elf_ex, interpreter, @@ -1115,7 +1122,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, if (always_dump_vma(vma)) goto whole; - if (vma->vm_flags & VM_NODUMP) + if (vma->vm_flags & VM_DONTDUMP) return 0; /* Hugetlb memory check */ @@ -1127,7 +1134,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, } /* Do not dump I/O mapped devices or special mappings */ - if (vma->vm_flags & (VM_IO | VM_RESERVED)) + if (vma->vm_flags & VM_IO) return 0; /* By default, dump shared memory if mapped from an anonymous file. */ @@ -1372,6 +1379,103 @@ static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm) fill_note(note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv); } +static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata, + siginfo_t *siginfo) +{ + mm_segment_t old_fs = get_fs(); + set_fs(KERNEL_DS); + copy_siginfo_to_user((user_siginfo_t __user *) csigdata, siginfo); + set_fs(old_fs); + fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata); +} + +#define MAX_FILE_NOTE_SIZE (4*1024*1024) +/* + * Format of NT_FILE note: + * + * long count -- how many files are mapped + * long page_size -- units for file_ofs + * array of [COUNT] elements of + * long start + * long end + * long file_ofs + * followed by COUNT filenames in ASCII: "FILE1" NUL "FILE2" NUL... + */ +static void fill_files_note(struct memelfnote *note) +{ + struct vm_area_struct *vma; + unsigned count, size, names_ofs, remaining, n; + user_long_t *data; + user_long_t *start_end_ofs; + char *name_base, *name_curpos; + + /* *Estimated* file count and total data size needed */ + count = current->mm->map_count; + size = count * 64; + + names_ofs = (2 + 3 * count) * sizeof(data[0]); + alloc: + if (size >= MAX_FILE_NOTE_SIZE) /* paranoia check */ + goto err; + size = round_up(size, PAGE_SIZE); + data = vmalloc(size); + if (!data) + goto err; + + start_end_ofs = data + 2; + name_base = name_curpos = ((char *)data) + names_ofs; + remaining = size - names_ofs; + count = 0; + for (vma = current->mm->mmap; vma != NULL; vma = vma->vm_next) { + struct file *file; + const char *filename; + + file = vma->vm_file; + if (!file) + continue; + filename = d_path(&file->f_path, name_curpos, remaining); + if (IS_ERR(filename)) { + if (PTR_ERR(filename) == -ENAMETOOLONG) { + vfree(data); + size = size * 5 / 4; + goto alloc; + } + continue; + } + + /* d_path() fills at the end, move name down */ + /* n = strlen(filename) + 1: */ + n = (name_curpos + remaining) - filename; + remaining = filename - name_curpos; + memmove(name_curpos, filename, n); + name_curpos += n; + + *start_end_ofs++ = vma->vm_start; + *start_end_ofs++ = vma->vm_end; + *start_end_ofs++ = vma->vm_pgoff; + count++; + } + + /* Now we know exact count of files, can store it */ + data[0] = count; + data[1] = PAGE_SIZE; + /* + * Count usually is less than current->mm->map_count, + * we need to move filenames down. + */ + n = current->mm->map_count - count; + if (n != 0) { + unsigned shift_bytes = n * 3 * sizeof(data[0]); + memmove(name_base - shift_bytes, name_base, + name_curpos - name_base); + name_curpos -= shift_bytes; + } + + size = name_curpos - (char *)data; + fill_note(note, "CORE", NT_FILE, size, data); + err: ; +} + #ifdef CORE_DUMP_USE_REGSET #include <linux/regset.h> @@ -1385,7 +1489,10 @@ struct elf_thread_core_info { struct elf_note_info { struct elf_thread_core_info *thread; struct memelfnote psinfo; + struct memelfnote signote; struct memelfnote auxv; + struct memelfnote files; + user_siginfo_t csigdata; size_t size; int thread_notes; }; @@ -1480,7 +1587,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, static int fill_note_info(struct elfhdr *elf, int phdrs, struct elf_note_info *info, - long signr, struct pt_regs *regs) + siginfo_t *siginfo, struct pt_regs *regs) { struct task_struct *dump_task = current; const struct user_regset_view *view = task_user_regset_view(dump_task); @@ -1550,7 +1657,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, * Now fill in each thread's information. */ for (t = info->thread; t != NULL; t = t->next) - if (!fill_thread_core_info(t, view, signr, &info->size)) + if (!fill_thread_core_info(t, view, siginfo->si_signo, &info->size)) return 0; /* @@ -1559,9 +1666,15 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, fill_psinfo(psinfo, dump_task->group_leader, dump_task->mm); info->size += notesize(&info->psinfo); + fill_siginfo_note(&info->signote, &info->csigdata, siginfo); + info->size += notesize(&info->signote); + fill_auxv_note(&info->auxv, current->mm); info->size += notesize(&info->auxv); + fill_files_note(&info->files); + info->size += notesize(&info->files); + return 1; } @@ -1588,8 +1701,12 @@ static int write_note_info(struct elf_note_info *info, if (first && !writenote(&info->psinfo, file, foffset)) return 0; + if (first && !writenote(&info->signote, file, foffset)) + return 0; if (first && !writenote(&info->auxv, file, foffset)) return 0; + if (first && !writenote(&info->files, file, foffset)) + return 0; for (i = 1; i < info->thread_notes; ++i) if (t->notes[i].data && @@ -1616,6 +1733,7 @@ static void free_note_info(struct elf_note_info *info) kfree(t); } kfree(info->psinfo.data); + vfree(info->files.data); } #else @@ -1681,6 +1799,7 @@ struct elf_note_info { #ifdef ELF_CORE_COPY_XFPREGS elf_fpxregset_t *xfpu; #endif + user_siginfo_t csigdata; int thread_status_size; int numnote; }; @@ -1690,48 +1809,37 @@ static int elf_note_info_init(struct elf_note_info *info) memset(info, 0, sizeof(*info)); INIT_LIST_HEAD(&info->thread_list); - /* Allocate space for six ELF notes */ - info->notes = kmalloc(6 * sizeof(struct memelfnote), GFP_KERNEL); + /* Allocate space for ELF notes */ + info->notes = kmalloc(8 * sizeof(struct memelfnote), GFP_KERNEL); if (!info->notes) return 0; info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL); if (!info->psinfo) - goto notes_free; + return 0; info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL); if (!info->prstatus) - goto psinfo_free; + return 0; info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL); if (!info->fpu) - goto prstatus_free; + return 0; #ifdef ELF_CORE_COPY_XFPREGS info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL); if (!info->xfpu) - goto fpu_free; + return 0; #endif return 1; -#ifdef ELF_CORE_COPY_XFPREGS - fpu_free: - kfree(info->fpu); -#endif - prstatus_free: - kfree(info->prstatus); - psinfo_free: - kfree(info->psinfo); - notes_free: - kfree(info->notes); - return 0; } static int fill_note_info(struct elfhdr *elf, int phdrs, struct elf_note_info *info, - long signr, struct pt_regs *regs) + siginfo_t *siginfo, struct pt_regs *regs) { struct list_head *t; if (!elf_note_info_init(info)) return 0; - if (signr) { + if (siginfo->si_signo) { struct core_thread *ct; struct elf_thread_status *ets; @@ -1749,13 +1857,13 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, int sz; ets = list_entry(t, struct elf_thread_status, list); - sz = elf_dump_thread_status(signr, ets); + sz = elf_dump_thread_status(siginfo->si_signo, ets); info->thread_status_size += sz; } } /* now collect the dump for the current */ memset(info->prstatus, 0, sizeof(*info->prstatus)); - fill_prstatus(info->prstatus, current, signr); + fill_prstatus(info->prstatus, current, siginfo->si_signo); elf_core_copy_regs(&info->prstatus->pr_reg, regs); /* Set up header */ @@ -1772,9 +1880,11 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, fill_note(info->notes + 1, "CORE", NT_PRPSINFO, sizeof(*info->psinfo), info->psinfo); - info->numnote = 2; + fill_siginfo_note(info->notes + 2, &info->csigdata, siginfo); + fill_auxv_note(info->notes + 3, current->mm); + fill_files_note(info->notes + 4); - fill_auxv_note(&info->notes[info->numnote++], current->mm); + info->numnote = 5; /* Try to dump the FPU. */ info->prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs, @@ -1836,6 +1946,9 @@ static void free_note_info(struct elf_note_info *info) kfree(list_entry(tmp, struct elf_thread_status, list)); } + /* Free data allocated by fill_files_note(): */ + vfree(info->notes[4].data); + kfree(info->prstatus); kfree(info->psinfo); kfree(info->notes); @@ -1962,7 +2075,7 @@ static int elf_core_dump(struct coredump_params *cprm) * Collect all the non-memory information about the process for the * notes. This also sets up the file header. */ - if (!fill_note_info(elf, e_phnum, &info, cprm->signr, cprm->regs)) + if (!fill_note_info(elf, e_phnum, &info, cprm->siginfo, cprm->regs)) goto cleanup; has_dumped = 1; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 3d77cf8..a460491 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -39,7 +39,6 @@ #include <asm/uaccess.h> #include <asm/param.h> #include <asm/pgalloc.h> -#include <asm/exec.h> typedef char *elf_caddr_t; @@ -1205,7 +1204,7 @@ static int maydump(struct vm_area_struct *vma, unsigned long mm_flags) int dump_ok; /* Do not dump I/O mapped devices or special mappings */ - if (vma->vm_flags & (VM_IO | VM_RESERVED)) { + if (vma->vm_flags & VM_IO) { kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags); return 0; } @@ -1642,7 +1641,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) goto cleanup; #endif - if (cprm->signr) { + if (cprm->siginfo->si_signo) { struct core_thread *ct; struct elf_thread_status *tmp; @@ -1661,13 +1660,13 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) int sz; tmp = list_entry(t, struct elf_thread_status, list); - sz = elf_dump_thread_status(cprm->signr, tmp); + sz = elf_dump_thread_status(cprm->siginfo->si_signo, tmp); thread_status_size += sz; } } /* now collect the dump for the current */ - fill_prstatus(prstatus, current, cprm->signr); + fill_prstatus(prstatus, current, cprm->siginfo->si_signo); elf_core_copy_regs(&prstatus->pr_reg, cprm->regs); segs = current->mm->map_count; diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 178cb70..e280352 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -107,7 +107,7 @@ static struct linux_binfmt flat_format = { static int flat_core_dump(struct coredump_params *cprm) { printk("Process %s:%d received signr %d and should have core dumped\n", - current->comm, current->pid, (int) cprm->signr); + current->comm, current->pid, (int) cprm->siginfo->si_signo); return(1); } diff --git a/fs/block_dev.c b/fs/block_dev.c index 37967bc..b3c1d3d 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -132,8 +132,7 @@ int set_blocksize(struct block_device *bdev, int size) /* Check that the block device is not memory mapped */ mapping = bdev->bd_inode->i_mapping; mutex_lock(&mapping->i_mmap_mutex); - if (!prio_tree_empty(&mapping->i_mmap) || - !list_empty(&mapping->i_mmap_nonlinear)) { + if (mapping_mapped(mapping)) { mutex_unlock(&mapping->i_mmap_mutex); percpu_up_write(&bdev->bd_block_size_semaphore); return -EBUSY; diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 761e2cd..0c16e3d 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -61,7 +61,7 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) size = __btrfs_getxattr(inode, name, value, size); } if (size > 0) { - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); } else if (size == -ENOENT || size == -ENODATA || size == 0) { /* FIXME, who returns -ENOENT? I think nobody */ acl = NULL; @@ -91,7 +91,7 @@ static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name, return PTR_ERR(acl); if (acl == NULL) return -ENODATA; - ret = posix_acl_to_xattr(acl, value, size); + ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); posix_acl_release(acl); return ret; @@ -141,7 +141,7 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans, goto out; } - ret = posix_acl_to_xattr(acl, value, size); + ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); if (ret < 0) goto out; } @@ -169,7 +169,7 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name, return -EOPNOTSUPP; if (value) { - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (IS_ERR(acl)) return PTR_ERR(acl); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index ff6475f..f318793 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -16,6 +16,7 @@ * Boston, MA 021110-1307, USA. */ +#include <linux/vmalloc.h> #include "ctree.h" #include "disk-io.h" #include "backref.h" @@ -231,7 +232,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, } if (!ret) { ret = ulist_add(parents, eb->start, - (unsigned long)eie, GFP_NOFS); + (uintptr_t)eie, GFP_NOFS); if (ret < 0) break; if (!extent_item_pos) { @@ -363,8 +364,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, ULIST_ITER_INIT(&uiter); node = ulist_next(parents, &uiter); ref->parent = node ? node->val : 0; - ref->inode_list = - node ? (struct extent_inode_elem *)node->aux : 0; + ref->inode_list = node ? + (struct extent_inode_elem *)(uintptr_t)node->aux : 0; /* additional parents require new refs being added here */ while ((node = ulist_next(parents, &uiter))) { @@ -375,8 +376,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, } memcpy(new_ref, ref, sizeof(*ref)); new_ref->parent = node->val; - new_ref->inode_list = - (struct extent_inode_elem *)node->aux; + new_ref->inode_list = (struct extent_inode_elem *) + (uintptr_t)node->aux; list_add(&new_ref->list, &ref->list); } ulist_reinit(parents); @@ -914,8 +915,8 @@ again: free_extent_buffer(eb); } ret = ulist_add_merge(refs, ref->parent, - (unsigned long)ref->inode_list, - (unsigned long *)&eie, GFP_NOFS); + (uintptr_t)ref->inode_list, + (u64 *)&eie, GFP_NOFS); if (!ret && extent_item_pos) { /* * we've recorded that parent, so we must extend @@ -959,7 +960,7 @@ static void free_leaf_list(struct ulist *blocks) while ((node = ulist_next(blocks, &uiter))) { if (!node->aux) continue; - eie = (struct extent_inode_elem *)node->aux; + eie = (struct extent_inode_elem *)(uintptr_t)node->aux; for (; eie; eie = eie_next) { eie_next = eie->next; kfree(eie); @@ -1108,26 +1109,80 @@ static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, found_key); } -/* - * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements - * of the path are separated by '/' and the path is guaranteed to be - * 0-terminated. the path is only given within the current file system. - * Therefore, it never starts with a '/'. the caller is responsible to provide - * "size" bytes in "dest". the dest buffer will be filled backwards. finally, - * the start point of the resulting string is returned. this pointer is within - * dest, normally. - * in case the path buffer would overflow, the pointer is decremented further - * as if output was written to the buffer, though no more output is actually - * generated. that way, the caller can determine how much space would be - * required for the path to fit into the buffer. in that case, the returned - * value will be smaller than dest. callers must check this! - */ -char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, - struct btrfs_inode_ref *iref, +int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, + u64 start_off, struct btrfs_path *path, + struct btrfs_inode_extref **ret_extref, + u64 *found_off) +{ + int ret, slot; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_inode_extref *extref; + struct extent_buffer *leaf; + unsigned long ptr; + + key.objectid = inode_objectid; + btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY); + key.offset = start_off; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + + while (1) { + leaf = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(leaf)) { + /* + * If the item at offset is not found, + * btrfs_search_slot will point us to the slot + * where it should be inserted. In our case + * that will be the slot directly before the + * next INODE_REF_KEY_V2 item. In the case + * that we're pointing to the last slot in a + * leaf, we must move one leaf over. + */ + ret = btrfs_next_leaf(root, path); + if (ret) { + if (ret >= 1) + ret = -ENOENT; + break; + } + continue; + } + + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* + * Check that we're still looking at an extended ref key for + * this particular objectid. If we have different + * objectid or type then there are no more to be found + * in the tree and we can exit. + */ + ret = -ENOENT; + if (found_key.objectid != inode_objectid) + break; + if (btrfs_key_type(&found_key) != BTRFS_INODE_EXTREF_KEY) + break; + + ret = 0; + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + extref = (struct btrfs_inode_extref *)ptr; + *ret_extref = extref; + if (found_off) + *found_off = found_key.offset; + break; + } + + return ret; +} + +static char *ref_to_path(struct btrfs_root *fs_root, + struct btrfs_path *path, + u32 name_len, unsigned long name_off, struct extent_buffer *eb_in, u64 parent, char *dest, u32 size) { - u32 len; int slot; u64 next_inum; int ret; @@ -1135,17 +1190,17 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, struct extent_buffer *eb = eb_in; struct btrfs_key found_key; int leave_spinning = path->leave_spinning; + struct btrfs_inode_ref *iref; if (bytes_left >= 0) dest[bytes_left] = '\0'; path->leave_spinning = 1; while (1) { - len = btrfs_inode_ref_name_len(eb, iref); - bytes_left -= len; + bytes_left -= name_len; if (bytes_left >= 0) read_extent_buffer(eb, dest + bytes_left, - (unsigned long)(iref + 1), len); + name_off, name_len); if (eb != eb_in) { btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); @@ -1155,6 +1210,7 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, ret = -ENOENT; if (ret) break; + next_inum = found_key.offset; /* regular exit ahead */ @@ -1170,8 +1226,11 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); } btrfs_release_path(path); - iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); + + name_len = btrfs_inode_ref_name_len(eb, iref); + name_off = (unsigned long)(iref + 1); + parent = next_inum; --bytes_left; if (bytes_left >= 0) @@ -1188,12 +1247,39 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, } /* + * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements + * of the path are separated by '/' and the path is guaranteed to be + * 0-terminated. the path is only given within the current file system. + * Therefore, it never starts with a '/'. the caller is responsible to provide + * "size" bytes in "dest". the dest buffer will be filled backwards. finally, + * the start point of the resulting string is returned. this pointer is within + * dest, normally. + * in case the path buffer would overflow, the pointer is decremented further + * as if output was written to the buffer, though no more output is actually + * generated. that way, the caller can determine how much space would be + * required for the path to fit into the buffer. in that case, the returned + * value will be smaller than dest. callers must check this! + */ +char *btrfs_iref_to_path(struct btrfs_root *fs_root, + struct btrfs_path *path, + struct btrfs_inode_ref *iref, + struct extent_buffer *eb_in, u64 parent, + char *dest, u32 size) +{ + return ref_to_path(fs_root, path, + btrfs_inode_ref_name_len(eb_in, iref), + (unsigned long)(iref + 1), + eb_in, parent, dest, size); +} + +/* * this makes the path point to (logical EXTENT_ITEM *) * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for * tree blocks and <0 on error. */ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, - struct btrfs_path *path, struct btrfs_key *found_key) + struct btrfs_path *path, struct btrfs_key *found_key, + u64 *flags_ret) { int ret; u64 flags; @@ -1237,10 +1323,17 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, (unsigned long long)found_key->objectid, (unsigned long long)found_key->offset, (unsigned long long)flags, item_size); - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) - return BTRFS_EXTENT_FLAG_TREE_BLOCK; - if (flags & BTRFS_EXTENT_FLAG_DATA) - return BTRFS_EXTENT_FLAG_DATA; + + WARN_ON(!flags_ret); + if (flags_ret) { + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) + *flags_ret = BTRFS_EXTENT_FLAG_TREE_BLOCK; + else if (flags & BTRFS_EXTENT_FLAG_DATA) + *flags_ret = BTRFS_EXTENT_FLAG_DATA; + else + BUG_ON(1); + return 0; + } return -EIO; } @@ -1404,12 +1497,13 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, ULIST_ITER_INIT(&root_uiter); while (!ret && (root_node = ulist_next(roots, &root_uiter))) { pr_debug("root %llu references leaf %llu, data list " - "%#lx\n", root_node->val, ref_node->val, - ref_node->aux); - ret = iterate_leaf_refs( - (struct extent_inode_elem *)ref_node->aux, - root_node->val, extent_item_objectid, - iterate, ctx); + "%#llx\n", root_node->val, ref_node->val, + (long long)ref_node->aux); + ret = iterate_leaf_refs((struct extent_inode_elem *) + (uintptr_t)ref_node->aux, + root_node->val, + extent_item_objectid, + iterate, ctx); } ulist_free(roots); roots = NULL; @@ -1432,15 +1526,15 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, { int ret; u64 extent_item_pos; + u64 flags = 0; struct btrfs_key found_key; int search_commit_root = path->search_commit_root; - ret = extent_from_logical(fs_info, logical, path, - &found_key); + ret = extent_from_logical(fs_info, logical, path, &found_key, &flags); btrfs_release_path(path); if (ret < 0) return ret; - if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) return -EINVAL; extent_item_pos = logical - found_key.objectid; @@ -1451,9 +1545,12 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, return ret; } -static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, - struct btrfs_path *path, - iterate_irefs_t *iterate, void *ctx) +typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off, + struct extent_buffer *eb, void *ctx); + +static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, + struct btrfs_path *path, + iterate_irefs_t *iterate, void *ctx) { int ret = 0; int slot; @@ -1470,7 +1567,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, while (!ret) { path->leave_spinning = 1; ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path, - &found_key); + &found_key); if (ret < 0) break; if (ret) { @@ -1498,7 +1595,8 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, "tree %llu\n", cur, (unsigned long long)found_key.objectid, (unsigned long long)fs_root->objectid); - ret = iterate(parent, iref, eb, ctx); + ret = iterate(parent, name_len, + (unsigned long)(iref + 1), eb, ctx); if (ret) break; len = sizeof(*iref) + name_len; @@ -1513,12 +1611,98 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, return ret; } +static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, + struct btrfs_path *path, + iterate_irefs_t *iterate, void *ctx) +{ + int ret; + int slot; + u64 offset = 0; + u64 parent; + int found = 0; + struct extent_buffer *eb; + struct btrfs_inode_extref *extref; + struct extent_buffer *leaf; + u32 item_size; + u32 cur_offset; + unsigned long ptr; + + while (1) { + ret = btrfs_find_one_extref(fs_root, inum, offset, path, &extref, + &offset); + if (ret < 0) + break; + if (ret) { + ret = found ? 0 : -ENOENT; + break; + } + ++found; + + slot = path->slots[0]; + eb = path->nodes[0]; + /* make sure we can use eb after releasing the path */ + atomic_inc(&eb->refs); + + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + btrfs_release_path(path); + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + cur_offset = 0; + + while (cur_offset < item_size) { + u32 name_len; + + extref = (struct btrfs_inode_extref *)(ptr + cur_offset); + parent = btrfs_inode_extref_parent(eb, extref); + name_len = btrfs_inode_extref_name_len(eb, extref); + ret = iterate(parent, name_len, + (unsigned long)&extref->name, eb, ctx); + if (ret) + break; + + cur_offset += btrfs_inode_extref_name_len(leaf, extref); + cur_offset += sizeof(*extref); + } + btrfs_tree_read_unlock_blocking(eb); + free_extent_buffer(eb); + + offset++; + } + + btrfs_release_path(path); + + return ret; +} + +static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, + struct btrfs_path *path, iterate_irefs_t *iterate, + void *ctx) +{ + int ret; + int found_refs = 0; + + ret = iterate_inode_refs(inum, fs_root, path, iterate, ctx); + if (!ret) + ++found_refs; + else if (ret != -ENOENT) + return ret; + + ret = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx); + if (ret == -ENOENT && found_refs) + return 0; + + return ret; +} + /* * returns 0 if the path could be dumped (probably truncated) * returns <0 in case of an error */ -static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, - struct extent_buffer *eb, void *ctx) +static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, + struct extent_buffer *eb, void *ctx) { struct inode_fs_paths *ipath = ctx; char *fspath; @@ -1531,20 +1715,17 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, ipath->fspath->bytes_left - s_ptr : 0; fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr; - fspath = btrfs_iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb, - inum, fspath_min, bytes_left); + fspath = ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len, + name_off, eb, inum, fspath_min, + bytes_left); if (IS_ERR(fspath)) return PTR_ERR(fspath); if (fspath > fspath_min) { - pr_debug("path resolved: %s\n", fspath); ipath->fspath->val[i] = (u64)(unsigned long)fspath; ++ipath->fspath->elem_cnt; ipath->fspath->bytes_left = fspath - fspath_min; } else { - pr_debug("missed path, not enough space. missing bytes: %lu, " - "constructed so far: %s\n", - (unsigned long)(fspath_min - fspath), fspath_min); ++ipath->fspath->elem_missed; ipath->fspath->bytes_missing += fspath_min - fspath; ipath->fspath->bytes_left = 0; @@ -1566,7 +1747,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, int paths_from_inode(u64 inum, struct inode_fs_paths *ipath) { return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path, - inode_to_path, ipath); + inode_to_path, ipath); } struct btrfs_data_container *init_data_container(u32 total_bytes) @@ -1575,7 +1756,7 @@ struct btrfs_data_container *init_data_container(u32 total_bytes) size_t alloc_bytes; alloc_bytes = max_t(size_t, total_bytes, sizeof(*data)); - data = kmalloc(alloc_bytes, GFP_NOFS); + data = vmalloc(alloc_bytes); if (!data) return ERR_PTR(-ENOMEM); @@ -1626,6 +1807,6 @@ void free_ipath(struct inode_fs_paths *ipath) { if (!ipath) return; - kfree(ipath->fspath); + vfree(ipath->fspath); kfree(ipath); } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 032f4dc..e755330 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -33,14 +33,13 @@ struct inode_fs_paths { typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, void *ctx); -typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref, - struct extent_buffer *eb, void *ctx); int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, struct btrfs_path *path); int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, - struct btrfs_path *path, struct btrfs_key *found_key); + struct btrfs_path *path, struct btrfs_key *found_key, + u64 *flags); int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, struct btrfs_extent_item *ei, u32 item_size, @@ -69,4 +68,9 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, struct btrfs_path *path); void free_ipath(struct inode_fs_paths *ipath); +int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, + u64 start_off, struct btrfs_path *path, + struct btrfs_inode_extref **ret_extref, + u64 *found_off); + #endif diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 5b2ad6b..ed8ca7c 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -38,6 +38,7 @@ #define BTRFS_INODE_DELALLOC_META_RESERVED 4 #define BTRFS_INODE_HAS_ORPHAN_ITEM 5 #define BTRFS_INODE_HAS_ASYNC_EXTENT 6 +#define BTRFS_INODE_NEEDS_FULL_SYNC 7 /* in memory btrfs inode */ struct btrfs_inode { @@ -143,6 +144,9 @@ struct btrfs_inode { /* flags field from the on disk inode */ u32 flags; + /* a local copy of root's last_log_commit */ + unsigned long last_log_commit; + /* * Counters to keep track of the number of extent item's we may use due * to delalloc and such. outstanding_extents is the number of extent @@ -202,15 +206,10 @@ static inline bool btrfs_is_free_space_inode(struct inode *inode) static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) { - struct btrfs_root *root = BTRFS_I(inode)->root; - int ret = 0; - - mutex_lock(&root->log_mutex); if (BTRFS_I(inode)->logged_trans == generation && - BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) - ret = 1; - mutex_unlock(&root->log_mutex); - return ret; + BTRFS_I(inode)->last_sub_trans <= BTRFS_I(inode)->last_log_commit) + return 1; + return 0; } #endif diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 9197e2e..5a3e45d 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -37,8 +37,9 @@ * the file system was mounted, (i.e., they have been * referenced by the super block) or they have been * written since then and the write completion callback - * was called and a FLUSH request to the device where - * these blocks are located was received and completed. + * was called and no write error was indicated and a + * FLUSH request to the device where these blocks are + * located was received and completed. * 2b. All referenced blocks need to have a generation * number which is equal to the parent's number. * @@ -2601,6 +2602,17 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, (unsigned long long)l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); ret = -1; + } else if (l->block_ref_to->iodone_w_error) { + printk(KERN_INFO "btrfs: attempt to write superblock" + " which references block %c @%llu (%s/%llu/%d)" + " which has write error!\n", + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long) + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + ret = -1; } else if (l->parent_generation != l->block_ref_to->generation && BTRFSIC_GENERATION_UNKNOWN != diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 43d1c5a..c6467aa 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -577,6 +577,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, u64 em_start; struct extent_map *em; int ret = -ENOMEM; + int faili = 0; u32 *sums; tree = &BTRFS_I(inode)->io_tree; @@ -626,9 +627,13 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, for (pg_index = 0; pg_index < nr_pages; pg_index++) { cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS | __GFP_HIGHMEM); - if (!cb->compressed_pages[pg_index]) + if (!cb->compressed_pages[pg_index]) { + faili = pg_index - 1; + ret = -ENOMEM; goto fail2; + } } + faili = nr_pages - 1; cb->nr_pages = nr_pages; add_ra_bio_pages(inode, em_start + em_len, cb); @@ -713,8 +718,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, return 0; fail2: - for (pg_index = 0; pg_index < nr_pages; pg_index++) - free_page((unsigned long)cb->compressed_pages[pg_index]); + while (faili >= 0) { + __free_page(cb->compressed_pages[faili]); + faili--; + } kfree(cb->compressed_pages); fail1: diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 6d183f6..b334362 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4402,149 +4402,6 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans, } /* - * Given a key and some data, insert items into the tree. - * This does all the path init required, making room in the tree if needed. - * Returns the number of keys that were inserted. - */ -int btrfs_insert_some_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 *data_size, - int nr) -{ - struct extent_buffer *leaf; - struct btrfs_item *item; - int ret = 0; - int slot; - int i; - u32 nritems; - u32 total_data = 0; - u32 total_size = 0; - unsigned int data_end; - struct btrfs_disk_key disk_key; - struct btrfs_key found_key; - struct btrfs_map_token token; - - btrfs_init_map_token(&token); - - for (i = 0; i < nr; i++) { - if (total_size + data_size[i] + sizeof(struct btrfs_item) > - BTRFS_LEAF_DATA_SIZE(root)) { - break; - nr = i; - } - total_data += data_size[i]; - total_size += data_size[i] + sizeof(struct btrfs_item); - } - BUG_ON(nr == 0); - - ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); - if (ret == 0) - return -EEXIST; - if (ret < 0) - goto out; - - leaf = path->nodes[0]; - - nritems = btrfs_header_nritems(leaf); - data_end = leaf_data_end(root, leaf); - - if (btrfs_leaf_free_space(root, leaf) < total_size) { - for (i = nr; i >= 0; i--) { - total_data -= data_size[i]; - total_size -= data_size[i] + sizeof(struct btrfs_item); - if (total_size < btrfs_leaf_free_space(root, leaf)) - break; - } - nr = i; - } - - slot = path->slots[0]; - BUG_ON(slot < 0); - - if (slot != nritems) { - unsigned int old_data = btrfs_item_end_nr(leaf, slot); - - item = btrfs_item_nr(leaf, slot); - btrfs_item_key_to_cpu(leaf, &found_key, slot); - - /* figure out how many keys we can insert in here */ - total_data = data_size[0]; - for (i = 1; i < nr; i++) { - if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0) - break; - total_data += data_size[i]; - } - nr = i; - - if (old_data < data_end) { - btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "slot %d old_data %d data_end %d\n", - slot, old_data, data_end); - BUG_ON(1); - } - /* - * item0..itemN ... dataN.offset..dataN.size .. data0.size - */ - /* first correct the data pointers */ - for (i = slot; i < nritems; i++) { - u32 ioff; - - item = btrfs_item_nr(leaf, i); - ioff = btrfs_token_item_offset(leaf, item, &token); - btrfs_set_token_item_offset(leaf, item, - ioff - total_data, &token); - } - /* shift the items */ - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), - btrfs_item_nr_offset(slot), - (nritems - slot) * sizeof(struct btrfs_item)); - - /* shift the data */ - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + - data_end - total_data, btrfs_leaf_data(leaf) + - data_end, old_data - data_end); - data_end = old_data; - } else { - /* - * this sucks but it has to be done, if we are inserting at - * the end of the leaf only insert 1 of the items, since we - * have no way of knowing whats on the next leaf and we'd have - * to drop our current locks to figure it out - */ - nr = 1; - } - - /* setup the item for the new data */ - for (i = 0; i < nr; i++) { - btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); - btrfs_set_item_key(leaf, &disk_key, slot + i); - item = btrfs_item_nr(leaf, slot + i); - btrfs_set_token_item_offset(leaf, item, - data_end - data_size[i], &token); - data_end -= data_size[i]; - btrfs_set_token_item_size(leaf, item, data_size[i], &token); - } - btrfs_set_header_nritems(leaf, nritems + nr); - btrfs_mark_buffer_dirty(leaf); - - ret = 0; - if (slot == 0) { - btrfs_cpu_key_to_disk(&disk_key, cpu_key); - fixup_low_keys(trans, root, path, &disk_key, 1); - } - - if (btrfs_leaf_free_space(root, leaf) < 0) { - btrfs_print_leaf(root, leaf); - BUG(); - } -out: - if (!ret) - ret = nr; - return ret; -} - -/* * this is a helper for btrfs_insert_empty_items, the main goal here is * to save stack depth by doing the bulk of the work in a function * that doesn't call btrfs_search_slot @@ -5073,6 +4930,7 @@ static void tree_move_down(struct btrfs_root *root, struct btrfs_path *path, int *level, int root_level) { + BUG_ON(*level == 0); path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level], path->slots[*level]); path->slots[*level - 1] = 0; @@ -5089,7 +4947,7 @@ static int tree_move_next_or_upnext(struct btrfs_root *root, path->slots[*level]++; - while (path->slots[*level] == nritems) { + while (path->slots[*level] >= nritems) { if (*level == root_level) return -1; @@ -5433,9 +5291,11 @@ int btrfs_compare_trees(struct btrfs_root *left_root, goto out; advance_right = ADVANCE; } else { + WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); ret = tree_compare_item(left_root, left_path, right_path, tmp_buf); if (ret) { + WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); ret = changed_cb(left_root, right_root, left_path, right_path, &left_key, diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0d195b5..926c9ff 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -116,7 +116,7 @@ struct btrfs_ordered_sum; #define BTRFS_FREE_SPACE_OBJECTID -11ULL /* - * The inode number assigned to the special inode for sotring + * The inode number assigned to the special inode for storing * free ino cache */ #define BTRFS_FREE_INO_OBJECTID -12ULL @@ -154,6 +154,13 @@ struct btrfs_ordered_sum; */ #define BTRFS_NAME_LEN 255 +/* + * Theoretical limit is larger, but we keep this down to a sane + * value. That should limit greatly the possibility of collisions on + * inode ref items. + */ +#define BTRFS_LINK_MAX 65535U + /* 32 bytes in various csum fields */ #define BTRFS_CSUM_SIZE 32 @@ -489,6 +496,8 @@ struct btrfs_super_block { */ #define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5) +#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6) + #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL #define BTRFS_FEATURE_INCOMPAT_SUPP \ @@ -496,7 +505,8 @@ struct btrfs_super_block { BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ - BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO) + BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ + BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) /* * A leaf is full of items. offset and size tell us where to find @@ -643,6 +653,14 @@ struct btrfs_inode_ref { /* name goes here */ } __attribute__ ((__packed__)); +struct btrfs_inode_extref { + __le64 parent_objectid; + __le64 index; + __le16 name_len; + __u8 name[0]; + /* name goes here */ +} __attribute__ ((__packed__)); + struct btrfs_timespec { __le64 sec; __le32 nsec; @@ -1028,12 +1046,22 @@ struct btrfs_space_info { wait_queue_head_t wait; }; +#define BTRFS_BLOCK_RSV_GLOBAL 1 +#define BTRFS_BLOCK_RSV_DELALLOC 2 +#define BTRFS_BLOCK_RSV_TRANS 3 +#define BTRFS_BLOCK_RSV_CHUNK 4 +#define BTRFS_BLOCK_RSV_DELOPS 5 +#define BTRFS_BLOCK_RSV_EMPTY 6 +#define BTRFS_BLOCK_RSV_TEMP 7 + struct btrfs_block_rsv { u64 size; u64 reserved; struct btrfs_space_info *space_info; spinlock_t lock; - unsigned int full; + unsigned short full; + unsigned short type; + unsigned short failfast; }; /* @@ -1127,6 +1155,9 @@ struct btrfs_block_group_cache { * Today it will only have one thing on it, but that may change */ struct list_head cluster_list; + + /* For delayed block group creation */ + struct list_head new_bg_list; }; /* delayed seq elem */ @@ -1240,7 +1271,6 @@ struct btrfs_fs_info { struct mutex reloc_mutex; struct list_head trans_list; - struct list_head hashers; struct list_head dead_roots; struct list_head caching_block_groups; @@ -1366,9 +1396,6 @@ struct btrfs_fs_info { struct rb_root defrag_inodes; atomic_t defrag_running; - spinlock_t ref_cache_lock; - u64 total_ref_cache_size; - /* * these three are in extended format (availability of single * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other @@ -1441,6 +1468,8 @@ struct btrfs_fs_info { /* next backup root to be overwritten */ int backup_root_index; + + int num_tolerated_disk_barrier_failures; }; /* @@ -1481,9 +1510,9 @@ struct btrfs_root { wait_queue_head_t log_commit_wait[2]; atomic_t log_writers; atomic_t log_commit[2]; + atomic_t log_batch; unsigned long log_transid; unsigned long last_log_commit; - unsigned long log_batch; pid_t log_start_pid; bool log_multiple_pids; @@ -1592,6 +1621,7 @@ struct btrfs_ioctl_defrag_range_args { */ #define BTRFS_INODE_ITEM_KEY 1 #define BTRFS_INODE_REF_KEY 12 +#define BTRFS_INODE_EXTREF_KEY 13 #define BTRFS_XATTR_ITEM_KEY 24 #define BTRFS_ORPHAN_ITEM_KEY 48 /* reserve 2-15 close to the inode for later flexibility */ @@ -1978,6 +2008,13 @@ BTRFS_SETGET_STACK_FUNCS(block_group_flags, BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); +/* struct btrfs_inode_extref */ +BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref, + parent_objectid, 64); +BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref, + name_len, 16); +BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64); + /* struct btrfs_inode_item */ BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); @@ -2858,6 +2895,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 size); int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 group_start); +void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root); u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); @@ -2874,8 +2913,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes); void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes); int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes); void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes); -void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv); -struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, + unsigned short type); void btrfs_free_block_rsv(struct btrfs_root *root, struct btrfs_block_rsv *rsv); int btrfs_block_rsv_add(struct btrfs_root *root, @@ -3172,12 +3212,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, u64 inode_objectid, u64 ref_objectid, u64 *index); -struct btrfs_inode_ref * -btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, int mod); +int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, int mod, + u64 *ret_index); int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); @@ -3185,6 +3225,19 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *location, int mod); +struct btrfs_inode_extref * +btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, int ins_len, + int cow); + +int btrfs_find_name_in_ext_backref(struct btrfs_path *path, + u64 ref_objectid, const char *name, + int name_len, + struct btrfs_inode_extref **extref_ret); + /* file-item.c */ int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); @@ -3249,6 +3302,8 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *dir, u64 objectid, const char *name, int name_len); +int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, + int front); int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 new_size, @@ -3308,16 +3363,27 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); int btrfs_defrag_file(struct inode *inode, struct file *file, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_pages); +void btrfs_get_block_group_info(struct list_head *groups_list, + struct btrfs_ioctl_space_info *space); + /* file.c */ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); -int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, - int skip_pinned); +void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + int skip_pinned); +int btrfs_replace_extent_cache(struct inode *inode, struct extent_map *replace, + u64 start, u64 end, int skip_pinned, + int modified); extern const struct file_operations btrfs_file_operations; -int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, - u64 start, u64 end, u64 *hint_byte, int drop_cache); +int __btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct btrfs_path *path, u64 start, u64 end, + u64 *drop_end, int drop_cache); +int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, u64 start, + u64 end, int drop_cache); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct inode *inode, u64 start, u64 end); int btrfs_release_file(struct inode *inode, struct file *file); @@ -3378,6 +3444,11 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, } } +/* + * Call btrfs_abort_transaction as early as possible when an error condition is + * detected, that way the exact line number is reported. + */ + #define btrfs_abort_transaction(trans, root, errno) \ do { \ __btrfs_abort_transaction(trans, root, __func__, \ diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 07d5eeb..478f66b 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -29,7 +29,7 @@ static struct kmem_cache *delayed_node_cache; int __init btrfs_delayed_inode_init(void) { - delayed_node_cache = kmem_cache_create("delayed_node", + delayed_node_cache = kmem_cache_create("btrfs_delayed_node", sizeof(struct btrfs_delayed_node), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, @@ -650,7 +650,7 @@ static int btrfs_delayed_inode_reserve_metadata( * we're accounted for. */ if (!src_rsv || (!trans->bytes_reserved && - src_rsv != &root->fs_info->delalloc_block_rsv)) { + src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); /* * Since we're under a transaction reserve_metadata_bytes could @@ -668,7 +668,7 @@ static int btrfs_delayed_inode_reserve_metadata( num_bytes, 1); } return ret; - } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { + } else if (src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) { spin_lock(&BTRFS_I(inode)->lock); if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, &BTRFS_I(inode)->runtime_flags)) { @@ -1715,8 +1715,8 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *inode_item, struct inode *inode) { - btrfs_set_stack_inode_uid(inode_item, inode->i_uid); - btrfs_set_stack_inode_gid(inode_item, inode->i_gid); + btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode)); + btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode)); btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size); btrfs_set_stack_inode_mode(inode_item, inode->i_mode); btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink); @@ -1764,8 +1764,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) inode_item = &delayed_node->inode_item; - inode->i_uid = btrfs_stack_inode_uid(inode_item); - inode->i_gid = btrfs_stack_inode_gid(inode_item); + i_uid_write(inode, btrfs_stack_inode_uid(inode_item)); + i_gid_write(inode, btrfs_stack_inode_gid(inode_item)); btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); inode->i_mode = btrfs_stack_inode_mode(inode_item); set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index ab53005..c9d7036 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -18,7 +18,7 @@ #ifndef __DELAYED_REF__ #define __DELAYED_REF__ -/* these are the possible values of struct btrfs_delayed_ref->action */ +/* these are the possible values of struct btrfs_delayed_ref_node->action */ #define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */ #define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */ #define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 22e98e0..7cda519 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -46,6 +46,10 @@ #include "check-integrity.h" #include "rcu-string.h" +#ifdef CONFIG_X86 +#include <asm/cpufeature.h> +#endif + static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); @@ -217,26 +221,16 @@ static struct extent_map *btree_get_extent(struct inode *inode, write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); if (ret == -EEXIST) { - u64 failed_start = em->start; - u64 failed_len = em->len; - free_extent_map(em); em = lookup_extent_mapping(em_tree, start, len); - if (em) { - ret = 0; - } else { - em = lookup_extent_mapping(em_tree, failed_start, - failed_len); - ret = -EIO; - } + if (!em) + em = ERR_PTR(-EIO); } else if (ret) { free_extent_map(em); - em = NULL; + em = ERR_PTR(ret); } write_unlock(&em_tree->lock); - if (ret) - em = ERR_PTR(ret); out: return em; } @@ -439,10 +433,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) WARN_ON(1); return 0; } - if (eb->pages[0] != page) { - WARN_ON(1); - return 0; - } if (!PageUptodate(page)) { WARN_ON(1); return 0; @@ -869,10 +859,22 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); } +static int check_async_write(struct inode *inode, unsigned long bio_flags) +{ + if (bio_flags & EXTENT_BIO_TREE_LOG) + return 0; +#ifdef CONFIG_X86 + if (cpu_has_xmm4_2) + return 0; +#endif + return 1; +} + static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { + int async = check_async_write(inode, bio_flags); int ret; if (!(rw & REQ_WRITE)) { @@ -887,6 +889,12 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, return ret; return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 0); + } else if (!async) { + ret = btree_csum_one_bio(bio); + if (ret) + return ret; + return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, + mirror_num, 0); } /* @@ -1168,8 +1176,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, atomic_set(&root->log_commit[0], 0); atomic_set(&root->log_commit[1], 0); atomic_set(&root->log_writers, 0); + atomic_set(&root->log_batch, 0); atomic_set(&root->orphan_inodes, 0); - root->log_batch = 0; root->log_transid = 0; root->last_log_commit = 0; extent_io_tree_init(&root->dirty_log_pages, @@ -1667,9 +1675,10 @@ static int transaction_kthread(void *arg) spin_unlock(&root->fs_info->trans_lock); /* If the file system is aborted, this will always fail. */ - trans = btrfs_join_transaction(root); + trans = btrfs_attach_transaction(root); if (IS_ERR(trans)) { - cannot_commit = true; + if (PTR_ERR(trans) != -ENOENT) + cannot_commit = true; goto sleep; } if (transid == trans->transid) { @@ -1994,13 +2003,11 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); - INIT_LIST_HEAD(&fs_info->hashers); INIT_LIST_HEAD(&fs_info->delalloc_inodes); INIT_LIST_HEAD(&fs_info->ordered_operations); INIT_LIST_HEAD(&fs_info->caching_block_groups); spin_lock_init(&fs_info->delalloc_lock); spin_lock_init(&fs_info->trans_lock); - spin_lock_init(&fs_info->ref_cache_lock); spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); @@ -2014,12 +2021,15 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->space_info); INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); btrfs_mapping_init(&fs_info->mapping_tree); - btrfs_init_block_rsv(&fs_info->global_block_rsv); - btrfs_init_block_rsv(&fs_info->delalloc_block_rsv); - btrfs_init_block_rsv(&fs_info->trans_block_rsv); - btrfs_init_block_rsv(&fs_info->chunk_block_rsv); - btrfs_init_block_rsv(&fs_info->empty_block_rsv); - btrfs_init_block_rsv(&fs_info->delayed_block_rsv); + btrfs_init_block_rsv(&fs_info->global_block_rsv, + BTRFS_BLOCK_RSV_GLOBAL); + btrfs_init_block_rsv(&fs_info->delalloc_block_rsv, + BTRFS_BLOCK_RSV_DELALLOC); + btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); + btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK); + btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); + btrfs_init_block_rsv(&fs_info->delayed_block_rsv, + BTRFS_BLOCK_RSV_DELOPS); atomic_set(&fs_info->nr_async_submits, 0); atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->async_submit_draining, 0); @@ -2491,6 +2501,8 @@ retry_root_backup: printk(KERN_ERR "Failed to read block groups: %d\n", ret); goto fail_block_groups; } + fs_info->num_tolerated_disk_barrier_failures = + btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, "btrfs-cleaner"); @@ -2874,12 +2886,10 @@ static int write_dev_flush(struct btrfs_device *device, int wait) printk_in_rcu("btrfs: disabling barriers on dev %s\n", rcu_str_deref(device->name)); device->nobarriers = 1; - } - if (!bio_flagged(bio, BIO_UPTODATE)) { + } else if (!bio_flagged(bio, BIO_UPTODATE)) { ret = -EIO; - if (!bio_flagged(bio, BIO_EOPNOTSUPP)) - btrfs_dev_stat_inc_and_print(device, - BTRFS_DEV_STAT_FLUSH_ERRS); + btrfs_dev_stat_inc_and_print(device, + BTRFS_DEV_STAT_FLUSH_ERRS); } /* drop the reference from the wait == 0 run */ @@ -2918,14 +2928,15 @@ static int barrier_all_devices(struct btrfs_fs_info *info) { struct list_head *head; struct btrfs_device *dev; - int errors = 0; + int errors_send = 0; + int errors_wait = 0; int ret; /* send down all the barriers */ head = &info->fs_devices->devices; list_for_each_entry_rcu(dev, head, dev_list) { if (!dev->bdev) { - errors++; + errors_send++; continue; } if (!dev->in_fs_metadata || !dev->writeable) @@ -2933,13 +2944,13 @@ static int barrier_all_devices(struct btrfs_fs_info *info) ret = write_dev_flush(dev, 0); if (ret) - errors++; + errors_send++; } /* wait for all the barriers */ list_for_each_entry_rcu(dev, head, dev_list) { if (!dev->bdev) { - errors++; + errors_wait++; continue; } if (!dev->in_fs_metadata || !dev->writeable) @@ -2947,13 +2958,87 @@ static int barrier_all_devices(struct btrfs_fs_info *info) ret = write_dev_flush(dev, 1); if (ret) - errors++; + errors_wait++; } - if (errors) + if (errors_send > info->num_tolerated_disk_barrier_failures || + errors_wait > info->num_tolerated_disk_barrier_failures) return -EIO; return 0; } +int btrfs_calc_num_tolerated_disk_barrier_failures( + struct btrfs_fs_info *fs_info) +{ + struct btrfs_ioctl_space_info space; + struct btrfs_space_info *sinfo; + u64 types[] = {BTRFS_BLOCK_GROUP_DATA, + BTRFS_BLOCK_GROUP_SYSTEM, + BTRFS_BLOCK_GROUP_METADATA, + BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA}; + int num_types = 4; + int i; + int c; + int num_tolerated_disk_barrier_failures = + (int)fs_info->fs_devices->num_devices; + + for (i = 0; i < num_types; i++) { + struct btrfs_space_info *tmp; + + sinfo = NULL; + rcu_read_lock(); + list_for_each_entry_rcu(tmp, &fs_info->space_info, list) { + if (tmp->flags == types[i]) { + sinfo = tmp; + break; + } + } + rcu_read_unlock(); + + if (!sinfo) + continue; + + down_read(&sinfo->groups_sem); + for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { + if (!list_empty(&sinfo->block_groups[c])) { + u64 flags; + + btrfs_get_block_group_info( + &sinfo->block_groups[c], &space); + if (space.total_bytes == 0 || + space.used_bytes == 0) + continue; + flags = space.flags; + /* + * return + * 0: if dup, single or RAID0 is configured for + * any of metadata, system or data, else + * 1: if RAID5 is configured, or if RAID1 or + * RAID10 is configured and only two mirrors + * are used, else + * 2: if RAID6 is configured, else + * num_mirrors - 1: if RAID1 or RAID10 is + * configured and more than + * 2 mirrors are used. + */ + if (num_tolerated_disk_barrier_failures > 0 && + ((flags & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID0)) || + ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) + == 0))) + num_tolerated_disk_barrier_failures = 0; + else if (num_tolerated_disk_barrier_failures > 1 + && + (flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10))) + num_tolerated_disk_barrier_failures = 1; + } + } + up_read(&sinfo->groups_sem); + } + + return num_tolerated_disk_barrier_failures; +} + int write_all_supers(struct btrfs_root *root, int max_mirrors) { struct list_head *head; @@ -2976,8 +3061,16 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) mutex_lock(&root->fs_info->fs_devices->device_list_mutex); head = &root->fs_info->fs_devices->devices; - if (do_barriers) - barrier_all_devices(root->fs_info); + if (do_barriers) { + ret = barrier_all_devices(root->fs_info); + if (ret) { + mutex_unlock( + &root->fs_info->fs_devices->device_list_mutex); + btrfs_error(root->fs_info, ret, + "errors while submitting device barriers."); + return ret; + } + } list_for_each_entry_rcu(dev, head, dev_list) { if (!dev->bdev) { @@ -3211,10 +3304,6 @@ int close_ctree(struct btrfs_root *root) printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", (unsigned long long)fs_info->delalloc_bytes); } - if (fs_info->total_ref_cache_size) { - printk(KERN_INFO "btrfs: at umount reference cache size %llu\n", - (unsigned long long)fs_info->total_ref_cache_size); - } free_extent_buffer(fs_info->extent_root->node); free_extent_buffer(fs_info->extent_root->commit_root); @@ -3360,52 +3449,6 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) return btree_read_extent_buffer_pages(root, buf, 0, parent_transid); } -int btree_lock_page_hook(struct page *page, void *data, - void (*flush_fn)(void *)) -{ - struct inode *inode = page->mapping->host; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_buffer *eb; - - /* - * We culled this eb but the page is still hanging out on the mapping, - * carry on. - */ - if (!PagePrivate(page)) - goto out; - - eb = (struct extent_buffer *)page->private; - if (!eb) { - WARN_ON(1); - goto out; - } - if (page != eb->pages[0]) - goto out; - - if (!btrfs_try_tree_write_lock(eb)) { - flush_fn(data); - btrfs_tree_lock(eb); - } - btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); - - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { - spin_lock(&root->fs_info->delalloc_lock); - if (root->fs_info->dirty_metadata_bytes >= eb->len) - root->fs_info->dirty_metadata_bytes -= eb->len; - else - WARN_ON(1); - spin_unlock(&root->fs_info->delalloc_lock); - } - - btrfs_tree_unlock(eb); -out: - if (!trylock_page(page)) { - flush_fn(data); - lock_page(page); - } - return 0; -} - static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int read_only) { @@ -3608,7 +3651,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, while (1) { ret = find_first_extent_bit(dirty_pages, start, &start, &end, - mark); + mark, NULL); if (ret) break; @@ -3663,7 +3706,7 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root, again: while (1) { ret = find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY); + EXTENT_DIRTY, NULL); if (ret) break; @@ -3800,7 +3843,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) } static struct extent_io_ops btree_extent_io_ops = { - .write_cache_pages_lock_hook = btree_lock_page_hook, .readpage_end_io_hook = btree_readpage_end_io_hook, .readpage_io_failed_hook = btree_io_failed_hook, .submit_bio_hook = btree_submit_bio_hook, diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index c5b00a7..2025a91 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -95,6 +95,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid); int btree_lock_page_hook(struct page *page, void *data, void (*flush_fn)(void *)); +int btrfs_calc_num_tolerated_disk_barrier_failures( + struct btrfs_fs_info *fs_info); #ifdef CONFIG_DEBUG_LOCK_ALLOC void btrfs_init_lockdep(void); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ba58024..3d3e2c1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -94,8 +94,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, u64 flags, struct btrfs_disk_key *key, int level, struct btrfs_key *ins); static int do_chunk_alloc(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, u64 alloc_bytes, - u64 flags, int force); + struct btrfs_root *extent_root, u64 flags, + int force); static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); static void dump_space_info(struct btrfs_space_info *info, u64 bytes, @@ -312,7 +312,8 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, while (start < end) { ret = find_first_extent_bit(info->pinned_extents, start, &extent_start, &extent_end, - EXTENT_DIRTY | EXTENT_UPTODATE); + EXTENT_DIRTY | EXTENT_UPTODATE, + NULL); if (ret) break; @@ -2361,10 +2362,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, } next: - do_chunk_alloc(trans, fs_info->extent_root, - 2 * 1024 * 1024, - btrfs_get_alloc_profile(root, 0), - CHUNK_ALLOC_NO_FORCE); cond_resched(); spin_lock(&delayed_refs->lock); } @@ -2478,10 +2475,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, if (root == root->fs_info->extent_root) root = root->fs_info->tree_root; - do_chunk_alloc(trans, root->fs_info->extent_root, - 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0), - CHUNK_ALLOC_NO_FORCE); - btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); delayed_refs = &trans->transaction->delayed_refs; @@ -2551,6 +2544,12 @@ again: } if (run_all) { + if (!list_empty(&trans->new_bgs)) { + spin_unlock(&delayed_refs->lock); + btrfs_create_pending_block_groups(trans, root); + spin_lock(&delayed_refs->lock); + } + node = rb_first(&delayed_refs->root); if (!node) goto out; @@ -3406,7 +3405,6 @@ alloc: return PTR_ERR(trans); ret = do_chunk_alloc(trans, root->fs_info->extent_root, - bytes + 2 * 1024 * 1024, alloc_target, CHUNK_ALLOC_NO_FORCE); btrfs_end_transaction(trans, root); @@ -3488,8 +3486,7 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) } static int should_alloc_chunk(struct btrfs_root *root, - struct btrfs_space_info *sinfo, u64 alloc_bytes, - int force) + struct btrfs_space_info *sinfo, int force) { struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; @@ -3504,7 +3501,8 @@ static int should_alloc_chunk(struct btrfs_root *root, * and purposes it's used space. Don't worry about locking the * global_rsv, it doesn't change except when the transaction commits. */ - num_allocated += global_rsv->size; + if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA) + num_allocated += global_rsv->size; /* * in limited mode, we want to have some free space up to @@ -3518,15 +3516,8 @@ static int should_alloc_chunk(struct btrfs_root *root, if (num_bytes - num_allocated < thresh) return 1; } - thresh = btrfs_super_total_bytes(root->fs_info->super_copy); - /* 256MB or 2% of the FS */ - thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2)); - /* system chunks need a much small threshold */ - if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM) - thresh = 32 * 1024 * 1024; - - if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8)) + if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8)) return 0; return 1; } @@ -3576,8 +3567,7 @@ static void check_system_chunk(struct btrfs_trans_handle *trans, } static int do_chunk_alloc(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, u64 alloc_bytes, - u64 flags, int force) + struct btrfs_root *extent_root, u64 flags, int force) { struct btrfs_space_info *space_info; struct btrfs_fs_info *fs_info = extent_root->fs_info; @@ -3601,7 +3591,7 @@ again: return 0; } - if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) { + if (!should_alloc_chunk(extent_root, space_info, force)) { spin_unlock(&space_info->lock); return 0; } else if (space_info->chunk_alloc) { @@ -3669,6 +3659,46 @@ out: return ret; } +static int can_overcommit(struct btrfs_root *root, + struct btrfs_space_info *space_info, u64 bytes, + int flush) +{ + u64 profile = btrfs_get_alloc_profile(root, 0); + u64 avail; + u64 used; + + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; + + spin_lock(&root->fs_info->free_chunk_lock); + avail = root->fs_info->free_chunk_space; + spin_unlock(&root->fs_info->free_chunk_lock); + + /* + * If we have dup, raid1 or raid10 then only half of the free + * space is actually useable. + */ + if (profile & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + avail >>= 1; + + /* + * If we aren't flushing don't let us overcommit too much, say + * 1/8th of the space. If we can flush, let it overcommit up to + * 1/2 of the space. + */ + if (flush) + avail >>= 3; + else + avail >>= 1; + + if (used + bytes < space_info->total_bytes + avail) + return 1; + return 0; +} + /* * shrink metadata reservation for delalloc */ @@ -3693,7 +3723,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, if (delalloc_bytes == 0) { if (trans) return; - btrfs_wait_ordered_extents(root, 0, 0); + btrfs_wait_ordered_extents(root, 0); return; } @@ -3703,11 +3733,15 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, WB_REASON_FS_FREE_SPACE); + /* + * We need to wait for the async pages to actually start before + * we do anything. + */ + wait_event(root->fs_info->async_submit_wait, + !atomic_read(&root->fs_info->async_delalloc_pages)); + spin_lock(&space_info->lock); - if (space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use + orig <= - space_info->total_bytes) { + if (can_overcommit(root, space_info, orig, !trans)) { spin_unlock(&space_info->lock); break; } @@ -3715,7 +3749,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, loops++; if (wait_ordered && !trans) { - btrfs_wait_ordered_extents(root, 0, 0); + btrfs_wait_ordered_extents(root, 0); } else { time_left = schedule_timeout_killable(1); if (time_left) @@ -3784,11 +3818,12 @@ commit: } enum flush_state { - FLUSH_DELALLOC = 1, - FLUSH_DELALLOC_WAIT = 2, - FLUSH_DELAYED_ITEMS_NR = 3, - FLUSH_DELAYED_ITEMS = 4, - COMMIT_TRANS = 5, + FLUSH_DELAYED_ITEMS_NR = 1, + FLUSH_DELAYED_ITEMS = 2, + FLUSH_DELALLOC = 3, + FLUSH_DELALLOC_WAIT = 4, + ALLOC_CHUNK = 5, + COMMIT_TRANS = 6, }; static int flush_space(struct btrfs_root *root, @@ -3800,11 +3835,6 @@ static int flush_space(struct btrfs_root *root, int ret = 0; switch (state) { - case FLUSH_DELALLOC: - case FLUSH_DELALLOC_WAIT: - shrink_delalloc(root, num_bytes, orig_bytes, - state == FLUSH_DELALLOC_WAIT); - break; case FLUSH_DELAYED_ITEMS_NR: case FLUSH_DELAYED_ITEMS: if (state == FLUSH_DELAYED_ITEMS_NR) { @@ -3825,6 +3855,24 @@ static int flush_space(struct btrfs_root *root, ret = btrfs_run_delayed_items_nr(trans, root, nr); btrfs_end_transaction(trans, root); break; + case FLUSH_DELALLOC: + case FLUSH_DELALLOC_WAIT: + shrink_delalloc(root, num_bytes, orig_bytes, + state == FLUSH_DELALLOC_WAIT); + break; + case ALLOC_CHUNK: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + btrfs_get_alloc_profile(root, 0), + CHUNK_ALLOC_NO_FORCE); + btrfs_end_transaction(trans, root); + if (ret == -ENOSPC) + ret = 0; + break; case COMMIT_TRANS: ret = may_commit_transaction(root, space_info, orig_bytes, 0); break; @@ -3856,10 +3904,9 @@ static int reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_space_info *space_info = block_rsv->space_info; u64 used; u64 num_bytes = orig_bytes; - int flush_state = FLUSH_DELALLOC; + int flush_state = FLUSH_DELAYED_ITEMS_NR; int ret = 0; bool flushing = false; - bool committed = false; again: ret = 0; @@ -3922,57 +3969,12 @@ again: (orig_bytes * 2); } - if (ret) { - u64 profile = btrfs_get_alloc_profile(root, 0); - u64 avail; - - /* - * If we have a lot of space that's pinned, don't bother doing - * the overcommit dance yet and just commit the transaction. - */ - avail = (space_info->total_bytes - space_info->bytes_used) * 8; - do_div(avail, 10); - if (space_info->bytes_pinned >= avail && flush && !committed) { - space_info->flush = 1; - flushing = true; - spin_unlock(&space_info->lock); - ret = may_commit_transaction(root, space_info, - orig_bytes, 1); - if (ret) - goto out; - committed = true; - goto again; - } - - spin_lock(&root->fs_info->free_chunk_lock); - avail = root->fs_info->free_chunk_space; - - /* - * If we have dup, raid1 or raid10 then only half of the free - * space is actually useable. - */ - if (profile & (BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10)) - avail >>= 1; - - /* - * If we aren't flushing don't let us overcommit too much, say - * 1/8th of the space. If we can flush, let it overcommit up to - * 1/2 of the space. - */ - if (flush) - avail >>= 3; - else - avail >>= 1; - spin_unlock(&root->fs_info->free_chunk_lock); - - if (used + num_bytes < space_info->total_bytes + avail) { - space_info->bytes_may_use += orig_bytes; - trace_btrfs_space_reservation(root->fs_info, - "space_info", space_info->flags, orig_bytes, 1); - ret = 0; - } + if (ret && can_overcommit(root, space_info, orig_bytes, flush)) { + space_info->bytes_may_use += orig_bytes; + trace_btrfs_space_reservation(root->fs_info, "space_info", + space_info->flags, orig_bytes, + 1); + ret = 0; } /* @@ -4114,13 +4116,15 @@ static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src, return 0; } -void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv) +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) { memset(rsv, 0, sizeof(*rsv)); spin_lock_init(&rsv->lock); + rsv->type = type; } -struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, + unsigned short type) { struct btrfs_block_rsv *block_rsv; struct btrfs_fs_info *fs_info = root->fs_info; @@ -4129,7 +4133,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) if (!block_rsv) return NULL; - btrfs_init_block_rsv(block_rsv); + btrfs_init_block_rsv(block_rsv, type); block_rsv->space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); return block_rsv; @@ -4138,6 +4142,8 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) void btrfs_free_block_rsv(struct btrfs_root *root, struct btrfs_block_rsv *rsv) { + if (!rsv) + return; btrfs_block_rsv_release(root, rsv, (u64)-1); kfree(rsv); } @@ -4416,10 +4422,10 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); struct btrfs_block_rsv *dst_rsv = &pending->block_rsv; /* - * two for root back/forward refs, two for directory entries - * and one for root of the snapshot. + * two for root back/forward refs, two for directory entries, + * one for root of the snapshot and one for parent inode. */ - u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5); + u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6); dst_rsv->space_info = src_rsv->space_info; return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); } @@ -5018,7 +5024,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, while (1) { ret = find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY); + EXTENT_DIRTY, NULL); if (ret) break; @@ -5096,8 +5102,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = remove_extent_backref(trans, extent_root, path, NULL, refs_to_drop, is_data); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } btrfs_release_path(path); path->leave_spinning = 1; @@ -5115,8 +5123,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_print_leaf(extent_root, path->nodes[0]); } - if (ret < 0) - goto abort; + if (ret < 0) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } extent_slot = path->slots[0]; } } else if (ret == -ENOENT) { @@ -5130,7 +5140,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, (unsigned long long)owner_objectid, (unsigned long long)owner_offset); } else { - goto abort; + btrfs_abort_transaction(trans, extent_root, ret); + goto out; } leaf = path->nodes[0]; @@ -5140,8 +5151,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, BUG_ON(found_extent || extent_slot != path->slots[0]); ret = convert_extent_item_v0(trans, extent_root, path, owner_objectid, 0); - if (ret < 0) - goto abort; + if (ret < 0) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } btrfs_release_path(path); path->leave_spinning = 1; @@ -5158,8 +5171,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, (unsigned long long)bytenr); btrfs_print_leaf(extent_root, path->nodes[0]); } - if (ret < 0) - goto abort; + if (ret < 0) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } + extent_slot = path->slots[0]; leaf = path->nodes[0]; item_size = btrfs_item_size_nr(leaf, extent_slot); @@ -5196,8 +5212,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = remove_extent_backref(trans, extent_root, path, iref, refs_to_drop, is_data); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } } } else { if (found_extent) { @@ -5214,27 +5232,29 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } btrfs_release_path(path); if (is_data) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } } ret = update_block_group(trans, root, bytenr, num_bytes, 0); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } } out: btrfs_free_path(path); return ret; - -abort: - btrfs_abort_transaction(trans, extent_root, ret); - goto out; } /* @@ -5497,8 +5517,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *used_block_group; u64 search_start = 0; int empty_cluster = 2 * 1024 * 1024; - int allowed_chunk_alloc = 0; - int done_chunk_alloc = 0; struct btrfs_space_info *space_info; int loop = 0; int index = 0; @@ -5530,9 +5548,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, if (btrfs_mixed_space_info(space_info)) use_cluster = false; - if (orig_root->ref_cows || empty_size) - allowed_chunk_alloc = 1; - if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { last_ptr = &root->fs_info->meta_alloc_cluster; if (!btrfs_test_opt(root, SSD)) @@ -5806,10 +5821,6 @@ checks: trace_btrfs_reserve_extent(orig_root, block_group, search_start, num_bytes); - if (offset < search_start) - btrfs_add_free_space(used_block_group, offset, - search_start - offset); - BUG_ON(offset > search_start); if (used_block_group != block_group) btrfs_put_block_group(used_block_group); btrfs_put_block_group(block_group); @@ -5842,34 +5853,17 @@ loop: index = 0; loop++; if (loop == LOOP_ALLOC_CHUNK) { - if (allowed_chunk_alloc) { - ret = do_chunk_alloc(trans, root, num_bytes + - 2 * 1024 * 1024, data, - CHUNK_ALLOC_LIMITED); - /* - * Do not bail out on ENOSPC since we - * can do more things. - */ - if (ret < 0 && ret != -ENOSPC) { - btrfs_abort_transaction(trans, - root, ret); - goto out; - } - allowed_chunk_alloc = 0; - if (ret == 1) - done_chunk_alloc = 1; - } else if (!done_chunk_alloc && - space_info->force_alloc == - CHUNK_ALLOC_NO_FORCE) { - space_info->force_alloc = CHUNK_ALLOC_LIMITED; + ret = do_chunk_alloc(trans, root, data, + CHUNK_ALLOC_FORCE); + /* + * Do not bail out on ENOSPC since we + * can do more things. + */ + if (ret < 0 && ret != -ENOSPC) { + btrfs_abort_transaction(trans, + root, ret); + goto out; } - - /* - * We didn't allocate a chunk, go ahead and drop the - * empty size and loop again. - */ - if (!done_chunk_alloc) - loop = LOOP_NO_EMPTY_SIZE; } if (loop == LOOP_NO_EMPTY_SIZE) { @@ -5944,20 +5938,6 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, data = btrfs_get_alloc_profile(root, data); again: - /* - * the only place that sets empty_size is btrfs_realloc_node, which - * is not called recursively on allocations - */ - if (empty_size || root->ref_cows) { - ret = do_chunk_alloc(trans, root->fs_info->extent_root, - num_bytes + 2 * 1024 * 1024, data, - CHUNK_ALLOC_NO_FORCE); - if (ret < 0 && ret != -ENOSPC) { - btrfs_abort_transaction(trans, root, ret); - return ret; - } - } - WARN_ON(num_bytes < root->sectorsize); ret = find_free_extent(trans, root, num_bytes, empty_size, hint_byte, ins, data); @@ -5967,12 +5947,6 @@ again: num_bytes = num_bytes >> 1; num_bytes = num_bytes & ~(root->sectorsize - 1); num_bytes = max(num_bytes, min_alloc_size); - ret = do_chunk_alloc(trans, root->fs_info->extent_root, - num_bytes, data, CHUNK_ALLOC_FORCE); - if (ret < 0 && ret != -ENOSPC) { - btrfs_abort_transaction(trans, root, ret); - return ret; - } if (num_bytes == min_alloc_size) final_tried = true; goto again; @@ -6314,7 +6288,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, ret = block_rsv_use_bytes(block_rsv, blocksize); if (!ret) return block_rsv; - if (ret) { + if (ret && !block_rsv->failfast) { static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, /*DEFAULT_RATELIMIT_BURST*/ 2); @@ -7279,7 +7253,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, alloc_flags = update_block_group_flags(root, cache->flags); if (alloc_flags != cache->flags) { - ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, + ret = do_chunk_alloc(trans, root, alloc_flags, CHUNK_ALLOC_FORCE); if (ret < 0) goto out; @@ -7289,7 +7263,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, if (!ret) goto out; alloc_flags = get_alloc_profile(root, cache->space_info->flags); - ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, + ret = do_chunk_alloc(trans, root, alloc_flags, CHUNK_ALLOC_FORCE); if (ret < 0) goto out; @@ -7303,7 +7277,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 type) { u64 alloc_flags = get_alloc_profile(root, type); - return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, + return do_chunk_alloc(trans, root, alloc_flags, CHUNK_ALLOC_FORCE); } @@ -7810,6 +7784,34 @@ error: return ret; } +void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_group_cache *block_group, *tmp; + struct btrfs_root *extent_root = root->fs_info->extent_root; + struct btrfs_block_group_item item; + struct btrfs_key key; + int ret = 0; + + list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, + new_bg_list) { + list_del_init(&block_group->new_bg_list); + + if (ret) + continue; + + spin_lock(&block_group->lock); + memcpy(&item, &block_group->item, sizeof(item)); + memcpy(&key, &block_group->key, sizeof(key)); + spin_unlock(&block_group->lock); + + ret = btrfs_insert_item(trans, extent_root, &key, &item, + sizeof(item)); + if (ret) + btrfs_abort_transaction(trans, extent_root, ret); + } +} + int btrfs_make_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytes_used, u64 type, u64 chunk_objectid, u64 chunk_offset, @@ -7843,6 +7845,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, spin_lock_init(&cache->lock); INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); + INIT_LIST_HEAD(&cache->new_bg_list); btrfs_init_free_space_ctl(cache); @@ -7874,12 +7877,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, ret = btrfs_add_block_group_cache(root->fs_info, cache); BUG_ON(ret); /* Logic error */ - ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item, - sizeof(cache->item)); - if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); - return ret; - } + list_add_tail(&cache->new_bg_list, &trans->new_bgs); set_avail_alloc_bits(extent_root->fs_info, type); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4c87847..8036d3a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -45,6 +45,7 @@ struct extent_page_data { struct bio *bio; struct extent_io_tree *tree; get_extent_t *get_extent; + unsigned long bio_flags; /* tells writepage not to lock the state bits for this range * it still does the unlocking @@ -64,13 +65,13 @@ tree_fs_info(struct extent_io_tree *tree) int __init extent_io_init(void) { - extent_state_cache = kmem_cache_create("extent_state", + extent_state_cache = kmem_cache_create("btrfs_extent_state", sizeof(struct extent_state), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_state_cache) return -ENOMEM; - extent_buffer_cache = kmem_cache_create("extent_buffers", + extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", sizeof(struct extent_buffer), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_buffer_cache) @@ -107,6 +108,12 @@ void extent_io_exit(void) list_del(&eb->leak_list); kmem_cache_free(extent_buffer_cache, eb); } + + /* + * Make sure all delayed rcu free are flushed before we + * destroy caches. + */ + rcu_barrier(); if (extent_state_cache) kmem_cache_destroy(extent_state_cache); if (extent_buffer_cache) @@ -936,6 +943,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, * @end: the end offset in bytes (inclusive) * @bits: the bits to set in this range * @clear_bits: the bits to clear in this range + * @cached_state: state that we're going to cache * @mask: the allocation mask * * This will go through and set bits for the given range. If any states exist @@ -945,7 +953,8 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, * boundary bits like LOCK. */ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int clear_bits, gfp_t mask) + int bits, int clear_bits, + struct extent_state **cached_state, gfp_t mask) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -962,6 +971,15 @@ again: } spin_lock(&tree->lock); + if (cached_state && *cached_state) { + state = *cached_state; + if (state->start <= start && state->end > start && + state->tree) { + node = &state->rb_node; + goto hit_next; + } + } + /* * this search will find all the extents that end after * our range starts. @@ -992,6 +1010,7 @@ hit_next: */ if (state->start == start && state->end <= end) { set_state_bits(tree, state, &bits); + cache_state(state, cached_state); state = clear_state_bit(tree, state, &clear_bits, 0); if (last_end == (u64)-1) goto out; @@ -1032,6 +1051,7 @@ hit_next: goto out; if (state->end <= end) { set_state_bits(tree, state, &bits); + cache_state(state, cached_state); state = clear_state_bit(tree, state, &clear_bits, 0); if (last_end == (u64)-1) goto out; @@ -1070,6 +1090,7 @@ hit_next: &bits); if (err) extent_io_tree_panic(tree, err); + cache_state(prealloc, cached_state); prealloc = NULL; start = this_end + 1; goto search_again; @@ -1092,6 +1113,7 @@ hit_next: extent_io_tree_panic(tree, err); set_state_bits(tree, prealloc, &bits); + cache_state(prealloc, cached_state); clear_state_bit(tree, prealloc, &clear_bits, 0); prealloc = NULL; goto out; @@ -1144,6 +1166,14 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, NULL, cached_state, mask); } +int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask) +{ + return set_extent_bit(tree, start, end, + EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, + NULL, cached_state, mask); +} + int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { @@ -1288,18 +1318,42 @@ out: * If nothing was found, 1 is returned. If found something, return 0. */ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, int bits) + u64 *start_ret, u64 *end_ret, int bits, + struct extent_state **cached_state) { struct extent_state *state; + struct rb_node *n; int ret = 1; spin_lock(&tree->lock); + if (cached_state && *cached_state) { + state = *cached_state; + if (state->end == start - 1 && state->tree) { + n = rb_next(&state->rb_node); + while (n) { + state = rb_entry(n, struct extent_state, + rb_node); + if (state->state & bits) + goto got_it; + n = rb_next(n); + } + free_extent_state(*cached_state); + *cached_state = NULL; + goto out; + } + free_extent_state(*cached_state); + *cached_state = NULL; + } + state = find_first_extent_bit_state(tree, start, bits); +got_it: if (state) { + cache_state(state, cached_state); *start_ret = state->start; *end_ret = state->end; ret = 0; } +out: spin_unlock(&tree->lock); return ret; } @@ -2062,7 +2116,7 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, } read_unlock(&em_tree->lock); - if (!em || IS_ERR(em)) { + if (!em) { kfree(failrec); return -EIO; } @@ -2298,8 +2352,8 @@ static void end_bio_extent_readpage(struct bio *bio, int err) struct extent_state *cached = NULL; struct extent_state *state; - pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, " - "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err, + pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " + "mirror=%ld\n", (u64)bio->bi_sector, err, (long int)bio->bi_bdev); tree = &BTRFS_I(page->mapping->host)->io_tree; @@ -2703,12 +2757,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree, end_bio_extent_readpage, mirror_num, *bio_flags, this_bio_flag); - BUG_ON(ret == -ENOMEM); - nr++; - *bio_flags = this_bio_flag; + if (!ret) { + nr++; + *bio_flags = this_bio_flag; + } } - if (ret) + if (ret) { SetPageError(page); + unlock_extent(tree, cur, cur + iosize - 1); + } cur = cur + iosize; pg_offset += iosize; } @@ -3155,12 +3212,16 @@ static int write_one_eb(struct extent_buffer *eb, struct block_device *bdev = fs_info->fs_devices->latest_bdev; u64 offset = eb->start; unsigned long i, num_pages; + unsigned long bio_flags = 0; int rw = (epd->sync_io ? WRITE_SYNC : WRITE); int ret = 0; clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); num_pages = num_extent_pages(eb->start, eb->len); atomic_set(&eb->io_pages, num_pages); + if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID) + bio_flags = EXTENT_BIO_TREE_LOG; + for (i = 0; i < num_pages; i++) { struct page *p = extent_buffer_page(eb, i); @@ -3169,7 +3230,8 @@ static int write_one_eb(struct extent_buffer *eb, ret = submit_extent_page(rw, eb->tree, p, offset >> 9, PAGE_CACHE_SIZE, 0, bdev, &epd->bio, -1, end_bio_extent_buffer_writepage, - 0, 0, 0); + 0, epd->bio_flags, bio_flags); + epd->bio_flags = bio_flags; if (ret) { set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); SetPageError(p); @@ -3204,6 +3266,7 @@ int btree_write_cache_pages(struct address_space *mapping, .tree = tree, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, + .bio_flags = 0, }; int ret = 0; int done = 0; @@ -3248,19 +3311,34 @@ retry: break; } + spin_lock(&mapping->private_lock); + if (!PagePrivate(page)) { + spin_unlock(&mapping->private_lock); + continue; + } + eb = (struct extent_buffer *)page->private; + + /* + * Shouldn't happen and normally this would be a BUG_ON + * but no sense in crashing the users box for something + * we can survive anyway. + */ if (!eb) { + spin_unlock(&mapping->private_lock); WARN_ON(1); continue; } - if (eb == prev_eb) + if (eb == prev_eb) { + spin_unlock(&mapping->private_lock); continue; + } - if (!atomic_inc_not_zero(&eb->refs)) { - WARN_ON(1); + ret = atomic_inc_not_zero(&eb->refs); + spin_unlock(&mapping->private_lock); + if (!ret) continue; - } prev_eb = eb; ret = lock_extent_buffer_for_io(eb, fs_info, &epd); @@ -3451,7 +3529,7 @@ static void flush_epd_write_bio(struct extent_page_data *epd) if (epd->sync_io) rw = WRITE_SYNC; - ret = submit_one_bio(rw, epd->bio, 0, 0); + ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags); BUG_ON(ret < 0); /* -ENOMEM */ epd->bio = NULL; } @@ -3474,6 +3552,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, .get_extent = get_extent, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, + .bio_flags = 0, }; ret = __extent_writepage(page, wbc, &epd); @@ -3498,6 +3577,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, .get_extent = get_extent, .extent_locked = 1, .sync_io = mode == WB_SYNC_ALL, + .bio_flags = 0, }; struct writeback_control wbc_writepages = { .sync_mode = mode, @@ -3537,6 +3617,7 @@ int extent_writepages(struct extent_io_tree *tree, .get_extent = get_extent, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, + .bio_flags = 0, }; ret = extent_write_cache_pages(tree, mapping, wbc, @@ -3914,18 +3995,6 @@ out: return ret; } -inline struct page *extent_buffer_page(struct extent_buffer *eb, - unsigned long i) -{ - return eb->pages[i]; -} - -inline unsigned long num_extent_pages(u64 start, u64 len) -{ - return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - - (start >> PAGE_CACHE_SHIFT); -} - static void __free_extent_buffer(struct extent_buffer *eb) { #if LEAK_DEBUG @@ -4041,7 +4110,7 @@ struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len) return eb; err: - for (i--; i > 0; i--) + for (i--; i >= 0; i--) __free_page(eb->pages[i]); __free_extent_buffer(eb); return NULL; @@ -4186,10 +4255,8 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, for (i = 0; i < num_pages; i++, index++) { p = find_or_create_page(mapping, index, GFP_NOFS); - if (!p) { - WARN_ON(1); + if (!p) goto free_eb; - } spin_lock(&mapping->private_lock); if (PagePrivate(p)) { @@ -4332,7 +4399,6 @@ static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask) /* Should be safe to release our pages at this point */ btrfs_release_extent_buffer_page(eb, 0); - call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); return 1; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 25900af..711d12b 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -27,6 +27,7 @@ * type for this bio */ #define EXTENT_BIO_COMPRESSED 1 +#define EXTENT_BIO_TREE_LOG 2 #define EXTENT_BIO_FLAG_SHIFT 16 /* these are bit numbers for test/set bit */ @@ -232,11 +233,15 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int clear_bits, gfp_t mask); + int bits, int clear_bits, + struct extent_state **cached_state, gfp_t mask); int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); +int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask); int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, int bits); + u64 *start_ret, u64 *end_ret, int bits, + struct extent_state **cached_state); struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, int bits); int extent_invalidatepage(struct extent_io_tree *tree, @@ -277,8 +282,18 @@ void free_extent_buffer_stale(struct extent_buffer *eb); int read_extent_buffer_pages(struct extent_io_tree *tree, struct extent_buffer *eb, u64 start, int wait, get_extent_t *get_extent, int mirror_num); -unsigned long num_extent_pages(u64 start, u64 len); -struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i); + +static inline unsigned long num_extent_pages(u64 start, u64 len) +{ + return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - + (start >> PAGE_CACHE_SHIFT); +} + +static inline struct page *extent_buffer_page(struct extent_buffer *eb, + unsigned long i) +{ + return eb->pages[i]; +} static inline void extent_buffer_get(struct extent_buffer *eb) { diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 7c97b33..b8cbc8d 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -11,7 +11,7 @@ static struct kmem_cache *extent_map_cache; int __init extent_map_init(void) { - extent_map_cache = kmem_cache_create("extent_map", + extent_map_cache = kmem_cache_create("btrfs_extent_map", sizeof(struct extent_map), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_map_cache) @@ -35,6 +35,7 @@ void extent_map_exit(void) void extent_map_tree_init(struct extent_map_tree *tree) { tree->map = RB_ROOT; + INIT_LIST_HEAD(&tree->modified_extents); rwlock_init(&tree->lock); } @@ -54,7 +55,9 @@ struct extent_map *alloc_extent_map(void) em->in_tree = 0; em->flags = 0; em->compress_type = BTRFS_COMPRESS_NONE; + em->generation = 0; atomic_set(&em->refs, 1); + INIT_LIST_HEAD(&em->list); return em; } @@ -72,6 +75,7 @@ void free_extent_map(struct extent_map *em) WARN_ON(atomic_read(&em->refs) == 0); if (atomic_dec_and_test(&em->refs)) { WARN_ON(em->in_tree); + WARN_ON(!list_empty(&em->list)); kmem_cache_free(extent_map_cache, em); } } @@ -198,6 +202,14 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->block_len += merge->block_len; em->block_start = merge->block_start; merge->in_tree = 0; + if (merge->generation > em->generation) { + em->mod_start = em->start; + em->mod_len = em->len; + em->generation = merge->generation; + list_move(&em->list, &tree->modified_extents); + } + + list_del_init(&merge->list); rb_erase(&merge->rb_node, &tree->map); free_extent_map(merge); } @@ -211,14 +223,34 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->block_len += merge->len; rb_erase(&merge->rb_node, &tree->map); merge->in_tree = 0; + if (merge->generation > em->generation) { + em->mod_len = em->len; + em->generation = merge->generation; + list_move(&em->list, &tree->modified_extents); + } + list_del_init(&merge->list); free_extent_map(merge); } } -int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) +/** + * unpint_extent_cache - unpin an extent from the cache + * @tree: tree to unpin the extent in + * @start: logical offset in the file + * @len: length of the extent + * @gen: generation that this extent has been modified in + * @prealloc: if this is set we need to clear the prealloc flag + * + * Called after an extent has been written to disk properly. Set the generation + * to the generation that actually added the file item to the inode so we know + * we need to sync this extent when we call fsync(). + */ +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, + u64 gen) { int ret = 0; struct extent_map *em; + bool prealloc = false; write_lock(&tree->lock); em = lookup_extent_mapping(tree, start, len); @@ -228,10 +260,24 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) if (!em) goto out; + list_move(&em->list, &tree->modified_extents); + em->generation = gen; clear_bit(EXTENT_FLAG_PINNED, &em->flags); + em->mod_start = em->start; + em->mod_len = em->len; + + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + prealloc = true; + clear_bit(EXTENT_FLAG_PREALLOC, &em->flags); + } try_merge_map(tree, em); + if (prealloc) { + em->mod_start = em->start; + em->mod_len = em->len; + } + free_extent_map(em); out: write_unlock(&tree->lock); @@ -269,6 +315,9 @@ int add_extent_mapping(struct extent_map_tree *tree, } atomic_inc(&em->refs); + em->mod_start = em->start; + em->mod_len = em->len; + try_merge_map(tree, em); out: return ret; @@ -358,6 +407,8 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); rb_erase(&em->rb_node, &tree->map); + if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) + list_del_init(&em->list); em->in_tree = 0; return ret; } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 1195f09..6792255 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -13,6 +13,7 @@ #define EXTENT_FLAG_COMPRESSED 1 #define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ #define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ +#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */ struct extent_map { struct rb_node rb_node; @@ -20,18 +21,23 @@ struct extent_map { /* all of these are in bytes */ u64 start; u64 len; + u64 mod_start; + u64 mod_len; u64 orig_start; u64 block_start; u64 block_len; + u64 generation; unsigned long flags; struct block_device *bdev; atomic_t refs; unsigned int in_tree; unsigned int compress_type; + struct list_head list; }; struct extent_map_tree { struct rb_root map; + struct list_head modified_extents; rwlock_t lock; }; @@ -60,7 +66,7 @@ struct extent_map *alloc_extent_map(void); void free_extent_map(struct extent_map *em); int __init extent_map_init(void); void extent_map_exit(void); -int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len); +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen); struct extent_map *search_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); #endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 857d93c..1ad08e4e4 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -25,11 +25,12 @@ #include "transaction.h" #include "print-tree.h" -#define __MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \ +#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) * 2) / \ size) - 1)) -#define MAX_CSUM_ITEMS(r, size) (min(__MAX_CSUM_ITEMS(r, size), PAGE_CACHE_SIZE)) +#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \ + PAGE_CACHE_SIZE)) #define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ sizeof(struct btrfs_ordered_sum)) / \ diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 5caf285..9ab1bed 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -39,6 +39,7 @@ #include "tree-log.h" #include "locking.h" #include "compat.h" +#include "volumes.h" /* * when auto defrag is enabled we @@ -458,14 +459,15 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, * this drops all the extents in the cache that intersect the range * [start, end]. Existing extents are split as required. */ -int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, - int skip_pinned) +void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + int skip_pinned) { struct extent_map *em; struct extent_map *split = NULL; struct extent_map *split2 = NULL; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; u64 len = end - start + 1; + u64 gen; int ret; int testend = 1; unsigned long flags; @@ -477,11 +479,14 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, testend = 0; } while (1) { + int no_splits = 0; + if (!split) split = alloc_extent_map(); if (!split2) split2 = alloc_extent_map(); - BUG_ON(!split || !split2); /* -ENOMEM */ + if (!split || !split2) + no_splits = 1; write_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); @@ -490,6 +495,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, break; } flags = em->flags; + gen = em->generation; if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { if (testend && em->start + em->len >= start + len) { free_extent_map(em); @@ -506,6 +512,8 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); clear_bit(EXTENT_FLAG_PINNED, &em->flags); remove_extent_mapping(em_tree, em); + if (no_splits) + goto next; if (em->block_start < EXTENT_MAP_LAST_BYTE && em->start < start) { @@ -518,12 +526,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->block_len = em->block_len; else split->block_len = split->len; - + split->generation = gen; split->bdev = em->bdev; split->flags = flags; split->compress_type = em->compress_type; ret = add_extent_mapping(em_tree, split); BUG_ON(ret); /* Logic error */ + list_move(&split->list, &em_tree->modified_extents); free_extent_map(split); split = split2; split2 = NULL; @@ -537,6 +546,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->bdev = em->bdev; split->flags = flags; split->compress_type = em->compress_type; + split->generation = gen; if (compressed) { split->block_len = em->block_len; @@ -550,9 +560,11 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, ret = add_extent_mapping(em_tree, split); BUG_ON(ret); /* Logic error */ + list_move(&split->list, &em_tree->modified_extents); free_extent_map(split); split = NULL; } +next: write_unlock(&em_tree->lock); /* once for us */ @@ -564,7 +576,6 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, free_extent_map(split); if (split2) free_extent_map(split2); - return 0; } /* @@ -576,13 +587,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, * it is either truncated or split. Anything entirely inside the range * is deleted from the tree. */ -int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, - u64 start, u64 end, u64 *hint_byte, int drop_cache) +int __btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct btrfs_path *path, u64 start, u64 end, + u64 *drop_end, int drop_cache) { - struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; - struct btrfs_path *path; struct btrfs_key key; struct btrfs_key new_key; u64 ino = btrfs_ino(inode); @@ -597,14 +608,12 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, int recow; int ret; int modify_tree = -1; + int update_refs = (root->ref_cows || root == root->fs_info->tree_root); + int found = 0; if (drop_cache) btrfs_drop_extent_cache(inode, start, end - 1, 0); - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - if (start >= BTRFS_I(inode)->disk_i_size) modify_tree = 0; @@ -666,6 +675,7 @@ next_slot: goto next_slot; } + found = 1; search_start = max(key.offset, start); if (recow || !modify_tree) { modify_tree = -1; @@ -707,14 +717,13 @@ next_slot: extent_end - start); btrfs_mark_buffer_dirty(leaf); - if (disk_bytenr > 0) { + if (update_refs && disk_bytenr > 0) { ret = btrfs_inc_extent_ref(trans, root, disk_bytenr, num_bytes, 0, root->root_key.objectid, new_key.objectid, start - extent_offset, 0); BUG_ON(ret); /* -ENOMEM */ - *hint_byte = disk_bytenr; } key.offset = start; } @@ -734,10 +743,8 @@ next_slot: btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - end); btrfs_mark_buffer_dirty(leaf); - if (disk_bytenr > 0) { + if (update_refs && disk_bytenr > 0) inode_sub_bytes(inode, end - key.offset); - *hint_byte = disk_bytenr; - } break; } @@ -753,10 +760,8 @@ next_slot: btrfs_set_file_extent_num_bytes(leaf, fi, start - key.offset); btrfs_mark_buffer_dirty(leaf); - if (disk_bytenr > 0) { + if (update_refs && disk_bytenr > 0) inode_sub_bytes(inode, extent_end - start); - *hint_byte = disk_bytenr; - } if (end == extent_end) break; @@ -777,12 +782,13 @@ next_slot: del_nr++; } - if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + if (update_refs && + extent_type == BTRFS_FILE_EXTENT_INLINE) { inode_sub_bytes(inode, extent_end - key.offset); extent_end = ALIGN(extent_end, root->sectorsize); - } else if (disk_bytenr > 0) { + } else if (update_refs && disk_bytenr > 0) { ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes, 0, root->root_key.objectid, @@ -791,7 +797,6 @@ next_slot: BUG_ON(ret); /* -ENOMEM */ inode_sub_bytes(inode, extent_end - key.offset); - *hint_byte = disk_bytenr; } if (end == extent_end) @@ -806,7 +811,7 @@ next_slot: del_nr); if (ret) { btrfs_abort_transaction(trans, root, ret); - goto out; + break; } del_nr = 0; @@ -825,7 +830,24 @@ next_slot: btrfs_abort_transaction(trans, root, ret); } -out: + if (drop_end) + *drop_end = found ? min(end, extent_end) : end; + btrfs_release_path(path); + return ret; +} + +int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, u64 start, + u64 end, int drop_cache) +{ + struct btrfs_path *path; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL, + drop_cache); btrfs_free_path(path); return ret; } @@ -892,8 +914,6 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, int ret; u64 ino = btrfs_ino(inode); - btrfs_drop_extent_cache(inode, start, end - 1, 0); - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -935,12 +955,16 @@ again: btrfs_set_item_key_safe(trans, root, path, &new_key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, + trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - end); btrfs_set_file_extent_offset(leaf, fi, end - orig_offset); fi = btrfs_item_ptr(leaf, path->slots[0] - 1, struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, + trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, end - other_start); btrfs_mark_buffer_dirty(leaf); @@ -958,12 +982,16 @@ again: struct btrfs_file_extent_item); btrfs_set_file_extent_num_bytes(leaf, fi, start - key.offset); + btrfs_set_file_extent_generation(leaf, fi, + trans->transid); path->slots[0]++; new_key.offset = start; btrfs_set_item_key_safe(trans, root, path, &new_key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, + trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, other_end - start); btrfs_set_file_extent_offset(leaf, fi, @@ -991,12 +1019,14 @@ again: leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0] - 1, struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); @@ -1056,12 +1086,14 @@ again: struct btrfs_file_extent_item); btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_mark_buffer_dirty(leaf); } else { fi = btrfs_item_ptr(leaf, del_slot - 1, struct btrfs_file_extent_item); btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); btrfs_mark_buffer_dirty(leaf); @@ -1173,8 +1205,8 @@ again: clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, - GFP_NOFS); + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + 0, 0, &cached_state, GFP_NOFS); unlock_extent_cached(&BTRFS_I(inode)->io_tree, start_pos, last_pos - 1, &cached_state, GFP_NOFS); @@ -1514,16 +1546,24 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trace_btrfs_sync_file(file, datasync); + /* + * We write the dirty pages in the range and wait until they complete + * out of the ->i_mutex. If so, we can flush the dirty pages by + * multi-task, and make the performance up. + */ + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; + mutex_lock(&inode->i_mutex); /* - * we wait first, since the writeback may change the inode, also wait - * ordered range does a filemape_write_and_wait_range which is why we - * don't do it above like other file systems. + * We flush the dirty pages again to avoid some dirty pages in the + * range being left. */ - root->log_batch++; + atomic_inc(&root->log_batch); btrfs_wait_ordered_range(inode, start, end); - root->log_batch++; + atomic_inc(&root->log_batch); /* * check the transaction that last modified this inode @@ -1544,6 +1584,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) BTRFS_I(inode)->last_trans <= root->fs_info->last_trans_committed) { BTRFS_I(inode)->last_trans = 0; + + /* + * We'v had everything committed since the last time we were + * modified so clear this flag in case it was set for whatever + * reason, it's no longer relevant. + */ + clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); mutex_unlock(&inode->i_mutex); goto out; } @@ -1599,6 +1647,7 @@ out: static const struct vm_operations_struct btrfs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = btrfs_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) @@ -1610,11 +1659,328 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) file_accessed(filp); vma->vm_ops = &btrfs_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } +static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf, + int slot, u64 start, u64 end) +{ + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + + if (slot < 0 || slot >= btrfs_header_nritems(leaf)) + return 0; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != btrfs_ino(inode) || + key.type != BTRFS_EXTENT_DATA_KEY) + return 0; + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) + return 0; + + if (btrfs_file_extent_disk_bytenr(leaf, fi)) + return 0; + + if (key.offset == end) + return 1; + if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start) + return 1; + return 0; +} + +static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, + struct btrfs_path *path, u64 offset, u64 end) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; + struct extent_map *hole_em; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct btrfs_key key; + int ret; + + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = offset; + + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) + return ret; + BUG_ON(!ret); + + leaf = path->nodes[0]; + if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) { + u64 num_bytes; + + path->slots[0]--; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + + end - offset; + btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_offset(leaf, fi, 0); + btrfs_mark_buffer_dirty(leaf); + goto out; + } + + if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) { + u64 num_bytes; + + path->slots[0]++; + key.offset = offset; + btrfs_set_item_key_safe(trans, root, path, &key); + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end - + offset; + btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_offset(leaf, fi, 0); + btrfs_mark_buffer_dirty(leaf); + goto out; + } + btrfs_release_path(path); + + ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset, + 0, 0, end - offset, 0, end - offset, + 0, 0, 0); + if (ret) + return ret; + +out: + btrfs_release_path(path); + + hole_em = alloc_extent_map(); + if (!hole_em) { + btrfs_drop_extent_cache(inode, offset, end - 1, 0); + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + } else { + hole_em->start = offset; + hole_em->len = end - offset; + hole_em->orig_start = offset; + + hole_em->block_start = EXTENT_MAP_HOLE; + hole_em->block_len = 0; + hole_em->bdev = root->fs_info->fs_devices->latest_bdev; + hole_em->compress_type = BTRFS_COMPRESS_NONE; + hole_em->generation = trans->transid; + + do { + btrfs_drop_extent_cache(inode, offset, end - 1, 0); + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, hole_em); + if (!ret) + list_move(&hole_em->list, + &em_tree->modified_extents); + write_unlock(&em_tree->lock); + } while (ret == -EEXIST); + free_extent_map(hole_em); + if (ret) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + } + + return 0; +} + +static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_state *cached_state = NULL; + struct btrfs_path *path; + struct btrfs_block_rsv *rsv; + struct btrfs_trans_handle *trans; + u64 mask = BTRFS_I(inode)->root->sectorsize - 1; + u64 lockstart = (offset + mask) & ~mask; + u64 lockend = ((offset + len) & ~mask) - 1; + u64 cur_offset = lockstart; + u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); + u64 drop_end; + unsigned long nr; + int ret = 0; + int err = 0; + bool same_page = (offset >> PAGE_CACHE_SHIFT) == + ((offset + len) >> PAGE_CACHE_SHIFT); + + btrfs_wait_ordered_range(inode, offset, len); + + mutex_lock(&inode->i_mutex); + if (offset >= inode->i_size) { + mutex_unlock(&inode->i_mutex); + return 0; + } + + /* + * Only do this if we are in the same page and we aren't doing the + * entire page. + */ + if (same_page && len < PAGE_CACHE_SIZE) { + ret = btrfs_truncate_page(inode, offset, len, 0); + mutex_unlock(&inode->i_mutex); + return ret; + } + + /* zero back part of the first page */ + ret = btrfs_truncate_page(inode, offset, 0, 0); + if (ret) { + mutex_unlock(&inode->i_mutex); + return ret; + } + + /* zero the front end of the last page */ + ret = btrfs_truncate_page(inode, offset + len, 0, 1); + if (ret) { + mutex_unlock(&inode->i_mutex); + return ret; + } + + if (lockend < lockstart) { + mutex_unlock(&inode->i_mutex); + return 0; + } + + while (1) { + struct btrfs_ordered_extent *ordered; + + truncate_pagecache_range(inode, lockstart, lockend); + + lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, + 0, &cached_state); + ordered = btrfs_lookup_first_ordered_extent(inode, lockend); + + /* + * We need to make sure we have no ordered extents in this range + * and nobody raced in and read a page in this range, if we did + * we need to try again. + */ + if ((!ordered || + (ordered->file_offset + ordered->len < lockstart || + ordered->file_offset > lockend)) && + !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockend, EXTENT_UPTODATE, 0, + cached_state)) { + if (ordered) + btrfs_put_ordered_extent(ordered); + break; + } + if (ordered) + btrfs_put_ordered_extent(ordered); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, + lockend, &cached_state, GFP_NOFS); + btrfs_wait_ordered_range(inode, lockstart, + lockend - lockstart + 1); + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); + if (!rsv) { + ret = -ENOMEM; + goto out_free; + } + rsv->size = btrfs_calc_trunc_metadata_size(root, 1); + rsv->failfast = 1; + + /* + * 1 - update the inode + * 1 - removing the extents in the range + * 1 - adding the hole extent + */ + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_free; + } + + ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, + min_size); + BUG_ON(ret); + trans->block_rsv = rsv; + + while (cur_offset < lockend) { + ret = __btrfs_drop_extents(trans, root, inode, path, + cur_offset, lockend + 1, + &drop_end, 1); + if (ret != -ENOSPC) + break; + + trans->block_rsv = &root->fs_info->trans_block_rsv; + + ret = fill_holes(trans, inode, path, cur_offset, drop_end); + if (ret) { + err = ret; + break; + } + + cur_offset = drop_end; + + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + err = ret; + break; + } + + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); + + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + break; + } + + ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, + rsv, min_size); + BUG_ON(ret); /* shouldn't happen */ + trans->block_rsv = rsv; + } + + if (ret) { + err = ret; + goto out_trans; + } + + trans->block_rsv = &root->fs_info->trans_block_rsv; + ret = fill_holes(trans, inode, path, cur_offset, drop_end); + if (ret) { + err = ret; + goto out_trans; + } + +out_trans: + if (!trans) + goto out_free; + + trans->block_rsv = &root->fs_info->trans_block_rsv; + ret = btrfs_update_inode(trans, root, inode); + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); +out_free: + btrfs_free_path(path); + btrfs_free_block_rsv(root, rsv); +out: + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state, GFP_NOFS); + mutex_unlock(&inode->i_mutex); + if (ret && !err) + err = ret; + return err; +} + static long btrfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { @@ -1633,15 +1999,18 @@ static long btrfs_fallocate(struct file *file, int mode, alloc_start = offset & ~mask; alloc_end = (offset + len + mask) & ~mask; - /* We only support the FALLOC_FL_KEEP_SIZE mode */ - if (mode & ~FALLOC_FL_KEEP_SIZE) + /* Make sure we aren't being give some crap mode */ + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; + if (mode & FALLOC_FL_PUNCH_HOLE) + return btrfs_punch_hole(inode, offset, len); + /* * Make sure we have enough space before we do the * allocation. */ - ret = btrfs_check_data_free_space(inode, len); + ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1); if (ret) return ret; @@ -1748,7 +2117,7 @@ static long btrfs_fallocate(struct file *file, int mode, out: mutex_unlock(&inode->i_mutex); /* Let go of our reservation. */ - btrfs_free_reserved_data_space(inode, len); + btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1); return ret; } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 6b10acf..1027b85 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -966,7 +966,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, block_group->key.offset)) { ret = find_first_extent_bit(unpin, start, &extent_start, &extent_end, - EXTENT_DIRTY); + EXTENT_DIRTY, NULL); if (ret) { ret = 0; break; @@ -1454,9 +1454,7 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, max_t(u64, *offset, bitmap_info->offset)); bits = bytes_to_bits(*bytes, ctl->unit); - for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i); - i < BITS_PER_BITMAP; - i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) { + for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) { next_zero = find_next_zero_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i); if ((next_zero - i) >= bits) { @@ -2307,9 +2305,7 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, again: found_bits = 0; - for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i); - i < BITS_PER_BITMAP; - i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) { + for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) { next_zero = find_next_zero_bit(entry->bitmap, BITS_PER_BITMAP, i); if (next_zero - i >= min_bits) { diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h index db2ff97..1d98281 100644 --- a/fs/btrfs/hash.h +++ b/fs/btrfs/hash.h @@ -24,4 +24,14 @@ static inline u64 btrfs_name_hash(const char *name, int len) { return crc32c((u32)~1, name, len); } + +/* + * Figure the key offset of an extended inode ref + */ +static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, + int len) +{ + return (u64) crc32c(parent_objectid, name, len); +} + #endif diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index a13cf1a..48b8fda 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -18,6 +18,7 @@ #include "ctree.h" #include "disk-io.h" +#include "hash.h" #include "transaction.h" #include "print-tree.h" @@ -50,18 +51,57 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name, return 0; } -struct btrfs_inode_ref * +int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid, + const char *name, int name_len, + struct btrfs_inode_extref **extref_ret) +{ + struct extent_buffer *leaf; + struct btrfs_inode_extref *extref; + unsigned long ptr; + unsigned long name_ptr; + u32 item_size; + u32 cur_offset = 0; + int ref_name_len; + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + + /* + * Search all extended backrefs in this item. We're only + * looking through any collisions so most of the time this is + * just going to compare against one buffer. If all is well, + * we'll return success and the inode ref object. + */ + while (cur_offset < item_size) { + extref = (struct btrfs_inode_extref *) (ptr + cur_offset); + name_ptr = (unsigned long)(&extref->name); + ref_name_len = btrfs_inode_extref_name_len(leaf, extref); + + if (ref_name_len == name_len && + btrfs_inode_extref_parent(leaf, extref) == ref_objectid && + (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)) { + if (extref_ret) + *extref_ret = extref; + return 1; + } + + cur_offset += ref_name_len + sizeof(*extref); + } + return 0; +} + +static struct btrfs_inode_ref * btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, int mod) + struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, int ins_len, + int cow) { + int ret; struct btrfs_key key; struct btrfs_inode_ref *ref; - int ins_len = mod < 0 ? -1 : 0; - int cow = mod != 0; - int ret; key.objectid = inode_objectid; key.type = BTRFS_INODE_REF_KEY; @@ -77,13 +117,150 @@ btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, return ref; } -int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, +/* Returns NULL if no extref found */ +struct btrfs_inode_extref * +btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, int ins_len, + int cow) +{ + int ret; + struct btrfs_key key; + struct btrfs_inode_extref *extref; + + key.objectid = inode_objectid; + key.type = BTRFS_INODE_EXTREF_KEY; + key.offset = btrfs_extref_hash(ref_objectid, name, name_len); + + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return NULL; + if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref)) + return NULL; + return extref; +} + +int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, int mod, + u64 *ret_index) +{ + struct btrfs_inode_ref *ref; + struct btrfs_inode_extref *extref; + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + + ref = btrfs_lookup_inode_ref(trans, root, path, name, name_len, + inode_objectid, ref_objectid, ins_len, + cow); + if (IS_ERR(ref)) + return PTR_ERR(ref); + + if (ref != NULL) { + *ret_index = btrfs_inode_ref_index(path->nodes[0], ref); + return 0; + } + + btrfs_release_path(path); + + extref = btrfs_lookup_inode_extref(trans, root, path, name, + name_len, inode_objectid, + ref_objectid, ins_len, cow); + if (IS_ERR(extref)) + return PTR_ERR(extref); + + if (extref) { + *ret_index = btrfs_inode_extref_index(path->nodes[0], extref); + return 0; + } + + return -ENOENT; +} + +int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, u64 inode_objectid, u64 ref_objectid, u64 *index) { struct btrfs_path *path; struct btrfs_key key; + struct btrfs_inode_extref *extref; + struct extent_buffer *leaf; + int ret; + int del_len = name_len + sizeof(*extref); + unsigned long ptr; + unsigned long item_start; + u32 item_size; + + key.objectid = inode_objectid; + btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY); + key.offset = btrfs_extref_hash(ref_objectid, name, name_len); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->leave_spinning = 1; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -ENOENT; + if (ret < 0) + goto out; + + /* + * Sanity check - did we find the right item for this name? + * This should always succeed so error here will make the FS + * readonly. + */ + if (!btrfs_find_name_in_ext_backref(path, ref_objectid, + name, name_len, &extref)) { + btrfs_std_error(root->fs_info, -ENOENT); + ret = -EROFS; + goto out; + } + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if (index) + *index = btrfs_inode_extref_index(leaf, extref); + + if (del_len == item_size) { + /* + * Common case only one ref in the item, remove the + * whole item. + */ + ret = btrfs_del_item(trans, root, path); + goto out; + } + + ptr = (unsigned long)extref; + item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); + + memmove_extent_buffer(leaf, ptr, ptr + del_len, + item_size - (ptr + del_len - item_start)); + + btrfs_truncate_item(trans, root, path, item_size - del_len, 1); + +out: + btrfs_free_path(path); + + return ret; +} + +int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 *index) +{ + struct btrfs_path *path; + struct btrfs_key key; struct btrfs_inode_ref *ref; struct extent_buffer *leaf; unsigned long ptr; @@ -91,6 +268,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, u32 item_size; u32 sub_item_len; int ret; + int search_ext_refs = 0; int del_len = name_len + sizeof(*ref); key.objectid = inode_objectid; @@ -106,12 +284,14 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { ret = -ENOENT; + search_ext_refs = 1; goto out; } else if (ret < 0) { goto out; } if (!find_name_in_backref(path, name, name_len, &ref)) { ret = -ENOENT; + search_ext_refs = 1; goto out; } leaf = path->nodes[0]; @@ -129,8 +309,78 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_size - (ptr + sub_item_len - item_start)); - btrfs_truncate_item(trans, root, path, - item_size - sub_item_len, 1); + btrfs_truncate_item(trans, root, path, item_size - sub_item_len, 1); +out: + btrfs_free_path(path); + + if (search_ext_refs) { + /* + * No refs were found, or we could not find the + * name in our ref array. Find and remove the extended + * inode ref then. + */ + return btrfs_del_inode_extref(trans, root, name, name_len, + inode_objectid, ref_objectid, index); + } + + return ret; +} + +/* + * btrfs_insert_inode_extref() - Inserts an extended inode ref into a tree. + * + * The caller must have checked against BTRFS_LINK_MAX already. + */ +static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 index) +{ + struct btrfs_inode_extref *extref; + int ret; + int ins_len = name_len + sizeof(*extref); + unsigned long ptr; + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_item *item; + + key.objectid = inode_objectid; + key.type = BTRFS_INODE_EXTREF_KEY; + key.offset = btrfs_extref_hash(ref_objectid, name, name_len); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, root, path, &key, + ins_len); + if (ret == -EEXIST) { + if (btrfs_find_name_in_ext_backref(path, ref_objectid, + name, name_len, NULL)) + goto out; + + btrfs_extend_item(trans, root, path, ins_len); + ret = 0; + } + if (ret < 0) + goto out; + + leaf = path->nodes[0]; + item = btrfs_item_nr(leaf, path->slots[0]); + ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char); + ptr += btrfs_item_size(leaf, item) - ins_len; + extref = (struct btrfs_inode_extref *)ptr; + + btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len); + btrfs_set_inode_extref_index(path->nodes[0], extref, index); + btrfs_set_inode_extref_parent(path->nodes[0], extref, ref_objectid); + + ptr = (unsigned long)&extref->name; + write_extent_buffer(path->nodes[0], name, ptr, name_len); + btrfs_mark_buffer_dirty(path->nodes[0]); + out: btrfs_free_path(path); return ret; @@ -191,6 +441,19 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, out: btrfs_free_path(path); + + if (ret == -EMLINK) { + struct btrfs_super_block *disk_super = root->fs_info->super_copy; + /* We ran out of space in the ref array. Need to + * add an extended ref. */ + if (btrfs_super_incompat_flags(disk_super) + & BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) + ret = btrfs_insert_inode_extref(trans, root, name, + name_len, + inode_objectid, + ref_objectid, index); + } + return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ec154f9..85a1e50 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -230,7 +230,6 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, u64 inline_len = actual_end - start; u64 aligned_end = (end + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); - u64 hint_byte; u64 data_len = inline_len; int ret; @@ -247,8 +246,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, return 1; } - ret = btrfs_drop_extents(trans, inode, start, aligned_end, - &hint_byte, 1); + ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1); if (ret) return ret; @@ -664,7 +662,7 @@ retry: async_extent->compressed_size, async_extent->compressed_size, 0, alloc_hint, &ins, 1); - if (ret) + if (ret && ret != -ENOSPC) btrfs_abort_transaction(trans, root, ret); btrfs_end_transaction(trans, root); } @@ -1308,6 +1306,7 @@ out_check: em->block_start = disk_bytenr; em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); + set_bit(EXTENT_FLAG_PREALLOC, &em->flags); while (1) { write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); @@ -1364,11 +1363,7 @@ out_check: } error: - if (nolock) { - err = btrfs_end_transaction_nolock(trans, root); - } else { - err = btrfs_end_transaction(trans, root); - } + err = btrfs_end_transaction(trans, root); if (!ret) ret = err; @@ -1785,7 +1780,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key ins; - u64 hint; int ret; path = btrfs_alloc_path(); @@ -1803,8 +1797,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, * the caller is expected to unpin it and allow it to be merged * with the others. */ - ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes, - &hint, 0); + ret = btrfs_drop_extents(trans, root, inode, file_pos, + file_pos + num_bytes, 0); if (ret) goto out; @@ -1828,10 +1822,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_encryption(leaf, fi, encryption); btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); - btrfs_unlock_up_safe(path, 1); - btrfs_set_lock_blocking(leaf); - btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); inode_add_bytes(inode, num_bytes); @@ -1929,11 +1921,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ordered_extent->len, compress_type, 0, 0, BTRFS_FILE_EXTENT_REG); - unpin_extent_cache(&BTRFS_I(inode)->extent_tree, - ordered_extent->file_offset, - ordered_extent->len); } - + unpin_extent_cache(&BTRFS_I(inode)->extent_tree, + ordered_extent->file_offset, ordered_extent->len, + trans->transid); if (ret < 0) { btrfs_abort_transaction(trans, root, ret); goto out_unlock; @@ -1949,6 +1940,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) btrfs_abort_transaction(trans, root, ret); goto out_unlock; } + } else { + btrfs_set_inode_last_trans(trans, inode); } ret = 0; out_unlock: @@ -1958,12 +1951,8 @@ out_unlock: out: if (root != root->fs_info->tree_root) btrfs_delalloc_release_metadata(inode, ordered_extent->len); - if (trans) { - if (nolock) - btrfs_end_transaction_nolock(trans, root); - else - btrfs_end_transaction(trans, root); - } + if (trans) + btrfs_end_transaction(trans, root); if (ret) clear_extent_uptodate(io_tree, ordered_extent->file_offset, @@ -1971,8 +1960,8 @@ out: ordered_extent->len - 1, NULL, GFP_NOFS); /* - * This needs to be dont to make sure anybody waiting knows we are done - * upating everything for this ordered extent. + * This needs to be done to make sure anybody waiting knows we are done + * updating everything for this ordered extent. */ btrfs_remove_ordered_extent(inode, ordered_extent); @@ -2119,7 +2108,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) if (empty) return; - down_read(&root->fs_info->cleanup_work_sem); spin_lock(&fs_info->delayed_iput_lock); list_splice_init(&fs_info->delayed_iputs, &list); spin_unlock(&fs_info->delayed_iput_lock); @@ -2130,7 +2118,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) iput(delayed->inode); kfree(delayed); } - up_read(&root->fs_info->cleanup_work_sem); } enum btrfs_orphan_cleanup_state { @@ -2198,7 +2185,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) int ret; if (!root->orphan_block_rsv) { - block_rsv = btrfs_alloc_block_rsv(root); + block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); if (!block_rsv) return -ENOMEM; } @@ -2225,7 +2212,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) insert = 1; #endif insert = 1; - atomic_dec(&root->orphan_inodes); + atomic_inc(&root->orphan_inodes); } if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, @@ -2572,8 +2559,8 @@ static void btrfs_read_locked_inode(struct inode *inode) struct btrfs_inode_item); inode->i_mode = btrfs_inode_mode(leaf, inode_item); set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); - inode->i_uid = btrfs_inode_uid(leaf, inode_item); - inode->i_gid = btrfs_inode_gid(leaf, inode_item); + i_uid_write(inode, btrfs_inode_uid(leaf, inode_item)); + i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); tspec = btrfs_inode_atime(inode_item); @@ -2590,6 +2577,18 @@ static void btrfs_read_locked_inode(struct inode *inode) inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); + BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); + + /* + * If we were modified in the current generation and evicted from memory + * and then re-read we need to do a full sync since we don't have any + * idea about which extents were modified before we were evicted from + * cache. + */ + if (BTRFS_I(inode)->last_trans == root->fs_info->generation) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + inode->i_version = btrfs_inode_sequence(leaf, inode_item); inode->i_generation = BTRFS_I(inode)->generation; inode->i_rdev = 0; @@ -2651,8 +2650,8 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *item, struct inode *inode) { - btrfs_set_inode_uid(leaf, item, inode->i_uid); - btrfs_set_inode_gid(leaf, item, inode->i_gid); + btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); + btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); btrfs_set_inode_mode(leaf, item, inode->i_mode); btrfs_set_inode_nlink(leaf, item, inode->i_nlink); @@ -2894,7 +2893,6 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_path *path; - struct btrfs_inode_ref *ref; struct btrfs_dir_item *di; struct inode *inode = dentry->d_inode; u64 index; @@ -3008,17 +3006,17 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, } btrfs_release_path(path); - ref = btrfs_lookup_inode_ref(trans, root, path, - dentry->d_name.name, dentry->d_name.len, - ino, dir_ino, 0); - if (IS_ERR(ref)) { - err = PTR_ERR(ref); + ret = btrfs_get_inode_ref_index(trans, root, path, dentry->d_name.name, + dentry->d_name.len, ino, dir_ino, 0, + &index); + if (ret) { + err = ret; goto out; } - BUG_ON(!ref); /* Logic error */ + if (check_path_shared(root, path)) goto out; - index = btrfs_inode_ref_index(path->nodes[0], ref); + btrfs_release_path(path); /* @@ -3061,7 +3059,7 @@ out: static void __unlink_end_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - if (trans->block_rsv == &root->fs_info->global_block_rsv) { + if (trans->block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL) { btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); trans->block_rsv = &root->fs_info->trans_block_rsv; @@ -3191,9 +3189,10 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) struct btrfs_trans_handle *trans; unsigned long nr = 0; - if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || - btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) + if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; + if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) + return -EPERM; trans = __unlink_start_trans(dir, dentry); if (IS_ERR(trans)) @@ -3267,8 +3266,13 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, return -ENOMEM; path->reada = -1; + /* + * We want to drop from the next block forward in case this new size is + * not block aligned since we will be keeping the last block of the + * extent just the way it is. + */ if (root->ref_cows || root == root->fs_info->tree_root) - btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); + btrfs_drop_extent_cache(inode, (new_size + mask) & (~mask), (u64)-1, 0); /* * This function is also used to drop the items in the log tree before @@ -3429,12 +3433,6 @@ delete: if (path->slots[0] == 0 || path->slots[0] != pending_del_slot) { - if (root->ref_cows && - BTRFS_I(inode)->location.objectid != - BTRFS_FREE_INO_OBJECTID) { - err = -EAGAIN; - goto out; - } if (pending_del_nr) { ret = btrfs_del_items(trans, root, path, pending_del_slot, @@ -3465,12 +3463,20 @@ error: } /* - * taken from block_truncate_page, but does cow as it zeros out - * any bytes left in the last page in the file. + * btrfs_truncate_page - read, zero a chunk and write a page + * @inode - inode that we're zeroing + * @from - the offset to start zeroing + * @len - the length to zero, 0 to zero the entire range respective to the + * offset + * @front - zero up to the offset instead of from the offset on + * + * This will find the page for the "from" offset and cow the page and zero the + * part we want to zero. This is used with truncate and hole punching. */ -static int btrfs_truncate_page(struct address_space *mapping, loff_t from) +int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, + int front) { - struct inode *inode = mapping->host; + struct address_space *mapping = inode->i_mapping; struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; @@ -3485,7 +3491,8 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) u64 page_start; u64 page_end; - if ((offset & (blocksize - 1)) == 0) + if ((offset & (blocksize - 1)) == 0 && + (!len || ((len & (blocksize - 1)) == 0))) goto out; ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); if (ret) @@ -3532,7 +3539,8 @@ again: } clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, - EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); ret = btrfs_set_extent_delalloc(inode, page_start, page_end, @@ -3545,8 +3553,13 @@ again: ret = 0; if (offset != PAGE_CACHE_SIZE) { + if (!len) + len = PAGE_CACHE_SIZE - offset; kaddr = kmap(page); - memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); + if (front) + memset(kaddr, 0, offset); + else + memset(kaddr + offset, 0, len); flush_dcache_page(page); kunmap(page); } @@ -3577,6 +3590,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; u64 mask = root->sectorsize - 1; u64 hole_start = (oldsize + mask) & ~mask; u64 block_end = (size + mask) & ~mask; @@ -3613,7 +3627,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) last_byte = min(extent_map_end(em), block_end); last_byte = (last_byte + mask) & ~mask; if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { - u64 hint_byte = 0; + struct extent_map *hole_em; hole_size = last_byte - cur_offset; trans = btrfs_start_transaction(root, 3); @@ -3622,9 +3636,9 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) break; } - err = btrfs_drop_extents(trans, inode, cur_offset, - cur_offset + hole_size, - &hint_byte, 1); + err = btrfs_drop_extents(trans, root, inode, + cur_offset, + cur_offset + hole_size, 1); if (err) { btrfs_abort_transaction(trans, root, err); btrfs_end_transaction(trans, root); @@ -3641,9 +3655,39 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) break; } - btrfs_drop_extent_cache(inode, hole_start, - last_byte - 1, 0); + btrfs_drop_extent_cache(inode, cur_offset, + cur_offset + hole_size - 1, 0); + hole_em = alloc_extent_map(); + if (!hole_em) { + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + goto next; + } + hole_em->start = cur_offset; + hole_em->len = hole_size; + hole_em->orig_start = cur_offset; + + hole_em->block_start = EXTENT_MAP_HOLE; + hole_em->block_len = 0; + hole_em->bdev = root->fs_info->fs_devices->latest_bdev; + hole_em->compress_type = BTRFS_COMPRESS_NONE; + hole_em->generation = trans->transid; + while (1) { + write_lock(&em_tree->lock); + err = add_extent_mapping(em_tree, hole_em); + if (!err) + list_move(&hole_em->list, + &em_tree->modified_extents); + write_unlock(&em_tree->lock); + if (err != -EEXIST) + break; + btrfs_drop_extent_cache(inode, cur_offset, + cur_offset + + hole_size - 1, 0); + } + free_extent_map(hole_em); +next: btrfs_update_inode(trans, root, inode); btrfs_end_transaction(trans, root); } @@ -3768,26 +3812,22 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } - rsv = btrfs_alloc_block_rsv(root); + rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); if (!rsv) { btrfs_orphan_del(NULL, inode); goto no_delete; } rsv->size = min_size; + rsv->failfast = 1; global_rsv = &root->fs_info->global_block_rsv; btrfs_i_size_write(inode, 0); /* - * This is a bit simpler than btrfs_truncate since - * - * 1) We've already reserved our space for our orphan item in the - * unlink. - * 2) We're going to delete the inode item, so we don't need to update - * it at all. - * - * So we just need to reserve some slack space in case we add bytes when - * doing the truncate. + * This is a bit simpler than btrfs_truncate since we've already + * reserved our space for our orphan item in the unlink, so we just + * need to reserve some slack space in case we add bytes and update + * inode item when doing the truncate. */ while (1) { ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size); @@ -3808,7 +3848,7 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } - trans = btrfs_start_transaction(root, 0); + trans = btrfs_start_transaction_noflush(root, 1); if (IS_ERR(trans)) { btrfs_orphan_del(NULL, inode); btrfs_free_block_rsv(root, rsv); @@ -3818,9 +3858,13 @@ void btrfs_evict_inode(struct inode *inode) trans->block_rsv = rsv; ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); - if (ret != -EAGAIN) + if (ret != -ENOSPC) break; + trans->block_rsv = &root->fs_info->trans_block_rsv; + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + nr = trans->blocks_used; btrfs_end_transaction(trans, root); trans = NULL; @@ -4470,10 +4514,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); - if (nolock) - ret = btrfs_end_transaction_nolock(trans, root); - else - ret = btrfs_commit_transaction(trans, root); + ret = btrfs_commit_transaction(trans, root); } return ret; } @@ -4671,6 +4712,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, BTRFS_I(inode)->generation = trans->transid; inode->i_generation = BTRFS_I(inode)->generation; + /* + * We could have gotten an inode number from somebody who was fsynced + * and then removed in this same transaction, so let's just set full + * sync since it will be a full sync anyway and this will blow away the + * old info in the log. + */ + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); + if (S_ISDIR(mode)) owner = 0; else @@ -4680,6 +4729,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); key[0].offset = 0; + /* + * Start new inodes with an inode_ref. This is slightly more + * efficient for small numbers of hard links since they will + * be packed into one item. Extended refs will kick in if we + * add more hard links than can fit in the ref item. + */ key[1].objectid = objectid; btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); key[1].offset = ref_objectid; @@ -4986,7 +5041,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (root->objectid != BTRFS_I(inode)->root->objectid) return -EXDEV; - if (inode->i_nlink == ~0U) + if (inode->i_nlink >= BTRFS_LINK_MAX) return -EMLINK; err = btrfs_set_inode_index(dir, &index); @@ -5450,7 +5505,8 @@ insert: write_unlock(&em_tree->lock); out: - trace_btrfs_get_extent(root, em); + if (em) + trace_btrfs_get_extent(root, em); if (path) btrfs_free_path(path); @@ -5836,6 +5892,48 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, return ret; } +static struct extent_map *create_pinned_em(struct inode *inode, u64 start, + u64 len, u64 orig_start, + u64 block_start, u64 block_len, + int type) +{ + struct extent_map_tree *em_tree; + struct extent_map *em; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + em_tree = &BTRFS_I(inode)->extent_tree; + em = alloc_extent_map(); + if (!em) + return ERR_PTR(-ENOMEM); + + em->start = start; + em->orig_start = orig_start; + em->len = len; + em->block_len = block_len; + em->block_start = block_start; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + if (type == BTRFS_ORDERED_PREALLOC) + set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + + do { + btrfs_drop_extent_cache(inode, em->start, + em->start + em->len - 1, 0); + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + } while (ret == -EEXIST); + + if (ret) { + free_extent_map(em); + return ERR_PTR(ret); + } + + return em; +} + + static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { @@ -5950,6 +6048,19 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, goto must_cow; if (can_nocow_odirect(trans, inode, start, len) == 1) { + u64 orig_start = em->start; + + if (type == BTRFS_ORDERED_PREALLOC) { + free_extent_map(em); + em = create_pinned_em(inode, start, len, + orig_start, + block_start, len, type); + if (IS_ERR(em)) { + btrfs_end_transaction(trans, root); + goto unlock_err; + } + } + ret = btrfs_add_ordered_extent_dio(inode, start, block_start, len, len, type); btrfs_end_transaction(trans, root); @@ -5999,7 +6110,8 @@ unlock: if (lockstart < lockend) { if (create && len < lockend - lockstart) { clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockstart + len - 1, unlock_bits, 1, 0, + lockstart + len - 1, + unlock_bits | EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS); /* * Beside unlock, we also need to cleanup reserved space @@ -6007,8 +6119,8 @@ unlock: */ clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart + len, lockend, - unlock_bits | EXTENT_DO_ACCOUNTING, - 1, 0, NULL, GFP_NOFS); + unlock_bits | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 1, 0, NULL, GFP_NOFS); } else { clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, unlock_bits, 1, 0, @@ -6573,8 +6685,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) */ clear_extent_bit(tree, page_start, page_end, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, - &cached_state, GFP_NOFS); + EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS); /* * whoever cleared the private bit is responsible * for the finish_ordered_io @@ -6590,7 +6702,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) } clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS); + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, + &cached_state, GFP_NOFS); __btrfs_releasepage(page, GFP_NOFS); ClearPageChecked(page); @@ -6687,7 +6800,8 @@ again: * prepare_pages in the normal write path. */ clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, - EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); ret = btrfs_set_extent_delalloc(inode, page_start, page_end, @@ -6718,6 +6832,7 @@ again: BTRFS_I(inode)->last_trans = root->fs_info->generation; BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; + BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); @@ -6745,7 +6860,7 @@ static int btrfs_truncate(struct inode *inode) u64 mask = root->sectorsize - 1; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); - ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); + ret = btrfs_truncate_page(inode, inode->i_size, 0, 0); if (ret) return ret; @@ -6788,10 +6903,11 @@ static int btrfs_truncate(struct inode *inode) * 3) fs_info->trans_block_rsv - this will have 1 items worth left for * updating the inode. */ - rsv = btrfs_alloc_block_rsv(root); + rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); if (!rsv) return -ENOMEM; rsv->size = min_size; + rsv->failfast = 1; /* * 1 for the truncate slack space @@ -6837,36 +6953,21 @@ static int btrfs_truncate(struct inode *inode) &BTRFS_I(inode)->runtime_flags)) btrfs_add_ordered_operation(trans, root, inode); - while (1) { - ret = btrfs_block_rsv_refill(root, rsv, min_size); - if (ret) { - /* - * This can only happen with the original transaction we - * started above, every other time we shouldn't have a - * transaction started yet. - */ - if (ret == -EAGAIN) - goto end_trans; - err = ret; - break; - } - - if (!trans) { - /* Just need the 1 for updating the inode */ - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - ret = err = PTR_ERR(trans); - trans = NULL; - break; - } - } - - trans->block_rsv = rsv; + /* + * So if we truncate and then write and fsync we normally would just + * write the extents that changed, which is a problem if we need to + * first truncate that entire inode. So set this flag so we write out + * all of the extents in the inode to the sync log so we're completely + * safe. + */ + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); + trans->block_rsv = rsv; + while (1) { ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, BTRFS_EXTENT_DATA_KEY); - if (ret != -EAGAIN) { + if (ret != -ENOSPC) { err = ret; break; } @@ -6877,11 +6978,22 @@ static int btrfs_truncate(struct inode *inode) err = ret; break; } -end_trans: + nr = trans->blocks_used; btrfs_end_transaction(trans, root); - trans = NULL; btrfs_btree_balance_dirty(root, nr); + + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) { + ret = err = PTR_ERR(trans); + trans = NULL; + break; + } + + ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, + rsv, min_size); + BUG_ON(ret); /* shouldn't happen */ + trans->block_rsv = rsv; } if (ret == 0 && inode->i_nlink > 0) { @@ -6965,6 +7077,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->csum_bytes = 0; ei->index_cnt = (u64)-1; ei->last_unlink_trans = 0; + ei->last_log_commit = 0; spin_lock_init(&ei->lock); ei->outstanding_extents = 0; @@ -7076,6 +7189,11 @@ static void init_once(void *foo) void btrfs_destroy_cachep(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); if (btrfs_inode_cachep) kmem_cache_destroy(btrfs_inode_cachep); if (btrfs_trans_handle_cachep) @@ -7090,31 +7208,31 @@ void btrfs_destroy_cachep(void) int btrfs_init_cachep(void) { - btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache", + btrfs_inode_cachep = kmem_cache_create("btrfs_inode", sizeof(struct btrfs_inode), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); if (!btrfs_inode_cachep) goto fail; - btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache", + btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", sizeof(struct btrfs_trans_handle), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_trans_handle_cachep) goto fail; - btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache", + btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction", sizeof(struct btrfs_transaction), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_transaction_cachep) goto fail; - btrfs_path_cachep = kmem_cache_create("btrfs_path_cache", + btrfs_path_cachep = kmem_cache_create("btrfs_path", sizeof(struct btrfs_path), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_path_cachep) goto fail; - btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache", + btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", sizeof(struct btrfs_free_space), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_free_space_cachep) @@ -7508,6 +7626,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, loff_t actual_len, u64 *alloc_hint, struct btrfs_trans_handle *trans) { + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; u64 cur_offset = start; @@ -7548,6 +7668,37 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, btrfs_drop_extent_cache(inode, cur_offset, cur_offset + ins.offset -1, 0); + em = alloc_extent_map(); + if (!em) { + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + goto next; + } + + em->start = cur_offset; + em->orig_start = cur_offset; + em->len = ins.offset; + em->block_start = ins.objectid; + em->block_len = ins.offset; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + em->generation = trans->transid; + + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + if (!ret) + list_move(&em->list, + &em_tree->modified_extents); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) + break; + btrfs_drop_extent_cache(inode, cur_offset, + cur_offset + ins.offset - 1, + 0); + } + free_extent_map(em); +next: num_bytes -= ins.offset; cur_offset += ins.offset; *alloc_hint = ins.objectid + ins.offset; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9df50fa..e568c47 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -181,6 +181,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) int ret; u64 ip_oldflags; unsigned int i_oldflags; + umode_t mode; if (btrfs_root_readonly(root)) return -EROFS; @@ -203,6 +204,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) ip_oldflags = ip->flags; i_oldflags = inode->i_flags; + mode = inode->i_mode; flags = btrfs_mask_flags(inode->i_mode, flags); oldflags = btrfs_flags_to_ioctl(ip->flags); @@ -237,10 +239,31 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) ip->flags |= BTRFS_INODE_DIRSYNC; else ip->flags &= ~BTRFS_INODE_DIRSYNC; - if (flags & FS_NOCOW_FL) - ip->flags |= BTRFS_INODE_NODATACOW; - else - ip->flags &= ~BTRFS_INODE_NODATACOW; + if (flags & FS_NOCOW_FL) { + if (S_ISREG(mode)) { + /* + * It's safe to turn csums off here, no extents exist. + * Otherwise we want the flag to reflect the real COW + * status of the file and will not set it. + */ + if (inode->i_size == 0) + ip->flags |= BTRFS_INODE_NODATACOW + | BTRFS_INODE_NODATASUM; + } else { + ip->flags |= BTRFS_INODE_NODATACOW; + } + } else { + /* + * Revert back under same assuptions as above + */ + if (S_ISREG(mode)) { + if (inode->i_size == 0) + ip->flags &= ~(BTRFS_INODE_NODATACOW + | BTRFS_INODE_NODATASUM); + } else { + ip->flags &= ~BTRFS_INODE_NODATACOW; + } + } /* * The COMPRESS flag can only be changed by users, while the NOCOMPRESS @@ -516,7 +539,8 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, if (!pending_snapshot) return -ENOMEM; - btrfs_init_block_rsv(&pending_snapshot->block_rsv); + btrfs_init_block_rsv(&pending_snapshot->block_rsv, + BTRFS_BLOCK_RSV_TEMP); pending_snapshot->dentry = dentry; pending_snapshot->root = root; pending_snapshot->readonly = readonly; @@ -525,7 +549,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, *inherit = NULL; /* take responsibility to free it */ } - trans = btrfs_start_transaction(root->fs_info->extent_root, 5); + trans = btrfs_start_transaction(root->fs_info->extent_root, 6); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto fail; @@ -575,13 +599,13 @@ fail: */ static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode) { - uid_t fsuid = current_fsuid(); + kuid_t fsuid = current_fsuid(); if (!(dir->i_mode & S_ISVTX)) return 0; - if (inode->i_uid == fsuid) + if (uid_eq(inode->i_uid, fsuid)) return 0; - if (dir->i_uid == fsuid) + if (uid_eq(dir->i_uid, fsuid)) return 0; return !capable(CAP_FOWNER); } @@ -1022,8 +1046,8 @@ again: page_start, page_end - 1, 0, &cached_state); clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, - GFP_NOFS); + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, + &cached_state, GFP_NOFS); if (i_done != page_cnt) { spin_lock(&BTRFS_I(inode)->lock); @@ -1034,8 +1058,8 @@ again: } - btrfs_set_extent_delalloc(inode, page_start, page_end - 1, - &cached_state); + set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, + &cached_state, GFP_NOFS); unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, &cached_state, @@ -1397,7 +1421,6 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, u64 *transid, bool readonly, struct btrfs_qgroup_inherit **inherit) { - struct file *src_file; int namelen; int ret = 0; @@ -1421,25 +1444,24 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, ret = btrfs_mksubvol(&file->f_path, name, namelen, NULL, transid, readonly, inherit); } else { + struct fd src = fdget(fd); struct inode *src_inode; - src_file = fget(fd); - if (!src_file) { + if (!src.file) { ret = -EINVAL; goto out_drop_write; } - src_inode = src_file->f_path.dentry->d_inode; + src_inode = src.file->f_path.dentry->d_inode; if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) { printk(KERN_INFO "btrfs: Snapshot src from " "another FS\n"); ret = -EINVAL; - fput(src_file); - goto out_drop_write; + } else { + ret = btrfs_mksubvol(&file->f_path, name, namelen, + BTRFS_I(src_inode)->root, + transid, readonly, inherit); } - ret = btrfs_mksubvol(&file->f_path, name, namelen, - BTRFS_I(src_inode)->root, - transid, readonly, inherit); - fput(src_file); + fdput(src); } out_drop_write: mnt_drop_write_file(file); @@ -2341,7 +2363,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, { struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; - struct file *src_file; + struct fd src_file; struct inode *src; struct btrfs_trans_handle *trans; struct btrfs_path *path; @@ -2353,7 +2375,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, int ret; u64 len = olen; u64 bs = root->fs_info->sb->s_blocksize; - u64 hint_byte; /* * TODO: @@ -2376,24 +2397,24 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, if (ret) return ret; - src_file = fget(srcfd); - if (!src_file) { + src_file = fdget(srcfd); + if (!src_file.file) { ret = -EBADF; goto out_drop_write; } ret = -EXDEV; - if (src_file->f_path.mnt != file->f_path.mnt) + if (src_file.file->f_path.mnt != file->f_path.mnt) goto out_fput; - src = src_file->f_dentry->d_inode; + src = src_file.file->f_dentry->d_inode; ret = -EINVAL; if (src == inode) goto out_fput; /* the src must be open for reading */ - if (!(src_file->f_mode & FMODE_READ)) + if (!(src_file.file->f_mode & FMODE_READ)) goto out_fput; /* don't make the dst file partly checksummed */ @@ -2458,13 +2479,13 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, another, and lock file content */ while (1) { struct btrfs_ordered_extent *ordered; - lock_extent(&BTRFS_I(src)->io_tree, off, off+len); - ordered = btrfs_lookup_first_ordered_extent(src, off+len); + lock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); + ordered = btrfs_lookup_first_ordered_extent(src, off + len - 1); if (!ordered && - !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len, - EXTENT_DELALLOC, 0, NULL)) + !test_range_bit(&BTRFS_I(src)->io_tree, off, off + len - 1, + EXTENT_DELALLOC, 0, NULL)) break; - unlock_extent(&BTRFS_I(src)->io_tree, off, off+len); + unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); if (ordered) btrfs_put_ordered_extent(ordered); btrfs_wait_ordered_range(src, off, len); @@ -2538,7 +2559,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, btrfs_release_path(path); if (key.offset + datal <= off || - key.offset >= off+len) + key.offset >= off + len - 1) goto next; memcpy(&new_key, &key, sizeof(new_key)); @@ -2576,10 +2597,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, datal -= off - key.offset; } - ret = btrfs_drop_extents(trans, inode, + ret = btrfs_drop_extents(trans, root, inode, new_key.offset, new_key.offset + datal, - &hint_byte, 1); + 1); if (ret) { btrfs_abort_transaction(trans, root, ret); @@ -2639,8 +2660,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, new_key.offset += skip; } - if (key.offset + datal > off+len) - trim = key.offset + datal - (off+len); + if (key.offset + datal > off + len) + trim = key.offset + datal - (off + len); if (comp && (skip || trim)) { ret = -EINVAL; @@ -2650,10 +2671,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, size -= skip + trim; datal -= skip + trim; - ret = btrfs_drop_extents(trans, inode, + ret = btrfs_drop_extents(trans, root, inode, new_key.offset, new_key.offset + datal, - &hint_byte, 1); + 1); if (ret) { btrfs_abort_transaction(trans, root, ret); @@ -2717,14 +2738,14 @@ next: ret = 0; out: btrfs_release_path(path); - unlock_extent(&BTRFS_I(src)->io_tree, off, off+len); + unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); out_unlock: mutex_unlock(&src->i_mutex); mutex_unlock(&inode->i_mutex); vfree(buf); btrfs_free_path(path); out_fput: - fput(src_file); + fdput(src_file); out_drop_write: mnt_drop_write_file(file); return ret; @@ -2852,8 +2873,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) return 0; } -static void get_block_group_info(struct list_head *groups_list, - struct btrfs_ioctl_space_info *space) +void btrfs_get_block_group_info(struct list_head *groups_list, + struct btrfs_ioctl_space_info *space) { struct btrfs_block_group_cache *block_group; @@ -2961,8 +2982,8 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) down_read(&info->groups_sem); for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { if (!list_empty(&info->block_groups[c])) { - get_block_group_info(&info->block_groups[c], - &space); + btrfs_get_block_group_info( + &info->block_groups[c], &space); memcpy(dest, &space, sizeof(space)); dest++; space_args.total_spaces++; @@ -3210,11 +3231,9 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, { int ret = 0; int size; - u64 extent_item_pos; struct btrfs_ioctl_logical_ino_args *loi; struct btrfs_data_container *inodes = NULL; struct btrfs_path *path = NULL; - struct btrfs_key key; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -3232,7 +3251,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, goto out; } - size = min_t(u32, loi->size, 4096); + size = min_t(u32, loi->size, 64 * 1024); inodes = init_data_container(size); if (IS_ERR(inodes)) { ret = PTR_ERR(inodes); @@ -3240,22 +3259,13 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, goto out; } - ret = extent_from_logical(root->fs_info, loi->logical, path, &key); - btrfs_release_path(path); - - if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) + ret = iterate_inodes_from_logical(loi->logical, root->fs_info, path, + build_ino_list, inodes); + if (ret == -EINVAL) ret = -ENOENT; if (ret < 0) goto out; - extent_item_pos = loi->logical - key.objectid; - ret = iterate_extent_inodes(root->fs_info, key.objectid, - extent_item_pos, 0, build_ino_list, - inodes); - - if (ret < 0) - goto out; - ret = copy_to_user((void *)(unsigned long)loi->inodes, (void *)(unsigned long)inodes, size); if (ret) @@ -3263,7 +3273,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, out: btrfs_free_path(path); - kfree(inodes); + vfree(inodes); kfree(loi); return ret; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 051c7fe..7772f02 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -25,6 +25,8 @@ #include "btrfs_inode.h" #include "extent_io.h" +static struct kmem_cache *btrfs_ordered_extent_cache; + static u64 entry_end(struct btrfs_ordered_extent *entry) { if (entry->file_offset + entry->len < entry->file_offset) @@ -187,7 +189,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, struct btrfs_ordered_extent *entry; tree = &BTRFS_I(inode)->ordered_tree; - entry = kzalloc(sizeof(*entry), GFP_NOFS); + entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS); if (!entry) return -ENOMEM; @@ -421,7 +423,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) list_del(&sum->list); kfree(sum); } - kfree(entry); + kmem_cache_free(btrfs_ordered_extent_cache, entry); } } @@ -466,8 +468,7 @@ void btrfs_remove_ordered_extent(struct inode *inode, * wait for all the ordered extents in a root. This is done when balancing * space between drives. */ -void btrfs_wait_ordered_extents(struct btrfs_root *root, - int nocow_only, int delay_iput) +void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) { struct list_head splice; struct list_head *cur; @@ -482,15 +483,6 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, cur = splice.next; ordered = list_entry(cur, struct btrfs_ordered_extent, root_extent_list); - if (nocow_only && - !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) && - !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { - list_move(&ordered->root_extent_list, - &root->fs_info->ordered_extents); - cond_resched_lock(&root->fs_info->ordered_extent_lock); - continue; - } - list_del_init(&ordered->root_extent_list); atomic_inc(&ordered->refs); @@ -775,7 +767,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; u64 disk_i_size; u64 new_i_size; - u64 i_size_test; u64 i_size = i_size_read(inode); struct rb_node *node; struct rb_node *prev = NULL; @@ -835,55 +826,30 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, break; if (test->file_offset >= i_size) break; - if (test->file_offset >= disk_i_size) + if (test->file_offset >= disk_i_size) { + /* + * we don't update disk_i_size now, so record this + * undealt i_size. Or we will not know the real + * i_size. + */ + if (test->outstanding_isize < offset) + test->outstanding_isize = offset; + if (ordered && + ordered->outstanding_isize > + test->outstanding_isize) + test->outstanding_isize = + ordered->outstanding_isize; goto out; - } - new_i_size = min_t(u64, offset, i_size); - - /* - * at this point, we know we can safely update i_size to at least - * the offset from this ordered extent. But, we need to - * walk forward and see if ios from higher up in the file have - * finished. - */ - if (ordered) { - node = rb_next(&ordered->rb_node); - } else { - if (prev) - node = rb_next(prev); - else - node = rb_first(&tree->tree); - } - - /* - * We are looking for an area between our current extent and the next - * ordered extent to update the i_size to. There are 3 cases here - * - * 1) We don't actually have anything and we can update to i_size. - * 2) We have stuff but they already did their i_size update so again we - * can just update to i_size. - * 3) We have an outstanding ordered extent so the most we can update - * our disk_i_size to is the start of the next offset. - */ - i_size_test = i_size; - for (; node; node = rb_next(node)) { - test = rb_entry(node, struct btrfs_ordered_extent, rb_node); - - if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags)) - continue; - if (test->file_offset > offset) { - i_size_test = test->file_offset; - break; } } + new_i_size = min_t(u64, offset, i_size); /* - * i_size_test is the end of a region after this ordered - * extent where there are no ordered extents, we can safely set - * disk_i_size to this. + * Some ordered extents may completed before the current one, and + * we hold the real i_size in ->outstanding_isize. */ - if (i_size_test > offset) - new_i_size = min_t(u64, i_size_test, i_size); + if (ordered && ordered->outstanding_isize > new_i_size) + new_i_size = min_t(u64, ordered->outstanding_isize, i_size); BTRFS_I(inode)->disk_i_size = new_i_size; ret = 0; out: @@ -984,3 +950,20 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, } spin_unlock(&root->fs_info->ordered_extent_lock); } + +int __init ordered_data_init(void) +{ + btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", + sizeof(struct btrfs_ordered_extent), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + NULL); + if (!btrfs_ordered_extent_cache) + return -ENOMEM; + return 0; +} + +void ordered_data_exit(void) +{ + if (btrfs_ordered_extent_cache) + kmem_cache_destroy(btrfs_ordered_extent_cache); +} diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index e03c560..dd27a0b 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -96,6 +96,13 @@ struct btrfs_ordered_extent { /* number of bytes that still need writing */ u64 bytes_left; + /* + * the end of the ordered extent which is behind it but + * didn't update disk_i_size. Please see the comment of + * btrfs_ordered_update_i_size(); + */ + u64 outstanding_isize; + /* flags (described above) */ unsigned long flags; @@ -183,6 +190,7 @@ void btrfs_run_ordered_operations(struct btrfs_root *root, int wait); void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); -void btrfs_wait_ordered_extents(struct btrfs_root *root, - int nocow_only, int delay_iput); +void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput); +int __init ordered_data_init(void); +void ordered_data_exit(void); #endif diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 38b42e7..5039686 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1145,12 +1145,12 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, ulist_reinit(tmp); /* XXX id not needed */ - ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC); + ulist_add(tmp, qg->qgroupid, (u64)(uintptr_t)qg, GFP_ATOMIC); ULIST_ITER_INIT(&tmp_uiter); while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { struct btrfs_qgroup_list *glist; - qg = (struct btrfs_qgroup *)tmp_unode->aux; + qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux; if (qg->refcnt < seq) qg->refcnt = seq + 1; else @@ -1158,7 +1158,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, list_for_each_entry(glist, &qg->groups, next_group) { ulist_add(tmp, glist->group->qgroupid, - (unsigned long)glist->group, + (u64)(uintptr_t)glist->group, GFP_ATOMIC); } } @@ -1168,13 +1168,13 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, * step 2: walk from the new root */ ulist_reinit(tmp); - ulist_add(tmp, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC); + ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC); ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(tmp, &uiter))) { struct btrfs_qgroup *qg; struct btrfs_qgroup_list *glist; - qg = (struct btrfs_qgroup *)unode->aux; + qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; if (qg->refcnt < seq) { /* not visited by step 1 */ qg->rfer += sgn * node->num_bytes; @@ -1190,7 +1190,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, list_for_each_entry(glist, &qg->groups, next_group) { ulist_add(tmp, glist->group->qgroupid, - (unsigned long)glist->group, GFP_ATOMIC); + (uintptr_t)glist->group, GFP_ATOMIC); } } @@ -1208,12 +1208,12 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, continue; ulist_reinit(tmp); - ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC); + ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC); ULIST_ITER_INIT(&tmp_uiter); while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { struct btrfs_qgroup_list *glist; - qg = (struct btrfs_qgroup *)tmp_unode->aux; + qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux; if (qg->tag == seq) continue; @@ -1225,7 +1225,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, list_for_each_entry(glist, &qg->groups, next_group) { ulist_add(tmp, glist->group->qgroupid, - (unsigned long)glist->group, + (uintptr_t)glist->group, GFP_ATOMIC); } } @@ -1371,10 +1371,8 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, if (srcid) { srcgroup = find_qgroup_rb(fs_info, srcid); - if (!srcgroup) { - ret = -EINVAL; + if (!srcgroup) goto unlock; - } dstgroup->rfer = srcgroup->rfer - level_size; dstgroup->rfer_cmpr = srcgroup->rfer_cmpr - level_size; srcgroup->excl = level_size; @@ -1383,10 +1381,8 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, qgroup_dirty(fs_info, srcgroup); } - if (!inherit) { - ret = -EINVAL; + if (!inherit) goto unlock; - } i_qgroups = (u64 *)(inherit + 1); for (i = 0; i < inherit->num_qgroups; ++i) { @@ -1473,13 +1469,17 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) * be exceeded */ ulist = ulist_alloc(GFP_ATOMIC); - ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC); + if (!ulist) { + ret = -ENOMEM; + goto out; + } + ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC); ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(ulist, &uiter))) { struct btrfs_qgroup *qg; struct btrfs_qgroup_list *glist; - qg = (struct btrfs_qgroup *)unode->aux; + qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && qg->reserved + qg->rfer + num_bytes > @@ -1493,7 +1493,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) list_for_each_entry(glist, &qg->groups, next_group) { ulist_add(ulist, glist->group->qgroupid, - (unsigned long)glist->group, GFP_ATOMIC); + (uintptr_t)glist->group, GFP_ATOMIC); } } if (ret) @@ -1506,7 +1506,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) while ((unode = ulist_next(ulist, &uiter))) { struct btrfs_qgroup *qg; - qg = (struct btrfs_qgroup *)unode->aux; + qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; qg->reserved += num_bytes; } @@ -1545,19 +1545,23 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) goto out; ulist = ulist_alloc(GFP_ATOMIC); - ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC); + if (!ulist) { + btrfs_std_error(fs_info, -ENOMEM); + goto out; + } + ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC); ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(ulist, &uiter))) { struct btrfs_qgroup *qg; struct btrfs_qgroup_list *glist; - qg = (struct btrfs_qgroup *)unode->aux; + qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; qg->reserved -= num_bytes; list_for_each_entry(glist, &qg->groups, next_group) { ulist_add(ulist, glist->group->qgroupid, - (unsigned long)glist->group, GFP_ATOMIC); + (uintptr_t)glist->group, GFP_ATOMIC); } } diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 48a4882..a955669 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -68,7 +68,7 @@ struct reada_extent { u32 blocksize; int err; struct list_head extctl; - struct kref refcnt; + int refcnt; spinlock_t lock; struct reada_zone *zones[BTRFS_MAX_MIRRORS]; int nzones; @@ -126,7 +126,7 @@ static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, spin_lock(&fs_info->reada_lock); re = radix_tree_lookup(&fs_info->reada_tree, index); if (re) - kref_get(&re->refcnt); + re->refcnt++; spin_unlock(&fs_info->reada_lock); if (!re) @@ -336,7 +336,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, spin_lock(&fs_info->reada_lock); re = radix_tree_lookup(&fs_info->reada_tree, index); if (re) - kref_get(&re->refcnt); + re->refcnt++; spin_unlock(&fs_info->reada_lock); if (re) @@ -352,7 +352,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, re->top = *top; INIT_LIST_HEAD(&re->extctl); spin_lock_init(&re->lock); - kref_init(&re->refcnt); + re->refcnt = 1; /* * map block @@ -398,7 +398,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, if (ret == -EEXIST) { re_exist = radix_tree_lookup(&fs_info->reada_tree, index); BUG_ON(!re_exist); - kref_get(&re_exist->refcnt); + re_exist->refcnt++; spin_unlock(&fs_info->reada_lock); goto error; } @@ -465,10 +465,6 @@ error: return re_exist; } -static void reada_kref_dummy(struct kref *kr) -{ -} - static void reada_extent_put(struct btrfs_fs_info *fs_info, struct reada_extent *re) { @@ -476,7 +472,7 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info, unsigned long index = re->logical >> PAGE_CACHE_SHIFT; spin_lock(&fs_info->reada_lock); - if (!kref_put(&re->refcnt, reada_kref_dummy)) { + if (--re->refcnt) { spin_unlock(&fs_info->reada_lock); return; } @@ -671,7 +667,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, return 0; } dev->reada_next = re->logical + re->blocksize; - kref_get(&re->refcnt); + re->refcnt++; spin_unlock(&fs_info->reada_lock); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 4da0865..776f0aa 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3270,8 +3270,8 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info, key.offset = 0; inode = btrfs_iget(fs_info->sb, &key, root, NULL); - if (IS_ERR_OR_NULL(inode) || is_bad_inode(inode)) { - if (inode && !IS_ERR(inode)) + if (IS_ERR(inode) || is_bad_inode(inode)) { + if (!IS_ERR(inode)) iput(inode); return -ENOENT; } @@ -3621,7 +3621,7 @@ next: ret = find_first_extent_bit(&rc->processed_blocks, key.objectid, &start, &end, - EXTENT_DIRTY); + EXTENT_DIRTY, NULL); if (ret == 0 && start <= key.objectid) { btrfs_release_path(path); @@ -3674,7 +3674,8 @@ int prepare_to_relocate(struct reloc_control *rc) struct btrfs_trans_handle *trans; int ret; - rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root); + rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root, + BTRFS_BLOCK_RSV_TEMP); if (!rc->block_rsv) return -ENOMEM; @@ -4057,7 +4058,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) (unsigned long long)rc->block_group->flags); btrfs_start_delalloc_inodes(fs_info->tree_root, 0); - btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0); + btrfs_wait_ordered_extents(fs_info->tree_root, 0); while (1) { mutex_lock(&fs_info->cleaner_mutex); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 10d8e4d..eb923d0 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -141,8 +141,10 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root return -ENOMEM; ret = btrfs_search_slot(trans, root, key, path, 0, 1); - if (ret < 0) - goto out_abort; + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } if (ret != 0) { btrfs_print_leaf(root, path->nodes[0]); @@ -166,16 +168,23 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_release_path(path); ret = btrfs_search_slot(trans, root, key, path, -1, 1); - if (ret < 0) - goto out_abort; + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } + ret = btrfs_del_item(trans, root, path); - if (ret < 0) - goto out_abort; + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } btrfs_release_path(path); ret = btrfs_insert_empty_item(trans, root, path, key, sizeof(*item)); - if (ret < 0) - goto out_abort; + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } l = path->nodes[0]; slot = path->slots[0]; ptr = btrfs_item_ptr_offset(l, slot); @@ -192,10 +201,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root out: btrfs_free_path(path); return ret; - -out_abort: - btrfs_abort_transaction(trans, root, ret); - goto out; } int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index b223620..27892f6 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -352,13 +352,14 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) struct extent_buffer *eb; struct btrfs_extent_item *ei; struct scrub_warning swarn; - u32 item_size; - int ret; + unsigned long ptr = 0; + u64 extent_item_pos; + u64 flags = 0; u64 ref_root; + u32 item_size; u8 ref_level; - unsigned long ptr = 0; const int bufsize = 4096; - u64 extent_item_pos; + int ret; path = btrfs_alloc_path(); @@ -375,7 +376,8 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) if (!path || !swarn.scratch_buf || !swarn.msg_buf) goto out; - ret = extent_from_logical(fs_info, swarn.logical, path, &found_key); + ret = extent_from_logical(fs_info, swarn.logical, path, &found_key, + &flags); if (ret < 0) goto out; @@ -387,7 +389,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) item_size = btrfs_item_size_nr(eb, path->slots[0]); btrfs_release_path(path); - if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { do { ret = tree_backref_for_extent(&ptr, eb, ei, item_size, &ref_root, &ref_level); @@ -1029,6 +1031,7 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev, spin_lock(&sdev->stat_lock); sdev->stat.malloc_errors++; spin_unlock(&sdev->stat_lock); + kfree(bbio); return -ENOMEM; } sblock->page_count++; @@ -1666,21 +1669,6 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) scrub_block_put(sblock); } - if (sbio->err) { - /* what is this good for??? */ - sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); - sbio->bio->bi_flags |= 1 << BIO_UPTODATE; - sbio->bio->bi_phys_segments = 0; - sbio->bio->bi_idx = 0; - - for (i = 0; i < sbio->page_count; i++) { - struct bio_vec *bi; - bi = &sbio->bio->bi_io_vec[i]; - bi->bv_offset = 0; - bi->bv_len = PAGE_SIZE; - } - } - bio_put(sbio->bio); sbio->bio = NULL; spin_lock(&sdev->list_lock); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index fb5ffe9..c7beb54 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -107,7 +107,6 @@ struct send_ctx { int cur_inode_new; int cur_inode_new_gen; int cur_inode_deleted; - int cur_inode_first_ref_orphan; u64 cur_inode_size; u64 cur_inode_mode; @@ -126,7 +125,15 @@ struct send_ctx { struct name_cache_entry { struct list_head list; - struct list_head use_list; + /* + * radix_tree has only 32bit entries but we need to handle 64bit inums. + * We use the lower 32bit of the 64bit inum to store it in the tree. If + * more then one inum would fall into the same entry, we use radix_list + * to store the additional entries. radix_list is also used to store + * entries where two entries have the same inum but different + * generations. + */ + struct list_head radix_list; u64 ino; u64 gen; u64 parent_ino; @@ -328,6 +335,7 @@ out: return ret; } +#if 0 static void fs_path_remove(struct fs_path *p) { BUG_ON(p->reversed); @@ -335,6 +343,7 @@ static void fs_path_remove(struct fs_path *p) p->end--; *p->end = 0; } +#endif static int fs_path_copy(struct fs_path *p, struct fs_path *from) { @@ -377,7 +386,7 @@ static struct btrfs_path *alloc_path_for_send(void) return path; } -static int write_buf(struct send_ctx *sctx, const void *buf, u32 len) +int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off) { int ret; mm_segment_t old_fs; @@ -387,8 +396,7 @@ static int write_buf(struct send_ctx *sctx, const void *buf, u32 len) set_fs(KERNEL_DS); while (pos < len) { - ret = vfs_write(sctx->send_filp, (char *)buf + pos, len - pos, - &sctx->send_off); + ret = vfs_write(filp, (char *)buf + pos, len - pos, off); /* TODO handle that correctly */ /*if (ret == -ERESTARTSYS) { continue; @@ -544,7 +552,8 @@ static int send_header(struct send_ctx *sctx) strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC); hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION); - return write_buf(sctx, &hdr, sizeof(hdr)); + return write_buf(sctx->send_filp, &hdr, sizeof(hdr), + &sctx->send_off); } /* @@ -581,7 +590,8 @@ static int send_cmd(struct send_ctx *sctx) crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); hdr->crc = cpu_to_le32(crc); - ret = write_buf(sctx, sctx->send_buf, sctx->send_size); + ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, + &sctx->send_off); sctx->total_send_size += sctx->send_size; sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size; @@ -687,7 +697,8 @@ out: */ static int get_inode_info(struct btrfs_root *root, u64 ino, u64 *size, u64 *gen, - u64 *mode, u64 *uid, u64 *gid) + u64 *mode, u64 *uid, u64 *gid, + u64 *rdev) { int ret; struct btrfs_inode_item *ii; @@ -721,6 +732,8 @@ static int get_inode_info(struct btrfs_root *root, *uid = btrfs_inode_uid(path->nodes[0], ii); if (gid) *gid = btrfs_inode_gid(path->nodes[0], ii); + if (rdev) + *rdev = btrfs_inode_rdev(path->nodes[0], ii); out: btrfs_free_path(path); @@ -852,7 +865,6 @@ static int iterate_dir_item(struct send_ctx *sctx, struct extent_buffer *eb; struct btrfs_item *item; struct btrfs_dir_item *di; - struct btrfs_path *tmp_path = NULL; struct btrfs_key di_key; char *buf = NULL; char *buf2 = NULL; @@ -874,12 +886,6 @@ static int iterate_dir_item(struct send_ctx *sctx, goto out; } - tmp_path = alloc_path_for_send(); - if (!tmp_path) { - ret = -ENOMEM; - goto out; - } - eb = path->nodes[0]; slot = path->slots[0]; item = btrfs_item_nr(eb, slot); @@ -941,7 +947,6 @@ static int iterate_dir_item(struct send_ctx *sctx, } out: - btrfs_free_path(tmp_path); if (buf_virtual) vfree(buf); else @@ -1026,12 +1031,12 @@ struct backref_ctx { u64 extent_len; /* Just to check for bugs in backref resolving */ - int found_in_send_root; + int found_itself; }; static int __clone_root_cmp_bsearch(const void *key, const void *elt) { - u64 root = (u64)key; + u64 root = (u64)(uintptr_t)key; struct clone_root *cr = (struct clone_root *)elt; if (root < cr->root->objectid) @@ -1055,6 +1060,7 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2) /* * Called for every backref that is found for the current extent. + * Results are collected in sctx->clone_roots->ino/offset/found_refs */ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) { @@ -1064,7 +1070,7 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) u64 i_size; /* First check if the root is in the list of accepted clone sources */ - found = bsearch((void *)root, bctx->sctx->clone_roots, + found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots, bctx->sctx->clone_roots_cnt, sizeof(struct clone_root), __clone_root_cmp_bsearch); @@ -1074,14 +1080,15 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) if (found->root == bctx->sctx->send_root && ino == bctx->cur_objectid && offset == bctx->cur_offset) { - bctx->found_in_send_root = 1; + bctx->found_itself = 1; } /* - * There are inodes that have extents that lie behind it's i_size. Don't + * There are inodes that have extents that lie behind its i_size. Don't * accept clones from these extents. */ - ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL); + ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL, + NULL); if (ret < 0) return ret; @@ -1101,16 +1108,12 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) */ if (ino >= bctx->cur_objectid) return 0; - /*if (ino > ctx->cur_objectid) +#if 0 + if (ino > bctx->cur_objectid) return 0; - if (offset + ctx->extent_len > ctx->cur_offset) - return 0;*/ - - bctx->found++; - found->found_refs++; - found->ino = ino; - found->offset = offset; - return 0; + if (offset + bctx->extent_len > bctx->cur_offset) + return 0; +#endif } bctx->found++; @@ -1130,6 +1133,12 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) } /* + * Given an inode, offset and extent item, it finds a good clone for a clone + * instruction. Returns -ENOENT when none could be found. The function makes + * sure that the returned clone is usable at the point where sending is at the + * moment. This means, that no clones are accepted which lie behind the current + * inode+offset. + * * path must point to the extent item when called. */ static int find_extent_clone(struct send_ctx *sctx, @@ -1141,20 +1150,29 @@ static int find_extent_clone(struct send_ctx *sctx, int ret; int extent_type; u64 logical; + u64 disk_byte; u64 num_bytes; u64 extent_item_pos; + u64 flags = 0; struct btrfs_file_extent_item *fi; struct extent_buffer *eb = path->nodes[0]; - struct backref_ctx backref_ctx; + struct backref_ctx *backref_ctx = NULL; struct clone_root *cur_clone_root; struct btrfs_key found_key; struct btrfs_path *tmp_path; + int compressed; u32 i; tmp_path = alloc_path_for_send(); if (!tmp_path) return -ENOMEM; + backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS); + if (!backref_ctx) { + ret = -ENOMEM; + goto out; + } + if (data_offset >= ino_size) { /* * There may be extents that lie behind the file's size. @@ -1172,22 +1190,23 @@ static int find_extent_clone(struct send_ctx *sctx, ret = -ENOENT; goto out; } + compressed = btrfs_file_extent_compression(eb, fi); num_bytes = btrfs_file_extent_num_bytes(eb, fi); - logical = btrfs_file_extent_disk_bytenr(eb, fi); - if (logical == 0) { + disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + if (disk_byte == 0) { ret = -ENOENT; goto out; } - logical += btrfs_file_extent_offset(eb, fi); + logical = disk_byte + btrfs_file_extent_offset(eb, fi); - ret = extent_from_logical(sctx->send_root->fs_info, - logical, tmp_path, &found_key); + ret = extent_from_logical(sctx->send_root->fs_info, disk_byte, tmp_path, + &found_key, &flags); btrfs_release_path(tmp_path); if (ret < 0) goto out; - if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { ret = -EIO; goto out; } @@ -1202,12 +1221,12 @@ static int find_extent_clone(struct send_ctx *sctx, cur_clone_root->found_refs = 0; } - backref_ctx.sctx = sctx; - backref_ctx.found = 0; - backref_ctx.cur_objectid = ino; - backref_ctx.cur_offset = data_offset; - backref_ctx.found_in_send_root = 0; - backref_ctx.extent_len = num_bytes; + backref_ctx->sctx = sctx; + backref_ctx->found = 0; + backref_ctx->cur_objectid = ino; + backref_ctx->cur_offset = data_offset; + backref_ctx->found_itself = 0; + backref_ctx->extent_len = num_bytes; /* * The last extent of a file may be too large due to page alignment. @@ -1215,25 +1234,31 @@ static int find_extent_clone(struct send_ctx *sctx, * __iterate_backrefs work. */ if (data_offset + num_bytes >= ino_size) - backref_ctx.extent_len = ino_size - data_offset; + backref_ctx->extent_len = ino_size - data_offset; /* * Now collect all backrefs. */ + if (compressed == BTRFS_COMPRESS_NONE) + extent_item_pos = logical - found_key.objectid; + else + extent_item_pos = 0; + extent_item_pos = logical - found_key.objectid; ret = iterate_extent_inodes(sctx->send_root->fs_info, found_key.objectid, extent_item_pos, 1, - __iterate_backrefs, &backref_ctx); + __iterate_backrefs, backref_ctx); + if (ret < 0) goto out; - if (!backref_ctx.found_in_send_root) { + if (!backref_ctx->found_itself) { /* found a bug in backref code? */ ret = -EIO; printk(KERN_ERR "btrfs: ERROR did not find backref in " "send_root. inode=%llu, offset=%llu, " - "logical=%llu\n", - ino, data_offset, logical); + "disk_byte=%llu found extent=%llu\n", + ino, data_offset, disk_byte, found_key.objectid); goto out; } @@ -1242,7 +1267,7 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, " "num_bytes=%llu, logical=%llu\n", data_offset, ino, num_bytes, logical); - if (!backref_ctx.found) + if (!backref_ctx->found) verbose_printk("btrfs: no clones found\n"); cur_clone_root = NULL; @@ -1253,7 +1278,6 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, " else if (sctx->clone_roots[i].root == sctx->send_root) /* prefer clones from send_root over others */ cur_clone_root = sctx->clone_roots + i; - break; } } @@ -1267,6 +1291,7 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, " out: btrfs_free_path(tmp_path); + kfree(backref_ctx); return ret; } @@ -1307,8 +1332,6 @@ static int read_symlink(struct send_ctx *sctx, len = btrfs_file_extent_inline_len(path->nodes[0], ei); ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len); - if (ret < 0) - goto out; out: btrfs_free_path(path); @@ -1404,7 +1427,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) u64 right_gen; ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL, - NULL); + NULL, NULL); if (ret < 0 && ret != -ENOENT) goto out; left_ret = ret; @@ -1413,16 +1436,16 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) right_ret = -ENOENT; } else { ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen, - NULL, NULL, NULL); + NULL, NULL, NULL, NULL); if (ret < 0 && ret != -ENOENT) goto out; right_ret = ret; } if (!left_ret && !right_ret) { - if (left_gen == gen && right_gen == gen) + if (left_gen == gen && right_gen == gen) { ret = inode_state_no_change; - else if (left_gen == gen) { + } else if (left_gen == gen) { if (ino < sctx->send_progress) ret = inode_state_did_create; else @@ -1516,6 +1539,10 @@ out: return ret; } +/* + * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir, + * generation of the parent dir and the name of the dir entry. + */ static int get_first_ref(struct send_ctx *sctx, struct btrfs_root *root, u64 ino, u64 *dir, u64 *dir_gen, struct fs_path *name) @@ -1557,7 +1584,7 @@ static int get_first_ref(struct send_ctx *sctx, btrfs_release_path(path); ret = get_inode_info(root, found_key.offset, NULL, dir_gen, NULL, NULL, - NULL); + NULL, NULL); if (ret < 0) goto out; @@ -1586,22 +1613,28 @@ static int is_first_ref(struct send_ctx *sctx, if (ret < 0) goto out; - if (name_len != fs_path_len(tmp_name)) { + if (dir != tmp_dir || name_len != fs_path_len(tmp_name)) { ret = 0; goto out; } - ret = memcmp(tmp_name->start, name, name_len); - if (ret) - ret = 0; - else - ret = 1; + ret = !memcmp(tmp_name->start, name, name_len); out: fs_path_free(sctx, tmp_name); return ret; } +/* + * Used by process_recorded_refs to determine if a new ref would overwrite an + * already existing ref. In case it detects an overwrite, it returns the + * inode/gen in who_ino/who_gen. + * When an overwrite is detected, process_recorded_refs does proper orphanizing + * to make sure later references to the overwritten inode are possible. + * Orphanizing is however only required for the first ref of an inode. + * process_recorded_refs does an additional is_first_ref check to see if + * orphanizing is really required. + */ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, const char *name, int name_len, u64 *who_ino, u64 *who_gen) @@ -1626,9 +1659,14 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, goto out; } + /* + * Check if the overwritten ref was already processed. If yes, the ref + * was already unlinked/moved, so we can safely assume that we will not + * overwrite anything at this point in time. + */ if (other_inode > sctx->send_progress) { ret = get_inode_info(sctx->parent_root, other_inode, NULL, - who_gen, NULL, NULL, NULL); + who_gen, NULL, NULL, NULL, NULL); if (ret < 0) goto out; @@ -1642,6 +1680,13 @@ out: return ret; } +/* + * Checks if the ref was overwritten by an already processed inode. This is + * used by __get_cur_name_and_parent to find out if the ref was orphanized and + * thus the orphan name needs be used. + * process_recorded_refs also uses it to avoid unlinking of refs that were + * overwritten. + */ static int did_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, u64 ino, u64 ino_gen, @@ -1671,7 +1716,7 @@ static int did_overwrite_ref(struct send_ctx *sctx, } ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL, - NULL); + NULL, NULL); if (ret < 0) goto out; @@ -1690,6 +1735,11 @@ out: return ret; } +/* + * Same as did_overwrite_ref, but also checks if it is the first ref of an inode + * that got overwritten. This is used by process_recorded_refs to determine + * if it has to use the path as returned by get_cur_path or the orphan name. + */ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen) { int ret = 0; @@ -1710,39 +1760,40 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen) ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen, name->start, fs_path_len(name)); - if (ret < 0) - goto out; out: fs_path_free(sctx, name); return ret; } +/* + * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit, + * so we need to do some special handling in case we have clashes. This function + * takes care of this with the help of name_cache_entry::radix_list. + * In case of error, nce is kfreed. + */ static int name_cache_insert(struct send_ctx *sctx, struct name_cache_entry *nce) { int ret = 0; - struct name_cache_entry **ncea; - - ncea = radix_tree_lookup(&sctx->name_cache, nce->ino); - if (ncea) { - if (!ncea[0]) - ncea[0] = nce; - else if (!ncea[1]) - ncea[1] = nce; - else - BUG(); - } else { - ncea = kmalloc(sizeof(void *) * 2, GFP_NOFS); - if (!ncea) + struct list_head *nce_head; + + nce_head = radix_tree_lookup(&sctx->name_cache, + (unsigned long)nce->ino); + if (!nce_head) { + nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS); + if (!nce_head) return -ENOMEM; + INIT_LIST_HEAD(nce_head); - ncea[0] = nce; - ncea[1] = NULL; - ret = radix_tree_insert(&sctx->name_cache, nce->ino, ncea); - if (ret < 0) + ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); + if (ret < 0) { + kfree(nce_head); + kfree(nce); return ret; + } } + list_add_tail(&nce->radix_list, nce_head); list_add_tail(&nce->list, &sctx->name_cache_list); sctx->name_cache_size++; @@ -1752,50 +1803,52 @@ static int name_cache_insert(struct send_ctx *sctx, static void name_cache_delete(struct send_ctx *sctx, struct name_cache_entry *nce) { - struct name_cache_entry **ncea; - - ncea = radix_tree_lookup(&sctx->name_cache, nce->ino); - BUG_ON(!ncea); - - if (ncea[0] == nce) - ncea[0] = NULL; - else if (ncea[1] == nce) - ncea[1] = NULL; - else - BUG(); + struct list_head *nce_head; - if (!ncea[0] && !ncea[1]) { - radix_tree_delete(&sctx->name_cache, nce->ino); - kfree(ncea); - } + nce_head = radix_tree_lookup(&sctx->name_cache, + (unsigned long)nce->ino); + BUG_ON(!nce_head); + list_del(&nce->radix_list); list_del(&nce->list); - sctx->name_cache_size--; + + if (list_empty(nce_head)) { + radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); + kfree(nce_head); + } } static struct name_cache_entry *name_cache_search(struct send_ctx *sctx, u64 ino, u64 gen) { - struct name_cache_entry **ncea; + struct list_head *nce_head; + struct name_cache_entry *cur; - ncea = radix_tree_lookup(&sctx->name_cache, ino); - if (!ncea) + nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino); + if (!nce_head) return NULL; - if (ncea[0] && ncea[0]->gen == gen) - return ncea[0]; - else if (ncea[1] && ncea[1]->gen == gen) - return ncea[1]; + list_for_each_entry(cur, nce_head, radix_list) { + if (cur->ino == ino && cur->gen == gen) + return cur; + } return NULL; } +/* + * Removes the entry from the list and adds it back to the end. This marks the + * entry as recently used so that name_cache_clean_unused does not remove it. + */ static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce) { list_del(&nce->list); list_add_tail(&nce->list, &sctx->name_cache_list); } +/* + * Remove some entries from the beginning of name_cache_list. + */ static void name_cache_clean_unused(struct send_ctx *sctx) { struct name_cache_entry *nce; @@ -1814,13 +1867,23 @@ static void name_cache_clean_unused(struct send_ctx *sctx) static void name_cache_free(struct send_ctx *sctx) { struct name_cache_entry *nce; - struct name_cache_entry *tmp; - list_for_each_entry_safe(nce, tmp, &sctx->name_cache_list, list) { + while (!list_empty(&sctx->name_cache_list)) { + nce = list_entry(sctx->name_cache_list.next, + struct name_cache_entry, list); name_cache_delete(sctx, nce); + kfree(nce); } } +/* + * Used by get_cur_path for each ref up to the root. + * Returns 0 if it succeeded. + * Returns 1 if the inode is not existent or got overwritten. In that case, the + * name is an orphan name. This instructs get_cur_path to stop iterating. If 1 + * is returned, parent_ino/parent_gen are not guaranteed to be valid. + * Returns <0 in case of error. + */ static int __get_cur_name_and_parent(struct send_ctx *sctx, u64 ino, u64 gen, u64 *parent_ino, @@ -1832,6 +1895,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, struct btrfs_path *path = NULL; struct name_cache_entry *nce = NULL; + /* + * First check if we already did a call to this function with the same + * ino/gen. If yes, check if the cache entry is still up-to-date. If yes + * return the cached result. + */ nce = name_cache_search(sctx, ino, gen); if (nce) { if (ino < sctx->send_progress && nce->need_later_update) { @@ -1854,6 +1922,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, if (!path) return -ENOMEM; + /* + * If the inode is not existent yet, add the orphan name and return 1. + * This should only happen for the parent dir that we determine in + * __record_new_ref + */ ret = is_inode_existent(sctx, ino, gen); if (ret < 0) goto out; @@ -1866,6 +1939,10 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, goto out_cache; } + /* + * Depending on whether the inode was already processed or not, use + * send_root or parent_root for ref lookup. + */ if (ino < sctx->send_progress) ret = get_first_ref(sctx, sctx->send_root, ino, parent_ino, parent_gen, dest); @@ -1875,6 +1952,10 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, if (ret < 0) goto out; + /* + * Check if the ref was overwritten by an inode's ref that was processed + * earlier. If yes, treat as orphan and return 1. + */ ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen, dest->start, dest->end - dest->start); if (ret < 0) @@ -1888,6 +1969,9 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, } out_cache: + /* + * Store the result of the lookup in the name cache. + */ nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS); if (!nce) { ret = -ENOMEM; @@ -1901,7 +1985,6 @@ out_cache: nce->name_len = fs_path_len(dest); nce->ret = ret; strcpy(nce->name, dest->start); - memset(&nce->use_list, 0, sizeof(nce->use_list)); if (ino < sctx->send_progress) nce->need_later_update = 0; @@ -2107,9 +2190,6 @@ static int send_subvol_begin(struct send_ctx *sctx) read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen); btrfs_release_path(path); - if (ret < 0) - goto out; - if (parent_root) { ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT); if (ret < 0) @@ -2276,7 +2356,7 @@ verbose_printk("btrfs: send_utimes %llu\n", ino); btrfs_inode_mtime(ii)); TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, btrfs_inode_ctime(ii)); - /* TODO otime? */ + /* TODO Add otime support when the otime patches get into upstream */ ret = send_cmd(sctx); @@ -2292,39 +2372,39 @@ out: * a valid path yet because we did not process the refs yet. So, the inode * is created as orphan. */ -static int send_create_inode(struct send_ctx *sctx, struct btrfs_path *path, - struct btrfs_key *key) +static int send_create_inode(struct send_ctx *sctx, u64 ino) { int ret = 0; - struct extent_buffer *eb = path->nodes[0]; - struct btrfs_inode_item *ii; struct fs_path *p; - int slot = path->slots[0]; int cmd; + u64 gen; u64 mode; + u64 rdev; -verbose_printk("btrfs: send_create_inode %llu\n", sctx->cur_ino); +verbose_printk("btrfs: send_create_inode %llu\n", ino); p = fs_path_alloc(sctx); if (!p) return -ENOMEM; - ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); - mode = btrfs_inode_mode(eb, ii); + ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, NULL, + NULL, &rdev); + if (ret < 0) + goto out; - if (S_ISREG(mode)) + if (S_ISREG(mode)) { cmd = BTRFS_SEND_C_MKFILE; - else if (S_ISDIR(mode)) + } else if (S_ISDIR(mode)) { cmd = BTRFS_SEND_C_MKDIR; - else if (S_ISLNK(mode)) + } else if (S_ISLNK(mode)) { cmd = BTRFS_SEND_C_SYMLINK; - else if (S_ISCHR(mode) || S_ISBLK(mode)) + } else if (S_ISCHR(mode) || S_ISBLK(mode)) { cmd = BTRFS_SEND_C_MKNOD; - else if (S_ISFIFO(mode)) + } else if (S_ISFIFO(mode)) { cmd = BTRFS_SEND_C_MKFIFO; - else if (S_ISSOCK(mode)) + } else if (S_ISSOCK(mode)) { cmd = BTRFS_SEND_C_MKSOCK; - else { + } else { printk(KERN_WARNING "btrfs: unexpected inode type %o", (int)(mode & S_IFMT)); ret = -ENOTSUPP; @@ -2335,22 +2415,22 @@ verbose_printk("btrfs: send_create_inode %llu\n", sctx->cur_ino); if (ret < 0) goto out; - ret = gen_unique_name(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + ret = gen_unique_name(sctx, ino, gen, p); if (ret < 0) goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); - TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, sctx->cur_ino); + TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino); if (S_ISLNK(mode)) { fs_path_reset(p); - ret = read_symlink(sctx, sctx->send_root, sctx->cur_ino, p); + ret = read_symlink(sctx, sctx->send_root, ino, p); if (ret < 0) goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p); } else if (S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) { - TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, btrfs_inode_rdev(eb, ii)); + TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, rdev); } ret = send_cmd(sctx); @@ -2364,6 +2444,92 @@ out: return ret; } +/* + * We need some special handling for inodes that get processed before the parent + * directory got created. See process_recorded_refs for details. + * This function does the check if we already created the dir out of order. + */ +static int did_create_dir(struct send_ctx *sctx, u64 dir) +{ + int ret = 0; + struct btrfs_path *path = NULL; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_key di_key; + struct extent_buffer *eb; + struct btrfs_dir_item *di; + int slot; + + path = alloc_path_for_send(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = dir; + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = 0; + while (1) { + ret = btrfs_search_slot_for_read(sctx->send_root, &key, path, + 1, 0); + if (ret < 0) + goto out; + if (!ret) { + eb = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(eb, &found_key, slot); + } + if (ret || found_key.objectid != key.objectid || + found_key.type != key.type) { + ret = 0; + goto out; + } + + di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(eb, di, &di_key); + + if (di_key.objectid < sctx->send_progress) { + ret = 1; + goto out; + } + + key.offset = found_key.offset + 1; + btrfs_release_path(path); + } + +out: + btrfs_free_path(path); + return ret; +} + +/* + * Only creates the inode if it is: + * 1. Not a directory + * 2. Or a directory which was not created already due to out of order + * directories. See did_create_dir and process_recorded_refs for details. + */ +static int send_create_inode_if_needed(struct send_ctx *sctx) +{ + int ret; + + if (S_ISDIR(sctx->cur_inode_mode)) { + ret = did_create_dir(sctx, sctx->cur_ino); + if (ret < 0) + goto out; + if (ret) { + ret = 0; + goto out; + } + } + + ret = send_create_inode(sctx, sctx->cur_ino); + if (ret < 0) + goto out; + +out: + return ret; +} + struct recorded_ref { struct list_head list; char *dir_path; @@ -2416,13 +2582,13 @@ static int record_ref(struct list_head *head, u64 dir, static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head) { struct recorded_ref *cur; - struct recorded_ref *tmp; - list_for_each_entry_safe(cur, tmp, head, list) { + while (!list_empty(head)) { + cur = list_entry(head->next, struct recorded_ref, list); fs_path_free(sctx, cur->full_path); + list_del(&cur->list); kfree(cur); } - INIT_LIST_HEAD(head); } static void free_recorded_refs(struct send_ctx *sctx) @@ -2432,7 +2598,7 @@ static void free_recorded_refs(struct send_ctx *sctx) } /* - * Renames/moves a file/dir to it's orphan name. Used when the first + * Renames/moves a file/dir to its orphan name. Used when the first * ref of an unprocessed inode gets overwritten and for all non empty * directories. */ @@ -2472,6 +2638,12 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress) struct btrfs_key loc; struct btrfs_dir_item *di; + /* + * Don't try to rmdir the top/root subvolume dir. + */ + if (dir == BTRFS_FIRST_FREE_OBJECTID) + return 0; + path = alloc_path_for_send(); if (!path) return -ENOMEM; @@ -2513,160 +2685,6 @@ out: return ret; } -struct finish_unordered_dir_ctx { - struct send_ctx *sctx; - struct fs_path *cur_path; - struct fs_path *dir_path; - u64 dir_ino; - int need_delete; - int delete_pass; -}; - -int __finish_unordered_dir(int num, struct btrfs_key *di_key, - const char *name, int name_len, - const char *data, int data_len, - u8 type, void *ctx) -{ - int ret = 0; - struct finish_unordered_dir_ctx *fctx = ctx; - struct send_ctx *sctx = fctx->sctx; - u64 di_gen; - u64 di_mode; - int is_orphan = 0; - - if (di_key->objectid >= fctx->dir_ino) - goto out; - - fs_path_reset(fctx->cur_path); - - ret = get_inode_info(sctx->send_root, di_key->objectid, - NULL, &di_gen, &di_mode, NULL, NULL); - if (ret < 0) - goto out; - - ret = is_first_ref(sctx, sctx->send_root, di_key->objectid, - fctx->dir_ino, name, name_len); - if (ret < 0) - goto out; - if (ret) { - is_orphan = 1; - ret = gen_unique_name(sctx, di_key->objectid, di_gen, - fctx->cur_path); - } else { - ret = get_cur_path(sctx, di_key->objectid, di_gen, - fctx->cur_path); - } - if (ret < 0) - goto out; - - ret = fs_path_add(fctx->dir_path, name, name_len); - if (ret < 0) - goto out; - - if (!fctx->delete_pass) { - if (S_ISDIR(di_mode)) { - ret = send_rename(sctx, fctx->cur_path, - fctx->dir_path); - } else { - ret = send_link(sctx, fctx->dir_path, - fctx->cur_path); - if (is_orphan) - fctx->need_delete = 1; - } - } else if (!S_ISDIR(di_mode)) { - ret = send_unlink(sctx, fctx->cur_path); - } else { - ret = 0; - } - - fs_path_remove(fctx->dir_path); - -out: - return ret; -} - -/* - * Go through all dir items and see if we find refs which could not be created - * in the past because the dir did not exist at that time. - */ -static int finish_outoforder_dir(struct send_ctx *sctx, u64 dir, u64 dir_gen) -{ - int ret = 0; - struct btrfs_path *path = NULL; - struct btrfs_key key; - struct btrfs_key found_key; - struct extent_buffer *eb; - struct finish_unordered_dir_ctx fctx; - int slot; - - path = alloc_path_for_send(); - if (!path) { - ret = -ENOMEM; - goto out; - } - - memset(&fctx, 0, sizeof(fctx)); - fctx.sctx = sctx; - fctx.cur_path = fs_path_alloc(sctx); - fctx.dir_path = fs_path_alloc(sctx); - if (!fctx.cur_path || !fctx.dir_path) { - ret = -ENOMEM; - goto out; - } - fctx.dir_ino = dir; - - ret = get_cur_path(sctx, dir, dir_gen, fctx.dir_path); - if (ret < 0) - goto out; - - /* - * We do two passes. The first links in the new refs and the second - * deletes orphans if required. Deletion of orphans is not required for - * directory inodes, as we always have only one ref and use rename - * instead of link for those. - */ - -again: - key.objectid = dir; - key.type = BTRFS_DIR_ITEM_KEY; - key.offset = 0; - while (1) { - ret = btrfs_search_slot_for_read(sctx->send_root, &key, path, - 1, 0); - if (ret < 0) - goto out; - eb = path->nodes[0]; - slot = path->slots[0]; - btrfs_item_key_to_cpu(eb, &found_key, slot); - - if (found_key.objectid != key.objectid || - found_key.type != key.type) { - btrfs_release_path(path); - break; - } - - ret = iterate_dir_item(sctx, sctx->send_root, path, - &found_key, __finish_unordered_dir, - &fctx); - if (ret < 0) - goto out; - - key.offset = found_key.offset + 1; - btrfs_release_path(path); - } - - if (!fctx.delete_pass && fctx.need_delete) { - fctx.delete_pass = 1; - goto again; - } - -out: - btrfs_free_path(path); - fs_path_free(sctx, fctx.cur_path); - fs_path_free(sctx, fctx.dir_path); - return ret; -} - /* * This does all the move/link/unlink/rmdir magic. */ @@ -2674,6 +2692,7 @@ static int process_recorded_refs(struct send_ctx *sctx) { int ret = 0; struct recorded_ref *cur; + struct recorded_ref *cur2; struct ulist *check_dirs = NULL; struct ulist_iterator uit; struct ulist_node *un; @@ -2685,6 +2704,12 @@ static int process_recorded_refs(struct send_ctx *sctx) verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); + /* + * This should never happen as the root dir always has the same ref + * which is always '..' + */ + BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID); + valid_path = fs_path_alloc(sctx); if (!valid_path) { ret = -ENOMEM; @@ -2731,6 +2756,46 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); list_for_each_entry(cur, &sctx->new_refs, list) { /* + * We may have refs where the parent directory does not exist + * yet. This happens if the parent directories inum is higher + * the the current inum. To handle this case, we create the + * parent directory out of order. But we need to check if this + * did already happen before due to other refs in the same dir. + */ + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); + if (ret < 0) + goto out; + if (ret == inode_state_will_create) { + ret = 0; + /* + * First check if any of the current inodes refs did + * already create the dir. + */ + list_for_each_entry(cur2, &sctx->new_refs, list) { + if (cur == cur2) + break; + if (cur2->dir == cur->dir) { + ret = 1; + break; + } + } + + /* + * If that did not happen, check if a previous inode + * did already create the dir. + */ + if (!ret) + ret = did_create_dir(sctx, cur->dir); + if (ret < 0) + goto out; + if (!ret) { + ret = send_create_inode(sctx, cur->dir); + if (ret < 0) + goto out; + } + } + + /* * Check if this new ref would overwrite the first ref of * another unprocessed inode. If yes, orphanize the * overwritten inode. If we find an overwritten ref that is @@ -2764,7 +2829,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); * inode, move it and update valid_path. If not, link or move * it depending on the inode mode. */ - if (is_orphan && !sctx->cur_inode_first_ref_orphan) { + if (is_orphan) { ret = send_rename(sctx, valid_path, cur->full_path); if (ret < 0) goto out; @@ -2827,6 +2892,17 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); if (ret < 0) goto out; } + } else if (S_ISDIR(sctx->cur_inode_mode) && + !list_empty(&sctx->deleted_refs)) { + /* + * We have a moved dir. Add the old parent to check_dirs + */ + cur = list_entry(sctx->deleted_refs.next, struct recorded_ref, + list); + ret = ulist_add(check_dirs, cur->dir, cur->dir_gen, + GFP_NOFS); + if (ret < 0) + goto out; } else if (!S_ISDIR(sctx->cur_inode_mode)) { /* * We have a non dir inode. Go through all deleted refs and @@ -2840,35 +2916,9 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); if (ret < 0) goto out; if (!ret) { - /* - * In case the inode was moved to a directory - * that was not created yet (see - * __record_new_ref), we can not unlink the ref - * as it will be needed later when the parent - * directory is created, so that we can move in - * the inode to the new dir. - */ - if (!is_orphan && - sctx->cur_inode_first_ref_orphan) { - ret = orphanize_inode(sctx, - sctx->cur_ino, - sctx->cur_inode_gen, - cur->full_path); - if (ret < 0) - goto out; - ret = gen_unique_name(sctx, - sctx->cur_ino, - sctx->cur_inode_gen, - valid_path); - if (ret < 0) - goto out; - is_orphan = 1; - - } else { - ret = send_unlink(sctx, cur->full_path); - if (ret < 0) - goto out; - } + ret = send_unlink(sctx, cur->full_path); + if (ret < 0) + goto out; } ret = ulist_add(check_dirs, cur->dir, cur->dir_gen, GFP_NOFS); @@ -2880,12 +2930,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); * If the inode is still orphan, unlink the orphan. This may * happen when a previous inode did overwrite the first ref * of this inode and no new refs were added for the current - * inode. - * We can however not delete the orphan in case the inode relies - * in a directory that was not created yet (see - * __record_new_ref) + * inode. Unlinking does not mean that the inode is deleted in + * all cases. There may still be links to this inode in other + * places. */ - if (is_orphan && !sctx->cur_inode_first_ref_orphan) { + if (is_orphan) { ret = send_unlink(sctx, valid_path); if (ret < 0) goto out; @@ -2900,6 +2949,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); */ ULIST_ITER_INIT(&uit); while ((un = ulist_next(check_dirs, &uit))) { + /* + * In case we had refs into dirs that were not processed yet, + * we don't need to do the utime and rmdir logic for these dirs. + * The dir will be processed later. + */ if (un->val > sctx->cur_ino) continue; @@ -2929,25 +2983,6 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); } } - /* - * Current inode is now at it's new position, so we must increase - * send_progress - */ - sctx->send_progress = sctx->cur_ino + 1; - - /* - * We may have a directory here that has pending refs which could not - * be created before (because the dir did not exist before, see - * __record_new_ref). finish_outoforder_dir will link/move the pending - * refs. - */ - if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_new) { - ret = finish_outoforder_dir(sctx, sctx->cur_ino, - sctx->cur_inode_gen); - if (ret < 0) - goto out; - } - ret = 0; out: @@ -2971,34 +3006,9 @@ static int __record_new_ref(int num, u64 dir, int index, return -ENOMEM; ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL, - NULL); - if (ret < 0) - goto out; - - /* - * The parent may be non-existent at this point in time. This happens - * if the ino of the parent dir is higher then the current ino. In this - * case, we can not process this ref until the parent dir is finally - * created. If we reach the parent dir later, process_recorded_refs - * will go through all dir items and process the refs that could not be - * processed before. In case this is the first ref, we set - * cur_inode_first_ref_orphan to 1 to inform process_recorded_refs to - * keep an orphan of the inode so that it later can be used for - * link/move - */ - ret = is_inode_existent(sctx, dir, gen); + NULL, NULL); if (ret < 0) goto out; - if (!ret) { - ret = is_first_ref(sctx, sctx->send_root, sctx->cur_ino, dir, - name->start, fs_path_len(name)); - if (ret < 0) - goto out; - if (ret) - sctx->cur_inode_first_ref_orphan = 1; - ret = 0; - goto out; - } ret = get_cur_path(sctx, dir, gen, p); if (ret < 0) @@ -3029,7 +3039,7 @@ static int __record_deleted_ref(int num, u64 dir, int index, return -ENOMEM; ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL, - NULL); + NULL, NULL); if (ret < 0) goto out; @@ -3206,33 +3216,28 @@ static int process_all_refs(struct send_ctx *sctx, key.offset = 0; while (1) { ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); - if (ret < 0) { - btrfs_release_path(path); + if (ret < 0) goto out; - } - if (ret) { - btrfs_release_path(path); + if (ret) break; - } eb = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(eb, &found_key, slot); if (found_key.objectid != key.objectid || - found_key.type != key.type) { - btrfs_release_path(path); + found_key.type != key.type) break; - } - ret = iterate_inode_ref(sctx, sctx->parent_root, path, - &found_key, 0, cb, sctx); + ret = iterate_inode_ref(sctx, root, path, &found_key, 0, cb, + sctx); btrfs_release_path(path); if (ret < 0) goto out; key.offset = found_key.offset + 1; } + btrfs_release_path(path); ret = process_recorded_refs(sctx); @@ -3555,7 +3560,7 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) int ret = 0; struct fs_path *p; loff_t pos = offset; - int readed = 0; + int num_read = 0; mm_segment_t old_fs; p = fs_path_alloc(sctx); @@ -3580,8 +3585,8 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len); ret = vfs_read(sctx->cur_inode_filp, sctx->read_buf, len, &pos); if (ret < 0) goto out; - readed = ret; - if (!readed) + num_read = ret; + if (!num_read) goto out; ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); @@ -3594,7 +3599,7 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len); TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); - TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, readed); + TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read); ret = send_cmd(sctx); @@ -3604,7 +3609,7 @@ out: set_fs(old_fs); if (ret < 0) return ret; - return readed; + return num_read; } /* @@ -3615,7 +3620,6 @@ static int send_clone(struct send_ctx *sctx, struct clone_root *clone_root) { int ret = 0; - struct btrfs_root *clone_root2 = clone_root->root; struct fs_path *p; u64 gen; @@ -3640,22 +3644,23 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, " TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len); TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); - if (clone_root2 == sctx->send_root) { + if (clone_root->root == sctx->send_root) { ret = get_inode_info(sctx->send_root, clone_root->ino, NULL, - &gen, NULL, NULL, NULL); + &gen, NULL, NULL, NULL, NULL); if (ret < 0) goto out; ret = get_cur_path(sctx, clone_root->ino, gen, p); } else { - ret = get_inode_path(sctx, clone_root2, clone_root->ino, p); + ret = get_inode_path(sctx, clone_root->root, + clone_root->ino, p); } if (ret < 0) goto out; TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, - clone_root2->root_item.uuid); + clone_root->root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, - clone_root2->root_item.ctransid); + clone_root->root->root_item.ctransid); TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET, clone_root->offset); @@ -3684,10 +3689,17 @@ static int send_write_or_clone(struct send_ctx *sctx, ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_file_extent_item); type = btrfs_file_extent_type(path->nodes[0], ei); - if (type == BTRFS_FILE_EXTENT_INLINE) + if (type == BTRFS_FILE_EXTENT_INLINE) { len = btrfs_file_extent_inline_len(path->nodes[0], ei); - else + /* + * it is possible the inline item won't cover the whole page, + * but there may be items after this page. Make + * sure to send the whole thing + */ + len = PAGE_CACHE_ALIGN(len); + } else { len = btrfs_file_extent_num_bytes(path->nodes[0], ei); + } if (offset + len > sctx->cur_inode_size) len = sctx->cur_inode_size - offset; @@ -3735,6 +3747,8 @@ static int is_extent_unchanged(struct send_ctx *sctx, u64 left_offset_fixed; u64 left_len; u64 right_len; + u64 left_gen; + u64 right_gen; u8 left_type; u8 right_type; @@ -3744,17 +3758,17 @@ static int is_extent_unchanged(struct send_ctx *sctx, eb = left_path->nodes[0]; slot = left_path->slots[0]; - ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); left_type = btrfs_file_extent_type(eb, ei); - left_disknr = btrfs_file_extent_disk_bytenr(eb, ei); - left_len = btrfs_file_extent_num_bytes(eb, ei); - left_offset = btrfs_file_extent_offset(eb, ei); if (left_type != BTRFS_FILE_EXTENT_REG) { ret = 0; goto out; } + left_disknr = btrfs_file_extent_disk_bytenr(eb, ei); + left_len = btrfs_file_extent_num_bytes(eb, ei); + left_offset = btrfs_file_extent_offset(eb, ei); + left_gen = btrfs_file_extent_generation(eb, ei); /* * Following comments will refer to these graphics. L is the left @@ -3810,6 +3824,7 @@ static int is_extent_unchanged(struct send_ctx *sctx, right_disknr = btrfs_file_extent_disk_bytenr(eb, ei); right_len = btrfs_file_extent_num_bytes(eb, ei); right_offset = btrfs_file_extent_offset(eb, ei); + right_gen = btrfs_file_extent_generation(eb, ei); if (right_type != BTRFS_FILE_EXTENT_REG) { ret = 0; @@ -3820,7 +3835,7 @@ static int is_extent_unchanged(struct send_ctx *sctx, * Are we at extent 8? If yes, we know the extent is changed. * This may only happen on the first iteration. */ - if (found_key.offset + right_len < ekey->offset) { + if (found_key.offset + right_len <= ekey->offset) { ret = 0; goto out; } @@ -3837,8 +3852,9 @@ static int is_extent_unchanged(struct send_ctx *sctx, /* * Check if we have the same extent. */ - if (left_disknr + left_offset_fixed != - right_disknr + right_offset) { + if (left_disknr != right_disknr || + left_offset_fixed != right_offset || + left_gen != right_gen) { ret = 0; goto out; } @@ -3977,6 +3993,15 @@ static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end) goto out; ret = process_recorded_refs(sctx); + if (ret < 0) + goto out; + + /* + * We have processed the refs and thus need to advance send_progress. + * Now, calls to get_cur_xxx will take the updated refs of the current + * inode into account. + */ + sctx->send_progress = sctx->cur_ino + 1; out: return ret; @@ -4004,7 +4029,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) goto out; ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL, - &left_mode, &left_uid, &left_gid); + &left_mode, &left_uid, &left_gid, NULL); if (ret < 0) goto out; @@ -4015,7 +4040,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) } else { ret = get_inode_info(sctx->parent_root, sctx->cur_ino, NULL, NULL, &right_mode, &right_uid, - &right_gid); + &right_gid, NULL); if (ret < 0) goto out; @@ -4074,7 +4099,12 @@ static int changed_inode(struct send_ctx *sctx, sctx->cur_ino = key->objectid; sctx->cur_inode_new_gen = 0; - sctx->cur_inode_first_ref_orphan = 0; + + /* + * Set send_progress to current inode. This will tell all get_cur_xxx + * functions that the current inode's refs are not updated yet. Later, + * when process_recorded_refs is finished, it is set to cur_ino + 1. + */ sctx->send_progress = sctx->cur_ino; if (result == BTRFS_COMPARE_TREE_NEW || @@ -4098,7 +4128,14 @@ static int changed_inode(struct send_ctx *sctx, right_gen = btrfs_inode_generation(sctx->right_path->nodes[0], right_ii); - if (left_gen != right_gen) + + /* + * The cur_ino = root dir case is special here. We can't treat + * the inode as deleted+reused because it would generate a + * stream that tries to delete/mkdir the root dir. + */ + if (left_gen != right_gen && + sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) sctx->cur_inode_new_gen = 1; } @@ -4111,8 +4148,7 @@ static int changed_inode(struct send_ctx *sctx, sctx->cur_inode_mode = btrfs_inode_mode( sctx->left_path->nodes[0], left_ii); if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) - ret = send_create_inode(sctx, sctx->left_path, - sctx->cmp_key); + ret = send_create_inode_if_needed(sctx); } else if (result == BTRFS_COMPARE_TREE_DELETED) { sctx->cur_inode_gen = right_gen; sctx->cur_inode_new = 0; @@ -4122,7 +4158,17 @@ static int changed_inode(struct send_ctx *sctx, sctx->cur_inode_mode = btrfs_inode_mode( sctx->right_path->nodes[0], right_ii); } else if (result == BTRFS_COMPARE_TREE_CHANGED) { + /* + * We need to do some special handling in case the inode was + * reported as changed with a changed generation number. This + * means that the original inode was deleted and new inode + * reused the same inum. So we have to treat the old inode as + * deleted and the new one as new. + */ if (sctx->cur_inode_new_gen) { + /* + * First, process the inode as if it was deleted. + */ sctx->cur_inode_gen = right_gen; sctx->cur_inode_new = 0; sctx->cur_inode_deleted = 1; @@ -4135,6 +4181,9 @@ static int changed_inode(struct send_ctx *sctx, if (ret < 0) goto out; + /* + * Now process the inode as if it was new. + */ sctx->cur_inode_gen = left_gen; sctx->cur_inode_new = 1; sctx->cur_inode_deleted = 0; @@ -4142,14 +4191,23 @@ static int changed_inode(struct send_ctx *sctx, sctx->left_path->nodes[0], left_ii); sctx->cur_inode_mode = btrfs_inode_mode( sctx->left_path->nodes[0], left_ii); - ret = send_create_inode(sctx, sctx->left_path, - sctx->cmp_key); + ret = send_create_inode_if_needed(sctx); if (ret < 0) goto out; ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW); if (ret < 0) goto out; + /* + * Advance send_progress now as we did not get into + * process_recorded_refs_if_needed in the new_gen case. + */ + sctx->send_progress = sctx->cur_ino + 1; + + /* + * Now process all extents and xattrs of the inode as if + * they were all new. + */ ret = process_all_extents(sctx); if (ret < 0) goto out; @@ -4172,6 +4230,16 @@ out: return ret; } +/* + * We have to process new refs before deleted refs, but compare_trees gives us + * the new and deleted refs mixed. To fix this, we record the new/deleted refs + * first and later process them in process_recorded_refs. + * For the cur_inode_new_gen case, we skip recording completely because + * changed_inode did already initiate processing of refs. The reason for this is + * that in this case, compare_tree actually compares the refs of 2 different + * inodes. To fix this, process_all_refs is used in changed_inode to handle all + * refs of the right tree as deleted and all refs of the left tree as new. + */ static int changed_ref(struct send_ctx *sctx, enum btrfs_compare_tree_result result) { @@ -4192,6 +4260,11 @@ static int changed_ref(struct send_ctx *sctx, return ret; } +/* + * Process new/deleted/changed xattrs. We skip processing in the + * cur_inode_new_gen case because changed_inode did already initiate processing + * of xattrs. The reason is the same as in changed_ref + */ static int changed_xattr(struct send_ctx *sctx, enum btrfs_compare_tree_result result) { @@ -4211,6 +4284,11 @@ static int changed_xattr(struct send_ctx *sctx, return ret; } +/* + * Process new/deleted/changed extents. We skip processing in the + * cur_inode_new_gen case because changed_inode did already initiate processing + * of extents. The reason is the same as in changed_ref + */ static int changed_extent(struct send_ctx *sctx, enum btrfs_compare_tree_result result) { @@ -4227,7 +4305,10 @@ static int changed_extent(struct send_ctx *sctx, return ret; } - +/* + * Updates compare related fields in sctx and simply forwards to the actual + * changed_xxx functions. + */ static int changed_cb(struct btrfs_root *left_root, struct btrfs_root *right_root, struct btrfs_path *left_path, @@ -4247,6 +4328,11 @@ static int changed_cb(struct btrfs_root *left_root, if (ret < 0) goto out; + /* Ignore non-FS objects */ + if (key->objectid == BTRFS_FREE_INO_OBJECTID || + key->objectid == BTRFS_FREE_SPACE_OBJECTID) + goto out; + if (key->type == BTRFS_INODE_ITEM_KEY) ret = changed_inode(sctx, result); else if (key->type == BTRFS_INODE_REF_KEY) @@ -4299,7 +4385,8 @@ join_trans: } /* - * Make sure the tree has not changed + * Make sure the tree has not changed after re-joining. We detect this + * by comparing start_ctransid and ctransid. They should always match. */ spin_lock(&send_root->root_times_lock); ctransid = btrfs_root_ctransid(&send_root->root_item); diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 9934e94..1bf4f32 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -130,4 +130,5 @@ enum { #ifdef __KERNEL__ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg); +int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off); #endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 83d6f9f..915ac14 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -243,12 +243,18 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *function, unsigned int line, int errno) { - WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted"); + WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted\n"); trans->aborted = errno; /* Nothing used. The other threads that have joined this * transaction may be able to continue. */ if (!trans->blocks_used) { - btrfs_printk(root->fs_info, "Aborting unused transaction.\n"); + char nbuf[16]; + const char *errstr; + + errstr = btrfs_decode_error(root->fs_info, errno, nbuf); + btrfs_printk(root->fs_info, + "%s:%d: Aborting unused transaction(%s).\n", + function, line, errstr); return; } trans->transaction->aborted = errno; @@ -407,7 +413,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, NODATASUM); break; case Opt_nodatacow: - printk(KERN_INFO "btrfs: setting nodatacow\n"); + if (!btrfs_test_opt(root, COMPRESS) || + !btrfs_test_opt(root, FORCE_COMPRESS)) { + printk(KERN_INFO "btrfs: setting nodatacow, compression disabled\n"); + } else { + printk(KERN_INFO "btrfs: setting nodatacow\n"); + } + info->compress_type = BTRFS_COMPRESS_NONE; + btrfs_clear_opt(info->mount_opt, COMPRESS); + btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); btrfs_set_opt(info->mount_opt, NODATACOW); btrfs_set_opt(info->mount_opt, NODATASUM); break; @@ -422,10 +436,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) compress_type = "zlib"; info->compress_type = BTRFS_COMPRESS_ZLIB; btrfs_set_opt(info->mount_opt, COMPRESS); + btrfs_clear_opt(info->mount_opt, NODATACOW); + btrfs_clear_opt(info->mount_opt, NODATASUM); } else if (strcmp(args[0].from, "lzo") == 0) { compress_type = "lzo"; info->compress_type = BTRFS_COMPRESS_LZO; btrfs_set_opt(info->mount_opt, COMPRESS); + btrfs_clear_opt(info->mount_opt, NODATACOW); + btrfs_clear_opt(info->mount_opt, NODATASUM); btrfs_set_fs_incompat(info, COMPRESS_LZO); } else if (strncmp(args[0].from, "no", 2) == 0) { compress_type = "no"; @@ -543,11 +561,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG); break; case Opt_defrag: - printk(KERN_INFO "btrfs: enabling auto defrag"); + printk(KERN_INFO "btrfs: enabling auto defrag\n"); btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); break; case Opt_recovery: - printk(KERN_INFO "btrfs: enabling auto recovery"); + printk(KERN_INFO "btrfs: enabling auto recovery\n"); btrfs_set_opt(info->mount_opt, RECOVERY); break; case Opt_skip_balance: @@ -846,18 +864,15 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return 0; } - btrfs_wait_ordered_extents(root, 0, 0); - - spin_lock(&fs_info->trans_lock); - if (!fs_info->running_transaction) { - spin_unlock(&fs_info->trans_lock); - return 0; - } - spin_unlock(&fs_info->trans_lock); + btrfs_wait_ordered_extents(root, 0); - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) + trans = btrfs_attach_transaction(root); + if (IS_ERR(trans)) { + /* no transaction, don't bother */ + if (PTR_ERR(trans) == -ENOENT) + return 0; return PTR_ERR(trans); + } return btrfs_commit_transaction(trans, root); } @@ -1508,17 +1523,21 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, static int btrfs_freeze(struct super_block *sb) { - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - mutex_lock(&fs_info->transaction_kthread_mutex); - mutex_lock(&fs_info->cleaner_mutex); - return 0; + struct btrfs_trans_handle *trans; + struct btrfs_root *root = btrfs_sb(sb)->tree_root; + + trans = btrfs_attach_transaction(root); + if (IS_ERR(trans)) { + /* no transaction, don't bother */ + if (PTR_ERR(trans) == -ENOENT) + return 0; + return PTR_ERR(trans); + } + return btrfs_commit_transaction(trans, root); } static int btrfs_unfreeze(struct super_block *sb) { - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - mutex_unlock(&fs_info->cleaner_mutex); - mutex_unlock(&fs_info->transaction_kthread_mutex); return 0; } @@ -1595,7 +1614,7 @@ static int btrfs_interface_init(void) static void btrfs_interface_exit(void) { if (misc_deregister(&btrfs_misc) < 0) - printk(KERN_INFO "misc_deregister failed for control device"); + printk(KERN_INFO "btrfs: misc_deregister failed for control device\n"); } static int __init init_btrfs_fs(void) @@ -1620,10 +1639,14 @@ static int __init init_btrfs_fs(void) if (err) goto free_extent_io; - err = btrfs_delayed_inode_init(); + err = ordered_data_init(); if (err) goto free_extent_map; + err = btrfs_delayed_inode_init(); + if (err) + goto free_ordered_data; + err = btrfs_interface_init(); if (err) goto free_delayed_inode; @@ -1641,6 +1664,8 @@ unregister_ioctl: btrfs_interface_exit(); free_delayed_inode: btrfs_delayed_inode_exit(); +free_ordered_data: + ordered_data_exit(); free_extent_map: extent_map_exit(); free_extent_io: @@ -1657,6 +1682,7 @@ static void __exit exit_btrfs_fs(void) { btrfs_destroy_cachep(); btrfs_delayed_inode_exit(); + ordered_data_exit(); extent_map_exit(); extent_io_exit(); btrfs_interface_exit(); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 27c2600..77db875 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -53,7 +53,7 @@ static noinline void switch_commit_root(struct btrfs_root *root) /* * either allocate a new transaction or hop into the existing one */ -static noinline int join_transaction(struct btrfs_root *root, int nofail) +static noinline int join_transaction(struct btrfs_root *root, int type) { struct btrfs_transaction *cur_trans; struct btrfs_fs_info *fs_info = root->fs_info; @@ -67,7 +67,13 @@ loop: } if (fs_info->trans_no_join) { - if (!nofail) { + /* + * If we are JOIN_NOLOCK we're already committing a current + * transaction, we just need a handle to deal with something + * when committing the transaction, such as inode cache and + * space cache. It is a special case. + */ + if (type != TRANS_JOIN_NOLOCK) { spin_unlock(&fs_info->trans_lock); return -EBUSY; } @@ -87,6 +93,13 @@ loop: } spin_unlock(&fs_info->trans_lock); + /* + * If we are ATTACH, we just want to catch the current transaction, + * and commit it. If there is no transaction, just return ENOENT. + */ + if (type == TRANS_ATTACH) + return -ENOENT; + cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); if (!cur_trans) return -ENOMEM; @@ -267,13 +280,6 @@ static void wait_current_trans(struct btrfs_root *root) } } -enum btrfs_trans_type { - TRANS_START, - TRANS_JOIN, - TRANS_USERSPACE, - TRANS_JOIN_NOLOCK, -}; - static int may_wait_transaction(struct btrfs_root *root, int type) { if (root->fs_info->log_root_recovering) @@ -290,7 +296,8 @@ static int may_wait_transaction(struct btrfs_root *root, int type) } static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, - u64 num_items, int type) + u64 num_items, int type, + int noflush) { struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; @@ -324,9 +331,14 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, } num_bytes = btrfs_calc_trans_metadata_size(root, num_items); - ret = btrfs_block_rsv_add(root, - &root->fs_info->trans_block_rsv, - num_bytes); + if (noflush) + ret = btrfs_block_rsv_add_noflush(root, + &root->fs_info->trans_block_rsv, + num_bytes); + else + ret = btrfs_block_rsv_add(root, + &root->fs_info->trans_block_rsv, + num_bytes); if (ret) return ERR_PTR(ret); } @@ -335,19 +347,34 @@ again: if (!h) return ERR_PTR(-ENOMEM); - sb_start_intwrite(root->fs_info->sb); + /* + * If we are JOIN_NOLOCK we're already committing a transaction and + * waiting on this guy, so we don't need to do the sb_start_intwrite + * because we're already holding a ref. We need this because we could + * have raced in and did an fsync() on a file which can kick a commit + * and then we deadlock with somebody doing a freeze. + * + * If we are ATTACH, it means we just want to catch the current + * transaction and commit it, so we needn't do sb_start_intwrite(). + */ + if (type < TRANS_JOIN_NOLOCK) + sb_start_intwrite(root->fs_info->sb); if (may_wait_transaction(root, type)) wait_current_trans(root); do { - ret = join_transaction(root, type == TRANS_JOIN_NOLOCK); + ret = join_transaction(root, type); if (ret == -EBUSY) wait_current_trans(root); } while (ret == -EBUSY); if (ret < 0) { - sb_end_intwrite(root->fs_info->sb); + /* We must get the transaction if we are JOIN_NOLOCK. */ + BUG_ON(type == TRANS_JOIN_NOLOCK); + + if (type < TRANS_JOIN_NOLOCK) + sb_end_intwrite(root->fs_info->sb); kmem_cache_free(btrfs_trans_handle_cachep, h); return ERR_PTR(ret); } @@ -367,7 +394,9 @@ again: h->aborted = 0; h->qgroup_reserved = qgroup_reserved; h->delayed_ref_elem.seq = 0; + h->type = type; INIT_LIST_HEAD(&h->qgroup_ref_list); + INIT_LIST_HEAD(&h->new_bgs); smp_mb(); if (cur_trans->blocked && may_wait_transaction(root, type)) { @@ -393,21 +422,33 @@ got_it: struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, int num_items) { - return start_transaction(root, num_items, TRANS_START); + return start_transaction(root, num_items, TRANS_START, 0); +} + +struct btrfs_trans_handle *btrfs_start_transaction_noflush( + struct btrfs_root *root, int num_items) +{ + return start_transaction(root, num_items, TRANS_START, 1); } + struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) { - return start_transaction(root, 0, TRANS_JOIN); + return start_transaction(root, 0, TRANS_JOIN, 0); } struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root) { - return start_transaction(root, 0, TRANS_JOIN_NOLOCK); + return start_transaction(root, 0, TRANS_JOIN_NOLOCK, 0); } struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root) { - return start_transaction(root, 0, TRANS_USERSPACE); + return start_transaction(root, 0, TRANS_USERSPACE, 0); +} + +struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) +{ + return start_transaction(root, 0, TRANS_ATTACH, 0); } /* wait for a transaction commit to be fully complete */ @@ -506,11 +547,12 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, } static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int throttle, int lock) + struct btrfs_root *root, int throttle) { struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_fs_info *info = root->fs_info; int count = 0; + int lock = (trans->type != TRANS_JOIN_NOLOCK); int err = 0; if (--trans->use_count) { @@ -536,6 +578,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, trans->qgroup_reserved = 0; } + if (!list_empty(&trans->new_bgs)) + btrfs_create_pending_block_groups(trans, root); + while (count < 2) { unsigned long cur = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; @@ -551,7 +596,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; - sb_end_intwrite(root->fs_info->sb); + if (!list_empty(&trans->new_bgs)) + btrfs_create_pending_block_groups(trans, root); if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && should_end_transaction(trans, root)) { @@ -573,6 +619,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, } } + if (trans->type < TRANS_JOIN_NOLOCK) + sb_end_intwrite(root->fs_info->sb); + WARN_ON(cur_trans != info->running_transaction); WARN_ON(atomic_read(&cur_trans->num_writers) < 1); atomic_dec(&cur_trans->num_writers); @@ -604,7 +653,7 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans, { int ret; - ret = __btrfs_end_transaction(trans, root, 0, 1); + ret = __btrfs_end_transaction(trans, root, 0); if (ret) return ret; return 0; @@ -615,18 +664,7 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, { int ret; - ret = __btrfs_end_transaction(trans, root, 1, 1); - if (ret) - return ret; - return 0; -} - -int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - int ret; - - ret = __btrfs_end_transaction(trans, root, 0, 0); + ret = __btrfs_end_transaction(trans, root, 1); if (ret) return ret; return 0; @@ -635,7 +673,7 @@ int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans, int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - return __btrfs_end_transaction(trans, root, 1, 1); + return __btrfs_end_transaction(trans, root, 1); } /* @@ -649,13 +687,15 @@ int btrfs_write_marked_extents(struct btrfs_root *root, int err = 0; int werr = 0; struct address_space *mapping = root->fs_info->btree_inode->i_mapping; + struct extent_state *cached_state = NULL; u64 start = 0; u64 end; while (!find_first_extent_bit(dirty_pages, start, &start, &end, - mark)) { - convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark, - GFP_NOFS); + mark, &cached_state)) { + convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, + mark, &cached_state, GFP_NOFS); + cached_state = NULL; err = filemap_fdatawrite_range(mapping, start, end); if (err) werr = err; @@ -679,12 +719,14 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, int err = 0; int werr = 0; struct address_space *mapping = root->fs_info->btree_inode->i_mapping; + struct extent_state *cached_state = NULL; u64 start = 0; u64 end; while (!find_first_extent_bit(dirty_pages, start, &start, &end, - EXTENT_NEED_WAIT)) { - clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS); + EXTENT_NEED_WAIT, &cached_state)) { + clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, + 0, 0, &cached_state, GFP_NOFS); err = filemap_fdatawait_range(mapping, start, end); if (err) werr = err; @@ -955,6 +997,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *parent_root; struct btrfs_block_rsv *rsv; struct inode *parent_inode; + struct btrfs_path *path; + struct btrfs_dir_item *dir_item; struct dentry *parent; struct dentry *dentry; struct extent_buffer *tmp; @@ -967,18 +1011,22 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, u64 root_flags; uuid_le new_uuid; - rsv = trans->block_rsv; + path = btrfs_alloc_path(); + if (!path) { + ret = pending->error = -ENOMEM; + goto path_alloc_fail; + } new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); if (!new_root_item) { ret = pending->error = -ENOMEM; - goto fail; + goto root_item_alloc_fail; } ret = btrfs_find_free_objectid(tree_root, &objectid); if (ret) { pending->error = ret; - goto fail; + goto no_free_objectid; } btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); @@ -988,22 +1036,22 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, to_reserve); if (ret) { pending->error = ret; - goto fail; + goto no_free_objectid; } } ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid, objectid, pending->inherit); - kfree(pending->inherit); if (ret) { pending->error = ret; - goto fail; + goto no_free_objectid; } key.objectid = objectid; key.offset = (u64)-1; key.type = BTRFS_ROOT_ITEM_KEY; + rsv = trans->block_rsv; trans->block_rsv = &pending->block_rsv; dentry = pending->dentry; @@ -1017,24 +1065,21 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, */ ret = btrfs_set_inode_index(parent_inode, &index); BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_insert_dir_item(trans, parent_root, - dentry->d_name.name, dentry->d_name.len, - parent_inode, &key, - BTRFS_FT_DIR, index); - if (ret == -EEXIST) { + + /* check if there is a file/dir which has the same name. */ + dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, + btrfs_ino(parent_inode), + dentry->d_name.name, + dentry->d_name.len, 0); + if (dir_item != NULL && !IS_ERR(dir_item)) { pending->error = -EEXIST; - dput(parent); goto fail; - } else if (ret) { - goto abort_trans_dput; + } else if (IS_ERR(dir_item)) { + ret = PTR_ERR(dir_item); + btrfs_abort_transaction(trans, root, ret); + goto fail; } - - btrfs_i_size_write(parent_inode, parent_inode->i_size + - dentry->d_name.len * 2); - parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; - ret = btrfs_update_inode(trans, parent_root, parent_inode); - if (ret) - goto abort_trans_dput; + btrfs_release_path(path); /* * pull in the delayed directory update @@ -1043,8 +1088,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, * snapshot */ ret = btrfs_run_delayed_items(trans, root); - if (ret) { /* Transaction aborted */ - dput(parent); + if (ret) { /* Transaction aborted */ + btrfs_abort_transaction(trans, root, ret); goto fail; } @@ -1079,7 +1124,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret) { btrfs_tree_unlock(old); free_extent_buffer(old); - goto abort_trans_dput; + btrfs_abort_transaction(trans, root, ret); + goto fail; } btrfs_set_lock_blocking(old); @@ -1088,8 +1134,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, /* clean up in any case */ btrfs_tree_unlock(old); free_extent_buffer(old); - if (ret) - goto abort_trans_dput; + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } /* see comments in should_cow_block() */ root->force_cow = 1; @@ -1101,8 +1149,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_insert_root(trans, tree_root, &key, new_root_item); btrfs_tree_unlock(tmp); free_extent_buffer(tmp); - if (ret) - goto abort_trans_dput; + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } /* * insert root back/forward references @@ -1111,32 +1161,58 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, parent_root->root_key.objectid, btrfs_ino(parent_inode), index, dentry->d_name.name, dentry->d_name.len); - dput(parent); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, root, ret); goto fail; + } key.offset = (u64)-1; pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); if (IS_ERR(pending->snap)) { ret = PTR_ERR(pending->snap); - goto abort_trans; + btrfs_abort_transaction(trans, root, ret); + goto fail; } ret = btrfs_reloc_post_snapshot(trans, pending); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + ret = btrfs_insert_dir_item(trans, parent_root, + dentry->d_name.name, dentry->d_name.len, + parent_inode, &key, + BTRFS_FT_DIR, index); + /* We have check then name at the beginning, so it is impossible. */ + BUG_ON(ret == -EEXIST); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + btrfs_i_size_write(parent_inode, parent_inode->i_size + + dentry->d_name.len * 2); + parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode(trans, parent_root, parent_inode); if (ret) - goto abort_trans; - ret = 0; + btrfs_abort_transaction(trans, root, ret); fail: - kfree(new_root_item); + dput(parent); trans->block_rsv = rsv; +no_free_objectid: + kfree(new_root_item); +root_item_alloc_fail: + btrfs_free_path(path); +path_alloc_fail: btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1); return ret; - -abort_trans_dput: - dput(parent); -abort_trans: - btrfs_abort_transaction(trans, root, ret); - goto fail; } /* @@ -1229,6 +1305,16 @@ static void do_async_commit(struct work_struct *work) struct btrfs_async_commit *ac = container_of(work, struct btrfs_async_commit, work.work); + /* + * We've got freeze protection passed with the transaction. + * Tell lockdep about it. + */ + rwsem_acquire_read( + &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], + 0, 1, _THIS_IP_); + + current->journal_info = ac->newtrans; + btrfs_commit_transaction(ac->newtrans, ac->root); kfree(ac); } @@ -1258,6 +1344,14 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, atomic_inc(&cur_trans->use_count); btrfs_end_transaction(trans, root); + + /* + * Tell lockdep we've released the freeze rwsem, since the + * async commit thread will be the one to unlock it. + */ + rwsem_release(&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], + 1, _THIS_IP_); + schedule_delayed_work(&ac->work, 0); /* wait for transaction to start and unblock */ @@ -1348,6 +1442,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, */ cur_trans->delayed_refs.flushing = 1; + if (!list_empty(&trans->new_bgs)) + btrfs_create_pending_block_groups(trans, root); + ret = btrfs_run_delayed_refs(trans, root, 0); if (ret) goto cleanup_transaction; @@ -1403,7 +1500,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, if (flush_on_commit || snap_pending) { btrfs_start_delalloc_inodes(root, 1); - btrfs_wait_ordered_extents(root, 0, 1); + btrfs_wait_ordered_extents(root, 1); } ret = btrfs_run_delayed_items(trans, root); @@ -1456,13 +1553,28 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, */ mutex_lock(&root->fs_info->reloc_mutex); - ret = btrfs_run_delayed_items(trans, root); + /* + * We needn't worry about the delayed items because we will + * deal with them in create_pending_snapshot(), which is the + * core function of the snapshot creation. + */ + ret = create_pending_snapshots(trans, root->fs_info); if (ret) { mutex_unlock(&root->fs_info->reloc_mutex); goto cleanup_transaction; } - ret = create_pending_snapshots(trans, root->fs_info); + /* + * We insert the dir indexes of the snapshots and update the inode + * of the snapshots' parents after the snapshot creation, so there + * are some delayed items which are not dealt with. Now deal with + * them. + * + * We needn't worry that this operation will corrupt the snapshots, + * because all the tree which are snapshoted will be forced to COW + * the nodes and leaves. + */ + ret = btrfs_run_delayed_items(trans, root); if (ret) { mutex_unlock(&root->fs_info->reloc_mutex); goto cleanup_transaction; @@ -1584,7 +1696,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, put_transaction(cur_trans); put_transaction(cur_trans); - sb_end_intwrite(root->fs_info->sb); + if (trans->type < TRANS_JOIN_NOLOCK) + sb_end_intwrite(root->fs_info->sb); trace_btrfs_transaction_commit(root); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index e8b8416..8096194 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -47,6 +47,14 @@ struct btrfs_transaction { int aborted; }; +enum btrfs_trans_type { + TRANS_START, + TRANS_JOIN, + TRANS_USERSPACE, + TRANS_JOIN_NOLOCK, + TRANS_ATTACH, +}; + struct btrfs_trans_handle { u64 transid; u64 bytes_reserved; @@ -58,8 +66,9 @@ struct btrfs_trans_handle { struct btrfs_transaction *transaction; struct btrfs_block_rsv *block_rsv; struct btrfs_block_rsv *orig_rsv; - int aborted; - int adding_csums; + short aborted; + short adding_csums; + enum btrfs_trans_type type; /* * this root is only needed to validate that the root passed to * start_transaction is the same as the one passed to end_transaction. @@ -68,6 +77,7 @@ struct btrfs_trans_handle { struct btrfs_root *root; struct seq_list delayed_ref_elem; struct list_head qgroup_ref_list; + struct list_head new_bgs; }; struct btrfs_pending_snapshot { @@ -88,16 +98,18 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, { BTRFS_I(inode)->last_trans = trans->transaction->transid; BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; + BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; } int btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans, - struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, int num_items); +struct btrfs_trans_handle *btrfs_start_transaction_noflush( + struct btrfs_root *root, int num_items); struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c86670f..e9ebb47 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -18,13 +18,16 @@ #include <linux/sched.h> #include <linux/slab.h> +#include <linux/list_sort.h> #include "ctree.h" #include "transaction.h" #include "disk-io.h" #include "locking.h" #include "print-tree.h" +#include "backref.h" #include "compat.h" #include "tree-log.h" +#include "hash.h" /* magic values for the inode_only field in btrfs_log_inode: * @@ -146,7 +149,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans, root->log_multiple_pids = true; } - root->log_batch++; + atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); mutex_unlock(&root->log_mutex); return 0; @@ -165,7 +168,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans, err = ret; } mutex_unlock(&root->fs_info->tree_log_mutex); - root->log_batch++; + atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); mutex_unlock(&root->log_mutex); return err; @@ -484,7 +487,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, int found_type; u64 mask = root->sectorsize - 1; u64 extent_end; - u64 alloc_hint; u64 start = key->offset; u64 saved_nbytes; struct btrfs_file_extent_item *item; @@ -550,8 +552,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, saved_nbytes = inode_get_bytes(inode); /* drop any overlapping extents */ - ret = btrfs_drop_extents(trans, inode, start, extent_end, - &alloc_hint, 1); + ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1); BUG_ON(ret); if (found_type == BTRFS_FILE_EXTENT_REG || @@ -744,6 +745,7 @@ out: */ static noinline int backref_in_log(struct btrfs_root *log, struct btrfs_key *key, + u64 ref_objectid, char *name, int namelen) { struct btrfs_path *path; @@ -764,8 +766,17 @@ static noinline int backref_in_log(struct btrfs_root *log, if (ret != 0) goto out; - item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + + if (key->type == BTRFS_INODE_EXTREF_KEY) { + if (btrfs_find_name_in_ext_backref(path, ref_objectid, + name, namelen, NULL)) + match = 1; + + goto out; + } + + item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); ptr_end = ptr + item_size; while (ptr < ptr_end) { ref = (struct btrfs_inode_ref *)ptr; @@ -786,91 +797,42 @@ out: return match; } - -/* - * replay one inode back reference item found in the log tree. - * eb, slot and key refer to the buffer and key found in the log tree. - * root is the destination we are replaying into, and path is for temp - * use by this function. (it should be released on return). - */ -static noinline int add_inode_ref(struct btrfs_trans_handle *trans, +static inline int __add_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_root *log, struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) + struct btrfs_root *log_root, + struct inode *dir, struct inode *inode, + struct extent_buffer *eb, + u64 inode_objectid, u64 parent_objectid, + u64 ref_index, char *name, int namelen, + int *search_done) { - struct btrfs_inode_ref *ref; - struct btrfs_dir_item *di; - struct inode *dir; - struct inode *inode; - unsigned long ref_ptr; - unsigned long ref_end; - char *name; - int namelen; int ret; - int search_done = 0; - - /* - * it is possible that we didn't log all the parent directories - * for a given inode. If we don't find the dir, just don't - * copy the back ref in. The link count fixup code will take - * care of the rest - */ - dir = read_one_inode(root, key->offset); - if (!dir) - return -ENOENT; - - inode = read_one_inode(root, key->objectid); - if (!inode) { - iput(dir); - return -EIO; - } - - ref_ptr = btrfs_item_ptr_offset(eb, slot); - ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); + char *victim_name; + int victim_name_len; + struct extent_buffer *leaf; + struct btrfs_dir_item *di; + struct btrfs_key search_key; + struct btrfs_inode_extref *extref; again: - ref = (struct btrfs_inode_ref *)ref_ptr; - - namelen = btrfs_inode_ref_name_len(eb, ref); - name = kmalloc(namelen, GFP_NOFS); - BUG_ON(!name); - - read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen); - - /* if we already have a perfect match, we're done */ - if (inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), - btrfs_inode_ref_index(eb, ref), - name, namelen)) { - goto out; - } - - /* - * look for a conflicting back reference in the metadata. - * if we find one we have to unlink that name of the file - * before we add our new link. Later on, we overwrite any - * existing back reference, and we don't want to create - * dangling pointers in the directory. - */ - - if (search_done) - goto insert; - - ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + /* Search old style refs */ + search_key.objectid = inode_objectid; + search_key.type = BTRFS_INODE_REF_KEY; + search_key.offset = parent_objectid; + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret == 0) { - char *victim_name; - int victim_name_len; struct btrfs_inode_ref *victim_ref; unsigned long ptr; unsigned long ptr_end; - struct extent_buffer *leaf = path->nodes[0]; + + leaf = path->nodes[0]; /* are we trying to overwrite a back ref for the root directory * if so, just jump out, we're done */ - if (key->objectid == key->offset) - goto out_nowrite; + if (search_key.objectid == search_key.offset) + return 1; /* check all the names in this back reference to see * if they are in the log. if so, we allow them to stay @@ -889,7 +851,9 @@ again: (unsigned long)(victim_ref + 1), victim_name_len); - if (!backref_in_log(log, key, victim_name, + if (!backref_in_log(log_root, &search_key, + parent_objectid, + victim_name, victim_name_len)) { btrfs_inc_nlink(inode); btrfs_release_path(path); @@ -897,9 +861,14 @@ again: ret = btrfs_unlink_inode(trans, root, dir, inode, victim_name, victim_name_len); + BUG_ON(ret); btrfs_run_delayed_items(trans, root); + kfree(victim_name); + *search_done = 1; + goto again; } kfree(victim_name); + ptr = (unsigned long)(victim_ref + 1) + victim_name_len; } BUG_ON(ret); @@ -908,14 +877,78 @@ again: * NOTE: we have searched root tree and checked the * coresponding ref, it does not need to check again. */ - search_done = 1; + *search_done = 1; + } + btrfs_release_path(path); + + /* Same search but for extended refs */ + extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen, + inode_objectid, parent_objectid, 0, + 0); + if (!IS_ERR_OR_NULL(extref)) { + u32 item_size; + u32 cur_offset = 0; + unsigned long base; + struct inode *victim_parent; + + leaf = path->nodes[0]; + + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + base = btrfs_item_ptr_offset(leaf, path->slots[0]); + + while (cur_offset < item_size) { + extref = (struct btrfs_inode_extref *)base + cur_offset; + + victim_name_len = btrfs_inode_extref_name_len(leaf, extref); + + if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) + goto next; + + victim_name = kmalloc(victim_name_len, GFP_NOFS); + read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name, + victim_name_len); + + search_key.objectid = inode_objectid; + search_key.type = BTRFS_INODE_EXTREF_KEY; + search_key.offset = btrfs_extref_hash(parent_objectid, + victim_name, + victim_name_len); + ret = 0; + if (!backref_in_log(log_root, &search_key, + parent_objectid, victim_name, + victim_name_len)) { + ret = -ENOENT; + victim_parent = read_one_inode(root, + parent_objectid); + if (victim_parent) { + btrfs_inc_nlink(inode); + btrfs_release_path(path); + + ret = btrfs_unlink_inode(trans, root, + victim_parent, + inode, + victim_name, + victim_name_len); + btrfs_run_delayed_items(trans, root); + } + BUG_ON(ret); + iput(victim_parent); + kfree(victim_name); + *search_done = 1; + goto again; + } + kfree(victim_name); + BUG_ON(ret); +next: + cur_offset += victim_name_len + sizeof(*extref); + } + *search_done = 1; } btrfs_release_path(path); /* look for a conflicting sequence number */ di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), - btrfs_inode_ref_index(eb, ref), - name, namelen, 0); + ref_index, name, namelen, 0); if (di && !IS_ERR(di)) { ret = drop_one_dir_item(trans, root, path, dir, di); BUG_ON(ret); @@ -931,25 +964,173 @@ again: } btrfs_release_path(path); -insert: - /* insert our name */ - ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, - btrfs_inode_ref_index(eb, ref)); - BUG_ON(ret); + return 0; +} - btrfs_update_inode(trans, root, inode); +static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, + u32 *namelen, char **name, u64 *index, + u64 *parent_objectid) +{ + struct btrfs_inode_extref *extref; -out: - ref_ptr = (unsigned long)(ref + 1) + namelen; - kfree(name); - if (ref_ptr < ref_end) - goto again; + extref = (struct btrfs_inode_extref *)ref_ptr; + + *namelen = btrfs_inode_extref_name_len(eb, extref); + *name = kmalloc(*namelen, GFP_NOFS); + if (*name == NULL) + return -ENOMEM; + + read_extent_buffer(eb, *name, (unsigned long)&extref->name, + *namelen); + + *index = btrfs_inode_extref_index(eb, extref); + if (parent_objectid) + *parent_objectid = btrfs_inode_extref_parent(eb, extref); + + return 0; +} + +static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, + u32 *namelen, char **name, u64 *index) +{ + struct btrfs_inode_ref *ref; + + ref = (struct btrfs_inode_ref *)ref_ptr; + + *namelen = btrfs_inode_ref_name_len(eb, ref); + *name = kmalloc(*namelen, GFP_NOFS); + if (*name == NULL) + return -ENOMEM; + + read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen); + + *index = btrfs_inode_ref_index(eb, ref); + + return 0; +} + +/* + * replay one inode back reference item found in the log tree. + * eb, slot and key refer to the buffer and key found in the log tree. + * root is the destination we are replaying into, and path is for temp + * use by this function. (it should be released on return). + */ +static noinline int add_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + struct inode *dir; + struct inode *inode; + unsigned long ref_ptr; + unsigned long ref_end; + char *name; + int namelen; + int ret; + int search_done = 0; + int log_ref_ver = 0; + u64 parent_objectid; + u64 inode_objectid; + u64 ref_index = 0; + int ref_struct_size; + + ref_ptr = btrfs_item_ptr_offset(eb, slot); + ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); + + if (key->type == BTRFS_INODE_EXTREF_KEY) { + struct btrfs_inode_extref *r; + + ref_struct_size = sizeof(struct btrfs_inode_extref); + log_ref_ver = 1; + r = (struct btrfs_inode_extref *)ref_ptr; + parent_objectid = btrfs_inode_extref_parent(eb, r); + } else { + ref_struct_size = sizeof(struct btrfs_inode_ref); + parent_objectid = key->offset; + } + inode_objectid = key->objectid; + + /* + * it is possible that we didn't log all the parent directories + * for a given inode. If we don't find the dir, just don't + * copy the back ref in. The link count fixup code will take + * care of the rest + */ + dir = read_one_inode(root, parent_objectid); + if (!dir) + return -ENOENT; + + inode = read_one_inode(root, inode_objectid); + if (!inode) { + iput(dir); + return -EIO; + } + + while (ref_ptr < ref_end) { + if (log_ref_ver) { + ret = extref_get_fields(eb, ref_ptr, &namelen, &name, + &ref_index, &parent_objectid); + /* + * parent object can change from one array + * item to another. + */ + if (!dir) + dir = read_one_inode(root, parent_objectid); + if (!dir) + return -ENOENT; + } else { + ret = ref_get_fields(eb, ref_ptr, &namelen, &name, + &ref_index); + } + if (ret) + return ret; + + /* if we already have a perfect match, we're done */ + if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), + ref_index, name, namelen)) { + /* + * look for a conflicting back reference in the + * metadata. if we find one we have to unlink that name + * of the file before we add our new link. Later on, we + * overwrite any existing back reference, and we don't + * want to create dangling pointers in the directory. + */ + + if (!search_done) { + ret = __add_inode_ref(trans, root, path, log, + dir, inode, eb, + inode_objectid, + parent_objectid, + ref_index, name, namelen, + &search_done); + if (ret == 1) + goto out; + BUG_ON(ret); + } + + /* insert our name */ + ret = btrfs_add_link(trans, dir, inode, name, namelen, + 0, ref_index); + BUG_ON(ret); + + btrfs_update_inode(trans, root, inode); + } + + ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; + kfree(name); + if (log_ref_ver) { + iput(dir); + dir = NULL; + } + } /* finally write the back reference in the inode */ ret = overwrite_item(trans, root, path, eb, slot, key); BUG_ON(ret); -out_nowrite: +out: btrfs_release_path(path); iput(dir); iput(inode); @@ -966,25 +1147,55 @@ static int insert_orphan_item(struct btrfs_trans_handle *trans, return ret; } +static int count_inode_extrefs(struct btrfs_root *root, + struct inode *inode, struct btrfs_path *path) +{ + int ret = 0; + int name_len; + unsigned int nlink = 0; + u32 item_size; + u32 cur_offset = 0; + u64 inode_objectid = btrfs_ino(inode); + u64 offset = 0; + unsigned long ptr; + struct btrfs_inode_extref *extref; + struct extent_buffer *leaf; -/* - * There are a few corners where the link count of the file can't - * be properly maintained during replay. So, instead of adding - * lots of complexity to the log code, we just scan the backrefs - * for any file that has been through replay. - * - * The scan will update the link count on the inode to reflect the - * number of back refs found. If it goes down to zero, the iput - * will free the inode. - */ -static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode) + while (1) { + ret = btrfs_find_one_extref(root, inode_objectid, offset, path, + &extref, &offset); + if (ret) + break; + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + + while (cur_offset < item_size) { + extref = (struct btrfs_inode_extref *) (ptr + cur_offset); + name_len = btrfs_inode_extref_name_len(leaf, extref); + + nlink++; + + cur_offset += name_len + sizeof(*extref); + } + + offset++; + btrfs_release_path(path); + } + btrfs_release_path(path); + + if (ret < 0) + return ret; + return nlink; +} + +static int count_inode_refs(struct btrfs_root *root, + struct inode *inode, struct btrfs_path *path) { - struct btrfs_path *path; int ret; struct btrfs_key key; - u64 nlink = 0; + unsigned int nlink = 0; unsigned long ptr; unsigned long ptr_end; int name_len; @@ -994,10 +1205,6 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, key.type = BTRFS_INODE_REF_KEY; key.offset = (u64)-1; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -1031,6 +1238,50 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, btrfs_release_path(path); } btrfs_release_path(path); + + return nlink; +} + +/* + * There are a few corners where the link count of the file can't + * be properly maintained during replay. So, instead of adding + * lots of complexity to the log code, we just scan the backrefs + * for any file that has been through replay. + * + * The scan will update the link count on the inode to reflect the + * number of back refs found. If it goes down to zero, the iput + * will free the inode. + */ +static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode) +{ + struct btrfs_path *path; + int ret; + u64 nlink = 0; + u64 ino = btrfs_ino(inode); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = count_inode_refs(root, inode, path); + if (ret < 0) + goto out; + + nlink = ret; + + ret = count_inode_extrefs(root, inode, path); + if (ret == -ENOENT) + ret = 0; + + if (ret < 0) + goto out; + + nlink += ret; + + ret = 0; + if (nlink != inode->i_nlink) { set_nlink(inode, nlink); btrfs_update_inode(trans, root, inode); @@ -1046,9 +1297,10 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, ret = insert_orphan_item(trans, root, ino); BUG_ON(ret); } - btrfs_free_path(path); - return 0; +out: + btrfs_free_path(path); + return ret; } static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, @@ -1695,6 +1947,10 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, ret = add_inode_ref(wc->trans, root, log, path, eb, i, &key); BUG_ON(ret && ret != -ENOENT); + } else if (key.type == BTRFS_INODE_EXTREF_KEY) { + ret = add_inode_ref(wc->trans, root, log, path, + eb, i, &key); + BUG_ON(ret && ret != -ENOENT); } else if (key.type == BTRFS_EXTENT_DATA_KEY) { ret = replay_one_extent(wc->trans, root, path, eb, i, &key); @@ -2037,7 +2293,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (atomic_read(&root->log_commit[(index1 + 1) % 2])) wait_log_commit(trans, root, root->log_transid - 1); while (1) { - unsigned long batch = root->log_batch; + int batch = atomic_read(&root->log_batch); /* when we're on an ssd, just kick the log commit out */ if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) { mutex_unlock(&root->log_mutex); @@ -2045,7 +2301,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_lock(&root->log_mutex); } wait_for_writer(trans, root); - if (batch == root->log_batch) + if (batch == atomic_read(&root->log_batch)) break; } @@ -2074,7 +2330,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_set_root_node(&log->root_item, log->node); - root->log_batch = 0; root->log_transid++; log->log_transid = root->log_transid; root->log_start_pid = 0; @@ -2087,7 +2342,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&root->log_mutex); mutex_lock(&log_root_tree->log_mutex); - log_root_tree->log_batch++; + atomic_inc(&log_root_tree->log_batch); atomic_inc(&log_root_tree->log_writers); mutex_unlock(&log_root_tree->log_mutex); @@ -2157,7 +2412,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_set_super_log_root_level(root->fs_info->super_for_commit, btrfs_header_level(log_root_tree->node)); - log_root_tree->log_batch = 0; log_root_tree->log_transid++; smp_mb(); @@ -2171,9 +2425,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * in and cause problems either. */ btrfs_scrub_pause_super(root); - write_ctree_super(trans, root->fs_info->tree_root, 1); + ret = write_ctree_super(trans, root->fs_info->tree_root, 1); btrfs_scrub_continue_super(root); - ret = 0; + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_wake_log_root; + } mutex_lock(&root->log_mutex); if (root->last_log_commit < log_transid) @@ -2209,7 +2466,8 @@ static void free_log_tree(struct btrfs_trans_handle *trans, while (1) { ret = find_first_extent_bit(&log->dirty_log_pages, - 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW); + 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW, + NULL); if (ret) break; @@ -2646,6 +2904,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, int ret; struct btrfs_key key; struct btrfs_key found_key; + int start_slot; key.objectid = objectid; key.type = max_key_type; @@ -2667,8 +2926,18 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, if (found_key.objectid != objectid) break; - ret = btrfs_del_item(trans, log, path); - if (ret) + found_key.offset = 0; + found_key.type = 0; + ret = btrfs_bin_search(path->nodes[0], &found_key, 0, + &start_slot); + + ret = btrfs_del_items(trans, log, path, start_slot, + path->slots[0] - start_slot + 1); + /* + * If start slot isn't 0 then we don't need to re-search, we've + * found the last guy with the objectid in this tree. + */ + if (ret || start_slot != 0) break; btrfs_release_path(path); } @@ -2678,14 +2947,64 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, return ret; } +static void fill_inode_item(struct btrfs_trans_handle *trans, + struct extent_buffer *leaf, + struct btrfs_inode_item *item, + struct inode *inode, int log_inode_only) +{ + btrfs_set_inode_uid(leaf, item, inode->i_uid); + btrfs_set_inode_gid(leaf, item, inode->i_gid); + btrfs_set_inode_mode(leaf, item, inode->i_mode); + btrfs_set_inode_nlink(leaf, item, inode->i_nlink); + + btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_sec); + btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_nsec); + + btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_sec); + btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_nsec); + + btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_sec); + btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_nsec); + + btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); + + btrfs_set_inode_sequence(leaf, item, inode->i_version); + btrfs_set_inode_transid(leaf, item, trans->transid); + btrfs_set_inode_rdev(leaf, item, inode->i_rdev); + btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); + btrfs_set_inode_block_group(leaf, item, 0); + + if (log_inode_only) { + /* set the generation to zero so the recover code + * can tell the difference between an logging + * just to say 'this inode exists' and a logging + * to say 'update this inode with these values' + */ + btrfs_set_inode_generation(leaf, item, 0); + btrfs_set_inode_size(leaf, item, 0); + } else { + btrfs_set_inode_generation(leaf, item, + BTRFS_I(inode)->generation); + btrfs_set_inode_size(leaf, item, inode->i_size); + } + +} + static noinline int copy_items(struct btrfs_trans_handle *trans, - struct btrfs_root *log, + struct inode *inode, struct btrfs_path *dst_path, struct extent_buffer *src, int start_slot, int nr, int inode_only) { unsigned long src_offset; unsigned long dst_offset; + struct btrfs_root *log = BTRFS_I(inode)->root->log_root; struct btrfs_file_extent_item *extent; struct btrfs_inode_item *inode_item; int ret; @@ -2694,6 +3013,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, char *ins_data; int i; struct list_head ordered_sums; + int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; INIT_LIST_HEAD(&ordered_sums); @@ -2722,29 +3042,23 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, src_offset = btrfs_item_ptr_offset(src, start_slot + i); - copy_extent_buffer(dst_path->nodes[0], src, dst_offset, - src_offset, ins_sizes[i]); - - if (inode_only == LOG_INODE_EXISTS && - ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { + if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_path->slots[0], struct btrfs_inode_item); - btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0); - - /* set the generation to zero so the recover code - * can tell the difference between an logging - * just to say 'this inode exists' and a logging - * to say 'update this inode with these values' - */ - btrfs_set_inode_generation(dst_path->nodes[0], - inode_item, 0); + fill_inode_item(trans, dst_path->nodes[0], inode_item, + inode, inode_only == LOG_INODE_EXISTS); + } else { + copy_extent_buffer(dst_path->nodes[0], src, dst_offset, + src_offset, ins_sizes[i]); } + /* take a reference on file data extents so that truncates * or deletes of this inode don't have to relog the inode * again */ - if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) { + if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY && + !skip_csum) { int found_type; extent = btrfs_item_ptr(src, start_slot + i, struct btrfs_file_extent_item); @@ -2753,8 +3067,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, continue; found_type = btrfs_file_extent_type(src, extent); - if (found_type == BTRFS_FILE_EXTENT_REG || - found_type == BTRFS_FILE_EXTENT_PREALLOC) { + if (found_type == BTRFS_FILE_EXTENT_REG) { u64 ds, dl, cs, cl; ds = btrfs_file_extent_disk_bytenr(src, extent); @@ -2803,6 +3116,239 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, return ret; } +static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct extent_map *em1, *em2; + + em1 = list_entry(a, struct extent_map, list); + em2 = list_entry(b, struct extent_map, list); + + if (em1->start < em2->start) + return -1; + else if (em1->start > em2->start) + return 1; + return 0; +} + +struct log_args { + struct extent_buffer *src; + u64 next_offset; + int start_slot; + int nr; +}; + +static int log_one_extent(struct btrfs_trans_handle *trans, + struct inode *inode, struct btrfs_root *root, + struct extent_map *em, struct btrfs_path *path, + struct btrfs_path *dst_path, struct log_args *args) +{ + struct btrfs_root *log = root->log_root; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 start = em->mod_start; + u64 search_start = start; + u64 len = em->mod_len; + u64 num_bytes; + int nritems; + int ret; + + if (BTRFS_I(inode)->logged_trans == trans->transid) { + ret = __btrfs_drop_extents(trans, log, inode, dst_path, start, + start + len, NULL, 0); + if (ret) + return ret; + } + + while (len) { + if (args->nr) + goto next_slot; +again: + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = search_start; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + + if (ret) { + /* + * A rare case were we can have an em for a section of a + * larger extent so we need to make sure that this em + * falls within the extent we've found. If not we just + * bail and go back to ye-olde way of doing things but + * it happens often enough in testing that we need to do + * this dance to make sure. + */ + do { + if (path->slots[0] == 0) { + btrfs_release_path(path); + if (search_start == 0) + return -ENOENT; + search_start--; + goto again; + } + + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid != btrfs_ino(inode) || + key.type != BTRFS_EXTENT_DATA_KEY) { + btrfs_release_path(path); + return -ENOENT; + } + } while (key.offset > start); + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + num_bytes = btrfs_file_extent_num_bytes(path->nodes[0], + fi); + if (key.offset + num_bytes <= start) { + btrfs_release_path(path); + return -ENOENT; + } + } + args->src = path->nodes[0]; +next_slot: + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + fi = btrfs_item_ptr(args->src, path->slots[0], + struct btrfs_file_extent_item); + if (args->nr && + args->start_slot + args->nr == path->slots[0]) { + args->nr++; + } else if (args->nr) { + ret = copy_items(trans, inode, dst_path, args->src, + args->start_slot, args->nr, + LOG_INODE_ALL); + if (ret) + return ret; + args->nr = 1; + args->start_slot = path->slots[0]; + } else if (!args->nr) { + args->nr = 1; + args->start_slot = path->slots[0]; + } + nritems = btrfs_header_nritems(path->nodes[0]); + path->slots[0]++; + num_bytes = btrfs_file_extent_num_bytes(args->src, fi); + if (len < num_bytes) { + /* I _think_ this is ok, envision we write to a + * preallocated space that is adjacent to a previously + * written preallocated space that gets merged when we + * mark this preallocated space written. If we do not + * have the adjacent extent in cache then when we copy + * this extent it could end up being larger than our EM + * thinks it is, which is a-ok, so just set len to 0. + */ + len = 0; + } else { + len -= num_bytes; + } + start = key.offset + num_bytes; + args->next_offset = start; + search_start = start; + + if (path->slots[0] < nritems) { + if (len) + goto next_slot; + break; + } + + if (args->nr) { + ret = copy_items(trans, inode, dst_path, args->src, + args->start_slot, args->nr, + LOG_INODE_ALL); + if (ret) + return ret; + args->nr = 0; + btrfs_release_path(path); + } + } + + return 0; +} + +static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path) +{ + struct log_args args; + struct extent_map *em, *n; + struct list_head extents; + struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; + u64 test_gen; + int ret = 0; + + INIT_LIST_HEAD(&extents); + + memset(&args, 0, sizeof(args)); + + write_lock(&tree->lock); + test_gen = root->fs_info->last_trans_committed; + + list_for_each_entry_safe(em, n, &tree->modified_extents, list) { + list_del_init(&em->list); + if (em->generation <= test_gen) + continue; + /* Need a ref to keep it from getting evicted from cache */ + atomic_inc(&em->refs); + set_bit(EXTENT_FLAG_LOGGING, &em->flags); + list_add_tail(&em->list, &extents); + } + + list_sort(NULL, &extents, extent_cmp); + + while (!list_empty(&extents)) { + em = list_entry(extents.next, struct extent_map, list); + + list_del_init(&em->list); + clear_bit(EXTENT_FLAG_LOGGING, &em->flags); + + /* + * If we had an error we just need to delete everybody from our + * private list. + */ + if (ret) { + free_extent_map(em); + continue; + } + + write_unlock(&tree->lock); + + /* + * If the previous EM and the last extent we left off on aren't + * sequential then we need to copy the items we have and redo + * our search + */ + if (args.nr && em->mod_start != args.next_offset) { + ret = copy_items(trans, inode, dst_path, args.src, + args.start_slot, args.nr, + LOG_INODE_ALL); + if (ret) { + free_extent_map(em); + write_lock(&tree->lock); + continue; + } + btrfs_release_path(path); + args.nr = 0; + } + + ret = log_one_extent(trans, inode, root, em, path, dst_path, &args); + free_extent_map(em); + write_lock(&tree->lock); + } + WARN_ON(!list_empty(&extents)); + write_unlock(&tree->lock); + + if (!ret && args.nr) + ret = copy_items(trans, inode, dst_path, args.src, + args.start_slot, args.nr, LOG_INODE_ALL); + btrfs_release_path(path); + return ret; +} + /* log a single inode in the tree log. * At least one parent directory for this inode must exist in the tree * or be logged already. @@ -2832,6 +3378,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, int nritems; int ins_start_slot = 0; int ins_nr; + bool fast_search = false; u64 ino = btrfs_ino(inode); log = root->log_root; @@ -2851,21 +3398,23 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, max_key.objectid = ino; - /* today the code can only do partial logging of directories */ - if (!S_ISDIR(inode->i_mode)) - inode_only = LOG_INODE_ALL; + /* today the code can only do partial logging of directories */ if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) max_key.type = BTRFS_XATTR_ITEM_KEY; else max_key.type = (u8)-1; max_key.offset = (u64)-1; - ret = btrfs_commit_inode_delayed_items(trans, inode); - if (ret) { - btrfs_free_path(path); - btrfs_free_path(dst_path); - return ret; + /* Only run delayed items if we are a dir or a new file */ + if (S_ISDIR(inode->i_mode) || + BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) { + ret = btrfs_commit_inode_delayed_items(trans, inode); + if (ret) { + btrfs_free_path(path); + btrfs_free_path(dst_path); + return ret; + } } mutex_lock(&BTRFS_I(inode)->log_mutex); @@ -2881,7 +3430,16 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, max_key_type = BTRFS_XATTR_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, max_key_type); } else { - ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); + if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags)) { + ret = btrfs_truncate_inode_items(trans, log, + inode, 0, 0); + } else { + fast_search = true; + max_key.type = BTRFS_XATTR_ITEM_KEY; + ret = drop_objectid_items(trans, log, path, ino, + BTRFS_XATTR_ITEM_KEY); + } } if (ret) { err = ret; @@ -2912,7 +3470,7 @@ again: goto next_slot; } - ret = copy_items(trans, log, dst_path, src, ins_start_slot, + ret = copy_items(trans, inode, dst_path, src, ins_start_slot, ins_nr, inode_only); if (ret) { err = ret; @@ -2930,7 +3488,7 @@ next_slot: goto again; } if (ins_nr) { - ret = copy_items(trans, log, dst_path, src, + ret = copy_items(trans, inode, dst_path, src, ins_start_slot, ins_nr, inode_only); if (ret) { @@ -2951,8 +3509,7 @@ next_slot: break; } if (ins_nr) { - ret = copy_items(trans, log, dst_path, src, - ins_start_slot, + ret = copy_items(trans, inode, dst_path, src, ins_start_slot, ins_nr, inode_only); if (ret) { err = ret; @@ -2960,7 +3517,24 @@ next_slot: } ins_nr = 0; } - WARN_ON(ins_nr); + + if (fast_search) { + btrfs_release_path(path); + btrfs_release_path(dst_path); + ret = btrfs_log_changed_extents(trans, root, inode, path, + dst_path); + if (ret) { + err = ret; + goto out_unlock; + } + } else { + struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em, *n; + + list_for_each_entry_safe(em, n, &tree->modified_extents, list) + list_del_init(&em->list); + } + if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { btrfs_release_path(path); btrfs_release_path(dst_path); @@ -2971,6 +3545,7 @@ next_slot: } } BTRFS_I(inode)->logged_trans = trans->transid; + BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; out_unlock: mutex_unlock(&BTRFS_I(inode)->log_mutex); @@ -3138,7 +3713,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, end_trans: dput(old_parent); if (ret < 0) { - BUG_ON(ret != -ENOSPC); + WARN_ON(ret != -ENOSPC); root->fs_info->last_trans_log_full_commit = trans->transid; ret = 1; } diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index ab942f4..99be4c1 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -143,14 +143,13 @@ EXPORT_SYMBOL(ulist_free); * In case of allocation failure -ENOMEM is returned and the ulist stays * unaltered. */ -int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, - gfp_t gfp_mask) +int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask) { return ulist_add_merge(ulist, val, aux, NULL, gfp_mask); } -int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux, - unsigned long *old_aux, gfp_t gfp_mask) +int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, + u64 *old_aux, gfp_t gfp_mask) { int i; diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index 21bdc8e..21a1963 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -33,7 +33,7 @@ struct ulist_iterator { */ struct ulist_node { u64 val; /* value to store */ - unsigned long aux; /* auxiliary value saved along with the val */ + u64 aux; /* auxiliary value saved along with the val */ }; struct ulist { @@ -65,10 +65,9 @@ void ulist_fini(struct ulist *ulist); void ulist_reinit(struct ulist *ulist); struct ulist *ulist_alloc(gfp_t gfp_mask); void ulist_free(struct ulist *ulist); -int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, - gfp_t gfp_mask); -int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux, - unsigned long *old_aux, gfp_t gfp_mask); +int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask); +int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, + u64 *old_aux, gfp_t gfp_mask); struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 88b969a..029b903 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -639,7 +639,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, bdev = blkdev_get_by_path(device->name->str, flags, holder); if (IS_ERR(bdev)) { - printk(KERN_INFO "open %s failed\n", device->name->str); + printk(KERN_INFO "btrfs: open %s failed\n", device->name->str); goto error; } filemap_write_and_wait(bdev->bd_inode->i_mapping); @@ -1475,6 +1475,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) free_fs_devices(cur_devices); } + root->fs_info->num_tolerated_disk_barrier_failures = + btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info); + /* * at this point, the device is zero sized. We want to * remove it from the devices list and zero out the old super @@ -1775,15 +1778,21 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if (seeding_dev) { ret = init_first_rw_device(trans, root, device); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, root, ret); goto error_trans; + } ret = btrfs_finish_sprout(trans, root); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, root, ret); goto error_trans; + } } else { ret = btrfs_add_device(trans, root, device); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, root, ret); goto error_trans; + } } /* @@ -1793,6 +1802,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) btrfs_clear_space_info_full(root->fs_info); unlock_chunks(root); + root->fs_info->num_tolerated_disk_barrier_failures = + btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info); ret = btrfs_commit_transaction(trans, root); if (seeding_dev) { @@ -1814,7 +1825,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) error_trans: unlock_chunks(root); - btrfs_abort_transaction(trans, root, ret); btrfs_end_transaction(trans, root); rcu_string_free(device->name); kfree(device); @@ -2804,6 +2814,26 @@ int btrfs_balance(struct btrfs_balance_control *bctl, } } + if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { + int num_tolerated_disk_barrier_failures; + u64 target = bctl->sys.target; + + num_tolerated_disk_barrier_failures = + btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); + if (num_tolerated_disk_barrier_failures > 0 && + (target & + (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_AVAIL_ALLOC_BIT_SINGLE))) + num_tolerated_disk_barrier_failures = 0; + else if (num_tolerated_disk_barrier_failures > 1 && + (target & + (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))) + num_tolerated_disk_barrier_failures = 1; + + fs_info->num_tolerated_disk_barrier_failures = + num_tolerated_disk_barrier_failures; + } + ret = insert_balance_item(fs_info->tree_root, bctl); if (ret && ret != -EEXIST) goto out; @@ -2836,6 +2866,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl, __cancel_balance(fs_info); } + if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { + fs_info->num_tolerated_disk_barrier_failures = + btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); + } + wake_up(&fs_info->balance_wait_q); return ret; @@ -3608,12 +3643,16 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, &sys_chunk_size, &sys_stripe_size, sys_chunk_offset, alloc_profile); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } ret = btrfs_add_device(trans, fs_info->chunk_root, device); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } /* * Modifying chunk tree needs allocating new blocks from both @@ -3623,19 +3662,19 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, */ ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, chunk_size, stripe_size); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } ret = __finish_chunk_alloc(trans, extent_root, sys_map, sys_chunk_offset, sys_chunk_size, sys_stripe_size); if (ret) - goto abort; + btrfs_abort_transaction(trans, root, ret); - return 0; +out: -abort: - btrfs_abort_transaction(trans, root, ret); return ret; } @@ -3760,7 +3799,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, read_unlock(&em_tree->lock); if (!em) { - printk(KERN_CRIT "unable to find logical %llu len %llu\n", + printk(KERN_CRIT "btrfs: unable to find logical %llu len %llu\n", (unsigned long long)logical, (unsigned long long)*length); BUG(); @@ -4217,7 +4256,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, total_devs = bbio->num_stripes; if (map_length < length) { - printk(KERN_CRIT "mapping failed logical %llu bio len %llu " + printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu " "len %llu\n", (unsigned long long)logical, (unsigned long long)length, (unsigned long long)map_length); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 92c2065..9acb846 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -97,7 +97,7 @@ static int zlib_compress_pages(struct list_head *ws, *total_in = 0; if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { - printk(KERN_WARNING "deflateInit failed\n"); + printk(KERN_WARNING "btrfs: deflateInit failed\n"); ret = -1; goto out; } @@ -125,7 +125,7 @@ static int zlib_compress_pages(struct list_head *ws, while (workspace->def_strm.total_in < len) { ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); if (ret != Z_OK) { - printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", + printk(KERN_DEBUG "btrfs: deflate in loop returned %d\n", ret); zlib_deflateEnd(&workspace->def_strm); ret = -1; @@ -252,7 +252,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, } if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { - printk(KERN_WARNING "inflateInit failed\n"); + printk(KERN_WARNING "btrfs: inflateInit failed\n"); return -1; } while (workspace->inf_strm.total_in < srclen) { @@ -336,7 +336,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, } if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { - printk(KERN_WARNING "inflateInit failed\n"); + printk(KERN_WARNING "btrfs: inflateInit failed\n"); return -1; } diff --git a/fs/buffer.c b/fs/buffer.c index 58e2e7b..b5f0442 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2312,12 +2312,6 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, loff_t size; int ret; - /* - * Update file times before taking page lock. We may end up failing the - * fault so this update may be superfluous but who really cares... - */ - file_update_time(vma->vm_file); - lock_page(page); size = i_size_read(inode); if ((page->mapping != inode->i_mapping) || @@ -2355,6 +2349,13 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb; sb_start_pagefault(sb); + + /* + * Update file times before taking page lock. We may end up failing the + * fault so this update may be superfluous but who really cares... + */ + file_update_time(vma->vm_file); + ret = __block_page_mkwrite(vma, vmf, get_block); sb_end_pagefault(sb); return block_page_mkwrite_return(ret); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 452e71a..6690269 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -205,7 +205,7 @@ static int readpage_nounlock(struct file *filp, struct page *page) dout("readpage inode %p file %p page %p index %lu\n", inode, filp, page, page->index); err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, - page->index << PAGE_CACHE_SHIFT, &len, + (u64) page_offset(page), &len, ci->i_truncate_seq, ci->i_truncate_size, &page, 1, 0); if (err == -ENOENT) @@ -286,7 +286,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) int nr_pages = 0; int ret; - off = page->index << PAGE_CACHE_SHIFT; + off = (u64) page_offset(page); /* count pages */ next_index = page->index; @@ -308,8 +308,8 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) NULL, 0, ci->i_truncate_seq, ci->i_truncate_size, NULL, false, 1, 0); - if (!req) - return -ENOMEM; + if (IS_ERR(req)) + return PTR_ERR(req); /* build page vector */ nr_pages = len >> PAGE_CACHE_SHIFT; @@ -426,7 +426,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) struct ceph_inode_info *ci; struct ceph_fs_client *fsc; struct ceph_osd_client *osdc; - loff_t page_off = page->index << PAGE_CACHE_SHIFT; + loff_t page_off = page_offset(page); int len = PAGE_CACHE_SIZE; loff_t i_size; int err = 0; @@ -817,8 +817,7 @@ get_more_pages: /* ok */ if (locked_pages == 0) { /* prepare async write request */ - offset = (unsigned long long)page->index - << PAGE_CACHE_SHIFT; + offset = (u64) page_offset(page); len = wsize; req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, @@ -832,8 +831,8 @@ get_more_pages: ci->i_truncate_size, &inode->i_mtime, true, 1, 0); - if (!req) { - rc = -ENOMEM; + if (IS_ERR(req)) { + rc = PTR_ERR(req); unlock_page(page); break; } @@ -1180,7 +1179,7 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) struct inode *inode = vma->vm_file->f_dentry->d_inode; struct page *page = vmf->page; struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; - loff_t off = page->index << PAGE_CACHE_SHIFT; + loff_t off = page_offset(page); loff_t size, len; int ret; @@ -1225,6 +1224,7 @@ out: static struct vm_operations_struct ceph_vmops = { .fault = filemap_fault, .page_mkwrite = ceph_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; int ceph_mmap(struct file *file, struct vm_area_struct *vma) @@ -1235,6 +1235,5 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma) return -ENOEXEC; file_accessed(file); vma->vm_ops = &ceph_vmops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 620daad..3251e9c 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1005,7 +1005,7 @@ static void __queue_cap_release(struct ceph_mds_session *session, BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE); head = msg->front.iov_base; - head->num = cpu_to_le32(le32_to_cpu(head->num) + 1); + le32_add_cpu(&head->num, 1); item = msg->front.iov_base + msg->front.iov_len; item->ino = cpu_to_le64(ino); item->cap_id = cpu_to_le64(cap_id); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index ecebbc0..5840d2a 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -536,8 +536,8 @@ more: do_sync, ci->i_truncate_seq, ci->i_truncate_size, &mtime, false, 2, page_align); - if (!req) - return -ENOMEM; + if (IS_ERR(req)) + return PTR_ERR(req); if (file->f_flags & O_DIRECT) { pages = ceph_get_direct_page_vector(data, num_pages, false); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 4b5762e..ba95eea 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1104,7 +1104,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, pr_err("fill_trace bad get_inode " "%llx.%llx\n", vino.ino, vino.snap); err = PTR_ERR(in); - d_delete(dn); + d_drop(dn); goto done; } dn = splice_dentry(dn, in, &have_lease, true); @@ -1277,7 +1277,7 @@ retry_lookup: in = ceph_get_inode(parent->d_sb, vino); if (IS_ERR(in)) { dout("new_inode badness\n"); - d_delete(dn); + d_drop(dn); dput(dn); err = PTR_ERR(in); goto out; diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 1396ceb..36549a4 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -187,14 +187,18 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) u64 tmp; struct ceph_object_layout ol; struct ceph_pg pgid; + int r; /* copy and validate */ if (copy_from_user(&dl, arg, sizeof(dl))) return -EFAULT; down_read(&osdc->map_sem); - ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len, - &dl.object_no, &dl.object_offset, &olen); + r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len, + &dl.object_no, &dl.object_offset, + &olen); + if (r < 0) + return -EIO; dl.file_offset -= dl.object_offset; dl.object_size = ceph_file_layout_object_size(ci->i_layout); dl.block_size = ceph_file_layout_su(ci->i_layout); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index a5a7354..1bcf712 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2625,7 +2625,8 @@ static void check_new_map(struct ceph_mds_client *mdsc, ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "", session_state_name(s->s_state)); - if (memcmp(ceph_mdsmap_get_addr(oldmap, i), + if (i >= newmap->m_max_mds || + memcmp(ceph_mdsmap_get_addr(oldmap, i), ceph_mdsmap_get_addr(newmap, i), sizeof(struct ceph_entity_addr))) { if (s->s_state == CEPH_MDS_SESSION_OPENING) { diff --git a/fs/ceph/super.c b/fs/ceph/super.c index b982239..2eb43f2 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -307,7 +307,10 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, { struct ceph_mount_options *fsopt; const char *dev_name_end; - int err = -ENOMEM; + int err; + + if (!dev_name || !*dev_name) + return -EINVAL; fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL); if (!fsopt) @@ -328,21 +331,33 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; fsopt->congestion_kb = default_congestion_kb(); - /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ + /* + * Distinguish the server list from the path in "dev_name". + * Internally we do not include the leading '/' in the path. + * + * "dev_name" will look like: + * <server_spec>[,<server_spec>...]:[<path>] + * where + * <server_spec> is <ip>[:<port>] + * <path> is optional, but if present must begin with '/' + */ + dev_name_end = strchr(dev_name, '/'); + if (dev_name_end) { + /* skip over leading '/' for path */ + *path = dev_name_end + 1; + } else { + /* path is empty */ + dev_name_end = dev_name + strlen(dev_name); + *path = dev_name_end; + } err = -EINVAL; - if (!dev_name) - goto out; - *path = strstr(dev_name, ":/"); - if (*path == NULL) { - pr_err("device name is missing path (no :/ in %s)\n", + dev_name_end--; /* back up to ':' separator */ + if (*dev_name_end != ':') { + pr_err("device name is missing path (no : separator in %s)\n", dev_name); goto out; } - dev_name_end = *path; dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); - - /* path on server */ - *path += 2; dout("server path '%s'\n", *path); *popt = ceph_parse_options(options, dev_name, dev_name_end, @@ -603,6 +618,11 @@ bad_cap: static void destroy_caches(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(ceph_inode_cachep); kmem_cache_destroy(ceph_cap_cachep); kmem_cache_destroy(ceph_dentry_cachep); diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index a08306a..2075ddf 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -9,13 +9,14 @@ config CIFS select CRYPTO_ARC4 select CRYPTO_ECB select CRYPTO_DES + select CRYPTO_SHA256 help This is the client VFS module for the Common Internet File System (CIFS) protocol which is the successor to the Server Message Block (SMB) protocol, the native file sharing mechanism for most early PC operating systems. The CIFS protocol is fully supported by - file servers such as Windows 2000 (including Windows 2003, NT 4 - and Windows XP) as well by Samba (which provides excellent CIFS + file servers such as Windows 2000 (including Windows 2003, Windows 2008, + NT 4 and Windows XP) as well by Samba (which provides excellent CIFS server support for Linux and many other operating systems). Limited support for OS/2 and Windows ME and similar servers is provided as well. @@ -114,6 +115,13 @@ config CIFS_POSIX (such as Samba 3.10 and later) which can negotiate CIFS POSIX ACL support. If unsure, say N. +config CIFS_ACL + bool "Provide CIFS ACL support" + depends on CIFS_XATTR && KEYS + help + Allows fetching CIFS/NTFS ACL from the server. The DACL blob + is handed over to the application/caller. + config CIFS_DEBUG2 bool "Enable additional CIFS debugging routines" depends on CIFS @@ -138,21 +146,6 @@ config CIFS_DFS_UPCALL IP addresses) which is needed for implicit mounts of DFS junction points. If unsure, say N. -config CIFS_FSCACHE - bool "Provide CIFS client caching support" - depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y - help - Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data - to be cached locally on disk through the general filesystem cache - manager. If unsure, say N. - -config CIFS_ACL - bool "Provide CIFS ACL support" - depends on CIFS_XATTR && KEYS - help - Allows to fetch CIFS/NTFS ACL from the server. The DACL blob - is handed over to the application/caller. - config CIFS_NFSD_EXPORT bool "Allow nfsd to export CIFS file system (EXPERIMENTAL)" depends on CIFS && EXPERIMENTAL && BROKEN @@ -161,7 +154,7 @@ config CIFS_NFSD_EXPORT config CIFS_SMB2 bool "SMB2 network file system support (EXPERIMENTAL)" - depends on EXPERIMENTAL && INET && BROKEN + depends on CIFS && EXPERIMENTAL && INET select NLS select KEYS select FSCACHE @@ -178,3 +171,12 @@ config CIFS_SMB2 (compared to cifs) due to protocol improvements. Unless you are a developer or tester, say N. + +config CIFS_FSCACHE + bool "Provide CIFS client caching support" + depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y + help + Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data + to be cached locally on disk through the general filesystem cache + manager. If unsure, say N. + diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index feee943..aa0d68b 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -17,4 +17,4 @@ cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o cifs-$(CONFIG_CIFS_SMB2) += smb2ops.o smb2maperror.o smb2transport.o \ - smb2misc.o smb2pdu.o smb2inode.o + smb2misc.o smb2pdu.o smb2inode.o smb2file.o diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 7dab9c0..71d5d0a 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -203,6 +203,27 @@ cifs_strtoUTF16(__le16 *to, const char *from, int len, int i; wchar_t wchar_to; /* needed to quiet sparse */ + /* special case for utf8 to handle no plane0 chars */ + if (!strcmp(codepage->charset, "utf8")) { + /* + * convert utf8 -> utf16, we assume we have enough space + * as caller should have assumed conversion does not overflow + * in destination len is length in wchar_t units (16bits) + */ + i = utf8s_to_utf16s(from, len, UTF16_LITTLE_ENDIAN, + (wchar_t *) to, len); + + /* if success terminate and exit */ + if (i >= 0) + goto success; + /* + * if fails fall back to UCS encoding as this + * function should not return negative values + * currently can fail only if source contains + * invalid encoded characters + */ + } + for (i = 0; len && *from; i++, from += charlen, len -= charlen) { charlen = codepage->char2uni(from, len, &wchar_to); if (charlen < 1) { @@ -215,6 +236,7 @@ cifs_strtoUTF16(__le16 *to, const char *from, int len, put_unaligned_le16(wchar_to, &to[i]); } +success: put_unaligned_le16(0, &to[i]); return i; } @@ -328,7 +350,7 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen, } ctoUTF16_out: - return i; + return j; } #ifdef CONFIG_CIFS_SMB2 diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 05f4dc2..2ee5c54 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -1222,7 +1222,7 @@ struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, if (!open_file) return get_cifs_acl_by_path(cifs_sb, path, pacllen); - pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen); + pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->fid.netfid, pacllen); cifsFileInfo_put(open_file); return pntsd; } diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 6a0d741..652f505 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -29,6 +29,7 @@ #include "ntlmssp.h" #include <linux/ctype.h> #include <linux/random.h> +#include <linux/highmem.h> /* * Calculate and return the CIFS signature based on the mac key and SMB PDU. @@ -37,11 +38,13 @@ * the sequence number before this function is called. Also, this function * should be called with the server->srv_mutex held. */ -static int cifs_calc_signature(const struct kvec *iov, int n_vec, +static int cifs_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, char *signature) { int i; int rc; + struct kvec *iov = rqst->rq_iov; + int n_vec = rqst->rq_nvec; if (iov == NULL || signature == NULL || server == NULL) return -EINVAL; @@ -91,6 +94,16 @@ static int cifs_calc_signature(const struct kvec *iov, int n_vec, } } + /* now hash over the rq_pages array */ + for (i = 0; i < rqst->rq_npages; i++) { + struct kvec p_iov; + + cifs_rqst_page_to_kvec(rqst, i, &p_iov); + crypto_shash_update(&server->secmech.sdescmd5->shash, + p_iov.iov_base, p_iov.iov_len); + kunmap(rqst->rq_pages[i]); + } + rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature); if (rc) cERROR(1, "%s: Could not generate md5 hash", __func__); @@ -99,12 +112,12 @@ static int cifs_calc_signature(const struct kvec *iov, int n_vec, } /* must be called with server->srv_mutex held */ -int cifs_sign_smbv(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, +int cifs_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server, __u32 *pexpected_response_sequence_number) { int rc = 0; char smb_signature[20]; - struct smb_hdr *cifs_pdu = (struct smb_hdr *)iov[0].iov_base; + struct smb_hdr *cifs_pdu = (struct smb_hdr *)rqst->rq_iov[0].iov_base; if ((cifs_pdu == NULL) || (server == NULL)) return -EINVAL; @@ -125,7 +138,7 @@ int cifs_sign_smbv(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, *pexpected_response_sequence_number = server->sequence_number++; server->sequence_number++; - rc = cifs_calc_signature(iov, n_vec, server, smb_signature); + rc = cifs_calc_signature(rqst, server, smb_signature); if (rc) memset(cifs_pdu->Signature.SecuritySignature, 0, 8); else @@ -134,6 +147,15 @@ int cifs_sign_smbv(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, return rc; } +int cifs_sign_smbv(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, + __u32 *pexpected_response_sequence) +{ + struct smb_rqst rqst = { .rq_iov = iov, + .rq_nvec = n_vec }; + + return cifs_sign_rqst(&rqst, server, pexpected_response_sequence); +} + /* must be called with server->srv_mutex held */ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, __u32 *pexpected_response_sequence_number) @@ -147,14 +169,14 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, pexpected_response_sequence_number); } -int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov, +int cifs_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, __u32 expected_sequence_number) { unsigned int rc; char server_response_sig[8]; char what_we_think_sig_should_be[20]; - struct smb_hdr *cifs_pdu = (struct smb_hdr *)iov[0].iov_base; + struct smb_hdr *cifs_pdu = (struct smb_hdr *)rqst->rq_iov[0].iov_base; if (cifs_pdu == NULL || server == NULL) return -EINVAL; @@ -186,8 +208,7 @@ int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov, cifs_pdu->Signature.Sequence.Reserved = 0; mutex_lock(&server->srv_mutex); - rc = cifs_calc_signature(iov, nr_iov, server, - what_we_think_sig_should_be); + rc = cifs_calc_signature(rqst, server, what_we_think_sig_should_be); mutex_unlock(&server->srv_mutex); if (rc) @@ -686,12 +707,17 @@ calc_seckey(struct cifs_ses *ses) void cifs_crypto_shash_release(struct TCP_Server_Info *server) { + if (server->secmech.hmacsha256) + crypto_free_shash(server->secmech.hmacsha256); + if (server->secmech.md5) crypto_free_shash(server->secmech.md5); if (server->secmech.hmacmd5) crypto_free_shash(server->secmech.hmacmd5); + kfree(server->secmech.sdeschmacsha256); + kfree(server->secmech.sdeschmacmd5); kfree(server->secmech.sdescmd5); @@ -716,6 +742,13 @@ cifs_crypto_shash_allocate(struct TCP_Server_Info *server) goto crypto_allocate_md5_fail; } + server->secmech.hmacsha256 = crypto_alloc_shash("hmac(sha256)", 0, 0); + if (IS_ERR(server->secmech.hmacsha256)) { + cERROR(1, "could not allocate crypto hmacsha256\n"); + rc = PTR_ERR(server->secmech.hmacsha256); + goto crypto_allocate_hmacsha256_fail; + } + size = sizeof(struct shash_desc) + crypto_shash_descsize(server->secmech.hmacmd5); server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL); @@ -727,7 +760,6 @@ cifs_crypto_shash_allocate(struct TCP_Server_Info *server) server->secmech.sdeschmacmd5->shash.tfm = server->secmech.hmacmd5; server->secmech.sdeschmacmd5->shash.flags = 0x0; - size = sizeof(struct shash_desc) + crypto_shash_descsize(server->secmech.md5); server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL); @@ -739,12 +771,29 @@ cifs_crypto_shash_allocate(struct TCP_Server_Info *server) server->secmech.sdescmd5->shash.tfm = server->secmech.md5; server->secmech.sdescmd5->shash.flags = 0x0; + size = sizeof(struct shash_desc) + + crypto_shash_descsize(server->secmech.hmacsha256); + server->secmech.sdeschmacsha256 = kmalloc(size, GFP_KERNEL); + if (!server->secmech.sdeschmacsha256) { + cERROR(1, "%s: Can't alloc hmacsha256\n", __func__); + rc = -ENOMEM; + goto crypto_allocate_hmacsha256_sdesc_fail; + } + server->secmech.sdeschmacsha256->shash.tfm = server->secmech.hmacsha256; + server->secmech.sdeschmacsha256->shash.flags = 0x0; + return 0; +crypto_allocate_hmacsha256_sdesc_fail: + kfree(server->secmech.sdescmd5); + crypto_allocate_md5_sdesc_fail: kfree(server->secmech.sdeschmacmd5); crypto_allocate_hmacmd5_sdesc_fail: + crypto_free_shash(server->secmech.hmacsha256); + +crypto_allocate_hmacsha256_fail: crypto_free_shash(server->secmech.md5); crypto_allocate_md5_fail: diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index db8a404..e7931cc 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -36,6 +36,7 @@ #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/namei.h> +#include <linux/random.h> #include <net/ipv6.h> #include "cifsfs.h" #include "cifspdu.h" @@ -51,7 +52,6 @@ #ifdef CONFIG_CIFS_SMB2 #include "smb2pdu.h" #endif -#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */ int cifsFYI = 0; int cifsERROR = 1; @@ -89,6 +89,10 @@ extern mempool_t *cifs_mid_poolp; struct workqueue_struct *cifsiod_wq; +#ifdef CONFIG_CIFS_SMB2 +__u8 cifs_client_guid[SMB2_CLIENT_GUID_SIZE]; +#endif + static int cifs_read_super(struct super_block *sb) { @@ -160,13 +164,12 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf) struct super_block *sb = dentry->d_sb; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); - int rc = -EOPNOTSUPP; + struct TCP_Server_Info *server = tcon->ses->server; unsigned int xid; + int rc = 0; xid = get_xid(); - buf->f_type = CIFS_MAGIC_NUMBER; - /* * PATH_MAX may be too long - it would presumably be total path, * but note that some servers (includinng Samba 3) have a shorter @@ -178,27 +181,8 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = 0; /* undefined */ buf->f_ffree = 0; /* unlimited */ - /* - * We could add a second check for a QFS Unix capability bit - */ - if ((tcon->ses->capabilities & CAP_UNIX) && - (CIFS_POSIX_EXTENSIONS & le64_to_cpu(tcon->fsUnixInfo.Capability))) - rc = CIFSSMBQFSPosixInfo(xid, tcon, buf); - - /* - * Only need to call the old QFSInfo if failed on newer one, - * e.g. by OS/2. - **/ - if (rc && (tcon->ses->capabilities & CAP_NT_SMBS)) - rc = CIFSSMBQFSInfo(xid, tcon, buf); - - /* - * Some old Windows servers also do not support level 103, retry with - * older level one if old server failed the previous call or we - * bypassed it because we detected that this was an older LANMAN sess - */ - if (rc) - rc = SMBOldQFSInfo(xid, tcon, buf); + if (server->ops->queryfs) + rc = server->ops->queryfs(xid, tcon, buf); free_xid(xid); return 0; @@ -239,9 +223,10 @@ cifs_alloc_inode(struct super_block *sb) return NULL; cifs_inode->cifsAttrs = 0x20; /* default */ cifs_inode->time = 0; - /* Until the file is open and we have gotten oplock - info back from the server, can not assume caching of - file data or metadata */ + /* + * Until the file is open and we have gotten oplock info back from the + * server, can not assume caching of file data or metadata. + */ cifs_set_oplock_level(cifs_inode, 0); cifs_inode->delete_pending = false; cifs_inode->invalid_mapping = false; @@ -249,11 +234,16 @@ cifs_alloc_inode(struct super_block *sb) cifs_inode->server_eof = 0; cifs_inode->uniqueid = 0; cifs_inode->createtime = 0; - - /* Can not set i_flags here - they get immediately overwritten - to zero by the VFS */ -/* cifs_inode->vfs_inode.i_flags = S_NOATIME | S_NOCMTIME;*/ +#ifdef CONFIG_CIFS_SMB2 + get_random_bytes(cifs_inode->lease_key, SMB2_LEASE_KEY_SIZE); +#endif + /* + * Can not set i_flags here - they get immediately overwritten to zero + * by the VFS. + */ + /* cifs_inode->vfs_inode.i_flags = S_NOATIME | S_NOCMTIME; */ INIT_LIST_HEAD(&cifs_inode->openFileList); + INIT_LIST_HEAD(&cifs_inode->llist); return &cifs_inode->vfs_inode; } @@ -360,7 +350,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) cifs_show_security(s, tcon->ses->server); cifs_show_cache_flavor(s, cifs_sb); - seq_printf(s, ",unc=%s", tcon->treeName); + seq_printf(s, ",unc="); + seq_escape(s, tcon->treeName, " \t\n\\"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) seq_printf(s, ",multiuser"); @@ -957,7 +948,7 @@ cifs_init_once(void *inode) struct cifsInodeInfo *cifsi = inode; inode_init_once(&cifsi->vfs_inode); - mutex_init(&cifsi->lock_mutex); + init_rwsem(&cifsi->lock_sem); } static int @@ -977,6 +968,11 @@ cifs_init_inodecache(void) static void cifs_destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(cifs_inode_cachep); } @@ -1127,6 +1123,10 @@ init_cifs(void) spin_lock_init(&cifs_file_list_lock); spin_lock_init(&GlobalMid_Lock); +#ifdef CONFIG_CIFS_SMB2 + get_random_bytes(cifs_client_guid, SMB2_CLIENT_GUID_SIZE); +#endif + if (cifs_max_pending < 2) { cifs_max_pending = 2; cFYI(1, "cifs_max_pending set to min of 2"); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 1c49c5a..7163419 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -128,5 +128,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "1.78" +#define CIFS_VERSION "2.0" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 977dc0e..f5af252 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -32,6 +32,8 @@ #include "smb2pdu.h" #endif +#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */ + /* * The sizes of various internal tables and strings */ @@ -128,8 +130,10 @@ struct sdesc { struct cifs_secmech { struct crypto_shash *hmacmd5; /* hmac-md5 hash function */ struct crypto_shash *md5; /* md5 hash function */ + struct crypto_shash *hmacsha256; /* hmac-sha256 hash function */ struct sdesc *sdeschmacmd5; /* ctxt to generate ntlmv2 hash, CR1 */ struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */ + struct sdesc *sdeschmacsha256; /* ctxt to generate smb2 signature */ }; /* per smb session structure/fields */ @@ -158,9 +162,24 @@ struct cifs_cred { ***************************************************************** */ +/* + * A smb_rqst represents a complete request to be issued to a server. It's + * formed by a kvec array, followed by an array of pages. Page data is assumed + * to start at the beginning of the first page. + */ +struct smb_rqst { + struct kvec *rq_iov; /* array of kvecs */ + unsigned int rq_nvec; /* number of kvecs in array */ + struct page **rq_pages; /* pointer to array of page ptrs */ + unsigned int rq_npages; /* number pages in array */ + unsigned int rq_pagesz; /* page size to use */ + unsigned int rq_tailsz; /* length of last page */ +}; + enum smb_version { Smb_1 = 1, Smb_21, + Smb_30, }; struct mid_q_entry; @@ -171,17 +190,23 @@ struct cifs_tcon; struct dfs_info3_param; struct cifs_fattr; struct smb_vol; +struct cifs_fid; +struct cifs_readdata; +struct cifs_writedata; +struct cifs_io_parms; +struct cifs_search_info; +struct cifsInodeInfo; struct smb_version_operations { int (*send_cancel)(struct TCP_Server_Info *, void *, struct mid_q_entry *); bool (*compare_fids)(struct cifsFileInfo *, struct cifsFileInfo *); /* setup request: allocate mid, sign message */ - int (*setup_request)(struct cifs_ses *, struct kvec *, unsigned int, - struct mid_q_entry **); + struct mid_q_entry *(*setup_request)(struct cifs_ses *, + struct smb_rqst *); /* setup async request: allocate mid, sign message */ - int (*setup_async_request)(struct TCP_Server_Info *, struct kvec *, - unsigned int, struct mid_q_entry **); + struct mid_q_entry *(*setup_async_request)(struct TCP_Server_Info *, + struct smb_rqst *); /* check response: verify signature, map error */ int (*check_receive)(struct mid_q_entry *, struct TCP_Server_Info *, bool); @@ -212,6 +237,10 @@ struct smb_version_operations { bool (*need_neg)(struct TCP_Server_Info *); /* negotiate to the server */ int (*negotiate)(const unsigned int, struct cifs_ses *); + /* set negotiated write size */ + unsigned int (*negotiate_wsize)(struct cifs_tcon *, struct smb_vol *); + /* set negotiated read size */ + unsigned int (*negotiate_rsize)(struct cifs_tcon *, struct smb_vol *); /* setup smb sessionn */ int (*sess_setup)(const unsigned int, struct cifs_ses *, const struct nls_table *); @@ -235,10 +264,22 @@ struct smb_version_operations { int (*query_path_info)(const unsigned int, struct cifs_tcon *, struct cifs_sb_info *, const char *, FILE_ALL_INFO *, bool *); + /* query file data from the server */ + int (*query_file_info)(const unsigned int, struct cifs_tcon *, + struct cifs_fid *, FILE_ALL_INFO *); /* get server index number */ int (*get_srv_inum)(const unsigned int, struct cifs_tcon *, struct cifs_sb_info *, const char *, u64 *uniqueid, FILE_ALL_INFO *); + /* set size by path */ + int (*set_path_size)(const unsigned int, struct cifs_tcon *, + const char *, __u64, struct cifs_sb_info *, bool); + /* set size by file handle */ + int (*set_file_size)(const unsigned int, struct cifs_tcon *, + struct cifsFileInfo *, __u64, bool); + /* set attributes */ + int (*set_file_info)(struct inode *, const char *, FILE_BASIC_INFO *, + const unsigned int); /* build a full path to the root of the mount */ char * (*build_path_to_root)(struct smb_vol *, struct cifs_sb_info *, struct cifs_tcon *); @@ -256,10 +297,84 @@ struct smb_version_operations { /* remove directory */ int (*rmdir)(const unsigned int, struct cifs_tcon *, const char *, struct cifs_sb_info *); + /* unlink file */ + int (*unlink)(const unsigned int, struct cifs_tcon *, const char *, + struct cifs_sb_info *); + /* open, rename and delete file */ + int (*rename_pending_delete)(const char *, struct dentry *, + const unsigned int); + /* send rename request */ + int (*rename)(const unsigned int, struct cifs_tcon *, const char *, + const char *, struct cifs_sb_info *); + /* send create hardlink request */ + int (*create_hardlink)(const unsigned int, struct cifs_tcon *, + const char *, const char *, + struct cifs_sb_info *); + /* open a file for non-posix mounts */ + int (*open)(const unsigned int, struct cifs_tcon *, const char *, int, + int, int, struct cifs_fid *, __u32 *, FILE_ALL_INFO *, + struct cifs_sb_info *); + /* set fid protocol-specific info */ + void (*set_fid)(struct cifsFileInfo *, struct cifs_fid *, __u32); + /* close a file */ + void (*close)(const unsigned int, struct cifs_tcon *, + struct cifs_fid *); + /* send a flush request to the server */ + int (*flush)(const unsigned int, struct cifs_tcon *, struct cifs_fid *); + /* async read from the server */ + int (*async_readv)(struct cifs_readdata *); + /* async write to the server */ + int (*async_writev)(struct cifs_writedata *); + /* sync read from the server */ + int (*sync_read)(const unsigned int, struct cifsFileInfo *, + struct cifs_io_parms *, unsigned int *, char **, + int *); + /* sync write to the server */ + int (*sync_write)(const unsigned int, struct cifsFileInfo *, + struct cifs_io_parms *, unsigned int *, struct kvec *, + unsigned long); + /* open dir, start readdir */ + int (*query_dir_first)(const unsigned int, struct cifs_tcon *, + const char *, struct cifs_sb_info *, + struct cifs_fid *, __u16, + struct cifs_search_info *); + /* continue readdir */ + int (*query_dir_next)(const unsigned int, struct cifs_tcon *, + struct cifs_fid *, + __u16, struct cifs_search_info *srch_inf); + /* close dir */ + int (*close_dir)(const unsigned int, struct cifs_tcon *, + struct cifs_fid *); + /* calculate a size of SMB message */ + unsigned int (*calc_smb_size)(void *); + /* check for STATUS_PENDING and process it in a positive case */ + bool (*is_status_pending)(char *, struct TCP_Server_Info *, int); + /* send oplock break response */ + int (*oplock_response)(struct cifs_tcon *, struct cifs_fid *, + struct cifsInodeInfo *); + /* query remote filesystem */ + int (*queryfs)(const unsigned int, struct cifs_tcon *, + struct kstatfs *); + /* send mandatory brlock to the server */ + int (*mand_lock)(const unsigned int, struct cifsFileInfo *, __u64, + __u64, __u32, int, int, bool); + /* unlock range of mandatory locks */ + int (*mand_unlock_range)(struct cifsFileInfo *, struct file_lock *, + const unsigned int); + /* push brlocks from the cache to the server */ + int (*push_mand_locks)(struct cifsFileInfo *); + /* get lease key of the inode */ + void (*get_lease_key)(struct inode *, struct cifs_fid *fid); + /* set lease key of the inode */ + void (*set_lease_key)(struct inode *, struct cifs_fid *fid); + /* generate new lease key */ + void (*new_lease_key)(struct cifs_fid *fid); }; struct smb_version_values { char *version_string; + __u16 protocol_id; + __u32 req_capabilities; __u32 large_lock_type; __u32 exclusive_lock_type; __u32 shared_lock_type; @@ -496,6 +611,51 @@ get_next_mid(struct TCP_Server_Info *server) } /* + * When the server supports very large reads and writes via POSIX extensions, + * we can allow up to 2^24-1, minus the size of a READ/WRITE_AND_X header, not + * including the RFC1001 length. + * + * Note that this might make for "interesting" allocation problems during + * writeback however as we have to allocate an array of pointers for the + * pages. A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096. + * + * For reads, there is a similar problem as we need to allocate an array + * of kvecs to handle the receive, though that should only need to be done + * once. + */ +#define CIFS_MAX_WSIZE ((1<<24) - 1 - sizeof(WRITE_REQ) + 4) +#define CIFS_MAX_RSIZE ((1<<24) - sizeof(READ_RSP) + 4) + +/* + * When the server doesn't allow large posix writes, only allow a rsize/wsize + * of 2^17-1 minus the size of the call header. That allows for a read or + * write up to the maximum size described by RFC1002. + */ +#define CIFS_MAX_RFC1002_WSIZE ((1<<17) - 1 - sizeof(WRITE_REQ) + 4) +#define CIFS_MAX_RFC1002_RSIZE ((1<<17) - 1 - sizeof(READ_RSP) + 4) + +/* + * The default wsize is 1M. find_get_pages seems to return a maximum of 256 + * pages in a single call. With PAGE_CACHE_SIZE == 4k, this means we can fill + * a single wsize request with a single call. + */ +#define CIFS_DEFAULT_IOSIZE (1024 * 1024) + +/* + * Windows only supports a max of 60kb reads and 65535 byte writes. Default to + * those values when posix extensions aren't in force. In actuality here, we + * use 65536 to allow for a write that is a multiple of 4k. Most servers seem + * to be ok with the extra byte even though Windows doesn't send writes that + * are that large. + * + * Citation: + * + * http://blogs.msdn.com/b/openspecification/archive/2009/04/10/smb-maximum-transmit-buffer-size-and-performance-tuning.aspx + */ +#define CIFS_DEFAULT_NON_POSIX_RSIZE (60 * 1024) +#define CIFS_DEFAULT_NON_POSIX_WSIZE (65536) + +/* * Macros to allow the TCP_Server_Info->net field and related code to drop out * when CONFIG_NET_NS isn't set. */ @@ -559,6 +719,7 @@ struct cifs_ses { __u16 session_flags; #endif /* CONFIG_CIFS_SMB2 */ }; + /* no more than one of the following three session flags may be set */ #define CIFS_SES_NT4 1 #define CIFS_SES_OS2 2 @@ -665,6 +826,7 @@ struct cifs_tcon { u64 resource_id; /* server resource id */ struct fscache_cookie *fscache; /* cookie for share */ #endif + struct list_head pending_opens; /* list of incomplete opens */ /* BB add field for back pointer to sb struct(s)? */ }; @@ -707,6 +869,15 @@ cifs_get_tlink(struct tcon_link *tlink) /* This function is always expected to succeed */ extern struct cifs_tcon *cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb); +#define CIFS_OPLOCK_NO_CHANGE 0xfe + +struct cifs_pending_open { + struct list_head olist; + struct tcon_link *tlink; + __u8 lease_key[16]; + __u32 oplock; +}; + /* * This info hangs off the cifsFileInfo structure, pointed to by llist. * This is used to track byte stream locks on the file @@ -740,16 +911,29 @@ struct cifs_search_info { bool smallBuf:1; /* so we know which buf_release function to call */ }; +struct cifs_fid { + __u16 netfid; +#ifdef CONFIG_CIFS_SMB2 + __u64 persistent_fid; /* persist file id for smb2 */ + __u64 volatile_fid; /* volatile file id for smb2 */ + __u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for smb2 */ +#endif + struct cifs_pending_open *pending_open; +}; + +struct cifs_fid_locks { + struct list_head llist; + struct cifsFileInfo *cfile; /* fid that owns locks */ + struct list_head locks; /* locks held by fid above */ +}; + struct cifsFileInfo { struct list_head tlist; /* pointer to next fid owned by tcon */ struct list_head flist; /* next fid (file instance) for this inode */ - struct list_head llist; /* - * brlocks held by this fid, protected by - * lock_mutex from cifsInodeInfo structure - */ + struct cifs_fid_locks *llist; /* brlocks held by this fid */ unsigned int uid; /* allows finding which FileInfo structure */ __u32 pid; /* process id who opened file */ - __u16 netfid; /* file id from remote */ + struct cifs_fid fid; /* file id from remote */ /* BB add lock scope info here if needed */ ; /* lock scope id (0 if none) */ struct dentry *dentry; @@ -765,12 +949,60 @@ struct cifsFileInfo { struct cifs_io_parms { __u16 netfid; +#ifdef CONFIG_CIFS_SMB2 + __u64 persistent_fid; /* persist file id for smb2 */ + __u64 volatile_fid; /* volatile file id for smb2 */ +#endif __u32 pid; __u64 offset; unsigned int length; struct cifs_tcon *tcon; }; +struct cifs_readdata; + +/* asynchronous read support */ +struct cifs_readdata { + struct kref refcount; + struct list_head list; + struct completion done; + struct cifsFileInfo *cfile; + struct address_space *mapping; + __u64 offset; + unsigned int bytes; + pid_t pid; + int result; + struct work_struct work; + int (*read_into_pages)(struct TCP_Server_Info *server, + struct cifs_readdata *rdata, + unsigned int len); + struct kvec iov; + unsigned int pagesz; + unsigned int tailsz; + unsigned int nr_pages; + struct page *pages[]; +}; + +struct cifs_writedata; + +/* asynchronous write support */ +struct cifs_writedata { + struct kref refcount; + struct list_head list; + struct completion done; + enum writeback_sync_modes sync_mode; + struct work_struct work; + struct cifsFileInfo *cfile; + __u64 offset; + pid_t pid; + unsigned int bytes; + int result; + unsigned int pagesz; + unsigned int tailsz; + unsigned int nr_pages; + struct page *pages[1]; +}; + /* * Take a reference on the file private data. Must be called with * cifs_file_list_lock held. @@ -790,11 +1022,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file); struct cifsInodeInfo { bool can_cache_brlcks; - struct mutex lock_mutex; /* - * protect the field above and llist - * from every cifsFileInfo structure - * from openFileList - */ + struct list_head llist; /* locks helb by this inode */ + struct rw_semaphore lock_sem; /* protect the fields above */ /* BB add in lists for dirty pages i.e. write caching info for oplock */ struct list_head openFileList; __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */ @@ -806,6 +1035,9 @@ struct cifsInodeInfo { u64 server_eof; /* current file size on server -- protected by i_lock */ u64 uniqueid; /* server inode number */ u64 createtime; /* creation time on server */ +#ifdef CONFIG_CIFS_SMB2 + __u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for this inode */ +#endif #ifdef CONFIG_CIFS_FSCACHE struct fscache_cookie *fscache; #endif @@ -1130,7 +1362,7 @@ require use of the stronger protocol */ #define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */ #define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */ -#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2) +#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP) #define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2) #define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP) /* @@ -1267,7 +1499,13 @@ extern mempool_t *cifs_mid_poolp; #define SMB1_VERSION_STRING "1.0" extern struct smb_version_operations smb1_operations; extern struct smb_version_values smb1_values; +#define SMB20_VERSION_STRING "2.0" +/*extern struct smb_version_operations smb20_operations; */ /* not needed yet */ +extern struct smb_version_values smb20_values; #define SMB21_VERSION_STRING "2.1" extern struct smb_version_operations smb21_operations; extern struct smb_version_values smb21_values; +#define SMB30_VERSION_STRING "3.0" +/*extern struct smb_version_operations smb30_operations; */ /* not needed yet */ +extern struct smb_version_values smb30_values; #endif /* _CIFS_GLOB_H */ diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 3fb03e2..b9d59a9 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -2210,7 +2210,7 @@ typedef struct { /* data block encoding of response to level 263 QPathInfo */ __u8 DeletePending; __u8 Directory; __u16 Pad2; - __u64 IndexNumber; + __le64 IndexNumber; __le32 EASize; __le32 AccessFlags; __u64 IndexNumber1; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index f1bbf83..5144e9f 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -24,6 +24,7 @@ struct statfs; struct smb_vol; +struct smb_rqst; /* ***************************************************************** @@ -35,6 +36,8 @@ extern struct smb_hdr *cifs_buf_get(void); extern void cifs_buf_release(void *); extern struct smb_hdr *cifs_small_buf_get(void); extern void cifs_small_buf_release(void *); +extern void cifs_rqst_page_to_kvec(struct smb_rqst *rqst, unsigned int idx, + struct kvec *iov); extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *, unsigned int /* length */); extern unsigned int _get_xid(void); @@ -65,21 +68,22 @@ extern char *cifs_compose_mount_options(const char *sb_mountdata, extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server); extern void DeleteMidQEntry(struct mid_q_entry *midEntry); +extern void cifs_delete_mid(struct mid_q_entry *mid); extern void cifs_wake_up_task(struct mid_q_entry *mid); -extern int cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, - unsigned int nvec, mid_receive_t *receive, - mid_callback_t *callback, void *cbdata, - const int flags); +extern int cifs_call_async(struct TCP_Server_Info *server, + struct smb_rqst *rqst, + mid_receive_t *receive, mid_callback_t *callback, + void *cbdata, const int flags); extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *, struct smb_hdr * /* input */ , struct smb_hdr * /* out */ , int * /* bytes returned */ , const int); extern int SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses, char *in_buf, int flags); -extern int cifs_setup_request(struct cifs_ses *, struct kvec *, unsigned int, - struct mid_q_entry **); -extern int cifs_setup_async_request(struct TCP_Server_Info *, struct kvec *, - unsigned int, struct mid_q_entry **); +extern struct mid_q_entry *cifs_setup_request(struct cifs_ses *, + struct smb_rqst *); +extern struct mid_q_entry *cifs_setup_async_request(struct TCP_Server_Info *, + struct smb_rqst *); extern int cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, bool log_error); extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *, @@ -99,7 +103,7 @@ extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, unsigned int bytes_written); extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool); extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool); -extern unsigned int smbCalcSize(struct smb_hdr *ptr); +extern unsigned int smbCalcSize(void *buf); extern int decode_negTokenInit(unsigned char *security_blob, int length, struct TCP_Server_Info *server); extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len); @@ -120,10 +124,14 @@ extern u64 cifs_UnixTimeToNT(struct timespec); extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset); extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock); - -extern struct cifsFileInfo *cifs_new_fileinfo(__u16 fileHandle, - struct file *file, struct tcon_link *tlink, - __u32 oplock); +extern int cifs_unlock_range(struct cifsFileInfo *cfile, + struct file_lock *flock, const unsigned int xid); +extern int cifs_push_mandatory_locks(struct cifsFileInfo *cfile); + +extern struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, + struct file *file, + struct tcon_link *tlink, + __u32 oplock); extern int cifs_posix_open(char *full_path, struct inode **inode, struct super_block *sb, int mode, unsigned int f_flags, __u32 *oplock, __u16 *netfid, @@ -132,18 +140,23 @@ void cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr); extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info, struct cifs_sb_info *cifs_sb); +extern void cifs_dir_info_to_fattr(struct cifs_fattr *, FILE_DIRECTORY_INFO *, + struct cifs_sb_info *); extern void cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr); extern struct inode *cifs_iget(struct super_block *sb, struct cifs_fattr *fattr); -extern int cifs_get_file_info(struct file *filp); extern int cifs_get_inode_info(struct inode **inode, const char *full_path, FILE_ALL_INFO *data, struct super_block *sb, int xid, const __u16 *fid); -extern int cifs_get_file_info_unix(struct file *filp); extern int cifs_get_inode_info_unix(struct inode **pinode, const unsigned char *search_path, struct super_block *sb, unsigned int xid); +extern int cifs_set_file_info(struct inode *inode, struct iattr *attrs, + unsigned int xid, char *full_path, __u32 dosattr); +extern int cifs_rename_pending_delete(const char *full_path, + struct dentry *dentry, + const unsigned int xid); extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, struct inode *inode, const char *path, const __u16 *pfid); @@ -169,6 +182,17 @@ extern struct smb_vol *cifs_get_volume_info(char *mount_data, extern int cifs_mount(struct cifs_sb_info *, struct smb_vol *); extern void cifs_umount(struct cifs_sb_info *); extern void cifs_mark_open_files_invalid(struct cifs_tcon *tcon); +extern bool cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, + __u64 length, __u8 type, + struct cifsLockInfo **conf_lock, + bool rw_check); +extern void cifs_add_pending_open(struct cifs_fid *fid, + struct tcon_link *tlink, + struct cifs_pending_open *open); +extern void cifs_add_pending_open_locked(struct cifs_fid *fid, + struct tcon_link *tlink, + struct cifs_pending_open *open); +extern void cifs_del_pending_open(struct cifs_pending_open *open); #if IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) extern void cifs_dfs_release_automount_timer(void); @@ -179,6 +203,10 @@ extern void cifs_dfs_release_automount_timer(void); void cifs_proc_init(void); void cifs_proc_clean(void); +extern void cifs_move_llist(struct list_head *source, struct list_head *dest); +extern void cifs_free_llist(struct list_head *llist); +extern void cifs_del_lock_waiters(struct cifsLockInfo *lock); + extern int cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses); extern int cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, @@ -190,10 +218,10 @@ extern int CIFSTCon(const unsigned int xid, struct cifs_ses *ses, const struct nls_table *); extern int CIFSFindFirst(const unsigned int xid, struct cifs_tcon *tcon, - const char *searchName, const struct nls_table *nls_codepage, + const char *searchName, struct cifs_sb_info *cifs_sb, __u16 *searchHandle, __u16 search_flags, struct cifs_search_info *psrch_inf, - int map, const char dirsep); + bool msearch); extern int CIFSFindNext(const unsigned int xid, struct cifs_tcon *tcon, __u16 searchHandle, __u16 search_flags, @@ -265,13 +293,11 @@ extern int CIFSSMBSetAttrLegacy(unsigned int xid, struct cifs_tcon *tcon, const struct nls_table *nls_codepage); #endif /* possibly unneeded function */ extern int CIFSSMBSetEOF(const unsigned int xid, struct cifs_tcon *tcon, - const char *fileName, __u64 size, - bool setAllocationSizeFlag, - const struct nls_table *nls_codepage, - int remap_special_chars); + const char *file_name, __u64 size, + struct cifs_sb_info *cifs_sb, bool set_allocation); extern int CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon, - __u64 size, __u16 fileHandle, __u32 opener_pid, - bool AllocSizeFlag); + struct cifsFileInfo *cfile, __u64 size, + bool set_allocation); struct cifs_unix_set_info_args { __u64 ctime; @@ -303,22 +329,17 @@ extern int CIFSPOSIXDelFile(const unsigned int xid, struct cifs_tcon *tcon, const struct nls_table *nls_codepage, int remap_special_chars); extern int CIFSSMBDelFile(const unsigned int xid, struct cifs_tcon *tcon, - const char *name, - const struct nls_table *nls_codepage, - int remap_special_chars); + const char *name, struct cifs_sb_info *cifs_sb); extern int CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon, - const char *fromName, const char *toName, - const struct nls_table *nls_codepage, - int remap_special_chars); + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb); extern int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *tcon, int netfid, const char *target_name, const struct nls_table *nls_codepage, int remap_special_chars); -extern int CIFSCreateHardLink(const unsigned int xid, - struct cifs_tcon *tcon, - const char *fromName, const char *toName, - const struct nls_table *nls_codepage, - int remap_special_chars); +extern int CIFSCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb); extern int CIFSUnixCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon, const char *fromName, const char *toName, @@ -367,8 +388,7 @@ extern int CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes, const char *buf, const char __user *ubuf, const int long_op); extern int CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms, - unsigned int *nbytes, struct kvec *iov, const int nvec, - const int long_op); + unsigned int *nbytes, struct kvec *iov, const int nvec); extern int CIFSGetSrvInodeNumber(const unsigned int xid, struct cifs_tcon *tcon, const char *search_name, __u64 *inode_number, const struct nls_table *nls_codepage, @@ -397,10 +417,12 @@ extern void sesInfoFree(struct cifs_ses *); extern struct cifs_tcon *tconInfoAlloc(void); extern void tconInfoFree(struct cifs_tcon *); -extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *); +extern int cifs_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server, + __u32 *pexpected_response_sequence_number); extern int cifs_sign_smbv(struct kvec *iov, int n_vec, struct TCP_Server_Info *, __u32 *); -extern int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov, +extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *); +extern int cifs_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, __u32 expected_sequence_number); extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *, @@ -462,45 +484,9 @@ extern int E_md4hash(const unsigned char *passwd, unsigned char *p16, extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24); -/* asynchronous read support */ -struct cifs_readdata { - struct kref refcount; - struct list_head list; - struct completion done; - struct cifsFileInfo *cfile; - struct address_space *mapping; - __u64 offset; - unsigned int bytes; - pid_t pid; - int result; - struct list_head pages; - struct work_struct work; - int (*marshal_iov) (struct cifs_readdata *rdata, - unsigned int remaining); - unsigned int nr_iov; - struct kvec iov[1]; -}; - void cifs_readdata_release(struct kref *refcount); int cifs_async_readv(struct cifs_readdata *rdata); - -/* asynchronous write support */ -struct cifs_writedata { - struct kref refcount; - struct list_head list; - struct completion done; - enum writeback_sync_modes sync_mode; - struct work_struct work; - struct cifsFileInfo *cfile; - __u64 offset; - pid_t pid; - unsigned int bytes; - int result; - void (*marshal_iov) (struct kvec *iov, - struct cifs_writedata *wdata); - unsigned int nr_pages; - struct page *pages[1]; -}; +int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid); int cifs_async_writev(struct cifs_writedata *wdata); void cifs_writev_complete(struct work_struct *work); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index f0cf934..76d0d29 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -86,32 +86,6 @@ static struct { #endif /* CONFIG_CIFS_WEAK_PW_HASH */ #endif /* CIFS_POSIX */ -#ifdef CONFIG_HIGHMEM -/* - * On arches that have high memory, kmap address space is limited. By - * serializing the kmap operations on those arches, we ensure that we don't - * end up with a bunch of threads in writeback with partially mapped page - * arrays, stuck waiting for kmap to come back. That situation prevents - * progress and can deadlock. - */ -static DEFINE_MUTEX(cifs_kmap_mutex); - -static inline void -cifs_kmap_lock(void) -{ - mutex_lock(&cifs_kmap_mutex); -} - -static inline void -cifs_kmap_unlock(void) -{ - mutex_unlock(&cifs_kmap_mutex); -} -#else /* !CONFIG_HIGHMEM */ -#define cifs_kmap_lock() do { ; } while(0) -#define cifs_kmap_unlock() do { ; } while(0) -#endif /* CONFIG_HIGHMEM */ - /* * Mark as invalid, all open files on tree connections since they * were closed when session to server was lost. @@ -751,6 +725,8 @@ CIFSSMBEcho(struct TCP_Server_Info *server) ECHO_REQ *smb; int rc = 0; struct kvec iov; + struct smb_rqst rqst = { .rq_iov = &iov, + .rq_nvec = 1 }; cFYI(1, "In echo request"); @@ -768,7 +744,7 @@ CIFSSMBEcho(struct TCP_Server_Info *server) iov.iov_base = smb; iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4; - rc = cifs_call_async(server, &iov, 1, NULL, cifs_echo_callback, + rc = cifs_call_async(server, &rqst, NULL, cifs_echo_callback, server, CIFS_ASYNC_OP | CIFS_ECHO_OP); if (rc) cFYI(1, "Echo request failed: %d", rc); @@ -902,15 +878,15 @@ PsxDelete: } int -CIFSSMBDelFile(const unsigned int xid, struct cifs_tcon *tcon, - const char *fileName, const struct nls_table *nls_codepage, - int remap) +CIFSSMBDelFile(const unsigned int xid, struct cifs_tcon *tcon, const char *name, + struct cifs_sb_info *cifs_sb) { DELETE_FILE_REQ *pSMB = NULL; DELETE_FILE_RSP *pSMBr = NULL; int rc = 0; int bytes_returned; int name_len; + int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; DelFileRetry: rc = smb_init(SMB_COM_DELETE, 1, tcon, (void **) &pSMB, @@ -919,15 +895,15 @@ DelFileRetry: return rc; if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { - name_len = - cifsConvertToUTF16((__le16 *) pSMB->fileName, fileName, - PATH_MAX, nls_codepage, remap); + name_len = cifsConvertToUTF16((__le16 *) pSMB->fileName, name, + PATH_MAX, cifs_sb->local_nls, + remap); name_len++; /* trailing null */ name_len *= 2; } else { /* BB improve check for buffer overruns BB */ - name_len = strnlen(fileName, PATH_MAX); + name_len = strnlen(name, PATH_MAX); name_len++; /* trailing null */ - strncpy(pSMB->fileName, fileName, name_len); + strncpy(pSMB->fileName, name, name_len); } pSMB->SearchAttributes = cpu_to_le16(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM); @@ -1440,7 +1416,7 @@ cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) return 0; } -static int +int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) { int length, len; @@ -1460,10 +1436,10 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) len = min_t(unsigned int, buflen, server->vals->read_rsp_size) - HEADER_SIZE(server) + 1; - rdata->iov[0].iov_base = buf + HEADER_SIZE(server) - 1; - rdata->iov[0].iov_len = len; + rdata->iov.iov_base = buf + HEADER_SIZE(server) - 1; + rdata->iov.iov_len = len; - length = cifs_readv_from_socket(server, rdata->iov, 1, len); + length = cifs_readv_from_socket(server, &rdata->iov, 1, len); if (length < 0) return length; server->total_read += length; @@ -1509,19 +1485,19 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) len = data_offset - server->total_read; if (len > 0) { /* read any junk before data into the rest of smallbuf */ - rdata->iov[0].iov_base = buf + server->total_read; - rdata->iov[0].iov_len = len; - length = cifs_readv_from_socket(server, rdata->iov, 1, len); + rdata->iov.iov_base = buf + server->total_read; + rdata->iov.iov_len = len; + length = cifs_readv_from_socket(server, &rdata->iov, 1, len); if (length < 0) return length; server->total_read += length; } /* set up first iov for signature check */ - rdata->iov[0].iov_base = buf; - rdata->iov[0].iov_len = server->total_read; + rdata->iov.iov_base = buf; + rdata->iov.iov_len = server->total_read; cFYI(1, "0: iov_base=%p iov_len=%zu", - rdata->iov[0].iov_base, rdata->iov[0].iov_len); + rdata->iov.iov_base, rdata->iov.iov_len); /* how much data is in the response? */ data_len = server->ops->read_data_length(buf); @@ -1531,23 +1507,11 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) return cifs_readv_discard(server, mid); } - /* marshal up the page array */ - cifs_kmap_lock(); - len = rdata->marshal_iov(rdata, data_len); - cifs_kmap_unlock(); - data_len -= len; - - /* issue the read if we have any iovecs left to fill */ - if (rdata->nr_iov > 1) { - length = cifs_readv_from_socket(server, &rdata->iov[1], - rdata->nr_iov - 1, len); - if (length < 0) - return length; - server->total_read += length; - } else { - length = 0; - } + length = rdata->read_into_pages(server, rdata, data_len); + if (length < 0) + return length; + server->total_read += length; rdata->bytes = length; cFYI(1, "total_read=%u buflen=%u remaining=%u", server->total_read, @@ -1567,6 +1531,12 @@ cifs_readv_callback(struct mid_q_entry *mid) struct cifs_readdata *rdata = mid->callback_data; struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; + struct smb_rqst rqst = { .rq_iov = &rdata->iov, + .rq_nvec = 1, + .rq_pages = rdata->pages, + .rq_npages = rdata->nr_pages, + .rq_pagesz = rdata->pagesz, + .rq_tailsz = rdata->tailsz }; cFYI(1, "%s: mid=%llu state=%d result=%d bytes=%u", __func__, mid->mid, mid->mid_state, rdata->result, rdata->bytes); @@ -1578,9 +1548,8 @@ cifs_readv_callback(struct mid_q_entry *mid) (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { int rc = 0; - rc = cifs_verify_signature(rdata->iov, rdata->nr_iov, - server, - mid->sequence_number + 1); + rc = cifs_verify_signature(&rqst, server, + mid->sequence_number + 1); if (rc) cERROR(1, "SMB signature verification returned " "error = %d", rc); @@ -1610,6 +1579,8 @@ cifs_async_readv(struct cifs_readdata *rdata) READ_REQ *smb = NULL; int wct; struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink); + struct smb_rqst rqst = { .rq_iov = &rdata->iov, + .rq_nvec = 1 }; cFYI(1, "%s: offset=%llu bytes=%u", __func__, rdata->offset, rdata->bytes); @@ -1632,7 +1603,7 @@ cifs_async_readv(struct cifs_readdata *rdata) smb->hdr.PidHigh = cpu_to_le16((__u16)(rdata->pid >> 16)); smb->AndXCommand = 0xFF; /* none */ - smb->Fid = rdata->cfile->netfid; + smb->Fid = rdata->cfile->fid.netfid; smb->OffsetLow = cpu_to_le32(rdata->offset & 0xFFFFFFFF); if (wct == 12) smb->OffsetHigh = cpu_to_le32(rdata->offset >> 32); @@ -1649,13 +1620,12 @@ cifs_async_readv(struct cifs_readdata *rdata) } /* 4 for RFC1001 length + 1 for BCC */ - rdata->iov[0].iov_base = smb; - rdata->iov[0].iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4; + rdata->iov.iov_base = smb; + rdata->iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4; kref_get(&rdata->refcount); - rc = cifs_call_async(tcon->ses->server, rdata->iov, 1, - cifs_readv_receive, cifs_readv_callback, - rdata, 0); + rc = cifs_call_async(tcon->ses->server, &rqst, cifs_readv_receive, + cifs_readv_callback, rdata, 0); if (rc == 0) cifs_stats_inc(&tcon->stats.cifs_stats.num_reads); @@ -1926,6 +1896,7 @@ cifs_writev_requeue(struct cifs_writedata *wdata) { int i, rc; struct inode *inode = wdata->cfile->dentry->d_inode; + struct TCP_Server_Info *server; for (i = 0; i < wdata->nr_pages; i++) { lock_page(wdata->pages[i]); @@ -1933,7 +1904,8 @@ cifs_writev_requeue(struct cifs_writedata *wdata) } do { - rc = cifs_async_writev(wdata); + server = tlink_tcon(wdata->cfile->tlink)->ses->server; + rc = server->ops->async_writev(wdata); } while (rc == -EAGAIN); for (i = 0; i < wdata->nr_pages; i++) { @@ -2053,11 +2025,12 @@ cifs_writev_callback(struct mid_q_entry *mid) int cifs_async_writev(struct cifs_writedata *wdata) { - int i, rc = -EACCES; + int rc = -EACCES; WRITE_REQ *smb = NULL; int wct; struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); - struct kvec *iov = NULL; + struct kvec iov; + struct smb_rqst rqst = { }; if (tcon->ses->capabilities & CAP_LARGE_FILES) { wct = 14; @@ -2073,18 +2046,11 @@ cifs_async_writev(struct cifs_writedata *wdata) if (rc) goto async_writev_out; - /* 1 iov per page + 1 for header */ - iov = kzalloc((wdata->nr_pages + 1) * sizeof(*iov), GFP_NOFS); - if (iov == NULL) { - rc = -ENOMEM; - goto async_writev_out; - } - smb->hdr.Pid = cpu_to_le16((__u16)wdata->pid); smb->hdr.PidHigh = cpu_to_le16((__u16)(wdata->pid >> 16)); smb->AndXCommand = 0xFF; /* none */ - smb->Fid = wdata->cfile->netfid; + smb->Fid = wdata->cfile->fid.netfid; smb->OffsetLow = cpu_to_le32(wdata->offset & 0xFFFFFFFF); if (wct == 14) smb->OffsetHigh = cpu_to_le32(wdata->offset >> 32); @@ -2096,18 +2062,15 @@ cifs_async_writev(struct cifs_writedata *wdata) cpu_to_le16(offsetof(struct smb_com_write_req, Data) - 4); /* 4 for RFC1001 length + 1 for BCC */ - iov[0].iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4 + 1; - iov[0].iov_base = smb; + iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4 + 1; + iov.iov_base = smb; - /* - * This function should marshal up the page array into the kvec - * array, reserving [0] for the header. It should kmap the pages - * and set the iov_len properly for each one. It may also set - * wdata->bytes too. - */ - cifs_kmap_lock(); - wdata->marshal_iov(iov, wdata); - cifs_kmap_unlock(); + rqst.rq_iov = &iov; + rqst.rq_nvec = 1; + rqst.rq_pages = wdata->pages; + rqst.rq_npages = wdata->nr_pages; + rqst.rq_pagesz = wdata->pagesz; + rqst.rq_tailsz = wdata->tailsz; cFYI(1, "async write at %llu %u bytes", wdata->offset, wdata->bytes); @@ -2123,32 +2086,26 @@ cifs_async_writev(struct cifs_writedata *wdata) (struct smb_com_writex_req *)smb; inc_rfc1001_len(&smbw->hdr, wdata->bytes + 5); put_bcc(wdata->bytes + 5, &smbw->hdr); - iov[0].iov_len += 4; /* pad bigger by four bytes */ + iov.iov_len += 4; /* pad bigger by four bytes */ } kref_get(&wdata->refcount); - rc = cifs_call_async(tcon->ses->server, iov, wdata->nr_pages + 1, - NULL, cifs_writev_callback, wdata, 0); + rc = cifs_call_async(tcon->ses->server, &rqst, NULL, + cifs_writev_callback, wdata, 0); if (rc == 0) cifs_stats_inc(&tcon->stats.cifs_stats.num_writes); else kref_put(&wdata->refcount, cifs_writedata_release); - /* send is done, unmap pages */ - for (i = 0; i < wdata->nr_pages; i++) - kunmap(wdata->pages[i]); - async_writev_out: cifs_small_buf_release(smb); - kfree(iov); return rc; } int CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms, - unsigned int *nbytes, struct kvec *iov, int n_vec, - const int long_op) + unsigned int *nbytes, struct kvec *iov, int n_vec) { int rc = -EACCES; WRITE_REQ *pSMB = NULL; @@ -2219,8 +2176,7 @@ CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms, iov[0].iov_len = smb_hdr_len + 8; - rc = SendReceive2(xid, tcon->ses, iov, n_vec + 1, &resp_buf_type, - long_op); + rc = SendReceive2(xid, tcon->ses, iov, n_vec + 1, &resp_buf_type, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_writes); if (rc) { cFYI(1, "Send error Write2 = %d", rc); @@ -2557,8 +2513,8 @@ CIFSSMBFlush(const unsigned int xid, struct cifs_tcon *tcon, int smb_file_id) int CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon, - const char *fromName, const char *toName, - const struct nls_table *nls_codepage, int remap) + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb) { int rc = 0; RENAME_REQ *pSMB = NULL; @@ -2566,6 +2522,7 @@ CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon, int bytes_returned; int name_len, name_len2; __u16 count; + int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; cFYI(1, "In CIFSSMBRename"); renameRetry: @@ -2580,9 +2537,9 @@ renameRetry: ATTR_DIRECTORY); if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { - name_len = - cifsConvertToUTF16((__le16 *) pSMB->OldFileName, fromName, - PATH_MAX, nls_codepage, remap); + name_len = cifsConvertToUTF16((__le16 *) pSMB->OldFileName, + from_name, PATH_MAX, + cifs_sb->local_nls, remap); name_len++; /* trailing null */ name_len *= 2; pSMB->OldFileName[name_len] = 0x04; /* pad */ @@ -2590,17 +2547,18 @@ renameRetry: pSMB->OldFileName[name_len + 1] = 0x00; name_len2 = cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2], - toName, PATH_MAX, nls_codepage, remap); + to_name, PATH_MAX, cifs_sb->local_nls, + remap); name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ; name_len2 *= 2; /* convert to bytes */ } else { /* BB improve the check for buffer overruns BB */ - name_len = strnlen(fromName, PATH_MAX); + name_len = strnlen(from_name, PATH_MAX); name_len++; /* trailing null */ - strncpy(pSMB->OldFileName, fromName, name_len); - name_len2 = strnlen(toName, PATH_MAX); + strncpy(pSMB->OldFileName, from_name, name_len); + name_len2 = strnlen(to_name, PATH_MAX); name_len2++; /* trailing null */ pSMB->OldFileName[name_len] = 0x04; /* 2nd buffer format */ - strncpy(&pSMB->OldFileName[name_len + 1], toName, name_len2); + strncpy(&pSMB->OldFileName[name_len + 1], to_name, name_len2); name_len2++; /* trailing null */ name_len2++; /* signature byte */ } @@ -2948,8 +2906,8 @@ createHardLinkRetry: int CIFSCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon, - const char *fromName, const char *toName, - const struct nls_table *nls_codepage, int remap) + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb) { int rc = 0; NT_RENAME_REQ *pSMB = NULL; @@ -2957,6 +2915,7 @@ CIFSCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon, int bytes_returned; int name_len, name_len2; __u16 count; + int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; cFYI(1, "In CIFSCreateHardLink"); winCreateHardLinkRetry: @@ -2976,8 +2935,8 @@ winCreateHardLinkRetry: if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len = - cifsConvertToUTF16((__le16 *) pSMB->OldFileName, fromName, - PATH_MAX, nls_codepage, remap); + cifsConvertToUTF16((__le16 *) pSMB->OldFileName, from_name, + PATH_MAX, cifs_sb->local_nls, remap); name_len++; /* trailing null */ name_len *= 2; @@ -2986,17 +2945,18 @@ winCreateHardLinkRetry: pSMB->OldFileName[name_len + 1] = 0x00; /* pad */ name_len2 = cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2], - toName, PATH_MAX, nls_codepage, remap); + to_name, PATH_MAX, cifs_sb->local_nls, + remap); name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ; name_len2 *= 2; /* convert to bytes */ } else { /* BB improve the check for buffer overruns BB */ - name_len = strnlen(fromName, PATH_MAX); + name_len = strnlen(from_name, PATH_MAX); name_len++; /* trailing null */ - strncpy(pSMB->OldFileName, fromName, name_len); - name_len2 = strnlen(toName, PATH_MAX); + strncpy(pSMB->OldFileName, from_name, name_len); + name_len2 = strnlen(to_name, PATH_MAX); name_len2++; /* trailing null */ pSMB->OldFileName[name_len] = 0x04; /* 2nd buffer format */ - strncpy(&pSMB->OldFileName[name_len + 1], toName, name_len2); + strncpy(&pSMB->OldFileName[name_len + 1], to_name, name_len2); name_len2++; /* trailing null */ name_len2++; /* signature byte */ } @@ -4254,10 +4214,9 @@ UnixQPathInfoRetry: /* xid, tcon, searchName and codepage are input parms, rest are returned */ int CIFSFindFirst(const unsigned int xid, struct cifs_tcon *tcon, - const char *searchName, - const struct nls_table *nls_codepage, + const char *searchName, struct cifs_sb_info *cifs_sb, __u16 *pnetfid, __u16 search_flags, - struct cifs_search_info *psrch_inf, int remap, const char dirsep) + struct cifs_search_info *psrch_inf, bool msearch) { /* level 257 SMB_ */ TRANSACTION2_FFIRST_REQ *pSMB = NULL; @@ -4265,8 +4224,9 @@ CIFSFindFirst(const unsigned int xid, struct cifs_tcon *tcon, T2_FFIRST_RSP_PARMS *parms; int rc = 0; int bytes_returned = 0; - int name_len; + int name_len, remap; __u16 params, byte_count; + struct nls_table *nls_codepage; cFYI(1, "In FindFirst for %s", searchName); @@ -4276,6 +4236,9 @@ findFirstRetry: if (rc) return rc; + nls_codepage = cifs_sb->local_nls; + remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len = cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName, @@ -4284,24 +4247,29 @@ findFirstRetry: it got remapped to 0xF03A as if it were part of the directory name instead of a wildcard */ name_len *= 2; - pSMB->FileName[name_len] = dirsep; - pSMB->FileName[name_len+1] = 0; - pSMB->FileName[name_len+2] = '*'; - pSMB->FileName[name_len+3] = 0; - name_len += 4; /* now the trailing null */ - pSMB->FileName[name_len] = 0; /* null terminate just in case */ - pSMB->FileName[name_len+1] = 0; - name_len += 2; + if (msearch) { + pSMB->FileName[name_len] = CIFS_DIR_SEP(cifs_sb); + pSMB->FileName[name_len+1] = 0; + pSMB->FileName[name_len+2] = '*'; + pSMB->FileName[name_len+3] = 0; + name_len += 4; /* now the trailing null */ + /* null terminate just in case */ + pSMB->FileName[name_len] = 0; + pSMB->FileName[name_len+1] = 0; + name_len += 2; + } } else { /* BB add check for overrun of SMB buf BB */ name_len = strnlen(searchName, PATH_MAX); /* BB fix here and in unicode clause above ie if (name_len > buffersize-header) free buffer exit; BB */ strncpy(pSMB->FileName, searchName, name_len); - pSMB->FileName[name_len] = dirsep; - pSMB->FileName[name_len+1] = '*'; - pSMB->FileName[name_len+2] = 0; - name_len += 3; + if (msearch) { + pSMB->FileName[name_len] = CIFS_DIR_SEP(cifs_sb); + pSMB->FileName[name_len+1] = '*'; + pSMB->FileName[name_len+2] = 0; + name_len += 3; + } } params = 12 + name_len /* includes null */ ; @@ -4389,7 +4357,8 @@ findFirstRetry: psrch_inf->last_entry = psrch_inf->srch_entries_start + lnoff; - *pnetfid = parms->SearchHandle; + if (pnetfid) + *pnetfid = parms->SearchHandle; } else { cifs_buf_release(pSMB); } @@ -5417,16 +5386,16 @@ QFSPosixRetry: } -/* We can not use write of zero bytes trick to - set file size due to need for large file support. Also note that - this SetPathInfo is preferred to SetFileInfo based method in next - routine which is only needed to work around a sharing violation bug - in Samba which this routine can run into */ - +/* + * We can not use write of zero bytes trick to set file size due to need for + * large file support. Also note that this SetPathInfo is preferred to + * SetFileInfo based method in next routine which is only needed to work around + * a sharing violation bugin Samba which this routine can run into. + */ int CIFSSMBSetEOF(const unsigned int xid, struct cifs_tcon *tcon, - const char *fileName, __u64 size, bool SetAllocation, - const struct nls_table *nls_codepage, int remap) + const char *file_name, __u64 size, struct cifs_sb_info *cifs_sb, + bool set_allocation) { struct smb_com_transaction2_spi_req *pSMB = NULL; struct smb_com_transaction2_spi_rsp *pSMBr = NULL; @@ -5434,6 +5403,8 @@ CIFSSMBSetEOF(const unsigned int xid, struct cifs_tcon *tcon, int name_len; int rc = 0; int bytes_returned = 0; + int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; + __u16 params, byte_count, data_count, param_offset, offset; cFYI(1, "In SetEOF"); @@ -5445,14 +5416,14 @@ SetEOFRetry: if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len = - cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName, - PATH_MAX, nls_codepage, remap); + cifsConvertToUTF16((__le16 *) pSMB->FileName, file_name, + PATH_MAX, cifs_sb->local_nls, remap); name_len++; /* trailing null */ name_len *= 2; } else { /* BB improve the check for buffer overruns BB */ - name_len = strnlen(fileName, PATH_MAX); + name_len = strnlen(file_name, PATH_MAX); name_len++; /* trailing null */ - strncpy(pSMB->FileName, fileName, name_len); + strncpy(pSMB->FileName, file_name, name_len); } params = 6 + name_len; data_count = sizeof(struct file_end_of_file_info); @@ -5466,7 +5437,7 @@ SetEOFRetry: param_offset = offsetof(struct smb_com_transaction2_spi_req, InformationLevel) - 4; offset = param_offset + params; - if (SetAllocation) { + if (set_allocation) { if (tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU) pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_ALLOCATION_INFO2); @@ -5513,8 +5484,8 @@ SetEOFRetry: } int -CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon, __u64 size, - __u16 fid, __u32 pid_of_opener, bool SetAllocation) +CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile, __u64 size, bool set_allocation) { struct smb_com_transaction2_sfi_req *pSMB = NULL; struct file_end_of_file_info *parm_data; @@ -5528,8 +5499,8 @@ CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon, __u64 size, if (rc) return rc; - pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener); - pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16)); + pSMB->hdr.Pid = cpu_to_le16((__u16)cfile->pid); + pSMB->hdr.PidHigh = cpu_to_le16((__u16)(cfile->pid >> 16)); params = 6; pSMB->MaxSetupCount = 0; @@ -5558,8 +5529,8 @@ CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon, __u64 size, + offset); pSMB->DataOffset = cpu_to_le16(offset); parm_data->FileSize = cpu_to_le64(size); - pSMB->Fid = fid; - if (SetAllocation) { + pSMB->Fid = cfile->fid.netfid; + if (set_allocation) { if (tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU) pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_ALLOCATION_INFO2); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 6df6fa1..5c670b9 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -67,6 +67,7 @@ enum { /* Mount options that take no arguments */ Opt_user_xattr, Opt_nouser_xattr, Opt_forceuid, Opt_noforceuid, + Opt_forcegid, Opt_noforcegid, Opt_noblocksend, Opt_noautotune, Opt_hard, Opt_soft, Opt_perm, Opt_noperm, Opt_mapchars, Opt_nomapchars, Opt_sfu, @@ -82,8 +83,7 @@ enum { Opt_serverino, Opt_noserverino, Opt_rwpidforward, Opt_cifsacl, Opt_nocifsacl, Opt_acl, Opt_noacl, Opt_locallease, - Opt_sign, Opt_seal, Opt_direct, - Opt_strictcache, Opt_noac, + Opt_sign, Opt_seal, Opt_noac, Opt_fsc, Opt_mfsymlinks, Opt_multiuser, Opt_sloppy, @@ -118,6 +118,8 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_nouser_xattr, "nouser_xattr" }, { Opt_forceuid, "forceuid" }, { Opt_noforceuid, "noforceuid" }, + { Opt_forcegid, "forcegid" }, + { Opt_noforcegid, "noforcegid" }, { Opt_noblocksend, "noblocksend" }, { Opt_noautotune, "noautotune" }, { Opt_hard, "hard" }, @@ -160,10 +162,6 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_locallease, "locallease" }, { Opt_sign, "sign" }, { Opt_seal, "seal" }, - { Opt_direct, "direct" }, - { Opt_direct, "directio" }, - { Opt_direct, "forcedirectio" }, - { Opt_strictcache, "strictcache" }, { Opt_noac, "noac" }, { Opt_fsc, "fsc" }, { Opt_mfsymlinks, "mfsymlinks" }, @@ -277,6 +275,7 @@ static const match_table_t cifs_cacheflavor_tokens = { static const match_table_t cifs_smb_version_tokens = { { Smb_1, SMB1_VERSION_STRING }, { Smb_21, SMB21_VERSION_STRING }, + { Smb_30, SMB30_VERSION_STRING }, }; static int ip_connect(struct TCP_Server_Info *server); @@ -819,6 +818,10 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) cifs_dump_mem("Bad SMB: ", buf, min_t(unsigned int, server->total_read, 48)); + if (server->ops->is_status_pending && + server->ops->is_status_pending(buf, server, length)) + return -1; + if (!mid) return length; @@ -1075,6 +1078,10 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol) vol->ops = &smb21_operations; vol->vals = &smb21_values; break; + case Smb_30: + vol->ops = &smb21_operations; /* currently identical with 2.1 */ + vol->vals = &smb30_values; + break; #endif default: cERROR(1, "Unknown vers= option specified: %s", value); @@ -1101,8 +1108,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, char *string = NULL; char *tmp_end, *value; char delim; - bool cache_specified = false; - static bool cache_warned = false; separator[0] = ','; separator[1] = 0; @@ -1134,6 +1139,9 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, /* default to using server inode numbers where available */ vol->server_ino = 1; + /* default is to use strict cifs caching semantics */ + vol->strict_io = true; + vol->actimeo = CIFS_DEF_ACTIMEO; /* FIXME: add autonegotiation -- for now, SMB1 is default */ @@ -1190,6 +1198,12 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, case Opt_noforceuid: override_uid = 0; break; + case Opt_forcegid: + override_gid = 1; + break; + case Opt_noforcegid: + override_gid = 0; + break; case Opt_noblocksend: vol->noblocksnd = 1; break; @@ -1317,22 +1331,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, */ vol->seal = 1; break; - case Opt_direct: - cache_specified = true; - vol->direct_io = true; - vol->strict_io = false; - cERROR(1, "The \"directio\" option will be removed in " - "3.7. Please switch to the \"cache=none\" " - "option."); - break; - case Opt_strictcache: - cache_specified = true; - vol->direct_io = false; - vol->strict_io = true; - cERROR(1, "The \"strictcache\" option will be removed " - "in 3.7. Please switch to the \"cache=strict\" " - "option."); - break; case Opt_noac: printk(KERN_WARNING "CIFS: Mount option noac not " "supported. Instead set " @@ -1676,8 +1674,13 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, if (string == NULL) goto out_nomem; - if (strnicmp(string, "TCP_NODELAY", 11) == 0) + if (strnicmp(string, "TCP_NODELAY", 11) == 0) { + printk(KERN_WARNING "CIFS: the " + "sockopt=TCP_NODELAY option has been " + "deprecated and will be removed " + "in 3.9\n"); vol->sockopt_tcp_nodelay = 1; + } break; case Opt_netbiosname: string = match_strdup(args); @@ -1762,7 +1765,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, goto cifs_parse_mount_err; break; case Opt_cache: - cache_specified = true; string = match_strdup(args); if (string == NULL) goto out_nomem; @@ -1813,14 +1815,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, printk(KERN_NOTICE "CIFS: ignoring forcegid mount option " "specified with no gid= option.\n"); - /* FIXME: remove this block in 3.7 */ - if (!cache_specified && !cache_warned) { - cache_warned = true; - printk(KERN_NOTICE "CIFS: no cache= option specified, using " - "\"cache=loose\". This default will change " - "to \"cache=strict\" in 3.7.\n"); - } - kfree(mountdata_copy); return 0; @@ -2636,6 +2630,7 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) tcon->retry = volume_info->retry; tcon->nocase = volume_info->nocase; tcon->local_lease = volume_info->local_lease; + INIT_LIST_HEAD(&tcon->pending_opens); spin_lock(&cifs_tcp_ses_lock); list_add(&tcon->tcon_list, &ses->tcon_list); @@ -3261,146 +3256,6 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, "mount option supported"); } -/* - * When the server supports very large reads and writes via POSIX extensions, - * we can allow up to 2^24-1, minus the size of a READ/WRITE_AND_X header, not - * including the RFC1001 length. - * - * Note that this might make for "interesting" allocation problems during - * writeback however as we have to allocate an array of pointers for the - * pages. A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096. - * - * For reads, there is a similar problem as we need to allocate an array - * of kvecs to handle the receive, though that should only need to be done - * once. - */ -#define CIFS_MAX_WSIZE ((1<<24) - 1 - sizeof(WRITE_REQ) + 4) -#define CIFS_MAX_RSIZE ((1<<24) - sizeof(READ_RSP) + 4) - -/* - * When the server doesn't allow large posix writes, only allow a rsize/wsize - * of 2^17-1 minus the size of the call header. That allows for a read or - * write up to the maximum size described by RFC1002. - */ -#define CIFS_MAX_RFC1002_WSIZE ((1<<17) - 1 - sizeof(WRITE_REQ) + 4) -#define CIFS_MAX_RFC1002_RSIZE ((1<<17) - 1 - sizeof(READ_RSP) + 4) - -/* - * The default wsize is 1M. find_get_pages seems to return a maximum of 256 - * pages in a single call. With PAGE_CACHE_SIZE == 4k, this means we can fill - * a single wsize request with a single call. - */ -#define CIFS_DEFAULT_IOSIZE (1024 * 1024) - -/* - * Windows only supports a max of 60kb reads and 65535 byte writes. Default to - * those values when posix extensions aren't in force. In actuality here, we - * use 65536 to allow for a write that is a multiple of 4k. Most servers seem - * to be ok with the extra byte even though Windows doesn't send writes that - * are that large. - * - * Citation: - * - * http://blogs.msdn.com/b/openspecification/archive/2009/04/10/smb-maximum-transmit-buffer-size-and-performance-tuning.aspx - */ -#define CIFS_DEFAULT_NON_POSIX_RSIZE (60 * 1024) -#define CIFS_DEFAULT_NON_POSIX_WSIZE (65536) - -/* - * On hosts with high memory, we can't currently support wsize/rsize that are - * larger than we can kmap at once. Cap the rsize/wsize at - * LAST_PKMAP * PAGE_SIZE. We'll never be able to fill a read or write request - * larger than that anyway. - */ -#ifdef CONFIG_HIGHMEM -#define CIFS_KMAP_SIZE_LIMIT (LAST_PKMAP * PAGE_CACHE_SIZE) -#else /* CONFIG_HIGHMEM */ -#define CIFS_KMAP_SIZE_LIMIT (1<<24) -#endif /* CONFIG_HIGHMEM */ - -static unsigned int -cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) -{ - __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability); - struct TCP_Server_Info *server = tcon->ses->server; - unsigned int wsize; - - /* start with specified wsize, or default */ - if (pvolume_info->wsize) - wsize = pvolume_info->wsize; - else if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_WRITE_CAP)) - wsize = CIFS_DEFAULT_IOSIZE; - else - wsize = CIFS_DEFAULT_NON_POSIX_WSIZE; - - /* can server support 24-bit write sizes? (via UNIX extensions) */ - if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP)) - wsize = min_t(unsigned int, wsize, CIFS_MAX_RFC1002_WSIZE); - - /* - * no CAP_LARGE_WRITE_X or is signing enabled without CAP_UNIX set? - * Limit it to max buffer offered by the server, minus the size of the - * WRITEX header, not including the 4 byte RFC1001 length. - */ - if (!(server->capabilities & CAP_LARGE_WRITE_X) || - (!(server->capabilities & CAP_UNIX) && - (server->sec_mode & (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED)))) - wsize = min_t(unsigned int, wsize, - server->maxBuf - sizeof(WRITE_REQ) + 4); - - /* limit to the amount that we can kmap at once */ - wsize = min_t(unsigned int, wsize, CIFS_KMAP_SIZE_LIMIT); - - /* hard limit of CIFS_MAX_WSIZE */ - wsize = min_t(unsigned int, wsize, CIFS_MAX_WSIZE); - - return wsize; -} - -static unsigned int -cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) -{ - __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability); - struct TCP_Server_Info *server = tcon->ses->server; - unsigned int rsize, defsize; - - /* - * Set default value... - * - * HACK alert! Ancient servers have very small buffers. Even though - * MS-CIFS indicates that servers are only limited by the client's - * bufsize for reads, testing against win98se shows that it throws - * INVALID_PARAMETER errors if you try to request too large a read. - * OS/2 just sends back short reads. - * - * If the server doesn't advertise CAP_LARGE_READ_X, then assume that - * it can't handle a read request larger than its MaxBufferSize either. - */ - if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_READ_CAP)) - defsize = CIFS_DEFAULT_IOSIZE; - else if (server->capabilities & CAP_LARGE_READ_X) - defsize = CIFS_DEFAULT_NON_POSIX_RSIZE; - else - defsize = server->maxBuf - sizeof(READ_RSP); - - rsize = pvolume_info->rsize ? pvolume_info->rsize : defsize; - - /* - * no CAP_LARGE_READ_X? Then MS-CIFS states that we must limit this to - * the client's MaxBufferSize. - */ - if (!(server->capabilities & CAP_LARGE_READ_X)) - rsize = min_t(unsigned int, CIFSMaxBufSize, rsize); - - /* limit to the amount that we can kmap at once */ - rsize = min_t(unsigned int, rsize, CIFS_KMAP_SIZE_LIMIT); - - /* hard limit of CIFS_MAX_RSIZE */ - rsize = min_t(unsigned int, rsize, CIFS_MAX_RSIZE); - - return rsize; -} - static void cleanup_volume_info_contents(struct smb_vol *volume_info) { @@ -3651,8 +3506,8 @@ try_mount_again: if (!tcon->ipc && server->ops->qfs_tcon) server->ops->qfs_tcon(xid, tcon); - cifs_sb->wsize = cifs_negotiate_wsize(tcon, volume_info); - cifs_sb->rsize = cifs_negotiate_rsize(tcon, volume_info); + cifs_sb->wsize = server->ops->negotiate_wsize(tcon, volume_info); + cifs_sb->rsize = server->ops->negotiate_rsize(tcon, volume_info); /* tune readahead according to rsize */ cifs_sb->bdi.ra_pages = cifs_sb->rsize / PAGE_CACHE_SIZE; diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 781025b..7c0a812 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -160,17 +160,18 @@ check_name(struct dentry *direntry) static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, struct tcon_link *tlink, unsigned oflags, umode_t mode, - __u32 *oplock, __u16 *fileHandle, int *created) + __u32 *oplock, struct cifs_fid *fid, int *created) { int rc = -ENOENT; int create_options = CREATE_NOT_DIR; - int desiredAccess; + int desired_access; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_tcon *tcon = tlink_tcon(tlink); char *full_path = NULL; FILE_ALL_INFO *buf = NULL; struct inode *newinode = NULL; int disposition; + struct TCP_Server_Info *server = tcon->ses->server; *oplock = 0; if (tcon->ses->server->oplocks) @@ -185,8 +186,8 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, if (tcon->unix_ext && cap_unix(tcon->ses) && !tcon->broken_posix_open && (CIFS_UNIX_POSIX_PATH_OPS_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))) { - rc = cifs_posix_open(full_path, &newinode, - inode->i_sb, mode, oflags, oplock, fileHandle, xid); + rc = cifs_posix_open(full_path, &newinode, inode->i_sb, mode, + oflags, oplock, &fid->netfid, xid); switch (rc) { case 0: if (newinode == NULL) { @@ -202,7 +203,7 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, * close it and proceed as if it were a normal * lookup. */ - CIFSSMBClose(xid, tcon, *fileHandle); + CIFSSMBClose(xid, tcon, fid->netfid); goto cifs_create_get_file_info; } /* success, no need to query */ @@ -244,11 +245,11 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, */ } - desiredAccess = 0; + desired_access = 0; if (OPEN_FMODE(oflags) & FMODE_READ) - desiredAccess |= GENERIC_READ; /* is this too little? */ + desired_access |= GENERIC_READ; /* is this too little? */ if (OPEN_FMODE(oflags) & FMODE_WRITE) - desiredAccess |= GENERIC_WRITE; + desired_access |= GENERIC_WRITE; disposition = FILE_OVERWRITE_IF; if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) @@ -260,8 +261,15 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, else cFYI(1, "Create flag not set in create function"); - /* BB add processing to set equivalent of mode - e.g. via CreateX with - ACLs */ + /* + * BB add processing to set equivalent of mode - e.g. via CreateX with + * ACLs + */ + + if (!server->ops->open) { + rc = -ENOSYS; + goto out; + } buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); if (buf == NULL) { @@ -279,28 +287,18 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, if (backup_cred(cifs_sb)) create_options |= CREATE_OPEN_BACKUP_INTENT; - if (tcon->ses->capabilities & CAP_NT_SMBS) - rc = CIFSSMBOpen(xid, tcon, full_path, disposition, - desiredAccess, create_options, - fileHandle, oplock, buf, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - else - rc = -EIO; /* no NT SMB support fall into legacy open below */ - - if (rc == -EIO) { - /* old server, retry the open legacy style */ - rc = SMBLegacyOpen(xid, tcon, full_path, disposition, - desiredAccess, create_options, - fileHandle, oplock, buf, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - } + rc = server->ops->open(xid, tcon, full_path, disposition, + desired_access, create_options, fid, oplock, + buf, cifs_sb); if (rc) { cFYI(1, "cifs_create returned 0x%x", rc); goto out; } - /* If Open reported that we actually created a file - then we now have to set the mode if possible */ + /* + * If Open reported that we actually created a file then we now have to + * set the mode if possible. + */ if ((tcon->unix_ext) && (*oplock & CIFS_CREATE_ACTION)) { struct cifs_unix_set_info_args args = { .mode = mode, @@ -321,11 +319,13 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, args.uid = NO_CHANGE_64; args.gid = NO_CHANGE_64; } - CIFSSMBUnixSetFileInfo(xid, tcon, &args, *fileHandle, - current->tgid); + CIFSSMBUnixSetFileInfo(xid, tcon, &args, fid->netfid, + current->tgid); } else { - /* BB implement mode setting via Windows security - descriptors e.g. */ + /* + * BB implement mode setting via Windows security + * descriptors e.g. + */ /* CIFSSMBWinSetPerms(xid,tcon,path,mode,-1,-1,nls);*/ /* Could set r/o dos attribute if mode & 0222 == 0 */ @@ -334,12 +334,14 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, cifs_create_get_file_info: /* server might mask mode so we have to query for it */ if (tcon->unix_ext) - rc = cifs_get_inode_info_unix(&newinode, full_path, - inode->i_sb, xid); + rc = cifs_get_inode_info_unix(&newinode, full_path, inode->i_sb, + xid); else { - rc = cifs_get_inode_info(&newinode, full_path, buf, - inode->i_sb, xid, fileHandle); + rc = cifs_get_inode_info(&newinode, full_path, buf, inode->i_sb, + xid, &fid->netfid); if (newinode) { + if (server->ops->set_lease_key) + server->ops->set_lease_key(newinode, fid); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) newinode->i_mode = mode; if ((*oplock & CIFS_CREATE_ACTION) && @@ -356,7 +358,8 @@ cifs_create_get_file_info: cifs_create_set_dentry: if (rc != 0) { cFYI(1, "Create worked, get_inode_info failed rc = %d", rc); - CIFSSMBClose(xid, tcon, *fileHandle); + if (server->ops->close) + server->ops->close(xid, tcon, fid); goto out; } d_drop(direntry); @@ -377,11 +380,14 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, unsigned int xid; struct tcon_link *tlink; struct cifs_tcon *tcon; - __u16 fileHandle; + struct TCP_Server_Info *server; + struct cifs_fid fid; + struct cifs_pending_open open; __u32 oplock; - struct cifsFileInfo *pfile_info; + struct cifsFileInfo *file_info; - /* Posix open is only called (at lookup time) for file create now. For + /* + * Posix open is only called (at lookup time) for file create now. For * opens (rather than creates), because we do not know if it is a file * or directory yet, and current Samba no longer allows us to do posix * open on dirs, we could end up wasting an open call on what turns out @@ -413,22 +419,34 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, goto out_free_xid; tcon = tlink_tcon(tlink); + server = tcon->ses->server; + + if (server->ops->new_lease_key) + server->ops->new_lease_key(&fid); + + cifs_add_pending_open(&fid, tlink, &open); rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode, - &oplock, &fileHandle, opened); + &oplock, &fid, opened); - if (rc) + if (rc) { + cifs_del_pending_open(&open); goto out; + } rc = finish_open(file, direntry, generic_file_open, opened); if (rc) { - CIFSSMBClose(xid, tcon, fileHandle); + if (server->ops->close) + server->ops->close(xid, tcon, &fid); + cifs_del_pending_open(&open); goto out; } - pfile_info = cifs_new_fileinfo(fileHandle, file, tlink, oplock); - if (pfile_info == NULL) { - CIFSSMBClose(xid, tcon, fileHandle); + file_info = cifs_new_fileinfo(&fid, file, tlink, oplock); + if (file_info == NULL) { + if (server->ops->close) + server->ops->close(xid, tcon, &fid); + cifs_del_pending_open(&open); rc = -ENOMEM; } @@ -453,7 +471,9 @@ int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode, */ unsigned oflags = O_EXCL | O_CREAT | O_RDWR; struct tcon_link *tlink; - __u16 fileHandle; + struct cifs_tcon *tcon; + struct TCP_Server_Info *server; + struct cifs_fid fid; __u32 oplock; int created = FILE_CREATED; @@ -465,10 +485,16 @@ int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode, if (IS_ERR(tlink)) goto out_free_xid; + tcon = tlink_tcon(tlink); + server = tcon->ses->server; + + if (server->ops->new_lease_key) + server->ops->new_lease_key(&fid); + rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode, - &oplock, &fileHandle, &created); - if (!rc) - CIFSSMBClose(xid, tlink_tcon(tlink), fileHandle); + &oplock, &fid, &created); + if (!rc && server->ops->close) + server->ops->close(xid, tcon, &fid); cifs_put_tlink(tlink); out_free_xid: diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 9154192..edb25b4 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -169,16 +169,20 @@ posix_open_ret: static int cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, - struct cifs_tcon *tcon, unsigned int f_flags, __u32 *poplock, - __u16 *pnetfid, unsigned int xid) + struct cifs_tcon *tcon, unsigned int f_flags, __u32 *oplock, + struct cifs_fid *fid, unsigned int xid) { int rc; - int desiredAccess; + int desired_access; int disposition; int create_options = CREATE_NOT_DIR; FILE_ALL_INFO *buf; + struct TCP_Server_Info *server = tcon->ses->server; + + if (!server->ops->open) + return -ENOSYS; - desiredAccess = cifs_convert_flags(f_flags); + desired_access = cifs_convert_flags(f_flags); /********************************************************************* * open flag mapping table: @@ -215,16 +219,9 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, if (backup_cred(cifs_sb)) create_options |= CREATE_OPEN_BACKUP_INTENT; - if (tcon->ses->capabilities & CAP_NT_SMBS) - rc = CIFSSMBOpen(xid, tcon, full_path, disposition, - desiredAccess, create_options, pnetfid, poplock, buf, - cifs_sb->local_nls, cifs_sb->mnt_cifs_flags - & CIFS_MOUNT_MAP_SPECIAL_CHR); - else - rc = SMBLegacyOpen(xid, tcon, full_path, disposition, - desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf, - cifs_sb->local_nls, cifs_sb->mnt_cifs_flags - & CIFS_MOUNT_MAP_SPECIAL_CHR); + rc = server->ops->open(xid, tcon, full_path, disposition, + desired_access, create_options, fid, oplock, buf, + cifs_sb); if (rc) goto out; @@ -234,7 +231,7 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, xid); else rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb, - xid, pnetfid); + xid, &fid->netfid); out: kfree(buf); @@ -242,48 +239,62 @@ out: } struct cifsFileInfo * -cifs_new_fileinfo(__u16 fileHandle, struct file *file, +cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, struct tcon_link *tlink, __u32 oplock) { struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; - struct cifsInodeInfo *pCifsInode = CIFS_I(inode); - struct cifsFileInfo *pCifsFile; - - pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); - if (pCifsFile == NULL) - return pCifsFile; - - pCifsFile->count = 1; - pCifsFile->netfid = fileHandle; - pCifsFile->pid = current->tgid; - pCifsFile->uid = current_fsuid(); - pCifsFile->dentry = dget(dentry); - pCifsFile->f_flags = file->f_flags; - pCifsFile->invalidHandle = false; - pCifsFile->tlink = cifs_get_tlink(tlink); - mutex_init(&pCifsFile->fh_mutex); - INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break); - INIT_LIST_HEAD(&pCifsFile->llist); + struct cifsInodeInfo *cinode = CIFS_I(inode); + struct cifsFileInfo *cfile; + struct cifs_fid_locks *fdlocks; + struct cifs_tcon *tcon = tlink_tcon(tlink); + + cfile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); + if (cfile == NULL) + return cfile; + + fdlocks = kzalloc(sizeof(struct cifs_fid_locks), GFP_KERNEL); + if (!fdlocks) { + kfree(cfile); + return NULL; + } + + INIT_LIST_HEAD(&fdlocks->locks); + fdlocks->cfile = cfile; + cfile->llist = fdlocks; + down_write(&cinode->lock_sem); + list_add(&fdlocks->llist, &cinode->llist); + up_write(&cinode->lock_sem); + + cfile->count = 1; + cfile->pid = current->tgid; + cfile->uid = current_fsuid(); + cfile->dentry = dget(dentry); + cfile->f_flags = file->f_flags; + cfile->invalidHandle = false; + cfile->tlink = cifs_get_tlink(tlink); + INIT_WORK(&cfile->oplock_break, cifs_oplock_break); + mutex_init(&cfile->fh_mutex); spin_lock(&cifs_file_list_lock); - list_add(&pCifsFile->tlist, &(tlink_tcon(tlink)->openFileList)); + if (fid->pending_open->oplock != CIFS_OPLOCK_NO_CHANGE) + oplock = fid->pending_open->oplock; + list_del(&fid->pending_open->olist); + + tlink_tcon(tlink)->ses->server->ops->set_fid(cfile, fid, oplock); + + list_add(&cfile->tlist, &tcon->openFileList); /* if readable file instance put first in list*/ if (file->f_mode & FMODE_READ) - list_add(&pCifsFile->flist, &pCifsInode->openFileList); + list_add(&cfile->flist, &cinode->openFileList); else - list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList); + list_add_tail(&cfile->flist, &cinode->openFileList); spin_unlock(&cifs_file_list_lock); - cifs_set_oplock_level(pCifsInode, oplock); - pCifsInode->can_cache_brlcks = pCifsInode->clientCanCacheAll; - - file->private_data = pCifsFile; - return pCifsFile; + file->private_data = cfile; + return cfile; } -static void cifs_del_lock_waiters(struct cifsLockInfo *lock); - struct cifsFileInfo * cifsFileInfo_get(struct cifsFileInfo *cifs_file) { @@ -302,9 +313,12 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) { struct inode *inode = cifs_file->dentry->d_inode; struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink); + struct TCP_Server_Info *server = tcon->ses->server; struct cifsInodeInfo *cifsi = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsLockInfo *li, *tmp; + struct cifs_fid fid; + struct cifs_pending_open open; spin_lock(&cifs_file_list_lock); if (--cifs_file->count > 0) { @@ -312,6 +326,12 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) return; } + if (server->ops->get_lease_key) + server->ops->get_lease_key(inode, &fid); + + /* store open in pending opens to make sure we don't miss lease break */ + cifs_add_pending_open_locked(&fid, cifs_file->tlink, &open); + /* remove it from the lists */ list_del(&cifs_file->flist); list_del(&cifs_file->tlist); @@ -319,13 +339,13 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) if (list_empty(&cifsi->openFileList)) { cFYI(1, "closing last open instance for inode %p", cifs_file->dentry->d_inode); - - /* in strict cache mode we need invalidate mapping on the last - close because it may cause a error when we open this file - again and get at least level II oplock */ + /* + * In strict cache mode we need invalidate mapping on the last + * close because it may cause a error when we open this file + * again and get at least level II oplock. + */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) CIFS_I(inode)->invalid_mapping = true; - cifs_set_oplock_level(cifsi, 0); } spin_unlock(&cifs_file_list_lock); @@ -333,23 +353,30 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) cancel_work_sync(&cifs_file->oplock_break); if (!tcon->need_reconnect && !cifs_file->invalidHandle) { + struct TCP_Server_Info *server = tcon->ses->server; unsigned int xid; - int rc; + xid = get_xid(); - rc = CIFSSMBClose(xid, tcon, cifs_file->netfid); - free_xid(xid); + if (server->ops->close) + server->ops->close(xid, tcon, &cifs_file->fid); + _free_xid(xid); } - /* Delete any outstanding lock records. We'll lose them when the file + cifs_del_pending_open(&open); + + /* + * Delete any outstanding lock records. We'll lose them when the file * is closed anyway. */ - mutex_lock(&cifsi->lock_mutex); - list_for_each_entry_safe(li, tmp, &cifs_file->llist, llist) { + down_write(&cifsi->lock_sem); + list_for_each_entry_safe(li, tmp, &cifs_file->llist->locks, llist) { list_del(&li->llist); cifs_del_lock_waiters(li); kfree(li); } - mutex_unlock(&cifsi->lock_mutex); + list_del(&cifs_file->llist->llist); + kfree(cifs_file->llist); + up_write(&cifsi->lock_sem); cifs_put_tlink(cifs_file->tlink); dput(cifs_file->dentry); @@ -357,17 +384,20 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) } int cifs_open(struct inode *inode, struct file *file) + { int rc = -EACCES; unsigned int xid; __u32 oplock; struct cifs_sb_info *cifs_sb; + struct TCP_Server_Info *server; struct cifs_tcon *tcon; struct tcon_link *tlink; - struct cifsFileInfo *pCifsFile = NULL; + struct cifsFileInfo *cfile = NULL; char *full_path = NULL; bool posix_open_ok = false; - __u16 netfid; + struct cifs_fid fid; + struct cifs_pending_open open; xid = get_xid(); @@ -378,6 +408,7 @@ int cifs_open(struct inode *inode, struct file *file) return PTR_ERR(tlink); } tcon = tlink_tcon(tlink); + server = tcon->ses->server; full_path = build_path_from_dentry(file->f_path.dentry); if (full_path == NULL) { @@ -388,7 +419,7 @@ int cifs_open(struct inode *inode, struct file *file) cFYI(1, "inode = 0x%p file flags are 0x%x for %s", inode, file->f_flags, full_path); - if (tcon->ses->server->oplocks) + if (server->oplocks) oplock = REQ_OPLOCK; else oplock = 0; @@ -399,7 +430,7 @@ int cifs_open(struct inode *inode, struct file *file) /* can not refresh inode info since size could be stale */ rc = cifs_posix_open(full_path, &inode, inode->i_sb, cifs_sb->mnt_file_mode /* ignored */, - file->f_flags, &oplock, &netfid, xid); + file->f_flags, &oplock, &fid.netfid, xid); if (rc == 0) { cFYI(1, "posix open succeeded"); posix_open_ok = true; @@ -415,20 +446,34 @@ int cifs_open(struct inode *inode, struct file *file) } else if ((rc != -EIO) && (rc != -EREMOTE) && (rc != -EOPNOTSUPP)) /* path not found or net err */ goto out; - /* else fallthrough to retry open the old way on network i/o - or DFS errors */ + /* + * Else fallthrough to retry open the old way on network i/o + * or DFS errors. + */ } + if (server->ops->get_lease_key) + server->ops->get_lease_key(inode, &fid); + + cifs_add_pending_open(&fid, tlink, &open); + if (!posix_open_ok) { + if (server->ops->get_lease_key) + server->ops->get_lease_key(inode, &fid); + rc = cifs_nt_open(full_path, inode, cifs_sb, tcon, - file->f_flags, &oplock, &netfid, xid); - if (rc) + file->f_flags, &oplock, &fid, xid); + if (rc) { + cifs_del_pending_open(&open); goto out; + } } - pCifsFile = cifs_new_fileinfo(netfid, file, tlink, oplock); - if (pCifsFile == NULL) { - CIFSSMBClose(xid, tcon, netfid); + cfile = cifs_new_fileinfo(&fid, file, tlink, oplock); + if (cfile == NULL) { + if (server->ops->close) + server->ops->close(xid, tcon, &fid); + cifs_del_pending_open(&open); rc = -ENOMEM; goto out; } @@ -436,8 +481,10 @@ int cifs_open(struct inode *inode, struct file *file) cifs_fscache_set_inode_cookie(inode, file); if ((oplock & CIFS_CREATE_ACTION) && !posix_open_ok && tcon->unix_ext) { - /* time to set mode which we can not set earlier due to - problems creating new read-only files */ + /* + * Time to set mode which we can not set earlier due to + * problems creating new read-only files. + */ struct cifs_unix_set_info_args args = { .mode = inode->i_mode, .uid = NO_CHANGE_64, @@ -447,8 +494,8 @@ int cifs_open(struct inode *inode, struct file *file) .mtime = NO_CHANGE_64, .device = 0, }; - CIFSSMBUnixSetFileInfo(xid, tcon, &args, netfid, - pCifsFile->pid); + CIFSSMBUnixSetFileInfo(xid, tcon, &args, fid.netfid, + cfile->pid); } out: @@ -458,59 +505,66 @@ out: return rc; } -/* Try to reacquire byte range locks that were released when session */ -/* to server was lost */ +/* + * Try to reacquire byte range locks that were released when session + * to server was lost + */ static int cifs_relock_file(struct cifsFileInfo *cifsFile) { int rc = 0; -/* BB list all locks open on this file and relock */ + /* BB list all locks open on this file and relock */ return rc; } -static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush) +static int +cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) { int rc = -EACCES; unsigned int xid; __u32 oplock; struct cifs_sb_info *cifs_sb; struct cifs_tcon *tcon; - struct cifsInodeInfo *pCifsInode; + struct TCP_Server_Info *server; + struct cifsInodeInfo *cinode; struct inode *inode; char *full_path = NULL; - int desiredAccess; + int desired_access; int disposition = FILE_OPEN; int create_options = CREATE_NOT_DIR; - __u16 netfid; + struct cifs_fid fid; xid = get_xid(); - mutex_lock(&pCifsFile->fh_mutex); - if (!pCifsFile->invalidHandle) { - mutex_unlock(&pCifsFile->fh_mutex); + mutex_lock(&cfile->fh_mutex); + if (!cfile->invalidHandle) { + mutex_unlock(&cfile->fh_mutex); rc = 0; free_xid(xid); return rc; } - inode = pCifsFile->dentry->d_inode; + inode = cfile->dentry->d_inode; cifs_sb = CIFS_SB(inode->i_sb); - tcon = tlink_tcon(pCifsFile->tlink); + tcon = tlink_tcon(cfile->tlink); + server = tcon->ses->server; -/* can not grab rename sem here because various ops, including - those that already have the rename sem can end up causing writepage - to get called and if the server was down that means we end up here, - and we can never tell if the caller already has the rename_sem */ - full_path = build_path_from_dentry(pCifsFile->dentry); + /* + * Can not grab rename sem here because various ops, including those + * that already have the rename sem can end up causing writepage to get + * called and if the server was down that means we end up here, and we + * can never tell if the caller already has the rename_sem. + */ + full_path = build_path_from_dentry(cfile->dentry); if (full_path == NULL) { rc = -ENOMEM; - mutex_unlock(&pCifsFile->fh_mutex); + mutex_unlock(&cfile->fh_mutex); free_xid(xid); return rc; } - cFYI(1, "inode = 0x%p file flags 0x%x for %s", - inode, pCifsFile->f_flags, full_path); + cFYI(1, "inode = 0x%p file flags 0x%x for %s", inode, cfile->f_flags, + full_path); if (tcon->ses->server->oplocks) oplock = REQ_OPLOCK; @@ -524,69 +578,72 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush) * O_CREAT, O_EXCL and O_TRUNC already had their effect on the * original open. Must mask them off for a reopen. */ - unsigned int oflags = pCifsFile->f_flags & + unsigned int oflags = cfile->f_flags & ~(O_CREAT | O_EXCL | O_TRUNC); rc = cifs_posix_open(full_path, NULL, inode->i_sb, - cifs_sb->mnt_file_mode /* ignored */, - oflags, &oplock, &netfid, xid); + cifs_sb->mnt_file_mode /* ignored */, + oflags, &oplock, &fid.netfid, xid); if (rc == 0) { cFYI(1, "posix reopen succeeded"); goto reopen_success; } - /* fallthrough to retry open the old way on errors, especially - in the reconnect path it is important to retry hard */ + /* + * fallthrough to retry open the old way on errors, especially + * in the reconnect path it is important to retry hard + */ } - desiredAccess = cifs_convert_flags(pCifsFile->f_flags); + desired_access = cifs_convert_flags(cfile->f_flags); if (backup_cred(cifs_sb)) create_options |= CREATE_OPEN_BACKUP_INTENT; - /* Can not refresh inode by passing in file_info buf to be returned - by SMBOpen and then calling get_inode_info with returned buf - since file might have write behind data that needs to be flushed - and server version of file size can be stale. If we knew for sure - that inode was not dirty locally we could do this */ + if (server->ops->get_lease_key) + server->ops->get_lease_key(inode, &fid); - rc = CIFSSMBOpen(xid, tcon, full_path, disposition, desiredAccess, - create_options, &netfid, &oplock, NULL, - cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + /* + * Can not refresh inode by passing in file_info buf to be returned by + * CIFSSMBOpen and then calling get_inode_info with returned buf since + * file might have write behind data that needs to be flushed and server + * version of file size can be stale. If we knew for sure that inode was + * not dirty locally we could do this. + */ + rc = server->ops->open(xid, tcon, full_path, disposition, + desired_access, create_options, &fid, &oplock, + NULL, cifs_sb); if (rc) { - mutex_unlock(&pCifsFile->fh_mutex); - cFYI(1, "cifs_open returned 0x%x", rc); + mutex_unlock(&cfile->fh_mutex); + cFYI(1, "cifs_reopen returned 0x%x", rc); cFYI(1, "oplock: %d", oplock); goto reopen_error_exit; } reopen_success: - pCifsFile->netfid = netfid; - pCifsFile->invalidHandle = false; - mutex_unlock(&pCifsFile->fh_mutex); - pCifsInode = CIFS_I(inode); + cfile->invalidHandle = false; + mutex_unlock(&cfile->fh_mutex); + cinode = CIFS_I(inode); if (can_flush) { rc = filemap_write_and_wait(inode->i_mapping); mapping_set_error(inode->i_mapping, rc); if (tcon->unix_ext) - rc = cifs_get_inode_info_unix(&inode, - full_path, inode->i_sb, xid); + rc = cifs_get_inode_info_unix(&inode, full_path, + inode->i_sb, xid); else - rc = cifs_get_inode_info(&inode, - full_path, NULL, inode->i_sb, - xid, NULL); - } /* else we are writing out data to server already - and could deadlock if we tried to flush data, and - since we do not know if we have data that would - invalidate the current end of file on the server - we can not go to the server to get the new inod - info */ - - cifs_set_oplock_level(pCifsInode, oplock); + rc = cifs_get_inode_info(&inode, full_path, NULL, + inode->i_sb, xid, NULL); + } + /* + * Else we are writing out data to server already and could deadlock if + * we tried to flush data, and since we do not know if we have data that + * would invalidate the current end of file on the server we can not go + * to the server to get the new inode info. + */ - cifs_relock_file(pCifsFile); + server->ops->set_fid(cfile, &fid, oplock); + cifs_relock_file(cfile); reopen_error_exit: kfree(full_path); @@ -609,42 +666,48 @@ int cifs_closedir(struct inode *inode, struct file *file) { int rc = 0; unsigned int xid; - struct cifsFileInfo *pCFileStruct = file->private_data; - char *ptmp; + struct cifsFileInfo *cfile = file->private_data; + struct cifs_tcon *tcon; + struct TCP_Server_Info *server; + char *buf; cFYI(1, "Closedir inode = 0x%p", inode); + if (cfile == NULL) + return rc; + xid = get_xid(); + tcon = tlink_tcon(cfile->tlink); + server = tcon->ses->server; - if (pCFileStruct) { - struct cifs_tcon *pTcon = tlink_tcon(pCFileStruct->tlink); + cFYI(1, "Freeing private data in close dir"); + spin_lock(&cifs_file_list_lock); + if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) { + cfile->invalidHandle = true; + spin_unlock(&cifs_file_list_lock); + if (server->ops->close_dir) + rc = server->ops->close_dir(xid, tcon, &cfile->fid); + else + rc = -ENOSYS; + cFYI(1, "Closing uncompleted readdir with rc %d", rc); + /* not much we can do if it fails anyway, ignore rc */ + rc = 0; + } else + spin_unlock(&cifs_file_list_lock); - cFYI(1, "Freeing private data in close dir"); - spin_lock(&cifs_file_list_lock); - if (!pCFileStruct->srch_inf.endOfSearch && - !pCFileStruct->invalidHandle) { - pCFileStruct->invalidHandle = true; - spin_unlock(&cifs_file_list_lock); - rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid); - cFYI(1, "Closing uncompleted readdir with rc %d", - rc); - /* not much we can do if it fails anyway, ignore rc */ - rc = 0; - } else - spin_unlock(&cifs_file_list_lock); - ptmp = pCFileStruct->srch_inf.ntwrk_buf_start; - if (ptmp) { - cFYI(1, "closedir free smb buf in srch struct"); - pCFileStruct->srch_inf.ntwrk_buf_start = NULL; - if (pCFileStruct->srch_inf.smallBuf) - cifs_small_buf_release(ptmp); - else - cifs_buf_release(ptmp); - } - cifs_put_tlink(pCFileStruct->tlink); - kfree(file->private_data); - file->private_data = NULL; + buf = cfile->srch_inf.ntwrk_buf_start; + if (buf) { + cFYI(1, "closedir free smb buf in srch struct"); + cfile->srch_inf.ntwrk_buf_start = NULL; + if (cfile->srch_inf.smallBuf) + cifs_small_buf_release(buf); + else + cifs_buf_release(buf); } + + cifs_put_tlink(cfile->tlink); + kfree(file->private_data); + file->private_data = NULL; /* BB can we lock the filestruct while this is going on? */ free_xid(xid); return rc; @@ -666,7 +729,7 @@ cifs_lock_init(__u64 offset, __u64 length, __u8 type) return lock; } -static void +void cifs_del_lock_waiters(struct cifsLockInfo *lock) { struct cifsLockInfo *li, *tmp; @@ -677,45 +740,47 @@ cifs_del_lock_waiters(struct cifsLockInfo *lock) } static bool -cifs_find_fid_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, - __u64 length, __u8 type, struct cifsFileInfo *cur, - struct cifsLockInfo **conf_lock) +cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset, + __u64 length, __u8 type, struct cifsFileInfo *cfile, + struct cifsLockInfo **conf_lock, bool rw_check) { struct cifsLockInfo *li; + struct cifsFileInfo *cur_cfile = fdlocks->cfile; struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server; - list_for_each_entry(li, &cfile->llist, llist) { + list_for_each_entry(li, &fdlocks->locks, llist) { if (offset + length <= li->offset || offset >= li->offset + li->length) continue; - else if ((type & server->vals->shared_lock_type) && - ((server->ops->compare_fids(cur, cfile) && - current->tgid == li->pid) || type == li->type)) + if (rw_check && server->ops->compare_fids(cfile, cur_cfile) && + current->tgid == li->pid) continue; - else { + if ((type & server->vals->shared_lock_type) && + ((server->ops->compare_fids(cfile, cur_cfile) && + current->tgid == li->pid) || type == li->type)) + continue; + if (conf_lock) *conf_lock = li; - return true; - } + return true; } return false; } -static bool +bool cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length, - __u8 type, struct cifsLockInfo **conf_lock) + __u8 type, struct cifsLockInfo **conf_lock, + bool rw_check) { bool rc = false; - struct cifsFileInfo *fid, *tmp; + struct cifs_fid_locks *cur; struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); - spin_lock(&cifs_file_list_lock); - list_for_each_entry_safe(fid, tmp, &cinode->openFileList, flist) { - rc = cifs_find_fid_lock_conflict(fid, offset, length, type, - cfile, conf_lock); + list_for_each_entry(cur, &cinode->llist, llist) { + rc = cifs_find_fid_lock_conflict(cur, offset, length, type, + cfile, conf_lock, rw_check); if (rc) break; } - spin_unlock(&cifs_file_list_lock); return rc; } @@ -737,10 +802,10 @@ cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length, struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server; bool exist; - mutex_lock(&cinode->lock_mutex); + down_read(&cinode->lock_sem); exist = cifs_find_lock_conflict(cfile, offset, length, type, - &conf_lock); + &conf_lock, false); if (exist) { flock->fl_start = conf_lock->offset; flock->fl_end = conf_lock->offset + conf_lock->length - 1; @@ -754,7 +819,7 @@ cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length, else flock->fl_type = F_UNLCK; - mutex_unlock(&cinode->lock_mutex); + up_read(&cinode->lock_sem); return rc; } @@ -762,9 +827,9 @@ static void cifs_lock_add(struct cifsFileInfo *cfile, struct cifsLockInfo *lock) { struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); - mutex_lock(&cinode->lock_mutex); - list_add_tail(&lock->llist, &cfile->llist); - mutex_unlock(&cinode->lock_mutex); + down_write(&cinode->lock_sem); + list_add_tail(&lock->llist, &cfile->llist->locks); + up_write(&cinode->lock_sem); } /* @@ -784,13 +849,13 @@ cifs_lock_add_if(struct cifsFileInfo *cfile, struct cifsLockInfo *lock, try_again: exist = false; - mutex_lock(&cinode->lock_mutex); + down_write(&cinode->lock_sem); exist = cifs_find_lock_conflict(cfile, lock->offset, lock->length, - lock->type, &conf_lock); + lock->type, &conf_lock, false); if (!exist && cinode->can_cache_brlcks) { - list_add_tail(&lock->llist, &cfile->llist); - mutex_unlock(&cinode->lock_mutex); + list_add_tail(&lock->llist, &cfile->llist->locks); + up_write(&cinode->lock_sem); return rc; } @@ -800,17 +865,17 @@ try_again: rc = -EACCES; else { list_add_tail(&lock->blist, &conf_lock->blist); - mutex_unlock(&cinode->lock_mutex); + up_write(&cinode->lock_sem); rc = wait_event_interruptible(lock->block_q, (lock->blist.prev == &lock->blist) && (lock->blist.next == &lock->blist)); if (!rc) goto try_again; - mutex_lock(&cinode->lock_mutex); + down_write(&cinode->lock_sem); list_del_init(&lock->blist); } - mutex_unlock(&cinode->lock_mutex); + up_write(&cinode->lock_sem); return rc; } @@ -831,7 +896,7 @@ cifs_posix_lock_test(struct file *file, struct file_lock *flock) if ((flock->fl_flags & FL_POSIX) == 0) return 1; - mutex_lock(&cinode->lock_mutex); + down_read(&cinode->lock_sem); posix_test_lock(file, flock); if (flock->fl_type == F_UNLCK && !cinode->can_cache_brlcks) { @@ -839,7 +904,7 @@ cifs_posix_lock_test(struct file *file, struct file_lock *flock) rc = 1; } - mutex_unlock(&cinode->lock_mutex); + up_read(&cinode->lock_sem); return rc; } @@ -859,14 +924,14 @@ cifs_posix_lock_set(struct file *file, struct file_lock *flock) return rc; try_again: - mutex_lock(&cinode->lock_mutex); + down_write(&cinode->lock_sem); if (!cinode->can_cache_brlcks) { - mutex_unlock(&cinode->lock_mutex); + up_write(&cinode->lock_sem); return rc; } rc = posix_lock_file(file, flock, NULL); - mutex_unlock(&cinode->lock_mutex); + up_write(&cinode->lock_sem); if (rc == FILE_LOCK_DEFERRED) { rc = wait_event_interruptible(flock->fl_wait, !flock->fl_next); if (!rc) @@ -876,7 +941,7 @@ try_again: return rc; } -static int +int cifs_push_mandatory_locks(struct cifsFileInfo *cfile) { unsigned int xid; @@ -893,9 +958,10 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) xid = get_xid(); tcon = tlink_tcon(cfile->tlink); - mutex_lock(&cinode->lock_mutex); + /* we are going to update can_cache_brlcks here - need a write access */ + down_write(&cinode->lock_sem); if (!cinode->can_cache_brlcks) { - mutex_unlock(&cinode->lock_mutex); + up_write(&cinode->lock_sem); free_xid(xid); return rc; } @@ -906,7 +972,7 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) */ max_buf = tcon->ses->server->maxBuf; if (!max_buf) { - mutex_unlock(&cinode->lock_mutex); + up_write(&cinode->lock_sem); free_xid(xid); return -EINVAL; } @@ -915,15 +981,15 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) sizeof(LOCKING_ANDX_RANGE); buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); if (!buf) { - mutex_unlock(&cinode->lock_mutex); + up_write(&cinode->lock_sem); free_xid(xid); - return rc; + return -ENOMEM; } for (i = 0; i < 2; i++) { cur = buf; num = 0; - list_for_each_entry_safe(li, tmp, &cfile->llist, llist) { + list_for_each_entry_safe(li, tmp, &cfile->llist->locks, llist) { if (li->type != types[i]) continue; cur->Pid = cpu_to_le16(li->pid); @@ -932,7 +998,8 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) cur->OffsetLow = cpu_to_le32((u32)li->offset); cur->OffsetHigh = cpu_to_le32((u32)(li->offset>>32)); if (++num == max_num) { - stored_rc = cifs_lockv(xid, tcon, cfile->netfid, + stored_rc = cifs_lockv(xid, tcon, + cfile->fid.netfid, (__u8)li->type, 0, num, buf); if (stored_rc) @@ -944,7 +1011,7 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) } if (num) { - stored_rc = cifs_lockv(xid, tcon, cfile->netfid, + stored_rc = cifs_lockv(xid, tcon, cfile->fid.netfid, (__u8)types[i], 0, num, buf); if (stored_rc) rc = stored_rc; @@ -952,7 +1019,7 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) } cinode->can_cache_brlcks = false; - mutex_unlock(&cinode->lock_mutex); + up_write(&cinode->lock_sem); kfree(buf); free_xid(xid); @@ -987,9 +1054,10 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) xid = get_xid(); - mutex_lock(&cinode->lock_mutex); + /* we are going to update can_cache_brlcks here - need a write access */ + down_write(&cinode->lock_sem); if (!cinode->can_cache_brlcks) { - mutex_unlock(&cinode->lock_mutex); + up_write(&cinode->lock_sem); free_xid(xid); return rc; } @@ -1005,7 +1073,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) /* * Allocating count locks is enough because no FL_POSIX locks can be - * added to the list while we are holding cinode->lock_mutex that + * added to the list while we are holding cinode->lock_sem that * protects locking operations of this inode. */ for (; i < count; i++) { @@ -1038,7 +1106,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) type = CIFS_WRLCK; lck = list_entry(el, struct lock_to_push, llist); lck->pid = flock->fl_pid; - lck->netfid = cfile->netfid; + lck->netfid = cfile->fid.netfid; lck->length = length; lck->type = type; lck->offset = flock->fl_start; @@ -1060,7 +1128,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) out: cinode->can_cache_brlcks = false; - mutex_unlock(&cinode->lock_mutex); + up_write(&cinode->lock_sem); free_xid(xid); return rc; @@ -1083,7 +1151,7 @@ cifs_push_locks(struct cifsFileInfo *cfile) ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) return cifs_push_posix_locks(cfile); - return cifs_push_mandatory_locks(cfile); + return tcon->ses->server->ops->push_mand_locks(cfile); } static void @@ -1104,7 +1172,8 @@ cifs_read_flock(struct file_lock *flock, __u32 *type, int *lock, int *unlock, if (flock->fl_flags & FL_LEASE) cFYI(1, "Lease on file - not implemented yet"); if (flock->fl_flags & - (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE))) + (~(FL_POSIX | FL_FLOCK | FL_SLEEP | + FL_ACCESS | FL_LEASE | FL_CLOSE))) cFYI(1, "Unknown lock flags 0x%x", flock->fl_flags); *type = server->vals->large_lock_type; @@ -1134,15 +1203,6 @@ cifs_read_flock(struct file_lock *flock, __u32 *type, int *lock, int *unlock, } static int -cifs_mandatory_lock(unsigned int xid, struct cifsFileInfo *cfile, __u64 offset, - __u64 length, __u32 type, int lock, int unlock, bool wait) -{ - return CIFSSMBLock(xid, tlink_tcon(cfile->tlink), cfile->netfid, - current->tgid, length, offset, unlock, lock, - (__u8)type, wait, 0); -} - -static int cifs_getlk(struct file *file, struct file_lock *flock, __u32 type, bool wait_flag, bool posix_lck, unsigned int xid) { @@ -1151,7 +1211,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type, struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; - __u16 netfid = cfile->netfid; + __u16 netfid = cfile->fid.netfid; if (posix_lck) { int posix_lock_type; @@ -1175,11 +1235,11 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type, return rc; /* BB we could chain these into one lock request BB */ - rc = cifs_mandatory_lock(xid, cfile, flock->fl_start, length, type, - 1, 0, false); + rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length, type, + 1, 0, false); if (rc == 0) { - rc = cifs_mandatory_lock(xid, cfile, flock->fl_start, length, - type, 0, 1, false); + rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length, + type, 0, 1, false); flock->fl_type = F_UNLCK; if (rc != 0) cERROR(1, "Error unlocking previously locked " @@ -1192,13 +1252,14 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type, return 0; } - rc = cifs_mandatory_lock(xid, cfile, flock->fl_start, length, - type | server->vals->shared_lock_type, 1, 0, - false); + type &= ~server->vals->exclusive_lock_type; + + rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length, + type | server->vals->shared_lock_type, + 1, 0, false); if (rc == 0) { - rc = cifs_mandatory_lock(xid, cfile, flock->fl_start, length, - type | server->vals->shared_lock_type, - 0, 1, false); + rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length, + type | server->vals->shared_lock_type, 0, 1, false); flock->fl_type = F_RDLCK; if (rc != 0) cERROR(1, "Error unlocking previously locked " @@ -1209,7 +1270,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type, return 0; } -static void +void cifs_move_llist(struct list_head *source, struct list_head *dest) { struct list_head *li, *tmp; @@ -1217,7 +1278,7 @@ cifs_move_llist(struct list_head *source, struct list_head *dest) list_move(li, dest); } -static void +void cifs_free_llist(struct list_head *llist) { struct cifsLockInfo *li, *tmp; @@ -1228,7 +1289,7 @@ cifs_free_llist(struct list_head *llist) } } -static int +int cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, unsigned int xid) { @@ -1260,11 +1321,11 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, if (!buf) return -ENOMEM; - mutex_lock(&cinode->lock_mutex); + down_write(&cinode->lock_sem); for (i = 0; i < 2; i++) { cur = buf; num = 0; - list_for_each_entry_safe(li, tmp, &cfile->llist, llist) { + list_for_each_entry_safe(li, tmp, &cfile->llist->locks, llist) { if (flock->fl_start > li->offset || (flock->fl_start + length) < (li->offset + li->length)) @@ -1295,7 +1356,8 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, */ list_move(&li->llist, &tmp_llist); if (++num == max_num) { - stored_rc = cifs_lockv(xid, tcon, cfile->netfid, + stored_rc = cifs_lockv(xid, tcon, + cfile->fid.netfid, li->type, num, 0, buf); if (stored_rc) { /* @@ -1304,7 +1366,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, * list to the head of the file's list. */ cifs_move_llist(&tmp_llist, - &cfile->llist); + &cfile->llist->locks); rc = stored_rc; } else /* @@ -1318,23 +1380,24 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, cur++; } if (num) { - stored_rc = cifs_lockv(xid, tcon, cfile->netfid, + stored_rc = cifs_lockv(xid, tcon, cfile->fid.netfid, types[i], num, 0, buf); if (stored_rc) { - cifs_move_llist(&tmp_llist, &cfile->llist); + cifs_move_llist(&tmp_llist, + &cfile->llist->locks); rc = stored_rc; } else cifs_free_llist(&tmp_llist); } } - mutex_unlock(&cinode->lock_mutex); + up_write(&cinode->lock_sem); kfree(buf); return rc; } static int -cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, +cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, bool wait_flag, bool posix_lck, int lock, int unlock, unsigned int xid) { @@ -1343,7 +1406,6 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; - __u16 netfid = cfile->netfid; if (posix_lck) { int posix_lock_type; @@ -1360,9 +1422,9 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, if (unlock == 1) posix_lock_type = CIFS_UNLCK; - rc = CIFSSMBPosixLock(xid, tcon, netfid, current->tgid, - flock->fl_start, length, NULL, - posix_lock_type, wait_flag); + rc = CIFSSMBPosixLock(xid, tcon, cfile->fid.netfid, + current->tgid, flock->fl_start, length, + NULL, posix_lock_type, wait_flag); goto out; } @@ -1379,8 +1441,8 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, if (rc <= 0) goto out; - rc = cifs_mandatory_lock(xid, cfile, flock->fl_start, length, - type, 1, 0, wait_flag); + rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length, + type, 1, 0, wait_flag); if (rc) { kfree(lock); goto out; @@ -1388,7 +1450,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, cifs_lock_add(cfile, lock); } else if (unlock) - rc = cifs_unlock_range(cfile, flock, xid); + rc = server->ops->mand_unlock_range(cfile, flock, xid); out: if (flock->fl_flags & FL_POSIX) @@ -1423,7 +1485,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock) tcon->ses->server); cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - netfid = cfile->netfid; + netfid = cfile->fid.netfid; cinode = CIFS_I(file->f_path.dentry->d_inode); if (cap_unix(tcon->ses) && @@ -1469,15 +1531,16 @@ cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, cifsi->server_eof = end_of_write; } -static ssize_t cifs_write(struct cifsFileInfo *open_file, __u32 pid, - const char *write_data, size_t write_size, - loff_t *poffset) +static ssize_t +cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data, + size_t write_size, loff_t *offset) { int rc = 0; unsigned int bytes_written = 0; unsigned int total_written; struct cifs_sb_info *cifs_sb; - struct cifs_tcon *pTcon; + struct cifs_tcon *tcon; + struct TCP_Server_Info *server; unsigned int xid; struct dentry *dentry = open_file->dentry; struct cifsInodeInfo *cifsi = CIFS_I(dentry->d_inode); @@ -1486,9 +1549,13 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file, __u32 pid, cifs_sb = CIFS_SB(dentry->d_sb); cFYI(1, "write %zd bytes to offset %lld of %s", write_size, - *poffset, dentry->d_name.name); + *offset, dentry->d_name.name); - pTcon = tlink_tcon(open_file->tlink); + tcon = tlink_tcon(open_file->tlink); + server = tcon->ses->server; + + if (!server->ops->sync_write) + return -ENOSYS; xid = get_xid(); @@ -1514,13 +1581,12 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file, __u32 pid, /* iov[0] is reserved for smb header */ iov[1].iov_base = (char *)write_data + total_written; iov[1].iov_len = len; - io_parms.netfid = open_file->netfid; io_parms.pid = pid; - io_parms.tcon = pTcon; - io_parms.offset = *poffset; + io_parms.tcon = tcon; + io_parms.offset = *offset; io_parms.length = len; - rc = CIFSSMBWrite2(xid, &io_parms, &bytes_written, iov, - 1, 0); + rc = server->ops->sync_write(xid, open_file, &io_parms, + &bytes_written, iov, 1); } if (rc || (bytes_written == 0)) { if (total_written) @@ -1531,18 +1597,18 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file, __u32 pid, } } else { spin_lock(&dentry->d_inode->i_lock); - cifs_update_eof(cifsi, *poffset, bytes_written); + cifs_update_eof(cifsi, *offset, bytes_written); spin_unlock(&dentry->d_inode->i_lock); - *poffset += bytes_written; + *offset += bytes_written; } } - cifs_stats_bytes_written(pTcon, total_written); + cifs_stats_bytes_written(tcon, total_written); if (total_written > 0) { spin_lock(&dentry->d_inode->i_lock); - if (*poffset > dentry->d_inode->i_size) - i_size_write(dentry->d_inode, *poffset); + if (*offset > dentry->d_inode->i_size) + i_size_write(dentry->d_inode, *offset); spin_unlock(&dentry->d_inode->i_lock); } mark_inode_dirty_sync(dentry->d_inode); @@ -1718,27 +1784,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) return rc; } -/* - * Marshal up the iov array, reserving the first one for the header. Also, - * set wdata->bytes. - */ -static void -cifs_writepages_marshal_iov(struct kvec *iov, struct cifs_writedata *wdata) -{ - int i; - struct inode *inode = wdata->cfile->dentry->d_inode; - loff_t size = i_size_read(inode); - - /* marshal up the pages into iov array */ - wdata->bytes = 0; - for (i = 0; i < wdata->nr_pages; i++) { - iov[i + 1].iov_len = min(size - page_offset(wdata->pages[i]), - (loff_t)PAGE_CACHE_SIZE); - iov[i + 1].iov_base = kmap(wdata->pages[i]); - wdata->bytes += iov[i + 1].iov_len; - } -} - static int cifs_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -1746,8 +1791,10 @@ static int cifs_writepages(struct address_space *mapping, bool done = false, scanned = false, range_whole = false; pgoff_t end, index; struct cifs_writedata *wdata; + struct TCP_Server_Info *server; struct page *page; int rc = 0; + loff_t isize = i_size_read(mapping->host); /* * If wsize is smaller than the page cache size, default to writing @@ -1852,7 +1899,7 @@ retry: */ set_page_writeback(page); - if (page_offset(page) >= mapping->host->i_size) { + if (page_offset(page) >= isize) { done = true; unlock_page(page); end_page_writeback(page); @@ -1883,7 +1930,12 @@ retry: wdata->sync_mode = wbc->sync_mode; wdata->nr_pages = nr_pages; wdata->offset = page_offset(wdata->pages[0]); - wdata->marshal_iov = cifs_writepages_marshal_iov; + wdata->pagesz = PAGE_CACHE_SIZE; + wdata->tailsz = + min(isize - page_offset(wdata->pages[nr_pages - 1]), + (loff_t)PAGE_CACHE_SIZE); + wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) + + wdata->tailsz; do { if (wdata->cfile != NULL) @@ -1896,7 +1948,8 @@ retry: break; } wdata->pid = wdata->cfile->pid; - rc = cifs_async_writev(wdata); + server = tlink_tcon(wdata->cfile->tlink)->ses->server; + rc = server->ops->async_writev(wdata); } while (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN); for (i = 0; i < nr_pages; ++i) @@ -2054,6 +2107,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end, unsigned int xid; int rc = 0; struct cifs_tcon *tcon; + struct TCP_Server_Info *server; struct cifsFileInfo *smbfile = file->private_data; struct inode *inode = file->f_path.dentry->d_inode; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); @@ -2077,8 +2131,13 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end, } tcon = tlink_tcon(smbfile->tlink); - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) - rc = CIFSSMBFlush(xid, tcon, smbfile->netfid); + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) { + server = tcon->ses->server; + if (server->ops->flush) + rc = server->ops->flush(xid, tcon, &smbfile->fid); + else + rc = -ENOSYS; + } free_xid(xid); mutex_unlock(&inode->i_mutex); @@ -2090,6 +2149,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) unsigned int xid; int rc = 0; struct cifs_tcon *tcon; + struct TCP_Server_Info *server; struct cifsFileInfo *smbfile = file->private_data; struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); struct inode *inode = file->f_mapping->host; @@ -2105,8 +2165,13 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) file->f_path.dentry->d_name.name, datasync); tcon = tlink_tcon(smbfile->tlink); - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) - rc = CIFSSMBFlush(xid, tcon, smbfile->netfid); + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) { + server = tcon->ses->server; + if (server->ops->flush) + rc = server->ops->flush(xid, tcon, &smbfile->fid); + else + rc = -ENOSYS; + } free_xid(xid); mutex_unlock(&inode->i_mutex); @@ -2172,20 +2237,6 @@ size_t get_numpages(const size_t wsize, const size_t len, size_t *cur_len) } static void -cifs_uncached_marshal_iov(struct kvec *iov, struct cifs_writedata *wdata) -{ - int i; - size_t bytes = wdata->bytes; - - /* marshal up the pages into iov array */ - for (i = 0; i < wdata->nr_pages; i++) { - iov[i + 1].iov_len = min_t(size_t, bytes, PAGE_SIZE); - iov[i + 1].iov_base = kmap(wdata->pages[i]); - bytes -= iov[i + 1].iov_len; - } -} - -static void cifs_uncached_writev_complete(struct work_struct *work) { int i; @@ -2215,6 +2266,9 @@ static int cifs_uncached_retry_writev(struct cifs_writedata *wdata) { int rc; + struct TCP_Server_Info *server; + + server = tlink_tcon(wdata->cfile->tlink)->ses->server; do { if (wdata->cfile->invalidHandle) { @@ -2222,7 +2276,7 @@ cifs_uncached_retry_writev(struct cifs_writedata *wdata) if (rc != 0) continue; } - rc = cifs_async_writev(wdata); + rc = server->ops->async_writev(wdata); } while (rc == -EAGAIN); return rc; @@ -2257,6 +2311,10 @@ cifs_iovec_write(struct file *file, const struct iovec *iov, cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); open_file = file->private_data; tcon = tlink_tcon(open_file->tlink); + + if (!tcon->ses->server->ops->async_writev) + return -ENOSYS; + offset = *poffset; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) @@ -2298,7 +2356,8 @@ cifs_iovec_write(struct file *file, const struct iovec *iov, wdata->cfile = cifsFileInfo_get(open_file); wdata->pid = pid; wdata->bytes = cur_len; - wdata->marshal_iov = cifs_uncached_marshal_iov; + wdata->pagesz = PAGE_SIZE; + wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE); rc = cifs_uncached_retry_writev(wdata); if (rc) { kref_put(&wdata->refcount, cifs_writedata_release); @@ -2376,40 +2435,110 @@ ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov, return written; } -ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t +cifs_writev(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { - struct inode *inode; + struct file *file = iocb->ki_filp; + struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; + struct inode *inode = file->f_mapping->host; + struct cifsInodeInfo *cinode = CIFS_I(inode); + struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server; + ssize_t rc = -EACCES; - inode = iocb->ki_filp->f_path.dentry->d_inode; + BUG_ON(iocb->ki_pos != pos); - if (CIFS_I(inode)->clientCanCacheAll) - return generic_file_aio_write(iocb, iov, nr_segs, pos); + sb_start_write(inode->i_sb); /* - * In strict cache mode we need to write the data to the server exactly - * from the pos to pos+len-1 rather than flush all affected pages - * because it may cause a error with mandatory locks on these pages but - * not on the region from pos to ppos+len-1. + * We need to hold the sem to be sure nobody modifies lock list + * with a brlock that prevents writing. */ + down_read(&cinode->lock_sem); + if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs), + server->vals->exclusive_lock_type, NULL, + true)) { + mutex_lock(&inode->i_mutex); + rc = __generic_file_aio_write(iocb, iov, nr_segs, + &iocb->ki_pos); + mutex_unlock(&inode->i_mutex); + } - return cifs_user_writev(iocb, iov, nr_segs, pos); + if (rc > 0 || rc == -EIOCBQUEUED) { + ssize_t err; + + err = generic_write_sync(file, pos, rc); + if (err < 0 && rc > 0) + rc = err; + } + + up_read(&cinode->lock_sem); + sb_end_write(inode->i_sb); + return rc; +} + +ssize_t +cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; + struct cifsInodeInfo *cinode = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifsFileInfo *cfile = (struct cifsFileInfo *) + iocb->ki_filp->private_data; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + +#ifdef CONFIG_CIFS_SMB2 + /* + * If we have an oplock for read and want to write a data to the file + * we need to store it in the page cache and then push it to the server + * to be sure the next read will get a valid data. + */ + if (!cinode->clientCanCacheAll && cinode->clientCanCacheRead) { + ssize_t written; + int rc; + + written = generic_file_aio_write(iocb, iov, nr_segs, pos); + rc = filemap_fdatawrite(inode->i_mapping); + if (rc) + return (ssize_t)rc; + + return written; + } +#endif + + /* + * For non-oplocked files in strict cache mode we need to write the data + * to the server exactly from the pos to pos+len-1 rather than flush all + * affected pages because it may cause a error with mandatory locks on + * these pages but not on the region from pos to ppos+len-1. + */ + + if (!cinode->clientCanCacheAll) + return cifs_user_writev(iocb, iov, nr_segs, pos); + + if (cap_unix(tcon->ses) && + (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && + ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) + return generic_file_aio_write(iocb, iov, nr_segs, pos); + + return cifs_writev(iocb, iov, nr_segs, pos); } static struct cifs_readdata * -cifs_readdata_alloc(unsigned int nr_vecs, work_func_t complete) +cifs_readdata_alloc(unsigned int nr_pages, work_func_t complete) { struct cifs_readdata *rdata; - rdata = kzalloc(sizeof(*rdata) + - sizeof(struct kvec) * nr_vecs, GFP_KERNEL); + rdata = kzalloc(sizeof(*rdata) + (sizeof(struct page *) * nr_pages), + GFP_KERNEL); if (rdata != NULL) { kref_init(&rdata->refcount); INIT_LIST_HEAD(&rdata->list); init_completion(&rdata->done); INIT_WORK(&rdata->work, complete); - INIT_LIST_HEAD(&rdata->pages); } + return rdata; } @@ -2426,25 +2555,25 @@ cifs_readdata_release(struct kref *refcount) } static int -cifs_read_allocate_pages(struct list_head *list, unsigned int npages) +cifs_read_allocate_pages(struct cifs_readdata *rdata, unsigned int nr_pages) { int rc = 0; - struct page *page, *tpage; + struct page *page; unsigned int i; - for (i = 0; i < npages; i++) { + for (i = 0; i < nr_pages; i++) { page = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); if (!page) { rc = -ENOMEM; break; } - list_add(&page->lru, list); + rdata->pages[i] = page; } if (rc) { - list_for_each_entry_safe(page, tpage, list, lru) { - list_del(&page->lru); - put_page(page); + for (i = 0; i < nr_pages; i++) { + put_page(rdata->pages[i]); + rdata->pages[i] = NULL; } } return rc; @@ -2453,13 +2582,13 @@ cifs_read_allocate_pages(struct list_head *list, unsigned int npages) static void cifs_uncached_readdata_release(struct kref *refcount) { - struct page *page, *tpage; struct cifs_readdata *rdata = container_of(refcount, struct cifs_readdata, refcount); + unsigned int i; - list_for_each_entry_safe(page, tpage, &rdata->pages, lru) { - list_del(&page->lru); - put_page(page); + for (i = 0; i < rdata->nr_pages; i++) { + put_page(rdata->pages[i]); + rdata->pages[i] = NULL; } cifs_readdata_release(refcount); } @@ -2468,6 +2597,9 @@ static int cifs_retry_async_readv(struct cifs_readdata *rdata) { int rc; + struct TCP_Server_Info *server; + + server = tlink_tcon(rdata->cfile->tlink)->ses->server; do { if (rdata->cfile->invalidHandle) { @@ -2475,7 +2607,7 @@ cifs_retry_async_readv(struct cifs_readdata *rdata) if (rc != 0) continue; } - rc = cifs_async_readv(rdata); + rc = server->ops->async_readv(rdata); } while (rc == -EAGAIN); return rc; @@ -2500,17 +2632,18 @@ cifs_readdata_to_iov(struct cifs_readdata *rdata, const struct iovec *iov, int rc = 0; struct iov_iter ii; size_t pos = rdata->offset - offset; - struct page *page, *tpage; ssize_t remaining = rdata->bytes; unsigned char *pdata; + unsigned int i; /* set up iov_iter and advance to the correct offset */ iov_iter_init(&ii, iov, nr_segs, iov_length(iov, nr_segs), 0); iov_iter_advance(&ii, pos); *copied = 0; - list_for_each_entry_safe(page, tpage, &rdata->pages, lru) { + for (i = 0; i < rdata->nr_pages; i++) { ssize_t copy; + struct page *page = rdata->pages[i]; /* copy a whole page or whatever's left */ copy = min_t(ssize_t, remaining, PAGE_SIZE); @@ -2530,9 +2663,6 @@ cifs_readdata_to_iov(struct cifs_readdata *rdata, const struct iovec *iov, iov_iter_advance(&ii, copy); } } - - list_del(&page->lru); - put_page(page); } return rc; @@ -2544,59 +2674,56 @@ cifs_uncached_readv_complete(struct work_struct *work) struct cifs_readdata *rdata = container_of(work, struct cifs_readdata, work); - /* if the result is non-zero then the pages weren't kmapped */ - if (rdata->result == 0) { - struct page *page; - - list_for_each_entry(page, &rdata->pages, lru) - kunmap(page); - } - complete(&rdata->done); kref_put(&rdata->refcount, cifs_uncached_readdata_release); } static int -cifs_uncached_read_marshal_iov(struct cifs_readdata *rdata, - unsigned int remaining) +cifs_uncached_read_into_pages(struct TCP_Server_Info *server, + struct cifs_readdata *rdata, unsigned int len) { - int len = 0; - struct page *page, *tpage; + int total_read = 0, result = 0; + unsigned int i; + unsigned int nr_pages = rdata->nr_pages; + struct kvec iov; + + rdata->tailsz = PAGE_SIZE; + for (i = 0; i < nr_pages; i++) { + struct page *page = rdata->pages[i]; - rdata->nr_iov = 1; - list_for_each_entry_safe(page, tpage, &rdata->pages, lru) { - if (remaining >= PAGE_SIZE) { + if (len >= PAGE_SIZE) { /* enough data to fill the page */ - rdata->iov[rdata->nr_iov].iov_base = kmap(page); - rdata->iov[rdata->nr_iov].iov_len = PAGE_SIZE; - cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu", - rdata->nr_iov, page->index, - rdata->iov[rdata->nr_iov].iov_base, - rdata->iov[rdata->nr_iov].iov_len); - ++rdata->nr_iov; - len += PAGE_SIZE; - remaining -= PAGE_SIZE; - } else if (remaining > 0) { + iov.iov_base = kmap(page); + iov.iov_len = PAGE_SIZE; + cFYI(1, "%u: iov_base=%p iov_len=%zu", + i, iov.iov_base, iov.iov_len); + len -= PAGE_SIZE; + } else if (len > 0) { /* enough for partial page, fill and zero the rest */ - rdata->iov[rdata->nr_iov].iov_base = kmap(page); - rdata->iov[rdata->nr_iov].iov_len = remaining; - cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu", - rdata->nr_iov, page->index, - rdata->iov[rdata->nr_iov].iov_base, - rdata->iov[rdata->nr_iov].iov_len); - memset(rdata->iov[rdata->nr_iov].iov_base + remaining, - '\0', PAGE_SIZE - remaining); - ++rdata->nr_iov; - len += remaining; - remaining = 0; + iov.iov_base = kmap(page); + iov.iov_len = len; + cFYI(1, "%u: iov_base=%p iov_len=%zu", + i, iov.iov_base, iov.iov_len); + memset(iov.iov_base + len, '\0', PAGE_SIZE - len); + rdata->tailsz = len; + len = 0; } else { /* no need to hold page hostage */ - list_del(&page->lru); + rdata->pages[i] = NULL; + rdata->nr_pages--; put_page(page); + continue; } + + result = cifs_readv_from_socket(server, &iov, 1, iov.iov_len); + kunmap(page); + if (result < 0) + break; + + total_read += result; } - return len; + return total_read > 0 ? total_read : result; } static ssize_t @@ -2627,6 +2754,9 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, open_file = file->private_data; tcon = tlink_tcon(open_file->tlink); + if (!tcon->ses->server->ops->async_readv) + return -ENOSYS; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; else @@ -2647,15 +2777,17 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, goto error; } - rc = cifs_read_allocate_pages(&rdata->pages, npages); + rc = cifs_read_allocate_pages(rdata, npages); if (rc) goto error; rdata->cfile = cifsFileInfo_get(open_file); + rdata->nr_pages = npages; rdata->offset = offset; rdata->bytes = cur_len; rdata->pid = pid; - rdata->marshal_iov = cifs_uncached_read_marshal_iov; + rdata->pagesz = PAGE_SIZE; + rdata->read_into_pages = cifs_uncached_read_into_pages; rc = cifs_retry_async_readv(rdata); error: @@ -2706,6 +2838,10 @@ restart_loop: cifs_stats_bytes_read(tcon, total_read); *poffset += total_read; + /* mask nodata case */ + if (rc == -ENODATA) + rc = 0; + return total_read ? total_read : rc; } @@ -2721,15 +2857,17 @@ ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov, return read; } -ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +ssize_t +cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { - struct inode *inode; - - inode = iocb->ki_filp->f_path.dentry->d_inode; - - if (CIFS_I(inode)->clientCanCacheRead) - return generic_file_aio_read(iocb, iov, nr_segs, pos); + struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; + struct cifsInodeInfo *cinode = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifsFileInfo *cfile = (struct cifsFileInfo *) + iocb->ki_filp->private_data; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + int rc = -EACCES; /* * In strict cache mode we need to read from the server all the time @@ -2739,12 +2877,29 @@ ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov, * on pages affected by this read but not on the region from pos to * pos+len-1. */ + if (!cinode->clientCanCacheRead) + return cifs_user_readv(iocb, iov, nr_segs, pos); + + if (cap_unix(tcon->ses) && + (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && + ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) + return generic_file_aio_read(iocb, iov, nr_segs, pos); - return cifs_user_readv(iocb, iov, nr_segs, pos); + /* + * We need to hold the sem to be sure nobody modifies lock list + * with a brlock that prevents reading. + */ + down_read(&cinode->lock_sem); + if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs), + tcon->ses->server->vals->shared_lock_type, + NULL, true)) + rc = generic_file_aio_read(iocb, iov, nr_segs, pos); + up_read(&cinode->lock_sem); + return rc; } -static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, - loff_t *poffset) +static ssize_t +cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset) { int rc = -EACCES; unsigned int bytes_read = 0; @@ -2753,8 +2908,9 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, unsigned int rsize; struct cifs_sb_info *cifs_sb; struct cifs_tcon *tcon; + struct TCP_Server_Info *server; unsigned int xid; - char *current_offset; + char *cur_offset; struct cifsFileInfo *open_file; struct cifs_io_parms io_parms; int buf_type = CIFS_NO_BUFFER; @@ -2773,6 +2929,12 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, } open_file = file->private_data; tcon = tlink_tcon(open_file->tlink); + server = tcon->ses->server; + + if (!server->ops->sync_read) { + free_xid(xid); + return -ENOSYS; + } if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; @@ -2782,9 +2944,8 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, if ((file->f_flags & O_ACCMODE) == O_WRONLY) cFYI(1, "attempting read on write only file instance"); - for (total_read = 0, current_offset = read_data; - read_size > total_read; - total_read += bytes_read, current_offset += bytes_read) { + for (total_read = 0, cur_offset = read_data; read_size > total_read; + total_read += bytes_read, cur_offset += bytes_read) { current_read_size = min_t(uint, read_size - total_read, rsize); /* * For windows me and 9x we do not want to request more than it @@ -2802,13 +2963,13 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, if (rc != 0) break; } - io_parms.netfid = open_file->netfid; io_parms.pid = pid; io_parms.tcon = tcon; - io_parms.offset = *poffset; + io_parms.offset = *offset; io_parms.length = current_read_size; - rc = CIFSSMBRead(xid, &io_parms, &bytes_read, - ¤t_offset, &buf_type); + rc = server->ops->sync_read(xid, open_file, &io_parms, + &bytes_read, &cur_offset, + &buf_type); } if (rc || (bytes_read == 0)) { if (total_read) { @@ -2819,7 +2980,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, } } else { cifs_stats_bytes_read(tcon, total_read); - *poffset += bytes_read; + *offset += bytes_read; } } free_xid(xid); @@ -2842,6 +3003,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) static struct vm_operations_struct cifs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = cifs_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) @@ -2885,16 +3047,16 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma) static void cifs_readv_complete(struct work_struct *work) { + unsigned int i; struct cifs_readdata *rdata = container_of(work, struct cifs_readdata, work); - struct page *page, *tpage; - list_for_each_entry_safe(page, tpage, &rdata->pages, lru) { - list_del(&page->lru); + for (i = 0; i < rdata->nr_pages; i++) { + struct page *page = rdata->pages[i]; + lru_cache_add_file(page); if (rdata->result == 0) { - kunmap(page); flush_dcache_page(page); SetPageUptodate(page); } @@ -2905,49 +3067,48 @@ cifs_readv_complete(struct work_struct *work) cifs_readpage_to_fscache(rdata->mapping->host, page); page_cache_release(page); + rdata->pages[i] = NULL; } kref_put(&rdata->refcount, cifs_readdata_release); } static int -cifs_readpages_marshal_iov(struct cifs_readdata *rdata, unsigned int remaining) +cifs_readpages_read_into_pages(struct TCP_Server_Info *server, + struct cifs_readdata *rdata, unsigned int len) { - int len = 0; - struct page *page, *tpage; + int total_read = 0, result = 0; + unsigned int i; u64 eof; pgoff_t eof_index; + unsigned int nr_pages = rdata->nr_pages; + struct kvec iov; /* determine the eof that the server (probably) has */ eof = CIFS_I(rdata->mapping->host)->server_eof; eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0; cFYI(1, "eof=%llu eof_index=%lu", eof, eof_index); - rdata->nr_iov = 1; - list_for_each_entry_safe(page, tpage, &rdata->pages, lru) { - if (remaining >= PAGE_CACHE_SIZE) { + rdata->tailsz = PAGE_CACHE_SIZE; + for (i = 0; i < nr_pages; i++) { + struct page *page = rdata->pages[i]; + + if (len >= PAGE_CACHE_SIZE) { /* enough data to fill the page */ - rdata->iov[rdata->nr_iov].iov_base = kmap(page); - rdata->iov[rdata->nr_iov].iov_len = PAGE_CACHE_SIZE; + iov.iov_base = kmap(page); + iov.iov_len = PAGE_CACHE_SIZE; cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu", - rdata->nr_iov, page->index, - rdata->iov[rdata->nr_iov].iov_base, - rdata->iov[rdata->nr_iov].iov_len); - ++rdata->nr_iov; - len += PAGE_CACHE_SIZE; - remaining -= PAGE_CACHE_SIZE; - } else if (remaining > 0) { + i, page->index, iov.iov_base, iov.iov_len); + len -= PAGE_CACHE_SIZE; + } else if (len > 0) { /* enough for partial page, fill and zero the rest */ - rdata->iov[rdata->nr_iov].iov_base = kmap(page); - rdata->iov[rdata->nr_iov].iov_len = remaining; + iov.iov_base = kmap(page); + iov.iov_len = len; cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu", - rdata->nr_iov, page->index, - rdata->iov[rdata->nr_iov].iov_base, - rdata->iov[rdata->nr_iov].iov_len); - memset(rdata->iov[rdata->nr_iov].iov_base + remaining, - '\0', PAGE_CACHE_SIZE - remaining); - ++rdata->nr_iov; - len += remaining; - remaining = 0; + i, page->index, iov.iov_base, iov.iov_len); + memset(iov.iov_base + len, + '\0', PAGE_CACHE_SIZE - len); + rdata->tailsz = len; + len = 0; } else if (page->index > eof_index) { /* * The VFS will not try to do readahead past the @@ -2958,22 +3119,33 @@ cifs_readpages_marshal_iov(struct cifs_readdata *rdata, unsigned int remaining) * fill them until the writes are flushed. */ zero_user(page, 0, PAGE_CACHE_SIZE); - list_del(&page->lru); lru_cache_add_file(page); flush_dcache_page(page); SetPageUptodate(page); unlock_page(page); page_cache_release(page); + rdata->pages[i] = NULL; + rdata->nr_pages--; + continue; } else { /* no need to hold page hostage */ - list_del(&page->lru); lru_cache_add_file(page); unlock_page(page); page_cache_release(page); + rdata->pages[i] = NULL; + rdata->nr_pages--; + continue; } + + result = cifs_readv_from_socket(server, &iov, 1, iov.iov_len); + kunmap(page); + if (result < 0) + break; + + total_read += result; } - return len; + return total_read > 0 ? total_read : result; } static int cifs_readpages(struct file *file, struct address_space *mapping, @@ -3027,6 +3199,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, * the rdata->pages, then we want them in increasing order. */ while (!list_empty(page_list)) { + unsigned int i; unsigned int bytes = PAGE_CACHE_SIZE; unsigned int expected_index; unsigned int nr_pages = 1; @@ -3096,14 +3269,18 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, rdata->offset = offset; rdata->bytes = bytes; rdata->pid = pid; - rdata->marshal_iov = cifs_readpages_marshal_iov; - list_splice_init(&tmplist, &rdata->pages); + rdata->pagesz = PAGE_CACHE_SIZE; + rdata->read_into_pages = cifs_readpages_read_into_pages; + + list_for_each_entry_safe(page, tpage, &tmplist, lru) { + list_del(&page->lru); + rdata->pages[rdata->nr_pages++] = page; + } rc = cifs_retry_async_readv(rdata); if (rc != 0) { - list_for_each_entry_safe(page, tpage, &rdata->pages, - lru) { - list_del(&page->lru); + for (i = 0; i < rdata->nr_pages; i++) { + page = rdata->pages[i]; lru_cache_add_file(page); unlock_page(page); page_cache_release(page); @@ -3347,6 +3524,7 @@ void cifs_oplock_break(struct work_struct *work) oplock_break); struct inode *inode = cfile->dentry->d_inode; struct cifsInodeInfo *cinode = CIFS_I(inode); + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); int rc = 0; if (inode && S_ISREG(inode->i_mode)) { @@ -3374,10 +3552,8 @@ void cifs_oplock_break(struct work_struct *work) * disconnected since oplock already released by the server */ if (!cfile->oplock_break_cancelled) { - rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, - current->tgid, 0, 0, 0, 0, - LOCKING_ANDX_OPLOCK_RELEASE, false, - cinode->clientCanCacheRead ? 1 : 0); + rc = tcon->ses->server->ops->oplock_response(tcon, &cfile->fid, + cinode); cFYI(1, "Oplock release rc = %d", rc); } } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index cb79c7e..afdff79 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -282,7 +282,8 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb) fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL; } -int cifs_get_file_info_unix(struct file *filp) +static int +cifs_get_file_info_unix(struct file *filp) { int rc; unsigned int xid; @@ -294,7 +295,7 @@ int cifs_get_file_info_unix(struct file *filp) struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); xid = get_xid(); - rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data); + rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->fid.netfid, &find_data); if (!rc) { cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb); } else if (rc == -EREMOTE) { @@ -550,7 +551,8 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, fattr->cf_gid = cifs_sb->mnt_gid; } -int cifs_get_file_info(struct file *filp) +static int +cifs_get_file_info(struct file *filp) { int rc; unsigned int xid; @@ -560,9 +562,13 @@ int cifs_get_file_info(struct file *filp) struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsFileInfo *cfile = filp->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; + + if (!server->ops->query_file_info) + return -ENOSYS; xid = get_xid(); - rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data); + rc = server->ops->query_file_info(xid, tcon, &cfile->fid, &find_data); switch (rc) { case 0: cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false); @@ -601,7 +607,9 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, FILE_ALL_INFO *data, struct super_block *sb, int xid, const __u16 *fid) { - int rc = 0, tmprc; + bool validinum = false; + __u16 srchflgs; + int rc = 0, tmprc = ENOSYS; struct cifs_tcon *tcon; struct TCP_Server_Info *server; struct tcon_link *tlink; @@ -609,6 +617,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, char *buf = NULL; bool adjust_tz = false; struct cifs_fattr fattr; + struct cifs_search_info *srchinf = NULL; tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) @@ -647,9 +656,38 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, } else if (rc == -EREMOTE) { cifs_create_dfs_fattr(&fattr, sb); rc = 0; - } else { + } else if (rc == -EACCES && backup_cred(cifs_sb)) { + srchinf = kzalloc(sizeof(struct cifs_search_info), + GFP_KERNEL); + if (srchinf == NULL) { + rc = -ENOMEM; + goto cgii_exit; + } + + srchinf->endOfSearch = false; + srchinf->info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO; + + srchflgs = CIFS_SEARCH_CLOSE_ALWAYS | + CIFS_SEARCH_CLOSE_AT_END | + CIFS_SEARCH_BACKUP_SEARCH; + + rc = CIFSFindFirst(xid, tcon, full_path, + cifs_sb, NULL, srchflgs, srchinf, false); + if (!rc) { + data = + (FILE_ALL_INFO *)srchinf->srch_entries_start; + + cifs_dir_info_to_fattr(&fattr, + (FILE_DIRECTORY_INFO *)data, cifs_sb); + fattr.cf_uniqueid = le64_to_cpu( + ((SEARCH_ID_FULL_DIR_INFO *)data)->UniqueId); + validinum = true; + + cifs_buf_release(srchinf->ntwrk_buf_start); + } + kfree(srchinf); + } else goto cgii_exit; - } /* * If an inode wasn't passed in, then get the inode number @@ -660,23 +698,21 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, */ if (*inode == NULL) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { - if (server->ops->get_srv_inum) - tmprc = server->ops->get_srv_inum(xid, tcon, - cifs_sb, full_path, &fattr.cf_uniqueid, - data); - else - tmprc = -ENOSYS; - if (tmprc || !fattr.cf_uniqueid) { - cFYI(1, "GetSrvInodeNum rc %d", tmprc); - fattr.cf_uniqueid = iunique(sb, ROOT_I); - cifs_autodisable_serverino(cifs_sb); + if (validinum == false) { + if (server->ops->get_srv_inum) + tmprc = server->ops->get_srv_inum(xid, + tcon, cifs_sb, full_path, + &fattr.cf_uniqueid, data); + if (tmprc) { + cFYI(1, "GetSrvInodeNum rc %d", tmprc); + fattr.cf_uniqueid = iunique(sb, ROOT_I); + cifs_autodisable_serverino(cifs_sb); + } } - } else { + } else fattr.cf_uniqueid = iunique(sb, ROOT_I); - } - } else { + } else fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid; - } /* query for SFU type info if supported and needed */ if (fattr.cf_cifsattrs & ATTR_SYSTEM && @@ -876,25 +912,22 @@ out: return inode; } -static int +int cifs_set_file_info(struct inode *inode, struct iattr *attrs, unsigned int xid, - char *full_path, __u32 dosattr) + char *full_path, __u32 dosattr) { - int rc; - int oplock = 0; - __u16 netfid; - __u32 netpid; bool set_time = false; - struct cifsFileInfo *open_file; - struct cifsInodeInfo *cifsInode = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - struct tcon_link *tlink = NULL; - struct cifs_tcon *pTcon; + struct TCP_Server_Info *server; FILE_BASIC_INFO info_buf; if (attrs == NULL) return -EINVAL; + server = cifs_sb_master_tcon(cifs_sb)->ses->server; + if (!server->ops->set_file_info) + return -ENOSYS; + if (attrs->ia_valid & ATTR_ATIME) { set_time = true; info_buf.LastAccessTime = @@ -925,81 +958,17 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, unsigned int xid, info_buf.CreationTime = 0; /* don't change */ info_buf.Attributes = cpu_to_le32(dosattr); - /* - * If the file is already open for write, just use that fileid - */ - open_file = find_writable_file(cifsInode, true); - if (open_file) { - netfid = open_file->netfid; - netpid = open_file->pid; - pTcon = tlink_tcon(open_file->tlink); - goto set_via_filehandle; - } - - tlink = cifs_sb_tlink(cifs_sb); - if (IS_ERR(tlink)) { - rc = PTR_ERR(tlink); - tlink = NULL; - goto out; - } - pTcon = tlink_tcon(tlink); - - /* - * NT4 apparently returns success on this call, but it doesn't - * really work. - */ - if (!(pTcon->ses->flags & CIFS_SES_NT4)) { - rc = CIFSSMBSetPathInfo(xid, pTcon, full_path, - &info_buf, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc == 0) { - cifsInode->cifsAttrs = dosattr; - goto out; - } else if (rc != -EOPNOTSUPP && rc != -EINVAL) - goto out; - } - - cFYI(1, "calling SetFileInfo since SetPathInfo for " - "times not supported by this server"); - rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, - SYNCHRONIZE | FILE_WRITE_ATTRIBUTES, - CREATE_NOT_DIR, &netfid, &oplock, - NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - - if (rc != 0) { - if (rc == -EIO) - rc = -EINVAL; - goto out; - } - - netpid = current->tgid; - -set_via_filehandle: - rc = CIFSSMBSetFileInfo(xid, pTcon, &info_buf, netfid, netpid); - if (!rc) - cifsInode->cifsAttrs = dosattr; - - if (open_file == NULL) - CIFSSMBClose(xid, pTcon, netfid); - else - cifsFileInfo_put(open_file); -out: - if (tlink != NULL) - cifs_put_tlink(tlink); - return rc; + return server->ops->set_file_info(inode, full_path, &info_buf, xid); } /* - * open the given file (if it isn't already), set the DELETE_ON_CLOSE bit + * Open the given file (if it isn't already), set the DELETE_ON_CLOSE bit * and rename it to a random name that hopefully won't conflict with * anything else. */ -static int -cifs_rename_pending_delete(char *full_path, struct dentry *dentry, - unsigned int xid) +int +cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, + const unsigned int xid) { int oplock = 0; int rc; @@ -1136,6 +1105,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct tcon_link *tlink; struct cifs_tcon *tcon; + struct TCP_Server_Info *server; struct iattr *attrs = NULL; __u32 dosattr = 0, origattr = 0; @@ -1145,6 +1115,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) if (IS_ERR(tlink)) return PTR_ERR(tlink); tcon = tlink_tcon(tlink); + server = tcon->ses->server; xid = get_xid(); @@ -1167,8 +1138,12 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) } retry_std_delete: - rc = CIFSSMBDelFile(xid, tcon, full_path, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (!server->ops->unlink) { + rc = -ENOSYS; + goto psx_del_no_retry; + } + + rc = server->ops->unlink(xid, tcon, full_path, cifs_sb); psx_del_no_retry: if (!rc) { @@ -1177,9 +1152,14 @@ psx_del_no_retry: } else if (rc == -ENOENT) { d_drop(dentry); } else if (rc == -ETXTBSY) { - rc = cifs_rename_pending_delete(full_path, dentry, xid); - if (rc == 0) - cifs_drop_nlink(inode); + if (server->ops->rename_pending_delete) { + rc = server->ops->rename_pending_delete(full_path, + dentry, xid); + if (rc == 0) + cifs_drop_nlink(inode); + } + if (rc == -ETXTBSY) + rc = -EBUSY; } else if ((rc == -EACCES) && (dosattr == 0) && inode) { attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); if (attrs == NULL) { @@ -1227,34 +1207,33 @@ unlink_out: } static int -cifs_mkdir_qinfo(struct inode *inode, struct dentry *dentry, umode_t mode, +cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode, const char *full_path, struct cifs_sb_info *cifs_sb, struct cifs_tcon *tcon, const unsigned int xid) { int rc = 0; - struct inode *newinode = NULL; + struct inode *inode = NULL; if (tcon->unix_ext) - rc = cifs_get_inode_info_unix(&newinode, full_path, inode->i_sb, + rc = cifs_get_inode_info_unix(&inode, full_path, parent->i_sb, xid); else - rc = cifs_get_inode_info(&newinode, full_path, NULL, - inode->i_sb, xid, NULL); + rc = cifs_get_inode_info(&inode, full_path, NULL, parent->i_sb, + xid, NULL); + if (rc) return rc; - d_instantiate(dentry, newinode); /* * setting nlink not necessary except in cases where we failed to get it - * from the server or was set bogus + * from the server or was set bogus. Also, since this is a brand new + * inode, no need to grab the i_lock before setting the i_nlink. */ - spin_lock(&dentry->d_inode->i_lock); - if ((dentry->d_inode) && (dentry->d_inode->i_nlink < 2)) - set_nlink(dentry->d_inode, 2); - spin_unlock(&dentry->d_inode->i_lock); + if (inode->i_nlink < 2) + set_nlink(inode, 2); mode &= ~current_umask(); /* must turn on setgid bit if parent dir has it */ - if (inode->i_mode & S_ISGID) + if (parent->i_mode & S_ISGID) mode |= S_ISGID; if (tcon->unix_ext) { @@ -1267,8 +1246,8 @@ cifs_mkdir_qinfo(struct inode *inode, struct dentry *dentry, umode_t mode, }; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { args.uid = (__u64)current_fsuid(); - if (inode->i_mode & S_ISGID) - args.gid = (__u64)inode->i_gid; + if (parent->i_mode & S_ISGID) + args.gid = (__u64)parent->i_gid; else args.gid = (__u64)current_fsgid(); } else { @@ -1283,22 +1262,20 @@ cifs_mkdir_qinfo(struct inode *inode, struct dentry *dentry, umode_t mode, struct TCP_Server_Info *server = tcon->ses->server; if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) && (mode & S_IWUGO) == 0 && server->ops->mkdir_setinfo) - server->ops->mkdir_setinfo(newinode, full_path, cifs_sb, + server->ops->mkdir_setinfo(inode, full_path, cifs_sb, tcon, xid); - if (dentry->d_inode) { - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) - dentry->d_inode->i_mode = (mode | S_IFDIR); - - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { - dentry->d_inode->i_uid = current_fsuid(); - if (inode->i_mode & S_ISGID) - dentry->d_inode->i_gid = inode->i_gid; - else - dentry->d_inode->i_gid = - current_fsgid(); - } + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) + inode->i_mode = (mode | S_IFDIR); + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { + inode->i_uid = current_fsuid(); + if (inode->i_mode & S_ISGID) + inode->i_gid = parent->i_gid; + else + inode->i_gid = current_fsgid(); } } + d_instantiate(dentry, inode); return rc; } @@ -1495,29 +1472,32 @@ rmdir_exit: } static int -cifs_do_rename(unsigned int xid, struct dentry *from_dentry, - const char *fromPath, struct dentry *to_dentry, - const char *toPath) +cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, + const char *from_path, struct dentry *to_dentry, + const char *to_path) { struct cifs_sb_info *cifs_sb = CIFS_SB(from_dentry->d_sb); struct tcon_link *tlink; - struct cifs_tcon *pTcon; + struct cifs_tcon *tcon; + struct TCP_Server_Info *server; __u16 srcfid; int oplock, rc; tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); - pTcon = tlink_tcon(tlink); + tcon = tlink_tcon(tlink); + server = tcon->ses->server; + + if (!server->ops->rename) + return -ENOSYS; /* try path-based rename first */ - rc = CIFSSMBRename(xid, pTcon, fromPath, toPath, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + rc = server->ops->rename(xid, tcon, from_path, to_path, cifs_sb); /* - * don't bother with rename by filehandle unless file is busy and - * source Note that cross directory moves do not work with + * Don't bother with rename by filehandle unless file is busy and + * source. Note that cross directory moves do not work with * rename by filehandle to various Windows servers. */ if (rc == 0 || rc != -ETXTBSY) @@ -1528,29 +1508,28 @@ cifs_do_rename(unsigned int xid, struct dentry *from_dentry, goto do_rename_exit; /* open the file to be renamed -- we need DELETE perms */ - rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE, + rc = CIFSSMBOpen(xid, tcon, from_path, FILE_OPEN, DELETE, CREATE_NOT_DIR, &srcfid, &oplock, NULL, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc == 0) { - rc = CIFSSMBRenameOpenFile(xid, pTcon, srcfid, + rc = CIFSSMBRenameOpenFile(xid, tcon, srcfid, (const char *) to_dentry->d_name.name, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - - CIFSSMBClose(xid, pTcon, srcfid); + CIFSSMBClose(xid, tcon, srcfid); } do_rename_exit: cifs_put_tlink(tlink); return rc; } -int cifs_rename(struct inode *source_dir, struct dentry *source_dentry, - struct inode *target_dir, struct dentry *target_dentry) +int +cifs_rename(struct inode *source_dir, struct dentry *source_dentry, + struct inode *target_dir, struct dentry *target_dentry) { - char *fromName = NULL; - char *toName = NULL; + char *from_name = NULL; + char *to_name = NULL; struct cifs_sb_info *cifs_sb; struct tcon_link *tlink; struct cifs_tcon *tcon; @@ -1571,25 +1550,25 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry, * we already have the rename sem so we do not need to * grab it again here to protect the path integrity */ - fromName = build_path_from_dentry(source_dentry); - if (fromName == NULL) { + from_name = build_path_from_dentry(source_dentry); + if (from_name == NULL) { rc = -ENOMEM; goto cifs_rename_exit; } - toName = build_path_from_dentry(target_dentry); - if (toName == NULL) { + to_name = build_path_from_dentry(target_dentry); + if (to_name == NULL) { rc = -ENOMEM; goto cifs_rename_exit; } - rc = cifs_do_rename(xid, source_dentry, fromName, - target_dentry, toName); + rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry, + to_name); if (rc == -EEXIST && tcon->unix_ext) { /* - * Are src and dst hardlinks of same inode? We can - * only tell with unix extensions enabled + * Are src and dst hardlinks of same inode? We can only tell + * with unix extensions enabled. */ info_buf_source = kmalloc(2 * sizeof(FILE_UNIX_BASIC_INFO), @@ -1600,19 +1579,19 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry, } info_buf_target = info_buf_source + 1; - tmprc = CIFSSMBUnixQPathInfo(xid, tcon, fromName, - info_buf_source, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + tmprc = CIFSSMBUnixQPathInfo(xid, tcon, from_name, + info_buf_source, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); if (tmprc != 0) goto unlink_target; - tmprc = CIFSSMBUnixQPathInfo(xid, tcon, toName, - info_buf_target, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + tmprc = CIFSSMBUnixQPathInfo(xid, tcon, to_name, + info_buf_target, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); if (tmprc == 0 && (info_buf_source->UniqueId == info_buf_target->UniqueId)) { @@ -1620,8 +1599,11 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry, rc = 0; goto cifs_rename_exit; } - } /* else ... BB we could add the same check for Windows by - checking the UniqueId via FILE_INTERNAL_INFO */ + } + /* + * else ... BB we could add the same check for Windows by + * checking the UniqueId via FILE_INTERNAL_INFO + */ unlink_target: /* Try unlinking the target dentry if it's not negative */ @@ -1629,15 +1611,14 @@ unlink_target: tmprc = cifs_unlink(target_dir, target_dentry); if (tmprc) goto cifs_rename_exit; - - rc = cifs_do_rename(xid, source_dentry, fromName, - target_dentry, toName); + rc = cifs_do_rename(xid, source_dentry, from_name, + target_dentry, to_name); } cifs_rename_exit: kfree(info_buf_source); - kfree(fromName); - kfree(toName); + kfree(from_name); + kfree(to_name); free_xid(xid); cifs_put_tlink(tlink); return rc; @@ -1862,7 +1843,8 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs, struct cifsInodeInfo *cifsInode = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink = NULL; - struct cifs_tcon *pTcon = NULL; + struct cifs_tcon *tcon = NULL; + struct TCP_Server_Info *server; struct cifs_io_parms io_parms; /* @@ -1876,19 +1858,21 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs, */ open_file = find_writable_file(cifsInode, true); if (open_file) { - __u16 nfid = open_file->netfid; - __u32 npid = open_file->pid; - pTcon = tlink_tcon(open_file->tlink); - rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid, - npid, false); + tcon = tlink_tcon(open_file->tlink); + server = tcon->ses->server; + if (server->ops->set_file_size) + rc = server->ops->set_file_size(xid, tcon, open_file, + attrs->ia_size, false); + else + rc = -ENOSYS; cifsFileInfo_put(open_file); cFYI(1, "SetFSize for attrs rc = %d", rc); if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { unsigned int bytes_written; - io_parms.netfid = nfid; - io_parms.pid = npid; - io_parms.tcon = pTcon; + io_parms.netfid = open_file->fid.netfid; + io_parms.pid = open_file->pid; + io_parms.tcon = tcon; io_parms.offset = 0; io_parms.length = attrs->ia_size; rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, @@ -1898,52 +1882,55 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs, } else rc = -EINVAL; - if (rc != 0) { - if (pTcon == NULL) { - tlink = cifs_sb_tlink(cifs_sb); - if (IS_ERR(tlink)) - return PTR_ERR(tlink); - pTcon = tlink_tcon(tlink); - } + if (!rc) + goto set_size_out; - /* Set file size by pathname rather than by handle - either because no valid, writeable file handle for - it was found or because there was an error setting - it by handle */ - rc = CIFSSMBSetEOF(xid, pTcon, full_path, attrs->ia_size, - false, cifs_sb->local_nls, + if (tcon == NULL) { + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + server = tcon->ses->server; + } + + /* + * Set file size by pathname rather than by handle either because no + * valid, writeable file handle for it was found or because there was + * an error setting it by handle. + */ + if (server->ops->set_path_size) + rc = server->ops->set_path_size(xid, tcon, full_path, + attrs->ia_size, cifs_sb, false); + else + rc = -ENOSYS; + cFYI(1, "SetEOF by path (setattrs) rc = %d", rc); + if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { + __u16 netfid; + int oplock = 0; + + rc = SMBLegacyOpen(xid, tcon, full_path, FILE_OPEN, + GENERIC_WRITE, CREATE_NOT_DIR, &netfid, + &oplock, NULL, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - cFYI(1, "SetEOF by path (setattrs) rc = %d", rc); - if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { - __u16 netfid; - int oplock = 0; - - rc = SMBLegacyOpen(xid, pTcon, full_path, - FILE_OPEN, GENERIC_WRITE, - CREATE_NOT_DIR, &netfid, &oplock, NULL, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc == 0) { - unsigned int bytes_written; - - io_parms.netfid = netfid; - io_parms.pid = current->tgid; - io_parms.tcon = pTcon; - io_parms.offset = 0; - io_parms.length = attrs->ia_size; - rc = CIFSSMBWrite(xid, &io_parms, - &bytes_written, - NULL, NULL, 1); - cFYI(1, "wrt seteof rc %d", rc); - CIFSSMBClose(xid, pTcon, netfid); - } + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc == 0) { + unsigned int bytes_written; + + io_parms.netfid = netfid; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.offset = 0; + io_parms.length = attrs->ia_size; + rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, NULL, + NULL, 1); + cFYI(1, "wrt seteof rc %d", rc); + CIFSSMBClose(xid, tcon, netfid); } - if (tlink) - cifs_put_tlink(tlink); } + if (tlink) + cifs_put_tlink(tlink); +set_size_out: if (rc == 0) { cifsInode->server_eof = attrs->ia_size; cifs_setsize(inode, attrs->ia_size); @@ -2050,7 +2037,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) args->device = 0; open_file = find_writable_file(cifsInode, true); if (open_file) { - u16 nfid = open_file->netfid; + u16 nfid = open_file->fid.netfid; u32 npid = open_file->pid; pTcon = tlink_tcon(open_file->tlink); rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid); diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index ae082a6..fd5009d 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -28,8 +28,6 @@ #include "cifs_debug.h" #include "cifsfs.h" -#define CIFS_IOC_CHECKUMOUNT _IO(0xCF, 2) - long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) { struct inode *inode = filep->f_dentry->d_inode; @@ -51,23 +49,6 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) cifs_sb = CIFS_SB(inode->i_sb); switch (command) { - static bool warned = false; - case CIFS_IOC_CHECKUMOUNT: - if (!warned) { - warned = true; - cERROR(1, "the CIFS_IOC_CHECKMOUNT ioctl will " - "be deprecated in 3.7. Please " - "migrate away from the use of " - "umount.cifs"); - } - cFYI(1, "User unmount attempted"); - if (cifs_sb->mnt_uid == current_uid()) - rc = 0; - else { - rc = -EACCES; - cFYI(1, "uids do not match"); - } - break; #ifdef CONFIG_CIFS_POSIX case FS_IOC_GETFLAGS: if (pSMBFile == NULL) @@ -75,8 +56,9 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) tcon = tlink_tcon(pSMBFile->tlink); caps = le64_to_cpu(tcon->fsUnixInfo.Capability); if (CIFS_UNIX_EXTATTR_CAP & caps) { - rc = CIFSGetExtAttr(xid, tcon, pSMBFile->netfid, - &ExtAttrBits, &ExtAttrMask); + rc = CIFSGetExtAttr(xid, tcon, + pSMBFile->fid.netfid, + &ExtAttrBits, &ExtAttrMask); if (rc == 0) rc = put_user(ExtAttrBits & FS_FL_USER_VISIBLE, @@ -94,8 +76,12 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) rc = -EFAULT; break; } - /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid, - extAttrBits, &ExtAttrMask);*/ + /* + * rc = CIFSGetExtAttr(xid, tcon, + * pSMBFile->fid.netfid, + * extAttrBits, + * &ExtAttrMask); + */ } cFYI(1, "set flags not implemented yet"); break; diff --git a/fs/cifs/link.c b/fs/cifs/link.c index e6ce3b1..51dc2fb 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -391,72 +391,86 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode, { int rc = -EACCES; unsigned int xid; - char *fromName = NULL; - char *toName = NULL; + char *from_name = NULL; + char *to_name = NULL; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink; - struct cifs_tcon *pTcon; + struct cifs_tcon *tcon; + struct TCP_Server_Info *server; struct cifsInodeInfo *cifsInode; tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); - pTcon = tlink_tcon(tlink); + tcon = tlink_tcon(tlink); xid = get_xid(); - fromName = build_path_from_dentry(old_file); - toName = build_path_from_dentry(direntry); - if ((fromName == NULL) || (toName == NULL)) { + from_name = build_path_from_dentry(old_file); + to_name = build_path_from_dentry(direntry); + if ((from_name == NULL) || (to_name == NULL)) { rc = -ENOMEM; goto cifs_hl_exit; } - if (pTcon->unix_ext) - rc = CIFSUnixCreateHardLink(xid, pTcon, fromName, toName, + if (tcon->unix_ext) + rc = CIFSUnixCreateHardLink(xid, tcon, from_name, to_name, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); else { - rc = CIFSCreateHardLink(xid, pTcon, fromName, toName, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + server = tcon->ses->server; + if (!server->ops->create_hardlink) + return -ENOSYS; + rc = server->ops->create_hardlink(xid, tcon, from_name, to_name, + cifs_sb); if ((rc == -EIO) || (rc == -EINVAL)) rc = -EOPNOTSUPP; } d_drop(direntry); /* force new lookup from server of target */ - /* if source file is cached (oplocked) revalidate will not go to server - until the file is closed or oplock broken so update nlinks locally */ + /* + * if source file is cached (oplocked) revalidate will not go to server + * until the file is closed or oplock broken so update nlinks locally + */ if (old_file->d_inode) { cifsInode = CIFS_I(old_file->d_inode); if (rc == 0) { spin_lock(&old_file->d_inode->i_lock); inc_nlink(old_file->d_inode); spin_unlock(&old_file->d_inode->i_lock); -/* BB should we make this contingent on superblock flag NOATIME? */ -/* old_file->d_inode->i_ctime = CURRENT_TIME;*/ - /* parent dir timestamps will update from srv - within a second, would it really be worth it - to set the parent dir cifs inode time to zero - to force revalidate (faster) for it too? */ + /* + * BB should we make this contingent on superblock flag + * NOATIME? + */ + /* old_file->d_inode->i_ctime = CURRENT_TIME; */ + /* + * parent dir timestamps will update from srv within a + * second, would it really be worth it to set the parent + * dir cifs inode time to zero to force revalidate + * (faster) for it too? + */ } - /* if not oplocked will force revalidate to get info - on source file from srv */ + /* + * if not oplocked will force revalidate to get info on source + * file from srv + */ cifsInode->time = 0; - /* Will update parent dir timestamps from srv within a second. - Would it really be worth it to set the parent dir (cifs - inode) time field to zero to force revalidate on parent - directory faster ie - CIFS_I(inode)->time = 0; */ + /* + * Will update parent dir timestamps from srv within a second. + * Would it really be worth it to set the parent dir (cifs + * inode) time field to zero to force revalidate on parent + * directory faster ie + * + * CIFS_I(inode)->time = 0; + */ } cifs_hl_exit: - kfree(fromName); - kfree(toName); + kfree(from_name); + kfree(to_name); free_xid(xid); cifs_put_tlink(tlink); return rc; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index ce41fee..3a00c0d 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -466,7 +466,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) list_for_each(tmp2, &tcon->openFileList) { netfile = list_entry(tmp2, struct cifsFileInfo, tlist); - if (pSMB->Fid != netfile->netfid) + if (pSMB->Fid != netfile->fid.netfid) continue; cFYI(1, "file id match, oplock break"); @@ -579,3 +579,33 @@ backup_cred(struct cifs_sb_info *cifs_sb) return false; } + +void +cifs_del_pending_open(struct cifs_pending_open *open) +{ + spin_lock(&cifs_file_list_lock); + list_del(&open->olist); + spin_unlock(&cifs_file_list_lock); +} + +void +cifs_add_pending_open_locked(struct cifs_fid *fid, struct tcon_link *tlink, + struct cifs_pending_open *open) +{ +#ifdef CONFIG_CIFS_SMB2 + memcpy(open->lease_key, fid->lease_key, SMB2_LEASE_KEY_SIZE); +#endif + open->oplock = CIFS_OPLOCK_NO_CHANGE; + open->tlink = tlink; + fid->pending_open = open; + list_add_tail(&open->olist, &tlink_tcon(tlink)->pending_opens); +} + +void +cifs_add_pending_open(struct cifs_fid *fid, struct tcon_link *tlink, + struct cifs_pending_open *open) +{ + spin_lock(&cifs_file_list_lock); + cifs_add_pending_open_locked(fid, tlink, open); + spin_unlock(&cifs_file_list_lock); +} diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index 581c225..d5ce9e2 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -110,7 +110,7 @@ static const struct smb_to_posix_error mapping_table_ERRSRV[] = { {ERRnoroom, -ENOSPC}, {ERRrmuns, -EUSERS}, {ERRtimeout, -ETIME}, - {ERRnoresource, -ENOBUFS}, + {ERRnoresource, -EREMOTEIO}, {ERRtoomanyuids, -EUSERS}, {ERRbaduid, -EACCES}, {ERRusempx, -EIO}, @@ -412,7 +412,7 @@ static const struct { from NT_STATUS_INSUFFICIENT_RESOURCES to NT_STATUS_INSUFF_SERVER_RESOURCES during the session setup } */ { - ERRDOS, ERRnomem, NT_STATUS_INSUFFICIENT_RESOURCES}, { + ERRDOS, ERRnoresource, NT_STATUS_INSUFFICIENT_RESOURCES}, { ERRDOS, ERRbadpath, NT_STATUS_DFS_EXIT_PATH_FOUND}, { ERRDOS, 23, NT_STATUS_DEVICE_DATA_ERROR}, { ERRHRD, ERRgeneral, NT_STATUS_DEVICE_NOT_CONNECTED}, { @@ -682,7 +682,7 @@ static const struct { ERRHRD, ERRgeneral, NT_STATUS_NO_USER_SESSION_KEY}, { ERRDOS, 59, NT_STATUS_USER_SESSION_DELETED}, { ERRHRD, ERRgeneral, NT_STATUS_RESOURCE_LANG_NOT_FOUND}, { - ERRDOS, ERRnomem, NT_STATUS_INSUFF_SERVER_RESOURCES}, { + ERRDOS, ERRnoresource, NT_STATUS_INSUFF_SERVER_RESOURCES}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_BUFFER_SIZE}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_ADDRESS_COMPONENT}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_ADDRESS_WILDCARD}, { @@ -913,8 +913,9 @@ map_smb_to_linux_error(char *buf, bool logErr) * portion, the number of word parameters and the data portion of the message */ unsigned int -smbCalcSize(struct smb_hdr *ptr) +smbCalcSize(void *buf) { + struct smb_hdr *ptr = (struct smb_hdr *)buf; return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) + 2 /* size of the bcc field */ + get_bcc(ptr)); } diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index d87f826..f9b5d3d 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -151,7 +151,7 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) } } -static void +void cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info, struct cifs_sb_info *cifs_sb) { @@ -220,7 +220,8 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb, } */ -static int initiate_cifs_search(const unsigned int xid, struct file *file) +static int +initiate_cifs_search(const unsigned int xid, struct file *file) { __u16 search_flags; int rc = 0; @@ -229,6 +230,7 @@ static int initiate_cifs_search(const unsigned int xid, struct file *file) struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); struct tcon_link *tlink = NULL; struct cifs_tcon *tcon; + struct TCP_Server_Info *server; if (file->private_data == NULL) { tlink = cifs_sb_tlink(cifs_sb); @@ -248,6 +250,13 @@ static int initiate_cifs_search(const unsigned int xid, struct file *file) tcon = tlink_tcon(cifsFile->tlink); } + server = tcon->ses->server; + + if (!server->ops->query_dir_first) { + rc = -ENOSYS; + goto error_exit; + } + cifsFile->invalidHandle = true; cifsFile->srch_inf.endOfSearch = false; @@ -278,10 +287,10 @@ ffirst_retry: if (backup_cred(cifs_sb)) search_flags |= CIFS_SEARCH_BACKUP_SEARCH; - rc = CIFSFindFirst(xid, tcon, full_path, cifs_sb->local_nls, - &cifsFile->netfid, search_flags, &cifsFile->srch_inf, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR, CIFS_DIR_SEP(cifs_sb)); + rc = server->ops->query_dir_first(xid, tcon, full_path, cifs_sb, + &cifsFile->fid, search_flags, + &cifsFile->srch_inf); + if (rc == 0) cifsFile->invalidHandle = false; /* BB add following call to handle readdir on new NTFS symlink errors @@ -501,62 +510,67 @@ static int cifs_save_resume_key(const char *current_entry, return rc; } -/* find the corresponding entry in the search */ -/* Note that the SMB server returns search entries for . and .. which - complicates logic here if we choose to parse for them and we do not - assume that they are located in the findfirst return buffer.*/ -/* We start counting in the buffer with entry 2 and increment for every - entry (do not increment for . or .. entry) */ -static int find_cifs_entry(const unsigned int xid, struct cifs_tcon *pTcon, - struct file *file, char **ppCurrentEntry, int *num_to_ret) +/* + * Find the corresponding entry in the search. Note that the SMB server returns + * search entries for . and .. which complicates logic here if we choose to + * parse for them and we do not assume that they are located in the findfirst + * return buffer. We start counting in the buffer with entry 2 and increment for + * every entry (do not increment for . or .. entry). + */ +static int +find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, + struct file *file, char **current_entry, int *num_to_ret) { __u16 search_flags; int rc = 0; int pos_in_buf = 0; loff_t first_entry_in_buffer; loff_t index_to_find = file->f_pos; - struct cifsFileInfo *cifsFile = file->private_data; + struct cifsFileInfo *cfile = file->private_data; struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + struct TCP_Server_Info *server = tcon->ses->server; /* check if index in the buffer */ - if ((cifsFile == NULL) || (ppCurrentEntry == NULL) || - (num_to_ret == NULL)) + if (!server->ops->query_dir_first || !server->ops->query_dir_next) + return -ENOSYS; + + if ((cfile == NULL) || (current_entry == NULL) || (num_to_ret == NULL)) return -ENOENT; - *ppCurrentEntry = NULL; - first_entry_in_buffer = - cifsFile->srch_inf.index_of_last_entry - - cifsFile->srch_inf.entries_in_buffer; + *current_entry = NULL; + first_entry_in_buffer = cfile->srch_inf.index_of_last_entry - + cfile->srch_inf.entries_in_buffer; - /* if first entry in buf is zero then is first buffer - in search response data which means it is likely . and .. - will be in this buffer, although some servers do not return - . and .. for the root of a drive and for those we need - to start two entries earlier */ + /* + * If first entry in buf is zero then is first buffer + * in search response data which means it is likely . and .. + * will be in this buffer, although some servers do not return + * . and .. for the root of a drive and for those we need + * to start two entries earlier. + */ dump_cifs_file_struct(file, "In fce "); - if (((index_to_find < cifsFile->srch_inf.index_of_last_entry) && - is_dir_changed(file)) || - (index_to_find < first_entry_in_buffer)) { + if (((index_to_find < cfile->srch_inf.index_of_last_entry) && + is_dir_changed(file)) || (index_to_find < first_entry_in_buffer)) { /* close and restart search */ cFYI(1, "search backing up - close and restart search"); spin_lock(&cifs_file_list_lock); - if (!cifsFile->srch_inf.endOfSearch && - !cifsFile->invalidHandle) { - cifsFile->invalidHandle = true; + if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) { + cfile->invalidHandle = true; spin_unlock(&cifs_file_list_lock); - CIFSFindClose(xid, pTcon, cifsFile->netfid); + if (server->ops->close) + server->ops->close(xid, tcon, &cfile->fid); } else spin_unlock(&cifs_file_list_lock); - if (cifsFile->srch_inf.ntwrk_buf_start) { + if (cfile->srch_inf.ntwrk_buf_start) { cFYI(1, "freeing SMB ff cache buf on search rewind"); - if (cifsFile->srch_inf.smallBuf) - cifs_small_buf_release(cifsFile->srch_inf. + if (cfile->srch_inf.smallBuf) + cifs_small_buf_release(cfile->srch_inf. ntwrk_buf_start); else - cifs_buf_release(cifsFile->srch_inf. + cifs_buf_release(cfile->srch_inf. ntwrk_buf_start); - cifsFile->srch_inf.ntwrk_buf_start = NULL; + cfile->srch_inf.ntwrk_buf_start = NULL; } rc = initiate_cifs_search(xid, file); if (rc) { @@ -565,65 +579,64 @@ static int find_cifs_entry(const unsigned int xid, struct cifs_tcon *pTcon, return rc; } /* FindFirst/Next set last_entry to NULL on malformed reply */ - if (cifsFile->srch_inf.last_entry) - cifs_save_resume_key(cifsFile->srch_inf.last_entry, - cifsFile); + if (cfile->srch_inf.last_entry) + cifs_save_resume_key(cfile->srch_inf.last_entry, cfile); } search_flags = CIFS_SEARCH_CLOSE_AT_END | CIFS_SEARCH_RETURN_RESUME; if (backup_cred(cifs_sb)) search_flags |= CIFS_SEARCH_BACKUP_SEARCH; - while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) && - (rc == 0) && !cifsFile->srch_inf.endOfSearch) { + while ((index_to_find >= cfile->srch_inf.index_of_last_entry) && + (rc == 0) && !cfile->srch_inf.endOfSearch) { cFYI(1, "calling findnext2"); - rc = CIFSFindNext(xid, pTcon, cifsFile->netfid, search_flags, - &cifsFile->srch_inf); + rc = server->ops->query_dir_next(xid, tcon, &cfile->fid, + search_flags, + &cfile->srch_inf); /* FindFirst/Next set last_entry to NULL on malformed reply */ - if (cifsFile->srch_inf.last_entry) - cifs_save_resume_key(cifsFile->srch_inf.last_entry, - cifsFile); + if (cfile->srch_inf.last_entry) + cifs_save_resume_key(cfile->srch_inf.last_entry, cfile); if (rc) return -ENOENT; } - if (index_to_find < cifsFile->srch_inf.index_of_last_entry) { + if (index_to_find < cfile->srch_inf.index_of_last_entry) { /* we found the buffer that contains the entry */ /* scan and find it */ int i; - char *current_entry; - char *end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + - smbCalcSize((struct smb_hdr *) - cifsFile->srch_inf.ntwrk_buf_start); - - current_entry = cifsFile->srch_inf.srch_entries_start; - first_entry_in_buffer = cifsFile->srch_inf.index_of_last_entry - - cifsFile->srch_inf.entries_in_buffer; + char *cur_ent; + char *end_of_smb = cfile->srch_inf.ntwrk_buf_start + + server->ops->calc_smb_size( + cfile->srch_inf.ntwrk_buf_start); + + cur_ent = cfile->srch_inf.srch_entries_start; + first_entry_in_buffer = cfile->srch_inf.index_of_last_entry + - cfile->srch_inf.entries_in_buffer; pos_in_buf = index_to_find - first_entry_in_buffer; cFYI(1, "found entry - pos_in_buf %d", pos_in_buf); - for (i = 0; (i < (pos_in_buf)) && (current_entry != NULL); i++) { + for (i = 0; (i < (pos_in_buf)) && (cur_ent != NULL); i++) { /* go entry by entry figuring out which is first */ - current_entry = nxt_dir_entry(current_entry, end_of_smb, - cifsFile->srch_inf.info_level); + cur_ent = nxt_dir_entry(cur_ent, end_of_smb, + cfile->srch_inf.info_level); } - if ((current_entry == NULL) && (i < pos_in_buf)) { + if ((cur_ent == NULL) && (i < pos_in_buf)) { /* BB fixme - check if we should flag this error */ cERROR(1, "reached end of buf searching for pos in buf" - " %d index to find %lld rc %d", - pos_in_buf, index_to_find, rc); + " %d index to find %lld rc %d", pos_in_buf, + index_to_find, rc); } rc = 0; - *ppCurrentEntry = current_entry; + *current_entry = cur_ent; } else { cFYI(1, "index not in buffer - could not findnext into it"); return 0; } - if (pos_in_buf >= cifsFile->srch_inf.entries_in_buffer) { + if (pos_in_buf >= cfile->srch_inf.entries_in_buffer) { cFYI(1, "can not return entries pos_in_buf beyond last"); *num_to_ret = 0; } else - *num_to_ret = cifsFile->srch_inf.entries_in_buffer - pos_in_buf; + *num_to_ret = cfile->srch_inf.entries_in_buffer - pos_in_buf; return rc; } @@ -723,7 +736,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) int rc = 0; unsigned int xid; int i; - struct cifs_tcon *pTcon; + struct cifs_tcon *tcon; struct cifsFileInfo *cifsFile = NULL; char *current_entry; int num_to_fill = 0; @@ -781,12 +794,12 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) } } /* else { cifsFile->invalidHandle = true; - CIFSFindClose(xid, pTcon, cifsFile->netfid); + tcon->ses->server->close(xid, tcon, &cifsFile->fid); } */ - pTcon = tlink_tcon(cifsFile->tlink); - rc = find_cifs_entry(xid, pTcon, file, - ¤t_entry, &num_to_fill); + tcon = tlink_tcon(cifsFile->tlink); + rc = find_cifs_entry(xid, tcon, file, ¤t_entry, + &num_to_fill); if (rc) { cFYI(1, "fce error %d", rc); goto rddir2_exit; @@ -798,7 +811,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) } cFYI(1, "loop through %d times filling dir for net buf %p", num_to_fill, cifsFile->srch_inf.ntwrk_buf_start); - max_len = smbCalcSize((struct smb_hdr *) + max_len = tcon->ses->server->ops->calc_smb_size( cifsFile->srch_inf.ntwrk_buf_start); end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len; @@ -815,10 +828,12 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) num_to_fill, i); break; } - /* if buggy server returns . and .. late do - we want to check for that here? */ - rc = cifs_filldir(current_entry, file, - filldir, direntry, tmp_buf, max_len); + /* + * if buggy server returns . and .. late do we want to + * check for that here? + */ + rc = cifs_filldir(current_entry, file, filldir, + direntry, tmp_buf, max_len); if (rc == -EOVERFLOW) { rc = 0; break; diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 382c06d..76809f4 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -876,7 +876,8 @@ ssetup_ntlmssp_authenticate: pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base; smb_buf = (struct smb_hdr *)iov[0].iov_base; - if ((type == RawNTLMSSP) && (smb_buf->Status.CifsError == + if ((type == RawNTLMSSP) && (resp_buf_type != CIFS_NO_BUFFER) && + (smb_buf->Status.CifsError == cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) { if (phase != NtLmNegotiate) { cERROR(1, "Unexpected more processing error"); diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 3129ac7..56cc4be 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -17,6 +17,8 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include <linux/pagemap.h> +#include <linux/vfs.h> #include "cifsglob.h" #include "cifsproto.h" #include "cifs_debug.h" @@ -63,7 +65,7 @@ send_nt_cancel(struct TCP_Server_Info *server, void *buf, static bool cifs_compare_fids(struct cifsFileInfo *ob1, struct cifsFileInfo *ob2) { - return ob1->netfid == ob2->netfid; + return ob1->fid.netfid == ob2->fid.netfid; } static unsigned int @@ -410,6 +412,83 @@ cifs_negotiate(const unsigned int xid, struct cifs_ses *ses) return rc; } +static unsigned int +cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) +{ + __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability); + struct TCP_Server_Info *server = tcon->ses->server; + unsigned int wsize; + + /* start with specified wsize, or default */ + if (volume_info->wsize) + wsize = volume_info->wsize; + else if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_WRITE_CAP)) + wsize = CIFS_DEFAULT_IOSIZE; + else + wsize = CIFS_DEFAULT_NON_POSIX_WSIZE; + + /* can server support 24-bit write sizes? (via UNIX extensions) */ + if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP)) + wsize = min_t(unsigned int, wsize, CIFS_MAX_RFC1002_WSIZE); + + /* + * no CAP_LARGE_WRITE_X or is signing enabled without CAP_UNIX set? + * Limit it to max buffer offered by the server, minus the size of the + * WRITEX header, not including the 4 byte RFC1001 length. + */ + if (!(server->capabilities & CAP_LARGE_WRITE_X) || + (!(server->capabilities & CAP_UNIX) && + (server->sec_mode & (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED)))) + wsize = min_t(unsigned int, wsize, + server->maxBuf - sizeof(WRITE_REQ) + 4); + + /* hard limit of CIFS_MAX_WSIZE */ + wsize = min_t(unsigned int, wsize, CIFS_MAX_WSIZE); + + return wsize; +} + +static unsigned int +cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) +{ + __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability); + struct TCP_Server_Info *server = tcon->ses->server; + unsigned int rsize, defsize; + + /* + * Set default value... + * + * HACK alert! Ancient servers have very small buffers. Even though + * MS-CIFS indicates that servers are only limited by the client's + * bufsize for reads, testing against win98se shows that it throws + * INVALID_PARAMETER errors if you try to request too large a read. + * OS/2 just sends back short reads. + * + * If the server doesn't advertise CAP_LARGE_READ_X, then assume that + * it can't handle a read request larger than its MaxBufferSize either. + */ + if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_READ_CAP)) + defsize = CIFS_DEFAULT_IOSIZE; + else if (server->capabilities & CAP_LARGE_READ_X) + defsize = CIFS_DEFAULT_NON_POSIX_RSIZE; + else + defsize = server->maxBuf - sizeof(READ_RSP); + + rsize = volume_info->rsize ? volume_info->rsize : defsize; + + /* + * no CAP_LARGE_READ_X? Then MS-CIFS states that we must limit this to + * the client's MaxBufferSize. + */ + if (!(server->capabilities & CAP_LARGE_READ_X)) + rsize = min_t(unsigned int, CIFSMaxBufSize, rsize); + + /* hard limit of CIFS_MAX_RSIZE */ + rsize = min_t(unsigned int, rsize, CIFS_MAX_RSIZE); + + return rsize; +} + static void cifs_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon) { @@ -489,6 +568,13 @@ cifs_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon, CIFS_MOUNT_MAP_SPECIAL_CHR); } +static int +cifs_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_fid *fid, FILE_ALL_INFO *data) +{ + return CIFSSMBQFileInfo(xid, tcon, fid->netfid, data); +} + static char * cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, struct cifs_tcon *tcon) @@ -607,6 +693,219 @@ cifs_mkdir_setinfo(struct inode *inode, const char *full_path, cifsInode->cifsAttrs = dosattrs; } +static int +cifs_open_file(const unsigned int xid, struct cifs_tcon *tcon, const char *path, + int disposition, int desired_access, int create_options, + struct cifs_fid *fid, __u32 *oplock, FILE_ALL_INFO *buf, + struct cifs_sb_info *cifs_sb) +{ + if (!(tcon->ses->capabilities & CAP_NT_SMBS)) + return SMBLegacyOpen(xid, tcon, path, disposition, + desired_access, create_options, + &fid->netfid, oplock, buf, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags + & CIFS_MOUNT_MAP_SPECIAL_CHR); + return CIFSSMBOpen(xid, tcon, path, disposition, desired_access, + create_options, &fid->netfid, oplock, buf, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); +} + +static void +cifs_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock) +{ + struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + cfile->fid.netfid = fid->netfid; + cifs_set_oplock_level(cinode, oplock); + cinode->can_cache_brlcks = cinode->clientCanCacheAll; +} + +static void +cifs_close_file(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_fid *fid) +{ + CIFSSMBClose(xid, tcon, fid->netfid); +} + +static int +cifs_flush_file(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_fid *fid) +{ + return CIFSSMBFlush(xid, tcon, fid->netfid); +} + +static int +cifs_sync_read(const unsigned int xid, struct cifsFileInfo *cfile, + struct cifs_io_parms *parms, unsigned int *bytes_read, + char **buf, int *buf_type) +{ + parms->netfid = cfile->fid.netfid; + return CIFSSMBRead(xid, parms, bytes_read, buf, buf_type); +} + +static int +cifs_sync_write(const unsigned int xid, struct cifsFileInfo *cfile, + struct cifs_io_parms *parms, unsigned int *written, + struct kvec *iov, unsigned long nr_segs) +{ + + parms->netfid = cfile->fid.netfid; + return CIFSSMBWrite2(xid, parms, written, iov, nr_segs); +} + +static int +smb_set_file_info(struct inode *inode, const char *full_path, + FILE_BASIC_INFO *buf, const unsigned int xid) +{ + int oplock = 0; + int rc; + __u16 netfid; + __u32 netpid; + struct cifsFileInfo *open_file; + struct cifsInodeInfo *cinode = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct tcon_link *tlink = NULL; + struct cifs_tcon *tcon; + FILE_BASIC_INFO info_buf; + + /* if the file is already open for write, just use that fileid */ + open_file = find_writable_file(cinode, true); + if (open_file) { + netfid = open_file->fid.netfid; + netpid = open_file->pid; + tcon = tlink_tcon(open_file->tlink); + goto set_via_filehandle; + } + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); + tlink = NULL; + goto out; + } + tcon = tlink_tcon(tlink); + + /* + * NT4 apparently returns success on this call, but it doesn't really + * work. + */ + if (!(tcon->ses->flags & CIFS_SES_NT4)) { + rc = CIFSSMBSetPathInfo(xid, tcon, full_path, buf, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc == 0) { + cinode->cifsAttrs = le32_to_cpu(buf->Attributes); + goto out; + } else if (rc != -EOPNOTSUPP && rc != -EINVAL) + goto out; + } + + cFYI(1, "calling SetFileInfo since SetPathInfo for times not supported " + "by this server"); + rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, + SYNCHRONIZE | FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR, + &netfid, &oplock, NULL, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + + if (rc != 0) { + if (rc == -EIO) + rc = -EINVAL; + goto out; + } + + netpid = current->tgid; + +set_via_filehandle: + rc = CIFSSMBSetFileInfo(xid, tcon, &info_buf, netfid, netpid); + if (!rc) + cinode->cifsAttrs = le32_to_cpu(buf->Attributes); + + if (open_file == NULL) + CIFSSMBClose(xid, tcon, netfid); + else + cifsFileInfo_put(open_file); +out: + if (tlink != NULL) + cifs_put_tlink(tlink); + return rc; +} + +static int +cifs_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, + const char *path, struct cifs_sb_info *cifs_sb, + struct cifs_fid *fid, __u16 search_flags, + struct cifs_search_info *srch_inf) +{ + return CIFSFindFirst(xid, tcon, path, cifs_sb, + &fid->netfid, search_flags, srch_inf, true); +} + +static int +cifs_query_dir_next(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_fid *fid, __u16 search_flags, + struct cifs_search_info *srch_inf) +{ + return CIFSFindNext(xid, tcon, fid->netfid, search_flags, srch_inf); +} + +static int +cifs_close_dir(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_fid *fid) +{ + return CIFSFindClose(xid, tcon, fid->netfid); +} + +static int +cifs_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid, + struct cifsInodeInfo *cinode) +{ + return CIFSSMBLock(0, tcon, fid->netfid, current->tgid, 0, 0, 0, 0, + LOCKING_ANDX_OPLOCK_RELEASE, false, + cinode->clientCanCacheRead ? 1 : 0); +} + +static int +cifs_queryfs(const unsigned int xid, struct cifs_tcon *tcon, + struct kstatfs *buf) +{ + int rc = -EOPNOTSUPP; + + buf->f_type = CIFS_MAGIC_NUMBER; + + /* + * We could add a second check for a QFS Unix capability bit + */ + if ((tcon->ses->capabilities & CAP_UNIX) && + (CIFS_POSIX_EXTENSIONS & le64_to_cpu(tcon->fsUnixInfo.Capability))) + rc = CIFSSMBQFSPosixInfo(xid, tcon, buf); + + /* + * Only need to call the old QFSInfo if failed on newer one, + * e.g. by OS/2. + **/ + if (rc && (tcon->ses->capabilities & CAP_NT_SMBS)) + rc = CIFSSMBQFSInfo(xid, tcon, buf); + + /* + * Some old Windows servers also do not support level 103, retry with + * older level one if old server failed the previous call or we + * bypassed it because we detected that this was an older LANMAN sess + */ + if (rc) + rc = SMBOldQFSInfo(xid, tcon, buf); + return rc; +} + +static int +cifs_mand_lock(const unsigned int xid, struct cifsFileInfo *cfile, __u64 offset, + __u64 length, __u32 type, int lock, int unlock, bool wait) +{ + return CIFSSMBLock(xid, tlink_tcon(cfile->tlink), cfile->fid.netfid, + current->tgid, length, offset, unlock, lock, + (__u8)type, wait, 0); +} + struct smb_version_operations smb1_operations = { .send_cancel = send_nt_cancel, .compare_fids = cifs_compare_fids, @@ -630,6 +929,8 @@ struct smb_version_operations smb1_operations = { .check_trans2 = cifs_check_trans2, .need_neg = cifs_need_neg, .negotiate = cifs_negotiate, + .negotiate_wsize = cifs_negotiate_wsize, + .negotiate_rsize = cifs_negotiate_rsize, .sess_setup = CIFS_SessSetup, .logoff = CIFSSMBLogoff, .tree_connect = CIFSTCon, @@ -638,12 +939,37 @@ struct smb_version_operations smb1_operations = { .qfs_tcon = cifs_qfs_tcon, .is_path_accessible = cifs_is_path_accessible, .query_path_info = cifs_query_path_info, + .query_file_info = cifs_query_file_info, .get_srv_inum = cifs_get_srv_inum, + .set_path_size = CIFSSMBSetEOF, + .set_file_size = CIFSSMBSetFileSize, + .set_file_info = smb_set_file_info, .build_path_to_root = cifs_build_path_to_root, .echo = CIFSSMBEcho, .mkdir = CIFSSMBMkDir, .mkdir_setinfo = cifs_mkdir_setinfo, .rmdir = CIFSSMBRmDir, + .unlink = CIFSSMBDelFile, + .rename_pending_delete = cifs_rename_pending_delete, + .rename = CIFSSMBRename, + .create_hardlink = CIFSCreateHardLink, + .open = cifs_open_file, + .set_fid = cifs_set_fid, + .close = cifs_close_file, + .flush = cifs_flush_file, + .async_readv = cifs_async_readv, + .async_writev = cifs_async_writev, + .sync_read = cifs_sync_read, + .sync_write = cifs_sync_write, + .query_dir_first = cifs_query_dir_first, + .query_dir_next = cifs_query_dir_next, + .close_dir = cifs_close_dir, + .calc_smb_size = smbCalcSize, + .oplock_response = cifs_oplock_response, + .queryfs = cifs_queryfs, + .mand_lock = cifs_mand_lock, + .mand_unlock_range = cifs_unlock_range, + .push_mand_locks = cifs_push_mandatory_locks, }; struct smb_version_values smb1_values = { diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c new file mode 100644 index 0000000..a93eec3 --- /dev/null +++ b/fs/cifs/smb2file.c @@ -0,0 +1,302 @@ +/* + * fs/cifs/smb2file.c + * + * Copyright (C) International Business Machines Corp., 2002, 2011 + * Author(s): Steve French (sfrench@us.ibm.com), + * Pavel Shilovsky ((pshilovsky@samba.org) 2012 + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <linux/fs.h> +#include <linux/stat.h> +#include <linux/slab.h> +#include <linux/pagemap.h> +#include <asm/div64.h> +#include "cifsfs.h" +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" +#include "cifs_fs_sb.h" +#include "cifs_unicode.h" +#include "fscache.h" +#include "smb2proto.h" + +void +smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) +{ + oplock &= 0xFF; + if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE) + return; + if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) { + cinode->clientCanCacheAll = true; + cinode->clientCanCacheRead = true; + cFYI(1, "Exclusive Oplock granted on inode %p", + &cinode->vfs_inode); + } else if (oplock == SMB2_OPLOCK_LEVEL_II) { + cinode->clientCanCacheAll = false; + cinode->clientCanCacheRead = true; + cFYI(1, "Level II Oplock granted on inode %p", + &cinode->vfs_inode); + } else { + cinode->clientCanCacheAll = false; + cinode->clientCanCacheRead = false; + } +} + +int +smb2_open_file(const unsigned int xid, struct cifs_tcon *tcon, const char *path, + int disposition, int desired_access, int create_options, + struct cifs_fid *fid, __u32 *oplock, FILE_ALL_INFO *buf, + struct cifs_sb_info *cifs_sb) +{ + int rc; + __le16 *smb2_path; + struct smb2_file_all_info *smb2_data = NULL; + __u8 smb2_oplock[17]; + + smb2_path = cifs_convert_path_to_utf16(path, cifs_sb); + if (smb2_path == NULL) { + rc = -ENOMEM; + goto out; + } + + smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2, + GFP_KERNEL); + if (smb2_data == NULL) { + rc = -ENOMEM; + goto out; + } + + desired_access |= FILE_READ_ATTRIBUTES; + *smb2_oplock = SMB2_OPLOCK_LEVEL_EXCLUSIVE; + + if (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING) + memcpy(smb2_oplock + 1, fid->lease_key, SMB2_LEASE_KEY_SIZE); + + rc = SMB2_open(xid, tcon, smb2_path, &fid->persistent_fid, + &fid->volatile_fid, desired_access, disposition, + 0, 0, smb2_oplock, smb2_data); + if (rc) + goto out; + + if (buf) { + /* open response does not have IndexNumber field - get it */ + rc = SMB2_get_srv_num(xid, tcon, fid->persistent_fid, + fid->volatile_fid, + &smb2_data->IndexNumber); + if (rc) { + /* let get_inode_info disable server inode numbers */ + smb2_data->IndexNumber = 0; + rc = 0; + } + move_smb2_info_to_cifs(buf, smb2_data); + } + + *oplock = *smb2_oplock; +out: + kfree(smb2_data); + kfree(smb2_path); + return rc; +} + +int +smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, + const unsigned int xid) +{ + int rc = 0, stored_rc; + unsigned int max_num, num = 0, max_buf; + struct smb2_lock_element *buf, *cur; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsLockInfo *li, *tmp; + __u64 length = 1 + flock->fl_end - flock->fl_start; + struct list_head tmp_llist; + + INIT_LIST_HEAD(&tmp_llist); + + /* + * Accessing maxBuf is racy with cifs_reconnect - need to store value + * and check it for zero before using. + */ + max_buf = tcon->ses->server->maxBuf; + if (!max_buf) + return -EINVAL; + + max_num = max_buf / sizeof(struct smb2_lock_element); + buf = kzalloc(max_num * sizeof(struct smb2_lock_element), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + cur = buf; + + down_write(&cinode->lock_sem); + list_for_each_entry_safe(li, tmp, &cfile->llist->locks, llist) { + if (flock->fl_start > li->offset || + (flock->fl_start + length) < + (li->offset + li->length)) + continue; + if (current->tgid != li->pid) + continue; + if (cinode->can_cache_brlcks) { + /* + * We can cache brlock requests - simply remove a lock + * from the file's list. + */ + list_del(&li->llist); + cifs_del_lock_waiters(li); + kfree(li); + continue; + } + cur->Length = cpu_to_le64(li->length); + cur->Offset = cpu_to_le64(li->offset); + cur->Flags = cpu_to_le32(SMB2_LOCKFLAG_UNLOCK); + /* + * We need to save a lock here to let us add it again to the + * file's list if the unlock range request fails on the server. + */ + list_move(&li->llist, &tmp_llist); + if (++num == max_num) { + stored_rc = smb2_lockv(xid, tcon, + cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + current->tgid, num, buf); + if (stored_rc) { + /* + * We failed on the unlock range request - add + * all locks from the tmp list to the head of + * the file's list. + */ + cifs_move_llist(&tmp_llist, + &cfile->llist->locks); + rc = stored_rc; + } else + /* + * The unlock range request succeed - free the + * tmp list. + */ + cifs_free_llist(&tmp_llist); + cur = buf; + num = 0; + } else + cur++; + } + if (num) { + stored_rc = smb2_lockv(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, current->tgid, + num, buf); + if (stored_rc) { + cifs_move_llist(&tmp_llist, &cfile->llist->locks); + rc = stored_rc; + } else + cifs_free_llist(&tmp_llist); + } + up_write(&cinode->lock_sem); + + kfree(buf); + return rc; +} + +static int +smb2_push_mand_fdlocks(struct cifs_fid_locks *fdlocks, const unsigned int xid, + struct smb2_lock_element *buf, unsigned int max_num) +{ + int rc = 0, stored_rc; + struct cifsFileInfo *cfile = fdlocks->cfile; + struct cifsLockInfo *li; + unsigned int num = 0; + struct smb2_lock_element *cur = buf; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + + list_for_each_entry(li, &fdlocks->locks, llist) { + cur->Length = cpu_to_le64(li->length); + cur->Offset = cpu_to_le64(li->offset); + cur->Flags = cpu_to_le32(li->type | + SMB2_LOCKFLAG_FAIL_IMMEDIATELY); + if (++num == max_num) { + stored_rc = smb2_lockv(xid, tcon, + cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + current->tgid, num, buf); + if (stored_rc) + rc = stored_rc; + cur = buf; + num = 0; + } else + cur++; + } + if (num) { + stored_rc = smb2_lockv(xid, tcon, + cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + current->tgid, num, buf); + if (stored_rc) + rc = stored_rc; + } + + return rc; +} + +int +smb2_push_mandatory_locks(struct cifsFileInfo *cfile) +{ + int rc = 0, stored_rc; + unsigned int xid; + unsigned int max_num, max_buf; + struct smb2_lock_element *buf; + struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifs_fid_locks *fdlocks; + + xid = get_xid(); + /* we are going to update can_cache_brlcks here - need a write access */ + down_write(&cinode->lock_sem); + if (!cinode->can_cache_brlcks) { + up_write(&cinode->lock_sem); + free_xid(xid); + return rc; + } + + /* + * Accessing maxBuf is racy with cifs_reconnect - need to store value + * and check it for zero before using. + */ + max_buf = tlink_tcon(cfile->tlink)->ses->server->maxBuf; + if (!max_buf) { + up_write(&cinode->lock_sem); + free_xid(xid); + return -EINVAL; + } + + max_num = max_buf / sizeof(struct smb2_lock_element); + buf = kzalloc(max_num * sizeof(struct smb2_lock_element), GFP_KERNEL); + if (!buf) { + up_write(&cinode->lock_sem); + free_xid(xid); + return -ENOMEM; + } + + list_for_each_entry(fdlocks, &cinode->llist, llist) { + stored_rc = smb2_push_mand_fdlocks(fdlocks, xid, buf, max_num); + if (stored_rc) + rc = stored_rc; + } + + cinode->can_cache_brlcks = false; + kfree(buf); + + up_write(&cinode->lock_sem); + free_xid(xid); + return rc; +} diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h index 33c1d89..7c0e214 100644 --- a/fs/cifs/smb2glob.h +++ b/fs/cifs/smb2glob.h @@ -23,6 +23,8 @@ #ifndef _SMB2_GLOB_H #define _SMB2_GLOB_H +#define SMB2_MAGIC_NUMBER 0xFE534D42 + /* ***************************************************************** * Constants go here @@ -40,5 +42,17 @@ #define SMB2_OP_MKDIR 5 #define SMB2_OP_RENAME 6 #define SMB2_OP_DELETE 7 +#define SMB2_OP_HARDLINK 8 +#define SMB2_OP_SET_EOF 9 + +/* Used when constructing chained read requests. */ +#define CHAINED_REQUEST 1 +#define START_OF_CHAIN 2 +#define END_OF_CHAIN 4 +#define RELATED_REQUEST 8 + +#define SMB2_SIGNATURE_SIZE (16) +#define SMB2_NTLMV2_SESSKEY_SIZE (16) +#define SMB2_HMACSHA256_SIZE (32) #endif /* _SMB2_GLOB_H */ diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 2aa5cb0..7064824 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -47,6 +47,7 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon, int rc, tmprc = 0; u64 persistent_fid, volatile_fid; __le16 *utf16_path; + __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); if (!utf16_path) @@ -54,7 +55,7 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_open(xid, tcon, utf16_path, &persistent_fid, &volatile_fid, desired_access, create_disposition, file_attributes, - create_options); + create_options, &oplock, NULL); if (rc) { kfree(utf16_path); return rc; @@ -74,6 +75,22 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon, * SMB2_open() call. */ break; + case SMB2_OP_RENAME: + tmprc = SMB2_rename(xid, tcon, persistent_fid, volatile_fid, + (__le16 *)data); + break; + case SMB2_OP_HARDLINK: + tmprc = SMB2_set_hardlink(xid, tcon, persistent_fid, + volatile_fid, (__le16 *)data); + break; + case SMB2_OP_SET_EOF: + tmprc = SMB2_set_eof(xid, tcon, persistent_fid, volatile_fid, + current->tgid, (__le64 *)data); + break; + case SMB2_OP_SET_INFO: + tmprc = SMB2_set_info(xid, tcon, persistent_fid, volatile_fid, + (FILE_BASIC_INFO *)data); + break; default: cERROR(1, "Invalid command"); break; @@ -86,7 +103,7 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon, return rc; } -static void +void move_smb2_info_to_cifs(FILE_ALL_INFO *dst, struct smb2_file_all_info *src) { memcpy(dst, src, (size_t)(&src->CurrentByteOffset) - (size_t)src); @@ -161,3 +178,80 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, 0, CREATE_NOT_FILE | CREATE_DELETE_ON_CLOSE, NULL, SMB2_OP_DELETE); } + +int +smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, + struct cifs_sb_info *cifs_sb) +{ + return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, + 0, CREATE_DELETE_ON_CLOSE, NULL, + SMB2_OP_DELETE); +} + +static int +smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb, __u32 access, int command) +{ + __le16 *smb2_to_name = NULL; + int rc; + + smb2_to_name = cifs_convert_path_to_utf16(to_name, cifs_sb); + if (smb2_to_name == NULL) { + rc = -ENOMEM; + goto smb2_rename_path; + } + + rc = smb2_open_op_close(xid, tcon, cifs_sb, from_name, access, + FILE_OPEN, 0, 0, smb2_to_name, command); +smb2_rename_path: + kfree(smb2_to_name); + return rc; +} + +int +smb2_rename_path(const unsigned int xid, struct cifs_tcon *tcon, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb) +{ + return smb2_set_path_attr(xid, tcon, from_name, to_name, cifs_sb, + DELETE, SMB2_OP_RENAME); +} + +int +smb2_create_hardlink(const unsigned int xid, struct cifs_tcon *tcon, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb) +{ + return smb2_set_path_attr(xid, tcon, from_name, to_name, cifs_sb, + FILE_READ_ATTRIBUTES, SMB2_OP_HARDLINK); +} + +int +smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, + const char *full_path, __u64 size, + struct cifs_sb_info *cifs_sb, bool set_alloc) +{ + __le64 eof = cpu_to_le64(size); + return smb2_open_op_close(xid, tcon, cifs_sb, full_path, + FILE_WRITE_DATA, FILE_OPEN, 0, 0, &eof, + SMB2_OP_SET_EOF); +} + +int +smb2_set_file_info(struct inode *inode, const char *full_path, + FILE_BASIC_INFO *buf, const unsigned int xid) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct tcon_link *tlink; + int rc; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + rc = smb2_open_op_close(xid, tlink_tcon(tlink), cifs_sb, full_path, + FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, 0, buf, + SMB2_OP_SET_INFO); + cifs_put_tlink(tlink); + return rc; +} diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index be41478..494c912 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -453,7 +453,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_FILE_INVALID, -EIO, "STATUS_FILE_INVALID"}, {STATUS_ALLOTTED_SPACE_EXCEEDED, -EIO, "STATUS_ALLOTTED_SPACE_EXCEEDED"}, - {STATUS_INSUFFICIENT_RESOURCES, -EIO, "STATUS_INSUFFICIENT_RESOURCES"}, + {STATUS_INSUFFICIENT_RESOURCES, -EREMOTEIO, + "STATUS_INSUFFICIENT_RESOURCES"}, {STATUS_DFS_EXIT_PATH_FOUND, -EIO, "STATUS_DFS_EXIT_PATH_FOUND"}, {STATUS_DEVICE_DATA_ERROR, -EIO, "STATUS_DEVICE_DATA_ERROR"}, {STATUS_DEVICE_NOT_CONNECTED, -EIO, "STATUS_DEVICE_NOT_CONNECTED"}, @@ -2455,7 +2456,8 @@ map_smb2_to_linux_error(char *buf, bool log_err) return 0; /* mask facility */ - if (log_err && (smb2err != (STATUS_MORE_PROCESSING_REQUIRED))) + if (log_err && (smb2err != STATUS_MORE_PROCESSING_REQUIRED) && + (smb2err != STATUS_END_OF_FILE)) smb2_print_status(smb2err); else if (cifsFYI & CIFS_RC) smb2_print_status(smb2err); diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index e4d3b99..7b1c5e3 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -142,12 +142,19 @@ smb2_check_message(char *buf, unsigned int length) } if (smb2_rsp_struct_sizes[command] != pdu->StructureSize2) { - if (hdr->Status == 0 || - pdu->StructureSize2 != SMB2_ERROR_STRUCTURE_SIZE2) { + if (command != SMB2_OPLOCK_BREAK_HE && (hdr->Status == 0 || + pdu->StructureSize2 != SMB2_ERROR_STRUCTURE_SIZE2)) { /* error packets have 9 byte structure size */ cERROR(1, "Illegal response size %u for command %d", le16_to_cpu(pdu->StructureSize2), command); return 1; + } else if (command == SMB2_OPLOCK_BREAK_HE && (hdr->Status == 0) + && (le16_to_cpu(pdu->StructureSize2) != 44) + && (le16_to_cpu(pdu->StructureSize2) != 36)) { + /* special case for SMB2.1 lease break message */ + cERROR(1, "Illegal response size %d for oplock break", + le16_to_cpu(pdu->StructureSize2)); + return 1; } } @@ -162,6 +169,9 @@ smb2_check_message(char *buf, unsigned int length) if (4 + len != clc_len) { cFYI(1, "Calculated size %u length %u mismatch mid %llu", clc_len, 4 + len, mid); + /* Windows 7 server returns 24 bytes more */ + if (clc_len + 20 == len && command == SMB2_OPLOCK_BREAK_HE) + return 0; /* server can return one byte more */ if (clc_len == 4 + len + 1) return 0; @@ -244,7 +254,15 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) ((struct smb2_query_info_rsp *)hdr)->OutputBufferLength); break; case SMB2_READ: + *off = ((struct smb2_read_rsp *)hdr)->DataOffset; + *len = le32_to_cpu(((struct smb2_read_rsp *)hdr)->DataLength); + break; case SMB2_QUERY_DIRECTORY: + *off = le16_to_cpu( + ((struct smb2_query_directory_rsp *)hdr)->OutputBufferOffset); + *len = le32_to_cpu( + ((struct smb2_query_directory_rsp *)hdr)->OutputBufferLength); + break; case SMB2_IOCTL: case SMB2_CHANGE_NOTIFY: default: @@ -287,8 +305,9 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) * portion, the number of word parameters and the data portion of the message. */ unsigned int -smb2_calc_size(struct smb2_hdr *hdr) +smb2_calc_size(void *buf) { + struct smb2_hdr *hdr = (struct smb2_hdr *)buf; struct smb2_pdu *pdu = (struct smb2_pdu *)hdr; int offset; /* the offset from the beginning of SMB to data area */ int data_length; /* the length of the variable length data area */ @@ -347,3 +366,218 @@ cifs_convert_path_to_utf16(const char *from, struct cifs_sb_info *cifs_sb) CIFS_MOUNT_MAP_SPECIAL_CHR); return to; } + +__le32 +smb2_get_lease_state(struct cifsInodeInfo *cinode) +{ + if (cinode->clientCanCacheAll) + return SMB2_LEASE_WRITE_CACHING | SMB2_LEASE_READ_CACHING; + else if (cinode->clientCanCacheRead) + return SMB2_LEASE_READ_CACHING; + return 0; +} + +__u8 smb2_map_lease_to_oplock(__le32 lease_state) +{ + if (lease_state & SMB2_LEASE_WRITE_CACHING) { + if (lease_state & SMB2_LEASE_HANDLE_CACHING) + return SMB2_OPLOCK_LEVEL_BATCH; + else + return SMB2_OPLOCK_LEVEL_EXCLUSIVE; + } else if (lease_state & SMB2_LEASE_READ_CACHING) + return SMB2_OPLOCK_LEVEL_II; + return 0; +} + +struct smb2_lease_break_work { + struct work_struct lease_break; + struct tcon_link *tlink; + __u8 lease_key[16]; + __le32 lease_state; +}; + +static void +cifs_ses_oplock_break(struct work_struct *work) +{ + struct smb2_lease_break_work *lw = container_of(work, + struct smb2_lease_break_work, lease_break); + int rc; + + rc = SMB2_lease_break(0, tlink_tcon(lw->tlink), lw->lease_key, + lw->lease_state); + cFYI(1, "Lease release rc %d", rc); + cifs_put_tlink(lw->tlink); + kfree(lw); +} + +static bool +smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server) +{ + struct smb2_lease_break *rsp = (struct smb2_lease_break *)buffer; + struct list_head *tmp, *tmp1, *tmp2; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + struct cifsInodeInfo *cinode; + struct cifsFileInfo *cfile; + struct cifs_pending_open *open; + struct smb2_lease_break_work *lw; + bool found; + int ack_req = le32_to_cpu(rsp->Flags & + SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED); + + lw = kmalloc(sizeof(struct smb2_lease_break_work), GFP_KERNEL); + if (!lw) { + cERROR(1, "Memory allocation failed during lease break check"); + return false; + } + + INIT_WORK(&lw->lease_break, cifs_ses_oplock_break); + lw->lease_state = rsp->NewLeaseState; + + cFYI(1, "Checking for lease break"); + + /* look up tcon based on tid & uid */ + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp, &server->smb_ses_list) { + ses = list_entry(tmp, struct cifs_ses, smb_ses_list); + + spin_lock(&cifs_file_list_lock); + list_for_each(tmp1, &ses->tcon_list) { + tcon = list_entry(tmp1, struct cifs_tcon, tcon_list); + + cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks); + list_for_each(tmp2, &tcon->openFileList) { + cfile = list_entry(tmp2, struct cifsFileInfo, + tlist); + cinode = CIFS_I(cfile->dentry->d_inode); + + if (memcmp(cinode->lease_key, rsp->LeaseKey, + SMB2_LEASE_KEY_SIZE)) + continue; + + cFYI(1, "found in the open list"); + cFYI(1, "lease key match, lease break 0x%d", + le32_to_cpu(rsp->NewLeaseState)); + + smb2_set_oplock_level(cinode, + smb2_map_lease_to_oplock(rsp->NewLeaseState)); + + if (ack_req) + cfile->oplock_break_cancelled = false; + else + cfile->oplock_break_cancelled = true; + + queue_work(cifsiod_wq, &cfile->oplock_break); + + spin_unlock(&cifs_file_list_lock); + spin_unlock(&cifs_tcp_ses_lock); + return true; + } + + found = false; + list_for_each_entry(open, &tcon->pending_opens, olist) { + if (memcmp(open->lease_key, rsp->LeaseKey, + SMB2_LEASE_KEY_SIZE)) + continue; + + if (!found && ack_req) { + found = true; + memcpy(lw->lease_key, open->lease_key, + SMB2_LEASE_KEY_SIZE); + lw->tlink = cifs_get_tlink(open->tlink); + queue_work(cifsiod_wq, + &lw->lease_break); + } + + cFYI(1, "found in the pending open list"); + cFYI(1, "lease key match, lease break 0x%d", + le32_to_cpu(rsp->NewLeaseState)); + + open->oplock = + smb2_map_lease_to_oplock(rsp->NewLeaseState); + } + if (found) { + spin_unlock(&cifs_file_list_lock); + spin_unlock(&cifs_tcp_ses_lock); + return true; + } + } + spin_unlock(&cifs_file_list_lock); + } + spin_unlock(&cifs_tcp_ses_lock); + kfree(lw); + cFYI(1, "Can not process lease break - no lease matched"); + return false; +} + +bool +smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) +{ + struct smb2_oplock_break *rsp = (struct smb2_oplock_break *)buffer; + struct list_head *tmp, *tmp1, *tmp2; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + struct cifsInodeInfo *cinode; + struct cifsFileInfo *cfile; + + cFYI(1, "Checking for oplock break"); + + if (rsp->hdr.Command != SMB2_OPLOCK_BREAK) + return false; + + if (rsp->StructureSize != + smb2_rsp_struct_sizes[SMB2_OPLOCK_BREAK_HE]) { + if (le16_to_cpu(rsp->StructureSize) == 44) + return smb2_is_valid_lease_break(buffer, server); + else + return false; + } + + cFYI(1, "oplock level 0x%d", rsp->OplockLevel); + + /* look up tcon based on tid & uid */ + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp, &server->smb_ses_list) { + ses = list_entry(tmp, struct cifs_ses, smb_ses_list); + list_for_each(tmp1, &ses->tcon_list) { + tcon = list_entry(tmp1, struct cifs_tcon, tcon_list); + + cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks); + spin_lock(&cifs_file_list_lock); + list_for_each(tmp2, &tcon->openFileList) { + cfile = list_entry(tmp2, struct cifsFileInfo, + tlist); + if (rsp->PersistentFid != + cfile->fid.persistent_fid || + rsp->VolatileFid != + cfile->fid.volatile_fid) + continue; + + cFYI(1, "file id match, oplock break"); + cinode = CIFS_I(cfile->dentry->d_inode); + + if (!cinode->clientCanCacheAll && + rsp->OplockLevel == SMB2_OPLOCK_LEVEL_NONE) + cfile->oplock_break_cancelled = true; + else + cfile->oplock_break_cancelled = false; + + smb2_set_oplock_level(cinode, + rsp->OplockLevel ? SMB2_OPLOCK_LEVEL_II : 0); + + queue_work(cifsiod_wq, &cfile->oplock_break); + + spin_unlock(&cifs_file_list_lock); + spin_unlock(&cifs_tcp_ses_lock); + return true; + } + spin_unlock(&cifs_file_list_lock); + spin_unlock(&cifs_tcp_ses_lock); + cFYI(1, "No matching file for oplock break"); + return true; + } + } + spin_unlock(&cifs_tcp_ses_lock); + cFYI(1, "Can not process oplock break for non-existent connection"); + return false; +} diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 826209b..4d9dbe0 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -17,11 +17,15 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include <linux/pagemap.h> +#include <linux/vfs.h> #include "cifsglob.h" #include "smb2pdu.h" #include "smb2proto.h" #include "cifsproto.h" #include "cifs_debug.h" +#include "smb2status.h" +#include "smb2glob.h" static int change_conf(struct TCP_Server_Info *server) @@ -63,6 +67,17 @@ smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add, server->in_flight--; if (server->in_flight == 0 && (optype & CIFS_OP_MASK) != CIFS_NEG_OP) rc = change_conf(server); + /* + * Sometimes server returns 0 credits on oplock break ack - we need to + * rebalance credits in this case. + */ + else if (server->in_flight > 0 && server->oplock_credits == 0 && + server->oplocks) { + if (server->credits > 1) { + server->credits--; + server->oplock_credits++; + } + } spin_unlock(&server->req_lock); wake_up(&server->request_q); if (rc) @@ -157,6 +172,42 @@ smb2_negotiate(const unsigned int xid, struct cifs_ses *ses) return rc; } +static unsigned int +smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) +{ + struct TCP_Server_Info *server = tcon->ses->server; + unsigned int wsize; + + /* start with specified wsize, or default */ + wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE; + wsize = min_t(unsigned int, wsize, server->max_write); + /* + * limit write size to 2 ** 16, because we don't support multicredit + * requests now. + */ + wsize = min_t(unsigned int, wsize, 2 << 15); + + return wsize; +} + +static unsigned int +smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) +{ + struct TCP_Server_Info *server = tcon->ses->server; + unsigned int rsize; + + /* start with specified rsize, or default */ + rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE; + rsize = min_t(unsigned int, rsize, server->max_read); + /* + * limit write size to 2 ** 16, because we don't support multicredit + * requests now. + */ + rsize = min_t(unsigned int, rsize, 2 << 15); + + return rsize; +} + static int smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path) @@ -164,13 +215,14 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, int rc; __u64 persistent_fid, volatile_fid; __le16 *utf16_path; + __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); if (!utf16_path) return -ENOMEM; rc = SMB2_open(xid, tcon, utf16_path, &persistent_fid, &volatile_fid, - FILE_READ_ATTRIBUTES, FILE_OPEN, 0, 0); + FILE_READ_ATTRIBUTES, FILE_OPEN, 0, 0, &oplock, NULL); if (rc) { kfree(utf16_path); return rc; @@ -190,6 +242,26 @@ smb2_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon, return 0; } +static int +smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_fid *fid, FILE_ALL_INFO *data) +{ + int rc; + struct smb2_file_all_info *smb2_data; + + smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2, + GFP_KERNEL); + if (smb2_data == NULL) + return -ENOMEM; + + rc = SMB2_query_info(xid, tcon, fid->persistent_fid, fid->volatile_fid, + smb2_data); + if (!rc) + move_smb2_info_to_cifs(data, smb2_data); + kfree(smb2_data); + return rc; +} + static char * smb2_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, struct cifs_tcon *tcon) @@ -292,7 +364,221 @@ smb2_print_stats(struct seq_file *m, struct cifs_tcon *tcon) #endif } +static void +smb2_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock) +{ + struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + cfile->fid.persistent_fid = fid->persistent_fid; + cfile->fid.volatile_fid = fid->volatile_fid; + smb2_set_oplock_level(cinode, oplock); + cinode->can_cache_brlcks = cinode->clientCanCacheAll; +} + +static void +smb2_close_file(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_fid *fid) +{ + SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid); +} + +static int +smb2_flush_file(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_fid *fid) +{ + return SMB2_flush(xid, tcon, fid->persistent_fid, fid->volatile_fid); +} + +static unsigned int +smb2_read_data_offset(char *buf) +{ + struct smb2_read_rsp *rsp = (struct smb2_read_rsp *)buf; + return rsp->DataOffset; +} + +static unsigned int +smb2_read_data_length(char *buf) +{ + struct smb2_read_rsp *rsp = (struct smb2_read_rsp *)buf; + return le32_to_cpu(rsp->DataLength); +} + + +static int +smb2_sync_read(const unsigned int xid, struct cifsFileInfo *cfile, + struct cifs_io_parms *parms, unsigned int *bytes_read, + char **buf, int *buf_type) +{ + parms->persistent_fid = cfile->fid.persistent_fid; + parms->volatile_fid = cfile->fid.volatile_fid; + return SMB2_read(xid, parms, bytes_read, buf, buf_type); +} + +static int +smb2_sync_write(const unsigned int xid, struct cifsFileInfo *cfile, + struct cifs_io_parms *parms, unsigned int *written, + struct kvec *iov, unsigned long nr_segs) +{ + + parms->persistent_fid = cfile->fid.persistent_fid; + parms->volatile_fid = cfile->fid.volatile_fid; + return SMB2_write(xid, parms, written, iov, nr_segs); +} + +static int +smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile, __u64 size, bool set_alloc) +{ + __le64 eof = cpu_to_le64(size); + return SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, cfile->pid, &eof); +} + +static int +smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, + const char *path, struct cifs_sb_info *cifs_sb, + struct cifs_fid *fid, __u16 search_flags, + struct cifs_search_info *srch_inf) +{ + __le16 *utf16_path; + int rc; + __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; + __u64 persistent_fid, volatile_fid; + + utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); + if (!utf16_path) + return -ENOMEM; + + rc = SMB2_open(xid, tcon, utf16_path, &persistent_fid, &volatile_fid, + FILE_READ_ATTRIBUTES | FILE_READ_DATA, FILE_OPEN, 0, 0, + &oplock, NULL); + kfree(utf16_path); + if (rc) { + cERROR(1, "open dir failed"); + return rc; + } + + srch_inf->entries_in_buffer = 0; + srch_inf->index_of_last_entry = 0; + fid->persistent_fid = persistent_fid; + fid->volatile_fid = volatile_fid; + + rc = SMB2_query_directory(xid, tcon, persistent_fid, volatile_fid, 0, + srch_inf); + if (rc) { + cERROR(1, "query directory failed"); + SMB2_close(xid, tcon, persistent_fid, volatile_fid); + } + return rc; +} + +static int +smb2_query_dir_next(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_fid *fid, __u16 search_flags, + struct cifs_search_info *srch_inf) +{ + return SMB2_query_directory(xid, tcon, fid->persistent_fid, + fid->volatile_fid, 0, srch_inf); +} + +static int +smb2_close_dir(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_fid *fid) +{ + return SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid); +} + +/* +* If we negotiate SMB2 protocol and get STATUS_PENDING - update +* the number of credits and return true. Otherwise - return false. +*/ +static bool +smb2_is_status_pending(char *buf, struct TCP_Server_Info *server, int length) +{ + struct smb2_hdr *hdr = (struct smb2_hdr *)buf; + + if (hdr->Status != STATUS_PENDING) + return false; + + if (!length) { + spin_lock(&server->req_lock); + server->credits += le16_to_cpu(hdr->CreditRequest); + spin_unlock(&server->req_lock); + wake_up(&server->request_q); + } + + return true; +} + +static int +smb2_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid, + struct cifsInodeInfo *cinode) +{ + if (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING) + return SMB2_lease_break(0, tcon, cinode->lease_key, + smb2_get_lease_state(cinode)); + + return SMB2_oplock_break(0, tcon, fid->persistent_fid, + fid->volatile_fid, + cinode->clientCanCacheRead ? 1 : 0); +} + +static int +smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon, + struct kstatfs *buf) +{ + int rc; + u64 persistent_fid, volatile_fid; + __le16 srch_path = 0; /* Null - open root of share */ + u8 oplock = SMB2_OPLOCK_LEVEL_NONE; + + rc = SMB2_open(xid, tcon, &srch_path, &persistent_fid, &volatile_fid, + FILE_READ_ATTRIBUTES, FILE_OPEN, 0, 0, &oplock, NULL); + if (rc) + return rc; + buf->f_type = SMB2_MAGIC_NUMBER; + rc = SMB2_QFS_info(xid, tcon, persistent_fid, volatile_fid, buf); + SMB2_close(xid, tcon, persistent_fid, volatile_fid); + return rc; +} + +static bool +smb2_compare_fids(struct cifsFileInfo *ob1, struct cifsFileInfo *ob2) +{ + return ob1->fid.persistent_fid == ob2->fid.persistent_fid && + ob1->fid.volatile_fid == ob2->fid.volatile_fid; +} + +static int +smb2_mand_lock(const unsigned int xid, struct cifsFileInfo *cfile, __u64 offset, + __u64 length, __u32 type, int lock, int unlock, bool wait) +{ + if (unlock && !lock) + type = SMB2_LOCKFLAG_UNLOCK; + return SMB2_lock(xid, tlink_tcon(cfile->tlink), + cfile->fid.persistent_fid, cfile->fid.volatile_fid, + current->tgid, length, offset, type, wait); +} + +static void +smb2_get_lease_key(struct inode *inode, struct cifs_fid *fid) +{ + memcpy(fid->lease_key, CIFS_I(inode)->lease_key, SMB2_LEASE_KEY_SIZE); +} + +static void +smb2_set_lease_key(struct inode *inode, struct cifs_fid *fid) +{ + memcpy(CIFS_I(inode)->lease_key, fid->lease_key, SMB2_LEASE_KEY_SIZE); +} + +static void +smb2_new_lease_key(struct cifs_fid *fid) +{ + get_random_bytes(fid->lease_key, SMB2_LEASE_KEY_SIZE); +} + struct smb_version_operations smb21_operations = { + .compare_fids = smb2_compare_fids, .setup_request = smb2_setup_request, .setup_async_request = smb2_setup_async_request, .check_receive = smb2_check_receive, @@ -301,13 +587,19 @@ struct smb_version_operations smb21_operations = { .get_credits_field = smb2_get_credits_field, .get_credits = smb2_get_credits, .get_next_mid = smb2_get_next_mid, + .read_data_offset = smb2_read_data_offset, + .read_data_length = smb2_read_data_length, + .map_error = map_smb2_to_linux_error, .find_mid = smb2_find_mid, .check_message = smb2_check_message, .dump_detail = smb2_dump_detail, .clear_stats = smb2_clear_stats, .print_stats = smb2_print_stats, + .is_oplock_break = smb2_is_valid_oplock_break, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, + .negotiate_wsize = smb2_negotiate_wsize, + .negotiate_rsize = smb2_negotiate_rsize, .sess_setup = SMB2_sess_setup, .logoff = SMB2_logoff, .tree_connect = SMB2_tcon, @@ -317,16 +609,68 @@ struct smb_version_operations smb21_operations = { .echo = SMB2_echo, .query_path_info = smb2_query_path_info, .get_srv_inum = smb2_get_srv_inum, + .query_file_info = smb2_query_file_info, + .set_path_size = smb2_set_path_size, + .set_file_size = smb2_set_file_size, + .set_file_info = smb2_set_file_info, .build_path_to_root = smb2_build_path_to_root, .mkdir = smb2_mkdir, .mkdir_setinfo = smb2_mkdir_setinfo, .rmdir = smb2_rmdir, + .unlink = smb2_unlink, + .rename = smb2_rename_path, + .create_hardlink = smb2_create_hardlink, + .open = smb2_open_file, + .set_fid = smb2_set_fid, + .close = smb2_close_file, + .flush = smb2_flush_file, + .async_readv = smb2_async_readv, + .async_writev = smb2_async_writev, + .sync_read = smb2_sync_read, + .sync_write = smb2_sync_write, + .query_dir_first = smb2_query_dir_first, + .query_dir_next = smb2_query_dir_next, + .close_dir = smb2_close_dir, + .calc_smb_size = smb2_calc_size, + .is_status_pending = smb2_is_status_pending, + .oplock_response = smb2_oplock_response, + .queryfs = smb2_queryfs, + .mand_lock = smb2_mand_lock, + .mand_unlock_range = smb2_unlock_range, + .push_mand_locks = smb2_push_mandatory_locks, + .get_lease_key = smb2_get_lease_key, + .set_lease_key = smb2_set_lease_key, + .new_lease_key = smb2_new_lease_key, }; struct smb_version_values smb21_values = { .version_string = SMB21_VERSION_STRING, + .protocol_id = SMB21_PROT_ID, + .req_capabilities = 0, /* MBZ on negotiate req until SMB3 dialect */ + .large_lock_type = 0, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, + .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, + .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, + .header_size = sizeof(struct smb2_hdr), + .max_header_size = MAX_SMB2_HDR_SIZE, + .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .lock_cmd = SMB2_LOCK, + .cap_unix = 0, + .cap_nt_find = SMB2_NT_FIND, + .cap_large_files = SMB2_LARGE_FILES, +}; + +struct smb_version_values smb30_values = { + .version_string = SMB30_VERSION_STRING, + .protocol_id = SMB30_PROT_ID, + .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU, + .large_lock_type = 0, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, + .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, + .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), .max_header_size = MAX_SMB2_HDR_SIZE, + .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 62b3f17..cf33622 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1,7 +1,7 @@ /* * fs/cifs/smb2pdu.c * - * Copyright (C) International Business Machines Corp., 2009, 2011 + * Copyright (C) International Business Machines Corp., 2009, 2012 * Etersoft, 2012 * Author(s): Steve French (sfrench@us.ibm.com) * Pavel Shilovsky (pshilovsky@samba.org) 2012 @@ -31,7 +31,9 @@ #include <linux/fs.h> #include <linux/kernel.h> #include <linux/vfs.h> +#include <linux/task_io_accounting_ops.h> #include <linux/uaccess.h> +#include <linux/pagemap.h> #include <linux/xattr.h> #include "smb2pdu.h" #include "cifsglob.h" @@ -42,6 +44,8 @@ #include "cifs_debug.h" #include "ntlmssp.h" #include "smb2status.h" +#include "smb2glob.h" +#include "cifspdu.h" /* * The following table defines the expected "StructureSize" of SMB2 requests @@ -115,9 +119,9 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ , /* BB how does SMB2 do case sensitive? */ /* if (tcon->nocase) hdr->Flags |= SMBFLG_CASELESS; */ - /* if (tcon->ses && tcon->ses->server && + if (tcon->ses && tcon->ses->server && (tcon->ses->server->sec_mode & SECMODE_SIGN_REQUIRED)) - hdr->Flags |= SMB2_FLAGS_SIGNED; */ + hdr->Flags |= SMB2_FLAGS_SIGNED; out: pdu->StructureSize2 = cpu_to_le16(parmsize); return; @@ -300,24 +304,6 @@ free_rsp_buf(int resp_buftype, void *rsp) cifs_buf_release(rsp); } -#define SMB2_NUM_PROT 1 - -#define SMB2_PROT 0 -#define SMB21_PROT 1 -#define BAD_PROT 0xFFFF - -#define SMB2_PROT_ID 0x0202 -#define SMB21_PROT_ID 0x0210 -#define BAD_PROT_ID 0xFFFF - -static struct { - int index; - __le16 name; -} smb2protocols[] = { - {SMB2_PROT, cpu_to_le16(SMB2_PROT_ID)}, - {SMB21_PROT, cpu_to_le16(SMB21_PROT_ID)}, - {BAD_PROT, cpu_to_le16(BAD_PROT_ID)} -}; /* * @@ -344,7 +330,6 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) int resp_buftype; struct TCP_Server_Info *server; unsigned int sec_flags; - u16 i; u16 temp = 0; int blob_offset, blob_length; char *security_blob; @@ -373,11 +358,10 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) req->hdr.SessionId = 0; - for (i = 0; i < SMB2_NUM_PROT; i++) - req->Dialects[i] = smb2protocols[i].name; + req->Dialects[0] = cpu_to_le16(ses->server->vals->protocol_id); - req->DialectCount = cpu_to_le16(i); - inc_rfc1001_len(req, i * 2); + req->DialectCount = cpu_to_le16(1); /* One vers= at a time for now */ + inc_rfc1001_len(req, 2); /* only one of SMB2 signing flags may be set in SMB2 request */ if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) @@ -387,7 +371,9 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) req->SecurityMode = cpu_to_le16(temp); - req->Capabilities = cpu_to_le32(SMB2_GLOBAL_CAP_DFS); + req->Capabilities = cpu_to_le32(ses->server->vals->req_capabilities); + + memcpy(req->ClientGUID, cifs_client_guid, SMB2_CLIENT_GUID_SIZE); iov[0].iov_base = (char *)req; /* 4 for rfc1002 length field */ @@ -403,17 +389,16 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) if (rc != 0) goto neg_exit; - if (rsp == NULL) { - rc = -EIO; - goto neg_exit; - } - cFYI(1, "mode 0x%x", rsp->SecurityMode); - if (rsp->DialectRevision == smb2protocols[SMB21_PROT].name) + /* BB we may eventually want to match the negotiated vs. requested + dialect, even though we are only requesting one at a time */ + if (rsp->DialectRevision == cpu_to_le16(SMB20_PROT_ID)) + cFYI(1, "negotiated smb2.0 dialect"); + else if (rsp->DialectRevision == cpu_to_le16(SMB21_PROT_ID)) cFYI(1, "negotiated smb2.1 dialect"); - else if (rsp->DialectRevision == smb2protocols[SMB2_PROT].name) - cFYI(1, "negotiated smb2 dialect"); + else if (rsp->DialectRevision == cpu_to_le16(SMB30_PROT_ID)) + cFYI(1, "negotiated smb3.0 dialect"); else { cERROR(1, "Illegal dialect returned by server %d", le16_to_cpu(rsp->DialectRevision)); @@ -438,6 +423,38 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) rc = -EIO; goto neg_exit; } + + cFYI(1, "sec_flags 0x%x", sec_flags); + if (sec_flags & CIFSSEC_MUST_SIGN) { + cFYI(1, "Signing required"); + if (!(server->sec_mode & (SMB2_NEGOTIATE_SIGNING_REQUIRED | + SMB2_NEGOTIATE_SIGNING_ENABLED))) { + cERROR(1, "signing required but server lacks support"); + rc = -EOPNOTSUPP; + goto neg_exit; + } + server->sec_mode |= SECMODE_SIGN_REQUIRED; + } else if (sec_flags & CIFSSEC_MAY_SIGN) { + cFYI(1, "Signing optional"); + if (server->sec_mode & SMB2_NEGOTIATE_SIGNING_REQUIRED) { + cFYI(1, "Server requires signing"); + server->sec_mode |= SECMODE_SIGN_REQUIRED; + } else { + server->sec_mode &= + ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED); + } + } else { + cFYI(1, "Signing disabled"); + if (server->sec_mode & SMB2_NEGOTIATE_SIGNING_REQUIRED) { + cERROR(1, "Server requires packet signing to be enabled" + " in /proc/fs/cifs/SecurityFlags."); + rc = -EOPNOTSUPP; + goto neg_exit; + } + server->sec_mode &= + ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED); + } + #ifdef CONFIG_SMB2_ASN1 /* BB REMOVEME when updated asn1.c ready */ rc = decode_neg_token_init(security_blob, blob_length, &server->sec_type); @@ -599,13 +616,14 @@ ssetup_ntlmssp_authenticate: kfree(security_blob); rsp = (struct smb2_sess_setup_rsp *)iov[0].iov_base; - if (rsp->hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) { + if (resp_buftype != CIFS_NO_BUFFER && + rsp->hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) { if (phase != NtLmNegotiate) { cERROR(1, "Unexpected more processing error"); goto ssetup_exit; } if (offsetof(struct smb2_sess_setup_rsp, Buffer) - 4 != - le16_to_cpu(rsp->SecurityBufferOffset)) { + le16_to_cpu(rsp->SecurityBufferOffset)) { cERROR(1, "Invalid security buffer offset %d", le16_to_cpu(rsp->SecurityBufferOffset)); rc = -EIO; @@ -631,11 +649,6 @@ ssetup_ntlmssp_authenticate: if (rc != 0) goto ssetup_exit; - if (rsp == NULL) { - rc = -EIO; - goto ssetup_exit; - } - ses->session_flags = le16_to_cpu(rsp->SessionFlags); ssetup_exit: free_rsp_buf(resp_buftype, rsp); @@ -666,6 +679,8 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses) /* since no tcon, smb2_init can not do this, so do here */ req->hdr.SessionId = ses->Suid; + if (server->sec_mode & SECMODE_SIGN_REQUIRED) + req->hdr.Flags |= SMB2_FLAGS_SIGNED; rc = SendReceiveNoRsp(xid, ses, (char *) &req->hdr, 0); /* @@ -753,11 +768,6 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, goto tcon_error_exit; } - if (rsp == NULL) { - rc = -EIO; - goto tcon_exit; - } - if (tcon == NULL) { ses->ipc_tid = rsp->hdr.TreeId; goto tcon_exit; @@ -830,18 +840,87 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) return rc; } +static struct create_lease * +create_lease_buf(u8 *lease_key, u8 oplock) +{ + struct create_lease *buf; + + buf = kmalloc(sizeof(struct create_lease), GFP_KERNEL); + if (!buf) + return NULL; + + memset(buf, 0, sizeof(struct create_lease)); + + buf->lcontext.LeaseKeyLow = cpu_to_le64(*((u64 *)lease_key)); + buf->lcontext.LeaseKeyHigh = cpu_to_le64(*((u64 *)(lease_key + 8))); + if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) + buf->lcontext.LeaseState = SMB2_LEASE_WRITE_CACHING | + SMB2_LEASE_READ_CACHING; + else if (oplock == SMB2_OPLOCK_LEVEL_II) + buf->lcontext.LeaseState = SMB2_LEASE_READ_CACHING; + else if (oplock == SMB2_OPLOCK_LEVEL_BATCH) + buf->lcontext.LeaseState = SMB2_LEASE_HANDLE_CACHING | + SMB2_LEASE_READ_CACHING | + SMB2_LEASE_WRITE_CACHING; + + buf->ccontext.DataOffset = cpu_to_le16(offsetof + (struct create_lease, lcontext)); + buf->ccontext.DataLength = cpu_to_le32(sizeof(struct lease_context)); + buf->ccontext.NameOffset = cpu_to_le16(offsetof + (struct create_lease, Name)); + buf->ccontext.NameLength = cpu_to_le16(4); + buf->Name[0] = 'R'; + buf->Name[1] = 'q'; + buf->Name[2] = 'L'; + buf->Name[3] = 's'; + return buf; +} + +static __u8 +parse_lease_state(struct smb2_create_rsp *rsp) +{ + char *data_offset; + struct create_lease *lc; + bool found = false; + + data_offset = (char *)rsp; + data_offset += 4 + le32_to_cpu(rsp->CreateContextsOffset); + lc = (struct create_lease *)data_offset; + do { + char *name = le16_to_cpu(lc->ccontext.NameOffset) + (char *)lc; + if (le16_to_cpu(lc->ccontext.NameLength) != 4 || + strncmp(name, "RqLs", 4)) { + lc = (struct create_lease *)((char *)lc + + le32_to_cpu(lc->ccontext.Next)); + continue; + } + if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS) + return SMB2_OPLOCK_LEVEL_NOCHANGE; + found = true; + break; + } while (le32_to_cpu(lc->ccontext.Next) != 0); + + if (!found) + return 0; + + return smb2_map_lease_to_oplock(lc->lcontext.LeaseState); +} + int SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path, u64 *persistent_fid, u64 *volatile_fid, __u32 desired_access, - __u32 create_disposition, __u32 file_attributes, __u32 create_options) + __u32 create_disposition, __u32 file_attributes, __u32 create_options, + __u8 *oplock, struct smb2_file_all_info *buf) { struct smb2_create_req *req; struct smb2_create_rsp *rsp; struct TCP_Server_Info *server; struct cifs_ses *ses = tcon->ses; - struct kvec iov[2]; + struct kvec iov[3]; int resp_buftype; int uni_path_len; + __le16 *copy_path = NULL; + int copy_size; int rc = 0; int num_iovecs = 2; @@ -856,10 +935,6 @@ SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path, if (rc) return rc; - if (enable_oplocks) - req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_BATCH; - else - req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_NONE; req->ImpersonationLevel = IL_IMPERSONATION; req->DesiredAccess = cpu_to_le32(desired_access); /* File attributes ignored on open (used in create though) */ @@ -869,7 +944,7 @@ SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path, req->CreateOptions = cpu_to_le32(create_options); uni_path_len = (2 * UniStrnlen((wchar_t *)path, PATH_MAX)) + 2; req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req) - - 1 /* pad */ - 4 /* do not count rfc1001 len field */); + - 8 /* pad */ - 4 /* do not count rfc1001 len field */); iov[0].iov_base = (char *)req; /* 4 for rfc1002 length field */ @@ -880,6 +955,20 @@ SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path, req->NameLength = cpu_to_le16(uni_path_len - 2); /* -1 since last byte is buf[0] which is sent below (path) */ iov[0].iov_len--; + if (uni_path_len % 8 != 0) { + copy_size = uni_path_len / 8 * 8; + if (copy_size < uni_path_len) + copy_size += 8; + + copy_path = kzalloc(copy_size, GFP_KERNEL); + if (!copy_path) + return -ENOMEM; + memcpy((char *)copy_path, (const char *)path, + uni_path_len); + uni_path_len = copy_size; + path = copy_path; + } + iov[1].iov_len = uni_path_len; iov[1].iov_base = path; /* @@ -888,10 +977,37 @@ SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path, */ inc_rfc1001_len(req, uni_path_len - 1); } else { + iov[0].iov_len += 7; + req->hdr.smb2_buf_length = cpu_to_be32(be32_to_cpu( + req->hdr.smb2_buf_length) + 8 - 1); num_iovecs = 1; req->NameLength = 0; } + if (!server->oplocks) + *oplock = SMB2_OPLOCK_LEVEL_NONE; + + if (!(tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING) || + *oplock == SMB2_OPLOCK_LEVEL_NONE) + req->RequestedOplockLevel = *oplock; + else { + iov[num_iovecs].iov_base = create_lease_buf(oplock+1, *oplock); + if (iov[num_iovecs].iov_base == NULL) { + cifs_small_buf_release(req); + kfree(copy_path); + return -ENOMEM; + } + iov[num_iovecs].iov_len = sizeof(struct create_lease); + req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_LEASE; + req->CreateContextsOffset = cpu_to_le32( + sizeof(struct smb2_create_req) - 4 - 8 + + iov[num_iovecs-1].iov_len); + req->CreateContextsLength = cpu_to_le32( + sizeof(struct create_lease)); + inc_rfc1001_len(&req->hdr, sizeof(struct create_lease)); + num_iovecs++; + } + rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0); rsp = (struct smb2_create_rsp *)iov[0].iov_base; @@ -900,13 +1016,24 @@ SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path, goto creat_exit; } - if (rsp == NULL) { - rc = -EIO; - goto creat_exit; - } *persistent_fid = rsp->PersistentFileId; *volatile_fid = rsp->VolatileFileId; + + if (buf) { + memcpy(buf, &rsp->CreationTime, 32); + buf->AllocationSize = rsp->AllocationSize; + buf->EndOfFile = rsp->EndofFile; + buf->Attributes = rsp->FileAttributes; + buf->NumberOfLinks = cpu_to_le32(1); + buf->DeletePending = 0; + } + + if (rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) + *oplock = parse_lease_state(rsp); + else + *oplock = rsp->OplockLevel; creat_exit: + kfree(copy_path); free_rsp_buf(resp_buftype, rsp); return rc; } @@ -950,11 +1077,6 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, goto close_exit; } - if (rsp == NULL) { - rc = -EIO; - goto close_exit; - } - /* BB FIXME - decode close response, update inode for caching */ close_exit: @@ -1019,10 +1141,10 @@ validate_and_copy_buf(unsigned int offset, unsigned int buffer_length, return 0; } -int -SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_fid, u64 volatile_fid, - struct smb2_file_all_info *data) +static int +query_info(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, u8 info_class, + size_t output_len, size_t min_len, void *data) { struct smb2_query_info_req *req; struct smb2_query_info_rsp *rsp = NULL; @@ -1044,37 +1166,56 @@ SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon, return rc; req->InfoType = SMB2_O_INFO_FILE; - req->FileInfoClass = FILE_ALL_INFORMATION; + req->FileInfoClass = info_class; req->PersistentFileId = persistent_fid; req->VolatileFileId = volatile_fid; /* 4 for rfc1002 length field and 1 for Buffer */ req->InputBufferOffset = cpu_to_le16(sizeof(struct smb2_query_info_req) - 1 - 4); - req->OutputBufferLength = - cpu_to_le32(sizeof(struct smb2_file_all_info) + MAX_NAME * 2); + req->OutputBufferLength = cpu_to_le32(output_len); iov[0].iov_base = (char *)req; /* 4 for rfc1002 length field */ iov[0].iov_len = get_rfc1002_length(req) + 4; rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, 0); + rsp = (struct smb2_query_info_rsp *)iov[0].iov_base; + if (rc) { cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); goto qinf_exit; } - rsp = (struct smb2_query_info_rsp *)iov[0].iov_base; - rc = validate_and_copy_buf(le16_to_cpu(rsp->OutputBufferOffset), le32_to_cpu(rsp->OutputBufferLength), - &rsp->hdr, sizeof(struct smb2_file_all_info), - (char *)data); + &rsp->hdr, min_len, data); qinf_exit: free_rsp_buf(resp_buftype, rsp); return rc; } +int +SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, + struct smb2_file_all_info *data) +{ + return query_info(xid, tcon, persistent_fid, volatile_fid, + FILE_ALL_INFORMATION, + sizeof(struct smb2_file_all_info) + MAX_NAME * 2, + sizeof(struct smb2_file_all_info), data); +} + +int +SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, __le64 *uniqueid) +{ + return query_info(xid, tcon, persistent_fid, volatile_fid, + FILE_INTERNAL_INFORMATION, + sizeof(struct smb2_file_internal_info), + sizeof(struct smb2_file_internal_info), uniqueid); +} + /* * This is a no-op for now. We're not really interested in the reply, but * rather in the fact that the server sent one and that server->lstrp @@ -1102,6 +1243,8 @@ SMB2_echo(struct TCP_Server_Info *server) struct smb2_echo_req *req; int rc = 0; struct kvec iov; + struct smb_rqst rqst = { .rq_iov = &iov, + .rq_nvec = 1 }; cFYI(1, "In echo request"); @@ -1115,7 +1258,7 @@ SMB2_echo(struct TCP_Server_Info *server) /* 4 for rfc1002 length field */ iov.iov_len = get_rfc1002_length(req) + 4; - rc = cifs_call_async(server, &iov, 1, NULL, smb2_echo_callback, server, + rc = cifs_call_async(server, &rqst, NULL, smb2_echo_callback, server, CIFS_ECHO_OP); if (rc) cFYI(1, "Echo request failed: %d", rc); @@ -1123,3 +1266,945 @@ SMB2_echo(struct TCP_Server_Info *server) cifs_small_buf_release(req); return rc; } + +int +SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, + u64 volatile_fid) +{ + struct smb2_flush_req *req; + struct TCP_Server_Info *server; + struct cifs_ses *ses = tcon->ses; + struct kvec iov[1]; + int resp_buftype; + int rc = 0; + + cFYI(1, "Flush"); + + if (ses && (ses->server)) + server = ses->server; + else + return -EIO; + + rc = small_smb2_init(SMB2_FLUSH, tcon, (void **) &req); + if (rc) + return rc; + + req->PersistentFileId = persistent_fid; + req->VolatileFileId = volatile_fid; + + iov[0].iov_base = (char *)req; + /* 4 for rfc1002 length field */ + iov[0].iov_len = get_rfc1002_length(req) + 4; + + rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, 0); + + if ((rc != 0) && tcon) + cifs_stats_fail_inc(tcon, SMB2_FLUSH_HE); + + free_rsp_buf(resp_buftype, iov[0].iov_base); + return rc; +} + +/* + * To form a chain of read requests, any read requests after the first should + * have the end_of_chain boolean set to true. + */ +static int +smb2_new_read_req(struct kvec *iov, struct cifs_io_parms *io_parms, + unsigned int remaining_bytes, int request_type) +{ + int rc = -EACCES; + struct smb2_read_req *req = NULL; + + rc = small_smb2_init(SMB2_READ, io_parms->tcon, (void **) &req); + if (rc) + return rc; + if (io_parms->tcon->ses->server == NULL) + return -ECONNABORTED; + + req->hdr.ProcessId = cpu_to_le32(io_parms->pid); + + req->PersistentFileId = io_parms->persistent_fid; + req->VolatileFileId = io_parms->volatile_fid; + req->ReadChannelInfoOffset = 0; /* reserved */ + req->ReadChannelInfoLength = 0; /* reserved */ + req->Channel = 0; /* reserved */ + req->MinimumCount = 0; + req->Length = cpu_to_le32(io_parms->length); + req->Offset = cpu_to_le64(io_parms->offset); + + if (request_type & CHAINED_REQUEST) { + if (!(request_type & END_OF_CHAIN)) { + /* 4 for rfc1002 length field */ + req->hdr.NextCommand = + cpu_to_le32(get_rfc1002_length(req) + 4); + } else /* END_OF_CHAIN */ + req->hdr.NextCommand = 0; + if (request_type & RELATED_REQUEST) { + req->hdr.Flags |= SMB2_FLAGS_RELATED_OPERATIONS; + /* + * Related requests use info from previous read request + * in chain. + */ + req->hdr.SessionId = 0xFFFFFFFF; + req->hdr.TreeId = 0xFFFFFFFF; + req->PersistentFileId = 0xFFFFFFFF; + req->VolatileFileId = 0xFFFFFFFF; + } + } + if (remaining_bytes > io_parms->length) + req->RemainingBytes = cpu_to_le32(remaining_bytes); + else + req->RemainingBytes = 0; + + iov[0].iov_base = (char *)req; + /* 4 for rfc1002 length field */ + iov[0].iov_len = get_rfc1002_length(req) + 4; + return rc; +} + +static void +smb2_readv_callback(struct mid_q_entry *mid) +{ + struct cifs_readdata *rdata = mid->callback_data; + struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; + struct smb2_hdr *buf = (struct smb2_hdr *)rdata->iov.iov_base; + unsigned int credits_received = 1; + struct smb_rqst rqst = { .rq_iov = &rdata->iov, + .rq_nvec = 1, + .rq_pages = rdata->pages, + .rq_npages = rdata->nr_pages, + .rq_pagesz = rdata->pagesz, + .rq_tailsz = rdata->tailsz }; + + cFYI(1, "%s: mid=%llu state=%d result=%d bytes=%u", __func__, + mid->mid, mid->mid_state, rdata->result, rdata->bytes); + + switch (mid->mid_state) { + case MID_RESPONSE_RECEIVED: + credits_received = le16_to_cpu(buf->CreditRequest); + /* result already set, check signature */ + if (server->sec_mode & + (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { + int rc; + + rc = smb2_verify_signature(&rqst, server); + if (rc) + cERROR(1, "SMB signature verification returned " + "error = %d", rc); + } + /* FIXME: should this be counted toward the initiating task? */ + task_io_account_read(rdata->bytes); + cifs_stats_bytes_read(tcon, rdata->bytes); + break; + case MID_REQUEST_SUBMITTED: + case MID_RETRY_NEEDED: + rdata->result = -EAGAIN; + break; + default: + if (rdata->result != -ENODATA) + rdata->result = -EIO; + } + + if (rdata->result) + cifs_stats_fail_inc(tcon, SMB2_READ_HE); + + queue_work(cifsiod_wq, &rdata->work); + DeleteMidQEntry(mid); + add_credits(server, credits_received, 0); +} + +/* smb2_async_readv - send an async write, and set up mid to handle result */ +int +smb2_async_readv(struct cifs_readdata *rdata) +{ + int rc; + struct smb2_hdr *buf; + struct cifs_io_parms io_parms; + struct smb_rqst rqst = { .rq_iov = &rdata->iov, + .rq_nvec = 1 }; + + cFYI(1, "%s: offset=%llu bytes=%u", __func__, + rdata->offset, rdata->bytes); + + io_parms.tcon = tlink_tcon(rdata->cfile->tlink); + io_parms.offset = rdata->offset; + io_parms.length = rdata->bytes; + io_parms.persistent_fid = rdata->cfile->fid.persistent_fid; + io_parms.volatile_fid = rdata->cfile->fid.volatile_fid; + io_parms.pid = rdata->pid; + rc = smb2_new_read_req(&rdata->iov, &io_parms, 0, 0); + if (rc) + return rc; + + buf = (struct smb2_hdr *)rdata->iov.iov_base; + /* 4 for rfc1002 length field */ + rdata->iov.iov_len = get_rfc1002_length(rdata->iov.iov_base) + 4; + + kref_get(&rdata->refcount); + rc = cifs_call_async(io_parms.tcon->ses->server, &rqst, + cifs_readv_receive, smb2_readv_callback, + rdata, 0); + if (rc) { + kref_put(&rdata->refcount, cifs_readdata_release); + cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE); + } + + cifs_small_buf_release(buf); + return rc; +} + +int +SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, + unsigned int *nbytes, char **buf, int *buf_type) +{ + int resp_buftype, rc = -EACCES; + struct smb2_read_rsp *rsp = NULL; + struct kvec iov[1]; + + *nbytes = 0; + rc = smb2_new_read_req(iov, io_parms, 0, 0); + if (rc) + return rc; + + rc = SendReceive2(xid, io_parms->tcon->ses, iov, 1, + &resp_buftype, CIFS_LOG_ERROR); + + rsp = (struct smb2_read_rsp *)iov[0].iov_base; + + if (rsp->hdr.Status == STATUS_END_OF_FILE) { + free_rsp_buf(resp_buftype, iov[0].iov_base); + return 0; + } + + if (rc) { + cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE); + cERROR(1, "Send error in read = %d", rc); + } else { + *nbytes = le32_to_cpu(rsp->DataLength); + if ((*nbytes > CIFS_MAX_MSGSIZE) || + (*nbytes > io_parms->length)) { + cFYI(1, "bad length %d for count %d", *nbytes, + io_parms->length); + rc = -EIO; + *nbytes = 0; + } + } + + if (*buf) { + memcpy(*buf, (char *)rsp->hdr.ProtocolId + rsp->DataOffset, + *nbytes); + free_rsp_buf(resp_buftype, iov[0].iov_base); + } else if (resp_buftype != CIFS_NO_BUFFER) { + *buf = iov[0].iov_base; + if (resp_buftype == CIFS_SMALL_BUFFER) + *buf_type = CIFS_SMALL_BUFFER; + else if (resp_buftype == CIFS_LARGE_BUFFER) + *buf_type = CIFS_LARGE_BUFFER; + } + return rc; +} + +/* + * Check the mid_state and signature on received buffer (if any), and queue the + * workqueue completion task. + */ +static void +smb2_writev_callback(struct mid_q_entry *mid) +{ + struct cifs_writedata *wdata = mid->callback_data; + struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); + unsigned int written; + struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf; + unsigned int credits_received = 1; + + switch (mid->mid_state) { + case MID_RESPONSE_RECEIVED: + credits_received = le16_to_cpu(rsp->hdr.CreditRequest); + wdata->result = smb2_check_receive(mid, tcon->ses->server, 0); + if (wdata->result != 0) + break; + + written = le32_to_cpu(rsp->DataLength); + /* + * Mask off high 16 bits when bytes written as returned + * by the server is greater than bytes requested by the + * client. OS/2 servers are known to set incorrect + * CountHigh values. + */ + if (written > wdata->bytes) + written &= 0xFFFF; + + if (written < wdata->bytes) + wdata->result = -ENOSPC; + else + wdata->bytes = written; + break; + case MID_REQUEST_SUBMITTED: + case MID_RETRY_NEEDED: + wdata->result = -EAGAIN; + break; + default: + wdata->result = -EIO; + break; + } + + if (wdata->result) + cifs_stats_fail_inc(tcon, SMB2_WRITE_HE); + + queue_work(cifsiod_wq, &wdata->work); + DeleteMidQEntry(mid); + add_credits(tcon->ses->server, credits_received, 0); +} + +/* smb2_async_writev - send an async write, and set up mid to handle result */ +int +smb2_async_writev(struct cifs_writedata *wdata) +{ + int rc = -EACCES; + struct smb2_write_req *req = NULL; + struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); + struct kvec iov; + struct smb_rqst rqst; + + rc = small_smb2_init(SMB2_WRITE, tcon, (void **) &req); + if (rc) + goto async_writev_out; + + req->hdr.ProcessId = cpu_to_le32(wdata->cfile->pid); + + req->PersistentFileId = wdata->cfile->fid.persistent_fid; + req->VolatileFileId = wdata->cfile->fid.volatile_fid; + req->WriteChannelInfoOffset = 0; + req->WriteChannelInfoLength = 0; + req->Channel = 0; + req->Offset = cpu_to_le64(wdata->offset); + /* 4 for rfc1002 length field */ + req->DataOffset = cpu_to_le16( + offsetof(struct smb2_write_req, Buffer) - 4); + req->RemainingBytes = 0; + + /* 4 for rfc1002 length field and 1 for Buffer */ + iov.iov_len = get_rfc1002_length(req) + 4 - 1; + iov.iov_base = req; + + rqst.rq_iov = &iov; + rqst.rq_nvec = 1; + rqst.rq_pages = wdata->pages; + rqst.rq_npages = wdata->nr_pages; + rqst.rq_pagesz = wdata->pagesz; + rqst.rq_tailsz = wdata->tailsz; + + cFYI(1, "async write at %llu %u bytes", wdata->offset, wdata->bytes); + + req->Length = cpu_to_le32(wdata->bytes); + + inc_rfc1001_len(&req->hdr, wdata->bytes - 1 /* Buffer */); + + kref_get(&wdata->refcount); + rc = cifs_call_async(tcon->ses->server, &rqst, NULL, + smb2_writev_callback, wdata, 0); + + if (rc) { + kref_put(&wdata->refcount, cifs_writedata_release); + cifs_stats_fail_inc(tcon, SMB2_WRITE_HE); + } + +async_writev_out: + cifs_small_buf_release(req); + return rc; +} + +/* + * SMB2_write function gets iov pointer to kvec array with n_vec as a length. + * The length field from io_parms must be at least 1 and indicates a number of + * elements with data to write that begins with position 1 in iov array. All + * data length is specified by count. + */ +int +SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, + unsigned int *nbytes, struct kvec *iov, int n_vec) +{ + int rc = 0; + struct smb2_write_req *req = NULL; + struct smb2_write_rsp *rsp = NULL; + int resp_buftype; + *nbytes = 0; + + if (n_vec < 1) + return rc; + + rc = small_smb2_init(SMB2_WRITE, io_parms->tcon, (void **) &req); + if (rc) + return rc; + + if (io_parms->tcon->ses->server == NULL) + return -ECONNABORTED; + + req->hdr.ProcessId = cpu_to_le32(io_parms->pid); + + req->PersistentFileId = io_parms->persistent_fid; + req->VolatileFileId = io_parms->volatile_fid; + req->WriteChannelInfoOffset = 0; + req->WriteChannelInfoLength = 0; + req->Channel = 0; + req->Length = cpu_to_le32(io_parms->length); + req->Offset = cpu_to_le64(io_parms->offset); + /* 4 for rfc1002 length field */ + req->DataOffset = cpu_to_le16( + offsetof(struct smb2_write_req, Buffer) - 4); + req->RemainingBytes = 0; + + iov[0].iov_base = (char *)req; + /* 4 for rfc1002 length field and 1 for Buffer */ + iov[0].iov_len = get_rfc1002_length(req) + 4 - 1; + + /* length of entire message including data to be written */ + inc_rfc1001_len(req, io_parms->length - 1 /* Buffer */); + + rc = SendReceive2(xid, io_parms->tcon->ses, iov, n_vec + 1, + &resp_buftype, 0); + rsp = (struct smb2_write_rsp *)iov[0].iov_base; + + if (rc) { + cifs_stats_fail_inc(io_parms->tcon, SMB2_WRITE_HE); + cERROR(1, "Send error in write = %d", rc); + } else + *nbytes = le32_to_cpu(rsp->DataLength); + + free_rsp_buf(resp_buftype, rsp); + return rc; +} + +static unsigned int +num_entries(char *bufstart, char *end_of_buf, char **lastentry, size_t size) +{ + int len; + unsigned int entrycount = 0; + unsigned int next_offset = 0; + FILE_DIRECTORY_INFO *entryptr; + + if (bufstart == NULL) + return 0; + + entryptr = (FILE_DIRECTORY_INFO *)bufstart; + + while (1) { + entryptr = (FILE_DIRECTORY_INFO *) + ((char *)entryptr + next_offset); + + if ((char *)entryptr + size > end_of_buf) { + cERROR(1, "malformed search entry would overflow"); + break; + } + + len = le32_to_cpu(entryptr->FileNameLength); + if ((char *)entryptr + len + size > end_of_buf) { + cERROR(1, "directory entry name would overflow frame " + "end of buf %p", end_of_buf); + break; + } + + *lastentry = (char *)entryptr; + entrycount++; + + next_offset = le32_to_cpu(entryptr->NextEntryOffset); + if (!next_offset) + break; + } + + return entrycount; +} + +/* + * Readdir/FindFirst + */ +int +SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, int index, + struct cifs_search_info *srch_inf) +{ + struct smb2_query_directory_req *req; + struct smb2_query_directory_rsp *rsp = NULL; + struct kvec iov[2]; + int rc = 0; + int len; + int resp_buftype; + unsigned char *bufptr; + struct TCP_Server_Info *server; + struct cifs_ses *ses = tcon->ses; + __le16 asteriks = cpu_to_le16('*'); + char *end_of_smb; + unsigned int output_size = CIFSMaxBufSize; + size_t info_buf_size; + + if (ses && (ses->server)) + server = ses->server; + else + return -EIO; + + rc = small_smb2_init(SMB2_QUERY_DIRECTORY, tcon, (void **) &req); + if (rc) + return rc; + + switch (srch_inf->info_level) { + case SMB_FIND_FILE_DIRECTORY_INFO: + req->FileInformationClass = FILE_DIRECTORY_INFORMATION; + info_buf_size = sizeof(FILE_DIRECTORY_INFO) - 1; + break; + case SMB_FIND_FILE_ID_FULL_DIR_INFO: + req->FileInformationClass = FILEID_FULL_DIRECTORY_INFORMATION; + info_buf_size = sizeof(SEARCH_ID_FULL_DIR_INFO) - 1; + break; + default: + cERROR(1, "info level %u isn't supported", + srch_inf->info_level); + rc = -EINVAL; + goto qdir_exit; + } + + req->FileIndex = cpu_to_le32(index); + req->PersistentFileId = persistent_fid; + req->VolatileFileId = volatile_fid; + + len = 0x2; + bufptr = req->Buffer; + memcpy(bufptr, &asteriks, len); + + req->FileNameOffset = + cpu_to_le16(sizeof(struct smb2_query_directory_req) - 1 - 4); + req->FileNameLength = cpu_to_le16(len); + /* + * BB could be 30 bytes or so longer if we used SMB2 specific + * buffer lengths, but this is safe and close enough. + */ + output_size = min_t(unsigned int, output_size, server->maxBuf); + output_size = min_t(unsigned int, output_size, 2 << 15); + req->OutputBufferLength = cpu_to_le32(output_size); + + iov[0].iov_base = (char *)req; + /* 4 for RFC1001 length and 1 for Buffer */ + iov[0].iov_len = get_rfc1002_length(req) + 4 - 1; + + iov[1].iov_base = (char *)(req->Buffer); + iov[1].iov_len = len; + + inc_rfc1001_len(req, len - 1 /* Buffer */); + + rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, 0); + rsp = (struct smb2_query_directory_rsp *)iov[0].iov_base; + + if (rc) { + cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE); + goto qdir_exit; + } + + rc = validate_buf(le16_to_cpu(rsp->OutputBufferOffset), + le32_to_cpu(rsp->OutputBufferLength), &rsp->hdr, + info_buf_size); + if (rc) + goto qdir_exit; + + srch_inf->unicode = true; + + if (srch_inf->ntwrk_buf_start) { + if (srch_inf->smallBuf) + cifs_small_buf_release(srch_inf->ntwrk_buf_start); + else + cifs_buf_release(srch_inf->ntwrk_buf_start); + } + srch_inf->ntwrk_buf_start = (char *)rsp; + srch_inf->srch_entries_start = srch_inf->last_entry = 4 /* rfclen */ + + (char *)&rsp->hdr + le16_to_cpu(rsp->OutputBufferOffset); + /* 4 for rfc1002 length field */ + end_of_smb = get_rfc1002_length(rsp) + 4 + (char *)&rsp->hdr; + srch_inf->entries_in_buffer = + num_entries(srch_inf->srch_entries_start, end_of_smb, + &srch_inf->last_entry, info_buf_size); + srch_inf->index_of_last_entry += srch_inf->entries_in_buffer; + cFYI(1, "num entries %d last_index %lld srch start %p srch end %p", + srch_inf->entries_in_buffer, srch_inf->index_of_last_entry, + srch_inf->srch_entries_start, srch_inf->last_entry); + if (resp_buftype == CIFS_LARGE_BUFFER) + srch_inf->smallBuf = false; + else if (resp_buftype == CIFS_SMALL_BUFFER) + srch_inf->smallBuf = true; + else + cERROR(1, "illegal search buffer type"); + + if (rsp->hdr.Status == STATUS_NO_MORE_FILES) + srch_inf->endOfSearch = 1; + else + srch_inf->endOfSearch = 0; + + return rc; + +qdir_exit: + free_rsp_buf(resp_buftype, rsp); + return rc; +} + +static int +send_set_info(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, u32 pid, int info_class, + unsigned int num, void **data, unsigned int *size) +{ + struct smb2_set_info_req *req; + struct smb2_set_info_rsp *rsp = NULL; + struct kvec *iov; + int rc = 0; + int resp_buftype; + unsigned int i; + struct TCP_Server_Info *server; + struct cifs_ses *ses = tcon->ses; + + if (ses && (ses->server)) + server = ses->server; + else + return -EIO; + + if (!num) + return -EINVAL; + + iov = kmalloc(sizeof(struct kvec) * num, GFP_KERNEL); + if (!iov) + return -ENOMEM; + + rc = small_smb2_init(SMB2_SET_INFO, tcon, (void **) &req); + if (rc) { + kfree(iov); + return rc; + } + + req->hdr.ProcessId = cpu_to_le32(pid); + + req->InfoType = SMB2_O_INFO_FILE; + req->FileInfoClass = info_class; + req->PersistentFileId = persistent_fid; + req->VolatileFileId = volatile_fid; + + /* 4 for RFC1001 length and 1 for Buffer */ + req->BufferOffset = + cpu_to_le16(sizeof(struct smb2_set_info_req) - 1 - 4); + req->BufferLength = cpu_to_le32(*size); + + inc_rfc1001_len(req, *size - 1 /* Buffer */); + + memcpy(req->Buffer, *data, *size); + + iov[0].iov_base = (char *)req; + /* 4 for RFC1001 length */ + iov[0].iov_len = get_rfc1002_length(req) + 4; + + for (i = 1; i < num; i++) { + inc_rfc1001_len(req, size[i]); + le32_add_cpu(&req->BufferLength, size[i]); + iov[i].iov_base = (char *)data[i]; + iov[i].iov_len = size[i]; + } + + rc = SendReceive2(xid, ses, iov, num, &resp_buftype, 0); + rsp = (struct smb2_set_info_rsp *)iov[0].iov_base; + + if (rc != 0) { + cifs_stats_fail_inc(tcon, SMB2_SET_INFO_HE); + goto out; + } +out: + free_rsp_buf(resp_buftype, rsp); + kfree(iov); + return rc; +} + +int +SMB2_rename(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, __le16 *target_file) +{ + struct smb2_file_rename_info info; + void **data; + unsigned int size[2]; + int rc; + int len = (2 * UniStrnlen((wchar_t *)target_file, PATH_MAX)); + + data = kmalloc(sizeof(void *) * 2, GFP_KERNEL); + if (!data) + return -ENOMEM; + + info.ReplaceIfExists = 1; /* 1 = replace existing target with new */ + /* 0 = fail if target already exists */ + info.RootDirectory = 0; /* MBZ for network ops (why does spec say?) */ + info.FileNameLength = cpu_to_le32(len); + + data[0] = &info; + size[0] = sizeof(struct smb2_file_rename_info); + + data[1] = target_file; + size[1] = len + 2 /* null */; + + rc = send_set_info(xid, tcon, persistent_fid, volatile_fid, + current->tgid, FILE_RENAME_INFORMATION, 2, data, + size); + kfree(data); + return rc; +} + +int +SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, __le16 *target_file) +{ + struct smb2_file_link_info info; + void **data; + unsigned int size[2]; + int rc; + int len = (2 * UniStrnlen((wchar_t *)target_file, PATH_MAX)); + + data = kmalloc(sizeof(void *) * 2, GFP_KERNEL); + if (!data) + return -ENOMEM; + + info.ReplaceIfExists = 0; /* 1 = replace existing link with new */ + /* 0 = fail if link already exists */ + info.RootDirectory = 0; /* MBZ for network ops (why does spec say?) */ + info.FileNameLength = cpu_to_le32(len); + + data[0] = &info; + size[0] = sizeof(struct smb2_file_link_info); + + data[1] = target_file; + size[1] = len + 2 /* null */; + + rc = send_set_info(xid, tcon, persistent_fid, volatile_fid, + current->tgid, FILE_LINK_INFORMATION, 2, data, size); + kfree(data); + return rc; +} + +int +SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, + u64 volatile_fid, u32 pid, __le64 *eof) +{ + struct smb2_file_eof_info info; + void *data; + unsigned int size; + + info.EndOfFile = *eof; + + data = &info; + size = sizeof(struct smb2_file_eof_info); + + return send_set_info(xid, tcon, persistent_fid, volatile_fid, pid, + FILE_END_OF_FILE_INFORMATION, 1, &data, &size); +} + +int +SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, FILE_BASIC_INFO *buf) +{ + unsigned int size; + size = sizeof(FILE_BASIC_INFO); + return send_set_info(xid, tcon, persistent_fid, volatile_fid, + current->tgid, FILE_BASIC_INFORMATION, 1, + (void **)&buf, &size); +} + +int +SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, + const u64 persistent_fid, const u64 volatile_fid, + __u8 oplock_level) +{ + int rc; + struct smb2_oplock_break *req = NULL; + + cFYI(1, "SMB2_oplock_break"); + rc = small_smb2_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req); + + if (rc) + return rc; + + req->VolatileFid = volatile_fid; + req->PersistentFid = persistent_fid; + req->OplockLevel = oplock_level; + req->hdr.CreditRequest = cpu_to_le16(1); + + rc = SendReceiveNoRsp(xid, tcon->ses, (char *) req, CIFS_OBREAK_OP); + /* SMB2 buffer freed by function above */ + + if (rc) { + cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE); + cFYI(1, "Send error in Oplock Break = %d", rc); + } + + return rc; +} + +static void +copy_fs_info_to_kstatfs(struct smb2_fs_full_size_info *pfs_inf, + struct kstatfs *kst) +{ + kst->f_bsize = le32_to_cpu(pfs_inf->BytesPerSector) * + le32_to_cpu(pfs_inf->SectorsPerAllocationUnit); + kst->f_blocks = le64_to_cpu(pfs_inf->TotalAllocationUnits); + kst->f_bfree = le64_to_cpu(pfs_inf->ActualAvailableAllocationUnits); + kst->f_bavail = le64_to_cpu(pfs_inf->CallerAvailableAllocationUnits); + return; +} + +static int +build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level, + int outbuf_len, u64 persistent_fid, u64 volatile_fid) +{ + int rc; + struct smb2_query_info_req *req; + + cFYI(1, "Query FSInfo level %d", level); + + if ((tcon->ses == NULL) || (tcon->ses->server == NULL)) + return -EIO; + + rc = small_smb2_init(SMB2_QUERY_INFO, tcon, (void **) &req); + if (rc) + return rc; + + req->InfoType = SMB2_O_INFO_FILESYSTEM; + req->FileInfoClass = level; + req->PersistentFileId = persistent_fid; + req->VolatileFileId = volatile_fid; + /* 4 for rfc1002 length field and 1 for pad */ + req->InputBufferOffset = + cpu_to_le16(sizeof(struct smb2_query_info_req) - 1 - 4); + req->OutputBufferLength = cpu_to_le32( + outbuf_len + sizeof(struct smb2_query_info_rsp) - 1 - 4); + + iov->iov_base = (char *)req; + /* 4 for rfc1002 length field */ + iov->iov_len = get_rfc1002_length(req) + 4; + return 0; +} + +int +SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, struct kstatfs *fsdata) +{ + struct smb2_query_info_rsp *rsp = NULL; + struct kvec iov; + int rc = 0; + int resp_buftype; + struct cifs_ses *ses = tcon->ses; + struct smb2_fs_full_size_info *info = NULL; + + rc = build_qfs_info_req(&iov, tcon, FS_FULL_SIZE_INFORMATION, + sizeof(struct smb2_fs_full_size_info), + persistent_fid, volatile_fid); + if (rc) + return rc; + + rc = SendReceive2(xid, ses, &iov, 1, &resp_buftype, 0); + if (rc) { + cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); + goto qinf_exit; + } + rsp = (struct smb2_query_info_rsp *)iov.iov_base; + + info = (struct smb2_fs_full_size_info *)(4 /* RFC1001 len */ + + le16_to_cpu(rsp->OutputBufferOffset) + (char *)&rsp->hdr); + rc = validate_buf(le16_to_cpu(rsp->OutputBufferOffset), + le32_to_cpu(rsp->OutputBufferLength), &rsp->hdr, + sizeof(struct smb2_fs_full_size_info)); + if (!rc) + copy_fs_info_to_kstatfs(info, fsdata); + +qinf_exit: + free_rsp_buf(resp_buftype, iov.iov_base); + return rc; +} + +int +smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, + const __u64 persist_fid, const __u64 volatile_fid, const __u32 pid, + const __u32 num_lock, struct smb2_lock_element *buf) +{ + int rc = 0; + struct smb2_lock_req *req = NULL; + struct kvec iov[2]; + int resp_buf_type; + unsigned int count; + + cFYI(1, "smb2_lockv num lock %d", num_lock); + + rc = small_smb2_init(SMB2_LOCK, tcon, (void **) &req); + if (rc) + return rc; + + req->hdr.ProcessId = cpu_to_le32(pid); + req->LockCount = cpu_to_le16(num_lock); + + req->PersistentFileId = persist_fid; + req->VolatileFileId = volatile_fid; + + count = num_lock * sizeof(struct smb2_lock_element); + inc_rfc1001_len(req, count - sizeof(struct smb2_lock_element)); + + iov[0].iov_base = (char *)req; + /* 4 for rfc1002 length field and count for all locks */ + iov[0].iov_len = get_rfc1002_length(req) + 4 - count; + iov[1].iov_base = (char *)buf; + iov[1].iov_len = count; + + cifs_stats_inc(&tcon->stats.cifs_stats.num_locks); + rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, CIFS_NO_RESP); + if (rc) { + cFYI(1, "Send error in smb2_lockv = %d", rc); + cifs_stats_fail_inc(tcon, SMB2_LOCK_HE); + } + + return rc; +} + +int +SMB2_lock(const unsigned int xid, struct cifs_tcon *tcon, + const __u64 persist_fid, const __u64 volatile_fid, const __u32 pid, + const __u64 length, const __u64 offset, const __u32 lock_flags, + const bool wait) +{ + struct smb2_lock_element lock; + + lock.Offset = cpu_to_le64(offset); + lock.Length = cpu_to_le64(length); + lock.Flags = cpu_to_le32(lock_flags); + if (!wait && lock_flags != SMB2_LOCKFLAG_UNLOCK) + lock.Flags |= cpu_to_le32(SMB2_LOCKFLAG_FAIL_IMMEDIATELY); + + return smb2_lockv(xid, tcon, persist_fid, volatile_fid, pid, 1, &lock); +} + +int +SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, + __u8 *lease_key, const __le32 lease_state) +{ + int rc; + struct smb2_lease_ack *req = NULL; + + cFYI(1, "SMB2_lease_break"); + rc = small_smb2_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req); + + if (rc) + return rc; + + req->hdr.CreditRequest = cpu_to_le16(1); + req->StructureSize = cpu_to_le16(36); + inc_rfc1001_len(req, 12); + + memcpy(req->LeaseKey, lease_key, 16); + req->LeaseState = lease_state; + + rc = SendReceiveNoRsp(xid, tcon->ses, (char *) req, CIFS_OBREAK_OP); + /* SMB2 buffer freed by function above */ + + if (rc) { + cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE); + cFYI(1, "Send error in Lease Break = %d", rc); + } + + return rc; +} diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index c5fbfac..4cb4ced 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -96,7 +96,7 @@ * */ -#define SMB2_HEADER_STRUCTURE_SIZE __constant_le16_to_cpu(64) +#define SMB2_HEADER_STRUCTURE_SIZE __constant_cpu_to_le16(64) struct smb2_hdr { __be32 smb2_buf_length; /* big endian on wire */ @@ -140,7 +140,7 @@ struct smb2_pdu { * */ -#define SMB2_ERROR_STRUCTURE_SIZE2 __constant_le16_to_cpu(9) +#define SMB2_ERROR_STRUCTURE_SIZE2 __constant_cpu_to_le16(9) struct smb2_err_rsp { struct smb2_hdr hdr; @@ -150,6 +150,10 @@ struct smb2_err_rsp { __u8 ErrorData[1]; /* variable length */ } __packed; +#define SMB2_CLIENT_GUID_SIZE 16 + +extern __u8 cifs_client_guid[SMB2_CLIENT_GUID_SIZE]; + struct smb2_negotiate_req { struct smb2_hdr hdr; __le16 StructureSize; /* Must be 36 */ @@ -157,11 +161,17 @@ struct smb2_negotiate_req { __le16 SecurityMode; __le16 Reserved; /* MBZ */ __le32 Capabilities; - __u8 ClientGUID[16]; /* MBZ */ + __u8 ClientGUID[SMB2_CLIENT_GUID_SIZE]; __le64 ClientStartTime; /* MBZ */ - __le16 Dialects[2]; /* variable length */ + __le16 Dialects[1]; /* One dialect (vers=) at a time for now */ } __packed; +/* Dialects */ +#define SMB20_PROT_ID 0x0202 +#define SMB21_PROT_ID 0x0210 +#define SMB30_PROT_ID 0x0300 +#define BAD_PROT_ID 0xFFFF + /* SecurityMode flags */ #define SMB2_NEGOTIATE_SIGNING_ENABLED 0x0001 #define SMB2_NEGOTIATE_SIGNING_REQUIRED 0x0002 @@ -169,6 +179,10 @@ struct smb2_negotiate_req { #define SMB2_GLOBAL_CAP_DFS 0x00000001 #define SMB2_GLOBAL_CAP_LEASING 0x00000002 /* Resp only New to SMB2.1 */ #define SMB2_GLOBAL_CAP_LARGE_MTU 0X00000004 /* Resp only New to SMB2.1 */ +#define SMB2_GLOBAL_CAP_MULTI_CHANNEL 0x00000008 /* New to SMB3 */ +#define SMB2_GLOBAL_CAP_PERSISTENT_HANDLES 0x00000010 /* New to SMB3 */ +#define SMB2_GLOBAL_CAP_DIRECTORY_LEASING 0x00000020 /* New to SMB3 */ +#define SMB2_GLOBAL_CAP_ENCRYPTION 0x00000040 /* New to SMB3 */ /* Internal types */ #define SMB2_NT_FIND 0x00100000 #define SMB2_LARGE_FILES 0x00200000 @@ -307,6 +321,8 @@ struct smb2_tree_disconnect_rsp { #define SMB2_OPLOCK_LEVEL_EXCLUSIVE 0x08 #define SMB2_OPLOCK_LEVEL_BATCH 0x09 #define SMB2_OPLOCK_LEVEL_LEASE 0xFF +/* Non-spec internal type */ +#define SMB2_OPLOCK_LEVEL_NOCHANGE 0x99 /* Desired Access Flags */ #define FILE_READ_DATA_LE cpu_to_le32(0x00000001) @@ -404,7 +420,7 @@ struct smb2_create_req { __le16 NameLength; __le32 CreateContextsOffset; __le32 CreateContextsLength; - __u8 Buffer[1]; + __u8 Buffer[8]; } __packed; struct smb2_create_rsp { @@ -428,6 +444,39 @@ struct smb2_create_rsp { __u8 Buffer[1]; } __packed; +struct create_context { + __le32 Next; + __le16 NameOffset; + __le16 NameLength; + __le16 Reserved; + __le16 DataOffset; + __le32 DataLength; + __u8 Buffer[0]; +} __packed; + +#define SMB2_LEASE_NONE __constant_cpu_to_le32(0x00) +#define SMB2_LEASE_READ_CACHING __constant_cpu_to_le32(0x01) +#define SMB2_LEASE_HANDLE_CACHING __constant_cpu_to_le32(0x02) +#define SMB2_LEASE_WRITE_CACHING __constant_cpu_to_le32(0x04) + +#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS __constant_cpu_to_le32(0x02) + +#define SMB2_LEASE_KEY_SIZE 16 + +struct lease_context { + __le64 LeaseKeyLow; + __le64 LeaseKeyHigh; + __le32 LeaseState; + __le32 LeaseFlags; + __le64 LeaseDuration; +} __packed; + +struct create_lease { + struct create_context ccontext; + __u8 Name[8]; + struct lease_context lcontext; +} __packed; + /* Currently defined values for close flags */ #define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB cpu_to_le16(0x0001) struct smb2_close_req { @@ -453,6 +502,108 @@ struct smb2_close_rsp { __le32 Attributes; } __packed; +struct smb2_flush_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 24 */ + __le16 Reserved1; + __le32 Reserved2; + __u64 PersistentFileId; /* opaque endianness */ + __u64 VolatileFileId; /* opaque endianness */ +} __packed; + +struct smb2_flush_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; + __le16 Reserved; +} __packed; + +struct smb2_read_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 49 */ + __u8 Padding; /* offset from start of SMB2 header to place read */ + __u8 Reserved; + __le32 Length; + __le64 Offset; + __u64 PersistentFileId; /* opaque endianness */ + __u64 VolatileFileId; /* opaque endianness */ + __le32 MinimumCount; + __le32 Channel; /* Reserved MBZ */ + __le32 RemainingBytes; + __le16 ReadChannelInfoOffset; /* Reserved MBZ */ + __le16 ReadChannelInfoLength; /* Reserved MBZ */ + __u8 Buffer[1]; +} __packed; + +struct smb2_read_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 17 */ + __u8 DataOffset; + __u8 Reserved; + __le32 DataLength; + __le32 DataRemaining; + __u32 Reserved2; + __u8 Buffer[1]; +} __packed; + +/* For write request Flags field below the following flag is defined: */ +#define SMB2_WRITEFLAG_WRITE_THROUGH 0x00000001 + +struct smb2_write_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 49 */ + __le16 DataOffset; /* offset from start of SMB2 header to write data */ + __le32 Length; + __le64 Offset; + __u64 PersistentFileId; /* opaque endianness */ + __u64 VolatileFileId; /* opaque endianness */ + __le32 Channel; /* Reserved MBZ */ + __le32 RemainingBytes; + __le16 WriteChannelInfoOffset; /* Reserved MBZ */ + __le16 WriteChannelInfoLength; /* Reserved MBZ */ + __le32 Flags; + __u8 Buffer[1]; +} __packed; + +struct smb2_write_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 17 */ + __u8 DataOffset; + __u8 Reserved; + __le32 DataLength; + __le32 DataRemaining; + __u32 Reserved2; + __u8 Buffer[1]; +} __packed; + +#define SMB2_LOCKFLAG_SHARED_LOCK 0x0001 +#define SMB2_LOCKFLAG_EXCLUSIVE_LOCK 0x0002 +#define SMB2_LOCKFLAG_UNLOCK 0x0004 +#define SMB2_LOCKFLAG_FAIL_IMMEDIATELY 0x0010 + +struct smb2_lock_element { + __le64 Offset; + __le64 Length; + __le32 Flags; + __le32 Reserved; +} __packed; + +struct smb2_lock_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 48 */ + __le16 LockCount; + __le32 Reserved; + __u64 PersistentFileId; /* opaque endianness */ + __u64 VolatileFileId; /* opaque endianness */ + /* Followed by at least one */ + struct smb2_lock_element locks[1]; +} __packed; + +struct smb2_lock_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 4 */ + __le16 Reserved; +} __packed; + struct smb2_echo_req { struct smb2_hdr hdr; __le16 StructureSize; /* Must be 4 */ @@ -465,6 +616,34 @@ struct smb2_echo_rsp { __u16 Reserved; } __packed; +/* search (query_directory) Flags field */ +#define SMB2_RESTART_SCANS 0x01 +#define SMB2_RETURN_SINGLE_ENTRY 0x02 +#define SMB2_INDEX_SPECIFIED 0x04 +#define SMB2_REOPEN 0x10 + +struct smb2_query_directory_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 33 */ + __u8 FileInformationClass; + __u8 Flags; + __le32 FileIndex; + __u64 PersistentFileId; /* opaque endianness */ + __u64 VolatileFileId; /* opaque endianness */ + __le16 FileNameOffset; + __le16 FileNameLength; + __le32 OutputBufferLength; + __u8 Buffer[1]; +} __packed; + +struct smb2_query_directory_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 9 */ + __le16 OutputBufferOffset; + __le32 OutputBufferLength; + __u8 Buffer[1]; +} __packed; + /* Possible InfoType values */ #define SMB2_O_INFO_FILE 0x01 #define SMB2_O_INFO_FILESYSTEM 0x02 @@ -495,11 +674,84 @@ struct smb2_query_info_rsp { __u8 Buffer[1]; } __packed; +struct smb2_set_info_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 33 */ + __u8 InfoType; + __u8 FileInfoClass; + __le32 BufferLength; + __le16 BufferOffset; + __u16 Reserved; + __le32 AdditionalInformation; + __u64 PersistentFileId; /* opaque endianness */ + __u64 VolatileFileId; /* opaque endianness */ + __u8 Buffer[1]; +} __packed; + +struct smb2_set_info_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 2 */ +} __packed; + +struct smb2_oplock_break { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 24 */ + __u8 OplockLevel; + __u8 Reserved; + __le32 Reserved2; + __u64 PersistentFid; + __u64 VolatileFid; +} __packed; + +#define SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED cpu_to_le32(0x01) + +struct smb2_lease_break { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 44 */ + __le16 Reserved; + __le32 Flags; + __u8 LeaseKey[16]; + __le32 CurrentLeaseState; + __le32 NewLeaseState; + __le32 BreakReason; + __le32 AccessMaskHint; + __le32 ShareMaskHint; +} __packed; + +struct smb2_lease_ack { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 36 */ + __le16 Reserved; + __le32 Flags; + __u8 LeaseKey[16]; + __le32 LeaseState; + __le64 LeaseDuration; +} __packed; + /* * PDU infolevel structure definitions * BB consider moving to a different header */ +/* File System Information Classes */ +#define FS_VOLUME_INFORMATION 1 /* Query */ +#define FS_LABEL_INFORMATION 2 /* Set */ +#define FS_SIZE_INFORMATION 3 /* Query */ +#define FS_DEVICE_INFORMATION 4 /* Query */ +#define FS_ATTRIBUTE_INFORMATION 5 /* Query */ +#define FS_CONTROL_INFORMATION 6 /* Query, Set */ +#define FS_FULL_SIZE_INFORMATION 7 /* Query */ +#define FS_OBJECT_ID_INFORMATION 8 /* Query, Set */ +#define FS_DRIVER_PATH_INFORMATION 9 /* Query */ + +struct smb2_fs_full_size_info { + __le64 TotalAllocationUnits; + __le64 CallerAvailableAllocationUnits; + __le64 ActualAvailableAllocationUnits; + __le32 SectorsPerAllocationUnit; + __le32 BytesPerSector; +} __packed; + /* partial list of QUERY INFO levels */ #define FILE_DIRECTORY_INFORMATION 1 #define FILE_FULL_DIRECTORY_INFORMATION 2 @@ -548,6 +800,28 @@ struct smb2_query_info_rsp { #define FILEID_GLOBAL_TX_DIRECTORY_INFORMATION 50 #define FILE_STANDARD_LINK_INFORMATION 54 +struct smb2_file_internal_info { + __le64 IndexNumber; +} __packed; /* level 6 Query */ + +struct smb2_file_rename_info { /* encoding of request for level 10 */ + __u8 ReplaceIfExists; /* 1 = replace existing target with new */ + /* 0 = fail if target already exists */ + __u8 Reserved[7]; + __u64 RootDirectory; /* MBZ for network operations (why says spec?) */ + __le32 FileNameLength; + char FileName[0]; /* New name to be assigned */ +} __packed; /* level 10 Set */ + +struct smb2_file_link_info { /* encoding of request for level 11 */ + __u8 ReplaceIfExists; /* 1 = replace existing link with new */ + /* 0 = fail if link already exists */ + __u8 Reserved[7]; + __u64 RootDirectory; /* MBZ for network operations (why says spec?) */ + __le32 FileNameLength; + char FileName[0]; /* Name to be assigned to new link */ +} __packed; /* level 11 Set */ + /* * This level 18, although with struct with same name is different from cifs * level 0x107. Level 0x107 has an extra u64 between AccessFlags and @@ -576,4 +850,8 @@ struct smb2_file_all_info { /* data block encoding of response to level 18 */ char FileName[1]; } __packed; /* level 18 Query */ +struct smb2_file_eof_info { /* encoding of request for level 10 */ + __le64 EndOfFile; /* new end of file value */ +} __packed; /* level 20 Set */ + #endif /* _SMB2PDU_H */ diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index bfaa7b1..7d25f8b 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -26,6 +26,7 @@ #include <linux/key-type.h> struct statfs; +struct smb_rqst; /* ***************************************************************** @@ -34,24 +35,35 @@ struct statfs; */ extern int map_smb2_to_linux_error(char *buf, bool log_err); extern int smb2_check_message(char *buf, unsigned int length); -extern unsigned int smb2_calc_size(struct smb2_hdr *hdr); +extern unsigned int smb2_calc_size(void *buf); extern char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr); extern __le16 *cifs_convert_path_to_utf16(const char *from, struct cifs_sb_info *cifs_sb); +extern int smb2_verify_signature(struct smb_rqst *, struct TCP_Server_Info *); extern int smb2_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, bool log_error); -extern int smb2_setup_request(struct cifs_ses *ses, struct kvec *iov, - unsigned int nvec, struct mid_q_entry **ret_mid); -extern int smb2_setup_async_request(struct TCP_Server_Info *server, - struct kvec *iov, unsigned int nvec, - struct mid_q_entry **ret_mid); +extern struct mid_q_entry *smb2_setup_request(struct cifs_ses *ses, + struct smb_rqst *rqst); +extern struct mid_q_entry *smb2_setup_async_request( + struct TCP_Server_Info *server, struct smb_rqst *rqst); extern void smb2_echo_request(struct work_struct *work); +extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode); +extern __u8 smb2_map_lease_to_oplock(__le32 lease_state); +extern bool smb2_is_valid_oplock_break(char *buffer, + struct TCP_Server_Info *srv); +extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst, + struct smb2_file_all_info *src); extern int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, FILE_ALL_INFO *data, bool *adjust_tz); +extern int smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, + const char *full_path, __u64 size, + struct cifs_sb_info *cifs_sb, bool set_alloc); +extern int smb2_set_file_info(struct inode *inode, const char *full_path, + FILE_BASIC_INFO *buf, const unsigned int xid); extern int smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb); extern void smb2_mkdir_setinfo(struct inode *inode, const char *full_path, @@ -59,6 +71,24 @@ extern void smb2_mkdir_setinfo(struct inode *inode, const char *full_path, struct cifs_tcon *tcon, const unsigned int xid); extern int smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb); +extern int smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, + const char *name, struct cifs_sb_info *cifs_sb); +extern int smb2_rename_path(const unsigned int xid, struct cifs_tcon *tcon, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb); +extern int smb2_create_hardlink(const unsigned int xid, struct cifs_tcon *tcon, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb); + +extern int smb2_open_file(const unsigned int xid, struct cifs_tcon *tcon, + const char *full_path, int disposition, + int desired_access, int create_options, + struct cifs_fid *fid, __u32 *oplock, + FILE_ALL_INFO *buf, struct cifs_sb_info *cifs_sb); +extern void smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock); +extern int smb2_unlock_range(struct cifsFileInfo *cfile, + struct file_lock *flock, const unsigned int xid); +extern int smb2_push_mandatory_locks(struct cifsFileInfo *cfile); /* * SMB2 Worker functions - most of protocol specific implementation details @@ -75,12 +105,55 @@ extern int SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon); extern int SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path, u64 *persistent_fid, u64 *volatile_fid, __u32 desired_access, __u32 create_disposition, - __u32 file_attributes, __u32 create_options); + __u32 file_attributes, __u32 create_options, + __u8 *oplock, struct smb2_file_all_info *buf); extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id); +extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_file_id, u64 volatile_file_id); extern int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id, struct smb2_file_all_info *data); +extern int SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, + __le64 *uniqueid); +extern int smb2_async_readv(struct cifs_readdata *rdata); +extern int SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, + unsigned int *nbytes, char **buf, int *buf_type); +extern int smb2_async_writev(struct cifs_writedata *wdata); +extern int SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, + unsigned int *nbytes, struct kvec *iov, int n_vec); extern int SMB2_echo(struct TCP_Server_Info *server); +extern int SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, int index, + struct cifs_search_info *srch_inf); +extern int SMB2_rename(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, + __le16 *target_file); +extern int SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, + __le16 *target_file); +extern int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, u32 pid, + __le64 *eof); +extern int SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, + FILE_BASIC_INFO *buf); +extern int SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, + const u64 persistent_fid, const u64 volatile_fid, + const __u8 oplock_level); +extern int SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_file_id, u64 volatile_file_id, + struct kstatfs *FSData); +extern int SMB2_lock(const unsigned int xid, struct cifs_tcon *tcon, + const __u64 persist_fid, const __u64 volatile_fid, + const __u32 pid, const __u64 length, const __u64 offset, + const __u32 lockFlags, const bool wait); +extern int smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, + const __u64 persist_fid, const __u64 volatile_fid, + const __u32 pid, const __u32 num_lock, + struct smb2_lock_element *buf); +extern int SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, + __u8 *lease_key, const __le32 lease_state); #endif /* _SMB2PROTO_H */ diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 31f5d42..2a5fdf2 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -30,12 +30,156 @@ #include <linux/uaccess.h> #include <asm/processor.h> #include <linux/mempool.h> +#include <linux/highmem.h> #include "smb2pdu.h" #include "cifsglob.h" #include "cifsproto.h" #include "smb2proto.h" #include "cifs_debug.h" #include "smb2status.h" +#include "smb2glob.h" + +static int +smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) +{ + int i, rc; + unsigned char smb2_signature[SMB2_HMACSHA256_SIZE]; + unsigned char *sigptr = smb2_signature; + struct kvec *iov = rqst->rq_iov; + int n_vec = rqst->rq_nvec; + struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base; + + memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE); + memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE); + + rc = crypto_shash_setkey(server->secmech.hmacsha256, + server->session_key.response, SMB2_NTLMV2_SESSKEY_SIZE); + if (rc) { + cERROR(1, "%s: Could not update with response\n", __func__); + return rc; + } + + rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash); + if (rc) { + cERROR(1, "%s: Could not init md5\n", __func__); + return rc; + } + + for (i = 0; i < n_vec; i++) { + if (iov[i].iov_len == 0) + continue; + if (iov[i].iov_base == NULL) { + cERROR(1, "null iovec entry"); + return -EIO; + } + /* + * The first entry includes a length field (which does not get + * signed that occupies the first 4 bytes before the header). + */ + if (i == 0) { + if (iov[0].iov_len <= 8) /* cmd field at offset 9 */ + break; /* nothing to sign or corrupt header */ + rc = + crypto_shash_update( + &server->secmech.sdeschmacsha256->shash, + iov[i].iov_base + 4, iov[i].iov_len - 4); + } else { + rc = + crypto_shash_update( + &server->secmech.sdeschmacsha256->shash, + iov[i].iov_base, iov[i].iov_len); + } + if (rc) { + cERROR(1, "%s: Could not update with payload\n", + __func__); + return rc; + } + } + + /* now hash over the rq_pages array */ + for (i = 0; i < rqst->rq_npages; i++) { + struct kvec p_iov; + + cifs_rqst_page_to_kvec(rqst, i, &p_iov); + crypto_shash_update(&server->secmech.sdeschmacsha256->shash, + p_iov.iov_base, p_iov.iov_len); + kunmap(rqst->rq_pages[i]); + } + + rc = crypto_shash_final(&server->secmech.sdeschmacsha256->shash, + sigptr); + if (rc) + cERROR(1, "%s: Could not generate sha256 hash\n", __func__); + + memcpy(smb2_pdu->Signature, sigptr, SMB2_SIGNATURE_SIZE); + + return rc; +} + +/* must be called with server->srv_mutex held */ +static int +smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server) +{ + int rc = 0; + struct smb2_hdr *smb2_pdu = rqst->rq_iov[0].iov_base; + + if (!(smb2_pdu->Flags & SMB2_FLAGS_SIGNED) || + server->tcpStatus == CifsNeedNegotiate) + return rc; + + if (!server->session_estab) { + strncpy(smb2_pdu->Signature, "BSRSPYL", 8); + return rc; + } + + rc = smb2_calc_signature(rqst, server); + + return rc; +} + +int +smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) +{ + unsigned int rc; + char server_response_sig[16]; + struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)rqst->rq_iov[0].iov_base; + + if ((smb2_pdu->Command == SMB2_NEGOTIATE) || + (smb2_pdu->Command == SMB2_OPLOCK_BREAK) || + (!server->session_estab)) + return 0; + + /* + * BB what if signatures are supposed to be on for session but + * server does not send one? BB + */ + + /* Do not need to verify session setups with signature "BSRSPYL " */ + if (memcmp(smb2_pdu->Signature, "BSRSPYL ", 8) == 0) + cFYI(1, "dummy signature received for smb command 0x%x", + smb2_pdu->Command); + + /* + * Save off the origiginal signature so we can modify the smb and check + * our calculated signature against what the server sent. + */ + memcpy(server_response_sig, smb2_pdu->Signature, SMB2_SIGNATURE_SIZE); + + memset(smb2_pdu->Signature, 0, SMB2_SIGNATURE_SIZE); + + mutex_lock(&server->srv_mutex); + rc = smb2_calc_signature(rqst, server); + mutex_unlock(&server->srv_mutex); + + if (rc) + return rc; + + if (memcmp(server_response_sig, smb2_pdu->Signature, + SMB2_SIGNATURE_SIZE)) + return -EACCES; + else + return 0; +} /* * Set message id for the request. Should be called after wait_for_free_request @@ -115,58 +259,66 @@ smb2_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, bool log_error) { unsigned int len = get_rfc1002_length(mid->resp_buf); + struct kvec iov; + struct smb_rqst rqst = { .rq_iov = &iov, + .rq_nvec = 1 }; + + iov.iov_base = (char *)mid->resp_buf; + iov.iov_len = get_rfc1002_length(mid->resp_buf) + 4; dump_smb(mid->resp_buf, min_t(u32, 80, len)); /* convert the length into a more usable form */ - /* BB - uncomment with SMB2 signing implementation */ - /* if ((len > 24) && + if ((len > 24) && (server->sec_mode & (SECMODE_SIGN_REQUIRED|SECMODE_SIGN_ENABLED))) { - if (smb2_verify_signature(mid->resp_buf, server)) - cERROR(1, "Unexpected SMB signature"); - } */ + int rc; + + rc = smb2_verify_signature(&rqst, server); + if (rc) + cERROR(1, "SMB signature verification returned error = " + "%d", rc); + } return map_smb2_to_linux_error(mid->resp_buf, log_error); } -int -smb2_setup_request(struct cifs_ses *ses, struct kvec *iov, - unsigned int nvec, struct mid_q_entry **ret_mid) +struct mid_q_entry * +smb2_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst) { int rc; - struct smb2_hdr *hdr = (struct smb2_hdr *)iov[0].iov_base; + struct smb2_hdr *hdr = (struct smb2_hdr *)rqst->rq_iov[0].iov_base; struct mid_q_entry *mid; smb2_seq_num_into_buf(ses->server, hdr); rc = smb2_get_mid_entry(ses, hdr, &mid); if (rc) - return rc; - /* rc = smb2_sign_smb2(iov, nvec, ses->server); - if (rc) - delete_mid(mid); */ - *ret_mid = mid; - return rc; + return ERR_PTR(rc); + rc = smb2_sign_rqst(rqst, ses->server); + if (rc) { + cifs_delete_mid(mid); + return ERR_PTR(rc); + } + return mid; } -int -smb2_setup_async_request(struct TCP_Server_Info *server, struct kvec *iov, - unsigned int nvec, struct mid_q_entry **ret_mid) +struct mid_q_entry * +smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst) { - int rc = 0; - struct smb2_hdr *hdr = (struct smb2_hdr *)iov[0].iov_base; + int rc; + struct smb2_hdr *hdr = (struct smb2_hdr *)rqst->rq_iov[0].iov_base; struct mid_q_entry *mid; smb2_seq_num_into_buf(server, hdr); mid = smb2_mid_entry_alloc(hdr, server); if (mid == NULL) - return -ENOMEM; + return ERR_PTR(-ENOMEM); - /* rc = smb2_sign_smb2(iov, nvec, server); + rc = smb2_sign_rqst(rqst, server); if (rc) { DeleteMidQEntry(mid); - return rc; - }*/ - *ret_mid = mid; - return rc; + return ERR_PTR(rc); + } + + return mid; } diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index d9b639b..76d974c 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -27,6 +27,8 @@ #include <linux/net.h> #include <linux/delay.h> #include <linux/freezer.h> +#include <linux/tcp.h> +#include <linux/highmem.h> #include <asm/uaccess.h> #include <asm/processor.h> #include <linux/mempool.h> @@ -109,8 +111,8 @@ DeleteMidQEntry(struct mid_q_entry *midEntry) mempool_free(midEntry, cifs_mid_poolp); } -static void -delete_mid(struct mid_q_entry *mid) +void +cifs_delete_mid(struct mid_q_entry *mid) { spin_lock(&GlobalMid_Lock); list_del(&mid->qhead); @@ -119,18 +121,29 @@ delete_mid(struct mid_q_entry *mid) DeleteMidQEntry(mid); } +/* + * smb_send_kvec - send an array of kvecs to the server + * @server: Server to send the data to + * @iov: Pointer to array of kvecs + * @n_vec: length of kvec array + * @sent: amount of data sent on socket is stored here + * + * Our basic "send data to server" function. Should be called with srv_mutex + * held. The caller is responsible for handling the results. + */ static int -smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) +smb_send_kvec(struct TCP_Server_Info *server, struct kvec *iov, size_t n_vec, + size_t *sent) { int rc = 0; int i = 0; struct msghdr smb_msg; - unsigned int len = iov[0].iov_len; - unsigned int total_len; - int first_vec = 0; - unsigned int smb_buf_length = get_rfc1002_length(iov[0].iov_base); + unsigned int remaining; + size_t first_vec = 0; struct socket *ssocket = server->ssocket; + *sent = 0; + if (ssocket == NULL) return -ENOTSOCK; /* BB eventually add reconnect code here */ @@ -143,56 +156,66 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) else smb_msg.msg_flags = MSG_NOSIGNAL; - total_len = 0; + remaining = 0; for (i = 0; i < n_vec; i++) - total_len += iov[i].iov_len; - - cFYI(1, "Sending smb: total_len %d", total_len); - dump_smb(iov[0].iov_base, len); + remaining += iov[i].iov_len; i = 0; - while (total_len) { + while (remaining) { + /* + * If blocking send, we try 3 times, since each can block + * for 5 seconds. For nonblocking we have to try more + * but wait increasing amounts of time allowing time for + * socket to clear. The overall time we wait in either + * case to send on the socket is about 15 seconds. + * Similarly we wait for 15 seconds for a response from + * the server in SendReceive[2] for the server to send + * a response back for most types of requests (except + * SMB Write past end of file which can be slow, and + * blocking lock operations). NFS waits slightly longer + * than CIFS, but this can make it take longer for + * nonresponsive servers to be detected and 15 seconds + * is more than enough time for modern networks to + * send a packet. In most cases if we fail to send + * after the retries we will kill the socket and + * reconnect which may clear the network problem. + */ rc = kernel_sendmsg(ssocket, &smb_msg, &iov[first_vec], - n_vec - first_vec, total_len); - if ((rc == -ENOSPC) || (rc == -EAGAIN)) { - i++; + n_vec - first_vec, remaining); + if (rc == -ENOSPC || rc == -EAGAIN) { /* - * If blocking send we try 3 times, since each can block - * for 5 seconds. For nonblocking we have to try more - * but wait increasing amounts of time allowing time for - * socket to clear. The overall time we wait in either - * case to send on the socket is about 15 seconds. - * Similarly we wait for 15 seconds for a response from - * the server in SendReceive[2] for the server to send - * a response back for most types of requests (except - * SMB Write past end of file which can be slow, and - * blocking lock operations). NFS waits slightly longer - * than CIFS, but this can make it take longer for - * nonresponsive servers to be detected and 15 seconds - * is more than enough time for modern networks to - * send a packet. In most cases if we fail to send - * after the retries we will kill the socket and - * reconnect which may clear the network problem. + * Catch if a low level driver returns -ENOSPC. This + * WARN_ON will be removed by 3.10 if no one reports + * seeing this. */ - if ((i >= 14) || (!server->noblocksnd && (i > 2))) { - cERROR(1, "sends on sock %p stuck for 15 seconds", - ssocket); + WARN_ON_ONCE(rc == -ENOSPC); + i++; + if (i >= 14 || (!server->noblocksnd && (i > 2))) { + cERROR(1, "sends on sock %p stuck for 15 " + "seconds", ssocket); rc = -EAGAIN; break; } msleep(1 << i); continue; } + if (rc < 0) break; - if (rc == total_len) { - total_len = 0; + /* send was at least partially successful */ + *sent += rc; + + if (rc == remaining) { + remaining = 0; break; - } else if (rc > total_len) { - cERROR(1, "sent %d requested %d", rc, total_len); + } + + if (rc > remaining) { + cERROR(1, "sent %d requested %d", rc, remaining); break; } + if (rc == 0) { /* should never happen, letting socket clear before retrying is our only obvious option here */ @@ -200,7 +223,9 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) msleep(500); continue; } - total_len -= rc; + + remaining -= rc; + /* the line below resets i */ for (i = first_vec; i < n_vec; i++) { if (iov[i].iov_len) { @@ -215,16 +240,97 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) } } } + i = 0; /* in case we get ENOSPC on the next send */ + rc = 0; } + return rc; +} + +/** + * rqst_page_to_kvec - Turn a slot in the smb_rqst page array into a kvec + * @rqst: pointer to smb_rqst + * @idx: index into the array of the page + * @iov: pointer to struct kvec that will hold the result + * + * Helper function to convert a slot in the rqst->rq_pages array into a kvec. + * The page will be kmapped and the address placed into iov_base. The length + * will then be adjusted according to the ptailoff. + */ +void +cifs_rqst_page_to_kvec(struct smb_rqst *rqst, unsigned int idx, + struct kvec *iov) +{ + /* + * FIXME: We could avoid this kmap altogether if we used + * kernel_sendpage instead of kernel_sendmsg. That will only + * work if signing is disabled though as sendpage inlines the + * page directly into the fraglist. If userspace modifies the + * page after we calculate the signature, then the server will + * reject it and may break the connection. kernel_sendmsg does + * an extra copy of the data and avoids that issue. + */ + iov->iov_base = kmap(rqst->rq_pages[idx]); + + /* if last page, don't send beyond this offset into page */ + if (idx == (rqst->rq_npages - 1)) + iov->iov_len = rqst->rq_tailsz; + else + iov->iov_len = rqst->rq_pagesz; +} + +static int +smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst) +{ + int rc; + struct kvec *iov = rqst->rq_iov; + int n_vec = rqst->rq_nvec; + unsigned int smb_buf_length = get_rfc1002_length(iov[0].iov_base); + unsigned int i; + size_t total_len = 0, sent; + struct socket *ssocket = server->ssocket; + int val = 1; + + cFYI(1, "Sending smb: smb_len=%u", smb_buf_length); + dump_smb(iov[0].iov_base, iov[0].iov_len); + + /* cork the socket */ + kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK, + (char *)&val, sizeof(val)); + + rc = smb_send_kvec(server, iov, n_vec, &sent); + if (rc < 0) + goto uncork; + + total_len += sent; + + /* now walk the page array and send each page in it */ + for (i = 0; i < rqst->rq_npages; i++) { + struct kvec p_iov; + + cifs_rqst_page_to_kvec(rqst, i, &p_iov); + rc = smb_send_kvec(server, &p_iov, 1, &sent); + kunmap(rqst->rq_pages[i]); + if (rc < 0) + break; + + total_len += sent; + } + +uncork: + /* uncork it */ + val = 0; + kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK, + (char *)&val, sizeof(val)); if ((total_len > 0) && (total_len != smb_buf_length + 4)) { - cFYI(1, "partial send (%d remaining), terminating session", - total_len); - /* If we have only sent part of an SMB then the next SMB - could be taken as the remainder of this one. We need - to kill the socket so the server throws away the partial - SMB */ + cFYI(1, "partial send (wanted=%u sent=%zu): terminating " + "session", smb_buf_length + 4, total_len); + /* + * If we have only sent part of an SMB then the next SMB could + * be taken as the remainder of this one. We need to kill the + * socket so the server throws away the partial SMB + */ server->tcpStatus = CifsNeedReconnect; } @@ -236,6 +342,15 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) return rc; } +static int +smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) +{ + struct smb_rqst rqst = { .rq_iov = iov, + .rq_nvec = n_vec }; + + return smb_send_rqst(server, &rqst); +} + int smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer, unsigned int smb_buf_length) @@ -345,12 +460,11 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ) return 0; } -int -cifs_setup_async_request(struct TCP_Server_Info *server, struct kvec *iov, - unsigned int nvec, struct mid_q_entry **ret_mid) +struct mid_q_entry * +cifs_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst) { int rc; - struct smb_hdr *hdr = (struct smb_hdr *)iov[0].iov_base; + struct smb_hdr *hdr = (struct smb_hdr *)rqst->rq_iov[0].iov_base; struct mid_q_entry *mid; /* enable signing if server requires it */ @@ -359,16 +473,15 @@ cifs_setup_async_request(struct TCP_Server_Info *server, struct kvec *iov, mid = AllocMidQEntry(hdr, server); if (mid == NULL) - return -ENOMEM; + return ERR_PTR(-ENOMEM); - rc = cifs_sign_smbv(iov, nvec, server, &mid->sequence_number); + rc = cifs_sign_rqst(rqst, server, &mid->sequence_number); if (rc) { DeleteMidQEntry(mid); - return rc; + return ERR_PTR(rc); } - *ret_mid = mid; - return 0; + return mid; } /* @@ -376,9 +489,9 @@ cifs_setup_async_request(struct TCP_Server_Info *server, struct kvec *iov, * the result. Caller is responsible for dealing with timeouts. */ int -cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, - unsigned int nvec, mid_receive_t *receive, - mid_callback_t *callback, void *cbdata, const int flags) +cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, + mid_receive_t *receive, mid_callback_t *callback, + void *cbdata, const int flags) { int rc, timeout, optype; struct mid_q_entry *mid; @@ -391,12 +504,12 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, return rc; mutex_lock(&server->srv_mutex); - rc = server->ops->setup_async_request(server, iov, nvec, &mid); - if (rc) { + mid = server->ops->setup_async_request(server, rqst); + if (IS_ERR(mid)) { mutex_unlock(&server->srv_mutex); add_credits(server, 1, optype); wake_up(&server->request_q); - return rc; + return PTR_ERR(mid); } mid->receive = receive; @@ -411,7 +524,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, cifs_in_send_inc(server); - rc = smb_sendv(server, iov, nvec); + rc = smb_send_rqst(server, rqst); cifs_in_send_dec(server); cifs_save_when_sent(mid); mutex_unlock(&server->srv_mutex); @@ -419,7 +532,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, if (rc == 0) return 0; - delete_mid(mid); + cifs_delete_mid(mid); add_credits(server, 1, optype); wake_up(&server->request_q); return rc; @@ -504,11 +617,13 @@ cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { struct kvec iov; int rc = 0; + struct smb_rqst rqst = { .rq_iov = &iov, + .rq_nvec = 1 }; iov.iov_base = mid->resp_buf; iov.iov_len = len; /* FIXME: add code to kill session */ - rc = cifs_verify_signature(&iov, 1, server, + rc = cifs_verify_signature(&rqst, server, mid->sequence_number + 1); if (rc) cERROR(1, "SMB signature verification returned error = " @@ -519,22 +634,22 @@ cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, return map_smb_to_linux_error(mid->resp_buf, log_error); } -int -cifs_setup_request(struct cifs_ses *ses, struct kvec *iov, - unsigned int nvec, struct mid_q_entry **ret_mid) +struct mid_q_entry * +cifs_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst) { int rc; - struct smb_hdr *hdr = (struct smb_hdr *)iov[0].iov_base; + struct smb_hdr *hdr = (struct smb_hdr *)rqst->rq_iov[0].iov_base; struct mid_q_entry *mid; rc = allocate_mid(ses, hdr, &mid); if (rc) - return rc; - rc = cifs_sign_smbv(iov, nvec, ses->server, &mid->sequence_number); - if (rc) - delete_mid(mid); - *ret_mid = mid; - return rc; + return ERR_PTR(rc); + rc = cifs_sign_rqst(rqst, ses->server, &mid->sequence_number); + if (rc) { + cifs_delete_mid(mid); + return ERR_PTR(rc); + } + return mid; } int @@ -547,6 +662,8 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, struct mid_q_entry *midQ; char *buf = iov[0].iov_base; unsigned int credits = 1; + struct smb_rqst rqst = { .rq_iov = iov, + .rq_nvec = n_vec }; timeout = flags & CIFS_TIMEOUT_MASK; optype = flags & CIFS_OP_MASK; @@ -584,13 +701,13 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, mutex_lock(&ses->server->srv_mutex); - rc = ses->server->ops->setup_request(ses, iov, n_vec, &midQ); - if (rc) { + midQ = ses->server->ops->setup_request(ses, &rqst); + if (IS_ERR(midQ)) { mutex_unlock(&ses->server->srv_mutex); cifs_small_buf_release(buf); /* Update # of requests on wire to server */ add_credits(ses->server, 1, optype); - return rc; + return PTR_ERR(midQ); } midQ->mid_state = MID_REQUEST_SUBMITTED; @@ -652,11 +769,11 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, rc = ses->server->ops->check_receive(midQ, ses->server, flags & CIFS_LOG_ERROR); - /* mark it so buf will not be freed by delete_mid */ + /* mark it so buf will not be freed by cifs_delete_mid */ if ((flags & CIFS_NO_RESP) == 0) midQ->resp_buf = NULL; out: - delete_mid(midQ); + cifs_delete_mid(midQ); add_credits(ses->server, credits, optype); return rc; @@ -762,7 +879,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4); rc = cifs_check_receive(midQ, ses->server, 0); out: - delete_mid(midQ); + cifs_delete_mid(midQ); add_credits(ses->server, 1, 0); return rc; @@ -846,7 +963,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number); if (rc) { - delete_mid(midQ); + cifs_delete_mid(midQ); mutex_unlock(&ses->server->srv_mutex); return rc; } @@ -859,7 +976,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, mutex_unlock(&ses->server->srv_mutex); if (rc < 0) { - delete_mid(midQ); + cifs_delete_mid(midQ); return rc; } @@ -880,7 +997,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, blocking lock to return. */ rc = send_cancel(ses->server, in_buf, midQ); if (rc) { - delete_mid(midQ); + cifs_delete_mid(midQ); return rc; } } else { @@ -892,7 +1009,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, /* If we get -ENOLCK back the lock may have already been removed. Don't exit in this case. */ if (rc && rc != -ENOLCK) { - delete_mid(midQ); + cifs_delete_mid(midQ); return rc; } } @@ -929,7 +1046,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4); rc = cifs_check_receive(midQ, ses->server, 0); out: - delete_mid(midQ); + cifs_delete_mid(midQ); if (rstart && rc == -EACCES) return -ERESTARTSYS; return rc; diff --git a/fs/coda/inode.c b/fs/coda/inode.c index f181312..be2aa49 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -85,6 +85,11 @@ int coda_init_inodecache(void) void coda_destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(coda_inode_cachep); } @@ -107,43 +112,41 @@ static const struct super_operations coda_super_operations = static int get_device_index(struct coda_mount_data *data) { - struct file *file; + struct fd f; struct inode *inode; int idx; - if(data == NULL) { + if (data == NULL) { printk("coda_read_super: Bad mount data\n"); return -1; } - if(data->version != CODA_MOUNT_VERSION) { + if (data->version != CODA_MOUNT_VERSION) { printk("coda_read_super: Bad mount version\n"); return -1; } - file = fget(data->fd); - inode = NULL; - if(file) - inode = file->f_path.dentry->d_inode; - - if(!inode || !S_ISCHR(inode->i_mode) || - imajor(inode) != CODA_PSDEV_MAJOR) { - if(file) - fput(file); - - printk("coda_read_super: Bad file\n"); - return -1; + f = fdget(data->fd); + if (!f.file) + goto Ebadf; + inode = f.file->f_path.dentry->d_inode; + if (!S_ISCHR(inode->i_mode) || imajor(inode) != CODA_PSDEV_MAJOR) { + fdput(f); + goto Ebadf; } idx = iminor(inode); - fput(file); + fdput(f); - if(idx < 0 || idx >= MAX_CODADEVS) { + if (idx < 0 || idx >= MAX_CODADEVS) { printk("coda_read_super: Bad minor number\n"); return -1; } return idx; +Ebadf: + printk("coda_read_super: Bad file\n"); + return -1; } static int coda_fill_super(struct super_block *sb, void *data, int silent) diff --git a/fs/compat.c b/fs/compat.c index 1bdb350..b7a24d0 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -870,22 +870,20 @@ asmlinkage long compat_sys_old_readdir(unsigned int fd, struct compat_old_linux_dirent __user *dirent, unsigned int count) { int error; - struct file *file; - int fput_needed; + struct fd f = fdget(fd); struct compat_readdir_callback buf; - file = fget_light(fd, &fput_needed); - if (!file) + if (!f.file) return -EBADF; buf.result = 0; buf.dirent = dirent; - error = vfs_readdir(file, compat_fillonedir, &buf); + error = vfs_readdir(f.file, compat_fillonedir, &buf); if (buf.result) error = buf.result; - fput_light(file, fput_needed); + fdput(f); return error; } @@ -949,17 +947,16 @@ efault: asmlinkage long compat_sys_getdents(unsigned int fd, struct compat_linux_dirent __user *dirent, unsigned int count) { - struct file * file; + struct fd f; struct compat_linux_dirent __user * lastdirent; struct compat_getdents_callback buf; - int fput_needed; int error; if (!access_ok(VERIFY_WRITE, dirent, count)) return -EFAULT; - file = fget_light(fd, &fput_needed); - if (!file) + f = fdget(fd); + if (!f.file) return -EBADF; buf.current_dir = dirent; @@ -967,17 +964,17 @@ asmlinkage long compat_sys_getdents(unsigned int fd, buf.count = count; buf.error = 0; - error = vfs_readdir(file, compat_filldir, &buf); + error = vfs_readdir(f.file, compat_filldir, &buf); if (error >= 0) error = buf.error; lastdirent = buf.previous; if (lastdirent) { - if (put_user(file->f_pos, &lastdirent->d_off)) + if (put_user(f.file->f_pos, &lastdirent->d_off)) error = -EFAULT; else error = count - buf.count; } - fput_light(file, fput_needed); + fdput(f); return error; } @@ -1035,17 +1032,16 @@ efault: asmlinkage long compat_sys_getdents64(unsigned int fd, struct linux_dirent64 __user * dirent, unsigned int count) { - struct file * file; + struct fd f; struct linux_dirent64 __user * lastdirent; struct compat_getdents_callback64 buf; - int fput_needed; int error; if (!access_ok(VERIFY_WRITE, dirent, count)) return -EFAULT; - file = fget_light(fd, &fput_needed); - if (!file) + f = fdget(fd); + if (!f.file) return -EBADF; buf.current_dir = dirent; @@ -1053,18 +1049,18 @@ asmlinkage long compat_sys_getdents64(unsigned int fd, buf.count = count; buf.error = 0; - error = vfs_readdir(file, compat_filldir64, &buf); + error = vfs_readdir(f.file, compat_filldir64, &buf); if (error >= 0) error = buf.error; lastdirent = buf.previous; if (lastdirent) { - typeof(lastdirent->d_off) d_off = file->f_pos; + typeof(lastdirent->d_off) d_off = f.file->f_pos; if (__put_user_unaligned(d_off, &lastdirent->d_off)) error = -EFAULT; else error = count - buf.count; } - fput_light(file, fput_needed); + fdput(f); return error; } #endif /* ! __ARCH_OMIT_COMPAT_SYS_GETDENTS64 */ @@ -1152,18 +1148,16 @@ asmlinkage ssize_t compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen) { - struct file *file; - int fput_needed; + struct fd f = fdget(fd); ssize_t ret; loff_t pos; - file = fget_light(fd, &fput_needed); - if (!file) + if (!f.file) return -EBADF; - pos = file->f_pos; - ret = compat_readv(file, vec, vlen, &pos); - file->f_pos = pos; - fput_light(file, fput_needed); + pos = f.file->f_pos; + ret = compat_readv(f.file, vec, vlen, &pos); + f.file->f_pos = pos; + fdput(f); return ret; } @@ -1171,19 +1165,18 @@ asmlinkage ssize_t compat_sys_preadv64(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen, loff_t pos) { - struct file *file; - int fput_needed; + struct fd f; ssize_t ret; if (pos < 0) return -EINVAL; - file = fget_light(fd, &fput_needed); - if (!file) + f = fdget(fd); + if (!f.file) return -EBADF; ret = -ESPIPE; - if (file->f_mode & FMODE_PREAD) - ret = compat_readv(file, vec, vlen, &pos); - fput_light(file, fput_needed); + if (f.file->f_mode & FMODE_PREAD) + ret = compat_readv(f.file, vec, vlen, &pos); + fdput(f); return ret; } @@ -1221,18 +1214,16 @@ asmlinkage ssize_t compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen) { - struct file *file; - int fput_needed; + struct fd f = fdget(fd); ssize_t ret; loff_t pos; - file = fget_light(fd, &fput_needed); - if (!file) + if (!f.file) return -EBADF; - pos = file->f_pos; - ret = compat_writev(file, vec, vlen, &pos); - file->f_pos = pos; - fput_light(file, fput_needed); + pos = f.file->f_pos; + ret = compat_writev(f.file, vec, vlen, &pos); + f.file->f_pos = pos; + fdput(f); return ret; } @@ -1240,19 +1231,18 @@ asmlinkage ssize_t compat_sys_pwritev64(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen, loff_t pos) { - struct file *file; - int fput_needed; + struct fd f; ssize_t ret; if (pos < 0) return -EINVAL; - file = fget_light(fd, &fput_needed); - if (!file) + f = fdget(fd); + if (!f.file) return -EBADF; ret = -ESPIPE; - if (file->f_mode & FMODE_PWRITE) - ret = compat_writev(file, vec, vlen, &pos); - fput_light(file, fput_needed); + if (f.file->f_mode & FMODE_PWRITE) + ret = compat_writev(f.file, vec, vlen, &pos); + fdput(f); return ret; } @@ -1802,3 +1792,25 @@ compat_sys_open_by_handle_at(int mountdirfd, return do_handle_open(mountdirfd, handle, flags); } #endif + +#ifdef __ARCH_WANT_COMPAT_SYS_SENDFILE +asmlinkage long compat_sys_sendfile(int out_fd, int in_fd, + compat_off_t __user *offset, compat_size_t count) +{ + loff_t pos; + off_t off; + ssize_t ret; + + if (offset) { + if (unlikely(get_user(off, offset))) + return -EFAULT; + pos = off; + ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); + if (unlikely(put_user(pos, offset))) + return -EFAULT; + return ret; + } + + return do_sendfile(out_fd, in_fd, NULL, count, 0); +} +#endif /* __ARCH_WANT_COMPAT_SYS_SENDFILE */ diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index 112e45a..a81147e 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -38,6 +38,13 @@ #define elf_addr_t Elf32_Addr /* + * Some data types as stored in coredump. + */ +#define user_long_t compat_long_t +#define user_siginfo_t compat_siginfo_t +#define copy_siginfo_to_user copy_siginfo_to_user32 + +/* * The machine-dependent core note format types are defined in elfcore-compat.h, * which requires asm/elf.h to define compat_elf_gregset_t et al. */ diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index debdfe0..f505402 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -866,6 +866,12 @@ COMPATIBLE_IOCTL(TIOCGPTN) COMPATIBLE_IOCTL(TIOCSPTLCK) COMPATIBLE_IOCTL(TIOCSERGETLSR) COMPATIBLE_IOCTL(TIOCSIG) +#ifdef TIOCSRS485 +COMPATIBLE_IOCTL(TIOCSRS485) +#endif +#ifdef TIOCGRS485 +COMPATIBLE_IOCTL(TIOCGRS485) +#endif #ifdef TCGETS2 COMPATIBLE_IOCTL(TCGETS2) COMPATIBLE_IOCTL(TCSETS2) @@ -897,6 +903,8 @@ COMPATIBLE_IOCTL(KDGKBSENT) COMPATIBLE_IOCTL(KDSKBSENT) COMPATIBLE_IOCTL(KDGKBDIACR) COMPATIBLE_IOCTL(KDSKBDIACR) +COMPATIBLE_IOCTL(KDGKBDIACRUC) +COMPATIBLE_IOCTL(KDSKBDIACRUC) COMPATIBLE_IOCTL(KDKBDREP) COMPATIBLE_IOCTL(KDGKBLED) COMPATIBLE_IOCTL(KDGETLED) @@ -1531,16 +1539,13 @@ static int compat_ioctl_check_table(unsigned int xcmd) asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) { - struct file *filp; + struct fd f = fdget(fd); int error = -EBADF; - int fput_needed; - - filp = fget_light(fd, &fput_needed); - if (!filp) + if (!f.file) goto out; /* RED-PEN how should LSM module know it's handling 32bit? */ - error = security_file_ioctl(filp, cmd, arg); + error = security_file_ioctl(f.file, cmd, arg); if (error) goto out_fput; @@ -1560,30 +1565,30 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, #if defined(CONFIG_IA64) || defined(CONFIG_X86_64) case FS_IOC_RESVSP_32: case FS_IOC_RESVSP64_32: - error = compat_ioctl_preallocate(filp, compat_ptr(arg)); + error = compat_ioctl_preallocate(f.file, compat_ptr(arg)); goto out_fput; #else case FS_IOC_RESVSP: case FS_IOC_RESVSP64: - error = ioctl_preallocate(filp, compat_ptr(arg)); + error = ioctl_preallocate(f.file, compat_ptr(arg)); goto out_fput; #endif case FIBMAP: case FIGETBSZ: case FIONREAD: - if (S_ISREG(filp->f_path.dentry->d_inode->i_mode)) + if (S_ISREG(f.file->f_path.dentry->d_inode->i_mode)) break; /*FALL THROUGH*/ default: - if (filp->f_op && filp->f_op->compat_ioctl) { - error = filp->f_op->compat_ioctl(filp, cmd, arg); + if (f.file->f_op && f.file->f_op->compat_ioctl) { + error = f.file->f_op->compat_ioctl(f.file, cmd, arg); if (error != -ENOIOCTLCMD) goto out_fput; } - if (!filp->f_op || !filp->f_op->unlocked_ioctl) + if (!f.file->f_op || !f.file->f_op->unlocked_ioctl) goto do_ioctl; break; } @@ -1591,7 +1596,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, if (compat_ioctl_check_table(XFORM(cmd))) goto found_handler; - error = do_ioctl_trans(fd, cmd, arg, filp); + error = do_ioctl_trans(fd, cmd, arg, f.file); if (error == -ENOIOCTLCMD) error = -ENOTTY; @@ -1600,9 +1605,9 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, found_handler: arg = (unsigned long)compat_ptr(arg); do_ioctl: - error = do_vfs_ioctl(filp, fd, cmd, arg); + error = do_vfs_ioctl(f.file, fd, cmd, arg); out_fput: - fput_light(filp, fput_needed); + fdput(f); out: return error; } diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 0074362..a9d35b0 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -79,8 +79,8 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr) return -ENOMEM; /* assign default attributes */ sd_iattr->ia_mode = sd->s_mode; - sd_iattr->ia_uid = 0; - sd_iattr->ia_gid = 0; + sd_iattr->ia_uid = GLOBAL_ROOT_UID; + sd_iattr->ia_gid = GLOBAL_ROOT_GID; sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME; sd->s_iattr = sd_iattr; } diff --git a/fs/coredump.c b/fs/coredump.c new file mode 100644 index 0000000..fd37fac --- /dev/null +++ b/fs/coredump.c @@ -0,0 +1,692 @@ +#include <linux/slab.h> +#include <linux/file.h> +#include <linux/fdtable.h> +#include <linux/mm.h> +#include <linux/stat.h> +#include <linux/fcntl.h> +#include <linux/swap.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/pagemap.h> +#include <linux/perf_event.h> +#include <linux/highmem.h> +#include <linux/spinlock.h> +#include <linux/key.h> +#include <linux/personality.h> +#include <linux/binfmts.h> +#include <linux/coredump.h> +#include <linux/utsname.h> +#include <linux/pid_namespace.h> +#include <linux/module.h> +#include <linux/namei.h> +#include <linux/mount.h> +#include <linux/security.h> +#include <linux/syscalls.h> +#include <linux/tsacct_kern.h> +#include <linux/cn_proc.h> +#include <linux/audit.h> +#include <linux/tracehook.h> +#include <linux/kmod.h> +#include <linux/fsnotify.h> +#include <linux/fs_struct.h> +#include <linux/pipe_fs_i.h> +#include <linux/oom.h> +#include <linux/compat.h> + +#include <asm/uaccess.h> +#include <asm/mmu_context.h> +#include <asm/tlb.h> +#include <asm/exec.h> + +#include <trace/events/task.h> +#include "internal.h" +#include "coredump.h" + +#include <trace/events/sched.h> + +int core_uses_pid; +char core_pattern[CORENAME_MAX_SIZE] = "core"; +unsigned int core_pipe_limit; + +struct core_name { + char *corename; + int used, size; +}; +static atomic_t call_count = ATOMIC_INIT(1); + +/* The maximal length of core_pattern is also specified in sysctl.c */ + +static int expand_corename(struct core_name *cn) +{ + char *old_corename = cn->corename; + + cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count); + cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL); + + if (!cn->corename) { + kfree(old_corename); + return -ENOMEM; + } + + return 0; +} + +static int cn_printf(struct core_name *cn, const char *fmt, ...) +{ + char *cur; + int need; + int ret; + va_list arg; + + va_start(arg, fmt); + need = vsnprintf(NULL, 0, fmt, arg); + va_end(arg); + + if (likely(need < cn->size - cn->used - 1)) + goto out_printf; + + ret = expand_corename(cn); + if (ret) + goto expand_fail; + +out_printf: + cur = cn->corename + cn->used; + va_start(arg, fmt); + vsnprintf(cur, need + 1, fmt, arg); + va_end(arg); + cn->used += need; + return 0; + +expand_fail: + return ret; +} + +static void cn_escape(char *str) +{ + for (; *str; str++) + if (*str == '/') + *str = '!'; +} + +static int cn_print_exe_file(struct core_name *cn) +{ + struct file *exe_file; + char *pathbuf, *path; + int ret; + + exe_file = get_mm_exe_file(current->mm); + if (!exe_file) { + char *commstart = cn->corename + cn->used; + ret = cn_printf(cn, "%s (path unknown)", current->comm); + cn_escape(commstart); + return ret; + } + + pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY); + if (!pathbuf) { + ret = -ENOMEM; + goto put_exe_file; + } + + path = d_path(&exe_file->f_path, pathbuf, PATH_MAX); + if (IS_ERR(path)) { + ret = PTR_ERR(path); + goto free_buf; + } + + cn_escape(path); + + ret = cn_printf(cn, "%s", path); + +free_buf: + kfree(pathbuf); +put_exe_file: + fput(exe_file); + return ret; +} + +/* format_corename will inspect the pattern parameter, and output a + * name into corename, which must have space for at least + * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. + */ +static int format_corename(struct core_name *cn, struct coredump_params *cprm) +{ + const struct cred *cred = current_cred(); + const char *pat_ptr = core_pattern; + int ispipe = (*pat_ptr == '|'); + int pid_in_pattern = 0; + int err = 0; + + cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count); + cn->corename = kmalloc(cn->size, GFP_KERNEL); + cn->used = 0; + + if (!cn->corename) + return -ENOMEM; + + /* Repeat as long as we have more pattern to process and more output + space */ + while (*pat_ptr) { + if (*pat_ptr != '%') { + if (*pat_ptr == 0) + goto out; + err = cn_printf(cn, "%c", *pat_ptr++); + } else { + switch (*++pat_ptr) { + /* single % at the end, drop that */ + case 0: + goto out; + /* Double percent, output one percent */ + case '%': + err = cn_printf(cn, "%c", '%'); + break; + /* pid */ + case 'p': + pid_in_pattern = 1; + err = cn_printf(cn, "%d", + task_tgid_vnr(current)); + break; + /* uid */ + case 'u': + err = cn_printf(cn, "%d", cred->uid); + break; + /* gid */ + case 'g': + err = cn_printf(cn, "%d", cred->gid); + break; + case 'd': + err = cn_printf(cn, "%d", + __get_dumpable(cprm->mm_flags)); + break; + /* signal that caused the coredump */ + case 's': + err = cn_printf(cn, "%ld", cprm->siginfo->si_signo); + break; + /* UNIX time of coredump */ + case 't': { + struct timeval tv; + do_gettimeofday(&tv); + err = cn_printf(cn, "%lu", tv.tv_sec); + break; + } + /* hostname */ + case 'h': { + char *namestart = cn->corename + cn->used; + down_read(&uts_sem); + err = cn_printf(cn, "%s", + utsname()->nodename); + up_read(&uts_sem); + cn_escape(namestart); + break; + } + /* executable */ + case 'e': { + char *commstart = cn->corename + cn->used; + err = cn_printf(cn, "%s", current->comm); + cn_escape(commstart); + break; + } + case 'E': + err = cn_print_exe_file(cn); + break; + /* core limit size */ + case 'c': + err = cn_printf(cn, "%lu", + rlimit(RLIMIT_CORE)); + break; + default: + break; + } + ++pat_ptr; + } + + if (err) + return err; + } + + /* Backward compatibility with core_uses_pid: + * + * If core_pattern does not include a %p (as is the default) + * and core_uses_pid is set, then .%pid will be appended to + * the filename. Do not do this for piped commands. */ + if (!ispipe && !pid_in_pattern && core_uses_pid) { + err = cn_printf(cn, ".%d", task_tgid_vnr(current)); + if (err) + return err; + } +out: + return ispipe; +} + +static int zap_process(struct task_struct *start, int exit_code) +{ + struct task_struct *t; + int nr = 0; + + start->signal->flags = SIGNAL_GROUP_EXIT; + start->signal->group_exit_code = exit_code; + start->signal->group_stop_count = 0; + + t = start; + do { + task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); + if (t != current && t->mm) { + sigaddset(&t->pending.signal, SIGKILL); + signal_wake_up(t, 1); + nr++; + } + } while_each_thread(start, t); + + return nr; +} + +static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, + struct core_state *core_state, int exit_code) +{ + struct task_struct *g, *p; + unsigned long flags; + int nr = -EAGAIN; + + spin_lock_irq(&tsk->sighand->siglock); + if (!signal_group_exit(tsk->signal)) { + mm->core_state = core_state; + nr = zap_process(tsk, exit_code); + } + spin_unlock_irq(&tsk->sighand->siglock); + if (unlikely(nr < 0)) + return nr; + + if (atomic_read(&mm->mm_users) == nr + 1) + goto done; + /* + * We should find and kill all tasks which use this mm, and we should + * count them correctly into ->nr_threads. We don't take tasklist + * lock, but this is safe wrt: + * + * fork: + * None of sub-threads can fork after zap_process(leader). All + * processes which were created before this point should be + * visible to zap_threads() because copy_process() adds the new + * process to the tail of init_task.tasks list, and lock/unlock + * of ->siglock provides a memory barrier. + * + * do_exit: + * The caller holds mm->mmap_sem. This means that the task which + * uses this mm can't pass exit_mm(), so it can't exit or clear + * its ->mm. + * + * de_thread: + * It does list_replace_rcu(&leader->tasks, ¤t->tasks), + * we must see either old or new leader, this does not matter. + * However, it can change p->sighand, so lock_task_sighand(p) + * must be used. Since p->mm != NULL and we hold ->mmap_sem + * it can't fail. + * + * Note also that "g" can be the old leader with ->mm == NULL + * and already unhashed and thus removed from ->thread_group. + * This is OK, __unhash_process()->list_del_rcu() does not + * clear the ->next pointer, we will find the new leader via + * next_thread(). + */ + rcu_read_lock(); + for_each_process(g) { + if (g == tsk->group_leader) + continue; + if (g->flags & PF_KTHREAD) + continue; + p = g; + do { + if (p->mm) { + if (unlikely(p->mm == mm)) { + lock_task_sighand(p, &flags); + nr += zap_process(p, exit_code); + unlock_task_sighand(p, &flags); + } + break; + } + } while_each_thread(g, p); + } + rcu_read_unlock(); +done: + atomic_set(&core_state->nr_threads, nr); + return nr; +} + +static int coredump_wait(int exit_code, struct core_state *core_state) +{ + struct task_struct *tsk = current; + struct mm_struct *mm = tsk->mm; + int core_waiters = -EBUSY; + + init_completion(&core_state->startup); + core_state->dumper.task = tsk; + core_state->dumper.next = NULL; + + down_write(&mm->mmap_sem); + if (!mm->core_state) + core_waiters = zap_threads(tsk, mm, core_state, exit_code); + up_write(&mm->mmap_sem); + + if (core_waiters > 0) { + struct core_thread *ptr; + + wait_for_completion(&core_state->startup); + /* + * Wait for all the threads to become inactive, so that + * all the thread context (extended register state, like + * fpu etc) gets copied to the memory. + */ + ptr = core_state->dumper.next; + while (ptr != NULL) { + wait_task_inactive(ptr->task, 0); + ptr = ptr->next; + } + } + + return core_waiters; +} + +static void coredump_finish(struct mm_struct *mm) +{ + struct core_thread *curr, *next; + struct task_struct *task; + + next = mm->core_state->dumper.next; + while ((curr = next) != NULL) { + next = curr->next; + task = curr->task; + /* + * see exit_mm(), curr->task must not see + * ->task == NULL before we read ->next. + */ + smp_mb(); + curr->task = NULL; + wake_up_process(task); + } + + mm->core_state = NULL; +} + +static void wait_for_dump_helpers(struct file *file) +{ + struct pipe_inode_info *pipe; + + pipe = file->f_path.dentry->d_inode->i_pipe; + + pipe_lock(pipe); + pipe->readers++; + pipe->writers--; + + while ((pipe->readers > 1) && (!signal_pending(current))) { + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + pipe_wait(pipe); + } + + pipe->readers--; + pipe->writers++; + pipe_unlock(pipe); + +} + +/* + * umh_pipe_setup + * helper function to customize the process used + * to collect the core in userspace. Specifically + * it sets up a pipe and installs it as fd 0 (stdin) + * for the process. Returns 0 on success, or + * PTR_ERR on failure. + * Note that it also sets the core limit to 1. This + * is a special value that we use to trap recursive + * core dumps + */ +static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) +{ + struct file *files[2]; + struct coredump_params *cp = (struct coredump_params *)info->data; + int err = create_pipe_files(files, 0); + if (err) + return err; + + cp->file = files[1]; + + replace_fd(0, files[0], 0); + /* and disallow core files too */ + current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1}; + + return 0; +} + +void do_coredump(siginfo_t *siginfo, struct pt_regs *regs) +{ + struct core_state core_state; + struct core_name cn; + struct mm_struct *mm = current->mm; + struct linux_binfmt * binfmt; + const struct cred *old_cred; + struct cred *cred; + int retval = 0; + int flag = 0; + int ispipe; + struct files_struct *displaced; + bool need_nonrelative = false; + static atomic_t core_dump_count = ATOMIC_INIT(0); + struct coredump_params cprm = { + .siginfo = siginfo, + .regs = regs, + .limit = rlimit(RLIMIT_CORE), + /* + * We must use the same mm->flags while dumping core to avoid + * inconsistency of bit flags, since this flag is not protected + * by any locks. + */ + .mm_flags = mm->flags, + }; + + audit_core_dumps(siginfo->si_signo); + + binfmt = mm->binfmt; + if (!binfmt || !binfmt->core_dump) + goto fail; + if (!__get_dumpable(cprm.mm_flags)) + goto fail; + + cred = prepare_creds(); + if (!cred) + goto fail; + /* + * We cannot trust fsuid as being the "true" uid of the process + * nor do we know its entire history. We only know it was tainted + * so we dump it as root in mode 2, and only into a controlled + * environment (pipe handler or fully qualified path). + */ + if (__get_dumpable(cprm.mm_flags) == SUID_DUMPABLE_SAFE) { + /* Setuid core dump mode */ + flag = O_EXCL; /* Stop rewrite attacks */ + cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */ + need_nonrelative = true; + } + + retval = coredump_wait(siginfo->si_signo, &core_state); + if (retval < 0) + goto fail_creds; + + old_cred = override_creds(cred); + + /* + * Clear any false indication of pending signals that might + * be seen by the filesystem code called to write the core file. + */ + clear_thread_flag(TIF_SIGPENDING); + + ispipe = format_corename(&cn, &cprm); + + if (ispipe) { + int dump_count; + char **helper_argv; + + if (ispipe < 0) { + printk(KERN_WARNING "format_corename failed\n"); + printk(KERN_WARNING "Aborting core\n"); + goto fail_corename; + } + + if (cprm.limit == 1) { + /* See umh_pipe_setup() which sets RLIMIT_CORE = 1. + * + * Normally core limits are irrelevant to pipes, since + * we're not writing to the file system, but we use + * cprm.limit of 1 here as a speacial value, this is a + * consistent way to catch recursive crashes. + * We can still crash if the core_pattern binary sets + * RLIM_CORE = !1, but it runs as root, and can do + * lots of stupid things. + * + * Note that we use task_tgid_vnr here to grab the pid + * of the process group leader. That way we get the + * right pid if a thread in a multi-threaded + * core_pattern process dies. + */ + printk(KERN_WARNING + "Process %d(%s) has RLIMIT_CORE set to 1\n", + task_tgid_vnr(current), current->comm); + printk(KERN_WARNING "Aborting core\n"); + goto fail_unlock; + } + cprm.limit = RLIM_INFINITY; + + dump_count = atomic_inc_return(&core_dump_count); + if (core_pipe_limit && (core_pipe_limit < dump_count)) { + printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n", + task_tgid_vnr(current), current->comm); + printk(KERN_WARNING "Skipping core dump\n"); + goto fail_dropcount; + } + + helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL); + if (!helper_argv) { + printk(KERN_WARNING "%s failed to allocate memory\n", + __func__); + goto fail_dropcount; + } + + retval = call_usermodehelper_fns(helper_argv[0], helper_argv, + NULL, UMH_WAIT_EXEC, umh_pipe_setup, + NULL, &cprm); + argv_free(helper_argv); + if (retval) { + printk(KERN_INFO "Core dump to %s pipe failed\n", + cn.corename); + goto close_fail; + } + } else { + struct inode *inode; + + if (cprm.limit < binfmt->min_coredump) + goto fail_unlock; + + if (need_nonrelative && cn.corename[0] != '/') { + printk(KERN_WARNING "Pid %d(%s) can only dump core "\ + "to fully qualified path!\n", + task_tgid_vnr(current), current->comm); + printk(KERN_WARNING "Skipping core dump\n"); + goto fail_unlock; + } + + cprm.file = filp_open(cn.corename, + O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, + 0600); + if (IS_ERR(cprm.file)) + goto fail_unlock; + + inode = cprm.file->f_path.dentry->d_inode; + if (inode->i_nlink > 1) + goto close_fail; + if (d_unhashed(cprm.file->f_path.dentry)) + goto close_fail; + /* + * AK: actually i see no reason to not allow this for named + * pipes etc, but keep the previous behaviour for now. + */ + if (!S_ISREG(inode->i_mode)) + goto close_fail; + /* + * Dont allow local users get cute and trick others to coredump + * into their pre-created files. + */ + if (!uid_eq(inode->i_uid, current_fsuid())) + goto close_fail; + if (!cprm.file->f_op || !cprm.file->f_op->write) + goto close_fail; + if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file)) + goto close_fail; + } + + /* get us an unshared descriptor table; almost always a no-op */ + retval = unshare_files(&displaced); + if (retval) + goto close_fail; + if (displaced) + put_files_struct(displaced); + retval = binfmt->core_dump(&cprm); + if (retval) + current->signal->group_exit_code |= 0x80; + + if (ispipe && core_pipe_limit) + wait_for_dump_helpers(cprm.file); +close_fail: + if (cprm.file) + filp_close(cprm.file, NULL); +fail_dropcount: + if (ispipe) + atomic_dec(&core_dump_count); +fail_unlock: + kfree(cn.corename); +fail_corename: + coredump_finish(mm); + revert_creds(old_cred); +fail_creds: + put_cred(cred); +fail: + return; +} + +/* + * Core dumping helper functions. These are the only things you should + * do on a core-file: use only these functions to write out all the + * necessary info. + */ +int dump_write(struct file *file, const void *addr, int nr) +{ + return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr; +} +EXPORT_SYMBOL(dump_write); + +int dump_seek(struct file *file, loff_t off) +{ + int ret = 1; + + if (file->f_op->llseek && file->f_op->llseek != no_llseek) { + if (file->f_op->llseek(file, off, SEEK_CUR) < 0) + return 0; + } else { + char *buf = (char *)get_zeroed_page(GFP_KERNEL); + + if (!buf) + return 0; + while (off > 0) { + unsigned long n = off; + + if (n > PAGE_SIZE) + n = PAGE_SIZE; + if (!dump_write(file, buf, n)) { + ret = 0; + break; + } + off -= n; + } + free_page((unsigned long)buf); + } + return ret; +} +EXPORT_SYMBOL(dump_seek); diff --git a/fs/coredump.h b/fs/coredump.h new file mode 100644 index 0000000..e39ff07 --- /dev/null +++ b/fs/coredump.h @@ -0,0 +1,6 @@ +#ifndef _FS_COREDUMP_H +#define _FS_COREDUMP_H + +extern int __get_dumpable(unsigned long mm_flags); + +#endif diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 28cca01..c6c3f91 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -90,8 +90,8 @@ static struct inode *get_cramfs_inode(struct super_block *sb, } inode->i_mode = cramfs_inode->mode; - inode->i_uid = cramfs_inode->uid; - inode->i_gid = cramfs_inode->gid; + i_uid_write(inode, cramfs_inode->uid); + i_gid_write(inode, cramfs_inode->gid); /* if the lower 2 bits are zero, the inode contains data */ if (!(inode->i_ino & 3)) { diff --git a/fs/dcache.c b/fs/dcache.c index 8086636..3a463d0 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -389,7 +389,7 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent) * Inform try_to_ascend() that we are no longer attached to the * dentry tree */ - dentry->d_flags |= DCACHE_DISCONNECTED; + dentry->d_flags |= DCACHE_DENTRY_KILLED; if (parent) spin_unlock(&parent->d_lock); dentry_iput(dentry); @@ -1048,7 +1048,7 @@ static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq * or deletion */ if (new != old->d_parent || - (old->d_flags & DCACHE_DISCONNECTED) || + (old->d_flags & DCACHE_DENTRY_KILLED) || (!locked && read_seqretry(&rename_lock, seq))) { spin_unlock(&new->d_lock); new = NULL; @@ -1134,6 +1134,8 @@ positive: return 1; rename_retry: + if (locked) + goto again; locked = 1; write_seqlock(&rename_lock); goto again; @@ -1141,7 +1143,7 @@ rename_retry: EXPORT_SYMBOL(have_submounts); /* - * Search the dentry child list for the specified parent, + * Search the dentry child list of the specified parent, * and move any unused dentries to the end of the unused * list for prune_dcache(). We descend to the next level * whenever the d_subdirs list is non-empty and continue @@ -1236,6 +1238,8 @@ out: rename_retry: if (found) return found; + if (locked) + goto again; locked = 1; write_seqlock(&rename_lock); goto again; @@ -2109,7 +2113,7 @@ again: inode = dentry->d_inode; isdir = S_ISDIR(inode->i_mode); if (dentry->d_count == 1) { - if (inode && !spin_trylock(&inode->i_lock)) { + if (!spin_trylock(&inode->i_lock)) { spin_unlock(&dentry->d_lock); cpu_relax(); goto again; @@ -3035,6 +3039,8 @@ resume: return; rename_retry: + if (locked) + goto again; locked = 1; write_seqlock(&rename_lock); goto again; diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 2340f69..c5ca6ae 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -526,73 +526,51 @@ struct array_data { u32 elements; }; -static int u32_array_open(struct inode *inode, struct file *file) -{ - file->private_data = NULL; - return nonseekable_open(inode, file); -} - -static size_t format_array(char *buf, size_t bufsize, const char *fmt, - u32 *array, u32 array_size) +static size_t u32_format_array(char *buf, size_t bufsize, + u32 *array, int array_size) { size_t ret = 0; - u32 i; - for (i = 0; i < array_size; i++) { + while (--array_size >= 0) { size_t len; + char term = array_size ? ' ' : '\n'; - len = snprintf(buf, bufsize, fmt, array[i]); - len++; /* ' ' or '\n' */ + len = snprintf(buf, bufsize, "%u%c", *array++, term); ret += len; - if (buf) { - buf += len; - bufsize -= len; - buf[-1] = (i == array_size-1) ? '\n' : ' '; - } + buf += len; + bufsize -= len; } - - ret++; /* \0 */ - if (buf) - *buf = '\0'; - return ret; } -static char *format_array_alloc(const char *fmt, u32 *array, - u32 array_size) +static int u32_array_open(struct inode *inode, struct file *file) { - size_t len = format_array(NULL, 0, fmt, array, array_size); - char *ret; - - ret = kmalloc(len, GFP_KERNEL); - if (ret == NULL) - return NULL; + struct array_data *data = inode->i_private; + int size, elements = data->elements; + char *buf; + + /* + * Max size: + * - 10 digits + ' '/'\n' = 11 bytes per number + * - terminating NUL character + */ + size = elements*11; + buf = kmalloc(size+1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + buf[size] = 0; + + file->private_data = buf; + u32_format_array(buf, size, data->array, data->elements); - format_array(ret, len, fmt, array, array_size); - return ret; + return nonseekable_open(inode, file); } static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { - struct inode *inode = file->f_path.dentry->d_inode; - struct array_data *data = inode->i_private; - size_t size; - - if (*ppos == 0) { - if (file->private_data) { - kfree(file->private_data); - file->private_data = NULL; - } - - file->private_data = format_array_alloc("%u", data->array, - data->elements); - } - - size = 0; - if (file->private_data) - size = strlen(file->private_data); + size_t size = strlen(file->private_data); return simple_read_from_buffer(buf, len, ppos, file->private_data, size); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 4733eab..b607d92 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -28,7 +28,7 @@ #include <linux/magic.h> #include <linux/slab.h> -#define DEBUGFS_DEFAULT_MODE 0755 +#define DEBUGFS_DEFAULT_MODE 0700 static struct vfsmount *debugfs_mount; static int debugfs_mount_count; @@ -128,8 +128,8 @@ static inline int debugfs_positive(struct dentry *dentry) } struct debugfs_mount_opts { - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; umode_t mode; }; @@ -156,6 +156,8 @@ static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts) substring_t args[MAX_OPT_ARGS]; int option; int token; + kuid_t uid; + kgid_t gid; char *p; opts->mode = DEBUGFS_DEFAULT_MODE; @@ -169,12 +171,18 @@ static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts) case Opt_uid: if (match_int(&args[0], &option)) return -EINVAL; - opts->uid = option; + uid = make_kuid(current_user_ns(), option); + if (!uid_valid(uid)) + return -EINVAL; + opts->uid = uid; break; case Opt_gid: if (match_octal(&args[0], &option)) return -EINVAL; - opts->gid = option; + gid = make_kgid(current_user_ns(), option); + if (!gid_valid(gid)) + return -EINVAL; + opts->gid = gid; break; case Opt_mode: if (match_octal(&args[0], &option)) @@ -226,10 +234,12 @@ static int debugfs_show_options(struct seq_file *m, struct dentry *root) struct debugfs_fs_info *fsi = root->d_sb->s_fs_info; struct debugfs_mount_opts *opts = &fsi->mount_opts; - if (opts->uid != 0) - seq_printf(m, ",uid=%u", opts->uid); - if (opts->gid != 0) - seq_printf(m, ",gid=%u", opts->gid); + if (!uid_eq(opts->uid, GLOBAL_ROOT_UID)) + seq_printf(m, ",uid=%u", + from_kuid_munged(&init_user_ns, opts->uid)); + if (!gid_eq(opts->gid, GLOBAL_ROOT_GID)) + seq_printf(m, ",gid=%u", + from_kgid_munged(&init_user_ns, opts->gid)); if (opts->mode != DEBUGFS_DEFAULT_MODE) seq_printf(m, ",mode=%o", opts->mode); @@ -291,9 +301,9 @@ static struct file_system_type debug_fs_type = { .kill_sb = kill_litter_super, }; -struct dentry *__create_file(const char *name, umode_t mode, - struct dentry *parent, void *data, - const struct file_operations *fops) +static struct dentry *__create_file(const char *name, umode_t mode, + struct dentry *parent, void *data, + const struct file_operations *fops) { struct dentry *dentry = NULL; int error; diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 63dc19c..27a6ba9 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -15,8 +15,8 @@ #include "lock.h" #include "user.h" -static uint64_t dlm_cb_seq; -static spinlock_t dlm_cb_seq_spin; +static uint64_t dlm_cb_seq; +static DEFINE_SPINLOCK(dlm_cb_seq_spin); static void dlm_dump_lkb_callbacks(struct dlm_lkb *lkb) { diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 9ccf734..a0387dd 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -750,6 +750,7 @@ static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf, static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len) { struct sockaddr_storage *addr; + int rv; if (len != sizeof(struct sockaddr_storage)) return -EINVAL; @@ -762,6 +763,13 @@ static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len) return -ENOMEM; memcpy(addr, buf, len); + + rv = dlm_lowcomms_addr(cm->nodeid, addr, len); + if (rv) { + kfree(addr); + return rv; + } + cm->addr[cm->addr_count++] = addr; return len; } @@ -878,34 +886,7 @@ static void put_space(struct dlm_space *sp) config_item_put(&sp->group.cg_item); } -static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y) -{ - switch (x->ss_family) { - case AF_INET: { - struct sockaddr_in *sinx = (struct sockaddr_in *)x; - struct sockaddr_in *siny = (struct sockaddr_in *)y; - if (sinx->sin_addr.s_addr != siny->sin_addr.s_addr) - return 0; - if (sinx->sin_port != siny->sin_port) - return 0; - break; - } - case AF_INET6: { - struct sockaddr_in6 *sinx = (struct sockaddr_in6 *)x; - struct sockaddr_in6 *siny = (struct sockaddr_in6 *)y; - if (!ipv6_addr_equal(&sinx->sin6_addr, &siny->sin6_addr)) - return 0; - if (sinx->sin6_port != siny->sin6_port) - return 0; - break; - } - default: - return 0; - } - return 1; -} - -static struct dlm_comm *get_comm(int nodeid, struct sockaddr_storage *addr) +static struct dlm_comm *get_comm(int nodeid) { struct config_item *i; struct dlm_comm *cm = NULL; @@ -919,19 +900,11 @@ static struct dlm_comm *get_comm(int nodeid, struct sockaddr_storage *addr) list_for_each_entry(i, &comm_list->cg_children, ci_entry) { cm = config_item_to_comm(i); - if (nodeid) { - if (cm->nodeid != nodeid) - continue; - found = 1; - config_item_get(i); - break; - } else { - if (!cm->addr_count || !addr_compare(cm->addr[0], addr)) - continue; - found = 1; - config_item_get(i); - break; - } + if (cm->nodeid != nodeid) + continue; + found = 1; + config_item_get(i); + break; } mutex_unlock(&clusters_root.subsys.su_mutex); @@ -995,7 +968,7 @@ int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, int dlm_comm_seq(int nodeid, uint32_t *seq) { - struct dlm_comm *cm = get_comm(nodeid, NULL); + struct dlm_comm *cm = get_comm(nodeid); if (!cm) return -EEXIST; *seq = cm->seq; @@ -1003,28 +976,6 @@ int dlm_comm_seq(int nodeid, uint32_t *seq) return 0; } -int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr) -{ - struct dlm_comm *cm = get_comm(nodeid, NULL); - if (!cm) - return -EEXIST; - if (!cm->addr_count) - return -ENOENT; - memcpy(addr, cm->addr[0], sizeof(*addr)); - put_comm(cm); - return 0; -} - -int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid) -{ - struct dlm_comm *cm = get_comm(0, addr); - if (!cm) - return -EEXIST; - *nodeid = cm->nodeid; - put_comm(cm); - return 0; -} - int dlm_our_nodeid(void) { return local_comm ? local_comm->nodeid : 0; diff --git a/fs/dlm/config.h b/fs/dlm/config.h index dbd35a0..f30697b 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -46,8 +46,6 @@ void dlm_config_exit(void); int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, int *count_out); int dlm_comm_seq(int nodeid, uint32_t *seq); -int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr); -int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid); int dlm_our_nodeid(void); int dlm_our_addr(struct sockaddr_storage *addr, int num); diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 9d3e485..871c1ab 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -604,6 +604,7 @@ struct dlm_ls { struct idr ls_recover_idr; spinlock_t ls_recover_idr_lock; wait_queue_head_t ls_wait_general; + wait_queue_head_t ls_recover_lock_wait; struct mutex ls_clear_proc_locks; struct list_head ls_root_list; /* root resources */ @@ -616,15 +617,40 @@ struct dlm_ls { char ls_name[1]; }; -#define LSFL_WORK 0 -#define LSFL_RUNNING 1 -#define LSFL_RECOVERY_STOP 2 -#define LSFL_RCOM_READY 3 -#define LSFL_RCOM_WAIT 4 -#define LSFL_UEVENT_WAIT 5 -#define LSFL_TIMEWARN 6 -#define LSFL_CB_DELAY 7 -#define LSFL_NODIR 8 +/* + * LSFL_RECOVER_STOP - dlm_ls_stop() sets this to tell dlm recovery routines + * that they should abort what they're doing so new recovery can be started. + * + * LSFL_RECOVER_DOWN - dlm_ls_stop() sets this to tell dlm_recoverd that it + * should do down_write() on the in_recovery rw_semaphore. (doing down_write + * within dlm_ls_stop causes complaints about the lock acquired/released + * in different contexts.) + * + * LSFL_RECOVER_LOCK - dlm_recoverd holds the in_recovery rw_semaphore. + * It sets this after it is done with down_write() on the in_recovery + * rw_semaphore and clears it after it has released the rw_semaphore. + * + * LSFL_RECOVER_WORK - dlm_ls_start() sets this to tell dlm_recoverd that it + * should begin recovery of the lockspace. + * + * LSFL_RUNNING - set when normal locking activity is enabled. + * dlm_ls_stop() clears this to tell dlm locking routines that they should + * quit what they are doing so recovery can run. dlm_recoverd sets + * this after recovery is finished. + */ + +#define LSFL_RECOVER_STOP 0 +#define LSFL_RECOVER_DOWN 1 +#define LSFL_RECOVER_LOCK 2 +#define LSFL_RECOVER_WORK 3 +#define LSFL_RUNNING 4 + +#define LSFL_RCOM_READY 5 +#define LSFL_RCOM_WAIT 6 +#define LSFL_UEVENT_WAIT 7 +#define LSFL_TIMEWARN 8 +#define LSFL_CB_DELAY 9 +#define LSFL_NODIR 10 /* much of this is just saving user space pointers associated with the lock that we pass back to the user lib with an ast */ @@ -667,7 +693,7 @@ static inline int dlm_locking_stopped(struct dlm_ls *ls) static inline int dlm_recovery_stopped(struct dlm_ls *ls) { - return test_bit(LSFL_RECOVERY_STOP, &ls->ls_flags); + return test_bit(LSFL_RECOVER_STOP, &ls->ls_flags); } static inline int dlm_no_directory(struct dlm_ls *ls) diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 952557d..2e99fb0 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -582,8 +582,6 @@ static int new_lockspace(const char *name, const char *cluster, INIT_LIST_HEAD(&ls->ls_root_list); init_rwsem(&ls->ls_root_sem); - down_write(&ls->ls_in_recovery); - spin_lock(&lslist_lock); ls->ls_create_count = 1; list_add(&ls->ls_list, &lslist); @@ -597,13 +595,24 @@ static int new_lockspace(const char *name, const char *cluster, } } - /* needs to find ls in lslist */ + init_waitqueue_head(&ls->ls_recover_lock_wait); + + /* + * Once started, dlm_recoverd first looks for ls in lslist, then + * initializes ls_in_recovery as locked in "down" mode. We need + * to wait for the wakeup from dlm_recoverd because in_recovery + * has to start out in down mode. + */ + error = dlm_recoverd_start(ls); if (error) { log_error(ls, "can't start dlm_recoverd %d", error); goto out_callback; } + wait_event(ls->ls_recover_lock_wait, + test_bit(LSFL_RECOVER_LOCK, &ls->ls_flags)); + ls->ls_kobj.kset = dlm_kset; error = kobject_init_and_add(&ls->ls_kobj, &dlm_ktype, NULL, "%s", ls->ls_name); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 5c1b0e3..331ea4f 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -140,6 +140,16 @@ struct writequeue_entry { struct connection *con; }; +struct dlm_node_addr { + struct list_head list; + int nodeid; + int addr_count; + struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT]; +}; + +static LIST_HEAD(dlm_node_addrs); +static DEFINE_SPINLOCK(dlm_node_addrs_spin); + static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT]; static int dlm_local_count; static int dlm_allow_conn; @@ -264,31 +274,146 @@ static struct connection *assoc2con(int assoc_id) return NULL; } -static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr) +static struct dlm_node_addr *find_node_addr(int nodeid) +{ + struct dlm_node_addr *na; + + list_for_each_entry(na, &dlm_node_addrs, list) { + if (na->nodeid == nodeid) + return na; + } + return NULL; +} + +static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y) +{ + switch (x->ss_family) { + case AF_INET: { + struct sockaddr_in *sinx = (struct sockaddr_in *)x; + struct sockaddr_in *siny = (struct sockaddr_in *)y; + if (sinx->sin_addr.s_addr != siny->sin_addr.s_addr) + return 0; + if (sinx->sin_port != siny->sin_port) + return 0; + break; + } + case AF_INET6: { + struct sockaddr_in6 *sinx = (struct sockaddr_in6 *)x; + struct sockaddr_in6 *siny = (struct sockaddr_in6 *)y; + if (!ipv6_addr_equal(&sinx->sin6_addr, &siny->sin6_addr)) + return 0; + if (sinx->sin6_port != siny->sin6_port) + return 0; + break; + } + default: + return 0; + } + return 1; +} + +static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, + struct sockaddr *sa_out) { - struct sockaddr_storage addr; - int error; + struct sockaddr_storage sas; + struct dlm_node_addr *na; if (!dlm_local_count) return -1; - error = dlm_nodeid_to_addr(nodeid, &addr); - if (error) - return error; + spin_lock(&dlm_node_addrs_spin); + na = find_node_addr(nodeid); + if (na && na->addr_count) + memcpy(&sas, na->addr[0], sizeof(struct sockaddr_storage)); + spin_unlock(&dlm_node_addrs_spin); + + if (!na) + return -EEXIST; + + if (!na->addr_count) + return -ENOENT; + + if (sas_out) + memcpy(sas_out, &sas, sizeof(struct sockaddr_storage)); + + if (!sa_out) + return 0; if (dlm_local_addr[0]->ss_family == AF_INET) { - struct sockaddr_in *in4 = (struct sockaddr_in *) &addr; - struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr; + struct sockaddr_in *in4 = (struct sockaddr_in *) &sas; + struct sockaddr_in *ret4 = (struct sockaddr_in *) sa_out; ret4->sin_addr.s_addr = in4->sin_addr.s_addr; } else { - struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr; - struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr; + struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &sas; + struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) sa_out; ret6->sin6_addr = in6->sin6_addr; } return 0; } +static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid) +{ + struct dlm_node_addr *na; + int rv = -EEXIST; + + spin_lock(&dlm_node_addrs_spin); + list_for_each_entry(na, &dlm_node_addrs, list) { + if (!na->addr_count) + continue; + + if (!addr_compare(na->addr[0], addr)) + continue; + + *nodeid = na->nodeid; + rv = 0; + break; + } + spin_unlock(&dlm_node_addrs_spin); + return rv; +} + +int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len) +{ + struct sockaddr_storage *new_addr; + struct dlm_node_addr *new_node, *na; + + new_node = kzalloc(sizeof(struct dlm_node_addr), GFP_NOFS); + if (!new_node) + return -ENOMEM; + + new_addr = kzalloc(sizeof(struct sockaddr_storage), GFP_NOFS); + if (!new_addr) { + kfree(new_node); + return -ENOMEM; + } + + memcpy(new_addr, addr, len); + + spin_lock(&dlm_node_addrs_spin); + na = find_node_addr(nodeid); + if (!na) { + new_node->nodeid = nodeid; + new_node->addr[0] = new_addr; + new_node->addr_count = 1; + list_add(&new_node->list, &dlm_node_addrs); + spin_unlock(&dlm_node_addrs_spin); + return 0; + } + + if (na->addr_count >= DLM_MAX_ADDR_COUNT) { + spin_unlock(&dlm_node_addrs_spin); + kfree(new_addr); + kfree(new_node); + return -ENOSPC; + } + + na->addr[na->addr_count++] = new_addr; + spin_unlock(&dlm_node_addrs_spin); + kfree(new_node); + return 0; +} + /* Data available on socket or listen socket received a connect */ static void lowcomms_data_ready(struct sock *sk, int count_unused) { @@ -348,7 +473,7 @@ int dlm_lowcomms_connect_node(int nodeid) } /* Make a socket active */ -static int add_sock(struct socket *sock, struct connection *con) +static void add_sock(struct socket *sock, struct connection *con) { con->sock = sock; @@ -358,7 +483,6 @@ static int add_sock(struct socket *sock, struct connection *con) con->sock->sk->sk_state_change = lowcomms_state_change; con->sock->sk->sk_user_data = con; con->sock->sk->sk_allocation = GFP_NOFS; - return 0; } /* Add the port number to an IPv6 or 4 sockaddr and return the address @@ -510,7 +634,7 @@ static void process_sctp_notification(struct connection *con, return; } make_sockaddr(&prim.ssp_addr, 0, &addr_len); - if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) { + if (addr_to_nodeid(&prim.ssp_addr, &nodeid)) { unsigned char *b=(unsigned char *)&prim.ssp_addr; log_print("reject connect from unknown addr"); print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE, @@ -747,7 +871,7 @@ static int tcp_accept_from_sock(struct connection *con) /* Get the new node's NODEID */ make_sockaddr(&peeraddr, 0, &len); - if (dlm_addr_to_nodeid(&peeraddr, &nodeid)) { + if (addr_to_nodeid(&peeraddr, &nodeid)) { unsigned char *b=(unsigned char *)&peeraddr; log_print("connect from non cluster node"); print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE, @@ -862,7 +986,7 @@ static void sctp_init_assoc(struct connection *con) if (con->retries++ > MAX_CONNECT_RETRIES) return; - if (nodeid_to_addr(con->nodeid, (struct sockaddr *)&rem_addr)) { + if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr)) { log_print("no address for nodeid %d", con->nodeid); return; } @@ -928,11 +1052,11 @@ static void sctp_init_assoc(struct connection *con) /* Connect a new socket to its peer */ static void tcp_connect_to_sock(struct connection *con) { - int result = -EHOSTUNREACH; struct sockaddr_storage saddr, src_addr; int addr_len; struct socket *sock = NULL; int one = 1; + int result; if (con->nodeid == 0) { log_print("attempt to connect sock 0 foiled"); @@ -944,10 +1068,8 @@ static void tcp_connect_to_sock(struct connection *con) goto out; /* Some odd races can cause double-connects, ignore them */ - if (con->sock) { - result = 0; + if (con->sock) goto out; - } /* Create a socket to communicate with */ result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM, @@ -956,8 +1078,11 @@ static void tcp_connect_to_sock(struct connection *con) goto out_err; memset(&saddr, 0, sizeof(saddr)); - if (dlm_nodeid_to_addr(con->nodeid, &saddr)) + result = nodeid_to_addr(con->nodeid, &saddr, NULL); + if (result < 0) { + log_print("no address for nodeid %d", con->nodeid); goto out_err; + } sock->sk->sk_user_data = con; con->rx_action = receive_from_sock; @@ -983,8 +1108,7 @@ static void tcp_connect_to_sock(struct connection *con) kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one, sizeof(one)); - result = - sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len, + result = sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len, O_NONBLOCK); if (result == -EINPROGRESS) result = 0; @@ -1002,11 +1126,17 @@ out_err: * Some errors are fatal and this list might need adjusting. For other * errors we try again until the max number of retries is reached. */ - if (result != -EHOSTUNREACH && result != -ENETUNREACH && - result != -ENETDOWN && result != -EINVAL - && result != -EPROTONOSUPPORT) { + if (result != -EHOSTUNREACH && + result != -ENETUNREACH && + result != -ENETDOWN && + result != -EINVAL && + result != -EPROTONOSUPPORT) { + log_print("connect %d try %d error %d", con->nodeid, + con->retries, result); + mutex_unlock(&con->sock_mutex); + msleep(1000); lowcomms_connect_sock(con); - result = 0; + return; } out: mutex_unlock(&con->sock_mutex); @@ -1044,10 +1174,8 @@ static struct socket *tcp_create_listen_sock(struct connection *con, if (result < 0) { log_print("Failed to set SO_REUSEADDR on socket: %d", result); } - sock->sk->sk_user_data = con; con->rx_action = tcp_accept_from_sock; con->connect_action = tcp_connect_to_sock; - con->sock = sock; /* Bind to our port */ make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len); @@ -1358,8 +1486,7 @@ static void send_to_sock(struct connection *con) } cond_resched(); goto out; - } - if (ret <= 0) + } else if (ret < 0) goto send_error; } @@ -1376,7 +1503,6 @@ static void send_to_sock(struct connection *con) if (e->len == 0 && e->users == 0) { list_del(&e->list); free_entry(e); - continue; } } spin_unlock(&con->writequeue_lock); @@ -1394,7 +1520,6 @@ out_connect: mutex_unlock(&con->sock_mutex); if (!test_bit(CF_INIT_PENDING, &con->flags)) lowcomms_connect_sock(con); - return; } static void clean_one_writequeue(struct connection *con) @@ -1414,6 +1539,7 @@ static void clean_one_writequeue(struct connection *con) int dlm_lowcomms_close(int nodeid) { struct connection *con; + struct dlm_node_addr *na; log_print("closing connection to node %d", nodeid); con = nodeid2con(nodeid, 0); @@ -1428,6 +1554,17 @@ int dlm_lowcomms_close(int nodeid) clean_one_writequeue(con); close_connection(con, true); } + + spin_lock(&dlm_node_addrs_spin); + na = find_node_addr(nodeid); + if (na) { + list_del(&na->list); + while (na->addr_count--) + kfree(na->addr[na->addr_count]); + kfree(na); + } + spin_unlock(&dlm_node_addrs_spin); + return 0; } @@ -1577,3 +1714,17 @@ fail_destroy: fail: return error; } + +void dlm_lowcomms_exit(void) +{ + struct dlm_node_addr *na, *safe; + + spin_lock(&dlm_node_addrs_spin); + list_for_each_entry_safe(na, safe, &dlm_node_addrs, list) { + list_del(&na->list); + while (na->addr_count--) + kfree(na->addr[na->addr_count]); + kfree(na); + } + spin_unlock(&dlm_node_addrs_spin); +} diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h index 1311e64..67462e5 100644 --- a/fs/dlm/lowcomms.h +++ b/fs/dlm/lowcomms.h @@ -16,10 +16,12 @@ int dlm_lowcomms_start(void); void dlm_lowcomms_stop(void); +void dlm_lowcomms_exit(void); int dlm_lowcomms_close(int nodeid); void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc); void dlm_lowcomms_commit_buffer(void *mh); int dlm_lowcomms_connect_node(int nodeid); +int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len); #endif /* __LOWCOMMS_DOT_H__ */ diff --git a/fs/dlm/main.c b/fs/dlm/main.c index 5a59efa..079c0bd 100644 --- a/fs/dlm/main.c +++ b/fs/dlm/main.c @@ -17,6 +17,7 @@ #include "user.h" #include "memory.h" #include "config.h" +#include "lowcomms.h" static int __init init_dlm(void) { @@ -78,6 +79,7 @@ static void __exit exit_dlm(void) dlm_config_exit(); dlm_memory_exit(); dlm_lockspace_exit(); + dlm_lowcomms_exit(); dlm_unregister_debugfs(); } diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 862640a..476557b 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -616,13 +616,13 @@ int dlm_ls_stop(struct dlm_ls *ls) down_write(&ls->ls_recv_active); /* - * Abort any recovery that's in progress (see RECOVERY_STOP, + * Abort any recovery that's in progress (see RECOVER_STOP, * dlm_recovery_stopped()) and tell any other threads running in the * dlm to quit any processing (see RUNNING, dlm_locking_stopped()). */ spin_lock(&ls->ls_recover_lock); - set_bit(LSFL_RECOVERY_STOP, &ls->ls_flags); + set_bit(LSFL_RECOVER_STOP, &ls->ls_flags); new = test_and_clear_bit(LSFL_RUNNING, &ls->ls_flags); ls->ls_recover_seq++; spin_unlock(&ls->ls_recover_lock); @@ -642,12 +642,16 @@ int dlm_ls_stop(struct dlm_ls *ls) * when recovery is complete. */ - if (new) - down_write(&ls->ls_in_recovery); + if (new) { + set_bit(LSFL_RECOVER_DOWN, &ls->ls_flags); + wake_up_process(ls->ls_recoverd_task); + wait_event(ls->ls_recover_lock_wait, + test_bit(LSFL_RECOVER_LOCK, &ls->ls_flags)); + } /* * The recoverd suspend/resume makes sure that dlm_recoverd (if - * running) has noticed RECOVERY_STOP above and quit processing the + * running) has noticed RECOVER_STOP above and quit processing the * previous recovery. */ @@ -709,7 +713,8 @@ int dlm_ls_start(struct dlm_ls *ls) kfree(rv_old); } - dlm_recoverd_kick(ls); + set_bit(LSFL_RECOVER_WORK, &ls->ls_flags); + wake_up_process(ls->ls_recoverd_task); return 0; fail: diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index ef17e01..60a3278 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -14,7 +14,7 @@ #include "dlm_internal.h" static uint32_t dlm_nl_seqnum; -static uint32_t listener_nlpid; +static uint32_t listener_nlportid; static struct genl_family family = { .id = GENL_ID_GENERATE, @@ -64,13 +64,13 @@ static int send_data(struct sk_buff *skb) return rv; } - return genlmsg_unicast(&init_net, skb, listener_nlpid); + return genlmsg_unicast(&init_net, skb, listener_nlportid); } static int user_cmd(struct sk_buff *skb, struct genl_info *info) { - listener_nlpid = info->snd_pid; - printk("user_cmd nlpid %u\n", listener_nlpid); + listener_nlportid = info->snd_portid; + printk("user_cmd nlpid %u\n", listener_nlportid); return 0; } diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index 87f1a56..9d61947 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -581,7 +581,7 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) spin_lock(&ls->ls_recover_lock); status = ls->ls_recover_status; - stop = test_bit(LSFL_RECOVERY_STOP, &ls->ls_flags); + stop = test_bit(LSFL_RECOVER_STOP, &ls->ls_flags); seq = ls->ls_recover_seq; spin_unlock(&ls->ls_recover_lock); diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 88ce65f..32f9f89 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -41,6 +41,7 @@ static int enable_locking(struct dlm_ls *ls, uint64_t seq) set_bit(LSFL_RUNNING, &ls->ls_flags); /* unblocks processes waiting to enter the dlm */ up_write(&ls->ls_in_recovery); + clear_bit(LSFL_RECOVER_LOCK, &ls->ls_flags); error = 0; } spin_unlock(&ls->ls_recover_lock); @@ -262,7 +263,7 @@ static void do_ls_recovery(struct dlm_ls *ls) rv = ls->ls_recover_args; ls->ls_recover_args = NULL; if (rv && ls->ls_recover_seq == rv->seq) - clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags); + clear_bit(LSFL_RECOVER_STOP, &ls->ls_flags); spin_unlock(&ls->ls_recover_lock); if (rv) { @@ -282,26 +283,34 @@ static int dlm_recoverd(void *arg) return -1; } + down_write(&ls->ls_in_recovery); + set_bit(LSFL_RECOVER_LOCK, &ls->ls_flags); + wake_up(&ls->ls_recover_lock_wait); + while (!kthread_should_stop()) { set_current_state(TASK_INTERRUPTIBLE); - if (!test_bit(LSFL_WORK, &ls->ls_flags)) + if (!test_bit(LSFL_RECOVER_WORK, &ls->ls_flags) && + !test_bit(LSFL_RECOVER_DOWN, &ls->ls_flags)) schedule(); set_current_state(TASK_RUNNING); - if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags)) + if (test_and_clear_bit(LSFL_RECOVER_DOWN, &ls->ls_flags)) { + down_write(&ls->ls_in_recovery); + set_bit(LSFL_RECOVER_LOCK, &ls->ls_flags); + wake_up(&ls->ls_recover_lock_wait); + } + + if (test_and_clear_bit(LSFL_RECOVER_WORK, &ls->ls_flags)) do_ls_recovery(ls); } + if (test_bit(LSFL_RECOVER_LOCK, &ls->ls_flags)) + up_write(&ls->ls_in_recovery); + dlm_put_lockspace(ls); return 0; } -void dlm_recoverd_kick(struct dlm_ls *ls) -{ - set_bit(LSFL_WORK, &ls->ls_flags); - wake_up_process(ls->ls_recoverd_task); -} - int dlm_recoverd_start(struct dlm_ls *ls) { struct task_struct *p; diff --git a/fs/dlm/recoverd.h b/fs/dlm/recoverd.h index 866657c..8856079 100644 --- a/fs/dlm/recoverd.h +++ b/fs/dlm/recoverd.h @@ -14,7 +14,6 @@ #ifndef __RECOVERD_DOT_H__ #define __RECOVERD_DOT_H__ -void dlm_recoverd_kick(struct dlm_ls *ls); void dlm_recoverd_stop(struct dlm_ls *ls); int dlm_recoverd_start(struct dlm_ls *ls); void dlm_recoverd_suspend(struct dlm_ls *ls); diff --git a/fs/dlm/user.c b/fs/dlm/user.c index eb4ed9b..7ff4985 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -503,6 +503,13 @@ static ssize_t device_write(struct file *file, const char __user *buf, #endif return -EINVAL; +#ifdef CONFIG_COMPAT + if (count > sizeof(struct dlm_write_request32) + DLM_RESNAME_MAXLEN) +#else + if (count > sizeof(struct dlm_write_request) + DLM_RESNAME_MAXLEN) +#endif + return -EINVAL; + kbuf = kzalloc(count + 1, GFP_NOFS); if (!kbuf) return -ENOMEM; diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 44ce5c6..d45ba45 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -275,8 +275,14 @@ out: static int ecryptfs_flush(struct file *file, fl_owner_t td) { - return file->f_mode & FMODE_WRITE - ? filemap_write_and_wait(file->f_mapping) : 0; + struct file *lower_file = ecryptfs_file_to_lower(file); + + if (lower_file->f_op && lower_file->f_op->flush) { + filemap_write_and_wait(file->f_mapping); + return lower_file->f_op->flush(lower_file, td); + } + + return 0; } static int ecryptfs_release(struct inode *inode, struct file *file) diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 534b129..cc7709e7 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -619,6 +619,7 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct dentry *lower_old_dir_dentry; struct dentry *lower_new_dir_dentry; struct dentry *trap = NULL; + struct inode *target_inode; lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry); @@ -626,6 +627,7 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, dget(lower_new_dentry); lower_old_dir_dentry = dget_parent(lower_old_dentry); lower_new_dir_dentry = dget_parent(lower_new_dentry); + target_inode = new_dentry->d_inode; trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry); /* source should not be ancestor of target */ if (trap == lower_old_dentry) { @@ -641,6 +643,9 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, lower_new_dir_dentry->d_inode, lower_new_dentry); if (rc) goto out_lock; + if (target_inode) + fsstack_copy_attr_all(target_inode, + ecryptfs_inode_to_lower(target_inode)); fsstack_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode); if (new_dir != old_dir) fsstack_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode); diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 2768138..4e0886c 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -162,6 +162,7 @@ void ecryptfs_put_lower_file(struct inode *inode) inode_info = ecryptfs_inode_to_private(inode); if (atomic_dec_and_mutex_lock(&inode_info->lower_file_count, &inode_info->lower_file_mutex)) { + filemap_write_and_wait(inode->i_mapping); fput(inode_info->lower_file); inode_info->lower_file = NULL; mutex_unlock(&inode_info->lower_file_mutex); @@ -544,11 +545,12 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags goto out_free; } - if (check_ruid && path.dentry->d_inode->i_uid != current_uid()) { + if (check_ruid && !uid_eq(path.dentry->d_inode->i_uid, current_uid())) { rc = -EPERM; printk(KERN_ERR "Mount of device (uid: %d) not owned by " "requested user (uid: %d)\n", - path.dentry->d_inode->i_uid, current_uid()); + i_uid_read(path.dentry->d_inode), + from_kuid(&init_user_ns, current_uid())); goto out_free; } @@ -709,6 +711,12 @@ static void ecryptfs_free_kmem_caches(void) { int i; + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + for (i = 0; i < ARRAY_SIZE(ecryptfs_cache_infos); i++) { struct ecryptfs_cache_info *info; diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index b29bb8b..5fa2471 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -33,7 +33,7 @@ static struct hlist_head *ecryptfs_daemon_hash; struct mutex ecryptfs_daemon_hash_mux; static int ecryptfs_hash_bits; #define ecryptfs_current_euid_hash(uid) \ - hash_long((unsigned long)current_euid(), ecryptfs_hash_bits) + hash_long((unsigned long)from_kuid(&init_user_ns, current_euid()), ecryptfs_hash_bits) static u32 ecryptfs_msg_counter; static struct ecryptfs_msg_ctx *ecryptfs_msg_ctx_arr; @@ -121,8 +121,7 @@ int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon) hlist_for_each_entry(*daemon, elem, &ecryptfs_daemon_hash[ecryptfs_current_euid_hash()], euid_chain) { - if ((*daemon)->file->f_cred->euid == current_euid() && - (*daemon)->file->f_cred->user_ns == current_user_ns()) { + if (uid_eq((*daemon)->file->f_cred->euid, current_euid())) { rc = 0; goto out; } diff --git a/fs/efs/inode.c b/fs/efs/inode.c index bc84f36..f3913eb 100644 --- a/fs/efs/inode.c +++ b/fs/efs/inode.c @@ -97,8 +97,8 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) inode->i_mode = be16_to_cpu(efs_inode->di_mode); set_nlink(inode, be16_to_cpu(efs_inode->di_nlink)); - inode->i_uid = (uid_t)be16_to_cpu(efs_inode->di_uid); - inode->i_gid = (gid_t)be16_to_cpu(efs_inode->di_gid); + i_uid_write(inode, (uid_t)be16_to_cpu(efs_inode->di_uid)); + i_gid_write(inode, (gid_t)be16_to_cpu(efs_inode->di_gid)); inode->i_size = be32_to_cpu(efs_inode->di_size); inode->i_atime.tv_sec = be32_to_cpu(efs_inode->di_atime); inode->i_mtime.tv_sec = be32_to_cpu(efs_inode->di_mtime); diff --git a/fs/efs/super.c b/fs/efs/super.c index e755ec7..2002431 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -96,6 +96,11 @@ static int init_inodecache(void) static void destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(efs_inode_cachep); } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index eedec84..da72250 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -346,7 +346,7 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p) /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ static inline int ep_op_has_event(int op) { - return op != EPOLL_CTL_DEL; + return op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD; } /* Initialize the poll safe wake up structure */ @@ -676,6 +676,34 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) return 0; } +/* + * Disables a "struct epitem" in the eventpoll set. Returns -EBUSY if the item + * had no event flags set, indicating that another thread may be currently + * handling that item's events (in the case that EPOLLONESHOT was being + * used). Otherwise a zero result indicates that the item has been disabled + * from receiving events. A disabled item may be re-enabled via + * EPOLL_CTL_MOD. Must be called with "mtx" held. + */ +static int ep_disable(struct eventpoll *ep, struct epitem *epi) +{ + int result = 0; + unsigned long flags; + + spin_lock_irqsave(&ep->lock, flags); + if (epi->event.events & ~EP_PRIVATE_BITS) { + if (ep_is_linked(&epi->rdllink)) + list_del_init(&epi->rdllink); + /* Ensure ep_poll_callback will not add epi back onto ready + list: */ + epi->event.events &= EP_PRIVATE_BITS; + } + else + result = -EBUSY; + spin_unlock_irqrestore(&ep->lock, flags); + + return result; +} + static void ep_free(struct eventpoll *ep) { struct rb_node *rbp; @@ -1020,8 +1048,6 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) rb_insert_color(&epi->rbn, &ep->rbr); } - - #define PATH_ARR_SIZE 5 /* * These are the number paths of length 1 to 5, that we are allowing to emanate @@ -1787,6 +1813,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, } else error = -ENOENT; break; + case EPOLL_CTL_DISABLE: + if (epi) + error = ep_disable(ep, epi); + else + error = -ENOENT; + break; } mutex_unlock(&ep->mtx); @@ -1810,7 +1842,7 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout) { int error; - struct file *file; + struct fd f; struct eventpoll *ep; /* The maximum number of event must be greater than zero */ @@ -1818,38 +1850,33 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, return -EINVAL; /* Verify that the area passed by the user is writeable */ - if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) { - error = -EFAULT; - goto error_return; - } + if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) + return -EFAULT; /* Get the "struct file *" for the eventpoll file */ - error = -EBADF; - file = fget(epfd); - if (!file) - goto error_return; + f = fdget(epfd); + if (!f.file) + return -EBADF; /* * We have to check that the file structure underneath the fd * the user passed to us _is_ an eventpoll file. */ error = -EINVAL; - if (!is_file_epoll(file)) + if (!is_file_epoll(f.file)) goto error_fput; /* * At this point it is safe to assume that the "private_data" contains * our own data structure. */ - ep = file->private_data; + ep = f.file->private_data; /* Time to fish for events ... */ error = ep_poll(ep, events, maxevents, timeout); error_fput: - fput(file); -error_return: - + fdput(f); return error; } @@ -59,26 +59,15 @@ #include <asm/uaccess.h> #include <asm/mmu_context.h> #include <asm/tlb.h> -#include <asm/exec.h> #include <trace/events/task.h> #include "internal.h" +#include "coredump.h" #include <trace/events/sched.h> -int core_uses_pid; -char core_pattern[CORENAME_MAX_SIZE] = "core"; -unsigned int core_pipe_limit; int suid_dumpable = 0; -struct core_name { - char *corename; - int used, size; -}; -static atomic_t call_count = ATOMIC_INIT(1); - -/* The maximal length of core_pattern is also specified in sysctl.c */ - static LIST_HEAD(formats); static DEFINE_RWLOCK(binfmt_lock); @@ -402,7 +391,7 @@ struct user_arg_ptr { union { const char __user *const __user *native; #ifdef CONFIG_COMPAT - compat_uptr_t __user *compat; + const compat_uptr_t __user *compat; #endif } ptr; }; @@ -613,7 +602,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * process cleanup to remove whatever mess we made. */ if (length != move_page_tables(vma, old_start, - vma, new_start, length)) + vma, new_start, length, false)) return -ENOMEM; lru_add_drain(); @@ -888,9 +877,11 @@ static int de_thread(struct task_struct *tsk) sig->notify_count--; while (sig->notify_count) { - __set_current_state(TASK_UNINTERRUPTIBLE); + __set_current_state(TASK_KILLABLE); spin_unlock_irq(lock); schedule(); + if (unlikely(__fatal_signal_pending(tsk))) + goto killed; spin_lock_irq(lock); } spin_unlock_irq(lock); @@ -908,9 +899,11 @@ static int de_thread(struct task_struct *tsk) write_lock_irq(&tasklist_lock); if (likely(leader->exit_state)) break; - __set_current_state(TASK_UNINTERRUPTIBLE); + __set_current_state(TASK_KILLABLE); write_unlock_irq(&tasklist_lock); schedule(); + if (unlikely(__fatal_signal_pending(tsk))) + goto killed; } /* @@ -1004,40 +997,14 @@ no_thread_group: BUG_ON(!thread_group_leader(tsk)); return 0; -} - -/* - * These functions flushes out all traces of the currently running executable - * so that a new one can be started - */ -static void flush_old_files(struct files_struct * files) -{ - long j = -1; - struct fdtable *fdt; - - spin_lock(&files->file_lock); - for (;;) { - unsigned long set, i; - j++; - i = j * BITS_PER_LONG; - fdt = files_fdtable(files); - if (i >= fdt->max_fds) - break; - set = fdt->close_on_exec[j]; - if (!set) - continue; - fdt->close_on_exec[j] = 0; - spin_unlock(&files->file_lock); - for ( ; set ; i++,set >>= 1) { - if (set & 1) { - sys_close(i); - } - } - spin_lock(&files->file_lock); - - } - spin_unlock(&files->file_lock); +killed: + /* protects against exit_notify() and __exit_signal() */ + read_lock(&tasklist_lock); + sig->group_exit_task = NULL; + sig->notify_count = 0; + read_unlock(&tasklist_lock); + return -EAGAIN; } char *get_task_comm(char *buf, struct task_struct *tsk) @@ -1050,6 +1017,11 @@ char *get_task_comm(char *buf, struct task_struct *tsk) } EXPORT_SYMBOL_GPL(get_task_comm); +/* + * These functions flushes out all traces of the currently running executable + * so that a new one can be started + */ + void set_task_comm(struct task_struct *tsk, char *buf) { task_lock(tsk); @@ -1136,7 +1108,7 @@ void setup_new_exec(struct linux_binprm * bprm) current->sas_ss_sp = current->sas_ss_size = 0; if (uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid())) - set_dumpable(current->mm, 1); + set_dumpable(current->mm, SUID_DUMPABLE_ENABLED); else set_dumpable(current->mm, suid_dumpable); @@ -1171,7 +1143,7 @@ void setup_new_exec(struct linux_binprm * bprm) current->self_exec_id++; flush_signal_handlers(current, 0); - flush_old_files(current->files); + do_close_on_exec(current->files); } EXPORT_SYMBOL(setup_new_exec); @@ -1601,9 +1573,9 @@ int do_execve(const char *filename, } #ifdef CONFIG_COMPAT -int compat_do_execve(char *filename, - compat_uptr_t __user *__argv, - compat_uptr_t __user *__envp, +int compat_do_execve(const char *filename, + const compat_uptr_t __user *__argv, + const compat_uptr_t __user *__envp, struct pt_regs *regs) { struct user_arg_ptr argv = { @@ -1632,353 +1604,6 @@ void set_binfmt(struct linux_binfmt *new) EXPORT_SYMBOL(set_binfmt); -static int expand_corename(struct core_name *cn) -{ - char *old_corename = cn->corename; - - cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count); - cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL); - - if (!cn->corename) { - kfree(old_corename); - return -ENOMEM; - } - - return 0; -} - -static int cn_printf(struct core_name *cn, const char *fmt, ...) -{ - char *cur; - int need; - int ret; - va_list arg; - - va_start(arg, fmt); - need = vsnprintf(NULL, 0, fmt, arg); - va_end(arg); - - if (likely(need < cn->size - cn->used - 1)) - goto out_printf; - - ret = expand_corename(cn); - if (ret) - goto expand_fail; - -out_printf: - cur = cn->corename + cn->used; - va_start(arg, fmt); - vsnprintf(cur, need + 1, fmt, arg); - va_end(arg); - cn->used += need; - return 0; - -expand_fail: - return ret; -} - -static void cn_escape(char *str) -{ - for (; *str; str++) - if (*str == '/') - *str = '!'; -} - -static int cn_print_exe_file(struct core_name *cn) -{ - struct file *exe_file; - char *pathbuf, *path; - int ret; - - exe_file = get_mm_exe_file(current->mm); - if (!exe_file) { - char *commstart = cn->corename + cn->used; - ret = cn_printf(cn, "%s (path unknown)", current->comm); - cn_escape(commstart); - return ret; - } - - pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY); - if (!pathbuf) { - ret = -ENOMEM; - goto put_exe_file; - } - - path = d_path(&exe_file->f_path, pathbuf, PATH_MAX); - if (IS_ERR(path)) { - ret = PTR_ERR(path); - goto free_buf; - } - - cn_escape(path); - - ret = cn_printf(cn, "%s", path); - -free_buf: - kfree(pathbuf); -put_exe_file: - fput(exe_file); - return ret; -} - -/* format_corename will inspect the pattern parameter, and output a - * name into corename, which must have space for at least - * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. - */ -static int format_corename(struct core_name *cn, long signr) -{ - const struct cred *cred = current_cred(); - const char *pat_ptr = core_pattern; - int ispipe = (*pat_ptr == '|'); - int pid_in_pattern = 0; - int err = 0; - - cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count); - cn->corename = kmalloc(cn->size, GFP_KERNEL); - cn->used = 0; - - if (!cn->corename) - return -ENOMEM; - - /* Repeat as long as we have more pattern to process and more output - space */ - while (*pat_ptr) { - if (*pat_ptr != '%') { - if (*pat_ptr == 0) - goto out; - err = cn_printf(cn, "%c", *pat_ptr++); - } else { - switch (*++pat_ptr) { - /* single % at the end, drop that */ - case 0: - goto out; - /* Double percent, output one percent */ - case '%': - err = cn_printf(cn, "%c", '%'); - break; - /* pid */ - case 'p': - pid_in_pattern = 1; - err = cn_printf(cn, "%d", - task_tgid_vnr(current)); - break; - /* uid */ - case 'u': - err = cn_printf(cn, "%d", cred->uid); - break; - /* gid */ - case 'g': - err = cn_printf(cn, "%d", cred->gid); - break; - /* signal that caused the coredump */ - case 's': - err = cn_printf(cn, "%ld", signr); - break; - /* UNIX time of coredump */ - case 't': { - struct timeval tv; - do_gettimeofday(&tv); - err = cn_printf(cn, "%lu", tv.tv_sec); - break; - } - /* hostname */ - case 'h': { - char *namestart = cn->corename + cn->used; - down_read(&uts_sem); - err = cn_printf(cn, "%s", - utsname()->nodename); - up_read(&uts_sem); - cn_escape(namestart); - break; - } - /* executable */ - case 'e': { - char *commstart = cn->corename + cn->used; - err = cn_printf(cn, "%s", current->comm); - cn_escape(commstart); - break; - } - case 'E': - err = cn_print_exe_file(cn); - break; - /* core limit size */ - case 'c': - err = cn_printf(cn, "%lu", - rlimit(RLIMIT_CORE)); - break; - default: - break; - } - ++pat_ptr; - } - - if (err) - return err; - } - - /* Backward compatibility with core_uses_pid: - * - * If core_pattern does not include a %p (as is the default) - * and core_uses_pid is set, then .%pid will be appended to - * the filename. Do not do this for piped commands. */ - if (!ispipe && !pid_in_pattern && core_uses_pid) { - err = cn_printf(cn, ".%d", task_tgid_vnr(current)); - if (err) - return err; - } -out: - return ispipe; -} - -static int zap_process(struct task_struct *start, int exit_code) -{ - struct task_struct *t; - int nr = 0; - - start->signal->flags = SIGNAL_GROUP_EXIT; - start->signal->group_exit_code = exit_code; - start->signal->group_stop_count = 0; - - t = start; - do { - task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); - if (t != current && t->mm) { - sigaddset(&t->pending.signal, SIGKILL); - signal_wake_up(t, 1); - nr++; - } - } while_each_thread(start, t); - - return nr; -} - -static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, - struct core_state *core_state, int exit_code) -{ - struct task_struct *g, *p; - unsigned long flags; - int nr = -EAGAIN; - - spin_lock_irq(&tsk->sighand->siglock); - if (!signal_group_exit(tsk->signal)) { - mm->core_state = core_state; - nr = zap_process(tsk, exit_code); - } - spin_unlock_irq(&tsk->sighand->siglock); - if (unlikely(nr < 0)) - return nr; - - if (atomic_read(&mm->mm_users) == nr + 1) - goto done; - /* - * We should find and kill all tasks which use this mm, and we should - * count them correctly into ->nr_threads. We don't take tasklist - * lock, but this is safe wrt: - * - * fork: - * None of sub-threads can fork after zap_process(leader). All - * processes which were created before this point should be - * visible to zap_threads() because copy_process() adds the new - * process to the tail of init_task.tasks list, and lock/unlock - * of ->siglock provides a memory barrier. - * - * do_exit: - * The caller holds mm->mmap_sem. This means that the task which - * uses this mm can't pass exit_mm(), so it can't exit or clear - * its ->mm. - * - * de_thread: - * It does list_replace_rcu(&leader->tasks, ¤t->tasks), - * we must see either old or new leader, this does not matter. - * However, it can change p->sighand, so lock_task_sighand(p) - * must be used. Since p->mm != NULL and we hold ->mmap_sem - * it can't fail. - * - * Note also that "g" can be the old leader with ->mm == NULL - * and already unhashed and thus removed from ->thread_group. - * This is OK, __unhash_process()->list_del_rcu() does not - * clear the ->next pointer, we will find the new leader via - * next_thread(). - */ - rcu_read_lock(); - for_each_process(g) { - if (g == tsk->group_leader) - continue; - if (g->flags & PF_KTHREAD) - continue; - p = g; - do { - if (p->mm) { - if (unlikely(p->mm == mm)) { - lock_task_sighand(p, &flags); - nr += zap_process(p, exit_code); - unlock_task_sighand(p, &flags); - } - break; - } - } while_each_thread(g, p); - } - rcu_read_unlock(); -done: - atomic_set(&core_state->nr_threads, nr); - return nr; -} - -static int coredump_wait(int exit_code, struct core_state *core_state) -{ - struct task_struct *tsk = current; - struct mm_struct *mm = tsk->mm; - int core_waiters = -EBUSY; - - init_completion(&core_state->startup); - core_state->dumper.task = tsk; - core_state->dumper.next = NULL; - - down_write(&mm->mmap_sem); - if (!mm->core_state) - core_waiters = zap_threads(tsk, mm, core_state, exit_code); - up_write(&mm->mmap_sem); - - if (core_waiters > 0) { - struct core_thread *ptr; - - wait_for_completion(&core_state->startup); - /* - * Wait for all the threads to become inactive, so that - * all the thread context (extended register state, like - * fpu etc) gets copied to the memory. - */ - ptr = core_state->dumper.next; - while (ptr != NULL) { - wait_task_inactive(ptr->task, 0); - ptr = ptr->next; - } - } - - return core_waiters; -} - -static void coredump_finish(struct mm_struct *mm) -{ - struct core_thread *curr, *next; - struct task_struct *task; - - next = mm->core_state->dumper.next; - while ((curr = next) != NULL) { - next = curr->next; - task = curr->task; - /* - * see exit_mm(), curr->task must not see - * ->task == NULL before we read ->next. - */ - smp_mb(); - curr->task = NULL; - wake_up_process(task); - } - - mm->core_state = NULL; -} - /* * set_dumpable converts traditional three-value dumpable to two flags and * stores them into mm->flags. It modifies lower two bits of mm->flags, but @@ -2020,7 +1645,7 @@ void set_dumpable(struct mm_struct *mm, int value) } } -static int __get_dumpable(unsigned long mm_flags) +int __get_dumpable(unsigned long mm_flags) { int ret; @@ -2033,289 +1658,54 @@ int get_dumpable(struct mm_struct *mm) return __get_dumpable(mm->flags); } -static void wait_for_dump_helpers(struct file *file) +#ifdef __ARCH_WANT_SYS_EXECVE +SYSCALL_DEFINE3(execve, + const char __user *, filename, + const char __user *const __user *, argv, + const char __user *const __user *, envp) { - struct pipe_inode_info *pipe; - - pipe = file->f_path.dentry->d_inode->i_pipe; - - pipe_lock(pipe); - pipe->readers++; - pipe->writers--; - - while ((pipe->readers > 1) && (!signal_pending(current))) { - wake_up_interruptible_sync(&pipe->wait); - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); - pipe_wait(pipe); + const char *path = getname(filename); + int error = PTR_ERR(path); + if (!IS_ERR(path)) { + error = do_execve(path, argv, envp, current_pt_regs()); + putname(path); } - - pipe->readers--; - pipe->writers++; - pipe_unlock(pipe); - + return error; } - - -/* - * umh_pipe_setup - * helper function to customize the process used - * to collect the core in userspace. Specifically - * it sets up a pipe and installs it as fd 0 (stdin) - * for the process. Returns 0 on success, or - * PTR_ERR on failure. - * Note that it also sets the core limit to 1. This - * is a special value that we use to trap recursive - * core dumps - */ -static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) -{ - struct file *files[2]; - struct fdtable *fdt; - struct coredump_params *cp = (struct coredump_params *)info->data; - struct files_struct *cf = current->files; - int err = create_pipe_files(files, 0); - if (err) - return err; - - cp->file = files[1]; - - sys_close(0); - fd_install(0, files[0]); - spin_lock(&cf->file_lock); - fdt = files_fdtable(cf); - __set_open_fd(0, fdt); - __clear_close_on_exec(0, fdt); - spin_unlock(&cf->file_lock); - - /* and disallow core files too */ - current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1}; - - return 0; +#ifdef CONFIG_COMPAT +asmlinkage long compat_sys_execve(const char __user * filename, + const compat_uptr_t __user * argv, + const compat_uptr_t __user * envp) +{ + const char *path = getname(filename); + int error = PTR_ERR(path); + if (!IS_ERR(path)) { + error = compat_do_execve(path, argv, envp, current_pt_regs()); + putname(path); + } + return error; } +#endif +#endif -void do_coredump(long signr, int exit_code, struct pt_regs *regs) +#ifdef __ARCH_WANT_KERNEL_EXECVE +int kernel_execve(const char *filename, + const char *const argv[], + const char *const envp[]) { - struct core_state core_state; - struct core_name cn; - struct mm_struct *mm = current->mm; - struct linux_binfmt * binfmt; - const struct cred *old_cred; - struct cred *cred; - int retval = 0; - int flag = 0; - int ispipe; - bool need_nonrelative = false; - static atomic_t core_dump_count = ATOMIC_INIT(0); - struct coredump_params cprm = { - .signr = signr, - .regs = regs, - .limit = rlimit(RLIMIT_CORE), - /* - * We must use the same mm->flags while dumping core to avoid - * inconsistency of bit flags, since this flag is not protected - * by any locks. - */ - .mm_flags = mm->flags, - }; - - audit_core_dumps(signr); - - binfmt = mm->binfmt; - if (!binfmt || !binfmt->core_dump) - goto fail; - if (!__get_dumpable(cprm.mm_flags)) - goto fail; - - cred = prepare_creds(); - if (!cred) - goto fail; - /* - * We cannot trust fsuid as being the "true" uid of the process - * nor do we know its entire history. We only know it was tainted - * so we dump it as root in mode 2, and only into a controlled - * environment (pipe handler or fully qualified path). - */ - if (__get_dumpable(cprm.mm_flags) == SUID_DUMPABLE_SAFE) { - /* Setuid core dump mode */ - flag = O_EXCL; /* Stop rewrite attacks */ - cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */ - need_nonrelative = true; - } - - retval = coredump_wait(exit_code, &core_state); - if (retval < 0) - goto fail_creds; + struct pt_regs *p = current_pt_regs(); + int ret; - old_cred = override_creds(cred); + ret = do_execve(filename, + (const char __user *const __user *)argv, + (const char __user *const __user *)envp, p); + if (ret < 0) + return ret; /* - * Clear any false indication of pending signals that might - * be seen by the filesystem code called to write the core file. + * We were successful. We won't be returning to our caller, but + * instead to user space by manipulating the kernel stack. */ - clear_thread_flag(TIF_SIGPENDING); - - ispipe = format_corename(&cn, signr); - - if (ispipe) { - int dump_count; - char **helper_argv; - - if (ispipe < 0) { - printk(KERN_WARNING "format_corename failed\n"); - printk(KERN_WARNING "Aborting core\n"); - goto fail_corename; - } - - if (cprm.limit == 1) { - /* See umh_pipe_setup() which sets RLIMIT_CORE = 1. - * - * Normally core limits are irrelevant to pipes, since - * we're not writing to the file system, but we use - * cprm.limit of 1 here as a speacial value, this is a - * consistent way to catch recursive crashes. - * We can still crash if the core_pattern binary sets - * RLIM_CORE = !1, but it runs as root, and can do - * lots of stupid things. - * - * Note that we use task_tgid_vnr here to grab the pid - * of the process group leader. That way we get the - * right pid if a thread in a multi-threaded - * core_pattern process dies. - */ - printk(KERN_WARNING - "Process %d(%s) has RLIMIT_CORE set to 1\n", - task_tgid_vnr(current), current->comm); - printk(KERN_WARNING "Aborting core\n"); - goto fail_unlock; - } - cprm.limit = RLIM_INFINITY; - - dump_count = atomic_inc_return(&core_dump_count); - if (core_pipe_limit && (core_pipe_limit < dump_count)) { - printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n", - task_tgid_vnr(current), current->comm); - printk(KERN_WARNING "Skipping core dump\n"); - goto fail_dropcount; - } - - helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL); - if (!helper_argv) { - printk(KERN_WARNING "%s failed to allocate memory\n", - __func__); - goto fail_dropcount; - } - - retval = call_usermodehelper_fns(helper_argv[0], helper_argv, - NULL, UMH_WAIT_EXEC, umh_pipe_setup, - NULL, &cprm); - argv_free(helper_argv); - if (retval) { - printk(KERN_INFO "Core dump to %s pipe failed\n", - cn.corename); - goto close_fail; - } - } else { - struct inode *inode; - - if (cprm.limit < binfmt->min_coredump) - goto fail_unlock; - - if (need_nonrelative && cn.corename[0] != '/') { - printk(KERN_WARNING "Pid %d(%s) can only dump core "\ - "to fully qualified path!\n", - task_tgid_vnr(current), current->comm); - printk(KERN_WARNING "Skipping core dump\n"); - goto fail_unlock; - } - - cprm.file = filp_open(cn.corename, - O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, - 0600); - if (IS_ERR(cprm.file)) - goto fail_unlock; - - inode = cprm.file->f_path.dentry->d_inode; - if (inode->i_nlink > 1) - goto close_fail; - if (d_unhashed(cprm.file->f_path.dentry)) - goto close_fail; - /* - * AK: actually i see no reason to not allow this for named - * pipes etc, but keep the previous behaviour for now. - */ - if (!S_ISREG(inode->i_mode)) - goto close_fail; - /* - * Dont allow local users get cute and trick others to coredump - * into their pre-created files. - */ - if (!uid_eq(inode->i_uid, current_fsuid())) - goto close_fail; - if (!cprm.file->f_op || !cprm.file->f_op->write) - goto close_fail; - if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file)) - goto close_fail; - } - - retval = binfmt->core_dump(&cprm); - if (retval) - current->signal->group_exit_code |= 0x80; - - if (ispipe && core_pipe_limit) - wait_for_dump_helpers(cprm.file); -close_fail: - if (cprm.file) - filp_close(cprm.file, NULL); -fail_dropcount: - if (ispipe) - atomic_dec(&core_dump_count); -fail_unlock: - kfree(cn.corename); -fail_corename: - coredump_finish(mm); - revert_creds(old_cred); -fail_creds: - put_cred(cred); -fail: - return; -} - -/* - * Core dumping helper functions. These are the only things you should - * do on a core-file: use only these functions to write out all the - * necessary info. - */ -int dump_write(struct file *file, const void *addr, int nr) -{ - return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr; + ret_from_kernel_execve(p); } -EXPORT_SYMBOL(dump_write); - -int dump_seek(struct file *file, loff_t off) -{ - int ret = 1; - - if (file->f_op->llseek && file->f_op->llseek != no_llseek) { - if (file->f_op->llseek(file, off, SEEK_CUR) < 0) - return 0; - } else { - char *buf = (char *)get_zeroed_page(GFP_KERNEL); - - if (!buf) - return 0; - while (off > 0) { - unsigned long n = off; - - if (n > PAGE_SIZE) - n = PAGE_SIZE; - if (!dump_write(file, buf, n)) { - ret = 0; - break; - } - off -= n; - } - free_page((unsigned long)buf); - } - return ret; -} -EXPORT_SYMBOL(dump_seek); +#endif diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 1562c27..b561810 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -1172,8 +1172,8 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) /* copy stuff from on-disk struct to in-memory struct */ inode->i_mode = le16_to_cpu(fcb.i_mode); - inode->i_uid = le32_to_cpu(fcb.i_uid); - inode->i_gid = le32_to_cpu(fcb.i_gid); + i_uid_write(inode, le32_to_cpu(fcb.i_uid)); + i_gid_write(inode, le32_to_cpu(fcb.i_gid)); set_nlink(inode, le16_to_cpu(fcb.i_links_count)); inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime); inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime); @@ -1385,8 +1385,8 @@ static int exofs_update_inode(struct inode *inode, int do_sync) fcb = &args->fcb; fcb->i_mode = cpu_to_le16(inode->i_mode); - fcb->i_uid = cpu_to_le32(inode->i_uid); - fcb->i_gid = cpu_to_le32(inode->i_gid); + fcb->i_uid = cpu_to_le32(i_uid_read(inode)); + fcb->i_gid = cpu_to_le32(i_gid_read(inode)); fcb->i_links_count = cpu_to_le16(inode->i_nlink); fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec); diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index 5f376d1..b963f38 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -203,7 +203,7 @@ static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d) static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d) { - unsigned p; + int p; for (p = sp2d->pages_in_unit - 1; p >= 0; --p) { struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; diff --git a/fs/exofs/super.c b/fs/exofs/super.c index dde41a7..59e3bbf 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -206,6 +206,11 @@ static int init_inodecache(void) */ static void destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(exofs_inode_cachep); } diff --git a/fs/exofs/sys.c b/fs/exofs/sys.c index 5a7b691..1b4f2f9 100644 --- a/fs/exofs/sys.c +++ b/fs/exofs/sys.c @@ -80,8 +80,13 @@ static ssize_t uri_show(struct exofs_dev *edp, char *buf) static ssize_t uri_store(struct exofs_dev *edp, const char *buf, size_t len) { + uint8_t *new_uri; + edp->urilen = strlen(buf) + 1; - edp->uri = krealloc(edp->uri, edp->urilen, GFP_KERNEL); + new_uri = krealloc(edp->uri, edp->urilen, GFP_KERNEL); + if (new_uri == NULL) + return -ENOMEM; + edp->uri = new_uri; strncpy(edp->uri, buf, edp->urilen); return edp->urilen; } diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 35d6a3c..110b6b3 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -53,16 +53,23 @@ ext2_acl_from_disk(const void *value, size_t size) case ACL_OTHER: value = (char *)value + sizeof(ext2_acl_entry_short); - acl->a_entries[n].e_id = ACL_UNDEFINED_ID; break; case ACL_USER: + value = (char *)value + sizeof(ext2_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_uid = + make_kuid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; case ACL_GROUP: value = (char *)value + sizeof(ext2_acl_entry); if ((char *)value > end) goto fail; - acl->a_entries[n].e_id = - le32_to_cpu(entry->e_id); + acl->a_entries[n].e_gid = + make_kgid(&init_user_ns, + le32_to_cpu(entry->e_id)); break; default: @@ -96,14 +103,19 @@ ext2_acl_to_disk(const struct posix_acl *acl, size_t *size) ext_acl->a_version = cpu_to_le32(EXT2_ACL_VERSION); e = (char *)ext_acl + sizeof(ext2_acl_header); for (n=0; n < acl->a_count; n++) { + const struct posix_acl_entry *acl_e = &acl->a_entries[n]; ext2_acl_entry *entry = (ext2_acl_entry *)e; - entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); - entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); - switch(acl->a_entries[n].e_tag) { + entry->e_tag = cpu_to_le16(acl_e->e_tag); + entry->e_perm = cpu_to_le16(acl_e->e_perm); + switch(acl_e->e_tag) { case ACL_USER: + entry->e_id = cpu_to_le32( + from_kuid(&init_user_ns, acl_e->e_uid)); + e += sizeof(ext2_acl_entry); + break; case ACL_GROUP: - entry->e_id = - cpu_to_le32(acl->a_entries[n].e_id); + entry->e_id = cpu_to_le32( + from_kgid(&init_user_ns, acl_e->e_gid)); e += sizeof(ext2_acl_entry); break; @@ -350,7 +362,7 @@ ext2_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, return PTR_ERR(acl); if (acl == NULL) return -ENODATA; - error = posix_acl_to_xattr(acl, buffer, size); + error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); posix_acl_release(acl); return error; @@ -371,7 +383,7 @@ ext2_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, return -EPERM; if (value) { - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (IS_ERR(acl)) return PTR_ERR(acl); else if (acl) { diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index 376aa77..2616d0e 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -479,7 +479,7 @@ void ext2_discard_reservation(struct inode *inode) /** * ext2_free_blocks() -- Free given blocks and update quota and i_blocks * @inode: inode - * @block: start physcial block to free + * @block: start physical block to free * @count: number of blocks to free */ void ext2_free_blocks (struct inode * inode, unsigned long block, diff --git a/fs/ext2/super.c b/fs/ext2/super.c index af74d9e..6c205d0 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -206,6 +206,11 @@ static int init_inodecache(void) static void destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(ext2_inode_cachep); } diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index c76832c..dbb5ad5 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -48,16 +48,23 @@ ext3_acl_from_disk(const void *value, size_t size) case ACL_OTHER: value = (char *)value + sizeof(ext3_acl_entry_short); - acl->a_entries[n].e_id = ACL_UNDEFINED_ID; break; case ACL_USER: + value = (char *)value + sizeof(ext3_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_uid = + make_kuid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; case ACL_GROUP: value = (char *)value + sizeof(ext3_acl_entry); if ((char *)value > end) goto fail; - acl->a_entries[n].e_id = - le32_to_cpu(entry->e_id); + acl->a_entries[n].e_gid = + make_kgid(&init_user_ns, + le32_to_cpu(entry->e_id)); break; default: @@ -91,14 +98,19 @@ ext3_acl_to_disk(const struct posix_acl *acl, size_t *size) ext_acl->a_version = cpu_to_le32(EXT3_ACL_VERSION); e = (char *)ext_acl + sizeof(ext3_acl_header); for (n=0; n < acl->a_count; n++) { + const struct posix_acl_entry *acl_e = &acl->a_entries[n]; ext3_acl_entry *entry = (ext3_acl_entry *)e; - entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); - entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); - switch(acl->a_entries[n].e_tag) { + entry->e_tag = cpu_to_le16(acl_e->e_tag); + entry->e_perm = cpu_to_le16(acl_e->e_perm); + switch(acl_e->e_tag) { case ACL_USER: + entry->e_id = cpu_to_le32( + from_kuid(&init_user_ns, acl_e->e_uid)); + e += sizeof(ext3_acl_entry); + break; case ACL_GROUP: - entry->e_id = - cpu_to_le32(acl->a_entries[n].e_id); + entry->e_id = cpu_to_le32( + from_kgid(&init_user_ns, acl_e->e_gid)); e += sizeof(ext3_acl_entry); break; @@ -369,7 +381,7 @@ ext3_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, return PTR_ERR(acl); if (acl == NULL) return -ENODATA; - error = posix_acl_to_xattr(acl, buffer, size); + error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); posix_acl_release(acl); return error; @@ -392,7 +404,7 @@ ext3_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, return -EPERM; if (value) { - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (IS_ERR(acl)) return PTR_ERR(acl); else if (acl) { diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index 90d901f..7320a66 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -483,7 +483,7 @@ void ext3_discard_reservation(struct inode *inode) * ext3_free_blocks_sb() -- Free given blocks and update quota * @handle: handle to this transaction * @sb: super block - * @block: start physcial block to free + * @block: start physical block to free * @count: number of blocks to free * @pdquot_freed_blocks: pointer to quota */ diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index a075973..7e87e37 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -3072,6 +3072,8 @@ static int ext3_do_update_inode(handle_t *handle, struct ext3_inode_info *ei = EXT3_I(inode); struct buffer_head *bh = iloc->bh; int err = 0, rc, block; + int need_datasync = 0; + __le32 disksize; uid_t i_uid; gid_t i_gid; @@ -3113,7 +3115,11 @@ again: raw_inode->i_gid_high = 0; } raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); - raw_inode->i_size = cpu_to_le32(ei->i_disksize); + disksize = cpu_to_le32(ei->i_disksize); + if (disksize != raw_inode->i_size) { + need_datasync = 1; + raw_inode->i_size = disksize; + } raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); @@ -3129,8 +3135,11 @@ again: if (!S_ISREG(inode->i_mode)) { raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); } else { - raw_inode->i_size_high = - cpu_to_le32(ei->i_disksize >> 32); + disksize = cpu_to_le32(ei->i_disksize >> 32); + if (disksize != raw_inode->i_size_high) { + raw_inode->i_size_high = disksize; + need_datasync = 1; + } if (ei->i_disksize > 0x7fffffffULL) { struct super_block *sb = inode->i_sb; if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, @@ -3183,6 +3192,8 @@ again: ext3_clear_inode_state(inode, EXT3_STATE_NEW); atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid); + if (need_datasync) + atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); out_brelse: brelse (bh); ext3_std_error(inode->i_sb, err); @@ -3196,7 +3207,7 @@ out_brelse: * * - Within generic_file_write() for O_SYNC files. * Here, there will be no transaction running. We wait for any running - * trasnaction to commit. + * transaction to commit. * * - Within sys_sync(), kupdate and such. * We wait on commit, if tol to. diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 8c892e9..17ae5c8 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -532,6 +532,11 @@ static int init_inodecache(void) static void destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(ext3_inode_cachep); } @@ -975,7 +980,7 @@ static int parse_options (char *options, struct super_block *sb, * Initialize args struct so we know whether arg was * found; some options take optional arguments. */ - args[0].to = args[0].from = 0; + args[0].to = args[0].from = NULL; token = match_token(p, tokens, args); switch (token) { case Opt_bsd_df: @@ -1479,10 +1484,12 @@ static void ext3_orphan_cleanup (struct super_block * sb, } if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) { - if (es->s_last_orphan) + /* don't clear list on RO mount w/ errors */ + if (es->s_last_orphan && !(s_flags & MS_RDONLY)) { jbd_debug(1, "Errors on filesystem, " "clearing orphan list.\n"); - es->s_last_orphan = 0; + es->s_last_orphan = 0; + } jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); return; } @@ -2803,7 +2810,7 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf) static inline struct inode *dquot_to_inode(struct dquot *dquot) { - return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; + return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type]; } static int ext3_write_dquot(struct dquot *dquot) diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index a5c29bb..d3c5b88 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -55,16 +55,23 @@ ext4_acl_from_disk(const void *value, size_t size) case ACL_OTHER: value = (char *)value + sizeof(ext4_acl_entry_short); - acl->a_entries[n].e_id = ACL_UNDEFINED_ID; break; case ACL_USER: + value = (char *)value + sizeof(ext4_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_uid = + make_kuid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; case ACL_GROUP: value = (char *)value + sizeof(ext4_acl_entry); if ((char *)value > end) goto fail; - acl->a_entries[n].e_id = - le32_to_cpu(entry->e_id); + acl->a_entries[n].e_gid = + make_kgid(&init_user_ns, + le32_to_cpu(entry->e_id)); break; default: @@ -98,13 +105,19 @@ ext4_acl_to_disk(const struct posix_acl *acl, size_t *size) ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION); e = (char *)ext_acl + sizeof(ext4_acl_header); for (n = 0; n < acl->a_count; n++) { + const struct posix_acl_entry *acl_e = &acl->a_entries[n]; ext4_acl_entry *entry = (ext4_acl_entry *)e; - entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); - entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); - switch (acl->a_entries[n].e_tag) { + entry->e_tag = cpu_to_le16(acl_e->e_tag); + entry->e_perm = cpu_to_le16(acl_e->e_perm); + switch (acl_e->e_tag) { case ACL_USER: + entry->e_id = cpu_to_le32( + from_kuid(&init_user_ns, acl_e->e_uid)); + e += sizeof(ext4_acl_entry); + break; case ACL_GROUP: - entry->e_id = cpu_to_le32(acl->a_entries[n].e_id); + entry->e_id = cpu_to_le32( + from_kgid(&init_user_ns, acl_e->e_gid)); e += sizeof(ext4_acl_entry); break; @@ -374,7 +387,7 @@ ext4_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, return PTR_ERR(acl); if (acl == NULL) return -ENODATA; - error = posix_acl_to_xattr(acl, buffer, size); + error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); posix_acl_release(acl); return error; @@ -397,7 +410,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, return -EPERM; if (value) { - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (IS_ERR(acl)) return PTR_ERR(acl); else if (acl) { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index c3411d4..3ab2539 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -186,7 +186,6 @@ struct mpage_da_data { #define EXT4_IO_END_ERROR 0x0002 #define EXT4_IO_END_QUEUED 0x0004 #define EXT4_IO_END_DIRECT 0x0008 -#define EXT4_IO_END_IN_FSYNC 0x0010 struct ext4_io_page { struct page *p_page; @@ -912,9 +911,7 @@ struct ext4_inode_info { struct list_head i_completed_io_list; spinlock_t i_completed_io_lock; atomic_t i_ioend_count; /* Number of outstanding io_end structs */ - /* current io_end structure for async DIO write*/ - ext4_io_end_t *cur_aio_dio; - atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */ + atomic_t i_unwritten; /* Nr. of inflight conversions pending */ spinlock_t i_block_reservation_lock; @@ -1233,6 +1230,7 @@ struct ext4_sb_info { spinlock_t s_md_lock; unsigned short *s_mb_offsets; unsigned int *s_mb_maxs; + unsigned int s_group_info_size; /* tunables */ unsigned long s_stripe; @@ -1243,6 +1241,7 @@ struct ext4_sb_info { unsigned int s_mb_order2_reqs; unsigned int s_mb_group_prealloc; unsigned int s_max_writeback_mb_bump; + unsigned int s_max_dir_size_kb; /* where last allocation was done - for stream allocation */ unsigned long s_mb_last_group; unsigned long s_mb_last_start; @@ -1270,8 +1269,12 @@ struct ext4_sb_info { unsigned long s_sectors_written_start; u64 s_kbytes_written; + /* the size of zero-out chunk */ + unsigned int s_extent_max_zeroout_kb; + unsigned int s_log_groups_per_flex; struct flex_groups *s_flex_groups; + ext4_group_t s_flex_groups_allocated; /* workqueue for dio unwritten */ struct workqueue_struct *dio_unwritten_wq; @@ -1328,10 +1331,20 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode, { if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { io_end->flag |= EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + atomic_inc(&EXT4_I(inode)->i_unwritten); } } +static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode) +{ + return inode->i_private; +} + +static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io) +{ + inode->i_private = io; +} + /* * Inode dynamic state flags */ @@ -1345,6 +1358,8 @@ enum { EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ EXT4_STATE_NEWENTRY, /* File just added to dir */ EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */ + EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read + nolocking */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ @@ -1932,7 +1947,7 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p); /* fsync.c */ extern int ext4_sync_file(struct file *, loff_t, loff_t, int); -extern int ext4_flush_completed_IO(struct inode *); +extern int ext4_flush_unwritten_io(struct inode *); /* hash.c */ extern int ext4fs_dirhash(const char *name, int len, struct @@ -1966,6 +1981,8 @@ extern void ext4_exit_mballoc(void); extern void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t block, unsigned long count, int flags); +extern int ext4_mb_alloc_groupinfo(struct super_block *sb, + ext4_group_t ngroups); extern int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t i, struct ext4_group_desc *desc); extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, @@ -2051,6 +2068,8 @@ extern void ext4_superblock_csum_set(struct super_block *sb, extern void *ext4_kvmalloc(size_t size, gfp_t flags); extern void *ext4_kvzalloc(size_t size, gfp_t flags); extern void ext4_kvfree(void *ptr); +extern int ext4_alloc_flex_bg_array(struct super_block *sb, + ext4_group_t ngroup); extern __printf(4, 5) void __ext4_error(struct super_block *, const char *, unsigned int, const char *, ...); @@ -2352,6 +2371,7 @@ extern const struct file_operations ext4_dir_operations; extern const struct inode_operations ext4_file_inode_operations; extern const struct file_operations ext4_file_operations; extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); +extern void ext4_unwritten_wait(struct inode *inode); /* namei.c */ extern const struct inode_operations ext4_dir_inode_operations; @@ -2400,11 +2420,11 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, /* page-io.c */ extern int __init ext4_init_pageio(void); +extern void ext4_add_complete_io(ext4_io_end_t *io_end); extern void ext4_exit_pageio(void); extern void ext4_ioend_wait(struct inode *); extern void ext4_free_io_end(ext4_io_end_t *io); extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); -extern int ext4_end_io_nolock(ext4_io_end_t *io); extern void ext4_io_submit(struct ext4_io_submit *io); extern int ext4_bio_write_page(struct ext4_io_submit *io, struct page *page, @@ -2452,6 +2472,21 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh) set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); } +/* + * Disable DIO read nolock optimization, so new dioreaders will be forced + * to grab i_mutex + */ +static inline void ext4_inode_block_unlocked_dio(struct inode *inode) +{ + ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK); + smp_mb(); +} +static inline void ext4_inode_resume_unlocked_dio(struct inode *inode) +{ + smp_mb(); + ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK); +} + #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) /* For ioend & aio unwritten conversion wait queues */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index aabbb3f..1c94cca 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1177,7 +1177,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), ext4_idx_pblock(EXT_FIRST_INDEX(neh))); - neh->eh_depth = cpu_to_le16(le16_to_cpu(neh->eh_depth) + 1); + le16_add_cpu(&neh->eh_depth, 1); ext4_mark_inode_dirty(handle, inode); out: brelse(bh); @@ -1656,16 +1656,60 @@ static int ext4_ext_try_to_merge_right(struct inode *inode, } /* + * This function does a very simple check to see if we can collapse + * an extent tree with a single extent tree leaf block into the inode. + */ +static void ext4_ext_try_to_merge_up(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path) +{ + size_t s; + unsigned max_root = ext4_ext_space_root(inode, 0); + ext4_fsblk_t blk; + + if ((path[0].p_depth != 1) || + (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) || + (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root)) + return; + + /* + * We need to modify the block allocation bitmap and the block + * group descriptor to release the extent tree block. If we + * can't get the journal credits, give up. + */ + if (ext4_journal_extend(handle, 2)) + return; + + /* + * Copy the extent data up to the inode + */ + blk = ext4_idx_pblock(path[0].p_idx); + s = le16_to_cpu(path[1].p_hdr->eh_entries) * + sizeof(struct ext4_extent_idx); + s += sizeof(struct ext4_extent_header); + + memcpy(path[0].p_hdr, path[1].p_hdr, s); + path[0].p_depth = 0; + path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) + + (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr)); + path[0].p_hdr->eh_max = cpu_to_le16(max_root); + + brelse(path[1].p_bh); + ext4_free_blocks(handle, inode, NULL, blk, 1, + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); +} + +/* * This function tries to merge the @ex extent to neighbours in the tree. * return 1 if merge left else 0. */ -static int ext4_ext_try_to_merge(struct inode *inode, +static void ext4_ext_try_to_merge(handle_t *handle, + struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *ex) { struct ext4_extent_header *eh; unsigned int depth; int merge_done = 0; - int ret = 0; depth = ext_depth(inode); BUG_ON(path[depth].p_hdr == NULL); @@ -1675,9 +1719,9 @@ static int ext4_ext_try_to_merge(struct inode *inode, merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1); if (!merge_done) - ret = ext4_ext_try_to_merge_right(inode, path, ex); + (void) ext4_ext_try_to_merge_right(inode, path, ex); - return ret; + ext4_ext_try_to_merge_up(handle, inode, path); } /* @@ -1893,7 +1937,7 @@ has_space: merge: /* try to merge extents */ if (!(flag & EXT4_GET_BLOCKS_PRE_IO)) - ext4_ext_try_to_merge(inode, path, nearex); + ext4_ext_try_to_merge(handle, inode, path, nearex); /* time to correct all indexes above */ @@ -1901,7 +1945,7 @@ merge: if (err) goto cleanup; - err = ext4_ext_dirty(handle, inode, path + depth); + err = ext4_ext_dirty(handle, inode, path + path->p_depth); cleanup: if (npath) { @@ -2092,13 +2136,10 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, } /* - * ext4_ext_check_cache() + * ext4_ext_in_cache() * Checks to see if the given block is in the cache. * If it is, the cached extent is stored in the given - * cache extent pointer. If the cached extent is a hole, - * this routine should be used instead of - * ext4_ext_in_cache if the calling function needs to - * know the size of the hole. + * cache extent pointer. * * @inode: The files inode * @block: The block to look for in the cache @@ -2107,8 +2148,10 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, * * Return 0 if cache is invalid; 1 if the cache is valid */ -static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block, - struct ext4_ext_cache *ex){ +static int +ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, + struct ext4_extent *ex) +{ struct ext4_ext_cache *cex; struct ext4_sb_info *sbi; int ret = 0; @@ -2125,7 +2168,9 @@ static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block, goto errout; if (in_range(block, cex->ec_block, cex->ec_len)) { - memcpy(ex, cex, sizeof(struct ext4_ext_cache)); + ex->ee_block = cpu_to_le32(cex->ec_block); + ext4_ext_store_pblock(ex, cex->ec_start); + ex->ee_len = cpu_to_le16(cex->ec_len); ext_debug("%u cached by %u:%u:%llu\n", block, cex->ec_block, cex->ec_len, cex->ec_start); @@ -2138,37 +2183,6 @@ errout: } /* - * ext4_ext_in_cache() - * Checks to see if the given block is in the cache. - * If it is, the cached extent is stored in the given - * extent pointer. - * - * @inode: The files inode - * @block: The block to look for in the cache - * @ex: Pointer where the cached extent will be stored - * if it contains block - * - * Return 0 if cache is invalid; 1 if the cache is valid - */ -static int -ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, - struct ext4_extent *ex) -{ - struct ext4_ext_cache cex; - int ret = 0; - - if (ext4_ext_check_cache(inode, block, &cex)) { - ex->ee_block = cpu_to_le32(cex.ec_block); - ext4_ext_store_pblock(ex, cex.ec_start); - ex->ee_len = cpu_to_le16(cex.ec_len); - ret = 1; - } - - return ret; -} - - -/* * ext4_ext_rm_idx: * removes index from the index block. */ @@ -2274,10 +2288,13 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned short ee_len = ext4_ext_get_actual_len(ex); ext4_fsblk_t pblk; - int flags = EXT4_FREE_BLOCKS_FORGET; + int flags = 0; if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - flags |= EXT4_FREE_BLOCKS_METADATA; + flags |= EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; + else if (ext4_should_journal_data(inode)) + flags |= EXT4_FREE_BLOCKS_FORGET; + /* * For bigalloc file systems, we never free a partial cluster * at the beginning of the extent. Instead, we make a note @@ -2572,7 +2589,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, struct ext4_ext_path *path = NULL; ext4_fsblk_t partial_cluster = 0; handle_t *handle; - int i = 0, err; + int i = 0, err = 0; ext_debug("truncate since %u to %u\n", start, end); @@ -2604,12 +2621,16 @@ again: return PTR_ERR(path); } depth = ext_depth(inode); + /* Leaf not may not exist only if inode has no blocks at all */ ex = path[depth].p_ext; if (!ex) { - ext4_ext_drop_refs(path); - kfree(path); - path = NULL; - goto cont; + if (depth) { + EXT4_ERROR_INODE(inode, + "path[%d].p_hdr == NULL", + depth); + err = -EIO; + } + goto out; } ee_block = le32_to_cpu(ex->ee_block); @@ -2641,8 +2662,6 @@ again: goto out; } } -cont: - /* * We start scanning from right side, freeing all the blocks * after i_size and walking into the tree depth-wise. @@ -2924,9 +2943,9 @@ static int ext4_split_extent_at(handle_t *handle, ext4_ext_mark_initialized(ex); if (!(flags & EXT4_GET_BLOCKS_PRE_IO)) - ext4_ext_try_to_merge(inode, path, ex); + ext4_ext_try_to_merge(handle, inode, path, ex); - err = ext4_ext_dirty(handle, inode, path + depth); + err = ext4_ext_dirty(handle, inode, path + path->p_depth); goto out; } @@ -2958,8 +2977,8 @@ static int ext4_split_extent_at(handle_t *handle, goto fix_extent_len; /* update the extent length and mark as initialized */ ex->ee_len = cpu_to_le16(ee_len); - ext4_ext_try_to_merge(inode, path, ex); - err = ext4_ext_dirty(handle, inode, path + depth); + ext4_ext_try_to_merge(handle, inode, path, ex); + err = ext4_ext_dirty(handle, inode, path + path->p_depth); goto out; } else if (err) goto fix_extent_len; @@ -3041,7 +3060,6 @@ out: return err ? err : map->m_len; } -#define EXT4_EXT_ZERO_LEN 7 /* * This function is called by ext4_ext_map_blocks() if someone tries to write * to an uninitialized extent. It may result in splitting the uninitialized @@ -3067,13 +3085,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, struct ext4_map_blocks *map, struct ext4_ext_path *path) { + struct ext4_sb_info *sbi; struct ext4_extent_header *eh; struct ext4_map_blocks split_map; struct ext4_extent zero_ex; struct ext4_extent *ex; ext4_lblk_t ee_block, eof_block; unsigned int ee_len, depth; - int allocated; + int allocated, max_zeroout = 0; int err = 0; int split_flag = 0; @@ -3081,6 +3100,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, "block %llu, max_blocks %u\n", inode->i_ino, (unsigned long long)map->m_lblk, map->m_len); + sbi = EXT4_SB(inode->i_sb); eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; if (eof_block < map->m_lblk + map->m_len) @@ -3180,9 +3200,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, */ split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; - /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */ - if (ee_len <= 2*EXT4_EXT_ZERO_LEN && - (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + if (EXT4_EXT_MAY_ZEROOUT & split_flag) + max_zeroout = sbi->s_extent_max_zeroout_kb >> + inode->i_sb->s_blocksize_bits; + + /* If extent is less than s_max_zeroout_kb, zeroout directly */ + if (max_zeroout && (ee_len <= max_zeroout)) { err = ext4_ext_zeroout(inode, ex); if (err) goto out; @@ -3191,8 +3214,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, if (err) goto out; ext4_ext_mark_initialized(ex); - ext4_ext_try_to_merge(inode, path, ex); - err = ext4_ext_dirty(handle, inode, path + depth); + ext4_ext_try_to_merge(handle, inode, path, ex); + err = ext4_ext_dirty(handle, inode, path + path->p_depth); goto out; } @@ -3206,9 +3229,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, split_map.m_lblk = map->m_lblk; split_map.m_len = map->m_len; - if (allocated > map->m_len) { - if (allocated <= EXT4_EXT_ZERO_LEN && - (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + if (max_zeroout && (allocated > map->m_len)) { + if (allocated <= max_zeroout) { /* case 3 */ zero_ex.ee_block = cpu_to_le32(map->m_lblk); @@ -3220,9 +3242,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, goto out; split_map.m_lblk = map->m_lblk; split_map.m_len = allocated; - } else if ((map->m_lblk - ee_block + map->m_len < - EXT4_EXT_ZERO_LEN) && - (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + } else if (map->m_lblk - ee_block + map->m_len < max_zeroout) { /* case 2 */ if (map->m_lblk != ee_block) { zero_ex.ee_block = ex->ee_block; @@ -3242,7 +3262,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, } allocated = ext4_split_extent(handle, inode, path, - &split_map, split_flag, 0); + &split_map, split_flag, 0); if (allocated < 0) err = allocated; @@ -3256,7 +3276,7 @@ out: * to an uninitialized extent. * * Writing to an uninitialized extent may result in splitting the uninitialized - * extent into multiple /initialized uninitialized extents (up to three) + * extent into multiple initialized/uninitialized extents (up to three) * There are three possibilities: * a> There is no split required: Entire extent should be uninitialized * b> Splits in two extents: Write is happening at either end of the extent @@ -3333,10 +3353,10 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, /* note: ext4_ext_correct_indexes() isn't needed here because * borders are not changed */ - ext4_ext_try_to_merge(inode, path, ex); + ext4_ext_try_to_merge(handle, inode, path, ex); /* Mark modified extent as dirty */ - err = ext4_ext_dirty(handle, inode, path + depth); + err = ext4_ext_dirty(handle, inode, path + path->p_depth); out: ext4_ext_show_leaf(inode, path); return err; @@ -3600,7 +3620,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, { int ret = 0; int err = 0; - ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; + ext4_io_end_t *io = ext4_inode_aio(inode); ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical " "block %llu, max_blocks %u, flags %x, allocated %u\n", @@ -3615,6 +3635,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { ret = ext4_split_unwritten_extents(handle, inode, map, path, flags); + if (ret <= 0) + goto out; /* * Flag the inode(non aio case) or end_io struct (aio case) * that this IO needs to conversion to written when IO is @@ -3858,8 +3880,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; struct ext4_allocation_request ar; - ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; + ext4_io_end_t *io = ext4_inode_aio(inode); ext4_lblk_t cluster_offset; + int set_unwritten = 0; ext_debug("blocks %u/%u requested for inode %lu\n", map->m_lblk, map->m_len, inode->i_ino); @@ -4082,13 +4105,8 @@ got_allocated_blocks: * For non asycn direct IO case, flag the inode state * that we need to perform conversion when IO is done. */ - if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { - if (io) - ext4_set_io_unwritten_flag(inode, io); - else - ext4_set_inode_state(inode, - EXT4_STATE_DIO_UNWRITTEN); - } + if ((flags & EXT4_GET_BLOCKS_PRE_IO)) + set_unwritten = 1; if (ext4_should_dioread_nolock(inode)) map->m_flags |= EXT4_MAP_UNINIT; } @@ -4100,6 +4118,15 @@ got_allocated_blocks: if (!err) err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + + if (!err && set_unwritten) { + if (io) + ext4_set_io_unwritten_flag(inode, io); + else + ext4_set_inode_state(inode, + EXT4_STATE_DIO_UNWRITTEN); + } + if (err && free_on_err) { int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; @@ -4241,7 +4268,7 @@ void ext4_ext_truncate(struct inode *inode) * finish any pending end_io work so we won't run the risk of * converting any truncated blocks to initialized later */ - ext4_flush_completed_IO(inode); + ext4_flush_unwritten_io(inode); /* * probably first extent we're gonna free will be last in block @@ -4769,9 +4796,32 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) loff_t first_page_offset, last_page_offset; int credits, err = 0; + /* + * Write out all dirty pages to avoid race conditions + * Then release them. + */ + if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + err = filemap_write_and_wait_range(mapping, + offset, offset + length - 1); + + if (err) + return err; + } + + mutex_lock(&inode->i_mutex); + /* It's not possible punch hole on append only file */ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { + err = -EPERM; + goto out_mutex; + } + if (IS_SWAPFILE(inode)) { + err = -ETXTBSY; + goto out_mutex; + } + /* No need to punch hole beyond i_size */ if (offset >= inode->i_size) - return 0; + goto out_mutex; /* * If the hole extends beyond i_size, set the hole @@ -4789,35 +4839,26 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) first_page_offset = first_page << PAGE_CACHE_SHIFT; last_page_offset = last_page << PAGE_CACHE_SHIFT; - /* - * Write out all dirty pages to avoid race conditions - * Then release them. - */ - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - err = filemap_write_and_wait_range(mapping, - offset, offset + length - 1); - - if (err) - return err; - } - /* Now release the pages */ if (last_page_offset > first_page_offset) { truncate_pagecache_range(inode, first_page_offset, last_page_offset - 1); } - /* finish any pending end_io work */ - ext4_flush_completed_IO(inode); + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + err = ext4_flush_unwritten_io(inode); + if (err) + goto out_dio; + inode_dio_wait(inode); credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, credits); - if (IS_ERR(handle)) - return PTR_ERR(handle); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto out_dio; + } - err = ext4_orphan_add(handle, inode); - if (err) - goto out; /* * Now we need to zero out the non-page-aligned data in the @@ -4903,10 +4944,13 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) up_write(&EXT4_I(inode)->i_data_sem); out: - ext4_orphan_del(handle, inode); inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); +out_dio: + ext4_inode_resume_unlocked_dio(inode); +out_mutex: + mutex_unlock(&inode->i_mutex); return err; } int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 3b0e3bd..bf3966b 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -55,11 +55,11 @@ static int ext4_release_file(struct inode *inode, struct file *filp) return 0; } -static void ext4_aiodio_wait(struct inode *inode) +void ext4_unwritten_wait(struct inode *inode) { wait_queue_head_t *wq = ext4_ioend_wq(inode); - wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0)); + wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0)); } /* @@ -116,7 +116,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov, "performance will be poor.", inode->i_ino, current->comm); mutex_lock(ext4_aio_mutex(inode)); - ext4_aiodio_wait(inode); + ext4_unwritten_wait(inode); } BUG_ON(iocb->ki_pos != pos); @@ -207,6 +207,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, static const struct vm_operations_struct ext4_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = ext4_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) @@ -217,7 +218,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) return -ENOEXEC; file_accessed(file); vma->vm_ops = &ext4_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 2a1dcea..be1d89f 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -34,87 +34,6 @@ #include <trace/events/ext4.h> -static void dump_completed_IO(struct inode * inode) -{ -#ifdef EXT4FS_DEBUG - struct list_head *cur, *before, *after; - ext4_io_end_t *io, *io0, *io1; - unsigned long flags; - - if (list_empty(&EXT4_I(inode)->i_completed_io_list)){ - ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino); - return; - } - - ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino); - spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); - list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){ - cur = &io->list; - before = cur->prev; - io0 = container_of(before, ext4_io_end_t, list); - after = cur->next; - io1 = container_of(after, ext4_io_end_t, list); - - ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", - io, inode->i_ino, io0, io1); - } - spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); -#endif -} - -/* - * This function is called from ext4_sync_file(). - * - * When IO is completed, the work to convert unwritten extents to - * written is queued on workqueue but may not get immediately - * scheduled. When fsync is called, we need to ensure the - * conversion is complete before fsync returns. - * The inode keeps track of a list of pending/completed IO that - * might needs to do the conversion. This function walks through - * the list and convert the related unwritten extents for completed IO - * to written. - * The function return the number of pending IOs on success. - */ -int ext4_flush_completed_IO(struct inode *inode) -{ - ext4_io_end_t *io; - struct ext4_inode_info *ei = EXT4_I(inode); - unsigned long flags; - int ret = 0; - int ret2 = 0; - - dump_completed_IO(inode); - spin_lock_irqsave(&ei->i_completed_io_lock, flags); - while (!list_empty(&ei->i_completed_io_list)){ - io = list_entry(ei->i_completed_io_list.next, - ext4_io_end_t, list); - list_del_init(&io->list); - io->flag |= EXT4_IO_END_IN_FSYNC; - /* - * Calling ext4_end_io_nolock() to convert completed - * IO to written. - * - * When ext4_sync_file() is called, run_queue() may already - * about to flush the work corresponding to this io structure. - * It will be upset if it founds the io structure related - * to the work-to-be schedule is freed. - * - * Thus we need to keep the io structure still valid here after - * conversion finished. The io structure has a flag to - * avoid double converting from both fsync and background work - * queue work. - */ - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - ret = ext4_end_io_nolock(io); - if (ret < 0) - ret2 = ret; - spin_lock_irqsave(&ei->i_completed_io_lock, flags); - io->flag &= ~EXT4_IO_END_IN_FSYNC; - } - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - return (ret2 < 0) ? ret2 : 0; -} - /* * If we're not journaling and this is a just-created file, we have to * sync our parent directory (if it was freshly created) since @@ -203,7 +122,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct inode *inode = file->f_mapping->host; struct ext4_inode_info *ei = EXT4_I(inode); journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; - int ret; + int ret, err; tid_t commit_tid; bool needs_barrier = false; @@ -219,7 +138,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (inode->i_sb->s_flags & MS_RDONLY) goto out; - ret = ext4_flush_completed_IO(inode); + ret = ext4_flush_unwritten_io(inode); if (ret < 0) goto out; @@ -255,8 +174,11 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) needs_barrier = true; jbd2_log_start_commit(journal, commit_tid); ret = jbd2_log_wait_commit(journal, commit_tid); - if (needs_barrier) - blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + if (needs_barrier) { + err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + if (!ret) + ret = err; + } out: mutex_unlock(&inode->i_mutex); trace_ext4_sync_file_exit(inode, ret); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 26154b8..fa36372f 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -697,6 +697,15 @@ got_group: if (!gdp) goto fail; + /* + * Check free inodes count before loading bitmap. + */ + if (ext4_free_inodes_count(sb, gdp) == 0) { + if (++group == ngroups) + group = 0; + continue; + } + brelse(inode_bitmap_bh); inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); if (!inode_bitmap_bh) diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 830e1b2..792e388 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -807,16 +807,30 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, retry: if (rw == READ && ext4_should_dioread_nolock(inode)) { - if (unlikely(!list_empty(&ei->i_completed_io_list))) { + if (unlikely(atomic_read(&EXT4_I(inode)->i_unwritten))) { mutex_lock(&inode->i_mutex); - ext4_flush_completed_IO(inode); + ext4_flush_unwritten_io(inode); mutex_unlock(&inode->i_mutex); } + /* + * Nolock dioread optimization may be dynamically disabled + * via ext4_inode_block_unlocked_dio(). Check inode's state + * while holding extra i_dio_count ref. + */ + atomic_inc(&inode->i_dio_count); + smp_mb(); + if (unlikely(ext4_test_inode_state(inode, + EXT4_STATE_DIOREAD_LOCK))) { + inode_dio_done(inode); + goto locked; + } ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, ext4_get_block, NULL, NULL, 0); + inode_dio_done(inode); } else { +locked: ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, ext4_get_block); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index dff171c..b3c243b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -732,11 +732,13 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, err = ext4_map_blocks(handle, inode, &map, create ? EXT4_GET_BLOCKS_CREATE : 0); + /* ensure we send some value back into *errp */ + *errp = 0; + if (err < 0) *errp = err; if (err <= 0) return NULL; - *errp = 0; bh = sb_getblk(inode->i_sb, map.m_pblk); if (!bh) { @@ -1954,9 +1956,6 @@ out: return ret; } -static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); -static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); - /* * Note that we don't need to start a transaction unless we're journaling data * because we should have holes filled from ext4_page_mkwrite(). We even don't @@ -2463,6 +2462,16 @@ static int ext4_nonda_switch(struct super_block *sb) free_blocks = EXT4_C2B(sbi, percpu_counter_read_positive(&sbi->s_freeclusters_counter)); dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); + /* + * Start pushing delalloc when 1/2 of free blocks are dirty. + */ + if (dirty_blocks && (free_blocks < 2 * dirty_blocks) && + !writeback_in_progress(sb->s_bdi) && + down_read_trylock(&sb->s_umount)) { + writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); + up_read(&sb->s_umount); + } + if (2 * free_blocks < 3 * dirty_blocks || free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) { /* @@ -2471,13 +2480,6 @@ static int ext4_nonda_switch(struct super_block *sb) */ return 1; } - /* - * Even if we don't switch but are nearing capacity, - * start pushing delalloc when 1/2 of free blocks are dirty. - */ - if (free_blocks < 2 * dirty_blocks) - writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE); - return 0; } @@ -2879,9 +2881,6 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, { struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; ext4_io_end_t *io_end = iocb->private; - struct workqueue_struct *wq; - unsigned long flags; - struct ext4_inode_info *ei; /* if not async direct IO or dio with 0 bytes write, just return */ if (!io_end || !size) @@ -2910,24 +2909,14 @@ out: io_end->iocb = iocb; io_end->result = ret; } - wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; - /* Add the io_end to per-inode completed aio dio list*/ - ei = EXT4_I(io_end->inode); - spin_lock_irqsave(&ei->i_completed_io_lock, flags); - list_add_tail(&io_end->list, &ei->i_completed_io_list); - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - - /* queue the work to convert unwritten extents to written */ - queue_work(wq, &io_end->work); + ext4_add_complete_io(io_end); } static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) { ext4_io_end_t *io_end = bh->b_private; - struct workqueue_struct *wq; struct inode *inode; - unsigned long flags; if (!test_clear_buffer_uninit(bh) || !io_end) goto out; @@ -2946,15 +2935,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) */ inode = io_end->inode; ext4_set_io_unwritten_flag(inode, io_end); - - /* Add the io_end to per-inode completed io list*/ - spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); - list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); - spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); - - wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; - /* queue the work to convert unwritten extents to written */ - queue_work(wq, &io_end->work); + ext4_add_complete_io(io_end); out: bh->b_private = NULL; bh->b_end_io = NULL; @@ -3029,6 +3010,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, overwrite = *((int *)iocb->private); if (overwrite) { + atomic_inc(&inode->i_dio_count); down_read(&EXT4_I(inode)->i_data_sem); mutex_unlock(&inode->i_mutex); } @@ -3054,7 +3036,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, * hook to the iocb. */ iocb->private = NULL; - EXT4_I(inode)->cur_aio_dio = NULL; + ext4_inode_aio_set(inode, NULL); if (!is_sync_kiocb(iocb)) { ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS); @@ -3071,7 +3053,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, * is a unwritten extents needs to be converted * when IO is completed. */ - EXT4_I(inode)->cur_aio_dio = iocb->private; + ext4_inode_aio_set(inode, io_end); } if (overwrite) @@ -3091,7 +3073,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, NULL, DIO_LOCKING); if (iocb->private) - EXT4_I(inode)->cur_aio_dio = NULL; + ext4_inode_aio_set(inode, NULL); /* * The io_end structure takes a reference to the inode, * that structure needs to be destroyed and the @@ -3126,6 +3108,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, retake_lock: /* take i_mutex locking again if we do a ovewrite dio */ if (overwrite) { + inode_dio_done(inode); up_read(&EXT4_I(inode)->i_data_sem); mutex_lock(&inode->i_mutex); } @@ -3313,7 +3296,7 @@ int ext4_discard_partial_page_buffers(handle_t *handle, * handle: The journal handle * inode: The files inode * page: A locked page that contains the offset "from" - * from: The starting byte offset (from the begining of the file) + * from: The starting byte offset (from the beginning of the file) * to begin discarding * len: The length of bytes to discard * flags: Optional flags that may be used: @@ -3321,11 +3304,11 @@ int ext4_discard_partial_page_buffers(handle_t *handle, * EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED * Only zero the regions of the page whose buffer heads * have already been unmapped. This flag is appropriate - * for updateing the contents of a page whose blocks may + * for updating the contents of a page whose blocks may * have already been released, and we only want to zero * out the regions that correspond to those released blocks. * - * Returns zero on sucess or negative on failure. + * Returns zero on success or negative on failure. */ static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, struct inode *inode, struct page *page, loff_t from, @@ -3486,7 +3469,7 @@ int ext4_can_truncate(struct inode *inode) * @offset: The offset where the hole will begin * @len: The length of the hole * - * Returns: 0 on sucess or negative on failure + * Returns: 0 on success or negative on failure */ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) @@ -4008,7 +3991,7 @@ static int ext4_inode_blocks_set(handle_t *handle, if (i_blocks <= ~0U) { /* - * i_blocks can be represnted in a 32 bit variable + * i_blocks can be represented in a 32 bit variable * as multiple of 512 bytes */ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); @@ -4052,6 +4035,7 @@ static int ext4_do_update_inode(handle_t *handle, struct ext4_inode_info *ei = EXT4_I(inode); struct buffer_head *bh = iloc->bh; int err = 0, rc, block; + int need_datasync = 0; uid_t i_uid; gid_t i_gid; @@ -4102,7 +4086,10 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_file_acl_high = cpu_to_le16(ei->i_file_acl >> 32); raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); - ext4_isize_set(raw_inode, ei->i_disksize); + if (ei->i_disksize != ext4_isize(raw_inode)) { + ext4_isize_set(raw_inode, ei->i_disksize); + need_datasync = 1; + } if (ei->i_disksize > 0x7fffffffULL) { struct super_block *sb = inode->i_sb; if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, @@ -4155,7 +4142,7 @@ static int ext4_do_update_inode(handle_t *handle, err = rc; ext4_clear_inode_state(inode, EXT4_STATE_NEW); - ext4_update_inode_fsync_trans(handle, inode, 0); + ext4_update_inode_fsync_trans(handle, inode, need_datasync); out_brelse: brelse(bh); ext4_std_error(inode->i_sb, err); @@ -4169,7 +4156,7 @@ out_brelse: * * - Within generic_file_write() for O_SYNC files. * Here, there will be no transaction running. We wait for any running - * trasnaction to commit. + * transaction to commit. * * - Within sys_sync(), kupdate and such. * We wait on commit, if tol to. @@ -4298,7 +4285,6 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) } if (attr->ia_valid & ATTR_SIZE) { - inode_dio_wait(inode); if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -4347,8 +4333,17 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) } if (attr->ia_valid & ATTR_SIZE) { - if (attr->ia_size != i_size_read(inode)) + if (attr->ia_size != i_size_read(inode)) { truncate_setsize(inode, attr->ia_size); + /* Inode size will be reduced, wait for dio in flight. + * Temporarily disable dioread_nolock to prevent + * livelock. */ + if (orphan) { + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + ext4_inode_resume_unlocked_dio(inode); + } + } ext4_truncate(inode); } @@ -4413,7 +4408,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) * worse case, the indexs blocks spread over different block groups * * If datablocks are discontiguous, they are possible to spread over - * different block groups too. If they are contiuguous, with flexbg, + * different block groups too. If they are contiguous, with flexbg, * they could still across block group boundary. * * Also account for superblock, inode, quota and xattr blocks @@ -4727,6 +4722,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) return err; } + /* Wait for all existing dio workers */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + jbd2_journal_lock_updates(journal); /* @@ -4746,6 +4745,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) ext4_set_aops(inode); jbd2_journal_unlock_updates(journal); + ext4_inode_resume_unlocked_dio(inode); /* Finally we can mark the inode as dirty. */ @@ -4780,6 +4780,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) int retries = 0; sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); /* Delalloc case is easy... */ if (test_opt(inode->i_sb, DELALLOC) && !ext4_should_journal_data(inode) && diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 7f7dad7..5747f52 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -233,7 +233,7 @@ group_extend_out: case EXT4_IOC_MOVE_EXT: { struct move_extent me; - struct file *donor_filp; + struct fd donor; int err; if (!(filp->f_mode & FMODE_READ) || @@ -245,11 +245,11 @@ group_extend_out: return -EFAULT; me.moved_len = 0; - donor_filp = fget(me.donor_fd); - if (!donor_filp) + donor = fdget(me.donor_fd); + if (!donor.file) return -EBADF; - if (!(donor_filp->f_mode & FMODE_WRITE)) { + if (!(donor.file->f_mode & FMODE_WRITE)) { err = -EBADF; goto mext_out; } @@ -258,14 +258,15 @@ group_extend_out: EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { ext4_msg(sb, KERN_ERR, "Online defrag not supported with bigalloc"); - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto mext_out; } err = mnt_want_write_file(filp); if (err) goto mext_out; - err = ext4_move_extents(filp, donor_filp, me.orig_start, + err = ext4_move_extents(filp, donor.file, me.orig_start, me.donor_start, me.len, &me.moved_len); mnt_drop_write_file(filp); @@ -273,7 +274,7 @@ group_extend_out: &me, sizeof(me))) err = -EFAULT; mext_out: - fput(donor_filp); + fdput(donor); return err; } @@ -365,26 +366,11 @@ group_add_out: return -EOPNOTSUPP; } - if (EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_META_BG)) { - ext4_msg(sb, KERN_ERR, - "Online resizing not (yet) supported with meta_bg"); - return -EOPNOTSUPP; - } - if (copy_from_user(&n_blocks_count, (__u64 __user *)arg, sizeof(__u64))) { return -EFAULT; } - if (n_blocks_count > MAX_32_NUM && - !EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_64BIT)) { - ext4_msg(sb, KERN_ERR, - "File system only supports 32-bit block numbers"); - return -EOPNOTSUPP; - } - err = ext4_resize_begin(sb); if (err) return err; @@ -419,13 +405,6 @@ resizefs_out: if (!blk_queue_discard(q)) return -EOPNOTSUPP; - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { - ext4_msg(sb, KERN_ERR, - "FITRIM not supported with bigalloc"); - return -EOPNOTSUPP; - } - if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range))) return -EFAULT; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 8eae947..f8b27bf 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -24,6 +24,7 @@ #include "ext4_jbd2.h" #include "mballoc.h" #include <linux/debugfs.h> +#include <linux/log2.h> #include <linux/slab.h> #include <trace/events/ext4.h> @@ -1338,17 +1339,17 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, mb_check_buddy(e4b); } -static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, +static int mb_find_extent(struct ext4_buddy *e4b, int block, int needed, struct ext4_free_extent *ex) { int next = block; - int max; + int max, order; void *buddy; assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); BUG_ON(ex == NULL); - buddy = mb_find_buddy(e4b, order, &max); + buddy = mb_find_buddy(e4b, 0, &max); BUG_ON(buddy == NULL); BUG_ON(block >= max); if (mb_test_bit(block, buddy)) { @@ -1358,12 +1359,9 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, return 0; } - /* FIXME dorp order completely ? */ - if (likely(order == 0)) { - /* find actual order */ - order = mb_find_order_for_block(e4b, block); - block = block >> order; - } + /* find actual order */ + order = mb_find_order_for_block(e4b, block); + block = block >> order; ex->fe_len = 1 << order; ex->fe_start = block << order; @@ -1549,7 +1547,7 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac, /* recheck chunk's availability - we don't know * when it was found (within this lock-unlock * period or not) */ - max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex); + max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex); if (max >= gex->fe_len) { ext4_mb_use_best_found(ac, e4b); return; @@ -1641,7 +1639,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac, return err; ext4_lock_group(ac->ac_sb, group); - max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex); + max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex); if (max > 0) { ac->ac_b_ex = ex; @@ -1662,17 +1660,20 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, int max; int err; struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); struct ext4_free_extent ex; if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) return 0; + if (grp->bb_free == 0) + return 0; err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); if (err) return err; ext4_lock_group(ac->ac_sb, group); - max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start, + max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, ac->ac_g_ex.fe_len, &ex); if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { @@ -1788,7 +1789,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, break; } - mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); + mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex); BUG_ON(ex.fe_len <= 0); if (free < ex.fe_len) { ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, @@ -1840,7 +1841,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { if (!mb_test_bit(i, bitmap)) { - max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex); + max = mb_find_extent(e4b, i, sbi->s_stripe, &ex); if (max >= sbi->s_stripe) { ac->ac_found++; ac->ac_b_ex = ex; @@ -1862,6 +1863,12 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, BUG_ON(cr < 0 || cr >= 4); + free = grp->bb_free; + if (free == 0) + return 0; + if (cr <= 2 && free < ac->ac_g_ex.fe_len) + return 0; + /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { int ret = ext4_mb_init_group(ac->ac_sb, group); @@ -1869,10 +1876,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, return 0; } - free = grp->bb_free; fragments = grp->bb_fragments; - if (free == 0) - return 0; if (fragments == 0) return 0; @@ -2163,6 +2167,39 @@ static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) return cachep; } +/* + * Allocate the top-level s_group_info array for the specified number + * of groups + */ +int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned size; + struct ext4_group_info ***new_groupinfo; + + size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> + EXT4_DESC_PER_BLOCK_BITS(sb); + if (size <= sbi->s_group_info_size) + return 0; + + size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size); + new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL); + if (!new_groupinfo) { + ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); + return -ENOMEM; + } + if (sbi->s_group_info) { + memcpy(new_groupinfo, sbi->s_group_info, + sbi->s_group_info_size * sizeof(*sbi->s_group_info)); + ext4_kvfree(sbi->s_group_info); + } + sbi->s_group_info = new_groupinfo; + sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); + ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", + sbi->s_group_info_size); + return 0; +} + /* Create and initialize ext4_group_info data for the given group. */ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, struct ext4_group_desc *desc) @@ -2195,12 +2232,11 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); - meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); + meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_KERNEL); if (meta_group_info[i] == NULL) { ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); goto exit_group_info; } - memset(meta_group_info[i], 0, kmem_cache_size(cachep)); set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(meta_group_info[i]->bb_state)); @@ -2252,49 +2288,14 @@ static int ext4_mb_init_backend(struct super_block *sb) ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_super_block *es = sbi->s_es; - int num_meta_group_infos; - int num_meta_group_infos_max; - int array_size; + int err; struct ext4_group_desc *desc; struct kmem_cache *cachep; - /* This is the number of blocks used by GDT */ - num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); - - /* - * This is the total number of blocks used by GDT including - * the number of reserved blocks for GDT. - * The s_group_info array is allocated with this value - * to allow a clean online resize without a complex - * manipulation of pointer. - * The drawback is the unused memory when no resize - * occurs but it's very low in terms of pages - * (see comments below) - * Need to handle this properly when META_BG resizing is allowed - */ - num_meta_group_infos_max = num_meta_group_infos + - le16_to_cpu(es->s_reserved_gdt_blocks); + err = ext4_mb_alloc_groupinfo(sb, ngroups); + if (err) + return err; - /* - * array_size is the size of s_group_info array. We round it - * to the next power of two because this approximation is done - * internally by kmalloc so we can have some more memory - * for free here (e.g. may be used for META_BG resize). - */ - array_size = 1; - while (array_size < sizeof(*sbi->s_group_info) * - num_meta_group_infos_max) - array_size = array_size << 1; - /* An 8TB filesystem with 64-bit pointers requires a 4096 byte - * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. - * So a two level scheme suffices for now. */ - sbi->s_group_info = ext4_kvzalloc(array_size, GFP_KERNEL); - if (sbi->s_group_info == NULL) { - ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); - return -ENOMEM; - } sbi->s_buddy_cache = new_inode(sb); if (sbi->s_buddy_cache == NULL) { ext4_msg(sb, KERN_ERR, "can't get new inode"); @@ -2322,7 +2323,7 @@ err_freebuddy: cachep = get_groupinfo_cache(sb->s_blocksize_bits); while (i-- > 0) kmem_cache_free(cachep, ext4_get_group_info(sb, i)); - i = num_meta_group_infos; + i = sbi->s_group_info_size; while (i-- > 0) kfree(sbi->s_group_info[i]); iput(sbi->s_buddy_cache); @@ -4008,7 +4009,6 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, ext4_get_group_no_and_offset(sb, goal, &group, &block); /* set up allocation goals */ - memset(ac, 0, sizeof(struct ext4_allocation_context)); ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1); ac->ac_status = AC_STATUS_CONTINUE; ac->ac_sb = sb; @@ -4291,7 +4291,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, } } - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); + ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS); if (!ac) { ar->len = 0; *errp = -ENOMEM; @@ -4657,6 +4657,8 @@ do_more: * with group lock held. generate_buddy look at * them with group lock_held */ + if (test_opt(sb, DISCARD)) + ext4_issue_discard(sb, block_group, bit, count); ext4_lock_group(sb, block_group); mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); mb_free_blocks(inode, &e4b, bit, count_clusters); @@ -4709,7 +4711,7 @@ error_return: * ext4_group_add_blocks() -- Add given blocks to an existing group * @handle: handle to this transaction * @sb: super block - * @block: start physcial block to add to the block group + * @block: start physical block to add to the block group * @count: number of blocks to free * * This marks the blocks as free in the bitmap and buddy. @@ -4988,7 +4990,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) start = range->start >> sb->s_blocksize_bits; end = start + (range->len >> sb->s_blocksize_bits) - 1; - minlen = range->minlen >> sb->s_blocksize_bits; + minlen = EXT4_NUM_B2C(EXT4_SB(sb), + range->minlen >> sb->s_blocksize_bits); if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb)) || unlikely(start >= max_blks)) @@ -5048,6 +5051,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); out: - range->len = trimmed * sb->s_blocksize; + range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits; return ret; } diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index c070618..3ccd889 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -65,11 +65,6 @@ extern u8 mb_enable_debug; #define MB_DEFAULT_MIN_TO_SCAN 10 /* - * How many groups mballoc will scan looking for the best chunk - */ -#define MB_DEFAULT_MAX_GROUPS_TO_SCAN 5 - -/* * with 'ext4_mb_stats' allocator will collect stats that will be * shown at umount. The collecting costs though! */ diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index c5826c6..292daee 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -141,55 +141,21 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, } /** - * mext_check_null_inode - NULL check for two inodes - * - * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. - */ -static int -mext_check_null_inode(struct inode *inode1, struct inode *inode2, - const char *function, unsigned int line) -{ - int ret = 0; - - if (inode1 == NULL) { - __ext4_error(inode2->i_sb, function, line, - "Both inodes should not be NULL: " - "inode1 NULL inode2 %lu", inode2->i_ino); - ret = -EIO; - } else if (inode2 == NULL) { - __ext4_error(inode1->i_sb, function, line, - "Both inodes should not be NULL: " - "inode1 %lu inode2 NULL", inode1->i_ino); - ret = -EIO; - } - return ret; -} - -/** * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem * - * @orig_inode: original inode structure - * @donor_inode: donor inode structure - * Acquire write lock of i_data_sem of the two inodes (orig and donor) by - * i_ino order. + * Acquire write lock of i_data_sem of the two inodes */ static void -double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) +double_down_write_data_sem(struct inode *first, struct inode *second) { - struct inode *first = orig_inode, *second = donor_inode; + if (first < second) { + down_write(&EXT4_I(first)->i_data_sem); + down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING); + } else { + down_write(&EXT4_I(second)->i_data_sem); + down_write_nested(&EXT4_I(first)->i_data_sem, SINGLE_DEPTH_NESTING); - /* - * Use the inode number to provide the stable locking order instead - * of its address, because the C language doesn't guarantee you can - * compare pointers that don't come from the same array. - */ - if (donor_inode->i_ino < orig_inode->i_ino) { - first = donor_inode; - second = orig_inode; } - - down_write(&EXT4_I(first)->i_data_sem); - down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING); } /** @@ -604,9 +570,8 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, diff = donor_off - le32_to_cpu(tmp_dext->ee_block); ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff); - tmp_dext->ee_block = - cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff); - tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff); + le32_add_cpu(&tmp_dext->ee_block, diff); + le16_add_cpu(&tmp_dext->ee_len, -diff); if (max_count < ext4_ext_get_actual_len(tmp_dext)) tmp_dext->ee_len = cpu_to_le16(max_count); @@ -629,6 +594,43 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, } /** + * mext_check_coverage - Check that all extents in range has the same type + * + * @inode: inode in question + * @from: block offset of inode + * @count: block count to be checked + * @uninit: extents expected to be uninitialized + * @err: pointer to save error value + * + * Return 1 if all extents in range has expected type, and zero otherwise. + */ +static int +mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, + int uninit, int *err) +{ + struct ext4_ext_path *path = NULL; + struct ext4_extent *ext; + ext4_lblk_t last = from + count; + while (from < last) { + *err = get_ext_path(inode, from, &path); + if (*err) + return 0; + ext = path[ext_depth(inode)].p_ext; + if (!ext) { + ext4_ext_drop_refs(path); + return 0; + } + if (uninit != ext4_ext_is_uninitialized(ext)) { + ext4_ext_drop_refs(path); + return 0; + } + from += ext4_ext_get_actual_len(ext); + ext4_ext_drop_refs(path); + } + return 1; +} + +/** * mext_replace_branches - Replace original extents with new extents * * @handle: journal handle @@ -663,9 +665,6 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, int replaced_count = 0; int dext_alen; - /* Protect extent trees against block allocations via delalloc */ - double_down_write_data_sem(orig_inode, donor_inode); - /* Get the original extent for the block "orig_off" */ *err = get_ext_path(orig_inode, orig_off, &orig_path); if (*err) @@ -764,12 +763,122 @@ out: ext4_ext_invalidate_cache(orig_inode); ext4_ext_invalidate_cache(donor_inode); - double_up_write_data_sem(orig_inode, donor_inode); - return replaced_count; } /** + * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2 + * + * @inode1: the inode structure + * @inode2: the inode structure + * @index: page index + * @page: result page vector + * + * Grab two locked pages for inode's by inode order + */ +static int +mext_page_double_lock(struct inode *inode1, struct inode *inode2, + pgoff_t index, struct page *page[2]) +{ + struct address_space *mapping[2]; + unsigned fl = AOP_FLAG_NOFS; + + BUG_ON(!inode1 || !inode2); + if (inode1 < inode2) { + mapping[0] = inode1->i_mapping; + mapping[1] = inode2->i_mapping; + } else { + mapping[0] = inode2->i_mapping; + mapping[1] = inode1->i_mapping; + } + + page[0] = grab_cache_page_write_begin(mapping[0], index, fl); + if (!page[0]) + return -ENOMEM; + + page[1] = grab_cache_page_write_begin(mapping[1], index, fl); + if (!page[1]) { + unlock_page(page[0]); + page_cache_release(page[0]); + return -ENOMEM; + } + + if (inode1 > inode2) { + struct page *tmp; + tmp = page[0]; + page[0] = page[1]; + page[1] = tmp; + } + return 0; +} + +/* Force page buffers uptodate w/o dropping page's lock */ +static int +mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + sector_t block; + struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; + unsigned int blocksize, block_start, block_end; + int i, err, nr = 0, partial = 0; + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + + if (PageUptodate(page)) + return 0; + + blocksize = 1 << inode->i_blkbits; + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + + head = page_buffers(page); + block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + for (bh = head, block_start = 0; bh != head || !block_start; + block++, block_start = block_end, bh = bh->b_this_page) { + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (!buffer_uptodate(bh)) + partial = 1; + continue; + } + if (buffer_uptodate(bh)) + continue; + if (!buffer_mapped(bh)) { + int err = 0; + err = ext4_get_block(inode, block, bh, 0); + if (err) { + SetPageError(page); + return err; + } + if (!buffer_mapped(bh)) { + zero_user(page, block_start, blocksize); + if (!err) + set_buffer_uptodate(bh); + continue; + } + } + BUG_ON(nr >= MAX_BUF_PER_PAGE); + arr[nr++] = bh; + } + /* No io required */ + if (!nr) + goto out; + + for (i = 0; i < nr; i++) { + bh = arr[i]; + if (!bh_uptodate_or_lock(bh)) { + err = bh_submit_read(bh); + if (err) + return err; + } + } +out: + if (!partial) + SetPageUptodate(page); + return 0; +} + +/** * move_extent_per_page - Move extent data per page * * @o_filp: file structure of original file @@ -791,26 +900,24 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, int block_len_in_page, int uninit, int *err) { struct inode *orig_inode = o_filp->f_dentry->d_inode; - struct address_space *mapping = orig_inode->i_mapping; - struct buffer_head *bh; - struct page *page = NULL; - const struct address_space_operations *a_ops = mapping->a_ops; + struct page *pagep[2] = {NULL, NULL}; handle_t *handle; ext4_lblk_t orig_blk_offset; long long offs = orig_page_offset << PAGE_CACHE_SHIFT; unsigned long blocksize = orig_inode->i_sb->s_blocksize; unsigned int w_flags = 0; unsigned int tmp_data_size, data_size, replaced_size; - void *fsdata; - int i, jblocks; - int err2 = 0; + int err2, jblocks, retries = 0; int replaced_count = 0; + int from = data_offset_in_page << orig_inode->i_blkbits; int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; /* * It needs twice the amount of ordinary journal buffers because * inode and donor_inode may change each different metadata blocks. */ +again: + *err = 0; jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; handle = ext4_journal_start(orig_inode, jblocks); if (IS_ERR(handle)) { @@ -824,19 +931,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, orig_blk_offset = orig_page_offset * blocks_per_page + data_offset_in_page; - /* - * If orig extent is uninitialized one, - * it's not necessary force the page into memory - * and then force it to be written out again. - * Just swap data blocks between orig and donor. - */ - if (uninit) { - replaced_count = mext_replace_branches(handle, orig_inode, - donor_inode, orig_blk_offset, - block_len_in_page, err); - goto out2; - } - offs = (long long)orig_blk_offset << orig_inode->i_blkbits; /* Calculate data_size */ @@ -858,75 +952,120 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, replaced_size = data_size; - *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags, - &page, &fsdata); + *err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset, + pagep); if (unlikely(*err < 0)) - goto out; - - if (!PageUptodate(page)) { - mapping->a_ops->readpage(o_filp, page); - lock_page(page); - } - + goto stop_journal; /* - * try_to_release_page() doesn't call releasepage in writeback mode. - * We should care about the order of writing to the same file - * by multiple move extent processes. - * It needs to call wait_on_page_writeback() to wait for the - * writeback of the page. + * If orig extent was uninitialized it can become initialized + * at any time after i_data_sem was dropped, in order to + * serialize with delalloc we have recheck extent while we + * hold page's lock, if it is still the case data copy is not + * necessary, just swap data blocks between orig and donor. */ - wait_on_page_writeback(page); + if (uninit) { + double_down_write_data_sem(orig_inode, donor_inode); + /* If any of extents in range became initialized we have to + * fallback to data copying */ + uninit = mext_check_coverage(orig_inode, orig_blk_offset, + block_len_in_page, 1, err); + if (*err) + goto drop_data_sem; - /* Release old bh and drop refs */ - try_to_release_page(page, 0); + uninit &= mext_check_coverage(donor_inode, orig_blk_offset, + block_len_in_page, 1, err); + if (*err) + goto drop_data_sem; + + if (!uninit) { + double_up_write_data_sem(orig_inode, donor_inode); + goto data_copy; + } + if ((page_has_private(pagep[0]) && + !try_to_release_page(pagep[0], 0)) || + (page_has_private(pagep[1]) && + !try_to_release_page(pagep[1], 0))) { + *err = -EBUSY; + goto drop_data_sem; + } + replaced_count = mext_replace_branches(handle, orig_inode, + donor_inode, orig_blk_offset, + block_len_in_page, err); + drop_data_sem: + double_up_write_data_sem(orig_inode, donor_inode); + goto unlock_pages; + } +data_copy: + *err = mext_page_mkuptodate(pagep[0], from, from + replaced_size); + if (*err) + goto unlock_pages; + + /* At this point all buffers in range are uptodate, old mapping layout + * is no longer required, try to drop it now. */ + if ((page_has_private(pagep[0]) && !try_to_release_page(pagep[0], 0)) || + (page_has_private(pagep[1]) && !try_to_release_page(pagep[1], 0))) { + *err = -EBUSY; + goto unlock_pages; + } replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, - orig_blk_offset, block_len_in_page, - &err2); - if (err2) { + orig_blk_offset, + block_len_in_page, err); + if (*err) { if (replaced_count) { block_len_in_page = replaced_count; replaced_size = block_len_in_page << orig_inode->i_blkbits; } else - goto out; + goto unlock_pages; } + /* Perform all necessary steps similar write_begin()/write_end() + * but keeping in mind that i_size will not change */ + *err = __block_write_begin(pagep[0], from, from + replaced_size, + ext4_get_block); + if (!*err) + *err = block_commit_write(pagep[0], from, from + replaced_size); - if (!page_has_buffers(page)) - create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); - - bh = page_buffers(page); - for (i = 0; i < data_offset_in_page; i++) - bh = bh->b_this_page; - - for (i = 0; i < block_len_in_page; i++) { - *err = ext4_get_block(orig_inode, - (sector_t)(orig_blk_offset + i), bh, 0); - if (*err < 0) - goto out; - - if (bh->b_this_page != NULL) - bh = bh->b_this_page; - } - - *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size, - page, fsdata); - page = NULL; - -out: - if (unlikely(page)) { - if (PageLocked(page)) - unlock_page(page); - page_cache_release(page); - ext4_journal_stop(handle); - } -out2: + if (unlikely(*err < 0)) + goto repair_branches; + + /* Even in case of data=writeback it is reasonable to pin + * inode to transaction, to prevent unexpected data loss */ + *err = ext4_jbd2_file_inode(handle, orig_inode); + +unlock_pages: + unlock_page(pagep[0]); + page_cache_release(pagep[0]); + unlock_page(pagep[1]); + page_cache_release(pagep[1]); +stop_journal: ext4_journal_stop(handle); - - if (err2) - *err = err2; - + /* Buffer was busy because probably is pinned to journal transaction, + * force transaction commit may help to free it. */ + if (*err == -EBUSY && ext4_should_retry_alloc(orig_inode->i_sb, + &retries)) + goto again; return replaced_count; + +repair_branches: + /* + * This should never ever happen! + * Extents are swapped already, but we are not able to copy data. + * Try to swap extents to it's original places + */ + double_down_write_data_sem(orig_inode, donor_inode); + replaced_count = mext_replace_branches(handle, donor_inode, orig_inode, + orig_blk_offset, + block_len_in_page, &err2); + double_up_write_data_sem(orig_inode, donor_inode); + if (replaced_count != block_len_in_page) { + EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset), + "Unable to copy data block," + " data will be lost."); + *err = -EIO; + } + replaced_count = 0; + goto unlock_pages; } /** @@ -969,14 +1108,6 @@ mext_check_arguments(struct inode *orig_inode, return -EINVAL; } - /* Files should be in the same ext4 FS */ - if (orig_inode->i_sb != donor_inode->i_sb) { - ext4_debug("ext4 move extent: The argument files " - "should be in same FS [ino:orig %lu, donor %lu]\n", - orig_inode->i_ino, donor_inode->i_ino); - return -EINVAL; - } - /* Ext4 move extent supports only extent based file */ if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) { ext4_debug("ext4 move extent: orig file is not extents " @@ -1002,7 +1133,6 @@ mext_check_arguments(struct inode *orig_inode, } if ((orig_start >= EXT_MAX_BLOCKS) || - (donor_start >= EXT_MAX_BLOCKS) || (*len > EXT_MAX_BLOCKS) || (orig_start + *len >= EXT_MAX_BLOCKS)) { ext4_debug("ext4 move extent: Can't handle over [%u] blocks " @@ -1072,35 +1202,19 @@ mext_check_arguments(struct inode *orig_inode, * @inode1: the inode structure * @inode2: the inode structure * - * Lock two inodes' i_mutex by i_ino order. - * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. + * Lock two inodes' i_mutex */ -static int +static void mext_inode_double_lock(struct inode *inode1, struct inode *inode2) { - int ret = 0; - - BUG_ON(inode1 == NULL && inode2 == NULL); - - ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__); - if (ret < 0) - goto out; - - if (inode1 == inode2) { - mutex_lock(&inode1->i_mutex); - goto out; - } - - if (inode1->i_ino < inode2->i_ino) { + BUG_ON(inode1 == inode2); + if (inode1 < inode2) { mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); } else { mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); } - -out: - return ret; } /** @@ -1109,28 +1223,13 @@ out: * @inode1: the inode that is released first * @inode2: the inode that is released second * - * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. */ -static int +static void mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) { - int ret = 0; - - BUG_ON(inode1 == NULL && inode2 == NULL); - - ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__); - if (ret < 0) - goto out; - - if (inode1) - mutex_unlock(&inode1->i_mutex); - - if (inode2 && inode2 != inode1) - mutex_unlock(&inode2->i_mutex); - -out: - return ret; + mutex_unlock(&inode1->i_mutex); + mutex_unlock(&inode2->i_mutex); } /** @@ -1187,16 +1286,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0; ext4_lblk_t rest_blocks; pgoff_t orig_page_offset = 0, seq_end_page; - int ret1, ret2, depth, last_extent = 0; + int ret, depth, last_extent = 0; int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; int data_offset_in_page; int block_len_in_page; int uninit; - /* orig and donor should be different file */ - if (orig_inode->i_ino == donor_inode->i_ino) { + if (orig_inode->i_sb != donor_inode->i_sb) { + ext4_debug("ext4 move extent: The argument files " + "should be in same FS [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* orig and donor should be different inodes */ + if (orig_inode == donor_inode) { ext4_debug("ext4 move extent: The argument files should not " - "be same file [ino:orig %lu, donor %lu]\n", + "be same inode [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } @@ -1208,18 +1314,27 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } - + /* TODO: This is non obvious task to swap blocks for inodes with full + jornaling enabled */ + if (ext4_should_journal_data(orig_inode) || + ext4_should_journal_data(donor_inode)) { + return -EINVAL; + } /* Protect orig and donor inodes against a truncate */ - ret1 = mext_inode_double_lock(orig_inode, donor_inode); - if (ret1 < 0) - return ret1; + mext_inode_double_lock(orig_inode, donor_inode); + + /* Wait for all existing dio workers */ + ext4_inode_block_unlocked_dio(orig_inode); + ext4_inode_block_unlocked_dio(donor_inode); + inode_dio_wait(orig_inode); + inode_dio_wait(donor_inode); /* Protect extent tree against block allocations via delalloc */ double_down_write_data_sem(orig_inode, donor_inode); /* Check the filesystem environment whether move_extent can be done */ - ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, + ret = mext_check_arguments(orig_inode, donor_inode, orig_start, donor_start, &len); - if (ret1) + if (ret) goto out; file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; @@ -1227,13 +1342,13 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, if (file_end < block_end) len -= block_end - file_end; - ret1 = get_ext_path(orig_inode, block_start, &orig_path); - if (ret1) + ret = get_ext_path(orig_inode, block_start, &orig_path); + if (ret) goto out; /* Get path structure to check the hole */ - ret1 = get_ext_path(orig_inode, block_start, &holecheck_path); - if (ret1) + ret = get_ext_path(orig_inode, block_start, &holecheck_path); + if (ret) goto out; depth = ext_depth(orig_inode); @@ -1252,13 +1367,13 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, last_extent = mext_next_extent(orig_inode, holecheck_path, &ext_cur); if (last_extent < 0) { - ret1 = last_extent; + ret = last_extent; goto out; } last_extent = mext_next_extent(orig_inode, orig_path, &ext_dummy); if (last_extent < 0) { - ret1 = last_extent; + ret = last_extent; goto out; } seq_start = le32_to_cpu(ext_cur->ee_block); @@ -1272,7 +1387,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, if (le32_to_cpu(ext_cur->ee_block) > block_end) { ext4_debug("ext4 move extent: The specified range of file " "may be the hole\n"); - ret1 = -EINVAL; + ret = -EINVAL; goto out; } @@ -1292,7 +1407,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, last_extent = mext_next_extent(orig_inode, holecheck_path, &ext_cur); if (last_extent < 0) { - ret1 = last_extent; + ret = last_extent; break; } add_blocks = ext4_ext_get_actual_len(ext_cur); @@ -1349,18 +1464,18 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, orig_page_offset, data_offset_in_page, block_len_in_page, uninit, - &ret1); + &ret); /* Count how many blocks we have exchanged */ *moved_len += block_len_in_page; - if (ret1 < 0) + if (ret < 0) break; if (*moved_len > len) { EXT4_ERROR_INODE(orig_inode, "We replaced blocks too much! " "sum of replaced: %llu requested: %llu", *moved_len, len); - ret1 = -EIO; + ret = -EIO; break; } @@ -1374,22 +1489,22 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, } double_down_write_data_sem(orig_inode, donor_inode); - if (ret1 < 0) + if (ret < 0) break; /* Decrease buffer counter */ if (holecheck_path) ext4_ext_drop_refs(holecheck_path); - ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path); - if (ret1) + ret = get_ext_path(orig_inode, seq_start, &holecheck_path); + if (ret) break; depth = holecheck_path->p_depth; /* Decrease buffer counter */ if (orig_path) ext4_ext_drop_refs(orig_path); - ret1 = get_ext_path(orig_inode, seq_start, &orig_path); - if (ret1) + ret = get_ext_path(orig_inode, seq_start, &orig_path); + if (ret) break; ext_cur = holecheck_path[depth].p_ext; @@ -1412,12 +1527,9 @@ out: kfree(holecheck_path); } double_up_write_data_sem(orig_inode, donor_inode); - ret2 = mext_inode_double_unlock(orig_inode, donor_inode); - - if (ret1) - return ret1; - else if (ret2) - return ret2; + ext4_inode_resume_unlocked_dio(orig_inode); + ext4_inode_resume_unlocked_dio(donor_inode); + mext_inode_double_unlock(orig_inode, donor_inode); - return 0; + return ret; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 2a42cc0..6d600a6 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -55,6 +55,13 @@ static struct buffer_head *ext4_append(handle_t *handle, { struct buffer_head *bh; + if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb && + ((inode->i_size >> 10) >= + EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) { + *err = -ENOSPC; + return NULL; + } + *block = inode->i_size >> inode->i_sb->s_blocksize_bits; bh = ext4_bread(handle, inode, *block, 1, err); @@ -67,6 +74,12 @@ static struct buffer_head *ext4_append(handle_t *handle, bh = NULL; } } + if (!bh && !(*err)) { + *err = -EIO; + ext4_error(inode->i_sb, + "Directory hole detected on inode %lu\n", + inode->i_ino); + } return bh; } @@ -594,8 +607,11 @@ dx_probe(const struct qstr *d_name, struct inode *dir, u32 hash; frame->bh = NULL; - if (!(bh = ext4_bread (NULL,dir, 0, 0, err))) + if (!(bh = ext4_bread(NULL, dir, 0, 0, err))) { + if (*err == 0) + *err = ERR_BAD_DX_DIR; goto fail; + } root = (struct dx_root *) bh->b_data; if (root->info.hash_version != DX_HASH_TEA && root->info.hash_version != DX_HASH_HALF_MD4 && @@ -696,8 +712,11 @@ dx_probe(const struct qstr *d_name, struct inode *dir, frame->entries = entries; frame->at = at; if (!indirect--) return frame; - if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err))) + if (!(bh = ext4_bread(NULL, dir, dx_get_block(at), 0, err))) { + if (!(*err)) + *err = ERR_BAD_DX_DIR; goto fail2; + } at = entries = ((struct dx_node *) bh->b_data)->entries; if (!buffer_verified(bh) && @@ -807,8 +826,15 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, */ while (num_frames--) { if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at), - 0, &err))) + 0, &err))) { + if (!err) { + ext4_error(dir->i_sb, + "Directory hole detected on inode %lu\n", + dir->i_ino); + return -EIO; + } return err; /* Failure */ + } if (!buffer_verified(bh) && !ext4_dx_csum_verify(dir, @@ -839,12 +865,19 @@ static int htree_dirblock_to_tree(struct file *dir_file, { struct buffer_head *bh; struct ext4_dir_entry_2 *de, *top; - int err, count = 0; + int err = 0, count = 0; dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", (unsigned long)block)); - if (!(bh = ext4_bread (NULL, dir, block, 0, &err))) + if (!(bh = ext4_bread(NULL, dir, block, 0, &err))) { + if (!err) { + err = -EIO; + ext4_error(dir->i_sb, + "Directory hole detected on inode %lu\n", + dir->i_ino); + } return err; + } if (!buffer_verified(bh) && !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) @@ -1267,8 +1300,15 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q return NULL; do { block = dx_get_block(frame->at); - if (!(bh = ext4_bread(NULL, dir, block, 0, err))) + if (!(bh = ext4_bread(NULL, dir, block, 0, err))) { + if (!(*err)) { + *err = -EIO; + ext4_error(dir->i_sb, + "Directory hole detected on inode %lu\n", + dir->i_ino); + } goto errout; + } if (!buffer_verified(bh) && !ext4_dirent_csum_verify(dir, @@ -1801,9 +1841,15 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, } blocks = dir->i_size >> sb->s_blocksize_bits; for (block = 0; block < blocks; block++) { - bh = ext4_bread(handle, dir, block, 0, &retval); - if(!bh) + if (!(bh = ext4_bread(handle, dir, block, 0, &retval))) { + if (!retval) { + retval = -EIO; + ext4_error(inode->i_sb, + "Directory hole detected on inode %lu\n", + inode->i_ino); + } return retval; + } if (!buffer_verified(bh) && !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) @@ -1860,8 +1906,15 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, entries = frame->entries; at = frame->at; - if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err))) + if (!(bh = ext4_bread(handle, dir, dx_get_block(frame->at), 0, &err))) { + if (!err) { + err = -EIO; + ext4_error(dir->i_sb, + "Directory hole detected on inode %lu\n", + dir->i_ino); + } goto cleanup; + } if (!buffer_verified(bh) && !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) @@ -2149,9 +2202,7 @@ retry: err = PTR_ERR(inode); if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); -#ifdef CONFIG_EXT4_FS_XATTR inode->i_op = &ext4_special_inode_operations; -#endif err = ext4_add_nondir(handle, dentry, inode); } ext4_journal_stop(handle); @@ -2199,9 +2250,15 @@ retry: inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; - dir_block = ext4_bread(handle, inode, 0, 1, &err); - if (!dir_block) + if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) { + if (!err) { + err = -EIO; + ext4_error(inode->i_sb, + "Directory hole detected on inode %lu\n", + inode->i_ino); + } goto out_clear_inode; + } BUFFER_TRACE(dir_block, "get_write_access"); err = ext4_journal_get_write_access(handle, dir_block); if (err) @@ -2318,6 +2375,11 @@ static int empty_dir(struct inode *inode) EXT4_ERROR_INODE(inode, "error %d reading directory " "lblock %u", err, lblock); + else + ext4_warning(inode->i_sb, + "bad directory (dir #%lu) - no data block", + inode->i_ino); + offset += sb->s_blocksize; continue; } @@ -2362,7 +2424,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) struct ext4_iloc iloc; int err = 0, rc; - if (!ext4_handle_valid(handle)) + if (!EXT4_SB(sb)->s_journal) return 0; mutex_lock(&EXT4_SB(sb)->s_orphan_lock); @@ -2436,8 +2498,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) struct ext4_iloc iloc; int err = 0; - /* ext4_handle_valid() assumes a valid handle_t pointer */ - if (handle && !ext4_handle_valid(handle)) + if (!EXT4_SB(inode->i_sb)->s_journal) return 0; mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); @@ -2456,7 +2517,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) * transaction handle with which to update the orphan list on * disk, but we still need to remove the inode from the linked * list in memory. */ - if (sbi->s_journal && !handle) + if (!handle) goto out; err = ext4_reserve_inode_write(handle, inode, &iloc); @@ -2826,9 +2887,15 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, goto end_rename; } retval = -EIO; - dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval); - if (!dir_bh) + if (!(dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval))) { + if (!retval) { + retval = -EIO; + ext4_error(old_inode->i_sb, + "Directory hole detected on inode %lu\n", + old_inode->i_ino); + } goto end_rename; + } if (!buffer_verified(dir_bh) && !ext4_dirent_csum_verify(old_inode, (struct ext4_dir_entry *)dir_bh->b_data)) diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index dcdeef1..68e896e 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -71,6 +71,9 @@ void ext4_free_io_end(ext4_io_end_t *io) int i; BUG_ON(!io); + BUG_ON(!list_empty(&io->list)); + BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN); + if (io->page) put_page(io->page); for (i = 0; i < io->num_io_pages; i++) @@ -81,13 +84,8 @@ void ext4_free_io_end(ext4_io_end_t *io) kmem_cache_free(io_end_cachep, io); } -/* - * check a range of space and convert unwritten extents to written. - * - * Called with inode->i_mutex; we depend on this when we manipulate - * io->flag, since we could otherwise race with ext4_flush_completed_IO() - */ -int ext4_end_io_nolock(ext4_io_end_t *io) +/* check a range of space and convert unwritten extents to written. */ +static int ext4_end_io(ext4_io_end_t *io) { struct inode *inode = io->inode; loff_t offset = io->offset; @@ -106,63 +104,136 @@ int ext4_end_io_nolock(ext4_io_end_t *io) "(inode %lu, offset %llu, size %zd, error %d)", inode->i_ino, offset, size, ret); } - if (io->iocb) aio_complete(io->iocb, io->result, 0); if (io->flag & EXT4_IO_END_DIRECT) inode_dio_done(inode); /* Wake up anyone waiting on unwritten extent conversion */ - if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten)) + if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) wake_up_all(ext4_ioend_wq(io->inode)); return ret; } -/* - * work on completed aio dio IO, to convert unwritten extents to extents - */ -static void ext4_end_io_work(struct work_struct *work) +static void dump_completed_IO(struct inode *inode) +{ +#ifdef EXT4FS_DEBUG + struct list_head *cur, *before, *after; + ext4_io_end_t *io, *io0, *io1; + unsigned long flags; + + if (list_empty(&EXT4_I(inode)->i_completed_io_list)) { + ext4_debug("inode %lu completed_io list is empty\n", + inode->i_ino); + return; + } + + ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino); + list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) { + cur = &io->list; + before = cur->prev; + io0 = container_of(before, ext4_io_end_t, list); + after = cur->next; + io1 = container_of(after, ext4_io_end_t, list); + + ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", + io, inode->i_ino, io0, io1); + } +#endif +} + +/* Add the io_end to per-inode completed end_io list. */ +void ext4_add_complete_io(ext4_io_end_t *io_end) { - ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); - struct inode *inode = io->inode; - struct ext4_inode_info *ei = EXT4_I(inode); - unsigned long flags; + struct ext4_inode_info *ei = EXT4_I(io_end->inode); + struct workqueue_struct *wq; + unsigned long flags; + + BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); + wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; spin_lock_irqsave(&ei->i_completed_io_lock, flags); - if (io->flag & EXT4_IO_END_IN_FSYNC) - goto requeue; - if (list_empty(&io->list)) { - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - goto free; + if (list_empty(&ei->i_completed_io_list)) { + io_end->flag |= EXT4_IO_END_QUEUED; + queue_work(wq, &io_end->work); } + list_add_tail(&io_end->list, &ei->i_completed_io_list); + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); +} - if (!mutex_trylock(&inode->i_mutex)) { - bool was_queued; -requeue: - was_queued = !!(io->flag & EXT4_IO_END_QUEUED); - io->flag |= EXT4_IO_END_QUEUED; - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - /* - * Requeue the work instead of waiting so that the work - * items queued after this can be processed. - */ - queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work); - /* - * To prevent the ext4-dio-unwritten thread from keeping - * requeueing end_io requests and occupying cpu for too long, - * yield the cpu if it sees an end_io request that has already - * been requeued. - */ - if (was_queued) - yield(); - return; +static int ext4_do_flush_completed_IO(struct inode *inode, + ext4_io_end_t *work_io) +{ + ext4_io_end_t *io; + struct list_head unwritten, complete, to_free; + unsigned long flags; + struct ext4_inode_info *ei = EXT4_I(inode); + int err, ret = 0; + + INIT_LIST_HEAD(&complete); + INIT_LIST_HEAD(&to_free); + + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + dump_completed_IO(inode); + list_replace_init(&ei->i_completed_io_list, &unwritten); + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + + while (!list_empty(&unwritten)) { + io = list_entry(unwritten.next, ext4_io_end_t, list); + BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN)); + list_del_init(&io->list); + + err = ext4_end_io(io); + if (unlikely(!ret && err)) + ret = err; + + list_add_tail(&io->list, &complete); + } + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + while (!list_empty(&complete)) { + io = list_entry(complete.next, ext4_io_end_t, list); + io->flag &= ~EXT4_IO_END_UNWRITTEN; + /* end_io context can not be destroyed now because it still + * used by queued worker. Worker thread will destroy it later */ + if (io->flag & EXT4_IO_END_QUEUED) + list_del_init(&io->list); + else + list_move(&io->list, &to_free); + } + /* If we are called from worker context, it is time to clear queued + * flag, and destroy it's end_io if it was converted already */ + if (work_io) { + work_io->flag &= ~EXT4_IO_END_QUEUED; + if (!(work_io->flag & EXT4_IO_END_UNWRITTEN)) + list_add_tail(&work_io->list, &to_free); } - list_del_init(&io->list); spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - (void) ext4_end_io_nolock(io); - mutex_unlock(&inode->i_mutex); -free: - ext4_free_io_end(io); + + while (!list_empty(&to_free)) { + io = list_entry(to_free.next, ext4_io_end_t, list); + list_del_init(&io->list); + ext4_free_io_end(io); + } + return ret; +} + +/* + * work on completed aio dio IO, to convert unwritten extents to extents + */ +static void ext4_end_io_work(struct work_struct *work) +{ + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); + ext4_do_flush_completed_IO(io->inode, io); +} + +int ext4_flush_unwritten_io(struct inode *inode) +{ + int ret; + WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) && + !(inode->i_state & I_FREEING)); + ret = ext4_do_flush_completed_IO(inode, NULL); + ext4_unwritten_wait(inode); + return ret; } ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) @@ -195,9 +266,7 @@ static void buffer_io_error(struct buffer_head *bh) static void ext4_end_bio(struct bio *bio, int error) { ext4_io_end_t *io_end = bio->bi_private; - struct workqueue_struct *wq; struct inode *inode; - unsigned long flags; int i; sector_t bi_sector = bio->bi_sector; @@ -255,14 +324,7 @@ static void ext4_end_bio(struct bio *bio, int error) return; } - /* Add the io_end to per-inode completed io list*/ - spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); - list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); - spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); - - wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; - /* queue the work to convert unwritten extents to written */ - queue_work(wq, &io_end->work); + ext4_add_complete_io(io_end); } void ext4_io_submit(struct ext4_io_submit *io) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 41f6ef6..7a75e10 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -45,6 +45,28 @@ void ext4_resize_end(struct super_block *sb) smp_mb__after_clear_bit(); } +static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb, + ext4_group_t group) { + return (group >> EXT4_DESC_PER_BLOCK_BITS(sb)) << + EXT4_DESC_PER_BLOCK_BITS(sb); +} + +static ext4_fsblk_t ext4_meta_bg_first_block_no(struct super_block *sb, + ext4_group_t group) { + group = ext4_meta_bg_first_group(sb, group); + return ext4_group_first_block_no(sb, group); +} + +static ext4_grpblk_t ext4_group_overhead_blocks(struct super_block *sb, + ext4_group_t group) { + ext4_grpblk_t overhead; + overhead = ext4_bg_num_gdb(sb, group); + if (ext4_bg_has_super(sb, group)) + overhead += 1 + + le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); + return overhead; +} + #define outside(b, first, last) ((b) < (first) || (b) >= (last)) #define inside(b, first, last) ((b) >= (first) && (b) < (last)) @@ -57,9 +79,7 @@ static int verify_group_input(struct super_block *sb, ext4_fsblk_t end = start + input->blocks_count; ext4_group_t group = input->group; ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; - unsigned overhead = ext4_bg_has_super(sb, group) ? - (1 + ext4_bg_num_gdb(sb, group) + - le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; + unsigned overhead = ext4_group_overhead_blocks(sb, group); ext4_fsblk_t metaend = start + overhead; struct buffer_head *bh = NULL; ext4_grpblk_t free_blocks_count, offset; @@ -200,13 +220,15 @@ static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd) * be a partial of a flex group. * * @sb: super block of fs to which the groups belongs + * + * Returns 0 on a successful allocation of the metadata blocks in the + * block group. */ -static void ext4_alloc_group_tables(struct super_block *sb, +static int ext4_alloc_group_tables(struct super_block *sb, struct ext4_new_flex_group_data *flex_gd, int flexbg_size) { struct ext4_new_group_data *group_data = flex_gd->groups; - struct ext4_super_block *es = EXT4_SB(sb)->s_es; ext4_fsblk_t start_blk; ext4_fsblk_t last_blk; ext4_group_t src_group; @@ -226,23 +248,24 @@ static void ext4_alloc_group_tables(struct super_block *sb, (last_group & ~(flexbg_size - 1)))); next_group: group = group_data[0].group; + if (src_group >= group_data[0].group + flex_gd->count) + return -ENOSPC; start_blk = ext4_group_first_block_no(sb, src_group); last_blk = start_blk + group_data[src_group - group].blocks_count; - overhead = ext4_bg_has_super(sb, src_group) ? - (1 + ext4_bg_num_gdb(sb, src_group) + - le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; + overhead = ext4_group_overhead_blocks(sb, src_group); start_blk += overhead; - BUG_ON(src_group >= group_data[0].group + flex_gd->count); /* We collect contiguous blocks as much as possible. */ src_group++; - for (; src_group <= last_group; src_group++) - if (!ext4_bg_has_super(sb, src_group)) + for (; src_group <= last_group; src_group++) { + overhead = ext4_group_overhead_blocks(sb, src_group); + if (overhead != 0) last_blk += group_data[src_group - group].blocks_count; else break; + } /* Allocate block bitmaps */ for (; bb_index < flex_gd->count; bb_index++) { @@ -300,6 +323,7 @@ next_group: group_data[i].free_blocks_count); } } + return 0; } static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, @@ -433,11 +457,13 @@ static int setup_new_flex_group_blocks(struct super_block *sb, ext4_group_t group, count; struct buffer_head *bh = NULL; int reserved_gdb, i, j, err = 0, err2; + int meta_bg; BUG_ON(!flex_gd->count || !group_data || group_data[0].group != sbi->s_groups_count); reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks); + meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); /* This transaction may be extended/restarted along the way */ handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA); @@ -447,12 +473,25 @@ static int setup_new_flex_group_blocks(struct super_block *sb, group = group_data[0].group; for (i = 0; i < flex_gd->count; i++, group++) { unsigned long gdblocks; + ext4_grpblk_t overhead; gdblocks = ext4_bg_num_gdb(sb, group); start = ext4_group_first_block_no(sb, group); + if (meta_bg == 0 && !ext4_bg_has_super(sb, group)) + goto handle_itb; + + if (meta_bg == 1) { + ext4_group_t first_group; + first_group = ext4_meta_bg_first_group(sb, group); + if (first_group != group + 1 && + first_group != group + EXT4_DESC_PER_BLOCK(sb) - 1) + goto handle_itb; + } + + block = start + ext4_bg_has_super(sb, group); /* Copy all of the GDT blocks into the backup in this group */ - for (j = 0, block = start + 1; j < gdblocks; j++, block++) { + for (j = 0; j < gdblocks; j++, block++) { struct buffer_head *gdb; ext4_debug("update backup group %#04llx\n", block); @@ -493,6 +532,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb, goto out; } +handle_itb: /* Initialize group tables of the grop @group */ if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED)) goto handle_bb; @@ -521,11 +561,11 @@ handle_bb: err = PTR_ERR(bh); goto out; } - if (ext4_bg_has_super(sb, group)) { + overhead = ext4_group_overhead_blocks(sb, group); + if (overhead != 0) { ext4_debug("mark backup superblock %#04llx (+0)\n", start); - ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb + - 1); + ext4_set_bits(bh->b_data, 0, overhead); } ext4_mark_bitmap_end(group_data[i].blocks_count, sb->s_blocksize * 8, bh->b_data); @@ -822,6 +862,45 @@ exit_bh: } /* + * add_new_gdb_meta_bg is the sister of add_new_gdb. + */ +static int add_new_gdb_meta_bg(struct super_block *sb, + handle_t *handle, ext4_group_t group) { + ext4_fsblk_t gdblock; + struct buffer_head *gdb_bh; + struct buffer_head **o_group_desc, **n_group_desc; + unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb); + int err; + + gdblock = ext4_meta_bg_first_block_no(sb, group) + + ext4_bg_has_super(sb, group); + gdb_bh = sb_bread(sb, gdblock); + if (!gdb_bh) + return -EIO; + n_group_desc = ext4_kvmalloc((gdb_num + 1) * + sizeof(struct buffer_head *), + GFP_NOFS); + if (!n_group_desc) { + err = -ENOMEM; + ext4_warning(sb, "not enough memory for %lu groups", + gdb_num + 1); + return err; + } + + o_group_desc = EXT4_SB(sb)->s_group_desc; + memcpy(n_group_desc, o_group_desc, + EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); + n_group_desc[gdb_num] = gdb_bh; + EXT4_SB(sb)->s_group_desc = n_group_desc; + EXT4_SB(sb)->s_gdb_count++; + ext4_kvfree(o_group_desc); + err = ext4_journal_get_write_access(handle, gdb_bh); + if (unlikely(err)) + brelse(gdb_bh); + return err; +} + +/* * Called when we are adding a new group which has a backup copy of each of * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks. * We need to add these reserved backup GDT blocks to the resize inode, so @@ -949,16 +1028,16 @@ exit_free: * do not copy the full number of backups at this time. The resize * which changed s_groups_count will backup again. */ -static void update_backups(struct super_block *sb, - int blk_off, char *data, int size) +static void update_backups(struct super_block *sb, int blk_off, char *data, + int size, int meta_bg) { struct ext4_sb_info *sbi = EXT4_SB(sb); - const ext4_group_t last = sbi->s_groups_count; + ext4_group_t last; const int bpg = EXT4_BLOCKS_PER_GROUP(sb); unsigned three = 1; unsigned five = 5; unsigned seven = 7; - ext4_group_t group; + ext4_group_t group = 0; int rest = sb->s_blocksize - size; handle_t *handle; int err = 0, err2; @@ -970,10 +1049,17 @@ static void update_backups(struct super_block *sb, goto exit_err; } - ext4_superblock_csum_set(sb, (struct ext4_super_block *)data); + if (meta_bg == 0) { + group = ext4_list_backups(sb, &three, &five, &seven); + last = sbi->s_groups_count; + } else { + group = ext4_meta_bg_first_group(sb, group) + 1; + last = (ext4_group_t)(group + EXT4_DESC_PER_BLOCK(sb) - 2); + } - while ((group = ext4_list_backups(sb, &three, &five, &seven)) < last) { + while (group < sbi->s_groups_count) { struct buffer_head *bh; + ext4_fsblk_t backup_block; /* Out of journal space, and can't get more - abort - so sad */ if (ext4_handle_valid(handle) && @@ -982,13 +1068,20 @@ static void update_backups(struct super_block *sb, (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) break; - bh = sb_getblk(sb, group * bpg + blk_off); + if (meta_bg == 0) + backup_block = group * bpg + blk_off; + else + backup_block = (ext4_group_first_block_no(sb, group) + + ext4_bg_has_super(sb, group)); + + bh = sb_getblk(sb, backup_block); if (!bh) { err = -EIO; break; } - ext4_debug("update metadata backup %#04lx\n", - (unsigned long)bh->b_blocknr); + ext4_debug("update metadata backup %llu(+%llu)\n", + backup_block, backup_block - + ext4_group_first_block_no(sb, group)); if ((err = ext4_journal_get_write_access(handle, bh))) break; lock_buffer(bh); @@ -1001,6 +1094,13 @@ static void update_backups(struct super_block *sb, if (unlikely(err)) ext4_std_error(sb, err); brelse(bh); + + if (meta_bg == 0) + group = ext4_list_backups(sb, &three, &five, &seven); + else if (group == last) + break; + else + group = last; } if ((err2 = ext4_journal_stop(handle)) && !err) err = err2; @@ -1043,7 +1143,9 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb, struct ext4_super_block *es = sbi->s_es; struct buffer_head *gdb_bh; int i, gdb_off, gdb_num, err = 0; + int meta_bg; + meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); for (i = 0; i < count; i++, group++) { int reserved_gdb = ext4_bg_has_super(sb, group) ? le16_to_cpu(es->s_reserved_gdt_blocks) : 0; @@ -1063,8 +1165,11 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb, if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group)) err = reserve_backup_gdb(handle, resize_inode, group); - } else + } else if (meta_bg != 0) { + err = add_new_gdb_meta_bg(sb, handle, group); + } else { err = add_new_gdb(handle, resize_inode, group); + } if (err) break; } @@ -1076,17 +1181,12 @@ static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block) struct buffer_head *bh = sb_getblk(sb, block); if (!bh) return NULL; - - if (bitmap_uptodate(bh)) - return bh; - - lock_buffer(bh); - if (bh_submit_read(bh) < 0) { - unlock_buffer(bh); - brelse(bh); - return NULL; + if (!bh_uptodate_or_lock(bh)) { + if (bh_submit_read(bh) < 0) { + brelse(bh); + return NULL; + } } - unlock_buffer(bh); return bh; } @@ -1161,6 +1261,9 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb, ext4_free_group_clusters_set(sb, gdp, EXT4_B2C(sbi, group_data->free_blocks_count)); ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); + if (ext4_has_group_desc_csum(sb)) + ext4_itable_unused_set(sb, gdp, + EXT4_INODES_PER_GROUP(sb)); gdp->bg_flags = cpu_to_le16(*bg_flags); ext4_group_desc_csum_set(sb, group, gdp); @@ -1216,7 +1319,7 @@ static void ext4_update_super(struct super_block *sb, } reserved_blocks = ext4_r_blocks_count(es) * 100; - do_div(reserved_blocks, ext4_blocks_count(es)); + reserved_blocks = div64_u64(reserved_blocks, ext4_blocks_count(es)); reserved_blocks *= blocks_count; do_div(reserved_blocks, 100); @@ -1227,6 +1330,7 @@ static void ext4_update_super(struct super_block *sb, le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) * flex_gd->count); + ext4_debug("free blocks count %llu", ext4_free_blocks_count(es)); /* * We need to protect s_groups_count against other CPUs seeing * inconsistent state in the superblock. @@ -1261,6 +1365,8 @@ static void ext4_update_super(struct super_block *sb, percpu_counter_add(&sbi->s_freeinodes_counter, EXT4_INODES_PER_GROUP(sb) * flex_gd->count); + ext4_debug("free blocks count %llu", + percpu_counter_read(&sbi->s_freeclusters_counter)); if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) && sbi->s_log_groups_per_flex) { @@ -1349,16 +1455,24 @@ exit_journal: err = err2; if (!err) { - int i; + int gdb_num = group / EXT4_DESC_PER_BLOCK(sb); + int gdb_num_end = ((group + flex_gd->count - 1) / + EXT4_DESC_PER_BLOCK(sb)); + int meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_META_BG); + sector_t old_gdb = 0; + update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, - sizeof(struct ext4_super_block)); - for (i = 0; i < flex_gd->count; i++, group++) { + sizeof(struct ext4_super_block), 0); + for (; gdb_num <= gdb_num_end; gdb_num++) { struct buffer_head *gdb_bh; - int gdb_num; - gdb_num = group / EXT4_BLOCKS_PER_GROUP(sb); + gdb_bh = sbi->s_group_desc[gdb_num]; + if (old_gdb == gdb_bh->b_blocknr) + continue; update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data, - gdb_bh->b_size); + gdb_bh->b_size, meta_bg); + old_gdb = gdb_bh->b_blocknr; } } exit: @@ -1402,9 +1516,7 @@ static int ext4_setup_next_flex_gd(struct super_block *sb, group_data[i].group = group + i; group_data[i].blocks_count = blocks_per_group; - overhead = ext4_bg_has_super(sb, group + i) ? - (1 + ext4_bg_num_gdb(sb, group + i) + - le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; + overhead = ext4_group_overhead_blocks(sb, group + i); group_data[i].free_blocks_count = blocks_per_group - overhead; if (ext4_has_group_desc_csum(sb)) flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT | @@ -1492,6 +1604,14 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) if (err) goto out; + err = ext4_alloc_flex_bg_array(sb, input->group + 1); + if (err) + return err; + + err = ext4_mb_alloc_groupinfo(sb, input->group + 1); + if (err) + goto out; + flex_gd.count = 1; flex_gd.groups = input; flex_gd.bg_flags = &bg_flags; @@ -1544,11 +1664,13 @@ errout: err = err2; if (!err) { + ext4_fsblk_t first_block; + first_block = ext4_group_first_block_no(sb, 0); if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: extended group to %llu " "blocks\n", ext4_blocks_count(es)); - update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es, - sizeof(struct ext4_super_block)); + update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr - first_block, + (char *)es, sizeof(struct ext4_super_block), 0); } return err; } @@ -1631,6 +1753,94 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, return err; } /* ext4_group_extend */ + +static int num_desc_blocks(struct super_block *sb, ext4_group_t groups) +{ + return (groups + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); +} + +/* + * Release the resize inode and drop the resize_inode feature if there + * are no more reserved gdt blocks, and then convert the file system + * to enable meta_bg + */ +static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode) +{ + handle_t *handle; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + struct ext4_inode_info *ei = EXT4_I(inode); + ext4_fsblk_t nr; + int i, ret, err = 0; + int credits = 1; + + ext4_msg(sb, KERN_INFO, "Converting file system to meta_bg"); + if (inode) { + if (es->s_reserved_gdt_blocks) { + ext4_error(sb, "Unexpected non-zero " + "s_reserved_gdt_blocks"); + return -EPERM; + } + + /* Do a quick sanity check of the resize inode */ + if (inode->i_blocks != 1 << (inode->i_blkbits - 9)) + goto invalid_resize_inode; + for (i = 0; i < EXT4_N_BLOCKS; i++) { + if (i == EXT4_DIND_BLOCK) { + if (ei->i_data[i]) + continue; + else + goto invalid_resize_inode; + } + if (ei->i_data[i]) + goto invalid_resize_inode; + } + credits += 3; /* block bitmap, bg descriptor, resize inode */ + } + + handle = ext4_journal_start_sb(sb, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + err = ext4_journal_get_write_access(handle, sbi->s_sbh); + if (err) + goto errout; + + EXT4_CLEAR_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE); + EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); + sbi->s_es->s_first_meta_bg = + cpu_to_le32(num_desc_blocks(sb, sbi->s_groups_count)); + + err = ext4_handle_dirty_super(handle, sb); + if (err) { + ext4_std_error(sb, err); + goto errout; + } + + if (inode) { + nr = le32_to_cpu(ei->i_data[EXT4_DIND_BLOCK]); + ext4_free_blocks(handle, inode, NULL, nr, 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); + ei->i_data[EXT4_DIND_BLOCK] = 0; + inode->i_blocks = 0; + + err = ext4_mark_inode_dirty(handle, inode); + if (err) + ext4_std_error(sb, err); + } + +errout: + ret = ext4_journal_stop(handle); + if (!err) + err = ret; + return ret; + +invalid_resize_inode: + ext4_error(sb, "corrupted/inconsistent resize inode"); + return -EINVAL; +} + /* * ext4_resize_fs() resizes a fs to new size specified by @n_blocks_count * @@ -1643,21 +1853,31 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; struct buffer_head *bh; - struct inode *resize_inode; - ext4_fsblk_t o_blocks_count; - ext4_group_t o_group; - ext4_group_t n_group; - ext4_grpblk_t offset, add; + struct inode *resize_inode = NULL; + ext4_grpblk_t add, offset; unsigned long n_desc_blocks; unsigned long o_desc_blocks; - unsigned long desc_blocks; - int err = 0, flexbg_size = 1; + ext4_group_t o_group; + ext4_group_t n_group; + ext4_fsblk_t o_blocks_count; + ext4_fsblk_t n_blocks_count_retry = 0; + unsigned long last_update_time = 0; + int err = 0, flexbg_size = 1 << sbi->s_log_groups_per_flex; + int meta_bg; + /* See if the device is actually as big as what was requested */ + bh = sb_bread(sb, n_blocks_count - 1); + if (!bh) { + ext4_warning(sb, "can't read last block, resize aborted"); + return -ENOSPC; + } + brelse(bh); + +retry: o_blocks_count = ext4_blocks_count(es); - if (test_opt(sb, DEBUG)) - ext4_msg(sb, KERN_DEBUG, "resizing filesystem from %llu " - "to %llu blocks", o_blocks_count, n_blocks_count); + ext4_msg(sb, KERN_INFO, "resizing filesystem from %llu " + "to %llu blocks", o_blocks_count, n_blocks_count); if (n_blocks_count < o_blocks_count) { /* On-line shrinking not supported */ @@ -1672,32 +1892,49 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset); ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset); - n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) / - EXT4_DESC_PER_BLOCK(sb); - o_desc_blocks = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / - EXT4_DESC_PER_BLOCK(sb); - desc_blocks = n_desc_blocks - o_desc_blocks; + n_desc_blocks = num_desc_blocks(sb, n_group + 1); + o_desc_blocks = num_desc_blocks(sb, sbi->s_groups_count); - if (desc_blocks && - (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE) || - le16_to_cpu(es->s_reserved_gdt_blocks) < desc_blocks)) { - ext4_warning(sb, "No reserved GDT blocks, can't resize"); - return -EPERM; - } + meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); - resize_inode = ext4_iget(sb, EXT4_RESIZE_INO); - if (IS_ERR(resize_inode)) { - ext4_warning(sb, "Error opening resize inode"); - return PTR_ERR(resize_inode); + if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE)) { + if (meta_bg) { + ext4_error(sb, "resize_inode and meta_bg enabled " + "simultaneously"); + return -EINVAL; + } + if (n_desc_blocks > o_desc_blocks + + le16_to_cpu(es->s_reserved_gdt_blocks)) { + n_blocks_count_retry = n_blocks_count; + n_desc_blocks = o_desc_blocks + + le16_to_cpu(es->s_reserved_gdt_blocks); + n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb); + n_blocks_count = n_group * EXT4_BLOCKS_PER_GROUP(sb); + n_group--; /* set to last group number */ + } + + if (!resize_inode) + resize_inode = ext4_iget(sb, EXT4_RESIZE_INO); + if (IS_ERR(resize_inode)) { + ext4_warning(sb, "Error opening resize inode"); + return PTR_ERR(resize_inode); + } } - /* See if the device is actually as big as what was requested */ - bh = sb_bread(sb, n_blocks_count - 1); - if (!bh) { - ext4_warning(sb, "can't read last block, resize aborted"); - return -ENOSPC; + if ((!resize_inode && !meta_bg) || n_blocks_count == o_blocks_count) { + err = ext4_convert_meta_bg(sb, resize_inode); + if (err) + goto out; + if (resize_inode) { + iput(resize_inode); + resize_inode = NULL; + } + if (n_blocks_count_retry) { + n_blocks_count = n_blocks_count_retry; + n_blocks_count_retry = 0; + goto retry; + } } - brelse(bh); /* extend the last group */ if (n_group == o_group) @@ -1710,12 +1947,15 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) goto out; } - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) && - es->s_log_groups_per_flex) - flexbg_size = 1 << es->s_log_groups_per_flex; + if (ext4_blocks_count(es) == n_blocks_count) + goto out; - o_blocks_count = ext4_blocks_count(es); - if (o_blocks_count == n_blocks_count) + err = ext4_alloc_flex_bg_array(sb, n_group + 1); + if (err) + return err; + + err = ext4_mb_alloc_groupinfo(sb, n_group + 1); + if (err) goto out; flex_gd = alloc_flex_gd(flexbg_size); @@ -1729,19 +1969,33 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) */ while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count, flexbg_size)) { - ext4_alloc_group_tables(sb, flex_gd, flexbg_size); + if (jiffies - last_update_time > HZ * 10) { + if (last_update_time) + ext4_msg(sb, KERN_INFO, + "resized to %llu blocks", + ext4_blocks_count(es)); + last_update_time = jiffies; + } + if (ext4_alloc_group_tables(sb, flex_gd, flexbg_size) != 0) + break; err = ext4_flex_group_add(sb, resize_inode, flex_gd); if (unlikely(err)) break; } + if (!err && n_blocks_count_retry) { + n_blocks_count = n_blocks_count_retry; + n_blocks_count_retry = 0; + free_flex_gd(flex_gd); + flex_gd = NULL; + goto retry; + } + out: if (flex_gd) free_flex_gd(flex_gd); - - iput(resize_inode); - if (test_opt(sb, DEBUG)) - ext4_msg(sb, KERN_DEBUG, "resized filesystem from %llu " - "upto %llu blocks", o_blocks_count, n_blocks_count); + if (resize_inode != NULL) + iput(resize_inode); + ext4_msg(sb, KERN_INFO, "resized filesystem to %llu", n_blocks_count); return err; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c6e0cb3..7265a03 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -420,7 +420,7 @@ static void __save_error_info(struct super_block *sb, const char *func, */ if (!es->s_error_count) mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ); - es->s_error_count = cpu_to_le32(le32_to_cpu(es->s_error_count) + 1); + le32_add_cpu(&es->s_error_count, 1); } static void save_error_info(struct super_block *sb, const char *func, @@ -850,7 +850,6 @@ static void ext4_put_super(struct super_block *sb) flush_workqueue(sbi->dio_unwritten_wq); destroy_workqueue(sbi->dio_unwritten_wq); - lock_super(sb); if (sbi->s_journal) { err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; @@ -917,7 +916,6 @@ static void ext4_put_super(struct super_block *sb) * Now that we are completely done shutting down the * superblock, we need to actually destroy the kobject. */ - unlock_super(sb); kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); if (sbi->s_chksum_driver) @@ -956,11 +954,10 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->jinode = NULL; INIT_LIST_HEAD(&ei->i_completed_io_list); spin_lock_init(&ei->i_completed_io_lock); - ei->cur_aio_dio = NULL; ei->i_sync_tid = 0; ei->i_datasync_tid = 0; atomic_set(&ei->i_ioend_count, 0); - atomic_set(&ei->i_aiodio_unwritten, 0); + atomic_set(&ei->i_unwritten, 0); return &ei->vfs_inode; } @@ -1019,6 +1016,11 @@ static int init_inodecache(void) static void destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(ext4_inode_cachep); } @@ -1219,6 +1221,7 @@ enum { Opt_inode_readahead_blks, Opt_journal_ioprio, Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, + Opt_max_dir_size_kb, }; static const match_table_t tokens = { @@ -1292,6 +1295,7 @@ static const match_table_t tokens = { {Opt_init_itable, "init_itable=%u"}, {Opt_init_itable, "init_itable"}, {Opt_noinit_itable, "noinit_itable"}, + {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, {Opt_removed, "check=none"}, /* mount option from ext2/3 */ {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ {Opt_removed, "reservation"}, /* mount option from ext2/3 */ @@ -1472,6 +1476,7 @@ static const struct mount_opts { {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT}, {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, + {Opt_max_dir_size_kb, 0, MOPT_GTE0}, {Opt_err, 0, 0} }; @@ -1587,6 +1592,8 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, if (!args->from) arg = EXT4_DEF_LI_WAIT_MULT; sbi->s_li_wait_mult = arg; + } else if (token == Opt_max_dir_size_kb) { + sbi->s_max_dir_size_kb = arg; } else if (token == Opt_stripe) { sbi->s_stripe = arg; } else if (m->flags & MOPT_DATAJ) { @@ -1659,7 +1666,7 @@ static int parse_options(char *options, struct super_block *sb, * Initialize args struct so we know whether arg was * found; some options take optional arguments. */ - args[0].to = args[0].from = 0; + args[0].to = args[0].from = NULL; token = match_token(p, tokens, args); if (handle_mount_opt(sb, p, token, args, journal_devnum, journal_ioprio, is_remount) < 0) @@ -1735,7 +1742,7 @@ static inline void ext4_show_quota_options(struct seq_file *seq, static const char *token2str(int token) { - static const struct match_token *t; + const struct match_token *t; for (t = tokens; t->token != Opt_err; t++) if (t->token == token && !strchr(t->pattern, '=')) @@ -1818,6 +1825,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, if (nodefs || (test_opt(sb, INIT_INODE_TABLE) && (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT))) SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); + if (nodefs || sbi->s_max_dir_size_kb) + SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb); ext4_show_quota_options(seq, sb); return 0; @@ -1909,15 +1918,45 @@ done: return res; } +int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct flex_groups *new_groups; + int size; + + if (!sbi->s_log_groups_per_flex) + return 0; + + size = ext4_flex_group(sbi, ngroup - 1) + 1; + if (size <= sbi->s_flex_groups_allocated) + return 0; + + size = roundup_pow_of_two(size * sizeof(struct flex_groups)); + new_groups = ext4_kvzalloc(size, GFP_KERNEL); + if (!new_groups) { + ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups", + size / (int) sizeof(struct flex_groups)); + return -ENOMEM; + } + + if (sbi->s_flex_groups) { + memcpy(new_groups, sbi->s_flex_groups, + (sbi->s_flex_groups_allocated * + sizeof(struct flex_groups))); + ext4_kvfree(sbi->s_flex_groups); + } + sbi->s_flex_groups = new_groups; + sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups); + return 0; +} + static int ext4_fill_flex_info(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp = NULL; - ext4_group_t flex_group_count; ext4_group_t flex_group; unsigned int groups_per_flex = 0; - size_t size; - int i; + int i, err; sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { @@ -1926,17 +1965,9 @@ static int ext4_fill_flex_info(struct super_block *sb) } groups_per_flex = 1 << sbi->s_log_groups_per_flex; - /* We allocate both existing and potentially added groups */ - flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + - ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << - EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex; - size = flex_group_count * sizeof(struct flex_groups); - sbi->s_flex_groups = ext4_kvzalloc(size, GFP_KERNEL); - if (sbi->s_flex_groups == NULL) { - ext4_msg(sb, KERN_ERR, "not enough memory for %u flex groups", - flex_group_count); + err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count); + if (err) goto failed; - } for (i = 0; i < sbi->s_groups_count; i++) { gdp = ext4_get_group_desc(sb, i, NULL); @@ -2139,10 +2170,12 @@ static void ext4_orphan_cleanup(struct super_block *sb, } if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { - if (es->s_last_orphan) + /* don't clear list on RO mount w/ errors */ + if (es->s_last_orphan && !(s_flags & MS_RDONLY)) { jbd_debug(1, "Errors on filesystem, " "clearing orphan list.\n"); - es->s_last_orphan = 0; + es->s_last_orphan = 0; + } jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); return; } @@ -2523,6 +2556,7 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); +EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error); static struct attribute *ext4_attrs[] = { @@ -2538,6 +2572,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(mb_stream_req), ATTR_LIST(mb_group_prealloc), ATTR_LIST(max_writeback_mb_bump), + ATTR_LIST(extent_max_zeroout_kb), ATTR_LIST(trigger_fs_error), NULL, }; @@ -2545,10 +2580,12 @@ static struct attribute *ext4_attrs[] = { /* Features this copy of ext4 supports */ EXT4_INFO_ATTR(lazy_itable_init); EXT4_INFO_ATTR(batched_discard); +EXT4_INFO_ATTR(meta_bg_resize); static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(lazy_itable_init), ATTR_LIST(batched_discard), + ATTR_LIST(meta_bg_resize), NULL, }; @@ -3369,7 +3406,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * enable delayed allocation by default * Use -o nodelalloc to turn it off */ - if (!IS_EXT3_SB(sb) && + if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) && ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) set_opt(sb, DELALLOC); @@ -3738,6 +3775,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_stripe = ext4_get_stripe_size(sbi); sbi->s_max_writeback_mb_bump = 128; + sbi->s_extent_max_zeroout_kb = 32; /* * set up enough so that it can read an inode @@ -4514,11 +4552,9 @@ static int ext4_unfreeze(struct super_block *sb) if (sb->s_flags & MS_RDONLY) return 0; - lock_super(sb); /* Reset the needs_recovery flag before the fs is unlocked. */ EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); ext4_commit_super(sb, 1); - unlock_super(sb); return 0; } @@ -4554,7 +4590,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) char *orig_data = kstrdup(data, GFP_KERNEL); /* Store the original options */ - lock_super(sb); old_sb_flags = sb->s_flags; old_opts.s_mount_opt = sbi->s_mount_opt; old_opts.s_mount_opt2 = sbi->s_mount_opt2; @@ -4696,7 +4731,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (sbi->s_journal == NULL) ext4_commit_super(sb, 1); - unlock_super(sb); #ifdef CONFIG_QUOTA /* Release old quota file names */ for (i = 0; i < MAXQUOTAS; i++) @@ -4709,10 +4743,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) else if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) { err = ext4_enable_quotas(sb); - if (err) { - lock_super(sb); + if (err) goto restore_opts; - } } } #endif @@ -4739,7 +4771,6 @@ restore_opts: sbi->s_qf_names[i] = old_opts.s_qf_names[i]; } #endif - unlock_super(sb); kfree(orig_data); return err; } @@ -4791,7 +4822,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) static inline struct inode *dquot_to_inode(struct dquot *dquot) { - return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; + return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type]; } static int ext4_write_dquot(struct dquot *dquot) @@ -5264,8 +5295,10 @@ static int __init ext4_init_fs(void) if (err) goto out6; ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); - if (!ext4_kset) + if (!ext4_kset) { + err = -ENOMEM; goto out5; + } ext4_proc_root = proc_mkdir("fs/ext4", NULL); err = ext4_init_feat_adverts(); diff --git a/fs/fat/Makefile b/fs/fat/Makefile index e061903..964b634 100644 --- a/fs/fat/Makefile +++ b/fs/fat/Makefile @@ -6,6 +6,6 @@ obj-$(CONFIG_FAT_FS) += fat.o obj-$(CONFIG_VFAT_FS) += vfat.o obj-$(CONFIG_MSDOS_FS) += msdos.o -fat-y := cache.o dir.o fatent.o file.o inode.o misc.o +fat-y := cache.o dir.o fatent.o file.o inode.o misc.o nfs.o vfat-y := namei_vfat.o msdos-y := namei_msdos.o diff --git a/fs/fat/cache.c b/fs/fat/cache.c index 1cc7038e..91ad9e1 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -190,7 +190,8 @@ static void __fat_cache_inval_inode(struct inode *inode) struct fat_cache *cache; while (!list_empty(&i->cache_lru)) { - cache = list_entry(i->cache_lru.next, struct fat_cache, cache_list); + cache = list_entry(i->cache_lru.next, + struct fat_cache, cache_list); list_del_init(&cache->cache_list); i->nr_caches--; fat_cache_free(cache); @@ -261,9 +262,10 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus) if (nr < 0) goto out; else if (nr == FAT_ENT_FREE) { - fat_fs_error_ratelimit(sb, "%s: invalid cluster chain" - " (i_pos %lld)", __func__, - MSDOS_I(inode)->i_pos); + fat_fs_error_ratelimit(sb, + "%s: invalid cluster chain (i_pos %lld)", + __func__, + MSDOS_I(inode)->i_pos); nr = -EIO; goto out; } else if (nr == FAT_ENT_EOF) { diff --git a/fs/fat/dir.c b/fs/fat/dir.c index dc49ed2..bca6d0a 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -18,7 +18,7 @@ #include <linux/time.h> #include <linux/buffer_head.h> #include <linux/compat.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/kernel.h> #include "fat.h" @@ -123,7 +123,8 @@ static inline int fat_get_entry(struct inode *dir, loff_t *pos, { /* Fast stuff first */ if (*bh && *de && - (*de - (struct msdos_dir_entry *)(*bh)->b_data) < MSDOS_SB(dir->i_sb)->dir_per_block - 1) { + (*de - (struct msdos_dir_entry *)(*bh)->b_data) < + MSDOS_SB(dir->i_sb)->dir_per_block - 1) { *pos += sizeof(struct msdos_dir_entry); (*de)++; return 0; @@ -155,7 +156,8 @@ static int uni16_to_x8(struct super_block *sb, unsigned char *ascii, while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) { ec = *ip++; - if ((charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) { + charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE); + if (charlen > 0) { op += charlen; len -= charlen; } else { @@ -172,12 +174,12 @@ static int uni16_to_x8(struct super_block *sb, unsigned char *ascii, } if (unlikely(*ip)) { - fat_msg(sb, KERN_WARNING, "filename was truncated while " - "converting."); + fat_msg(sb, KERN_WARNING, + "filename was truncated while converting."); } *op = 0; - return (op - ascii); + return op - ascii; } static inline int fat_uni_to_x8(struct super_block *sb, const wchar_t *uni, @@ -205,7 +207,8 @@ fat_short2uni(struct nls_table *t, unsigned char *c, int clen, wchar_t *uni) } static inline int -fat_short2lower_uni(struct nls_table *t, unsigned char *c, int clen, wchar_t *uni) +fat_short2lower_uni(struct nls_table *t, unsigned char *c, + int clen, wchar_t *uni) { int charlen; wchar_t wc; @@ -220,7 +223,8 @@ fat_short2lower_uni(struct nls_table *t, unsigned char *c, int clen, wchar_t *un if (!nc) nc = *c; - if ( (charlen = t->char2uni(&nc, 1, uni)) < 0) { + charlen = t->char2uni(&nc, 1, uni); + if (charlen < 0) { *uni = 0x003f; /* a question mark */ charlen = 1; } @@ -537,7 +541,6 @@ end_of_dir: return err; } - EXPORT_SYMBOL_GPL(fat_search_long); struct fat_ioctl_filldir_callback { @@ -574,7 +577,8 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent, /* Fake . and .. for the root directory. */ if (inode->i_ino == MSDOS_ROOT_INO) { while (cpos < 2) { - if (filldir(dirent, "..", cpos+1, cpos, MSDOS_ROOT_INO, DT_DIR) < 0) + if (filldir(dirent, "..", cpos+1, cpos, + MSDOS_ROOT_INO, DT_DIR) < 0) goto out; cpos++; filp->f_pos++; @@ -872,25 +876,26 @@ static int fat_get_short_entry(struct inode *dir, loff_t *pos, } /* - * The ".." entry can not provide the "struct fat_slot_info" informations - * for inode. So, this function provide the some informations only. + * The ".." entry can not provide the "struct fat_slot_info" information + * for inode, nor a usable i_pos. So, this function provides some information + * only. + * + * Since this function walks through the on-disk inodes within a directory, + * callers are responsible for taking any locks necessary to prevent the + * directory from changing. */ int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh, - struct msdos_dir_entry **de, loff_t *i_pos) + struct msdos_dir_entry **de) { - loff_t offset; + loff_t offset = 0; - offset = 0; - *bh = NULL; + *de = NULL; while (fat_get_short_entry(dir, &offset, bh, de) >= 0) { - if (!strncmp((*de)->name, MSDOS_DOTDOT, MSDOS_NAME)) { - *i_pos = fat_make_i_pos(dir->i_sb, *bh, *de); + if (!strncmp((*de)->name, MSDOS_DOTDOT, MSDOS_NAME)) return 0; - } } return -ENOENT; } - EXPORT_SYMBOL_GPL(fat_get_dotdot_entry); /* See if directory is empty */ @@ -913,7 +918,6 @@ int fat_dir_empty(struct inode *dir) brelse(bh); return result; } - EXPORT_SYMBOL_GPL(fat_dir_empty); /* @@ -959,7 +963,6 @@ int fat_scan(struct inode *dir, const unsigned char *name, } return -ENOENT; } - EXPORT_SYMBOL_GPL(fat_scan); static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots) @@ -1047,7 +1050,6 @@ int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo) return 0; } - EXPORT_SYMBOL_GPL(fat_remove_entries); static int fat_zeroed_cluster(struct inode *dir, sector_t blknr, int nr_used, @@ -1141,10 +1143,8 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts) de[0].ctime_cs = de[1].ctime_cs = 0; de[0].adate = de[0].cdate = de[1].adate = de[1].cdate = 0; } - de[0].start = cpu_to_le16(cluster); - de[0].starthi = cpu_to_le16(cluster >> 16); - de[1].start = cpu_to_le16(MSDOS_I(dir)->i_logstart); - de[1].starthi = cpu_to_le16(MSDOS_I(dir)->i_logstart >> 16); + fat_set_start(&de[0], cluster); + fat_set_start(&de[1], MSDOS_I(dir)->i_logstart); de[0].size = de[1].size = 0; memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de)); set_buffer_uptodate(bhs[0]); @@ -1161,7 +1161,6 @@ error_free: error: return err; } - EXPORT_SYMBOL_GPL(fat_alloc_new_dir); static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots, @@ -1377,5 +1376,4 @@ error_remove: __fat_remove_entries(dir, pos, free_slots); return err; } - EXPORT_SYMBOL_GPL(fat_add_entries); diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 2deeeb8..ca7e8f8 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -5,6 +5,7 @@ #include <linux/string.h> #include <linux/nls.h> #include <linux/fs.h> +#include <linux/hash.h> #include <linux/mutex.h> #include <linux/ratelimit.h> #include <linux/msdos_fs.h> @@ -23,30 +24,31 @@ #define FAT_ERRORS_RO 3 /* remount r/o on error */ struct fat_mount_options { - uid_t fs_uid; - gid_t fs_gid; + kuid_t fs_uid; + kgid_t fs_gid; unsigned short fs_fmask; unsigned short fs_dmask; - unsigned short codepage; /* Codepage for shortname conversions */ - char *iocharset; /* Charset used for filename input/display */ - unsigned short shortname; /* flags for shortname display/create rule */ - unsigned char name_check; /* r = relaxed, n = normal, s = strict */ - unsigned char errors; /* On error: continue, panic, remount-ro */ + unsigned short codepage; /* Codepage for shortname conversions */ + char *iocharset; /* Charset used for filename input/display */ + unsigned short shortname; /* flags for shortname display/create rule */ + unsigned char name_check; /* r = relaxed, n = normal, s = strict */ + unsigned char errors; /* On error: continue, panic, remount-ro */ unsigned short allow_utime;/* permission for setting the [am]time */ - unsigned quiet:1, /* set = fake successful chmods and chowns */ - showexec:1, /* set = only set x bit for com/exe/bat */ - sys_immutable:1, /* set = system files are immutable */ - dotsOK:1, /* set = hidden and system files are named '.filename' */ - isvfat:1, /* 0=no vfat long filename support, 1=vfat support */ - utf8:1, /* Use of UTF-8 character set (Default) */ - unicode_xlate:1, /* create escape sequences for unhandled Unicode */ - numtail:1, /* Does first alias have a numeric '~1' type tail? */ - flush:1, /* write things quickly */ - nocase:1, /* Does this need case conversion? 0=need case conversion*/ - usefree:1, /* Use free_clusters for FAT32 */ - tz_utc:1, /* Filesystem timestamps are in UTC */ - rodir:1, /* allow ATTR_RO for directory */ - discard:1; /* Issue discard requests on deletions */ + unsigned quiet:1, /* set = fake successful chmods and chowns */ + showexec:1, /* set = only set x bit for com/exe/bat */ + sys_immutable:1, /* set = system files are immutable */ + dotsOK:1, /* set = hidden and system files are named '.filename' */ + isvfat:1, /* 0=no vfat long filename support, 1=vfat support */ + utf8:1, /* Use of UTF-8 character set (Default) */ + unicode_xlate:1, /* create escape sequences for unhandled Unicode */ + numtail:1, /* Does first alias have a numeric '~1' type tail? */ + flush:1, /* write things quickly */ + nocase:1, /* Does this need case conversion? 0=need case conversion*/ + usefree:1, /* Use free_clusters for FAT32 */ + tz_utc:1, /* Filesystem timestamps are in UTC */ + rodir:1, /* allow ATTR_RO for directory */ + discard:1, /* Issue discard requests on deletions */ + nfs:1; /* Do extra work needed for NFS export */ }; #define FAT_HASH_BITS 8 @@ -56,28 +58,28 @@ struct fat_mount_options { * MS-DOS file system in-core superblock data */ struct msdos_sb_info { - unsigned short sec_per_clus; /* sectors/cluster */ - unsigned short cluster_bits; /* log2(cluster_size) */ - unsigned int cluster_size; /* cluster size */ - unsigned char fats,fat_bits; /* number of FATs, FAT bits (12 or 16) */ + unsigned short sec_per_clus; /* sectors/cluster */ + unsigned short cluster_bits; /* log2(cluster_size) */ + unsigned int cluster_size; /* cluster size */ + unsigned char fats, fat_bits; /* number of FATs, FAT bits (12 or 16) */ unsigned short fat_start; - unsigned long fat_length; /* FAT start & length (sec.) */ + unsigned long fat_length; /* FAT start & length (sec.) */ unsigned long dir_start; - unsigned short dir_entries; /* root dir start & entries */ - unsigned long data_start; /* first data sector */ - unsigned long max_cluster; /* maximum cluster number */ - unsigned long root_cluster; /* first cluster of the root directory */ - unsigned long fsinfo_sector; /* sector number of FAT32 fsinfo */ + unsigned short dir_entries; /* root dir start & entries */ + unsigned long data_start; /* first data sector */ + unsigned long max_cluster; /* maximum cluster number */ + unsigned long root_cluster; /* first cluster of the root directory */ + unsigned long fsinfo_sector; /* sector number of FAT32 fsinfo */ struct mutex fat_lock; - unsigned int prev_free; /* previously allocated cluster number */ - unsigned int free_clusters; /* -1 if undefined */ + unsigned int prev_free; /* previously allocated cluster number */ + unsigned int free_clusters; /* -1 if undefined */ unsigned int free_clus_valid; /* is free_clusters valid? */ struct fat_mount_options options; - struct nls_table *nls_disk; /* Codepage used on disk */ - struct nls_table *nls_io; /* Charset used for input and display */ - const void *dir_ops; /* Opaque; default directory operations */ - int dir_per_block; /* dir entries per block */ - int dir_per_block_bits; /* log2(dir_per_block) */ + struct nls_table *nls_disk; /* Codepage used on disk */ + struct nls_table *nls_io; /* Charset used for input and display */ + const void *dir_ops; /* Opaque; default directory operations */ + int dir_per_block; /* dir entries per block */ + int dir_per_block_bits; /* log2(dir_per_block) */ int fatent_shift; struct fatent_operations *fatent_ops; @@ -88,6 +90,9 @@ struct msdos_sb_info { spinlock_t inode_hash_lock; struct hlist_head inode_hashtable[FAT_HASH_SIZE]; + + spinlock_t dir_hash_lock; + struct hlist_head dir_hashtable[FAT_HASH_SIZE]; }; #define FAT_CACHE_VALID 0 /* special case for valid cache */ @@ -110,6 +115,7 @@ struct msdos_inode_info { int i_attrs; /* unused attribute bits */ loff_t i_pos; /* on-disk position of directory entry or 0 */ struct hlist_node i_fat_hash; /* hash by i_location */ + struct hlist_node i_dir_hash; /* hash by i_logstart */ struct rw_semaphore truncate_lock; /* protect bmap against truncate */ struct inode vfs_inode; }; @@ -262,7 +268,7 @@ extern int fat_subdirs(struct inode *dir); extern int fat_scan(struct inode *dir, const unsigned char *name, struct fat_slot_info *sinfo); extern int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh, - struct msdos_dir_entry **de, loff_t *i_pos); + struct msdos_dir_entry **de); extern int fat_alloc_new_dir(struct inode *dir, struct timespec *ts); extern int fat_add_entries(struct inode *dir, void *slots, int nr_slots, struct fat_slot_info *sinfo); @@ -322,7 +328,7 @@ extern long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); extern const struct file_operations fat_file_operations; extern const struct inode_operations fat_file_inode_operations; -extern int fat_setattr(struct dentry * dentry, struct iattr * attr); +extern int fat_setattr(struct dentry *dentry, struct iattr *attr); extern void fat_truncate_blocks(struct inode *inode, loff_t offset); extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); @@ -340,7 +346,12 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, void (*setup)(struct super_block *)); extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, - struct inode *i2); + struct inode *i2); +static inline unsigned long fat_dir_hash(int logstart) +{ + return hash_32(logstart, FAT_HASH_BITS); +} + /* fat/misc.c */ extern __printf(3, 4) __cold void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...); @@ -366,6 +377,14 @@ extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs); int fat_cache_init(void); void fat_cache_destroy(void); +/* fat/nfs.c */ +struct fid; +extern struct dentry *fat_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type); +extern struct dentry *fat_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type); +extern struct dentry *fat_get_parent(struct dentry *child_dir); + /* helper for printk */ typedef unsigned long long llu; diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 31f08ab..260705c 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -186,9 +186,6 @@ static void fat16_ent_put(struct fat_entry *fatent, int new) static void fat32_ent_put(struct fat_entry *fatent, int new) { - if (new == FAT_ENT_EOF) - new = EOF_FAT32; - WARN_ON(new & 0xf0000000); new |= le32_to_cpu(*fatent->u.ent32_p) & ~0x0fffffff; *fatent->u.ent32_p = cpu_to_le32(new); @@ -203,15 +200,18 @@ static int fat12_ent_next(struct fat_entry *fatent) fatent->entry++; if (fatent->nr_bhs == 1) { - WARN_ON(ent12_p[0] > (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 2))); - WARN_ON(ent12_p[1] > (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 1))); + WARN_ON(ent12_p[0] > (u8 *)(bhs[0]->b_data + + (bhs[0]->b_size - 2))); + WARN_ON(ent12_p[1] > (u8 *)(bhs[0]->b_data + + (bhs[0]->b_size - 1))); if (nextp < (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 1))) { ent12_p[0] = nextp - 1; ent12_p[1] = nextp; return 1; } } else { - WARN_ON(ent12_p[0] != (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 1))); + WARN_ON(ent12_p[0] != (u8 *)(bhs[0]->b_data + + (bhs[0]->b_size - 1))); WARN_ON(ent12_p[1] != (u8 *)bhs[1]->b_data); ent12_p[0] = nextp - 1; ent12_p[1] = nextp; @@ -631,7 +631,6 @@ error: return err; } - EXPORT_SYMBOL_GPL(fat_free_clusters); /* 128kb is the whole sectors for FAT12 and FAT16 */ diff --git a/fs/fat/file.c b/fs/fat/file.c index e007b8b..a62e0ec 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -352,7 +352,7 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode) { umode_t allow_utime = sbi->options.allow_utime; - if (current_fsuid() != inode->i_uid) { + if (!uid_eq(current_fsuid(), inode->i_uid)) { if (in_group_p(inode->i_gid)) allow_utime >>= 3; if (allow_utime & MAY_WRITE) @@ -407,9 +407,9 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) } if (((attr->ia_valid & ATTR_UID) && - (attr->ia_uid != sbi->options.fs_uid)) || + (!uid_eq(attr->ia_uid, sbi->options.fs_uid))) || ((attr->ia_valid & ATTR_GID) && - (attr->ia_gid != sbi->options.fs_gid)) || + (!gid_eq(attr->ia_gid, sbi->options.fs_gid))) || ((attr->ia_valid & ATTR_MODE) && (attr->ia_mode & ~FAT_VALID_MODE))) error = -EPERM; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 05e897f..76f60c6 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -281,15 +281,42 @@ static inline unsigned long fat_hash(loff_t i_pos) return hash_32(i_pos, FAT_HASH_BITS); } +static void dir_hash_init(struct super_block *sb) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + int i; + + spin_lock_init(&sbi->dir_hash_lock); + for (i = 0; i < FAT_HASH_SIZE; i++) + INIT_HLIST_HEAD(&sbi->dir_hashtable[i]); +} + void fat_attach(struct inode *inode, loff_t i_pos) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); - struct hlist_head *head = sbi->inode_hashtable + fat_hash(i_pos); - spin_lock(&sbi->inode_hash_lock); - MSDOS_I(inode)->i_pos = i_pos; - hlist_add_head(&MSDOS_I(inode)->i_fat_hash, head); - spin_unlock(&sbi->inode_hash_lock); + if (inode->i_ino != MSDOS_ROOT_INO) { + struct hlist_head *head = sbi->inode_hashtable + + fat_hash(i_pos); + + spin_lock(&sbi->inode_hash_lock); + MSDOS_I(inode)->i_pos = i_pos; + hlist_add_head(&MSDOS_I(inode)->i_fat_hash, head); + spin_unlock(&sbi->inode_hash_lock); + } + + /* If NFS support is enabled, cache the mapping of start cluster + * to directory inode. This is used during reconnection of + * dentries to the filesystem root. + */ + if (S_ISDIR(inode->i_mode) && sbi->options.nfs) { + struct hlist_head *d_head = sbi->dir_hashtable; + d_head += fat_dir_hash(MSDOS_I(inode)->i_logstart); + + spin_lock(&sbi->dir_hash_lock); + hlist_add_head(&MSDOS_I(inode)->i_dir_hash, d_head); + spin_unlock(&sbi->dir_hash_lock); + } } EXPORT_SYMBOL_GPL(fat_attach); @@ -300,6 +327,12 @@ void fat_detach(struct inode *inode) MSDOS_I(inode)->i_pos = 0; hlist_del_init(&MSDOS_I(inode)->i_fat_hash); spin_unlock(&sbi->inode_hash_lock); + + if (S_ISDIR(inode->i_mode) && sbi->options.nfs) { + spin_lock(&sbi->dir_hash_lock); + hlist_del_init(&MSDOS_I(inode)->i_dir_hash); + spin_unlock(&sbi->dir_hash_lock); + } } EXPORT_SYMBOL_GPL(fat_detach); @@ -504,6 +537,7 @@ static void init_once(void *foo) ei->cache_valid_id = FAT_CACHE_VALID + 1; INIT_LIST_HEAD(&ei->cache_lru); INIT_HLIST_NODE(&ei->i_fat_hash); + INIT_HLIST_NODE(&ei->i_dir_hash); inode_init_once(&ei->vfs_inode); } @@ -521,6 +555,11 @@ static int __init fat_init_inodecache(void) static void __exit fat_destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(fat_inode_cachep); } @@ -663,125 +702,9 @@ static const struct super_operations fat_sops = { .show_options = fat_show_options, }; -/* - * a FAT file handle with fhtype 3 is - * 0/ i_ino - for fast, reliable lookup if still in the cache - * 1/ i_generation - to see if i_ino is still valid - * bit 0 == 0 iff directory - * 2/ i_pos(8-39) - if ino has changed, but still in cache - * 3/ i_pos(4-7)|i_logstart - to semi-verify inode found at i_pos - * 4/ i_pos(0-3)|parent->i_logstart - maybe used to hunt for the file on disc - * - * Hack for NFSv2: Maximum FAT entry number is 28bits and maximum - * i_pos is 40bits (blocknr(32) + dir offset(8)), so two 4bits - * of i_logstart is used to store the directory entry offset. - */ - -static struct dentry *fat_fh_to_dentry(struct super_block *sb, - struct fid *fid, int fh_len, int fh_type) -{ - struct inode *inode = NULL; - u32 *fh = fid->raw; - - if (fh_len < 5 || fh_type != 3) - return NULL; - - inode = ilookup(sb, fh[0]); - if (!inode || inode->i_generation != fh[1]) { - if (inode) - iput(inode); - inode = NULL; - } - if (!inode) { - loff_t i_pos; - int i_logstart = fh[3] & 0x0fffffff; - - i_pos = (loff_t)fh[2] << 8; - i_pos |= ((fh[3] >> 24) & 0xf0) | (fh[4] >> 28); - - /* try 2 - see if i_pos is in F-d-c - * require i_logstart to be the same - * Will fail if you truncate and then re-write - */ - - inode = fat_iget(sb, i_pos); - if (inode && MSDOS_I(inode)->i_logstart != i_logstart) { - iput(inode); - inode = NULL; - } - } - - /* - * For now, do nothing if the inode is not found. - * - * What we could do is: - * - * - follow the file starting at fh[4], and record the ".." entry, - * and the name of the fh[2] entry. - * - then follow the ".." file finding the next step up. - * - * This way we build a path to the root of the tree. If this works, we - * lookup the path and so get this inode into the cache. Finally try - * the fat_iget lookup again. If that fails, then we are totally out - * of luck. But all that is for another day - */ - return d_obtain_alias(inode); -} - -static int -fat_encode_fh(struct inode *inode, __u32 *fh, int *lenp, struct inode *parent) -{ - int len = *lenp; - struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); - loff_t i_pos; - - if (len < 5) { - *lenp = 5; - return 255; /* no room */ - } - - i_pos = fat_i_pos_read(sbi, inode); - *lenp = 5; - fh[0] = inode->i_ino; - fh[1] = inode->i_generation; - fh[2] = i_pos >> 8; - fh[3] = ((i_pos & 0xf0) << 24) | MSDOS_I(inode)->i_logstart; - fh[4] = (i_pos & 0x0f) << 28; - if (parent) - fh[4] |= MSDOS_I(parent)->i_logstart; - return 3; -} - -static struct dentry *fat_get_parent(struct dentry *child) -{ - struct super_block *sb = child->d_sb; - struct buffer_head *bh; - struct msdos_dir_entry *de; - loff_t i_pos; - struct dentry *parent; - struct inode *inode; - int err; - - lock_super(sb); - - err = fat_get_dotdot_entry(child->d_inode, &bh, &de, &i_pos); - if (err) { - parent = ERR_PTR(err); - goto out; - } - inode = fat_build_inode(sb, de, i_pos); - brelse(bh); - - parent = d_obtain_alias(inode); -out: - unlock_super(sb); - - return parent; -} - static const struct export_operations fat_export_ops = { - .encode_fh = fat_encode_fh, .fh_to_dentry = fat_fh_to_dentry, + .fh_to_parent = fat_fh_to_parent, .get_parent = fat_get_parent, }; @@ -791,10 +714,12 @@ static int fat_show_options(struct seq_file *m, struct dentry *root) struct fat_mount_options *opts = &sbi->options; int isvfat = opts->isvfat; - if (opts->fs_uid != 0) - seq_printf(m, ",uid=%u", opts->fs_uid); - if (opts->fs_gid != 0) - seq_printf(m, ",gid=%u", opts->fs_gid); + if (!uid_eq(opts->fs_uid, GLOBAL_ROOT_UID)) + seq_printf(m, ",uid=%u", + from_kuid_munged(&init_user_ns, opts->fs_uid)); + if (!gid_eq(opts->fs_gid, GLOBAL_ROOT_GID)) + seq_printf(m, ",gid=%u", + from_kgid_munged(&init_user_ns, opts->fs_gid)); seq_printf(m, ",fmask=%04o", opts->fs_fmask); seq_printf(m, ",dmask=%04o", opts->fs_dmask); if (opts->allow_utime) @@ -829,6 +754,8 @@ static int fat_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",usefree"); if (opts->quiet) seq_puts(m, ",quiet"); + if (opts->nfs) + seq_puts(m, ",nfs"); if (opts->showexec) seq_puts(m, ",showexec"); if (opts->sys_immutable) @@ -873,7 +800,7 @@ enum { Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes, Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont, - Opt_err_panic, Opt_err_ro, Opt_discard, Opt_err, + Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_err, }; static const match_table_t fat_tokens = { @@ -902,6 +829,7 @@ static const match_table_t fat_tokens = { {Opt_err_panic, "errors=panic"}, {Opt_err_ro, "errors=remount-ro"}, {Opt_discard, "discard"}, + {Opt_nfs, "nfs"}, {Opt_obsolete, "conv=binary"}, {Opt_obsolete, "conv=text"}, {Opt_obsolete, "conv=auto"}, @@ -982,6 +910,7 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat, opts->numtail = 1; opts->usefree = opts->nocase = 0; opts->tz_utc = 0; + opts->nfs = 0; opts->errors = FAT_ERRORS_RO; *debug = 0; @@ -1037,12 +966,16 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat, case Opt_uid: if (match_int(&args[0], &option)) return 0; - opts->fs_uid = option; + opts->fs_uid = make_kuid(current_user_ns(), option); + if (!uid_valid(opts->fs_uid)) + return 0; break; case Opt_gid: if (match_int(&args[0], &option)) return 0; - opts->fs_gid = option; + opts->fs_gid = make_kgid(current_user_ns(), option); + if (!gid_valid(opts->fs_gid)) + return 0; break; case Opt_umask: if (match_octal(&args[0], &option)) @@ -1142,6 +1075,9 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat, case Opt_discard: opts->discard = 1; break; + case Opt_nfs: + opts->nfs = 1; + break; /* obsolete mount options */ case Opt_obsolete: @@ -1432,6 +1368,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, /* set up enough so that it can read an inode */ fat_hash_init(sb); + dir_hash_init(sb); fat_ent_access_init(sb); /* @@ -1486,6 +1423,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, } error = -ENOMEM; insert_inode_hash(root_inode); + fat_attach(root_inode, 0); sb->s_root = d_make_root(root_inode); if (!sb->s_root) { fat_msg(sb, KERN_ERR, "get root inode failed"); @@ -1525,18 +1463,14 @@ static int writeback_inode(struct inode *inode) { int ret; - struct address_space *mapping = inode->i_mapping; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - .nr_to_write = 0, - }; - /* if we used WB_SYNC_ALL, sync_inode waits for the io for the - * inode to finish. So WB_SYNC_NONE is sent down to sync_inode + + /* if we used wait=1, sync_inode_metadata waits for the io for the + * inode to finish. So wait=0 is sent down to sync_inode_metadata * and filemap_fdatawrite is used for the data blocks */ - ret = sync_inode(inode, &wbc); + ret = sync_inode_metadata(inode, 0); if (!ret) - ret = filemap_fdatawrite(mapping); + ret = filemap_fdatawrite(inode->i_mapping); return ret; } diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index b0e12bf..c1055e7 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -407,7 +407,7 @@ out: static int msdos_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode = dentry->d_inode; - struct super_block *sb= inode->i_sb; + struct super_block *sb = inode->i_sb; struct fat_slot_info sinfo; int err; @@ -440,7 +440,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name, struct inode *old_inode, *new_inode; struct fat_slot_info old_sinfo, sinfo; struct timespec ts; - loff_t dotdot_i_pos, new_i_pos; + loff_t new_i_pos; int err, old_attrs, is_dir, update_dotdot, corrupt = 0; old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; @@ -456,8 +456,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name, is_dir = S_ISDIR(old_inode->i_mode); update_dotdot = (is_dir && old_dir != new_dir); if (update_dotdot) { - if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de, - &dotdot_i_pos) < 0) { + if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de)) { err = -EIO; goto out; } diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 6a6d8c0..e535dd7 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -914,7 +914,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *old_inode, *new_inode; struct fat_slot_info old_sinfo, sinfo; struct timespec ts; - loff_t dotdot_i_pos, new_i_pos; + loff_t new_i_pos; int err, is_dir, update_dotdot, corrupt = 0; struct super_block *sb = old_dir->i_sb; @@ -929,8 +929,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, is_dir = S_ISDIR(old_inode->i_mode); update_dotdot = (is_dir && old_dir != new_dir); if (update_dotdot) { - if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de, - &dotdot_i_pos) < 0) { + if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de)) { err = -EIO; goto out; } diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c new file mode 100644 index 0000000..ef4b5fa --- /dev/null +++ b/fs/fat/nfs.c @@ -0,0 +1,101 @@ +/* fs/fat/nfs.c + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include <linux/exportfs.h> +#include "fat.h" + +/** + * Look up a directory inode given its starting cluster. + */ +static struct inode *fat_dget(struct super_block *sb, int i_logstart) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct hlist_head *head; + struct hlist_node *_p; + struct msdos_inode_info *i; + struct inode *inode = NULL; + + head = sbi->dir_hashtable + fat_dir_hash(i_logstart); + spin_lock(&sbi->dir_hash_lock); + hlist_for_each_entry(i, _p, head, i_dir_hash) { + BUG_ON(i->vfs_inode.i_sb != sb); + if (i->i_logstart != i_logstart) + continue; + inode = igrab(&i->vfs_inode); + if (inode) + break; + } + spin_unlock(&sbi->dir_hash_lock); + return inode; +} + +static struct inode *fat_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct inode *inode; + + if ((ino < MSDOS_ROOT_INO) || (ino == MSDOS_FSINFO_INO)) + return NULL; + + inode = ilookup(sb, ino); + if (inode && generation && (inode->i_generation != generation)) { + iput(inode); + inode = NULL; + } + + return inode; +} + +/** + * Map a NFS file handle to a corresponding dentry. + * The dentry may or may not be connected to the filesystem root. + */ +struct dentry *fat_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + fat_nfs_get_inode); +} + +/* + * Find the parent for a file specified by NFS handle. + * This requires that the handle contain the i_ino of the parent. + */ +struct dentry *fat_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + fat_nfs_get_inode); +} + +/* + * Find the parent for a directory that is not currently connected to + * the filesystem root. + * + * On entry, the caller holds child_dir->d_inode->i_mutex. + */ +struct dentry *fat_get_parent(struct dentry *child_dir) +{ + struct super_block *sb = child_dir->d_sb; + struct buffer_head *bh = NULL; + struct msdos_dir_entry *de; + struct inode *parent_inode = NULL; + + if (!fat_get_dotdot_entry(child_dir->d_inode, &bh, &de)) { + int parent_logstart = fat_get_start(MSDOS_SB(sb), de); + parent_inode = fat_dget(sb, parent_logstart); + } + brelse(bh); + + return d_obtain_alias(parent_inode); +} @@ -26,124 +26,6 @@ #include <asm/siginfo.h> #include <asm/uaccess.h> -void set_close_on_exec(unsigned int fd, int flag) -{ - struct files_struct *files = current->files; - struct fdtable *fdt; - spin_lock(&files->file_lock); - fdt = files_fdtable(files); - if (flag) - __set_close_on_exec(fd, fdt); - else - __clear_close_on_exec(fd, fdt); - spin_unlock(&files->file_lock); -} - -static bool get_close_on_exec(unsigned int fd) -{ - struct files_struct *files = current->files; - struct fdtable *fdt; - bool res; - rcu_read_lock(); - fdt = files_fdtable(files); - res = close_on_exec(fd, fdt); - rcu_read_unlock(); - return res; -} - -SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) -{ - int err = -EBADF; - struct file * file, *tofree; - struct files_struct * files = current->files; - struct fdtable *fdt; - - if ((flags & ~O_CLOEXEC) != 0) - return -EINVAL; - - if (unlikely(oldfd == newfd)) - return -EINVAL; - - spin_lock(&files->file_lock); - err = expand_files(files, newfd); - file = fcheck(oldfd); - if (unlikely(!file)) - goto Ebadf; - if (unlikely(err < 0)) { - if (err == -EMFILE) - goto Ebadf; - goto out_unlock; - } - /* - * We need to detect attempts to do dup2() over allocated but still - * not finished descriptor. NB: OpenBSD avoids that at the price of - * extra work in their equivalent of fget() - they insert struct - * file immediately after grabbing descriptor, mark it larval if - * more work (e.g. actual opening) is needed and make sure that - * fget() treats larval files as absent. Potentially interesting, - * but while extra work in fget() is trivial, locking implications - * and amount of surgery on open()-related paths in VFS are not. - * FreeBSD fails with -EBADF in the same situation, NetBSD "solution" - * deadlocks in rather amusing ways, AFAICS. All of that is out of - * scope of POSIX or SUS, since neither considers shared descriptor - * tables and this condition does not arise without those. - */ - err = -EBUSY; - fdt = files_fdtable(files); - tofree = fdt->fd[newfd]; - if (!tofree && fd_is_open(newfd, fdt)) - goto out_unlock; - get_file(file); - rcu_assign_pointer(fdt->fd[newfd], file); - __set_open_fd(newfd, fdt); - if (flags & O_CLOEXEC) - __set_close_on_exec(newfd, fdt); - else - __clear_close_on_exec(newfd, fdt); - spin_unlock(&files->file_lock); - - if (tofree) - filp_close(tofree, files); - - return newfd; - -Ebadf: - err = -EBADF; -out_unlock: - spin_unlock(&files->file_lock); - return err; -} - -SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) -{ - if (unlikely(newfd == oldfd)) { /* corner case */ - struct files_struct *files = current->files; - int retval = oldfd; - - rcu_read_lock(); - if (!fcheck_files(files, oldfd)) - retval = -EBADF; - rcu_read_unlock(); - return retval; - } - return sys_dup3(oldfd, newfd, 0); -} - -SYSCALL_DEFINE1(dup, unsigned int, fildes) -{ - int ret = -EBADF; - struct file *file = fget_raw(fildes); - - if (file) { - ret = get_unused_fd(); - if (ret >= 0) - fd_install(ret, file); - else - fput(file); - } - return ret; -} - #define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME) static int setfl(int fd, struct file * filp, unsigned long arg) @@ -267,7 +149,7 @@ pid_t f_getown(struct file *filp) static int f_setown_ex(struct file *filp, unsigned long arg) { - struct f_owner_ex * __user owner_p = (void * __user)arg; + struct f_owner_ex __user *owner_p = (void __user *)arg; struct f_owner_ex owner; struct pid *pid; int type; @@ -307,7 +189,7 @@ static int f_setown_ex(struct file *filp, unsigned long arg) static int f_getown_ex(struct file *filp, unsigned long arg) { - struct f_owner_ex * __user owner_p = (void * __user)arg; + struct f_owner_ex __user *owner_p = (void __user *)arg; struct f_owner_ex owner; int ret = 0; @@ -345,7 +227,7 @@ static int f_getown_ex(struct file *filp, unsigned long arg) static int f_getowner_uids(struct file *filp, unsigned long arg) { struct user_namespace *user_ns = current_user_ns(); - uid_t * __user dst = (void * __user)arg; + uid_t __user *dst = (void __user *)arg; uid_t src[2]; int err; @@ -373,14 +255,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, switch (cmd) { case F_DUPFD: + err = f_dupfd(arg, filp, 0); + break; case F_DUPFD_CLOEXEC: - if (arg >= rlimit(RLIMIT_NOFILE)) - break; - err = alloc_fd(arg, cmd == F_DUPFD_CLOEXEC ? O_CLOEXEC : 0); - if (err >= 0) { - get_file(filp); - fd_install(err, filp); - } + err = f_dupfd(arg, filp, O_CLOEXEC); break; case F_GETFD: err = get_close_on_exec(fd) ? FD_CLOEXEC : 0; @@ -470,25 +348,23 @@ static int check_fcntl_cmd(unsigned cmd) SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { - struct file *filp; - int fput_needed; + struct fd f = fdget_raw(fd); long err = -EBADF; - filp = fget_raw_light(fd, &fput_needed); - if (!filp) + if (!f.file) goto out; - if (unlikely(filp->f_mode & FMODE_PATH)) { + if (unlikely(f.file->f_mode & FMODE_PATH)) { if (!check_fcntl_cmd(cmd)) goto out1; } - err = security_file_fcntl(filp, cmd, arg); + err = security_file_fcntl(f.file, cmd, arg); if (!err) - err = do_fcntl(fd, cmd, arg, filp); + err = do_fcntl(fd, cmd, arg, f.file); out1: - fput_light(filp, fput_needed); + fdput(f); out: return err; } @@ -497,38 +373,36 @@ out: SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { - struct file * filp; + struct fd f = fdget_raw(fd); long err = -EBADF; - int fput_needed; - filp = fget_raw_light(fd, &fput_needed); - if (!filp) + if (!f.file) goto out; - if (unlikely(filp->f_mode & FMODE_PATH)) { + if (unlikely(f.file->f_mode & FMODE_PATH)) { if (!check_fcntl_cmd(cmd)) goto out1; } - err = security_file_fcntl(filp, cmd, arg); + err = security_file_fcntl(f.file, cmd, arg); if (err) goto out1; switch (cmd) { case F_GETLK64: - err = fcntl_getlk64(filp, (struct flock64 __user *) arg); + err = fcntl_getlk64(f.file, (struct flock64 __user *) arg); break; case F_SETLK64: case F_SETLKW64: - err = fcntl_setlk64(fd, filp, cmd, + err = fcntl_setlk64(fd, f.file, cmd, (struct flock64 __user *) arg); break; default: - err = do_fcntl(fd, cmd, arg, filp); + err = do_fcntl(fd, cmd, arg, f.file); break; } out1: - fput_light(filp, fput_needed); + fdput(f); out: return err; } diff --git a/fs/fhandle.c b/fs/fhandle.c index a48e4a1..f775bfd 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -113,24 +113,21 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, static struct vfsmount *get_vfsmount_from_fd(int fd) { - struct path path; + struct vfsmount *mnt; if (fd == AT_FDCWD) { struct fs_struct *fs = current->fs; spin_lock(&fs->lock); - path = fs->pwd; - mntget(path.mnt); + mnt = mntget(fs->pwd.mnt); spin_unlock(&fs->lock); } else { - int fput_needed; - struct file *file = fget_light(fd, &fput_needed); - if (!file) + struct fd f = fdget(fd); + if (!f.file) return ERR_PTR(-EBADF); - path = file->f_path; - mntget(path.mnt); - fput_light(file, fput_needed); + mnt = mntget(f.file->f_path.mnt); + fdput(f); } - return path.mnt; + return mnt; } static int vfs_dentry_acceptable(void *context, struct dentry *dentry) @@ -6,6 +6,7 @@ * Manage the dynamic fd arrays in the process files_struct. */ +#include <linux/syscalls.h> #include <linux/export.h> #include <linux/fs.h> #include <linux/mm.h> @@ -84,22 +85,14 @@ static void free_fdtable_work(struct work_struct *work) } } -void free_fdtable_rcu(struct rcu_head *rcu) +static void free_fdtable_rcu(struct rcu_head *rcu) { struct fdtable *fdt = container_of(rcu, struct fdtable, rcu); struct fdtable_defer *fddef; BUG_ON(!fdt); + BUG_ON(fdt->max_fds <= NR_OPEN_DEFAULT); - if (fdt->max_fds <= NR_OPEN_DEFAULT) { - /* - * This fdtable is embedded in the files structure and that - * structure itself is getting destroyed. - */ - kmem_cache_free(files_cachep, - container_of(fdt, struct files_struct, fdtab)); - return; - } if (!is_vmalloc_addr(fdt->fd) && !is_vmalloc_addr(fdt->open_fds)) { kfree(fdt->fd); kfree(fdt->open_fds); @@ -229,7 +222,7 @@ static int expand_fdtable(struct files_struct *files, int nr) copy_fdtable(new_fdt, cur_fdt); rcu_assign_pointer(files->fdt, new_fdt); if (cur_fdt->max_fds > NR_OPEN_DEFAULT) - free_fdtable(cur_fdt); + call_rcu(&cur_fdt->rcu, free_fdtable_rcu); } else { /* Somebody else expanded, so undo our attempt */ __free_fdtable(new_fdt); @@ -245,19 +238,12 @@ static int expand_fdtable(struct files_struct *files, int nr) * expanded and execution may have blocked. * The files->file_lock should be held on entry, and will be held on exit. */ -int expand_files(struct files_struct *files, int nr) +static int expand_files(struct files_struct *files, int nr) { struct fdtable *fdt; fdt = files_fdtable(files); - /* - * N.B. For clone tasks sharing a files structure, this test - * will limit the total number of files that can be opened. - */ - if (nr >= rlimit(RLIMIT_NOFILE)) - return -EMFILE; - /* Do we need to expand? */ if (nr < fdt->max_fds) return 0; @@ -270,6 +256,26 @@ int expand_files(struct files_struct *files, int nr) return expand_fdtable(files, nr); } +static inline void __set_close_on_exec(int fd, struct fdtable *fdt) +{ + __set_bit(fd, fdt->close_on_exec); +} + +static inline void __clear_close_on_exec(int fd, struct fdtable *fdt) +{ + __clear_bit(fd, fdt->close_on_exec); +} + +static inline void __set_open_fd(int fd, struct fdtable *fdt) +{ + __set_bit(fd, fdt->open_fds); +} + +static inline void __clear_open_fd(int fd, struct fdtable *fdt) +{ + __clear_bit(fd, fdt->open_fds); +} + static int count_open_files(struct fdtable *fdt) { int size = fdt->max_fds; @@ -395,6 +401,95 @@ out: return NULL; } +static void close_files(struct files_struct * files) +{ + int i, j; + struct fdtable *fdt; + + j = 0; + + /* + * It is safe to dereference the fd table without RCU or + * ->file_lock because this is the last reference to the + * files structure. But use RCU to shut RCU-lockdep up. + */ + rcu_read_lock(); + fdt = files_fdtable(files); + rcu_read_unlock(); + for (;;) { + unsigned long set; + i = j * BITS_PER_LONG; + if (i >= fdt->max_fds) + break; + set = fdt->open_fds[j++]; + while (set) { + if (set & 1) { + struct file * file = xchg(&fdt->fd[i], NULL); + if (file) { + filp_close(file, files); + cond_resched(); + } + } + i++; + set >>= 1; + } + } +} + +struct files_struct *get_files_struct(struct task_struct *task) +{ + struct files_struct *files; + + task_lock(task); + files = task->files; + if (files) + atomic_inc(&files->count); + task_unlock(task); + + return files; +} + +void put_files_struct(struct files_struct *files) +{ + struct fdtable *fdt; + + if (atomic_dec_and_test(&files->count)) { + close_files(files); + /* not really needed, since nobody can see us */ + rcu_read_lock(); + fdt = files_fdtable(files); + rcu_read_unlock(); + /* free the arrays if they are not embedded */ + if (fdt != &files->fdtab) + __free_fdtable(fdt); + kmem_cache_free(files_cachep, files); + } +} + +void reset_files_struct(struct files_struct *files) +{ + struct task_struct *tsk = current; + struct files_struct *old; + + old = tsk->files; + task_lock(tsk); + tsk->files = files; + task_unlock(tsk); + put_files_struct(old); +} + +void exit_files(struct task_struct *tsk) +{ + struct files_struct * files = tsk->files; + + if (files) { + task_lock(tsk); + tsk->files = NULL; + task_unlock(tsk); + put_files_struct(files); + } +} + static void __devinit fdtable_defer_list_init(int cpu) { struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu); @@ -424,12 +519,18 @@ struct files_struct init_files = { .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), }; +void daemonize_descriptors(void) +{ + atomic_inc(&init_files.count); + reset_files_struct(&init_files); +} + /* * allocate a file descriptor, mark it busy. */ -int alloc_fd(unsigned start, unsigned flags) +int __alloc_fd(struct files_struct *files, + unsigned start, unsigned end, unsigned flags) { - struct files_struct *files = current->files; unsigned int fd; int error; struct fdtable *fdt; @@ -444,6 +545,14 @@ repeat: if (fd < fdt->max_fds) fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd); + /* + * N.B. For clone tasks sharing a files structure, this test + * will limit the total number of files that can be opened. + */ + error = -EMFILE; + if (fd >= end) + goto out; + error = expand_files(files, fd); if (error < 0) goto out; @@ -477,8 +586,424 @@ out: return error; } -int get_unused_fd(void) +static int alloc_fd(unsigned start, unsigned flags) +{ + return __alloc_fd(current->files, start, rlimit(RLIMIT_NOFILE), flags); +} + +int get_unused_fd_flags(unsigned flags) +{ + return __alloc_fd(current->files, 0, rlimit(RLIMIT_NOFILE), flags); +} +EXPORT_SYMBOL(get_unused_fd_flags); + +static void __put_unused_fd(struct files_struct *files, unsigned int fd) +{ + struct fdtable *fdt = files_fdtable(files); + __clear_open_fd(fd, fdt); + if (fd < files->next_fd) + files->next_fd = fd; +} + +void put_unused_fd(unsigned int fd) +{ + struct files_struct *files = current->files; + spin_lock(&files->file_lock); + __put_unused_fd(files, fd); + spin_unlock(&files->file_lock); +} + +EXPORT_SYMBOL(put_unused_fd); + +/* + * Install a file pointer in the fd array. + * + * The VFS is full of places where we drop the files lock between + * setting the open_fds bitmap and installing the file in the file + * array. At any such point, we are vulnerable to a dup2() race + * installing a file in the array before us. We need to detect this and + * fput() the struct file we are about to overwrite in this case. + * + * It should never happen - if we allow dup2() do it, _really_ bad things + * will follow. + * + * NOTE: __fd_install() variant is really, really low-level; don't + * use it unless you are forced to by truly lousy API shoved down + * your throat. 'files' *MUST* be either current->files or obtained + * by get_files_struct(current) done by whoever had given it to you, + * or really bad things will happen. Normally you want to use + * fd_install() instead. + */ + +void __fd_install(struct files_struct *files, unsigned int fd, + struct file *file) +{ + struct fdtable *fdt; + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + BUG_ON(fdt->fd[fd] != NULL); + rcu_assign_pointer(fdt->fd[fd], file); + spin_unlock(&files->file_lock); +} + +void fd_install(unsigned int fd, struct file *file) { - return alloc_fd(0, 0); + __fd_install(current->files, fd, file); +} + +EXPORT_SYMBOL(fd_install); + +/* + * The same warnings as for __alloc_fd()/__fd_install() apply here... + */ +int __close_fd(struct files_struct *files, unsigned fd) +{ + struct file *file; + struct fdtable *fdt; + + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + if (fd >= fdt->max_fds) + goto out_unlock; + file = fdt->fd[fd]; + if (!file) + goto out_unlock; + rcu_assign_pointer(fdt->fd[fd], NULL); + __clear_close_on_exec(fd, fdt); + __put_unused_fd(files, fd); + spin_unlock(&files->file_lock); + return filp_close(file, files); + +out_unlock: + spin_unlock(&files->file_lock); + return -EBADF; +} + +void do_close_on_exec(struct files_struct *files) +{ + unsigned i; + struct fdtable *fdt; + + /* exec unshares first */ + BUG_ON(atomic_read(&files->count) != 1); + spin_lock(&files->file_lock); + for (i = 0; ; i++) { + unsigned long set; + unsigned fd = i * BITS_PER_LONG; + fdt = files_fdtable(files); + if (fd >= fdt->max_fds) + break; + set = fdt->close_on_exec[i]; + if (!set) + continue; + fdt->close_on_exec[i] = 0; + for ( ; set ; fd++, set >>= 1) { + struct file *file; + if (!(set & 1)) + continue; + file = fdt->fd[fd]; + if (!file) + continue; + rcu_assign_pointer(fdt->fd[fd], NULL); + __put_unused_fd(files, fd); + spin_unlock(&files->file_lock); + filp_close(file, files); + cond_resched(); + spin_lock(&files->file_lock); + } + + } + spin_unlock(&files->file_lock); +} + +struct file *fget(unsigned int fd) +{ + struct file *file; + struct files_struct *files = current->files; + + rcu_read_lock(); + file = fcheck_files(files, fd); + if (file) { + /* File object ref couldn't be taken */ + if (file->f_mode & FMODE_PATH || + !atomic_long_inc_not_zero(&file->f_count)) + file = NULL; + } + rcu_read_unlock(); + + return file; +} + +EXPORT_SYMBOL(fget); + +struct file *fget_raw(unsigned int fd) +{ + struct file *file; + struct files_struct *files = current->files; + + rcu_read_lock(); + file = fcheck_files(files, fd); + if (file) { + /* File object ref couldn't be taken */ + if (!atomic_long_inc_not_zero(&file->f_count)) + file = NULL; + } + rcu_read_unlock(); + + return file; +} + +EXPORT_SYMBOL(fget_raw); + +/* + * Lightweight file lookup - no refcnt increment if fd table isn't shared. + * + * You can use this instead of fget if you satisfy all of the following + * conditions: + * 1) You must call fput_light before exiting the syscall and returning control + * to userspace (i.e. you cannot remember the returned struct file * after + * returning to userspace). + * 2) You must not call filp_close on the returned struct file * in between + * calls to fget_light and fput_light. + * 3) You must not clone the current task in between the calls to fget_light + * and fput_light. + * + * The fput_needed flag returned by fget_light should be passed to the + * corresponding fput_light. + */ +struct file *fget_light(unsigned int fd, int *fput_needed) +{ + struct file *file; + struct files_struct *files = current->files; + + *fput_needed = 0; + if (atomic_read(&files->count) == 1) { + file = fcheck_files(files, fd); + if (file && (file->f_mode & FMODE_PATH)) + file = NULL; + } else { + rcu_read_lock(); + file = fcheck_files(files, fd); + if (file) { + if (!(file->f_mode & FMODE_PATH) && + atomic_long_inc_not_zero(&file->f_count)) + *fput_needed = 1; + else + /* Didn't get the reference, someone's freed */ + file = NULL; + } + rcu_read_unlock(); + } + + return file; +} +EXPORT_SYMBOL(fget_light); + +struct file *fget_raw_light(unsigned int fd, int *fput_needed) +{ + struct file *file; + struct files_struct *files = current->files; + + *fput_needed = 0; + if (atomic_read(&files->count) == 1) { + file = fcheck_files(files, fd); + } else { + rcu_read_lock(); + file = fcheck_files(files, fd); + if (file) { + if (atomic_long_inc_not_zero(&file->f_count)) + *fput_needed = 1; + else + /* Didn't get the reference, someone's freed */ + file = NULL; + } + rcu_read_unlock(); + } + + return file; +} + +void set_close_on_exec(unsigned int fd, int flag) +{ + struct files_struct *files = current->files; + struct fdtable *fdt; + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + if (flag) + __set_close_on_exec(fd, fdt); + else + __clear_close_on_exec(fd, fdt); + spin_unlock(&files->file_lock); +} + +bool get_close_on_exec(unsigned int fd) +{ + struct files_struct *files = current->files; + struct fdtable *fdt; + bool res; + rcu_read_lock(); + fdt = files_fdtable(files); + res = close_on_exec(fd, fdt); + rcu_read_unlock(); + return res; +} + +static int do_dup2(struct files_struct *files, + struct file *file, unsigned fd, unsigned flags) +{ + struct file *tofree; + struct fdtable *fdt; + + /* + * We need to detect attempts to do dup2() over allocated but still + * not finished descriptor. NB: OpenBSD avoids that at the price of + * extra work in their equivalent of fget() - they insert struct + * file immediately after grabbing descriptor, mark it larval if + * more work (e.g. actual opening) is needed and make sure that + * fget() treats larval files as absent. Potentially interesting, + * but while extra work in fget() is trivial, locking implications + * and amount of surgery on open()-related paths in VFS are not. + * FreeBSD fails with -EBADF in the same situation, NetBSD "solution" + * deadlocks in rather amusing ways, AFAICS. All of that is out of + * scope of POSIX or SUS, since neither considers shared descriptor + * tables and this condition does not arise without those. + */ + fdt = files_fdtable(files); + tofree = fdt->fd[fd]; + if (!tofree && fd_is_open(fd, fdt)) + goto Ebusy; + get_file(file); + rcu_assign_pointer(fdt->fd[fd], file); + __set_open_fd(fd, fdt); + if (flags & O_CLOEXEC) + __set_close_on_exec(fd, fdt); + else + __clear_close_on_exec(fd, fdt); + spin_unlock(&files->file_lock); + + if (tofree) + filp_close(tofree, files); + + return fd; + +Ebusy: + spin_unlock(&files->file_lock); + return -EBUSY; +} + +int replace_fd(unsigned fd, struct file *file, unsigned flags) +{ + int err; + struct files_struct *files = current->files; + + if (!file) + return __close_fd(files, fd); + + if (fd >= rlimit(RLIMIT_NOFILE)) + return -EMFILE; + + spin_lock(&files->file_lock); + err = expand_files(files, fd); + if (unlikely(err < 0)) + goto out_unlock; + return do_dup2(files, file, fd, flags); + +out_unlock: + spin_unlock(&files->file_lock); + return err; +} + +SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) +{ + int err = -EBADF; + struct file *file; + struct files_struct *files = current->files; + + if ((flags & ~O_CLOEXEC) != 0) + return -EINVAL; + + if (newfd >= rlimit(RLIMIT_NOFILE)) + return -EMFILE; + + spin_lock(&files->file_lock); + err = expand_files(files, newfd); + file = fcheck(oldfd); + if (unlikely(!file)) + goto Ebadf; + if (unlikely(err < 0)) { + if (err == -EMFILE) + goto Ebadf; + goto out_unlock; + } + return do_dup2(files, file, newfd, flags); + +Ebadf: + err = -EBADF; +out_unlock: + spin_unlock(&files->file_lock); + return err; +} + +SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) +{ + if (unlikely(newfd == oldfd)) { /* corner case */ + struct files_struct *files = current->files; + int retval = oldfd; + + rcu_read_lock(); + if (!fcheck_files(files, oldfd)) + retval = -EBADF; + rcu_read_unlock(); + return retval; + } + return sys_dup3(oldfd, newfd, 0); +} + +SYSCALL_DEFINE1(dup, unsigned int, fildes) +{ + int ret = -EBADF; + struct file *file = fget_raw(fildes); + + if (file) { + ret = get_unused_fd(); + if (ret >= 0) + fd_install(ret, file); + else + fput(file); + } + return ret; +} + +int f_dupfd(unsigned int from, struct file *file, unsigned flags) +{ + int err; + if (from >= rlimit(RLIMIT_NOFILE)) + return -EINVAL; + err = alloc_fd(from, flags); + if (err >= 0) { + get_file(file); + fd_install(err, file); + } + return err; +} + +int iterate_fd(struct files_struct *files, unsigned n, + int (*f)(const void *, struct file *, unsigned), + const void *p) +{ + struct fdtable *fdt; + struct file *file; + int res = 0; + if (!files) + return 0; + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + while (!res && n < fdt->max_fds) { + file = rcu_dereference_check_fdtable(files, fdt->fd[n++]); + if (file) + res = f(p, file, n); + } + spin_unlock(&files->file_lock); + return res; } -EXPORT_SYMBOL(get_unused_fd); +EXPORT_SYMBOL(iterate_fd); diff --git a/fs/file_table.c b/fs/file_table.c index 701985e..dac6792 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -243,10 +243,10 @@ static void __fput(struct file *file) if (file->f_op && file->f_op->fasync) file->f_op->fasync(-1, file, 0); } + ima_file_free(file); if (file->f_op && file->f_op->release) file->f_op->release(inode, file); security_file_free(file); - ima_file_free(file); if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL && !(file->f_mode & FMODE_PATH))) { cdev_put(inode->i_cdev); @@ -339,112 +339,6 @@ void __fput_sync(struct file *file) EXPORT_SYMBOL(fput); -struct file *fget(unsigned int fd) -{ - struct file *file; - struct files_struct *files = current->files; - - rcu_read_lock(); - file = fcheck_files(files, fd); - if (file) { - /* File object ref couldn't be taken */ - if (file->f_mode & FMODE_PATH || - !atomic_long_inc_not_zero(&file->f_count)) - file = NULL; - } - rcu_read_unlock(); - - return file; -} - -EXPORT_SYMBOL(fget); - -struct file *fget_raw(unsigned int fd) -{ - struct file *file; - struct files_struct *files = current->files; - - rcu_read_lock(); - file = fcheck_files(files, fd); - if (file) { - /* File object ref couldn't be taken */ - if (!atomic_long_inc_not_zero(&file->f_count)) - file = NULL; - } - rcu_read_unlock(); - - return file; -} - -EXPORT_SYMBOL(fget_raw); - -/* - * Lightweight file lookup - no refcnt increment if fd table isn't shared. - * - * You can use this instead of fget if you satisfy all of the following - * conditions: - * 1) You must call fput_light before exiting the syscall and returning control - * to userspace (i.e. you cannot remember the returned struct file * after - * returning to userspace). - * 2) You must not call filp_close on the returned struct file * in between - * calls to fget_light and fput_light. - * 3) You must not clone the current task in between the calls to fget_light - * and fput_light. - * - * The fput_needed flag returned by fget_light should be passed to the - * corresponding fput_light. - */ -struct file *fget_light(unsigned int fd, int *fput_needed) -{ - struct file *file; - struct files_struct *files = current->files; - - *fput_needed = 0; - if (atomic_read(&files->count) == 1) { - file = fcheck_files(files, fd); - if (file && (file->f_mode & FMODE_PATH)) - file = NULL; - } else { - rcu_read_lock(); - file = fcheck_files(files, fd); - if (file) { - if (!(file->f_mode & FMODE_PATH) && - atomic_long_inc_not_zero(&file->f_count)) - *fput_needed = 1; - else - /* Didn't get the reference, someone's freed */ - file = NULL; - } - rcu_read_unlock(); - } - - return file; -} - -struct file *fget_raw_light(unsigned int fd, int *fput_needed) -{ - struct file *file; - struct files_struct *files = current->files; - - *fput_needed = 0; - if (atomic_read(&files->count) == 1) { - file = fcheck_files(files, fd); - } else { - rcu_read_lock(); - file = fcheck_files(files, fd); - if (file) { - if (atomic_long_inc_not_zero(&file->f_count)) - *fput_needed = 1; - else - /* Didn't get the reference, someone's freed */ - file = NULL; - } - rcu_read_unlock(); - } - - return file; -} - void put_filp(struct file *file) { if (atomic_long_dec_and_test(&file->f_count)) { diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index ef67c95..f47df72 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -224,8 +224,8 @@ vxfs_iinit(struct inode *ip, struct vxfs_inode_info *vip) { ip->i_mode = vxfs_transmod(vip); - ip->i_uid = (uid_t)vip->vii_uid; - ip->i_gid = (gid_t)vip->vii_gid; + i_uid_write(ip, (uid_t)vip->vii_uid); + i_gid_write(ip, (gid_t)vip->vii_gid); set_nlink(ip, vip->vii_nlink); ip->i_size = vip->vii_size; diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index d4fabd2..fed2c8a 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -279,6 +279,11 @@ static void __exit vxfs_cleanup(void) { unregister_filesystem(&vxfs_fs_type); + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(vxfs_inode_cachep); } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index be3efc4..401b6c6 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -63,6 +63,7 @@ int writeback_in_progress(struct backing_dev_info *bdi) { return test_bit(BDI_writeback_running, &bdi->state); } +EXPORT_SYMBOL(writeback_in_progress); static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) { @@ -438,8 +439,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * setting I_SYNC flag and calling inode_sync_complete() to clear it. */ static int -__writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, - struct writeback_control *wbc) +__writeback_single_inode(struct inode *inode, struct writeback_control *wbc) { struct address_space *mapping = inode->i_mapping; long nr_to_write = wbc->nr_to_write; @@ -526,7 +526,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, inode->i_state |= I_SYNC; spin_unlock(&inode->i_lock); - ret = __writeback_single_inode(inode, wb, wbc); + ret = __writeback_single_inode(inode, wbc); spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); @@ -577,10 +577,6 @@ static long writeback_chunk_size(struct backing_dev_info *bdi, /* * Write a portion of b_io inodes which belong to @sb. * - * If @only_this_sb is true, then find and write all such - * inodes. Otherwise write only ones which go sequentially - * in reverse order. - * * Return the number of pages and/or inodes written. */ static long writeback_sb_inodes(struct super_block *sb, @@ -673,7 +669,7 @@ static long writeback_sb_inodes(struct super_block *sb, * We use I_SYNC to pin the inode in memory. While it is set * evict_inode() will wait so the inode cannot be freed. */ - __writeback_single_inode(inode, wb, &wbc); + __writeback_single_inode(inode, &wbc); work->nr_pages -= write_chunk - wbc.nr_to_write; wrote += write_chunk - wbc.nr_to_write; diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 03ff5b1..75a20c0 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -117,7 +117,7 @@ static ssize_t fuse_conn_max_background_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - unsigned val; + unsigned uninitialized_var(val); ssize_t ret; ret = fuse_conn_limit_write(file, buf, count, ppos, &val, @@ -154,7 +154,7 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - unsigned val; + unsigned uninitialized_var(val); ssize_t ret; ret = fuse_conn_limit_write(file, buf, count, ppos, &val, diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 3426521..ee8d550 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -396,7 +396,7 @@ err_device: err_region: unregister_chrdev_region(devt, 1); err: - fc->conn_error = 1; + fuse_conn_kill(fc); goto out; } @@ -532,8 +532,6 @@ static int cuse_channel_release(struct inode *inode, struct file *file) cdev_del(cc->cdev); } - /* kill connection and shutdown channel */ - fuse_conn_kill(&cc->fc); rc = fuse_dev_release(inode, file); /* puts the base reference */ return rc; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 7df2b5e..8c23fa7 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -148,8 +148,7 @@ static struct fuse_req *get_reserved_req(struct fuse_conn *fc, if (ff->reserved_req) { req = ff->reserved_req; ff->reserved_req = NULL; - get_file(file); - req->stolen_file = file; + req->stolen_file = get_file(file); } spin_unlock(&fc->lock); } while (!req); @@ -1576,6 +1575,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, req->pages[req->num_pages] = page; req->num_pages++; + offset = 0; num -= this_num; total_len += this_num; index++; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index aba15f1..78d2837 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1379,6 +1379,7 @@ static const struct vm_operations_struct fuse_file_vm_ops = { .close = fuse_vma_close, .fault = filemap_fault, .page_mkwrite = fuse_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index ce0a283..f0eda12 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -367,11 +367,6 @@ void fuse_conn_kill(struct fuse_conn *fc) wake_up_all(&fc->waitq); wake_up_all(&fc->blocked_waitq); wake_up_all(&fc->reserved_req_waitq); - mutex_lock(&fuse_mutex); - list_del(&fc->entry); - fuse_ctl_remove_conn(fc); - mutex_unlock(&fuse_mutex); - fuse_bdi_destroy(fc); } EXPORT_SYMBOL_GPL(fuse_conn_kill); @@ -380,7 +375,14 @@ static void fuse_put_super(struct super_block *sb) struct fuse_conn *fc = get_fuse_conn_super(sb); fuse_send_destroy(fc); + fuse_conn_kill(fc); + mutex_lock(&fuse_mutex); + list_del(&fc->entry); + fuse_ctl_remove_conn(fc); + mutex_unlock(&fuse_mutex); + fuse_bdi_destroy(fc); + fuse_conn_put(fc); } @@ -1195,6 +1197,12 @@ static void fuse_fs_cleanup(void) { unregister_filesystem(&fuse_fs_type); unregister_fuseblk(); + + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(fuse_inode_cachep); } diff --git a/fs/generic_acl.c b/fs/generic_acl.c index d0dddac..b3f3676 100644 --- a/fs/generic_acl.c +++ b/fs/generic_acl.c @@ -56,7 +56,7 @@ generic_acl_get(struct dentry *dentry, const char *name, void *buffer, acl = get_cached_acl(dentry->d_inode, type); if (!acl) return -ENODATA; - error = posix_acl_to_xattr(acl, buffer, size); + error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); posix_acl_release(acl); return error; @@ -77,7 +77,7 @@ generic_acl_set(struct dentry *dentry, const char *name, const void *value, if (!inode_owner_or_capable(inode)) return -EPERM; if (value) { - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (IS_ERR(acl)) return PTR_ERR(acl); } diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index bd4a589..f850020 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -63,7 +63,7 @@ struct posix_acl *gfs2_get_acl(struct inode *inode, int type) if (len == 0) return NULL; - acl = posix_acl_from_xattr(data, len); + acl = posix_acl_from_xattr(&init_user_ns, data, len); kfree(data); return acl; } @@ -88,13 +88,13 @@ static int gfs2_acl_set(struct inode *inode, int type, struct posix_acl *acl) const char *name = gfs2_acl_name(type); BUG_ON(name == NULL); - len = posix_acl_to_xattr(acl, NULL, 0); + len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0); if (len == 0) return 0; data = kmalloc(len, GFP_NOFS); if (data == NULL) return -ENOMEM; - error = posix_acl_to_xattr(acl, data, len); + error = posix_acl_to_xattr(&init_user_ns, acl, data, len); if (error < 0) goto out; error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS); @@ -166,12 +166,12 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) if (error) return error; - len = posix_acl_to_xattr(acl, NULL, 0); + len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0); data = kmalloc(len, GFP_NOFS); error = -ENOMEM; if (data == NULL) goto out; - posix_acl_to_xattr(acl, data, len); + posix_acl_to_xattr(&init_user_ns, acl, data, len); error = gfs2_xattr_acl_chmod(ip, attr, data); kfree(data); set_cached_acl(&ip->i_inode, ACL_TYPE_ACCESS, acl); @@ -212,7 +212,7 @@ static int gfs2_xattr_system_get(struct dentry *dentry, const char *name, if (acl == NULL) return -ENODATA; - error = posix_acl_to_xattr(acl, buffer, size); + error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); posix_acl_release(acl); return error; @@ -245,7 +245,7 @@ static int gfs2_xattr_system_set(struct dentry *dentry, const char *name, if (!value) goto set_acl; - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (!acl) { /* * acl_set_file(3) may request that we set default ACLs with diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index d652634..01c4975 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -612,6 +612,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, struct gfs2_sbd *sdp = GFS2_SB(mapping->host); struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); unsigned int data_blocks = 0, ind_blocks = 0, rblocks; + unsigned requested = 0; int alloc_required; int error = 0; pgoff_t index = pos >> PAGE_CACHE_SHIFT; @@ -641,7 +642,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, if (error) goto out_unlock; - error = gfs2_inplace_reserve(ip, data_blocks + ind_blocks); + requested = data_blocks + ind_blocks; + error = gfs2_inplace_reserve(ip, requested); if (error) goto out_qunlock; } @@ -654,7 +656,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, if (&ip->i_inode == sdp->sd_rindex) rblocks += 2 * RES_STATFS; if (alloc_required) - rblocks += gfs2_rg_blocks(ip); + rblocks += gfs2_rg_blocks(ip, requested); error = gfs2_trans_begin(sdp, rblocks, PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); @@ -868,8 +870,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, brelse(dibh); failed: gfs2_trans_end(sdp); - if (gfs2_mb_reserved(ip)) - gfs2_inplace_release(ip); + gfs2_inplace_release(ip); if (ip->i_res->rs_qa_qd_num) gfs2_quota_unlock(ip); if (inode == sdp->sd_rindex) { @@ -1023,7 +1024,7 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, offset, nr_segs, gfs2_get_block_direct, NULL, NULL, 0); out: - gfs2_glock_dq_m(1, &gh); + gfs2_glock_dq(&gh); gfs2_holder_uninit(&gh); return rv; } diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 49cd7dd..1fd3ae2 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -786,7 +786,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, goto out_rlist; if (gfs2_rs_active(ip->i_res)) /* needs to be done with the rgrp glock held */ - gfs2_rs_deltree(ip->i_res); + gfs2_rs_deltree(ip, ip->i_res); error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + RES_INDIRECT + RES_STATFS + RES_QUOTA, diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index d1d791e..0def050 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -323,6 +323,29 @@ static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } /** + * gfs2_size_hint - Give a hint to the size of a write request + * @file: The struct file + * @offset: The file offset of the write + * @size: The length of the write + * + * When we are about to do a write, this function records the total + * write size in order to provide a suitable hint to the lower layers + * about how many blocks will be required. + * + */ + +static void gfs2_size_hint(struct file *filep, loff_t offset, size_t size) +{ + struct inode *inode = filep->f_dentry->d_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *ip = GFS2_I(inode); + size_t blks = (size + sdp->sd_sb.sb_bsize - 1) >> sdp->sd_sb.sb_bsize_shift; + int hint = min_t(size_t, INT_MAX, blks); + + atomic_set(&ip->i_res->rs_sizehint, hint); +} + +/** * gfs2_allocate_page_backing - Use bmap to allocate blocks * @page: The (locked) page to allocate backing for * @@ -382,8 +405,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (ret) return ret; - atomic_set(&ip->i_res->rs_sizehint, - PAGE_CACHE_SIZE >> sdp->sd_sb.sb_bsize_shift); + gfs2_size_hint(vma->vm_file, pos, PAGE_CACHE_SIZE); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); @@ -419,7 +441,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) rblocks += data_blocks ? data_blocks : 1; if (ind_blocks || data_blocks) { rblocks += RES_STATFS + RES_QUOTA; - rblocks += gfs2_rg_blocks(ip); + rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks); } ret = gfs2_trans_begin(sdp, rblocks, 0); if (ret) @@ -470,6 +492,7 @@ out: static const struct vm_operations_struct gfs2_vm_ops = { .fault = filemap_fault, .page_mkwrite = gfs2_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; /** @@ -504,7 +527,6 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) return error; } vma->vm_ops = &gfs2_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } @@ -663,7 +685,8 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (ret) return ret; - atomic_set(&ip->i_res->rs_sizehint, writesize >> sdp->sd_sb.sb_bsize_shift); + gfs2_size_hint(file, pos, writesize); + if (file->f_flags & O_APPEND) { struct gfs2_holder gh; @@ -789,7 +812,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, if (unlikely(error)) goto out_uninit; - atomic_set(&ip->i_res->rs_sizehint, len >> sdp->sd_sb.sb_bsize_shift); + gfs2_size_hint(file, offset, len); while (len > 0) { if (len < bytes) @@ -822,7 +845,7 @@ retry: &max_bytes, &data_blocks, &ind_blocks); rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + - RES_RG_HDR + gfs2_rg_blocks(ip); + RES_RG_HDR + gfs2_rg_blocks(ip, data_blocks + ind_blocks); if (gfs2_is_jdata(ip)) rblocks += data_blocks ? data_blocks : 1; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 1ed81f4..e6c2fd5 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -186,20 +186,6 @@ static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl) } /** - * __gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list - * @gl: the glock - * - * If the glock is demotable, then we add it (or move it) to the end - * of the glock LRU list. - */ - -static void __gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) -{ - if (demote_ok(gl)) - gfs2_glock_add_to_lru(gl); -} - -/** * gfs2_glock_put_nolock() - Decrement reference count on glock * @gl: The glock to put * @@ -883,7 +869,14 @@ static int gfs2_glock_demote_wait(void *word) return 0; } -static void wait_on_holder(struct gfs2_holder *gh) +/** + * gfs2_glock_wait - wait on a glock acquisition + * @gh: the glock holder + * + * Returns: 0 on success + */ + +int gfs2_glock_wait(struct gfs2_holder *gh) { unsigned long time1 = jiffies; @@ -894,12 +887,7 @@ static void wait_on_holder(struct gfs2_holder *gh) gh->gh_gl->gl_hold_time = min(gh->gh_gl->gl_hold_time + GL_GLOCK_HOLD_INCR, GL_GLOCK_MAX_HOLD); -} - -static void wait_on_demote(struct gfs2_glock *gl) -{ - might_sleep(); - wait_on_bit(&gl->gl_flags, GLF_DEMOTE, gfs2_glock_demote_wait, TASK_UNINTERRUPTIBLE); + return gh->gh_error; } /** @@ -929,19 +917,6 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state, trace_gfs2_demote_rq(gl); } -/** - * gfs2_glock_wait - wait on a glock acquisition - * @gh: the glock holder - * - * Returns: 0 on success - */ - -int gfs2_glock_wait(struct gfs2_holder *gh) -{ - wait_on_holder(gh); - return gh->gh_error; -} - void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) { struct va_format vaf; @@ -979,7 +954,7 @@ __acquires(&gl->gl_spin) struct gfs2_sbd *sdp = gl->gl_sbd; struct list_head *insert_pt = NULL; struct gfs2_holder *gh2; - int try_lock = 0; + int try_futile = 0; BUG_ON(gh->gh_owner_pid == NULL); if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags)) @@ -987,7 +962,7 @@ __acquires(&gl->gl_spin) if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) { if (test_bit(GLF_LOCK, &gl->gl_flags)) - try_lock = 1; + try_futile = !may_grant(gl, gh); if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) goto fail; } @@ -996,9 +971,8 @@ __acquires(&gl->gl_spin) if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid && (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK))) goto trap_recursive; - if (try_lock && - !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) && - !may_grant(gl, gh)) { + if (try_futile && + !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) { fail: gh->gh_error = GLR_TRYFAILED; gfs2_holder_wake(gh); @@ -1121,8 +1095,9 @@ void gfs2_glock_dq(struct gfs2_holder *gh) !test_bit(GLF_DEMOTE, &gl->gl_flags)) fast_path = 1; } - if (!test_bit(GLF_LFLUSH, &gl->gl_flags)) - __gfs2_glock_schedule_for_reclaim(gl); + if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl)) + gfs2_glock_add_to_lru(gl); + trace_gfs2_glock_queue(gh, 0); spin_unlock(&gl->gl_spin); if (likely(fast_path)) @@ -1141,7 +1116,8 @@ void gfs2_glock_dq_wait(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; gfs2_glock_dq(gh); - wait_on_demote(gl); + might_sleep(); + wait_on_bit(&gl->gl_flags, GLF_DEMOTE, gfs2_glock_demote_wait, TASK_UNINTERRUPTIBLE); } /** diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 4bdcf37..32cc4fd 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -94,6 +94,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) /* A shortened, inline version of gfs2_trans_begin() */ tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64)); tr.tr_ip = (unsigned long)__builtin_return_address(0); + sb_start_intwrite(sdp->sd_vfs); gfs2_log_reserve(sdp, tr.tr_reserved); BUG_ON(current->journal_info); current->journal_info = &tr; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index aaecc80..3d469d3 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -99,9 +99,26 @@ struct gfs2_rgrpd { #define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */ spinlock_t rd_rsspin; /* protects reservation related vars */ struct rb_root rd_rstree; /* multi-block reservation tree */ - u32 rd_rs_cnt; /* count of current reservations */ }; +struct gfs2_rbm { + struct gfs2_rgrpd *rgd; + struct gfs2_bitmap *bi; /* Bitmap must belong to the rgd */ + u32 offset; /* The offset is bitmap relative */ +}; + +static inline u64 gfs2_rbm_to_block(const struct gfs2_rbm *rbm) +{ + return rbm->rgd->rd_data0 + (rbm->bi->bi_start * GFS2_NBBY) + rbm->offset; +} + +static inline bool gfs2_rbm_eq(const struct gfs2_rbm *rbm1, + const struct gfs2_rbm *rbm2) +{ + return (rbm1->rgd == rbm2->rgd) && (rbm1->bi == rbm2->bi) && + (rbm1->offset == rbm2->offset); +} + enum gfs2_state_bits { BH_Pinned = BH_PrivateStart, BH_Escaped = BH_PrivateStart + 1, @@ -250,18 +267,11 @@ struct gfs2_blkreserv { /* components used during write (step 1): */ atomic_t rs_sizehint; /* hint of the write size */ - /* components used during inplace_reserve (step 2): */ - u32 rs_requested; /* Filled in by caller of gfs2_inplace_reserve() */ - - /* components used during get_local_rgrp (step 3): */ - struct gfs2_rgrpd *rs_rgd; /* pointer to the gfs2_rgrpd */ struct gfs2_holder rs_rgd_gh; /* Filled in by get_local_rgrp */ struct rb_node rs_node; /* link to other block reservations */ - - /* components used during block searches and assignments (step 4): */ - struct gfs2_bitmap *rs_bi; /* bitmap for the current allocation */ - u32 rs_biblk; /* start block relative to the bi */ + struct gfs2_rbm rs_rbm; /* Start of reservation */ u32 rs_free; /* how many blocks are still free */ + u64 rs_inum; /* Inode number for reservation */ /* ancillary quota stuff */ struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS]; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 4ce22e5..381893c 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -712,14 +712,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_gunlock2; - /* The newly created inode needs a reservation so it can allocate - xattrs. At the same time, we want new blocks allocated to the new - dinode to be as contiguous as possible. Since we allocated the - dinode block under the directory's reservation, we transfer - ownership of that reservation to the new inode. The directory - doesn't need a reservation unless it needs a new allocation. */ - ip->i_res = dip->i_res; - dip->i_res = NULL; + error = gfs2_rs_alloc(ip); + if (error) + goto fail_gunlock2; error = gfs2_acl_create(dip, inode); if (error) @@ -737,10 +732,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, brelse(bh); gfs2_trans_end(sdp); - /* Check if we reserved space in the rgrp. Function link_dinode may - not, depending on whether alloc is required. */ - if (gfs2_mb_reserved(dip)) - gfs2_inplace_release(dip); + gfs2_inplace_release(dip); gfs2_quota_unlock(dip); mark_inode_dirty(inode); gfs2_glock_dq_uninit_m(2, ghs); @@ -897,7 +889,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, goto out_gunlock_q; error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - gfs2_rg_blocks(dip) + + gfs2_rg_blocks(dip, sdp->sd_max_dirres) + 2 * RES_DINODE + RES_STATFS + RES_QUOTA, 0); if (error) @@ -1378,7 +1370,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, goto out_gunlock_q; error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - gfs2_rg_blocks(ndip) + + gfs2_rg_blocks(ndip, sdp->sd_max_dirres) + 4 * RES_DINODE + 4 * RES_LEAF + RES_STATFS + RES_QUOTA + 4, 0); if (error) @@ -1722,7 +1714,9 @@ static int gfs2_setxattr(struct dentry *dentry, const char *name, gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); if (ret == 0) { - ret = generic_setxattr(dentry, name, data, size, flags); + ret = gfs2_rs_alloc(ip); + if (ret == 0) + ret = generic_setxattr(dentry, name, data, size, flags); gfs2_glock_dq(&gh); } gfs2_holder_uninit(&gh); @@ -1757,7 +1751,9 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name) gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); if (ret == 0) { - ret = generic_removexattr(dentry, name); + ret = gfs2_rs_alloc(ip); + if (ret == 0) + ret = generic_removexattr(dentry, name); gfs2_glock_dq(&gh); } gfs2_holder_uninit(&gh); diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 4a38db7..0fb6539 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -1289,7 +1289,7 @@ static void gdlm_unmount(struct gfs2_sbd *sdp) spin_lock(&ls->ls_recover_spin); set_bit(DFL_UNMOUNT, &ls->ls_recover_flags); spin_unlock(&ls->ls_recover_spin); - flush_delayed_work_sync(&sdp->sd_control_work); + flush_delayed_work(&sdp->sd_control_work); /* mounted_lock and control_lock will be purged in dlm recovery */ release: diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index e5af9dc..e443966 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -19,6 +19,7 @@ #include <linux/mount.h> #include <linux/gfs2_ondisk.h> #include <linux/quotaops.h> +#include <linux/lockdep.h> #include "gfs2.h" #include "incore.h" @@ -766,6 +767,7 @@ fail: return error; } +static struct lock_class_key gfs2_quota_imutex_key; static int init_inodes(struct gfs2_sbd *sdp, int undo) { @@ -803,6 +805,12 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) fs_err(sdp, "can't get quota file inode: %d\n", error); goto fail_rindex; } + /* + * i_mutex on quota files is special. Since this inode is hidden system + * file, we are safe to define locking ourselves. + */ + lockdep_set_class(&sdp->sd_quota_inode->i_mutex, + &gfs2_quota_imutex_key); error = gfs2_rindex_update(sdp); if (error) diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index a3bde91..40c4b0d 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -765,6 +765,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) struct gfs2_holder *ghs, i_gh; unsigned int qx, x; struct gfs2_quota_data *qd; + unsigned reserved; loff_t offset; unsigned int nalloc = 0, blocks; int error; @@ -781,7 +782,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) return -ENOMEM; sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL); - mutex_lock_nested(&ip->i_inode.i_mutex, I_MUTEX_QUOTA); + mutex_lock(&ip->i_inode.i_mutex); for (qx = 0; qx < num_qd; qx++) { error = gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, &ghs[qx]); @@ -811,13 +812,13 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) * two blocks need to be updated instead of 1 */ blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3; - error = gfs2_inplace_reserve(ip, 1 + - (nalloc * (data_blocks + ind_blocks))); + reserved = 1 + (nalloc * (data_blocks + ind_blocks)); + error = gfs2_inplace_reserve(ip, reserved); if (error) goto out_alloc; if (nalloc) - blocks += gfs2_rg_blocks(ip) + nalloc * ind_blocks + RES_STATFS; + blocks += gfs2_rg_blocks(ip, reserved) + nalloc * ind_blocks + RES_STATFS; error = gfs2_trans_begin(sdp, blocks, 0); if (error) @@ -1070,8 +1071,10 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) { print_message(qd, "exceeded"); - quota_send_warning(test_bit(QDF_USER, &qd->qd_flags) ? - USRQUOTA : GRPQUOTA, qd->qd_id, + quota_send_warning(make_kqid(&init_user_ns, + test_bit(QDF_USER, &qd->qd_flags) ? + USRQUOTA : GRPQUOTA, + qd->qd_id), sdp->sd_vfs->s_dev, QUOTA_NL_BHARDWARN); error = -EDQUOT; @@ -1081,8 +1084,10 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) time_after_eq(jiffies, qd->qd_last_warn + gfs2_tune_get(sdp, gt_quota_warn_period) * HZ)) { - quota_send_warning(test_bit(QDF_USER, &qd->qd_flags) ? - USRQUOTA : GRPQUOTA, qd->qd_id, + quota_send_warning(make_kqid(&init_user_ns, + test_bit(QDF_USER, &qd->qd_flags) ? + USRQUOTA : GRPQUOTA, + qd->qd_id), sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN); error = print_message(qd, "warning"); qd->qd_last_warn = jiffies; @@ -1469,7 +1474,7 @@ static int gfs2_quota_get_xstate(struct super_block *sb, return 0; } -static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id, +static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid, struct fs_disk_quota *fdq) { struct gfs2_sbd *sdp = sb->s_fs_info; @@ -1477,20 +1482,21 @@ static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id, struct gfs2_quota_data *qd; struct gfs2_holder q_gh; int error; + int type; memset(fdq, 0, sizeof(struct fs_disk_quota)); if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) return -ESRCH; /* Crazy XFS error code */ - if (type == USRQUOTA) + if (qid.type == USRQUOTA) type = QUOTA_USER; - else if (type == GRPQUOTA) + else if (qid.type == GRPQUOTA) type = QUOTA_GROUP; else return -EINVAL; - error = qd_get(sdp, type, id, &qd); + error = qd_get(sdp, type, from_kqid(&init_user_ns, qid), &qd); if (error) return error; error = do_glock(qd, FORCE, &q_gh); @@ -1500,7 +1506,7 @@ static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id, qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; fdq->d_version = FS_DQUOT_VERSION; fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA; - fdq->d_id = id; + fdq->d_id = from_kqid(&init_user_ns, qid); fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_fsb2bb_shift; fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_fsb2bb_shift; fdq->d_bcount = be64_to_cpu(qlvb->qb_value) << sdp->sd_fsb2bb_shift; @@ -1514,7 +1520,7 @@ out: /* GFS2 only supports a subset of the XFS fields */ #define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD|FS_DQ_BCOUNT) -static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, +static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, struct fs_disk_quota *fdq) { struct gfs2_sbd *sdp = sb->s_fs_info; @@ -1526,11 +1532,12 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, int alloc_required; loff_t offset; int error; + int type; if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) return -ESRCH; /* Crazy XFS error code */ - switch(type) { + switch(qid.type) { case USRQUOTA: type = QUOTA_USER; if (fdq->d_flags != FS_USER_QUOTA) @@ -1547,10 +1554,10 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, if (fdq->d_fieldmask & ~GFS2_FIELDMASK) return -EINVAL; - if (fdq->d_id != id) + if (fdq->d_id != from_kqid(&init_user_ns, qid)) return -EINVAL; - error = qd_get(sdp, type, id, &qd); + error = qd_get(sdp, type, from_kqid(&init_user_ns, qid), &qd); if (error) return error; @@ -1598,7 +1605,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, error = gfs2_inplace_reserve(ip, blocks); if (error) goto out_i; - blocks += gfs2_rg_blocks(ip); + blocks += gfs2_rg_blocks(ip, blocks); } /* Some quotas span block boundaries and can update two blocks, diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 4d34887..3cc402c 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -35,9 +35,6 @@ #define BFITNOENT ((u32)~0) #define NO_BLOCK ((u64)~0) -#define RSRV_CONTENTION_FACTOR 4 -#define RGRP_RSRV_MAX_CONTENDERS 2 - #if BITS_PER_LONG == 32 #define LBITMASK (0x55555555UL) #define LBITSKIP55 (0x55555555UL) @@ -67,53 +64,48 @@ static const char valid_change[16] = { 1, 0, 0, 0 }; -static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, - unsigned char old_state, - struct gfs2_bitmap **rbi); +static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext, + const struct gfs2_inode *ip, bool nowrap); + /** * gfs2_setbit - Set a bit in the bitmaps - * @rgd: the resource group descriptor - * @buf2: the clone buffer that holds the bitmaps - * @bi: the bitmap structure - * @block: the block to set + * @rbm: The position of the bit to set + * @do_clone: Also set the clone bitmap, if it exists * @new_state: the new state of the block * */ -static inline void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buf2, - struct gfs2_bitmap *bi, u32 block, +static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone, unsigned char new_state) { unsigned char *byte1, *byte2, *end, cur_state; - unsigned int buflen = bi->bi_len; - const unsigned int bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE; + unsigned int buflen = rbm->bi->bi_len; + const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE; - byte1 = bi->bi_bh->b_data + bi->bi_offset + (block / GFS2_NBBY); - end = bi->bi_bh->b_data + bi->bi_offset + buflen; + byte1 = rbm->bi->bi_bh->b_data + rbm->bi->bi_offset + (rbm->offset / GFS2_NBBY); + end = rbm->bi->bi_bh->b_data + rbm->bi->bi_offset + buflen; BUG_ON(byte1 >= end); cur_state = (*byte1 >> bit) & GFS2_BIT_MASK; if (unlikely(!valid_change[new_state * 4 + cur_state])) { - printk(KERN_WARNING "GFS2: buf_blk = 0x%llx old_state=%d, " - "new_state=%d\n", - (unsigned long long)block, cur_state, new_state); - printk(KERN_WARNING "GFS2: rgrp=0x%llx bi_start=0x%lx\n", - (unsigned long long)rgd->rd_addr, - (unsigned long)bi->bi_start); - printk(KERN_WARNING "GFS2: bi_offset=0x%lx bi_len=0x%lx\n", - (unsigned long)bi->bi_offset, - (unsigned long)bi->bi_len); + printk(KERN_WARNING "GFS2: buf_blk = 0x%x old_state=%d, " + "new_state=%d\n", rbm->offset, cur_state, new_state); + printk(KERN_WARNING "GFS2: rgrp=0x%llx bi_start=0x%x\n", + (unsigned long long)rbm->rgd->rd_addr, + rbm->bi->bi_start); + printk(KERN_WARNING "GFS2: bi_offset=0x%x bi_len=0x%x\n", + rbm->bi->bi_offset, rbm->bi->bi_len); dump_stack(); - gfs2_consist_rgrpd(rgd); + gfs2_consist_rgrpd(rbm->rgd); return; } *byte1 ^= (cur_state ^ new_state) << bit; - if (buf2) { - byte2 = buf2 + bi->bi_offset + (block / GFS2_NBBY); + if (do_clone && rbm->bi->bi_clone) { + byte2 = rbm->bi->bi_clone + rbm->bi->bi_offset + (rbm->offset / GFS2_NBBY); cur_state = (*byte2 >> bit) & GFS2_BIT_MASK; *byte2 ^= (cur_state ^ new_state) << bit; } @@ -121,30 +113,21 @@ static inline void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buf2, /** * gfs2_testbit - test a bit in the bitmaps - * @rgd: the resource group descriptor - * @buffer: the buffer that holds the bitmaps - * @buflen: the length (in bytes) of the buffer - * @block: the block to read + * @rbm: The bit to test * + * Returns: The two bit block state of the requested bit */ -static inline unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, - const unsigned char *buffer, - unsigned int buflen, u32 block) +static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm) { - const unsigned char *byte, *end; - unsigned char cur_state; + const u8 *buffer = rbm->bi->bi_bh->b_data + rbm->bi->bi_offset; + const u8 *byte; unsigned int bit; - byte = buffer + (block / GFS2_NBBY); - bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE; - end = buffer + buflen; - - gfs2_assert(rgd->rd_sbd, byte < end); + byte = buffer + (rbm->offset / GFS2_NBBY); + bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE; - cur_state = (*byte >> bit) & GFS2_BIT_MASK; - - return cur_state; + return (*byte >> bit) & GFS2_BIT_MASK; } /** @@ -192,7 +175,7 @@ static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state) */ static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs) { - u64 startblk = gfs2_rs_startblk(rs); + u64 startblk = gfs2_rbm_to_block(&rs->rs_rbm); if (blk >= startblk + rs->rs_free) return 1; @@ -202,36 +185,6 @@ static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs) } /** - * rs_find - Find a rgrp multi-block reservation that contains a given block - * @rgd: The rgrp - * @rgblk: The block we're looking for, relative to the rgrp - */ -static struct gfs2_blkreserv *rs_find(struct gfs2_rgrpd *rgd, u32 rgblk) -{ - struct rb_node **newn; - int rc; - u64 fsblk = rgblk + rgd->rd_data0; - - spin_lock(&rgd->rd_rsspin); - newn = &rgd->rd_rstree.rb_node; - while (*newn) { - struct gfs2_blkreserv *cur = - rb_entry(*newn, struct gfs2_blkreserv, rs_node); - rc = rs_cmp(fsblk, 1, cur); - if (rc < 0) - newn = &((*newn)->rb_left); - else if (rc > 0) - newn = &((*newn)->rb_right); - else { - spin_unlock(&rgd->rd_rsspin); - return cur; - } - } - spin_unlock(&rgd->rd_rsspin); - return NULL; -} - -/** * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing * a block in a given allocation state. * @buf: the buffer that holds the bitmaps @@ -262,8 +215,6 @@ static u32 gfs2_bitfit(const u8 *buf, const unsigned int len, u64 mask = 0x5555555555555555ULL; u32 bit; - BUG_ON(state > 3); - /* Mask off bits we don't care about at the start of the search */ mask <<= spoint; tmp = gfs2_bit_search(ptr, mask, state); @@ -285,6 +236,131 @@ static u32 gfs2_bitfit(const u8 *buf, const unsigned int len, } /** + * gfs2_rbm_from_block - Set the rbm based upon rgd and block number + * @rbm: The rbm with rgd already set correctly + * @block: The block number (filesystem relative) + * + * This sets the bi and offset members of an rbm based on a + * resource group and a filesystem relative block number. The + * resource group must be set in the rbm on entry, the bi and + * offset members will be set by this function. + * + * Returns: 0 on success, or an error code + */ + +static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block) +{ + u64 rblock = block - rbm->rgd->rd_data0; + u32 goal = (u32)rblock; + int x; + + if (WARN_ON_ONCE(rblock > UINT_MAX)) + return -EINVAL; + if (block >= rbm->rgd->rd_data0 + rbm->rgd->rd_data) + return -E2BIG; + + for (x = 0; x < rbm->rgd->rd_length; x++) { + rbm->bi = rbm->rgd->rd_bits + x; + if (goal < (rbm->bi->bi_start + rbm->bi->bi_len) * GFS2_NBBY) { + rbm->offset = goal - (rbm->bi->bi_start * GFS2_NBBY); + break; + } + } + + return 0; +} + +/** + * gfs2_unaligned_extlen - Look for free blocks which are not byte aligned + * @rbm: Position to search (value/result) + * @n_unaligned: Number of unaligned blocks to check + * @len: Decremented for each block found (terminate on zero) + * + * Returns: true if a non-free block is encountered + */ + +static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len) +{ + u64 block; + u32 n; + u8 res; + + for (n = 0; n < n_unaligned; n++) { + res = gfs2_testbit(rbm); + if (res != GFS2_BLKST_FREE) + return true; + (*len)--; + if (*len == 0) + return true; + block = gfs2_rbm_to_block(rbm); + if (gfs2_rbm_from_block(rbm, block + 1)) + return true; + } + + return false; +} + +/** + * gfs2_free_extlen - Return extent length of free blocks + * @rbm: Starting position + * @len: Max length to check + * + * Starting at the block specified by the rbm, see how many free blocks + * there are, not reading more than len blocks ahead. This can be done + * using memchr_inv when the blocks are byte aligned, but has to be done + * on a block by block basis in case of unaligned blocks. Also this + * function can cope with bitmap boundaries (although it must stop on + * a resource group boundary) + * + * Returns: Number of free blocks in the extent + */ + +static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len) +{ + struct gfs2_rbm rbm = *rrbm; + u32 n_unaligned = rbm.offset & 3; + u32 size = len; + u32 bytes; + u32 chunk_size; + u8 *ptr, *start, *end; + u64 block; + + if (n_unaligned && + gfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len)) + goto out; + + n_unaligned = len & 3; + /* Start is now byte aligned */ + while (len > 3) { + start = rbm.bi->bi_bh->b_data; + if (rbm.bi->bi_clone) + start = rbm.bi->bi_clone; + end = start + rbm.bi->bi_bh->b_size; + start += rbm.bi->bi_offset; + BUG_ON(rbm.offset & 3); + start += (rbm.offset / GFS2_NBBY); + bytes = min_t(u32, len / GFS2_NBBY, (end - start)); + ptr = memchr_inv(start, 0, bytes); + chunk_size = ((ptr == NULL) ? bytes : (ptr - start)); + chunk_size *= GFS2_NBBY; + BUG_ON(len < chunk_size); + len -= chunk_size; + block = gfs2_rbm_to_block(&rbm); + gfs2_rbm_from_block(&rbm, block + chunk_size); + n_unaligned = 3; + if (ptr) + break; + n_unaligned = len & 3; + } + + /* Deal with any bits left over at the end */ + if (n_unaligned) + gfs2_unaligned_extlen(&rbm, n_unaligned, &len); +out: + return size - len; +} + +/** * gfs2_bitcount - count the number of bits in a certain state * @rgd: the resource group descriptor * @buffer: the buffer that holds the bitmaps @@ -487,6 +563,8 @@ int gfs2_rs_alloc(struct gfs2_inode *ip) if (!res) error = -ENOMEM; + RB_CLEAR_NODE(&res->rs_node); + down_write(&ip->i_rw_mutex); if (ip->i_res) kmem_cache_free(gfs2_rsrv_cachep, res); @@ -496,11 +574,12 @@ int gfs2_rs_alloc(struct gfs2_inode *ip) return error; } -static void dump_rs(struct seq_file *seq, struct gfs2_blkreserv *rs) +static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs) { - gfs2_print_dbg(seq, " r: %llu s:%llu b:%u f:%u\n", - rs->rs_rgd->rd_addr, gfs2_rs_startblk(rs), rs->rs_biblk, - rs->rs_free); + gfs2_print_dbg(seq, " B: n:%llu s:%llu b:%u f:%u\n", + (unsigned long long)rs->rs_inum, + (unsigned long long)gfs2_rbm_to_block(&rs->rs_rbm), + rs->rs_rbm.offset, rs->rs_free); } /** @@ -508,41 +587,26 @@ static void dump_rs(struct seq_file *seq, struct gfs2_blkreserv *rs) * @rs: The reservation to remove * */ -static void __rs_deltree(struct gfs2_blkreserv *rs) +static void __rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs) { struct gfs2_rgrpd *rgd; if (!gfs2_rs_active(rs)) return; - rgd = rs->rs_rgd; - /* We can't do this: The reason is that when the rgrp is invalidated, - it's in the "middle" of acquiring the glock, but the HOLDER bit - isn't set yet: - BUG_ON(!gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl));*/ - trace_gfs2_rs(NULL, rs, TRACE_RS_TREEDEL); - - if (!RB_EMPTY_ROOT(&rgd->rd_rstree)) - rb_erase(&rs->rs_node, &rgd->rd_rstree); - BUG_ON(!rgd->rd_rs_cnt); - rgd->rd_rs_cnt--; + rgd = rs->rs_rbm.rgd; + trace_gfs2_rs(rs, TRACE_RS_TREEDEL); + rb_erase(&rs->rs_node, &rgd->rd_rstree); + RB_CLEAR_NODE(&rs->rs_node); if (rs->rs_free) { /* return reserved blocks to the rgrp and the ip */ - BUG_ON(rs->rs_rgd->rd_reserved < rs->rs_free); - rs->rs_rgd->rd_reserved -= rs->rs_free; + BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free); + rs->rs_rbm.rgd->rd_reserved -= rs->rs_free; rs->rs_free = 0; - clear_bit(GBF_FULL, &rs->rs_bi->bi_flags); + clear_bit(GBF_FULL, &rs->rs_rbm.bi->bi_flags); smp_mb__after_clear_bit(); } - /* We can't change any of the step 1 or step 2 components of the rs. - E.g. We can't set rs_rgd to NULL because the rgd glock is held and - dequeued through this pointer. - Can't: atomic_set(&rs->rs_sizehint, 0); - Can't: rs->rs_requested = 0; - Can't: rs->rs_rgd = NULL;*/ - rs->rs_bi = NULL; - rs->rs_biblk = 0; } /** @@ -550,17 +614,16 @@ static void __rs_deltree(struct gfs2_blkreserv *rs) * @rs: The reservation to remove * */ -void gfs2_rs_deltree(struct gfs2_blkreserv *rs) +void gfs2_rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs) { struct gfs2_rgrpd *rgd; - if (!gfs2_rs_active(rs)) - return; - - rgd = rs->rs_rgd; - spin_lock(&rgd->rd_rsspin); - __rs_deltree(rs); - spin_unlock(&rgd->rd_rsspin); + rgd = rs->rs_rbm.rgd; + if (rgd) { + spin_lock(&rgd->rd_rsspin); + __rs_deltree(ip, rs); + spin_unlock(&rgd->rd_rsspin); + } } /** @@ -572,8 +635,7 @@ void gfs2_rs_delete(struct gfs2_inode *ip) { down_write(&ip->i_rw_mutex); if (ip->i_res) { - gfs2_rs_deltree(ip->i_res); - trace_gfs2_rs(ip, ip->i_res, TRACE_RS_DELETE); + gfs2_rs_deltree(ip, ip->i_res); BUG_ON(ip->i_res->rs_free); kmem_cache_free(gfs2_rsrv_cachep, ip->i_res); ip->i_res = NULL; @@ -597,7 +659,7 @@ static void return_all_reservations(struct gfs2_rgrpd *rgd) spin_lock(&rgd->rd_rsspin); while ((n = rb_first(&rgd->rd_rstree))) { rs = rb_entry(n, struct gfs2_blkreserv, rs_node); - __rs_deltree(rs); + __rs_deltree(NULL, rs); } spin_unlock(&rgd->rd_rsspin); } @@ -1270,211 +1332,276 @@ out: /** * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree - * @bi: the bitmap with the blocks * @ip: the inode structure - * @biblk: the 32-bit block number relative to the start of the bitmap - * @amount: the number of blocks to reserve * - * Returns: NULL - reservation was already taken, so not inserted - * pointer to the inserted reservation */ -static struct gfs2_blkreserv *rs_insert(struct gfs2_bitmap *bi, - struct gfs2_inode *ip, u32 biblk, - int amount) +static void rs_insert(struct gfs2_inode *ip) { struct rb_node **newn, *parent = NULL; int rc; struct gfs2_blkreserv *rs = ip->i_res; - struct gfs2_rgrpd *rgd = rs->rs_rgd; - u64 fsblock = gfs2_bi2rgd_blk(bi, biblk) + rgd->rd_data0; + struct gfs2_rgrpd *rgd = rs->rs_rbm.rgd; + u64 fsblock = gfs2_rbm_to_block(&rs->rs_rbm); + + BUG_ON(gfs2_rs_active(rs)); spin_lock(&rgd->rd_rsspin); newn = &rgd->rd_rstree.rb_node; - BUG_ON(!ip->i_res); - BUG_ON(gfs2_rs_active(rs)); - /* Figure out where to put new node */ - /*BUG_ON(!gfs2_glock_is_locked_by_me(rgd->rd_gl));*/ while (*newn) { struct gfs2_blkreserv *cur = rb_entry(*newn, struct gfs2_blkreserv, rs_node); parent = *newn; - rc = rs_cmp(fsblock, amount, cur); + rc = rs_cmp(fsblock, rs->rs_free, cur); if (rc > 0) newn = &((*newn)->rb_right); else if (rc < 0) newn = &((*newn)->rb_left); else { spin_unlock(&rgd->rd_rsspin); - return NULL; /* reservation already in use */ + WARN_ON(1); + return; } } - /* Do our reservation work */ - rs = ip->i_res; - rs->rs_free = amount; - rs->rs_biblk = biblk; - rs->rs_bi = bi; rb_link_node(&rs->rs_node, parent, newn); rb_insert_color(&rs->rs_node, &rgd->rd_rstree); - /* Do our inode accounting for the reservation */ - /*BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));*/ - /* Do our rgrp accounting for the reservation */ - rgd->rd_reserved += amount; /* blocks reserved */ - rgd->rd_rs_cnt++; /* number of in-tree reservations */ + rgd->rd_reserved += rs->rs_free; /* blocks reserved */ spin_unlock(&rgd->rd_rsspin); - trace_gfs2_rs(ip, rs, TRACE_RS_INSERT); - return rs; + trace_gfs2_rs(rs, TRACE_RS_INSERT); } /** - * unclaimed_blocks - return number of blocks that aren't spoken for - */ -static u32 unclaimed_blocks(struct gfs2_rgrpd *rgd) -{ - return rgd->rd_free_clone - rgd->rd_reserved; -} - -/** - * rg_mblk_search - find a group of multiple free blocks + * rg_mblk_search - find a group of multiple free blocks to form a reservation * @rgd: the resource group descriptor - * @rs: the block reservation * @ip: pointer to the inode for which we're reserving blocks + * @requested: number of blocks required for this allocation * - * This is very similar to rgblk_search, except we're looking for whole - * 64-bit words that represent a chunk of 32 free blocks. I'm only focusing - * on aligned dwords for speed's sake. - * - * Returns: 0 if successful or BFITNOENT if there isn't enough free space */ -static int rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) +static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, + unsigned requested) { - struct gfs2_bitmap *bi = rgd->rd_bits; - const u32 length = rgd->rd_length; - u32 blk; - unsigned int buf, x, search_bytes; - u8 *buffer = NULL; - u8 *ptr, *end, *nonzero; - u32 goal, rsv_bytes; - struct gfs2_blkreserv *rs; - u32 best_rs_bytes, unclaimed; - int best_rs_blocks; + struct gfs2_rbm rbm = { .rgd = rgd, }; + u64 goal; + struct gfs2_blkreserv *rs = ip->i_res; + u32 extlen; + u32 free_blocks = rgd->rd_free_clone - rgd->rd_reserved; + int ret; + + extlen = max_t(u32, atomic_read(&rs->rs_sizehint), requested); + extlen = clamp(extlen, RGRP_RSRV_MINBLKS, free_blocks); + if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen)) + return; /* Find bitmap block that contains bits for goal block */ if (rgrp_contains_block(rgd, ip->i_goal)) - goal = ip->i_goal - rgd->rd_data0; + goal = ip->i_goal; else - goal = rgd->rd_last_alloc; - for (buf = 0; buf < length; buf++) { - bi = rgd->rd_bits + buf; - /* Convert scope of "goal" from rgrp-wide to within - found bit block */ - if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) { - goal -= bi->bi_start * GFS2_NBBY; - goto do_search; - } + goal = rgd->rd_last_alloc + rgd->rd_data0; + + if (WARN_ON(gfs2_rbm_from_block(&rbm, goal))) + return; + + ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, extlen, ip, true); + if (ret == 0) { + rs->rs_rbm = rbm; + rs->rs_free = extlen; + rs->rs_inum = ip->i_no_addr; + rs_insert(ip); } - buf = 0; - goal = 0; - -do_search: - best_rs_blocks = max_t(int, atomic_read(&ip->i_res->rs_sizehint), - (RGRP_RSRV_MINBLKS * rgd->rd_length)); - best_rs_bytes = (best_rs_blocks * - (1 + (RSRV_CONTENTION_FACTOR * rgd->rd_rs_cnt))) / - GFS2_NBBY; /* 1 + is for our not-yet-created reservation */ - best_rs_bytes = ALIGN(best_rs_bytes, sizeof(u64)); - unclaimed = unclaimed_blocks(rgd); - if (best_rs_bytes * GFS2_NBBY > unclaimed) - best_rs_bytes = unclaimed >> GFS2_BIT_SIZE; - - for (x = 0; x <= length; x++) { - bi = rgd->rd_bits + buf; +} - if (test_bit(GBF_FULL, &bi->bi_flags)) - goto skip; +/** + * gfs2_next_unreserved_block - Return next block that is not reserved + * @rgd: The resource group + * @block: The starting block + * @length: The required length + * @ip: Ignore any reservations for this inode + * + * If the block does not appear in any reservation, then return the + * block number unchanged. If it does appear in the reservation, then + * keep looking through the tree of reservations in order to find the + * first block number which is not reserved. + */ - WARN_ON(!buffer_uptodate(bi->bi_bh)); - if (bi->bi_clone) - buffer = bi->bi_clone + bi->bi_offset; +static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block, + u32 length, + const struct gfs2_inode *ip) +{ + struct gfs2_blkreserv *rs; + struct rb_node *n; + int rc; + + spin_lock(&rgd->rd_rsspin); + n = rgd->rd_rstree.rb_node; + while (n) { + rs = rb_entry(n, struct gfs2_blkreserv, rs_node); + rc = rs_cmp(block, length, rs); + if (rc < 0) + n = n->rb_left; + else if (rc > 0) + n = n->rb_right; else - buffer = bi->bi_bh->b_data + bi->bi_offset; - - /* We have to keep the reservations aligned on u64 boundaries - otherwise we could get situations where a byte can't be - used because it's after a reservation, but a free bit still - is within the reservation's area. */ - ptr = buffer + ALIGN(goal >> GFS2_BIT_SIZE, sizeof(u64)); - end = (buffer + bi->bi_len); - while (ptr < end) { - rsv_bytes = 0; - if ((ptr + best_rs_bytes) <= end) - search_bytes = best_rs_bytes; - else - search_bytes = end - ptr; - BUG_ON(!search_bytes); - nonzero = memchr_inv(ptr, 0, search_bytes); - /* If the lot is all zeroes, reserve the whole size. If - there's enough zeroes to satisfy the request, use - what we can. If there's not enough, keep looking. */ - if (nonzero == NULL) - rsv_bytes = search_bytes; - else if ((nonzero - ptr) * GFS2_NBBY >= - ip->i_res->rs_requested) - rsv_bytes = (nonzero - ptr); - - if (rsv_bytes) { - blk = ((ptr - buffer) * GFS2_NBBY); - BUG_ON(blk >= bi->bi_len * GFS2_NBBY); - rs = rs_insert(bi, ip, blk, - rsv_bytes * GFS2_NBBY); - if (IS_ERR(rs)) - return PTR_ERR(rs); - if (rs) - return 0; - } - ptr += ALIGN(search_bytes, sizeof(u64)); + break; + } + + if (n) { + while ((rs_cmp(block, length, rs) == 0) && (ip->i_res != rs)) { + block = gfs2_rbm_to_block(&rs->rs_rbm) + rs->rs_free; + n = n->rb_right; + if (n == NULL) + break; + rs = rb_entry(n, struct gfs2_blkreserv, rs_node); } -skip: - /* Try next bitmap block (wrap back to rgrp header - if at end) */ - buf++; - buf %= length; - goal = 0; } - return BFITNOENT; + spin_unlock(&rgd->rd_rsspin); + return block; } /** - * try_rgrp_fit - See if a given reservation will fit in a given RG - * @rgd: the RG data - * @ip: the inode + * gfs2_reservation_check_and_update - Check for reservations during block alloc + * @rbm: The current position in the resource group + * @ip: The inode for which we are searching for blocks + * @minext: The minimum extent length * - * If there's room for the requested blocks to be allocated from the RG: - * This will try to get a multi-block reservation first, and if that doesn't - * fit, it will take what it can. + * This checks the current position in the rgrp to see whether there is + * a reservation covering this block. If not then this function is a + * no-op. If there is, then the position is moved to the end of the + * contiguous reservation(s) so that we are pointing at the first + * non-reserved block. * - * Returns: 1 on success (it fits), 0 on failure (it doesn't fit) + * Returns: 0 if no reservation, 1 if @rbm has changed, otherwise an error */ -static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) +static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm, + const struct gfs2_inode *ip, + u32 minext) { - struct gfs2_blkreserv *rs = ip->i_res; + u64 block = gfs2_rbm_to_block(rbm); + u32 extlen = 1; + u64 nblock; + int ret; - if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) + /* + * If we have a minimum extent length, then skip over any extent + * which is less than the min extent length in size. + */ + if (minext) { + extlen = gfs2_free_extlen(rbm, minext); + nblock = block + extlen; + if (extlen < minext) + goto fail; + } + + /* + * Check the extent which has been found against the reservations + * and skip if parts of it are already reserved + */ + nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip); + if (nblock == block) return 0; - /* Look for a multi-block reservation. */ - if (unclaimed_blocks(rgd) >= RGRP_RSRV_MINBLKS && - rg_mblk_search(rgd, ip) != BFITNOENT) - return 1; - if (unclaimed_blocks(rgd) >= rs->rs_requested) - return 1; +fail: + ret = gfs2_rbm_from_block(rbm, nblock); + if (ret < 0) + return ret; + return 1; +} - return 0; +/** + * gfs2_rbm_find - Look for blocks of a particular state + * @rbm: Value/result starting position and final position + * @state: The state which we want to find + * @minext: The requested extent length (0 for a single block) + * @ip: If set, check for reservations + * @nowrap: Stop looking at the end of the rgrp, rather than wrapping + * around until we've reached the starting point. + * + * Side effects: + * - If looking for free blocks, we set GBF_FULL on each bitmap which + * has no free blocks in it. + * + * Returns: 0 on success, -ENOSPC if there is no block of the requested state + */ + +static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext, + const struct gfs2_inode *ip, bool nowrap) +{ + struct buffer_head *bh; + struct gfs2_bitmap *initial_bi; + u32 initial_offset; + u32 offset; + u8 *buffer; + int index; + int n = 0; + int iters = rbm->rgd->rd_length; + int ret; + + /* If we are not starting at the beginning of a bitmap, then we + * need to add one to the bitmap count to ensure that we search + * the starting bitmap twice. + */ + if (rbm->offset != 0) + iters++; + + while(1) { + if (test_bit(GBF_FULL, &rbm->bi->bi_flags) && + (state == GFS2_BLKST_FREE)) + goto next_bitmap; + + bh = rbm->bi->bi_bh; + buffer = bh->b_data + rbm->bi->bi_offset; + WARN_ON(!buffer_uptodate(bh)); + if (state != GFS2_BLKST_UNLINKED && rbm->bi->bi_clone) + buffer = rbm->bi->bi_clone + rbm->bi->bi_offset; + initial_offset = rbm->offset; + offset = gfs2_bitfit(buffer, rbm->bi->bi_len, rbm->offset, state); + if (offset == BFITNOENT) + goto bitmap_full; + rbm->offset = offset; + if (ip == NULL) + return 0; + + initial_bi = rbm->bi; + ret = gfs2_reservation_check_and_update(rbm, ip, minext); + if (ret == 0) + return 0; + if (ret > 0) { + n += (rbm->bi - initial_bi); + goto next_iter; + } + if (ret == -E2BIG) { + index = 0; + rbm->offset = 0; + n += (rbm->bi - initial_bi); + goto res_covered_end_of_rgrp; + } + return ret; + +bitmap_full: /* Mark bitmap as full and fall through */ + if ((state == GFS2_BLKST_FREE) && initial_offset == 0) + set_bit(GBF_FULL, &rbm->bi->bi_flags); + +next_bitmap: /* Find next bitmap in the rgrp */ + rbm->offset = 0; + index = rbm->bi - rbm->rgd->rd_bits; + index++; + if (index == rbm->rgd->rd_length) + index = 0; +res_covered_end_of_rgrp: + rbm->bi = &rbm->rgd->rd_bits[index]; + if ((index == 0) && nowrap) + break; + n++; +next_iter: + if (n >= iters) + break; + } + + return -ENOSPC; } /** @@ -1489,34 +1616,33 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip) { - u32 goal = 0, block; - u64 no_addr; + u64 block; struct gfs2_sbd *sdp = rgd->rd_sbd; struct gfs2_glock *gl; struct gfs2_inode *ip; int error; int found = 0; - struct gfs2_bitmap *bi; + struct gfs2_rbm rbm = { .rgd = rgd, .bi = rgd->rd_bits, .offset = 0 }; - while (goal < rgd->rd_data) { + while (1) { down_write(&sdp->sd_log_flush_lock); - block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED, &bi); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, 0, NULL, true); up_write(&sdp->sd_log_flush_lock); - if (block == BFITNOENT) + if (error == -ENOSPC) + break; + if (WARN_ON_ONCE(error)) break; - block = gfs2_bi2rgd_blk(bi, block); - /* rgblk_search can return a block < goal, so we need to - keep it marching forward. */ - no_addr = block + rgd->rd_data0; - goal = max(block + 1, goal + 1); - if (*last_unlinked != NO_BLOCK && no_addr <= *last_unlinked) + block = gfs2_rbm_to_block(&rbm); + if (gfs2_rbm_from_block(&rbm, block + 1)) + break; + if (*last_unlinked != NO_BLOCK && block <= *last_unlinked) continue; - if (no_addr == skip) + if (block == skip) continue; - *last_unlinked = no_addr; + *last_unlinked = block; - error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &gl); + error = gfs2_glock_get(sdp, block, &gfs2_inode_glops, CREATE, &gl); if (error) continue; @@ -1543,6 +1669,19 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip return; } +static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin) +{ + struct gfs2_rgrpd *rgd = *pos; + + rgd = gfs2_rgrpd_get_next(rgd); + if (rgd == NULL) + rgd = gfs2_rgrpd_get_next(NULL); + *pos = rgd; + if (rgd != begin) /* If we didn't wrap */ + return true; + return false; +} + /** * gfs2_inplace_reserve - Reserve space in the filesystem * @ip: the inode to reserve space for @@ -1562,103 +1701,96 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested) if (sdp->sd_args.ar_rgrplvb) flags |= GL_SKIP; - rs->rs_requested = requested; - if (gfs2_assert_warn(sdp, requested)) { - error = -EINVAL; - goto out; - } + if (gfs2_assert_warn(sdp, requested)) + return -EINVAL; if (gfs2_rs_active(rs)) { - begin = rs->rs_rgd; + begin = rs->rs_rbm.rgd; flags = 0; /* Yoda: Do or do not. There is no try */ } else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) { - rs->rs_rgd = begin = ip->i_rgd; + rs->rs_rbm.rgd = begin = ip->i_rgd; } else { - rs->rs_rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1); + rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1); } - if (rs->rs_rgd == NULL) + if (rs->rs_rbm.rgd == NULL) return -EBADSLT; while (loops < 3) { - rg_locked = 0; - - if (gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl)) { - rg_locked = 1; - error = 0; - } else if (!loops && !gfs2_rs_active(rs) && - rs->rs_rgd->rd_rs_cnt > RGRP_RSRV_MAX_CONTENDERS) { - /* If the rgrp already is maxed out for contenders, - we can eliminate it as a "first pass" without even - requesting the rgrp glock. */ - error = GLR_TRYFAILED; - } else { - error = gfs2_glock_nq_init(rs->rs_rgd->rd_gl, + rg_locked = 1; + + if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) { + rg_locked = 0; + error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl, LM_ST_EXCLUSIVE, flags, &rs->rs_rgd_gh); - if (!error && sdp->sd_args.ar_rgrplvb) { - error = update_rgrp_lvb(rs->rs_rgd); - if (error) { + if (error == GLR_TRYFAILED) + goto next_rgrp; + if (unlikely(error)) + return error; + if (sdp->sd_args.ar_rgrplvb) { + error = update_rgrp_lvb(rs->rs_rbm.rgd); + if (unlikely(error)) { gfs2_glock_dq_uninit(&rs->rs_rgd_gh); return error; } } } - switch (error) { - case 0: - if (gfs2_rs_active(rs)) { - if (unclaimed_blocks(rs->rs_rgd) + - rs->rs_free >= rs->rs_requested) { - ip->i_rgd = rs->rs_rgd; - return 0; - } - /* We have a multi-block reservation, but the - rgrp doesn't have enough free blocks to - satisfy the request. Free the reservation - and look for a suitable rgrp. */ - gfs2_rs_deltree(rs); - } - if (try_rgrp_fit(rs->rs_rgd, ip)) { - if (sdp->sd_args.ar_rgrplvb) - gfs2_rgrp_bh_get(rs->rs_rgd); - ip->i_rgd = rs->rs_rgd; - return 0; - } - if (rs->rs_rgd->rd_flags & GFS2_RDF_CHECK) { - if (sdp->sd_args.ar_rgrplvb) - gfs2_rgrp_bh_get(rs->rs_rgd); - try_rgrp_unlink(rs->rs_rgd, &last_unlinked, - ip->i_no_addr); - } - if (!rg_locked) - gfs2_glock_dq_uninit(&rs->rs_rgd_gh); - /* fall through */ - case GLR_TRYFAILED: - rs->rs_rgd = gfs2_rgrpd_get_next(rs->rs_rgd); - rs->rs_rgd = rs->rs_rgd ? : begin; /* if NULL, wrap */ - if (rs->rs_rgd != begin) /* If we didn't wrap */ - break; - flags &= ~LM_FLAG_TRY; - loops++; - /* Check that fs hasn't grown if writing to rindex */ - if (ip == GFS2_I(sdp->sd_rindex) && - !sdp->sd_rindex_uptodate) { - error = gfs2_ri_update(ip); - if (error) - goto out; - } else if (loops == 2) - /* Flushing the log may release space */ - gfs2_log_flush(sdp, NULL); - break; - default: - goto out; + /* Skip unuseable resource groups */ + if (rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) + goto skip_rgrp; + + if (sdp->sd_args.ar_rgrplvb) + gfs2_rgrp_bh_get(rs->rs_rbm.rgd); + + /* Get a reservation if we don't already have one */ + if (!gfs2_rs_active(rs)) + rg_mblk_search(rs->rs_rbm.rgd, ip, requested); + + /* Skip rgrps when we can't get a reservation on first pass */ + if (!gfs2_rs_active(rs) && (loops < 1)) + goto check_rgrp; + + /* If rgrp has enough free space, use it */ + if (rs->rs_rbm.rgd->rd_free_clone >= requested) { + ip->i_rgd = rs->rs_rbm.rgd; + return 0; + } + + /* Drop reservation, if we couldn't use reserved rgrp */ + if (gfs2_rs_active(rs)) + gfs2_rs_deltree(ip, rs); +check_rgrp: + /* Check for unlinked inodes which can be reclaimed */ + if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK) + try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked, + ip->i_no_addr); +skip_rgrp: + /* Unlock rgrp if required */ + if (!rg_locked) + gfs2_glock_dq_uninit(&rs->rs_rgd_gh); +next_rgrp: + /* Find the next rgrp, and continue looking */ + if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin)) + continue; + + /* If we've scanned all the rgrps, but found no free blocks + * then this checks for some less likely conditions before + * trying again. + */ + flags &= ~LM_FLAG_TRY; + loops++; + /* Check that fs hasn't grown if writing to rindex */ + if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) { + error = gfs2_ri_update(ip); + if (error) + return error; } + /* Flushing the log may release space */ + if (loops == 2) + gfs2_log_flush(sdp, NULL); } - error = -ENOSPC; -out: - if (error) - rs->rs_requested = 0; - return error; + return -ENOSPC; } /** @@ -1672,15 +1804,8 @@ void gfs2_inplace_release(struct gfs2_inode *ip) { struct gfs2_blkreserv *rs = ip->i_res; - if (!rs) - return; - - if (!rs->rs_free) - gfs2_rs_deltree(rs); - if (rs->rs_rgd_gh.gh_gl) gfs2_glock_dq_uninit(&rs->rs_rgd_gh); - rs->rs_requested = 0; } /** @@ -1693,173 +1818,47 @@ void gfs2_inplace_release(struct gfs2_inode *ip) static unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block) { - struct gfs2_bitmap *bi = NULL; - u32 length, rgrp_block, buf_block; - unsigned int buf; - unsigned char type; - - length = rgd->rd_length; - rgrp_block = block - rgd->rd_data0; - - for (buf = 0; buf < length; buf++) { - bi = rgd->rd_bits + buf; - if (rgrp_block < (bi->bi_start + bi->bi_len) * GFS2_NBBY) - break; - } + struct gfs2_rbm rbm = { .rgd = rgd, }; + int ret; - gfs2_assert(rgd->rd_sbd, buf < length); - buf_block = rgrp_block - bi->bi_start * GFS2_NBBY; + ret = gfs2_rbm_from_block(&rbm, block); + WARN_ON_ONCE(ret != 0); - type = gfs2_testbit(rgd, bi->bi_bh->b_data + bi->bi_offset, - bi->bi_len, buf_block); - - return type; + return gfs2_testbit(&rbm); } -/** - * rgblk_search - find a block in @state - * @rgd: the resource group descriptor - * @goal: the goal block within the RG (start here to search for avail block) - * @state: GFS2_BLKST_XXX the before-allocation state to find - * @rbi: address of the pointer to the bitmap containing the block found - * - * Walk rgrp's bitmap to find bits that represent a block in @state. - * - * This function never fails, because we wouldn't call it unless we - * know (from reservation results, etc.) that a block is available. - * - * Scope of @goal is just within rgrp, not the whole filesystem. - * Scope of @returned block is just within bitmap, not the whole filesystem. - * - * Returns: the block number found relative to the bitmap rbi - */ - -static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, unsigned char state, - struct gfs2_bitmap **rbi) -{ - struct gfs2_bitmap *bi = NULL; - const u32 length = rgd->rd_length; - u32 biblk = BFITNOENT; - unsigned int buf, x; - const u8 *buffer = NULL; - - *rbi = NULL; - /* Find bitmap block that contains bits for goal block */ - for (buf = 0; buf < length; buf++) { - bi = rgd->rd_bits + buf; - /* Convert scope of "goal" from rgrp-wide to within found bit block */ - if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) { - goal -= bi->bi_start * GFS2_NBBY; - goto do_search; - } - } - buf = 0; - goal = 0; - -do_search: - /* Search (up to entire) bitmap in this rgrp for allocatable block. - "x <= length", instead of "x < length", because we typically start - the search in the middle of a bit block, but if we can't find an - allocatable block anywhere else, we want to be able wrap around and - search in the first part of our first-searched bit block. */ - for (x = 0; x <= length; x++) { - bi = rgd->rd_bits + buf; - - if (test_bit(GBF_FULL, &bi->bi_flags) && - (state == GFS2_BLKST_FREE)) - goto skip; - - /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone - bitmaps, so we must search the originals for that. */ - buffer = bi->bi_bh->b_data + bi->bi_offset; - WARN_ON(!buffer_uptodate(bi->bi_bh)); - if (state != GFS2_BLKST_UNLINKED && bi->bi_clone) - buffer = bi->bi_clone + bi->bi_offset; - - while (1) { - struct gfs2_blkreserv *rs; - u32 rgblk; - - biblk = gfs2_bitfit(buffer, bi->bi_len, goal, state); - if (biblk == BFITNOENT) - break; - /* Check if this block is reserved() */ - rgblk = gfs2_bi2rgd_blk(bi, biblk); - rs = rs_find(rgd, rgblk); - if (rs == NULL) - break; - - BUG_ON(rs->rs_bi != bi); - biblk = BFITNOENT; - /* This should jump to the first block after the - reservation. */ - goal = rs->rs_biblk + rs->rs_free; - if (goal >= bi->bi_len * GFS2_NBBY) - break; - } - if (biblk != BFITNOENT) - break; - - if ((goal == 0) && (state == GFS2_BLKST_FREE)) - set_bit(GBF_FULL, &bi->bi_flags); - - /* Try next bitmap block (wrap back to rgrp header if at end) */ -skip: - buf++; - buf %= length; - goal = 0; - } - - if (biblk != BFITNOENT) - *rbi = bi; - - return biblk; -} /** * gfs2_alloc_extent - allocate an extent from a given bitmap - * @rgd: the resource group descriptor - * @bi: the bitmap within the rgrp - * @blk: the block within the bitmap + * @rbm: the resource group information * @dinode: TRUE if the first block we allocate is for a dinode - * @n: The extent length + * @n: The extent length (value/result) * - * Add the found bitmap buffer to the transaction. + * Add the bitmap buffer to the transaction. * Set the found bits to @new_state to change block's allocation state. - * Returns: starting block number of the extent (fs scope) */ -static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi, - u32 blk, bool dinode, unsigned int *n) +static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode, + unsigned int *n) { + struct gfs2_rbm pos = { .rgd = rbm->rgd, }; const unsigned int elen = *n; - u32 goal, rgblk; - const u8 *buffer = NULL; - struct gfs2_blkreserv *rs; - - *n = 0; - buffer = bi->bi_bh->b_data + bi->bi_offset; - gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); - gfs2_setbit(rgd, bi->bi_clone, bi, blk, - dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED); - (*n)++; - goal = blk; + u64 block; + int ret; + + *n = 1; + block = gfs2_rbm_to_block(rbm); + gfs2_trans_add_bh(rbm->rgd->rd_gl, rbm->bi->bi_bh, 1); + gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED); + block++; while (*n < elen) { - goal++; - if (goal >= (bi->bi_len * GFS2_NBBY)) - break; - rgblk = gfs2_bi2rgd_blk(bi, goal); - rs = rs_find(rgd, rgblk); - if (rs) /* Oops, we bumped into someone's reservation */ - break; - if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) != - GFS2_BLKST_FREE) + ret = gfs2_rbm_from_block(&pos, block); + if (ret || gfs2_testbit(&pos) != GFS2_BLKST_FREE) break; - gfs2_setbit(rgd, bi->bi_clone, bi, goal, GFS2_BLKST_USED); + gfs2_trans_add_bh(pos.rgd->rd_gl, pos.bi->bi_bh, 1); + gfs2_setbit(&pos, true, GFS2_BLKST_USED); (*n)++; + block++; } - blk = gfs2_bi2rgd_blk(bi, blk); - rgd->rd_last_alloc = blk + *n - 1; - return rgd->rd_data0 + blk; } /** @@ -1875,46 +1874,30 @@ static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi, static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, u32 blen, unsigned char new_state) { - struct gfs2_rgrpd *rgd; - struct gfs2_bitmap *bi = NULL; - u32 length, rgrp_blk, buf_blk; - unsigned int buf; + struct gfs2_rbm rbm; - rgd = gfs2_blk2rgrpd(sdp, bstart, 1); - if (!rgd) { + rbm.rgd = gfs2_blk2rgrpd(sdp, bstart, 1); + if (!rbm.rgd) { if (gfs2_consist(sdp)) fs_err(sdp, "block = %llu\n", (unsigned long long)bstart); return NULL; } - length = rgd->rd_length; - - rgrp_blk = bstart - rgd->rd_data0; - while (blen--) { - for (buf = 0; buf < length; buf++) { - bi = rgd->rd_bits + buf; - if (rgrp_blk < (bi->bi_start + bi->bi_len) * GFS2_NBBY) - break; - } - - gfs2_assert(rgd->rd_sbd, buf < length); - - buf_blk = rgrp_blk - bi->bi_start * GFS2_NBBY; - rgrp_blk++; - - if (!bi->bi_clone) { - bi->bi_clone = kmalloc(bi->bi_bh->b_size, - GFP_NOFS | __GFP_NOFAIL); - memcpy(bi->bi_clone + bi->bi_offset, - bi->bi_bh->b_data + bi->bi_offset, - bi->bi_len); + gfs2_rbm_from_block(&rbm, bstart); + bstart++; + if (!rbm.bi->bi_clone) { + rbm.bi->bi_clone = kmalloc(rbm.bi->bi_bh->b_size, + GFP_NOFS | __GFP_NOFAIL); + memcpy(rbm.bi->bi_clone + rbm.bi->bi_offset, + rbm.bi->bi_bh->b_data + rbm.bi->bi_offset, + rbm.bi->bi_len); } - gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); - gfs2_setbit(rgd, NULL, bi, buf_blk, new_state); + gfs2_trans_add_bh(rbm.rgd->rd_gl, rbm.bi->bi_bh, 1); + gfs2_setbit(&rbm, false, new_state); } - return rgd; + return rbm.rgd; } /** @@ -1956,56 +1939,41 @@ static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd) } /** - * claim_reserved_blks - Claim previously reserved blocks - * @ip: the inode that's claiming the reservation - * @dinode: 1 if this block is a dinode block, otherwise data block - * @nblocks: desired extent length + * gfs2_adjust_reservation - Adjust (or remove) a reservation after allocation + * @ip: The inode we have just allocated blocks for + * @rbm: The start of the allocated blocks + * @len: The extent length * - * Lay claim to previously allocated block reservation blocks. - * Returns: Starting block number of the blocks claimed. - * Sets *nblocks to the actual extent length allocated. + * Adjusts a reservation after an allocation has taken place. If the + * reservation does not match the allocation, or if it is now empty + * then it is removed. */ -static u64 claim_reserved_blks(struct gfs2_inode *ip, bool dinode, - unsigned int *nblocks) + +static void gfs2_adjust_reservation(struct gfs2_inode *ip, + const struct gfs2_rbm *rbm, unsigned len) { struct gfs2_blkreserv *rs = ip->i_res; - struct gfs2_rgrpd *rgd = rs->rs_rgd; - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_bitmap *bi; - u64 start_block = gfs2_rs_startblk(rs); - const unsigned int elen = *nblocks; - - /*BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));*/ - gfs2_assert_withdraw(sdp, rgd); - /*BUG_ON(!gfs2_glock_is_locked_by_me(rgd->rd_gl));*/ - bi = rs->rs_bi; - gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); - - for (*nblocks = 0; *nblocks < elen && rs->rs_free; (*nblocks)++) { - /* Make sure the bitmap hasn't changed */ - gfs2_setbit(rgd, bi->bi_clone, bi, rs->rs_biblk, - dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED); - rs->rs_biblk++; - rs->rs_free--; - - BUG_ON(!rgd->rd_reserved); - rgd->rd_reserved--; - dinode = false; - trace_gfs2_rs(ip, rs, TRACE_RS_CLAIM); - } - - if (!rs->rs_free) { - struct gfs2_rgrpd *rgd = ip->i_res->rs_rgd; + struct gfs2_rgrpd *rgd = rbm->rgd; + unsigned rlen; + u64 block; + int ret; - gfs2_rs_deltree(rs); - /* -nblocks because we haven't returned to do the math yet. - I'm doing the math backwards to prevent negative numbers, - but think of it as: - if (unclaimed_blocks(rgd) - *nblocks >= RGRP_RSRV_MINBLKS */ - if (unclaimed_blocks(rgd) >= RGRP_RSRV_MINBLKS + *nblocks) - rg_mblk_search(rgd, ip); + spin_lock(&rgd->rd_rsspin); + if (gfs2_rs_active(rs)) { + if (gfs2_rbm_eq(&rs->rs_rbm, rbm)) { + block = gfs2_rbm_to_block(rbm); + ret = gfs2_rbm_from_block(&rs->rs_rbm, block + len); + rlen = min(rs->rs_free, len); + rs->rs_free -= rlen; + rgd->rd_reserved -= rlen; + trace_gfs2_rs(rs, TRACE_RS_CLAIM); + if (rs->rs_free && !ret) + goto out; + } + __rs_deltree(ip, rs); } - return start_block; +out: + spin_unlock(&rgd->rd_rsspin); } /** @@ -2024,47 +1992,40 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct buffer_head *dibh; - struct gfs2_rgrpd *rgd; + struct gfs2_rbm rbm = { .rgd = ip->i_rgd, }; unsigned int ndata; - u32 goal, blk; /* block, within the rgrp scope */ + u64 goal; u64 block; /* block, within the file system scope */ int error; - struct gfs2_bitmap *bi; - /* Only happens if there is a bug in gfs2, return something distinctive - * to ensure that it is noticed. - */ - if (ip->i_res->rs_requested == 0) - return -ECANCELED; - - /* Check if we have a multi-block reservation, and if so, claim the - next free block from it. */ - if (gfs2_rs_active(ip->i_res)) { - BUG_ON(!ip->i_res->rs_free); - rgd = ip->i_res->rs_rgd; - block = claim_reserved_blks(ip, dinode, nblocks); - } else { - rgd = ip->i_rgd; + if (gfs2_rs_active(ip->i_res)) + goal = gfs2_rbm_to_block(&ip->i_res->rs_rbm); + else if (!dinode && rgrp_contains_block(rbm.rgd, ip->i_goal)) + goal = ip->i_goal; + else + goal = rbm.rgd->rd_last_alloc + rbm.rgd->rd_data0; - if (!dinode && rgrp_contains_block(rgd, ip->i_goal)) - goal = ip->i_goal - rgd->rd_data0; - else - goal = rgd->rd_last_alloc; - - blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, &bi); - - /* Since all blocks are reserved in advance, this shouldn't - happen */ - if (blk == BFITNOENT) { - printk(KERN_WARNING "BFITNOENT, nblocks=%u\n", - *nblocks); - printk(KERN_WARNING "FULL=%d\n", - test_bit(GBF_FULL, &rgd->rd_bits->bi_flags)); - goto rgrp_error; - } + gfs2_rbm_from_block(&rbm, goal); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, ip, false); - block = gfs2_alloc_extent(rgd, bi, blk, dinode, nblocks); + if (error == -ENOSPC) { + gfs2_rbm_from_block(&rbm, goal); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, NULL, false); + } + + /* Since all blocks are reserved in advance, this shouldn't happen */ + if (error) { + fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d\n", + (unsigned long long)ip->i_no_addr, error, *nblocks, + test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags)); + goto rgrp_error; } + + gfs2_alloc_extent(&rbm, dinode, nblocks); + block = gfs2_rbm_to_block(&rbm); + rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0; + if (gfs2_rs_active(ip->i_res)) + gfs2_adjust_reservation(ip, &rbm, *nblocks); ndata = *nblocks; if (dinode) ndata--; @@ -2081,22 +2042,22 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, brelse(dibh); } } - if (rgd->rd_free < *nblocks) { + if (rbm.rgd->rd_free < *nblocks) { printk(KERN_WARNING "nblocks=%u\n", *nblocks); goto rgrp_error; } - rgd->rd_free -= *nblocks; + rbm.rgd->rd_free -= *nblocks; if (dinode) { - rgd->rd_dinodes++; - *generation = rgd->rd_igeneration++; + rbm.rgd->rd_dinodes++; + *generation = rbm.rgd->rd_igeneration++; if (*generation == 0) - *generation = rgd->rd_igeneration++; + *generation = rbm.rgd->rd_igeneration++; } - gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); - gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); - gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); + gfs2_trans_add_bh(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh, 1); + gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data); + gfs2_rgrp_ondisk2lvb(rbm.rgd->rd_rgl, rbm.rgd->rd_bits[0].bi_bh->b_data); gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0); if (dinode) @@ -2110,14 +2071,14 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, gfs2_quota_change(ip, ndata, ip->i_inode.i_uid, ip->i_inode.i_gid); - rgd->rd_free_clone -= *nblocks; - trace_gfs2_block_alloc(ip, rgd, block, *nblocks, + rbm.rgd->rd_free_clone -= *nblocks; + trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED); *bn = block; return 0; rgrp_error: - gfs2_rgrp_error(rgd); + gfs2_rgrp_error(rbm.rgd); return -EIO; } diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index ca6e267..2407795 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -46,7 +46,7 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, bool dinode, u64 *generation); extern int gfs2_rs_alloc(struct gfs2_inode *ip); -extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs); +extern void gfs2_rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs); extern void gfs2_rs_delete(struct gfs2_inode *ip); extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta); extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); @@ -73,30 +73,10 @@ extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed); extern int gfs2_fitrim(struct file *filp, void __user *argp); -/* This is how to tell if a multi-block reservation is "inplace" reserved: */ -static inline int gfs2_mb_reserved(struct gfs2_inode *ip) +/* This is how to tell if a reservation is in the rgrp tree: */ +static inline bool gfs2_rs_active(struct gfs2_blkreserv *rs) { - if (ip->i_res && ip->i_res->rs_requested) - return 1; - return 0; -} - -/* This is how to tell if a multi-block reservation is in the rgrp tree: */ -static inline int gfs2_rs_active(struct gfs2_blkreserv *rs) -{ - if (rs && rs->rs_bi) - return 1; - return 0; -} - -static inline u32 gfs2_bi2rgd_blk(const struct gfs2_bitmap *bi, u32 blk) -{ - return (bi->bi_start * GFS2_NBBY) + blk; -} - -static inline u64 gfs2_rs_startblk(const struct gfs2_blkreserv *rs) -{ - return gfs2_bi2rgd_blk(rs->rs_bi, rs->rs_biblk) + rs->rs_rgd->rd_data0; + return rs && !RB_EMPTY_NODE(&rs->rs_node); } #endif /* __RGRP_DOT_H__ */ diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index fc3168f..bc73726 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1366,6 +1366,8 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root) val = sdp->sd_tune.gt_statfs_quantum; if (val != 30) seq_printf(s, ",statfs_quantum=%d", val); + else if (sdp->sd_tune.gt_statfs_slow) + seq_puts(s, ",statfs_quantum=0"); val = sdp->sd_tune.gt_quota_quantum; if (val != 60) seq_printf(s, ",quota_quantum=%d", val); @@ -1543,6 +1545,11 @@ static void gfs2_evict_inode(struct inode *inode) out_truncate: gfs2_log_flush(sdp, ip->i_gl); + if (test_bit(GLF_DIRTY, &ip->i_gl->gl_flags)) { + struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl); + filemap_fdatawrite(metamapping); + filemap_fdatawait(metamapping); + } write_inode_now(inode, 1); gfs2_ail_flush(ip->i_gl, 0); @@ -1557,7 +1564,7 @@ out_truncate: out_unlock: /* Error path for case 1 */ if (gfs2_rs_active(ip->i_res)) - gfs2_rs_deltree(ip->i_res); + gfs2_rs_deltree(ip, ip->i_res); if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) gfs2_glock_dq(&ip->i_iopen_gh); @@ -1572,7 +1579,7 @@ out: clear_inode(inode); gfs2_dir_hash_inval(ip); ip->i_gl->gl_object = NULL; - flush_delayed_work_sync(&ip->i_gl->gl_work); + flush_delayed_work(&ip->i_gl->gl_work); gfs2_glock_add_to_lru(ip->i_gl); gfs2_glock_put(ip->i_gl); ip->i_gl = NULL; diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index a25c252f..bbdc78a 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -509,10 +509,9 @@ TRACE_EVENT(gfs2_block_alloc, /* Keep track of multi-block reservations as they are allocated/freed */ TRACE_EVENT(gfs2_rs, - TP_PROTO(const struct gfs2_inode *ip, const struct gfs2_blkreserv *rs, - u8 func), + TP_PROTO(const struct gfs2_blkreserv *rs, u8 func), - TP_ARGS(ip, rs, func), + TP_ARGS(rs, func), TP_STRUCT__entry( __field( dev_t, dev ) @@ -526,18 +525,17 @@ TRACE_EVENT(gfs2_rs, ), TP_fast_assign( - __entry->dev = rs->rs_rgd ? rs->rs_rgd->rd_sbd->sd_vfs->s_dev : 0; - __entry->rd_addr = rs->rs_rgd ? rs->rs_rgd->rd_addr : 0; - __entry->rd_free_clone = rs->rs_rgd ? rs->rs_rgd->rd_free_clone : 0; - __entry->rd_reserved = rs->rs_rgd ? rs->rs_rgd->rd_reserved : 0; - __entry->inum = ip ? ip->i_no_addr : 0; - __entry->start = gfs2_rs_startblk(rs); + __entry->dev = rs->rs_rbm.rgd->rd_sbd->sd_vfs->s_dev; + __entry->rd_addr = rs->rs_rbm.rgd->rd_addr; + __entry->rd_free_clone = rs->rs_rbm.rgd->rd_free_clone; + __entry->rd_reserved = rs->rs_rbm.rgd->rd_reserved; + __entry->inum = rs->rs_inum; + __entry->start = gfs2_rbm_to_block(&rs->rs_rbm); __entry->free = rs->rs_free; __entry->func = func; ), - TP_printk("%u,%u bmap %llu resrv %llu rg:%llu rf:%lu rr:%lu %s " - "f:%lu", + TP_printk("%u,%u bmap %llu resrv %llu rg:%llu rf:%lu rr:%lu %s f:%lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->inum, (unsigned long long)__entry->start, diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h index 41f42cd..bf2ae9a 100644 --- a/fs/gfs2/trans.h +++ b/fs/gfs2/trans.h @@ -28,11 +28,10 @@ struct gfs2_glock; /* reserve either the number of blocks to be allocated plus the rg header * block, or all of the blocks in the rg, whichever is smaller */ -static inline unsigned int gfs2_rg_blocks(const struct gfs2_inode *ip) +static inline unsigned int gfs2_rg_blocks(const struct gfs2_inode *ip, unsigned requested) { - const struct gfs2_blkreserv *rs = ip->i_res; - if (rs && rs->rs_requested < ip->i_rgd->rd_length) - return rs->rs_requested + 1; + if (requested < ip->i_rgd->rd_length) + return requested + 1; return ip->i_rgd->rd_length; } diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 27a0b4a..db330e5 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -448,17 +448,18 @@ ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size) } /** - * ea_get_unstuffed - actually copies the unstuffed data into the - * request buffer + * ea_iter_unstuffed - copies the unstuffed xattr data to/from the + * request buffer * @ip: The GFS2 inode * @ea: The extended attribute header structure - * @data: The data to be copied + * @din: The data to be copied in + * @dout: The data to be copied out (one of din,dout will be NULL) * * Returns: errno */ -static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea, - char *data) +static int gfs2_iter_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea, + const char *din, char *dout) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct buffer_head **bh; @@ -467,6 +468,8 @@ static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea, __be64 *dataptrs = GFS2_EA2DATAPTRS(ea); unsigned int x; int error = 0; + unsigned char *pos; + unsigned cp_size; bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS); if (!bh) @@ -497,12 +500,21 @@ static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea, goto out; } - memcpy(data, bh[x]->b_data + sizeof(struct gfs2_meta_header), - (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize); + pos = bh[x]->b_data + sizeof(struct gfs2_meta_header); + cp_size = (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize; - amount -= sdp->sd_jbsize; - data += sdp->sd_jbsize; + if (dout) { + memcpy(dout, pos, cp_size); + dout += sdp->sd_jbsize; + } + + if (din) { + gfs2_trans_add_bh(ip->i_gl, bh[x], 1); + memcpy(pos, din, cp_size); + din += sdp->sd_jbsize; + } + amount -= sdp->sd_jbsize; brelse(bh[x]); } @@ -523,7 +535,7 @@ static int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el, memcpy(data, GFS2_EA2DATA(el->el_ea), len); return len; } - ret = ea_get_unstuffed(ip, el->el_ea, data); + ret = gfs2_iter_unstuffed(ip, el->el_ea, NULL, data); if (ret < 0) return ret; return len; @@ -727,7 +739,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, goto out_gunlock_q; error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), - blks + gfs2_rg_blocks(ip) + + blks + gfs2_rg_blocks(ip, blks) + RES_DINODE + RES_STATFS + RES_QUOTA, 0); if (error) goto out_ipres; @@ -1220,69 +1232,23 @@ static int gfs2_xattr_set(struct dentry *dentry, const char *name, size, flags, type); } + static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea, char *data) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct buffer_head **bh; unsigned int amount = GFS2_EA_DATA_LEN(ea); unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize); - __be64 *dataptrs = GFS2_EA2DATAPTRS(ea); - unsigned int x; - int error; - - bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS); - if (!bh) - return -ENOMEM; - - error = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0); - if (error) - goto out; - - for (x = 0; x < nptrs; x++) { - error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0, - bh + x); - if (error) { - while (x--) - brelse(bh[x]); - goto fail; - } - dataptrs++; - } - - for (x = 0; x < nptrs; x++) { - error = gfs2_meta_wait(sdp, bh[x]); - if (error) { - for (; x < nptrs; x++) - brelse(bh[x]); - goto fail; - } - if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) { - for (; x < nptrs; x++) - brelse(bh[x]); - error = -EIO; - goto fail; - } - - gfs2_trans_add_bh(ip->i_gl, bh[x], 1); - - memcpy(bh[x]->b_data + sizeof(struct gfs2_meta_header), data, - (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize); - - amount -= sdp->sd_jbsize; - data += sdp->sd_jbsize; - - brelse(bh[x]); - } + int ret; -out: - kfree(bh); - return error; + ret = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0); + if (ret) + return ret; -fail: + ret = gfs2_iter_unstuffed(ip, ea, data, NULL); gfs2_trans_end(sdp); - kfree(bh); - return error; + + return ret; } int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 8275175..693df9f 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -134,8 +134,8 @@ struct hfs_sb_info { permissions on all files */ umode_t s_dir_umask; /* The umask applied to the permissions on all dirs */ - uid_t s_uid; /* The uid of all files */ - gid_t s_gid; /* The gid of all files */ + kuid_t s_uid; /* The uid of all files */ + kgid_t s_gid; /* The gid of all files */ int session, part; struct nls_table *nls_io, *nls_disk; diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index ee1bc55..0b35903 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -594,9 +594,9 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr) /* no uig/gid changes and limit which mode bits can be set */ if (((attr->ia_valid & ATTR_UID) && - (attr->ia_uid != hsb->s_uid)) || + (!uid_eq(attr->ia_uid, hsb->s_uid))) || ((attr->ia_valid & ATTR_GID) && - (attr->ia_gid != hsb->s_gid)) || + (!gid_eq(attr->ia_gid, hsb->s_gid))) || ((attr->ia_valid & ATTR_MODE) && ((S_ISDIR(inode->i_mode) && (attr->ia_mode != inode->i_mode)) || @@ -644,7 +644,7 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end, /* sync the superblock to buffers */ sb = inode->i_sb; - flush_delayed_work_sync(&HFS_SB(sb)->mdb_work); + flush_delayed_work(&HFS_SB(sb)->mdb_work); /* .. finally sync the buffers to disk */ err = sync_blockdev(sb->s_bdev); if (!ret) diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 4eb873e..e93ddaa 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -138,7 +138,9 @@ static int hfs_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",creator=%.4s", (char *)&sbi->s_creator); if (sbi->s_type != cpu_to_be32(0x3f3f3f3f)) seq_printf(seq, ",type=%.4s", (char *)&sbi->s_type); - seq_printf(seq, ",uid=%u,gid=%u", sbi->s_uid, sbi->s_gid); + seq_printf(seq, ",uid=%u,gid=%u", + from_kuid_munged(&init_user_ns, sbi->s_uid), + from_kgid_munged(&init_user_ns, sbi->s_gid)); if (sbi->s_file_umask != 0133) seq_printf(seq, ",file_umask=%o", sbi->s_file_umask); if (sbi->s_dir_umask != 0022) @@ -254,14 +256,22 @@ static int parse_options(char *options, struct hfs_sb_info *hsb) printk(KERN_ERR "hfs: uid requires an argument\n"); return 0; } - hsb->s_uid = (uid_t)tmp; + hsb->s_uid = make_kuid(current_user_ns(), (uid_t)tmp); + if (!uid_valid(hsb->s_uid)) { + printk(KERN_ERR "hfs: invalid uid %d\n", tmp); + return 0; + } break; case opt_gid: if (match_int(&args[0], &tmp)) { printk(KERN_ERR "hfs: gid requires an argument\n"); return 0; } - hsb->s_gid = (gid_t)tmp; + hsb->s_gid = make_kgid(current_user_ns(), (gid_t)tmp); + if (!gid_valid(hsb->s_gid)) { + printk(KERN_ERR "hfs: invalid gid %d\n", tmp); + return 0; + } break; case opt_umask: if (match_octal(&args[0], &tmp)) { @@ -482,6 +492,12 @@ static int __init init_hfs_fs(void) static void __exit exit_hfs_fs(void) { unregister_filesystem(&hfs_fs_type); + + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(hfs_inode_cachep); } diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index ec2a9c2..798d9c4 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -80,8 +80,8 @@ void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms) perms->userflags = HFSPLUS_I(inode)->userflags; perms->mode = cpu_to_be16(inode->i_mode); - perms->owner = cpu_to_be32(inode->i_uid); - perms->group = cpu_to_be32(inode->i_gid); + perms->owner = cpu_to_be32(i_uid_read(inode)); + perms->group = cpu_to_be32(i_gid_read(inode)); if (S_ISREG(inode->i_mode)) perms->dev = cpu_to_be32(inode->i_nlink); diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 558dbb4..c571de2 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -149,8 +149,8 @@ struct hfsplus_sb_info { u32 type; umode_t umask; - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; int part, session; unsigned long flags; diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 3d8b4a6..2172aa5 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -233,12 +233,12 @@ static void hfsplus_get_perms(struct inode *inode, mode = be16_to_cpu(perms->mode); - inode->i_uid = be32_to_cpu(perms->owner); - if (!inode->i_uid && !mode) + i_uid_write(inode, be32_to_cpu(perms->owner)); + if (!i_uid_read(inode) && !mode) inode->i_uid = sbi->uid; - inode->i_gid = be32_to_cpu(perms->group); - if (!inode->i_gid && !mode) + i_gid_write(inode, be32_to_cpu(perms->group)); + if (!i_gid_read(inode) && !mode) inode->i_gid = sbi->gid; if (dir) { diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index 06fa561..ed257c6 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -135,14 +135,22 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) printk(KERN_ERR "hfs: uid requires an argument\n"); return 0; } - sbi->uid = (uid_t)tmp; + sbi->uid = make_kuid(current_user_ns(), (uid_t)tmp); + if (!uid_valid(sbi->uid)) { + printk(KERN_ERR "hfs: invalid uid specified\n"); + return 0; + } break; case opt_gid: if (match_int(&args[0], &tmp)) { printk(KERN_ERR "hfs: gid requires an argument\n"); return 0; } - sbi->gid = (gid_t)tmp; + sbi->gid = make_kgid(current_user_ns(), (gid_t)tmp); + if (!gid_valid(sbi->gid)) { + printk(KERN_ERR "hfs: invalid gid specified\n"); + return 0; + } break; case opt_part: if (match_int(&args[0], &sbi->part)) { @@ -215,7 +223,8 @@ int hfsplus_show_options(struct seq_file *seq, struct dentry *root) if (sbi->type != HFSPLUS_DEF_CR_TYPE) seq_printf(seq, ",type=%.4s", (char *)&sbi->type); seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask, - sbi->uid, sbi->gid); + from_kuid_munged(&init_user_ns, sbi->uid), + from_kgid_munged(&init_user_ns, sbi->gid)); if (sbi->part >= 0) seq_printf(seq, ",part=%u", sbi->part); if (sbi->session >= 0) diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index fdafb2d..811a84d 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -635,6 +635,12 @@ static int __init init_hfsplus_fs(void) static void __exit exit_hfsplus_fs(void) { unregister_filesystem(&hfsplus_fs_type); + + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(hfsplus_inode_cachep); } diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index 1fe7313..9c88da0 100644 --- a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -1,7 +1,7 @@ #ifndef __UM_FS_HOSTFS #define __UM_FS_HOSTFS -#include "os.h" +#include <os.h> /* * These are exactly the same definitions as in fs.h, but the names are diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 1241465..457addc 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -16,8 +16,8 @@ #include <linux/mount.h> #include <linux/namei.h> #include "hostfs.h" -#include "init.h" -#include "kern.h" +#include <init.h> +#include <kern.h> struct hostfs_inode_info { int fd; @@ -542,8 +542,8 @@ static int read_name(struct inode *ino, char *name) ino->i_ino = st.ino; ino->i_mode = st.mode; set_nlink(ino, st.nlink); - ino->i_uid = st.uid; - ino->i_gid = st.gid; + i_uid_write(ino, st.uid); + i_gid_write(ino, st.gid); ino->i_atime = st.atime; ino->i_mtime = st.mtime; ino->i_ctime = st.ctime; @@ -808,11 +808,11 @@ int hostfs_setattr(struct dentry *dentry, struct iattr *attr) } if (attr->ia_valid & ATTR_UID) { attrs.ia_valid |= HOSTFS_ATTR_UID; - attrs.ia_uid = attr->ia_uid; + attrs.ia_uid = from_kuid(&init_user_ns, attr->ia_uid); } if (attr->ia_valid & ATTR_GID) { attrs.ia_valid |= HOSTFS_ATTR_GID; - attrs.ia_gid = attr->ia_gid; + attrs.ia_gid = from_kgid(&init_user_ns, attr->ia_gid); } if (attr->ia_valid & ATTR_SIZE) { attrs.ia_valid |= HOSTFS_ATTR_SIZE; @@ -848,9 +848,11 @@ int hostfs_setattr(struct dentry *dentry, struct iattr *attr) attr->ia_size != i_size_read(inode)) { int error; - error = vmtruncate(inode, attr->ia_size); - if (err) - return err; + error = inode_newsize_ok(inode, attr->ia_size); + if (error) + return error; + + truncate_setsize(inode, attr->ia_size); } setattr_copy(inode, attr); diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index a74ad0d..67838f3 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -15,7 +15,6 @@ #include <sys/types.h> #include <sys/vfs.h> #include "hostfs.h" -#include "os.h" #include <utime.h> static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p) diff --git a/fs/hpfs/anode.c b/fs/hpfs/anode.c index 4bae4a4a..2d5b254 100644 --- a/fs/hpfs/anode.c +++ b/fs/hpfs/anode.c @@ -102,7 +102,7 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi return -1; } if (hpfs_alloc_if_possible(s, se = le32_to_cpu(btree->u.external[n].disk_secno) + le32_to_cpu(btree->u.external[n].length))) { - btree->u.external[n].length = cpu_to_le32(le32_to_cpu(btree->u.external[n].length) + 1); + le32_add_cpu(&btree->u.external[n].length, 1); mark_buffer_dirty(bh); brelse(bh); return se; @@ -153,7 +153,7 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi btree = &anode->btree; } btree->n_free_nodes--; n = btree->n_used_nodes++; - btree->first_free = cpu_to_le16(le16_to_cpu(btree->first_free) + 12); + le16_add_cpu(&btree->first_free, 12); btree->u.external[n].disk_secno = cpu_to_le32(se); btree->u.external[n].file_secno = cpu_to_le32(fs); btree->u.external[n].length = cpu_to_le32(1); @@ -174,7 +174,7 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi } if (btree->n_free_nodes) { btree->n_free_nodes--; n = btree->n_used_nodes++; - btree->first_free = cpu_to_le16(le16_to_cpu(btree->first_free) + 8); + le16_add_cpu(&btree->first_free, 8); btree->u.internal[n].file_secno = cpu_to_le32(-1); btree->u.internal[n].down = cpu_to_le32(na); btree->u.internal[n-1].file_secno = cpu_to_le32(fs); diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c index 3228c52..4364b2a 100644 --- a/fs/hpfs/dnode.c +++ b/fs/hpfs/dnode.c @@ -145,10 +145,10 @@ static void set_last_pointer(struct super_block *s, struct dnode *d, dnode_secno } } if (ptr) { - d->first_free = cpu_to_le32(le32_to_cpu(d->first_free) + 4); + le32_add_cpu(&d->first_free, 4); if (le32_to_cpu(d->first_free) > 2048) { hpfs_error(s, "set_last_pointer: too long dnode %08x", le32_to_cpu(d->self)); - d->first_free = cpu_to_le32(le32_to_cpu(d->first_free) - 4); + le32_add_cpu(&d->first_free, -4); return; } de->length = cpu_to_le16(36); @@ -184,7 +184,7 @@ struct hpfs_dirent *hpfs_add_de(struct super_block *s, struct dnode *d, de->not_8x3 = hpfs_is_name_long(name, namelen); de->namelen = namelen; memcpy(de->name, name, namelen); - d->first_free = cpu_to_le32(le32_to_cpu(d->first_free) + d_size); + le32_add_cpu(&d->first_free, d_size); return de; } @@ -314,7 +314,7 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno, set_last_pointer(i->i_sb, ad, de->down ? de_down_pointer(de) : 0); de = de_next_de(de); memmove((char *)nd + 20, de, le32_to_cpu(nd->first_free) + (char *)nd - (char *)de); - nd->first_free = cpu_to_le32(le32_to_cpu(nd->first_free) - ((char *)de - (char *)nd - 20)); + le32_add_cpu(&nd->first_free, -((char *)de - (char *)nd - 20)); memcpy(d, nd, le32_to_cpu(nd->first_free)); for_all_poss(i, hpfs_pos_del, (loff_t)dno << 4, pos); fix_up_ptrs(i->i_sb, ad); @@ -474,8 +474,8 @@ static secno move_to_top(struct inode *i, dnode_secno from, dnode_secno to) hpfs_brelse4(&qbh); return 0; } - dnode->first_free = cpu_to_le32(le32_to_cpu(dnode->first_free) - 4); - de->length = cpu_to_le16(le16_to_cpu(de->length) - 4); + le32_add_cpu(&dnode->first_free, -4); + le16_add_cpu(&de->length, -4); de->down = 0; hpfs_mark_4buffers_dirty(&qbh); dno = up; @@ -570,8 +570,8 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno) for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | 1, ((loff_t)up << 4) | p); if (!down) { de->down = 0; - de->length = cpu_to_le16(le16_to_cpu(de->length) - 4); - dnode->first_free = cpu_to_le32(le32_to_cpu(dnode->first_free) - 4); + le16_add_cpu(&de->length, -4); + le32_add_cpu(&dnode->first_free, -4); memmove(de_next_de(de), (char *)de_next_de(de) + 4, (char *)dnode + le32_to_cpu(dnode->first_free) - (char *)de_next_de(de)); } else { @@ -647,14 +647,14 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno) printk("HPFS: warning: unbalanced dnode tree, see hpfs.txt 4 more info\n"); printk("HPFS: warning: goin'on\n"); } - del->length = cpu_to_le16(le16_to_cpu(del->length) + 4); + le16_add_cpu(&del->length, 4); del->down = 1; - d1->first_free = cpu_to_le32(le32_to_cpu(d1->first_free) + 4); + le32_add_cpu(&d1->first_free, 4); } if (dlp && !down) { - del->length = cpu_to_le16(le16_to_cpu(del->length) - 4); + le16_add_cpu(&del->length, -4); del->down = 0; - d1->first_free = cpu_to_le32(le32_to_cpu(d1->first_free) - 4); + le32_add_cpu(&d1->first_free, -4); } else if (down) *(__le32 *) ((void *) del + le16_to_cpu(del->length) - 4) = cpu_to_le32(down); } else goto endm; @@ -668,9 +668,9 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno) memcpy(de_cp, de_prev, le16_to_cpu(de_prev->length)); hpfs_delete_de(i->i_sb, dnode, de_prev); if (!de_prev->down) { - de_prev->length = cpu_to_le16(le16_to_cpu(de_prev->length) + 4); + le16_add_cpu(&de_prev->length, 4); de_prev->down = 1; - dnode->first_free = cpu_to_le32(le32_to_cpu(dnode->first_free) + 4); + le32_add_cpu(&dnode->first_free, 4); } *(__le32 *) ((void *) de_prev + le16_to_cpu(de_prev->length) - 4) = cpu_to_le32(ndown); hpfs_mark_4buffers_dirty(&qbh); diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index ac1ead1..7102aae 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -63,8 +63,8 @@ struct hpfs_sb_info { unsigned sb_dmap; /* sector number of dnode bit map */ unsigned sb_n_free; /* free blocks for statfs, or -1 */ unsigned sb_n_free_dnodes; /* free dnodes for statfs, or -1 */ - uid_t sb_uid; /* uid from mount options */ - gid_t sb_gid; /* gid from mount options */ + kuid_t sb_uid; /* uid from mount options */ + kgid_t sb_gid; /* gid from mount options */ umode_t sb_mode; /* mode from mount options */ unsigned sb_eas : 2; /* eas: 0-ignore, 1-ro, 2-rw */ unsigned sb_err : 2; /* on errs: 0-cont, 1-ro, 2-panic */ diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index ed671e0..804a9a8 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -7,6 +7,7 @@ */ #include <linux/slab.h> +#include <linux/user_namespace.h> #include "hpfs_fn.h" void hpfs_init_inode(struct inode *i) @@ -60,14 +61,14 @@ void hpfs_read_inode(struct inode *i) if (hpfs_sb(i->i_sb)->sb_eas) { if ((ea = hpfs_get_ea(i->i_sb, fnode, "UID", &ea_size))) { if (ea_size == 2) { - i->i_uid = le16_to_cpu(*(__le16*)ea); + i_uid_write(i, le16_to_cpu(*(__le16*)ea)); hpfs_inode->i_ea_uid = 1; } kfree(ea); } if ((ea = hpfs_get_ea(i->i_sb, fnode, "GID", &ea_size))) { if (ea_size == 2) { - i->i_gid = le16_to_cpu(*(__le16*)ea); + i_gid_write(i, le16_to_cpu(*(__le16*)ea)); hpfs_inode->i_ea_gid = 1; } kfree(ea); @@ -149,13 +150,13 @@ static void hpfs_write_inode_ea(struct inode *i, struct fnode *fnode) hpfs_error(i->i_sb, "fnode %08x has some unknown HPFS386 stuctures", i->i_ino); } else*/ if (hpfs_sb(i->i_sb)->sb_eas >= 2) { __le32 ea; - if ((i->i_uid != hpfs_sb(i->i_sb)->sb_uid) || hpfs_inode->i_ea_uid) { - ea = cpu_to_le32(i->i_uid); + if (!uid_eq(i->i_uid, hpfs_sb(i->i_sb)->sb_uid) || hpfs_inode->i_ea_uid) { + ea = cpu_to_le32(i_uid_read(i)); hpfs_set_ea(i, fnode, "UID", (char*)&ea, 2); hpfs_inode->i_ea_uid = 1; } - if ((i->i_gid != hpfs_sb(i->i_sb)->sb_gid) || hpfs_inode->i_ea_gid) { - ea = cpu_to_le32(i->i_gid); + if (!gid_eq(i->i_gid, hpfs_sb(i->i_sb)->sb_gid) || hpfs_inode->i_ea_gid) { + ea = cpu_to_le32(i_gid_read(i)); hpfs_set_ea(i, fnode, "GID", (char *)&ea, 2); hpfs_inode->i_ea_gid = 1; } @@ -261,9 +262,11 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr) hpfs_lock(inode->i_sb); if (inode->i_ino == hpfs_sb(inode->i_sb)->sb_root) goto out_unlock; - if ((attr->ia_valid & ATTR_UID) && attr->ia_uid >= 0x10000) + if ((attr->ia_valid & ATTR_UID) && + from_kuid(&init_user_ns, attr->ia_uid) >= 0x10000) goto out_unlock; - if ((attr->ia_valid & ATTR_GID) && attr->ia_gid >= 0x10000) + if ((attr->ia_valid & ATTR_GID) && + from_kgid(&init_user_ns, attr->ia_gid) >= 0x10000) goto out_unlock; if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) goto out_unlock; diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index bc90824..345713d 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -91,8 +91,8 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inc_nlink(dir); insert_inode_hash(result); - if (result->i_uid != current_fsuid() || - result->i_gid != current_fsgid() || + if (!uid_eq(result->i_uid, current_fsuid()) || + !gid_eq(result->i_gid, current_fsgid()) || result->i_mode != (mode | S_IFDIR)) { result->i_uid = current_fsuid(); result->i_gid = current_fsgid(); @@ -179,8 +179,8 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, b insert_inode_hash(result); - if (result->i_uid != current_fsuid() || - result->i_gid != current_fsgid() || + if (!uid_eq(result->i_uid, current_fsuid()) || + !gid_eq(result->i_gid, current_fsgid()) || result->i_mode != (mode | S_IFREG)) { result->i_uid = current_fsuid(); result->i_gid = current_fsgid(); diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 706a12c..bc28bf07 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -210,6 +210,11 @@ static int init_inodecache(void) static void destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(hpfs_inode_cachep); } @@ -251,7 +256,7 @@ static const match_table_t tokens = { {Opt_err, NULL}, }; -static int parse_opts(char *opts, uid_t *uid, gid_t *gid, umode_t *umask, +static int parse_opts(char *opts, kuid_t *uid, kgid_t *gid, umode_t *umask, int *lowercase, int *eas, int *chk, int *errs, int *chkdsk, int *timeshift) { @@ -276,12 +281,16 @@ static int parse_opts(char *opts, uid_t *uid, gid_t *gid, umode_t *umask, case Opt_uid: if (match_int(args, &option)) return 0; - *uid = option; + *uid = make_kuid(current_user_ns(), option); + if (!uid_valid(*uid)) + return 0; break; case Opt_gid: if (match_int(args, &option)) return 0; - *gid = option; + *gid = make_kgid(current_user_ns(), option); + if (!gid_valid(*gid)) + return 0; break; case Opt_umask: if (match_octal(args, &option)) @@ -378,8 +387,8 @@ HPFS filesystem options:\n\ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) { - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; umode_t umask; int lowercase, eas, chk, errs, chkdsk, timeshift; int o; @@ -455,8 +464,8 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) struct hpfs_sb_info *sbi; struct inode *root; - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; umode_t umask; int lowercase, eas, chk, errs, chkdsk, timeshift; diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index c1dffe4..78f21f8 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -18,7 +18,7 @@ #include <linux/pid_namespace.h> #include <linux/namei.h> #include <asm/uaccess.h> -#include "os.h" +#include <os.h> static struct inode *get_inode(struct super_block *, struct dentry *); @@ -674,7 +674,7 @@ static struct inode *get_inode(struct super_block *sb, struct dentry *dentry) if (!inode) { dput(dentry); - return ERR_PTR(-ENOMEM); + return NULL; } if (S_ISDIR(dentry->d_inode->i_mode)) { diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 8349a89..c5bc355 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -42,8 +42,8 @@ static const struct inode_operations hugetlbfs_dir_inode_operations; static const struct inode_operations hugetlbfs_inode_operations; struct hugetlbfs_config { - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; umode_t mode; long nr_blocks; long nr_inodes; @@ -110,7 +110,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) * way when do_mmap_pgoff unwinds (may be important on powerpc * and ia64). */ - vma->vm_flags |= VM_HUGETLB | VM_RESERVED; + vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &hugetlb_vm_ops; if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) @@ -397,17 +397,16 @@ static void hugetlbfs_evict_inode(struct inode *inode) } static inline void -hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff) +hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff) { struct vm_area_struct *vma; - struct prio_tree_iter iter; - vma_prio_tree_foreach(vma, &iter, root, pgoff, ULONG_MAX) { + vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) { unsigned long v_offset; /* * Can the expression below overflow on 32-bit arches? - * No, because the prio_tree returns us only those vmas + * No, because the interval tree returns us only those vmas * which overlap the truncated area starting at pgoff, * and no vma on a 32-bit arch can span beyond the 4GB. */ @@ -432,7 +431,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) i_size_write(inode, offset); mutex_lock(&mapping->i_mmap_mutex); - if (!prio_tree_empty(&mapping->i_mmap)) + if (!RB_EMPTY_ROOT(&mapping->i_mmap)) hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); mutex_unlock(&mapping->i_mmap_mutex); truncate_hugepages(inode, offset); @@ -785,13 +784,17 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) case Opt_uid: if (match_int(&args[0], &option)) goto bad_val; - pconfig->uid = option; + pconfig->uid = make_kuid(current_user_ns(), option); + if (!uid_valid(pconfig->uid)) + goto bad_val; break; case Opt_gid: if (match_int(&args[0], &option)) goto bad_val; - pconfig->gid = option; + pconfig->gid = make_kgid(current_user_ns(), option); + if (!gid_valid(pconfig->gid)) + goto bad_val; break; case Opt_mode: @@ -924,7 +927,9 @@ static struct vfsmount *hugetlbfs_vfsmount; static int can_do_hugetlb_shm(void) { - return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group); + kgid_t shm_group; + shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group); + return capable(CAP_IPC_LOCK) || in_group_p(shm_group); } struct file *hugetlb_file_setup(const char *name, unsigned long addr, @@ -1042,6 +1047,11 @@ static int __init init_hugetlbfs_fs(void) static void __exit exit_hugetlbfs_fs(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(hugetlbfs_inode_cachep); kern_unmount(hugetlbfs_vfsmount); unregister_filesystem(&hugetlbfs_fs_type); @@ -348,7 +348,7 @@ void address_space_init_once(struct address_space *mapping) mutex_init(&mapping->i_mmap_mutex); INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); - INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); + mapping->i_mmap = RB_ROOT; INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); } EXPORT_SYMBOL(address_space_init_once); @@ -603,21 +603,14 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { - struct file *filp; - int error = -EBADF; - int fput_needed; - - filp = fget_light(fd, &fput_needed); - if (!filp) - goto out; - - error = security_file_ioctl(filp, cmd, arg); - if (error) - goto out_fput; - - error = do_vfs_ioctl(filp, fd, cmd, arg); - out_fput: - fput_light(filp, fput_needed); - out: + int error; + struct fd f = fdget(fd); + + if (!f.file) + return -EBADF; + error = security_file_ioctl(f.file, cmd, arg); + if (!error) + error = do_vfs_ioctl(f.file, fd, cmd, arg); + fdput(f); return error; } diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 29037c3..67ce52507 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -21,6 +21,7 @@ #include <linux/cdrom.h> #include <linux/parser.h> #include <linux/mpage.h> +#include <linux/user_namespace.h> #include "isofs.h" #include "zisofs.h" @@ -114,6 +115,11 @@ static int init_inodecache(void) static void destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(isofs_inode_cachep); } @@ -171,8 +177,8 @@ struct iso9660_options{ unsigned int blocksize; umode_t fmode; umode_t dmode; - gid_t gid; - uid_t uid; + kgid_t gid; + kuid_t uid; char *iocharset; /* LVE */ s32 session; @@ -383,8 +389,8 @@ static int parse_options(char *options, struct iso9660_options *popt) popt->fmode = popt->dmode = ISOFS_INVALID_MODE; popt->uid_set = 0; popt->gid_set = 0; - popt->gid = 0; - popt->uid = 0; + popt->gid = GLOBAL_ROOT_GID; + popt->uid = GLOBAL_ROOT_UID; popt->iocharset = NULL; popt->utf8 = 0; popt->overriderockperm = 0; @@ -460,13 +466,17 @@ static int parse_options(char *options, struct iso9660_options *popt) case Opt_uid: if (match_int(&args[0], &option)) return 0; - popt->uid = option; + popt->uid = make_kuid(current_user_ns(), option); + if (!uid_valid(popt->uid)) + return 0; popt->uid_set = 1; break; case Opt_gid: if (match_int(&args[0], &option)) return 0; - popt->gid = option; + popt->gid = make_kgid(current_user_ns(), option); + if (!gid_valid(popt->gid)) + return 0; popt->gid_set = 1; break; case Opt_mode: diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h index 3620ad1..9916723 100644 --- a/fs/isofs/isofs.h +++ b/fs/isofs/isofs.h @@ -52,8 +52,8 @@ struct isofs_sb_info { umode_t s_fmode; umode_t s_dmode; - gid_t s_gid; - uid_t s_uid; + kgid_t s_gid; + kuid_t s_uid; struct nls_table *s_nls_iocharset; /* Native language support table */ }; diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index 70e79d0..c0bf424 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -364,8 +364,8 @@ repeat: case SIG('P', 'X'): inode->i_mode = isonum_733(rr->u.PX.mode); set_nlink(inode, isonum_733(rr->u.PX.n_links)); - inode->i_uid = isonum_733(rr->u.PX.uid); - inode->i_gid = isonum_733(rr->u.PX.gid); + i_uid_write(inode, isonum_733(rr->u.PX.uid)); + i_gid_write(inode, isonum_733(rr->u.PX.gid)); break; case SIG('P', 'N'): { diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 52c15c7..86b39b1 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -86,7 +86,12 @@ nope: static void release_data_buffer(struct buffer_head *bh) { if (buffer_freed(bh)) { + WARN_ON_ONCE(buffer_dirty(bh)); clear_buffer_freed(bh); + clear_buffer_mapped(bh); + clear_buffer_new(bh); + clear_buffer_req(bh); + bh->b_bdev = NULL; release_buffer_page(bh); } else put_bh(bh); @@ -866,17 +871,35 @@ restart_loop: * there's no point in keeping a checkpoint record for * it. */ - /* A buffer which has been freed while still being - * journaled by a previous transaction may end up still - * being dirty here, but we want to avoid writing back - * that buffer in the future after the "add to orphan" - * operation been committed, That's not only a performance - * gain, it also stops aliasing problems if the buffer is - * left behind for writeback and gets reallocated for another - * use in a different page. */ - if (buffer_freed(bh) && !jh->b_next_transaction) { - clear_buffer_freed(bh); - clear_buffer_jbddirty(bh); + /* + * A buffer which has been freed while still being journaled by + * a previous transaction. + */ + if (buffer_freed(bh)) { + /* + * If the running transaction is the one containing + * "add to orphan" operation (b_next_transaction != + * NULL), we have to wait for that transaction to + * commit before we can really get rid of the buffer. + * So just clear b_modified to not confuse transaction + * credit accounting and refile the buffer to + * BJ_Forget of the running transaction. If the just + * committed transaction contains "add to orphan" + * operation, we can completely invalidate the buffer + * now. We are rather throughout in that since the + * buffer may be still accessible when blocksize < + * pagesize and it is attached to the last partial + * page. + */ + jh->b_modified = 0; + if (!jh->b_next_transaction) { + clear_buffer_freed(bh); + clear_buffer_jbddirty(bh); + clear_buffer_mapped(bh); + clear_buffer_new(bh); + clear_buffer_req(bh); + bh->b_bdev = NULL; + } } if (buffer_jbddirty(bh)) { diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index febc10d..78b7f84 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -1843,15 +1843,16 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) * We're outside-transaction here. Either or both of j_running_transaction * and j_committing_transaction may be NULL. */ -static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) +static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, + int partial_page) { transaction_t *transaction; struct journal_head *jh; int may_free = 1; - int ret; BUFFER_TRACE(bh, "entry"); +retry: /* * It is safe to proceed here without the j_list_lock because the * buffers cannot be stolen by try_to_free_buffers as long as we are @@ -1879,10 +1880,18 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) * clear the buffer dirty bit at latest at the moment when the * transaction marking the buffer as freed in the filesystem * structures is committed because from that moment on the - * buffer can be reallocated and used by a different page. + * block can be reallocated and used by a different page. * Since the block hasn't been freed yet but the inode has * already been added to orphan list, it is safe for us to add * the buffer to BJ_Forget list of the newest transaction. + * + * Also we have to clear buffer_mapped flag of a truncated buffer + * because the buffer_head may be attached to the page straddling + * i_size (can happen only when blocksize < pagesize) and thus the + * buffer_head can be reused when the file is extended again. So we end + * up keeping around invalidated buffers attached to transactions' + * BJ_Forget list just to stop checkpointing code from cleaning up + * the transaction this buffer was modified in. */ transaction = jh->b_transaction; if (transaction == NULL) { @@ -1909,13 +1918,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) * committed, the buffer won't be needed any * longer. */ JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); - ret = __dispose_buffer(jh, + may_free = __dispose_buffer(jh, journal->j_running_transaction); - journal_put_journal_head(jh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); - return ret; + goto zap_buffer; } else { /* There is no currently-running transaction. So the * orphan record which we wrote for this file must have @@ -1923,13 +1928,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) * the committing transaction, if it exists. */ if (journal->j_committing_transaction) { JBUFFER_TRACE(jh, "give to committing trans"); - ret = __dispose_buffer(jh, + may_free = __dispose_buffer(jh, journal->j_committing_transaction); - journal_put_journal_head(jh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); - return ret; + goto zap_buffer; } else { /* The orphan record's transaction has * committed. We can cleanse this buffer */ @@ -1950,10 +1951,24 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) } /* * The buffer is committing, we simply cannot touch - * it. So we just set j_next_transaction to the - * running transaction (if there is one) and mark - * buffer as freed so that commit code knows it should - * clear dirty bits when it is done with the buffer. + * it. If the page is straddling i_size we have to wait + * for commit and try again. + */ + if (partial_page) { + tid_t tid = journal->j_committing_transaction->t_tid; + + journal_put_journal_head(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + spin_unlock(&journal->j_state_lock); + log_wait_commit(journal, tid); + goto retry; + } + /* + * OK, buffer won't be reachable after truncate. We just set + * j_next_transaction to the running transaction (if there is + * one) and mark buffer as freed so that commit code knows it + * should clear dirty bits when it is done with the buffer. */ set_buffer_freed(bh); if (journal->j_running_transaction && buffer_jbddirty(bh)) @@ -1976,6 +1991,14 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) } zap_buffer: + /* + * This is tricky. Although the buffer is truncated, it may be reused + * if blocksize < pagesize and it is attached to the page straddling + * EOF. Since the buffer might have been added to BJ_Forget list of the + * running transaction, journal_get_write_access() won't clear + * b_modified and credit accounting gets confused. So clear b_modified + * here. */ + jh->b_modified = 0; journal_put_journal_head(jh); zap_buffer_no_jh: spin_unlock(&journal->j_list_lock); @@ -2024,7 +2047,8 @@ void journal_invalidatepage(journal_t *journal, if (offset <= curr_off) { /* This block is wholly outside the truncation point */ lock_buffer(bh); - may_free &= journal_unmap_buffer(journal, bh); + may_free &= journal_unmap_buffer(journal, bh, + offset > 0); unlock_buffer(bh); } curr_off = next_off; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index af5280f..3091d42 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -1014,17 +1014,35 @@ restart_loop: * there's no point in keeping a checkpoint record for * it. */ - /* A buffer which has been freed while still being - * journaled by a previous transaction may end up still - * being dirty here, but we want to avoid writing back - * that buffer in the future after the "add to orphan" - * operation been committed, That's not only a performance - * gain, it also stops aliasing problems if the buffer is - * left behind for writeback and gets reallocated for another - * use in a different page. */ - if (buffer_freed(bh) && !jh->b_next_transaction) { - clear_buffer_freed(bh); - clear_buffer_jbddirty(bh); + /* + * A buffer which has been freed while still being journaled by + * a previous transaction. + */ + if (buffer_freed(bh)) { + /* + * If the running transaction is the one containing + * "add to orphan" operation (b_next_transaction != + * NULL), we have to wait for that transaction to + * commit before we can really get rid of the buffer. + * So just clear b_modified to not confuse transaction + * credit accounting and refile the buffer to + * BJ_Forget of the running transaction. If the just + * committed transaction contains "add to orphan" + * operation, we can completely invalidate the buffer + * now. We are rather through in that since the + * buffer may be still accessible when blocksize < + * pagesize and it is attached to the last partial + * page. + */ + jh->b_modified = 0; + if (!jh->b_next_transaction) { + clear_buffer_freed(bh); + clear_buffer_jbddirty(bh); + clear_buffer_mapped(bh); + clear_buffer_new(bh); + clear_buffer_req(bh); + bh->b_bdev = NULL; + } } if (buffer_jbddirty(bh)) { diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index e149b99..484b8d1 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1354,6 +1354,11 @@ static void jbd2_mark_journal_empty(journal_t *journal) BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); read_lock(&journal->j_state_lock); + /* Is it already empty? */ + if (sb->s_start == 0) { + read_unlock(&journal->j_state_lock); + return; + } jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n", journal->j_tail_sequence); diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 0131e43..626846b 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -289,8 +289,11 @@ int jbd2_journal_recover(journal_t *journal) if (!err) err = err2; /* Make sure all replayed data is on permanent storage */ - if (journal->j_flags & JBD2_BARRIER) - blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); + if (journal->j_flags & JBD2_BARRIER) { + err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); + if (!err) + err = err2; + } return err; } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index fb1ab953..a74ba46 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1841,15 +1841,16 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) * We're outside-transaction here. Either or both of j_running_transaction * and j_committing_transaction may be NULL. */ -static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) +static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, + int partial_page) { transaction_t *transaction; struct journal_head *jh; int may_free = 1; - int ret; BUFFER_TRACE(bh, "entry"); +retry: /* * It is safe to proceed here without the j_list_lock because the * buffers cannot be stolen by try_to_free_buffers as long as we are @@ -1878,10 +1879,18 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) * clear the buffer dirty bit at latest at the moment when the * transaction marking the buffer as freed in the filesystem * structures is committed because from that moment on the - * buffer can be reallocated and used by a different page. + * block can be reallocated and used by a different page. * Since the block hasn't been freed yet but the inode has * already been added to orphan list, it is safe for us to add * the buffer to BJ_Forget list of the newest transaction. + * + * Also we have to clear buffer_mapped flag of a truncated buffer + * because the buffer_head may be attached to the page straddling + * i_size (can happen only when blocksize < pagesize) and thus the + * buffer_head can be reused when the file is extended again. So we end + * up keeping around invalidated buffers attached to transactions' + * BJ_Forget list just to stop checkpointing code from cleaning up + * the transaction this buffer was modified in. */ transaction = jh->b_transaction; if (transaction == NULL) { @@ -1908,13 +1917,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) * committed, the buffer won't be needed any * longer. */ JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); - ret = __dispose_buffer(jh, + may_free = __dispose_buffer(jh, journal->j_running_transaction); - jbd2_journal_put_journal_head(jh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - write_unlock(&journal->j_state_lock); - return ret; + goto zap_buffer; } else { /* There is no currently-running transaction. So the * orphan record which we wrote for this file must have @@ -1922,13 +1927,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) * the committing transaction, if it exists. */ if (journal->j_committing_transaction) { JBUFFER_TRACE(jh, "give to committing trans"); - ret = __dispose_buffer(jh, + may_free = __dispose_buffer(jh, journal->j_committing_transaction); - jbd2_journal_put_journal_head(jh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - write_unlock(&journal->j_state_lock); - return ret; + goto zap_buffer; } else { /* The orphan record's transaction has * committed. We can cleanse this buffer */ @@ -1940,10 +1941,24 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) JBUFFER_TRACE(jh, "on committing transaction"); /* * The buffer is committing, we simply cannot touch - * it. So we just set j_next_transaction to the - * running transaction (if there is one) and mark - * buffer as freed so that commit code knows it should - * clear dirty bits when it is done with the buffer. + * it. If the page is straddling i_size we have to wait + * for commit and try again. + */ + if (partial_page) { + tid_t tid = journal->j_committing_transaction->t_tid; + + jbd2_journal_put_journal_head(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + write_unlock(&journal->j_state_lock); + jbd2_log_wait_commit(journal, tid); + goto retry; + } + /* + * OK, buffer won't be reachable after truncate. We just set + * j_next_transaction to the running transaction (if there is + * one) and mark buffer as freed so that commit code knows it + * should clear dirty bits when it is done with the buffer. */ set_buffer_freed(bh); if (journal->j_running_transaction && buffer_jbddirty(bh)) @@ -1966,6 +1981,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) } zap_buffer: + /* + * This is tricky. Although the buffer is truncated, it may be reused + * if blocksize < pagesize and it is attached to the page straddling + * EOF. Since the buffer might have been added to BJ_Forget list of the + * running transaction, journal_get_write_access() won't clear + * b_modified and credit accounting gets confused. So clear b_modified + * here. + */ + jh->b_modified = 0; jbd2_journal_put_journal_head(jh); zap_buffer_no_jh: spin_unlock(&journal->j_list_lock); @@ -2017,7 +2041,8 @@ void jbd2_journal_invalidatepage(journal_t *journal, if (offset <= curr_off) { /* This block is wholly outside the truncation point */ lock_buffer(bh); - may_free &= journal_unmap_buffer(journal, bh); + may_free &= journal_unmap_buffer(journal, bh, + offset > 0); unlock_buffer(bh); } curr_off = next_off; diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 922f146..223283c 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -94,15 +94,23 @@ static struct posix_acl *jffs2_acl_from_medium(void *value, size_t size) case ACL_MASK: case ACL_OTHER: value += sizeof(struct jffs2_acl_entry_short); - acl->a_entries[i].e_id = ACL_UNDEFINED_ID; break; case ACL_USER: + value += sizeof(struct jffs2_acl_entry); + if (value > end) + goto fail; + acl->a_entries[i].e_uid = + make_kuid(&init_user_ns, + je32_to_cpu(entry->e_id)); + break; case ACL_GROUP: value += sizeof(struct jffs2_acl_entry); if (value > end) goto fail; - acl->a_entries[i].e_id = je32_to_cpu(entry->e_id); + acl->a_entries[i].e_gid = + make_kgid(&init_user_ns, + je32_to_cpu(entry->e_id)); break; default: @@ -131,13 +139,19 @@ static void *jffs2_acl_to_medium(const struct posix_acl *acl, size_t *size) header->a_version = cpu_to_je32(JFFS2_ACL_VERSION); e = header + 1; for (i=0; i < acl->a_count; i++) { + const struct posix_acl_entry *acl_e = &acl->a_entries[i]; entry = e; - entry->e_tag = cpu_to_je16(acl->a_entries[i].e_tag); - entry->e_perm = cpu_to_je16(acl->a_entries[i].e_perm); - switch(acl->a_entries[i].e_tag) { + entry->e_tag = cpu_to_je16(acl_e->e_tag); + entry->e_perm = cpu_to_je16(acl_e->e_perm); + switch(acl_e->e_tag) { case ACL_USER: + entry->e_id = cpu_to_je32( + from_kuid(&init_user_ns, acl_e->e_uid)); + e += sizeof(struct jffs2_acl_entry); + break; case ACL_GROUP: - entry->e_id = cpu_to_je32(acl->a_entries[i].e_id); + entry->e_id = cpu_to_je32( + from_kgid(&init_user_ns, acl_e->e_gid)); e += sizeof(struct jffs2_acl_entry); break; @@ -363,7 +377,7 @@ static int jffs2_acl_getxattr(struct dentry *dentry, const char *name, return PTR_ERR(acl); if (!acl) return -ENODATA; - rc = posix_acl_to_xattr(acl, buffer, size); + rc = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); posix_acl_release(acl); return rc; @@ -381,7 +395,7 @@ static int jffs2_acl_setxattr(struct dentry *dentry, const char *name, return -EPERM; if (value) { - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index db3889b..60ef3fb 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -175,8 +175,8 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, ri.ino = cpu_to_je32(f->inocache->ino); ri.version = cpu_to_je32(++f->highest_version); ri.mode = cpu_to_jemode(inode->i_mode); - ri.uid = cpu_to_je16(inode->i_uid); - ri.gid = cpu_to_je16(inode->i_gid); + ri.uid = cpu_to_je16(i_uid_read(inode)); + ri.gid = cpu_to_je16(i_gid_read(inode)); ri.isize = cpu_to_je32(max((uint32_t)inode->i_size, pageofs)); ri.atime = ri.ctime = ri.mtime = cpu_to_je32(get_seconds()); ri.offset = cpu_to_je32(inode->i_size); @@ -283,8 +283,8 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, /* Set the fields that the generic jffs2_write_inode_range() code can't find */ ri->ino = cpu_to_je32(inode->i_ino); ri->mode = cpu_to_jemode(inode->i_mode); - ri->uid = cpu_to_je16(inode->i_uid); - ri->gid = cpu_to_je16(inode->i_gid); + ri->uid = cpu_to_je16(i_uid_read(inode)); + ri->gid = cpu_to_je16(i_gid_read(inode)); ri->isize = cpu_to_je32((uint32_t)inode->i_size); ri->atime = ri->ctime = ri->mtime = cpu_to_je32(get_seconds()); diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 3d3092e..fe3c052 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -99,8 +99,10 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) ri->ino = cpu_to_je32(inode->i_ino); ri->version = cpu_to_je32(++f->highest_version); - ri->uid = cpu_to_je16((ivalid & ATTR_UID)?iattr->ia_uid:inode->i_uid); - ri->gid = cpu_to_je16((ivalid & ATTR_GID)?iattr->ia_gid:inode->i_gid); + ri->uid = cpu_to_je16((ivalid & ATTR_UID)? + from_kuid(&init_user_ns, iattr->ia_uid):i_uid_read(inode)); + ri->gid = cpu_to_je16((ivalid & ATTR_GID)? + from_kgid(&init_user_ns, iattr->ia_gid):i_gid_read(inode)); if (ivalid & ATTR_MODE) ri->mode = cpu_to_jemode(iattr->ia_mode); @@ -147,8 +149,8 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) inode->i_ctime = ITIME(je32_to_cpu(ri->ctime)); inode->i_mtime = ITIME(je32_to_cpu(ri->mtime)); inode->i_mode = jemode_to_cpu(ri->mode); - inode->i_uid = je16_to_cpu(ri->uid); - inode->i_gid = je16_to_cpu(ri->gid); + i_uid_write(inode, je16_to_cpu(ri->uid)); + i_gid_write(inode, je16_to_cpu(ri->gid)); old_metadata = f->metadata; @@ -276,8 +278,8 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) return ERR_PTR(ret); } inode->i_mode = jemode_to_cpu(latest_node.mode); - inode->i_uid = je16_to_cpu(latest_node.uid); - inode->i_gid = je16_to_cpu(latest_node.gid); + i_uid_write(inode, je16_to_cpu(latest_node.uid)); + i_gid_write(inode, je16_to_cpu(latest_node.gid)); inode->i_size = je32_to_cpu(latest_node.isize); inode->i_atime = ITIME(je32_to_cpu(latest_node.atime)); inode->i_mtime = ITIME(je32_to_cpu(latest_node.mtime)); @@ -440,14 +442,14 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r memset(ri, 0, sizeof(*ri)); /* Set OS-specific defaults for new inodes */ - ri->uid = cpu_to_je16(current_fsuid()); + ri->uid = cpu_to_je16(from_kuid(&init_user_ns, current_fsuid())); if (dir_i->i_mode & S_ISGID) { - ri->gid = cpu_to_je16(dir_i->i_gid); + ri->gid = cpu_to_je16(i_gid_read(dir_i)); if (S_ISDIR(mode)) mode |= S_ISGID; } else { - ri->gid = cpu_to_je16(current_fsgid()); + ri->gid = cpu_to_je16(from_kgid(&init_user_ns, current_fsgid())); } /* POSIX ACLs have to be processed now, at least partly. @@ -467,8 +469,8 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r set_nlink(inode, 1); inode->i_ino = je32_to_cpu(ri->ino); inode->i_mode = jemode_to_cpu(ri->mode); - inode->i_gid = je16_to_cpu(ri->gid); - inode->i_uid = je16_to_cpu(ri->uid); + i_gid_write(inode, je16_to_cpu(ri->gid)); + i_uid_write(inode, je16_to_cpu(ri->uid)); inode->i_atime = inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; ri->atime = ri->mtime = ri->ctime = cpu_to_je32(I_SEC(inode->i_mtime)); diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index bcd983d..d200a9b 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -27,8 +27,8 @@ struct kvec; #define JFFS2_F_I_SIZE(f) (OFNI_EDONI_2SFFJ(f)->i_size) #define JFFS2_F_I_MODE(f) (OFNI_EDONI_2SFFJ(f)->i_mode) -#define JFFS2_F_I_UID(f) (OFNI_EDONI_2SFFJ(f)->i_uid) -#define JFFS2_F_I_GID(f) (OFNI_EDONI_2SFFJ(f)->i_gid) +#define JFFS2_F_I_UID(f) (i_uid_read(OFNI_EDONI_2SFFJ(f))) +#define JFFS2_F_I_GID(f) (i_gid_read(OFNI_EDONI_2SFFJ(f))) #define JFFS2_F_I_RDEV(f) (OFNI_EDONI_2SFFJ(f)->i_rdev) #define ITIME(sec) ((struct timespec){sec, 0}) diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 1ea349f..ae81b01 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -394,8 +394,11 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c, } /* Trivial function to remove the last node in the tree. Which by definition - has no right-hand -- so can be removed just by making its only child (if - any) take its place under its parent. */ + has no right-hand child — so can be removed just by making its left-hand + child (if any) take its place under its parent. Since this is only done + when we're consuming the whole tree, there's no need to use rb_erase() + and let it worry about adjusting colours and balancing the tree. That + would just be a waste of time. */ static void eat_last(struct rb_root *root, struct rb_node *node) { struct rb_node *parent = rb_parent(node); @@ -412,12 +415,12 @@ static void eat_last(struct rb_root *root, struct rb_node *node) link = &parent->rb_right; *link = node->rb_left; - /* Colour doesn't matter now. Only the parent pointer. */ if (node->rb_left) - node->rb_left->rb_parent_color = node->rb_parent_color; + node->rb_left->__rb_parent_color = node->__rb_parent_color; } -/* We put this in reverse order, so we can just use eat_last */ +/* We put the version tree in reverse order, so we can use the same eat_last() + function that we use to consume the tmpnode tree (tn_root). */ static void ver_insert(struct rb_root *ver_root, struct jffs2_tmp_dnode_info *tn) { struct rb_node **link = &ver_root->rb_node; diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 61ea413..d3d8799 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -100,6 +100,10 @@ static int jffs2_sync_fs(struct super_block *sb, int wait) { struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER + cancel_delayed_work_sync(&c->wbuf_dwork); +#endif + mutex_lock(&c->alloc_sem); jffs2_flush_wbuf_pad(c); mutex_unlock(&c->alloc_sem); @@ -418,6 +422,12 @@ static void __exit exit_jffs2_fs(void) unregister_filesystem(&jffs2_fs_type); jffs2_destroy_slab_caches(); jffs2_compressors_exit(); + + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(jffs2_inode_cachep); } diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index 6f4529d..a6597d6 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -1044,10 +1044,10 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c, ops.datbuf = NULL; ret = mtd_read_oob(c->mtd, jeb->offset, &ops); - if (ret || ops.oobretlen != ops.ooblen) { + if ((ret && !mtd_is_bitflip(ret)) || ops.oobretlen != ops.ooblen) { pr_err("cannot read OOB for EB at %08x, requested %zd bytes, read %zd bytes, error %d\n", jeb->offset, ops.ooblen, ops.oobretlen, ret); - if (!ret) + if (!ret || mtd_is_bitflip(ret)) ret = -EIO; return ret; } @@ -1086,10 +1086,10 @@ int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c, ops.datbuf = NULL; ret = mtd_read_oob(c->mtd, jeb->offset, &ops); - if (ret || ops.oobretlen != ops.ooblen) { + if ((ret && !mtd_is_bitflip(ret)) || ops.oobretlen != ops.ooblen) { pr_err("cannot read OOB for EB at %08x, requested %zd bytes, read %zd bytes, error %d\n", jeb->offset, ops.ooblen, ops.oobretlen, ret); - if (!ret) + if (!ret || mtd_is_bitflip(ret)) ret = -EIO; return ret; } diff --git a/fs/jfs/Makefile b/fs/jfs/Makefile index a58fa72..d20d473 100644 --- a/fs/jfs/Makefile +++ b/fs/jfs/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_JFS_FS) += jfs.o jfs-y := super.o file.o inode.o namei.o jfs_mount.o jfs_umount.o \ jfs_xtree.o jfs_imap.o jfs_debug.o jfs_dmap.o \ - jfs_unicode.o jfs_dtree.o jfs_inode.o \ + jfs_unicode.o jfs_dtree.o jfs_inode.o jfs_discard.o \ jfs_extent.o symlink.o jfs_metapage.o \ jfs_logmgr.o jfs_txnmgr.o jfs_uniupr.o \ resize.o xattr.o ioctl.o diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 45559dc..d254d6d 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -64,7 +64,7 @@ struct posix_acl *jfs_get_acl(struct inode *inode, int type) else acl = ERR_PTR(size); } else { - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); } kfree(value); if (!IS_ERR(acl)) @@ -100,7 +100,7 @@ static int jfs_set_acl(tid_t tid, struct inode *inode, int type, value = kmalloc(size, GFP_KERNEL); if (!value) return -ENOMEM; - rc = posix_acl_to_xattr(acl, value, size); + rc = posix_acl_to_xattr(&init_user_ns, acl, value, size); if (rc < 0) goto out; } diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 844f946..9d3afd1 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -108,8 +108,8 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr) if (is_quota_modification(inode, iattr)) dquot_initialize(inode); - if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || - (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { + if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) || + (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) { rc = dquot_transfer(inode, iattr); if (rc) return rc; diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index f19d1e0..bc555ff 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -11,13 +11,17 @@ #include <linux/mount.h> #include <linux/time.h> #include <linux/sched.h> +#include <linux/blkdev.h> #include <asm/current.h> #include <asm/uaccess.h> +#include "jfs_filsys.h" +#include "jfs_debug.h" #include "jfs_incore.h" #include "jfs_dinode.h" #include "jfs_inode.h" - +#include "jfs_dmap.h" +#include "jfs_discard.h" static struct { long jfs_flag; @@ -123,6 +127,40 @@ setflags_out: mnt_drop_write_file(filp); return err; } + + case FITRIM: + { + struct super_block *sb = inode->i_sb; + struct request_queue *q = bdev_get_queue(sb->s_bdev); + struct fstrim_range range; + s64 ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!blk_queue_discard(q)) { + jfs_warn("FITRIM not supported on device"); + return -EOPNOTSUPP; + } + + if (copy_from_user(&range, (struct fstrim_range __user *)arg, + sizeof(range))) + return -EFAULT; + + range.minlen = max_t(unsigned int, range.minlen, + q->limits.discard_granularity); + + ret = jfs_ioc_trim(inode, &range); + if (ret < 0) + return ret; + + if (copy_to_user((struct fstrim_range __user *)arg, &range, + sizeof(range))) + return -EFAULT; + + return 0; + } + default: return -ENOTTY; } @@ -142,6 +180,9 @@ long jfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case JFS_IOC_SETFLAGS32: cmd = JFS_IOC_SETFLAGS; break; + case FITRIM: + cmd = FITRIM; + break; } return jfs_ioctl(filp, cmd, arg); } diff --git a/fs/jfs/jfs_discard.c b/fs/jfs/jfs_discard.c new file mode 100644 index 0000000..9947563 --- /dev/null +++ b/fs/jfs/jfs_discard.c @@ -0,0 +1,117 @@ +/* + * Copyright (C) Tino Reichardt, 2012 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/blkdev.h> + +#include "jfs_incore.h" +#include "jfs_superblock.h" +#include "jfs_discard.h" +#include "jfs_dmap.h" +#include "jfs_debug.h" + + +/* + * NAME: jfs_issue_discard() + * + * FUNCTION: TRIM the specified block range on device, if supported + * + * PARAMETERS: + * ip - pointer to in-core inode + * blkno - starting block number to be trimmed (0..N) + * nblocks - number of blocks to be trimmed + * + * RETURN VALUES: + * none + * + * serialization: IREAD_LOCK(ipbmap) held on entry/exit; + */ +void jfs_issue_discard(struct inode *ip, u64 blkno, u64 nblocks) +{ + struct super_block *sb = ip->i_sb; + int r = 0; + + r = sb_issue_discard(sb, blkno, nblocks, GFP_NOFS, 0); + if (unlikely(r != 0)) { + jfs_err("JFS: sb_issue_discard" \ + "(%p, %llu, %llu, GFP_NOFS, 0) = %d => failed!\n", + sb, (unsigned long long)blkno, + (unsigned long long)nblocks, r); + } + + jfs_info("JFS: sb_issue_discard" \ + "(%p, %llu, %llu, GFP_NOFS, 0) = %d\n", + sb, (unsigned long long)blkno, + (unsigned long long)nblocks, r); + + return; +} + +/* + * NAME: jfs_ioc_trim() + * + * FUNCTION: attempt to discard (TRIM) all free blocks from the + * filesystem. + * + * PARAMETERS: + * ip - pointer to in-core inode; + * range - the range, given by user space + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + */ +int jfs_ioc_trim(struct inode *ip, struct fstrim_range *range) +{ + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; + struct super_block *sb = ipbmap->i_sb; + int agno, agno_end; + s64 start, end, minlen; + u64 trimmed = 0; + + /** + * convert byte values to block size of filesystem: + * start: First Byte to trim + * len: number of Bytes to trim from start + * minlen: minimum extent length in Bytes + */ + start = range->start >> sb->s_blocksize_bits; + if (start < 0) + start = 0; + end = start + (range->len >> sb->s_blocksize_bits) - 1; + if (end >= bmp->db_mapsize) + end = bmp->db_mapsize - 1; + minlen = range->minlen >> sb->s_blocksize_bits; + if (minlen <= 0) + minlen = 1; + + /** + * we trim all ag's within the range + */ + agno = BLKTOAG(start, JFS_SBI(ip->i_sb)); + agno_end = BLKTOAG(end, JFS_SBI(ip->i_sb)); + while (agno <= agno_end) { + trimmed += dbDiscardAG(ip, agno, minlen); + agno++; + } + range->len = trimmed << sb->s_blocksize_bits; + + return 0; +} diff --git a/fs/jfs/jfs_discard.h b/fs/jfs/jfs_discard.h new file mode 100644 index 0000000..40d1ee6 --- /dev/null +++ b/fs/jfs/jfs_discard.h @@ -0,0 +1,26 @@ +/* + * Copyright (C) Tino Reichardt, 2012 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_DISCARD +#define _H_JFS_DISCARD + +struct fstrim_range; + +extern void jfs_issue_discard(struct inode *ip, u64 blkno, u64 nblocks); +extern int jfs_ioc_trim(struct inode *ip, struct fstrim_range *range); + +#endif /* _H_JFS_DISCARD */ diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 9cbd11a..9a55f53 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -1,5 +1,6 @@ /* * Copyright (C) International Business Machines Corp., 2000-2004 + * Portions Copyright (C) Tino Reichardt, 2012 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -25,6 +26,7 @@ #include "jfs_lock.h" #include "jfs_metapage.h" #include "jfs_debug.h" +#include "jfs_discard.h" /* * SERIALIZATION of the Block Allocation Map. @@ -104,7 +106,6 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, int nblocks); static int dbMaxBud(u8 * cp); -s64 dbMapFileSizeToMapSize(struct inode *ipbmap); static int blkstol2(s64 nb); static int cntlz(u32 value); @@ -145,7 +146,6 @@ static const s8 budtab[256] = { 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, -1 }; - /* * NAME: dbMount() * @@ -310,7 +310,6 @@ int dbSync(struct inode *ipbmap) return (0); } - /* * NAME: dbFree() * @@ -337,6 +336,7 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks) s64 lblkno, rem; struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; + struct super_block *sb = ipbmap->i_sb; IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); @@ -351,6 +351,13 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks) return -EIO; } + /** + * TRIM the blocks, when mounted with discard option + */ + if (JFS_SBI(sb)->flag & JFS_DISCARD) + if (JFS_SBI(sb)->minblks_trim <= nblocks) + jfs_issue_discard(ipbmap, blkno, nblocks); + /* * free the blocks a dmap at a time. */ @@ -1095,7 +1102,6 @@ static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks) /* we were not successful */ release_metapage(mp); - return (rc); } @@ -1590,6 +1596,118 @@ static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results) /* + * NAME: dbDiscardAG() + * + * FUNCTION: attempt to discard (TRIM) all free blocks of specific AG + * + * algorithm: + * 1) allocate blocks, as large as possible and save them + * while holding IWRITE_LOCK on ipbmap + * 2) trim all these saved block/length values + * 3) mark the blocks free again + * + * benefit: + * - we work only on one ag at some time, minimizing how long we + * need to lock ipbmap + * - reading / writing the fs is possible most time, even on + * trimming + * + * downside: + * - we write two times to the dmapctl and dmap pages + * - but for me, this seems the best way, better ideas? + * /TR 2012 + * + * PARAMETERS: + * ip - pointer to in-core inode + * agno - ag to trim + * minlen - minimum value of contiguous blocks + * + * RETURN VALUES: + * s64 - actual number of blocks trimmed + */ +s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen) +{ + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; + s64 nblocks, blkno; + u64 trimmed = 0; + int rc, l2nb; + struct super_block *sb = ipbmap->i_sb; + + struct range2trim { + u64 blkno; + u64 nblocks; + } *totrim, *tt; + + /* max blkno / nblocks pairs to trim */ + int count = 0, range_cnt; + u64 max_ranges; + + /* prevent others from writing new stuff here, while trimming */ + IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP); + + nblocks = bmp->db_agfree[agno]; + max_ranges = nblocks; + do_div(max_ranges, minlen); + range_cnt = min_t(u64, max_ranges + 1, 32 * 1024); + totrim = kmalloc(sizeof(struct range2trim) * range_cnt, GFP_NOFS); + if (totrim == NULL) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbDiscardAG: no memory for trim array"); + IWRITE_UNLOCK(ipbmap); + return 0; + } + + tt = totrim; + while (nblocks >= minlen) { + l2nb = BLKSTOL2(nblocks); + + /* 0 = okay, -EIO = fatal, -ENOSPC -> try smaller block */ + rc = dbAllocAG(bmp, agno, nblocks, l2nb, &blkno); + if (rc == 0) { + tt->blkno = blkno; + tt->nblocks = nblocks; + tt++; count++; + + /* the whole ag is free, trim now */ + if (bmp->db_agfree[agno] == 0) + break; + + /* give a hint for the next while */ + nblocks = bmp->db_agfree[agno]; + continue; + } else if (rc == -ENOSPC) { + /* search for next smaller log2 block */ + l2nb = BLKSTOL2(nblocks) - 1; + nblocks = 1 << l2nb; + } else { + /* Trim any already allocated blocks */ + jfs_error(bmp->db_ipbmap->i_sb, + "dbDiscardAG: -EIO"); + break; + } + + /* check, if our trim array is full */ + if (unlikely(count >= range_cnt - 1)) + break; + } + IWRITE_UNLOCK(ipbmap); + + tt->nblocks = 0; /* mark the current end */ + for (tt = totrim; tt->nblocks != 0; tt++) { + /* when mounted with online discard, dbFree() will + * call jfs_issue_discard() itself */ + if (!(JFS_SBI(sb)->flag & JFS_DISCARD)) + jfs_issue_discard(ip, tt->blkno, tt->nblocks); + dbFree(ip, tt->blkno, tt->nblocks); + trimmed += tt->nblocks; + } + kfree(totrim); + + return trimmed; +} + +/* * NAME: dbFindCtl() * * FUNCTION: starting at a specified dmap control page level and block diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h index 6dcb906..562b9a7e 100644 --- a/fs/jfs/jfs_dmap.h +++ b/fs/jfs/jfs_dmap.h @@ -311,4 +311,6 @@ extern int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks); extern int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks); extern void dbFinalizeBmap(struct inode *ipbmap); extern s64 dbMapFileSizeToMapSize(struct inode *ipbmap); +extern s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen); + #endif /* _H_JFS_DMAP */ diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h index b3f5463..b67d646 100644 --- a/fs/jfs/jfs_filsys.h +++ b/fs/jfs/jfs_filsys.h @@ -45,6 +45,9 @@ /* mount time flag to disable journaling to disk */ #define JFS_NOINTEGRITY 0x00000040 +/* mount time flag to enable TRIM to ssd disks */ +#define JFS_DISCARD 0x00000080 + /* commit option */ #define JFS_COMMIT 0x00000f00 /* commit option mask */ #define JFS_GROUPCOMMIT 0x00000100 /* group (of 1) commit */ diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 1b6f15f..6ba4006 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -3078,15 +3078,15 @@ static int copy_from_dinode(struct dinode * dip, struct inode *ip) } set_nlink(ip, le32_to_cpu(dip->di_nlink)); - jfs_ip->saved_uid = le32_to_cpu(dip->di_uid); - if (sbi->uid == -1) + jfs_ip->saved_uid = make_kuid(&init_user_ns, le32_to_cpu(dip->di_uid)); + if (!uid_valid(sbi->uid)) ip->i_uid = jfs_ip->saved_uid; else { ip->i_uid = sbi->uid; } - jfs_ip->saved_gid = le32_to_cpu(dip->di_gid); - if (sbi->gid == -1) + jfs_ip->saved_gid = make_kgid(&init_user_ns, le32_to_cpu(dip->di_gid)); + if (!gid_valid(sbi->gid)) ip->i_gid = jfs_ip->saved_gid; else { ip->i_gid = sbi->gid; @@ -3150,14 +3150,16 @@ static void copy_to_dinode(struct dinode * dip, struct inode *ip) dip->di_size = cpu_to_le64(ip->i_size); dip->di_nblocks = cpu_to_le64(PBLK2LBLK(ip->i_sb, ip->i_blocks)); dip->di_nlink = cpu_to_le32(ip->i_nlink); - if (sbi->uid == -1) - dip->di_uid = cpu_to_le32(ip->i_uid); + if (!uid_valid(sbi->uid)) + dip->di_uid = cpu_to_le32(i_uid_read(ip)); else - dip->di_uid = cpu_to_le32(jfs_ip->saved_uid); - if (sbi->gid == -1) - dip->di_gid = cpu_to_le32(ip->i_gid); + dip->di_uid =cpu_to_le32(from_kuid(&init_user_ns, + jfs_ip->saved_uid)); + if (!gid_valid(sbi->gid)) + dip->di_gid = cpu_to_le32(i_gid_read(ip)); else - dip->di_gid = cpu_to_le32(jfs_ip->saved_gid); + dip->di_gid = cpu_to_le32(from_kgid(&init_user_ns, + jfs_ip->saved_gid)); jfs_get_inode_flags(jfs_ip); /* * mode2 is only needed for storing the higher order bits. diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h index 584a4a1..cf47f09 100644 --- a/fs/jfs/jfs_incore.h +++ b/fs/jfs/jfs_incore.h @@ -38,8 +38,8 @@ struct jfs_inode_info { int fileset; /* fileset number (always 16)*/ uint mode2; /* jfs-specific mode */ - uint saved_uid; /* saved for uid mount option */ - uint saved_gid; /* saved for gid mount option */ + kuid_t saved_uid; /* saved for uid mount option */ + kgid_t saved_gid; /* saved for gid mount option */ pxd_t ixpxd; /* inode extent descriptor */ dxd_t acl; /* dxd describing acl */ dxd_t ea; /* dxd describing ea */ @@ -192,9 +192,10 @@ struct jfs_sb_info { uint state; /* mount/recovery state */ unsigned long flag; /* mount time flags */ uint p_state; /* state prior to going no integrity */ - uint uid; /* uid to override on-disk uid */ - uint gid; /* gid to override on-disk gid */ + kuid_t uid; /* uid to override on-disk uid */ + kgid_t gid; /* gid to override on-disk gid */ uint umask; /* umask to override on-disk umask */ + uint minblks_trim; /* minimum blocks, for online trim */ }; /* jfs_sb_info commit_state */ diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index bb8b661..5fcc02e 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -2977,12 +2977,9 @@ int jfs_sync(void *arg) * put back on the anon_list. */ - /* Take off anon_list */ - list_del(&jfs_ip->anon_inode_list); - - /* Put on anon_list2 */ - list_add(&jfs_ip->anon_inode_list, - &TxAnchor.anon_list2); + /* Move from anon_list to anon_list2 */ + list_move(&jfs_ip->anon_inode_list, + &TxAnchor.anon_list2); TXN_UNLOCK(); iput(ip); diff --git a/fs/jfs/super.c b/fs/jfs/super.c index c55c745..1a543be 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -33,6 +33,7 @@ #include <linux/slab.h> #include <asm/uaccess.h> #include <linux/seq_file.h> +#include <linux/blkdev.h> #include "jfs_incore.h" #include "jfs_filsys.h" @@ -100,7 +101,7 @@ void jfs_error(struct super_block *sb, const char * function, ...) vsnprintf(error_buf, sizeof(error_buf), function, args); va_end(args); - printk(KERN_ERR "ERROR: (device %s): %s\n", sb->s_id, error_buf); + pr_err("ERROR: (device %s): %s\n", sb->s_id, error_buf); jfs_handle_error(sb); } @@ -197,7 +198,8 @@ static void jfs_put_super(struct super_block *sb) enum { Opt_integrity, Opt_nointegrity, Opt_iocharset, Opt_resize, Opt_resize_nosize, Opt_errors, Opt_ignore, Opt_err, Opt_quota, - Opt_usrquota, Opt_grpquota, Opt_uid, Opt_gid, Opt_umask + Opt_usrquota, Opt_grpquota, Opt_uid, Opt_gid, Opt_umask, + Opt_discard, Opt_nodiscard, Opt_discard_minblk }; static const match_table_t tokens = { @@ -214,6 +216,9 @@ static const match_table_t tokens = { {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, {Opt_umask, "umask=%u"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, + {Opt_discard_minblk, "discard=%u"}, {Opt_err, NULL} }; @@ -255,8 +260,7 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, else { nls_map = load_nls(args[0].from); if (!nls_map) { - printk(KERN_ERR - "JFS: charset not found\n"); + pr_err("JFS: charset not found\n"); goto cleanup; } } @@ -272,8 +276,7 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, *newLVSize = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; if (*newLVSize == 0) - printk(KERN_ERR - "JFS: Cannot determine volume size\n"); + pr_err("JFS: Cannot determine volume size\n"); break; } case Opt_errors: @@ -294,8 +297,7 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, *flag &= ~JFS_ERR_REMOUNT_RO; *flag |= JFS_ERR_PANIC; } else { - printk(KERN_ERR - "JFS: %s is an invalid error handler\n", + pr_err("JFS: %s is an invalid error handler\n", errors); goto cleanup; } @@ -314,33 +316,76 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, case Opt_usrquota: case Opt_grpquota: case Opt_quota: - printk(KERN_ERR - "JFS: quota operations not supported\n"); + pr_err("JFS: quota operations not supported\n"); break; #endif case Opt_uid: { char *uid = args[0].from; - sbi->uid = simple_strtoul(uid, &uid, 0); + uid_t val = simple_strtoul(uid, &uid, 0); + sbi->uid = make_kuid(current_user_ns(), val); + if (!uid_valid(sbi->uid)) + goto cleanup; break; } + case Opt_gid: { char *gid = args[0].from; - sbi->gid = simple_strtoul(gid, &gid, 0); + gid_t val = simple_strtoul(gid, &gid, 0); + sbi->gid = make_kgid(current_user_ns(), val); + if (!gid_valid(sbi->gid)) + goto cleanup; break; } + case Opt_umask: { char *umask = args[0].from; sbi->umask = simple_strtoul(umask, &umask, 8); if (sbi->umask & ~0777) { - printk(KERN_ERR - "JFS: Invalid value of umask\n"); + pr_err("JFS: Invalid value of umask\n"); goto cleanup; } break; } + + case Opt_discard: + { + struct request_queue *q = bdev_get_queue(sb->s_bdev); + /* if set to 1, even copying files will cause + * trimming :O + * -> user has more control over the online trimming + */ + sbi->minblks_trim = 64; + if (blk_queue_discard(q)) { + *flag |= JFS_DISCARD; + } else { + pr_err("JFS: discard option " \ + "not supported on device\n"); + } + break; + } + + case Opt_nodiscard: + *flag &= ~JFS_DISCARD; + break; + + case Opt_discard_minblk: + { + struct request_queue *q = bdev_get_queue(sb->s_bdev); + char *minblks_trim = args[0].from; + if (blk_queue_discard(q)) { + *flag |= JFS_DISCARD; + sbi->minblks_trim = simple_strtoull( + minblks_trim, &minblks_trim, 0); + } else { + pr_err("JFS: discard option " \ + "not supported on device\n"); + } + break; + } + default: printk("jfs: Unrecognized mount option \"%s\" " " or missing value\n", p); @@ -374,8 +419,8 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) if (newLVSize) { if (sb->s_flags & MS_RDONLY) { - printk(KERN_ERR - "JFS: resize requires volume to be mounted read-write\n"); + pr_err("JFS: resize requires volume" \ + " to be mounted read-write\n"); return -EROFS; } rc = jfs_extendfs(sb, newLVSize, 0); @@ -443,7 +488,9 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_fs_info = sbi; sb->s_max_links = JFS_LINK_MAX; sbi->sb = sb; - sbi->uid = sbi->gid = sbi->umask = -1; + sbi->uid = INVALID_UID; + sbi->gid = INVALID_GID; + sbi->umask = -1; /* initialize the mount flag and determine the default error handler */ flag = JFS_ERR_REMOUNT_RO; @@ -457,7 +504,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) #endif if (newLVSize) { - printk(KERN_ERR "resize option for remount only\n"); + pr_err("resize option for remount only\n"); goto out_kfree; } @@ -617,14 +664,16 @@ static int jfs_show_options(struct seq_file *seq, struct dentry *root) { struct jfs_sb_info *sbi = JFS_SBI(root->d_sb); - if (sbi->uid != -1) - seq_printf(seq, ",uid=%d", sbi->uid); - if (sbi->gid != -1) - seq_printf(seq, ",gid=%d", sbi->gid); + if (uid_valid(sbi->uid)) + seq_printf(seq, ",uid=%d", from_kuid(&init_user_ns, sbi->uid)); + if (gid_valid(sbi->gid)) + seq_printf(seq, ",gid=%d", from_kgid(&init_user_ns, sbi->gid)); if (sbi->umask != -1) seq_printf(seq, ",umask=%03o", sbi->umask); if (sbi->flag & JFS_NOINTEGRITY) seq_puts(seq, ",nointegrity"); + if (sbi->flag & JFS_DISCARD) + seq_printf(seq, ",discard=%u", sbi->minblks_trim); if (sbi->nls_tab) seq_printf(seq, ",iocharset=%s", sbi->nls_tab->charset); if (sbi->flag & JFS_ERR_CONTINUE) @@ -903,6 +952,12 @@ static void __exit exit_jfs_fs(void) jfs_proc_clean(); #endif unregister_filesystem(&jfs_fs_type); + + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(jfs_inode_cachep); } diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 26683e1..42d67f9 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -685,7 +685,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name, * POSIX_ACL_XATTR_ACCESS is tied to i_mode */ if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) { - acl = posix_acl_from_xattr(value, value_len); + acl = posix_acl_from_xattr(&init_user_ns, value, value_len); if (IS_ERR(acl)) { rc = PTR_ERR(acl); printk(KERN_ERR "posix_acl_from_xattr returned %d\n", @@ -710,7 +710,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name, return 0; } else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) { - acl = posix_acl_from_xattr(value, value_len); + acl = posix_acl_from_xattr(&init_user_ns, value, value_len); if (IS_ERR(acl)) { rc = PTR_ERR(acl); printk(KERN_ERR "posix_acl_from_xattr returned %d\n", @@ -874,7 +874,7 @@ struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid, EXPORT_SYMBOL_GPL(generic_fh_to_dentry); /** - * generic_fh_to_dentry - generic helper for the fh_to_parent export operation + * generic_fh_to_parent - generic helper for the fh_to_parent export operation * @sb: filesystem to do the file handle conversion on * @fid: file handle to convert * @fh_len: length of the file handle in bytes diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 7ef14b3..e4fb3ba 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -7,7 +7,6 @@ */ #include <linux/types.h> -#include <linux/utsname.h> #include <linux/kernel.h> #include <linux/ktime.h> #include <linux/slab.h> @@ -19,6 +18,8 @@ #include <asm/unaligned.h> +#include "netns.h" + #define NLMDBG_FACILITY NLMDBG_MONITOR #define NSM_PROGRAM 100024 #define NSM_VERSION 1 @@ -40,6 +41,7 @@ struct nsm_args { u32 proc; char *mon_name; + char *nodename; }; struct nsm_res { @@ -70,7 +72,7 @@ static struct rpc_clnt *nsm_create(struct net *net) }; struct rpc_create_args args = { .net = net, - .protocol = XPRT_TRANSPORT_UDP, + .protocol = XPRT_TRANSPORT_TCP, .address = (struct sockaddr *)&sin, .addrsize = sizeof(sin), .servername = "rpc.statd", @@ -83,10 +85,54 @@ static struct rpc_clnt *nsm_create(struct net *net) return rpc_create(&args); } -static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res, - struct net *net) +static struct rpc_clnt *nsm_client_get(struct net *net) { + static DEFINE_MUTEX(nsm_create_mutex); struct rpc_clnt *clnt; + struct lockd_net *ln = net_generic(net, lockd_net_id); + + spin_lock(&ln->nsm_clnt_lock); + if (ln->nsm_users) { + ln->nsm_users++; + clnt = ln->nsm_clnt; + spin_unlock(&ln->nsm_clnt_lock); + goto out; + } + spin_unlock(&ln->nsm_clnt_lock); + + mutex_lock(&nsm_create_mutex); + clnt = nsm_create(net); + if (!IS_ERR(clnt)) { + ln->nsm_clnt = clnt; + smp_wmb(); + ln->nsm_users = 1; + } + mutex_unlock(&nsm_create_mutex); +out: + return clnt; +} + +static void nsm_client_put(struct net *net) +{ + struct lockd_net *ln = net_generic(net, lockd_net_id); + struct rpc_clnt *clnt = ln->nsm_clnt; + int shutdown = 0; + + spin_lock(&ln->nsm_clnt_lock); + if (ln->nsm_users) { + if (--ln->nsm_users) + ln->nsm_clnt = NULL; + shutdown = !ln->nsm_users; + } + spin_unlock(&ln->nsm_clnt_lock); + + if (shutdown) + rpc_shutdown_client(clnt); +} + +static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res, + struct rpc_clnt *clnt) +{ int status; struct nsm_args args = { .priv = &nsm->sm_priv, @@ -94,31 +140,24 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res, .vers = 3, .proc = NLMPROC_NSM_NOTIFY, .mon_name = nsm->sm_mon_name, + .nodename = clnt->cl_nodename, }; struct rpc_message msg = { .rpc_argp = &args, .rpc_resp = res, }; - clnt = nsm_create(net); - if (IS_ERR(clnt)) { - status = PTR_ERR(clnt); - dprintk("lockd: failed to create NSM upcall transport, " - "status=%d\n", status); - goto out; - } + BUG_ON(clnt == NULL); memset(res, 0, sizeof(*res)); msg.rpc_proc = &clnt->cl_procinfo[proc]; - status = rpc_call_sync(clnt, &msg, 0); + status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN); if (status < 0) dprintk("lockd: NSM upcall RPC failed, status=%d\n", status); else status = 0; - rpc_shutdown_client(clnt); - out: return status; } @@ -138,6 +177,7 @@ int nsm_monitor(const struct nlm_host *host) struct nsm_handle *nsm = host->h_nsmhandle; struct nsm_res res; int status; + struct rpc_clnt *clnt; dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name); @@ -150,7 +190,15 @@ int nsm_monitor(const struct nlm_host *host) */ nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf; - status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, host->net); + clnt = nsm_client_get(host->net); + if (IS_ERR(clnt)) { + status = PTR_ERR(clnt); + dprintk("lockd: failed to create NSM upcall transport, " + "status=%d, net=%p\n", status, host->net); + return status; + } + + status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, clnt); if (unlikely(res.status != 0)) status = -EIO; if (unlikely(status < 0)) { @@ -182,9 +230,11 @@ void nsm_unmonitor(const struct nlm_host *host) if (atomic_read(&nsm->sm_count) == 1 && nsm->sm_monitored && !nsm->sm_sticky) { + struct lockd_net *ln = net_generic(host->net, lockd_net_id); + dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name); - status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, host->net); + status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, ln->nsm_clnt); if (res.status != 0) status = -EIO; if (status < 0) @@ -192,6 +242,8 @@ void nsm_unmonitor(const struct nlm_host *host) nsm->sm_name); else nsm->sm_monitored = 0; + + nsm_client_put(host->net); } } @@ -430,7 +482,7 @@ static void encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp) { __be32 *p; - encode_nsm_string(xdr, utsname()->nodename); + encode_nsm_string(xdr, argp->nodename); p = xdr_reserve_space(xdr, 4 + 4 + 4); *p++ = cpu_to_be32(argp->prog); *p++ = cpu_to_be32(argp->vers); diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h index 4eee248..5010b55 100644 --- a/fs/lockd/netns.h +++ b/fs/lockd/netns.h @@ -12,6 +12,10 @@ struct lockd_net { struct delayed_work grace_period_end; struct lock_manager lockd_manager; struct list_head grace_list; + + spinlock_t nsm_clnt_lock; + unsigned int nsm_users; + struct rpc_clnt *nsm_clnt; }; extern int lockd_net_id; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 31a63f8..7e35587 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -596,6 +596,7 @@ static int lockd_init_net(struct net *net) INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender); INIT_LIST_HEAD(&ln->grace_list); + spin_lock_init(&ln->nsm_clnt_lock); return 0; } diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index fb1a2be..8d80c99 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -289,7 +289,6 @@ static void nlmsvc_free_block(struct kref *kref) dprintk("lockd: freeing block %p...\n", block); /* Remove block from file's list of blocks */ - mutex_lock(&file->f_mutex); list_del_init(&block->b_flist); mutex_unlock(&file->f_mutex); @@ -303,7 +302,7 @@ static void nlmsvc_free_block(struct kref *kref) static void nlmsvc_release_block(struct nlm_block *block) { if (block != NULL) - kref_put(&block->b_count, nlmsvc_free_block); + kref_put_mutex(&block->b_count, nlmsvc_free_block, &block->b_file->f_mutex); } /* @@ -1625,15 +1625,13 @@ EXPORT_SYMBOL(flock_lock_file_wait); */ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) { - struct file *filp; - int fput_needed; + struct fd f = fdget(fd); struct file_lock *lock; int can_sleep, unlock; int error; error = -EBADF; - filp = fget_light(fd, &fput_needed); - if (!filp) + if (!f.file) goto out; can_sleep = !(cmd & LOCK_NB); @@ -1641,31 +1639,31 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) unlock = (cmd == LOCK_UN); if (!unlock && !(cmd & LOCK_MAND) && - !(filp->f_mode & (FMODE_READ|FMODE_WRITE))) + !(f.file->f_mode & (FMODE_READ|FMODE_WRITE))) goto out_putf; - error = flock_make_lock(filp, &lock, cmd); + error = flock_make_lock(f.file, &lock, cmd); if (error) goto out_putf; if (can_sleep) lock->fl_flags |= FL_SLEEP; - error = security_file_lock(filp, lock->fl_type); + error = security_file_lock(f.file, lock->fl_type); if (error) goto out_free; - if (filp->f_op && filp->f_op->flock) - error = filp->f_op->flock(filp, + if (f.file->f_op && f.file->f_op->flock) + error = f.file->f_op->flock(f.file, (can_sleep) ? F_SETLKW : F_SETLK, lock); else - error = flock_lock_file_wait(filp, lock); + error = flock_lock_file_wait(f.file, lock); out_free: locks_free_lock(lock); out_putf: - fput_light(filp, fput_needed); + fdput(f); out: return error; } diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c index 6984562..adb9011 100644 --- a/fs/logfs/inode.c +++ b/fs/logfs/inode.c @@ -208,8 +208,8 @@ static void logfs_init_inode(struct super_block *sb, struct inode *inode) li->li_height = 0; li->li_used_bytes = 0; li->li_block = NULL; - inode->i_uid = 0; - inode->i_gid = 0; + i_uid_write(inode, 0); + i_gid_write(inode, 0); inode->i_size = 0; inode->i_blocks = 0; inode->i_ctime = CURRENT_TIME; @@ -417,5 +417,10 @@ int logfs_init_inode_cache(void) void logfs_destroy_inode_cache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(logfs_inode_cache); } diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index 5be0abe..e1a3b6b 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -119,8 +119,8 @@ static void logfs_disk_to_inode(struct logfs_disk_inode *di, struct inode*inode) inode->i_mode = be16_to_cpu(di->di_mode); li->li_height = di->di_height; li->li_flags = be32_to_cpu(di->di_flags); - inode->i_uid = be32_to_cpu(di->di_uid); - inode->i_gid = be32_to_cpu(di->di_gid); + i_uid_write(inode, be32_to_cpu(di->di_uid)); + i_gid_write(inode, be32_to_cpu(di->di_gid)); inode->i_size = be64_to_cpu(di->di_size); logfs_set_blocks(inode, be64_to_cpu(di->di_used_bytes)); inode->i_atime = be64_to_timespec(di->di_atime); @@ -156,8 +156,8 @@ static void logfs_inode_to_disk(struct inode *inode, struct logfs_disk_inode*di) di->di_height = li->li_height; di->di_pad = 0; di->di_flags = cpu_to_be32(li->li_flags); - di->di_uid = cpu_to_be32(inode->i_uid); - di->di_gid = cpu_to_be32(inode->i_gid); + di->di_uid = cpu_to_be32(i_uid_read(inode)); + di->di_gid = cpu_to_be32(i_gid_read(inode)); di->di_size = cpu_to_be64(i_size_read(inode)); di->di_used_bytes = cpu_to_be64(li->li_used_bytes); di->di_atime = timespec_to_be64(inode->i_atime); diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 2a503ad..4fc5f8a 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -100,6 +100,11 @@ static int init_inodecache(void) static void destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(minix_inode_cachep); } @@ -460,8 +465,8 @@ static struct inode *V1_minix_iget(struct inode *inode) return ERR_PTR(-EIO); } inode->i_mode = raw_inode->i_mode; - inode->i_uid = (uid_t)raw_inode->i_uid; - inode->i_gid = (gid_t)raw_inode->i_gid; + i_uid_write(inode, raw_inode->i_uid); + i_gid_write(inode, raw_inode->i_gid); set_nlink(inode, raw_inode->i_nlinks); inode->i_size = raw_inode->i_size; inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = raw_inode->i_time; @@ -493,8 +498,8 @@ static struct inode *V2_minix_iget(struct inode *inode) return ERR_PTR(-EIO); } inode->i_mode = raw_inode->i_mode; - inode->i_uid = (uid_t)raw_inode->i_uid; - inode->i_gid = (gid_t)raw_inode->i_gid; + i_uid_write(inode, raw_inode->i_uid); + i_gid_write(inode, raw_inode->i_gid); set_nlink(inode, raw_inode->i_nlinks); inode->i_size = raw_inode->i_size; inode->i_mtime.tv_sec = raw_inode->i_mtime; @@ -545,8 +550,8 @@ static struct buffer_head * V1_minix_update_inode(struct inode * inode) if (!raw_inode) return NULL; raw_inode->i_mode = inode->i_mode; - raw_inode->i_uid = fs_high2lowuid(inode->i_uid); - raw_inode->i_gid = fs_high2lowgid(inode->i_gid); + raw_inode->i_uid = fs_high2lowuid(i_uid_read(inode)); + raw_inode->i_gid = fs_high2lowgid(i_gid_read(inode)); raw_inode->i_nlinks = inode->i_nlink; raw_inode->i_size = inode->i_size; raw_inode->i_time = inode->i_mtime.tv_sec; @@ -572,8 +577,8 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode) if (!raw_inode) return NULL; raw_inode->i_mode = inode->i_mode; - raw_inode->i_uid = fs_high2lowuid(inode->i_uid); - raw_inode->i_gid = fs_high2lowgid(inode->i_gid); + raw_inode->i_uid = fs_high2lowuid(i_uid_read(inode)); + raw_inode->i_gid = fs_high2lowgid(i_gid_read(inode)); raw_inode->i_nlinks = inode->i_nlink; raw_inode->i_size = inode->i_size; raw_inode->i_mtime = inode->i_mtime.tv_sec; @@ -680,7 +680,7 @@ static inline int may_follow_link(struct path *link, struct nameidata *nd) /* Allowed if owner and follower match. */ inode = link->dentry->d_inode; - if (current_cred()->fsuid == inode->i_uid) + if (uid_eq(current_cred()->fsuid, inode->i_uid)) return 0; /* Allowed if parent directory not sticky and world-writable. */ @@ -689,7 +689,7 @@ static inline int may_follow_link(struct path *link, struct nameidata *nd) return 0; /* Allowed if parent directory and link owner match. */ - if (parent->i_uid == inode->i_uid) + if (uid_eq(parent->i_uid, inode->i_uid)) return 0; path_put_conditional(link, nd); @@ -759,7 +759,7 @@ static int may_linkat(struct path *link) /* Source inode owner (or CAP_FOWNER) can hardlink all they like, * otherwise, it must be a safe source. */ - if (cred->fsuid == inode->i_uid || safe_hardlink_source(inode) || + if (uid_eq(cred->fsuid, inode->i_uid) || safe_hardlink_source(inode) || capable(CAP_FOWNER)) return 0; @@ -1797,8 +1797,6 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd, struct file **fp) { int retval = 0; - int fput_needed; - struct file *file; nd->last_type = LAST_ROOT; /* if there are only slashes... */ nd->flags = flags | LOOKUP_JUMPED; @@ -1850,44 +1848,41 @@ static int path_init(int dfd, const char *name, unsigned int flags, get_fs_pwd(current->fs, &nd->path); } } else { + struct fd f = fdget_raw(dfd); struct dentry *dentry; - file = fget_raw_light(dfd, &fput_needed); - retval = -EBADF; - if (!file) - goto out_fail; + if (!f.file) + return -EBADF; - dentry = file->f_path.dentry; + dentry = f.file->f_path.dentry; if (*name) { - retval = -ENOTDIR; - if (!S_ISDIR(dentry->d_inode->i_mode)) - goto fput_fail; + if (!S_ISDIR(dentry->d_inode->i_mode)) { + fdput(f); + return -ENOTDIR; + } retval = inode_permission(dentry->d_inode, MAY_EXEC); - if (retval) - goto fput_fail; + if (retval) { + fdput(f); + return retval; + } } - nd->path = file->f_path; + nd->path = f.file->f_path; if (flags & LOOKUP_RCU) { - if (fput_needed) - *fp = file; + if (f.need_put) + *fp = f.file; nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); lock_rcu_walk(); } else { - path_get(&file->f_path); - fput_light(file, fput_needed); + path_get(&nd->path); + fdput(f); } } nd->inode = nd->path.dentry->d_inode; return 0; - -fput_fail: - fput_light(file, fput_needed); -out_fail: - return retval; } static inline int lookup_last(struct nameidata *nd, struct path *path) @@ -3971,7 +3966,7 @@ EXPORT_SYMBOL(user_path_at); EXPORT_SYMBOL(follow_down_one); EXPORT_SYMBOL(follow_down); EXPORT_SYMBOL(follow_up); -EXPORT_SYMBOL(get_write_access); /* binfmt_aout */ +EXPORT_SYMBOL(get_write_access); /* nfsd */ EXPORT_SYMBOL(getname); EXPORT_SYMBOL(lock_rename); EXPORT_SYMBOL(lookup_one_len); diff --git a/fs/namespace.c b/fs/namespace.c index 4d31f73..7bdf790 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1886,8 +1886,14 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) return err; err = -EINVAL; - if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(real_mount(path->mnt))) - goto unlock; + if (unlikely(!check_mnt(real_mount(path->mnt)))) { + /* that's acceptable only for automounts done in private ns */ + if (!(mnt_flags & MNT_SHRINKABLE)) + goto unlock; + /* ... and for those we'd better have mountpoint still alive */ + if (!real_mount(path->mnt)->mnt_ns) + goto unlock; + } /* Refuse the same filesystem on the same mount point */ err = -EBUSY; diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 333df07..d7e9fe7 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -89,6 +89,11 @@ static int init_inodecache(void) static void destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(ncp_inode_cachep); } @@ -314,11 +319,11 @@ static void ncp_stop_tasks(struct ncp_server *server) { release_sock(sk); del_timer_sync(&server->timeout_tm); - flush_work_sync(&server->rcv.tq); + flush_work(&server->rcv.tq); if (sk->sk_socket->type == SOCK_STREAM) - flush_work_sync(&server->tx.tq); + flush_work(&server->tx.tq); else - flush_work_sync(&server->timeout_tq); + flush_work(&server->timeout_tq); } static int ncp_show_options(struct seq_file *seq, struct dentry *root) diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index db7ad719..13ca196 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -95,8 +95,8 @@ config NFS_SWAP This option enables swapon to work on files located on NFS mounts. config NFS_V4_1 - bool "NFS client support for NFSv4.1 (EXPERIMENTAL)" - depends on NFS_V4 && EXPERIMENTAL + bool "NFS client support for NFSv4.1" + depends on NFS_V4 select SUNRPC_BACKCHANNEL help This option enables support for minor version 1 of the NFSv4 protocol diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index dd392ed..f1027b0 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -37,6 +37,7 @@ #include <linux/bio.h> /* struct bio */ #include <linux/buffer_head.h> /* various write calls */ #include <linux/prefetch.h> +#include <linux/pagevec.h> #include "../pnfs.h" #include "../internal.h" @@ -162,25 +163,39 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect, return bio; } -static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw, +static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, struct page *page, struct pnfs_block_extent *be, void (*end_io)(struct bio *, int err), - struct parallel_io *par) + struct parallel_io *par, + unsigned int offset, int len) { + isect = isect + (offset >> SECTOR_SHIFT); + dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__, + npg, rw, (unsigned long long)isect, offset, len); retry: if (!bio) { bio = bl_alloc_init_bio(npg, isect, be, end_io, par); if (!bio) return ERR_PTR(-ENOMEM); } - if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { + if (bio_add_page(bio, page, len, offset) < len) { bio = bl_submit_bio(rw, bio); goto retry; } return bio; } +static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw, + sector_t isect, struct page *page, + struct pnfs_block_extent *be, + void (*end_io)(struct bio *, int err), + struct parallel_io *par) +{ + return do_add_page_to_bio(bio, npg, rw, isect, page, be, + end_io, par, 0, PAGE_CACHE_SIZE); +} + /* This is basically copied from mpage_end_io_read */ static void bl_end_io_read(struct bio *bio, int err) { @@ -228,14 +243,6 @@ bl_end_par_io_read(void *data, int unused) schedule_work(&rdata->task.u.tk_work); } -static bool -bl_check_alignment(u64 offset, u32 len, unsigned long blkmask) -{ - if ((offset & blkmask) || (len & blkmask)) - return false; - return true; -} - static enum pnfs_try_status bl_read_pagelist(struct nfs_read_data *rdata) { @@ -246,15 +253,15 @@ bl_read_pagelist(struct nfs_read_data *rdata) sector_t isect, extent_length = 0; struct parallel_io *par; loff_t f_offset = rdata->args.offset; + size_t bytes_left = rdata->args.count; + unsigned int pg_offset, pg_len; struct page **pages = rdata->args.pages; int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT; + const bool is_dio = (header->dreq != NULL); dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__, rdata->pages.npages, f_offset, (unsigned int)rdata->args.count); - if (!bl_check_alignment(f_offset, rdata->args.count, PAGE_CACHE_MASK)) - goto use_mds; - par = alloc_parallel(rdata); if (!par) goto use_mds; @@ -284,36 +291,53 @@ bl_read_pagelist(struct nfs_read_data *rdata) extent_length = min(extent_length, cow_length); } } + + if (is_dio) { + pg_offset = f_offset & ~PAGE_CACHE_MASK; + if (pg_offset + bytes_left > PAGE_CACHE_SIZE) + pg_len = PAGE_CACHE_SIZE - pg_offset; + else + pg_len = bytes_left; + + f_offset += pg_len; + bytes_left -= pg_len; + isect += (pg_offset >> SECTOR_SHIFT); + } else { + pg_offset = 0; + pg_len = PAGE_CACHE_SIZE; + } + hole = is_hole(be, isect); if (hole && !cow_read) { bio = bl_submit_bio(READ, bio); /* Fill hole w/ zeroes w/o accessing device */ dprintk("%s Zeroing page for hole\n", __func__); - zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE); + zero_user_segment(pages[i], pg_offset, pg_len); print_page(pages[i]); SetPageUptodate(pages[i]); } else { struct pnfs_block_extent *be_read; be_read = (hole && cow_read) ? cow_read : be; - bio = bl_add_page_to_bio(bio, rdata->pages.npages - i, + bio = do_add_page_to_bio(bio, rdata->pages.npages - i, READ, isect, pages[i], be_read, - bl_end_io_read, par); + bl_end_io_read, par, + pg_offset, pg_len); if (IS_ERR(bio)) { header->pnfs_error = PTR_ERR(bio); bio = NULL; goto out; } } - isect += PAGE_CACHE_SECTORS; + isect += (pg_len >> SECTOR_SHIFT); extent_length -= PAGE_CACHE_SECTORS; } if ((isect << SECTOR_SHIFT) >= header->inode->i_size) { rdata->res.eof = 1; - rdata->res.count = header->inode->i_size - f_offset; + rdata->res.count = header->inode->i_size - rdata->args.offset; } else { - rdata->res.count = (isect << SECTOR_SHIFT) - f_offset; + rdata->res.count = (isect << SECTOR_SHIFT) - rdata->args.offset; } out: bl_put_extent(be); @@ -461,6 +485,106 @@ map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be) return; } +static void +bl_read_single_end_io(struct bio *bio, int error) +{ + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct page *page = bvec->bv_page; + + /* Only one page in bvec */ + unlock_page(page); +} + +static int +bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be, + unsigned int offset, unsigned int len) +{ + struct bio *bio; + struct page *shadow_page; + sector_t isect; + char *kaddr, *kshadow_addr; + int ret = 0; + + dprintk("%s: offset %u len %u\n", __func__, offset, len); + + shadow_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (shadow_page == NULL) + return -ENOMEM; + + bio = bio_alloc(GFP_NOIO, 1); + if (bio == NULL) + return -ENOMEM; + + isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) + + (offset / SECTOR_SIZE); + + bio->bi_sector = isect - be->be_f_offset + be->be_v_offset; + bio->bi_bdev = be->be_mdev; + bio->bi_end_io = bl_read_single_end_io; + + lock_page(shadow_page); + if (bio_add_page(bio, shadow_page, + SECTOR_SIZE, round_down(offset, SECTOR_SIZE)) == 0) { + unlock_page(shadow_page); + bio_put(bio); + return -EIO; + } + + submit_bio(READ, bio); + wait_on_page_locked(shadow_page); + if (unlikely(!test_bit(BIO_UPTODATE, &bio->bi_flags))) { + ret = -EIO; + } else { + kaddr = kmap_atomic(page); + kshadow_addr = kmap_atomic(shadow_page); + memcpy(kaddr + offset, kshadow_addr + offset, len); + kunmap_atomic(kshadow_addr); + kunmap_atomic(kaddr); + } + __free_page(shadow_page); + bio_put(bio); + + return ret; +} + +static int +bl_read_partial_page_sync(struct page *page, struct pnfs_block_extent *be, + unsigned int dirty_offset, unsigned int dirty_len, + bool full_page) +{ + int ret = 0; + unsigned int start, end; + + if (full_page) { + start = 0; + end = PAGE_CACHE_SIZE; + } else { + start = round_down(dirty_offset, SECTOR_SIZE); + end = round_up(dirty_offset + dirty_len, SECTOR_SIZE); + } + + dprintk("%s: offset %u len %d\n", __func__, dirty_offset, dirty_len); + if (!be) { + zero_user_segments(page, start, dirty_offset, + dirty_offset + dirty_len, end); + if (start == 0 && end == PAGE_CACHE_SIZE && + trylock_page(page)) { + SetPageUptodate(page); + unlock_page(page); + } + return ret; + } + + if (start != dirty_offset) + ret = bl_do_readpage_sync(page, be, start, dirty_offset - start); + + if (!ret && (dirty_offset + dirty_len < end)) + ret = bl_do_readpage_sync(page, be, dirty_offset + dirty_len, + end - dirty_offset - dirty_len); + + return ret; +} + /* Given an unmapped page, zero it or read in page for COW, page is locked * by caller. */ @@ -494,7 +618,6 @@ init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read) SetPageUptodate(page); cleanup: - bl_put_extent(cow_read); if (bh) free_buffer_head(bh); if (ret) { @@ -566,6 +689,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync) struct parallel_io *par = NULL; loff_t offset = wdata->args.offset; size_t count = wdata->args.count; + unsigned int pg_offset, pg_len, saved_len; struct page **pages = wdata->args.pages; struct page *page; pgoff_t index; @@ -574,10 +698,13 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync) NFS_SERVER(header->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT; dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); - /* Check for alignment first */ - if (!bl_check_alignment(offset, count, PAGE_CACHE_MASK)) - goto out_mds; + if (header->dreq != NULL && + (!IS_ALIGNED(offset, NFS_SERVER(header->inode)->pnfs_blksize) || + !IS_ALIGNED(count, NFS_SERVER(header->inode)->pnfs_blksize))) { + dprintk("pnfsblock nonblock aligned DIO writes. Resend MDS\n"); + goto out_mds; + } /* At this point, wdata->pages is a (sequential) list of nfs_pages. * We want to write each, and if there is an error set pnfs_error * to have it redone using nfs. @@ -674,10 +801,11 @@ next_page: if (!extent_length) { /* We've used up the previous extent */ bl_put_extent(be); + bl_put_extent(cow_read); bio = bl_submit_bio(WRITE, bio); /* Get the next one */ be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), - isect, NULL); + isect, &cow_read); if (!be || !is_writable(be, isect)) { header->pnfs_error = -EINVAL; goto out; @@ -694,7 +822,26 @@ next_page: extent_length = be->be_length - (isect - be->be_f_offset); } - if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + + dprintk("%s offset %lld count %Zu\n", __func__, offset, count); + pg_offset = offset & ~PAGE_CACHE_MASK; + if (pg_offset + count > PAGE_CACHE_SIZE) + pg_len = PAGE_CACHE_SIZE - pg_offset; + else + pg_len = count; + + saved_len = pg_len; + if (be->be_state == PNFS_BLOCK_INVALID_DATA && + !bl_is_sector_init(be->be_inval, isect)) { + ret = bl_read_partial_page_sync(pages[i], cow_read, + pg_offset, pg_len, true); + if (ret) { + dprintk("%s bl_read_partial_page_sync fail %d\n", + __func__, ret); + header->pnfs_error = ret; + goto out; + } + ret = bl_mark_sectors_init(be->be_inval, isect, PAGE_CACHE_SECTORS); if (unlikely(ret)) { @@ -703,15 +850,35 @@ next_page: header->pnfs_error = ret; goto out; } + + /* Expand to full page write */ + pg_offset = 0; + pg_len = PAGE_CACHE_SIZE; + } else if ((pg_offset & (SECTOR_SIZE - 1)) || + (pg_len & (SECTOR_SIZE - 1))){ + /* ahh, nasty case. We have to do sync full sector + * read-modify-write cycles. + */ + unsigned int saved_offset = pg_offset; + ret = bl_read_partial_page_sync(pages[i], be, pg_offset, + pg_len, false); + pg_offset = round_down(pg_offset, SECTOR_SIZE); + pg_len = round_up(saved_offset + pg_len, SECTOR_SIZE) + - pg_offset; } - bio = bl_add_page_to_bio(bio, wdata->pages.npages - i, WRITE, + + + bio = do_add_page_to_bio(bio, wdata->pages.npages - i, WRITE, isect, pages[i], be, - bl_end_io_write, par); + bl_end_io_write, par, + pg_offset, pg_len); if (IS_ERR(bio)) { header->pnfs_error = PTR_ERR(bio); bio = NULL; goto out; } + offset += saved_len; + count -= saved_len; isect += PAGE_CACHE_SECTORS; last_isect = isect; extent_length -= PAGE_CACHE_SECTORS; @@ -729,17 +896,16 @@ next_page: } write_done: - wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset); - if (count < wdata->res.count) { - wdata->res.count = count; - } + wdata->res.count = wdata->args.count; out: bl_put_extent(be); + bl_put_extent(cow_read); bl_submit_bio(WRITE, bio); put_parallel(par); return PNFS_ATTEMPTED; out_mds: bl_put_extent(be); + bl_put_extent(cow_read); kfree(par); return PNFS_NOT_ATTEMPTED; } @@ -874,7 +1040,7 @@ static void free_blk_mountid(struct block_mount_id *mid) } } -/* This is mostly copied from the filelayout's get_device_info function. +/* This is mostly copied from the filelayout_get_device_info function. * It seems much of this should be at the generic pnfs level. */ static struct pnfs_block_dev * @@ -1011,33 +1177,95 @@ bl_clear_layoutdriver(struct nfs_server *server) return 0; } +static bool +is_aligned_req(struct nfs_page *req, unsigned int alignment) +{ + return IS_ALIGNED(req->wb_offset, alignment) && + IS_ALIGNED(req->wb_bytes, alignment); +} + static void bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - if (!bl_check_alignment(req->wb_offset, req->wb_bytes, PAGE_CACHE_MASK)) + if (pgio->pg_dreq != NULL && + !is_aligned_req(req, SECTOR_SIZE)) nfs_pageio_reset_read_mds(pgio); else pnfs_generic_pg_init_read(pgio, req); } +static bool +bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, + struct nfs_page *req) +{ + if (pgio->pg_dreq != NULL && + !is_aligned_req(req, SECTOR_SIZE)) + return false; + + return pnfs_generic_pg_test(pgio, prev, req); +} + +/* + * Return the number of contiguous bytes for a given inode + * starting at page frame idx. + */ +static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t end; + + /* Optimize common case that writes from 0 to end of file */ + end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE); + if (end != NFS_I(inode)->npages) { + rcu_read_lock(); + end = radix_tree_next_hole(&mapping->page_tree, idx + 1, ULONG_MAX); + rcu_read_unlock(); + } + + if (!end) + return i_size_read(inode) - (idx << PAGE_CACHE_SHIFT); + else + return (end - idx) << PAGE_CACHE_SHIFT; +} + static void bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - if (!bl_check_alignment(req->wb_offset, req->wb_bytes, PAGE_CACHE_MASK)) + if (pgio->pg_dreq != NULL && + !is_aligned_req(req, PAGE_CACHE_SIZE)) { nfs_pageio_reset_write_mds(pgio); - else - pnfs_generic_pg_init_write(pgio, req); + } else { + u64 wb_size; + if (pgio->pg_dreq == NULL) + wb_size = pnfs_num_cont_bytes(pgio->pg_inode, + req->wb_index); + else + wb_size = nfs_dreq_bytes_left(pgio->pg_dreq); + + pnfs_generic_pg_init_write(pgio, req, wb_size); + } +} + +static bool +bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, + struct nfs_page *req) +{ + if (pgio->pg_dreq != NULL && + !is_aligned_req(req, PAGE_CACHE_SIZE)) + return false; + + return pnfs_generic_pg_test(pgio, prev, req); } static const struct nfs_pageio_ops bl_pg_read_ops = { .pg_init = bl_pg_init_read, - .pg_test = pnfs_generic_pg_test, + .pg_test = bl_pg_test_read, .pg_doio = pnfs_generic_pg_readpages, }; static const struct nfs_pageio_ops bl_pg_write_ops = { .pg_init = bl_pg_init_write, - .pg_test = pnfs_generic_pg_test, + .pg_test = bl_pg_test_write, .pg_doio = pnfs_generic_pg_writepages, }; diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index 0335069..f4891bd 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -41,6 +41,7 @@ #define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT) #define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT) +#define SECTOR_SIZE (1 << SECTOR_SHIFT) struct block_mount_id { spinlock_t bm_lock; /* protects list */ @@ -172,7 +173,6 @@ struct bl_msg_hdr { /* blocklayoutdev.c */ ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); void bl_pipe_destroy_msg(struct rpc_pipe_msg *); -struct block_device *nfs4_blkdev_get(dev_t dev); int nfs4_blkdev_put(struct block_device *bdev); struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server, struct pnfs_device *dev); diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c index c965542..a86c5bd 100644 --- a/fs/nfs/blocklayout/blocklayoutdev.c +++ b/fs/nfs/blocklayout/blocklayoutdev.c @@ -53,22 +53,6 @@ static int decode_sector_number(__be32 **rp, sector_t *sp) return 0; } -/* Open a block_device by device number. */ -struct block_device *nfs4_blkdev_get(dev_t dev) -{ - struct block_device *bd; - - dprintk("%s enter\n", __func__); - bd = blkdev_get_by_dev(dev, FMODE_READ, NULL); - if (IS_ERR(bd)) - goto fail; - return bd; -fail: - dprintk("%s failed to open device : %ld\n", - __func__, PTR_ERR(bd)); - return NULL; -} - /* * Release the block device */ @@ -172,11 +156,12 @@ nfs4_blk_decode_device(struct nfs_server *server, goto out; } - bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor)); + bd = blkdev_get_by_dev(MKDEV(reply->major, reply->minor), + FMODE_READ, NULL); if (IS_ERR(bd)) { - rc = PTR_ERR(bd); - dprintk("%s failed to open device : %d\n", __func__, rc); - rv = ERR_PTR(rc); + dprintk("%s failed to open device : %ld\n", __func__, + PTR_ERR(bd)); + rv = ERR_CAST(bd); goto out; } diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c index 1f9a603..9c3e117 100644 --- a/fs/nfs/blocklayout/extents.c +++ b/fs/nfs/blocklayout/extents.c @@ -683,8 +683,7 @@ encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT); p = xdr_encode_hyper(p, 0LL); *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA); - list_del(&lce->bse_node); - list_add_tail(&lce->bse_node, &bl->bl_committing); + list_move_tail(&lce->bse_node, &bl->bl_committing); bl->bl_count--; count++; } diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 4c8459e..2245bef 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -12,6 +12,7 @@ #include <linux/sunrpc/svc.h> #include <linux/sunrpc/svcsock.h> #include <linux/nfs_fs.h> +#include <linux/errno.h> #include <linux/mutex.h> #include <linux/freezer.h> #include <linux/kthread.h> @@ -23,6 +24,7 @@ #include "nfs4_fs.h" #include "callback.h" #include "internal.h" +#include "netns.h" #define NFSDBG_FACILITY NFSDBG_CALLBACK @@ -37,7 +39,32 @@ static struct nfs_callback_data nfs_callback_info[NFS4_MAX_MINOR_VERSION + 1]; static DEFINE_MUTEX(nfs_callback_mutex); static struct svc_program nfs4_callback_program; -unsigned short nfs_callback_tcpport6; +static int nfs4_callback_up_net(struct svc_serv *serv, struct net *net) +{ + int ret; + struct nfs_net *nn = net_generic(net, nfs_net_id); + + ret = svc_create_xprt(serv, "tcp", net, PF_INET, + nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); + if (ret <= 0) + goto out_err; + nn->nfs_callback_tcpport = ret; + dprintk("NFS: Callback listener port = %u (af %u, net %p)\n", + nn->nfs_callback_tcpport, PF_INET, net); + + ret = svc_create_xprt(serv, "tcp", net, PF_INET6, + nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); + if (ret > 0) { + nn->nfs_callback_tcpport6 = ret; + dprintk("NFS: Callback listener port = %u (af %u, net %p)\n", + nn->nfs_callback_tcpport6, PF_INET6, net); + } else if (ret != -EAFNOSUPPORT) + goto out_err; + return 0; + +out_err: + return (ret) ? ret : -ENOMEM; +} /* * This is the NFSv4 callback kernel thread. @@ -78,38 +105,23 @@ nfs4_callback_svc(void *vrqstp) * Prepare to bring up the NFSv4 callback service */ static struct svc_rqst * -nfs4_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) +nfs4_callback_up(struct svc_serv *serv) { - int ret; - - ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET, - nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); - if (ret <= 0) - goto out_err; - nfs_callback_tcpport = ret; - dprintk("NFS: Callback listener port = %u (af %u)\n", - nfs_callback_tcpport, PF_INET); - - ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET6, - nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); - if (ret > 0) { - nfs_callback_tcpport6 = ret; - dprintk("NFS: Callback listener port = %u (af %u)\n", - nfs_callback_tcpport6, PF_INET6); - } else if (ret == -EAFNOSUPPORT) - ret = 0; - else - goto out_err; - return svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); - -out_err: - if (ret == 0) - ret = -ENOMEM; - return ERR_PTR(ret); } #if defined(CONFIG_NFS_V4_1) +static int nfs41_callback_up_net(struct svc_serv *serv, struct net *net) +{ + /* + * Create an svc_sock for the back channel service that shares the + * fore channel connection. + * Returns the input port (0) and sets the svc_serv bc_xprt on success + */ + return svc_create_xprt(serv, "tcp-bc", net, PF_INET, 0, + SVC_SOCK_ANONYMOUS); +} + /* * The callback service for NFSv4.1 callbacks */ @@ -149,28 +161,9 @@ nfs41_callback_svc(void *vrqstp) * Bring up the NFSv4.1 callback service */ static struct svc_rqst * -nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) +nfs41_callback_up(struct svc_serv *serv) { struct svc_rqst *rqstp; - int ret; - - /* - * Create an svc_sock for the back channel service that shares the - * fore channel connection. - * Returns the input port (0) and sets the svc_serv bc_xprt on success - */ - ret = svc_create_xprt(serv, "tcp-bc", &init_net, PF_INET, 0, - SVC_SOCK_ANONYMOUS); - if (ret < 0) { - rqstp = ERR_PTR(ret); - goto out; - } - - /* - * Save the svc_serv in the transport so that it can - * be referenced when the session backchannel is initialized - */ - xprt->bc_serv = serv; INIT_LIST_HEAD(&serv->sv_cb_list); spin_lock_init(&serv->sv_cb_lock); @@ -180,90 +173,74 @@ nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) svc_xprt_put(serv->sv_bc_xprt); serv->sv_bc_xprt = NULL; } -out: dprintk("--> %s return %ld\n", __func__, IS_ERR(rqstp) ? PTR_ERR(rqstp) : 0); return rqstp; } -static inline int nfs_minorversion_callback_svc_setup(u32 minorversion, - struct svc_serv *serv, struct rpc_xprt *xprt, +static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv, struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp)) { - if (minorversion) { - *rqstpp = nfs41_callback_up(serv, xprt); - *callback_svc = nfs41_callback_svc; - } - return minorversion; + *rqstpp = nfs41_callback_up(serv); + *callback_svc = nfs41_callback_svc; } static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, - struct nfs_callback_data *cb_info) + struct svc_serv *serv) { if (minorversion) - xprt->bc_serv = cb_info->serv; + /* + * Save the svc_serv in the transport so that it can + * be referenced when the session backchannel is initialized + */ + xprt->bc_serv = serv; } #else -static inline int nfs_minorversion_callback_svc_setup(u32 minorversion, - struct svc_serv *serv, struct rpc_xprt *xprt, - struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp)) +static int nfs41_callback_up_net(struct svc_serv *serv, struct net *net) { return 0; } +static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv, + struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp)) +{ + *rqstpp = ERR_PTR(-ENOTSUPP); + *callback_svc = ERR_PTR(-ENOTSUPP); +} + static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, - struct nfs_callback_data *cb_info) + struct svc_serv *serv) { } #endif /* CONFIG_NFS_V4_1 */ -/* - * Bring up the callback thread if it is not already up. - */ -int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) +static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt, + struct svc_serv *serv) { - struct svc_serv *serv = NULL; struct svc_rqst *rqstp; int (*callback_svc)(void *vrqstp); struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; char svc_name[12]; - int ret = 0; - int minorversion_setup; - struct net *net = &init_net; + int ret; - mutex_lock(&nfs_callback_mutex); - if (cb_info->users++ || cb_info->task != NULL) { - nfs_callback_bc_serv(minorversion, xprt, cb_info); - goto out; - } - serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL); - if (!serv) { - ret = -ENOMEM; - goto out_err; - } - /* As there is only one thread we need to over-ride the - * default maximum of 80 connections - */ - serv->sv_maxconn = 1024; + nfs_callback_bc_serv(minorversion, xprt, serv); - ret = svc_bind(serv, net); - if (ret < 0) { - printk(KERN_WARNING "NFS: bind callback service failed\n"); - goto out_err; - } + if (cb_info->task) + return 0; - minorversion_setup = nfs_minorversion_callback_svc_setup(minorversion, - serv, xprt, &rqstp, &callback_svc); - if (!minorversion_setup) { + switch (minorversion) { + case 0: /* v4.0 callback setup */ - rqstp = nfs4_callback_up(serv, xprt); + rqstp = nfs4_callback_up(serv); callback_svc = nfs4_callback_svc; + break; + default: + nfs_minorversion_callback_svc_setup(serv, + &rqstp, &callback_svc); } - if (IS_ERR(rqstp)) { - ret = PTR_ERR(rqstp); - goto out_err; - } + if (IS_ERR(rqstp)) + return PTR_ERR(rqstp); svc_sock_update_bufs(serv); @@ -276,41 +253,165 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) svc_exit_thread(cb_info->rqst); cb_info->rqst = NULL; cb_info->task = NULL; - goto out_err; + return PTR_ERR(cb_info->task); + } + dprintk("nfs_callback_up: service started\n"); + return 0; +} + +static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struct net *net) +{ + struct nfs_net *nn = net_generic(net, nfs_net_id); + + if (--nn->cb_users[minorversion]) + return; + + dprintk("NFS: destroy per-net callback data; net=%p\n", net); + svc_shutdown_net(serv, net); +} + +static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct net *net) +{ + struct nfs_net *nn = net_generic(net, nfs_net_id); + int ret; + + if (nn->cb_users[minorversion]++) + return 0; + + dprintk("NFS: create per-net callback data; net=%p\n", net); + + ret = svc_bind(serv, net); + if (ret < 0) { + printk(KERN_WARNING "NFS: bind callback service failed\n"); + goto err_bind; + } + + switch (minorversion) { + case 0: + ret = nfs4_callback_up_net(serv, net); + break; + case 1: + ret = nfs41_callback_up_net(serv, net); + break; + default: + printk(KERN_ERR "NFS: unknown callback version: %d\n", + minorversion); + ret = -EINVAL; + break; } -out: + + if (ret < 0) { + printk(KERN_ERR "NFS: callback service start failed\n"); + goto err_socks; + } + return 0; + +err_socks: + svc_rpcb_cleanup(serv, net); +err_bind: + dprintk("NFS: Couldn't create callback socket: err = %d; " + "net = %p\n", ret, net); + return ret; +} + +static struct svc_serv *nfs_callback_create_svc(int minorversion) +{ + struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; + struct svc_serv *serv; + + /* + * Check whether we're already up and running. + */ + if (cb_info->task) { + /* + * Note: increase service usage, because later in case of error + * svc_destroy() will be called. + */ + svc_get(cb_info->serv); + return cb_info->serv; + } + + /* + * Sanity check: if there's no task, + * we should be the first user ... + */ + if (cb_info->users) + printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n", + cb_info->users); + + serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL); + if (!serv) { + printk(KERN_ERR "nfs_callback_create_svc: create service failed\n"); + return ERR_PTR(-ENOMEM); + } + /* As there is only one thread we need to over-ride the + * default maximum of 80 connections + */ + serv->sv_maxconn = 1024; + dprintk("nfs_callback_create_svc: service created\n"); + return serv; +} + +/* + * Bring up the callback thread if it is not already up. + */ +int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) +{ + struct svc_serv *serv; + struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; + int ret; + struct net *net = xprt->xprt_net; + + mutex_lock(&nfs_callback_mutex); + + serv = nfs_callback_create_svc(minorversion); + if (IS_ERR(serv)) { + ret = PTR_ERR(serv); + goto err_create; + } + + ret = nfs_callback_up_net(minorversion, serv, net); + if (ret < 0) + goto err_net; + + ret = nfs_callback_start_svc(minorversion, xprt, serv); + if (ret < 0) + goto err_start; + + cb_info->users++; /* * svc_create creates the svc_serv with sv_nrthreads == 1, and then * svc_prepare_thread increments that. So we need to call svc_destroy * on both success and failure so that the refcount is 1 when the * thread exits. */ - if (serv) - svc_destroy(serv); +err_net: + svc_destroy(serv); +err_create: mutex_unlock(&nfs_callback_mutex); return ret; -out_err: - dprintk("NFS: Couldn't create callback socket or server thread; " - "err = %d\n", ret); - cb_info->users--; - if (serv) - svc_shutdown_net(serv, net); - goto out; + +err_start: + nfs_callback_down_net(minorversion, serv, net); + dprintk("NFS: Couldn't create server thread; err = %d\n", ret); + goto err_net; } /* * Kill the callback thread if it's no longer being used. */ -void nfs_callback_down(int minorversion) +void nfs_callback_down(int minorversion, struct net *net) { struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; mutex_lock(&nfs_callback_mutex); + nfs_callback_down_net(minorversion, cb_info->serv, net); cb_info->users--; if (cb_info->users == 0 && cb_info->task != NULL) { kthread_stop(cb_info->task); - svc_shutdown_net(cb_info->serv, &init_net); + dprintk("nfs_callback_down: service stopped\n"); svc_exit_thread(cb_info->rqst); + dprintk("nfs_callback_down: service destroyed\n"); cb_info->serv = NULL; cb_info->rqst = NULL; cb_info->task = NULL; diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index b44d7b1..4251c2ae 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -194,7 +194,7 @@ extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, struct cb_process_state *cps); #if IS_ENABLED(CONFIG_NFS_V4) extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt); -extern void nfs_callback_down(int minorversion); +extern void nfs_callback_down(int minorversion, struct net *net); extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid); extern int nfs4_set_callback_sessionid(struct nfs_client *clp); @@ -209,6 +209,5 @@ extern int nfs4_set_callback_sessionid(struct nfs_client *clp); extern unsigned int nfs_callback_set_tcpport; extern unsigned short nfs_callback_tcpport; -extern unsigned short nfs_callback_tcpport6; #endif /* __LINUX_FS_NFS_CALLBACK_H */ diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 1b5d809..76b4a7a 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -122,7 +122,15 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, ino = igrab(lo->plh_inode); if (!ino) continue; - get_layout_hdr(lo); + spin_lock(&ino->i_lock); + /* Is this layout in the process of being freed? */ + if (NFS_I(ino)->layout != lo) { + spin_unlock(&ino->i_lock); + iput(ino); + continue; + } + pnfs_get_layout_hdr(lo); + spin_unlock(&ino->i_lock); return lo; } } @@ -158,7 +166,7 @@ static u32 initiate_file_draining(struct nfs_client *clp, ino = lo->plh_inode; spin_lock(&ino->i_lock); if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || - mark_matching_lsegs_invalid(lo, &free_me_list, + pnfs_mark_matching_lsegs_invalid(lo, &free_me_list, &args->cbl_range)) rv = NFS4ERR_DELAY; else @@ -166,7 +174,7 @@ static u32 initiate_file_draining(struct nfs_client *clp, pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&free_me_list); - put_layout_hdr(lo); + pnfs_put_layout_hdr(lo); iput(ino); return rv; } @@ -196,9 +204,18 @@ static u32 initiate_bulk_draining(struct nfs_client *clp, continue; list_for_each_entry(lo, &server->layouts, plh_layouts) { - if (!igrab(lo->plh_inode)) + ino = igrab(lo->plh_inode); + if (ino) + continue; + spin_lock(&ino->i_lock); + /* Is this layout in the process of being freed? */ + if (NFS_I(ino)->layout != lo) { + spin_unlock(&ino->i_lock); + iput(ino); continue; - get_layout_hdr(lo); + } + pnfs_get_layout_hdr(lo); + spin_unlock(&ino->i_lock); BUG_ON(!list_empty(&lo->plh_bulk_recall)); list_add(&lo->plh_bulk_recall, &recall_list); } @@ -211,12 +228,12 @@ static u32 initiate_bulk_draining(struct nfs_client *clp, ino = lo->plh_inode; spin_lock(&ino->i_lock); set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); - if (mark_matching_lsegs_invalid(lo, &free_me_list, &range)) + if (pnfs_mark_matching_lsegs_invalid(lo, &free_me_list, &range)) rv = NFS4ERR_DELAY; list_del_init(&lo->plh_bulk_recall); spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&free_me_list); - put_layout_hdr(lo); + pnfs_put_layout_hdr(lo); iput(ino); } return rv; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 9969444..8b39a42 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -93,10 +93,10 @@ static struct nfs_subversion *find_nfs_version(unsigned int version) spin_unlock(&nfs_version_lock); return nfs; } - }; + } spin_unlock(&nfs_version_lock); - return ERR_PTR(-EPROTONOSUPPORT);; + return ERR_PTR(-EPROTONOSUPPORT); } struct nfs_subversion *get_nfs_version(unsigned int version) @@ -498,7 +498,8 @@ nfs_get_client(const struct nfs_client_initdata *cl_init, return nfs_found_client(cl_init, clp); } if (new) { - list_add(&new->cl_share_link, &nn->nfs_client_list); + list_add_tail(&new->cl_share_link, + &nn->nfs_client_list); spin_unlock(&nn->nfs_client_lock); new->cl_flags = cl_init->init_flags; return rpc_ops->init_client(new, timeparms, ip_addr, @@ -668,7 +669,8 @@ int nfs_init_server_rpcclient(struct nfs_server *server, { struct nfs_client *clp = server->nfs_client; - server->client = rpc_clone_client(clp->cl_rpcclient); + server->client = rpc_clone_client_set_auth(clp->cl_rpcclient, + pseudoflavour); if (IS_ERR(server->client)) { dprintk("%s: couldn't create rpc_client!\n", __func__); return PTR_ERR(server->client); @@ -678,16 +680,6 @@ int nfs_init_server_rpcclient(struct nfs_server *server, timeo, sizeof(server->client->cl_timeout_default)); server->client->cl_timeout = &server->client->cl_timeout_default; - - if (pseudoflavour != clp->cl_rpcclient->cl_auth->au_flavor) { - struct rpc_auth *auth; - - auth = rpcauth_create(pseudoflavour, server->client); - if (IS_ERR(auth)) { - dprintk("%s: couldn't create credcache!\n", __func__); - return PTR_ERR(auth); - } - } server->client->cl_softrtry = 0; if (server->flags & NFS_MOUNT_SOFT) server->client->cl_softrtry = 1; @@ -761,6 +753,8 @@ static int nfs_init_server(struct nfs_server *server, data->timeo, data->retrans); if (data->flags & NFS_MOUNT_NORESVPORT) set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + if (server->options & NFS_OPTION_MIGRATION) + set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); /* Allocate or find a client reference we can use */ clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX); @@ -855,7 +849,6 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, if (server->wsize > NFS_MAX_FILE_IO_SIZE) server->wsize = NFS_MAX_FILE_IO_SIZE; server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - server->pnfs_blksize = fsinfo->blksize; server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 627f108..ce8cb92 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2072,7 +2072,7 @@ found: nfs_access_free_entry(entry); } -static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) +void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) { struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL); if (cache == NULL) @@ -2098,6 +2098,20 @@ static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *s spin_unlock(&nfs_access_lru_lock); } } +EXPORT_SYMBOL_GPL(nfs_access_add_cache); + +void nfs_access_set_mask(struct nfs_access_entry *entry, u32 access_result) +{ + entry->mask = 0; + if (access_result & NFS4_ACCESS_READ) + entry->mask |= MAY_READ; + if (access_result & + (NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE)) + entry->mask |= MAY_WRITE; + if (access_result & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE)) + entry->mask |= MAY_EXEC; +} +EXPORT_SYMBOL_GPL(nfs_access_set_mask); static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) { diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 1ba385b..cae26cb 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -46,6 +46,7 @@ #include <linux/kref.h> #include <linux/slab.h> #include <linux/task_io_accounting_ops.h> +#include <linux/module.h> #include <linux/nfs_fs.h> #include <linux/nfs_page.h> @@ -78,6 +79,7 @@ struct nfs_direct_req { atomic_t io_count; /* i/os we're waiting for */ spinlock_t lock; /* protect completion state */ ssize_t count, /* bytes actually processed */ + bytes_left, /* bytes left to be sent */ error; /* any reported error */ struct completion completion; /* wait for i/o completion */ @@ -190,6 +192,12 @@ static void nfs_direct_req_release(struct nfs_direct_req *dreq) kref_put(&dreq->kref, nfs_direct_req_free); } +ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq) +{ + return dreq->bytes_left; +} +EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left); + /* * Collects and returns the final error value/byte-count. */ @@ -390,6 +398,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de user_addr += req_len; pos += req_len; count -= req_len; + dreq->bytes_left -= req_len; } /* The nfs_page now hold references to these pages */ nfs_direct_release_pages(pagevec, npages); @@ -450,23 +459,28 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, ssize_t result = -ENOMEM; struct inode *inode = iocb->ki_filp->f_mapping->host; struct nfs_direct_req *dreq; + struct nfs_lock_context *l_ctx; dreq = nfs_direct_req_alloc(); if (dreq == NULL) goto out; dreq->inode = inode; + dreq->bytes_left = iov_length(iov, nr_segs); dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); - dreq->l_ctx = nfs_get_lock_context(dreq->ctx); - if (dreq->l_ctx == NULL) + l_ctx = nfs_get_lock_context(dreq->ctx); + if (IS_ERR(l_ctx)) { + result = PTR_ERR(l_ctx); goto out_release; + } + dreq->l_ctx = l_ctx; if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; + NFS_I(inode)->read_io += iov_length(iov, nr_segs); result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio); if (!result) result = nfs_direct_wait(dreq); - NFS_I(inode)->read_io += result; out_release: nfs_direct_req_release(dreq); out: @@ -706,6 +720,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d user_addr += req_len; pos += req_len; count -= req_len; + dreq->bytes_left -= req_len; } /* The nfs_page now hold references to these pages */ nfs_direct_release_pages(pagevec, npages); @@ -814,6 +829,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, get_dreq(dreq); atomic_inc(&inode->i_dio_count); + NFS_I(dreq->inode)->write_io += iov_length(iov, nr_segs); for (seg = 0; seg < nr_segs; seg++) { const struct iovec *vec = &iov[seg]; result = nfs_direct_write_schedule_segment(&desc, vec, pos, uio); @@ -825,7 +841,6 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, pos += vec->iov_len; } nfs_pageio_complete(&desc); - NFS_I(dreq->inode)->write_io += desc.pg_bytes_written; /* * If no bytes were started, return the error, and let the @@ -849,16 +864,21 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, ssize_t result = -ENOMEM; struct inode *inode = iocb->ki_filp->f_mapping->host; struct nfs_direct_req *dreq; + struct nfs_lock_context *l_ctx; dreq = nfs_direct_req_alloc(); if (!dreq) goto out; dreq->inode = inode; + dreq->bytes_left = count; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); - dreq->l_ctx = nfs_get_lock_context(dreq->ctx); - if (dreq->l_ctx == NULL) + l_ctx = nfs_get_lock_context(dreq->ctx); + if (IS_ERR(l_ctx)) { + result = PTR_ERR(l_ctx); goto out_release; + } + dreq->l_ctx = l_ctx; if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 75d6d0a..582bb88 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -259,7 +259,7 @@ nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync) struct dentry *dentry = file->f_path.dentry; struct nfs_open_context *ctx = nfs_file_open_context(file); struct inode *inode = dentry->d_inode; - int have_error, status; + int have_error, do_resend, status; int ret = 0; dprintk("NFS: fsync file(%s/%s) datasync %d\n", @@ -267,15 +267,23 @@ nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync) datasync); nfs_inc_stats(inode, NFSIOS_VFSFSYNC); + do_resend = test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags); have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); status = nfs_commit_inode(inode, FLUSH_SYNC); - if (status >= 0 && ret < 0) - status = ret; have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); - if (have_error) + if (have_error) { ret = xchg(&ctx->error, 0); - if (!ret && status < 0) + if (ret) + goto out; + } + if (status < 0) { ret = status; + goto out; + } + do_resend |= test_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags); + if (do_resend) + ret = -EAGAIN; +out: return ret; } EXPORT_SYMBOL_GPL(nfs_file_fsync_commit); @@ -286,10 +294,21 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) int ret; struct inode *inode = file->f_path.dentry->d_inode; - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - mutex_lock(&inode->i_mutex); - ret = nfs_file_fsync_commit(file, start, end, datasync); - mutex_unlock(&inode->i_mutex); + do { + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret != 0) + break; + mutex_lock(&inode->i_mutex); + ret = nfs_file_fsync_commit(file, start, end, datasync); + mutex_unlock(&inode->i_mutex); + /* + * If nfs_file_fsync_commit detected a server reboot, then + * resend all dirty pages that might have been covered by + * the NFS_CONTEXT_RESEND_WRITES flag + */ + start = 0; + end = LLONG_MAX; + } while (ret == -EAGAIN); return ret; } @@ -576,6 +595,7 @@ out: static const struct vm_operations_struct nfs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = nfs_vm_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int nfs_need_sync_write(struct file *filp, struct inode *inode) diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 4654ced..033803c 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -32,6 +32,8 @@ #include <asm/uaccess.h> +#include "internal.h" + #define NFSDBG_FACILITY NFSDBG_CLIENT /* diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index a850079..9cc4a3f 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -55,18 +55,19 @@ static const struct cred *id_resolver_cache; static struct key_type key_type_id_resolver_legacy; -struct idmap { - struct rpc_pipe *idmap_pipe; - struct key_construction *idmap_key_cons; - struct mutex idmap_mutex; -}; - struct idmap_legacy_upcalldata { struct rpc_pipe_msg pipe_msg; struct idmap_msg idmap_msg; + struct key_construction *key_cons; struct idmap *idmap; }; +struct idmap { + struct rpc_pipe *idmap_pipe; + struct idmap_legacy_upcalldata *idmap_upcall_data; + struct mutex idmap_mutex; +}; + /** * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields * @fattr: fully initialised struct nfs_fattr @@ -158,7 +159,7 @@ static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *re return 0; memcpy(buf, name, namelen); buf[namelen] = '\0'; - if (strict_strtoul(buf, 0, &val) != 0) + if (kstrtoul(buf, 0, &val) != 0) return 0; *res = val; return 1; @@ -330,7 +331,6 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, ret = nfs_idmap_request_key(&key_type_id_resolver_legacy, name, namelen, type, data, data_size, idmap); - idmap->idmap_key_cons = NULL; mutex_unlock(&idmap->idmap_mutex); } return ret; @@ -364,7 +364,7 @@ static int nfs_idmap_lookup_id(const char *name, size_t namelen, const char *typ if (data_size <= 0) { ret = -EINVAL; } else { - ret = strict_strtol(id_str, 10, &id_long); + ret = kstrtol(id_str, 10, &id_long); *id = (__u32)id_long; } return ret; @@ -465,8 +465,6 @@ nfs_idmap_new(struct nfs_client *clp) struct rpc_pipe *pipe; int error; - BUG_ON(clp->cl_idmap != NULL); - idmap = kzalloc(sizeof(*idmap), GFP_KERNEL); if (idmap == NULL) return -ENOMEM; @@ -510,7 +508,6 @@ static int __rpc_pipefs_event(struct nfs_client *clp, unsigned long event, switch (event) { case RPC_PIPEFS_MOUNT: - BUG_ON(clp->cl_rpcclient->cl_dentry == NULL); err = __nfs_idmap_register(clp->cl_rpcclient->cl_dentry, clp->cl_idmap, clp->cl_idmap->idmap_pipe); @@ -632,9 +629,6 @@ static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap, substring_t substr; int token, ret; - memset(im, 0, sizeof(*im)); - memset(msg, 0, sizeof(*msg)); - im->im_type = IDMAP_TYPE_GROUP; token = match_token(desc, nfs_idmap_tokens, &substr); @@ -665,6 +659,35 @@ out: return ret; } +static bool +nfs_idmap_prepare_pipe_upcall(struct idmap *idmap, + struct idmap_legacy_upcalldata *data) +{ + if (idmap->idmap_upcall_data != NULL) { + WARN_ON_ONCE(1); + return false; + } + idmap->idmap_upcall_data = data; + return true; +} + +static void +nfs_idmap_complete_pipe_upcall_locked(struct idmap *idmap, int ret) +{ + struct key_construction *cons = idmap->idmap_upcall_data->key_cons; + + kfree(idmap->idmap_upcall_data); + idmap->idmap_upcall_data = NULL; + complete_request_key(cons, ret); +} + +static void +nfs_idmap_abort_pipe_upcall(struct idmap *idmap, int ret) +{ + if (idmap->idmap_upcall_data != NULL) + nfs_idmap_complete_pipe_upcall_locked(idmap, ret); +} + static int nfs_idmap_legacy_upcall(struct key_construction *cons, const char *op, void *aux) @@ -677,29 +700,28 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons, int ret = -ENOMEM; /* msg and im are freed in idmap_pipe_destroy_msg */ - data = kmalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) goto out1; msg = &data->pipe_msg; im = &data->idmap_msg; data->idmap = idmap; + data->key_cons = cons; ret = nfs_idmap_prepare_message(key->description, idmap, im, msg); if (ret < 0) goto out2; - BUG_ON(idmap->idmap_key_cons != NULL); - idmap->idmap_key_cons = cons; + ret = -EAGAIN; + if (!nfs_idmap_prepare_pipe_upcall(idmap, data)) + goto out2; ret = rpc_queue_upcall(idmap->idmap_pipe, msg); if (ret < 0) - goto out3; + nfs_idmap_abort_pipe_upcall(idmap, ret); return ret; - -out3: - idmap->idmap_key_cons = NULL; out2: kfree(data); out1: @@ -714,21 +736,32 @@ static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *dat authkey); } -static int nfs_idmap_read_message(struct idmap_msg *im, struct key *key, struct key *authkey) +static int nfs_idmap_read_and_verify_message(struct idmap_msg *im, + struct idmap_msg *upcall, + struct key *key, struct key *authkey) { char id_str[NFS_UINT_MAXLEN]; - int ret = -EINVAL; + int ret = -ENOKEY; + /* ret = -ENOKEY */ + if (upcall->im_type != im->im_type || upcall->im_conv != im->im_conv) + goto out; switch (im->im_conv) { case IDMAP_CONV_NAMETOID: + if (strcmp(upcall->im_name, im->im_name) != 0) + break; sprintf(id_str, "%d", im->im_id); ret = nfs_idmap_instantiate(key, authkey, id_str); break; case IDMAP_CONV_IDTONAME: + if (upcall->im_id != im->im_id) + break; ret = nfs_idmap_instantiate(key, authkey, im->im_name); break; + default: + ret = -EINVAL; } - +out: return ret; } @@ -740,14 +773,16 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) struct key_construction *cons; struct idmap_msg im; size_t namelen_in; - int ret; + int ret = -ENOKEY; /* If instantiation is successful, anyone waiting for key construction * will have been woken up and someone else may now have used * idmap_key_cons - so after this point we may no longer touch it. */ - cons = ACCESS_ONCE(idmap->idmap_key_cons); - idmap->idmap_key_cons = NULL; + if (idmap->idmap_upcall_data == NULL) + goto out_noupcall; + + cons = idmap->idmap_upcall_data->key_cons; if (mlen != sizeof(im)) { ret = -ENOSPC; @@ -768,16 +803,19 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) { ret = -EINVAL; goto out; - } +} - ret = nfs_idmap_read_message(&im, cons->key, cons->authkey); + ret = nfs_idmap_read_and_verify_message(&im, + &idmap->idmap_upcall_data->idmap_msg, + cons->key, cons->authkey); if (ret >= 0) { key_set_timeout(cons->key, nfs_idmap_cache_timeout); ret = mlen; } out: - complete_request_key(cons, ret); + nfs_idmap_complete_pipe_upcall_locked(idmap, ret); +out_noupcall: return ret; } @@ -788,14 +826,9 @@ idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg) struct idmap_legacy_upcalldata, pipe_msg); struct idmap *idmap = data->idmap; - struct key_construction *cons; - if (msg->errno) { - cons = ACCESS_ONCE(idmap->idmap_key_cons); - idmap->idmap_key_cons = NULL; - complete_request_key(cons, msg->errno); - } - /* Free memory allocated in nfs_idmap_legacy_upcall() */ - kfree(data); + + if (msg->errno) + nfs_idmap_abort_pipe_upcall(idmap, msg->errno); } static void @@ -803,7 +836,8 @@ idmap_release_pipe(struct inode *inode) { struct rpc_inode *rpci = RPC_I(inode); struct idmap *idmap = (struct idmap *)rpci->private; - idmap->idmap_key_cons = NULL; + + nfs_idmap_abort_pipe_upcall(idmap, -EPIPE); } int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index c6e895f..5c7325c 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -154,7 +154,7 @@ static void nfs_zap_caches_locked(struct inode *inode) nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = jiffies; - memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode))); + memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf)); if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; else @@ -547,8 +547,8 @@ EXPORT_SYMBOL_GPL(nfs_getattr); static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) { atomic_set(&l_ctx->count, 1); - l_ctx->lockowner = current->files; - l_ctx->pid = current->tgid; + l_ctx->lockowner.l_owner = current->files; + l_ctx->lockowner.l_pid = current->tgid; INIT_LIST_HEAD(&l_ctx->list); } @@ -557,9 +557,9 @@ static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context struct nfs_lock_context *pos; list_for_each_entry(pos, &ctx->lock_context.list, list) { - if (pos->lockowner != current->files) + if (pos->lockowner.l_owner != current->files) continue; - if (pos->pid != current->tgid) + if (pos->lockowner.l_pid != current->tgid) continue; atomic_inc(&pos->count); return pos; @@ -578,7 +578,7 @@ struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx) spin_unlock(&inode->i_lock); new = kmalloc(sizeof(*new), GFP_KERNEL); if (new == NULL) - return NULL; + return ERR_PTR(-ENOMEM); nfs_init_lock_context(new); spin_lock(&inode->i_lock); res = __nfs_find_lock_context(ctx); @@ -1571,6 +1571,11 @@ static int __init nfs_init_inodecache(void) static void nfs_destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(nfs_inode_cachep); } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 31fdb03..59b133c 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -101,11 +101,11 @@ struct nfs_client_initdata { */ struct nfs_parsed_mount_data { int flags; - int rsize, wsize; - int timeo, retrans; - int acregmin, acregmax, + unsigned int rsize, wsize; + unsigned int timeo, retrans; + unsigned int acregmin, acregmax, acdirmin, acdirmax; - int namlen; + unsigned int namlen; unsigned int options; unsigned int bsize; unsigned int auth_flavor_len; @@ -464,6 +464,7 @@ static inline void nfs_inode_dio_wait(struct inode *inode) { inode_dio_wait(inode); } +extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); /* nfs4proc.c */ extern void __nfs4_read_done_cb(struct nfs_read_data *); @@ -483,6 +484,12 @@ extern int _nfs4_call_sync_session(struct rpc_clnt *clnt, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply); +extern int nfs40_walk_client_list(struct nfs_client *clp, + struct nfs_client **result, + struct rpc_cred *cred); +extern int nfs41_walk_client_list(struct nfs_client *clp, + struct nfs_client **result, + struct rpc_cred *cred); /* * Determine the device name as a string diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h index 0539de1..8ee1fab 100644 --- a/fs/nfs/netns.h +++ b/fs/nfs/netns.h @@ -5,6 +5,7 @@ #ifndef __NFS_NETNS_H__ #define __NFS_NETNS_H__ +#include <linux/nfs4.h> #include <net/net_namespace.h> #include <net/netns/generic.h> @@ -22,6 +23,9 @@ struct nfs_net { struct list_head nfs_volume_list; #if IS_ENABLED(CONFIG_NFS_V4) struct idr cb_ident_idr; /* Protected by nfs_client_lock */ + unsigned short nfs_callback_tcpport; + unsigned short nfs_callback_tcpport6; + int cb_users[NFS4_MAX_MINOR_VERSION + 1]; #endif spinlock_t nfs_client_lock; struct timespec boot_time; diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index e4498dc..4a1aafb 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -70,7 +70,7 @@ ssize_t nfs3_getxattr(struct dentry *dentry, const char *name, if (type == ACL_TYPE_ACCESS && acl->a_count == 0) error = -ENODATA; else - error = posix_acl_to_xattr(acl, buffer, size); + error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); posix_acl_release(acl); } else error = -ENODATA; @@ -92,7 +92,7 @@ int nfs3_setxattr(struct dentry *dentry, const char *name, else return -EOPNOTSUPP; - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (IS_ERR(acl)) return PTR_ERR(acl); error = nfs3_proc_setacl(inode, type, acl); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index d6b3b5f..6932209 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -643,7 +643,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, u64 cookie, struct page **pages, unsigned int count, int plus) { struct inode *dir = dentry->d_inode; - __be32 *verf = NFS_COOKIEVERF(dir); + __be32 *verf = NFS_I(dir)->cookieverf; struct nfs3_readdirargs arg = { .fh = NFS_FH(dir), .cookie = cookie, diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index da0618a..a525fde 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -132,8 +132,8 @@ struct nfs4_lock_owner { struct nfs4_lock_state { struct list_head ls_locks; /* Other lock stateids */ struct nfs4_state * ls_state; /* Pointer to open state */ -#define NFS_LOCK_INITIALIZED 1 - int ls_flags; +#define NFS_LOCK_INITIALIZED 0 + unsigned long ls_flags; struct nfs_seqid_counter ls_seqid; nfs4_stateid ls_stateid; atomic_t ls_count; @@ -191,6 +191,8 @@ struct nfs4_state_recovery_ops { int (*establish_clid)(struct nfs_client *, struct rpc_cred *); struct rpc_cred * (*get_clid_cred)(struct nfs_client *); int (*reclaim_complete)(struct nfs_client *); + int (*detect_trunking)(struct nfs_client *, struct nfs_client **, + struct rpc_cred *); }; struct nfs4_state_maintenance_ops { @@ -223,7 +225,7 @@ extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred); extern int nfs4_destroy_clientid(struct nfs_client *clp); extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); -extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc); +extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait); extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struct qstr *, struct nfs4_fs_locations *, struct page *); @@ -320,9 +322,15 @@ extern void nfs4_renew_state(struct work_struct *); /* nfs4state.c */ struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp); struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp); +int nfs4_discover_server_trunking(struct nfs_client *clp, + struct nfs_client **); +int nfs40_discover_server_trunking(struct nfs_client *clp, + struct nfs_client **, struct rpc_cred *); #if defined(CONFIG_NFS_V4_1) struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp); struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp); +int nfs41_discover_server_trunking(struct nfs_client *clp, + struct nfs_client **, struct rpc_cred *); extern void nfs4_schedule_session_recovery(struct nfs4_session *, int); #else static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) @@ -351,7 +359,7 @@ extern void nfs41_handle_server_scope(struct nfs_client *, extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); extern void nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *, - fmode_t, fl_owner_t, pid_t); + fmode_t, const struct nfs_lockowner *); extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); @@ -372,6 +380,9 @@ extern bool nfs4_disable_idmapping; extern unsigned short max_session_slots; extern unsigned short send_implementation_id; +#define NFS4_CLIENT_ID_UNIQ_LEN (64) +extern char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN]; + /* nfs4sysctl.c */ #ifdef CONFIG_SYSCTL int nfs4_register_sysctl(void); diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 24eb663..6bacfde 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -84,7 +84,7 @@ error: static void nfs4_destroy_callback(struct nfs_client *clp) { if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) - nfs_callback_down(clp->cl_mvops->minor_version); + nfs_callback_down(clp->cl_mvops->minor_version, clp->cl_net); } static void nfs4_shutdown_client(struct nfs_client *clp) @@ -185,6 +185,7 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, rpc_authflavor_t authflavour) { char buf[INET6_ADDRSTRLEN + 1]; + struct nfs_client *old; int error; if (clp->cl_cons_state == NFS_CS_READY) { @@ -230,6 +231,17 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, if (!nfs4_has_session(clp)) nfs_mark_client_ready(clp, NFS_CS_READY); + + error = nfs4_discover_server_trunking(clp, &old); + if (error < 0) + goto error; + if (clp != old) { + clp->cl_preserve_clid = true; + nfs_put_client(clp); + clp = old; + atomic_inc(&clp->cl_count); + } + return clp; error: @@ -239,6 +251,248 @@ error: return ERR_PTR(error); } +/* + * SETCLIENTID just did a callback update with the callback ident in + * "drop," but server trunking discovery claims "drop" and "keep" are + * actually the same server. Swap the callback IDs so that "keep" + * will continue to use the callback ident the server now knows about, + * and so that "keep"'s original callback ident is destroyed when + * "drop" is freed. + */ +static void nfs4_swap_callback_idents(struct nfs_client *keep, + struct nfs_client *drop) +{ + struct nfs_net *nn = net_generic(keep->cl_net, nfs_net_id); + unsigned int save = keep->cl_cb_ident; + + if (keep->cl_cb_ident == drop->cl_cb_ident) + return; + + dprintk("%s: keeping callback ident %u and dropping ident %u\n", + __func__, keep->cl_cb_ident, drop->cl_cb_ident); + + spin_lock(&nn->nfs_client_lock); + + idr_replace(&nn->cb_ident_idr, keep, drop->cl_cb_ident); + keep->cl_cb_ident = drop->cl_cb_ident; + + idr_replace(&nn->cb_ident_idr, drop, save); + drop->cl_cb_ident = save; + + spin_unlock(&nn->nfs_client_lock); +} + +/** + * nfs40_walk_client_list - Find server that recognizes a client ID + * + * @new: nfs_client with client ID to test + * @result: OUT: found nfs_client, or new + * @cred: credential to use for trunking test + * + * Returns zero, a negative errno, or a negative NFS4ERR status. + * If zero is returned, an nfs_client pointer is planted in "result." + * + * NB: nfs40_walk_client_list() relies on the new nfs_client being + * the last nfs_client on the list. + */ +int nfs40_walk_client_list(struct nfs_client *new, + struct nfs_client **result, + struct rpc_cred *cred) +{ + struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id); + struct nfs_client *pos, *n, *prev = NULL; + struct nfs4_setclientid_res clid = { + .clientid = new->cl_clientid, + .confirm = new->cl_confirm, + }; + int status; + + spin_lock(&nn->nfs_client_lock); + list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) { + /* If "pos" isn't marked ready, we can't trust the + * remaining fields in "pos" */ + if (pos->cl_cons_state < NFS_CS_READY) + continue; + + if (pos->rpc_ops != new->rpc_ops) + continue; + + if (pos->cl_proto != new->cl_proto) + continue; + + if (pos->cl_minorversion != new->cl_minorversion) + continue; + + if (pos->cl_clientid != new->cl_clientid) + continue; + + atomic_inc(&pos->cl_count); + spin_unlock(&nn->nfs_client_lock); + + if (prev) + nfs_put_client(prev); + + status = nfs4_proc_setclientid_confirm(pos, &clid, cred); + if (status == 0) { + nfs4_swap_callback_idents(pos, new); + + nfs_put_client(pos); + *result = pos; + dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", + __func__, pos, atomic_read(&pos->cl_count)); + return 0; + } + if (status != -NFS4ERR_STALE_CLIENTID) { + nfs_put_client(pos); + dprintk("NFS: <-- %s status = %d, no result\n", + __func__, status); + return status; + } + + spin_lock(&nn->nfs_client_lock); + prev = pos; + } + + /* + * No matching nfs_client found. This should be impossible, + * because the new nfs_client has already been added to + * nfs_client_list by nfs_get_client(). + * + * Don't BUG(), since the caller is holding a mutex. + */ + if (prev) + nfs_put_client(prev); + spin_unlock(&nn->nfs_client_lock); + pr_err("NFS: %s Error: no matching nfs_client found\n", __func__); + return -NFS4ERR_STALE_CLIENTID; +} + +#ifdef CONFIG_NFS_V4_1 +/* + * Returns true if the client IDs match + */ +static bool nfs4_match_clientids(struct nfs_client *a, struct nfs_client *b) +{ + if (a->cl_clientid != b->cl_clientid) { + dprintk("NFS: --> %s client ID %llx does not match %llx\n", + __func__, a->cl_clientid, b->cl_clientid); + return false; + } + dprintk("NFS: --> %s client ID %llx matches %llx\n", + __func__, a->cl_clientid, b->cl_clientid); + return true; +} + +/* + * Returns true if the server owners match + */ +static bool +nfs4_match_serverowners(struct nfs_client *a, struct nfs_client *b) +{ + struct nfs41_server_owner *o1 = a->cl_serverowner; + struct nfs41_server_owner *o2 = b->cl_serverowner; + + if (o1->minor_id != o2->minor_id) { + dprintk("NFS: --> %s server owner minor IDs do not match\n", + __func__); + return false; + } + + if (o1->major_id_sz != o2->major_id_sz) + goto out_major_mismatch; + if (memcmp(o1->major_id, o2->major_id, o1->major_id_sz) != 0) + goto out_major_mismatch; + + dprintk("NFS: --> %s server owners match\n", __func__); + return true; + +out_major_mismatch: + dprintk("NFS: --> %s server owner major IDs do not match\n", + __func__); + return false; +} + +/** + * nfs41_walk_client_list - Find nfs_client that matches a client/server owner + * + * @new: nfs_client with client ID to test + * @result: OUT: found nfs_client, or new + * @cred: credential to use for trunking test + * + * Returns zero, a negative errno, or a negative NFS4ERR status. + * If zero is returned, an nfs_client pointer is planted in "result." + * + * NB: nfs41_walk_client_list() relies on the new nfs_client being + * the last nfs_client on the list. + */ +int nfs41_walk_client_list(struct nfs_client *new, + struct nfs_client **result, + struct rpc_cred *cred) +{ + struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id); + struct nfs_client *pos, *n, *prev = NULL; + int error; + + spin_lock(&nn->nfs_client_lock); + list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) { + /* If "pos" isn't marked ready, we can't trust the + * remaining fields in "pos", especially the client + * ID and serverowner fields. Wait for CREATE_SESSION + * to finish. */ + if (pos->cl_cons_state < NFS_CS_READY) { + atomic_inc(&pos->cl_count); + spin_unlock(&nn->nfs_client_lock); + + if (prev) + nfs_put_client(prev); + prev = pos; + + error = nfs_wait_client_init_complete(pos); + if (error < 0) { + nfs_put_client(pos); + spin_lock(&nn->nfs_client_lock); + continue; + } + + spin_lock(&nn->nfs_client_lock); + } + + if (pos->rpc_ops != new->rpc_ops) + continue; + + if (pos->cl_proto != new->cl_proto) + continue; + + if (pos->cl_minorversion != new->cl_minorversion) + continue; + + if (!nfs4_match_clientids(pos, new)) + continue; + + if (!nfs4_match_serverowners(pos, new)) + continue; + + spin_unlock(&nn->nfs_client_lock); + dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", + __func__, pos, atomic_read(&pos->cl_count)); + + *result = pos; + return 0; + } + + /* + * No matching nfs_client found. This should be impossible, + * because the new nfs_client has already been added to + * nfs_client_list by nfs_get_client(). + * + * Don't BUG(), since the caller is holding a mutex. + */ + spin_unlock(&nn->nfs_client_lock); + pr_err("NFS: %s Error: no matching nfs_client found\n", __func__); + return -NFS4ERR_STALE_CLIENTID; +} +#endif /* CONFIG_NFS_V4_1 */ + static void nfs4_destroy_server(struct nfs_server *server) { nfs_server_return_all_delegations(server); diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index acb65e7..afddd66 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -95,13 +95,24 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) int ret; struct inode *inode = file->f_path.dentry->d_inode; - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - mutex_lock(&inode->i_mutex); - ret = nfs_file_fsync_commit(file, start, end, datasync); - if (!ret && !datasync) - /* application has asked for meta-data sync */ - ret = pnfs_layoutcommit_inode(inode, true); - mutex_unlock(&inode->i_mutex); + do { + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret != 0) + break; + mutex_lock(&inode->i_mutex); + ret = nfs_file_fsync_commit(file, start, end, datasync); + if (!ret && !datasync) + /* application has asked for meta-data sync */ + ret = pnfs_layoutcommit_inode(inode, true); + mutex_unlock(&inode->i_mutex); + /* + * If nfs_file_fsync_commit detected a server reboot, then + * resend all dirty pages that might have been covered by + * the NFS_CONTEXT_RESEND_WRITES flag + */ + start = 0; + end = LLONG_MAX; + } while (ret == -EAGAIN); return ret; } diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 53f94d9..52d8472 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -190,8 +190,6 @@ static int filelayout_async_handle_error(struct rpc_task *task, * i/o and all i/o waiting on the slot table to the MDS until * layout is destroyed and a new valid layout is obtained. */ - set_bit(NFS_LAYOUT_INVALID, - &NFS_I(inode)->layout->plh_flags); pnfs_destroy_layout(NFS_I(inode)); rpc_wake_up(&tbl->slot_tbl_waitq); goto reset; @@ -205,7 +203,7 @@ static int filelayout_async_handle_error(struct rpc_task *task, case -EPIPE: dprintk("%s DS connection error %d\n", __func__, task->tk_status); - filelayout_mark_devid_invalid(devid); + nfs4_mark_deviceid_unavailable(devid); clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags); _pnfs_return_layout(inode); rpc_wake_up(&tbl->slot_tbl_waitq); @@ -269,6 +267,21 @@ filelayout_set_layoutcommit(struct nfs_write_data *wdata) (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); } +bool +filelayout_test_devid_unavailable(struct nfs4_deviceid_node *node) +{ + return filelayout_test_devid_invalid(node) || + nfs4_test_deviceid_unavailable(node); +} + +static bool +filelayout_reset_to_mds(struct pnfs_layout_segment *lseg) +{ + struct nfs4_deviceid_node *node = FILELAYOUT_DEVID_NODE(lseg); + + return filelayout_test_devid_unavailable(node); +} + /* * Call ops for the async read/write cases * In the case of dense layouts, the offset needs to be reset to its @@ -453,7 +466,7 @@ static void filelayout_commit_release(void *calldata) struct nfs_commit_data *data = calldata; data->completion_ops->completion(data); - put_lseg(data->lseg); + pnfs_put_lseg(data->lseg); nfs_put_client(data->ds_clp); nfs_commitdata_release(data); } @@ -608,13 +621,13 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld, NFS_SERVER(lo->plh_inode)->nfs_client, id); if (d == NULL) { - dsaddr = get_device_info(lo->plh_inode, id, gfp_flags); + dsaddr = filelayout_get_device_info(lo->plh_inode, id, gfp_flags); if (dsaddr == NULL) goto out; } else dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node); - /* Found deviceid is being reaped */ - if (test_bit(NFS_DEVICEID_INVALID, &dsaddr->id_node.flags)) + /* Found deviceid is unavailable */ + if (filelayout_test_devid_unavailable(&dsaddr->id_node)) goto out_put; fl->dsaddr = dsaddr; @@ -931,7 +944,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq); status = filelayout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS); if (status < 0) { - put_lseg(pgio->pg_lseg); + pnfs_put_lseg(pgio->pg_lseg); pgio->pg_lseg = NULL; goto out_mds; } @@ -985,7 +998,7 @@ filelayout_clear_request_commit(struct nfs_page *req, out: nfs_request_remove_commit_list(req, cinfo); spin_unlock(cinfo->lock); - put_lseg(freeme); + pnfs_put_lseg(freeme); } static struct list_head * @@ -1018,7 +1031,7 @@ filelayout_choose_commit_list(struct nfs_page *req, * off due to a rewrite, in which case it will be done in * filelayout_clear_request_commit */ - buckets[i].wlseg = get_lseg(lseg); + buckets[i].wlseg = pnfs_get_lseg(lseg); } set_bit(PG_COMMIT_TO_DS, &req->wb_flags); cinfo->ds->nwritten++; @@ -1128,7 +1141,7 @@ filelayout_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, if (list_empty(src)) bucket->wlseg = NULL; else - get_lseg(bucket->clseg); + pnfs_get_lseg(bucket->clseg); } return ret; } @@ -1159,12 +1172,12 @@ static void filelayout_recover_commit_reqs(struct list_head *dst, /* NOTE cinfo->lock is NOT held, relying on fact that this is * only called on single thread per dreq. - * Can't take the lock because need to do put_lseg + * Can't take the lock because need to do pnfs_put_lseg */ for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { if (transfer_commit_list(&b->written, dst, cinfo, 0)) { BUG_ON(!list_empty(&b->written)); - put_lseg(b->wlseg); + pnfs_put_lseg(b->wlseg); b->wlseg = NULL; } } @@ -1200,7 +1213,7 @@ alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list) if (list_empty(&bucket->committing)) continue; nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo); - put_lseg(bucket->clseg); + pnfs_put_lseg(bucket->clseg); bucket->clseg = NULL; } /* Caller will clean up entries put on list */ diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h index 43fe802..dca47d78 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/nfs4filelayout.h @@ -129,23 +129,13 @@ filelayout_mark_devid_invalid(struct nfs4_deviceid_node *node) } static inline bool -filelayout_test_layout_invalid(struct pnfs_layout_hdr *lo) -{ - return test_bit(NFS_LAYOUT_INVALID, &lo->plh_flags); -} - -static inline bool filelayout_test_devid_invalid(struct nfs4_deviceid_node *node) { return test_bit(NFS_DEVICEID_INVALID, &node->flags); } -static inline bool -filelayout_reset_to_mds(struct pnfs_layout_segment *lseg) -{ - return filelayout_test_devid_invalid(FILELAYOUT_DEVID_NODE(lseg)) || - filelayout_test_layout_invalid(lseg->pls_layout); -} +extern bool +filelayout_test_devid_unavailable(struct nfs4_deviceid_node *node); extern struct nfs_fh * nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j); @@ -158,7 +148,7 @@ struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); struct nfs4_file_layout_dsaddr * -get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags); +filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags); void nfs4_ds_disconnect(struct nfs_client *clp); #endif /* FS_NFS_NFS4FILELAYOUT_H */ diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index f81231f..3336d5e 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -690,7 +690,7 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_fl * of available devices, and return it. */ struct nfs4_file_layout_dsaddr * -get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags) +filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags) { struct pnfs_device *pdev = NULL; u32 max_resp_sz; @@ -804,13 +804,14 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx]; struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg); - if (filelayout_test_devid_invalid(devid)) + if (filelayout_test_devid_unavailable(devid)) return NULL; if (ds == NULL) { printk(KERN_ERR "NFS: %s: No data server for offset index %d\n", __func__, ds_idx); - goto mark_dev_invalid; + filelayout_mark_devid_invalid(devid); + return NULL; } if (!ds->ds_clp) { @@ -818,14 +819,12 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) int err; err = nfs4_ds_connect(s, ds); - if (err) - goto mark_dev_invalid; + if (err) { + nfs4_mark_deviceid_unavailable(devid); + return NULL; + } } return ds; - -mark_dev_invalid: - filelayout_mark_devid_invalid(devid); - return NULL; } module_param(dataserver_retrans, uint, 0644); diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 017b4b0..79fbb61 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -192,25 +192,13 @@ out: struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *clnt, struct inode *inode, struct qstr *name) { - struct rpc_clnt *clone; - struct rpc_auth *auth; rpc_authflavor_t flavor; flavor = nfs4_negotiate_security(inode, name); if ((int)flavor < 0) - return ERR_PTR(flavor); + return ERR_PTR((int)flavor); - clone = rpc_clone_client(clnt); - if (IS_ERR(clone)) - return clone; - - auth = rpcauth_create(flavor, clone); - if (!auth) { - rpc_shutdown_client(clone); - clone = ERR_PTR(-EIO); - } - - return clone; + return rpc_clone_client_set_auth(clnt, flavor); } static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6352741..68b21d8 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -104,6 +104,8 @@ static int nfs4_map_errors(int err) return -EACCES; case -NFS4ERR_MINOR_VERS_MISMATCH: return -EPROTONOSUPPORT; + case -NFS4ERR_ACCESS: + return -EACCES; default: dprintk("%s could not handle NFSv4 error %d\n", __func__, -err); @@ -150,6 +152,12 @@ static const u32 nfs4_pnfs_open_bitmap[3] = { FATTR4_WORD2_MDSTHRESHOLD }; +static const u32 nfs4_open_noattr_bitmap[3] = { + FATTR4_WORD0_TYPE + | FATTR4_WORD0_CHANGE + | FATTR4_WORD0_FILEID, +}; + const u32 nfs4_statfs_bitmap[2] = { FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE @@ -832,6 +840,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p) p->o_res.seqid = p->o_arg.seqid; p->c_res.seqid = p->c_arg.seqid; p->o_res.server = p->o_arg.server; + p->o_res.access_request = p->o_arg.access; nfs_fattr_init(&p->f_attr); nfs_fattr_init_names(&p->f_attr, &p->owner_name, &p->group_name); } @@ -860,6 +869,14 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, p->o_arg.fh = NFS_FH(dir); p->o_arg.open_flags = flags; p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE); + /* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS + * will return permission denied for all bits until close */ + if (!(flags & O_EXCL)) { + /* ask server to check for all possible rights as results + * are cached */ + p->o_arg.access = NFS4_ACCESS_READ | NFS4_ACCESS_MODIFY | + NFS4_ACCESS_EXTEND | NFS4_ACCESS_EXECUTE; + } p->o_arg.clientid = server->nfs_client->cl_clientid; p->o_arg.id.create_time = ktime_to_ns(sp->so_seqid.create_time); p->o_arg.id.uniquifier = sp->so_seqid.owner_id; @@ -1115,11 +1132,80 @@ out_return_state: return state; } -static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) +static void +nfs4_opendata_check_deleg(struct nfs4_opendata *data, struct nfs4_state *state) +{ + struct nfs_client *clp = NFS_SERVER(state->inode)->nfs_client; + struct nfs_delegation *delegation; + int delegation_flags = 0; + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(state->inode)->delegation); + if (delegation) + delegation_flags = delegation->flags; + rcu_read_unlock(); + if (data->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR) { + pr_err_ratelimited("NFS: Broken NFSv4 server %s is " + "returning a delegation for " + "OPEN(CLAIM_DELEGATE_CUR)\n", + clp->cl_hostname); + } else if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0) + nfs_inode_set_delegation(state->inode, + data->owner->so_cred, + &data->o_res); + else + nfs_inode_reclaim_delegation(state->inode, + data->owner->so_cred, + &data->o_res); +} + +/* + * Check the inode attributes against the CLAIM_PREVIOUS returned attributes + * and update the nfs4_state. + */ +static struct nfs4_state * +_nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data) +{ + struct inode *inode = data->state->inode; + struct nfs4_state *state = data->state; + int ret; + + if (!data->rpc_done) { + ret = data->rpc_status; + goto err; + } + + ret = -ESTALE; + if (!(data->f_attr.valid & NFS_ATTR_FATTR_TYPE) || + !(data->f_attr.valid & NFS_ATTR_FATTR_FILEID) || + !(data->f_attr.valid & NFS_ATTR_FATTR_CHANGE)) + goto err; + + ret = -ENOMEM; + state = nfs4_get_open_state(inode, data->owner); + if (state == NULL) + goto err; + + ret = nfs_refresh_inode(inode, &data->f_attr); + if (ret) + goto err; + + if (data->o_res.delegation_type != 0) + nfs4_opendata_check_deleg(data, state); + update_open_stateid(state, &data->o_res.stateid, NULL, + data->o_arg.fmode); + + return state; +err: + return ERR_PTR(ret); + +} + +static struct nfs4_state * +_nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) { struct inode *inode; struct nfs4_state *state = NULL; - struct nfs_delegation *delegation; int ret; if (!data->rpc_done) { @@ -1138,30 +1224,8 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data state = nfs4_get_open_state(inode, data->owner); if (state == NULL) goto err_put_inode; - if (data->o_res.delegation_type != 0) { - struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; - int delegation_flags = 0; - - rcu_read_lock(); - delegation = rcu_dereference(NFS_I(inode)->delegation); - if (delegation) - delegation_flags = delegation->flags; - rcu_read_unlock(); - if (data->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR) { - pr_err_ratelimited("NFS: Broken NFSv4 server %s is " - "returning a delegation for " - "OPEN(CLAIM_DELEGATE_CUR)\n", - clp->cl_hostname); - } else if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0) - nfs_inode_set_delegation(state->inode, - data->owner->so_cred, - &data->o_res); - else - nfs_inode_reclaim_delegation(state->inode, - data->owner->so_cred, - &data->o_res); - } - + if (data->o_res.delegation_type != 0) + nfs4_opendata_check_deleg(data, state); update_open_stateid(state, &data->o_res.stateid, NULL, data->o_arg.fmode); iput(inode); @@ -1173,6 +1237,14 @@ err: return ERR_PTR(ret); } +static struct nfs4_state * +nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) +{ + if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) + return _nfs4_opendata_reclaim_to_nfs4_state(data); + return _nfs4_opendata_to_nfs4_state(data); +} + static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state) { struct nfs_inode *nfsi = NFS_I(state->inode); @@ -1494,6 +1566,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid; if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) { task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; + data->o_arg.open_bitmap = &nfs4_open_noattr_bitmap[0]; nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); } data->timestamp = jiffies; @@ -1526,7 +1599,8 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata) return; if (task->tk_status == 0) { - switch (data->o_res.f_attr->mode & S_IFMT) { + if (data->o_res.f_attr->valid & NFS_ATTR_FATTR_TYPE) { + switch (data->o_res.f_attr->mode & S_IFMT) { case S_IFREG: break; case S_IFLNK: @@ -1537,6 +1611,7 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata) break; default: data->rpc_status = -ENOTDIR; + } } renew_lease(data->o_res.server, data->timestamp); if (!(data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM)) @@ -1643,6 +1718,39 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data) return status; } +static int nfs4_opendata_access(struct rpc_cred *cred, + struct nfs4_opendata *opendata, + struct nfs4_state *state, fmode_t fmode) +{ + struct nfs_access_entry cache; + u32 mask; + + /* access call failed or for some reason the server doesn't + * support any access modes -- defer access call until later */ + if (opendata->o_res.access_supported == 0) + return 0; + + mask = 0; + /* don't check MAY_WRITE - a newly created file may not have + * write mode bits, but POSIX allows the creating process to write */ + if (fmode & FMODE_READ) + mask |= MAY_READ; + if (fmode & FMODE_EXEC) + mask |= MAY_EXEC; + + cache.cred = cred; + cache.jiffies = jiffies; + nfs_access_set_mask(&cache, opendata->o_res.access_result); + nfs_access_add_cache(state->inode, &cache); + + if ((mask & ~cache.mask & (MAY_READ | MAY_EXEC)) == 0) + return 0; + + /* even though OPEN succeeded, access is denied. Close the file */ + nfs4_close_state(state, fmode); + return -NFS4ERR_ACCESS; +} + /* * Note: On error, nfs4_proc_open will free the struct nfs4_opendata */ @@ -1774,7 +1882,11 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state) * informs us the stateid is unrecognized. */ if (status != -NFS4ERR_BAD_STATEID) nfs41_free_stateid(server, stateid); + nfs_remove_bad_delegation(state->inode); + write_seqlock(&state->seqlock); + nfs4_stateid_copy(&state->stateid, &state->open_stateid); + write_sequnlock(&state->seqlock); clear_bit(NFS_DELEGATED_STATE, &state->flags); } } @@ -1790,7 +1902,7 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state) static int nfs41_check_open_stateid(struct nfs4_state *state) { struct nfs_server *server = NFS_SERVER(state->inode); - nfs4_stateid *stateid = &state->stateid; + nfs4_stateid *stateid = &state->open_stateid; int status; /* If a state reset has been done, test_stateid is unneeded */ @@ -1896,6 +2008,10 @@ static int _nfs4_do_open(struct inode *dir, if (server->caps & NFS_CAP_POSIX_LOCK) set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); + status = nfs4_opendata_access(cred, opendata, state, fmode); + if (status != 0) + goto err_opendata_put; + if (opendata->o_arg.open_flags & O_EXCL) { nfs4_exclusive_attrset(opendata, sattr); @@ -1941,7 +2057,7 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct nfs4_state *res; int status; - fmode &= FMODE_READ|FMODE_WRITE; + fmode &= FMODE_READ|FMODE_WRITE|FMODE_EXEC; do { status = _nfs4_do_open(dir, dentry, fmode, flags, sattr, cred, &res, ctx_th); @@ -2013,8 +2129,12 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, nfs_fattr_init(fattr); if (state != NULL) { + struct nfs_lockowner lockowner = { + .l_owner = current->files, + .l_pid = current->tgid, + }; nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE, - current->files, current->tgid); + &lockowner); } else if (nfs4_copy_delegation_stateid(&arg.stateid, inode, FMODE_WRITE)) { /* Use that stateid */ @@ -2133,6 +2253,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) { struct nfs4_closedata *calldata = data; struct nfs4_state *state = calldata->state; + struct inode *inode = calldata->inode; int call_close = 0; dprintk("%s: begin!\n", __func__); @@ -2166,16 +2287,13 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) if (calldata->arg.fmode == 0) { task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; if (calldata->roc && - pnfs_roc_drain(calldata->inode, &calldata->roc_barrier)) { - rpc_sleep_on(&NFS_SERVER(calldata->inode)->roc_rpcwaitq, - task, NULL); + pnfs_roc_drain(inode, &calldata->roc_barrier, task)) goto out; - } } nfs_fattr_init(calldata->res.fattr); calldata->timestamp = jiffies; - if (nfs4_setup_sequence(NFS_SERVER(calldata->inode), + if (nfs4_setup_sequence(NFS_SERVER(inode), &calldata->arg.seq_args, &calldata->res.seq_res, task)) @@ -2202,7 +2320,7 @@ static const struct rpc_call_ops nfs4_close_ops = { * * NOTE: Caller must be holding the sp->so_owner semaphore! */ -int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc) +int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait) { struct nfs_server *server = NFS_SERVER(state->inode); struct nfs4_closedata *calldata; @@ -2238,7 +2356,7 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc) calldata->res.fattr = &calldata->fattr; calldata->res.seqid = calldata->arg.seqid; calldata->res.server = server; - calldata->roc = roc; + calldata->roc = pnfs_roc(state->inode); nfs_sb_active(calldata->inode->i_sb); msg.rpc_argp = &calldata->arg; @@ -2255,8 +2373,6 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc) out_free_calldata: kfree(calldata); out: - if (roc) - pnfs_roc_release(state->inode); nfs4_put_open_state(state); nfs4_put_state_owner(sp); return status; @@ -2399,7 +2515,7 @@ static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandl int ret; auth = rpcauth_create(flavor, server->client); - if (!auth) { + if (IS_ERR(auth)) { ret = -EIO; goto out; } @@ -2767,13 +2883,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (!status) { - entry->mask = 0; - if (res.access & NFS4_ACCESS_READ) - entry->mask |= MAY_READ; - if (res.access & (NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE)) - entry->mask |= MAY_WRITE; - if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE)) - entry->mask |= MAY_EXEC; + nfs_access_set_mask(entry, res.access); nfs_refresh_inode(inode, res.fattr); } nfs_free_fattr(res.fattr); @@ -3215,11 +3325,11 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, dentry->d_parent->d_name.name, dentry->d_name.name, (unsigned long long)cookie); - nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args); + nfs4_setup_readdir(cookie, NFS_I(dir)->cookieverf, dentry, &args); res.pgbase = args.pgbase; status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0); if (status >= 0) { - memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE); + memcpy(NFS_I(dir)->cookieverf, res.verifier.data, NFS4_VERIFIER_SIZE); status += args.pgbase; } @@ -3362,8 +3472,11 @@ static int nfs4_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, s nfs_fattr_init(fsinfo->fattr); error = nfs4_do_fsinfo(server, fhandle, fsinfo); - if (error == 0) + if (error == 0) { + /* block layout checks this! */ + server->pnfs_blksize = fsinfo->blksize; set_pnfs_layoutdriver(server, fhandle, fsinfo->layouttype); + } return error; } @@ -3653,11 +3766,11 @@ static inline int nfs4_server_supports_acls(struct nfs_server *server) && (server->acl_bitmask & ACL4_SUPPORT_DENY_ACL); } -/* Assuming that XATTR_SIZE_MAX is a multiple of PAGE_CACHE_SIZE, and that - * it's OK to put sizeof(void) * (XATTR_SIZE_MAX/PAGE_CACHE_SIZE) bytes on +/* Assuming that XATTR_SIZE_MAX is a multiple of PAGE_SIZE, and that + * it's OK to put sizeof(void) * (XATTR_SIZE_MAX/PAGE_SIZE) bytes on * the stack. */ -#define NFS4ACL_MAXPAGES (XATTR_SIZE_MAX >> PAGE_CACHE_SHIFT) +#define NFS4ACL_MAXPAGES DIV_ROUND_UP(XATTR_SIZE_MAX, PAGE_SIZE) static int buf_to_pages_noslab(const void *buf, size_t buflen, struct page **pages, unsigned int *pgbase) @@ -3668,7 +3781,7 @@ static int buf_to_pages_noslab(const void *buf, size_t buflen, spages = pages; do { - len = min_t(size_t, PAGE_CACHE_SIZE, buflen); + len = min_t(size_t, PAGE_SIZE, buflen); newpage = alloc_page(GFP_KERNEL); if (newpage == NULL) @@ -3739,7 +3852,7 @@ static void nfs4_write_cached_acl(struct inode *inode, struct page **pages, size struct nfs4_cached_acl *acl; size_t buflen = sizeof(*acl) + acl_len; - if (pages && buflen <= PAGE_SIZE) { + if (buflen <= PAGE_SIZE) { acl = kmalloc(buflen, GFP_KERNEL); if (acl == NULL) goto out; @@ -3782,17 +3895,15 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu .rpc_argp = &args, .rpc_resp = &res, }; - int ret = -ENOMEM, npages, i; - size_t acl_len = 0; + unsigned int npages = DIV_ROUND_UP(buflen, PAGE_SIZE); + int ret = -ENOMEM, i; - npages = (buflen + PAGE_SIZE - 1) >> PAGE_SHIFT; /* As long as we're doing a round trip to the server anyway, * let's be prepared for a page of acl data. */ if (npages == 0) npages = 1; - - /* Add an extra page to handle the bitmap returned */ - npages++; + if (npages > ARRAY_SIZE(pages)) + return -ERANGE; for (i = 0; i < npages; i++) { pages[i] = alloc_page(GFP_KERNEL); @@ -3808,11 +3919,6 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu args.acl_len = npages * PAGE_SIZE; args.acl_pgbase = 0; - /* Let decode_getfacl know not to fail if the ACL data is larger than - * the page we send as a guess */ - if (buf == NULL) - res.acl_flags |= NFS4_ACL_LEN_REQUEST; - dprintk("%s buf %p buflen %zu npages %d args.acl_len %zu\n", __func__, buf, buflen, npages, args.acl_len); ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), @@ -3820,20 +3926,19 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu if (ret) goto out_free; - acl_len = res.acl_len; - if (acl_len > args.acl_len) - nfs4_write_cached_acl(inode, NULL, 0, acl_len); - else - nfs4_write_cached_acl(inode, pages, res.acl_data_offset, - acl_len); - if (buf) { + /* Handle the case where the passed-in buffer is too short */ + if (res.acl_flags & NFS4_ACL_TRUNC) { + /* Did the user only issue a request for the acl length? */ + if (buf == NULL) + goto out_ok; ret = -ERANGE; - if (acl_len > buflen) - goto out_free; - _copy_from_pages(buf, pages, res.acl_data_offset, - acl_len); + goto out_free; } - ret = acl_len; + nfs4_write_cached_acl(inode, pages, res.acl_data_offset, res.acl_len); + if (buf) + _copy_from_pages(buf, pages, res.acl_data_offset, res.acl_len); +out_ok: + ret = res.acl_len; out_free: for (i = 0; i < npages; i++) if (pages[i]) @@ -3891,10 +3996,13 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl .rpc_argp = &arg, .rpc_resp = &res, }; + unsigned int npages = DIV_ROUND_UP(buflen, PAGE_SIZE); int ret, i; if (!nfs4_server_supports_acls(server)) return -EOPNOTSUPP; + if (npages > ARRAY_SIZE(pages)) + return -ERANGE; i = buf_to_pages_noslab(buf, buflen, arg.acl_pages, &arg.acl_pgbase); if (i < 0) return i; @@ -4012,6 +4120,36 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp, memcpy(bootverf->data, verf, sizeof(bootverf->data)); } +static unsigned int +nfs4_init_nonuniform_client_string(const struct nfs_client *clp, + char *buf, size_t len) +{ + unsigned int result; + + rcu_read_lock(); + result = scnprintf(buf, len, "Linux NFSv4.0 %s/%s %s", + clp->cl_ipaddr, + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR), + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_PROTO)); + rcu_read_unlock(); + return result; +} + +static unsigned int +nfs4_init_uniform_client_string(const struct nfs_client *clp, + char *buf, size_t len) +{ + char *nodename = clp->cl_rpcclient->cl_nodename; + + if (nfs4_client_id_uniquifier[0] != '\0') + nodename = nfs4_client_id_uniquifier; + return scnprintf(buf, len, "Linux NFSv%u.%u %s", + clp->rpc_ops->version, clp->cl_minorversion, + nodename); +} + /** * nfs4_proc_setclientid - Negotiate client ID * @clp: state data structure @@ -4042,15 +4180,18 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, /* nfs_client_id4 */ nfs4_init_boot_verifier(clp, &sc_verifier); - rcu_read_lock(); - setclientid.sc_name_len = scnprintf(setclientid.sc_name, - sizeof(setclientid.sc_name), "%s/%s %s", - clp->cl_ipaddr, - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR), - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_PROTO)); + if (test_bit(NFS_CS_MIGRATION, &clp->cl_flags)) + setclientid.sc_name_len = + nfs4_init_uniform_client_string(clp, + setclientid.sc_name, + sizeof(setclientid.sc_name)); + else + setclientid.sc_name_len = + nfs4_init_nonuniform_client_string(clp, + setclientid.sc_name, + sizeof(setclientid.sc_name)); /* cb_client4 */ + rcu_read_lock(); setclientid.sc_netid_len = scnprintf(setclientid.sc_netid, sizeof(setclientid.sc_netid), rpc_peeraddr2str(clp->cl_rpcclient, @@ -4396,7 +4537,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) return; - if ((calldata->lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0) { + if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) { /* Note: exit _without_ running nfs4_locku_done */ task->tk_action = NULL; return; @@ -4590,7 +4731,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) } if (data->rpc_status == 0) { nfs4_stateid_copy(&data->lsp->ls_stateid, &data->res.stateid); - data->lsp->ls_flags |= NFS_LOCK_INITIALIZED; + set_bit(NFS_LOCK_INITIALIZED, &data->lsp->ls_flags); renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), data->timestamp); } out: @@ -4637,7 +4778,7 @@ static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_ case -NFS4ERR_BAD_STATEID: lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; if (new_lock_owner != 0 || - (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) + test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) nfs4_schedule_stateid_recovery(server, lsp->ls_state); break; case -NFS4ERR_STALE_STATEID: @@ -4761,7 +4902,7 @@ static int nfs41_check_expired_locks(struct nfs4_state *state) struct nfs_server *server = NFS_SERVER(state->inode); list_for_each_entry(lsp, &state->lock_states, ls_locks) { - if (lsp->ls_flags & NFS_LOCK_INITIALIZED) { + if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { status = nfs41_test_stateid(server, &lsp->ls_stateid); if (status != NFS_OK) { /* Free the stateid unless the server @@ -4769,7 +4910,7 @@ static int nfs41_check_expired_locks(struct nfs4_state *state) if (status != -NFS4ERR_BAD_STATEID) nfs41_free_stateid(server, &lsp->ls_stateid); - lsp->ls_flags &= ~NFS_LOCK_INITIALIZED; + clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags); ret = status; } } @@ -5272,10 +5413,8 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) }; nfs4_init_boot_verifier(clp, &verifier); - args.id_len = scnprintf(args.id, sizeof(args.id), - "%s/%s", - clp->cl_ipaddr, - clp->cl_rpcclient->cl_nodename); + args.id_len = nfs4_init_uniform_client_string(clp, args.id, + sizeof(args.id)); dprintk("NFS call exchange_id auth=%s, '%.*s'\n", clp->cl_rpcclient->cl_auth->au_ops->au_name, args.id_len, args.id); @@ -5396,6 +5535,8 @@ int nfs4_destroy_clientid(struct nfs_client *clp) goto out; if (clp->cl_exchange_flags == 0) goto out; + if (clp->cl_preserve_clid) + goto out; cred = nfs4_get_exchange_id_cred(clp); ret = nfs4_proc_destroy_clientid(clp, cred); if (cred) @@ -6201,26 +6342,44 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) { struct nfs4_layoutget *lgp = calldata; - struct nfs_server *server = NFS_SERVER(lgp->args.inode); + struct inode *inode = lgp->args.inode; + struct nfs_server *server = NFS_SERVER(inode); + struct pnfs_layout_hdr *lo; + struct nfs4_state *state = NULL; dprintk("--> %s\n", __func__); if (!nfs4_sequence_done(task, &lgp->res.seq_res)) - return; + goto out; switch (task->tk_status) { case 0: - break; + goto out; case -NFS4ERR_LAYOUTTRYLATER: case -NFS4ERR_RECALLCONFLICT: task->tk_status = -NFS4ERR_DELAY; - /* Fall through */ - default: - if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { - rpc_restart_call_prepare(task); - return; + break; + case -NFS4ERR_EXPIRED: + case -NFS4ERR_BAD_STATEID: + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (!lo || list_empty(&lo->plh_segs)) { + spin_unlock(&inode->i_lock); + /* If the open stateid was bad, then recover it. */ + state = lgp->args.ctx->state; + } else { + LIST_HEAD(head); + + pnfs_mark_matching_lsegs_invalid(lo, &head, NULL); + spin_unlock(&inode->i_lock); + /* Mark the bad layout state as invalid, then + * retry using the open stateid. */ + pnfs_free_lseg_list(&head); } } + if (nfs4_async_handle_error(task, server, state) == -EAGAIN) + rpc_restart_call_prepare(task); +out: dprintk("<-- %s\n", __func__); } @@ -6287,7 +6446,8 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = { .rpc_release = nfs4_layoutget_release, }; -void nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) +struct pnfs_layout_segment * +nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) { struct nfs_server *server = NFS_SERVER(lgp->args.inode); size_t max_pages = max_response_pages(server); @@ -6304,6 +6464,7 @@ void nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) .callback_data = lgp, .flags = RPC_TASK_ASYNC, }; + struct pnfs_layout_segment *lseg = NULL; int status = 0; dprintk("--> %s\n", __func__); @@ -6311,7 +6472,7 @@ void nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags); if (!lgp->args.layout.pages) { nfs4_layoutget_release(lgp); - return; + return ERR_PTR(-ENOMEM); } lgp->args.layout.pglen = max_pages * PAGE_SIZE; @@ -6320,15 +6481,17 @@ void nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) nfs41_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0); task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) - return; + return ERR_CAST(task); status = nfs4_wait_for_completion_rpc_task(task); if (status == 0) status = task->tk_status; if (status == 0) - status = pnfs_layout_process(lgp); + lseg = pnfs_layout_process(lgp); rpc_put_task(task); dprintk("<-- %s status=%d\n", __func__, status); - return; + if (status) + return ERR_PTR(status); + return lseg; } static void @@ -6347,7 +6510,6 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) { struct nfs4_layoutreturn *lrp = calldata; struct nfs_server *server; - struct pnfs_layout_hdr *lo = lrp->args.layout; dprintk("--> %s\n", __func__); @@ -6359,20 +6521,21 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) rpc_restart_call_prepare(task); return; } - spin_lock(&lo->plh_inode->i_lock); - if (task->tk_status == 0 && lrp->res.lrs_present) - pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); - lo->plh_block_lgets--; - spin_unlock(&lo->plh_inode->i_lock); dprintk("<-- %s\n", __func__); } static void nfs4_layoutreturn_release(void *calldata) { struct nfs4_layoutreturn *lrp = calldata; + struct pnfs_layout_hdr *lo = lrp->args.layout; dprintk("--> %s\n", __func__); - put_layout_hdr(lrp->args.layout); + spin_lock(&lo->plh_inode->i_lock); + if (lrp->res.lrs_present) + pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); + lo->plh_block_lgets--; + spin_unlock(&lo->plh_inode->i_lock); + pnfs_put_layout_hdr(lrp->args.layout); kfree(calldata); dprintk("<-- %s\n", __func__); } @@ -6546,7 +6709,7 @@ static void nfs4_layoutcommit_release(void *calldata) list_del_init(&lseg->pls_lc_list); if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) - put_lseg(lseg); + pnfs_put_lseg(lseg); } clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); @@ -6805,6 +6968,7 @@ static const struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { .recover_lock = nfs4_lock_reclaim, .establish_clid = nfs4_init_clientid, .get_clid_cred = nfs4_get_setclientid_cred, + .detect_trunking = nfs40_discover_server_trunking, }; #if defined(CONFIG_NFS_V4_1) @@ -6816,6 +6980,7 @@ static const struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = { .establish_clid = nfs41_init_clientid, .get_clid_cred = nfs4_get_exchange_id_cred, .reclaim_complete = nfs41_proc_reclaim_complete, + .detect_trunking = nfs41_discover_server_trunking, }; #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index 6930bec..1720d32 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -117,8 +117,7 @@ nfs4_schedule_state_renewal(struct nfs_client *clp) timeout = 5 * HZ; dprintk("%s: requeueing work. Lease period = %ld\n", __func__, (timeout + HZ - 1) / HZ); - cancel_delayed_work(&clp->cl_renewd); - schedule_delayed_work(&clp->cl_renewd, timeout); + mod_delayed_work(system_wq, &clp->cl_renewd, timeout); set_bit(NFS_CS_RENEWD, &clp->cl_res_state); spin_unlock(&clp->cl_lock); } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 55148de..c351e6b 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -51,18 +51,21 @@ #include <linux/bitops.h> #include <linux/jiffies.h> +#include <linux/sunrpc/clnt.h> + #include "nfs4_fs.h" #include "callback.h" #include "delegation.h" #include "internal.h" #include "pnfs.h" +#include "netns.h" #define NFSDBG_FACILITY NFSDBG_STATE #define OPENOWNER_POOL_SIZE 8 const nfs4_stateid zero_stateid; - +static DEFINE_MUTEX(nfs_clid_init_mutex); static LIST_HEAD(nfs4_clientid_list); int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) @@ -73,12 +76,13 @@ int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) }; unsigned short port; int status; + struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state)) goto do_confirm; - port = nfs_callback_tcpport; + port = nn->nfs_callback_tcpport; if (clp->cl_addr.ss_family == AF_INET6) - port = nfs_callback_tcpport6; + port = nn->nfs_callback_tcpport6; status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid); if (status != 0) @@ -96,6 +100,56 @@ out: return status; } +/** + * nfs40_discover_server_trunking - Detect server IP address trunking (mv0) + * + * @clp: nfs_client under test + * @result: OUT: found nfs_client, or clp + * @cred: credential to use for trunking test + * + * Returns zero, a negative errno, or a negative NFS4ERR status. + * If zero is returned, an nfs_client pointer is planted in + * "result". + * + * Note: The returned client may not yet be marked ready. + */ +int nfs40_discover_server_trunking(struct nfs_client *clp, + struct nfs_client **result, + struct rpc_cred *cred) +{ + struct nfs4_setclientid_res clid = { + .clientid = clp->cl_clientid, + .confirm = clp->cl_confirm, + }; + struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); + unsigned short port; + int status; + + port = nn->nfs_callback_tcpport; + if (clp->cl_addr.ss_family == AF_INET6) + port = nn->nfs_callback_tcpport6; + + status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid); + if (status != 0) + goto out; + clp->cl_clientid = clid.clientid; + clp->cl_confirm = clid.confirm; + + status = nfs40_walk_client_list(clp, result, cred); + switch (status) { + case -NFS4ERR_STALE_CLIENTID: + set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + case 0: + /* Sustain the lease, even if it's empty. If the clientid4 + * goes stale it's of no use for trunking discovery. */ + nfs4_schedule_state_renewal(*result); + break; + } + +out: + return status; +} + struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp) { struct rpc_cred *cred = NULL; @@ -275,6 +329,33 @@ out: return status; } +/** + * nfs41_discover_server_trunking - Detect server IP address trunking (mv1) + * + * @clp: nfs_client under test + * @result: OUT: found nfs_client, or clp + * @cred: credential to use for trunking test + * + * Returns NFS4_OK, a negative errno, or a negative NFS4ERR status. + * If NFS4_OK is returned, an nfs_client pointer is planted in + * "result". + * + * Note: The returned client may not yet be marked ready. + */ +int nfs41_discover_server_trunking(struct nfs_client *clp, + struct nfs_client **result, + struct rpc_cred *cred) +{ + int status; + + status = nfs4_proc_exchange_id(clp, cred); + if (status != NFS4_OK) + return status; + set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + + return nfs41_walk_client_list(clp, result, cred); +} + struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp) { struct rpc_cred *cred; @@ -729,11 +810,8 @@ static void __nfs4_close(struct nfs4_state *state, if (!call_close) { nfs4_put_open_state(state); nfs4_put_state_owner(owner); - } else { - bool roc = pnfs_roc(state->inode); - - nfs4_do_close(state, gfp_mask, wait, roc); - } + } else + nfs4_do_close(state, gfp_mask, wait); } void nfs4_close_state(struct nfs4_state *state, fmode_t fmode) @@ -865,7 +943,7 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp) if (list_empty(&state->lock_states)) clear_bit(LK_STATE_IN_USE, &state->flags); spin_unlock(&state->state_lock); - if (lsp->ls_flags & NFS_LOCK_INITIALIZED) { + if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { if (nfs4_release_lockowner(lsp) == 0) return; } @@ -911,17 +989,25 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl) } static bool nfs4_copy_lock_stateid(nfs4_stateid *dst, struct nfs4_state *state, - fl_owner_t fl_owner, pid_t fl_pid) + const struct nfs_lockowner *lockowner) { struct nfs4_lock_state *lsp; + fl_owner_t fl_owner; + pid_t fl_pid; bool ret = false; + + if (lockowner == NULL) + goto out; + if (test_bit(LK_STATE_IN_USE, &state->flags) == 0) goto out; + fl_owner = lockowner->l_owner; + fl_pid = lockowner->l_pid; spin_lock(&state->state_lock); lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE); - if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) { + if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) { nfs4_stateid_copy(dst, &lsp->ls_stateid); ret = true; } @@ -946,11 +1032,11 @@ static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) * requests. */ void nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state, - fmode_t fmode, fl_owner_t fl_owner, pid_t fl_pid) + fmode_t fmode, const struct nfs_lockowner *lockowner) { if (nfs4_copy_delegation_stateid(dst, state->inode, fmode)) return; - if (nfs4_copy_lock_stateid(dst, state, fl_owner, fl_pid)) + if (nfs4_copy_lock_stateid(dst, state, lockowner)) return; nfs4_copy_open_stateid(dst, state); } @@ -1289,7 +1375,7 @@ restart: if (status >= 0) { spin_lock(&state->state_lock); list_for_each_entry(lock, &state->lock_states, ls_locks) { - if (!(lock->ls_flags & NFS_LOCK_INITIALIZED)) + if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags)) pr_warn_ratelimited("NFS: " "%s: Lock reclaim " "failed!\n", __func__); @@ -1361,7 +1447,7 @@ static void nfs4_clear_open_state(struct nfs4_state *state) spin_lock(&state->state_lock); list_for_each_entry(lock, &state->lock_states, ls_locks) { lock->ls_seqid.flags = 0; - lock->ls_flags &= ~NFS_LOCK_INITIALIZED; + clear_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags); } spin_unlock(&state->state_lock); } @@ -1595,8 +1681,8 @@ out: return nfs4_recovery_handle_error(clp, status); } -/* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors - * on EXCHANGE_ID for v4.1 +/* Set NFS4CLNT_LEASE_EXPIRED and reclaim reboot state for all v4.0 errors + * and for recoverable errors on EXCHANGE_ID for v4.1 */ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status) { @@ -1606,8 +1692,12 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status) return -ESERVERFAULT; /* Lease confirmation error: retry after purging the lease */ ssleep(1); + clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + break; case -NFS4ERR_STALE_CLIENTID: clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + nfs4_state_clear_reclaim_reboot(clp); + nfs4_state_start_reclaim_reboot(clp); break; case -NFS4ERR_CLID_INUSE: pr_err("NFS: Server %s reports our clientid is in use\n", @@ -1698,6 +1788,109 @@ static int nfs4_purge_lease(struct nfs_client *clp) return 0; } +/** + * nfs4_discover_server_trunking - Detect server IP address trunking + * + * @clp: nfs_client under test + * @result: OUT: found nfs_client, or clp + * + * Returns zero or a negative errno. If zero is returned, + * an nfs_client pointer is planted in "result". + * + * Note: since we are invoked in process context, and + * not from inside the state manager, we cannot use + * nfs4_handle_reclaim_lease_error(). + */ +int nfs4_discover_server_trunking(struct nfs_client *clp, + struct nfs_client **result) +{ + const struct nfs4_state_recovery_ops *ops = + clp->cl_mvops->reboot_recovery_ops; + rpc_authflavor_t *flavors, flav, save; + struct rpc_clnt *clnt; + struct rpc_cred *cred; + int i, len, status; + + dprintk("NFS: %s: testing '%s'\n", __func__, clp->cl_hostname); + + len = NFS_MAX_SECFLAVORS; + flavors = kcalloc(len, sizeof(*flavors), GFP_KERNEL); + if (flavors == NULL) { + status = -ENOMEM; + goto out; + } + len = rpcauth_list_flavors(flavors, len); + if (len < 0) { + status = len; + goto out_free; + } + clnt = clp->cl_rpcclient; + save = clnt->cl_auth->au_flavor; + i = 0; + + mutex_lock(&nfs_clid_init_mutex); + status = -ENOENT; +again: + cred = ops->get_clid_cred(clp); + if (cred == NULL) + goto out_unlock; + + status = ops->detect_trunking(clp, result, cred); + put_rpccred(cred); + switch (status) { + case 0: + break; + + case -EACCES: + if (clp->cl_machine_cred == NULL) + break; + /* Handle case where the user hasn't set up machine creds */ + nfs4_clear_machine_cred(clp); + case -NFS4ERR_DELAY: + case -ETIMEDOUT: + case -EAGAIN: + ssleep(1); + dprintk("NFS: %s after status %d, retrying\n", + __func__, status); + goto again; + + case -NFS4ERR_CLID_INUSE: + case -NFS4ERR_WRONGSEC: + status = -EPERM; + if (i >= len) + break; + + flav = flavors[i++]; + if (flav == save) + flav = flavors[i++]; + clnt = rpc_clone_client_set_auth(clnt, flav); + if (IS_ERR(clnt)) { + status = PTR_ERR(clnt); + break; + } + clp->cl_rpcclient = clnt; + goto again; + + case -NFS4ERR_MINOR_VERS_MISMATCH: + status = -EPROTONOSUPPORT; + break; + + case -EKEYEXPIRED: + nfs4_warn_keyexpired(clp->cl_hostname); + case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery + * in nfs4_exchange_id */ + status = -EKEYEXPIRED; + } + +out_unlock: + mutex_unlock(&nfs_clid_init_mutex); +out_free: + kfree(flavors); +out: + dprintk("NFS: %s: status = %d\n", __func__, status); + return status; +} + #ifdef CONFIG_NFS_V4_1 void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) { @@ -2008,6 +2201,7 @@ out_error: pr_warn_ratelimited("NFS: state manager%s%s failed on NFSv4 server %s" " with error %d\n", section_sep, section, clp->cl_hostname, -status); + ssleep(1); nfs4_end_drain_session(clp); nfs4_clear_state_manager_bit(clp); } diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c index 5729bc8..2628d92 100644 --- a/fs/nfs/nfs4sysctl.c +++ b/fs/nfs/nfs4sysctl.c @@ -9,6 +9,7 @@ #include <linux/nfs_idmap.h> #include <linux/nfs_fs.h> +#include "nfs4_fs.h" #include "callback.h" static const int nfs_set_port_min = 0; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 1bfbd67..40836ee 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -447,12 +447,14 @@ static int nfs4_stat_to_errno(int); encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_open_maxsz + \ + encode_access_maxsz + \ encode_getfh_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_open_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_open_maxsz + \ + decode_access_maxsz + \ decode_getfh_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_open_confirm_sz \ @@ -467,11 +469,13 @@ static int nfs4_stat_to_errno(int); encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_open_maxsz + \ + encode_access_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_open_noattr_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_open_maxsz + \ + decode_access_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_open_downgrade_sz \ (compound_encode_hdr_maxsz + \ @@ -1509,8 +1513,12 @@ static void encode_open_stateid(struct xdr_stream *xdr, nfs4_stateid stateid; if (ctx->state != NULL) { + const struct nfs_lockowner *lockowner = NULL; + + if (l_ctx != NULL) + lockowner = &l_ctx->lockowner; nfs4_select_rw_stateid(&stateid, ctx->state, - fmode, l_ctx->lockowner, l_ctx->pid); + fmode, lockowner); if (zero_seqid) stateid.seqid = 0; encode_nfs4_stateid(xdr, &stateid); @@ -2216,6 +2224,8 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr, encode_putfh(xdr, args->fh, &hdr); encode_open(xdr, args, &hdr); encode_getfh(xdr, &hdr); + if (args->access) + encode_access(xdr, args->access, &hdr); encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr); encode_nops(&hdr); } @@ -2252,7 +2262,9 @@ static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); encode_open(xdr, args, &hdr); - encode_getfattr(xdr, args->bitmask, &hdr); + if (args->access) + encode_access(xdr, args->access, &hdr); + encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr); encode_nops(&hdr); } @@ -4095,7 +4107,7 @@ out_overflow: return -EIO; } -static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access) +static int decode_access(struct xdr_stream *xdr, u32 *supported, u32 *access) { __be32 *p; uint32_t supp, acc; @@ -4109,8 +4121,8 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access) goto out_overflow; supp = be32_to_cpup(p++); acc = be32_to_cpup(p); - access->supported = supp; - access->access = acc; + *supported = supp; + *access = acc; return 0; out_overflow: print_overflow_msg(__func__, xdr); @@ -5072,18 +5084,14 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, * are stored with the acl data to handle the problem of * variable length bitmaps.*/ res->acl_data_offset = xdr_stream_pos(xdr) - pg_offset; - - /* We ignore &savep and don't do consistency checks on - * the attr length. Let userspace figure it out.... */ res->acl_len = attrlen; - if (attrlen > (xdr->nwords << 2)) { - if (res->acl_flags & NFS4_ACL_LEN_REQUEST) { - /* getxattr interface called with a NULL buf */ - goto out; - } + + /* Check for receive buffer overflow */ + if (res->acl_len > (xdr->nwords << 2) || + res->acl_len + res->acl_data_offset > xdr->buf->page_len) { + res->acl_flags |= NFS4_ACL_TRUNC; dprintk("NFS: acl reply: attrlen %u > page_len %u\n", attrlen, xdr->nwords << 2); - return -EINVAL; } } else status = -EOPNOTSUPP; @@ -5646,7 +5654,8 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr, * and places the remaining xdr data in xdr_buf->tail */ pdev->mincount = be32_to_cpup(p); - xdr_read_pages(xdr, pdev->mincount); /* include space for the length */ + if (xdr_read_pages(xdr, pdev->mincount) != pdev->mincount) + goto out_overflow; /* Parse notification bitmap, verifying that it is zero. */ p = xdr_inline_decode(xdr, 4); @@ -5891,7 +5900,7 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_putfh(xdr); if (status != 0) goto out; - status = decode_access(xdr, res); + status = decode_access(xdr, &res->supported, &res->access); if (status != 0) goto out; decode_getfattr(xdr, res->fattr, res->server); @@ -6229,8 +6238,11 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_open(xdr, res); if (status) goto out; - if (decode_getfh(xdr, &res->fh) != 0) + status = decode_getfh(xdr, &res->fh); + if (status) goto out; + if (res->access_request) + decode_access(xdr, &res->access_supported, &res->access_result); decode_getfattr(xdr, res->f_attr, res->server); out: return status; @@ -6279,6 +6291,8 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, status = decode_open(xdr, res); if (status) goto out; + if (res->access_request) + decode_access(xdr, &res->access_supported, &res->access_result); decode_getfattr(xdr, res->f_attr, res->server); out: return status; diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index ea6d111..be731e6 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -41,6 +41,7 @@ #include <scsi/osd_ore.h> #include "objlayout.h" +#include "../internal.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD @@ -606,8 +607,14 @@ static bool aligned_on_raid_stripe(u64 offset, struct ore_layout *layout, void objio_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { unsigned long stripe_end = 0; + u64 wb_size; - pnfs_generic_pg_init_write(pgio, req); + if (pgio->pg_dreq == NULL) + wb_size = i_size_read(pgio->pg_inode) - req_offset(req); + else + wb_size = nfs_dreq_bytes_left(pgio->pg_dreq); + + pnfs_generic_pg_init_write(pgio, req, wb_size); if (unlikely(pgio->pg_lseg == NULL)) return; /* Not pNFS */ diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 311a796..e56e846 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -102,6 +102,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, unsigned int offset, unsigned int count) { struct nfs_page *req; + struct nfs_lock_context *l_ctx; /* try to allocate the request struct */ req = nfs_page_alloc(); @@ -109,11 +110,12 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, return ERR_PTR(-ENOMEM); /* get lock context early so we can deal with alloc failures */ - req->wb_lock_context = nfs_get_lock_context(ctx); - if (req->wb_lock_context == NULL) { + l_ctx = nfs_get_lock_context(ctx); + if (IS_ERR(l_ctx)) { nfs_page_free(req); - return ERR_PTR(-ENOMEM); + return ERR_CAST(l_ctx); } + req->wb_lock_context = l_ctx; /* Initialize the request struct. Initially, we assume a * long write-back delay. This will be adjusted in @@ -290,7 +292,9 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev, { if (req->wb_context->cred != prev->wb_context->cred) return false; - if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner) + if (req->wb_lock_context->lockowner.l_owner != prev->wb_lock_context->lockowner.l_owner) + return false; + if (req->wb_lock_context->lockowner.l_pid != prev->wb_lock_context->lockowner.l_pid) return false; if (req->wb_context->state != prev->wb_context->state) return false; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 2e00fea..fe624c9 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -35,6 +35,7 @@ #include "iostat.h" #define NFSDBG_FACILITY NFSDBG_PNFS +#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ) /* Locking: * @@ -190,7 +191,7 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver); /* Need to hold i_lock if caller does not already hold reference */ void -get_layout_hdr(struct pnfs_layout_hdr *lo) +pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo) { atomic_inc(&lo->plh_refcount); } @@ -199,43 +200,107 @@ static struct pnfs_layout_hdr * pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags) { struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; - return ld->alloc_layout_hdr ? ld->alloc_layout_hdr(ino, gfp_flags) : - kzalloc(sizeof(struct pnfs_layout_hdr), gfp_flags); + return ld->alloc_layout_hdr(ino, gfp_flags); } static void pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo) { - struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld; + struct nfs_server *server = NFS_SERVER(lo->plh_inode); + struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; + + if (!list_empty(&lo->plh_layouts)) { + struct nfs_client *clp = server->nfs_client; + + spin_lock(&clp->cl_lock); + list_del_init(&lo->plh_layouts); + spin_unlock(&clp->cl_lock); + } put_rpccred(lo->plh_lc_cred); - return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo); + return ld->free_layout_hdr(lo); } static void -destroy_layout_hdr(struct pnfs_layout_hdr *lo) +pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo) { + struct nfs_inode *nfsi = NFS_I(lo->plh_inode); dprintk("%s: freeing layout cache %p\n", __func__, lo); - BUG_ON(!list_empty(&lo->plh_layouts)); - NFS_I(lo->plh_inode)->layout = NULL; - pnfs_free_layout_hdr(lo); + nfsi->layout = NULL; + /* Reset MDS Threshold I/O counters */ + nfsi->write_io = 0; + nfsi->read_io = 0; +} + +void +pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) +{ + struct inode *inode = lo->plh_inode; + + if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { + pnfs_detach_layout_hdr(lo); + spin_unlock(&inode->i_lock); + pnfs_free_layout_hdr(lo); + } +} + +static int +pnfs_iomode_to_fail_bit(u32 iomode) +{ + return iomode == IOMODE_RW ? + NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED; } static void -put_layout_hdr_locked(struct pnfs_layout_hdr *lo) +pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit) { - if (atomic_dec_and_test(&lo->plh_refcount)) - destroy_layout_hdr(lo); + lo->plh_retry_timestamp = jiffies; + if (test_and_set_bit(fail_bit, &lo->plh_flags)) + atomic_inc(&lo->plh_refcount); } -void -put_layout_hdr(struct pnfs_layout_hdr *lo) +static void +pnfs_layout_clear_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit) +{ + if (test_and_clear_bit(fail_bit, &lo->plh_flags)) + atomic_dec(&lo->plh_refcount); +} + +static void +pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode) { struct inode *inode = lo->plh_inode; + struct pnfs_layout_range range = { + .iomode = iomode, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + LIST_HEAD(head); - if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { - destroy_layout_hdr(lo); - spin_unlock(&inode->i_lock); + spin_lock(&inode->i_lock); + pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); + pnfs_mark_matching_lsegs_invalid(lo, &head, &range); + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&head); + dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__, + iomode == IOMODE_RW ? "RW" : "READ"); +} + +static bool +pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode) +{ + unsigned long start, end; + int fail_bit = pnfs_iomode_to_fail_bit(iomode); + + if (test_bit(fail_bit, &lo->plh_flags) == 0) + return false; + end = jiffies; + start = end - PNFS_LAYOUTGET_RETRY_TIMEOUT; + if (!time_in_range(lo->plh_retry_timestamp, start, end)) { + /* It is time to retry the failed layoutgets */ + pnfs_layout_clear_fail_bit(lo, fail_bit); + return false; } + return true; } static void @@ -249,33 +314,32 @@ init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) lseg->pls_layout = lo; } -static void free_lseg(struct pnfs_layout_segment *lseg) +static void pnfs_free_lseg(struct pnfs_layout_segment *lseg) { struct inode *ino = lseg->pls_layout->plh_inode; NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); - /* Matched by get_layout_hdr in pnfs_insert_layout */ - put_layout_hdr(NFS_I(ino)->layout); } static void -put_lseg_common(struct pnfs_layout_segment *lseg) +pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg) { - struct inode *inode = lseg->pls_layout->plh_inode; + struct inode *inode = lo->plh_inode; WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); list_del_init(&lseg->pls_list); - if (list_empty(&lseg->pls_layout->plh_segs)) { - set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags); - /* Matched by initial refcount set in alloc_init_layout_hdr */ - put_layout_hdr_locked(lseg->pls_layout); - } + /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */ + atomic_dec(&lo->plh_refcount); + if (list_empty(&lo->plh_segs)) + clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); } void -put_lseg(struct pnfs_layout_segment *lseg) +pnfs_put_lseg(struct pnfs_layout_segment *lseg) { + struct pnfs_layout_hdr *lo; struct inode *inode; if (!lseg) @@ -284,17 +348,17 @@ put_lseg(struct pnfs_layout_segment *lseg) dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, atomic_read(&lseg->pls_refcount), test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); - inode = lseg->pls_layout->plh_inode; + lo = lseg->pls_layout; + inode = lo->plh_inode; if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { - LIST_HEAD(free_me); - - put_lseg_common(lseg); - list_add(&lseg->pls_list, &free_me); + pnfs_get_layout_hdr(lo); + pnfs_layout_remove_lseg(lo, lseg); spin_unlock(&inode->i_lock); - pnfs_free_lseg_list(&free_me); + pnfs_free_lseg(lseg); + pnfs_put_layout_hdr(lo); } } -EXPORT_SYMBOL_GPL(put_lseg); +EXPORT_SYMBOL_GPL(pnfs_put_lseg); static inline u64 end_offset(u64 start, u64 len) @@ -378,7 +442,7 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, dprintk("%s: lseg %p ref %d\n", __func__, lseg, atomic_read(&lseg->pls_refcount)); if (atomic_dec_and_test(&lseg->pls_refcount)) { - put_lseg_common(lseg); + pnfs_layout_remove_lseg(lseg->pls_layout, lseg); list_add(&lseg->pls_list, tmp_list); rv = 1; } @@ -390,7 +454,7 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, * after call. */ int -mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, +pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, struct pnfs_layout_range *recall_range) { @@ -399,14 +463,8 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, dprintk("%s:Begin lo %p\n", __func__, lo); - if (list_empty(&lo->plh_segs)) { - /* Reset MDS Threshold I/O counters */ - NFS_I(lo->plh_inode)->write_io = 0; - NFS_I(lo->plh_inode)->read_io = 0; - if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) - put_layout_hdr_locked(lo); + if (list_empty(&lo->plh_segs)) return 0; - } list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) if (!recall_range || should_free_lseg(&lseg->pls_range, recall_range)) { @@ -426,25 +484,13 @@ void pnfs_free_lseg_list(struct list_head *free_me) { struct pnfs_layout_segment *lseg, *tmp; - struct pnfs_layout_hdr *lo; if (list_empty(free_me)) return; - lo = list_first_entry(free_me, struct pnfs_layout_segment, - pls_list)->pls_layout; - - if (test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) { - struct nfs_client *clp; - - clp = NFS_SERVER(lo->plh_inode)->nfs_client; - spin_lock(&clp->cl_lock); - list_del_init(&lo->plh_layouts); - spin_unlock(&clp->cl_lock); - } list_for_each_entry_safe(lseg, tmp, free_me, pls_list) { list_del(&lseg->pls_list); - free_lseg(lseg); + pnfs_free_lseg(lseg); } } @@ -458,10 +504,15 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) lo = nfsi->layout; if (lo) { lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ - mark_matching_lsegs_invalid(lo, &tmp_list, NULL); - } - spin_unlock(&nfsi->vfs_inode.i_lock); - pnfs_free_lseg_list(&tmp_list); + pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); + pnfs_get_layout_hdr(lo); + pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED); + pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED); + spin_unlock(&nfsi->vfs_inode.i_lock); + pnfs_free_lseg_list(&tmp_list); + pnfs_put_layout_hdr(lo); + } else + spin_unlock(&nfsi->vfs_inode.i_lock); } EXPORT_SYMBOL_GPL(pnfs_destroy_layout); @@ -498,46 +549,54 @@ pnfs_destroy_all_layouts(struct nfs_client *clp) } } +/* + * Compare 2 layout stateid sequence ids, to see which is newer, + * taking into account wraparound issues. + */ +static bool pnfs_seqid_is_newer(u32 s1, u32 s2) +{ + return (s32)s1 - (s32)s2 > 0; +} + /* update lo->plh_stateid with new if is more recent */ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, bool update_barrier) { - u32 oldseq, newseq; + u32 oldseq, newseq, new_barrier; + int empty = list_empty(&lo->plh_segs); oldseq = be32_to_cpu(lo->plh_stateid.seqid); newseq = be32_to_cpu(new->seqid); - if ((int)(newseq - oldseq) > 0) { + if (empty || pnfs_seqid_is_newer(newseq, oldseq)) { nfs4_stateid_copy(&lo->plh_stateid, new); if (update_barrier) { - u32 new_barrier = be32_to_cpu(new->seqid); - - if ((int)(new_barrier - lo->plh_barrier)) - lo->plh_barrier = new_barrier; + new_barrier = be32_to_cpu(new->seqid); } else { /* Because of wraparound, we want to keep the barrier - * "close" to the current seqids. It needs to be - * within 2**31 to count as "behind", so if it - * gets too near that limit, give us a litle leeway - * and bring it to within 2**30. - * NOTE - and yes, this is all unsigned arithmetic. + * "close" to the current seqids. */ - if (unlikely((newseq - lo->plh_barrier) > (3 << 29))) - lo->plh_barrier = newseq - (1 << 30); + new_barrier = newseq - atomic_read(&lo->plh_outstanding); } + if (empty || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) + lo->plh_barrier = new_barrier; } } +static bool +pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo, + const nfs4_stateid *stateid) +{ + u32 seqid = be32_to_cpu(stateid->seqid); + + return !pnfs_seqid_is_newer(seqid, lo->plh_barrier); +} + /* lget is set to 1 if called from inside send_layoutget call chain */ static bool -pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid, - int lget) +pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo, int lget) { - if ((stateid) && - (int)(lo->plh_barrier - be32_to_cpu(stateid->seqid)) >= 0) - return true; return lo->plh_block_lgets || - test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) || test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || (list_empty(&lo->plh_segs) && (atomic_read(&lo->plh_outstanding) > lget)); @@ -551,7 +610,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, dprintk("--> %s\n", __func__); spin_lock(&lo->plh_inode->i_lock); - if (pnfs_layoutgets_blocked(lo, NULL, 1)) { + if (pnfs_layoutgets_blocked(lo, 1)) { status = -EAGAIN; } else if (list_empty(&lo->plh_segs)) { int seq; @@ -582,7 +641,7 @@ send_layoutget(struct pnfs_layout_hdr *lo, struct inode *ino = lo->plh_inode; struct nfs_server *server = NFS_SERVER(ino); struct nfs4_layoutget *lgp; - struct pnfs_layout_segment *lseg = NULL; + struct pnfs_layout_segment *lseg; dprintk("--> %s\n", __func__); @@ -599,16 +658,22 @@ send_layoutget(struct pnfs_layout_hdr *lo, lgp->args.type = server->pnfs_curr_ld->id; lgp->args.inode = ino; lgp->args.ctx = get_nfs_open_context(ctx); - lgp->lsegpp = &lseg; lgp->gfp_flags = gfp_flags; /* Synchronously retrieve layout information from server and * store in lseg. */ - nfs4_proc_layoutget(lgp, gfp_flags); - if (!lseg) { - /* remember that LAYOUTGET failed and suspend trying */ - set_bit(lo_fail_bit(range->iomode), &lo->plh_flags); + lseg = nfs4_proc_layoutget(lgp, gfp_flags); + if (IS_ERR(lseg)) { + switch (PTR_ERR(lseg)) { + case -ENOMEM: + case -ERESTARTSYS: + break; + default: + /* remember that LAYOUTGET failed and suspend trying */ + pnfs_layout_io_set_failed(lo, range->iomode); + } + return NULL; } return lseg; @@ -636,25 +701,24 @@ _pnfs_return_layout(struct inode *ino) spin_lock(&ino->i_lock); lo = nfsi->layout; - if (!lo || pnfs_test_layout_returned(lo)) { + if (!lo) { spin_unlock(&ino->i_lock); dprintk("NFS: %s no layout to return\n", __func__); goto out; } stateid = nfsi->layout->plh_stateid; /* Reference matched in nfs4_layoutreturn_release */ - get_layout_hdr(lo); + pnfs_get_layout_hdr(lo); empty = list_empty(&lo->plh_segs); - mark_matching_lsegs_invalid(lo, &tmp_list, NULL); + pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); /* Don't send a LAYOUTRETURN if list was initially empty */ if (empty) { spin_unlock(&ino->i_lock); - put_layout_hdr(lo); + pnfs_put_layout_hdr(lo); dprintk("NFS: %s no layout segments to return\n", __func__); goto out; } lo->plh_block_lgets++; - pnfs_mark_layout_returned(lo); spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); @@ -663,10 +727,10 @@ _pnfs_return_layout(struct inode *ino) lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); if (unlikely(lrp == NULL)) { status = -ENOMEM; - set_bit(NFS_LAYOUT_RW_FAILED, &lo->plh_flags); - set_bit(NFS_LAYOUT_RO_FAILED, &lo->plh_flags); - pnfs_clear_layout_returned(lo); - put_layout_hdr(lo); + spin_lock(&ino->i_lock); + lo->plh_block_lgets--; + spin_unlock(&ino->i_lock); + pnfs_put_layout_hdr(lo); goto out; } @@ -703,7 +767,7 @@ bool pnfs_roc(struct inode *ino) if (!found) goto out_nolayout; lo->plh_block_lgets++; - get_layout_hdr(lo); /* matched in pnfs_roc_release */ + pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */ spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); return true; @@ -720,8 +784,12 @@ void pnfs_roc_release(struct inode *ino) spin_lock(&ino->i_lock); lo = NFS_I(ino)->layout; lo->plh_block_lgets--; - put_layout_hdr_locked(lo); - spin_unlock(&ino->i_lock); + if (atomic_dec_and_test(&lo->plh_refcount)) { + pnfs_detach_layout_hdr(lo); + spin_unlock(&ino->i_lock); + pnfs_free_layout_hdr(lo); + } else + spin_unlock(&ino->i_lock); } void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) @@ -730,32 +798,34 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) spin_lock(&ino->i_lock); lo = NFS_I(ino)->layout; - if ((int)(barrier - lo->plh_barrier) > 0) + if (pnfs_seqid_is_newer(barrier, lo->plh_barrier)) lo->plh_barrier = barrier; spin_unlock(&ino->i_lock); } -bool pnfs_roc_drain(struct inode *ino, u32 *barrier) +bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task) { struct nfs_inode *nfsi = NFS_I(ino); + struct pnfs_layout_hdr *lo; struct pnfs_layout_segment *lseg; + u32 current_seqid; bool found = false; spin_lock(&ino->i_lock); list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { + rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); found = true; - break; + goto out; } - if (!found) { - struct pnfs_layout_hdr *lo = nfsi->layout; - u32 current_seqid = be32_to_cpu(lo->plh_stateid.seqid); + lo = nfsi->layout; + current_seqid = be32_to_cpu(lo->plh_stateid.seqid); - /* Since close does not return a layout stateid for use as - * a barrier, we choose the worst-case barrier. - */ - *barrier = current_seqid + atomic_read(&lo->plh_outstanding); - } + /* Since close does not return a layout stateid for use as + * a barrier, we choose the worst-case barrier. + */ + *barrier = current_seqid + atomic_read(&lo->plh_outstanding); +out: spin_unlock(&ino->i_lock); return found; } @@ -786,14 +856,13 @@ cmp_layout(struct pnfs_layout_range *l1, } static void -pnfs_insert_layout(struct pnfs_layout_hdr *lo, +pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) { struct pnfs_layout_segment *lp; dprintk("%s:Begin\n", __func__); - assert_spin_locked(&lo->plh_inode->i_lock); list_for_each_entry(lp, &lo->plh_segs, pls_list) { if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0) continue; @@ -813,7 +882,7 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo, __func__, lseg, lseg->pls_range.iomode, lseg->pls_range.offset, lseg->pls_range.length); out: - get_layout_hdr(lo); + pnfs_get_layout_hdr(lo); dprintk("%s:Return\n", __func__); } @@ -847,21 +916,19 @@ pnfs_find_alloc_layout(struct inode *ino, dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout); - assert_spin_locked(&ino->i_lock); - if (nfsi->layout) { - if (test_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags)) - return NULL; - else - return nfsi->layout; - } + if (nfsi->layout != NULL) + goto out_existing; spin_unlock(&ino->i_lock); new = alloc_init_layout_hdr(ino, ctx, gfp_flags); spin_lock(&ino->i_lock); - if (likely(nfsi->layout == NULL)) /* Won the race? */ + if (likely(nfsi->layout == NULL)) { /* Won the race? */ nfsi->layout = new; - else - pnfs_free_layout_hdr(new); + return new; + } + pnfs_free_layout_hdr(new); +out_existing: + pnfs_get_layout_hdr(nfsi->layout); return nfsi->layout; } @@ -904,11 +971,10 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, dprintk("%s:Begin\n", __func__); - assert_spin_locked(&lo->plh_inode->i_lock); list_for_each_entry(lseg, &lo->plh_segs, pls_list) { if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && is_matching_lseg(&lseg->pls_range, range)) { - ret = get_lseg(lseg); + ret = pnfs_get_lseg(lseg); break; } if (lseg->pls_range.offset > range->offset) @@ -1013,7 +1079,6 @@ pnfs_update_layout(struct inode *ino, .length = count, }; unsigned pg_offset; - struct nfs_inode *nfsi = NFS_I(ino); struct nfs_server *server = NFS_SERVER(ino); struct nfs_client *clp = server->nfs_client; struct pnfs_layout_hdr *lo; @@ -1021,16 +1086,16 @@ pnfs_update_layout(struct inode *ino, bool first = false; if (!pnfs_enabled_sb(NFS_SERVER(ino))) - return NULL; + goto out; if (pnfs_within_mdsthreshold(ctx, ino, iomode)) - return NULL; + goto out; spin_lock(&ino->i_lock); lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); if (lo == NULL) { - dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__); - goto out_unlock; + spin_unlock(&ino->i_lock); + goto out; } /* Do we even need to bother with this? */ @@ -1040,7 +1105,7 @@ pnfs_update_layout(struct inode *ino, } /* if LAYOUTGET already failed once we don't try again */ - if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags)) + if (pnfs_layout_io_test_failed(lo, iomode)) goto out_unlock; /* Check to see if the layout for the given range already exists */ @@ -1048,17 +1113,13 @@ pnfs_update_layout(struct inode *ino, if (lseg) goto out_unlock; - if (pnfs_layoutgets_blocked(lo, NULL, 0)) + if (pnfs_layoutgets_blocked(lo, 0)) goto out_unlock; atomic_inc(&lo->plh_outstanding); - get_layout_hdr(lo); if (list_empty(&lo->plh_segs)) first = true; - /* Enable LAYOUTRETURNs */ - pnfs_clear_layout_returned(lo); - spin_unlock(&ino->i_lock); if (first) { /* The lo must be on the clp list if there is any @@ -1079,24 +1140,26 @@ pnfs_update_layout(struct inode *ino, arg.length = PAGE_CACHE_ALIGN(arg.length); lseg = send_layoutget(lo, ctx, &arg, gfp_flags); - if (!lseg && first) { - spin_lock(&clp->cl_lock); - list_del_init(&lo->plh_layouts); - spin_unlock(&clp->cl_lock); - } atomic_dec(&lo->plh_outstanding); - put_layout_hdr(lo); +out_put_layout_hdr: + pnfs_put_layout_hdr(lo); out: - dprintk("%s end, state 0x%lx lseg %p\n", __func__, - nfsi->layout ? nfsi->layout->plh_flags : -1, lseg); + dprintk("%s: inode %s/%llu pNFS layout segment %s for " + "(%s, offset: %llu, length: %llu)\n", + __func__, ino->i_sb->s_id, + (unsigned long long)NFS_FILEID(ino), + lseg == NULL ? "not found" : "found", + iomode==IOMODE_RW ? "read/write" : "read-only", + (unsigned long long)pos, + (unsigned long long)count); return lseg; out_unlock: spin_unlock(&ino->i_lock); - goto out; + goto out_put_layout_hdr; } EXPORT_SYMBOL_GPL(pnfs_update_layout); -int +struct pnfs_layout_segment * pnfs_layout_process(struct nfs4_layoutget *lgp) { struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout; @@ -1123,25 +1186,29 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) goto out_forget_reply; } - if (pnfs_layoutgets_blocked(lo, &res->stateid, 1)) { + if (pnfs_layoutgets_blocked(lo, 1) || + pnfs_layout_stateid_blocked(lo, &res->stateid)) { dprintk("%s forget reply due to state\n", __func__); goto out_forget_reply; } + + /* Done processing layoutget. Set the layout stateid */ + pnfs_set_layout_stateid(lo, &res->stateid, false); + init_lseg(lo, lseg); lseg->pls_range = res->range; - *lgp->lsegpp = get_lseg(lseg); - pnfs_insert_layout(lo, lseg); + pnfs_get_lseg(lseg); + pnfs_layout_insert_lseg(lo, lseg); if (res->return_on_close) { set_bit(NFS_LSEG_ROC, &lseg->pls_flags); set_bit(NFS_LAYOUT_ROC, &lo->plh_flags); } - /* Done processing layoutget. Set the layout stateid */ - pnfs_set_layout_stateid(lo, &res->stateid, false); spin_unlock(&ino->i_lock); + return lseg; out: - return status; + return ERR_PTR(status); out_forget_reply: spin_unlock(&ino->i_lock); @@ -1153,16 +1220,24 @@ out_forget_reply: void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { + u64 rd_size = req->wb_bytes; + BUG_ON(pgio->pg_lseg != NULL); if (req->wb_offset != req->wb_pgbase) { nfs_pageio_reset_read_mds(pgio); return; } + + if (pgio->pg_dreq == NULL) + rd_size = i_size_read(pgio->pg_inode) - req_offset(req); + else + rd_size = nfs_dreq_bytes_left(pgio->pg_dreq); + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, req_offset(req), - req->wb_bytes, + rd_size, IOMODE_READ, GFP_KERNEL); /* If no lseg, fall back to read through mds */ @@ -1173,7 +1248,8 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read); void -pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) +pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req, u64 wb_size) { BUG_ON(pgio->pg_lseg != NULL); @@ -1181,10 +1257,11 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page * nfs_pageio_reset_write_mds(pgio); return; } + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, req_offset(req), - req->wb_bytes, + wb_size, IOMODE_RW, GFP_NOFS); /* If no lseg, fall back to write through mds */ @@ -1362,12 +1439,12 @@ pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *he if (trypnfs == PNFS_NOT_ATTEMPTED) pnfs_write_through_mds(desc, data); } - put_lseg(lseg); + pnfs_put_lseg(lseg); } static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) { - put_lseg(hdr->lseg); + pnfs_put_lseg(hdr->lseg); nfs_writehdr_free(hdr); } EXPORT_SYMBOL_GPL(pnfs_writehdr_free); @@ -1382,17 +1459,17 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) whdr = nfs_writehdr_alloc(); if (!whdr) { desc->pg_completion_ops->error_cleanup(&desc->pg_list); - put_lseg(desc->pg_lseg); + pnfs_put_lseg(desc->pg_lseg); desc->pg_lseg = NULL; return -ENOMEM; } hdr = &whdr->header; nfs_pgheader_init(desc, hdr, pnfs_writehdr_free); - hdr->lseg = get_lseg(desc->pg_lseg); + hdr->lseg = pnfs_get_lseg(desc->pg_lseg); atomic_inc(&hdr->refcnt); ret = nfs_generic_flush(desc, hdr); if (ret != 0) { - put_lseg(desc->pg_lseg); + pnfs_put_lseg(desc->pg_lseg); desc->pg_lseg = NULL; } else pnfs_do_multiple_writes(desc, &hdr->rpc_list, desc->pg_ioflags); @@ -1517,12 +1594,12 @@ pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *hea if (trypnfs == PNFS_NOT_ATTEMPTED) pnfs_read_through_mds(desc, data); } - put_lseg(lseg); + pnfs_put_lseg(lseg); } static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) { - put_lseg(hdr->lseg); + pnfs_put_lseg(hdr->lseg); nfs_readhdr_free(hdr); } EXPORT_SYMBOL_GPL(pnfs_readhdr_free); @@ -1538,17 +1615,17 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) if (!rhdr) { desc->pg_completion_ops->error_cleanup(&desc->pg_list); ret = -ENOMEM; - put_lseg(desc->pg_lseg); + pnfs_put_lseg(desc->pg_lseg); desc->pg_lseg = NULL; return ret; } hdr = &rhdr->header; nfs_pgheader_init(desc, hdr, pnfs_readhdr_free); - hdr->lseg = get_lseg(desc->pg_lseg); + hdr->lseg = pnfs_get_lseg(desc->pg_lseg); atomic_inc(&hdr->refcnt); ret = nfs_generic_pagein(desc, hdr); if (ret != 0) { - put_lseg(desc->pg_lseg); + pnfs_put_lseg(desc->pg_lseg); desc->pg_lseg = NULL; } else pnfs_do_multiple_reads(desc, &hdr->rpc_list); @@ -1574,13 +1651,7 @@ static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp) void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) { - if (lseg->pls_range.iomode == IOMODE_RW) { - dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__); - set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); - } else { - dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__); - set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); - } + pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode); } EXPORT_SYMBOL_GPL(pnfs_set_lo_fail); @@ -1601,7 +1672,7 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata) } if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &hdr->lseg->pls_flags)) { /* references matched in nfs4_layoutcommit_release */ - get_lseg(hdr->lseg); + pnfs_get_lseg(hdr->lseg); } if (end_pos > nfsi->layout->plh_lwb) nfsi->layout->plh_lwb = end_pos; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 745aa1b..2d722db 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -62,9 +62,6 @@ enum { NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ NFS_LAYOUT_ROC, /* some lseg had roc bit set */ - NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */ - NFS_LAYOUT_INVALID, /* layout is being destroyed */ - NFS_LAYOUT_RETURNED, /* layout has already been returned */ }; enum layoutdriver_policy_flags { @@ -140,6 +137,7 @@ struct pnfs_layout_hdr { atomic_t plh_outstanding; /* number of RPCs out */ unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */ u32 plh_barrier; /* ignore lower seqids */ + unsigned long plh_retry_timestamp; unsigned long plh_flags; loff_t plh_lwb; /* last write byte for layoutcommit */ struct rpc_cred *plh_lc_cred; /* layoutcommit cred */ @@ -172,12 +170,12 @@ extern int nfs4_proc_getdevicelist(struct nfs_server *server, struct pnfs_devicelist *devlist); extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *dev); -extern void nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags); +extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags); extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp); /* pnfs.c */ -void get_layout_hdr(struct pnfs_layout_hdr *lo); -void put_lseg(struct pnfs_layout_segment *lseg); +void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo); +void pnfs_put_lseg(struct pnfs_layout_segment *lseg); void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *, const struct nfs_pgio_completion_ops *); @@ -188,28 +186,29 @@ void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32); void unset_pnfs_layoutdriver(struct nfs_server *); void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); -void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *, struct nfs_page *); +void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req, u64 wb_size); int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc); bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req); void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg); -int pnfs_layout_process(struct nfs4_layoutget *lgp); +struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp); void pnfs_free_lseg_list(struct list_head *tmp_list); void pnfs_destroy_layout(struct nfs_inode *); void pnfs_destroy_all_layouts(struct nfs_client *); -void put_layout_hdr(struct pnfs_layout_hdr *lo); +void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo); void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, bool update_barrier); int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, struct nfs4_state *open_state); -int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, +int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, struct pnfs_layout_range *recall_range); bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); -bool pnfs_roc_drain(struct inode *ino, u32 *barrier); +bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task); void pnfs_set_layoutcommit(struct nfs_write_data *wdata); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); @@ -233,6 +232,7 @@ struct nfs4_threshold *pnfs_mdsthreshold_alloc(void); /* nfs4_deviceid_flags */ enum { NFS_DEVICEID_INVALID = 0, /* set when MDS clientid recalled */ + NFS_DEVICEID_UNAVAILABLE, /* device temporarily unavailable */ }; /* pnfs_dev.c */ @@ -242,6 +242,7 @@ struct nfs4_deviceid_node { const struct pnfs_layoutdriver_type *ld; const struct nfs_client *nfs_client; unsigned long flags; + unsigned long timestamp_unavailable; struct nfs4_deviceid deviceid; atomic_t ref; }; @@ -254,34 +255,12 @@ void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, const struct nfs4_deviceid *); struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *); bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *); +void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node); +bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node); void nfs4_deviceid_purge_client(const struct nfs_client *); -static inline void -pnfs_mark_layout_returned(struct pnfs_layout_hdr *lo) -{ - set_bit(NFS_LAYOUT_RETURNED, &lo->plh_flags); -} - -static inline void -pnfs_clear_layout_returned(struct pnfs_layout_hdr *lo) -{ - clear_bit(NFS_LAYOUT_RETURNED, &lo->plh_flags); -} - -static inline bool -pnfs_test_layout_returned(struct pnfs_layout_hdr *lo) -{ - return test_bit(NFS_LAYOUT_RETURNED, &lo->plh_flags); -} - -static inline int lo_fail_bit(u32 iomode) -{ - return iomode == IOMODE_RW ? - NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED; -} - static inline struct pnfs_layout_segment * -get_lseg(struct pnfs_layout_segment *lseg) +pnfs_get_lseg(struct pnfs_layout_segment *lseg) { if (lseg) { atomic_inc(&lseg->pls_refcount); @@ -406,12 +385,12 @@ static inline void pnfs_destroy_layout(struct nfs_inode *nfsi) } static inline struct pnfs_layout_segment * -get_lseg(struct pnfs_layout_segment *lseg) +pnfs_get_lseg(struct pnfs_layout_segment *lseg) { return NULL; } -static inline void put_lseg(struct pnfs_layout_segment *lseg) +static inline void pnfs_put_lseg(struct pnfs_layout_segment *lseg) { } @@ -443,7 +422,7 @@ pnfs_roc_set_barrier(struct inode *ino, u32 barrier) } static inline bool -pnfs_roc_drain(struct inode *ino, u32 *barrier) +pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task) { return false; } diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index 73f701f..d35b62e 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -40,6 +40,8 @@ #define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS) #define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1) +#define PNFS_DEVICE_RETRY_TIMEOUT (120*HZ) + static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE]; static DEFINE_SPINLOCK(nfs4_deviceid_lock); @@ -218,6 +220,30 @@ nfs4_put_deviceid_node(struct nfs4_deviceid_node *d) } EXPORT_SYMBOL_GPL(nfs4_put_deviceid_node); +void +nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node) +{ + node->timestamp_unavailable = jiffies; + set_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags); +} +EXPORT_SYMBOL_GPL(nfs4_mark_deviceid_unavailable); + +bool +nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node) +{ + if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags)) { + unsigned long start, end; + + end = jiffies; + start = end - PNFS_DEVICE_RETRY_TIMEOUT; + if (time_in_range(node->timestamp_unavailable, start, end)) + return true; + clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags); + } + return false; +} +EXPORT_SYMBOL_GPL(nfs4_test_deviceid_unavailable); + static void _deviceid_purge_client(const struct nfs_client *clp, long hash) { @@ -276,3 +302,4 @@ nfs4_deviceid_mark_client_invalid(struct nfs_client *clp) } rcu_read_unlock(); } + diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 239aff7..e831bce 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -88,6 +88,7 @@ enum { Opt_sharecache, Opt_nosharecache, Opt_resvport, Opt_noresvport, Opt_fscache, Opt_nofscache, + Opt_migration, Opt_nomigration, /* Mount options that take integer arguments */ Opt_port, @@ -147,6 +148,8 @@ static const match_table_t nfs_mount_option_tokens = { { Opt_noresvport, "noresvport" }, { Opt_fscache, "fsc" }, { Opt_nofscache, "nofsc" }, + { Opt_migration, "migration" }, + { Opt_nomigration, "nomigration" }, { Opt_port, "port=%s" }, { Opt_rsize, "rsize=%s" }, @@ -676,6 +679,9 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, if (nfss->options & NFS_OPTION_FSCACHE) seq_printf(m, ",fsc"); + if (nfss->options & NFS_OPTION_MIGRATION) + seq_printf(m, ",migration"); + if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) { if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) seq_printf(m, ",lookupcache=none"); @@ -1106,7 +1112,7 @@ static int nfs_get_option_ul(substring_t args[], unsigned long *option) string = match_strdup(args); if (string == NULL) return -ENOMEM; - rc = strict_strtoul(string, 10, option); + rc = kstrtoul(string, 10, option); kfree(string); return rc; @@ -1243,6 +1249,12 @@ static int nfs_parse_mount_options(char *raw, kfree(mnt->fscache_uniq); mnt->fscache_uniq = NULL; break; + case Opt_migration: + mnt->options |= NFS_OPTION_MIGRATION; + break; + case Opt_nomigration: + mnt->options &= NFS_OPTION_MIGRATION; + break; /* * options that take numeric values @@ -1535,9 +1547,13 @@ static int nfs_parse_mount_options(char *raw, if (mnt->minorversion && mnt->version != 4) goto out_minorversion_mismatch; + if (mnt->options & NFS_OPTION_MIGRATION && + mnt->version != 4 && mnt->minorversion != 0) + goto out_migration_misuse; + /* * verify that any proto=/mountproto= options match the address - * familiies in the addr=/mountaddr= options. + * families in the addr=/mountaddr= options. */ if (protofamily != AF_UNSPEC && protofamily != mnt->nfs_server.address.ss_family) @@ -1572,6 +1588,10 @@ out_minorversion_mismatch: printk(KERN_INFO "NFS: mount option vers=%u does not support " "minorversion=%u\n", mnt->version, mnt->minorversion); return 0; +out_migration_misuse: + printk(KERN_INFO + "NFS: 'migration' not supported for this NFS version\n"); + return 0; out_nomem: printk(KERN_INFO "NFS: not enough memory to parse option\n"); return 0; @@ -1867,6 +1887,7 @@ static int nfs23_validate_mount_data(void *options, memcpy(sap, &data->addr, sizeof(data->addr)); args->nfs_server.addrlen = sizeof(data->addr); + args->nfs_server.port = ntohs(data->addr.sin_port); if (!nfs_verify_server_address(sap)) goto out_no_address; @@ -2493,7 +2514,7 @@ EXPORT_SYMBOL_GPL(nfs_kill_super); /* * Clone an NFS2/3/4 server record on xdev traversal (FSID-change) */ -struct dentry * +static struct dentry * nfs_xdev_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data) { @@ -2564,6 +2585,7 @@ static int nfs4_validate_mount_data(void *options, return -EFAULT; if (!nfs_verify_server_address(sap)) goto out_no_address; + args->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port); if (data->auth_flavourlen) { if (data->auth_flavourlen > 1) @@ -2640,6 +2662,7 @@ unsigned int nfs_idmap_cache_timeout = 600; bool nfs4_disable_idmapping = true; unsigned short max_session_slots = NFS4_DEF_SLOT_TABLE_SIZE; unsigned short send_implementation_id = 1; +char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = ""; EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport); EXPORT_SYMBOL_GPL(nfs_callback_tcpport); @@ -2647,6 +2670,7 @@ EXPORT_SYMBOL_GPL(nfs_idmap_cache_timeout); EXPORT_SYMBOL_GPL(nfs4_disable_idmapping); EXPORT_SYMBOL_GPL(max_session_slots); EXPORT_SYMBOL_GPL(send_implementation_id); +EXPORT_SYMBOL_GPL(nfs4_client_id_uniquifier); #define NFS_CALLBACK_MAXPORTNR (65535U) @@ -2657,7 +2681,7 @@ static int param_set_portnr(const char *val, const struct kernel_param *kp) if (!val) return -EINVAL; - ret = strict_strtoul(val, 0, &num); + ret = kstrtoul(val, 0, &num); if (ret == -EINVAL || num > NFS_CALLBACK_MAXPORTNR) return -EINVAL; *((unsigned int *)kp->arg) = num; @@ -2672,6 +2696,8 @@ static struct kernel_param_ops param_ops_portnr = { module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644); module_param(nfs_idmap_cache_timeout, int, 0644); module_param(nfs4_disable_idmapping, bool, 0644); +module_param_string(nfs4_unique_id, nfs4_client_id_uniquifier, + NFS4_CLIENT_ID_UNIQ_LEN, 0600); MODULE_PARM_DESC(nfs4_disable_idmapping, "Turn off NFSv4 idmapping when using 'sec=sys'"); module_param(max_session_slots, ushort, 0644); @@ -2680,6 +2706,7 @@ MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 " module_param(send_implementation_id, ushort, 0644); MODULE_PARM_DESC(send_implementation_id, "Send implementation ID with NFSv4.1 exchange_id"); +MODULE_PARM_DESC(nfs4_unique_id, "nfs_client_id4 uniquifier string"); MODULE_ALIAS("nfs4"); #endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e3b5537..9347ab7 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -846,6 +846,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, int nfs_flush_incompatible(struct file *file, struct page *page) { struct nfs_open_context *ctx = nfs_file_open_context(file); + struct nfs_lock_context *l_ctx; struct nfs_page *req; int do_flush, status; /* @@ -860,9 +861,12 @@ int nfs_flush_incompatible(struct file *file, struct page *page) req = nfs_page_find_request(page); if (req == NULL) return 0; - do_flush = req->wb_page != page || req->wb_context != ctx || - req->wb_lock_context->lockowner != current->files || - req->wb_lock_context->pid != current->tgid; + l_ctx = req->wb_lock_context; + do_flush = req->wb_page != page || req->wb_context != ctx; + if (l_ctx) { + do_flush |= l_ctx->lockowner.l_owner != current->files + || l_ctx->lockowner.l_pid != current->tgid; + } nfs_release_request(req); if (!do_flush) return 0; @@ -1576,6 +1580,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) /* We have a mismatch. Write the page again */ dprintk(" mismatch\n"); nfs_mark_request_dirty(req); + set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags); next: nfs_unlock_and_release_request(req); } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index cc894ed..48a1bad 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2837,8 +2837,7 @@ static int nfs4_setlease(struct nfs4_delegation *dp, int flag) return -ENOMEM; } fp->fi_lease = fl; - fp->fi_deleg_file = fl->fl_file; - get_file(fp->fi_deleg_file); + fp->fi_deleg_file = get_file(fl->fl_file); atomic_set(&fp->fi_delegees, 1); list_add(&dp->dl_perfile, &fp->fi_delegations); return 0; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index a9269f1..3f67b8e 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -480,7 +480,7 @@ set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key) if (buf == NULL) goto out; - len = posix_acl_to_xattr(pacl, buf, buflen); + len = posix_acl_to_xattr(&init_user_ns, pacl, buf, buflen); if (len < 0) { error = len; goto out; @@ -549,7 +549,7 @@ _get_posix_acl(struct dentry *dentry, char *key) if (buflen <= 0) return ERR_PTR(buflen); - pacl = posix_acl_from_xattr(buf, buflen); + pacl = posix_acl_from_xattr(&init_user_ns, buf, buflen); kfree(buf); return pacl; } @@ -2264,7 +2264,7 @@ nfsd_get_posix_acl(struct svc_fh *fhp, int type) if (size < 0) return ERR_PTR(size); - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); kfree(value); return acl; } @@ -2297,7 +2297,7 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl) value = kmalloc(size, GFP_KERNEL); if (!value) return -ENOMEM; - error = posix_acl_to_xattr(acl, value, size); + error = posix_acl_to_xattr(&init_user_ns, acl, value, size); if (error < 0) goto getout; size = error; diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index a4d56ac..16f35f7 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -116,6 +116,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (unlikely(ret)) goto out; + file_update_time(vma->vm_file); ret = __block_page_mkwrite(vma, vmf, nilfs_get_block); if (ret) { nilfs_transaction_abort(inode->i_sb); @@ -134,13 +135,13 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) static const struct vm_operations_struct nilfs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = nilfs_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); vma->vm_ops = &nilfs_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 6e2c3db..4d31d2c 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -401,8 +401,8 @@ int nilfs_read_inode_common(struct inode *inode, int err; inode->i_mode = le16_to_cpu(raw_inode->i_mode); - inode->i_uid = (uid_t)le32_to_cpu(raw_inode->i_uid); - inode->i_gid = (gid_t)le32_to_cpu(raw_inode->i_gid); + i_uid_write(inode, le32_to_cpu(raw_inode->i_uid)); + i_gid_write(inode, le32_to_cpu(raw_inode->i_gid)); set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); inode->i_size = le64_to_cpu(raw_inode->i_size); inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); @@ -590,8 +590,8 @@ void nilfs_write_inode_common(struct inode *inode, struct nilfs_inode_info *ii = NILFS_I(inode); raw_inode->i_mode = cpu_to_le16(inode->i_mode); - raw_inode->i_uid = cpu_to_le32(inode->i_uid); - raw_inode->i_gid = cpu_to_le32(inode->i_gid); + raw_inode->i_uid = cpu_to_le32(i_uid_read(inode)); + raw_inode->i_gid = cpu_to_le32(i_gid_read(inode)); raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); raw_inode->i_size = cpu_to_le64(inode->i_size); raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 6a10812..3c991dc 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1382,6 +1382,12 @@ static void nilfs_segbuf_init_once(void *obj) static void nilfs_destroy_cachep(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + if (nilfs_inode_cachep) kmem_cache_destroy(nilfs_inode_cachep); if (nilfs_transaction_cachep) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index d438036..721d692 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -58,7 +58,9 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, return fsnotify_remove_notify_event(group); } -static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event) +static int create_fd(struct fsnotify_group *group, + struct fsnotify_event *event, + struct file **file) { int client_fd; struct file *new_file; @@ -98,7 +100,7 @@ static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event) put_unused_fd(client_fd); client_fd = PTR_ERR(new_file); } else { - fd_install(client_fd, new_file); + *file = new_file; } return client_fd; @@ -106,13 +108,15 @@ static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event) static int fill_event_metadata(struct fsnotify_group *group, struct fanotify_event_metadata *metadata, - struct fsnotify_event *event) + struct fsnotify_event *event, + struct file **file) { int ret = 0; pr_debug("%s: group=%p metadata=%p event=%p\n", __func__, group, metadata, event); + *file = NULL; metadata->event_len = FAN_EVENT_METADATA_LEN; metadata->metadata_len = FAN_EVENT_METADATA_LEN; metadata->vers = FANOTIFY_METADATA_VERSION; @@ -121,7 +125,7 @@ static int fill_event_metadata(struct fsnotify_group *group, if (unlikely(event->mask & FAN_Q_OVERFLOW)) metadata->fd = FAN_NOFD; else { - metadata->fd = create_fd(group, event); + metadata->fd = create_fd(group, event, file); if (metadata->fd < 0) ret = metadata->fd; } @@ -220,25 +224,6 @@ static int prepare_for_access_response(struct fsnotify_group *group, return 0; } -static void remove_access_response(struct fsnotify_group *group, - struct fsnotify_event *event, - __s32 fd) -{ - struct fanotify_response_event *re; - - if (!(event->mask & FAN_ALL_PERM_EVENTS)) - return; - - re = dequeue_re(group, fd); - if (!re) - return; - - BUG_ON(re->event != event); - - kmem_cache_free(fanotify_response_event_cache, re); - - return; -} #else static int prepare_for_access_response(struct fsnotify_group *group, struct fsnotify_event *event, @@ -247,12 +232,6 @@ static int prepare_for_access_response(struct fsnotify_group *group, return 0; } -static void remove_access_response(struct fsnotify_group *group, - struct fsnotify_event *event, - __s32 fd) -{ - return; -} #endif static ssize_t copy_event_to_user(struct fsnotify_group *group, @@ -260,31 +239,33 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, char __user *buf) { struct fanotify_event_metadata fanotify_event_metadata; + struct file *f; int fd, ret; pr_debug("%s: group=%p event=%p\n", __func__, group, event); - ret = fill_event_metadata(group, &fanotify_event_metadata, event); + ret = fill_event_metadata(group, &fanotify_event_metadata, event, &f); if (ret < 0) goto out; fd = fanotify_event_metadata.fd; - ret = prepare_for_access_response(group, event, fd); - if (ret) - goto out_close_fd; - ret = -EFAULT; if (copy_to_user(buf, &fanotify_event_metadata, fanotify_event_metadata.event_len)) - goto out_kill_access_response; + goto out_close_fd; + ret = prepare_for_access_response(group, event, fd); + if (ret) + goto out_close_fd; + + fd_install(fd, f); return fanotify_event_metadata.event_len; -out_kill_access_response: - remove_access_response(group, event, fd); out_close_fd: - if (fd != FAN_NOFD) - sys_close(fd); + if (fd != FAN_NOFD) { + put_unused_fd(fd); + fput(f); + } out: #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS if (event->mask & FAN_ALL_PERM_EVENTS) { @@ -470,24 +451,22 @@ static int fanotify_find_path(int dfd, const char __user *filename, dfd, filename, flags); if (filename == NULL) { - struct file *file; - int fput_needed; + struct fd f = fdget(dfd); ret = -EBADF; - file = fget_light(dfd, &fput_needed); - if (!file) + if (!f.file) goto out; ret = -ENOTDIR; if ((flags & FAN_MARK_ONLYDIR) && - !(S_ISDIR(file->f_path.dentry->d_inode->i_mode))) { - fput_light(file, fput_needed); + !(S_ISDIR(f.file->f_path.dentry->d_inode->i_mode))) { + fdput(f); goto out; } - *path = file->f_path; + *path = f.file->f_path; path_get(path); - fput_light(file, fput_needed); + fdput(f); } else { unsigned int lookup_flags = 0; @@ -767,9 +746,9 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags, struct inode *inode = NULL; struct vfsmount *mnt = NULL; struct fsnotify_group *group; - struct file *filp; + struct fd f; struct path path; - int ret, fput_needed; + int ret; pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n", __func__, fanotify_fd, flags, dfd, pathname, mask); @@ -803,15 +782,15 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags, #endif return -EINVAL; - filp = fget_light(fanotify_fd, &fput_needed); - if (unlikely(!filp)) + f = fdget(fanotify_fd); + if (unlikely(!f.file)) return -EBADF; /* verify that this is indeed an fanotify instance */ ret = -EINVAL; - if (unlikely(filp->f_op != &fanotify_fops)) + if (unlikely(f.file->f_op != &fanotify_fops)) goto fput_and_out; - group = filp->private_data; + group = f.file->private_data; /* * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF. These are not @@ -858,7 +837,7 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags, path_put(&path); fput_and_out: - fput_light(filp, fput_needed); + fdput(f); return ret; } diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 8445fbc..c311dda 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -757,16 +757,16 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, struct fsnotify_group *group; struct inode *inode; struct path path; - struct file *filp; - int ret, fput_needed; + struct fd f; + int ret; unsigned flags = 0; - filp = fget_light(fd, &fput_needed); - if (unlikely(!filp)) + f = fdget(fd); + if (unlikely(!f.file)) return -EBADF; /* verify that this is indeed an inotify instance */ - if (unlikely(filp->f_op != &inotify_fops)) { + if (unlikely(f.file->f_op != &inotify_fops)) { ret = -EINVAL; goto fput_and_out; } @@ -782,13 +782,13 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, /* inode held in place by reference to path; group by fget on fd */ inode = path.dentry->d_inode; - group = filp->private_data; + group = f.file->private_data; /* create/update an inode mark */ ret = inotify_update_watch(group, inode, mask); path_put(&path); fput_and_out: - fput_light(filp, fput_needed); + fdput(f); return ret; } @@ -796,19 +796,19 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) { struct fsnotify_group *group; struct inotify_inode_mark *i_mark; - struct file *filp; - int ret = 0, fput_needed; + struct fd f; + int ret = 0; - filp = fget_light(fd, &fput_needed); - if (unlikely(!filp)) + f = fdget(fd); + if (unlikely(!f.file)) return -EBADF; /* verify that this is indeed an inotify instance */ ret = -EINVAL; - if (unlikely(filp->f_op != &inotify_fops)) + if (unlikely(f.file->f_op != &inotify_fops)) goto out; - group = filp->private_data; + group = f.file->private_data; ret = -EINVAL; i_mark = inotify_idr_find(group, wd); @@ -823,7 +823,7 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) fsnotify_put_mark(&i_mark->fsn_mark); out: - fput_light(filp, fput_needed); + fdput(f); return ret; } diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index c6dbd3d..1d27331 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -2124,7 +2124,8 @@ int ntfs_read_inode_mount(struct inode *vi) * ntfs_read_inode() will have set up the default ones. */ /* Set uid and gid to root. */ - vi->i_uid = vi->i_gid = 0; + vi->i_uid = GLOBAL_ROOT_UID; + vi->i_gid = GLOBAL_ROOT_GID; /* Regular file. No access for anyone. */ vi->i_mode = S_IFREG; /* No VFS initiated operations allowed for $MFT. */ @@ -2312,8 +2313,8 @@ int ntfs_show_options(struct seq_file *sf, struct dentry *root) ntfs_volume *vol = NTFS_SB(root->d_sb); int i; - seq_printf(sf, ",uid=%i", vol->uid); - seq_printf(sf, ",gid=%i", vol->gid); + seq_printf(sf, ",uid=%i", from_kuid_munged(&init_user_ns, vol->uid)); + seq_printf(sf, ",gid=%i", from_kgid_munged(&init_user_ns, vol->gid)); if (vol->fmask == vol->dmask) seq_printf(sf, ",umask=0%o", vol->fmask); else { diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 2bc149d..4a8289f8 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -102,8 +102,8 @@ static bool parse_options(ntfs_volume *vol, char *opt) char *p, *v, *ov; static char *utf8 = "utf8"; int errors = 0, sloppy = 0; - uid_t uid = (uid_t)-1; - gid_t gid = (gid_t)-1; + kuid_t uid = INVALID_UID; + kgid_t gid = INVALID_GID; umode_t fmask = (umode_t)-1, dmask = (umode_t)-1; int mft_zone_multiplier = -1, on_errors = -1; int show_sys_files = -1, case_sensitive = -1, disable_sparse = -1; @@ -128,6 +128,30 @@ static bool parse_options(ntfs_volume *vol, char *opt) if (*v) \ goto needs_val; \ } +#define NTFS_GETOPT_UID(option, variable) \ + if (!strcmp(p, option)) { \ + uid_t uid_value; \ + if (!v || !*v) \ + goto needs_arg; \ + uid_value = simple_strtoul(ov = v, &v, 0); \ + if (*v) \ + goto needs_val; \ + variable = make_kuid(current_user_ns(), uid_value); \ + if (!uid_valid(variable)) \ + goto needs_val; \ + } +#define NTFS_GETOPT_GID(option, variable) \ + if (!strcmp(p, option)) { \ + gid_t gid_value; \ + if (!v || !*v) \ + goto needs_arg; \ + gid_value = simple_strtoul(ov = v, &v, 0); \ + if (*v) \ + goto needs_val; \ + variable = make_kgid(current_user_ns(), gid_value); \ + if (!gid_valid(variable)) \ + goto needs_val; \ + } #define NTFS_GETOPT_OCTAL(option, variable) \ if (!strcmp(p, option)) { \ if (!v || !*v) \ @@ -165,8 +189,8 @@ static bool parse_options(ntfs_volume *vol, char *opt) while ((p = strsep(&opt, ","))) { if ((v = strchr(p, '='))) *v++ = 0; - NTFS_GETOPT("uid", uid) - else NTFS_GETOPT("gid", gid) + NTFS_GETOPT_UID("uid", uid) + else NTFS_GETOPT_GID("gid", gid) else NTFS_GETOPT_OCTAL("umask", fmask = dmask) else NTFS_GETOPT_OCTAL("fmask", fmask) else NTFS_GETOPT_OCTAL("dmask", dmask) @@ -283,9 +307,9 @@ no_mount_options: vol->on_errors = on_errors; if (!vol->on_errors || vol->on_errors == ON_ERRORS_RECOVER) vol->on_errors |= ON_ERRORS_CONTINUE; - if (uid != (uid_t)-1) + if (uid_valid(uid)) vol->uid = uid; - if (gid != (gid_t)-1) + if (gid_valid(gid)) vol->gid = gid; if (fmask != (umode_t)-1) vol->fmask = fmask; @@ -1023,7 +1047,8 @@ static bool load_and_init_mft_mirror(ntfs_volume *vol) * ntfs_read_inode() will have set up the default ones. */ /* Set uid and gid to root. */ - tmp_ino->i_uid = tmp_ino->i_gid = 0; + tmp_ino->i_uid = GLOBAL_ROOT_UID; + tmp_ino->i_gid = GLOBAL_ROOT_GID; /* Regular file. No access for anyone. */ tmp_ino->i_mode = S_IFREG; /* No VFS initiated operations allowed for $MFTMirr. */ @@ -3168,6 +3193,12 @@ static void __exit exit_ntfs_fs(void) ntfs_debug("Unregistering NTFS driver."); unregister_filesystem(&ntfs_fs_type); + + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(ntfs_big_inode_cache); kmem_cache_destroy(ntfs_inode_cache); kmem_cache_destroy(ntfs_name_cache); diff --git a/fs/ntfs/volume.h b/fs/ntfs/volume.h index 15e3ba8..4f579b0 100644 --- a/fs/ntfs/volume.h +++ b/fs/ntfs/volume.h @@ -25,6 +25,7 @@ #define _LINUX_NTFS_VOLUME_H #include <linux/rwsem.h> +#include <linux/uidgid.h> #include "types.h" #include "layout.h" @@ -46,8 +47,8 @@ typedef struct { sized blocks on the device. */ /* Configuration provided by user at mount time. */ unsigned long flags; /* Miscellaneous flags, see below. */ - uid_t uid; /* uid that files will be mounted as. */ - gid_t gid; /* gid that files will be mounted as. */ + kuid_t uid; /* uid that files will be mounted as. */ + kgid_t gid; /* gid that files will be mounted as. */ umode_t fmask; /* The mask for file permissions. */ umode_t dmask; /* The mask for directory permissions. */ diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index a721907..260b162 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -452,7 +452,7 @@ static int ocfs2_xattr_get_acl(struct dentry *dentry, const char *name, return PTR_ERR(acl); if (acl == NULL) return -ENODATA; - ret = posix_acl_to_xattr(acl, buffer, size); + ret = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); posix_acl_release(acl); return ret; @@ -475,7 +475,7 @@ static int ocfs2_xattr_set_acl(struct dentry *dentry, const char *name, return -EPERM; if (value) { - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (IS_ERR(acl)) return PTR_ERR(acl); else if (acl) { diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index a4e855e..f7c648d 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1746,8 +1746,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, long fd; int sectsize; char *p = (char *)page; - struct file *filp = NULL; - struct inode *inode = NULL; + struct fd f; + struct inode *inode; ssize_t ret = -EINVAL; int live_threshold; @@ -1766,26 +1766,26 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, if (fd < 0 || fd >= INT_MAX) goto out; - filp = fget(fd); - if (filp == NULL) + f = fdget(fd); + if (f.file == NULL) goto out; if (reg->hr_blocks == 0 || reg->hr_start_block == 0 || reg->hr_block_bytes == 0) - goto out; + goto out2; - inode = igrab(filp->f_mapping->host); + inode = igrab(f.file->f_mapping->host); if (inode == NULL) - goto out; + goto out2; if (!S_ISBLK(inode->i_mode)) - goto out; + goto out3; - reg->hr_bdev = I_BDEV(filp->f_mapping->host); + reg->hr_bdev = I_BDEV(f.file->f_mapping->host); ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL); if (ret) { reg->hr_bdev = NULL; - goto out; + goto out3; } inode = NULL; @@ -1797,7 +1797,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, "blocksize %u incorrect for device, expected %d", reg->hr_block_bytes, sectsize); ret = -EINVAL; - goto out; + goto out3; } o2hb_init_region_params(reg); @@ -1811,13 +1811,13 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, ret = o2hb_map_slot_data(reg); if (ret) { mlog_errno(ret); - goto out; + goto out3; } ret = o2hb_populate_slot_data(reg); if (ret) { mlog_errno(ret); - goto out; + goto out3; } INIT_DELAYED_WORK(®->hr_write_timeout_work, o2hb_write_timeout); @@ -1847,7 +1847,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, if (IS_ERR(hb_task)) { ret = PTR_ERR(hb_task); mlog_errno(ret); - goto out; + goto out3; } spin_lock(&o2hb_live_lock); @@ -1863,7 +1863,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, if (reg->hr_aborted_start) { ret = -EIO; - goto out; + goto out3; } /* Ok, we were woken. Make sure it wasn't by drop_item() */ @@ -1882,11 +1882,11 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%s)\n", config_item_name(®->hr_item), reg->hr_dev_name); +out3: + iput(inode); +out2: + fdput(f); out: - if (filp) - fput(filp); - if (inode) - iput(inode); if (ret < 0) { if (reg->hr_bdev) { blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE); diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index 8f9cea1..c19897d 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -327,5 +327,5 @@ void o2quo_exit(void) { struct o2quo_state *qs = &o2quo_state; - flush_work_sync(&qs->qs_work); + flush_work(&qs->qs_work); } diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 83b6f98..16b712d 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -691,6 +691,11 @@ static void __exit exit_dlmfs_fs(void) flush_workqueue(user_dlm_worker); destroy_workqueue(user_dlm_worker); + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(dlmfs_inode_cache); bdi_destroy(&dlmfs_backing_dev_info); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 46a1f6d..5a4ee77 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1184,8 +1184,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid && OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { - transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid, - USRQUOTA); + transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid)); if (!transfer_to[USRQUOTA]) { status = -ESRCH; goto bail_unlock; @@ -1194,8 +1193,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid && OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { - transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid, - GRPQUOTA); + transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid)); if (!transfer_to[GRPQUOTA]) { status = -ESRCH; goto bail_unlock; diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index d150372..47a87dd 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -173,6 +173,7 @@ out: static const struct vm_operations_struct ocfs2_file_vm_ops = { .fault = ocfs2_fault, .page_mkwrite = ocfs2_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) @@ -188,7 +189,6 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) ocfs2_inode_unlock(file->f_dentry->d_inode, lock_level); out: vma->vm_ops = &ocfs2_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 0a86e30..332a281 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -95,7 +95,7 @@ static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot) struct ocfs2_global_disk_dqblk *d = dp; struct mem_dqblk *m = &dquot->dq_dqb; - d->dqb_id = cpu_to_le32(dquot->dq_id); + d->dqb_id = cpu_to_le32(from_kqid(&init_user_ns, dquot->dq_id)); d->dqb_use_count = cpu_to_le32(OCFS2_DQUOT(dquot)->dq_use_count); d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit); d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit); @@ -112,11 +112,14 @@ static int ocfs2_global_is_id(void *dp, struct dquot *dquot) { struct ocfs2_global_disk_dqblk *d = dp; struct ocfs2_mem_dqinfo *oinfo = - sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv; if (qtree_entry_unused(&oinfo->dqi_gi, dp)) return 0; - return le32_to_cpu(d->dqb_id) == dquot->dq_id; + + return qid_eq(make_kqid(&init_user_ns, dquot->dq_id.type, + le32_to_cpu(d->dqb_id)), + dquot->dq_id); } struct qtree_fmt_operations ocfs2_global_ops = { @@ -475,7 +478,7 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing) { int err, err2; struct super_block *sb = dquot->dq_sb; - int type = dquot->dq_type; + int type = dquot->dq_id.type; struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; struct ocfs2_global_disk_dqblk dqblk; s64 spacechange, inodechange; @@ -504,7 +507,8 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing) olditime = dquot->dq_dqb.dqb_itime; oldbtime = dquot->dq_dqb.dqb_btime; ocfs2_global_disk2memdqb(dquot, &dqblk); - trace_ocfs2_sync_dquot(dquot->dq_id, dquot->dq_dqb.dqb_curspace, + trace_ocfs2_sync_dquot(from_kqid(&init_user_ns, dquot->dq_id), + dquot->dq_dqb.dqb_curspace, (long long)spacechange, dquot->dq_dqb.dqb_curinodes, (long long)inodechange); @@ -555,8 +559,8 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing) err = ocfs2_qinfo_lock(info, freeing); if (err < 0) { mlog(ML_ERROR, "Failed to lock quota info, losing quota write" - " (type=%d, id=%u)\n", dquot->dq_type, - (unsigned)dquot->dq_id); + " (type=%d, id=%u)\n", dquot->dq_id.type, + (unsigned)from_kqid(&init_user_ns, dquot->dq_id)); goto out; } if (freeing) @@ -591,9 +595,10 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type) struct ocfs2_super *osb = OCFS2_SB(sb); int status = 0; - trace_ocfs2_sync_dquot_helper(dquot->dq_id, dquot->dq_type, + trace_ocfs2_sync_dquot_helper(from_kqid(&init_user_ns, dquot->dq_id), + dquot->dq_id.type, type, sb->s_id); - if (type != dquot->dq_type) + if (type != dquot->dq_id.type) goto out; status = ocfs2_lock_global_qf(oinfo, 1); if (status < 0) @@ -643,7 +648,8 @@ static int ocfs2_write_dquot(struct dquot *dquot) struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb); int status = 0; - trace_ocfs2_write_dquot(dquot->dq_id, dquot->dq_type); + trace_ocfs2_write_dquot(from_kqid(&init_user_ns, dquot->dq_id), + dquot->dq_id.type); handle = ocfs2_start_trans(osb, OCFS2_QWRITE_CREDITS); if (IS_ERR(handle)) { @@ -677,11 +683,12 @@ static int ocfs2_release_dquot(struct dquot *dquot) { handle_t *handle; struct ocfs2_mem_dqinfo *oinfo = - sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv; struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb); int status = 0; - trace_ocfs2_release_dquot(dquot->dq_id, dquot->dq_type); + trace_ocfs2_release_dquot(from_kqid(&init_user_ns, dquot->dq_id), + dquot->dq_id.type); mutex_lock(&dquot->dq_lock); /* Check whether we are not racing with some other dqget() */ @@ -691,7 +698,7 @@ static int ocfs2_release_dquot(struct dquot *dquot) if (status < 0) goto out; handle = ocfs2_start_trans(osb, - ocfs2_calc_qdel_credits(dquot->dq_sb, dquot->dq_type)); + ocfs2_calc_qdel_credits(dquot->dq_sb, dquot->dq_id.type)); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); @@ -733,13 +740,14 @@ static int ocfs2_acquire_dquot(struct dquot *dquot) int ex = 0; struct super_block *sb = dquot->dq_sb; struct ocfs2_super *osb = OCFS2_SB(sb); - int type = dquot->dq_type; + int type = dquot->dq_id.type; struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; struct inode *gqinode = info->dqi_gqinode; int need_alloc = ocfs2_global_qinit_alloc(sb, type); handle_t *handle; - trace_ocfs2_acquire_dquot(dquot->dq_id, type); + trace_ocfs2_acquire_dquot(from_kqid(&init_user_ns, dquot->dq_id), + type); mutex_lock(&dquot->dq_lock); /* * We need an exclusive lock, because we're going to update use count @@ -821,12 +829,13 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot) int sync = 0; int status; struct super_block *sb = dquot->dq_sb; - int type = dquot->dq_type; + int type = dquot->dq_id.type; struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; handle_t *handle; struct ocfs2_super *osb = OCFS2_SB(sb); - trace_ocfs2_mark_dquot_dirty(dquot->dq_id, type); + trace_ocfs2_mark_dquot_dirty(from_kqid(&init_user_ns, dquot->dq_id), + type); /* In case user set some limits, sync dquot immediately to global * quota file so that information propagates quicker */ diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index f100bf7..27fe7ee 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -501,7 +501,9 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, } dqblk = (struct ocfs2_local_disk_dqblk *)(qbh->b_data + ol_dqblk_block_off(sb, chunk, bit)); - dquot = dqget(sb, le64_to_cpu(dqblk->dqb_id), type); + dquot = dqget(sb, + make_kqid(&init_user_ns, type, + le64_to_cpu(dqblk->dqb_id))); if (!dquot) { status = -EIO; mlog(ML_ERROR, "Failed to get quota structure " @@ -881,7 +883,8 @@ static void olq_set_dquot(struct buffer_head *bh, void *private) dqblk = (struct ocfs2_local_disk_dqblk *)(bh->b_data + ol_dqblk_block_offset(sb, od->dq_local_off)); - dqblk->dqb_id = cpu_to_le64(od->dq_dquot.dq_id); + dqblk->dqb_id = cpu_to_le64(from_kqid(&init_user_ns, + od->dq_dquot.dq_id)); spin_lock(&dq_data_lock); dqblk->dqb_spacemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curspace - od->dq_origspace); @@ -891,7 +894,7 @@ static void olq_set_dquot(struct buffer_head *bh, void *private) trace_olq_set_dquot( (unsigned long long)le64_to_cpu(dqblk->dqb_spacemod), (unsigned long long)le64_to_cpu(dqblk->dqb_inodemod), - od->dq_dquot.dq_id); + from_kqid(&init_user_ns, od->dq_dquot.dq_id)); } /* Write dquot to local quota file */ @@ -900,7 +903,7 @@ int ocfs2_local_write_dquot(struct dquot *dquot) struct super_block *sb = dquot->dq_sb; struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); struct buffer_head *bh; - struct inode *lqinode = sb_dqopt(sb)->files[dquot->dq_type]; + struct inode *lqinode = sb_dqopt(sb)->files[dquot->dq_id.type]; int status; status = ocfs2_read_quota_phys_block(lqinode, od->dq_local_phys_blk, @@ -1221,7 +1224,7 @@ static void olq_alloc_dquot(struct buffer_head *bh, void *private) int ocfs2_create_local_dquot(struct dquot *dquot) { struct super_block *sb = dquot->dq_sb; - int type = dquot->dq_type; + int type = dquot->dq_id.type; struct inode *lqinode = sb_dqopt(sb)->files[type]; struct ocfs2_quota_chunk *chunk; struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); @@ -1275,7 +1278,7 @@ out: int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot) { int status; - int type = dquot->dq_type; + int type = dquot->dq_id.type; struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); struct super_block *sb = dquot->dq_sb; struct ocfs2_local_disk_chunk *dchunk; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 68f4541..0e91ec2 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1818,6 +1818,11 @@ static int ocfs2_initialize_mem_caches(void) static void ocfs2_free_mem_caches(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); if (ocfs2_inode_cachep) kmem_cache_destroy(ocfs2_inode_cachep); ocfs2_inode_cachep = NULL; diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 2c6d952..77e3cb2 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -146,8 +146,7 @@ static int omfs_grow_extent(struct inode *inode, struct omfs_extent *oe, be64_to_cpu(entry->e_blocks); if (omfs_allocate_block(inode->i_sb, new_block)) { - entry->e_blocks = - cpu_to_be64(be64_to_cpu(entry->e_blocks) + 1); + be64_add_cpu(&entry->e_blocks, 1); terminator->e_blocks = ~(cpu_to_be64( be64_to_cpu(~terminator->e_blocks) + 1)); goto out; @@ -177,7 +176,7 @@ static int omfs_grow_extent(struct inode *inode, struct omfs_extent *oe, be64_to_cpu(~terminator->e_blocks) + (u64) new_count)); /* write in new entry */ - oe->e_extent_count = cpu_to_be32(1 + be32_to_cpu(oe->e_extent_count)); + be32_add_cpu(&oe->e_extent_count, 1); out: *ret_block = new_block; diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index e6213b3..25d715c 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -391,12 +391,16 @@ static int parse_options(char *options, struct omfs_sb_info *sbi) case Opt_uid: if (match_int(&args[0], &option)) return 0; - sbi->s_uid = option; + sbi->s_uid = make_kuid(current_user_ns(), option); + if (!uid_valid(sbi->s_uid)) + return 0; break; case Opt_gid: if (match_int(&args[0], &option)) return 0; - sbi->s_gid = option; + sbi->s_gid = make_kgid(current_user_ns(), option); + if (!gid_valid(sbi->s_gid)) + return 0; break; case Opt_umask: if (match_octal(&args[0], &option)) diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h index 8941f12..f0f8bc7 100644 --- a/fs/omfs/omfs.h +++ b/fs/omfs/omfs.h @@ -19,8 +19,8 @@ struct omfs_sb_info { unsigned long **s_imap; int s_imap_size; struct mutex s_bitmap_lock; - int s_uid; - int s_gid; + kuid_t s_uid; + kgid_t s_gid; int s_dmask; int s_fmask; }; @@ -132,27 +132,27 @@ SYSCALL_DEFINE2(truncate, const char __user *, path, long, length) static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) { - struct inode * inode; + struct inode *inode; struct dentry *dentry; - struct file * file; + struct fd f; int error; error = -EINVAL; if (length < 0) goto out; error = -EBADF; - file = fget(fd); - if (!file) + f = fdget(fd); + if (!f.file) goto out; /* explicitly opened as large or we are on 64-bit box */ - if (file->f_flags & O_LARGEFILE) + if (f.file->f_flags & O_LARGEFILE) small = 0; - dentry = file->f_path.dentry; + dentry = f.file->f_path.dentry; inode = dentry->d_inode; error = -EINVAL; - if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE)) + if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE)) goto out_putf; error = -EINVAL; @@ -165,14 +165,14 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) goto out_putf; sb_start_write(inode->i_sb); - error = locks_verify_truncate(inode, file, length); + error = locks_verify_truncate(inode, f.file, length); if (!error) - error = security_path_truncate(&file->f_path); + error = security_path_truncate(&f.file->f_path); if (!error) - error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file); + error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, f.file); sb_end_write(inode->i_sb); out_putf: - fput(file); + fdput(f); out: return error; } @@ -276,15 +276,13 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) { - struct file *file; + struct fd f = fdget(fd); int error = -EBADF; - file = fget(fd); - if (file) { - error = do_fallocate(file, mode, offset, len); - fput(file); + if (f.file) { + error = do_fallocate(f.file, mode, offset, len); + fdput(f); } - return error; } @@ -400,16 +398,15 @@ out: SYSCALL_DEFINE1(fchdir, unsigned int, fd) { - struct file *file; + struct fd f = fdget_raw(fd); struct inode *inode; - int error, fput_needed; + int error = -EBADF; error = -EBADF; - file = fget_raw_light(fd, &fput_needed); - if (!file) + if (!f.file) goto out; - inode = file->f_path.dentry->d_inode; + inode = f.file->f_path.dentry->d_inode; error = -ENOTDIR; if (!S_ISDIR(inode->i_mode)) @@ -417,9 +414,9 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd) error = inode_permission(inode, MAY_EXEC | MAY_CHDIR); if (!error) - set_fs_pwd(current->fs, &file->f_path); + set_fs_pwd(current->fs, &f.file->f_path); out_putf: - fput_light(file, fput_needed); + fdput(f); out: return error; } @@ -534,7 +531,7 @@ static int chown_common(struct path *path, uid_t user, gid_t group) newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; mutex_lock(&inode->i_mutex); - error = security_path_chown(path, user, group); + error = security_path_chown(path, uid, gid); if (!error) error = notify_change(path->dentry, &newattrs); mutex_unlock(&inode->i_mutex); @@ -582,23 +579,20 @@ SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) { - struct file * file; + struct fd f = fdget(fd); int error = -EBADF; - struct dentry * dentry; - file = fget(fd); - if (!file) + if (!f.file) goto out; - error = mnt_want_write_file(file); + error = mnt_want_write_file(f.file); if (error) goto out_fput; - dentry = file->f_path.dentry; - audit_inode(NULL, dentry); - error = chown_common(&file->f_path, user, group); - mnt_drop_write_file(file); + audit_inode(NULL, f.file->f_path.dentry); + error = chown_common(&f.file->f_path, user, group); + mnt_drop_write_file(f.file); out_fput: - fput(file); + fdput(f); out: return error; } @@ -803,50 +797,6 @@ struct file *dentry_open(const struct path *path, int flags, } EXPORT_SYMBOL(dentry_open); -static void __put_unused_fd(struct files_struct *files, unsigned int fd) -{ - struct fdtable *fdt = files_fdtable(files); - __clear_open_fd(fd, fdt); - if (fd < files->next_fd) - files->next_fd = fd; -} - -void put_unused_fd(unsigned int fd) -{ - struct files_struct *files = current->files; - spin_lock(&files->file_lock); - __put_unused_fd(files, fd); - spin_unlock(&files->file_lock); -} - -EXPORT_SYMBOL(put_unused_fd); - -/* - * Install a file pointer in the fd array. - * - * The VFS is full of places where we drop the files lock between - * setting the open_fds bitmap and installing the file in the file - * array. At any such point, we are vulnerable to a dup2() race - * installing a file in the array before us. We need to detect this and - * fput() the struct file we are about to overwrite in this case. - * - * It should never happen - if we allow dup2() do it, _really_ bad things - * will follow. - */ - -void fd_install(unsigned int fd, struct file *file) -{ - struct files_struct *files = current->files; - struct fdtable *fdt; - spin_lock(&files->file_lock); - fdt = files_fdtable(files); - BUG_ON(fdt->fd[fd] != NULL); - rcu_assign_pointer(fdt->fd[fd], file); - spin_unlock(&files->file_lock); -} - -EXPORT_SYMBOL(fd_install); - static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op) { int lookup_flags = 0; @@ -858,7 +808,7 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o op->mode = 0; /* Must never be set by userspace */ - flags &= ~FMODE_NONOTIFY; + flags &= ~FMODE_NONOTIFY & ~O_CLOEXEC; /* * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only @@ -1038,23 +988,7 @@ EXPORT_SYMBOL(filp_close); */ SYSCALL_DEFINE1(close, unsigned int, fd) { - struct file * filp; - struct files_struct *files = current->files; - struct fdtable *fdt; - int retval; - - spin_lock(&files->file_lock); - fdt = files_fdtable(files); - if (fd >= fdt->max_fds) - goto out_unlock; - filp = fdt->fd[fd]; - if (!filp) - goto out_unlock; - rcu_assign_pointer(fdt->fd[fd], NULL); - __clear_close_on_exec(fd, fdt); - __put_unused_fd(files, fd); - spin_unlock(&files->file_lock); - retval = filp_close(filp, files); + int retval = __close_fd(current->files, fd); /* can't restart close syscall because file table entry was cleared */ if (unlikely(retval == -ERESTARTSYS || @@ -1064,10 +998,6 @@ SYSCALL_DEFINE1(close, unsigned int, fd) retval = -EINTR; return retval; - -out_unlock: - spin_unlock(&files->file_lock); - return -EBADF; } EXPORT_SYMBOL(sys_close); diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 4a34779..2ad080f 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -463,6 +463,11 @@ static int __init init_openprom_fs(void) static void __exit exit_openprom_fs(void) { unregister_filesystem(&openprom_fs_type); + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(op_inode_cachep); } @@ -1064,9 +1064,8 @@ err_inode: return err; } -int do_pipe_flags(int *fd, int flags) +static int __do_pipe_flags(int *fd, struct file **files, int flags) { - struct file *files[2]; int error; int fdw, fdr; @@ -1088,11 +1087,8 @@ int do_pipe_flags(int *fd, int flags) fdw = error; audit_fd_pair(fdr, fdw); - fd_install(fdr, files[0]); - fd_install(fdw, files[1]); fd[0] = fdr; fd[1] = fdw; - return 0; err_fdr: @@ -1103,21 +1099,38 @@ int do_pipe_flags(int *fd, int flags) return error; } +int do_pipe_flags(int *fd, int flags) +{ + struct file *files[2]; + int error = __do_pipe_flags(fd, files, flags); + if (!error) { + fd_install(fd[0], files[0]); + fd_install(fd[1], files[1]); + } + return error; +} + /* * sys_pipe() is the normal C calling standard for creating * a pipe. It's not the way Unix traditionally does this, though. */ SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags) { + struct file *files[2]; int fd[2]; int error; - error = do_pipe_flags(fd, flags); + error = __do_pipe_flags(fd, files, flags); if (!error) { - if (copy_to_user(fildes, fd, sizeof(fd))) { - sys_close(fd[0]); - sys_close(fd[1]); + if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) { + fput(files[0]); + fput(files[1]); + put_unused_fd(fd[0]); + put_unused_fd(fd[1]); error = -EFAULT; + } else { + fd_install(fd[0], files[0]); + fd_install(fd[1], files[1]); } } return error; diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 5e325a42..8bd2135 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -78,7 +78,8 @@ posix_acl_valid(const struct posix_acl *acl) { const struct posix_acl_entry *pa, *pe; int state = ACL_USER_OBJ; - unsigned int id = 0; /* keep gcc happy */ + kuid_t prev_uid = INVALID_UID; + kgid_t prev_gid = INVALID_GID; int needs_mask = 0; FOREACH_ACL_ENTRY(pa, acl, pe) { @@ -87,7 +88,6 @@ posix_acl_valid(const struct posix_acl *acl) switch (pa->e_tag) { case ACL_USER_OBJ: if (state == ACL_USER_OBJ) { - id = 0; state = ACL_USER; break; } @@ -96,16 +96,17 @@ posix_acl_valid(const struct posix_acl *acl) case ACL_USER: if (state != ACL_USER) return -EINVAL; - if (pa->e_id == ACL_UNDEFINED_ID || - pa->e_id < id) + if (!uid_valid(pa->e_uid)) return -EINVAL; - id = pa->e_id + 1; + if (uid_valid(prev_uid) && + uid_lte(pa->e_uid, prev_uid)) + return -EINVAL; + prev_uid = pa->e_uid; needs_mask = 1; break; case ACL_GROUP_OBJ: if (state == ACL_USER) { - id = 0; state = ACL_GROUP; break; } @@ -114,10 +115,12 @@ posix_acl_valid(const struct posix_acl *acl) case ACL_GROUP: if (state != ACL_GROUP) return -EINVAL; - if (pa->e_id == ACL_UNDEFINED_ID || - pa->e_id < id) + if (!gid_valid(pa->e_gid)) + return -EINVAL; + if (gid_valid(prev_gid) && + gid_lte(pa->e_gid, prev_gid)) return -EINVAL; - id = pa->e_id + 1; + prev_gid = pa->e_gid; needs_mask = 1; break; @@ -195,15 +198,12 @@ posix_acl_from_mode(umode_t mode, gfp_t flags) return ERR_PTR(-ENOMEM); acl->a_entries[0].e_tag = ACL_USER_OBJ; - acl->a_entries[0].e_id = ACL_UNDEFINED_ID; acl->a_entries[0].e_perm = (mode & S_IRWXU) >> 6; acl->a_entries[1].e_tag = ACL_GROUP_OBJ; - acl->a_entries[1].e_id = ACL_UNDEFINED_ID; acl->a_entries[1].e_perm = (mode & S_IRWXG) >> 3; acl->a_entries[2].e_tag = ACL_OTHER; - acl->a_entries[2].e_id = ACL_UNDEFINED_ID; acl->a_entries[2].e_perm = (mode & S_IRWXO); return acl; } @@ -224,11 +224,11 @@ posix_acl_permission(struct inode *inode, const struct posix_acl *acl, int want) switch(pa->e_tag) { case ACL_USER_OBJ: /* (May have been checked already) */ - if (inode->i_uid == current_fsuid()) + if (uid_eq(inode->i_uid, current_fsuid())) goto check_perm; break; case ACL_USER: - if (pa->e_id == current_fsuid()) + if (uid_eq(pa->e_uid, current_fsuid())) goto mask; break; case ACL_GROUP_OBJ: @@ -239,7 +239,7 @@ posix_acl_permission(struct inode *inode, const struct posix_acl *acl, int want) } break; case ACL_GROUP: - if (in_group_p(pa->e_id)) { + if (in_group_p(pa->e_gid)) { found = 1; if ((pa->e_perm & want) == want) goto mask; diff --git a/fs/proc/Makefile b/fs/proc/Makefile index c1c7293..99349ef 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -8,7 +8,7 @@ proc-y := nommu.o task_nommu.o proc-$(CONFIG_MMU) := mmu.o task_mmu.o proc-y += inode.o root.o base.o generic.o array.o \ - proc_tty.o + proc_tty.o fd.o proc-y += cmdline.o proc-y += consoles.o proc-y += cpuinfo.o diff --git a/fs/proc/base.c b/fs/proc/base.c index 1b6c84c..ef5c84b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -90,6 +90,7 @@ #endif #include <trace/events/oom.h> #include "internal.h" +#include "fd.h" /* NOTE: * Implementing inode permission operations in /proc is almost @@ -136,8 +137,6 @@ struct pid_entry { NULL, &proc_single_file_operations, \ { .proc_show = show } ) -static int proc_fd_permission(struct inode *inode, int mask); - /* * Count the number of hardlinks for the pid_entry table, excluding the . * and .. links. @@ -874,111 +873,6 @@ static const struct file_operations proc_environ_operations = { .release = mem_release, }; -static ssize_t oom_adjust_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); - char buffer[PROC_NUMBUF]; - size_t len; - int oom_adjust = OOM_DISABLE; - unsigned long flags; - - if (!task) - return -ESRCH; - - if (lock_task_sighand(task, &flags)) { - oom_adjust = task->signal->oom_adj; - unlock_task_sighand(task, &flags); - } - - put_task_struct(task); - - len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); - - return simple_read_from_buffer(buf, count, ppos, buffer, len); -} - -static ssize_t oom_adjust_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct task_struct *task; - char buffer[PROC_NUMBUF]; - int oom_adjust; - unsigned long flags; - int err; - - memset(buffer, 0, sizeof(buffer)); - if (count > sizeof(buffer) - 1) - count = sizeof(buffer) - 1; - if (copy_from_user(buffer, buf, count)) { - err = -EFAULT; - goto out; - } - - err = kstrtoint(strstrip(buffer), 0, &oom_adjust); - if (err) - goto out; - if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) && - oom_adjust != OOM_DISABLE) { - err = -EINVAL; - goto out; - } - - task = get_proc_task(file->f_path.dentry->d_inode); - if (!task) { - err = -ESRCH; - goto out; - } - - task_lock(task); - if (!task->mm) { - err = -EINVAL; - goto err_task_lock; - } - - if (!lock_task_sighand(task, &flags)) { - err = -ESRCH; - goto err_task_lock; - } - - if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) { - err = -EACCES; - goto err_sighand; - } - - /* - * Warn that /proc/pid/oom_adj is deprecated, see - * Documentation/feature-removal-schedule.txt. - */ - printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", - current->comm, task_pid_nr(current), task_pid_nr(task), - task_pid_nr(task)); - task->signal->oom_adj = oom_adjust; - /* - * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum - * value is always attainable. - */ - if (task->signal->oom_adj == OOM_ADJUST_MAX) - task->signal->oom_score_adj = OOM_SCORE_ADJ_MAX; - else - task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / - -OOM_DISABLE; - trace_oom_score_adj_update(task); -err_sighand: - unlock_task_sighand(task, &flags); -err_task_lock: - task_unlock(task); - put_task_struct(task); -out: - return err < 0 ? err : count; -} - -static const struct file_operations proc_oom_adjust_operations = { - .read = oom_adjust_read, - .write = oom_adjust_write, - .llseek = generic_file_llseek, -}; - static ssize_t oom_score_adj_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -1052,15 +946,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) task->signal->oom_score_adj_min = oom_score_adj; trace_oom_score_adj_update(task); - /* - * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is - * always attainable. - */ - if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) - task->signal->oom_adj = OOM_DISABLE; - else - task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) / - OOM_SCORE_ADJ_MAX; + err_sighand: unlock_task_sighand(task, &flags); err_task_lock: @@ -1089,7 +975,8 @@ static ssize_t proc_loginuid_read(struct file * file, char __user * buf, if (!task) return -ESRCH; length = scnprintf(tmpbuf, TMPBUFLEN, "%u", - audit_get_loginuid(task)); + from_kuid(file->f_cred->user_ns, + audit_get_loginuid(task))); put_task_struct(task); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); } @@ -1101,6 +988,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, char *page, *tmp; ssize_t length; uid_t loginuid; + kuid_t kloginuid; rcu_read_lock(); if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { @@ -1130,7 +1018,13 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, goto out_free_page; } - length = audit_set_loginuid(loginuid); + kloginuid = make_kuid(file->f_cred->user_ns, loginuid); + if (!uid_valid(kloginuid)) { + length = -EINVAL; + goto out_free_page; + } + + length = audit_set_loginuid(kloginuid); if (likely(length == 0)) length = count; @@ -1492,7 +1386,7 @@ out: return error; } -static const struct inode_operations proc_pid_link_inode_operations = { +const struct inode_operations proc_pid_link_inode_operations = { .readlink = proc_pid_readlink, .follow_link = proc_pid_follow_link, .setattr = proc_setattr, @@ -1501,21 +1395,6 @@ static const struct inode_operations proc_pid_link_inode_operations = { /* building an inode */ -static int task_dumpable(struct task_struct *task) -{ - int dumpable = 0; - struct mm_struct *mm; - - task_lock(task); - mm = task->mm; - if (mm) - dumpable = get_dumpable(mm); - task_unlock(task); - if(dumpable == 1) - return 1; - return 0; -} - struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task) { struct inode * inode; @@ -1641,15 +1520,6 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags) return 0; } -static int pid_delete_dentry(const struct dentry * dentry) -{ - /* Is the task we represent dead? - * If so, then don't put the dentry on the lru list, - * kill it immediately. - */ - return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; -} - const struct dentry_operations pid_dentry_operations = { .d_revalidate = pid_revalidate, @@ -1712,289 +1582,6 @@ end_instantiate: return filldir(dirent, name, len, filp->f_pos, ino, type); } -static unsigned name_to_int(struct dentry *dentry) -{ - const char *name = dentry->d_name.name; - int len = dentry->d_name.len; - unsigned n = 0; - - if (len > 1 && *name == '0') - goto out; - while (len-- > 0) { - unsigned c = *name++ - '0'; - if (c > 9) - goto out; - if (n >= (~0U-9)/10) - goto out; - n *= 10; - n += c; - } - return n; -out: - return ~0U; -} - -#define PROC_FDINFO_MAX 64 - -static int proc_fd_info(struct inode *inode, struct path *path, char *info) -{ - struct task_struct *task = get_proc_task(inode); - struct files_struct *files = NULL; - struct file *file; - int fd = proc_fd(inode); - - if (task) { - files = get_files_struct(task); - put_task_struct(task); - } - if (files) { - /* - * We are not taking a ref to the file structure, so we must - * hold ->file_lock. - */ - spin_lock(&files->file_lock); - file = fcheck_files(files, fd); - if (file) { - unsigned int f_flags; - struct fdtable *fdt; - - fdt = files_fdtable(files); - f_flags = file->f_flags & ~O_CLOEXEC; - if (close_on_exec(fd, fdt)) - f_flags |= O_CLOEXEC; - - if (path) { - *path = file->f_path; - path_get(&file->f_path); - } - if (info) - snprintf(info, PROC_FDINFO_MAX, - "pos:\t%lli\n" - "flags:\t0%o\n", - (long long) file->f_pos, - f_flags); - spin_unlock(&files->file_lock); - put_files_struct(files); - return 0; - } - spin_unlock(&files->file_lock); - put_files_struct(files); - } - return -ENOENT; -} - -static int proc_fd_link(struct dentry *dentry, struct path *path) -{ - return proc_fd_info(dentry->d_inode, path, NULL); -} - -static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags) -{ - struct inode *inode; - struct task_struct *task; - int fd; - struct files_struct *files; - const struct cred *cred; - - if (flags & LOOKUP_RCU) - return -ECHILD; - - inode = dentry->d_inode; - task = get_proc_task(inode); - fd = proc_fd(inode); - - if (task) { - files = get_files_struct(task); - if (files) { - struct file *file; - rcu_read_lock(); - file = fcheck_files(files, fd); - if (file) { - unsigned f_mode = file->f_mode; - - rcu_read_unlock(); - put_files_struct(files); - - if (task_dumpable(task)) { - rcu_read_lock(); - cred = __task_cred(task); - inode->i_uid = cred->euid; - inode->i_gid = cred->egid; - rcu_read_unlock(); - } else { - inode->i_uid = GLOBAL_ROOT_UID; - inode->i_gid = GLOBAL_ROOT_GID; - } - - if (S_ISLNK(inode->i_mode)) { - unsigned i_mode = S_IFLNK; - if (f_mode & FMODE_READ) - i_mode |= S_IRUSR | S_IXUSR; - if (f_mode & FMODE_WRITE) - i_mode |= S_IWUSR | S_IXUSR; - inode->i_mode = i_mode; - } - - security_task_to_inode(task, inode); - put_task_struct(task); - return 1; - } - rcu_read_unlock(); - put_files_struct(files); - } - put_task_struct(task); - } - d_drop(dentry); - return 0; -} - -static const struct dentry_operations tid_fd_dentry_operations = -{ - .d_revalidate = tid_fd_revalidate, - .d_delete = pid_delete_dentry, -}; - -static struct dentry *proc_fd_instantiate(struct inode *dir, - struct dentry *dentry, struct task_struct *task, const void *ptr) -{ - unsigned fd = (unsigned long)ptr; - struct inode *inode; - struct proc_inode *ei; - struct dentry *error = ERR_PTR(-ENOENT); - - inode = proc_pid_make_inode(dir->i_sb, task); - if (!inode) - goto out; - ei = PROC_I(inode); - ei->fd = fd; - - inode->i_mode = S_IFLNK; - inode->i_op = &proc_pid_link_inode_operations; - inode->i_size = 64; - ei->op.proc_get_link = proc_fd_link; - d_set_d_op(dentry, &tid_fd_dentry_operations); - d_add(dentry, inode); - /* Close the race of the process dying before we return the dentry */ - if (tid_fd_revalidate(dentry, 0)) - error = NULL; - - out: - return error; -} - -static struct dentry *proc_lookupfd_common(struct inode *dir, - struct dentry *dentry, - instantiate_t instantiate) -{ - struct task_struct *task = get_proc_task(dir); - unsigned fd = name_to_int(dentry); - struct dentry *result = ERR_PTR(-ENOENT); - - if (!task) - goto out_no_task; - if (fd == ~0U) - goto out; - - result = instantiate(dir, dentry, task, (void *)(unsigned long)fd); -out: - put_task_struct(task); -out_no_task: - return result; -} - -static int proc_readfd_common(struct file * filp, void * dirent, - filldir_t filldir, instantiate_t instantiate) -{ - struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; - struct task_struct *p = get_proc_task(inode); - unsigned int fd, ino; - int retval; - struct files_struct * files; - - retval = -ENOENT; - if (!p) - goto out_no_task; - retval = 0; - - fd = filp->f_pos; - switch (fd) { - case 0: - if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) - goto out; - filp->f_pos++; - case 1: - ino = parent_ino(dentry); - if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) - goto out; - filp->f_pos++; - default: - files = get_files_struct(p); - if (!files) - goto out; - rcu_read_lock(); - for (fd = filp->f_pos-2; - fd < files_fdtable(files)->max_fds; - fd++, filp->f_pos++) { - char name[PROC_NUMBUF]; - int len; - int rv; - - if (!fcheck_files(files, fd)) - continue; - rcu_read_unlock(); - - len = snprintf(name, sizeof(name), "%d", fd); - rv = proc_fill_cache(filp, dirent, filldir, - name, len, instantiate, p, - (void *)(unsigned long)fd); - if (rv < 0) - goto out_fd_loop; - rcu_read_lock(); - } - rcu_read_unlock(); -out_fd_loop: - put_files_struct(files); - } -out: - put_task_struct(p); -out_no_task: - return retval; -} - -static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry, - unsigned int flags) -{ - return proc_lookupfd_common(dir, dentry, proc_fd_instantiate); -} - -static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir) -{ - return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate); -} - -static ssize_t proc_fdinfo_read(struct file *file, char __user *buf, - size_t len, loff_t *ppos) -{ - char tmp[PROC_FDINFO_MAX]; - int err = proc_fd_info(file->f_path.dentry->d_inode, NULL, tmp); - if (!err) - err = simple_read_from_buffer(buf, len, ppos, tmp, strlen(tmp)); - return err; -} - -static const struct file_operations proc_fdinfo_file_operations = { - .open = nonseekable_open, - .read = proc_fdinfo_read, - .llseek = no_llseek, -}; - -static const struct file_operations proc_fd_operations = { - .read = generic_read_dir, - .readdir = proc_readfd, - .llseek = default_llseek, -}; - #ifdef CONFIG_CHECKPOINT_RESTORE /* @@ -2113,7 +1700,7 @@ out: } struct map_files_info { - struct file *file; + fmode_t mode; unsigned long len; unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ }; @@ -2122,13 +1709,10 @@ static struct dentry * proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { - const struct file *file = ptr; + fmode_t mode = (fmode_t)(unsigned long)ptr; struct proc_inode *ei; struct inode *inode; - if (!file) - return ERR_PTR(-ENOENT); - inode = proc_pid_make_inode(dir->i_sb, task); if (!inode) return ERR_PTR(-ENOENT); @@ -2140,9 +1724,9 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, inode->i_size = 64; inode->i_mode = S_IFLNK; - if (file->f_mode & FMODE_READ) + if (mode & FMODE_READ) inode->i_mode |= S_IRUSR; - if (file->f_mode & FMODE_WRITE) + if (mode & FMODE_WRITE) inode->i_mode |= S_IWUSR; d_set_d_op(dentry, &tid_map_files_dentry_operations); @@ -2186,7 +1770,8 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, if (!vma) goto out_no_vma; - result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file); + result = proc_map_files_instantiate(dir, dentry, task, + (void *)(unsigned long)vma->vm_file->f_mode); out_no_vma: up_read(&mm->mmap_sem); @@ -2287,8 +1872,7 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) if (++pos <= filp->f_pos) continue; - get_file(vma->vm_file); - info.file = vma->vm_file; + info.mode = vma->vm_file->f_mode; info.len = snprintf(info.name, sizeof(info.name), "%lx-%lx", vma->vm_start, vma->vm_end); @@ -2303,19 +1887,11 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) ret = proc_fill_cache(filp, dirent, filldir, p->name, p->len, proc_map_files_instantiate, - task, p->file); + task, + (void *)(unsigned long)p->mode); if (ret) break; filp->f_pos++; - fput(p->file); - } - for (; i < nr_files; i++) { - /* - * In case of error don't forget - * to put rest of file refs. - */ - p = flex_array_get(fa, i); - fput(p->file); } if (fa) flex_array_free(fa); @@ -2337,82 +1913,6 @@ static const struct file_operations proc_map_files_operations = { #endif /* CONFIG_CHECKPOINT_RESTORE */ -/* - * /proc/pid/fd needs a special permission handler so that a process can still - * access /proc/self/fd after it has executed a setuid(). - */ -static int proc_fd_permission(struct inode *inode, int mask) -{ - int rv = generic_permission(inode, mask); - if (rv == 0) - return 0; - if (task_pid(current) == proc_pid(inode)) - rv = 0; - return rv; -} - -/* - * proc directories can do almost nothing.. - */ -static const struct inode_operations proc_fd_inode_operations = { - .lookup = proc_lookupfd, - .permission = proc_fd_permission, - .setattr = proc_setattr, -}; - -static struct dentry *proc_fdinfo_instantiate(struct inode *dir, - struct dentry *dentry, struct task_struct *task, const void *ptr) -{ - unsigned fd = (unsigned long)ptr; - struct inode *inode; - struct proc_inode *ei; - struct dentry *error = ERR_PTR(-ENOENT); - - inode = proc_pid_make_inode(dir->i_sb, task); - if (!inode) - goto out; - ei = PROC_I(inode); - ei->fd = fd; - inode->i_mode = S_IFREG | S_IRUSR; - inode->i_fop = &proc_fdinfo_file_operations; - d_set_d_op(dentry, &tid_fd_dentry_operations); - d_add(dentry, inode); - /* Close the race of the process dying before we return the dentry */ - if (tid_fd_revalidate(dentry, 0)) - error = NULL; - - out: - return error; -} - -static struct dentry *proc_lookupfdinfo(struct inode *dir, - struct dentry *dentry, - unsigned int flags) -{ - return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate); -} - -static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir) -{ - return proc_readfd_common(filp, dirent, filldir, - proc_fdinfo_instantiate); -} - -static const struct file_operations proc_fdinfo_operations = { - .read = generic_read_dir, - .readdir = proc_readfdinfo, - .llseek = default_llseek, -}; - -/* - * proc directories can do almost nothing.. - */ -static const struct inode_operations proc_fdinfo_inode_operations = { - .lookup = proc_lookupfdinfo, - .setattr = proc_setattr, -}; - - static struct dentry *proc_pident_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { @@ -2983,6 +2483,11 @@ static int proc_gid_map_open(struct inode *inode, struct file *file) return proc_id_map_open(inode, file, &proc_gid_seq_operations); } +static int proc_projid_map_open(struct inode *inode, struct file *file) +{ + return proc_id_map_open(inode, file, &proc_projid_seq_operations); +} + static const struct file_operations proc_uid_map_operations = { .open = proc_uid_map_open, .write = proc_uid_map_write, @@ -2998,6 +2503,14 @@ static const struct file_operations proc_gid_map_operations = { .llseek = seq_lseek, .release = proc_id_map_release, }; + +static const struct file_operations proc_projid_map_operations = { + .open = proc_projid_map_open, + .write = proc_projid_map_write, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_id_map_release, +}; #endif /* CONFIG_USER_NS */ static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, @@ -3084,7 +2597,6 @@ static const struct pid_entry tgid_base_stuff[] = { REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif INF("oom_score", S_IRUGO, proc_oom_score), - REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), @@ -3105,6 +2617,7 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_USER_NS REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), + REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), #endif }; @@ -3450,7 +2963,6 @@ static const struct pid_entry tid_base_stuff[] = { REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif INF("oom_score", S_IRUGO, proc_oom_score), - REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), @@ -3468,6 +2980,7 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_USER_NS REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), + REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), #endif }; diff --git a/fs/proc/fd.c b/fs/proc/fd.c new file mode 100644 index 0000000..f28a875 --- /dev/null +++ b/fs/proc/fd.c @@ -0,0 +1,367 @@ +#include <linux/sched.h> +#include <linux/errno.h> +#include <linux/dcache.h> +#include <linux/path.h> +#include <linux/fdtable.h> +#include <linux/namei.h> +#include <linux/pid.h> +#include <linux/security.h> +#include <linux/file.h> +#include <linux/seq_file.h> + +#include <linux/proc_fs.h> + +#include "internal.h" +#include "fd.h" + +static int seq_show(struct seq_file *m, void *v) +{ + struct files_struct *files = NULL; + int f_flags = 0, ret = -ENOENT; + struct file *file = NULL; + struct task_struct *task; + + task = get_proc_task(m->private); + if (!task) + return -ENOENT; + + files = get_files_struct(task); + put_task_struct(task); + + if (files) { + int fd = proc_fd(m->private); + + spin_lock(&files->file_lock); + file = fcheck_files(files, fd); + if (file) { + struct fdtable *fdt = files_fdtable(files); + + f_flags = file->f_flags; + if (close_on_exec(fd, fdt)) + f_flags |= O_CLOEXEC; + + get_file(file); + ret = 0; + } + spin_unlock(&files->file_lock); + put_files_struct(files); + } + + if (!ret) { + seq_printf(m, "pos:\t%lli\nflags:\t0%o\n", + (long long)file->f_pos, f_flags); + fput(file); + } + + return ret; +} + +static int seq_fdinfo_open(struct inode *inode, struct file *file) +{ + return single_open(file, seq_show, inode); +} + +static const struct file_operations proc_fdinfo_file_operations = { + .open = seq_fdinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct files_struct *files; + struct task_struct *task; + const struct cred *cred; + struct inode *inode; + int fd; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + task = get_proc_task(inode); + fd = proc_fd(inode); + + if (task) { + files = get_files_struct(task); + if (files) { + struct file *file; + + rcu_read_lock(); + file = fcheck_files(files, fd); + if (file) { + unsigned f_mode = file->f_mode; + + rcu_read_unlock(); + put_files_struct(files); + + if (task_dumpable(task)) { + rcu_read_lock(); + cred = __task_cred(task); + inode->i_uid = cred->euid; + inode->i_gid = cred->egid; + rcu_read_unlock(); + } else { + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + } + + if (S_ISLNK(inode->i_mode)) { + unsigned i_mode = S_IFLNK; + if (f_mode & FMODE_READ) + i_mode |= S_IRUSR | S_IXUSR; + if (f_mode & FMODE_WRITE) + i_mode |= S_IWUSR | S_IXUSR; + inode->i_mode = i_mode; + } + + security_task_to_inode(task, inode); + put_task_struct(task); + return 1; + } + rcu_read_unlock(); + put_files_struct(files); + } + put_task_struct(task); + } + + d_drop(dentry); + return 0; +} + +static const struct dentry_operations tid_fd_dentry_operations = { + .d_revalidate = tid_fd_revalidate, + .d_delete = pid_delete_dentry, +}; + +static int proc_fd_link(struct dentry *dentry, struct path *path) +{ + struct files_struct *files = NULL; + struct task_struct *task; + int ret = -ENOENT; + + task = get_proc_task(dentry->d_inode); + if (task) { + files = get_files_struct(task); + put_task_struct(task); + } + + if (files) { + int fd = proc_fd(dentry->d_inode); + struct file *fd_file; + + spin_lock(&files->file_lock); + fd_file = fcheck_files(files, fd); + if (fd_file) { + *path = fd_file->f_path; + path_get(&fd_file->f_path); + ret = 0; + } + spin_unlock(&files->file_lock); + put_files_struct(files); + } + + return ret; +} + +static struct dentry * +proc_fd_instantiate(struct inode *dir, struct dentry *dentry, + struct task_struct *task, const void *ptr) +{ + struct dentry *error = ERR_PTR(-ENOENT); + unsigned fd = (unsigned long)ptr; + struct proc_inode *ei; + struct inode *inode; + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + goto out; + + ei = PROC_I(inode); + ei->fd = fd; + + inode->i_mode = S_IFLNK; + inode->i_op = &proc_pid_link_inode_operations; + inode->i_size = 64; + + ei->op.proc_get_link = proc_fd_link; + + d_set_d_op(dentry, &tid_fd_dentry_operations); + d_add(dentry, inode); + + /* Close the race of the process dying before we return the dentry */ + if (tid_fd_revalidate(dentry, 0)) + error = NULL; + out: + return error; +} + +static struct dentry *proc_lookupfd_common(struct inode *dir, + struct dentry *dentry, + instantiate_t instantiate) +{ + struct task_struct *task = get_proc_task(dir); + struct dentry *result = ERR_PTR(-ENOENT); + unsigned fd = name_to_int(dentry); + + if (!task) + goto out_no_task; + if (fd == ~0U) + goto out; + + result = instantiate(dir, dentry, task, (void *)(unsigned long)fd); +out: + put_task_struct(task); +out_no_task: + return result; +} + +static int proc_readfd_common(struct file * filp, void * dirent, + filldir_t filldir, instantiate_t instantiate) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct task_struct *p = get_proc_task(inode); + struct files_struct *files; + unsigned int fd, ino; + int retval; + + retval = -ENOENT; + if (!p) + goto out_no_task; + retval = 0; + + fd = filp->f_pos; + switch (fd) { + case 0: + if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) + goto out; + filp->f_pos++; + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) + goto out; + filp->f_pos++; + default: + files = get_files_struct(p); + if (!files) + goto out; + rcu_read_lock(); + for (fd = filp->f_pos - 2; + fd < files_fdtable(files)->max_fds; + fd++, filp->f_pos++) { + char name[PROC_NUMBUF]; + int len; + int rv; + + if (!fcheck_files(files, fd)) + continue; + rcu_read_unlock(); + + len = snprintf(name, sizeof(name), "%d", fd); + rv = proc_fill_cache(filp, dirent, filldir, + name, len, instantiate, p, + (void *)(unsigned long)fd); + if (rv < 0) + goto out_fd_loop; + rcu_read_lock(); + } + rcu_read_unlock(); +out_fd_loop: + put_files_struct(files); + } +out: + put_task_struct(p); +out_no_task: + return retval; +} + +static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir) +{ + return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate); +} + +const struct file_operations proc_fd_operations = { + .read = generic_read_dir, + .readdir = proc_readfd, + .llseek = default_llseek, +}; + +static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + return proc_lookupfd_common(dir, dentry, proc_fd_instantiate); +} + +/* + * /proc/pid/fd needs a special permission handler so that a process can still + * access /proc/self/fd after it has executed a setuid(). + */ +int proc_fd_permission(struct inode *inode, int mask) +{ + int rv = generic_permission(inode, mask); + if (rv == 0) + return 0; + if (task_pid(current) == proc_pid(inode)) + rv = 0; + return rv; +} + +const struct inode_operations proc_fd_inode_operations = { + .lookup = proc_lookupfd, + .permission = proc_fd_permission, + .setattr = proc_setattr, +}; + +static struct dentry * +proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry, + struct task_struct *task, const void *ptr) +{ + struct dentry *error = ERR_PTR(-ENOENT); + unsigned fd = (unsigned long)ptr; + struct proc_inode *ei; + struct inode *inode; + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + goto out; + + ei = PROC_I(inode); + ei->fd = fd; + + inode->i_mode = S_IFREG | S_IRUSR; + inode->i_fop = &proc_fdinfo_file_operations; + + d_set_d_op(dentry, &tid_fd_dentry_operations); + d_add(dentry, inode); + + /* Close the race of the process dying before we return the dentry */ + if (tid_fd_revalidate(dentry, 0)) + error = NULL; + out: + return error; +} + +static struct dentry * +proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ + return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate); +} + +static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir) +{ + return proc_readfd_common(filp, dirent, filldir, + proc_fdinfo_instantiate); +} + +const struct inode_operations proc_fdinfo_inode_operations = { + .lookup = proc_lookupfdinfo, + .setattr = proc_setattr, +}; + +const struct file_operations proc_fdinfo_operations = { + .read = generic_read_dir, + .readdir = proc_readfdinfo, + .llseek = default_llseek, +}; diff --git a/fs/proc/fd.h b/fs/proc/fd.h new file mode 100644 index 0000000..cbb1d47 --- /dev/null +++ b/fs/proc/fd.h @@ -0,0 +1,14 @@ +#ifndef __PROCFS_FD_H__ +#define __PROCFS_FD_H__ + +#include <linux/fs.h> + +extern const struct file_operations proc_fd_operations; +extern const struct inode_operations proc_fd_inode_operations; + +extern const struct file_operations proc_fdinfo_operations; +extern const struct inode_operations proc_fdinfo_inode_operations; + +extern int proc_fd_permission(struct inode *inode, int mask); + +#endif /* __PROCFS_FD_H__ */ diff --git a/fs/proc/generic.c b/fs/proc/generic.c index b3647fe..0d80cef 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -427,7 +427,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { pde_get(de); spin_unlock(&proc_subdir_lock); - error = -EINVAL; + error = -ENOMEM; inode = proc_get_inode(dir->i_sb, de); goto out_unlock; } @@ -605,7 +605,8 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, unsigned int len; /* make sure name is valid */ - if (!name || !strlen(name)) goto out; + if (!name || !strlen(name)) + goto out; if (xlate_proc_name(name, parent, &fn) != 0) goto out; @@ -616,20 +617,18 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, len = strlen(fn); - ent = kmalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL); - if (!ent) goto out; + ent = kzalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL); + if (!ent) + goto out; - memset(ent, 0, sizeof(struct proc_dir_entry)); memcpy(ent->name, fn, len + 1); ent->namelen = len; ent->mode = mode; ent->nlink = nlink; atomic_set(&ent->count, 1); - ent->pde_users = 0; spin_lock_init(&ent->pde_unload_lock); - ent->pde_unload_completion = NULL; INIT_LIST_HEAD(&ent->pde_openers); - out: +out: return ent; } diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 7ac817b..3b22bbd 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -450,7 +450,6 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) return NULL; if (inode->i_state & I_NEW) { inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - PROC_I(inode)->fd = 0; PROC_I(inode)->pde = de; if (de->mode) { diff --git a/fs/proc/internal.h b/fs/proc/internal.h index e1167a1..cceaab0 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -9,6 +9,7 @@ * 2 of the License, or (at your option) any later version. */ +#include <linux/sched.h> #include <linux/proc_fs.h> struct ctl_table_header; @@ -65,6 +66,7 @@ extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; extern const struct file_operations proc_net_operations; extern const struct inode_operations proc_net_inode_operations; +extern const struct inode_operations proc_pid_link_inode_operations; struct proc_maps_private { struct pid *pid; @@ -91,6 +93,52 @@ static inline int proc_fd(struct inode *inode) return PROC_I(inode)->fd; } +static inline int task_dumpable(struct task_struct *task) +{ + int dumpable = 0; + struct mm_struct *mm; + + task_lock(task); + mm = task->mm; + if (mm) + dumpable = get_dumpable(mm); + task_unlock(task); + if (dumpable == SUID_DUMPABLE_ENABLED) + return 1; + return 0; +} + +static inline int pid_delete_dentry(const struct dentry * dentry) +{ + /* Is the task we represent dead? + * If so, then don't put the dentry on the lru list, + * kill it immediately. + */ + return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; +} + +static inline unsigned name_to_int(struct dentry *dentry) +{ + const char *name = dentry->d_name.name; + int len = dentry->d_name.len; + unsigned n = 0; + + if (len > 1 && *name == '0') + goto out; + while (len-- > 0) { + unsigned c = *name++ - '0'; + if (c > 9) + goto out; + if (n >= (~0U-9)/10) + goto out; + n *= 10; + n += c; + } + return n; +out: + return ~0U; +} + struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino, struct dentry *dentry); int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, diff --git a/fs/proc/page.c b/fs/proc/page.c index 7fcd0d6..b8730d9 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -115,7 +115,13 @@ u64 stable_page_flags(struct page *page) u |= 1 << KPF_COMPOUND_TAIL; if (PageHuge(page)) u |= 1 << KPF_HUGE; - else if (PageTransCompound(page)) + /* + * PageTransCompound can be true for non-huge compound pages (slab + * pages or pages allocated by drivers with __GFP_COMP) because it + * just checks PG_head/PG_tail, so we need to check PageLRU to make + * sure a given page is a thp, not a non-huge compound page. + */ + else if (PageTransCompound(page) && PageLRU(compound_trans_head(page))) u |= 1 << KPF_THP; /* diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index dfafeb2..a781bdf 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -142,6 +142,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry) } rb_link_node(node, parent, p); + rb_insert_color(node, &head->parent->root); return 0; } @@ -168,10 +169,8 @@ static void init_header(struct ctl_table_header *head, head->node = node; if (node) { struct ctl_table *entry; - for (entry = table; entry->procname; entry++, node++) { - rb_init_node(&node->node); + for (entry = table; entry->procname; entry++, node++) node->header = head; - } } } @@ -266,8 +265,7 @@ void sysctl_head_put(struct ctl_table_header *head) static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) { - if (!head) - BUG(); + BUG_ON(!head); spin_lock(&sysctl_lock); if (!use_table(head)) head = ERR_PTR(-ENOENT); @@ -462,9 +460,6 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, err = ERR_PTR(-ENOMEM); inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); - if (h) - sysctl_head_finish(h); - if (!inode) goto out; @@ -473,6 +468,8 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, d_add(dentry, inode); out: + if (h) + sysctl_head_finish(h); sysctl_head_finish(head); return err; } diff --git a/fs/proc/root.c b/fs/proc/root.c index 9a2d9fd..9889a92 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -61,7 +61,7 @@ static int proc_parse_options(char *options, struct pid_namespace *pid) if (!*p) continue; - args[0].to = args[0].from = 0; + args[0].to = args[0].from = NULL; token = match_token(p, tokens, args); switch (token) { case Opt_gid: diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 4540b8f..79827ce 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -54,7 +54,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmPTE:\t%8lu kB\n" "VmSwap:\t%8lu kB\n", hiwater_vm << (PAGE_SHIFT-10), - (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), + total_vm << (PAGE_SHIFT-10), mm->locked_vm << (PAGE_SHIFT-10), mm->pinned_vm << (PAGE_SHIFT-10), hiwater_rss << (PAGE_SHIFT-10), diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index d39bb5c..ca71db6 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -23,6 +23,7 @@ config PSTORE_FTRACE bool "Persistent function tracer" depends on PSTORE depends on FUNCTION_TRACER + depends on DEBUG_FS help With this option kernel traces function calls into a persistent ram buffer that can be decoded and dumped after reboot through diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c index a130d48..2d57e1a 100644 --- a/fs/pstore/ftrace.c +++ b/fs/pstore/ftrace.c @@ -17,19 +17,113 @@ #include <linux/percpu.h> #include <linux/smp.h> #include <linux/atomic.h> +#include <linux/types.h> +#include <linux/mutex.h> +#include <linux/ftrace.h> +#include <linux/fs.h> +#include <linux/debugfs.h> +#include <linux/err.h> +#include <linux/cache.h> #include <asm/barrier.h> #include "internal.h" -void notrace pstore_ftrace_call(unsigned long ip, unsigned long parent_ip) +static void notrace pstore_ftrace_call(unsigned long ip, + unsigned long parent_ip) { + unsigned long flags; struct pstore_ftrace_record rec = {}; if (unlikely(oops_in_progress)) return; + local_irq_save(flags); + rec.ip = ip; rec.parent_ip = parent_ip; pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id()); psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec, sizeof(rec), psinfo); + + local_irq_restore(flags); +} + +static struct ftrace_ops pstore_ftrace_ops __read_mostly = { + .func = pstore_ftrace_call, +}; + +static DEFINE_MUTEX(pstore_ftrace_lock); +static bool pstore_ftrace_enabled; + +static ssize_t pstore_ftrace_knob_write(struct file *f, const char __user *buf, + size_t count, loff_t *ppos) +{ + u8 on; + ssize_t ret; + + ret = kstrtou8_from_user(buf, count, 2, &on); + if (ret) + return ret; + + mutex_lock(&pstore_ftrace_lock); + + if (!on ^ pstore_ftrace_enabled) + goto out; + + if (on) + ret = register_ftrace_function(&pstore_ftrace_ops); + else + ret = unregister_ftrace_function(&pstore_ftrace_ops); + if (ret) { + pr_err("%s: unable to %sregister ftrace ops: %zd\n", + __func__, on ? "" : "un", ret); + goto err; + } + + pstore_ftrace_enabled = on; +out: + ret = count; +err: + mutex_unlock(&pstore_ftrace_lock); + + return ret; +} + +static ssize_t pstore_ftrace_knob_read(struct file *f, char __user *buf, + size_t count, loff_t *ppos) +{ + char val[] = { '0' + pstore_ftrace_enabled, '\n' }; + + return simple_read_from_buffer(buf, count, ppos, val, sizeof(val)); +} + +static const struct file_operations pstore_knob_fops = { + .open = simple_open, + .read = pstore_ftrace_knob_read, + .write = pstore_ftrace_knob_write, +}; + +void pstore_register_ftrace(void) +{ + struct dentry *dir; + struct dentry *file; + + if (!psinfo->write_buf) + return; + + dir = debugfs_create_dir("pstore", NULL); + if (!dir) { + pr_err("%s: unable to create pstore directory\n", __func__); + return; + } + + file = debugfs_create_file("record_ftrace", 0600, dir, NULL, + &pstore_knob_fops); + if (!file) { + pr_err("%s: unable to create record_ftrace file\n", __func__); + goto err_file; + } + + return; +err_file: + debugfs_remove(dir); } diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 0d0d3b7..4847f58 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -39,6 +39,12 @@ pstore_ftrace_decode_cpu(struct pstore_ftrace_record *rec) #endif } +#ifdef CONFIG_PSTORE_FTRACE +extern void pstore_register_ftrace(void); +#else +static inline void pstore_register_ftrace(void) {} +#endif + extern struct pstore_info *psinfo; extern void pstore_set_kmsg_bytes(int); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 29996e8..a40da07 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -164,7 +164,13 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) if (c > psinfo->bufsize) c = psinfo->bufsize; - spin_lock_irqsave(&psinfo->buf_lock, flags); + + if (oops_in_progress) { + if (!spin_trylock_irqsave(&psinfo->buf_lock, flags)) + break; + } else { + spin_lock_irqsave(&psinfo->buf_lock, flags); + } memcpy(psinfo->buf, s, c); psinfo->write(PSTORE_TYPE_CONSOLE, 0, NULL, 0, c, psinfo); spin_unlock_irqrestore(&psinfo->buf_lock, flags); @@ -236,6 +242,7 @@ int pstore_register(struct pstore_info *psi) kmsg_dump_register(&pstore_dumper); pstore_register_console(); + pstore_register_ftrace(); if (pstore_update_ms >= 0) { pstore_timer.expires = jiffies + diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 0b311bc..1a4f6da 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -32,6 +32,7 @@ #include <linux/ioport.h> #include <linux/platform_device.h> #include <linux/slab.h> +#include <linux/compiler.h> #include <linux/pstore_ram.h> #define RAMOOPS_KERNMSG_HDR "====" @@ -181,12 +182,11 @@ static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz) return len; } - -static int ramoops_pstore_write_buf(enum pstore_type_id type, - enum kmsg_dump_reason reason, - u64 *id, unsigned int part, - const char *buf, size_t size, - struct pstore_info *psi) +static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, + enum kmsg_dump_reason reason, + u64 *id, unsigned int part, + const char *buf, size_t size, + struct pstore_info *psi) { struct ramoops_context *cxt = psi->data; struct persistent_ram_zone *prz = cxt->przs[cxt->dump_write_cnt]; @@ -406,7 +406,7 @@ static int __devinit ramoops_probe(struct platform_device *pdev) goto fail_init_fprz; if (!cxt->przs && !cxt->cprz && !cxt->fprz) { - pr_err("memory size too small, minimum is %lu\n", + pr_err("memory size too small, minimum is %zu\n", cxt->console_size + cxt->record_size + cxt->ftrace_size); goto fail_cnt; @@ -414,13 +414,14 @@ static int __devinit ramoops_probe(struct platform_device *pdev) cxt->pstore.data = cxt; /* - * Console can handle any buffer size, so prefer dumps buffer - * size since usually it is smaller. + * Console can handle any buffer size, so prefer LOG_LINE_MAX. If we + * have to handle dumps, we must have at least record_size buffer. And + * for ftrace, bufsize is irrelevant (if bufsize is 0, buf will be + * ZERO_SIZE_PTR). */ - if (cxt->przs) - cxt->pstore.bufsize = cxt->przs[0]->buffer_size; - else - cxt->pstore.bufsize = cxt->cprz->buffer_size; + if (cxt->console_size) + cxt->pstore.bufsize = 1024; /* LOG_LINE_MAX */ + cxt->pstore.bufsize = max(cxt->record_size, cxt->pstore.bufsize); cxt->pstore.buf = kmalloc(cxt->pstore.bufsize, GFP_KERNEL); spin_lock_init(&cxt->pstore.buf_lock); if (!cxt->pstore.buf) { @@ -537,6 +538,7 @@ postcore_initcall(ramoops_init); static void __exit ramoops_exit(void) { platform_driver_unregister(&ramoops_driver); + platform_device_unregister(dummy); kfree(dummy_data); } module_exit(ramoops_exit); diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 552e994..43098bb 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -312,8 +312,8 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino) (ino % QNX4_INODES_PER_BLOCK); inode->i_mode = le16_to_cpu(raw_inode->di_mode); - inode->i_uid = (uid_t)le16_to_cpu(raw_inode->di_uid); - inode->i_gid = (gid_t)le16_to_cpu(raw_inode->di_gid); + i_uid_write(inode, (uid_t)le16_to_cpu(raw_inode->di_uid)); + i_gid_write(inode, (gid_t)le16_to_cpu(raw_inode->di_gid)); set_nlink(inode, le16_to_cpu(raw_inode->di_nlink)); inode->i_size = le32_to_cpu(raw_inode->di_size); inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->di_mtime); @@ -391,6 +391,11 @@ static int init_inodecache(void) static void destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(qnx4_inode_cachep); } diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 2049c81..b6addf5 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -574,8 +574,8 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino) raw_inode = ((struct qnx6_inode_entry *)page_address(page)) + offs; inode->i_mode = fs16_to_cpu(sbi, raw_inode->di_mode); - inode->i_uid = (uid_t)fs32_to_cpu(sbi, raw_inode->di_uid); - inode->i_gid = (gid_t)fs32_to_cpu(sbi, raw_inode->di_gid); + i_uid_write(inode, (uid_t)fs32_to_cpu(sbi, raw_inode->di_uid)); + i_gid_write(inode, (gid_t)fs32_to_cpu(sbi, raw_inode->di_gid)); inode->i_size = fs64_to_cpu(sbi, raw_inode->di_size); inode->i_mtime.tv_sec = fs32_to_cpu(sbi, raw_inode->di_mtime); inode->i_mtime.tv_nsec = 0; @@ -651,6 +651,11 @@ static int init_inodecache(void) static void destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(qnx6_inode_cachep); } diff --git a/fs/quota/Makefile b/fs/quota/Makefile index 5f9e9e2..c66c37c 100644 --- a/fs/quota/Makefile +++ b/fs/quota/Makefile @@ -2,6 +2,6 @@ obj-$(CONFIG_QUOTA) += dquot.o obj-$(CONFIG_QFMT_V1) += quota_v1.o obj-$(CONFIG_QFMT_V2) += quota_v2.o obj-$(CONFIG_QUOTA_TREE) += quota_tree.o -obj-$(CONFIG_QUOTACTL) += quota.o +obj-$(CONFIG_QUOTACTL) += quota.o kqid.o obj-$(CONFIG_QUOTACTL_COMPAT) += compat.o obj-$(CONFIG_QUOTA_NETLINK_INTERFACE) += netlink.o diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index c495a30..557a9c20 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -253,8 +253,10 @@ static qsize_t inode_get_rsv_space(struct inode *inode); static void __dquot_initialize(struct inode *inode, int type); static inline unsigned int -hashfn(const struct super_block *sb, unsigned int id, int type) +hashfn(const struct super_block *sb, struct kqid qid) { + unsigned int id = from_kqid(&init_user_ns, qid); + int type = qid.type; unsigned long tmp; tmp = (((unsigned long)sb>>L1_CACHE_SHIFT) ^ id) * (MAXQUOTAS - type); @@ -267,7 +269,7 @@ hashfn(const struct super_block *sb, unsigned int id, int type) static inline void insert_dquot_hash(struct dquot *dquot) { struct hlist_head *head; - head = dquot_hash + hashfn(dquot->dq_sb, dquot->dq_id, dquot->dq_type); + head = dquot_hash + hashfn(dquot->dq_sb, dquot->dq_id); hlist_add_head(&dquot->dq_hash, head); } @@ -277,15 +279,14 @@ static inline void remove_dquot_hash(struct dquot *dquot) } static struct dquot *find_dquot(unsigned int hashent, struct super_block *sb, - unsigned int id, int type) + struct kqid qid) { struct hlist_node *node; struct dquot *dquot; hlist_for_each (node, dquot_hash+hashent) { dquot = hlist_entry(node, struct dquot, dq_hash); - if (dquot->dq_sb == sb && dquot->dq_id == id && - dquot->dq_type == type) + if (dquot->dq_sb == sb && qid_eq(dquot->dq_id, qid)) return dquot; } return NULL; @@ -351,7 +352,7 @@ int dquot_mark_dquot_dirty(struct dquot *dquot) spin_lock(&dq_list_lock); if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) { list_add(&dquot->dq_dirty, &sb_dqopt(dquot->dq_sb)-> - info[dquot->dq_type].dqi_dirty_list); + info[dquot->dq_id.type].dqi_dirty_list); ret = 0; } spin_unlock(&dq_list_lock); @@ -410,17 +411,17 @@ int dquot_acquire(struct dquot *dquot) mutex_lock(&dquot->dq_lock); mutex_lock(&dqopt->dqio_mutex); if (!test_bit(DQ_READ_B, &dquot->dq_flags)) - ret = dqopt->ops[dquot->dq_type]->read_dqblk(dquot); + ret = dqopt->ops[dquot->dq_id.type]->read_dqblk(dquot); if (ret < 0) goto out_iolock; set_bit(DQ_READ_B, &dquot->dq_flags); /* Instantiate dquot if needed */ if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && !dquot->dq_off) { - ret = dqopt->ops[dquot->dq_type]->commit_dqblk(dquot); + ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot); /* Write the info if needed */ - if (info_dirty(&dqopt->info[dquot->dq_type])) { - ret2 = dqopt->ops[dquot->dq_type]->write_file_info( - dquot->dq_sb, dquot->dq_type); + if (info_dirty(&dqopt->info[dquot->dq_id.type])) { + ret2 = dqopt->ops[dquot->dq_id.type]->write_file_info( + dquot->dq_sb, dquot->dq_id.type); } if (ret < 0) goto out_iolock; @@ -455,7 +456,7 @@ int dquot_commit(struct dquot *dquot) /* Inactive dquot can be only if there was error during read/init * => we have better not writing it */ if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) - ret = dqopt->ops[dquot->dq_type]->commit_dqblk(dquot); + ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot); else ret = -EIO; out_sem: @@ -477,12 +478,12 @@ int dquot_release(struct dquot *dquot) if (atomic_read(&dquot->dq_count) > 1) goto out_dqlock; mutex_lock(&dqopt->dqio_mutex); - if (dqopt->ops[dquot->dq_type]->release_dqblk) { - ret = dqopt->ops[dquot->dq_type]->release_dqblk(dquot); + if (dqopt->ops[dquot->dq_id.type]->release_dqblk) { + ret = dqopt->ops[dquot->dq_id.type]->release_dqblk(dquot); /* Write the info */ - if (info_dirty(&dqopt->info[dquot->dq_type])) { - ret2 = dqopt->ops[dquot->dq_type]->write_file_info( - dquot->dq_sb, dquot->dq_type); + if (info_dirty(&dqopt->info[dquot->dq_id.type])) { + ret2 = dqopt->ops[dquot->dq_id.type]->write_file_info( + dquot->dq_sb, dquot->dq_id.type); } if (ret >= 0) ret = ret2; @@ -521,7 +522,7 @@ restart: list_for_each_entry_safe(dquot, tmp, &inuse_list, dq_inuse) { if (dquot->dq_sb != sb) continue; - if (dquot->dq_type != type) + if (dquot->dq_id.type != type) continue; /* Wait for dquot users */ if (atomic_read(&dquot->dq_count)) { @@ -741,7 +742,8 @@ void dqput(struct dquot *dquot) #ifdef CONFIG_QUOTA_DEBUG if (!atomic_read(&dquot->dq_count)) { quota_error(dquot->dq_sb, "trying to free free dquot of %s %d", - quotatypes[dquot->dq_type], dquot->dq_id); + quotatypes[dquot->dq_id.type], + from_kqid(&init_user_ns, dquot->dq_id)); BUG(); } #endif @@ -752,7 +754,7 @@ we_slept: /* We have more than one user... nothing to do */ atomic_dec(&dquot->dq_count); /* Releasing dquot during quotaoff phase? */ - if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_type) && + if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_id.type) && atomic_read(&dquot->dq_count) == 1) wake_up(&dquot->dq_wait_unused); spin_unlock(&dq_list_lock); @@ -815,7 +817,7 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type) INIT_LIST_HEAD(&dquot->dq_dirty); init_waitqueue_head(&dquot->dq_wait_unused); dquot->dq_sb = sb; - dquot->dq_type = type; + dquot->dq_id = make_kqid_invalid(type); atomic_set(&dquot->dq_count, 1); return dquot; @@ -829,35 +831,35 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type) * a) checking for quota flags under dq_list_lock and * b) getting a reference to dquot before we release dq_list_lock */ -struct dquot *dqget(struct super_block *sb, unsigned int id, int type) +struct dquot *dqget(struct super_block *sb, struct kqid qid) { - unsigned int hashent = hashfn(sb, id, type); + unsigned int hashent = hashfn(sb, qid); struct dquot *dquot = NULL, *empty = NULL; - if (!sb_has_quota_active(sb, type)) + if (!sb_has_quota_active(sb, qid.type)) return NULL; we_slept: spin_lock(&dq_list_lock); spin_lock(&dq_state_lock); - if (!sb_has_quota_active(sb, type)) { + if (!sb_has_quota_active(sb, qid.type)) { spin_unlock(&dq_state_lock); spin_unlock(&dq_list_lock); goto out; } spin_unlock(&dq_state_lock); - dquot = find_dquot(hashent, sb, id, type); + dquot = find_dquot(hashent, sb, qid); if (!dquot) { if (!empty) { spin_unlock(&dq_list_lock); - empty = get_empty_dquot(sb, type); + empty = get_empty_dquot(sb, qid.type); if (!empty) schedule(); /* Try to wait for a moment... */ goto we_slept; } dquot = empty; empty = NULL; - dquot->dq_id = id; + dquot->dq_id = qid; /* all dquots go on the inuse_list */ put_inuse(dquot); /* hash it first so it can be found */ @@ -1129,8 +1131,7 @@ static void dquot_decr_space(struct dquot *dquot, qsize_t number) struct dquot_warn { struct super_block *w_sb; - qid_t w_dq_id; - short w_dq_type; + struct kqid w_dq_id; short w_type; }; @@ -1154,11 +1155,11 @@ static int need_print_warning(struct dquot_warn *warn) if (!flag_print_warnings) return 0; - switch (warn->w_dq_type) { + switch (warn->w_dq_id.type) { case USRQUOTA: - return current_fsuid() == warn->w_dq_id; + return uid_eq(current_fsuid(), warn->w_dq_id.uid); case GRPQUOTA: - return in_group_p(warn->w_dq_id); + return in_group_p(warn->w_dq_id.gid); } return 0; } @@ -1184,7 +1185,7 @@ static void print_warning(struct dquot_warn *warn) tty_write_message(tty, ": warning, "); else tty_write_message(tty, ": write failed, "); - tty_write_message(tty, quotatypes[warn->w_dq_type]); + tty_write_message(tty, quotatypes[warn->w_dq_id.type]); switch (warntype) { case QUOTA_NL_IHARDWARN: msg = " file limit reached.\r\n"; @@ -1218,7 +1219,6 @@ static void prepare_warning(struct dquot_warn *warn, struct dquot *dquot, warn->w_type = warntype; warn->w_sb = dquot->dq_sb; warn->w_dq_id = dquot->dq_id; - warn->w_dq_type = dquot->dq_type; } /* @@ -1236,14 +1236,14 @@ static void flush_warnings(struct dquot_warn *warn) #ifdef CONFIG_PRINT_QUOTA_WARNING print_warning(&warn[i]); #endif - quota_send_warning(warn[i].w_dq_type, warn[i].w_dq_id, + quota_send_warning(warn[i].w_dq_id, warn[i].w_sb->s_dev, warn[i].w_type); } } static int ignore_hardlimit(struct dquot *dquot) { - struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type]; + struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; return capable(CAP_SYS_RESOURCE) && (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || @@ -1256,7 +1256,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, { qsize_t newinodes = dquot->dq_dqb.dqb_curinodes + inodes; - if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) || + if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_id.type) || test_bit(DQ_FAKE_B, &dquot->dq_flags)) return 0; @@ -1281,7 +1281,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, dquot->dq_dqb.dqb_itime == 0) { prepare_warning(warn, dquot, QUOTA_NL_ISOFTWARN); dquot->dq_dqb.dqb_itime = get_seconds() + - sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace; + sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type].dqi_igrace; } return 0; @@ -1294,7 +1294,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, qsize_t tspace; struct super_block *sb = dquot->dq_sb; - if (!sb_has_quota_limits_enabled(sb, dquot->dq_type) || + if (!sb_has_quota_limits_enabled(sb, dquot->dq_id.type) || test_bit(DQ_FAKE_B, &dquot->dq_flags)) return 0; @@ -1325,7 +1325,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, if (!prealloc) { prepare_warning(warn, dquot, QUOTA_NL_BSOFTWARN); dquot->dq_dqb.dqb_btime = get_seconds() + - sb_dqopt(sb)->info[dquot->dq_type].dqi_bgrace; + sb_dqopt(sb)->info[dquot->dq_id.type].dqi_bgrace; } else /* @@ -1344,7 +1344,7 @@ static int info_idq_free(struct dquot *dquot, qsize_t inodes) if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit || - !sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type)) + !sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_id.type)) return QUOTA_NL_NOWARN; newinodes = dquot->dq_dqb.dqb_curinodes - inodes; @@ -1390,7 +1390,6 @@ static int dquot_active(const struct inode *inode) */ static void __dquot_initialize(struct inode *inode, int type) { - unsigned int id = 0; int cnt; struct dquot *got[MAXQUOTAS]; struct super_block *sb = inode->i_sb; @@ -1403,18 +1402,19 @@ static void __dquot_initialize(struct inode *inode, int type) /* First get references to structures we might need. */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + struct kqid qid; got[cnt] = NULL; if (type != -1 && cnt != type) continue; switch (cnt) { case USRQUOTA: - id = inode->i_uid; + qid = make_kqid_uid(inode->i_uid); break; case GRPQUOTA: - id = inode->i_gid; + qid = make_kqid_gid(inode->i_gid); break; } - got[cnt] = dqget(sb, id, cnt); + got[cnt] = dqget(sb, qid); } down_write(&sb_dqopt(sb)->dqptr_sem); @@ -1897,10 +1897,10 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) if (!dquot_active(inode)) return 0; - if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) - transfer_to[USRQUOTA] = dqget(sb, iattr->ia_uid, USRQUOTA); - if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid) - transfer_to[GRPQUOTA] = dqget(sb, iattr->ia_gid, GRPQUOTA); + if (iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) + transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(iattr->ia_uid)); + if (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid)) + transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(iattr->ia_gid)); ret = __dquot_transfer(inode, transfer_to); dqput_all(transfer_to); @@ -2360,9 +2360,9 @@ static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di) memset(di, 0, sizeof(*di)); di->d_version = FS_DQUOT_VERSION; - di->d_flags = dquot->dq_type == USRQUOTA ? + di->d_flags = dquot->dq_id.type == USRQUOTA ? FS_USER_QUOTA : FS_GROUP_QUOTA; - di->d_id = dquot->dq_id; + di->d_id = from_kqid_munged(current_user_ns(), dquot->dq_id); spin_lock(&dq_data_lock); di->d_blk_hardlimit = stoqb(dm->dqb_bhardlimit); @@ -2376,12 +2376,12 @@ static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di) spin_unlock(&dq_data_lock); } -int dquot_get_dqblk(struct super_block *sb, int type, qid_t id, +int dquot_get_dqblk(struct super_block *sb, struct kqid qid, struct fs_disk_quota *di) { struct dquot *dquot; - dquot = dqget(sb, id, type); + dquot = dqget(sb, qid); if (!dquot) return -ESRCH; do_get_dqblk(dquot, di); @@ -2401,7 +2401,7 @@ static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di) { struct mem_dqblk *dm = &dquot->dq_dqb; int check_blim = 0, check_ilim = 0; - struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type]; + struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; if (di->d_fieldmask & ~VFS_FS_DQ_MASK) return -EINVAL; @@ -2488,13 +2488,13 @@ static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di) return 0; } -int dquot_set_dqblk(struct super_block *sb, int type, qid_t id, +int dquot_set_dqblk(struct super_block *sb, struct kqid qid, struct fs_disk_quota *di) { struct dquot *dquot; int rc; - dquot = dqget(sb, id, type); + dquot = dqget(sb, qid); if (!dquot) { rc = -ESRCH; goto out; diff --git a/fs/quota/kqid.c b/fs/quota/kqid.c new file mode 100644 index 0000000..2f97b0e --- /dev/null +++ b/fs/quota/kqid.c @@ -0,0 +1,132 @@ +#include <linux/fs.h> +#include <linux/quota.h> +#include <linux/export.h> + +/** + * qid_eq - Test to see if to kquid values are the same + * @left: A qid value + * @right: Another quid value + * + * Return true if the two qid values are equal and false otherwise. + */ +bool qid_eq(struct kqid left, struct kqid right) +{ + if (left.type != right.type) + return false; + switch(left.type) { + case USRQUOTA: + return uid_eq(left.uid, right.uid); + case GRPQUOTA: + return gid_eq(left.gid, right.gid); + case PRJQUOTA: + return projid_eq(left.projid, right.projid); + default: + BUG(); + } +} +EXPORT_SYMBOL(qid_eq); + +/** + * qid_lt - Test to see if one qid value is less than another + * @left: The possibly lesser qid value + * @right: The possibly greater qid value + * + * Return true if left is less than right and false otherwise. + */ +bool qid_lt(struct kqid left, struct kqid right) +{ + if (left.type < right.type) + return true; + if (left.type > right.type) + return false; + switch (left.type) { + case USRQUOTA: + return uid_lt(left.uid, right.uid); + case GRPQUOTA: + return gid_lt(left.gid, right.gid); + case PRJQUOTA: + return projid_lt(left.projid, right.projid); + default: + BUG(); + } +} +EXPORT_SYMBOL(qid_lt); + +/** + * from_kqid - Create a qid from a kqid user-namespace pair. + * @targ: The user namespace we want a qid in. + * @kuid: The kernel internal quota identifier to start with. + * + * Map @kqid into the user-namespace specified by @targ and + * return the resulting qid. + * + * There is always a mapping into the initial user_namespace. + * + * If @kqid has no mapping in @targ (qid_t)-1 is returned. + */ +qid_t from_kqid(struct user_namespace *targ, struct kqid kqid) +{ + switch (kqid.type) { + case USRQUOTA: + return from_kuid(targ, kqid.uid); + case GRPQUOTA: + return from_kgid(targ, kqid.gid); + case PRJQUOTA: + return from_kprojid(targ, kqid.projid); + default: + BUG(); + } +} +EXPORT_SYMBOL(from_kqid); + +/** + * from_kqid_munged - Create a qid from a kqid user-namespace pair. + * @targ: The user namespace we want a qid in. + * @kqid: The kernel internal quota identifier to start with. + * + * Map @kqid into the user-namespace specified by @targ and + * return the resulting qid. + * + * There is always a mapping into the initial user_namespace. + * + * Unlike from_kqid from_kqid_munged never fails and always + * returns a valid projid. This makes from_kqid_munged + * appropriate for use in places where failing to provide + * a qid_t is not a good option. + * + * If @kqid has no mapping in @targ the kqid.type specific + * overflow identifier is returned. + */ +qid_t from_kqid_munged(struct user_namespace *targ, struct kqid kqid) +{ + switch (kqid.type) { + case USRQUOTA: + return from_kuid_munged(targ, kqid.uid); + case GRPQUOTA: + return from_kgid_munged(targ, kqid.gid); + case PRJQUOTA: + return from_kprojid_munged(targ, kqid.projid); + default: + BUG(); + } +} +EXPORT_SYMBOL(from_kqid_munged); + +/** + * qid_valid - Report if a valid value is stored in a kqid. + * @qid: The kernel internal quota identifier to test. + */ +bool qid_valid(struct kqid qid) +{ + switch (qid.type) { + case USRQUOTA: + return uid_valid(qid.uid); + case GRPQUOTA: + return gid_valid(qid.gid); + case PRJQUOTA: + return projid_valid(qid.projid); + default: + BUG(); + } +} +EXPORT_SYMBOL(qid_valid); diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c index d67908b..16e8abb 100644 --- a/fs/quota/netlink.c +++ b/fs/quota/netlink.c @@ -30,7 +30,7 @@ static struct genl_family quota_genl_family = { * */ -void quota_send_warning(short type, unsigned int id, dev_t dev, +void quota_send_warning(struct kqid qid, dev_t dev, const char warntype) { static atomic_t seq; @@ -56,10 +56,11 @@ void quota_send_warning(short type, unsigned int id, dev_t dev, "VFS: Cannot store netlink header in quota warning.\n"); goto err_out; } - ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, type); + ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, qid.type); if (ret) goto attr_err_out; - ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, id); + ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, + from_kqid_munged(&init_user_ns, qid)); if (ret) goto attr_err_out; ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype); @@ -71,7 +72,8 @@ void quota_send_warning(short type, unsigned int id, dev_t dev, ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev)); if (ret) goto attr_err_out; - ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid()); + ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, + from_kuid_munged(&init_user_ns, current_uid())); if (ret) goto attr_err_out; genlmsg_end(skb, msg_head); diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 6f15578..ff0135d 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -32,8 +32,8 @@ static int check_quotactl_permission(struct super_block *sb, int type, int cmd, /* allow to query information for dquots we "own" */ case Q_GETQUOTA: case Q_XGETQUOTA: - if ((type == USRQUOTA && current_euid() == id) || - (type == GRPQUOTA && in_egroup_p(id))) + if ((type == USRQUOTA && uid_eq(current_euid(), make_kuid(current_user_ns(), id))) || + (type == GRPQUOTA && in_egroup_p(make_kgid(current_user_ns(), id)))) break; /*FALLTHROUGH*/ default: @@ -130,13 +130,17 @@ static void copy_to_if_dqblk(struct if_dqblk *dst, struct fs_disk_quota *src) static int quota_getquota(struct super_block *sb, int type, qid_t id, void __user *addr) { + struct kqid qid; struct fs_disk_quota fdq; struct if_dqblk idq; int ret; if (!sb->s_qcop->get_dqblk) return -ENOSYS; - ret = sb->s_qcop->get_dqblk(sb, type, id, &fdq); + qid = make_kqid(current_user_ns(), type, id); + if (!qid_valid(qid)) + return -EINVAL; + ret = sb->s_qcop->get_dqblk(sb, qid, &fdq); if (ret) return ret; copy_to_if_dqblk(&idq, &fdq); @@ -176,13 +180,17 @@ static int quota_setquota(struct super_block *sb, int type, qid_t id, { struct fs_disk_quota fdq; struct if_dqblk idq; + struct kqid qid; if (copy_from_user(&idq, addr, sizeof(idq))) return -EFAULT; if (!sb->s_qcop->set_dqblk) return -ENOSYS; + qid = make_kqid(current_user_ns(), type, id); + if (!qid_valid(qid)) + return -EINVAL; copy_from_if_dqblk(&fdq, &idq); - return sb->s_qcop->set_dqblk(sb, type, id, &fdq); + return sb->s_qcop->set_dqblk(sb, qid, &fdq); } static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr) @@ -213,23 +221,31 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id, void __user *addr) { struct fs_disk_quota fdq; + struct kqid qid; if (copy_from_user(&fdq, addr, sizeof(fdq))) return -EFAULT; if (!sb->s_qcop->set_dqblk) return -ENOSYS; - return sb->s_qcop->set_dqblk(sb, type, id, &fdq); + qid = make_kqid(current_user_ns(), type, id); + if (!qid_valid(qid)) + return -EINVAL; + return sb->s_qcop->set_dqblk(sb, qid, &fdq); } static int quota_getxquota(struct super_block *sb, int type, qid_t id, void __user *addr) { struct fs_disk_quota fdq; + struct kqid qid; int ret; if (!sb->s_qcop->get_dqblk) return -ENOSYS; - ret = sb->s_qcop->get_dqblk(sb, type, id, &fdq); + qid = make_kqid(current_user_ns(), type, id); + if (!qid_valid(qid)) + return -EINVAL; + ret = sb->s_qcop->get_dqblk(sb, qid, &fdq); if (!ret && copy_to_user(addr, &fdq, sizeof(fdq))) return -EFAULT; return ret; diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index e41c1becf..d65877f 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -22,9 +22,10 @@ MODULE_LICENSE("GPL"); #define __QUOTA_QT_PARANOIA -static int get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth) +static int get_index(struct qtree_mem_dqinfo *info, struct kqid qid, int depth) { unsigned int epb = info->dqi_usable_bs >> 2; + qid_t id = from_kqid(&init_user_ns, qid); depth = info->dqi_qtree_depth - depth - 1; while (depth--) @@ -244,7 +245,7 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info, /* This is enough as the block is already zeroed and the entry * list is empty... */ info->dqi_free_entry = blk; - mark_info_dirty(dquot->dq_sb, dquot->dq_type); + mark_info_dirty(dquot->dq_sb, dquot->dq_id.type); } /* Block will be full? */ if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) { @@ -357,7 +358,7 @@ static inline int dq_insert_tree(struct qtree_mem_dqinfo *info, */ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) { - int type = dquot->dq_type; + int type = dquot->dq_id.type; struct super_block *sb = dquot->dq_sb; ssize_t ret; char *ddquot = getdqbuf(info->dqi_entry_size); @@ -538,8 +539,9 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info, ddquot += info->dqi_entry_size; } if (i == qtree_dqstr_in_blk(info)) { - quota_error(dquot->dq_sb, "Quota for id %u referenced " - "but not present", dquot->dq_id); + quota_error(dquot->dq_sb, + "Quota for id %u referenced but not present", + from_kqid(&init_user_ns, dquot->dq_id)); ret = -EIO; goto out_buf; } else { @@ -589,7 +591,7 @@ static inline loff_t find_dqentry(struct qtree_mem_dqinfo *info, int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) { - int type = dquot->dq_type; + int type = dquot->dq_id.type; struct super_block *sb = dquot->dq_sb; loff_t offset; char *ddquot; @@ -607,8 +609,10 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) offset = find_dqentry(info, dquot); if (offset <= 0) { /* Entry not present? */ if (offset < 0) - quota_error(sb, "Can't read quota structure " - "for id %u", dquot->dq_id); + quota_error(sb,"Can't read quota structure " + "for id %u", + from_kqid(&init_user_ns, + dquot->dq_id)); dquot->dq_off = 0; set_bit(DQ_FAKE_B, &dquot->dq_flags); memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); @@ -626,7 +630,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) if (ret >= 0) ret = -EIO; quota_error(sb, "Error while reading quota structure for id %u", - dquot->dq_id); + from_kqid(&init_user_ns, dquot->dq_id)); set_bit(DQ_FAKE_B, &dquot->dq_flags); memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); kfree(ddquot); diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c index 34b37a6..469c684 100644 --- a/fs/quota/quota_v1.c +++ b/fs/quota/quota_v1.c @@ -54,7 +54,7 @@ static void v1_mem2disk_dqblk(struct v1_disk_dqblk *d, struct mem_dqblk *m) static int v1_read_dqblk(struct dquot *dquot) { - int type = dquot->dq_type; + int type = dquot->dq_id.type; struct v1_disk_dqblk dqblk; if (!sb_dqopt(dquot->dq_sb)->files[type]) @@ -63,7 +63,8 @@ static int v1_read_dqblk(struct dquot *dquot) /* Set structure to 0s in case read fails/is after end of file */ memset(&dqblk, 0, sizeof(struct v1_disk_dqblk)); dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type, (char *)&dqblk, - sizeof(struct v1_disk_dqblk), v1_dqoff(dquot->dq_id)); + sizeof(struct v1_disk_dqblk), + v1_dqoff(from_kqid(&init_user_ns, dquot->dq_id))); v1_disk2mem_dqblk(&dquot->dq_dqb, &dqblk); if (dquot->dq_dqb.dqb_bhardlimit == 0 && @@ -78,12 +79,13 @@ static int v1_read_dqblk(struct dquot *dquot) static int v1_commit_dqblk(struct dquot *dquot) { - short type = dquot->dq_type; + short type = dquot->dq_id.type; ssize_t ret; struct v1_disk_dqblk dqblk; v1_mem2disk_dqblk(&dqblk, &dquot->dq_dqb); - if (dquot->dq_id == 0) { + if (((type == USRQUOTA) && uid_eq(dquot->dq_id.uid, GLOBAL_ROOT_UID)) || + ((type == GRPQUOTA) && gid_eq(dquot->dq_id.gid, GLOBAL_ROOT_GID))) { dqblk.dqb_btime = sb_dqopt(dquot->dq_sb)->info[type].dqi_bgrace; dqblk.dqb_itime = @@ -93,7 +95,7 @@ static int v1_commit_dqblk(struct dquot *dquot) if (sb_dqopt(dquot->dq_sb)->files[type]) ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type, (char *)&dqblk, sizeof(struct v1_disk_dqblk), - v1_dqoff(dquot->dq_id)); + v1_dqoff(from_kqid(&init_user_ns, dquot->dq_id))); if (ret != sizeof(struct v1_disk_dqblk)) { quota_error(dquot->dq_sb, "dquota write failed"); if (ret >= 0) diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index f1ab360..02751ec 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -196,7 +196,7 @@ static void v2r0_mem2diskdqb(void *dp, struct dquot *dquot) struct v2r0_disk_dqblk *d = dp; struct mem_dqblk *m = &dquot->dq_dqb; struct qtree_mem_dqinfo *info = - sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv; d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit); d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit); @@ -206,7 +206,7 @@ static void v2r0_mem2diskdqb(void *dp, struct dquot *dquot) d->dqb_bsoftlimit = cpu_to_le32(v2_stoqb(m->dqb_bsoftlimit)); d->dqb_curspace = cpu_to_le64(m->dqb_curspace); d->dqb_btime = cpu_to_le64(m->dqb_btime); - d->dqb_id = cpu_to_le32(dquot->dq_id); + d->dqb_id = cpu_to_le32(from_kqid(&init_user_ns, dquot->dq_id)); if (qtree_entry_unused(info, dp)) d->dqb_itime = cpu_to_le64(1); } @@ -215,11 +215,13 @@ static int v2r0_is_id(void *dp, struct dquot *dquot) { struct v2r0_disk_dqblk *d = dp; struct qtree_mem_dqinfo *info = - sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv; if (qtree_entry_unused(info, dp)) return 0; - return le32_to_cpu(d->dqb_id) == dquot->dq_id; + return qid_eq(make_kqid(&init_user_ns, dquot->dq_id.type, + le32_to_cpu(d->dqb_id)), + dquot->dq_id); } static void v2r1_disk2memdqb(struct dquot *dquot, void *dp) @@ -247,7 +249,7 @@ static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot) struct v2r1_disk_dqblk *d = dp; struct mem_dqblk *m = &dquot->dq_dqb; struct qtree_mem_dqinfo *info = - sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv; d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit); d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit); @@ -257,7 +259,7 @@ static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot) d->dqb_bsoftlimit = cpu_to_le64(v2_stoqb(m->dqb_bsoftlimit)); d->dqb_curspace = cpu_to_le64(m->dqb_curspace); d->dqb_btime = cpu_to_le64(m->dqb_btime); - d->dqb_id = cpu_to_le32(dquot->dq_id); + d->dqb_id = cpu_to_le32(from_kqid(&init_user_ns, dquot->dq_id)); if (qtree_entry_unused(info, dp)) d->dqb_itime = cpu_to_le64(1); } @@ -266,26 +268,28 @@ static int v2r1_is_id(void *dp, struct dquot *dquot) { struct v2r1_disk_dqblk *d = dp; struct qtree_mem_dqinfo *info = - sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv; if (qtree_entry_unused(info, dp)) return 0; - return le32_to_cpu(d->dqb_id) == dquot->dq_id; + return qid_eq(make_kqid(&init_user_ns, dquot->dq_id.type, + le32_to_cpu(d->dqb_id)), + dquot->dq_id); } static int v2_read_dquot(struct dquot *dquot) { - return qtree_read_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot); + return qtree_read_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, dquot); } static int v2_write_dquot(struct dquot *dquot) { - return qtree_write_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot); + return qtree_write_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, dquot); } static int v2_release_dquot(struct dquot *dquot) { - return qtree_release_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot); + return qtree_release_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, dquot); } static int v2_free_file_info(struct super_block *sb, int type) diff --git a/fs/read_write.c b/fs/read_write.c index 1adfb69..d065348 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -232,23 +232,18 @@ EXPORT_SYMBOL(vfs_llseek); SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin) { off_t retval; - struct file * file; - int fput_needed; - - retval = -EBADF; - file = fget_light(fd, &fput_needed); - if (!file) - goto bad; + struct fd f = fdget(fd); + if (!f.file) + return -EBADF; retval = -EINVAL; if (origin <= SEEK_MAX) { - loff_t res = vfs_llseek(file, offset, origin); + loff_t res = vfs_llseek(f.file, offset, origin); retval = res; if (res != (loff_t)retval) retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ } - fput_light(file, fput_needed); -bad: + fdput(f); return retval; } @@ -258,20 +253,17 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, unsigned int, origin) { int retval; - struct file * file; + struct fd f = fdget(fd); loff_t offset; - int fput_needed; - retval = -EBADF; - file = fget_light(fd, &fput_needed); - if (!file) - goto bad; + if (!f.file) + return -EBADF; retval = -EINVAL; if (origin > SEEK_MAX) goto out_putf; - offset = vfs_llseek(file, ((loff_t) offset_high << 32) | offset_low, + offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low, origin); retval = (int)offset; @@ -281,8 +273,7 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, retval = 0; } out_putf: - fput_light(file, fput_needed); -bad: + fdput(f); return retval; } #endif @@ -461,34 +452,29 @@ static inline void file_pos_write(struct file *file, loff_t pos) SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) { - struct file *file; + struct fd f = fdget(fd); ssize_t ret = -EBADF; - int fput_needed; - file = fget_light(fd, &fput_needed); - if (file) { - loff_t pos = file_pos_read(file); - ret = vfs_read(file, buf, count, &pos); - file_pos_write(file, pos); - fput_light(file, fput_needed); + if (f.file) { + loff_t pos = file_pos_read(f.file); + ret = vfs_read(f.file, buf, count, &pos); + file_pos_write(f.file, pos); + fdput(f); } - return ret; } SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, size_t, count) { - struct file *file; + struct fd f = fdget(fd); ssize_t ret = -EBADF; - int fput_needed; - file = fget_light(fd, &fput_needed); - if (file) { - loff_t pos = file_pos_read(file); - ret = vfs_write(file, buf, count, &pos); - file_pos_write(file, pos); - fput_light(file, fput_needed); + if (f.file) { + loff_t pos = file_pos_read(f.file); + ret = vfs_write(f.file, buf, count, &pos); + file_pos_write(f.file, pos); + fdput(f); } return ret; @@ -497,19 +483,18 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf, size_t count, loff_t pos) { - struct file *file; + struct fd f; ssize_t ret = -EBADF; - int fput_needed; if (pos < 0) return -EINVAL; - file = fget_light(fd, &fput_needed); - if (file) { + f = fdget(fd); + if (f.file) { ret = -ESPIPE; - if (file->f_mode & FMODE_PREAD) - ret = vfs_read(file, buf, count, &pos); - fput_light(file, fput_needed); + if (f.file->f_mode & FMODE_PREAD) + ret = vfs_read(f.file, buf, count, &pos); + fdput(f); } return ret; @@ -526,19 +511,18 @@ SYSCALL_ALIAS(sys_pread64, SyS_pread64); SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf, size_t count, loff_t pos) { - struct file *file; + struct fd f; ssize_t ret = -EBADF; - int fput_needed; if (pos < 0) return -EINVAL; - file = fget_light(fd, &fput_needed); - if (file) { + f = fdget(fd); + if (f.file) { ret = -ESPIPE; - if (file->f_mode & FMODE_PWRITE) - ret = vfs_write(file, buf, count, &pos); - fput_light(file, fput_needed); + if (f.file->f_mode & FMODE_PWRITE) + ret = vfs_write(f.file, buf, count, &pos); + fdput(f); } return ret; @@ -789,16 +773,14 @@ EXPORT_SYMBOL(vfs_writev); SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { - struct file *file; + struct fd f = fdget(fd); ssize_t ret = -EBADF; - int fput_needed; - file = fget_light(fd, &fput_needed); - if (file) { - loff_t pos = file_pos_read(file); - ret = vfs_readv(file, vec, vlen, &pos); - file_pos_write(file, pos); - fput_light(file, fput_needed); + if (f.file) { + loff_t pos = file_pos_read(f.file); + ret = vfs_readv(f.file, vec, vlen, &pos); + file_pos_write(f.file, pos); + fdput(f); } if (ret > 0) @@ -810,16 +792,14 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { - struct file *file; + struct fd f = fdget(fd); ssize_t ret = -EBADF; - int fput_needed; - file = fget_light(fd, &fput_needed); - if (file) { - loff_t pos = file_pos_read(file); - ret = vfs_writev(file, vec, vlen, &pos); - file_pos_write(file, pos); - fput_light(file, fput_needed); + if (f.file) { + loff_t pos = file_pos_read(f.file); + ret = vfs_writev(f.file, vec, vlen, &pos); + file_pos_write(f.file, pos); + fdput(f); } if (ret > 0) @@ -838,19 +818,18 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { loff_t pos = pos_from_hilo(pos_h, pos_l); - struct file *file; + struct fd f; ssize_t ret = -EBADF; - int fput_needed; if (pos < 0) return -EINVAL; - file = fget_light(fd, &fput_needed); - if (file) { + f = fdget(fd); + if (f.file) { ret = -ESPIPE; - if (file->f_mode & FMODE_PREAD) - ret = vfs_readv(file, vec, vlen, &pos); - fput_light(file, fput_needed); + if (f.file->f_mode & FMODE_PREAD) + ret = vfs_readv(f.file, vec, vlen, &pos); + fdput(f); } if (ret > 0) @@ -863,19 +842,18 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { loff_t pos = pos_from_hilo(pos_h, pos_l); - struct file *file; + struct fd f; ssize_t ret = -EBADF; - int fput_needed; if (pos < 0) return -EINVAL; - file = fget_light(fd, &fput_needed); - if (file) { + f = fdget(fd); + if (f.file) { ret = -ESPIPE; - if (file->f_mode & FMODE_PWRITE) - ret = vfs_writev(file, vec, vlen, &pos); - fput_light(file, fput_needed); + if (f.file->f_mode & FMODE_PWRITE) + ret = vfs_writev(f.file, vec, vlen, &pos); + fdput(f); } if (ret > 0) @@ -884,31 +862,31 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, return ret; } -static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, - size_t count, loff_t max) +ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, + loff_t max) { - struct file * in_file, * out_file; - struct inode * in_inode, * out_inode; + struct fd in, out; + struct inode *in_inode, *out_inode; loff_t pos; ssize_t retval; - int fput_needed_in, fput_needed_out, fl; + int fl; /* * Get input file, and verify that it is ok.. */ retval = -EBADF; - in_file = fget_light(in_fd, &fput_needed_in); - if (!in_file) + in = fdget(in_fd); + if (!in.file) goto out; - if (!(in_file->f_mode & FMODE_READ)) + if (!(in.file->f_mode & FMODE_READ)) goto fput_in; retval = -ESPIPE; if (!ppos) - ppos = &in_file->f_pos; + ppos = &in.file->f_pos; else - if (!(in_file->f_mode & FMODE_PREAD)) + if (!(in.file->f_mode & FMODE_PREAD)) goto fput_in; - retval = rw_verify_area(READ, in_file, ppos, count); + retval = rw_verify_area(READ, in.file, ppos, count); if (retval < 0) goto fput_in; count = retval; @@ -917,15 +895,15 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, * Get output file, and verify that it is ok.. */ retval = -EBADF; - out_file = fget_light(out_fd, &fput_needed_out); - if (!out_file) + out = fdget(out_fd); + if (!out.file) goto fput_in; - if (!(out_file->f_mode & FMODE_WRITE)) + if (!(out.file->f_mode & FMODE_WRITE)) goto fput_out; retval = -EINVAL; - in_inode = in_file->f_path.dentry->d_inode; - out_inode = out_file->f_path.dentry->d_inode; - retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count); + in_inode = in.file->f_path.dentry->d_inode; + out_inode = out.file->f_path.dentry->d_inode; + retval = rw_verify_area(WRITE, out.file, &out.file->f_pos, count); if (retval < 0) goto fput_out; count = retval; @@ -949,10 +927,10 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, * and the application is arguably buggy if it doesn't expect * EAGAIN on a non-blocking file descriptor. */ - if (in_file->f_flags & O_NONBLOCK) + if (in.file->f_flags & O_NONBLOCK) fl = SPLICE_F_NONBLOCK; #endif - retval = do_splice_direct(in_file, ppos, out_file, count, fl); + retval = do_splice_direct(in.file, ppos, out.file, count, fl); if (retval > 0) { add_rchar(current, retval); @@ -965,9 +943,9 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, retval = -EOVERFLOW; fput_out: - fput_light(out_file, fput_needed_out); + fdput(out); fput_in: - fput_light(in_file, fput_needed_in); + fdput(in); out: return retval; } diff --git a/fs/read_write.h b/fs/read_write.h index d07b954..d3e00ef 100644 --- a/fs/read_write.h +++ b/fs/read_write.h @@ -12,3 +12,5 @@ ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn); ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov, unsigned long nr_segs, loff_t *ppos, io_fn_t fn); +ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, + loff_t max); diff --git a/fs/readdir.c b/fs/readdir.c index 39e3370..5e69ef5 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -106,22 +106,20 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd, struct old_linux_dirent __user *, dirent, unsigned int, count) { int error; - struct file * file; + struct fd f = fdget(fd); struct readdir_callback buf; - int fput_needed; - file = fget_light(fd, &fput_needed); - if (!file) + if (!f.file) return -EBADF; buf.result = 0; buf.dirent = dirent; - error = vfs_readdir(file, fillonedir, &buf); + error = vfs_readdir(f.file, fillonedir, &buf); if (buf.result) error = buf.result; - fput_light(file, fput_needed); + fdput(f); return error; } @@ -191,17 +189,16 @@ efault: SYSCALL_DEFINE3(getdents, unsigned int, fd, struct linux_dirent __user *, dirent, unsigned int, count) { - struct file * file; + struct fd f; struct linux_dirent __user * lastdirent; struct getdents_callback buf; - int fput_needed; int error; if (!access_ok(VERIFY_WRITE, dirent, count)) return -EFAULT; - file = fget_light(fd, &fput_needed); - if (!file) + f = fdget(fd); + if (!f.file) return -EBADF; buf.current_dir = dirent; @@ -209,17 +206,17 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd, buf.count = count; buf.error = 0; - error = vfs_readdir(file, filldir, &buf); + error = vfs_readdir(f.file, filldir, &buf); if (error >= 0) error = buf.error; lastdirent = buf.previous; if (lastdirent) { - if (put_user(file->f_pos, &lastdirent->d_off)) + if (put_user(f.file->f_pos, &lastdirent->d_off)) error = -EFAULT; else error = count - buf.count; } - fput_light(file, fput_needed); + fdput(f); return error; } @@ -272,17 +269,16 @@ efault: SYSCALL_DEFINE3(getdents64, unsigned int, fd, struct linux_dirent64 __user *, dirent, unsigned int, count) { - struct file * file; + struct fd f; struct linux_dirent64 __user * lastdirent; struct getdents_callback64 buf; - int fput_needed; int error; if (!access_ok(VERIFY_WRITE, dirent, count)) return -EFAULT; - file = fget_light(fd, &fput_needed); - if (!file) + f = fdget(fd); + if (!f.file) return -EBADF; buf.current_dir = dirent; @@ -290,17 +286,17 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd, buf.count = count; buf.error = 0; - error = vfs_readdir(file, filldir64, &buf); + error = vfs_readdir(f.file, filldir64, &buf); if (error >= 0) error = buf.error; lastdirent = buf.previous; if (lastdirent) { - typeof(lastdirent->d_off) d_off = file->f_pos; + typeof(lastdirent->d_off) d_off = f.file->f_pos; if (__put_user(d_off, &lastdirent->d_off)) error = -EFAULT; else error = count - buf.count; } - fput_light(file, fput_needed); + fdput(f); return error; } diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 855da58..4648555 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1155,8 +1155,8 @@ static void init_inode(struct inode *inode, struct treepath *path) set_inode_sd_version(inode, STAT_DATA_V1); inode->i_mode = sd_v1_mode(sd); set_nlink(inode, sd_v1_nlink(sd)); - inode->i_uid = sd_v1_uid(sd); - inode->i_gid = sd_v1_gid(sd); + i_uid_write(inode, sd_v1_uid(sd)); + i_gid_write(inode, sd_v1_gid(sd)); inode->i_size = sd_v1_size(sd); inode->i_atime.tv_sec = sd_v1_atime(sd); inode->i_mtime.tv_sec = sd_v1_mtime(sd); @@ -1200,9 +1200,9 @@ static void init_inode(struct inode *inode, struct treepath *path) inode->i_mode = sd_v2_mode(sd); set_nlink(inode, sd_v2_nlink(sd)); - inode->i_uid = sd_v2_uid(sd); + i_uid_write(inode, sd_v2_uid(sd)); inode->i_size = sd_v2_size(sd); - inode->i_gid = sd_v2_gid(sd); + i_gid_write(inode, sd_v2_gid(sd)); inode->i_mtime.tv_sec = sd_v2_mtime(sd); inode->i_atime.tv_sec = sd_v2_atime(sd); inode->i_ctime.tv_sec = sd_v2_ctime(sd); @@ -1258,9 +1258,9 @@ static void inode2sd(void *sd, struct inode *inode, loff_t size) set_sd_v2_mode(sd_v2, inode->i_mode); set_sd_v2_nlink(sd_v2, inode->i_nlink); - set_sd_v2_uid(sd_v2, inode->i_uid); + set_sd_v2_uid(sd_v2, i_uid_read(inode)); set_sd_v2_size(sd_v2, size); - set_sd_v2_gid(sd_v2, inode->i_gid); + set_sd_v2_gid(sd_v2, i_gid_read(inode)); set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec); set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec); set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec); @@ -1280,8 +1280,8 @@ static void inode2sd_v1(void *sd, struct inode *inode, loff_t size) struct stat_data_v1 *sd_v1 = (struct stat_data_v1 *)sd; set_sd_v1_mode(sd_v1, inode->i_mode); - set_sd_v1_uid(sd_v1, inode->i_uid); - set_sd_v1_gid(sd_v1, inode->i_gid); + set_sd_v1_uid(sd_v1, i_uid_read(inode)); + set_sd_v1_gid(sd_v1, i_gid_read(inode)); set_sd_v1_nlink(sd_v1, inode->i_nlink); set_sd_v1_size(sd_v1, size); set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec); @@ -1869,7 +1869,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, goto out_bad_inode; } if (old_format_only(sb)) { - if (inode->i_uid & ~0xffff || inode->i_gid & ~0xffff) { + if (i_uid_read(inode) & ~0xffff || i_gid_read(inode) & ~0xffff) { pathrelse(&path_to_key); /* i_uid or i_gid is too big to be stored in stat data v3.5 */ err = -EINVAL; @@ -3140,16 +3140,16 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) } } - if ((((attr->ia_valid & ATTR_UID) && (attr->ia_uid & ~0xffff)) || - ((attr->ia_valid & ATTR_GID) && (attr->ia_gid & ~0xffff))) && + if ((((attr->ia_valid & ATTR_UID) && (from_kuid(&init_user_ns, attr->ia_uid) & ~0xffff)) || + ((attr->ia_valid & ATTR_GID) && (from_kgid(&init_user_ns, attr->ia_gid) & ~0xffff))) && (get_inode_sd_version(inode) == STAT_DATA_V1)) { /* stat data of format v3.5 has 16 bit uid and gid */ error = -EINVAL; goto out; } - if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || - (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || + (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { struct reiserfs_transaction_handle th; int jbegin_count = 2 * diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 7a37dab..1078ae1 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -608,6 +608,11 @@ static int init_inodecache(void) static void destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(reiserfs_inode_cachep); } diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index d319963..c196369 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -896,7 +896,7 @@ static int create_privroot(struct dentry *dentry) { return 0; } #endif /* Actual operations that are exported to VFS-land */ -const struct xattr_handler *reiserfs_xattr_handlers[] = { +static const struct xattr_handler *reiserfs_xattr_handlers[] = { #ifdef CONFIG_REISERFS_FS_XATTR &reiserfs_xattr_user_handler, &reiserfs_xattr_trusted_handler, diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 44474f9..d7c01ef 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -30,7 +30,7 @@ posix_acl_set(struct dentry *dentry, const char *name, const void *value, return -EPERM; if (value) { - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (IS_ERR(acl)) { return PTR_ERR(acl); } else if (acl) { @@ -77,7 +77,7 @@ posix_acl_get(struct dentry *dentry, const char *name, void *buffer, return PTR_ERR(acl); if (acl == NULL) return -ENODATA; - error = posix_acl_to_xattr(acl, buffer, size); + error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); posix_acl_release(acl); return error; @@ -121,15 +121,23 @@ static struct posix_acl *posix_acl_from_disk(const void *value, size_t size) case ACL_OTHER: value = (char *)value + sizeof(reiserfs_acl_entry_short); - acl->a_entries[n].e_id = ACL_UNDEFINED_ID; break; case ACL_USER: + value = (char *)value + sizeof(reiserfs_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_uid = + make_kuid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; case ACL_GROUP: value = (char *)value + sizeof(reiserfs_acl_entry); if ((char *)value > end) goto fail; - acl->a_entries[n].e_id = le32_to_cpu(entry->e_id); + acl->a_entries[n].e_gid = + make_kgid(&init_user_ns, + le32_to_cpu(entry->e_id)); break; default: @@ -164,13 +172,19 @@ static void *posix_acl_to_disk(const struct posix_acl *acl, size_t * size) ext_acl->a_version = cpu_to_le32(REISERFS_ACL_VERSION); e = (char *)ext_acl + sizeof(reiserfs_acl_header); for (n = 0; n < acl->a_count; n++) { + const struct posix_acl_entry *acl_e = &acl->a_entries[n]; reiserfs_acl_entry *entry = (reiserfs_acl_entry *) e; entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); switch (acl->a_entries[n].e_tag) { case ACL_USER: + entry->e_id = cpu_to_le32( + from_kuid(&init_user_ns, acl_e->e_uid)); + e += sizeof(reiserfs_acl_entry); + break; case ACL_GROUP: - entry->e_id = cpu_to_le32(acl->a_entries[n].e_id); + entry->e_id = cpu_to_le32( + from_kgid(&init_user_ns, acl_e->e_gid)); e += sizeof(reiserfs_acl_entry); break; diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 77c5f21..fd7c5f6 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -648,6 +648,11 @@ error_register: static void __exit exit_romfs_fs(void) { unregister_filesystem(&romfs_fs_type); + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(romfs_inode_cachep); } diff --git a/fs/select.c b/fs/select.c index db14c78..2ef72d9 100644 --- a/fs/select.c +++ b/fs/select.c @@ -220,8 +220,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, struct poll_table_entry *entry = poll_get_entry(pwq); if (!entry) return; - get_file(filp); - entry->filp = filp; + entry->filp = get_file(filp); entry->wait_address = wait_address; entry->key = p->_key; init_waitqueue_func_entry(&entry->wait, pollwake); @@ -429,8 +428,6 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) for (i = 0; i < n; ++rinp, ++routp, ++rexp) { unsigned long in, out, ex, all_bits, bit = 1, mask, j; unsigned long res_in = 0, res_out = 0, res_ex = 0; - const struct file_operations *f_op = NULL; - struct file *file = NULL; in = *inp++; out = *outp++; ex = *exp++; all_bits = in | out | ex; @@ -440,20 +437,21 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) } for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) { - int fput_needed; + struct fd f; if (i >= n) break; if (!(bit & all_bits)) continue; - file = fget_light(i, &fput_needed); - if (file) { - f_op = file->f_op; + f = fdget(i); + if (f.file) { + const struct file_operations *f_op; + f_op = f.file->f_op; mask = DEFAULT_POLLMASK; if (f_op && f_op->poll) { wait_key_set(wait, in, out, bit); - mask = (*f_op->poll)(file, wait); + mask = (*f_op->poll)(f.file, wait); } - fput_light(file, fput_needed); + fdput(f); if ((mask & POLLIN_SET) && (in & bit)) { res_in |= bit; retval++; @@ -726,20 +724,17 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait) mask = 0; fd = pollfd->fd; if (fd >= 0) { - int fput_needed; - struct file * file; - - file = fget_light(fd, &fput_needed); + struct fd f = fdget(fd); mask = POLLNVAL; - if (file != NULL) { + if (f.file) { mask = DEFAULT_POLLMASK; - if (file->f_op && file->f_op->poll) { + if (f.file->f_op && f.file->f_op->poll) { pwait->_key = pollfd->events|POLLERR|POLLHUP; - mask = file->f_op->poll(file, pwait); + mask = f.file->f_op->poll(f.file, pwait); } /* Mask out unneeded events. */ mask &= pollfd->events | POLLERR | POLLHUP; - fput_light(file, fput_needed); + fdput(f); } } pollfd->revents = mask; diff --git a/fs/seq_file.c b/fs/seq_file.c index 14cf9de..99dffab 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -9,6 +9,7 @@ #include <linux/export.h> #include <linux/seq_file.h> #include <linux/slab.h> +#include <linux/cred.h> #include <asm/uaccess.h> #include <asm/page.h> @@ -56,6 +57,9 @@ int seq_open(struct file *file, const struct seq_operations *op) memset(p, 0, sizeof(*p)); mutex_init(&p->lock); p->op = op; +#ifdef CONFIG_USER_NS + p->user_ns = file->f_cred->user_ns; +#endif /* * Wrappers around seq_open(e.g. swaps_open) need to be diff --git a/fs/signalfd.c b/fs/signalfd.c index 9f35a37..8bee4e5 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -269,13 +269,12 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask, if (ufd < 0) kfree(ctx); } else { - int fput_needed; - struct file *file = fget_light(ufd, &fput_needed); - if (!file) + struct fd f = fdget(ufd); + if (!f.file) return -EBADF; - ctx = file->private_data; - if (file->f_op != &signalfd_fops) { - fput_light(file, fput_needed); + ctx = f.file->private_data; + if (f.file->f_op != &signalfd_fops) { + fdput(f); return -EINVAL; } spin_lock_irq(¤t->sighand->siglock); @@ -283,7 +282,7 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask, spin_unlock_irq(¤t->sighand->siglock); wake_up(¤t->sighand->signalfd_wqh); - fput_light(file, fput_needed); + fdput(f); } return ufd; diff --git a/fs/splice.c b/fs/splice.c index 41514dd..13e5b47 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1666,9 +1666,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov, unsigned long, nr_segs, unsigned int, flags) { - struct file *file; + struct fd f; long error; - int fput; if (unlikely(nr_segs > UIO_MAXIOV)) return -EINVAL; @@ -1676,14 +1675,14 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov, return 0; error = -EBADF; - file = fget_light(fd, &fput); - if (file) { - if (file->f_mode & FMODE_WRITE) - error = vmsplice_to_pipe(file, iov, nr_segs, flags); - else if (file->f_mode & FMODE_READ) - error = vmsplice_to_user(file, iov, nr_segs, flags); - - fput_light(file, fput); + f = fdget(fd); + if (f.file) { + if (f.file->f_mode & FMODE_WRITE) + error = vmsplice_to_pipe(f.file, iov, nr_segs, flags); + else if (f.file->f_mode & FMODE_READ) + error = vmsplice_to_user(f.file, iov, nr_segs, flags); + + fdput(f); } return error; @@ -1693,30 +1692,27 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags) { + struct fd in, out; long error; - struct file *in, *out; - int fput_in, fput_out; if (unlikely(!len)) return 0; error = -EBADF; - in = fget_light(fd_in, &fput_in); - if (in) { - if (in->f_mode & FMODE_READ) { - out = fget_light(fd_out, &fput_out); - if (out) { - if (out->f_mode & FMODE_WRITE) - error = do_splice(in, off_in, - out, off_out, + in = fdget(fd_in); + if (in.file) { + if (in.file->f_mode & FMODE_READ) { + out = fdget(fd_out); + if (out.file) { + if (out.file->f_mode & FMODE_WRITE) + error = do_splice(in.file, off_in, + out.file, off_out, len, flags); - fput_light(out, fput_out); + fdput(out); } } - - fput_light(in, fput_in); + fdput(in); } - return error; } @@ -2027,26 +2023,25 @@ static long do_tee(struct file *in, struct file *out, size_t len, SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) { - struct file *in; - int error, fput_in; + struct fd in; + int error; if (unlikely(!len)) return 0; error = -EBADF; - in = fget_light(fdin, &fput_in); - if (in) { - if (in->f_mode & FMODE_READ) { - int fput_out; - struct file *out = fget_light(fdout, &fput_out); - - if (out) { - if (out->f_mode & FMODE_WRITE) - error = do_tee(in, out, len, flags); - fput_light(out, fput_out); + in = fdget(fdin); + if (in.file) { + if (in.file->f_mode & FMODE_READ) { + struct fd out = fdget(fdout); + if (out.file) { + if (out.file->f_mode & FMODE_WRITE) + error = do_tee(in.file, out.file, + len, flags); + fdput(out); } } - fput_light(in, fput_in); + fdput(in); } return error; diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index 81afbcc..a1ce5ce 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -56,16 +56,20 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode, struct squashfs_base_inode *sqsh_ino) { + uid_t i_uid; + gid_t i_gid; int err; - err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->uid), &inode->i_uid); + err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->uid), &i_uid); if (err) return err; - err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->guid), &inode->i_gid); + err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->guid), &i_gid); if (err) return err; + i_uid_write(inode, i_uid); + i_gid_write(inode, i_gid); inode->i_ino = le32_to_cpu(sqsh_ino->inode_number); inode->i_mtime.tv_sec = le32_to_cpu(sqsh_ino->mtime); inode->i_atime.tv_sec = inode->i_mtime.tv_sec; diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 29cd014..260e392 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -425,6 +425,11 @@ static int __init init_inodecache(void) static void destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(squashfs_inode_cachep); } @@ -57,13 +57,13 @@ EXPORT_SYMBOL(vfs_getattr); int vfs_fstat(unsigned int fd, struct kstat *stat) { - int fput_needed; - struct file *f = fget_light(fd, &fput_needed); + struct fd f = fdget_raw(fd); int error = -EBADF; - if (f) { - error = vfs_getattr(f->f_path.mnt, f->f_path.dentry, stat); - fput_light(f, fput_needed); + if (f.file) { + error = vfs_getattr(f.file->f_path.mnt, f.file->f_path.dentry, + stat); + fdput(f); } return error; } @@ -326,7 +326,7 @@ SYSCALL_DEFINE3(readlink, const char __user *, path, char __user *, buf, /* ---------- LFS-64 ----------- */ -#ifdef __ARCH_WANT_STAT64 +#if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64) #ifndef INIT_STRUCT_STAT64_PADDING # define INIT_STRUCT_STAT64_PADDING(st) memset(&st, 0, sizeof(st)) @@ -415,7 +415,7 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename, return error; return cp_new_stat64(&stat, statbuf); } -#endif /* __ARCH_WANT_STAT64 */ +#endif /* __ARCH_WANT_STAT64 || __ARCH_WANT_COMPAT_STAT64 */ /* Caller is here responsible for sufficient locking (ie. inode->i_lock) */ void __inode_add_bytes(struct inode *inode, loff_t bytes) diff --git a/fs/statfs.c b/fs/statfs.c index 95ad5c0..f8e832e 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -87,12 +87,11 @@ int user_statfs(const char __user *pathname, struct kstatfs *st) int fd_statfs(int fd, struct kstatfs *st) { - int fput_needed; - struct file *file = fget_light(fd, &fput_needed); + struct fd f = fdget(fd); int error = -EBADF; - if (file) { - error = vfs_statfs(&file->f_path, st); - fput_light(file, fput_needed); + if (f.file) { + error = vfs_statfs(&f.file->f_path, st); + fdput(f); } return error; } @@ -307,12 +307,6 @@ void deactivate_locked_super(struct super_block *s) /* caches are now gone, we can safely kill the shrinker now */ unregister_shrinker(&s->s_shrink); - - /* - * We need to call rcu_barrier so all the delayed rcu free - * inodes are flushed before we release the fs module. - */ - rcu_barrier(); put_filesystem(fs); put_super(s); } else { @@ -871,7 +865,7 @@ int get_anon_bdev(dev_t *p) else if (error) return -EAGAIN; - if ((dev & MAX_ID_MASK) == (1 << MINORBITS)) { + if ((dev & MAX_IDR_MASK) == (1 << MINORBITS)) { spin_lock(&unnamed_dev_lock); ida_remove(&unnamed_dev_ida, dev); if (unnamed_dev_start > dev) @@ -148,21 +148,19 @@ void emergency_sync(void) */ SYSCALL_DEFINE1(syncfs, int, fd) { - struct file *file; + struct fd f = fdget(fd); struct super_block *sb; int ret; - int fput_needed; - file = fget_light(fd, &fput_needed); - if (!file) + if (!f.file) return -EBADF; - sb = file->f_dentry->d_sb; + sb = f.file->f_dentry->d_sb; down_read(&sb->s_umount); ret = sync_filesystem(sb); up_read(&sb->s_umount); - fput_light(file, fput_needed); + fdput(f); return ret; } @@ -201,14 +199,12 @@ EXPORT_SYMBOL(vfs_fsync); static int do_fsync(unsigned int fd, int datasync) { - struct file *file; + struct fd f = fdget(fd); int ret = -EBADF; - int fput_needed; - file = fget_light(fd, &fput_needed); - if (file) { - ret = vfs_fsync(file, datasync); - fput_light(file, fput_needed); + if (f.file) { + ret = vfs_fsync(f.file, datasync); + fdput(f); } return ret; } @@ -291,10 +287,9 @@ SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes, unsigned int flags) { int ret; - struct file *file; + struct fd f; struct address_space *mapping; loff_t endbyte; /* inclusive */ - int fput_needed; umode_t i_mode; ret = -EINVAL; @@ -333,17 +328,17 @@ SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes, endbyte--; /* inclusive */ ret = -EBADF; - file = fget_light(fd, &fput_needed); - if (!file) + f = fdget(fd); + if (!f.file) goto out; - i_mode = file->f_path.dentry->d_inode->i_mode; + i_mode = f.file->f_path.dentry->d_inode->i_mode; ret = -ESPIPE; if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) && !S_ISLNK(i_mode)) goto out_put; - mapping = file->f_mapping; + mapping = f.file->f_mapping; if (!mapping) { ret = -EINVAL; goto out_put; @@ -366,7 +361,7 @@ SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes, ret = filemap_fdatawait_range(mapping, offset, endbyte); out_put: - fput_light(file, fput_needed); + fdput(f); out: return ret; } diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index a7ac78f..3c9eb56 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -113,7 +113,7 @@ int sysfs_create_link(struct kobject *kobj, struct kobject *target, * @target: object we're pointing to. * @name: name of the symlink. * - * This function does the same as sysf_create_link(), but it + * This function does the same as sysfs_create_link(), but it * doesn't warn if the link already exists. */ int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 80e1e2b..d33e506 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -202,8 +202,8 @@ struct inode *sysv_iget(struct super_block *sb, unsigned int ino) } /* SystemV FS: kludge permissions if ino==SYSV_ROOT_INO ?? */ inode->i_mode = fs16_to_cpu(sbi, raw_inode->i_mode); - inode->i_uid = (uid_t)fs16_to_cpu(sbi, raw_inode->i_uid); - inode->i_gid = (gid_t)fs16_to_cpu(sbi, raw_inode->i_gid); + i_uid_write(inode, (uid_t)fs16_to_cpu(sbi, raw_inode->i_uid)); + i_gid_write(inode, (gid_t)fs16_to_cpu(sbi, raw_inode->i_gid)); set_nlink(inode, fs16_to_cpu(sbi, raw_inode->i_nlink)); inode->i_size = fs32_to_cpu(sbi, raw_inode->i_size); inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_atime); @@ -256,8 +256,8 @@ static int __sysv_write_inode(struct inode *inode, int wait) } raw_inode->i_mode = cpu_to_fs16(sbi, inode->i_mode); - raw_inode->i_uid = cpu_to_fs16(sbi, fs_high2lowuid(inode->i_uid)); - raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(inode->i_gid)); + raw_inode->i_uid = cpu_to_fs16(sbi, fs_high2lowuid(i_uid_read(inode))); + raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(i_gid_read(inode))); raw_inode->i_nlink = cpu_to_fs16(sbi, inode->i_nlink); raw_inode->i_size = cpu_to_fs32(sbi, inode->i_size); raw_inode->i_atime = cpu_to_fs32(sbi, inode->i_atime.tv_sec); @@ -360,5 +360,10 @@ int __init sysv_init_icache(void) void sysv_destroy_icache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(sysv_inode_cachep); } diff --git a/fs/timerfd.c b/fs/timerfd.c index dffeb37..d03822b 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -234,19 +234,17 @@ static const struct file_operations timerfd_fops = { .llseek = noop_llseek, }; -static struct file *timerfd_fget(int fd) +static int timerfd_fget(int fd, struct fd *p) { - struct file *file; - - file = fget(fd); - if (!file) - return ERR_PTR(-EBADF); - if (file->f_op != &timerfd_fops) { - fput(file); - return ERR_PTR(-EINVAL); + struct fd f = fdget(fd); + if (!f.file) + return -EBADF; + if (f.file->f_op != &timerfd_fops) { + fdput(f); + return -EINVAL; } - - return file; + *p = f; + return 0; } SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) @@ -284,7 +282,7 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, const struct itimerspec __user *, utmr, struct itimerspec __user *, otmr) { - struct file *file; + struct fd f; struct timerfd_ctx *ctx; struct itimerspec ktmr, kotmr; int ret; @@ -297,10 +295,10 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, !timespec_valid(&ktmr.it_interval)) return -EINVAL; - file = timerfd_fget(ufd); - if (IS_ERR(file)) - return PTR_ERR(file); - ctx = file->private_data; + ret = timerfd_fget(ufd, &f); + if (ret) + return ret; + ctx = f.file->private_data; timerfd_setup_cancel(ctx, flags); @@ -334,7 +332,7 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, ret = timerfd_setup(ctx, flags, &ktmr); spin_unlock_irq(&ctx->wqh.lock); - fput(file); + fdput(f); if (otmr && copy_to_user(otmr, &kotmr, sizeof(kotmr))) return -EFAULT; @@ -343,14 +341,13 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr) { - struct file *file; + struct fd f; struct timerfd_ctx *ctx; struct itimerspec kotmr; - - file = timerfd_fget(ufd); - if (IS_ERR(file)) - return PTR_ERR(file); - ctx = file->private_data; + int ret = timerfd_fget(ufd, &f); + if (ret) + return ret; + ctx = f.file->private_data; spin_lock_irq(&ctx->wqh.lock); if (ctx->expired && ctx->tintv.tv64) { @@ -362,7 +359,7 @@ SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr) kotmr.it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); kotmr.it_interval = ktime_to_timespec(ctx->tintv); spin_unlock_irq(&ctx->wqh.lock); - fput(file); + fdput(f); return copy_to_user(otmr, &kotmr, sizeof(kotmr)) ? -EFAULT: 0; } diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index bc4f94b..e8e01d7 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -272,8 +272,8 @@ long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs) */ static int can_use_rp(struct ubifs_info *c) { - if (current_fsuid() == c->rp_uid || capable(CAP_SYS_RESOURCE) || - (c->rp_gid != 0 && in_group_p(c->rp_gid))) + if (uid_eq(current_fsuid(), c->rp_uid) || capable(CAP_SYS_RESOURCE) || + (!gid_eq(c->rp_gid, GLOBAL_ROOT_GID) && in_group_p(c->rp_gid))) return 1; return 0; } @@ -342,9 +342,8 @@ static int do_budget_space(struct ubifs_info *c) lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - c->lst.taken_empty_lebs; if (unlikely(rsvd_idx_lebs > lebs)) { - dbg_budg("out of indexing space: min_idx_lebs %d (old %d), " - "rsvd_idx_lebs %d", min_idx_lebs, c->bi.min_idx_lebs, - rsvd_idx_lebs); + dbg_budg("out of indexing space: min_idx_lebs %d (old %d), rsvd_idx_lebs %d", + min_idx_lebs, c->bi.min_idx_lebs, rsvd_idx_lebs); return -ENOSPC; } diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c index 8eda717..ff822934 100644 --- a/fs/ubifs/commit.c +++ b/fs/ubifs/commit.c @@ -293,8 +293,8 @@ int ubifs_bg_thread(void *info) int err; struct ubifs_info *c = info; - dbg_msg("background thread \"%s\" started, PID %d", - c->bgt_name, current->pid); + ubifs_msg("background thread \"%s\" started, PID %d", + c->bgt_name, current->pid); set_freezable(); while (1) { @@ -328,7 +328,7 @@ int ubifs_bg_thread(void *info) cond_resched(); } - dbg_msg("background thread \"%s\" stops", c->bgt_name); + ubifs_msg("background thread \"%s\" stops", c->bgt_name); return 0; } @@ -514,7 +514,7 @@ struct idx_node { struct list_head list; int iip; union ubifs_key upper_key; - struct ubifs_idx_node idx __attribute__((aligned(8))); + struct ubifs_idx_node idx __aligned(8); }; /** diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c index 11e4132..2bfa095 100644 --- a/fs/ubifs/compress.c +++ b/fs/ubifs/compress.c @@ -112,8 +112,7 @@ void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len, if (compr->comp_mutex) mutex_unlock(compr->comp_mutex); if (unlikely(err)) { - ubifs_warn("cannot compress %d bytes, compressor %s, " - "error %d, leave data uncompressed", + ubifs_warn("cannot compress %d bytes, compressor %s, error %d, leave data uncompressed", in_len, compr->name, err); goto no_compr; } @@ -176,8 +175,8 @@ int ubifs_decompress(const void *in_buf, int in_len, void *out_buf, if (compr->decomp_mutex) mutex_unlock(compr->decomp_mutex); if (err) - ubifs_err("cannot decompress %d bytes, compressor %s, " - "error %d", in_len, compr->name, err); + ubifs_err("cannot decompress %d bytes, compressor %s, error %d", + in_len, compr->name, err); return err; } diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index bb31672..6291163 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -219,15 +219,15 @@ const char *dbg_jhead(int jhead) static void dump_ch(const struct ubifs_ch *ch) { - printk(KERN_ERR "\tmagic %#x\n", le32_to_cpu(ch->magic)); - printk(KERN_ERR "\tcrc %#x\n", le32_to_cpu(ch->crc)); - printk(KERN_ERR "\tnode_type %d (%s)\n", ch->node_type, + pr_err("\tmagic %#x\n", le32_to_cpu(ch->magic)); + pr_err("\tcrc %#x\n", le32_to_cpu(ch->crc)); + pr_err("\tnode_type %d (%s)\n", ch->node_type, dbg_ntype(ch->node_type)); - printk(KERN_ERR "\tgroup_type %d (%s)\n", ch->group_type, + pr_err("\tgroup_type %d (%s)\n", ch->group_type, dbg_gtype(ch->group_type)); - printk(KERN_ERR "\tsqnum %llu\n", + pr_err("\tsqnum %llu\n", (unsigned long long)le64_to_cpu(ch->sqnum)); - printk(KERN_ERR "\tlen %u\n", le32_to_cpu(ch->len)); + pr_err("\tlen %u\n", le32_to_cpu(ch->len)); } void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode) @@ -238,43 +238,43 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode) struct ubifs_dent_node *dent, *pdent = NULL; int count = 2; - printk(KERN_ERR "Dump in-memory inode:"); - printk(KERN_ERR "\tinode %lu\n", inode->i_ino); - printk(KERN_ERR "\tsize %llu\n", + pr_err("Dump in-memory inode:"); + pr_err("\tinode %lu\n", inode->i_ino); + pr_err("\tsize %llu\n", (unsigned long long)i_size_read(inode)); - printk(KERN_ERR "\tnlink %u\n", inode->i_nlink); - printk(KERN_ERR "\tuid %u\n", (unsigned int)inode->i_uid); - printk(KERN_ERR "\tgid %u\n", (unsigned int)inode->i_gid); - printk(KERN_ERR "\tatime %u.%u\n", + pr_err("\tnlink %u\n", inode->i_nlink); + pr_err("\tuid %u\n", (unsigned int)i_uid_read(inode)); + pr_err("\tgid %u\n", (unsigned int)i_gid_read(inode)); + pr_err("\tatime %u.%u\n", (unsigned int)inode->i_atime.tv_sec, (unsigned int)inode->i_atime.tv_nsec); - printk(KERN_ERR "\tmtime %u.%u\n", + pr_err("\tmtime %u.%u\n", (unsigned int)inode->i_mtime.tv_sec, (unsigned int)inode->i_mtime.tv_nsec); - printk(KERN_ERR "\tctime %u.%u\n", + pr_err("\tctime %u.%u\n", (unsigned int)inode->i_ctime.tv_sec, (unsigned int)inode->i_ctime.tv_nsec); - printk(KERN_ERR "\tcreat_sqnum %llu\n", ui->creat_sqnum); - printk(KERN_ERR "\txattr_size %u\n", ui->xattr_size); - printk(KERN_ERR "\txattr_cnt %u\n", ui->xattr_cnt); - printk(KERN_ERR "\txattr_names %u\n", ui->xattr_names); - printk(KERN_ERR "\tdirty %u\n", ui->dirty); - printk(KERN_ERR "\txattr %u\n", ui->xattr); - printk(KERN_ERR "\tbulk_read %u\n", ui->xattr); - printk(KERN_ERR "\tsynced_i_size %llu\n", + pr_err("\tcreat_sqnum %llu\n", ui->creat_sqnum); + pr_err("\txattr_size %u\n", ui->xattr_size); + pr_err("\txattr_cnt %u\n", ui->xattr_cnt); + pr_err("\txattr_names %u\n", ui->xattr_names); + pr_err("\tdirty %u\n", ui->dirty); + pr_err("\txattr %u\n", ui->xattr); + pr_err("\tbulk_read %u\n", ui->xattr); + pr_err("\tsynced_i_size %llu\n", (unsigned long long)ui->synced_i_size); - printk(KERN_ERR "\tui_size %llu\n", + pr_err("\tui_size %llu\n", (unsigned long long)ui->ui_size); - printk(KERN_ERR "\tflags %d\n", ui->flags); - printk(KERN_ERR "\tcompr_type %d\n", ui->compr_type); - printk(KERN_ERR "\tlast_page_read %lu\n", ui->last_page_read); - printk(KERN_ERR "\tread_in_a_row %lu\n", ui->read_in_a_row); - printk(KERN_ERR "\tdata_len %d\n", ui->data_len); + pr_err("\tflags %d\n", ui->flags); + pr_err("\tcompr_type %d\n", ui->compr_type); + pr_err("\tlast_page_read %lu\n", ui->last_page_read); + pr_err("\tread_in_a_row %lu\n", ui->read_in_a_row); + pr_err("\tdata_len %d\n", ui->data_len); if (!S_ISDIR(inode->i_mode)) return; - printk(KERN_ERR "List of directory entries:\n"); + pr_err("List of directory entries:\n"); ubifs_assert(!mutex_is_locked(&c->tnc_mutex)); lowest_dent_key(c, &key, inode->i_ino); @@ -282,11 +282,11 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode) dent = ubifs_tnc_next_ent(c, &key, &nm); if (IS_ERR(dent)) { if (PTR_ERR(dent) != -ENOENT) - printk(KERN_ERR "error %ld\n", PTR_ERR(dent)); + pr_err("error %ld\n", PTR_ERR(dent)); break; } - printk(KERN_ERR "\t%d: %s (%s)\n", + pr_err("\t%d: %s (%s)\n", count++, dent->name, get_dent_type(dent->type)); nm.name = dent->name; @@ -305,12 +305,9 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) const struct ubifs_ch *ch = node; char key_buf[DBG_KEY_BUF_LEN]; - if (dbg_is_tst_rcvry(c)) - return; - /* If the magic is incorrect, just hexdump the first bytes */ if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) { - printk(KERN_ERR "Not a node, first %zu bytes:", UBIFS_CH_SZ); + pr_err("Not a node, first %zu bytes:", UBIFS_CH_SZ); print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 32, 1, (void *)node, UBIFS_CH_SZ, 1); return; @@ -324,8 +321,7 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) { const struct ubifs_pad_node *pad = node; - printk(KERN_ERR "\tpad_len %u\n", - le32_to_cpu(pad->pad_len)); + pr_err("\tpad_len %u\n", le32_to_cpu(pad->pad_len)); break; } case UBIFS_SB_NODE: @@ -333,112 +329,77 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) const struct ubifs_sb_node *sup = node; unsigned int sup_flags = le32_to_cpu(sup->flags); - printk(KERN_ERR "\tkey_hash %d (%s)\n", + pr_err("\tkey_hash %d (%s)\n", (int)sup->key_hash, get_key_hash(sup->key_hash)); - printk(KERN_ERR "\tkey_fmt %d (%s)\n", + pr_err("\tkey_fmt %d (%s)\n", (int)sup->key_fmt, get_key_fmt(sup->key_fmt)); - printk(KERN_ERR "\tflags %#x\n", sup_flags); - printk(KERN_ERR "\t big_lpt %u\n", + pr_err("\tflags %#x\n", sup_flags); + pr_err("\t big_lpt %u\n", !!(sup_flags & UBIFS_FLG_BIGLPT)); - printk(KERN_ERR "\t space_fixup %u\n", + pr_err("\t space_fixup %u\n", !!(sup_flags & UBIFS_FLG_SPACE_FIXUP)); - printk(KERN_ERR "\tmin_io_size %u\n", - le32_to_cpu(sup->min_io_size)); - printk(KERN_ERR "\tleb_size %u\n", - le32_to_cpu(sup->leb_size)); - printk(KERN_ERR "\tleb_cnt %u\n", - le32_to_cpu(sup->leb_cnt)); - printk(KERN_ERR "\tmax_leb_cnt %u\n", - le32_to_cpu(sup->max_leb_cnt)); - printk(KERN_ERR "\tmax_bud_bytes %llu\n", + pr_err("\tmin_io_size %u\n", le32_to_cpu(sup->min_io_size)); + pr_err("\tleb_size %u\n", le32_to_cpu(sup->leb_size)); + pr_err("\tleb_cnt %u\n", le32_to_cpu(sup->leb_cnt)); + pr_err("\tmax_leb_cnt %u\n", le32_to_cpu(sup->max_leb_cnt)); + pr_err("\tmax_bud_bytes %llu\n", (unsigned long long)le64_to_cpu(sup->max_bud_bytes)); - printk(KERN_ERR "\tlog_lebs %u\n", - le32_to_cpu(sup->log_lebs)); - printk(KERN_ERR "\tlpt_lebs %u\n", - le32_to_cpu(sup->lpt_lebs)); - printk(KERN_ERR "\torph_lebs %u\n", - le32_to_cpu(sup->orph_lebs)); - printk(KERN_ERR "\tjhead_cnt %u\n", - le32_to_cpu(sup->jhead_cnt)); - printk(KERN_ERR "\tfanout %u\n", - le32_to_cpu(sup->fanout)); - printk(KERN_ERR "\tlsave_cnt %u\n", - le32_to_cpu(sup->lsave_cnt)); - printk(KERN_ERR "\tdefault_compr %u\n", + pr_err("\tlog_lebs %u\n", le32_to_cpu(sup->log_lebs)); + pr_err("\tlpt_lebs %u\n", le32_to_cpu(sup->lpt_lebs)); + pr_err("\torph_lebs %u\n", le32_to_cpu(sup->orph_lebs)); + pr_err("\tjhead_cnt %u\n", le32_to_cpu(sup->jhead_cnt)); + pr_err("\tfanout %u\n", le32_to_cpu(sup->fanout)); + pr_err("\tlsave_cnt %u\n", le32_to_cpu(sup->lsave_cnt)); + pr_err("\tdefault_compr %u\n", (int)le16_to_cpu(sup->default_compr)); - printk(KERN_ERR "\trp_size %llu\n", + pr_err("\trp_size %llu\n", (unsigned long long)le64_to_cpu(sup->rp_size)); - printk(KERN_ERR "\trp_uid %u\n", - le32_to_cpu(sup->rp_uid)); - printk(KERN_ERR "\trp_gid %u\n", - le32_to_cpu(sup->rp_gid)); - printk(KERN_ERR "\tfmt_version %u\n", - le32_to_cpu(sup->fmt_version)); - printk(KERN_ERR "\ttime_gran %u\n", - le32_to_cpu(sup->time_gran)); - printk(KERN_ERR "\tUUID %pUB\n", - sup->uuid); + pr_err("\trp_uid %u\n", le32_to_cpu(sup->rp_uid)); + pr_err("\trp_gid %u\n", le32_to_cpu(sup->rp_gid)); + pr_err("\tfmt_version %u\n", le32_to_cpu(sup->fmt_version)); + pr_err("\ttime_gran %u\n", le32_to_cpu(sup->time_gran)); + pr_err("\tUUID %pUB\n", sup->uuid); break; } case UBIFS_MST_NODE: { const struct ubifs_mst_node *mst = node; - printk(KERN_ERR "\thighest_inum %llu\n", + pr_err("\thighest_inum %llu\n", (unsigned long long)le64_to_cpu(mst->highest_inum)); - printk(KERN_ERR "\tcommit number %llu\n", + pr_err("\tcommit number %llu\n", (unsigned long long)le64_to_cpu(mst->cmt_no)); - printk(KERN_ERR "\tflags %#x\n", - le32_to_cpu(mst->flags)); - printk(KERN_ERR "\tlog_lnum %u\n", - le32_to_cpu(mst->log_lnum)); - printk(KERN_ERR "\troot_lnum %u\n", - le32_to_cpu(mst->root_lnum)); - printk(KERN_ERR "\troot_offs %u\n", - le32_to_cpu(mst->root_offs)); - printk(KERN_ERR "\troot_len %u\n", - le32_to_cpu(mst->root_len)); - printk(KERN_ERR "\tgc_lnum %u\n", - le32_to_cpu(mst->gc_lnum)); - printk(KERN_ERR "\tihead_lnum %u\n", - le32_to_cpu(mst->ihead_lnum)); - printk(KERN_ERR "\tihead_offs %u\n", - le32_to_cpu(mst->ihead_offs)); - printk(KERN_ERR "\tindex_size %llu\n", + pr_err("\tflags %#x\n", le32_to_cpu(mst->flags)); + pr_err("\tlog_lnum %u\n", le32_to_cpu(mst->log_lnum)); + pr_err("\troot_lnum %u\n", le32_to_cpu(mst->root_lnum)); + pr_err("\troot_offs %u\n", le32_to_cpu(mst->root_offs)); + pr_err("\troot_len %u\n", le32_to_cpu(mst->root_len)); + pr_err("\tgc_lnum %u\n", le32_to_cpu(mst->gc_lnum)); + pr_err("\tihead_lnum %u\n", le32_to_cpu(mst->ihead_lnum)); + pr_err("\tihead_offs %u\n", le32_to_cpu(mst->ihead_offs)); + pr_err("\tindex_size %llu\n", (unsigned long long)le64_to_cpu(mst->index_size)); - printk(KERN_ERR "\tlpt_lnum %u\n", - le32_to_cpu(mst->lpt_lnum)); - printk(KERN_ERR "\tlpt_offs %u\n", - le32_to_cpu(mst->lpt_offs)); - printk(KERN_ERR "\tnhead_lnum %u\n", - le32_to_cpu(mst->nhead_lnum)); - printk(KERN_ERR "\tnhead_offs %u\n", - le32_to_cpu(mst->nhead_offs)); - printk(KERN_ERR "\tltab_lnum %u\n", - le32_to_cpu(mst->ltab_lnum)); - printk(KERN_ERR "\tltab_offs %u\n", - le32_to_cpu(mst->ltab_offs)); - printk(KERN_ERR "\tlsave_lnum %u\n", - le32_to_cpu(mst->lsave_lnum)); - printk(KERN_ERR "\tlsave_offs %u\n", - le32_to_cpu(mst->lsave_offs)); - printk(KERN_ERR "\tlscan_lnum %u\n", - le32_to_cpu(mst->lscan_lnum)); - printk(KERN_ERR "\tleb_cnt %u\n", - le32_to_cpu(mst->leb_cnt)); - printk(KERN_ERR "\tempty_lebs %u\n", - le32_to_cpu(mst->empty_lebs)); - printk(KERN_ERR "\tidx_lebs %u\n", - le32_to_cpu(mst->idx_lebs)); - printk(KERN_ERR "\ttotal_free %llu\n", + pr_err("\tlpt_lnum %u\n", le32_to_cpu(mst->lpt_lnum)); + pr_err("\tlpt_offs %u\n", le32_to_cpu(mst->lpt_offs)); + pr_err("\tnhead_lnum %u\n", le32_to_cpu(mst->nhead_lnum)); + pr_err("\tnhead_offs %u\n", le32_to_cpu(mst->nhead_offs)); + pr_err("\tltab_lnum %u\n", le32_to_cpu(mst->ltab_lnum)); + pr_err("\tltab_offs %u\n", le32_to_cpu(mst->ltab_offs)); + pr_err("\tlsave_lnum %u\n", le32_to_cpu(mst->lsave_lnum)); + pr_err("\tlsave_offs %u\n", le32_to_cpu(mst->lsave_offs)); + pr_err("\tlscan_lnum %u\n", le32_to_cpu(mst->lscan_lnum)); + pr_err("\tleb_cnt %u\n", le32_to_cpu(mst->leb_cnt)); + pr_err("\tempty_lebs %u\n", le32_to_cpu(mst->empty_lebs)); + pr_err("\tidx_lebs %u\n", le32_to_cpu(mst->idx_lebs)); + pr_err("\ttotal_free %llu\n", (unsigned long long)le64_to_cpu(mst->total_free)); - printk(KERN_ERR "\ttotal_dirty %llu\n", + pr_err("\ttotal_dirty %llu\n", (unsigned long long)le64_to_cpu(mst->total_dirty)); - printk(KERN_ERR "\ttotal_used %llu\n", + pr_err("\ttotal_used %llu\n", (unsigned long long)le64_to_cpu(mst->total_used)); - printk(KERN_ERR "\ttotal_dead %llu\n", + pr_err("\ttotal_dead %llu\n", (unsigned long long)le64_to_cpu(mst->total_dead)); - printk(KERN_ERR "\ttotal_dark %llu\n", + pr_err("\ttotal_dark %llu\n", (unsigned long long)le64_to_cpu(mst->total_dark)); break; } @@ -446,12 +407,9 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) { const struct ubifs_ref_node *ref = node; - printk(KERN_ERR "\tlnum %u\n", - le32_to_cpu(ref->lnum)); - printk(KERN_ERR "\toffs %u\n", - le32_to_cpu(ref->offs)); - printk(KERN_ERR "\tjhead %u\n", - le32_to_cpu(ref->jhead)); + pr_err("\tlnum %u\n", le32_to_cpu(ref->lnum)); + pr_err("\toffs %u\n", le32_to_cpu(ref->offs)); + pr_err("\tjhead %u\n", le32_to_cpu(ref->jhead)); break; } case UBIFS_INO_NODE: @@ -459,41 +417,32 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) const struct ubifs_ino_node *ino = node; key_read(c, &ino->key, &key); - printk(KERN_ERR "\tkey %s\n", + pr_err("\tkey %s\n", dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); - printk(KERN_ERR "\tcreat_sqnum %llu\n", + pr_err("\tcreat_sqnum %llu\n", (unsigned long long)le64_to_cpu(ino->creat_sqnum)); - printk(KERN_ERR "\tsize %llu\n", + pr_err("\tsize %llu\n", (unsigned long long)le64_to_cpu(ino->size)); - printk(KERN_ERR "\tnlink %u\n", - le32_to_cpu(ino->nlink)); - printk(KERN_ERR "\tatime %lld.%u\n", + pr_err("\tnlink %u\n", le32_to_cpu(ino->nlink)); + pr_err("\tatime %lld.%u\n", (long long)le64_to_cpu(ino->atime_sec), le32_to_cpu(ino->atime_nsec)); - printk(KERN_ERR "\tmtime %lld.%u\n", + pr_err("\tmtime %lld.%u\n", (long long)le64_to_cpu(ino->mtime_sec), le32_to_cpu(ino->mtime_nsec)); - printk(KERN_ERR "\tctime %lld.%u\n", + pr_err("\tctime %lld.%u\n", (long long)le64_to_cpu(ino->ctime_sec), le32_to_cpu(ino->ctime_nsec)); - printk(KERN_ERR "\tuid %u\n", - le32_to_cpu(ino->uid)); - printk(KERN_ERR "\tgid %u\n", - le32_to_cpu(ino->gid)); - printk(KERN_ERR "\tmode %u\n", - le32_to_cpu(ino->mode)); - printk(KERN_ERR "\tflags %#x\n", - le32_to_cpu(ino->flags)); - printk(KERN_ERR "\txattr_cnt %u\n", - le32_to_cpu(ino->xattr_cnt)); - printk(KERN_ERR "\txattr_size %u\n", - le32_to_cpu(ino->xattr_size)); - printk(KERN_ERR "\txattr_names %u\n", - le32_to_cpu(ino->xattr_names)); - printk(KERN_ERR "\tcompr_type %#x\n", + pr_err("\tuid %u\n", le32_to_cpu(ino->uid)); + pr_err("\tgid %u\n", le32_to_cpu(ino->gid)); + pr_err("\tmode %u\n", le32_to_cpu(ino->mode)); + pr_err("\tflags %#x\n", le32_to_cpu(ino->flags)); + pr_err("\txattr_cnt %u\n", le32_to_cpu(ino->xattr_cnt)); + pr_err("\txattr_size %u\n", le32_to_cpu(ino->xattr_size)); + pr_err("\txattr_names %u\n", le32_to_cpu(ino->xattr_names)); + pr_err("\tcompr_type %#x\n", (int)le16_to_cpu(ino->compr_type)); - printk(KERN_ERR "\tdata len %u\n", - le32_to_cpu(ino->data_len)); + pr_err("\tdata len %u\n", le32_to_cpu(ino->data_len)); break; } case UBIFS_DENT_NODE: @@ -503,22 +452,21 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) int nlen = le16_to_cpu(dent->nlen); key_read(c, &dent->key, &key); - printk(KERN_ERR "\tkey %s\n", + pr_err("\tkey %s\n", dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); - printk(KERN_ERR "\tinum %llu\n", + pr_err("\tinum %llu\n", (unsigned long long)le64_to_cpu(dent->inum)); - printk(KERN_ERR "\ttype %d\n", (int)dent->type); - printk(KERN_ERR "\tnlen %d\n", nlen); - printk(KERN_ERR "\tname "); + pr_err("\ttype %d\n", (int)dent->type); + pr_err("\tnlen %d\n", nlen); + pr_err("\tname "); if (nlen > UBIFS_MAX_NLEN) - printk(KERN_ERR "(bad name length, not printing, " - "bad or corrupted node)"); + pr_err("(bad name length, not printing, bad or corrupted node)"); else { for (i = 0; i < nlen && dent->name[i]; i++) - printk(KERN_CONT "%c", dent->name[i]); + pr_cont("%c", dent->name[i]); } - printk(KERN_CONT "\n"); + pr_cont("\n"); break; } @@ -528,15 +476,13 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ; key_read(c, &dn->key, &key); - printk(KERN_ERR "\tkey %s\n", + pr_err("\tkey %s\n", dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); - printk(KERN_ERR "\tsize %u\n", - le32_to_cpu(dn->size)); - printk(KERN_ERR "\tcompr_typ %d\n", + pr_err("\tsize %u\n", le32_to_cpu(dn->size)); + pr_err("\tcompr_typ %d\n", (int)le16_to_cpu(dn->compr_type)); - printk(KERN_ERR "\tdata size %d\n", - dlen); - printk(KERN_ERR "\tdata:\n"); + pr_err("\tdata size %d\n", dlen); + pr_err("\tdata:\n"); print_hex_dump(KERN_ERR, "\t", DUMP_PREFIX_OFFSET, 32, 1, (void *)&dn->data, dlen, 0); break; @@ -545,11 +491,10 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) { const struct ubifs_trun_node *trun = node; - printk(KERN_ERR "\tinum %u\n", - le32_to_cpu(trun->inum)); - printk(KERN_ERR "\told_size %llu\n", + pr_err("\tinum %u\n", le32_to_cpu(trun->inum)); + pr_err("\told_size %llu\n", (unsigned long long)le64_to_cpu(trun->old_size)); - printk(KERN_ERR "\tnew_size %llu\n", + pr_err("\tnew_size %llu\n", (unsigned long long)le64_to_cpu(trun->new_size)); break; } @@ -558,17 +503,16 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) const struct ubifs_idx_node *idx = node; n = le16_to_cpu(idx->child_cnt); - printk(KERN_ERR "\tchild_cnt %d\n", n); - printk(KERN_ERR "\tlevel %d\n", - (int)le16_to_cpu(idx->level)); - printk(KERN_ERR "\tBranches:\n"); + pr_err("\tchild_cnt %d\n", n); + pr_err("\tlevel %d\n", (int)le16_to_cpu(idx->level)); + pr_err("\tBranches:\n"); for (i = 0; i < n && i < c->fanout - 1; i++) { const struct ubifs_branch *br; br = ubifs_idx_branch(c, idx, i); key_read(c, &br->key, &key); - printk(KERN_ERR "\t%d: LEB %d:%d len %d key %s\n", + pr_err("\t%d: LEB %d:%d len %d key %s\n", i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs), le32_to_cpu(br->len), dbg_snprintf_key(c, &key, key_buf, @@ -582,20 +526,20 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) { const struct ubifs_orph_node *orph = node; - printk(KERN_ERR "\tcommit number %llu\n", + pr_err("\tcommit number %llu\n", (unsigned long long) le64_to_cpu(orph->cmt_no) & LLONG_MAX); - printk(KERN_ERR "\tlast node flag %llu\n", + pr_err("\tlast node flag %llu\n", (unsigned long long)(le64_to_cpu(orph->cmt_no)) >> 63); n = (le32_to_cpu(ch->len) - UBIFS_ORPH_NODE_SZ) >> 3; - printk(KERN_ERR "\t%d orphan inode numbers:\n", n); + pr_err("\t%d orphan inode numbers:\n", n); for (i = 0; i < n; i++) - printk(KERN_ERR "\t ino %llu\n", + pr_err("\t ino %llu\n", (unsigned long long)le64_to_cpu(orph->inos[i])); break; } default: - printk(KERN_ERR "node type %d was not recognized\n", + pr_err("node type %d was not recognized\n", (int)ch->node_type); } spin_unlock(&dbg_lock); @@ -604,16 +548,16 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) void ubifs_dump_budget_req(const struct ubifs_budget_req *req) { spin_lock(&dbg_lock); - printk(KERN_ERR "Budgeting request: new_ino %d, dirtied_ino %d\n", + pr_err("Budgeting request: new_ino %d, dirtied_ino %d\n", req->new_ino, req->dirtied_ino); - printk(KERN_ERR "\tnew_ino_d %d, dirtied_ino_d %d\n", + pr_err("\tnew_ino_d %d, dirtied_ino_d %d\n", req->new_ino_d, req->dirtied_ino_d); - printk(KERN_ERR "\tnew_page %d, dirtied_page %d\n", + pr_err("\tnew_page %d, dirtied_page %d\n", req->new_page, req->dirtied_page); - printk(KERN_ERR "\tnew_dent %d, mod_dent %d\n", + pr_err("\tnew_dent %d, mod_dent %d\n", req->new_dent, req->mod_dent); - printk(KERN_ERR "\tidx_growth %d\n", req->idx_growth); - printk(KERN_ERR "\tdata_growth %d dd_growth %d\n", + pr_err("\tidx_growth %d\n", req->idx_growth); + pr_err("\tdata_growth %d dd_growth %d\n", req->data_growth, req->dd_growth); spin_unlock(&dbg_lock); } @@ -621,14 +565,12 @@ void ubifs_dump_budget_req(const struct ubifs_budget_req *req) void ubifs_dump_lstats(const struct ubifs_lp_stats *lst) { spin_lock(&dbg_lock); - printk(KERN_ERR "(pid %d) Lprops statistics: empty_lebs %d, " - "idx_lebs %d\n", current->pid, lst->empty_lebs, lst->idx_lebs); - printk(KERN_ERR "\ttaken_empty_lebs %d, total_free %lld, " - "total_dirty %lld\n", lst->taken_empty_lebs, lst->total_free, - lst->total_dirty); - printk(KERN_ERR "\ttotal_used %lld, total_dark %lld, " - "total_dead %lld\n", lst->total_used, lst->total_dark, - lst->total_dead); + pr_err("(pid %d) Lprops statistics: empty_lebs %d, idx_lebs %d\n", + current->pid, lst->empty_lebs, lst->idx_lebs); + pr_err("\ttaken_empty_lebs %d, total_free %lld, total_dirty %lld\n", + lst->taken_empty_lebs, lst->total_free, lst->total_dirty); + pr_err("\ttotal_used %lld, total_dark %lld, total_dead %lld\n", + lst->total_used, lst->total_dark, lst->total_dead); spin_unlock(&dbg_lock); } @@ -642,21 +584,17 @@ void ubifs_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi) spin_lock(&c->space_lock); spin_lock(&dbg_lock); - printk(KERN_ERR "(pid %d) Budgeting info: data budget sum %lld, " - "total budget sum %lld\n", current->pid, - bi->data_growth + bi->dd_growth, + pr_err("(pid %d) Budgeting info: data budget sum %lld, total budget sum %lld\n", + current->pid, bi->data_growth + bi->dd_growth, bi->data_growth + bi->dd_growth + bi->idx_growth); - printk(KERN_ERR "\tbudg_data_growth %lld, budg_dd_growth %lld, " - "budg_idx_growth %lld\n", bi->data_growth, bi->dd_growth, - bi->idx_growth); - printk(KERN_ERR "\tmin_idx_lebs %d, old_idx_sz %llu, " - "uncommitted_idx %lld\n", bi->min_idx_lebs, bi->old_idx_sz, - bi->uncommitted_idx); - printk(KERN_ERR "\tpage_budget %d, inode_budget %d, dent_budget %d\n", + pr_err("\tbudg_data_growth %lld, budg_dd_growth %lld, budg_idx_growth %lld\n", + bi->data_growth, bi->dd_growth, bi->idx_growth); + pr_err("\tmin_idx_lebs %d, old_idx_sz %llu, uncommitted_idx %lld\n", + bi->min_idx_lebs, bi->old_idx_sz, bi->uncommitted_idx); + pr_err("\tpage_budget %d, inode_budget %d, dent_budget %d\n", bi->page_budget, bi->inode_budget, bi->dent_budget); - printk(KERN_ERR "\tnospace %u, nospace_rp %u\n", - bi->nospace, bi->nospace_rp); - printk(KERN_ERR "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n", + pr_err("\tnospace %u, nospace_rp %u\n", bi->nospace, bi->nospace_rp); + pr_err("\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n", c->dark_wm, c->dead_wm, c->max_idx_node_sz); if (bi != &c->bi) @@ -667,38 +605,37 @@ void ubifs_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi) */ goto out_unlock; - printk(KERN_ERR "\tfreeable_cnt %d, calc_idx_sz %lld, idx_gc_cnt %d\n", + pr_err("\tfreeable_cnt %d, calc_idx_sz %lld, idx_gc_cnt %d\n", c->freeable_cnt, c->calc_idx_sz, c->idx_gc_cnt); - printk(KERN_ERR "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, " - "clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt), + pr_err("\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, clean_zn_cnt %ld\n", + atomic_long_read(&c->dirty_pg_cnt), atomic_long_read(&c->dirty_zn_cnt), atomic_long_read(&c->clean_zn_cnt)); - printk(KERN_ERR "\tgc_lnum %d, ihead_lnum %d\n", - c->gc_lnum, c->ihead_lnum); + pr_err("\tgc_lnum %d, ihead_lnum %d\n", c->gc_lnum, c->ihead_lnum); /* If we are in R/O mode, journal heads do not exist */ if (c->jheads) for (i = 0; i < c->jhead_cnt; i++) - printk(KERN_ERR "\tjhead %s\t LEB %d\n", + pr_err("\tjhead %s\t LEB %d\n", dbg_jhead(c->jheads[i].wbuf.jhead), c->jheads[i].wbuf.lnum); for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) { bud = rb_entry(rb, struct ubifs_bud, rb); - printk(KERN_ERR "\tbud LEB %d\n", bud->lnum); + pr_err("\tbud LEB %d\n", bud->lnum); } list_for_each_entry(bud, &c->old_buds, list) - printk(KERN_ERR "\told bud LEB %d\n", bud->lnum); + pr_err("\told bud LEB %d\n", bud->lnum); list_for_each_entry(idx_gc, &c->idx_gc, list) - printk(KERN_ERR "\tGC'ed idx LEB %d unmap %d\n", + pr_err("\tGC'ed idx LEB %d unmap %d\n", idx_gc->lnum, idx_gc->unmap); - printk(KERN_ERR "\tcommit state %d\n", c->cmt_state); + pr_err("\tcommit state %d\n", c->cmt_state); /* Print budgeting predictions */ available = ubifs_calc_available(c, c->bi.min_idx_lebs); outstanding = c->bi.data_growth + c->bi.dd_growth; free = ubifs_get_free_space_nolock(c); - printk(KERN_ERR "Budgeting predictions:\n"); - printk(KERN_ERR "\tavailable: %lld, outstanding %lld, free %lld\n", + pr_err("Budgeting predictions:\n"); + pr_err("\tavailable: %lld, outstanding %lld, free %lld\n", available, outstanding, free); out_unlock: spin_unlock(&dbg_lock); @@ -718,21 +655,19 @@ void ubifs_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp) dark = ubifs_calc_dark(c, spc); if (lp->flags & LPROPS_INDEX) - printk(KERN_ERR "LEB %-7d free %-8d dirty %-8d used %-8d " - "free + dirty %-8d flags %#x (", lp->lnum, lp->free, - lp->dirty, c->leb_size - spc, spc, lp->flags); + pr_err("LEB %-7d free %-8d dirty %-8d used %-8d free + dirty %-8d flags %#x (", + lp->lnum, lp->free, lp->dirty, c->leb_size - spc, spc, + lp->flags); else - printk(KERN_ERR "LEB %-7d free %-8d dirty %-8d used %-8d " - "free + dirty %-8d dark %-4d dead %-4d nodes fit %-3d " - "flags %#-4x (", lp->lnum, lp->free, lp->dirty, - c->leb_size - spc, spc, dark, dead, - (int)(spc / UBIFS_MAX_NODE_SZ), lp->flags); + pr_err("LEB %-7d free %-8d dirty %-8d used %-8d free + dirty %-8d dark %-4d dead %-4d nodes fit %-3d flags %#-4x (", + lp->lnum, lp->free, lp->dirty, c->leb_size - spc, spc, + dark, dead, (int)(spc / UBIFS_MAX_NODE_SZ), lp->flags); if (lp->flags & LPROPS_TAKEN) { if (lp->flags & LPROPS_INDEX) - printk(KERN_CONT "index, taken"); + pr_cont("index, taken"); else - printk(KERN_CONT "taken"); + pr_cont("taken"); } else { const char *s; @@ -769,7 +704,7 @@ void ubifs_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp) break; } } - printk(KERN_CONT "%s", s); + pr_cont("%s", s); } for (rb = rb_first((struct rb_root *)&c->buds); rb; rb = rb_next(rb)) { @@ -784,19 +719,18 @@ void ubifs_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp) */ if (c->jheads && lp->lnum == c->jheads[i].wbuf.lnum) { - printk(KERN_CONT ", jhead %s", - dbg_jhead(i)); + pr_cont(", jhead %s", dbg_jhead(i)); head = 1; } } if (!head) - printk(KERN_CONT ", bud of jhead %s", + pr_cont(", bud of jhead %s", dbg_jhead(bud->jhead)); } } if (lp->lnum == c->gc_lnum) - printk(KERN_CONT ", GC LEB"); - printk(KERN_CONT ")\n"); + pr_cont(", GC LEB"); + pr_cont(")\n"); } void ubifs_dump_lprops(struct ubifs_info *c) @@ -805,8 +739,7 @@ void ubifs_dump_lprops(struct ubifs_info *c) struct ubifs_lprops lp; struct ubifs_lp_stats lst; - printk(KERN_ERR "(pid %d) start dumping LEB properties\n", - current->pid); + pr_err("(pid %d) start dumping LEB properties\n", current->pid); ubifs_get_lp_stats(c, &lst); ubifs_dump_lstats(&lst); @@ -817,8 +750,7 @@ void ubifs_dump_lprops(struct ubifs_info *c) ubifs_dump_lprop(c, &lp); } - printk(KERN_ERR "(pid %d) finish dumping LEB properties\n", - current->pid); + pr_err("(pid %d) finish dumping LEB properties\n", current->pid); } void ubifs_dump_lpt_info(struct ubifs_info *c) @@ -826,37 +758,36 @@ void ubifs_dump_lpt_info(struct ubifs_info *c) int i; spin_lock(&dbg_lock); - printk(KERN_ERR "(pid %d) dumping LPT information\n", current->pid); - printk(KERN_ERR "\tlpt_sz: %lld\n", c->lpt_sz); - printk(KERN_ERR "\tpnode_sz: %d\n", c->pnode_sz); - printk(KERN_ERR "\tnnode_sz: %d\n", c->nnode_sz); - printk(KERN_ERR "\tltab_sz: %d\n", c->ltab_sz); - printk(KERN_ERR "\tlsave_sz: %d\n", c->lsave_sz); - printk(KERN_ERR "\tbig_lpt: %d\n", c->big_lpt); - printk(KERN_ERR "\tlpt_hght: %d\n", c->lpt_hght); - printk(KERN_ERR "\tpnode_cnt: %d\n", c->pnode_cnt); - printk(KERN_ERR "\tnnode_cnt: %d\n", c->nnode_cnt); - printk(KERN_ERR "\tdirty_pn_cnt: %d\n", c->dirty_pn_cnt); - printk(KERN_ERR "\tdirty_nn_cnt: %d\n", c->dirty_nn_cnt); - printk(KERN_ERR "\tlsave_cnt: %d\n", c->lsave_cnt); - printk(KERN_ERR "\tspace_bits: %d\n", c->space_bits); - printk(KERN_ERR "\tlpt_lnum_bits: %d\n", c->lpt_lnum_bits); - printk(KERN_ERR "\tlpt_offs_bits: %d\n", c->lpt_offs_bits); - printk(KERN_ERR "\tlpt_spc_bits: %d\n", c->lpt_spc_bits); - printk(KERN_ERR "\tpcnt_bits: %d\n", c->pcnt_bits); - printk(KERN_ERR "\tlnum_bits: %d\n", c->lnum_bits); - printk(KERN_ERR "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs); - printk(KERN_ERR "\tLPT head is at %d:%d\n", + pr_err("(pid %d) dumping LPT information\n", current->pid); + pr_err("\tlpt_sz: %lld\n", c->lpt_sz); + pr_err("\tpnode_sz: %d\n", c->pnode_sz); + pr_err("\tnnode_sz: %d\n", c->nnode_sz); + pr_err("\tltab_sz: %d\n", c->ltab_sz); + pr_err("\tlsave_sz: %d\n", c->lsave_sz); + pr_err("\tbig_lpt: %d\n", c->big_lpt); + pr_err("\tlpt_hght: %d\n", c->lpt_hght); + pr_err("\tpnode_cnt: %d\n", c->pnode_cnt); + pr_err("\tnnode_cnt: %d\n", c->nnode_cnt); + pr_err("\tdirty_pn_cnt: %d\n", c->dirty_pn_cnt); + pr_err("\tdirty_nn_cnt: %d\n", c->dirty_nn_cnt); + pr_err("\tlsave_cnt: %d\n", c->lsave_cnt); + pr_err("\tspace_bits: %d\n", c->space_bits); + pr_err("\tlpt_lnum_bits: %d\n", c->lpt_lnum_bits); + pr_err("\tlpt_offs_bits: %d\n", c->lpt_offs_bits); + pr_err("\tlpt_spc_bits: %d\n", c->lpt_spc_bits); + pr_err("\tpcnt_bits: %d\n", c->pcnt_bits); + pr_err("\tlnum_bits: %d\n", c->lnum_bits); + pr_err("\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs); + pr_err("\tLPT head is at %d:%d\n", c->nhead_lnum, c->nhead_offs); - printk(KERN_ERR "\tLPT ltab is at %d:%d\n", - c->ltab_lnum, c->ltab_offs); + pr_err("\tLPT ltab is at %d:%d\n", c->ltab_lnum, c->ltab_offs); if (c->big_lpt) - printk(KERN_ERR "\tLPT lsave is at %d:%d\n", + pr_err("\tLPT lsave is at %d:%d\n", c->lsave_lnum, c->lsave_offs); for (i = 0; i < c->lpt_lebs; i++) - printk(KERN_ERR "\tLPT LEB %d free %d dirty %d tgc %d " - "cmt %d\n", i + c->lpt_first, c->ltab[i].free, - c->ltab[i].dirty, c->ltab[i].tgc, c->ltab[i].cmt); + pr_err("\tLPT LEB %d free %d dirty %d tgc %d cmt %d\n", + i + c->lpt_first, c->ltab[i].free, c->ltab[i].dirty, + c->ltab[i].tgc, c->ltab[i].cmt); spin_unlock(&dbg_lock); } @@ -865,13 +796,13 @@ void ubifs_dump_sleb(const struct ubifs_info *c, { struct ubifs_scan_node *snod; - printk(KERN_ERR "(pid %d) start dumping scanned data from LEB %d:%d\n", + pr_err("(pid %d) start dumping scanned data from LEB %d:%d\n", current->pid, sleb->lnum, offs); list_for_each_entry(snod, &sleb->nodes, list) { cond_resched(); - printk(KERN_ERR "Dumping node at LEB %d:%d len %d\n", sleb->lnum, - snod->offs, snod->len); + pr_err("Dumping node at LEB %d:%d len %d\n", + sleb->lnum, snod->offs, snod->len); ubifs_dump_node(c, snod->node); } } @@ -882,11 +813,7 @@ void ubifs_dump_leb(const struct ubifs_info *c, int lnum) struct ubifs_scan_node *snod; void *buf; - if (dbg_is_tst_rcvry(c)) - return; - - printk(KERN_ERR "(pid %d) start dumping LEB %d\n", - current->pid, lnum); + pr_err("(pid %d) start dumping LEB %d\n", current->pid, lnum); buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); if (!buf) { @@ -900,18 +827,17 @@ void ubifs_dump_leb(const struct ubifs_info *c, int lnum) goto out; } - printk(KERN_ERR "LEB %d has %d nodes ending at %d\n", lnum, + pr_err("LEB %d has %d nodes ending at %d\n", lnum, sleb->nodes_cnt, sleb->endpt); list_for_each_entry(snod, &sleb->nodes, list) { cond_resched(); - printk(KERN_ERR "Dumping node at LEB %d:%d len %d\n", lnum, + pr_err("Dumping node at LEB %d:%d len %d\n", lnum, snod->offs, snod->len); ubifs_dump_node(c, snod->node); } - printk(KERN_ERR "(pid %d) finish dumping LEB %d\n", - current->pid, lnum); + pr_err("(pid %d) finish dumping LEB %d\n", current->pid, lnum); ubifs_scan_destroy(sleb); out: @@ -932,33 +858,28 @@ void ubifs_dump_znode(const struct ubifs_info *c, else zbr = &c->zroot; - printk(KERN_ERR "znode %p, LEB %d:%d len %d parent %p iip %d level %d" - " child_cnt %d flags %lx\n", znode, zbr->lnum, zbr->offs, - zbr->len, znode->parent, znode->iip, znode->level, - znode->child_cnt, znode->flags); + pr_err("znode %p, LEB %d:%d len %d parent %p iip %d level %d child_cnt %d flags %lx\n", + znode, zbr->lnum, zbr->offs, zbr->len, znode->parent, znode->iip, + znode->level, znode->child_cnt, znode->flags); if (znode->child_cnt <= 0 || znode->child_cnt > c->fanout) { spin_unlock(&dbg_lock); return; } - printk(KERN_ERR "zbranches:\n"); + pr_err("zbranches:\n"); for (n = 0; n < znode->child_cnt; n++) { zbr = &znode->zbranch[n]; if (znode->level > 0) - printk(KERN_ERR "\t%d: znode %p LEB %d:%d len %d key " - "%s\n", n, zbr->znode, zbr->lnum, - zbr->offs, zbr->len, - dbg_snprintf_key(c, &zbr->key, - key_buf, - DBG_KEY_BUF_LEN)); + pr_err("\t%d: znode %p LEB %d:%d len %d key %s\n", + n, zbr->znode, zbr->lnum, zbr->offs, zbr->len, + dbg_snprintf_key(c, &zbr->key, key_buf, + DBG_KEY_BUF_LEN)); else - printk(KERN_ERR "\t%d: LNC %p LEB %d:%d len %d key " - "%s\n", n, zbr->znode, zbr->lnum, - zbr->offs, zbr->len, - dbg_snprintf_key(c, &zbr->key, - key_buf, - DBG_KEY_BUF_LEN)); + pr_err("\t%d: LNC %p LEB %d:%d len %d key %s\n", + n, zbr->znode, zbr->lnum, zbr->offs, zbr->len, + dbg_snprintf_key(c, &zbr->key, key_buf, + DBG_KEY_BUF_LEN)); } spin_unlock(&dbg_lock); } @@ -967,16 +888,16 @@ void ubifs_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat) { int i; - printk(KERN_ERR "(pid %d) start dumping heap cat %d (%d elements)\n", + pr_err("(pid %d) start dumping heap cat %d (%d elements)\n", current->pid, cat, heap->cnt); for (i = 0; i < heap->cnt; i++) { struct ubifs_lprops *lprops = heap->arr[i]; - printk(KERN_ERR "\t%d. LEB %d hpos %d free %d dirty %d " - "flags %d\n", i, lprops->lnum, lprops->hpos, - lprops->free, lprops->dirty, lprops->flags); + pr_err("\t%d. LEB %d hpos %d free %d dirty %d flags %d\n", + i, lprops->lnum, lprops->hpos, lprops->free, + lprops->dirty, lprops->flags); } - printk(KERN_ERR "(pid %d) finish dumping heap\n", current->pid); + pr_err("(pid %d) finish dumping heap\n", current->pid); } void ubifs_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, @@ -984,15 +905,15 @@ void ubifs_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, { int i; - printk(KERN_ERR "(pid %d) dumping pnode:\n", current->pid); - printk(KERN_ERR "\taddress %zx parent %zx cnext %zx\n", + pr_err("(pid %d) dumping pnode:\n", current->pid); + pr_err("\taddress %zx parent %zx cnext %zx\n", (size_t)pnode, (size_t)parent, (size_t)pnode->cnext); - printk(KERN_ERR "\tflags %lu iip %d level %d num %d\n", + pr_err("\tflags %lu iip %d level %d num %d\n", pnode->flags, iip, pnode->level, pnode->num); for (i = 0; i < UBIFS_LPT_FANOUT; i++) { struct ubifs_lprops *lp = &pnode->lprops[i]; - printk(KERN_ERR "\t%d: free %d dirty %d flags %d lnum %d\n", + pr_err("\t%d: free %d dirty %d flags %d lnum %d\n", i, lp->free, lp->dirty, lp->flags, lp->lnum); } } @@ -1002,20 +923,20 @@ void ubifs_dump_tnc(struct ubifs_info *c) struct ubifs_znode *znode; int level; - printk(KERN_ERR "\n"); - printk(KERN_ERR "(pid %d) start dumping TNC tree\n", current->pid); + pr_err("\n"); + pr_err("(pid %d) start dumping TNC tree\n", current->pid); znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL); level = znode->level; - printk(KERN_ERR "== Level %d ==\n", level); + pr_err("== Level %d ==\n", level); while (znode) { if (level != znode->level) { level = znode->level; - printk(KERN_ERR "== Level %d ==\n", level); + pr_err("== Level %d ==\n", level); } ubifs_dump_znode(c, znode); znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode); } - printk(KERN_ERR "(pid %d) finish dumping TNC tree\n", current->pid); + pr_err("(pid %d) finish dumping TNC tree\n", current->pid); } static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode, @@ -1154,8 +1075,8 @@ int dbg_check_synced_i_size(const struct ubifs_info *c, struct inode *inode) mutex_lock(&ui->ui_mutex); spin_lock(&ui->ui_lock); if (ui->ui_size != ui->synced_i_size && !ui->dirty) { - ubifs_err("ui_size is %lld, synced_i_size is %lld, but inode " - "is clean", ui->ui_size, ui->synced_i_size); + ubifs_err("ui_size is %lld, synced_i_size is %lld, but inode is clean", + ui->ui_size, ui->synced_i_size); ubifs_err("i_ino %lu, i_mode %#x, i_size %lld", inode->i_ino, inode->i_mode, i_size_read(inode)); dump_stack(); @@ -1217,17 +1138,16 @@ int dbg_check_dir(struct ubifs_info *c, const struct inode *dir) kfree(pdent); if (i_size_read(dir) != size) { - ubifs_err("directory inode %lu has size %llu, " - "but calculated size is %llu", dir->i_ino, - (unsigned long long)i_size_read(dir), + ubifs_err("directory inode %lu has size %llu, but calculated size is %llu", + dir->i_ino, (unsigned long long)i_size_read(dir), (unsigned long long)size); ubifs_dump_inode(c, dir); dump_stack(); return -EINVAL; } if (dir->i_nlink != nlink) { - ubifs_err("directory inode %lu has nlink %u, but calculated " - "nlink is %u", dir->i_ino, dir->i_nlink, nlink); + ubifs_err("directory inode %lu has nlink %u, but calculated nlink is %u", + dir->i_ino, dir->i_nlink, nlink); ubifs_dump_inode(c, dir); dump_stack(); return -EINVAL; @@ -1686,8 +1606,8 @@ int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, if (znode_cb) { err = znode_cb(c, znode, priv); if (err) { - ubifs_err("znode checking function returned " - "error %d", err); + ubifs_err("znode checking function returned error %d", + err); ubifs_dump_znode(c, znode); goto out_dump; } @@ -1697,9 +1617,7 @@ int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, zbr = &znode->zbranch[idx]; err = leaf_cb(c, zbr, priv); if (err) { - ubifs_err("leaf checking function " - "returned error %d, for leaf " - "at LEB %d:%d", + ubifs_err("leaf checking function returned error %d, for leaf at LEB %d:%d", err, zbr->lnum, zbr->offs); goto out_dump; } @@ -1807,8 +1725,8 @@ int dbg_check_idx_size(struct ubifs_info *c, long long idx_size) } if (calc != idx_size) { - ubifs_err("index size check failed: calculated size is %lld, " - "should be %lld", calc, idx_size); + ubifs_err("index size check failed: calculated size is %lld, should be %lld", + calc, idx_size); dump_stack(); return -EINVAL; } @@ -2120,8 +2038,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr, fscki = read_add_inode(c, priv, inum); if (IS_ERR(fscki)) { err = PTR_ERR(fscki); - ubifs_err("error %d while processing data node and " - "trying to find inode node %lu", + ubifs_err("error %d while processing data node and trying to find inode node %lu", err, (unsigned long)inum); goto out_dump; } @@ -2131,9 +2048,8 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr, blk_offs <<= UBIFS_BLOCK_SHIFT; blk_offs += le32_to_cpu(dn->size); if (blk_offs > fscki->size) { - ubifs_err("data node at LEB %d:%d is not within inode " - "size %lld", zbr->lnum, zbr->offs, - fscki->size); + ubifs_err("data node at LEB %d:%d is not within inode size %lld", + zbr->lnum, zbr->offs, fscki->size); err = -EINVAL; goto out_dump; } @@ -2154,8 +2070,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr, fscki = read_add_inode(c, priv, inum); if (IS_ERR(fscki)) { err = PTR_ERR(fscki); - ubifs_err("error %d while processing entry node and " - "trying to find inode node %lu", + ubifs_err("error %d while processing entry node and trying to find inode node %lu", err, (unsigned long)inum); goto out_dump; } @@ -2167,8 +2082,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr, fscki1 = read_add_inode(c, priv, inum); if (IS_ERR(fscki1)) { err = PTR_ERR(fscki1); - ubifs_err("error %d while processing entry node and " - "trying to find parent inode node %lu", + ubifs_err("error %d while processing entry node and trying to find parent inode node %lu", err, (unsigned long)inum); goto out_dump; } @@ -2258,61 +2172,52 @@ static int check_inodes(struct ubifs_info *c, struct fsck_data *fsckd) */ if (fscki->inum != UBIFS_ROOT_INO && fscki->references != 1) { - ubifs_err("directory inode %lu has %d " - "direntries which refer it, but " - "should be 1", + ubifs_err("directory inode %lu has %d direntries which refer it, but should be 1", (unsigned long)fscki->inum, fscki->references); goto out_dump; } if (fscki->inum == UBIFS_ROOT_INO && fscki->references != 0) { - ubifs_err("root inode %lu has non-zero (%d) " - "direntries which refer it", + ubifs_err("root inode %lu has non-zero (%d) direntries which refer it", (unsigned long)fscki->inum, fscki->references); goto out_dump; } if (fscki->calc_sz != fscki->size) { - ubifs_err("directory inode %lu size is %lld, " - "but calculated size is %lld", + ubifs_err("directory inode %lu size is %lld, but calculated size is %lld", (unsigned long)fscki->inum, fscki->size, fscki->calc_sz); goto out_dump; } if (fscki->calc_cnt != fscki->nlink) { - ubifs_err("directory inode %lu nlink is %d, " - "but calculated nlink is %d", + ubifs_err("directory inode %lu nlink is %d, but calculated nlink is %d", (unsigned long)fscki->inum, fscki->nlink, fscki->calc_cnt); goto out_dump; } } else { if (fscki->references != fscki->nlink) { - ubifs_err("inode %lu nlink is %d, but " - "calculated nlink is %d", + ubifs_err("inode %lu nlink is %d, but calculated nlink is %d", (unsigned long)fscki->inum, fscki->nlink, fscki->references); goto out_dump; } } if (fscki->xattr_sz != fscki->calc_xsz) { - ubifs_err("inode %lu has xattr size %u, but " - "calculated size is %lld", + ubifs_err("inode %lu has xattr size %u, but calculated size is %lld", (unsigned long)fscki->inum, fscki->xattr_sz, fscki->calc_xsz); goto out_dump; } if (fscki->xattr_cnt != fscki->calc_xcnt) { - ubifs_err("inode %lu has %u xattrs, but " - "calculated count is %lld", + ubifs_err("inode %lu has %u xattrs, but calculated count is %lld", (unsigned long)fscki->inum, fscki->xattr_cnt, fscki->calc_xcnt); goto out_dump; } if (fscki->xattr_nms != fscki->calc_xnms) { - ubifs_err("inode %lu has xattr names' size %u, but " - "calculated names' size is %lld", + ubifs_err("inode %lu has xattr names' size %u, but calculated names' size is %lld", (unsigned long)fscki->inum, fscki->xattr_nms, fscki->calc_xnms); goto out_dump; @@ -2652,20 +2557,18 @@ static int power_cut_emulated(struct ubifs_info *c, int lnum, int write) return 1; } -static void cut_data(const void *buf, unsigned int len) +static int corrupt_data(const struct ubifs_info *c, const void *buf, + unsigned int len) { unsigned int from, to, i, ffs = chance(1, 2); unsigned char *p = (void *)buf; from = random32() % (len + 1); - if (chance(1, 2)) - to = random32() % (len - from + 1); - else - to = len; + /* Corruption may only span one max. write unit */ + to = min(len, ALIGN(from, c->max_write_size)); - if (from < to) - ubifs_warn("filled bytes %u-%u with %s", from, to - 1, - ffs ? "0xFFs" : "random data"); + ubifs_warn("filled bytes %u-%u with %s", from, to - 1, + ffs ? "0xFFs" : "random data"); if (ffs) for (i = from; i < to; i++) @@ -2673,6 +2576,8 @@ static void cut_data(const void *buf, unsigned int len) else for (i = from; i < to; i++) p[i] = random32() % 0x100; + + return to; } int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf, @@ -2685,7 +2590,9 @@ int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf, failing = power_cut_emulated(c, lnum, 1); if (failing) - cut_data(buf, len); + len = corrupt_data(c, buf, len); + ubifs_warn("actually write %d bytes to LEB %d:%d (the buffer was corrupted)", + len, lnum, offs); err = ubi_leb_write(c->ubi, lnum, buf, offs, len); if (err) return err; diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 760de72..e03d517 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -150,7 +150,7 @@ struct ubifs_global_debug_info { #define ubifs_assert(expr) do { \ if (unlikely(!(expr))) { \ - printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \ + pr_crit("UBIFS assert failed in %s at %u (pid %d)\n", \ __func__, __LINE__, current->pid); \ dump_stack(); \ } \ @@ -159,26 +159,23 @@ struct ubifs_global_debug_info { #define ubifs_assert_cmt_locked(c) do { \ if (unlikely(down_write_trylock(&(c)->commit_sem))) { \ up_write(&(c)->commit_sem); \ - printk(KERN_CRIT "commit lock is not locked!\n"); \ + pr_crit("commit lock is not locked!\n"); \ ubifs_assert(0); \ } \ } while (0) #define ubifs_dbg_msg(type, fmt, ...) \ - pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__) + pr_debug("UBIFS DBG " type " (pid %d): " fmt "\n", current->pid, \ + ##__VA_ARGS__) #define DBG_KEY_BUF_LEN 48 #define ubifs_dbg_msg_key(type, key, fmt, ...) do { \ char __tmp_key_buf[DBG_KEY_BUF_LEN]; \ - pr_debug("UBIFS DBG " type ": " fmt "%s\n", ##__VA_ARGS__, \ + pr_debug("UBIFS DBG " type " (pid %d): " fmt "%s\n", current->pid, \ + ##__VA_ARGS__, \ dbg_snprintf_key(c, key, __tmp_key_buf, DBG_KEY_BUF_LEN)); \ } while (0) -/* Just a debugging messages not related to any specific UBIFS subsystem */ -#define dbg_msg(fmt, ...) \ - printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid, \ - __func__, ##__VA_ARGS__) - /* General messages */ #define dbg_gen(fmt, ...) ubifs_dbg_msg("gen", fmt, ##__VA_ARGS__) /* Additional journal messages */ diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index c95681c..e271fba 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -980,8 +980,8 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, * separately. */ - dbg_gen("dent '%.*s' ino %lu in dir ino %lu to dent '%.*s' in " - "dir ino %lu", old_dentry->d_name.len, old_dentry->d_name.name, + dbg_gen("dent '%.*s' ino %lu in dir ino %lu to dent '%.*s' in dir ino %lu", + old_dentry->d_name.len, old_dentry->d_name.name, old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len, new_dentry->d_name.name, new_dir->i_ino); ubifs_assert(mutex_is_locked(&old_dir->i_mutex)); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 7bd6e72..5bc7781 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1486,8 +1486,8 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, err = ubifs_budget_space(c, &req); if (unlikely(err)) { if (err == -ENOSPC) - ubifs_warn("out of space for mmapped file " - "(inode number %lu)", inode->i_ino); + ubifs_warn("out of space for mmapped file (inode number %lu)", + inode->i_ino); return VM_FAULT_SIGBUS; } @@ -1536,6 +1536,7 @@ out_unlock: static const struct vm_operations_struct ubifs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = ubifs_vm_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index 04dd6f4..76ca53c 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -714,9 +714,9 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway) break; } - dbg_gc("found LEB %d: free %d, dirty %d, sum %d " - "(min. space %d)", lp.lnum, lp.free, lp.dirty, - lp.free + lp.dirty, min_space); + dbg_gc("found LEB %d: free %d, dirty %d, sum %d (min. space %d)", + lp.lnum, lp.free, lp.dirty, lp.free + lp.dirty, + min_space); space_before = c->leb_size - wbuf->offs - wbuf->used; if (wbuf->lnum == -1) diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 12c0f15..afaad07 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -469,8 +469,8 @@ static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino, ino->ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); ino->mtime_sec = cpu_to_le64(inode->i_mtime.tv_sec); ino->mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); - ino->uid = cpu_to_le32(inode->i_uid); - ino->gid = cpu_to_le32(inode->i_gid); + ino->uid = cpu_to_le32(i_uid_read(inode)); + ino->gid = cpu_to_le32(i_gid_read(inode)); ino->mode = cpu_to_le32(inode->i_mode); ino->flags = cpu_to_le32(ui->flags); ino->size = cpu_to_le64(ui->ui_size); diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c index c80b15d..36bd4ef 100644 --- a/fs/ubifs/log.c +++ b/fs/ubifs/log.c @@ -315,17 +315,15 @@ static void remove_buds(struct ubifs_info *c) * heads (non-closed buds). */ c->cmt_bud_bytes += wbuf->offs - bud->start; - dbg_log("preserve %d:%d, jhead %s, bud bytes %d, " - "cmt_bud_bytes %lld", bud->lnum, bud->start, - dbg_jhead(bud->jhead), wbuf->offs - bud->start, - c->cmt_bud_bytes); + dbg_log("preserve %d:%d, jhead %s, bud bytes %d, cmt_bud_bytes %lld", + bud->lnum, bud->start, dbg_jhead(bud->jhead), + wbuf->offs - bud->start, c->cmt_bud_bytes); bud->start = wbuf->offs; } else { c->cmt_bud_bytes += c->leb_size - bud->start; - dbg_log("remove %d:%d, jhead %s, bud bytes %d, " - "cmt_bud_bytes %lld", bud->lnum, bud->start, - dbg_jhead(bud->jhead), c->leb_size - bud->start, - c->cmt_bud_bytes); + dbg_log("remove %d:%d, jhead %s, bud bytes %d, cmt_bud_bytes %lld", + bud->lnum, bud->start, dbg_jhead(bud->jhead), + c->leb_size - bud->start, c->cmt_bud_bytes); rb_erase(p1, &c->buds); /* * If the commit does not finish, the recovery will need diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index 86eb8e5..e5a2a35 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -867,15 +867,15 @@ int dbg_check_cats(struct ubifs_info *c) list_for_each_entry(lprops, &c->empty_list, list) { if (lprops->free != c->leb_size) { - ubifs_err("non-empty LEB %d on empty list " - "(free %d dirty %d flags %d)", lprops->lnum, - lprops->free, lprops->dirty, lprops->flags); + ubifs_err("non-empty LEB %d on empty list (free %d dirty %d flags %d)", + lprops->lnum, lprops->free, lprops->dirty, + lprops->flags); return -EINVAL; } if (lprops->flags & LPROPS_TAKEN) { - ubifs_err("taken LEB %d on empty list " - "(free %d dirty %d flags %d)", lprops->lnum, - lprops->free, lprops->dirty, lprops->flags); + ubifs_err("taken LEB %d on empty list (free %d dirty %d flags %d)", + lprops->lnum, lprops->free, lprops->dirty, + lprops->flags); return -EINVAL; } } @@ -883,15 +883,15 @@ int dbg_check_cats(struct ubifs_info *c) i = 0; list_for_each_entry(lprops, &c->freeable_list, list) { if (lprops->free + lprops->dirty != c->leb_size) { - ubifs_err("non-freeable LEB %d on freeable list " - "(free %d dirty %d flags %d)", lprops->lnum, - lprops->free, lprops->dirty, lprops->flags); + ubifs_err("non-freeable LEB %d on freeable list (free %d dirty %d flags %d)", + lprops->lnum, lprops->free, lprops->dirty, + lprops->flags); return -EINVAL; } if (lprops->flags & LPROPS_TAKEN) { - ubifs_err("taken LEB %d on freeable list " - "(free %d dirty %d flags %d)", lprops->lnum, - lprops->free, lprops->dirty, lprops->flags); + ubifs_err("taken LEB %d on freeable list (free %d dirty %d flags %d)", + lprops->lnum, lprops->free, lprops->dirty, + lprops->flags); return -EINVAL; } i += 1; @@ -913,21 +913,21 @@ int dbg_check_cats(struct ubifs_info *c) list_for_each_entry(lprops, &c->frdi_idx_list, list) { if (lprops->free + lprops->dirty != c->leb_size) { - ubifs_err("non-freeable LEB %d on frdi_idx list " - "(free %d dirty %d flags %d)", lprops->lnum, - lprops->free, lprops->dirty, lprops->flags); + ubifs_err("non-freeable LEB %d on frdi_idx list (free %d dirty %d flags %d)", + lprops->lnum, lprops->free, lprops->dirty, + lprops->flags); return -EINVAL; } if (lprops->flags & LPROPS_TAKEN) { - ubifs_err("taken LEB %d on frdi_idx list " - "(free %d dirty %d flags %d)", lprops->lnum, - lprops->free, lprops->dirty, lprops->flags); + ubifs_err("taken LEB %d on frdi_idx list (free %d dirty %d flags %d)", + lprops->lnum, lprops->free, lprops->dirty, + lprops->flags); return -EINVAL; } if (!(lprops->flags & LPROPS_INDEX)) { - ubifs_err("non-index LEB %d on frdi_idx list " - "(free %d dirty %d flags %d)", lprops->lnum, - lprops->free, lprops->dirty, lprops->flags); + ubifs_err("non-index LEB %d on frdi_idx list (free %d dirty %d flags %d)", + lprops->lnum, lprops->free, lprops->dirty, + lprops->flags); return -EINVAL; } } @@ -982,9 +982,9 @@ void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, goto out; } if (lprops != lp) { - dbg_msg("lprops %zx lp %zx lprops->lnum %d lp->lnum %d", - (size_t)lprops, (size_t)lp, lprops->lnum, - lp->lnum); + ubifs_err("lprops %zx lp %zx lprops->lnum %d lp->lnum %d", + (size_t)lprops, (size_t)lp, lprops->lnum, + lp->lnum); err = 4; goto out; } @@ -1002,7 +1002,7 @@ void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, } out: if (err) { - dbg_msg("failed cat %d hpos %d err %d", cat, i, err); + ubifs_err("failed cat %d hpos %d err %d", cat, i, err); dump_stack(); ubifs_dump_heap(c, heap, cat); } @@ -1153,8 +1153,8 @@ static int scan_check_cb(struct ubifs_info *c, if (free > c->leb_size || free < 0 || dirty > c->leb_size || dirty < 0) { - ubifs_err("bad calculated accounting for LEB %d: " - "free %d, dirty %d", lnum, free, dirty); + ubifs_err("bad calculated accounting for LEB %d: free %d, dirty %d", + lnum, free, dirty); goto out_destroy; } @@ -1200,8 +1200,7 @@ static int scan_check_cb(struct ubifs_info *c, /* Free but not unmapped LEB, it's fine */ is_idx = 0; else { - ubifs_err("indexing node without indexing " - "flag"); + ubifs_err("indexing node without indexing flag"); goto out_print; } } @@ -1236,8 +1235,7 @@ static int scan_check_cb(struct ubifs_info *c, return LPT_SCAN_CONTINUE; out_print: - ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, " - "should be free %d, dirty %d", + ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, should be free %d, dirty %d", lnum, lp->free, lp->dirty, lp->flags, free, dirty); ubifs_dump_leb(c, lnum); out_destroy: @@ -1290,12 +1288,10 @@ int dbg_check_lprops(struct ubifs_info *c) lst.total_dirty != c->lst.total_dirty || lst.total_used != c->lst.total_used) { ubifs_err("bad overall accounting"); - ubifs_err("calculated: empty_lebs %d, idx_lebs %d, " - "total_free %lld, total_dirty %lld, total_used %lld", + ubifs_err("calculated: empty_lebs %d, idx_lebs %d, total_free %lld, total_dirty %lld, total_used %lld", lst.empty_lebs, lst.idx_lebs, lst.total_free, lst.total_dirty, lst.total_used); - ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, " - "total_free %lld, total_dirty %lld, total_used %lld", + ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, total_free %lld, total_dirty %lld, total_used %lld", c->lst.empty_lebs, c->lst.idx_lebs, c->lst.total_free, c->lst.total_dirty, c->lst.total_used); err = -EINVAL; diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c index 8640920..d46b19e 100644 --- a/fs/ubifs/lpt.c +++ b/fs/ubifs/lpt.c @@ -1311,7 +1311,7 @@ out: ubifs_err("error %d reading pnode at %d:%d", err, lnum, offs); ubifs_dump_pnode(c, pnode, parent, iip); dump_stack(); - dbg_msg("calc num: %d", calc_pnode_num_from_parent(c, parent, iip)); + ubifs_err("calc num: %d", calc_pnode_num_from_parent(c, parent, iip)); kfree(pnode); return err; } @@ -2237,8 +2237,7 @@ int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, /* cnode is a nnode */ num = calc_nnode_num(row, col); if (cnode->num != num) { - ubifs_err("nnode num %d expected %d " - "parent num %d iip %d", + ubifs_err("nnode num %d expected %d parent num %d iip %d", cnode->num, num, (nnode ? nnode->num : 0), cnode->iip); return -EINVAL; diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 4fa7073..9daaeef 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -320,8 +320,8 @@ static int layout_cnodes(struct ubifs_info *c) return 0; no_space: - ubifs_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, " - "done_lsave %d", lnum, offs, len, done_ltab, done_lsave); + ubifs_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, done_lsave %d", + lnum, offs, len, done_ltab, done_lsave); ubifs_dump_lpt_info(c); ubifs_dump_lpt_lebs(c); dump_stack(); @@ -545,8 +545,8 @@ static int write_cnodes(struct ubifs_info *c) return 0; no_space: - ubifs_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab " - "%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave); + ubifs_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab %d, done_lsave %d", + lnum, offs, len, done_ltab, done_lsave); ubifs_dump_lpt_info(c); ubifs_dump_lpt_lebs(c); dump_stack(); @@ -1662,21 +1662,19 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum) continue; } if (!dbg_is_all_ff(p, len)) { - dbg_msg("invalid empty space in LEB %d at %d", - lnum, c->leb_size - len); + ubifs_err("invalid empty space in LEB %d at %d", + lnum, c->leb_size - len); err = -EINVAL; } i = lnum - c->lpt_first; if (len != c->ltab[i].free) { - dbg_msg("invalid free space in LEB %d " - "(free %d, expected %d)", - lnum, len, c->ltab[i].free); + ubifs_err("invalid free space in LEB %d (free %d, expected %d)", + lnum, len, c->ltab[i].free); err = -EINVAL; } if (dirty != c->ltab[i].dirty) { - dbg_msg("invalid dirty space in LEB %d " - "(dirty %d, expected %d)", - lnum, dirty, c->ltab[i].dirty); + ubifs_err("invalid dirty space in LEB %d (dirty %d, expected %d)", + lnum, dirty, c->ltab[i].dirty); err = -EINVAL; } goto out; @@ -1888,8 +1886,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum) int err, len = c->leb_size, node_type, node_num, node_len, offs; void *buf, *p; - printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n", - current->pid, lnum); + pr_err("(pid %d) start dumping LEB %d\n", current->pid, lnum); buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); if (!buf) { ubifs_err("cannot allocate memory to dump LPT"); @@ -1907,14 +1904,14 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum) pad_len = get_pad_len(c, p, len); if (pad_len) { - printk(KERN_DEBUG "LEB %d:%d, pad %d bytes\n", + pr_err("LEB %d:%d, pad %d bytes\n", lnum, offs, pad_len); p += pad_len; len -= pad_len; continue; } if (len) - printk(KERN_DEBUG "LEB %d:%d, free %d bytes\n", + pr_err("LEB %d:%d, free %d bytes\n", lnum, offs, len); break; } @@ -1925,11 +1922,10 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum) { node_len = c->pnode_sz; if (c->big_lpt) - printk(KERN_DEBUG "LEB %d:%d, pnode num %d\n", + pr_err("LEB %d:%d, pnode num %d\n", lnum, offs, node_num); else - printk(KERN_DEBUG "LEB %d:%d, pnode\n", - lnum, offs); + pr_err("LEB %d:%d, pnode\n", lnum, offs); break; } case UBIFS_LPT_NNODE: @@ -1939,29 +1935,28 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum) node_len = c->nnode_sz; if (c->big_lpt) - printk(KERN_DEBUG "LEB %d:%d, nnode num %d, ", + pr_err("LEB %d:%d, nnode num %d, ", lnum, offs, node_num); else - printk(KERN_DEBUG "LEB %d:%d, nnode, ", + pr_err("LEB %d:%d, nnode, ", lnum, offs); err = ubifs_unpack_nnode(c, p, &nnode); for (i = 0; i < UBIFS_LPT_FANOUT; i++) { - printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum, + pr_cont("%d:%d", nnode.nbranch[i].lnum, nnode.nbranch[i].offs); if (i != UBIFS_LPT_FANOUT - 1) - printk(KERN_CONT ", "); + pr_cont(", "); } - printk(KERN_CONT "\n"); + pr_cont("\n"); break; } case UBIFS_LPT_LTAB: node_len = c->ltab_sz; - printk(KERN_DEBUG "LEB %d:%d, ltab\n", - lnum, offs); + pr_err("LEB %d:%d, ltab\n", lnum, offs); break; case UBIFS_LPT_LSAVE: node_len = c->lsave_sz; - printk(KERN_DEBUG "LEB %d:%d, lsave len\n", lnum, offs); + pr_err("LEB %d:%d, lsave len\n", lnum, offs); break; default: ubifs_err("LPT node type %d not recognized", node_type); @@ -1972,8 +1967,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum) len -= node_len; } - printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n", - current->pid, lnum); + pr_err("(pid %d) finish dumping LEB %d\n", current->pid, lnum); out: vfree(buf); return; @@ -1990,12 +1984,10 @@ void ubifs_dump_lpt_lebs(const struct ubifs_info *c) { int i; - printk(KERN_DEBUG "(pid %d) start dumping all LPT LEBs\n", - current->pid); + pr_err("(pid %d) start dumping all LPT LEBs\n", current->pid); for (i = 0; i < c->lpt_lebs; i++) dump_lpt_leb(c, i + c->lpt_first); - printk(KERN_DEBUG "(pid %d) finish dumping all LPT LEBs\n", - current->pid); + pr_err("(pid %d) finish dumping all LPT LEBs\n", current->pid); } /** diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index cebf17e..769701c 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -562,8 +562,8 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, list_for_each_entry(snod, &sleb->nodes, list) { if (snod->type != UBIFS_ORPH_NODE) { - ubifs_err("invalid node type %d in orphan area at " - "%d:%d", snod->type, sleb->lnum, snod->offs); + ubifs_err("invalid node type %d in orphan area at %d:%d", + snod->type, sleb->lnum, snod->offs); ubifs_dump_node(c, snod->node); return -EINVAL; } @@ -589,8 +589,7 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, * number. That makes this orphan node, out of date. */ if (!first) { - ubifs_err("out of order commit number %llu in " - "orphan node at %d:%d", + ubifs_err("out of order commit number %llu in orphan node at %d:%d", cmt_no, sleb->lnum, snod->offs); ubifs_dump_node(c, snod->node); return -EINVAL; diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index edeec49..065096e 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -609,7 +609,8 @@ static void drop_last_node(struct ubifs_scan_leb *sleb, int *offs) snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, list); - dbg_rcvry("dropping last node at %d:%d", sleb->lnum, snod->offs); + dbg_rcvry("dropping last node at %d:%d", + sleb->lnum, snod->offs); *offs = snod->offs; list_del(&snod->list); kfree(snod); @@ -702,8 +703,8 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, * See header comment for this file for more * explanations about the reasons we have this check. */ - ubifs_err("corrupt empty space LEB %d:%d, corruption " - "starts at %d", lnum, offs, corruption); + ubifs_err("corrupt empty space LEB %d:%d, corruption starts at %d", + lnum, offs, corruption); /* Make sure we dump interesting non-0xFF data */ offs += corruption; buf += corruption; @@ -899,8 +900,8 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum, } } if (snod->sqnum > cs_sqnum) { - ubifs_err("unrecoverable log corruption " - "in LEB %d", lnum); + ubifs_err("unrecoverable log corruption in LEB %d", + lnum); ubifs_scan_destroy(sleb); return ERR_PTR(-EUCLEAN); } diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index 94d78fc..3187925 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -141,9 +141,9 @@ static int set_bud_lprops(struct ubifs_info *c, struct bud_entry *b) * during the replay. */ if (dirty != 0) - dbg_msg("LEB %d lp: %d free %d dirty " - "replay: %d free %d dirty", b->bud->lnum, - lp->free, lp->dirty, b->free, b->dirty); + dbg_mnt("LEB %d lp: %d free %d dirty replay: %d free %d dirty", + b->bud->lnum, lp->free, lp->dirty, b->free, + b->dirty); } lp = ubifs_change_lp(c, lp, b->free, dirty + b->dirty, lp->flags | LPROPS_TAKEN, 0); @@ -677,7 +677,8 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b) b->dirty = sleb->endpt - offs - used; b->free = c->leb_size - sleb->endpt; - dbg_mnt("bud LEB %d replied: dirty %d, free %d", lnum, b->dirty, b->free); + dbg_mnt("bud LEB %d replied: dirty %d, free %d", + lnum, b->dirty, b->free); out: ubifs_scan_destroy(sleb); @@ -865,8 +866,7 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf) goto out_dump; } if (le64_to_cpu(node->cmt_no) != c->cmt_no) { - ubifs_err("first CS node at LEB %d:%d has wrong " - "commit number %llu expected %llu", + ubifs_err("first CS node at LEB %d:%d has wrong commit number %llu expected %llu", lnum, offs, (unsigned long long)le64_to_cpu(node->cmt_no), c->cmt_no); @@ -1058,8 +1058,8 @@ int ubifs_replay_journal(struct ubifs_info *c) c->bi.uncommitted_idx *= c->max_idx_node_sz; ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery); - dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, " - "highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum, + dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, highest_inum %lu", + c->lhead_lnum, c->lhead_offs, c->max_sqnum, (unsigned long)c->highest_inum); out: destroy_replay_list(c); diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 15e2fc5..4c37607 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -391,9 +391,8 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup) min_leb_cnt += c->lpt_lebs + c->orph_lebs + c->jhead_cnt + 6; if (c->leb_cnt < min_leb_cnt || c->leb_cnt > c->vi.size) { - ubifs_err("bad LEB count: %d in superblock, %d on UBI volume, " - "%d minimum required", c->leb_cnt, c->vi.size, - min_leb_cnt); + ubifs_err("bad LEB count: %d in superblock, %d on UBI volume, %d minimum required", + c->leb_cnt, c->vi.size, min_leb_cnt); goto failed; } @@ -411,15 +410,14 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup) max_bytes = (long long)c->leb_size * UBIFS_MIN_BUD_LEBS; if (c->max_bud_bytes < max_bytes) { - ubifs_err("too small journal (%lld bytes), must be at least " - "%lld bytes", c->max_bud_bytes, max_bytes); + ubifs_err("too small journal (%lld bytes), must be at least %lld bytes", + c->max_bud_bytes, max_bytes); goto failed; } max_bytes = (long long)c->leb_size * c->main_lebs; if (c->max_bud_bytes > max_bytes) { - ubifs_err("too large journal size (%lld bytes), only %lld bytes" - "available in the main area", + ubifs_err("too large journal size (%lld bytes), only %lld bytes available in the main area", c->max_bud_bytes, max_bytes); goto failed; } @@ -549,10 +547,9 @@ int ubifs_read_superblock(struct ubifs_info *c) ubifs_assert(!c->ro_media || c->ro_mount); if (!c->ro_mount || c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) { - ubifs_err("on-flash format version is w%d/r%d, but " - "software only supports up to version " - "w%d/r%d", c->fmt_version, - c->ro_compat_version, UBIFS_FORMAT_VERSION, + ubifs_err("on-flash format version is w%d/r%d, but software only supports up to version w%d/r%d", + c->fmt_version, c->ro_compat_version, + UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION); if (c->ro_compat_version <= UBIFS_RO_COMPAT_VERSION) { ubifs_msg("only R/O mounting is possible"); @@ -611,8 +608,8 @@ int ubifs_read_superblock(struct ubifs_info *c) c->fanout = le32_to_cpu(sup->fanout); c->lsave_cnt = le32_to_cpu(sup->lsave_cnt); c->rp_size = le64_to_cpu(sup->rp_size); - c->rp_uid = le32_to_cpu(sup->rp_uid); - c->rp_gid = le32_to_cpu(sup->rp_gid); + c->rp_uid = make_kuid(&init_user_ns, le32_to_cpu(sup->rp_uid)); + c->rp_gid = make_kgid(&init_user_ns, le32_to_cpu(sup->rp_gid)); sup_flags = le32_to_cpu(sup->flags); if (!c->mount_opts.override_compr) c->default_compr = le16_to_cpu(sup->default_compr); diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c index 7c40e60..58aa05d 100644 --- a/fs/ubifs/scan.c +++ b/fs/ubifs/scan.c @@ -75,7 +75,7 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, magic = le32_to_cpu(ch->magic); if (magic == 0xFFFFFFFF) { - dbg_scan("hit empty space"); + dbg_scan("hit empty space at LEB %d:%d", lnum, offs); return SCANNED_EMPTY_SPACE; } @@ -85,7 +85,8 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, if (len < UBIFS_CH_SZ) return SCANNED_GARBAGE; - dbg_scan("scanning %s", dbg_ntype(ch->node_type)); + dbg_scan("scanning %s at LEB %d:%d", + dbg_ntype(ch->node_type), lnum, offs); if (ubifs_check_node(c, buf, lnum, offs, quiet, 1)) return SCANNED_A_CORRUPT_NODE; @@ -114,8 +115,8 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, return SCANNED_A_BAD_PAD_NODE; } - dbg_scan("%d bytes padded, offset now %d", - pad_len, ALIGN(offs + node_len + pad_len, 8)); + dbg_scan("%d bytes padded at LEB %d:%d, offset now %d", pad_len, + lnum, offs, ALIGN(offs + node_len + pad_len, 8)); return node_len + pad_len; } @@ -150,8 +151,8 @@ struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum, err = ubifs_leb_read(c, lnum, sbuf + offs, offs, c->leb_size - offs, 0); if (err && err != -EBADMSG) { - ubifs_err("cannot read %d bytes from LEB %d:%d," - " error %d", c->leb_size - offs, lnum, offs, err); + ubifs_err("cannot read %d bytes from LEB %d:%d, error %d", + c->leb_size - offs, lnum, offs, err); kfree(sleb); return ERR_PTR(err); } @@ -240,8 +241,6 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs, int len; ubifs_err("corruption at LEB %d:%d", lnum, offs); - if (dbg_is_tst_rcvry(c)) - return; len = c->leb_size - offs; if (len > 8192) len = 8192; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 71a197f..ddc0f6a 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -89,9 +89,8 @@ static int validate_inode(struct ubifs_info *c, const struct inode *inode) return 5; if (!ubifs_compr_present(ui->compr_type)) { - ubifs_warn("inode %lu uses '%s' compression, but it was not " - "compiled in", inode->i_ino, - ubifs_compr_name(ui->compr_type)); + ubifs_warn("inode %lu uses '%s' compression, but it was not compiled in", + inode->i_ino, ubifs_compr_name(ui->compr_type)); } err = dbg_check_dir(c, inode); @@ -130,8 +129,8 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) inode->i_flags |= (S_NOCMTIME | S_NOATIME); set_nlink(inode, le32_to_cpu(ino->nlink)); - inode->i_uid = le32_to_cpu(ino->uid); - inode->i_gid = le32_to_cpu(ino->gid); + i_uid_write(inode, le32_to_cpu(ino->uid)); + i_gid_write(inode, le32_to_cpu(ino->gid)); inode->i_atime.tv_sec = (int64_t)le64_to_cpu(ino->atime_sec); inode->i_atime.tv_nsec = le32_to_cpu(ino->atime_nsec); inode->i_mtime.tv_sec = (int64_t)le64_to_cpu(ino->mtime_sec); @@ -1061,8 +1060,8 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options, flag = parse_standard_option(p); if (!flag) { - ubifs_err("unrecognized mount option \"%s\" " - "or missing value", p); + ubifs_err("unrecognized mount option \"%s\" or missing value", + p); return -EINVAL; } sb->s_flags |= flag; @@ -1124,8 +1123,8 @@ again: } /* Just disable bulk-read */ - ubifs_warn("Cannot allocate %d bytes of memory for bulk-read, " - "disabling it", c->max_bu_buf_len); + ubifs_warn("cannot allocate %d bytes of memory for bulk-read, disabling it", + c->max_bu_buf_len); c->mount_opts.bulk_read = 1; c->bulk_read = 0; return; @@ -1161,7 +1160,7 @@ static int check_free_space(struct ubifs_info *c) static int mount_ubifs(struct ubifs_info *c) { int err; - long long x; + long long x, y; size_t sz; c->ro_mount = !!(c->vfs_sb->s_flags & MS_RDONLY); @@ -1411,75 +1410,69 @@ static int mount_ubifs(struct ubifs_info *c) c->mounting = 0; - ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"", - c->vi.ubi_num, c->vi.vol_id, c->vi.name); - if (c->ro_mount) - ubifs_msg("mounted read-only"); + ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"%s", + c->vi.ubi_num, c->vi.vol_id, c->vi.name, + c->ro_mount ? ", R/O mode" : NULL); x = (long long)c->main_lebs * c->leb_size; - ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d " - "LEBs)", x, x >> 10, x >> 20, c->main_lebs); - x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; - ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d " - "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt); - ubifs_msg("media format: w%d/r%d (latest is w%d/r%d)", + y = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; + ubifs_msg("LEB size: %d bytes (%d KiB), min./max. I/O unit sizes: %d bytes/%d bytes", + c->leb_size, c->leb_size >> 10, c->min_io_size, + c->max_write_size); + ubifs_msg("FS size: %lld bytes (%lld MiB, %d LEBs), journal size %lld bytes (%lld MiB, %d LEBs)", + x, x >> 20, c->main_lebs, + y, y >> 20, c->log_lebs + c->max_bud_cnt); + ubifs_msg("reserved for root: %llu bytes (%llu KiB)", + c->report_rp_size, c->report_rp_size >> 10); + ubifs_msg("media format: w%d/r%d (latest is w%d/r%d), UUID %pUB%s", c->fmt_version, c->ro_compat_version, - UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION); - ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr)); - ubifs_msg("reserved for root: %llu bytes (%llu KiB)", - c->report_rp_size, c->report_rp_size >> 10); - - dbg_msg("compiled on: " __DATE__ " at " __TIME__); - dbg_msg("min. I/O unit size: %d bytes", c->min_io_size); - dbg_msg("max. write size: %d bytes", c->max_write_size); - dbg_msg("LEB size: %d bytes (%d KiB)", - c->leb_size, c->leb_size >> 10); - dbg_msg("data journal heads: %d", + UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION, c->uuid, + c->big_lpt ? ", big LPT model" : ", small LPT model"); + + dbg_gen("default compressor: %s", ubifs_compr_name(c->default_compr)); + dbg_gen("data journal heads: %d", c->jhead_cnt - NONDATA_JHEADS_CNT); - dbg_msg("UUID: %pUB", c->uuid); - dbg_msg("big_lpt %d", c->big_lpt); - dbg_msg("log LEBs: %d (%d - %d)", + dbg_gen("log LEBs: %d (%d - %d)", c->log_lebs, UBIFS_LOG_LNUM, c->log_last); - dbg_msg("LPT area LEBs: %d (%d - %d)", + dbg_gen("LPT area LEBs: %d (%d - %d)", c->lpt_lebs, c->lpt_first, c->lpt_last); - dbg_msg("orphan area LEBs: %d (%d - %d)", + dbg_gen("orphan area LEBs: %d (%d - %d)", c->orph_lebs, c->orph_first, c->orph_last); - dbg_msg("main area LEBs: %d (%d - %d)", + dbg_gen("main area LEBs: %d (%d - %d)", c->main_lebs, c->main_first, c->leb_cnt - 1); - dbg_msg("index LEBs: %d", c->lst.idx_lebs); - dbg_msg("total index bytes: %lld (%lld KiB, %lld MiB)", + dbg_gen("index LEBs: %d", c->lst.idx_lebs); + dbg_gen("total index bytes: %lld (%lld KiB, %lld MiB)", c->bi.old_idx_sz, c->bi.old_idx_sz >> 10, c->bi.old_idx_sz >> 20); - dbg_msg("key hash type: %d", c->key_hash_type); - dbg_msg("tree fanout: %d", c->fanout); - dbg_msg("reserved GC LEB: %d", c->gc_lnum); - dbg_msg("first main LEB: %d", c->main_first); - dbg_msg("max. znode size %d", c->max_znode_sz); - dbg_msg("max. index node size %d", c->max_idx_node_sz); - dbg_msg("node sizes: data %zu, inode %zu, dentry %zu", + dbg_gen("key hash type: %d", c->key_hash_type); + dbg_gen("tree fanout: %d", c->fanout); + dbg_gen("reserved GC LEB: %d", c->gc_lnum); + dbg_gen("max. znode size %d", c->max_znode_sz); + dbg_gen("max. index node size %d", c->max_idx_node_sz); + dbg_gen("node sizes: data %zu, inode %zu, dentry %zu", UBIFS_DATA_NODE_SZ, UBIFS_INO_NODE_SZ, UBIFS_DENT_NODE_SZ); - dbg_msg("node sizes: trun %zu, sb %zu, master %zu", + dbg_gen("node sizes: trun %zu, sb %zu, master %zu", UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ); - dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu", + dbg_gen("node sizes: ref %zu, cmt. start %zu, orph %zu", UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ); - dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu, idx %d", + dbg_gen("max. node sizes: data %zu, inode %zu dentry %zu, idx %d", UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ, UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout)); - dbg_msg("dead watermark: %d", c->dead_wm); - dbg_msg("dark watermark: %d", c->dark_wm); - dbg_msg("LEB overhead: %d", c->leb_overhead); + dbg_gen("dead watermark: %d", c->dead_wm); + dbg_gen("dark watermark: %d", c->dark_wm); + dbg_gen("LEB overhead: %d", c->leb_overhead); x = (long long)c->main_lebs * c->dark_wm; - dbg_msg("max. dark space: %lld (%lld KiB, %lld MiB)", + dbg_gen("max. dark space: %lld (%lld KiB, %lld MiB)", x, x >> 10, x >> 20); - dbg_msg("maximum bud bytes: %lld (%lld KiB, %lld MiB)", + dbg_gen("maximum bud bytes: %lld (%lld KiB, %lld MiB)", c->max_bud_bytes, c->max_bud_bytes >> 10, c->max_bud_bytes >> 20); - dbg_msg("BG commit bud bytes: %lld (%lld KiB, %lld MiB)", + dbg_gen("BG commit bud bytes: %lld (%lld KiB, %lld MiB)", c->bg_bud_bytes, c->bg_bud_bytes >> 10, c->bg_bud_bytes >> 20); - dbg_msg("current bud bytes %lld (%lld KiB, %lld MiB)", + dbg_gen("current bud bytes %lld (%lld KiB, %lld MiB)", c->bud_bytes, c->bud_bytes >> 10, c->bud_bytes >> 20); - dbg_msg("max. seq. number: %llu", c->max_sqnum); - dbg_msg("commit number: %llu", c->cmt_no); + dbg_gen("max. seq. number: %llu", c->max_sqnum); + dbg_gen("commit number: %llu", c->cmt_no); return 0; @@ -1564,10 +1557,9 @@ static int ubifs_remount_rw(struct ubifs_info *c) if (c->rw_incompat) { ubifs_err("the file-system is not R/W-compatible"); - ubifs_msg("on-flash format version is w%d/r%d, but software " - "only supports up to version w%d/r%d", c->fmt_version, - c->ro_compat_version, UBIFS_FORMAT_VERSION, - UBIFS_RO_COMPAT_VERSION); + ubifs_msg("on-flash format version is w%d/r%d, but software only supports up to version w%d/r%d", + c->fmt_version, c->ro_compat_version, + UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION); return -EROFS; } @@ -1828,8 +1820,8 @@ static void ubifs_put_super(struct super_block *sb) * next mount, so we just print a message and * continue to unmount normally. */ - ubifs_err("failed to write master node, " - "error %d", err); + ubifs_err("failed to write master node, error %d", + err); } else { for (i = 0; i < c->jhead_cnt; i++) /* Make sure write-buffer timers are canceled */ @@ -2248,8 +2240,7 @@ static int __init ubifs_init(void) * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2. */ if (PAGE_CACHE_SIZE < UBIFS_BLOCK_SIZE) { - ubifs_err("VFS page cache size is %u bytes, but UBIFS requires" - " at least 4096 bytes", + ubifs_err("VFS page cache size is %u bytes, but UBIFS requires at least 4096 bytes", (unsigned int)PAGE_CACHE_SIZE); return -EINVAL; } @@ -2298,6 +2289,12 @@ static void __exit ubifs_exit(void) dbg_debugfs_exit(); ubifs_compressors_exit(); unregister_shrinker(&ubifs_shrinker_info); + + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(ubifs_inode_slab); unregister_filesystem(&ubifs_fs_type); } diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c index d38ac7f..f6bf899 100644 --- a/fs/ubifs/tnc_misc.c +++ b/fs/ubifs/tnc_misc.c @@ -328,8 +328,8 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, case UBIFS_XENT_KEY: break; default: - dbg_msg("bad key type at slot %d: %d", - i, key_type(c, &zbr->key)); + ubifs_err("bad key type at slot %d: %d", + i, key_type(c, &zbr->key)); err = 3; goto out_dump; } diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 1e5a086..5486346 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -42,16 +42,15 @@ #define UBIFS_VERSION 1 /* Normal UBIFS messages */ -#define ubifs_msg(fmt, ...) \ - printk(KERN_NOTICE "UBIFS: " fmt "\n", ##__VA_ARGS__) +#define ubifs_msg(fmt, ...) pr_notice("UBIFS: " fmt "\n", ##__VA_ARGS__) /* UBIFS error messages */ -#define ubifs_err(fmt, ...) \ - printk(KERN_ERR "UBIFS error (pid %d): %s: " fmt "\n", current->pid, \ +#define ubifs_err(fmt, ...) \ + pr_err("UBIFS error (pid %d): %s: " fmt "\n", current->pid, \ __func__, ##__VA_ARGS__) /* UBIFS warning messages */ -#define ubifs_warn(fmt, ...) \ - printk(KERN_WARNING "UBIFS warning (pid %d): %s: " fmt "\n", \ - current->pid, __func__, ##__VA_ARGS__) +#define ubifs_warn(fmt, ...) \ + pr_warn("UBIFS warning (pid %d): %s: " fmt "\n", \ + current->pid, __func__, ##__VA_ARGS__) /* UBIFS file system VFS magic number */ #define UBIFS_SUPER_MAGIC 0x24051905 @@ -1426,8 +1425,8 @@ struct ubifs_info { long long rp_size; long long report_rp_size; - uid_t rp_uid; - gid_t rp_gid; + kuid_t rp_uid; + kgid_t rp_gid; /* The below fields are used only during mounting and re-mounting */ unsigned int empty:1; diff --git a/fs/udf/file.c b/fs/udf/file.c index 7f3f7ba..77b5953 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -39,20 +39,24 @@ #include "udf_i.h" #include "udf_sb.h" -static int udf_adinicb_readpage(struct file *file, struct page *page) +static void __udf_adinicb_readpage(struct page *page) { struct inode *inode = page->mapping->host; char *kaddr; struct udf_inode_info *iinfo = UDF_I(inode); - BUG_ON(!PageLocked(page)); - kaddr = kmap(page); - memset(kaddr, 0, PAGE_CACHE_SIZE); memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, inode->i_size); + memset(kaddr + inode->i_size, 0, PAGE_CACHE_SIZE - inode->i_size); flush_dcache_page(page); SetPageUptodate(page); kunmap(page); +} + +static int udf_adinicb_readpage(struct file *file, struct page *page) +{ + BUG_ON(!PageLocked(page)); + __udf_adinicb_readpage(page); unlock_page(page); return 0; @@ -77,6 +81,25 @@ static int udf_adinicb_writepage(struct page *page, return 0; } +static int udf_adinicb_write_begin(struct file *file, + struct address_space *mapping, loff_t pos, + unsigned len, unsigned flags, struct page **pagep, + void **fsdata) +{ + struct page *page; + + if (WARN_ON_ONCE(pos >= PAGE_CACHE_SIZE)) + return -EIO; + page = grab_cache_page_write_begin(mapping, 0, flags); + if (!page) + return -ENOMEM; + *pagep = page; + + if (!PageUptodate(page) && len != PAGE_CACHE_SIZE) + __udf_adinicb_readpage(page); + return 0; +} + static int udf_adinicb_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, @@ -95,11 +118,20 @@ static int udf_adinicb_write_end(struct file *file, return simple_write_end(file, mapping, pos, len, copied, page, fsdata); } +static ssize_t udf_adinicb_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, + loff_t offset, unsigned long nr_segs) +{ + /* Fallback to buffered I/O. */ + return 0; +} + const struct address_space_operations udf_adinicb_aops = { .readpage = udf_adinicb_readpage, .writepage = udf_adinicb_writepage, - .write_begin = simple_write_begin, - .write_end = udf_adinicb_write_end, + .write_begin = udf_adinicb_write_begin, + .write_end = udf_adinicb_write_end, + .direct_IO = udf_adinicb_direct_IO, }; static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, diff --git a/fs/udf/inode.c b/fs/udf/inode.c index aa23346..df88b95 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -95,11 +95,33 @@ void udf_evict_inode(struct inode *inode) } } +static void udf_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + struct udf_inode_info *iinfo = UDF_I(inode); + loff_t isize = inode->i_size; + + if (to > isize) { + truncate_pagecache(inode, to, isize); + if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + down_write(&iinfo->i_data_sem); + udf_truncate_extents(inode); + up_write(&iinfo->i_data_sem); + } + } +} + static int udf_writepage(struct page *page, struct writeback_control *wbc) { return block_write_full_page(page, udf_get_block, wbc); } +static int udf_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, udf_get_block); +} + static int udf_readpage(struct file *file, struct page *page) { return mpage_readpage(page, udf_get_block); @@ -118,21 +140,24 @@ static int udf_write_begin(struct file *file, struct address_space *mapping, int ret; ret = block_write_begin(mapping, pos, len, flags, pagep, udf_get_block); - if (unlikely(ret)) { - struct inode *inode = mapping->host; - struct udf_inode_info *iinfo = UDF_I(inode); - loff_t isize = inode->i_size; - - if (pos + len > isize) { - truncate_pagecache(inode, pos + len, isize); - if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { - down_write(&iinfo->i_data_sem); - udf_truncate_extents(inode); - up_write(&iinfo->i_data_sem); - } - } - } + if (unlikely(ret)) + udf_write_failed(mapping, pos + len); + return ret; +} +static ssize_t udf_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, + loff_t offset, unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t ret; + + ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, + udf_get_block); + if (unlikely(ret < 0 && (rw & WRITE))) + udf_write_failed(mapping, offset + iov_length(iov, nr_segs)); return ret; } @@ -145,8 +170,10 @@ const struct address_space_operations udf_aops = { .readpage = udf_readpage, .readpages = udf_readpages, .writepage = udf_writepage, - .write_begin = udf_write_begin, - .write_end = generic_write_end, + .writepages = udf_writepages, + .write_begin = udf_write_begin, + .write_end = generic_write_end, + .direct_IO = udf_direct_IO, .bmap = udf_bmap, }; @@ -1312,14 +1339,14 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) } read_lock(&sbi->s_cred_lock); - inode->i_uid = le32_to_cpu(fe->uid); - if (inode->i_uid == -1 || + i_uid_write(inode, le32_to_cpu(fe->uid)); + if (!uid_valid(inode->i_uid) || UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_IGNORE) || UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_SET)) inode->i_uid = UDF_SB(inode->i_sb)->s_uid; - inode->i_gid = le32_to_cpu(fe->gid); - if (inode->i_gid == -1 || + i_gid_write(inode, le32_to_cpu(fe->gid)); + if (!gid_valid(inode->i_gid) || UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_IGNORE) || UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_SET)) inode->i_gid = UDF_SB(inode->i_sb)->s_gid; @@ -1542,12 +1569,12 @@ static int udf_update_inode(struct inode *inode, int do_sync) if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET)) fe->uid = cpu_to_le32(-1); else - fe->uid = cpu_to_le32(inode->i_uid); + fe->uid = cpu_to_le32(i_uid_read(inode)); if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_FORGET)) fe->gid = cpu_to_le32(-1); else - fe->gid = cpu_to_le32(inode->i_gid); + fe->gid = cpu_to_le32(i_gid_read(inode)); udfperms = ((inode->i_mode & S_IRWXO)) | ((inode->i_mode & S_IRWXG) << 2) | diff --git a/fs/udf/super.c b/fs/udf/super.c index 18fc038..d44fb56 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -171,6 +171,11 @@ static int init_inodecache(void) static void destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(udf_inode_cachep); } @@ -199,8 +204,8 @@ struct udf_options { unsigned int rootdir; unsigned int flags; umode_t umask; - gid_t gid; - uid_t uid; + kgid_t gid; + kuid_t uid; umode_t fmode; umode_t dmode; struct nls_table *nls_map; @@ -335,9 +340,9 @@ static int udf_show_options(struct seq_file *seq, struct dentry *root) if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_IGNORE)) seq_puts(seq, ",gid=ignore"); if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_SET)) - seq_printf(seq, ",uid=%u", sbi->s_uid); + seq_printf(seq, ",uid=%u", from_kuid(&init_user_ns, sbi->s_uid)); if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET)) - seq_printf(seq, ",gid=%u", sbi->s_gid); + seq_printf(seq, ",gid=%u", from_kgid(&init_user_ns, sbi->s_gid)); if (sbi->s_umask != 0) seq_printf(seq, ",umask=%ho", sbi->s_umask); if (sbi->s_fmode != UDF_INVALID_MODE) @@ -516,13 +521,17 @@ static int udf_parse_options(char *options, struct udf_options *uopt, case Opt_gid: if (match_int(args, &option)) return 0; - uopt->gid = option; + uopt->gid = make_kgid(current_user_ns(), option); + if (!gid_valid(uopt->gid)) + return 0; uopt->flags |= (1 << UDF_FLAG_GID_SET); break; case Opt_uid: if (match_int(args, &option)) return 0; - uopt->uid = option; + uopt->uid = make_kuid(current_user_ns(), option); + if (!uid_valid(uopt->uid)) + return 0; uopt->flags |= (1 << UDF_FLAG_UID_SET); break; case Opt_umask: @@ -1934,8 +1943,8 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) struct udf_sb_info *sbi; uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT); - uopt.uid = -1; - uopt.gid = -1; + uopt.uid = INVALID_UID; + uopt.gid = INVALID_GID; uopt.umask = 0; uopt.fmode = UDF_INVALID_MODE; uopt.dmode = UDF_INVALID_MODE; diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 42ad69a..5f02722 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -128,8 +128,8 @@ struct udf_sb_info { /* Default permissions */ umode_t s_umask; - gid_t s_gid; - uid_t s_uid; + kgid_t s_gid; + kuid_t s_uid; umode_t s_fmode; umode_t s_dmode; /* Lock protecting consistency of above permission settings */ diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index dd7c89d..eb6d0b7 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -597,8 +597,8 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode) /* * Linux now has 32-bit uid and gid, so we can support EFT. */ - inode->i_uid = ufs_get_inode_uid(sb, ufs_inode); - inode->i_gid = ufs_get_inode_gid(sb, ufs_inode); + i_uid_write(inode, ufs_get_inode_uid(sb, ufs_inode)); + i_gid_write(inode, ufs_get_inode_gid(sb, ufs_inode)); inode->i_size = fs64_to_cpu(sb, ufs_inode->ui_size); inode->i_atime.tv_sec = fs32_to_cpu(sb, ufs_inode->ui_atime.tv_sec); @@ -645,8 +645,8 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode) /* * Linux now has 32-bit uid and gid, so we can support EFT. */ - inode->i_uid = fs32_to_cpu(sb, ufs2_inode->ui_uid); - inode->i_gid = fs32_to_cpu(sb, ufs2_inode->ui_gid); + i_uid_write(inode, fs32_to_cpu(sb, ufs2_inode->ui_uid)); + i_gid_write(inode, fs32_to_cpu(sb, ufs2_inode->ui_gid)); inode->i_size = fs64_to_cpu(sb, ufs2_inode->ui_size); inode->i_atime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_atime); @@ -745,8 +745,8 @@ static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode) ufs_inode->ui_mode = cpu_to_fs16(sb, inode->i_mode); ufs_inode->ui_nlink = cpu_to_fs16(sb, inode->i_nlink); - ufs_set_inode_uid(sb, ufs_inode, inode->i_uid); - ufs_set_inode_gid(sb, ufs_inode, inode->i_gid); + ufs_set_inode_uid(sb, ufs_inode, i_uid_read(inode)); + ufs_set_inode_gid(sb, ufs_inode, i_gid_read(inode)); ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size); ufs_inode->ui_atime.tv_sec = cpu_to_fs32(sb, inode->i_atime.tv_sec); @@ -789,8 +789,8 @@ static void ufs2_update_inode(struct inode *inode, struct ufs2_inode *ufs_inode) ufs_inode->ui_mode = cpu_to_fs16(sb, inode->i_mode); ufs_inode->ui_nlink = cpu_to_fs16(sb, inode->i_nlink); - ufs_inode->ui_uid = cpu_to_fs32(sb, inode->i_uid); - ufs_inode->ui_gid = cpu_to_fs32(sb, inode->i_gid); + ufs_inode->ui_uid = cpu_to_fs32(sb, i_uid_read(inode)); + ufs_inode->ui_gid = cpu_to_fs32(sb, i_gid_read(inode)); ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size); ufs_inode->ui_atime = cpu_to_fs64(sb, inode->i_atime.tv_sec); diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 444927e..f7cfecf 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1466,6 +1466,11 @@ static int init_inodecache(void) static void destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(ufs_inode_cachep); } diff --git a/fs/utimes.c b/fs/utimes.c index fa4dbe4..bb0696a 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -140,19 +140,18 @@ long do_utimes(int dfd, const char __user *filename, struct timespec *times, goto out; if (filename == NULL && dfd != AT_FDCWD) { - int fput_needed; - struct file *file; + struct fd f; if (flags & AT_SYMLINK_NOFOLLOW) goto out; - file = fget_light(dfd, &fput_needed); + f = fdget(dfd); error = -EBADF; - if (!file) + if (!f.file) goto out; - error = utimes_common(&file->f_path, times); - fput_light(file, fput_needed); + error = utimes_common(&f.file->f_path, times); + fdput(f); } else { struct path path; int lookup_flags = 0; @@ -20,6 +20,7 @@ #include <linux/fsnotify.h> #include <linux/audit.h> #include <linux/vmalloc.h> +#include <linux/posix_acl_xattr.h> #include <asm/uaccess.h> @@ -295,11 +296,13 @@ vfs_removexattr(struct dentry *dentry, const char *name) if (error) return error; + mutex_lock(&inode->i_mutex); error = security_inode_removexattr(dentry, name); - if (error) + if (error) { + mutex_unlock(&inode->i_mutex); return error; + } - mutex_lock(&inode->i_mutex); error = inode->i_op->removexattr(dentry, name); mutex_unlock(&inode->i_mutex); @@ -347,6 +350,9 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value, error = -EFAULT; goto out; } + if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || + (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) + posix_acl_fix_xattr_from_user(kvalue, size); } error = vfs_setxattr(d, kname, kvalue, size, flags); @@ -399,22 +405,20 @@ SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname, SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, const void __user *,value, size_t, size, int, flags) { - int fput_needed; - struct file *f; + struct fd f = fdget(fd); struct dentry *dentry; int error = -EBADF; - f = fget_light(fd, &fput_needed); - if (!f) + if (!f.file) return error; - dentry = f->f_path.dentry; + dentry = f.file->f_path.dentry; audit_inode(NULL, dentry); - error = mnt_want_write_file(f); + error = mnt_want_write_file(f.file); if (!error) { error = setxattr(dentry, name, value, size, flags); - mnt_drop_write_file(f); + mnt_drop_write_file(f.file); } - fput_light(f, fput_needed); + fdput(f); return error; } @@ -450,6 +454,9 @@ getxattr(struct dentry *d, const char __user *name, void __user *value, error = vfs_getxattr(d, kname, kvalue, size); if (error > 0) { + if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || + (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) + posix_acl_fix_xattr_to_user(kvalue, size); if (size && copy_to_user(value, kvalue, error)) error = -EFAULT; } else if (error == -ERANGE && size >= XATTR_SIZE_MAX) { @@ -495,16 +502,14 @@ SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname, SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name, void __user *, value, size_t, size) { - int fput_needed; - struct file *f; + struct fd f = fdget(fd); ssize_t error = -EBADF; - f = fget_light(fd, &fput_needed); - if (!f) + if (!f.file) return error; - audit_inode(NULL, f->f_path.dentry); - error = getxattr(f->f_path.dentry, name, value, size); - fput_light(f, fput_needed); + audit_inode(NULL, f.file->f_path.dentry); + error = getxattr(f.file->f_path.dentry, name, value, size); + fdput(f); return error; } @@ -576,16 +581,14 @@ SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list, SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size) { - int fput_needed; - struct file *f; + struct fd f = fdget(fd); ssize_t error = -EBADF; - f = fget_light(fd, &fput_needed); - if (!f) + if (!f.file) return error; - audit_inode(NULL, f->f_path.dentry); - error = listxattr(f->f_path.dentry, list, size); - fput_light(f, fput_needed); + audit_inode(NULL, f.file->f_path.dentry); + error = listxattr(f.file->f_path.dentry, list, size); + fdput(f); return error; } @@ -645,22 +648,20 @@ SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname, SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) { - int fput_needed; - struct file *f; + struct fd f = fdget(fd); struct dentry *dentry; int error = -EBADF; - f = fget_light(fd, &fput_needed); - if (!f) + if (!f.file) return error; - dentry = f->f_path.dentry; + dentry = f.file->f_path.dentry; audit_inode(NULL, dentry); - error = mnt_want_write_file(f); + error = mnt_want_write_file(f.file); if (!error) { error = removexattr(dentry, name); - mnt_drop_write_file(f); + mnt_drop_write_file(f.file); } - fput_light(f, fput_needed); + fdput(f); return error; } @@ -791,3 +792,183 @@ EXPORT_SYMBOL(generic_getxattr); EXPORT_SYMBOL(generic_listxattr); EXPORT_SYMBOL(generic_setxattr); EXPORT_SYMBOL(generic_removexattr); + +/* + * Allocate new xattr and copy in the value; but leave the name to callers. + */ +struct simple_xattr *simple_xattr_alloc(const void *value, size_t size) +{ + struct simple_xattr *new_xattr; + size_t len; + + /* wrap around? */ + len = sizeof(*new_xattr) + size; + if (len <= sizeof(*new_xattr)) + return NULL; + + new_xattr = kmalloc(len, GFP_KERNEL); + if (!new_xattr) + return NULL; + + new_xattr->size = size; + memcpy(new_xattr->value, value, size); + return new_xattr; +} + +/* + * xattr GET operation for in-memory/pseudo filesystems + */ +int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, + void *buffer, size_t size) +{ + struct simple_xattr *xattr; + int ret = -ENODATA; + + spin_lock(&xattrs->lock); + list_for_each_entry(xattr, &xattrs->head, list) { + if (strcmp(name, xattr->name)) + continue; + + ret = xattr->size; + if (buffer) { + if (size < xattr->size) + ret = -ERANGE; + else + memcpy(buffer, xattr->value, xattr->size); + } + break; + } + spin_unlock(&xattrs->lock); + return ret; +} + +static int __simple_xattr_set(struct simple_xattrs *xattrs, const char *name, + const void *value, size_t size, int flags) +{ + struct simple_xattr *xattr; + struct simple_xattr *uninitialized_var(new_xattr); + int err = 0; + + /* value == NULL means remove */ + if (value) { + new_xattr = simple_xattr_alloc(value, size); + if (!new_xattr) + return -ENOMEM; + + new_xattr->name = kstrdup(name, GFP_KERNEL); + if (!new_xattr->name) { + kfree(new_xattr); + return -ENOMEM; + } + } + + spin_lock(&xattrs->lock); + list_for_each_entry(xattr, &xattrs->head, list) { + if (!strcmp(name, xattr->name)) { + if (flags & XATTR_CREATE) { + xattr = new_xattr; + err = -EEXIST; + } else if (new_xattr) { + list_replace(&xattr->list, &new_xattr->list); + } else { + list_del(&xattr->list); + } + goto out; + } + } + if (flags & XATTR_REPLACE) { + xattr = new_xattr; + err = -ENODATA; + } else { + list_add(&new_xattr->list, &xattrs->head); + xattr = NULL; + } +out: + spin_unlock(&xattrs->lock); + if (xattr) { + kfree(xattr->name); + kfree(xattr); + } + return err; + +} + +/** + * simple_xattr_set - xattr SET operation for in-memory/pseudo filesystems + * @xattrs: target simple_xattr list + * @name: name of the new extended attribute + * @value: value of the new xattr. If %NULL, will remove the attribute + * @size: size of the new xattr + * @flags: %XATTR_{CREATE|REPLACE} + * + * %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails + * with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist; + * otherwise, fails with -ENODATA. + * + * Returns 0 on success, -errno on failure. + */ +int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, + const void *value, size_t size, int flags) +{ + if (size == 0) + value = ""; /* empty EA, do not remove */ + return __simple_xattr_set(xattrs, name, value, size, flags); +} + +/* + * xattr REMOVE operation for in-memory/pseudo filesystems + */ +int simple_xattr_remove(struct simple_xattrs *xattrs, const char *name) +{ + return __simple_xattr_set(xattrs, name, NULL, 0, XATTR_REPLACE); +} + +static bool xattr_is_trusted(const char *name) +{ + return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); +} + +/* + * xattr LIST operation for in-memory/pseudo filesystems + */ +ssize_t simple_xattr_list(struct simple_xattrs *xattrs, char *buffer, + size_t size) +{ + bool trusted = capable(CAP_SYS_ADMIN); + struct simple_xattr *xattr; + size_t used = 0; + + spin_lock(&xattrs->lock); + list_for_each_entry(xattr, &xattrs->head, list) { + size_t len; + + /* skip "trusted." attributes for unprivileged callers */ + if (!trusted && xattr_is_trusted(xattr->name)) + continue; + + len = strlen(xattr->name) + 1; + used += len; + if (buffer) { + if (size < used) { + used = -ERANGE; + break; + } + memcpy(buffer, xattr->name, len); + buffer += len; + } + } + spin_unlock(&xattrs->lock); + + return used; +} + +/* + * Adds an extended attribute to the list + */ +void simple_xattr_list_add(struct simple_xattrs *xattrs, + struct simple_xattr *new_xattr) +{ + spin_lock(&xattrs->lock); + list_add(&new_xattr->list, &xattrs->head); + spin_unlock(&xattrs->lock); +} diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c index 69d06b0..11efd83 100644 --- a/fs/xattr_acl.c +++ b/fs/xattr_acl.c @@ -9,13 +9,72 @@ #include <linux/fs.h> #include <linux/posix_acl_xattr.h> #include <linux/gfp.h> +#include <linux/user_namespace.h> +/* + * Fix up the uids and gids in posix acl extended attributes in place. + */ +static void posix_acl_fix_xattr_userns( + struct user_namespace *to, struct user_namespace *from, + void *value, size_t size) +{ + posix_acl_xattr_header *header = (posix_acl_xattr_header *)value; + posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end; + int count; + kuid_t uid; + kgid_t gid; + + if (!value) + return; + if (size < sizeof(posix_acl_xattr_header)) + return; + if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) + return; + + count = posix_acl_xattr_count(size); + if (count < 0) + return; + if (count == 0) + return; + + for (end = entry + count; entry != end; entry++) { + switch(le16_to_cpu(entry->e_tag)) { + case ACL_USER: + uid = make_kuid(from, le32_to_cpu(entry->e_id)); + entry->e_id = cpu_to_le32(from_kuid(to, uid)); + break; + case ACL_GROUP: + gid = make_kgid(from, le32_to_cpu(entry->e_id)); + entry->e_id = cpu_to_le32(from_kuid(to, uid)); + break; + default: + break; + } + } +} + +void posix_acl_fix_xattr_from_user(void *value, size_t size) +{ + struct user_namespace *user_ns = current_user_ns(); + if (user_ns == &init_user_ns) + return; + posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size); +} + +void posix_acl_fix_xattr_to_user(void *value, size_t size) +{ + struct user_namespace *user_ns = current_user_ns(); + if (user_ns == &init_user_ns) + return; + posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size); +} /* * Convert from extended attribute to in-memory representation. */ struct posix_acl * -posix_acl_from_xattr(const void *value, size_t size) +posix_acl_from_xattr(struct user_namespace *user_ns, + const void *value, size_t size) { posix_acl_xattr_header *header = (posix_acl_xattr_header *)value; posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end; @@ -50,12 +109,21 @@ posix_acl_from_xattr(const void *value, size_t size) case ACL_GROUP_OBJ: case ACL_MASK: case ACL_OTHER: - acl_e->e_id = ACL_UNDEFINED_ID; break; case ACL_USER: + acl_e->e_uid = + make_kuid(user_ns, + le32_to_cpu(entry->e_id)); + if (!uid_valid(acl_e->e_uid)) + goto fail; + break; case ACL_GROUP: - acl_e->e_id = le32_to_cpu(entry->e_id); + acl_e->e_gid = + make_kgid(user_ns, + le32_to_cpu(entry->e_id)); + if (!gid_valid(acl_e->e_gid)) + goto fail; break; default: @@ -74,7 +142,8 @@ EXPORT_SYMBOL (posix_acl_from_xattr); * Convert from in-memory to extended attribute representation. */ int -posix_acl_to_xattr(const struct posix_acl *acl, void *buffer, size_t size) +posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, + void *buffer, size_t size) { posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer; posix_acl_xattr_entry *ext_entry = ext_acl->a_entries; @@ -89,9 +158,22 @@ posix_acl_to_xattr(const struct posix_acl *acl, void *buffer, size_t size) ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); for (n=0; n < acl->a_count; n++, ext_entry++) { - ext_entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); - ext_entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); - ext_entry->e_id = cpu_to_le32(acl->a_entries[n].e_id); + const struct posix_acl_entry *acl_e = &acl->a_entries[n]; + ext_entry->e_tag = cpu_to_le16(acl_e->e_tag); + ext_entry->e_perm = cpu_to_le16(acl_e->e_perm); + switch(acl_e->e_tag) { + case ACL_USER: + ext_entry->e_id = + cpu_to_le32(from_kuid(user_ns, acl_e->e_uid)); + break; + case ACL_GROUP: + ext_entry->e_id = + cpu_to_le32(from_kgid(user_ns, acl_e->e_gid)); + break; + default: + ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID); + break; + } } return real_size; } diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index ac702a6..1d32f1d 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -337,7 +337,7 @@ xfs_xattr_acl_get(struct dentry *dentry, const char *name, if (acl == NULL) return -ENODATA; - error = posix_acl_to_xattr(acl, value, size); + error = posix_acl_to_xattr(&init_user_ns, acl, value, size); posix_acl_release(acl); return error; @@ -361,7 +361,7 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name, if (!value) goto set_acl; - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (!acl) { /* * acl_set_file(3) may request that we set default ACLs with diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index d7a9dd7..933b793 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -96,6 +96,7 @@ xfs_buf_lru_add( atomic_inc(&bp->b_hold); list_add_tail(&bp->b_lru, &btp->bt_lru); btp->bt_lru_nr++; + bp->b_lru_flags &= ~_XBF_LRU_DISPOSE; } spin_unlock(&btp->bt_lru_lock); } @@ -154,7 +155,8 @@ xfs_buf_stale( struct xfs_buftarg *btp = bp->b_target; spin_lock(&btp->bt_lru_lock); - if (!list_empty(&bp->b_lru)) { + if (!list_empty(&bp->b_lru) && + !(bp->b_lru_flags & _XBF_LRU_DISPOSE)) { list_del_init(&bp->b_lru); btp->bt_lru_nr--; atomic_dec(&bp->b_hold); @@ -1501,6 +1503,7 @@ xfs_buftarg_shrink( */ list_move(&bp->b_lru, &dispose); btp->bt_lru_nr--; + bp->b_lru_flags |= _XBF_LRU_DISPOSE; } spin_unlock(&btp->bt_lru_lock); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index d03b73b..7c0b6a0 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -38,27 +38,28 @@ typedef enum { XBRW_ZERO = 3, /* Zero target memory */ } xfs_buf_rw_t; -#define XBF_READ (1 << 0) /* buffer intended for reading from device */ -#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ -#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ -#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ -#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ -#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ +#define XBF_READ (1 << 0) /* buffer intended for reading from device */ +#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ +#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ +#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ +#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ +#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ /* I/O hints for the BIO layer */ -#define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */ -#define XBF_FUA (1 << 11)/* force cache write through mode */ -#define XBF_FLUSH (1 << 12)/* flush the disk cache before a write */ +#define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */ +#define XBF_FUA (1 << 11)/* force cache write through mode */ +#define XBF_FLUSH (1 << 12)/* flush the disk cache before a write */ /* flags used only as arguments to access routines */ -#define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */ -#define XBF_UNMAPPED (1 << 17)/* do not map the buffer */ +#define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */ +#define XBF_UNMAPPED (1 << 17)/* do not map the buffer */ /* flags used only internally */ -#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ -#define _XBF_KMEM (1 << 21)/* backed by heap memory */ -#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ -#define _XBF_COMPOUND (1 << 23)/* compound buffer */ +#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ +#define _XBF_KMEM (1 << 21)/* backed by heap memory */ +#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ +#define _XBF_COMPOUND (1 << 23)/* compound buffer */ +#define _XBF_LRU_DISPOSE (1 << 24)/* buffer being discarded */ typedef unsigned int xfs_buf_flags_t; @@ -72,12 +73,13 @@ typedef unsigned int xfs_buf_flags_t; { XBF_SYNCIO, "SYNCIO" }, \ { XBF_FUA, "FUA" }, \ { XBF_FLUSH, "FLUSH" }, \ - { XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\ + { XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\ { XBF_UNMAPPED, "UNMAPPED" }, /* ditto */\ { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ - { _XBF_COMPOUND, "COMPOUND" } + { _XBF_COMPOUND, "COMPOUND" }, \ + { _XBF_LRU_DISPOSE, "LRU_DISPOSE" } typedef struct xfs_buftarg { dev_t bt_dev; @@ -124,7 +126,12 @@ typedef struct xfs_buf { xfs_buf_flags_t b_flags; /* status flags */ struct semaphore b_sema; /* semaphore for lockables */ + /* + * concurrent access to b_lru and b_lru_flags are protected by + * bt_lru_lock and not by b_sema + */ struct list_head b_lru; /* lru list */ + xfs_buf_flags_t b_lru_flags; /* internal lru status flags */ wait_queue_head_t b_waiters; /* unpin waiters */ struct list_head b_list; struct xfs_perag *b_pag; /* contains rbtree root */ diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index e00de08..b9b8646 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -48,44 +48,44 @@ xfs_swapext( xfs_swapext_t *sxp) { xfs_inode_t *ip, *tip; - struct file *file, *tmp_file; + struct fd f, tmp; int error = 0; /* Pull information for the target fd */ - file = fget((int)sxp->sx_fdtarget); - if (!file) { + f = fdget((int)sxp->sx_fdtarget); + if (!f.file) { error = XFS_ERROR(EINVAL); goto out; } - if (!(file->f_mode & FMODE_WRITE) || - !(file->f_mode & FMODE_READ) || - (file->f_flags & O_APPEND)) { + if (!(f.file->f_mode & FMODE_WRITE) || + !(f.file->f_mode & FMODE_READ) || + (f.file->f_flags & O_APPEND)) { error = XFS_ERROR(EBADF); goto out_put_file; } - tmp_file = fget((int)sxp->sx_fdtmp); - if (!tmp_file) { + tmp = fdget((int)sxp->sx_fdtmp); + if (!tmp.file) { error = XFS_ERROR(EINVAL); goto out_put_file; } - if (!(tmp_file->f_mode & FMODE_WRITE) || - !(tmp_file->f_mode & FMODE_READ) || - (tmp_file->f_flags & O_APPEND)) { + if (!(tmp.file->f_mode & FMODE_WRITE) || + !(tmp.file->f_mode & FMODE_READ) || + (tmp.file->f_flags & O_APPEND)) { error = XFS_ERROR(EBADF); goto out_put_tmp_file; } - if (IS_SWAPFILE(file->f_path.dentry->d_inode) || - IS_SWAPFILE(tmp_file->f_path.dentry->d_inode)) { + if (IS_SWAPFILE(f.file->f_path.dentry->d_inode) || + IS_SWAPFILE(tmp.file->f_path.dentry->d_inode)) { error = XFS_ERROR(EINVAL); goto out_put_tmp_file; } - ip = XFS_I(file->f_path.dentry->d_inode); - tip = XFS_I(tmp_file->f_path.dentry->d_inode); + ip = XFS_I(f.file->f_path.dentry->d_inode); + tip = XFS_I(tmp.file->f_path.dentry->d_inode); if (ip->i_mount != tip->i_mount) { error = XFS_ERROR(EINVAL); @@ -105,9 +105,9 @@ xfs_swapext( error = xfs_swap_extents(ip, tip, sxp); out_put_tmp_file: - fput(tmp_file); + fdput(tmp); out_put_file: - fput(file); + fdput(f); out: return error; } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 56afcdb..aa473fa 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -36,6 +36,7 @@ #include <linux/dcache.h> #include <linux/falloc.h> +#include <linux/pagevec.h> static const struct vm_operations_struct xfs_file_vm_ops; @@ -939,7 +940,6 @@ xfs_file_mmap( struct vm_area_struct *vma) { vma->vm_ops = &xfs_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; file_accessed(filp); return 0; @@ -959,17 +959,232 @@ xfs_vm_page_mkwrite( return block_page_mkwrite(vma, vmf, xfs_get_blocks); } +/* + * This type is designed to indicate the type of offset we would like + * to search from page cache for either xfs_seek_data() or xfs_seek_hole(). + */ +enum { + HOLE_OFF = 0, + DATA_OFF, +}; + +/* + * Lookup the desired type of offset from the given page. + * + * On success, return true and the offset argument will point to the + * start of the region that was found. Otherwise this function will + * return false and keep the offset argument unchanged. + */ +STATIC bool +xfs_lookup_buffer_offset( + struct page *page, + loff_t *offset, + unsigned int type) +{ + loff_t lastoff = page_offset(page); + bool found = false; + struct buffer_head *bh, *head; + + bh = head = page_buffers(page); + do { + /* + * Unwritten extents that have data in the page + * cache covering them can be identified by the + * BH_Unwritten state flag. Pages with multiple + * buffers might have a mix of holes, data and + * unwritten extents - any buffer with valid + * data in it should have BH_Uptodate flag set + * on it. + */ + if (buffer_unwritten(bh) || + buffer_uptodate(bh)) { + if (type == DATA_OFF) + found = true; + } else { + if (type == HOLE_OFF) + found = true; + } + + if (found) { + *offset = lastoff; + break; + } + lastoff += bh->b_size; + } while ((bh = bh->b_this_page) != head); + + return found; +} + +/* + * This routine is called to find out and return a data or hole offset + * from the page cache for unwritten extents according to the desired + * type for xfs_seek_data() or xfs_seek_hole(). + * + * The argument offset is used to tell where we start to search from the + * page cache. Map is used to figure out the end points of the range to + * lookup pages. + * + * Return true if the desired type of offset was found, and the argument + * offset is filled with that address. Otherwise, return false and keep + * offset unchanged. + */ +STATIC bool +xfs_find_get_desired_pgoff( + struct inode *inode, + struct xfs_bmbt_irec *map, + unsigned int type, + loff_t *offset) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct pagevec pvec; + pgoff_t index; + pgoff_t end; + loff_t endoff; + loff_t startoff = *offset; + loff_t lastoff = startoff; + bool found = false; + + pagevec_init(&pvec, 0); + + index = startoff >> PAGE_CACHE_SHIFT; + endoff = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount); + end = endoff >> PAGE_CACHE_SHIFT; + do { + int want; + unsigned nr_pages; + unsigned int i; + + want = min_t(pgoff_t, end - index, PAGEVEC_SIZE); + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, + want); + /* + * No page mapped into given range. If we are searching holes + * and if this is the first time we got into the loop, it means + * that the given offset is landed in a hole, return it. + * + * If we have already stepped through some block buffers to find + * holes but they all contains data. In this case, the last + * offset is already updated and pointed to the end of the last + * mapped page, if it does not reach the endpoint to search, + * that means there should be a hole between them. + */ + if (nr_pages == 0) { + /* Data search found nothing */ + if (type == DATA_OFF) + break; + + ASSERT(type == HOLE_OFF); + if (lastoff == startoff || lastoff < endoff) { + found = true; + *offset = lastoff; + } + break; + } + + /* + * At lease we found one page. If this is the first time we + * step into the loop, and if the first page index offset is + * greater than the given search offset, a hole was found. + */ + if (type == HOLE_OFF && lastoff == startoff && + lastoff < page_offset(pvec.pages[0])) { + found = true; + break; + } + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + loff_t b_offset; + + /* + * At this point, the page may be truncated or + * invalidated (changing page->mapping to NULL), + * or even swizzled back from swapper_space to tmpfs + * file mapping. However, page->index will not change + * because we have a reference on the page. + * + * Searching done if the page index is out of range. + * If the current offset is not reaches the end of + * the specified search range, there should be a hole + * between them. + */ + if (page->index > end) { + if (type == HOLE_OFF && lastoff < endoff) { + *offset = lastoff; + found = true; + } + goto out; + } + + lock_page(page); + /* + * Page truncated or invalidated(page->mapping == NULL). + * We can freely skip it and proceed to check the next + * page. + */ + if (unlikely(page->mapping != inode->i_mapping)) { + unlock_page(page); + continue; + } + + if (!page_has_buffers(page)) { + unlock_page(page); + continue; + } + + found = xfs_lookup_buffer_offset(page, &b_offset, type); + if (found) { + /* + * The found offset may be less than the start + * point to search if this is the first time to + * come here. + */ + *offset = max_t(loff_t, startoff, b_offset); + unlock_page(page); + goto out; + } + + /* + * We either searching data but nothing was found, or + * searching hole but found a data buffer. In either + * case, probably the next page contains the desired + * things, update the last offset to it so. + */ + lastoff = page_offset(page) + PAGE_SIZE; + unlock_page(page); + } + + /* + * The number of returned pages less than our desired, search + * done. In this case, nothing was found for searching data, + * but we found a hole behind the last offset. + */ + if (nr_pages < want) { + if (type == HOLE_OFF) { + *offset = lastoff; + found = true; + } + break; + } + + index = pvec.pages[i - 1]->index + 1; + pagevec_release(&pvec); + } while (index <= end); + +out: + pagevec_release(&pvec); + return found; +} + STATIC loff_t xfs_seek_data( struct file *file, - loff_t start, - u32 type) + loff_t start) { struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - struct xfs_bmbt_irec map[2]; - int nmap = 2; loff_t uninitialized_var(offset); xfs_fsize_t isize; xfs_fileoff_t fsbno; @@ -985,36 +1200,74 @@ xfs_seek_data( goto out_unlock; } - fsbno = XFS_B_TO_FSBT(mp, start); - /* * Try to read extents from the first block indicated * by fsbno to the end block of the file. */ + fsbno = XFS_B_TO_FSBT(mp, start); end = XFS_B_TO_FSB(mp, isize); + for (;;) { + struct xfs_bmbt_irec map[2]; + int nmap = 2; + unsigned int i; - error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap, - XFS_BMAPI_ENTIRE); - if (error) - goto out_unlock; + error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap, + XFS_BMAPI_ENTIRE); + if (error) + goto out_unlock; - /* - * Treat unwritten extent as data extent since it might - * contains dirty data in page cache. - */ - if (map[0].br_startblock != HOLESTARTBLOCK) { - offset = max_t(loff_t, start, - XFS_FSB_TO_B(mp, map[0].br_startoff)); - } else { + /* No extents at given offset, must be beyond EOF */ + if (nmap == 0) { + error = ENXIO; + goto out_unlock; + } + + for (i = 0; i < nmap; i++) { + offset = max_t(loff_t, start, + XFS_FSB_TO_B(mp, map[i].br_startoff)); + + /* Landed in a data extent */ + if (map[i].br_startblock == DELAYSTARTBLOCK || + (map[i].br_state == XFS_EXT_NORM && + !isnullstartblock(map[i].br_startblock))) + goto out; + + /* + * Landed in an unwritten extent, try to search data + * from page cache. + */ + if (map[i].br_state == XFS_EXT_UNWRITTEN) { + if (xfs_find_get_desired_pgoff(inode, &map[i], + DATA_OFF, &offset)) + goto out; + } + } + + /* + * map[0] is hole or its an unwritten extent but + * without data in page cache. Probably means that + * we are reading after EOF if nothing in map[1]. + */ if (nmap == 1) { error = ENXIO; goto out_unlock; } - offset = max_t(loff_t, start, - XFS_FSB_TO_B(mp, map[1].br_startoff)); + ASSERT(i > 1); + + /* + * Nothing was found, proceed to the next round of search + * if reading offset not beyond or hit EOF. + */ + fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount; + start = XFS_FSB_TO_B(mp, fsbno); + if (start >= isize) { + error = ENXIO; + goto out_unlock; + } } +out: if (offset != file->f_pos) file->f_pos = offset; @@ -1029,16 +1282,15 @@ out_unlock: STATIC loff_t xfs_seek_hole( struct file *file, - loff_t start, - u32 type) + loff_t start) { struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; loff_t uninitialized_var(offset); - loff_t holeoff; xfs_fsize_t isize; xfs_fileoff_t fsbno; + xfs_filblks_t end; uint lock; int error; @@ -1054,21 +1306,77 @@ xfs_seek_hole( } fsbno = XFS_B_TO_FSBT(mp, start); - error = xfs_bmap_first_unused(NULL, ip, 1, &fsbno, XFS_DATA_FORK); - if (error) - goto out_unlock; + end = XFS_B_TO_FSB(mp, isize); + + for (;;) { + struct xfs_bmbt_irec map[2]; + int nmap = 2; + unsigned int i; + + error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap, + XFS_BMAPI_ENTIRE); + if (error) + goto out_unlock; + + /* No extents at given offset, must be beyond EOF */ + if (nmap == 0) { + error = ENXIO; + goto out_unlock; + } + + for (i = 0; i < nmap; i++) { + offset = max_t(loff_t, start, + XFS_FSB_TO_B(mp, map[i].br_startoff)); + + /* Landed in a hole */ + if (map[i].br_startblock == HOLESTARTBLOCK) + goto out; + + /* + * Landed in an unwritten extent, try to search hole + * from page cache. + */ + if (map[i].br_state == XFS_EXT_UNWRITTEN) { + if (xfs_find_get_desired_pgoff(inode, &map[i], + HOLE_OFF, &offset)) + goto out; + } + } + + /* + * map[0] contains data or its unwritten but contains + * data in page cache, probably means that we are + * reading after EOF. We should fix offset to point + * to the end of the file(i.e., there is an implicit + * hole at the end of any file). + */ + if (nmap == 1) { + offset = isize; + break; + } + + ASSERT(i > 1); - holeoff = XFS_FSB_TO_B(mp, fsbno); - if (holeoff <= start) - offset = start; - else { /* - * xfs_bmap_first_unused() could return a value bigger than - * isize if there are no more holes past the supplied offset. + * Both mappings contains data, proceed to the next round of + * search if the current reading offset not beyond or hit EOF. */ - offset = min_t(loff_t, holeoff, isize); + fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount; + start = XFS_FSB_TO_B(mp, fsbno); + if (start >= isize) { + offset = isize; + break; + } } +out: + /* + * At this point, we must have found a hole. However, the returned + * offset may be bigger than the file size as it may be aligned to + * page boundary for unwritten extents, we need to deal with this + * situation in particular. + */ + offset = min_t(loff_t, offset, isize); if (offset != file->f_pos) file->f_pos = offset; @@ -1092,9 +1400,9 @@ xfs_file_llseek( case SEEK_SET: return generic_file_llseek(file, offset, origin); case SEEK_DATA: - return xfs_seek_data(file, offset, origin); + return xfs_seek_data(file, offset); case SEEK_HOLE: - return xfs_seek_hole(file, offset, origin); + return xfs_seek_hole(file, offset); default: return -EINVAL; } @@ -1134,4 +1442,5 @@ const struct file_operations xfs_dir_file_operations = { static const struct vm_operations_struct xfs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = xfs_vm_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 5aceb3f..445bf1a 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -431,7 +431,7 @@ xfs_ialloc_next_ag( spin_lock(&mp->m_agirotor_lock); agno = mp->m_agirotor; - if (++mp->m_agirotor == mp->m_maxagi) + if (++mp->m_agirotor >= mp->m_maxagi) mp->m_agirotor = 0; spin_unlock(&mp->m_agirotor_lock); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 0e0232c..8305f2a 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -70,16 +70,16 @@ xfs_find_handle( int hsize; xfs_handle_t handle; struct inode *inode; - struct file *file = NULL; + struct fd f; struct path path; int error; struct xfs_inode *ip; if (cmd == XFS_IOC_FD_TO_HANDLE) { - file = fget(hreq->fd); - if (!file) + f = fdget(hreq->fd); + if (!f.file) return -EBADF; - inode = file->f_path.dentry->d_inode; + inode = f.file->f_path.dentry->d_inode; } else { error = user_lpath((const char __user *)hreq->path, &path); if (error) @@ -134,7 +134,7 @@ xfs_find_handle( out_put: if (cmd == XFS_IOC_FD_TO_HANDLE) - fput(file); + fdput(f); else path_put(&path); return error; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 29c2f83..b2bd3a0 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -440,7 +440,7 @@ xfs_initialize_perag( xfs_agnumber_t agcount, xfs_agnumber_t *maxagi) { - xfs_agnumber_t index, max_metadata; + xfs_agnumber_t index; xfs_agnumber_t first_initialised = 0; xfs_perag_t *pag; xfs_agino_t agino; @@ -500,43 +500,10 @@ xfs_initialize_perag( else mp->m_flags &= ~XFS_MOUNT_32BITINODES; - if (mp->m_flags & XFS_MOUNT_32BITINODES) { - /* - * Calculate how much should be reserved for inodes to meet - * the max inode percentage. - */ - if (mp->m_maxicount) { - __uint64_t icount; - - icount = sbp->sb_dblocks * sbp->sb_imax_pct; - do_div(icount, 100); - icount += sbp->sb_agblocks - 1; - do_div(icount, sbp->sb_agblocks); - max_metadata = icount; - } else { - max_metadata = agcount; - } - - for (index = 0; index < agcount; index++) { - ino = XFS_AGINO_TO_INO(mp, index, agino); - if (ino > XFS_MAXINUMBER_32) { - index++; - break; - } - - pag = xfs_perag_get(mp, index); - pag->pagi_inodeok = 1; - if (index < max_metadata) - pag->pagf_metadata = 1; - xfs_perag_put(pag); - } - } else { - for (index = 0; index < agcount; index++) { - pag = xfs_perag_get(mp, index); - pag->pagi_inodeok = 1; - xfs_perag_put(pag); - } - } + if (mp->m_flags & XFS_MOUNT_32BITINODES) + index = xfs_set_inode32(mp); + else + index = xfs_set_inode64(mp); if (maxagi) *maxagi = index; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 05a05a7..deee09e 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -54,12 +54,7 @@ typedef struct xfs_trans_reservations { #include "xfs_sync.h" struct xlog; -struct xfs_mount_args; struct xfs_inode; -struct xfs_bmbt_irec; -struct xfs_bmap_free; -struct xfs_extdelta; -struct xfs_swapext; struct xfs_mru_cache; struct xfs_nameops; struct xfs_ail; diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index fed504f..71926d6 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -97,8 +97,7 @@ xfs_fs_set_xstate( STATIC int xfs_fs_get_dqblk( struct super_block *sb, - int type, - qid_t id, + struct kqid qid, struct fs_disk_quota *fdq) { struct xfs_mount *mp = XFS_M(sb); @@ -108,14 +107,14 @@ xfs_fs_get_dqblk( if (!XFS_IS_QUOTA_ON(mp)) return -ESRCH; - return -xfs_qm_scall_getquota(mp, id, xfs_quota_type(type), fdq); + return -xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid), + xfs_quota_type(qid.type), fdq); } STATIC int xfs_fs_set_dqblk( struct super_block *sb, - int type, - qid_t id, + struct kqid qid, struct fs_disk_quota *fdq) { struct xfs_mount *mp = XFS_M(sb); @@ -127,7 +126,8 @@ xfs_fs_set_dqblk( if (!XFS_IS_QUOTA_ON(mp)) return -ESRCH; - return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq); + return -xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid), + xfs_quota_type(qid.type), fdq); } const struct quotactl_ops xfs_quotactl_operations = { diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index bdaf4cb..26a09bd 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -88,6 +88,8 @@ mempool_t *xfs_ioend_pool; * unwritten extent conversion */ #define MNTOPT_NOBARRIER "nobarrier" /* .. disable */ #define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */ +#define MNTOPT_32BITINODE "inode32" /* inode allocation limited to + * XFS_MAXINUMBER_32 */ #define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */ #define MNTOPT_NOIKEEP "noikeep" /* free empty inode clusters */ #define MNTOPT_LARGEIO "largeio" /* report large I/O sizes in stat() */ @@ -120,12 +122,18 @@ mempool_t *xfs_ioend_pool; * in the future, too. */ enum { - Opt_barrier, Opt_nobarrier, Opt_err + Opt_barrier, + Opt_nobarrier, + Opt_inode64, + Opt_inode32, + Opt_err }; static const match_table_t tokens = { {Opt_barrier, "barrier"}, {Opt_nobarrier, "nobarrier"}, + {Opt_inode64, "inode64"}, + {Opt_inode32, "inode32"}, {Opt_err, NULL} }; @@ -197,7 +205,9 @@ xfs_parseargs( */ mp->m_flags |= XFS_MOUNT_BARRIER; mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; +#if !XFS_BIG_INUMS mp->m_flags |= XFS_MOUNT_SMALL_INUMS; +#endif /* * These can be overridden by the mount option parsing. @@ -294,6 +304,8 @@ xfs_parseargs( return EINVAL; } dswidth = simple_strtoul(value, &eov, 10); + } else if (!strcmp(this_char, MNTOPT_32BITINODE)) { + mp->m_flags |= XFS_MOUNT_SMALL_INUMS; } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; #if !XFS_BIG_INUMS @@ -492,6 +504,7 @@ xfs_showargs( { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD }, + { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_32BITINODE }, { 0, NULL } }; static struct proc_xfs_info xfs_info_unset[] = { @@ -591,6 +604,80 @@ xfs_max_file_offset( return (((__uint64_t)pagefactor) << bitshift) - 1; } +xfs_agnumber_t +xfs_set_inode32(struct xfs_mount *mp) +{ + xfs_agnumber_t index = 0; + xfs_agnumber_t maxagi = 0; + xfs_sb_t *sbp = &mp->m_sb; + xfs_agnumber_t max_metadata; + xfs_agino_t agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks -1, 0); + xfs_ino_t ino = XFS_AGINO_TO_INO(mp, sbp->sb_agcount -1, agino); + xfs_perag_t *pag; + + /* Calculate how much should be reserved for inodes to meet + * the max inode percentage. + */ + if (mp->m_maxicount) { + __uint64_t icount; + + icount = sbp->sb_dblocks * sbp->sb_imax_pct; + do_div(icount, 100); + icount += sbp->sb_agblocks - 1; + do_div(icount, sbp->sb_agblocks); + max_metadata = icount; + } else { + max_metadata = sbp->sb_agcount; + } + + for (index = 0; index < sbp->sb_agcount; index++) { + ino = XFS_AGINO_TO_INO(mp, index, agino); + + if (ino > XFS_MAXINUMBER_32) { + pag = xfs_perag_get(mp, index); + pag->pagi_inodeok = 0; + pag->pagf_metadata = 0; + xfs_perag_put(pag); + continue; + } + + pag = xfs_perag_get(mp, index); + pag->pagi_inodeok = 1; + maxagi++; + if (index < max_metadata) + pag->pagf_metadata = 1; + xfs_perag_put(pag); + } + mp->m_flags |= (XFS_MOUNT_32BITINODES | + XFS_MOUNT_SMALL_INUMS); + + return maxagi; +} + +xfs_agnumber_t +xfs_set_inode64(struct xfs_mount *mp) +{ + xfs_agnumber_t index = 0; + + for (index = 0; index < mp->m_sb.sb_agcount; index++) { + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, index); + pag->pagi_inodeok = 1; + pag->pagf_metadata = 0; + xfs_perag_put(pag); + } + + /* There is no need for lock protection on m_flags, + * the rw_semaphore of the VFS superblock is locked + * during mount/umount/remount operations, so this is + * enough to avoid concurency on the m_flags field + */ + mp->m_flags &= ~(XFS_MOUNT_32BITINODES | + XFS_MOUNT_SMALL_INUMS); + return index; +} + STATIC int xfs_blkdev_get( xfs_mount_t *mp, @@ -919,6 +1006,7 @@ xfs_fs_put_super( struct xfs_mount *mp = XFS_M(sb); xfs_filestream_unmount(mp); + cancel_delayed_work_sync(&mp->m_sync_work); xfs_unmountfs(mp); xfs_syncd_stop(mp); xfs_freesb(mp); @@ -953,7 +1041,7 @@ xfs_fs_sync_fs( * We schedule xfssyncd now (now that the disk is * active) instead of later (when it might not be). */ - flush_delayed_work_sync(&mp->m_sync_work); + flush_delayed_work(&mp->m_sync_work); } return 0; @@ -1055,6 +1143,12 @@ xfs_fs_remount( case Opt_nobarrier: mp->m_flags &= ~XFS_MOUNT_BARRIER; break; + case Opt_inode64: + mp->m_maxagi = xfs_set_inode64(mp); + break; + case Opt_inode32: + mp->m_maxagi = xfs_set_inode32(mp); + break; default: /* * Logically we would return an error here to prevent @@ -1505,6 +1599,11 @@ xfs_init_zones(void) STATIC void xfs_destroy_zones(void) { + /* + * Make sure all delayed rcu free are flushed before we + * destroy caches. + */ + rcu_barrier(); kmem_zone_destroy(xfs_ili_zone); kmem_zone_destroy(xfs_inode_zone); kmem_zone_destroy(xfs_efi_zone); diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 09b0c26..9de4a92 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -75,6 +75,8 @@ struct block_device; extern __uint64_t xfs_max_file_offset(unsigned int); extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); +extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *); +extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *); extern const struct export_operations xfs_export_operations; extern const struct xattr_handler *xfs_xattr_handlers[]; diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 9654817..9500caf 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -475,7 +475,7 @@ xfs_flush_inodes( struct xfs_mount *mp = ip->i_mount; queue_work(xfs_syncd_wq, &mp->m_flush_work); - flush_work_sync(&mp->m_flush_work); + flush_work(&mp->m_flush_work); } STATIC void diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index e5795dd..7d36ccf 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -37,6 +37,7 @@ struct xlog_recover; struct xlog_recover_item; struct xfs_buf_log_format; struct xfs_inode_log_format; +struct xfs_bmbt_irec; DECLARE_EVENT_CLASS(xfs_attr_list_class, TP_PROTO(struct xfs_attr_list_context *ctx), diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index bcb6054..0c7fa54 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -578,9 +578,11 @@ xfs_quota_warn( /* no warnings for project quotas - we just return ENOSPC later */ if (dqp->dq_flags & XFS_DQ_PROJ) return; - quota_send_warning((dqp->dq_flags & XFS_DQ_USER) ? USRQUOTA : GRPQUOTA, - be32_to_cpu(dqp->q_core.d_id), mp->m_super->s_dev, - type); + quota_send_warning(make_kqid(&init_user_ns, + (dqp->dq_flags & XFS_DQ_USER) ? + USRQUOTA : GRPQUOTA, + be32_to_cpu(dqp->q_core.d_id)), + mp->m_super->s_dev, type); } /* |