diff options
Diffstat (limited to 'fs')
222 files changed, 8326 insertions, 6817 deletions
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 10b7d3c..8c92a9b 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -259,7 +259,7 @@ static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf) if (v9fs_proto_dotl(v9ses)) { res = p9_client_statfs(fid, &rs); if (res == 0) { - buf->f_type = V9FS_MAGIC; + buf->f_type = rs.type; buf->f_bsize = rs.bsize; buf->f_blocks = rs.blocks; buf->f_bfree = rs.bfree; @@ -13,7 +13,7 @@ #include <linux/errno.h> #include <linux/time.h> #include <linux/aio_abi.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/syscalls.h> #include <linux/backing-dev.h> #include <linux/uio.h> @@ -5,7 +5,7 @@ * changes by Thomas Schoebel-Theuer */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/time.h> #include <linux/mm.h> #include <linux/string.h> diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 22e9a78..37268c5 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -9,7 +9,7 @@ */ #include <linux/fs.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/stat.h> #include <linux/time.h> #include <linux/namei.h> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 81878b7..504b6eee 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1093,6 +1093,29 @@ out: */ /* + * The purpose of always_dump_vma() is to make sure that special kernel mappings + * that are useful for post-mortem analysis are included in every core dump. + * In that way we ensure that the core dump is fully interpretable later + * without matching up the same kernel and hardware config to see what PC values + * meant. These special mappings include - vDSO, vsyscall, and other + * architecture specific mappings + */ +static bool always_dump_vma(struct vm_area_struct *vma) +{ + /* Any vsyscall mappings? */ + if (vma == get_gate_vma(vma->vm_mm)) + return true; + /* + * arch_vma_name() returns non-NULL for special architecture mappings, + * such as vDSO sections. + */ + if (arch_vma_name(vma)) + return true; + + return false; +} + +/* * Decide what to dump of a segment, part, all or none. */ static unsigned long vma_dump_size(struct vm_area_struct *vma, @@ -1100,10 +1123,13 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, { #define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type)) - /* The vma can be set up to tell us the answer directly. */ - if (vma->vm_flags & VM_ALWAYSDUMP) + /* always dump the vdso and vsyscall sections */ + if (always_dump_vma(vma)) goto whole; + if (vma->vm_flags & VM_NODUMP) + return 0; + /* Hugetlb memory check */ if (vma->vm_flags & VM_HUGETLB) { if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED)) diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 04f61f0..5979027 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -15,7 +15,7 @@ * JAN/99 -- coded full program relocation (gerg@snapgear.com) */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/mm.h> diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 1ffb603..613aa06 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -19,6 +19,7 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/sched.h> +#include <linux/magic.h> #include <linux/binfmts.h> #include <linux/slab.h> #include <linux/ctype.h> @@ -699,7 +700,7 @@ static int bm_fill_super(struct super_block * sb, void * data, int silent) [3] = {"register", &bm_register_operations, S_IWUSR}, /* last one */ {""} }; - int err = simple_fill_super(sb, 0x42494e4d, bm_files); + int err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files); if (!err) sb->s_op = &s_ops; return err; @@ -22,7 +22,7 @@ #include <linux/slab.h> #include <linux/init.h> #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/mempool.h> #include <linux/workqueue.h> #include <scsi/sg.h> /* for struct sg_iovec */ diff --git a/fs/block_dev.c b/fs/block_dev.c index a9ff300..e08f6a20 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -16,6 +16,7 @@ #include <linux/blkdev.h> #include <linux/module.h> #include <linux/blkpg.h> +#include <linux/magic.h> #include <linux/buffer_head.h> #include <linux/swap.h> #include <linux/pagevec.h> @@ -506,7 +507,7 @@ static const struct super_operations bdev_sops = { static struct dentry *bd_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, 0x62646576); + return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC); } static struct file_system_type bd_type = { diff --git a/fs/buffer.c b/fs/buffer.c index 1a30db7..70e2017 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -29,7 +29,7 @@ #include <linux/file.h> #include <linux/quotaops.h> #include <linux/highmem.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/writeback.h> #include <linux/hash.h> #include <linux/suspend.h> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 2c48937..9fff9f3 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -677,18 +677,19 @@ static int fill_inode(struct inode *inode, case S_IFLNK: inode->i_op = &ceph_symlink_iops; if (!ci->i_symlink) { - int symlen = iinfo->symlink_len; + u32 symlen = iinfo->symlink_len; char *sym; - BUG_ON(symlen != inode->i_size); spin_unlock(&ci->i_ceph_lock); + err = -EINVAL; + if (WARN_ON(symlen != inode->i_size)) + goto out; + err = -ENOMEM; - sym = kmalloc(symlen+1, GFP_NOFS); + sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS); if (!sym) goto out; - memcpy(sym, iinfo->symlink, symlen); - sym[symlen] = 0; spin_lock(&ci->i_ceph_lock); if (!ci->i_symlink) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 866e8d7..89971e1 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -402,7 +402,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, spin_lock_init(&s->s_gen_ttl_lock); s->s_cap_gen = 0; - s->s_cap_ttl = 0; + s->s_cap_ttl = jiffies - 1; spin_lock_init(&s->s_cap_lock); s->s_renew_requested = 0; @@ -1083,8 +1083,7 @@ static void renewed_caps(struct ceph_mds_client *mdsc, int wake = 0; spin_lock(&session->s_cap_lock); - was_stale = is_renew && (session->s_cap_ttl == 0 || - time_after_eq(jiffies, session->s_cap_ttl)); + was_stale = is_renew && time_after_eq(jiffies, session->s_cap_ttl); session->s_cap_ttl = session->s_renew_requested + mdsc->mdsmap->m_session_timeout*HZ; @@ -2332,7 +2331,7 @@ static void handle_session(struct ceph_mds_session *session, session->s_mds); spin_lock(&session->s_gen_ttl_lock); session->s_cap_gen++; - session->s_cap_ttl = 0; + session->s_cap_ttl = jiffies - 1; spin_unlock(&session->s_gen_ttl_lock); send_renew_caps(mdsc, session); break; diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index a559c80..f04c096 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -331,7 +331,7 @@ static int build_snap_context(struct ceph_snap_realm *realm) /* alloc new snap context */ err = -ENOMEM; - if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc)) + if (num > (ULONG_MAX - sizeof(*snapc)) / sizeof(u64)) goto fail; snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS); if (!snapc) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 256f852..1e67dd7 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -130,10 +130,12 @@ enum { Opt_nodirstat, Opt_rbytes, Opt_norbytes, + Opt_asyncreaddir, Opt_noasyncreaddir, Opt_dcache, Opt_nodcache, Opt_ino32, + Opt_noino32, }; static match_table_t fsopt_tokens = { @@ -153,10 +155,12 @@ static match_table_t fsopt_tokens = { {Opt_nodirstat, "nodirstat"}, {Opt_rbytes, "rbytes"}, {Opt_norbytes, "norbytes"}, + {Opt_asyncreaddir, "asyncreaddir"}, {Opt_noasyncreaddir, "noasyncreaddir"}, {Opt_dcache, "dcache"}, {Opt_nodcache, "nodcache"}, {Opt_ino32, "ino32"}, + {Opt_noino32, "noino32"}, {-1, NULL} }; @@ -232,6 +236,9 @@ static int parse_fsopt_token(char *c, void *private) case Opt_norbytes: fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES; break; + case Opt_asyncreaddir: + fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR; + break; case Opt_noasyncreaddir: fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; break; @@ -244,6 +251,9 @@ static int parse_fsopt_token(char *c, void *private) case Opt_ino32: fsopt->flags |= CEPH_MOUNT_OPT_INO32; break; + case Opt_noino32: + fsopt->flags &= ~CEPH_MOUNT_OPT_INO32; + break; default: BUG_ON(token); } @@ -334,10 +344,12 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, *path += 2; dout("server path '%s'\n", *path); - err = ceph_parse_options(popt, options, dev_name, dev_name_end, + *popt = ceph_parse_options(options, dev_name, dev_name_end, parse_fsopt_token, (void *)fsopt); - if (err) + if (IS_ERR(*popt)) { + err = PTR_ERR(*popt); goto out; + } /* success */ *pfsopt = fsopt; @@ -926,6 +938,7 @@ static int __init init_ceph(void) if (ret) goto out; + ceph_xattr_init(); ret = register_filesystem(&ceph_fs_type); if (ret) goto out_icache; @@ -935,6 +948,7 @@ static int __init init_ceph(void) return 0; out_icache: + ceph_xattr_exit(); destroy_caches(); out: return ret; @@ -944,6 +958,7 @@ static void __exit exit_ceph(void) { dout("exit_ceph\n"); unregister_filesystem(&ceph_fs_type); + ceph_xattr_exit(); destroy_caches(); } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 1421f3d..fc35036 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -367,7 +367,7 @@ static inline u32 ceph_ino_to_ino32(__u64 vino) u32 ino = vino & 0xffffffff; ino ^= vino >> 32; if (!ino) - ino = 1; + ino = 2; return ino; } @@ -733,6 +733,8 @@ extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); extern int ceph_removexattr(struct dentry *, const char *); extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci); extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci); +extern void __init ceph_xattr_init(void); +extern void ceph_xattr_exit(void); /* caps.c */ extern const char *ceph_cap_string(int c); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index a76f697..35b8633 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -8,9 +8,12 @@ #include <linux/xattr.h> #include <linux/slab.h> +#define XATTR_CEPH_PREFIX "ceph." +#define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1) + static bool ceph_is_valid_xattr(const char *name) { - return !strncmp(name, "ceph.", 5) || + return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || @@ -21,79 +24,91 @@ static bool ceph_is_valid_xattr(const char *name) * These define virtual xattrs exposing the recursive directory * statistics and layout metadata. */ -struct ceph_vxattr_cb { - bool readonly; +struct ceph_vxattr { char *name; + size_t name_size; /* strlen(name) + 1 (for '\0') */ size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val, size_t size); + bool readonly; }; /* directories */ -static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs); } -static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_files(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_files); } -static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_subdirs(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_subdirs); } -static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rentries(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs); } -static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rfiles(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_rfiles); } -static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_rsubdirs); } -static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_rbytes); } -static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val, size_t size) { - return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec, + return snprintf(val, size, "%ld.09%ld", (long)ci->i_rctime.tv_sec, (long)ci->i_rctime.tv_nsec); } -static struct ceph_vxattr_cb ceph_dir_vxattrs[] = { - { true, "ceph.dir.entries", ceph_vxattrcb_entries}, - { true, "ceph.dir.files", ceph_vxattrcb_files}, - { true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs}, - { true, "ceph.dir.rentries", ceph_vxattrcb_rentries}, - { true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles}, - { true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs}, - { true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes}, - { true, "ceph.dir.rctime", ceph_vxattrcb_rctime}, - { true, NULL, NULL } +#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name + +#define XATTR_NAME_CEPH(_type, _name) \ + { \ + .name = CEPH_XATTR_NAME(_type, _name), \ + .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \ + .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ + .readonly = true, \ + } + +static struct ceph_vxattr ceph_dir_vxattrs[] = { + XATTR_NAME_CEPH(dir, entries), + XATTR_NAME_CEPH(dir, files), + XATTR_NAME_CEPH(dir, subdirs), + XATTR_NAME_CEPH(dir, rentries), + XATTR_NAME_CEPH(dir, rfiles), + XATTR_NAME_CEPH(dir, rsubdirs), + XATTR_NAME_CEPH(dir, rbytes), + XATTR_NAME_CEPH(dir, rctime), + { 0 } /* Required table terminator */ }; +static size_t ceph_dir_vxattrs_name_size; /* total size of all names */ /* files */ -static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_file_layout(struct ceph_inode_info *ci, char *val, size_t size) { int ret; @@ -103,21 +118,32 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, (unsigned long long)ceph_file_layout_su(ci->i_layout), (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); - if (ceph_file_layout_pg_preferred(ci->i_layout)) - ret += snprintf(val + ret, size, "preferred_osd=%lld\n", + + if (ceph_file_layout_pg_preferred(ci->i_layout) >= 0) { + val += ret; + size -= ret; + ret += snprintf(val, size, "preferred_osd=%lld\n", (unsigned long long)ceph_file_layout_pg_preferred( ci->i_layout)); + } + return ret; } -static struct ceph_vxattr_cb ceph_file_vxattrs[] = { - { true, "ceph.file.layout", ceph_vxattrcb_layout}, +static struct ceph_vxattr ceph_file_vxattrs[] = { + XATTR_NAME_CEPH(file, layout), /* The following extended attribute name is deprecated */ - { true, "ceph.layout", ceph_vxattrcb_layout}, - { true, NULL, NULL } + { + .name = XATTR_CEPH_PREFIX "layout", + .name_size = sizeof (XATTR_CEPH_PREFIX "layout"), + .getxattr_cb = ceph_vxattrcb_file_layout, + .readonly = true, + }, + { 0 } /* Required table terminator */ }; +static size_t ceph_file_vxattrs_name_size; /* total size of all names */ -static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode) +static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode) { if (S_ISDIR(inode->i_mode)) return ceph_dir_vxattrs; @@ -126,14 +152,59 @@ static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode) return NULL; } -static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr, +static size_t ceph_vxattrs_name_size(struct ceph_vxattr *vxattrs) +{ + if (vxattrs == ceph_dir_vxattrs) + return ceph_dir_vxattrs_name_size; + if (vxattrs == ceph_file_vxattrs) + return ceph_file_vxattrs_name_size; + BUG(); + + return 0; +} + +/* + * Compute the aggregate size (including terminating '\0') of all + * virtual extended attribute names in the given vxattr table. + */ +static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs) +{ + struct ceph_vxattr *vxattr; + size_t size = 0; + + for (vxattr = vxattrs; vxattr->name; vxattr++) + size += vxattr->name_size; + + return size; +} + +/* Routines called at initialization and exit time */ + +void __init ceph_xattr_init(void) +{ + ceph_dir_vxattrs_name_size = vxattrs_name_size(ceph_dir_vxattrs); + ceph_file_vxattrs_name_size = vxattrs_name_size(ceph_file_vxattrs); +} + +void ceph_xattr_exit(void) +{ + ceph_dir_vxattrs_name_size = 0; + ceph_file_vxattrs_name_size = 0; +} + +static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode, const char *name) { - do { - if (strcmp(vxattr->name, name) == 0) - return vxattr; - vxattr++; - } while (vxattr->name); + struct ceph_vxattr *vxattr = ceph_inode_vxattrs(inode); + + if (vxattr) { + while (vxattr->name) { + if (!strcmp(vxattr->name, name)) + return vxattr; + vxattr++; + } + } + return NULL; } @@ -502,17 +573,15 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, { struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); int err; struct ceph_inode_xattr *xattr; - struct ceph_vxattr_cb *vxattr = NULL; + struct ceph_vxattr *vxattr = NULL; if (!ceph_is_valid_xattr(name)) return -ENODATA; /* let's see if a virtual xattr was requested */ - if (vxattrs) - vxattr = ceph_match_vxattr(vxattrs, name); + vxattr = ceph_match_vxattr(inode, name); spin_lock(&ci->i_ceph_lock); dout("getxattr %p ver=%lld index_ver=%lld\n", inode, @@ -568,7 +637,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) { struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); + struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode); u32 vir_namelen = 0; u32 namelen; int err; @@ -596,11 +665,12 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) goto out; list_xattr: - vir_namelen = 0; - /* include virtual dir xattrs */ - if (vxattrs) - for (i = 0; vxattrs[i].name; i++) - vir_namelen += strlen(vxattrs[i].name) + 1; + /* + * Start with virtual dir xattr names (if any) (including + * terminating '\0' characters for each). + */ + vir_namelen = ceph_vxattrs_name_size(vxattrs); + /* adding 1 byte per each variable due to the null termination */ namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count; err = -ERANGE; @@ -698,17 +768,17 @@ int ceph_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; + struct ceph_vxattr *vxattr; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); + int issued; int err; + int dirty; int name_len = strlen(name); int val_len = size; char *newname = NULL; char *newval = NULL; struct ceph_inode_xattr *xattr = NULL; - int issued; int required_blob_size; - int dirty; if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; @@ -716,12 +786,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name, if (!ceph_is_valid_xattr(name)) return -EOPNOTSUPP; - if (vxattrs) { - struct ceph_vxattr_cb *vxattr = - ceph_match_vxattr(vxattrs, name); - if (vxattr && vxattr->readonly) - return -EOPNOTSUPP; - } + vxattr = ceph_match_vxattr(inode, name); + if (vxattr && vxattr->readonly) + return -EOPNOTSUPP; /* preallocate memory for xattr name, value, index node */ err = -ENOMEM; @@ -730,11 +797,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name, goto out; if (val_len) { - newval = kmalloc(val_len + 1, GFP_NOFS); + newval = kmemdup(value, val_len, GFP_NOFS); if (!newval) goto out; - memcpy(newval, value, val_len); - newval[val_len] = '\0'; } xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS); @@ -744,6 +809,7 @@ int ceph_setxattr(struct dentry *dentry, const char *name, spin_lock(&ci->i_ceph_lock); retry: issued = __ceph_caps_issued(ci, NULL); + dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); if (!(issued & CEPH_CAP_XATTR_EXCL)) goto do_sync; __build_xattrs(inode); @@ -752,7 +818,7 @@ retry: if (!ci->i_xattrs.prealloc_blob || required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { - struct ceph_buffer *blob = NULL; + struct ceph_buffer *blob; spin_unlock(&ci->i_ceph_lock); dout(" preaallocating new blob size=%d\n", required_blob_size); @@ -766,12 +832,13 @@ retry: goto retry; } - dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); err = __set_xattr(ci, newname, name_len, newval, val_len, 1, 1, 1, &xattr); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); ci->i_xattrs.dirty = true; inode->i_ctime = CURRENT_TIME; + spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); @@ -816,8 +883,8 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) int ceph_removexattr(struct dentry *dentry, const char *name) { struct inode *inode = dentry->d_inode; + struct ceph_vxattr *vxattr; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); int issued; int err; int required_blob_size; @@ -829,22 +896,19 @@ int ceph_removexattr(struct dentry *dentry, const char *name) if (!ceph_is_valid_xattr(name)) return -EOPNOTSUPP; - if (vxattrs) { - struct ceph_vxattr_cb *vxattr = - ceph_match_vxattr(vxattrs, name); - if (vxattr && vxattr->readonly) - return -EOPNOTSUPP; - } + vxattr = ceph_match_vxattr(inode, name); + if (vxattr && vxattr->readonly) + return -EOPNOTSUPP; err = -ENOMEM; spin_lock(&ci->i_ceph_lock); - __build_xattrs(inode); retry: issued = __ceph_caps_issued(ci, NULL); dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); if (!(issued & CEPH_CAP_XATTR_EXCL)) goto do_sync; + __build_xattrs(inode); required_blob_size = __get_required_blob_size(ci, 0, 0); @@ -865,10 +929,10 @@ retry: } err = __remove_xattr_by_name(ceph_inode(inode), name); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); ci->i_xattrs.dirty = true; inode->i_ctime = CURRENT_TIME; - spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); diff --git a/fs/cifs/README b/fs/cifs/README index 895da1d..b7d782b 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -753,10 +753,6 @@ module loading or during the runtime by using the interface i.e. echo "value" > /sys/module/cifs/parameters/<param> -1. echo_retries - The number of echo attempts before giving up and - reconnecting to the server. The default is 5. The value 0 - means never reconnect. - -2. enable_oplocks - Enable or disable oplocks. Oplocks are enabled by default. +1. enable_oplocks - Enable or disable oplocks. Oplocks are enabled by default. [Y/y/1]. To disable use any of [N/n/0]. diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 24b3dfc..573b899 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -171,8 +171,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) seq_printf(m, "TCP status: %d\n\tLocal Users To " "Server: %d SecMode: 0x%x Req On Wire: %d", server->tcpStatus, server->srv_count, - server->sec_mode, - atomic_read(&server->inFlight)); + server->sec_mode, in_flight(server)); #ifdef CONFIG_CIFS_STATS2 seq_printf(m, " In Send: %d In MaxReq Wait: %d", diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 418fc42..eee522c 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -76,12 +76,7 @@ MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 " unsigned int cifs_max_pending = CIFS_MAX_REQ; module_param(cifs_max_pending, int, 0444); MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. " - "Default: 50 Range: 2 to 256"); -unsigned short echo_retries = 5; -module_param(echo_retries, ushort, 0644); -MODULE_PARM_DESC(echo_retries, "Number of echo attempts before giving up and " - "reconnecting server. Default: 5. 0 means " - "never reconnect."); + "Default: 32767 Range: 2 to 32767."); module_param(enable_oplocks, bool, 0644); MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks (bool). Default:" "y/Y/1"); @@ -1111,9 +1106,9 @@ init_cifs(void) if (cifs_max_pending < 2) { cifs_max_pending = 2; cFYI(1, "cifs_max_pending set to min of 2"); - } else if (cifs_max_pending > 256) { - cifs_max_pending = 256; - cFYI(1, "cifs_max_pending set to max of 256"); + } else if (cifs_max_pending > CIFS_MAX_REQ) { + cifs_max_pending = CIFS_MAX_REQ; + cFYI(1, "cifs_max_pending set to max of %u", CIFS_MAX_REQ); } rc = cifs_fscache_register(); @@ -1175,11 +1170,8 @@ static void __exit exit_cifs(void) { cFYI(DBG2, "exit_cifs"); - cifs_proc_clean(); - cifs_fscache_unregister(); -#ifdef CONFIG_CIFS_DFS_UPCALL + unregister_filesystem(&cifs_fs_type); cifs_dfs_release_automount_timer(); -#endif #ifdef CONFIG_CIFS_ACL cifs_destroy_idmaptrees(); exit_cifs_idmap(); @@ -1187,10 +1179,11 @@ exit_cifs(void) #ifdef CONFIG_CIFS_UPCALL unregister_key_type(&cifs_spnego_key_type); #endif - unregister_filesystem(&cifs_fs_type); - cifs_destroy_inodecache(); - cifs_destroy_mids(); cifs_destroy_request_bufs(); + cifs_destroy_mids(); + cifs_destroy_inodecache(); + cifs_fscache_unregister(); + cifs_proc_clean(); } MODULE_AUTHOR("Steve French <sfrench@us.ibm.com>"); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 76e7d8b..339ebe3 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -55,14 +55,9 @@ /* * MAX_REQ is the maximum number of requests that WE will send - * on one socket concurrently. It also matches the most common - * value of max multiplex returned by servers. We may - * eventually want to use the negotiated value (in case - * future servers can handle more) when we are more confident that - * we will not have problems oveloading the socket with pending - * write data. + * on one socket concurrently. */ -#define CIFS_MAX_REQ 50 +#define CIFS_MAX_REQ 32767 #define RFC1001_NAME_LEN 15 #define RFC1001_NAME_LEN_WITH_NULL (RFC1001_NAME_LEN + 1) @@ -255,7 +250,9 @@ struct TCP_Server_Info { bool noblocksnd; /* use blocking sendmsg */ bool noautotune; /* do not autotune send buf sizes */ bool tcp_nodelay; - atomic_t inFlight; /* number of requests on the wire to server */ + int credits; /* send no more requests at once */ + unsigned int in_flight; /* number of requests on the wire to server */ + spinlock_t req_lock; /* protect the two values above */ struct mutex srv_mutex; struct task_struct *tsk; char server_GUID[16]; @@ -263,6 +260,7 @@ struct TCP_Server_Info { bool session_estab; /* mark when very first sess is established */ u16 dialect; /* dialect index that server chose */ enum securityEnum secType; + bool oplocks:1; /* enable oplocks */ unsigned int maxReq; /* Clients should submit no more */ /* than maxReq distinct unanswered SMBs to the server when using */ /* multiplexed reads or writes */ @@ -307,6 +305,36 @@ struct TCP_Server_Info { #endif }; +static inline unsigned int +in_flight(struct TCP_Server_Info *server) +{ + unsigned int num; + spin_lock(&server->req_lock); + num = server->in_flight; + spin_unlock(&server->req_lock); + return num; +} + +static inline int* +get_credits_field(struct TCP_Server_Info *server) +{ + /* + * This will change to switch statement when we reserve slots for echos + * and oplock breaks. + */ + return &server->credits; +} + +static inline bool +has_credits(struct TCP_Server_Info *server, int *credits) +{ + int num; + spin_lock(&server->req_lock); + num = *credits; + spin_unlock(&server->req_lock); + return num > 0; +} + /* * Macros to allow the TCP_Server_Info->net field and related code to drop out * when CONFIG_NET_NS isn't set. @@ -1010,9 +1038,6 @@ GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */ GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */ GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/ -/* reconnect after this many failed echo attempts */ -GLOBAL_EXTERN unsigned short echo_retries; - #ifdef CONFIG_CIFS_ACL GLOBAL_EXTERN struct rb_root uidtree; GLOBAL_EXTERN struct rb_root gidtree; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 6f4e243..503e73d 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -88,6 +88,9 @@ extern int SendReceiveBlockingLock(const unsigned int xid, struct smb_hdr *in_buf , struct smb_hdr *out_buf, int *bytes_returned); +extern void cifs_add_credits(struct TCP_Server_Info *server, + const unsigned int add); +extern void cifs_set_credits(struct TCP_Server_Info *server, const int val); extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length); extern bool is_valid_oplock_break(struct smb_hdr *smb, struct TCP_Server_Info *); @@ -168,7 +171,13 @@ extern struct smb_vol *cifs_get_volume_info(char *mount_data, const char *devname); extern int cifs_mount(struct cifs_sb_info *, struct smb_vol *); extern void cifs_umount(struct cifs_sb_info *); + +#if IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) extern void cifs_dfs_release_automount_timer(void); +#else /* ! IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) */ +#define cifs_dfs_release_automount_timer() do { } while (0) +#endif /* ! IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) */ + void cifs_proc_init(void); void cifs_proc_clean(void); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 8b7794c..70aac35c 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -458,7 +458,10 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses) goto neg_err_exit; } server->sec_mode = (__u8)le16_to_cpu(rsp->SecurityMode); - server->maxReq = le16_to_cpu(rsp->MaxMpxCount); + server->maxReq = min_t(unsigned int, + le16_to_cpu(rsp->MaxMpxCount), + cifs_max_pending); + cifs_set_credits(server, server->maxReq); server->maxBuf = le16_to_cpu(rsp->MaxBufSize); server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs); /* even though we do not use raw we might as well set this @@ -564,7 +567,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses) /* one byte, so no need to convert this or EncryptionKeyLen from little endian */ - server->maxReq = le16_to_cpu(pSMBr->MaxMpxCount); + server->maxReq = min_t(unsigned int, le16_to_cpu(pSMBr->MaxMpxCount), + cifs_max_pending); + cifs_set_credits(server, server->maxReq); /* probably no need to store and check maxvcs */ server->maxBuf = le32_to_cpu(pSMBr->MaxBufferSize); server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); @@ -716,8 +721,7 @@ cifs_echo_callback(struct mid_q_entry *mid) struct TCP_Server_Info *server = mid->callback_data; DeleteMidQEntry(mid); - atomic_dec(&server->inFlight); - wake_up(&server->request_q); + cifs_add_credits(server, 1); } int @@ -1669,8 +1673,7 @@ cifs_readv_callback(struct mid_q_entry *mid) queue_work(system_nrt_wq, &rdata->work); DeleteMidQEntry(mid); - atomic_dec(&server->inFlight); - wake_up(&server->request_q); + cifs_add_credits(server, 1); } /* cifs_async_readv - send an async write, and set up mid to handle result */ @@ -2110,8 +2113,7 @@ cifs_writev_callback(struct mid_q_entry *mid) queue_work(system_nrt_wq, &wdata->work); DeleteMidQEntry(mid); - atomic_dec(&tcon->ses->server->inFlight); - wake_up(&tcon->ses->server->request_q); + cifs_add_credits(tcon->ses->server, 1); } /* cifs_async_writev - send an async write, and set up mid to handle result */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 602f77c..5560e1d 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -373,12 +373,22 @@ allocate_buffers(struct TCP_Server_Info *server) static bool server_unresponsive(struct TCP_Server_Info *server) { - if (echo_retries > 0 && server->tcpStatus == CifsGood && - time_after(jiffies, server->lstrp + - (echo_retries * SMB_ECHO_INTERVAL))) { + /* + * We need to wait 2 echo intervals to make sure we handle such + * situations right: + * 1s client sends a normal SMB request + * 2s client gets a response + * 30s echo workqueue job pops, and decides we got a response recently + * and don't need to send another + * ... + * 65s kernel_recvmsg times out, and we see that we haven't gotten + * a response in >60s. + */ + if (server->tcpStatus == CifsGood && + time_after(jiffies, server->lstrp + 2 * SMB_ECHO_INTERVAL)) { cERROR(1, "Server %s has not responded in %d seconds. " "Reconnecting...", server->hostname, - (echo_retries * SMB_ECHO_INTERVAL / HZ)); + (2 * SMB_ECHO_INTERVAL) / HZ); cifs_reconnect(server); wake_up(&server->response_q); return true; @@ -642,19 +652,11 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) spin_unlock(&GlobalMid_Lock); wake_up_all(&server->response_q); - /* - * Check if we have blocked requests that need to free. Note that - * cifs_max_pending is normally 50, but can be set at module install - * time to as little as two. - */ - spin_lock(&GlobalMid_Lock); - if (atomic_read(&server->inFlight) >= cifs_max_pending) - atomic_set(&server->inFlight, cifs_max_pending - 1); - /* - * We do not want to set the max_pending too low or we could end up - * with the counter going negative. - */ - spin_unlock(&GlobalMid_Lock); + /* check if we have blocked requests that need to free */ + spin_lock(&server->req_lock); + if (server->credits <= 0) + server->credits = 1; + spin_unlock(&server->req_lock); /* * Although there should not be any requests blocked on this queue it * can not hurt to be paranoid and try to wake up requests that may @@ -1909,7 +1911,8 @@ cifs_get_tcp_session(struct smb_vol *volume_info) tcp_ses->noblocksnd = volume_info->noblocksnd; tcp_ses->noautotune = volume_info->noautotune; tcp_ses->tcp_nodelay = volume_info->sockopt_tcp_nodelay; - atomic_set(&tcp_ses->inFlight, 0); + tcp_ses->in_flight = 0; + tcp_ses->credits = 1; init_waitqueue_head(&tcp_ses->response_q); init_waitqueue_head(&tcp_ses->request_q); INIT_LIST_HEAD(&tcp_ses->pending_mid_q); @@ -3371,7 +3374,7 @@ cifs_ra_pages(struct cifs_sb_info *cifs_sb) int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) { - int rc = 0; + int rc; int xid; struct cifs_ses *pSesInfo; struct cifs_tcon *tcon; @@ -3398,6 +3401,7 @@ try_mount_again: FreeXid(xid); } #endif + rc = 0; tcon = NULL; pSesInfo = NULL; srvTcp = NULL; @@ -3759,9 +3763,11 @@ int cifs_negotiate_protocol(unsigned int xid, struct cifs_ses *ses) if (server->maxBuf != 0) return 0; + cifs_set_credits(server, 1); rc = CIFSSMBNegotiate(xid, ses); if (rc == -EAGAIN) { /* retry only once on 1st time connection */ + cifs_set_credits(server, 1); rc = CIFSSMBNegotiate(xid, ses); if (rc == -EAGAIN) rc = -EHOSTDOWN; diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index bc7e244..d172c8e 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -171,7 +171,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode, } tcon = tlink_tcon(tlink); - if (enable_oplocks) + if (tcon->ses->server->oplocks) oplock = REQ_OPLOCK; if (nd) @@ -492,7 +492,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, { int xid; int rc = 0; /* to get around spurious gcc warning, set to zero here */ - __u32 oplock = enable_oplocks ? REQ_OPLOCK : 0; + __u32 oplock; __u16 fileHandle = 0; bool posix_open = false; struct cifs_sb_info *cifs_sb; @@ -518,6 +518,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, } pTcon = tlink_tcon(tlink); + oplock = pTcon->ses->server->oplocks ? REQ_OPLOCK : 0; + /* * Don't allow the separator character in a path component. * The VFS will not allow "/", but "\" is allowed by posix. diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 5e64748..159fcc5 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -380,7 +380,7 @@ int cifs_open(struct inode *inode, struct file *file) cFYI(1, "inode = 0x%p file flags are 0x%x for %s", inode, file->f_flags, full_path); - if (enable_oplocks) + if (tcon->ses->server->oplocks) oplock = REQ_OPLOCK; else oplock = 0; @@ -505,7 +505,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush) cFYI(1, "inode = 0x%p file flags 0x%x for %s", inode, pCifsFile->f_flags, full_path); - if (enable_oplocks) + if (tcon->ses->server->oplocks) oplock = REQ_OPLOCK; else oplock = 0; @@ -960,9 +960,9 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) INIT_LIST_HEAD(&locks_to_send); /* - * Allocating count locks is enough because no locks can be added to - * the list while we are holding cinode->lock_mutex that protects - * locking operations of this inode. + * Allocating count locks is enough because no FL_POSIX locks can be + * added to the list while we are holding cinode->lock_mutex that + * protects locking operations of this inode. */ for (; i < count; i++) { lck = kmalloc(sizeof(struct lock_to_push), GFP_KERNEL); @@ -973,18 +973,20 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) list_add_tail(&lck->llist, &locks_to_send); } - i = 0; el = locks_to_send.next; lock_flocks(); cifs_for_each_lock(cfile->dentry->d_inode, before) { + flock = *before; + if ((flock->fl_flags & FL_POSIX) == 0) + continue; if (el == &locks_to_send) { - /* something is really wrong */ + /* + * The list ended. We don't have enough allocated + * structures - something is really wrong. + */ cERROR(1, "Can't push all brlocks!"); break; } - flock = *before; - if ((flock->fl_flags & FL_POSIX) == 0) - continue; length = 1 + flock->fl_end - flock->fl_start; if (flock->fl_type == F_RDLCK || flock->fl_type == F_SHLCK) type = CIFS_RDLCK; @@ -996,7 +998,6 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) lck->length = length; lck->type = type; lck->offset = flock->fl_start; - i++; el = el->next; } unlock_flocks(); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 703ef5c..c273c12 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -690,3 +690,22 @@ backup_cred(struct cifs_sb_info *cifs_sb) return false; } + +void +cifs_add_credits(struct TCP_Server_Info *server, const unsigned int add) +{ + spin_lock(&server->req_lock); + server->credits += add; + server->in_flight--; + spin_unlock(&server->req_lock); + wake_up(&server->request_q); +} + +void +cifs_set_credits(struct TCP_Server_Info *server, const int val) +{ + spin_lock(&server->req_lock); + server->credits = val; + server->oplocks = val > 1 ? enable_oplocks : false; + spin_unlock(&server->req_lock); +} diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 0cc9584..310918b 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -254,44 +254,60 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer, return smb_sendv(server, &iov, 1); } -static int wait_for_free_request(struct TCP_Server_Info *server, - const int long_op) +static int +wait_for_free_credits(struct TCP_Server_Info *server, const int optype, + int *credits) { - if (long_op == CIFS_ASYNC_OP) { + int rc; + + spin_lock(&server->req_lock); + if (optype == CIFS_ASYNC_OP) { /* oplock breaks must not be held up */ - atomic_inc(&server->inFlight); + server->in_flight++; + *credits -= 1; + spin_unlock(&server->req_lock); return 0; } - spin_lock(&GlobalMid_Lock); while (1) { - if (atomic_read(&server->inFlight) >= cifs_max_pending) { - spin_unlock(&GlobalMid_Lock); + if (*credits <= 0) { + spin_unlock(&server->req_lock); cifs_num_waiters_inc(server); - wait_event(server->request_q, - atomic_read(&server->inFlight) - < cifs_max_pending); + rc = wait_event_killable(server->request_q, + has_credits(server, credits)); cifs_num_waiters_dec(server); - spin_lock(&GlobalMid_Lock); + if (rc) + return rc; + spin_lock(&server->req_lock); } else { if (server->tcpStatus == CifsExiting) { - spin_unlock(&GlobalMid_Lock); + spin_unlock(&server->req_lock); return -ENOENT; } - /* can not count locking commands against total - as they are allowed to block on server */ + /* + * Can not count locking commands against total + * as they are allowed to block on server. + */ /* update # of requests on the wire to server */ - if (long_op != CIFS_BLOCKING_OP) - atomic_inc(&server->inFlight); - spin_unlock(&GlobalMid_Lock); + if (optype != CIFS_BLOCKING_OP) { + *credits -= 1; + server->in_flight++; + } + spin_unlock(&server->req_lock); break; } } return 0; } +static int +wait_for_free_request(struct TCP_Server_Info *server, const int optype) +{ + return wait_for_free_credits(server, optype, get_credits_field(server)); +} + static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf, struct mid_q_entry **ppmidQ) { @@ -359,7 +375,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, mid = AllocMidQEntry(hdr, server); if (mid == NULL) { mutex_unlock(&server->srv_mutex); - atomic_dec(&server->inFlight); + cifs_add_credits(server, 1); wake_up(&server->request_q); return -ENOMEM; } @@ -392,7 +408,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, return rc; out_err: delete_mid(mid); - atomic_dec(&server->inFlight); + cifs_add_credits(server, 1); wake_up(&server->request_q); return rc; } @@ -564,8 +580,7 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, mutex_unlock(&ses->server->srv_mutex); cifs_small_buf_release(in_buf); /* Update # of requests on wire to server */ - atomic_dec(&ses->server->inFlight); - wake_up(&ses->server->request_q); + cifs_add_credits(ses->server, 1); return rc; } rc = cifs_sign_smb2(iov, n_vec, ses->server, &midQ->sequence_number); @@ -601,8 +616,7 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, midQ->callback = DeleteMidQEntry; spin_unlock(&GlobalMid_Lock); cifs_small_buf_release(in_buf); - atomic_dec(&ses->server->inFlight); - wake_up(&ses->server->request_q); + cifs_add_credits(ses->server, 1); return rc; } spin_unlock(&GlobalMid_Lock); @@ -612,8 +626,7 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, rc = cifs_sync_mid_result(midQ, ses->server); if (rc != 0) { - atomic_dec(&ses->server->inFlight); - wake_up(&ses->server->request_q); + cifs_add_credits(ses->server, 1); return rc; } @@ -637,8 +650,7 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, midQ->resp_buf = NULL; out: delete_mid(midQ); - atomic_dec(&ses->server->inFlight); - wake_up(&ses->server->request_q); + cifs_add_credits(ses->server, 1); return rc; } @@ -688,8 +700,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, if (rc) { mutex_unlock(&ses->server->srv_mutex); /* Update # of requests on wire to server */ - atomic_dec(&ses->server->inFlight); - wake_up(&ses->server->request_q); + cifs_add_credits(ses->server, 1); return rc; } @@ -721,8 +732,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, /* no longer considered to be "in-flight" */ midQ->callback = DeleteMidQEntry; spin_unlock(&GlobalMid_Lock); - atomic_dec(&ses->server->inFlight); - wake_up(&ses->server->request_q); + cifs_add_credits(ses->server, 1); return rc; } spin_unlock(&GlobalMid_Lock); @@ -730,8 +740,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, rc = cifs_sync_mid_result(midQ, ses->server); if (rc != 0) { - atomic_dec(&ses->server->inFlight); - wake_up(&ses->server->request_q); + cifs_add_credits(ses->server, 1); return rc; } @@ -747,8 +756,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, rc = cifs_check_receive(midQ, ses->server, 0); out: delete_mid(midQ); - atomic_dec(&ses->server->inFlight); - wake_up(&ses->server->request_q); + cifs_add_credits(ses->server, 1); return rc; } diff --git a/fs/compat.c b/fs/compat.c index 07880ba..14483a7 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -33,7 +33,6 @@ #include <linux/nfs4_mount.h> #include <linux/syscalls.h> #include <linux/ctype.h> -#include <linux/module.h> #include <linux/dirent.h> #include <linux/fsnotify.h> #include <linux/highuid.h> diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 10d8cd9..debdfe0 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -49,7 +49,6 @@ #include <linux/elevator.h> #include <linux/rtc.h> #include <linux/pci.h> -#include <linux/module.h> #include <linux/serial.h> #include <linux/if_tun.h> #include <linux/ctype.h> diff --git a/fs/dcache.c b/fs/dcache.c index 2b55bd0..b60ddc4 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -23,7 +23,7 @@ #include <linux/init.h> #include <linux/hash.h> #include <linux/cache.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/mount.h> #include <linux/file.h> #include <asm/uaccess.h> @@ -2404,6 +2404,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) if (d_ancestor(alias, dentry)) { /* Check for loops */ actual = ERR_PTR(-ELOOP); + spin_unlock(&inode->i_lock); } else if (IS_ROOT(alias)) { /* Is this an anonymous mountpoint that we * could splice into our tree? */ @@ -2413,7 +2414,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) goto found; } else { /* Nope, but we must(!) avoid directory - * aliasing */ + * aliasing. This drops inode->i_lock */ actual = __d_unalias(inode, dentry, alias); } write_sequnlock(&rename_lock); diff --git a/fs/dcookies.c b/fs/dcookies.c index dda0dc7..17c7799 100644 --- a/fs/dcookies.c +++ b/fs/dcookies.c @@ -13,7 +13,7 @@ */ #include <linux/syscalls.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/mount.h> diff --git a/fs/eventfd.c b/fs/eventfd.c index d9a5917..dba15fe 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -16,7 +16,7 @@ #include <linux/spinlock.h> #include <linux/anon_inodes.h> #include <linux/syscalls.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/kref.h> #include <linux/eventfd.h> diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 4d9d3a4..629e9ed 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -427,6 +427,31 @@ out_unlock: return error; } +/* + * As described in commit 0ccf831cb lockdep: annotate epoll + * the use of wait queues used by epoll is done in a very controlled + * manner. Wake ups can nest inside each other, but are never done + * with the same locking. For example: + * + * dfd = socket(...); + * efd1 = epoll_create(); + * efd2 = epoll_create(); + * epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...); + * epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...); + * + * When a packet arrives to the device underneath "dfd", the net code will + * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a + * callback wakeup entry on that queue, and the wake_up() performed by the + * "dfd" net code will end up in ep_poll_callback(). At this point epoll + * (efd1) notices that it may have some event ready, so it needs to wake up + * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake() + * that ends up in another wake_up(), after having checked about the + * recursion constraints. That are, no more than EP_MAX_POLLWAKE_NESTS, to + * avoid stack blasting. + * + * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle + * this special case of epoll. + */ #ifdef CONFIG_DEBUG_LOCK_ALLOC static inline void ep_wake_up_nested(wait_queue_head_t *wqueue, unsigned long events, int subclass) @@ -699,9 +724,12 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, void *priv) { struct epitem *epi, *tmp; + poll_table pt; + init_poll_funcptr(&pt, NULL); list_for_each_entry_safe(epi, tmp, head, rdllink) { - if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) & + pt._key = epi->event.events; + if (epi->ffd.file->f_op->poll(epi->ffd.file, &pt) & epi->event.events) return POLLIN | POLLRDNORM; else { @@ -1049,13 +1077,11 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests) */ static int reverse_path_check(void) { - int length = 0; int error = 0; struct file *current_file; /* let's call this for all tfiles */ list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) { - length++; path_count_init(); error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, reverse_path_check_proc, current_file, @@ -1097,6 +1123,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, /* Initialize the poll table using the queue callback */ epq.epi = epi; init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); + epq.pt._key = event->events; /* * Attach the item to the poll hooks and get current event bits. @@ -1191,6 +1218,9 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even { int pwake = 0; unsigned int revents; + poll_table pt; + + init_poll_funcptr(&pt, NULL); /* * Set the new event interest mask before calling f_op->poll(); @@ -1198,13 +1228,14 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even * f_op->poll() call and the new event set registering. */ epi->event.events = event->events; + pt._key = event->events; epi->event.data = event->data; /* protected by mtx */ /* * Get current event bits. We can safely use the file* here because * its usage count has been increased by the caller of this function. */ - revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL); + revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt); /* * If the item is "hot" and it is not registered inside the ready @@ -1239,6 +1270,9 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, unsigned int revents; struct epitem *epi; struct epoll_event __user *uevent; + poll_table pt; + + init_poll_funcptr(&pt, NULL); /* * We can loop without lock because we are passed a task private list. @@ -1251,7 +1285,8 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, list_del_init(&epi->rdllink); - revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) & + pt._key = epi->event.events; + revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt) & epi->event.events; /* diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index a203892..1e036b7 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -1743,8 +1743,11 @@ allocated: *errp = 0; brelse(bitmap_bh); - dquot_free_block(inode, *count-num); - *count = num; + + if (num < *count) { + dquot_free_block(inode, *count-num); + *count = num; + } trace_ext3_allocate_blocks(inode, goal, num, (unsigned long long)ret_block); @@ -1970,7 +1973,7 @@ static ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, sbi = EXT3_SB(sb); /* Walk through the whole group */ - while (start < max) { + while (start <= max) { start = bitmap_search_next_usable_block(start, bitmap_bh, max); if (start < 0) break; @@ -1980,7 +1983,7 @@ static ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, * Allocate contiguous free extents by setting bits in the * block bitmap */ - while (next < max + while (next <= max && claim_block(sb_bgl_lock(sbi, group), next, bitmap_bh)) { next++; @@ -2091,73 +2094,74 @@ err_out: */ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range) { - ext3_grpblk_t last_block, first_block, free_blocks; - unsigned long first_group, last_group; - unsigned long group, ngroups; + ext3_grpblk_t last_block, first_block; + unsigned long group, first_group, last_group; struct ext3_group_desc *gdp; struct ext3_super_block *es = EXT3_SB(sb)->s_es; - uint64_t start, len, minlen, trimmed; + uint64_t start, minlen, end, trimmed = 0; + ext3_fsblk_t first_data_blk = + le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block); ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count); int ret = 0; - start = (range->start >> sb->s_blocksize_bits) + - le32_to_cpu(es->s_first_data_block); - len = range->len >> sb->s_blocksize_bits; + start = range->start >> sb->s_blocksize_bits; + end = start + (range->len >> sb->s_blocksize_bits) - 1; minlen = range->minlen >> sb->s_blocksize_bits; - trimmed = 0; - if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb))) + if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb)) || + unlikely(start >= max_blks)) return -EINVAL; - if (start >= max_blks) - return -EINVAL; - if (start + len > max_blks) - len = max_blks - start; + if (end >= max_blks) + end = max_blks - 1; + if (end <= first_data_blk) + goto out; + if (start < first_data_blk) + start = first_data_blk; - ngroups = EXT3_SB(sb)->s_groups_count; smp_rmb(); /* Determine first and last group to examine based on start and len */ ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start, &first_group, &first_block); - ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) (start + len), + ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) end, &last_group, &last_block); - last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group; - last_block = EXT3_BLOCKS_PER_GROUP(sb); - if (first_group > last_group) - return -EINVAL; + /* end now represents the last block to discard in this group */ + end = EXT3_BLOCKS_PER_GROUP(sb) - 1; for (group = first_group; group <= last_group; group++) { gdp = ext3_get_group_desc(sb, group, NULL); if (!gdp) break; - free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); - if (free_blocks < minlen) - continue; - /* * For all the groups except the last one, last block will - * always be EXT3_BLOCKS_PER_GROUP(sb), so we only need to - * change it for the last group in which case first_block + - * len < EXT3_BLOCKS_PER_GROUP(sb). + * always be EXT3_BLOCKS_PER_GROUP(sb)-1, so we only need to + * change it for the last group, note that last_block is + * already computed earlier by ext3_get_group_no_and_offset() */ - if (first_block + len < EXT3_BLOCKS_PER_GROUP(sb)) - last_block = first_block + len; - len -= last_block - first_block; + if (group == last_group) + end = last_block; - ret = ext3_trim_all_free(sb, group, first_block, - last_block, minlen); - if (ret < 0) - break; + if (le16_to_cpu(gdp->bg_free_blocks_count) >= minlen) { + ret = ext3_trim_all_free(sb, group, first_block, + end, minlen); + if (ret < 0) + break; + trimmed += ret; + } - trimmed += ret; + /* + * For every group except the first one, we are sure + * that the first block to discard will be block #0. + */ first_block = 0; } - if (ret >= 0) + if (ret > 0) ret = 0; - range->len = trimmed * sb->s_blocksize; +out: + range->len = trimmed * sb->s_blocksize; return ret; } diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 2d0afec..6d34186 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -756,6 +756,7 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, struct ext3_block_alloc_info *block_i; ext3_fsblk_t current_block; struct ext3_inode_info *ei = EXT3_I(inode); + struct timespec now; block_i = ei->i_block_alloc_info; /* @@ -795,9 +796,11 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, } /* We are done with atomic stuff, now do the rest of housekeeping */ - - inode->i_ctime = CURRENT_TIME_SEC; - ext3_mark_inode_dirty(handle, inode); + now = CURRENT_TIME_SEC; + if (!timespec_equal(&inode->i_ctime, &now) || !where->bh) { + inode->i_ctime = now; + ext3_mark_inode_dirty(handle, inode); + } /* ext3_mark_inode_dirty already updated i_sync_tid */ atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index f9e2cd8..4bbd07a 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -336,10 +336,10 @@ err_out: * Return buffer_head on success or NULL in case of failure. */ struct buffer_head * -ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) +ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) { struct ext4_group_desc *desc; - struct buffer_head *bh = NULL; + struct buffer_head *bh; ext4_fsblk_t bitmap_blk; desc = ext4_get_group_desc(sb, block_group, NULL); @@ -348,9 +348,9 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) bitmap_blk = ext4_block_bitmap(sb, desc); bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { - ext4_error(sb, "Cannot read block bitmap - " - "block_group = %u, block_bitmap = %llu", - block_group, bitmap_blk); + ext4_error(sb, "Cannot get buffer for block bitmap - " + "block_group = %u, block_bitmap = %llu", + block_group, bitmap_blk); return NULL; } @@ -382,25 +382,50 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) return bh; } /* - * submit the buffer_head for read. We can - * safely mark the bitmap as uptodate now. - * We do it here so the bitmap uptodate bit - * get set with buffer lock held. + * submit the buffer_head for reading */ + set_buffer_new(bh); trace_ext4_read_block_bitmap_load(sb, block_group); - set_bitmap_uptodate(bh); - if (bh_submit_read(bh) < 0) { - put_bh(bh); + bh->b_end_io = ext4_end_bitmap_read; + get_bh(bh); + submit_bh(READ, bh); + return bh; +} + +/* Returns 0 on success, 1 on error */ +int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, + struct buffer_head *bh) +{ + struct ext4_group_desc *desc; + + if (!buffer_new(bh)) + return 0; + desc = ext4_get_group_desc(sb, block_group, NULL); + if (!desc) + return 1; + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { ext4_error(sb, "Cannot read block bitmap - " - "block_group = %u, block_bitmap = %llu", - block_group, bitmap_blk); - return NULL; + "block_group = %u, block_bitmap = %llu", + block_group, (unsigned long long) bh->b_blocknr); + return 1; } + clear_buffer_new(bh); + /* Panic or remount fs read-only if block bitmap is invalid */ ext4_valid_block_bitmap(sb, desc, block_group, bh); - /* - * file system mounted not to panic on error, - * continue with corrupt bitmap - */ + return 0; +} + +struct buffer_head * +ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) +{ + struct buffer_head *bh; + + bh = ext4_read_block_bitmap_nowait(sb, block_group); + if (ext4_wait_block_bitmap(sb, block_group, bh)) { + put_bh(bh); + return NULL; + } return bh; } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 164c560..ad56866 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -91,17 +91,17 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, return 0; if (filp) - ext4_error_file(filp, function, line, bh ? bh->b_blocknr : 0, + ext4_error_file(filp, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u(%u), " "inode=%u, rec_len=%d, name_len=%d", - error_msg, (unsigned) (offset%bh->b_size), + error_msg, (unsigned) (offset % bh->b_size), offset, le32_to_cpu(de->inode), rlen, de->name_len); else - ext4_error_inode(dir, function, line, bh ? bh->b_blocknr : 0, + ext4_error_inode(dir, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u(%u), " "inode=%u, rec_len=%d, name_len=%d", - error_msg, (unsigned) (offset%bh->b_size), + error_msg, (unsigned) (offset % bh->b_size), offset, le32_to_cpu(de->inode), rlen, de->name_len); @@ -425,8 +425,9 @@ static int call_filldir(struct file *filp, void *dirent, sb = inode->i_sb; if (!fname) { - printk(KERN_ERR "EXT4-fs: call_filldir: called with " - "null fname?!?\n"); + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: " + "called with null fname?!?", __func__, __LINE__, + inode->i_ino, current->comm); return 0; } curr_pos = hash2pos(fname->hash, fname->minor_hash); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 513004f..ded731a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -53,7 +53,7 @@ printk(KERN_DEBUG f, ## a); \ } while (0) #else -#define ext4_debug(f, a...) do {} while (0) +#define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif #define EXT4_ERROR_INODE(inode, fmt, a...) \ @@ -184,6 +184,8 @@ struct mpage_da_data { #define EXT4_IO_END_UNWRITTEN 0x0001 #define EXT4_IO_END_ERROR 0x0002 #define EXT4_IO_END_QUEUED 0x0004 +#define EXT4_IO_END_DIRECT 0x0008 +#define EXT4_IO_END_IN_FSYNC 0x0010 struct ext4_io_page { struct page *p_page; @@ -192,18 +194,25 @@ struct ext4_io_page { #define MAX_IO_PAGES 128 +/* + * For converting uninitialized extents on a work queue. + * + * 'page' is only used from the writepage() path; 'pages' is only used for + * buffered writes; they are used to keep page references until conversion + * takes place. For AIO/DIO, neither field is filled in. + */ typedef struct ext4_io_end { struct list_head list; /* per-file finished IO list */ struct inode *inode; /* file being written to */ unsigned int flag; /* unwritten or not */ - struct page *page; /* page struct for buffer write */ + struct page *page; /* for writepage() path */ loff_t offset; /* offset in the file */ ssize_t size; /* size of the extent */ struct work_struct work; /* data work queue */ struct kiocb *iocb; /* iocb struct for AIO */ int result; /* error value for AIO */ - int num_io_pages; - struct ext4_io_page *pages[MAX_IO_PAGES]; + int num_io_pages; /* for writepages() */ + struct ext4_io_page *pages[MAX_IO_PAGES]; /* for writepages() */ } ext4_io_end_t; struct ext4_io_submit { @@ -923,6 +932,7 @@ struct ext4_inode_info { #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ #define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ #define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ +#define EXT4_MOUNT_ERRORS_MASK 0x00070 #define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ #define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ #define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ @@ -941,7 +951,6 @@ struct ext4_inode_info { #define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ -#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ #define EXT4_MOUNT_MBLK_IO_SUBMIT 0x4000000 /* multi-block io submits */ #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ @@ -1142,6 +1151,7 @@ struct ext4_sb_info { unsigned int s_mount_opt; unsigned int s_mount_opt2; unsigned int s_mount_flags; + unsigned int s_def_mount_opt; ext4_fsblk_t s_sb_block; uid_t s_resuid; gid_t s_resgid; @@ -1420,8 +1430,9 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 #define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ -#define EXT4_FEATURE_INCOMPAT_INLINEDATA 0x2000 /* data in inode */ +#define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */ #define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ +#define EXT4_FEATURE_INCOMPAT_INLINEDATA 0x8000 /* data in inode */ #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ @@ -1794,8 +1805,14 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, ext4_group_t block_group, struct buffer_head ** bh); extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); -struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, - ext4_group_t block_group); + +extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, + ext4_group_t block_group); +extern int ext4_wait_block_bitmap(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head *bh); +extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, + ext4_group_t block_group); extern void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, ext4_group_t group, @@ -1841,6 +1858,7 @@ extern void ext4_check_inodes_bitmap(struct super_block *); extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int barrier); +extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); /* mballoc.c */ extern long ext4_mb_stats; diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index a52db3a..0f58b86 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -47,9 +47,9 @@ */ #define EXT_DEBUG__ #ifdef EXT_DEBUG -#define ext_debug(a...) printk(a) +#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) #else -#define ext_debug(a...) +#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif /* diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 5802fa1..83b20fc 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -104,6 +104,78 @@ #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) +/** + * struct ext4_journal_cb_entry - Base structure for callback information. + * + * This struct is a 'seed' structure for a using with your own callback + * structs. If you are using callbacks you must allocate one of these + * or another struct of your own definition which has this struct + * as it's first element and pass it to ext4_journal_callback_add(). + */ +struct ext4_journal_cb_entry { + /* list information for other callbacks attached to the same handle */ + struct list_head jce_list; + + /* Function to call with this callback structure */ + void (*jce_func)(struct super_block *sb, + struct ext4_journal_cb_entry *jce, int error); + + /* user data goes here */ +}; + +/** + * ext4_journal_callback_add: add a function to call after transaction commit + * @handle: active journal transaction handle to register callback on + * @func: callback function to call after the transaction has committed: + * @sb: superblock of current filesystem for transaction + * @jce: returned journal callback data + * @rc: journal state at commit (0 = transaction committed properly) + * @jce: journal callback data (internal and function private data struct) + * + * The registered function will be called in the context of the journal thread + * after the transaction for which the handle was created has completed. + * + * No locks are held when the callback function is called, so it is safe to + * call blocking functions from within the callback, but the callback should + * not block or run for too long, or the filesystem will be blocked waiting for + * the next transaction to commit. No journaling functions can be used, or + * there is a risk of deadlock. + * + * There is no guaranteed calling order of multiple registered callbacks on + * the same transaction. + */ +static inline void ext4_journal_callback_add(handle_t *handle, + void (*func)(struct super_block *sb, + struct ext4_journal_cb_entry *jce, + int rc), + struct ext4_journal_cb_entry *jce) +{ + struct ext4_sb_info *sbi = + EXT4_SB(handle->h_transaction->t_journal->j_private); + + /* Add the jce to transaction's private list */ + jce->jce_func = func; + spin_lock(&sbi->s_md_lock); + list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list); + spin_unlock(&sbi->s_md_lock); +} + +/** + * ext4_journal_callback_del: delete a registered callback + * @handle: active journal transaction handle on which callback was registered + * @jce: registered journal callback entry to unregister + */ +static inline void ext4_journal_callback_del(handle_t *handle, + struct ext4_journal_cb_entry *jce) +{ + struct ext4_sb_info *sbi = + EXT4_SB(handle->h_transaction->t_journal->j_private); + + spin_lock(&sbi->s_md_lock); + list_del_init(&jce->jce_list); + spin_unlock(&sbi->s_md_lock); +} + int ext4_mark_iloc_dirty(handle_t *handle, struct inode *inode, @@ -261,43 +333,45 @@ static inline void ext4_update_inode_fsync_trans(handle_t *handle, /* super.c */ int ext4_force_commit(struct super_block *sb); -static inline int ext4_should_journal_data(struct inode *inode) +/* + * Ext4 inode journal modes + */ +#define EXT4_INODE_JOURNAL_DATA_MODE 0x01 /* journal data mode */ +#define EXT4_INODE_ORDERED_DATA_MODE 0x02 /* ordered data mode */ +#define EXT4_INODE_WRITEBACK_DATA_MODE 0x04 /* writeback data mode */ + +static inline int ext4_inode_journal_mode(struct inode *inode) { if (EXT4_JOURNAL(inode) == NULL) - return 0; - if (!S_ISREG(inode->i_mode)) - return 1; - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) - return 1; - if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) - return 1; - return 0; + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + /* We do not support data journalling with delayed allocation */ + if (!S_ISREG(inode->i_mode) || + test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && + !test_opt(inode->i_sb, DELALLOC)) + return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + else + BUG(); +} + +static inline int ext4_should_journal_data(struct inode *inode) +{ + return ext4_inode_journal_mode(inode) & EXT4_INODE_JOURNAL_DATA_MODE; } static inline int ext4_should_order_data(struct inode *inode) { - if (EXT4_JOURNAL(inode) == NULL) - return 0; - if (!S_ISREG(inode->i_mode)) - return 0; - if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) - return 0; - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) - return 1; - return 0; + return ext4_inode_journal_mode(inode) & EXT4_INODE_ORDERED_DATA_MODE; } static inline int ext4_should_writeback_data(struct inode *inode) { - if (EXT4_JOURNAL(inode) == NULL) - return 1; - if (!S_ISREG(inode->i_mode)) - return 0; - if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) - return 0; - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) - return 1; - return 0; + return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE; } /* diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 74f23c2..1421938 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -44,6 +44,14 @@ #include <trace/events/ext4.h> +/* + * used by extent splitting. + */ +#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ + due to ENOSPC */ +#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */ +#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */ + static int ext4_split_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, @@ -51,6 +59,13 @@ static int ext4_split_extent(handle_t *handle, int split_flag, int flags); +static int ext4_split_extent_at(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t split, + int split_flag, + int flags); + static int ext4_ext_truncate_extend_restart(handle_t *handle, struct inode *inode, int needed) @@ -300,6 +315,8 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) ext4_fsblk_t block = ext4_ext_pblock(ext); int len = ext4_ext_get_actual_len(ext); + if (len == 0) + return 0; return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); } @@ -2308,7 +2325,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, struct ext4_extent *ex; /* the header must be checked already in ext4_ext_remove_space() */ - ext_debug("truncate since %u in leaf\n", start); + ext_debug("truncate since %u in leaf to %u\n", start, end); if (!path[depth].p_hdr) path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); eh = path[depth].p_hdr; @@ -2343,14 +2360,17 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ext_debug(" border %u:%u\n", a, b); /* If this extent is beyond the end of the hole, skip it */ - if (end <= ex_ee_block) { + if (end < ex_ee_block) { ex--; ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); continue; } else if (b != ex_ee_block + ex_ee_len - 1) { - EXT4_ERROR_INODE(inode," bad truncate %u:%u\n", - start, end); + EXT4_ERROR_INODE(inode, + "can not handle truncate %u:%u " + "on extent %u:%u", + start, end, ex_ee_block, + ex_ee_block + ex_ee_len - 1); err = -EIO; goto out; } else if (a != ex_ee_block) { @@ -2482,7 +2502,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path) return 1; } -static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) +static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end) { struct super_block *sb = inode->i_sb; int depth = ext_depth(inode); @@ -2491,7 +2512,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) handle_t *handle; int i, err; - ext_debug("truncate since %u\n", start); + ext_debug("truncate since %u to %u\n", start, end); /* probably first extent we're gonna free will be last in block */ handle = ext4_journal_start(inode, depth + 1); @@ -2504,6 +2525,61 @@ again: trace_ext4_ext_remove_space(inode, start, depth); /* + * Check if we are removing extents inside the extent tree. If that + * is the case, we are going to punch a hole inside the extent tree + * so we have to check whether we need to split the extent covering + * the last block to remove so we can easily remove the part of it + * in ext4_ext_rm_leaf(). + */ + if (end < EXT_MAX_BLOCKS - 1) { + struct ext4_extent *ex; + ext4_lblk_t ee_block; + + /* find extent for this block */ + path = ext4_ext_find_extent(inode, end, NULL); + if (IS_ERR(path)) { + ext4_journal_stop(handle); + return PTR_ERR(path); + } + depth = ext_depth(inode); + ex = path[depth].p_ext; + if (!ex) + goto cont; + + ee_block = le32_to_cpu(ex->ee_block); + + /* + * See if the last block is inside the extent, if so split + * the extent at 'end' block so we can easily remove the + * tail of the first part of the split extent in + * ext4_ext_rm_leaf(). + */ + if (end >= ee_block && + end < ee_block + ext4_ext_get_actual_len(ex) - 1) { + int split_flag = 0; + + if (ext4_ext_is_uninitialized(ex)) + split_flag = EXT4_EXT_MARK_UNINIT1 | + EXT4_EXT_MARK_UNINIT2; + + /* + * Split the extent in two so that 'end' is the last + * block in the first new extent + */ + err = ext4_split_extent_at(handle, inode, path, + end + 1, split_flag, + EXT4_GET_BLOCKS_PRE_IO | + EXT4_GET_BLOCKS_PUNCH_OUT_EXT); + + if (err < 0) + goto out; + } + ext4_ext_drop_refs(path); + kfree(path); + } +cont: + + /* * We start scanning from right side, freeing all the blocks * after i_size and walking into the tree depth-wise. */ @@ -2515,6 +2591,7 @@ again: } path[0].p_depth = depth; path[0].p_hdr = ext_inode_hdr(inode); + if (ext4_ext_check(inode, path[0].p_hdr, depth)) { err = -EIO; goto out; @@ -2526,7 +2603,7 @@ again: /* this is leaf block */ err = ext4_ext_rm_leaf(handle, inode, path, &partial_cluster, start, - EXT_MAX_BLOCKS - 1); + end); /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); path[i].p_bh = NULL; @@ -2651,17 +2728,17 @@ void ext4_ext_init(struct super_block *sb) if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) - printk(KERN_INFO "EXT4-fs: file extents enabled"); + printk(KERN_INFO "EXT4-fs: file extents enabled" #ifdef AGGRESSIVE_TEST - printk(", aggressive tests"); + ", aggressive tests" #endif #ifdef CHECK_BINSEARCH - printk(", check binsearch"); + ", check binsearch" #endif #ifdef EXTENTS_STATS - printk(", stats"); + ", stats" #endif - printk("\n"); + "\n"); #endif #ifdef EXTENTS_STATS spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); @@ -2709,14 +2786,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) } /* - * used by extent splitting. - */ -#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ - due to ENOSPC */ -#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */ -#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */ - -/* * ext4_split_extent_at() splits an extent at given block. * * @handle: the journal handle @@ -3224,11 +3293,13 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode, depth = ext_depth(inode); eh = path[depth].p_hdr; - if (unlikely(!eh->eh_entries)) { - EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and " - "EOFBLOCKS_FL set"); - return -EIO; - } + /* + * We're going to remove EOFBLOCKS_FL entirely in future so we + * do not care for this case anymore. Simply remove the flag + * if there are no extents. + */ + if (unlikely(!eh->eh_entries)) + goto out; last_ex = EXT_LAST_EXTENT(eh); /* * We should clear the EOFBLOCKS_FL flag if we are writing the @@ -3252,6 +3323,7 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode, for (i = depth-1; i >= 0; i--) if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr)) return 0; +out: ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); return ext4_mark_inode_dirty(handle, inode); } @@ -3710,8 +3782,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, int free_on_err = 0, err = 0, depth, ret; unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; - unsigned int punched_out = 0; - unsigned int result = 0; struct ext4_allocation_request ar; ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; ext4_lblk_t cluster_offset; @@ -3721,8 +3791,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* check in cache */ - if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) && - ext4_ext_in_cache(inode, map->m_lblk, &newex)) { + if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { if (!newex.ee_start_lo && !newex.ee_start_hi) { if ((sbi->s_cluster_ratio > 1) && ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) @@ -3790,113 +3859,25 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* if found extent covers block, simply return it */ if (in_range(map->m_lblk, ee_block, ee_len)) { - struct ext4_map_blocks punch_map; - ext4_fsblk_t partial_cluster = 0; - newblock = map->m_lblk - ee_block + ee_start; /* number of remaining blocks in the extent */ allocated = ee_len - (map->m_lblk - ee_block); ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, ee_block, ee_len, newblock); - if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) { - /* - * Do not put uninitialized extent - * in the cache - */ - if (!ext4_ext_is_uninitialized(ex)) { - ext4_ext_put_in_cache(inode, ee_block, - ee_len, ee_start); - goto out; - } - ret = ext4_ext_handle_uninitialized_extents( - handle, inode, map, path, flags, - allocated, newblock); - return ret; - } - - /* - * Punch out the map length, but only to the - * end of the extent - */ - punched_out = allocated < map->m_len ? - allocated : map->m_len; - /* - * Sense extents need to be converted to - * uninitialized, they must fit in an - * uninitialized extent + * Do not put uninitialized extent + * in the cache */ - if (punched_out > EXT_UNINIT_MAX_LEN) - punched_out = EXT_UNINIT_MAX_LEN; - - punch_map.m_lblk = map->m_lblk; - punch_map.m_pblk = newblock; - punch_map.m_len = punched_out; - punch_map.m_flags = 0; - - /* Check to see if the extent needs to be split */ - if (punch_map.m_len != ee_len || - punch_map.m_lblk != ee_block) { - - ret = ext4_split_extent(handle, inode, - path, &punch_map, 0, - EXT4_GET_BLOCKS_PUNCH_OUT_EXT | - EXT4_GET_BLOCKS_PRE_IO); - - if (ret < 0) { - err = ret; - goto out2; - } - /* - * find extent for the block at - * the start of the hole - */ - ext4_ext_drop_refs(path); - kfree(path); - - path = ext4_ext_find_extent(inode, - map->m_lblk, NULL); - if (IS_ERR(path)) { - err = PTR_ERR(path); - path = NULL; - goto out2; - } - - depth = ext_depth(inode); - ex = path[depth].p_ext; - ee_len = ext4_ext_get_actual_len(ex); - ee_block = le32_to_cpu(ex->ee_block); - ee_start = ext4_ext_pblock(ex); - - } - - ext4_ext_mark_uninitialized(ex); - - ext4_ext_invalidate_cache(inode); - - err = ext4_ext_rm_leaf(handle, inode, path, - &partial_cluster, map->m_lblk, - map->m_lblk + punched_out); - - if (!err && path->p_hdr->eh_entries == 0) { - /* - * Punch hole freed all of this sub tree, - * so we need to correct eh_depth - */ - err = ext4_ext_get_access(handle, inode, path); - if (err == 0) { - ext_inode_hdr(inode)->eh_depth = 0; - ext_inode_hdr(inode)->eh_max = - cpu_to_le16(ext4_ext_space_root( - inode, 0)); - - err = ext4_ext_dirty( - handle, inode, path); - } + if (!ext4_ext_is_uninitialized(ex)) { + ext4_ext_put_in_cache(inode, ee_block, + ee_len, ee_start); + goto out; } - - goto out2; + ret = ext4_ext_handle_uninitialized_extents( + handle, inode, map, path, flags, + allocated, newblock); + return ret; } } @@ -4165,13 +4146,11 @@ out2: ext4_ext_drop_refs(path); kfree(path); } - result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ? - punched_out : allocated; trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, - newblock, map->m_len, err ? err : result); + newblock, map->m_len, err ? err : allocated); - return err ? err : result; + return err ? err : allocated; } void ext4_ext_truncate(struct inode *inode) @@ -4228,7 +4207,7 @@ void ext4_ext_truncate(struct inode *inode) last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); - err = ext4_ext_remove_space(inode, last_block); + err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); /* In a multi-transaction truncate, we only make the final * transaction synchronous. @@ -4436,10 +4415,11 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, EXT4_GET_BLOCKS_IO_CONVERT_EXT); if (ret <= 0) { WARN_ON(ret <= 0); - printk(KERN_ERR "%s: ext4_ext_map_blocks " - "returned error inode#%lu, block=%u, " - "max_blocks=%u", __func__, - inode->i_ino, map.m_lblk, map.m_len); + ext4_msg(inode->i_sb, KERN_ERR, + "%s:%d: inode #%lu: block %u: len %u: " + "ext4_ext_map_blocks returned %d", + __func__, __LINE__, inode->i_ino, map.m_lblk, + map.m_len, ret); } ext4_mark_inode_dirty(handle, inode); ret2 = ext4_journal_stop(handle); @@ -4705,14 +4685,12 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) { struct inode *inode = file->f_path.dentry->d_inode; struct super_block *sb = inode->i_sb; - struct ext4_ext_cache cache_ex; - ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks; + ext4_lblk_t first_block, stop_block; struct address_space *mapping = inode->i_mapping; - struct ext4_map_blocks map; handle_t *handle; loff_t first_page, last_page, page_len; loff_t first_page_offset, last_page_offset; - int ret, credits, blocks_released, err = 0; + int credits, err = 0; /* No need to punch hole beyond i_size */ if (offset >= inode->i_size) @@ -4728,10 +4706,6 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) offset; } - first_block = (offset + sb->s_blocksize - 1) >> - EXT4_BLOCK_SIZE_BITS(sb); - last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); - first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; last_page = (offset + length) >> PAGE_CACHE_SHIFT; @@ -4810,7 +4784,6 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) } } - /* * If i_size is contained in the last page, we need to * unmap and zero the partial page after i_size @@ -4830,73 +4803,22 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) } } + first_block = (offset + sb->s_blocksize - 1) >> + EXT4_BLOCK_SIZE_BITS(sb); + stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); + /* If there are no blocks to remove, return now */ - if (first_block >= last_block) + if (first_block >= stop_block) goto out; down_write(&EXT4_I(inode)->i_data_sem); ext4_ext_invalidate_cache(inode); ext4_discard_preallocations(inode); - /* - * Loop over all the blocks and identify blocks - * that need to be punched out - */ - iblock = first_block; - blocks_released = 0; - while (iblock < last_block) { - max_blocks = last_block - iblock; - num_blocks = 1; - memset(&map, 0, sizeof(map)); - map.m_lblk = iblock; - map.m_len = max_blocks; - ret = ext4_ext_map_blocks(handle, inode, &map, - EXT4_GET_BLOCKS_PUNCH_OUT_EXT); - - if (ret > 0) { - blocks_released += ret; - num_blocks = ret; - } else if (ret == 0) { - /* - * If map blocks could not find the block, - * then it is in a hole. If the hole was - * not already cached, then map blocks should - * put it in the cache. So we can get the hole - * out of the cache - */ - memset(&cache_ex, 0, sizeof(cache_ex)); - if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) && - !cache_ex.ec_start) { - - /* The hole is cached */ - num_blocks = cache_ex.ec_block + - cache_ex.ec_len - iblock; - - } else { - /* The block could not be identified */ - err = -EIO; - break; - } - } else { - /* Map blocks error */ - err = ret; - break; - } - - if (num_blocks == 0) { - /* This condition should never happen */ - ext_debug("Block lookup failed"); - err = -EIO; - break; - } - - iblock += num_blocks; - } + err = ext4_ext_remove_space(inode, first_block, stop_block - 1); - if (blocks_released > 0) { - ext4_ext_invalidate_cache(inode); - ext4_discard_preallocations(inode); - } + ext4_ext_invalidate_cache(inode); + ext4_discard_preallocations(inode); if (IS_SYNC(inode)) ext4_handle_sync(handle); diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 00a2cb7..bb6c7d8 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -89,6 +89,7 @@ int ext4_flush_completed_IO(struct inode *inode) io = list_entry(ei->i_completed_io_list.next, ext4_io_end_t, list); list_del_init(&io->list); + io->flag |= EXT4_IO_END_IN_FSYNC; /* * Calling ext4_end_io_nolock() to convert completed * IO to written. @@ -108,6 +109,7 @@ int ext4_flush_completed_IO(struct inode *inode) if (ret < 0) ret2 = ret; spin_lock_irqsave(&ei->i_completed_io_lock, flags); + io->flag &= ~EXT4_IO_END_IN_FSYNC; } spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); return (ret2 < 0) ? ret2 : 0; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 25d8c97..409c2ee 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -92,6 +92,16 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, return EXT4_INODES_PER_GROUP(sb); } +void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate) +{ + if (uptodate) { + set_buffer_uptodate(bh); + set_bitmap_uptodate(bh); + } + unlock_buffer(bh); + put_bh(bh); +} + /* * Read the inode allocation bitmap for a given block_group, reading * into the specified slot in the superblock's bitmap cache. @@ -147,18 +157,18 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) return bh; } /* - * submit the buffer_head for read. We can - * safely mark the bitmap as uptodate now. - * We do it here so the bitmap uptodate bit - * get set with buffer lock held. + * submit the buffer_head for reading */ trace_ext4_load_inode_bitmap(sb, block_group); - set_bitmap_uptodate(bh); - if (bh_submit_read(bh) < 0) { + bh->b_end_io = ext4_end_bitmap_read; + get_bh(bh); + submit_bh(READ, bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { put_bh(bh); ext4_error(sb, "Cannot read inode bitmap - " - "block_group = %u, inode_bitmap = %llu", - block_group, bitmap_blk); + "block_group = %u, inode_bitmap = %llu", + block_group, bitmap_blk); return NULL; } return bh; @@ -194,19 +204,20 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) struct ext4_sb_info *sbi; int fatal = 0, err, count, cleared; - if (atomic_read(&inode->i_count) > 1) { - printk(KERN_ERR "ext4_free_inode: inode has count=%d\n", - atomic_read(&inode->i_count)); + if (!sb) { + printk(KERN_ERR "EXT4-fs: %s:%d: inode on " + "nonexistent device\n", __func__, __LINE__); return; } - if (inode->i_nlink) { - printk(KERN_ERR "ext4_free_inode: inode has nlink=%d\n", - inode->i_nlink); + if (atomic_read(&inode->i_count) > 1) { + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d", + __func__, __LINE__, inode->i_ino, + atomic_read(&inode->i_count)); return; } - if (!sb) { - printk(KERN_ERR "ext4_free_inode: inode on " - "nonexistent device\n"); + if (inode->i_nlink) { + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n", + __func__, __LINE__, inode->i_ino, inode->i_nlink); return; } sbi = EXT4_SB(sb); @@ -593,94 +604,6 @@ static int find_group_other(struct super_block *sb, struct inode *parent, } /* - * claim the inode from the inode bitmap. If the group - * is uninit we need to take the groups's ext4_group_lock - * and clear the uninit flag. The inode bitmap update - * and group desc uninit flag clear should be done - * after holding ext4_group_lock so that ext4_read_inode_bitmap - * doesn't race with the ext4_claim_inode - */ -static int ext4_claim_inode(struct super_block *sb, - struct buffer_head *inode_bitmap_bh, - unsigned long ino, ext4_group_t group, umode_t mode) -{ - int free = 0, retval = 0, count; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_group_info *grp = ext4_get_group_info(sb, group); - struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); - - /* - * We have to be sure that new inode allocation does not race with - * inode table initialization, because otherwise we may end up - * allocating and writing new inode right before sb_issue_zeroout - * takes place and overwriting our new inode with zeroes. So we - * take alloc_sem to prevent it. - */ - down_read(&grp->alloc_sem); - ext4_lock_group(sb, group); - if (ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data)) { - /* not a free inode */ - retval = 1; - goto err_ret; - } - ino++; - if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || - ino > EXT4_INODES_PER_GROUP(sb)) { - ext4_unlock_group(sb, group); - up_read(&grp->alloc_sem); - ext4_error(sb, "reserved inode or inode > inodes count - " - "block_group = %u, inode=%lu", group, - ino + group * EXT4_INODES_PER_GROUP(sb)); - return 1; - } - /* If we didn't allocate from within the initialized part of the inode - * table then we need to initialize up to this inode. */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { - - if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); - /* When marking the block group with - * ~EXT4_BG_INODE_UNINIT we don't want to depend - * on the value of bg_itable_unused even though - * mke2fs could have initialized the same for us. - * Instead we calculated the value below - */ - - free = 0; - } else { - free = EXT4_INODES_PER_GROUP(sb) - - ext4_itable_unused_count(sb, gdp); - } - - /* - * Check the relative inode number against the last used - * relative inode number in this group. if it is greater - * we need to update the bg_itable_unused count - * - */ - if (ino > free) - ext4_itable_unused_set(sb, gdp, - (EXT4_INODES_PER_GROUP(sb) - ino)); - } - count = ext4_free_inodes_count(sb, gdp) - 1; - ext4_free_inodes_set(sb, gdp, count); - if (S_ISDIR(mode)) { - count = ext4_used_dirs_count(sb, gdp) + 1; - ext4_used_dirs_set(sb, gdp, count); - if (sbi->s_log_groups_per_flex) { - ext4_group_t f = ext4_flex_group(sbi, group); - - atomic_inc(&sbi->s_flex_groups[f].used_dirs); - } - } - gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); -err_ret: - ext4_unlock_group(sb, group); - up_read(&grp->alloc_sem); - return retval; -} - -/* * There are two policies for allocating an inode. If the new inode is * a directory, then a forward search is made for a block group with both * free space and a low directory-to-inode ratio; if that fails, then of @@ -741,6 +664,11 @@ got_group: if (ret2 == -1) goto out; + /* + * Normally we will only go through one pass of this loop, + * unless we get unlucky and it turns out the group we selected + * had its last inode grabbed by someone else. + */ for (i = 0; i < ngroups; i++, ino = 0) { err = -EIO; @@ -757,51 +685,24 @@ repeat_in_this_group: ino = ext4_find_next_zero_bit((unsigned long *) inode_bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino); - - if (ino < EXT4_INODES_PER_GROUP(sb)) { - - BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, - inode_bitmap_bh); - if (err) - goto fail; - - BUFFER_TRACE(group_desc_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, - group_desc_bh); - if (err) - goto fail; - if (!ext4_claim_inode(sb, inode_bitmap_bh, - ino, group, mode)) { - /* we won it */ - BUFFER_TRACE(inode_bitmap_bh, - "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, - NULL, - inode_bitmap_bh); - if (err) - goto fail; - /* zero bit is inode number 1*/ - ino++; - goto got; - } - /* we lost it */ - ext4_handle_release_buffer(handle, inode_bitmap_bh); - ext4_handle_release_buffer(handle, group_desc_bh); - - if (++ino < EXT4_INODES_PER_GROUP(sb)) - goto repeat_in_this_group; + if (ino >= EXT4_INODES_PER_GROUP(sb)) { + if (++group == ngroups) + group = 0; + continue; } - - /* - * This case is possible in concurrent environment. It is very - * rare. We cannot repeat the find_group_xxx() call because - * that will simply return the same blockgroup, because the - * group descriptor metadata has not yet been updated. - * So we just go onto the next blockgroup. - */ - if (++group == ngroups) - group = 0; + if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) { + ext4_error(sb, "reserved inode found cleared - " + "inode=%lu", ino + 1); + continue; + } + ext4_lock_group(sb, group); + ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); + ext4_unlock_group(sb, group); + ino++; /* the inode bitmap is zero-based */ + if (!ret2) + goto got; /* we grabbed the inode! */ + if (ino < EXT4_INODES_PER_GROUP(sb)) + goto repeat_in_this_group; } err = -ENOSPC; goto out; @@ -838,6 +739,59 @@ got: if (err) goto fail; } + + BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, inode_bitmap_bh); + if (err) + goto fail; + + BUFFER_TRACE(group_desc_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, group_desc_bh); + if (err) + goto fail; + + /* Update the relevant bg descriptor fields */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { + int free; + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + + down_read(&grp->alloc_sem); /* protect vs itable lazyinit */ + ext4_lock_group(sb, group); /* while we modify the bg desc */ + free = EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp); + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); + free = 0; + } + /* + * Check the relative inode number against the last used + * relative inode number in this group. if it is greater + * we need to update the bg_itable_unused count + */ + if (ino > free) + ext4_itable_unused_set(sb, gdp, + (EXT4_INODES_PER_GROUP(sb) - ino)); + up_read(&grp->alloc_sem); + } + ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); + if (S_ISDIR(mode)) { + ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1); + if (sbi->s_log_groups_per_flex) { + ext4_group_t f = ext4_flex_group(sbi, group); + + atomic_inc(&sbi->s_flex_groups[f].used_dirs); + } + } + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); + ext4_unlock_group(sb, group); + } + + BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); + if (err) + goto fail; + BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); if (err) @@ -1101,7 +1055,7 @@ unsigned long ext4_count_dirs(struct super_block * sb) * where it is called from on active part of filesystem is ext4lazyinit * thread, so we do not need any special locks, however we have to prevent * inode allocation from the current group, so we take alloc_sem lock, to - * block ext4_claim_inode until we are finished. + * block ext4_new_inode() until we are finished. */ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int barrier) @@ -1149,9 +1103,9 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, sbi->s_inodes_per_block); if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) { - ext4_error(sb, "Something is wrong with group %u\n" - "Used itable blocks: %d" - "itable unused count: %u\n", + ext4_error(sb, "Something is wrong with group %u: " + "used itable blocks: %d; " + "itable unused count: %u", group, used_blks, ext4_itable_unused_count(sb, gdp)); ret = 1; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index feaa82f..c77b0bd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -272,7 +272,7 @@ void ext4_da_update_reserve_space(struct inode *inode, trace_ext4_da_update_reserve_space(inode, used, quota_claim); if (unlikely(used > ei->i_reserved_data_blocks)) { ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " - "with only %d reserved data blocks\n", + "with only %d reserved data blocks", __func__, inode->i_ino, used, ei->i_reserved_data_blocks); WARN_ON(1); @@ -1165,7 +1165,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free) */ ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: " "ino %lu, to_free %d with only %d reserved " - "data blocks\n", inode->i_ino, to_free, + "data blocks", inode->i_ino, to_free, ei->i_reserved_data_blocks); WARN_ON(1); to_free = ei->i_reserved_data_blocks; @@ -1428,20 +1428,22 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) static void ext4_print_free_blocks(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - printk(KERN_CRIT "Total free blocks count %lld\n", + struct super_block *sb = inode->i_sb; + + ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", EXT4_C2B(EXT4_SB(inode->i_sb), ext4_count_free_clusters(inode->i_sb))); - printk(KERN_CRIT "Free/Dirty block details\n"); - printk(KERN_CRIT "free_blocks=%lld\n", + ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); + ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", (long long) EXT4_C2B(EXT4_SB(inode->i_sb), percpu_counter_sum(&sbi->s_freeclusters_counter))); - printk(KERN_CRIT "dirty_blocks=%lld\n", + ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", (long long) EXT4_C2B(EXT4_SB(inode->i_sb), percpu_counter_sum(&sbi->s_dirtyclusters_counter))); - printk(KERN_CRIT "Block reservation details\n"); - printk(KERN_CRIT "i_reserved_data_blocks=%u\n", - EXT4_I(inode)->i_reserved_data_blocks); - printk(KERN_CRIT "i_reserved_meta_blocks=%u\n", + ext4_msg(sb, KERN_CRIT, "Block reservation details"); + ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", + EXT4_I(inode)->i_reserved_data_blocks); + ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u", EXT4_I(inode)->i_reserved_meta_blocks); return; } @@ -2482,13 +2484,14 @@ static int ext4_da_write_end(struct file *file, int write_mode = (int)(unsigned long)fsdata; if (write_mode == FALL_BACK_TO_NONDELALLOC) { - if (ext4_should_order_data(inode)) { + switch (ext4_inode_journal_mode(inode)) { + case EXT4_INODE_ORDERED_DATA_MODE: return ext4_ordered_write_end(file, mapping, pos, len, copied, page, fsdata); - } else if (ext4_should_writeback_data(inode)) { + case EXT4_INODE_WRITEBACK_DATA_MODE: return ext4_writeback_write_end(file, mapping, pos, len, copied, page, fsdata); - } else { + default: BUG(); } } @@ -2763,7 +2766,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, goto out; ext_debug("ext4_end_io_dio(): io_end 0x%p " - "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", + "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", iocb->private, io_end->inode->i_ino, iocb, offset, size); @@ -2795,9 +2798,6 @@ out: /* queue the work to convert unwritten extents to written */ queue_work(wq, &io_end->work); - - /* XXX: probably should move into the real I/O completion handler */ - inode_dio_done(inode); } static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) @@ -2811,8 +2811,9 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) goto out; if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) { - printk("sb umounted, discard end_io request for inode %lu\n", - io_end->inode->i_ino); + ext4_msg(io_end->inode->i_sb, KERN_INFO, + "sb umounted, discard end_io request for inode %lu", + io_end->inode->i_ino); ext4_free_io_end(io_end); goto out; } @@ -2921,9 +2922,12 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, iocb->private = NULL; EXT4_I(inode)->cur_aio_dio = NULL; if (!is_sync_kiocb(iocb)) { - iocb->private = ext4_init_io_end(inode, GFP_NOFS); - if (!iocb->private) + ext4_io_end_t *io_end = + ext4_init_io_end(inode, GFP_NOFS); + if (!io_end) return -ENOMEM; + io_end->flag |= EXT4_IO_END_DIRECT; + iocb->private = io_end; /* * we save the io structure for current async * direct IO, so that later ext4_map_blocks() @@ -2940,7 +2944,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, ext4_get_block_write, ext4_end_io_dio, NULL, - DIO_LOCKING | DIO_SKIP_HOLES); + DIO_LOCKING); if (iocb->private) EXT4_I(inode)->cur_aio_dio = NULL; /* @@ -3086,18 +3090,25 @@ static const struct address_space_operations ext4_da_aops = { void ext4_set_aops(struct inode *inode) { - if (ext4_should_order_data(inode) && - test_opt(inode->i_sb, DELALLOC)) - inode->i_mapping->a_ops = &ext4_da_aops; - else if (ext4_should_order_data(inode)) - inode->i_mapping->a_ops = &ext4_ordered_aops; - else if (ext4_should_writeback_data(inode) && - test_opt(inode->i_sb, DELALLOC)) - inode->i_mapping->a_ops = &ext4_da_aops; - else if (ext4_should_writeback_data(inode)) - inode->i_mapping->a_ops = &ext4_writeback_aops; - else + switch (ext4_inode_journal_mode(inode)) { + case EXT4_INODE_ORDERED_DATA_MODE: + if (test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; + else + inode->i_mapping->a_ops = &ext4_ordered_aops; + break; + case EXT4_INODE_WRITEBACK_DATA_MODE: + if (test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; + else + inode->i_mapping->a_ops = &ext4_writeback_aops; + break; + case EXT4_INODE_JOURNAL_DATA_MODE: inode->i_mapping->a_ops = &ext4_journalled_aops; + break; + default: + BUG(); + } } @@ -3329,16 +3340,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) { struct inode *inode = file->f_path.dentry->d_inode; if (!S_ISREG(inode->i_mode)) - return -ENOTSUPP; + return -EOPNOTSUPP; if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { /* TODO: Add support for non extent hole punching */ - return -ENOTSUPP; + return -EOPNOTSUPP; } if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) { /* TODO: Add support for bigalloc file systems */ - return -ENOTSUPP; + return -EOPNOTSUPP; } return ext4_ext_punch_hole(file, offset, length); @@ -3924,10 +3935,8 @@ static int ext4_do_update_inode(handle_t *handle, ext4_update_dynamic_rev(sb); EXT4_SET_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_LARGE_FILE); - sb->s_dirt = 1; ext4_handle_sync(handle); - err = ext4_handle_dirty_metadata(handle, NULL, - EXT4_SB(sb)->s_sbh); + err = ext4_handle_dirty_super(handle, sb); } } raw_inode->i_generation = cpu_to_le32(inode->i_generation); @@ -4152,11 +4161,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) } if (attr->ia_valid & ATTR_SIZE) { - if (attr->ia_size != i_size_read(inode)) { + if (attr->ia_size != i_size_read(inode)) truncate_setsize(inode, attr->ia_size); - ext4_truncate(inode); - } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) - ext4_truncate(inode); + ext4_truncate(inode); } if (!rc) { @@ -4314,7 +4321,7 @@ int ext4_mark_iloc_dirty(handle_t *handle, { int err = 0; - if (test_opt(inode->i_sb, I_VERSION)) + if (IS_I_VERSION(inode)) inode_inc_iversion(inode); /* the do_update_inode consumes one bh->b_count */ diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index cb990b2..99ab428 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -21,6 +21,7 @@ * mballoc.c contains the multiblocks allocation routines */ +#include "ext4_jbd2.h" #include "mballoc.h" #include <linux/debugfs.h> #include <linux/slab.h> @@ -339,7 +340,7 @@ */ static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; -static struct kmem_cache *ext4_free_ext_cachep; +static struct kmem_cache *ext4_free_data_cachep; /* We create slab caches for groupinfo data structures based on the * superblock block size. There will be one per mounted filesystem for @@ -357,7 +358,8 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group); -static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); +static void ext4_free_data_callback(struct super_block *sb, + struct ext4_journal_cb_entry *jce, int rc); static inline void *mb_correct_addr_and_bit(int *bit, void *addr) { @@ -425,7 +427,7 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) { char *bb; - BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); + BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); BUG_ON(max == NULL); if (order > e4b->bd_blkbits + 1) { @@ -436,10 +438,10 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) /* at order 0 we see each particular block */ if (order == 0) { *max = 1 << (e4b->bd_blkbits + 3); - return EXT4_MB_BITMAP(e4b); + return e4b->bd_bitmap; } - bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; + bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; return bb; @@ -588,7 +590,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, for (j = 0; j < (1 << order); j++) { k = (i * (1 << order)) + j; MB_CHECK_ASSERT( - !mb_test_bit(k, EXT4_MB_BITMAP(e4b))); + !mb_test_bit(k, e4b->bd_bitmap)); } count++; } @@ -782,7 +784,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) int groups_per_page; int err = 0; int i; - ext4_group_t first_group; + ext4_group_t first_group, group; int first_block; struct super_block *sb; struct buffer_head *bhs; @@ -806,24 +808,23 @@ static int ext4_mb_init_cache(struct page *page, char *incore) /* allocate buffer_heads to read bitmaps */ if (groups_per_page > 1) { - err = -ENOMEM; i = sizeof(struct buffer_head *) * groups_per_page; bh = kzalloc(i, GFP_NOFS); - if (bh == NULL) + if (bh == NULL) { + err = -ENOMEM; goto out; + } } else bh = &bhs; first_group = page->index * blocks_per_page / 2; /* read all groups the page covers into the cache */ - for (i = 0; i < groups_per_page; i++) { - struct ext4_group_desc *desc; - - if (first_group + i >= ngroups) + for (i = 0, group = first_group; i < groups_per_page; i++, group++) { + if (group >= ngroups) break; - grinfo = ext4_get_group_info(sb, first_group + i); + grinfo = ext4_get_group_info(sb, group); /* * If page is uptodate then we came here after online resize * which added some new uninitialized group info structs, so @@ -834,69 +835,21 @@ static int ext4_mb_init_cache(struct page *page, char *incore) bh[i] = NULL; continue; } - - err = -EIO; - desc = ext4_get_group_desc(sb, first_group + i, NULL); - if (desc == NULL) - goto out; - - err = -ENOMEM; - bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc)); - if (bh[i] == NULL) + if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) { + err = -ENOMEM; goto out; - - if (bitmap_uptodate(bh[i])) - continue; - - lock_buffer(bh[i]); - if (bitmap_uptodate(bh[i])) { - unlock_buffer(bh[i]); - continue; - } - ext4_lock_group(sb, first_group + i); - if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - ext4_init_block_bitmap(sb, bh[i], - first_group + i, desc); - set_bitmap_uptodate(bh[i]); - set_buffer_uptodate(bh[i]); - ext4_unlock_group(sb, first_group + i); - unlock_buffer(bh[i]); - continue; } - ext4_unlock_group(sb, first_group + i); - if (buffer_uptodate(bh[i])) { - /* - * if not uninit if bh is uptodate, - * bitmap is also uptodate - */ - set_bitmap_uptodate(bh[i]); - unlock_buffer(bh[i]); - continue; - } - get_bh(bh[i]); - /* - * submit the buffer_head for read. We can - * safely mark the bitmap as uptodate now. - * We do it here so the bitmap uptodate bit - * get set with buffer lock held. - */ - set_bitmap_uptodate(bh[i]); - bh[i]->b_end_io = end_buffer_read_sync; - submit_bh(READ, bh[i]); - mb_debug(1, "read bitmap for group %u\n", first_group + i); + mb_debug(1, "read bitmap for group %u\n", group); } /* wait for I/O completion */ - for (i = 0; i < groups_per_page; i++) - if (bh[i]) - wait_on_buffer(bh[i]); - - err = -EIO; - for (i = 0; i < groups_per_page; i++) - if (bh[i] && !buffer_uptodate(bh[i])) + for (i = 0, group = first_group; i < groups_per_page; i++, group++) { + if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) { + err = -EIO; goto out; + } + } - err = 0; first_block = page->index * blocks_per_page; for (i = 0; i < blocks_per_page; i++) { int group; @@ -1250,10 +1203,10 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) int order = 1; void *bb; - BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); + BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); - bb = EXT4_MB_BUDDY(e4b); + bb = e4b->bd_buddy; while (order <= e4b->bd_blkbits + 1) { block = block >> 1; if (!mb_test_bit(block, bb)) { @@ -1323,9 +1276,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, /* let's maintain fragments counter */ if (first != 0) - block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b)); + block = !mb_test_bit(first - 1, e4b->bd_bitmap); if (first + count < EXT4_SB(sb)->s_mb_maxs[0]) - max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b)); + max = !mb_test_bit(first + count, e4b->bd_bitmap); if (block && max) e4b->bd_info->bb_fragments--; else if (!block && !max) @@ -1336,7 +1289,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, block = first++; order = 0; - if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) { + if (!mb_test_bit(block, e4b->bd_bitmap)) { ext4_fsblk_t blocknr; blocknr = ext4_group_first_block_no(sb, e4b->bd_group); @@ -1347,7 +1300,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, "freeing already freed block " "(bit %u)", block); } - mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); + mb_clear_bit(block, e4b->bd_bitmap); e4b->bd_info->bb_counters[order]++; /* start of the buddy */ @@ -1429,7 +1382,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, break; next = (block + 1) * (1 << order); - if (mb_test_bit(next, EXT4_MB_BITMAP(e4b))) + if (mb_test_bit(next, e4b->bd_bitmap)) break; order = mb_find_order_for_block(e4b, next); @@ -1466,9 +1419,9 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) /* let's maintain fragments counter */ if (start != 0) - mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b)); + mlen = !mb_test_bit(start - 1, e4b->bd_bitmap); if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) - max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b)); + max = !mb_test_bit(start + len, e4b->bd_bitmap); if (mlen && max) e4b->bd_info->bb_fragments++; else if (!mlen && !max) @@ -1511,7 +1464,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) } mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); - ext4_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); + ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0); mb_check_buddy(e4b); return ret; @@ -1810,7 +1763,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; - void *bitmap = EXT4_MB_BITMAP(e4b); + void *bitmap = e4b->bd_bitmap; struct ext4_free_extent ex; int i; int free; @@ -1870,7 +1823,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, { struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - void *bitmap = EXT4_MB_BITMAP(e4b); + void *bitmap = e4b->bd_bitmap; struct ext4_free_extent ex; ext4_fsblk_t first_group_block; ext4_fsblk_t a; @@ -2224,7 +2177,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, EXT4_DESC_PER_BLOCK_BITS(sb); meta_group_info = kmalloc(metalen, GFP_KERNEL); if (meta_group_info == NULL) { - ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate mem " + ext4_msg(sb, KERN_ERR, "can't allocate mem " "for a buddy group"); goto exit_meta_group_info; } @@ -2238,7 +2191,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); if (meta_group_info[i] == NULL) { - ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate buddy mem"); + ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); goto exit_group_info; } memset(meta_group_info[i], 0, kmem_cache_size(cachep)); @@ -2522,9 +2475,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, &ext4_mb_seq_groups_fops, sb); - if (sbi->s_journal) - sbi->s_journal->j_commit_callback = release_blocks_on_commit; - return 0; out_free_locality_groups: @@ -2637,58 +2587,55 @@ static inline int ext4_issue_discard(struct super_block *sb, * This function is called by the jbd2 layer once the commit has finished, * so we know we can free the blocks that were released with that commit. */ -static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) +static void ext4_free_data_callback(struct super_block *sb, + struct ext4_journal_cb_entry *jce, + int rc) { - struct super_block *sb = journal->j_private; + struct ext4_free_data *entry = (struct ext4_free_data *)jce; struct ext4_buddy e4b; struct ext4_group_info *db; int err, count = 0, count2 = 0; - struct ext4_free_data *entry; - struct list_head *l, *ltmp; - list_for_each_safe(l, ltmp, &txn->t_private_list) { - entry = list_entry(l, struct ext4_free_data, list); + mb_debug(1, "gonna free %u blocks in group %u (0x%p):", + entry->efd_count, entry->efd_group, entry); - mb_debug(1, "gonna free %u blocks in group %u (0x%p):", - entry->count, entry->group, entry); + if (test_opt(sb, DISCARD)) + ext4_issue_discard(sb, entry->efd_group, + entry->efd_start_cluster, entry->efd_count); - if (test_opt(sb, DISCARD)) - ext4_issue_discard(sb, entry->group, - entry->start_cluster, entry->count); + err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); + /* we expect to find existing buddy because it's pinned */ + BUG_ON(err != 0); - err = ext4_mb_load_buddy(sb, entry->group, &e4b); - /* we expect to find existing buddy because it's pinned */ - BUG_ON(err != 0); - db = e4b.bd_info; - /* there are blocks to put in buddy to make them really free */ - count += entry->count; - count2++; - ext4_lock_group(sb, entry->group); - /* Take it out of per group rb tree */ - rb_erase(&entry->node, &(db->bb_free_root)); - mb_free_blocks(NULL, &e4b, entry->start_cluster, entry->count); + db = e4b.bd_info; + /* there are blocks to put in buddy to make them really free */ + count += entry->efd_count; + count2++; + ext4_lock_group(sb, entry->efd_group); + /* Take it out of per group rb tree */ + rb_erase(&entry->efd_node, &(db->bb_free_root)); + mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count); - /* - * Clear the trimmed flag for the group so that the next - * ext4_trim_fs can trim it. - * If the volume is mounted with -o discard, online discard - * is supported and the free blocks will be trimmed online. - */ - if (!test_opt(sb, DISCARD)) - EXT4_MB_GRP_CLEAR_TRIMMED(db); + /* + * Clear the trimmed flag for the group so that the next + * ext4_trim_fs can trim it. + * If the volume is mounted with -o discard, online discard + * is supported and the free blocks will be trimmed online. + */ + if (!test_opt(sb, DISCARD)) + EXT4_MB_GRP_CLEAR_TRIMMED(db); - if (!db->bb_free_root.rb_node) { - /* No more items in the per group rb tree - * balance refcounts from ext4_mb_free_metadata() - */ - page_cache_release(e4b.bd_buddy_page); - page_cache_release(e4b.bd_bitmap_page); - } - ext4_unlock_group(sb, entry->group); - kmem_cache_free(ext4_free_ext_cachep, entry); - ext4_mb_unload_buddy(&e4b); + if (!db->bb_free_root.rb_node) { + /* No more items in the per group rb tree + * balance refcounts from ext4_mb_free_metadata() + */ + page_cache_release(e4b.bd_buddy_page); + page_cache_release(e4b.bd_bitmap_page); } + ext4_unlock_group(sb, entry->efd_group); + kmem_cache_free(ext4_free_data_cachep, entry); + ext4_mb_unload_buddy(&e4b); mb_debug(1, "freed %u blocks in %u structures\n", count, count2); } @@ -2741,9 +2688,9 @@ int __init ext4_init_mballoc(void) return -ENOMEM; } - ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data, - SLAB_RECLAIM_ACCOUNT); - if (ext4_free_ext_cachep == NULL) { + ext4_free_data_cachep = KMEM_CACHE(ext4_free_data, + SLAB_RECLAIM_ACCOUNT); + if (ext4_free_data_cachep == NULL) { kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); return -ENOMEM; @@ -2761,7 +2708,7 @@ void ext4_exit_mballoc(void) rcu_barrier(); kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); - kmem_cache_destroy(ext4_free_ext_cachep); + kmem_cache_destroy(ext4_free_data_cachep); ext4_groupinfo_destroy_slabs(); ext4_remove_debugfs_entry(); } @@ -2815,7 +2762,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); if (!ext4_data_block_valid(sbi, block, len)) { ext4_error(sb, "Allocating blocks %llu-%llu which overlap " - "fs metadata\n", block, block+len); + "fs metadata", block, block+len); /* File system mounted not to panic on error * Fix the bitmap and repeat the block allocation * We leak some of the blocks here. @@ -2911,7 +2858,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int bsbits, max; ext4_lblk_t end; - loff_t size, orig_size, start_off; + loff_t size, start_off; + loff_t orig_size __maybe_unused; ext4_lblk_t start; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_prealloc_space *pa; @@ -3321,8 +3269,8 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, n = rb_first(&(grp->bb_free_root)); while (n) { - entry = rb_entry(n, struct ext4_free_data, node); - ext4_set_bits(bitmap, entry->start_cluster, entry->count); + entry = rb_entry(n, struct ext4_free_data, efd_node); + ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count); n = rb_next(n); } return; @@ -3916,11 +3864,11 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) return; - ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: Can't allocate:" + ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:" " Allocation context details:"); - ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: status %d flags %d", + ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d", ac->ac_status, ac->ac_flags); - ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: orig %lu/%lu/%lu@%lu, " + ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, " "goal %lu/%lu/%lu@%lu, " "best %lu/%lu/%lu@%lu cr %d", (unsigned long)ac->ac_o_ex.fe_group, @@ -3936,9 +3884,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (unsigned long)ac->ac_b_ex.fe_len, (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); - ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: %lu scanned, %d found", + ext4_msg(ac->ac_sb, KERN_ERR, "%lu scanned, %d found", ac->ac_ex_scanned, ac->ac_found); - ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: groups: "); + ext4_msg(ac->ac_sb, KERN_ERR, "groups: "); ngroups = ext4_get_groups_count(sb); for (i = 0; i < ngroups; i++) { struct ext4_group_info *grp = ext4_get_group_info(sb, i); @@ -4428,9 +4376,9 @@ out: static int can_merge(struct ext4_free_data *entry1, struct ext4_free_data *entry2) { - if ((entry1->t_tid == entry2->t_tid) && - (entry1->group == entry2->group) && - ((entry1->start_cluster + entry1->count) == entry2->start_cluster)) + if ((entry1->efd_tid == entry2->efd_tid) && + (entry1->efd_group == entry2->efd_group) && + ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster)) return 1; return 0; } @@ -4452,8 +4400,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, BUG_ON(e4b->bd_bitmap_page == NULL); BUG_ON(e4b->bd_buddy_page == NULL); - new_node = &new_entry->node; - cluster = new_entry->start_cluster; + new_node = &new_entry->efd_node; + cluster = new_entry->efd_start_cluster; if (!*n) { /* first free block exent. We need to @@ -4466,10 +4414,10 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, } while (*n) { parent = *n; - entry = rb_entry(parent, struct ext4_free_data, node); - if (cluster < entry->start_cluster) + entry = rb_entry(parent, struct ext4_free_data, efd_node); + if (cluster < entry->efd_start_cluster) n = &(*n)->rb_left; - else if (cluster >= (entry->start_cluster + entry->count)) + else if (cluster >= (entry->efd_start_cluster + entry->efd_count)) n = &(*n)->rb_right; else { ext4_grp_locked_error(sb, group, 0, @@ -4486,34 +4434,29 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, /* Now try to see the extent can be merged to left and right */ node = rb_prev(new_node); if (node) { - entry = rb_entry(node, struct ext4_free_data, node); + entry = rb_entry(node, struct ext4_free_data, efd_node); if (can_merge(entry, new_entry)) { - new_entry->start_cluster = entry->start_cluster; - new_entry->count += entry->count; + new_entry->efd_start_cluster = entry->efd_start_cluster; + new_entry->efd_count += entry->efd_count; rb_erase(node, &(db->bb_free_root)); - spin_lock(&sbi->s_md_lock); - list_del(&entry->list); - spin_unlock(&sbi->s_md_lock); - kmem_cache_free(ext4_free_ext_cachep, entry); + ext4_journal_callback_del(handle, &entry->efd_jce); + kmem_cache_free(ext4_free_data_cachep, entry); } } node = rb_next(new_node); if (node) { - entry = rb_entry(node, struct ext4_free_data, node); + entry = rb_entry(node, struct ext4_free_data, efd_node); if (can_merge(new_entry, entry)) { - new_entry->count += entry->count; + new_entry->efd_count += entry->efd_count; rb_erase(node, &(db->bb_free_root)); - spin_lock(&sbi->s_md_lock); - list_del(&entry->list); - spin_unlock(&sbi->s_md_lock); - kmem_cache_free(ext4_free_ext_cachep, entry); + ext4_journal_callback_del(handle, &entry->efd_jce); + kmem_cache_free(ext4_free_data_cachep, entry); } } /* Add the extent to transaction's private list */ - spin_lock(&sbi->s_md_lock); - list_add(&new_entry->list, &handle->h_transaction->t_private_list); - spin_unlock(&sbi->s_md_lock); + ext4_journal_callback_add(handle, ext4_free_data_callback, + &new_entry->efd_jce); return 0; } @@ -4691,15 +4634,15 @@ do_more: * blocks being freed are metadata. these blocks shouldn't * be used until this transaction is committed */ - new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); + new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS); if (!new_entry) { err = -ENOMEM; goto error_return; } - new_entry->start_cluster = bit; - new_entry->group = block_group; - new_entry->count = count_clusters; - new_entry->t_tid = handle->h_transaction->t_tid; + new_entry->efd_start_cluster = bit; + new_entry->efd_group = block_group; + new_entry->efd_count = count_clusters; + new_entry->efd_tid = handle->h_transaction->t_tid; ext4_lock_group(sb, block_group); mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); @@ -4971,11 +4914,11 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, start = (e4b.bd_info->bb_first_free > start) ? e4b.bd_info->bb_first_free : start; - while (start < max) { - start = mb_find_next_zero_bit(bitmap, max, start); - if (start >= max) + while (start <= max) { + start = mb_find_next_zero_bit(bitmap, max + 1, start); + if (start > max) break; - next = mb_find_next_bit(bitmap, max, start); + next = mb_find_next_bit(bitmap, max + 1, start); if ((next - start) >= minblocks) { ext4_trim_extent(sb, start, @@ -5027,37 +4970,36 @@ out: int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) { struct ext4_group_info *grp; - ext4_group_t first_group, last_group; - ext4_group_t group, ngroups = ext4_get_groups_count(sb); + ext4_group_t group, first_group, last_group; ext4_grpblk_t cnt = 0, first_cluster, last_cluster; - uint64_t start, len, minlen, trimmed = 0; + uint64_t start, end, minlen, trimmed = 0; ext4_fsblk_t first_data_blk = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); + ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es); int ret = 0; start = range->start >> sb->s_blocksize_bits; - len = range->len >> sb->s_blocksize_bits; + end = start + (range->len >> sb->s_blocksize_bits) - 1; minlen = range->minlen >> sb->s_blocksize_bits; - if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb))) + if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb)) || + unlikely(start >= max_blks)) return -EINVAL; - if (start + len <= first_data_blk) + if (end >= max_blks) + end = max_blks - 1; + if (end <= first_data_blk) goto out; - if (start < first_data_blk) { - len -= first_data_blk - start; + if (start < first_data_blk) start = first_data_blk; - } - /* Determine first and last group to examine based on start and len */ + /* Determine first and last group to examine based on start and end */ ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, &first_group, &first_cluster); - ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len), + ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end, &last_group, &last_cluster); - last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group; - last_cluster = EXT4_CLUSTERS_PER_GROUP(sb); - if (first_group > last_group) - return -EINVAL; + /* end now represents the last cluster to discard in this group */ + end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; for (group = first_group; group <= last_group; group++) { grp = ext4_get_group_info(sb, group); @@ -5069,31 +5011,35 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) } /* - * For all the groups except the last one, last block will - * always be EXT4_BLOCKS_PER_GROUP(sb), so we only need to - * change it for the last group in which case start + - * len < EXT4_BLOCKS_PER_GROUP(sb). + * For all the groups except the last one, last cluster will + * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to + * change it for the last group, note that last_cluster is + * already computed earlier by ext4_get_group_no_and_offset() */ - if (first_cluster + len < EXT4_CLUSTERS_PER_GROUP(sb)) - last_cluster = first_cluster + len; - len -= last_cluster - first_cluster; + if (group == last_group) + end = last_cluster; if (grp->bb_free >= minlen) { cnt = ext4_trim_all_free(sb, group, first_cluster, - last_cluster, minlen); + end, minlen); if (cnt < 0) { ret = cnt; break; } + trimmed += cnt; } - trimmed += cnt; + + /* + * For every group except the first one, we are sure + * that the first cluster to discard will be cluster #0. + */ first_cluster = 0; } - range->len = trimmed * sb->s_blocksize; if (!ret) atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); out: + range->len = trimmed * sb->s_blocksize; return ret; } diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 47705f3..c070618 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -96,21 +96,23 @@ extern u8 mb_enable_debug; struct ext4_free_data { - /* this links the free block information from group_info */ - struct rb_node node; + /* MUST be the first member */ + struct ext4_journal_cb_entry efd_jce; + + /* ext4_free_data private data starts from here */ - /* this links the free block information from ext4_sb_info */ - struct list_head list; + /* this links the free block information from group_info */ + struct rb_node efd_node; /* group which free block extent belongs */ - ext4_group_t group; + ext4_group_t efd_group; /* free block extent */ - ext4_grpblk_t start_cluster; - ext4_grpblk_t count; + ext4_grpblk_t efd_start_cluster; + ext4_grpblk_t efd_count; /* transaction which freed this extent */ - tid_t t_tid; + tid_t efd_tid; }; struct ext4_prealloc_space { @@ -210,8 +212,6 @@ struct ext4_buddy { __u16 bd_blkbits; ext4_group_t bd_group; }; -#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) -#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, struct ext4_free_extent *fex) diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index e7d6bb0..f39f80f 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -471,7 +471,7 @@ int ext4_ext_migrate(struct inode *inode) tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, S_IFREG, NULL, goal, owner); if (IS_ERR(tmp_inode)) { - retval = PTR_ERR(inode); + retval = PTR_ERR(tmp_inode); ext4_journal_stop(handle); return retval; } diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 7ea4ba4..ed6548d 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -257,8 +257,8 @@ int ext4_multi_mount_protect(struct super_block *sb, * If check_interval in MMP block is larger, use that instead of * update_interval from the superblock. */ - if (mmp->mmp_check_interval > mmp_check_interval) - mmp_check_interval = mmp->mmp_check_interval; + if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval) + mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval); seq = le32_to_cpu(mmp->mmp_seq); if (seq == EXT4_MMP_SEQ_CLEAN) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 2043f48..349d7b3 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -468,7 +468,7 @@ fail2: fail: if (*err == ERR_BAD_DX_DIR) ext4_warning(dir->i_sb, - "Corrupt dir inode %ld, running e2fsck is " + "Corrupt dir inode %lu, running e2fsck is " "recommended.", dir->i_ino); return NULL; } diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 4758518..74cd1f7 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -60,7 +60,6 @@ void ext4_ioend_wait(struct inode *inode) static void put_io_page(struct ext4_io_page *io_page) { if (atomic_dec_and_test(&io_page->p_count)) { - end_page_writeback(io_page->p_page); put_page(io_page->p_page); kmem_cache_free(io_page_cachep, io_page); } @@ -110,6 +109,8 @@ int ext4_end_io_nolock(ext4_io_end_t *io) if (io->iocb) aio_complete(io->iocb, io->result, 0); + if (io->flag & EXT4_IO_END_DIRECT) + inode_dio_done(inode); /* Wake up anyone waiting on unwritten extent conversion */ if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten)) wake_up_all(ext4_ioend_wq(io->inode)); @@ -127,12 +128,18 @@ static void ext4_end_io_work(struct work_struct *work) unsigned long flags; spin_lock_irqsave(&ei->i_completed_io_lock, flags); + if (io->flag & EXT4_IO_END_IN_FSYNC) + goto requeue; if (list_empty(&io->list)) { spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); goto free; } if (!mutex_trylock(&inode->i_mutex)) { + bool was_queued; +requeue: + was_queued = !!(io->flag & EXT4_IO_END_QUEUED); + io->flag |= EXT4_IO_END_QUEUED; spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); /* * Requeue the work instead of waiting so that the work @@ -145,9 +152,8 @@ static void ext4_end_io_work(struct work_struct *work) * yield the cpu if it sees an end_io request that has already * been requeued. */ - if (io->flag & EXT4_IO_END_QUEUED) + if (was_queued) yield(); - io->flag |= EXT4_IO_END_QUEUED; return; } list_del_init(&io->list); @@ -227,9 +233,9 @@ static void ext4_end_bio(struct bio *bio, int error) } while (bh != head); } - put_io_page(io_end->pages[i]); + if (atomic_read(&io_end->pages[i]->p_count) == 1) + end_page_writeback(io_end->pages[i]->p_page); } - io_end->num_io_pages = 0; inode = io_end->inode; if (error) { @@ -421,6 +427,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io, * PageWriteback bit from the page to prevent the system from * wedging later on. */ + if (atomic_read(&io_page->p_count) == 1) + end_page_writeback(page); put_io_page(io_page); return ret; } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index f9d948f..59fa0be 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1163,8 +1163,11 @@ static void ext4_update_super(struct super_block *sb, do_div(reserved_blocks, 100); ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count); + ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + free_blocks); le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) * flex_gd->count); + le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) * + flex_gd->count); /* * We need to protect s_groups_count against other CPUs seeing @@ -1465,6 +1468,7 @@ static int ext4_group_extend_no_check(struct super_block *sb, } ext4_blocks_count_set(es, o_blocks_count + add); + ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + add); ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); /* We add the blocks to the bitmap and set the group need init bit */ @@ -1512,16 +1516,17 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, o_blocks_count = ext4_blocks_count(es); if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT4-fs: extending last group from %llu to %llu blocks\n", - o_blocks_count, n_blocks_count); + ext4_msg(sb, KERN_DEBUG, + "extending last group from %llu to %llu blocks", + o_blocks_count, n_blocks_count); if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) return 0; if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { - printk(KERN_ERR "EXT4-fs: filesystem on %s:" - " too large to resize to %llu blocks safely\n", - sb->s_id, n_blocks_count); + ext4_msg(sb, KERN_ERR, + "filesystem too large to resize to %llu blocks safely", + n_blocks_count); if (sizeof(sector_t) < 8) ext4_warning(sb, "CONFIG_LBDAF not enabled"); return -EINVAL; @@ -1582,7 +1587,7 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) ext4_fsblk_t o_blocks_count; ext4_group_t o_group; ext4_group_t n_group; - ext4_grpblk_t offset; + ext4_grpblk_t offset, add; unsigned long n_desc_blocks; unsigned long o_desc_blocks; unsigned long desc_blocks; @@ -1591,8 +1596,8 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) o_blocks_count = ext4_blocks_count(es); if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT4-fs: resizing filesystem from %llu " - "upto %llu blocks\n", o_blocks_count, n_blocks_count); + ext4_msg(sb, KERN_DEBUG, "resizing filesystem from %llu " + "to %llu blocks", o_blocks_count, n_blocks_count); if (n_blocks_count < o_blocks_count) { /* On-line shrinking not supported */ @@ -1605,7 +1610,7 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) return 0; ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset); - ext4_get_group_no_and_offset(sb, o_blocks_count, &o_group, &offset); + ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset); n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) / EXT4_DESC_PER_BLOCK(sb); @@ -1634,10 +1639,12 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) } brelse(bh); - if (offset != 0) { - /* extend the last group */ - ext4_grpblk_t add; - add = EXT4_BLOCKS_PER_GROUP(sb) - offset; + /* extend the last group */ + if (n_group == o_group) + add = n_blocks_count - o_blocks_count; + else + add = EXT4_BLOCKS_PER_GROUP(sb) - (offset + 1); + if (add > 0) { err = ext4_group_extend_no_check(sb, o_blocks_count, add); if (err) goto out; @@ -1674,7 +1681,7 @@ out: iput(resize_inode); if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT4-fs: resized filesystem from %llu " - "upto %llu blocks\n", o_blocks_count, n_blocks_count); + ext4_msg(sb, KERN_DEBUG, "resized filesystem from %llu " + "upto %llu blocks", o_blocks_count, n_blocks_count); return err; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9339009..ceebaf8 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -62,6 +62,7 @@ static struct ext4_features *ext4_feat; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); +static int ext4_show_options(struct seq_file *seq, struct dentry *root); static int ext4_commit_super(struct super_block *sb, int sync); static void ext4_mark_recovery_complete(struct super_block *sb, struct ext4_super_block *es); @@ -375,7 +376,7 @@ void ext4_journal_abort_handle(const char *caller, unsigned int line, if (is_handle_aborted(handle)) return; - printk(KERN_ERR "%s:%d: aborting transaction: %s in %s\n", + printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n", caller, line, errstr, err_fn); jbd2_journal_abort_handle(handle); @@ -431,6 +432,22 @@ static int block_device_ejected(struct super_block *sb) return bdi->dev == NULL; } +static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) +{ + struct super_block *sb = journal->j_private; + struct ext4_sb_info *sbi = EXT4_SB(sb); + int error = is_journal_aborted(journal); + struct ext4_journal_cb_entry *jce, *tmp; + + spin_lock(&sbi->s_md_lock); + list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) { + list_del_init(&jce->jce_list); + spin_unlock(&sbi->s_md_lock); + jce->jce_func(sb, jce, error); + spin_lock(&sbi->s_md_lock); + } + spin_unlock(&sbi->s_md_lock); +} /* Deal with the reporting of failure conditions on a filesystem such as * inconsistencies detected or read IO failures. @@ -498,11 +515,16 @@ void ext4_error_inode(struct inode *inode, const char *function, va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ", - inode->i_sb->s_id, function, line, inode->i_ino); if (block) - printk(KERN_CONT "block %llu: ", block); - printk(KERN_CONT "comm %s: %pV\n", current->comm, &vaf); + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " + "inode #%lu: block %llu: comm %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + block, current->comm, &vaf); + else + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " + "inode #%lu: comm %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + current->comm, &vaf); va_end(args); ext4_handle_error(inode->i_sb); @@ -524,15 +546,21 @@ void ext4_error_file(struct file *file, const char *function, path = d_path(&(file->f_path), pathname, sizeof(pathname)); if (IS_ERR(path)) path = "(unknown)"; - printk(KERN_CRIT - "EXT4-fs error (device %s): %s:%d: inode #%lu: ", - inode->i_sb->s_id, function, line, inode->i_ino); - if (block) - printk(KERN_CONT "block %llu: ", block); va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - printk(KERN_CONT "comm %s: path %s: %pV\n", current->comm, path, &vaf); + if (block) + printk(KERN_CRIT + "EXT4-fs error (device %s): %s:%d: inode #%lu: " + "block %llu: comm %s: path %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + block, current->comm, path, &vaf); + else + printk(KERN_CRIT + "EXT4-fs error (device %s): %s:%d: inode #%lu: " + "comm %s: path %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + current->comm, path, &vaf); va_end(args); ext4_handle_error(inode->i_sb); @@ -808,9 +836,6 @@ static void ext4_put_super(struct super_block *sb) destroy_workqueue(sbi->dio_unwritten_wq); lock_super(sb); - if (sb->s_dirt) - ext4_commit_super(sb, 1); - if (sbi->s_journal) { err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; @@ -827,9 +852,12 @@ static void ext4_put_super(struct super_block *sb) if (!(sb->s_flags & MS_RDONLY)) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); es->s_state = cpu_to_le16(sbi->s_mount_state); - ext4_commit_super(sb, 1); } + if (sb->s_dirt || !(sb->s_flags & MS_RDONLY)) + ext4_commit_super(sb, 1); + if (sbi->s_proc) { + remove_proc_entry("options", sbi->s_proc); remove_proc_entry(sb->s_id, ext4_proc_root); } kobject_del(&sbi->s_kobj); @@ -990,180 +1018,6 @@ void ext4_clear_inode(struct inode *inode) } } -static inline void ext4_show_quota_options(struct seq_file *seq, - struct super_block *sb) -{ -#if defined(CONFIG_QUOTA) - struct ext4_sb_info *sbi = EXT4_SB(sb); - - if (sbi->s_jquota_fmt) { - char *fmtname = ""; - - switch (sbi->s_jquota_fmt) { - case QFMT_VFS_OLD: - fmtname = "vfsold"; - break; - case QFMT_VFS_V0: - fmtname = "vfsv0"; - break; - case QFMT_VFS_V1: - fmtname = "vfsv1"; - break; - } - seq_printf(seq, ",jqfmt=%s", fmtname); - } - - if (sbi->s_qf_names[USRQUOTA]) - seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); - - if (sbi->s_qf_names[GRPQUOTA]) - seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); - - if (test_opt(sb, USRQUOTA)) - seq_puts(seq, ",usrquota"); - - if (test_opt(sb, GRPQUOTA)) - seq_puts(seq, ",grpquota"); -#endif -} - -/* - * Show an option if - * - it's set to a non-default value OR - * - if the per-sb default is different from the global default - */ -static int ext4_show_options(struct seq_file *seq, struct dentry *root) -{ - int def_errors; - unsigned long def_mount_opts; - struct super_block *sb = root->d_sb; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_super_block *es = sbi->s_es; - - def_mount_opts = le32_to_cpu(es->s_default_mount_opts); - def_errors = le16_to_cpu(es->s_errors); - - if (sbi->s_sb_block != 1) - seq_printf(seq, ",sb=%llu", sbi->s_sb_block); - if (test_opt(sb, MINIX_DF)) - seq_puts(seq, ",minixdf"); - if (test_opt(sb, GRPID) && !(def_mount_opts & EXT4_DEFM_BSDGROUPS)) - seq_puts(seq, ",grpid"); - if (!test_opt(sb, GRPID) && (def_mount_opts & EXT4_DEFM_BSDGROUPS)) - seq_puts(seq, ",nogrpid"); - if (sbi->s_resuid != EXT4_DEF_RESUID || - le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) { - seq_printf(seq, ",resuid=%u", sbi->s_resuid); - } - if (sbi->s_resgid != EXT4_DEF_RESGID || - le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) { - seq_printf(seq, ",resgid=%u", sbi->s_resgid); - } - if (test_opt(sb, ERRORS_RO)) { - if (def_errors == EXT4_ERRORS_PANIC || - def_errors == EXT4_ERRORS_CONTINUE) { - seq_puts(seq, ",errors=remount-ro"); - } - } - if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE) - seq_puts(seq, ",errors=continue"); - if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC) - seq_puts(seq, ",errors=panic"); - if (test_opt(sb, NO_UID32) && !(def_mount_opts & EXT4_DEFM_UID16)) - seq_puts(seq, ",nouid32"); - if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG)) - seq_puts(seq, ",debug"); -#ifdef CONFIG_EXT4_FS_XATTR - if (test_opt(sb, XATTR_USER)) - seq_puts(seq, ",user_xattr"); - if (!test_opt(sb, XATTR_USER)) - seq_puts(seq, ",nouser_xattr"); -#endif -#ifdef CONFIG_EXT4_FS_POSIX_ACL - if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL)) - seq_puts(seq, ",acl"); - if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL)) - seq_puts(seq, ",noacl"); -#endif - if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { - seq_printf(seq, ",commit=%u", - (unsigned) (sbi->s_commit_interval / HZ)); - } - if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) { - seq_printf(seq, ",min_batch_time=%u", - (unsigned) sbi->s_min_batch_time); - } - if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) { - seq_printf(seq, ",max_batch_time=%u", - (unsigned) sbi->s_max_batch_time); - } - - /* - * We're changing the default of barrier mount option, so - * let's always display its mount state so it's clear what its - * status is. - */ - seq_puts(seq, ",barrier="); - seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0"); - if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) - seq_puts(seq, ",journal_async_commit"); - else if (test_opt(sb, JOURNAL_CHECKSUM)) - seq_puts(seq, ",journal_checksum"); - if (test_opt(sb, I_VERSION)) - seq_puts(seq, ",i_version"); - if (!test_opt(sb, DELALLOC) && - !(def_mount_opts & EXT4_DEFM_NODELALLOC)) - seq_puts(seq, ",nodelalloc"); - - if (!test_opt(sb, MBLK_IO_SUBMIT)) - seq_puts(seq, ",nomblk_io_submit"); - if (sbi->s_stripe) - seq_printf(seq, ",stripe=%lu", sbi->s_stripe); - /* - * journal mode get enabled in different ways - * So just print the value even if we didn't specify it - */ - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) - seq_puts(seq, ",data=journal"); - else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) - seq_puts(seq, ",data=ordered"); - else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) - seq_puts(seq, ",data=writeback"); - - if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS) - seq_printf(seq, ",inode_readahead_blks=%u", - sbi->s_inode_readahead_blks); - - if (test_opt(sb, DATA_ERR_ABORT)) - seq_puts(seq, ",data_err=abort"); - - if (test_opt(sb, NO_AUTO_DA_ALLOC)) - seq_puts(seq, ",noauto_da_alloc"); - - if (test_opt(sb, DISCARD) && !(def_mount_opts & EXT4_DEFM_DISCARD)) - seq_puts(seq, ",discard"); - - if (test_opt(sb, NOLOAD)) - seq_puts(seq, ",norecovery"); - - if (test_opt(sb, DIOREAD_NOLOCK)) - seq_puts(seq, ",dioread_nolock"); - - if (test_opt(sb, BLOCK_VALIDITY) && - !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)) - seq_puts(seq, ",block_validity"); - - if (!test_opt(sb, INIT_INODE_TABLE)) - seq_puts(seq, ",noinit_itable"); - else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT) - seq_printf(seq, ",init_itable=%u", - (unsigned) sbi->s_li_wait_mult); - - ext4_show_quota_options(seq, sb); - - return 0; -} - static struct inode *ext4_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { @@ -1316,18 +1170,17 @@ static const struct export_operations ext4_export_ops = { enum { Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, - Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, + Opt_nouid32, Opt_debug, Opt_removed, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, - Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_nobh, Opt_bh, + Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_commit, Opt_min_batch_time, Opt_max_batch_time, - Opt_journal_update, Opt_journal_dev, - Opt_journal_checksum, Opt_journal_async_commit, + Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, - Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, - Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, + Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, + Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, @@ -1350,20 +1203,19 @@ static const match_table_t tokens = { {Opt_err_ro, "errors=remount-ro"}, {Opt_nouid32, "nouid32"}, {Opt_debug, "debug"}, - {Opt_oldalloc, "oldalloc"}, - {Opt_orlov, "orlov"}, + {Opt_removed, "oldalloc"}, + {Opt_removed, "orlov"}, {Opt_user_xattr, "user_xattr"}, {Opt_nouser_xattr, "nouser_xattr"}, {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, - {Opt_noload, "noload"}, {Opt_noload, "norecovery"}, - {Opt_nobh, "nobh"}, - {Opt_bh, "bh"}, + {Opt_noload, "noload"}, + {Opt_removed, "nobh"}, + {Opt_removed, "bh"}, {Opt_commit, "commit=%u"}, {Opt_min_batch_time, "min_batch_time=%u"}, {Opt_max_batch_time, "max_batch_time=%u"}, - {Opt_journal_update, "journal=update"}, {Opt_journal_dev, "journal_dev=%u"}, {Opt_journal_checksum, "journal_checksum"}, {Opt_journal_async_commit, "journal_async_commit"}, @@ -1389,7 +1241,6 @@ static const match_table_t tokens = { {Opt_nobarrier, "nobarrier"}, {Opt_i_version, "i_version"}, {Opt_stripe, "stripe=%u"}, - {Opt_resize, "resize"}, {Opt_delalloc, "delalloc"}, {Opt_nodelalloc, "nodelalloc"}, {Opt_mblk_io_submit, "mblk_io_submit"}, @@ -1408,6 +1259,11 @@ static const match_table_t tokens = { {Opt_init_itable, "init_itable=%u"}, {Opt_init_itable, "init_itable"}, {Opt_noinit_itable, "noinit_itable"}, + {Opt_removed, "check=none"}, /* mount option from ext2/3 */ + {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ + {Opt_removed, "reservation"}, /* mount option from ext2/3 */ + {Opt_removed, "noreservation"}, /* mount option from ext2/3 */ + {Opt_removed, "journal=%u"}, /* mount option from ext2/3 */ {Opt_err, NULL}, }; @@ -1496,420 +1352,273 @@ static int clear_qf_name(struct super_block *sb, int qtype) } #endif -static int parse_options(char *options, struct super_block *sb, - unsigned long *journal_devnum, - unsigned int *journal_ioprio, - ext4_fsblk_t *n_blocks_count, int is_remount) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - char *p; - substring_t args[MAX_OPT_ARGS]; - int data_opt = 0; - int option; +#define MOPT_SET 0x0001 +#define MOPT_CLEAR 0x0002 +#define MOPT_NOSUPPORT 0x0004 +#define MOPT_EXPLICIT 0x0008 +#define MOPT_CLEAR_ERR 0x0010 +#define MOPT_GTE0 0x0020 #ifdef CONFIG_QUOTA - int qfmt; +#define MOPT_Q 0 +#define MOPT_QFMT 0x0040 +#else +#define MOPT_Q MOPT_NOSUPPORT +#define MOPT_QFMT MOPT_NOSUPPORT #endif - - if (!options) - return 1; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - if (!*p) - continue; - - /* - * Initialize args struct so we know whether arg was - * found; some options take optional arguments. - */ - args[0].to = args[0].from = NULL; - token = match_token(p, tokens, args); - switch (token) { - case Opt_bsd_df: - ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); - clear_opt(sb, MINIX_DF); - break; - case Opt_minix_df: - ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); - set_opt(sb, MINIX_DF); - - break; - case Opt_grpid: - ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); - set_opt(sb, GRPID); - - break; - case Opt_nogrpid: - ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); - clear_opt(sb, GRPID); - - break; - case Opt_resuid: - if (match_int(&args[0], &option)) - return 0; - sbi->s_resuid = option; - break; - case Opt_resgid: - if (match_int(&args[0], &option)) - return 0; - sbi->s_resgid = option; - break; - case Opt_sb: - /* handled by get_sb_block() instead of here */ - /* *sb_block = match_int(&args[0]); */ - break; - case Opt_err_panic: - clear_opt(sb, ERRORS_CONT); - clear_opt(sb, ERRORS_RO); - set_opt(sb, ERRORS_PANIC); - break; - case Opt_err_ro: - clear_opt(sb, ERRORS_CONT); - clear_opt(sb, ERRORS_PANIC); - set_opt(sb, ERRORS_RO); - break; - case Opt_err_cont: - clear_opt(sb, ERRORS_RO); - clear_opt(sb, ERRORS_PANIC); - set_opt(sb, ERRORS_CONT); - break; - case Opt_nouid32: - set_opt(sb, NO_UID32); - break; - case Opt_debug: - set_opt(sb, DEBUG); - break; - case Opt_oldalloc: - ext4_msg(sb, KERN_WARNING, - "Ignoring deprecated oldalloc option"); - break; - case Opt_orlov: - ext4_msg(sb, KERN_WARNING, - "Ignoring deprecated orlov option"); - break; +#define MOPT_DATAJ 0x0080 + +static const struct mount_opts { + int token; + int mount_opt; + int flags; +} ext4_mount_opts[] = { + {Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET}, + {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR}, + {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET}, + {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR}, + {Opt_mblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_SET}, + {Opt_nomblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_CLEAR}, + {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET}, + {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR}, + {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_SET}, + {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_CLEAR}, + {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET}, + {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR}, + {Opt_delalloc, EXT4_MOUNT_DELALLOC, MOPT_SET | MOPT_EXPLICIT}, + {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, MOPT_CLEAR | MOPT_EXPLICIT}, + {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_SET}, + {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | + EXT4_MOUNT_JOURNAL_CHECKSUM), MOPT_SET}, + {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_SET}, + {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR}, + {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR}, + {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR}, + {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_SET}, + {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_CLEAR}, + {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET}, + {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR}, + {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, + {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR}, + {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR}, + {Opt_commit, 0, MOPT_GTE0}, + {Opt_max_batch_time, 0, MOPT_GTE0}, + {Opt_min_batch_time, 0, MOPT_GTE0}, + {Opt_inode_readahead_blks, 0, MOPT_GTE0}, + {Opt_init_itable, 0, MOPT_GTE0}, + {Opt_stripe, 0, MOPT_GTE0}, + {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ}, + {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ}, + {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, MOPT_DATAJ}, #ifdef CONFIG_EXT4_FS_XATTR - case Opt_user_xattr: - set_opt(sb, XATTR_USER); - break; - case Opt_nouser_xattr: - clear_opt(sb, XATTR_USER); - break; + {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, + {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR}, #else - case Opt_user_xattr: - case Opt_nouser_xattr: - ext4_msg(sb, KERN_ERR, "(no)user_xattr options not supported"); - break; + {Opt_user_xattr, 0, MOPT_NOSUPPORT}, + {Opt_nouser_xattr, 0, MOPT_NOSUPPORT}, #endif #ifdef CONFIG_EXT4_FS_POSIX_ACL - case Opt_acl: - set_opt(sb, POSIX_ACL); - break; - case Opt_noacl: - clear_opt(sb, POSIX_ACL); - break; + {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET}, + {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR}, #else - case Opt_acl: - case Opt_noacl: - ext4_msg(sb, KERN_ERR, "(no)acl options not supported"); - break; + {Opt_acl, 0, MOPT_NOSUPPORT}, + {Opt_noacl, 0, MOPT_NOSUPPORT}, #endif - case Opt_journal_update: - /* @@@ FIXME */ - /* Eventually we will want to be able to create - a journal file here. For now, only allow the - user to specify an existing inode to be the - journal file. */ - if (is_remount) { - ext4_msg(sb, KERN_ERR, - "Cannot specify journal on remount"); - return 0; - } - set_opt(sb, UPDATE_JOURNAL); - break; - case Opt_journal_dev: - if (is_remount) { + {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET}, + {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET}, + {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, + {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, + MOPT_SET | MOPT_Q}, + {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA, + MOPT_SET | MOPT_Q}, + {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | + EXT4_MOUNT_GRPQUOTA), MOPT_CLEAR | MOPT_Q}, + {Opt_usrjquota, 0, MOPT_Q}, + {Opt_grpjquota, 0, MOPT_Q}, + {Opt_offusrjquota, 0, MOPT_Q}, + {Opt_offgrpjquota, 0, MOPT_Q}, + {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT}, + {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, + {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, + {Opt_err, 0, 0} +}; + +static int handle_mount_opt(struct super_block *sb, char *opt, int token, + substring_t *args, unsigned long *journal_devnum, + unsigned int *journal_ioprio, int is_remount) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + const struct mount_opts *m; + int arg = 0; + + if (args->from && match_int(args, &arg)) + return -1; + switch (token) { + case Opt_noacl: + case Opt_nouser_xattr: + ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5"); + break; + case Opt_sb: + return 1; /* handled by get_sb_block() */ + case Opt_removed: + ext4_msg(sb, KERN_WARNING, + "Ignoring removed %s option", opt); + return 1; + case Opt_resuid: + sbi->s_resuid = arg; + return 1; + case Opt_resgid: + sbi->s_resgid = arg; + return 1; + case Opt_abort: + sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; + return 1; + case Opt_i_version: + sb->s_flags |= MS_I_VERSION; + return 1; + case Opt_journal_dev: + if (is_remount) { + ext4_msg(sb, KERN_ERR, + "Cannot specify journal on remount"); + return -1; + } + *journal_devnum = arg; + return 1; + case Opt_journal_ioprio: + if (arg < 0 || arg > 7) + return -1; + *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); + return 1; + } + + for (m = ext4_mount_opts; m->token != Opt_err; m++) { + if (token != m->token) + continue; + if (args->from && (m->flags & MOPT_GTE0) && (arg < 0)) + return -1; + if (m->flags & MOPT_EXPLICIT) + set_opt2(sb, EXPLICIT_DELALLOC); + if (m->flags & MOPT_CLEAR_ERR) + clear_opt(sb, ERRORS_MASK); + if (token == Opt_noquota && sb_any_quota_loaded(sb)) { + ext4_msg(sb, KERN_ERR, "Cannot change quota " + "options when quota turned on"); + return -1; + } + + if (m->flags & MOPT_NOSUPPORT) { + ext4_msg(sb, KERN_ERR, "%s option not supported", opt); + } else if (token == Opt_commit) { + if (arg == 0) + arg = JBD2_DEFAULT_MAX_COMMIT_AGE; + sbi->s_commit_interval = HZ * arg; + } else if (token == Opt_max_batch_time) { + if (arg == 0) + arg = EXT4_DEF_MAX_BATCH_TIME; + sbi->s_max_batch_time = arg; + } else if (token == Opt_min_batch_time) { + sbi->s_min_batch_time = arg; + } else if (token == Opt_inode_readahead_blks) { + if (arg > (1 << 30)) + return -1; + if (arg && !is_power_of_2(arg)) { ext4_msg(sb, KERN_ERR, - "Cannot specify journal on remount"); - return 0; + "EXT4-fs: inode_readahead_blks" + " must be a power of 2"); + return -1; } - if (match_int(&args[0], &option)) - return 0; - *journal_devnum = option; - break; - case Opt_journal_checksum: - set_opt(sb, JOURNAL_CHECKSUM); - break; - case Opt_journal_async_commit: - set_opt(sb, JOURNAL_ASYNC_COMMIT); - set_opt(sb, JOURNAL_CHECKSUM); - break; - case Opt_noload: - set_opt(sb, NOLOAD); - break; - case Opt_commit: - if (match_int(&args[0], &option)) - return 0; - if (option < 0) - return 0; - if (option == 0) - option = JBD2_DEFAULT_MAX_COMMIT_AGE; - sbi->s_commit_interval = HZ * option; - break; - case Opt_max_batch_time: - if (match_int(&args[0], &option)) - return 0; - if (option < 0) - return 0; - if (option == 0) - option = EXT4_DEF_MAX_BATCH_TIME; - sbi->s_max_batch_time = option; - break; - case Opt_min_batch_time: - if (match_int(&args[0], &option)) - return 0; - if (option < 0) - return 0; - sbi->s_min_batch_time = option; - break; - case Opt_data_journal: - data_opt = EXT4_MOUNT_JOURNAL_DATA; - goto datacheck; - case Opt_data_ordered: - data_opt = EXT4_MOUNT_ORDERED_DATA; - goto datacheck; - case Opt_data_writeback: - data_opt = EXT4_MOUNT_WRITEBACK_DATA; - datacheck: + sbi->s_inode_readahead_blks = arg; + } else if (token == Opt_init_itable) { + set_opt(sb, INIT_INODE_TABLE); + if (!args->from) + arg = EXT4_DEF_LI_WAIT_MULT; + sbi->s_li_wait_mult = arg; + } else if (token == Opt_stripe) { + sbi->s_stripe = arg; + } else if (m->flags & MOPT_DATAJ) { if (is_remount) { if (!sbi->s_journal) ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option"); - else if (test_opt(sb, DATA_FLAGS) != data_opt) { + else if (test_opt(sb, DATA_FLAGS) != + m->mount_opt) { ext4_msg(sb, KERN_ERR, - "Cannot change data mode on remount"); - return 0; + "Cannot change data mode on remount"); + return -1; } } else { clear_opt(sb, DATA_FLAGS); - sbi->s_mount_opt |= data_opt; + sbi->s_mount_opt |= m->mount_opt; } - break; - case Opt_data_err_abort: - set_opt(sb, DATA_ERR_ABORT); - break; - case Opt_data_err_ignore: - clear_opt(sb, DATA_ERR_ABORT); - break; #ifdef CONFIG_QUOTA - case Opt_usrjquota: + } else if (token == Opt_usrjquota) { if (!set_qf_name(sb, USRQUOTA, &args[0])) - return 0; - break; - case Opt_grpjquota: + return -1; + } else if (token == Opt_grpjquota) { if (!set_qf_name(sb, GRPQUOTA, &args[0])) - return 0; - break; - case Opt_offusrjquota: + return -1; + } else if (token == Opt_offusrjquota) { if (!clear_qf_name(sb, USRQUOTA)) - return 0; - break; - case Opt_offgrpjquota: + return -1; + } else if (token == Opt_offgrpjquota) { if (!clear_qf_name(sb, GRPQUOTA)) - return 0; - break; - - case Opt_jqfmt_vfsold: - qfmt = QFMT_VFS_OLD; - goto set_qf_format; - case Opt_jqfmt_vfsv0: - qfmt = QFMT_VFS_V0; - goto set_qf_format; - case Opt_jqfmt_vfsv1: - qfmt = QFMT_VFS_V1; -set_qf_format: + return -1; + } else if (m->flags & MOPT_QFMT) { if (sb_any_quota_loaded(sb) && - sbi->s_jquota_fmt != qfmt) { - ext4_msg(sb, KERN_ERR, "Cannot change " - "journaled quota options when " - "quota turned on"); - return 0; - } - sbi->s_jquota_fmt = qfmt; - break; - case Opt_quota: - case Opt_usrquota: - set_opt(sb, QUOTA); - set_opt(sb, USRQUOTA); - break; - case Opt_grpquota: - set_opt(sb, QUOTA); - set_opt(sb, GRPQUOTA); - break; - case Opt_noquota: - if (sb_any_quota_loaded(sb)) { - ext4_msg(sb, KERN_ERR, "Cannot change quota " - "options when quota turned on"); - return 0; + sbi->s_jquota_fmt != m->mount_opt) { + ext4_msg(sb, KERN_ERR, "Cannot " + "change journaled quota options " + "when quota turned on"); + return -1; } - clear_opt(sb, QUOTA); - clear_opt(sb, USRQUOTA); - clear_opt(sb, GRPQUOTA); - break; -#else - case Opt_quota: - case Opt_usrquota: - case Opt_grpquota: - ext4_msg(sb, KERN_ERR, - "quota options not supported"); - break; - case Opt_usrjquota: - case Opt_grpjquota: - case Opt_offusrjquota: - case Opt_offgrpjquota: - case Opt_jqfmt_vfsold: - case Opt_jqfmt_vfsv0: - case Opt_jqfmt_vfsv1: - ext4_msg(sb, KERN_ERR, - "journaled quota options not supported"); - break; - case Opt_noquota: - break; + sbi->s_jquota_fmt = m->mount_opt; #endif - case Opt_abort: - sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; - break; - case Opt_nobarrier: - clear_opt(sb, BARRIER); - break; - case Opt_barrier: - if (args[0].from) { - if (match_int(&args[0], &option)) - return 0; - } else - option = 1; /* No argument, default to 1 */ - if (option) - set_opt(sb, BARRIER); - else - clear_opt(sb, BARRIER); - break; - case Opt_ignore: - break; - case Opt_resize: - if (!is_remount) { - ext4_msg(sb, KERN_ERR, - "resize option only available " - "for remount"); - return 0; - } - if (match_int(&args[0], &option) != 0) - return 0; - *n_blocks_count = option; - break; - case Opt_nobh: - ext4_msg(sb, KERN_WARNING, - "Ignoring deprecated nobh option"); - break; - case Opt_bh: - ext4_msg(sb, KERN_WARNING, - "Ignoring deprecated bh option"); - break; - case Opt_i_version: - set_opt(sb, I_VERSION); - sb->s_flags |= MS_I_VERSION; - break; - case Opt_nodelalloc: - clear_opt(sb, DELALLOC); - clear_opt2(sb, EXPLICIT_DELALLOC); - break; - case Opt_mblk_io_submit: - set_opt(sb, MBLK_IO_SUBMIT); - break; - case Opt_nomblk_io_submit: - clear_opt(sb, MBLK_IO_SUBMIT); - break; - case Opt_stripe: - if (match_int(&args[0], &option)) - return 0; - if (option < 0) - return 0; - sbi->s_stripe = option; - break; - case Opt_delalloc: - set_opt(sb, DELALLOC); - set_opt2(sb, EXPLICIT_DELALLOC); - break; - case Opt_block_validity: - set_opt(sb, BLOCK_VALIDITY); - break; - case Opt_noblock_validity: - clear_opt(sb, BLOCK_VALIDITY); - break; - case Opt_inode_readahead_blks: - if (match_int(&args[0], &option)) - return 0; - if (option < 0 || option > (1 << 30)) - return 0; - if (option && !is_power_of_2(option)) { - ext4_msg(sb, KERN_ERR, - "EXT4-fs: inode_readahead_blks" - " must be a power of 2"); - return 0; + } else { + if (!args->from) + arg = 1; + if (m->flags & MOPT_CLEAR) + arg = !arg; + else if (unlikely(!(m->flags & MOPT_SET))) { + ext4_msg(sb, KERN_WARNING, + "buggy handling of option %s", opt); + WARN_ON(1); + return -1; } - sbi->s_inode_readahead_blks = option; - break; - case Opt_journal_ioprio: - if (match_int(&args[0], &option)) - return 0; - if (option < 0 || option > 7) - break; - *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, - option); - break; - case Opt_noauto_da_alloc: - set_opt(sb, NO_AUTO_DA_ALLOC); - break; - case Opt_auto_da_alloc: - if (args[0].from) { - if (match_int(&args[0], &option)) - return 0; - } else - option = 1; /* No argument, default to 1 */ - if (option) - clear_opt(sb, NO_AUTO_DA_ALLOC); + if (arg != 0) + sbi->s_mount_opt |= m->mount_opt; else - set_opt(sb,NO_AUTO_DA_ALLOC); - break; - case Opt_discard: - set_opt(sb, DISCARD); - break; - case Opt_nodiscard: - clear_opt(sb, DISCARD); - break; - case Opt_dioread_nolock: - set_opt(sb, DIOREAD_NOLOCK); - break; - case Opt_dioread_lock: - clear_opt(sb, DIOREAD_NOLOCK); - break; - case Opt_init_itable: - set_opt(sb, INIT_INODE_TABLE); - if (args[0].from) { - if (match_int(&args[0], &option)) - return 0; - } else - option = EXT4_DEF_LI_WAIT_MULT; - if (option < 0) - return 0; - sbi->s_li_wait_mult = option; - break; - case Opt_noinit_itable: - clear_opt(sb, INIT_INODE_TABLE); - break; - default: - ext4_msg(sb, KERN_ERR, - "Unrecognized mount option \"%s\" " - "or missing value", p); - return 0; + sbi->s_mount_opt &= ~m->mount_opt; } + return 1; + } + ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" " + "or missing value", opt); + return -1; +} + +static int parse_options(char *options, struct super_block *sb, + unsigned long *journal_devnum, + unsigned int *journal_ioprio, + int is_remount) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + char *p; + substring_t args[MAX_OPT_ARGS]; + int token; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + if (!*p) + continue; + /* + * Initialize args struct so we know whether arg was + * found; some options take optional arguments. + */ + args[0].to = args[0].from = 0; + token = match_token(p, tokens, args); + if (handle_mount_opt(sb, p, token, args, journal_devnum, + journal_ioprio, is_remount) < 0) + return 0; } #ifdef CONFIG_QUOTA if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { @@ -1942,6 +1651,160 @@ set_qf_format: return 1; } +static inline void ext4_show_quota_options(struct seq_file *seq, + struct super_block *sb) +{ +#if defined(CONFIG_QUOTA) + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (sbi->s_jquota_fmt) { + char *fmtname = ""; + + switch (sbi->s_jquota_fmt) { + case QFMT_VFS_OLD: + fmtname = "vfsold"; + break; + case QFMT_VFS_V0: + fmtname = "vfsv0"; + break; + case QFMT_VFS_V1: + fmtname = "vfsv1"; + break; + } + seq_printf(seq, ",jqfmt=%s", fmtname); + } + + if (sbi->s_qf_names[USRQUOTA]) + seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); + + if (sbi->s_qf_names[GRPQUOTA]) + seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); + + if (test_opt(sb, USRQUOTA)) + seq_puts(seq, ",usrquota"); + + if (test_opt(sb, GRPQUOTA)) + seq_puts(seq, ",grpquota"); +#endif +} + +static const char *token2str(int token) +{ + static const struct match_token *t; + + for (t = tokens; t->token != Opt_err; t++) + if (t->token == token && !strchr(t->pattern, '=')) + break; + return t->pattern; +} + +/* + * Show an option if + * - it's set to a non-default value OR + * - if the per-sb default is different from the global default + */ +static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, + int nodefs) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int def_errors, def_mount_opt = nodefs ? 0 : sbi->s_def_mount_opt; + const struct mount_opts *m; + char sep = nodefs ? '\n' : ','; + +#define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep) +#define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg) + + if (sbi->s_sb_block != 1) + SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block); + + for (m = ext4_mount_opts; m->token != Opt_err; m++) { + int want_set = m->flags & MOPT_SET; + if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) || + (m->flags & MOPT_CLEAR_ERR)) + continue; + if (!(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt))) + continue; /* skip if same as the default */ + if ((want_set && + (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) || + (!want_set && (sbi->s_mount_opt & m->mount_opt))) + continue; /* select Opt_noFoo vs Opt_Foo */ + SEQ_OPTS_PRINT("%s", token2str(m->token)); + } + + if (nodefs || sbi->s_resuid != EXT4_DEF_RESUID || + le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) + SEQ_OPTS_PRINT("resuid=%u", sbi->s_resuid); + if (nodefs || sbi->s_resgid != EXT4_DEF_RESGID || + le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) + SEQ_OPTS_PRINT("resgid=%u", sbi->s_resgid); + def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors); + if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO) + SEQ_OPTS_PUTS("errors=remount-ro"); + if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE) + SEQ_OPTS_PUTS("errors=continue"); + if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC) + SEQ_OPTS_PUTS("errors=panic"); + if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) + SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ); + if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) + SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time); + if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) + SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time); + if (sb->s_flags & MS_I_VERSION) + SEQ_OPTS_PUTS("i_version"); + if (nodefs || sbi->s_stripe) + SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); + if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ def_mount_opt)) { + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + SEQ_OPTS_PUTS("data=journal"); + else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + SEQ_OPTS_PUTS("data=ordered"); + else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) + SEQ_OPTS_PUTS("data=writeback"); + } + if (nodefs || + sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS) + SEQ_OPTS_PRINT("inode_readahead_blks=%u", + sbi->s_inode_readahead_blks); + + if (nodefs || (test_opt(sb, INIT_INODE_TABLE) && + (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT))) + SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); + + ext4_show_quota_options(seq, sb); + return 0; +} + +static int ext4_show_options(struct seq_file *seq, struct dentry *root) +{ + return _ext4_show_options(seq, root->d_sb, 0); +} + +static int options_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + int rc; + + seq_puts(seq, (sb->s_flags & MS_RDONLY) ? "ro" : "rw"); + rc = _ext4_show_options(seq, sb, 1); + seq_puts(seq, "\n"); + return rc; +} + +static int options_open_fs(struct inode *inode, struct file *file) +{ + return single_open(file, options_seq_show, PDE(inode)->data); +} + +static const struct file_operations ext4_seq_options_fops = { + .owner = THIS_MODULE, + .open = options_open_fs, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, int read_only) { @@ -2945,7 +2808,7 @@ static int ext4_run_lazyinit_thread(void) ext4_clear_request_list(); kfree(ext4_li_info); ext4_li_info = NULL; - printk(KERN_CRIT "EXT4: error %d creating inode table " + printk(KERN_CRIT "EXT4-fs: error %d creating inode table " "initialization thread\n", err); return err; @@ -3183,11 +3046,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_opt(sb, INIT_INODE_TABLE); if (def_mount_opts & EXT4_DEFM_DEBUG) set_opt(sb, DEBUG); - if (def_mount_opts & EXT4_DEFM_BSDGROUPS) { - ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups", - "2.6.38"); + if (def_mount_opts & EXT4_DEFM_BSDGROUPS) set_opt(sb, GRPID); - } if (def_mount_opts & EXT4_DEFM_UID16) set_opt(sb, NO_UID32); /* xattr user namespace & acls are now defaulted on */ @@ -3240,13 +3100,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, - &journal_devnum, &journal_ioprio, NULL, 0)) { + &journal_devnum, &journal_ioprio, 0)) { ext4_msg(sb, KERN_WARNING, "failed to parse options in superblock: %s", sbi->s_es->s_mount_opts); } + sbi->s_def_mount_opt = sbi->s_mount_opt; if (!parse_options((char *) data, sb, &journal_devnum, - &journal_ioprio, NULL, 0)) + &journal_ioprio, 0)) goto failed_mount; if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { @@ -3416,7 +3277,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) #else es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); #endif - sb->s_dirt = 1; } /* Handle clustersize */ @@ -3540,6 +3400,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (ext4_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); + if (sbi->s_proc) + proc_create_data("options", S_IRUGO, sbi->s_proc, + &ext4_seq_options_fops, sb); + bgl_lock_init(sbi->s_blockgroup_lock); for (i = 0; i < db_count; i++) { @@ -3694,6 +3558,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); + sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; + /* * The journal may have updated the bg summary counts, so we * need to update the global counters. @@ -3861,6 +3727,7 @@ failed_mount2: ext4_kvfree(sbi->s_group_desc); failed_mount: if (sbi->s_proc) { + remove_proc_entry("options", sbi->s_proc); remove_proc_entry(sb->s_id, ext4_proc_root); } #ifdef CONFIG_QUOTA @@ -4090,15 +3957,6 @@ static int ext4_load_journal(struct super_block *sb, if (!(journal->j_flags & JBD2_BARRIER)) ext4_msg(sb, KERN_INFO, "barriers disabled"); - if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { - err = jbd2_journal_update_format(journal); - if (err) { - ext4_msg(sb, KERN_ERR, "error updating journal"); - jbd2_journal_destroy(journal); - return err; - } - } - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) err = jbd2_journal_wipe(journal, !really_read_only); if (!err) { @@ -4385,7 +4243,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) { struct ext4_super_block *es; struct ext4_sb_info *sbi = EXT4_SB(sb); - ext4_fsblk_t n_blocks_count = 0; unsigned long old_sb_flags; struct ext4_mount_options old_opts; int enable_quota = 0; @@ -4418,8 +4275,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) /* * Allow the "check" option to be passed as a remount option. */ - if (!parse_options(data, sb, NULL, &journal_ioprio, - &n_blocks_count, 1)) { + if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) { err = -EINVAL; goto restore_opts; } @@ -4437,8 +4293,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); } - if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || - n_blocks_count > ext4_blocks_count(es)) { + if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { err = -EROFS; goto restore_opts; @@ -4513,8 +4368,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (sbi->s_journal) ext4_clear_journal_err(sb, es); sbi->s_mount_state = le16_to_cpu(es->s_state); - if ((err = ext4_group_extend(sb, es, n_blocks_count))) - goto restore_opts; if (!ext4_setup_super(sb, es, 0)) sb->s_flags &= ~MS_RDONLY; if (EXT4_HAS_INCOMPAT_FEATURE(sb, diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 93a00d8..e88748e 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -82,8 +82,8 @@ printk("\n"); \ } while (0) #else -# define ea_idebug(f...) -# define ea_bdebug(f...) +# define ea_idebug(inode, fmt, ...) no_printk(fmt, ##__VA_ARGS__) +# define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif static void ext4_xattr_cache_insert(struct buffer_head *); @@ -158,13 +158,10 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end) static inline int ext4_xattr_check_block(struct buffer_head *bh) { - int error; - if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) return -EIO; - error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); - return error; + return ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); } static inline int @@ -220,7 +217,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, error = -ENODATA; if (!EXT4_I(inode)->i_file_acl) goto cleanup; - ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl); + ea_idebug(inode, "reading block %llu", + (unsigned long long)EXT4_I(inode)->i_file_acl); bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); if (!bh) goto cleanup; @@ -363,7 +361,8 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) error = 0; if (!EXT4_I(inode)->i_file_acl) goto cleanup; - ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl); + ea_idebug(inode, "reading block %llu", + (unsigned long long)EXT4_I(inode)->i_file_acl); bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); error = -EIO; if (!bh) @@ -487,18 +486,19 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, ext4_free_blocks(handle, inode, bh, 0, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); + unlock_buffer(bh); } else { le32_add_cpu(&BHDR(bh)->h_refcount, -1); + if (ce) + mb_cache_entry_release(ce); + unlock_buffer(bh); error = ext4_handle_dirty_metadata(handle, inode, bh); if (IS_SYNC(inode)) ext4_handle_sync(handle); dquot_free_block(inode, 1); ea_bdebug(bh, "refcount now=%d; releasing", le32_to_cpu(BHDR(bh)->h_refcount)); - if (ce) - mb_cache_entry_release(ce); } - unlock_buffer(bh); out: ext4_std_error(inode->i_sb, error); return; @@ -834,7 +834,8 @@ inserted: if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); - ea_idebug(inode, "creating block %d", block); + ea_idebug(inode, "creating block %llu", + (unsigned long long)block); new_bh = sb_getblk(sb, block); if (!new_bh) { diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index a81eb23..98ae804 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -521,57 +521,46 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname, op = &outname[*outlen * sizeof(wchar_t)]; } else { - if (nls) { - for (i = 0, ip = name, op = outname, *outlen = 0; - i < len && *outlen <= FAT_LFN_LEN; - *outlen += 1) - { - if (escape && (*ip == ':')) { - if (i > len - 5) - return -EINVAL; - ec = 0; - for (k = 1; k < 5; k++) { - nc = ip[k]; - ec <<= 4; - if (nc >= '0' && nc <= '9') { - ec |= nc - '0'; - continue; - } - if (nc >= 'a' && nc <= 'f') { - ec |= nc - ('a' - 10); - continue; - } - if (nc >= 'A' && nc <= 'F') { - ec |= nc - ('A' - 10); - continue; - } - return -EINVAL; + for (i = 0, ip = name, op = outname, *outlen = 0; + i < len && *outlen < FAT_LFN_LEN; + *outlen += 1) { + if (escape && (*ip == ':')) { + if (i > len - 5) + return -EINVAL; + ec = 0; + for (k = 1; k < 5; k++) { + nc = ip[k]; + ec <<= 4; + if (nc >= '0' && nc <= '9') { + ec |= nc - '0'; + continue; } - *op++ = ec & 0xFF; - *op++ = ec >> 8; - ip += 5; - i += 5; - } else { - if ((charlen = nls->char2uni(ip, len - i, (wchar_t *)op)) < 0) - return -EINVAL; - ip += charlen; - i += charlen; - op += 2; + if (nc >= 'a' && nc <= 'f') { + ec |= nc - ('a' - 10); + continue; + } + if (nc >= 'A' && nc <= 'F') { + ec |= nc - ('A' - 10); + continue; + } + return -EINVAL; } + *op++ = ec & 0xFF; + *op++ = ec >> 8; + ip += 5; + i += 5; + } else { + charlen = nls->char2uni(ip, len - i, + (wchar_t *)op); + if (charlen < 0) + return -EINVAL; + ip += charlen; + i += charlen; + op += 2; } - if (i < len) - return -ENAMETOOLONG; - } else { - for (i = 0, ip = name, op = outname, *outlen = 0; - i < len && *outlen <= FAT_LFN_LEN; - i++, *outlen += 1) - { - *op++ = *ip++; - *op++ = 0; - } - if (i < len) - return -ENAMETOOLONG; } + if (i < len) + return -ENAMETOOLONG; } *longlen = *outlen; @@ -6,7 +6,7 @@ * Manage the dynamic fd arrays in the process files_struct. */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/mmzone.h> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 77b535a..539f36c 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -14,7 +14,7 @@ */ #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/sched.h> @@ -256,7 +256,8 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t) } /* - * Move expired dirty inodes from @delaying_queue to @dispatch_queue. + * Move expired (dirtied after work->older_than_this) dirty inodes from + * @delaying_queue to @dispatch_queue. */ static int move_expired_inodes(struct list_head *delaying_queue, struct list_head *dispatch_queue, @@ -1148,23 +1149,6 @@ out_unlock_inode: } EXPORT_SYMBOL(__mark_inode_dirty); -/* - * Write out a superblock's list of dirty inodes. A wait will be performed - * upon no inodes, all inodes or the final one, depending upon sync_mode. - * - * If older_than_this is non-NULL, then only write out inodes which - * had their first dirtying at a time earlier than *older_than_this. - * - * If `bdi' is non-zero then we're being asked to writeback a specific queue. - * This function assumes that the blockdev superblock's inodes are backed by - * a variety of queues, so all inodes are searched. For other superblocks, - * assume that all inodes are backed by the same queue. - * - * The inodes to be written are parked on bdi->b_io. They are moved back onto - * bdi->b_dirty as they are selected for writing. This way, none can be missed - * on the writer throttling path, and we get decent balancing between many - * throttled threads: we don't want them all piling up on inode_sync_wait. - */ static void wait_sb_inodes(struct super_block *sb) { struct inode *inode, *old_inode = NULL; @@ -1364,8 +1348,6 @@ int write_inode_now(struct inode *inode, int sync) ret = writeback_single_inode(inode, wb, &wbc); spin_unlock(&inode->i_lock); spin_unlock(&wb->list_lock); - if (sync) - inode_sync_wait(inode); return ret; } EXPORT_SYMBOL(write_inode_now); diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 6324c42..e159e68 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -1,4 +1,4 @@ -#include <linux/module.h> +#include <linux/export.h> #include <linux/sched.h> #include <linux/fs.h> #include <linux/path.h> diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index 3cbfa93..1fe7313 100644 --- a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -67,7 +67,8 @@ extern int access_file(char *path, int r, int w, int x); extern int open_file(char *path, int r, int w, int append); extern void *open_dir(char *path, int *err_out); extern char *read_dir(void *stream, unsigned long long *pos, - unsigned long long *ino_out, int *len_out); + unsigned long long *ino_out, int *len_out, + unsigned int *type_out); extern void close_file(void *stream); extern int replace_file(int oldfd, int fd); extern void close_dir(void *stream); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 588d458..07c516b 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -283,6 +283,7 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir) char *name; unsigned long long next, ino; int error, len; + unsigned int type; name = dentry_name(file->f_path.dentry); if (name == NULL) @@ -292,9 +293,9 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir) if (dir == NULL) return -error; next = file->f_pos; - while ((name = read_dir(dir, &next, &ino, &len)) != NULL) { + while ((name = read_dir(dir, &next, &ino, &len, &type)) != NULL) { error = (*filldir)(ent, name, len, file->f_pos, - ino, DT_UNKNOWN); + ino, type); if (error) break; file->f_pos = next; } diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index dd7bc38..a74ad0d 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -98,7 +98,8 @@ void *open_dir(char *path, int *err_out) } char *read_dir(void *stream, unsigned long long *pos, - unsigned long long *ino_out, int *len_out) + unsigned long long *ino_out, int *len_out, + unsigned int *type_out) { DIR *dir = stream; struct dirent *ent; @@ -109,6 +110,7 @@ char *read_dir(void *stream, unsigned long long *pos, return NULL; *len_out = strlen(ent->d_name); *ino_out = ent->d_ino; + *type_out = ent->d_type; *pos = telldir(dir); return ent->d_name; } @@ -10,7 +10,7 @@ #include <linux/file.h> #include <linux/fs.h> #include <linux/security.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/uaccess.h> #include <linux/writeback.h> #include <linux/buffer_head.h> diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index d49d202..c78841e 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -88,14 +88,13 @@ static inline void __buffer_relink_io(struct journal_head *jh) * whole transaction. * * Requires j_list_lock - * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it */ static int __try_to_free_cp_buf(struct journal_head *jh) { int ret = 0; struct buffer_head *bh = jh2bh(jh); - if (jh->b_jlist == BJ_None && !buffer_locked(bh) && + if (jh->b_transaction == NULL && !buffer_locked(bh) && !buffer_dirty(bh) && !buffer_write_io_error(bh)) { /* * Get our reference so that bh cannot be freed before @@ -104,11 +103,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh) get_bh(bh); JBUFFER_TRACE(jh, "remove from checkpoint list"); ret = __jbd2_journal_remove_checkpoint(jh) + 1; - jbd_unlock_bh_state(bh); BUFFER_TRACE(bh, "release"); __brelse(bh); - } else { - jbd_unlock_bh_state(bh); } return ret; } @@ -180,21 +176,6 @@ void __jbd2_log_wait_for_space(journal_t *journal) } /* - * We were unable to perform jbd_trylock_bh_state() inside j_list_lock. - * The caller must restart a list walk. Wait for someone else to run - * jbd_unlock_bh_state(). - */ -static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) - __releases(journal->j_list_lock) -{ - get_bh(bh); - spin_unlock(&journal->j_list_lock); - jbd_lock_bh_state(bh); - jbd_unlock_bh_state(bh); - put_bh(bh); -} - -/* * Clean up transaction's list of buffers submitted for io. * We wait for any pending IO to complete and remove any clean * buffers. Note that we take the buffers in the opposite ordering @@ -222,15 +203,9 @@ restart: while (!released && transaction->t_checkpoint_io_list) { jh = transaction->t_checkpoint_io_list; bh = jh2bh(jh); - if (!jbd_trylock_bh_state(bh)) { - jbd_sync_bh(journal, bh); - spin_lock(&journal->j_list_lock); - goto restart; - } get_bh(bh); if (buffer_locked(bh)) { spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); wait_on_buffer(bh); /* the journal_head may have gone by now */ BUFFER_TRACE(bh, "brelse"); @@ -246,7 +221,6 @@ restart: * it has been written out and so we can drop it from the list */ released = __jbd2_journal_remove_checkpoint(jh); - jbd_unlock_bh_state(bh); __brelse(bh); } @@ -266,7 +240,6 @@ __flush_batch(journal_t *journal, int *batch_count) for (i = 0; i < *batch_count; i++) { struct buffer_head *bh = journal->j_chkpt_bhs[i]; - clear_buffer_jwrite(bh); BUFFER_TRACE(bh, "brelse"); __brelse(bh); } @@ -281,7 +254,6 @@ __flush_batch(journal_t *journal, int *batch_count) * be written out. * * Called with j_list_lock held and drops it if 1 is returned - * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it */ static int __process_buffer(journal_t *journal, struct journal_head *jh, int *batch_count, transaction_t *transaction) @@ -292,7 +264,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, if (buffer_locked(bh)) { get_bh(bh); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); wait_on_buffer(bh); /* the journal_head may have gone by now */ BUFFER_TRACE(bh, "brelse"); @@ -304,7 +275,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, transaction->t_chp_stats.cs_forced_to_close++; spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); if (unlikely(journal->j_flags & JBD2_UNMOUNT)) /* * The journal thread is dead; so starting and @@ -323,11 +293,9 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, if (unlikely(buffer_write_io_error(bh))) ret = -EIO; get_bh(bh); - J_ASSERT_JH(jh, !buffer_jbddirty(bh)); BUFFER_TRACE(bh, "remove from checkpoint"); __jbd2_journal_remove_checkpoint(jh); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); __brelse(bh); } else { /* @@ -340,10 +308,8 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, BUFFER_TRACE(bh, "queue"); get_bh(bh); J_ASSERT_BH(bh, !buffer_jwrite(bh)); - set_buffer_jwrite(bh); journal->j_chkpt_bhs[*batch_count] = bh; __buffer_relink_io(jh); - jbd_unlock_bh_state(bh); transaction->t_chp_stats.cs_written++; (*batch_count)++; if (*batch_count == JBD2_NR_BATCH) { @@ -407,15 +373,7 @@ restart: int retry = 0, err; while (!retry && transaction->t_checkpoint_list) { - struct buffer_head *bh; - jh = transaction->t_checkpoint_list; - bh = jh2bh(jh); - if (!jbd_trylock_bh_state(bh)) { - jbd_sync_bh(journal, bh); - retry = 1; - break; - } retry = __process_buffer(journal, jh, &batch_count, transaction); if (retry < 0 && !result) @@ -478,79 +436,28 @@ out: int jbd2_cleanup_journal_tail(journal_t *journal) { - transaction_t * transaction; tid_t first_tid; - unsigned long blocknr, freed; + unsigned long blocknr; if (is_journal_aborted(journal)) return 1; - /* OK, work out the oldest transaction remaining in the log, and - * the log block it starts at. - * - * If the log is now empty, we need to work out which is the - * next transaction ID we will write, and where it will - * start. */ - - write_lock(&journal->j_state_lock); - spin_lock(&journal->j_list_lock); - transaction = journal->j_checkpoint_transactions; - if (transaction) { - first_tid = transaction->t_tid; - blocknr = transaction->t_log_start; - } else if ((transaction = journal->j_committing_transaction) != NULL) { - first_tid = transaction->t_tid; - blocknr = transaction->t_log_start; - } else if ((transaction = journal->j_running_transaction) != NULL) { - first_tid = transaction->t_tid; - blocknr = journal->j_head; - } else { - first_tid = journal->j_transaction_sequence; - blocknr = journal->j_head; - } - spin_unlock(&journal->j_list_lock); - J_ASSERT(blocknr != 0); - - /* If the oldest pinned transaction is at the tail of the log - already then there's not much we can do right now. */ - if (journal->j_tail_sequence == first_tid) { - write_unlock(&journal->j_state_lock); + if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr)) return 1; - } - - /* OK, update the superblock to recover the freed space. - * Physical blocks come first: have we wrapped beyond the end of - * the log? */ - freed = blocknr - journal->j_tail; - if (blocknr < journal->j_tail) - freed = freed + journal->j_last - journal->j_first; - - trace_jbd2_cleanup_journal_tail(journal, first_tid, blocknr, freed); - jbd_debug(1, - "Cleaning journal tail from %d to %d (offset %lu), " - "freeing %lu\n", - journal->j_tail_sequence, first_tid, blocknr, freed); - - journal->j_free += freed; - journal->j_tail_sequence = first_tid; - journal->j_tail = blocknr; - write_unlock(&journal->j_state_lock); + J_ASSERT(blocknr != 0); /* - * If there is an external journal, we need to make sure that - * any data blocks that were recently written out --- perhaps - * by jbd2_log_do_checkpoint() --- are flushed out before we - * drop the transactions from the external journal. It's - * unlikely this will be necessary, especially with a - * appropriately sized journal, but we need this to guarantee - * correctness. Fortunately jbd2_cleanup_journal_tail() - * doesn't get called all that often. + * We need to make sure that any blocks that were recently written out + * --- perhaps by jbd2_log_do_checkpoint() --- are flushed out before + * we drop the transactions from the journal. It's unlikely this will + * be necessary, especially with an appropriately sized journal, but we + * need this to guarantee correctness. Fortunately + * jbd2_cleanup_journal_tail() doesn't get called all that often. */ - if ((journal->j_fs_dev != journal->j_dev) && - (journal->j_flags & JBD2_BARRIER)) + if (journal->j_flags & JBD2_BARRIER) blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); - if (!(journal->j_flags & JBD2_ABORT)) - jbd2_journal_update_superblock(journal, 1); + + __jbd2_update_log_tail(journal, first_tid, blocknr); return 0; } @@ -582,15 +489,12 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released) do { jh = next_jh; next_jh = jh->b_cpnext; - /* Use trylock because of the ranking */ - if (jbd_trylock_bh_state(jh2bh(jh))) { - ret = __try_to_free_cp_buf(jh); - if (ret) { - freed++; - if (ret == 2) { - *released = 1; - return freed; - } + ret = __try_to_free_cp_buf(jh); + if (ret) { + freed++; + if (ret == 2) { + *released = 1; + return freed; } } /* @@ -673,9 +577,7 @@ out: * The function can free jh and bh. * * This function is called with j_list_lock held. - * This function is called with jbd_lock_bh_state(jh2bh(jh)) */ - int __jbd2_journal_remove_checkpoint(struct journal_head *jh) { struct transaction_chp_stats_s *stats; @@ -722,7 +624,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) transaction->t_tid, stats); __jbd2_journal_drop_transaction(journal, transaction); - kfree(transaction); + jbd2_journal_free_transaction(transaction); /* Just in case anybody was waiting for more transactions to be checkpointed... */ @@ -797,5 +699,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact J_ASSERT(journal->j_committing_transaction != transaction); J_ASSERT(journal->j_running_transaction != transaction); + trace_jbd2_drop_transaction(journal, transaction); + jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index c067a8c..17f557f 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -331,6 +331,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) struct buffer_head *cbh = NULL; /* For transactional checksums */ __u32 crc32_sum = ~0; struct blk_plug plug; + /* Tail of the journal */ + unsigned long first_block; + tid_t first_tid; + int update_tail; /* * First job: lock down the current transaction and wait for @@ -340,7 +344,18 @@ void jbd2_journal_commit_transaction(journal_t *journal) /* Do we need to erase the effects of a prior jbd2_journal_flush? */ if (journal->j_flags & JBD2_FLUSHED) { jbd_debug(3, "super block updated\n"); - jbd2_journal_update_superblock(journal, 1); + mutex_lock(&journal->j_checkpoint_mutex); + /* + * We hold j_checkpoint_mutex so tail cannot change under us. + * We don't need any special data guarantees for writing sb + * since journal is empty and it is ok for write to be + * flushed only with transaction commit. + */ + jbd2_journal_update_sb_log_tail(journal, + journal->j_tail_sequence, + journal->j_tail, + WRITE_SYNC); + mutex_unlock(&journal->j_checkpoint_mutex); } else { jbd_debug(3, "superblock not updated\n"); } @@ -677,10 +692,30 @@ start_journal_io: err = 0; } + /* + * Get current oldest transaction in the log before we issue flush + * to the filesystem device. After the flush we can be sure that + * blocks of all older transactions are checkpointed to persistent + * storage and we will be safe to update journal start in the + * superblock with the numbers we get here. + */ + update_tail = + jbd2_journal_get_log_tail(journal, &first_tid, &first_block); + write_lock(&journal->j_state_lock); + if (update_tail) { + long freed = first_block - journal->j_tail; + + if (first_block < journal->j_tail) + freed += journal->j_last - journal->j_first; + /* Update tail only if we free significant amount of space */ + if (freed < journal->j_maxlen / 4) + update_tail = 0; + } J_ASSERT(commit_transaction->t_state == T_COMMIT); commit_transaction->t_state = T_COMMIT_DFLUSH; write_unlock(&journal->j_state_lock); + /* * If the journal is not located on the file system device, * then we must flush the file system device before we issue @@ -831,6 +866,14 @@ wait_for_iobuf: if (err) jbd2_journal_abort(journal, err); + /* + * Now disk caches for filesystem device are flushed so we are safe to + * erase checkpointed transactions from the log by updating journal + * superblock. + */ + if (update_tail) + jbd2_update_log_tail(journal, first_tid, first_block); + /* End of a transaction! Finally, we can do checkpoint processing: any buffers committed as a result of this transaction can be removed from any checkpoint list it was on @@ -1048,7 +1091,7 @@ restart_loop: jbd_debug(1, "JBD2: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); if (to_free) - kfree(commit_transaction); + jbd2_journal_free_transaction(commit_transaction); wake_up(&journal->j_wait_done_commit); } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 839377e..98ed6db 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -71,7 +71,6 @@ EXPORT_SYMBOL(jbd2_journal_revoke); EXPORT_SYMBOL(jbd2_journal_init_dev); EXPORT_SYMBOL(jbd2_journal_init_inode); -EXPORT_SYMBOL(jbd2_journal_update_format); EXPORT_SYMBOL(jbd2_journal_check_used_features); EXPORT_SYMBOL(jbd2_journal_check_available_features); EXPORT_SYMBOL(jbd2_journal_set_features); @@ -96,7 +95,6 @@ EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); EXPORT_SYMBOL(jbd2_inode_cache); -static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); static void __journal_abort_soft (journal_t *journal, int errno); static int jbd2_journal_create_slab(size_t slab_size); @@ -746,6 +744,98 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) return jbd2_journal_add_journal_head(bh); } +/* + * Return tid of the oldest transaction in the journal and block in the journal + * where the transaction starts. + * + * If the journal is now empty, return which will be the next transaction ID + * we will write and where will that transaction start. + * + * The return value is 0 if journal tail cannot be pushed any further, 1 if + * it can. + */ +int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, + unsigned long *block) +{ + transaction_t *transaction; + int ret; + + read_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + transaction = journal->j_checkpoint_transactions; + if (transaction) { + *tid = transaction->t_tid; + *block = transaction->t_log_start; + } else if ((transaction = journal->j_committing_transaction) != NULL) { + *tid = transaction->t_tid; + *block = transaction->t_log_start; + } else if ((transaction = journal->j_running_transaction) != NULL) { + *tid = transaction->t_tid; + *block = journal->j_head; + } else { + *tid = journal->j_transaction_sequence; + *block = journal->j_head; + } + ret = tid_gt(*tid, journal->j_tail_sequence); + spin_unlock(&journal->j_list_lock); + read_unlock(&journal->j_state_lock); + + return ret; +} + +/* + * Update information in journal structure and in on disk journal superblock + * about log tail. This function does not check whether information passed in + * really pushes log tail further. It's responsibility of the caller to make + * sure provided log tail information is valid (e.g. by holding + * j_checkpoint_mutex all the time between computing log tail and calling this + * function as is the case with jbd2_cleanup_journal_tail()). + * + * Requires j_checkpoint_mutex + */ +void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) +{ + unsigned long freed; + + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); + + /* + * We cannot afford for write to remain in drive's caches since as + * soon as we update j_tail, next transaction can start reusing journal + * space and if we lose sb update during power failure we'd replay + * old transaction with possibly newly overwritten data. + */ + jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA); + write_lock(&journal->j_state_lock); + freed = block - journal->j_tail; + if (block < journal->j_tail) + freed += journal->j_last - journal->j_first; + + trace_jbd2_update_log_tail(journal, tid, block, freed); + jbd_debug(1, + "Cleaning journal tail from %d to %d (offset %lu), " + "freeing %lu\n", + journal->j_tail_sequence, tid, block, freed); + + journal->j_free += freed; + journal->j_tail_sequence = tid; + journal->j_tail = block; + write_unlock(&journal->j_state_lock); +} + +/* + * This is a variaon of __jbd2_update_log_tail which checks for validity of + * provided log tail and locks j_checkpoint_mutex. So it is safe against races + * with other threads updating log tail. + */ +void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) +{ + mutex_lock(&journal->j_checkpoint_mutex); + if (tid_gt(tid, journal->j_tail_sequence)) + __jbd2_update_log_tail(journal, tid, block); + mutex_unlock(&journal->j_checkpoint_mutex); +} + struct jbd2_stats_proc_session { journal_t *journal; struct transaction_stats_s *stats; @@ -1114,40 +1204,45 @@ static int journal_reset(journal_t *journal) journal->j_max_transaction_buffers = journal->j_maxlen / 4; - /* Add the dynamic fields and write it to disk. */ - jbd2_journal_update_superblock(journal, 1); - return jbd2_journal_start_thread(journal); -} - -/** - * void jbd2_journal_update_superblock() - Update journal sb on disk. - * @journal: The journal to update. - * @wait: Set to '0' if you don't want to wait for IO completion. - * - * Update a journal's dynamic superblock fields and write it to disk, - * optionally waiting for the IO to complete. - */ -void jbd2_journal_update_superblock(journal_t *journal, int wait) -{ - journal_superblock_t *sb = journal->j_superblock; - struct buffer_head *bh = journal->j_sb_buffer; - /* * As a special case, if the on-disk copy is already marked as needing - * no recovery (s_start == 0) and there are no outstanding transactions - * in the filesystem, then we can safely defer the superblock update - * until the next commit by setting JBD2_FLUSHED. This avoids + * no recovery (s_start == 0), then we can safely defer the superblock + * update until the next commit by setting JBD2_FLUSHED. This avoids * attempting a write to a potential-readonly device. */ - if (sb->s_start == 0 && journal->j_tail_sequence == - journal->j_transaction_sequence) { + if (sb->s_start == 0) { jbd_debug(1, "JBD2: Skipping superblock update on recovered sb " "(start %ld, seq %d, errno %d)\n", journal->j_tail, journal->j_tail_sequence, journal->j_errno); - goto out; + journal->j_flags |= JBD2_FLUSHED; + } else { + /* Lock here to make assertions happy... */ + mutex_lock(&journal->j_checkpoint_mutex); + /* + * Update log tail information. We use WRITE_FUA since new + * transaction will start reusing journal space and so we + * must make sure information about current log tail is on + * disk before that. + */ + jbd2_journal_update_sb_log_tail(journal, + journal->j_tail_sequence, + journal->j_tail, + WRITE_FUA); + mutex_unlock(&journal->j_checkpoint_mutex); } + return jbd2_journal_start_thread(journal); +} +static void jbd2_write_superblock(journal_t *journal, int write_op) +{ + struct buffer_head *bh = journal->j_sb_buffer; + int ret; + + trace_jbd2_write_superblock(journal, write_op); + if (!(journal->j_flags & JBD2_BARRIER)) + write_op &= ~(REQ_FUA | REQ_FLUSH); + lock_buffer(bh); if (buffer_write_io_error(bh)) { /* * Oh, dear. A previous attempt to write the journal @@ -1163,48 +1258,106 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait) clear_buffer_write_io_error(bh); set_buffer_uptodate(bh); } + get_bh(bh); + bh->b_end_io = end_buffer_write_sync; + ret = submit_bh(write_op, bh); + wait_on_buffer(bh); + if (buffer_write_io_error(bh)) { + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + ret = -EIO; + } + if (ret) { + printk(KERN_ERR "JBD2: Error %d detected when updating " + "journal superblock for %s.\n", ret, + journal->j_devname); + } +} + +/** + * jbd2_journal_update_sb_log_tail() - Update log tail in journal sb on disk. + * @journal: The journal to update. + * @tail_tid: TID of the new transaction at the tail of the log + * @tail_block: The first block of the transaction at the tail of the log + * @write_op: With which operation should we write the journal sb + * + * Update a journal's superblock information about log tail and write it to + * disk, waiting for the IO to complete. + */ +void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, + unsigned long tail_block, int write_op) +{ + journal_superblock_t *sb = journal->j_superblock; + + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); + jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", + tail_block, tail_tid); + + sb->s_sequence = cpu_to_be32(tail_tid); + sb->s_start = cpu_to_be32(tail_block); + + jbd2_write_superblock(journal, write_op); + + /* Log is no longer empty */ + write_lock(&journal->j_state_lock); + WARN_ON(!sb->s_sequence); + journal->j_flags &= ~JBD2_FLUSHED; + write_unlock(&journal->j_state_lock); +} + +/** + * jbd2_mark_journal_empty() - Mark on disk journal as empty. + * @journal: The journal to update. + * + * Update a journal's dynamic superblock fields to show that journal is empty. + * Write updated superblock to disk waiting for IO to complete. + */ +static void jbd2_mark_journal_empty(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); read_lock(&journal->j_state_lock); - jbd_debug(1, "JBD2: updating superblock (start %ld, seq %d, errno %d)\n", - journal->j_tail, journal->j_tail_sequence, journal->j_errno); + jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n", + journal->j_tail_sequence); sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); - sb->s_start = cpu_to_be32(journal->j_tail); - sb->s_errno = cpu_to_be32(journal->j_errno); + sb->s_start = cpu_to_be32(0); read_unlock(&journal->j_state_lock); - BUFFER_TRACE(bh, "marking dirty"); - mark_buffer_dirty(bh); - if (wait) { - sync_dirty_buffer(bh); - if (buffer_write_io_error(bh)) { - printk(KERN_ERR "JBD2: I/O error detected " - "when updating journal superblock for %s.\n", - journal->j_devname); - clear_buffer_write_io_error(bh); - set_buffer_uptodate(bh); - } - } else - write_dirty_buffer(bh, WRITE); - -out: - /* If we have just flushed the log (by marking s_start==0), then - * any future commit will have to be careful to update the - * superblock again to re-record the true start of the log. */ + jbd2_write_superblock(journal, WRITE_FUA); + /* Log is no longer empty */ write_lock(&journal->j_state_lock); - if (sb->s_start) - journal->j_flags &= ~JBD2_FLUSHED; - else - journal->j_flags |= JBD2_FLUSHED; + journal->j_flags |= JBD2_FLUSHED; write_unlock(&journal->j_state_lock); } + +/** + * jbd2_journal_update_sb_errno() - Update error in the journal. + * @journal: The journal to update. + * + * Update a journal's errno. Write updated superblock to disk waiting for IO + * to complete. + */ +static void jbd2_journal_update_sb_errno(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + + read_lock(&journal->j_state_lock); + jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", + journal->j_errno); + sb->s_errno = cpu_to_be32(journal->j_errno); + read_unlock(&journal->j_state_lock); + + jbd2_write_superblock(journal, WRITE_SYNC); +} + /* * Read the superblock for a given journal, performing initial * validation of the format. */ - static int journal_get_superblock(journal_t *journal) { struct buffer_head *bh; @@ -1398,14 +1551,11 @@ int jbd2_journal_destroy(journal_t *journal) if (journal->j_sb_buffer) { if (!is_journal_aborted(journal)) { - /* We can now mark the journal as empty. */ - journal->j_tail = 0; - journal->j_tail_sequence = - ++journal->j_transaction_sequence; - jbd2_journal_update_superblock(journal, 1); - } else { + mutex_lock(&journal->j_checkpoint_mutex); + jbd2_mark_journal_empty(journal); + mutex_unlock(&journal->j_checkpoint_mutex); + } else err = -EIO; - } brelse(journal->j_sb_buffer); } @@ -1552,61 +1702,6 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat, EXPORT_SYMBOL(jbd2_journal_clear_features); /** - * int jbd2_journal_update_format () - Update on-disk journal structure. - * @journal: Journal to act on. - * - * Given an initialised but unloaded journal struct, poke about in the - * on-disk structure to update it to the most recent supported version. - */ -int jbd2_journal_update_format (journal_t *journal) -{ - journal_superblock_t *sb; - int err; - - err = journal_get_superblock(journal); - if (err) - return err; - - sb = journal->j_superblock; - - switch (be32_to_cpu(sb->s_header.h_blocktype)) { - case JBD2_SUPERBLOCK_V2: - return 0; - case JBD2_SUPERBLOCK_V1: - return journal_convert_superblock_v1(journal, sb); - default: - break; - } - return -EINVAL; -} - -static int journal_convert_superblock_v1(journal_t *journal, - journal_superblock_t *sb) -{ - int offset, blocksize; - struct buffer_head *bh; - - printk(KERN_WARNING - "JBD2: Converting superblock from version 1 to 2.\n"); - - /* Pre-initialise new fields to zero */ - offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb); - blocksize = be32_to_cpu(sb->s_blocksize); - memset(&sb->s_feature_compat, 0, blocksize-offset); - - sb->s_nr_users = cpu_to_be32(1); - sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2); - journal->j_format_version = 2; - - bh = journal->j_sb_buffer; - BUFFER_TRACE(bh, "marking dirty"); - mark_buffer_dirty(bh); - sync_dirty_buffer(bh); - return 0; -} - - -/** * int jbd2_journal_flush () - Flush journal * @journal: Journal to act on. * @@ -1619,7 +1714,6 @@ int jbd2_journal_flush(journal_t *journal) { int err = 0; transaction_t *transaction = NULL; - unsigned long old_tail; write_lock(&journal->j_state_lock); @@ -1654,6 +1748,7 @@ int jbd2_journal_flush(journal_t *journal) if (is_journal_aborted(journal)) return -EIO; + mutex_lock(&journal->j_checkpoint_mutex); jbd2_cleanup_journal_tail(journal); /* Finally, mark the journal as really needing no recovery. @@ -1661,14 +1756,9 @@ int jbd2_journal_flush(journal_t *journal) * the magic code for a fully-recovered superblock. Any future * commits of data to the journal will restore the current * s_start value. */ + jbd2_mark_journal_empty(journal); + mutex_unlock(&journal->j_checkpoint_mutex); write_lock(&journal->j_state_lock); - old_tail = journal->j_tail; - journal->j_tail = 0; - write_unlock(&journal->j_state_lock); - jbd2_journal_update_superblock(journal, 1); - write_lock(&journal->j_state_lock); - journal->j_tail = old_tail; - J_ASSERT(!journal->j_running_transaction); J_ASSERT(!journal->j_committing_transaction); J_ASSERT(!journal->j_checkpoint_transactions); @@ -1708,8 +1798,12 @@ int jbd2_journal_wipe(journal_t *journal, int write) write ? "Clearing" : "Ignoring"); err = jbd2_journal_skip_recovery(journal); - if (write) - jbd2_journal_update_superblock(journal, 1); + if (write) { + /* Lock to make assertions happy... */ + mutex_lock(&journal->j_checkpoint_mutex); + jbd2_mark_journal_empty(journal); + mutex_unlock(&journal->j_checkpoint_mutex); + } no_recovery: return err; @@ -1759,7 +1853,7 @@ static void __journal_abort_soft (journal_t *journal, int errno) __jbd2_journal_abort_hard(journal); if (errno) - jbd2_journal_update_superblock(journal, 1); + jbd2_journal_update_sb_errno(journal); } /** @@ -2017,7 +2111,7 @@ static struct kmem_cache *jbd2_journal_head_cache; static atomic_t nr_journal_heads = ATOMIC_INIT(0); #endif -static int journal_init_jbd2_journal_head_cache(void) +static int jbd2_journal_init_journal_head_cache(void) { int retval; @@ -2035,7 +2129,7 @@ static int journal_init_jbd2_journal_head_cache(void) return retval; } -static void jbd2_journal_destroy_jbd2_journal_head_cache(void) +static void jbd2_journal_destroy_journal_head_cache(void) { if (jbd2_journal_head_cache) { kmem_cache_destroy(jbd2_journal_head_cache); @@ -2323,7 +2417,7 @@ static void __exit jbd2_remove_jbd_stats_proc_entry(void) struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache; -static int __init journal_init_handle_cache(void) +static int __init jbd2_journal_init_handle_cache(void) { jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY); if (jbd2_handle_cache == NULL) { @@ -2358,17 +2452,20 @@ static int __init journal_init_caches(void) ret = jbd2_journal_init_revoke_caches(); if (ret == 0) - ret = journal_init_jbd2_journal_head_cache(); + ret = jbd2_journal_init_journal_head_cache(); + if (ret == 0) + ret = jbd2_journal_init_handle_cache(); if (ret == 0) - ret = journal_init_handle_cache(); + ret = jbd2_journal_init_transaction_cache(); return ret; } static void jbd2_journal_destroy_caches(void) { jbd2_journal_destroy_revoke_caches(); - jbd2_journal_destroy_jbd2_journal_head_cache(); + jbd2_journal_destroy_journal_head_cache(); jbd2_journal_destroy_handle_cache(); + jbd2_journal_destroy_transaction_cache(); jbd2_journal_destroy_slabs(); } diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index da6d7ba..c1a0335 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -21,6 +21,7 @@ #include <linux/jbd2.h> #include <linux/errno.h> #include <linux/crc32.h> +#include <linux/blkdev.h> #endif /* @@ -265,7 +266,9 @@ int jbd2_journal_recover(journal_t *journal) err2 = sync_blockdev(journal->j_fs_dev); if (!err) err = err2; - + /* Make sure all replayed data is on permanent storage */ + if (journal->j_flags & JBD2_BARRIER) + blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); return err; } diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 30b2867..6973705 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -208,17 +208,13 @@ int __init jbd2_journal_init_revoke_caches(void) J_ASSERT(!jbd2_revoke_record_cache); J_ASSERT(!jbd2_revoke_table_cache); - jbd2_revoke_record_cache = kmem_cache_create("jbd2_revoke_record", - sizeof(struct jbd2_revoke_record_s), - 0, - SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, - NULL); + jbd2_revoke_record_cache = KMEM_CACHE(jbd2_revoke_record_s, + SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY); if (!jbd2_revoke_record_cache) goto record_cache_failure; - jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table", - sizeof(struct jbd2_revoke_table_s), - 0, SLAB_TEMPORARY, NULL); + jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s, + SLAB_TEMPORARY); if (!jbd2_revoke_table_cache) goto table_cache_failure; return 0; diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e5aba56..ddcd354 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -33,6 +33,35 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); static void __jbd2_journal_unfile_buffer(struct journal_head *jh); +static struct kmem_cache *transaction_cache; +int __init jbd2_journal_init_transaction_cache(void) +{ + J_ASSERT(!transaction_cache); + transaction_cache = kmem_cache_create("jbd2_transaction_s", + sizeof(transaction_t), + 0, + SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, + NULL); + if (transaction_cache) + return 0; + return -ENOMEM; +} + +void jbd2_journal_destroy_transaction_cache(void) +{ + if (transaction_cache) { + kmem_cache_destroy(transaction_cache); + transaction_cache = NULL; + } +} + +void jbd2_journal_free_transaction(transaction_t *transaction) +{ + if (unlikely(ZERO_OR_NULL_PTR(transaction))) + return; + kmem_cache_free(transaction_cache, transaction); +} + /* * jbd2_get_transaction: obtain a new transaction_t object. * @@ -133,7 +162,8 @@ static int start_this_handle(journal_t *journal, handle_t *handle, alloc_transaction: if (!journal->j_running_transaction) { - new_transaction = kzalloc(sizeof(*new_transaction), gfp_mask); + new_transaction = kmem_cache_alloc(transaction_cache, + gfp_mask | __GFP_ZERO); if (!new_transaction) { /* * If __GFP_FS is not present, then we may be @@ -162,7 +192,7 @@ repeat: if (is_journal_aborted(journal) || (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) { read_unlock(&journal->j_state_lock); - kfree(new_transaction); + jbd2_journal_free_transaction(new_transaction); return -EROFS; } @@ -284,7 +314,7 @@ repeat: read_unlock(&journal->j_state_lock); lock_map_acquire(&handle->h_lockdep_map); - kfree(new_transaction); + jbd2_journal_free_transaction(new_transaction); return 0; } @@ -1549,9 +1579,9 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh) * of these pointers, it could go bad. Generally the caller needs to re-read * the pointer from the transaction_t. * - * Called under j_list_lock. The journal may not be locked. + * Called under j_list_lock. */ -void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) +static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) { struct journal_head **list = NULL; transaction_t *transaction; @@ -1646,10 +1676,8 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) spin_lock(&journal->j_list_lock); if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { /* written-back checkpointed metadata buffer */ - if (jh->b_jlist == BJ_None) { - JBUFFER_TRACE(jh, "remove from checkpoint list"); - __jbd2_journal_remove_checkpoint(jh); - } + JBUFFER_TRACE(jh, "remove from checkpoint list"); + __jbd2_journal_remove_checkpoint(jh); } spin_unlock(&journal->j_list_lock); out: @@ -1949,6 +1977,8 @@ zap_buffer_unlocked: clear_buffer_mapped(bh); clear_buffer_req(bh); clear_buffer_new(bh); + clear_buffer_delay(bh); + clear_buffer_unwritten(bh); bh->b_bdev = NULL; return may_free; } @@ -3,7 +3,7 @@ * Library for filesystems writers. */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/mount.h> diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c index f848b52..3ddcbb1 100644 --- a/fs/lockd/clnt4xdr.c +++ b/fs/lockd/clnt4xdr.c @@ -598,7 +598,7 @@ static struct rpc_procinfo nlm4_procedures[] = { PROC(GRANTED_RES, res, norep), }; -struct rpc_version nlm_version4 = { +const struct rpc_version nlm_version4 = { .number = 4, .nrprocs = ARRAY_SIZE(nlm4_procedures), .procs = nlm4_procedures, diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 8d4ea83..ba1dc2e 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -62,7 +62,8 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init) host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen, nlm_init->protocol, nlm_version, - nlm_init->hostname, nlm_init->noresvport); + nlm_init->hostname, nlm_init->noresvport, + nlm_init->net); if (host == NULL) { lockd_down(); return ERR_PTR(-ENOLCK); diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c index 180ac34..3d35e3e 100644 --- a/fs/lockd/clntxdr.c +++ b/fs/lockd/clntxdr.c @@ -596,19 +596,19 @@ static struct rpc_procinfo nlm_procedures[] = { PROC(GRANTED_RES, res, norep), }; -static struct rpc_version nlm_version1 = { +static const struct rpc_version nlm_version1 = { .number = 1, .nrprocs = ARRAY_SIZE(nlm_procedures), .procs = nlm_procedures, }; -static struct rpc_version nlm_version3 = { +static const struct rpc_version nlm_version3 = { .number = 3, .nrprocs = ARRAY_SIZE(nlm_procedures), .procs = nlm_procedures, }; -static struct rpc_version *nlm_versions[] = { +static const struct rpc_version *nlm_versions[] = { [1] = &nlm_version1, [3] = &nlm_version3, #ifdef CONFIG_LOCKD_V4 @@ -618,7 +618,7 @@ static struct rpc_version *nlm_versions[] = { static struct rpc_stat nlm_rpc_stats; -struct rpc_program nlm_program = { +const struct rpc_program nlm_program = { .name = "lockd", .number = NLM_PROGRAM, .nrvers = ARRAY_SIZE(nlm_versions), diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 6f29836..eb75ca7 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -17,6 +17,8 @@ #include <linux/lockd/lockd.h> #include <linux/mutex.h> +#include <linux/sunrpc/svc_xprt.h> + #include <net/ipv6.h> #define NLMDBG_FACILITY NLMDBG_HOSTCACHE @@ -54,6 +56,7 @@ struct nlm_lookup_host_info { const char *hostname; /* remote's hostname */ const size_t hostname_len; /* it's length */ const int noresvport; /* use non-priv port */ + struct net *net; /* network namespace to bind */ }; /* @@ -155,6 +158,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni, INIT_LIST_HEAD(&host->h_reclaim); host->h_nsmhandle = nsm; host->h_addrbuf = nsm->sm_addrbuf; + host->net = ni->net; out: return host; @@ -206,7 +210,8 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, const unsigned short protocol, const u32 version, const char *hostname, - int noresvport) + int noresvport, + struct net *net) { struct nlm_lookup_host_info ni = { .server = 0, @@ -217,6 +222,7 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, .hostname = hostname, .hostname_len = strlen(hostname), .noresvport = noresvport, + .net = net, }; struct hlist_head *chain; struct hlist_node *pos; @@ -231,6 +237,8 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, chain = &nlm_client_hosts[nlm_hash_address(sap)]; hlist_for_each_entry(host, pos, chain, h_hash) { + if (host->net != net) + continue; if (!rpc_cmp_addr(nlm_addr(host), sap)) continue; @@ -318,6 +326,7 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, struct nsm_handle *nsm = NULL; struct sockaddr *src_sap = svc_daddr(rqstp); size_t src_len = rqstp->rq_daddrlen; + struct net *net = rqstp->rq_xprt->xpt_net; struct nlm_lookup_host_info ni = { .server = 1, .sap = svc_addr(rqstp), @@ -326,6 +335,7 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, .version = rqstp->rq_vers, .hostname = hostname, .hostname_len = hostname_len, + .net = net, }; dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__, @@ -339,6 +349,8 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, chain = &nlm_server_hosts[nlm_hash_address(ni.sap)]; hlist_for_each_entry(host, pos, chain, h_hash) { + if (host->net != net) + continue; if (!rpc_cmp_addr(nlm_addr(host), ni.sap)) continue; @@ -431,7 +443,7 @@ nlm_bind_host(struct nlm_host *host) .to_retries = 5U, }; struct rpc_create_args args = { - .net = &init_net, + .net = host->net, .protocol = host->h_proto, .address = nlm_addr(host), .addrsize = host->h_addrlen, @@ -553,12 +565,8 @@ void nlm_host_rebooted(const struct nlm_reboot *info) nsm_release(nsm); } -/* - * Shut down the hosts module. - * Note that this routine is called only at server shutdown time. - */ void -nlm_shutdown_hosts(void) +nlm_shutdown_hosts_net(struct net *net) { struct hlist_head *chain; struct hlist_node *pos; @@ -570,6 +578,8 @@ nlm_shutdown_hosts(void) /* First, make all hosts eligible for gc */ dprintk("lockd: nuking all hosts...\n"); for_each_host(host, pos, chain, nlm_server_hosts) { + if (net && host->net != net) + continue; host->h_expires = jiffies - 1; if (host->h_rpcclnt) { rpc_shutdown_client(host->h_rpcclnt); @@ -580,15 +590,29 @@ nlm_shutdown_hosts(void) /* Then, perform a garbage collection pass */ nlm_gc_hosts(); mutex_unlock(&nlm_host_mutex); +} + +/* + * Shut down the hosts module. + * Note that this routine is called only at server shutdown time. + */ +void +nlm_shutdown_hosts(void) +{ + struct hlist_head *chain; + struct hlist_node *pos; + struct nlm_host *host; + + nlm_shutdown_hosts_net(NULL); /* complain if any hosts are left */ if (nrhosts != 0) { printk(KERN_WARNING "lockd: couldn't shutdown host module!\n"); dprintk("lockd: %lu hosts left:\n", nrhosts); for_each_host(host, pos, chain, nlm_server_hosts) { - dprintk(" %s (cnt %d use %d exp %ld)\n", + dprintk(" %s (cnt %d use %d exp %ld net %p)\n", host->h_name, atomic_read(&host->h_count), - host->h_inuse, host->h_expires); + host->h_inuse, host->h_expires, host->net); } } } diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 65ba36b..7ef14b3 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -47,7 +47,7 @@ struct nsm_res { u32 state; }; -static struct rpc_program nsm_program; +static const struct rpc_program nsm_program; static LIST_HEAD(nsm_handles); static DEFINE_SPINLOCK(nsm_lock); @@ -62,14 +62,14 @@ static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm) return (struct sockaddr *)&nsm->sm_addr; } -static struct rpc_clnt *nsm_create(void) +static struct rpc_clnt *nsm_create(struct net *net) { struct sockaddr_in sin = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_LOOPBACK), }; struct rpc_create_args args = { - .net = &init_net, + .net = net, .protocol = XPRT_TRANSPORT_UDP, .address = (struct sockaddr *)&sin, .addrsize = sizeof(sin), @@ -83,7 +83,8 @@ static struct rpc_clnt *nsm_create(void) return rpc_create(&args); } -static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) +static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res, + struct net *net) { struct rpc_clnt *clnt; int status; @@ -99,7 +100,7 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) .rpc_resp = res, }; - clnt = nsm_create(); + clnt = nsm_create(net); if (IS_ERR(clnt)) { status = PTR_ERR(clnt); dprintk("lockd: failed to create NSM upcall transport, " @@ -149,7 +150,7 @@ int nsm_monitor(const struct nlm_host *host) */ nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf; - status = nsm_mon_unmon(nsm, NSMPROC_MON, &res); + status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, host->net); if (unlikely(res.status != 0)) status = -EIO; if (unlikely(status < 0)) { @@ -183,7 +184,7 @@ void nsm_unmonitor(const struct nlm_host *host) && nsm->sm_monitored && !nsm->sm_sticky) { dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name); - status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res); + status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, host->net); if (res.status != 0) status = -EIO; if (status < 0) @@ -534,19 +535,19 @@ static struct rpc_procinfo nsm_procedures[] = { }, }; -static struct rpc_version nsm_version1 = { +static const struct rpc_version nsm_version1 = { .number = 1, .nrprocs = ARRAY_SIZE(nsm_procedures), .procs = nsm_procedures }; -static struct rpc_version * nsm_version[] = { +static const struct rpc_version *nsm_version[] = { [1] = &nsm_version1, }; static struct rpc_stat nsm_stats; -static struct rpc_program nsm_program = { +static const struct rpc_program nsm_program = { .name = "statd", .number = NSM_PROGRAM, .nrvers = ARRAY_SIZE(nsm_version), diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h new file mode 100644 index 0000000..ce227e0 --- /dev/null +++ b/fs/lockd/netns.h @@ -0,0 +1,12 @@ +#ifndef __LOCKD_NETNS_H__ +#define __LOCKD_NETNS_H__ + +#include <net/netns/generic.h> + +struct lockd_net { + unsigned int nlmsvc_users; +}; + +extern int lockd_net_id; + +#endif diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index c061b9a..2774e10 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -35,6 +35,8 @@ #include <linux/lockd/lockd.h> #include <linux/nfs.h> +#include "netns.h" + #define NLMDBG_FACILITY NLMDBG_SVC #define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE) #define ALLOWED_SIGS (sigmask(SIGKILL)) @@ -50,6 +52,8 @@ static struct task_struct *nlmsvc_task; static struct svc_rqst *nlmsvc_rqst; unsigned long nlmsvc_timeout; +int lockd_net_id; + /* * These can be set at insmod time (useful for NFS as root filesystem), * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003 @@ -189,27 +193,29 @@ lockd(void *vrqstp) } static int create_lockd_listener(struct svc_serv *serv, const char *name, - const int family, const unsigned short port) + struct net *net, const int family, + const unsigned short port) { struct svc_xprt *xprt; - xprt = svc_find_xprt(serv, name, family, 0); + xprt = svc_find_xprt(serv, name, net, family, 0); if (xprt == NULL) - return svc_create_xprt(serv, name, &init_net, family, port, + return svc_create_xprt(serv, name, net, family, port, SVC_SOCK_DEFAULTS); svc_xprt_put(xprt); return 0; } -static int create_lockd_family(struct svc_serv *serv, const int family) +static int create_lockd_family(struct svc_serv *serv, struct net *net, + const int family) { int err; - err = create_lockd_listener(serv, "udp", family, nlm_udpport); + err = create_lockd_listener(serv, "udp", net, family, nlm_udpport); if (err < 0) return err; - return create_lockd_listener(serv, "tcp", family, nlm_tcpport); + return create_lockd_listener(serv, "tcp", net, family, nlm_tcpport); } /* @@ -222,16 +228,16 @@ static int create_lockd_family(struct svc_serv *serv, const int family) * Returns zero if all listeners are available; otherwise a * negative errno value is returned. */ -static int make_socks(struct svc_serv *serv) +static int make_socks(struct svc_serv *serv, struct net *net) { static int warned; int err; - err = create_lockd_family(serv, PF_INET); + err = create_lockd_family(serv, net, PF_INET); if (err < 0) goto out_err; - err = create_lockd_family(serv, PF_INET6); + err = create_lockd_family(serv, net, PF_INET6); if (err < 0 && err != -EAFNOSUPPORT) goto out_err; @@ -245,6 +251,47 @@ out_err: return err; } +static int lockd_up_net(struct net *net) +{ + struct lockd_net *ln = net_generic(net, lockd_net_id); + struct svc_serv *serv = nlmsvc_rqst->rq_server; + int error; + + if (ln->nlmsvc_users) + return 0; + + error = svc_rpcb_setup(serv, net); + if (error) + goto err_rpcb; + + error = make_socks(serv, net); + if (error < 0) + goto err_socks; + return 0; + +err_socks: + svc_rpcb_cleanup(serv, net); +err_rpcb: + return error; +} + +static void lockd_down_net(struct net *net) +{ + struct lockd_net *ln = net_generic(net, lockd_net_id); + struct svc_serv *serv = nlmsvc_rqst->rq_server; + + if (ln->nlmsvc_users) { + if (--ln->nlmsvc_users == 0) { + nlm_shutdown_hosts_net(net); + svc_shutdown_net(serv, net); + } + } else { + printk(KERN_ERR "lockd_down_net: no users! task=%p, net=%p\n", + nlmsvc_task, net); + BUG(); + } +} + /* * Bring up the lockd process if it's not already up. */ @@ -252,13 +299,16 @@ int lockd_up(void) { struct svc_serv *serv; int error = 0; + struct net *net = current->nsproxy->net_ns; mutex_lock(&nlmsvc_mutex); /* * Check whether we're already up and running. */ - if (nlmsvc_rqst) + if (nlmsvc_rqst) { + error = lockd_up_net(net); goto out; + } /* * Sanity check: if there's no pid, @@ -275,7 +325,7 @@ int lockd_up(void) goto out; } - error = make_socks(serv); + error = make_socks(serv, net); if (error < 0) goto destroy_and_out; @@ -313,8 +363,12 @@ int lockd_up(void) destroy_and_out: svc_destroy(serv); out: - if (!error) + if (!error) { + struct lockd_net *ln = net_generic(net, lockd_net_id); + + ln->nlmsvc_users++; nlmsvc_users++; + } mutex_unlock(&nlmsvc_mutex); return error; } @@ -328,8 +382,10 @@ lockd_down(void) { mutex_lock(&nlmsvc_mutex); if (nlmsvc_users) { - if (--nlmsvc_users) + if (--nlmsvc_users) { + lockd_down_net(current->nsproxy->net_ns); goto out; + } } else { printk(KERN_ERR "lockd_down: no users! task=%p\n", nlmsvc_task); @@ -497,24 +553,55 @@ module_param_call(nlm_tcpport, param_set_port, param_get_int, module_param(nsm_use_hostnames, bool, 0644); module_param(nlm_max_connections, uint, 0644); +static int lockd_init_net(struct net *net) +{ + return 0; +} + +static void lockd_exit_net(struct net *net) +{ +} + +static struct pernet_operations lockd_net_ops = { + .init = lockd_init_net, + .exit = lockd_exit_net, + .id = &lockd_net_id, + .size = sizeof(struct lockd_net), +}; + + /* * Initialising and terminating the module. */ static int __init init_nlm(void) { + int err; + #ifdef CONFIG_SYSCTL + err = -ENOMEM; nlm_sysctl_table = register_sysctl_table(nlm_sysctl_root); - return nlm_sysctl_table ? 0 : -ENOMEM; -#else + if (nlm_sysctl_table == NULL) + goto err_sysctl; +#endif + err = register_pernet_subsys(&lockd_net_ops); + if (err) + goto err_pernet; return 0; + +err_pernet: +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(nlm_sysctl_table); #endif +err_sysctl: + return err; } static void __exit exit_nlm(void) { /* FIXME: delete all NLM clients */ nlm_shutdown_hosts(); + unregister_pernet_subsys(&lockd_net_ops); #ifdef CONFIG_SYSCTL unregister_sysctl_table(nlm_sysctl_table); #endif diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index f0179c3..e46353f 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -46,7 +46,6 @@ static void nlmsvc_remove_block(struct nlm_block *block); static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock); static void nlmsvc_freegrantargs(struct nlm_rqst *call); static const struct rpc_call_ops nlmsvc_grant_ops; -static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie); /* * The list of blocked locks to retry @@ -54,6 +53,35 @@ static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie); static LIST_HEAD(nlm_blocked); static DEFINE_SPINLOCK(nlm_blocked_lock); +#ifdef LOCKD_DEBUG +static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie) +{ + /* + * We can get away with a static buffer because we're only + * called with BKL held. + */ + static char buf[2*NLM_MAXCOOKIELEN+1]; + unsigned int i, len = sizeof(buf); + char *p = buf; + + len--; /* allow for trailing \0 */ + if (len < 3) + return "???"; + for (i = 0 ; i < cookie->len ; i++) { + if (len < 2) { + strcpy(p-3, "..."); + break; + } + sprintf(p, "%02x", cookie->data[i]); + p += 2; + len -= 2; + } + *p = '\0'; + + return buf; +} +#endif + /* * Insert a blocked lock into the global list */ @@ -935,32 +963,3 @@ nlmsvc_retry_blocked(void) return timeout; } - -#ifdef RPC_DEBUG -static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie) -{ - /* - * We can get away with a static buffer because we're only - * called with BKL held. - */ - static char buf[2*NLM_MAXCOOKIELEN+1]; - unsigned int i, len = sizeof(buf); - char *p = buf; - - len--; /* allow for trailing \0 */ - if (len < 3) - return "???"; - for (i = 0 ; i < cookie->len ; i++) { - if (len < 2) { - strcpy(p-3, "..."); - break; - } - sprintf(p, "%02x", cookie->data[i]); - p += 2; - len -= 2; - } - *p = '\0'; - - return buf; -} -#endif @@ -13,7 +13,7 @@ */ #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/mm.h> #include <linux/kdev_t.h> #include <linux/gfp.h> @@ -15,7 +15,7 @@ */ #include <linux/init.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/slab.h> #include <linux/fs.h> #include <linux/namei.h> diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index dbcd821..2a0e6c5 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -64,6 +64,7 @@ config NFS_V4 bool "NFS client support for NFS version 4" depends on NFS_FS select SUNRPC_GSS + select KEYS help This option enables support for version 4 of the NFS protocol (RFC 3530) in the kernel's NFS client. @@ -98,6 +99,18 @@ config PNFS_OBJLAYOUT depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD default m +config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN + string "NFSv4.1 Implementation ID Domain" + depends on NFS_V4_1 + default "kernel.org" + help + This option defines the domain portion of the implementation ID that + may be sent in the NFS exchange_id operation. The value must be in + the format of a DNS domain name and should be set to the DNS domain + name of the distribution. + If the NFS client is unchanged from the upstream kernel, this + option should be set to the default "kernel.org". + config ROOT_NFS bool "Root file system on NFS" depends on NFS_FS=y && IP_PNP @@ -130,16 +143,10 @@ config NFS_USE_KERNEL_DNS bool depends on NFS_V4 && !NFS_USE_LEGACY_DNS select DNS_RESOLVER - select KEYS default y -config NFS_USE_NEW_IDMAPPER - bool "Use the new idmapper upcall routine" - depends on NFS_V4 && KEYS - help - Say Y here if you want NFS to use the new idmapper upcall functions. - You will need /sbin/request-key (usually provided by the keyutils - package). For details, read - <file:Documentation/filesystems/nfs/idmapper.txt>. - - If you are unsure, say N. +config NFS_DEBUG + bool + depends on NFS_FS && SUNRPC_DEBUG + select CRC32 + default y diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 48cfac3..9c94297 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -46,9 +46,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>"); MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver"); -struct dentry *bl_device_pipe; -wait_queue_head_t bl_wq; - static void print_page(struct page *page) { dprintk("PRINTPAGE page %p\n", page); @@ -236,12 +233,11 @@ bl_read_pagelist(struct nfs_read_data *rdata) sector_t isect, extent_length = 0; struct parallel_io *par; loff_t f_offset = rdata->args.offset; - size_t count = rdata->args.count; struct page **pages = rdata->args.pages; int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT; - dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__, - rdata->npages, f_offset, count); + dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__, + rdata->npages, f_offset, (unsigned int)rdata->args.count); par = alloc_parallel(rdata); if (!par) @@ -1025,10 +1021,128 @@ static const struct rpc_pipe_ops bl_upcall_ops = { .destroy_msg = bl_pipe_destroy_msg, }; +static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb, + struct rpc_pipe *pipe) +{ + struct dentry *dir, *dentry; + + dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME); + if (dir == NULL) + return ERR_PTR(-ENOENT); + dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe); + dput(dir); + return dentry; +} + +static void nfs4blocklayout_unregister_sb(struct super_block *sb, + struct rpc_pipe *pipe) +{ + if (pipe->dentry) + rpc_unlink(pipe->dentry); +} + +static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, + void *ptr) +{ + struct super_block *sb = ptr; + struct net *net = sb->s_fs_info; + struct nfs_net *nn = net_generic(net, nfs_net_id); + struct dentry *dentry; + int ret = 0; + + if (!try_module_get(THIS_MODULE)) + return 0; + + if (nn->bl_device_pipe == NULL) { + module_put(THIS_MODULE); + return 0; + } + + switch (event) { + case RPC_PIPEFS_MOUNT: + dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe); + if (IS_ERR(dentry)) { + ret = PTR_ERR(dentry); + break; + } + nn->bl_device_pipe->dentry = dentry; + break; + case RPC_PIPEFS_UMOUNT: + if (nn->bl_device_pipe->dentry) + nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe); + break; + default: + ret = -ENOTSUPP; + break; + } + module_put(THIS_MODULE); + return ret; +} + +static struct notifier_block nfs4blocklayout_block = { + .notifier_call = rpc_pipefs_event, +}; + +static struct dentry *nfs4blocklayout_register_net(struct net *net, + struct rpc_pipe *pipe) +{ + struct super_block *pipefs_sb; + struct dentry *dentry; + + pipefs_sb = rpc_get_sb_net(net); + if (!pipefs_sb) + return NULL; + dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe); + rpc_put_sb_net(net); + return dentry; +} + +static void nfs4blocklayout_unregister_net(struct net *net, + struct rpc_pipe *pipe) +{ + struct super_block *pipefs_sb; + + pipefs_sb = rpc_get_sb_net(net); + if (pipefs_sb) { + nfs4blocklayout_unregister_sb(pipefs_sb, pipe); + rpc_put_sb_net(net); + } +} + +static int nfs4blocklayout_net_init(struct net *net) +{ + struct nfs_net *nn = net_generic(net, nfs_net_id); + struct dentry *dentry; + + init_waitqueue_head(&nn->bl_wq); + nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0); + if (IS_ERR(nn->bl_device_pipe)) + return PTR_ERR(nn->bl_device_pipe); + dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe); + if (IS_ERR(dentry)) { + rpc_destroy_pipe_data(nn->bl_device_pipe); + return PTR_ERR(dentry); + } + nn->bl_device_pipe->dentry = dentry; + return 0; +} + +static void nfs4blocklayout_net_exit(struct net *net) +{ + struct nfs_net *nn = net_generic(net, nfs_net_id); + + nfs4blocklayout_unregister_net(net, nn->bl_device_pipe); + rpc_destroy_pipe_data(nn->bl_device_pipe); + nn->bl_device_pipe = NULL; +} + +static struct pernet_operations nfs4blocklayout_net_ops = { + .init = nfs4blocklayout_net_init, + .exit = nfs4blocklayout_net_exit, +}; + static int __init nfs4blocklayout_init(void) { - struct vfsmount *mnt; - struct path path; int ret; dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__); @@ -1037,32 +1151,17 @@ static int __init nfs4blocklayout_init(void) if (ret) goto out; - init_waitqueue_head(&bl_wq); - - mnt = rpc_get_mount(); - if (IS_ERR(mnt)) { - ret = PTR_ERR(mnt); + ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block); + if (ret) goto out_remove; - } - - ret = vfs_path_lookup(mnt->mnt_root, - mnt, - NFS_PIPE_DIRNAME, 0, &path); + ret = register_pernet_subsys(&nfs4blocklayout_net_ops); if (ret) - goto out_putrpc; - - bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL, - &bl_upcall_ops, 0); - path_put(&path); - if (IS_ERR(bl_device_pipe)) { - ret = PTR_ERR(bl_device_pipe); - goto out_putrpc; - } + goto out_notifier; out: return ret; -out_putrpc: - rpc_put_mount(); +out_notifier: + rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); out_remove: pnfs_unregister_layoutdriver(&blocklayout_type); return ret; @@ -1073,9 +1172,9 @@ static void __exit nfs4blocklayout_exit(void) dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n", __func__); + rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); + unregister_pernet_subsys(&nfs4blocklayout_net_ops); pnfs_unregister_layoutdriver(&blocklayout_type); - rpc_unlink(bl_device_pipe); - rpc_put_mount(); } MODULE_ALIAS("nfs-layouttype4-3"); diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index e31a2df..0335069 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -37,6 +37,7 @@ #include <linux/sunrpc/rpc_pipe_fs.h> #include "../pnfs.h" +#include "../netns.h" #define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT) #define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT) @@ -50,6 +51,7 @@ struct pnfs_block_dev { struct list_head bm_node; struct nfs4_deviceid bm_mdevid; /* associated devid */ struct block_device *bm_mdev; /* meta device itself */ + struct net *net; }; enum exstate4 { @@ -151,9 +153,9 @@ BLK_LSEG2EXT(struct pnfs_layout_segment *lseg) return BLK_LO2EXT(lseg->pls_layout); } -struct bl_dev_msg { - int32_t status; - uint32_t major, minor; +struct bl_pipe_msg { + struct rpc_pipe_msg msg; + wait_queue_head_t *bl_wq; }; struct bl_msg_hdr { @@ -161,9 +163,6 @@ struct bl_msg_hdr { u16 totallen; /* length of entire message, including hdr itself */ }; -extern struct dentry *bl_device_pipe; -extern wait_queue_head_t bl_wq; - #define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */ #define BL_DEVICE_MOUNT 0x1 /* Mount--create devices*/ #define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */ diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c index d08ba91..a5c88a5 100644 --- a/fs/nfs/blocklayout/blocklayoutdev.c +++ b/fs/nfs/blocklayout/blocklayoutdev.c @@ -46,7 +46,7 @@ static int decode_sector_number(__be32 **rp, sector_t *sp) *rp = xdr_decode_hyper(*rp, &s); if (s & 0x1ff) { - printk(KERN_WARNING "%s: sector not aligned\n", __func__); + printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__); return -1; } *sp = s >> SECTOR_SHIFT; @@ -79,27 +79,30 @@ int nfs4_blkdev_put(struct block_device *bdev) return blkdev_put(bdev, FMODE_READ); } -static struct bl_dev_msg bl_mount_reply; - ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { + struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info, + nfs_net_id); + if (mlen != sizeof (struct bl_dev_msg)) return -EINVAL; - if (copy_from_user(&bl_mount_reply, src, mlen) != 0) + if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0) return -EFAULT; - wake_up(&bl_wq); + wake_up(&nn->bl_wq); return mlen; } void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) { + struct bl_pipe_msg *bl_pipe_msg = container_of(msg, struct bl_pipe_msg, msg); + if (msg->errno >= 0) return; - wake_up(&bl_wq); + wake_up(bl_pipe_msg->bl_wq); } /* @@ -111,29 +114,33 @@ nfs4_blk_decode_device(struct nfs_server *server, { struct pnfs_block_dev *rv; struct block_device *bd = NULL; - struct rpc_pipe_msg msg; + struct bl_pipe_msg bl_pipe_msg; + struct rpc_pipe_msg *msg = &bl_pipe_msg.msg; struct bl_msg_hdr bl_msg = { .type = BL_DEVICE_MOUNT, .totallen = dev->mincount, }; uint8_t *dataptr; DECLARE_WAITQUEUE(wq, current); - struct bl_dev_msg *reply = &bl_mount_reply; int offset, len, i, rc; + struct net *net = server->nfs_client->net; + struct nfs_net *nn = net_generic(net, nfs_net_id); + struct bl_dev_msg *reply = &nn->bl_mount_reply; dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, dev->mincount); - memset(&msg, 0, sizeof(msg)); - msg.data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS); - if (!msg.data) { + bl_pipe_msg.bl_wq = &nn->bl_wq; + memset(msg, 0, sizeof(*msg)); + msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS); + if (!msg->data) { rv = ERR_PTR(-ENOMEM); goto out; } - memcpy(msg.data, &bl_msg, sizeof(bl_msg)); - dataptr = (uint8_t *) msg.data; + memcpy(msg->data, &bl_msg, sizeof(bl_msg)); + dataptr = (uint8_t *) msg->data; len = dev->mincount; offset = sizeof(bl_msg); for (i = 0; len > 0; i++) { @@ -142,13 +149,13 @@ nfs4_blk_decode_device(struct nfs_server *server, len -= PAGE_CACHE_SIZE; offset += PAGE_CACHE_SIZE; } - msg.len = sizeof(bl_msg) + dev->mincount; + msg->len = sizeof(bl_msg) + dev->mincount; dprintk("%s CALLING USERSPACE DAEMON\n", __func__); - add_wait_queue(&bl_wq, &wq); - rc = rpc_queue_upcall(bl_device_pipe->d_inode, &msg); + add_wait_queue(&nn->bl_wq, &wq); + rc = rpc_queue_upcall(nn->bl_device_pipe, msg); if (rc < 0) { - remove_wait_queue(&bl_wq, &wq); + remove_wait_queue(&nn->bl_wq, &wq); rv = ERR_PTR(rc); goto out; } @@ -156,7 +163,7 @@ nfs4_blk_decode_device(struct nfs_server *server, set_current_state(TASK_UNINTERRUPTIBLE); schedule(); __set_current_state(TASK_RUNNING); - remove_wait_queue(&bl_wq, &wq); + remove_wait_queue(&nn->bl_wq, &wq); if (reply->status != BL_DEVICE_REQUEST_PROC) { dprintk("%s failed to open device: %d\n", @@ -181,13 +188,14 @@ nfs4_blk_decode_device(struct nfs_server *server, rv->bm_mdev = bd; memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid)); + rv->net = net; dprintk("%s Created device %s with bd_block_size %u\n", __func__, bd->bd_disk->disk_name, bd->bd_block_size); out: - kfree(msg.data); + kfree(msg->data); return rv; } diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c index d055c75..737d839 100644 --- a/fs/nfs/blocklayout/blocklayoutdm.c +++ b/fs/nfs/blocklayout/blocklayoutdm.c @@ -38,9 +38,10 @@ #define NFSDBG_FACILITY NFSDBG_PNFS_LD -static void dev_remove(dev_t dev) +static void dev_remove(struct net *net, dev_t dev) { - struct rpc_pipe_msg msg; + struct bl_pipe_msg bl_pipe_msg; + struct rpc_pipe_msg *msg = &bl_pipe_msg.msg; struct bl_dev_msg bl_umount_request; struct bl_msg_hdr bl_msg = { .type = BL_DEVICE_UMOUNT, @@ -48,36 +49,38 @@ static void dev_remove(dev_t dev) }; uint8_t *dataptr; DECLARE_WAITQUEUE(wq, current); + struct nfs_net *nn = net_generic(net, nfs_net_id); dprintk("Entering %s\n", __func__); - memset(&msg, 0, sizeof(msg)); - msg.data = kzalloc(1 + sizeof(bl_umount_request), GFP_NOFS); - if (!msg.data) + bl_pipe_msg.bl_wq = &nn->bl_wq; + memset(msg, 0, sizeof(*msg)); + msg->data = kzalloc(1 + sizeof(bl_umount_request), GFP_NOFS); + if (!msg->data) goto out; memset(&bl_umount_request, 0, sizeof(bl_umount_request)); bl_umount_request.major = MAJOR(dev); bl_umount_request.minor = MINOR(dev); - memcpy(msg.data, &bl_msg, sizeof(bl_msg)); - dataptr = (uint8_t *) msg.data; + memcpy(msg->data, &bl_msg, sizeof(bl_msg)); + dataptr = (uint8_t *) msg->data; memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request)); - msg.len = sizeof(bl_msg) + bl_msg.totallen; + msg->len = sizeof(bl_msg) + bl_msg.totallen; - add_wait_queue(&bl_wq, &wq); - if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) { - remove_wait_queue(&bl_wq, &wq); + add_wait_queue(&nn->bl_wq, &wq); + if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) { + remove_wait_queue(&nn->bl_wq, &wq); goto out; } set_current_state(TASK_UNINTERRUPTIBLE); schedule(); __set_current_state(TASK_RUNNING); - remove_wait_queue(&bl_wq, &wq); + remove_wait_queue(&nn->bl_wq, &wq); out: - kfree(msg.data); + kfree(msg->data); } /* @@ -90,10 +93,10 @@ static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev) dprintk("%s Releasing\n", __func__); rv = nfs4_blkdev_put(bdev->bm_mdev); if (rv) - printk(KERN_ERR "%s nfs4_blkdev_put returns %d\n", + printk(KERN_ERR "NFS: %s nfs4_blkdev_put returns %d\n", __func__, rv); - dev_remove(bdev->bm_mdev->bd_dev); + dev_remove(bdev->net, bdev->bm_mdev->bd_dev); } void bl_free_block_dev(struct pnfs_block_dev *bdev) diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c index 1abac09..1f9a603 100644 --- a/fs/nfs/blocklayout/extents.c +++ b/fs/nfs/blocklayout/extents.c @@ -147,7 +147,7 @@ static int _preload_range(struct pnfs_inval_markings *marks, count = (int)(end - start) / (int)tree->mtt_step_size; /* Pre-malloc what memory we might need */ - storage = kmalloc(sizeof(*storage) * count, GFP_NOFS); + storage = kcalloc(count, sizeof(*storage), GFP_NOFS); if (!storage) return -ENOMEM; for (i = 0; i < count; i++) { diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c index c98b439..dded263 100644 --- a/fs/nfs/cache_lib.c +++ b/fs/nfs/cache_lib.c @@ -13,6 +13,7 @@ #include <linux/slab.h> #include <linux/sunrpc/cache.h> #include <linux/sunrpc/rpc_pipe_fs.h> +#include <net/net_namespace.h> #include "cache_lib.h" @@ -111,30 +112,54 @@ int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq) return 0; } -int nfs_cache_register(struct cache_detail *cd) +int nfs_cache_register_sb(struct super_block *sb, struct cache_detail *cd) { - struct vfsmount *mnt; - struct path path; int ret; + struct dentry *dir; - mnt = rpc_get_mount(); - if (IS_ERR(mnt)) - return PTR_ERR(mnt); - ret = vfs_path_lookup(mnt->mnt_root, mnt, "/cache", 0, &path); - if (ret) - goto err; - ret = sunrpc_cache_register_pipefs(path.dentry, cd->name, 0600, cd); - path_put(&path); - if (!ret) - return ret; -err: - rpc_put_mount(); + dir = rpc_d_lookup_sb(sb, "cache"); + BUG_ON(dir == NULL); + ret = sunrpc_cache_register_pipefs(dir, cd->name, 0600, cd); + dput(dir); return ret; } -void nfs_cache_unregister(struct cache_detail *cd) +int nfs_cache_register_net(struct net *net, struct cache_detail *cd) { - sunrpc_cache_unregister_pipefs(cd); - rpc_put_mount(); + struct super_block *pipefs_sb; + int ret = 0; + + pipefs_sb = rpc_get_sb_net(net); + if (pipefs_sb) { + ret = nfs_cache_register_sb(pipefs_sb, cd); + rpc_put_sb_net(net); + } + return ret; +} + +void nfs_cache_unregister_sb(struct super_block *sb, struct cache_detail *cd) +{ + if (cd->u.pipefs.dir) + sunrpc_cache_unregister_pipefs(cd); +} + +void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd) +{ + struct super_block *pipefs_sb; + + pipefs_sb = rpc_get_sb_net(net); + if (pipefs_sb) { + nfs_cache_unregister_sb(pipefs_sb, cd); + rpc_put_sb_net(net); + } +} + +void nfs_cache_init(struct cache_detail *cd) +{ + sunrpc_init_cache_detail(cd); } +void nfs_cache_destroy(struct cache_detail *cd) +{ + sunrpc_destroy_cache_detail(cd); +} diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h index 7cf6caf..317db95 100644 --- a/fs/nfs/cache_lib.h +++ b/fs/nfs/cache_lib.h @@ -23,5 +23,11 @@ extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void); extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq); extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq); -extern int nfs_cache_register(struct cache_detail *cd); -extern void nfs_cache_unregister(struct cache_detail *cd); +extern void nfs_cache_init(struct cache_detail *cd); +extern void nfs_cache_destroy(struct cache_detail *cd); +extern int nfs_cache_register_net(struct net *net, struct cache_detail *cd); +extern void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd); +extern int nfs_cache_register_sb(struct super_block *sb, + struct cache_detail *cd); +extern void nfs_cache_unregister_sb(struct super_block *sb, + struct cache_detail *cd); diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 516f337..eb95f50 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -85,7 +85,7 @@ nfs4_callback_svc(void *vrqstp) } if (err < 0) { if (err != preverr) { - printk(KERN_WARNING "%s: unexpected error " + printk(KERN_WARNING "NFS: %s: unexpected error " "from svc_recv (%d)\n", __func__, err); preverr = err; } @@ -101,12 +101,12 @@ nfs4_callback_svc(void *vrqstp) /* * Prepare to bring up the NFSv4 callback service */ -struct svc_rqst * -nfs4_callback_up(struct svc_serv *serv) +static struct svc_rqst * +nfs4_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) { int ret; - ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET, + ret = svc_create_xprt(serv, "tcp", xprt->xprt_net, PF_INET, nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); if (ret <= 0) goto out_err; @@ -114,7 +114,7 @@ nfs4_callback_up(struct svc_serv *serv) dprintk("NFS: Callback listener port = %u (af %u)\n", nfs_callback_tcpport, PF_INET); - ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET6, + ret = svc_create_xprt(serv, "tcp", xprt->xprt_net, PF_INET6, nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); if (ret > 0) { nfs_callback_tcpport6 = ret; @@ -172,7 +172,7 @@ nfs41_callback_svc(void *vrqstp) /* * Bring up the NFSv4.1 callback service */ -struct svc_rqst * +static struct svc_rqst * nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) { struct svc_rqst *rqstp; @@ -183,7 +183,7 @@ nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) * fore channel connection. * Returns the input port (0) and sets the svc_serv bc_xprt on success */ - ret = svc_create_xprt(serv, "tcp-bc", &init_net, PF_INET, 0, + ret = svc_create_xprt(serv, "tcp-bc", xprt->xprt_net, PF_INET, 0, SVC_SOCK_ANONYMOUS); if (ret < 0) { rqstp = ERR_PTR(ret); @@ -269,7 +269,7 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) serv, xprt, &rqstp, &callback_svc); if (!minorversion_setup) { /* v4.0 callback setup */ - rqstp = nfs4_callback_up(serv); + rqstp = nfs4_callback_up(serv, xprt); callback_svc = nfs4_callback_svc; } @@ -332,7 +332,6 @@ void nfs_callback_down(int minorversion) int check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp) { - struct rpc_clnt *r = clp->cl_rpcclient; char *p = svc_gss_principal(rqstp); if (rqstp->rq_authop->flavour != RPC_AUTH_GSS) @@ -353,7 +352,7 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp) if (memcmp(p, "nfs@", 4) != 0) return 0; p += 4; - if (strcmp(p, r->cl_server) != 0) + if (strcmp(p, clp->cl_hostname) != 0) return 0; return 1; } diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index c89d3b9..a5527c9 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -38,7 +38,8 @@ enum nfs4_callback_opnum { struct cb_process_state { __be32 drc_status; struct nfs_client *clp; - int slotid; + u32 slotid; + struct net *net; }; struct cb_compound_hdr_arg { diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 54cea8a..1b5d809 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -8,6 +8,7 @@ #include <linux/nfs4.h> #include <linux/nfs_fs.h> #include <linux/slab.h> +#include <linux/rcupdate.h> #include "nfs4_fs.h" #include "callback.h" #include "delegation.h" @@ -33,7 +34,7 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, res->bitmap[0] = res->bitmap[1] = 0; res->status = htonl(NFS4ERR_BADHANDLE); - dprintk("NFS: GETATTR callback request from %s\n", + dprintk_rcu("NFS: GETATTR callback request from %s\n", rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); inode = nfs_delegation_find_inode(cps->clp, &args->fh); @@ -73,7 +74,7 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */ goto out; - dprintk("NFS: RECALL callback request from %s\n", + dprintk_rcu("NFS: RECALL callback request from %s\n", rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); res = htonl(NFS4ERR_BADHANDLE); @@ -86,8 +87,7 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, res = 0; break; case -ENOENT: - if (res != 0) - res = htonl(NFS4ERR_BAD_STATEID); + res = htonl(NFS4ERR_BAD_STATEID); break; default: res = htonl(NFS4ERR_RESOURCE); @@ -98,52 +98,64 @@ out: return res; } -int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) -{ - if (delegation == NULL || memcmp(delegation->stateid.data, stateid->data, - sizeof(delegation->stateid.data)) != 0) - return 0; - return 1; -} - #if defined(CONFIG_NFS_V4_1) -static u32 initiate_file_draining(struct nfs_client *clp, - struct cb_layoutrecallargs *args) +/* + * Lookup a layout by filehandle. + * + * Note: gets a refcount on the layout hdr and on its respective inode. + * Caller must put the layout hdr and the inode. + * + * TODO: keep track of all layouts (and delegations) in a hash table + * hashed by filehandle. + */ +static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, struct nfs_fh *fh) { struct nfs_server *server; - struct pnfs_layout_hdr *lo; struct inode *ino; - bool found = false; - u32 rv = NFS4ERR_NOMATCHING_LAYOUT; - LIST_HEAD(free_me_list); + struct pnfs_layout_hdr *lo; - spin_lock(&clp->cl_lock); - rcu_read_lock(); list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { list_for_each_entry(lo, &server->layouts, plh_layouts) { - if (nfs_compare_fh(&args->cbl_fh, - &NFS_I(lo->plh_inode)->fh)) + if (nfs_compare_fh(fh, &NFS_I(lo->plh_inode)->fh)) continue; ino = igrab(lo->plh_inode); if (!ino) continue; - found = true; - /* Without this, layout can be freed as soon - * as we release cl_lock. - */ get_layout_hdr(lo); - break; + return lo; } - if (found) - break; } + + return NULL; +} + +static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, struct nfs_fh *fh) +{ + struct pnfs_layout_hdr *lo; + + spin_lock(&clp->cl_lock); + rcu_read_lock(); + lo = get_layout_by_fh_locked(clp, fh); rcu_read_unlock(); spin_unlock(&clp->cl_lock); - if (!found) + return lo; +} + +static u32 initiate_file_draining(struct nfs_client *clp, + struct cb_layoutrecallargs *args) +{ + struct inode *ino; + struct pnfs_layout_hdr *lo; + u32 rv = NFS4ERR_NOMATCHING_LAYOUT; + LIST_HEAD(free_me_list); + + lo = get_layout_by_fh(clp, &args->cbl_fh); + if (!lo) return NFS4ERR_NOMATCHING_LAYOUT; + ino = lo->plh_inode; spin_lock(&ino->i_lock); if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || mark_matching_lsegs_invalid(lo, &free_me_list, @@ -213,17 +225,13 @@ static u32 initiate_bulk_draining(struct nfs_client *clp, static u32 do_callback_layoutrecall(struct nfs_client *clp, struct cb_layoutrecallargs *args) { - u32 res = NFS4ERR_DELAY; + u32 res; dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type); - if (test_and_set_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state)) - goto out; if (args->cbl_recall_type == RETURN_FILE) res = initiate_file_draining(clp, args); else res = initiate_bulk_draining(clp, args); - clear_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state); -out: dprintk("%s returning %i\n", __func__, res); return res; @@ -303,21 +311,6 @@ out: return res; } -int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) -{ - if (delegation == NULL) - return 0; - - if (stateid->stateid.seqid != 0) - return 0; - if (memcmp(&delegation->stateid.stateid.other, - &stateid->stateid.other, - NFS4_STATEID_OTHER_SIZE)) - return 0; - - return 1; -} - /* * Validate the sequenceID sent by the server. * Return success if the sequenceID is one more than what we last saw on @@ -441,7 +434,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, int i; __be32 status = htonl(NFS4ERR_BADSESSION); - clp = nfs4_find_client_sessionid(args->csa_addr, &args->csa_sessionid); + clp = nfs4_find_client_sessionid(cps->net, args->csa_addr, &args->csa_sessionid); if (clp == NULL) goto out; @@ -517,7 +510,7 @@ __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy, if (!cps->clp) /* set in cb_sequence */ goto out; - dprintk("NFS: RECALL_ANY callback request from %s\n", + dprintk_rcu("NFS: RECALL_ANY callback request from %s\n", rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); status = cpu_to_be32(NFS4ERR_INVAL); @@ -552,7 +545,7 @@ __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy, if (!cps->clp) /* set in cb_sequence */ goto out; - dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n", + dprintk_rcu("NFS: CB_RECALL_SLOT request from %s target max slots %d\n", rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR), args->crsa_target_max_slots); diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index d50b274..95bfc24 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -9,6 +9,8 @@ #include <linux/sunrpc/svc.h> #include <linux/nfs4.h> #include <linux/nfs_fs.h> +#include <linux/ratelimit.h> +#include <linux/printk.h> #include <linux/slab.h> #include <linux/sunrpc/bc_xprt.h> #include "nfs4_fs.h" @@ -73,7 +75,7 @@ static __be32 *read_buf(struct xdr_stream *xdr, int nbytes) p = xdr_inline_decode(xdr, nbytes); if (unlikely(p == NULL)) - printk(KERN_WARNING "NFSv4 callback reply buffer overflowed!\n"); + printk(KERN_WARNING "NFS: NFSv4 callback reply buffer overflowed!\n"); return p; } @@ -138,10 +140,10 @@ static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) { __be32 *p; - p = read_buf(xdr, 16); + p = read_buf(xdr, NFS4_STATEID_SIZE); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); - memcpy(stateid->data, p, 16); + memcpy(stateid, p, NFS4_STATEID_SIZE); return 0; } @@ -155,7 +157,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound return status; /* We do not like overly long tags! */ if (hdr->taglen > CB_OP_TAGLEN_MAXSZ - 12) { - printk("NFSv4 CALLBACK %s: client sent tag of length %u\n", + printk("NFS: NFSv4 CALLBACK %s: client sent tag of length %u\n", __func__, hdr->taglen); return htonl(NFS4ERR_RESOURCE); } @@ -167,7 +169,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound if (hdr->minorversion <= 1) { hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 */ } else { - printk(KERN_WARNING "%s: NFSv4 server callback with " + pr_warn_ratelimited("NFS: %s: NFSv4 server callback with " "illegal minor version %u!\n", __func__, hdr->minorversion); return htonl(NFS4ERR_MINOR_VERS_MISMATCH); @@ -759,14 +761,14 @@ static void nfs4_callback_free_slot(struct nfs4_session *session) * Let the state manager know callback processing done. * A single slot, so highest used slotid is either 0 or -1 */ - tbl->highest_used_slotid = -1; + tbl->highest_used_slotid = NFS4_NO_SLOT; nfs4_check_drain_bc_complete(session); spin_unlock(&tbl->slot_tbl_lock); } static void nfs4_cb_free_slot(struct cb_process_state *cps) { - if (cps->slotid != -1) + if (cps->slotid != NFS4_NO_SLOT) nfs4_callback_free_slot(cps->clp->cl_session); } @@ -860,7 +862,8 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r struct cb_process_state cps = { .drc_status = 0, .clp = NULL, - .slotid = -1, + .slotid = NFS4_NO_SLOT, + .net = rqstp->rq_xprt->xpt_net, }; unsigned int nops = 0; @@ -876,7 +879,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r return rpc_garbage_args; if (hdr_arg.minorversion == 0) { - cps.clp = nfs4_find_client_ident(hdr_arg.cb_ident); + cps.clp = nfs4_find_client_ident(rqstp->rq_xprt->xpt_net, hdr_arg.cb_ident); if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp)) return rpc_drop_reply; } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index d4f772e..4a108a0 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -40,6 +40,8 @@ #include <net/ipv6.h> #include <linux/nfs_xdr.h> #include <linux/sunrpc/bc_xprt.h> +#include <linux/nsproxy.h> +#include <linux/pid_namespace.h> #include <asm/system.h> @@ -50,15 +52,12 @@ #include "internal.h" #include "fscache.h" #include "pnfs.h" +#include "netns.h" #define NFSDBG_FACILITY NFSDBG_CLIENT -static DEFINE_SPINLOCK(nfs_client_lock); -static LIST_HEAD(nfs_client_list); -static LIST_HEAD(nfs_volume_list); static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq); #ifdef CONFIG_NFS_V4 -static DEFINE_IDR(cb_ident_idr); /* Protected by nfs_client_lock */ /* * Get a unique NFSv4.0 callback identifier which will be used @@ -67,15 +66,16 @@ static DEFINE_IDR(cb_ident_idr); /* Protected by nfs_client_lock */ static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion) { int ret = 0; + struct nfs_net *nn = net_generic(clp->net, nfs_net_id); if (clp->rpc_ops->version != 4 || minorversion != 0) return ret; retry: - if (!idr_pre_get(&cb_ident_idr, GFP_KERNEL)) + if (!idr_pre_get(&nn->cb_ident_idr, GFP_KERNEL)) return -ENOMEM; - spin_lock(&nfs_client_lock); - ret = idr_get_new(&cb_ident_idr, clp, &clp->cl_cb_ident); - spin_unlock(&nfs_client_lock); + spin_lock(&nn->nfs_client_lock); + ret = idr_get_new(&nn->cb_ident_idr, clp, &clp->cl_cb_ident); + spin_unlock(&nn->nfs_client_lock); if (ret == -EAGAIN) goto retry; return ret; @@ -90,7 +90,7 @@ static bool nfs4_disable_idmapping = true; /* * RPC cruft for NFS */ -static struct rpc_version *nfs_version[5] = { +static const struct rpc_version *nfs_version[5] = { [2] = &nfs_version2, #ifdef CONFIG_NFS_V3 [3] = &nfs_version3, @@ -100,7 +100,7 @@ static struct rpc_version *nfs_version[5] = { #endif }; -struct rpc_program nfs_program = { +const struct rpc_program nfs_program = { .name = "nfs", .number = NFS_PROGRAM, .nrvers = ARRAY_SIZE(nfs_version), @@ -116,11 +116,11 @@ struct rpc_stat nfs_rpcstat = { #ifdef CONFIG_NFS_V3_ACL static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program }; -static struct rpc_version * nfsacl_version[] = { +static const struct rpc_version *nfsacl_version[] = { [3] = &nfsacl_version3, }; -struct rpc_program nfsacl_program = { +const struct rpc_program nfsacl_program = { .name = "nfsacl", .number = NFS_ACL_PROGRAM, .nrvers = ARRAY_SIZE(nfsacl_version), @@ -136,6 +136,7 @@ struct nfs_client_initdata { const struct nfs_rpc_ops *rpc_ops; int proto; u32 minorversion; + struct net *net; }; /* @@ -172,6 +173,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ clp->cl_rpcclient = ERR_PTR(-EINVAL); clp->cl_proto = cl_init->proto; + clp->net = get_net(cl_init->net); #ifdef CONFIG_NFS_V4 err = nfs_get_cb_ident_idr(clp, cl_init->minorversion); @@ -203,8 +205,11 @@ error_0: #ifdef CONFIG_NFS_V4_1 static void nfs4_shutdown_session(struct nfs_client *clp) { - if (nfs4_has_session(clp)) + if (nfs4_has_session(clp)) { + nfs4_deviceid_purge_client(clp); nfs4_destroy_session(clp->cl_session); + } + } #else /* CONFIG_NFS_V4_1 */ static void nfs4_shutdown_session(struct nfs_client *clp) @@ -234,16 +239,20 @@ static void nfs4_shutdown_client(struct nfs_client *clp) } /* idr_remove_all is not needed as all id's are removed by nfs_put_client */ -void nfs_cleanup_cb_ident_idr(void) +void nfs_cleanup_cb_ident_idr(struct net *net) { - idr_destroy(&cb_ident_idr); + struct nfs_net *nn = net_generic(net, nfs_net_id); + + idr_destroy(&nn->cb_ident_idr); } /* nfs_client_lock held */ static void nfs_cb_idr_remove_locked(struct nfs_client *clp) { + struct nfs_net *nn = net_generic(clp->net, nfs_net_id); + if (clp->cl_cb_ident) - idr_remove(&cb_ident_idr, clp->cl_cb_ident); + idr_remove(&nn->cb_ident_idr, clp->cl_cb_ident); } static void pnfs_init_server(struct nfs_server *server) @@ -261,7 +270,7 @@ static void nfs4_shutdown_client(struct nfs_client *clp) { } -void nfs_cleanup_cb_ident_idr(void) +void nfs_cleanup_cb_ident_idr(struct net *net) { } @@ -293,10 +302,10 @@ static void nfs_free_client(struct nfs_client *clp) if (clp->cl_machine_cred != NULL) put_rpccred(clp->cl_machine_cred); - nfs4_deviceid_purge_client(clp); - + put_net(clp->net); kfree(clp->cl_hostname); kfree(clp->server_scope); + kfree(clp->impl_id); kfree(clp); dprintk("<-- nfs_free_client()\n"); @@ -307,15 +316,18 @@ static void nfs_free_client(struct nfs_client *clp) */ void nfs_put_client(struct nfs_client *clp) { + struct nfs_net *nn; + if (!clp) return; dprintk("--> nfs_put_client({%d})\n", atomic_read(&clp->cl_count)); + nn = net_generic(clp->net, nfs_net_id); - if (atomic_dec_and_lock(&clp->cl_count, &nfs_client_lock)) { + if (atomic_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) { list_del(&clp->cl_share_link); nfs_cb_idr_remove_locked(clp); - spin_unlock(&nfs_client_lock); + spin_unlock(&nn->nfs_client_lock); BUG_ON(!list_empty(&clp->cl_superblocks)); @@ -393,6 +405,7 @@ static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1, (sin1->sin_port == sin2->sin_port); } +#if defined(CONFIG_NFS_V4_1) /* * Test if two socket addresses represent the same actual socket, * by comparing (only) relevant fields, excluding the port number. @@ -411,6 +424,7 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, } return 0; } +#endif /* CONFIG_NFS_V4_1 */ /* * Test if two socket addresses represent the same actual socket, @@ -431,10 +445,10 @@ static int nfs_sockaddr_cmp(const struct sockaddr *sa1, return 0; } +#if defined(CONFIG_NFS_V4_1) /* Common match routine for v4.0 and v4.1 callback services */ -bool -nfs4_cb_match_client(const struct sockaddr *addr, struct nfs_client *clp, - u32 minorversion) +static bool nfs4_cb_match_client(const struct sockaddr *addr, + struct nfs_client *clp, u32 minorversion) { struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; @@ -454,6 +468,7 @@ nfs4_cb_match_client(const struct sockaddr *addr, struct nfs_client *clp, return true; } +#endif /* CONFIG_NFS_V4_1 */ /* * Find an nfs_client on the list that matches the initialisation data @@ -463,8 +478,9 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat { struct nfs_client *clp; const struct sockaddr *sap = data->addr; + struct nfs_net *nn = net_generic(data->net, nfs_net_id); - list_for_each_entry(clp, &nfs_client_list, cl_share_link) { + list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; /* Don't match clients that failed to initialise properly */ if (clp->cl_cons_state < 0) @@ -502,13 +518,14 @@ nfs_get_client(const struct nfs_client_initdata *cl_init, { struct nfs_client *clp, *new = NULL; int error; + struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id); dprintk("--> nfs_get_client(%s,v%u)\n", cl_init->hostname ?: "", cl_init->rpc_ops->version); /* see if the client already exists */ do { - spin_lock(&nfs_client_lock); + spin_lock(&nn->nfs_client_lock); clp = nfs_match_client(cl_init); if (clp) @@ -516,7 +533,7 @@ nfs_get_client(const struct nfs_client_initdata *cl_init, if (new) goto install_client; - spin_unlock(&nfs_client_lock); + spin_unlock(&nn->nfs_client_lock); new = nfs_alloc_client(cl_init); } while (!IS_ERR(new)); @@ -527,8 +544,8 @@ nfs_get_client(const struct nfs_client_initdata *cl_init, /* install a new client and return with it unready */ install_client: clp = new; - list_add(&clp->cl_share_link, &nfs_client_list); - spin_unlock(&nfs_client_lock); + list_add(&clp->cl_share_link, &nn->nfs_client_list); + spin_unlock(&nn->nfs_client_lock); error = cl_init->rpc_ops->init_client(clp, timeparms, ip_addr, authflavour, noresvport); @@ -543,7 +560,7 @@ install_client: * - make sure it's ready before returning */ found_client: - spin_unlock(&nfs_client_lock); + spin_unlock(&nn->nfs_client_lock); if (new) nfs_free_client(new); @@ -643,7 +660,7 @@ static int nfs_create_rpc_client(struct nfs_client *clp, { struct rpc_clnt *clnt = NULL; struct rpc_create_args args = { - .net = &init_net, + .net = clp->net, .protocol = clp->cl_proto, .address = (struct sockaddr *)&clp->cl_addr, .addrsize = clp->cl_addrlen, @@ -697,6 +714,7 @@ static int nfs_start_lockd(struct nfs_server *server) .nfs_version = clp->rpc_ops->version, .noresvport = server->flags & NFS_MOUNT_NORESVPORT ? 1 : 0, + .net = clp->net, }; if (nlm_init.nfs_version > 3) @@ -832,6 +850,7 @@ static int nfs_init_server(struct nfs_server *server, .addrlen = data->nfs_server.addrlen, .rpc_ops = &nfs_v2_clientops, .proto = data->nfs_server.protocol, + .net = data->net, }; struct rpc_timeout timeparms; struct nfs_client *clp; @@ -1030,25 +1049,30 @@ static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_serve static void nfs_server_insert_lists(struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; + struct nfs_net *nn = net_generic(clp->net, nfs_net_id); - spin_lock(&nfs_client_lock); + spin_lock(&nn->nfs_client_lock); list_add_tail_rcu(&server->client_link, &clp->cl_superblocks); - list_add_tail(&server->master_link, &nfs_volume_list); + list_add_tail(&server->master_link, &nn->nfs_volume_list); clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); - spin_unlock(&nfs_client_lock); + spin_unlock(&nn->nfs_client_lock); } static void nfs_server_remove_lists(struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; + struct nfs_net *nn; - spin_lock(&nfs_client_lock); + if (clp == NULL) + return; + nn = net_generic(clp->net, nfs_net_id); + spin_lock(&nn->nfs_client_lock); list_del_rcu(&server->client_link); - if (clp && list_empty(&clp->cl_superblocks)) + if (list_empty(&clp->cl_superblocks)) set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); list_del(&server->master_link); - spin_unlock(&nfs_client_lock); + spin_unlock(&nn->nfs_client_lock); synchronize_rcu(); } @@ -1087,6 +1111,8 @@ static struct nfs_server *nfs_alloc_server(void) return NULL; } + ida_init(&server->openowner_id); + ida_init(&server->lockowner_id); pnfs_init_server(server); return server; @@ -1112,6 +1138,8 @@ void nfs_free_server(struct nfs_server *server) nfs_put_client(server->nfs_client); + ida_destroy(&server->lockowner_id); + ida_destroy(&server->openowner_id); nfs_free_iostats(server->io_stats); bdi_destroy(&server->backing_dev_info); kfree(server); @@ -1190,45 +1218,19 @@ error: /* * NFSv4.0 callback thread helper * - * Find a client by IP address, protocol version, and minorversion - * - * Called from the pg_authenticate method. The callback identifier - * is not used as it has not been decoded. - * - * Returns NULL if no such client - */ -struct nfs_client * -nfs4_find_client_no_ident(const struct sockaddr *addr) -{ - struct nfs_client *clp; - - spin_lock(&nfs_client_lock); - list_for_each_entry(clp, &nfs_client_list, cl_share_link) { - if (nfs4_cb_match_client(addr, clp, 0) == false) - continue; - atomic_inc(&clp->cl_count); - spin_unlock(&nfs_client_lock); - return clp; - } - spin_unlock(&nfs_client_lock); - return NULL; -} - -/* - * NFSv4.0 callback thread helper - * * Find a client by callback identifier */ struct nfs_client * -nfs4_find_client_ident(int cb_ident) +nfs4_find_client_ident(struct net *net, int cb_ident) { struct nfs_client *clp; + struct nfs_net *nn = net_generic(net, nfs_net_id); - spin_lock(&nfs_client_lock); - clp = idr_find(&cb_ident_idr, cb_ident); + spin_lock(&nn->nfs_client_lock); + clp = idr_find(&nn->cb_ident_idr, cb_ident); if (clp) atomic_inc(&clp->cl_count); - spin_unlock(&nfs_client_lock); + spin_unlock(&nn->nfs_client_lock); return clp; } @@ -1241,13 +1243,14 @@ nfs4_find_client_ident(int cb_ident) * Returns NULL if no such client */ struct nfs_client * -nfs4_find_client_sessionid(const struct sockaddr *addr, +nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, struct nfs4_sessionid *sid) { struct nfs_client *clp; + struct nfs_net *nn = net_generic(net, nfs_net_id); - spin_lock(&nfs_client_lock); - list_for_each_entry(clp, &nfs_client_list, cl_share_link) { + spin_lock(&nn->nfs_client_lock); + list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { if (nfs4_cb_match_client(addr, clp, 1) == false) continue; @@ -1260,17 +1263,17 @@ nfs4_find_client_sessionid(const struct sockaddr *addr, continue; atomic_inc(&clp->cl_count); - spin_unlock(&nfs_client_lock); + spin_unlock(&nn->nfs_client_lock); return clp; } - spin_unlock(&nfs_client_lock); + spin_unlock(&nn->nfs_client_lock); return NULL; } #else /* CONFIG_NFS_V4_1 */ struct nfs_client * -nfs4_find_client_sessionid(const struct sockaddr *addr, +nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, struct nfs4_sessionid *sid) { return NULL; @@ -1285,16 +1288,18 @@ static int nfs4_init_callback(struct nfs_client *clp) int error; if (clp->rpc_ops->version == 4) { + struct rpc_xprt *xprt; + + xprt = rcu_dereference_raw(clp->cl_rpcclient->cl_xprt); + if (nfs4_has_session(clp)) { - error = xprt_setup_backchannel( - clp->cl_rpcclient->cl_xprt, + error = xprt_setup_backchannel(xprt, NFS41_BC_MIN_CALLBACKS); if (error < 0) return error; } - error = nfs_callback_up(clp->cl_mvops->minor_version, - clp->cl_rpcclient->cl_xprt); + error = nfs_callback_up(clp->cl_mvops->minor_version, xprt); if (error < 0) { dprintk("%s: failed to start callback. Error = %d\n", __func__, error); @@ -1345,6 +1350,7 @@ int nfs4_init_client(struct nfs_client *clp, rpc_authflavor_t authflavour, int noresvport) { + char buf[INET6_ADDRSTRLEN + 1]; int error; if (clp->cl_cons_state == NFS_CS_READY) { @@ -1360,6 +1366,20 @@ int nfs4_init_client(struct nfs_client *clp, 1, noresvport); if (error < 0) goto error; + + /* If no clientaddr= option was specified, find a usable cb address */ + if (ip_addr == NULL) { + struct sockaddr_storage cb_addr; + struct sockaddr *sap = (struct sockaddr *)&cb_addr; + + error = rpc_localaddr(clp->cl_rpcclient, sap, sizeof(cb_addr)); + if (error < 0) + goto error; + error = rpc_ntop(sap, buf, sizeof(buf)); + if (error < 0) + goto error; + ip_addr = (const char *)buf; + } strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); error = nfs_idmap_new(clp); @@ -1394,7 +1414,7 @@ static int nfs4_set_client(struct nfs_server *server, const char *ip_addr, rpc_authflavor_t authflavour, int proto, const struct rpc_timeout *timeparms, - u32 minorversion) + u32 minorversion, struct net *net) { struct nfs_client_initdata cl_init = { .hostname = hostname, @@ -1403,6 +1423,7 @@ static int nfs4_set_client(struct nfs_server *server, .rpc_ops = &nfs_v4_clientops, .proto = proto, .minorversion = minorversion, + .net = net, }; struct nfs_client *clp; int error; @@ -1454,6 +1475,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, .rpc_ops = &nfs_v4_clientops, .proto = ds_proto, .minorversion = mds_clp->cl_minorversion, + .net = mds_clp->net, }; struct rpc_timeout ds_timeout = { .to_initval = 15 * HZ, @@ -1581,7 +1603,8 @@ static int nfs4_init_server(struct nfs_server *server, data->auth_flavors[0], data->nfs_server.protocol, &timeparms, - data->minorversion); + data->minorversion, + data->net); if (error < 0) goto error; @@ -1676,9 +1699,10 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, data->addrlen, parent_client->cl_ipaddr, data->authflavor, - parent_server->client->cl_xprt->prot, + rpc_protocol(parent_server->client), parent_server->client->cl_timeout, - parent_client->cl_mvops->minor_version); + parent_client->cl_mvops->minor_version, + parent_client->net); if (error < 0) goto error; @@ -1771,6 +1795,18 @@ out_free_server: return ERR_PTR(error); } +void nfs_clients_init(struct net *net) +{ + struct nfs_net *nn = net_generic(net, nfs_net_id); + + INIT_LIST_HEAD(&nn->nfs_client_list); + INIT_LIST_HEAD(&nn->nfs_volume_list); +#ifdef CONFIG_NFS_V4 + idr_init(&nn->cb_ident_idr); +#endif + spin_lock_init(&nn->nfs_client_lock); +} + #ifdef CONFIG_PROC_FS static struct proc_dir_entry *proc_fs_nfs; @@ -1824,13 +1860,15 @@ static int nfs_server_list_open(struct inode *inode, struct file *file) { struct seq_file *m; int ret; + struct pid_namespace *pid_ns = file->f_dentry->d_sb->s_fs_info; + struct net *net = pid_ns->child_reaper->nsproxy->net_ns; ret = seq_open(file, &nfs_server_list_ops); if (ret < 0) return ret; m = file->private_data; - m->private = PDE(inode)->data; + m->private = net; return 0; } @@ -1840,9 +1878,11 @@ static int nfs_server_list_open(struct inode *inode, struct file *file) */ static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos) { + struct nfs_net *nn = net_generic(m->private, nfs_net_id); + /* lock the list against modification */ - spin_lock(&nfs_client_lock); - return seq_list_start_head(&nfs_client_list, *_pos); + spin_lock(&nn->nfs_client_lock); + return seq_list_start_head(&nn->nfs_client_list, *_pos); } /* @@ -1850,7 +1890,9 @@ static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos) */ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos) { - return seq_list_next(v, &nfs_client_list, pos); + struct nfs_net *nn = net_generic(p->private, nfs_net_id); + + return seq_list_next(v, &nn->nfs_client_list, pos); } /* @@ -1858,7 +1900,9 @@ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos) */ static void nfs_server_list_stop(struct seq_file *p, void *v) { - spin_unlock(&nfs_client_lock); + struct nfs_net *nn = net_generic(p->private, nfs_net_id); + + spin_unlock(&nn->nfs_client_lock); } /* @@ -1867,9 +1911,10 @@ static void nfs_server_list_stop(struct seq_file *p, void *v) static int nfs_server_list_show(struct seq_file *m, void *v) { struct nfs_client *clp; + struct nfs_net *nn = net_generic(m->private, nfs_net_id); /* display header on line 1 */ - if (v == &nfs_client_list) { + if (v == &nn->nfs_client_list) { seq_puts(m, "NV SERVER PORT USE HOSTNAME\n"); return 0; } @@ -1881,12 +1926,14 @@ static int nfs_server_list_show(struct seq_file *m, void *v) if (clp->cl_cons_state != NFS_CS_READY) return 0; + rcu_read_lock(); seq_printf(m, "v%u %s %s %3d %s\n", clp->rpc_ops->version, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), atomic_read(&clp->cl_count), clp->cl_hostname); + rcu_read_unlock(); return 0; } @@ -1898,13 +1945,15 @@ static int nfs_volume_list_open(struct inode *inode, struct file *file) { struct seq_file *m; int ret; + struct pid_namespace *pid_ns = file->f_dentry->d_sb->s_fs_info; + struct net *net = pid_ns->child_reaper->nsproxy->net_ns; ret = seq_open(file, &nfs_volume_list_ops); if (ret < 0) return ret; m = file->private_data; - m->private = PDE(inode)->data; + m->private = net; return 0; } @@ -1914,9 +1963,11 @@ static int nfs_volume_list_open(struct inode *inode, struct file *file) */ static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos) { + struct nfs_net *nn = net_generic(m->private, nfs_net_id); + /* lock the list against modification */ - spin_lock(&nfs_client_lock); - return seq_list_start_head(&nfs_volume_list, *_pos); + spin_lock(&nn->nfs_client_lock); + return seq_list_start_head(&nn->nfs_volume_list, *_pos); } /* @@ -1924,7 +1975,9 @@ static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos) */ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos) { - return seq_list_next(v, &nfs_volume_list, pos); + struct nfs_net *nn = net_generic(p->private, nfs_net_id); + + return seq_list_next(v, &nn->nfs_volume_list, pos); } /* @@ -1932,7 +1985,9 @@ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos) */ static void nfs_volume_list_stop(struct seq_file *p, void *v) { - spin_unlock(&nfs_client_lock); + struct nfs_net *nn = net_generic(p->private, nfs_net_id); + + spin_unlock(&nn->nfs_client_lock); } /* @@ -1943,9 +1998,10 @@ static int nfs_volume_list_show(struct seq_file *m, void *v) struct nfs_server *server; struct nfs_client *clp; char dev[8], fsid[17]; + struct nfs_net *nn = net_generic(m->private, nfs_net_id); /* display header on line 1 */ - if (v == &nfs_volume_list) { + if (v == &nn->nfs_volume_list) { seq_puts(m, "NV SERVER PORT DEV FSID FSC\n"); return 0; } @@ -1960,6 +2016,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v) (unsigned long long) server->fsid.major, (unsigned long long) server->fsid.minor); + rcu_read_lock(); seq_printf(m, "v%u %s %s %-7s %-17s %s\n", clp->rpc_ops->version, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), @@ -1967,6 +2024,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v) dev, fsid, nfs_server_fscache_state(server)); + rcu_read_unlock(); return 0; } diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 7f26540..89af1d2 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -105,7 +105,7 @@ again: continue; if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) continue; - if (memcmp(state->stateid.data, stateid->data, sizeof(state->stateid.data)) != 0) + if (!nfs4_stateid_match(&state->stateid, stateid)) continue; get_nfs_open_context(ctx); spin_unlock(&inode->i_lock); @@ -139,8 +139,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, if (delegation != NULL) { spin_lock(&delegation->lock); if (delegation->inode != NULL) { - memcpy(delegation->stateid.data, res->delegation.data, - sizeof(delegation->stateid.data)); + nfs4_stateid_copy(&delegation->stateid, &res->delegation); delegation->type = res->delegation_type; delegation->maxsize = res->maxsize; oldcred = delegation->cred; @@ -236,8 +235,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct delegation = kmalloc(sizeof(*delegation), GFP_NOFS); if (delegation == NULL) return -ENOMEM; - memcpy(delegation->stateid.data, res->delegation.data, - sizeof(delegation->stateid.data)); + nfs4_stateid_copy(&delegation->stateid, &res->delegation); delegation->type = res->delegation_type; delegation->maxsize = res->maxsize; delegation->change_attr = inode->i_version; @@ -250,19 +248,22 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct old_delegation = rcu_dereference_protected(nfsi->delegation, lockdep_is_held(&clp->cl_lock)); if (old_delegation != NULL) { - if (memcmp(&delegation->stateid, &old_delegation->stateid, - sizeof(old_delegation->stateid)) == 0 && + if (nfs4_stateid_match(&delegation->stateid, + &old_delegation->stateid) && delegation->type == old_delegation->type) { goto out; } /* * Deal with broken servers that hand out two * delegations for the same file. + * Allow for upgrades to a WRITE delegation, but + * nothing else. */ dfprintk(FILE, "%s: server %s handed out " "a duplicate delegation!\n", __func__, clp->cl_hostname); - if (delegation->type <= old_delegation->type) { + if (delegation->type == old_delegation->type || + !(delegation->type & FMODE_WRITE)) { freeme = delegation; delegation = NULL; goto out; @@ -455,17 +456,24 @@ static void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp, rcu_read_unlock(); } -static void nfs_client_mark_return_all_delegations(struct nfs_client *clp) -{ - nfs_client_mark_return_all_delegation_types(clp, FMODE_READ|FMODE_WRITE); -} - static void nfs_delegation_run_state_manager(struct nfs_client *clp) { if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) nfs4_schedule_state_manager(clp); } +void nfs_remove_bad_delegation(struct inode *inode) +{ + struct nfs_delegation *delegation; + + delegation = nfs_detach_delegation(NFS_I(inode), NFS_SERVER(inode)); + if (delegation) { + nfs_inode_find_state_and_recover(inode, &delegation->stateid); + nfs_free_delegation(delegation); + } +} +EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation); + /** * nfs_expire_all_delegation_types * @clp: client to process @@ -488,18 +496,6 @@ void nfs_expire_all_delegations(struct nfs_client *clp) nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE); } -/** - * nfs_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN - * @clp: client to process - * - */ -void nfs_handle_cb_pathdown(struct nfs_client *clp) -{ - if (clp == NULL) - return; - nfs_client_mark_return_all_delegations(clp); -} - static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server) { struct nfs_delegation *delegation; @@ -531,7 +527,7 @@ void nfs_expire_unreferenced_delegations(struct nfs_client *clp) /** * nfs_async_inode_return_delegation - asynchronously return a delegation * @inode: inode to process - * @stateid: state ID information from CB_RECALL arguments + * @stateid: state ID information * * Returns zero on success, or a negative errno value. */ @@ -545,7 +541,7 @@ int nfs_async_inode_return_delegation(struct inode *inode, rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); - if (!clp->cl_mvops->validate_stateid(delegation, stateid)) { + if (!clp->cl_mvops->match_stateid(&delegation->stateid, stateid)) { rcu_read_unlock(); return -ENOENT; } @@ -684,21 +680,25 @@ int nfs_delegations_present(struct nfs_client *clp) * nfs4_copy_delegation_stateid - Copy inode's state ID information * @dst: stateid data structure to fill in * @inode: inode to check + * @flags: delegation type requirement * - * Returns one and fills in "dst->data" * if inode had a delegation, - * otherwise zero is returned. + * Returns "true" and fills in "dst->data" * if inode had a delegation, + * otherwise "false" is returned. */ -int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode) +bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, + fmode_t flags) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; - int ret = 0; + bool ret; + flags &= FMODE_READ|FMODE_WRITE; rcu_read_lock(); delegation = rcu_dereference(nfsi->delegation); - if (delegation != NULL) { - memcpy(dst->data, delegation->stateid.data, sizeof(dst->data)); - ret = 1; + ret = (delegation != NULL && (delegation->type & flags) == flags); + if (ret) { + nfs4_stateid_copy(dst, &delegation->stateid); + nfs_mark_delegation_referenced(delegation); } rcu_read_unlock(); return ret; diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index d9322e4..cd6a7a8 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -42,9 +42,9 @@ void nfs_super_return_all_delegations(struct super_block *sb); void nfs_expire_all_delegations(struct nfs_client *clp); void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags); void nfs_expire_unreferenced_delegations(struct nfs_client *clp); -void nfs_handle_cb_pathdown(struct nfs_client *clp); int nfs_client_return_marked_delegations(struct nfs_client *clp); int nfs_delegations_present(struct nfs_client *clp); +void nfs_remove_bad_delegation(struct inode *inode); void nfs_delegation_mark_reclaim(struct nfs_client *clp); void nfs_delegation_reap_unclaimed(struct nfs_client *clp); @@ -53,7 +53,7 @@ void nfs_delegation_reap_unclaimed(struct nfs_client *clp); int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync); int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid); int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl); -int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode); +bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_t flags); void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); int nfs_have_delegation(struct inode *inode, fmode_t flags); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 32aa691..4aaf031 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -207,7 +207,7 @@ struct nfs_cache_array_entry { }; struct nfs_cache_array { - unsigned int size; + int size; int eof_index; u64 last_cookie; struct nfs_cache_array_entry array[0]; @@ -1429,6 +1429,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry } open_flags = nd->intent.open.flags; + attr.ia_valid = 0; ctx = create_nfs_open_context(dentry, open_flags); res = ERR_CAST(ctx); @@ -1437,11 +1438,14 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry if (nd->flags & LOOKUP_CREATE) { attr.ia_mode = nd->intent.open.create_mode; - attr.ia_valid = ATTR_MODE; + attr.ia_valid |= ATTR_MODE; attr.ia_mode &= ~current_umask(); - } else { + } else open_flags &= ~(O_EXCL | O_CREAT); - attr.ia_valid = 0; + + if (open_flags & O_TRUNC) { + attr.ia_valid |= ATTR_SIZE; + attr.ia_size = 0; } /* Open the file on the server */ @@ -1495,6 +1499,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) struct inode *inode; struct inode *dir; struct nfs_open_context *ctx; + struct iattr attr; int openflags, ret = 0; if (nd->flags & LOOKUP_RCU) @@ -1523,19 +1528,27 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) /* We cannot do exclusive creation on a positive dentry */ if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) goto no_open_dput; - /* We can't create new files, or truncate existing ones here */ - openflags &= ~(O_CREAT|O_EXCL|O_TRUNC); + /* We can't create new files here */ + openflags &= ~(O_CREAT|O_EXCL); ctx = create_nfs_open_context(dentry, openflags); ret = PTR_ERR(ctx); if (IS_ERR(ctx)) goto out; + + attr.ia_valid = 0; + if (openflags & O_TRUNC) { + attr.ia_valid |= ATTR_SIZE; + attr.ia_size = 0; + nfs_wb_all(inode); + } + /* * Note: we're not holding inode->i_mutex and so may be racing with * operations that change the directory. We therefore save the * change attribute *before* we do the RPC call. */ - inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, NULL); + inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr); if (IS_ERR(inode)) { ret = PTR_ERR(inode); switch (ret) { diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 1940f1a..9c7f66a 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -265,9 +265,7 @@ static void nfs_direct_read_release(void *calldata) } static const struct rpc_call_ops nfs_read_direct_ops = { -#if defined(CONFIG_NFS_V4_1) .rpc_call_prepare = nfs_read_prepare, -#endif /* CONFIG_NFS_V4_1 */ .rpc_call_done = nfs_direct_read_result, .rpc_release = nfs_direct_read_release, }; @@ -554,9 +552,7 @@ static void nfs_direct_commit_release(void *calldata) } static const struct rpc_call_ops nfs_commit_direct_ops = { -#if defined(CONFIG_NFS_V4_1) .rpc_call_prepare = nfs_write_prepare, -#endif /* CONFIG_NFS_V4_1 */ .rpc_call_done = nfs_direct_commit_result, .rpc_release = nfs_direct_commit_release, }; @@ -696,9 +692,7 @@ out_unlock: } static const struct rpc_call_ops nfs_write_direct_ops = { -#if defined(CONFIG_NFS_V4_1) .rpc_call_prepare = nfs_write_prepare, -#endif /* CONFIG_NFS_V4_1 */ .rpc_call_done = nfs_direct_write_result, .rpc_release = nfs_direct_write_release, }; diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index a6e711a..b3924b8 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -10,8 +10,9 @@ #include <linux/sunrpc/clnt.h> #include <linux/dns_resolver.h> +#include "dns_resolve.h" -ssize_t nfs_dns_resolve_name(char *name, size_t namelen, +ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen, struct sockaddr *sa, size_t salen) { ssize_t ret; @@ -20,7 +21,7 @@ ssize_t nfs_dns_resolve_name(char *name, size_t namelen, ip_len = dns_query(NULL, name, namelen, NULL, &ip_addr, NULL); if (ip_len > 0) - ret = rpc_pton(ip_addr, ip_len, sa, salen); + ret = rpc_pton(net, ip_addr, ip_len, sa, salen); else ret = -ESRCH; kfree(ip_addr); @@ -40,15 +41,15 @@ ssize_t nfs_dns_resolve_name(char *name, size_t namelen, #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/cache.h> #include <linux/sunrpc/svcauth.h> +#include <linux/sunrpc/rpc_pipe_fs.h> #include "dns_resolve.h" #include "cache_lib.h" +#include "netns.h" #define NFS_DNS_HASHBITS 4 #define NFS_DNS_HASHTBL_SIZE (1 << NFS_DNS_HASHBITS) -static struct cache_head *nfs_dns_table[NFS_DNS_HASHTBL_SIZE]; - struct nfs_dns_ent { struct cache_head h; @@ -224,7 +225,7 @@ static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen) len = qword_get(&buf, buf1, sizeof(buf1)); if (len <= 0) goto out; - key.addrlen = rpc_pton(buf1, len, + key.addrlen = rpc_pton(cd->net, buf1, len, (struct sockaddr *)&key.addr, sizeof(key.addr)); @@ -259,21 +260,6 @@ out: return ret; } -static struct cache_detail nfs_dns_resolve = { - .owner = THIS_MODULE, - .hash_size = NFS_DNS_HASHTBL_SIZE, - .hash_table = nfs_dns_table, - .name = "dns_resolve", - .cache_put = nfs_dns_ent_put, - .cache_upcall = nfs_dns_upcall, - .cache_parse = nfs_dns_parse, - .cache_show = nfs_dns_show, - .match = nfs_dns_match, - .init = nfs_dns_ent_init, - .update = nfs_dns_ent_update, - .alloc = nfs_dns_ent_alloc, -}; - static int do_cache_lookup(struct cache_detail *cd, struct nfs_dns_ent *key, struct nfs_dns_ent **item, @@ -336,8 +322,8 @@ out: return ret; } -ssize_t nfs_dns_resolve_name(char *name, size_t namelen, - struct sockaddr *sa, size_t salen) +ssize_t nfs_dns_resolve_name(struct net *net, char *name, + size_t namelen, struct sockaddr *sa, size_t salen) { struct nfs_dns_ent key = { .hostname = name, @@ -345,28 +331,118 @@ ssize_t nfs_dns_resolve_name(char *name, size_t namelen, }; struct nfs_dns_ent *item = NULL; ssize_t ret; + struct nfs_net *nn = net_generic(net, nfs_net_id); - ret = do_cache_lookup_wait(&nfs_dns_resolve, &key, &item); + ret = do_cache_lookup_wait(nn->nfs_dns_resolve, &key, &item); if (ret == 0) { if (salen >= item->addrlen) { memcpy(sa, &item->addr, item->addrlen); ret = item->addrlen; } else ret = -EOVERFLOW; - cache_put(&item->h, &nfs_dns_resolve); + cache_put(&item->h, nn->nfs_dns_resolve); } else if (ret == -ENOENT) ret = -ESRCH; return ret; } +int nfs_dns_resolver_cache_init(struct net *net) +{ + int err = -ENOMEM; + struct nfs_net *nn = net_generic(net, nfs_net_id); + struct cache_detail *cd; + struct cache_head **tbl; + + cd = kzalloc(sizeof(struct cache_detail), GFP_KERNEL); + if (cd == NULL) + goto err_cd; + + tbl = kzalloc(NFS_DNS_HASHTBL_SIZE * sizeof(struct cache_head *), + GFP_KERNEL); + if (tbl == NULL) + goto err_tbl; + + cd->owner = THIS_MODULE, + cd->hash_size = NFS_DNS_HASHTBL_SIZE, + cd->hash_table = tbl, + cd->name = "dns_resolve", + cd->cache_put = nfs_dns_ent_put, + cd->cache_upcall = nfs_dns_upcall, + cd->cache_parse = nfs_dns_parse, + cd->cache_show = nfs_dns_show, + cd->match = nfs_dns_match, + cd->init = nfs_dns_ent_init, + cd->update = nfs_dns_ent_update, + cd->alloc = nfs_dns_ent_alloc, + + nfs_cache_init(cd); + err = nfs_cache_register_net(net, cd); + if (err) + goto err_reg; + nn->nfs_dns_resolve = cd; + return 0; + +err_reg: + nfs_cache_destroy(cd); + kfree(cd->hash_table); +err_tbl: + kfree(cd); +err_cd: + return err; +} + +void nfs_dns_resolver_cache_destroy(struct net *net) +{ + struct nfs_net *nn = net_generic(net, nfs_net_id); + struct cache_detail *cd = nn->nfs_dns_resolve; + + nfs_cache_unregister_net(net, cd); + nfs_cache_destroy(cd); + kfree(cd->hash_table); + kfree(cd); +} + +static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, + void *ptr) +{ + struct super_block *sb = ptr; + struct net *net = sb->s_fs_info; + struct nfs_net *nn = net_generic(net, nfs_net_id); + struct cache_detail *cd = nn->nfs_dns_resolve; + int ret = 0; + + if (cd == NULL) + return 0; + + if (!try_module_get(THIS_MODULE)) + return 0; + + switch (event) { + case RPC_PIPEFS_MOUNT: + ret = nfs_cache_register_sb(sb, cd); + break; + case RPC_PIPEFS_UMOUNT: + nfs_cache_unregister_sb(sb, cd); + break; + default: + ret = -ENOTSUPP; + break; + } + module_put(THIS_MODULE); + return ret; +} + +static struct notifier_block nfs_dns_resolver_block = { + .notifier_call = rpc_pipefs_event, +}; + int nfs_dns_resolver_init(void) { - return nfs_cache_register(&nfs_dns_resolve); + return rpc_pipefs_notifier_register(&nfs_dns_resolver_block); } void nfs_dns_resolver_destroy(void) { - nfs_cache_unregister(&nfs_dns_resolve); + rpc_pipefs_notifier_unregister(&nfs_dns_resolver_block); } - #endif diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h index 199bb55..2e4f596 100644 --- a/fs/nfs/dns_resolve.h +++ b/fs/nfs/dns_resolve.h @@ -15,12 +15,22 @@ static inline int nfs_dns_resolver_init(void) static inline void nfs_dns_resolver_destroy(void) {} + +static inline int nfs_dns_resolver_cache_init(struct net *net) +{ + return 0; +} + +static inline void nfs_dns_resolver_cache_destroy(struct net *net) +{} #else extern int nfs_dns_resolver_init(void); extern void nfs_dns_resolver_destroy(void); +extern int nfs_dns_resolver_cache_init(struct net *net); +extern void nfs_dns_resolver_cache_destroy(struct net *net); #endif -extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen, - struct sockaddr *sa, size_t salen); +extern ssize_t nfs_dns_resolve_name(struct net *net, char *name, + size_t namelen, struct sockaddr *sa, size_t salen); #endif diff --git a/fs/nfs/file.c b/fs/nfs/file.c index c43a452..4fdaaa6 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -530,6 +530,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (mapping != dentry->d_inode->i_mapping) goto out_unlock; + wait_on_page_writeback(page); + pagelen = nfs_page_length(page); if (pagelen == 0) goto out_unlock; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 419119c..ae65c16 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -327,7 +327,7 @@ void nfs_fscache_reset_inode_cookie(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_server *nfss = NFS_SERVER(inode); - struct fscache_cookie *old = nfsi->fscache; + NFS_IFDEBUG(struct fscache_cookie *old = nfsi->fscache); nfs_fscache_inode_lock(inode); if (nfsi->fscache) { diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index a1bbf77..b7f348b 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -34,11 +34,29 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <linux/types.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/slab.h> +#include <linux/parser.h> +#include <linux/fs.h> #include <linux/nfs_idmap.h> +#include <net/net_namespace.h> +#include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/nfs_fs.h> +#include <linux/nfs_fs_sb.h> +#include <linux/key.h> +#include <linux/keyctl.h> +#include <linux/key-type.h> +#include <keys/user-type.h> +#include <linux/module.h> + +#include "internal.h" +#include "netns.h" + +#define NFS_UINT_MAXLEN 11 + +/* Default cache timeout is 10 minutes */ +unsigned int nfs_idmap_cache_timeout = 600; +static const struct cred *id_resolver_cache; +static struct key_type key_type_id_resolver_legacy; + /** * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields @@ -142,24 +160,7 @@ static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen) return snprintf(buf, buflen, "%u", id); } -#ifdef CONFIG_NFS_USE_NEW_IDMAPPER - -#include <linux/cred.h> -#include <linux/sunrpc/sched.h> -#include <linux/nfs4.h> -#include <linux/nfs_fs_sb.h> -#include <linux/keyctl.h> -#include <linux/key-type.h> -#include <linux/rcupdate.h> -#include <linux/err.h> - -#include <keys/user-type.h> - -#define NFS_UINT_MAXLEN 11 - -const struct cred *id_resolver_cache; - -struct key_type key_type_id_resolver = { +static struct key_type key_type_id_resolver = { .name = "id_resolver", .instantiate = user_instantiate, .match = user_match, @@ -169,13 +170,14 @@ struct key_type key_type_id_resolver = { .read = user_read, }; -int nfs_idmap_init(void) +static int nfs_idmap_init_keyring(void) { struct cred *cred; struct key *keyring; int ret = 0; - printk(KERN_NOTICE "Registering the %s key type\n", key_type_id_resolver.name); + printk(KERN_NOTICE "NFS: Registering the %s key type\n", + key_type_id_resolver.name); cred = prepare_kernel_cred(NULL); if (!cred) @@ -211,7 +213,7 @@ failed_put_cred: return ret; } -void nfs_idmap_quit(void) +static void nfs_idmap_quit_keyring(void) { key_revoke(id_resolver_cache->thread_keyring); unregister_key_type(&key_type_id_resolver); @@ -246,8 +248,10 @@ static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen, return desclen; } -static ssize_t nfs_idmap_request_key(const char *name, size_t namelen, - const char *type, void *data, size_t data_size) +static ssize_t nfs_idmap_request_key(struct key_type *key_type, + const char *name, size_t namelen, + const char *type, void *data, + size_t data_size, struct idmap *idmap) { const struct cred *saved_cred; struct key *rkey; @@ -260,8 +264,12 @@ static ssize_t nfs_idmap_request_key(const char *name, size_t namelen, goto out; saved_cred = override_creds(id_resolver_cache); - rkey = request_key(&key_type_id_resolver, desc, ""); + if (idmap) + rkey = request_key_with_auxdata(key_type, desc, "", 0, idmap); + else + rkey = request_key(&key_type_id_resolver, desc, ""); revert_creds(saved_cred); + kfree(desc); if (IS_ERR(rkey)) { ret = PTR_ERR(rkey); @@ -294,31 +302,46 @@ out: return ret; } +static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, + const char *type, void *data, + size_t data_size, struct idmap *idmap) +{ + ssize_t ret = nfs_idmap_request_key(&key_type_id_resolver, + name, namelen, type, data, + data_size, NULL); + if (ret < 0) { + ret = nfs_idmap_request_key(&key_type_id_resolver_legacy, + name, namelen, type, data, + data_size, idmap); + } + return ret; +} /* ID -> Name */ -static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, size_t buflen) +static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, + size_t buflen, struct idmap *idmap) { char id_str[NFS_UINT_MAXLEN]; int id_len; ssize_t ret; id_len = snprintf(id_str, sizeof(id_str), "%u", id); - ret = nfs_idmap_request_key(id_str, id_len, type, buf, buflen); + ret = nfs_idmap_get_key(id_str, id_len, type, buf, buflen, idmap); if (ret < 0) return -EINVAL; return ret; } /* Name -> ID */ -static int nfs_idmap_lookup_id(const char *name, size_t namelen, - const char *type, __u32 *id) +static int nfs_idmap_lookup_id(const char *name, size_t namelen, const char *type, + __u32 *id, struct idmap *idmap) { char id_str[NFS_UINT_MAXLEN]; long id_long; ssize_t data_size; int ret = 0; - data_size = nfs_idmap_request_key(name, namelen, type, id_str, NFS_UINT_MAXLEN); + data_size = nfs_idmap_get_key(name, namelen, type, id_str, NFS_UINT_MAXLEN, idmap); if (data_size <= 0) { ret = -EINVAL; } else { @@ -328,114 +351,103 @@ static int nfs_idmap_lookup_id(const char *name, size_t namelen, return ret; } -int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) -{ - if (nfs_map_string_to_numeric(name, namelen, uid)) - return 0; - return nfs_idmap_lookup_id(name, namelen, "uid", uid); -} - -int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *gid) -{ - if (nfs_map_string_to_numeric(name, namelen, gid)) - return 0; - return nfs_idmap_lookup_id(name, namelen, "gid", gid); -} - -int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen) -{ - int ret = -EINVAL; - - if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) - ret = nfs_idmap_lookup_name(uid, "user", buf, buflen); - if (ret < 0) - ret = nfs_map_numeric_to_string(uid, buf, buflen); - return ret; -} -int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, size_t buflen) -{ - int ret = -EINVAL; +/* idmap classic begins here */ +module_param(nfs_idmap_cache_timeout, int, 0644); - if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) - ret = nfs_idmap_lookup_name(gid, "group", buf, buflen); - if (ret < 0) - ret = nfs_map_numeric_to_string(gid, buf, buflen); - return ret; -} - -#else /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */ - -#include <linux/module.h> -#include <linux/mutex.h> -#include <linux/init.h> -#include <linux/socket.h> -#include <linux/in.h> -#include <linux/sched.h> -#include <linux/sunrpc/clnt.h> -#include <linux/workqueue.h> -#include <linux/sunrpc/rpc_pipe_fs.h> - -#include <linux/nfs_fs.h> - -#include "nfs4_fs.h" - -#define IDMAP_HASH_SZ 128 - -/* Default cache timeout is 10 minutes */ -unsigned int nfs_idmap_cache_timeout = 600 * HZ; - -static int param_set_idmap_timeout(const char *val, struct kernel_param *kp) -{ - char *endp; - int num = simple_strtol(val, &endp, 0); - int jif = num * HZ; - if (endp == val || *endp || num < 0 || jif < num) - return -EINVAL; - *((int *)kp->arg) = jif; - return 0; -} - -module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int, - &nfs_idmap_cache_timeout, 0644); - -struct idmap_hashent { - unsigned long ih_expires; - __u32 ih_id; - size_t ih_namelen; - char ih_name[IDMAP_NAMESZ]; +struct idmap { + struct rpc_pipe *idmap_pipe; + struct key_construction *idmap_key_cons; }; -struct idmap_hashtable { - __u8 h_type; - struct idmap_hashent h_entries[IDMAP_HASH_SZ]; +enum { + Opt_find_uid, Opt_find_gid, Opt_find_user, Opt_find_group, Opt_find_err }; -struct idmap { - struct dentry *idmap_dentry; - wait_queue_head_t idmap_wq; - struct idmap_msg idmap_im; - struct mutex idmap_lock; /* Serializes upcalls */ - struct mutex idmap_im_lock; /* Protects the hashtable */ - struct idmap_hashtable idmap_user_hash; - struct idmap_hashtable idmap_group_hash; +static const match_table_t nfs_idmap_tokens = { + { Opt_find_uid, "uid:%s" }, + { Opt_find_gid, "gid:%s" }, + { Opt_find_user, "user:%s" }, + { Opt_find_group, "group:%s" }, + { Opt_find_err, NULL } }; +static int nfs_idmap_legacy_upcall(struct key_construction *, const char *, void *); static ssize_t idmap_pipe_downcall(struct file *, const char __user *, size_t); static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); -static unsigned int fnvhash32(const void *, size_t); - static const struct rpc_pipe_ops idmap_upcall_ops = { .upcall = rpc_pipe_generic_upcall, .downcall = idmap_pipe_downcall, .destroy_msg = idmap_pipe_destroy_msg, }; +static struct key_type key_type_id_resolver_legacy = { + .name = "id_resolver", + .instantiate = user_instantiate, + .match = user_match, + .revoke = user_revoke, + .destroy = user_destroy, + .describe = user_describe, + .read = user_read, + .request_key = nfs_idmap_legacy_upcall, +}; + +static void __nfs_idmap_unregister(struct rpc_pipe *pipe) +{ + if (pipe->dentry) + rpc_unlink(pipe->dentry); +} + +static int __nfs_idmap_register(struct dentry *dir, + struct idmap *idmap, + struct rpc_pipe *pipe) +{ + struct dentry *dentry; + + dentry = rpc_mkpipe_dentry(dir, "idmap", idmap, pipe); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + pipe->dentry = dentry; + return 0; +} + +static void nfs_idmap_unregister(struct nfs_client *clp, + struct rpc_pipe *pipe) +{ + struct net *net = clp->net; + struct super_block *pipefs_sb; + + pipefs_sb = rpc_get_sb_net(net); + if (pipefs_sb) { + __nfs_idmap_unregister(pipe); + rpc_put_sb_net(net); + } +} + +static int nfs_idmap_register(struct nfs_client *clp, + struct idmap *idmap, + struct rpc_pipe *pipe) +{ + struct net *net = clp->net; + struct super_block *pipefs_sb; + int err = 0; + + pipefs_sb = rpc_get_sb_net(net); + if (pipefs_sb) { + if (clp->cl_rpcclient->cl_dentry) + err = __nfs_idmap_register(clp->cl_rpcclient->cl_dentry, + idmap, pipe); + rpc_put_sb_net(net); + } + return err; +} + int nfs_idmap_new(struct nfs_client *clp) { struct idmap *idmap; + struct rpc_pipe *pipe; int error; BUG_ON(clp->cl_idmap != NULL); @@ -444,19 +456,19 @@ nfs_idmap_new(struct nfs_client *clp) if (idmap == NULL) return -ENOMEM; - idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_path.dentry, - "idmap", idmap, &idmap_upcall_ops, 0); - if (IS_ERR(idmap->idmap_dentry)) { - error = PTR_ERR(idmap->idmap_dentry); + pipe = rpc_mkpipe_data(&idmap_upcall_ops, 0); + if (IS_ERR(pipe)) { + error = PTR_ERR(pipe); kfree(idmap); return error; } - - mutex_init(&idmap->idmap_lock); - mutex_init(&idmap->idmap_im_lock); - init_waitqueue_head(&idmap->idmap_wq); - idmap->idmap_user_hash.h_type = IDMAP_TYPE_USER; - idmap->idmap_group_hash.h_type = IDMAP_TYPE_GROUP; + error = nfs_idmap_register(clp, idmap, pipe); + if (error) { + rpc_destroy_pipe_data(pipe); + kfree(idmap); + return error; + } + idmap->idmap_pipe = pipe; clp->cl_idmap = idmap; return 0; @@ -469,211 +481,220 @@ nfs_idmap_delete(struct nfs_client *clp) if (!idmap) return; - rpc_unlink(idmap->idmap_dentry); + nfs_idmap_unregister(clp, idmap->idmap_pipe); + rpc_destroy_pipe_data(idmap->idmap_pipe); clp->cl_idmap = NULL; kfree(idmap); } -/* - * Helper routines for manipulating the hashtable - */ -static inline struct idmap_hashent * -idmap_name_hash(struct idmap_hashtable* h, const char *name, size_t len) -{ - return &h->h_entries[fnvhash32(name, len) % IDMAP_HASH_SZ]; -} - -static struct idmap_hashent * -idmap_lookup_name(struct idmap_hashtable *h, const char *name, size_t len) +static int __rpc_pipefs_event(struct nfs_client *clp, unsigned long event, + struct super_block *sb) { - struct idmap_hashent *he = idmap_name_hash(h, name, len); + int err = 0; - if (he->ih_namelen != len || memcmp(he->ih_name, name, len) != 0) - return NULL; - if (time_after(jiffies, he->ih_expires)) - return NULL; - return he; + switch (event) { + case RPC_PIPEFS_MOUNT: + BUG_ON(clp->cl_rpcclient->cl_dentry == NULL); + err = __nfs_idmap_register(clp->cl_rpcclient->cl_dentry, + clp->cl_idmap, + clp->cl_idmap->idmap_pipe); + break; + case RPC_PIPEFS_UMOUNT: + if (clp->cl_idmap->idmap_pipe) { + struct dentry *parent; + + parent = clp->cl_idmap->idmap_pipe->dentry->d_parent; + __nfs_idmap_unregister(clp->cl_idmap->idmap_pipe); + /* + * Note: This is a dirty hack. SUNRPC hook has been + * called already but simple_rmdir() call for the + * directory returned with error because of idmap pipe + * inside. Thus now we have to remove this directory + * here. + */ + if (rpc_rmdir(parent)) + printk(KERN_ERR "NFS: %s: failed to remove " + "clnt dir!\n", __func__); + } + break; + default: + printk(KERN_ERR "NFS: %s: unknown event: %ld\n", __func__, + event); + return -ENOTSUPP; + } + return err; +} + +static struct nfs_client *nfs_get_client_for_event(struct net *net, int event) +{ + struct nfs_net *nn = net_generic(net, nfs_net_id); + struct dentry *cl_dentry; + struct nfs_client *clp; + + spin_lock(&nn->nfs_client_lock); + list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { + if (clp->rpc_ops != &nfs_v4_clientops) + continue; + cl_dentry = clp->cl_idmap->idmap_pipe->dentry; + if (((event == RPC_PIPEFS_MOUNT) && cl_dentry) || + ((event == RPC_PIPEFS_UMOUNT) && !cl_dentry)) + continue; + atomic_inc(&clp->cl_count); + spin_unlock(&nn->nfs_client_lock); + return clp; + } + spin_unlock(&nn->nfs_client_lock); + return NULL; } -static inline struct idmap_hashent * -idmap_id_hash(struct idmap_hashtable* h, __u32 id) +static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, + void *ptr) { - return &h->h_entries[fnvhash32(&id, sizeof(id)) % IDMAP_HASH_SZ]; -} + struct super_block *sb = ptr; + struct nfs_client *clp; + int error = 0; -static struct idmap_hashent * -idmap_lookup_id(struct idmap_hashtable *h, __u32 id) -{ - struct idmap_hashent *he = idmap_id_hash(h, id); - if (he->ih_id != id || he->ih_namelen == 0) - return NULL; - if (time_after(jiffies, he->ih_expires)) - return NULL; - return he; + while ((clp = nfs_get_client_for_event(sb->s_fs_info, event))) { + error = __rpc_pipefs_event(clp, event, sb); + nfs_put_client(clp); + if (error) + break; + } + return error; } -/* - * Routines for allocating new entries in the hashtable. - * For now, we just have 1 entry per bucket, so it's all - * pretty trivial. - */ -static inline struct idmap_hashent * -idmap_alloc_name(struct idmap_hashtable *h, char *name, size_t len) -{ - return idmap_name_hash(h, name, len); -} +#define PIPEFS_NFS_PRIO 1 + +static struct notifier_block nfs_idmap_block = { + .notifier_call = rpc_pipefs_event, + .priority = SUNRPC_PIPEFS_NFS_PRIO, +}; -static inline struct idmap_hashent * -idmap_alloc_id(struct idmap_hashtable *h, __u32 id) +int nfs_idmap_init(void) { - return idmap_id_hash(h, id); + int ret; + ret = nfs_idmap_init_keyring(); + if (ret != 0) + goto out; + ret = rpc_pipefs_notifier_register(&nfs_idmap_block); + if (ret != 0) + nfs_idmap_quit_keyring(); +out: + return ret; } -static void -idmap_update_entry(struct idmap_hashent *he, const char *name, - size_t namelen, __u32 id) +void nfs_idmap_quit(void) { - he->ih_id = id; - memcpy(he->ih_name, name, namelen); - he->ih_name[namelen] = '\0'; - he->ih_namelen = namelen; - he->ih_expires = jiffies + nfs_idmap_cache_timeout; + rpc_pipefs_notifier_unregister(&nfs_idmap_block); + nfs_idmap_quit_keyring(); } -/* - * Name -> ID - */ -static int -nfs_idmap_id(struct idmap *idmap, struct idmap_hashtable *h, - const char *name, size_t namelen, __u32 *id) +static int nfs_idmap_prepare_message(char *desc, struct idmap_msg *im, + struct rpc_pipe_msg *msg) { - struct rpc_pipe_msg msg; - struct idmap_msg *im; - struct idmap_hashent *he; - DECLARE_WAITQUEUE(wq, current); - int ret = -EIO; - - im = &idmap->idmap_im; - - /* - * String sanity checks - * Note that the userland daemon expects NUL terminated strings - */ - for (;;) { - if (namelen == 0) - return -EINVAL; - if (name[namelen-1] != '\0') - break; - namelen--; - } - if (namelen >= IDMAP_NAMESZ) - return -EINVAL; + substring_t substr; + int token, ret; - mutex_lock(&idmap->idmap_lock); - mutex_lock(&idmap->idmap_im_lock); - - he = idmap_lookup_name(h, name, namelen); - if (he != NULL) { - *id = he->ih_id; - ret = 0; - goto out; - } + memset(im, 0, sizeof(*im)); + memset(msg, 0, sizeof(*msg)); - memset(im, 0, sizeof(*im)); - memcpy(im->im_name, name, namelen); + im->im_type = IDMAP_TYPE_GROUP; + token = match_token(desc, nfs_idmap_tokens, &substr); - im->im_type = h->h_type; - im->im_conv = IDMAP_CONV_NAMETOID; + switch (token) { + case Opt_find_uid: + im->im_type = IDMAP_TYPE_USER; + case Opt_find_gid: + im->im_conv = IDMAP_CONV_NAMETOID; + ret = match_strlcpy(im->im_name, &substr, IDMAP_NAMESZ); + break; - memset(&msg, 0, sizeof(msg)); - msg.data = im; - msg.len = sizeof(*im); + case Opt_find_user: + im->im_type = IDMAP_TYPE_USER; + case Opt_find_group: + im->im_conv = IDMAP_CONV_IDTONAME; + ret = match_int(&substr, &im->im_id); + break; - add_wait_queue(&idmap->idmap_wq, &wq); - if (rpc_queue_upcall(idmap->idmap_dentry->d_inode, &msg) < 0) { - remove_wait_queue(&idmap->idmap_wq, &wq); + default: + ret = -EINVAL; goto out; } - set_current_state(TASK_UNINTERRUPTIBLE); - mutex_unlock(&idmap->idmap_im_lock); - schedule(); - __set_current_state(TASK_RUNNING); - remove_wait_queue(&idmap->idmap_wq, &wq); - mutex_lock(&idmap->idmap_im_lock); + msg->data = im; + msg->len = sizeof(struct idmap_msg); - if (im->im_status & IDMAP_STATUS_SUCCESS) { - *id = im->im_id; - ret = 0; - } - - out: - memset(im, 0, sizeof(*im)); - mutex_unlock(&idmap->idmap_im_lock); - mutex_unlock(&idmap->idmap_lock); +out: return ret; } -/* - * ID -> Name - */ -static int -nfs_idmap_name(struct idmap *idmap, struct idmap_hashtable *h, - __u32 id, char *name) +static int nfs_idmap_legacy_upcall(struct key_construction *cons, + const char *op, + void *aux) { - struct rpc_pipe_msg msg; + struct rpc_pipe_msg *msg; struct idmap_msg *im; - struct idmap_hashent *he; - DECLARE_WAITQUEUE(wq, current); - int ret = -EIO; - unsigned int len; - - im = &idmap->idmap_im; + struct idmap *idmap = (struct idmap *)aux; + struct key *key = cons->key; + int ret; - mutex_lock(&idmap->idmap_lock); - mutex_lock(&idmap->idmap_im_lock); + /* msg and im are freed in idmap_pipe_destroy_msg */ + msg = kmalloc(sizeof(*msg), GFP_KERNEL); + if (IS_ERR(msg)) { + ret = PTR_ERR(msg); + goto out0; + } - he = idmap_lookup_id(h, id); - if (he) { - memcpy(name, he->ih_name, he->ih_namelen); - ret = he->ih_namelen; - goto out; + im = kmalloc(sizeof(*im), GFP_KERNEL); + if (IS_ERR(im)) { + ret = PTR_ERR(im); + goto out1; } - memset(im, 0, sizeof(*im)); - im->im_type = h->h_type; - im->im_conv = IDMAP_CONV_IDTONAME; - im->im_id = id; + ret = nfs_idmap_prepare_message(key->description, im, msg); + if (ret < 0) + goto out2; - memset(&msg, 0, sizeof(msg)); - msg.data = im; - msg.len = sizeof(*im); + idmap->idmap_key_cons = cons; - add_wait_queue(&idmap->idmap_wq, &wq); + ret = rpc_queue_upcall(idmap->idmap_pipe, msg); + if (ret < 0) + goto out2; - if (rpc_queue_upcall(idmap->idmap_dentry->d_inode, &msg) < 0) { - remove_wait_queue(&idmap->idmap_wq, &wq); - goto out; - } + return ret; + +out2: + kfree(im); +out1: + kfree(msg); +out0: + key_revoke(cons->key); + key_revoke(cons->authkey); + return ret; +} + +static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *data) +{ + return key_instantiate_and_link(key, data, strlen(data) + 1, + id_resolver_cache->thread_keyring, + authkey); +} - set_current_state(TASK_UNINTERRUPTIBLE); - mutex_unlock(&idmap->idmap_im_lock); - schedule(); - __set_current_state(TASK_RUNNING); - remove_wait_queue(&idmap->idmap_wq, &wq); - mutex_lock(&idmap->idmap_im_lock); - - if (im->im_status & IDMAP_STATUS_SUCCESS) { - if ((len = strnlen(im->im_name, IDMAP_NAMESZ)) == 0) - goto out; - memcpy(name, im->im_name, len); - ret = len; +static int nfs_idmap_read_message(struct idmap_msg *im, struct key *key, struct key *authkey) +{ + char id_str[NFS_UINT_MAXLEN]; + int ret = -EINVAL; + + switch (im->im_conv) { + case IDMAP_CONV_NAMETOID: + sprintf(id_str, "%d", im->im_id); + ret = nfs_idmap_instantiate(key, authkey, id_str); + break; + case IDMAP_CONV_IDTONAME: + ret = nfs_idmap_instantiate(key, authkey, im->im_name); + break; } - out: - memset(im, 0, sizeof(*im)); - mutex_unlock(&idmap->idmap_im_lock); - mutex_unlock(&idmap->idmap_lock); return ret; } @@ -682,115 +703,51 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { struct rpc_inode *rpci = RPC_I(filp->f_path.dentry->d_inode); struct idmap *idmap = (struct idmap *)rpci->private; - struct idmap_msg im_in, *im = &idmap->idmap_im; - struct idmap_hashtable *h; - struct idmap_hashent *he = NULL; + struct key_construction *cons = idmap->idmap_key_cons; + struct idmap_msg im; size_t namelen_in; int ret; - if (mlen != sizeof(im_in)) - return -ENOSPC; - - if (copy_from_user(&im_in, src, mlen) != 0) - return -EFAULT; - - mutex_lock(&idmap->idmap_im_lock); - - ret = mlen; - im->im_status = im_in.im_status; - /* If we got an error, terminate now, and wake up pending upcalls */ - if (!(im_in.im_status & IDMAP_STATUS_SUCCESS)) { - wake_up(&idmap->idmap_wq); + if (mlen != sizeof(im)) { + ret = -ENOSPC; goto out; } - /* Sanity checking of strings */ - ret = -EINVAL; - namelen_in = strnlen(im_in.im_name, IDMAP_NAMESZ); - if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) + if (copy_from_user(&im, src, mlen) != 0) { + ret = -EFAULT; goto out; + } - switch (im_in.im_type) { - case IDMAP_TYPE_USER: - h = &idmap->idmap_user_hash; - break; - case IDMAP_TYPE_GROUP: - h = &idmap->idmap_group_hash; - break; - default: - goto out; + if (!(im.im_status & IDMAP_STATUS_SUCCESS)) { + ret = mlen; + complete_request_key(idmap->idmap_key_cons, -ENOKEY); + goto out_incomplete; } - switch (im_in.im_conv) { - case IDMAP_CONV_IDTONAME: - /* Did we match the current upcall? */ - if (im->im_conv == IDMAP_CONV_IDTONAME - && im->im_type == im_in.im_type - && im->im_id == im_in.im_id) { - /* Yes: copy string, including the terminating '\0' */ - memcpy(im->im_name, im_in.im_name, namelen_in); - im->im_name[namelen_in] = '\0'; - wake_up(&idmap->idmap_wq); - } - he = idmap_alloc_id(h, im_in.im_id); - break; - case IDMAP_CONV_NAMETOID: - /* Did we match the current upcall? */ - if (im->im_conv == IDMAP_CONV_NAMETOID - && im->im_type == im_in.im_type - && strnlen(im->im_name, IDMAP_NAMESZ) == namelen_in - && memcmp(im->im_name, im_in.im_name, namelen_in) == 0) { - im->im_id = im_in.im_id; - wake_up(&idmap->idmap_wq); - } - he = idmap_alloc_name(h, im_in.im_name, namelen_in); - break; - default: + namelen_in = strnlen(im.im_name, IDMAP_NAMESZ); + if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) { + ret = -EINVAL; goto out; } - /* If the entry is valid, also copy it to the cache */ - if (he != NULL) - idmap_update_entry(he, im_in.im_name, namelen_in, im_in.im_id); - ret = mlen; + ret = nfs_idmap_read_message(&im, cons->key, cons->authkey); + if (ret >= 0) { + key_set_timeout(cons->key, nfs_idmap_cache_timeout); + ret = mlen; + } + out: - mutex_unlock(&idmap->idmap_im_lock); + complete_request_key(idmap->idmap_key_cons, ret); +out_incomplete: return ret; } static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg) { - struct idmap_msg *im = msg->data; - struct idmap *idmap = container_of(im, struct idmap, idmap_im); - - if (msg->errno >= 0) - return; - mutex_lock(&idmap->idmap_im_lock); - im->im_status = IDMAP_STATUS_LOOKUPFAIL; - wake_up(&idmap->idmap_wq); - mutex_unlock(&idmap->idmap_im_lock); -} - -/* - * Fowler/Noll/Vo hash - * http://www.isthe.com/chongo/tech/comp/fnv/ - */ - -#define FNV_P_32 ((unsigned int)0x01000193) /* 16777619 */ -#define FNV_1_32 ((unsigned int)0x811c9dc5) /* 2166136261 */ - -static unsigned int fnvhash32(const void *buf, size_t buflen) -{ - const unsigned char *p, *end = (const unsigned char *)buf + buflen; - unsigned int hash = FNV_1_32; - - for (p = buf; p < end; p++) { - hash *= FNV_P_32; - hash ^= (unsigned int)*p; - } - - return hash; + /* Free memory allocated in nfs_idmap_legacy_upcall() */ + kfree(msg->data); + kfree(msg); } int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) @@ -799,16 +756,16 @@ int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_ if (nfs_map_string_to_numeric(name, namelen, uid)) return 0; - return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid); + return nfs_idmap_lookup_id(name, namelen, "uid", uid, idmap); } -int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) +int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *gid) { struct idmap *idmap = server->nfs_client->cl_idmap; - if (nfs_map_string_to_numeric(name, namelen, uid)) + if (nfs_map_string_to_numeric(name, namelen, gid)) return 0; - return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid); + return nfs_idmap_lookup_id(name, namelen, "gid", gid, idmap); } int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen) @@ -817,21 +774,19 @@ int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, s int ret = -EINVAL; if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) - ret = nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf); + ret = nfs_idmap_lookup_name(uid, "user", buf, buflen, idmap); if (ret < 0) ret = nfs_map_numeric_to_string(uid, buf, buflen); return ret; } -int nfs_map_gid_to_group(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen) +int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, size_t buflen) { struct idmap *idmap = server->nfs_client->cl_idmap; int ret = -EINVAL; if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) - ret = nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf); + ret = nfs_idmap_lookup_name(gid, "group", buf, buflen, idmap); if (ret < 0) - ret = nfs_map_numeric_to_string(uid, buf, buflen); + ret = nfs_map_numeric_to_string(gid, buf, buflen); return ret; } - -#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */ diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index f649fba..7bb4d13 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -39,6 +39,7 @@ #include <linux/slab.h> #include <linux/compat.h> #include <linux/freezer.h> +#include <linux/crc32.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -51,6 +52,7 @@ #include "fscache.h" #include "dns_resolve.h" #include "pnfs.h" +#include "netns.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -388,9 +390,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) unlock_new_inode(inode); } else nfs_refresh_inode(inode, fattr); - dprintk("NFS: nfs_fhget(%s/%Ld ct=%d)\n", + dprintk("NFS: nfs_fhget(%s/%Ld fh_crc=0x%08x ct=%d)\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode), + nfs_display_fhandle_hash(fh), atomic_read(&inode->i_count)); out: @@ -401,7 +404,7 @@ out_no_inode: goto out; } -#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE) +#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE|ATTR_OPEN) int nfs_setattr(struct dentry *dentry, struct iattr *attr) @@ -423,7 +426,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) /* Optimization: if the end result is no change, don't RPC */ attr->ia_valid &= NFS_VALID_ATTRS; - if ((attr->ia_valid & ~ATTR_FILE) == 0) + if ((attr->ia_valid & ~(ATTR_FILE|ATTR_OPEN)) == 0) return 0; /* Write all dirty data */ @@ -1044,6 +1047,67 @@ struct nfs_fh *nfs_alloc_fhandle(void) return fh; } +#ifdef NFS_DEBUG +/* + * _nfs_display_fhandle_hash - calculate the crc32 hash for the filehandle + * in the same way that wireshark does + * + * @fh: file handle + * + * For debugging only. + */ +u32 _nfs_display_fhandle_hash(const struct nfs_fh *fh) +{ + /* wireshark uses 32-bit AUTODIN crc and does a bitwise + * not on the result */ + return ~crc32(0xFFFFFFFF, &fh->data[0], fh->size); +} + +/* + * _nfs_display_fhandle - display an NFS file handle on the console + * + * @fh: file handle to display + * @caption: display caption + * + * For debugging only. + */ +void _nfs_display_fhandle(const struct nfs_fh *fh, const char *caption) +{ + unsigned short i; + + if (fh == NULL || fh->size == 0) { + printk(KERN_DEFAULT "%s at %p is empty\n", caption, fh); + return; + } + + printk(KERN_DEFAULT "%s at %p is %u bytes, crc: 0x%08x:\n", + caption, fh, fh->size, _nfs_display_fhandle_hash(fh)); + for (i = 0; i < fh->size; i += 16) { + __be32 *pos = (__be32 *)&fh->data[i]; + + switch ((fh->size - i - 1) >> 2) { + case 0: + printk(KERN_DEFAULT " %08x\n", + be32_to_cpup(pos)); + break; + case 1: + printk(KERN_DEFAULT " %08x %08x\n", + be32_to_cpup(pos), be32_to_cpup(pos + 1)); + break; + case 2: + printk(KERN_DEFAULT " %08x %08x %08x\n", + be32_to_cpup(pos), be32_to_cpup(pos + 1), + be32_to_cpup(pos + 2)); + break; + default: + printk(KERN_DEFAULT " %08x %08x %08x %08x\n", + be32_to_cpup(pos), be32_to_cpup(pos + 1), + be32_to_cpup(pos + 2), be32_to_cpup(pos + 3)); + } + } +} +#endif + /** * nfs_inode_attrs_need_update - check if the inode attributes need updating * @inode - pointer to inode @@ -1211,8 +1275,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) unsigned long now = jiffies; unsigned long save_cache_validity; - dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n", + dfprintk(VFS, "NFS: %s(%s/%ld fh_crc=0x%08x ct=%d info=0x%x)\n", __func__, inode->i_sb->s_id, inode->i_ino, + nfs_display_fhandle_hash(NFS_FH(inode)), atomic_read(&inode->i_count), fattr->valid); if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) @@ -1406,7 +1471,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* * Big trouble! The inode has become a different object. */ - printk(KERN_DEBUG "%s: inode %ld mode changed, %07o to %07o\n", + printk(KERN_DEBUG "NFS: %s: inode %ld mode changed, %07o to %07o\n", __func__, inode->i_ino, inode->i_mode, fattr->mode); out_err: /* @@ -1495,7 +1560,7 @@ static void init_once(void *foo) INIT_LIST_HEAD(&nfsi->open_files); INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); - INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); + INIT_LIST_HEAD(&nfsi->commit_list); nfsi->npages = 0; nfsi->ncommit = 0; atomic_set(&nfsi->silly_count, 1); @@ -1552,6 +1617,28 @@ static void nfsiod_stop(void) destroy_workqueue(wq); } +int nfs_net_id; +EXPORT_SYMBOL_GPL(nfs_net_id); + +static int nfs_net_init(struct net *net) +{ + nfs_clients_init(net); + return nfs_dns_resolver_cache_init(net); +} + +static void nfs_net_exit(struct net *net) +{ + nfs_dns_resolver_cache_destroy(net); + nfs_cleanup_cb_ident_idr(net); +} + +static struct pernet_operations nfs_net_ops = { + .init = nfs_net_init, + .exit = nfs_net_exit, + .id = &nfs_net_id, + .size = sizeof(struct nfs_net), +}; + /* * Initialize NFS */ @@ -1561,10 +1648,14 @@ static int __init init_nfs_fs(void) err = nfs_idmap_init(); if (err < 0) - goto out9; + goto out10; err = nfs_dns_resolver_init(); if (err < 0) + goto out9; + + err = register_pernet_subsys(&nfs_net_ops); + if (err < 0) goto out8; err = nfs_fscache_register(); @@ -1600,14 +1691,14 @@ static int __init init_nfs_fs(void) goto out0; #ifdef CONFIG_PROC_FS - rpc_proc_register(&nfs_rpcstat); + rpc_proc_register(&init_net, &nfs_rpcstat); #endif if ((err = register_nfs_fs()) != 0) goto out; return 0; out: #ifdef CONFIG_PROC_FS - rpc_proc_unregister("nfs"); + rpc_proc_unregister(&init_net, "nfs"); #endif nfs_destroy_directcache(); out0: @@ -1625,10 +1716,12 @@ out5: out6: nfs_fscache_unregister(); out7: - nfs_dns_resolver_destroy(); + unregister_pernet_subsys(&nfs_net_ops); out8: - nfs_idmap_quit(); + nfs_dns_resolver_destroy(); out9: + nfs_idmap_quit(); +out10: return err; } @@ -1640,12 +1733,12 @@ static void __exit exit_nfs_fs(void) nfs_destroy_inodecache(); nfs_destroy_nfspagecache(); nfs_fscache_unregister(); + unregister_pernet_subsys(&nfs_net_ops); nfs_dns_resolver_destroy(); nfs_idmap_quit(); #ifdef CONFIG_PROC_FS - rpc_proc_unregister("nfs"); + rpc_proc_unregister(&init_net, "nfs"); #endif - nfs_cleanup_cb_ident_idr(); unregister_nfs_fs(); nfs_fs_proc_exit(); nfsiod_stop(); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 8102db9..2476dc6 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -123,6 +123,7 @@ struct nfs_parsed_mount_data { } nfs_server; struct security_mnt_opts lsm_opts; + struct net *net; }; /* mount_clnt.c */ @@ -137,20 +138,22 @@ struct nfs_mount_request { int noresvport; unsigned int *auth_flav_len; rpc_authflavor_t *auth_flavs; + struct net *net; }; extern int nfs_mount(struct nfs_mount_request *info); extern void nfs_umount(const struct nfs_mount_request *info); /* client.c */ -extern struct rpc_program nfs_program; +extern const struct rpc_program nfs_program; +extern void nfs_clients_init(struct net *net); -extern void nfs_cleanup_cb_ident_idr(void); +extern void nfs_cleanup_cb_ident_idr(struct net *); extern void nfs_put_client(struct nfs_client *); -extern struct nfs_client *nfs4_find_client_no_ident(const struct sockaddr *); -extern struct nfs_client *nfs4_find_client_ident(int); +extern struct nfs_client *nfs4_find_client_ident(struct net *, int); extern struct nfs_client * -nfs4_find_client_sessionid(const struct sockaddr *, struct nfs4_sessionid *); +nfs4_find_client_sessionid(struct net *, const struct sockaddr *, + struct nfs4_sessionid *); extern struct nfs_server *nfs_create_server( const struct nfs_parsed_mount_data *, struct nfs_fh *); @@ -329,6 +332,8 @@ void nfs_retry_commit(struct list_head *page_list, void nfs_commit_clear_lock(struct nfs_inode *nfsi); void nfs_commitdata_release(void *data); void nfs_commit_release_pages(struct nfs_write_data *data); +void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *head); +void nfs_request_remove_commit_list(struct nfs_page *req); #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index d4c2d6b..8e65c7f 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -16,7 +16,7 @@ #include <linux/nfs_fs.h> #include "internal.h" -#ifdef RPC_DEBUG +#ifdef NFS_DEBUG # define NFSDBG_FACILITY NFSDBG_MOUNT #endif @@ -67,7 +67,7 @@ enum { MOUNTPROC3_EXPORT = 5, }; -static struct rpc_program mnt_program; +static const struct rpc_program mnt_program; /* * Defined by OpenGroup XNFS Version 3W, chapter 8 @@ -153,7 +153,7 @@ int nfs_mount(struct nfs_mount_request *info) .rpc_resp = &result, }; struct rpc_create_args args = { - .net = &init_net, + .net = info->net, .protocol = info->protocol, .address = info->sap, .addrsize = info->salen, @@ -225,7 +225,7 @@ void nfs_umount(const struct nfs_mount_request *info) .to_retries = 2, }; struct rpc_create_args args = { - .net = &init_net, + .net = info->net, .protocol = IPPROTO_UDP, .address = info->sap, .addrsize = info->salen, @@ -488,19 +488,19 @@ static struct rpc_procinfo mnt3_procedures[] = { }; -static struct rpc_version mnt_version1 = { +static const struct rpc_version mnt_version1 = { .number = 1, .nrprocs = ARRAY_SIZE(mnt_procedures), .procs = mnt_procedures, }; -static struct rpc_version mnt_version3 = { +static const struct rpc_version mnt_version3 = { .number = 3, .nrprocs = ARRAY_SIZE(mnt3_procedures), .procs = mnt3_procedures, }; -static struct rpc_version *mnt_version[] = { +static const struct rpc_version *mnt_version[] = { NULL, &mnt_version1, NULL, @@ -509,7 +509,7 @@ static struct rpc_version *mnt_version[] = { static struct rpc_stat mnt_stats; -static struct rpc_program mnt_program = { +static const struct rpc_program mnt_program = { .name = "mount", .number = NFS_MNT_PROGRAM, .nrvers = ARRAY_SIZE(mnt_version), diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 8102391..1807866 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -276,7 +276,10 @@ out: nfs_free_fattr(fattr); nfs_free_fhandle(fh); out_nofree: - dprintk("<-- nfs_follow_mountpoint() = %p\n", mnt); + if (IS_ERR(mnt)) + dprintk("<-- %s(): error %ld\n", __func__, PTR_ERR(mnt)); + else + dprintk("<-- %s() = %p\n", __func__, mnt); return mnt; } diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h new file mode 100644 index 0000000..aa14ec3 --- /dev/null +++ b/fs/nfs/netns.h @@ -0,0 +1,27 @@ +#ifndef __NFS_NETNS_H__ +#define __NFS_NETNS_H__ + +#include <net/net_namespace.h> +#include <net/netns/generic.h> + +struct bl_dev_msg { + int32_t status; + uint32_t major, minor; +}; + +struct nfs_net { + struct cache_detail *nfs_dns_resolve; + struct rpc_pipe *bl_device_pipe; + struct bl_dev_msg bl_mount_reply; + wait_queue_head_t bl_wq; + struct list_head nfs_client_list; + struct list_head nfs_volume_list; +#ifdef CONFIG_NFS_V4 + struct idr cb_ident_idr; /* Protected by nfs_client_lock */ +#endif + spinlock_t nfs_client_lock; +}; + +extern int nfs_net_id; + +#endif diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 792cb13..1f56000 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -1150,7 +1150,7 @@ struct rpc_procinfo nfs_procedures[] = { PROC(STATFS, fhandle, statfsres, 0), }; -struct rpc_version nfs_version2 = { +const struct rpc_version nfs_version2 = { .number = 2, .nrprocs = ARRAY_SIZE(nfs_procedures), .procs = nfs_procedures diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 7ef2397..e4498dc 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -192,7 +192,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) .pages = pages, }; struct nfs3_getaclres res = { - 0 + NULL, }; struct rpc_message msg = { .rpc_argp = &args, diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 91943953..5242eae 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -428,6 +428,11 @@ nfs3_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) msg->rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE]; } +static void nfs3_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) +{ + rpc_call_start(task); +} + static int nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir) { @@ -445,6 +450,11 @@ nfs3_proc_rename_setup(struct rpc_message *msg, struct inode *dir) msg->rpc_proc = &nfs3_procedures[NFS3PROC_RENAME]; } +static void nfs3_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data) +{ + rpc_call_start(task); +} + static int nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir, struct inode *new_dir) @@ -814,6 +824,11 @@ static void nfs3_proc_read_setup(struct nfs_read_data *data, struct rpc_message msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ]; } +static void nfs3_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) +{ + rpc_call_start(task); +} + static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data) { if (nfs3_async_handle_jukebox(task, data->inode)) @@ -828,6 +843,11 @@ static void nfs3_proc_write_setup(struct nfs_write_data *data, struct rpc_messag msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE]; } +static void nfs3_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) +{ + rpc_call_start(task); +} + static int nfs3_commit_done(struct rpc_task *task, struct nfs_write_data *data) { if (nfs3_async_handle_jukebox(task, data->inode)) @@ -864,9 +884,11 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .create = nfs3_proc_create, .remove = nfs3_proc_remove, .unlink_setup = nfs3_proc_unlink_setup, + .unlink_rpc_prepare = nfs3_proc_unlink_rpc_prepare, .unlink_done = nfs3_proc_unlink_done, .rename = nfs3_proc_rename, .rename_setup = nfs3_proc_rename_setup, + .rename_rpc_prepare = nfs3_proc_rename_rpc_prepare, .rename_done = nfs3_proc_rename_done, .link = nfs3_proc_link, .symlink = nfs3_proc_symlink, @@ -879,8 +901,10 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .pathconf = nfs3_proc_pathconf, .decode_dirent = nfs3_decode_dirent, .read_setup = nfs3_proc_read_setup, + .read_rpc_prepare = nfs3_proc_read_rpc_prepare, .read_done = nfs3_read_done, .write_setup = nfs3_proc_write_setup, + .write_rpc_prepare = nfs3_proc_write_rpc_prepare, .write_done = nfs3_write_done, .commit_setup = nfs3_proc_commit_setup, .commit_done = nfs3_commit_done, diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 183c6b1..a77cc9a 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -2461,7 +2461,7 @@ struct rpc_procinfo nfs3_procedures[] = { PROC(COMMIT, commit, commit, 5), }; -struct rpc_version nfs_version3 = { +const struct rpc_version nfs_version3 = { .number = 3, .nrprocs = ARRAY_SIZE(nfs3_procedures), .procs = nfs3_procedures @@ -2489,7 +2489,7 @@ static struct rpc_procinfo nfs3_acl_procedures[] = { }, }; -struct rpc_version nfsacl_version3 = { +const struct rpc_version nfsacl_version3 = { .number = 3, .nrprocs = sizeof(nfs3_acl_procedures)/ sizeof(nfs3_acl_procedures[0]), diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 4d7d0ae..97ecc86 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -20,7 +20,6 @@ enum nfs4_client_state { NFS4CLNT_RECLAIM_REBOOT, NFS4CLNT_RECLAIM_NOGRACE, NFS4CLNT_DELEGRETURN, - NFS4CLNT_LAYOUTRECALL, NFS4CLNT_SESSION_RESET, NFS4CLNT_RECALL_SLOT, NFS4CLNT_LEASE_CONFIRM, @@ -44,7 +43,7 @@ struct nfs4_minor_version_ops { struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply); - int (*validate_stateid)(struct nfs_delegation *, + bool (*match_stateid)(const nfs4_stateid *, const nfs4_stateid *); int (*find_root_sec)(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); @@ -53,26 +52,25 @@ struct nfs4_minor_version_ops { const struct nfs4_state_maintenance_ops *state_renewal_ops; }; -/* - * struct rpc_sequence ensures that RPC calls are sent in the exact - * order that they appear on the list. - */ -struct rpc_sequence { - struct rpc_wait_queue wait; /* RPC call delay queue */ - spinlock_t lock; /* Protects the list */ - struct list_head list; /* Defines sequence of RPC calls */ +struct nfs_unique_id { + struct rb_node rb_node; + __u64 id; }; #define NFS_SEQID_CONFIRMED 1 struct nfs_seqid_counter { - struct rpc_sequence *sequence; + int owner_id; int flags; u32 counter; + spinlock_t lock; /* Protects the list */ + struct list_head list; /* Defines sequence of RPC calls */ + struct rpc_wait_queue wait; /* RPC call delay queue */ }; struct nfs_seqid { struct nfs_seqid_counter *sequence; struct list_head list; + struct rpc_task *task; }; static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status) @@ -81,18 +79,12 @@ static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status seqid->flags |= NFS_SEQID_CONFIRMED; } -struct nfs_unique_id { - struct rb_node rb_node; - __u64 id; -}; - /* * NFS4 state_owners and lock_owners are simply labels for ordered * sequences of RPC calls. Their sole purpose is to provide once-only * semantics by allowing the server to identify replayed requests. */ struct nfs4_state_owner { - struct nfs_unique_id so_owner_id; struct nfs_server *so_server; struct list_head so_lru; unsigned long so_expires; @@ -105,7 +97,6 @@ struct nfs4_state_owner { unsigned long so_flags; struct list_head so_states; struct nfs_seqid_counter so_seqid; - struct rpc_sequence so_sequence; }; enum { @@ -146,8 +137,6 @@ struct nfs4_lock_state { #define NFS_LOCK_INITIALIZED 1 int ls_flags; struct nfs_seqid_counter ls_seqid; - struct rpc_sequence ls_sequence; - struct nfs_unique_id ls_id; nfs4_stateid ls_stateid; atomic_t ls_count; struct nfs4_lock_owner ls_owner; @@ -193,6 +182,7 @@ struct nfs4_exception { long timeout; int retry; struct nfs4_state *state; + struct inode *inode; }; struct nfs4_state_recovery_ops { @@ -224,7 +214,7 @@ extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, boo extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, struct nfs4_fs_locations *fs_locations, struct page *page); -extern void nfs4_release_lockowner(const struct nfs4_lock_state *); +extern int nfs4_release_lockowner(struct nfs4_lock_state *); extern const struct xattr_handler *nfs4_xattr_handlers[]; #if defined(CONFIG_NFS_V4_1) @@ -233,12 +223,13 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser return server->nfs_client->cl_session; } +extern bool nfs4_set_task_privileged(struct rpc_task *task, void *dummy); extern int nfs4_setup_sequence(const struct nfs_server *server, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, - int cache_reply, struct rpc_task *task); + struct rpc_task *task); extern int nfs41_setup_sequence(struct nfs4_session *session, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, - int cache_reply, struct rpc_task *task); + struct rpc_task *task); extern void nfs4_destroy_session(struct nfs4_session *session); extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); extern int nfs4_proc_create_session(struct nfs_client *); @@ -269,7 +260,7 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser static inline int nfs4_setup_sequence(const struct nfs_server *server, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, - int cache_reply, struct rpc_task *task) + struct rpc_task *task) { return 0; } @@ -319,7 +310,7 @@ static inline void nfs4_schedule_session_recovery(struct nfs4_session *session) } #endif /* CONFIG_NFS_V4_1 */ -extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); +extern struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *, gfp_t); extern void nfs4_put_state_owner(struct nfs4_state_owner *); extern void nfs4_purge_state_owners(struct nfs_server *); extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *); @@ -327,6 +318,8 @@ extern void nfs4_put_open_state(struct nfs4_state *); extern void nfs4_close_state(struct nfs4_state *, fmode_t); extern void nfs4_close_sync(struct nfs4_state *, fmode_t); extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); +extern void nfs_inode_find_state_and_recover(struct inode *inode, + const nfs4_stateid *stateid); extern void nfs4_schedule_lease_recovery(struct nfs_client *); extern void nfs4_schedule_state_manager(struct nfs_client *); extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp); @@ -337,7 +330,8 @@ extern void nfs41_handle_server_scope(struct nfs_client *, struct server_scope **); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); -extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t, pid_t); +extern void nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *, + fmode_t, fl_owner_t, pid_t); extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); @@ -346,6 +340,8 @@ extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid); extern void nfs_release_seqid(struct nfs_seqid *seqid); extern void nfs_free_seqid(struct nfs_seqid *seqid); +extern void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp); + extern const nfs4_stateid zero_stateid; /* nfs4xdr.c */ @@ -357,6 +353,16 @@ struct nfs4_mount_data; extern struct svc_version nfs4_callback_version1; extern struct svc_version nfs4_callback_version4; +static inline void nfs4_stateid_copy(nfs4_stateid *dst, const nfs4_stateid *src) +{ + memcpy(dst, src, sizeof(*dst)); +} + +static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_stateid *src) +{ + return memcmp(dst, src, sizeof(*dst)) == 0; +} + #else #define nfs4_close_state(a, b) do { } while (0) diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 71ec086..634c0bc 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -33,7 +33,10 @@ #include <linux/nfs_page.h> #include <linux/module.h> +#include <linux/sunrpc/metrics.h> + #include "internal.h" +#include "delegation.h" #include "nfs4filelayout.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD @@ -84,12 +87,27 @@ static int filelayout_async_handle_error(struct rpc_task *task, struct nfs_client *clp, int *reset) { + struct nfs_server *mds_server = NFS_SERVER(state->inode); + struct nfs_client *mds_client = mds_server->nfs_client; + if (task->tk_status >= 0) return 0; - *reset = 0; switch (task->tk_status) { + /* MDS state errors */ + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + nfs_remove_bad_delegation(state->inode); + case -NFS4ERR_OPENMODE: + nfs4_schedule_stateid_recovery(mds_server, state); + goto wait_on_recovery; + case -NFS4ERR_EXPIRED: + nfs4_schedule_stateid_recovery(mds_server, state); + nfs4_schedule_lease_recovery(mds_client); + goto wait_on_recovery; + /* DS session errors */ case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: case -NFS4ERR_BAD_HIGH_SLOT: @@ -115,8 +133,14 @@ static int filelayout_async_handle_error(struct rpc_task *task, *reset = 1; break; } +out: task->tk_status = 0; return -EAGAIN; +wait_on_recovery: + rpc_sleep_on(&mds_client->cl_rpcwaitq, task, NULL); + if (test_bit(NFS4CLNT_MANAGER_RUNNING, &mds_client->cl_state) == 0) + rpc_wake_up_queued_task(&mds_client->cl_rpcwaitq, task); + goto out; } /* NFS_PROTO call done callback routines */ @@ -173,7 +197,7 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data) if (nfs41_setup_sequence(rdata->ds_clp->cl_session, &rdata->args.seq_args, &rdata->res.seq_res, - 0, task)) + task)) return; rpc_call_start(task); @@ -189,10 +213,18 @@ static void filelayout_read_call_done(struct rpc_task *task, void *data) rdata->mds_ops->rpc_call_done(task, data); } +static void filelayout_read_count_stats(struct rpc_task *task, void *data) +{ + struct nfs_read_data *rdata = (struct nfs_read_data *)data; + + rpc_count_iostats(task, NFS_SERVER(rdata->inode)->client->cl_metrics); +} + static void filelayout_read_release(void *data) { struct nfs_read_data *rdata = (struct nfs_read_data *)data; + put_lseg(rdata->lseg); rdata->mds_ops->rpc_release(data); } @@ -254,7 +286,7 @@ static void filelayout_write_prepare(struct rpc_task *task, void *data) if (nfs41_setup_sequence(wdata->ds_clp->cl_session, &wdata->args.seq_args, &wdata->res.seq_res, - 0, task)) + task)) return; rpc_call_start(task); @@ -268,10 +300,18 @@ static void filelayout_write_call_done(struct rpc_task *task, void *data) wdata->mds_ops->rpc_call_done(task, data); } +static void filelayout_write_count_stats(struct rpc_task *task, void *data) +{ + struct nfs_write_data *wdata = (struct nfs_write_data *)data; + + rpc_count_iostats(task, NFS_SERVER(wdata->inode)->client->cl_metrics); +} + static void filelayout_write_release(void *data) { struct nfs_write_data *wdata = (struct nfs_write_data *)data; + put_lseg(wdata->lseg); wdata->mds_ops->rpc_release(data); } @@ -282,24 +322,28 @@ static void filelayout_commit_release(void *data) nfs_commit_release_pages(wdata); if (atomic_dec_and_test(&NFS_I(wdata->inode)->commits_outstanding)) nfs_commit_clear_lock(NFS_I(wdata->inode)); + put_lseg(wdata->lseg); nfs_commitdata_release(wdata); } -struct rpc_call_ops filelayout_read_call_ops = { +static const struct rpc_call_ops filelayout_read_call_ops = { .rpc_call_prepare = filelayout_read_prepare, .rpc_call_done = filelayout_read_call_done, + .rpc_count_stats = filelayout_read_count_stats, .rpc_release = filelayout_read_release, }; -struct rpc_call_ops filelayout_write_call_ops = { +static const struct rpc_call_ops filelayout_write_call_ops = { .rpc_call_prepare = filelayout_write_prepare, .rpc_call_done = filelayout_write_call_done, + .rpc_count_stats = filelayout_write_count_stats, .rpc_release = filelayout_write_release, }; -struct rpc_call_ops filelayout_commit_call_ops = { +static const struct rpc_call_ops filelayout_commit_call_ops = { .rpc_call_prepare = filelayout_write_prepare, .rpc_call_done = filelayout_write_call_done, + .rpc_count_stats = filelayout_write_count_stats, .rpc_release = filelayout_commit_release, }; @@ -367,7 +411,8 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync) idx = nfs4_fl_calc_ds_index(lseg, j); ds = nfs4_fl_prepare_ds(lseg, idx); if (!ds) { - printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); + printk(KERN_ERR "NFS: %s: prepare_ds failed, use MDS\n", + __func__); set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); return PNFS_NOT_ATTEMPTED; @@ -575,7 +620,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, goto out_err_free; fl->fh_array[i]->size = be32_to_cpup(p++); if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) { - printk(KERN_ERR "Too big fh %d received %d\n", + printk(KERN_ERR "NFS: Too big fh %d received %d\n", i, fl->fh_array[i]->size); goto out_err_free; } @@ -640,14 +685,16 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, int size = (fl->stripe_type == STRIPE_SPARSE) ? fl->dsaddr->ds_num : fl->dsaddr->stripe_count; - fl->commit_buckets = kcalloc(size, sizeof(struct list_head), gfp_flags); + fl->commit_buckets = kcalloc(size, sizeof(struct nfs4_fl_commit_bucket), gfp_flags); if (!fl->commit_buckets) { filelayout_free_lseg(&fl->generic_hdr); return NULL; } fl->number_of_buckets = size; - for (i = 0; i < size; i++) - INIT_LIST_HEAD(&fl->commit_buckets[i]); + for (i = 0; i < size; i++) { + INIT_LIST_HEAD(&fl->commit_buckets[i].written); + INIT_LIST_HEAD(&fl->commit_buckets[i].committing); + } } return &fl->generic_hdr; } @@ -679,7 +726,7 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, return (p_stripe == r_stripe); } -void +static void filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { @@ -696,7 +743,7 @@ filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, nfs_pageio_reset_read_mds(pgio); } -void +static void filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { @@ -725,11 +772,6 @@ static const struct nfs_pageio_ops filelayout_pg_write_ops = { .pg_doio = pnfs_generic_pg_writepages, }; -static bool filelayout_mark_pnfs_commit(struct pnfs_layout_segment *lseg) -{ - return !FILELAYOUT_LSEG(lseg)->commit_through_mds; -} - static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j) { if (fl->stripe_type == STRIPE_SPARSE) @@ -738,13 +780,49 @@ static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j) return j; } -struct list_head *filelayout_choose_commit_list(struct nfs_page *req) +/* The generic layer is about to remove the req from the commit list. + * If this will make the bucket empty, it will need to put the lseg reference. + */ +static void +filelayout_clear_request_commit(struct nfs_page *req) +{ + struct pnfs_layout_segment *freeme = NULL; + struct inode *inode = req->wb_context->dentry->d_inode; + + spin_lock(&inode->i_lock); + if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags)) + goto out; + if (list_is_singular(&req->wb_list)) { + struct inode *inode = req->wb_context->dentry->d_inode; + struct pnfs_layout_segment *lseg; + + /* From here we can find the bucket, but for the moment, + * since there is only one relevant lseg... + */ + list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) { + if (lseg->pls_range.iomode == IOMODE_RW) { + freeme = lseg; + break; + } + } + } +out: + nfs_request_remove_commit_list(req); + spin_unlock(&inode->i_lock); + put_lseg(freeme); +} + +static struct list_head * +filelayout_choose_commit_list(struct nfs_page *req, + struct pnfs_layout_segment *lseg) { - struct pnfs_layout_segment *lseg = req->wb_commit_lseg; struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); u32 i, j; struct list_head *list; + if (fl->commit_through_mds) + return &NFS_I(req->wb_context->dentry->d_inode)->commit_list; + /* Note that we are calling nfs4_fl_calc_j_index on each page * that ends up being committed to a data server. An attractive * alternative is to add a field to nfs_write_data and nfs_page @@ -754,14 +832,30 @@ struct list_head *filelayout_choose_commit_list(struct nfs_page *req) j = nfs4_fl_calc_j_index(lseg, (loff_t)req->wb_index << PAGE_CACHE_SHIFT); i = select_bucket_index(fl, j); - list = &fl->commit_buckets[i]; + list = &fl->commit_buckets[i].written; if (list_empty(list)) { - /* Non-empty buckets hold a reference on the lseg */ + /* Non-empty buckets hold a reference on the lseg. That ref + * is normally transferred to the COMMIT call and released + * there. It could also be released if the last req is pulled + * off due to a rewrite, in which case it will be done in + * filelayout_remove_commit_req + */ get_lseg(lseg); } + set_bit(PG_COMMIT_TO_DS, &req->wb_flags); return list; } +static void +filelayout_mark_request_commit(struct nfs_page *req, + struct pnfs_layout_segment *lseg) +{ + struct list_head *list; + + list = filelayout_choose_commit_list(req, lseg); + nfs_request_add_commit_list(req, list); +} + static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i) { struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); @@ -797,11 +891,12 @@ static int filelayout_initiate_commit(struct nfs_write_data *data, int how) idx = calc_ds_index_from_commit(lseg, data->ds_commit_index); ds = nfs4_fl_prepare_ds(lseg, idx); if (!ds) { - printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); + printk(KERN_ERR "NFS: %s: prepare_ds failed, use MDS\n", + __func__); set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); prepare_to_resend_writes(data); - data->mds_ops->rpc_release(data); + filelayout_commit_release(data); return -EAGAIN; } dprintk("%s ino %lu, how %d\n", __func__, data->inode->i_ino, how); @@ -817,24 +912,87 @@ static int filelayout_initiate_commit(struct nfs_write_data *data, int how) /* * This is only useful while we are using whole file layouts. */ -static struct pnfs_layout_segment *find_only_write_lseg(struct inode *inode) +static struct pnfs_layout_segment * +find_only_write_lseg_locked(struct inode *inode) { - struct pnfs_layout_segment *lseg, *rv = NULL; + struct pnfs_layout_segment *lseg; - spin_lock(&inode->i_lock); list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) if (lseg->pls_range.iomode == IOMODE_RW) - rv = get_lseg(lseg); + return lseg; + return NULL; +} + +static struct pnfs_layout_segment *find_only_write_lseg(struct inode *inode) +{ + struct pnfs_layout_segment *rv; + + spin_lock(&inode->i_lock); + rv = find_only_write_lseg_locked(inode); + if (rv) + get_lseg(rv); spin_unlock(&inode->i_lock); return rv; } -static int alloc_ds_commits(struct inode *inode, struct list_head *list) +static int +filelayout_scan_ds_commit_list(struct nfs4_fl_commit_bucket *bucket, int max, + spinlock_t *lock) +{ + struct list_head *src = &bucket->written; + struct list_head *dst = &bucket->committing; + struct nfs_page *req, *tmp; + int ret = 0; + + list_for_each_entry_safe(req, tmp, src, wb_list) { + if (!nfs_lock_request(req)) + continue; + if (cond_resched_lock(lock)) + list_safe_reset_next(req, tmp, wb_list); + nfs_request_remove_commit_list(req); + clear_bit(PG_COMMIT_TO_DS, &req->wb_flags); + nfs_list_add_request(req, dst); + ret++; + if (ret == max) + break; + } + return ret; +} + +/* Move reqs from written to committing lists, returning count of number moved. + * Note called with i_lock held. + */ +static int filelayout_scan_commit_lists(struct inode *inode, int max, + spinlock_t *lock) +{ + struct pnfs_layout_segment *lseg; + struct nfs4_filelayout_segment *fl; + int i, rv = 0, cnt; + + lseg = find_only_write_lseg_locked(inode); + if (!lseg) + goto out_done; + fl = FILELAYOUT_LSEG(lseg); + if (fl->commit_through_mds) + goto out_done; + for (i = 0; i < fl->number_of_buckets && max != 0; i++) { + cnt = filelayout_scan_ds_commit_list(&fl->commit_buckets[i], + max, lock); + max -= cnt; + rv += cnt; + } +out_done: + return rv; +} + +static unsigned int +alloc_ds_commits(struct inode *inode, struct list_head *list) { struct pnfs_layout_segment *lseg; struct nfs4_filelayout_segment *fl; struct nfs_write_data *data; int i, j; + unsigned int nreq = 0; /* Won't need this when non-whole file layout segments are supported * instead we will use a pnfs_layout_hdr structure */ @@ -843,28 +1001,27 @@ static int alloc_ds_commits(struct inode *inode, struct list_head *list) return 0; fl = FILELAYOUT_LSEG(lseg); for (i = 0; i < fl->number_of_buckets; i++) { - if (list_empty(&fl->commit_buckets[i])) + if (list_empty(&fl->commit_buckets[i].committing)) continue; data = nfs_commitdata_alloc(); if (!data) - goto out_bad; + break; data->ds_commit_index = i; data->lseg = lseg; list_add(&data->pages, list); + nreq++; } - put_lseg(lseg); - return 0; -out_bad: + /* Clean up on error */ for (j = i; j < fl->number_of_buckets; j++) { - if (list_empty(&fl->commit_buckets[i])) + if (list_empty(&fl->commit_buckets[i].committing)) continue; - nfs_retry_commit(&fl->commit_buckets[i], lseg); + nfs_retry_commit(&fl->commit_buckets[i].committing, lseg); put_lseg(lseg); /* associated with emptying bucket */ } put_lseg(lseg); /* Caller will clean up entries put on list */ - return -ENOMEM; + return nreq; } /* This follows nfs_commit_list pretty closely */ @@ -874,40 +1031,40 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, { struct nfs_write_data *data, *tmp; LIST_HEAD(list); + unsigned int nreq = 0; if (!list_empty(mds_pages)) { data = nfs_commitdata_alloc(); - if (!data) - goto out_bad; - data->lseg = NULL; - list_add(&data->pages, &list); + if (data != NULL) { + data->lseg = NULL; + list_add(&data->pages, &list); + nreq++; + } else + nfs_retry_commit(mds_pages, NULL); } - if (alloc_ds_commits(inode, &list)) - goto out_bad; + nreq += alloc_ds_commits(inode, &list); + + if (nreq == 0) { + nfs_commit_clear_lock(NFS_I(inode)); + goto out; + } + + atomic_add(nreq, &NFS_I(inode)->commits_outstanding); list_for_each_entry_safe(data, tmp, &list, pages) { list_del_init(&data->pages); - atomic_inc(&NFS_I(inode)->commits_outstanding); if (!data->lseg) { nfs_init_commit(data, mds_pages, NULL); nfs_initiate_commit(data, NFS_CLIENT(inode), data->mds_ops, how); } else { - nfs_init_commit(data, &FILELAYOUT_LSEG(data->lseg)->commit_buckets[data->ds_commit_index], data->lseg); + nfs_init_commit(data, &FILELAYOUT_LSEG(data->lseg)->commit_buckets[data->ds_commit_index].committing, data->lseg); filelayout_initiate_commit(data, how); } } - return 0; - out_bad: - list_for_each_entry_safe(data, tmp, &list, pages) { - nfs_retry_commit(&data->pages, data->lseg); - list_del_init(&data->pages); - nfs_commit_free(data); - } - nfs_retry_commit(mds_pages, NULL); - nfs_commit_clear_lock(NFS_I(inode)); - return -ENOMEM; +out: + return PNFS_ATTEMPTED; } static void @@ -924,8 +1081,9 @@ static struct pnfs_layoutdriver_type filelayout_type = { .free_lseg = filelayout_free_lseg, .pg_read_ops = &filelayout_pg_read_ops, .pg_write_ops = &filelayout_pg_write_ops, - .mark_pnfs_commit = filelayout_mark_pnfs_commit, - .choose_commit_list = filelayout_choose_commit_list, + .mark_request_commit = filelayout_mark_request_commit, + .clear_request_commit = filelayout_clear_request_commit, + .scan_commit_lists = filelayout_scan_commit_lists, .commit_pagelist = filelayout_commit_pagelist, .read_pagelist = filelayout_read_pagelist, .write_pagelist = filelayout_write_pagelist, diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h index 2e42284..21190bb 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/nfs4filelayout.h @@ -74,6 +74,11 @@ struct nfs4_file_layout_dsaddr { struct nfs4_pnfs_ds *ds_list[1]; }; +struct nfs4_fl_commit_bucket { + struct list_head written; + struct list_head committing; +}; + struct nfs4_filelayout_segment { struct pnfs_layout_segment generic_hdr; u32 stripe_type; @@ -84,7 +89,7 @@ struct nfs4_filelayout_segment { struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */ unsigned int num_fh; struct nfs_fh **fh_array; - struct list_head *commit_buckets; /* Sort commits to ds */ + struct nfs4_fl_commit_bucket *commit_buckets; /* Sort commits to ds */ int number_of_buckets; }; diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index 8ae9190..a866bbd 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -45,7 +45,7 @@ * - incremented when a device id maps a data server already in the cache. * - decremented when deviceid is removed from the cache. */ -DEFINE_SPINLOCK(nfs4_ds_cache_lock); +static DEFINE_SPINLOCK(nfs4_ds_cache_lock); static LIST_HEAD(nfs4_data_server_cache); /* Debug routines */ @@ -108,58 +108,40 @@ same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2) return false; } -/* - * Lookup DS by addresses. The first matching address returns true. - * nfs4_ds_cache_lock is held - */ -static struct nfs4_pnfs_ds * -_data_server_lookup_locked(struct list_head *dsaddrs) +static bool +_same_data_server_addrs_locked(const struct list_head *dsaddrs1, + const struct list_head *dsaddrs2) { - struct nfs4_pnfs_ds *ds; struct nfs4_pnfs_ds_addr *da1, *da2; - list_for_each_entry(da1, dsaddrs, da_node) { - list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) { - list_for_each_entry(da2, &ds->ds_addrs, da_node) { - if (same_sockaddr( - (struct sockaddr *)&da1->da_addr, - (struct sockaddr *)&da2->da_addr)) - return ds; - } - } + /* step through both lists, comparing as we go */ + for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node), + da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node); + da1 != NULL && da2 != NULL; + da1 = list_entry(da1->da_node.next, typeof(*da1), da_node), + da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) { + if (!same_sockaddr((struct sockaddr *)&da1->da_addr, + (struct sockaddr *)&da2->da_addr)) + return false; } - return NULL; + if (da1 == NULL && da2 == NULL) + return true; + + return false; } /* - * Compare two lists of addresses. + * Lookup DS by addresses. nfs4_ds_cache_lock is held */ -static bool -_data_server_match_all_addrs_locked(struct list_head *dsaddrs1, - struct list_head *dsaddrs2) +static struct nfs4_pnfs_ds * +_data_server_lookup_locked(const struct list_head *dsaddrs) { - struct nfs4_pnfs_ds_addr *da1, *da2; - size_t count1 = 0, - count2 = 0; - - list_for_each_entry(da1, dsaddrs1, da_node) - count1++; - - list_for_each_entry(da2, dsaddrs2, da_node) { - bool found = false; - count2++; - list_for_each_entry(da1, dsaddrs1, da_node) { - if (same_sockaddr((struct sockaddr *)&da1->da_addr, - (struct sockaddr *)&da2->da_addr)) { - found = true; - break; - } - } - if (!found) - return false; - } + struct nfs4_pnfs_ds *ds; - return (count1 == count2); + list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) + if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs)) + return ds; + return NULL; } /* @@ -356,11 +338,6 @@ nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) dprintk("%s add new data server %s\n", __func__, ds->ds_remotestr); } else { - if (!_data_server_match_all_addrs_locked(&tmp_ds->ds_addrs, - dsaddrs)) { - dprintk("%s: multipath address mismatch: %s != %s", - __func__, tmp_ds->ds_remotestr, remotestr); - } kfree(remotestr); kfree(ds); atomic_inc(&tmp_ds->ds_count); @@ -378,7 +355,7 @@ out: * Currently only supports ipv4, ipv6 and one multi-path address. */ static struct nfs4_pnfs_ds_addr * -decode_ds_addr(struct xdr_stream *streamp, gfp_t gfp_flags) +decode_ds_addr(struct net *net, struct xdr_stream *streamp, gfp_t gfp_flags) { struct nfs4_pnfs_ds_addr *da = NULL; char *buf, *portstr; @@ -457,7 +434,7 @@ decode_ds_addr(struct xdr_stream *streamp, gfp_t gfp_flags) INIT_LIST_HEAD(&da->da_node); - if (!rpc_pton(buf, portstr-buf, (struct sockaddr *)&da->da_addr, + if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr, sizeof(da->da_addr))) { dprintk("%s: error parsing address %s\n", __func__, buf); goto out_free_da; @@ -554,7 +531,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) cnt = be32_to_cpup(p); dprintk("%s stripe count %d\n", __func__, cnt); if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) { - printk(KERN_WARNING "%s: stripe count %d greater than " + printk(KERN_WARNING "NFS: %s: stripe count %d greater than " "supported maximum %d\n", __func__, cnt, NFS4_PNFS_MAX_STRIPE_CNT); goto out_err_free_scratch; @@ -585,7 +562,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) num = be32_to_cpup(p); dprintk("%s ds_num %u\n", __func__, num); if (num > NFS4_PNFS_MAX_MULTI_CNT) { - printk(KERN_WARNING "%s: multipath count %d greater than " + printk(KERN_WARNING "NFS: %s: multipath count %d greater than " "supported maximum %d\n", __func__, num, NFS4_PNFS_MAX_MULTI_CNT); goto out_err_free_stripe_indices; @@ -593,7 +570,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) /* validate stripe indices are all < num */ if (max_stripe_index >= num) { - printk(KERN_WARNING "%s: stripe index %u >= num ds %u\n", + printk(KERN_WARNING "NFS: %s: stripe index %u >= num ds %u\n", __func__, max_stripe_index, num); goto out_err_free_stripe_indices; } @@ -625,7 +602,8 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) mp_count = be32_to_cpup(p); /* multipath count */ for (j = 0; j < mp_count; j++) { - da = decode_ds_addr(&stream, gfp_flags); + da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->net, + &stream, gfp_flags); if (da) list_add_tail(&da->da_node, &dsaddrs); } @@ -686,7 +664,7 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_fl new = decode_device(inode, dev, gfp_flags); if (!new) { - printk(KERN_WARNING "%s: Could not decode or add device\n", + printk(KERN_WARNING "NFS: %s: Could not decode or add device\n", __func__); return NULL; } @@ -835,7 +813,7 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx]; if (ds == NULL) { - printk(KERN_ERR "%s: No data server for offset index %d\n", + printk(KERN_ERR "NFS: %s: No data server for offset index %d\n", __func__, ds_idx); return NULL; } diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index bb80c49..9c8eca31 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -94,13 +94,14 @@ static int nfs4_validate_fspath(struct dentry *dentry, } static size_t nfs_parse_server_name(char *string, size_t len, - struct sockaddr *sa, size_t salen) + struct sockaddr *sa, size_t salen, struct nfs_server *server) { + struct net *net = rpc_net_ns(server->client); ssize_t ret; - ret = rpc_pton(string, len, sa, salen); + ret = rpc_pton(net, string, len, sa, salen); if (ret == 0) { - ret = nfs_dns_resolve_name(string, len, sa, salen); + ret = nfs_dns_resolve_name(net, string, len, sa, salen); if (ret < 0) ret = 0; } @@ -137,7 +138,8 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, continue; mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len, - mountdata->addr, addr_bufsize); + mountdata->addr, addr_bufsize, + NFS_SB(mountdata->sb)); if (mountdata->addrlen == 0) continue; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index caf92d0..e809d23 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -72,18 +72,21 @@ #define NFS4_MAX_LOOP_ON_RECOVER (10) +static unsigned short max_session_slots = NFS4_DEF_SLOT_TABLE_SIZE; + struct nfs4_opendata; static int _nfs4_proc_open(struct nfs4_opendata *data); static int _nfs4_recover_proc_open(struct nfs4_opendata *data); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); +static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr); static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs_fattr *fattr, struct iattr *sattr, struct nfs4_state *state); #ifdef CONFIG_NFS_V4_1 -static int nfs41_test_stateid(struct nfs_server *, struct nfs4_state *); -static int nfs41_free_stateid(struct nfs_server *, struct nfs4_state *); +static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *); +static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *); #endif /* Prevent leaks of NFSv4 errors into userland */ static int nfs4_map_errors(int err) @@ -259,15 +262,28 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc { struct nfs_client *clp = server->nfs_client; struct nfs4_state *state = exception->state; + struct inode *inode = exception->inode; int ret = errorcode; exception->retry = 0; switch(errorcode) { case 0: return 0; + case -NFS4ERR_OPENMODE: + if (nfs_have_delegation(inode, FMODE_READ)) { + nfs_inode_return_delegation(inode); + exception->retry = 1; + return 0; + } + if (state == NULL) + break; + nfs4_schedule_stateid_recovery(server, state); + goto wait_on_recovery; + case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_OPENMODE: + if (state != NULL) + nfs_remove_bad_delegation(state->inode); if (state == NULL) break; nfs4_schedule_stateid_recovery(server, state); @@ -360,16 +376,14 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp * When updating highest_used_slotid there may be "holes" in the bitmap * so we need to scan down from highest_used_slotid to 0 looking for the now * highest slotid in use. - * If none found, highest_used_slotid is set to -1. + * If none found, highest_used_slotid is set to NFS4_NO_SLOT. * * Must be called while holding tbl->slot_tbl_lock */ static void -nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid) +nfs4_free_slot(struct nfs4_slot_table *tbl, u32 slotid) { - int slotid = free_slotid; - - BUG_ON(slotid < 0 || slotid >= NFS4_MAX_SLOT_TABLE); + BUG_ON(slotid >= NFS4_MAX_SLOT_TABLE); /* clear used bit in bitmap */ __clear_bit(slotid, tbl->used_slots); @@ -379,10 +393,16 @@ nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid) if (slotid < tbl->max_slots) tbl->highest_used_slotid = slotid; else - tbl->highest_used_slotid = -1; + tbl->highest_used_slotid = NFS4_NO_SLOT; } - dprintk("%s: free_slotid %u highest_used_slotid %d\n", __func__, - free_slotid, tbl->highest_used_slotid); + dprintk("%s: slotid %u highest_used_slotid %d\n", __func__, + slotid, tbl->highest_used_slotid); +} + +bool nfs4_set_task_privileged(struct rpc_task *task, void *dummy) +{ + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); + return true; } /* @@ -390,16 +410,13 @@ nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid) */ static void nfs4_check_drain_fc_complete(struct nfs4_session *ses) { - struct rpc_task *task; - if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { - task = rpc_wake_up_next(&ses->fc_slot_table.slot_tbl_waitq); - if (task) - rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); + rpc_wake_up_first(&ses->fc_slot_table.slot_tbl_waitq, + nfs4_set_task_privileged, NULL); return; } - if (ses->fc_slot_table.highest_used_slotid != -1) + if (ses->fc_slot_table.highest_used_slotid != NFS4_NO_SLOT) return; dprintk("%s COMPLETE: Session Fore Channel Drained\n", __func__); @@ -412,7 +429,7 @@ static void nfs4_check_drain_fc_complete(struct nfs4_session *ses) void nfs4_check_drain_bc_complete(struct nfs4_session *ses) { if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state) || - ses->bc_slot_table.highest_used_slotid != -1) + ses->bc_slot_table.highest_used_slotid != NFS4_NO_SLOT) return; dprintk("%s COMPLETE: Session Back Channel Drained\n", __func__); complete(&ses->bc_slot_table.complete); @@ -507,25 +524,25 @@ static int nfs4_sequence_done(struct rpc_task *task, * nfs4_find_slot looks for an unset bit in the used_slots bitmap. * If found, we mark the slot as used, update the highest_used_slotid, * and respectively set up the sequence operation args. - * The slot number is returned if found, or NFS4_MAX_SLOT_TABLE otherwise. + * The slot number is returned if found, or NFS4_NO_SLOT otherwise. * * Note: must be called with under the slot_tbl_lock. */ -static u8 +static u32 nfs4_find_slot(struct nfs4_slot_table *tbl) { - int slotid; - u8 ret_id = NFS4_MAX_SLOT_TABLE; - BUILD_BUG_ON((u8)NFS4_MAX_SLOT_TABLE != (int)NFS4_MAX_SLOT_TABLE); + u32 slotid; + u32 ret_id = NFS4_NO_SLOT; - dprintk("--> %s used_slots=%04lx highest_used=%d max_slots=%d\n", + dprintk("--> %s used_slots=%04lx highest_used=%u max_slots=%u\n", __func__, tbl->used_slots[0], tbl->highest_used_slotid, tbl->max_slots); slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slots); if (slotid >= tbl->max_slots) goto out; __set_bit(slotid, tbl->used_slots); - if (slotid > tbl->highest_used_slotid) + if (slotid > tbl->highest_used_slotid || + tbl->highest_used_slotid == NFS4_NO_SLOT) tbl->highest_used_slotid = slotid; ret_id = slotid; out: @@ -534,15 +551,25 @@ out: return ret_id; } +static void nfs41_init_sequence(struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, int cache_reply) +{ + args->sa_session = NULL; + args->sa_cache_this = 0; + if (cache_reply) + args->sa_cache_this = 1; + res->sr_session = NULL; + res->sr_slot = NULL; +} + int nfs41_setup_sequence(struct nfs4_session *session, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, - int cache_reply, struct rpc_task *task) { struct nfs4_slot *slot; struct nfs4_slot_table *tbl; - u8 slotid; + u32 slotid; dprintk("--> %s\n", __func__); /* slot already allocated? */ @@ -570,7 +597,7 @@ int nfs41_setup_sequence(struct nfs4_session *session, } slotid = nfs4_find_slot(tbl); - if (slotid == NFS4_MAX_SLOT_TABLE) { + if (slotid == NFS4_NO_SLOT) { rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); spin_unlock(&tbl->slot_tbl_lock); dprintk("<-- %s: no free slots\n", __func__); @@ -582,7 +609,6 @@ int nfs41_setup_sequence(struct nfs4_session *session, slot = tbl->slots + slotid; args->sa_session = session; args->sa_slotid = slotid; - args->sa_cache_this = cache_reply; dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr); @@ -602,24 +628,19 @@ EXPORT_SYMBOL_GPL(nfs41_setup_sequence); int nfs4_setup_sequence(const struct nfs_server *server, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, - int cache_reply, struct rpc_task *task) { struct nfs4_session *session = nfs4_get_session(server); int ret = 0; - if (session == NULL) { - args->sa_session = NULL; - res->sr_session = NULL; + if (session == NULL) goto out; - } dprintk("--> %s clp %p session %p sr_slot %td\n", __func__, session->clp, session, res->sr_slot ? res->sr_slot - session->fc_slot_table.slots : -1); - ret = nfs41_setup_sequence(session, args, res, cache_reply, - task); + ret = nfs41_setup_sequence(session, args, res, task); out: dprintk("<-- %s status=%d\n", __func__, ret); return ret; @@ -629,7 +650,6 @@ struct nfs41_call_sync_data { const struct nfs_server *seq_server; struct nfs4_sequence_args *seq_args; struct nfs4_sequence_res *seq_res; - int cache_reply; }; static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata) @@ -639,7 +659,7 @@ static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata) dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server); if (nfs4_setup_sequence(data->seq_server, data->seq_args, - data->seq_res, data->cache_reply, task)) + data->seq_res, task)) return; rpc_call_start(task); } @@ -657,12 +677,12 @@ static void nfs41_call_sync_done(struct rpc_task *task, void *calldata) nfs41_sequence_done(task, data->seq_res); } -struct rpc_call_ops nfs41_call_sync_ops = { +static const struct rpc_call_ops nfs41_call_sync_ops = { .rpc_call_prepare = nfs41_call_sync_prepare, .rpc_call_done = nfs41_call_sync_done, }; -struct rpc_call_ops nfs41_call_priv_sync_ops = { +static const struct rpc_call_ops nfs41_call_priv_sync_ops = { .rpc_call_prepare = nfs41_call_priv_sync_prepare, .rpc_call_done = nfs41_call_sync_done, }; @@ -672,7 +692,6 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, struct rpc_message *msg, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, - int cache_reply, int privileged) { int ret; @@ -681,7 +700,6 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, .seq_server = server, .seq_args = args, .seq_res = res, - .cache_reply = cache_reply, }; struct rpc_task_setup task_setup = { .rpc_client = clnt, @@ -690,7 +708,6 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, .callback_data = &data }; - res->sr_slot = NULL; if (privileged) task_setup.callback_ops = &nfs41_call_priv_sync_ops; task = rpc_run_task(&task_setup); @@ -710,10 +727,17 @@ int _nfs4_call_sync_session(struct rpc_clnt *clnt, struct nfs4_sequence_res *res, int cache_reply) { - return nfs4_call_sync_sequence(clnt, server, msg, args, res, cache_reply, 0); + nfs41_init_sequence(args, res, cache_reply); + return nfs4_call_sync_sequence(clnt, server, msg, args, res, 0); } #else +static inline +void nfs41_init_sequence(struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, int cache_reply) +{ +} + static int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) { @@ -728,7 +752,7 @@ int _nfs4_call_sync(struct rpc_clnt *clnt, struct nfs4_sequence_res *res, int cache_reply) { - args->sa_session = res->sr_session = NULL; + nfs41_init_sequence(args, res, cache_reply); return rpc_call_sync(clnt, msg, 0); } @@ -815,20 +839,22 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, p->o_arg.open_flags = flags; p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE); p->o_arg.clientid = server->nfs_client->cl_clientid; - p->o_arg.id = sp->so_owner_id.id; + p->o_arg.id = sp->so_seqid.owner_id; p->o_arg.name = &dentry->d_name; p->o_arg.server = server; p->o_arg.bitmask = server->attr_bitmask; p->o_arg.dir_bitmask = server->cache_consistency_bitmask; p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; - if (flags & O_CREAT) { - u32 *s; + if (attrs != NULL && attrs->ia_valid != 0) { + __be32 verf[2]; p->o_arg.u.attrs = &p->attrs; memcpy(&p->attrs, attrs, sizeof(p->attrs)); - s = (u32 *) p->o_arg.u.verifier.data; - s[0] = jiffies; - s[1] = current->pid; + + verf[0] = jiffies; + verf[1] = current->pid; + memcpy(p->o_arg.u.verifier.data, verf, + sizeof(p->o_arg.u.verifier.data)); } p->c_arg.fh = &p->o_res.fh; p->c_arg.stateid = &p->o_res.stateid; @@ -878,7 +904,7 @@ static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode { int ret = 0; - if (open_mode & O_EXCL) + if (open_mode & (O_EXCL|O_TRUNC)) goto out; switch (mode & (FMODE_READ|FMODE_WRITE)) { case FMODE_READ: @@ -927,8 +953,8 @@ static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode) static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) { if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) - memcpy(state->stateid.data, stateid->data, sizeof(state->stateid.data)); - memcpy(state->open_stateid.data, stateid->data, sizeof(state->open_stateid.data)); + nfs4_stateid_copy(&state->stateid, stateid); + nfs4_stateid_copy(&state->open_stateid, stateid); switch (fmode) { case FMODE_READ: set_bit(NFS_O_RDONLY_STATE, &state->flags); @@ -956,7 +982,7 @@ static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_s */ write_seqlock(&state->seqlock); if (deleg_stateid != NULL) { - memcpy(state->stateid.data, deleg_stateid->data, sizeof(state->stateid.data)); + nfs4_stateid_copy(&state->stateid, deleg_stateid); set_bit(NFS_DELEGATED_STATE, &state->flags); } if (open_stateid != NULL) @@ -987,7 +1013,7 @@ static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stat if (delegation == NULL) delegation = &deleg_cur->stateid; - else if (memcmp(deleg_cur->stateid.data, delegation->data, NFS4_STATEID_SIZE) != 0) + else if (!nfs4_stateid_match(&deleg_cur->stateid, delegation)) goto no_delegation_unlock; nfs_mark_delegation_referenced(deleg_cur); @@ -1026,7 +1052,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) struct nfs4_state *state = opendata->state; struct nfs_inode *nfsi = NFS_I(state->inode); struct nfs_delegation *delegation; - int open_mode = opendata->o_arg.open_flags & O_EXCL; + int open_mode = opendata->o_arg.open_flags & (O_EXCL|O_TRUNC); fmode_t fmode = opendata->o_arg.fmode; nfs4_stateid stateid; int ret = -EAGAIN; @@ -1048,7 +1074,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) break; } /* Save the delegation */ - memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data)); + nfs4_stateid_copy(&stateid, &delegation->stateid); rcu_read_unlock(); ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode); if (ret != 0) @@ -1090,6 +1116,7 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data if (state == NULL) goto err_put_inode; if (data->o_res.delegation_type != 0) { + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; int delegation_flags = 0; rcu_read_lock(); @@ -1101,7 +1128,7 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data pr_err_ratelimited("NFS: Broken NFSv4 server %s is " "returning a delegation for " "OPEN(CLAIM_DELEGATE_CUR)\n", - NFS_CLIENT(inode)->cl_server); + clp->cl_hostname); } else if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0) nfs_inode_set_delegation(state->inode, data->owner->so_cred, @@ -1210,10 +1237,10 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * * Check if we need to update the current stateid. */ if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0 && - memcmp(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)) != 0) { + !nfs4_stateid_match(&state->stateid, &state->open_stateid)) { write_seqlock(&state->seqlock); if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) - memcpy(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)); + nfs4_stateid_copy(&state->stateid, &state->open_stateid); write_sequnlock(&state->seqlock); } return 0; @@ -1282,8 +1309,7 @@ static int _nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs if (IS_ERR(opendata)) return PTR_ERR(opendata); opendata->o_arg.claim = NFS4_OPEN_CLAIM_DELEGATE_CUR; - memcpy(opendata->o_arg.u.delegation.data, stateid->data, - sizeof(opendata->o_arg.u.delegation.data)); + nfs4_stateid_copy(&opendata->o_arg.u.delegation, stateid); ret = nfs4_open_recover(opendata, state); nfs4_opendata_put(opendata); return ret; @@ -1319,8 +1345,11 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state * The show must go on: exit, but mark the * stateid as needing recovery. */ + case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: + nfs_inode_find_state_and_recover(state->inode, + stateid); nfs4_schedule_stateid_recovery(server, state); case -EKEYEXPIRED: /* @@ -1345,8 +1374,7 @@ static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) data->rpc_status = task->tk_status; if (data->rpc_status == 0) { - memcpy(data->o_res.stateid.data, data->c_res.stateid.data, - sizeof(data->o_res.stateid.data)); + nfs4_stateid_copy(&data->o_res.stateid, &data->c_res.stateid); nfs_confirm_seqid(&data->owner->so_seqid, 0); renew_lease(data->o_res.server, data->timestamp); data->rpc_done = 1; @@ -1440,7 +1468,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) rcu_read_unlock(); } /* Update sequence id. */ - data->o_arg.id = sp->so_owner_id.id; + data->o_arg.id = sp->so_seqid.owner_id; data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid; if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) { task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; @@ -1449,7 +1477,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) data->timestamp = jiffies; if (nfs4_setup_sequence(data->o_arg.server, &data->o_arg.seq_args, - &data->o_res.seq_res, 1, task)) + &data->o_res.seq_res, task)) return; rpc_call_start(task); return; @@ -1551,6 +1579,7 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover) }; int status; + nfs41_init_sequence(&o_arg->seq_args, &o_res->seq_res, 1); kref_get(&data->kref); data->rpc_done = 0; data->rpc_status = 0; @@ -1712,15 +1741,32 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta } #if defined(CONFIG_NFS_V4_1) -static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state) +static int nfs41_check_expired_stateid(struct nfs4_state *state, nfs4_stateid *stateid, unsigned int flags) { - int status; + int status = NFS_OK; struct nfs_server *server = NFS_SERVER(state->inode); - status = nfs41_test_stateid(server, state); - if (status == NFS_OK) - return 0; - nfs41_free_stateid(server, state); + if (state->flags & flags) { + status = nfs41_test_stateid(server, stateid); + if (status != NFS_OK) { + nfs41_free_stateid(server, stateid); + state->flags &= ~flags; + } + } + return status; +} + +static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state) +{ + int deleg_status, open_status; + int deleg_flags = 1 << NFS_DELEGATED_STATE; + int open_flags = (1 << NFS_O_RDONLY_STATE) | (1 << NFS_O_WRONLY_STATE) | (1 << NFS_O_RDWR_STATE); + + deleg_status = nfs41_check_expired_stateid(state, &state->stateid, deleg_flags); + open_status = nfs41_check_expired_stateid(state, &state->open_stateid, open_flags); + + if ((deleg_status == NFS_OK) && (open_status == NFS_OK)) + return NFS_OK; return nfs4_open_expired(sp, state); } #endif @@ -1754,7 +1800,8 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, fmode_t fmode /* Protect against reboot recovery conflicts */ status = -ENOMEM; - if (!(sp = nfs4_get_state_owner(server, cred))) { + sp = nfs4_get_state_owner(server, cred, GFP_KERNEL); + if (sp == NULL) { dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n"); goto out_err; } @@ -1829,7 +1876,7 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct dentry *dentry, * the user though... */ if (status == -NFS4ERR_BAD_SEQID) { - printk(KERN_WARNING "NFS: v4 server %s " + pr_warn_ratelimited("NFS: v4 server %s " " returned a bad sequence-id error!\n", NFS_SERVER(dir)->nfs_client->cl_hostname); exception.retry = 1; @@ -1882,12 +1929,14 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, nfs_fattr_init(fattr); - if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) { + if (state != NULL) { + nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE, + current->files, current->tgid); + } else if (nfs4_copy_delegation_stateid(&arg.stateid, inode, + FMODE_WRITE)) { /* Use that stateid */ - } else if (state != NULL) { - nfs4_copy_stateid(&arg.stateid, state, current->files, current->tgid); } else - memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); + nfs4_stateid_copy(&arg.stateid, &zero_stateid); status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (status == 0 && state != NULL) @@ -1900,7 +1949,10 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs4_state *state) { struct nfs_server *server = NFS_SERVER(inode); - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .state = state, + .inode = inode, + }; int err; do { err = nfs4_handle_exception(server, @@ -1954,6 +2006,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) struct nfs4_state *state = calldata->state; struct nfs_server *server = NFS_SERVER(calldata->inode); + dprintk("%s: begin!\n", __func__); if (!nfs4_sequence_done(task, &calldata->res.seq_res)) return; /* hmm. we are done with the inode, and in the process of freeing @@ -1981,6 +2034,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) } nfs_release_seqid(calldata->arg.seqid); nfs_refresh_inode(calldata->inode, calldata->res.fattr); + dprintk("%s: done, ret = %d!\n", __func__, task->tk_status); } static void nfs4_close_prepare(struct rpc_task *task, void *data) @@ -1989,6 +2043,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) struct nfs4_state *state = calldata->state; int call_close = 0; + dprintk("%s: begin!\n", __func__); if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) return; @@ -2013,7 +2068,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) if (!call_close) { /* Note: exit _without_ calling nfs4_close_done */ task->tk_action = NULL; - return; + goto out; } if (calldata->arg.fmode == 0) { @@ -2022,17 +2077,20 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) pnfs_roc_drain(calldata->inode, &calldata->roc_barrier)) { rpc_sleep_on(&NFS_SERVER(calldata->inode)->roc_rpcwaitq, task, NULL); - return; + goto out; } } nfs_fattr_init(calldata->res.fattr); calldata->timestamp = jiffies; if (nfs4_setup_sequence(NFS_SERVER(calldata->inode), - &calldata->arg.seq_args, &calldata->res.seq_res, - 1, task)) - return; + &calldata->arg.seq_args, + &calldata->res.seq_res, + task)) + goto out; rpc_call_start(task); +out: + dprintk("%s: done!\n", __func__); } static const struct rpc_call_ops nfs4_close_ops = { @@ -2074,6 +2132,7 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc) calldata = kzalloc(sizeof(*calldata), gfp_mask); if (calldata == NULL) goto out; + nfs41_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 1); calldata->inode = state->inode; calldata->state = state; calldata->arg.fh = NFS_FH(state->inode); @@ -2182,6 +2241,7 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; server->acl_bitmask = res.acl_bitmask; + server->fh_expire_type = res.fh_expire_type; } return status; @@ -2303,7 +2363,6 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, return nfs4_map_errors(status); } -static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr); /* * Get locations and (maybe) other attributes of a referral. * Note that we'll actually follow the referral later when @@ -2420,6 +2479,10 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, } } + /* Deal with open(O_TRUNC) */ + if (sattr->ia_valid & ATTR_OPEN) + sattr->ia_valid &= ~(ATTR_MTIME|ATTR_CTIME|ATTR_OPEN); + status = nfs4_do_setattr(inode, cred, fattr, sattr, state); if (status == 0) nfs_setattr_update_inode(inode, sattr); @@ -2494,7 +2557,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry struct nfs_server *server = NFS_SERVER(inode); struct nfs4_accessargs args = { .fh = NFS_FH(inode), - .bitmask = server->attr_bitmask, + .bitmask = server->cache_consistency_bitmask, }; struct nfs4_accessres res = { .server = server, @@ -2712,8 +2775,18 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) args->bitmask = server->cache_consistency_bitmask; res->server = server; - res->seq_res.sr_slot = NULL; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; + nfs41_init_sequence(&args->seq_args, &res->seq_res, 1); +} + +static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) +{ + if (nfs4_setup_sequence(NFS_SERVER(data->dir), + &data->args.seq_args, + &data->res.seq_res, + task)) + return; + rpc_call_start(task); } static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) @@ -2738,6 +2811,17 @@ static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir) msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME]; arg->bitmask = server->attr_bitmask; res->server = server; + nfs41_init_sequence(&arg->seq_args, &res->seq_res, 1); +} + +static void nfs4_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data) +{ + if (nfs4_setup_sequence(NFS_SERVER(data->old_dir), + &data->args.seq_args, + &data->res.seq_res, + task)) + return; + rpc_call_start(task); } static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, @@ -3232,6 +3316,17 @@ static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message data->timestamp = jiffies; data->read_done_cb = nfs4_read_done_cb; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; + nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); +} + +static void nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) +{ + if (nfs4_setup_sequence(NFS_SERVER(data->inode), + &data->args.seq_args, + &data->res.seq_res, + task)) + return; + rpc_call_start(task); } /* Reset the the nfs_read_data to send the read to the MDS. */ @@ -3305,6 +3400,17 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag data->timestamp = jiffies; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE]; + nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); +} + +static void nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) +{ + if (nfs4_setup_sequence(NFS_SERVER(data->inode), + &data->args.seq_args, + &data->res.seq_res, + task)) + return; + rpc_call_start(task); } static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_write_data *data) @@ -3339,6 +3445,7 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa data->write_done_cb = nfs4_commit_done_cb; data->res.server = server; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; + nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); } struct nfs4_renewdata { @@ -3714,8 +3821,11 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, if (task->tk_status >= 0) return 0; switch(task->tk_status) { + case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: + if (state != NULL) + nfs_remove_bad_delegation(state->inode); case -NFS4ERR_OPENMODE: if (state == NULL) break; @@ -3764,6 +3874,16 @@ wait_on_recovery: return -EAGAIN; } +static void nfs4_construct_boot_verifier(struct nfs_client *clp, + nfs4_verifier *bootverf) +{ + __be32 verf[2]; + + verf[0] = htonl((u32)clp->cl_boot_time.tv_sec); + verf[1] = htonl((u32)clp->cl_boot_time.tv_nsec); + memcpy(bootverf->data, verf, sizeof(bootverf->data)); +} + int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred, struct nfs4_setclientid_res *res) @@ -3780,15 +3900,13 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, .rpc_resp = res, .rpc_cred = cred, }; - __be32 *p; int loop = 0; int status; - p = (__be32*)sc_verifier.data; - *p++ = htonl((u32)clp->cl_boot_time.tv_sec); - *p = htonl((u32)clp->cl_boot_time.tv_nsec); + nfs4_construct_boot_verifier(clp, &sc_verifier); for(;;) { + rcu_read_lock(); setclientid.sc_name_len = scnprintf(setclientid.sc_name, sizeof(setclientid.sc_name), "%s/%s %s %s %u", clp->cl_ipaddr, @@ -3805,6 +3923,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr, sizeof(setclientid.sc_uaddr), "%s.%u.%u", clp->cl_ipaddr, port >> 8, port & 255); + rcu_read_unlock(); status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); if (status != -NFS4ERR_CLID_INUSE) @@ -3891,7 +4010,7 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) if (nfs4_setup_sequence(d_data->res.server, &d_data->args.seq_args, - &d_data->res.seq_res, 1, task)) + &d_data->res.seq_res, task)) return; rpc_call_start(task); } @@ -3925,11 +4044,12 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co data = kzalloc(sizeof(*data), GFP_NOFS); if (data == NULL) return -ENOMEM; + nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); data->args.fhandle = &data->fh; data->args.stateid = &data->stateid; data->args.bitmask = server->attr_bitmask; nfs_copy_fh(&data->fh, NFS_FH(inode)); - memcpy(&data->stateid, stateid, sizeof(data->stateid)); + nfs4_stateid_copy(&data->stateid, stateid); data->res.fattr = &data->fattr; data->res.server = server; nfs_fattr_init(data->res.fattr); @@ -4016,7 +4136,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock if (status != 0) goto out; lsp = request->fl_u.nfs4_fl.owner; - arg.lock_owner.id = lsp->ls_id.id; + arg.lock_owner.id = lsp->ls_seqid.owner_id; arg.lock_owner.s_dev = server->s_dev; status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); switch (status) { @@ -4112,9 +4232,8 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) return; switch (task->tk_status) { case 0: - memcpy(calldata->lsp->ls_stateid.data, - calldata->res.stateid.data, - sizeof(calldata->lsp->ls_stateid.data)); + nfs4_stateid_copy(&calldata->lsp->ls_stateid, + &calldata->res.stateid); renew_lease(calldata->server, calldata->timestamp); break; case -NFS4ERR_BAD_STATEID: @@ -4142,7 +4261,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) calldata->timestamp = jiffies; if (nfs4_setup_sequence(calldata->server, &calldata->arg.seq_args, - &calldata->res.seq_res, 1, task)) + &calldata->res.seq_res, task)) return; rpc_call_start(task); } @@ -4182,6 +4301,7 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, return ERR_PTR(-ENOMEM); } + nfs41_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1); msg.rpc_argp = &data->arg; msg.rpc_resp = &data->res; task_setup_data.callback_data = data; @@ -4261,7 +4381,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, goto out_free_seqid; p->arg.lock_stateid = &lsp->ls_stateid; p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; - p->arg.lock_owner.id = lsp->ls_id.id; + p->arg.lock_owner.id = lsp->ls_seqid.owner_id; p->arg.lock_owner.s_dev = server->s_dev; p->res.lock_seqid = p->arg.lock_seqid; p->lsp = lsp; @@ -4297,7 +4417,7 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) data->timestamp = jiffies; if (nfs4_setup_sequence(data->server, &data->arg.seq_args, - &data->res.seq_res, 1, task)) + &data->res.seq_res, task)) return; rpc_call_start(task); dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status); @@ -4326,8 +4446,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) goto out; } if (data->rpc_status == 0) { - memcpy(data->lsp->ls_stateid.data, data->res.stateid.data, - sizeof(data->lsp->ls_stateid.data)); + nfs4_stateid_copy(&data->lsp->ls_stateid, &data->res.stateid); data->lsp->ls_flags |= NFS_LOCK_INITIALIZED; renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), data->timestamp); } @@ -4415,6 +4534,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f data->arg.reclaim = NFS_LOCK_RECLAIM; task_setup_data.callback_ops = &nfs4_recover_lock_ops; } + nfs41_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1); msg.rpc_argp = &data->arg; msg.rpc_resp = &data->res; task_setup_data.callback_data = data; @@ -4479,15 +4599,34 @@ out: } #if defined(CONFIG_NFS_V4_1) -static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *request) +static int nfs41_check_expired_locks(struct nfs4_state *state) { - int status; + int status, ret = NFS_OK; + struct nfs4_lock_state *lsp; struct nfs_server *server = NFS_SERVER(state->inode); - status = nfs41_test_stateid(server, state); + list_for_each_entry(lsp, &state->lock_states, ls_locks) { + if (lsp->ls_flags & NFS_LOCK_INITIALIZED) { + status = nfs41_test_stateid(server, &lsp->ls_stateid); + if (status != NFS_OK) { + nfs41_free_stateid(server, &lsp->ls_stateid); + lsp->ls_flags &= ~NFS_LOCK_INITIALIZED; + ret = status; + } + } + }; + + return ret; +} + +static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *request) +{ + int status = NFS_OK; + + if (test_bit(LK_STATE_IN_USE, &state->flags)) + status = nfs41_check_expired_locks(state); if (status == NFS_OK) - return 0; - nfs41_free_stateid(server, state); + return status; return nfs4_lock_expired(state, request); } #endif @@ -4523,7 +4662,8 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock /* Note: we always want to sleep here! */ request->fl_flags = fl_flags | FL_SLEEP; if (do_vfs_lock(request->fl_file, request) < 0) - printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __func__); + printk(KERN_WARNING "NFS: %s: VFS is out of sync with lock " + "manager!\n", __func__); out_unlock: up_read(&nfsi->rwsem); out: @@ -4533,7 +4673,9 @@ out: static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .state = state, + }; int err; do { @@ -4603,8 +4745,8 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW); switch (err) { default: - printk(KERN_ERR "%s: unhandled error %d.\n", - __func__, err); + printk(KERN_ERR "NFS: %s: unhandled error " + "%d.\n", __func__, err); case 0: case -ESTALE: goto out; @@ -4626,6 +4768,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) * The show must go on: exit, but mark the * stateid as needing recovery. */ + case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: case -NFS4ERR_OPENMODE: @@ -4655,33 +4798,44 @@ out: return err; } +struct nfs_release_lockowner_data { + struct nfs4_lock_state *lsp; + struct nfs_server *server; + struct nfs_release_lockowner_args args; +}; + static void nfs4_release_lockowner_release(void *calldata) { + struct nfs_release_lockowner_data *data = calldata; + nfs4_free_lock_state(data->server, data->lsp); kfree(calldata); } -const struct rpc_call_ops nfs4_release_lockowner_ops = { +static const struct rpc_call_ops nfs4_release_lockowner_ops = { .rpc_release = nfs4_release_lockowner_release, }; -void nfs4_release_lockowner(const struct nfs4_lock_state *lsp) +int nfs4_release_lockowner(struct nfs4_lock_state *lsp) { struct nfs_server *server = lsp->ls_state->owner->so_server; - struct nfs_release_lockowner_args *args; + struct nfs_release_lockowner_data *data; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER], }; if (server->nfs_client->cl_mvops->minor_version != 0) - return; - args = kmalloc(sizeof(*args), GFP_NOFS); - if (!args) - return; - args->lock_owner.clientid = server->nfs_client->cl_clientid; - args->lock_owner.id = lsp->ls_id.id; - args->lock_owner.s_dev = server->s_dev; - msg.rpc_argp = args; - rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args); + return -EINVAL; + data = kmalloc(sizeof(*data), GFP_NOFS); + if (!data) + return -ENOMEM; + data->lsp = lsp; + data->server = server; + data->args.lock_owner.clientid = server->nfs_client->cl_clientid; + data->args.lock_owner.id = lsp->ls_seqid.owner_id; + data->args.lock_owner.s_dev = server->s_dev; + msg.rpc_argp = &data->args; + rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data); + return 0; } #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" @@ -4727,11 +4881,11 @@ static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr) if (!(((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) || (fattr->valid & NFS_ATTR_FATTR_FILEID)) && (fattr->valid & NFS_ATTR_FATTR_FSID) && - (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL))) + (fattr->valid & NFS_ATTR_FATTR_V4_LOCATIONS))) return; fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE | - NFS_ATTR_FATTR_NLINK; + NFS_ATTR_FATTR_NLINK | NFS_ATTR_FATTR_V4_REFERRAL; fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO; fattr->nlink = 2; } @@ -4798,7 +4952,8 @@ static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct return status; } -int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors) +static int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, + struct nfs4_secinfo_flavors *flavors) { struct nfs4_exception exception = { }; int err; @@ -4852,6 +5007,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) { nfs4_verifier verifier; struct nfs41_exchange_id_args args = { + .verifier = &verifier, .client = clp, .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER, }; @@ -4865,15 +5021,11 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) .rpc_resp = &res, .rpc_cred = cred, }; - __be32 *p; dprintk("--> %s\n", __func__); BUG_ON(clp == NULL); - p = (u32 *)verifier.data; - *p++ = htonl((u32)clp->cl_boot_time.tv_sec); - *p = htonl((u32)clp->cl_boot_time.tv_nsec); - args.verifier = &verifier; + nfs4_construct_boot_verifier(clp, &verifier); args.id_len = scnprintf(args.id, sizeof(args.id), "%s/%s.%s/%u", @@ -4888,11 +5040,24 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) goto out; } + res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_KERNEL); + if (unlikely(!res.impl_id)) { + status = -ENOMEM; + goto out_server_scope; + } + status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); if (!status) status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags); if (!status) { + /* use the most recent implementation id */ + kfree(clp->impl_id); + clp->impl_id = res.impl_id; + } else + kfree(res.impl_id); + + if (!status) { if (clp->server_scope && !nfs41_same_server_scope(clp->server_scope, res.server_scope)) { @@ -4908,8 +5073,16 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) goto out; } } + +out_server_scope: kfree(res.server_scope); out: + if (clp->impl_id) + dprintk("%s: Server Implementation ID: " + "domain: %s, name: %s, date: %llu,%u\n", + __func__, clp->impl_id->domain, clp->impl_id->name, + clp->impl_id->date.seconds, + clp->impl_id->date.nseconds); dprintk("<-- %s status= %d\n", __func__, status); return status; } @@ -4933,7 +5106,7 @@ static void nfs4_get_lease_time_prepare(struct rpc_task *task, since we're invoked within one */ ret = nfs41_setup_sequence(data->clp->cl_session, &data->args->la_seq_args, - &data->res->lr_seq_res, 0, task); + &data->res->lr_seq_res, task); BUG_ON(ret == -EAGAIN); rpc_call_start(task); @@ -4966,7 +5139,7 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata) dprintk("<-- %s\n", __func__); } -struct rpc_call_ops nfs4_get_lease_time_ops = { +static const struct rpc_call_ops nfs4_get_lease_time_ops = { .rpc_call_prepare = nfs4_get_lease_time_prepare, .rpc_call_done = nfs4_get_lease_time_done, }; @@ -4997,6 +5170,7 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) }; int status; + nfs41_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0); dprintk("--> %s\n", __func__); task = rpc_run_task(&task_setup); @@ -5113,13 +5287,13 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) return NULL; tbl = &session->fc_slot_table; - tbl->highest_used_slotid = -1; + tbl->highest_used_slotid = NFS4_NO_SLOT; spin_lock_init(&tbl->slot_tbl_lock); rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table"); init_completion(&tbl->complete); tbl = &session->bc_slot_table; - tbl->highest_used_slotid = -1; + tbl->highest_used_slotid = NFS4_NO_SLOT; spin_lock_init(&tbl->slot_tbl_lock); rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); init_completion(&tbl->complete); @@ -5132,11 +5306,16 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) void nfs4_destroy_session(struct nfs4_session *session) { + struct rpc_xprt *xprt; + nfs4_proc_destroy_session(session); + + rcu_read_lock(); + xprt = rcu_dereference(session->clp->cl_rpcclient->cl_xprt); + rcu_read_unlock(); dprintk("%s Destroy backchannel for xprt %p\n", - __func__, session->clp->cl_rpcclient->cl_xprt); - xprt_destroy_backchannel(session->clp->cl_rpcclient->cl_xprt, - NFS41_BC_MIN_CALLBACKS); + __func__, xprt); + xprt_destroy_backchannel(xprt, NFS41_BC_MIN_CALLBACKS); nfs4_destroy_slot_tables(session); kfree(session); } @@ -5164,7 +5343,7 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) args->fc_attrs.max_rqst_sz = mxrqst_sz; args->fc_attrs.max_resp_sz = mxresp_sz; args->fc_attrs.max_ops = NFS4_MAX_OPS; - args->fc_attrs.max_reqs = session->clp->cl_rpcclient->cl_xprt->max_reqs; + args->fc_attrs.max_reqs = max_session_slots; dprintk("%s: Fore Channel : max_rqst_sz=%u max_resp_sz=%u " "max_ops=%u max_reqs=%u\n", @@ -5204,6 +5383,8 @@ static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args return -EINVAL; if (rcvd->max_reqs == 0) return -EINVAL; + if (rcvd->max_reqs > NFS4_MAX_SLOT_TABLE) + rcvd->max_reqs = NFS4_MAX_SLOT_TABLE; return 0; } @@ -5219,9 +5400,9 @@ static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached) return -EINVAL; /* These would render the backchannel useless: */ - if (rcvd->max_ops == 0) + if (rcvd->max_ops != sent->max_ops) return -EINVAL; - if (rcvd->max_reqs == 0) + if (rcvd->max_reqs != sent->max_reqs) return -EINVAL; return 0; } @@ -5324,7 +5505,7 @@ int nfs4_proc_destroy_session(struct nfs4_session *session) if (status) printk(KERN_WARNING - "Got error %d from the server on DESTROY_SESSION. " + "NFS: Got error %d from the server on DESTROY_SESSION. " "Session has been destroyed regardless...\n", status); dprintk("<-- nfs4_proc_destroy_session\n"); @@ -5447,7 +5628,7 @@ static void nfs41_sequence_prepare(struct rpc_task *task, void *data) args = task->tk_msg.rpc_argp; res = task->tk_msg.rpc_resp; - if (nfs41_setup_sequence(clp->cl_session, args, res, 0, task)) + if (nfs41_setup_sequence(clp->cl_session, args, res, task)) return; rpc_call_start(task); } @@ -5479,6 +5660,7 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_ nfs_put_client(clp); return ERR_PTR(-ENOMEM); } + nfs41_init_sequence(&calldata->args, &calldata->res, 0); msg.rpc_argp = &calldata->args; msg.rpc_resp = &calldata->res; calldata->clp = clp; @@ -5540,7 +5722,7 @@ static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data) rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); if (nfs41_setup_sequence(calldata->clp->cl_session, &calldata->arg.seq_args, - &calldata->res.seq_res, 0, task)) + &calldata->res.seq_res, task)) return; rpc_call_start(task); @@ -5619,6 +5801,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp) calldata->clp = clp; calldata->arg.one_fs = 0; + nfs41_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 0); msg.rpc_argp = &calldata->arg; msg.rpc_resp = &calldata->res; task_setup_data.callback_data = calldata; @@ -5650,7 +5833,7 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) * to be no way to prevent it completely. */ if (nfs4_setup_sequence(server, &lgp->args.seq_args, - &lgp->res.seq_res, 0, task)) + &lgp->res.seq_res, task)) return; if (pnfs_choose_layoutget_stateid(&lgp->args.stateid, NFS_I(lgp->args.inode)->layout, @@ -5725,6 +5908,7 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp) lgp->res.layoutp = &lgp->args.layout; lgp->res.seq_res.sr_slot = NULL; + nfs41_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0); task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); @@ -5745,7 +5929,7 @@ nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata) dprintk("--> %s\n", __func__); if (nfs41_setup_sequence(lrp->clp->cl_session, &lrp->args.seq_args, - &lrp->res.seq_res, 0, task)) + &lrp->res.seq_res, task)) return; rpc_call_start(task); } @@ -5811,6 +5995,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp) int status; dprintk("--> %s\n", __func__); + nfs41_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1); task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); @@ -5911,7 +6096,7 @@ static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *calldata) struct nfs_server *server = NFS_SERVER(data->args.inode); if (nfs4_setup_sequence(server, &data->args.seq_args, - &data->res.seq_res, 1, task)) + &data->res.seq_res, task)) return; rpc_call_start(task); } @@ -5998,6 +6183,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) data->args.lastbytewritten, data->args.inode->i_ino); + nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); @@ -6091,11 +6277,12 @@ out_freepage: out: return err; } -static int _nfs41_test_stateid(struct nfs_server *server, struct nfs4_state *state) + +static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid) { int status; struct nfs41_test_stateid_args args = { - .stateid = &state->stateid, + .stateid = stateid, }; struct nfs41_test_stateid_res res; struct rpc_message msg = { @@ -6103,28 +6290,31 @@ static int _nfs41_test_stateid(struct nfs_server *server, struct nfs4_state *sta .rpc_argp = &args, .rpc_resp = &res, }; - args.seq_args.sa_session = res.seq_res.sr_session = NULL; - status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 0, 1); + + nfs41_init_sequence(&args.seq_args, &res.seq_res, 0); + status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 1); + + if (status == NFS_OK) + return res.status; return status; } -static int nfs41_test_stateid(struct nfs_server *server, struct nfs4_state *state) +static int nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid) { struct nfs4_exception exception = { }; int err; do { err = nfs4_handle_exception(server, - _nfs41_test_stateid(server, state), + _nfs41_test_stateid(server, stateid), &exception); } while (exception.retry); return err; } -static int _nfs4_free_stateid(struct nfs_server *server, struct nfs4_state *state) +static int _nfs4_free_stateid(struct nfs_server *server, nfs4_stateid *stateid) { - int status; struct nfs41_free_stateid_args args = { - .stateid = &state->stateid, + .stateid = stateid, }; struct nfs41_free_stateid_res res; struct rpc_message msg = { @@ -6133,25 +6323,46 @@ static int _nfs4_free_stateid(struct nfs_server *server, struct nfs4_state *stat .rpc_resp = &res, }; - args.seq_args.sa_session = res.seq_res.sr_session = NULL; - status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 0, 1); - return status; + nfs41_init_sequence(&args.seq_args, &res.seq_res, 0); + return nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 1); } -static int nfs41_free_stateid(struct nfs_server *server, struct nfs4_state *state) +static int nfs41_free_stateid(struct nfs_server *server, nfs4_stateid *stateid) { struct nfs4_exception exception = { }; int err; do { err = nfs4_handle_exception(server, - _nfs4_free_stateid(server, state), + _nfs4_free_stateid(server, stateid), &exception); } while (exception.retry); return err; } + +static bool nfs41_match_stateid(const nfs4_stateid *s1, + const nfs4_stateid *s2) +{ + if (memcmp(s1->other, s2->other, sizeof(s1->other)) != 0) + return false; + + if (s1->seqid == s2->seqid) + return true; + if (s1->seqid == 0 || s2->seqid == 0) + return true; + + return false; +} + #endif /* CONFIG_NFS_V4_1 */ -struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { +static bool nfs4_match_stateid(const nfs4_stateid *s1, + const nfs4_stateid *s2) +{ + return nfs4_stateid_match(s1, s2); +} + + +static const struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { .owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT, .state_flag_bit = NFS_STATE_RECLAIM_REBOOT, .recover_open = nfs4_open_reclaim, @@ -6161,7 +6372,7 @@ struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { }; #if defined(CONFIG_NFS_V4_1) -struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = { +static const struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = { .owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT, .state_flag_bit = NFS_STATE_RECLAIM_REBOOT, .recover_open = nfs4_open_reclaim, @@ -6172,7 +6383,7 @@ struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = { }; #endif /* CONFIG_NFS_V4_1 */ -struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = { +static const struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = { .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE, .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE, .recover_open = nfs4_open_expired, @@ -6182,7 +6393,7 @@ struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = { }; #if defined(CONFIG_NFS_V4_1) -struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = { +static const struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = { .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE, .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE, .recover_open = nfs41_open_expired, @@ -6192,14 +6403,14 @@ struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = { }; #endif /* CONFIG_NFS_V4_1 */ -struct nfs4_state_maintenance_ops nfs40_state_renewal_ops = { +static const struct nfs4_state_maintenance_ops nfs40_state_renewal_ops = { .sched_state_renewal = nfs4_proc_async_renew, .get_state_renewal_cred_locked = nfs4_get_renew_cred_locked, .renew_lease = nfs4_proc_renew, }; #if defined(CONFIG_NFS_V4_1) -struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = { +static const struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = { .sched_state_renewal = nfs41_proc_async_sequence, .get_state_renewal_cred_locked = nfs4_get_machine_cred_locked, .renew_lease = nfs4_proc_sequence, @@ -6209,7 +6420,7 @@ struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = { static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { .minor_version = 0, .call_sync = _nfs4_call_sync, - .validate_stateid = nfs4_validate_delegation_stateid, + .match_stateid = nfs4_match_stateid, .find_root_sec = nfs4_find_root_sec, .reboot_recovery_ops = &nfs40_reboot_recovery_ops, .nograce_recovery_ops = &nfs40_nograce_recovery_ops, @@ -6220,7 +6431,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { .minor_version = 1, .call_sync = _nfs4_call_sync_session, - .validate_stateid = nfs41_validate_delegation_stateid, + .match_stateid = nfs41_match_stateid, .find_root_sec = nfs41_find_root_sec, .reboot_recovery_ops = &nfs41_reboot_recovery_ops, .nograce_recovery_ops = &nfs41_nograce_recovery_ops, @@ -6260,9 +6471,11 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .create = nfs4_proc_create, .remove = nfs4_proc_remove, .unlink_setup = nfs4_proc_unlink_setup, + .unlink_rpc_prepare = nfs4_proc_unlink_rpc_prepare, .unlink_done = nfs4_proc_unlink_done, .rename = nfs4_proc_rename, .rename_setup = nfs4_proc_rename_setup, + .rename_rpc_prepare = nfs4_proc_rename_rpc_prepare, .rename_done = nfs4_proc_rename_done, .link = nfs4_proc_link, .symlink = nfs4_proc_symlink, @@ -6276,8 +6489,10 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .set_capabilities = nfs4_server_capabilities, .decode_dirent = nfs4_decode_dirent, .read_setup = nfs4_proc_read_setup, + .read_rpc_prepare = nfs4_proc_read_rpc_prepare, .read_done = nfs4_read_done, .write_setup = nfs4_proc_write_setup, + .write_rpc_prepare = nfs4_proc_write_rpc_prepare, .write_done = nfs4_write_done, .commit_setup = nfs4_proc_commit_setup, .commit_done = nfs4_commit_done, @@ -6301,6 +6516,10 @@ const struct xattr_handler *nfs4_xattr_handlers[] = { NULL }; +module_param(max_session_slots, ushort, 0644); +MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 " + "requests the client will negotiate"); + /* * Local variables: * c-basic-offset: 8 diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 4539203..0f43414 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -146,6 +146,11 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp) struct rpc_cred *cred = NULL; struct nfs_server *server; + /* Use machine credentials if available */ + cred = nfs4_get_machine_cred_locked(clp); + if (cred != NULL) + goto out; + rcu_read_lock(); list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { cred = nfs4_get_renew_cred_server_locked(server); @@ -153,6 +158,8 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp) break; } rcu_read_unlock(); + +out: return cred; } @@ -190,30 +197,29 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp) static void nfs4_end_drain_session(struct nfs_client *clp) { struct nfs4_session *ses = clp->cl_session; + struct nfs4_slot_table *tbl; int max_slots; if (ses == NULL) return; + tbl = &ses->fc_slot_table; if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { - spin_lock(&ses->fc_slot_table.slot_tbl_lock); - max_slots = ses->fc_slot_table.max_slots; + spin_lock(&tbl->slot_tbl_lock); + max_slots = tbl->max_slots; while (max_slots--) { - struct rpc_task *task; - - task = rpc_wake_up_next(&ses->fc_slot_table. - slot_tbl_waitq); - if (!task) + if (rpc_wake_up_first(&tbl->slot_tbl_waitq, + nfs4_set_task_privileged, + NULL) == NULL) break; - rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); } - spin_unlock(&ses->fc_slot_table.slot_tbl_lock); + spin_unlock(&tbl->slot_tbl_lock); } } static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl) { spin_lock(&tbl->slot_tbl_lock); - if (tbl->highest_used_slotid != -1) { + if (tbl->highest_used_slotid != NFS4_NO_SLOT) { INIT_COMPLETION(tbl->complete); spin_unlock(&tbl->slot_tbl_lock); return wait_for_completion_interruptible(&tbl->complete); @@ -317,62 +323,6 @@ out: return cred; } -static void nfs_alloc_unique_id_locked(struct rb_root *root, - struct nfs_unique_id *new, - __u64 minval, int maxbits) -{ - struct rb_node **p, *parent; - struct nfs_unique_id *pos; - __u64 mask = ~0ULL; - - if (maxbits < 64) - mask = (1ULL << maxbits) - 1ULL; - - /* Ensure distribution is more or less flat */ - get_random_bytes(&new->id, sizeof(new->id)); - new->id &= mask; - if (new->id < minval) - new->id += minval; -retry: - p = &root->rb_node; - parent = NULL; - - while (*p != NULL) { - parent = *p; - pos = rb_entry(parent, struct nfs_unique_id, rb_node); - - if (new->id < pos->id) - p = &(*p)->rb_left; - else if (new->id > pos->id) - p = &(*p)->rb_right; - else - goto id_exists; - } - rb_link_node(&new->rb_node, parent, p); - rb_insert_color(&new->rb_node, root); - return; -id_exists: - for (;;) { - new->id++; - if (new->id < minval || (new->id & mask) != new->id) { - new->id = minval; - break; - } - parent = rb_next(parent); - if (parent == NULL) - break; - pos = rb_entry(parent, struct nfs_unique_id, rb_node); - if (new->id < pos->id) - break; - } - goto retry; -} - -static void nfs_free_unique_id(struct rb_root *root, struct nfs_unique_id *id) -{ - rb_erase(&id->rb_node, root); -} - static struct nfs4_state_owner * nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred) { @@ -405,6 +355,7 @@ nfs4_insert_state_owner_locked(struct nfs4_state_owner *new) struct rb_node **p = &server->state_owners.rb_node, *parent = NULL; struct nfs4_state_owner *sp; + int err; while (*p != NULL) { parent = *p; @@ -421,8 +372,9 @@ nfs4_insert_state_owner_locked(struct nfs4_state_owner *new) return sp; } } - nfs_alloc_unique_id_locked(&server->openowner_id, - &new->so_owner_id, 1, 64); + err = ida_get_new(&server->openowner_id, &new->so_seqid.owner_id); + if (err) + return ERR_PTR(err); rb_link_node(&new->so_server_node, parent, p); rb_insert_color(&new->so_server_node, &server->state_owners); return new; @@ -435,7 +387,23 @@ nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp) if (!RB_EMPTY_NODE(&sp->so_server_node)) rb_erase(&sp->so_server_node, &server->state_owners); - nfs_free_unique_id(&server->openowner_id, &sp->so_owner_id); + ida_remove(&server->openowner_id, sp->so_seqid.owner_id); +} + +static void +nfs4_init_seqid_counter(struct nfs_seqid_counter *sc) +{ + sc->flags = 0; + sc->counter = 0; + spin_lock_init(&sc->lock); + INIT_LIST_HEAD(&sc->list); + rpc_init_wait_queue(&sc->wait, "Seqid_waitqueue"); +} + +static void +nfs4_destroy_seqid_counter(struct nfs_seqid_counter *sc) +{ + rpc_destroy_wait_queue(&sc->wait); } /* @@ -444,19 +412,20 @@ nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp) * */ static struct nfs4_state_owner * -nfs4_alloc_state_owner(void) +nfs4_alloc_state_owner(struct nfs_server *server, + struct rpc_cred *cred, + gfp_t gfp_flags) { struct nfs4_state_owner *sp; - sp = kzalloc(sizeof(*sp),GFP_NOFS); + sp = kzalloc(sizeof(*sp), gfp_flags); if (!sp) return NULL; + sp->so_server = server; + sp->so_cred = get_rpccred(cred); spin_lock_init(&sp->so_lock); INIT_LIST_HEAD(&sp->so_states); - rpc_init_wait_queue(&sp->so_sequence.wait, "Seqid_waitqueue"); - sp->so_seqid.sequence = &sp->so_sequence; - spin_lock_init(&sp->so_sequence.lock); - INIT_LIST_HEAD(&sp->so_sequence.list); + nfs4_init_seqid_counter(&sp->so_seqid); atomic_set(&sp->so_count, 1); INIT_LIST_HEAD(&sp->so_lru); return sp; @@ -478,7 +447,7 @@ nfs4_drop_state_owner(struct nfs4_state_owner *sp) static void nfs4_free_state_owner(struct nfs4_state_owner *sp) { - rpc_destroy_wait_queue(&sp->so_sequence.wait); + nfs4_destroy_seqid_counter(&sp->so_seqid); put_rpccred(sp->so_cred); kfree(sp); } @@ -516,7 +485,8 @@ static void nfs4_gc_state_owners(struct nfs_server *server) * Returns a pointer to an instantiated nfs4_state_owner struct, or NULL. */ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, - struct rpc_cred *cred) + struct rpc_cred *cred, + gfp_t gfp_flags) { struct nfs_client *clp = server->nfs_client; struct nfs4_state_owner *sp, *new; @@ -526,20 +496,18 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, spin_unlock(&clp->cl_lock); if (sp != NULL) goto out; - new = nfs4_alloc_state_owner(); + new = nfs4_alloc_state_owner(server, cred, gfp_flags); if (new == NULL) goto out; - new->so_server = server; - new->so_cred = cred; - spin_lock(&clp->cl_lock); - sp = nfs4_insert_state_owner_locked(new); - spin_unlock(&clp->cl_lock); - if (sp == new) - get_rpccred(cred); - else { - rpc_destroy_wait_queue(&new->so_sequence.wait); - kfree(new); - } + do { + if (ida_pre_get(&server->openowner_id, gfp_flags) == 0) + break; + spin_lock(&clp->cl_lock); + sp = nfs4_insert_state_owner_locked(new); + spin_unlock(&clp->cl_lock); + } while (sp == ERR_PTR(-EAGAIN)); + if (sp != new) + nfs4_free_state_owner(new); out: nfs4_gc_state_owners(server); return sp; @@ -795,15 +763,11 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f { struct nfs4_lock_state *lsp; struct nfs_server *server = state->owner->so_server; - struct nfs_client *clp = server->nfs_client; lsp = kzalloc(sizeof(*lsp), GFP_NOFS); if (lsp == NULL) return NULL; - rpc_init_wait_queue(&lsp->ls_sequence.wait, "lock_seqid_waitqueue"); - spin_lock_init(&lsp->ls_sequence.lock); - INIT_LIST_HEAD(&lsp->ls_sequence.list); - lsp->ls_seqid.sequence = &lsp->ls_sequence; + nfs4_init_seqid_counter(&lsp->ls_seqid); atomic_set(&lsp->ls_count, 1); lsp->ls_state = state; lsp->ls_owner.lo_type = type; @@ -815,25 +779,22 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f lsp->ls_owner.lo_u.posix_owner = fl_owner; break; default: - kfree(lsp); - return NULL; + goto out_free; } - spin_lock(&clp->cl_lock); - nfs_alloc_unique_id_locked(&server->lockowner_id, &lsp->ls_id, 1, 64); - spin_unlock(&clp->cl_lock); + lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, 0, 0, GFP_NOFS); + if (lsp->ls_seqid.owner_id < 0) + goto out_free; INIT_LIST_HEAD(&lsp->ls_locks); return lsp; +out_free: + kfree(lsp); + return NULL; } -static void nfs4_free_lock_state(struct nfs4_lock_state *lsp) +void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) { - struct nfs_server *server = lsp->ls_state->owner->so_server; - struct nfs_client *clp = server->nfs_client; - - spin_lock(&clp->cl_lock); - nfs_free_unique_id(&server->lockowner_id, &lsp->ls_id); - spin_unlock(&clp->cl_lock); - rpc_destroy_wait_queue(&lsp->ls_sequence.wait); + ida_simple_remove(&server->lockowner_id, lsp->ls_seqid.owner_id); + nfs4_destroy_seqid_counter(&lsp->ls_seqid); kfree(lsp); } @@ -865,7 +826,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_ } spin_unlock(&state->state_lock); if (new != NULL) - nfs4_free_lock_state(new); + nfs4_free_lock_state(state->owner->so_server, new); return lsp; } @@ -886,9 +847,11 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp) if (list_empty(&state->lock_states)) clear_bit(LK_STATE_IN_USE, &state->flags); spin_unlock(&state->state_lock); - if (lsp->ls_flags & NFS_LOCK_INITIALIZED) - nfs4_release_lockowner(lsp); - nfs4_free_lock_state(lsp); + if (lsp->ls_flags & NFS_LOCK_INITIALIZED) { + if (nfs4_release_lockowner(lsp) == 0) + return; + } + nfs4_free_lock_state(lsp->ls_state->owner->so_server, lsp); } static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src) @@ -918,7 +881,8 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl) if (fl->fl_flags & FL_POSIX) lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE); else if (fl->fl_flags & FL_FLOCK) - lsp = nfs4_get_lock_state(state, 0, fl->fl_pid, NFS4_FLOCK_LOCK_TYPE); + lsp = nfs4_get_lock_state(state, NULL, fl->fl_pid, + NFS4_FLOCK_LOCK_TYPE); else return -EINVAL; if (lsp == NULL) @@ -928,28 +892,49 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl) return 0; } -/* - * Byte-range lock aware utility to initialize the stateid of read/write - * requests. - */ -void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid) +static bool nfs4_copy_lock_stateid(nfs4_stateid *dst, struct nfs4_state *state, + fl_owner_t fl_owner, pid_t fl_pid) { struct nfs4_lock_state *lsp; - int seq; + bool ret = false; - do { - seq = read_seqbegin(&state->seqlock); - memcpy(dst, &state->stateid, sizeof(*dst)); - } while (read_seqretry(&state->seqlock, seq)); if (test_bit(LK_STATE_IN_USE, &state->flags) == 0) - return; + goto out; spin_lock(&state->state_lock); lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE); - if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) - memcpy(dst, &lsp->ls_stateid, sizeof(*dst)); + if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) { + nfs4_stateid_copy(dst, &lsp->ls_stateid); + ret = true; + } spin_unlock(&state->state_lock); nfs4_put_lock_state(lsp); +out: + return ret; +} + +static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) +{ + int seq; + + do { + seq = read_seqbegin(&state->seqlock); + nfs4_stateid_copy(dst, &state->stateid); + } while (read_seqretry(&state->seqlock, seq)); +} + +/* + * Byte-range lock aware utility to initialize the stateid of read/write + * requests. + */ +void nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state, + fmode_t fmode, fl_owner_t fl_owner, pid_t fl_pid) +{ + if (nfs4_copy_delegation_stateid(dst, state->inode, fmode)) + return; + if (nfs4_copy_lock_stateid(dst, state, fl_owner, fl_pid)) + return; + nfs4_copy_open_stateid(dst, state); } struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask) @@ -960,20 +945,28 @@ struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_m if (new != NULL) { new->sequence = counter; INIT_LIST_HEAD(&new->list); + new->task = NULL; } return new; } void nfs_release_seqid(struct nfs_seqid *seqid) { - if (!list_empty(&seqid->list)) { - struct rpc_sequence *sequence = seqid->sequence->sequence; + struct nfs_seqid_counter *sequence; - spin_lock(&sequence->lock); - list_del_init(&seqid->list); - spin_unlock(&sequence->lock); - rpc_wake_up(&sequence->wait); + if (list_empty(&seqid->list)) + return; + sequence = seqid->sequence; + spin_lock(&sequence->lock); + list_del_init(&seqid->list); + if (!list_empty(&sequence->list)) { + struct nfs_seqid *next; + + next = list_first_entry(&sequence->list, + struct nfs_seqid, list); + rpc_wake_up_queued_task(&sequence->wait, next->task); } + spin_unlock(&sequence->lock); } void nfs_free_seqid(struct nfs_seqid *seqid) @@ -989,14 +982,14 @@ void nfs_free_seqid(struct nfs_seqid *seqid) */ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid) { - BUG_ON(list_first_entry(&seqid->sequence->sequence->list, struct nfs_seqid, list) != seqid); + BUG_ON(list_first_entry(&seqid->sequence->list, struct nfs_seqid, list) != seqid); switch (status) { case 0: break; case -NFS4ERR_BAD_SEQID: if (seqid->sequence->flags & NFS_SEQID_CONFIRMED) return; - printk(KERN_WARNING "NFS: v4 server returned a bad" + pr_warn_ratelimited("NFS: v4 server returned a bad" " sequence-id error on an" " unconfirmed sequence %p!\n", seqid->sequence); @@ -1040,10 +1033,11 @@ void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid) int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task) { - struct rpc_sequence *sequence = seqid->sequence->sequence; + struct nfs_seqid_counter *sequence = seqid->sequence; int status = 0; spin_lock(&sequence->lock); + seqid->task = task; if (list_empty(&seqid->list)) list_add_tail(&seqid->list, &sequence->list); if (list_first_entry(&sequence->list, struct nfs_seqid, list) == seqid) @@ -1072,19 +1066,28 @@ static void nfs4_clear_state_manager_bit(struct nfs_client *clp) void nfs4_schedule_state_manager(struct nfs_client *clp) { struct task_struct *task; + char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1]; if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) return; __module_get(THIS_MODULE); atomic_inc(&clp->cl_count); - task = kthread_run(nfs4_run_state_manager, clp, "%s-manager", - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR)); - if (!IS_ERR(task)) - return; - nfs4_clear_state_manager_bit(clp); - nfs_put_client(clp); - module_put(THIS_MODULE); + + /* The rcu_read_lock() is not strictly necessary, as the state + * manager is the only thread that ever changes the rpc_xprt + * after it's initialized. At this point, we're single threaded. */ + rcu_read_lock(); + snprintf(buf, sizeof(buf), "%s-manager", + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + rcu_read_unlock(); + task = kthread_run(nfs4_run_state_manager, clp, buf); + if (IS_ERR(task)) { + printk(KERN_ERR "%s: kthread_run: %ld\n", + __func__, PTR_ERR(task)); + nfs4_clear_state_manager_bit(clp); + nfs_put_client(clp); + module_put(THIS_MODULE); + } } /* @@ -1098,10 +1101,25 @@ void nfs4_schedule_lease_recovery(struct nfs_client *clp) set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); nfs4_schedule_state_manager(clp); } +EXPORT_SYMBOL_GPL(nfs4_schedule_lease_recovery); + +/* + * nfs40_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN + * @clp: client to process + * + * Set the NFS4CLNT_LEASE_EXPIRED state in order to force a + * resend of the SETCLIENTID and hence re-establish the + * callback channel. Then return all existing delegations. + */ +static void nfs40_handle_cb_pathdown(struct nfs_client *clp) +{ + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + nfs_expire_all_delegations(clp); +} void nfs4_schedule_path_down_recovery(struct nfs_client *clp) { - nfs_handle_cb_pathdown(clp); + nfs40_handle_cb_pathdown(clp); nfs4_schedule_state_manager(clp); } @@ -1132,11 +1150,37 @@ void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4 { struct nfs_client *clp = server->nfs_client; - if (test_and_clear_bit(NFS_DELEGATED_STATE, &state->flags)) - nfs_async_inode_return_delegation(state->inode, &state->stateid); nfs4_state_mark_reclaim_nograce(clp, state); nfs4_schedule_state_manager(clp); } +EXPORT_SYMBOL_GPL(nfs4_schedule_stateid_recovery); + +void nfs_inode_find_state_and_recover(struct inode *inode, + const nfs4_stateid *stateid) +{ + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_open_context *ctx; + struct nfs4_state *state; + bool found = false; + + spin_lock(&inode->i_lock); + list_for_each_entry(ctx, &nfsi->open_files, list) { + state = ctx->state; + if (state == NULL) + continue; + if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) + continue; + if (!nfs4_stateid_match(&state->stateid, stateid)) + continue; + nfs4_state_mark_reclaim_nograce(clp, state); + found = true; + } + spin_unlock(&inode->i_lock); + if (found) + nfs4_schedule_state_manager(clp); +} + static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops) { @@ -1175,8 +1219,8 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: goto out; default: - printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n", - __func__, status); + printk(KERN_ERR "NFS: %s: unhandled error %d. " + "Zeroing state\n", __func__, status); case -ENOMEM: case -NFS4ERR_DENIED: case -NFS4ERR_RECLAIM_BAD: @@ -1222,8 +1266,9 @@ restart: spin_lock(&state->state_lock); list_for_each_entry(lock, &state->lock_states, ls_locks) { if (!(lock->ls_flags & NFS_LOCK_INITIALIZED)) - printk("%s: Lock reclaim failed!\n", - __func__); + pr_warn_ratelimited("NFS: " + "%s: Lock reclaim " + "failed!\n", __func__); } spin_unlock(&state->state_lock); nfs4_put_open_state(state); @@ -1232,8 +1277,8 @@ restart: } switch (status) { default: - printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n", - __func__, status); + printk(KERN_ERR "NFS: %s: unhandled error %d. " + "Zeroing state\n", __func__, status); case -ENOENT: case -ENOMEM: case -ESTALE: @@ -1241,8 +1286,8 @@ restart: * Open state on this file cannot be recovered * All we can do is revert to using the zero stateid. */ - memset(state->stateid.data, 0, - sizeof(state->stateid.data)); + memset(&state->stateid, 0, + sizeof(state->stateid)); /* Mark the file as being 'closed' */ state->state = 0; break; @@ -1420,7 +1465,7 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) case 0: break; case -NFS4ERR_CB_PATH_DOWN: - nfs_handle_cb_pathdown(clp); + nfs40_handle_cb_pathdown(clp); break; case -NFS4ERR_NO_GRACE: nfs4_state_end_reclaim_reboot(clp); @@ -1801,7 +1846,7 @@ static void nfs4_state_manager(struct nfs_client *clp) } while (atomic_read(&clp->cl_count) > 1); return; out_error: - printk(KERN_WARNING "Error: state manager failed on NFSv4 server %s" + pr_warn_ratelimited("NFS: state manager failed on NFSv4 server %s" " with error %d\n", clp->cl_hostname, -status); nfs4_end_drain_session(clp); nfs4_clear_state_manager_bit(clp); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 33bd8d0..c74fdb1 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -44,6 +44,8 @@ #include <linux/pagemap.h> #include <linux/proc_fs.h> #include <linux/kdev_t.h> +#include <linux/module.h> +#include <linux/utsname.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/msg_prot.h> #include <linux/sunrpc/gss_api.h> @@ -271,7 +273,12 @@ static int nfs4_stat_to_errno(int); 1 /* flags */ + \ 1 /* spa_how */ + \ 0 /* SP4_NONE (for now) */ + \ - 1 /* zero implemetation id array */) + 1 /* implementation id array of size 1 */ + \ + 1 /* nii_domain */ + \ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ + 1 /* nii_name */ + \ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ + 3 /* nii_date */) #define decode_exchange_id_maxsz (op_decode_hdr_maxsz + \ 2 /* eir_clientid */ + \ 1 /* eir_sequenceid */ + \ @@ -284,7 +291,11 @@ static int nfs4_stat_to_errno(int); /* eir_server_scope<> */ \ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \ 1 /* eir_server_impl_id array length */ + \ - 0 /* ignored eir_server_impl_id contents */) + 1 /* nii_domain */ + \ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ + 1 /* nii_name */ + \ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ + 3 /* nii_date */) #define encode_channel_attrs_maxsz (6 + 1 /* ca_rdma_ird.len (0) */) #define decode_channel_attrs_maxsz (6 + \ 1 /* ca_rdma_ird.len */ + \ @@ -838,6 +849,12 @@ const u32 nfs41_maxread_overhead = ((RPC_MAX_HEADER_WITH_AUTH + XDR_UNIT); #endif /* CONFIG_NFS_V4_1 */ +static unsigned short send_implementation_id = 1; + +module_param(send_implementation_id, ushort, 0644); +MODULE_PARM_DESC(send_implementation_id, + "Send implementation ID with NFSv4.1 exchange_id"); + static const umode_t nfs_type2fmt[] = { [NF4BAD] = 0, [NF4REG] = S_IFREG, @@ -868,15 +885,44 @@ static __be32 *reserve_space(struct xdr_stream *xdr, size_t nbytes) return p; } +static void encode_opaque_fixed(struct xdr_stream *xdr, const void *buf, size_t len) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, len); + xdr_encode_opaque_fixed(p, buf, len); +} + static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) { __be32 *p; - p = xdr_reserve_space(xdr, 4 + len); - BUG_ON(p == NULL); + p = reserve_space(xdr, 4 + len); xdr_encode_opaque(p, str, len); } +static void encode_uint32(struct xdr_stream *xdr, u32 n) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(n); +} + +static void encode_uint64(struct xdr_stream *xdr, u64 n) +{ + __be32 *p; + + p = reserve_space(xdr, 8); + xdr_encode_hyper(p, n); +} + +static void encode_nfs4_seqid(struct xdr_stream *xdr, + const struct nfs_seqid *seqid) +{ + encode_uint32(xdr, seqid->sequence->counter); +} + static void encode_compound_hdr(struct xdr_stream *xdr, struct rpc_rqst *req, struct compound_hdr *hdr) @@ -889,28 +935,37 @@ static void encode_compound_hdr(struct xdr_stream *xdr, * but this is not required as a MUST for the server to do so. */ hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen; - dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag); BUG_ON(hdr->taglen > NFS4_MAXTAGLEN); - p = reserve_space(xdr, 4 + hdr->taglen + 8); - p = xdr_encode_opaque(p, hdr->tag, hdr->taglen); + encode_string(xdr, hdr->taglen, hdr->tag); + p = reserve_space(xdr, 8); *p++ = cpu_to_be32(hdr->minorversion); hdr->nops_p = p; *p = cpu_to_be32(hdr->nops); } +static void encode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 op, + uint32_t replen, + struct compound_hdr *hdr) +{ + encode_uint32(xdr, op); + hdr->nops++; + hdr->replen += replen; +} + static void encode_nops(struct compound_hdr *hdr) { BUG_ON(hdr->nops > NFS4_MAX_OPS); *hdr->nops_p = htonl(hdr->nops); } -static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf) +static void encode_nfs4_stateid(struct xdr_stream *xdr, const nfs4_stateid *stateid) { - __be32 *p; + encode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE); +} - p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); - BUG_ON(p == NULL); - xdr_encode_opaque_fixed(p, verf->data, NFS4_VERIFIER_SIZE); +static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf) +{ + encode_opaque_fixed(xdr, verf->data, NFS4_VERIFIER_SIZE); } static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server) @@ -1023,7 +1078,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const * Now we backfill the bitmap and the attribute buffer length. */ if (len != ((char *)p - (char *)q) + 4) { - printk(KERN_ERR "nfs: Attr length error, %u != %Zu\n", + printk(KERN_ERR "NFS: Attr length error, %u != %Zu\n", len, ((char *)p - (char *)q) + 4); BUG(); } @@ -1037,46 +1092,33 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hdr *hdr) { - __be32 *p; - - p = reserve_space(xdr, 8); - *p++ = cpu_to_be32(OP_ACCESS); - *p = cpu_to_be32(access); - hdr->nops++; - hdr->replen += decode_access_maxsz; + encode_op_hdr(xdr, OP_ACCESS, decode_access_maxsz, hdr); + encode_uint32(xdr, access); } static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr) { - __be32 *p; - - p = reserve_space(xdr, 8+NFS4_STATEID_SIZE); - *p++ = cpu_to_be32(OP_CLOSE); - *p++ = cpu_to_be32(arg->seqid->sequence->counter); - xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); - hdr->nops++; - hdr->replen += decode_close_maxsz; + encode_op_hdr(xdr, OP_CLOSE, decode_close_maxsz, hdr); + encode_nfs4_seqid(xdr, arg->seqid); + encode_nfs4_stateid(xdr, arg->stateid); } static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr) { __be32 *p; - p = reserve_space(xdr, 16); - *p++ = cpu_to_be32(OP_COMMIT); + encode_op_hdr(xdr, OP_COMMIT, decode_commit_maxsz, hdr); + p = reserve_space(xdr, 12); p = xdr_encode_hyper(p, args->offset); *p = cpu_to_be32(args->count); - hdr->nops++; - hdr->replen += decode_commit_maxsz; } static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create, struct compound_hdr *hdr) { __be32 *p; - p = reserve_space(xdr, 8); - *p++ = cpu_to_be32(OP_CREATE); - *p = cpu_to_be32(create->ftype); + encode_op_hdr(xdr, OP_CREATE, decode_create_maxsz, hdr); + encode_uint32(xdr, create->ftype); switch (create->ftype) { case NF4LNK: @@ -1096,9 +1138,6 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg * } encode_string(xdr, create->name->len, create->name->name); - hdr->nops++; - hdr->replen += decode_create_maxsz; - encode_attrs(xdr, create->attrs, create->server); } @@ -1106,25 +1145,21 @@ static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct c { __be32 *p; - p = reserve_space(xdr, 12); - *p++ = cpu_to_be32(OP_GETATTR); + encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr); + p = reserve_space(xdr, 8); *p++ = cpu_to_be32(1); *p = cpu_to_be32(bitmap); - hdr->nops++; - hdr->replen += decode_getattr_maxsz; } static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1, struct compound_hdr *hdr) { __be32 *p; - p = reserve_space(xdr, 16); - *p++ = cpu_to_be32(OP_GETATTR); + encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr); + p = reserve_space(xdr, 12); *p++ = cpu_to_be32(2); *p++ = cpu_to_be32(bm0); *p = cpu_to_be32(bm1); - hdr->nops++; - hdr->replen += decode_getattr_maxsz; } static void @@ -1134,8 +1169,7 @@ encode_getattr_three(struct xdr_stream *xdr, { __be32 *p; - p = reserve_space(xdr, 4); - *p = cpu_to_be32(OP_GETATTR); + encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr); if (bm2) { p = reserve_space(xdr, 16); *p++ = cpu_to_be32(3); @@ -1152,8 +1186,6 @@ encode_getattr_three(struct xdr_stream *xdr, *p++ = cpu_to_be32(1); *p = cpu_to_be32(bm0); } - hdr->nops++; - hdr->replen += decode_getattr_maxsz; } static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) @@ -1179,23 +1211,13 @@ static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, stru static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr) { - __be32 *p; - - p = reserve_space(xdr, 4); - *p = cpu_to_be32(OP_GETFH); - hdr->nops++; - hdr->replen += decode_getfh_maxsz; + encode_op_hdr(xdr, OP_GETFH, decode_getfh_maxsz, hdr); } static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) { - __be32 *p; - - p = reserve_space(xdr, 8 + name->len); - *p++ = cpu_to_be32(OP_LINK); - xdr_encode_opaque(p, name->name, name->len); - hdr->nops++; - hdr->replen += decode_link_maxsz; + encode_op_hdr(xdr, OP_LINK, decode_link_maxsz, hdr); + encode_string(xdr, name->len, name->name); } static inline int nfs4_lock_type(struct file_lock *fl, int block) @@ -1232,79 +1254,60 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args { __be32 *p; - p = reserve_space(xdr, 32); - *p++ = cpu_to_be32(OP_LOCK); + encode_op_hdr(xdr, OP_LOCK, decode_lock_maxsz, hdr); + p = reserve_space(xdr, 28); *p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block)); *p++ = cpu_to_be32(args->reclaim); p = xdr_encode_hyper(p, args->fl->fl_start); p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); *p = cpu_to_be32(args->new_lock_owner); if (args->new_lock_owner){ - p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); - *p++ = cpu_to_be32(args->open_seqid->sequence->counter); - p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE); - *p++ = cpu_to_be32(args->lock_seqid->sequence->counter); + encode_nfs4_seqid(xdr, args->open_seqid); + encode_nfs4_stateid(xdr, args->open_stateid); + encode_nfs4_seqid(xdr, args->lock_seqid); encode_lockowner(xdr, &args->lock_owner); } else { - p = reserve_space(xdr, NFS4_STATEID_SIZE+4); - p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE); - *p = cpu_to_be32(args->lock_seqid->sequence->counter); + encode_nfs4_stateid(xdr, args->lock_stateid); + encode_nfs4_seqid(xdr, args->lock_seqid); } - hdr->nops++; - hdr->replen += decode_lock_maxsz; } static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args, struct compound_hdr *hdr) { __be32 *p; - p = reserve_space(xdr, 24); - *p++ = cpu_to_be32(OP_LOCKT); + encode_op_hdr(xdr, OP_LOCKT, decode_lockt_maxsz, hdr); + p = reserve_space(xdr, 20); *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); p = xdr_encode_hyper(p, args->fl->fl_start); p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); encode_lockowner(xdr, &args->lock_owner); - hdr->nops++; - hdr->replen += decode_lockt_maxsz; } static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args, struct compound_hdr *hdr) { __be32 *p; - p = reserve_space(xdr, 12+NFS4_STATEID_SIZE+16); - *p++ = cpu_to_be32(OP_LOCKU); - *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); - *p++ = cpu_to_be32(args->seqid->sequence->counter); - p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE); + encode_op_hdr(xdr, OP_LOCKU, decode_locku_maxsz, hdr); + encode_uint32(xdr, nfs4_lock_type(args->fl, 0)); + encode_nfs4_seqid(xdr, args->seqid); + encode_nfs4_stateid(xdr, args->stateid); + p = reserve_space(xdr, 16); p = xdr_encode_hyper(p, args->fl->fl_start); xdr_encode_hyper(p, nfs4_lock_length(args->fl)); - hdr->nops++; - hdr->replen += decode_locku_maxsz; } static void encode_release_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner, struct compound_hdr *hdr) { - __be32 *p; - - p = reserve_space(xdr, 4); - *p = cpu_to_be32(OP_RELEASE_LOCKOWNER); + encode_op_hdr(xdr, OP_RELEASE_LOCKOWNER, decode_release_lockowner_maxsz, hdr); encode_lockowner(xdr, lowner); - hdr->nops++; - hdr->replen += decode_release_lockowner_maxsz; } static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) { - int len = name->len; - __be32 *p; - - p = reserve_space(xdr, 8 + len); - *p++ = cpu_to_be32(OP_LOOKUP); - xdr_encode_opaque(p, name->name, len); - hdr->nops++; - hdr->replen += decode_lookup_maxsz; + encode_op_hdr(xdr, OP_LOOKUP, decode_lookup_maxsz, hdr); + encode_string(xdr, name->len, name->name); } static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode) @@ -1335,9 +1338,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4, * owner 4 = 32 */ - p = reserve_space(xdr, 8); - *p++ = cpu_to_be32(OP_OPEN); - *p = cpu_to_be32(arg->seqid->sequence->counter); + encode_nfs4_seqid(xdr, arg->seqid); encode_share_access(xdr, arg->fmode); p = reserve_space(xdr, 32); p = xdr_encode_hyper(p, arg->clientid); @@ -1437,14 +1438,15 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc { __be32 *p; - p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); - *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR); - xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR); + encode_nfs4_stateid(xdr, stateid); encode_string(xdr, name->len, name->name); } static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg, struct compound_hdr *hdr) { + encode_op_hdr(xdr, OP_OPEN, decode_open_maxsz, hdr); encode_openhdr(xdr, arg); encode_opentype(xdr, arg); switch (arg->claim) { @@ -1460,88 +1462,64 @@ static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg, default: BUG(); } - hdr->nops++; - hdr->replen += decode_open_maxsz; } static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg, struct compound_hdr *hdr) { - __be32 *p; - - p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); - *p++ = cpu_to_be32(OP_OPEN_CONFIRM); - p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); - *p = cpu_to_be32(arg->seqid->sequence->counter); - hdr->nops++; - hdr->replen += decode_open_confirm_maxsz; + encode_op_hdr(xdr, OP_OPEN_CONFIRM, decode_open_confirm_maxsz, hdr); + encode_nfs4_stateid(xdr, arg->stateid); + encode_nfs4_seqid(xdr, arg->seqid); } static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr) { - __be32 *p; - - p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); - *p++ = cpu_to_be32(OP_OPEN_DOWNGRADE); - p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); - *p = cpu_to_be32(arg->seqid->sequence->counter); + encode_op_hdr(xdr, OP_OPEN_DOWNGRADE, decode_open_downgrade_maxsz, hdr); + encode_nfs4_stateid(xdr, arg->stateid); + encode_nfs4_seqid(xdr, arg->seqid); encode_share_access(xdr, arg->fmode); - hdr->nops++; - hdr->replen += decode_open_downgrade_maxsz; } static void encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hdr *hdr) { - int len = fh->size; - __be32 *p; - - p = reserve_space(xdr, 8 + len); - *p++ = cpu_to_be32(OP_PUTFH); - xdr_encode_opaque(p, fh->data, len); - hdr->nops++; - hdr->replen += decode_putfh_maxsz; + encode_op_hdr(xdr, OP_PUTFH, decode_putfh_maxsz, hdr); + encode_string(xdr, fh->size, fh->data); } static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr) { - __be32 *p; - - p = reserve_space(xdr, 4); - *p = cpu_to_be32(OP_PUTROOTFH); - hdr->nops++; - hdr->replen += decode_putrootfh_maxsz; + encode_op_hdr(xdr, OP_PUTROOTFH, decode_putrootfh_maxsz, hdr); } -static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx, int zero_seqid) +static void encode_open_stateid(struct xdr_stream *xdr, + const struct nfs_open_context *ctx, + const struct nfs_lock_context *l_ctx, + fmode_t fmode, + int zero_seqid) { nfs4_stateid stateid; - __be32 *p; - p = reserve_space(xdr, NFS4_STATEID_SIZE); if (ctx->state != NULL) { - nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid); + nfs4_select_rw_stateid(&stateid, ctx->state, + fmode, l_ctx->lockowner, l_ctx->pid); if (zero_seqid) - stateid.stateid.seqid = 0; - xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); + stateid.seqid = 0; + encode_nfs4_stateid(xdr, &stateid); } else - xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); + encode_nfs4_stateid(xdr, &zero_stateid); } static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) { __be32 *p; - p = reserve_space(xdr, 4); - *p = cpu_to_be32(OP_READ); - - encode_stateid(xdr, args->context, args->lock_context, - hdr->minorversion); + encode_op_hdr(xdr, OP_READ, decode_read_maxsz, hdr); + encode_open_stateid(xdr, args->context, args->lock_context, + FMODE_READ, hdr->minorversion); p = reserve_space(xdr, 12); p = xdr_encode_hyper(p, args->offset); *p = cpu_to_be32(args->count); - hdr->nops++; - hdr->replen += decode_read_maxsz; } static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr) @@ -1551,7 +1529,7 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg FATTR4_WORD1_MOUNTED_ON_FILEID, }; uint32_t dircount = readdir->count >> 1; - __be32 *p; + __be32 *p, verf[2]; if (readdir->plus) { attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE| @@ -1566,80 +1544,54 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg if (!(readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) attrs[0] |= FATTR4_WORD0_FILEID; - p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20); - *p++ = cpu_to_be32(OP_READDIR); - p = xdr_encode_hyper(p, readdir->cookie); - p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE); + encode_op_hdr(xdr, OP_READDIR, decode_readdir_maxsz, hdr); + encode_uint64(xdr, readdir->cookie); + encode_nfs4_verifier(xdr, &readdir->verifier); + p = reserve_space(xdr, 20); *p++ = cpu_to_be32(dircount); *p++ = cpu_to_be32(readdir->count); *p++ = cpu_to_be32(2); *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]); *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]); - hdr->nops++; - hdr->replen += decode_readdir_maxsz; + memcpy(verf, readdir->verifier.data, sizeof(verf)); dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n", __func__, (unsigned long long)readdir->cookie, - ((u32 *)readdir->verifier.data)[0], - ((u32 *)readdir->verifier.data)[1], + verf[0], verf[1], attrs[0] & readdir->bitmask[0], attrs[1] & readdir->bitmask[1]); } static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr) { - __be32 *p; - - p = reserve_space(xdr, 4); - *p = cpu_to_be32(OP_READLINK); - hdr->nops++; - hdr->replen += decode_readlink_maxsz; + encode_op_hdr(xdr, OP_READLINK, decode_readlink_maxsz, hdr); } static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) { - __be32 *p; - - p = reserve_space(xdr, 8 + name->len); - *p++ = cpu_to_be32(OP_REMOVE); - xdr_encode_opaque(p, name->name, name->len); - hdr->nops++; - hdr->replen += decode_remove_maxsz; + encode_op_hdr(xdr, OP_REMOVE, decode_remove_maxsz, hdr); + encode_string(xdr, name->len, name->name); } static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname, struct compound_hdr *hdr) { - __be32 *p; - - p = reserve_space(xdr, 4); - *p = cpu_to_be32(OP_RENAME); + encode_op_hdr(xdr, OP_RENAME, decode_rename_maxsz, hdr); encode_string(xdr, oldname->len, oldname->name); encode_string(xdr, newname->len, newname->name); - hdr->nops++; - hdr->replen += decode_rename_maxsz; } -static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid, struct compound_hdr *hdr) +static void encode_renew(struct xdr_stream *xdr, clientid4 clid, + struct compound_hdr *hdr) { - __be32 *p; - - p = reserve_space(xdr, 12); - *p++ = cpu_to_be32(OP_RENEW); - xdr_encode_hyper(p, client_stateid->cl_clientid); - hdr->nops++; - hdr->replen += decode_renew_maxsz; + encode_op_hdr(xdr, OP_RENEW, decode_renew_maxsz, hdr); + encode_uint64(xdr, clid); } static void encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr) { - __be32 *p; - - p = reserve_space(xdr, 4); - *p = cpu_to_be32(OP_RESTOREFH); - hdr->nops++; - hdr->replen += decode_restorefh_maxsz; + encode_op_hdr(xdr, OP_RESTOREFH, decode_restorefh_maxsz, hdr); } static void @@ -1647,9 +1599,8 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun { __be32 *p; - p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); - *p++ = cpu_to_be32(OP_SETATTR); - xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); + encode_op_hdr(xdr, OP_SETATTR, decode_setacl_maxsz, hdr); + encode_nfs4_stateid(xdr, &zero_stateid); p = reserve_space(xdr, 2*4); *p++ = cpu_to_be32(1); *p = cpu_to_be32(FATTR4_WORD0_ACL); @@ -1657,30 +1608,18 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun p = reserve_space(xdr, 4); *p = cpu_to_be32(arg->acl_len); xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); - hdr->nops++; - hdr->replen += decode_setacl_maxsz; } static void encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr) { - __be32 *p; - - p = reserve_space(xdr, 4); - *p = cpu_to_be32(OP_SAVEFH); - hdr->nops++; - hdr->replen += decode_savefh_maxsz; + encode_op_hdr(xdr, OP_SAVEFH, decode_savefh_maxsz, hdr); } static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server, struct compound_hdr *hdr) { - __be32 *p; - - p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); - *p++ = cpu_to_be32(OP_SETATTR); - xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE); - hdr->nops++; - hdr->replen += decode_setattr_maxsz; + encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr); + encode_nfs4_stateid(xdr, &arg->stateid); encode_attrs(xdr, arg->iap, server); } @@ -1688,9 +1627,8 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie { __be32 *p; - p = reserve_space(xdr, 4 + NFS4_VERIFIER_SIZE); - *p++ = cpu_to_be32(OP_SETCLIENTID); - xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE); + encode_op_hdr(xdr, OP_SETCLIENTID, decode_setclientid_maxsz, hdr); + encode_nfs4_verifier(xdr, setclientid->sc_verifier); encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name); p = reserve_space(xdr, 4); @@ -1699,31 +1637,23 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr); p = reserve_space(xdr, 4); *p = cpu_to_be32(setclientid->sc_cb_ident); - hdr->nops++; - hdr->replen += decode_setclientid_maxsz; } static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr) { - __be32 *p; - - p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE); - *p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM); - p = xdr_encode_hyper(p, arg->clientid); - xdr_encode_opaque_fixed(p, arg->confirm.data, NFS4_VERIFIER_SIZE); - hdr->nops++; - hdr->replen += decode_setclientid_confirm_maxsz; + encode_op_hdr(xdr, OP_SETCLIENTID_CONFIRM, + decode_setclientid_confirm_maxsz, hdr); + encode_uint64(xdr, arg->clientid); + encode_nfs4_verifier(xdr, &arg->confirm); } static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr) { __be32 *p; - p = reserve_space(xdr, 4); - *p = cpu_to_be32(OP_WRITE); - - encode_stateid(xdr, args->context, args->lock_context, - hdr->minorversion); + encode_op_hdr(xdr, OP_WRITE, decode_write_maxsz, hdr); + encode_open_stateid(xdr, args->context, args->lock_context, + FMODE_WRITE, hdr->minorversion); p = reserve_space(xdr, 16); p = xdr_encode_hyper(p, args->offset); @@ -1731,32 +1661,18 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg *p = cpu_to_be32(args->count); xdr_write_pages(xdr, args->pages, args->pgbase, args->count); - hdr->nops++; - hdr->replen += decode_write_maxsz; } static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid, struct compound_hdr *hdr) { - __be32 *p; - - p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); - - *p++ = cpu_to_be32(OP_DELEGRETURN); - xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); - hdr->nops++; - hdr->replen += decode_delegreturn_maxsz; + encode_op_hdr(xdr, OP_DELEGRETURN, decode_delegreturn_maxsz, hdr); + encode_nfs4_stateid(xdr, stateid); } static void encode_secinfo(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) { - int len = name->len; - __be32 *p; - - p = reserve_space(xdr, 8 + len); - *p++ = cpu_to_be32(OP_SECINFO); - xdr_encode_opaque(p, name->name, len); - hdr->nops++; - hdr->replen += decode_secinfo_maxsz; + encode_op_hdr(xdr, OP_SECINFO, decode_secinfo_maxsz, hdr); + encode_string(xdr, name->len, name->name); } #if defined(CONFIG_NFS_V4_1) @@ -1766,19 +1682,39 @@ static void encode_exchange_id(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; + char impl_name[NFS4_OPAQUE_LIMIT]; + int len = 0; - p = reserve_space(xdr, 4 + sizeof(args->verifier->data)); - *p++ = cpu_to_be32(OP_EXCHANGE_ID); - xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data)); + encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr); + encode_nfs4_verifier(xdr, args->verifier); encode_string(xdr, args->id_len, args->id); p = reserve_space(xdr, 12); *p++ = cpu_to_be32(args->flags); *p++ = cpu_to_be32(0); /* zero length state_protect4_a */ - *p = cpu_to_be32(0); /* zero length implementation id array */ - hdr->nops++; - hdr->replen += decode_exchange_id_maxsz; + + if (send_implementation_id && + sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) > 1 && + sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) + <= NFS4_OPAQUE_LIMIT + 1) + len = snprintf(impl_name, sizeof(impl_name), "%s %s %s %s", + utsname()->sysname, utsname()->release, + utsname()->version, utsname()->machine); + + if (len > 0) { + *p = cpu_to_be32(1); /* implementation id array length=1 */ + + encode_string(xdr, + sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) - 1, + CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN); + encode_string(xdr, len, impl_name); + /* just send zeros for nii_date - the date is in nii_name */ + p = reserve_space(xdr, 12); + p = xdr_encode_hyper(p, 0); + *p = cpu_to_be32(0); + } else + *p = cpu_to_be32(0); /* implementation id array length=0 */ } static void encode_create_session(struct xdr_stream *xdr, @@ -1801,8 +1737,8 @@ static void encode_create_session(struct xdr_stream *xdr, len = scnprintf(machine_name, sizeof(machine_name), "%s", clp->cl_ipaddr); - p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12); - *p++ = cpu_to_be32(OP_CREATE_SESSION); + encode_op_hdr(xdr, OP_CREATE_SESSION, decode_create_session_maxsz, hdr); + p = reserve_space(xdr, 16 + 2*28 + 20 + len + 12); p = xdr_encode_hyper(p, clp->cl_clientid); *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */ *p++ = cpu_to_be32(args->flags); /*flags */ @@ -1835,33 +1771,22 @@ static void encode_create_session(struct xdr_stream *xdr, *p++ = cpu_to_be32(0); /* UID */ *p++ = cpu_to_be32(0); /* GID */ *p = cpu_to_be32(0); /* No more gids */ - hdr->nops++; - hdr->replen += decode_create_session_maxsz; } static void encode_destroy_session(struct xdr_stream *xdr, struct nfs4_session *session, struct compound_hdr *hdr) { - __be32 *p; - p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN); - *p++ = cpu_to_be32(OP_DESTROY_SESSION); - xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); - hdr->nops++; - hdr->replen += decode_destroy_session_maxsz; + encode_op_hdr(xdr, OP_DESTROY_SESSION, decode_destroy_session_maxsz, hdr); + encode_opaque_fixed(xdr, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); } static void encode_reclaim_complete(struct xdr_stream *xdr, struct nfs41_reclaim_complete_args *args, struct compound_hdr *hdr) { - __be32 *p; - - p = reserve_space(xdr, 8); - *p++ = cpu_to_be32(OP_RECLAIM_COMPLETE); - *p++ = cpu_to_be32(args->one_fs); - hdr->nops++; - hdr->replen += decode_reclaim_complete_maxsz; + encode_op_hdr(xdr, OP_RECLAIM_COMPLETE, decode_reclaim_complete_maxsz, hdr); + encode_uint32(xdr, args->one_fs); } #endif /* CONFIG_NFS_V4_1 */ @@ -1883,8 +1808,7 @@ static void encode_sequence(struct xdr_stream *xdr, WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE); slot = tp->slots + args->sa_slotid; - p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN + 16); - *p++ = cpu_to_be32(OP_SEQUENCE); + encode_op_hdr(xdr, OP_SEQUENCE, decode_sequence_maxsz, hdr); /* * Sessionid + seqid + slotid + max slotid + cache_this @@ -1898,13 +1822,12 @@ static void encode_sequence(struct xdr_stream *xdr, ((u32 *)session->sess_id.data)[3], slot->seq_nr, args->sa_slotid, tp->highest_used_slotid, args->sa_cache_this); + p = reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 16); p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); *p++ = cpu_to_be32(slot->seq_nr); *p++ = cpu_to_be32(args->sa_slotid); *p++ = cpu_to_be32(tp->highest_used_slotid); *p = cpu_to_be32(args->sa_cache_this); - hdr->nops++; - hdr->replen += decode_sequence_maxsz; #endif /* CONFIG_NFS_V4_1 */ } @@ -1919,14 +1842,12 @@ encode_getdevicelist(struct xdr_stream *xdr, .data = "dummmmmy", }; - p = reserve_space(xdr, 20); - *p++ = cpu_to_be32(OP_GETDEVICELIST); + encode_op_hdr(xdr, OP_GETDEVICELIST, decode_getdevicelist_maxsz, hdr); + p = reserve_space(xdr, 16); *p++ = cpu_to_be32(args->layoutclass); *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM); xdr_encode_hyper(p, 0ULL); /* cookie */ encode_nfs4_verifier(xdr, &dummy); - hdr->nops++; - hdr->replen += decode_getdevicelist_maxsz; } static void @@ -1936,15 +1857,13 @@ encode_getdeviceinfo(struct xdr_stream *xdr, { __be32 *p; - p = reserve_space(xdr, 16 + NFS4_DEVICEID4_SIZE); - *p++ = cpu_to_be32(OP_GETDEVICEINFO); + encode_op_hdr(xdr, OP_GETDEVICEINFO, decode_getdeviceinfo_maxsz, hdr); + p = reserve_space(xdr, 12 + NFS4_DEVICEID4_SIZE); p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data, NFS4_DEVICEID4_SIZE); *p++ = cpu_to_be32(args->pdev->layout_type); *p++ = cpu_to_be32(args->pdev->pglen); /* gdia_maxcount */ *p++ = cpu_to_be32(0); /* bitmap length 0 */ - hdr->nops++; - hdr->replen += decode_getdeviceinfo_maxsz; } static void @@ -1954,16 +1873,16 @@ encode_layoutget(struct xdr_stream *xdr, { __be32 *p; - p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE); - *p++ = cpu_to_be32(OP_LAYOUTGET); + encode_op_hdr(xdr, OP_LAYOUTGET, decode_layoutget_maxsz, hdr); + p = reserve_space(xdr, 36); *p++ = cpu_to_be32(0); /* Signal layout available */ *p++ = cpu_to_be32(args->type); *p++ = cpu_to_be32(args->range.iomode); p = xdr_encode_hyper(p, args->range.offset); p = xdr_encode_hyper(p, args->range.length); p = xdr_encode_hyper(p, args->minlength); - p = xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE); - *p = cpu_to_be32(args->maxcount); + encode_nfs4_stateid(xdr, &args->stateid); + encode_uint32(xdr, args->maxcount); dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n", __func__, @@ -1972,8 +1891,6 @@ encode_layoutget(struct xdr_stream *xdr, (unsigned long)args->range.offset, (unsigned long)args->range.length, args->maxcount); - hdr->nops++; - hdr->replen += decode_layoutget_maxsz; } static int @@ -1987,13 +1904,14 @@ encode_layoutcommit(struct xdr_stream *xdr, dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten, NFS_SERVER(args->inode)->pnfs_curr_ld->id); - p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE); - *p++ = cpu_to_be32(OP_LAYOUTCOMMIT); + encode_op_hdr(xdr, OP_LAYOUTCOMMIT, decode_layoutcommit_maxsz, hdr); + p = reserve_space(xdr, 20); /* Only whole file layouts */ p = xdr_encode_hyper(p, 0); /* offset */ p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */ - *p++ = cpu_to_be32(0); /* reclaim */ - p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(0); /* reclaim */ + encode_nfs4_stateid(xdr, &args->stateid); + p = reserve_space(xdr, 20); *p++ = cpu_to_be32(1); /* newoffset = TRUE */ p = xdr_encode_hyper(p, args->lastbytewritten); *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ @@ -2002,13 +1920,9 @@ encode_layoutcommit(struct xdr_stream *xdr, if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit( NFS_I(inode)->layout, xdr, args); - else { - p = reserve_space(xdr, 4); - *p = cpu_to_be32(0); /* no layout-type payload */ - } + else + encode_uint32(xdr, 0); /* no layout-type payload */ - hdr->nops++; - hdr->replen += decode_layoutcommit_maxsz; return 0; } @@ -2019,27 +1933,23 @@ encode_layoutreturn(struct xdr_stream *xdr, { __be32 *p; - p = reserve_space(xdr, 20); - *p++ = cpu_to_be32(OP_LAYOUTRETURN); + encode_op_hdr(xdr, OP_LAYOUTRETURN, decode_layoutreturn_maxsz, hdr); + p = reserve_space(xdr, 16); *p++ = cpu_to_be32(0); /* reclaim. always 0 for now */ *p++ = cpu_to_be32(args->layout_type); *p++ = cpu_to_be32(IOMODE_ANY); *p = cpu_to_be32(RETURN_FILE); - p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE); + p = reserve_space(xdr, 16); p = xdr_encode_hyper(p, 0); p = xdr_encode_hyper(p, NFS4_MAX_UINT64); spin_lock(&args->inode->i_lock); - xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE); + encode_nfs4_stateid(xdr, &args->stateid); spin_unlock(&args->inode->i_lock); if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) { NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn( NFS_I(args->inode)->layout, xdr, args); - } else { - p = reserve_space(xdr, 4); - *p = cpu_to_be32(0); - } - hdr->nops++; - hdr->replen += decode_layoutreturn_maxsz; + } else + encode_uint32(xdr, 0); } static int @@ -2047,12 +1957,8 @@ encode_secinfo_no_name(struct xdr_stream *xdr, const struct nfs41_secinfo_no_name_args *args, struct compound_hdr *hdr) { - __be32 *p; - p = reserve_space(xdr, 8); - *p++ = cpu_to_be32(OP_SECINFO_NO_NAME); - *p++ = cpu_to_be32(args->style); - hdr->nops++; - hdr->replen += decode_secinfo_no_name_maxsz; + encode_op_hdr(xdr, OP_SECINFO_NO_NAME, decode_secinfo_no_name_maxsz, hdr); + encode_uint32(xdr, args->style); return 0; } @@ -2060,26 +1966,17 @@ static void encode_test_stateid(struct xdr_stream *xdr, struct nfs41_test_stateid_args *args, struct compound_hdr *hdr) { - __be32 *p; - - p = reserve_space(xdr, 8 + NFS4_STATEID_SIZE); - *p++ = cpu_to_be32(OP_TEST_STATEID); - *p++ = cpu_to_be32(1); - xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE); - hdr->nops++; - hdr->replen += decode_test_stateid_maxsz; + encode_op_hdr(xdr, OP_TEST_STATEID, decode_test_stateid_maxsz, hdr); + encode_uint32(xdr, 1); + encode_nfs4_stateid(xdr, args->stateid); } static void encode_free_stateid(struct xdr_stream *xdr, struct nfs41_free_stateid_args *args, struct compound_hdr *hdr) { - __be32 *p; - p = reserve_space(xdr, 4 + NFS4_STATEID_SIZE); - *p++ = cpu_to_be32(OP_FREE_STATEID); - xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE); - hdr->nops++; - hdr->replen += decode_free_stateid_maxsz; + encode_op_hdr(xdr, OP_FREE_STATEID, decode_free_stateid_maxsz, hdr); + encode_nfs4_stateid(xdr, args->stateid); } #endif /* CONFIG_NFS_V4_1 */ @@ -2633,6 +2530,7 @@ static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req, encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fhandle, &hdr); encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS| + FATTR4_WORD0_FH_EXPIRE_TYPE| FATTR4_WORD0_LINK_SUPPORT| FATTR4_WORD0_SYMLINK_SUPPORT| FATTR4_WORD0_ACLSUPPORT, &hdr); @@ -2650,7 +2548,7 @@ static void nfs4_xdr_enc_renew(struct rpc_rqst *req, struct xdr_stream *xdr, }; encode_compound_hdr(xdr, req, &hdr); - encode_renew(xdr, clp, &hdr); + encode_renew(xdr, clp->cl_clientid, &hdr); encode_nops(&hdr); } @@ -3180,6 +3078,28 @@ out_overflow: return -EIO; } +static int decode_attr_fh_expire_type(struct xdr_stream *xdr, + uint32_t *bitmap, uint32_t *type) +{ + __be32 *p; + + *type = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_FH_EXPIRE_TYPE - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_FH_EXPIRE_TYPE)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *type = be32_to_cpup(p); + bitmap[0] &= ~FATTR4_WORD0_FH_EXPIRE_TYPE; + } + dprintk("%s: expire type=0x%x\n", __func__, *type); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change) { __be32 *p; @@ -3513,16 +3433,17 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) n = be32_to_cpup(p); if (n == 0) goto root_path; - dprintk("path "); + dprintk("pathname4: "); path->ncomponents = 0; while (path->ncomponents < n) { struct nfs4_string *component = &path->components[path->ncomponents]; status = decode_opaque_inline(xdr, &component->len, &component->data); if (unlikely(status != 0)) goto out_eio; - if (path->ncomponents != n) - dprintk("/"); - dprintk("%s", component->data); + ifdebug (XDR) + pr_cont("%s%.*s ", + (path->ncomponents != n ? "/ " : ""), + component->len, component->data); if (path->ncomponents < NFS4_PATHNAME_MAXCOMPONENTS) path->ncomponents++; else { @@ -3531,14 +3452,13 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) } } out: - dprintk("\n"); return status; root_path: /* a root pathname is sent as a zero component4 */ path->ncomponents = 1; path->components[0].len=0; path->components[0].data=NULL; - dprintk("path /\n"); + dprintk("pathname4: /\n"); goto out; out_eio: dprintk(" status %d", status); @@ -3560,7 +3480,11 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st status = 0; if (unlikely(!(bitmap[0] & FATTR4_WORD0_FS_LOCATIONS))) goto out; - dprintk("%s: fsroot ", __func__); + status = -EIO; + /* Ignore borken servers that return unrequested attrs */ + if (unlikely(res == NULL)) + goto out; + dprintk("%s: fsroot:\n", __func__); status = decode_pathname(xdr, &res->fs_path); if (unlikely(status != 0)) goto out; @@ -3581,7 +3505,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st m = be32_to_cpup(p); loc->nservers = 0; - dprintk("%s: servers ", __func__); + dprintk("%s: servers:\n", __func__); while (loc->nservers < m) { struct nfs4_string *server = &loc->servers[loc->nservers]; status = decode_opaque_inline(xdr, &server->len, &server->data); @@ -3613,7 +3537,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st res->nlocations++; } if (res->nlocations != 0) - status = NFS_ATTR_FATTR_V4_REFERRAL; + status = NFS_ATTR_FATTR_V4_LOCATIONS; out: dprintk("%s: fs_locations done, error = %d\n", __func__, status); return status; @@ -4157,7 +4081,7 @@ static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len) static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) { - return decode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE); + return decode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE); } static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) @@ -4174,7 +4098,7 @@ static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) static int decode_verifier(struct xdr_stream *xdr, void *verifier) { - return decode_opaque_fixed(xdr, verifier, 8); + return decode_opaque_fixed(xdr, verifier, NFS4_VERIFIER_SIZE); } static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res) @@ -4224,6 +4148,9 @@ static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_re goto xdr_error; if ((status = decode_attr_supported(xdr, bitmap, res->attr_bitmask)) != 0) goto xdr_error; + if ((status = decode_attr_fh_expire_type(xdr, bitmap, + &res->fh_expire_type)) != 0) + goto xdr_error; if ((status = decode_attr_link_support(xdr, bitmap, &res->has_links)) != 0) goto xdr_error; if ((status = decode_attr_symlink_support(xdr, bitmap, &res->has_symlinks)) != 0) @@ -4294,6 +4221,7 @@ xdr_error: static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fattr *fattr, struct nfs_fh *fh, + struct nfs4_fs_locations *fs_loc, const struct nfs_server *server) { int status; @@ -4341,9 +4269,7 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, goto xdr_error; fattr->valid |= status; - status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr, - struct nfs4_fs_locations, - fattr)); + status = decode_attr_fs_locations(xdr, bitmap, fs_loc); if (status < 0) goto xdr_error; fattr->valid |= status; @@ -4407,7 +4333,8 @@ xdr_error: } static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr, - struct nfs_fh *fh, const struct nfs_server *server) + struct nfs_fh *fh, struct nfs4_fs_locations *fs_loc, + const struct nfs_server *server) { __be32 *savep; uint32_t attrlen, @@ -4426,7 +4353,7 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat if (status < 0) goto xdr_error; - status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server); + status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc, server); if (status < 0) goto xdr_error; @@ -4439,7 +4366,7 @@ xdr_error: static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, const struct nfs_server *server) { - return decode_getfattr_generic(xdr, fattr, NULL, server); + return decode_getfattr_generic(xdr, fattr, NULL, NULL, server); } /* @@ -4463,8 +4390,8 @@ static int decode_first_pnfs_layout_type(struct xdr_stream *xdr, return 0; } if (num > 1) - printk(KERN_INFO "%s: Warning: Multiple pNFS layout drivers " - "per filesystem not supported\n", __func__); + printk(KERN_INFO "NFS: %s: Warning: Multiple pNFS layout " + "drivers per filesystem not supported\n", __func__); /* Decode and set first layout type, move xdr->p past unused types */ p = xdr_inline_decode(xdr, num * 4); @@ -4863,17 +4790,16 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n size_t hdrlen; u32 recvd, pglen = rcvbuf->page_len; int status; + __be32 verf[2]; status = decode_op_hdr(xdr, OP_READDIR); if (!status) status = decode_verifier(xdr, readdir->verifier.data); if (unlikely(status)) return status; + memcpy(verf, readdir->verifier.data, sizeof(verf)); dprintk("%s: verifier = %08x:%08x\n", - __func__, - ((u32 *)readdir->verifier.data)[0], - ((u32 *)readdir->verifier.data)[1]); - + __func__, verf[0], verf[1]); hdrlen = (char *) xdr->p - (char *) iov->iov_base; recvd = rcvbuf->len - hdrlen; @@ -5120,7 +5046,7 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res) goto out_overflow; res->count = be32_to_cpup(p++); res->verf->committed = be32_to_cpup(p++); - memcpy(res->verf->verifier, p, 8); + memcpy(res->verf->verifier, p, NFS4_VERIFIER_SIZE); return 0; out_overflow: print_overflow_msg(__func__, xdr); @@ -5214,6 +5140,7 @@ static int decode_exchange_id(struct xdr_stream *xdr, char *dummy_str; int status; struct nfs_client *clp = res->client; + uint32_t impl_id_count; status = decode_op_hdr(xdr, OP_EXCHANGE_ID); if (status) @@ -5255,11 +5182,38 @@ static int decode_exchange_id(struct xdr_stream *xdr, memcpy(res->server_scope->server_scope, dummy_str, dummy); res->server_scope->server_scope_sz = dummy; - /* Throw away Implementation id array */ - status = decode_opaque_inline(xdr, &dummy, &dummy_str); - if (unlikely(status)) - return status; + /* Implementation Id */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + impl_id_count = be32_to_cpup(p++); + + if (impl_id_count) { + /* nii_domain */ + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; + if (unlikely(dummy > NFS4_OPAQUE_LIMIT)) + return -EIO; + memcpy(res->impl_id->domain, dummy_str, dummy); + /* nii_name */ + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; + if (unlikely(dummy > NFS4_OPAQUE_LIMIT)) + return -EIO; + memcpy(res->impl_id->name, dummy_str, dummy); + + /* nii_date */ + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &res->impl_id->date.seconds); + res->impl_id->date.nseconds = be32_to_cpup(p); + + /* if there's more than one entry, ignore the rest */ + } return 0; out_overflow: print_overflow_msg(__func__, xdr); @@ -5285,8 +5239,8 @@ static int decode_chan_attrs(struct xdr_stream *xdr, attrs->max_reqs = be32_to_cpup(p++); nr_attrs = be32_to_cpup(p); if (unlikely(nr_attrs > 1)) { - printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n", - __func__, nr_attrs); + printk(KERN_WARNING "NFS: %s: Invalid rdma channel attrs " + "count %u\n", __func__, nr_attrs); return -EINVAL; } if (nr_attrs == 1) { @@ -5436,14 +5390,14 @@ static int decode_getdevicelist(struct xdr_stream *xdr, p += 2; /* Read verifier */ - p = xdr_decode_opaque_fixed(p, verftemp.verifier, 8); + p = xdr_decode_opaque_fixed(p, verftemp.verifier, NFS4_VERIFIER_SIZE); res->num_devs = be32_to_cpup(p); dprintk("%s: num_dev %d\n", __func__, res->num_devs); if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) { - printk(KERN_ERR "%s too many result dev_num %u\n", + printk(KERN_ERR "NFS: %s too many result dev_num %u\n", __func__, res->num_devs); return -EIO; } @@ -5537,11 +5491,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, status = decode_op_hdr(xdr, OP_LAYOUTGET); if (status) return status; - p = xdr_inline_decode(xdr, 8 + NFS4_STATEID_SIZE); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + res->return_on_close = be32_to_cpup(p); + decode_stateid(xdr, &res->stateid); + p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - res->return_on_close = be32_to_cpup(p++); - p = xdr_decode_opaque_fixed(p, res->stateid.data, NFS4_STATEID_SIZE); layout_count = be32_to_cpup(p); if (!layout_count) { dprintk("%s: server responded with empty layout array\n", @@ -5666,7 +5623,8 @@ static int decode_test_stateid(struct xdr_stream *xdr, if (unlikely(!p)) goto out_overflow; res->status = be32_to_cpup(p++); - return res->status; + + return status; out_overflow: print_overflow_msg(__func__, xdr); out: @@ -6583,8 +6541,9 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, if (status) goto out; xdr_enter_page(xdr, PAGE_SIZE); - status = decode_getfattr(xdr, &res->fs_locations->fattr, - res->fs_locations->server); + status = decode_getfattr_generic(xdr, &res->fs_locations->fattr, + NULL, res->fs_locations, + res->fs_locations->server); out: return status; } @@ -6964,7 +6923,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, goto out_overflow; if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, - entry->server) < 0) + NULL, entry->server) < 0) goto out_overflow; if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) entry->ino = entry->fattr->mounted_on_fileid; @@ -7112,7 +7071,7 @@ struct rpc_procinfo nfs4_procedures[] = { #endif /* CONFIG_NFS_V4_1 */ }; -struct rpc_version nfs_version4 = { +const struct rpc_version nfs_version4 = { .number = 4, .nrprocs = ARRAY_SIZE(nfs4_procedures), .procs = nfs4_procedures diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index c4744e1..cd3c910 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -104,7 +104,7 @@ static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = ""; /* server:export path string passed to super.c */ static char nfs_root_device[NFS_MAXPATHLEN + 1] __initdata = ""; -#ifdef RPC_DEBUG +#ifdef NFS_DEBUG /* * When the "nfsrootdebug" kernel command line option is specified, * enable debugging messages for NFSROOT. diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 55d0128..4bff4a3 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -137,6 +137,7 @@ static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, struct objio_dev_ent *ode; struct osd_dev *od; struct osd_dev_info odi; + bool retry_flag = true; int err; ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id); @@ -171,10 +172,18 @@ static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, goto out; } +retry_lookup: od = osduld_info_lookup(&odi); if (unlikely(IS_ERR(od))) { err = PTR_ERR(od); dprintk("%s: osduld_info_lookup => %d\n", __func__, err); + if (err == -ENODEV && retry_flag) { + err = objlayout_autologin(deviceaddr); + if (likely(!err)) { + retry_flag = false; + goto retry_lookup; + } + } goto out; } @@ -205,25 +214,36 @@ static void copy_single_comp(struct ore_components *oc, unsigned c, int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags, struct objio_segment **pseg) { - struct __alloc_objio_segment { - struct objio_segment olseg; - struct ore_dev *ods[numdevs]; - struct ore_comp comps[numdevs]; - } *aolseg; - - aolseg = kzalloc(sizeof(*aolseg), gfp_flags); - if (unlikely(!aolseg)) { +/* This is the in memory structure of the objio_segment + * + * struct __alloc_objio_segment { + * struct objio_segment olseg; + * struct ore_dev *ods[numdevs]; + * struct ore_comp comps[numdevs]; + * } *aolseg; + * NOTE: The code as above compiles and runs perfectly. It is elegant, + * type safe and compact. At some Past time Linus has decided he does not + * like variable length arrays, For the sake of this principal we uglify + * the code as below. + */ + struct objio_segment *lseg; + size_t lseg_size = sizeof(*lseg) + + numdevs * sizeof(lseg->oc.ods[0]) + + numdevs * sizeof(*lseg->oc.comps); + + lseg = kzalloc(lseg_size, gfp_flags); + if (unlikely(!lseg)) { dprintk("%s: Faild allocation numdevs=%d size=%zd\n", __func__, - numdevs, sizeof(*aolseg)); + numdevs, lseg_size); return -ENOMEM; } - aolseg->olseg.oc.numdevs = numdevs; - aolseg->olseg.oc.single_comp = EC_MULTPLE_COMPS; - aolseg->olseg.oc.comps = aolseg->comps; - aolseg->olseg.oc.ods = aolseg->ods; + lseg->oc.numdevs = numdevs; + lseg->oc.single_comp = EC_MULTPLE_COMPS; + lseg->oc.ods = (void *)(lseg + 1); + lseg->oc.comps = (void *)(lseg->oc.ods + numdevs); - *pseg = &aolseg->olseg; + *pseg = lseg; return 0; } @@ -582,10 +602,10 @@ objlayout_init(void) if (ret) printk(KERN_INFO - "%s: Registering OSD pNFS Layout Driver failed: error=%d\n", + "NFS: %s: Registering OSD pNFS Layout Driver failed: error=%d\n", __func__, ret); else - printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n", + printk(KERN_INFO "NFS: %s: Registered OSD pNFS Layout Driver\n", __func__); return ret; } @@ -594,7 +614,7 @@ static void __exit objlayout_exit(void) { pnfs_unregister_layoutdriver(&objlayout_type); - printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n", + printk(KERN_INFO "NFS: %s: Unregistered OSD pNFS Layout Driver\n", __func__); } diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index b3c2903..8d45f1c 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -37,6 +37,9 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include <linux/kmod.h> +#include <linux/moduleparam.h> +#include <linux/ratelimit.h> #include <scsi/osd_initiator.h> #include "objlayout.h" @@ -156,7 +159,7 @@ last_byte_offset(u64 start, u64 len) return end > start ? end - 1 : NFS4_MAX_UINT64; } -void _fix_verify_io_params(struct pnfs_layout_segment *lseg, +static void _fix_verify_io_params(struct pnfs_layout_segment *lseg, struct page ***p_pages, unsigned *p_pgbase, u64 offset, unsigned long count) { @@ -490,9 +493,9 @@ encode_accumulated_error(struct objlayout *objlay, __be32 *p) if (!ioerr->oer_errno) continue; - printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d " - "dev(%llx:%llx) par=0x%llx obj=0x%llx " - "offset=0x%llx length=0x%llx\n", + printk(KERN_ERR "NFS: %s: err[%d]: errno=%d " + "is_write=%d dev(%llx:%llx) par=0x%llx " + "obj=0x%llx offset=0x%llx length=0x%llx\n", __func__, i, ioerr->oer_errno, ioerr->oer_iswrite, _DEVID_LO(&ioerr->oer_component.oid_device_id), @@ -651,3 +654,134 @@ void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr) __free_page(odi->page); kfree(odi); } + +enum { + OBJLAYOUT_MAX_URI_LEN = 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64, + OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1, + OSD_LOGIN_UPCALL_PATHLEN = 256 +}; + +static char osd_login_prog[OSD_LOGIN_UPCALL_PATHLEN] = "/sbin/osd_login"; + +module_param_string(osd_login_prog, osd_login_prog, sizeof(osd_login_prog), + 0600); +MODULE_PARM_DESC(osd_login_prog, "Path to the osd_login upcall program"); + +struct __auto_login { + char uri[OBJLAYOUT_MAX_URI_LEN]; + char osdname[OBJLAYOUT_MAX_OSDNAME_LEN]; + char systemid_hex[OBJLAYOUT_MAX_SYSID_HEX_LEN]; +}; + +static int __objlayout_upcall(struct __auto_login *login) +{ + static char *envp[] = { "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL + }; + char *argv[8]; + int ret; + + if (unlikely(!osd_login_prog[0])) { + dprintk("%s: osd_login_prog is disabled\n", __func__); + return -EACCES; + } + + dprintk("%s uri: %s\n", __func__, login->uri); + dprintk("%s osdname %s\n", __func__, login->osdname); + dprintk("%s systemid_hex %s\n", __func__, login->systemid_hex); + + argv[0] = (char *)osd_login_prog; + argv[1] = "-u"; + argv[2] = login->uri; + argv[3] = "-o"; + argv[4] = login->osdname; + argv[5] = "-s"; + argv[6] = login->systemid_hex; + argv[7] = NULL; + + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); + /* + * Disable the upcall mechanism if we're getting an ENOENT or + * EACCES error. The admin can re-enable it on the fly by using + * sysfs to set the objlayoutdriver.osd_login_prog module parameter once + * the problem has been fixed. + */ + if (ret == -ENOENT || ret == -EACCES) { + printk(KERN_ERR "PNFS-OBJ: %s was not found please set " + "objlayoutdriver.osd_login_prog kernel parameter!\n", + osd_login_prog); + osd_login_prog[0] = '\0'; + } + dprintk("%s %s return value: %d\n", __func__, osd_login_prog, ret); + + return ret; +} + +/* Assume dest is all zeros */ +static void __copy_nfsS_and_zero_terminate(struct nfs4_string s, + char *dest, int max_len, + const char *var_name) +{ + if (!s.len) + return; + + if (s.len >= max_len) { + pr_warn_ratelimited( + "objlayout_autologin: %s: s.len(%d) >= max_len(%d)", + var_name, s.len, max_len); + s.len = max_len - 1; /* space for null terminator */ + } + + memcpy(dest, s.data, s.len); +} + +/* Assume sysid is all zeros */ +static void _sysid_2_hex(struct nfs4_string s, + char sysid[OBJLAYOUT_MAX_SYSID_HEX_LEN]) +{ + int i; + char *cur; + + if (!s.len) + return; + + if (s.len != OSD_SYSTEMID_LEN) { + pr_warn_ratelimited( + "objlayout_autologin: systemid_len(%d) != OSD_SYSTEMID_LEN", + s.len); + if (s.len > OSD_SYSTEMID_LEN) + s.len = OSD_SYSTEMID_LEN; + } + + cur = sysid; + for (i = 0; i < s.len; i++) + cur = hex_byte_pack(cur, s.data[i]); +} + +int objlayout_autologin(struct pnfs_osd_deviceaddr *deviceaddr) +{ + int rc; + struct __auto_login login; + + if (!deviceaddr->oda_targetaddr.ota_netaddr.r_addr.len) + return -ENODEV; + + memset(&login, 0, sizeof(login)); + __copy_nfsS_and_zero_terminate( + deviceaddr->oda_targetaddr.ota_netaddr.r_addr, + login.uri, sizeof(login.uri), "URI"); + + __copy_nfsS_and_zero_terminate( + deviceaddr->oda_osdname, + login.osdname, sizeof(login.osdname), "OSDNAME"); + + _sysid_2_hex(deviceaddr->oda_systemid, login.systemid_hex); + + rc = __objlayout_upcall(&login); + if (rc > 0) /* script returns positive values */ + rc = -ENODEV; + + return rc; +} diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index 8ec3472..880ba08 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -184,4 +184,6 @@ extern void objlayout_encode_layoutreturn( struct xdr_stream *, const struct nfs4_layoutreturn_args *); +extern int objlayout_autologin(struct pnfs_osd_deviceaddr *deviceaddr); + #endif /* _OBJLAYOUT_H */ diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 5668f7c..d21fcea 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -13,6 +13,7 @@ #include <linux/file.h> #include <linux/sched.h> #include <linux/sunrpc/clnt.h> +#include <linux/nfs.h> #include <linux/nfs3.h> #include <linux/nfs4.h> #include <linux/nfs_page.h> @@ -106,36 +107,6 @@ void nfs_unlock_request(struct nfs_page *req) nfs_release_request(req); } -/** - * nfs_set_page_tag_locked - Tag a request as locked - * @req: - */ -int nfs_set_page_tag_locked(struct nfs_page *req) -{ - if (!nfs_lock_request_dontget(req)) - return 0; - if (test_bit(PG_MAPPED, &req->wb_flags)) - radix_tree_tag_set(&NFS_I(req->wb_context->dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); - return 1; -} - -/** - * nfs_clear_page_tag_locked - Clear request tag and wake up sleepers - */ -void nfs_clear_page_tag_locked(struct nfs_page *req) -{ - if (test_bit(PG_MAPPED, &req->wb_flags)) { - struct inode *inode = req->wb_context->dentry->d_inode; - struct nfs_inode *nfsi = NFS_I(inode); - - spin_lock(&inode->i_lock); - radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); - nfs_unlock_request(req); - spin_unlock(&inode->i_lock); - } else - nfs_unlock_request(req); -} - /* * nfs_clear_request - Free up all resources allocated to the request * @req: @@ -425,67 +396,6 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) } } -#define NFS_SCAN_MAXENTRIES 16 -/** - * nfs_scan_list - Scan a list for matching requests - * @nfsi: NFS inode - * @dst: Destination list - * @idx_start: lower bound of page->index to scan - * @npages: idx_start + npages sets the upper bound to scan. - * @tag: tag to scan for - * - * Moves elements from one of the inode request lists. - * If the number of requests is set to 0, the entire address_space - * starting at index idx_start, is scanned. - * The requests are *not* checked to ensure that they form a contiguous set. - * You must be holding the inode's i_lock when calling this function - */ -int nfs_scan_list(struct nfs_inode *nfsi, - struct list_head *dst, pgoff_t idx_start, - unsigned int npages, int tag) -{ - struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES]; - struct nfs_page *req; - pgoff_t idx_end; - int found, i; - int res; - struct list_head *list; - - res = 0; - if (npages == 0) - idx_end = ~0; - else - idx_end = idx_start + npages - 1; - - for (;;) { - found = radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, - (void **)&pgvec[0], idx_start, - NFS_SCAN_MAXENTRIES, tag); - if (found <= 0) - break; - for (i = 0; i < found; i++) { - req = pgvec[i]; - if (req->wb_index > idx_end) - goto out; - idx_start = req->wb_index + 1; - if (nfs_set_page_tag_locked(req)) { - kref_get(&req->wb_kref); - radix_tree_tag_clear(&nfsi->nfs_page_tree, - req->wb_index, tag); - list = pnfs_choose_commit_list(req, dst); - nfs_list_add_request(req, list); - res++; - if (res == INT_MAX) - goto out; - } - } - /* for latency reduction */ - cond_resched_lock(&nfsi->vfs_inode.i_lock); - } -out: - return res; -} - int __init nfs_init_nfspagecache(void) { nfs_page_cachep = kmem_cache_create("nfs_page", diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 17149a4..b5d4515 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -101,8 +101,8 @@ set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh, goto out_no_driver; if (!(server->nfs_client->cl_exchange_flags & (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) { - printk(KERN_ERR "%s: id %u cl_exchange_flags 0x%x\n", __func__, - id, server->nfs_client->cl_exchange_flags); + printk(KERN_ERR "NFS: %s: id %u cl_exchange_flags 0x%x\n", + __func__, id, server->nfs_client->cl_exchange_flags); goto out_no_driver; } ld_type = find_pnfs_driver(id); @@ -122,8 +122,8 @@ set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh, server->pnfs_curr_ld = ld_type; if (ld_type->set_layoutdriver && ld_type->set_layoutdriver(server, mntfh)) { - printk(KERN_ERR "%s: Error initializing pNFS layout driver %u.\n", - __func__, id); + printk(KERN_ERR "NFS: %s: Error initializing pNFS layout " + "driver %u.\n", __func__, id); module_put(ld_type->owner); goto out_no_driver; } @@ -143,11 +143,11 @@ pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type) struct pnfs_layoutdriver_type *tmp; if (ld_type->id == 0) { - printk(KERN_ERR "%s id 0 is reserved\n", __func__); + printk(KERN_ERR "NFS: %s id 0 is reserved\n", __func__); return status; } if (!ld_type->alloc_lseg || !ld_type->free_lseg) { - printk(KERN_ERR "%s Layout driver must provide " + printk(KERN_ERR "NFS: %s Layout driver must provide " "alloc_lseg and free_lseg.\n", __func__); return status; } @@ -160,7 +160,7 @@ pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type) dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id, ld_type->name); } else { - printk(KERN_ERR "%s Module with id %d already loaded!\n", + printk(KERN_ERR "NFS: %s Module with id %d already loaded!\n", __func__, ld_type->id); } spin_unlock(&pnfs_spinlock); @@ -496,12 +496,12 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, { u32 oldseq, newseq; - oldseq = be32_to_cpu(lo->plh_stateid.stateid.seqid); - newseq = be32_to_cpu(new->stateid.seqid); + oldseq = be32_to_cpu(lo->plh_stateid.seqid); + newseq = be32_to_cpu(new->seqid); if ((int)(newseq - oldseq) > 0) { - memcpy(&lo->plh_stateid, &new->stateid, sizeof(new->stateid)); + nfs4_stateid_copy(&lo->plh_stateid, new); if (update_barrier) { - u32 new_barrier = be32_to_cpu(new->stateid.seqid); + u32 new_barrier = be32_to_cpu(new->seqid); if ((int)(new_barrier - lo->plh_barrier)) lo->plh_barrier = new_barrier; @@ -525,7 +525,7 @@ pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid, int lget) { if ((stateid) && - (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0) + (int)(lo->plh_barrier - be32_to_cpu(stateid->seqid)) >= 0) return true; return lo->plh_block_lgets || test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) || @@ -549,11 +549,10 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, do { seq = read_seqbegin(&open_state->seqlock); - memcpy(dst->data, open_state->stateid.data, - sizeof(open_state->stateid.data)); + nfs4_stateid_copy(dst, &open_state->stateid); } while (read_seqretry(&open_state->seqlock, seq)); } else - memcpy(dst->data, lo->plh_stateid.data, sizeof(lo->plh_stateid.data)); + nfs4_stateid_copy(dst, &lo->plh_stateid); spin_unlock(&lo->plh_inode->i_lock); dprintk("<-- %s\n", __func__); return status; @@ -590,7 +589,7 @@ send_layoutget(struct pnfs_layout_hdr *lo, max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; max_pages = max_resp_sz >> PAGE_SHIFT; - pages = kzalloc(max_pages * sizeof(struct page *), gfp_flags); + pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags); if (!pages) goto out_err_free; @@ -760,7 +759,7 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier) } if (!found) { struct pnfs_layout_hdr *lo = nfsi->layout; - u32 current_seqid = be32_to_cpu(lo->plh_stateid.stateid.seqid); + u32 current_seqid = be32_to_cpu(lo->plh_stateid.seqid); /* Since close does not return a layout stateid for use as * a barrier, we choose the worst-case barrier. @@ -966,8 +965,7 @@ pnfs_update_layout(struct inode *ino, } /* Do we even need to bother with this? */ - if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) || - test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { + if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { dprintk("%s matches recall, use MDS\n", __func__); goto out_unlock; } @@ -1032,7 +1030,6 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) struct nfs4_layoutget_res *res = &lgp->res; struct pnfs_layout_segment *lseg; struct inode *ino = lo->plh_inode; - struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; int status = 0; /* Inject layout blob into I/O device driver */ @@ -1048,8 +1045,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) } spin_lock(&ino->i_lock); - if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) || - test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { + if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { dprintk("%s forget reply due to recall\n", __func__); goto out_forget_reply; } @@ -1214,6 +1210,7 @@ void pnfs_ld_write_done(struct nfs_write_data *data) } data->task.tk_status = pnfs_write_done_resend_to_mds(data->inode, &data->pages); } + put_lseg(data->lseg); data->mds_ops->rpc_release(data); } EXPORT_SYMBOL_GPL(pnfs_ld_write_done); @@ -1227,6 +1224,7 @@ pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, nfs_list_add_request(data->req, &desc->pg_list); nfs_pageio_reset_write_mds(desc); desc->pg_recoalesce = 1; + put_lseg(data->lseg); nfs_writedata_release(data); } @@ -1327,6 +1325,7 @@ void pnfs_ld_read_done(struct nfs_read_data *data) data->mds_ops->rpc_call_done(&data->task, data); } else pnfs_ld_handle_read_error(data); + put_lseg(data->lseg); data->mds_ops->rpc_release(data); } EXPORT_SYMBOL_GPL(pnfs_ld_read_done); @@ -1530,8 +1529,7 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) end_pos = nfsi->layout->plh_lwb; nfsi->layout->plh_lwb = 0; - memcpy(&data->args.stateid.data, nfsi->layout->plh_stateid.data, - sizeof(nfsi->layout->plh_stateid.data)); + nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid); spin_unlock(&inode->i_lock); data->args.inode = inode; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 53d593a..442ebf6 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -94,11 +94,10 @@ struct pnfs_layoutdriver_type { const struct nfs_pageio_ops *pg_read_ops; const struct nfs_pageio_ops *pg_write_ops; - /* Returns true if layoutdriver wants to divert this request to - * driver's commit routine. - */ - bool (*mark_pnfs_commit)(struct pnfs_layout_segment *lseg); - struct list_head * (*choose_commit_list) (struct nfs_page *req); + void (*mark_request_commit) (struct nfs_page *req, + struct pnfs_layout_segment *lseg); + void (*clear_request_commit) (struct nfs_page *req); + int (*scan_commit_lists) (struct inode *inode, int max, spinlock_t *lock); int (*commit_pagelist)(struct inode *inode, struct list_head *mds_pages, int how); /* @@ -229,7 +228,6 @@ struct nfs4_deviceid_node { atomic_t ref; }; -void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id); struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, @@ -262,20 +260,6 @@ static inline int pnfs_enabled_sb(struct nfs_server *nfss) return nfss->pnfs_curr_ld != NULL; } -static inline void -pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) -{ - if (lseg) { - struct pnfs_layoutdriver_type *ld; - - ld = NFS_SERVER(req->wb_page->mapping->host)->pnfs_curr_ld; - if (ld->mark_pnfs_commit && ld->mark_pnfs_commit(lseg)) { - set_bit(PG_PNFS_COMMIT, &req->wb_flags); - req->wb_commit_lseg = get_lseg(lseg); - } - } -} - static inline int pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how) { @@ -284,27 +268,42 @@ pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how) return NFS_SERVER(inode)->pnfs_curr_ld->commit_pagelist(inode, mds_pages, how); } -static inline struct list_head * -pnfs_choose_commit_list(struct nfs_page *req, struct list_head *mds) +static inline bool +pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) { - struct list_head *rv; + struct inode *inode = req->wb_context->dentry->d_inode; + struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; - if (test_and_clear_bit(PG_PNFS_COMMIT, &req->wb_flags)) { - struct inode *inode = req->wb_commit_lseg->pls_layout->plh_inode; + if (lseg == NULL || ld->mark_request_commit == NULL) + return false; + ld->mark_request_commit(req, lseg); + return true; +} - set_bit(NFS_INO_PNFS_COMMIT, &NFS_I(inode)->flags); - rv = NFS_SERVER(inode)->pnfs_curr_ld->choose_commit_list(req); - /* matched by ref taken when PG_PNFS_COMMIT is set */ - put_lseg(req->wb_commit_lseg); - } else - rv = mds; - return rv; +static inline bool +pnfs_clear_request_commit(struct nfs_page *req) +{ + struct inode *inode = req->wb_context->dentry->d_inode; + struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + + if (ld == NULL || ld->clear_request_commit == NULL) + return false; + ld->clear_request_commit(req); + return true; } -static inline void pnfs_clear_request_commit(struct nfs_page *req) +static inline int +pnfs_scan_commit_lists(struct inode *inode, int max, spinlock_t *lock) { - if (test_and_clear_bit(PG_PNFS_COMMIT, &req->wb_flags)) - put_lseg(req->wb_commit_lseg); + struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + int ret; + + if (ld == NULL || ld->scan_commit_lists == NULL) + return 0; + ret = ld->scan_commit_lists(inode, max, lock); + if (ret != 0) + set_bit(NFS_INO_PNFS_COMMIT, &NFS_I(inode)->flags); + return ret; } /* Should the pNFS client commit and return the layout upon a setattr */ @@ -328,6 +327,13 @@ static inline int pnfs_return_layout(struct inode *ino) return 0; } +#ifdef NFS_DEBUG +void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id); +#else +static inline void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id) +{ +} +#endif /* NFS_DEBUG */ #else /* CONFIG_NFS_V4_1 */ static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) @@ -400,35 +406,35 @@ static inline bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, st return false; } -static inline void -pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) -{ -} - static inline int pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how) { return PNFS_NOT_ATTEMPTED; } -static inline struct list_head * -pnfs_choose_commit_list(struct nfs_page *req, struct list_head *mds) +static inline bool +pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) { - return mds; + return false; } -static inline void pnfs_clear_request_commit(struct nfs_page *req) +static inline bool +pnfs_clear_request_commit(struct nfs_page *req) { + return false; } -static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync) +static inline int +pnfs_scan_commit_lists(struct inode *inode, int max, spinlock_t *lock) { return 0; } -static inline void nfs4_deviceid_purge_client(struct nfs_client *ncl) +static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync) { + return 0; } + #endif /* CONFIG_NFS_V4_1 */ #endif /* FS_NFS_PNFS_H */ diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index 4f359d2..73f701f 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -43,6 +43,7 @@ static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE]; static DEFINE_SPINLOCK(nfs4_deviceid_lock); +#ifdef NFS_DEBUG void nfs4_print_deviceid(const struct nfs4_deviceid *id) { @@ -52,6 +53,7 @@ nfs4_print_deviceid(const struct nfs4_deviceid *id) p[0], p[1], p[2], p[3]); } EXPORT_SYMBOL_GPL(nfs4_print_deviceid); +#endif static inline u32 nfs4_deviceid_hash(const struct nfs4_deviceid *id) @@ -92,7 +94,7 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld, * @clp nfs_client associated with deviceid * @id deviceid to look up */ -struct nfs4_deviceid_node * +static struct nfs4_deviceid_node * _find_get_deviceid(const struct pnfs_layoutdriver_type *ld, const struct nfs_client *clp, const struct nfs4_deviceid *id, long hash) diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 0c672588..b63b6f4 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -358,6 +358,11 @@ nfs_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) msg->rpc_proc = &nfs_procedures[NFSPROC_REMOVE]; } +static void nfs_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) +{ + rpc_call_start(task); +} + static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir) { if (nfs_async_handle_expired_key(task)) @@ -372,6 +377,11 @@ nfs_proc_rename_setup(struct rpc_message *msg, struct inode *dir) msg->rpc_proc = &nfs_procedures[NFSPROC_RENAME]; } +static void nfs_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data) +{ + rpc_call_start(task); +} + static int nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir, struct inode *new_dir) @@ -651,6 +661,11 @@ static void nfs_proc_read_setup(struct nfs_read_data *data, struct rpc_message * msg->rpc_proc = &nfs_procedures[NFSPROC_READ]; } +static void nfs_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) +{ + rpc_call_start(task); +} + static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data) { if (nfs_async_handle_expired_key(task)) @@ -668,6 +683,11 @@ static void nfs_proc_write_setup(struct nfs_write_data *data, struct rpc_message msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE]; } +static void nfs_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) +{ + rpc_call_start(task); +} + static void nfs_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg) { @@ -721,9 +741,11 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .create = nfs_proc_create, .remove = nfs_proc_remove, .unlink_setup = nfs_proc_unlink_setup, + .unlink_rpc_prepare = nfs_proc_unlink_rpc_prepare, .unlink_done = nfs_proc_unlink_done, .rename = nfs_proc_rename, .rename_setup = nfs_proc_rename_setup, + .rename_rpc_prepare = nfs_proc_rename_rpc_prepare, .rename_done = nfs_proc_rename_done, .link = nfs_proc_link, .symlink = nfs_proc_symlink, @@ -736,8 +758,10 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .pathconf = nfs_proc_pathconf, .decode_dirent = nfs2_decode_dirent, .read_setup = nfs_proc_read_setup, + .read_rpc_prepare = nfs_proc_read_rpc_prepare, .read_done = nfs_read_done, .write_setup = nfs_proc_write_setup, + .write_rpc_prepare = nfs_proc_write_rpc_prepare, .write_done = nfs_write_done, .commit_setup = nfs_proc_commit_setup, .lock = nfs_proc_lock, diff --git a/fs/nfs/read.c b/fs/nfs/read.c index cfa175c..cc1f758 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -66,7 +66,6 @@ void nfs_readdata_free(struct nfs_read_data *p) void nfs_readdata_release(struct nfs_read_data *rdata) { - put_lseg(rdata->lseg); put_nfs_open_context(rdata->args.context); nfs_readdata_free(rdata); } @@ -465,23 +464,14 @@ static void nfs_readpage_release_partial(void *calldata) nfs_readdata_release(calldata); } -#if defined(CONFIG_NFS_V4_1) void nfs_read_prepare(struct rpc_task *task, void *calldata) { struct nfs_read_data *data = calldata; - - if (nfs4_setup_sequence(NFS_SERVER(data->inode), - &data->args.seq_args, &data->res.seq_res, - 0, task)) - return; - rpc_call_start(task); + NFS_PROTO(data->inode)->read_rpc_prepare(task, data); } -#endif /* CONFIG_NFS_V4_1 */ static const struct rpc_call_ops nfs_read_partial_ops = { -#if defined(CONFIG_NFS_V4_1) .rpc_call_prepare = nfs_read_prepare, -#endif /* CONFIG_NFS_V4_1 */ .rpc_call_done = nfs_readpage_result_partial, .rpc_release = nfs_readpage_release_partial, }; @@ -545,9 +535,7 @@ static void nfs_readpage_release_full(void *calldata) } static const struct rpc_call_ops nfs_read_full_ops = { -#if defined(CONFIG_NFS_V4_1) .rpc_call_prepare = nfs_read_prepare, -#endif /* CONFIG_NFS_V4_1 */ .rpc_call_done = nfs_readpage_result_full, .rpc_release = nfs_readpage_release_full, }; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 3dfa4f1..ccc4cdb 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -52,6 +52,8 @@ #include <linux/nfs_xdr.h> #include <linux/magic.h> #include <linux/parser.h> +#include <linux/nsproxy.h> +#include <linux/rcupdate.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -79,7 +81,6 @@ enum { Opt_cto, Opt_nocto, Opt_ac, Opt_noac, Opt_lock, Opt_nolock, - Opt_v2, Opt_v3, Opt_v4, Opt_udp, Opt_tcp, Opt_rdma, Opt_acl, Opt_noacl, Opt_rdirplus, Opt_nordirplus, @@ -97,10 +98,10 @@ enum { Opt_namelen, Opt_mountport, Opt_mountvers, - Opt_nfsvers, Opt_minorversion, /* Mount options that take string arguments */ + Opt_nfsvers, Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost, Opt_addr, Opt_mountaddr, Opt_clientaddr, Opt_lookupcache, @@ -132,9 +133,6 @@ static const match_table_t nfs_mount_option_tokens = { { Opt_noac, "noac" }, { Opt_lock, "lock" }, { Opt_nolock, "nolock" }, - { Opt_v2, "v2" }, - { Opt_v3, "v3" }, - { Opt_v4, "v4" }, { Opt_udp, "udp" }, { Opt_tcp, "tcp" }, { Opt_rdma, "rdma" }, @@ -163,9 +161,10 @@ static const match_table_t nfs_mount_option_tokens = { { Opt_namelen, "namlen=%s" }, { Opt_mountport, "mountport=%s" }, { Opt_mountvers, "mountvers=%s" }, + { Opt_minorversion, "minorversion=%s" }, + { Opt_nfsvers, "nfsvers=%s" }, { Opt_nfsvers, "vers=%s" }, - { Opt_minorversion, "minorversion=%s" }, { Opt_sec, "sec=%s" }, { Opt_proto, "proto=%s" }, @@ -179,6 +178,9 @@ static const match_table_t nfs_mount_option_tokens = { { Opt_fscache_uniq, "fsc=%s" }, { Opt_local_lock, "local_lock=%s" }, + /* The following needs to be listed after all other options */ + { Opt_nfsvers, "v%s" }, + { Opt_err, NULL } }; @@ -259,6 +261,22 @@ static match_table_t nfs_local_lock_tokens = { { Opt_local_lock_err, NULL } }; +enum { + Opt_vers_2, Opt_vers_3, Opt_vers_4, Opt_vers_4_0, + Opt_vers_4_1, + + Opt_vers_err +}; + +static match_table_t nfs_vers_tokens = { + { Opt_vers_2, "2" }, + { Opt_vers_3, "3" }, + { Opt_vers_4, "4" }, + { Opt_vers_4_0, "4.0" }, + { Opt_vers_4_1, "4.1" }, + + { Opt_vers_err, NULL } +}; static void nfs_umount_begin(struct super_block *); static int nfs_statfs(struct dentry *, struct kstatfs *); @@ -620,7 +638,6 @@ static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss, struct nfs_client *clp = nfss->nfs_client; seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr); - seq_printf(m, ",minorversion=%u", clp->cl_minorversion); } #else static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss, @@ -629,6 +646,15 @@ static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss, } #endif +static void nfs_show_nfs_version(struct seq_file *m, + unsigned int version, + unsigned int minorversion) +{ + seq_printf(m, ",vers=%u", version); + if (version == 4) + seq_printf(m, ".%u", minorversion); +} + /* * Describe the mount options in force on this server representation */ @@ -656,7 +682,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, u32 version = clp->rpc_ops->version; int local_flock, local_fcntl; - seq_printf(m, ",vers=%u", version); + nfs_show_nfs_version(m, version, clp->cl_minorversion); seq_printf(m, ",rsize=%u", nfss->rsize); seq_printf(m, ",wsize=%u", nfss->wsize); if (nfss->bsize != 0) @@ -676,8 +702,10 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, else seq_puts(m, nfs_infop->nostr); } + rcu_read_lock(); seq_printf(m, ",proto=%s", rpc_peeraddr2str(nfss->client, RPC_DISPLAY_NETID)); + rcu_read_unlock(); if (version == 4) { if (nfss->port != NFS_PORT) seq_printf(m, ",port=%u", nfss->port); @@ -726,9 +754,11 @@ static int nfs_show_options(struct seq_file *m, struct dentry *root) nfs_show_mount_options(m, nfss, 0); + rcu_read_lock(); seq_printf(m, ",addr=%s", rpc_peeraddr2str(nfss->nfs_client->cl_rpcclient, RPC_DISPLAY_ADDR)); + rcu_read_unlock(); return 0; } @@ -745,7 +775,6 @@ static void show_sessions(struct seq_file *m, struct nfs_server *server) {} #endif #endif -#ifdef CONFIG_NFS_V4 #ifdef CONFIG_NFS_V4_1 static void show_pnfs(struct seq_file *m, struct nfs_server *server) { @@ -755,9 +784,26 @@ static void show_pnfs(struct seq_file *m, struct nfs_server *server) else seq_printf(m, "not configured"); } + +static void show_implementation_id(struct seq_file *m, struct nfs_server *nfss) +{ + if (nfss->nfs_client && nfss->nfs_client->impl_id) { + struct nfs41_impl_id *impl_id = nfss->nfs_client->impl_id; + seq_printf(m, "\n\timpl_id:\tname='%s',domain='%s'," + "date='%llu,%u'", + impl_id->name, impl_id->domain, + impl_id->date.seconds, impl_id->date.nseconds); + } +} #else -static void show_pnfs(struct seq_file *m, struct nfs_server *server) {} +#ifdef CONFIG_NFS_V4 +static void show_pnfs(struct seq_file *m, struct nfs_server *server) +{ +} #endif +static void show_implementation_id(struct seq_file *m, struct nfs_server *nfss) +{ +} #endif static int nfs_show_devname(struct seq_file *m, struct dentry *root) @@ -806,6 +852,8 @@ static int nfs_show_stats(struct seq_file *m, struct dentry *root) seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ); + show_implementation_id(m, nfss); + seq_printf(m, "\n\tcaps:\t"); seq_printf(m, "caps=0x%x", nfss->caps); seq_printf(m, ",wtmult=%u", nfss->wtmult); @@ -908,6 +956,7 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int ve data->auth_flavor_len = 1; data->version = version; data->minorversion = 0; + data->net = current->nsproxy->net_ns; security_init_mnt_opts(&data->lsm_opts); } return data; @@ -1052,6 +1101,40 @@ static int nfs_parse_security_flavors(char *value, return 1; } +static int nfs_parse_version_string(char *string, + struct nfs_parsed_mount_data *mnt, + substring_t *args) +{ + mnt->flags &= ~NFS_MOUNT_VER3; + switch (match_token(string, nfs_vers_tokens, args)) { + case Opt_vers_2: + mnt->version = 2; + break; + case Opt_vers_3: + mnt->flags |= NFS_MOUNT_VER3; + mnt->version = 3; + break; + case Opt_vers_4: + /* Backward compatibility option. In future, + * the mount program should always supply + * a NFSv4 minor version number. + */ + mnt->version = 4; + break; + case Opt_vers_4_0: + mnt->version = 4; + mnt->minorversion = 0; + break; + case Opt_vers_4_1: + mnt->version = 4; + mnt->minorversion = 1; + break; + default: + return 0; + } + return 1; +} + static int nfs_get_option_str(substring_t args[], char **option) { kfree(*option); @@ -1157,18 +1240,6 @@ static int nfs_parse_mount_options(char *raw, mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL); break; - case Opt_v2: - mnt->flags &= ~NFS_MOUNT_VER3; - mnt->version = 2; - break; - case Opt_v3: - mnt->flags |= NFS_MOUNT_VER3; - mnt->version = 3; - break; - case Opt_v4: - mnt->flags &= ~NFS_MOUNT_VER3; - mnt->version = 4; - break; case Opt_udp: mnt->flags &= ~NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; @@ -1295,26 +1366,6 @@ static int nfs_parse_mount_options(char *raw, goto out_invalid_value; mnt->mount_server.version = option; break; - case Opt_nfsvers: - if (nfs_get_option_ul(args, &option)) - goto out_invalid_value; - switch (option) { - case NFS2_VERSION: - mnt->flags &= ~NFS_MOUNT_VER3; - mnt->version = 2; - break; - case NFS3_VERSION: - mnt->flags |= NFS_MOUNT_VER3; - mnt->version = 3; - break; - case NFS4_VERSION: - mnt->flags &= ~NFS_MOUNT_VER3; - mnt->version = 4; - break; - default: - goto out_invalid_value; - } - break; case Opt_minorversion: if (nfs_get_option_ul(args, &option)) goto out_invalid_value; @@ -1326,6 +1377,15 @@ static int nfs_parse_mount_options(char *raw, /* * options that take text values */ + case Opt_nfsvers: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + rc = nfs_parse_version_string(string, mnt, args); + kfree(string); + if (!rc) + goto out_invalid_value; + break; case Opt_sec: string = match_strdup(args); if (string == NULL) @@ -1405,7 +1465,7 @@ static int nfs_parse_mount_options(char *raw, if (string == NULL) goto out_nomem; mnt->nfs_server.addrlen = - rpc_pton(string, strlen(string), + rpc_pton(mnt->net, string, strlen(string), (struct sockaddr *) &mnt->nfs_server.address, sizeof(mnt->nfs_server.address)); @@ -1427,7 +1487,7 @@ static int nfs_parse_mount_options(char *raw, if (string == NULL) goto out_nomem; mnt->mount_server.addrlen = - rpc_pton(string, strlen(string), + rpc_pton(mnt->net, string, strlen(string), (struct sockaddr *) &mnt->mount_server.address, sizeof(mnt->mount_server.address)); @@ -1516,6 +1576,9 @@ static int nfs_parse_mount_options(char *raw, if (!sloppy && invalid_option) return 0; + if (mnt->minorversion && mnt->version != 4) + goto out_minorversion_mismatch; + /* * verify that any proto=/mountproto= options match the address * familiies in the addr=/mountaddr= options. @@ -1549,6 +1612,10 @@ out_invalid_address: out_invalid_value: printk(KERN_INFO "NFS: bad mount option value specified: %s\n", p); return 0; +out_minorversion_mismatch: + printk(KERN_INFO "NFS: mount option vers=%u does not support " + "minorversion=%u\n", mnt->version, mnt->minorversion); + return 0; out_nomem: printk(KERN_INFO "NFS: not enough memory to parse option\n"); return 0; @@ -1622,6 +1689,7 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, .noresvport = args->flags & NFS_MOUNT_NORESVPORT, .auth_flav_len = &server_authlist_len, .auth_flavs = server_authlist, + .net = args->net, }; int status; @@ -2047,7 +2115,7 @@ static inline void nfs_initialise_sb(struct super_block *sb) /* We probably want something more informative here */ snprintf(sb->s_id, sizeof(sb->s_id), - "%x:%x", MAJOR(sb->s_dev), MINOR(sb->s_dev)); + "%u:%u", MAJOR(sb->s_dev), MINOR(sb->s_dev)); if (sb->s_blocksize == 0) sb->s_blocksize = nfs_block_bits(server->wsize, @@ -2499,12 +2567,6 @@ static int nfs4_validate_text_mount_data(void *options, return -EINVAL; } - if (args->client_address == NULL) { - dfprintk(MOUNT, - "NFS4: mount program didn't pass callback address\n"); - return -EINVAL; - } - return nfs_parse_devname(dev_name, &args->nfs_server.hostname, NFS4_MAXNAMLEN, @@ -2663,8 +2725,7 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags, if (!s->s_root) { /* initial superblock/root creation */ nfs4_fill_super(s); - nfs_fscache_get_super_cookie( - s, data ? data->fscache_uniq : NULL, NULL); + nfs_fscache_get_super_cookie(s, data->fscache_uniq, NULL); } mntroot = nfs4_get_root(s, mntfh, dev_name); diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c index 978aaeb..ad4d2e7 100644 --- a/fs/nfs/sysctl.c +++ b/fs/nfs/sysctl.c @@ -32,7 +32,6 @@ static ctl_table nfs_cb_sysctls[] = { .extra1 = (int *)&nfs_set_port_min, .extra2 = (int *)&nfs_set_port_max, }, -#ifndef CONFIG_NFS_USE_NEW_IDMAPPER { .procname = "idmap_cache_timeout", .data = &nfs_idmap_cache_timeout, @@ -40,7 +39,6 @@ static ctl_table nfs_cb_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, -#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */ #endif { .procname = "nfs_mountpoint_timeout", diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 4f9319a..3210a03 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -20,15 +20,6 @@ #include "iostat.h" #include "delegation.h" -struct nfs_unlinkdata { - struct hlist_node list; - struct nfs_removeargs args; - struct nfs_removeres res; - struct inode *dir; - struct rpc_cred *cred; - struct nfs_fattr dir_attr; -}; - /** * nfs_free_unlinkdata - release data from a sillydelete operation. * @data: pointer to unlink structure. @@ -107,25 +98,16 @@ static void nfs_async_unlink_release(void *calldata) nfs_sb_deactive(sb); } -#if defined(CONFIG_NFS_V4_1) -void nfs_unlink_prepare(struct rpc_task *task, void *calldata) +static void nfs_unlink_prepare(struct rpc_task *task, void *calldata) { struct nfs_unlinkdata *data = calldata; - struct nfs_server *server = NFS_SERVER(data->dir); - - if (nfs4_setup_sequence(server, &data->args.seq_args, - &data->res.seq_res, 1, task)) - return; - rpc_call_start(task); + NFS_PROTO(data->dir)->unlink_rpc_prepare(task, data); } -#endif /* CONFIG_NFS_V4_1 */ static const struct rpc_call_ops nfs_unlink_ops = { .rpc_call_done = nfs_async_unlink_done, .rpc_release = nfs_async_unlink_release, -#if defined(CONFIG_NFS_V4_1) .rpc_call_prepare = nfs_unlink_prepare, -#endif /* CONFIG_NFS_V4_1 */ }; static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct nfs_unlinkdata *data) @@ -341,18 +323,6 @@ nfs_cancel_async_unlink(struct dentry *dentry) spin_unlock(&dentry->d_lock); } -struct nfs_renamedata { - struct nfs_renameargs args; - struct nfs_renameres res; - struct rpc_cred *cred; - struct inode *old_dir; - struct dentry *old_dentry; - struct nfs_fattr old_fattr; - struct inode *new_dir; - struct dentry *new_dentry; - struct nfs_fattr new_fattr; -}; - /** * nfs_async_rename_done - Sillyrename post-processing * @task: rpc_task of the sillyrename @@ -403,25 +373,16 @@ static void nfs_async_rename_release(void *calldata) kfree(data); } -#if defined(CONFIG_NFS_V4_1) static void nfs_rename_prepare(struct rpc_task *task, void *calldata) { struct nfs_renamedata *data = calldata; - struct nfs_server *server = NFS_SERVER(data->old_dir); - - if (nfs4_setup_sequence(server, &data->args.seq_args, - &data->res.seq_res, 1, task)) - return; - rpc_call_start(task); + NFS_PROTO(data->old_dir)->rename_rpc_prepare(task, data); } -#endif /* CONFIG_NFS_V4_1 */ static const struct rpc_call_ops nfs_rename_ops = { .rpc_call_done = nfs_async_rename_done, .rpc_release = nfs_async_rename_release, -#if defined(CONFIG_NFS_V4_1) .rpc_call_prepare = nfs_rename_prepare, -#endif /* CONFIG_NFS_V4_1 */ }; /** diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 834f0fe..2c68818 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -100,7 +100,6 @@ void nfs_writedata_free(struct nfs_write_data *p) void nfs_writedata_release(struct nfs_write_data *wdata) { - put_lseg(wdata->lseg); put_nfs_open_context(wdata->args.context); nfs_writedata_free(wdata); } @@ -236,10 +235,10 @@ static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblo req = nfs_page_find_request_locked(page); if (req == NULL) break; - if (nfs_set_page_tag_locked(req)) + if (nfs_lock_request_dontget(req)) break; /* Note: If we hold the page lock, as is the case in nfs_writepage, - * then the call to nfs_set_page_tag_locked() will always + * then the call to nfs_lock_request_dontget() will always * succeed provided that someone hasn't already marked the * request as dirty (in which case we don't care). */ @@ -375,21 +374,14 @@ out_err: /* * Insert a write request into an inode */ -static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) +static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) { struct nfs_inode *nfsi = NFS_I(inode); - int error; - - error = radix_tree_preload(GFP_NOFS); - if (error != 0) - goto out; /* Lock the request! */ nfs_lock_request_dontget(req); spin_lock(&inode->i_lock); - error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req); - BUG_ON(error); if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE)) inode->i_version++; set_bit(PG_MAPPED, &req->wb_flags); @@ -397,12 +389,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) set_page_private(req->wb_page, (unsigned long)req); nfsi->npages++; kref_get(&req->wb_kref); - radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, - NFS_PAGE_TAG_LOCKED); spin_unlock(&inode->i_lock); - radix_tree_preload_end(); -out: - return error; } /* @@ -419,7 +406,6 @@ static void nfs_inode_remove_request(struct nfs_page *req) set_page_private(req->wb_page, 0); ClearPagePrivate(req->wb_page); clear_bit(PG_MAPPED, &req->wb_flags); - radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index); nfsi->npages--; spin_unlock(&inode->i_lock); nfs_release_request(req); @@ -432,39 +418,90 @@ nfs_mark_request_dirty(struct nfs_page *req) } #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) -/* - * Add a request to the inode's commit list. +/** + * nfs_request_add_commit_list - add request to a commit list + * @req: pointer to a struct nfs_page + * @head: commit list head + * + * This sets the PG_CLEAN bit, updates the inode global count of + * number of outstanding requests requiring a commit as well as + * the MM page stats. + * + * The caller must _not_ hold the inode->i_lock, but must be + * holding the nfs_page lock. */ -static void -nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) +void +nfs_request_add_commit_list(struct nfs_page *req, struct list_head *head) { struct inode *inode = req->wb_context->dentry->d_inode; - struct nfs_inode *nfsi = NFS_I(inode); - spin_lock(&inode->i_lock); set_bit(PG_CLEAN, &(req)->wb_flags); - radix_tree_tag_set(&nfsi->nfs_page_tree, - req->wb_index, - NFS_PAGE_TAG_COMMIT); - nfsi->ncommit++; + spin_lock(&inode->i_lock); + nfs_list_add_request(req, head); + NFS_I(inode)->ncommit++; spin_unlock(&inode->i_lock); - pnfs_mark_request_commit(req, lseg); inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); } +EXPORT_SYMBOL_GPL(nfs_request_add_commit_list); -static int +/** + * nfs_request_remove_commit_list - Remove request from a commit list + * @req: pointer to a nfs_page + * + * This clears the PG_CLEAN bit, and updates the inode global count of + * number of outstanding requests requiring a commit + * It does not update the MM page stats. + * + * The caller _must_ hold the inode->i_lock and the nfs_page lock. + */ +void +nfs_request_remove_commit_list(struct nfs_page *req) +{ + struct inode *inode = req->wb_context->dentry->d_inode; + + if (!test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) + return; + nfs_list_remove_request(req); + NFS_I(inode)->ncommit--; +} +EXPORT_SYMBOL_GPL(nfs_request_remove_commit_list); + + +/* + * Add a request to the inode's commit list. + */ +static void +nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) +{ + struct inode *inode = req->wb_context->dentry->d_inode; + + if (pnfs_mark_request_commit(req, lseg)) + return; + nfs_request_add_commit_list(req, &NFS_I(inode)->commit_list); +} + +static void +nfs_clear_page_commit(struct page *page) +{ + dec_zone_page_state(page, NR_UNSTABLE_NFS); + dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE); +} + +static void nfs_clear_request_commit(struct nfs_page *req) { - struct page *page = req->wb_page; + if (test_bit(PG_CLEAN, &req->wb_flags)) { + struct inode *inode = req->wb_context->dentry->d_inode; - if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) { - dec_zone_page_state(page, NR_UNSTABLE_NFS); - dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE); - return 1; + if (!pnfs_clear_request_commit(req)) { + spin_lock(&inode->i_lock); + nfs_request_remove_commit_list(req); + spin_unlock(&inode->i_lock); + } + nfs_clear_page_commit(req->wb_page); } - return 0; } static inline @@ -491,15 +528,14 @@ int nfs_reschedule_unstable_write(struct nfs_page *req, return 0; } #else -static inline void +static void nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) { } -static inline int +static void nfs_clear_request_commit(struct nfs_page *req) { - return 0; } static inline @@ -520,46 +556,65 @@ int nfs_reschedule_unstable_write(struct nfs_page *req, static int nfs_need_commit(struct nfs_inode *nfsi) { - return radix_tree_tagged(&nfsi->nfs_page_tree, NFS_PAGE_TAG_COMMIT); + return nfsi->ncommit > 0; +} + +/* i_lock held by caller */ +static int +nfs_scan_commit_list(struct list_head *src, struct list_head *dst, int max, + spinlock_t *lock) +{ + struct nfs_page *req, *tmp; + int ret = 0; + + list_for_each_entry_safe(req, tmp, src, wb_list) { + if (!nfs_lock_request(req)) + continue; + if (cond_resched_lock(lock)) + list_safe_reset_next(req, tmp, wb_list); + nfs_request_remove_commit_list(req); + nfs_list_add_request(req, dst); + ret++; + if (ret == max) + break; + } + return ret; } /* * nfs_scan_commit - Scan an inode for commit requests * @inode: NFS inode to scan * @dst: destination list - * @idx_start: lower bound of page->index to scan. - * @npages: idx_start + npages sets the upper bound to scan. * * Moves requests from the inode's 'commit' request list. * The requests are *not* checked to ensure that they form a contiguous set. */ static int -nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) +nfs_scan_commit(struct inode *inode, struct list_head *dst) { struct nfs_inode *nfsi = NFS_I(inode); - int ret; - - if (!nfs_need_commit(nfsi)) - return 0; + int ret = 0; spin_lock(&inode->i_lock); - ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); - if (ret > 0) - nfsi->ncommit -= ret; - spin_unlock(&inode->i_lock); - - if (nfs_need_commit(NFS_I(inode))) - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + if (nfsi->ncommit > 0) { + const int max = INT_MAX; + ret = nfs_scan_commit_list(&nfsi->commit_list, dst, max, + &inode->i_lock); + ret += pnfs_scan_commit_lists(inode, max - ret, + &inode->i_lock); + } + spin_unlock(&inode->i_lock); return ret; } + #else static inline int nfs_need_commit(struct nfs_inode *nfsi) { return 0; } -static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) +static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst) { return 0; } @@ -604,7 +659,7 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, || end < req->wb_offset) goto out_flushme; - if (nfs_set_page_tag_locked(req)) + if (nfs_lock_request_dontget(req)) break; /* The request is locked, so wait and then retry */ @@ -616,13 +671,6 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, spin_lock(&inode->i_lock); } - if (nfs_clear_request_commit(req) && - radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, - req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL) { - NFS_I(inode)->ncommit--; - pnfs_clear_request_commit(req); - } - /* Okay, the request matches. Update the region */ if (offset < req->wb_offset) { req->wb_offset = offset; @@ -634,6 +682,7 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, req->wb_bytes = rqend - req->wb_offset; out_unlock: spin_unlock(&inode->i_lock); + nfs_clear_request_commit(req); return req; out_flushme: spin_unlock(&inode->i_lock); @@ -655,7 +704,6 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, { struct inode *inode = page->mapping->host; struct nfs_page *req; - int error; req = nfs_try_to_update_request(inode, page, offset, bytes); if (req != NULL) @@ -663,11 +711,7 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, req = nfs_create_request(ctx, inode, page, offset, bytes); if (IS_ERR(req)) goto out; - error = nfs_inode_add_request(inode, req); - if (error != 0) { - nfs_release_request(req); - req = ERR_PTR(error); - } + nfs_inode_add_request(inode, req); out: return req; } @@ -684,7 +728,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, nfs_grow_file(page, offset, count); nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); nfs_mark_request_dirty(req); - nfs_clear_page_tag_locked(req); + nfs_unlock_request(req); return 0; } @@ -777,7 +821,7 @@ static void nfs_writepage_release(struct nfs_page *req, if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req, data)) nfs_inode_remove_request(req); - nfs_clear_page_tag_locked(req); + nfs_unlock_request(req); nfs_end_page_writeback(page); } @@ -925,7 +969,7 @@ static void nfs_redirty_request(struct nfs_page *req) struct page *page = req->wb_page; nfs_mark_request_dirty(req); - nfs_clear_page_tag_locked(req); + nfs_unlock_request(req); nfs_end_page_writeback(page); } @@ -1128,23 +1172,14 @@ out: nfs_writedata_release(calldata); } -#if defined(CONFIG_NFS_V4_1) void nfs_write_prepare(struct rpc_task *task, void *calldata) { struct nfs_write_data *data = calldata; - - if (nfs4_setup_sequence(NFS_SERVER(data->inode), - &data->args.seq_args, - &data->res.seq_res, 1, task)) - return; - rpc_call_start(task); + NFS_PROTO(data->inode)->write_rpc_prepare(task, data); } -#endif /* CONFIG_NFS_V4_1 */ static const struct rpc_call_ops nfs_write_partial_ops = { -#if defined(CONFIG_NFS_V4_1) .rpc_call_prepare = nfs_write_prepare, -#endif /* CONFIG_NFS_V4_1 */ .rpc_call_done = nfs_writeback_done_partial, .rpc_release = nfs_writeback_release_partial, }; @@ -1199,16 +1234,14 @@ static void nfs_writeback_release_full(void *calldata) remove_request: nfs_inode_remove_request(req); next: - nfs_clear_page_tag_locked(req); + nfs_unlock_request(req); nfs_end_page_writeback(page); } nfs_writedata_release(calldata); } static const struct rpc_call_ops nfs_write_full_ops = { -#if defined(CONFIG_NFS_V4_1) .rpc_call_prepare = nfs_write_prepare, -#endif /* CONFIG_NFS_V4_1 */ .rpc_call_done = nfs_writeback_done_full, .rpc_release = nfs_writeback_release_full, }; @@ -1325,7 +1358,6 @@ void nfs_commitdata_release(void *data) { struct nfs_write_data *wdata = data; - put_lseg(wdata->lseg); put_nfs_open_context(wdata->args.context); nfs_commit_free(wdata); } @@ -1411,7 +1443,7 @@ void nfs_retry_commit(struct list_head *page_list, dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); dec_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); - nfs_clear_page_tag_locked(req); + nfs_unlock_request(req); } } EXPORT_SYMBOL_GPL(nfs_retry_commit); @@ -1460,7 +1492,7 @@ void nfs_commit_release_pages(struct nfs_write_data *data) while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); - nfs_clear_request_commit(req); + nfs_clear_page_commit(req->wb_page); dprintk("NFS: commit (%s/%lld %d@%lld)", req->wb_context->dentry->d_sb->s_id, @@ -1486,7 +1518,7 @@ void nfs_commit_release_pages(struct nfs_write_data *data) dprintk(" mismatch\n"); nfs_mark_request_dirty(req); next: - nfs_clear_page_tag_locked(req); + nfs_unlock_request(req); } } EXPORT_SYMBOL_GPL(nfs_commit_release_pages); @@ -1501,9 +1533,7 @@ static void nfs_commit_release(void *calldata) } static const struct rpc_call_ops nfs_commit_ops = { -#if defined(CONFIG_NFS_V4_1) .rpc_call_prepare = nfs_write_prepare, -#endif /* CONFIG_NFS_V4_1 */ .rpc_call_done = nfs_commit_done, .rpc_release = nfs_commit_release, }; @@ -1517,7 +1547,7 @@ int nfs_commit_inode(struct inode *inode, int how) res = nfs_commit_set_lock(NFS_I(inode), may_wait); if (res <= 0) goto out_mark_dirty; - res = nfs_scan_commit(inode, &head, 0, 0); + res = nfs_scan_commit(inode, &head); if (res) { int error; @@ -1635,6 +1665,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) if (req == NULL) break; if (nfs_lock_request_dontget(req)) { + nfs_clear_request_commit(req); nfs_inode_remove_request(req); /* * In case nfs_inode_remove_request has marked the diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 6f3ebb4..0e262f3 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -605,24 +605,24 @@ static struct rpc_version nfs_cb_version4 = { .procs = nfs4_cb_procedures }; -static struct rpc_version *nfs_cb_version[] = { +static const struct rpc_version *nfs_cb_version[] = { &nfs_cb_version4, }; -static struct rpc_program cb_program; +static const struct rpc_program cb_program; static struct rpc_stat cb_stats = { .program = &cb_program }; #define NFS4_CALLBACK 0x40000000 -static struct rpc_program cb_program = { +static const struct rpc_program cb_program = { .name = "nfs4_cb", .number = NFS4_CALLBACK, .nrvers = ARRAY_SIZE(nfs_cb_version), .version = nfs_cb_version, .stats = &cb_stats, - .pipe_dir_name = "/nfsd4_cb", + .pipe_dir_name = "nfsd4_cb", }; static int max_cb_time(void) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index e8c98f0..c5cddd6 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1308,7 +1308,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_r else goto out_err; - conn->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val, + conn->cb_addrlen = rpc_uaddr2sockaddr(&init_net, se->se_callback_addr_val, se->se_callback_addr_len, (struct sockaddr *)&conn->cb_addr, sizeof(conn->cb_addr)); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 748eda9..64c24af 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -223,7 +223,7 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size) if (qword_get(&buf, fo_path, size) < 0) return -EINVAL; - if (rpc_pton(fo_path, size, sap, salen) == 0) + if (rpc_pton(&init_net, fo_path, size, sap, salen) == 0) return -EINVAL; return nlmsvc_unlock_all_by_ip(sap); @@ -722,7 +722,7 @@ static ssize_t __write_ports_addxprt(char *buf) nfsd_serv->sv_nrthreads--; return 0; out_close: - xprt = svc_find_xprt(nfsd_serv, transport, PF_INET, port); + xprt = svc_find_xprt(nfsd_serv, transport, &init_net, PF_INET, port); if (xprt != NULL) { svc_close_xprt(xprt); svc_xprt_put(xprt); @@ -748,7 +748,7 @@ static ssize_t __write_ports_delxprt(char *buf) if (port < 1 || port > USHRT_MAX || nfsd_serv == NULL) return -EINVAL; - xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port); + xprt = svc_find_xprt(nfsd_serv, transport, &init_net, AF_UNSPEC, port); if (xprt == NULL) return -ENOTCONN; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index eda7d7e..fce472f 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -251,13 +251,13 @@ static void nfsd_shutdown(void) nfsd_up = false; } -static void nfsd_last_thread(struct svc_serv *serv) +static void nfsd_last_thread(struct svc_serv *serv, struct net *net) { /* When last nfsd thread exits we need to do some clean-up */ nfsd_serv = NULL; nfsd_shutdown(); - svc_rpcb_cleanup(serv); + svc_rpcb_cleanup(serv, net); printk(KERN_WARNING "nfsd: last server has exited, flushing export " "cache\n"); diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index a2e2402..6d4521f 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -25,6 +25,7 @@ #include <linux/module.h> #include <linux/sunrpc/stats.h> #include <linux/nfsd/stats.h> +#include <net/net_namespace.h> #include "nfsd.h" @@ -94,11 +95,11 @@ static const struct file_operations nfsd_proc_fops = { void nfsd_stat_init(void) { - svc_proc_register(&nfsd_svcstats, &nfsd_proc_fops); + svc_proc_register(&init_net, &nfsd_svcstats, &nfsd_proc_fops); } void nfsd_stat_shutdown(void) { - svc_proc_unregister("nfsd"); + svc_proc_unregister(&init_net, "nfsd"); } diff --git a/fs/notify/notification.c b/fs/notify/notification.c index ee18815..c887b13 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -447,7 +447,7 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, return event; } -__init int fsnotify_notification_init(void) +static __init int fsnotify_notification_init(void) { fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC); fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC); @@ -461,4 +461,3 @@ __init int fsnotify_notification_init(void) return 0; } subsys_initcall(fsnotify_notification_init); - @@ -13,6 +13,7 @@ #include <linux/fs.h> #include <linux/log2.h> #include <linux/mount.h> +#include <linux/magic.h> #include <linux/pipe_fs_i.h> #include <linux/uio.h> #include <linux/highmem.h> diff --git a/fs/posix_acl.c b/fs/posix_acl.c index cea4623..5e325a42 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -18,7 +18,7 @@ #include <linux/fs.h> #include <linux/sched.h> #include <linux/posix_acl.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/errno.h> diff --git a/fs/proc/array.c b/fs/proc/array.c index c602b8d..fbb53c2 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -462,59 +462,56 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, /* convert nsec -> ticks */ start_time = nsec_to_clock_t(start_time); - seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \ -%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ -%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld %lu %lu %lu\n", - pid_nr_ns(pid, ns), - tcomm, - state, - ppid, - pgid, - sid, - tty_nr, - tty_pgrp, - task->flags, - min_flt, - cmin_flt, - maj_flt, - cmaj_flt, - cputime_to_clock_t(utime), - cputime_to_clock_t(stime), - cputime_to_clock_t(cutime), - cputime_to_clock_t(cstime), - priority, - nice, - num_threads, - start_time, - vsize, - mm ? get_mm_rss(mm) : 0, - rsslim, - mm ? (permitted ? mm->start_code : 1) : 0, - mm ? (permitted ? mm->end_code : 1) : 0, - (permitted && mm) ? mm->start_stack : 0, - esp, - eip, - /* The signal information here is obsolete. - * It must be decimal for Linux 2.0 compatibility. - * Use /proc/#/status for real-time signals. - */ - task->pending.signal.sig[0] & 0x7fffffffUL, - task->blocked.sig[0] & 0x7fffffffUL, - sigign .sig[0] & 0x7fffffffUL, - sigcatch .sig[0] & 0x7fffffffUL, - wchan, - 0UL, - 0UL, - task->exit_signal, - task_cpu(task), - task->rt_priority, - task->policy, - (unsigned long long)delayacct_blkio_ticks(task), - cputime_to_clock_t(gtime), - cputime_to_clock_t(cgtime), - (mm && permitted) ? mm->start_data : 0, - (mm && permitted) ? mm->end_data : 0, - (mm && permitted) ? mm->start_brk : 0); + seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state); + seq_put_decimal_ll(m, ' ', ppid); + seq_put_decimal_ll(m, ' ', pgid); + seq_put_decimal_ll(m, ' ', sid); + seq_put_decimal_ll(m, ' ', tty_nr); + seq_put_decimal_ll(m, ' ', tty_pgrp); + seq_put_decimal_ull(m, ' ', task->flags); + seq_put_decimal_ull(m, ' ', min_flt); + seq_put_decimal_ull(m, ' ', cmin_flt); + seq_put_decimal_ull(m, ' ', maj_flt); + seq_put_decimal_ull(m, ' ', cmaj_flt); + seq_put_decimal_ull(m, ' ', cputime_to_clock_t(utime)); + seq_put_decimal_ull(m, ' ', cputime_to_clock_t(stime)); + seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cutime)); + seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cstime)); + seq_put_decimal_ll(m, ' ', priority); + seq_put_decimal_ll(m, ' ', nice); + seq_put_decimal_ll(m, ' ', num_threads); + seq_put_decimal_ull(m, ' ', 0); + seq_put_decimal_ull(m, ' ', start_time); + seq_put_decimal_ull(m, ' ', vsize); + seq_put_decimal_ll(m, ' ', mm ? get_mm_rss(mm) : 0); + seq_put_decimal_ull(m, ' ', rsslim); + seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->start_code : 1) : 0); + seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->end_code : 1) : 0); + seq_put_decimal_ull(m, ' ', (permitted && mm) ? mm->start_stack : 0); + seq_put_decimal_ull(m, ' ', esp); + seq_put_decimal_ull(m, ' ', eip); + /* The signal information here is obsolete. + * It must be decimal for Linux 2.0 compatibility. + * Use /proc/#/status for real-time signals. + */ + seq_put_decimal_ull(m, ' ', task->pending.signal.sig[0] & 0x7fffffffUL); + seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL); + seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL); + seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL); + seq_put_decimal_ull(m, ' ', wchan); + seq_put_decimal_ull(m, ' ', 0); + seq_put_decimal_ull(m, ' ', 0); + seq_put_decimal_ll(m, ' ', task->exit_signal); + seq_put_decimal_ll(m, ' ', task_cpu(task)); + seq_put_decimal_ull(m, ' ', task->rt_priority); + seq_put_decimal_ull(m, ' ', task->policy); + seq_put_decimal_ull(m, ' ', delayacct_blkio_ticks(task)); + seq_put_decimal_ull(m, ' ', cputime_to_clock_t(gtime)); + seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cgtime)); + seq_put_decimal_ull(m, ' ', (mm && permitted) ? mm->start_data : 0); + seq_put_decimal_ull(m, ' ', (mm && permitted) ? mm->end_data : 0); + seq_put_decimal_ull(m, ' ', (mm && permitted) ? mm->start_brk : 0); + seq_putc(m, '\n'); if (mm) mmput(mm); return 0; @@ -542,8 +539,20 @@ int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, size = task_statm(mm, &shared, &text, &data, &resident); mmput(mm); } - seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n", - size, resident, shared, text, data); + /* + * For quick read, open code by putting numbers directly + * expected format is + * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n", + * size, resident, shared, text, data); + */ + seq_put_decimal_ull(m, 0, size); + seq_put_decimal_ull(m, ' ', resident); + seq_put_decimal_ull(m, ' ', shared); + seq_put_decimal_ull(m, ' ', text); + seq_put_decimal_ull(m, ' ', 0); + seq_put_decimal_ull(m, ' ', text); + seq_put_decimal_ull(m, ' ', 0); + seq_putc(m, '\n'); return 0; } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index c44efe1..5f79bb8 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -10,12 +10,15 @@ */ #include <linux/proc_fs.h> +struct ctl_table_header; extern struct proc_dir_entry proc_root; #ifdef CONFIG_PROC_SYSCTL extern int proc_sys_init(void); +extern void sysctl_head_put(struct ctl_table_header *head); #else static inline void proc_sys_init(void) { } +static inline void sysctl_head_put(struct ctl_table_header *head) { } #endif #ifdef CONFIG_NET extern int proc_net_init(void); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index e5e69af..86c67ee 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -157,7 +157,8 @@ static int kcore_update_ram(void) #ifdef CONFIG_SPARSEMEM_VMEMMAP /* calculate vmemmap's address from given system ram pfn and register it */ -int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) +static int +get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) { unsigned long pfn = __pa(ent->addr) >> PAGE_SHIFT; unsigned long nr_pages = ent->size >> PAGE_SHIFT; @@ -189,7 +190,8 @@ int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) } #else -int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) +static int +get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) { return 1; } diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 27da860..3551f1f 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -53,7 +53,7 @@ static struct dentry *proc_ns_instantiate(struct inode *dir, ei->ns_ops = ns_ops; ei->ns = ns; - dentry->d_op = &pid_dentry_operations; + d_set_d_op(dentry, &pid_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ if (pid_revalidate(dentry, NULL)) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 67bbf6e..21d836f 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -9,6 +9,7 @@ #include <linux/sched.h> #include <linux/namei.h> #include <linux/mm.h> +#include <linux/module.h> #include "internal.h" static const struct dentry_operations proc_sys_dentry_operations; @@ -26,6 +27,371 @@ void proc_sys_poll_notify(struct ctl_table_poll *poll) wake_up_interruptible(&poll->wait); } +static struct ctl_table root_table[] = { + { + .procname = "", + .mode = S_IFDIR|S_IRUGO|S_IXUGO, + }, + { } +}; +static struct ctl_table_root sysctl_table_root = { + .default_set.dir.header = { + {{.count = 1, + .nreg = 1, + .ctl_table = root_table }}, + .ctl_table_arg = root_table, + .root = &sysctl_table_root, + .set = &sysctl_table_root.default_set, + }, +}; + +static DEFINE_SPINLOCK(sysctl_lock); + +static void drop_sysctl_table(struct ctl_table_header *header); +static int sysctl_follow_link(struct ctl_table_header **phead, + struct ctl_table **pentry, struct nsproxy *namespaces); +static int insert_links(struct ctl_table_header *head); +static void put_links(struct ctl_table_header *header); + +static void sysctl_print_dir(struct ctl_dir *dir) +{ + if (dir->header.parent) + sysctl_print_dir(dir->header.parent); + printk(KERN_CONT "%s/", dir->header.ctl_table[0].procname); +} + +static int namecmp(const char *name1, int len1, const char *name2, int len2) +{ + int minlen; + int cmp; + + minlen = len1; + if (minlen > len2) + minlen = len2; + + cmp = memcmp(name1, name2, minlen); + if (cmp == 0) + cmp = len1 - len2; + return cmp; +} + +/* Called under sysctl_lock */ +static struct ctl_table *find_entry(struct ctl_table_header **phead, + struct ctl_dir *dir, const char *name, int namelen) +{ + struct ctl_table_header *head; + struct ctl_table *entry; + struct rb_node *node = dir->root.rb_node; + + while (node) + { + struct ctl_node *ctl_node; + const char *procname; + int cmp; + + ctl_node = rb_entry(node, struct ctl_node, node); + head = ctl_node->header; + entry = &head->ctl_table[ctl_node - head->node]; + procname = entry->procname; + + cmp = namecmp(name, namelen, procname, strlen(procname)); + if (cmp < 0) + node = node->rb_left; + else if (cmp > 0) + node = node->rb_right; + else { + *phead = head; + return entry; + } + } + return NULL; +} + +static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry) +{ + struct rb_node *node = &head->node[entry - head->ctl_table].node; + struct rb_node **p = &head->parent->root.rb_node; + struct rb_node *parent = NULL; + const char *name = entry->procname; + int namelen = strlen(name); + + while (*p) { + struct ctl_table_header *parent_head; + struct ctl_table *parent_entry; + struct ctl_node *parent_node; + const char *parent_name; + int cmp; + + parent = *p; + parent_node = rb_entry(parent, struct ctl_node, node); + parent_head = parent_node->header; + parent_entry = &parent_head->ctl_table[parent_node - parent_head->node]; + parent_name = parent_entry->procname; + + cmp = namecmp(name, namelen, parent_name, strlen(parent_name)); + if (cmp < 0) + p = &(*p)->rb_left; + else if (cmp > 0) + p = &(*p)->rb_right; + else { + printk(KERN_ERR "sysctl duplicate entry: "); + sysctl_print_dir(head->parent); + printk(KERN_CONT "/%s\n", entry->procname); + return -EEXIST; + } + } + + rb_link_node(node, parent, p); + return 0; +} + +static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry) +{ + struct rb_node *node = &head->node[entry - head->ctl_table].node; + + rb_erase(node, &head->parent->root); +} + +static void init_header(struct ctl_table_header *head, + struct ctl_table_root *root, struct ctl_table_set *set, + struct ctl_node *node, struct ctl_table *table) +{ + head->ctl_table = table; + head->ctl_table_arg = table; + head->used = 0; + head->count = 1; + head->nreg = 1; + head->unregistering = NULL; + head->root = root; + head->set = set; + head->parent = NULL; + head->node = node; + if (node) { + struct ctl_table *entry; + for (entry = table; entry->procname; entry++, node++) { + rb_init_node(&node->node); + node->header = head; + } + } +} + +static void erase_header(struct ctl_table_header *head) +{ + struct ctl_table *entry; + for (entry = head->ctl_table; entry->procname; entry++) + erase_entry(head, entry); +} + +static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header) +{ + struct ctl_table *entry; + int err; + + dir->header.nreg++; + header->parent = dir; + err = insert_links(header); + if (err) + goto fail_links; + for (entry = header->ctl_table; entry->procname; entry++) { + err = insert_entry(header, entry); + if (err) + goto fail; + } + return 0; +fail: + erase_header(header); + put_links(header); +fail_links: + header->parent = NULL; + drop_sysctl_table(&dir->header); + return err; +} + +/* called under sysctl_lock */ +static int use_table(struct ctl_table_header *p) +{ + if (unlikely(p->unregistering)) + return 0; + p->used++; + return 1; +} + +/* called under sysctl_lock */ +static void unuse_table(struct ctl_table_header *p) +{ + if (!--p->used) + if (unlikely(p->unregistering)) + complete(p->unregistering); +} + +/* called under sysctl_lock, will reacquire if has to wait */ +static void start_unregistering(struct ctl_table_header *p) +{ + /* + * if p->used is 0, nobody will ever touch that entry again; + * we'll eliminate all paths to it before dropping sysctl_lock + */ + if (unlikely(p->used)) { + struct completion wait; + init_completion(&wait); + p->unregistering = &wait; + spin_unlock(&sysctl_lock); + wait_for_completion(&wait); + spin_lock(&sysctl_lock); + } else { + /* anything non-NULL; we'll never dereference it */ + p->unregistering = ERR_PTR(-EINVAL); + } + /* + * do not remove from the list until nobody holds it; walking the + * list in do_sysctl() relies on that. + */ + erase_header(p); +} + +static void sysctl_head_get(struct ctl_table_header *head) +{ + spin_lock(&sysctl_lock); + head->count++; + spin_unlock(&sysctl_lock); +} + +void sysctl_head_put(struct ctl_table_header *head) +{ + spin_lock(&sysctl_lock); + if (!--head->count) + kfree_rcu(head, rcu); + spin_unlock(&sysctl_lock); +} + +static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) +{ + if (!head) + BUG(); + spin_lock(&sysctl_lock); + if (!use_table(head)) + head = ERR_PTR(-ENOENT); + spin_unlock(&sysctl_lock); + return head; +} + +static void sysctl_head_finish(struct ctl_table_header *head) +{ + if (!head) + return; + spin_lock(&sysctl_lock); + unuse_table(head); + spin_unlock(&sysctl_lock); +} + +static struct ctl_table_set * +lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces) +{ + struct ctl_table_set *set = &root->default_set; + if (root->lookup) + set = root->lookup(root, namespaces); + return set; +} + +static struct ctl_table *lookup_entry(struct ctl_table_header **phead, + struct ctl_dir *dir, + const char *name, int namelen) +{ + struct ctl_table_header *head; + struct ctl_table *entry; + + spin_lock(&sysctl_lock); + entry = find_entry(&head, dir, name, namelen); + if (entry && use_table(head)) + *phead = head; + else + entry = NULL; + spin_unlock(&sysctl_lock); + return entry; +} + +static struct ctl_node *first_usable_entry(struct rb_node *node) +{ + struct ctl_node *ctl_node; + + for (;node; node = rb_next(node)) { + ctl_node = rb_entry(node, struct ctl_node, node); + if (use_table(ctl_node->header)) + return ctl_node; + } + return NULL; +} + +static void first_entry(struct ctl_dir *dir, + struct ctl_table_header **phead, struct ctl_table **pentry) +{ + struct ctl_table_header *head = NULL; + struct ctl_table *entry = NULL; + struct ctl_node *ctl_node; + + spin_lock(&sysctl_lock); + ctl_node = first_usable_entry(rb_first(&dir->root)); + spin_unlock(&sysctl_lock); + if (ctl_node) { + head = ctl_node->header; + entry = &head->ctl_table[ctl_node - head->node]; + } + *phead = head; + *pentry = entry; +} + +static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentry) +{ + struct ctl_table_header *head = *phead; + struct ctl_table *entry = *pentry; + struct ctl_node *ctl_node = &head->node[entry - head->ctl_table]; + + spin_lock(&sysctl_lock); + unuse_table(head); + + ctl_node = first_usable_entry(rb_next(&ctl_node->node)); + spin_unlock(&sysctl_lock); + head = NULL; + if (ctl_node) { + head = ctl_node->header; + entry = &head->ctl_table[ctl_node - head->node]; + } + *phead = head; + *pentry = entry; +} + +void register_sysctl_root(struct ctl_table_root *root) +{ +} + +/* + * sysctl_perm does NOT grant the superuser all rights automatically, because + * some sysctl variables are readonly even to root. + */ + +static int test_perm(int mode, int op) +{ + if (!current_euid()) + mode >>= 6; + else if (in_egroup_p(0)) + mode >>= 3; + if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0) + return 0; + return -EACCES; +} + +static int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) +{ + int mode; + + if (root->permissions) + mode = root->permissions(root, current->nsproxy, table); + else + mode = table->mode; + + return test_perm(mode, op); +} + static struct inode *proc_sys_make_inode(struct super_block *sb, struct ctl_table_header *head, struct ctl_table *table) { @@ -45,13 +411,12 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; inode->i_mode = table->mode; - if (!table->child) { + if (!S_ISDIR(table->mode)) { inode->i_mode |= S_IFREG; inode->i_op = &proc_sys_inode_operations; inode->i_fop = &proc_sys_file_operations; } else { inode->i_mode |= S_IFDIR; - clear_nlink(inode); inode->i_op = &proc_sys_dir_operations; inode->i_fop = &proc_sys_dir_file_operations; } @@ -59,70 +424,42 @@ out: return inode; } -static struct ctl_table *find_in_table(struct ctl_table *p, struct qstr *name) -{ - int len; - for ( ; p->procname; p++) { - - if (!p->procname) - continue; - - len = strlen(p->procname); - if (len != name->len) - continue; - - if (memcmp(p->procname, name->name, len) != 0) - continue; - - /* I have a match */ - return p; - } - return NULL; -} - static struct ctl_table_header *grab_header(struct inode *inode) { - if (PROC_I(inode)->sysctl) - return sysctl_head_grab(PROC_I(inode)->sysctl); - else - return sysctl_head_next(NULL); + struct ctl_table_header *head = PROC_I(inode)->sysctl; + if (!head) + head = &sysctl_table_root.default_set.dir.header; + return sysctl_head_grab(head); } static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct ctl_table_header *head = grab_header(dir); - struct ctl_table *table = PROC_I(dir)->sysctl_entry; struct ctl_table_header *h = NULL; struct qstr *name = &dentry->d_name; struct ctl_table *p; struct inode *inode; struct dentry *err = ERR_PTR(-ENOENT); + struct ctl_dir *ctl_dir; + int ret; if (IS_ERR(head)) return ERR_CAST(head); - if (table && !table->child) { - WARN_ON(1); - goto out; - } - - table = table ? table->child : head->ctl_table; - - p = find_in_table(table, name); - if (!p) { - for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) { - if (h->attached_to != table) - continue; - p = find_in_table(h->attached_by, name); - if (p) - break; - } - } + ctl_dir = container_of(head, struct ctl_dir, header); + p = lookup_entry(&h, ctl_dir, name->name, name->len); if (!p) goto out; + if (S_ISLNK(p->mode)) { + ret = sysctl_follow_link(&h, &p, current->nsproxy); + err = ERR_PTR(ret); + if (ret) + goto out; + } + err = ERR_PTR(-ENOMEM); inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); if (h) @@ -190,20 +527,32 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf, static int proc_sys_open(struct inode *inode, struct file *filp) { + struct ctl_table_header *head = grab_header(inode); struct ctl_table *table = PROC_I(inode)->sysctl_entry; + /* sysctl was unregistered */ + if (IS_ERR(head)) + return PTR_ERR(head); + if (table->poll) filp->private_data = proc_sys_poll_event(table->poll); + sysctl_head_finish(head); + return 0; } static unsigned int proc_sys_poll(struct file *filp, poll_table *wait) { struct inode *inode = filp->f_path.dentry->d_inode; + struct ctl_table_header *head = grab_header(inode); struct ctl_table *table = PROC_I(inode)->sysctl_entry; - unsigned long event = (unsigned long)filp->private_data; unsigned int ret = DEFAULT_POLLMASK; + unsigned long event; + + /* sysctl was unregistered */ + if (IS_ERR(head)) + return POLLERR | POLLHUP; if (!table->proc_handler) goto out; @@ -211,6 +560,7 @@ static unsigned int proc_sys_poll(struct file *filp, poll_table *wait) if (!table->poll) goto out; + event = (unsigned long)filp->private_data; poll_wait(filp, &table->poll->wait, wait); if (event != atomic_read(&table->poll->event)) { @@ -219,6 +569,8 @@ static unsigned int proc_sys_poll(struct file *filp, poll_table *wait) } out: + sysctl_head_finish(head); + return ret; } @@ -260,28 +612,45 @@ static int proc_sys_fill_cache(struct file *filp, void *dirent, return !!filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type); } +static int proc_sys_link_fill_cache(struct file *filp, void *dirent, + filldir_t filldir, + struct ctl_table_header *head, + struct ctl_table *table) +{ + int err, ret = 0; + head = sysctl_head_grab(head); + + if (S_ISLNK(table->mode)) { + /* It is not an error if we can not follow the link ignore it */ + err = sysctl_follow_link(&head, &table, current->nsproxy); + if (err) + goto out; + } + + ret = proc_sys_fill_cache(filp, dirent, filldir, head, table); +out: + sysctl_head_finish(head); + return ret; +} + static int scan(struct ctl_table_header *head, ctl_table *table, unsigned long *pos, struct file *file, void *dirent, filldir_t filldir) { + int res; - for (; table->procname; table++, (*pos)++) { - int res; - - /* Can't do anything without a proc name */ - if (!table->procname) - continue; - - if (*pos < file->f_pos) - continue; + if ((*pos)++ < file->f_pos) + return 0; + if (unlikely(S_ISLNK(table->mode))) + res = proc_sys_link_fill_cache(file, dirent, filldir, head, table); + else res = proc_sys_fill_cache(file, dirent, filldir, head, table); - if (res) - return res; - file->f_pos = *pos + 1; - } - return 0; + if (res == 0) + file->f_pos = *pos; + + return res; } static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir) @@ -289,20 +658,16 @@ static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir) struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; struct ctl_table_header *head = grab_header(inode); - struct ctl_table *table = PROC_I(inode)->sysctl_entry; struct ctl_table_header *h = NULL; + struct ctl_table *entry; + struct ctl_dir *ctl_dir; unsigned long pos; int ret = -EINVAL; if (IS_ERR(head)) return PTR_ERR(head); - if (table && !table->child) { - WARN_ON(1); - goto out; - } - - table = table ? table->child : head->ctl_table; + ctl_dir = container_of(head, struct ctl_dir, header); ret = 0; /* Avoid a switch here: arm builds fail with missing __cmpdi2 */ @@ -320,14 +685,8 @@ static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir) } pos = 2; - ret = scan(head, table, &pos, filp, dirent, filldir); - if (ret) - goto out; - - for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) { - if (h->attached_to != table) - continue; - ret = scan(h, h->attached_by, &pos, filp, dirent, filldir); + for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) { + ret = scan(h, entry, &pos, filp, dirent, filldir); if (ret) { sysctl_head_finish(h); break; @@ -447,6 +806,21 @@ static int proc_sys_delete(const struct dentry *dentry) return !!PROC_I(dentry->d_inode)->sysctl->unregistering; } +static int sysctl_is_seen(struct ctl_table_header *p) +{ + struct ctl_table_set *set = p->set; + int res; + spin_lock(&sysctl_lock); + if (p->unregistering) + res = 0; + else if (!set->is_seen) + res = 1; + else + res = set->is_seen(set); + spin_unlock(&sysctl_lock); + return res; +} + static int proc_sys_compare(const struct dentry *parent, const struct inode *pinode, const struct dentry *dentry, const struct inode *inode, @@ -472,6 +846,753 @@ static const struct dentry_operations proc_sys_dentry_operations = { .d_compare = proc_sys_compare, }; +static struct ctl_dir *find_subdir(struct ctl_dir *dir, + const char *name, int namelen) +{ + struct ctl_table_header *head; + struct ctl_table *entry; + + entry = find_entry(&head, dir, name, namelen); + if (!entry) + return ERR_PTR(-ENOENT); + if (!S_ISDIR(entry->mode)) + return ERR_PTR(-ENOTDIR); + return container_of(head, struct ctl_dir, header); +} + +static struct ctl_dir *new_dir(struct ctl_table_set *set, + const char *name, int namelen) +{ + struct ctl_table *table; + struct ctl_dir *new; + struct ctl_node *node; + char *new_name; + + new = kzalloc(sizeof(*new) + sizeof(struct ctl_node) + + sizeof(struct ctl_table)*2 + namelen + 1, + GFP_KERNEL); + if (!new) + return NULL; + + node = (struct ctl_node *)(new + 1); + table = (struct ctl_table *)(node + 1); + new_name = (char *)(table + 2); + memcpy(new_name, name, namelen); + new_name[namelen] = '\0'; + table[0].procname = new_name; + table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO; + init_header(&new->header, set->dir.header.root, set, node, table); + + return new; +} + +/** + * get_subdir - find or create a subdir with the specified name. + * @dir: Directory to create the subdirectory in + * @name: The name of the subdirectory to find or create + * @namelen: The length of name + * + * Takes a directory with an elevated reference count so we know that + * if we drop the lock the directory will not go away. Upon success + * the reference is moved from @dir to the returned subdirectory. + * Upon error an error code is returned and the reference on @dir is + * simply dropped. + */ +static struct ctl_dir *get_subdir(struct ctl_dir *dir, + const char *name, int namelen) +{ + struct ctl_table_set *set = dir->header.set; + struct ctl_dir *subdir, *new = NULL; + int err; + + spin_lock(&sysctl_lock); + subdir = find_subdir(dir, name, namelen); + if (!IS_ERR(subdir)) + goto found; + if (PTR_ERR(subdir) != -ENOENT) + goto failed; + + spin_unlock(&sysctl_lock); + new = new_dir(set, name, namelen); + spin_lock(&sysctl_lock); + subdir = ERR_PTR(-ENOMEM); + if (!new) + goto failed; + + /* Was the subdir added while we dropped the lock? */ + subdir = find_subdir(dir, name, namelen); + if (!IS_ERR(subdir)) + goto found; + if (PTR_ERR(subdir) != -ENOENT) + goto failed; + + /* Nope. Use the our freshly made directory entry. */ + err = insert_header(dir, &new->header); + subdir = ERR_PTR(err); + if (err) + goto failed; + subdir = new; +found: + subdir->header.nreg++; +failed: + if (unlikely(IS_ERR(subdir))) { + printk(KERN_ERR "sysctl could not get directory: "); + sysctl_print_dir(dir); + printk(KERN_CONT "/%*.*s %ld\n", + namelen, namelen, name, PTR_ERR(subdir)); + } + drop_sysctl_table(&dir->header); + if (new) + drop_sysctl_table(&new->header); + spin_unlock(&sysctl_lock); + return subdir; +} + +static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir) +{ + struct ctl_dir *parent; + const char *procname; + if (!dir->header.parent) + return &set->dir; + parent = xlate_dir(set, dir->header.parent); + if (IS_ERR(parent)) + return parent; + procname = dir->header.ctl_table[0].procname; + return find_subdir(parent, procname, strlen(procname)); +} + +static int sysctl_follow_link(struct ctl_table_header **phead, + struct ctl_table **pentry, struct nsproxy *namespaces) +{ + struct ctl_table_header *head; + struct ctl_table_root *root; + struct ctl_table_set *set; + struct ctl_table *entry; + struct ctl_dir *dir; + int ret; + + ret = 0; + spin_lock(&sysctl_lock); + root = (*pentry)->data; + set = lookup_header_set(root, namespaces); + dir = xlate_dir(set, (*phead)->parent); + if (IS_ERR(dir)) + ret = PTR_ERR(dir); + else { + const char *procname = (*pentry)->procname; + head = NULL; + entry = find_entry(&head, dir, procname, strlen(procname)); + ret = -ENOENT; + if (entry && use_table(head)) { + unuse_table(*phead); + *phead = head; + *pentry = entry; + ret = 0; + } + } + + spin_unlock(&sysctl_lock); + return ret; +} + +static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_ERR "sysctl table check failed: %s/%s %pV\n", + path, table->procname, &vaf); + + va_end(args); + return -EINVAL; +} + +static int sysctl_check_table(const char *path, struct ctl_table *table) +{ + int err = 0; + for (; table->procname; table++) { + if (table->child) + err = sysctl_err(path, table, "Not a file"); + + if ((table->proc_handler == proc_dostring) || + (table->proc_handler == proc_dointvec) || + (table->proc_handler == proc_dointvec_minmax) || + (table->proc_handler == proc_dointvec_jiffies) || + (table->proc_handler == proc_dointvec_userhz_jiffies) || + (table->proc_handler == proc_dointvec_ms_jiffies) || + (table->proc_handler == proc_doulongvec_minmax) || + (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { + if (!table->data) + err = sysctl_err(path, table, "No data"); + if (!table->maxlen) + err = sysctl_err(path, table, "No maxlen"); + } + if (!table->proc_handler) + err = sysctl_err(path, table, "No proc_handler"); + + if ((table->mode & (S_IRUGO|S_IWUGO)) != table->mode) + err = sysctl_err(path, table, "bogus .mode 0%o", + table->mode); + } + return err; +} + +static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table *table, + struct ctl_table_root *link_root) +{ + struct ctl_table *link_table, *entry, *link; + struct ctl_table_header *links; + struct ctl_node *node; + char *link_name; + int nr_entries, name_bytes; + + name_bytes = 0; + nr_entries = 0; + for (entry = table; entry->procname; entry++) { + nr_entries++; + name_bytes += strlen(entry->procname) + 1; + } + + links = kzalloc(sizeof(struct ctl_table_header) + + sizeof(struct ctl_node)*nr_entries + + sizeof(struct ctl_table)*(nr_entries + 1) + + name_bytes, + GFP_KERNEL); + + if (!links) + return NULL; + + node = (struct ctl_node *)(links + 1); + link_table = (struct ctl_table *)(node + nr_entries); + link_name = (char *)&link_table[nr_entries + 1]; + + for (link = link_table, entry = table; entry->procname; link++, entry++) { + int len = strlen(entry->procname) + 1; + memcpy(link_name, entry->procname, len); + link->procname = link_name; + link->mode = S_IFLNK|S_IRWXUGO; + link->data = link_root; + link_name += len; + } + init_header(links, dir->header.root, dir->header.set, node, link_table); + links->nreg = nr_entries; + + return links; +} + +static bool get_links(struct ctl_dir *dir, + struct ctl_table *table, struct ctl_table_root *link_root) +{ + struct ctl_table_header *head; + struct ctl_table *entry, *link; + + /* Are there links available for every entry in table? */ + for (entry = table; entry->procname; entry++) { + const char *procname = entry->procname; + link = find_entry(&head, dir, procname, strlen(procname)); + if (!link) + return false; + if (S_ISDIR(link->mode) && S_ISDIR(entry->mode)) + continue; + if (S_ISLNK(link->mode) && (link->data == link_root)) + continue; + return false; + } + + /* The checks passed. Increase the registration count on the links */ + for (entry = table; entry->procname; entry++) { + const char *procname = entry->procname; + link = find_entry(&head, dir, procname, strlen(procname)); + head->nreg++; + } + return true; +} + +static int insert_links(struct ctl_table_header *head) +{ + struct ctl_table_set *root_set = &sysctl_table_root.default_set; + struct ctl_dir *core_parent = NULL; + struct ctl_table_header *links; + int err; + + if (head->set == root_set) + return 0; + + core_parent = xlate_dir(root_set, head->parent); + if (IS_ERR(core_parent)) + return 0; + + if (get_links(core_parent, head->ctl_table, head->root)) + return 0; + + core_parent->header.nreg++; + spin_unlock(&sysctl_lock); + + links = new_links(core_parent, head->ctl_table, head->root); + + spin_lock(&sysctl_lock); + err = -ENOMEM; + if (!links) + goto out; + + err = 0; + if (get_links(core_parent, head->ctl_table, head->root)) { + kfree(links); + goto out; + } + + err = insert_header(core_parent, links); + if (err) + kfree(links); +out: + drop_sysctl_table(&core_parent->header); + return err; +} + +/** + * __register_sysctl_table - register a leaf sysctl table + * @set: Sysctl tree to register on + * @path: The path to the directory the sysctl table is in. + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * The members of the &struct ctl_table structure are used as follows: + * + * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not + * enter a sysctl file + * + * data - a pointer to data for use by proc_handler + * + * maxlen - the maximum size in bytes of the data + * + * mode - the file permissions for the /proc/sys file + * + * child - must be %NULL. + * + * proc_handler - the text handler routine (described below) + * + * extra1, extra2 - extra pointers usable by the proc handler routines + * + * Leaf nodes in the sysctl tree will be represented by a single file + * under /proc; non-leaf nodes will be represented by directories. + * + * There must be a proc_handler routine for any terminal nodes. + * Several default handlers are available to cover common cases - + * + * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(), + * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(), + * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax() + * + * It is the handler's job to read the input buffer from user memory + * and process it. The handler should return 0 on success. + * + * This routine returns %NULL on a failure to register, and a pointer + * to the table header on success. + */ +struct ctl_table_header *__register_sysctl_table( + struct ctl_table_set *set, + const char *path, struct ctl_table *table) +{ + struct ctl_table_root *root = set->dir.header.root; + struct ctl_table_header *header; + const char *name, *nextname; + struct ctl_dir *dir; + struct ctl_table *entry; + struct ctl_node *node; + int nr_entries = 0; + + for (entry = table; entry->procname; entry++) + nr_entries++; + + header = kzalloc(sizeof(struct ctl_table_header) + + sizeof(struct ctl_node)*nr_entries, GFP_KERNEL); + if (!header) + return NULL; + + node = (struct ctl_node *)(header + 1); + init_header(header, root, set, node, table); + if (sysctl_check_table(path, table)) + goto fail; + + spin_lock(&sysctl_lock); + dir = &set->dir; + /* Reference moved down the diretory tree get_subdir */ + dir->header.nreg++; + spin_unlock(&sysctl_lock); + + /* Find the directory for the ctl_table */ + for (name = path; name; name = nextname) { + int namelen; + nextname = strchr(name, '/'); + if (nextname) { + namelen = nextname - name; + nextname++; + } else { + namelen = strlen(name); + } + if (namelen == 0) + continue; + + dir = get_subdir(dir, name, namelen); + if (IS_ERR(dir)) + goto fail; + } + + spin_lock(&sysctl_lock); + if (insert_header(dir, header)) + goto fail_put_dir_locked; + + drop_sysctl_table(&dir->header); + spin_unlock(&sysctl_lock); + + return header; + +fail_put_dir_locked: + drop_sysctl_table(&dir->header); + spin_unlock(&sysctl_lock); +fail: + kfree(header); + dump_stack(); + return NULL; +} + +/** + * register_sysctl - register a sysctl table + * @path: The path to the directory the sysctl table is in. + * @table: the table structure + * + * Register a sysctl table. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * See __register_sysctl_table for more details. + */ +struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *table) +{ + return __register_sysctl_table(&sysctl_table_root.default_set, + path, table); +} +EXPORT_SYMBOL(register_sysctl); + +static char *append_path(const char *path, char *pos, const char *name) +{ + int namelen; + namelen = strlen(name); + if (((pos - path) + namelen + 2) >= PATH_MAX) + return NULL; + memcpy(pos, name, namelen); + pos[namelen] = '/'; + pos[namelen + 1] = '\0'; + pos += namelen + 1; + return pos; +} + +static int count_subheaders(struct ctl_table *table) +{ + int has_files = 0; + int nr_subheaders = 0; + struct ctl_table *entry; + + /* special case: no directory and empty directory */ + if (!table || !table->procname) + return 1; + + for (entry = table; entry->procname; entry++) { + if (entry->child) + nr_subheaders += count_subheaders(entry->child); + else + has_files = 1; + } + return nr_subheaders + has_files; +} + +static int register_leaf_sysctl_tables(const char *path, char *pos, + struct ctl_table_header ***subheader, struct ctl_table_set *set, + struct ctl_table *table) +{ + struct ctl_table *ctl_table_arg = NULL; + struct ctl_table *entry, *files; + int nr_files = 0; + int nr_dirs = 0; + int err = -ENOMEM; + + for (entry = table; entry->procname; entry++) { + if (entry->child) + nr_dirs++; + else + nr_files++; + } + + files = table; + /* If there are mixed files and directories we need a new table */ + if (nr_dirs && nr_files) { + struct ctl_table *new; + files = kzalloc(sizeof(struct ctl_table) * (nr_files + 1), + GFP_KERNEL); + if (!files) + goto out; + + ctl_table_arg = files; + for (new = files, entry = table; entry->procname; entry++) { + if (entry->child) + continue; + *new = *entry; + new++; + } + } + + /* Register everything except a directory full of subdirectories */ + if (nr_files || !nr_dirs) { + struct ctl_table_header *header; + header = __register_sysctl_table(set, path, files); + if (!header) { + kfree(ctl_table_arg); + goto out; + } + + /* Remember if we need to free the file table */ + header->ctl_table_arg = ctl_table_arg; + **subheader = header; + (*subheader)++; + } + + /* Recurse into the subdirectories. */ + for (entry = table; entry->procname; entry++) { + char *child_pos; + + if (!entry->child) + continue; + + err = -ENAMETOOLONG; + child_pos = append_path(path, pos, entry->procname); + if (!child_pos) + goto out; + + err = register_leaf_sysctl_tables(path, child_pos, subheader, + set, entry->child); + pos[0] = '\0'; + if (err) + goto out; + } + err = 0; +out: + /* On failure our caller will unregister all registered subheaders */ + return err; +} + +/** + * __register_sysctl_paths - register a sysctl table hierarchy + * @set: Sysctl tree to register on + * @path: The path to the directory the sysctl table is in. + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * See __register_sysctl_table for more details. + */ +struct ctl_table_header *__register_sysctl_paths( + struct ctl_table_set *set, + const struct ctl_path *path, struct ctl_table *table) +{ + struct ctl_table *ctl_table_arg = table; + int nr_subheaders = count_subheaders(table); + struct ctl_table_header *header = NULL, **subheaders, **subheader; + const struct ctl_path *component; + char *new_path, *pos; + + pos = new_path = kmalloc(PATH_MAX, GFP_KERNEL); + if (!new_path) + return NULL; + + pos[0] = '\0'; + for (component = path; component->procname; component++) { + pos = append_path(new_path, pos, component->procname); + if (!pos) + goto out; + } + while (table->procname && table->child && !table[1].procname) { + pos = append_path(new_path, pos, table->procname); + if (!pos) + goto out; + table = table->child; + } + if (nr_subheaders == 1) { + header = __register_sysctl_table(set, new_path, table); + if (header) + header->ctl_table_arg = ctl_table_arg; + } else { + header = kzalloc(sizeof(*header) + + sizeof(*subheaders)*nr_subheaders, GFP_KERNEL); + if (!header) + goto out; + + subheaders = (struct ctl_table_header **) (header + 1); + subheader = subheaders; + header->ctl_table_arg = ctl_table_arg; + + if (register_leaf_sysctl_tables(new_path, pos, &subheader, + set, table)) + goto err_register_leaves; + } + +out: + kfree(new_path); + return header; + +err_register_leaves: + while (subheader > subheaders) { + struct ctl_table_header *subh = *(--subheader); + struct ctl_table *table = subh->ctl_table_arg; + unregister_sysctl_table(subh); + kfree(table); + } + kfree(header); + header = NULL; + goto out; +} + +/** + * register_sysctl_table_path - register a sysctl table hierarchy + * @path: The path to the directory the sysctl table is in. + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * See __register_sysctl_paths for more details. + */ +struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, + struct ctl_table *table) +{ + return __register_sysctl_paths(&sysctl_table_root.default_set, + path, table); +} +EXPORT_SYMBOL(register_sysctl_paths); + +/** + * register_sysctl_table - register a sysctl table hierarchy + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * See register_sysctl_paths for more details. + */ +struct ctl_table_header *register_sysctl_table(struct ctl_table *table) +{ + static const struct ctl_path null_path[] = { {} }; + + return register_sysctl_paths(null_path, table); +} +EXPORT_SYMBOL(register_sysctl_table); + +static void put_links(struct ctl_table_header *header) +{ + struct ctl_table_set *root_set = &sysctl_table_root.default_set; + struct ctl_table_root *root = header->root; + struct ctl_dir *parent = header->parent; + struct ctl_dir *core_parent; + struct ctl_table *entry; + + if (header->set == root_set) + return; + + core_parent = xlate_dir(root_set, parent); + if (IS_ERR(core_parent)) + return; + + for (entry = header->ctl_table; entry->procname; entry++) { + struct ctl_table_header *link_head; + struct ctl_table *link; + const char *name = entry->procname; + + link = find_entry(&link_head, core_parent, name, strlen(name)); + if (link && + ((S_ISDIR(link->mode) && S_ISDIR(entry->mode)) || + (S_ISLNK(link->mode) && (link->data == root)))) { + drop_sysctl_table(link_head); + } + else { + printk(KERN_ERR "sysctl link missing during unregister: "); + sysctl_print_dir(parent); + printk(KERN_CONT "/%s\n", name); + } + } +} + +static void drop_sysctl_table(struct ctl_table_header *header) +{ + struct ctl_dir *parent = header->parent; + + if (--header->nreg) + return; + + put_links(header); + start_unregistering(header); + if (!--header->count) + kfree_rcu(header, rcu); + + if (parent) + drop_sysctl_table(&parent->header); +} + +/** + * unregister_sysctl_table - unregister a sysctl table hierarchy + * @header: the header returned from register_sysctl_table + * + * Unregisters the sysctl table and all children. proc entries may not + * actually be removed until they are no longer used by anyone. + */ +void unregister_sysctl_table(struct ctl_table_header * header) +{ + int nr_subheaders; + might_sleep(); + + if (header == NULL) + return; + + nr_subheaders = count_subheaders(header->ctl_table_arg); + if (unlikely(nr_subheaders > 1)) { + struct ctl_table_header **subheaders; + int i; + + subheaders = (struct ctl_table_header **)(header + 1); + for (i = nr_subheaders -1; i >= 0; i--) { + struct ctl_table_header *subh = subheaders[i]; + struct ctl_table *table = subh->ctl_table_arg; + unregister_sysctl_table(subh); + kfree(table); + } + kfree(header); + return; + } + + spin_lock(&sysctl_lock); + drop_sysctl_table(header); + spin_unlock(&sysctl_lock); +} +EXPORT_SYMBOL(unregister_sysctl_table); + +void setup_sysctl_set(struct ctl_table_set *set, + struct ctl_table_root *root, + int (*is_seen)(struct ctl_table_set *)) +{ + memset(set, 0, sizeof(*set)); + set->is_seen = is_seen; + init_header(&set->dir.header, root, set, NULL, root_table); +} + +void retire_sysctl_set(struct ctl_table_set *set) +{ + WARN_ON(!RB_EMPTY_ROOT(&set->dir.root)); +} + int __init proc_sys_init(void) { struct proc_dir_entry *proc_sys_root; @@ -480,5 +1601,6 @@ int __init proc_sys_init(void) proc_sys_root->proc_iops = &proc_sys_dir_operations; proc_sys_root->proc_fops = &proc_sys_dir_file_operations; proc_sys_root->nlink = 0; - return 0; + + return sysctl_init(); } diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 121f77c..6a0c62d 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -89,18 +89,19 @@ static int show_stat(struct seq_file *p, void *v) } sum += arch_irq_stat(); - seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu " - "%llu\n", - (unsigned long long)cputime64_to_clock_t(user), - (unsigned long long)cputime64_to_clock_t(nice), - (unsigned long long)cputime64_to_clock_t(system), - (unsigned long long)cputime64_to_clock_t(idle), - (unsigned long long)cputime64_to_clock_t(iowait), - (unsigned long long)cputime64_to_clock_t(irq), - (unsigned long long)cputime64_to_clock_t(softirq), - (unsigned long long)cputime64_to_clock_t(steal), - (unsigned long long)cputime64_to_clock_t(guest), - (unsigned long long)cputime64_to_clock_t(guest_nice)); + seq_puts(p, "cpu "); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice)); + seq_putc(p, '\n'); + for_each_online_cpu(i) { /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ user = kcpustat_cpu(i).cpustat[CPUTIME_USER]; @@ -113,26 +114,24 @@ static int show_stat(struct seq_file *p, void *v) steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; - seq_printf(p, - "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu " - "%llu\n", - i, - (unsigned long long)cputime64_to_clock_t(user), - (unsigned long long)cputime64_to_clock_t(nice), - (unsigned long long)cputime64_to_clock_t(system), - (unsigned long long)cputime64_to_clock_t(idle), - (unsigned long long)cputime64_to_clock_t(iowait), - (unsigned long long)cputime64_to_clock_t(irq), - (unsigned long long)cputime64_to_clock_t(softirq), - (unsigned long long)cputime64_to_clock_t(steal), - (unsigned long long)cputime64_to_clock_t(guest), - (unsigned long long)cputime64_to_clock_t(guest_nice)); + seq_printf(p, "cpu%d", i); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice)); + seq_putc(p, '\n'); } seq_printf(p, "intr %llu", (unsigned long long)sum); /* sum again ? it could be updated? */ for_each_irq_nr(j) - seq_printf(p, " %u", kstat_irqs(j)); + seq_put_decimal_ull(p, ' ', kstat_irqs(j)); seq_printf(p, "\nctxt %llu\n" @@ -149,7 +148,7 @@ static int show_stat(struct seq_file *p, void *v) seq_printf(p, "softirq %llu", (unsigned long long)sum_softirq); for (i = 0; i < NR_SOFTIRQS; i++) - seq_printf(p, " %u", per_softirq_sums[i]); + seq_put_decimal_ull(p, ' ', per_softirq_sums[i]); seq_putc(p, '\n'); return 0; @@ -157,11 +156,14 @@ static int show_stat(struct seq_file *p, void *v) static int stat_open(struct inode *inode, struct file *file) { - unsigned size = 4096 * (1 + num_possible_cpus() / 32); + unsigned size = 1024 + 128 * num_possible_cpus(); char *buf; struct seq_file *m; int res; + /* minimum size to display an interrupt count : 2 bytes */ + size += 2 * nr_irqs; + /* don't ask for more than the kmalloc() max size */ if (size > KMALLOC_MAX_SIZE) size = KMALLOC_MAX_SIZE; @@ -173,7 +175,7 @@ static int stat_open(struct inode *inode, struct file *file) if (!res) { m = file->private_data; m->buf = buf; - m->size = size; + m->size = ksize(buf); } else kfree(buf); return res; diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 9ec22d3..82c585f 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -68,9 +68,25 @@ void pstore_set_kmsg_bytes(int bytes) /* Tag each group of saved records with a sequence number */ static int oopscount; -static char *reason_str[] = { - "Oops", "Panic", "Kexec", "Restart", "Halt", "Poweroff", "Emergency" -}; +static const char *get_reason_str(enum kmsg_dump_reason reason) +{ + switch (reason) { + case KMSG_DUMP_PANIC: + return "Panic"; + case KMSG_DUMP_OOPS: + return "Oops"; + case KMSG_DUMP_EMERG: + return "Emergency"; + case KMSG_DUMP_RESTART: + return "Restart"; + case KMSG_DUMP_HALT: + return "Halt"; + case KMSG_DUMP_POWEROFF: + return "Poweroff"; + default: + return "Unknown"; + } +} /* * callback from kmsg_dump. (s2,l2) has the most recently @@ -85,17 +101,15 @@ static void pstore_dump(struct kmsg_dumper *dumper, unsigned long s1_start, s2_start; unsigned long l1_cpy, l2_cpy; unsigned long size, total = 0; - char *dst, *why; + char *dst; + const char *why; u64 id; int hsize, ret; unsigned int part = 1; unsigned long flags = 0; int is_locked = 0; - if (reason < ARRAY_SIZE(reason_str)) - why = reason_str[reason]; - else - why = "Unknown"; + why = get_reason_str(reason); if (in_nmi()) { is_locked = spin_trylock(&psinfo->buf_lock); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 8b4f12b..d69a1d1 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1110,6 +1110,13 @@ static void dquot_decr_space(struct dquot *dquot, qsize_t number) clear_bit(DQ_BLKS_B, &dquot->dq_flags); } +struct dquot_warn { + struct super_block *w_sb; + qid_t w_dq_id; + short w_dq_type; + short w_type; +}; + static int warning_issued(struct dquot *dquot, const int warntype) { int flag = (warntype == QUOTA_NL_BHARDWARN || @@ -1125,41 +1132,42 @@ static int warning_issued(struct dquot *dquot, const int warntype) #ifdef CONFIG_PRINT_QUOTA_WARNING static int flag_print_warnings = 1; -static int need_print_warning(struct dquot *dquot) +static int need_print_warning(struct dquot_warn *warn) { if (!flag_print_warnings) return 0; - switch (dquot->dq_type) { + switch (warn->w_dq_type) { case USRQUOTA: - return current_fsuid() == dquot->dq_id; + return current_fsuid() == warn->w_dq_id; case GRPQUOTA: - return in_group_p(dquot->dq_id); + return in_group_p(warn->w_dq_id); } return 0; } /* Print warning to user which exceeded quota */ -static void print_warning(struct dquot *dquot, const int warntype) +static void print_warning(struct dquot_warn *warn) { char *msg = NULL; struct tty_struct *tty; + int warntype = warn->w_type; if (warntype == QUOTA_NL_IHARDBELOW || warntype == QUOTA_NL_ISOFTBELOW || warntype == QUOTA_NL_BHARDBELOW || - warntype == QUOTA_NL_BSOFTBELOW || !need_print_warning(dquot)) + warntype == QUOTA_NL_BSOFTBELOW || !need_print_warning(warn)) return; tty = get_current_tty(); if (!tty) return; - tty_write_message(tty, dquot->dq_sb->s_id); + tty_write_message(tty, warn->w_sb->s_id); if (warntype == QUOTA_NL_ISOFTWARN || warntype == QUOTA_NL_BSOFTWARN) tty_write_message(tty, ": warning, "); else tty_write_message(tty, ": write failed, "); - tty_write_message(tty, quotatypes[dquot->dq_type]); + tty_write_message(tty, quotatypes[warn->w_dq_type]); switch (warntype) { case QUOTA_NL_IHARDWARN: msg = " file limit reached.\r\n"; @@ -1185,26 +1193,34 @@ static void print_warning(struct dquot *dquot, const int warntype) } #endif +static void prepare_warning(struct dquot_warn *warn, struct dquot *dquot, + int warntype) +{ + if (warning_issued(dquot, warntype)) + return; + warn->w_type = warntype; + warn->w_sb = dquot->dq_sb; + warn->w_dq_id = dquot->dq_id; + warn->w_dq_type = dquot->dq_type; +} + /* * Write warnings to the console and send warning messages over netlink. * - * Note that this function can sleep. + * Note that this function can call into tty and networking code. */ -static void flush_warnings(struct dquot *const *dquots, char *warntype) +static void flush_warnings(struct dquot_warn *warn) { - struct dquot *dq; int i; for (i = 0; i < MAXQUOTAS; i++) { - dq = dquots[i]; - if (dq && warntype[i] != QUOTA_NL_NOWARN && - !warning_issued(dq, warntype[i])) { + if (warn[i].w_type == QUOTA_NL_NOWARN) + continue; #ifdef CONFIG_PRINT_QUOTA_WARNING - print_warning(dq, warntype[i]); + print_warning(&warn[i]); #endif - quota_send_warning(dq->dq_type, dq->dq_id, - dq->dq_sb->s_dev, warntype[i]); - } + quota_send_warning(warn[i].w_dq_type, warn[i].w_dq_id, + warn[i].w_sb->s_dev, warn[i].w_type); } } @@ -1218,11 +1234,11 @@ static int ignore_hardlimit(struct dquot *dquot) } /* needs dq_data_lock */ -static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype) +static int check_idq(struct dquot *dquot, qsize_t inodes, + struct dquot_warn *warn) { qsize_t newinodes = dquot->dq_dqb.dqb_curinodes + inodes; - *warntype = QUOTA_NL_NOWARN; if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) || test_bit(DQ_FAKE_B, &dquot->dq_flags)) return 0; @@ -1230,7 +1246,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype) if (dquot->dq_dqb.dqb_ihardlimit && newinodes > dquot->dq_dqb.dqb_ihardlimit && !ignore_hardlimit(dquot)) { - *warntype = QUOTA_NL_IHARDWARN; + prepare_warning(warn, dquot, QUOTA_NL_IHARDWARN); return -EDQUOT; } @@ -1239,14 +1255,14 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype) dquot->dq_dqb.dqb_itime && get_seconds() >= dquot->dq_dqb.dqb_itime && !ignore_hardlimit(dquot)) { - *warntype = QUOTA_NL_ISOFTLONGWARN; + prepare_warning(warn, dquot, QUOTA_NL_ISOFTLONGWARN); return -EDQUOT; } if (dquot->dq_dqb.dqb_isoftlimit && newinodes > dquot->dq_dqb.dqb_isoftlimit && dquot->dq_dqb.dqb_itime == 0) { - *warntype = QUOTA_NL_ISOFTWARN; + prepare_warning(warn, dquot, QUOTA_NL_ISOFTWARN); dquot->dq_dqb.dqb_itime = get_seconds() + sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace; } @@ -1255,12 +1271,12 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype) } /* needs dq_data_lock */ -static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype) +static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, + struct dquot_warn *warn) { qsize_t tspace; struct super_block *sb = dquot->dq_sb; - *warntype = QUOTA_NL_NOWARN; if (!sb_has_quota_limits_enabled(sb, dquot->dq_type) || test_bit(DQ_FAKE_B, &dquot->dq_flags)) return 0; @@ -1272,7 +1288,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war tspace > dquot->dq_dqb.dqb_bhardlimit && !ignore_hardlimit(dquot)) { if (!prealloc) - *warntype = QUOTA_NL_BHARDWARN; + prepare_warning(warn, dquot, QUOTA_NL_BHARDWARN); return -EDQUOT; } @@ -1282,7 +1298,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war get_seconds() >= dquot->dq_dqb.dqb_btime && !ignore_hardlimit(dquot)) { if (!prealloc) - *warntype = QUOTA_NL_BSOFTLONGWARN; + prepare_warning(warn, dquot, QUOTA_NL_BSOFTLONGWARN); return -EDQUOT; } @@ -1290,7 +1306,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war tspace > dquot->dq_dqb.dqb_bsoftlimit && dquot->dq_dqb.dqb_btime == 0) { if (!prealloc) { - *warntype = QUOTA_NL_BSOFTWARN; + prepare_warning(warn, dquot, QUOTA_NL_BSOFTWARN); dquot->dq_dqb.dqb_btime = get_seconds() + sb_dqopt(sb)->info[dquot->dq_type].dqi_bgrace; } @@ -1543,10 +1559,9 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve) int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) { int cnt, ret = 0; - char warntype[MAXQUOTAS]; - int warn = flags & DQUOT_SPACE_WARN; + struct dquot_warn warn[MAXQUOTAS]; + struct dquot **dquots = inode->i_dquot; int reserve = flags & DQUOT_SPACE_RESERVE; - int nofail = flags & DQUOT_SPACE_NOFAIL; /* * First test before acquiring mutex - solves deadlocks when we @@ -1559,36 +1574,36 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); for (cnt = 0; cnt < MAXQUOTAS; cnt++) - warntype[cnt] = QUOTA_NL_NOWARN; + warn[cnt].w_type = QUOTA_NL_NOWARN; spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (!inode->i_dquot[cnt]) + if (!dquots[cnt]) continue; - ret = check_bdq(inode->i_dquot[cnt], number, !warn, - warntype+cnt); - if (ret && !nofail) { + ret = check_bdq(dquots[cnt], number, + !(flags & DQUOT_SPACE_WARN), &warn[cnt]); + if (ret && !(flags & DQUOT_SPACE_NOFAIL)) { spin_unlock(&dq_data_lock); goto out_flush_warn; } } for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (!inode->i_dquot[cnt]) + if (!dquots[cnt]) continue; if (reserve) - dquot_resv_space(inode->i_dquot[cnt], number); + dquot_resv_space(dquots[cnt], number); else - dquot_incr_space(inode->i_dquot[cnt], number); + dquot_incr_space(dquots[cnt], number); } inode_incr_space(inode, number, reserve); spin_unlock(&dq_data_lock); if (reserve) goto out_flush_warn; - mark_all_dquot_dirty(inode->i_dquot); + mark_all_dquot_dirty(dquots); out_flush_warn: - flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + flush_warnings(warn); out: return ret; } @@ -1600,36 +1615,37 @@ EXPORT_SYMBOL(__dquot_alloc_space); int dquot_alloc_inode(const struct inode *inode) { int cnt, ret = 0; - char warntype[MAXQUOTAS]; + struct dquot_warn warn[MAXQUOTAS]; + struct dquot * const *dquots = inode->i_dquot; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ if (!dquot_active(inode)) return 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) - warntype[cnt] = QUOTA_NL_NOWARN; + warn[cnt].w_type = QUOTA_NL_NOWARN; down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (!inode->i_dquot[cnt]) + if (!dquots[cnt]) continue; - ret = check_idq(inode->i_dquot[cnt], 1, warntype + cnt); + ret = check_idq(dquots[cnt], 1, &warn[cnt]); if (ret) goto warn_put_all; } for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (!inode->i_dquot[cnt]) + if (!dquots[cnt]) continue; - dquot_incr_inodes(inode->i_dquot[cnt], 1); + dquot_incr_inodes(dquots[cnt], 1); } warn_put_all: spin_unlock(&dq_data_lock); if (ret == 0) - mark_all_dquot_dirty(inode->i_dquot); - flush_warnings(inode->i_dquot, warntype); + mark_all_dquot_dirty(dquots); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + flush_warnings(warn); return ret; } EXPORT_SYMBOL(dquot_alloc_inode); @@ -1669,7 +1685,8 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty); void __dquot_free_space(struct inode *inode, qsize_t number, int flags) { unsigned int cnt; - char warntype[MAXQUOTAS]; + struct dquot_warn warn[MAXQUOTAS]; + struct dquot **dquots = inode->i_dquot; int reserve = flags & DQUOT_SPACE_RESERVE; /* First test before acquiring mutex - solves deadlocks when we @@ -1682,23 +1699,28 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags) down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (!inode->i_dquot[cnt]) + int wtype; + + warn[cnt].w_type = QUOTA_NL_NOWARN; + if (!dquots[cnt]) continue; - warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number); + wtype = info_bdq_free(dquots[cnt], number); + if (wtype != QUOTA_NL_NOWARN) + prepare_warning(&warn[cnt], dquots[cnt], wtype); if (reserve) - dquot_free_reserved_space(inode->i_dquot[cnt], number); + dquot_free_reserved_space(dquots[cnt], number); else - dquot_decr_space(inode->i_dquot[cnt], number); + dquot_decr_space(dquots[cnt], number); } inode_decr_space(inode, number, reserve); spin_unlock(&dq_data_lock); if (reserve) goto out_unlock; - mark_all_dquot_dirty(inode->i_dquot); + mark_all_dquot_dirty(dquots); out_unlock: - flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + flush_warnings(warn); } EXPORT_SYMBOL(__dquot_free_space); @@ -1708,7 +1730,8 @@ EXPORT_SYMBOL(__dquot_free_space); void dquot_free_inode(const struct inode *inode) { unsigned int cnt; - char warntype[MAXQUOTAS]; + struct dquot_warn warn[MAXQUOTAS]; + struct dquot * const *dquots = inode->i_dquot; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ @@ -1718,15 +1741,20 @@ void dquot_free_inode(const struct inode *inode) down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (!inode->i_dquot[cnt]) + int wtype; + + warn[cnt].w_type = QUOTA_NL_NOWARN; + if (!dquots[cnt]) continue; - warntype[cnt] = info_idq_free(inode->i_dquot[cnt], 1); - dquot_decr_inodes(inode->i_dquot[cnt], 1); + wtype = info_idq_free(dquots[cnt], 1); + if (wtype != QUOTA_NL_NOWARN) + prepare_warning(&warn[cnt], dquots[cnt], wtype); + dquot_decr_inodes(dquots[cnt], 1); } spin_unlock(&dq_data_lock); - mark_all_dquot_dirty(inode->i_dquot); - flush_warnings(inode->i_dquot, warntype); + mark_all_dquot_dirty(dquots); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + flush_warnings(warn); } EXPORT_SYMBOL(dquot_free_inode); @@ -1747,16 +1775,20 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) struct dquot *transfer_from[MAXQUOTAS] = {}; int cnt, ret = 0; char is_valid[MAXQUOTAS] = {}; - char warntype_to[MAXQUOTAS]; - char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; + struct dquot_warn warn_to[MAXQUOTAS]; + struct dquot_warn warn_from_inodes[MAXQUOTAS]; + struct dquot_warn warn_from_space[MAXQUOTAS]; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ if (IS_NOQUOTA(inode)) return 0; /* Initialize the arrays */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - warntype_to[cnt] = QUOTA_NL_NOWARN; + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + warn_to[cnt].w_type = QUOTA_NL_NOWARN; + warn_from_inodes[cnt].w_type = QUOTA_NL_NOWARN; + warn_from_space[cnt].w_type = QUOTA_NL_NOWARN; + } down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); @@ -1778,10 +1810,10 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) continue; is_valid[cnt] = 1; transfer_from[cnt] = inode->i_dquot[cnt]; - ret = check_idq(transfer_to[cnt], 1, warntype_to + cnt); + ret = check_idq(transfer_to[cnt], 1, &warn_to[cnt]); if (ret) goto over_quota; - ret = check_bdq(transfer_to[cnt], space, 0, warntype_to + cnt); + ret = check_bdq(transfer_to[cnt], space, 0, &warn_to[cnt]); if (ret) goto over_quota; } @@ -1794,10 +1826,15 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) continue; /* Due to IO error we might not have transfer_from[] structure */ if (transfer_from[cnt]) { - warntype_from_inodes[cnt] = - info_idq_free(transfer_from[cnt], 1); - warntype_from_space[cnt] = - info_bdq_free(transfer_from[cnt], space); + int wtype; + wtype = info_idq_free(transfer_from[cnt], 1); + if (wtype != QUOTA_NL_NOWARN) + prepare_warning(&warn_from_inodes[cnt], + transfer_from[cnt], wtype); + wtype = info_bdq_free(transfer_from[cnt], space); + if (wtype != QUOTA_NL_NOWARN) + prepare_warning(&warn_from_space[cnt], + transfer_from[cnt], wtype); dquot_decr_inodes(transfer_from[cnt], 1); dquot_decr_space(transfer_from[cnt], cur_space); dquot_free_reserved_space(transfer_from[cnt], @@ -1815,9 +1852,9 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) mark_all_dquot_dirty(transfer_from); mark_all_dquot_dirty(transfer_to); - flush_warnings(transfer_to, warntype_to); - flush_warnings(transfer_from, warntype_from_inodes); - flush_warnings(transfer_from, warntype_from_space); + flush_warnings(warn_to); + flush_warnings(warn_from_inodes); + flush_warnings(warn_from_space); /* Pass back references to put */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (is_valid[cnt]) @@ -1826,7 +1863,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) over_quota: spin_unlock(&dq_data_lock); up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); - flush_warnings(transfer_to, warntype_to); + flush_warnings(warn_to); return ret; } EXPORT_SYMBOL(__dquot_transfer); diff --git a/fs/quota/quota.c b/fs/quota/quota.c index fc2c438..9a39120 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -282,10 +282,9 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, case Q_XGETQUOTA: return quota_getxquota(sb, type, id, addr); case Q_XQUOTASYNC: - /* caller already holds s_umount */ if (sb->s_flags & MS_RDONLY) return -EROFS; - writeback_inodes_sb(sb, WB_REASON_SYNC); + /* XFS quotas are fully coherent now, making this call a noop */ return 0; default: return -EINVAL; diff --git a/fs/read_write.c b/fs/read_write.c index 5ad4248..ffc99d2 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -11,7 +11,7 @@ #include <linux/uio.h> #include <linux/fsnotify.h> #include <linux/security.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/syscalls.h> #include <linux/pagemap.h> #include <linux/splice.h> diff --git a/fs/readdir.c b/fs/readdir.c index 356f715..cc0a822 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -6,7 +6,7 @@ #include <linux/stddef.h> #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/time.h> #include <linux/mm.h> #include <linux/errno.h> diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 445d768..a59d271 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -7,6 +7,7 @@ #include <linux/slab.h> #include <linux/interrupt.h> #include <linux/sched.h> +#include <linux/bug.h> #include <linux/workqueue.h> #include <asm/unaligned.h> #include <linux/bitops.h> diff --git a/fs/select.c b/fs/select.c index e782258..6fb8943 100644 --- a/fs/select.c +++ b/fs/select.c @@ -17,7 +17,7 @@ #include <linux/kernel.h> #include <linux/sched.h> #include <linux/syscalls.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/personality.h> /* for STICKY_TIMEOUTS */ @@ -223,7 +223,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, get_file(filp); entry->filp = filp; entry->wait_address = wait_address; - entry->key = p->key; + entry->key = p->_key; init_waitqueue_func_entry(&entry->wait, pollwake); entry->wait.private = pwq; add_wait_queue(wait_address, &entry->wait); @@ -386,13 +386,11 @@ get_max: static inline void wait_key_set(poll_table *wait, unsigned long in, unsigned long out, unsigned long bit) { - if (wait) { - wait->key = POLLEX_SET; - if (in & bit) - wait->key |= POLLIN_SET; - if (out & bit) - wait->key |= POLLOUT_SET; - } + wait->_key = POLLEX_SET; + if (in & bit) + wait->_key |= POLLIN_SET; + if (out & bit) + wait->_key |= POLLOUT_SET; } int do_select(int n, fd_set_bits *fds, struct timespec *end_time) @@ -414,7 +412,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) poll_initwait(&table); wait = &table.pt; if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { - wait = NULL; + wait->_qproc = NULL; timed_out = 1; } @@ -459,17 +457,17 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) if ((mask & POLLIN_SET) && (in & bit)) { res_in |= bit; retval++; - wait = NULL; + wait->_qproc = NULL; } if ((mask & POLLOUT_SET) && (out & bit)) { res_out |= bit; retval++; - wait = NULL; + wait->_qproc = NULL; } if ((mask & POLLEX_SET) && (ex & bit)) { res_ex |= bit; retval++; - wait = NULL; + wait->_qproc = NULL; } } } @@ -481,7 +479,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) *rexp = res_ex; cond_resched(); } - wait = NULL; + wait->_qproc = NULL; if (retval || timed_out || signal_pending(current)) break; if (table.error) { @@ -720,7 +718,7 @@ struct poll_list { * interested in events matching the pollfd->events mask, and the result * matching that mask is both recorded in pollfd->revents and returned. The * pwait poll_table will be used by the fd-provided poll handler for waiting, - * if non-NULL. + * if pwait->_qproc is non-NULL. */ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait) { @@ -738,9 +736,7 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait) if (file != NULL) { mask = DEFAULT_POLLMASK; if (file->f_op && file->f_op->poll) { - if (pwait) - pwait->key = pollfd->events | - POLLERR | POLLHUP; + pwait->_key = pollfd->events|POLLERR|POLLHUP; mask = file->f_op->poll(file, pwait); } /* Mask out unneeded events. */ @@ -763,7 +759,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list, /* Optimise the no-wait case */ if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { - pt = NULL; + pt->_qproc = NULL; timed_out = 1; } @@ -781,22 +777,22 @@ static int do_poll(unsigned int nfds, struct poll_list *list, for (; pfd != pfd_end; pfd++) { /* * Fish for events. If we found one, record it - * and kill the poll_table, so we don't + * and kill poll_table->_qproc, so we don't * needlessly register any other waiters after * this. They'll get immediately deregistered * when we break out and return. */ if (do_pollfd(pfd, pt)) { count++; - pt = NULL; + pt->_qproc = NULL; } } } /* * All waiters have already been registered, so don't provide - * a poll_table to them on the next loop iteration. + * a poll_table->_qproc to them on the next loop iteration. */ - pt = NULL; + pt->_qproc = NULL; if (!count) { count = wait->error; if (signal_pending(current)) diff --git a/fs/seq_file.c b/fs/seq_file.c index aa242dc..0cbd049 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -6,13 +6,29 @@ */ #include <linux/fs.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <asm/uaccess.h> #include <asm/page.h> + +/* + * seq_files have a buffer which can may overflow. When this happens a larger + * buffer is reallocated and all the data will be printed again. + * The overflow state is true when m->count == m->size. + */ +static bool seq_overflow(struct seq_file *m) +{ + return m->count == m->size; +} + +static void seq_set_overflow(struct seq_file *m) +{ + m->count = m->size; +} + /** * seq_open - initialize sequential file * @file: file we initialize @@ -92,7 +108,7 @@ static int traverse(struct seq_file *m, loff_t offset) error = 0; m->count = 0; } - if (m->count == m->size) + if (seq_overflow(m)) goto Eoverflow; if (pos + m->count > offset) { m->from = offset - pos; @@ -234,7 +250,7 @@ Fill: break; } err = m->op->show(m, p); - if (m->count == m->size || err) { + if (seq_overflow(m) || err) { m->count = offs; if (likely(err <= 0)) break; @@ -361,7 +377,7 @@ int seq_escape(struct seq_file *m, const char *s, const char *esc) *p++ = '0' + (c & 07); continue; } - m->count = m->size; + seq_set_overflow(m); return -1; } m->count = p - m->buf; @@ -383,7 +399,7 @@ int seq_printf(struct seq_file *m, const char *f, ...) return 0; } } - m->count = m->size; + seq_set_overflow(m); return -1; } EXPORT_SYMBOL(seq_printf); @@ -512,7 +528,7 @@ int seq_bitmap(struct seq_file *m, const unsigned long *bits, return 0; } } - m->count = m->size; + seq_set_overflow(m); return -1; } EXPORT_SYMBOL(seq_bitmap); @@ -528,7 +544,7 @@ int seq_bitmap_list(struct seq_file *m, const unsigned long *bits, return 0; } } - m->count = m->size; + seq_set_overflow(m); return -1; } EXPORT_SYMBOL(seq_bitmap_list); @@ -639,11 +655,63 @@ int seq_puts(struct seq_file *m, const char *s) m->count += len; return 0; } - m->count = m->size; + seq_set_overflow(m); return -1; } EXPORT_SYMBOL(seq_puts); +/* + * A helper routine for putting decimal numbers without rich format of printf(). + * only 'unsigned long long' is supported. + * This routine will put one byte delimiter + number into seq_file. + * This routine is very quick when you show lots of numbers. + * In usual cases, it will be better to use seq_printf(). It's easier to read. + */ +int seq_put_decimal_ull(struct seq_file *m, char delimiter, + unsigned long long num) +{ + int len; + + if (m->count + 2 >= m->size) /* we'll write 2 bytes at least */ + goto overflow; + + if (delimiter) + m->buf[m->count++] = delimiter; + + if (num < 10) { + m->buf[m->count++] = num + '0'; + return 0; + } + + len = num_to_str(m->buf + m->count, m->size - m->count, num); + if (!len) + goto overflow; + m->count += len; + return 0; +overflow: + seq_set_overflow(m); + return -1; +} +EXPORT_SYMBOL(seq_put_decimal_ull); + +int seq_put_decimal_ll(struct seq_file *m, char delimiter, + long long num) +{ + if (num < 0) { + if (m->count + 3 >= m->size) { + seq_set_overflow(m); + return -1; + } + if (delimiter) + m->buf[m->count++] = delimiter; + num = -num; + delimiter = '-'; + } + return seq_put_decimal_ull(m, delimiter, num); + +} +EXPORT_SYMBOL(seq_put_decimal_ll); + /** * seq_write - write arbitrary data to buffer * @seq: seq_file identifying the buffer to which data should be written @@ -659,7 +727,7 @@ int seq_write(struct seq_file *seq, const void *data, size_t len) seq->count += len; return 0; } - seq->count = seq->size; + seq_set_overflow(seq); return -1; } EXPORT_SYMBOL(seq_write); diff --git a/fs/splice.c b/fs/splice.c index f16402e..5f883de 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -25,7 +25,7 @@ #include <linux/mm_inline.h> #include <linux/swap.h> #include <linux/writeback.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/syscalls.h> #include <linux/uio.h> #include <linux/security.h> @@ -1,4 +1,4 @@ -#include <linux/module.h> +#include <linux/export.h> #include <linux/fs.h> #include <linux/fs_stack.h> @@ -4,7 +4,7 @@ * Copyright (C) 1991, 1992 Linus Torvalds */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/mm.h> #include <linux/errno.h> #include <linux/file.h> diff --git a/fs/statfs.c b/fs/statfs.c index 2aa6a22..43e6b6f 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -1,5 +1,5 @@ #include <linux/syscalls.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mount.h> @@ -20,7 +20,7 @@ * Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000 */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/slab.h> #include <linux/acct.h> #include <linux/blkdev.h> @@ -6,7 +6,7 @@ #include <linux/file.h> #include <linux/fs.h> #include <linux/slab.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/namei.h> #include <linux/sched.h> #include <linux/writeback.h> diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index f922cba..1934084 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -36,7 +36,7 @@ #ifdef CONFIG_UBIFS_FS_DEBUG -DEFINE_SPINLOCK(dbg_lock); +static DEFINE_SPINLOCK(dbg_lock); static const char *get_key_fmt(int fmt) { @@ -221,15 +221,15 @@ const char *dbg_jhead(int jhead) static void dump_ch(const struct ubifs_ch *ch) { - printk(KERN_DEBUG "\tmagic %#x\n", le32_to_cpu(ch->magic)); - printk(KERN_DEBUG "\tcrc %#x\n", le32_to_cpu(ch->crc)); - printk(KERN_DEBUG "\tnode_type %d (%s)\n", ch->node_type, + printk(KERN_ERR "\tmagic %#x\n", le32_to_cpu(ch->magic)); + printk(KERN_ERR "\tcrc %#x\n", le32_to_cpu(ch->crc)); + printk(KERN_ERR "\tnode_type %d (%s)\n", ch->node_type, dbg_ntype(ch->node_type)); - printk(KERN_DEBUG "\tgroup_type %d (%s)\n", ch->group_type, + printk(KERN_ERR "\tgroup_type %d (%s)\n", ch->group_type, dbg_gtype(ch->group_type)); - printk(KERN_DEBUG "\tsqnum %llu\n", + printk(KERN_ERR "\tsqnum %llu\n", (unsigned long long)le64_to_cpu(ch->sqnum)); - printk(KERN_DEBUG "\tlen %u\n", le32_to_cpu(ch->len)); + printk(KERN_ERR "\tlen %u\n", le32_to_cpu(ch->len)); } void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode) @@ -240,43 +240,43 @@ void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode) struct ubifs_dent_node *dent, *pdent = NULL; int count = 2; - printk(KERN_DEBUG "Dump in-memory inode:"); - printk(KERN_DEBUG "\tinode %lu\n", inode->i_ino); - printk(KERN_DEBUG "\tsize %llu\n", + printk(KERN_ERR "Dump in-memory inode:"); + printk(KERN_ERR "\tinode %lu\n", inode->i_ino); + printk(KERN_ERR "\tsize %llu\n", (unsigned long long)i_size_read(inode)); - printk(KERN_DEBUG "\tnlink %u\n", inode->i_nlink); - printk(KERN_DEBUG "\tuid %u\n", (unsigned int)inode->i_uid); - printk(KERN_DEBUG "\tgid %u\n", (unsigned int)inode->i_gid); - printk(KERN_DEBUG "\tatime %u.%u\n", + printk(KERN_ERR "\tnlink %u\n", inode->i_nlink); + printk(KERN_ERR "\tuid %u\n", (unsigned int)inode->i_uid); + printk(KERN_ERR "\tgid %u\n", (unsigned int)inode->i_gid); + printk(KERN_ERR "\tatime %u.%u\n", (unsigned int)inode->i_atime.tv_sec, (unsigned int)inode->i_atime.tv_nsec); - printk(KERN_DEBUG "\tmtime %u.%u\n", + printk(KERN_ERR "\tmtime %u.%u\n", (unsigned int)inode->i_mtime.tv_sec, (unsigned int)inode->i_mtime.tv_nsec); - printk(KERN_DEBUG "\tctime %u.%u\n", + printk(KERN_ERR "\tctime %u.%u\n", (unsigned int)inode->i_ctime.tv_sec, (unsigned int)inode->i_ctime.tv_nsec); - printk(KERN_DEBUG "\tcreat_sqnum %llu\n", ui->creat_sqnum); - printk(KERN_DEBUG "\txattr_size %u\n", ui->xattr_size); - printk(KERN_DEBUG "\txattr_cnt %u\n", ui->xattr_cnt); - printk(KERN_DEBUG "\txattr_names %u\n", ui->xattr_names); - printk(KERN_DEBUG "\tdirty %u\n", ui->dirty); - printk(KERN_DEBUG "\txattr %u\n", ui->xattr); - printk(KERN_DEBUG "\tbulk_read %u\n", ui->xattr); - printk(KERN_DEBUG "\tsynced_i_size %llu\n", + printk(KERN_ERR "\tcreat_sqnum %llu\n", ui->creat_sqnum); + printk(KERN_ERR "\txattr_size %u\n", ui->xattr_size); + printk(KERN_ERR "\txattr_cnt %u\n", ui->xattr_cnt); + printk(KERN_ERR "\txattr_names %u\n", ui->xattr_names); + printk(KERN_ERR "\tdirty %u\n", ui->dirty); + printk(KERN_ERR "\txattr %u\n", ui->xattr); + printk(KERN_ERR "\tbulk_read %u\n", ui->xattr); + printk(KERN_ERR "\tsynced_i_size %llu\n", (unsigned long long)ui->synced_i_size); - printk(KERN_DEBUG "\tui_size %llu\n", + printk(KERN_ERR "\tui_size %llu\n", (unsigned long long)ui->ui_size); - printk(KERN_DEBUG "\tflags %d\n", ui->flags); - printk(KERN_DEBUG "\tcompr_type %d\n", ui->compr_type); - printk(KERN_DEBUG "\tlast_page_read %lu\n", ui->last_page_read); - printk(KERN_DEBUG "\tread_in_a_row %lu\n", ui->read_in_a_row); - printk(KERN_DEBUG "\tdata_len %d\n", ui->data_len); + printk(KERN_ERR "\tflags %d\n", ui->flags); + printk(KERN_ERR "\tcompr_type %d\n", ui->compr_type); + printk(KERN_ERR "\tlast_page_read %lu\n", ui->last_page_read); + printk(KERN_ERR "\tread_in_a_row %lu\n", ui->read_in_a_row); + printk(KERN_ERR "\tdata_len %d\n", ui->data_len); if (!S_ISDIR(inode->i_mode)) return; - printk(KERN_DEBUG "List of directory entries:\n"); + printk(KERN_ERR "List of directory entries:\n"); ubifs_assert(!mutex_is_locked(&c->tnc_mutex)); lowest_dent_key(c, &key, inode->i_ino); @@ -284,11 +284,11 @@ void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode) dent = ubifs_tnc_next_ent(c, &key, &nm); if (IS_ERR(dent)) { if (PTR_ERR(dent) != -ENOENT) - printk(KERN_DEBUG "error %ld\n", PTR_ERR(dent)); + printk(KERN_ERR "error %ld\n", PTR_ERR(dent)); break; } - printk(KERN_DEBUG "\t%d: %s (%s)\n", + printk(KERN_ERR "\t%d: %s (%s)\n", count++, dent->name, get_dent_type(dent->type)); nm.name = dent->name; @@ -312,8 +312,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) /* If the magic is incorrect, just hexdump the first bytes */ if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) { - printk(KERN_DEBUG "Not a node, first %zu bytes:", UBIFS_CH_SZ); - print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 1, + printk(KERN_ERR "Not a node, first %zu bytes:", UBIFS_CH_SZ); + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 32, 1, (void *)node, UBIFS_CH_SZ, 1); return; } @@ -326,7 +326,7 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) { const struct ubifs_pad_node *pad = node; - printk(KERN_DEBUG "\tpad_len %u\n", + printk(KERN_ERR "\tpad_len %u\n", le32_to_cpu(pad->pad_len)); break; } @@ -335,50 +335,50 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) const struct ubifs_sb_node *sup = node; unsigned int sup_flags = le32_to_cpu(sup->flags); - printk(KERN_DEBUG "\tkey_hash %d (%s)\n", + printk(KERN_ERR "\tkey_hash %d (%s)\n", (int)sup->key_hash, get_key_hash(sup->key_hash)); - printk(KERN_DEBUG "\tkey_fmt %d (%s)\n", + printk(KERN_ERR "\tkey_fmt %d (%s)\n", (int)sup->key_fmt, get_key_fmt(sup->key_fmt)); - printk(KERN_DEBUG "\tflags %#x\n", sup_flags); - printk(KERN_DEBUG "\t big_lpt %u\n", + printk(KERN_ERR "\tflags %#x\n", sup_flags); + printk(KERN_ERR "\t big_lpt %u\n", !!(sup_flags & UBIFS_FLG_BIGLPT)); - printk(KERN_DEBUG "\t space_fixup %u\n", + printk(KERN_ERR "\t space_fixup %u\n", !!(sup_flags & UBIFS_FLG_SPACE_FIXUP)); - printk(KERN_DEBUG "\tmin_io_size %u\n", + printk(KERN_ERR "\tmin_io_size %u\n", le32_to_cpu(sup->min_io_size)); - printk(KERN_DEBUG "\tleb_size %u\n", + printk(KERN_ERR "\tleb_size %u\n", le32_to_cpu(sup->leb_size)); - printk(KERN_DEBUG "\tleb_cnt %u\n", + printk(KERN_ERR "\tleb_cnt %u\n", le32_to_cpu(sup->leb_cnt)); - printk(KERN_DEBUG "\tmax_leb_cnt %u\n", + printk(KERN_ERR "\tmax_leb_cnt %u\n", le32_to_cpu(sup->max_leb_cnt)); - printk(KERN_DEBUG "\tmax_bud_bytes %llu\n", + printk(KERN_ERR "\tmax_bud_bytes %llu\n", (unsigned long long)le64_to_cpu(sup->max_bud_bytes)); - printk(KERN_DEBUG "\tlog_lebs %u\n", + printk(KERN_ERR "\tlog_lebs %u\n", le32_to_cpu(sup->log_lebs)); - printk(KERN_DEBUG "\tlpt_lebs %u\n", + printk(KERN_ERR "\tlpt_lebs %u\n", le32_to_cpu(sup->lpt_lebs)); - printk(KERN_DEBUG "\torph_lebs %u\n", + printk(KERN_ERR "\torph_lebs %u\n", le32_to_cpu(sup->orph_lebs)); - printk(KERN_DEBUG "\tjhead_cnt %u\n", + printk(KERN_ERR "\tjhead_cnt %u\n", le32_to_cpu(sup->jhead_cnt)); - printk(KERN_DEBUG "\tfanout %u\n", + printk(KERN_ERR "\tfanout %u\n", le32_to_cpu(sup->fanout)); - printk(KERN_DEBUG "\tlsave_cnt %u\n", + printk(KERN_ERR "\tlsave_cnt %u\n", le32_to_cpu(sup->lsave_cnt)); - printk(KERN_DEBUG "\tdefault_compr %u\n", + printk(KERN_ERR "\tdefault_compr %u\n", (int)le16_to_cpu(sup->default_compr)); - printk(KERN_DEBUG "\trp_size %llu\n", + printk(KERN_ERR "\trp_size %llu\n", (unsigned long long)le64_to_cpu(sup->rp_size)); - printk(KERN_DEBUG "\trp_uid %u\n", + printk(KERN_ERR "\trp_uid %u\n", le32_to_cpu(sup->rp_uid)); - printk(KERN_DEBUG "\trp_gid %u\n", + printk(KERN_ERR "\trp_gid %u\n", le32_to_cpu(sup->rp_gid)); - printk(KERN_DEBUG "\tfmt_version %u\n", + printk(KERN_ERR "\tfmt_version %u\n", le32_to_cpu(sup->fmt_version)); - printk(KERN_DEBUG "\ttime_gran %u\n", + printk(KERN_ERR "\ttime_gran %u\n", le32_to_cpu(sup->time_gran)); - printk(KERN_DEBUG "\tUUID %pUB\n", + printk(KERN_ERR "\tUUID %pUB\n", sup->uuid); break; } @@ -386,61 +386,61 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) { const struct ubifs_mst_node *mst = node; - printk(KERN_DEBUG "\thighest_inum %llu\n", + printk(KERN_ERR "\thighest_inum %llu\n", (unsigned long long)le64_to_cpu(mst->highest_inum)); - printk(KERN_DEBUG "\tcommit number %llu\n", + printk(KERN_ERR "\tcommit number %llu\n", (unsigned long long)le64_to_cpu(mst->cmt_no)); - printk(KERN_DEBUG "\tflags %#x\n", + printk(KERN_ERR "\tflags %#x\n", le32_to_cpu(mst->flags)); - printk(KERN_DEBUG "\tlog_lnum %u\n", + printk(KERN_ERR "\tlog_lnum %u\n", le32_to_cpu(mst->log_lnum)); - printk(KERN_DEBUG "\troot_lnum %u\n", + printk(KERN_ERR "\troot_lnum %u\n", le32_to_cpu(mst->root_lnum)); - printk(KERN_DEBUG "\troot_offs %u\n", + printk(KERN_ERR "\troot_offs %u\n", le32_to_cpu(mst->root_offs)); - printk(KERN_DEBUG "\troot_len %u\n", + printk(KERN_ERR "\troot_len %u\n", le32_to_cpu(mst->root_len)); - printk(KERN_DEBUG "\tgc_lnum %u\n", + printk(KERN_ERR "\tgc_lnum %u\n", le32_to_cpu(mst->gc_lnum)); - printk(KERN_DEBUG "\tihead_lnum %u\n", + printk(KERN_ERR "\tihead_lnum %u\n", le32_to_cpu(mst->ihead_lnum)); - printk(KERN_DEBUG "\tihead_offs %u\n", + printk(KERN_ERR "\tihead_offs %u\n", le32_to_cpu(mst->ihead_offs)); - printk(KERN_DEBUG "\tindex_size %llu\n", + printk(KERN_ERR "\tindex_size %llu\n", (unsigned long long)le64_to_cpu(mst->index_size)); - printk(KERN_DEBUG "\tlpt_lnum %u\n", + printk(KERN_ERR "\tlpt_lnum %u\n", le32_to_cpu(mst->lpt_lnum)); - printk(KERN_DEBUG "\tlpt_offs %u\n", + printk(KERN_ERR "\tlpt_offs %u\n", le32_to_cpu(mst->lpt_offs)); - printk(KERN_DEBUG "\tnhead_lnum %u\n", + printk(KERN_ERR "\tnhead_lnum %u\n", le32_to_cpu(mst->nhead_lnum)); - printk(KERN_DEBUG "\tnhead_offs %u\n", + printk(KERN_ERR "\tnhead_offs %u\n", le32_to_cpu(mst->nhead_offs)); - printk(KERN_DEBUG "\tltab_lnum %u\n", + printk(KERN_ERR "\tltab_lnum %u\n", le32_to_cpu(mst->ltab_lnum)); - printk(KERN_DEBUG "\tltab_offs %u\n", + printk(KERN_ERR "\tltab_offs %u\n", le32_to_cpu(mst->ltab_offs)); - printk(KERN_DEBUG "\tlsave_lnum %u\n", + printk(KERN_ERR "\tlsave_lnum %u\n", le32_to_cpu(mst->lsave_lnum)); - printk(KERN_DEBUG "\tlsave_offs %u\n", + printk(KERN_ERR "\tlsave_offs %u\n", le32_to_cpu(mst->lsave_offs)); - printk(KERN_DEBUG "\tlscan_lnum %u\n", + printk(KERN_ERR "\tlscan_lnum %u\n", le32_to_cpu(mst->lscan_lnum)); - printk(KERN_DEBUG "\tleb_cnt %u\n", + printk(KERN_ERR "\tleb_cnt %u\n", le32_to_cpu(mst->leb_cnt)); - printk(KERN_DEBUG "\tempty_lebs %u\n", + printk(KERN_ERR "\tempty_lebs %u\n", le32_to_cpu(mst->empty_lebs)); - printk(KERN_DEBUG "\tidx_lebs %u\n", + printk(KERN_ERR "\tidx_lebs %u\n", le32_to_cpu(mst->idx_lebs)); - printk(KERN_DEBUG "\ttotal_free %llu\n", + printk(KERN_ERR "\ttotal_free %llu\n", (unsigned long long)le64_to_cpu(mst->total_free)); - printk(KERN_DEBUG "\ttotal_dirty %llu\n", + printk(KERN_ERR "\ttotal_dirty %llu\n", (unsigned long long)le64_to_cpu(mst->total_dirty)); - printk(KERN_DEBUG "\ttotal_used %llu\n", + printk(KERN_ERR "\ttotal_used %llu\n", (unsigned long long)le64_to_cpu(mst->total_used)); - printk(KERN_DEBUG "\ttotal_dead %llu\n", + printk(KERN_ERR "\ttotal_dead %llu\n", (unsigned long long)le64_to_cpu(mst->total_dead)); - printk(KERN_DEBUG "\ttotal_dark %llu\n", + printk(KERN_ERR "\ttotal_dark %llu\n", (unsigned long long)le64_to_cpu(mst->total_dark)); break; } @@ -448,11 +448,11 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) { const struct ubifs_ref_node *ref = node; - printk(KERN_DEBUG "\tlnum %u\n", + printk(KERN_ERR "\tlnum %u\n", le32_to_cpu(ref->lnum)); - printk(KERN_DEBUG "\toffs %u\n", + printk(KERN_ERR "\toffs %u\n", le32_to_cpu(ref->offs)); - printk(KERN_DEBUG "\tjhead %u\n", + printk(KERN_ERR "\tjhead %u\n", le32_to_cpu(ref->jhead)); break; } @@ -461,40 +461,40 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) const struct ubifs_ino_node *ino = node; key_read(c, &ino->key, &key); - printk(KERN_DEBUG "\tkey %s\n", + printk(KERN_ERR "\tkey %s\n", dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); - printk(KERN_DEBUG "\tcreat_sqnum %llu\n", + printk(KERN_ERR "\tcreat_sqnum %llu\n", (unsigned long long)le64_to_cpu(ino->creat_sqnum)); - printk(KERN_DEBUG "\tsize %llu\n", + printk(KERN_ERR "\tsize %llu\n", (unsigned long long)le64_to_cpu(ino->size)); - printk(KERN_DEBUG "\tnlink %u\n", + printk(KERN_ERR "\tnlink %u\n", le32_to_cpu(ino->nlink)); - printk(KERN_DEBUG "\tatime %lld.%u\n", + printk(KERN_ERR "\tatime %lld.%u\n", (long long)le64_to_cpu(ino->atime_sec), le32_to_cpu(ino->atime_nsec)); - printk(KERN_DEBUG "\tmtime %lld.%u\n", + printk(KERN_ERR "\tmtime %lld.%u\n", (long long)le64_to_cpu(ino->mtime_sec), le32_to_cpu(ino->mtime_nsec)); - printk(KERN_DEBUG "\tctime %lld.%u\n", + printk(KERN_ERR "\tctime %lld.%u\n", (long long)le64_to_cpu(ino->ctime_sec), le32_to_cpu(ino->ctime_nsec)); - printk(KERN_DEBUG "\tuid %u\n", + printk(KERN_ERR "\tuid %u\n", le32_to_cpu(ino->uid)); - printk(KERN_DEBUG "\tgid %u\n", + printk(KERN_ERR "\tgid %u\n", le32_to_cpu(ino->gid)); - printk(KERN_DEBUG "\tmode %u\n", + printk(KERN_ERR "\tmode %u\n", le32_to_cpu(ino->mode)); - printk(KERN_DEBUG "\tflags %#x\n", + printk(KERN_ERR "\tflags %#x\n", le32_to_cpu(ino->flags)); - printk(KERN_DEBUG "\txattr_cnt %u\n", + printk(KERN_ERR "\txattr_cnt %u\n", le32_to_cpu(ino->xattr_cnt)); - printk(KERN_DEBUG "\txattr_size %u\n", + printk(KERN_ERR "\txattr_size %u\n", le32_to_cpu(ino->xattr_size)); - printk(KERN_DEBUG "\txattr_names %u\n", + printk(KERN_ERR "\txattr_names %u\n", le32_to_cpu(ino->xattr_names)); - printk(KERN_DEBUG "\tcompr_type %#x\n", + printk(KERN_ERR "\tcompr_type %#x\n", (int)le16_to_cpu(ino->compr_type)); - printk(KERN_DEBUG "\tdata len %u\n", + printk(KERN_ERR "\tdata len %u\n", le32_to_cpu(ino->data_len)); break; } @@ -505,16 +505,16 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) int nlen = le16_to_cpu(dent->nlen); key_read(c, &dent->key, &key); - printk(KERN_DEBUG "\tkey %s\n", + printk(KERN_ERR "\tkey %s\n", dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); - printk(KERN_DEBUG "\tinum %llu\n", + printk(KERN_ERR "\tinum %llu\n", (unsigned long long)le64_to_cpu(dent->inum)); - printk(KERN_DEBUG "\ttype %d\n", (int)dent->type); - printk(KERN_DEBUG "\tnlen %d\n", nlen); - printk(KERN_DEBUG "\tname "); + printk(KERN_ERR "\ttype %d\n", (int)dent->type); + printk(KERN_ERR "\tnlen %d\n", nlen); + printk(KERN_ERR "\tname "); if (nlen > UBIFS_MAX_NLEN) - printk(KERN_DEBUG "(bad name length, not printing, " + printk(KERN_ERR "(bad name length, not printing, " "bad or corrupted node)"); else { for (i = 0; i < nlen && dent->name[i]; i++) @@ -530,16 +530,16 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ; key_read(c, &dn->key, &key); - printk(KERN_DEBUG "\tkey %s\n", + printk(KERN_ERR "\tkey %s\n", dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); - printk(KERN_DEBUG "\tsize %u\n", + printk(KERN_ERR "\tsize %u\n", le32_to_cpu(dn->size)); - printk(KERN_DEBUG "\tcompr_typ %d\n", + printk(KERN_ERR "\tcompr_typ %d\n", (int)le16_to_cpu(dn->compr_type)); - printk(KERN_DEBUG "\tdata size %d\n", + printk(KERN_ERR "\tdata size %d\n", dlen); - printk(KERN_DEBUG "\tdata:\n"); - print_hex_dump(KERN_DEBUG, "\t", DUMP_PREFIX_OFFSET, 32, 1, + printk(KERN_ERR "\tdata:\n"); + print_hex_dump(KERN_ERR, "\t", DUMP_PREFIX_OFFSET, 32, 1, (void *)&dn->data, dlen, 0); break; } @@ -547,11 +547,11 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) { const struct ubifs_trun_node *trun = node; - printk(KERN_DEBUG "\tinum %u\n", + printk(KERN_ERR "\tinum %u\n", le32_to_cpu(trun->inum)); - printk(KERN_DEBUG "\told_size %llu\n", + printk(KERN_ERR "\told_size %llu\n", (unsigned long long)le64_to_cpu(trun->old_size)); - printk(KERN_DEBUG "\tnew_size %llu\n", + printk(KERN_ERR "\tnew_size %llu\n", (unsigned long long)le64_to_cpu(trun->new_size)); break; } @@ -560,17 +560,17 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) const struct ubifs_idx_node *idx = node; n = le16_to_cpu(idx->child_cnt); - printk(KERN_DEBUG "\tchild_cnt %d\n", n); - printk(KERN_DEBUG "\tlevel %d\n", + printk(KERN_ERR "\tchild_cnt %d\n", n); + printk(KERN_ERR "\tlevel %d\n", (int)le16_to_cpu(idx->level)); - printk(KERN_DEBUG "\tBranches:\n"); + printk(KERN_ERR "\tBranches:\n"); for (i = 0; i < n && i < c->fanout - 1; i++) { const struct ubifs_branch *br; br = ubifs_idx_branch(c, idx, i); key_read(c, &br->key, &key); - printk(KERN_DEBUG "\t%d: LEB %d:%d len %d key %s\n", + printk(KERN_ERR "\t%d: LEB %d:%d len %d key %s\n", i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs), le32_to_cpu(br->len), dbg_snprintf_key(c, &key, key_buf, @@ -584,20 +584,20 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) { const struct ubifs_orph_node *orph = node; - printk(KERN_DEBUG "\tcommit number %llu\n", + printk(KERN_ERR "\tcommit number %llu\n", (unsigned long long) le64_to_cpu(orph->cmt_no) & LLONG_MAX); - printk(KERN_DEBUG "\tlast node flag %llu\n", + printk(KERN_ERR "\tlast node flag %llu\n", (unsigned long long)(le64_to_cpu(orph->cmt_no)) >> 63); n = (le32_to_cpu(ch->len) - UBIFS_ORPH_NODE_SZ) >> 3; - printk(KERN_DEBUG "\t%d orphan inode numbers:\n", n); + printk(KERN_ERR "\t%d orphan inode numbers:\n", n); for (i = 0; i < n; i++) - printk(KERN_DEBUG "\t ino %llu\n", + printk(KERN_ERR "\t ino %llu\n", (unsigned long long)le64_to_cpu(orph->inos[i])); break; } default: - printk(KERN_DEBUG "node type %d was not recognized\n", + printk(KERN_ERR "node type %d was not recognized\n", (int)ch->node_type); } spin_unlock(&dbg_lock); @@ -606,16 +606,16 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) void dbg_dump_budget_req(const struct ubifs_budget_req *req) { spin_lock(&dbg_lock); - printk(KERN_DEBUG "Budgeting request: new_ino %d, dirtied_ino %d\n", + printk(KERN_ERR "Budgeting request: new_ino %d, dirtied_ino %d\n", req->new_ino, req->dirtied_ino); - printk(KERN_DEBUG "\tnew_ino_d %d, dirtied_ino_d %d\n", + printk(KERN_ERR "\tnew_ino_d %d, dirtied_ino_d %d\n", req->new_ino_d, req->dirtied_ino_d); - printk(KERN_DEBUG "\tnew_page %d, dirtied_page %d\n", + printk(KERN_ERR "\tnew_page %d, dirtied_page %d\n", req->new_page, req->dirtied_page); - printk(KERN_DEBUG "\tnew_dent %d, mod_dent %d\n", + printk(KERN_ERR "\tnew_dent %d, mod_dent %d\n", req->new_dent, req->mod_dent); - printk(KERN_DEBUG "\tidx_growth %d\n", req->idx_growth); - printk(KERN_DEBUG "\tdata_growth %d dd_growth %d\n", + printk(KERN_ERR "\tidx_growth %d\n", req->idx_growth); + printk(KERN_ERR "\tdata_growth %d dd_growth %d\n", req->data_growth, req->dd_growth); spin_unlock(&dbg_lock); } @@ -623,12 +623,12 @@ void dbg_dump_budget_req(const struct ubifs_budget_req *req) void dbg_dump_lstats(const struct ubifs_lp_stats *lst) { spin_lock(&dbg_lock); - printk(KERN_DEBUG "(pid %d) Lprops statistics: empty_lebs %d, " + printk(KERN_ERR "(pid %d) Lprops statistics: empty_lebs %d, " "idx_lebs %d\n", current->pid, lst->empty_lebs, lst->idx_lebs); - printk(KERN_DEBUG "\ttaken_empty_lebs %d, total_free %lld, " + printk(KERN_ERR "\ttaken_empty_lebs %d, total_free %lld, " "total_dirty %lld\n", lst->taken_empty_lebs, lst->total_free, lst->total_dirty); - printk(KERN_DEBUG "\ttotal_used %lld, total_dark %lld, " + printk(KERN_ERR "\ttotal_used %lld, total_dark %lld, " "total_dead %lld\n", lst->total_used, lst->total_dark, lst->total_dead); spin_unlock(&dbg_lock); @@ -644,21 +644,21 @@ void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi) spin_lock(&c->space_lock); spin_lock(&dbg_lock); - printk(KERN_DEBUG "(pid %d) Budgeting info: data budget sum %lld, " + printk(KERN_ERR "(pid %d) Budgeting info: data budget sum %lld, " "total budget sum %lld\n", current->pid, bi->data_growth + bi->dd_growth, bi->data_growth + bi->dd_growth + bi->idx_growth); - printk(KERN_DEBUG "\tbudg_data_growth %lld, budg_dd_growth %lld, " + printk(KERN_ERR "\tbudg_data_growth %lld, budg_dd_growth %lld, " "budg_idx_growth %lld\n", bi->data_growth, bi->dd_growth, bi->idx_growth); - printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %llu, " + printk(KERN_ERR "\tmin_idx_lebs %d, old_idx_sz %llu, " "uncommitted_idx %lld\n", bi->min_idx_lebs, bi->old_idx_sz, bi->uncommitted_idx); - printk(KERN_DEBUG "\tpage_budget %d, inode_budget %d, dent_budget %d\n", + printk(KERN_ERR "\tpage_budget %d, inode_budget %d, dent_budget %d\n", bi->page_budget, bi->inode_budget, bi->dent_budget); - printk(KERN_DEBUG "\tnospace %u, nospace_rp %u\n", + printk(KERN_ERR "\tnospace %u, nospace_rp %u\n", bi->nospace, bi->nospace_rp); - printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n", + printk(KERN_ERR "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n", c->dark_wm, c->dead_wm, c->max_idx_node_sz); if (bi != &c->bi) @@ -669,38 +669,38 @@ void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi) */ goto out_unlock; - printk(KERN_DEBUG "\tfreeable_cnt %d, calc_idx_sz %lld, idx_gc_cnt %d\n", + printk(KERN_ERR "\tfreeable_cnt %d, calc_idx_sz %lld, idx_gc_cnt %d\n", c->freeable_cnt, c->calc_idx_sz, c->idx_gc_cnt); - printk(KERN_DEBUG "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, " + printk(KERN_ERR "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, " "clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt), atomic_long_read(&c->dirty_zn_cnt), atomic_long_read(&c->clean_zn_cnt)); - printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n", + printk(KERN_ERR "\tgc_lnum %d, ihead_lnum %d\n", c->gc_lnum, c->ihead_lnum); /* If we are in R/O mode, journal heads do not exist */ if (c->jheads) for (i = 0; i < c->jhead_cnt; i++) - printk(KERN_DEBUG "\tjhead %s\t LEB %d\n", + printk(KERN_ERR "\tjhead %s\t LEB %d\n", dbg_jhead(c->jheads[i].wbuf.jhead), c->jheads[i].wbuf.lnum); for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) { bud = rb_entry(rb, struct ubifs_bud, rb); - printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum); + printk(KERN_ERR "\tbud LEB %d\n", bud->lnum); } list_for_each_entry(bud, &c->old_buds, list) - printk(KERN_DEBUG "\told bud LEB %d\n", bud->lnum); + printk(KERN_ERR "\told bud LEB %d\n", bud->lnum); list_for_each_entry(idx_gc, &c->idx_gc, list) - printk(KERN_DEBUG "\tGC'ed idx LEB %d unmap %d\n", + printk(KERN_ERR "\tGC'ed idx LEB %d unmap %d\n", idx_gc->lnum, idx_gc->unmap); - printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state); + printk(KERN_ERR "\tcommit state %d\n", c->cmt_state); /* Print budgeting predictions */ available = ubifs_calc_available(c, c->bi.min_idx_lebs); outstanding = c->bi.data_growth + c->bi.dd_growth; free = ubifs_get_free_space_nolock(c); - printk(KERN_DEBUG "Budgeting predictions:\n"); - printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n", + printk(KERN_ERR "Budgeting predictions:\n"); + printk(KERN_ERR "\tavailable: %lld, outstanding %lld, free %lld\n", available, outstanding, free); out_unlock: spin_unlock(&dbg_lock); @@ -720,11 +720,11 @@ void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp) dark = ubifs_calc_dark(c, spc); if (lp->flags & LPROPS_INDEX) - printk(KERN_DEBUG "LEB %-7d free %-8d dirty %-8d used %-8d " + printk(KERN_ERR "LEB %-7d free %-8d dirty %-8d used %-8d " "free + dirty %-8d flags %#x (", lp->lnum, lp->free, lp->dirty, c->leb_size - spc, spc, lp->flags); else - printk(KERN_DEBUG "LEB %-7d free %-8d dirty %-8d used %-8d " + printk(KERN_ERR "LEB %-7d free %-8d dirty %-8d used %-8d " "free + dirty %-8d dark %-4d dead %-4d nodes fit %-3d " "flags %#-4x (", lp->lnum, lp->free, lp->dirty, c->leb_size - spc, spc, dark, dead, @@ -807,7 +807,7 @@ void dbg_dump_lprops(struct ubifs_info *c) struct ubifs_lprops lp; struct ubifs_lp_stats lst; - printk(KERN_DEBUG "(pid %d) start dumping LEB properties\n", + printk(KERN_ERR "(pid %d) start dumping LEB properties\n", current->pid); ubifs_get_lp_stats(c, &lst); dbg_dump_lstats(&lst); @@ -819,7 +819,7 @@ void dbg_dump_lprops(struct ubifs_info *c) dbg_dump_lprop(c, &lp); } - printk(KERN_DEBUG "(pid %d) finish dumping LEB properties\n", + printk(KERN_ERR "(pid %d) finish dumping LEB properties\n", current->pid); } @@ -828,35 +828,35 @@ void dbg_dump_lpt_info(struct ubifs_info *c) int i; spin_lock(&dbg_lock); - printk(KERN_DEBUG "(pid %d) dumping LPT information\n", current->pid); - printk(KERN_DEBUG "\tlpt_sz: %lld\n", c->lpt_sz); - printk(KERN_DEBUG "\tpnode_sz: %d\n", c->pnode_sz); - printk(KERN_DEBUG "\tnnode_sz: %d\n", c->nnode_sz); - printk(KERN_DEBUG "\tltab_sz: %d\n", c->ltab_sz); - printk(KERN_DEBUG "\tlsave_sz: %d\n", c->lsave_sz); - printk(KERN_DEBUG "\tbig_lpt: %d\n", c->big_lpt); - printk(KERN_DEBUG "\tlpt_hght: %d\n", c->lpt_hght); - printk(KERN_DEBUG "\tpnode_cnt: %d\n", c->pnode_cnt); - printk(KERN_DEBUG "\tnnode_cnt: %d\n", c->nnode_cnt); - printk(KERN_DEBUG "\tdirty_pn_cnt: %d\n", c->dirty_pn_cnt); - printk(KERN_DEBUG "\tdirty_nn_cnt: %d\n", c->dirty_nn_cnt); - printk(KERN_DEBUG "\tlsave_cnt: %d\n", c->lsave_cnt); - printk(KERN_DEBUG "\tspace_bits: %d\n", c->space_bits); - printk(KERN_DEBUG "\tlpt_lnum_bits: %d\n", c->lpt_lnum_bits); - printk(KERN_DEBUG "\tlpt_offs_bits: %d\n", c->lpt_offs_bits); - printk(KERN_DEBUG "\tlpt_spc_bits: %d\n", c->lpt_spc_bits); - printk(KERN_DEBUG "\tpcnt_bits: %d\n", c->pcnt_bits); - printk(KERN_DEBUG "\tlnum_bits: %d\n", c->lnum_bits); - printk(KERN_DEBUG "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs); - printk(KERN_DEBUG "\tLPT head is at %d:%d\n", + printk(KERN_ERR "(pid %d) dumping LPT information\n", current->pid); + printk(KERN_ERR "\tlpt_sz: %lld\n", c->lpt_sz); + printk(KERN_ERR "\tpnode_sz: %d\n", c->pnode_sz); + printk(KERN_ERR "\tnnode_sz: %d\n", c->nnode_sz); + printk(KERN_ERR "\tltab_sz: %d\n", c->ltab_sz); + printk(KERN_ERR "\tlsave_sz: %d\n", c->lsave_sz); + printk(KERN_ERR "\tbig_lpt: %d\n", c->big_lpt); + printk(KERN_ERR "\tlpt_hght: %d\n", c->lpt_hght); + printk(KERN_ERR "\tpnode_cnt: %d\n", c->pnode_cnt); + printk(KERN_ERR "\tnnode_cnt: %d\n", c->nnode_cnt); + printk(KERN_ERR "\tdirty_pn_cnt: %d\n", c->dirty_pn_cnt); + printk(KERN_ERR "\tdirty_nn_cnt: %d\n", c->dirty_nn_cnt); + printk(KERN_ERR "\tlsave_cnt: %d\n", c->lsave_cnt); + printk(KERN_ERR "\tspace_bits: %d\n", c->space_bits); + printk(KERN_ERR "\tlpt_lnum_bits: %d\n", c->lpt_lnum_bits); + printk(KERN_ERR "\tlpt_offs_bits: %d\n", c->lpt_offs_bits); + printk(KERN_ERR "\tlpt_spc_bits: %d\n", c->lpt_spc_bits); + printk(KERN_ERR "\tpcnt_bits: %d\n", c->pcnt_bits); + printk(KERN_ERR "\tlnum_bits: %d\n", c->lnum_bits); + printk(KERN_ERR "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs); + printk(KERN_ERR "\tLPT head is at %d:%d\n", c->nhead_lnum, c->nhead_offs); - printk(KERN_DEBUG "\tLPT ltab is at %d:%d\n", + printk(KERN_ERR "\tLPT ltab is at %d:%d\n", c->ltab_lnum, c->ltab_offs); if (c->big_lpt) - printk(KERN_DEBUG "\tLPT lsave is at %d:%d\n", + printk(KERN_ERR "\tLPT lsave is at %d:%d\n", c->lsave_lnum, c->lsave_offs); for (i = 0; i < c->lpt_lebs; i++) - printk(KERN_DEBUG "\tLPT LEB %d free %d dirty %d tgc %d " + printk(KERN_ERR "\tLPT LEB %d free %d dirty %d tgc %d " "cmt %d\n", i + c->lpt_first, c->ltab[i].free, c->ltab[i].dirty, c->ltab[i].tgc, c->ltab[i].cmt); spin_unlock(&dbg_lock); @@ -867,12 +867,12 @@ void dbg_dump_sleb(const struct ubifs_info *c, { struct ubifs_scan_node *snod; - printk(KERN_DEBUG "(pid %d) start dumping scanned data from LEB %d:%d\n", + printk(KERN_ERR "(pid %d) start dumping scanned data from LEB %d:%d\n", current->pid, sleb->lnum, offs); list_for_each_entry(snod, &sleb->nodes, list) { cond_resched(); - printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", sleb->lnum, + printk(KERN_ERR "Dumping node at LEB %d:%d len %d\n", sleb->lnum, snod->offs, snod->len); dbg_dump_node(c, snod->node); } @@ -887,7 +887,7 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum) if (dbg_is_tst_rcvry(c)) return; - printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n", + printk(KERN_ERR "(pid %d) start dumping LEB %d\n", current->pid, lnum); buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); @@ -902,17 +902,17 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum) goto out; } - printk(KERN_DEBUG "LEB %d has %d nodes ending at %d\n", lnum, + printk(KERN_ERR "LEB %d has %d nodes ending at %d\n", lnum, sleb->nodes_cnt, sleb->endpt); list_for_each_entry(snod, &sleb->nodes, list) { cond_resched(); - printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", lnum, + printk(KERN_ERR "Dumping node at LEB %d:%d len %d\n", lnum, snod->offs, snod->len); dbg_dump_node(c, snod->node); } - printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n", + printk(KERN_ERR "(pid %d) finish dumping LEB %d\n", current->pid, lnum); ubifs_scan_destroy(sleb); @@ -934,7 +934,7 @@ void dbg_dump_znode(const struct ubifs_info *c, else zbr = &c->zroot; - printk(KERN_DEBUG "znode %p, LEB %d:%d len %d parent %p iip %d level %d" + printk(KERN_ERR "znode %p, LEB %d:%d len %d parent %p iip %d level %d" " child_cnt %d flags %lx\n", znode, zbr->lnum, zbr->offs, zbr->len, znode->parent, znode->iip, znode->level, znode->child_cnt, znode->flags); @@ -944,18 +944,18 @@ void dbg_dump_znode(const struct ubifs_info *c, return; } - printk(KERN_DEBUG "zbranches:\n"); + printk(KERN_ERR "zbranches:\n"); for (n = 0; n < znode->child_cnt; n++) { zbr = &znode->zbranch[n]; if (znode->level > 0) - printk(KERN_DEBUG "\t%d: znode %p LEB %d:%d len %d key " + printk(KERN_ERR "\t%d: znode %p LEB %d:%d len %d key " "%s\n", n, zbr->znode, zbr->lnum, zbr->offs, zbr->len, dbg_snprintf_key(c, &zbr->key, key_buf, DBG_KEY_BUF_LEN)); else - printk(KERN_DEBUG "\t%d: LNC %p LEB %d:%d len %d key " + printk(KERN_ERR "\t%d: LNC %p LEB %d:%d len %d key " "%s\n", n, zbr->znode, zbr->lnum, zbr->offs, zbr->len, dbg_snprintf_key(c, &zbr->key, @@ -969,16 +969,16 @@ void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat) { int i; - printk(KERN_DEBUG "(pid %d) start dumping heap cat %d (%d elements)\n", + printk(KERN_ERR "(pid %d) start dumping heap cat %d (%d elements)\n", current->pid, cat, heap->cnt); for (i = 0; i < heap->cnt; i++) { struct ubifs_lprops *lprops = heap->arr[i]; - printk(KERN_DEBUG "\t%d. LEB %d hpos %d free %d dirty %d " + printk(KERN_ERR "\t%d. LEB %d hpos %d free %d dirty %d " "flags %d\n", i, lprops->lnum, lprops->hpos, lprops->free, lprops->dirty, lprops->flags); } - printk(KERN_DEBUG "(pid %d) finish dumping heap\n", current->pid); + printk(KERN_ERR "(pid %d) finish dumping heap\n", current->pid); } void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, @@ -986,15 +986,15 @@ void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, { int i; - printk(KERN_DEBUG "(pid %d) dumping pnode:\n", current->pid); - printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n", + printk(KERN_ERR "(pid %d) dumping pnode:\n", current->pid); + printk(KERN_ERR "\taddress %zx parent %zx cnext %zx\n", (size_t)pnode, (size_t)parent, (size_t)pnode->cnext); - printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n", + printk(KERN_ERR "\tflags %lu iip %d level %d num %d\n", pnode->flags, iip, pnode->level, pnode->num); for (i = 0; i < UBIFS_LPT_FANOUT; i++) { struct ubifs_lprops *lp = &pnode->lprops[i]; - printk(KERN_DEBUG "\t%d: free %d dirty %d flags %d lnum %d\n", + printk(KERN_ERR "\t%d: free %d dirty %d flags %d lnum %d\n", i, lp->free, lp->dirty, lp->flags, lp->lnum); } } @@ -1004,20 +1004,20 @@ void dbg_dump_tnc(struct ubifs_info *c) struct ubifs_znode *znode; int level; - printk(KERN_DEBUG "\n"); - printk(KERN_DEBUG "(pid %d) start dumping TNC tree\n", current->pid); + printk(KERN_ERR "\n"); + printk(KERN_ERR "(pid %d) start dumping TNC tree\n", current->pid); znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL); level = znode->level; - printk(KERN_DEBUG "== Level %d ==\n", level); + printk(KERN_ERR "== Level %d ==\n", level); while (znode) { if (level != znode->level) { level = znode->level; - printk(KERN_DEBUG "== Level %d ==\n", level); + printk(KERN_ERR "== Level %d ==\n", level); } dbg_dump_znode(c, znode); znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode); } - printk(KERN_DEBUG "(pid %d) finish dumping TNC tree\n", current->pid); + printk(KERN_ERR "(pid %d) finish dumping TNC tree\n", current->pid); } static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode, diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index ad1a6fe..9f71765 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -164,9 +164,7 @@ struct ubifs_global_debug_info { #define dbg_dump_stack() dump_stack() #define dbg_err(fmt, ...) do { \ - spin_lock(&dbg_lock); \ ubifs_err(fmt, ##__VA_ARGS__); \ - spin_unlock(&dbg_lock); \ } while (0) #define ubifs_dbg_msg(type, fmt, ...) \ @@ -217,7 +215,6 @@ struct ubifs_global_debug_info { /* Additional recovery messages */ #define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__) -extern spinlock_t dbg_lock; extern struct ubifs_global_debug_info ubifs_dbg; static inline int dbg_is_chk_gen(const struct ubifs_info *c) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index d6fe1c7..ec9f187 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -566,6 +566,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) int sz_change = CALC_DENT_SIZE(dentry->d_name.len); int err, budgeted = 1; struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; + unsigned int saved_nlink = inode->i_nlink; /* * Budget request settings: deletion direntry, deletion inode (+1 for @@ -613,7 +614,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) out_cancel: dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; - inc_nlink(inode); + set_nlink(inode, saved_nlink); unlock_2_inodes(dir, inode); if (budgeted) ubifs_release_budget(c, &req); @@ -704,8 +705,7 @@ out_cancel: dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; inc_nlink(dir); - inc_nlink(inode); - inc_nlink(inode); + set_nlink(inode, 2); unlock_2_inodes(dir, inode); if (budgeted) ubifs_release_budget(c, &req); @@ -977,6 +977,7 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, struct ubifs_budget_req ino_req = { .dirtied_ino = 1, .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; struct timespec time; + unsigned int saved_nlink; /* * Budget request settings: deletion direntry, new direntry, removing @@ -1059,13 +1060,14 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlink) { /* * Directories cannot have hard-links, so if this is a - * directory, decrement its @i_nlink twice because an empty - * directory has @i_nlink 2. + * directory, just clear @i_nlink. */ + saved_nlink = new_inode->i_nlink; if (is_dir) + clear_nlink(new_inode); + else drop_nlink(new_inode); new_inode->i_ctime = time; - drop_nlink(new_inode); } else { new_dir->i_size += new_sz; ubifs_inode(new_dir)->ui_size = new_dir->i_size; @@ -1102,9 +1104,7 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, out_cancel: if (unlink) { - if (is_dir) - inc_nlink(new_inode); - inc_nlink(new_inode); + set_nlink(new_inode, saved_nlink); } else { new_dir->i_size -= new_sz; ubifs_inode(new_dir)->ui_size = new_dir->i_size; diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index ee4f43f..2a935b3 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -679,7 +679,8 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE || ret == SCANNED_A_CORRUPT_NODE) { - dbg_rcvry("found corruption - %d", ret); + dbg_rcvry("found corruption (%d) at %d:%d", + ret, lnum, offs); break; } else { dbg_err("unexpected return value %d", ret); diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 6094c5a..771f7fb 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -410,13 +410,23 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup) } if (c->main_lebs < UBIFS_MIN_MAIN_LEBS) { - err = 7; + ubifs_err("too few main LEBs count %d, must be at least %d", + c->main_lebs, UBIFS_MIN_MAIN_LEBS); goto failed; } - if (c->max_bud_bytes < (long long)c->leb_size * UBIFS_MIN_BUD_LEBS || - c->max_bud_bytes > (long long)c->leb_size * c->main_lebs) { - err = 8; + max_bytes = (long long)c->leb_size * UBIFS_MIN_BUD_LEBS; + if (c->max_bud_bytes < max_bytes) { + ubifs_err("too small journal (%lld bytes), must be at least " + "%lld bytes", c->max_bud_bytes, max_bytes); + goto failed; + } + + max_bytes = (long long)c->leb_size * c->main_lebs; + if (c->max_bud_bytes > max_bytes) { + ubifs_err("too large journal size (%lld bytes), only %lld bytes" + "available in the main area", + c->max_bud_bytes, max_bytes); goto failed; } @@ -450,7 +460,6 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup) goto failed; } - max_bytes = c->main_lebs * (long long)c->leb_size; if (c->rp_size < 0 || max_bytes < c->rp_size) { err = 14; goto failed; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 12e9477..93d59ac 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -84,9 +84,6 @@ #define INUM_WARN_WATERMARK 0xFFF00000 #define INUM_WATERMARK 0xFFFFFF00 -/* Largest key size supported in this implementation */ -#define CUR_MAX_KEY_LEN UBIFS_SK_LEN - /* Maximum number of entries in each LPT (LEB category) heap */ #define LPT_HEAP_SZ 256 @@ -277,10 +274,10 @@ struct ubifs_old_idx { /* The below union makes it easier to deal with keys */ union ubifs_key { - uint8_t u8[CUR_MAX_KEY_LEN]; - uint32_t u32[CUR_MAX_KEY_LEN/4]; - uint64_t u64[CUR_MAX_KEY_LEN/8]; - __le32 j32[CUR_MAX_KEY_LEN/4]; + uint8_t u8[UBIFS_SK_LEN]; + uint32_t u32[UBIFS_SK_LEN/4]; + uint64_t u64[UBIFS_SK_LEN/8]; + __le32 j32[UBIFS_SK_LEN/4]; }; /** diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 987585b..1ba2baa 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -105,7 +105,6 @@ static void udf_add_free_space(struct super_block *sb, u16 partition, u32 cnt) } static void udf_bitmap_free_blocks(struct super_block *sb, - struct inode *inode, struct udf_bitmap *bitmap, struct kernel_lb_addr *bloc, uint32_t offset, @@ -172,7 +171,6 @@ error_return: } static int udf_bitmap_prealloc_blocks(struct super_block *sb, - struct inode *inode, struct udf_bitmap *bitmap, uint16_t partition, uint32_t first_block, uint32_t block_count) @@ -223,7 +221,6 @@ out: } static int udf_bitmap_new_block(struct super_block *sb, - struct inode *inode, struct udf_bitmap *bitmap, uint16_t partition, uint32_t goal, int *err) { @@ -349,7 +346,6 @@ error_return: } static void udf_table_free_blocks(struct super_block *sb, - struct inode *inode, struct inode *table, struct kernel_lb_addr *bloc, uint32_t offset, @@ -581,7 +577,6 @@ error_return: } static int udf_table_prealloc_blocks(struct super_block *sb, - struct inode *inode, struct inode *table, uint16_t partition, uint32_t first_block, uint32_t block_count) { @@ -643,7 +638,6 @@ static int udf_table_prealloc_blocks(struct super_block *sb, } static int udf_table_new_block(struct super_block *sb, - struct inode *inode, struct inode *table, uint16_t partition, uint32_t goal, int *err) { @@ -743,18 +737,23 @@ void udf_free_blocks(struct super_block *sb, struct inode *inode, struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) { - udf_bitmap_free_blocks(sb, inode, map->s_uspace.s_bitmap, + udf_bitmap_free_blocks(sb, map->s_uspace.s_bitmap, bloc, offset, count); } else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) { - udf_table_free_blocks(sb, inode, map->s_uspace.s_table, + udf_table_free_blocks(sb, map->s_uspace.s_table, bloc, offset, count); } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) { - udf_bitmap_free_blocks(sb, inode, map->s_fspace.s_bitmap, + udf_bitmap_free_blocks(sb, map->s_fspace.s_bitmap, bloc, offset, count); } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) { - udf_table_free_blocks(sb, inode, map->s_fspace.s_table, + udf_table_free_blocks(sb, map->s_fspace.s_table, bloc, offset, count); } + + if (inode) { + inode_sub_bytes(inode, + ((sector_t)count) << sb->s_blocksize_bits); + } } inline int udf_prealloc_blocks(struct super_block *sb, @@ -763,29 +762,34 @@ inline int udf_prealloc_blocks(struct super_block *sb, uint32_t block_count) { struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; + sector_t allocated; if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) - return udf_bitmap_prealloc_blocks(sb, inode, - map->s_uspace.s_bitmap, - partition, first_block, - block_count); + allocated = udf_bitmap_prealloc_blocks(sb, + map->s_uspace.s_bitmap, + partition, first_block, + block_count); else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) - return udf_table_prealloc_blocks(sb, inode, - map->s_uspace.s_table, - partition, first_block, - block_count); + allocated = udf_table_prealloc_blocks(sb, + map->s_uspace.s_table, + partition, first_block, + block_count); else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) - return udf_bitmap_prealloc_blocks(sb, inode, - map->s_fspace.s_bitmap, - partition, first_block, - block_count); + allocated = udf_bitmap_prealloc_blocks(sb, + map->s_fspace.s_bitmap, + partition, first_block, + block_count); else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) - return udf_table_prealloc_blocks(sb, inode, - map->s_fspace.s_table, - partition, first_block, - block_count); + allocated = udf_table_prealloc_blocks(sb, + map->s_fspace.s_table, + partition, first_block, + block_count); else return 0; + + if (inode && allocated > 0) + inode_add_bytes(inode, allocated << sb->s_blocksize_bits); + return allocated; } inline int udf_new_block(struct super_block *sb, @@ -793,25 +797,29 @@ inline int udf_new_block(struct super_block *sb, uint16_t partition, uint32_t goal, int *err) { struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; + int block; if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) - return udf_bitmap_new_block(sb, inode, - map->s_uspace.s_bitmap, - partition, goal, err); + block = udf_bitmap_new_block(sb, + map->s_uspace.s_bitmap, + partition, goal, err); else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) - return udf_table_new_block(sb, inode, - map->s_uspace.s_table, - partition, goal, err); - else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) - return udf_bitmap_new_block(sb, inode, - map->s_fspace.s_bitmap, + block = udf_table_new_block(sb, + map->s_uspace.s_table, partition, goal, err); + else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) + block = udf_bitmap_new_block(sb, + map->s_fspace.s_bitmap, + partition, goal, err); else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) - return udf_table_new_block(sb, inode, - map->s_fspace.s_table, - partition, goal, err); + block = udf_table_new_block(sb, + map->s_fspace.s_table, + partition, goal, err); else { *err = -EIO; return 0; } + if (inode && block) + inode_add_bytes(inode, sb->s_blocksize); + return block; } diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 05ab481..7e5aae4 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -116,6 +116,7 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err) iinfo->i_lenEAttr = 0; iinfo->i_lenAlloc = 0; iinfo->i_use = 0; + iinfo->i_checkpoint = 1; if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB)) iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 7699df7..7d75280 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1358,6 +1358,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) iinfo->i_unique = le64_to_cpu(fe->uniqueID); iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr); iinfo->i_lenAlloc = le32_to_cpu(fe->lengthAllocDescs); + iinfo->i_checkpoint = le32_to_cpu(fe->checkpoint); offset = sizeof(struct fileEntry) + iinfo->i_lenEAttr; } else { inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) << @@ -1379,6 +1380,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) iinfo->i_unique = le64_to_cpu(efe->uniqueID); iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr); iinfo->i_lenAlloc = le32_to_cpu(efe->lengthAllocDescs); + iinfo->i_checkpoint = le32_to_cpu(efe->checkpoint); offset = sizeof(struct extendedFileEntry) + iinfo->i_lenEAttr; } @@ -1495,6 +1497,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) struct buffer_head *bh = NULL; struct fileEntry *fe; struct extendedFileEntry *efe; + uint64_t lb_recorded; uint32_t udfperms; uint16_t icbflags; uint16_t crclen; @@ -1589,13 +1592,18 @@ static int udf_update_inode(struct inode *inode, int do_sync) dsea->minorDeviceIdent = cpu_to_le32(iminor(inode)); } + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + lb_recorded = 0; /* No extents => no blocks! */ + else + lb_recorded = + (inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >> + (blocksize_bits - 9); + if (iinfo->i_efe == 0) { memcpy(bh->b_data + sizeof(struct fileEntry), iinfo->i_ext.i_data, inode->i_sb->s_blocksize - sizeof(struct fileEntry)); - fe->logicalBlocksRecorded = cpu_to_le64( - (inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >> - (blocksize_bits - 9)); + fe->logicalBlocksRecorded = cpu_to_le64(lb_recorded); udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime); udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime); @@ -1607,6 +1615,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) fe->uniqueID = cpu_to_le64(iinfo->i_unique); fe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr); fe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc); + fe->checkpoint = cpu_to_le32(iinfo->i_checkpoint); fe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_FE); crclen = sizeof(struct fileEntry); } else { @@ -1615,9 +1624,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry)); efe->objectSize = cpu_to_le64(inode->i_size); - efe->logicalBlocksRecorded = cpu_to_le64( - (inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >> - (blocksize_bits - 9)); + efe->logicalBlocksRecorded = cpu_to_le64(lb_recorded); if (iinfo->i_crtime.tv_sec > inode->i_atime.tv_sec || (iinfo->i_crtime.tv_sec == inode->i_atime.tv_sec && @@ -1646,6 +1653,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) efe->uniqueID = cpu_to_le64(iinfo->i_unique); efe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr); efe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc); + efe->checkpoint = cpu_to_le32(iinfo->i_checkpoint); efe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_EFE); crclen = sizeof(struct extendedFileEntry); } diff --git a/fs/udf/super.c b/fs/udf/super.c index 85067b4..ac8a348 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -950,11 +950,8 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index) else bitmap = vzalloc(size); /* TODO: get rid of vzalloc */ - if (bitmap == NULL) { - udf_err(sb, "Unable to allocate space for bitmap and %d buffer_head pointers\n", - nr_groups); + if (bitmap == NULL) return NULL; - } bitmap->s_block_bitmap = (struct buffer_head **)(bitmap + 1); bitmap->s_nr_groups = nr_groups; diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h index d1bd31e..bb8309d 100644 --- a/fs/udf/udf_i.h +++ b/fs/udf/udf_i.h @@ -23,6 +23,7 @@ struct udf_inode_info { __u64 i_lenExtents; __u32 i_next_alloc_block; __u32 i_next_alloc_goal; + __u32 i_checkpoint; unsigned i_alloc_type : 3; unsigned i_efe : 1; /* extendedFileEntry */ unsigned i_use : 1; /* unallocSpaceEntry */ @@ -16,7 +16,7 @@ #include <linux/security.h> #include <linux/evm.h> #include <linux/syscalls.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/fsnotify.h> #include <linux/audit.h> #include <asm/uaccess.h> diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c index 8d5a506..69d06b0 100644 --- a/fs/xattr_acl.c +++ b/fs/xattr_acl.c @@ -5,7 +5,7 @@ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org> */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/fs.h> #include <linux/posix_acl_xattr.h> #include <linux/gfp.h> diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 427a4e8..0a99779 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -96,9 +96,6 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \ xfs_qm_bhv.o \ xfs_qm.o \ xfs_quotaops.o -ifeq ($(CONFIG_XFS_QUOTA),y) -xfs-$(CONFIG_PROC_FS) += xfs_qm_stats.o -endif xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o xfs-$(CONFIG_PROC_FS) += xfs_stats.o diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 74b9baf..0dbb9e7 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -26,6 +26,7 @@ #include "xfs_bmap_btree.h" #include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_inode_item.h" #include "xfs_alloc.h" #include "xfs_error.h" #include "xfs_rw.h" @@ -99,23 +100,6 @@ xfs_destroy_ioend( } /* - * If the end of the current ioend is beyond the current EOF, - * return the new EOF value, otherwise zero. - */ -STATIC xfs_fsize_t -xfs_ioend_new_eof( - xfs_ioend_t *ioend) -{ - xfs_inode_t *ip = XFS_I(ioend->io_inode); - xfs_fsize_t isize; - xfs_fsize_t bsize; - - bsize = ioend->io_offset + ioend->io_size; - isize = MIN(i_size_read(VFS_I(ip)), bsize); - return isize > ip->i_d.di_size ? isize : 0; -} - -/* * Fast and loose check if this write could update the on-disk inode size. */ static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend) @@ -124,32 +108,65 @@ static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend) XFS_I(ioend->io_inode)->i_d.di_size; } +STATIC int +xfs_setfilesize_trans_alloc( + struct xfs_ioend *ioend) +{ + struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; + struct xfs_trans *tp; + int error; + + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); + + error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + ioend->io_append_trans = tp; + + /* + * We hand off the transaction to the completion thread now, so + * clear the flag here. + */ + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + return 0; +} + /* * Update on-disk file size now that data has been written to disk. - * - * This function does not block as blocking on the inode lock in IO completion - * can lead to IO completion order dependency deadlocks.. If it can't get the - * inode ilock it will return EAGAIN. Callers must handle this. */ STATIC int xfs_setfilesize( - xfs_ioend_t *ioend) + struct xfs_ioend *ioend) { - xfs_inode_t *ip = XFS_I(ioend->io_inode); + struct xfs_inode *ip = XFS_I(ioend->io_inode); + struct xfs_trans *tp = ioend->io_append_trans; xfs_fsize_t isize; - if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) - return EAGAIN; + /* + * The transaction was allocated in the I/O submission thread, + * thus we need to mark ourselves as beeing in a transaction + * manually. + */ + current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); - isize = xfs_ioend_new_eof(ioend); - if (isize) { - trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size); - ip->i_d.di_size = isize; - xfs_mark_inode_dirty(ip); + xfs_ilock(ip, XFS_ILOCK_EXCL); + isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size); + if (!isize) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_trans_cancel(tp, 0); + return 0; } - xfs_iunlock(ip, XFS_ILOCK_EXCL); - return 0; + trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size); + + ip->i_d.di_size = isize; + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + return xfs_trans_commit(tp, 0); } /* @@ -163,10 +180,12 @@ xfs_finish_ioend( struct xfs_ioend *ioend) { if (atomic_dec_and_test(&ioend->io_remaining)) { + struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; + if (ioend->io_type == IO_UNWRITTEN) - queue_work(xfsconvertd_workqueue, &ioend->io_work); - else if (xfs_ioend_is_append(ioend)) - queue_work(xfsdatad_workqueue, &ioend->io_work); + queue_work(mp->m_unwritten_workqueue, &ioend->io_work); + else if (ioend->io_append_trans) + queue_work(mp->m_data_workqueue, &ioend->io_work); else xfs_destroy_ioend(ioend); } @@ -195,35 +214,36 @@ xfs_end_io( * range to normal written extens after the data I/O has finished. */ if (ioend->io_type == IO_UNWRITTEN) { + /* + * For buffered I/O we never preallocate a transaction when + * doing the unwritten extent conversion, but for direct I/O + * we do not know if we are converting an unwritten extent + * or not at the point where we preallocate the transaction. + */ + if (ioend->io_append_trans) { + ASSERT(ioend->io_isdirect); + + current_set_flags_nested( + &ioend->io_append_trans->t_pflags, PF_FSTRANS); + xfs_trans_cancel(ioend->io_append_trans, 0); + } + error = xfs_iomap_write_unwritten(ip, ioend->io_offset, ioend->io_size); if (error) { ioend->io_error = -error; goto done; } + } else if (ioend->io_append_trans) { + error = xfs_setfilesize(ioend); + if (error) + ioend->io_error = -error; + } else { + ASSERT(!xfs_ioend_is_append(ioend)); } - /* - * We might have to update the on-disk file size after extending - * writes. - */ - error = xfs_setfilesize(ioend); - ASSERT(!error || error == EAGAIN); - done: - /* - * If we didn't complete processing of the ioend, requeue it to the - * tail of the workqueue for another attempt later. Otherwise destroy - * it. - */ - if (error == EAGAIN) { - atomic_inc(&ioend->io_remaining); - xfs_finish_ioend(ioend); - /* ensure we don't spin on blocked ioends */ - delay(1); - } else { - xfs_destroy_ioend(ioend); - } + xfs_destroy_ioend(ioend); } /* @@ -259,6 +279,7 @@ xfs_alloc_ioend( */ atomic_set(&ioend->io_remaining, 1); ioend->io_isasync = 0; + ioend->io_isdirect = 0; ioend->io_error = 0; ioend->io_list = NULL; ioend->io_type = type; @@ -269,6 +290,7 @@ xfs_alloc_ioend( ioend->io_size = 0; ioend->io_iocb = NULL; ioend->io_result = 0; + ioend->io_append_trans = NULL; INIT_WORK(&ioend->io_work, xfs_end_io); return ioend; @@ -379,14 +401,6 @@ xfs_submit_ioend_bio( atomic_inc(&ioend->io_remaining); bio->bi_private = ioend; bio->bi_end_io = xfs_end_bio; - - /* - * If the I/O is beyond EOF we mark the inode dirty immediately - * but don't update the inode size until I/O completion. - */ - if (xfs_ioend_new_eof(ioend)) - xfs_mark_inode_dirty(XFS_I(ioend->io_inode)); - submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio); } @@ -1033,8 +1047,20 @@ xfs_vm_writepage( wbc, end_index); } - if (iohead) + if (iohead) { + /* + * Reserve log space if we might write beyond the on-disk + * inode size. + */ + if (ioend->io_type != IO_UNWRITTEN && + xfs_ioend_is_append(ioend)) { + err = xfs_setfilesize_trans_alloc(ioend); + if (err) + goto error; + } + xfs_submit_ioend(wbc, iohead); + } return 0; @@ -1314,17 +1340,32 @@ xfs_vm_direct_IO( { struct inode *inode = iocb->ki_filp->f_mapping->host; struct block_device *bdev = xfs_find_bdev_for_inode(inode); + struct xfs_ioend *ioend = NULL; ssize_t ret; if (rw & WRITE) { - iocb->private = xfs_alloc_ioend(inode, IO_DIRECT); + size_t size = iov_length(iov, nr_segs); + + /* + * We need to preallocate a transaction for a size update + * here. In the case that this write both updates the size + * and converts at least on unwritten extent we will cancel + * the still clean transaction after the I/O has finished. + */ + iocb->private = ioend = xfs_alloc_ioend(inode, IO_DIRECT); + if (offset + size > XFS_I(inode)->i_d.di_size) { + ret = xfs_setfilesize_trans_alloc(ioend); + if (ret) + goto out_destroy_ioend; + ioend->io_isdirect = 1; + } ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, nr_segs, xfs_get_blocks_direct, xfs_end_io_direct_write, NULL, 0); if (ret != -EIOCBQUEUED && iocb->private) - xfs_destroy_ioend(iocb->private); + goto out_trans_cancel; } else { ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, nr_segs, @@ -1333,6 +1374,16 @@ xfs_vm_direct_IO( } return ret; + +out_trans_cancel: + if (ioend->io_append_trans) { + current_set_flags_nested(&ioend->io_append_trans->t_pflags, + PF_FSTRANS); + xfs_trans_cancel(ioend->io_append_trans, 0); + } +out_destroy_ioend: + xfs_destroy_ioend(ioend); + return ret; } STATIC void diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 116dd5c..84eafbc 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -18,8 +18,6 @@ #ifndef __XFS_AOPS_H__ #define __XFS_AOPS_H__ -extern struct workqueue_struct *xfsdatad_workqueue; -extern struct workqueue_struct *xfsconvertd_workqueue; extern mempool_t *xfs_ioend_pool; /* @@ -48,12 +46,14 @@ typedef struct xfs_ioend { int io_error; /* I/O error code */ atomic_t io_remaining; /* hold count */ unsigned int io_isasync : 1; /* needs aio_complete */ + unsigned int io_isdirect : 1;/* direct I/O */ struct inode *io_inode; /* file being written to */ struct buffer_head *io_buffer_head;/* buffer linked list head */ struct buffer_head *io_buffer_tail;/* buffer linked list tail */ size_t io_size; /* size of the extent */ xfs_off_t io_offset; /* offset in the file */ struct work_struct io_work; /* xfsdatad work queue */ + struct xfs_trans *io_append_trans;/* xact. for size update */ struct kiocb *io_iocb; int io_result; } xfs_ioend_t; diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 188ef2f..3548c6f 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -5536,8 +5536,12 @@ xfs_getbmap( if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx)) return XFS_ERROR(ENOMEM); out = kmem_zalloc(bmv->bmv_count * sizeof(struct getbmapx), KM_MAYFAIL); - if (!out) - return XFS_ERROR(ENOMEM); + if (!out) { + out = kmem_zalloc_large(bmv->bmv_count * + sizeof(struct getbmapx)); + if (!out) + return XFS_ERROR(ENOMEM); + } xfs_ilock(ip, XFS_IOLOCK_SHARED); if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { @@ -5661,7 +5665,10 @@ xfs_getbmap( break; } - kmem_free(out); + if (is_vmalloc_addr(out)) + kmem_free_large(out); + else + kmem_free(out); return error; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 4dff85c..6819b51 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -45,8 +45,6 @@ static kmem_zone_t *xfs_buf_zone; STATIC int xfsbufd(void *); static struct workqueue_struct *xfslogd_workqueue; -struct workqueue_struct *xfsdatad_workqueue; -struct workqueue_struct *xfsconvertd_workqueue; #ifdef XFS_BUF_LOCK_TRACKING # define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid) @@ -1793,21 +1791,8 @@ xfs_buf_init(void) if (!xfslogd_workqueue) goto out_free_buf_zone; - xfsdatad_workqueue = alloc_workqueue("xfsdatad", WQ_MEM_RECLAIM, 1); - if (!xfsdatad_workqueue) - goto out_destroy_xfslogd_workqueue; - - xfsconvertd_workqueue = alloc_workqueue("xfsconvertd", - WQ_MEM_RECLAIM, 1); - if (!xfsconvertd_workqueue) - goto out_destroy_xfsdatad_workqueue; - return 0; - out_destroy_xfsdatad_workqueue: - destroy_workqueue(xfsdatad_workqueue); - out_destroy_xfslogd_workqueue: - destroy_workqueue(xfslogd_workqueue); out_free_buf_zone: kmem_zone_destroy(xfs_buf_zone); out: @@ -1817,8 +1802,6 @@ xfs_buf_init(void) void xfs_buf_terminate(void) { - destroy_workqueue(xfsconvertd_workqueue); - destroy_workqueue(xfsdatad_workqueue); destroy_workqueue(xfslogd_workqueue); kmem_zone_destroy(xfs_buf_zone); } diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index dd974a5..1137bbc 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -215,7 +215,7 @@ xfs_swap_extents( xfs_trans_t *tp; xfs_bstat_t *sbp = &sxp->sx_stat; xfs_ifork_t *tempifp, *ifp, *tifp; - int ilf_fields, tilf_fields; + int src_log_flags, target_log_flags; int error = 0; int aforkblks = 0; int taforkblks = 0; @@ -385,9 +385,8 @@ xfs_swap_extents( tip->i_delayed_blks = ip->i_delayed_blks; ip->i_delayed_blks = 0; - ilf_fields = XFS_ILOG_CORE; - - switch(ip->i_d.di_format) { + src_log_flags = XFS_ILOG_CORE; + switch (ip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: /* If the extents fit in the inode, fix the * pointer. Otherwise it's already NULL or @@ -397,16 +396,15 @@ xfs_swap_extents( ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; } - ilf_fields |= XFS_ILOG_DEXT; + src_log_flags |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: - ilf_fields |= XFS_ILOG_DBROOT; + src_log_flags |= XFS_ILOG_DBROOT; break; } - tilf_fields = XFS_ILOG_CORE; - - switch(tip->i_d.di_format) { + target_log_flags = XFS_ILOG_CORE; + switch (tip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: /* If the extents fit in the inode, fix the * pointer. Otherwise it's already NULL or @@ -416,10 +414,10 @@ xfs_swap_extents( tifp->if_u1.if_extents = tifp->if_u2.if_inline_ext; } - tilf_fields |= XFS_ILOG_DEXT; + target_log_flags |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: - tilf_fields |= XFS_ILOG_DBROOT; + target_log_flags |= XFS_ILOG_DBROOT; break; } @@ -427,8 +425,8 @@ xfs_swap_extents( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_log_inode(tp, ip, ilf_fields); - xfs_trans_log_inode(tp, tip, tilf_fields); + xfs_trans_log_inode(tp, ip, src_log_flags); + xfs_trans_log_inode(tp, tip, target_log_flags); /* * If this is a synchronous mount, make sure that the diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 9245e02..d3b63ae 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -29,6 +29,7 @@ #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" +#include "xfs_dir2.h" #include "xfs_dir2_format.h" #include "xfs_dir2_priv.h" #include "xfs_error.h" diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 53db20e..4be16a0 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -43,11 +43,10 @@ * Lock order: * * ip->i_lock - * qh->qh_lock - * qi->qi_dqlist_lock - * dquot->q_qlock (xfs_dqlock() and friends) - * dquot->q_flush (xfs_dqflock() and friends) - * xfs_Gqm->qm_dqfrlist_lock + * qi->qi_tree_lock + * dquot->q_qlock (xfs_dqlock() and friends) + * dquot->q_flush (xfs_dqflock() and friends) + * qi->qi_lru_lock * * If two dquots need to be locked the order is user before group/project, * otherwise by the lowest id first, see xfs_dqlock2. @@ -60,6 +59,9 @@ int xfs_dqreq_num; int xfs_dqerror_mod = 33; #endif +struct kmem_zone *xfs_qm_dqtrxzone; +static struct kmem_zone *xfs_qm_dqzone; + static struct lock_class_key xfs_dquot_other_class; /* @@ -69,12 +71,12 @@ void xfs_qm_dqdestroy( xfs_dquot_t *dqp) { - ASSERT(list_empty(&dqp->q_freelist)); + ASSERT(list_empty(&dqp->q_lru)); mutex_destroy(&dqp->q_qlock); - kmem_zone_free(xfs_Gqm->qm_dqzone, dqp); + kmem_zone_free(xfs_qm_dqzone, dqp); - atomic_dec(&xfs_Gqm->qm_totaldquots); + XFS_STATS_DEC(xs_qm_dquot); } /* @@ -282,7 +284,7 @@ xfs_qm_dqalloc( * Return if this type of quotas is turned off while we didn't * have an inode lock */ - if (XFS_IS_THIS_QUOTA_OFF(dqp)) { + if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) { xfs_iunlock(quotip, XFS_ILOCK_EXCL); return (ESRCH); } @@ -384,7 +386,7 @@ xfs_qm_dqtobp( dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; xfs_ilock(quotip, XFS_ILOCK_SHARED); - if (XFS_IS_THIS_QUOTA_OFF(dqp)) { + if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) { /* * Return if this type of quotas is turned off while we * didn't have the quota inode lock. @@ -492,12 +494,12 @@ xfs_qm_dqread( int cancelflags = 0; - dqp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP); + dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP); dqp->dq_flags = type; dqp->q_core.d_id = cpu_to_be32(id); dqp->q_mount = mp; - INIT_LIST_HEAD(&dqp->q_freelist); + INIT_LIST_HEAD(&dqp->q_lru); mutex_init(&dqp->q_qlock); init_waitqueue_head(&dqp->q_pinwait); @@ -516,7 +518,7 @@ xfs_qm_dqread( if (!(type & XFS_DQ_USER)) lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class); - atomic_inc(&xfs_Gqm->qm_totaldquots); + XFS_STATS_INC(xs_qm_dquot); trace_xfs_dqread(dqp); @@ -602,60 +604,6 @@ error0: } /* - * Lookup a dquot in the incore dquot hashtable. We keep two separate - * hashtables for user and group dquots; and, these are global tables - * inside the XQM, not per-filesystem tables. - * The hash chain must be locked by caller, and it is left locked - * on return. Returning dquot is locked. - */ -STATIC int -xfs_qm_dqlookup( - xfs_mount_t *mp, - xfs_dqid_t id, - xfs_dqhash_t *qh, - xfs_dquot_t **O_dqpp) -{ - xfs_dquot_t *dqp; - - ASSERT(mutex_is_locked(&qh->qh_lock)); - - /* - * Traverse the hashchain looking for a match - */ - list_for_each_entry(dqp, &qh->qh_list, q_hashlist) { - /* - * We already have the hashlock. We don't need the - * dqlock to look at the id field of the dquot, since the - * id can't be modified without the hashlock anyway. - */ - if (be32_to_cpu(dqp->q_core.d_id) != id || dqp->q_mount != mp) - continue; - - trace_xfs_dqlookup_found(dqp); - - xfs_dqlock(dqp); - if (dqp->dq_flags & XFS_DQ_FREEING) { - *O_dqpp = NULL; - xfs_dqunlock(dqp); - return -1; - } - - dqp->q_nrefs++; - - /* - * move the dquot to the front of the hashchain - */ - list_move(&dqp->q_hashlist, &qh->qh_list); - trace_xfs_dqlookup_done(dqp); - *O_dqpp = dqp; - return 0; - } - - *O_dqpp = NULL; - return 1; -} - -/* * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a * a locked dquot, doing an allocation (if requested) as needed. * When both an inode and an id are given, the inode's id takes precedence. @@ -672,10 +620,10 @@ xfs_qm_dqget( uint flags, /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */ xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */ { - xfs_dquot_t *dqp; - xfs_dqhash_t *h; - uint version; - int error; + struct xfs_quotainfo *qi = mp->m_quotainfo; + struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type); + struct xfs_dquot *dqp; + int error; ASSERT(XFS_IS_QUOTA_RUNNING(mp)); if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) || @@ -683,7 +631,6 @@ xfs_qm_dqget( (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) { return (ESRCH); } - h = XFS_DQ_HASH(mp, id, type); #ifdef DEBUG if (xfs_do_dqerror) { @@ -699,42 +646,33 @@ xfs_qm_dqget( type == XFS_DQ_GROUP); if (ip) { ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - if (type == XFS_DQ_USER) - ASSERT(ip->i_udquot == NULL); - else - ASSERT(ip->i_gdquot == NULL); + ASSERT(xfs_inode_dquot(ip, type) == NULL); } #endif restart: - mutex_lock(&h->qh_lock); + mutex_lock(&qi->qi_tree_lock); + dqp = radix_tree_lookup(tree, id); + if (dqp) { + xfs_dqlock(dqp); + if (dqp->dq_flags & XFS_DQ_FREEING) { + xfs_dqunlock(dqp); + mutex_unlock(&qi->qi_tree_lock); + trace_xfs_dqget_freeing(dqp); + delay(1); + goto restart; + } - /* - * Look in the cache (hashtable). - * The chain is kept locked during lookup. - */ - switch (xfs_qm_dqlookup(mp, id, h, O_dqpp)) { - case -1: - XQM_STATS_INC(xqmstats.xs_qm_dquot_dups); - mutex_unlock(&h->qh_lock); - delay(1); - goto restart; - case 0: - XQM_STATS_INC(xqmstats.xs_qm_dqcachehits); - /* - * The dquot was found, moved to the front of the chain, - * taken off the freelist if it was on it, and locked - * at this point. Just unlock the hashchain and return. - */ - ASSERT(*O_dqpp); - ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp)); - mutex_unlock(&h->qh_lock); - trace_xfs_dqget_hit(*O_dqpp); - return 0; /* success */ - default: - XQM_STATS_INC(xqmstats.xs_qm_dqcachemisses); - break; + dqp->q_nrefs++; + mutex_unlock(&qi->qi_tree_lock); + + trace_xfs_dqget_hit(dqp); + XFS_STATS_INC(xs_qm_dqcachehits); + *O_dqpp = dqp; + return 0; } + mutex_unlock(&qi->qi_tree_lock); + XFS_STATS_INC(xs_qm_dqcachemisses); /* * Dquot cache miss. We don't want to keep the inode lock across @@ -745,12 +683,6 @@ restart: */ if (ip) xfs_iunlock(ip, XFS_ILOCK_EXCL); - /* - * Save the hashchain version stamp, and unlock the chain, so that - * we don't keep the lock across a disk read - */ - version = h->qh_version; - mutex_unlock(&h->qh_lock); error = xfs_qm_dqread(mp, id, type, flags, &dqp); @@ -760,97 +692,53 @@ restart: if (error) return error; - /* - * Dquot lock comes after hashlock in the lock ordering - */ if (ip) { /* * A dquot could be attached to this inode by now, since * we had dropped the ilock. */ - if (type == XFS_DQ_USER) { - if (!XFS_IS_UQUOTA_ON(mp)) { - /* inode stays locked on return */ - xfs_qm_dqdestroy(dqp); - return XFS_ERROR(ESRCH); - } - if (ip->i_udquot) { + if (xfs_this_quota_on(mp, type)) { + struct xfs_dquot *dqp1; + + dqp1 = xfs_inode_dquot(ip, type); + if (dqp1) { xfs_qm_dqdestroy(dqp); - dqp = ip->i_udquot; + dqp = dqp1; xfs_dqlock(dqp); goto dqret; } } else { - if (!XFS_IS_OQUOTA_ON(mp)) { - /* inode stays locked on return */ - xfs_qm_dqdestroy(dqp); - return XFS_ERROR(ESRCH); - } - if (ip->i_gdquot) { - xfs_qm_dqdestroy(dqp); - dqp = ip->i_gdquot; - xfs_dqlock(dqp); - goto dqret; - } + /* inode stays locked on return */ + xfs_qm_dqdestroy(dqp); + return XFS_ERROR(ESRCH); } } - /* - * Hashlock comes after ilock in lock order - */ - mutex_lock(&h->qh_lock); - if (version != h->qh_version) { - xfs_dquot_t *tmpdqp; + mutex_lock(&qi->qi_tree_lock); + error = -radix_tree_insert(tree, id, dqp); + if (unlikely(error)) { + WARN_ON(error != EEXIST); + /* - * Now, see if somebody else put the dquot in the - * hashtable before us. This can happen because we didn't - * keep the hashchain lock. We don't have to worry about - * lock order between the two dquots here since dqp isn't - * on any findable lists yet. + * Duplicate found. Just throw away the new dquot and start + * over. */ - switch (xfs_qm_dqlookup(mp, id, h, &tmpdqp)) { - case 0: - case -1: - /* - * Duplicate found, either in cache or on its way out. - * Just throw away the new dquot and start over. - */ - if (tmpdqp) - xfs_qm_dqput(tmpdqp); - mutex_unlock(&h->qh_lock); - xfs_qm_dqdestroy(dqp); - XQM_STATS_INC(xqmstats.xs_qm_dquot_dups); - goto restart; - default: - break; - } + mutex_unlock(&qi->qi_tree_lock); + trace_xfs_dqget_dup(dqp); + xfs_qm_dqdestroy(dqp); + XFS_STATS_INC(xs_qm_dquot_dups); + goto restart; } /* - * Put the dquot at the beginning of the hash-chain and mp's list - * LOCK ORDER: hashlock, freelistlock, mplistlock, udqlock, gdqlock .. - */ - ASSERT(mutex_is_locked(&h->qh_lock)); - dqp->q_hash = h; - list_add(&dqp->q_hashlist, &h->qh_list); - h->qh_version++; - - /* - * Attach this dquot to this filesystem's list of all dquots, - * kept inside the mount structure in m_quotainfo field - */ - mutex_lock(&mp->m_quotainfo->qi_dqlist_lock); - - /* * We return a locked dquot to the caller, with a reference taken */ xfs_dqlock(dqp); dqp->q_nrefs = 1; - list_add(&dqp->q_mplist, &mp->m_quotainfo->qi_dqlist); - mp->m_quotainfo->qi_dquots++; - mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); - mutex_unlock(&h->qh_lock); + qi->qi_dquots++; + mutex_unlock(&qi->qi_tree_lock); + dqret: ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); trace_xfs_dqget_miss(dqp); @@ -859,37 +747,22 @@ restart: } -/* - * Release a reference to the dquot (decrement ref-count) - * and unlock it. If there is a group quota attached to this - * dquot, carefully release that too without tripping over - * deadlocks'n'stuff. - */ -void -xfs_qm_dqput( +STATIC void +xfs_qm_dqput_final( struct xfs_dquot *dqp) { + struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; struct xfs_dquot *gdqp; - ASSERT(dqp->q_nrefs > 0); - ASSERT(XFS_DQ_IS_LOCKED(dqp)); - - trace_xfs_dqput(dqp); - -recurse: - if (--dqp->q_nrefs > 0) { - xfs_dqunlock(dqp); - return; - } - trace_xfs_dqput_free(dqp); - mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); - if (list_empty(&dqp->q_freelist)) { - list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist); - xfs_Gqm->qm_dqfrlist_cnt++; + mutex_lock(&qi->qi_lru_lock); + if (list_empty(&dqp->q_lru)) { + list_add_tail(&dqp->q_lru, &qi->qi_lru_list); + qi->qi_lru_count++; + XFS_STATS_INC(xs_qm_dquot_unused); } - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); + mutex_unlock(&qi->qi_lru_lock); /* * If we just added a udquot to the freelist, then we want to release @@ -906,10 +779,29 @@ recurse: /* * If we had a group quota hint, release it now. */ - if (gdqp) { - dqp = gdqp; - goto recurse; - } + if (gdqp) + xfs_qm_dqput(gdqp); +} + +/* + * Release a reference to the dquot (decrement ref-count) and unlock it. + * + * If there is a group quota attached to this dquot, carefully release that + * too without tripping over deadlocks'n'stuff. + */ +void +xfs_qm_dqput( + struct xfs_dquot *dqp) +{ + ASSERT(dqp->q_nrefs > 0); + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + + trace_xfs_dqput(dqp); + + if (--dqp->q_nrefs > 0) + xfs_dqunlock(dqp); + else + xfs_qm_dqput_final(dqp); } /* @@ -1091,17 +983,6 @@ xfs_qm_dqflush( } -void -xfs_dqunlock( - xfs_dquot_t *dqp) -{ - xfs_dqunlock_nonotify(dqp); - if (dqp->q_logitem.qli_dquot == dqp) { - xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_ailp, - &dqp->q_logitem.qli_item); - } -} - /* * Lock two xfs_dquot structures. * @@ -1131,85 +1012,6 @@ xfs_dqlock2( } /* - * Take a dquot out of the mount's dqlist as well as the hashlist. This is - * called via unmount as well as quotaoff, and the purge will always succeed. - */ -void -xfs_qm_dqpurge( - struct xfs_dquot *dqp) -{ - struct xfs_mount *mp = dqp->q_mount; - struct xfs_dqhash *qh = dqp->q_hash; - - xfs_dqlock(dqp); - - /* - * If we're turning off quotas, we have to make sure that, for - * example, we don't delete quota disk blocks while dquots are - * in the process of getting written to those disk blocks. - * This dquot might well be on AIL, and we can't leave it there - * if we're turning off quotas. Basically, we need this flush - * lock, and are willing to block on it. - */ - if (!xfs_dqflock_nowait(dqp)) { - /* - * Block on the flush lock after nudging dquot buffer, - * if it is incore. - */ - xfs_dqflock_pushbuf_wait(dqp); - } - - /* - * If we are turning this type of quotas off, we don't care - * about the dirty metadata sitting in this dquot. OTOH, if - * we're unmounting, we do care, so we flush it and wait. - */ - if (XFS_DQ_IS_DIRTY(dqp)) { - int error; - - /* - * We don't care about getting disk errors here. We need - * to purge this dquot anyway, so we go ahead regardless. - */ - error = xfs_qm_dqflush(dqp, SYNC_WAIT); - if (error) - xfs_warn(mp, "%s: dquot %p flush failed", - __func__, dqp); - xfs_dqflock(dqp); - } - - ASSERT(atomic_read(&dqp->q_pincount) == 0); - ASSERT(XFS_FORCED_SHUTDOWN(mp) || - !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); - - xfs_dqfunlock(dqp); - xfs_dqunlock(dqp); - - mutex_lock(&qh->qh_lock); - list_del_init(&dqp->q_hashlist); - qh->qh_version++; - mutex_unlock(&qh->qh_lock); - - mutex_lock(&mp->m_quotainfo->qi_dqlist_lock); - list_del_init(&dqp->q_mplist); - mp->m_quotainfo->qi_dqreclaims++; - mp->m_quotainfo->qi_dquots--; - mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); - - /* - * We move dquots to the freelist as soon as their reference count - * hits zero, so it really should be on the freelist here. - */ - mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); - ASSERT(!list_empty(&dqp->q_freelist)); - list_del_init(&dqp->q_freelist); - xfs_Gqm->qm_dqfrlist_cnt--; - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - - xfs_qm_dqdestroy(dqp); -} - -/* * Give the buffer a little push if it is incore and * wait on the flush lock. */ @@ -1241,3 +1043,31 @@ xfs_dqflock_pushbuf_wait( out_lock: xfs_dqflock(dqp); } + +int __init +xfs_qm_init(void) +{ + xfs_qm_dqzone = + kmem_zone_init(sizeof(struct xfs_dquot), "xfs_dquot"); + if (!xfs_qm_dqzone) + goto out; + + xfs_qm_dqtrxzone = + kmem_zone_init(sizeof(struct xfs_dquot_acct), "xfs_dqtrx"); + if (!xfs_qm_dqtrxzone) + goto out_free_dqzone; + + return 0; + +out_free_dqzone: + kmem_zone_destroy(xfs_qm_dqzone); +out: + return -ENOMEM; +} + +void __exit +xfs_qm_exit(void) +{ + kmem_zone_destroy(xfs_qm_dqtrxzone); + kmem_zone_destroy(xfs_qm_dqzone); +} diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index a1d91d8..ef9190b 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -29,16 +29,6 @@ * when quotas are off. */ -/* - * The hash chain headers (hash buckets) - */ -typedef struct xfs_dqhash { - struct list_head qh_list; - struct mutex qh_lock; - uint qh_version; /* ever increasing version */ - uint qh_nelems; /* number of dquots on the list */ -} xfs_dqhash_t; - struct xfs_mount; struct xfs_trans; @@ -47,10 +37,7 @@ struct xfs_trans; */ typedef struct xfs_dquot { uint dq_flags; /* various flags (XFS_DQ_*) */ - struct list_head q_freelist; /* global free list of dquots */ - struct list_head q_mplist; /* mount's list of dquots */ - struct list_head q_hashlist; /* gloabl hash list of dquots */ - xfs_dqhash_t *q_hash; /* the hashchain header */ + struct list_head q_lru; /* global free list of dquots */ struct xfs_mount*q_mount; /* filesystem this relates to */ struct xfs_trans*q_transp; /* trans this belongs to currently */ uint q_nrefs; /* # active refs from inodes */ @@ -110,11 +97,37 @@ static inline void xfs_dqlock(struct xfs_dquot *dqp) mutex_lock(&dqp->q_qlock); } -static inline void xfs_dqunlock_nonotify(struct xfs_dquot *dqp) +static inline void xfs_dqunlock(struct xfs_dquot *dqp) { mutex_unlock(&dqp->q_qlock); } +static inline int xfs_this_quota_on(struct xfs_mount *mp, int type) +{ + switch (type & XFS_DQ_ALLTYPES) { + case XFS_DQ_USER: + return XFS_IS_UQUOTA_ON(mp); + case XFS_DQ_GROUP: + case XFS_DQ_PROJ: + return XFS_IS_OQUOTA_ON(mp); + default: + return 0; + } +} + +static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type) +{ + switch (type & XFS_DQ_ALLTYPES) { + case XFS_DQ_USER: + return ip->i_udquot; + case XFS_DQ_GROUP: + case XFS_DQ_PROJ: + return ip->i_gdquot; + default: + return NULL; + } +} + #define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock))) #define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) #define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) @@ -125,15 +138,10 @@ static inline void xfs_dqunlock_nonotify(struct xfs_dquot *dqp) XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \ XFS_DQ_TO_QINF(dqp)->qi_gquotaip) -#define XFS_IS_THIS_QUOTA_OFF(d) (! (XFS_QM_ISUDQ(d) ? \ - (XFS_IS_UQUOTA_ON((d)->q_mount)) : \ - (XFS_IS_OQUOTA_ON((d)->q_mount)))) - extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint, uint, struct xfs_dquot **); extern void xfs_qm_dqdestroy(xfs_dquot_t *); extern int xfs_qm_dqflush(xfs_dquot_t *, uint); -extern void xfs_qm_dqpurge(xfs_dquot_t *); extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); extern void xfs_qm_adjust_dqtimers(xfs_mount_t *, xfs_disk_dquot_t *); @@ -144,7 +152,6 @@ extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *, extern void xfs_qm_dqput(xfs_dquot_t *); extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); -extern void xfs_dqunlock(struct xfs_dquot *); extern void xfs_dqflock_pushbuf_wait(struct xfs_dquot *dqp); static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 7e5bc87..54a67dd 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -163,7 +163,6 @@ xfs_file_fsync( struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - struct xfs_trans *tp; int error = 0; int log_flushed = 0; xfs_lsn_t lsn = 0; @@ -194,75 +193,18 @@ xfs_file_fsync( } /* - * We always need to make sure that the required inode state is safe on - * disk. The inode might be clean but we still might need to force the - * log because of committed transactions that haven't hit the disk yet. - * Likewise, there could be unflushed non-transactional changes to the - * inode core that have to go to disk and this requires us to issue - * a synchronous transaction to capture these changes correctly. - * - * This code relies on the assumption that if the i_update_core field - * of the inode is clear and the inode is unpinned then it is clean - * and no action is required. + * All metadata updates are logged, which means that we just have + * to flush the log up to the latest LSN that touched the inode. */ xfs_ilock(ip, XFS_ILOCK_SHARED); - - /* - * First check if the VFS inode is marked dirty. All the dirtying - * of non-transactional updates do not go through mark_inode_dirty*, - * which allows us to distinguish between pure timestamp updates - * and i_size updates which need to be caught for fdatasync. - * After that also check for the dirty state in the XFS inode, which - * might gets cleared when the inode gets written out via the AIL - * or xfs_iflush_cluster. - */ - if (((inode->i_state & I_DIRTY_DATASYNC) || - ((inode->i_state & I_DIRTY_SYNC) && !datasync)) && - ip->i_update_core) { - /* - * Kick off a transaction to log the inode core to get the - * updates. The sync transaction will also force the log. - */ - xfs_iunlock(ip, XFS_ILOCK_SHARED); - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, 0, - XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - return -error; - } - xfs_ilock(ip, XFS_ILOCK_EXCL); - - /* - * Note - it's possible that we might have pushed ourselves out - * of the way during trans_reserve which would flush the inode. - * But there's no guarantee that the inode buffer has actually - * gone out yet (it's delwri). Plus the buffer could be pinned - * anyway if it's part of an inode in another recent - * transaction. So we play it safe and fire off the - * transaction anyway. - */ - xfs_trans_ijoin(tp, ip, 0); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_trans_commit(tp, 0); - - lsn = ip->i_itemp->ili_last_lsn; - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } else { - /* - * Timestamps/size haven't changed since last inode flush or - * inode transaction commit. That means either nothing got - * written or a transaction committed which caught the updates. - * If the latter happened and the transaction hasn't hit the - * disk yet, the inode will be still be pinned. If it is, - * force the log. - */ - if (xfs_ipincount(ip)) + if (xfs_ipincount(ip)) { + if (!datasync || + (ip->i_itemp->ili_fields & ~XFS_ILOG_TIMESTAMP)) lsn = ip->i_itemp->ili_last_lsn; - xfs_iunlock(ip, XFS_ILOCK_SHARED); } + xfs_iunlock(ip, XFS_ILOCK_SHARED); - if (!error && lsn) + if (lsn) error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); /* @@ -659,9 +601,6 @@ restart: return error; } - if (likely(!(file->f_mode & FMODE_NOCMTIME))) - file_update_time(file); - /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this @@ -685,6 +624,15 @@ restart: return error; /* + * Updating the timestamps will grab the ilock again from + * xfs_fs_dirty_inode, so we have to call it after dropping the + * lock above. Eventually we should look into a way to avoid + * the pointless lock roundtrip. + */ + if (likely(!(file->f_mode & FMODE_NOCMTIME))) + file_update_time(file); + + /* * If we're writing the file then make sure to clear the setuid and * setgid bits if the process is not being run by root. This keeps * people from modifying setuid and setgid binaries. diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 8c3e463..a98cb45 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -91,7 +91,6 @@ xfs_inode_alloc( ip->i_afp = NULL; memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); ip->i_flags = 0; - ip->i_update_core = 0; ip->i_delayed_blks = 0; memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); @@ -350,9 +349,20 @@ xfs_iget_cache_miss( BUG(); } - spin_lock(&pag->pag_ici_lock); + /* + * These values must be set before inserting the inode into the radix + * tree as the moment it is inserted a concurrent lookup (allowed by the + * RCU locking mechanism) can find it and that lookup must see that this + * is an inode currently under construction (i.e. that XFS_INEW is set). + * The ip->i_flags_lock that protects the XFS_INEW flag forms the + * memory barrier that ensures this detection works correctly at lookup + * time. + */ + ip->i_udquot = ip->i_gdquot = NULL; + xfs_iflags_set(ip, XFS_INEW); /* insert the new inode */ + spin_lock(&pag->pag_ici_lock); error = radix_tree_insert(&pag->pag_ici_root, agino, ip); if (unlikely(error)) { WARN_ON(error != -EEXIST); @@ -360,11 +370,6 @@ xfs_iget_cache_miss( error = EAGAIN; goto out_preload_end; } - - /* These values _must_ be set before releasing the radix tree lock! */ - ip->i_udquot = ip->i_gdquot = NULL; - xfs_iflags_set(ip, XFS_INEW); - spin_unlock(&pag->pag_ici_lock); radix_tree_preload_end(); @@ -418,6 +423,15 @@ xfs_iget( xfs_perag_t *pag; xfs_agino_t agino; + /* + * xfs_reclaim_inode() uses the ILOCK to ensure an inode + * doesn't get freed while it's being referenced during a + * radix tree traversal here. It assumes this function + * aqcuires only the ILOCK (and therefore it has no need to + * involve the IOLOCK in this synchronization). + */ + ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0); + /* reject inode numbers outside existing AGs */ if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) return EINVAL; @@ -642,8 +656,7 @@ xfs_iunlock( (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_IUNLOCK_NONOTIFY | - XFS_LOCK_DEP_MASK)) == 0); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); ASSERT(lock_flags != 0); if (lock_flags & XFS_IOLOCK_EXCL) @@ -656,16 +669,6 @@ xfs_iunlock( else if (lock_flags & XFS_ILOCK_SHARED) mrunlock_shared(&ip->i_lock); - if ((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) && - !(lock_flags & XFS_IUNLOCK_NONOTIFY) && ip->i_itemp) { - /* - * Let the AIL know that this item has been unlocked in case - * it is in the AIL and anyone is waiting on it. Don't do - * this if the caller has asked us not to. - */ - xfs_trans_unlocked_item(ip->i_itemp->ili_item.li_ailp, - (xfs_log_item_t*)(ip->i_itemp)); - } trace_xfs_iunlock(ip, lock_flags, _RET_IP_); } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index b210224..bc46c0a 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1656,14 +1656,13 @@ retry: iip = ip->i_itemp; if (!iip || xfs_inode_clean(ip)) { ASSERT(ip != free_ip); - ip->i_update_core = 0; xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); continue; } - iip->ili_last_fields = iip->ili_format.ilf_fields; - iip->ili_format.ilf_fields = 0; + iip->ili_last_fields = iip->ili_fields; + iip->ili_fields = 0; iip->ili_logged = 1; xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, &iip->ili_item.li_lsn); @@ -2177,7 +2176,7 @@ xfs_iflush_fork( mp = ip->i_mount; switch (XFS_IFORK_FORMAT(ip, whichfork)) { case XFS_DINODE_FMT_LOCAL: - if ((iip->ili_format.ilf_fields & dataflag[whichfork]) && + if ((iip->ili_fields & dataflag[whichfork]) && (ifp->if_bytes > 0)) { ASSERT(ifp->if_u1.if_data != NULL); ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); @@ -2187,8 +2186,8 @@ xfs_iflush_fork( case XFS_DINODE_FMT_EXTENTS: ASSERT((ifp->if_flags & XFS_IFEXTENTS) || - !(iip->ili_format.ilf_fields & extflag[whichfork])); - if ((iip->ili_format.ilf_fields & extflag[whichfork]) && + !(iip->ili_fields & extflag[whichfork])); + if ((iip->ili_fields & extflag[whichfork]) && (ifp->if_bytes > 0)) { ASSERT(xfs_iext_get_ext(ifp, 0)); ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); @@ -2198,7 +2197,7 @@ xfs_iflush_fork( break; case XFS_DINODE_FMT_BTREE: - if ((iip->ili_format.ilf_fields & brootflag[whichfork]) && + if ((iip->ili_fields & brootflag[whichfork]) && (ifp->if_broot_bytes > 0)) { ASSERT(ifp->if_broot != NULL); ASSERT(ifp->if_broot_bytes <= @@ -2211,14 +2210,14 @@ xfs_iflush_fork( break; case XFS_DINODE_FMT_DEV: - if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) { + if (iip->ili_fields & XFS_ILOG_DEV) { ASSERT(whichfork == XFS_DATA_FORK); xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev); } break; case XFS_DINODE_FMT_UUID: - if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) { + if (iip->ili_fields & XFS_ILOG_UUID) { ASSERT(whichfork == XFS_DATA_FORK); memcpy(XFS_DFORK_DPTR(dip), &ip->i_df.if_u2.if_uuid, @@ -2451,9 +2450,8 @@ xfs_iflush( * to disk, because the log record didn't make it to disk! */ if (XFS_FORCED_SHUTDOWN(mp)) { - ip->i_update_core = 0; if (iip) - iip->ili_format.ilf_fields = 0; + iip->ili_fields = 0; xfs_ifunlock(ip); return XFS_ERROR(EIO); } @@ -2533,26 +2531,6 @@ xfs_iflush_int( /* set *dip = inode's place in the buffer */ dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); - /* - * Clear i_update_core before copying out the data. - * This is for coordination with our timestamp updates - * that don't hold the inode lock. They will always - * update the timestamps BEFORE setting i_update_core, - * so if we clear i_update_core after they set it we - * are guaranteed to see their updates to the timestamps. - * I believe that this depends on strongly ordered memory - * semantics, but we have that. We use the SYNCHRONIZE - * macro to make sure that the compiler does not reorder - * the i_update_core access below the data copy below. - */ - ip->i_update_core = 0; - SYNCHRONIZE(); - - /* - * Make sure to get the latest timestamps from the Linux inode. - */ - xfs_synchronize_times(ip); - if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, @@ -2663,36 +2641,33 @@ xfs_iflush_int( xfs_inobp_check(mp, bp); /* - * We've recorded everything logged in the inode, so we'd - * like to clear the ilf_fields bits so we don't log and - * flush things unnecessarily. However, we can't stop - * logging all this information until the data we've copied - * into the disk buffer is written to disk. If we did we might - * overwrite the copy of the inode in the log with all the - * data after re-logging only part of it, and in the face of - * a crash we wouldn't have all the data we need to recover. + * We've recorded everything logged in the inode, so we'd like to clear + * the ili_fields bits so we don't log and flush things unnecessarily. + * However, we can't stop logging all this information until the data + * we've copied into the disk buffer is written to disk. If we did we + * might overwrite the copy of the inode in the log with all the data + * after re-logging only part of it, and in the face of a crash we + * wouldn't have all the data we need to recover. * - * What we do is move the bits to the ili_last_fields field. - * When logging the inode, these bits are moved back to the - * ilf_fields field. In the xfs_iflush_done() routine we - * clear ili_last_fields, since we know that the information - * those bits represent is permanently on disk. As long as - * the flush completes before the inode is logged again, then - * both ilf_fields and ili_last_fields will be cleared. + * What we do is move the bits to the ili_last_fields field. When + * logging the inode, these bits are moved back to the ili_fields field. + * In the xfs_iflush_done() routine we clear ili_last_fields, since we + * know that the information those bits represent is permanently on + * disk. As long as the flush completes before the inode is logged + * again, then both ili_fields and ili_last_fields will be cleared. * - * We can play with the ilf_fields bits here, because the inode - * lock must be held exclusively in order to set bits there - * and the flush lock protects the ili_last_fields bits. - * Set ili_logged so the flush done - * routine can tell whether or not to look in the AIL. - * Also, store the current LSN of the inode so that we can tell - * whether the item has moved in the AIL from xfs_iflush_done(). - * In order to read the lsn we need the AIL lock, because - * it is a 64 bit value that cannot be read atomically. + * We can play with the ili_fields bits here, because the inode lock + * must be held exclusively in order to set bits there and the flush + * lock protects the ili_last_fields bits. Set ili_logged so the flush + * done routine can tell whether or not to look in the AIL. Also, store + * the current LSN of the inode so that we can tell whether the item has + * moved in the AIL from xfs_iflush_done(). In order to read the lsn we + * need the AIL lock, because it is a 64 bit value that cannot be read + * atomically. */ - if (iip != NULL && iip->ili_format.ilf_fields != 0) { - iip->ili_last_fields = iip->ili_format.ilf_fields; - iip->ili_format.ilf_fields = 0; + if (iip != NULL && iip->ili_fields != 0) { + iip->ili_last_fields = iip->ili_fields; + iip->ili_fields = 0; iip->ili_logged = 1; xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, @@ -2711,8 +2686,7 @@ xfs_iflush_int( } else { /* * We're flushing an inode which is not in the AIL and has - * not been logged but has i_update_core set. For this - * case we can use a B_DELWRI flush and immediately drop + * not been logged. For this case we can immediately drop * the inode flush lock because we can avoid the whole * AIL state thing. It's OK to drop the flush lock now, * because we've already locked the buffer and to do anything diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 2f27b74..f123dbe 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -241,7 +241,6 @@ typedef struct xfs_inode { spinlock_t i_flags_lock; /* inode i_flags lock */ /* Miscellaneous state. */ unsigned long i_flags; /* see defined flags below */ - unsigned char i_update_core; /* timestamps/size is dirty */ unsigned int i_delayed_blks; /* count of delay alloc blks */ xfs_icdinode_t i_d; /* most of ondisk inode */ @@ -275,6 +274,20 @@ static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip) } /* + * If this I/O goes past the on-disk inode size update it unless it would + * be past the current in-core inode size. + */ +static inline xfs_fsize_t +xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size) +{ + xfs_fsize_t i_size = i_size_read(VFS_I(ip)); + + if (new_size > i_size) + new_size = i_size; + return new_size > ip->i_d.di_size ? new_size : 0; +} + +/* * i_flags helper functions */ static inline void @@ -422,7 +435,6 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) #define XFS_IOLOCK_SHARED (1<<1) #define XFS_ILOCK_EXCL (1<<2) #define XFS_ILOCK_SHARED (1<<3) -#define XFS_IUNLOCK_NONOTIFY (1<<4) #define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \ | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED) @@ -431,8 +443,7 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) { XFS_IOLOCK_EXCL, "IOLOCK_EXCL" }, \ { XFS_IOLOCK_SHARED, "IOLOCK_SHARED" }, \ { XFS_ILOCK_EXCL, "ILOCK_EXCL" }, \ - { XFS_ILOCK_SHARED, "ILOCK_SHARED" }, \ - { XFS_IUNLOCK_NONOTIFY, "IUNLOCK_NONOTIFY" } + { XFS_ILOCK_SHARED, "ILOCK_SHARED" } /* @@ -522,10 +533,6 @@ void xfs_promote_inode(struct xfs_inode *); void xfs_lock_inodes(xfs_inode_t **, int, uint); void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); -void xfs_synchronize_times(xfs_inode_t *); -void xfs_mark_inode_dirty(xfs_inode_t *); -void xfs_mark_inode_dirty_sync(xfs_inode_t *); - #define IHOLD(ip) \ do { \ ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 91d71dc..05d924e 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -57,77 +57,28 @@ xfs_inode_item_size( struct xfs_inode *ip = iip->ili_inode; uint nvecs = 2; - /* - * Only log the data/extents/b-tree root if there is something - * left to log. - */ - iip->ili_format.ilf_fields |= XFS_ILOG_CORE; - switch (ip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: - iip->ili_format.ilf_fields &= - ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | - XFS_ILOG_DEV | XFS_ILOG_UUID); - if ((iip->ili_format.ilf_fields & XFS_ILOG_DEXT) && - (ip->i_d.di_nextents > 0) && - (ip->i_df.if_bytes > 0)) { - ASSERT(ip->i_df.if_u1.if_extents != NULL); + if ((iip->ili_fields & XFS_ILOG_DEXT) && + ip->i_d.di_nextents > 0 && + ip->i_df.if_bytes > 0) nvecs++; - } else { - iip->ili_format.ilf_fields &= ~XFS_ILOG_DEXT; - } break; case XFS_DINODE_FMT_BTREE: - iip->ili_format.ilf_fields &= - ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | - XFS_ILOG_DEV | XFS_ILOG_UUID); - if ((iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) && - (ip->i_df.if_broot_bytes > 0)) { - ASSERT(ip->i_df.if_broot != NULL); + if ((iip->ili_fields & XFS_ILOG_DBROOT) && + ip->i_df.if_broot_bytes > 0) nvecs++; - } else { - ASSERT(!(iip->ili_format.ilf_fields & - XFS_ILOG_DBROOT)); -#ifdef XFS_TRANS_DEBUG - if (iip->ili_root_size > 0) { - ASSERT(iip->ili_root_size == - ip->i_df.if_broot_bytes); - ASSERT(memcmp(iip->ili_orig_root, - ip->i_df.if_broot, - iip->ili_root_size) == 0); - } else { - ASSERT(ip->i_df.if_broot_bytes == 0); - } -#endif - iip->ili_format.ilf_fields &= ~XFS_ILOG_DBROOT; - } break; case XFS_DINODE_FMT_LOCAL: - iip->ili_format.ilf_fields &= - ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | - XFS_ILOG_DEV | XFS_ILOG_UUID); - if ((iip->ili_format.ilf_fields & XFS_ILOG_DDATA) && - (ip->i_df.if_bytes > 0)) { - ASSERT(ip->i_df.if_u1.if_data != NULL); - ASSERT(ip->i_d.di_size > 0); + if ((iip->ili_fields & XFS_ILOG_DDATA) && + ip->i_df.if_bytes > 0) nvecs++; - } else { - iip->ili_format.ilf_fields &= ~XFS_ILOG_DDATA; - } break; case XFS_DINODE_FMT_DEV: - iip->ili_format.ilf_fields &= - ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | - XFS_ILOG_DEXT | XFS_ILOG_UUID); - break; - case XFS_DINODE_FMT_UUID: - iip->ili_format.ilf_fields &= - ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | - XFS_ILOG_DEXT | XFS_ILOG_DEV); break; default: @@ -135,56 +86,31 @@ xfs_inode_item_size( break; } - /* - * If there are no attributes associated with this file, - * then there cannot be anything more to log. - * Clear all attribute-related log flags. - */ - if (!XFS_IFORK_Q(ip)) { - iip->ili_format.ilf_fields &= - ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT); + if (!XFS_IFORK_Q(ip)) return nvecs; - } + /* * Log any necessary attribute data. */ switch (ip->i_d.di_aformat) { case XFS_DINODE_FMT_EXTENTS: - iip->ili_format.ilf_fields &= - ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT); - if ((iip->ili_format.ilf_fields & XFS_ILOG_AEXT) && - (ip->i_d.di_anextents > 0) && - (ip->i_afp->if_bytes > 0)) { - ASSERT(ip->i_afp->if_u1.if_extents != NULL); + if ((iip->ili_fields & XFS_ILOG_AEXT) && + ip->i_d.di_anextents > 0 && + ip->i_afp->if_bytes > 0) nvecs++; - } else { - iip->ili_format.ilf_fields &= ~XFS_ILOG_AEXT; - } break; case XFS_DINODE_FMT_BTREE: - iip->ili_format.ilf_fields &= - ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT); - if ((iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) && - (ip->i_afp->if_broot_bytes > 0)) { - ASSERT(ip->i_afp->if_broot != NULL); + if ((iip->ili_fields & XFS_ILOG_ABROOT) && + ip->i_afp->if_broot_bytes > 0) nvecs++; - } else { - iip->ili_format.ilf_fields &= ~XFS_ILOG_ABROOT; - } break; case XFS_DINODE_FMT_LOCAL: - iip->ili_format.ilf_fields &= - ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT); - if ((iip->ili_format.ilf_fields & XFS_ILOG_ADATA) && - (ip->i_afp->if_bytes > 0)) { - ASSERT(ip->i_afp->if_u1.if_data != NULL); + if ((iip->ili_fields & XFS_ILOG_ADATA) && + ip->i_afp->if_bytes > 0) nvecs++; - } else { - iip->ili_format.ilf_fields &= ~XFS_ILOG_ADATA; - } break; default: @@ -254,48 +180,11 @@ xfs_inode_item_format( vecp++; nvecs = 1; - /* - * Clear i_update_core if the timestamps (or any other - * non-transactional modification) need flushing/logging - * and we're about to log them with the rest of the core. - * - * This is the same logic as xfs_iflush() but this code can't - * run at the same time as xfs_iflush because we're in commit - * processing here and so we have the inode lock held in - * exclusive mode. Although it doesn't really matter - * for the timestamps if both routines were to grab the - * timestamps or not. That would be ok. - * - * We clear i_update_core before copying out the data. - * This is for coordination with our timestamp updates - * that don't hold the inode lock. They will always - * update the timestamps BEFORE setting i_update_core, - * so if we clear i_update_core after they set it we - * are guaranteed to see their updates to the timestamps - * either here. Likewise, if they set it after we clear it - * here, we'll see it either on the next commit of this - * inode or the next time the inode gets flushed via - * xfs_iflush(). This depends on strongly ordered memory - * semantics, but we have that. We use the SYNCHRONIZE - * macro to make sure that the compiler does not reorder - * the i_update_core access below the data copy below. - */ - if (ip->i_update_core) { - ip->i_update_core = 0; - SYNCHRONIZE(); - } - - /* - * Make sure to get the latest timestamps from the Linux inode. - */ - xfs_synchronize_times(ip); - vecp->i_addr = &ip->i_d; vecp->i_len = sizeof(struct xfs_icdinode); vecp->i_type = XLOG_REG_TYPE_ICORE; vecp++; nvecs++; - iip->ili_format.ilf_fields |= XFS_ILOG_CORE; /* * If this is really an old format inode, then we need to @@ -328,16 +217,17 @@ xfs_inode_item_format( switch (ip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: - ASSERT(!(iip->ili_format.ilf_fields & - (XFS_ILOG_DDATA | XFS_ILOG_DBROOT | - XFS_ILOG_DEV | XFS_ILOG_UUID))); - if (iip->ili_format.ilf_fields & XFS_ILOG_DEXT) { - ASSERT(ip->i_df.if_bytes > 0); + iip->ili_fields &= + ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | + XFS_ILOG_DEV | XFS_ILOG_UUID); + + if ((iip->ili_fields & XFS_ILOG_DEXT) && + ip->i_d.di_nextents > 0 && + ip->i_df.if_bytes > 0) { ASSERT(ip->i_df.if_u1.if_extents != NULL); - ASSERT(ip->i_d.di_nextents > 0); + ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0); ASSERT(iip->ili_extents_buf == NULL); - ASSERT((ip->i_df.if_bytes / - (uint)sizeof(xfs_bmbt_rec_t)) > 0); + #ifdef XFS_NATIVE_HOST if (ip->i_d.di_nextents == ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { @@ -359,15 +249,18 @@ xfs_inode_item_format( iip->ili_format.ilf_dsize = vecp->i_len; vecp++; nvecs++; + } else { + iip->ili_fields &= ~XFS_ILOG_DEXT; } break; case XFS_DINODE_FMT_BTREE: - ASSERT(!(iip->ili_format.ilf_fields & - (XFS_ILOG_DDATA | XFS_ILOG_DEXT | - XFS_ILOG_DEV | XFS_ILOG_UUID))); - if (iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) { - ASSERT(ip->i_df.if_broot_bytes > 0); + iip->ili_fields &= + ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | + XFS_ILOG_DEV | XFS_ILOG_UUID); + + if ((iip->ili_fields & XFS_ILOG_DBROOT) && + ip->i_df.if_broot_bytes > 0) { ASSERT(ip->i_df.if_broot != NULL); vecp->i_addr = ip->i_df.if_broot; vecp->i_len = ip->i_df.if_broot_bytes; @@ -375,15 +268,30 @@ xfs_inode_item_format( vecp++; nvecs++; iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes; + } else { + ASSERT(!(iip->ili_fields & + XFS_ILOG_DBROOT)); +#ifdef XFS_TRANS_DEBUG + if (iip->ili_root_size > 0) { + ASSERT(iip->ili_root_size == + ip->i_df.if_broot_bytes); + ASSERT(memcmp(iip->ili_orig_root, + ip->i_df.if_broot, + iip->ili_root_size) == 0); + } else { + ASSERT(ip->i_df.if_broot_bytes == 0); + } +#endif + iip->ili_fields &= ~XFS_ILOG_DBROOT; } break; case XFS_DINODE_FMT_LOCAL: - ASSERT(!(iip->ili_format.ilf_fields & - (XFS_ILOG_DBROOT | XFS_ILOG_DEXT | - XFS_ILOG_DEV | XFS_ILOG_UUID))); - if (iip->ili_format.ilf_fields & XFS_ILOG_DDATA) { - ASSERT(ip->i_df.if_bytes > 0); + iip->ili_fields &= + ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | + XFS_ILOG_DEV | XFS_ILOG_UUID); + if ((iip->ili_fields & XFS_ILOG_DDATA) && + ip->i_df.if_bytes > 0) { ASSERT(ip->i_df.if_u1.if_data != NULL); ASSERT(ip->i_d.di_size > 0); @@ -401,24 +309,26 @@ xfs_inode_item_format( vecp++; nvecs++; iip->ili_format.ilf_dsize = (unsigned)data_bytes; + } else { + iip->ili_fields &= ~XFS_ILOG_DDATA; } break; case XFS_DINODE_FMT_DEV: - ASSERT(!(iip->ili_format.ilf_fields & - (XFS_ILOG_DBROOT | XFS_ILOG_DEXT | - XFS_ILOG_DDATA | XFS_ILOG_UUID))); - if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) { + iip->ili_fields &= + ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | + XFS_ILOG_DEXT | XFS_ILOG_UUID); + if (iip->ili_fields & XFS_ILOG_DEV) { iip->ili_format.ilf_u.ilfu_rdev = ip->i_df.if_u2.if_rdev; } break; case XFS_DINODE_FMT_UUID: - ASSERT(!(iip->ili_format.ilf_fields & - (XFS_ILOG_DBROOT | XFS_ILOG_DEXT | - XFS_ILOG_DDATA | XFS_ILOG_DEV))); - if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) { + iip->ili_fields &= + ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | + XFS_ILOG_DEXT | XFS_ILOG_DEV); + if (iip->ili_fields & XFS_ILOG_UUID) { iip->ili_format.ilf_u.ilfu_uuid = ip->i_df.if_u2.if_uuid; } @@ -430,31 +340,25 @@ xfs_inode_item_format( } /* - * If there are no attributes associated with the file, - * then we're done. - * Assert that no attribute-related log flags are set. + * If there are no attributes associated with the file, then we're done. */ if (!XFS_IFORK_Q(ip)) { - iip->ili_format.ilf_size = nvecs; - ASSERT(!(iip->ili_format.ilf_fields & - (XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT))); - return; + iip->ili_fields &= + ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT); + goto out; } switch (ip->i_d.di_aformat) { case XFS_DINODE_FMT_EXTENTS: - ASSERT(!(iip->ili_format.ilf_fields & - (XFS_ILOG_ADATA | XFS_ILOG_ABROOT))); - if (iip->ili_format.ilf_fields & XFS_ILOG_AEXT) { -#ifdef DEBUG - int nrecs = ip->i_afp->if_bytes / - (uint)sizeof(xfs_bmbt_rec_t); - ASSERT(nrecs > 0); - ASSERT(nrecs == ip->i_d.di_anextents); - ASSERT(ip->i_afp->if_bytes > 0); + iip->ili_fields &= + ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT); + + if ((iip->ili_fields & XFS_ILOG_AEXT) && + ip->i_d.di_anextents > 0 && + ip->i_afp->if_bytes > 0) { + ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) == + ip->i_d.di_anextents); ASSERT(ip->i_afp->if_u1.if_extents != NULL); - ASSERT(ip->i_d.di_anextents > 0); -#endif #ifdef XFS_NATIVE_HOST /* * There are not delayed allocation extents @@ -471,29 +375,36 @@ xfs_inode_item_format( iip->ili_format.ilf_asize = vecp->i_len; vecp++; nvecs++; + } else { + iip->ili_fields &= ~XFS_ILOG_AEXT; } break; case XFS_DINODE_FMT_BTREE: - ASSERT(!(iip->ili_format.ilf_fields & - (XFS_ILOG_ADATA | XFS_ILOG_AEXT))); - if (iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) { - ASSERT(ip->i_afp->if_broot_bytes > 0); + iip->ili_fields &= + ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT); + + if ((iip->ili_fields & XFS_ILOG_ABROOT) && + ip->i_afp->if_broot_bytes > 0) { ASSERT(ip->i_afp->if_broot != NULL); + vecp->i_addr = ip->i_afp->if_broot; vecp->i_len = ip->i_afp->if_broot_bytes; vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT; vecp++; nvecs++; iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes; + } else { + iip->ili_fields &= ~XFS_ILOG_ABROOT; } break; case XFS_DINODE_FMT_LOCAL: - ASSERT(!(iip->ili_format.ilf_fields & - (XFS_ILOG_ABROOT | XFS_ILOG_AEXT))); - if (iip->ili_format.ilf_fields & XFS_ILOG_ADATA) { - ASSERT(ip->i_afp->if_bytes > 0); + iip->ili_fields &= + ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT); + + if ((iip->ili_fields & XFS_ILOG_ADATA) && + ip->i_afp->if_bytes > 0) { ASSERT(ip->i_afp->if_u1.if_data != NULL); vecp->i_addr = ip->i_afp->if_u1.if_data; @@ -510,6 +421,8 @@ xfs_inode_item_format( vecp++; nvecs++; iip->ili_format.ilf_asize = (unsigned)data_bytes; + } else { + iip->ili_fields &= ~XFS_ILOG_ADATA; } break; @@ -518,6 +431,15 @@ xfs_inode_item_format( break; } +out: + /* + * Now update the log format that goes out to disk from the in-core + * values. We always write the inode core to make the arithmetic + * games in recovery easier, which isn't a big deal as just about any + * transaction would dirty it anyway. + */ + iip->ili_format.ilf_fields = XFS_ILOG_CORE | + (iip->ili_fields & ~XFS_ILOG_TIMESTAMP); iip->ili_format.ilf_size = nvecs; } @@ -596,17 +518,13 @@ xfs_inode_item_trylock( /* Stale items should force out the iclog */ if (ip->i_flags & XFS_ISTALE) { xfs_ifunlock(ip); - /* - * we hold the AIL lock - notify the unlock routine of this - * so it doesn't try to get the lock again. - */ - xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY); + xfs_iunlock(ip, XFS_ILOCK_SHARED); return XFS_ITEM_PINNED; } #ifdef DEBUG if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - ASSERT(iip->ili_format.ilf_fields != 0); + ASSERT(iip->ili_fields != 0); ASSERT(iip->ili_logged == 0); ASSERT(lip->li_flags & XFS_LI_IN_AIL); } @@ -638,7 +556,7 @@ xfs_inode_item_unlock( if (iip->ili_extents_buf != NULL) { ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS); ASSERT(ip->i_d.di_nextents > 0); - ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_DEXT); + ASSERT(iip->ili_fields & XFS_ILOG_DEXT); ASSERT(ip->i_df.if_bytes > 0); kmem_free(iip->ili_extents_buf); iip->ili_extents_buf = NULL; @@ -646,7 +564,7 @@ xfs_inode_item_unlock( if (iip->ili_aextents_buf != NULL) { ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS); ASSERT(ip->i_d.di_anextents > 0); - ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_AEXT); + ASSERT(iip->ili_fields & XFS_ILOG_AEXT); ASSERT(ip->i_afp->if_bytes > 0); kmem_free(iip->ili_aextents_buf); iip->ili_aextents_buf = NULL; @@ -761,8 +679,7 @@ xfs_inode_item_push( * lock without sleeping, then there must not have been * anyone in the process of flushing the inode. */ - ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || - iip->ili_format.ilf_fields != 0); + ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || iip->ili_fields != 0); /* * Push the inode to it's backing buffer. This will not remove the @@ -985,7 +902,7 @@ xfs_iflush_abort( * Clear the inode logging fields so no more flushes are * attempted. */ - iip->ili_format.ilf_fields = 0; + iip->ili_fields = 0; } /* * Release the inode's flush lock since we're done with it. diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index d3dee61e..41d61c3 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -86,6 +86,15 @@ typedef struct xfs_inode_log_format_64 { #define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */ #define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */ + +/* + * The timestamps are dirty, but not necessarily anything else in the inode + * core. Unlike the other fields above this one must never make it to disk + * in the ilf_fields of the inode_log_format, but is purely store in-memory in + * ili_fields in the inode_log_item. + */ +#define XFS_ILOG_TIMESTAMP 0x4000 + #define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ XFS_ILOG_DBROOT | XFS_ILOG_DEV | \ XFS_ILOG_UUID | XFS_ILOG_ADATA | \ @@ -101,7 +110,7 @@ typedef struct xfs_inode_log_format_64 { XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \ XFS_ILOG_DEV | XFS_ILOG_UUID | \ XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ - XFS_ILOG_ABROOT) + XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP) static inline int xfs_ilog_fbroot(int w) { @@ -134,6 +143,7 @@ typedef struct xfs_inode_log_item { unsigned short ili_lock_flags; /* lock flags */ unsigned short ili_logged; /* flushed logged data */ unsigned int ili_last_fields; /* fields when flushed */ + unsigned int ili_fields; /* fields to be logged */ struct xfs_bmbt_rec *ili_extents_buf; /* array of logged data exts */ struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged @@ -148,9 +158,7 @@ typedef struct xfs_inode_log_item { static inline int xfs_inode_clean(xfs_inode_t *ip) { - return (!ip->i_itemp || - !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) && - !ip->i_update_core; + return !ip->i_itemp || !(ip->i_itemp->ili_fields & XFS_ILOG_ALL); } extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 76f3ca5..f588320 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -450,9 +450,12 @@ xfs_attrmulti_attr_get( if (*len > XATTR_SIZE_MAX) return EINVAL; - kbuf = kmalloc(*len, GFP_KERNEL); - if (!kbuf) - return ENOMEM; + kbuf = kmem_zalloc(*len, KM_SLEEP | KM_MAYFAIL); + if (!kbuf) { + kbuf = kmem_zalloc_large(*len); + if (!kbuf) + return ENOMEM; + } error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags); if (error) @@ -462,7 +465,10 @@ xfs_attrmulti_attr_get( error = EFAULT; out_kfree: - kfree(kbuf); + if (is_vmalloc_addr(kbuf)) + kmem_free_large(kbuf); + else + kmem_free(kbuf); return error; } diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index f9ccb7b..a849a54 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -293,7 +293,7 @@ xfs_compat_ioc_bulkstat( int res; error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer, - sizeof(compat_xfs_bstat_t), 0, &res); + sizeof(compat_xfs_bstat_t), NULL, &res); } else if (cmd == XFS_IOC_FSBULKSTAT_32) { error = xfs_bulkstat(mp, &inlast, &count, xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t), diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 246c7d5..71a4645 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -31,6 +31,7 @@ #include "xfs_ialloc_btree.h" #include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_inode_item.h" #include "xfs_btree.h" #include "xfs_bmap.h" #include "xfs_rtalloc.h" @@ -645,6 +646,7 @@ xfs_iomap_write_unwritten( xfs_trans_t *tp; xfs_bmbt_irec_t imap; xfs_bmap_free_t free_list; + xfs_fsize_t i_size; uint resblks; int committed; int error; @@ -705,7 +707,22 @@ xfs_iomap_write_unwritten( if (error) goto error_on_bmapi_transaction; - error = xfs_bmap_finish(&(tp), &(free_list), &committed); + /* + * Log the updated inode size as we go. We have to be careful + * to only log it up to the actual write offset if it is + * halfway into a block. + */ + i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb); + if (i_size > offset + count) + i_size = offset + count; + + i_size = xfs_new_eof(ip, i_size); + if (i_size) { + ip->i_d.di_size = i_size; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + } + + error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) goto error_on_bmapi_transaction; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index ab30253..3011b87 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -50,65 +50,15 @@ #include <linux/fiemap.h> #include <linux/slab.h> -/* - * Bring the timestamps in the XFS inode uptodate. - * - * Used before writing the inode to disk. - */ -void -xfs_synchronize_times( - xfs_inode_t *ip) -{ - struct inode *inode = VFS_I(ip); - - ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; - ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; - ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec; - ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec; - ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec; - ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec; -} - -/* - * If the linux inode is valid, mark it dirty, else mark the dirty state - * in the XFS inode to make sure we pick it up when reclaiming the inode. - */ -void -xfs_mark_inode_dirty_sync( - xfs_inode_t *ip) -{ - struct inode *inode = VFS_I(ip); - - if (!(inode->i_state & (I_WILL_FREE|I_FREEING))) - mark_inode_dirty_sync(inode); - else { - barrier(); - ip->i_update_core = 1; - } -} - -void -xfs_mark_inode_dirty( - xfs_inode_t *ip) -{ - struct inode *inode = VFS_I(ip); - - if (!(inode->i_state & (I_WILL_FREE|I_FREEING))) - mark_inode_dirty(inode); - else { - barrier(); - ip->i_update_core = 1; - } - -} - - -int xfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, - void *fs_info) +static int +xfs_initxattrs( + struct inode *inode, + const struct xattr *xattr_array, + void *fs_info) { - const struct xattr *xattr; - struct xfs_inode *ip = XFS_I(inode); - int error = 0; + const struct xattr *xattr; + struct xfs_inode *ip = XFS_I(inode); + int error = 0; for (xattr = xattr_array; xattr->name != NULL; xattr++) { error = xfs_attr_set(ip, xattr->name, xattr->value, @@ -678,19 +628,16 @@ xfs_setattr_nonsize( inode->i_atime = iattr->ia_atime; ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; - ip->i_update_core = 1; } if (mask & ATTR_CTIME) { inode->i_ctime = iattr->ia_ctime; ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; - ip->i_update_core = 1; } if (mask & ATTR_MTIME) { inode->i_mtime = iattr->ia_mtime; ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; - ip->i_update_core = 1; } xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -918,13 +865,11 @@ xfs_setattr_size( inode->i_ctime = iattr->ia_ctime; ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; - ip->i_update_core = 1; } if (mask & ATTR_MTIME) { inode->i_mtime = iattr->ia_mtime; ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; - ip->i_update_core = 1; } xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 751e94f..9720c54 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -62,7 +62,6 @@ xfs_bulkstat_one_int( { struct xfs_icdinode *dic; /* dinode core info pointer */ struct xfs_inode *ip; /* incore inode pointer */ - struct inode *inode; struct xfs_bstat *buf; /* return buffer */ int error = 0; /* error value */ @@ -86,7 +85,6 @@ xfs_bulkstat_one_int( ASSERT(ip->i_imap.im_blkno != 0); dic = &ip->i_d; - inode = VFS_I(ip); /* xfs_iget returns the following without needing * further change. @@ -99,19 +97,12 @@ xfs_bulkstat_one_int( buf->bs_uid = dic->di_uid; buf->bs_gid = dic->di_gid; buf->bs_size = dic->di_size; - - /* - * We need to read the timestamps from the Linux inode because - * the VFS keeps writing directly into the inode structure instead - * of telling us about the updates. - */ - buf->bs_atime.tv_sec = inode->i_atime.tv_sec; - buf->bs_atime.tv_nsec = inode->i_atime.tv_nsec; - buf->bs_mtime.tv_sec = inode->i_mtime.tv_sec; - buf->bs_mtime.tv_nsec = inode->i_mtime.tv_nsec; - buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec; - buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec; - + buf->bs_atime.tv_sec = dic->di_atime.t_sec; + buf->bs_atime.tv_nsec = dic->di_atime.t_nsec; + buf->bs_mtime.tv_sec = dic->di_mtime.t_sec; + buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec; + buf->bs_ctime.tv_sec = dic->di_ctime.t_sec; + buf->bs_ctime.tv_nsec = dic->di_ctime.t_nsec; buf->bs_xflags = xfs_ip2xflags(ip); buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog; buf->bs_extents = dic->di_nextents; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index e2cc356..98a9cb5 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -67,15 +67,10 @@ STATIC void xlog_state_switch_iclogs(xlog_t *log, int eventual_size); STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog); -/* local functions to manipulate grant head */ -STATIC int xlog_grant_log_space(xlog_t *log, - xlog_ticket_t *xtic); STATIC void xlog_grant_push_ail(struct log *log, int need_bytes); STATIC void xlog_regrant_reserve_log_space(xlog_t *log, xlog_ticket_t *ticket); -STATIC int xlog_regrant_write_log_space(xlog_t *log, - xlog_ticket_t *ticket); STATIC void xlog_ungrant_log_space(xlog_t *log, xlog_ticket_t *ticket); @@ -150,78 +145,93 @@ xlog_grant_add_space( } while (head_val != old); } -STATIC bool -xlog_reserveq_wake( - struct log *log, - int *free_bytes) +STATIC void +xlog_grant_head_init( + struct xlog_grant_head *head) +{ + xlog_assign_grant_head(&head->grant, 1, 0); + INIT_LIST_HEAD(&head->waiters); + spin_lock_init(&head->lock); +} + +STATIC void +xlog_grant_head_wake_all( + struct xlog_grant_head *head) { struct xlog_ticket *tic; - int need_bytes; - list_for_each_entry(tic, &log->l_reserveq, t_queue) { + spin_lock(&head->lock); + list_for_each_entry(tic, &head->waiters, t_queue) + wake_up_process(tic->t_task); + spin_unlock(&head->lock); +} + +static inline int +xlog_ticket_reservation( + struct log *log, + struct xlog_grant_head *head, + struct xlog_ticket *tic) +{ + if (head == &log->l_write_head) { + ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); + return tic->t_unit_res; + } else { if (tic->t_flags & XLOG_TIC_PERM_RESERV) - need_bytes = tic->t_unit_res * tic->t_cnt; + return tic->t_unit_res * tic->t_cnt; else - need_bytes = tic->t_unit_res; - - if (*free_bytes < need_bytes) - return false; - *free_bytes -= need_bytes; - - trace_xfs_log_grant_wake_up(log, tic); - wake_up(&tic->t_wait); + return tic->t_unit_res; } - - return true; } STATIC bool -xlog_writeq_wake( +xlog_grant_head_wake( struct log *log, + struct xlog_grant_head *head, int *free_bytes) { struct xlog_ticket *tic; int need_bytes; - list_for_each_entry(tic, &log->l_writeq, t_queue) { - ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); - - need_bytes = tic->t_unit_res; - + list_for_each_entry(tic, &head->waiters, t_queue) { + need_bytes = xlog_ticket_reservation(log, head, tic); if (*free_bytes < need_bytes) return false; - *free_bytes -= need_bytes; - trace_xfs_log_regrant_write_wake_up(log, tic); - wake_up(&tic->t_wait); + *free_bytes -= need_bytes; + trace_xfs_log_grant_wake_up(log, tic); + wake_up_process(tic->t_task); } return true; } STATIC int -xlog_reserveq_wait( +xlog_grant_head_wait( struct log *log, + struct xlog_grant_head *head, struct xlog_ticket *tic, int need_bytes) { - list_add_tail(&tic->t_queue, &log->l_reserveq); + list_add_tail(&tic->t_queue, &head->waiters); do { if (XLOG_FORCED_SHUTDOWN(log)) goto shutdown; xlog_grant_push_ail(log, need_bytes); + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(&head->lock); + XFS_STATS_INC(xs_sleep_logspace); - trace_xfs_log_grant_sleep(log, tic); - xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); + trace_xfs_log_grant_sleep(log, tic); + schedule(); trace_xfs_log_grant_wake(log, tic); - spin_lock(&log->l_grant_reserve_lock); + spin_lock(&head->lock); if (XLOG_FORCED_SHUTDOWN(log)) goto shutdown; - } while (xlog_space_left(log, &log->l_grant_reserve_head) < need_bytes); + } while (xlog_space_left(log, &head->grant) < need_bytes); list_del_init(&tic->t_queue); return 0; @@ -230,35 +240,58 @@ shutdown: return XFS_ERROR(EIO); } +/* + * Atomically get the log space required for a log ticket. + * + * Once a ticket gets put onto head->waiters, it will only return after the + * needed reservation is satisfied. + * + * This function is structured so that it has a lock free fast path. This is + * necessary because every new transaction reservation will come through this + * path. Hence any lock will be globally hot if we take it unconditionally on + * every pass. + * + * As tickets are only ever moved on and off head->waiters under head->lock, we + * only need to take that lock if we are going to add the ticket to the queue + * and sleep. We can avoid taking the lock if the ticket was never added to + * head->waiters because the t_queue list head will be empty and we hold the + * only reference to it so it can safely be checked unlocked. + */ STATIC int -xlog_writeq_wait( +xlog_grant_head_check( struct log *log, + struct xlog_grant_head *head, struct xlog_ticket *tic, - int need_bytes) + int *need_bytes) { - list_add_tail(&tic->t_queue, &log->l_writeq); - - do { - if (XLOG_FORCED_SHUTDOWN(log)) - goto shutdown; - xlog_grant_push_ail(log, need_bytes); - - XFS_STATS_INC(xs_sleep_logspace); - trace_xfs_log_regrant_write_sleep(log, tic); + int free_bytes; + int error = 0; - xlog_wait(&tic->t_wait, &log->l_grant_write_lock); - trace_xfs_log_regrant_write_wake(log, tic); + ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); - spin_lock(&log->l_grant_write_lock); - if (XLOG_FORCED_SHUTDOWN(log)) - goto shutdown; - } while (xlog_space_left(log, &log->l_grant_write_head) < need_bytes); + /* + * If there are other waiters on the queue then give them a chance at + * logspace before us. Wake up the first waiters, if we do not wake + * up all the waiters then go to sleep waiting for more free space, + * otherwise try to get some space for this transaction. + */ + *need_bytes = xlog_ticket_reservation(log, head, tic); + free_bytes = xlog_space_left(log, &head->grant); + if (!list_empty_careful(&head->waiters)) { + spin_lock(&head->lock); + if (!xlog_grant_head_wake(log, head, &free_bytes) || + free_bytes < *need_bytes) { + error = xlog_grant_head_wait(log, head, tic, + *need_bytes); + } + spin_unlock(&head->lock); + } else if (free_bytes < *need_bytes) { + spin_lock(&head->lock); + error = xlog_grant_head_wait(log, head, tic, *need_bytes); + spin_unlock(&head->lock); + } - list_del_init(&tic->t_queue); - return 0; -shutdown: - list_del_init(&tic->t_queue); - return XFS_ERROR(EIO); + return error; } static void @@ -286,6 +319,128 @@ xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type) } /* + * Replenish the byte reservation required by moving the grant write head. + */ +int +xfs_log_regrant( + struct xfs_mount *mp, + struct xlog_ticket *tic) +{ + struct log *log = mp->m_log; + int need_bytes; + int error = 0; + + if (XLOG_FORCED_SHUTDOWN(log)) + return XFS_ERROR(EIO); + + XFS_STATS_INC(xs_try_logspace); + + /* + * This is a new transaction on the ticket, so we need to change the + * transaction ID so that the next transaction has a different TID in + * the log. Just add one to the existing tid so that we can see chains + * of rolling transactions in the log easily. + */ + tic->t_tid++; + + xlog_grant_push_ail(log, tic->t_unit_res); + + tic->t_curr_res = tic->t_unit_res; + xlog_tic_reset_res(tic); + + if (tic->t_cnt > 0) + return 0; + + trace_xfs_log_regrant(log, tic); + + error = xlog_grant_head_check(log, &log->l_write_head, tic, + &need_bytes); + if (error) + goto out_error; + + xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes); + trace_xfs_log_regrant_exit(log, tic); + xlog_verify_grant_tail(log); + return 0; + +out_error: + /* + * If we are failing, make sure the ticket doesn't have any current + * reservations. We don't want to add this back when the ticket/ + * transaction gets cancelled. + */ + tic->t_curr_res = 0; + tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ + return error; +} + +/* + * Reserve log space and return a ticket corresponding the reservation. + * + * Each reservation is going to reserve extra space for a log record header. + * When writes happen to the on-disk log, we don't subtract the length of the + * log record header from any reservation. By wasting space in each + * reservation, we prevent over allocation problems. + */ +int +xfs_log_reserve( + struct xfs_mount *mp, + int unit_bytes, + int cnt, + struct xlog_ticket **ticp, + __uint8_t client, + bool permanent, + uint t_type) +{ + struct log *log = mp->m_log; + struct xlog_ticket *tic; + int need_bytes; + int error = 0; + + ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); + + if (XLOG_FORCED_SHUTDOWN(log)) + return XFS_ERROR(EIO); + + XFS_STATS_INC(xs_try_logspace); + + ASSERT(*ticp == NULL); + tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, + KM_SLEEP | KM_MAYFAIL); + if (!tic) + return XFS_ERROR(ENOMEM); + + tic->t_trans_type = t_type; + *ticp = tic; + + xlog_grant_push_ail(log, tic->t_unit_res * tic->t_cnt); + + trace_xfs_log_reserve(log, tic); + + error = xlog_grant_head_check(log, &log->l_reserve_head, tic, + &need_bytes); + if (error) + goto out_error; + + xlog_grant_add_space(log, &log->l_reserve_head.grant, need_bytes); + xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes); + trace_xfs_log_reserve_exit(log, tic); + xlog_verify_grant_tail(log); + return 0; + +out_error: + /* + * If we are failing, make sure the ticket doesn't have any current + * reservations. We don't want to add this back when the ticket/ + * transaction gets cancelled. + */ + tic->t_curr_res = 0; + tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ + return error; +} + + +/* * NOTES: * * 1. currblock field gets updated at startup and after in-core logs @@ -395,88 +550,6 @@ xfs_log_release_iclog( } /* - * 1. Reserve an amount of on-disk log space and return a ticket corresponding - * to the reservation. - * 2. Potentially, push buffers at tail of log to disk. - * - * Each reservation is going to reserve extra space for a log record header. - * When writes happen to the on-disk log, we don't subtract the length of the - * log record header from any reservation. By wasting space in each - * reservation, we prevent over allocation problems. - */ -int -xfs_log_reserve( - struct xfs_mount *mp, - int unit_bytes, - int cnt, - struct xlog_ticket **ticket, - __uint8_t client, - uint flags, - uint t_type) -{ - struct log *log = mp->m_log; - struct xlog_ticket *internal_ticket; - int retval = 0; - - ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); - - if (XLOG_FORCED_SHUTDOWN(log)) - return XFS_ERROR(EIO); - - XFS_STATS_INC(xs_try_logspace); - - - if (*ticket != NULL) { - ASSERT(flags & XFS_LOG_PERM_RESERV); - internal_ticket = *ticket; - - /* - * this is a new transaction on the ticket, so we need to - * change the transaction ID so that the next transaction has a - * different TID in the log. Just add one to the existing tid - * so that we can see chains of rolling transactions in the log - * easily. - */ - internal_ticket->t_tid++; - - trace_xfs_log_reserve(log, internal_ticket); - - xlog_grant_push_ail(log, internal_ticket->t_unit_res); - retval = xlog_regrant_write_log_space(log, internal_ticket); - } else { - /* may sleep if need to allocate more tickets */ - internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt, - client, flags, - KM_SLEEP|KM_MAYFAIL); - if (!internal_ticket) - return XFS_ERROR(ENOMEM); - internal_ticket->t_trans_type = t_type; - *ticket = internal_ticket; - - trace_xfs_log_reserve(log, internal_ticket); - - xlog_grant_push_ail(log, - (internal_ticket->t_unit_res * - internal_ticket->t_cnt)); - retval = xlog_grant_log_space(log, internal_ticket); - } - - if (unlikely(retval)) { - /* - * If we are failing, make sure the ticket doesn't have any - * current reservations. We don't want to add this back - * when the ticket/ transaction gets cancelled. - */ - internal_ticket->t_curr_res = 0; - /* ungrant will give back unit_res * t_cnt. */ - internal_ticket->t_cnt = 0; - } - - return retval; -} - - -/* * Mount a log filesystem * * mp - ubiquitous xfs mount point structure @@ -760,64 +833,35 @@ xfs_log_item_init( INIT_LIST_HEAD(&item->li_cil); } +/* + * Wake up processes waiting for log space after we have moved the log tail. + */ void -xfs_log_move_tail(xfs_mount_t *mp, - xfs_lsn_t tail_lsn) +xfs_log_space_wake( + struct xfs_mount *mp) { - xlog_ticket_t *tic; - xlog_t *log = mp->m_log; - int need_bytes, free_bytes; + struct log *log = mp->m_log; + int free_bytes; if (XLOG_FORCED_SHUTDOWN(log)) return; - if (tail_lsn == 0) - tail_lsn = atomic64_read(&log->l_last_sync_lsn); - - /* tail_lsn == 1 implies that we weren't passed a valid value. */ - if (tail_lsn != 1) - atomic64_set(&log->l_tail_lsn, tail_lsn); - - if (!list_empty_careful(&log->l_writeq)) { -#ifdef DEBUG - if (log->l_flags & XLOG_ACTIVE_RECOVERY) - panic("Recovery problem"); -#endif - spin_lock(&log->l_grant_write_lock); - free_bytes = xlog_space_left(log, &log->l_grant_write_head); - list_for_each_entry(tic, &log->l_writeq, t_queue) { - ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); + if (!list_empty_careful(&log->l_write_head.waiters)) { + ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); - if (free_bytes < tic->t_unit_res && tail_lsn != 1) - break; - tail_lsn = 0; - free_bytes -= tic->t_unit_res; - trace_xfs_log_regrant_write_wake_up(log, tic); - wake_up(&tic->t_wait); - } - spin_unlock(&log->l_grant_write_lock); + spin_lock(&log->l_write_head.lock); + free_bytes = xlog_space_left(log, &log->l_write_head.grant); + xlog_grant_head_wake(log, &log->l_write_head, &free_bytes); + spin_unlock(&log->l_write_head.lock); } - if (!list_empty_careful(&log->l_reserveq)) { -#ifdef DEBUG - if (log->l_flags & XLOG_ACTIVE_RECOVERY) - panic("Recovery problem"); -#endif - spin_lock(&log->l_grant_reserve_lock); - free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); - list_for_each_entry(tic, &log->l_reserveq, t_queue) { - if (tic->t_flags & XLOG_TIC_PERM_RESERV) - need_bytes = tic->t_unit_res*tic->t_cnt; - else - need_bytes = tic->t_unit_res; - if (free_bytes < need_bytes && tail_lsn != 1) - break; - tail_lsn = 0; - free_bytes -= need_bytes; - trace_xfs_log_grant_wake_up(log, tic); - wake_up(&tic->t_wait); - } - spin_unlock(&log->l_grant_reserve_lock); + if (!list_empty_careful(&log->l_reserve_head.waiters)) { + ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); + + spin_lock(&log->l_reserve_head.lock); + free_bytes = xlog_space_left(log, &log->l_reserve_head.grant); + xlog_grant_head_wake(log, &log->l_reserve_head, &free_bytes); + spin_unlock(&log->l_reserve_head.lock); } } @@ -867,21 +911,7 @@ xfs_log_need_covered(xfs_mount_t *mp) return needed; } -/****************************************************************************** - * - * local routines - * - ****************************************************************************** - */ - -/* xfs_trans_tail_ail returns 0 when there is nothing in the list. - * The log manager must keep track of the last LR which was committed - * to disk. The lsn of this LR will become the new tail_lsn whenever - * xfs_trans_tail_ail returns 0. If we don't do this, we run into - * the situation where stuff could be written into the log but nothing - * was ever in the AIL when asked. Eventually, we panic since the - * tail hits the head. - * +/* * We may be holding the log iclog lock upon entering this routine. */ xfs_lsn_t @@ -891,10 +921,17 @@ xlog_assign_tail_lsn( xfs_lsn_t tail_lsn; struct log *log = mp->m_log; + /* + * To make sure we always have a valid LSN for the log tail we keep + * track of the last LSN which was committed in log->l_last_sync_lsn, + * and use that when the AIL was empty and xfs_ail_min_lsn returns 0. + * + * If the AIL has been emptied we also need to wake any process + * waiting for this condition. + */ tail_lsn = xfs_ail_min_lsn(mp->m_ail); if (!tail_lsn) tail_lsn = atomic64_read(&log->l_last_sync_lsn); - atomic64_set(&log->l_tail_lsn, tail_lsn); return tail_lsn; } @@ -1100,12 +1137,9 @@ xlog_alloc_log(xfs_mount_t *mp, xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0); xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0); log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ - xlog_assign_grant_head(&log->l_grant_reserve_head, 1, 0); - xlog_assign_grant_head(&log->l_grant_write_head, 1, 0); - INIT_LIST_HEAD(&log->l_reserveq); - INIT_LIST_HEAD(&log->l_writeq); - spin_lock_init(&log->l_grant_reserve_lock); - spin_lock_init(&log->l_grant_write_lock); + + xlog_grant_head_init(&log->l_reserve_head); + xlog_grant_head_init(&log->l_write_head); error = EFSCORRUPTED; if (xfs_sb_version_hassector(&mp->m_sb)) { @@ -1280,7 +1314,7 @@ xlog_grant_push_ail( ASSERT(BTOBB(need_bytes) < log->l_logBBsize); - free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); + free_bytes = xlog_space_left(log, &log->l_reserve_head.grant); free_blocks = BTOBBT(free_bytes); /* @@ -1412,8 +1446,8 @@ xlog_sync(xlog_t *log, roundoff < BBTOB(1))); /* move grant heads by roundoff in sync */ - xlog_grant_add_space(log, &log->l_grant_reserve_head, roundoff); - xlog_grant_add_space(log, &log->l_grant_write_head, roundoff); + xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff); + xlog_grant_add_space(log, &log->l_write_head.grant, roundoff); /* put cycle number in every block */ xlog_pack_data(log, iclog, roundoff); @@ -2566,119 +2600,6 @@ restart: return 0; } /* xlog_state_get_iclog_space */ -/* - * Atomically get the log space required for a log ticket. - * - * Once a ticket gets put onto the reserveq, it will only return after the - * needed reservation is satisfied. - * - * This function is structured so that it has a lock free fast path. This is - * necessary because every new transaction reservation will come through this - * path. Hence any lock will be globally hot if we take it unconditionally on - * every pass. - * - * As tickets are only ever moved on and off the reserveq under the - * l_grant_reserve_lock, we only need to take that lock if we are going to add - * the ticket to the queue and sleep. We can avoid taking the lock if the ticket - * was never added to the reserveq because the t_queue list head will be empty - * and we hold the only reference to it so it can safely be checked unlocked. - */ -STATIC int -xlog_grant_log_space( - struct log *log, - struct xlog_ticket *tic) -{ - int free_bytes, need_bytes; - int error = 0; - - ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); - - trace_xfs_log_grant_enter(log, tic); - - /* - * If there are other waiters on the queue then give them a chance at - * logspace before us. Wake up the first waiters, if we do not wake - * up all the waiters then go to sleep waiting for more free space, - * otherwise try to get some space for this transaction. - */ - need_bytes = tic->t_unit_res; - if (tic->t_flags & XFS_LOG_PERM_RESERV) - need_bytes *= tic->t_ocnt; - free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); - if (!list_empty_careful(&log->l_reserveq)) { - spin_lock(&log->l_grant_reserve_lock); - if (!xlog_reserveq_wake(log, &free_bytes) || - free_bytes < need_bytes) - error = xlog_reserveq_wait(log, tic, need_bytes); - spin_unlock(&log->l_grant_reserve_lock); - } else if (free_bytes < need_bytes) { - spin_lock(&log->l_grant_reserve_lock); - error = xlog_reserveq_wait(log, tic, need_bytes); - spin_unlock(&log->l_grant_reserve_lock); - } - if (error) - return error; - - xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes); - xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); - trace_xfs_log_grant_exit(log, tic); - xlog_verify_grant_tail(log); - return 0; -} - -/* - * Replenish the byte reservation required by moving the grant write head. - * - * Similar to xlog_grant_log_space, the function is structured to have a lock - * free fast path. - */ -STATIC int -xlog_regrant_write_log_space( - struct log *log, - struct xlog_ticket *tic) -{ - int free_bytes, need_bytes; - int error = 0; - - tic->t_curr_res = tic->t_unit_res; - xlog_tic_reset_res(tic); - - if (tic->t_cnt > 0) - return 0; - - ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); - - trace_xfs_log_regrant_write_enter(log, tic); - - /* - * If there are other waiters on the queue then give them a chance at - * logspace before us. Wake up the first waiters, if we do not wake - * up all the waiters then go to sleep waiting for more free space, - * otherwise try to get some space for this transaction. - */ - need_bytes = tic->t_unit_res; - free_bytes = xlog_space_left(log, &log->l_grant_write_head); - if (!list_empty_careful(&log->l_writeq)) { - spin_lock(&log->l_grant_write_lock); - if (!xlog_writeq_wake(log, &free_bytes) || - free_bytes < need_bytes) - error = xlog_writeq_wait(log, tic, need_bytes); - spin_unlock(&log->l_grant_write_lock); - } else if (free_bytes < need_bytes) { - spin_lock(&log->l_grant_write_lock); - error = xlog_writeq_wait(log, tic, need_bytes); - spin_unlock(&log->l_grant_write_lock); - } - - if (error) - return error; - - xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); - trace_xfs_log_regrant_write_exit(log, tic); - xlog_verify_grant_tail(log); - return 0; -} - /* The first cnt-1 times through here we don't need to * move the grant write head because the permanent * reservation has reserved cnt times the unit amount. @@ -2695,9 +2616,9 @@ xlog_regrant_reserve_log_space(xlog_t *log, if (ticket->t_cnt > 0) ticket->t_cnt--; - xlog_grant_sub_space(log, &log->l_grant_reserve_head, + xlog_grant_sub_space(log, &log->l_reserve_head.grant, ticket->t_curr_res); - xlog_grant_sub_space(log, &log->l_grant_write_head, + xlog_grant_sub_space(log, &log->l_write_head.grant, ticket->t_curr_res); ticket->t_curr_res = ticket->t_unit_res; xlog_tic_reset_res(ticket); @@ -2708,7 +2629,7 @@ xlog_regrant_reserve_log_space(xlog_t *log, if (ticket->t_cnt > 0) return; - xlog_grant_add_space(log, &log->l_grant_reserve_head, + xlog_grant_add_space(log, &log->l_reserve_head.grant, ticket->t_unit_res); trace_xfs_log_regrant_reserve_exit(log, ticket); @@ -2754,14 +2675,13 @@ xlog_ungrant_log_space(xlog_t *log, bytes += ticket->t_unit_res*ticket->t_cnt; } - xlog_grant_sub_space(log, &log->l_grant_reserve_head, bytes); - xlog_grant_sub_space(log, &log->l_grant_write_head, bytes); + xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes); + xlog_grant_sub_space(log, &log->l_write_head.grant, bytes); trace_xfs_log_ungrant_exit(log, ticket); - xfs_log_move_tail(log->l_mp, 1); -} /* xlog_ungrant_log_space */ - + xfs_log_space_wake(log->l_mp); +} /* * Flush iclog to disk if this is the last reference to the given iclog and @@ -3219,7 +3139,7 @@ xlog_ticket_alloc( int unit_bytes, int cnt, char client, - uint xflags, + bool permanent, int alloc_flags) { struct xlog_ticket *tic; @@ -3313,6 +3233,7 @@ xlog_ticket_alloc( } atomic_set(&tic->t_ref, 1); + tic->t_task = current; INIT_LIST_HEAD(&tic->t_queue); tic->t_unit_res = unit_bytes; tic->t_curr_res = unit_bytes; @@ -3322,9 +3243,8 @@ xlog_ticket_alloc( tic->t_clientid = client; tic->t_flags = XLOG_TIC_INITED; tic->t_trans_type = 0; - if (xflags & XFS_LOG_PERM_RESERV) + if (permanent) tic->t_flags |= XLOG_TIC_PERM_RESERV; - init_waitqueue_head(&tic->t_wait); xlog_tic_reset_res(tic); @@ -3380,7 +3300,7 @@ xlog_verify_grant_tail( int tail_cycle, tail_blocks; int cycle, space; - xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space); + xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &space); xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks); if (tail_cycle != cycle) { if (cycle - 1 != tail_cycle && @@ -3582,7 +3502,6 @@ xfs_log_force_umount( struct xfs_mount *mp, int logerror) { - xlog_ticket_t *tic; xlog_t *log; int retval; @@ -3650,15 +3569,8 @@ xfs_log_force_umount( * we don't enqueue anything once the SHUTDOWN flag is set, and this * action is protected by the grant locks. */ - spin_lock(&log->l_grant_reserve_lock); - list_for_each_entry(tic, &log->l_reserveq, t_queue) - wake_up(&tic->t_wait); - spin_unlock(&log->l_grant_reserve_lock); - - spin_lock(&log->l_grant_write_lock); - list_for_each_entry(tic, &log->l_writeq, t_queue) - wake_up(&tic->t_wait); - spin_unlock(&log->l_grant_write_lock); + xlog_grant_head_wake_all(&log->l_reserve_head); + xlog_grant_head_wake_all(&log->l_write_head); if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { ASSERT(!logerror); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 2aee3b2..2c622be 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -53,15 +53,6 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) #define XFS_LOG_REL_PERM_RESERV 0x1 /* - * Flags to xfs_log_reserve() - * - * XFS_LOG_PERM_RESERV: Permanent reservation. When writes are - * performed against this type of reservation, the reservation - * is not decreased. Long running transactions should use this. - */ -#define XFS_LOG_PERM_RESERV 0x2 - -/* * Flags to xfs_log_force() * * XFS_LOG_SYNC: Synchronous force in-core log to disk @@ -160,8 +151,8 @@ int xfs_log_mount(struct xfs_mount *mp, xfs_daddr_t start_block, int num_bblocks); int xfs_log_mount_finish(struct xfs_mount *mp); -void xfs_log_move_tail(struct xfs_mount *mp, - xfs_lsn_t tail_lsn); +xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); +void xfs_log_space_wake(struct xfs_mount *mp); int xfs_log_notify(struct xfs_mount *mp, struct xlog_in_core *iclog, xfs_log_callback_t *callback_entry); @@ -172,8 +163,9 @@ int xfs_log_reserve(struct xfs_mount *mp, int count, struct xlog_ticket **ticket, __uint8_t clientid, - uint flags, + bool permanent, uint t_type); +int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); int xfs_log_unmount_write(struct xfs_mount *mp); void xfs_log_unmount(struct xfs_mount *mp); int xfs_log_force_umount(struct xfs_mount *mp, int logerror); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 2d3b6a4..2152900 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -239,8 +239,8 @@ typedef struct xlog_res { } xlog_res_t; typedef struct xlog_ticket { - wait_queue_head_t t_wait; /* ticket wait queue */ struct list_head t_queue; /* reserve/write queue */ + struct task_struct *t_task; /* task that owns this ticket */ xlog_tid_t t_tid; /* transaction identifier : 4 */ atomic_t t_ref; /* ticket reference count : 4 */ int t_curr_res; /* current reservation in bytes : 4 */ @@ -470,6 +470,16 @@ struct xfs_cil { #define XLOG_CIL_HARD_SPACE_LIMIT(log) (3 * (log->l_logsize >> 4)) /* + * ticket grant locks, queues and accounting have their own cachlines + * as these are quite hot and can be operated on concurrently. + */ +struct xlog_grant_head { + spinlock_t lock ____cacheline_aligned_in_smp; + struct list_head waiters; + atomic64_t grant; +}; + +/* * The reservation head lsn is not made up of a cycle number and block number. * Instead, it uses a cycle number and byte number. Logs don't expect to * overflow 31 bits worth of byte offset, so using a byte number will mean @@ -520,17 +530,8 @@ typedef struct log { /* lsn of 1st LR with unflushed * buffers */ atomic64_t l_tail_lsn ____cacheline_aligned_in_smp; - /* - * ticket grant locks, queues and accounting have their own cachlines - * as these are quite hot and can be operated on concurrently. - */ - spinlock_t l_grant_reserve_lock ____cacheline_aligned_in_smp; - struct list_head l_reserveq; - atomic64_t l_grant_reserve_head; - - spinlock_t l_grant_write_lock ____cacheline_aligned_in_smp; - struct list_head l_writeq; - atomic64_t l_grant_write_head; + struct xlog_grant_head l_reserve_head; + struct xlog_grant_head l_write_head; /* The following field are used for debugging; need to hold icloglock */ #ifdef DEBUG @@ -545,14 +546,13 @@ typedef struct log { #define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) /* common routines */ -extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); extern int xlog_recover(xlog_t *log); extern int xlog_recover_finish(xlog_t *log); extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); extern kmem_zone_t *xfs_log_ticket_zone; struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes, - int count, char client, uint xflags, + int count, char client, bool permanent, int alloc_flags); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 0ed9ee7..7c75c73 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -965,9 +965,9 @@ xlog_find_tail( log->l_curr_cycle++; atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn)); - xlog_assign_grant_head(&log->l_grant_reserve_head, log->l_curr_cycle, + xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle, BBTOB(log->l_curr_block)); - xlog_assign_grant_head(&log->l_grant_write_head, log->l_curr_cycle, + xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle, BBTOB(log->l_curr_block)); /* @@ -3695,7 +3695,7 @@ xlog_do_recover( /* Convert superblock from on-disk format */ sbp = &log->l_mp->m_sb; - xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); + xfs_sb_from_disk(log->l_mp, XFS_BUF_TO_SBP(bp)); ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); ASSERT(xfs_sb_good_version(sbp)); xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index d06afbc..1ffead4b 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -158,7 +158,7 @@ xfs_uuid_mount( out_duplicate: mutex_unlock(&xfs_uuid_table_mutex); - xfs_warn(mp, "Filesystem has duplicate UUID - can't mount"); + xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid); return XFS_ERROR(EINVAL); } @@ -553,9 +553,11 @@ out_unwind: void xfs_sb_from_disk( - xfs_sb_t *to, + struct xfs_mount *mp, xfs_dsb_t *from) { + struct xfs_sb *to = &mp->m_sb; + to->sb_magicnum = be32_to_cpu(from->sb_magicnum); to->sb_blocksize = be32_to_cpu(from->sb_blocksize); to->sb_dblocks = be64_to_cpu(from->sb_dblocks); @@ -693,7 +695,7 @@ reread: * Initialize the mount structure from the superblock. * But first do some basic consistency checking. */ - xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); + xfs_sb_from_disk(mp, XFS_BUF_TO_SBP(bp)); error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags); if (error) { if (loud) diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 19f69e2..9eba738 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -211,6 +211,9 @@ typedef struct xfs_mount { struct shrinker m_inode_shrink; /* inode reclaim shrinker */ int64_t m_low_space[XFS_LOWSP_MAX]; /* low free space thresholds */ + + struct workqueue_struct *m_data_workqueue; + struct workqueue_struct *m_unwritten_workqueue; } xfs_mount_t; /* @@ -395,7 +398,7 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *); extern void xfs_mod_sb(struct xfs_trans *, __int64_t); extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t, xfs_agnumber_t *); -extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *); +extern void xfs_sb_from_disk(struct xfs_mount *, struct xfs_dsb *); extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index c436def..55c6afe 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -48,194 +48,189 @@ * quota functionality, including maintaining the freelist and hash * tables of dquots. */ -struct mutex xfs_Gqm_lock; -struct xfs_qm *xfs_Gqm; - -kmem_zone_t *qm_dqzone; -kmem_zone_t *qm_dqtrxzone; - -STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int); -STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); - STATIC int xfs_qm_init_quotainos(xfs_mount_t *); STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); STATIC int xfs_qm_shake(struct shrinker *, struct shrink_control *); -static struct shrinker xfs_qm_shaker = { - .shrink = xfs_qm_shake, - .seeks = DEFAULT_SEEKS, -}; - /* - * Initialize the XQM structure. - * Note that there is not one quota manager per file system. + * We use the batch lookup interface to iterate over the dquots as it + * currently is the only interface into the radix tree code that allows + * fuzzy lookups instead of exact matches. Holding the lock over multiple + * operations is fine as all callers are used either during mount/umount + * or quotaoff. */ -STATIC struct xfs_qm * -xfs_Gqm_init(void) +#define XFS_DQ_LOOKUP_BATCH 32 + +STATIC int +xfs_qm_dquot_walk( + struct xfs_mount *mp, + int type, + int (*execute)(struct xfs_dquot *dqp)) { - xfs_dqhash_t *udqhash, *gdqhash; - xfs_qm_t *xqm; - size_t hsize; - uint i; + struct xfs_quotainfo *qi = mp->m_quotainfo; + struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type); + uint32_t next_index; + int last_error = 0; + int skipped; + int nr_found; + +restart: + skipped = 0; + next_index = 0; + nr_found = 0; + + while (1) { + struct xfs_dquot *batch[XFS_DQ_LOOKUP_BATCH]; + int error = 0; + int i; + + mutex_lock(&qi->qi_tree_lock); + nr_found = radix_tree_gang_lookup(tree, (void **)batch, + next_index, XFS_DQ_LOOKUP_BATCH); + if (!nr_found) { + mutex_unlock(&qi->qi_tree_lock); + break; + } - /* - * Initialize the dquot hash tables. - */ - udqhash = kmem_zalloc_greedy(&hsize, - XFS_QM_HASHSIZE_LOW * sizeof(xfs_dqhash_t), - XFS_QM_HASHSIZE_HIGH * sizeof(xfs_dqhash_t)); - if (!udqhash) - goto out; + for (i = 0; i < nr_found; i++) { + struct xfs_dquot *dqp = batch[i]; - gdqhash = kmem_zalloc_large(hsize); - if (!gdqhash) - goto out_free_udqhash; + next_index = be32_to_cpu(dqp->q_core.d_id) + 1; - hsize /= sizeof(xfs_dqhash_t); + error = execute(batch[i]); + if (error == EAGAIN) { + skipped++; + continue; + } + if (error && last_error != EFSCORRUPTED) + last_error = error; + } - xqm = kmem_zalloc(sizeof(xfs_qm_t), KM_SLEEP); - xqm->qm_dqhashmask = hsize - 1; - xqm->qm_usr_dqhtable = udqhash; - xqm->qm_grp_dqhtable = gdqhash; - ASSERT(xqm->qm_usr_dqhtable != NULL); - ASSERT(xqm->qm_grp_dqhtable != NULL); + mutex_unlock(&qi->qi_tree_lock); - for (i = 0; i < hsize; i++) { - xfs_qm_list_init(&(xqm->qm_usr_dqhtable[i]), "uxdqh", i); - xfs_qm_list_init(&(xqm->qm_grp_dqhtable[i]), "gxdqh", i); + /* bail out if the filesystem is corrupted. */ + if (last_error == EFSCORRUPTED) { + skipped = 0; + break; + } } - /* - * Freelist of all dquots of all file systems - */ - INIT_LIST_HEAD(&xqm->qm_dqfrlist); - xqm->qm_dqfrlist_cnt = 0; - mutex_init(&xqm->qm_dqfrlist_lock); - - /* - * dquot zone. we register our own low-memory callback. - */ - if (!qm_dqzone) { - xqm->qm_dqzone = kmem_zone_init(sizeof(xfs_dquot_t), - "xfs_dquots"); - qm_dqzone = xqm->qm_dqzone; - } else - xqm->qm_dqzone = qm_dqzone; - - register_shrinker(&xfs_qm_shaker); - - /* - * The t_dqinfo portion of transactions. - */ - if (!qm_dqtrxzone) { - xqm->qm_dqtrxzone = kmem_zone_init(sizeof(xfs_dquot_acct_t), - "xfs_dqtrx"); - qm_dqtrxzone = xqm->qm_dqtrxzone; - } else - xqm->qm_dqtrxzone = qm_dqtrxzone; - - atomic_set(&xqm->qm_totaldquots, 0); - xqm->qm_nrefs = 0; - return xqm; + if (skipped) { + delay(1); + goto restart; + } - out_free_udqhash: - kmem_free_large(udqhash); - out: - return NULL; + return last_error; } + /* - * Destroy the global quota manager when its reference count goes to zero. + * Purge a dquot from all tracking data structures and free it. */ -STATIC void -xfs_qm_destroy( - struct xfs_qm *xqm) +STATIC int +xfs_qm_dqpurge( + struct xfs_dquot *dqp) { - int hsize, i; + struct xfs_mount *mp = dqp->q_mount; + struct xfs_quotainfo *qi = mp->m_quotainfo; + struct xfs_dquot *gdqp = NULL; - ASSERT(xqm != NULL); - ASSERT(xqm->qm_nrefs == 0); + xfs_dqlock(dqp); + if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) { + xfs_dqunlock(dqp); + return EAGAIN; + } - unregister_shrinker(&xfs_qm_shaker); + /* + * If this quota has a group hint attached, prepare for releasing it + * now. + */ + gdqp = dqp->q_gdquot; + if (gdqp) { + xfs_dqlock(gdqp); + dqp->q_gdquot = NULL; + } - mutex_lock(&xqm->qm_dqfrlist_lock); - ASSERT(list_empty(&xqm->qm_dqfrlist)); - mutex_unlock(&xqm->qm_dqfrlist_lock); + dqp->dq_flags |= XFS_DQ_FREEING; - hsize = xqm->qm_dqhashmask + 1; - for (i = 0; i < hsize; i++) { - xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i])); - xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i])); + /* + * If we're turning off quotas, we have to make sure that, for + * example, we don't delete quota disk blocks while dquots are + * in the process of getting written to those disk blocks. + * This dquot might well be on AIL, and we can't leave it there + * if we're turning off quotas. Basically, we need this flush + * lock, and are willing to block on it. + */ + if (!xfs_dqflock_nowait(dqp)) { + /* + * Block on the flush lock after nudging dquot buffer, + * if it is incore. + */ + xfs_dqflock_pushbuf_wait(dqp); } - kmem_free_large(xqm->qm_usr_dqhtable); - kmem_free_large(xqm->qm_grp_dqhtable); - xqm->qm_usr_dqhtable = NULL; - xqm->qm_grp_dqhtable = NULL; - xqm->qm_dqhashmask = 0; - kmem_free(xqm); -} - -/* - * Called at mount time to let XQM know that another file system is - * starting quotas. This isn't crucial information as the individual mount - * structures are pretty independent, but it helps the XQM keep a - * global view of what's going on. - */ -/* ARGSUSED */ -STATIC int -xfs_qm_hold_quotafs_ref( - struct xfs_mount *mp) -{ /* - * Need to lock the xfs_Gqm structure for things like this. For example, - * the structure could disappear between the entry to this routine and - * a HOLD operation if not locked. + * If we are turning this type of quotas off, we don't care + * about the dirty metadata sitting in this dquot. OTOH, if + * we're unmounting, we do care, so we flush it and wait. */ - mutex_lock(&xfs_Gqm_lock); + if (XFS_DQ_IS_DIRTY(dqp)) { + int error; - if (!xfs_Gqm) { - xfs_Gqm = xfs_Gqm_init(); - if (!xfs_Gqm) { - mutex_unlock(&xfs_Gqm_lock); - return ENOMEM; - } + /* + * We don't care about getting disk errors here. We need + * to purge this dquot anyway, so we go ahead regardless. + */ + error = xfs_qm_dqflush(dqp, SYNC_WAIT); + if (error) + xfs_warn(mp, "%s: dquot %p flush failed", + __func__, dqp); + xfs_dqflock(dqp); } + ASSERT(atomic_read(&dqp->q_pincount) == 0); + ASSERT(XFS_FORCED_SHUTDOWN(mp) || + !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); + + xfs_dqfunlock(dqp); + xfs_dqunlock(dqp); + + radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags), + be32_to_cpu(dqp->q_core.d_id)); + qi->qi_dquots--; + /* - * We can keep a list of all filesystems with quotas mounted for - * debugging and statistical purposes, but ... - * Just take a reference and get out. + * We move dquots to the freelist as soon as their reference count + * hits zero, so it really should be on the freelist here. */ - xfs_Gqm->qm_nrefs++; - mutex_unlock(&xfs_Gqm_lock); + mutex_lock(&qi->qi_lru_lock); + ASSERT(!list_empty(&dqp->q_lru)); + list_del_init(&dqp->q_lru); + qi->qi_lru_count--; + XFS_STATS_DEC(xs_qm_dquot_unused); + mutex_unlock(&qi->qi_lru_lock); + xfs_qm_dqdestroy(dqp); + + if (gdqp) + xfs_qm_dqput(gdqp); return 0; } - /* - * Release the reference that a filesystem took at mount time, - * so that we know when we need to destroy the entire quota manager. + * Purge the dquot cache. */ -/* ARGSUSED */ -STATIC void -xfs_qm_rele_quotafs_ref( - struct xfs_mount *mp) +void +xfs_qm_dqpurge_all( + struct xfs_mount *mp, + uint flags) { - ASSERT(xfs_Gqm); - ASSERT(xfs_Gqm->qm_nrefs > 0); - - /* - * Destroy the entire XQM. If somebody mounts with quotaon, this'll - * be restarted. - */ - mutex_lock(&xfs_Gqm_lock); - if (--xfs_Gqm->qm_nrefs == 0) { - xfs_qm_destroy(xfs_Gqm); - xfs_Gqm = NULL; - } - mutex_unlock(&xfs_Gqm_lock); + if (flags & XFS_QMOPT_UQUOTA) + xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge); + if (flags & XFS_QMOPT_GQUOTA) + xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge); + if (flags & XFS_QMOPT_PQUOTA) + xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge); } /* @@ -376,175 +371,6 @@ xfs_qm_unmount_quotas( } } -/* - * Flush all dquots of the given file system to disk. The dquots are - * _not_ purged from memory here, just their data written to disk. - */ -STATIC int -xfs_qm_dqflush_all( - struct xfs_mount *mp) -{ - struct xfs_quotainfo *q = mp->m_quotainfo; - int recl; - struct xfs_dquot *dqp; - int error; - - if (!q) - return 0; -again: - mutex_lock(&q->qi_dqlist_lock); - list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { - xfs_dqlock(dqp); - if ((dqp->dq_flags & XFS_DQ_FREEING) || - !XFS_DQ_IS_DIRTY(dqp)) { - xfs_dqunlock(dqp); - continue; - } - - /* XXX a sentinel would be better */ - recl = q->qi_dqreclaims; - if (!xfs_dqflock_nowait(dqp)) { - /* - * If we can't grab the flush lock then check - * to see if the dquot has been flushed delayed - * write. If so, grab its buffer and send it - * out immediately. We'll be able to acquire - * the flush lock when the I/O completes. - */ - xfs_dqflock_pushbuf_wait(dqp); - } - /* - * Let go of the mplist lock. We don't want to hold it - * across a disk write. - */ - mutex_unlock(&q->qi_dqlist_lock); - error = xfs_qm_dqflush(dqp, 0); - xfs_dqunlock(dqp); - if (error) - return error; - - mutex_lock(&q->qi_dqlist_lock); - if (recl != q->qi_dqreclaims) { - mutex_unlock(&q->qi_dqlist_lock); - /* XXX restart limit */ - goto again; - } - } - - mutex_unlock(&q->qi_dqlist_lock); - /* return ! busy */ - return 0; -} - -/* - * Release the group dquot pointers the user dquots may be - * carrying around as a hint. mplist is locked on entry and exit. - */ -STATIC void -xfs_qm_detach_gdquots( - struct xfs_mount *mp) -{ - struct xfs_quotainfo *q = mp->m_quotainfo; - struct xfs_dquot *dqp, *gdqp; - - again: - ASSERT(mutex_is_locked(&q->qi_dqlist_lock)); - list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { - xfs_dqlock(dqp); - if (dqp->dq_flags & XFS_DQ_FREEING) { - xfs_dqunlock(dqp); - mutex_unlock(&q->qi_dqlist_lock); - delay(1); - mutex_lock(&q->qi_dqlist_lock); - goto again; - } - - gdqp = dqp->q_gdquot; - if (gdqp) - dqp->q_gdquot = NULL; - xfs_dqunlock(dqp); - - if (gdqp) - xfs_qm_dqrele(gdqp); - } -} - -/* - * Go through all the incore dquots of this file system and take them - * off the mplist and hashlist, if the dquot type matches the dqtype - * parameter. This is used when turning off quota accounting for - * users and/or groups, as well as when the filesystem is unmounting. - */ -STATIC int -xfs_qm_dqpurge_int( - struct xfs_mount *mp, - uint flags) -{ - struct xfs_quotainfo *q = mp->m_quotainfo; - struct xfs_dquot *dqp, *n; - uint dqtype; - int nmisses = 0; - LIST_HEAD (dispose_list); - - if (!q) - return 0; - - dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0; - dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0; - dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0; - - mutex_lock(&q->qi_dqlist_lock); - - /* - * In the first pass through all incore dquots of this filesystem, - * we release the group dquot pointers the user dquots may be - * carrying around as a hint. We need to do this irrespective of - * what's being turned off. - */ - xfs_qm_detach_gdquots(mp); - - /* - * Try to get rid of all of the unwanted dquots. - */ - list_for_each_entry_safe(dqp, n, &q->qi_dqlist, q_mplist) { - xfs_dqlock(dqp); - if ((dqp->dq_flags & dqtype) != 0 && - !(dqp->dq_flags & XFS_DQ_FREEING)) { - if (dqp->q_nrefs == 0) { - dqp->dq_flags |= XFS_DQ_FREEING; - list_move_tail(&dqp->q_mplist, &dispose_list); - } else - nmisses++; - } - xfs_dqunlock(dqp); - } - mutex_unlock(&q->qi_dqlist_lock); - - list_for_each_entry_safe(dqp, n, &dispose_list, q_mplist) - xfs_qm_dqpurge(dqp); - - return nmisses; -} - -int -xfs_qm_dqpurge_all( - xfs_mount_t *mp, - uint flags) -{ - int ndquots; - - /* - * Purge the dquot cache. - * None of the dquots should really be busy at this point. - */ - if (mp->m_quotainfo) { - while ((ndquots = xfs_qm_dqpurge_int(mp, flags))) { - delay(ndquots * 10); - } - } - return 0; -} - STATIC int xfs_qm_dqattach_one( xfs_inode_t *ip, @@ -783,14 +609,6 @@ xfs_qm_dqdetach( } /* - * The hash chains and the mplist use the same xfs_dqhash structure as - * their list head, but we can take the mplist qh_lock and one of the - * hash qh_locks at the same time without any problem as they aren't - * related. - */ -static struct lock_class_key xfs_quota_mplist_class; - -/* * This initializes all the quota information that's kept in the * mount structure */ @@ -804,13 +622,6 @@ xfs_qm_init_quotainfo( ASSERT(XFS_IS_QUOTA_RUNNING(mp)); - /* - * Tell XQM that we exist as soon as possible. - */ - if ((error = xfs_qm_hold_quotafs_ref(mp))) { - return error; - } - qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); /* @@ -823,11 +634,13 @@ xfs_qm_init_quotainfo( return error; } - INIT_LIST_HEAD(&qinf->qi_dqlist); - mutex_init(&qinf->qi_dqlist_lock); - lockdep_set_class(&qinf->qi_dqlist_lock, &xfs_quota_mplist_class); + INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS); + INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS); + mutex_init(&qinf->qi_tree_lock); - qinf->qi_dqreclaims = 0; + INIT_LIST_HEAD(&qinf->qi_lru_list); + qinf->qi_lru_count = 0; + mutex_init(&qinf->qi_lru_lock); /* mutex used to serialize quotaoffs */ mutex_init(&qinf->qi_quotaofflock); @@ -894,6 +707,9 @@ xfs_qm_init_quotainfo( qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT; } + qinf->qi_shrinker.shrink = xfs_qm_shake; + qinf->qi_shrinker.seeks = DEFAULT_SEEKS; + register_shrinker(&qinf->qi_shrinker); return 0; } @@ -911,17 +727,8 @@ xfs_qm_destroy_quotainfo( qi = mp->m_quotainfo; ASSERT(qi != NULL); - ASSERT(xfs_Gqm != NULL); - - /* - * Release the reference that XQM kept, so that we know - * when the XQM structure should be freed. We cannot assume - * that xfs_Gqm is non-null after this point. - */ - xfs_qm_rele_quotafs_ref(mp); - ASSERT(list_empty(&qi->qi_dqlist)); - mutex_destroy(&qi->qi_dqlist_lock); + unregister_shrinker(&qi->qi_shrinker); if (qi->qi_uquotaip) { IRELE(qi->qi_uquotaip); @@ -936,30 +743,6 @@ xfs_qm_destroy_quotainfo( mp->m_quotainfo = NULL; } - - -/* ------------------- PRIVATE STATIC FUNCTIONS ----------------------- */ - -/* ARGSUSED */ -STATIC void -xfs_qm_list_init( - xfs_dqlist_t *list, - char *str, - int n) -{ - mutex_init(&list->qh_lock); - INIT_LIST_HEAD(&list->qh_list); - list->qh_version = 0; - list->qh_nelems = 0; -} - -STATIC void -xfs_qm_list_destroy( - xfs_dqlist_t *list) -{ - mutex_destroy(&(list->qh_lock)); -} - /* * Create an inode and return with a reference already taken, but unlocked * This is how we create quota inodes @@ -1397,6 +1180,28 @@ error0: return error; } +STATIC int +xfs_qm_flush_one( + struct xfs_dquot *dqp) +{ + int error = 0; + + xfs_dqlock(dqp); + if (dqp->dq_flags & XFS_DQ_FREEING) + goto out_unlock; + if (!XFS_DQ_IS_DIRTY(dqp)) + goto out_unlock; + + if (!xfs_dqflock_nowait(dqp)) + xfs_dqflock_pushbuf_wait(dqp); + + error = xfs_qm_dqflush(dqp, 0); + +out_unlock: + xfs_dqunlock(dqp); + return error; +} + /* * Walk thru all the filesystem inodes and construct a consistent view * of the disk quota world. If the quotacheck fails, disable quotas. @@ -1405,7 +1210,7 @@ int xfs_qm_quotacheck( xfs_mount_t *mp) { - int done, count, error; + int done, count, error, error2; xfs_ino_t lastino; size_t structsz; xfs_inode_t *uip, *gip; @@ -1419,12 +1224,6 @@ xfs_qm_quotacheck( ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip); ASSERT(XFS_IS_QUOTA_RUNNING(mp)); - /* - * There should be no cached dquots. The (simplistic) quotacheck - * algorithm doesn't like that. - */ - ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist)); - xfs_notice(mp, "Quotacheck needed: Please wait."); /* @@ -1463,12 +1262,21 @@ xfs_qm_quotacheck( } while (!done); /* - * We've made all the changes that we need to make incore. - * Flush them down to disk buffers if everything was updated - * successfully. + * We've made all the changes that we need to make incore. Flush them + * down to disk buffers if everything was updated successfully. */ - if (!error) - error = xfs_qm_dqflush_all(mp); + if (XFS_IS_UQUOTA_ON(mp)) + error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one); + if (XFS_IS_GQUOTA_ON(mp)) { + error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one); + if (!error) + error = error2; + } + if (XFS_IS_PQUOTA_ON(mp)) { + error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one); + if (!error) + error = error2; + } /* * We can get this error if we couldn't do a dquot allocation inside @@ -1496,7 +1304,7 @@ xfs_qm_quotacheck( * quotachecked status, since we won't be doing accounting for * that type anymore. */ - mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD); + mp->m_qflags &= ~XFS_ALL_QUOTA_CHKD; mp->m_qflags |= flags; error_return: @@ -1508,7 +1316,6 @@ xfs_qm_quotacheck( * We must turn off quotas. */ ASSERT(mp->m_quotainfo != NULL); - ASSERT(xfs_Gqm != NULL); xfs_qm_destroy_quotainfo(mp); if (xfs_mount_reset_sbqflags(mp)) { xfs_warn(mp, @@ -1604,16 +1411,12 @@ xfs_qm_dqfree_one( struct xfs_mount *mp = dqp->q_mount; struct xfs_quotainfo *qi = mp->m_quotainfo; - mutex_lock(&dqp->q_hash->qh_lock); - list_del_init(&dqp->q_hashlist); - dqp->q_hash->qh_version++; - mutex_unlock(&dqp->q_hash->qh_lock); + mutex_lock(&qi->qi_tree_lock); + radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags), + be32_to_cpu(dqp->q_core.d_id)); - mutex_lock(&qi->qi_dqlist_lock); - list_del_init(&dqp->q_mplist); qi->qi_dquots--; - qi->qi_dqreclaims++; - mutex_unlock(&qi->qi_dqlist_lock); + mutex_unlock(&qi->qi_tree_lock); xfs_qm_dqdestroy(dqp); } @@ -1624,6 +1427,7 @@ xfs_qm_dqreclaim_one( struct list_head *dispose_list) { struct xfs_mount *mp = dqp->q_mount; + struct xfs_quotainfo *qi = mp->m_quotainfo; int error; if (!xfs_dqlock_nowait(dqp)) @@ -1637,16 +1441,14 @@ xfs_qm_dqreclaim_one( xfs_dqunlock(dqp); trace_xfs_dqreclaim_want(dqp); - XQM_STATS_INC(xqmstats.xs_qm_dqwants); + XFS_STATS_INC(xs_qm_dqwants); - list_del_init(&dqp->q_freelist); - xfs_Gqm->qm_dqfrlist_cnt--; + list_del_init(&dqp->q_lru); + qi->qi_lru_count--; + XFS_STATS_DEC(xs_qm_dquot_unused); return; } - ASSERT(dqp->q_hash); - ASSERT(!list_empty(&dqp->q_mplist)); - /* * Try to grab the flush lock. If this dquot is in the process of * getting flushed to disk, we don't want to reclaim it. @@ -1688,11 +1490,12 @@ xfs_qm_dqreclaim_one( xfs_dqunlock(dqp); ASSERT(dqp->q_nrefs == 0); - list_move_tail(&dqp->q_freelist, dispose_list); - xfs_Gqm->qm_dqfrlist_cnt--; + list_move_tail(&dqp->q_lru, dispose_list); + qi->qi_lru_count--; + XFS_STATS_DEC(xs_qm_dquot_unused); trace_xfs_dqreclaim_done(dqp); - XQM_STATS_INC(xqmstats.xs_qm_dqreclaims); + XFS_STATS_INC(xs_qm_dqreclaims); return; out_busy: @@ -1701,10 +1504,10 @@ out_busy: /* * Move the dquot to the tail of the list so that we don't spin on it. */ - list_move_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist); + list_move_tail(&dqp->q_lru, &qi->qi_lru_list); trace_xfs_dqreclaim_busy(dqp); - XQM_STATS_INC(xqmstats.xs_qm_dqreclaim_misses); + XFS_STATS_INC(xs_qm_dqreclaim_misses); } STATIC int @@ -1712,6 +1515,8 @@ xfs_qm_shake( struct shrinker *shrink, struct shrink_control *sc) { + struct xfs_quotainfo *qi = + container_of(shrink, struct xfs_quotainfo, qi_shrinker); int nr_to_scan = sc->nr_to_scan; LIST_HEAD (dispose_list); struct xfs_dquot *dqp; @@ -1721,24 +1526,23 @@ xfs_qm_shake( if (!nr_to_scan) goto out; - mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); - while (!list_empty(&xfs_Gqm->qm_dqfrlist)) { + mutex_lock(&qi->qi_lru_lock); + while (!list_empty(&qi->qi_lru_list)) { if (nr_to_scan-- <= 0) break; - dqp = list_first_entry(&xfs_Gqm->qm_dqfrlist, struct xfs_dquot, - q_freelist); + dqp = list_first_entry(&qi->qi_lru_list, struct xfs_dquot, + q_lru); xfs_qm_dqreclaim_one(dqp, &dispose_list); } - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); + mutex_unlock(&qi->qi_lru_lock); while (!list_empty(&dispose_list)) { - dqp = list_first_entry(&dispose_list, struct xfs_dquot, - q_freelist); - list_del_init(&dqp->q_freelist); + dqp = list_first_entry(&dispose_list, struct xfs_dquot, q_lru); + list_del_init(&dqp->q_lru); xfs_qm_dqfree_one(dqp); } out: - return (xfs_Gqm->qm_dqfrlist_cnt / 100) * sysctl_vfs_cache_pressure; + return (qi->qi_lru_count / 100) * sysctl_vfs_cache_pressure; } /* diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 9a9b997..44b858b 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -21,21 +21,10 @@ #include "xfs_dquot_item.h" #include "xfs_dquot.h" #include "xfs_quota_priv.h" -#include "xfs_qm_stats.h" -struct xfs_qm; struct xfs_inode; -extern struct mutex xfs_Gqm_lock; -extern struct xfs_qm *xfs_Gqm; -extern kmem_zone_t *qm_dqzone; -extern kmem_zone_t *qm_dqtrxzone; - -/* - * Dquot hashtable constants/threshold values. - */ -#define XFS_QM_HASHSIZE_LOW (PAGE_SIZE / sizeof(xfs_dqhash_t)) -#define XFS_QM_HASHSIZE_HIGH ((PAGE_SIZE * 4) / sizeof(xfs_dqhash_t)) +extern struct kmem_zone *xfs_qm_dqtrxzone; /* * This defines the unit of allocation of dquots. @@ -48,36 +37,20 @@ extern kmem_zone_t *qm_dqtrxzone; */ #define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 -typedef xfs_dqhash_t xfs_dqlist_t; - -/* - * Quota Manager (global) structure. Lives only in core. - */ -typedef struct xfs_qm { - xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */ - xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */ - uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */ - struct list_head qm_dqfrlist; /* freelist of dquots */ - struct mutex qm_dqfrlist_lock; - int qm_dqfrlist_cnt; - atomic_t qm_totaldquots; /* total incore dquots */ - uint qm_nrefs; /* file systems with quota on */ - kmem_zone_t *qm_dqzone; /* dquot mem-alloc zone */ - kmem_zone_t *qm_dqtrxzone; /* t_dqinfo of transactions */ -} xfs_qm_t; - /* * Various quota information for individual filesystems. * The mount structure keeps a pointer to this. */ typedef struct xfs_quotainfo { + struct radix_tree_root qi_uquota_tree; + struct radix_tree_root qi_gquota_tree; + struct mutex qi_tree_lock; xfs_inode_t *qi_uquotaip; /* user quota inode */ xfs_inode_t *qi_gquotaip; /* group quota inode */ - struct list_head qi_dqlist; /* all dquots in filesys */ - struct mutex qi_dqlist_lock; + struct list_head qi_lru_list; + struct mutex qi_lru_lock; + int qi_lru_count; int qi_dquots; - int qi_dqreclaims; /* a change here indicates - a removal in the dqlist */ time_t qi_btimelimit; /* limit for blks timer */ time_t qi_itimelimit; /* limit for inodes timer */ time_t qi_rtbtimelimit;/* limit for rt blks timer */ @@ -93,8 +66,14 @@ typedef struct xfs_quotainfo { xfs_qcnt_t qi_isoftlimit; /* default inode count soft limit */ xfs_qcnt_t qi_rtbhardlimit;/* default realtime blk hard limit */ xfs_qcnt_t qi_rtbsoftlimit;/* default realtime blk soft limit */ + struct shrinker qi_shrinker; } xfs_quotainfo_t; +#define XFS_DQUOT_TREE(qi, type) \ + ((type & XFS_DQ_USER) ? \ + &((qi)->qi_uquota_tree) : \ + &((qi)->qi_gquota_tree)) + extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long); extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *, @@ -130,7 +109,7 @@ extern int xfs_qm_quotacheck(xfs_mount_t *); extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t); /* dquot stuff */ -extern int xfs_qm_dqpurge_all(xfs_mount_t *, uint); +extern void xfs_qm_dqpurge_all(xfs_mount_t *, uint); extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint); /* quota ops */ diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index a0a829a..e6986b5 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -40,28 +40,28 @@ STATIC void xfs_fill_statvfs_from_dquot( struct kstatfs *statp, - xfs_disk_dquot_t *dp) + struct xfs_dquot *dqp) { __uint64_t limit; - limit = dp->d_blk_softlimit ? - be64_to_cpu(dp->d_blk_softlimit) : - be64_to_cpu(dp->d_blk_hardlimit); + limit = dqp->q_core.d_blk_softlimit ? + be64_to_cpu(dqp->q_core.d_blk_softlimit) : + be64_to_cpu(dqp->q_core.d_blk_hardlimit); if (limit && statp->f_blocks > limit) { statp->f_blocks = limit; statp->f_bfree = statp->f_bavail = - (statp->f_blocks > be64_to_cpu(dp->d_bcount)) ? - (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0; + (statp->f_blocks > dqp->q_res_bcount) ? + (statp->f_blocks - dqp->q_res_bcount) : 0; } - limit = dp->d_ino_softlimit ? - be64_to_cpu(dp->d_ino_softlimit) : - be64_to_cpu(dp->d_ino_hardlimit); + limit = dqp->q_core.d_ino_softlimit ? + be64_to_cpu(dqp->q_core.d_ino_softlimit) : + be64_to_cpu(dqp->q_core.d_ino_hardlimit); if (limit && statp->f_files > limit) { statp->f_files = limit; statp->f_ffree = - (statp->f_files > be64_to_cpu(dp->d_icount)) ? - (statp->f_ffree - be64_to_cpu(dp->d_icount)) : 0; + (statp->f_files > dqp->q_res_icount) ? + (statp->f_ffree - dqp->q_res_icount) : 0; } } @@ -82,7 +82,7 @@ xfs_qm_statvfs( xfs_dquot_t *dqp; if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) { - xfs_fill_statvfs_from_dquot(statp, &dqp->q_core); + xfs_fill_statvfs_from_dquot(statp, dqp); xfs_qm_dqput(dqp); } } @@ -156,21 +156,3 @@ xfs_qm_newmount( return 0; } - -void __init -xfs_qm_init(void) -{ - printk(KERN_INFO "SGI XFS Quota Management subsystem\n"); - mutex_init(&xfs_Gqm_lock); - xfs_qm_init_procfs(); -} - -void __exit -xfs_qm_exit(void) -{ - xfs_qm_cleanup_procfs(); - if (qm_dqzone) - kmem_zone_destroy(qm_dqzone); - if (qm_dqtrxzone) - kmem_zone_destroy(qm_dqtrxzone); -} diff --git a/fs/xfs/xfs_qm_stats.c b/fs/xfs/xfs_qm_stats.c deleted file mode 100644 index 5729ba5..0000000 --- a/fs/xfs/xfs_qm_stats.c +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright (c) 2000-2003 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_alloc.h" -#include "xfs_quota.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_inode.h" -#include "xfs_itable.h" -#include "xfs_bmap.h" -#include "xfs_rtalloc.h" -#include "xfs_error.h" -#include "xfs_attr.h" -#include "xfs_buf_item.h" -#include "xfs_qm.h" - -struct xqmstats xqmstats; - -static int xqm_proc_show(struct seq_file *m, void *v) -{ - /* maximum; incore; ratio free to inuse; freelist */ - seq_printf(m, "%d\t%d\t%d\t%u\n", - 0, - xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0, - 0, - xfs_Gqm? xfs_Gqm->qm_dqfrlist_cnt : 0); - return 0; -} - -static int xqm_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, xqm_proc_show, NULL); -} - -static const struct file_operations xqm_proc_fops = { - .owner = THIS_MODULE, - .open = xqm_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int xqmstat_proc_show(struct seq_file *m, void *v) -{ - /* quota performance statistics */ - seq_printf(m, "qm %u %u %u %u %u %u %u %u\n", - xqmstats.xs_qm_dqreclaims, - xqmstats.xs_qm_dqreclaim_misses, - xqmstats.xs_qm_dquot_dups, - xqmstats.xs_qm_dqcachemisses, - xqmstats.xs_qm_dqcachehits, - xqmstats.xs_qm_dqwants, - xqmstats.xs_qm_dqshake_reclaims, - xqmstats.xs_qm_dqinact_reclaims); - return 0; -} - -static int xqmstat_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, xqmstat_proc_show, NULL); -} - -static const struct file_operations xqmstat_proc_fops = { - .owner = THIS_MODULE, - .open = xqmstat_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -void -xfs_qm_init_procfs(void) -{ - proc_create("fs/xfs/xqmstat", 0, NULL, &xqmstat_proc_fops); - proc_create("fs/xfs/xqm", 0, NULL, &xqm_proc_fops); -} - -void -xfs_qm_cleanup_procfs(void) -{ - remove_proc_entry("fs/xfs/xqm", NULL); - remove_proc_entry("fs/xfs/xqmstat", NULL); -} diff --git a/fs/xfs/xfs_qm_stats.h b/fs/xfs/xfs_qm_stats.h deleted file mode 100644 index 5b964fc..0000000 --- a/fs/xfs/xfs_qm_stats.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2002 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_QM_STATS_H__ -#define __XFS_QM_STATS_H__ - -#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF) - -/* - * XQM global statistics - */ -struct xqmstats { - __uint32_t xs_qm_dqreclaims; - __uint32_t xs_qm_dqreclaim_misses; - __uint32_t xs_qm_dquot_dups; - __uint32_t xs_qm_dqcachemisses; - __uint32_t xs_qm_dqcachehits; - __uint32_t xs_qm_dqwants; - __uint32_t xs_qm_dqshake_reclaims; - __uint32_t xs_qm_dqinact_reclaims; -}; - -extern struct xqmstats xqmstats; - -# define XQM_STATS_INC(count) ( (count)++ ) - -extern void xfs_qm_init_procfs(void); -extern void xfs_qm_cleanup_procfs(void); - -#else - -# define XQM_STATS_INC(count) do { } while (0) - -static inline void xfs_qm_init_procfs(void) { }; -static inline void xfs_qm_cleanup_procfs(void) { }; - -#endif - -#endif /* __XFS_QM_STATS_H__ */ diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 711a86e..c4f396e 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -47,9 +47,6 @@ STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, uint); STATIC uint xfs_qm_export_flags(uint); STATIC uint xfs_qm_export_qtype_flags(uint); -STATIC void xfs_qm_export_dquot(xfs_mount_t *, xfs_disk_dquot_t *, - fs_disk_quota_t *); - /* * Turn off quota accounting and/or enforcement for all udquots and/or @@ -69,7 +66,6 @@ xfs_qm_scall_quotaoff( int error; uint inactivate_flags; xfs_qoff_logitem_t *qoffstart; - int nculprits; /* * No file system can have quotas enabled on disk but not in core. @@ -175,18 +171,13 @@ xfs_qm_scall_quotaoff( * This isn't protected by a particular lock directly, because we * don't want to take a mrlock every time we depend on quotas being on. */ - mp->m_qflags &= ~(flags); + mp->m_qflags &= ~flags; /* * Go through all the dquots of this file system and purge them, - * according to what was turned off. We may not be able to get rid - * of all dquots, because dquots can have temporary references that - * are not attached to inodes. eg. xfs_setattr, xfs_create. - * So, if we couldn't purge all the dquots from the filesystem, - * we can't get rid of the incore data structures. + * according to what was turned off. */ - while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype))) - delay(10 * nculprits); + xfs_qm_dqpurge_all(mp, dqtype); /* * Transactions that had started before ACTIVE state bit was cleared @@ -635,42 +626,6 @@ xfs_qm_scall_setqlim( return error; } -int -xfs_qm_scall_getquota( - xfs_mount_t *mp, - xfs_dqid_t id, - uint type, - fs_disk_quota_t *out) -{ - xfs_dquot_t *dqp; - int error; - - /* - * Try to get the dquot. We don't want it allocated on disk, so - * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't - * exist, we'll get ENOENT back. - */ - if ((error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp))) { - return (error); - } - - /* - * If everything's NULL, this dquot doesn't quite exist as far as - * our utility programs are concerned. - */ - if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { - xfs_qm_dqput(dqp); - return XFS_ERROR(ENOENT); - } - /* - * Convert the disk dquot to the exportable format - */ - xfs_qm_export_dquot(mp, &dqp->q_core, out); - xfs_qm_dqput(dqp); - return (error ? XFS_ERROR(EFAULT) : 0); -} - - STATIC int xfs_qm_log_quotaoff_end( xfs_mount_t *mp, @@ -759,50 +714,66 @@ error0: } -/* - * Translate an internal style on-disk-dquot to the exportable format. - * The main differences are that the counters/limits are all in Basic - * Blocks (BBs) instead of the internal FSBs, and all on-disk data has - * to be converted to the native endianness. - */ -STATIC void -xfs_qm_export_dquot( - xfs_mount_t *mp, - xfs_disk_dquot_t *src, +int +xfs_qm_scall_getquota( + struct xfs_mount *mp, + xfs_dqid_t id, + uint type, struct fs_disk_quota *dst) { + struct xfs_dquot *dqp; + int error; + + /* + * Try to get the dquot. We don't want it allocated on disk, so + * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't + * exist, we'll get ENOENT back. + */ + error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp); + if (error) + return error; + + /* + * If everything's NULL, this dquot doesn't quite exist as far as + * our utility programs are concerned. + */ + if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { + error = XFS_ERROR(ENOENT); + goto out_put; + } + memset(dst, 0, sizeof(*dst)); - dst->d_version = FS_DQUOT_VERSION; /* different from src->d_version */ - dst->d_flags = xfs_qm_export_qtype_flags(src->d_flags); - dst->d_id = be32_to_cpu(src->d_id); + dst->d_version = FS_DQUOT_VERSION; + dst->d_flags = xfs_qm_export_qtype_flags(dqp->q_core.d_flags); + dst->d_id = be32_to_cpu(dqp->q_core.d_id); dst->d_blk_hardlimit = - XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_blk_hardlimit)); + XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit)); dst->d_blk_softlimit = - XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_blk_softlimit)); - dst->d_ino_hardlimit = be64_to_cpu(src->d_ino_hardlimit); - dst->d_ino_softlimit = be64_to_cpu(src->d_ino_softlimit); - dst->d_bcount = XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_bcount)); - dst->d_icount = be64_to_cpu(src->d_icount); - dst->d_btimer = be32_to_cpu(src->d_btimer); - dst->d_itimer = be32_to_cpu(src->d_itimer); - dst->d_iwarns = be16_to_cpu(src->d_iwarns); - dst->d_bwarns = be16_to_cpu(src->d_bwarns); + XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_blk_softlimit)); + dst->d_ino_hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); + dst->d_ino_softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit); + dst->d_bcount = XFS_FSB_TO_BB(mp, dqp->q_res_bcount); + dst->d_icount = dqp->q_res_icount; + dst->d_btimer = be32_to_cpu(dqp->q_core.d_btimer); + dst->d_itimer = be32_to_cpu(dqp->q_core.d_itimer); + dst->d_iwarns = be16_to_cpu(dqp->q_core.d_iwarns); + dst->d_bwarns = be16_to_cpu(dqp->q_core.d_bwarns); dst->d_rtb_hardlimit = - XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtb_hardlimit)); + XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_rtb_hardlimit)); dst->d_rtb_softlimit = - XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtb_softlimit)); - dst->d_rtbcount = XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtbcount)); - dst->d_rtbtimer = be32_to_cpu(src->d_rtbtimer); - dst->d_rtbwarns = be16_to_cpu(src->d_rtbwarns); + XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_rtb_softlimit)); + dst->d_rtbcount = XFS_FSB_TO_BB(mp, dqp->q_res_rtbcount); + dst->d_rtbtimer = be32_to_cpu(dqp->q_core.d_rtbtimer); + dst->d_rtbwarns = be16_to_cpu(dqp->q_core.d_rtbwarns); /* * Internally, we don't reset all the timers when quota enforcement * gets turned off. No need to confuse the user level code, * so return zeroes in that case. */ - if ((!XFS_IS_UQUOTA_ENFORCED(mp) && src->d_flags == XFS_DQ_USER) || + if ((!XFS_IS_UQUOTA_ENFORCED(mp) && dqp->q_core.d_flags == XFS_DQ_USER) || (!XFS_IS_OQUOTA_ENFORCED(mp) && - (src->d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) { + (dqp->q_core.d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) { dst->d_btimer = 0; dst->d_itimer = 0; dst->d_rtbtimer = 0; @@ -823,6 +794,9 @@ xfs_qm_export_dquot( } } #endif +out_put: + xfs_qm_dqput(dqp); + return error; } STATIC uint diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 8a0807e..b50ec5b 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -174,6 +174,8 @@ typedef struct xfs_qoff_logformat { #define XFS_UQUOTA_ACTIVE 0x0100 /* uquotas are being turned off */ #define XFS_PQUOTA_ACTIVE 0x0200 /* pquotas are being turned off */ #define XFS_GQUOTA_ACTIVE 0x0400 /* gquotas are being turned off */ +#define XFS_ALL_QUOTA_ACTIVE \ + (XFS_UQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE) /* * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees diff --git a/fs/xfs/xfs_quota_priv.h b/fs/xfs/xfs_quota_priv.h index 94a3d92..6d86219 100644 --- a/fs/xfs/xfs_quota_priv.h +++ b/fs/xfs/xfs_quota_priv.h @@ -24,17 +24,6 @@ */ #define XFS_DQITER_MAP_SIZE 10 -/* - * Hash into a bucket in the dquot hash table, based on <mp, id>. - */ -#define XFS_DQ_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \ - (__psunsigned_t)(id)) & \ - (xfs_Gqm->qm_dqhashmask - 1)) -#define XFS_DQ_HASH(mp, id, type) (type == XFS_DQ_USER ? \ - (xfs_Gqm->qm_usr_dqhtable + \ - XFS_DQ_HASHVAL(mp, id)) : \ - (xfs_Gqm->qm_grp_dqhtable + \ - XFS_DQ_HASHVAL(mp, id))) #define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ !dqp->q_core.d_blk_hardlimit && \ !dqp->q_core.d_blk_softlimit && \ diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index cb6ae71..f429d9d 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h @@ -529,7 +529,6 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp) #define XFS_BB_TO_FSB(mp,bb) \ (((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log) #define XFS_BB_TO_FSBT(mp,bb) ((bb) >> (mp)->m_blkbb_log) -#define XFS_BB_FSB_OFFSET(mp,bb) ((bb) & ((mp)->m_bsize - 1)) /* * File system block to byte conversions. diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 76fdc58..ce372b7 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -20,9 +20,18 @@ DEFINE_PER_CPU(struct xfsstats, xfsstats); +static int counter_val(int idx) +{ + int val = 0, cpu; + + for_each_possible_cpu(cpu) + val += *(((__u32 *)&per_cpu(xfsstats, cpu) + idx)); + return val; +} + static int xfs_stat_proc_show(struct seq_file *m, void *v) { - int c, i, j, val; + int i, j; __uint64_t xs_xstrat_bytes = 0; __uint64_t xs_write_bytes = 0; __uint64_t xs_read_bytes = 0; @@ -50,20 +59,16 @@ static int xfs_stat_proc_show(struct seq_file *m, void *v) { "abtc2", XFSSTAT_END_ABTC_V2 }, { "bmbt2", XFSSTAT_END_BMBT_V2 }, { "ibt2", XFSSTAT_END_IBT_V2 }, + /* we print both series of quota information together */ + { "qm", XFSSTAT_END_QM }, }; /* Loop over all stats groups */ - for (i=j = 0; i < ARRAY_SIZE(xstats); i++) { + for (i = j = 0; i < ARRAY_SIZE(xstats); i++) { seq_printf(m, "%s", xstats[i].desc); /* inner loop does each group */ - while (j < xstats[i].endpoint) { - val = 0; - /* sum over all cpus */ - for_each_possible_cpu(c) - val += *(((__u32*)&per_cpu(xfsstats, c) + j)); - seq_printf(m, " %u", val); - j++; - } + for (; j < xstats[i].endpoint; j++) + seq_printf(m, " %u", counter_val(j)); seq_putc(m, '\n'); } /* extra precision counters */ @@ -97,6 +102,58 @@ static const struct file_operations xfs_stat_proc_fops = { .release = single_release, }; +/* legacy quota interfaces */ +#ifdef CONFIG_XFS_QUOTA +static int xqm_proc_show(struct seq_file *m, void *v) +{ + /* maximum; incore; ratio free to inuse; freelist */ + seq_printf(m, "%d\t%d\t%d\t%u\n", + 0, + counter_val(XFSSTAT_END_XQMSTAT), + 0, + counter_val(XFSSTAT_END_XQMSTAT + 1)); + return 0; +} + +static int xqm_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, xqm_proc_show, NULL); +} + +static const struct file_operations xqm_proc_fops = { + .owner = THIS_MODULE, + .open = xqm_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* legacy quota stats interface no 2 */ +static int xqmstat_proc_show(struct seq_file *m, void *v) +{ + int j; + + seq_printf(m, "qm"); + for (j = XFSSTAT_END_IBT_V2; j < XFSSTAT_END_XQMSTAT; j++) + seq_printf(m, " %u", counter_val(j)); + seq_putc(m, '\n'); + return 0; +} + +static int xqmstat_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, xqmstat_proc_show, NULL); +} + +static const struct file_operations xqmstat_proc_fops = { + .owner = THIS_MODULE, + .open = xqmstat_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_XFS_QUOTA */ + int xfs_init_procfs(void) { @@ -105,10 +162,24 @@ xfs_init_procfs(void) if (!proc_create("fs/xfs/stat", 0, NULL, &xfs_stat_proc_fops)) - goto out_remove_entry; + goto out_remove_xfs_dir; +#ifdef CONFIG_XFS_QUOTA + if (!proc_create("fs/xfs/xqmstat", 0, NULL, + &xqmstat_proc_fops)) + goto out_remove_stat_file; + if (!proc_create("fs/xfs/xqm", 0, NULL, + &xqm_proc_fops)) + goto out_remove_xqmstat_file; +#endif return 0; - out_remove_entry: +#ifdef CONFIG_XFS_QUOTA + out_remove_xqmstat_file: + remove_proc_entry("fs/xfs/xqmstat", NULL); + out_remove_stat_file: + remove_proc_entry("fs/xfs/stat", NULL); +#endif + out_remove_xfs_dir: remove_proc_entry("fs/xfs", NULL); out: return -ENOMEM; @@ -117,6 +188,10 @@ xfs_init_procfs(void) void xfs_cleanup_procfs(void) { +#ifdef CONFIG_XFS_QUOTA + remove_proc_entry("fs/xfs/xqm", NULL); + remove_proc_entry("fs/xfs/xqmstat", NULL); +#endif remove_proc_entry("fs/xfs/stat", NULL); remove_proc_entry("fs/xfs", NULL); } diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 736854b..c03ad38 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -183,6 +183,16 @@ struct xfsstats { __uint32_t xs_ibt_2_alloc; __uint32_t xs_ibt_2_free; __uint32_t xs_ibt_2_moves; +#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_IBT_V2+6) + __uint32_t xs_qm_dqreclaims; + __uint32_t xs_qm_dqreclaim_misses; + __uint32_t xs_qm_dquot_dups; + __uint32_t xs_qm_dqcachemisses; + __uint32_t xs_qm_dqcachehits; + __uint32_t xs_qm_dqwants; +#define XFSSTAT_END_QM (XFSSTAT_END_XQMSTAT+2) + __uint32_t xs_qm_dquot; + __uint32_t xs_qm_dquot_unused; /* Extra precision counters */ __uint64_t xs_xstrat_bytes; __uint64_t xs_write_bytes; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index baf40e3..912442c 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -324,10 +324,9 @@ xfs_parseargs( } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) { mp->m_flags |= XFS_MOUNT_FILESTREAMS; } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) { - mp->m_qflags &= ~(XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE | - XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | - XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | - XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD); + mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT; + mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD; + mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE; } else if (!strcmp(this_char, MNTOPT_QUOTA) || !strcmp(this_char, MNTOPT_UQUOTA) || !strcmp(this_char, MNTOPT_USRQUOTA)) { @@ -760,6 +759,36 @@ xfs_setup_devices( return 0; } +STATIC int +xfs_init_mount_workqueues( + struct xfs_mount *mp) +{ + mp->m_data_workqueue = alloc_workqueue("xfs-data/%s", + WQ_MEM_RECLAIM, 0, mp->m_fsname); + if (!mp->m_data_workqueue) + goto out; + + mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s", + WQ_MEM_RECLAIM, 0, mp->m_fsname); + if (!mp->m_unwritten_workqueue) + goto out_destroy_data_iodone_queue; + + return 0; + +out_destroy_data_iodone_queue: + destroy_workqueue(mp->m_data_workqueue); +out: + return -ENOMEM; +} + +STATIC void +xfs_destroy_mount_workqueues( + struct xfs_mount *mp) +{ + destroy_workqueue(mp->m_data_workqueue); + destroy_workqueue(mp->m_unwritten_workqueue); +} + /* Catch misguided souls that try to use this interface on XFS */ STATIC struct inode * xfs_fs_alloc_inode( @@ -834,91 +863,58 @@ xfs_fs_inode_init_once( } /* - * Dirty the XFS inode when mark_inode_dirty_sync() is called so that - * we catch unlogged VFS level updates to the inode. + * This is called by the VFS when dirtying inode metadata. This can happen + * for a few reasons, but we only care about timestamp updates, given that + * we handled the rest ourselves. In theory no other calls should happen, + * but for example generic_write_end() keeps dirtying the inode after + * updating i_size. Thus we check that the flags are exactly I_DIRTY_SYNC, + * and skip this call otherwise. * - * We need the barrier() to maintain correct ordering between unlogged - * updates and the transaction commit code that clears the i_update_core - * field. This requires all updates to be completed before marking the - * inode dirty. + * We'll hopefull get a different method just for updating timestamps soon, + * at which point this hack can go away, and maybe we'll also get real + * error handling here. */ STATIC void xfs_fs_dirty_inode( - struct inode *inode, - int flags) -{ - barrier(); - XFS_I(inode)->i_update_core = 1; -} - -STATIC int -xfs_fs_write_inode( struct inode *inode, - struct writeback_control *wbc) + int flags) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - int error = EAGAIN; - - trace_xfs_write_inode(ip); - - if (XFS_FORCED_SHUTDOWN(mp)) - return -XFS_ERROR(EIO); - - if (wbc->sync_mode == WB_SYNC_ALL || wbc->for_kupdate) { - /* - * Make sure the inode has made it it into the log. Instead - * of forcing it all the way to stable storage using a - * synchronous transaction we let the log force inside the - * ->sync_fs call do that for thus, which reduces the number - * of synchronous log forces dramatically. - */ - error = xfs_log_dirty_inode(ip, NULL, 0); - if (error) - goto out; - return 0; - } else { - if (!ip->i_update_core) - return 0; + struct xfs_trans *tp; + int error; - /* - * We make this non-blocking if the inode is contended, return - * EAGAIN to indicate to the caller that they did not succeed. - * This prevents the flush path from blocking on inodes inside - * another operation right now, they get caught later by - * xfs_sync. - */ - if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) - goto out; + if (flags != I_DIRTY_SYNC) + return; - if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) - goto out_unlock; + trace_xfs_dirty_inode(ip); - /* - * Now we have the flush lock and the inode is not pinned, we - * can check if the inode is really clean as we know that - * there are no pending transaction completions, it is not - * waiting on the delayed write queue and there is no IO in - * progress. - */ - if (xfs_inode_clean(ip)) { - xfs_ifunlock(ip); - error = 0; - goto out_unlock; - } - error = xfs_iflush(ip, SYNC_TRYLOCK); + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); + error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + goto trouble; } - - out_unlock: - xfs_iunlock(ip, XFS_ILOCK_SHARED); - out: + xfs_ilock(ip, XFS_ILOCK_EXCL); /* - * if we failed to write out the inode then mark - * it dirty again so we'll try again later. + * Grab all the latest timestamps from the Linux inode. */ + ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; + ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; + ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec; + ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec; + ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec; + ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec; + + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP); + error = xfs_trans_commit(tp, 0); if (error) - xfs_mark_inode_dirty_sync(ip); - return -error; + goto trouble; + return; + +trouble: + xfs_warn(mp, "failed to update timestamps for inode 0x%llx", ip->i_ino); } STATIC void @@ -983,6 +979,7 @@ xfs_fs_put_super( xfs_unmountfs(mp); xfs_freesb(mp); xfs_icsb_destroy_counters(mp); + xfs_destroy_mount_workqueues(mp); xfs_close_devices(mp); xfs_free_fsname(mp); kfree(mp); @@ -1309,10 +1306,14 @@ xfs_fs_fill_super( if (error) goto out_free_fsname; - error = xfs_icsb_init_counters(mp); + error = xfs_init_mount_workqueues(mp); if (error) goto out_close_devices; + error = xfs_icsb_init_counters(mp); + if (error) + goto out_destroy_workqueues; + error = xfs_readsb(mp, flags); if (error) goto out_destroy_counters; @@ -1376,6 +1377,8 @@ xfs_fs_fill_super( xfs_freesb(mp); out_destroy_counters: xfs_icsb_destroy_counters(mp); +out_destroy_workqueues: + xfs_destroy_mount_workqueues(mp); out_close_devices: xfs_close_devices(mp); out_free_fsname: @@ -1429,7 +1432,6 @@ static const struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, .dirty_inode = xfs_fs_dirty_inode, - .write_inode = xfs_fs_write_inode, .evict_inode = xfs_fs_evict_inode, .put_super = xfs_fs_put_super, .sync_fs = xfs_fs_sync_fs, @@ -1651,13 +1653,17 @@ init_xfs_fs(void) if (error) goto out_cleanup_procfs; - vfs_initquota(); + error = xfs_qm_init(); + if (error) + goto out_sysctl_unregister; error = register_filesystem(&xfs_fs_type); if (error) - goto out_sysctl_unregister; + goto out_qm_exit; return 0; + out_qm_exit: + xfs_qm_exit(); out_sysctl_unregister: xfs_sysctl_unregister(); out_cleanup_procfs: @@ -1679,7 +1685,7 @@ init_xfs_fs(void) STATIC void __exit exit_xfs_fs(void) { - vfs_exitquota(); + xfs_qm_exit(); unregister_filesystem(&xfs_fs_type); xfs_sysctl_unregister(); xfs_cleanup_procfs(); diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 50a3266..09b0c26 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -21,13 +21,11 @@ #include <linux/exportfs.h> #ifdef CONFIG_XFS_QUOTA -extern void xfs_qm_init(void); +extern int xfs_qm_init(void); extern void xfs_qm_exit(void); -# define vfs_initquota() xfs_qm_init() -# define vfs_exitquota() xfs_qm_exit() #else -# define vfs_initquota() do { } while (0) -# define vfs_exitquota() do { } while (0) +# define xfs_qm_init() (0) +# define xfs_qm_exit() do { } while (0) #endif #ifdef CONFIG_XFS_POSIX_ACL diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 40b75ee..205ebcb 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -336,32 +336,6 @@ xfs_sync_fsdata( return error; } -int -xfs_log_dirty_inode( - struct xfs_inode *ip, - struct xfs_perag *pag, - int flags) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_trans *tp; - int error; - - if (!ip->i_update_core) - return 0; - - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - return error; - } - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - return xfs_trans_commit(tp, 0); -} - /* * When remounting a filesystem read-only or freezing the filesystem, we have * two phases to execute. This first phase is syncing the data before we @@ -385,16 +359,6 @@ xfs_quiesce_data( { int error, error2 = 0; - /* - * Log all pending size and timestamp updates. The vfs writeback - * code is supposed to do this, but due to its overagressive - * livelock detection it will skip inodes where appending writes - * were written out in the first non-blocking sync phase if their - * completion took long enough that it happened after taking the - * timestamp for the cut-off in the blocking phase. - */ - xfs_inode_ag_iterator(mp, xfs_log_dirty_inode, 0); - /* force out the log */ xfs_log_force(mp, XFS_LOG_SYNC); @@ -913,17 +877,15 @@ reclaim: * can reference the inodes in the cache without taking references. * * We make that OK here by ensuring that we wait until the inode is - * unlocked after the lookup before we go ahead and free it. We get - * both the ilock and the iolock because the code may need to drop the - * ilock one but will still hold the iolock. + * unlocked after the lookup before we go ahead and free it. */ - xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_qm_dqdetach(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_inode_free(ip); - return error; + return error; } /* diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h index fa96547..941202e 100644 --- a/fs/xfs/xfs_sync.h +++ b/fs/xfs/xfs_sync.h @@ -34,8 +34,6 @@ void xfs_quiesce_attr(struct xfs_mount *mp); void xfs_flush_inodes(struct xfs_inode *ip); -int xfs_log_dirty_inode(struct xfs_inode *ip, struct xfs_perag *pag, int flags); - int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); int xfs_reclaim_inodes_count(struct xfs_mount *mp); void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index bb134a8..75eb54a 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -580,7 +580,7 @@ DEFINE_INODE_EVENT(xfs_ioctl_setattr); DEFINE_INODE_EVENT(xfs_dir_fsync); DEFINE_INODE_EVENT(xfs_file_fsync); DEFINE_INODE_EVENT(xfs_destroy_inode); -DEFINE_INODE_EVENT(xfs_write_inode); +DEFINE_INODE_EVENT(xfs_dirty_inode); DEFINE_INODE_EVENT(xfs_evict_inode); DEFINE_INODE_EVENT(xfs_dquot_dqalloc); @@ -741,10 +741,10 @@ DEFINE_DQUOT_EVENT(xfs_dqalloc); DEFINE_DQUOT_EVENT(xfs_dqtobp_read); DEFINE_DQUOT_EVENT(xfs_dqread); DEFINE_DQUOT_EVENT(xfs_dqread_fail); -DEFINE_DQUOT_EVENT(xfs_dqlookup_found); -DEFINE_DQUOT_EVENT(xfs_dqlookup_done); DEFINE_DQUOT_EVENT(xfs_dqget_hit); DEFINE_DQUOT_EVENT(xfs_dqget_miss); +DEFINE_DQUOT_EVENT(xfs_dqget_freeing); +DEFINE_DQUOT_EVENT(xfs_dqget_dup); DEFINE_DQUOT_EVENT(xfs_dqput); DEFINE_DQUOT_EVENT(xfs_dqput_wait); DEFINE_DQUOT_EVENT(xfs_dqput_free); @@ -782,12 +782,12 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, __entry->curr_res = tic->t_curr_res; __entry->unit_res = tic->t_unit_res; __entry->flags = tic->t_flags; - __entry->reserveq = list_empty(&log->l_reserveq); - __entry->writeq = list_empty(&log->l_writeq); - xlog_crack_grant_head(&log->l_grant_reserve_head, + __entry->reserveq = list_empty(&log->l_reserve_head.waiters); + __entry->writeq = list_empty(&log->l_write_head.waiters); + xlog_crack_grant_head(&log->l_reserve_head.grant, &__entry->grant_reserve_cycle, &__entry->grant_reserve_bytes); - xlog_crack_grant_head(&log->l_grant_write_head, + xlog_crack_grant_head(&log->l_write_head.grant, &__entry->grant_write_cycle, &__entry->grant_write_bytes); __entry->curr_cycle = log->l_curr_cycle; @@ -826,20 +826,14 @@ DEFINE_EVENT(xfs_loggrant_class, name, \ TP_ARGS(log, tic)) DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm); DEFINE_LOGGRANT_EVENT(xfs_log_done_perm); -DEFINE_LOGGRANT_EVENT(xfs_log_reserve); DEFINE_LOGGRANT_EVENT(xfs_log_umount_write); -DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter); -DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit); -DEFINE_LOGGRANT_EVENT(xfs_log_grant_error); DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep); DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake); DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up); +DEFINE_LOGGRANT_EVENT(xfs_log_reserve); +DEFINE_LOGGRANT_EVENT(xfs_log_reserve_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_exit); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 7adcdf1..103b00c 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -681,7 +681,6 @@ xfs_trans_reserve( uint flags, uint logcount) { - int log_flags; int error = 0; int rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; @@ -707,24 +706,32 @@ xfs_trans_reserve( * Reserve the log space needed for this transaction. */ if (logspace > 0) { - ASSERT((tp->t_log_res == 0) || (tp->t_log_res == logspace)); - ASSERT((tp->t_log_count == 0) || - (tp->t_log_count == logcount)); + bool permanent = false; + + ASSERT(tp->t_log_res == 0 || tp->t_log_res == logspace); + ASSERT(tp->t_log_count == 0 || tp->t_log_count == logcount); + if (flags & XFS_TRANS_PERM_LOG_RES) { - log_flags = XFS_LOG_PERM_RESERV; tp->t_flags |= XFS_TRANS_PERM_LOG_RES; + permanent = true; } else { ASSERT(tp->t_ticket == NULL); ASSERT(!(tp->t_flags & XFS_TRANS_PERM_LOG_RES)); - log_flags = 0; } - error = xfs_log_reserve(tp->t_mountp, logspace, logcount, - &tp->t_ticket, - XFS_TRANSACTION, log_flags, tp->t_type); - if (error) { - goto undo_blocks; + if (tp->t_ticket != NULL) { + ASSERT(flags & XFS_TRANS_PERM_LOG_RES); + error = xfs_log_regrant(tp->t_mountp, tp->t_ticket); + } else { + error = xfs_log_reserve(tp->t_mountp, logspace, + logcount, &tp->t_ticket, + XFS_TRANSACTION, permanent, + tp->t_type); } + + if (error) + goto undo_blocks; + tp->t_log_res = logspace; tp->t_log_count = logcount; } @@ -752,6 +759,8 @@ xfs_trans_reserve( */ undo_log: if (logspace > 0) { + int log_flags; + if (flags & XFS_TRANS_PERM_LOG_RES) { log_flags = XFS_LOG_REL_PERM_RESERV; } else { diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index ed9252b..1dead07 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -611,50 +611,6 @@ xfs_ail_push_all( } /* - * This is to be called when an item is unlocked that may have - * been in the AIL. It will wake up the first member of the AIL - * wait list if this item's unlocking might allow it to progress. - * If the item is in the AIL, then we need to get the AIL lock - * while doing our checking so we don't race with someone going - * to sleep waiting for this event in xfs_trans_push_ail(). - */ -void -xfs_trans_unlocked_item( - struct xfs_ail *ailp, - xfs_log_item_t *lip) -{ - xfs_log_item_t *min_lip; - - /* - * If we're forcibly shutting down, we may have - * unlocked log items arbitrarily. The last thing - * we want to do is to move the tail of the log - * over some potentially valid data. - */ - if (!(lip->li_flags & XFS_LI_IN_AIL) || - XFS_FORCED_SHUTDOWN(ailp->xa_mount)) { - return; - } - - /* - * This is the one case where we can call into xfs_ail_min() - * without holding the AIL lock because we only care about the - * case where we are at the tail of the AIL. If the object isn't - * at the tail, it doesn't matter what result we get back. This - * is slightly racy because since we were just unlocked, we could - * go to sleep between the call to xfs_ail_min and the call to - * xfs_log_move_tail, have someone else lock us, commit to us disk, - * move us out of the tail of the AIL, and then we wake up. However, - * the call to xfs_log_move_tail() doesn't do anything if there's - * not enough free space to wake people up so we're safe calling it. - */ - min_lip = xfs_ail_min(ailp); - - if (min_lip == lip) - xfs_log_move_tail(ailp->xa_mount, 1); -} /* xfs_trans_unlocked_item */ - -/* * xfs_trans_ail_update - bulk AIL insertion operation. * * @xfs_trans_ail_update takes an array of log items that all need to be @@ -685,7 +641,6 @@ xfs_trans_ail_update_bulk( xfs_lsn_t lsn) __releases(ailp->xa_lock) { xfs_log_item_t *mlip; - xfs_lsn_t tail_lsn; int mlip_changed = 0; int i; LIST_HEAD(tmp); @@ -712,22 +667,12 @@ xfs_trans_ail_update_bulk( if (!list_empty(&tmp)) xfs_ail_splice(ailp, cur, &tmp, lsn); + spin_unlock(&ailp->xa_lock); - if (!mlip_changed) { - spin_unlock(&ailp->xa_lock); - return; + if (mlip_changed && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) { + xlog_assign_tail_lsn(ailp->xa_mount); + xfs_log_space_wake(ailp->xa_mount); } - - /* - * It is not safe to access mlip after the AIL lock is dropped, so we - * must get a copy of li_lsn before we do so. This is especially - * important on 32-bit platforms where accessing and updating 64-bit - * values like li_lsn is not atomic. - */ - mlip = xfs_ail_min(ailp); - tail_lsn = mlip->li_lsn; - spin_unlock(&ailp->xa_lock); - xfs_log_move_tail(ailp->xa_mount, tail_lsn); } /* @@ -758,7 +703,6 @@ xfs_trans_ail_delete_bulk( int nr_items) __releases(ailp->xa_lock) { xfs_log_item_t *mlip; - xfs_lsn_t tail_lsn; int mlip_changed = 0; int i; @@ -785,23 +729,12 @@ xfs_trans_ail_delete_bulk( if (mlip == lip) mlip_changed = 1; } + spin_unlock(&ailp->xa_lock); - if (!mlip_changed) { - spin_unlock(&ailp->xa_lock); - return; + if (mlip_changed && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) { + xlog_assign_tail_lsn(ailp->xa_mount); + xfs_log_space_wake(ailp->xa_mount); } - - /* - * It is not safe to access mlip after the AIL lock is dropped, so we - * must get a copy of li_lsn before we do so. This is especially - * important on 32-bit platforms where accessing and updating 64-bit - * values like li_lsn is not atomic. It is possible we've emptied the - * AIL here, so if that is the case, pass an LSN of 0 to the tail move. - */ - mlip = xfs_ail_min(ailp); - tail_lsn = mlip ? mlip->li_lsn : 0; - spin_unlock(&ailp->xa_lock); - xfs_log_move_tail(ailp->xa_mount, tail_lsn); } /* diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 475a4de..1302d1d 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -463,19 +463,7 @@ xfs_trans_brelse(xfs_trans_t *tp, * Default to a normal brelse() call if the tp is NULL. */ if (tp == NULL) { - struct xfs_log_item *lip = bp->b_fspriv; - ASSERT(bp->b_transp == NULL); - - /* - * If there's a buf log item attached to the buffer, - * then let the AIL know that the buffer is being - * unlocked. - */ - if (lip != NULL && lip->li_type == XFS_LI_BUF) { - bip = bp->b_fspriv; - xfs_trans_unlocked_item(bip->bli_item.li_ailp, lip); - } xfs_buf_relse(bp); return; } @@ -550,21 +538,10 @@ xfs_trans_brelse(xfs_trans_t *tp, ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL)); ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF)); xfs_buf_item_relse(bp); - bip = NULL; - } - bp->b_transp = NULL; - - /* - * If we've still got a buf log item on the buffer, then - * tell the AIL that the buffer is being unlocked. - */ - if (bip != NULL) { - xfs_trans_unlocked_item(bip->bli_item.li_ailp, - (xfs_log_item_t*)bip); } + bp->b_transp = NULL; xfs_buf_relse(bp); - return; } /* diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index c4ba366..2790997 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -605,7 +605,7 @@ xfs_trans_dqresv( time_t timer; xfs_qwarncnt_t warns; xfs_qwarncnt_t warnlimit; - xfs_qcnt_t count; + xfs_qcnt_t total_count; xfs_qcnt_t *resbcountp; xfs_quotainfo_t *q = mp->m_quotainfo; @@ -648,13 +648,12 @@ xfs_trans_dqresv( * hardlimit or exceed the timelimit if we allocate * nblks. */ - if (hardlimit > 0ULL && - hardlimit < nblks + *resbcountp) { + total_count = *resbcountp + nblks; + if (hardlimit && total_count > hardlimit) { xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN); goto error_return; } - if (softlimit > 0ULL && - softlimit < nblks + *resbcountp) { + if (softlimit && total_count > softlimit) { if ((timer != 0 && get_seconds() > timer) || (warns != 0 && warns >= warnlimit)) { xfs_quota_warn(mp, dqp, @@ -666,7 +665,7 @@ xfs_trans_dqresv( } } if (ninos > 0) { - count = be64_to_cpu(dqp->q_core.d_icount); + total_count = be64_to_cpu(dqp->q_core.d_icount) + ninos; timer = be32_to_cpu(dqp->q_core.d_itimer); warns = be16_to_cpu(dqp->q_core.d_iwarns); warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit; @@ -677,13 +676,11 @@ xfs_trans_dqresv( if (!softlimit) softlimit = q->qi_isoftlimit; - if (hardlimit > 0ULL && - hardlimit < ninos + count) { + if (hardlimit && total_count > hardlimit) { xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN); goto error_return; } - if (softlimit > 0ULL && - softlimit < ninos + count) { + if (softlimit && total_count > softlimit) { if ((timer != 0 && get_seconds() > timer) || (warns != 0 && warns >= warnlimit)) { xfs_quota_warn(mp, dqp, @@ -878,7 +875,7 @@ STATIC void xfs_trans_alloc_dqinfo( xfs_trans_t *tp) { - tp->t_dqinfo = kmem_zone_zalloc(xfs_Gqm->qm_dqtrxzone, KM_SLEEP); + tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, KM_SLEEP); } void @@ -887,6 +884,6 @@ xfs_trans_free_dqinfo( { if (!tp->t_dqinfo) return; - kmem_zone_free(xfs_Gqm->qm_dqtrxzone, tp->t_dqinfo); + kmem_zone_free(xfs_qm_dqtrxzone, tp->t_dqinfo); tp->t_dqinfo = NULL; } diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index 32f0288..7a7442c 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -95,10 +95,14 @@ xfs_trans_ichgtime( if ((flags & XFS_ICHGTIME_MOD) && !timespec_equal(&inode->i_mtime, &tv)) { inode->i_mtime = tv; + ip->i_d.di_mtime.t_sec = tv.tv_sec; + ip->i_d.di_mtime.t_nsec = tv.tv_nsec; } if ((flags & XFS_ICHGTIME_CHG) && !timespec_equal(&inode->i_ctime, &tv)) { inode->i_ctime = tv; + ip->i_d.di_ctime.t_sec = tv.tv_sec; + ip->i_d.di_ctime.t_nsec = tv.tv_nsec; } } @@ -126,12 +130,12 @@ xfs_trans_log_inode( /* * Always OR in the bits from the ili_last_fields field. * This is to coordinate with the xfs_iflush() and xfs_iflush_done() - * routines in the eventual clearing of the ilf_fields bits. + * routines in the eventual clearing of the ili_fields bits. * See the big comment in xfs_iflush() for an explanation of * this coordination mechanism. */ flags |= ip->i_itemp->ili_last_fields; - ip->i_itemp->ili_format.ilf_fields |= flags; + ip->i_itemp->ili_fields |= flags; } #ifdef XFS_TRANS_DEBUG diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 44820b9..8ab2ced 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -104,9 +104,6 @@ void xfs_ail_push(struct xfs_ail *, xfs_lsn_t); void xfs_ail_push_all(struct xfs_ail *); xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp); -void xfs_trans_unlocked_item(struct xfs_ail *, - xfs_log_item_t *); - struct xfs_log_item * xfs_trans_ail_cursor_first(struct xfs_ail *ailp, struct xfs_ail_cursor *cur, xfs_lsn_t lsn); diff --git a/fs/xfs/xfs_vnode.h b/fs/xfs/xfs_vnode.h index 7c220b4..db14d0c0 100644 --- a/fs/xfs/xfs_vnode.h +++ b/fs/xfs/xfs_vnode.h @@ -22,7 +22,6 @@ struct file; struct xfs_inode; -struct xfs_iomap; struct attrlist_cursor_kern; /* diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 0c877cb..447e146 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -10,7 +10,6 @@ struct kiocb; struct pipe_inode_info; struct uio; struct xfs_inode; -struct xfs_iomap; int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, int flags); @@ -49,8 +48,6 @@ int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, int flags, struct attrlist_cursor_kern *cursor); -int xfs_bmap(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, - int flags, struct xfs_iomap *iomapp, int *niomaps); void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first, xfs_off_t last, int fiopt); int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first, |