diff options
Diffstat (limited to 'fs')
483 files changed, 16724 insertions, 13841 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 7af425f..8482f2d 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -156,7 +156,7 @@ int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid) return -EOPNOTSUPP; acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS); if (acl) { - retval = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + retval = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); if (retval) return retval; set_cached_acl(inode, ACL_TYPE_ACCESS, acl); @@ -200,7 +200,7 @@ int v9fs_acl_mode(struct inode *dir, umode_t *modep, if (acl) { if (S_ISDIR(mode)) *dpacl = posix_acl_dup(acl); - retval = posix_acl_create(&acl, GFP_NOFS, &mode); + retval = __posix_acl_create(&acl, GFP_NOFS, &mode); if (retval < 0) return retval; if (retval > 0) diff --git a/fs/9p/cache.c b/fs/9p/cache.c index 2b7a032..a69260f 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -239,13 +239,12 @@ void v9fs_cache_inode_flush_cookie(struct inode *inode) void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp) { struct v9fs_inode *v9inode = V9FS_I(inode); - struct p9_fid *fid; if (!v9inode->fscache) return; spin_lock(&v9inode->fscache_lock); - fid = filp->private_data; + if ((filp->f_flags & O_ACCMODE) != O_RDONLY) v9fs_cache_inode_flush_cookie(inode); else diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 08f2e1e..14da825 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -56,7 +56,7 @@ enum { /* Options that take no arguments */ Opt_nodevmap, /* Cache options */ - Opt_cache_loose, Opt_fscache, + Opt_cache_loose, Opt_fscache, Opt_mmap, /* Access options */ Opt_access, Opt_posixacl, /* Error token */ @@ -74,6 +74,7 @@ static const match_table_t tokens = { {Opt_cache, "cache=%s"}, {Opt_cache_loose, "loose"}, {Opt_fscache, "fscache"}, + {Opt_mmap, "mmap"}, {Opt_cachetag, "cachetag=%s"}, {Opt_access, "access=%s"}, {Opt_posixacl, "posixacl"}, @@ -91,6 +92,9 @@ static int get_cache_mode(char *s) } else if (!strcmp(s, "fscache")) { version = CACHE_FSCACHE; p9_debug(P9_DEBUG_9P, "Cache mode: fscache\n"); + } else if (!strcmp(s, "mmap")) { + version = CACHE_MMAP; + p9_debug(P9_DEBUG_9P, "Cache mode: mmap\n"); } else if (!strcmp(s, "none")) { version = CACHE_NONE; p9_debug(P9_DEBUG_9P, "Cache mode: none\n"); @@ -220,6 +224,9 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) case Opt_fscache: v9ses->cache = CACHE_FSCACHE; break; + case Opt_mmap: + v9ses->cache = CACHE_MMAP; + break; case Opt_cachetag: #ifdef CONFIG_9P_FSCACHE v9ses->cachetag = match_strdup(&args[0]); diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index a8e127c..099c771 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -64,6 +64,7 @@ enum p9_session_flags { enum p9_cache_modes { CACHE_NONE, + CACHE_MMAP, CACHE_LOOSE, CACHE_FSCACHE, }; diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index dc95a25..b83ebfb 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -50,6 +50,8 @@ extern const struct dentry_operations v9fs_dentry_operations; extern const struct dentry_operations v9fs_cached_dentry_operations; extern const struct file_operations v9fs_cached_file_operations; extern const struct file_operations v9fs_cached_file_operations_dotl; +extern const struct file_operations v9fs_mmap_file_operations; +extern const struct file_operations v9fs_mmap_file_operations_dotl; extern struct kmem_cache *v9fs_inode_cache; struct inode *v9fs_alloc_inode(struct super_block *sb); diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 9ff073f..c71e886 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -202,6 +202,8 @@ static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc) { int retval; + p9_debug(P9_DEBUG_VFS, "page %p\n", page); + retval = v9fs_vfs_writepage_locked(page); if (retval < 0) { if (retval == -EAGAIN) { @@ -282,6 +284,9 @@ static int v9fs_write_begin(struct file *filp, struct address_space *mapping, pgoff_t index = pos >> PAGE_CACHE_SHIFT; struct inode *inode = mapping->host; + + p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping); + v9inode = V9FS_I(inode); start: page = grab_cache_page_write_begin(mapping, index, flags); @@ -312,6 +317,8 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping, loff_t last_pos = pos + copied; struct inode *inode = page->mapping->host; + p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping); + if (unlikely(copied < len)) { /* * zero out the rest of the area diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index a0df3e7..a16b0ff 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -45,6 +45,7 @@ #include "cache.h" static const struct vm_operations_struct v9fs_file_vm_ops; +static const struct vm_operations_struct v9fs_mmap_file_vm_ops; /** * v9fs_file_open - open a file (or directory) @@ -87,7 +88,8 @@ int v9fs_file_open(struct inode *inode, struct file *file) file->private_data = fid; mutex_lock(&v9inode->v_mutex); - if (v9ses->cache && !v9inode->writeback_fid && + if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) && + !v9inode->writeback_fid && ((file->f_flags & O_ACCMODE) != O_RDONLY)) { /* * clone a fid and add it to writeback_fid @@ -105,7 +107,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) v9inode->writeback_fid = (void *) fid; } mutex_unlock(&v9inode->v_mutex); - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) v9fs_cache_inode_set_cookie(inode, file); return 0; out_error: @@ -461,14 +463,12 @@ v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid, int n; loff_t i_size; size_t total = 0; - struct p9_client *clnt; loff_t origin = *offset; unsigned long pg_start, pg_end; p9_debug(P9_DEBUG_VFS, "data %p count %d offset %x\n", data, (int)count, (int)*offset); - clnt = fid->clnt; do { n = p9_client_write(fid, NULL, data+total, origin+total, count); if (n <= 0) @@ -581,11 +581,12 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end, } static int -v9fs_file_mmap(struct file *file, struct vm_area_struct *vma) +v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma) { int retval; - retval = generic_file_mmap(file, vma); + + retval = generic_file_mmap(filp, vma); if (!retval) vma->vm_ops = &v9fs_file_vm_ops; @@ -593,6 +594,43 @@ v9fs_file_mmap(struct file *file, struct vm_area_struct *vma) } static int +v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma) +{ + int retval; + struct inode *inode; + struct v9fs_inode *v9inode; + struct p9_fid *fid; + + inode = file_inode(filp); + v9inode = V9FS_I(inode); + mutex_lock(&v9inode->v_mutex); + if (!v9inode->writeback_fid && + (vma->vm_flags & VM_WRITE)) { + /* + * clone a fid and add it to writeback_fid + * we do it during mmap instead of + * page dirty time via write_begin/page_mkwrite + * because we want write after unlink usecase + * to work. + */ + fid = v9fs_writeback_fid(filp->f_path.dentry); + if (IS_ERR(fid)) { + retval = PTR_ERR(fid); + mutex_unlock(&v9inode->v_mutex); + return retval; + } + v9inode->writeback_fid = (void *) fid; + } + mutex_unlock(&v9inode->v_mutex); + + retval = generic_file_mmap(filp, vma); + if (!retval) + vma->vm_ops = &v9fs_mmap_file_vm_ops; + + return retval; +} + +static int v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct v9fs_inode *v9inode; @@ -660,6 +698,22 @@ v9fs_cached_file_read(struct file *filp, char __user *data, size_t count, return do_sync_read(filp, data, count, offset); } +/** + * v9fs_mmap_file_read - read from a file + * @filp: file pointer to read + * @udata: user data buffer to read data into + * @count: size of buffer + * @offset: offset at which to read data + * + */ +static ssize_t +v9fs_mmap_file_read(struct file *filp, char __user *data, size_t count, + loff_t *offset) +{ + /* TODO: Check if there are dirty pages */ + return v9fs_file_read(filp, data, count, offset); +} + static ssize_t v9fs_direct_write(struct file *filp, const char __user * data, size_t count, loff_t *offsetp) @@ -730,12 +784,65 @@ v9fs_cached_file_write(struct file *filp, const char __user * data, return do_sync_write(filp, data, count, offset); } + +/** + * v9fs_mmap_file_write - write to a file + * @filp: file pointer to write + * @data: data buffer to write data from + * @count: size of buffer + * @offset: offset at which to write data + * + */ +static ssize_t +v9fs_mmap_file_write(struct file *filp, const char __user *data, + size_t count, loff_t *offset) +{ + /* + * TODO: invalidate mmaps on filp's inode between + * offset and offset+count + */ + return v9fs_file_write(filp, data, count, offset); +} + +static void v9fs_mmap_vm_close(struct vm_area_struct *vma) +{ + struct inode *inode; + + struct writeback_control wbc = { + .nr_to_write = LONG_MAX, + .sync_mode = WB_SYNC_ALL, + .range_start = vma->vm_pgoff * PAGE_SIZE, + /* absolute end, byte at end included */ + .range_end = vma->vm_pgoff * PAGE_SIZE + + (vma->vm_end - vma->vm_start - 1), + }; + + + p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma); + + inode = file_inode(vma->vm_file); + + if (!mapping_cap_writeback_dirty(inode->i_mapping)) + wbc.nr_to_write = 0; + + might_sleep(); + sync_inode(inode, &wbc); +} + + static const struct vm_operations_struct v9fs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = v9fs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; +static const struct vm_operations_struct v9fs_mmap_file_vm_ops = { + .close = v9fs_mmap_vm_close, + .fault = filemap_fault, + .page_mkwrite = v9fs_vm_page_mkwrite, + .remap_pages = generic_file_remap_pages, +}; + const struct file_operations v9fs_cached_file_operations = { .llseek = generic_file_llseek, @@ -786,3 +893,26 @@ const struct file_operations v9fs_file_operations_dotl = { .mmap = generic_file_readonly_mmap, .fsync = v9fs_file_fsync_dotl, }; + +const struct file_operations v9fs_mmap_file_operations = { + .llseek = generic_file_llseek, + .read = v9fs_mmap_file_read, + .write = v9fs_mmap_file_write, + .open = v9fs_file_open, + .release = v9fs_dir_release, + .lock = v9fs_file_lock, + .mmap = v9fs_mmap_file_mmap, + .fsync = v9fs_file_fsync, +}; + +const struct file_operations v9fs_mmap_file_operations_dotl = { + .llseek = generic_file_llseek, + .read = v9fs_mmap_file_read, + .write = v9fs_mmap_file_write, + .open = v9fs_file_open, + .release = v9fs_dir_release, + .lock = v9fs_file_lock_dotl, + .flock = v9fs_file_flock_dotl, + .mmap = v9fs_mmap_file_mmap, + .fsync = v9fs_file_fsync_dotl, +}; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 4e65aa9..bb7991c 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -299,15 +299,22 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, case S_IFREG: if (v9fs_proto_dotl(v9ses)) { inode->i_op = &v9fs_file_inode_operations_dotl; - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || + v9ses->cache == CACHE_FSCACHE) inode->i_fop = &v9fs_cached_file_operations_dotl; + else if (v9ses->cache == CACHE_MMAP) + inode->i_fop = &v9fs_mmap_file_operations_dotl; else inode->i_fop = &v9fs_file_operations_dotl; } else { inode->i_op = &v9fs_file_inode_operations; - if (v9ses->cache) - inode->i_fop = &v9fs_cached_file_operations; + if (v9ses->cache == CACHE_LOOSE || + v9ses->cache == CACHE_FSCACHE) + inode->i_fop = + &v9fs_cached_file_operations; + else if (v9ses->cache == CACHE_MMAP) + inode->i_fop = &v9fs_mmap_file_operations; else inode->i_fop = &v9fs_file_operations; } @@ -779,7 +786,6 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct dentry *res; - struct super_block *sb; struct v9fs_session_info *v9ses; struct p9_fid *dfid, *fid; struct inode *inode; @@ -791,7 +797,6 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); - sb = dir->i_sb; v9ses = v9fs_inode2v9ses(dir); /* We can walk d_parent because we hold the dir->i_mutex */ dfid = v9fs_fid_lookup(dentry->d_parent); @@ -812,7 +817,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, * unlink. For cached mode create calls request for new * inode. But with cache disabled, lookup should do this. */ - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); else inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); @@ -863,7 +868,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, return finish_no_open(file, res); err = 0; - fid = NULL; + v9ses = v9fs_inode2v9ses(dir); perm = unixmode2p9mode(v9ses, mode); fid = v9fs_create(v9ses, dir, dentry, NULL, perm, @@ -878,7 +883,8 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, v9fs_invalidate_inode_attr(dir); v9inode = V9FS_I(dentry->d_inode); mutex_lock(&v9inode->v_mutex); - if (v9ses->cache && !v9inode->writeback_fid && + if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) && + !v9inode->writeback_fid && ((flags & O_ACCMODE) != O_RDONLY)) { /* * clone a fid and add it to writeback_fid @@ -901,7 +907,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, goto error; file->private_data = fid; - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) v9fs_cache_inode_set_cookie(dentry->d_inode, file); *opened |= FILE_CREATED; @@ -1479,7 +1485,7 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) */ i_size = inode->i_size; v9fs_stat2inode(st, inode, inode->i_sb); - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) inode->i_size = i_size; spin_unlock(&inode->i_lock); out: diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 4c10edec2..59dc8e8 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -330,7 +330,8 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, v9inode = V9FS_I(inode); mutex_lock(&v9inode->v_mutex); - if (v9ses->cache && !v9inode->writeback_fid && + if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) && + !v9inode->writeback_fid && ((flags & O_ACCMODE) != O_RDONLY)) { /* * clone a fid and add it to writeback_fid @@ -353,7 +354,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, if (err) goto err_clunk_old_fid; file->private_data = ofid; - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) v9fs_cache_inode_set_cookie(inode, file); *opened |= FILE_CREATED; out: @@ -473,13 +474,11 @@ static int v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - int err; struct v9fs_session_info *v9ses; struct p9_fid *fid; struct p9_stat_dotl *st; p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); - err = -EPERM; v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { generic_fillattr(dentry->d_inode, stat); @@ -556,7 +555,6 @@ static int v9fs_mapped_iattr_valid(int iattr_valid) int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) { int retval; - struct v9fs_session_info *v9ses; struct p9_fid *fid; struct p9_iattr_dotl p9attr; struct inode *inode = dentry->d_inode; @@ -577,8 +575,6 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) p9attr.mtime_sec = iattr->ia_mtime.tv_sec; p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec; - retval = -EPERM; - v9ses = v9fs_dentry2v9ses(dentry); fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) return PTR_ERR(fid); @@ -715,7 +711,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, } v9fs_invalidate_inode_attr(dir); - if (v9ses->cache) { + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { /* Now walk from the parent so we can get an unopened fid. */ fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { @@ -768,7 +764,6 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { int err; - char *name; struct dentry *dir_dentry; struct p9_fid *dfid, *oldfid; struct v9fs_session_info *v9ses; @@ -786,8 +781,6 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, if (IS_ERR(oldfid)) return PTR_ERR(oldfid); - name = (char *) dentry->d_name.name; - err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name); if (err < 0) { @@ -973,7 +966,7 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) */ i_size = inode->i_size; v9fs_stat2inode_dotl(st, inode); - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) inode->i_size = i_size; spin_unlock(&inode->i_lock); out: diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 2756dcd..0afd038 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -144,7 +144,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, } v9fs_fill_super(sb, v9ses, flags, data); - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) sb->s_d_op = &v9fs_cached_dentry_operations; else sb->s_d_op = &v9fs_dentry_operations; @@ -282,7 +282,7 @@ static int v9fs_drop_inode(struct inode *inode) { struct v9fs_session_info *v9ses; v9ses = v9fs_inode2v9ses(inode); - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) return generic_drop_inode(inode); /* * in case of non cached mode always drop the @@ -325,10 +325,12 @@ static int v9fs_write_inode_dotl(struct inode *inode, * send an fsync request to server irrespective of * wbc->sync_mode. */ - p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); v9inode = V9FS_I(inode); + p9_debug(P9_DEBUG_VFS, "%s: inode %p, writeback_fid %p\n", + __func__, inode, v9inode->writeback_fid); if (!v9inode->writeback_fid) return 0; + ret = p9_client_fsync(v9inode->writeback_fid, 0); if (ret < 0) { __mark_inode_dirty(inode, I_DIRTY_DATASYNC); diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index 3c28cdf..04133a1 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -138,8 +138,7 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name, if (retval < 0) { p9_debug(P9_DEBUG_VFS, "p9_client_xattrcreate failed %d\n", retval); - p9_client_clunk(fid); - return retval; + goto err; } msize = fid->clnt->msize; while (value_len) { @@ -152,12 +151,15 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name, if (write_count < 0) { /* error in xattr write */ retval = write_count; - break; + goto err; } offset += write_count; value_len -= write_count; } - return p9_client_clunk(fid); + retval = offset; +err: + p9_client_clunk(fid); + return retval; } ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) @@ -68,10 +68,6 @@ source "fs/quota/Kconfig" source "fs/autofs4/Kconfig" source "fs/fuse/Kconfig" -config GENERIC_ACL - bool - select FS_POSIX_ACL - menu "Caches" source "fs/fscache/Kconfig" @@ -119,7 +115,7 @@ config TMPFS_POSIX_ACL bool "Tmpfs POSIX Access Control Lists" depends on TMPFS select TMPFS_XATTR - select GENERIC_ACL + select FS_POSIX_ACL help POSIX Access Control Lists (ACLs) support additional access rights for users and groups beyond the standard owner/group/world scheme, diff --git a/fs/Makefile b/fs/Makefile index 4fe6df3..47ac07b 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -42,9 +42,8 @@ obj-$(CONFIG_BINFMT_SOM) += binfmt_som.o obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o obj-$(CONFIG_FS_MBCACHE) += mbcache.o -obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o +obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o obj-$(CONFIG_NFS_COMMON) += nfs_common/ -obj-$(CONFIG_GENERIC_ACL) += generic_acl.o obj-$(CONFIG_COREDUMP) += coredump.o obj-$(CONFIG_SYSCTL) += drop_caches.o @@ -53,7 +52,7 @@ obj-$(CONFIG_FHANDLE) += fhandle.o obj-y += quota/ obj-$(CONFIG_PROC_FS) += proc/ -obj-$(CONFIG_SYSFS) += sysfs/ +obj-$(CONFIG_SYSFS) += sysfs/ kernfs/ obj-$(CONFIG_CONFIGFS_FS) += configfs/ obj-y += devpts/ diff --git a/fs/affs/super.c b/fs/affs/super.c index 45161a8..d098731 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -49,11 +49,6 @@ affs_put_super(struct super_block *sb) pr_debug("AFFS: put_super()\n"); cancel_delayed_work_sync(&sbi->sb_work); - kfree(sbi->s_prefix); - affs_free_bitmap(sb); - affs_brelse(sbi->s_root_bh); - kfree(sbi); - sb->s_fs_info = NULL; } static int @@ -316,7 +311,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) unsigned long mount_flags; int tmp_flags; /* fix remount prototype... */ u8 sig[4]; - int ret = -EINVAL; + int ret; save_mount_options(sb, data); @@ -412,17 +407,19 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) if (!silent) printk(KERN_ERR "AFFS: No valid root block on device %s\n", sb->s_id); - goto out_error; + return -EINVAL; /* N.B. after this point bh must be released */ got_root: + /* Keep super block in cache */ + sbi->s_root_bh = root_bh; root_block = sbi->s_root_block; /* Find out which kind of FS we have */ boot_bh = sb_bread(sb, 0); if (!boot_bh) { printk(KERN_ERR "AFFS: Cannot read boot block\n"); - goto out_error; + return -EINVAL; } memcpy(sig, boot_bh->b_data, 4); brelse(boot_bh); @@ -471,7 +468,7 @@ got_root: default: printk(KERN_ERR "AFFS: Unknown filesystem on device %s: %08X\n", sb->s_id, chksum); - goto out_error; + return -EINVAL; } if (mount_flags & SF_VERBOSE) { @@ -488,22 +485,17 @@ got_root: if (sbi->s_flags & SF_OFS) sbi->s_data_blksize -= 24; - /* Keep super block in cache */ - sbi->s_root_bh = root_bh; - /* N.B. after this point s_root_bh must be released */ - tmp_flags = sb->s_flags; - if (affs_init_bitmap(sb, &tmp_flags)) - goto out_error; + ret = affs_init_bitmap(sb, &tmp_flags); + if (ret) + return ret; sb->s_flags = tmp_flags; /* set up enough so that it can read an inode */ root_inode = affs_iget(sb, root_block); - if (IS_ERR(root_inode)) { - ret = PTR_ERR(root_inode); - goto out_error; - } + if (IS_ERR(root_inode)) + return PTR_ERR(root_inode); if (AFFS_SB(sb)->s_flags & SF_INTL) sb->s_d_op = &affs_intl_dentry_operations; @@ -513,22 +505,11 @@ got_root: sb->s_root = d_make_root(root_inode); if (!sb->s_root) { printk(KERN_ERR "AFFS: Get root inode failed\n"); - goto out_error; + return -ENOMEM; } pr_debug("AFFS: s_flags=%lX\n",sb->s_flags); return 0; - - /* - * Begin the cascaded cleanup ... - */ -out_error: - kfree(sbi->s_bitmap); - affs_brelse(root_bh); - kfree(sbi->s_prefix); - kfree(sbi); - sb->s_fs_info = NULL; - return ret; } static int @@ -615,11 +596,23 @@ static struct dentry *affs_mount(struct file_system_type *fs_type, return mount_bdev(fs_type, flags, dev_name, data, affs_fill_super); } +static void affs_kill_sb(struct super_block *sb) +{ + struct affs_sb_info *sbi = AFFS_SB(sb); + kill_block_super(sb); + if (sbi) { + affs_free_bitmap(sb); + affs_brelse(sbi->s_root_bh); + kfree(sbi->s_prefix); + kfree(sbi); + } +} + static struct file_system_type affs_fs_type = { .owner = THIS_MODULE, .name = "affs", .mount = affs_mount, - .kill_sb = kill_block_super, + .kill_sb = affs_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("affs"); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index a306bb6..6621f80 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -195,7 +195,6 @@ struct afs_cell { struct list_head link; /* main cell list link */ struct key *anonymous_key; /* anonymous user key for this cell */ struct list_head proc_link; /* /proc cell list link */ - struct proc_dir_entry *proc_dir; /* /proc dir for this cell */ #ifdef CONFIG_AFS_FSCACHE struct fscache_cookie *cache; /* caching cookie */ #endif diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 526e4bb..bddc512 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -41,11 +41,8 @@ static const struct file_operations afs_proc_cells_fops = { .write = afs_proc_cells_write, .llseek = seq_lseek, .release = seq_release, - .owner = THIS_MODULE, }; -static int afs_proc_rootcell_open(struct inode *inode, struct file *file); -static int afs_proc_rootcell_release(struct inode *inode, struct file *file); static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf, size_t size, loff_t *_pos); static ssize_t afs_proc_rootcell_write(struct file *file, @@ -53,17 +50,12 @@ static ssize_t afs_proc_rootcell_write(struct file *file, size_t size, loff_t *_pos); static const struct file_operations afs_proc_rootcell_fops = { - .open = afs_proc_rootcell_open, .read = afs_proc_rootcell_read, .write = afs_proc_rootcell_write, .llseek = no_llseek, - .release = afs_proc_rootcell_release, - .owner = THIS_MODULE, }; static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file); -static int afs_proc_cell_volumes_release(struct inode *inode, - struct file *file); static void *afs_proc_cell_volumes_start(struct seq_file *p, loff_t *pos); static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v, loff_t *pos); @@ -81,14 +73,11 @@ static const struct file_operations afs_proc_cell_volumes_fops = { .open = afs_proc_cell_volumes_open, .read = seq_read, .llseek = seq_lseek, - .release = afs_proc_cell_volumes_release, - .owner = THIS_MODULE, + .release = seq_release, }; static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file); -static int afs_proc_cell_vlservers_release(struct inode *inode, - struct file *file); static void *afs_proc_cell_vlservers_start(struct seq_file *p, loff_t *pos); static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v, loff_t *pos); @@ -106,13 +95,10 @@ static const struct file_operations afs_proc_cell_vlservers_fops = { .open = afs_proc_cell_vlservers_open, .read = seq_read, .llseek = seq_lseek, - .release = afs_proc_cell_vlservers_release, - .owner = THIS_MODULE, + .release = seq_release, }; static int afs_proc_cell_servers_open(struct inode *inode, struct file *file); -static int afs_proc_cell_servers_release(struct inode *inode, - struct file *file); static void *afs_proc_cell_servers_start(struct seq_file *p, loff_t *pos); static void *afs_proc_cell_servers_next(struct seq_file *p, void *v, loff_t *pos); @@ -130,8 +116,7 @@ static const struct file_operations afs_proc_cell_servers_fops = { .open = afs_proc_cell_servers_open, .read = seq_read, .llseek = seq_lseek, - .release = afs_proc_cell_servers_release, - .owner = THIS_MODULE, + .release = seq_release, }; /* @@ -139,29 +124,21 @@ static const struct file_operations afs_proc_cell_servers_fops = { */ int afs_proc_init(void) { - struct proc_dir_entry *p; - _enter(""); proc_afs = proc_mkdir("fs/afs", NULL); if (!proc_afs) goto error_dir; - p = proc_create("cells", 0, proc_afs, &afs_proc_cells_fops); - if (!p) - goto error_cells; - - p = proc_create("rootcell", 0, proc_afs, &afs_proc_rootcell_fops); - if (!p) - goto error_rootcell; + if (!proc_create("cells", 0, proc_afs, &afs_proc_cells_fops) || + !proc_create("rootcell", 0, proc_afs, &afs_proc_rootcell_fops)) + goto error_tree; _leave(" = 0"); return 0; -error_rootcell: - remove_proc_entry("cells", proc_afs); -error_cells: - remove_proc_entry("fs/afs", NULL); +error_tree: + remove_proc_subtree("fs/afs", NULL); error_dir: _leave(" = -ENOMEM"); return -ENOMEM; @@ -172,9 +149,7 @@ error_dir: */ void afs_proc_cleanup(void) { - remove_proc_entry("rootcell", proc_afs); - remove_proc_entry("cells", proc_afs); - remove_proc_entry("fs/afs", NULL); + remove_proc_subtree("fs/afs", NULL); } /* @@ -319,19 +294,6 @@ inval: goto done; } -/* - * Stubs for /proc/fs/afs/rootcell - */ -static int afs_proc_rootcell_open(struct inode *inode, struct file *file) -{ - return 0; -} - -static int afs_proc_rootcell_release(struct inode *inode, struct file *file) -{ - return 0; -} - static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf, size_t size, loff_t *_pos) { @@ -387,38 +349,27 @@ nomem: */ int afs_proc_cell_setup(struct afs_cell *cell) { - struct proc_dir_entry *p; + struct proc_dir_entry *dir; _enter("%p{%s}", cell, cell->name); - cell->proc_dir = proc_mkdir(cell->name, proc_afs); - if (!cell->proc_dir) + dir = proc_mkdir(cell->name, proc_afs); + if (!dir) goto error_dir; - p = proc_create_data("servers", 0, cell->proc_dir, - &afs_proc_cell_servers_fops, cell); - if (!p) - goto error_servers; - - p = proc_create_data("vlservers", 0, cell->proc_dir, - &afs_proc_cell_vlservers_fops, cell); - if (!p) - goto error_vlservers; - - p = proc_create_data("volumes", 0, cell->proc_dir, - &afs_proc_cell_volumes_fops, cell); - if (!p) - goto error_volumes; + if (!proc_create_data("servers", 0, dir, + &afs_proc_cell_servers_fops, cell) || + !proc_create_data("vlservers", 0, dir, + &afs_proc_cell_vlservers_fops, cell) || + !proc_create_data("volumes", 0, dir, + &afs_proc_cell_volumes_fops, cell)) + goto error_tree; _leave(" = 0"); return 0; -error_volumes: - remove_proc_entry("vlservers", cell->proc_dir); -error_vlservers: - remove_proc_entry("servers", cell->proc_dir); -error_servers: - remove_proc_entry(cell->name, proc_afs); +error_tree: + remove_proc_subtree(cell->name, proc_afs); error_dir: _leave(" = -ENOMEM"); return -ENOMEM; @@ -431,10 +382,7 @@ void afs_proc_cell_remove(struct afs_cell *cell) { _enter(""); - remove_proc_entry("volumes", cell->proc_dir); - remove_proc_entry("vlservers", cell->proc_dir); - remove_proc_entry("servers", cell->proc_dir); - remove_proc_entry(cell->name, proc_afs); + remove_proc_subtree(cell->name, proc_afs); _leave(""); } @@ -463,14 +411,6 @@ static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file) } /* - * close the file and release the ref to the cell - */ -static int afs_proc_cell_volumes_release(struct inode *inode, struct file *file) -{ - return seq_release(inode, file); -} - -/* * set up the iterator to start reading from the cells list and return the * first item */ @@ -569,15 +509,6 @@ static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file) } /* - * close the file and release the ref to the cell - */ -static int afs_proc_cell_vlservers_release(struct inode *inode, - struct file *file) -{ - return seq_release(inode, file); -} - -/* * set up the iterator to start reading from the cells list and return the * first item */ @@ -673,15 +604,6 @@ static int afs_proc_cell_servers_open(struct inode *inode, struct file *file) } /* - * close the file and release the ref to the cell - */ -static int afs_proc_cell_servers_release(struct inode *inode, - struct file *file) -{ - return seq_release(inode, file); -} - -/* * set up the iterator to start reading from the cells list and return the * first item */ @@ -244,9 +244,14 @@ static void aio_free_ring(struct kioctx *ctx) int i; for (i = 0; i < ctx->nr_pages; i++) { + struct page *page; pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i, page_count(ctx->ring_pages[i])); - put_page(ctx->ring_pages[i]); + page = ctx->ring_pages[i]; + if (!page) + continue; + ctx->ring_pages[i] = NULL; + put_page(page); } put_aio_ring_file(ctx); @@ -280,18 +285,38 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, unsigned long flags; int rc; + rc = 0; + + /* Make sure the old page hasn't already been changed */ + spin_lock(&mapping->private_lock); + ctx = mapping->private_data; + if (ctx) { + pgoff_t idx; + spin_lock_irqsave(&ctx->completion_lock, flags); + idx = old->index; + if (idx < (pgoff_t)ctx->nr_pages) { + if (ctx->ring_pages[idx] != old) + rc = -EAGAIN; + } else + rc = -EINVAL; + spin_unlock_irqrestore(&ctx->completion_lock, flags); + } else + rc = -EINVAL; + spin_unlock(&mapping->private_lock); + + if (rc != 0) + return rc; + /* Writeback must be complete */ BUG_ON(PageWriteback(old)); - put_page(old); + get_page(new); - rc = migrate_page_move_mapping(mapping, new, old, NULL, mode); + rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1); if (rc != MIGRATEPAGE_SUCCESS) { - get_page(old); + put_page(new); return rc; } - get_page(new); - /* We can potentially race against kioctx teardown here. Use the * address_space's private data lock to protect the mapping's * private_data. @@ -303,13 +328,24 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, spin_lock_irqsave(&ctx->completion_lock, flags); migrate_page_copy(new, old); idx = old->index; - if (idx < (pgoff_t)ctx->nr_pages) - ctx->ring_pages[idx] = new; + if (idx < (pgoff_t)ctx->nr_pages) { + /* And only do the move if things haven't changed */ + if (ctx->ring_pages[idx] == old) + ctx->ring_pages[idx] = new; + else + rc = -EAGAIN; + } else + rc = -EINVAL; spin_unlock_irqrestore(&ctx->completion_lock, flags); } else rc = -EBUSY; spin_unlock(&mapping->private_lock); + if (rc == MIGRATEPAGE_SUCCESS) + put_page(old); + else + put_page(new); + return rc; } #endif @@ -326,7 +362,7 @@ static int aio_setup_ring(struct kioctx *ctx) struct aio_ring *ring; unsigned nr_events = ctx->max_reqs; struct mm_struct *mm = current->mm; - unsigned long size, populate; + unsigned long size, unused; int nr_pages; int i; struct file *file; @@ -347,6 +383,20 @@ static int aio_setup_ring(struct kioctx *ctx) return -EAGAIN; } + ctx->aio_ring_file = file; + nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) + / sizeof(struct io_event); + + ctx->ring_pages = ctx->internal_pages; + if (nr_pages > AIO_RING_PAGES) { + ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *), + GFP_KERNEL); + if (!ctx->ring_pages) { + put_aio_ring_file(ctx); + return -ENOMEM; + } + } + for (i = 0; i < nr_pages; i++) { struct page *page; page = find_or_create_page(file->f_inode->i_mapping, @@ -358,19 +408,14 @@ static int aio_setup_ring(struct kioctx *ctx) SetPageUptodate(page); SetPageDirty(page); unlock_page(page); + + ctx->ring_pages[i] = page; } - ctx->aio_ring_file = file; - nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) - / sizeof(struct io_event); + ctx->nr_pages = i; - ctx->ring_pages = ctx->internal_pages; - if (nr_pages > AIO_RING_PAGES) { - ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *), - GFP_KERNEL); - if (!ctx->ring_pages) { - put_aio_ring_file(ctx); - return -ENOMEM; - } + if (unlikely(i != nr_pages)) { + aio_free_ring(ctx); + return -EAGAIN; } ctx->mmap_size = nr_pages * PAGE_SIZE; @@ -379,9 +424,9 @@ static int aio_setup_ring(struct kioctx *ctx) down_write(&mm->mmap_sem); ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE, 0, &populate); + MAP_SHARED, 0, &unused); + up_write(&mm->mmap_sem); if (IS_ERR((void *)ctx->mmap_base)) { - up_write(&mm->mmap_sem); ctx->mmap_size = 0; aio_free_ring(ctx); return -EAGAIN; @@ -389,27 +434,6 @@ static int aio_setup_ring(struct kioctx *ctx) pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); - /* We must do this while still holding mmap_sem for write, as we - * need to be protected against userspace attempting to mremap() - * or munmap() the ring buffer. - */ - ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages, - 1, 0, ctx->ring_pages, NULL); - - /* Dropping the reference here is safe as the page cache will hold - * onto the pages for us. It is also required so that page migration - * can unmap the pages and get the right reference count. - */ - for (i = 0; i < ctx->nr_pages; i++) - put_page(ctx->ring_pages[i]); - - up_write(&mm->mmap_sem); - - if (unlikely(ctx->nr_pages != nr_pages)) { - aio_free_ring(ctx); - return -EAGAIN; - } - ctx->user_id = ctx->mmap_base; ctx->nr_events = nr_events; /* trusted copy */ @@ -652,7 +676,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) aio_nr += ctx->max_reqs; spin_unlock(&aio_nr_lock); - percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */ + percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */ + percpu_ref_get(&ctx->reqs); /* free_ioctx_users() will drop this */ err = ioctx_add_table(ctx, mm); if (err) @@ -202,11 +202,6 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de return -EPERM; } - if ((ia_valid & ATTR_SIZE) && IS_I_VERSION(inode)) { - if (attr->ia_size != inode->i_size) - inode_inc_iversion(inode); - } - if ((ia_valid & ATTR_MODE)) { umode_t amode = attr->ia_mode; /* Flag setting protected by i_mutex */ diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 4218e26..acf3205 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -104,7 +104,7 @@ struct autofs_sb_info { u32 magic; int pipefd; struct file *pipe; - pid_t oz_pgrp; + struct pid *oz_pgrp; int catatonic; int version; int sub_version; @@ -140,7 +140,7 @@ static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry) filesystem without "magic".) */ static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) { - return sbi->catatonic || task_pgrp_nr(current) == sbi->oz_pgrp; + return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp; } /* Does a dentry have some pending activity? */ diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 1818ce7..3182c0e 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -346,6 +346,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, { int pipefd; int err = 0; + struct pid *new_pid = NULL; if (param->setpipefd.pipefd == -1) return -EINVAL; @@ -357,7 +358,17 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, mutex_unlock(&sbi->wq_mutex); return -EBUSY; } else { - struct file *pipe = fget(pipefd); + struct file *pipe; + + new_pid = get_task_pid(current, PIDTYPE_PGID); + + if (ns_of_pid(new_pid) != ns_of_pid(sbi->oz_pgrp)) { + AUTOFS_WARN("Not allowed to change PID namespace"); + err = -EINVAL; + goto out; + } + + pipe = fget(pipefd); if (!pipe) { err = -EBADF; goto out; @@ -367,12 +378,13 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, fput(pipe); goto out; } - sbi->oz_pgrp = task_pgrp_nr(current); + swap(sbi->oz_pgrp, new_pid); sbi->pipefd = pipefd; sbi->pipe = pipe; sbi->catatonic = 0; } out: + put_pid(new_pid); mutex_unlock(&sbi->wq_mutex); return err; } diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 3d9d3f5..394e90b 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -402,6 +402,20 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, goto next; } + if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) { + DPRINTK("checking symlink %p %.*s", + dentry, (int)dentry->d_name.len, dentry->d_name.name); + /* + * A symlink can't be "busy" in the usual sense so + * just check last used for expire timeout. + */ + if (autofs4_can_expire(dentry, timeout, do_now)) { + expired = dentry; + goto found; + } + goto next; + } + if (simple_empty(dentry)) goto next; diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 3b9cc9b..d7bd395 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -56,8 +56,11 @@ void autofs4_kill_sb(struct super_block *sb) * just call kill_anon_super when we are called from * deactivate_super. */ - if (sbi) /* Free wait queues, close pipe */ + if (sbi) { + /* Free wait queues, close pipe */ autofs4_catatonic_mode(sbi); + put_pid(sbi->oz_pgrp); + } DPRINTK("shutting down"); kill_litter_super(sb); @@ -80,7 +83,7 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root) if (!gid_eq(root_inode->i_gid, GLOBAL_ROOT_GID)) seq_printf(m, ",gid=%u", from_kgid_munged(&init_user_ns, root_inode->i_gid)); - seq_printf(m, ",pgrp=%d", sbi->oz_pgrp); + seq_printf(m, ",pgrp=%d", pid_vnr(sbi->oz_pgrp)); seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ); seq_printf(m, ",minproto=%d", sbi->min_proto); seq_printf(m, ",maxproto=%d", sbi->max_proto); @@ -124,7 +127,8 @@ static const match_table_t tokens = { }; static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, - pid_t *pgrp, unsigned int *type, int *minproto, int *maxproto) + int *pgrp, bool *pgrp_set, unsigned int *type, + int *minproto, int *maxproto) { char *p; substring_t args[MAX_OPT_ARGS]; @@ -132,7 +136,6 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, *uid = current_uid(); *gid = current_gid(); - *pgrp = task_pgrp_nr(current); *minproto = AUTOFS_MIN_PROTO_VERSION; *maxproto = AUTOFS_MAX_PROTO_VERSION; @@ -171,6 +174,7 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, if (match_int(args, &option)) return 1; *pgrp = option; + *pgrp_set = true; break; case Opt_minproto: if (match_int(args, &option)) @@ -206,10 +210,13 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) int pipefd; struct autofs_sb_info *sbi; struct autofs_info *ino; + int pgrp; + bool pgrp_set = false; + int ret = -EINVAL; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) - goto fail_unlock; + return -ENOMEM; DPRINTK("starting up, sbi = %p",sbi); s->s_fs_info = sbi; @@ -218,7 +225,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) sbi->pipe = NULL; sbi->catatonic = 1; sbi->exp_timeout = 0; - sbi->oz_pgrp = task_pgrp_nr(current); + sbi->oz_pgrp = NULL; sbi->sb = s; sbi->version = 0; sbi->sub_version = 0; @@ -243,8 +250,10 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) * Get the root inode and dentry, but defer checking for errors. */ ino = autofs4_new_ino(sbi); - if (!ino) + if (!ino) { + ret = -ENOMEM; goto fail_free; + } root_inode = autofs4_get_inode(s, S_IFDIR | 0755); root = d_make_root(root_inode); if (!root) @@ -255,12 +264,23 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) /* Can this call block? */ if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid, - &sbi->oz_pgrp, &sbi->type, &sbi->min_proto, - &sbi->max_proto)) { + &pgrp, &pgrp_set, &sbi->type, &sbi->min_proto, + &sbi->max_proto)) { printk("autofs: called with bogus options\n"); goto fail_dput; } + if (pgrp_set) { + sbi->oz_pgrp = find_get_pid(pgrp); + if (!sbi->oz_pgrp) { + pr_warn("autofs: could not find process group %d\n", + pgrp); + goto fail_dput; + } + } else { + sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID); + } + if (autofs_type_trigger(sbi->type)) __managed_dentry_set_managed(root); @@ -284,14 +304,15 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) sbi->version = sbi->max_proto; sbi->sub_version = AUTOFS_PROTO_SUBVERSION; - DPRINTK("pipe fd = %d, pgrp = %u", pipefd, sbi->oz_pgrp); + DPRINTK("pipe fd = %d, pgrp = %u", pipefd, pid_nr(sbi->oz_pgrp)); pipe = fget(pipefd); - + if (!pipe) { printk("autofs: could not open pipe file descriptor\n"); goto fail_dput; } - if (autofs_prepare_pipe(pipe) < 0) + ret = autofs_prepare_pipe(pipe); + if (ret < 0) goto fail_fput; sbi->pipe = pipe; sbi->pipefd = pipefd; @@ -316,10 +337,10 @@ fail_dput: fail_ino: kfree(ino); fail_free: + put_pid(sbi->oz_pgrp); kfree(sbi); s->s_fs_info = NULL; -fail_unlock: - return -EINVAL; + return ret; } struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode) diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 92ef341..2caf36a 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -558,7 +558,7 @@ static int autofs4_dir_symlink(struct inode *dir, dget(dentry); atomic_inc(&ino->count); p_ino = autofs4_dentry_ino(dentry->d_parent); - if (p_ino && dentry->d_parent != dentry) + if (p_ino && !IS_ROOT(dentry)) atomic_inc(&p_ino->count); dir->i_mtime = CURRENT_TIME; @@ -593,7 +593,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) if (atomic_dec_and_test(&ino->count)) { p_ino = autofs4_dentry_ino(dentry->d_parent); - if (p_ino && dentry->d_parent != dentry) + if (p_ino && !IS_ROOT(dentry)) atomic_dec(&p_ino->count); } dput(ino->dentry); @@ -732,7 +732,7 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t m dget(dentry); atomic_inc(&ino->count); p_ino = autofs4_dentry_ino(dentry->d_parent); - if (p_ino && dentry->d_parent != dentry) + if (p_ino && !IS_ROOT(dentry)) atomic_inc(&p_ino->count); inc_nlink(dir); dir->i_mtime = CURRENT_TIME; diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c index f27c094..1e8ea19 100644 --- a/fs/autofs4/symlink.c +++ b/fs/autofs4/symlink.c @@ -14,6 +14,10 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) { + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + if (ino && !autofs4_oz_mode(sbi)) + ino->last_used = jiffies; nd_set_link(nd, dentry->d_inode->i_private); return NULL; } diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 689e40d..116fd38 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -347,11 +347,23 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, struct qstr qstr; char *name; int status, ret, type; + pid_t pid; + pid_t tgid; /* In catatonic mode, we don't wait for nobody */ if (sbi->catatonic) return -ENOENT; + /* + * Try translating pids to the namespace of the daemon. + * + * Zero means failure: we are in an unrelated pid namespace. + */ + pid = task_pid_nr_ns(current, ns_of_pid(sbi->oz_pgrp)); + tgid = task_tgid_nr_ns(current, ns_of_pid(sbi->oz_pgrp)); + if (pid == 0 || tgid == 0) + return -ENOENT; + if (!dentry->d_inode) { /* * A wait for a negative dentry is invalid for certain @@ -417,8 +429,8 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, wq->ino = autofs4_get_ino(sbi); wq->uid = current_uid(); wq->gid = current_gid(); - wq->pid = current->pid; - wq->tgid = current->tgid; + wq->pid = pid; + wq->tgid = tgid; wq->status = -EINTR; /* Status return if interrupted */ wq->wait_ctr = 2; diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index daa15d6..845d2d6 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -324,8 +324,8 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) befs_debug(sb, "---> befs_read_inode() " "inode = %lu", ino); inode = iget_locked(sb, ino); - if (IS_ERR(inode)) - return inode; + if (!inode) + return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 571a423..67be295 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -543,9 +543,6 @@ out: * libraries. There is no binary dependent code anywhere else. */ -#define INTERPRETER_NONE 0 -#define INTERPRETER_ELF 2 - #ifndef STACK_RND_MASK #define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */ #endif diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index fc60b31..0bad24d 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -134,8 +134,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, return 0; } - iv = bip_vec_idx(bip, bip->bip_vcnt); - BUG_ON(iv == NULL); + iv = bip->bip_vec + bip->bip_vcnt; iv->bv_page = page; iv->bv_len = len; @@ -203,6 +202,12 @@ static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi, return sectors; } +static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, + unsigned int sectors) +{ + return bio_integrity_hw_sectors(bi, sectors) * bi->tuple_size; +} + /** * bio_integrity_tag_size - Retrieve integrity tag space * @bio: bio to inspect @@ -215,9 +220,9 @@ unsigned int bio_integrity_tag_size(struct bio *bio) { struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - BUG_ON(bio->bi_size == 0); + BUG_ON(bio->bi_iter.bi_size == 0); - return bi->tag_size * (bio->bi_size / bi->sector_size); + return bi->tag_size * (bio->bi_iter.bi_size / bi->sector_size); } EXPORT_SYMBOL(bio_integrity_tag_size); @@ -235,9 +240,9 @@ int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set) nr_sectors = bio_integrity_hw_sectors(bi, DIV_ROUND_UP(len, bi->tag_size)); - if (nr_sectors * bi->tuple_size > bip->bip_size) { - printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", - __func__, nr_sectors * bi->tuple_size, bip->bip_size); + if (nr_sectors * bi->tuple_size > bip->bip_iter.bi_size) { + printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", __func__, + nr_sectors * bi->tuple_size, bip->bip_iter.bi_size); return -1; } @@ -299,29 +304,30 @@ static void bio_integrity_generate(struct bio *bio) { struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); struct blk_integrity_exchg bix; - struct bio_vec *bv; - sector_t sector = bio->bi_sector; - unsigned int i, sectors, total; + struct bio_vec bv; + struct bvec_iter iter; + sector_t sector = bio->bi_iter.bi_sector; + unsigned int sectors, total; void *prot_buf = bio->bi_integrity->bip_buf; total = 0; bix.disk_name = bio->bi_bdev->bd_disk->disk_name; bix.sector_size = bi->sector_size; - bio_for_each_segment(bv, bio, i) { - void *kaddr = kmap_atomic(bv->bv_page); - bix.data_buf = kaddr + bv->bv_offset; - bix.data_size = bv->bv_len; + bio_for_each_segment(bv, bio, iter) { + void *kaddr = kmap_atomic(bv.bv_page); + bix.data_buf = kaddr + bv.bv_offset; + bix.data_size = bv.bv_len; bix.prot_buf = prot_buf; bix.sector = sector; bi->generate_fn(&bix); - sectors = bv->bv_len / bi->sector_size; + sectors = bv.bv_len / bi->sector_size; sector += sectors; prot_buf += sectors * bi->tuple_size; total += sectors * bi->tuple_size; - BUG_ON(total > bio->bi_integrity->bip_size); + BUG_ON(total > bio->bi_integrity->bip_iter.bi_size); kunmap_atomic(kaddr); } @@ -386,8 +392,8 @@ int bio_integrity_prep(struct bio *bio) bip->bip_owns_buf = 1; bip->bip_buf = buf; - bip->bip_size = len; - bip->bip_sector = bio->bi_sector; + bip->bip_iter.bi_size = len; + bip->bip_iter.bi_sector = bio->bi_iter.bi_sector; /* Map it */ offset = offset_in_page(buf); @@ -442,16 +448,18 @@ static int bio_integrity_verify(struct bio *bio) struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); struct blk_integrity_exchg bix; struct bio_vec *bv; - sector_t sector = bio->bi_integrity->bip_sector; - unsigned int i, sectors, total, ret; + sector_t sector = bio->bi_integrity->bip_iter.bi_sector; + unsigned int sectors, total, ret; void *prot_buf = bio->bi_integrity->bip_buf; + int i; ret = total = 0; bix.disk_name = bio->bi_bdev->bd_disk->disk_name; bix.sector_size = bi->sector_size; - bio_for_each_segment(bv, bio, i) { + bio_for_each_segment_all(bv, bio, i) { void *kaddr = kmap_atomic(bv->bv_page); + bix.data_buf = kaddr + bv->bv_offset; bix.data_size = bv->bv_len; bix.prot_buf = prot_buf; @@ -468,7 +476,7 @@ static int bio_integrity_verify(struct bio *bio) sector += sectors; prot_buf += sectors * bi->tuple_size; total += sectors * bi->tuple_size; - BUG_ON(total > bio->bi_integrity->bip_size); + BUG_ON(total > bio->bi_integrity->bip_iter.bi_size); kunmap_atomic(kaddr); } @@ -495,7 +503,7 @@ static void bio_integrity_verify_fn(struct work_struct *work) /* Restore original bio completion handler */ bio->bi_end_io = bip->bip_end_io; - bio_endio(bio, error); + bio_endio_nodec(bio, error); } /** @@ -533,56 +541,6 @@ void bio_integrity_endio(struct bio *bio, int error) EXPORT_SYMBOL(bio_integrity_endio); /** - * bio_integrity_mark_head - Advance bip_vec skip bytes - * @bip: Integrity vector to advance - * @skip: Number of bytes to advance it - */ -void bio_integrity_mark_head(struct bio_integrity_payload *bip, - unsigned int skip) -{ - struct bio_vec *iv; - unsigned int i; - - bip_for_each_vec(iv, bip, i) { - if (skip == 0) { - bip->bip_idx = i; - return; - } else if (skip >= iv->bv_len) { - skip -= iv->bv_len; - } else { /* skip < iv->bv_len) */ - iv->bv_offset += skip; - iv->bv_len -= skip; - bip->bip_idx = i; - return; - } - } -} - -/** - * bio_integrity_mark_tail - Truncate bip_vec to be len bytes long - * @bip: Integrity vector to truncate - * @len: New length of integrity vector - */ -void bio_integrity_mark_tail(struct bio_integrity_payload *bip, - unsigned int len) -{ - struct bio_vec *iv; - unsigned int i; - - bip_for_each_vec(iv, bip, i) { - if (len == 0) { - bip->bip_vcnt = i; - return; - } else if (len >= iv->bv_len) { - len -= iv->bv_len; - } else { /* len < iv->bv_len) */ - iv->bv_len = len; - len = 0; - } - } -} - -/** * bio_integrity_advance - Advance integrity vector * @bio: bio whose integrity vector to update * @bytes_done: number of data bytes that have been completed @@ -595,13 +553,9 @@ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) { struct bio_integrity_payload *bip = bio->bi_integrity; struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - unsigned int nr_sectors; + unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9); - BUG_ON(bip == NULL); - BUG_ON(bi == NULL); - - nr_sectors = bio_integrity_hw_sectors(bi, bytes_done >> 9); - bio_integrity_mark_head(bip, nr_sectors * bi->tuple_size); + bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes); } EXPORT_SYMBOL(bio_integrity_advance); @@ -621,64 +575,13 @@ void bio_integrity_trim(struct bio *bio, unsigned int offset, { struct bio_integrity_payload *bip = bio->bi_integrity; struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - unsigned int nr_sectors; - - BUG_ON(bip == NULL); - BUG_ON(bi == NULL); - BUG_ON(!bio_flagged(bio, BIO_CLONED)); - nr_sectors = bio_integrity_hw_sectors(bi, sectors); - bip->bip_sector = bip->bip_sector + offset; - bio_integrity_mark_head(bip, offset * bi->tuple_size); - bio_integrity_mark_tail(bip, sectors * bi->tuple_size); + bio_integrity_advance(bio, offset << 9); + bip->bip_iter.bi_size = bio_integrity_bytes(bi, sectors); } EXPORT_SYMBOL(bio_integrity_trim); /** - * bio_integrity_split - Split integrity metadata - * @bio: Protected bio - * @bp: Resulting bio_pair - * @sectors: Offset - * - * Description: Splits an integrity page into a bio_pair. - */ -void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors) -{ - struct blk_integrity *bi; - struct bio_integrity_payload *bip = bio->bi_integrity; - unsigned int nr_sectors; - - if (bio_integrity(bio) == 0) - return; - - bi = bdev_get_integrity(bio->bi_bdev); - BUG_ON(bi == NULL); - BUG_ON(bip->bip_vcnt != 1); - - nr_sectors = bio_integrity_hw_sectors(bi, sectors); - - bp->bio1.bi_integrity = &bp->bip1; - bp->bio2.bi_integrity = &bp->bip2; - - bp->iv1 = bip->bip_vec[bip->bip_idx]; - bp->iv2 = bip->bip_vec[bip->bip_idx]; - - bp->bip1.bip_vec = &bp->iv1; - bp->bip2.bip_vec = &bp->iv2; - - bp->iv1.bv_len = sectors * bi->tuple_size; - bp->iv2.bv_offset += sectors * bi->tuple_size; - bp->iv2.bv_len -= sectors * bi->tuple_size; - - bp->bip1.bip_sector = bio->bi_integrity->bip_sector; - bp->bip2.bip_sector = bio->bi_integrity->bip_sector + nr_sectors; - - bp->bip1.bip_vcnt = bp->bip2.bip_vcnt = 1; - bp->bip1.bip_idx = bp->bip2.bip_idx = 0; -} -EXPORT_SYMBOL(bio_integrity_split); - -/** * bio_integrity_clone - Callback for cloning bios with integrity metadata * @bio: New bio * @bio_src: Original bio @@ -702,9 +605,8 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, memcpy(bip->bip_vec, bip_src->bip_vec, bip_src->bip_vcnt * sizeof(struct bio_vec)); - bip->bip_sector = bip_src->bip_sector; bip->bip_vcnt = bip_src->bip_vcnt; - bip->bip_idx = bip_src->bip_idx; + bip->bip_iter = bip_src->bip_iter; return 0; } @@ -38,8 +38,6 @@ */ #define BIO_INLINE_VECS 4 -static mempool_t *bio_split_pool __read_mostly; - /* * if you change this list, also change bvec_alloc or things will * break badly! cannot be bigger than what you can fit into an @@ -273,6 +271,7 @@ void bio_init(struct bio *bio) { memset(bio, 0, sizeof(*bio)); bio->bi_flags = 1 << BIO_UPTODATE; + atomic_set(&bio->bi_remaining, 1); atomic_set(&bio->bi_cnt, 1); } EXPORT_SYMBOL(bio_init); @@ -295,9 +294,35 @@ void bio_reset(struct bio *bio) memset(bio, 0, BIO_RESET_BYTES); bio->bi_flags = flags|(1 << BIO_UPTODATE); + atomic_set(&bio->bi_remaining, 1); } EXPORT_SYMBOL(bio_reset); +static void bio_chain_endio(struct bio *bio, int error) +{ + bio_endio(bio->bi_private, error); + bio_put(bio); +} + +/** + * bio_chain - chain bio completions + * + * The caller won't have a bi_end_io called when @bio completes - instead, + * @parent's bi_end_io won't be called until both @parent and @bio have + * completed; the chained bio will also be freed when it completes. + * + * The caller must not set bi_private or bi_end_io in @bio. + */ +void bio_chain(struct bio *bio, struct bio *parent) +{ + BUG_ON(bio->bi_private || bio->bi_end_io); + + bio->bi_private = parent; + bio->bi_end_io = bio_chain_endio; + atomic_inc(&parent->bi_remaining); +} +EXPORT_SYMBOL(bio_chain); + static void bio_alloc_rescue(struct work_struct *work) { struct bio_set *bs = container_of(work, struct bio_set, rescue_work); @@ -473,13 +498,13 @@ EXPORT_SYMBOL(bio_alloc_bioset); void zero_fill_bio(struct bio *bio) { unsigned long flags; - struct bio_vec *bv; - int i; + struct bio_vec bv; + struct bvec_iter iter; - bio_for_each_segment(bv, bio, i) { - char *data = bvec_kmap_irq(bv, &flags); - memset(data, 0, bv->bv_len); - flush_dcache_page(bv->bv_page); + bio_for_each_segment(bv, bio, iter) { + char *data = bvec_kmap_irq(&bv, &flags); + memset(data, 0, bv.bv_len); + flush_dcache_page(bv.bv_page); bvec_kunmap_irq(data, &flags); } } @@ -515,51 +540,49 @@ inline int bio_phys_segments(struct request_queue *q, struct bio *bio) EXPORT_SYMBOL(bio_phys_segments); /** - * __bio_clone - clone a bio + * __bio_clone_fast - clone a bio that shares the original bio's biovec * @bio: destination bio * @bio_src: bio to clone * * Clone a &bio. Caller will own the returned bio, but not * the actual data it points to. Reference count of returned * bio will be one. + * + * Caller must ensure that @bio_src is not freed before @bio. */ -void __bio_clone(struct bio *bio, struct bio *bio_src) +void __bio_clone_fast(struct bio *bio, struct bio *bio_src) { - memcpy(bio->bi_io_vec, bio_src->bi_io_vec, - bio_src->bi_max_vecs * sizeof(struct bio_vec)); + BUG_ON(bio->bi_pool && BIO_POOL_IDX(bio) != BIO_POOL_NONE); /* * most users will be overriding ->bi_bdev with a new target, * so we don't set nor calculate new physical/hw segment counts here */ - bio->bi_sector = bio_src->bi_sector; bio->bi_bdev = bio_src->bi_bdev; bio->bi_flags |= 1 << BIO_CLONED; bio->bi_rw = bio_src->bi_rw; - bio->bi_vcnt = bio_src->bi_vcnt; - bio->bi_size = bio_src->bi_size; - bio->bi_idx = bio_src->bi_idx; + bio->bi_iter = bio_src->bi_iter; + bio->bi_io_vec = bio_src->bi_io_vec; } -EXPORT_SYMBOL(__bio_clone); +EXPORT_SYMBOL(__bio_clone_fast); /** - * bio_clone_bioset - clone a bio + * bio_clone_fast - clone a bio that shares the original bio's biovec * @bio: bio to clone * @gfp_mask: allocation priority * @bs: bio_set to allocate from * - * Like __bio_clone, only also allocates the returned bio + * Like __bio_clone_fast, only also allocates the returned bio */ -struct bio *bio_clone_bioset(struct bio *bio, gfp_t gfp_mask, - struct bio_set *bs) +struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs) { struct bio *b; - b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, bs); + b = bio_alloc_bioset(gfp_mask, 0, bs); if (!b) return NULL; - __bio_clone(b, bio); + __bio_clone_fast(b, bio); if (bio_integrity(bio)) { int ret; @@ -574,6 +597,74 @@ struct bio *bio_clone_bioset(struct bio *bio, gfp_t gfp_mask, return b; } +EXPORT_SYMBOL(bio_clone_fast); + +/** + * bio_clone_bioset - clone a bio + * @bio_src: bio to clone + * @gfp_mask: allocation priority + * @bs: bio_set to allocate from + * + * Clone bio. Caller will own the returned bio, but not the actual data it + * points to. Reference count of returned bio will be one. + */ +struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, + struct bio_set *bs) +{ + unsigned nr_iovecs = 0; + struct bvec_iter iter; + struct bio_vec bv; + struct bio *bio; + + /* + * Pre immutable biovecs, __bio_clone() used to just do a memcpy from + * bio_src->bi_io_vec to bio->bi_io_vec. + * + * We can't do that anymore, because: + * + * - The point of cloning the biovec is to produce a bio with a biovec + * the caller can modify: bi_idx and bi_bvec_done should be 0. + * + * - The original bio could've had more than BIO_MAX_PAGES biovecs; if + * we tried to clone the whole thing bio_alloc_bioset() would fail. + * But the clone should succeed as long as the number of biovecs we + * actually need to allocate is fewer than BIO_MAX_PAGES. + * + * - Lastly, bi_vcnt should not be looked at or relied upon by code + * that does not own the bio - reason being drivers don't use it for + * iterating over the biovec anymore, so expecting it to be kept up + * to date (i.e. for clones that share the parent biovec) is just + * asking for trouble and would force extra work on + * __bio_clone_fast() anyways. + */ + + bio_for_each_segment(bv, bio_src, iter) + nr_iovecs++; + + bio = bio_alloc_bioset(gfp_mask, nr_iovecs, bs); + if (!bio) + return NULL; + + bio->bi_bdev = bio_src->bi_bdev; + bio->bi_rw = bio_src->bi_rw; + bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; + bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; + + bio_for_each_segment(bv, bio_src, iter) + bio->bi_io_vec[bio->bi_vcnt++] = bv; + + if (bio_integrity(bio_src)) { + int ret; + + ret = bio_integrity_clone(bio, bio_src, gfp_mask); + if (ret < 0) { + bio_put(bio); + return NULL; + } + } + + return bio; +} EXPORT_SYMBOL(bio_clone_bioset); /** @@ -612,7 +703,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page if (unlikely(bio_flagged(bio, BIO_CLONED))) return 0; - if (((bio->bi_size + len) >> 9) > max_sectors) + if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors) return 0; /* @@ -635,8 +726,9 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page simulate merging updated prev_bvec as new bvec. */ .bi_bdev = bio->bi_bdev, - .bi_sector = bio->bi_sector, - .bi_size = bio->bi_size - prev_bv_len, + .bi_sector = bio->bi_iter.bi_sector, + .bi_size = bio->bi_iter.bi_size - + prev_bv_len, .bi_rw = bio->bi_rw, }; @@ -684,8 +776,8 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page if (q->merge_bvec_fn) { struct bvec_merge_data bvm = { .bi_bdev = bio->bi_bdev, - .bi_sector = bio->bi_sector, - .bi_size = bio->bi_size, + .bi_sector = bio->bi_iter.bi_sector, + .bi_size = bio->bi_iter.bi_size, .bi_rw = bio->bi_rw, }; @@ -708,7 +800,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page bio->bi_vcnt++; bio->bi_phys_segments++; done: - bio->bi_size += len; + bio->bi_iter.bi_size += len; return len; } @@ -807,28 +899,7 @@ void bio_advance(struct bio *bio, unsigned bytes) if (bio_integrity(bio)) bio_integrity_advance(bio, bytes); - bio->bi_sector += bytes >> 9; - bio->bi_size -= bytes; - - if (bio->bi_rw & BIO_NO_ADVANCE_ITER_MASK) - return; - - while (bytes) { - if (unlikely(bio->bi_idx >= bio->bi_vcnt)) { - WARN_ONCE(1, "bio idx %d >= vcnt %d\n", - bio->bi_idx, bio->bi_vcnt); - break; - } - - if (bytes >= bio_iovec(bio)->bv_len) { - bytes -= bio_iovec(bio)->bv_len; - bio->bi_idx++; - } else { - bio_iovec(bio)->bv_len -= bytes; - bio_iovec(bio)->bv_offset += bytes; - bytes = 0; - } - } + bio_advance_iter(bio, &bio->bi_iter, bytes); } EXPORT_SYMBOL(bio_advance); @@ -874,117 +945,80 @@ EXPORT_SYMBOL(bio_alloc_pages); */ void bio_copy_data(struct bio *dst, struct bio *src) { - struct bio_vec *src_bv, *dst_bv; - unsigned src_offset, dst_offset, bytes; + struct bvec_iter src_iter, dst_iter; + struct bio_vec src_bv, dst_bv; void *src_p, *dst_p; + unsigned bytes; - src_bv = bio_iovec(src); - dst_bv = bio_iovec(dst); - - src_offset = src_bv->bv_offset; - dst_offset = dst_bv->bv_offset; + src_iter = src->bi_iter; + dst_iter = dst->bi_iter; while (1) { - if (src_offset == src_bv->bv_offset + src_bv->bv_len) { - src_bv++; - if (src_bv == bio_iovec_idx(src, src->bi_vcnt)) { - src = src->bi_next; - if (!src) - break; - - src_bv = bio_iovec(src); - } + if (!src_iter.bi_size) { + src = src->bi_next; + if (!src) + break; - src_offset = src_bv->bv_offset; + src_iter = src->bi_iter; } - if (dst_offset == dst_bv->bv_offset + dst_bv->bv_len) { - dst_bv++; - if (dst_bv == bio_iovec_idx(dst, dst->bi_vcnt)) { - dst = dst->bi_next; - if (!dst) - break; - - dst_bv = bio_iovec(dst); - } + if (!dst_iter.bi_size) { + dst = dst->bi_next; + if (!dst) + break; - dst_offset = dst_bv->bv_offset; + dst_iter = dst->bi_iter; } - bytes = min(dst_bv->bv_offset + dst_bv->bv_len - dst_offset, - src_bv->bv_offset + src_bv->bv_len - src_offset); + src_bv = bio_iter_iovec(src, src_iter); + dst_bv = bio_iter_iovec(dst, dst_iter); + + bytes = min(src_bv.bv_len, dst_bv.bv_len); - src_p = kmap_atomic(src_bv->bv_page); - dst_p = kmap_atomic(dst_bv->bv_page); + src_p = kmap_atomic(src_bv.bv_page); + dst_p = kmap_atomic(dst_bv.bv_page); - memcpy(dst_p + dst_offset, - src_p + src_offset, + memcpy(dst_p + dst_bv.bv_offset, + src_p + src_bv.bv_offset, bytes); kunmap_atomic(dst_p); kunmap_atomic(src_p); - src_offset += bytes; - dst_offset += bytes; + bio_advance_iter(src, &src_iter, bytes); + bio_advance_iter(dst, &dst_iter, bytes); } } EXPORT_SYMBOL(bio_copy_data); struct bio_map_data { - struct bio_vec *iovecs; - struct sg_iovec *sgvecs; int nr_sgvecs; int is_our_pages; + struct sg_iovec sgvecs[]; }; static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, struct sg_iovec *iov, int iov_count, int is_our_pages) { - memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt); memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count); bmd->nr_sgvecs = iov_count; bmd->is_our_pages = is_our_pages; bio->bi_private = bmd; } -static void bio_free_map_data(struct bio_map_data *bmd) -{ - kfree(bmd->iovecs); - kfree(bmd->sgvecs); - kfree(bmd); -} - static struct bio_map_data *bio_alloc_map_data(int nr_segs, unsigned int iov_count, gfp_t gfp_mask) { - struct bio_map_data *bmd; - if (iov_count > UIO_MAXIOV) return NULL; - bmd = kmalloc(sizeof(*bmd), gfp_mask); - if (!bmd) - return NULL; - - bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, gfp_mask); - if (!bmd->iovecs) { - kfree(bmd); - return NULL; - } - - bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, gfp_mask); - if (bmd->sgvecs) - return bmd; - - kfree(bmd->iovecs); - kfree(bmd); - return NULL; + return kmalloc(sizeof(struct bio_map_data) + + sizeof(struct sg_iovec) * iov_count, gfp_mask); } -static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, - struct sg_iovec *iov, int iov_count, +static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count, int to_user, int from_user, int do_free_page) { int ret = 0, i; @@ -994,7 +1028,7 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, bio_for_each_segment_all(bvec, bio, i) { char *bv_addr = page_address(bvec->bv_page); - unsigned int bv_len = iovecs[i].bv_len; + unsigned int bv_len = bvec->bv_len; while (bv_len && iov_idx < iov_count) { unsigned int bytes; @@ -1054,14 +1088,14 @@ int bio_uncopy_user(struct bio *bio) * don't copy into a random user address space, just free. */ if (current->mm) - ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, - bmd->nr_sgvecs, bio_data_dir(bio) == READ, + ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs, + bio_data_dir(bio) == READ, 0, bmd->is_our_pages); else if (bmd->is_our_pages) bio_for_each_segment_all(bvec, bio, i) __free_page(bvec->bv_page); } - bio_free_map_data(bmd); + kfree(bmd); bio_put(bio); return ret; } @@ -1175,7 +1209,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, */ if ((!write_to_vm && (!map_data || !map_data->null_mapped)) || (map_data && map_data->from_user)) { - ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 1, 0); + ret = __bio_copy_iov(bio, iov, iov_count, 0, 1, 0); if (ret) goto cleanup; } @@ -1189,7 +1223,7 @@ cleanup: bio_put(bio); out_bmd: - bio_free_map_data(bmd); + kfree(bmd); return ERR_PTR(ret); } @@ -1485,7 +1519,7 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, if (IS_ERR(bio)) return bio; - if (bio->bi_size == len) + if (bio->bi_iter.bi_size == len) return bio; /* @@ -1506,16 +1540,15 @@ static void bio_copy_kern_endio(struct bio *bio, int err) bio_for_each_segment_all(bvec, bio, i) { char *addr = page_address(bvec->bv_page); - int len = bmd->iovecs[i].bv_len; if (read) - memcpy(p, addr, len); + memcpy(p, addr, bvec->bv_len); __free_page(bvec->bv_page); - p += len; + p += bvec->bv_len; } - bio_free_map_data(bmd); + kfree(bmd); bio_put(bio); } @@ -1686,11 +1719,11 @@ void bio_check_pages_dirty(struct bio *bio) #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE void bio_flush_dcache_pages(struct bio *bi) { - int i; - struct bio_vec *bvec; + struct bio_vec bvec; + struct bvec_iter iter; - bio_for_each_segment(bvec, bi, i) - flush_dcache_page(bvec->bv_page); + bio_for_each_segment(bvec, bi, iter) + flush_dcache_page(bvec.bv_page); } EXPORT_SYMBOL(bio_flush_dcache_pages); #endif @@ -1711,96 +1744,86 @@ EXPORT_SYMBOL(bio_flush_dcache_pages); **/ void bio_endio(struct bio *bio, int error) { - if (error) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - error = -EIO; + while (bio) { + BUG_ON(atomic_read(&bio->bi_remaining) <= 0); - if (bio->bi_end_io) - bio->bi_end_io(bio, error); -} -EXPORT_SYMBOL(bio_endio); + if (error) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + error = -EIO; -void bio_pair_release(struct bio_pair *bp) -{ - if (atomic_dec_and_test(&bp->cnt)) { - struct bio *master = bp->bio1.bi_private; + if (!atomic_dec_and_test(&bio->bi_remaining)) + return; - bio_endio(master, bp->error); - mempool_free(bp, bp->bio2.bi_private); + /* + * Need to have a real endio function for chained bios, + * otherwise various corner cases will break (like stacking + * block devices that save/restore bi_end_io) - however, we want + * to avoid unbounded recursion and blowing the stack. Tail call + * optimization would handle this, but compiling with frame + * pointers also disables gcc's sibling call optimization. + */ + if (bio->bi_end_io == bio_chain_endio) { + struct bio *parent = bio->bi_private; + bio_put(bio); + bio = parent; + } else { + if (bio->bi_end_io) + bio->bi_end_io(bio, error); + bio = NULL; + } } } -EXPORT_SYMBOL(bio_pair_release); +EXPORT_SYMBOL(bio_endio); -static void bio_pair_end_1(struct bio *bi, int err) +/** + * bio_endio_nodec - end I/O on a bio, without decrementing bi_remaining + * @bio: bio + * @error: error, if any + * + * For code that has saved and restored bi_end_io; thing hard before using this + * function, probably you should've cloned the entire bio. + **/ +void bio_endio_nodec(struct bio *bio, int error) { - struct bio_pair *bp = container_of(bi, struct bio_pair, bio1); - - if (err) - bp->error = err; - - bio_pair_release(bp); + atomic_inc(&bio->bi_remaining); + bio_endio(bio, error); } +EXPORT_SYMBOL(bio_endio_nodec); -static void bio_pair_end_2(struct bio *bi, int err) -{ - struct bio_pair *bp = container_of(bi, struct bio_pair, bio2); - - if (err) - bp->error = err; - - bio_pair_release(bp); -} - -/* - * split a bio - only worry about a bio with a single page in its iovec +/** + * bio_split - split a bio + * @bio: bio to split + * @sectors: number of sectors to split from the front of @bio + * @gfp: gfp mask + * @bs: bio set to allocate from + * + * Allocates and returns a new bio which represents @sectors from the start of + * @bio, and updates @bio to represent the remaining sectors. + * + * The newly allocated bio will point to @bio's bi_io_vec; it is the caller's + * responsibility to ensure that @bio is not freed before the split. */ -struct bio_pair *bio_split(struct bio *bi, int first_sectors) +struct bio *bio_split(struct bio *bio, int sectors, + gfp_t gfp, struct bio_set *bs) { - struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO); - - if (!bp) - return bp; - - trace_block_split(bdev_get_queue(bi->bi_bdev), bi, - bi->bi_sector + first_sectors); - - BUG_ON(bio_segments(bi) > 1); - atomic_set(&bp->cnt, 3); - bp->error = 0; - bp->bio1 = *bi; - bp->bio2 = *bi; - bp->bio2.bi_sector += first_sectors; - bp->bio2.bi_size -= first_sectors << 9; - bp->bio1.bi_size = first_sectors << 9; - - if (bi->bi_vcnt != 0) { - bp->bv1 = *bio_iovec(bi); - bp->bv2 = *bio_iovec(bi); - - if (bio_is_rw(bi)) { - bp->bv2.bv_offset += first_sectors << 9; - bp->bv2.bv_len -= first_sectors << 9; - bp->bv1.bv_len = first_sectors << 9; - } + struct bio *split = NULL; - bp->bio1.bi_io_vec = &bp->bv1; - bp->bio2.bi_io_vec = &bp->bv2; + BUG_ON(sectors <= 0); + BUG_ON(sectors >= bio_sectors(bio)); - bp->bio1.bi_max_vecs = 1; - bp->bio2.bi_max_vecs = 1; - } + split = bio_clone_fast(bio, gfp, bs); + if (!split) + return NULL; - bp->bio1.bi_end_io = bio_pair_end_1; - bp->bio2.bi_end_io = bio_pair_end_2; + split->bi_iter.bi_size = sectors << 9; - bp->bio1.bi_private = bi; - bp->bio2.bi_private = bio_split_pool; + if (bio_integrity(split)) + bio_integrity_trim(split, 0, sectors); - if (bio_integrity(bi)) - bio_integrity_split(bi, bp, first_sectors); + bio_advance(bio, split->bi_iter.bi_size); - return bp; + return split; } EXPORT_SYMBOL(bio_split); @@ -1814,80 +1837,20 @@ void bio_trim(struct bio *bio, int offset, int size) { /* 'bio' is a cloned bio which we need to trim to match * the given offset and size. - * This requires adjusting bi_sector, bi_size, and bi_io_vec */ - int i; - struct bio_vec *bvec; - int sofar = 0; size <<= 9; - if (offset == 0 && size == bio->bi_size) + if (offset == 0 && size == bio->bi_iter.bi_size) return; clear_bit(BIO_SEG_VALID, &bio->bi_flags); bio_advance(bio, offset << 9); - bio->bi_size = size; - - /* avoid any complications with bi_idx being non-zero*/ - if (bio->bi_idx) { - memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx, - (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec)); - bio->bi_vcnt -= bio->bi_idx; - bio->bi_idx = 0; - } - /* Make sure vcnt and last bv are not too big */ - bio_for_each_segment(bvec, bio, i) { - if (sofar + bvec->bv_len > size) - bvec->bv_len = size - sofar; - if (bvec->bv_len == 0) { - bio->bi_vcnt = i; - break; - } - sofar += bvec->bv_len; - } + bio->bi_iter.bi_size = size; } EXPORT_SYMBOL_GPL(bio_trim); -/** - * bio_sector_offset - Find hardware sector offset in bio - * @bio: bio to inspect - * @index: bio_vec index - * @offset: offset in bv_page - * - * Return the number of hardware sectors between beginning of bio - * and an end point indicated by a bio_vec index and an offset - * within that vector's page. - */ -sector_t bio_sector_offset(struct bio *bio, unsigned short index, - unsigned int offset) -{ - unsigned int sector_sz; - struct bio_vec *bv; - sector_t sectors; - int i; - - sector_sz = queue_logical_block_size(bio->bi_bdev->bd_disk->queue); - sectors = 0; - - if (index >= bio->bi_idx) - index = bio->bi_vcnt - 1; - - bio_for_each_segment_all(bv, bio, i) { - if (i == index) { - if (offset > bv->bv_offset) - sectors += (offset - bv->bv_offset) / sector_sz; - break; - } - - sectors += bv->bv_len / sector_sz; - } - - return sectors; -} -EXPORT_SYMBOL(bio_sector_offset); - /* * create memory pools for biovec's in a bio_set. * use the global biovec slabs created for general use. @@ -2065,11 +2028,6 @@ static int __init init_bio(void) if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE)) panic("bio: can't create integrity pool\n"); - bio_split_pool = mempool_create_kmalloc_pool(BIO_SPLIT_ENTRIES, - sizeof(struct bio_pair)); - if (!bio_split_pool) - panic("bio: can't create split pool\n"); - return 0; } subsys_initcall(init_bio); diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index aa976ec..a66768e 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -1,6 +1,7 @@ config BTRFS_FS tristate "Btrfs filesystem support" - select LIBCRC32C + select CRYPTO + select CRYPTO_CRC32C select ZLIB_INFLATE select ZLIB_DEFLATE select LZO_COMPRESS diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 1a44e42..f341a98 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -9,7 +9,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ - uuid-tree.o + uuid-tree.o props.o hash.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 0890c83..ff9b399 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -35,13 +35,6 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) char *value = NULL; struct posix_acl *acl; - if (!IS_POSIXACL(inode)) - return NULL; - - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - switch (type) { case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; @@ -76,31 +69,10 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) return acl; } -static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name, - void *value, size_t size, int type) -{ - struct posix_acl *acl; - int ret = 0; - - if (!IS_POSIXACL(dentry->d_inode)) - return -EOPNOTSUPP; - - acl = btrfs_get_acl(dentry->d_inode, type); - - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); - posix_acl_release(acl); - - return ret; -} - /* * Needs to be called with fs_mutex held */ -static int btrfs_set_acl(struct btrfs_trans_handle *trans, +static int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct posix_acl *acl, int type) { int ret, size = 0; @@ -158,35 +130,9 @@ out: return ret; } -static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) +int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - int ret; - struct posix_acl *acl = NULL; - - if (!inode_owner_or_capable(dentry->d_inode)) - return -EPERM; - - if (!IS_POSIXACL(dentry->d_inode)) - return -EOPNOTSUPP; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - - if (acl) { - ret = posix_acl_valid(acl); - if (ret) - goto out; - } - } - - ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type); -out: - posix_acl_release(acl); - - return ret; + return __btrfs_set_acl(NULL, inode, acl, type); } /* @@ -197,83 +143,31 @@ out: int btrfs_init_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir) { - struct posix_acl *acl = NULL; + struct posix_acl *default_acl, *acl; int ret = 0; /* this happens with subvols */ if (!dir) return 0; - if (!S_ISLNK(inode->i_mode)) { - if (IS_POSIXACL(dir)) { - acl = btrfs_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } + ret = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (ret) + return ret; - if (!acl) - inode->i_mode &= ~current_umask(); + if (default_acl) { + ret = __btrfs_set_acl(trans, inode, default_acl, + ACL_TYPE_DEFAULT); + posix_acl_release(default_acl); } - if (IS_POSIXACL(dir) && acl) { - if (S_ISDIR(inode->i_mode)) { - ret = btrfs_set_acl(trans, inode, acl, - ACL_TYPE_DEFAULT); - if (ret) - goto failed; - } - ret = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); - if (ret < 0) - return ret; - - if (ret > 0) { - /* we need an acl */ - ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS); - } else if (ret < 0) { - cache_no_acl(inode); - } - } else { - cache_no_acl(inode); + if (acl) { + if (!ret) + ret = __btrfs_set_acl(trans, inode, acl, + ACL_TYPE_ACCESS); + posix_acl_release(acl); } -failed: - posix_acl_release(acl); - - return ret; -} -int btrfs_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; - int ret = 0; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - if (!IS_POSIXACL(inode)) - return 0; - - acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR_OR_NULL(acl)) - return PTR_ERR(acl); - - ret = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (ret) - return ret; - ret = btrfs_set_acl(NULL, inode, acl, ACL_TYPE_ACCESS); - posix_acl_release(acl); + if (!default_acl && !acl) + cache_no_acl(inode); return ret; } - -const struct xattr_handler btrfs_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .get = btrfs_xattr_acl_get, - .set = btrfs_xattr_acl_set, -}; - -const struct xattr_handler btrfs_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .get = btrfs_xattr_acl_get, - .set = btrfs_xattr_acl_set, -}; diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 3775947..aded3ef 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -66,6 +66,16 @@ static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb, return 0; } +static void free_inode_elem_list(struct extent_inode_elem *eie) +{ + struct extent_inode_elem *eie_next; + + for (; eie; eie = eie_next) { + eie_next = eie->next; + kfree(eie); + } +} + static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte, u64 extent_item_pos, struct extent_inode_elem **eie) @@ -209,18 +219,19 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id, } static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, - struct ulist *parents, int level, - struct btrfs_key *key_for_search, u64 time_seq, - u64 wanted_disk_byte, - const u64 *extent_item_pos) + struct ulist *parents, struct __prelim_ref *ref, + int level, u64 time_seq, const u64 *extent_item_pos) { int ret = 0; int slot; struct extent_buffer *eb; struct btrfs_key key; + struct btrfs_key *key_for_search = &ref->key_for_search; struct btrfs_file_extent_item *fi; struct extent_inode_elem *eie = NULL, *old = NULL; u64 disk_byte; + u64 wanted_disk_byte = ref->wanted_disk_byte; + u64 count = 0; if (level != 0) { eb = path->nodes[level]; @@ -238,7 +249,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) ret = btrfs_next_old_leaf(root, path, time_seq); - while (!ret) { + while (!ret && count < ref->count) { eb = path->nodes[0]; slot = path->slots[0]; @@ -254,6 +265,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (disk_byte == wanted_disk_byte) { eie = NULL; old = NULL; + count++; if (extent_item_pos) { ret = check_extent_in_eb(&key, eb, fi, *extent_item_pos, @@ -273,6 +285,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, old = old->next; old->next = eie; } + eie = NULL; } next: ret = btrfs_next_old_item(root, path, time_seq); @@ -280,6 +293,8 @@ next: if (ret > 0) ret = 0; + else if (ret < 0) + free_inode_elem_list(eie); return ret; } @@ -299,23 +314,34 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, int ret = 0; int root_level; int level = ref->level; + int index; root_key.objectid = ref->root_id; root_key.type = BTRFS_ROOT_ITEM_KEY; root_key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + root = btrfs_read_fs_root_no_name(fs_info, &root_key); if (IS_ERR(root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); ret = PTR_ERR(root); goto out; } root_level = btrfs_old_root_level(root, time_seq); - if (root_level + 1 == level) + if (root_level + 1 == level) { + srcu_read_unlock(&fs_info->subvol_srcu, index); goto out; + } path->lowest_level = level; ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq); + + /* root node has been locked, we can release @subvol_srcu safely here */ + srcu_read_unlock(&fs_info->subvol_srcu, index); + pr_debug("search slot in root %llu (level %d, ref count %d) returned " "%d for key (%llu %u %llu)\n", ref->root_id, level, ref->count, ret, @@ -334,9 +360,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, eb = path->nodes[level]; } - ret = add_all_parents(root, path, parents, level, &ref->key_for_search, - time_seq, ref->wanted_disk_byte, - extent_item_pos); + ret = add_all_parents(root, path, parents, ref, level, time_seq, + extent_item_pos); out: path->lowest_level = 0; btrfs_release_path(path); @@ -376,10 +401,16 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, continue; err = __resolve_indirect_ref(fs_info, path, time_seq, ref, parents, extent_item_pos); - if (err == -ENOMEM) - goto out; - if (err) + /* + * we can only tolerate ENOENT,otherwise,we should catch error + * and return directly. + */ + if (err == -ENOENT) { continue; + } else if (err) { + ret = err; + goto out; + } /* we put the first parent into the ref at hand */ ULIST_ITER_INIT(&uiter); @@ -538,14 +569,13 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, if (extent_op && extent_op->update_key) btrfs_disk_key_to_cpu(&op_key, &extent_op->key); - while ((n = rb_prev(n))) { + spin_lock(&head->lock); + n = rb_first(&head->ref_root); + while (n) { struct btrfs_delayed_ref_node *node; node = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); - if (node->bytenr != head->node.bytenr) - break; - WARN_ON(node->is_head); - + n = rb_next(n); if (node->seq > seq) continue; @@ -612,10 +642,10 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, WARN_ON(1); } if (ret) - return ret; + break; } - - return 0; + spin_unlock(&head->lock); + return ret; } /* @@ -828,6 +858,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct list_head prefs_delayed; struct list_head prefs; struct __prelim_ref *ref; + struct extent_inode_elem *eie = NULL; INIT_LIST_HEAD(&prefs); INIT_LIST_HEAD(&prefs_delayed); @@ -882,15 +913,15 @@ again: btrfs_put_delayed_ref(&head->node); goto again; } + spin_unlock(&delayed_refs->lock); ret = __add_delayed_refs(head, time_seq, &prefs_delayed); mutex_unlock(&head->mutex); - if (ret) { - spin_unlock(&delayed_refs->lock); + if (ret) goto out; - } + } else { + spin_unlock(&delayed_refs->lock); } - spin_unlock(&delayed_refs->lock); } if (path->slots[0]) { @@ -941,7 +972,6 @@ again: goto out; } if (ref->count && ref->parent) { - struct extent_inode_elem *eie = NULL; if (extent_item_pos && !ref->inode_list) { u32 bsz; struct extent_buffer *eb; @@ -976,6 +1006,7 @@ again: eie = eie->next; eie->next = ref->inode_list; } + eie = NULL; } list_del(&ref->list); kmem_cache_free(btrfs_prelim_ref_cache, ref); @@ -994,7 +1025,8 @@ out: list_del(&ref->list); kmem_cache_free(btrfs_prelim_ref_cache, ref); } - + if (ret < 0) + free_inode_elem_list(eie); return ret; } @@ -1002,7 +1034,6 @@ static void free_leaf_list(struct ulist *blocks) { struct ulist_node *node = NULL; struct extent_inode_elem *eie; - struct extent_inode_elem *eie_next; struct ulist_iterator uiter; ULIST_ITER_INIT(&uiter); @@ -1010,10 +1041,7 @@ static void free_leaf_list(struct ulist *blocks) if (!node->aux) continue; eie = (struct extent_inode_elem *)(uintptr_t)node->aux; - for (; eie; eie = eie_next) { - eie_next = eie->next; - kfree(eie); - } + free_inode_elem_list(eie); node->aux = 0; } @@ -1101,44 +1129,13 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, if (!node) break; bytenr = node->val; + cond_resched(); } ulist_free(tmp); return 0; } - -static int __inode_info(u64 inum, u64 ioff, u8 key_type, - struct btrfs_root *fs_root, struct btrfs_path *path, - struct btrfs_key *found_key) -{ - int ret; - struct btrfs_key key; - struct extent_buffer *eb; - - key.type = key_type; - key.objectid = inum; - key.offset = ioff; - - ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); - if (ret < 0) - return ret; - - eb = path->nodes[0]; - if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(fs_root, path); - if (ret) - return ret; - eb = path->nodes[0]; - } - - btrfs_item_key_to_cpu(eb, found_key, path->slots[0]); - if (found_key->type != key.type || found_key->objectid != key.objectid) - return 1; - - return 0; -} - /* * this makes the path point to (inum INODE_ITEM ioff) */ @@ -1146,16 +1143,16 @@ int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, struct btrfs_path *path) { struct btrfs_key key; - return __inode_info(inum, ioff, BTRFS_INODE_ITEM_KEY, fs_root, path, - &key); + return btrfs_find_item(fs_root, path, inum, ioff, + BTRFS_INODE_ITEM_KEY, &key); } static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, struct btrfs_path *path, struct btrfs_key *found_key) { - return __inode_info(inum, ioff, BTRFS_INODE_REF_KEY, fs_root, path, - found_key); + return btrfs_find_item(fs_root, path, inum, ioff, + BTRFS_INODE_REF_KEY, found_key); } int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, @@ -1335,20 +1332,45 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); if (ret < 0) return ret; - ret = btrfs_previous_item(fs_info->extent_root, path, - 0, BTRFS_EXTENT_ITEM_KEY); - if (ret < 0) - return ret; - btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); + while (1) { + u32 nritems; + if (path->slots[0] == 0) { + btrfs_set_path_blocking(path); + ret = btrfs_prev_leaf(fs_info->extent_root, path); + if (ret != 0) { + if (ret > 0) { + pr_debug("logical %llu is not within " + "any extent\n", logical); + ret = -ENOENT; + } + return ret; + } + } else { + path->slots[0]--; + } + nritems = btrfs_header_nritems(path->nodes[0]); + if (nritems == 0) { + pr_debug("logical %llu is not within any extent\n", + logical); + return -ENOENT; + } + if (path->slots[0] == nritems) + path->slots[0]--; + + btrfs_item_key_to_cpu(path->nodes[0], found_key, + path->slots[0]); + if (found_key->type == BTRFS_EXTENT_ITEM_KEY || + found_key->type == BTRFS_METADATA_ITEM_KEY) + break; + } + if (found_key->type == BTRFS_METADATA_ITEM_KEY) size = fs_info->extent_root->leafsize; else if (found_key->type == BTRFS_EXTENT_ITEM_KEY) size = found_key->offset; - if ((found_key->type != BTRFS_EXTENT_ITEM_KEY && - found_key->type != BTRFS_METADATA_ITEM_KEY) || - found_key->objectid > logical || + if (found_key->objectid > logical || found_key->objectid + size <= logical) { pr_debug("logical %llu is not within any extent\n", logical); return -ENOENT; @@ -1601,7 +1623,6 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, struct btrfs_key found_key; while (!ret) { - path->leave_spinning = 1; ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path, &found_key); if (ret < 0) @@ -1614,9 +1635,12 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, parent = found_key.offset; slot = path->slots[0]; - eb = path->nodes[0]; - /* make sure we can use eb after releasing the path */ - atomic_inc(&eb->refs); + eb = btrfs_clone_extent_buffer(path->nodes[0]); + if (!eb) { + ret = -ENOMEM; + break; + } + extent_buffer_get(eb); btrfs_tree_read_lock(eb); btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); btrfs_release_path(path); @@ -1674,17 +1698,20 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, ++found; slot = path->slots[0]; - eb = path->nodes[0]; - /* make sure we can use eb after releasing the path */ - atomic_inc(&eb->refs); + eb = btrfs_clone_extent_buffer(path->nodes[0]); + if (!eb) { + ret = -ENOMEM; + break; + } + extent_buffer_get(eb); btrfs_tree_read_lock(eb); btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); btrfs_release_path(path); leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + item_size = btrfs_item_size_nr(leaf, slot); + ptr = btrfs_item_ptr_offset(leaf, slot); cur_offset = 0; while (cur_offset < item_size) { diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index ac0b39d..8fed212 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -43,6 +43,7 @@ #define BTRFS_INODE_COPY_EVERYTHING 8 #define BTRFS_INODE_IN_DELALLOC_LIST 9 #define BTRFS_INODE_READDIO_NEED_LOCK 10 +#define BTRFS_INODE_HAS_PROPS 11 /* in memory btrfs inode */ struct btrfs_inode { @@ -135,6 +136,9 @@ struct btrfs_inode { */ u64 index_cnt; + /* Cache the directory index number to speed the dir/file remove */ + u64 dir_index; + /* the fsync log has some corner cases that mean we have to check * directories to see if any unlinks have been done before * the directory was logged. See tree-log.c for all the diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 131d828..49a62b4 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1456,10 +1456,14 @@ static int btrfsic_handle_extent_data( btrfsic_read_from_block_data(block_ctx, &file_extent_item, file_extent_item_offset, sizeof(struct btrfs_file_extent_item)); - next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item) + - btrfs_stack_file_extent_offset(&file_extent_item); - generation = btrfs_stack_file_extent_generation(&file_extent_item); - num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item); + next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item); + if (btrfs_stack_file_extent_compression(&file_extent_item) == + BTRFS_COMPRESS_NONE) { + next_bytenr += btrfs_stack_file_extent_offset(&file_extent_item); + num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item); + } else { + num_bytes = btrfs_stack_file_extent_disk_num_bytes(&file_extent_item); + } generation = btrfs_stack_file_extent_generation(&file_extent_item); if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) @@ -1695,7 +1699,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, return -1; } bio->bi_bdev = block_ctx->dev->bdev; - bio->bi_sector = dev_bytenr >> 9; + bio->bi_iter.bi_sector = dev_bytenr >> 9; for (j = i; j < num_pages; j++) { ret = bio_add_page(bio, block_ctx->pagev[j], @@ -3013,7 +3017,7 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio) int bio_is_patched; char **mapped_datav; - dev_bytenr = 512 * bio->bi_sector; + dev_bytenr = 512 * bio->bi_iter.bi_sector; bio_is_patched = 0; if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) @@ -3021,8 +3025,8 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio) "submit_bio(rw=0x%x, bi_vcnt=%u," " bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", rw, bio->bi_vcnt, - (unsigned long long)bio->bi_sector, dev_bytenr, - bio->bi_bdev); + (unsigned long long)bio->bi_iter.bi_sector, + dev_bytenr, bio->bi_bdev); mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt, GFP_NOFS); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 1499b27..e2600cd 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -128,11 +128,10 @@ static int check_compressed_csum(struct inode *inode, kunmap_atomic(kaddr); if (csum != *cb_sum) { - printk(KERN_INFO "btrfs csum failed ino %llu " - "extent %llu csum %u " - "wanted %u mirror %d\n", - btrfs_ino(inode), disk_start, csum, *cb_sum, - cb->mirror_num); + btrfs_info(BTRFS_I(inode)->root->fs_info, + "csum failed ino %llu extent %llu csum %u wanted %u mirror %d", + btrfs_ino(inode), disk_start, csum, *cb_sum, + cb->mirror_num); ret = -EIO; goto fail; } @@ -172,7 +171,8 @@ static void end_compressed_bio_read(struct bio *bio, int err) goto out; inode = cb->inode; - ret = check_compressed_csum(inode, cb, (u64)bio->bi_sector << 9); + ret = check_compressed_csum(inode, cb, + (u64)bio->bi_iter.bi_sector << 9); if (ret) goto csum_failed; @@ -201,18 +201,16 @@ csum_failed: if (cb->errors) { bio_io_error(cb->orig_bio); } else { - int bio_index = 0; - struct bio_vec *bvec = cb->orig_bio->bi_io_vec; + int i; + struct bio_vec *bvec; /* * we have verified the checksum already, set page * checked so the end_io handlers know about it */ - while (bio_index < cb->orig_bio->bi_vcnt) { + bio_for_each_segment_all(bvec, cb->orig_bio, i) SetPageChecked(bvec->bv_page); - bvec++; - bio_index++; - } + bio_endio(cb->orig_bio, 0); } @@ -372,7 +370,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) { page = compressed_pages[pg_index]; page->mapping = inode->i_mapping; - if (bio->bi_size) + if (bio->bi_iter.bi_size) ret = io_tree->ops->merge_bio_hook(WRITE, page, 0, PAGE_CACHE_SIZE, bio, 0); @@ -412,7 +410,8 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); } if (bytes_left < PAGE_CACHE_SIZE) { - printk("bytes left %lu compress len %lu nr %lu\n", + btrfs_info(BTRFS_I(inode)->root->fs_info, + "bytes left %lu compress len %lu nr %lu", bytes_left, cb->compressed_len, cb->nr_pages); } bytes_left -= PAGE_CACHE_SIZE; @@ -506,7 +505,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (!em || last_offset < em->start || (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || - (em->block_start >> 9) != cb->orig_bio->bi_sector) { + (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) { free_extent_map(em); unlock_extent(tree, last_offset, end); unlock_page(page); @@ -552,7 +551,7 @@ next: * in it. We don't actually do IO on those pages but allocate new ones * to hold the compressed pages on disk. * - * bio->bi_sector points to the compressed extent on disk + * bio->bi_iter.bi_sector points to the compressed extent on disk * bio->bi_io_vec points to all of the inode pages * bio->bi_vcnt is a count of pages * @@ -573,7 +572,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, struct page *page; struct block_device *bdev; struct bio *comp_bio; - u64 cur_disk_byte = (u64)bio->bi_sector << 9; + u64 cur_disk_byte = (u64)bio->bi_iter.bi_sector << 9; u64 em_len; u64 em_start; struct extent_map *em; @@ -659,7 +658,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, page->mapping = inode->i_mapping; page->index = em_start >> PAGE_CACHE_SHIFT; - if (comp_bio->bi_size) + if (comp_bio->bi_iter.bi_size) ret = tree->ops->merge_bio_hook(READ, page, 0, PAGE_CACHE_SIZE, comp_bio, 0); @@ -687,8 +686,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, comp_bio, sums); BUG_ON(ret); /* -ENOMEM */ } - sums += (comp_bio->bi_size + root->sectorsize - 1) / - root->sectorsize; + sums += (comp_bio->bi_iter.bi_size + + root->sectorsize - 1) / root->sectorsize; ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 316136b..cbd3a7d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -39,9 +39,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct extent_buffer *src_buf); static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, int slot); -static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, +static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb); -static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); struct btrfs_path *btrfs_alloc_path(void) { @@ -475,6 +474,8 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, * the index is the shifted logical of the *new* root node for root replace * operations, or the shifted logical of the affected block for all other * operations. + * + * Note: must be called with write lock (tree_mod_log_write_lock). */ static noinline int __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) @@ -483,24 +484,9 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) struct rb_node **new; struct rb_node *parent = NULL; struct tree_mod_elem *cur; - int ret = 0; BUG_ON(!tm); - tree_mod_log_write_lock(fs_info); - if (list_empty(&fs_info->tree_mod_seq_list)) { - tree_mod_log_write_unlock(fs_info); - /* - * Ok we no longer care about logging modifications, free up tm - * and return 0. Any callers shouldn't be using tm after - * calling tree_mod_log_insert, but if they do we can just - * change this to return a special error code to let the callers - * do their own thing. - */ - kfree(tm); - return 0; - } - spin_lock(&fs_info->tree_mod_seq_lock); tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info); spin_unlock(&fs_info->tree_mod_seq_lock); @@ -518,18 +504,13 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) new = &((*new)->rb_left); else if (cur->seq > tm->seq) new = &((*new)->rb_right); - else { - ret = -EEXIST; - kfree(tm); - goto out; - } + else + return -EEXIST; } rb_link_node(&tm->node, parent, new); rb_insert_color(&tm->node, tm_root); -out: - tree_mod_log_write_unlock(fs_info); - return ret; + return 0; } /* @@ -545,19 +526,38 @@ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, return 1; if (eb && btrfs_header_level(eb) == 0) return 1; + + tree_mod_log_write_lock(fs_info); + if (list_empty(&(fs_info)->tree_mod_seq_list)) { + tree_mod_log_write_unlock(fs_info); + return 1; + } + return 0; } -static inline int -__tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int slot, - enum mod_log_op op, gfp_t flags) +/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */ +static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info, + struct extent_buffer *eb) +{ + smp_mb(); + if (list_empty(&(fs_info)->tree_mod_seq_list)) + return 0; + if (eb && btrfs_header_level(eb) == 0) + return 0; + + return 1; +} + +static struct tree_mod_elem * +alloc_tree_mod_elem(struct extent_buffer *eb, int slot, + enum mod_log_op op, gfp_t flags) { struct tree_mod_elem *tm; tm = kzalloc(sizeof(*tm), flags); if (!tm) - return -ENOMEM; + return NULL; tm->index = eb->start >> PAGE_CACHE_SHIFT; if (op != MOD_LOG_KEY_ADD) { @@ -567,8 +567,9 @@ __tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, tm->op = op; tm->slot = slot; tm->generation = btrfs_node_ptr_generation(eb, slot); + RB_CLEAR_NODE(&tm->node); - return __tree_mod_log_insert(fs_info, tm); + return tm; } static noinline int @@ -576,10 +577,27 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, int slot, enum mod_log_op op, gfp_t flags) { - if (tree_mod_dont_log(fs_info, eb)) + struct tree_mod_elem *tm; + int ret; + + if (!tree_mod_need_log(fs_info, eb)) return 0; - return __tree_mod_log_insert_key(fs_info, eb, slot, op, flags); + tm = alloc_tree_mod_elem(eb, slot, op, flags); + if (!tm) + return -ENOMEM; + + if (tree_mod_dont_log(fs_info, eb)) { + kfree(tm); + return 0; + } + + ret = __tree_mod_log_insert(fs_info, tm); + tree_mod_log_write_unlock(fs_info); + if (ret) + kfree(tm); + + return ret; } static noinline int @@ -587,53 +605,95 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, int dst_slot, int src_slot, int nr_items, gfp_t flags) { - struct tree_mod_elem *tm; - int ret; + struct tree_mod_elem *tm = NULL; + struct tree_mod_elem **tm_list = NULL; + int ret = 0; int i; + int locked = 0; - if (tree_mod_dont_log(fs_info, eb)) + if (!tree_mod_need_log(fs_info, eb)) return 0; + tm_list = kzalloc(nr_items * sizeof(struct tree_mod_elem *), flags); + if (!tm_list) + return -ENOMEM; + + tm = kzalloc(sizeof(*tm), flags); + if (!tm) { + ret = -ENOMEM; + goto free_tms; + } + + tm->index = eb->start >> PAGE_CACHE_SHIFT; + tm->slot = src_slot; + tm->move.dst_slot = dst_slot; + tm->move.nr_items = nr_items; + tm->op = MOD_LOG_MOVE_KEYS; + + for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { + tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot, + MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags); + if (!tm_list[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + + if (tree_mod_dont_log(fs_info, eb)) + goto free_tms; + locked = 1; + /* * When we override something during the move, we log these removals. * This can only happen when we move towards the beginning of the * buffer, i.e. dst_slot < src_slot. */ for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { - ret = __tree_mod_log_insert_key(fs_info, eb, i + dst_slot, - MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS); - BUG_ON(ret < 0); + ret = __tree_mod_log_insert(fs_info, tm_list[i]); + if (ret) + goto free_tms; } - tm = kzalloc(sizeof(*tm), flags); - if (!tm) - return -ENOMEM; + ret = __tree_mod_log_insert(fs_info, tm); + if (ret) + goto free_tms; + tree_mod_log_write_unlock(fs_info); + kfree(tm_list); - tm->index = eb->start >> PAGE_CACHE_SHIFT; - tm->slot = src_slot; - tm->move.dst_slot = dst_slot; - tm->move.nr_items = nr_items; - tm->op = MOD_LOG_MOVE_KEYS; + return 0; +free_tms: + for (i = 0; i < nr_items; i++) { + if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) + rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); + kfree(tm_list[i]); + } + if (locked) + tree_mod_log_write_unlock(fs_info); + kfree(tm_list); + kfree(tm); - return __tree_mod_log_insert(fs_info, tm); + return ret; } -static inline void -__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) +static inline int +__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, + struct tree_mod_elem **tm_list, + int nritems) { - int i; - u32 nritems; + int i, j; int ret; - if (btrfs_header_level(eb) == 0) - return; - - nritems = btrfs_header_nritems(eb); for (i = nritems - 1; i >= 0; i--) { - ret = __tree_mod_log_insert_key(fs_info, eb, i, - MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); - BUG_ON(ret < 0); + ret = __tree_mod_log_insert(fs_info, tm_list[i]); + if (ret) { + for (j = nritems - 1; j > i; j--) + rb_erase(&tm_list[j]->node, + &fs_info->tree_mod_log); + return ret; + } } + + return 0; } static noinline int @@ -642,17 +702,38 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, struct extent_buffer *new_root, gfp_t flags, int log_removal) { - struct tree_mod_elem *tm; + struct tree_mod_elem *tm = NULL; + struct tree_mod_elem **tm_list = NULL; + int nritems = 0; + int ret = 0; + int i; - if (tree_mod_dont_log(fs_info, NULL)) + if (!tree_mod_need_log(fs_info, NULL)) return 0; - if (log_removal) - __tree_mod_log_free_eb(fs_info, old_root); + if (log_removal && btrfs_header_level(old_root) > 0) { + nritems = btrfs_header_nritems(old_root); + tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *), + flags); + if (!tm_list) { + ret = -ENOMEM; + goto free_tms; + } + for (i = 0; i < nritems; i++) { + tm_list[i] = alloc_tree_mod_elem(old_root, i, + MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags); + if (!tm_list[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + } tm = kzalloc(sizeof(*tm), flags); - if (!tm) - return -ENOMEM; + if (!tm) { + ret = -ENOMEM; + goto free_tms; + } tm->index = new_root->start >> PAGE_CACHE_SHIFT; tm->old_root.logical = old_root->start; @@ -660,7 +741,30 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, tm->generation = btrfs_header_generation(old_root); tm->op = MOD_LOG_ROOT_REPLACE; - return __tree_mod_log_insert(fs_info, tm); + if (tree_mod_dont_log(fs_info, NULL)) + goto free_tms; + + if (tm_list) + ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems); + if (!ret) + ret = __tree_mod_log_insert(fs_info, tm); + + tree_mod_log_write_unlock(fs_info); + if (ret) + goto free_tms; + kfree(tm_list); + + return ret; + +free_tms: + if (tm_list) { + for (i = 0; i < nritems; i++) + kfree(tm_list[i]); + kfree(tm_list); + } + kfree(tm); + + return ret; } static struct tree_mod_elem * @@ -729,31 +833,75 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) return __tree_mod_log_search(fs_info, start, min_seq, 0); } -static noinline void +static noinline int tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, struct extent_buffer *src, unsigned long dst_offset, unsigned long src_offset, int nr_items) { - int ret; + int ret = 0; + struct tree_mod_elem **tm_list = NULL; + struct tree_mod_elem **tm_list_add, **tm_list_rem; int i; + int locked = 0; - if (tree_mod_dont_log(fs_info, NULL)) - return; + if (!tree_mod_need_log(fs_info, NULL)) + return 0; if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) - return; + return 0; + + tm_list = kzalloc(nr_items * 2 * sizeof(struct tree_mod_elem *), + GFP_NOFS); + if (!tm_list) + return -ENOMEM; + tm_list_add = tm_list; + tm_list_rem = tm_list + nr_items; for (i = 0; i < nr_items; i++) { - ret = __tree_mod_log_insert_key(fs_info, src, - i + src_offset, - MOD_LOG_KEY_REMOVE, GFP_NOFS); - BUG_ON(ret < 0); - ret = __tree_mod_log_insert_key(fs_info, dst, - i + dst_offset, - MOD_LOG_KEY_ADD, - GFP_NOFS); - BUG_ON(ret < 0); + tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset, + MOD_LOG_KEY_REMOVE, GFP_NOFS); + if (!tm_list_rem[i]) { + ret = -ENOMEM; + goto free_tms; + } + + tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset, + MOD_LOG_KEY_ADD, GFP_NOFS); + if (!tm_list_add[i]) { + ret = -ENOMEM; + goto free_tms; + } } + + if (tree_mod_dont_log(fs_info, NULL)) + goto free_tms; + locked = 1; + + for (i = 0; i < nr_items; i++) { + ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]); + if (ret) + goto free_tms; + ret = __tree_mod_log_insert(fs_info, tm_list_add[i]); + if (ret) + goto free_tms; + } + + tree_mod_log_write_unlock(fs_info); + kfree(tm_list); + + return 0; + +free_tms: + for (i = 0; i < nr_items * 2; i++) { + if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) + rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); + kfree(tm_list[i]); + } + if (locked) + tree_mod_log_write_unlock(fs_info); + kfree(tm_list); + + return ret; } static inline void @@ -772,18 +920,58 @@ tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, { int ret; - ret = __tree_mod_log_insert_key(fs_info, eb, slot, + ret = tree_mod_log_insert_key(fs_info, eb, slot, MOD_LOG_KEY_REPLACE, atomic ? GFP_ATOMIC : GFP_NOFS); BUG_ON(ret < 0); } -static noinline void +static noinline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { + struct tree_mod_elem **tm_list = NULL; + int nritems = 0; + int i; + int ret = 0; + + if (btrfs_header_level(eb) == 0) + return 0; + + if (!tree_mod_need_log(fs_info, NULL)) + return 0; + + nritems = btrfs_header_nritems(eb); + tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *), + GFP_NOFS); + if (!tm_list) + return -ENOMEM; + + for (i = 0; i < nritems; i++) { + tm_list[i] = alloc_tree_mod_elem(eb, i, + MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); + if (!tm_list[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + if (tree_mod_dont_log(fs_info, eb)) - return; - __tree_mod_log_free_eb(fs_info, eb); + goto free_tms; + + ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems); + tree_mod_log_write_unlock(fs_info); + if (ret) + goto free_tms; + kfree(tm_list); + + return 0; + +free_tms: + for (i = 0; i < nritems; i++) + kfree(tm_list[i]); + kfree(tm_list); + + return ret; } static noinline void @@ -1041,8 +1229,13 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(parent); - if (last_ref) - tree_mod_log_free_eb(root->fs_info, buf); + if (last_ref) { + ret = tree_mod_log_free_eb(root->fs_info, buf); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } + } btrfs_free_tree_block(trans, root, buf, parent_start, last_ref); } @@ -1287,8 +1480,8 @@ get_old_root(struct btrfs_root *root, u64 time_seq) old = read_tree_block(root, logical, blocksize, 0); if (WARN_ON(!old || !extent_buffer_uptodate(old))) { free_extent_buffer(old); - pr_warn("btrfs: failed to read tree block %llu from get_old_root\n", - logical); + btrfs_warn(root->fs_info, + "failed to read tree block %llu from get_old_root", logical); } else { eb = btrfs_clone_extent_buffer(old); free_extent_buffer(old); @@ -2462,6 +2655,49 @@ static int key_search(struct extent_buffer *b, struct btrfs_key *key, return 0; } +int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *found_path, + u64 iobjectid, u64 ioff, u8 key_type, + struct btrfs_key *found_key) +{ + int ret; + struct btrfs_key key; + struct extent_buffer *eb; + struct btrfs_path *path; + + key.type = key_type; + key.objectid = iobjectid; + key.offset = ioff; + + if (found_path == NULL) { + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + } else + path = found_path; + + ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); + if ((ret < 0) || (found_key == NULL)) { + if (path != found_path) + btrfs_free_path(path); + return ret; + } + + eb = path->nodes[0]; + if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(fs_root, path); + if (ret) + return ret; + eb = path->nodes[0]; + } + + btrfs_item_key_to_cpu(eb, found_key, path->slots[0]); + if (found_key->type != key.type || + found_key->objectid != key.objectid) + return 1; + + return 0; +} + /* * look for key in the tree. path is filled in with nodes along the way * if key is found, we return zero and you can find the item in the leaf @@ -2495,6 +2731,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len > 0); WARN_ON(p->nodes[0] != NULL); + BUG_ON(!cow && ins_len); if (ins_len < 0) { lowest_unlock = 2; @@ -2603,8 +2840,6 @@ again: } } cow_done: - BUG_ON(!cow && ins_len); - p->nodes[level] = b; btrfs_clear_path_blocking(p, NULL, 0); @@ -2614,13 +2849,19 @@ cow_done: * It is safe to drop the lock on our parent before we * go through the expensive btree search on b. * - * If cow is true, then we might be changing slot zero, - * which may require changing the parent. So, we can't - * drop the lock until after we know which slot we're - * operating on. + * If we're inserting or deleting (ins_len != 0), then we might + * be changing slot zero, which may require changing the parent. + * So, we can't drop the lock until after we know which slot + * we're operating on. */ - if (!cow) - btrfs_unlock_up_safe(p, level + 1); + if (!ins_len && !p->keep_locks) { + int u = level + 1; + + if (u < BTRFS_MAX_LEVEL && p->locks[u]) { + btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]); + p->locks[u] = 0; + } + } ret = key_search(b, key, level, &prev_cmp, &slot); @@ -2648,7 +2889,7 @@ cow_done: * which means we must have a write lock * on the parent */ - if (slot == 0 && cow && + if (slot == 0 && ins_len && write_lock_level < level + 1) { write_lock_level = level + 1; btrfs_release_path(p); @@ -2901,7 +3142,9 @@ again: if (ret < 0) return ret; if (!ret) { - p->slots[0] = btrfs_header_nritems(leaf) - 1; + leaf = p->nodes[0]; + if (p->slots[0] == btrfs_header_nritems(leaf)) + p->slots[0]--; return 0; } if (!return_any) @@ -3022,8 +3265,12 @@ static int push_node_left(struct btrfs_trans_handle *trans, } else push_items = min(src_nritems - 8, push_items); - tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, - push_items); + ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, + push_items); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(dst_nritems), btrfs_node_key_ptr_offset(0), @@ -3093,8 +3340,12 @@ static int balance_node_right(struct btrfs_trans_handle *trans, (dst_nritems) * sizeof(struct btrfs_key_ptr)); - tree_mod_log_eb_copy(root->fs_info, dst, src, 0, - src_nritems - push_items, push_items); + ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0, + src_nritems - push_items, push_items); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(src_nritems - push_items), @@ -3295,7 +3546,12 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_header_chunk_tree_uuid(split), BTRFS_UUID_SIZE); - tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid); + ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0, + mid, c_nritems - mid); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } copy_extent_buffer(split, c, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(mid), @@ -3362,8 +3618,8 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root, int ret; ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems); if (ret < 0) { - printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, " - "used %d nritems %d\n", + btrfs_crit(root->fs_info, + "leaf free space ret %d, leaf data size %lu, used %d nritems %d", ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root), leaf_space_used(leaf, 0, nritems), nritems); } @@ -3571,6 +3827,19 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (left_nritems == 0) goto out_unlock; + if (path->slots[0] == left_nritems && !empty) { + /* Key greater than all keys in the leaf, right neighbor has + * enough room for it and we're not emptying our leaf to delete + * it, therefore use right neighbor to insert the new item and + * no need to touch/dirty our left leaft. */ + btrfs_tree_unlock(left); + free_extent_buffer(left); + path->nodes[0] = right; + path->slots[0] = 0; + path->slots[1]++; + return 0; + } + return __push_leaf_right(trans, root, path, min_data_size, empty, right, free_space, left_nritems, min_slot); out_unlock: @@ -3887,14 +4156,17 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans, int progress = 0; int slot; u32 nritems; + int space_needed = data_size; slot = path->slots[0]; + if (slot < btrfs_header_nritems(path->nodes[0])) + space_needed -= btrfs_leaf_free_space(root, path->nodes[0]); /* * try to push all the items after our slot into the * right leaf */ - ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot); + ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot); if (ret < 0) return ret; @@ -3914,7 +4186,7 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans, /* try to push all the items before our slot into the next leaf */ slot = path->slots[0]; - ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot); + ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot); if (ret < 0) return ret; @@ -3958,13 +4230,18 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, /* first try to make some room by pushing left and right */ if (data_size && path->nodes[1]) { - wret = push_leaf_right(trans, root, path, data_size, - data_size, 0, 0); + int space_needed = data_size; + + if (slot < btrfs_header_nritems(l)) + space_needed -= btrfs_leaf_free_space(root, l); + + wret = push_leaf_right(trans, root, path, space_needed, + space_needed, 0, 0); if (wret < 0) return wret; if (wret) { - wret = push_leaf_left(trans, root, path, data_size, - data_size, 0, (u32)-1); + wret = push_leaf_left(trans, root, path, space_needed, + space_needed, 0, (u32)-1); if (wret < 0) return wret; } @@ -4432,7 +4709,7 @@ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path, BUG_ON(slot < 0); if (slot >= nritems) { btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "slot %d too large, nritems %d\n", + btrfs_crit(root->fs_info, "slot %d too large, nritems %d", slot, nritems); BUG_ON(1); } @@ -4495,7 +4772,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, if (btrfs_leaf_free_space(root, leaf) < total_size) { btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "not enough freespace need %u have %d\n", + btrfs_crit(root->fs_info, "not enough freespace need %u have %d", total_size, btrfs_leaf_free_space(root, leaf)); BUG(); } @@ -4505,7 +4782,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, if (old_data < data_end) { btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "slot %d old_data %d data_end %d\n", + btrfs_crit(root->fs_info, "slot %d old_data %d data_end %d", slot, old_data, data_end); BUG_ON(1); } @@ -4817,7 +5094,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, * This may release the path, and so you may lose any locks held at the * time you call it. */ -static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) +int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) { struct btrfs_key key; struct btrfs_disk_key found_key; @@ -5240,7 +5517,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, if (!left_start_ctransid || !right_start_ctransid) { WARN(1, KERN_WARNING - "btrfs: btrfs_compare_tree detected " + "BTRFS: btrfs_compare_tree detected " "a change in one of the trees while " "iterating. This is probably a " "bug.\n"); @@ -5680,3 +5957,46 @@ int btrfs_previous_item(struct btrfs_root *root, } return 1; } + +/* + * search in extent tree to find a previous Metadata/Data extent item with + * min objecitd. + * + * returns 0 if something is found, 1 if nothing was found and < 0 on error + */ +int btrfs_previous_extent_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid) +{ + struct btrfs_key found_key; + struct extent_buffer *leaf; + u32 nritems; + int ret; + + while (1) { + if (path->slots[0] == 0) { + btrfs_set_path_blocking(path); + ret = btrfs_prev_leaf(root, path); + if (ret != 0) + return ret; + } else { + path->slots[0]--; + } + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (nritems == 0) + return 1; + if (path->slots[0] == nritems) + path->slots[0]--; + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid < min_objectid) + break; + if (found_key.type == BTRFS_EXTENT_ITEM_KEY || + found_key.type == BTRFS_METADATA_ITEM_KEY) + return 0; + if (found_key.objectid == min_objectid && + found_key.type < BTRFS_EXTENT_ITEM_KEY) + break; + } + return 1; +} diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 54ab861..2c1a42c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -521,9 +521,15 @@ struct btrfs_super_block { #define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6) #define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7) #define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL << 8) +#define BTRFS_FEATURE_INCOMPAT_NO_HOLES (1ULL << 9) #define BTRFS_FEATURE_COMPAT_SUPP 0ULL +#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL +#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL +#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL +#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL + #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ @@ -532,7 +538,12 @@ struct btrfs_super_block { BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ BTRFS_FEATURE_INCOMPAT_RAID56 | \ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ - BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) + BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ + BTRFS_FEATURE_INCOMPAT_NO_HOLES) + +#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ + (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) +#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL /* * A leaf is full of items. offset and size tell us where to find @@ -1094,7 +1105,7 @@ struct btrfs_qgroup_limit_item { } __attribute__ ((__packed__)); struct btrfs_space_info { - u64 flags; + spinlock_t lock; u64 total_bytes; /* total bytes in the space, this doesn't take mirrors into account */ @@ -1104,14 +1115,25 @@ struct btrfs_space_info { transaction finishes */ u64 bytes_reserved; /* total bytes the allocator has reserved for current allocations */ - u64 bytes_readonly; /* total bytes that are read only */ - u64 bytes_may_use; /* number of bytes that may be used for delalloc/allocations */ + u64 bytes_readonly; /* total bytes that are read only */ + + unsigned int full:1; /* indicates that we cannot allocate any more + chunks for this space */ + unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ + + unsigned int flush:1; /* set if we are trying to make space */ + + unsigned int force_alloc; /* set if we need to force a chunk + alloc for this space */ + u64 disk_used; /* total bytes used on disk */ u64 disk_total; /* total bytes on disk, takes mirrors into account */ + u64 flags; + /* * bytes_pinned is kept in line with what is actually pinned, as in * we've called update_block_group and dropped the bytes_used counter @@ -1124,22 +1146,15 @@ struct btrfs_space_info { */ struct percpu_counter total_bytes_pinned; - unsigned int full:1; /* indicates that we cannot allocate any more - chunks for this space */ - unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ - - unsigned int flush:1; /* set if we are trying to make space */ - - unsigned int force_alloc; /* set if we need to force a chunk - alloc for this space */ - struct list_head list; + struct rw_semaphore groups_sem; /* for block groups in our same type */ struct list_head block_groups[BTRFS_NR_RAID_TYPES]; - spinlock_t lock; - struct rw_semaphore groups_sem; wait_queue_head_t wait; + + struct kobject kobj; + struct kobject block_group_kobjs[BTRFS_NR_RAID_TYPES]; }; #define BTRFS_BLOCK_RSV_GLOBAL 1 @@ -1346,6 +1361,7 @@ struct btrfs_fs_info { u64 generation; u64 last_trans_committed; + u64 avg_delayed_ref_runtime; /* * this is updated to the current trans every time a full commit @@ -1448,7 +1464,6 @@ struct btrfs_fs_info { spinlock_t tree_mod_seq_lock; atomic64_t tree_mod_seq; struct list_head tree_mod_seq_list; - struct seq_list tree_mod_seq_elem; /* this protects tree_mod_log */ rwlock_t tree_mod_log_lock; @@ -1515,6 +1530,8 @@ struct btrfs_fs_info { int thread_pool_size; struct kobject super_kobj; + struct kobject *space_info_kobj; + struct kobject *device_dir_kobj; struct completion kobj_unregister; int do_barriers; int closing; @@ -1643,6 +1660,10 @@ struct btrfs_fs_info { spinlock_t reada_lock; struct radix_tree_root reada_tree; + /* Extent buffer radix tree */ + spinlock_t buffer_lock; + struct radix_tree_root buffer_radix; + /* next backup root to be overwritten */ int backup_root_index; @@ -1795,6 +1816,12 @@ struct btrfs_root { struct list_head ordered_extents; struct list_head ordered_root; u64 nr_ordered_extents; + + /* + * Number of currently running SEND ioctls to prevent + * manipulation with the read-only status via SUBVOL_SETFLAGS + */ + int send_in_progress; }; struct btrfs_ioctl_defrag_range_args { @@ -1997,6 +2024,7 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22) #define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23) +#define BTRFS_MOUNT_CHANGE_INODE_CACHE (1 << 24) #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) @@ -2925,6 +2953,10 @@ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation, struct btrfs_file_extent_item, generation, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes, struct btrfs_file_extent_item, num_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes, + struct btrfs_file_extent_item, disk_num_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression, + struct btrfs_file_extent_item, compression, 8); static inline unsigned long btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e) @@ -2958,15 +2990,6 @@ BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, other_encoding, 16); -/* this returns the number of file bytes represented by the inline item. - * If an item is compressed, this is the uncompressed size - */ -static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb, - struct btrfs_file_extent_item *e) -{ - return btrfs_file_extent_ram_bytes(eb, e); -} - /* * this returns the number of bytes used by the item on disk, minus the * size of any extent headers. If a file is compressed on disk, this is @@ -2980,6 +3003,32 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, return btrfs_item_size(eb, e) - offset; } +/* this returns the number of file bytes represented by the inline item. + * If an item is compressed, this is the uncompressed size + */ +static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb, + int slot, + struct btrfs_file_extent_item *fi) +{ + struct btrfs_map_token token; + + btrfs_init_map_token(&token); + /* + * return the space used on disk if this item isn't + * compressed or encoded + */ + if (btrfs_token_file_extent_compression(eb, fi, &token) == 0 && + btrfs_token_file_extent_encryption(eb, fi, &token) == 0 && + btrfs_token_file_extent_other_encoding(eb, fi, &token) == 0) { + return btrfs_file_extent_inline_item_len(eb, + btrfs_item_nr(slot)); + } + + /* otherwise use the ram bytes field */ + return btrfs_token_file_extent_ram_bytes(eb, fi, &token); +} + + /* btrfs_dev_stats_item */ static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb, struct btrfs_dev_stats_item *ptr, @@ -3143,6 +3192,8 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root, int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); @@ -3163,6 +3214,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group( struct btrfs_fs_info *info, u64 bytenr); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); +int get_block_group_index(struct btrfs_block_group_cache *cache); struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize, u64 parent, u64 root_objectid, @@ -3301,6 +3353,8 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2); int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, int type); +int btrfs_previous_extent_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid); void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); @@ -3350,6 +3404,8 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *new_key); +int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, + u64 inum, u64 ioff, u8 key_type, struct btrfs_key *found_key); int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *p, int ins_len, int cow); @@ -3399,6 +3455,7 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, } int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); +int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq); static inline int btrfs_next_old_item(struct btrfs_root *root, @@ -3563,12 +3620,6 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, u64 inode_objectid, u64 ref_objectid, u64 *index); -int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, int mod, - u64 *ret_index); int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); @@ -3676,7 +3727,9 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, struct extent_state **cached_state); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, - struct btrfs_root *new_root, u64 new_dirid); + struct btrfs_root *new_root, + struct btrfs_root *parent_root, + u64 new_dirid); int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); @@ -3745,7 +3798,10 @@ extern const struct file_operations btrfs_file_operations; int __btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, u64 start, u64 end, - u64 *drop_end, int drop_cache); + u64 *drop_end, int drop_cache, + int replace_extent, + u32 extent_item_size, + int *key_inserted); int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, int drop_cache); @@ -3764,6 +3820,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, /* sysfs.c */ int btrfs_init_sysfs(void); void btrfs_exit_sysfs(void); +int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info); +void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info); /* xattr.c */ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); @@ -3796,14 +3854,20 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) btrfs_printk(fs_info, KERN_NOTICE fmt, ##args) #define btrfs_info(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_INFO fmt, ##args) + +#ifdef DEBUG #define btrfs_debug(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) +#else +#define btrfs_debug(fs_info, fmt, args...) \ + no_printk(KERN_DEBUG fmt, ##args) +#endif #ifdef CONFIG_BTRFS_ASSERT static inline void assfail(char *expr, char *file, int line) { - printk(KERN_ERR "BTRFS assertion failed: %s, file: %s, line: %d", + pr_err("BTRFS: assertion failed: %s, file: %s, line: %d", expr, file, line); BUG(); } @@ -3841,7 +3905,7 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, if (!(features & flag)) { features |= flag; btrfs_set_super_incompat_flags(disk_super, features); - printk(KERN_INFO "btrfs: setting %llu feature flag\n", + btrfs_info(fs_info, "setting %llu feature flag", flag); } spin_unlock(&fs_info->super_lock); @@ -3899,20 +3963,17 @@ do { \ /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL struct posix_acl *btrfs_get_acl(struct inode *inode, int type); +int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); int btrfs_init_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir); -int btrfs_acl_chmod(struct inode *inode); #else #define btrfs_get_acl NULL +#define btrfs_set_acl NULL static inline int btrfs_init_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir) { return 0; } -static inline int btrfs_acl_chmod(struct inode *inode) -{ - return 0; -} #endif /* relocation.c */ diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 8d292fb..451b00c 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -55,8 +55,7 @@ static inline void btrfs_init_delayed_node( delayed_node->inode_id = inode_id; atomic_set(&delayed_node->refs, 0); delayed_node->count = 0; - delayed_node->in_list = 0; - delayed_node->inode_dirty = 0; + delayed_node->flags = 0; delayed_node->ins_root = RB_ROOT; delayed_node->del_root = RB_ROOT; mutex_init(&delayed_node->mutex); @@ -172,7 +171,7 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root, int mod) { spin_lock(&root->lock); - if (node->in_list) { + if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { if (!list_empty(&node->p_list)) list_move_tail(&node->p_list, &root->prepare_list); else if (mod) @@ -182,7 +181,7 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root, list_add_tail(&node->p_list, &root->prepare_list); atomic_inc(&node->refs); /* inserted into list */ root->nodes++; - node->in_list = 1; + set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags); } spin_unlock(&root->lock); } @@ -192,13 +191,13 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root, struct btrfs_delayed_node *node) { spin_lock(&root->lock); - if (node->in_list) { + if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { root->nodes--; atomic_dec(&node->refs); /* not in the list */ list_del_init(&node->n_list); if (!list_empty(&node->p_list)) list_del_init(&node->p_list); - node->in_list = 0; + clear_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags); } spin_unlock(&root->lock); } @@ -231,7 +230,8 @@ static struct btrfs_delayed_node *btrfs_next_delayed_node( delayed_root = node->root->fs_info->delayed_root; spin_lock(&delayed_root->lock); - if (!node->in_list) { /* not in the list */ + if (!test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { + /* not in the list */ if (list_empty(&delayed_root->node_list)) goto out; p = delayed_root->node_list.next; @@ -1004,9 +1004,10 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) { struct btrfs_delayed_root *delayed_root; - if (delayed_node && delayed_node->inode_dirty) { + if (delayed_node && + test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { BUG_ON(!delayed_node->root); - delayed_node->inode_dirty = 0; + clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); delayed_node->count--; delayed_root = delayed_node->root->fs_info->delayed_root; @@ -1014,6 +1015,18 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) } } +static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node) +{ + struct btrfs_delayed_root *delayed_root; + + ASSERT(delayed_node->root); + clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags); + delayed_node->count--; + + delayed_root = delayed_node->root->fs_info->delayed_root; + finish_one_item(delayed_root); +} + static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -1022,13 +1035,19 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, struct btrfs_key key; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; + int mod; int ret; key.objectid = node->inode_id; btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); key.offset = 0; - ret = btrfs_lookup_inode(trans, root, path, &key, 1); + if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) + mod = -1; + else + mod = 1; + + ret = btrfs_lookup_inode(trans, root, path, &key, mod); if (ret > 0) { btrfs_release_path(path); return -ENOENT; @@ -1036,19 +1055,58 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, return ret; } - btrfs_unlock_up_safe(path, 1); leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item, sizeof(struct btrfs_inode_item)); btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(path); + if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) + goto no_iref; + + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(leaf)) + goto search; +again: + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != node->inode_id) + goto out; + + if (key.type != BTRFS_INODE_REF_KEY && + key.type != BTRFS_INODE_EXTREF_KEY) + goto out; + + /* + * Delayed iref deletion is for the inode who has only one link, + * so there is only one iref. The case that several irefs are + * in the same item doesn't exist. + */ + btrfs_del_item(trans, root, path); +out: + btrfs_release_delayed_iref(node); +no_iref: + btrfs_release_path(path); +err_out: btrfs_delayed_inode_release_metadata(root, node); btrfs_release_delayed_inode(node); - return 0; + return ret; + +search: + btrfs_release_path(path); + + btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY); + key.offset = -1; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto err_out; + ASSERT(ret); + + ret = 0; + leaf = path->nodes[0]; + path->slots[0]--; + goto again; } static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, @@ -1059,7 +1117,7 @@ static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, int ret; mutex_lock(&node->mutex); - if (!node->inode_dirty) { + if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &node->flags)) { mutex_unlock(&node->mutex); return 0; } @@ -1203,7 +1261,7 @@ int btrfs_commit_inode_delayed_inode(struct inode *inode) return 0; mutex_lock(&delayed_node->mutex); - if (!delayed_node->inode_dirty) { + if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { mutex_unlock(&delayed_node->mutex); btrfs_release_delayed_node(delayed_node); return 0; @@ -1227,7 +1285,7 @@ int btrfs_commit_inode_delayed_inode(struct inode *inode) trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv; mutex_lock(&delayed_node->mutex); - if (delayed_node->inode_dirty) + if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) ret = __btrfs_update_delayed_inode(trans, delayed_node->root, path, delayed_node); else @@ -1300,36 +1358,9 @@ again: trans->block_rsv = &root->fs_info->delayed_block_rsv; __btrfs_commit_inode_delayed_items(trans, path, delayed_node); - /* - * Maybe new delayed items have been inserted, so we need requeue - * the work. Besides that, we must dequeue the empty delayed nodes - * to avoid the race between delayed items balance and the worker. - * The race like this: - * Task1 Worker thread - * count == 0, needn't requeue - * also needn't insert the - * delayed node into prepare - * list again. - * add lots of delayed items - * queue the delayed node - * already in the list, - * and not in the prepare - * list, it means the delayed - * node is being dealt with - * by the worker. - * do delayed items balance - * the delayed node is being - * dealt with by the worker - * now, just wait. - * the worker goto idle. - * Task1 will sleep until the transaction is commited. - */ - mutex_lock(&delayed_node->mutex); - btrfs_dequeue_delayed_node(root->fs_info->delayed_root, delayed_node); - mutex_unlock(&delayed_node->mutex); trans->block_rsv = block_rsv; - btrfs_end_transaction_dmeta(trans, root); + btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty_nodelay(root); release_path: @@ -1376,52 +1407,41 @@ void btrfs_assert_delayed_root_empty(struct btrfs_root *root) WARN_ON(btrfs_first_delayed_node(delayed_root)); } -static int refs_newer(struct btrfs_delayed_root *delayed_root, - int seq, int count) +static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) { int val = atomic_read(&delayed_root->items_seq); - if (val < seq || val >= seq + count) + if (val < seq || val >= seq + BTRFS_DELAYED_BATCH) + return 1; + + if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) return 1; + return 0; } void btrfs_balance_delayed_items(struct btrfs_root *root) { struct btrfs_delayed_root *delayed_root; - int seq; delayed_root = btrfs_get_delayed_root(root); if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) return; - seq = atomic_read(&delayed_root->items_seq); - if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) { + int seq; int ret; - DEFINE_WAIT(__wait); + + seq = atomic_read(&delayed_root->items_seq); ret = btrfs_wq_run_delayed_node(delayed_root, root, 0); if (ret) return; - while (1) { - prepare_to_wait(&delayed_root->wait, &__wait, - TASK_INTERRUPTIBLE); - - if (refs_newer(delayed_root, seq, - BTRFS_DELAYED_BATCH) || - atomic_read(&delayed_root->items) < - BTRFS_DELAYED_BACKGROUND) { - break; - } - if (!signal_pending(current)) - schedule(); - else - break; - } - finish_wait(&delayed_root->wait, &__wait); + wait_event_interruptible(delayed_root->wait, + could_end_wait(delayed_root, seq)); + return; } btrfs_wq_run_delayed_node(delayed_root, root, BTRFS_DELAYED_BATCH); @@ -1472,9 +1492,9 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, mutex_lock(&delayed_node->mutex); ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); if (unlikely(ret)) { - printk(KERN_ERR "err add delayed dir index item(name: %.*s) " + btrfs_err(root->fs_info, "err add delayed dir index item(name: %.*s) " "into the insertion tree of the delayed node" - "(root id: %llu, inode id: %llu, errno: %d)\n", + "(root id: %llu, inode id: %llu, errno: %d)", name_len, name, delayed_node->root->objectid, delayed_node->inode_id, ret); BUG(); @@ -1544,9 +1564,9 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, mutex_lock(&node->mutex); ret = __btrfs_add_delayed_deletion_item(node, item); if (unlikely(ret)) { - printk(KERN_ERR "err add delayed dir index item(index: %llu) " + btrfs_err(root->fs_info, "err add delayed dir index item(index: %llu) " "into the deletion tree of the delayed node" - "(root id: %llu, inode id: %llu, errno: %d)\n", + "(root id: %llu, inode id: %llu, errno: %d)", index, node->root->objectid, node->inode_id, ret); BUG(); @@ -1759,7 +1779,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) return -ENOENT; mutex_lock(&delayed_node->mutex); - if (!delayed_node->inode_dirty) { + if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { mutex_unlock(&delayed_node->mutex); btrfs_release_delayed_node(delayed_node); return -ENOENT; @@ -1810,7 +1830,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, return PTR_ERR(delayed_node); mutex_lock(&delayed_node->mutex); - if (delayed_node->inode_dirty) { + if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { fill_stack_inode_item(trans, &delayed_node->inode_item, inode); goto release_node; } @@ -1821,7 +1841,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, goto release_node; fill_stack_inode_item(trans, &delayed_node->inode_item, inode); - delayed_node->inode_dirty = 1; + set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); delayed_node->count++; atomic_inc(&root->fs_info->delayed_root->items); release_node: @@ -1830,6 +1850,41 @@ release_node: return ret; } +int btrfs_delayed_delete_inode_ref(struct inode *inode) +{ + struct btrfs_delayed_node *delayed_node; + + delayed_node = btrfs_get_or_create_delayed_node(inode); + if (IS_ERR(delayed_node)) + return PTR_ERR(delayed_node); + + /* + * We don't reserve space for inode ref deletion is because: + * - We ONLY do async inode ref deletion for the inode who has only + * one link(i_nlink == 1), it means there is only one inode ref. + * And in most case, the inode ref and the inode item are in the + * same leaf, and we will deal with them at the same time. + * Since we are sure we will reserve the space for the inode item, + * it is unnecessary to reserve space for inode ref deletion. + * - If the inode ref and the inode item are not in the same leaf, + * We also needn't worry about enospc problem, because we reserve + * much more space for the inode update than it needs. + * - At the worst, we can steal some space from the global reservation. + * It is very rare. + */ + mutex_lock(&delayed_node->mutex); + if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) + goto release_node; + + set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags); + delayed_node->count++; + atomic_inc(&BTRFS_I(inode)->root->fs_info->delayed_root->items); +release_node: + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return 0; +} + static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) { struct btrfs_root *root = delayed_node->root; @@ -1852,7 +1907,10 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) btrfs_release_delayed_item(prev_item); } - if (delayed_node->inode_dirty) { + if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) + btrfs_release_delayed_iref(delayed_node); + + if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { btrfs_delayed_inode_release_metadata(root, delayed_node); btrfs_release_delayed_inode(delayed_node); } diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index a4b38f9..f70119f 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -48,6 +48,10 @@ struct btrfs_delayed_root { wait_queue_head_t wait; }; +#define BTRFS_DELAYED_NODE_IN_LIST 0 +#define BTRFS_DELAYED_NODE_INODE_DIRTY 1 +#define BTRFS_DELAYED_NODE_DEL_IREF 2 + struct btrfs_delayed_node { u64 inode_id; u64 bytes_reserved; @@ -65,8 +69,7 @@ struct btrfs_delayed_node { struct btrfs_inode_item inode_item; atomic_t refs; u64 index_cnt; - bool in_list; - bool inode_dirty; + unsigned long flags; int count; }; @@ -125,6 +128,7 @@ int btrfs_commit_inode_delayed_inode(struct inode *inode); int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); int btrfs_fill_inode(struct inode *inode, u32 *rdev); +int btrfs_delayed_delete_inode_ref(struct inode *inode); /* Used for drop dead root */ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index e4d467b..f3bff89 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -161,35 +161,61 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, return NULL; } +/* insert a new ref to head ref rbtree */ +static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent_node = NULL; + struct btrfs_delayed_ref_head *entry; + struct btrfs_delayed_ref_head *ins; + u64 bytenr; + + ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node); + bytenr = ins->node.bytenr; + while (*p) { + parent_node = *p; + entry = rb_entry(parent_node, struct btrfs_delayed_ref_head, + href_node); + + if (bytenr < entry->node.bytenr) + p = &(*p)->rb_left; + else if (bytenr > entry->node.bytenr) + p = &(*p)->rb_right; + else + return entry; + } + + rb_link_node(node, parent_node, p); + rb_insert_color(node, root); + return NULL; +} + /* * find an head entry based on bytenr. This returns the delayed ref * head if it was able to find one, or NULL if nothing was in that spot. * If return_bigger is given, the next bigger entry is returned if no exact * match is found. */ -static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, - u64 bytenr, - struct btrfs_delayed_ref_node **last, - int return_bigger) +static struct btrfs_delayed_ref_head * +find_ref_head(struct rb_root *root, u64 bytenr, + struct btrfs_delayed_ref_head **last, int return_bigger) { struct rb_node *n; - struct btrfs_delayed_ref_node *entry; + struct btrfs_delayed_ref_head *entry; int cmp = 0; again: n = root->rb_node; entry = NULL; while (n) { - entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); - WARN_ON(!entry->in_tree); + entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); if (last) *last = entry; - if (bytenr < entry->bytenr) + if (bytenr < entry->node.bytenr) cmp = -1; - else if (bytenr > entry->bytenr) - cmp = 1; - else if (!btrfs_delayed_ref_is_head(entry)) + else if (bytenr > entry->node.bytenr) cmp = 1; else cmp = 0; @@ -203,12 +229,12 @@ again: } if (entry && return_bigger) { if (cmp > 0) { - n = rb_next(&entry->rb_node); + n = rb_next(&entry->href_node); if (!n) n = rb_first(root); - entry = rb_entry(n, struct btrfs_delayed_ref_node, - rb_node); - bytenr = entry->bytenr; + entry = rb_entry(n, struct btrfs_delayed_ref_head, + href_node); + bytenr = entry->node.bytenr; return_bigger = 0; goto again; } @@ -243,33 +269,38 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref) { - rb_erase(&ref->rb_node, &delayed_refs->root); + if (btrfs_delayed_ref_is_head(ref)) { + head = btrfs_delayed_node_to_head(ref); + rb_erase(&head->href_node, &delayed_refs->href_root); + } else { + assert_spin_locked(&head->lock); + rb_erase(&ref->rb_node, &head->ref_root); + } ref->in_tree = 0; btrfs_put_delayed_ref(ref); - delayed_refs->num_entries--; + atomic_dec(&delayed_refs->num_entries); if (trans->delayed_ref_updates) trans->delayed_ref_updates--; } static int merge_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref, u64 seq) { struct rb_node *node; - int merged = 0; int mod = 0; int done = 0; - node = rb_prev(&ref->rb_node); - while (node) { + node = rb_next(&ref->rb_node); + while (!done && node) { struct btrfs_delayed_ref_node *next; next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - node = rb_prev(node); - if (next->bytenr != ref->bytenr) - break; + node = rb_next(node); if (seq && next->seq >= seq) break; if (comp_entry(ref, next, 0)) @@ -289,12 +320,11 @@ static int merge_ref(struct btrfs_trans_handle *trans, mod = -next->ref_mod; } - merged++; - drop_delayed_ref(trans, delayed_refs, next); + drop_delayed_ref(trans, delayed_refs, head, next); ref->ref_mod += mod; if (ref->ref_mod == 0) { - drop_delayed_ref(trans, delayed_refs, ref); - break; + drop_delayed_ref(trans, delayed_refs, head, ref); + done = 1; } else { /* * You can't have multiples of the same ref on a tree @@ -303,13 +333,8 @@ static int merge_ref(struct btrfs_trans_handle *trans, WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || ref->type == BTRFS_SHARED_BLOCK_REF_KEY); } - - if (done) - break; - node = rb_prev(&ref->rb_node); } - - return merged; + return done; } void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, @@ -320,6 +345,14 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, struct rb_node *node; u64 seq = 0; + assert_spin_locked(&head->lock); + /* + * We don't have too much refs to merge in the case of delayed data + * refs. + */ + if (head->is_data) + return; + spin_lock(&fs_info->tree_mod_seq_lock); if (!list_empty(&fs_info->tree_mod_seq_list)) { struct seq_list *elem; @@ -330,22 +363,19 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, } spin_unlock(&fs_info->tree_mod_seq_lock); - node = rb_prev(&head->node.rb_node); + node = rb_first(&head->ref_root); while (node) { struct btrfs_delayed_ref_node *ref; ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (ref->bytenr != head->node.bytenr) - break; - /* We can't merge refs that are outside of our seq count */ if (seq && ref->seq >= seq) break; - if (merge_ref(trans, delayed_refs, ref, seq)) - node = rb_prev(&head->node.rb_node); + if (merge_ref(trans, delayed_refs, head, ref, seq)) + node = rb_first(&head->ref_root); else - node = rb_prev(node); + node = rb_next(&ref->rb_node); } } @@ -373,71 +403,52 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, return ret; } -int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, - struct list_head *cluster, u64 start) +struct btrfs_delayed_ref_head * +btrfs_select_ref_head(struct btrfs_trans_handle *trans) { - int count = 0; struct btrfs_delayed_ref_root *delayed_refs; - struct rb_node *node; - struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_head *head; + u64 start; + bool loop = false; delayed_refs = &trans->transaction->delayed_refs; - if (start == 0) { - node = rb_first(&delayed_refs->root); - } else { - ref = NULL; - find_ref_head(&delayed_refs->root, start + 1, &ref, 1); - if (ref) { - node = &ref->rb_node; - } else - node = rb_first(&delayed_refs->root); - } + again: - while (node && count < 32) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (btrfs_delayed_ref_is_head(ref)) { - head = btrfs_delayed_node_to_head(ref); - if (list_empty(&head->cluster)) { - list_add_tail(&head->cluster, cluster); - delayed_refs->run_delayed_start = - head->node.bytenr; - count++; - - WARN_ON(delayed_refs->num_heads_ready == 0); - delayed_refs->num_heads_ready--; - } else if (count) { - /* the goal of the clustering is to find extents - * that are likely to end up in the same extent - * leaf on disk. So, we don't want them spread - * all over the tree. Stop now if we've hit - * a head that was already in use - */ - break; - } - } - node = rb_next(node); - } - if (count) { - return 0; - } else if (start) { - /* - * we've gone to the end of the rbtree without finding any - * clusters. start from the beginning and try again - */ + start = delayed_refs->run_delayed_start; + head = find_ref_head(&delayed_refs->href_root, start, NULL, 1); + if (!head && !loop) { + delayed_refs->run_delayed_start = 0; start = 0; - node = rb_first(&delayed_refs->root); - goto again; + loop = true; + head = find_ref_head(&delayed_refs->href_root, start, NULL, 1); + if (!head) + return NULL; + } else if (!head && loop) { + return NULL; } - return 1; -} -void btrfs_release_ref_cluster(struct list_head *cluster) -{ - struct list_head *pos, *q; + while (head->processing) { + struct rb_node *node; + + node = rb_next(&head->href_node); + if (!node) { + if (loop) + return NULL; + delayed_refs->run_delayed_start = 0; + start = 0; + loop = true; + goto again; + } + head = rb_entry(node, struct btrfs_delayed_ref_head, + href_node); + } - list_for_each_safe(pos, q, cluster) - list_del_init(pos); + head->processing = 1; + WARN_ON(delayed_refs->num_heads_ready == 0); + delayed_refs->num_heads_ready--; + delayed_refs->run_delayed_start = head->node.bytenr + + head->node.num_bytes; + return head; } /* @@ -451,6 +462,7 @@ void btrfs_release_ref_cluster(struct list_head *cluster) static noinline void update_existing_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *existing, struct btrfs_delayed_ref_node *update) { @@ -463,7 +475,7 @@ update_existing_ref(struct btrfs_trans_handle *trans, */ existing->ref_mod--; if (existing->ref_mod == 0) - drop_delayed_ref(trans, delayed_refs, existing); + drop_delayed_ref(trans, delayed_refs, head, existing); else WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || existing->type == BTRFS_SHARED_BLOCK_REF_KEY); @@ -533,9 +545,13 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, } } /* - * update the reference mod on the head to reflect this new operation + * update the reference mod on the head to reflect this new operation, + * only need the lock for this case cause we could be processing it + * currently, for refs we just added we know we're a-ok. */ + spin_lock(&existing_ref->lock); existing->ref_mod += update->ref_mod; + spin_unlock(&existing_ref->lock); } /* @@ -543,13 +559,13 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, * this does all the dirty work in terms of maintaining the correct * overall modification count. */ -static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, - u64 bytenr, u64 num_bytes, - int action, int is_data) +static noinline struct btrfs_delayed_ref_head * +add_delayed_ref_head(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *ref, u64 bytenr, + u64 num_bytes, int action, int is_data) { - struct btrfs_delayed_ref_node *existing; + struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_head *head_ref = NULL; struct btrfs_delayed_ref_root *delayed_refs; int count_mod = 1; @@ -596,38 +612,43 @@ static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info, head_ref = btrfs_delayed_node_to_head(ref); head_ref->must_insert_reserved = must_insert_reserved; head_ref->is_data = is_data; + head_ref->ref_root = RB_ROOT; + head_ref->processing = 0; - INIT_LIST_HEAD(&head_ref->cluster); + spin_lock_init(&head_ref->lock); mutex_init(&head_ref->mutex); trace_add_delayed_ref_head(ref, head_ref, action); - existing = tree_insert(&delayed_refs->root, &ref->rb_node); - + existing = htree_insert(&delayed_refs->href_root, + &head_ref->href_node); if (existing) { - update_existing_head_ref(existing, ref); + update_existing_head_ref(&existing->node, ref); /* * we've updated the existing ref, free the newly * allocated ref */ kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); + head_ref = existing; } else { delayed_refs->num_heads++; delayed_refs->num_heads_ready++; - delayed_refs->num_entries++; + atomic_inc(&delayed_refs->num_entries); trans->delayed_ref_updates++; } + return head_ref; } /* * helper to insert a delayed tree ref into the rbtree. */ -static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, - u64 bytenr, u64 num_bytes, u64 parent, - u64 ref_root, int level, int action, - int for_cow) +static noinline void +add_delayed_tree_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head_ref, + struct btrfs_delayed_ref_node *ref, u64 bytenr, + u64 num_bytes, u64 parent, u64 ref_root, int level, + int action, int for_cow) { struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_tree_ref *full_ref; @@ -663,30 +684,33 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info, trace_add_delayed_tree_ref(ref, full_ref, action); - existing = tree_insert(&delayed_refs->root, &ref->rb_node); - + spin_lock(&head_ref->lock); + existing = tree_insert(&head_ref->ref_root, &ref->rb_node); if (existing) { - update_existing_ref(trans, delayed_refs, existing, ref); + update_existing_ref(trans, delayed_refs, head_ref, existing, + ref); /* * we've updated the existing ref, free the newly * allocated ref */ kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref); } else { - delayed_refs->num_entries++; + atomic_inc(&delayed_refs->num_entries); trans->delayed_ref_updates++; } + spin_unlock(&head_ref->lock); } /* * helper to insert a delayed data ref into the rbtree. */ -static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, - u64 bytenr, u64 num_bytes, u64 parent, - u64 ref_root, u64 owner, u64 offset, - int action, int for_cow) +static noinline void +add_delayed_data_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head_ref, + struct btrfs_delayed_ref_node *ref, u64 bytenr, + u64 num_bytes, u64 parent, u64 ref_root, u64 owner, + u64 offset, int action, int for_cow) { struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_data_ref *full_ref; @@ -724,19 +748,21 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info, trace_add_delayed_data_ref(ref, full_ref, action); - existing = tree_insert(&delayed_refs->root, &ref->rb_node); - + spin_lock(&head_ref->lock); + existing = tree_insert(&head_ref->ref_root, &ref->rb_node); if (existing) { - update_existing_ref(trans, delayed_refs, existing, ref); + update_existing_ref(trans, delayed_refs, head_ref, existing, + ref); /* * we've updated the existing ref, free the newly * allocated ref */ kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref); } else { - delayed_refs->num_entries++; + atomic_inc(&delayed_refs->num_entries); trans->delayed_ref_updates++; } + spin_unlock(&head_ref->lock); } /* @@ -775,10 +801,10 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, - num_bytes, action, 0); + head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, + bytenr, num_bytes, action, 0); - add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, + add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, level, action, for_cow); spin_unlock(&delayed_refs->lock); @@ -823,10 +849,10 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, - num_bytes, action, 1); + head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, + bytenr, num_bytes, action, 1); - add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, + add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, owner, offset, action, for_cow); spin_unlock(&delayed_refs->lock); @@ -869,14 +895,10 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) { - struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_root *delayed_refs; delayed_refs = &trans->transaction->delayed_refs; - ref = find_ref_head(&delayed_refs->root, bytenr, NULL, 0); - if (ref) - return btrfs_delayed_node_to_head(ref); - return NULL; + return find_ref_head(&delayed_refs->href_root, bytenr, NULL, 0); } void btrfs_delayed_ref_exit(void) diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 70b962c..4ba9b93 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -81,7 +81,10 @@ struct btrfs_delayed_ref_head { */ struct mutex mutex; - struct list_head cluster; + spinlock_t lock; + struct rb_root ref_root; + + struct rb_node href_node; struct btrfs_delayed_extent_op *extent_op; /* @@ -98,6 +101,7 @@ struct btrfs_delayed_ref_head { */ unsigned int must_insert_reserved:1; unsigned int is_data:1; + unsigned int processing:1; }; struct btrfs_delayed_tree_ref { @@ -116,7 +120,8 @@ struct btrfs_delayed_data_ref { }; struct btrfs_delayed_ref_root { - struct rb_root root; + /* head ref rbtree */ + struct rb_root href_root; /* this spin lock protects the rbtree and the entries inside */ spinlock_t lock; @@ -124,7 +129,7 @@ struct btrfs_delayed_ref_root { /* how many delayed ref updates we've queued, used by the * throttling code */ - unsigned long num_entries; + atomic_t num_entries; /* total number of head nodes in tree */ unsigned long num_heads; @@ -133,15 +138,6 @@ struct btrfs_delayed_ref_root { unsigned long num_heads_ready; /* - * bumped when someone is making progress on the delayed - * refs, so that other procs know they are just adding to - * contention intead of helping - */ - atomic_t procs_running_refs; - atomic_t ref_seq; - wait_queue_head_t wait; - - /* * set when the tree is flushing before a transaction commit, * used by the throttling code to decide if new updates need * to be run right away @@ -226,9 +222,9 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) mutex_unlock(&head->mutex); } -int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, - struct list_head *cluster, u64 search_start); -void btrfs_release_ref_cluster(struct list_head *cluster); + +struct btrfs_delayed_ref_head * +btrfs_select_ref_head(struct btrfs_trans_handle *trans); int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 2cfc3dff..564c926 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -102,7 +102,8 @@ no_valid_dev_replace_entry_found: ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item); if (item_size != sizeof(struct btrfs_dev_replace_item)) { - pr_warn("btrfs: dev_replace entry found has unexpected size, ignore entry\n"); + btrfs_warn(fs_info, + "dev_replace entry found has unexpected size, ignore entry"); goto no_valid_dev_replace_entry_found; } @@ -145,13 +146,19 @@ no_valid_dev_replace_entry_found: if (!dev_replace->srcdev && !btrfs_test_opt(dev_root, DEGRADED)) { ret = -EIO; - pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n", - src_devid); + btrfs_warn(fs_info, + "cannot mount because device replace operation is ongoing and"); + btrfs_warn(fs_info, + "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?", + src_devid); } if (!dev_replace->tgtdev && !btrfs_test_opt(dev_root, DEGRADED)) { ret = -EIO; - pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run btrfs dev scan?\n", + btrfs_warn(fs_info, + "cannot mount because device replace operation is ongoing and"); + btrfs_warn(fs_info, + "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?", BTRFS_DEV_REPLACE_DEVID); } if (dev_replace->tgtdev) { @@ -210,7 +217,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, } ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); if (ret < 0) { - pr_warn("btrfs: error %d while searching for dev_replace item!\n", + btrfs_warn(fs_info, "error %d while searching for dev_replace item!", ret); goto out; } @@ -230,7 +237,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, */ ret = btrfs_del_item(trans, dev_root, path); if (ret != 0) { - pr_warn("btrfs: delete too small dev_replace item failed %d!\n", + btrfs_warn(fs_info, "delete too small dev_replace item failed %d!", ret); goto out; } @@ -243,7 +250,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, dev_root, path, &key, sizeof(*ptr)); if (ret < 0) { - pr_warn("btrfs: insert dev_replace item failed %d!\n", + btrfs_warn(fs_info, "insert dev_replace item failed %d!", ret); goto out; } @@ -305,7 +312,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, struct btrfs_device *src_device = NULL; if (btrfs_fs_incompat(fs_info, RAID56)) { - pr_warn("btrfs: dev_replace cannot yet handle RAID5/RAID6\n"); + btrfs_warn(fs_info, "dev_replace cannot yet handle RAID5/RAID6"); return -EINVAL; } @@ -325,7 +332,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, &tgt_device); if (ret) { - pr_err("btrfs: target device %s is invalid!\n", + btrfs_err(fs_info, "target device %s is invalid!", args->start.tgtdev_name); mutex_unlock(&fs_info->volume_mutex); return -EINVAL; @@ -341,7 +348,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, } if (tgt_device->total_bytes < src_device->total_bytes) { - pr_err("btrfs: target device is smaller than source device!\n"); + btrfs_err(fs_info, "target device is smaller than source device!"); ret = -EINVAL; goto leave_no_lock; } @@ -366,7 +373,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, dev_replace->tgtdev = tgt_device; printk_in_rcu(KERN_INFO - "btrfs: dev_replace from %s (devid %llu) to %s started\n", + "BTRFS: dev_replace from %s (devid %llu) to %s started\n", src_device->missing ? "<missing disk>" : rcu_str_deref(src_device->name), src_device->devid, @@ -489,7 +496,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, if (scrub_ret) { printk_in_rcu(KERN_ERR - "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", + "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", src_device->missing ? "<missing disk>" : rcu_str_deref(src_device->name), src_device->devid, @@ -504,7 +511,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, } printk_in_rcu(KERN_INFO - "btrfs: dev_replace from %s (devid %llu) to %s) finished\n", + "BTRFS: dev_replace from %s (devid %llu) to %s) finished\n", src_device->missing ? "<missing disk>" : rcu_str_deref(src_device->name), src_device->devid, @@ -699,7 +706,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; dev_replace->time_stopped = get_seconds(); dev_replace->item_needs_writeback = 1; - pr_info("btrfs: suspending dev_replace for unmount\n"); + btrfs_info(fs_info, "suspending dev_replace for unmount"); break; } @@ -728,8 +735,9 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) break; } if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) { - pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n" - "btrfs: you may cancel the operation after 'mount -o degraded'\n"); + btrfs_info(fs_info, "cannot continue dev_replace, tgtdev is missing"); + btrfs_info(fs_info, + "you may cancel the operation after 'mount -o degraded'"); btrfs_dev_replace_unlock(dev_replace); return 0; } @@ -755,14 +763,14 @@ static int btrfs_dev_replace_kthread(void *data) kfree(status_args); do_div(progress, 10); printk_in_rcu(KERN_INFO - "btrfs: continuing dev_replace from %s (devid %llu) to %s @%u%%\n", - dev_replace->srcdev->missing ? "<missing disk>" : - rcu_str_deref(dev_replace->srcdev->name), - dev_replace->srcdev->devid, - dev_replace->tgtdev ? - rcu_str_deref(dev_replace->tgtdev->name) : - "<missing target disk>", - (unsigned int)progress); + "BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n", + dev_replace->srcdev->missing ? "<missing disk>" : + rcu_str_deref(dev_replace->srcdev->name), + dev_replace->srcdev->devid, + dev_replace->tgtdev ? + rcu_str_deref(dev_replace->tgtdev->name) : + "<missing target disk>", + (unsigned int)progress); } btrfs_dev_replace_continue_on_mount(fs_info); atomic_set(&fs_info->mutually_exclusive_operation_running, 0); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index c031ea3..a0691df 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -261,7 +261,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, * see if there is room in the item to insert this * name */ - data_size = sizeof(*di) + name_len + sizeof(struct btrfs_item); + data_size = sizeof(*di) + name_len; leaf = path->nodes[0]; slot = path->slots[0]; if (data_size + btrfs_item_size_nr(leaf, slot) + @@ -459,7 +459,7 @@ int verify_dir_item(struct btrfs_root *root, u8 type = btrfs_dir_type(leaf, dir_item); if (type >= BTRFS_FT_MAX) { - printk(KERN_CRIT "btrfs: invalid dir item type: %d\n", + btrfs_crit(root->fs_info, "invalid dir item type: %d", (int)type); return 1; } @@ -468,7 +468,7 @@ int verify_dir_item(struct btrfs_root *root, namelen = XATTR_NAME_MAX; if (btrfs_dir_name_len(leaf, dir_item) > namelen) { - printk(KERN_CRIT "btrfs: invalid dir item name len: %u\n", + btrfs_crit(root->fs_info, "invalid dir item name len: %u", (unsigned)btrfs_dir_data_len(leaf, dir_item)); return 1; } @@ -476,7 +476,7 @@ int verify_dir_item(struct btrfs_root *root, /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */ if ((btrfs_dir_data_len(leaf, dir_item) + btrfs_dir_name_len(leaf, dir_item)) > BTRFS_MAX_XATTR_SIZE(root)) { - printk(KERN_CRIT "btrfs: invalid dir item name + data len: %u + %u\n", + btrfs_crit(root->fs_info, "invalid dir item name + data len: %u + %u", (unsigned)btrfs_dir_name_len(leaf, dir_item), (unsigned)btrfs_dir_data_len(leaf, dir_item)); return 1; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8072cfa..0e69295 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -48,6 +48,7 @@ #include "rcu-string.h" #include "dev-replace.h" #include "raid56.h" +#include "sysfs.h" #ifdef CONFIG_X86 #include <asm/cpufeature.h> @@ -299,11 +300,11 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, memcpy(&found, result, csum_size); read_extent_buffer(buf, &val, 0, csum_size); - printk_ratelimited(KERN_INFO "btrfs: %s checksum verify " - "failed on %llu wanted %X found %X " - "level %d\n", - root->fs_info->sb->s_id, buf->start, - val, found, btrfs_header_level(buf)); + printk_ratelimited(KERN_INFO + "BTRFS: %s checksum verify failed on %llu wanted %X found %X " + "level %d\n", + root->fs_info->sb->s_id, buf->start, + val, found, btrfs_header_level(buf)); if (result != (char *)&inline_result) kfree(result); return 1; @@ -382,13 +383,14 @@ static int btrfs_check_super_csum(char *raw_disk_sb) ret = 1; if (ret && btrfs_super_generation(disk_sb) < 10) { - printk(KERN_WARNING "btrfs: super block crcs don't match, older mkfs detected\n"); + printk(KERN_WARNING + "BTRFS: super block crcs don't match, older mkfs detected\n"); ret = 0; } } if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) { - printk(KERN_ERR "btrfs: unsupported checksum algorithm %u\n", + printk(KERN_ERR "BTRFS: unsupported checksum algorithm %u\n", csum_type); ret = 1; } @@ -464,13 +466,10 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) { - struct extent_io_tree *tree; u64 start = page_offset(page); u64 found_start; struct extent_buffer *eb; - tree = &BTRFS_I(page->mapping->host)->io_tree; - eb = (struct extent_buffer *)page->private; if (page != eb->pages[0]) return 0; @@ -500,8 +499,8 @@ static int check_tree_block_fsid(struct btrfs_root *root, } #define CORRUPT(reason, eb, root, slot) \ - printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu," \ - "root=%llu, slot=%d\n", reason, \ + btrfs_crit(root->fs_info, "corrupt leaf, %s: block=%llu," \ + "root=%llu, slot=%d", reason, \ btrfs_header_bytenr(eb), root->objectid, slot) static noinline int check_leaf(struct btrfs_root *root, @@ -569,7 +568,6 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, u64 phy_offset, struct page *page, u64 start, u64 end, int mirror) { - struct extent_io_tree *tree; u64 found_start; int found_level; struct extent_buffer *eb; @@ -580,7 +578,6 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, if (!page->private) goto out; - tree = &BTRFS_I(page->mapping->host)->io_tree; eb = (struct extent_buffer *)page->private; /* the pending IO might have been the only thing that kept this buffer @@ -600,21 +597,21 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { - printk_ratelimited(KERN_INFO "btrfs bad tree block start " + printk_ratelimited(KERN_INFO "BTRFS: bad tree block start " "%llu %llu\n", found_start, eb->start); ret = -EIO; goto err; } if (check_tree_block_fsid(root, eb)) { - printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n", + printk_ratelimited(KERN_INFO "BTRFS: bad fsid on block %llu\n", eb->start); ret = -EIO; goto err; } found_level = btrfs_header_level(eb); if (found_level >= BTRFS_MAX_LEVEL) { - btrfs_info(root->fs_info, "bad tree block level %d\n", + btrfs_info(root->fs_info, "bad tree block level %d", (int)btrfs_header_level(eb)); ret = -EIO; goto err; @@ -842,20 +839,17 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, static int btree_csum_one_bio(struct bio *bio) { - struct bio_vec *bvec = bio->bi_io_vec; - int bio_index = 0; + struct bio_vec *bvec; struct btrfs_root *root; - int ret = 0; + int i, ret = 0; - WARN_ON(bio->bi_vcnt <= 0); - while (bio_index < bio->bi_vcnt) { + bio_for_each_segment_all(bvec, bio, i) { root = BTRFS_I(bvec->bv_page->mapping->host)->root; ret = csum_dirty_buffer(root, bvec->bv_page); if (ret) break; - bio_index++; - bvec++; } + return ret; } @@ -967,11 +961,9 @@ static int btree_migratepage(struct address_space *mapping, static int btree_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct extent_io_tree *tree; struct btrfs_fs_info *fs_info; int ret; - tree = &BTRFS_I(mapping->host)->io_tree; if (wbc->sync_mode == WB_SYNC_NONE) { if (wbc->for_kupdate) @@ -1010,8 +1002,9 @@ static void btree_invalidatepage(struct page *page, unsigned int offset, extent_invalidatepage(tree, page, offset); btree_releasepage(page, GFP_NOFS); if (PagePrivate(page)) { - printk(KERN_WARNING "btrfs warning page private not zero " - "on page %llu\n", (unsigned long long)page_offset(page)); + btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info, + "page private not zero on page %llu", + (unsigned long long)page_offset(page)); ClearPagePrivate(page); set_page_private(page, 0); page_cache_release(page); @@ -1095,21 +1088,13 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) { - struct inode *btree_inode = root->fs_info->btree_inode; - struct extent_buffer *eb; - eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree, bytenr); - return eb; + return find_extent_buffer(root->fs_info, bytenr); } struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) { - struct inode *btree_inode = root->fs_info->btree_inode; - struct extent_buffer *eb; - - eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree, - bytenr, blocksize); - return eb; + return alloc_extent_buffer(root->fs_info, bytenr, blocksize); } @@ -1273,7 +1258,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root; struct btrfs_key key; int ret = 0; - u64 bytenr; uuid_le uuid; root = btrfs_alloc_root(fs_info); @@ -1295,7 +1279,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, goto fail; } - bytenr = leaf->start; memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_bytenr(leaf, leaf->start); btrfs_set_header_generation(leaf, trans->transid); @@ -1616,7 +1599,8 @@ again: if (ret) goto fail; - ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid); + ret = btrfs_find_item(fs_info->tree_root, NULL, BTRFS_ORPHAN_OBJECTID, + location->objectid, BTRFS_ORPHAN_ITEM_KEY, NULL); if (ret < 0) goto fail; if (ret == 0) @@ -1684,18 +1668,16 @@ static void end_workqueue_fn(struct btrfs_work *work) { struct bio *bio; struct end_io_wq *end_io_wq; - struct btrfs_fs_info *fs_info; int error; end_io_wq = container_of(work, struct end_io_wq, work); bio = end_io_wq->bio; - fs_info = end_io_wq->info; error = end_io_wq->error; bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; kfree(end_io_wq); - bio_endio(bio, error); + bio_endio_nodec(bio, error); } static int cleaner_kthread(void *arg) @@ -2080,6 +2062,12 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info) for (i = 0; i < ret; i++) btrfs_drop_and_free_fs_root(fs_info, gang[i]); } + + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + btrfs_free_log_root_tree(NULL, fs_info); + btrfs_destroy_pinned_extent(fs_info->tree_root, + fs_info->pinned_extents); + } } int open_ctree(struct super_block *sb, @@ -2154,6 +2142,7 @@ int open_ctree(struct super_block *sb, mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); + INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); @@ -2167,6 +2156,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->free_chunk_lock); spin_lock_init(&fs_info->tree_mod_seq_lock); spin_lock_init(&fs_info->super_lock); + spin_lock_init(&fs_info->buffer_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->reloc_mutex); seqlock_init(&fs_info->profiles_lock); @@ -2198,7 +2188,7 @@ int open_ctree(struct super_block *sb, fs_info->free_chunk_space = 0; fs_info->tree_mod_log = RB_ROOT; fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; - + fs_info->avg_delayed_ref_runtime = div64_u64(NSEC_PER_SEC, 64); /* readahead state */ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); spin_lock_init(&fs_info->reada_lock); @@ -2337,7 +2327,7 @@ int open_ctree(struct super_block *sb, * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). */ if (btrfs_check_super_csum(bh->b_data)) { - printk(KERN_ERR "btrfs: superblock checksum mismatch\n"); + printk(KERN_ERR "BTRFS: superblock checksum mismatch\n"); err = -EINVAL; goto fail_alloc; } @@ -2356,7 +2346,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); if (ret) { - printk(KERN_ERR "btrfs: superblock contains fatal errors\n"); + printk(KERN_ERR "BTRFS: superblock contains fatal errors\n"); err = -EINVAL; goto fail_alloc; } @@ -2421,7 +2411,7 @@ int open_ctree(struct super_block *sb, features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) - printk(KERN_ERR "btrfs: has skinny extents\n"); + printk(KERN_ERR "BTRFS: has skinny extents\n"); /* * flag our filesystem as having big metadata blocks if @@ -2429,7 +2419,7 @@ int open_ctree(struct super_block *sb, */ if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) { if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) - printk(KERN_INFO "btrfs flagging fs with big metadata feature\n"); + printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n"); features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; } @@ -2446,7 +2436,7 @@ int open_ctree(struct super_block *sb, */ if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && (sectorsize != leafsize)) { - printk(KERN_WARNING "btrfs: unequal leaf/node/sector sizes " + printk(KERN_WARNING "BTRFS: unequal leaf/node/sector sizes " "are not allowed for mixed block groups on %s\n", sb->s_id); goto fail_alloc; @@ -2583,12 +2573,12 @@ int open_ctree(struct super_block *sb, sb->s_blocksize_bits = blksize_bits(sectorsize); if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) { - printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id); + printk(KERN_INFO "BTRFS: valid FS not found on %s\n", sb->s_id); goto fail_sb_buffer; } if (sectorsize != PAGE_SIZE) { - printk(KERN_WARNING "btrfs: Incompatible sector size(%lu) " + printk(KERN_WARNING "BTRFS: Incompatible sector size(%lu) " "found on %s\n", (unsigned long)sectorsize, sb->s_id); goto fail_sb_buffer; } @@ -2597,7 +2587,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_read_sys_array(tree_root); mutex_unlock(&fs_info->chunk_mutex); if (ret) { - printk(KERN_WARNING "btrfs: failed to read the system " + printk(KERN_WARNING "BTRFS: failed to read the system " "array on %s\n", sb->s_id); goto fail_sb_buffer; } @@ -2614,7 +2604,7 @@ int open_ctree(struct super_block *sb, blocksize, generation); if (!chunk_root->node || !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { - printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", + printk(KERN_WARNING "BTRFS: failed to read chunk root on %s\n", sb->s_id); goto fail_tree_roots; } @@ -2626,7 +2616,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_read_chunk_tree(chunk_root); if (ret) { - printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", + printk(KERN_WARNING "BTRFS: failed to read chunk tree on %s\n", sb->s_id); goto fail_tree_roots; } @@ -2638,7 +2628,7 @@ int open_ctree(struct super_block *sb, btrfs_close_extra_devices(fs_info, fs_devices, 0); if (!fs_devices->latest_bdev) { - printk(KERN_CRIT "btrfs: failed to read devices on %s\n", + printk(KERN_CRIT "BTRFS: failed to read devices on %s\n", sb->s_id); goto fail_tree_roots; } @@ -2653,7 +2643,7 @@ retry_root_backup: blocksize, generation); if (!tree_root->node || !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { - printk(KERN_WARNING "btrfs: failed to read tree root on %s\n", + printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n", sb->s_id); goto recovery_tree_root; @@ -2724,50 +2714,56 @@ retry_root_backup: ret = btrfs_recover_balance(fs_info); if (ret) { - printk(KERN_WARNING "btrfs: failed to recover balance\n"); + printk(KERN_WARNING "BTRFS: failed to recover balance\n"); goto fail_block_groups; } ret = btrfs_init_dev_stats(fs_info); if (ret) { - printk(KERN_ERR "btrfs: failed to init dev_stats: %d\n", + printk(KERN_ERR "BTRFS: failed to init dev_stats: %d\n", ret); goto fail_block_groups; } ret = btrfs_init_dev_replace(fs_info); if (ret) { - pr_err("btrfs: failed to init dev_replace: %d\n", ret); + pr_err("BTRFS: failed to init dev_replace: %d\n", ret); goto fail_block_groups; } btrfs_close_extra_devices(fs_info, fs_devices, 1); - ret = btrfs_init_space_info(fs_info); + ret = btrfs_sysfs_add_one(fs_info); if (ret) { - printk(KERN_ERR "Failed to initial space info: %d\n", ret); + pr_err("BTRFS: failed to init sysfs interface: %d\n", ret); goto fail_block_groups; } + ret = btrfs_init_space_info(fs_info); + if (ret) { + printk(KERN_ERR "BTRFS: Failed to initial space info: %d\n", ret); + goto fail_sysfs; + } + ret = btrfs_read_block_groups(extent_root); if (ret) { - printk(KERN_ERR "Failed to read block groups: %d\n", ret); - goto fail_block_groups; + printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret); + goto fail_sysfs; } fs_info->num_tolerated_disk_barrier_failures = btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); if (fs_info->fs_devices->missing_devices > fs_info->num_tolerated_disk_barrier_failures && !(sb->s_flags & MS_RDONLY)) { - printk(KERN_WARNING - "Btrfs: too many missing devices, writeable mount is not allowed\n"); - goto fail_block_groups; + printk(KERN_WARNING "BTRFS: " + "too many missing devices, writeable mount is not allowed\n"); + goto fail_sysfs; } fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, "btrfs-cleaner"); if (IS_ERR(fs_info->cleaner_kthread)) - goto fail_block_groups; + goto fail_sysfs; fs_info->transaction_kthread = kthread_run(transaction_kthread, tree_root, @@ -2778,11 +2774,15 @@ retry_root_backup: if (!btrfs_test_opt(tree_root, SSD) && !btrfs_test_opt(tree_root, NOSSD) && !fs_info->fs_devices->rotating) { - printk(KERN_INFO "Btrfs detected SSD devices, enabling SSD " + printk(KERN_INFO "BTRFS: detected SSD devices, enabling SSD " "mode\n"); btrfs_set_opt(fs_info->mount_opt, SSD); } + /* Set the real inode map cache flag */ + if (btrfs_test_opt(tree_root, CHANGE_INODE_CACHE)) + btrfs_set_opt(tree_root->fs_info->mount_opt, INODE_MAP_CACHE); + #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) { ret = btrfsic_mount(tree_root, fs_devices, @@ -2791,7 +2791,7 @@ retry_root_backup: 1 : 0, fs_info->check_integrity_print_mask); if (ret) - printk(KERN_WARNING "btrfs: failed to initialize" + printk(KERN_WARNING "BTRFS: failed to initialize" " integrity check module %s\n", sb->s_id); } #endif @@ -2804,7 +2804,7 @@ retry_root_backup: u64 bytenr = btrfs_super_log_root(disk_super); if (fs_devices->rw_devices == 0) { - printk(KERN_WARNING "Btrfs log replay required " + printk(KERN_WARNING "BTRFS: log replay required " "on RO media\n"); err = -EIO; goto fail_qgroup; @@ -2827,7 +2827,7 @@ retry_root_backup: generation + 1); if (!log_tree_root->node || !extent_buffer_uptodate(log_tree_root->node)) { - printk(KERN_ERR "btrfs: failed to read log tree\n"); + printk(KERN_ERR "BTRFS: failed to read log tree\n"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); goto fail_trans_kthread; @@ -2861,7 +2861,7 @@ retry_root_backup: ret = btrfs_recover_relocation(tree_root); if (ret < 0) { printk(KERN_WARNING - "btrfs: failed to recover relocation\n"); + "BTRFS: failed to recover relocation\n"); err = -EINVAL; goto fail_qgroup; } @@ -2891,14 +2891,14 @@ retry_root_backup: ret = btrfs_resume_balance_async(fs_info); if (ret) { - printk(KERN_WARNING "btrfs: failed to resume balance\n"); + printk(KERN_WARNING "BTRFS: failed to resume balance\n"); close_ctree(tree_root); return ret; } ret = btrfs_resume_dev_replace_async(fs_info); if (ret) { - pr_warn("btrfs: failed to resume dev_replace\n"); + pr_warn("BTRFS: failed to resume dev_replace\n"); close_ctree(tree_root); return ret; } @@ -2906,20 +2906,20 @@ retry_root_backup: btrfs_qgroup_rescan_resume(fs_info); if (create_uuid_tree) { - pr_info("btrfs: creating UUID tree\n"); + pr_info("BTRFS: creating UUID tree\n"); ret = btrfs_create_uuid_tree(fs_info); if (ret) { - pr_warn("btrfs: failed to create the UUID tree %d\n", + pr_warn("BTRFS: failed to create the UUID tree %d\n", ret); close_ctree(tree_root); return ret; } } else if (check_uuid_tree || btrfs_test_opt(tree_root, RESCAN_UUID_TREE)) { - pr_info("btrfs: checking UUID tree\n"); + pr_info("BTRFS: checking UUID tree\n"); ret = btrfs_check_uuid_tree(fs_info); if (ret) { - pr_warn("btrfs: failed to check the UUID tree %d\n", + pr_warn("BTRFS: failed to check the UUID tree %d\n", ret); close_ctree(tree_root); return ret; @@ -2945,6 +2945,9 @@ fail_cleaner: */ filemap_write_and_wait(fs_info->btree_inode->i_mapping); +fail_sysfs: + btrfs_sysfs_remove_one(fs_info); + fail_block_groups: btrfs_put_block_group_cache(fs_info); btrfs_free_block_groups(fs_info); @@ -3000,7 +3003,7 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) struct btrfs_device *device = (struct btrfs_device *) bh->b_private; - printk_ratelimited_in_rcu(KERN_WARNING "lost page write due to " + printk_ratelimited_in_rcu(KERN_WARNING "BTRFS: lost page write due to " "I/O error on %s\n", rcu_str_deref(device->name)); /* note, we dont' set_buffer_write_io_error because we have @@ -3119,7 +3122,7 @@ static int write_dev_supers(struct btrfs_device *device, bh = __getblk(device->bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE); if (!bh) { - printk(KERN_ERR "btrfs: couldn't get super " + printk(KERN_ERR "BTRFS: couldn't get super " "buffer head for bytenr %Lu\n", bytenr); errors++; continue; @@ -3140,7 +3143,10 @@ static int write_dev_supers(struct btrfs_device *device, * we fua the first super. The others we allow * to go down lazy. */ - ret = btrfsic_submit_bh(WRITE_FUA, bh); + if (i == 0) + ret = btrfsic_submit_bh(WRITE_FUA, bh); + else + ret = btrfsic_submit_bh(WRITE_SYNC, bh); if (ret) errors++; } @@ -3186,7 +3192,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) wait_for_completion(&device->flush_wait); if (bio_flagged(bio, BIO_EOPNOTSUPP)) { - printk_in_rcu("btrfs: disabling barriers on dev %s\n", + printk_in_rcu("BTRFS: disabling barriers on dev %s\n", rcu_str_deref(device->name)); device->nobarriers = 1; } else if (!bio_flagged(bio, BIO_UPTODATE)) { @@ -3407,7 +3413,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) total_errors++; } if (total_errors > max_errors) { - printk(KERN_ERR "btrfs: %d errors while writing supers\n", + btrfs_err(root->fs_info, "%d errors while writing supers", total_errors); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); @@ -3455,10 +3461,8 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, if (btrfs_root_refs(&root->root_item) == 0) synchronize_srcu(&fs_info->subvol_srcu); - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) btrfs_free_log(NULL, root); - btrfs_free_log_root_tree(NULL, fs_info); - } __btrfs_remove_free_space_cache(root->free_ino_pinned); __btrfs_remove_free_space_cache(root->free_ino_ctl); @@ -3563,14 +3567,12 @@ int close_ctree(struct btrfs_root *root) if (!(fs_info->sb->s_flags & MS_RDONLY)) { ret = btrfs_commit_super(root); if (ret) - printk(KERN_ERR "btrfs: commit super ret %d\n", ret); + btrfs_err(root->fs_info, "commit super ret %d", ret); } if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) btrfs_error_commit_super(root); - btrfs_put_block_group_cache(fs_info); - kthread_stop(fs_info->transaction_kthread); kthread_stop(fs_info->cleaner_kthread); @@ -3580,12 +3582,16 @@ int close_ctree(struct btrfs_root *root) btrfs_free_qgroup_config(root->fs_info); if (percpu_counter_sum(&fs_info->delalloc_bytes)) { - printk(KERN_INFO "btrfs: at unmount delalloc count %lld\n", + btrfs_info(root->fs_info, "at unmount delalloc count %lld", percpu_counter_sum(&fs_info->delalloc_bytes)); } + btrfs_sysfs_remove_one(fs_info); + del_fs_roots(fs_info); + btrfs_put_block_group_cache(fs_info); + btrfs_free_block_groups(fs_info); btrfs_stop_all_workers(fs_info); @@ -3803,55 +3809,55 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, delayed_refs = &trans->delayed_refs; spin_lock(&delayed_refs->lock); - if (delayed_refs->num_entries == 0) { + if (atomic_read(&delayed_refs->num_entries) == 0) { spin_unlock(&delayed_refs->lock); - printk(KERN_INFO "delayed_refs has NO entry\n"); + btrfs_info(root->fs_info, "delayed_refs has NO entry"); return ret; } - while ((node = rb_first(&delayed_refs->root)) != NULL) { - struct btrfs_delayed_ref_head *head = NULL; + while ((node = rb_first(&delayed_refs->href_root)) != NULL) { + struct btrfs_delayed_ref_head *head; bool pin_bytes = false; - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - atomic_set(&ref->refs, 1); - if (btrfs_delayed_ref_is_head(ref)) { - - head = btrfs_delayed_node_to_head(ref); - if (!mutex_trylock(&head->mutex)) { - atomic_inc(&ref->refs); - spin_unlock(&delayed_refs->lock); - - /* Need to wait for the delayed ref to run */ - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(ref); - - spin_lock(&delayed_refs->lock); - continue; - } + head = rb_entry(node, struct btrfs_delayed_ref_head, + href_node); + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); - if (head->must_insert_reserved) - pin_bytes = true; - btrfs_free_delayed_extent_op(head->extent_op); - delayed_refs->num_heads--; - if (list_empty(&head->cluster)) - delayed_refs->num_heads_ready--; - list_del_init(&head->cluster); - } - - ref->in_tree = 0; - rb_erase(&ref->rb_node, &delayed_refs->root); - delayed_refs->num_entries--; - spin_unlock(&delayed_refs->lock); - if (head) { - if (pin_bytes) - btrfs_pin_extent(root, ref->bytenr, - ref->num_bytes, 1); + mutex_lock(&head->mutex); mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + spin_lock(&delayed_refs->lock); + continue; + } + spin_lock(&head->lock); + while ((node = rb_first(&head->ref_root)) != NULL) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, + rb_node); + ref->in_tree = 0; + rb_erase(&ref->rb_node, &head->ref_root); + atomic_dec(&delayed_refs->num_entries); + btrfs_put_delayed_ref(ref); + cond_resched_lock(&head->lock); } - btrfs_put_delayed_ref(ref); + if (head->must_insert_reserved) + pin_bytes = true; + btrfs_free_delayed_extent_op(head->extent_op); + delayed_refs->num_heads--; + if (head->processing == 0) + delayed_refs->num_heads_ready--; + atomic_dec(&delayed_refs->num_entries); + head->node.in_tree = 0; + rb_erase(&head->href_node, &delayed_refs->href_root); + spin_unlock(&head->lock); + spin_unlock(&delayed_refs->lock); + mutex_unlock(&head->mutex); + if (pin_bytes) + btrfs_pin_extent(root, head->node.bytenr, + head->node.num_bytes, 1); + btrfs_put_delayed_ref(&head->node); cond_resched(); spin_lock(&delayed_refs->lock); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 45d98d0..9c9ecc9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -35,6 +35,7 @@ #include "locking.h" #include "free-space-cache.h" #include "math.h" +#include "sysfs.h" #undef SCRAMBLE_DELAYED_REFS @@ -441,7 +442,8 @@ next: if (ret) break; - if (need_resched()) { + if (need_resched() || + rwsem_is_contended(&fs_info->extent_commit_sem)) { caching_ctl->progress = last; btrfs_release_path(path); up_read(&fs_info->extent_commit_sem); @@ -767,20 +769,19 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - if (metadata) { - key.objectid = bytenr; - key.type = BTRFS_METADATA_ITEM_KEY; - key.offset = offset; - } else { - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = offset; - } - if (!trans) { path->skip_locking = 1; path->search_commit_root = 1; } + +search_again: + key.objectid = bytenr; + key.offset = offset; + if (metadata) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; + again: ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, 0, 0); @@ -788,7 +789,6 @@ again: goto out_free; if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { - metadata = 0; if (path->slots[0]) { path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &key, @@ -855,14 +855,16 @@ again: mutex_lock(&head->mutex); mutex_unlock(&head->mutex); btrfs_put_delayed_ref(&head->node); - goto again; + goto search_again; } + spin_lock(&head->lock); if (head->extent_op && head->extent_op->update_flags) extent_flags |= head->extent_op->flags_to_set; else BUG_ON(num_refs == 0); num_refs += head->node.ref_mod; + spin_unlock(&head->lock); mutex_unlock(&head->mutex); } spin_unlock(&delayed_refs->lock); @@ -1072,11 +1074,11 @@ static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) __le64 lenum; lenum = cpu_to_le64(root_objectid); - high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); + high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(owner); - low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(offset); - low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); return ((u64)high_crc << 31) ^ (u64)low_crc; } @@ -2287,64 +2289,62 @@ static noinline struct btrfs_delayed_ref_node * select_delayed_ref(struct btrfs_delayed_ref_head *head) { struct rb_node *node; - struct btrfs_delayed_ref_node *ref; - int action = BTRFS_ADD_DELAYED_REF; -again: + struct btrfs_delayed_ref_node *ref, *last = NULL;; + /* * select delayed ref of type BTRFS_ADD_DELAYED_REF first. * this prevents ref count from going down to zero when * there still are pending delayed ref. */ - node = rb_prev(&head->node.rb_node); - while (1) { - if (!node) - break; + node = rb_first(&head->ref_root); + while (node) { ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (ref->bytenr != head->node.bytenr) - break; - if (ref->action == action) + if (ref->action == BTRFS_ADD_DELAYED_REF) return ref; - node = rb_prev(node); + else if (last == NULL) + last = ref; + node = rb_next(node); } - if (action == BTRFS_ADD_DELAYED_REF) { - action = BTRFS_DROP_DELAYED_REF; - goto again; - } - return NULL; + return last; } /* * Returns 0 on success or if called with an already aborted transaction. * Returns -ENOMEM or -EIO on failure and will abort the transaction. */ -static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct list_head *cluster) +static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + unsigned long nr) { struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_head *locked_ref = NULL; struct btrfs_delayed_extent_op *extent_op; struct btrfs_fs_info *fs_info = root->fs_info; + ktime_t start = ktime_get(); int ret; - int count = 0; + unsigned long count = 0; + unsigned long actual_count = 0; int must_insert_reserved = 0; delayed_refs = &trans->transaction->delayed_refs; while (1) { if (!locked_ref) { - /* pick a new head ref from the cluster list */ - if (list_empty(cluster)) + if (count >= nr) break; - locked_ref = list_entry(cluster->next, - struct btrfs_delayed_ref_head, cluster); + spin_lock(&delayed_refs->lock); + locked_ref = btrfs_select_ref_head(trans); + if (!locked_ref) { + spin_unlock(&delayed_refs->lock); + break; + } /* grab the lock that says we are going to process * all the refs for this head */ ret = btrfs_delayed_ref_lock(trans, locked_ref); - + spin_unlock(&delayed_refs->lock); /* * we may have dropped the spin lock to get the head * mutex lock, and that might have given someone else @@ -2365,6 +2365,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, * finish. If we merged anything we need to re-loop so we can * get a good ref. */ + spin_lock(&locked_ref->lock); btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, locked_ref); @@ -2376,17 +2377,14 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, if (ref && ref->seq && btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { - /* - * there are still refs with lower seq numbers in the - * process of being added. Don't run this ref yet. - */ - list_del_init(&locked_ref->cluster); + spin_unlock(&locked_ref->lock); btrfs_delayed_ref_unlock(locked_ref); - locked_ref = NULL; + spin_lock(&delayed_refs->lock); + locked_ref->processing = 0; delayed_refs->num_heads_ready++; spin_unlock(&delayed_refs->lock); + locked_ref = NULL; cond_resched(); - spin_lock(&delayed_refs->lock); continue; } @@ -2401,6 +2399,8 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, locked_ref->extent_op = NULL; if (!ref) { + + /* All delayed refs have been processed, Go ahead * and send the head node to run_one_delayed_ref, * so that any accounting fixes can happen @@ -2413,8 +2413,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, } if (extent_op) { - spin_unlock(&delayed_refs->lock); - + spin_unlock(&locked_ref->lock); ret = run_delayed_extent_op(trans, root, ref, extent_op); btrfs_free_delayed_extent_op(extent_op); @@ -2428,19 +2427,39 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, */ if (must_insert_reserved) locked_ref->must_insert_reserved = 1; + locked_ref->processing = 0; btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); - spin_lock(&delayed_refs->lock); btrfs_delayed_ref_unlock(locked_ref); return ret; } + continue; + } - goto next; + /* + * Need to drop our head ref lock and re-aqcuire the + * delayed ref lock and then re-check to make sure + * nobody got added. + */ + spin_unlock(&locked_ref->lock); + spin_lock(&delayed_refs->lock); + spin_lock(&locked_ref->lock); + if (rb_first(&locked_ref->ref_root)) { + spin_unlock(&locked_ref->lock); + spin_unlock(&delayed_refs->lock); + continue; } + ref->in_tree = 0; + delayed_refs->num_heads--; + rb_erase(&locked_ref->href_node, + &delayed_refs->href_root); + spin_unlock(&delayed_refs->lock); + } else { + actual_count++; + ref->in_tree = 0; + rb_erase(&ref->rb_node, &locked_ref->ref_root); } + atomic_dec(&delayed_refs->num_entries); - ref->in_tree = 0; - rb_erase(&ref->rb_node, &delayed_refs->root); - delayed_refs->num_entries--; if (!btrfs_delayed_ref_is_head(ref)) { /* * when we play the delayed ref, also correct the @@ -2457,20 +2476,18 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, default: WARN_ON(1); } - } else { - list_del_init(&locked_ref->cluster); } - spin_unlock(&delayed_refs->lock); + spin_unlock(&locked_ref->lock); ret = run_one_delayed_ref(trans, root, ref, extent_op, must_insert_reserved); btrfs_free_delayed_extent_op(extent_op); if (ret) { + locked_ref->processing = 0; btrfs_delayed_ref_unlock(locked_ref); btrfs_put_delayed_ref(ref); btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret); - spin_lock(&delayed_refs->lock); return ret; } @@ -2486,11 +2503,29 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, } btrfs_put_delayed_ref(ref); count++; -next: cond_resched(); + } + + /* + * We don't want to include ref heads since we can have empty ref heads + * and those will drastically skew our runtime down since we just do + * accounting, no actual extent tree updates. + */ + if (actual_count > 0) { + u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); + u64 avg; + + /* + * We weigh the current average higher than our current runtime + * to avoid large swings in the average. + */ spin_lock(&delayed_refs->lock); + avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; + avg = div64_u64(avg, 4); + fs_info->avg_delayed_ref_runtime = avg; + spin_unlock(&delayed_refs->lock); } - return count; + return 0; } #ifdef SCRAMBLE_DELAYED_REFS @@ -2572,16 +2607,6 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, return ret; } -static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq, - int count) -{ - int val = atomic_read(&delayed_refs->ref_seq); - - if (val < seq || val >= seq + count) - return 1; - return 0; -} - static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) { u64 num_bytes; @@ -2598,7 +2623,7 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); } -int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, +int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_block_rsv *global_rsv; @@ -2627,6 +2652,22 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, return ret; } +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + u64 num_entries = + atomic_read(&trans->transaction->delayed_refs.num_entries); + u64 avg_runtime; + + smp_mb(); + avg_runtime = fs_info->avg_delayed_ref_runtime; + if (num_entries * avg_runtime >= NSEC_PER_SEC) + return 1; + + return btrfs_check_space_for_delayed_refs(trans, root); +} + /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. count can be @@ -2642,13 +2683,10 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, { struct rb_node *node; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_node *ref; - struct list_head cluster; + struct btrfs_delayed_ref_head *head; int ret; - u64 delayed_start; int run_all = count == (unsigned long)-1; int run_most = 0; - int loops; /* We'll clean this up in btrfs_cleanup_transaction */ if (trans->aborted) @@ -2660,130 +2698,40 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); delayed_refs = &trans->transaction->delayed_refs; - INIT_LIST_HEAD(&cluster); if (count == 0) { - count = delayed_refs->num_entries * 2; + count = atomic_read(&delayed_refs->num_entries) * 2; run_most = 1; } - if (!run_all && !run_most) { - int old; - int seq = atomic_read(&delayed_refs->ref_seq); - -progress: - old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); - if (old) { - DEFINE_WAIT(__wait); - if (delayed_refs->flushing || - !btrfs_should_throttle_delayed_refs(trans, root)) - return 0; - - prepare_to_wait(&delayed_refs->wait, &__wait, - TASK_UNINTERRUPTIBLE); - - old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); - if (old) { - schedule(); - finish_wait(&delayed_refs->wait, &__wait); - - if (!refs_newer(delayed_refs, seq, 256)) - goto progress; - else - return 0; - } else { - finish_wait(&delayed_refs->wait, &__wait); - goto again; - } - } - - } else { - atomic_inc(&delayed_refs->procs_running_refs); - } - again: - loops = 0; - spin_lock(&delayed_refs->lock); - #ifdef SCRAMBLE_DELAYED_REFS delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); #endif - - while (1) { - if (!(run_all || run_most) && - !btrfs_should_throttle_delayed_refs(trans, root)) - break; - - /* - * go find something we can process in the rbtree. We start at - * the beginning of the tree, and then build a cluster - * of refs to process starting at the first one we are able to - * lock - */ - delayed_start = delayed_refs->run_delayed_start; - ret = btrfs_find_ref_cluster(trans, &cluster, - delayed_refs->run_delayed_start); - if (ret) - break; - - ret = run_clustered_refs(trans, root, &cluster); - if (ret < 0) { - btrfs_release_ref_cluster(&cluster); - spin_unlock(&delayed_refs->lock); - btrfs_abort_transaction(trans, root, ret); - atomic_dec(&delayed_refs->procs_running_refs); - wake_up(&delayed_refs->wait); - return ret; - } - - atomic_add(ret, &delayed_refs->ref_seq); - - count -= min_t(unsigned long, ret, count); - - if (count == 0) - break; - - if (delayed_start >= delayed_refs->run_delayed_start) { - if (loops == 0) { - /* - * btrfs_find_ref_cluster looped. let's do one - * more cycle. if we don't run any delayed ref - * during that cycle (because we can't because - * all of them are blocked), bail out. - */ - loops = 1; - } else { - /* - * no runnable refs left, stop trying - */ - BUG_ON(run_all); - break; - } - } - if (ret) { - /* refs were run, let's reset staleness detection */ - loops = 0; - } + ret = __btrfs_run_delayed_refs(trans, root, count); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + return ret; } if (run_all) { - if (!list_empty(&trans->new_bgs)) { - spin_unlock(&delayed_refs->lock); + if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); - spin_lock(&delayed_refs->lock); - } - node = rb_first(&delayed_refs->root); - if (!node) + spin_lock(&delayed_refs->lock); + node = rb_first(&delayed_refs->href_root); + if (!node) { + spin_unlock(&delayed_refs->lock); goto out; + } count = (unsigned long)-1; while (node) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); - if (btrfs_delayed_ref_is_head(ref)) { - struct btrfs_delayed_ref_head *head; + head = rb_entry(node, struct btrfs_delayed_ref_head, + href_node); + if (btrfs_delayed_ref_is_head(&head->node)) { + struct btrfs_delayed_ref_node *ref; - head = btrfs_delayed_node_to_head(ref); + ref = &head->node; atomic_inc(&ref->refs); spin_unlock(&delayed_refs->lock); @@ -2797,20 +2745,16 @@ again: btrfs_put_delayed_ref(ref); cond_resched(); goto again; + } else { + WARN_ON(1); } node = rb_next(node); } spin_unlock(&delayed_refs->lock); - schedule_timeout(1); + cond_resched(); goto again; } out: - atomic_dec(&delayed_refs->procs_running_refs); - smp_mb(); - if (waitqueue_active(&delayed_refs->wait)) - wake_up(&delayed_refs->wait); - - spin_unlock(&delayed_refs->lock); assert_qgroups_uptodate(trans); return 0; } @@ -2852,12 +2796,13 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, struct rb_node *node; int ret = 0; - ret = -ENOENT; delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); head = btrfs_find_delayed_ref_head(trans, bytenr); - if (!head) - goto out; + if (!head) { + spin_unlock(&delayed_refs->lock); + return 0; + } if (!mutex_trylock(&head->mutex)) { atomic_inc(&head->node.refs); @@ -2874,40 +2819,35 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, btrfs_put_delayed_ref(&head->node); return -EAGAIN; } + spin_unlock(&delayed_refs->lock); - node = rb_prev(&head->node.rb_node); - if (!node) - goto out_unlock; - - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - - if (ref->bytenr != bytenr) - goto out_unlock; - - ret = 1; - if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) - goto out_unlock; + spin_lock(&head->lock); + node = rb_first(&head->ref_root); + while (node) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + node = rb_next(node); - data_ref = btrfs_delayed_node_to_data_ref(ref); + /* If it's a shared ref we know a cross reference exists */ + if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { + ret = 1; + break; + } - node = rb_prev(node); - if (node) { - int seq = ref->seq; + data_ref = btrfs_delayed_node_to_data_ref(ref); - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (ref->bytenr == bytenr && ref->seq == seq) - goto out_unlock; + /* + * If our ref doesn't match the one we're currently looking at + * then we have a cross reference. + */ + if (data_ref->root != root->root_key.objectid || + data_ref->objectid != objectid || + data_ref->offset != offset) { + ret = 1; + break; + } } - - if (data_ref->root != root->root_key.objectid || - data_ref->objectid != objectid || data_ref->offset != offset) - goto out_unlock; - - ret = 0; -out_unlock: + spin_unlock(&head->lock); mutex_unlock(&head->mutex); -out: - spin_unlock(&delayed_refs->lock); return ret; } @@ -3404,6 +3344,23 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) return readonly; } +static const char *alloc_name(u64 flags) +{ + switch (flags) { + case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: + return "mixed"; + case BTRFS_BLOCK_GROUP_METADATA: + return "metadata"; + case BTRFS_BLOCK_GROUP_DATA: + return "data"; + case BTRFS_BLOCK_GROUP_SYSTEM: + return "system"; + default: + WARN_ON(1); + return "invalid-combination"; + }; +} + static int update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 bytes_used, struct btrfs_space_info **space_info) @@ -3441,8 +3398,10 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, return ret; } - for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { INIT_LIST_HEAD(&found->block_groups[i]); + kobject_init(&found->block_group_kobjs[i], &btrfs_raid_ktype); + } init_rwsem(&found->groups_sem); spin_lock_init(&found->lock); found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; @@ -3459,11 +3418,21 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->chunk_alloc = 0; found->flush = 0; init_waitqueue_head(&found->wait); + + ret = kobject_init_and_add(&found->kobj, &space_info_ktype, + info->space_info_kobj, "%s", + alloc_name(found->flags)); + if (ret) { + kfree(found); + return ret; + } + *space_info = found; list_add_rcu(&found->list, &info->space_info); if (flags & BTRFS_BLOCK_GROUP_DATA) info->data_sinfo = found; - return 0; + + return ret; } static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) @@ -4639,7 +4608,7 @@ void btrfs_block_rsv_release(struct btrfs_root *root, u64 num_bytes) { struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; - if (global_rsv->full || global_rsv == block_rsv || + if (global_rsv == block_rsv || block_rsv->space_info != global_rsv->space_info) global_rsv = NULL; block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv, @@ -5918,24 +5887,16 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, { struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_node *ref; - struct rb_node *node; int ret = 0; delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); head = btrfs_find_delayed_ref_head(trans, bytenr); if (!head) - goto out; + goto out_delayed_unlock; - node = rb_prev(&head->node.rb_node); - if (!node) - goto out; - - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - - /* there are still entries for this ref, we can't drop it */ - if (ref->bytenr == bytenr) + spin_lock(&head->lock); + if (rb_first(&head->ref_root)) goto out; if (head->extent_op) { @@ -5957,19 +5918,19 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, * ahead and process it. */ head->node.in_tree = 0; - rb_erase(&head->node.rb_node, &delayed_refs->root); + rb_erase(&head->href_node, &delayed_refs->href_root); - delayed_refs->num_entries--; + atomic_dec(&delayed_refs->num_entries); /* * we don't take a ref on the node because we're removing it from the * tree, so we just steal the ref the tree was holding. */ delayed_refs->num_heads--; - if (list_empty(&head->cluster)) + if (head->processing == 0) delayed_refs->num_heads_ready--; - - list_del_init(&head->cluster); + head->processing = 0; + spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); BUG_ON(head->extent_op); @@ -5980,6 +5941,9 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, btrfs_put_delayed_ref(&head->node); return ret; out: + spin_unlock(&head->lock); + +out_delayed_unlock: spin_unlock(&delayed_refs->lock); return 0; } @@ -6147,11 +6111,29 @@ int __get_raid_index(u64 flags) return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ } -static int get_block_group_index(struct btrfs_block_group_cache *cache) +int get_block_group_index(struct btrfs_block_group_cache *cache) { return __get_raid_index(cache->flags); } +static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = { + [BTRFS_RAID_RAID10] = "raid10", + [BTRFS_RAID_RAID1] = "raid1", + [BTRFS_RAID_DUP] = "dup", + [BTRFS_RAID_RAID0] = "raid0", + [BTRFS_RAID_SINGLE] = "single", + [BTRFS_RAID_RAID5] = "raid5", + [BTRFS_RAID_RAID6] = "raid6", +}; + +static const char *get_raid_name(enum btrfs_raid_types type) +{ + if (type >= BTRFS_NR_RAID_TYPES) + return NULL; + + return btrfs_raid_type_names[type]; +} + enum btrfs_loop_type { LOOP_CACHING_NOWAIT = 0, LOOP_CACHING_WAIT = 1, @@ -6179,7 +6161,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, struct btrfs_root *root = orig_root->fs_info->extent_root; struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group_cache *block_group = NULL; - struct btrfs_block_group_cache *used_block_group; u64 search_start = 0; u64 max_extent_size = 0; int empty_cluster = 2 * 1024 * 1024; @@ -6188,7 +6169,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, int index = __get_raid_index(flags); int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ? RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; - bool found_uncached_bg = false; bool failed_cluster_refill = false; bool failed_alloc = false; bool use_cluster = true; @@ -6241,7 +6221,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, if (search_start == hint_byte) { block_group = btrfs_lookup_block_group(root->fs_info, search_start); - used_block_group = block_group; /* * we don't want to use the block group if it doesn't match our * allocation bits, or if its not cached. @@ -6278,7 +6257,6 @@ search: u64 offset; int cached; - used_block_group = block_group; btrfs_get_block_group(block_group); search_start = block_group->key.objectid; @@ -6306,7 +6284,6 @@ search: have_block_group: cached = block_group_cache_done(block_group); if (unlikely(!cached)) { - found_uncached_bg = true; ret = cache_block_group(block_group, 0); BUG_ON(ret < 0); ret = 0; @@ -6322,6 +6299,7 @@ have_block_group: * lets look there */ if (last_ptr) { + struct btrfs_block_group_cache *used_block_group; unsigned long aligned_cluster; /* * the refill lock keeps out other @@ -6332,10 +6310,8 @@ have_block_group: if (used_block_group != block_group && (!used_block_group || used_block_group->ro || - !block_group_bits(used_block_group, flags))) { - used_block_group = block_group; + !block_group_bits(used_block_group, flags))) goto refill_cluster; - } if (used_block_group != block_group) btrfs_get_block_group(used_block_group); @@ -6349,17 +6325,19 @@ have_block_group: /* we have a block, we're done */ spin_unlock(&last_ptr->refill_lock); trace_btrfs_reserve_extent_cluster(root, - block_group, search_start, num_bytes); + used_block_group, + search_start, num_bytes); + if (used_block_group != block_group) { + btrfs_put_block_group(block_group); + block_group = used_block_group; + } goto checks; } WARN_ON(last_ptr->block_group != used_block_group); - if (used_block_group != block_group) { + if (used_block_group != block_group) btrfs_put_block_group(used_block_group); - used_block_group = block_group; - } refill_cluster: - BUG_ON(used_block_group != block_group); /* If we are on LOOP_NO_EMPTY_SIZE, we can't * set up a new clusters, so lets just skip it * and let the allocator find whatever block @@ -6478,25 +6456,25 @@ unclustered_alloc: goto loop; } checks: - search_start = stripe_align(root, used_block_group, + search_start = stripe_align(root, block_group, offset, num_bytes); /* move on to the next group */ if (search_start + num_bytes > - used_block_group->key.objectid + used_block_group->key.offset) { - btrfs_add_free_space(used_block_group, offset, num_bytes); + block_group->key.objectid + block_group->key.offset) { + btrfs_add_free_space(block_group, offset, num_bytes); goto loop; } if (offset < search_start) - btrfs_add_free_space(used_block_group, offset, + btrfs_add_free_space(block_group, offset, search_start - offset); BUG_ON(offset > search_start); - ret = btrfs_update_reserved_bytes(used_block_group, num_bytes, + ret = btrfs_update_reserved_bytes(block_group, num_bytes, alloc_type); if (ret == -EAGAIN) { - btrfs_add_free_space(used_block_group, offset, num_bytes); + btrfs_add_free_space(block_group, offset, num_bytes); goto loop; } @@ -6506,16 +6484,12 @@ checks: trace_btrfs_reserve_extent(orig_root, block_group, search_start, num_bytes); - if (used_block_group != block_group) - btrfs_put_block_group(used_block_group); btrfs_put_block_group(block_group); break; loop: failed_cluster_refill = false; failed_alloc = false; BUG_ON(index != get_block_group_index(block_group)); - if (used_block_group != block_group) - btrfs_put_block_group(used_block_group); btrfs_put_block_group(block_group); } up_read(&space_info->groups_sem); @@ -6586,12 +6560,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int index = 0; spin_lock(&info->lock); - printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n", + printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n", info->flags, info->total_bytes - info->bytes_used - info->bytes_pinned - info->bytes_reserved - info->bytes_readonly, (info->full) ? "" : "not "); - printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, " + printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, " "reserved=%llu, may_use=%llu, readonly=%llu\n", info->total_bytes, info->bytes_used, info->bytes_pinned, info->bytes_reserved, info->bytes_may_use, @@ -6605,7 +6579,9 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, again: list_for_each_entry(cache, &info->block_groups[index], list) { spin_lock(&cache->lock); - printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n", + printk(KERN_INFO "BTRFS: " + "block group %llu has %llu bytes, " + "%llu used %llu pinned %llu reserved %s\n", cache->key.objectid, cache->key.offset, btrfs_block_group_used(&cache->item), cache->pinned, cache->reserved, cache->ro ? "[readonly]" : ""); @@ -6968,7 +6944,7 @@ again: /*DEFAULT_RATELIMIT_BURST*/ 1); if (__ratelimit(&_rs)) WARN(1, KERN_DEBUG - "btrfs: block rsv returned %d\n", ret); + "BTRFS: block rsv returned %d\n", ret); } try_reserve: ret = reserve_metadata_bytes(root, block_rsv, blocksize, @@ -7716,7 +7692,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, btrfs_end_transaction_throttle(trans, tree_root); if (!for_reloc && btrfs_need_cleaner_sleep(root)) { - pr_debug("btrfs: drop snapshot early exit\n"); + pr_debug("BTRFS: drop snapshot early exit\n"); err = -EAGAIN; goto out_free; } @@ -7781,7 +7757,7 @@ out: */ if (!for_reloc && root_dropped == false) btrfs_add_dead_root(root); - if (err) + if (err && err != -EAGAIN) btrfs_std_error(root->fs_info, err); return err; } @@ -8335,6 +8311,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) release_global_block_rsv(info); while (!list_empty(&info->space_info)) { + int i; + space_info = list_entry(info->space_info.next, struct btrfs_space_info, list); @@ -8345,9 +8323,17 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) dump_space_info(space_info, 0, 0); } } - percpu_counter_destroy(&space_info->total_bytes_pinned); list_del(&space_info->list); - kfree(space_info); + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + struct kobject *kobj; + kobj = &space_info->block_group_kobjs[i]; + if (kobj->parent) { + kobject_del(kobj); + kobject_put(kobj); + } + } + kobject_del(&space_info->kobj); + kobject_put(&space_info->kobj); } return 0; } @@ -8358,10 +8344,57 @@ static void __link_block_group(struct btrfs_space_info *space_info, int index = get_block_group_index(cache); down_write(&space_info->groups_sem); + if (list_empty(&space_info->block_groups[index])) { + struct kobject *kobj = &space_info->block_group_kobjs[index]; + int ret; + + kobject_get(&space_info->kobj); /* put in release */ + ret = kobject_add(kobj, &space_info->kobj, "%s", + get_raid_name(index)); + if (ret) { + pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n"); + kobject_put(&space_info->kobj); + } + } list_add_tail(&cache->list, &space_info->block_groups[index]); up_write(&space_info->groups_sem); } +static struct btrfs_block_group_cache * +btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) +{ + struct btrfs_block_group_cache *cache; + + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) + return NULL; + + cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), + GFP_NOFS); + if (!cache->free_space_ctl) { + kfree(cache); + return NULL; + } + + cache->key.objectid = start; + cache->key.offset = size; + cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + + cache->sectorsize = root->sectorsize; + cache->fs_info = root->fs_info; + cache->full_stripe_len = btrfs_full_stripe_len(root, + &root->fs_info->mapping_tree, + start); + atomic_set(&cache->count, 1); + spin_lock_init(&cache->lock); + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + INIT_LIST_HEAD(&cache->new_bg_list); + btrfs_init_free_space_ctl(cache); + + return cache; +} + int btrfs_read_block_groups(struct btrfs_root *root) { struct btrfs_path *path; @@ -8397,26 +8430,16 @@ int btrfs_read_block_groups(struct btrfs_root *root) break; if (ret != 0) goto error; + leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - cache = kzalloc(sizeof(*cache), GFP_NOFS); + + cache = btrfs_create_block_group_cache(root, found_key.objectid, + found_key.offset); if (!cache) { ret = -ENOMEM; goto error; } - cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), - GFP_NOFS); - if (!cache->free_space_ctl) { - kfree(cache); - ret = -ENOMEM; - goto error; - } - - atomic_set(&cache->count, 1); - spin_lock_init(&cache->lock); - cache->fs_info = info; - INIT_LIST_HEAD(&cache->list); - INIT_LIST_HEAD(&cache->cluster_list); if (need_clear) { /* @@ -8437,16 +8460,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) read_extent_buffer(leaf, &cache->item, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(cache->item)); - memcpy(&cache->key, &found_key, sizeof(found_key)); + cache->flags = btrfs_block_group_flags(&cache->item); key.objectid = found_key.objectid + found_key.offset; btrfs_release_path(path); - cache->flags = btrfs_block_group_flags(&cache->item); - cache->sectorsize = root->sectorsize; - cache->full_stripe_len = btrfs_full_stripe_len(root, - &root->fs_info->mapping_tree, - found_key.objectid); - btrfs_init_free_space_ctl(cache); /* * We need to exclude the super stripes now so that the space @@ -8460,8 +8477,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) * case. */ free_excluded_extents(root, cache); - kfree(cache->free_space_ctl); - kfree(cache); + btrfs_put_block_group(cache); goto error; } @@ -8592,38 +8608,15 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, root->fs_info->last_trans_log_full_commit = trans->transid; - cache = kzalloc(sizeof(*cache), GFP_NOFS); + cache = btrfs_create_block_group_cache(root, chunk_offset, size); if (!cache) return -ENOMEM; - cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), - GFP_NOFS); - if (!cache->free_space_ctl) { - kfree(cache); - return -ENOMEM; - } - - cache->key.objectid = chunk_offset; - cache->key.offset = size; - cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - cache->sectorsize = root->sectorsize; - cache->fs_info = root->fs_info; - cache->full_stripe_len = btrfs_full_stripe_len(root, - &root->fs_info->mapping_tree, - chunk_offset); - - atomic_set(&cache->count, 1); - spin_lock_init(&cache->lock); - INIT_LIST_HEAD(&cache->list); - INIT_LIST_HEAD(&cache->cluster_list); - INIT_LIST_HEAD(&cache->new_bg_list); - - btrfs_init_free_space_ctl(cache); btrfs_set_block_group_used(&cache->item, bytes_used); btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); - cache->flags = type; btrfs_set_block_group_flags(&cache->item, type); + cache->flags = type; cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; ret = exclude_super_stripes(root, cache); @@ -8633,8 +8626,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, * case. */ free_excluded_extents(root, cache); - kfree(cache->free_space_ctl); - kfree(cache); + btrfs_put_block_group(cache); return ret; } @@ -8798,8 +8790,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * are still on the list after taking the semaphore */ list_del_init(&block_group->list); - if (list_empty(&block_group->space_info->block_groups[index])) + if (list_empty(&block_group->space_info->block_groups[index])) { + kobject_del(&block_group->space_info->block_group_kobjs[index]); + kobject_put(&block_group->space_info->block_group_kobjs[index]); clear_avail_alloc_bits(root->fs_info, block_group->flags); + } up_write(&block_group->space_info->groups_sem); if (block_group->cached == BTRFS_CACHE_STARTED) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ff43802..85bbd01 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -59,7 +59,7 @@ void btrfs_leak_debug_check(void) while (!list_empty(&states)) { state = list_entry(states.next, struct extent_state, leak_list); - printk(KERN_ERR "btrfs state leak: start %llu end %llu " + printk(KERN_ERR "BTRFS: state leak: start %llu end %llu " "state %lu in tree %p refs %d\n", state->start, state->end, state->state, state->tree, atomic_read(&state->refs)); @@ -69,7 +69,7 @@ void btrfs_leak_debug_check(void) while (!list_empty(&buffers)) { eb = list_entry(buffers.next, struct extent_buffer, leak_list); - printk(KERN_ERR "btrfs buffer leak start %llu len %lu " + printk(KERN_ERR "BTRFS: buffer leak start %llu len %lu " "refs %d\n", eb->start, eb->len, atomic_read(&eb->refs)); list_del(&eb->leak_list); @@ -77,16 +77,22 @@ void btrfs_leak_debug_check(void) } } -#define btrfs_debug_check_extent_io_range(inode, start, end) \ - __btrfs_debug_check_extent_io_range(__func__, (inode), (start), (end)) +#define btrfs_debug_check_extent_io_range(tree, start, end) \ + __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end)) static inline void __btrfs_debug_check_extent_io_range(const char *caller, - struct inode *inode, u64 start, u64 end) + struct extent_io_tree *tree, u64 start, u64 end) { - u64 isize = i_size_read(inode); + struct inode *inode; + u64 isize; + + if (!tree->mapping) + return; + inode = tree->mapping->host; + isize = i_size_read(inode); if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { printk_ratelimited(KERN_DEBUG - "btrfs: %s: ino %llu isize %llu odd range [%llu,%llu]\n", + "BTRFS: %s: ino %llu isize %llu odd range [%llu,%llu]\n", caller, btrfs_ino(inode), isize, start, end); } } @@ -124,6 +130,8 @@ static noinline void flush_write_bio(void *data); static inline struct btrfs_fs_info * tree_fs_info(struct extent_io_tree *tree) { + if (!tree->mapping) + return NULL; return btrfs_sb(tree->mapping->host->i_sb); } @@ -186,11 +194,9 @@ void extent_io_tree_init(struct extent_io_tree *tree, struct address_space *mapping) { tree->state = RB_ROOT; - INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC); tree->ops = NULL; tree->dirty_bytes = 0; spin_lock_init(&tree->lock); - spin_lock_init(&tree->buffer_lock); tree->mapping = mapping; } @@ -224,12 +230,20 @@ void free_extent_state(struct extent_state *state) } static struct rb_node *tree_insert(struct rb_root *root, u64 offset, - struct rb_node *node) + struct rb_node *node, + struct rb_node ***p_in, + struct rb_node **parent_in) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; struct tree_entry *entry; + if (p_in && parent_in) { + p = *p_in; + parent = *parent_in; + goto do_insert; + } + while (*p) { parent = *p; entry = rb_entry(parent, struct tree_entry, rb_node); @@ -242,35 +256,43 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset, return parent; } +do_insert: rb_link_node(node, parent, p); rb_insert_color(node, root); return NULL; } static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, - struct rb_node **prev_ret, - struct rb_node **next_ret) + struct rb_node **prev_ret, + struct rb_node **next_ret, + struct rb_node ***p_ret, + struct rb_node **parent_ret) { struct rb_root *root = &tree->state; - struct rb_node *n = root->rb_node; + struct rb_node **n = &root->rb_node; struct rb_node *prev = NULL; struct rb_node *orig_prev = NULL; struct tree_entry *entry; struct tree_entry *prev_entry = NULL; - while (n) { - entry = rb_entry(n, struct tree_entry, rb_node); - prev = n; + while (*n) { + prev = *n; + entry = rb_entry(prev, struct tree_entry, rb_node); prev_entry = entry; if (offset < entry->start) - n = n->rb_left; + n = &(*n)->rb_left; else if (offset > entry->end) - n = n->rb_right; + n = &(*n)->rb_right; else - return n; + return *n; } + if (p_ret) + *p_ret = n; + if (parent_ret) + *parent_ret = prev; + if (prev_ret) { orig_prev = prev; while (prev && offset > prev_entry->end) { @@ -292,18 +314,27 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, return NULL; } -static inline struct rb_node *tree_search(struct extent_io_tree *tree, - u64 offset) +static inline struct rb_node * +tree_search_for_insert(struct extent_io_tree *tree, + u64 offset, + struct rb_node ***p_ret, + struct rb_node **parent_ret) { struct rb_node *prev = NULL; struct rb_node *ret; - ret = __etree_search(tree, offset, &prev, NULL); + ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret); if (!ret) return prev; return ret; } +static inline struct rb_node *tree_search(struct extent_io_tree *tree, + u64 offset) +{ + return tree_search_for_insert(tree, offset, NULL, NULL); +} + static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, struct extent_state *other) { @@ -385,23 +416,25 @@ static void set_state_bits(struct extent_io_tree *tree, */ static int insert_state(struct extent_io_tree *tree, struct extent_state *state, u64 start, u64 end, + struct rb_node ***p, + struct rb_node **parent, unsigned long *bits) { struct rb_node *node; if (end < start) - WARN(1, KERN_ERR "btrfs end < start %llu %llu\n", + WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n", end, start); state->start = start; state->end = end; set_state_bits(tree, state, bits); - node = tree_insert(&tree->state, end, &state->rb_node); + node = tree_insert(&tree->state, end, &state->rb_node, p, parent); if (node) { struct extent_state *found; found = rb_entry(node, struct extent_state, rb_node); - printk(KERN_ERR "btrfs found node %llu %llu on insert of " + printk(KERN_ERR "BTRFS: found node %llu %llu on insert of " "%llu %llu\n", found->start, found->end, start, end); return -EEXIST; @@ -444,7 +477,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, prealloc->state = orig->state; orig->start = split; - node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); + node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node, + NULL, NULL); if (node) { free_extent_state(prealloc); return -EEXIST; @@ -542,7 +576,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int err; int clear = 0; - btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); + btrfs_debug_check_extent_io_range(tree, start, end); if (bits & EXTENT_DELALLOC) bits |= EXTENT_NORESERVE; @@ -702,7 +736,7 @@ static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state *state; struct rb_node *node; - btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); + btrfs_debug_check_extent_io_range(tree, start, end); spin_lock(&tree->lock); again: @@ -783,11 +817,13 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state *state; struct extent_state *prealloc = NULL; struct rb_node *node; + struct rb_node **p; + struct rb_node *parent; int err = 0; u64 last_start; u64 last_end; - btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); + btrfs_debug_check_extent_io_range(tree, start, end); bits |= EXTENT_FIRST_DELALLOC; again: @@ -809,14 +845,16 @@ again: * this search will find all the extents that end after * our range starts. */ - node = tree_search(tree, start); + node = tree_search_for_insert(tree, start, &p, &parent); if (!node) { prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); - err = insert_state(tree, prealloc, start, end, &bits); + err = insert_state(tree, prealloc, start, end, + &p, &parent, &bits); if (err) extent_io_tree_panic(tree, err); + cache_state(prealloc, cached_state); prealloc = NULL; goto out; } @@ -919,7 +957,7 @@ hit_next: * the later extent. */ err = insert_state(tree, prealloc, start, this_end, - &bits); + NULL, NULL, &bits); if (err) extent_io_tree_panic(tree, err); @@ -1005,11 +1043,13 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state *state; struct extent_state *prealloc = NULL; struct rb_node *node; + struct rb_node **p; + struct rb_node *parent; int err = 0; u64 last_start; u64 last_end; - btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); + btrfs_debug_check_extent_io_range(tree, start, end); again: if (!prealloc && (mask & __GFP_WAIT)) { @@ -1032,17 +1072,19 @@ again: * this search will find all the extents that end after * our range starts. */ - node = tree_search(tree, start); + node = tree_search_for_insert(tree, start, &p, &parent); if (!node) { prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) { err = -ENOMEM; goto out; } - err = insert_state(tree, prealloc, start, end, &bits); - prealloc = NULL; + err = insert_state(tree, prealloc, start, end, + &p, &parent, &bits); if (err) extent_io_tree_panic(tree, err); + cache_state(prealloc, cached_state); + prealloc = NULL; goto out; } state = rb_entry(node, struct extent_state, rb_node); @@ -1135,7 +1177,7 @@ hit_next: * the later extent. */ err = insert_state(tree, prealloc, start, this_end, - &bits); + NULL, NULL, &bits); if (err) extent_io_tree_panic(tree, err); cache_state(prealloc, cached_state); @@ -1984,7 +2026,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, bio = btrfs_io_bio_alloc(GFP_NOFS, 1); if (!bio) return -EIO; - bio->bi_size = 0; + bio->bi_iter.bi_size = 0; map_length = length; ret = btrfs_map_block(fs_info, WRITE, logical, @@ -1995,7 +2037,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, } BUG_ON(mirror_num != bbio->mirror_num); sector = bbio->stripes[mirror_num-1].physical >> 9; - bio->bi_sector = sector; + bio->bi_iter.bi_sector = sector; dev = bbio->stripes[mirror_num-1].dev; kfree(bbio); if (!dev || !dev->bdev || !dev->writeable) { @@ -2012,9 +2054,10 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, return -EIO; } - printk_ratelimited_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu " - "(dev %s sector %llu)\n", page->mapping->host->i_ino, - start, rcu_str_deref(dev->name), sector); + printk_ratelimited_in_rcu(KERN_INFO + "BTRFS: read error corrected: ino %lu off %llu " + "(dev %s sector %llu)\n", page->mapping->host->i_ino, + start, rcu_str_deref(dev->name), sector); bio_put(bio); return 0; @@ -2156,7 +2199,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, return -EIO; } - if (em->start > start || em->start + em->len < start) { + if (em->start > start || em->start + em->len <= start) { free_extent_map(em); em = NULL; } @@ -2268,9 +2311,9 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, return -EIO; } bio->bi_end_io = failed_bio->bi_end_io; - bio->bi_sector = failrec->logical >> 9; + bio->bi_iter.bi_sector = failrec->logical >> 9; bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; - bio->bi_size = 0; + bio->bi_iter.bi_size = 0; btrfs_failed_bio = btrfs_io_bio(failed_bio); if (btrfs_failed_bio->csum) { @@ -2332,37 +2375,39 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end) */ static void end_bio_extent_writepage(struct bio *bio, int err) { - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - struct extent_io_tree *tree; + struct bio_vec *bvec; u64 start; u64 end; + int i; - do { + bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; - tree = &BTRFS_I(page->mapping->host)->io_tree; /* We always issue full-page reads, but if some block * in a page fails to read, blk_update_request() will * advance bv_offset and adjust bv_len to compensate. * Print a warning for nonzero offsets, and an error * if they don't add up to a full page. */ - if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) - printk("%s page write in btrfs with offset %u and length %u\n", - bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE - ? KERN_ERR "partial" : KERN_INFO "incomplete", - bvec->bv_offset, bvec->bv_len); + if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) { + if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE) + btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info, + "partial page write in btrfs with offset %u and length %u", + bvec->bv_offset, bvec->bv_len); + else + btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info, + "incomplete page write in btrfs with offset %u and " + "length %u", + bvec->bv_offset, bvec->bv_len); + } start = page_offset(page); end = start + bvec->bv_offset + bvec->bv_len - 1; - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - if (end_extent_writepage(page, err, start, end)) continue; end_page_writeback(page); - } while (bvec >= bio->bi_io_vec); + } bio_put(bio); } @@ -2392,9 +2437,8 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, */ static void end_bio_extent_readpage(struct bio *bio, int err) { + struct bio_vec *bvec; int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; - struct bio_vec *bvec = bio->bi_io_vec; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); struct extent_io_tree *tree; u64 offset = 0; @@ -2405,16 +2449,17 @@ static void end_bio_extent_readpage(struct bio *bio, int err) u64 extent_len = 0; int mirror; int ret; + int i; if (err) uptodate = 0; - do { + bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " - "mirror=%lu\n", (u64)bio->bi_sector, err, + "mirror=%lu\n", (u64)bio->bi_iter.bi_sector, err, io_bio->mirror_num); tree = &BTRFS_I(inode)->io_tree; @@ -2423,19 +2468,22 @@ static void end_bio_extent_readpage(struct bio *bio, int err) * advance bv_offset and adjust bv_len to compensate. * Print a warning for nonzero offsets, and an error * if they don't add up to a full page. */ - if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) - printk("%s page read in btrfs with offset %u and length %u\n", - bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE - ? KERN_ERR "partial" : KERN_INFO "incomplete", - bvec->bv_offset, bvec->bv_len); + if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) { + if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE) + btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info, + "partial page read in btrfs with offset %u and length %u", + bvec->bv_offset, bvec->bv_len); + else + btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info, + "incomplete page read in btrfs with offset %u and " + "length %u", + bvec->bv_offset, bvec->bv_len); + } start = page_offset(page); end = start + bvec->bv_offset + bvec->bv_len - 1; len = bvec->bv_len; - if (++bvec <= bvec_end) - prefetchw(&bvec->bv_page->flags); - mirror = io_bio->mirror_num; if (likely(uptodate && tree->ops && tree->ops->readpage_end_io_hook)) { @@ -2516,7 +2564,7 @@ readpage_ok: extent_start = start; extent_len = end + 1 - start; } - } while (bvec <= bvec_end); + } if (extent_len) endio_readpage_release_extent(tree, extent_start, extent_len, @@ -2547,9 +2595,8 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, } if (bio) { - bio->bi_size = 0; bio->bi_bdev = bdev; - bio->bi_sector = first_sector; + bio->bi_iter.bi_sector = first_sector; btrfs_bio = btrfs_io_bio(bio); btrfs_bio->csum = NULL; btrfs_bio->csum_allocated = NULL; @@ -2643,7 +2690,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, if (bio_ret && *bio_ret) { bio = *bio_ret; if (old_compressed) - contig = bio->bi_sector == sector; + contig = bio->bi_iter.bi_sector == sector; else contig = bio_end_sector(bio) == sector; @@ -3287,8 +3334,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, set_range_writeback(tree, cur, cur + iosize - 1); if (!PageWriteback(page)) { - printk(KERN_ERR "btrfs warning page %lu not " - "writeback, cur %llu end %llu\n", + btrfs_err(BTRFS_I(inode)->root->fs_info, + "page %lu not writeback, cur %llu end %llu", page->index, cur, end); } @@ -3410,20 +3457,18 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb) static void end_bio_extent_buffer_writepage(struct bio *bio, int err) { - int uptodate = err == 0; - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bvec; struct extent_buffer *eb; - int done; + int i, done; - do { + bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; - bvec--; eb = (struct extent_buffer *)page->private; BUG_ON(!eb); done = atomic_dec_and_test(&eb->io_pages); - if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { + if (err || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); ClearPageUptodate(page); SetPageError(page); @@ -3435,10 +3480,9 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err) continue; end_extent_buffer_writeback(eb); - } while (bvec >= bio->bi_io_vec); + } bio_put(bio); - } static int write_one_eb(struct extent_buffer *eb, @@ -3447,6 +3491,7 @@ static int write_one_eb(struct extent_buffer *eb, struct extent_page_data *epd) { struct block_device *bdev = fs_info->fs_devices->latest_bdev; + struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree; u64 offset = eb->start; unsigned long i, num_pages; unsigned long bio_flags = 0; @@ -3464,7 +3509,7 @@ static int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); - ret = submit_extent_page(rw, eb->tree, p, offset >> 9, + ret = submit_extent_page(rw, tree, p, offset >> 9, PAGE_CACHE_SIZE, 0, bdev, &epd->bio, -1, end_bio_extent_buffer_writepage, 0, epd->bio_flags, bio_flags); @@ -4082,12 +4127,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct extent_map *em = NULL; struct extent_state *cached_state = NULL; struct btrfs_path *path; - struct btrfs_file_extent_item *item; int end = 0; u64 em_start = 0; u64 em_len = 0; u64 em_end = 0; - unsigned long emflags; if (len == 0) return -EINVAL; @@ -4112,8 +4155,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } WARN_ON(!ret); path->slots[0]--; - item = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); found_type = btrfs_key_type(&found_key); @@ -4181,7 +4222,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, offset_in_extent = em_start - em->start; em_end = extent_map_end(em); em_len = em_end - em_start; - emflags = em->flags; disko = 0; flags = 0; @@ -4333,10 +4373,9 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) __free_extent_buffer(eb); } -static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, - u64 start, - unsigned long len, - gfp_t mask) +static struct extent_buffer * +__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, + unsigned long len, gfp_t mask) { struct extent_buffer *eb = NULL; @@ -4345,7 +4384,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, return NULL; eb->start = start; eb->len = len; - eb->tree = tree; + eb->fs_info = fs_info; eb->bflags = 0; rwlock_init(&eb->lock); atomic_set(&eb->write_locks, 0); @@ -4477,13 +4516,14 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb) } } -struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, - u64 start) +struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) { struct extent_buffer *eb; rcu_read_lock(); - eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); + eb = radix_tree_lookup(&fs_info->buffer_radix, + start >> PAGE_CACHE_SHIFT); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); mark_extent_buffer_accessed(eb); @@ -4494,7 +4534,7 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, return NULL; } -struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, +struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, unsigned long len) { unsigned long num_pages = num_extent_pages(start, len); @@ -4503,16 +4543,15 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, struct extent_buffer *eb; struct extent_buffer *exists = NULL; struct page *p; - struct address_space *mapping = tree->mapping; + struct address_space *mapping = fs_info->btree_inode->i_mapping; int uptodate = 1; int ret; - - eb = find_extent_buffer(tree, start); + eb = find_extent_buffer(fs_info, start); if (eb) return eb; - eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS); + eb = __alloc_extent_buffer(fs_info, start, len, GFP_NOFS); if (!eb) return NULL; @@ -4567,12 +4606,13 @@ again: if (ret) goto free_eb; - spin_lock(&tree->buffer_lock); - ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb); - spin_unlock(&tree->buffer_lock); + spin_lock(&fs_info->buffer_lock); + ret = radix_tree_insert(&fs_info->buffer_radix, + start >> PAGE_CACHE_SHIFT, eb); + spin_unlock(&fs_info->buffer_lock); radix_tree_preload_end(); if (ret == -EEXIST) { - exists = find_extent_buffer(tree, start); + exists = find_extent_buffer(fs_info, start); if (exists) goto free_eb; else @@ -4580,6 +4620,7 @@ again: } /* add one reference for the tree */ check_buffer_tree_ref(eb); + set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); /* * there is a race where release page may have @@ -4623,17 +4664,17 @@ static int release_extent_buffer(struct extent_buffer *eb) { WARN_ON(atomic_read(&eb->refs) == 0); if (atomic_dec_and_test(&eb->refs)) { - if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) { - spin_unlock(&eb->refs_lock); - } else { - struct extent_io_tree *tree = eb->tree; + if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) { + struct btrfs_fs_info *fs_info = eb->fs_info; spin_unlock(&eb->refs_lock); - spin_lock(&tree->buffer_lock); - radix_tree_delete(&tree->buffer, + spin_lock(&fs_info->buffer_lock); + radix_tree_delete(&fs_info->buffer_radix, eb->start >> PAGE_CACHE_SHIFT); - spin_unlock(&tree->buffer_lock); + spin_unlock(&fs_info->buffer_lock); + } else { + spin_unlock(&eb->refs_lock); } /* Should be safe to release our pages at this point */ @@ -5112,12 +5153,12 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_i; if (src_offset + len > dst->len) { - printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " + printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move " "len %lu dst len %lu\n", src_offset, len, dst->len); BUG_ON(1); } if (dst_offset + len > dst->len) { - printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " + printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move " "len %lu dst len %lu\n", dst_offset, len, dst->len); BUG_ON(1); } @@ -5159,12 +5200,12 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_i; if (src_offset + len > dst->len) { - printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " + printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move " "len %lu len %lu\n", src_offset, len, dst->len); BUG_ON(1); } if (dst_offset + len > dst->len) { - printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " + printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move " "len %lu len %lu\n", dst_offset, len, dst->len); BUG_ON(1); } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 19620c5..58b27e5 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -43,6 +43,7 @@ #define EXTENT_BUFFER_WRITEBACK 7 #define EXTENT_BUFFER_IOERR 8 #define EXTENT_BUFFER_DUMMY 9 +#define EXTENT_BUFFER_IN_TREE 10 /* these are flags for extent_clear_unlock_delalloc */ #define PAGE_UNLOCK (1 << 0) @@ -94,12 +95,10 @@ struct extent_io_ops { struct extent_io_tree { struct rb_root state; - struct radix_tree_root buffer; struct address_space *mapping; u64 dirty_bytes; int track_uptodate; spinlock_t lock; - spinlock_t buffer_lock; struct extent_io_ops *ops; }; @@ -130,7 +129,7 @@ struct extent_buffer { unsigned long map_start; unsigned long map_len; unsigned long bflags; - struct extent_io_tree *tree; + struct btrfs_fs_info *fs_info; spinlock_t refs_lock; atomic_t refs; atomic_t io_pages; @@ -266,11 +265,11 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); void set_page_extent_mapped(struct page *page); -struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, +struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, unsigned long len); struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len); struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src); -struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, +struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); void free_extent_buffer(struct extent_buffer *eb); void free_extent_buffer_stale(struct extent_buffer *eb); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index a4a7a1a..996ad56b 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -79,12 +79,21 @@ void free_extent_map(struct extent_map *em) } } -static struct rb_node *tree_insert(struct rb_root *root, u64 offset, - struct rb_node *node) +/* simple helper to do math around the end of an extent, handling wrap */ +static u64 range_end(u64 start, u64 len) +{ + if (start + len < start) + return (u64)-1; + return start + len; +} + +static int tree_insert(struct rb_root *root, struct extent_map *em) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; - struct extent_map *entry; + struct extent_map *entry = NULL; + struct rb_node *orig_parent = NULL; + u64 end = range_end(em->start, em->len); while (*p) { parent = *p; @@ -92,19 +101,37 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset, WARN_ON(!entry->in_tree); - if (offset < entry->start) + if (em->start < entry->start) p = &(*p)->rb_left; - else if (offset >= extent_map_end(entry)) + else if (em->start >= extent_map_end(entry)) p = &(*p)->rb_right; else - return parent; + return -EEXIST; } - entry = rb_entry(node, struct extent_map, rb_node); - entry->in_tree = 1; - rb_link_node(node, parent, p); - rb_insert_color(node, root); - return NULL; + orig_parent = parent; + while (parent && em->start >= extent_map_end(entry)) { + parent = rb_next(parent); + entry = rb_entry(parent, struct extent_map, rb_node); + } + if (parent) + if (end > entry->start && em->start < extent_map_end(entry)) + return -EEXIST; + + parent = orig_parent; + entry = rb_entry(parent, struct extent_map, rb_node); + while (parent && em->start < entry->start) { + parent = rb_prev(parent); + entry = rb_entry(parent, struct extent_map, rb_node); + } + if (parent) + if (end > entry->start && em->start < extent_map_end(entry)) + return -EEXIST; + + em->in_tree = 1; + rb_link_node(&em->rb_node, orig_parent, p); + rb_insert_color(&em->rb_node, root); + return 0; } /* @@ -228,7 +255,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) merge = rb_entry(rb, struct extent_map, rb_node); if (rb && mergable_maps(em, merge)) { em->len += merge->len; - em->block_len += merge->len; + em->block_len += merge->block_len; rb_erase(&merge->rb_node, &tree->map); merge->in_tree = 0; em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; @@ -310,20 +337,11 @@ int add_extent_mapping(struct extent_map_tree *tree, struct extent_map *em, int modified) { int ret = 0; - struct rb_node *rb; - struct extent_map *exist; - exist = lookup_extent_mapping(tree, em->start, em->len); - if (exist) { - free_extent_map(exist); - ret = -EEXIST; - goto out; - } - rb = tree_insert(&tree->map, em->start, &em->rb_node); - if (rb) { - ret = -EEXIST; + ret = tree_insert(&tree->map, em); + if (ret) goto out; - } + atomic_inc(&em->refs); em->mod_start = em->start; @@ -337,14 +355,6 @@ out: return ret; } -/* simple helper to do math around the end of an extent, handling wrap */ -static u64 range_end(u64 start, u64 len) -{ - if (start + len < start) - return (u64)-1; - return start + len; -} - static struct extent_map * __lookup_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len, int strict) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 6f38488..127555b 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -182,7 +182,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, if (!path) return -ENOMEM; - nblocks = bio->bi_size >> inode->i_sb->s_blocksize_bits; + nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits; if (!dst) { if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { btrfs_bio->csum_allocated = kmalloc(nblocks * csum_size, @@ -201,7 +201,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, csum = (u8 *)dst; } - if (bio->bi_size > PAGE_CACHE_SIZE * 8) + if (bio->bi_iter.bi_size > PAGE_CACHE_SIZE * 8) path->reada = 2; WARN_ON(bio->bi_vcnt <= 0); @@ -217,7 +217,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, path->skip_locking = 1; } - disk_bytenr = (u64)bio->bi_sector << 9; + disk_bytenr = (u64)bio->bi_iter.bi_sector << 9; if (dio) offset = logical_offset; while (bio_index < bio->bi_vcnt) { @@ -246,8 +246,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, offset + bvec->bv_len - 1, EXTENT_NODATASUM, GFP_NOFS); } else { - printk(KERN_INFO "btrfs no csum found " - "for inode %llu start %llu\n", + btrfs_info(BTRFS_I(inode)->root->fs_info, + "no csum found for inode %llu start %llu", btrfs_ino(inode), offset); } item = NULL; @@ -302,7 +302,7 @@ int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, struct btrfs_dio_private *dip, struct bio *bio, u64 offset) { - int len = (bio->bi_sector << 9) - dip->disk_bytenr; + int len = (bio->bi_iter.bi_sector << 9) - dip->disk_bytenr; u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); int ret; @@ -447,11 +447,12 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, u64 offset; WARN_ON(bio->bi_vcnt <= 0); - sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS); + sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_iter.bi_size), + GFP_NOFS); if (!sums) return -ENOMEM; - sums->len = bio->bi_size; + sums->len = bio->bi_iter.bi_size; INIT_LIST_HEAD(&sums->list); if (contig) @@ -461,7 +462,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, ordered = btrfs_lookup_ordered_extent(inode, offset); BUG_ON(!ordered); /* Logic error */ - sums->bytenr = (u64)bio->bi_sector << 9; + sums->bytenr = (u64)bio->bi_iter.bi_sector << 9; index = 0; while (bio_index < bio->bi_vcnt) { @@ -476,7 +477,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, btrfs_add_ordered_sum(inode, ordered, sums); btrfs_put_ordered_extent(ordered); - bytes_left = bio->bi_size - total_bytes; + bytes_left = bio->bi_iter.bi_size - total_bytes; sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), GFP_NOFS); @@ -484,7 +485,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, sums->len = bytes_left; ordered = btrfs_lookup_ordered_extent(inode, offset); BUG_ON(!ordered); /* Logic error */ - sums->bytenr = ((u64)bio->bi_sector << 9) + + sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9) + total_bytes; index = 0; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 82d0342..0165b86 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -692,7 +692,10 @@ next: int __btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, u64 start, u64 end, - u64 *drop_end, int drop_cache) + u64 *drop_end, int drop_cache, + int replace_extent, + u32 extent_item_size, + int *key_inserted) { struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; @@ -712,6 +715,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, int modify_tree = -1; int update_refs = (root->ref_cows || root == root->fs_info->tree_root); int found = 0; + int leafs_visited = 0; if (drop_cache) btrfs_drop_extent_cache(inode, start, end - 1, 0); @@ -733,6 +737,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, path->slots[0]--; } ret = 0; + leafs_visited++; next_slot: leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { @@ -744,6 +749,7 @@ next_slot: ret = 0; break; } + leafs_visited++; leaf = path->nodes[0]; recow = 1; } @@ -766,7 +772,8 @@ next_slot: btrfs_file_extent_num_bytes(leaf, fi); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { extent_end = key.offset + - btrfs_file_extent_inline_len(leaf, fi); + btrfs_file_extent_inline_len(leaf, + path->slots[0], fi); } else { WARN_ON(1); extent_end = search_start; @@ -927,14 +934,44 @@ next_slot: } if (!ret && del_nr > 0) { + /* + * Set path->slots[0] to first slot, so that after the delete + * if items are move off from our leaf to its immediate left or + * right neighbor leafs, we end up with a correct and adjusted + * path->slots[0] for our insertion. + */ + path->slots[0] = del_slot; ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret) btrfs_abort_transaction(trans, root, ret); + + leaf = path->nodes[0]; + /* + * leaf eb has flag EXTENT_BUFFER_STALE if it was deleted (that + * is, its contents got pushed to its neighbors), in which case + * it means path->locks[0] == 0 + */ + if (!ret && replace_extent && leafs_visited == 1 && + path->locks[0] && + btrfs_leaf_free_space(root, leaf) >= + sizeof(struct btrfs_item) + extent_item_size) { + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + setup_items_for_insert(root, path, &key, + &extent_item_size, + extent_item_size, + sizeof(struct btrfs_item) + + extent_item_size, 1); + *key_inserted = 1; + } } + if (!replace_extent || !(*key_inserted)) + btrfs_release_path(path); if (drop_end) *drop_end = found ? min(end, extent_end) : end; - btrfs_release_path(path); return ret; } @@ -949,7 +986,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL, - drop_cache); + drop_cache, 0, 0, NULL); btrfs_free_path(path); return ret; } @@ -1235,29 +1272,18 @@ static int prepare_uptodate_page(struct page *page, u64 pos, } /* - * this gets pages into the page cache and locks them down, it also properly - * waits for data=ordered extents to finish before allowing the pages to be - * modified. + * this just gets pages into the page cache and locks them down. */ -static noinline int prepare_pages(struct btrfs_root *root, struct file *file, - struct page **pages, size_t num_pages, - loff_t pos, unsigned long first_index, - size_t write_bytes, bool force_uptodate) +static noinline int prepare_pages(struct inode *inode, struct page **pages, + size_t num_pages, loff_t pos, + size_t write_bytes, bool force_uptodate) { - struct extent_state *cached_state = NULL; int i; unsigned long index = pos >> PAGE_CACHE_SHIFT; - struct inode *inode = file_inode(file); gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int err = 0; - int faili = 0; - u64 start_pos; - u64 last_pos; - - start_pos = pos & ~((u64)root->sectorsize - 1); - last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; + int faili; -again: for (i = 0; i < num_pages; i++) { pages[i] = find_or_create_page(inode->i_mapping, index + i, mask | __GFP_WRITE); @@ -1280,57 +1306,85 @@ again: } wait_on_page_writeback(pages[i]); } - faili = num_pages - 1; - err = 0; + + return 0; +fail: + while (faili >= 0) { + unlock_page(pages[faili]); + page_cache_release(pages[faili]); + faili--; + } + return err; + +} + +/* + * This function locks the extent and properly waits for data=ordered extents + * to finish before allowing the pages to be modified if need. + * + * The return value: + * 1 - the extent is locked + * 0 - the extent is not locked, and everything is OK + * -EAGAIN - need re-prepare the pages + * the other < 0 number - Something wrong happens + */ +static noinline int +lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages, + size_t num_pages, loff_t pos, + u64 *lockstart, u64 *lockend, + struct extent_state **cached_state) +{ + u64 start_pos; + u64 last_pos; + int i; + int ret = 0; + + start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1); + last_pos = start_pos + ((u64)num_pages << PAGE_CACHE_SHIFT) - 1; + if (start_pos < inode->i_size) { struct btrfs_ordered_extent *ordered; lock_extent_bits(&BTRFS_I(inode)->io_tree, - start_pos, last_pos - 1, 0, &cached_state); - ordered = btrfs_lookup_first_ordered_extent(inode, - last_pos - 1); + start_pos, last_pos, 0, cached_state); + ordered = btrfs_lookup_first_ordered_extent(inode, last_pos); if (ordered && ordered->file_offset + ordered->len > start_pos && - ordered->file_offset < last_pos) { + ordered->file_offset <= last_pos) { btrfs_put_ordered_extent(ordered); unlock_extent_cached(&BTRFS_I(inode)->io_tree, - start_pos, last_pos - 1, - &cached_state, GFP_NOFS); + start_pos, last_pos, + cached_state, GFP_NOFS); for (i = 0; i < num_pages; i++) { unlock_page(pages[i]); page_cache_release(pages[i]); } - err = btrfs_wait_ordered_range(inode, start_pos, - last_pos - start_pos); - if (err) - goto fail; - goto again; + ret = btrfs_wait_ordered_range(inode, start_pos, + last_pos - start_pos + 1); + if (ret) + return ret; + else + return -EAGAIN; } if (ordered) btrfs_put_ordered_extent(ordered); clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, - last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | + last_pos, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, &cached_state, GFP_NOFS); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - start_pos, last_pos - 1, &cached_state, - GFP_NOFS); + 0, 0, cached_state, GFP_NOFS); + *lockstart = start_pos; + *lockend = last_pos; + ret = 1; } + for (i = 0; i < num_pages; i++) { if (clear_page_dirty_for_io(pages[i])) account_page_redirty(pages[i]); set_page_extent_mapped(pages[i]); WARN_ON(!PageLocked(pages[i])); } - return 0; -fail: - while (faili >= 0) { - unlock_page(pages[faili]); - page_cache_release(pages[faili]); - faili--; - } - return err; + return ret; } static noinline int check_can_nocow(struct inode *inode, loff_t pos, @@ -1381,13 +1435,17 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct page **pages = NULL; + struct extent_state *cached_state = NULL; u64 release_bytes = 0; + u64 lockstart; + u64 lockend; unsigned long first_index; size_t num_written = 0; int nrptrs; int ret = 0; bool only_release_metadata = false; bool force_page_uptodate = false; + bool need_unlock; nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / @@ -1456,18 +1514,31 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, } release_bytes = reserve_bytes; - + need_unlock = false; +again: /* * This is going to setup the pages array with the number of * pages we want, so we don't really need to worry about the * contents of pages from loop to loop */ - ret = prepare_pages(root, file, pages, num_pages, - pos, first_index, write_bytes, + ret = prepare_pages(inode, pages, num_pages, + pos, write_bytes, force_page_uptodate); if (ret) break; + ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages, + pos, &lockstart, &lockend, + &cached_state); + if (ret < 0) { + if (ret == -EAGAIN) + goto again; + break; + } else if (ret > 0) { + need_unlock = true; + ret = 0; + } + copied = btrfs_copy_from_user(pos, num_pages, write_bytes, pages, i); @@ -1512,19 +1583,21 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, } release_bytes = dirty_pages << PAGE_CACHE_SHIFT; - if (copied > 0) { + + if (copied > 0) ret = btrfs_dirty_pages(root, inode, pages, dirty_pages, pos, copied, NULL); - if (ret) { - btrfs_drop_pages(pages, num_pages); - break; - } + if (need_unlock) + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + lockstart, lockend, &cached_state, + GFP_NOFS); + if (ret) { + btrfs_drop_pages(pages, num_pages); + break; } release_bytes = 0; - btrfs_drop_pages(pages, num_pages); - if (only_release_metadata && copied > 0) { u64 lockstart = round_down(pos, root->sectorsize); u64 lockend = lockstart + @@ -1536,6 +1609,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, only_release_metadata = false; } + btrfs_drop_pages(pages, num_pages); + cond_resched(); balance_dirty_pages_ratelimited(inode->i_mapping); @@ -1857,12 +1932,24 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (file->private_data) btrfs_ioctl_trans_end(file); + /* + * We use start here because we will need to wait on the IO to complete + * in btrfs_sync_log, which could require joining a transaction (for + * example checking cross references in the nocow path). If we use join + * here we could get into a situation where we're waiting on IO to + * happen that is blocked on a transaction trying to commit. With start + * we inc the extwriter counter, so we wait for all extwriters to exit + * before we start blocking join'ers. This comment is to keep somebody + * from thinking they are super smart and changing this to + * btrfs_join_transaction *cough*Josef*cough*. + */ trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); mutex_unlock(&inode->i_mutex); goto out; } + trans->sync = true; ret = btrfs_log_dentry_safe(trans, root, dentry); if (ret < 0) { @@ -1963,11 +2050,13 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_key key; int ret; + if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) + goto out; + key.objectid = btrfs_ino(inode); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = offset; - ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) return ret; @@ -2064,8 +2153,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) u64 drop_end; int ret = 0; int err = 0; + int rsv_count; bool same_page = ((offset >> PAGE_CACHE_SHIFT) == ((offset + len - 1) >> PAGE_CACHE_SHIFT)); + bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); ret = btrfs_wait_ordered_range(inode, offset, len); if (ret) @@ -2125,7 +2216,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) * we need to try again. */ if ((!ordered || - (ordered->file_offset + ordered->len < lockstart || + (ordered->file_offset + ordered->len <= lockstart || ordered->file_offset > lockend)) && !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, EXTENT_UPTODATE, 0, @@ -2163,9 +2254,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) /* * 1 - update the inode * 1 - removing the extents in the range - * 1 - adding the hole extent + * 1 - adding the hole extent if no_holes isn't set */ - trans = btrfs_start_transaction(root, 3); + rsv_count = no_holes ? 2 : 3; + trans = btrfs_start_transaction(root, rsv_count); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out_free; @@ -2179,7 +2271,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) while (cur_offset < lockend) { ret = __btrfs_drop_extents(trans, root, inode, path, cur_offset, lockend + 1, - &drop_end, 1); + &drop_end, 1, 0, 0, NULL); if (ret != -ENOSPC) break; @@ -2202,7 +2294,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); - trans = btrfs_start_transaction(root, 3); + trans = btrfs_start_transaction(root, rsv_count); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 057be95..73f3de7 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -347,8 +347,8 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, btrfs_readpage(NULL, page); lock_page(page); if (!PageUptodate(page)) { - printk(KERN_ERR "btrfs: error reading free " - "space cache\n"); + btrfs_err(BTRFS_I(inode)->root->fs_info, + "error reading free space cache"); io_ctl_drop_pages(io_ctl); return -EIO; } @@ -405,7 +405,7 @@ static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) gen = io_ctl->cur; if (le64_to_cpu(*gen) != generation) { - printk_ratelimited(KERN_ERR "btrfs: space cache generation " + printk_ratelimited(KERN_ERR "BTRFS: space cache generation " "(%Lu) does not match inode (%Lu)\n", *gen, generation); io_ctl_unmap_page(io_ctl); @@ -463,7 +463,7 @@ static int io_ctl_check_crc(struct io_ctl *io_ctl, int index) PAGE_CACHE_SIZE - offset); btrfs_csum_final(crc, (char *)&crc); if (val != crc) { - printk_ratelimited(KERN_ERR "btrfs: csum mismatch on free " + printk_ratelimited(KERN_ERR "BTRFS: csum mismatch on free " "space cache\n"); io_ctl_unmap_page(io_ctl); return -EIO; @@ -1902,7 +1902,7 @@ out: spin_unlock(&ctl->tree_lock); if (ret) { - printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret); + printk(KERN_CRIT "BTRFS: unable to add free space :%d\n", ret); ASSERT(ret != -EEXIST); } @@ -2011,14 +2011,15 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, info = rb_entry(n, struct btrfs_free_space, offset_index); if (info->bytes >= bytes && !block_group->ro) count++; - printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n", - info->offset, info->bytes, + btrfs_crit(block_group->fs_info, + "entry offset %llu, bytes %llu, bitmap %s", + info->offset, info->bytes, (info->bitmap) ? "yes" : "no"); } - printk(KERN_INFO "block group has cluster?: %s\n", + btrfs_info(block_group->fs_info, "block group has cluster?: %s", list_empty(&block_group->cluster_list) ? "no" : "yes"); - printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" - "\n", count); + btrfs_info(block_group->fs_info, + "%d blocks of free space at or bigger than bytes is", count); } void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group) @@ -2421,7 +2422,6 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_space *entry = NULL; struct btrfs_free_space *last; struct rb_node *node; - u64 window_start; u64 window_free; u64 max_extent; u64 total_size = 0; @@ -2443,7 +2443,6 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, entry = rb_entry(node, struct btrfs_free_space, offset_index); } - window_start = entry->offset; window_free = entry->bytes; max_extent = entry->bytes; first = entry; diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c new file mode 100644 index 0000000..85889aa --- /dev/null +++ b/fs/btrfs/hash.c @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <crypto/hash.h> +#include <linux/err.h> +#include "hash.h" + +static struct crypto_shash *tfm; + +int __init btrfs_hash_init(void) +{ + tfm = crypto_alloc_shash("crc32c", 0, 0); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + return 0; +} + +void btrfs_hash_exit(void) +{ + crypto_free_shash(tfm); +} + +u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[crypto_shash_descsize(tfm)]; + } desc; + int err; + + desc.shash.tfm = tfm; + desc.shash.flags = 0; + *(u32 *)desc.ctx = crc; + + err = crypto_shash_update(&desc.shash, address, length); + BUG_ON(err); + + return *(u32 *)desc.ctx; +} diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h index 1d98281..118a231 100644 --- a/fs/btrfs/hash.h +++ b/fs/btrfs/hash.h @@ -19,10 +19,15 @@ #ifndef __HASH__ #define __HASH__ -#include <linux/crc32c.h> +int __init btrfs_hash_init(void); + +void btrfs_hash_exit(void); + +u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length); + static inline u64 btrfs_name_hash(const char *name, int len) { - return crc32c((u32)~1, name, len); + return btrfs_crc32c((u32)~1, name, len); } /* @@ -31,7 +36,7 @@ static inline u64 btrfs_name_hash(const char *name, int len) static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, int len) { - return (u64) crc32c(parent_objectid, name, len); + return (u64) btrfs_crc32c(parent_objectid, name, len); } #endif diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index ec82fae..2be38df 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -91,32 +91,6 @@ int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid, return 0; } -static struct btrfs_inode_ref * -btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, int ins_len, - int cow) -{ - int ret; - struct btrfs_key key; - struct btrfs_inode_ref *ref; - - key.objectid = inode_objectid; - key.type = BTRFS_INODE_REF_KEY; - key.offset = ref_objectid; - - ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); - if (ret < 0) - return ERR_PTR(ret); - if (ret > 0) - return NULL; - if (!find_name_in_backref(path, name, name_len, &ref)) - return NULL; - return ref; -} - /* Returns NULL if no extref found */ struct btrfs_inode_extref * btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, @@ -144,45 +118,6 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, return extref; } -int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, int mod, - u64 *ret_index) -{ - struct btrfs_inode_ref *ref; - struct btrfs_inode_extref *extref; - int ins_len = mod < 0 ? -1 : 0; - int cow = mod != 0; - - ref = btrfs_lookup_inode_ref(trans, root, path, name, name_len, - inode_objectid, ref_objectid, ins_len, - cow); - if (IS_ERR(ref)) - return PTR_ERR(ref); - - if (ref != NULL) { - *ret_index = btrfs_inode_ref_index(path->nodes[0], ref); - return 0; - } - - btrfs_release_path(path); - - extref = btrfs_lookup_inode_extref(trans, root, path, name, - name_len, inode_objectid, - ref_objectid, ins_len, cow); - if (IS_ERR(extref)) - return PTR_ERR(extref); - - if (extref) { - *ret_index = btrfs_inode_extref_index(path->nodes[0], extref); - return 0; - } - - return -ENOENT; -} - static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f1a7744..5c4ab9c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -58,9 +58,10 @@ #include "inode-map.h" #include "backref.h" #include "hash.h" +#include "props.h" struct btrfs_iget_args { - u64 ino; + struct btrfs_key *location; struct btrfs_root *root; }; @@ -125,13 +126,12 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, * no overlapping inline items exist in the btree */ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, + struct btrfs_path *path, int extent_inserted, struct btrfs_root *root, struct inode *inode, u64 start, size_t size, size_t compressed_size, int compress_type, struct page **compressed_pages) { - struct btrfs_key key; - struct btrfs_path *path; struct extent_buffer *leaf; struct page *page = NULL; char *kaddr; @@ -140,29 +140,29 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, int err = 0; int ret; size_t cur_size = size; - size_t datasize; unsigned long offset; if (compressed_size && compressed_pages) cur_size = compressed_size; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + inode_add_bytes(inode, size); - path->leave_spinning = 1; + if (!extent_inserted) { + struct btrfs_key key; + size_t datasize; - key.objectid = btrfs_ino(inode); - key.offset = start; - btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); - datasize = btrfs_file_extent_calc_inline_size(cur_size); + key.objectid = btrfs_ino(inode); + key.offset = start; + btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); - inode_add_bytes(inode, size); - ret = btrfs_insert_empty_item(trans, root, path, &key, - datasize); - if (ret) { - err = ret; - goto fail; + datasize = btrfs_file_extent_calc_inline_size(cur_size); + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, root, path, &key, + datasize); + if (ret) { + err = ret; + goto fail; + } } leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], @@ -203,7 +203,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, page_cache_release(page); } btrfs_mark_buffer_dirty(leaf); - btrfs_free_path(path); + btrfs_release_path(path); /* * we're an inline extent, so nobody can @@ -219,7 +219,6 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, return ret; fail: - btrfs_free_path(path); return err; } @@ -242,6 +241,9 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, u64 aligned_end = ALIGN(end, root->sectorsize); u64 data_len = inline_len; int ret; + struct btrfs_path *path; + int extent_inserted = 0; + u32 extent_item_size; if (compressed_size) data_len = compressed_size; @@ -256,12 +258,27 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, return 1; } + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) + if (IS_ERR(trans)) { + btrfs_free_path(path); return PTR_ERR(trans); + } trans->block_rsv = &root->fs_info->delalloc_block_rsv; - ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1); + if (compressed_size && compressed_pages) + extent_item_size = btrfs_file_extent_calc_inline_size( + compressed_size); + else + extent_item_size = btrfs_file_extent_calc_inline_size( + inline_len); + + ret = __btrfs_drop_extents(trans, root, inode, path, + start, aligned_end, NULL, + 1, 1, extent_item_size, &extent_inserted); if (ret) { btrfs_abort_transaction(trans, root, ret); goto out; @@ -269,7 +286,8 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, if (isize > actual_end) inline_len = min_t(u64, isize, actual_end); - ret = insert_inline_extent(trans, root, inode, start, + ret = insert_inline_extent(trans, path, extent_inserted, + root, inode, start, inline_len, compressed_size, compress_type, compressed_pages); if (ret && ret != -ENOSPC) { @@ -284,6 +302,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, btrfs_delalloc_release_metadata(inode, end + 1 - start); btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); out: + btrfs_free_path(path); btrfs_end_transaction(trans, root); return ret; } @@ -1262,7 +1281,8 @@ next_slot: nocow = 1; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { extent_end = found_key.offset + - btrfs_file_extent_inline_len(leaf, fi); + btrfs_file_extent_inline_len(leaf, + path->slots[0], fi); extent_end = ALIGN(extent_end, root->sectorsize); } else { BUG_ON(1); @@ -1577,7 +1597,7 @@ int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, unsigned long bio_flags) { struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - u64 logical = (u64)bio->bi_sector << 9; + u64 logical = (u64)bio->bi_iter.bi_sector << 9; u64 length = 0; u64 map_length; int ret; @@ -1585,7 +1605,7 @@ int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, if (bio_flags & EXTENT_BIO_COMPRESSED) return 0; - length = bio->bi_size; + length = bio->bi_iter.bi_size; map_length = length; ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, NULL, 0); @@ -1841,14 +1861,13 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key ins; + int extent_inserted = 0; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->leave_spinning = 1; - /* * we may be replacing one extent in the tree with another. * The new extent is pinned in the extent map, and we don't want @@ -1858,17 +1877,23 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, * the caller is expected to unpin it and allow it to be merged * with the others. */ - ret = btrfs_drop_extents(trans, root, inode, file_pos, - file_pos + num_bytes, 0); + ret = __btrfs_drop_extents(trans, root, inode, path, file_pos, + file_pos + num_bytes, NULL, 0, + 1, sizeof(*fi), &extent_inserted); if (ret) goto out; - ins.objectid = btrfs_ino(inode); - ins.offset = file_pos; - ins.type = BTRFS_EXTENT_DATA_KEY; - ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi)); - if (ret) - goto out; + if (!extent_inserted) { + ins.objectid = btrfs_ino(inode); + ins.offset = file_pos; + ins.type = BTRFS_EXTENT_DATA_KEY; + + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, root, path, &ins, + sizeof(*fi)); + if (ret) + goto out; + } leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -2290,7 +2315,7 @@ again: u64 extent_len; struct btrfs_key found_key; - ret = btrfs_search_slot(trans, root, &key, path, 1, 1); + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) goto out_free_path; @@ -2543,12 +2568,6 @@ out_kfree: return NULL; } -/* - * helper function for btrfs_finish_ordered_io, this - * just reads in some of the csum leaves to prime them into ram - * before we start the transaction. It limits the amount of btree - * reads required while inside the transaction. - */ /* as ordered data IO finishes, this gets called so we can finish * an ordered extent if the range of bytes in the file it covers are * fully written. @@ -3248,7 +3267,8 @@ out: * slot is the slot the inode is in, objectid is the objectid of the inode */ static noinline int acls_after_inode_item(struct extent_buffer *leaf, - int slot, u64 objectid) + int slot, u64 objectid, + int *first_xattr_slot) { u32 nritems = btrfs_header_nritems(leaf); struct btrfs_key found_key; @@ -3264,6 +3284,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, } slot++; + *first_xattr_slot = -1; while (slot < nritems) { btrfs_item_key_to_cpu(leaf, &found_key, slot); @@ -3273,6 +3294,8 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, /* we found an xattr, assume we've got an acl */ if (found_key.type == BTRFS_XATTR_ITEM_KEY) { + if (*first_xattr_slot == -1) + *first_xattr_slot = slot; if (found_key.offset == xattr_access || found_key.offset == xattr_default) return 1; @@ -3301,6 +3324,8 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, * something larger than an xattr. We have to assume the inode * has acls */ + if (*first_xattr_slot == -1) + *first_xattr_slot = slot; return 1; } @@ -3315,10 +3340,12 @@ static void btrfs_read_locked_inode(struct inode *inode) struct btrfs_timespec *tspec; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key location; + unsigned long ptr; int maybe_acls; u32 rdev; int ret; bool filled = false; + int first_xattr_slot; ret = btrfs_fill_inode(inode, &rdev); if (!ret) @@ -3328,7 +3355,6 @@ static void btrfs_read_locked_inode(struct inode *inode) if (!path) goto make_bad; - path->leave_spinning = 1; memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); @@ -3338,7 +3364,7 @@ static void btrfs_read_locked_inode(struct inode *inode) leaf = path->nodes[0]; if (filled) - goto cache_acl; + goto cache_index; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); @@ -3381,18 +3407,51 @@ static void btrfs_read_locked_inode(struct inode *inode) BTRFS_I(inode)->index_cnt = (u64)-1; BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); + +cache_index: + path->slots[0]++; + if (inode->i_nlink != 1 || + path->slots[0] >= btrfs_header_nritems(leaf)) + goto cache_acl; + + btrfs_item_key_to_cpu(leaf, &location, path->slots[0]); + if (location.objectid != btrfs_ino(inode)) + goto cache_acl; + + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + if (location.type == BTRFS_INODE_REF_KEY) { + struct btrfs_inode_ref *ref; + + ref = (struct btrfs_inode_ref *)ptr; + BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref); + } else if (location.type == BTRFS_INODE_EXTREF_KEY) { + struct btrfs_inode_extref *extref; + + extref = (struct btrfs_inode_extref *)ptr; + BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf, + extref); + } cache_acl: /* * try to precache a NULL acl entry for files that don't have * any xattrs or acls */ maybe_acls = acls_after_inode_item(leaf, path->slots[0], - btrfs_ino(inode)); + btrfs_ino(inode), &first_xattr_slot); + if (first_xattr_slot != -1) { + path->slots[0] = first_xattr_slot; + ret = btrfs_load_inode_props(inode, path); + if (ret) + btrfs_err(root->fs_info, + "error loading props for ino %llu (root %llu): %d\n", + btrfs_ino(inode), + root->root_key.objectid, ret); + } + btrfs_free_path(path); + if (!maybe_acls) cache_no_acl(inode); - btrfs_free_path(path); - switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &btrfs_aops; @@ -3496,7 +3555,6 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, goto failed; } - btrfs_unlock_up_safe(path, 1); leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); @@ -3593,6 +3651,24 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, goto err; btrfs_release_path(path); + /* + * If we don't have dir index, we have to get it by looking up + * the inode ref, since we get the inode ref, remove it directly, + * it is unnecessary to do delayed deletion. + * + * But if we have dir index, needn't search inode ref to get it. + * Since the inode ref is close to the inode item, it is better + * that we delay to delete it, and just do this deletion when + * we update the inode item. + */ + if (BTRFS_I(inode)->dir_index) { + ret = btrfs_delayed_delete_inode_ref(inode); + if (!ret) { + index = BTRFS_I(inode)->dir_index; + goto skip_backref; + } + } + ret = btrfs_del_inode_ref(trans, root, name, name_len, ino, dir_ino, &index); if (ret) { @@ -3602,7 +3678,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, root, ret); goto err; } - +skip_backref: ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); if (ret) { btrfs_abort_transaction(trans, root, ret); @@ -3948,7 +4024,7 @@ search_again: btrfs_file_extent_num_bytes(leaf, fi); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { item_end += btrfs_file_extent_inline_len(leaf, - fi); + path->slots[0], fi); } item_end--; } @@ -4018,6 +4094,12 @@ search_again: inode_sub_bytes(inode, item_end + 1 - new_size); } + + /* + * update the ram bytes to properly reflect + * the new size of our item + */ + btrfs_set_file_extent_ram_bytes(leaf, fi, size); size = btrfs_file_extent_calc_inline_size(size); btrfs_truncate_item(root, path, size, 1); @@ -4203,6 +4285,49 @@ out: return ret; } +static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, + u64 offset, u64 len) +{ + struct btrfs_trans_handle *trans; + int ret; + + /* + * Still need to make sure the inode looks like it's been updated so + * that any holes get logged if we fsync. + */ + if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) { + BTRFS_I(inode)->last_trans = root->fs_info->generation; + BTRFS_I(inode)->last_sub_trans = root->log_transid; + BTRFS_I(inode)->last_log_commit = root->last_log_commit; + return 0; + } + + /* + * 1 - for the one we're dropping + * 1 - for the one we're adding + * 1 - for updating the inode. + */ + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + btrfs_end_transaction(trans, root); + return ret; + } + + ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset, + 0, 0, len, 0, len, 0, 0, 0); + if (ret) + btrfs_abort_transaction(trans, root, ret); + else + btrfs_update_inode(trans, root, inode); + btrfs_end_transaction(trans, root); + return ret; +} + /* * This function puts in dummy file extents for the area we're creating a hole * for. So if we are truncating this file to a larger size we need to insert @@ -4211,7 +4336,6 @@ out: */ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) { - struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_map *em = NULL; @@ -4266,31 +4390,10 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) struct extent_map *hole_em; hole_size = last_byte - cur_offset; - trans = btrfs_start_transaction(root, 3); - if (IS_ERR(trans)) { - err = PTR_ERR(trans); - break; - } - - err = btrfs_drop_extents(trans, root, inode, - cur_offset, - cur_offset + hole_size, 1); - if (err) { - btrfs_abort_transaction(trans, root, err); - btrfs_end_transaction(trans, root); - break; - } - - err = btrfs_insert_file_extent(trans, root, - btrfs_ino(inode), cur_offset, 0, - 0, hole_size, 0, hole_size, - 0, 0, 0); - if (err) { - btrfs_abort_transaction(trans, root, err); - btrfs_end_transaction(trans, root); + err = maybe_insert_hole(root, inode, cur_offset, + hole_size); + if (err) break; - } - btrfs_drop_extent_cache(inode, cur_offset, cur_offset + hole_size - 1, 0); hole_em = alloc_extent_map(); @@ -4309,7 +4412,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) hole_em->ram_bytes = hole_size; hole_em->bdev = root->fs_info->fs_devices->latest_bdev; hole_em->compress_type = BTRFS_COMPRESS_NONE; - hole_em->generation = trans->transid; + hole_em->generation = root->fs_info->generation; while (1) { write_lock(&em_tree->lock); @@ -4322,17 +4425,14 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) hole_size - 1, 0); } free_extent_map(hole_em); -next: - btrfs_update_inode(trans, root, inode); - btrfs_end_transaction(trans, root); } +next: free_extent_map(em); em = NULL; cur_offset = last_byte; if (cur_offset >= block_end) break; } - free_extent_map(em); unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, GFP_NOFS); @@ -4354,8 +4454,12 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * these flags set. For all other operations the VFS set these flags * explicitly if it wants a timestamp update. */ - if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) - inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); + if (newsize != oldsize) { + inode_inc_iversion(inode); + if (!(mask & (ATTR_CTIME | ATTR_MTIME))) + inode->i_ctime = inode->i_mtime = + current_fs_time(inode->i_sb); + } if (newsize > oldsize) { truncate_pagecache(inode, newsize); @@ -4464,12 +4568,70 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) err = btrfs_dirty_inode(inode); if (!err && attr->ia_valid & ATTR_MODE) - err = btrfs_acl_chmod(inode); + err = posix_acl_chmod(inode, inode->i_mode); } return err; } +/* + * While truncating the inode pages during eviction, we get the VFS calling + * btrfs_invalidatepage() against each page of the inode. This is slow because + * the calls to btrfs_invalidatepage() result in a huge amount of calls to + * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting + * extent_state structures over and over, wasting lots of time. + * + * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all + * those expensive operations on a per page basis and do only the ordered io + * finishing, while we release here the extent_map and extent_state structures, + * without the excessive merging and splitting. + */ +static void evict_inode_truncate_pages(struct inode *inode) +{ + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree; + struct rb_node *node; + + ASSERT(inode->i_state & I_FREEING); + truncate_inode_pages(&inode->i_data, 0); + + write_lock(&map_tree->lock); + while (!RB_EMPTY_ROOT(&map_tree->map)) { + struct extent_map *em; + + node = rb_first(&map_tree->map); + em = rb_entry(node, struct extent_map, rb_node); + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + clear_bit(EXTENT_FLAG_LOGGING, &em->flags); + remove_extent_mapping(map_tree, em); + free_extent_map(em); + } + write_unlock(&map_tree->lock); + + spin_lock(&io_tree->lock); + while (!RB_EMPTY_ROOT(&io_tree->state)) { + struct extent_state *state; + struct extent_state *cached_state = NULL; + + node = rb_first(&io_tree->state); + state = rb_entry(node, struct extent_state, rb_node); + atomic_inc(&state->refs); + spin_unlock(&io_tree->lock); + + lock_extent_bits(io_tree, state->start, state->end, + 0, &cached_state); + clear_extent_bit(io_tree, state->start, state->end, + EXTENT_LOCKED | EXTENT_DIRTY | + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 1, 1, + &cached_state, GFP_NOFS); + free_extent_state(state); + + spin_lock(&io_tree->lock); + } + spin_unlock(&io_tree->lock); +} + void btrfs_evict_inode(struct inode *inode) { struct btrfs_trans_handle *trans; @@ -4480,7 +4642,8 @@ void btrfs_evict_inode(struct inode *inode) trace_btrfs_inode_evict(inode); - truncate_inode_pages(&inode->i_data, 0); + evict_inode_truncate_pages(inode); + if (inode->i_nlink && ((btrfs_root_refs(&root->root_item) != 0 && root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) || @@ -4655,9 +4818,9 @@ static int fixup_tree_root_location(struct btrfs_root *root, } err = -ENOENT; - ret = btrfs_find_root_ref(root->fs_info->tree_root, path, - BTRFS_I(dir)->root->root_key.objectid, - location->objectid); + ret = btrfs_find_item(root->fs_info->tree_root, path, + BTRFS_I(dir)->root->root_key.objectid, + location->objectid, BTRFS_ROOT_REF_KEY, NULL); if (ret) { if (ret < 0) err = ret; @@ -4818,7 +4981,9 @@ again: static int btrfs_init_locked_inode(struct inode *inode, void *p) { struct btrfs_iget_args *args = p; - inode->i_ino = args->ino; + inode->i_ino = args->location->objectid; + memcpy(&BTRFS_I(inode)->location, args->location, + sizeof(*args->location)); BTRFS_I(inode)->root = args->root; return 0; } @@ -4826,19 +4991,19 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p) static int btrfs_find_actor(struct inode *inode, void *opaque) { struct btrfs_iget_args *args = opaque; - return args->ino == btrfs_ino(inode) && + return args->location->objectid == BTRFS_I(inode)->location.objectid && args->root == BTRFS_I(inode)->root; } static struct inode *btrfs_iget_locked(struct super_block *s, - u64 objectid, + struct btrfs_key *location, struct btrfs_root *root) { struct inode *inode; struct btrfs_iget_args args; - unsigned long hashval = btrfs_inode_hash(objectid, root); + unsigned long hashval = btrfs_inode_hash(location->objectid, root); - args.ino = objectid; + args.location = location; args.root = root; inode = iget5_locked(s, hashval, btrfs_find_actor, @@ -4855,13 +5020,11 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, { struct inode *inode; - inode = btrfs_iget_locked(s, location->objectid, root); + inode = btrfs_iget_locked(s, location, root); if (!inode) return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { - BTRFS_I(inode)->root = root; - memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); btrfs_read_locked_inode(inode); if (!is_bad_inode(inode)) { inode_tree_add(inode); @@ -4917,7 +5080,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return ERR_PTR(ret); if (location.objectid == 0) - return NULL; + return ERR_PTR(-ENOENT); if (location.type == BTRFS_INODE_ITEM_KEY) { inode = btrfs_iget(dir->i_sb, &location, root, NULL); @@ -4981,10 +5144,17 @@ static void btrfs_dentry_release(struct dentry *dentry) static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - struct dentry *ret; + struct inode *inode; - ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry); - return ret; + inode = btrfs_lookup_dentry(dir, dentry); + if (IS_ERR(inode)) { + if (PTR_ERR(inode) == -ENOENT) + inode = NULL; + else + return ERR_CAST(inode); + } + + return d_splice_alias(inode, dentry); } unsigned char btrfs_filetype_table[] = { @@ -5354,7 +5524,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, u32 sizes[2]; unsigned long ptr; int ret; - int owner; path = btrfs_alloc_path(); if (!path) @@ -5388,6 +5557,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, * number */ BTRFS_I(inode)->index_cnt = 2; + BTRFS_I(inode)->dir_index = *index; BTRFS_I(inode)->root = root; BTRFS_I(inode)->generation = trans->transid; inode->i_generation = BTRFS_I(inode)->generation; @@ -5400,11 +5570,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, */ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); - if (S_ISDIR(mode)) - owner = 0; - else - owner = 1; - key[0].objectid = objectid; btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); key[0].offset = 0; @@ -5469,6 +5634,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_update_root_times(trans, root); + ret = btrfs_inode_inherit_props(trans, inode, dir); + if (ret) + btrfs_err(root->fs_info, + "error inheriting props for ino %llu (root %llu): %d", + btrfs_ino(inode), root->root_key.objectid, ret); + return inode; fail: if (dir) @@ -5737,6 +5908,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, goto fail; } + /* There are several dir indexes for this inode, clear the cache. */ + BTRFS_I(inode)->dir_index = 0ULL; inc_nlink(inode); inode_inc_iversion(inode); inode->i_ctime = CURRENT_TIME; @@ -6000,7 +6173,7 @@ again: btrfs_file_extent_num_bytes(leaf, item); } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { size_t size; - size = btrfs_file_extent_inline_len(leaf, item); + size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); extent_end = ALIGN(extent_start + size, root->sectorsize); } next: @@ -6069,7 +6242,7 @@ next: goto out; } - size = btrfs_file_extent_inline_len(leaf, item); + size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); extent_offset = page_offset(page) + pg_offset - extent_start; copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, size - extent_offset); @@ -6386,6 +6559,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, int slot; int found_type; bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW); + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -6429,6 +6603,10 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, if (!nocow && found_type == BTRFS_FILE_EXTENT_REG) goto out; + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + if (extent_end <= offset) + goto out; + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); if (disk_bytenr == 0) goto out; @@ -6446,8 +6624,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); } - extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); - if (btrfs_extent_readonly(root, disk_bytenr)) goto out; btrfs_release_path(path); @@ -6779,17 +6955,16 @@ unlock_err: static void btrfs_endio_direct_read(struct bio *bio, int err) { struct btrfs_dio_private *dip = bio->bi_private; - struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; - struct bio_vec *bvec = bio->bi_io_vec; + struct bio_vec *bvec; struct inode *inode = dip->inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct bio *dio_bio; u32 *csums = (u32 *)dip->csum; - int index = 0; u64 start; + int i; start = dip->logical_offset; - do { + bio_for_each_segment_all(bvec, bio, i) { if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { struct page *page = bvec->bv_page; char *kaddr; @@ -6805,18 +6980,16 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) local_irq_restore(flags); flush_dcache_page(bvec->bv_page); - if (csum != csums[index]) { + if (csum != csums[i]) { btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u", btrfs_ino(inode), start, csum, - csums[index]); + csums[i]); err = -EIO; } } start += bvec->bv_len; - bvec++; - index++; - } while (bvec <= bvec_end); + } unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, dip->logical_offset + dip->bytes - 1); @@ -6894,10 +7067,11 @@ static void btrfs_end_dio_bio(struct bio *bio, int err) struct btrfs_dio_private *dip = bio->bi_private; if (err) { - printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu " - "sector %#Lx len %u err no %d\n", + btrfs_err(BTRFS_I(dip->inode)->root->fs_info, + "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d", btrfs_ino(dip->inode), bio->bi_rw, - (unsigned long long)bio->bi_sector, bio->bi_size, err); + (unsigned long long)bio->bi_iter.bi_sector, + bio->bi_iter.bi_size, err); dip->errors = 1; /* @@ -6988,7 +7162,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, struct bio *bio; struct bio *orig_bio = dip->orig_bio; struct bio_vec *bvec = orig_bio->bi_io_vec; - u64 start_sector = orig_bio->bi_sector; + u64 start_sector = orig_bio->bi_iter.bi_sector; u64 file_offset = dip->logical_offset; u64 submit_len = 0; u64 map_length; @@ -6996,7 +7170,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, int ret = 0; int async_submit = 0; - map_length = orig_bio->bi_size; + map_length = orig_bio->bi_iter.bi_size; ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, &map_length, NULL, 0); if (ret) { @@ -7004,7 +7178,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, return -EIO; } - if (map_length >= orig_bio->bi_size) { + if (map_length >= orig_bio->bi_iter.bi_size) { bio = orig_bio; goto submit; } @@ -7056,7 +7230,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; - map_length = orig_bio->bi_size; + map_length = orig_bio->bi_iter.bi_size; ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, &map_length, NULL, 0); @@ -7114,7 +7288,8 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, if (!skip_sum && !write) { csum_size = btrfs_super_csum_size(root->fs_info->super_copy); - sum_len = dio_bio->bi_size >> inode->i_sb->s_blocksize_bits; + sum_len = dio_bio->bi_iter.bi_size >> + inode->i_sb->s_blocksize_bits; sum_len *= csum_size; } else { sum_len = 0; @@ -7129,8 +7304,8 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, dip->private = dio_bio->bi_private; dip->inode = inode; dip->logical_offset = file_offset; - dip->bytes = dio_bio->bi_size; - dip->disk_bytenr = (u64)dio_bio->bi_sector << 9; + dip->bytes = dio_bio->bi_iter.bi_size; + dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; io_bio->bi_private = dip; dip->errors = 0; dip->orig_bio = io_bio; @@ -7367,6 +7542,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, struct extent_state *cached_state = NULL; u64 page_start = page_offset(page); u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + int inode_evicting = inode->i_state & I_FREEING; /* * we have the page locked, so new writeback can't start, @@ -7382,17 +7558,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, btrfs_releasepage(page, GFP_NOFS); return; } - lock_extent_bits(tree, page_start, page_end, 0, &cached_state); - ordered = btrfs_lookup_ordered_extent(inode, page_offset(page)); + + if (!inode_evicting) + lock_extent_bits(tree, page_start, page_end, 0, &cached_state); + ordered = btrfs_lookup_ordered_extent(inode, page_start); if (ordered) { /* * IO on this page will never be started, so we need * to account for any ordered extents now */ - clear_extent_bit(tree, page_start, page_end, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS); + if (!inode_evicting) + clear_extent_bit(tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 1, 0, &cached_state, + GFP_NOFS); /* * whoever cleared the private bit is responsible * for the finish_ordered_io @@ -7416,14 +7596,22 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, btrfs_finish_ordered_io(ordered); } btrfs_put_ordered_extent(ordered); - cached_state = NULL; - lock_extent_bits(tree, page_start, page_end, 0, &cached_state); + if (!inode_evicting) { + cached_state = NULL; + lock_extent_bits(tree, page_start, page_end, 0, + &cached_state); + } + } + + if (!inode_evicting) { + clear_extent_bit(tree, page_start, page_end, + EXTENT_LOCKED | EXTENT_DIRTY | + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 1, 1, + &cached_state, GFP_NOFS); + + __btrfs_releasepage(page, GFP_NOFS); } - clear_extent_bit(tree, page_start, page_end, - EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, - &cached_state, GFP_NOFS); - __btrfs_releasepage(page, GFP_NOFS); ClearPageChecked(page); if (PagePrivate(page)) { @@ -7733,7 +7921,9 @@ out: * create a new subvolume directory/inode (helper for the ioctl). */ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, - struct btrfs_root *new_root, u64 new_dirid) + struct btrfs_root *new_root, + struct btrfs_root *parent_root, + u64 new_dirid) { struct inode *inode; int err; @@ -7751,6 +7941,12 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, set_nlink(inode, 1); btrfs_i_size_write(inode, 0); + err = btrfs_subvol_inherit_props(trans, new_root, parent_root); + if (err) + btrfs_err(new_root->fs_info, + "error inheriting subvolume %llu properties: %d\n", + new_root->root_key.objectid, err); + err = btrfs_update_inode(trans, new_root, inode); iput(inode); @@ -7776,6 +7972,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->flags = 0; ei->csum_bytes = 0; ei->index_cnt = (u64)-1; + ei->dir_index = 0; ei->last_unlink_trans = 0; ei->last_log_commit = 0; @@ -8063,6 +8260,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (ret) goto out_fail; + BTRFS_I(old_inode)->dir_index = 0ULL; if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { /* force full log commit if subvolume involved. */ root->fs_info->last_trans_log_full_commit = trans->transid; @@ -8151,6 +8349,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_fail; } + if (old_inode->i_nlink == 1) + BTRFS_I(old_inode)->dir_index = index; + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { struct dentry *parent = new_dentry->d_parent; btrfs_log_new_name(trans, old_inode, old_dir, parent); @@ -8286,7 +8487,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) { int ret; - if (root->fs_info->sb->s_flags & MS_RDONLY) + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) return -EROFS; ret = __start_delalloc_inodes(root, delay_iput); @@ -8312,7 +8513,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput) struct list_head splice; int ret; - if (fs_info->sb->s_flags & MS_RDONLY) + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return -EROFS; INIT_LIST_HEAD(&splice); @@ -8649,12 +8850,14 @@ static const struct inode_operations btrfs_dir_inode_operations = { .removexattr = btrfs_removexattr, .permission = btrfs_permission, .get_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, }; static const struct inode_operations btrfs_dir_ro_inode_operations = { .lookup = btrfs_lookup, .permission = btrfs_permission, .get_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, }; @@ -8724,6 +8927,7 @@ static const struct inode_operations btrfs_file_inode_operations = { .permission = btrfs_permission, .fiemap = btrfs_fiemap, .get_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, }; static const struct inode_operations btrfs_special_inode_operations = { @@ -8735,6 +8939,7 @@ static const struct inode_operations btrfs_special_inode_operations = { .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .get_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, }; static const struct inode_operations btrfs_symlink_inode_operations = { @@ -8748,7 +8953,6 @@ static const struct inode_operations btrfs_symlink_inode_operations = { .getxattr = btrfs_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, - .get_acl = btrfs_get_acl, .update_time = btrfs_update_time, }; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a1116225..b013489 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -56,6 +56,8 @@ #include "rcu-string.h" #include "send.h" #include "dev-replace.h" +#include "props.h" +#include "sysfs.h" static int btrfs_clone(struct inode *src, struct inode *inode, u64 off, u64 olen, u64 olen_aligned, u64 destoff); @@ -190,6 +192,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) unsigned int i_oldflags; umode_t mode; + if (!inode_owner_or_capable(inode)) + return -EPERM; + if (btrfs_root_readonly(root)) return -EROFS; @@ -200,9 +205,6 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) if (ret) return ret; - if (!inode_owner_or_capable(inode)) - return -EACCES; - ret = mnt_want_write_file(file); if (ret) return ret; @@ -280,9 +282,25 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) if (flags & FS_NOCOMP_FL) { ip->flags &= ~BTRFS_INODE_COMPRESS; ip->flags |= BTRFS_INODE_NOCOMPRESS; + + ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0); + if (ret && ret != -ENODATA) + goto out_drop; } else if (flags & FS_COMPR_FL) { + const char *comp; + ip->flags |= BTRFS_INODE_COMPRESS; ip->flags &= ~BTRFS_INODE_NOCOMPRESS; + + if (root->fs_info->compress_type == BTRFS_COMPRESS_LZO) + comp = "lzo"; + else + comp = "zlib"; + ret = btrfs_set_prop(inode, "btrfs.compression", + comp, strlen(comp), 0); + if (ret) + goto out_drop; + } else { ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); } @@ -392,6 +410,7 @@ static noinline int create_subvol(struct inode *dir, struct btrfs_root *new_root; struct btrfs_block_rsv block_rsv; struct timespec cur_time = CURRENT_TIME; + struct inode *inode; int ret; int err; u64 objectid; @@ -417,7 +436,9 @@ static noinline int create_subvol(struct inode *dir, trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - goto out; + btrfs_subvolume_release_metadata(root, &block_rsv, + qgroup_reserved); + return ret; } trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; @@ -500,7 +521,7 @@ static noinline int create_subvol(struct inode *dir, btrfs_record_root_in_trans(trans, new_root); - ret = btrfs_create_subvol_root(trans, new_root, new_dirid); + ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid); if (ret) { /* We potentially lose an unused inode item here */ btrfs_abort_transaction(trans, root, ret); @@ -542,6 +563,8 @@ static noinline int create_subvol(struct inode *dir, fail: trans->block_rsv = NULL; trans->bytes_reserved = 0; + btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); + if (async_transid) { *async_transid = trans->transid; err = btrfs_commit_transaction_async(trans, root, 1); @@ -553,10 +576,12 @@ fail: if (err && !ret) ret = err; - if (!ret) - d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); -out: - btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); + if (!ret) { + inode = btrfs_lookup_dentry(dir, dentry); + if (IS_ERR(inode)) + return PTR_ERR(inode); + d_instantiate(dentry, inode); + } return ret; } @@ -642,7 +667,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, ret = PTR_ERR(inode); goto fail; } - BUG_ON(!inode); + d_instantiate(dentry, inode); ret = 0; fail: @@ -1011,7 +1036,7 @@ out: static int cluster_pages_for_defrag(struct inode *inode, struct page **pages, unsigned long start_index, - int num_pages) + unsigned long num_pages) { unsigned long file_end; u64 isize = i_size_read(inode); @@ -1169,8 +1194,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, int defrag_count = 0; int compress_type = BTRFS_COMPRESS_ZLIB; int extent_thresh = range->extent_thresh; - int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; - int cluster = max_cluster; + unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; + unsigned long cluster = max_cluster; u64 new_align = ~((u64)128 * 1024 - 1); struct page **pages = NULL; @@ -1254,7 +1279,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, break; if (btrfs_defrag_cancelled(root->fs_info)) { - printk(KERN_DEBUG "btrfs: defrag_file cancelled\n"); + printk(KERN_DEBUG "BTRFS: defrag_file cancelled\n"); ret = -EAGAIN; break; } @@ -1416,20 +1441,20 @@ static noinline int btrfs_ioctl_resize(struct file *file, ret = -EINVAL; goto out_free; } - printk(KERN_INFO "btrfs: resizing devid %llu\n", devid); + btrfs_info(root->fs_info, "resizing devid %llu", devid); } device = btrfs_find_device(root->fs_info, devid, NULL, NULL); if (!device) { - printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", + btrfs_info(root->fs_info, "resizer unable to find device %llu", devid); ret = -ENODEV; goto out_free; } if (!device->writeable) { - printk(KERN_INFO "btrfs: resizer unable to apply on " - "readonly device %llu\n", + btrfs_info(root->fs_info, + "resizer unable to apply on readonly device %llu", devid); ret = -EPERM; goto out_free; @@ -1466,6 +1491,10 @@ static noinline int btrfs_ioctl_resize(struct file *file, } new_size = old_size - new_size; } else if (mod > 0) { + if (new_size > ULLONG_MAX - old_size) { + ret = -EINVAL; + goto out_free; + } new_size = old_size + new_size; } @@ -1481,7 +1510,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, do_div(new_size, root->sectorsize); new_size *= root->sectorsize; - printk_in_rcu(KERN_INFO "btrfs: new size for %s is %llu\n", + printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n", rcu_str_deref(device->name), new_size); if (new_size > old_size) { @@ -1542,9 +1571,15 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, src_inode = file_inode(src.file); if (src_inode->i_sb != file_inode(file)->i_sb) { - printk(KERN_INFO "btrfs: Snapshot src from " - "another FS\n"); + btrfs_info(BTRFS_I(src_inode)->root->fs_info, + "Snapshot src from another FS"); ret = -EINVAL; + } else if (!inode_owner_or_capable(src_inode)) { + /* + * Subvolume creation is not restricted, but snapshots + * are limited to own subvolumes only + */ + ret = -EPERM; } else { ret = btrfs_mksubvol(&file->f_path, name, namelen, BTRFS_I(src_inode)->root, @@ -1662,6 +1697,9 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, u64 flags; int ret = 0; + if (!inode_owner_or_capable(inode)) + return -EPERM; + ret = mnt_want_write_file(file); if (ret) goto out; @@ -1686,11 +1724,6 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, goto out_drop_write; } - if (!inode_owner_or_capable(inode)) { - ret = -EACCES; - goto out_drop_write; - } - down_write(&root->fs_info->subvol_sem); /* nothing to do */ @@ -1698,12 +1731,28 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, goto out_drop_sem; root_flags = btrfs_root_flags(&root->root_item); - if (flags & BTRFS_SUBVOL_RDONLY) + if (flags & BTRFS_SUBVOL_RDONLY) { btrfs_set_root_flags(&root->root_item, root_flags | BTRFS_ROOT_SUBVOL_RDONLY); - else - btrfs_set_root_flags(&root->root_item, + } else { + /* + * Block RO -> RW transition if this subvolume is involved in + * send + */ + spin_lock(&root->root_item_lock); + if (root->send_in_progress == 0) { + btrfs_set_root_flags(&root->root_item, root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY); + spin_unlock(&root->root_item_lock); + } else { + spin_unlock(&root->root_item_lock); + btrfs_warn(root->fs_info, + "Attempt to set subvolume %llu read-write during send", + root->root_key.objectid); + ret = -EPERM; + goto out_drop_sem; + } + } trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { @@ -1910,7 +1959,7 @@ static noinline int search_ioctl(struct inode *inode, key.offset = (u64)-1; root = btrfs_read_fs_root_no_name(info, &key); if (IS_ERR(root)) { - printk(KERN_ERR "could not find root %llu\n", + printk(KERN_ERR "BTRFS: could not find root %llu\n", sk->tree_id); btrfs_free_path(path); return -ENOENT; @@ -2000,7 +2049,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, key.offset = (u64)-1; root = btrfs_read_fs_root_no_name(info, &key); if (IS_ERR(root)) { - printk(KERN_ERR "could not find root %llu\n", tree_id); + printk(KERN_ERR "BTRFS: could not find root %llu\n", tree_id); ret = -ENOENT; goto out; } @@ -2121,7 +2170,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); if (err == -EINTR) - goto out; + goto out_drop_write; dentry = lookup_one_len(vol_args->name, parent, namelen); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); @@ -2284,6 +2333,7 @@ out_dput: dput(dentry); out_unlock_dir: mutex_unlock(&dir->i_mutex); +out_drop_write: mnt_drop_write_file(file); out: kfree(vol_args); @@ -2685,14 +2735,11 @@ out_unlock: #define BTRFS_MAX_DEDUPE_LEN (16 * 1024 * 1024) static long btrfs_ioctl_file_extent_same(struct file *file, - void __user *argp) + struct btrfs_ioctl_same_args __user *argp) { - struct btrfs_ioctl_same_args tmp; struct btrfs_ioctl_same_args *same; struct btrfs_ioctl_same_extent_info *info; - struct inode *src = file->f_dentry->d_inode; - struct file *dst_file = NULL; - struct inode *dst; + struct inode *src = file_inode(file); u64 off; u64 len; int i; @@ -2700,6 +2747,7 @@ static long btrfs_ioctl_file_extent_same(struct file *file, unsigned long size; u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; bool is_admin = capable(CAP_SYS_ADMIN); + u16 count; if (!(file->f_mode & FMODE_READ)) return -EINVAL; @@ -2708,17 +2756,14 @@ static long btrfs_ioctl_file_extent_same(struct file *file, if (ret) return ret; - if (copy_from_user(&tmp, - (struct btrfs_ioctl_same_args __user *)argp, - sizeof(tmp))) { + if (get_user(count, &argp->dest_count)) { ret = -EFAULT; goto out; } - size = sizeof(tmp) + - tmp.dest_count * sizeof(struct btrfs_ioctl_same_extent_info); + size = offsetof(struct btrfs_ioctl_same_args __user, info[count]); - same = memdup_user((struct btrfs_ioctl_same_args __user *)argp, size); + same = memdup_user(argp, size); if (IS_ERR(same)) { ret = PTR_ERR(same); @@ -2755,52 +2800,35 @@ static long btrfs_ioctl_file_extent_same(struct file *file, goto out; /* pre-format output fields to sane values */ - for (i = 0; i < same->dest_count; i++) { + for (i = 0; i < count; i++) { same->info[i].bytes_deduped = 0ULL; same->info[i].status = 0; } - ret = 0; - for (i = 0; i < same->dest_count; i++) { - info = &same->info[i]; - - dst_file = fget(info->fd); - if (!dst_file) { + for (i = 0, info = same->info; i < count; i++, info++) { + struct inode *dst; + struct fd dst_file = fdget(info->fd); + if (!dst_file.file) { info->status = -EBADF; - goto next; + continue; } + dst = file_inode(dst_file.file); - if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) { + if (!(is_admin || (dst_file.file->f_mode & FMODE_WRITE))) { info->status = -EINVAL; - goto next; - } - - info->status = -EXDEV; - if (file->f_path.mnt != dst_file->f_path.mnt) - goto next; - - dst = dst_file->f_dentry->d_inode; - if (src->i_sb != dst->i_sb) - goto next; - - if (S_ISDIR(dst->i_mode)) { + } else if (file->f_path.mnt != dst_file.file->f_path.mnt) { + info->status = -EXDEV; + } else if (S_ISDIR(dst->i_mode)) { info->status = -EISDIR; - goto next; - } - - if (!S_ISREG(dst->i_mode)) { + } else if (!S_ISREG(dst->i_mode)) { info->status = -EACCES; - goto next; + } else { + info->status = btrfs_extent_same(src, off, len, dst, + info->logical_offset); + if (info->status == 0) + info->bytes_deduped += len; } - - info->status = btrfs_extent_same(src, off, len, dst, - info->logical_offset); - if (info->status == 0) - info->bytes_deduped += len; - -next: - if (dst_file) - fput(dst_file); + fdput(dst_file); } ret = copy_to_user(argp, same, size); @@ -2859,12 +2887,14 @@ static int btrfs_clone(struct inode *src, struct inode *inode, * note the key will change type as we walk through the * tree. */ + path->leave_spinning = 1; ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path, 0, 0); if (ret < 0) goto out; nritems = btrfs_header_nritems(path->nodes[0]); +process_slot: if (path->slots[0] >= nritems) { ret = btrfs_next_leaf(BTRFS_I(src)->root, path); if (ret < 0) @@ -2891,11 +2921,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode, u8 comp; u64 endoff; - size = btrfs_item_size_nr(leaf, slot); - read_extent_buffer(leaf, buf, - btrfs_item_ptr_offset(leaf, slot), - size); - extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); comp = btrfs_file_extent_compression(leaf, extent); @@ -2914,11 +2939,20 @@ static int btrfs_clone(struct inode *src, struct inode *inode, datal = btrfs_file_extent_ram_bytes(leaf, extent); } - btrfs_release_path(path); if (key.offset + datal <= off || - key.offset >= off + len - 1) - goto next; + key.offset >= off + len - 1) { + path->slots[0]++; + goto process_slot; + } + + size = btrfs_item_size_nr(leaf, slot); + read_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, slot), + size); + + btrfs_release_path(path); + path->leave_spinning = 0; memcpy(&new_key, &key, sizeof(new_key)); new_key.objectid = btrfs_ino(inode); @@ -3089,7 +3123,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode, } ret = btrfs_end_transaction(trans, root); } -next: btrfs_release_path(path); key.offset++; } @@ -3217,9 +3250,17 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); out_unlock: - mutex_unlock(&src->i_mutex); - if (!same_inode) - mutex_unlock(&inode->i_mutex); + if (!same_inode) { + if (inode < src) { + mutex_unlock(&src->i_mutex); + mutex_unlock(&inode->i_mutex); + } else { + mutex_unlock(&inode->i_mutex); + mutex_unlock(&src->i_mutex); + } + } else { + mutex_unlock(&src->i_mutex); + } out_fput: fdput(src_file); out_drop_write: @@ -3342,8 +3383,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) if (IS_ERR_OR_NULL(di)) { btrfs_free_path(path); btrfs_end_transaction(trans, root); - printk(KERN_ERR "Umm, you don't have the default dir item, " - "this isn't going to work\n"); + btrfs_err(new_root->fs_info, "Umm, you don't have the default dir" + "item, this isn't going to work"); ret = -ENOENT; goto out; } @@ -3496,6 +3537,20 @@ out: return ret; } +static long btrfs_ioctl_global_rsv(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_block_rsv *block_rsv = &root->fs_info->global_block_rsv; + u64 reserved; + + spin_lock(&block_rsv->lock); + reserved = block_rsv->reserved; + spin_unlock(&block_rsv->lock); + + if (arg && copy_to_user(arg, &reserved, sizeof(reserved))) + return -EFAULT; + return 0; +} + /* * there are many ways the trans_start and trans_end ioctls can lead * to deadlocks. They should only be used by applications that @@ -4324,6 +4379,9 @@ static long btrfs_ioctl_set_received_subvol(struct file *file, int ret = 0; int received_uuid_changed; + if (!inode_owner_or_capable(inode)) + return -EPERM; + ret = mnt_want_write_file(file); if (ret < 0) return ret; @@ -4340,11 +4398,6 @@ static long btrfs_ioctl_set_received_subvol(struct file *file, goto out; } - if (!inode_owner_or_capable(inode)) { - ret = -EACCES; - goto out; - } - sa = memdup_user(arg, sizeof(*sa)); if (IS_ERR(sa)) { ret = PTR_ERR(sa); @@ -4430,8 +4483,8 @@ static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg) len = strnlen(label, BTRFS_LABEL_SIZE); if (len == BTRFS_LABEL_SIZE) { - pr_warn("btrfs: label is too long, return the first %zu bytes\n", - --len); + btrfs_warn(root->fs_info, + "label is too long, return the first %zu bytes", --len); } ret = copy_to_user(arg, label, len); @@ -4454,7 +4507,7 @@ static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) return -EFAULT; if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) { - pr_err("btrfs: unable to set label with more than %d bytes\n", + btrfs_err(root->fs_info, "unable to set label with more than %d bytes", BTRFS_LABEL_SIZE - 1); return -EINVAL; } @@ -4479,6 +4532,166 @@ out_unlock: return ret; } +#define INIT_FEATURE_FLAGS(suffix) \ + { .compat_flags = BTRFS_FEATURE_COMPAT_##suffix, \ + .compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \ + .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix } + +static int btrfs_ioctl_get_supported_features(struct file *file, + void __user *arg) +{ + static struct btrfs_ioctl_feature_flags features[3] = { + INIT_FEATURE_FLAGS(SUPP), + INIT_FEATURE_FLAGS(SAFE_SET), + INIT_FEATURE_FLAGS(SAFE_CLEAR) + }; + + if (copy_to_user(arg, &features, sizeof(features))) + return -EFAULT; + + return 0; +} + +static int btrfs_ioctl_get_features(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_super_block *super_block = root->fs_info->super_copy; + struct btrfs_ioctl_feature_flags features; + + features.compat_flags = btrfs_super_compat_flags(super_block); + features.compat_ro_flags = btrfs_super_compat_ro_flags(super_block); + features.incompat_flags = btrfs_super_incompat_flags(super_block); + + if (copy_to_user(arg, &features, sizeof(features))) + return -EFAULT; + + return 0; +} + +static int check_feature_bits(struct btrfs_root *root, + enum btrfs_feature_set set, + u64 change_mask, u64 flags, u64 supported_flags, + u64 safe_set, u64 safe_clear) +{ + const char *type = btrfs_feature_set_names[set]; + char *names; + u64 disallowed, unsupported; + u64 set_mask = flags & change_mask; + u64 clear_mask = ~flags & change_mask; + + unsupported = set_mask & ~supported_flags; + if (unsupported) { + names = btrfs_printable_features(set, unsupported); + if (names) { + btrfs_warn(root->fs_info, + "this kernel does not support the %s feature bit%s", + names, strchr(names, ',') ? "s" : ""); + kfree(names); + } else + btrfs_warn(root->fs_info, + "this kernel does not support %s bits 0x%llx", + type, unsupported); + return -EOPNOTSUPP; + } + + disallowed = set_mask & ~safe_set; + if (disallowed) { + names = btrfs_printable_features(set, disallowed); + if (names) { + btrfs_warn(root->fs_info, + "can't set the %s feature bit%s while mounted", + names, strchr(names, ',') ? "s" : ""); + kfree(names); + } else + btrfs_warn(root->fs_info, + "can't set %s bits 0x%llx while mounted", + type, disallowed); + return -EPERM; + } + + disallowed = clear_mask & ~safe_clear; + if (disallowed) { + names = btrfs_printable_features(set, disallowed); + if (names) { + btrfs_warn(root->fs_info, + "can't clear the %s feature bit%s while mounted", + names, strchr(names, ',') ? "s" : ""); + kfree(names); + } else + btrfs_warn(root->fs_info, + "can't clear %s bits 0x%llx while mounted", + type, disallowed); + return -EPERM; + } + + return 0; +} + +#define check_feature(root, change_mask, flags, mask_base) \ +check_feature_bits(root, FEAT_##mask_base, change_mask, flags, \ + BTRFS_FEATURE_ ## mask_base ## _SUPP, \ + BTRFS_FEATURE_ ## mask_base ## _SAFE_SET, \ + BTRFS_FEATURE_ ## mask_base ## _SAFE_CLEAR) + +static int btrfs_ioctl_set_features(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_super_block *super_block = root->fs_info->super_copy; + struct btrfs_ioctl_feature_flags flags[2]; + struct btrfs_trans_handle *trans; + u64 newflags; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(flags, arg, sizeof(flags))) + return -EFAULT; + + /* Nothing to do */ + if (!flags[0].compat_flags && !flags[0].compat_ro_flags && + !flags[0].incompat_flags) + return 0; + + ret = check_feature(root, flags[0].compat_flags, + flags[1].compat_flags, COMPAT); + if (ret) + return ret; + + ret = check_feature(root, flags[0].compat_ro_flags, + flags[1].compat_ro_flags, COMPAT_RO); + if (ret) + return ret; + + ret = check_feature(root, flags[0].incompat_flags, + flags[1].incompat_flags, INCOMPAT); + if (ret) + return ret; + + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + spin_lock(&root->fs_info->super_lock); + newflags = btrfs_super_compat_flags(super_block); + newflags |= flags[0].compat_flags & flags[1].compat_flags; + newflags &= ~(flags[0].compat_flags & ~flags[1].compat_flags); + btrfs_set_super_compat_flags(super_block, newflags); + + newflags = btrfs_super_compat_ro_flags(super_block); + newflags |= flags[0].compat_ro_flags & flags[1].compat_ro_flags; + newflags &= ~(flags[0].compat_ro_flags & ~flags[1].compat_ro_flags); + btrfs_set_super_compat_ro_flags(super_block, newflags); + + newflags = btrfs_super_incompat_flags(super_block); + newflags |= flags[0].incompat_flags & flags[1].incompat_flags; + newflags &= ~(flags[0].incompat_flags & ~flags[1].incompat_flags); + btrfs_set_super_incompat_flags(super_block, newflags); + spin_unlock(&root->fs_info->super_lock); + + return btrfs_end_transaction(trans, root); +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -4544,6 +4757,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_logical_to_ino(root, argp); case BTRFS_IOC_SPACE_INFO: return btrfs_ioctl_space_info(root, argp); + case BTRFS_IOC_GLOBAL_RSV: + return btrfs_ioctl_global_rsv(root, argp); case BTRFS_IOC_SYNC: { int ret; @@ -4597,6 +4812,12 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_set_fslabel(file, argp); case BTRFS_IOC_FILE_EXTENT_SAME: return btrfs_ioctl_file_extent_same(file, argp); + case BTRFS_IOC_GET_SUPPORTED_FEATURES: + return btrfs_ioctl_get_supported_features(file, argp); + case BTRFS_IOC_GET_FEATURES: + return btrfs_ioctl_get_features(file, argp); + case BTRFS_IOC_SET_FEATURES: + return btrfs_ioctl_set_features(file, argp); } return -ENOTTY; diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index b6a6f07..b47f669 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -141,7 +141,7 @@ static int lzo_compress_pages(struct list_head *ws, ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, &out_len, workspace->mem); if (ret != LZO_E_OK) { - printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", + printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n", ret); ret = -1; goto out; @@ -357,7 +357,7 @@ cont: if (need_unmap) kunmap(pages_in[page_in_index - 1]); if (ret != LZO_E_OK) { - printk(KERN_WARNING "btrfs decompress failed\n"); + printk(KERN_WARNING "BTRFS: decompress failed\n"); ret = -1; break; } @@ -401,7 +401,7 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in, out_len = PAGE_CACHE_SIZE; ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); if (ret != LZO_E_OK) { - printk(KERN_WARNING "btrfs decompress failed!\n"); + printk(KERN_WARNING "BTRFS: decompress failed!\n"); ret = -1; goto out; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 69582d5..b16450b 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -336,13 +336,14 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, entry->len); *file_offset = dec_end; if (dec_start > dec_end) { - printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n", - dec_start, dec_end); + btrfs_crit(BTRFS_I(inode)->root->fs_info, + "bad ordering dec_start %llu end %llu", dec_start, dec_end); } to_dec = dec_end - dec_start; if (to_dec > entry->bytes_left) { - printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", - entry->bytes_left, to_dec); + btrfs_crit(BTRFS_I(inode)->root->fs_info, + "bad ordered accounting left %llu size %llu", + entry->bytes_left, to_dec); } entry->bytes_left -= to_dec; if (!uptodate) @@ -401,7 +402,8 @@ have_entry: } if (io_size > entry->bytes_left) { - printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", + btrfs_crit(BTRFS_I(inode)->root->fs_info, + "bad ordered accounting left %llu size %llu", entry->bytes_left, io_size); } entry->bytes_left -= io_size; @@ -520,7 +522,8 @@ void btrfs_remove_ordered_extent(struct inode *inode, spin_lock_irq(&tree->lock); node = &entry->rb_node; rb_erase(node, &tree->tree); - tree->last = NULL; + if (tree->last == node) + tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); spin_unlock_irq(&tree->lock); diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c index 24cad16..65793ed 100644 --- a/fs/btrfs/orphan.c +++ b/fs/btrfs/orphan.c @@ -69,23 +69,3 @@ out: btrfs_free_path(path); return ret; } - -int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset) -{ - struct btrfs_path *path; - struct btrfs_key key; - int ret; - - key.objectid = BTRFS_ORPHAN_OBJECTID; - key.type = BTRFS_ORPHAN_ITEM_KEY; - key.offset = offset; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - - btrfs_free_path(path); - return ret; -} diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 417053b..6efd70d 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -154,7 +154,7 @@ static void print_uuid_item(struct extent_buffer *l, unsigned long offset, u32 item_size) { if (!IS_ALIGNED(item_size, sizeof(u64))) { - pr_warn("btrfs: uuid item with illegal size %lu!\n", + pr_warn("BTRFS: uuid item with illegal size %lu!\n", (unsigned long)item_size); return; } @@ -249,7 +249,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) BTRFS_FILE_EXTENT_INLINE) { printk(KERN_INFO "\t\tinline extent data " "size %u\n", - btrfs_file_extent_inline_len(l, fi)); + btrfs_file_extent_inline_len(l, i, fi)); break; } printk(KERN_INFO "\t\textent data disk bytenr %llu " diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c new file mode 100644 index 0000000..129b1dd --- /dev/null +++ b/fs/btrfs/props.c @@ -0,0 +1,427 @@ +/* + * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/hashtable.h> +#include "props.h" +#include "btrfs_inode.h" +#include "hash.h" +#include "transaction.h" +#include "xattr.h" + +#define BTRFS_PROP_HANDLERS_HT_BITS 8 +static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); + +struct prop_handler { + struct hlist_node node; + const char *xattr_name; + int (*validate)(const char *value, size_t len); + int (*apply)(struct inode *inode, const char *value, size_t len); + const char *(*extract)(struct inode *inode); + int inheritable; +}; + +static int prop_compression_validate(const char *value, size_t len); +static int prop_compression_apply(struct inode *inode, + const char *value, + size_t len); +static const char *prop_compression_extract(struct inode *inode); + +static struct prop_handler prop_handlers[] = { + { + .xattr_name = XATTR_BTRFS_PREFIX "compression", + .validate = prop_compression_validate, + .apply = prop_compression_apply, + .extract = prop_compression_extract, + .inheritable = 1 + }, + { + .xattr_name = NULL + } +}; + +void __init btrfs_props_init(void) +{ + struct prop_handler *p; + + hash_init(prop_handlers_ht); + + for (p = &prop_handlers[0]; p->xattr_name; p++) { + u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name)); + + hash_add(prop_handlers_ht, &p->node, h); + } +} + +static const struct hlist_head *find_prop_handlers_by_hash(const u64 hash) +{ + struct hlist_head *h; + + h = &prop_handlers_ht[hash_min(hash, BTRFS_PROP_HANDLERS_HT_BITS)]; + if (hlist_empty(h)) + return NULL; + + return h; +} + +static const struct prop_handler * +find_prop_handler(const char *name, + const struct hlist_head *handlers) +{ + struct prop_handler *h; + + if (!handlers) { + u64 hash = btrfs_name_hash(name, strlen(name)); + + handlers = find_prop_handlers_by_hash(hash); + if (!handlers) + return NULL; + } + + hlist_for_each_entry(h, handlers, node) + if (!strcmp(h->xattr_name, name)) + return h; + + return NULL; +} + +static int __btrfs_set_prop(struct btrfs_trans_handle *trans, + struct inode *inode, + const char *name, + const char *value, + size_t value_len, + int flags) +{ + const struct prop_handler *handler; + int ret; + + if (strlen(name) <= XATTR_BTRFS_PREFIX_LEN) + return -EINVAL; + + handler = find_prop_handler(name, NULL); + if (!handler) + return -EINVAL; + + if (value_len == 0) { + ret = __btrfs_setxattr(trans, inode, handler->xattr_name, + NULL, 0, flags); + if (ret) + return ret; + + ret = handler->apply(inode, NULL, 0); + ASSERT(ret == 0); + + return ret; + } + + ret = handler->validate(value, value_len); + if (ret) + return ret; + ret = __btrfs_setxattr(trans, inode, handler->xattr_name, + value, value_len, flags); + if (ret) + return ret; + ret = handler->apply(inode, value, value_len); + if (ret) { + __btrfs_setxattr(trans, inode, handler->xattr_name, + NULL, 0, flags); + return ret; + } + + set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags); + + return 0; +} + +int btrfs_set_prop(struct inode *inode, + const char *name, + const char *value, + size_t value_len, + int flags) +{ + return __btrfs_set_prop(NULL, inode, name, value, value_len, flags); +} + +static int iterate_object_props(struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid, + void (*iterator)(void *, + const struct prop_handler *, + const char *, + size_t), + void *ctx) +{ + int ret; + char *name_buf = NULL; + char *value_buf = NULL; + int name_buf_len = 0; + int value_buf_len = 0; + + while (1) { + struct btrfs_key key; + struct btrfs_dir_item *di; + struct extent_buffer *leaf; + u32 total_len, cur, this_len; + int slot; + const struct hlist_head *handlers; + + slot = path->slots[0]; + leaf = path->nodes[0]; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != objectid) + break; + if (key.type != BTRFS_XATTR_ITEM_KEY) + break; + + handlers = find_prop_handlers_by_hash(key.offset); + if (!handlers) + goto next_slot; + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + cur = 0; + total_len = btrfs_item_size_nr(leaf, slot); + + while (cur < total_len) { + u32 name_len = btrfs_dir_name_len(leaf, di); + u32 data_len = btrfs_dir_data_len(leaf, di); + unsigned long name_ptr, data_ptr; + const struct prop_handler *handler; + + this_len = sizeof(*di) + name_len + data_len; + name_ptr = (unsigned long)(di + 1); + data_ptr = name_ptr + name_len; + + if (name_len <= XATTR_BTRFS_PREFIX_LEN || + memcmp_extent_buffer(leaf, XATTR_BTRFS_PREFIX, + name_ptr, + XATTR_BTRFS_PREFIX_LEN)) + goto next_dir_item; + + if (name_len >= name_buf_len) { + kfree(name_buf); + name_buf_len = name_len + 1; + name_buf = kmalloc(name_buf_len, GFP_NOFS); + if (!name_buf) { + ret = -ENOMEM; + goto out; + } + } + read_extent_buffer(leaf, name_buf, name_ptr, name_len); + name_buf[name_len] = '\0'; + + handler = find_prop_handler(name_buf, handlers); + if (!handler) + goto next_dir_item; + + if (data_len > value_buf_len) { + kfree(value_buf); + value_buf_len = data_len; + value_buf = kmalloc(data_len, GFP_NOFS); + if (!value_buf) { + ret = -ENOMEM; + goto out; + } + } + read_extent_buffer(leaf, value_buf, data_ptr, data_len); + + iterator(ctx, handler, value_buf, data_len); +next_dir_item: + cur += this_len; + di = (struct btrfs_dir_item *)((char *) di + this_len); + } + +next_slot: + path->slots[0]++; + } + + ret = 0; +out: + btrfs_release_path(path); + kfree(name_buf); + kfree(value_buf); + + return ret; +} + +static void inode_prop_iterator(void *ctx, + const struct prop_handler *handler, + const char *value, + size_t len) +{ + struct inode *inode = ctx; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + ret = handler->apply(inode, value, len); + if (unlikely(ret)) + btrfs_warn(root->fs_info, + "error applying prop %s to ino %llu (root %llu): %d", + handler->xattr_name, btrfs_ino(inode), + root->root_key.objectid, ret); + else + set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags); +} + +int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 ino = btrfs_ino(inode); + int ret; + + ret = iterate_object_props(root, path, ino, inode_prop_iterator, inode); + + return ret; +} + +static int inherit_props(struct btrfs_trans_handle *trans, + struct inode *inode, + struct inode *parent) +{ + const struct prop_handler *h; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + if (!test_bit(BTRFS_INODE_HAS_PROPS, + &BTRFS_I(parent)->runtime_flags)) + return 0; + + for (h = &prop_handlers[0]; h->xattr_name; h++) { + const char *value; + u64 num_bytes; + + if (!h->inheritable) + continue; + + value = h->extract(parent); + if (!value) + continue; + + num_bytes = btrfs_calc_trans_metadata_size(root, 1); + ret = btrfs_block_rsv_add(root, trans->block_rsv, + num_bytes, BTRFS_RESERVE_NO_FLUSH); + if (ret) + goto out; + ret = __btrfs_set_prop(trans, inode, h->xattr_name, + value, strlen(value), 0); + btrfs_block_rsv_release(root, trans->block_rsv, num_bytes); + if (ret) + goto out; + } + ret = 0; +out: + return ret; +} + +int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, + struct inode *inode, + struct inode *dir) +{ + if (!dir) + return 0; + + return inherit_props(trans, inode, dir); +} + +int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *parent_root) +{ + struct btrfs_key key; + struct inode *parent_inode, *child_inode; + int ret; + + key.objectid = BTRFS_FIRST_FREE_OBJECTID; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + parent_inode = btrfs_iget(parent_root->fs_info->sb, &key, + parent_root, NULL); + if (IS_ERR(parent_inode)) + return PTR_ERR(parent_inode); + + child_inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); + if (IS_ERR(child_inode)) { + iput(parent_inode); + return PTR_ERR(child_inode); + } + + ret = inherit_props(trans, child_inode, parent_inode); + iput(child_inode); + iput(parent_inode); + + return ret; +} + +static int prop_compression_validate(const char *value, size_t len) +{ + if (!strncmp("lzo", value, len)) + return 0; + else if (!strncmp("zlib", value, len)) + return 0; + + return -EINVAL; +} + +static int prop_compression_apply(struct inode *inode, + const char *value, + size_t len) +{ + int type; + + if (len == 0) { + BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; + BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE; + + return 0; + } + + if (!strncmp("lzo", value, len)) + type = BTRFS_COMPRESS_LZO; + else if (!strncmp("zlib", value, len)) + type = BTRFS_COMPRESS_ZLIB; + else + return -EINVAL; + + BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; + BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; + BTRFS_I(inode)->force_compress = type; + + return 0; +} + +static const char *prop_compression_extract(struct inode *inode) +{ + switch (BTRFS_I(inode)->force_compress) { + case BTRFS_COMPRESS_ZLIB: + return "zlib"; + case BTRFS_COMPRESS_LZO: + return "lzo"; + } + + return NULL; +} diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h new file mode 100644 index 0000000..100f188 --- /dev/null +++ b/fs/btrfs/props.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_PROPS_H +#define __BTRFS_PROPS_H + +#include "ctree.h" + +void __init btrfs_props_init(void); + +int btrfs_set_prop(struct inode *inode, + const char *name, + const char *value, + size_t value_len, + int flags); + +int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path); + +int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, + struct inode *inode, + struct inode *dir); + +int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *parent_root); + +#endif diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 4e6ef49..472302a 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -301,16 +301,16 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) if (btrfs_qgroup_status_version(l, ptr) != BTRFS_QGROUP_STATUS_VERSION) { - printk(KERN_ERR - "btrfs: old qgroup version, quota disabled\n"); + btrfs_err(fs_info, + "old qgroup version, quota disabled"); goto out; } if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation) { flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; - printk(KERN_ERR - "btrfs: qgroup generation mismatch, " - "marked as inconsistent\n"); + btrfs_err(fs_info, + "qgroup generation mismatch, " + "marked as inconsistent"); } fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, ptr); @@ -325,7 +325,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) qgroup = find_qgroup_rb(fs_info, found_key.offset); if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) || (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) { - printk(KERN_ERR "btrfs: inconsitent qgroup config\n"); + btrfs_err(fs_info, "inconsitent qgroup config"); flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; } if (!qgroup) { @@ -396,8 +396,8 @@ next1: ret = add_relation_rb(fs_info, found_key.objectid, found_key.offset); if (ret == -ENOENT) { - printk(KERN_WARNING - "btrfs: orphan qgroup relation 0x%llx->0x%llx\n", + btrfs_warn(fs_info, + "orphan qgroup relation 0x%llx->0x%llx", found_key.objectid, found_key.offset); ret = 0; /* ignore the error */ } @@ -644,8 +644,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, l = path->nodes[0]; slot = path->slots[0]; - qgroup_limit = btrfs_item_ptr(l, path->slots[0], - struct btrfs_qgroup_limit_item); + qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item); btrfs_set_qgroup_limit_flags(l, qgroup_limit, flags); btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, max_rfer); btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, max_excl); @@ -687,8 +686,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, l = path->nodes[0]; slot = path->slots[0]; - qgroup_info = btrfs_item_ptr(l, path->slots[0], - struct btrfs_qgroup_info_item); + qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item); btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid); btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer); btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr); @@ -1161,7 +1159,7 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, limit->rsv_excl); if (ret) { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; - printk(KERN_INFO "unable to update quota limit for %llu\n", + btrfs_info(fs_info, "unable to update quota limit for %llu", qgroupid); } @@ -1349,7 +1347,6 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op) { - struct btrfs_key ins; struct btrfs_root *quota_root; u64 ref_root; struct btrfs_qgroup *qgroup; @@ -1363,10 +1360,6 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, BUG_ON(!fs_info->quota_root); - ins.objectid = node->bytenr; - ins.offset = node->num_bytes; - ins.type = BTRFS_EXTENT_ITEM_KEY; - if (node->type == BTRFS_TREE_BLOCK_REF_KEY || node->type == BTRFS_SHARED_BLOCK_REF_KEY) { struct btrfs_delayed_tree_ref *ref; @@ -1840,7 +1833,9 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) { if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq) return; - pr_err("btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %#x.%x\n", + btrfs_err(trans->root->fs_info, + "qgroups not uptodate in trans handle %p: list is%s empty, " + "seq is %#x.%x", trans, list_empty(&trans->qgroup_ref_list) ? "" : " not", (u32)(trans->delayed_ref_elem.seq >> 32), (u32)trans->delayed_ref_elem.seq); @@ -1902,9 +1897,17 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, mutex_unlock(&fs_info->qgroup_rescan_lock); for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { + u64 num_bytes; + btrfs_item_key_to_cpu(scratch_leaf, &found, slot); - if (found.type != BTRFS_EXTENT_ITEM_KEY) + if (found.type != BTRFS_EXTENT_ITEM_KEY && + found.type != BTRFS_METADATA_ITEM_KEY) continue; + if (found.type == BTRFS_METADATA_ITEM_KEY) + num_bytes = fs_info->extent_root->leafsize; + else + num_bytes = found.offset; + ret = btrfs_find_all_roots(trans, fs_info, found.objectid, tree_mod_seq_elem.seq, &roots); if (ret < 0) @@ -1949,12 +1952,12 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, struct btrfs_qgroup_list *glist; qg = (struct btrfs_qgroup *)(uintptr_t) unode->aux; - qg->rfer += found.offset; - qg->rfer_cmpr += found.offset; + qg->rfer += num_bytes; + qg->rfer_cmpr += num_bytes; WARN_ON(qg->tag >= seq); if (qg->refcnt - seq == roots->nnodes) { - qg->excl += found.offset; - qg->excl_cmpr += found.offset; + qg->excl += num_bytes; + qg->excl_cmpr += num_bytes; } qgroup_dirty(fs_info, qg); @@ -2037,10 +2040,10 @@ out: mutex_unlock(&fs_info->qgroup_rescan_lock); if (err >= 0) { - pr_info("btrfs: qgroup scan completed%s\n", + btrfs_info(fs_info, "qgroup scan completed%s", err == 2 ? " (inconsistency flag cleared)" : ""); } else { - pr_err("btrfs: qgroup scan failed with %d\n", err); + btrfs_err(fs_info, "qgroup scan failed with %d", err); } complete_all(&fs_info->qgroup_rescan_completion); @@ -2096,7 +2099,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, if (ret) { err: - pr_info("btrfs: qgroup_rescan_init failed with %d\n", ret); + btrfs_info(fs_info, "qgroup_rescan_init failed with %d", ret); return ret; } diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 24ac218..9af0b25 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1032,8 +1032,8 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, /* see if we can add this page onto our existing bio */ if (last) { - last_end = (u64)last->bi_sector << 9; - last_end += last->bi_size; + last_end = (u64)last->bi_iter.bi_sector << 9; + last_end += last->bi_iter.bi_size; /* * we can't merge these if they are from different @@ -1053,9 +1053,9 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, if (!bio) return -ENOMEM; - bio->bi_size = 0; + bio->bi_iter.bi_size = 0; bio->bi_bdev = stripe->dev->bdev; - bio->bi_sector = disk_start >> 9; + bio->bi_iter.bi_sector = disk_start >> 9; set_bit(BIO_UPTODATE, &bio->bi_flags); bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); @@ -1111,7 +1111,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) spin_lock_irq(&rbio->bio_list_lock); bio_list_for_each(bio, &rbio->bio_list) { - start = (u64)bio->bi_sector << 9; + start = (u64)bio->bi_iter.bi_sector << 9; stripe_offset = start - rbio->raid_map[0]; page_index = stripe_offset >> PAGE_CACHE_SHIFT; @@ -1272,7 +1272,7 @@ cleanup: static int find_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio) { - u64 physical = bio->bi_sector; + u64 physical = bio->bi_iter.bi_sector; u64 stripe_start; int i; struct btrfs_bio_stripe *stripe; @@ -1298,7 +1298,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio, static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio) { - u64 logical = bio->bi_sector; + u64 logical = bio->bi_iter.bi_sector; u64 stripe_start; int i; @@ -1602,8 +1602,8 @@ static int plug_cmp(void *priv, struct list_head *a, struct list_head *b) plug_list); struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, plug_list); - u64 a_sector = ra->bio_list.head->bi_sector; - u64 b_sector = rb->bio_list.head->bi_sector; + u64 a_sector = ra->bio_list.head->bi_iter.bi_sector; + u64 b_sector = rb->bio_list.head->bi_iter.bi_sector; if (a_sector < b_sector) return -1; @@ -1691,7 +1691,7 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio, if (IS_ERR(rbio)) return PTR_ERR(rbio); bio_list_add(&rbio->bio_list, bio); - rbio->bio_list_bytes = bio->bi_size; + rbio->bio_list_bytes = bio->bi_iter.bi_size; /* * don't plug on full rbios, just get them out the door @@ -2044,7 +2044,7 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, rbio->read_rebuild = 1; bio_list_add(&rbio->bio_list, bio); - rbio->bio_list_bytes = bio->bi_size; + rbio->bio_list_bytes = bio->bi_iter.bi_size; rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 1031b69..31c797c 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -189,8 +189,8 @@ static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, */ #ifdef DEBUG if (rec->generation != generation) { - printk(KERN_DEBUG "generation mismatch for " - "(%llu,%d,%llu) %llu != %llu\n", + btrfs_debug(root->fs_info, + "generation mismatch for (%llu,%d,%llu) %llu != %llu", key.objectid, key.type, key.offset, rec->generation, generation); } @@ -365,8 +365,9 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, goto error; if (bbio->num_stripes > BTRFS_MAX_MIRRORS) { - printk(KERN_ERR "btrfs readahead: more than %d copies not " - "supported", BTRFS_MAX_MIRRORS); + btrfs_err(root->fs_info, + "readahead: more than %d copies not supported", + BTRFS_MAX_MIRRORS); goto error; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index ce459a7..07b3b36 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -94,6 +94,7 @@ struct backref_edge { #define LOWER 0 #define UPPER 1 +#define RELOCATION_RESERVED_NODES 256 struct backref_cache { /* red black tree of all backref nodes in the cache */ @@ -176,6 +177,8 @@ struct reloc_control { u64 merging_rsv_size; /* size of relocated tree nodes */ u64 nodes_relocated; + /* reserved size for block group relocation*/ + u64 reserved_bytes; u64 search_start; u64 extents_found; @@ -184,7 +187,6 @@ struct reloc_control { unsigned int create_reloc_tree:1; unsigned int merge_reloc_tree:1; unsigned int found_file_extent:1; - unsigned int commit_transaction:1; }; /* stages of data relocation */ @@ -571,7 +573,9 @@ static int is_cowonly_root(u64 root_objectid) root_objectid == BTRFS_CHUNK_TREE_OBJECTID || root_objectid == BTRFS_DEV_TREE_OBJECTID || root_objectid == BTRFS_TREE_LOG_OBJECTID || - root_objectid == BTRFS_CSUM_TREE_OBJECTID) + root_objectid == BTRFS_CSUM_TREE_OBJECTID || + root_objectid == BTRFS_UUID_TREE_OBJECTID || + root_objectid == BTRFS_QUOTA_TREE_OBJECTID) return 1; return 0; } @@ -1264,10 +1268,10 @@ static int __must_check __add_reloc_root(struct btrfs_root *root) } /* - * helper to update/delete the 'address of tree root -> reloc tree' + * helper to delete the 'address of tree root -> reloc tree' * mapping */ -static int __update_reloc_root(struct btrfs_root *root, int del) +static void __del_reloc_root(struct btrfs_root *root) { struct rb_node *rb_node; struct mapping_node *node = NULL; @@ -1275,7 +1279,7 @@ static int __update_reloc_root(struct btrfs_root *root, int del) spin_lock(&rc->reloc_root_tree.lock); rb_node = tree_search(&rc->reloc_root_tree.rb_root, - root->commit_root->start); + root->node->start); if (rb_node) { node = rb_entry(rb_node, struct mapping_node, rb_node); rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); @@ -1283,23 +1287,45 @@ static int __update_reloc_root(struct btrfs_root *root, int del) spin_unlock(&rc->reloc_root_tree.lock); if (!node) - return 0; + return; BUG_ON((struct btrfs_root *)node->data != root); - if (!del) { - spin_lock(&rc->reloc_root_tree.lock); - node->bytenr = root->node->start; - rb_node = tree_insert(&rc->reloc_root_tree.rb_root, - node->bytenr, &node->rb_node); - spin_unlock(&rc->reloc_root_tree.lock); - if (rb_node) - backref_tree_panic(rb_node, -EEXIST, node->bytenr); - } else { - spin_lock(&root->fs_info->trans_lock); - list_del_init(&root->root_list); - spin_unlock(&root->fs_info->trans_lock); - kfree(node); + spin_lock(&root->fs_info->trans_lock); + list_del_init(&root->root_list); + spin_unlock(&root->fs_info->trans_lock); + kfree(node); +} + +/* + * helper to update the 'address of tree root -> reloc tree' + * mapping + */ +static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr) +{ + struct rb_node *rb_node; + struct mapping_node *node = NULL; + struct reloc_control *rc = root->fs_info->reloc_ctl; + + spin_lock(&rc->reloc_root_tree.lock); + rb_node = tree_search(&rc->reloc_root_tree.rb_root, + root->node->start); + if (rb_node) { + node = rb_entry(rb_node, struct mapping_node, rb_node); + rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); } + spin_unlock(&rc->reloc_root_tree.lock); + + if (!node) + return 0; + BUG_ON((struct btrfs_root *)node->data != root); + + spin_lock(&rc->reloc_root_tree.lock); + node->bytenr = new_bytenr; + rb_node = tree_insert(&rc->reloc_root_tree.rb_root, + node->bytenr, &node->rb_node); + spin_unlock(&rc->reloc_root_tree.lock); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, node->bytenr); return 0; } @@ -1420,7 +1446,6 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, { struct btrfs_root *reloc_root; struct btrfs_root_item *root_item; - int del = 0; int ret; if (!root->reloc_root) @@ -1432,11 +1457,9 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, if (root->fs_info->reloc_ctl->merge_reloc_tree && btrfs_root_refs(root_item) == 0) { root->reloc_root = NULL; - del = 1; + __del_reloc_root(reloc_root); } - __update_reloc_root(reloc_root, del); - if (reloc_root->commit_root != reloc_root->node) { btrfs_set_root_node(root_item, reloc_root->node); free_extent_buffer(reloc_root->commit_root); @@ -2287,10 +2310,7 @@ void free_reloc_roots(struct list_head *list) while (!list_empty(list)) { reloc_root = list_entry(list->next, struct btrfs_root, root_list); - __update_reloc_root(reloc_root, 1); - free_extent_buffer(reloc_root->node); - free_extent_buffer(reloc_root->commit_root); - kfree(reloc_root); + __del_reloc_root(reloc_root); } } @@ -2332,10 +2352,9 @@ again: ret = merge_reloc_root(rc, root); if (ret) { - __update_reloc_root(reloc_root, 1); - free_extent_buffer(reloc_root->node); - free_extent_buffer(reloc_root->commit_root); - kfree(reloc_root); + if (list_empty(&reloc_root->root_list)) + list_add_tail(&reloc_root->root_list, + &reloc_roots); goto out; } } else { @@ -2388,6 +2407,13 @@ out: btrfs_std_error(root->fs_info, ret); if (!list_empty(&reloc_roots)) free_reloc_roots(&reloc_roots); + + /* new reloc root may be added */ + mutex_lock(&root->fs_info->reloc_mutex); + list_splice_init(&rc->reloc_roots, &reloc_roots); + mutex_unlock(&root->fs_info->reloc_mutex); + if (!list_empty(&reloc_roots)) + free_reloc_roots(&reloc_roots); } BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); @@ -2424,7 +2450,7 @@ static noinline_for_stack struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, struct reloc_control *rc, struct backref_node *node, - struct backref_edge *edges[], int *nr) + struct backref_edge *edges[]) { struct backref_node *next; struct btrfs_root *root; @@ -2466,7 +2492,6 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, if (!root) return NULL; - *nr = index; next = node; /* setup backref node path for btrfs_reloc_cow_block */ while (1) { @@ -2562,28 +2587,36 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, struct btrfs_root *root = rc->extent_root; u64 num_bytes; int ret; + u64 tmp; num_bytes = calcu_metadata_size(rc, node, 1) * 2; trans->block_rsv = rc->block_rsv; - ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes, - BTRFS_RESERVE_FLUSH_ALL); + rc->reserved_bytes += num_bytes; + ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes, + BTRFS_RESERVE_FLUSH_ALL); if (ret) { - if (ret == -EAGAIN) - rc->commit_transaction = 1; + if (ret == -EAGAIN) { + tmp = rc->extent_root->nodesize * + RELOCATION_RESERVED_NODES; + while (tmp <= rc->reserved_bytes) + tmp <<= 1; + /* + * only one thread can access block_rsv at this point, + * so we don't need hold lock to protect block_rsv. + * we expand more reservation size here to allow enough + * space for relocation and we will return eailer in + * enospc case. + */ + rc->block_rsv->size = tmp + rc->extent_root->nodesize * + RELOCATION_RESERVED_NODES; + } return ret; } return 0; } -static void release_metadata_space(struct reloc_control *rc, - struct backref_node *node) -{ - u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2; - btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes); -} - /* * relocate a block tree, and then update pointers in upper level * blocks that reference the block to point to the new location. @@ -2605,7 +2638,6 @@ static int do_relocation(struct btrfs_trans_handle *trans, u32 blocksize; u64 bytenr; u64 generation; - int nr; int slot; int ret; int err = 0; @@ -2618,7 +2650,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, cond_resched(); upper = edge->node[UPPER]; - root = select_reloc_root(trans, rc, upper, edges, &nr); + root = select_reloc_root(trans, rc, upper, edges); BUG_ON(!root); if (upper->eb && !upper->locked) { @@ -2870,7 +2902,6 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, struct btrfs_path *path) { struct btrfs_root *root; - int release = 0; int ret = 0; if (!node) @@ -2887,7 +2918,6 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, ret = reserve_metadata_space(trans, rc, node); if (ret) goto out; - release = 1; } if (root) { @@ -2912,11 +2942,8 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, ret = do_relocation(trans, rc, node, key, path, 1); } out: - if (ret || node->level == 0 || node->cowonly) { - if (release) - release_metadata_space(rc, node); + if (ret || node->level == 0 || node->cowonly) remove_backref_node(&rc->backref_cache, node); - } return ret; } @@ -3839,29 +3866,20 @@ static noinline_for_stack int prepare_to_relocate(struct reloc_control *rc) { struct btrfs_trans_handle *trans; - int ret; rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root, BTRFS_BLOCK_RSV_TEMP); if (!rc->block_rsv) return -ENOMEM; - /* - * reserve some space for creating reloc trees. - * btrfs_init_reloc_root will use them when there - * is no reservation in transaction handle. - */ - ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv, - rc->extent_root->nodesize * 256, - BTRFS_RESERVE_FLUSH_ALL); - if (ret) - return ret; - memset(&rc->cluster, 0, sizeof(rc->cluster)); rc->search_start = rc->block_group->key.objectid; rc->extents_found = 0; rc->nodes_relocated = 0; rc->merging_rsv_size = 0; + rc->reserved_bytes = 0; + rc->block_rsv->size = rc->extent_root->nodesize * + RELOCATION_RESERVED_NODES; rc->create_reloc_tree = 1; set_reloc_control(rc); @@ -3905,6 +3923,14 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) } while (1) { + rc->reserved_bytes = 0; + ret = btrfs_block_rsv_refill(rc->extent_root, + rc->block_rsv, rc->block_rsv->size, + BTRFS_RESERVE_FLUSH_ALL); + if (ret) { + err = ret; + break; + } progress++; trans = btrfs_start_transaction(rc->extent_root, 0); if (IS_ERR(trans)) { @@ -3983,6 +4009,12 @@ restart: if (!RB_EMPTY_ROOT(&blocks)) { ret = relocate_tree_blocks(trans, rc, &blocks); if (ret < 0) { + /* + * if we fail to relocate tree blocks, force to update + * backref cache when committing transaction. + */ + rc->backref_cache.last_trans = trans->transid - 1; + if (ret != -EAGAIN) { err = ret; break; @@ -3992,14 +4024,8 @@ restart: } } - if (rc->commit_transaction) { - rc->commit_transaction = 0; - ret = btrfs_commit_transaction(trans, rc->extent_root); - BUG_ON(ret); - } else { - btrfs_end_transaction_throttle(trans, rc->extent_root); - btrfs_btree_balance_dirty(rc->extent_root); - } + btrfs_end_transaction_throttle(trans, rc->extent_root); + btrfs_btree_balance_dirty(rc->extent_root); trans = NULL; if (rc->stage == MOVE_DATA_EXTENTS && @@ -4219,7 +4245,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) goto out; } - printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n", + btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu", rc->block_group->key.objectid, rc->block_group->flags); ret = btrfs_start_delalloc_roots(fs_info, 0); @@ -4241,7 +4267,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) if (rc->extents_found == 0) break; - printk(KERN_INFO "btrfs: found %llu extents\n", + btrfs_info(extent_root->fs_info, "found %llu extents", rc->extents_found); if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { @@ -4257,11 +4283,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) } } - filemap_write_and_wait_range(fs_info->btree_inode->i_mapping, - rc->block_group->key.objectid, - rc->block_group->key.objectid + - rc->block_group->key.offset - 1); - WARN_ON(rc->block_group->pinned > 0); WARN_ON(rc->block_group->reserved > 0); WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); @@ -4522,6 +4543,11 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, BUG_ON(rc->stage == UPDATE_DATA_PTRS && root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID); + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (buf == root->node) + __update_reloc_root(root, cow->start); + } + level = btrfs_header_level(buf); if (btrfs_header_generation(buf) <= btrfs_root_last_snapshot(&root->root_item)) diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index ec71ea4..1389b69 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -44,7 +44,7 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, if (!need_reset && btrfs_root_generation(item) != btrfs_root_generation_v2(item)) { if (btrfs_root_generation_v2(item) != 0) { - printk(KERN_WARNING "btrfs: mismatching " + printk(KERN_WARNING "BTRFS: mismatching " "generation and generation_v2 " "found in root item. This root " "was probably mounted with an " @@ -154,7 +154,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root if (ret != 0) { btrfs_print_leaf(root, path->nodes[0]); - printk(KERN_CRIT "unable to update root key %llu %u %llu\n", + btrfs_crit(root->fs_info, "unable to update root key %llu %u %llu", key->objectid, key->type, key->offset); BUG_ON(1); } @@ -400,21 +400,6 @@ out: return err; } -int btrfs_find_root_ref(struct btrfs_root *tree_root, - struct btrfs_path *path, - u64 root_id, u64 ref_id) -{ - struct btrfs_key key; - int ret; - - key.objectid = root_id; - key.type = BTRFS_ROOT_REF_KEY; - key.offset = ref_id; - - ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); - return ret; -} - /* * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY * or BTRFS_ROOT_BACKREF_KEY. diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 1fd3f33..efba5d1 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -256,6 +256,8 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, int mirror_num, u64 physical_for_dev_replace); static void copy_nocow_pages_worker(struct btrfs_work *work); +static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); +static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); static void scrub_pending_bio_inc(struct scrub_ctx *sctx) @@ -269,6 +271,29 @@ static void scrub_pending_bio_dec(struct scrub_ctx *sctx) wake_up(&sctx->list_wait); } +static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) +{ + while (atomic_read(&fs_info->scrub_pause_req)) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + atomic_read(&fs_info->scrub_pause_req) == 0); + mutex_lock(&fs_info->scrub_lock); + } +} + +static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) +{ + atomic_inc(&fs_info->scrubs_paused); + wake_up(&fs_info->scrub_pause_wait); + + mutex_lock(&fs_info->scrub_lock); + __scrub_blocked_if_needed(fs_info); + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + + wake_up(&fs_info->scrub_pause_wait); +} + /* * used for workers that require transaction commits (i.e., for the * NOCOW case) @@ -480,7 +505,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, * hold all of the paths here */ for (i = 0; i < ipath->fspath->elem_cnt; ++i) - printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev " + printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev " "%s, sector %llu, root %llu, inode %llu, offset %llu, " "length %llu, links %u (path: %s)\n", swarn->errstr, swarn->logical, rcu_str_deref(swarn->dev->name), @@ -492,7 +517,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, return 0; err: - printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev " + printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev " "%s, sector %llu, root %llu, inode %llu, offset %llu: path " "resolving failed with ret=%d\n", swarn->errstr, swarn->logical, rcu_str_deref(swarn->dev->name), @@ -555,7 +580,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) ret = tree_backref_for_extent(&ptr, eb, ei, item_size, &ref_root, &ref_level); printk_in_rcu(KERN_WARNING - "btrfs: %s at logical %llu on dev %s, " + "BTRFS: %s at logical %llu on dev %s, " "sector %llu: metadata %s (level %d) in tree " "%llu\n", errstr, swarn.logical, rcu_str_deref(dev->name), @@ -704,13 +729,11 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work) struct scrub_fixup_nodatasum *fixup; struct scrub_ctx *sctx; struct btrfs_trans_handle *trans = NULL; - struct btrfs_fs_info *fs_info; struct btrfs_path *path; int uncorrectable = 0; fixup = container_of(work, struct scrub_fixup_nodatasum, work); sctx = fixup->sctx; - fs_info = fixup->root->fs_info; path = btrfs_alloc_path(); if (!path) { @@ -759,8 +782,8 @@ out: btrfs_dev_replace_stats_inc( &sctx->dev_root->fs_info->dev_replace. num_uncorrectable_read_errors); - printk_ratelimited_in_rcu(KERN_ERR - "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", + printk_ratelimited_in_rcu(KERN_ERR "BTRFS: " + "unable to fixup (nodatasum) error at logical %llu on dev %s\n", fixup->logical, rcu_str_deref(fixup->dev->name)); } @@ -1161,7 +1184,7 @@ corrected_error: sctx->stat.corrected_errors++; spin_unlock(&sctx->stat_lock); printk_ratelimited_in_rcu(KERN_ERR - "btrfs: fixed up error at logical %llu on dev %s\n", + "BTRFS: fixed up error at logical %llu on dev %s\n", logical, rcu_str_deref(dev->name)); } } else { @@ -1170,7 +1193,7 @@ did_not_correct_error: sctx->stat.uncorrectable_errors++; spin_unlock(&sctx->stat_lock); printk_ratelimited_in_rcu(KERN_ERR - "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", + "BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n", logical, rcu_str_deref(dev->name)); } @@ -1308,7 +1331,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, continue; } bio->bi_bdev = page->dev->bdev; - bio->bi_sector = page->physical >> 9; + bio->bi_iter.bi_sector = page->physical >> 9; bio_add_page(bio, page->page, PAGE_SIZE, 0); if (btrfsic_submit_bio_wait(READ, bio)) @@ -1418,8 +1441,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, int ret; if (!page_bad->dev->bdev) { - printk_ratelimited(KERN_WARNING - "btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n"); + printk_ratelimited(KERN_WARNING "BTRFS: " + "scrub_repair_page_from_good_copy(bdev == NULL) " + "is unexpected!\n"); return -EIO; } @@ -1427,7 +1451,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, if (!bio) return -EIO; bio->bi_bdev = page_bad->dev->bdev; - bio->bi_sector = page_bad->physical >> 9; + bio->bi_iter.bi_sector = page_bad->physical >> 9; ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0); if (PAGE_SIZE != ret) { @@ -1520,7 +1544,7 @@ again: bio->bi_private = sbio; bio->bi_end_io = scrub_wr_bio_end_io; bio->bi_bdev = sbio->dev->bdev; - bio->bi_sector = sbio->physical >> 9; + bio->bi_iter.bi_sector = sbio->physical >> 9; sbio->err = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical_for_dev_replace || @@ -1877,7 +1901,7 @@ static void scrub_submit(struct scrub_ctx *sctx) * This case is handled correctly (but _very_ slowly). */ printk_ratelimited(KERN_WARNING - "btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n"); + "BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n"); bio_endio(sbio->bio, -EIO); } else { btrfsic_submit_bio(READ, sbio->bio); @@ -1926,7 +1950,7 @@ again: bio->bi_private = sbio; bio->bi_end_io = scrub_bio_end_io; bio->bi_bdev = sbio->dev->bdev; - bio->bi_sector = sbio->physical >> 9; + bio->bi_iter.bi_sector = sbio->physical >> 9; sbio->err = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical || @@ -2286,8 +2310,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); - atomic_inc(&fs_info->scrubs_paused); - wake_up(&fs_info->scrub_pause_wait); + scrub_blocked_if_needed(fs_info); /* FIXME it might be better to start readahead at commit root */ key_start.objectid = logical; @@ -2311,16 +2334,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, if (!IS_ERR(reada2)) btrfs_reada_wait(reada2); - mutex_lock(&fs_info->scrub_lock); - while (atomic_read(&fs_info->scrub_pause_req)) { - mutex_unlock(&fs_info->scrub_lock); - wait_event(fs_info->scrub_pause_wait, - atomic_read(&fs_info->scrub_pause_req) == 0); - mutex_lock(&fs_info->scrub_lock); - } - atomic_dec(&fs_info->scrubs_paused); - mutex_unlock(&fs_info->scrub_lock); - wake_up(&fs_info->scrub_pause_wait); /* * collect all data csums for the stripe to avoid seeking during @@ -2357,22 +2370,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); atomic_set(&sctx->wr_ctx.flush_all_writes, 0); - atomic_inc(&fs_info->scrubs_paused); - wake_up(&fs_info->scrub_pause_wait); - mutex_lock(&fs_info->scrub_lock); - while (atomic_read(&fs_info->scrub_pause_req)) { - mutex_unlock(&fs_info->scrub_lock); - wait_event(fs_info->scrub_pause_wait, - atomic_read(&fs_info->scrub_pause_req) == 0); - mutex_lock(&fs_info->scrub_lock); - } - atomic_dec(&fs_info->scrubs_paused); - mutex_unlock(&fs_info->scrub_lock); - wake_up(&fs_info->scrub_pause_wait); + scrub_blocked_if_needed(fs_info); } + if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; key.objectid = logical; - key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = (u64)-1; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -2380,8 +2385,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, goto out; if (ret > 0) { - ret = btrfs_previous_item(root, path, 0, - BTRFS_EXTENT_ITEM_KEY); + ret = btrfs_previous_extent_item(root, path, 0); if (ret < 0) goto out; if (ret > 0) { @@ -2439,9 +2443,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, if (key.objectid < logical && (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) { - printk(KERN_ERR - "btrfs scrub: tree block %llu spanning " - "stripes, ignored. logical=%llu\n", + btrfs_err(fs_info, + "scrub: tree block %llu spanning " + "stripes, ignored. logical=%llu", key.objectid, logical); goto next; } @@ -2683,21 +2687,9 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); atomic_set(&sctx->wr_ctx.flush_all_writes, 0); - atomic_inc(&fs_info->scrubs_paused); - wake_up(&fs_info->scrub_pause_wait); wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0); - - mutex_lock(&fs_info->scrub_lock); - while (atomic_read(&fs_info->scrub_pause_req)) { - mutex_unlock(&fs_info->scrub_lock); - wait_event(fs_info->scrub_pause_wait, - atomic_read(&fs_info->scrub_pause_req) == 0); - mutex_lock(&fs_info->scrub_lock); - } - atomic_dec(&fs_info->scrubs_paused); - mutex_unlock(&fs_info->scrub_lock); - wake_up(&fs_info->scrub_pause_wait); + scrub_blocked_if_needed(fs_info); btrfs_put_block_group(cache); if (ret) @@ -2823,8 +2815,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, * check some assumptions */ if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) { - printk(KERN_ERR - "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n", + btrfs_err(fs_info, + "scrub: size assumption nodesize == leafsize (%d == %d) fails", fs_info->chunk_root->nodesize, fs_info->chunk_root->leafsize); return -EINVAL; @@ -2836,16 +2828,17 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, * the way scrub is implemented. Do not handle this * situation at all because it won't ever happen. */ - printk(KERN_ERR - "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n", + btrfs_err(fs_info, + "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails", fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN); return -EINVAL; } if (fs_info->chunk_root->sectorsize != PAGE_SIZE) { /* not supported for data w/o checksums */ - printk(KERN_ERR - "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails\n", + btrfs_err(fs_info, + "scrub: size assumption sectorsize != PAGE_SIZE " + "(%d != %lu) fails", fs_info->chunk_root->sectorsize, PAGE_SIZE); return -EINVAL; } @@ -2858,7 +2851,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, * would exhaust the array bounds of pagev member in * struct scrub_block */ - pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n", + btrfs_err(fs_info, "scrub: size assumption nodesize and sectorsize " + "<= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails", fs_info->chunk_root->nodesize, SCRUB_MAX_PAGES_PER_BLOCK, fs_info->chunk_root->sectorsize, @@ -2908,7 +2902,13 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, } sctx->readonly = readonly; dev->scrub_device = sctx; + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + /* + * checking @scrub_pause_req here, we can avoid + * race between committing transaction and scrubbing. + */ + __scrub_blocked_if_needed(fs_info); atomic_inc(&fs_info->scrubs_running); mutex_unlock(&fs_info->scrub_lock); @@ -2917,9 +2917,10 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, * by holding device list mutex, we can * kick off writing super in log tree sync. */ + mutex_lock(&fs_info->fs_devices->device_list_mutex); ret = scrub_supers(sctx, dev); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); } - mutex_unlock(&fs_info->fs_devices->device_list_mutex); if (!ret) ret = scrub_enumerate_chunks(sctx, dev, start, end, @@ -3167,7 +3168,8 @@ static void copy_nocow_pages_worker(struct btrfs_work *work) ret = iterate_inodes_from_logical(logical, fs_info, path, record_inode_for_nocow, nocow_ctx); if (ret != 0 && ret != -ENOENT) { - pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d\n", + btrfs_warn(fs_info, "iterate_inodes_from_logical() failed: log %llu, " + "phys %llu, len %llu, mir %u, ret %d", logical, physical_for_dev_replace, len, mirror_num, ret); not_written = 1; @@ -3289,7 +3291,7 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, again: page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); if (!page) { - pr_err("find_or_create_page() failed\n"); + btrfs_err(fs_info, "find_or_create_page() failed"); ret = -ENOMEM; goto out; } @@ -3361,7 +3363,7 @@ static int write_page_nocow(struct scrub_ctx *sctx, return -EIO; if (!dev->bdev) { printk_ratelimited(KERN_WARNING - "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n"); + "BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n"); return -EIO; } bio = btrfs_io_bio_alloc(GFP_NOFS, 1); @@ -3371,8 +3373,8 @@ static int write_page_nocow(struct scrub_ctx *sctx, spin_unlock(&sctx->stat_lock); return -ENOMEM; } - bio->bi_size = 0; - bio->bi_sector = physical_for_dev_replace >> 9; + bio->bi_iter.bi_size = 0; + bio->bi_iter.bi_sector = physical_for_dev_replace >> 9; bio->bi_bdev = dev->bdev; ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); if (ret != PAGE_CACHE_SIZE) { diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 6837fe8..730dce3 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -88,8 +88,6 @@ struct send_ctx { u64 cmd_send_size[BTRFS_SEND_C_MAX + 1]; u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */ - struct vfsmount *mnt; - struct btrfs_root *send_root; struct btrfs_root *parent_root; struct clone_root *clone_roots; @@ -111,6 +109,7 @@ struct send_ctx { int cur_inode_deleted; u64 cur_inode_size; u64 cur_inode_mode; + u64 cur_inode_last_extent; u64 send_progress; @@ -122,6 +121,74 @@ struct send_ctx { int name_cache_size; char *read_buf; + + /* + * We process inodes by their increasing order, so if before an + * incremental send we reverse the parent/child relationship of + * directories such that a directory with a lower inode number was + * the parent of a directory with a higher inode number, and the one + * becoming the new parent got renamed too, we can't rename/move the + * directory with lower inode number when we finish processing it - we + * must process the directory with higher inode number first, then + * rename/move it and then rename/move the directory with lower inode + * number. Example follows. + * + * Tree state when the first send was performed: + * + * . + * |-- a (ino 257) + * |-- b (ino 258) + * | + * | + * |-- c (ino 259) + * | |-- d (ino 260) + * | + * |-- c2 (ino 261) + * + * Tree state when the second (incremental) send is performed: + * + * . + * |-- a (ino 257) + * |-- b (ino 258) + * |-- c2 (ino 261) + * |-- d2 (ino 260) + * |-- cc (ino 259) + * + * The sequence of steps that lead to the second state was: + * + * mv /a/b/c/d /a/b/c2/d2 + * mv /a/b/c /a/b/c2/d2/cc + * + * "c" has lower inode number, but we can't move it (2nd mv operation) + * before we move "d", which has higher inode number. + * + * So we just memorize which move/rename operations must be performed + * later when their respective parent is processed and moved/renamed. + */ + + /* Indexed by parent directory inode number. */ + struct rb_root pending_dir_moves; + + /* + * Reverse index, indexed by the inode number of a directory that + * is waiting for the move/rename of its immediate parent before its + * own move/rename can be performed. + */ + struct rb_root waiting_dir_moves; +}; + +struct pending_dir_move { + struct rb_node node; + struct list_head list; + u64 parent_ino; + u64 ino; + u64 gen; + struct list_head update_refs; +}; + +struct waiting_dir_move { + struct rb_node node; + u64 ino; }; struct name_cache_entry { @@ -145,6 +212,15 @@ struct name_cache_entry { char name[]; }; +static int is_waiting_for_move(struct send_ctx *sctx, u64 ino); + +static int need_send_hole(struct send_ctx *sctx) +{ + return (sctx->parent_root && !sctx->cur_inode_new && + !sctx->cur_inode_new_gen && !sctx->cur_inode_deleted && + S_ISREG(sctx->cur_inode_mode)); +} + static void fs_path_reset(struct fs_path *p) { if (p->reversed) { @@ -336,16 +412,6 @@ out: return ret; } -#if 0 -static void fs_path_remove(struct fs_path *p) -{ - BUG_ON(p->reversed); - while (p->start != p->end && *p->end != '/') - p->end--; - *p->end = 0; -} -#endif - static int fs_path_copy(struct fs_path *p, struct fs_path *from) { int ret; @@ -436,30 +502,15 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) return 0; } -#if 0 -static int tlv_put_u8(struct send_ctx *sctx, u16 attr, u8 value) -{ - return tlv_put(sctx, attr, &value, sizeof(value)); -} - -static int tlv_put_u16(struct send_ctx *sctx, u16 attr, u16 value) -{ - __le16 tmp = cpu_to_le16(value); - return tlv_put(sctx, attr, &tmp, sizeof(tmp)); -} - -static int tlv_put_u32(struct send_ctx *sctx, u16 attr, u32 value) -{ - __le32 tmp = cpu_to_le32(value); - return tlv_put(sctx, attr, &tmp, sizeof(tmp)); -} -#endif +#define TLV_PUT_DEFINE_INT(bits) \ + static int tlv_put_u##bits(struct send_ctx *sctx, \ + u##bits attr, u##bits value) \ + { \ + __le##bits __tmp = cpu_to_le##bits(value); \ + return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \ + } -static int tlv_put_u64(struct send_ctx *sctx, u16 attr, u64 value) -{ - __le64 tmp = cpu_to_le64(value); - return tlv_put(sctx, attr, &tmp, sizeof(tmp)); -} +TLV_PUT_DEFINE_INT(64) static int tlv_put_string(struct send_ctx *sctx, u16 attr, const char *str, int len) @@ -475,17 +526,6 @@ static int tlv_put_uuid(struct send_ctx *sctx, u16 attr, return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE); } -#if 0 -static int tlv_put_timespec(struct send_ctx *sctx, u16 attr, - struct timespec *ts) -{ - struct btrfs_timespec bts; - bts.sec = cpu_to_le64(ts->tv_sec); - bts.nsec = cpu_to_le32(ts->tv_nsec); - return tlv_put(sctx, attr, &bts, sizeof(bts)); -} -#endif - static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr, struct extent_buffer *eb, struct btrfs_timespec *ts) @@ -533,12 +573,6 @@ static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr, if (ret < 0) \ goto tlv_put_failure; \ } while (0) -#define TLV_PUT_TIMESPEC(sctx, attrtype, ts) \ - do { \ - ret = tlv_put_timespec(sctx, attrtype, ts); \ - if (ret < 0) \ - goto tlv_put_failure; \ - } while (0) #define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \ do { \ ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \ @@ -1270,7 +1304,7 @@ static int find_extent_clone(struct send_ctx *sctx, if (!backref_ctx->found_itself) { /* found a bug in backref code? */ ret = -EIO; - printk(KERN_ERR "btrfs: ERROR did not find backref in " + btrfs_err(sctx->send_root->fs_info, "did not find backref in " "send_root. inode=%llu, offset=%llu, " "disk_byte=%llu found extent=%llu\n", ino, data_offset, disk_byte, found_key.objectid); @@ -1343,7 +1377,7 @@ static int read_symlink(struct btrfs_root *root, BUG_ON(compression); off = btrfs_file_extent_inline_start(ei); - len = btrfs_file_extent_inline_len(path->nodes[0], ei); + len = btrfs_file_extent_inline_len(path->nodes[0], path->slots[0], ei); ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len); @@ -1372,7 +1406,7 @@ static int gen_unique_name(struct send_ctx *sctx, return -ENOMEM; while (1) { - len = snprintf(tmp, sizeof(tmp) - 1, "o%llu-%llu-%llu", + len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", ino, gen, idx); if (len >= sizeof(tmp)) { /* should really not happen */ @@ -1933,6 +1967,7 @@ static void name_cache_free(struct send_ctx *sctx) */ static int __get_cur_name_and_parent(struct send_ctx *sctx, u64 ino, u64 gen, + int skip_name_cache, u64 *parent_ino, u64 *parent_gen, struct fs_path *dest) @@ -1942,6 +1977,8 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, struct btrfs_path *path = NULL; struct name_cache_entry *nce = NULL; + if (skip_name_cache) + goto get_ref; /* * First check if we already did a call to this function with the same * ino/gen. If yes, check if the cache entry is still up-to-date. If yes @@ -1986,11 +2023,12 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, goto out_cache; } +get_ref: /* * Depending on whether the inode was already processed or not, use * send_root or parent_root for ref lookup. */ - if (ino < sctx->send_progress) + if (ino < sctx->send_progress && !skip_name_cache) ret = get_first_ref(sctx->send_root, ino, parent_ino, parent_gen, dest); else @@ -2014,6 +2052,8 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, goto out; ret = 1; } + if (skip_name_cache) + goto out; out_cache: /* @@ -2081,6 +2121,9 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, u64 parent_inode = 0; u64 parent_gen = 0; int stop = 0; + u64 start_ino = ino; + u64 start_gen = gen; + int skip_name_cache = 0; name = fs_path_alloc(); if (!name) { @@ -2088,19 +2131,32 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, goto out; } + if (is_waiting_for_move(sctx, ino)) + skip_name_cache = 1; + +again: dest->reversed = 1; fs_path_reset(dest); while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) { fs_path_reset(name); - ret = __get_cur_name_and_parent(sctx, ino, gen, + ret = __get_cur_name_and_parent(sctx, ino, gen, skip_name_cache, &parent_inode, &parent_gen, name); if (ret < 0) goto out; if (ret) stop = 1; + if (!skip_name_cache && + is_waiting_for_move(sctx, parent_inode)) { + ino = start_ino; + gen = start_gen; + stop = 0; + skip_name_cache = 1; + goto again; + } + ret = fs_path_add_path(dest, name); if (ret < 0) goto out; @@ -2131,7 +2187,7 @@ static int send_subvol_begin(struct send_ctx *sctx) char *name = NULL; int namelen; - path = alloc_path_for_send(); + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -2180,12 +2236,12 @@ static int send_subvol_begin(struct send_ctx *sctx) TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, sctx->send_root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID, - sctx->send_root->root_item.ctransid); + le64_to_cpu(sctx->send_root->root_item.ctransid)); if (parent_root) { TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, sctx->parent_root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, - sctx->parent_root->root_item.ctransid); + le64_to_cpu(sctx->parent_root->root_item.ctransid)); } ret = send_cmd(sctx); @@ -2672,10 +2728,349 @@ out: return ret; } +static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) +{ + struct rb_node *n = sctx->waiting_dir_moves.rb_node; + struct waiting_dir_move *entry; + + while (n) { + entry = rb_entry(n, struct waiting_dir_move, node); + if (ino < entry->ino) + n = n->rb_left; + else if (ino > entry->ino) + n = n->rb_right; + else + return 1; + } + return 0; +} + +static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) +{ + struct rb_node **p = &sctx->waiting_dir_moves.rb_node; + struct rb_node *parent = NULL; + struct waiting_dir_move *entry, *dm; + + dm = kmalloc(sizeof(*dm), GFP_NOFS); + if (!dm) + return -ENOMEM; + dm->ino = ino; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct waiting_dir_move, node); + if (ino < entry->ino) { + p = &(*p)->rb_left; + } else if (ino > entry->ino) { + p = &(*p)->rb_right; + } else { + kfree(dm); + return -EEXIST; + } + } + + rb_link_node(&dm->node, parent, p); + rb_insert_color(&dm->node, &sctx->waiting_dir_moves); + return 0; +} + +#ifdef CONFIG_BTRFS_ASSERT + +static int del_waiting_dir_move(struct send_ctx *sctx, u64 ino) +{ + struct rb_node *n = sctx->waiting_dir_moves.rb_node; + struct waiting_dir_move *entry; + + while (n) { + entry = rb_entry(n, struct waiting_dir_move, node); + if (ino < entry->ino) { + n = n->rb_left; + } else if (ino > entry->ino) { + n = n->rb_right; + } else { + rb_erase(&entry->node, &sctx->waiting_dir_moves); + kfree(entry); + return 0; + } + } + return -ENOENT; +} + +#endif + +static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino) +{ + struct rb_node **p = &sctx->pending_dir_moves.rb_node; + struct rb_node *parent = NULL; + struct pending_dir_move *entry, *pm; + struct recorded_ref *cur; + int exists = 0; + int ret; + + pm = kmalloc(sizeof(*pm), GFP_NOFS); + if (!pm) + return -ENOMEM; + pm->parent_ino = parent_ino; + pm->ino = sctx->cur_ino; + pm->gen = sctx->cur_inode_gen; + INIT_LIST_HEAD(&pm->list); + INIT_LIST_HEAD(&pm->update_refs); + RB_CLEAR_NODE(&pm->node); + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct pending_dir_move, node); + if (parent_ino < entry->parent_ino) { + p = &(*p)->rb_left; + } else if (parent_ino > entry->parent_ino) { + p = &(*p)->rb_right; + } else { + exists = 1; + break; + } + } + + list_for_each_entry(cur, &sctx->deleted_refs, list) { + ret = dup_ref(cur, &pm->update_refs); + if (ret < 0) + goto out; + } + list_for_each_entry(cur, &sctx->new_refs, list) { + ret = dup_ref(cur, &pm->update_refs); + if (ret < 0) + goto out; + } + + ret = add_waiting_dir_move(sctx, pm->ino); + if (ret) + goto out; + + if (exists) { + list_add_tail(&pm->list, &entry->list); + } else { + rb_link_node(&pm->node, parent, p); + rb_insert_color(&pm->node, &sctx->pending_dir_moves); + } + ret = 0; +out: + if (ret) { + __free_recorded_refs(&pm->update_refs); + kfree(pm); + } + return ret; +} + +static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx, + u64 parent_ino) +{ + struct rb_node *n = sctx->pending_dir_moves.rb_node; + struct pending_dir_move *entry; + + while (n) { + entry = rb_entry(n, struct pending_dir_move, node); + if (parent_ino < entry->parent_ino) + n = n->rb_left; + else if (parent_ino > entry->parent_ino) + n = n->rb_right; + else + return entry; + } + return NULL; +} + +static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) +{ + struct fs_path *from_path = NULL; + struct fs_path *to_path = NULL; + u64 orig_progress = sctx->send_progress; + struct recorded_ref *cur; + int ret; + + from_path = fs_path_alloc(); + if (!from_path) + return -ENOMEM; + + sctx->send_progress = pm->ino; + ret = get_cur_path(sctx, pm->ino, pm->gen, from_path); + if (ret < 0) + goto out; + + to_path = fs_path_alloc(); + if (!to_path) { + ret = -ENOMEM; + goto out; + } + + sctx->send_progress = sctx->cur_ino + 1; + ASSERT(del_waiting_dir_move(sctx, pm->ino) == 0); + ret = get_cur_path(sctx, pm->ino, pm->gen, to_path); + if (ret < 0) + goto out; + + ret = send_rename(sctx, from_path, to_path); + if (ret < 0) + goto out; + + ret = send_utimes(sctx, pm->ino, pm->gen); + if (ret < 0) + goto out; + + /* + * After rename/move, need to update the utimes of both new parent(s) + * and old parent(s). + */ + list_for_each_entry(cur, &pm->update_refs, list) { + ret = send_utimes(sctx, cur->dir, cur->dir_gen); + if (ret < 0) + goto out; + } + +out: + fs_path_free(from_path); + fs_path_free(to_path); + sctx->send_progress = orig_progress; + + return ret; +} + +static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m) +{ + if (!list_empty(&m->list)) + list_del(&m->list); + if (!RB_EMPTY_NODE(&m->node)) + rb_erase(&m->node, &sctx->pending_dir_moves); + __free_recorded_refs(&m->update_refs); + kfree(m); +} + +static void tail_append_pending_moves(struct pending_dir_move *moves, + struct list_head *stack) +{ + if (list_empty(&moves->list)) { + list_add_tail(&moves->list, stack); + } else { + LIST_HEAD(list); + list_splice_init(&moves->list, &list); + list_add_tail(&moves->list, stack); + list_splice_tail(&list, stack); + } +} + +static int apply_children_dir_moves(struct send_ctx *sctx) +{ + struct pending_dir_move *pm; + struct list_head stack; + u64 parent_ino = sctx->cur_ino; + int ret = 0; + + pm = get_pending_dir_moves(sctx, parent_ino); + if (!pm) + return 0; + + INIT_LIST_HEAD(&stack); + tail_append_pending_moves(pm, &stack); + + while (!list_empty(&stack)) { + pm = list_first_entry(&stack, struct pending_dir_move, list); + parent_ino = pm->ino; + ret = apply_dir_move(sctx, pm); + free_pending_move(sctx, pm); + if (ret) + goto out; + pm = get_pending_dir_moves(sctx, parent_ino); + if (pm) + tail_append_pending_moves(pm, &stack); + } + return 0; + +out: + while (!list_empty(&stack)) { + pm = list_first_entry(&stack, struct pending_dir_move, list); + free_pending_move(sctx, pm); + } + return ret; +} + +static int wait_for_parent_move(struct send_ctx *sctx, + struct recorded_ref *parent_ref) +{ + int ret; + u64 ino = parent_ref->dir; + u64 parent_ino_before, parent_ino_after; + u64 new_gen, old_gen; + struct fs_path *path_before = NULL; + struct fs_path *path_after = NULL; + int len1, len2; + + if (parent_ref->dir <= sctx->cur_ino) + return 0; + + if (is_waiting_for_move(sctx, ino)) + return 1; + + ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen, + NULL, NULL, NULL, NULL); + if (ret == -ENOENT) + return 0; + else if (ret < 0) + return ret; + + ret = get_inode_info(sctx->send_root, ino, NULL, &new_gen, + NULL, NULL, NULL, NULL); + if (ret < 0) + return ret; + + if (new_gen != old_gen) + return 0; + + path_before = fs_path_alloc(); + if (!path_before) + return -ENOMEM; + + ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before, + NULL, path_before); + if (ret == -ENOENT) { + ret = 0; + goto out; + } else if (ret < 0) { + goto out; + } + + path_after = fs_path_alloc(); + if (!path_after) { + ret = -ENOMEM; + goto out; + } + + ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, + NULL, path_after); + if (ret == -ENOENT) { + ret = 0; + goto out; + } else if (ret < 0) { + goto out; + } + + len1 = fs_path_len(path_before); + len2 = fs_path_len(path_after); + if ((parent_ino_before != parent_ino_after) && (len1 != len2 || + memcmp(path_before->start, path_after->start, len1))) { + ret = 1; + goto out; + } + ret = 0; + +out: + fs_path_free(path_before); + fs_path_free(path_after); + + return ret; +} + /* * This does all the move/link/unlink/rmdir magic. */ -static int process_recorded_refs(struct send_ctx *sctx) +static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) { int ret = 0; struct recorded_ref *cur; @@ -2824,11 +3219,17 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); * dirs, we always have one new and one deleted * ref. The deleted ref is ignored later. */ - ret = send_rename(sctx, valid_path, - cur->full_path); - if (ret < 0) - goto out; - ret = fs_path_copy(valid_path, cur->full_path); + if (wait_for_parent_move(sctx, cur)) { + ret = add_pending_dir_move(sctx, + cur->dir); + *pending_move = 1; + } else { + ret = send_rename(sctx, valid_path, + cur->full_path); + if (!ret) + ret = fs_path_copy(valid_path, + cur->full_path); + } if (ret < 0) goto out; } else { @@ -3197,6 +3598,7 @@ static int process_all_refs(struct send_ctx *sctx, struct extent_buffer *eb; int slot; iterate_inode_ref_t cb; + int pending_move = 0; path = alloc_path_for_send(); if (!path) @@ -3240,7 +3642,9 @@ static int process_all_refs(struct send_ctx *sctx, } btrfs_release_path(path); - ret = process_recorded_refs(sctx); + ret = process_recorded_refs(sctx, &pending_move); + /* Only applicable to an incremental send. */ + ASSERT(pending_move == 0); out: btrfs_free_path(path); @@ -3706,7 +4110,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, " TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, clone_root->root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, - clone_root->root->root_item.ctransid); + le64_to_cpu(clone_root->root->root_item.ctransid)); TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET, clone_root->offset); @@ -3752,6 +4156,39 @@ out: return ret; } +static int send_hole(struct send_ctx *sctx, u64 end) +{ + struct fs_path *p = NULL; + u64 offset = sctx->cur_inode_last_extent; + u64 len; + int ret = 0; + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE); + while (offset < end) { + len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE); + + ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); + if (ret < 0) + break; + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + break; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len); + ret = send_cmd(sctx); + if (ret < 0) + break; + offset += len; + } +tlv_put_failure: + fs_path_free(p); + return ret; +} + static int send_write_or_clone(struct send_ctx *sctx, struct btrfs_path *path, struct btrfs_key *key, @@ -3764,12 +4201,14 @@ static int send_write_or_clone(struct send_ctx *sctx, u64 len; u32 l; u8 type; + u64 bs = sctx->send_root->fs_info->sb->s_blocksize; ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_file_extent_item); type = btrfs_file_extent_type(path->nodes[0], ei); if (type == BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_inline_len(path->nodes[0], ei); + len = btrfs_file_extent_inline_len(path->nodes[0], + path->slots[0], ei); /* * it is possible the inline item won't cover the whole page, * but there may be items after this page. Make @@ -3787,7 +4226,7 @@ static int send_write_or_clone(struct send_ctx *sctx, goto out; } - if (clone_root) { + if (clone_root && IS_ALIGNED(offset + len, bs)) { ret = send_clone(sctx, offset, len, clone_root); } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) { ret = send_update_extent(sctx, offset, len); @@ -3979,6 +4418,101 @@ out: return ret; } +static int get_last_extent(struct send_ctx *sctx, u64 offset) +{ + struct btrfs_path *path; + struct btrfs_root *root = sctx->send_root; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 extent_end; + u8 type; + int ret; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + sctx->cur_inode_last_extent = 0; + + key.objectid = sctx->cur_ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = offset; + ret = btrfs_search_slot_for_read(root, &key, path, 0, 1); + if (ret < 0) + goto out; + ret = 0; + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY) + goto out; + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + type = btrfs_file_extent_type(path->nodes[0], fi); + if (type == BTRFS_FILE_EXTENT_INLINE) { + u64 size = btrfs_file_extent_inline_len(path->nodes[0], + path->slots[0], fi); + extent_end = ALIGN(key.offset + size, + sctx->send_root->sectorsize); + } else { + extent_end = key.offset + + btrfs_file_extent_num_bytes(path->nodes[0], fi); + } + sctx->cur_inode_last_extent = extent_end; +out: + btrfs_free_path(path); + return ret; +} + +static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, + struct btrfs_key *key) +{ + struct btrfs_file_extent_item *fi; + u64 extent_end; + u8 type; + int ret = 0; + + if (sctx->cur_ino != key->objectid || !need_send_hole(sctx)) + return 0; + + if (sctx->cur_inode_last_extent == (u64)-1) { + ret = get_last_extent(sctx, key->offset - 1); + if (ret) + return ret; + } + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + type = btrfs_file_extent_type(path->nodes[0], fi); + if (type == BTRFS_FILE_EXTENT_INLINE) { + u64 size = btrfs_file_extent_inline_len(path->nodes[0], + path->slots[0], fi); + extent_end = ALIGN(key->offset + size, + sctx->send_root->sectorsize); + } else { + extent_end = key->offset + + btrfs_file_extent_num_bytes(path->nodes[0], fi); + } + + if (path->slots[0] == 0 && + sctx->cur_inode_last_extent < key->offset) { + /* + * We might have skipped entire leafs that contained only + * file extent items for our current inode. These leafs have + * a generation number smaller (older) than the one in the + * current leaf and the leaf our last extent came from, and + * are located between these 2 leafs. + */ + ret = get_last_extent(sctx, key->offset - 1); + if (ret) + return ret; + } + + if (sctx->cur_inode_last_extent < key->offset) + ret = send_hole(sctx, key->offset); + sctx->cur_inode_last_extent = extent_end; + return ret; +} + static int process_extent(struct send_ctx *sctx, struct btrfs_path *path, struct btrfs_key *key) @@ -3995,7 +4529,7 @@ static int process_extent(struct send_ctx *sctx, goto out; if (ret) { ret = 0; - goto out; + goto out_hole; } } else { struct btrfs_file_extent_item *ei; @@ -4031,7 +4565,10 @@ static int process_extent(struct send_ctx *sctx, goto out; ret = send_write_or_clone(sctx, path, key, found_clone); - + if (ret) + goto out; +out_hole: + ret = maybe_send_hole(sctx, path, key); out: return ret; } @@ -4054,17 +4591,25 @@ static int process_all_extents(struct send_ctx *sctx) key.objectid = sctx->cmp_key->objectid; key.type = BTRFS_EXTENT_DATA_KEY; key.offset = 0; - while (1) { - ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); - if (ret < 0) - goto out; - if (ret) { - ret = 0; - goto out; - } + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + while (1) { eb = path->nodes[0]; slot = path->slots[0]; + + if (slot >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + break; + } + continue; + } + btrfs_item_key_to_cpu(eb, &found_key, slot); if (found_key.objectid != key.objectid || @@ -4077,8 +4622,7 @@ static int process_all_extents(struct send_ctx *sctx) if (ret < 0) goto out; - btrfs_release_path(path); - key.offset = found_key.offset + 1; + path->slots[0]++; } out: @@ -4086,7 +4630,9 @@ out: return ret; } -static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end) +static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end, + int *pending_move, + int *refs_processed) { int ret = 0; @@ -4098,17 +4644,11 @@ static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end) if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs)) goto out; - ret = process_recorded_refs(sctx); + ret = process_recorded_refs(sctx, pending_move); if (ret < 0) goto out; - /* - * We have processed the refs and thus need to advance send_progress. - * Now, calls to get_cur_xxx will take the updated refs of the current - * inode into account. - */ - sctx->send_progress = sctx->cur_ino + 1; - + *refs_processed = 1; out: return ret; } @@ -4124,11 +4664,29 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) u64 right_gid; int need_chmod = 0; int need_chown = 0; + int pending_move = 0; + int refs_processed = 0; - ret = process_recorded_refs_if_needed(sctx, at_end); + ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move, + &refs_processed); if (ret < 0) goto out; + /* + * We have processed the refs and thus need to advance send_progress. + * Now, calls to get_cur_xxx will take the updated refs of the current + * inode into account. + * + * On the other hand, if our current inode is a directory and couldn't + * be moved/renamed because its parent was renamed/moved too and it has + * a higher inode number, we can only move/rename our current inode + * after we moved/renamed its parent. Therefore in this case operate on + * the old path (pre move/rename) of our current inode, and the + * move/rename will be performed later. + */ + if (refs_processed && !pending_move) + sctx->send_progress = sctx->cur_ino + 1; + if (sctx->cur_ino == 0 || sctx->cur_inode_deleted) goto out; if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino) @@ -4157,6 +4715,19 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) } if (S_ISREG(sctx->cur_inode_mode)) { + if (need_send_hole(sctx)) { + if (sctx->cur_inode_last_extent == (u64)-1) { + ret = get_last_extent(sctx, (u64)-1); + if (ret) + goto out; + } + if (sctx->cur_inode_last_extent < + sctx->cur_inode_size) { + ret = send_hole(sctx, sctx->cur_inode_size); + if (ret) + goto out; + } + } ret = send_truncate(sctx, sctx->cur_ino, sctx->cur_inode_gen, sctx->cur_inode_size); if (ret < 0) @@ -4177,9 +4748,21 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) } /* - * Need to send that every time, no matter if it actually changed - * between the two trees as we have done changes to the inode before. + * If other directory inodes depended on our current directory + * inode's move/rename, now do their move/rename operations. + */ + if (!is_waiting_for_move(sctx, sctx->cur_ino)) { + ret = apply_children_dir_moves(sctx); + if (ret) + goto out; + } + + /* + * Need to send that every time, no matter if it actually + * changed between the two trees as we have done changes to + * the inode before. */ + sctx->send_progress = sctx->cur_ino + 1; ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); if (ret < 0) goto out; @@ -4200,6 +4783,7 @@ static int changed_inode(struct send_ctx *sctx, sctx->cur_ino = key->objectid; sctx->cur_inode_new_gen = 0; + sctx->cur_inode_last_extent = (u64)-1; /* * Set send_progress to current inode. This will tell all get_cur_xxx @@ -4480,14 +5064,18 @@ static int changed_cb(struct btrfs_root *left_root, struct send_ctx *sctx = ctx; if (result == BTRFS_COMPARE_TREE_SAME) { - if (key->type != BTRFS_INODE_REF_KEY && - key->type != BTRFS_INODE_EXTREF_KEY) - return 0; - ret = compare_refs(sctx, left_path, key); - if (!ret) + if (key->type == BTRFS_INODE_REF_KEY || + key->type == BTRFS_INODE_EXTREF_KEY) { + ret = compare_refs(sctx, left_path, key); + if (!ret) + return 0; + if (ret < 0) + return ret; + } else if (key->type == BTRFS_EXTENT_DATA_KEY) { + return maybe_send_hole(sctx, left_path, key); + } else { return 0; - if (ret < 0) - return ret; + } result = BTRFS_COMPARE_TREE_CHANGED; ret = 0; } @@ -4522,7 +5110,6 @@ out: static int full_send_tree(struct send_ctx *sctx) { int ret; - struct btrfs_trans_handle *trans = NULL; struct btrfs_root *send_root = sctx->send_root; struct btrfs_key key; struct btrfs_key found_key; @@ -4544,19 +5131,6 @@ static int full_send_tree(struct send_ctx *sctx) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; -join_trans: - /* - * We need to make sure the transaction does not get committed - * while we do anything on commit roots. Join a transaction to prevent - * this. - */ - trans = btrfs_join_transaction(send_root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto out; - } - /* * Make sure the tree has not changed after re-joining. We detect this * by comparing start_ctransid and ctransid. They should always match. @@ -4566,7 +5140,7 @@ join_trans: spin_unlock(&send_root->root_item_lock); if (ctransid != start_ctransid) { - WARN(1, KERN_WARNING "btrfs: the root that you're trying to " + WARN(1, KERN_WARNING "BTRFS: the root that you're trying to " "send was modified in between. This is " "probably a bug.\n"); ret = -EIO; @@ -4580,19 +5154,6 @@ join_trans: goto out_finish; while (1) { - /* - * When someone want to commit while we iterate, end the - * joined transaction and rejoin. - */ - if (btrfs_should_end_transaction(trans, send_root)) { - ret = btrfs_end_transaction(trans, send_root); - trans = NULL; - if (ret < 0) - goto out; - btrfs_release_path(path); - goto join_trans; - } - eb = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(eb, &found_key, slot); @@ -4620,12 +5181,6 @@ out_finish: out: btrfs_free_path(path); - if (trans) { - if (!ret) - ret = btrfs_end_transaction(trans, send_root); - else - btrfs_end_transaction(trans, send_root); - } return ret; } @@ -4662,6 +5217,21 @@ out: return ret; } +static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) +{ + spin_lock(&root->root_item_lock); + root->send_in_progress--; + /* + * Not much left to do, we don't know why it's unbalanced and + * can't blindly reset it to 0. + */ + if (root->send_in_progress < 0) + btrfs_err(root->fs_info, + "send_in_progres unbalanced %d root %llu\n", + root->send_in_progress, root->root_key.objectid); + spin_unlock(&root->root_item_lock); +} + long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) { int ret = 0; @@ -4673,6 +5243,9 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) struct send_ctx *sctx = NULL; u32 i; u64 *clone_sources_tmp = NULL; + int clone_sources_to_rollback = 0; + int sort_clone_roots = 0; + int index; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4681,38 +5254,26 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) fs_info = send_root->fs_info; /* + * The subvolume must remain read-only during send, protect against + * making it RW. + */ + spin_lock(&send_root->root_item_lock); + send_root->send_in_progress++; + spin_unlock(&send_root->root_item_lock); + + /* * This is done when we lookup the root, it should already be complete * by the time we get here. */ WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE); /* - * If we just created this root we need to make sure that the orphan - * cleanup has been done and committed since we search the commit root, - * so check its commit root transid with our otransid and if they match - * commit the transaction to make sure everything is updated. + * Userspace tools do the checks and warn the user if it's + * not RO. */ - down_read(&send_root->fs_info->extent_commit_sem); - if (btrfs_header_generation(send_root->commit_root) == - btrfs_root_otransid(&send_root->root_item)) { - struct btrfs_trans_handle *trans; - - up_read(&send_root->fs_info->extent_commit_sem); - - trans = btrfs_attach_transaction_barrier(send_root); - if (IS_ERR(trans)) { - if (PTR_ERR(trans) != -ENOENT) { - ret = PTR_ERR(trans); - goto out; - } - /* ENOENT means theres no transaction */ - } else { - ret = btrfs_commit_transaction(trans, send_root); - if (ret) - goto out; - } - } else { - up_read(&send_root->fs_info->extent_commit_sem); + if (!btrfs_root_readonly(send_root)) { + ret = -EPERM; + goto out; } arg = memdup_user(arg_, sizeof(*arg)); @@ -4723,8 +5284,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) } if (!access_ok(VERIFY_READ, arg->clone_sources, - sizeof(*arg->clone_sources * - arg->clone_sources_count))) { + sizeof(*arg->clone_sources) * + arg->clone_sources_count)) { ret = -EFAULT; goto out; } @@ -4753,8 +5314,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) goto out; } - sctx->mnt = mnt_file->f_path.mnt; - sctx->send_root = send_root; sctx->clone_roots_cnt = arg->clone_sources_count; @@ -4771,6 +5330,9 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) goto out; } + sctx->pending_dir_moves = RB_ROOT; + sctx->waiting_dir_moves = RB_ROOT; + sctx->clone_roots = vzalloc(sizeof(struct clone_root) * (arg->clone_sources_count + 1)); if (!sctx->clone_roots) { @@ -4798,11 +5360,27 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) key.objectid = clone_sources_tmp[i]; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + clone_root = btrfs_read_fs_root_no_name(fs_info, &key); if (IS_ERR(clone_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); ret = PTR_ERR(clone_root); goto out; } + clone_sources_to_rollback = i + 1; + spin_lock(&clone_root->root_item_lock); + clone_root->send_in_progress++; + if (!btrfs_root_readonly(clone_root)) { + spin_unlock(&clone_root->root_item_lock); + srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = -EPERM; + goto out; + } + spin_unlock(&clone_root->root_item_lock); + srcu_read_unlock(&fs_info->subvol_srcu, index); + sctx->clone_roots[i].root = clone_root; } vfree(clone_sources_tmp); @@ -4813,11 +5391,27 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) key.objectid = arg->parent_root; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key); if (IS_ERR(sctx->parent_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); ret = PTR_ERR(sctx->parent_root); goto out; } + + spin_lock(&sctx->parent_root->root_item_lock); + sctx->parent_root->send_in_progress++; + if (!btrfs_root_readonly(sctx->parent_root)) { + spin_unlock(&sctx->parent_root->root_item_lock); + srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = -EPERM; + goto out; + } + spin_unlock(&sctx->parent_root->root_item_lock); + + srcu_read_unlock(&fs_info->subvol_srcu, index); } /* @@ -4831,6 +5425,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) sort(sctx->clone_roots, sctx->clone_roots_cnt, sizeof(*sctx->clone_roots), __clone_root_cmp_sort, NULL); + sort_clone_roots = 1; ret = send_subvol(sctx); if (ret < 0) @@ -4846,6 +5441,48 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) } out: + WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)); + while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) { + struct rb_node *n; + struct pending_dir_move *pm; + + n = rb_first(&sctx->pending_dir_moves); + pm = rb_entry(n, struct pending_dir_move, node); + while (!list_empty(&pm->list)) { + struct pending_dir_move *pm2; + + pm2 = list_first_entry(&pm->list, + struct pending_dir_move, list); + free_pending_move(sctx, pm2); + } + free_pending_move(sctx, pm); + } + + WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)); + while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) { + struct rb_node *n; + struct waiting_dir_move *dm; + + n = rb_first(&sctx->waiting_dir_moves); + dm = rb_entry(n, struct waiting_dir_move, node); + rb_erase(&dm->node, &sctx->waiting_dir_moves); + kfree(dm); + } + + if (sort_clone_roots) { + for (i = 0; i < sctx->clone_roots_cnt; i++) + btrfs_root_dec_send_in_progress( + sctx->clone_roots[i].root); + } else { + for (i = 0; sctx && i < clone_sources_to_rollback; i++) + btrfs_root_dec_send_in_progress( + sctx->clone_roots[i].root); + + btrfs_root_dec_send_in_progress(send_root); + } + if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) + btrfs_root_dec_send_in_progress(sctx->parent_root); + kfree(arg); vfree(clone_sources_tmp); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 2d8ac1b..c02f633 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -48,6 +48,8 @@ #include "transaction.h" #include "btrfs_inode.h" #include "print-tree.h" +#include "hash.h" +#include "props.h" #include "xattr.h" #include "volumes.h" #include "export.h" @@ -152,11 +154,12 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, vaf.fmt = fmt; vaf.va = &args; - printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: errno=%d %s (%pV)\n", + printk(KERN_CRIT + "BTRFS: error (device %s) in %s:%d: errno=%d %s (%pV)\n", sb->s_id, function, line, errno, errstr, &vaf); va_end(args); } else { - printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: errno=%d %s\n", + printk(KERN_CRIT "BTRFS: error (device %s) in %s:%d: errno=%d %s\n", sb->s_id, function, line, errno, errstr); } @@ -250,7 +253,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, */ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, &root->fs_info->fs_state)) { - WARN(1, KERN_DEBUG "btrfs: Transaction aborted (error %d)\n", + WARN(1, KERN_DEBUG "BTRFS: Transaction aborted (error %d)\n", errno); } trans->aborted = errno; @@ -294,8 +297,8 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", s_id, function, line, &vaf, errno, errstr); - printk(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", - s_id, function, line, &vaf, errno, errstr); + btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)", + function, line, &vaf, errno, errstr); va_end(args); /* Caller calls BUG() */ } @@ -322,7 +325,9 @@ enum { Opt_no_space_cache, Opt_recovery, Opt_skip_balance, Opt_check_integrity, Opt_check_integrity_including_extent_data, Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree, - Opt_commit_interval, + Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard, + Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow, + Opt_datasum, Opt_treelog, Opt_noinode_cache, Opt_err, }; @@ -332,8 +337,11 @@ static match_table_t tokens = { {Opt_subvolid, "subvolid=%s"}, {Opt_device, "device=%s"}, {Opt_nodatasum, "nodatasum"}, + {Opt_datasum, "datasum"}, {Opt_nodatacow, "nodatacow"}, + {Opt_datacow, "datacow"}, {Opt_nobarrier, "nobarrier"}, + {Opt_barrier, "barrier"}, {Opt_max_inline, "max_inline=%s"}, {Opt_alloc_start, "alloc_start=%s"}, {Opt_thread_pool, "thread_pool=%d"}, @@ -344,18 +352,25 @@ static match_table_t tokens = { {Opt_ssd, "ssd"}, {Opt_ssd_spread, "ssd_spread"}, {Opt_nossd, "nossd"}, + {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, {Opt_notreelog, "notreelog"}, + {Opt_treelog, "treelog"}, {Opt_flushoncommit, "flushoncommit"}, + {Opt_noflushoncommit, "noflushoncommit"}, {Opt_ratio, "metadata_ratio=%d"}, {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, {Opt_space_cache, "space_cache"}, {Opt_clear_cache, "clear_cache"}, {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, {Opt_enospc_debug, "enospc_debug"}, + {Opt_noenospc_debug, "noenospc_debug"}, {Opt_subvolrootid, "subvolrootid=%d"}, {Opt_defrag, "autodefrag"}, + {Opt_nodefrag, "noautodefrag"}, {Opt_inode_cache, "inode_cache"}, + {Opt_noinode_cache, "noinode_cache"}, {Opt_no_space_cache, "nospace_cache"}, {Opt_recovery, "recovery"}, {Opt_skip_balance, "skip_balance"}, @@ -368,6 +383,20 @@ static match_table_t tokens = { {Opt_err, NULL}, }; +#define btrfs_set_and_info(root, opt, fmt, args...) \ +{ \ + if (!btrfs_test_opt(root, opt)) \ + btrfs_info(root->fs_info, fmt, ##args); \ + btrfs_set_opt(root->fs_info->mount_opt, opt); \ +} + +#define btrfs_clear_and_info(root, opt, fmt, args...) \ +{ \ + if (btrfs_test_opt(root, opt)) \ + btrfs_info(root->fs_info, fmt, ##args); \ + btrfs_clear_opt(root->fs_info->mount_opt, opt); \ +} + /* * Regular mount options parser. Everything that is needed only when * reading in a new superblock is parsed here. @@ -383,6 +412,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) int ret = 0; char *compress_type; bool compress_force = false; + bool compress = false; cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); if (cache_gen) @@ -409,7 +439,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) token = match_token(p, tokens, args); switch (token) { case Opt_degraded: - printk(KERN_INFO "btrfs: allowing degraded mounts\n"); + btrfs_info(root->fs_info, "allowing degraded mounts"); btrfs_set_opt(info->mount_opt, DEGRADED); break; case Opt_subvol: @@ -422,28 +452,45 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) */ break; case Opt_nodatasum: - printk(KERN_INFO "btrfs: setting nodatasum\n"); - btrfs_set_opt(info->mount_opt, NODATASUM); + btrfs_set_and_info(root, NODATASUM, + "setting nodatasum"); + break; + case Opt_datasum: + if (btrfs_test_opt(root, NODATASUM)) { + if (btrfs_test_opt(root, NODATACOW)) + btrfs_info(root->fs_info, "setting datasum, datacow enabled"); + else + btrfs_info(root->fs_info, "setting datasum"); + } + btrfs_clear_opt(info->mount_opt, NODATACOW); + btrfs_clear_opt(info->mount_opt, NODATASUM); break; case Opt_nodatacow: - if (!btrfs_test_opt(root, COMPRESS) || - !btrfs_test_opt(root, FORCE_COMPRESS)) { - printk(KERN_INFO "btrfs: setting nodatacow, compression disabled\n"); - } else { - printk(KERN_INFO "btrfs: setting nodatacow\n"); + if (!btrfs_test_opt(root, NODATACOW)) { + if (!btrfs_test_opt(root, COMPRESS) || + !btrfs_test_opt(root, FORCE_COMPRESS)) { + btrfs_info(root->fs_info, + "setting nodatacow, compression disabled"); + } else { + btrfs_info(root->fs_info, "setting nodatacow"); + } } - info->compress_type = BTRFS_COMPRESS_NONE; btrfs_clear_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); btrfs_set_opt(info->mount_opt, NODATACOW); btrfs_set_opt(info->mount_opt, NODATASUM); break; + case Opt_datacow: + btrfs_clear_and_info(root, NODATACOW, + "setting datacow"); + break; case Opt_compress_force: case Opt_compress_force_type: compress_force = true; /* Fallthrough */ case Opt_compress: case Opt_compress_type: + compress = true; if (token == Opt_compress || token == Opt_compress_force || strcmp(args[0].from, "zlib") == 0) { @@ -461,7 +508,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_fs_incompat(info, COMPRESS_LZO); } else if (strncmp(args[0].from, "no", 2) == 0) { compress_type = "no"; - info->compress_type = BTRFS_COMPRESS_NONE; btrfs_clear_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); compress_force = false; @@ -471,33 +517,36 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) } if (compress_force) { - btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); - pr_info("btrfs: force %s compression\n", - compress_type); - } else - pr_info("btrfs: use %s compression\n", - compress_type); + btrfs_set_and_info(root, FORCE_COMPRESS, + "force %s compression", + compress_type); + } else if (compress) { + if (!btrfs_test_opt(root, COMPRESS)) + btrfs_info(root->fs_info, + "btrfs: use %s compression\n", + compress_type); + } break; case Opt_ssd: - printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); - btrfs_set_opt(info->mount_opt, SSD); + btrfs_set_and_info(root, SSD, + "use ssd allocation scheme"); break; case Opt_ssd_spread: - printk(KERN_INFO "btrfs: use spread ssd " - "allocation scheme\n"); - btrfs_set_opt(info->mount_opt, SSD); - btrfs_set_opt(info->mount_opt, SSD_SPREAD); + btrfs_set_and_info(root, SSD_SPREAD, + "use spread ssd allocation scheme"); break; case Opt_nossd: - printk(KERN_INFO "btrfs: not using ssd allocation " - "scheme\n"); - btrfs_set_opt(info->mount_opt, NOSSD); + btrfs_clear_and_info(root, NOSSD, + "not using ssd allocation scheme"); btrfs_clear_opt(info->mount_opt, SSD); - btrfs_clear_opt(info->mount_opt, SSD_SPREAD); + break; + case Opt_barrier: + btrfs_clear_and_info(root, NOBARRIER, + "turning on barriers"); break; case Opt_nobarrier: - printk(KERN_INFO "btrfs: turning off barriers\n"); - btrfs_set_opt(info->mount_opt, NOBARRIER); + btrfs_set_and_info(root, NOBARRIER, + "turning off barriers"); break; case Opt_thread_pool: ret = match_int(&args[0], &intarg); @@ -521,7 +570,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) info->max_inline, root->sectorsize); } - printk(KERN_INFO "btrfs: max_inline at %llu\n", + btrfs_info(root->fs_info, "max_inline at %llu", info->max_inline); } else { ret = -ENOMEM; @@ -535,24 +584,34 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) info->alloc_start = memparse(num, NULL); mutex_unlock(&info->chunk_mutex); kfree(num); - printk(KERN_INFO - "btrfs: allocations start at %llu\n", + btrfs_info(root->fs_info, "allocations start at %llu", info->alloc_start); } else { ret = -ENOMEM; goto out; } break; + case Opt_acl: + root->fs_info->sb->s_flags |= MS_POSIXACL; + break; case Opt_noacl: root->fs_info->sb->s_flags &= ~MS_POSIXACL; break; case Opt_notreelog: - printk(KERN_INFO "btrfs: disabling tree log\n"); - btrfs_set_opt(info->mount_opt, NOTREELOG); + btrfs_set_and_info(root, NOTREELOG, + "disabling tree log"); + break; + case Opt_treelog: + btrfs_clear_and_info(root, NOTREELOG, + "enabling tree log"); break; case Opt_flushoncommit: - printk(KERN_INFO "btrfs: turning on flush-on-commit\n"); - btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT); + btrfs_set_and_info(root, FLUSHONCOMMIT, + "turning on flush-on-commit"); + break; + case Opt_noflushoncommit: + btrfs_clear_and_info(root, FLUSHONCOMMIT, + "turning off flush-on-commit"); break; case Opt_ratio: ret = match_int(&args[0], &intarg); @@ -560,7 +619,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) goto out; } else if (intarg >= 0) { info->metadata_ratio = intarg; - printk(KERN_INFO "btrfs: metadata ratio %d\n", + btrfs_info(root->fs_info, "metadata ratio %d", info->metadata_ratio); } else { ret = -EINVAL; @@ -568,25 +627,35 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) } break; case Opt_discard: - btrfs_set_opt(info->mount_opt, DISCARD); + btrfs_set_and_info(root, DISCARD, + "turning on discard"); + break; + case Opt_nodiscard: + btrfs_clear_and_info(root, DISCARD, + "turning off discard"); break; case Opt_space_cache: - btrfs_set_opt(info->mount_opt, SPACE_CACHE); + btrfs_set_and_info(root, SPACE_CACHE, + "enabling disk space caching"); break; case Opt_rescan_uuid_tree: btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE); break; case Opt_no_space_cache: - printk(KERN_INFO "btrfs: disabling disk space caching\n"); - btrfs_clear_opt(info->mount_opt, SPACE_CACHE); + btrfs_clear_and_info(root, SPACE_CACHE, + "disabling disk space caching"); break; case Opt_inode_cache: - printk(KERN_INFO "btrfs: enabling inode map caching\n"); - btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); + btrfs_set_and_info(root, CHANGE_INODE_CACHE, + "enabling inode map caching"); + break; + case Opt_noinode_cache: + btrfs_clear_and_info(root, CHANGE_INODE_CACHE, + "disabling inode map caching"); break; case Opt_clear_cache: - printk(KERN_INFO "btrfs: force clearing of disk cache\n"); - btrfs_set_opt(info->mount_opt, CLEAR_CACHE); + btrfs_set_and_info(root, CLEAR_CACHE, + "force clearing of disk cache"); break; case Opt_user_subvol_rm_allowed: btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED); @@ -594,12 +663,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_enospc_debug: btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG); break; + case Opt_noenospc_debug: + btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG); + break; case Opt_defrag: - printk(KERN_INFO "btrfs: enabling auto defrag\n"); - btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); + btrfs_set_and_info(root, AUTO_DEFRAG, + "enabling auto defrag"); + break; + case Opt_nodefrag: + btrfs_clear_and_info(root, AUTO_DEFRAG, + "disabling auto defrag"); break; case Opt_recovery: - printk(KERN_INFO "btrfs: enabling auto recovery\n"); + btrfs_info(root->fs_info, "enabling auto recovery"); btrfs_set_opt(info->mount_opt, RECOVERY); break; case Opt_skip_balance: @@ -607,14 +683,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) break; #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY case Opt_check_integrity_including_extent_data: - printk(KERN_INFO "btrfs: enabling check integrity" - " including extent data\n"); + btrfs_info(root->fs_info, + "enabling check integrity including extent data"); btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA); btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); break; case Opt_check_integrity: - printk(KERN_INFO "btrfs: enabling check integrity\n"); + btrfs_info(root->fs_info, "enabling check integrity"); btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); break; case Opt_check_integrity_print_mask: @@ -623,8 +699,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) goto out; } else if (intarg >= 0) { info->check_integrity_print_mask = intarg; - printk(KERN_INFO "btrfs:" - " check_integrity_print_mask 0x%x\n", + btrfs_info(root->fs_info, "check_integrity_print_mask 0x%x", info->check_integrity_print_mask); } else { ret = -EINVAL; @@ -635,8 +710,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_check_integrity_including_extent_data: case Opt_check_integrity: case Opt_check_integrity_print_mask: - printk(KERN_ERR "btrfs: support for check_integrity*" - " not compiled in!\n"); + btrfs_err(root->fs_info, + "support for check_integrity* not compiled in!"); ret = -EINVAL; goto out; #endif @@ -656,28 +731,24 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) intarg = 0; ret = match_int(&args[0], &intarg); if (ret < 0) { - printk(KERN_ERR - "btrfs: invalid commit interval\n"); + btrfs_err(root->fs_info, "invalid commit interval"); ret = -EINVAL; goto out; } if (intarg > 0) { if (intarg > 300) { - printk(KERN_WARNING - "btrfs: excessive commit interval %d\n", + btrfs_warn(root->fs_info, "excessive commit interval %d", intarg); } info->commit_interval = intarg; } else { - printk(KERN_INFO - "btrfs: using default commit interval %ds\n", + btrfs_info(root->fs_info, "using default commit interval %ds", BTRFS_DEFAULT_COMMIT_INTERVAL); info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; } break; case Opt_err: - printk(KERN_INFO "btrfs: unrecognized mount option " - "'%s'\n", p); + btrfs_info(root->fs_info, "unrecognized mount option '%s'", p); ret = -EINVAL; goto out; default: @@ -686,7 +757,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) } out: if (!ret && btrfs_test_opt(root, SPACE_CACHE)) - printk(KERN_INFO "btrfs: disk space caching is enabled\n"); + btrfs_info(root->fs_info, "disk space caching is enabled"); kfree(orig); return ret; } @@ -749,7 +820,8 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, break; case Opt_subvolrootid: printk(KERN_WARNING - "btrfs: 'subvolrootid' mount option is deprecated and has no effect\n"); + "BTRFS: 'subvolrootid' mount option is deprecated and has " + "no effect\n"); break; case Opt_device: device_name = match_strdup(&args[0]); @@ -878,7 +950,7 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_flags |= MS_I_VERSION; err = open_ctree(sb, fs_devices, (char *)data); if (err) { - printk("btrfs: open_ctree failed\n"); + printk(KERN_ERR "BTRFS: open_ctree failed\n"); return err; } @@ -1116,7 +1188,7 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags, dput(root); root = ERR_PTR(-EINVAL); deactivate_locked_super(s); - printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n", + printk(KERN_ERR "BTRFS: '%s' is not a valid subvolume\n", subvol_name); } @@ -1241,7 +1313,7 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, fs_info->thread_pool_size = new_pool_size; - printk(KERN_INFO "btrfs: resize thread pool %d -> %d\n", + btrfs_info(fs_info, "resize thread pool %d -> %d", old_pool_size, new_pool_size); btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size); @@ -1347,7 +1419,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) } else { if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) { btrfs_err(fs_info, - "Remounting read-write after error is not allowed\n"); + "Remounting read-write after error is not allowed"); ret = -EINVAL; goto restore; } @@ -1359,8 +1431,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (fs_info->fs_devices->missing_devices > fs_info->num_tolerated_disk_barrier_failures && !(*flags & MS_RDONLY)) { - printk(KERN_WARNING - "Btrfs: too many missing devices, writeable remount is not allowed\n"); + btrfs_warn(fs_info, + "too many missing devices, writeable remount is not allowed"); ret = -EACCES; goto restore; } @@ -1385,16 +1457,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) ret = btrfs_resume_dev_replace_async(fs_info); if (ret) { - pr_warn("btrfs: failed to resume dev_replace\n"); + btrfs_warn(fs_info, "failed to resume dev_replace"); goto restore; } if (!fs_info->uuid_root) { - pr_info("btrfs: creating UUID tree\n"); + btrfs_info(fs_info, "creating UUID tree"); ret = btrfs_create_uuid_tree(fs_info); if (ret) { - pr_warn("btrfs: failed to create the uuid tree" - "%d\n", ret); + btrfs_warn(fs_info, "failed to create the UUID tree %d", ret); goto restore; } } @@ -1774,7 +1845,7 @@ static int btrfs_interface_init(void) static void btrfs_interface_exit(void) { if (misc_deregister(&btrfs_misc) < 0) - printk(KERN_INFO "btrfs: misc_deregister failed for control device\n"); + printk(KERN_INFO "BTRFS: misc_deregister failed for control device\n"); } static void btrfs_print_info(void) @@ -1819,10 +1890,16 @@ static int __init init_btrfs_fs(void) { int err; - err = btrfs_init_sysfs(); + err = btrfs_hash_init(); if (err) return err; + btrfs_props_init(); + + err = btrfs_init_sysfs(); + if (err) + goto free_hash; + btrfs_init_compress(); err = btrfs_init_cachep(); @@ -1896,6 +1973,8 @@ free_cachep: free_compress: btrfs_exit_compress(); btrfs_exit_sysfs(); +free_hash: + btrfs_hash_exit(); return err; } @@ -1914,6 +1993,7 @@ static void __exit exit_btrfs_fs(void) btrfs_exit_sysfs(); btrfs_cleanup_fs_uuids(); btrfs_exit_compress(); + btrfs_hash_exit(); } module_init(init_btrfs_fs) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 5b326cd..782374d 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -22,24 +22,641 @@ #include <linux/completion.h> #include <linux/buffer_head.h> #include <linux/kobject.h> +#include <linux/bug.h> +#include <linux/genhd.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" +#include "sysfs.h" +#include "volumes.h" + +static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); + +static u64 get_features(struct btrfs_fs_info *fs_info, + enum btrfs_feature_set set) +{ + struct btrfs_super_block *disk_super = fs_info->super_copy; + if (set == FEAT_COMPAT) + return btrfs_super_compat_flags(disk_super); + else if (set == FEAT_COMPAT_RO) + return btrfs_super_compat_ro_flags(disk_super); + else + return btrfs_super_incompat_flags(disk_super); +} + +static void set_features(struct btrfs_fs_info *fs_info, + enum btrfs_feature_set set, u64 features) +{ + struct btrfs_super_block *disk_super = fs_info->super_copy; + if (set == FEAT_COMPAT) + btrfs_set_super_compat_flags(disk_super, features); + else if (set == FEAT_COMPAT_RO) + btrfs_set_super_compat_ro_flags(disk_super, features); + else + btrfs_set_super_incompat_flags(disk_super, features); +} + +static int can_modify_feature(struct btrfs_feature_attr *fa) +{ + int val = 0; + u64 set, clear; + switch (fa->feature_set) { + case FEAT_COMPAT: + set = BTRFS_FEATURE_COMPAT_SAFE_SET; + clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR; + break; + case FEAT_COMPAT_RO: + set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET; + clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR; + break; + case FEAT_INCOMPAT: + set = BTRFS_FEATURE_INCOMPAT_SAFE_SET; + clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR; + break; + default: + printk(KERN_WARNING "btrfs: sysfs: unknown feature set %d\n", + fa->feature_set); + return 0; + } + + if (set & fa->feature_bit) + val |= 1; + if (clear & fa->feature_bit) + val |= 2; + + return val; +} + +static ssize_t btrfs_feature_attr_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + int val = 0; + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a); + if (fs_info) { + u64 features = get_features(fs_info, fa->feature_set); + if (features & fa->feature_bit) + val = 1; + } else + val = can_modify_feature(fa); + + return snprintf(buf, PAGE_SIZE, "%d\n", val); +} + +static ssize_t btrfs_feature_attr_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t count) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a); + struct btrfs_trans_handle *trans; + u64 features, set, clear; + unsigned long val; + int ret; + + fs_info = to_fs_info(kobj); + if (!fs_info) + return -EPERM; + + ret = kstrtoul(skip_spaces(buf), 0, &val); + if (ret) + return ret; + + if (fa->feature_set == FEAT_COMPAT) { + set = BTRFS_FEATURE_COMPAT_SAFE_SET; + clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR; + } else if (fa->feature_set == FEAT_COMPAT_RO) { + set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET; + clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR; + } else { + set = BTRFS_FEATURE_INCOMPAT_SAFE_SET; + clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR; + } + + features = get_features(fs_info, fa->feature_set); + + /* Nothing to do */ + if ((val && (features & fa->feature_bit)) || + (!val && !(features & fa->feature_bit))) + return count; + + if ((val && !(set & fa->feature_bit)) || + (!val && !(clear & fa->feature_bit))) { + btrfs_info(fs_info, + "%sabling feature %s on mounted fs is not supported.", + val ? "En" : "Dis", fa->kobj_attr.attr.name); + return -EPERM; + } + + btrfs_info(fs_info, "%s %s feature flag", + val ? "Setting" : "Clearing", fa->kobj_attr.attr.name); + + trans = btrfs_start_transaction(fs_info->fs_root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + spin_lock(&fs_info->super_lock); + features = get_features(fs_info, fa->feature_set); + if (val) + features |= fa->feature_bit; + else + features &= ~fa->feature_bit; + set_features(fs_info, fa->feature_set, features); + spin_unlock(&fs_info->super_lock); + + ret = btrfs_commit_transaction(trans, fs_info->fs_root); + if (ret) + return ret; + + return count; +} + +static umode_t btrfs_feature_visible(struct kobject *kobj, + struct attribute *attr, int unused) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + umode_t mode = attr->mode; + + if (fs_info) { + struct btrfs_feature_attr *fa; + u64 features; + + fa = attr_to_btrfs_feature_attr(attr); + features = get_features(fs_info, fa->feature_set); + + if (can_modify_feature(fa)) + mode |= S_IWUSR; + else if (!(features & fa->feature_bit)) + mode = 0; + } + + return mode; +} + +BTRFS_FEAT_ATTR_INCOMPAT(mixed_backref, MIXED_BACKREF); +BTRFS_FEAT_ATTR_INCOMPAT(default_subvol, DEFAULT_SUBVOL); +BTRFS_FEAT_ATTR_INCOMPAT(mixed_groups, MIXED_GROUPS); +BTRFS_FEAT_ATTR_INCOMPAT(compress_lzo, COMPRESS_LZO); +BTRFS_FEAT_ATTR_INCOMPAT(big_metadata, BIG_METADATA); +BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF); +BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56); +BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA); +BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); + +static struct attribute *btrfs_supported_feature_attrs[] = { + BTRFS_FEAT_ATTR_PTR(mixed_backref), + BTRFS_FEAT_ATTR_PTR(default_subvol), + BTRFS_FEAT_ATTR_PTR(mixed_groups), + BTRFS_FEAT_ATTR_PTR(compress_lzo), + BTRFS_FEAT_ATTR_PTR(big_metadata), + BTRFS_FEAT_ATTR_PTR(extended_iref), + BTRFS_FEAT_ATTR_PTR(raid56), + BTRFS_FEAT_ATTR_PTR(skinny_metadata), + BTRFS_FEAT_ATTR_PTR(no_holes), + NULL +}; + +static const struct attribute_group btrfs_feature_attr_group = { + .name = "features", + .is_visible = btrfs_feature_visible, + .attrs = btrfs_supported_feature_attrs, +}; + +static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf) +{ + u64 val; + if (lock) + spin_lock(lock); + val = *value_ptr; + if (lock) + spin_unlock(lock); + return snprintf(buf, PAGE_SIZE, "%llu\n", val); +} + +static ssize_t global_rsv_size_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent); + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; + return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf); +} +BTRFS_ATTR(global_rsv_size, 0444, global_rsv_size_show); + +static ssize_t global_rsv_reserved_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent); + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; + return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf); +} +BTRFS_ATTR(global_rsv_reserved, 0444, global_rsv_reserved_show); + +#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj) + +static ssize_t raid_bytes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf); +BTRFS_RAID_ATTR(total_bytes, raid_bytes_show); +BTRFS_RAID_ATTR(used_bytes, raid_bytes_show); + +static ssize_t raid_bytes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) + +{ + struct btrfs_space_info *sinfo = to_space_info(kobj->parent); + struct btrfs_block_group_cache *block_group; + int index = kobj - sinfo->block_group_kobjs; + u64 val = 0; + + down_read(&sinfo->groups_sem); + list_for_each_entry(block_group, &sinfo->block_groups[index], list) { + if (&attr->attr == BTRFS_RAID_ATTR_PTR(total_bytes)) + val += block_group->key.offset; + else + val += btrfs_block_group_used(&block_group->item); + } + up_read(&sinfo->groups_sem); + return snprintf(buf, PAGE_SIZE, "%llu\n", val); +} + +static struct attribute *raid_attributes[] = { + BTRFS_RAID_ATTR_PTR(total_bytes), + BTRFS_RAID_ATTR_PTR(used_bytes), + NULL +}; + +static void release_raid_kobj(struct kobject *kobj) +{ + kobject_put(kobj->parent); +} + +struct kobj_type btrfs_raid_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = release_raid_kobj, + .default_attrs = raid_attributes, +}; + +#define SPACE_INFO_ATTR(field) \ +static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \ + struct kobj_attribute *a, \ + char *buf) \ +{ \ + struct btrfs_space_info *sinfo = to_space_info(kobj); \ + return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \ +} \ +BTRFS_ATTR(field, 0444, btrfs_space_info_show_##field) + +static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_space_info *sinfo = to_space_info(kobj); + s64 val = percpu_counter_sum(&sinfo->total_bytes_pinned); + return snprintf(buf, PAGE_SIZE, "%lld\n", val); +} + +SPACE_INFO_ATTR(flags); +SPACE_INFO_ATTR(total_bytes); +SPACE_INFO_ATTR(bytes_used); +SPACE_INFO_ATTR(bytes_pinned); +SPACE_INFO_ATTR(bytes_reserved); +SPACE_INFO_ATTR(bytes_may_use); +SPACE_INFO_ATTR(disk_used); +SPACE_INFO_ATTR(disk_total); +BTRFS_ATTR(total_bytes_pinned, 0444, btrfs_space_info_show_total_bytes_pinned); + +static struct attribute *space_info_attrs[] = { + BTRFS_ATTR_PTR(flags), + BTRFS_ATTR_PTR(total_bytes), + BTRFS_ATTR_PTR(bytes_used), + BTRFS_ATTR_PTR(bytes_pinned), + BTRFS_ATTR_PTR(bytes_reserved), + BTRFS_ATTR_PTR(bytes_may_use), + BTRFS_ATTR_PTR(disk_used), + BTRFS_ATTR_PTR(disk_total), + BTRFS_ATTR_PTR(total_bytes_pinned), + NULL, +}; + +static void space_info_release(struct kobject *kobj) +{ + struct btrfs_space_info *sinfo = to_space_info(kobj); + percpu_counter_destroy(&sinfo->total_bytes_pinned); + kfree(sinfo); +} + +struct kobj_type space_info_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = space_info_release, + .default_attrs = space_info_attrs, +}; + +static const struct attribute *allocation_attrs[] = { + BTRFS_ATTR_PTR(global_rsv_reserved), + BTRFS_ATTR_PTR(global_rsv_size), + NULL, +}; + +static ssize_t btrfs_label_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + return snprintf(buf, PAGE_SIZE, "%s\n", fs_info->super_copy->label); +} + +static ssize_t btrfs_label_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + struct btrfs_trans_handle *trans; + struct btrfs_root *root = fs_info->fs_root; + int ret; + + if (len >= BTRFS_LABEL_SIZE) { + pr_err("BTRFS: unable to set label with more than %d bytes\n", + BTRFS_LABEL_SIZE - 1); + return -EINVAL; + } + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + spin_lock(&root->fs_info->super_lock); + strcpy(fs_info->super_copy->label, buf); + spin_unlock(&root->fs_info->super_lock); + ret = btrfs_commit_transaction(trans, root); + + if (!ret) + return len; + + return ret; +} +BTRFS_ATTR_RW(label, 0644, btrfs_label_show, btrfs_label_store); + +static struct attribute *btrfs_attrs[] = { + BTRFS_ATTR_PTR(label), + NULL, +}; + +static void btrfs_release_super_kobj(struct kobject *kobj) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + complete(&fs_info->kobj_unregister); +} + +static struct kobj_type btrfs_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = btrfs_release_super_kobj, + .default_attrs = btrfs_attrs, +}; + +static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) +{ + if (kobj->ktype != &btrfs_ktype) + return NULL; + return container_of(kobj, struct btrfs_fs_info, super_kobj); +} + +#define NUM_FEATURE_BITS 64 +static char btrfs_unknown_feature_names[3][NUM_FEATURE_BITS][13]; +static struct btrfs_feature_attr btrfs_feature_attrs[3][NUM_FEATURE_BITS]; + +static u64 supported_feature_masks[3] = { + [FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP, + [FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP, + [FEAT_INCOMPAT] = BTRFS_FEATURE_INCOMPAT_SUPP, +}; + +static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) +{ + int set; + + for (set = 0; set < FEAT_MAX; set++) { + int i; + struct attribute *attrs[2]; + struct attribute_group agroup = { + .name = "features", + .attrs = attrs, + }; + u64 features = get_features(fs_info, set); + features &= ~supported_feature_masks[set]; + + if (!features) + continue; + + attrs[1] = NULL; + for (i = 0; i < NUM_FEATURE_BITS; i++) { + struct btrfs_feature_attr *fa; + + if (!(features & (1ULL << i))) + continue; + + fa = &btrfs_feature_attrs[set][i]; + attrs[0] = &fa->kobj_attr.attr; + if (add) { + int ret; + ret = sysfs_merge_group(&fs_info->super_kobj, + &agroup); + if (ret) + return ret; + } else + sysfs_unmerge_group(&fs_info->super_kobj, + &agroup); + } + + } + return 0; +} + +static void __btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) +{ + kobject_del(&fs_info->super_kobj); + kobject_put(&fs_info->super_kobj); + wait_for_completion(&fs_info->kobj_unregister); +} + +void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) +{ + if (fs_info->space_info_kobj) { + sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs); + kobject_del(fs_info->space_info_kobj); + kobject_put(fs_info->space_info_kobj); + } + kobject_del(fs_info->device_dir_kobj); + kobject_put(fs_info->device_dir_kobj); + addrm_unknown_feature_attrs(fs_info, false); + sysfs_remove_group(&fs_info->super_kobj, &btrfs_feature_attr_group); + __btrfs_sysfs_remove_one(fs_info); +} + +const char * const btrfs_feature_set_names[3] = { + [FEAT_COMPAT] = "compat", + [FEAT_COMPAT_RO] = "compat_ro", + [FEAT_INCOMPAT] = "incompat", +}; + +char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags) +{ + size_t bufsize = 4096; /* safe max, 64 names * 64 bytes */ + int len = 0; + int i; + char *str; + + str = kmalloc(bufsize, GFP_KERNEL); + if (!str) + return str; + + for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) { + const char *name; + + if (!(flags & (1ULL << i))) + continue; + + name = btrfs_feature_attrs[set][i].kobj_attr.attr.name; + len += snprintf(str + len, bufsize - len, "%s%s", + len ? "," : "", name); + } + + return str; +} + +static void init_feature_attrs(void) +{ + struct btrfs_feature_attr *fa; + int set, i; + + BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names) != + ARRAY_SIZE(btrfs_feature_attrs)); + BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names[0]) != + ARRAY_SIZE(btrfs_feature_attrs[0])); + + memset(btrfs_feature_attrs, 0, sizeof(btrfs_feature_attrs)); + memset(btrfs_unknown_feature_names, 0, + sizeof(btrfs_unknown_feature_names)); + + for (i = 0; btrfs_supported_feature_attrs[i]; i++) { + struct btrfs_feature_attr *sfa; + struct attribute *a = btrfs_supported_feature_attrs[i]; + int bit; + sfa = attr_to_btrfs_feature_attr(a); + bit = ilog2(sfa->feature_bit); + fa = &btrfs_feature_attrs[sfa->feature_set][bit]; + + fa->kobj_attr.attr.name = sfa->kobj_attr.attr.name; + } + + for (set = 0; set < FEAT_MAX; set++) { + for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) { + char *name = btrfs_unknown_feature_names[set][i]; + fa = &btrfs_feature_attrs[set][i]; + + if (fa->kobj_attr.attr.name) + continue; + + snprintf(name, 13, "%s:%u", + btrfs_feature_set_names[set], i); + + fa->kobj_attr.attr.name = name; + fa->kobj_attr.attr.mode = S_IRUGO; + fa->feature_set = set; + fa->feature_bit = 1ULL << i; + } + } +} + +static int add_device_membership(struct btrfs_fs_info *fs_info) +{ + int error = 0; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *dev; + + fs_info->device_dir_kobj = kobject_create_and_add("devices", + &fs_info->super_kobj); + if (!fs_info->device_dir_kobj) + return -ENOMEM; + + list_for_each_entry(dev, &fs_devices->devices, dev_list) { + struct hd_struct *disk = dev->bdev->bd_part; + struct kobject *disk_kobj = &part_to_dev(disk)->kobj; + + error = sysfs_create_link(fs_info->device_dir_kobj, + disk_kobj, disk_kobj->name); + if (error) + break; + } + + return error; +} /* /sys/fs/btrfs/ entry */ static struct kset *btrfs_kset; +int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) +{ + int error; + + init_completion(&fs_info->kobj_unregister); + fs_info->super_kobj.kset = btrfs_kset; + error = kobject_init_and_add(&fs_info->super_kobj, &btrfs_ktype, NULL, + "%pU", fs_info->fsid); + if (error) + return error; + + error = sysfs_create_group(&fs_info->super_kobj, + &btrfs_feature_attr_group); + if (error) { + __btrfs_sysfs_remove_one(fs_info); + return error; + } + + error = addrm_unknown_feature_attrs(fs_info, true); + if (error) + goto failure; + + error = add_device_membership(fs_info); + if (error) + goto failure; + + fs_info->space_info_kobj = kobject_create_and_add("allocation", + &fs_info->super_kobj); + if (!fs_info->space_info_kobj) { + error = -ENOMEM; + goto failure; + } + + error = sysfs_create_files(fs_info->space_info_kobj, allocation_attrs); + if (error) + goto failure; + + return 0; +failure: + btrfs_sysfs_remove_one(fs_info); + return error; +} + int btrfs_init_sysfs(void) { + int ret; btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj); if (!btrfs_kset) return -ENOMEM; + + init_feature_attrs(); + + ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); + if (ret) { + kset_unregister(btrfs_kset); + return ret; + } + return 0; } void btrfs_exit_sysfs(void) { + sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); kset_unregister(btrfs_kset); } diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h new file mode 100644 index 0000000..f3cea37 --- /dev/null +++ b/fs/btrfs/sysfs.h @@ -0,0 +1,64 @@ +#ifndef _BTRFS_SYSFS_H_ +#define _BTRFS_SYSFS_H_ + +enum btrfs_feature_set { + FEAT_COMPAT, + FEAT_COMPAT_RO, + FEAT_INCOMPAT, + FEAT_MAX +}; + +#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \ +{ \ + .attr = { .name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ +} + +#define BTRFS_ATTR_RW(_name, _mode, _show, _store) \ +static struct kobj_attribute btrfs_attr_##_name = \ + __INIT_KOBJ_ATTR(_name, _mode, _show, _store) +#define BTRFS_ATTR(_name, _mode, _show) \ + BTRFS_ATTR_RW(_name, _mode, _show, NULL) +#define BTRFS_ATTR_PTR(_name) (&btrfs_attr_##_name.attr) + +#define BTRFS_RAID_ATTR(_name, _show) \ +static struct kobj_attribute btrfs_raid_attr_##_name = \ + __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) +#define BTRFS_RAID_ATTR_PTR(_name) (&btrfs_raid_attr_##_name.attr) + + +struct btrfs_feature_attr { + struct kobj_attribute kobj_attr; + enum btrfs_feature_set feature_set; + u64 feature_bit; +}; + +#define BTRFS_FEAT_ATTR(_name, _feature_set, _prefix, _feature_bit) \ +static struct btrfs_feature_attr btrfs_attr_##_name = { \ + .kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, \ + btrfs_feature_attr_show, \ + btrfs_feature_attr_store), \ + .feature_set = _feature_set, \ + .feature_bit = _prefix ##_## _feature_bit, \ +} +#define BTRFS_FEAT_ATTR_PTR(_name) (&btrfs_attr_##_name.kobj_attr.attr) + +#define BTRFS_FEAT_ATTR_COMPAT(name, feature) \ + BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature) +#define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \ + BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT, feature) +#define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \ + BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature) + +/* convert from attribute */ +#define to_btrfs_feature_attr(a) \ + container_of(a, struct btrfs_feature_attr, kobj_attr) +#define attr_to_btrfs_attr(a) container_of(a, struct kobj_attribute, attr) +#define attr_to_btrfs_feature_attr(a) \ + to_btrfs_feature_attr(attr_to_btrfs_attr(a)) +char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); +extern const char * const btrfs_feature_set_names[3]; +extern struct kobj_type space_info_ktype; +extern struct kobj_type btrfs_raid_ktype; +#endif /* _BTRFS_SYSFS_H_ */ diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index b353bc8..312560a 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -21,7 +21,7 @@ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -#define test_msg(fmt, ...) pr_info("btrfs: selftest: " fmt, ##__VA_ARGS__) +#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__) int btrfs_test_free_space_cache(void); int btrfs_test_extent_buffer_operations(void); diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 6fc8201..c8d9ddf 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -101,7 +101,7 @@ static int test_extents(struct btrfs_block_group_cache *cache) ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096); if (ret) { - test_msg("Error removing middle peice %d\n", ret); + test_msg("Error removing middle piece %d\n", ret); return ret; } @@ -266,7 +266,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) } if (test_check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) { - test_msg("Left over peices after removing overlapping\n"); + test_msg("Left over pieces after removing overlapping\n"); return -1; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c6a872a..34cd831 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -62,7 +62,7 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) WARN_ON(atomic_read(&transaction->use_count) == 0); if (atomic_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); - WARN_ON(transaction->delayed_refs.root.rb_node); + WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root)); while (!list_empty(&transaction->pending_chunks)) { struct extent_map *em; @@ -183,8 +183,8 @@ loop: atomic_set(&cur_trans->use_count, 2); cur_trans->start_time = get_seconds(); - cur_trans->delayed_refs.root = RB_ROOT; - cur_trans->delayed_refs.num_entries = 0; + cur_trans->delayed_refs.href_root = RB_ROOT; + atomic_set(&cur_trans->delayed_refs.num_entries, 0); cur_trans->delayed_refs.num_heads_ready = 0; cur_trans->delayed_refs.num_heads = 0; cur_trans->delayed_refs.flushing = 0; @@ -196,17 +196,14 @@ loop: */ smp_mb(); if (!list_empty(&fs_info->tree_mod_seq_list)) - WARN(1, KERN_ERR "btrfs: tree_mod_seq_list not empty when " + WARN(1, KERN_ERR "BTRFS: tree_mod_seq_list not empty when " "creating a fresh transaction\n"); if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) - WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when " + WARN(1, KERN_ERR "BTRFS: tree_mod_log rb tree not empty when " "creating a fresh transaction\n"); atomic64_set(&fs_info->tree_mod_seq, 0); spin_lock_init(&cur_trans->delayed_refs.lock); - atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0); - atomic_set(&cur_trans->delayed_refs.ref_seq, 0); - init_waitqueue_head(&cur_trans->delayed_refs.wait); INIT_LIST_HEAD(&cur_trans->pending_snapshots); INIT_LIST_HEAD(&cur_trans->ordered_operations); @@ -472,6 +469,7 @@ again: h->type = type; h->allocating_chunk = false; h->reloc_reserved = false; + h->sync = false; INIT_LIST_HEAD(&h->qgroup_ref_list); INIT_LIST_HEAD(&h->new_bgs); @@ -647,7 +645,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { if (root->fs_info->global_block_rsv.space_info->full && - btrfs_should_throttle_delayed_refs(trans, root)) + btrfs_check_space_for_delayed_refs(trans, root)) return 1; return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5); @@ -711,8 +709,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_create_pending_block_groups(trans, root); trans->delayed_ref_updates = 0; - if (btrfs_should_throttle_delayed_refs(trans, root)) { - cur = max_t(unsigned long, cur, 1); + if (!trans->sync && btrfs_should_throttle_delayed_refs(trans, root)) { + cur = max_t(unsigned long, cur, 32); trans->delayed_ref_updates = 0; btrfs_run_delayed_refs(trans, root, cur); } @@ -788,12 +786,6 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, return __btrfs_end_transaction(trans, root, 1); } -int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - return __btrfs_end_transaction(trans, root, 1); -} - /* * when btree blocks are allocated, they have some corresponding bits set for * them in one of two extent_io trees. This is used to make sure all of @@ -1105,7 +1097,7 @@ int btrfs_defrag_root(struct btrfs_root *root) break; if (btrfs_defrag_cancelled(root->fs_info)) { - printk(KERN_DEBUG "btrfs: defrag_root cancelled\n"); + pr_debug("BTRFS: defrag_root cancelled\n"); ret = -EAGAIN; break; } @@ -1746,6 +1738,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, goto cleanup_transaction; btrfs_wait_delalloc_flush(root->fs_info); + + btrfs_scrub_pause(root); /* * Ok now we need to make sure to block out any other joins while we * commit the transaction. We could have started a join before setting @@ -1810,7 +1804,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, WARN_ON(cur_trans != trans->transaction); - btrfs_scrub_pause(root); /* btrfs_commit_tree_roots is responsible for getting the * various roots consistent with each other. Every pointer * in the tree of tree roots has to point to the most up to date @@ -1833,6 +1826,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, goto cleanup_transaction; } + /* + * Since the transaction is done, we should set the inode map cache flag + * before any other comming transaction. + */ + if (btrfs_test_opt(root, CHANGE_INODE_CACHE)) + btrfs_set_opt(root->fs_info->mount_opt, INODE_MAP_CACHE); + else + btrfs_clear_opt(root->fs_info->mount_opt, INODE_MAP_CACHE); + /* commit_fs_roots gets rid of all the tree log roots, it is now * safe to free the root of tree log roots */ @@ -1975,10 +1977,23 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) } root = list_first_entry(&fs_info->dead_roots, struct btrfs_root, root_list); + /* + * Make sure root is not involved in send, + * if we fail with first root, we return + * directly rather than continue. + */ + spin_lock(&root->root_item_lock); + if (root->send_in_progress) { + spin_unlock(&fs_info->trans_lock); + spin_unlock(&root->root_item_lock); + return 0; + } + spin_unlock(&root->root_item_lock); + list_del_init(&root->root_list); spin_unlock(&fs_info->trans_lock); - pr_debug("btrfs: cleaner removing %llu\n", root->objectid); + pr_debug("BTRFS: cleaner removing %llu\n", root->objectid); btrfs_kill_all_delayed_nodes(root); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 7657d11..6ac037e 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -93,6 +93,7 @@ struct btrfs_trans_handle { short adding_csums; bool allocating_chunk; bool reloc_reserved; + bool sync; unsigned int type; /* * this root is only needed to validate that the root passed to @@ -154,8 +155,6 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, int wait_for_unblock); int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans, - struct btrfs_root *root); int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_throttle(struct btrfs_root *root); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9f7fc51..39d83da 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -570,7 +570,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, if (btrfs_file_extent_disk_bytenr(eb, item) == 0) nbytes = 0; } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - size = btrfs_file_extent_inline_len(eb, item); + size = btrfs_file_extent_inline_len(eb, slot, item); nbytes = btrfs_file_extent_ram_bytes(eb, item); extent_end = ALIGN(start + size, root->sectorsize); } else { @@ -1238,7 +1238,8 @@ static int insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset) { int ret; - ret = btrfs_find_orphan_item(root, offset); + ret = btrfs_find_item(root, NULL, BTRFS_ORPHAN_OBJECTID, + offset, BTRFS_ORPHAN_ITEM_KEY, NULL); if (ret > 0) ret = btrfs_insert_orphan_item(trans, root, offset); return ret; @@ -3194,7 +3195,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans, static noinline int copy_items(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_path *dst_path, - struct extent_buffer *src, + struct btrfs_path *src_path, u64 *last_extent, int start_slot, int nr, int inode_only) { unsigned long src_offset; @@ -3202,6 +3203,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_root *log = BTRFS_I(inode)->root->log_root; struct btrfs_file_extent_item *extent; struct btrfs_inode_item *inode_item; + struct extent_buffer *src = src_path->nodes[0]; + struct btrfs_key first_key, last_key, key; int ret; struct btrfs_key *ins_keys; u32 *ins_sizes; @@ -3209,6 +3212,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, int i; struct list_head ordered_sums; int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + bool has_extents = false; + bool need_find_last_extent = (*last_extent == 0); + bool done = false; INIT_LIST_HEAD(&ordered_sums); @@ -3217,6 +3223,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, if (!ins_data) return -ENOMEM; + first_key.objectid = (u64)-1; + ins_sizes = (u32 *)ins_data; ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); @@ -3237,6 +3245,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, src_offset = btrfs_item_ptr_offset(src, start_slot + i); + if ((i == (nr - 1))) + last_key = ins_keys[i]; + if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_path->slots[0], @@ -3248,6 +3259,21 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, src_offset, ins_sizes[i]); } + /* + * We set need_find_last_extent here in case we know we were + * processing other items and then walk into the first extent in + * the inode. If we don't hit an extent then nothing changes, + * we'll do the last search the next time around. + */ + if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { + has_extents = true; + if (need_find_last_extent && + first_key.objectid == (u64)-1) + first_key = ins_keys[i]; + } else { + need_find_last_extent = false; + } + /* take a reference on file data extents so that truncates * or deletes of this inode don't have to relog the inode * again @@ -3312,6 +3338,128 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, list_del(&sums->list); kfree(sums); } + + if (!has_extents) + return ret; + + /* + * Because we use btrfs_search_forward we could skip leaves that were + * not modified and then assume *last_extent is valid when it really + * isn't. So back up to the previous leaf and read the end of the last + * extent before we go and fill in holes. + */ + if (need_find_last_extent) { + u64 len; + + ret = btrfs_prev_leaf(BTRFS_I(inode)->root, src_path); + if (ret < 0) + return ret; + if (ret) + goto fill_holes; + if (src_path->slots[0]) + src_path->slots[0]--; + src = src_path->nodes[0]; + btrfs_item_key_to_cpu(src, &key, src_path->slots[0]); + if (key.objectid != btrfs_ino(inode) || + key.type != BTRFS_EXTENT_DATA_KEY) + goto fill_holes; + extent = btrfs_item_ptr(src, src_path->slots[0], + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(src, extent) == + BTRFS_FILE_EXTENT_INLINE) { + len = btrfs_file_extent_inline_len(src, + src_path->slots[0], + extent); + *last_extent = ALIGN(key.offset + len, + log->sectorsize); + } else { + len = btrfs_file_extent_num_bytes(src, extent); + *last_extent = key.offset + len; + } + } +fill_holes: + /* So we did prev_leaf, now we need to move to the next leaf, but a few + * things could have happened + * + * 1) A merge could have happened, so we could currently be on a leaf + * that holds what we were copying in the first place. + * 2) A split could have happened, and now not all of the items we want + * are on the same leaf. + * + * So we need to adjust how we search for holes, we need to drop the + * path and re-search for the first extent key we found, and then walk + * forward until we hit the last one we copied. + */ + if (need_find_last_extent) { + /* btrfs_prev_leaf could return 1 without releasing the path */ + btrfs_release_path(src_path); + ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &first_key, + src_path, 0, 0); + if (ret < 0) + return ret; + ASSERT(ret == 0); + src = src_path->nodes[0]; + i = src_path->slots[0]; + } else { + i = start_slot; + } + + /* + * Ok so here we need to go through and fill in any holes we may have + * to make sure that holes are punched for those areas in case they had + * extents previously. + */ + while (!done) { + u64 offset, len; + u64 extent_end; + + if (i >= btrfs_header_nritems(src_path->nodes[0])) { + ret = btrfs_next_leaf(BTRFS_I(inode)->root, src_path); + if (ret < 0) + return ret; + ASSERT(ret == 0); + src = src_path->nodes[0]; + i = 0; + } + + btrfs_item_key_to_cpu(src, &key, i); + if (!btrfs_comp_cpu_keys(&key, &last_key)) + done = true; + if (key.objectid != btrfs_ino(inode) || + key.type != BTRFS_EXTENT_DATA_KEY) { + i++; + continue; + } + extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(src, extent) == + BTRFS_FILE_EXTENT_INLINE) { + len = btrfs_file_extent_inline_len(src, i, extent); + extent_end = ALIGN(key.offset + len, log->sectorsize); + } else { + len = btrfs_file_extent_num_bytes(src, extent); + extent_end = key.offset + len; + } + i++; + + if (*last_extent == key.offset) { + *last_extent = extent_end; + continue; + } + offset = *last_extent; + len = key.offset - *last_extent; + ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode), + offset, 0, 0, len, 0, len, 0, + 0, 0); + if (ret) + break; + *last_extent = offset + len; + } + /* + * Need to let the callers know we dropped the path so they should + * re-search. + */ + if (!ret && need_find_last_extent) + ret = 1; return ret; } @@ -3349,21 +3497,27 @@ static int log_one_extent(struct btrfs_trans_handle *trans, int ret; int index = log->log_transid % 2; bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - - ret = __btrfs_drop_extents(trans, log, inode, path, em->start, - em->start + em->len, NULL, 0); - if (ret) - return ret; + int extent_inserted = 0; INIT_LIST_HEAD(&ordered_sums); btrfs_init_map_token(&token); - key.objectid = btrfs_ino(inode); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = em->start; - ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi)); + ret = __btrfs_drop_extents(trans, log, inode, path, em->start, + em->start + em->len, NULL, 0, 1, + sizeof(*fi), &extent_inserted); if (ret) return ret; + + if (!extent_inserted) { + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = em->start; + + ret = btrfs_insert_empty_item(trans, log, path, &key, + sizeof(*fi)); + if (ret) + return ret; + } leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -3485,7 +3639,11 @@ again: * start over after this. */ - wait_event(ordered->wait, ordered->csum_bytes_left == 0); + if (ordered->csum_bytes_left) { + btrfs_start_ordered_extent(inode, ordered, 0); + wait_event(ordered->wait, + ordered->csum_bytes_left == 0); + } list_for_each_entry(sum, &ordered->list, list) { ret = btrfs_csum_file_blocks(trans, log, sum); @@ -3630,6 +3788,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_key max_key; struct btrfs_root *log = root->log_root; struct extent_buffer *src = NULL; + u64 last_extent = 0; int err = 0; int ret; int nritems; @@ -3745,11 +3904,15 @@ again: goto next_slot; } - ret = copy_items(trans, inode, dst_path, src, ins_start_slot, - ins_nr, inode_only); - if (ret) { + ret = copy_items(trans, inode, dst_path, path, &last_extent, + ins_start_slot, ins_nr, inode_only); + if (ret < 0) { err = ret; goto out_unlock; + } if (ret) { + ins_nr = 0; + btrfs_release_path(path); + continue; } ins_nr = 1; ins_start_slot = path->slots[0]; @@ -3763,13 +3926,14 @@ next_slot: goto again; } if (ins_nr) { - ret = copy_items(trans, inode, dst_path, src, - ins_start_slot, + ret = copy_items(trans, inode, dst_path, path, + &last_extent, ins_start_slot, ins_nr, inode_only); - if (ret) { + if (ret < 0) { err = ret; goto out_unlock; } + ret = 0; ins_nr = 0; } btrfs_release_path(path); @@ -3784,12 +3948,13 @@ next_slot: } } if (ins_nr) { - ret = copy_items(trans, inode, dst_path, src, ins_start_slot, - ins_nr, inode_only); - if (ret) { + ret = copy_items(trans, inode, dst_path, path, &last_extent, + ins_start_slot, ins_nr, inode_only); + if (ret < 0) { err = ret; goto out_unlock; } + ret = 0; ins_nr = 0; } diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index b0a523b2..840a38b 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -5,8 +5,8 @@ */ #include <linux/slab.h> -#include <linux/export.h> #include "ulist.h" +#include "ctree.h" /* * ulist is a generic data structure to hold a collection of unique u64 @@ -14,10 +14,6 @@ * enumerating it. * It is possible to store an auxiliary value along with the key. * - * The implementation is preliminary and can probably be sped up - * significantly. A first step would be to store the values in an rbtree - * as soon as ULIST_SIZE is exceeded. - * * A sample usage for ulists is the enumeration of directed graphs without * visiting a node twice. The pseudo-code could look like this: * @@ -50,12 +46,10 @@ */ void ulist_init(struct ulist *ulist) { - ulist->nnodes = 0; - ulist->nodes = ulist->int_nodes; - ulist->nodes_alloced = ULIST_SIZE; + INIT_LIST_HEAD(&ulist->nodes); ulist->root = RB_ROOT; + ulist->nnodes = 0; } -EXPORT_SYMBOL(ulist_init); /** * ulist_fini - free up additionally allocated memory for the ulist @@ -64,18 +58,17 @@ EXPORT_SYMBOL(ulist_init); * This is useful in cases where the base 'struct ulist' has been statically * allocated. */ -void ulist_fini(struct ulist *ulist) +static void ulist_fini(struct ulist *ulist) { - /* - * The first ULIST_SIZE elements are stored inline in struct ulist. - * Only if more elements are alocated they need to be freed. - */ - if (ulist->nodes_alloced > ULIST_SIZE) - kfree(ulist->nodes); - ulist->nodes_alloced = 0; /* in case ulist_fini is called twice */ + struct ulist_node *node; + struct ulist_node *next; + + list_for_each_entry_safe(node, next, &ulist->nodes, list) { + kfree(node); + } ulist->root = RB_ROOT; + INIT_LIST_HEAD(&ulist->nodes); } -EXPORT_SYMBOL(ulist_fini); /** * ulist_reinit - prepare a ulist for reuse @@ -89,7 +82,6 @@ void ulist_reinit(struct ulist *ulist) ulist_fini(ulist); ulist_init(ulist); } -EXPORT_SYMBOL(ulist_reinit); /** * ulist_alloc - dynamically allocate a ulist @@ -108,7 +100,6 @@ struct ulist *ulist_alloc(gfp_t gfp_mask) return ulist; } -EXPORT_SYMBOL(ulist_alloc); /** * ulist_free - free dynamically allocated ulist @@ -123,7 +114,6 @@ void ulist_free(struct ulist *ulist) ulist_fini(ulist); kfree(ulist); } -EXPORT_SYMBOL(ulist_free); static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val) { @@ -192,63 +182,32 @@ int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask) int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, u64 *old_aux, gfp_t gfp_mask) { - int ret = 0; - struct ulist_node *node = NULL; + int ret; + struct ulist_node *node; + node = ulist_rbtree_search(ulist, val); if (node) { if (old_aux) *old_aux = node->aux; return 0; } + node = kmalloc(sizeof(*node), gfp_mask); + if (!node) + return -ENOMEM; - if (ulist->nnodes >= ulist->nodes_alloced) { - u64 new_alloced = ulist->nodes_alloced + 128; - struct ulist_node *new_nodes; - void *old = NULL; - int i; - - for (i = 0; i < ulist->nnodes; i++) - rb_erase(&ulist->nodes[i].rb_node, &ulist->root); - - /* - * if nodes_alloced == ULIST_SIZE no memory has been allocated - * yet, so pass NULL to krealloc - */ - if (ulist->nodes_alloced > ULIST_SIZE) - old = ulist->nodes; + node->val = val; + node->aux = aux; +#ifdef CONFIG_BTRFS_DEBUG + node->seqnum = ulist->nnodes; +#endif - new_nodes = krealloc(old, sizeof(*new_nodes) * new_alloced, - gfp_mask); - if (!new_nodes) - return -ENOMEM; - - if (!old) - memcpy(new_nodes, ulist->int_nodes, - sizeof(ulist->int_nodes)); - - ulist->nodes = new_nodes; - ulist->nodes_alloced = new_alloced; - - /* - * krealloc actually uses memcpy, which does not copy rb_node - * pointers, so we have to do it ourselves. Otherwise we may - * be bitten by crashes. - */ - for (i = 0; i < ulist->nnodes; i++) { - ret = ulist_rbtree_insert(ulist, &ulist->nodes[i]); - if (ret < 0) - return ret; - } - } - ulist->nodes[ulist->nnodes].val = val; - ulist->nodes[ulist->nnodes].aux = aux; - ret = ulist_rbtree_insert(ulist, &ulist->nodes[ulist->nnodes]); - BUG_ON(ret); - ++ulist->nnodes; + ret = ulist_rbtree_insert(ulist, node); + ASSERT(!ret); + list_add_tail(&node->list, &ulist->nodes); + ulist->nnodes++; return 1; } -EXPORT_SYMBOL(ulist_add); /** * ulist_next - iterate ulist @@ -268,11 +227,25 @@ EXPORT_SYMBOL(ulist_add); */ struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter) { - if (ulist->nnodes == 0) + struct ulist_node *node; + + if (list_empty(&ulist->nodes)) return NULL; - if (uiter->i < 0 || uiter->i >= ulist->nnodes) + if (uiter->cur_list && uiter->cur_list->next == &ulist->nodes) return NULL; - - return &ulist->nodes[uiter->i++]; + if (uiter->cur_list) { + uiter->cur_list = uiter->cur_list->next; + } else { + uiter->cur_list = ulist->nodes.next; +#ifdef CONFIG_BTRFS_DEBUG + uiter->i = 0; +#endif + } + node = list_entry(uiter->cur_list, struct ulist_node, list); +#ifdef CONFIG_BTRFS_DEBUG + ASSERT(node->seqnum == uiter->i); + ASSERT(uiter->i >= 0 && uiter->i < ulist->nnodes); + uiter->i++; +#endif + return node; } -EXPORT_SYMBOL(ulist_next); diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index fb36731..7f78cbf 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -17,18 +17,12 @@ * enumerating it. * It is possible to store an auxiliary value along with the key. * - * The implementation is preliminary and can probably be sped up - * significantly. A first step would be to store the values in an rbtree - * as soon as ULIST_SIZE is exceeded. */ - -/* - * number of elements statically allocated inside struct ulist - */ -#define ULIST_SIZE 16 - struct ulist_iterator { +#ifdef CONFIG_BTRFS_DEBUG int i; +#endif + struct list_head *cur_list; /* hint to start search */ }; /* @@ -37,6 +31,12 @@ struct ulist_iterator { struct ulist_node { u64 val; /* value to store */ u64 aux; /* auxiliary value saved along with the val */ + +#ifdef CONFIG_BTRFS_DEBUG + int seqnum; /* sequence number this node is added */ +#endif + + struct list_head list; /* used to link node */ struct rb_node rb_node; /* used to speed up search */ }; @@ -46,28 +46,11 @@ struct ulist { */ unsigned long nnodes; - /* - * number of nodes we already have room for - */ - unsigned long nodes_alloced; - - /* - * pointer to the array storing the elements. The first ULIST_SIZE - * elements are stored inline. In this case the it points to int_nodes. - * After exceeding ULIST_SIZE, dynamic memory is allocated. - */ - struct ulist_node *nodes; - + struct list_head nodes; struct rb_root root; - - /* - * inline storage space for the first ULIST_SIZE entries - */ - struct ulist_node int_nodes[ULIST_SIZE]; }; void ulist_init(struct ulist *ulist); -void ulist_fini(struct ulist *ulist); void ulist_reinit(struct ulist *ulist); struct ulist *ulist_alloc(gfp_t gfp_mask); void ulist_free(struct ulist *ulist); @@ -77,6 +60,6 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter); -#define ULIST_ITER_INIT(uiter) ((uiter)->i = 0) +#define ULIST_ITER_INIT(uiter) ((uiter)->cur_list = NULL) #endif diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index fbda900..f6a4c03 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -69,7 +69,7 @@ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, u8 *uuid, ret = -ENOENT; if (!IS_ALIGNED(item_size, sizeof(u64))) { - pr_warn("btrfs: uuid item with illegal size %lu!\n", + btrfs_warn(uuid_root->fs_info, "uuid item with illegal size %lu!", (unsigned long)item_size); goto out; } @@ -137,7 +137,8 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, offset = btrfs_item_ptr_offset(eb, slot); offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le); } else if (ret < 0) { - pr_warn("btrfs: insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!\n", + btrfs_warn(uuid_root->fs_info, "insert uuid item failed %d " + "(0x%016llx, 0x%016llx) type %u!", ret, (unsigned long long)key.objectid, (unsigned long long)key.offset, type); goto out; @@ -183,7 +184,7 @@ int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, uuid_root, &key, path, -1, 1); if (ret < 0) { - pr_warn("btrfs: error %d while searching for uuid item!\n", + btrfs_warn(uuid_root->fs_info, "error %d while searching for uuid item!", ret); goto out; } @@ -197,7 +198,7 @@ int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans, offset = btrfs_item_ptr_offset(eb, slot); item_size = btrfs_item_size_nr(eb, slot); if (!IS_ALIGNED(item_size, sizeof(u64))) { - pr_warn("btrfs: uuid item with illegal size %lu!\n", + btrfs_warn(uuid_root->fs_info, "uuid item with illegal size %lu!", (unsigned long)item_size); ret = -ENOENT; goto out; @@ -299,7 +300,7 @@ again_search_slot: offset = btrfs_item_ptr_offset(leaf, slot); item_size = btrfs_item_size_nr(leaf, slot); if (!IS_ALIGNED(item_size, sizeof(u64))) { - pr_warn("btrfs: uuid item with illegal size %lu!\n", + btrfs_warn(fs_info, "uuid item with illegal size %lu!", (unsigned long)item_size); goto skip; } @@ -349,6 +350,6 @@ skip: out: btrfs_free_path(path); if (ret) - pr_warn("btrfs: btrfs_uuid_tree_iterate failed %d\n", ret); + btrfs_warn(fs_info, "btrfs_uuid_tree_iterate failed %d", ret); return 0; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 92303f4..bab0b84 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -125,7 +125,7 @@ static void btrfs_kobject_uevent(struct block_device *bdev, ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); if (ret) - pr_warn("Sending event '%d' to kobject: '%s' (%p): failed\n", + pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n", action, kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), &disk_to_dev(bdev->bd_disk)->kobj); @@ -200,7 +200,7 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, if (IS_ERR(*bdev)) { ret = PTR_ERR(*bdev); - printk(KERN_INFO "btrfs: open %s failed\n", device_path); + printk(KERN_INFO "BTRFS: open %s failed\n", device_path); goto error; } @@ -912,9 +912,9 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, if (disk_super->label[0]) { if (disk_super->label[BTRFS_LABEL_SIZE - 1]) disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; - printk(KERN_INFO "btrfs: device label %s ", disk_super->label); + printk(KERN_INFO "BTRFS: device label %s ", disk_super->label); } else { - printk(KERN_INFO "btrfs: device fsid %pU ", disk_super->fsid); + printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid); } printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path); @@ -1813,7 +1813,7 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, } if (!*device) { - pr_err("btrfs: no missing device found\n"); + btrfs_err(root->fs_info, "no missing device found"); return -ENOENT; } @@ -3052,7 +3052,7 @@ loop: error: btrfs_free_path(path); if (enospc_errors) { - printk(KERN_INFO "btrfs: %d enospc errors during balance\n", + btrfs_info(fs_info, "%d enospc errors during balance", enospc_errors); if (!ret) ret = -ENOSPC; @@ -3138,8 +3138,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl, if (!(bctl->flags & BTRFS_BALANCE_DATA) || !(bctl->flags & BTRFS_BALANCE_METADATA) || memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { - printk(KERN_ERR "btrfs: with mixed groups data and " - "metadata balance options must be the same\n"); + btrfs_err(fs_info, "with mixed groups data and " + "metadata balance options must be the same"); ret = -EINVAL; goto out; } @@ -3165,8 +3165,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl, if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && (!alloc_profile_is_valid(bctl->data.target, 1) || (bctl->data.target & ~allowed))) { - printk(KERN_ERR "btrfs: unable to start balance with target " - "data profile %llu\n", + btrfs_err(fs_info, "unable to start balance with target " + "data profile %llu", bctl->data.target); ret = -EINVAL; goto out; @@ -3174,8 +3174,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl, if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && (!alloc_profile_is_valid(bctl->meta.target, 1) || (bctl->meta.target & ~allowed))) { - printk(KERN_ERR "btrfs: unable to start balance with target " - "metadata profile %llu\n", + btrfs_err(fs_info, + "unable to start balance with target metadata profile %llu", bctl->meta.target); ret = -EINVAL; goto out; @@ -3183,8 +3183,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl, if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && (!alloc_profile_is_valid(bctl->sys.target, 1) || (bctl->sys.target & ~allowed))) { - printk(KERN_ERR "btrfs: unable to start balance with target " - "system profile %llu\n", + btrfs_err(fs_info, + "unable to start balance with target system profile %llu", bctl->sys.target); ret = -EINVAL; goto out; @@ -3193,7 +3193,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, /* allow dup'ed data chunks only in mixed mode */ if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) { - printk(KERN_ERR "btrfs: dup for data is not allowed\n"); + btrfs_err(fs_info, "dup for data is not allowed"); ret = -EINVAL; goto out; } @@ -3213,11 +3213,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl, (fs_info->avail_metadata_alloc_bits & allowed) && !(bctl->meta.target & allowed))) { if (bctl->flags & BTRFS_BALANCE_FORCE) { - printk(KERN_INFO "btrfs: force reducing metadata " - "integrity\n"); + btrfs_info(fs_info, "force reducing metadata integrity"); } else { - printk(KERN_ERR "btrfs: balance will reduce metadata " - "integrity, use force if you want this\n"); + btrfs_err(fs_info, "balance will reduce metadata " + "integrity, use force if you want this"); ret = -EINVAL; goto out; } @@ -3303,7 +3302,7 @@ static int balance_kthread(void *data) mutex_lock(&fs_info->balance_mutex); if (fs_info->balance_ctl) { - printk(KERN_INFO "btrfs: continuing balance\n"); + btrfs_info(fs_info, "continuing balance"); ret = btrfs_balance(fs_info->balance_ctl, NULL); } @@ -3325,7 +3324,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->balance_lock); if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) { - printk(KERN_INFO "btrfs: force skipping balance\n"); + btrfs_info(fs_info, "force skipping balance"); return 0; } @@ -3543,7 +3542,7 @@ update_tree: BTRFS_UUID_KEY_SUBVOL, key.objectid); if (ret < 0) { - pr_warn("btrfs: uuid_tree_add failed %d\n", + btrfs_warn(fs_info, "uuid_tree_add failed %d", ret); break; } @@ -3555,7 +3554,7 @@ update_tree: BTRFS_UUID_KEY_RECEIVED_SUBVOL, key.objectid); if (ret < 0) { - pr_warn("btrfs: uuid_tree_add failed %d\n", + btrfs_warn(fs_info, "uuid_tree_add failed %d", ret); break; } @@ -3590,7 +3589,7 @@ out: if (trans && !IS_ERR(trans)) btrfs_end_transaction(trans, fs_info->uuid_root); if (ret) - pr_warn("btrfs: btrfs_uuid_scan_kthread failed %d\n", ret); + btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); else fs_info->update_uuid_tree_gen = 1; up(&fs_info->uuid_tree_rescan_sem); @@ -3654,7 +3653,7 @@ static int btrfs_uuid_rescan_kthread(void *data) */ ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry); if (ret < 0) { - pr_warn("btrfs: iterating uuid_tree failed %d\n", ret); + btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret); up(&fs_info->uuid_tree_rescan_sem); return ret; } @@ -3695,7 +3694,7 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); if (IS_ERR(task)) { /* fs_info->update_uuid_tree_gen remains 0 in all error case */ - pr_warn("btrfs: failed to start uuid_scan task\n"); + btrfs_warn(fs_info, "failed to start uuid_scan task"); up(&fs_info->uuid_tree_rescan_sem); return PTR_ERR(task); } @@ -3711,7 +3710,7 @@ int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); if (IS_ERR(task)) { /* fs_info->update_uuid_tree_gen remains 0 in all error case */ - pr_warn("btrfs: failed to start uuid_rescan task\n"); + btrfs_warn(fs_info, "failed to start uuid_rescan task"); up(&fs_info->uuid_tree_rescan_sem); return PTR_ERR(task); } @@ -4033,7 +4032,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_stripe_size = 32 * 1024 * 1024; max_chunk_size = 2 * max_stripe_size; } else { - printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n", + btrfs_err(info, "invalid chunk type 0x%llx requested\n", type); BUG_ON(1); } @@ -4065,7 +4064,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (!device->writeable) { WARN(1, KERN_ERR - "btrfs: read-only device in alloc_list\n"); + "BTRFS: read-only device in alloc_list\n"); continue; } @@ -5193,13 +5192,13 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, read_unlock(&em_tree->lock); if (!em) { - printk(KERN_ERR "btrfs: couldn't find em for chunk %Lu\n", + printk(KERN_ERR "BTRFS: couldn't find em for chunk %Lu\n", chunk_start); return -EIO; } if (em->start != chunk_start) { - printk(KERN_ERR "btrfs: bad chunk start, em=%Lu, wanted=%Lu\n", + printk(KERN_ERR "BTRFS: bad chunk start, em=%Lu, wanted=%Lu\n", em->start, chunk_start); free_extent_map(em); return -EIO; @@ -5298,6 +5297,13 @@ static void btrfs_end_bio(struct bio *bio, int err) bio_put(bio); bio = bbio->orig_bio; } + + /* + * We have original bio now. So increment bi_remaining to + * account for it in endio + */ + atomic_inc(&bio->bi_remaining); + bio->bi_private = bbio->private; bio->bi_end_io = bbio->end_io; btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; @@ -5411,7 +5417,7 @@ static int bio_size_ok(struct block_device *bdev, struct bio *bio, if (!q->merge_bvec_fn) return 1; - bvm.bi_size = bio->bi_size - prev->bv_len; + bvm.bi_size = bio->bi_iter.bi_size - prev->bv_len; if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) return 0; return 1; @@ -5426,7 +5432,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, bio->bi_private = bbio; btrfs_io_bio(bio)->stripe_index = dev_nr; bio->bi_end_io = btrfs_end_bio; - bio->bi_sector = physical >> 9; + bio->bi_iter.bi_sector = physical >> 9; #ifdef DEBUG { struct rcu_string *name; @@ -5464,7 +5470,7 @@ again: while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) { if (bio_add_page(bio, bvec->bv_page, bvec->bv_len, bvec->bv_offset) < bvec->bv_len) { - u64 len = bio->bi_size; + u64 len = bio->bi_iter.bi_size; atomic_inc(&bbio->stripes_pending); submit_stripe_bio(root, bbio, bio, physical, dev_nr, @@ -5486,7 +5492,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) bio->bi_private = bbio->private; bio->bi_end_io = bbio->end_io; btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; - bio->bi_sector = logical >> 9; + bio->bi_iter.bi_sector = logical >> 9; kfree(bbio); bio_endio(bio, -EIO); } @@ -5497,7 +5503,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, { struct btrfs_device *dev; struct bio *first_bio = bio; - u64 logical = (u64)bio->bi_sector << 9; + u64 logical = (u64)bio->bi_iter.bi_sector << 9; u64 length = 0; u64 map_length; u64 *raid_map = NULL; @@ -5506,7 +5512,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, int total_devs = 1; struct btrfs_bio *bbio = NULL; - length = bio->bi_size; + length = bio->bi_iter.bi_size; map_length = length; ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, @@ -6123,7 +6129,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, BUG_ON(!path); ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); if (ret < 0) { - printk_in_rcu(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n", + printk_in_rcu(KERN_WARNING "BTRFS: " + "error %d while searching for dev_stats item for device %s!\n", ret, rcu_str_deref(device->name)); goto out; } @@ -6133,7 +6140,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, /* need to delete old one and insert a new one */ ret = btrfs_del_item(trans, dev_root, path); if (ret != 0) { - printk_in_rcu(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n", + printk_in_rcu(KERN_WARNING "BTRFS: " + "delete too small dev_stats item for device %s failed %d!\n", rcu_str_deref(device->name), ret); goto out; } @@ -6146,7 +6154,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, dev_root, path, &key, sizeof(*ptr)); if (ret < 0) { - printk_in_rcu(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n", + printk_in_rcu(KERN_WARNING "BTRFS: " + "insert dev_stats item for device %s failed %d!\n", rcu_str_deref(device->name), ret); goto out; } @@ -6199,16 +6208,14 @@ static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) { if (!dev->dev_stats_valid) return; - printk_ratelimited_in_rcu(KERN_ERR - "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", + printk_ratelimited_in_rcu(KERN_ERR "BTRFS: " + "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", rcu_str_deref(dev->name), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), - btrfs_dev_stat_read(dev, - BTRFS_DEV_STAT_CORRUPTION_ERRS), - btrfs_dev_stat_read(dev, - BTRFS_DEV_STAT_GENERATION_ERRS)); + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); } static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) @@ -6221,7 +6228,8 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) if (i == BTRFS_DEV_STAT_VALUES_MAX) return; /* all values == 0, suppress message */ - printk_in_rcu(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", + printk_in_rcu(KERN_INFO "BTRFS: " + "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", rcu_str_deref(dev->name), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), @@ -6242,12 +6250,10 @@ int btrfs_get_dev_stats(struct btrfs_root *root, mutex_unlock(&fs_devices->device_list_mutex); if (!dev) { - printk(KERN_WARNING - "btrfs: get dev_stats failed, device not found\n"); + btrfs_warn(root->fs_info, "get dev_stats failed, device not found"); return -ENODEV; } else if (!dev->dev_stats_valid) { - printk(KERN_WARNING - "btrfs: get dev_stats failed, not yet valid\n"); + btrfs_warn(root->fs_info, "get dev_stats failed, not yet valid"); return -ENODEV; } else if (stats->flags & BTRFS_DEV_STATS_RESET) { for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 05740b9..ad8328d 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -22,11 +22,13 @@ #include <linux/rwsem.h> #include <linux/xattr.h> #include <linux/security.h> +#include <linux/posix_acl_xattr.h> #include "ctree.h" #include "btrfs_inode.h" #include "transaction.h" #include "xattr.h" #include "disk-io.h" +#include "props.h" ssize_t __btrfs_getxattr(struct inode *inode, const char *name, @@ -313,8 +315,8 @@ err: */ const struct xattr_handler *btrfs_xattr_handlers[] = { #ifdef CONFIG_BTRFS_FS_POSIX_ACL - &btrfs_xattr_acl_access_handler, - &btrfs_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif NULL, }; @@ -331,7 +333,8 @@ static bool btrfs_is_valid_xattr(const char *name) XATTR_SECURITY_PREFIX_LEN) || !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || - !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); + !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) || + !strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN); } ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, @@ -373,6 +376,10 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, if (!btrfs_is_valid_xattr(name)) return -EOPNOTSUPP; + if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) + return btrfs_set_prop(dentry->d_inode, name, + value, size, flags); + if (size == 0) value = ""; /* empty EA, do not remove */ @@ -402,6 +409,10 @@ int btrfs_removexattr(struct dentry *dentry, const char *name) if (!btrfs_is_valid_xattr(name)) return -EOPNOTSUPP; + if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) + return btrfs_set_prop(dentry->d_inode, name, + NULL, 0, XATTR_REPLACE); + return __btrfs_setxattr(NULL, dentry->d_inode, name, NULL, 0, XATTR_REPLACE); } diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index b3cc803..5049608 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -21,8 +21,6 @@ #include <linux/xattr.h> -extern const struct xattr_handler btrfs_xattr_acl_access_handler; -extern const struct xattr_handler btrfs_xattr_acl_default_handler; extern const struct xattr_handler *btrfs_xattr_handlers[]; extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 9acb846..8e57191 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -97,7 +97,7 @@ static int zlib_compress_pages(struct list_head *ws, *total_in = 0; if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { - printk(KERN_WARNING "btrfs: deflateInit failed\n"); + printk(KERN_WARNING "BTRFS: deflateInit failed\n"); ret = -1; goto out; } @@ -125,7 +125,7 @@ static int zlib_compress_pages(struct list_head *ws, while (workspace->def_strm.total_in < len) { ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); if (ret != Z_OK) { - printk(KERN_DEBUG "btrfs: deflate in loop returned %d\n", + printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n", ret); zlib_deflateEnd(&workspace->def_strm); ret = -1; @@ -252,7 +252,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, } if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { - printk(KERN_WARNING "btrfs: inflateInit failed\n"); + printk(KERN_WARNING "BTRFS: inflateInit failed\n"); return -1; } while (workspace->inf_strm.total_in < srclen) { @@ -336,7 +336,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, } if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { - printk(KERN_WARNING "btrfs: inflateInit failed\n"); + printk(KERN_WARNING "BTRFS: inflateInit failed\n"); return -1; } diff --git a/fs/buffer.c b/fs/buffer.c index 6024877..651dba1 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1312,7 +1312,7 @@ static void bh_lru_install(struct buffer_head *bh) } while (out < BH_LRU_SIZE) bhs[out++] = NULL; - memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs)); + memcpy(this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs)); } bh_lru_unlock(); @@ -2982,11 +2982,11 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh) * let it through, and the IO layer will turn it into * an EIO. */ - if (unlikely(bio->bi_sector >= maxsector)) + if (unlikely(bio->bi_iter.bi_sector >= maxsector)) return; - maxsector -= bio->bi_sector; - bytes = bio->bi_size; + maxsector -= bio->bi_iter.bi_sector; + bytes = bio->bi_iter.bi_size; if (likely((bytes >> 9) <= maxsector)) return; @@ -2994,7 +2994,7 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh) bytes = maxsector << 9; /* Truncate the bio.. */ - bio->bi_size = bytes; + bio->bi_iter.bi_size = bytes; bio->bi_io_vec[0].bv_len = bytes; /* ..and clear the end of the buffer for reads */ @@ -3029,14 +3029,14 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) */ bio = bio_alloc(GFP_NOIO, 1); - bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; bio->bi_io_vec[0].bv_page = bh->b_page; bio->bi_io_vec[0].bv_len = bh->b_size; bio->bi_io_vec[0].bv_offset = bh_offset(bh); bio->bi_vcnt = 1; - bio->bi_size = bh->b_size; + bio->bi_iter.bi_size = bh->b_size; bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index ac9a2ef..264e9bf 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -25,3 +25,16 @@ config CEPH_FSCACHE caching support for Ceph clients using FS-Cache endif + +config CEPH_FS_POSIX_ACL + bool "Ceph POSIX Access Control Lists" + depends on CEPH_FS + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website <http://acl.bestbits.at/>. + + If you don't know what Access Control Lists are, say N diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index 32e3010..85a4230 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -10,3 +10,4 @@ ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ debugfs.o ceph-$(CONFIG_CEPH_FSCACHE) += cache.o +ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c new file mode 100644 index 0000000..4c2d452 --- /dev/null +++ b/fs/ceph/acl.c @@ -0,0 +1,209 @@ +/* + * linux/fs/ceph/acl.c + * + * Copyright (C) 2013 Guangliang Zhao, <lucienchao@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/ceph/ceph_debug.h> +#include <linux/fs.h> +#include <linux/string.h> +#include <linux/xattr.h> +#include <linux/posix_acl_xattr.h> +#include <linux/posix_acl.h> +#include <linux/sched.h> +#include <linux/slab.h> + +#include "super.h" + +static inline void ceph_set_cached_acl(struct inode *inode, + int type, struct posix_acl *acl) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + spin_lock(&ci->i_ceph_lock); + if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)) + set_cached_acl(inode, type, acl); + spin_unlock(&ci->i_ceph_lock); +} + +static inline struct posix_acl *ceph_get_cached_acl(struct inode *inode, + int type) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct posix_acl *acl = ACL_NOT_CACHED; + + spin_lock(&ci->i_ceph_lock); + if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)) + acl = get_cached_acl(inode, type); + spin_unlock(&ci->i_ceph_lock); + + return acl; +} + +void ceph_forget_all_cached_acls(struct inode *inode) +{ + forget_all_cached_acls(inode); +} + +struct posix_acl *ceph_get_acl(struct inode *inode, int type) +{ + int size; + const char *name; + char *value = NULL; + struct posix_acl *acl; + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + BUG(); + } + + size = __ceph_getxattr(inode, name, "", 0); + if (size > 0) { + value = kzalloc(size, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + size = __ceph_getxattr(inode, name, value, size); + } + + if (size > 0) + acl = posix_acl_from_xattr(&init_user_ns, value, size); + else if (size == -ERANGE || size == -ENODATA || size == 0) + acl = NULL; + else + acl = ERR_PTR(-EIO); + + kfree(value); + + if (!IS_ERR(acl)) + ceph_set_cached_acl(inode, type, acl); + + return acl; +} + +int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int ret = 0, size = 0; + const char *name = NULL; + char *value = NULL; + struct iattr newattrs; + umode_t new_mode = inode->i_mode, old_mode = inode->i_mode; + struct dentry *dentry; + + if (acl) { + ret = posix_acl_valid(acl); + if (ret < 0) + goto out; + } + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + if (acl) { + ret = posix_acl_equiv_mode(acl, &new_mode); + if (ret < 0) + goto out; + if (ret == 0) + acl = NULL; + } + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) { + ret = acl ? -EINVAL : 0; + goto out; + } + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + ret = -EINVAL; + goto out; + } + + if (acl) { + size = posix_acl_xattr_size(acl->a_count); + value = kmalloc(size, GFP_NOFS); + if (!value) { + ret = -ENOMEM; + goto out; + } + + ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); + if (ret < 0) + goto out_free; + } + + dentry = d_find_alias(inode); + if (new_mode != old_mode) { + newattrs.ia_mode = new_mode; + newattrs.ia_valid = ATTR_MODE; + ret = ceph_setattr(dentry, &newattrs); + if (ret) + goto out_dput; + } + + if (value) + ret = __ceph_setxattr(dentry, name, value, size, 0); + else + ret = __ceph_removexattr(dentry, name); + + if (ret) { + if (new_mode != old_mode) { + newattrs.ia_mode = old_mode; + newattrs.ia_valid = ATTR_MODE; + ceph_setattr(dentry, &newattrs); + } + goto out_dput; + } + + ceph_set_cached_acl(inode, type, acl); + +out_dput: + dput(dentry); +out_free: + kfree(value); +out: + return ret; +} + +int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir) +{ + struct posix_acl *default_acl, *acl; + int error; + + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (error) + return error; + + if (!default_acl && !acl) + cache_no_acl(inode); + + if (default_acl) { + error = ceph_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + posix_acl_release(default_acl); + } + if (acl) { + if (!error) + error = ceph_set_acl(inode, acl, ACL_TYPE_ACCESS); + posix_acl_release(acl); + } + return error; +} diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 1e561c0..b53278c 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -209,10 +209,15 @@ static int readpage_nounlock(struct file *filp, struct page *page) err = 0; if (err < 0) { SetPageError(page); + ceph_fscache_readpage_cancel(inode, page); goto out; - } else if (err < PAGE_CACHE_SIZE) { + } else { + if (err < PAGE_CACHE_SIZE) { /* zero fill remainder of page */ - zero_user_segment(page, err, PAGE_CACHE_SIZE); + zero_user_segment(page, err, PAGE_CACHE_SIZE); + } else { + flush_dcache_page(page); + } } SetPageUptodate(page); @@ -252,6 +257,8 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) for (i = 0; i < num_pages; i++) { struct page *page = osd_data->pages[i]; + if (rc < 0) + goto unlock; if (bytes < (int)PAGE_CACHE_SIZE) { /* zero (remainder of) page */ int s = bytes < 0 ? 0 : bytes; @@ -262,6 +269,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) flush_dcache_page(page); SetPageUptodate(page); ceph_readpage_to_fscache(inode, page); +unlock: unlock_page(page); page_cache_release(page); bytes -= PAGE_CACHE_SIZE; @@ -1203,6 +1211,41 @@ const struct address_space_operations ceph_aops = { /* * vm ops */ +static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_file_info *fi = vma->vm_file->private_data; + loff_t off = vmf->pgoff << PAGE_CACHE_SHIFT; + int want, got, ret; + + dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n", + inode, ceph_vinop(inode), off, (size_t)PAGE_CACHE_SIZE); + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_CACHE; + while (1) { + got = 0; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); + if (ret == 0) + break; + if (ret != -ERESTARTSYS) { + WARN_ON(1); + return VM_FAULT_SIGBUS; + } + } + dout("filemap_fault %p %llu~%zd got cap refs on %s\n", + inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got)); + + ret = filemap_fault(vma, vmf); + + dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n", + inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret); + ceph_put_cap_refs(ci, got); + + return ret; +} /* * Reuse write_begin here for simplicity. @@ -1210,23 +1253,41 @@ const struct address_space_operations ceph_aops = { static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct inode *inode = file_inode(vma->vm_file); - struct page *page = vmf->page; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_file_info *fi = vma->vm_file->private_data; struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct page *page = vmf->page; loff_t off = page_offset(page); - loff_t size, len; - int ret; - - /* Update time before taking page lock */ - file_update_time(vma->vm_file); + loff_t size = i_size_read(inode); + size_t len; + int want, got, ret; - size = i_size_read(inode); if (off + PAGE_CACHE_SIZE <= size) len = PAGE_CACHE_SIZE; else len = size & ~PAGE_CACHE_MASK; - dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode, - off, len, page, page->index); + dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n", + inode, ceph_vinop(inode), off, len, size); + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_BUFFER; + while (1) { + got = 0; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, off + len); + if (ret == 0) + break; + if (ret != -ERESTARTSYS) { + WARN_ON(1); + return VM_FAULT_SIGBUS; + } + } + dout("page_mkwrite %p %llu~%zd got cap refs on %s\n", + inode, off, len, ceph_cap_string(got)); + + /* Update time before taking page lock */ + file_update_time(vma->vm_file); lock_page(page); @@ -1248,14 +1309,26 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = VM_FAULT_SIGBUS; } out: - dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret); - if (ret != VM_FAULT_LOCKED) + if (ret != VM_FAULT_LOCKED) { unlock_page(page); + } else { + int dirty; + spin_lock(&ci->i_ceph_lock); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + spin_unlock(&ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + } + + dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n", + inode, off, len, ceph_cap_string(got), ret); + ceph_put_cap_refs(ci, got); + return ret; } static struct vm_operations_struct ceph_vmops = { - .fault = filemap_fault, + .fault = ceph_filemap_fault, .page_mkwrite = ceph_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index ba94940..da95f61 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -67,6 +67,14 @@ static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) return fscache_maybe_release_page(ci->fscache, page, gfp); } +static inline void ceph_fscache_readpage_cancel(struct inode *inode, + struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + if (fscache_cookie_valid(ci->fscache) && PageFsCache(page)) + __fscache_uncache_page(ci->fscache, page); +} + static inline void ceph_fscache_readpages_cancel(struct inode *inode, struct list_head *pages) { @@ -145,6 +153,11 @@ static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) return 1; } +static inline void ceph_fscache_readpage_cancel(struct inode *inode, + struct page *page) +{ +} + static inline void ceph_fscache_readpages_cancel(struct inode *inode, struct list_head *pages) { diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 3c0a4bd..1754338 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -555,21 +555,34 @@ retry: cap->ci = ci; __insert_cap_node(ci, cap); - /* clear out old exporting info? (i.e. on cap import) */ - if (ci->i_cap_exporting_mds == mds) { - ci->i_cap_exporting_issued = 0; - ci->i_cap_exporting_mseq = 0; - ci->i_cap_exporting_mds = -1; - } - /* add to session cap list */ cap->session = session; spin_lock(&session->s_cap_lock); list_add_tail(&cap->session_caps, &session->s_caps); session->s_nr_caps++; spin_unlock(&session->s_cap_lock); - } else if (new_cap) - ceph_put_cap(mdsc, new_cap); + } else { + if (new_cap) + ceph_put_cap(mdsc, new_cap); + + /* + * auth mds of the inode changed. we received the cap export + * message, but still haven't received the cap import message. + * handle_cap_export() updated the new auth MDS' cap. + * + * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing + * a message that was send before the cap import message. So + * don't remove caps. + */ + if (ceph_seq_cmp(seq, cap->seq) <= 0) { + WARN_ON(cap != ci->i_auth_cap); + WARN_ON(cap->cap_id != cap_id); + seq = cap->seq; + mseq = cap->mseq; + issued |= cap->issued; + flags |= CEPH_CAP_FLAG_AUTH; + } + } if (!ci->i_snap_realm) { /* @@ -611,15 +624,9 @@ retry: if (ci->i_auth_cap == NULL || ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) ci->i_auth_cap = cap; - } else if (ci->i_auth_cap == cap) { - ci->i_auth_cap = NULL; - spin_lock(&mdsc->cap_dirty_lock); - if (!list_empty(&ci->i_dirty_item)) { - dout(" moving %p to cap_dirty_migrating\n", inode); - list_move(&ci->i_dirty_item, - &mdsc->cap_dirty_migrating); - } - spin_unlock(&mdsc->cap_dirty_lock); + ci->i_cap_exporting_issued = 0; + } else { + WARN_ON(ci->i_auth_cap == cap); } dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", @@ -628,7 +635,7 @@ retry: cap->cap_id = cap_id; cap->issued = issued; cap->implemented |= issued; - if (mseq > cap->mseq) + if (ceph_seq_cmp(mseq, cap->mseq) > 0) cap->mds_wanted = wanted; else cap->mds_wanted |= wanted; @@ -816,7 +823,7 @@ int __ceph_caps_revoking_other(struct ceph_inode_info *ci, for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); - if (cap != ocap && __cap_is_valid(cap) && + if (cap != ocap && (cap->implemented & ~cap->issued & mask)) return 1; } @@ -888,7 +895,19 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) */ static int __ceph_is_any_caps(struct ceph_inode_info *ci) { - return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0; + return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_issued; +} + +int ceph_is_any_caps(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + spin_lock(&ci->i_ceph_lock); + ret = __ceph_is_any_caps(ci); + spin_unlock(&ci->i_ceph_lock); + + return ret; } /* @@ -1383,13 +1402,10 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) ci->i_snap_realm->cached_context); dout(" inode %p now dirty snapc %p auth cap %p\n", &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); + WARN_ON(!ci->i_auth_cap); BUG_ON(!list_empty(&ci->i_dirty_item)); spin_lock(&mdsc->cap_dirty_lock); - if (ci->i_auth_cap) - list_add(&ci->i_dirty_item, &mdsc->cap_dirty); - else - list_add(&ci->i_dirty_item, - &mdsc->cap_dirty_migrating); + list_add(&ci->i_dirty_item, &mdsc->cap_dirty); spin_unlock(&mdsc->cap_dirty_lock); if (ci->i_flushing_caps == 0) { ihold(inode); @@ -1735,13 +1751,12 @@ ack: /* * Try to flush dirty caps back to the auth mds. */ -static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, - unsigned *flush_tid) +static int try_flush_caps(struct inode *inode, unsigned *flush_tid) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); - int unlock_session = session ? 0 : 1; int flushing = 0; + struct ceph_mds_session *session = NULL; retry: spin_lock(&ci->i_ceph_lock); @@ -1755,13 +1770,14 @@ retry: int want = __ceph_caps_wanted(ci); int delayed; - if (!session) { + if (!session || session != cap->session) { spin_unlock(&ci->i_ceph_lock); + if (session) + mutex_unlock(&session->s_mutex); session = cap->session; mutex_lock(&session->s_mutex); goto retry; } - BUG_ON(session != cap->session); if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) goto out; @@ -1780,7 +1796,7 @@ retry: out: spin_unlock(&ci->i_ceph_lock); out_unlocked: - if (session && unlock_session) + if (session) mutex_unlock(&session->s_mutex); return flushing; } @@ -1865,7 +1881,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) return ret; mutex_lock(&inode->i_mutex); - dirty = try_flush_caps(inode, NULL, &flush_tid); + dirty = try_flush_caps(inode, &flush_tid); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); /* @@ -1900,7 +1916,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) dout("write_inode %p wait=%d\n", inode, wait); if (wait) { - dirty = try_flush_caps(inode, NULL, &flush_tid); + dirty = try_flush_caps(inode, &flush_tid); if (dirty) err = wait_event_interruptible(ci->i_cap_wq, caps_are_flushed(inode, flush_tid)); @@ -2350,11 +2366,11 @@ static void invalidate_aliases(struct inode *inode) d_prune_aliases(inode); /* * For non-directory inode, d_find_alias() only returns - * connected dentry. After calling d_invalidate(), the - * dentry become disconnected. + * hashed dentry. After calling d_invalidate(), the + * dentry becomes unhashed. * * For directory inode, d_find_alias() can return - * disconnected dentry. But directory inode should have + * unhashed dentry. But directory inode should have * one alias at most. */ while ((dn = d_find_alias(inode))) { @@ -2408,6 +2424,22 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, inode->i_size); + + /* + * auth mds of the inode changed. we received the cap export message, + * but still haven't received the cap import message. handle_cap_export + * updated the new auth MDS' cap. + * + * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message + * that was sent before the cap import message. So don't remove caps. + */ + if (ceph_seq_cmp(seq, cap->seq) <= 0) { + WARN_ON(cap != ci->i_auth_cap); + WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id)); + seq = cap->seq; + newcaps |= cap->issued; + } + /* * If CACHE is being revoked, and we have no dirty buffers, * try to invalidate (once). (If there are dirty buffers, we @@ -2434,6 +2466,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, issued |= implemented | __ceph_caps_dirty(ci); cap->cap_gen = session->s_cap_gen; + cap->seq = seq; __check_cap_issue(ci, cap, newcaps); @@ -2464,6 +2497,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, ceph_buffer_put(ci->i_xattrs.blob); ci->i_xattrs.blob = ceph_buffer_get(xattr_buf); ci->i_xattrs.version = version; + ceph_forget_all_cached_acls(inode); } } @@ -2483,6 +2517,10 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, le32_to_cpu(grant->time_warp_seq), &ctime, &mtime, &atime); + + /* file layout may have changed */ + ci->i_layout = grant->layout; + /* max size increase? */ if (ci->i_auth_cap == cap && max_size != ci->i_max_size) { dout("max_size %lld -> %llu\n", ci->i_max_size, max_size); @@ -2511,11 +2549,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, check_caps = 1; } - cap->seq = seq; - - /* file layout may have changed */ - ci->i_layout = grant->layout; - /* revocation, grant, or no-op? */ if (cap->issued & ~newcaps) { int revoking = cap->issued & ~newcaps; @@ -2741,65 +2774,114 @@ static void handle_cap_trunc(struct inode *inode, * caller holds s_mutex */ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, - struct ceph_mds_session *session, - int *open_target_sessions) + struct ceph_mds_cap_peer *ph, + struct ceph_mds_session *session) { struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_session *tsession = NULL; + struct ceph_cap *cap, *tcap; struct ceph_inode_info *ci = ceph_inode(inode); - int mds = session->s_mds; + u64 t_cap_id; unsigned mseq = le32_to_cpu(ex->migrate_seq); - struct ceph_cap *cap = NULL, *t; - struct rb_node *p; - int remember = 1; + unsigned t_seq, t_mseq; + int target, issued; + int mds = session->s_mds; - dout("handle_cap_export inode %p ci %p mds%d mseq %d\n", - inode, ci, mds, mseq); + if (ph) { + t_cap_id = le64_to_cpu(ph->cap_id); + t_seq = le32_to_cpu(ph->seq); + t_mseq = le32_to_cpu(ph->mseq); + target = le32_to_cpu(ph->mds); + } else { + t_cap_id = t_seq = t_mseq = 0; + target = -1; + } + dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n", + inode, ci, mds, mseq, target); +retry: spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + if (!cap) + goto out_unlock; - /* make sure we haven't seen a higher mseq */ - for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { - t = rb_entry(p, struct ceph_cap, ci_node); - if (ceph_seq_cmp(t->mseq, mseq) > 0) { - dout(" higher mseq on cap from mds%d\n", - t->session->s_mds); - remember = 0; - } - if (t->session->s_mds == mds) - cap = t; + if (target < 0) { + __ceph_remove_cap(cap, false); + goto out_unlock; } - if (cap) { - if (remember) { - /* make note */ - ci->i_cap_exporting_mds = mds; - ci->i_cap_exporting_mseq = mseq; - ci->i_cap_exporting_issued = cap->issued; - - /* - * make sure we have open sessions with all possible - * export targets, so that we get the matching IMPORT - */ - *open_target_sessions = 1; + /* + * now we know we haven't received the cap import message yet + * because the exported cap still exist. + */ - /* - * we can't flush dirty caps that we've seen the - * EXPORT but no IMPORT for - */ - spin_lock(&mdsc->cap_dirty_lock); - if (!list_empty(&ci->i_dirty_item)) { - dout(" moving %p to cap_dirty_migrating\n", - inode); - list_move(&ci->i_dirty_item, - &mdsc->cap_dirty_migrating); + issued = cap->issued; + WARN_ON(issued != cap->implemented); + + tcap = __get_cap_for_mds(ci, target); + if (tcap) { + /* already have caps from the target */ + if (tcap->cap_id != t_cap_id || + ceph_seq_cmp(tcap->seq, t_seq) < 0) { + dout(" updating import cap %p mds%d\n", tcap, target); + tcap->cap_id = t_cap_id; + tcap->seq = t_seq - 1; + tcap->issue_seq = t_seq - 1; + tcap->mseq = t_mseq; + tcap->issued |= issued; + tcap->implemented |= issued; + if (cap == ci->i_auth_cap) + ci->i_auth_cap = tcap; + if (ci->i_flushing_caps && ci->i_auth_cap == tcap) { + spin_lock(&mdsc->cap_dirty_lock); + list_move_tail(&ci->i_flushing_item, + &tcap->session->s_cap_flushing); + spin_unlock(&mdsc->cap_dirty_lock); } - spin_unlock(&mdsc->cap_dirty_lock); } __ceph_remove_cap(cap, false); + goto out_unlock; } - /* else, we already released it */ + if (tsession) { + int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0; + spin_unlock(&ci->i_ceph_lock); + /* add placeholder for the export tagert */ + ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0, + t_seq - 1, t_mseq, (u64)-1, flag, NULL); + goto retry; + } + + spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&session->s_mutex); + + /* open target session */ + tsession = ceph_mdsc_open_export_target_session(mdsc, target); + if (!IS_ERR(tsession)) { + if (mds > target) { + mutex_lock(&session->s_mutex); + mutex_lock_nested(&tsession->s_mutex, + SINGLE_DEPTH_NESTING); + } else { + mutex_lock(&tsession->s_mutex); + mutex_lock_nested(&session->s_mutex, + SINGLE_DEPTH_NESTING); + } + ceph_add_cap_releases(mdsc, tsession); + } else { + WARN_ON(1); + tsession = NULL; + target = -1; + } + goto retry; + +out_unlock: spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&session->s_mutex); + if (tsession) { + mutex_unlock(&tsession->s_mutex); + ceph_put_mds_session(tsession); + } } /* @@ -2810,10 +2892,12 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, */ static void handle_cap_import(struct ceph_mds_client *mdsc, struct inode *inode, struct ceph_mds_caps *im, + struct ceph_mds_cap_peer *ph, struct ceph_mds_session *session, void *snaptrace, int snaptrace_len) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap *cap; int mds = session->s_mds; unsigned issued = le32_to_cpu(im->caps); unsigned wanted = le32_to_cpu(im->wanted); @@ -2821,28 +2905,44 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, unsigned mseq = le32_to_cpu(im->migrate_seq); u64 realmino = le64_to_cpu(im->realm); u64 cap_id = le64_to_cpu(im->cap_id); + u64 p_cap_id; + int peer; - if (ci->i_cap_exporting_mds >= 0 && - ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) { - dout("handle_cap_import inode %p ci %p mds%d mseq %d" - " - cleared exporting from mds%d\n", - inode, ci, mds, mseq, - ci->i_cap_exporting_mds); - ci->i_cap_exporting_issued = 0; - ci->i_cap_exporting_mseq = 0; - ci->i_cap_exporting_mds = -1; + if (ph) { + p_cap_id = le64_to_cpu(ph->cap_id); + peer = le32_to_cpu(ph->mds); + } else { + p_cap_id = 0; + peer = -1; + } - spin_lock(&mdsc->cap_dirty_lock); - if (!list_empty(&ci->i_dirty_item)) { - dout(" moving %p back to cap_dirty\n", inode); - list_move(&ci->i_dirty_item, &mdsc->cap_dirty); + dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n", + inode, ci, mds, mseq, peer); + + spin_lock(&ci->i_ceph_lock); + cap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL; + if (cap && cap->cap_id == p_cap_id) { + dout(" remove export cap %p mds%d flags %d\n", + cap, peer, ph->flags); + if ((ph->flags & CEPH_CAP_FLAG_AUTH) && + (cap->seq != le32_to_cpu(ph->seq) || + cap->mseq != le32_to_cpu(ph->mseq))) { + pr_err("handle_cap_import: mismatched seq/mseq: " + "ino (%llx.%llx) mds%d seq %d mseq %d " + "importer mds%d has peer seq %d mseq %d\n", + ceph_vinop(inode), peer, cap->seq, + cap->mseq, mds, le32_to_cpu(ph->seq), + le32_to_cpu(ph->mseq)); } - spin_unlock(&mdsc->cap_dirty_lock); - } else { - dout("handle_cap_import inode %p ci %p mds%d mseq %d\n", - inode, ci, mds, mseq); + ci->i_cap_exporting_issued = cap->issued; + __ceph_remove_cap(cap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); } + /* make sure we re-request max_size, if necessary */ + ci->i_wanted_max_size = 0; + ci->i_requested_max_size = 0; + spin_unlock(&ci->i_ceph_lock); + down_write(&mdsc->snap_rwsem); ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len, false); @@ -2853,11 +2953,6 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, kick_flushing_inode_caps(mdsc, session, inode); up_read(&mdsc->snap_rwsem); - /* make sure we re-request max_size, if necessary */ - spin_lock(&ci->i_ceph_lock); - ci->i_wanted_max_size = 0; /* reset */ - ci->i_requested_max_size = 0; - spin_unlock(&ci->i_ceph_lock); } /* @@ -2875,6 +2970,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_inode_info *ci; struct ceph_cap *cap; struct ceph_mds_caps *h; + struct ceph_mds_cap_peer *peer = NULL; int mds = session->s_mds; int op; u32 seq, mseq; @@ -2885,12 +2981,13 @@ void ceph_handle_caps(struct ceph_mds_session *session, void *snaptrace; size_t snaptrace_len; void *flock; + void *end; u32 flock_len; - int open_target_sessions = 0; dout("handle_caps from mds%d\n", mds); /* decode */ + end = msg->front.iov_base + msg->front.iov_len; tid = le64_to_cpu(msg->hdr.tid); if (msg->front.iov_len < sizeof(*h)) goto bad; @@ -2908,17 +3005,28 @@ void ceph_handle_caps(struct ceph_mds_session *session, snaptrace_len = le32_to_cpu(h->snap_trace_len); if (le16_to_cpu(msg->hdr.version) >= 2) { - void *p, *end; - - p = snaptrace + snaptrace_len; - end = msg->front.iov_base + msg->front.iov_len; + void *p = snaptrace + snaptrace_len; ceph_decode_32_safe(&p, end, flock_len, bad); + if (p + flock_len > end) + goto bad; flock = p; } else { flock = NULL; flock_len = 0; } + if (le16_to_cpu(msg->hdr.version) >= 3) { + if (op == CEPH_CAP_OP_IMPORT) { + void *p = flock + flock_len; + if (p + sizeof(*peer) > end) + goto bad; + peer = p; + } else if (op == CEPH_CAP_OP_EXPORT) { + /* recorded in unused fields */ + peer = (void *)&h->size; + } + } + mutex_lock(&session->s_mutex); session->s_seq++; dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, @@ -2951,11 +3059,11 @@ void ceph_handle_caps(struct ceph_mds_session *session, goto done; case CEPH_CAP_OP_EXPORT: - handle_cap_export(inode, h, session, &open_target_sessions); - goto done; + handle_cap_export(inode, h, peer, session); + goto done_unlocked; case CEPH_CAP_OP_IMPORT: - handle_cap_import(mdsc, inode, h, session, + handle_cap_import(mdsc, inode, h, peer, session, snaptrace, snaptrace_len); } @@ -3007,8 +3115,6 @@ done: done_unlocked: if (inode) iput(inode); - if (open_target_sessions) - ceph_mdsc_open_export_target_sessions(mdsc, session); return; bad: diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 2a0bcae..6da4df8 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -693,6 +693,10 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry, if (!err && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); ceph_mdsc_put_request(req); + + if (!err) + err = ceph_init_acl(dentry, dentry->d_inode, dir); + if (err) d_drop(dentry); return err; @@ -1037,14 +1041,19 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) valid = 1; } else if (dentry_lease_is_valid(dentry) || dir_lease_is_valid(dir, dentry)) { - valid = 1; + if (dentry->d_inode) + valid = ceph_is_any_caps(dentry->d_inode); + else + valid = 1; } dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); - if (valid) + if (valid) { ceph_dentry_lru_touch(dentry); - else + } else { + ceph_dir_clear_complete(dir); d_drop(dentry); + } iput(dir); return valid; } @@ -1293,6 +1302,8 @@ const struct inode_operations ceph_dir_iops = { .getxattr = ceph_getxattr, .listxattr = ceph_listxattr, .removexattr = ceph_removexattr, + .get_acl = ceph_get_acl, + .set_acl = ceph_set_acl, .mknod = ceph_mknod, .symlink = ceph_symlink, .mkdir = ceph_mkdir, diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3de8982..dfd2ce3 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -408,51 +408,92 @@ more: * * If the read spans object boundary, just do multiple reads. */ -static ssize_t ceph_sync_read(struct file *file, char __user *data, - unsigned len, loff_t *poff, int *checkeof) +static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, + int *checkeof) { + struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct page **pages; - u64 off = *poff; + u64 off = iocb->ki_pos; int num_pages, ret; + size_t len = i->count; - dout("sync_read on file %p %llu~%u %s\n", file, off, len, + dout("sync_read on file %p %llu~%u %s\n", file, off, + (unsigned)len, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); - - if (file->f_flags & O_DIRECT) { - num_pages = calc_pages_for((unsigned long)data, len); - pages = ceph_get_direct_page_vector(data, num_pages, true); - } else { - num_pages = calc_pages_for(off, len); - pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); - } - if (IS_ERR(pages)) - return PTR_ERR(pages); - /* * flush any page cache pages in this range. this * will make concurrent normal and sync io slow, * but it will at least behave sensibly when they are * in sequence. */ - ret = filemap_write_and_wait(inode->i_mapping); + ret = filemap_write_and_wait_range(inode->i_mapping, off, + off + len); if (ret < 0) - goto done; + return ret; - ret = striped_read(inode, off, len, pages, num_pages, checkeof, - file->f_flags & O_DIRECT, - (unsigned long)data & ~PAGE_MASK); + if (file->f_flags & O_DIRECT) { + while (iov_iter_count(i)) { + void __user *data = i->iov[0].iov_base + i->iov_offset; + size_t len = i->iov[0].iov_len - i->iov_offset; + + num_pages = calc_pages_for((unsigned long)data, len); + pages = ceph_get_direct_page_vector(data, + num_pages, true); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + ret = striped_read(inode, off, len, + pages, num_pages, checkeof, + 1, (unsigned long)data & ~PAGE_MASK); + ceph_put_page_vector(pages, num_pages, true); + + if (ret <= 0) + break; + off += ret; + iov_iter_advance(i, ret); + if (ret < len) + break; + } + } else { + num_pages = calc_pages_for(off, len); + pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); + if (IS_ERR(pages)) + return PTR_ERR(pages); + ret = striped_read(inode, off, len, pages, + num_pages, checkeof, 0, 0); + if (ret > 0) { + int l, k = 0; + size_t left = len = ret; + + while (left) { + void __user *data = i->iov[0].iov_base + + i->iov_offset; + l = min(i->iov[0].iov_len - i->iov_offset, + left); + + ret = ceph_copy_page_vector_to_user(&pages[k], + data, off, + l); + if (ret > 0) { + iov_iter_advance(i, ret); + left -= ret; + off += ret; + k = calc_pages_for(iocb->ki_pos, + len - left + 1) - 1; + BUG_ON(k >= num_pages && left); + } else + break; + } + } + ceph_release_page_vector(pages, num_pages); + } - if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) - ret = ceph_copy_page_vector_to_user(pages, data, off, ret); - if (ret >= 0) - *poff = off + ret; + if (off > iocb->ki_pos) { + ret = off - iocb->ki_pos; + iocb->ki_pos = off; + } -done: - if (file->f_flags & O_DIRECT) - ceph_put_page_vector(pages, num_pages, true); - else - ceph_release_page_vector(pages, num_pages); dout("sync_read result %d\n", ret); return ret; } @@ -489,83 +530,79 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) } } + /* - * Synchronous write, straight from __user pointer or user pages (if - * O_DIRECT). + * Synchronous write, straight from __user pointer or user pages. * * If write spans object boundary, just do multiple writes. (For a * correct atomic write, we should e.g. take write locks on all * objects, rollback on failure, etc.) */ -static ssize_t ceph_sync_write(struct file *file, const char __user *data, - size_t left, loff_t pos, loff_t *ppos) +static ssize_t +ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, size_t count) { + struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_snap_context *snapc; struct ceph_vino vino; struct ceph_osd_request *req; - int num_ops = 1; struct page **pages; int num_pages; - u64 len; int written = 0; int flags; int check_caps = 0; - int page_align, io_align; - unsigned long buf_align; + int page_align; int ret; struct timespec mtime = CURRENT_TIME; - bool own_pages = false; + loff_t pos = iocb->ki_pos; + struct iov_iter i; if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; - dout("sync_write on file %p %lld~%u %s\n", file, pos, - (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); + dout("sync_direct_write on file %p %lld~%u\n", file, pos, + (unsigned)count); - ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); + ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); if (ret < 0) return ret; ret = invalidate_inode_pages2_range(inode->i_mapping, pos >> PAGE_CACHE_SHIFT, - (pos + left) >> PAGE_CACHE_SHIFT); + (pos + count) >> PAGE_CACHE_SHIFT); if (ret < 0) dout("invalidate_inode_pages2_range returned %d\n", ret); flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE; - if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0) - flags |= CEPH_OSD_FLAG_ACK; - else - num_ops++; /* Also include a 'startsync' command. */ - /* - * we may need to do multiple writes here if we span an object - * boundary. this isn't atomic, unfortunately. :( - */ -more: - io_align = pos & ~PAGE_MASK; - buf_align = (unsigned long)data & ~PAGE_MASK; - len = left; - - snapc = ci->i_snap_realm->cached_context; - vino = ceph_vino(inode); - req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, - vino, pos, &len, num_ops, - CEPH_OSD_OP_WRITE, flags, snapc, - ci->i_truncate_seq, ci->i_truncate_size, - false); - if (IS_ERR(req)) - return PTR_ERR(req); + iov_iter_init(&i, iov, nr_segs, count, 0); + + while (iov_iter_count(&i) > 0) { + void __user *data = i.iov->iov_base + i.iov_offset; + u64 len = i.iov->iov_len - i.iov_offset; + + page_align = (unsigned long)data & ~PAGE_MASK; + + snapc = ci->i_snap_realm->cached_context; + vino = ceph_vino(inode); + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + vino, pos, &len, + 2,/*include a 'startsync' command*/ + CEPH_OSD_OP_WRITE, flags, snapc, + ci->i_truncate_seq, + ci->i_truncate_size, + false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + goto out; + } - /* write from beginning of first page, regardless of io alignment */ - page_align = file->f_flags & O_DIRECT ? buf_align : io_align; - num_pages = calc_pages_for(page_align, len); - if (file->f_flags & O_DIRECT) { + num_pages = calc_pages_for(page_align, len); pages = ceph_get_direct_page_vector(data, num_pages, false); if (IS_ERR(pages)) { ret = PTR_ERR(pages); @@ -577,60 +614,175 @@ more: * may block. */ truncate_inode_pages_range(inode->i_mapping, pos, - (pos+len) | (PAGE_CACHE_SIZE-1)); - } else { + (pos+len) | (PAGE_CACHE_SIZE-1)); + osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align, + false, false); + + /* BUG_ON(vino.snap != CEPH_NOSNAP); */ + ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); + + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!ret) + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + + ceph_put_page_vector(pages, num_pages, false); + +out: + ceph_osdc_put_request(req); + if (ret == 0) { + pos += len; + written += len; + iov_iter_advance(&i, (size_t)len); + + if (pos > i_size_read(inode)) { + check_caps = ceph_inode_set_size(inode, pos); + if (check_caps) + ceph_check_caps(ceph_inode(inode), + CHECK_CAPS_AUTHONLY, + NULL); + } + } else + break; + } + + if (ret != -EOLDSNAPC && written > 0) { + iocb->ki_pos = pos; + ret = written; + } + return ret; +} + + +/* + * Synchronous write, straight from __user pointer or user pages. + * + * If write spans object boundary, just do multiple writes. (For a + * correct atomic write, we should e.g. take write locks on all + * objects, rollback on failure, etc.) + */ +static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, size_t count) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_snap_context *snapc; + struct ceph_vino vino; + struct ceph_osd_request *req; + struct page **pages; + u64 len; + int num_pages; + int written = 0; + int flags; + int check_caps = 0; + int ret; + struct timespec mtime = CURRENT_TIME; + loff_t pos = iocb->ki_pos; + struct iov_iter i; + + if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) + return -EROFS; + + dout("sync_write on file %p %lld~%u\n", file, pos, (unsigned)count); + + ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); + if (ret < 0) + return ret; + + ret = invalidate_inode_pages2_range(inode->i_mapping, + pos >> PAGE_CACHE_SHIFT, + (pos + count) >> PAGE_CACHE_SHIFT); + if (ret < 0) + dout("invalidate_inode_pages2_range returned %d\n", ret); + + flags = CEPH_OSD_FLAG_ORDERSNAP | + CEPH_OSD_FLAG_ONDISK | + CEPH_OSD_FLAG_WRITE | + CEPH_OSD_FLAG_ACK; + + iov_iter_init(&i, iov, nr_segs, count, 0); + + while ((len = iov_iter_count(&i)) > 0) { + size_t left; + int n; + + snapc = ci->i_snap_realm->cached_context; + vino = ceph_vino(inode); + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + vino, pos, &len, 1, + CEPH_OSD_OP_WRITE, flags, snapc, + ci->i_truncate_seq, + ci->i_truncate_size, + false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + goto out; + } + + /* + * write from beginning of first page, + * regardless of io alignment + */ + num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); if (IS_ERR(pages)) { ret = PTR_ERR(pages); goto out; } - ret = ceph_copy_user_to_page_vector(pages, data, pos, len); + + left = len; + for (n = 0; n < num_pages; n++) { + size_t plen = min_t(size_t, left, PAGE_SIZE); + ret = iov_iter_copy_from_user(pages[n], &i, 0, plen); + if (ret != plen) { + ret = -EFAULT; + break; + } + left -= ret; + iov_iter_advance(&i, ret); + } + if (ret < 0) { ceph_release_page_vector(pages, num_pages); goto out; } - if ((file->f_flags & O_SYNC) == 0) { - /* get a second commit callback */ - req->r_unsafe_callback = ceph_sync_write_unsafe; - req->r_inode = inode; - own_pages = true; - } - } - osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align, - false, own_pages); + /* get a second commit callback */ + req->r_unsafe_callback = ceph_sync_write_unsafe; + req->r_inode = inode; - /* BUG_ON(vino.snap != CEPH_NOSNAP); */ - ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, + false, true); - ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); - if (!ret) - ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + /* BUG_ON(vino.snap != CEPH_NOSNAP); */ + ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); - if (file->f_flags & O_DIRECT) - ceph_put_page_vector(pages, num_pages, false); - else if (file->f_flags & O_SYNC) - ceph_release_page_vector(pages, num_pages); + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!ret) + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); out: - ceph_osdc_put_request(req); - if (ret == 0) { - pos += len; - written += len; - left -= len; - data += len; - if (left) - goto more; + ceph_osdc_put_request(req); + if (ret == 0) { + pos += len; + written += len; + + if (pos > i_size_read(inode)) { + check_caps = ceph_inode_set_size(inode, pos); + if (check_caps) + ceph_check_caps(ceph_inode(inode), + CHECK_CAPS_AUTHONLY, + NULL); + } + } else + break; + } + if (ret != -EOLDSNAPC && written > 0) { ret = written; - *ppos = pos; - if (pos > i_size_read(inode)) - check_caps = ceph_inode_set_size(inode, pos); - if (check_caps) - ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, - NULL); - } else if (ret != -EOLDSNAPC && written > 0) { - ret = written; + iocb->ki_pos = pos; } return ret; } @@ -647,55 +799,84 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, { struct file *filp = iocb->ki_filp; struct ceph_file_info *fi = filp->private_data; - loff_t *ppos = &iocb->ki_pos; - size_t len = iov->iov_len; + size_t len = iocb->ki_nbytes; struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); - void __user *base = iov->iov_base; ssize_t ret; int want, got = 0; int checkeof = 0, read = 0; - dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", - inode, ceph_vinop(inode), pos, (unsigned)len, inode); again: + dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", + inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode); + if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_CACHE; ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); if (ret < 0) - goto out; - dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", - inode, ceph_vinop(inode), pos, (unsigned)len, - ceph_cap_string(got)); + return ret; if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_filp->f_flags & O_DIRECT) || - (fi->flags & CEPH_F_SYNC)) + (fi->flags & CEPH_F_SYNC)) { + struct iov_iter i; + + dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n", + inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, + ceph_cap_string(got)); + + if (!read) { + ret = generic_segment_checks(iov, &nr_segs, + &len, VERIFY_WRITE); + if (ret) + goto out; + } + + iov_iter_init(&i, iov, nr_segs, len, read); + /* hmm, this isn't really async... */ - ret = ceph_sync_read(filp, base, len, ppos, &checkeof); - else - ret = generic_file_aio_read(iocb, iov, nr_segs, pos); + ret = ceph_sync_read(iocb, &i, &checkeof); + } else { + /* + * We can't modify the content of iov, + * so we only read from beginning. + */ + if (read) { + iocb->ki_pos = pos; + len = iocb->ki_nbytes; + read = 0; + } + dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", + inode, ceph_vinop(inode), pos, (unsigned)len, + ceph_cap_string(got)); + ret = generic_file_aio_read(iocb, iov, nr_segs, pos); + } out: dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); ceph_put_cap_refs(ci, got); if (checkeof && ret >= 0) { - int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); + int statret = ceph_do_getattr(inode, + CEPH_STAT_CAP_SIZE); /* hit EOF or hole? */ - if (statret == 0 && *ppos < inode->i_size) { - dout("aio_read sync_read hit hole, ppos %lld < size %lld, reading more\n", *ppos, inode->i_size); + if (statret == 0 && iocb->ki_pos < inode->i_size && + ret < len) { + dout("sync_read hit hole, ppos %lld < size %lld" + ", reading more\n", iocb->ki_pos, + inode->i_size); + read += ret; - base += ret; len -= ret; checkeof = 0; goto again; } } + if (ret >= 0) ret += read; @@ -772,11 +953,13 @@ retry_snap: inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || - (iocb->ki_filp->f_flags & O_DIRECT) || - (fi->flags & CEPH_F_SYNC)) { + (file->f_flags & O_DIRECT) || (fi->flags & CEPH_F_SYNC)) { mutex_unlock(&inode->i_mutex); - written = ceph_sync_write(file, iov->iov_base, count, - pos, &iocb->ki_pos); + if (file->f_flags & O_DIRECT) + written = ceph_sync_direct_write(iocb, iov, + nr_segs, count); + else + written = ceph_sync_write(iocb, iov, nr_segs, count); if (written == -EOLDSNAPC) { dout("aio_write %p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n", @@ -1018,7 +1201,7 @@ static long ceph_fallocate(struct file *file, int mode, loff_t offset, loff_t length) { struct ceph_file_info *fi = file->private_data; - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->client->osdc; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 9a8e396..32d519d 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -9,6 +9,7 @@ #include <linux/namei.h> #include <linux/writeback.h> #include <linux/vmalloc.h> +#include <linux/posix_acl.h> #include "super.h" #include "mds_client.h" @@ -95,6 +96,8 @@ const struct inode_operations ceph_file_iops = { .getxattr = ceph_getxattr, .listxattr = ceph_listxattr, .removexattr = ceph_removexattr, + .get_acl = ceph_get_acl, + .set_acl = ceph_set_acl, }; @@ -335,12 +338,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_hold_caps_min = 0; ci->i_hold_caps_max = 0; INIT_LIST_HEAD(&ci->i_cap_delay_list); - ci->i_cap_exporting_mds = 0; - ci->i_cap_exporting_mseq = 0; - ci->i_cap_exporting_issued = 0; INIT_LIST_HEAD(&ci->i_cap_snaps); ci->i_head_snapc = NULL; ci->i_snap_caps = 0; + ci->i_cap_exporting_issued = 0; for (i = 0; i < CEPH_FILE_MODE_NUM; i++) ci->i_nr_by_mode[i] = 0; @@ -436,6 +437,16 @@ void ceph_destroy_inode(struct inode *inode) call_rcu(&inode->i_rcu, ceph_i_callback); } +int ceph_drop_inode(struct inode *inode) +{ + /* + * Positve dentry and corresponding inode are always accompanied + * in MDS reply. So no need to keep inode in the cache after + * dropping all its aliases. + */ + return 1; +} + /* * Helpers to fill in size, ctime, mtime, and atime. We have to be * careful because either the client or MDS may have more up to date @@ -670,6 +681,7 @@ static int fill_inode(struct inode *inode, memcpy(ci->i_xattrs.blob->vec.iov_base, iinfo->xattr_data, iinfo->xattr_len); ci->i_xattrs.version = le64_to_cpu(info->xattr_version); + ceph_forget_all_cached_acls(inode); xattr_blob = NULL; } @@ -978,7 +990,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, struct ceph_mds_reply_inode *ininfo; struct ceph_vino vino; struct ceph_fs_client *fsc = ceph_sb_to_client(sb); - int i = 0; int err = 0; dout("fill_trace %p is_dentry %d is_target %d\n", req, @@ -1039,6 +1050,29 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, } } + if (rinfo->head->is_target) { + vino.ino = le64_to_cpu(rinfo->targeti.in->ino); + vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); + + in = ceph_get_inode(sb, vino); + if (IS_ERR(in)) { + err = PTR_ERR(in); + goto done; + } + req->r_target_inode = in; + + err = fill_inode(in, &rinfo->targeti, NULL, + session, req->r_request_started, + (le32_to_cpu(rinfo->head->result) == 0) ? + req->r_fmode : -1, + &req->r_caps_reservation); + if (err < 0) { + pr_err("fill_inode badness %p %llx.%llx\n", + in, ceph_vinop(in)); + goto done; + } + } + /* * ignore null lease/binding on snapdir ENOENT, or else we * will have trouble splicing in the virtual snapdir later @@ -1108,7 +1142,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ceph_dentry(req->r_old_dentry)->offset); dn = req->r_old_dentry; /* use old_dentry */ - in = dn->d_inode; } /* null dentry? */ @@ -1130,44 +1163,28 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, } /* attach proper inode */ - ininfo = rinfo->targeti.in; - vino.ino = le64_to_cpu(ininfo->ino); - vino.snap = le64_to_cpu(ininfo->snapid); - in = dn->d_inode; - if (!in) { - in = ceph_get_inode(sb, vino); - if (IS_ERR(in)) { - pr_err("fill_trace bad get_inode " - "%llx.%llx\n", vino.ino, vino.snap); - err = PTR_ERR(in); - d_drop(dn); - goto done; - } + if (!dn->d_inode) { + ihold(in); dn = splice_dentry(dn, in, &have_lease, true); if (IS_ERR(dn)) { err = PTR_ERR(dn); goto done; } req->r_dentry = dn; /* may have spliced */ - ihold(in); - } else if (ceph_ino(in) == vino.ino && - ceph_snap(in) == vino.snap) { - ihold(in); - } else { + } else if (dn->d_inode && dn->d_inode != in) { dout(" %p links to %p %llx.%llx, not %llx.%llx\n", - dn, in, ceph_ino(in), ceph_snap(in), - vino.ino, vino.snap); + dn, dn->d_inode, ceph_vinop(dn->d_inode), + ceph_vinop(in)); have_lease = false; - in = NULL; } if (have_lease) update_dentry_lease(dn, rinfo->dlease, session, req->r_request_started); dout(" final dn %p\n", dn); - i++; - } else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP || - req->r_op == CEPH_MDS_OP_MKSNAP) && !req->r_aborted) { + } else if (!req->r_aborted && + (req->r_op == CEPH_MDS_OP_LOOKUPSNAP || + req->r_op == CEPH_MDS_OP_MKSNAP)) { struct dentry *dn = req->r_dentry; /* fill out a snapdir LOOKUPSNAP dentry */ @@ -1177,52 +1194,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ininfo = rinfo->targeti.in; vino.ino = le64_to_cpu(ininfo->ino); vino.snap = le64_to_cpu(ininfo->snapid); - in = ceph_get_inode(sb, vino); - if (IS_ERR(in)) { - pr_err("fill_inode get_inode badness %llx.%llx\n", - vino.ino, vino.snap); - err = PTR_ERR(in); - d_delete(dn); - goto done; - } dout(" linking snapped dir %p to dn %p\n", in, dn); + ihold(in); dn = splice_dentry(dn, in, NULL, true); if (IS_ERR(dn)) { err = PTR_ERR(dn); goto done; } req->r_dentry = dn; /* may have spliced */ - ihold(in); - rinfo->head->is_dentry = 1; /* fool notrace handlers */ } - - if (rinfo->head->is_target) { - vino.ino = le64_to_cpu(rinfo->targeti.in->ino); - vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); - - if (in == NULL || ceph_ino(in) != vino.ino || - ceph_snap(in) != vino.snap) { - in = ceph_get_inode(sb, vino); - if (IS_ERR(in)) { - err = PTR_ERR(in); - goto done; - } - } - req->r_target_inode = in; - - err = fill_inode(in, - &rinfo->targeti, NULL, - session, req->r_request_started, - (le32_to_cpu(rinfo->head->result) == 0) ? - req->r_fmode : -1, - &req->r_caps_reservation); - if (err < 0) { - pr_err("fill_inode badness %p %llx.%llx\n", - in, ceph_vinop(in)); - goto done; - } - } - done: dout("fill_trace done err=%d\n", err); return err; @@ -1272,7 +1252,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct qstr dname; struct dentry *dn; struct inode *in; - int err = 0, i; + int err = 0, ret, i; struct inode *snapdir = NULL; struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; struct ceph_dentry_info *di; @@ -1305,6 +1285,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir); } + /* FIXME: release caps/leases if error occurs */ for (i = 0; i < rinfo->dir_nr; i++) { struct ceph_vino vino; @@ -1329,9 +1310,10 @@ retry_lookup: err = -ENOMEM; goto out; } - err = ceph_init_dentry(dn); - if (err < 0) { + ret = ceph_init_dentry(dn); + if (ret < 0) { dput(dn); + err = ret; goto out; } } else if (dn->d_inode && @@ -1351,9 +1333,6 @@ retry_lookup: spin_unlock(&parent->d_lock); } - di = dn->d_fsdata; - di->offset = ceph_make_fpos(frag, i + r_readdir_offset); - /* inode */ if (dn->d_inode) { in = dn->d_inode; @@ -1366,26 +1345,39 @@ retry_lookup: err = PTR_ERR(in); goto out; } - dn = splice_dentry(dn, in, NULL, false); - if (IS_ERR(dn)) - dn = NULL; } if (fill_inode(in, &rinfo->dir_in[i], NULL, session, req->r_request_started, -1, &req->r_caps_reservation) < 0) { pr_err("fill_inode badness on %p\n", in); + if (!dn->d_inode) + iput(in); + d_drop(dn); goto next_item; } - if (dn) - update_dentry_lease(dn, rinfo->dir_dlease[i], - req->r_session, - req->r_request_started); + + if (!dn->d_inode) { + dn = splice_dentry(dn, in, NULL, false); + if (IS_ERR(dn)) { + err = PTR_ERR(dn); + dn = NULL; + goto next_item; + } + } + + di = dn->d_fsdata; + di->offset = ceph_make_fpos(frag, i + r_readdir_offset); + + update_dentry_lease(dn, rinfo->dir_dlease[i], + req->r_session, + req->r_request_started); next_item: if (dn) dput(dn); } - req->r_did_prepopulate = true; + if (err == 0) + req->r_did_prepopulate = true; out: if (snapdir) { @@ -1474,7 +1466,8 @@ static void ceph_invalidate_work(struct work_struct *work) dout("invalidate_pages %p gen %d revoking %d\n", inode, ci->i_rdcache_gen, ci->i_rdcache_revoking); if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { - /* nevermind! */ + if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) + check = 1; spin_unlock(&ci->i_ceph_lock); mutex_unlock(&ci->i_truncate_mutex); goto out; @@ -1495,13 +1488,14 @@ static void ceph_invalidate_work(struct work_struct *work) dout("invalidate_pages %p gen %d raced, now %d revoking %d\n", inode, orig_gen, ci->i_rdcache_gen, ci->i_rdcache_revoking); + if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) + check = 1; } spin_unlock(&ci->i_ceph_lock); mutex_unlock(&ci->i_truncate_mutex); - +out: if (check) ceph_check_caps(ci, 0, NULL); -out: iput(inode); } @@ -1622,6 +1616,8 @@ static const struct inode_operations ceph_symlink_iops = { .getxattr = ceph_getxattr, .listxattr = ceph_listxattr, .removexattr = ceph_removexattr, + .get_acl = ceph_get_acl, + .set_acl = ceph_set_acl, }; /* @@ -1695,6 +1691,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) dirtied |= CEPH_CAP_AUTH_EXCL; } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || attr->ia_mode != inode->i_mode) { + inode->i_mode = attr->ia_mode; req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode); mask |= CEPH_SETATTR_MODE; release |= CEPH_CAP_AUTH_SHARED; @@ -1810,6 +1807,12 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (inode_dirty_flags) __mark_inode_dirty(inode, inode_dirty_flags); + if (ia_valid & ATTR_MODE) { + err = posix_acl_chmod(inode, attr->ia_mode); + if (err) + goto out_put; + } + if (mask) { req->r_inode = inode; ihold(inode); @@ -1829,6 +1832,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) return err; out: spin_unlock(&ci->i_ceph_lock); +out_put: ceph_mdsc_put_request(req); return err; } diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 669622f..dc66c9e 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -183,6 +183,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->client->osdc; + struct ceph_object_locator oloc; + struct ceph_object_id oid; u64 len = 1, olen; u64 tmp; struct ceph_pg pgid; @@ -211,8 +213,10 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", ceph_ino(inode), dl.object_no); - r = ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap, - ceph_file_layout_pg_pool(ci->i_layout)); + oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); + ceph_oid_set_name(&oid, dl.object_name); + + r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid); if (r < 0) { up_read(&osdc->map_sem); return r; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index d90861f..f4f050a 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -63,7 +63,7 @@ static const struct ceph_connection_operations mds_con_ops; */ static int parse_reply_info_in(void **p, void *end, struct ceph_mds_reply_info_in *info, - int features) + u64 features) { int err = -EIO; @@ -98,7 +98,7 @@ bad: */ static int parse_reply_info_trace(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { int err; @@ -145,7 +145,7 @@ out_bad: */ static int parse_reply_info_dir(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { u32 num, i = 0; int err; @@ -217,7 +217,7 @@ out_bad: */ static int parse_reply_info_filelock(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { if (*p + sizeof(*info->filelock_reply) > end) goto bad; @@ -238,7 +238,7 @@ bad: */ static int parse_reply_info_create(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { if (features & CEPH_FEATURE_REPLY_CREATE_INODE) { if (*p == end) { @@ -262,7 +262,7 @@ bad: */ static int parse_reply_info_extra(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { if (info->head->op == CEPH_MDS_OP_GETFILELOCK) return parse_reply_info_filelock(p, end, info, features); @@ -280,7 +280,7 @@ static int parse_reply_info_extra(void **p, void *end, */ static int parse_reply_info(struct ceph_msg *msg, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { void *p, *end; u32 len; @@ -713,14 +713,15 @@ static int __choose_mds(struct ceph_mds_client *mdsc, struct dentry *dn = get_nonsnap_parent(parent); inode = dn->d_inode; dout("__choose_mds using nonsnap parent %p\n", inode); - } else if (req->r_dentry->d_inode) { + } else { /* dentry target */ inode = req->r_dentry->d_inode; - } else { - /* dir + name */ - inode = dir; - hash = ceph_dentry_hash(dir, req->r_dentry); - is_hash = true; + if (!inode || mode == USE_AUTH_MDS) { + /* dir + name */ + inode = dir; + hash = ceph_dentry_hash(dir, req->r_dentry); + is_hash = true; + } } } @@ -846,35 +847,56 @@ static int __open_session(struct ceph_mds_client *mdsc, * * called under mdsc->mutex */ +static struct ceph_mds_session * +__open_export_target_session(struct ceph_mds_client *mdsc, int target) +{ + struct ceph_mds_session *session; + + session = __ceph_lookup_mds_session(mdsc, target); + if (!session) { + session = register_session(mdsc, target); + if (IS_ERR(session)) + return session; + } + if (session->s_state == CEPH_MDS_SESSION_NEW || + session->s_state == CEPH_MDS_SESSION_CLOSING) + __open_session(mdsc, session); + + return session; +} + +struct ceph_mds_session * +ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target) +{ + struct ceph_mds_session *session; + + dout("open_export_target_session to mds%d\n", target); + + mutex_lock(&mdsc->mutex); + session = __open_export_target_session(mdsc, target); + mutex_unlock(&mdsc->mutex); + + return session; +} + static void __open_export_target_sessions(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_mds_info *mi; struct ceph_mds_session *ts; int i, mds = session->s_mds; - int target; if (mds >= mdsc->mdsmap->m_max_mds) return; + mi = &mdsc->mdsmap->m_info[mds]; dout("open_export_target_sessions for mds%d (%d targets)\n", session->s_mds, mi->num_export_targets); for (i = 0; i < mi->num_export_targets; i++) { - target = mi->export_targets[i]; - ts = __ceph_lookup_mds_session(mdsc, target); - if (!ts) { - ts = register_session(mdsc, target); - if (IS_ERR(ts)) - return; - } - if (session->s_state == CEPH_MDS_SESSION_NEW || - session->s_state == CEPH_MDS_SESSION_CLOSING) - __open_session(mdsc, session); - else - dout(" mds%d target mds%d %p is %s\n", session->s_mds, - i, ts, session_state_name(ts->s_state)); - ceph_put_mds_session(ts); + ts = __open_export_target_session(mdsc, mi->export_targets[i]); + if (!IS_ERR(ts)) + ceph_put_mds_session(ts); } } @@ -1136,6 +1158,21 @@ static int send_renew_caps(struct ceph_mds_client *mdsc, return 0; } +static int send_flushmsg_ack(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, u64 seq) +{ + struct ceph_msg *msg; + + dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n", + session->s_mds, session_state_name(session->s_state), seq); + msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq); + if (!msg) + return -ENOMEM; + ceph_con_send(&session->s_con, msg); + return 0; +} + + /* * Note new cap ttl, and any transition from stale -> not stale (fresh?). * @@ -1214,7 +1251,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) { struct ceph_mds_session *session = arg; struct ceph_inode_info *ci = ceph_inode(inode); - int used, oissued, mine; + int used, wanted, oissued, mine; if (session->s_trim_caps <= 0) return -1; @@ -1222,14 +1259,19 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) spin_lock(&ci->i_ceph_lock); mine = cap->issued | cap->implemented; used = __ceph_caps_used(ci); + wanted = __ceph_caps_file_wanted(ci); oissued = __ceph_caps_issued_other(ci, cap); - dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n", + dout("trim_caps_cb %p cap %p mine %s oissued %s used %s wanted %s\n", inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued), - ceph_cap_string(used)); - if (ci->i_dirty_caps) - goto out; /* dirty caps */ - if ((used & ~oissued) & mine) + ceph_cap_string(used), ceph_cap_string(wanted)); + if (cap == ci->i_auth_cap) { + if (ci->i_dirty_caps | ci->i_flushing_caps) + goto out; + if ((used | wanted) & CEPH_CAP_ANY_WR) + goto out; + } + if ((used | wanted) & ~oissued & mine) goto out; /* we need these caps */ session->s_trim_caps--; @@ -2156,26 +2198,16 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) */ if (result == -ESTALE) { dout("got ESTALE on request %llu", req->r_tid); - if (!req->r_inode) { - /* do nothing; not an authority problem */ - } else if (req->r_direct_mode != USE_AUTH_MDS) { + if (req->r_direct_mode != USE_AUTH_MDS) { dout("not using auth, setting for that now"); req->r_direct_mode = USE_AUTH_MDS; __do_request(mdsc, req); mutex_unlock(&mdsc->mutex); goto out; } else { - struct ceph_inode_info *ci = ceph_inode(req->r_inode); - struct ceph_cap *cap = NULL; - - if (req->r_session) - cap = ceph_get_cap_for_mds(ci, - req->r_session->s_mds); - - dout("already using auth"); - if ((!cap || cap != ci->i_auth_cap) || - (cap->mseq != req->r_sent_on_mseq)) { - dout("but cap changed, so resending"); + int mds = __choose_mds(mdsc, req); + if (mds >= 0 && mds != req->r_session->s_mds) { + dout("but auth changed, so resending"); __do_request(mdsc, req); mutex_unlock(&mdsc->mutex); goto out; @@ -2400,6 +2432,10 @@ static void handle_session(struct ceph_mds_session *session, trim_caps(mdsc, session, le32_to_cpu(h->max_caps)); break; + case CEPH_SESSION_FLUSHMSG: + send_flushmsg_ack(mdsc, session, seq); + break; + default: pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds); WARN_ON(1); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 4c053d0..6828891 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -383,6 +383,8 @@ extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg); +extern struct ceph_mds_session * +ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target); extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c index 89fa4a9..4440f44 100644 --- a/fs/ceph/strings.c +++ b/fs/ceph/strings.c @@ -41,6 +41,8 @@ const char *ceph_session_op_name(int op) case CEPH_SESSION_RENEWCAPS: return "renewcaps"; case CEPH_SESSION_STALE: return "stale"; case CEPH_SESSION_RECALL_STATE: return "recall_state"; + case CEPH_SESSION_FLUSHMSG: return "flushmsg"; + case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack"; } return "???"; } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 6a0951e..2df963f 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -490,10 +490,10 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, struct ceph_options *opt) { struct ceph_fs_client *fsc; - const unsigned supported_features = + const u64 supported_features = CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH; - const unsigned required_features = 0; + const u64 required_features = 0; int page_count; size_t size; int err = -ENOMEM; @@ -686,6 +686,7 @@ static const struct super_operations ceph_super_ops = { .alloc_inode = ceph_alloc_inode, .destroy_inode = ceph_destroy_inode, .write_inode = ceph_write_inode, + .drop_inode = ceph_drop_inode, .sync_fs = ceph_sync_fs, .put_super = ceph_put_super, .show_options = ceph_show_options, @@ -818,7 +819,11 @@ static int ceph_set_super(struct super_block *s, void *data) s->s_flags = fsc->mount_options->sb_flags; s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */ +#ifdef CONFIG_CEPH_FS_POSIX_ACL + s->s_flags |= MS_POSIXACL; +#endif + s->s_xattr = ceph_xattr_handlers; s->s_fs_info = fsc; fsc->sb = s; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index ef4ac38..19793b5 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -287,14 +287,12 @@ struct ceph_inode_info { unsigned long i_hold_caps_min; /* jiffies */ unsigned long i_hold_caps_max; /* jiffies */ struct list_head i_cap_delay_list; /* for delayed cap release to mds */ - int i_cap_exporting_mds; /* to handle cap migration between */ - unsigned i_cap_exporting_mseq; /* mds's. */ - unsigned i_cap_exporting_issued; struct ceph_cap_reservation i_cap_migration_resv; struct list_head i_cap_snaps; /* snapped state pending flush to mds */ struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or dirty|flushing caps */ unsigned i_snap_caps; /* cap bits for snapped files */ + unsigned i_cap_exporting_issued; int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ @@ -335,7 +333,6 @@ struct ceph_inode_info { u32 i_fscache_gen; /* sequence, for delayed fscache validate */ struct work_struct i_revalidate_work; #endif - struct inode vfs_inode; /* at end */ }; @@ -529,6 +526,8 @@ static inline int __ceph_caps_dirty(struct ceph_inode_info *ci) } extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask); +extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci, + struct ceph_cap *ocap, int mask); extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask); extern int __ceph_caps_used(struct ceph_inode_info *ci); @@ -691,6 +690,7 @@ extern const struct inode_operations ceph_file_iops; extern struct inode *ceph_alloc_inode(struct super_block *sb); extern void ceph_destroy_inode(struct inode *inode); +extern int ceph_drop_inode(struct inode *inode); extern struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino); @@ -724,6 +724,9 @@ extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, /* xattr.c */ extern int ceph_setxattr(struct dentry *, const char *, const void *, size_t, int); +int __ceph_setxattr(struct dentry *, const char *, const void *, size_t, int); +ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t); +int __ceph_removexattr(struct dentry *, const char *); extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t); extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); extern int ceph_removexattr(struct dentry *, const char *); @@ -732,6 +735,38 @@ extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci); extern void __init ceph_xattr_init(void); extern void ceph_xattr_exit(void); +/* acl.c */ +extern const struct xattr_handler *ceph_xattr_handlers[]; + +#ifdef CONFIG_CEPH_FS_POSIX_ACL + +struct posix_acl *ceph_get_acl(struct inode *, int); +int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int ceph_init_acl(struct dentry *, struct inode *, struct inode *); +void ceph_forget_all_cached_acls(struct inode *inode); + +#else + +#define ceph_get_acl NULL +#define ceph_set_acl NULL + +static inline int ceph_init_acl(struct dentry *dentry, struct inode *inode, + struct inode *dir) +{ + return 0; +} + +static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode) +{ + return 0; +} + +static inline void ceph_forget_all_cached_acls(struct inode *inode) +{ +} + +#endif + /* caps.c */ extern const char *ceph_cap_string(int c); extern void ceph_handle_caps(struct ceph_mds_session *session, @@ -744,6 +779,7 @@ extern int ceph_add_cap(struct inode *inode, extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release); extern void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap); +extern int ceph_is_any_caps(struct inode *inode); extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino, u64 cap_id, u32 migrate_seq, u32 issue_seq); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index be661d8..898b656 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -6,16 +6,30 @@ #include <linux/ceph/decode.h> #include <linux/xattr.h> +#include <linux/posix_acl_xattr.h> #include <linux/slab.h> #define XATTR_CEPH_PREFIX "ceph." #define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1) +/* + * List of handlers for synthetic system.* attributes. Other + * attributes are handled directly. + */ +const struct xattr_handler *ceph_xattr_handlers[] = { +#ifdef CONFIG_CEPH_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + NULL, +}; + static bool ceph_is_valid_xattr(const char *name) { return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || + !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); } @@ -663,10 +677,9 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci) } } -ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, +ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, size_t size) { - struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); int err; struct ceph_inode_xattr *xattr; @@ -675,7 +688,6 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, if (!ceph_is_valid_xattr(name)) return -ENODATA; - /* let's see if a virtual xattr was requested */ vxattr = ceph_match_vxattr(inode, name); if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) { @@ -725,6 +737,15 @@ out: return err; } +ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, + size_t size) +{ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_getxattr(dentry, name, value, size); + + return __ceph_getxattr(dentry->d_inode, name, value, size); +} + ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) { struct inode *inode = dentry->d_inode; @@ -863,8 +884,8 @@ out: return err; } -int ceph_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +int __ceph_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; struct ceph_vxattr *vxattr; @@ -879,9 +900,6 @@ int ceph_setxattr(struct dentry *dentry, const char *name, struct ceph_inode_xattr *xattr = NULL; int required_blob_size; - if (ceph_snap(inode) != CEPH_NOSNAP) - return -EROFS; - if (!ceph_is_valid_xattr(name)) return -EOPNOTSUPP; @@ -958,6 +976,18 @@ out: return err; } +int ceph_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP) + return -EROFS; + + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_setxattr(dentry, name, value, size, flags); + + return __ceph_setxattr(dentry, name, value, size, flags); +} + static int ceph_send_removexattr(struct dentry *dentry, const char *name) { struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); @@ -984,7 +1014,7 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) return err; } -int ceph_removexattr(struct dentry *dentry, const char *name) +int __ceph_removexattr(struct dentry *dentry, const char *name) { struct inode *inode = dentry->d_inode; struct ceph_vxattr *vxattr; @@ -994,9 +1024,6 @@ int ceph_removexattr(struct dentry *dentry, const char *name) int required_blob_size; int dirty; - if (ceph_snap(inode) != CEPH_NOSNAP) - return -EROFS; - if (!ceph_is_valid_xattr(name)) return -EOPNOTSUPP; @@ -1053,3 +1080,13 @@ out: return err; } +int ceph_removexattr(struct dentry *dentry, const char *name) +{ + if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP) + return -EROFS; + + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_removexattr(dentry, name); + + return __ceph_removexattr(dentry, name); +} diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index aa33976..2c29db6 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -477,9 +477,10 @@ extern int CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon, const int netfid, __u64 *pExtAttrBits, __u64 *pMask); extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb); extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr); -extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr, - const unsigned char *path, - struct cifs_sb_info *cifs_sb, unsigned int xid); +extern int CIFSCheckMFSymlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + struct cifs_fattr *fattr, + const unsigned char *path); extern int mdfour(unsigned char *, unsigned char *, int); extern int E_md4hash(const unsigned char *passwd, unsigned char *p16, const struct nls_table *codepage); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 124aa02..d707edb 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -4010,7 +4010,7 @@ QFileInfoRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc); + cifs_dbg(FYI, "Send error in QFileInfo = %d", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -4179,7 +4179,7 @@ UnixQFileInfoRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc); + cifs_dbg(FYI, "Send error in UnixQFileInfo = %d", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -4263,7 +4263,7 @@ UnixQPathInfoRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc); + cifs_dbg(FYI, "Send error in UnixQPathInfo = %d", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 11ff5f1..a514e0a 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -193,7 +193,7 @@ check_name(struct dentry *direntry) static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, struct tcon_link *tlink, unsigned oflags, umode_t mode, - __u32 *oplock, struct cifs_fid *fid, int *created) + __u32 *oplock, struct cifs_fid *fid) { int rc = -ENOENT; int create_options = CREATE_NOT_DIR; @@ -349,7 +349,6 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, .device = 0, }; - *created |= FILE_CREATED; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { args.uid = current_fsuid(); if (inode->i_mode & S_ISGID) @@ -480,13 +479,16 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, cifs_add_pending_open(&fid, tlink, &open); rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode, - &oplock, &fid, opened); + &oplock, &fid); if (rc) { cifs_del_pending_open(&open); goto out; } + if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) + *opened |= FILE_CREATED; + rc = finish_open(file, direntry, generic_file_open, opened); if (rc) { if (server->ops->close) @@ -529,7 +531,6 @@ int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode, struct TCP_Server_Info *server; struct cifs_fid fid; __u32 oplock; - int created = FILE_CREATED; cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %s and dentry = 0x%p\n", inode, direntry->d_name.name, direntry); @@ -546,7 +547,7 @@ int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode, server->ops->new_lease_key(&fid); rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode, - &oplock, &fid, &created); + &oplock, &fid); if (!rc && server->ops->close) server->ops->close(xid, tcon, &fid); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 36f9ebb..49719b8 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -383,7 +383,8 @@ int cifs_get_inode_info_unix(struct inode **pinode, /* check for Minshall+French symlinks */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { - int tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid); + int tmprc = CIFSCheckMFSymlink(xid, tcon, cifs_sb, &fattr, + full_path); if (tmprc) cifs_dbg(FYI, "CIFSCheckMFSymlink: %d\n", tmprc); } @@ -799,7 +800,8 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, /* check for Minshall+French symlinks */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { - tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid); + tmprc = CIFSCheckMFSymlink(xid, tcon, cifs_sb, &fattr, + full_path); if (tmprc) cifs_dbg(FYI, "CIFSCheckMFSymlink: %d\n", tmprc); } diff --git a/fs/cifs/link.c b/fs/cifs/link.c index cc023471..92aee08 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -354,34 +354,30 @@ open_query_close_cifs_symlink(const unsigned char *path, char *pbuf, int -CIFSCheckMFSymlink(struct cifs_fattr *fattr, - const unsigned char *path, - struct cifs_sb_info *cifs_sb, unsigned int xid) +CIFSCheckMFSymlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, + const unsigned char *path) { - int rc = 0; + int rc; u8 *buf = NULL; unsigned int link_len = 0; unsigned int bytes_read = 0; - struct cifs_tcon *ptcon; if (!CIFSCouldBeMFSymlink(fattr)) /* it's not a symlink */ return 0; buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); - if (!buf) { - rc = -ENOMEM; - goto out; - } + if (!buf) + return -ENOMEM; - ptcon = tlink_tcon(cifs_sb_tlink(cifs_sb)); - if ((ptcon->ses) && (ptcon->ses->server->ops->query_mf_symlink)) - rc = ptcon->ses->server->ops->query_mf_symlink(path, buf, - &bytes_read, cifs_sb, xid); + if (tcon->ses->server->ops->query_mf_symlink) + rc = tcon->ses->server->ops->query_mf_symlink(path, buf, + &bytes_read, cifs_sb, xid); else - goto out; + rc = -ENOSYS; - if (rc != 0) + if (rc) goto out; if (bytes_read == 0) /* not a symlink */ diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index dc52e13..3881610 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -680,7 +680,8 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, struct i2c_msg __user *tmsgs; struct i2c_msg32 __user *umsgs; compat_caddr_t datap; - int nmsgs, i; + u32 nmsgs; + int i; if (get_user(nmsgs, &udata->nmsgs)) return -EFAULT; diff --git a/fs/coredump.c b/fs/coredump.c index bc3fbcd..e3ad709 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -40,7 +40,6 @@ #include <trace/events/task.h> #include "internal.h" -#include "coredump.h" #include <trace/events/sched.h> diff --git a/fs/coredump.h b/fs/coredump.h deleted file mode 100644 index e39ff07..0000000 --- a/fs/coredump.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _FS_COREDUMP_H -#define _FS_COREDUMP_H - -extern int __get_dumpable(unsigned long mm_flags); - -#endif diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index e501ac3..06610cf 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -17,14 +17,30 @@ #include <linux/init.h> #include <linux/string.h> #include <linux/blkdev.h> -#include <linux/cramfs_fs.h> #include <linux/slab.h> -#include <linux/cramfs_fs_sb.h> #include <linux/vfs.h> #include <linux/mutex.h> - +#include <uapi/linux/cramfs_fs.h> #include <asm/uaccess.h> +#include "internal.h" + +/* + * cramfs super-block data in memory + */ +struct cramfs_sb_info { + unsigned long magic; + unsigned long size; + unsigned long blocks; + unsigned long files; + unsigned long flags; +}; + +static inline struct cramfs_sb_info *CRAMFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + static const struct super_operations cramfs_ops; static const struct inode_operations cramfs_dir_inode_operations; static const struct file_operations cramfs_directory_operations; @@ -219,10 +235,11 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i return read_buffers[buffer] + offset; } -static void cramfs_put_super(struct super_block *sb) +static void cramfs_kill_sb(struct super_block *sb) { - kfree(sb->s_fs_info); - sb->s_fs_info = NULL; + struct cramfs_sb_info *sbi = CRAMFS_SB(sb); + kill_block_super(sb); + kfree(sbi); } static int cramfs_remount(struct super_block *sb, int *flags, char *data) @@ -261,7 +278,7 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) if (super.magic == CRAMFS_MAGIC_WEND) { if (!silent) printk(KERN_ERR "cramfs: wrong endianness\n"); - goto out; + return -EINVAL; } /* check at 512 byte offset */ @@ -273,20 +290,20 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) printk(KERN_ERR "cramfs: wrong endianness\n"); else if (!silent) printk(KERN_ERR "cramfs: wrong magic\n"); - goto out; + return -EINVAL; } } /* get feature flags first */ if (super.flags & ~CRAMFS_SUPPORTED_FLAGS) { printk(KERN_ERR "cramfs: unsupported filesystem features\n"); - goto out; + return -EINVAL; } /* Check that the root inode is in a sane state */ if (!S_ISDIR(super.root.mode)) { printk(KERN_ERR "cramfs: root is not a directory\n"); - goto out; + return -EINVAL; } /* correct strange, hard-coded permissions of mkcramfs */ super.root.mode |= (S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); @@ -310,22 +327,18 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) (root_offset != 512 + sizeof(struct cramfs_super)))) { printk(KERN_ERR "cramfs: bad root offset %lu\n", root_offset); - goto out; + return -EINVAL; } /* Set it all up.. */ sb->s_op = &cramfs_ops; root = get_cramfs_inode(sb, &super.root, 0); if (IS_ERR(root)) - goto out; + return PTR_ERR(root); sb->s_root = d_make_root(root); if (!sb->s_root) - goto out; + return -ENOMEM; return 0; -out: - kfree(sbi); - sb->s_fs_info = NULL; - return -EINVAL; } static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -550,7 +563,6 @@ static const struct inode_operations cramfs_dir_inode_operations = { }; static const struct super_operations cramfs_ops = { - .put_super = cramfs_put_super, .remount_fs = cramfs_remount, .statfs = cramfs_statfs, }; @@ -565,7 +577,7 @@ static struct file_system_type cramfs_fs_type = { .owner = THIS_MODULE, .name = "cramfs", .mount = cramfs_mount, - .kill_sb = kill_block_super, + .kill_sb = cramfs_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("cramfs"); diff --git a/fs/cramfs/internal.h b/fs/cramfs/internal.h new file mode 100644 index 0000000..349d712 --- /dev/null +++ b/fs/cramfs/internal.h @@ -0,0 +1,4 @@ +/* Uncompression interfaces to the underlying zlib */ +int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen); +int cramfs_uncompress_init(void); +void cramfs_uncompress_exit(void); diff --git a/fs/cramfs/uncompress.c b/fs/cramfs/uncompress.c index 0233298..1760c1b 100644 --- a/fs/cramfs/uncompress.c +++ b/fs/cramfs/uncompress.c @@ -19,7 +19,7 @@ #include <linux/errno.h> #include <linux/vmalloc.h> #include <linux/zlib.h> -#include <linux/cramfs_fs.h> +#include "internal.h" static z_stream stream; static int initialized; diff --git a/fs/dcache.c b/fs/dcache.c index 4bdb300..265e0ce 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -192,7 +192,7 @@ static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char if (!tcount) return 0; } - mask = ~(~0ul << tcount*8); + mask = bytemask_from_count(tcount); return unlikely(!!((a ^ b) & mask)); } @@ -3061,8 +3061,13 @@ char *d_path(const struct path *path, char *buf, int buflen) * thus don't need to be hashed. They also don't need a name until a * user wants to identify the object in /proc/pid/fd/. The little hack * below allows us to generate a name for these objects on demand: + * + * Some pseudo inodes are mountable. When they are mounted + * path->dentry == path->mnt->mnt_root. In that case don't call d_dname + * and instead have d_path return the mounted path. */ - if (path->dentry->d_op && path->dentry->d_op->d_dname) + if (path->dentry->d_op && path->dentry->d_op->d_dname && + (!IS_ROOT(path->dentry) || path->dentry != path->mnt->mnt_root)) return path->dentry->d_op->d_dname(path->dentry, buf, buflen); rcu_read_lock(); @@ -3111,26 +3116,28 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen) /* * Write full pathname from the root of the filesystem into the buffer. */ -static char *__dentry_path(struct dentry *dentry, char *buf, int buflen) +static char *__dentry_path(struct dentry *d, char *buf, int buflen) { + struct dentry *dentry; char *end, *retval; int len, seq = 0; int error = 0; + if (buflen < 2) + goto Elong; + rcu_read_lock(); restart: + dentry = d; end = buf + buflen; len = buflen; prepend(&end, &len, "\0", 1); - if (buflen < 1) - goto Elong; /* Get '/' right */ retval = end-1; *retval = '/'; read_seqbegin_or_lock(&rename_lock, &seq); while (!IS_ROOT(dentry)) { struct dentry *parent = dentry->d_parent; - int error; prefetch(parent); error = prepend_name(&end, &len, &dentry->d_name); diff --git a/fs/dcookies.c b/fs/dcookies.c index ab5954b..ac44a69 100644 --- a/fs/dcookies.c +++ b/fs/dcookies.c @@ -204,7 +204,7 @@ out: } #ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, size_t, len) +COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, compat_size_t, len) { #ifdef __BIG_ENDIAN return sys_lookup_dcookie(((u64)w0 << 32) | w1, buf, len); diff --git a/fs/direct-io.c b/fs/direct-io.c index 0e04142..160a548 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -375,7 +375,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, bio = bio_alloc(GFP_KERNEL, nr_vecs); bio->bi_bdev = bdev; - bio->bi_sector = first_sector; + bio->bi_iter.bi_sector = first_sector; if (dio->is_async) bio->bi_end_io = dio_bio_end_aio; else @@ -719,7 +719,7 @@ static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio, if (sdio->bio) { loff_t cur_offset = sdio->cur_page_fs_offset; loff_t bio_next_offset = sdio->logical_offset_in_bio + - sdio->bio->bi_size; + sdio->bio->bi_iter.bi_size; /* * See whether this new request is contiguous with the old. diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index d90909e..3190ca9 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -649,6 +649,7 @@ static void process_sctp_notification(struct connection *con, struct msghdr *msg, char *buf) { union sctp_notification *sn = (union sctp_notification *)buf; + struct linger linger; switch (sn->sn_header.sn_type) { case SCTP_SEND_FAILED: @@ -713,11 +714,11 @@ static void process_sctp_notification(struct connection *con, return; /* Peel off a new sock */ - sctp_lock_sock(con->sock->sk); + lock_sock(con->sock->sk); ret = sctp_do_peeloff(con->sock->sk, sn->sn_assoc_change.sac_assoc_id, &new_con->sock); - sctp_release_sock(con->sock->sk); + release_sock(con->sock->sk); if (ret < 0) { log_print("Can't peel off a socket for " "connection %d to node %d: err=%d", @@ -727,6 +728,13 @@ static void process_sctp_notification(struct connection *con, } add_sock(new_con->sock, new_con); + linger.l_onoff = 1; + linger.l_linger = 0; + ret = kernel_setsockopt(new_con->sock, SOL_SOCKET, SO_LINGER, + (char *)&linger, sizeof(linger)); + if (ret < 0) + log_print("set socket option SO_LINGER failed"); + log_print("connecting to %d sctp association %d", nodeid, (int)sn->sn_assoc_change.sac_assoc_id); diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index c36c448..b167ca4 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -659,19 +659,17 @@ out_lock: return rc; } -static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf, - size_t *bufsiz) +static char *ecryptfs_readlink_lower(struct dentry *dentry, size_t *bufsiz) { struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); char *lower_buf; + char *buf; mm_segment_t old_fs; int rc; lower_buf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!lower_buf) { - rc = -ENOMEM; - goto out; - } + if (!lower_buf) + return ERR_PTR(-ENOMEM); old_fs = get_fs(); set_fs(get_ds()); rc = lower_dentry->d_inode->i_op->readlink(lower_dentry, @@ -680,21 +678,18 @@ static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf, set_fs(old_fs); if (rc < 0) goto out; - rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry->d_sb, + rc = ecryptfs_decode_and_decrypt_filename(&buf, bufsiz, dentry->d_sb, lower_buf, rc); out: kfree(lower_buf); - return rc; + return rc ? ERR_PTR(rc) : buf; } static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd) { - char *buf; - size_t len = PATH_MAX; - int rc; - - rc = ecryptfs_readlink_lower(dentry, &buf, &len); - if (rc) + size_t len; + char *buf = ecryptfs_readlink_lower(dentry, &len); + if (IS_ERR(buf)) goto out; fsstack_copy_attr_atime(dentry->d_inode, ecryptfs_dentry_to_lower(dentry)->d_inode); @@ -1003,10 +998,12 @@ static int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry, char *target; size_t targetsiz; - rc = ecryptfs_readlink_lower(dentry, &target, &targetsiz); - if (!rc) { + target = ecryptfs_readlink_lower(dentry, &targetsiz); + if (!IS_ERR(target)) { kfree(target); stat->size = targetsiz; + } else { + rc = PTR_ERR(target); } } return rc; diff --git a/fs/efs/super.c b/fs/efs/super.c index c6f57a7..50215bb 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -26,11 +26,18 @@ static struct dentry *efs_mount(struct file_system_type *fs_type, return mount_bdev(fs_type, flags, dev_name, data, efs_fill_super); } +static void efs_kill_sb(struct super_block *s) +{ + struct efs_sb_info *sbi = SUPER_INFO(s); + kill_block_super(s); + kfree(sbi); +} + static struct file_system_type efs_fs_type = { .owner = THIS_MODULE, .name = "efs", .mount = efs_mount, - .kill_sb = kill_block_super, + .kill_sb = efs_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("efs"); @@ -105,12 +112,6 @@ static void destroy_inodecache(void) kmem_cache_destroy(efs_inode_cachep); } -static void efs_put_super(struct super_block *s) -{ - kfree(s->s_fs_info); - s->s_fs_info = NULL; -} - static int efs_remount(struct super_block *sb, int *flags, char *data) { *flags |= MS_RDONLY; @@ -120,7 +121,6 @@ static int efs_remount(struct super_block *sb, int *flags, char *data) static const struct super_operations efs_superblock_operations = { .alloc_inode = efs_alloc_inode, .destroy_inode = efs_destroy_inode, - .put_super = efs_put_super, .statfs = efs_statfs, .remount_fs = efs_remount, }; @@ -259,7 +259,6 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) struct efs_sb_info *sb; struct buffer_head *bh; struct inode *root; - int ret = -EINVAL; sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL); if (!sb) @@ -270,7 +269,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) { printk(KERN_ERR "EFS: device does not support %d byte blocks\n", EFS_BLOCKSIZE); - goto out_no_fs_ul; + return -EINVAL; } /* read the vh (volume header) block */ @@ -278,7 +277,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) if (!bh) { printk(KERN_ERR "EFS: cannot read volume header\n"); - goto out_no_fs_ul; + return -EINVAL; } /* @@ -290,13 +289,13 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) brelse(bh); if (sb->fs_start == -1) { - goto out_no_fs_ul; + return -EINVAL; } bh = sb_bread(s, sb->fs_start + EFS_SUPER); if (!bh) { printk(KERN_ERR "EFS: cannot read superblock\n"); - goto out_no_fs_ul; + return -EINVAL; } if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) { @@ -304,7 +303,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) printk(KERN_WARNING "EFS: invalid superblock at block %u\n", sb->fs_start + EFS_SUPER); #endif brelse(bh); - goto out_no_fs_ul; + return -EINVAL; } brelse(bh); @@ -319,24 +318,16 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) root = efs_iget(s, EFS_ROOTINODE); if (IS_ERR(root)) { printk(KERN_ERR "EFS: get root inode failed\n"); - ret = PTR_ERR(root); - goto out_no_fs; + return PTR_ERR(root); } s->s_root = d_make_root(root); if (!(s->s_root)) { printk(KERN_ERR "EFS: get root dentry failed\n"); - ret = -ENOMEM; - goto out_no_fs; + return -ENOMEM; } return 0; - -out_no_fs_ul: -out_no_fs: - s->s_fs_info = NULL; - kfree(sb); - return ret; } static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) { diff --git a/fs/eventfd.c b/fs/eventfd.c index 35470d9..d6a88e7 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -349,15 +349,12 @@ EXPORT_SYMBOL_GPL(eventfd_fget); */ struct eventfd_ctx *eventfd_ctx_fdget(int fd) { - struct file *file; struct eventfd_ctx *ctx; - - file = eventfd_fget(fd); - if (IS_ERR(file)) - return (struct eventfd_ctx *) file; - ctx = eventfd_ctx_get(file->private_data); - fput(file); - + struct fd f = fdget(fd); + if (!f.file) + return ERR_PTR(-EBADF); + ctx = eventfd_ctx_fileget(f.file); + fdput(f); return ctx; } EXPORT_SYMBOL_GPL(eventfd_ctx_fdget); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 8b5e258..af90312 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1907,10 +1907,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, } } } - if (op == EPOLL_CTL_DEL && is_file_epoll(tf.file)) { - tep = tf.file->private_data; - mutex_lock_nested(&tep->mtx, 1); - } /* * Try to lookup the file inside our RB tree, Since we grabbed "mtx" @@ -62,7 +62,6 @@ #include <trace/events/task.h> #include "internal.h" -#include "coredump.h" #include <trace/events/sched.h> @@ -843,7 +842,6 @@ static int exec_mmap(struct mm_struct *mm) tsk->active_mm = mm; activate_mm(active_mm, mm); task_unlock(tsk); - arch_pick_mmap_layout(mm); if (old_mm) { up_read(&old_mm->mmap_sem); BUG_ON(active_mm != old_mm); @@ -1088,8 +1086,8 @@ int flush_old_exec(struct linux_binprm * bprm) bprm->mm = NULL; /* We're using it now */ set_fs(USER_DS); - current->flags &= - ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | PF_NOFREEZE); + current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | + PF_NOFREEZE | PF_NO_SETAFFINITY); flush_thread(); current->personality &= ~bprm->per_clear; @@ -1139,9 +1137,7 @@ void setup_new_exec(struct linux_binprm * bprm) /* An exec changes our domain. We are no longer part of the thread group */ - current->self_exec_id++; - flush_signal_handlers(current, 0); do_close_on_exec(current->files); } @@ -1173,6 +1169,10 @@ void free_bprm(struct linux_binprm *bprm) mutex_unlock(¤t->signal->cred_guard_mutex); abort_creds(bprm->cred); } + if (bprm->file) { + allow_write_access(bprm->file); + fput(bprm->file); + } /* If a binfmt changed the interp, free it. */ if (bprm->interp != bprm->filename) kfree(bprm->interp); @@ -1224,11 +1224,10 @@ EXPORT_SYMBOL(install_exec_creds); * - the caller must hold ->cred_guard_mutex to protect against * PTRACE_ATTACH */ -static int check_unsafe_exec(struct linux_binprm *bprm) +static void check_unsafe_exec(struct linux_binprm *bprm) { struct task_struct *p = current, *t; unsigned n_fs; - int res = 0; if (p->ptrace) { if (p->ptrace & PT_PTRACE_CAP) @@ -1244,31 +1243,25 @@ static int check_unsafe_exec(struct linux_binprm *bprm) if (current->no_new_privs) bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS; + t = p; n_fs = 1; spin_lock(&p->fs->lock); rcu_read_lock(); - for (t = next_thread(p); t != p; t = next_thread(t)) { + while_each_thread(p, t) { if (t->fs == p->fs) n_fs++; } rcu_read_unlock(); - if (p->fs->users > n_fs) { + if (p->fs->users > n_fs) bprm->unsafe |= LSM_UNSAFE_SHARE; - } else { - res = -EAGAIN; - if (!p->fs->in_exec) { - p->fs->in_exec = 1; - res = 1; - } - } + else + p->fs->in_exec = 1; spin_unlock(&p->fs->lock); - - return res; } -/* - * Fill the binprm structure from the inode. +/* + * Fill the binprm structure from the inode. * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes * * This may be called multiple times for binary chains (scripts for example). @@ -1430,14 +1423,7 @@ static int exec_binprm(struct linux_binprm *bprm) audit_bprm(bprm); trace_sched_process_exec(current, old_pid, bprm); ptrace_event(PTRACE_EVENT_EXEC, old_vpid); - current->did_exec = 1; proc_exec_connector(current); - - if (bprm->file) { - allow_write_access(bprm->file); - fput(bprm->file); - bprm->file = NULL; /* to catch use-after-free */ - } } return ret; @@ -1453,7 +1439,6 @@ static int do_execve_common(const char *filename, struct linux_binprm *bprm; struct file *file; struct files_struct *displaced; - bool clear_in_exec; int retval; /* @@ -1485,10 +1470,7 @@ static int do_execve_common(const char *filename, if (retval) goto out_free; - retval = check_unsafe_exec(bprm); - if (retval < 0) - goto out_free; - clear_in_exec = retval; + check_unsafe_exec(bprm); current->in_execve = 1; file = open_exec(filename); @@ -1504,7 +1486,7 @@ static int do_execve_common(const char *filename, retval = bprm_mm_init(bprm); if (retval) - goto out_file; + goto out_unmark; bprm->argc = count(argv, MAX_ARG_STRINGS); if ((retval = bprm->argc) < 0) @@ -1551,15 +1533,8 @@ out: mmput(bprm->mm); } -out_file: - if (bprm->file) { - allow_write_access(bprm->file); - fput(bprm->file); - } - out_unmark: - if (clear_in_exec) - current->fs->in_exec = 0; + current->fs->in_exec = 0; current->in_execve = 0; out_free: @@ -1609,67 +1584,22 @@ void set_binfmt(struct linux_binfmt *new) if (new) __module_get(new->module); } - EXPORT_SYMBOL(set_binfmt); /* - * set_dumpable converts traditional three-value dumpable to two flags and - * stores them into mm->flags. It modifies lower two bits of mm->flags, but - * these bits are not changed atomically. So get_dumpable can observe the - * intermediate state. To avoid doing unexpected behavior, get get_dumpable - * return either old dumpable or new one by paying attention to the order of - * modifying the bits. - * - * dumpable | mm->flags (binary) - * old new | initial interim final - * ---------+----------------------- - * 0 1 | 00 01 01 - * 0 2 | 00 10(*) 11 - * 1 0 | 01 00 00 - * 1 2 | 01 11 11 - * 2 0 | 11 10(*) 00 - * 2 1 | 11 11 01 - * - * (*) get_dumpable regards interim value of 10 as 11. + * set_dumpable stores three-value SUID_DUMP_* into mm->flags. */ void set_dumpable(struct mm_struct *mm, int value) { - switch (value) { - case SUID_DUMP_DISABLE: - clear_bit(MMF_DUMPABLE, &mm->flags); - smp_wmb(); - clear_bit(MMF_DUMP_SECURELY, &mm->flags); - break; - case SUID_DUMP_USER: - set_bit(MMF_DUMPABLE, &mm->flags); - smp_wmb(); - clear_bit(MMF_DUMP_SECURELY, &mm->flags); - break; - case SUID_DUMP_ROOT: - set_bit(MMF_DUMP_SECURELY, &mm->flags); - smp_wmb(); - set_bit(MMF_DUMPABLE, &mm->flags); - break; - } -} - -int __get_dumpable(unsigned long mm_flags) -{ - int ret; + unsigned long old, new; - ret = mm_flags & MMF_DUMPABLE_MASK; - return (ret > SUID_DUMP_USER) ? SUID_DUMP_ROOT : ret; -} + if (WARN_ON((unsigned)value > SUID_DUMP_ROOT)) + return; -/* - * This returns the actual value of the suid_dumpable flag. For things - * that are using this for checking for privilege transitions, it must - * test against SUID_DUMP_USER rather than treating it as a boolean - * value. - */ -int get_dumpable(struct mm_struct *mm) -{ - return __get_dumpable(mm->flags); + do { + old = ACCESS_ONCE(mm->flags); + new = (old & ~MMF_DUMPABLE_MASK) | value; + } while (cmpxchg(&mm->flags, old, new) != old); } SYSCALL_DEFINE3(execve, diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index a52a5d2..ee4317fa 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -577,7 +577,7 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) if (offset >= i_size) { *uptodate = true; - EXOFS_DBGMSG("offset >= i_size index=0x%lx\n", index); + EXOFS_DBGMSG2("offset >= i_size index=0x%lx\n", index); return ZERO_PAGE(0); } @@ -596,10 +596,10 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) *uptodate = true; else *uptodate = PageUptodate(page); - EXOFS_DBGMSG("index=0x%lx uptodate=%d\n", index, *uptodate); + EXOFS_DBGMSG2("index=0x%lx uptodate=%d\n", index, *uptodate); return page; } else { - EXOFS_DBGMSG("YES that_locked_page index=0x%lx\n", + EXOFS_DBGMSG2("YES that_locked_page index=0x%lx\n", pcol->that_locked_page->index); *uptodate = true; return pcol->that_locked_page; @@ -611,11 +611,11 @@ static void __r4w_put_page(void *priv, struct page *page) struct page_collect *pcol = priv; if ((pcol->that_locked_page != page) && (ZERO_PAGE(0) != page)) { - EXOFS_DBGMSG("index=0x%lx\n", page->index); + EXOFS_DBGMSG2("index=0x%lx\n", page->index); page_cache_release(page); return; } - EXOFS_DBGMSG("that_locked_page index=0x%lx\n", + EXOFS_DBGMSG2("that_locked_page index=0x%lx\n", ZERO_PAGE(0) == page ? -1 : page->index); } @@ -961,6 +961,14 @@ static void exofs_invalidatepage(struct page *page, unsigned int offset, WARN_ON(1); } + + /* TODO: Should be easy enough to do proprly */ +static ssize_t exofs_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, unsigned long nr_segs) +{ + return 0; +} + const struct address_space_operations exofs_aops = { .readpage = exofs_readpage, .readpages = exofs_readpages, @@ -974,7 +982,7 @@ const struct address_space_operations exofs_aops = { /* Not implemented Yet */ .bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */ - .direct_IO = NULL, /* TODO: Should be trivial to do */ + .direct_IO = exofs_direct_IO, /* With these NULL has special meaning or default is not exported */ .get_xip_mem = NULL, @@ -1010,7 +1018,7 @@ static int _do_truncate(struct inode *inode, loff_t newsize) if (likely(!ret)) truncate_setsize(inode, newsize); - EXOFS_DBGMSG("(0x%lx) size=0x%llx ret=>%d\n", + EXOFS_DBGMSG2("(0x%lx) size=0x%llx ret=>%d\n", inode->i_ino, newsize, ret); return ret; } @@ -1094,14 +1102,13 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, /* If object is lost on target we might as well enable it's * delete. */ - if ((ret == -ENOENT) || (ret == -EINVAL)) - ret = 0; + ret = 0; goto out; } ret = extract_attr_from_ios(ios, &attrs[0]); if (ret) { - EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); + EXOFS_ERR("%s: extract_attr 0 of inode failed\n", __func__); goto out; } WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE); @@ -1109,7 +1116,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, ret = extract_attr_from_ios(ios, &attrs[1]); if (ret) { - EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); + EXOFS_ERR("%s: extract_attr 1 of inode failed\n", __func__); goto out; } if (attrs[1].len) { @@ -1124,7 +1131,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, ret = extract_attr_from_ios(ios, &attrs[2]); if (ret) { - EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); + EXOFS_ERR("%s: extract_attr 2 of inode failed\n", __func__); goto out; } if (attrs[2].len) { diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index b744228..dae8846 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -103,7 +103,7 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) layout->max_io_length = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) * - layout->group_width; + (layout->group_width - layout->parity); if (layout->parity) { unsigned stripe_length = (layout->group_width - layout->parity) * @@ -286,7 +286,8 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, if (length) { ore_calc_stripe_info(layout, offset, length, &ios->si); ios->length = ios->si.length; - ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE; + ios->nr_pages = ((ios->offset & (PAGE_SIZE - 1)) + + ios->length + PAGE_SIZE - 1) / PAGE_SIZE; if (layout->parity) _ore_post_alloc_raid_stuff(ios); } @@ -430,8 +431,12 @@ int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error) if (likely(!ret)) continue; - if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { - /* start read offset passed endof file */ + if ((OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) && + per_dev->bio) { + /* start read offset passed endof file. + * Note: if we do not have bio it means read-attributes + * In this case we should return error to caller. + */ _clear_bio(per_dev->bio); ORE_DBGMSG("start read offset passed end of file " "offset=0x%llx, length=0x%llx\n", @@ -536,6 +541,7 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, u64 H = LmodS - G * T; u32 N = div_u64(H, U); + u32 Nlast; /* "H - (N * U)" is just "H % U" so it's bound to u32 */ u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width; @@ -568,6 +574,10 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, si->length = T - H; if (si->length > length) si->length = length; + + Nlast = div_u64(H + si->length + U - 1, U); + si->maxdevUnits = Nlast - N; + si->M = M; } EXPORT_SYMBOL(ore_calc_stripe_info); @@ -583,13 +593,16 @@ int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, int ret; if (per_dev->bio == NULL) { - unsigned pages_in_stripe = ios->layout->group_width * - (ios->layout->stripe_unit / PAGE_SIZE); - unsigned nr_pages = ios->nr_pages * ios->layout->group_width / - (ios->layout->group_width - - ios->layout->parity); - unsigned bio_size = (nr_pages + pages_in_stripe) / - ios->layout->group_width; + unsigned bio_size; + + if (!ios->reading) { + bio_size = ios->si.maxdevUnits; + } else { + bio_size = (ios->si.maxdevUnits + 1) * + (ios->layout->group_width - ios->layout->parity) / + ios->layout->group_width; + } + bio_size *= (ios->layout->stripe_unit / PAGE_SIZE); per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); if (unlikely(!per_dev->bio)) { @@ -609,8 +622,12 @@ int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, added_len = bio_add_pc_page(q, per_dev->bio, pages[pg], pglen, pgbase); if (unlikely(pglen != added_len)) { - ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n", - per_dev->bio->bi_vcnt); + /* If bi_vcnt == bi_max then this is a SW BUG */ + ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=0x%x " + "bi_max=0x%x BIO_MAX=0x%x cur_len=0x%x\n", + per_dev->bio->bi_vcnt, + per_dev->bio->bi_max_vecs, + BIO_MAX_PAGES_KMALLOC, cur_len); ret = -ENOMEM; goto out; } @@ -1098,7 +1115,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *oc, size_attr->attr = g_attr_logical_length; size_attr->attr.val_ptr = &size_attr->newsize; - ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", + ORE_DBGMSG2("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", _LLU(oc->comps->obj.id), _LLU(obj_size), i); ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, &size_attr->attr); diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 110b6b3..1b8001b 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -148,13 +148,6 @@ ext2_get_acl(struct inode *inode, int type) struct posix_acl *acl; int retval; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return NULL; - - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - switch (type) { case ACL_TYPE_ACCESS: name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -189,19 +182,14 @@ ext2_get_acl(struct inode *inode, int type) /* * inode->i_mutex: down */ -static int -ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) +int +ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) { int name_index; void *value = NULL; size_t size = 0; int error; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return 0; - switch(type) { case ACL_TYPE_ACCESS: name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -250,169 +238,21 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) int ext2_init_acl(struct inode *inode, struct inode *dir) { - struct posix_acl *acl = NULL; - int error = 0; - - if (!S_ISLNK(inode->i_mode)) { - if (test_opt(dir->i_sb, POSIX_ACL)) { - acl = ext2_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - if (!acl) - inode->i_mode &= ~current_umask(); - } - if (test_opt(inode->i_sb, POSIX_ACL) && acl) { - if (S_ISDIR(inode->i_mode)) { - error = ext2_set_acl(inode, ACL_TYPE_DEFAULT, acl); - if (error) - goto cleanup; - } - error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); - if (error < 0) - return error; - if (error > 0) { - /* This is an extended ACL */ - error = ext2_set_acl(inode, ACL_TYPE_ACCESS, acl); - } - } -cleanup: - posix_acl_release(acl); - return error; -} - -/* - * Does chmod for an inode that may have an Access Control List. The - * inode->i_mode field must be updated to the desired value by the caller - * before calling this function. - * Returns 0 on success, or a negative error number. - * - * We change the ACL rather than storing some ACL entries in the file - * mode permission bits (which would be more efficient), because that - * would break once additional permissions (like ACL_APPEND, ACL_DELETE - * for directories) are added. There are no more bits available in the - * file mode. - * - * inode->i_mutex: down - */ -int -ext2_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; - int error; + struct posix_acl *default_acl, *acl; + int error; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return 0; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); if (error) return error; - error = ext2_set_acl(inode, ACL_TYPE_ACCESS, acl); - posix_acl_release(acl); - return error; -} -/* - * Extended attribut handlers - */ -static size_t -ext2_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return 0; - if (list && size <= list_size) - memcpy(list, POSIX_ACL_XATTR_ACCESS, size); - return size; -} - -static size_t -ext2_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return 0; - if (list && size <= list_size) - memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); - return size; -} - -static int -ext2_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, - size_t size, int type) -{ - struct posix_acl *acl; - int error; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return -EOPNOTSUPP; - - acl = ext2_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return error; -} - -static int -ext2_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags, int type) -{ - struct posix_acl *acl; - int error; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return -EOPNOTSUPP; - if (!inode_owner_or_capable(dentry->d_inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - error = posix_acl_valid(acl); - if (error) - goto release_and_out; - } - } else - acl = NULL; - - error = ext2_set_acl(dentry->d_inode, type, acl); - -release_and_out: - posix_acl_release(acl); + if (default_acl) { + error = ext2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + posix_acl_release(default_acl); + } + if (acl) { + if (!error) + error = ext2_set_acl(inode, acl, ACL_TYPE_ACCESS); + posix_acl_release(acl); + } return error; } - -const struct xattr_handler ext2_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = ext2_xattr_list_acl_access, - .get = ext2_xattr_get_acl, - .set = ext2_xattr_set_acl, -}; - -const struct xattr_handler ext2_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = ext2_xattr_list_acl_default, - .get = ext2_xattr_get_acl, - .set = ext2_xattr_set_acl, -}; diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index 503bfb0..44937f9 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -55,7 +55,7 @@ static inline int ext2_acl_count(size_t size) /* acl.c */ extern struct posix_acl *ext2_get_acl(struct inode *inode, int type); -extern int ext2_acl_chmod (struct inode *); +extern int ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int ext2_init_acl (struct inode *, struct inode *); #else @@ -63,12 +63,6 @@ extern int ext2_init_acl (struct inode *, struct inode *); #define ext2_get_acl NULL #define ext2_set_acl NULL -static inline int -ext2_acl_chmod (struct inode *inode) -{ - return 0; -} - static inline int ext2_init_acl (struct inode *inode, struct inode *dir) { return 0; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index a5b3a5d..44c36e5 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -103,5 +103,6 @@ const struct inode_operations ext2_file_inode_operations = { #endif .setattr = ext2_setattr, .get_acl = ext2_get_acl, + .set_acl = ext2_set_acl, .fiemap = ext2_fiemap, }; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 8a33764..94ed3684 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1566,7 +1566,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr) } setattr_copy(inode, iattr); if (iattr->ia_valid & ATTR_MODE) - error = ext2_acl_chmod(inode); + error = posix_acl_chmod(inode, inode->i_mode); mark_inode_dirty(inode); return error; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 256dd5f..c268d0a 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -421,6 +421,7 @@ const struct inode_operations ext2_dir_inode_operations = { #endif .setattr = ext2_setattr, .get_acl = ext2_get_acl, + .set_acl = ext2_set_acl, .tmpfile = ext2_tmpfile, }; @@ -433,4 +434,5 @@ const struct inode_operations ext2_special_inode_operations = { #endif .setattr = ext2_setattr, .get_acl = ext2_get_acl, + .set_acl = ext2_set_acl, }; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 2885349..20d6697 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1493,6 +1493,7 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type, sb->s_blocksize - offset : towrite; tmp_bh.b_state = 0; + tmp_bh.b_size = sb->s_blocksize; err = ext2_get_block(inode, blk, &tmp_bh, 1); if (err < 0) goto out; diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 2d7557d..9142614 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -103,8 +103,8 @@ static struct mb_cache *ext2_xattr_cache; static const struct xattr_handler *ext2_xattr_handler_map[] = { [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler, #ifdef CONFIG_EXT2_FS_POSIX_ACL - [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext2_xattr_acl_access_handler, - [EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext2_xattr_acl_default_handler, + [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, + [EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, #endif [EXT2_XATTR_INDEX_TRUSTED] = &ext2_xattr_trusted_handler, #ifdef CONFIG_EXT2_FS_SECURITY @@ -116,8 +116,8 @@ const struct xattr_handler *ext2_xattr_handlers[] = { &ext2_xattr_user_handler, &ext2_xattr_trusted_handler, #ifdef CONFIG_EXT2_FS_POSIX_ACL - &ext2_xattr_acl_access_handler, - &ext2_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif #ifdef CONFIG_EXT2_FS_SECURITY &ext2_xattr_security_handler, diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h index 5e41ccc..60edf29 100644 --- a/fs/ext2/xattr.h +++ b/fs/ext2/xattr.h @@ -57,8 +57,6 @@ struct ext2_xattr_entry { extern const struct xattr_handler ext2_xattr_user_handler; extern const struct xattr_handler ext2_xattr_trusted_handler; -extern const struct xattr_handler ext2_xattr_acl_access_handler; -extern const struct xattr_handler ext2_xattr_acl_default_handler; extern const struct xattr_handler ext2_xattr_security_handler; extern ssize_t ext2_listxattr(struct dentry *, char *, size_t); diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index dbb5ad5..8bbaf5b 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -145,13 +145,6 @@ ext3_get_acl(struct inode *inode, int type) struct posix_acl *acl; int retval; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return NULL; - - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - switch (type) { case ACL_TYPE_ACCESS: name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -190,7 +183,7 @@ ext3_get_acl(struct inode *inode, int type) * inode->i_mutex: down unless called from ext3_new_inode */ static int -ext3_set_acl(handle_t *handle, struct inode *inode, int type, +__ext3_set_acl(handle_t *handle, struct inode *inode, int type, struct posix_acl *acl) { int name_index; @@ -198,9 +191,6 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, size_t size = 0; int error; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - switch(type) { case ACL_TYPE_ACCESS: name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -243,204 +233,49 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, return error; } -/* - * Initialize the ACLs of a new inode. Called from ext3_new_inode. - * - * dir->i_mutex: down - * inode->i_mutex: up (access to inode is still exclusive) - */ int -ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) +ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - struct posix_acl *acl = NULL; - int error = 0; - - if (!S_ISLNK(inode->i_mode)) { - if (test_opt(dir->i_sb, POSIX_ACL)) { - acl = ext3_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - if (!acl) - inode->i_mode &= ~current_umask(); - } - if (test_opt(inode->i_sb, POSIX_ACL) && acl) { - if (S_ISDIR(inode->i_mode)) { - error = ext3_set_acl(handle, inode, - ACL_TYPE_DEFAULT, acl); - if (error) - goto cleanup; - } - error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); - if (error < 0) - return error; - - if (error > 0) { - /* This is an extended ACL */ - error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); - } - } -cleanup: - posix_acl_release(acl); - return error; -} - -/* - * Does chmod for an inode that may have an Access Control List. The - * inode->i_mode field must be updated to the desired value by the caller - * before calling this function. - * Returns 0 on success, or a negative error number. - * - * We change the ACL rather than storing some ACL entries in the file - * mode permission bits (which would be more efficient), because that - * would break once additional permissions (like ACL_APPEND, ACL_DELETE - * for directories) are added. There are no more bits available in the - * file mode. - * - * inode->i_mutex: down - */ -int -ext3_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; handle_t *handle; - int retries = 0; - int error; + int error, retries = 0; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return 0; - acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (error) - return error; retry: - handle = ext3_journal_start(inode, - EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - ext3_std_error(inode->i_sb, error); - goto out; - } - error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); + handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + error = __ext3_set_acl(handle, inode, type, acl); ext3_journal_stop(handle); - if (error == -ENOSPC && - ext3_should_retry_alloc(inode->i_sb, &retries)) + if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) goto retry; -out: - posix_acl_release(acl); return error; } /* - * Extended attribute handlers + * Initialize the ACLs of a new inode. Called from ext3_new_inode. + * + * dir->i_mutex: down + * inode->i_mutex: up (access to inode is still exclusive) */ -static size_t -ext3_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len, - const char *name, size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return 0; - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_ACCESS, size); - return size; -} - -static size_t -ext3_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len, - const char *name, size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return 0; - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); - return size; -} - -static int -ext3_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, - size_t size, int type) +int +ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) { - struct posix_acl *acl; + struct posix_acl *default_acl, *acl; int error; - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return -EOPNOTSUPP; - - acl = ext3_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return error; -} - -static int -ext3_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags, int type) -{ - struct inode *inode = dentry->d_inode; - handle_t *handle; - struct posix_acl *acl; - int error, retries = 0; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return -EOPNOTSUPP; - if (!inode_owner_or_capable(inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - error = posix_acl_valid(acl); - if (error) - goto release_and_out; - } - } else - acl = NULL; - -retry: - handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - error = ext3_set_acl(handle, inode, type, acl); - ext3_journal_stop(handle); - if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) - goto retry; + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (error) + return error; -release_and_out: - posix_acl_release(acl); + if (default_acl) { + error = __ext3_set_acl(handle, inode, ACL_TYPE_DEFAULT, + default_acl); + posix_acl_release(default_acl); + } + if (acl) { + if (!error) + error = __ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, + acl); + posix_acl_release(acl); + } return error; } - -const struct xattr_handler ext3_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = ext3_xattr_list_acl_access, - .get = ext3_xattr_get_acl, - .set = ext3_xattr_set_acl, -}; - -const struct xattr_handler ext3_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = ext3_xattr_list_acl_default, - .get = ext3_xattr_get_acl, - .set = ext3_xattr_set_acl, -}; diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h index dbc921e..ea1c69e 100644 --- a/fs/ext3/acl.h +++ b/fs/ext3/acl.h @@ -55,18 +55,13 @@ static inline int ext3_acl_count(size_t size) /* acl.c */ extern struct posix_acl *ext3_get_acl(struct inode *inode, int type); -extern int ext3_acl_chmod (struct inode *); +extern int ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); #else /* CONFIG_EXT3_FS_POSIX_ACL */ #include <linux/sched.h> #define ext3_get_acl NULL - -static inline int -ext3_acl_chmod(struct inode *inode) -{ - return 0; -} +#define ext3_set_acl NULL static inline int ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index bafdd48..e66e480 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -309,43 +309,17 @@ struct fname { */ static void free_rb_tree_fname(struct rb_root *root) { - struct rb_node *n = root->rb_node; - struct rb_node *parent; - struct fname *fname; - - while (n) { - /* Do the node's children first */ - if (n->rb_left) { - n = n->rb_left; - continue; - } - if (n->rb_right) { - n = n->rb_right; - continue; - } - /* - * The node has no children; free it, and then zero - * out parent's link to it. Finally go to the - * beginning of the loop and try to free the parent - * node. - */ - parent = rb_parent(n); - fname = rb_entry(n, struct fname, rb_hash); - while (fname) { - struct fname * old = fname; + struct fname *fname, *next; + + rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash) + do { + struct fname *old = fname; fname = fname->next; - kfree (old); - } - if (!parent) - *root = RB_ROOT; - else if (parent->rb_left == n) - parent->rb_left = NULL; - else if (parent->rb_right == n) - parent->rb_right = NULL; - n = parent; - } -} + kfree(old); + } while (fname); + *root = RB_ROOT; +} static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp, loff_t pos) diff --git a/fs/ext3/file.c b/fs/ext3/file.c index 25cb413..aad0531 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -75,6 +75,7 @@ const struct inode_operations ext3_file_inode_operations = { .removexattr = generic_removexattr, #endif .get_acl = ext3_get_acl, + .set_acl = ext3_set_acl, .fiemap = ext3_fiemap, }; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 2bd8548..384b6eb 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -3365,7 +3365,7 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) mark_inode_dirty(inode); if (ia_valid & ATTR_MODE) - rc = ext3_acl_chmod(inode); + rc = posix_acl_chmod(inode, inode->i_mode); err_out: ext3_std_error(inode->i_sb, error); diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index f8cde46..f197736 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -2569,6 +2569,7 @@ const struct inode_operations ext3_dir_inode_operations = { .removexattr = generic_removexattr, #endif .get_acl = ext3_get_acl, + .set_acl = ext3_set_acl, }; const struct inode_operations ext3_special_inode_operations = { @@ -2580,4 +2581,5 @@ const struct inode_operations ext3_special_inode_operations = { .removexattr = generic_removexattr, #endif .get_acl = ext3_get_acl, + .set_acl = ext3_set_acl, }; diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index b1fc963..c6874be 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -102,8 +102,8 @@ static struct mb_cache *ext3_xattr_cache; static const struct xattr_handler *ext3_xattr_handler_map[] = { [EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler, #ifdef CONFIG_EXT3_FS_POSIX_ACL - [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3_xattr_acl_access_handler, - [EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext3_xattr_acl_default_handler, + [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, + [EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, #endif [EXT3_XATTR_INDEX_TRUSTED] = &ext3_xattr_trusted_handler, #ifdef CONFIG_EXT3_FS_SECURITY @@ -115,8 +115,8 @@ const struct xattr_handler *ext3_xattr_handlers[] = { &ext3_xattr_user_handler, &ext3_xattr_trusted_handler, #ifdef CONFIG_EXT3_FS_POSIX_ACL - &ext3_xattr_acl_access_handler, - &ext3_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif #ifdef CONFIG_EXT3_FS_SECURITY &ext3_xattr_security_handler, diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h index 2be4f69..32e93eb 100644 --- a/fs/ext3/xattr.h +++ b/fs/ext3/xattr.h @@ -60,8 +60,6 @@ struct ext3_xattr_entry { extern const struct xattr_handler ext3_xattr_user_handler; extern const struct xattr_handler ext3_xattr_trusted_handler; -extern const struct xattr_handler ext3_xattr_acl_access_handler; -extern const struct xattr_handler ext3_xattr_acl_default_handler; extern const struct xattr_handler ext3_xattr_security_handler; extern ssize_t ext3_listxattr(struct dentry *, char *, size_t); diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 39a54a0..d40c8db 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -152,13 +152,6 @@ ext4_get_acl(struct inode *inode, int type) struct posix_acl *acl; int retval; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return NULL; - - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - switch (type) { case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -196,7 +189,7 @@ ext4_get_acl(struct inode *inode, int type) * inode->i_mutex: down unless called from ext4_new_inode */ static int -ext4_set_acl(handle_t *handle, struct inode *inode, int type, +__ext4_set_acl(handle_t *handle, struct inode *inode, int type, struct posix_acl *acl) { int name_index; @@ -204,9 +197,6 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, size_t size = 0; int error; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - switch (type) { case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -248,208 +238,51 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, return error; } -/* - * Initialize the ACLs of a new inode. Called from ext4_new_inode. - * - * dir->i_mutex: down - * inode->i_mutex: up (access to inode is still exclusive) - */ int -ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) +ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - struct posix_acl *acl = NULL; - int error = 0; - - if (!S_ISLNK(inode->i_mode)) { - if (test_opt(dir->i_sb, POSIX_ACL)) { - acl = ext4_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - if (!acl) - inode->i_mode &= ~current_umask(); - } - if (test_opt(inode->i_sb, POSIX_ACL) && acl) { - if (S_ISDIR(inode->i_mode)) { - error = ext4_set_acl(handle, inode, - ACL_TYPE_DEFAULT, acl); - if (error) - goto cleanup; - } - error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); - if (error < 0) - return error; - - if (error > 0) { - /* This is an extended ACL */ - error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); - } - } -cleanup: - posix_acl_release(acl); - return error; -} - -/* - * Does chmod for an inode that may have an Access Control List. The - * inode->i_mode field must be updated to the desired value by the caller - * before calling this function. - * Returns 0 on success, or a negative error number. - * - * We change the ACL rather than storing some ACL entries in the file - * mode permission bits (which would be more efficient), because that - * would break once additional permissions (like ACL_APPEND, ACL_DELETE - * for directories) are added. There are no more bits available in the - * file mode. - * - * inode->i_mutex: down - */ -int -ext4_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; handle_t *handle; - int retries = 0; - int error; - + int error, retries = 0; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return 0; - acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (error) - return error; retry: handle = ext4_journal_start(inode, EXT4_HT_XATTR, ext4_jbd2_credits_xattr(inode)); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - ext4_std_error(inode->i_sb, error); - goto out; - } - error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + error = __ext4_set_acl(handle, inode, type, acl); ext4_journal_stop(handle); - if (error == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) + if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; -out: - posix_acl_release(acl); return error; } /* - * Extended attribute handlers + * Initialize the ACLs of a new inode. Called from ext4_new_inode. + * + * dir->i_mutex: down + * inode->i_mutex: up (access to inode is still exclusive) */ -static size_t -ext4_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len, - const char *name, size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return 0; - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_ACCESS, size); - return size; -} - -static size_t -ext4_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len, - const char *name, size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return 0; - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); - return size; -} - -static int -ext4_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, - size_t size, int type) +int +ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) { - struct posix_acl *acl; + struct posix_acl *default_acl, *acl; int error; - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return -EOPNOTSUPP; - - acl = ext4_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return error; -} - -static int -ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags, int type) -{ - struct inode *inode = dentry->d_inode; - handle_t *handle; - struct posix_acl *acl; - int error, retries = 0; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return -EOPNOTSUPP; - if (!inode_owner_or_capable(inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - error = posix_acl_valid(acl); - if (error) - goto release_and_out; - } - } else - acl = NULL; + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (error) + return error; -retry: - handle = ext4_journal_start(inode, EXT4_HT_XATTR, - ext4_jbd2_credits_xattr(inode)); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - goto release_and_out; + if (default_acl) { + error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT, + default_acl); + posix_acl_release(default_acl); + } + if (acl) { + if (!error) + error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, + acl); + posix_acl_release(acl); } - error = ext4_set_acl(handle, inode, type, acl); - ext4_journal_stop(handle); - if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - -release_and_out: - posix_acl_release(acl); return error; } - -const struct xattr_handler ext4_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = ext4_xattr_list_acl_access, - .get = ext4_xattr_get_acl, - .set = ext4_xattr_set_acl, -}; - -const struct xattr_handler ext4_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = ext4_xattr_list_acl_default, - .get = ext4_xattr_get_acl, - .set = ext4_xattr_set_acl, -}; diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 18cb39e..da2c795 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -55,18 +55,13 @@ static inline int ext4_acl_count(size_t size) /* acl.c */ struct posix_acl *ext4_get_acl(struct inode *inode, int type); -extern int ext4_acl_chmod(struct inode *); +int ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); #else /* CONFIG_EXT4_FS_POSIX_ACL */ #include <linux/sched.h> #define ext4_get_acl NULL - -static inline int -ext4_acl_chmod(struct inode *inode) -{ - return 0; -} +#define ext4_set_acl NULL static inline int ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 3f11656..41eb9dc 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -180,37 +180,12 @@ int ext4_setup_system_zone(struct super_block *sb) /* Called when the filesystem is unmounted */ void ext4_release_system_zone(struct super_block *sb) { - struct rb_node *n = EXT4_SB(sb)->system_blks.rb_node; - struct rb_node *parent; - struct ext4_system_zone *entry; + struct ext4_system_zone *entry, *n; - while (n) { - /* Do the node's children first */ - if (n->rb_left) { - n = n->rb_left; - continue; - } - if (n->rb_right) { - n = n->rb_right; - continue; - } - /* - * The node has no children; free it, and then zero - * out parent's link to it. Finally go to the - * beginning of the loop and try to free the parent - * node. - */ - parent = rb_parent(n); - entry = rb_entry(n, struct ext4_system_zone, node); + rbtree_postorder_for_each_entry_safe(entry, n, + &EXT4_SB(sb)->system_blks, node) kmem_cache_free(ext4_system_zone_cachep, entry); - if (!parent) - EXT4_SB(sb)->system_blks = RB_ROOT; - else if (parent->rb_left == n) - parent->rb_left = NULL; - else if (parent->rb_right == n) - parent->rb_right = NULL; - n = parent; - } + EXT4_SB(sb)->system_blks = RB_ROOT; } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 680bb33..d638c57 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -353,41 +353,16 @@ struct fname { */ static void free_rb_tree_fname(struct rb_root *root) { - struct rb_node *n = root->rb_node; - struct rb_node *parent; - struct fname *fname; - - while (n) { - /* Do the node's children first */ - if (n->rb_left) { - n = n->rb_left; - continue; - } - if (n->rb_right) { - n = n->rb_right; - continue; - } - /* - * The node has no children; free it, and then zero - * out parent's link to it. Finally go to the - * beginning of the loop and try to free the parent - * node. - */ - parent = rb_parent(n); - fname = rb_entry(n, struct fname, rb_hash); + struct fname *fname, *next; + + rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash) while (fname) { struct fname *old = fname; fname = fname->next; kfree(old); } - if (!parent) - *root = RB_ROOT; - else if (parent->rb_left == n) - parent->rb_left = NULL; - else if (parent->rb_right == n) - parent->rb_right = NULL; - n = parent; - } + + *root = RB_ROOT; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e618503..ece5556 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -268,6 +268,16 @@ struct ext4_io_submit { /* Translate # of blks to # of clusters */ #define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \ (sbi)->s_cluster_bits) +/* Mask out the low bits to get the starting block of the cluster */ +#define EXT4_PBLK_CMASK(s, pblk) ((pblk) & \ + ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \ + ~((ext4_lblk_t) (s)->s_cluster_ratio - 1)) +/* Get the cluster offset */ +#define EXT4_PBLK_COFF(s, pblk) ((pblk) & \ + ((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_COFF(s, lblk) ((lblk) & \ + ((ext4_lblk_t) (s)->s_cluster_ratio - 1)) /* * Structure of a blocks group descriptor diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 17ac112..3fe29de 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -259,6 +259,15 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, if (WARN_ON_ONCE(err)) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); + ext4_error_inode(inode, where, line, + bh->b_blocknr, + "journal_dirty_metadata failed: " + "handle type %u started at line %u, " + "credits %u/%u, errcode %d", + handle->h_type, + handle->h_line_no, + handle->h_requested_credits, + handle->h_buffer_credits, err); } } else { if (inode) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 35f65cf..10cff47 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -360,8 +360,10 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) { ext4_fsblk_t block = ext4_ext_pblock(ext); int len = ext4_ext_get_actual_len(ext); + ext4_lblk_t lblock = le32_to_cpu(ext->ee_block); + ext4_lblk_t last = lblock + len - 1; - if (len == 0) + if (lblock > last) return 0; return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); } @@ -387,11 +389,26 @@ static int ext4_valid_extent_entries(struct inode *inode, if (depth == 0) { /* leaf entries */ struct ext4_extent *ext = EXT_FIRST_EXTENT(eh); + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + ext4_fsblk_t pblock = 0; + ext4_lblk_t lblock = 0; + ext4_lblk_t prev = 0; + int len = 0; while (entries) { if (!ext4_valid_extent(inode, ext)) return 0; + + /* Check for overlapping extents */ + lblock = le32_to_cpu(ext->ee_block); + len = ext4_ext_get_actual_len(ext); + if ((lblock <= prev) && prev) { + pblock = ext4_ext_pblock(ext); + es->s_last_error_block = cpu_to_le64(pblock); + return 0; + } ext++; entries--; + prev = lblock + len - 1; } } else { struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh); @@ -1834,8 +1851,7 @@ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi, depth = ext_depth(inode); if (!path[depth].p_ext) goto out; - b2 = le32_to_cpu(path[depth].p_ext->ee_block); - b2 &= ~(sbi->s_cluster_ratio - 1); + b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block)); /* * get the next allocated block if the extent in the path @@ -1845,7 +1861,7 @@ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi, b2 = ext4_ext_next_allocated_block(path); if (b2 == EXT_MAX_BLOCKS) goto out; - b2 &= ~(sbi->s_cluster_ratio - 1); + b2 = EXT4_LBLK_CMASK(sbi, b2); } /* check for wrap through zero on extent logical start block*/ @@ -2504,7 +2520,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, * extent, we have to mark the cluster as used (store negative * cluster number in partial_cluster). */ - unaligned = pblk & (sbi->s_cluster_ratio - 1); + unaligned = EXT4_PBLK_COFF(sbi, pblk); if (unaligned && (ee_len == num) && (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk)))) *partial_cluster = EXT4_B2C(sbi, pblk); @@ -2598,7 +2614,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, * accidentally freeing it later on */ pblk = ext4_ext_pblock(ex); - if (pblk & (sbi->s_cluster_ratio - 1)) + if (EXT4_PBLK_COFF(sbi, pblk)) *partial_cluster = -((long long)EXT4_B2C(sbi, pblk)); ex--; @@ -3461,7 +3477,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, WARN_ON(map->m_lblk < ee_block); /* * It is safe to convert extent to initialized via explicit - * zeroout only if extent is fully insde i_size or new_size. + * zeroout only if extent is fully inside i_size or new_size. */ split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; @@ -3753,7 +3769,7 @@ int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_lblk_t lblk_start, lblk_end; - lblk_start = lblk & (~(sbi->s_cluster_ratio - 1)); + lblk_start = EXT4_LBLK_CMASK(sbi, lblk); lblk_end = lblk_start + sbi->s_cluster_ratio - 1; return ext4_find_delalloc_range(inode, lblk_start, lblk_end); @@ -3812,9 +3828,9 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks); /* Check towards left side */ - c_offset = lblk_start & (sbi->s_cluster_ratio - 1); + c_offset = EXT4_LBLK_COFF(sbi, lblk_start); if (c_offset) { - lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1)); + lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start); lblk_to = lblk_from + c_offset - 1; if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) @@ -3822,7 +3838,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, } /* Now check towards right. */ - c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1); + c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks); if (allocated_clusters && c_offset) { lblk_from = lblk_start + num_blks; lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; @@ -4030,7 +4046,7 @@ static int get_implied_cluster_alloc(struct super_block *sb, struct ext4_ext_path *path) { struct ext4_sb_info *sbi = EXT4_SB(sb); - ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1); + ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); ext4_lblk_t ex_cluster_start, ex_cluster_end; ext4_lblk_t rr_cluster_start; ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); @@ -4048,8 +4064,7 @@ static int get_implied_cluster_alloc(struct super_block *sb, (rr_cluster_start == ex_cluster_start)) { if (rr_cluster_start == ex_cluster_end) ee_start += ee_len - 1; - map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) + - c_offset; + map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset; map->m_len = min(map->m_len, (unsigned) sbi->s_cluster_ratio - c_offset); /* @@ -4203,7 +4218,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, */ map->m_flags &= ~EXT4_MAP_FROM_CLUSTER; newex.ee_block = cpu_to_le32(map->m_lblk); - cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1); + cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); /* * If we are doing bigalloc, check to see if the extent returned @@ -4271,7 +4286,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * needed so that future calls to get_implied_cluster_alloc() * work correctly. */ - offset = map->m_lblk & (sbi->s_cluster_ratio - 1); + offset = EXT4_LBLK_COFF(sbi, map->m_lblk); ar.len = EXT4_NUM_B2C(sbi, offset+allocated); ar.goal -= offset; ar.logical -= offset; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 3da2194..43e64f6 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -617,6 +617,7 @@ const struct inode_operations ext4_file_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, .get_acl = ext4_get_acl, + .set_acl = ext4_set_acl, .fiemap = ext4_fiemap, }; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index bae9875..82edf5b 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -849,15 +849,16 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, handle_t *handle; struct page *page; struct ext4_iloc iloc; + int retries; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; +retry_journal: handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - handle = NULL; goto out; } @@ -867,7 +868,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, if (inline_size >= pos + len) { ret = ext4_prepare_inline_data(handle, inode, pos + len); if (ret && ret != -ENOSPC) - goto out; + goto out_journal; } if (ret == -ENOSPC) { @@ -875,6 +876,10 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, inode, flags, fsdata); + ext4_journal_stop(handle); + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry_journal; goto out; } @@ -887,7 +892,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, page = grab_cache_page_write_begin(mapping, 0, flags); if (!page) { ret = -ENOMEM; - goto out; + goto out_journal; } down_read(&EXT4_I(inode)->xattr_sem); @@ -904,16 +909,15 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, up_read(&EXT4_I(inode)->xattr_sem); *pagep = page; - handle = NULL; brelse(iloc.bh); return 1; out_release_page: up_read(&EXT4_I(inode)->xattr_sem); unlock_page(page); page_cache_release(page); +out_journal: + ext4_journal_stop(handle); out: - if (handle) - ext4_journal_stop(handle); brelse(iloc.bh); return ret; } @@ -1837,7 +1841,6 @@ int ext4_try_to_evict_inline_data(handle_t *handle, { int error; struct ext4_xattr_entry *entry; - struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; struct ext4_iloc iloc; @@ -1846,7 +1849,6 @@ int ext4_try_to_evict_inline_data(handle_t *handle, return error; raw_inode = ext4_raw_inode(&iloc); - header = IHDR(inode, raw_inode); entry = (struct ext4_xattr_entry *)((void *)raw_inode + EXT4_I(inode)->i_inline_off); if (EXT4_XATTR_LEN(entry->e_name_len) + @@ -1924,9 +1926,11 @@ void ext4_inline_data_truncate(struct inode *inode, int *has_inline) } /* Clear the content within i_blocks. */ - if (i_size < EXT4_MIN_INLINE_DATA_SIZE) - memset(ext4_raw_inode(&is.iloc)->i_block + i_size, 0, - EXT4_MIN_INLINE_DATA_SIZE - i_size); + if (i_size < EXT4_MIN_INLINE_DATA_SIZE) { + void *p = (void *) ext4_raw_inode(&is.iloc)->i_block; + memset(p + i_size, 0, + EXT4_MIN_INLINE_DATA_SIZE - i_size); + } EXT4_I(inode)->i_inline_size = i_size < EXT4_MIN_INLINE_DATA_SIZE ? diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0757634..6e39895 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -144,8 +144,8 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, */ static int ext4_inode_is_fast_symlink(struct inode *inode) { - int ea_blocks = EXT4_I(inode)->i_file_acl ? - (inode->i_sb->s_blocksize >> 9) : 0; + int ea_blocks = EXT4_I(inode)->i_file_acl ? + EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); } @@ -1206,7 +1206,6 @@ static int ext4_journalled_write_end(struct file *file, */ static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock) { - int retries = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); unsigned int md_needed; @@ -1218,7 +1217,6 @@ static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock) * in order to allocate nrblocks * worse case is one extent per block */ -repeat: spin_lock(&ei->i_block_reservation_lock); /* * ext4_calc_metadata_amount() has side effects, which we have @@ -1238,10 +1236,6 @@ repeat: ei->i_da_metadata_calc_len = save_len; ei->i_da_metadata_calc_last_lblock = save_last_lblock; spin_unlock(&ei->i_block_reservation_lock); - if (ext4_should_retry_alloc(inode->i_sb, &retries)) { - cond_resched(); - goto repeat; - } return -ENOSPC; } ei->i_reserved_meta_blocks += md_needed; @@ -1255,7 +1249,6 @@ repeat: */ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) { - int retries = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); unsigned int md_needed; @@ -1277,7 +1270,6 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) * in order to allocate nrblocks * worse case is one extent per block */ -repeat: spin_lock(&ei->i_block_reservation_lock); /* * ext4_calc_metadata_amount() has side effects, which we have @@ -1297,10 +1289,6 @@ repeat: ei->i_da_metadata_calc_len = save_len; ei->i_da_metadata_calc_last_lblock = save_last_lblock; spin_unlock(&ei->i_block_reservation_lock); - if (ext4_should_retry_alloc(inode->i_sb, &retries)) { - cond_resched(); - goto repeat; - } dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); return -ENOSPC; } @@ -1784,7 +1772,7 @@ static int __ext4_journalled_writepage(struct page *page, ret = err; if (!ext4_has_inline_data(inode)) - ext4_walk_page_buffers(handle, page_bufs, 0, len, + ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL, bput_one); ext4_set_inode_state(inode, EXT4_STATE_JDATA); out: @@ -3513,11 +3501,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - if (EXT4_SB(sb)->s_cluster_ratio > 1) { - /* TODO: Add support for bigalloc file systems */ - return -EOPNOTSUPP; - } - trace_ext4_punch_hole(inode, offset, length); /* @@ -4598,6 +4581,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_size > sbi->s_bitmap_maxbytes) return -EFBIG; } + + if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size) + inode_inc_iversion(inode); + if (S_ISREG(inode->i_mode) && (attr->ia_size < inode->i_size)) { if (ext4_should_order_data(inode)) { @@ -4675,7 +4662,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) ext4_orphan_del(NULL, inode); if (!rc && (ia_valid & ATTR_MODE)) - rc = ext4_acl_chmod(inode); + rc = posix_acl_chmod(inode, inode->i_mode); err_out: ext4_std_error(inode->i_sb, error); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 60589b6..6bea806 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -101,9 +101,8 @@ static long swap_inode_boot_loader(struct super_block *sb, handle_t *handle; int err; struct inode *inode_bl; - struct ext4_inode_info *ei; struct ext4_inode_info *ei_bl; - struct ext4_sb_info *sbi; + struct ext4_sb_info *sbi = EXT4_SB(sb); if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) { err = -EINVAL; @@ -115,9 +114,6 @@ static long swap_inode_boot_loader(struct super_block *sb, goto swap_boot_out; } - sbi = EXT4_SB(sb); - ei = EXT4_I(inode); - inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); if (IS_ERR(inode_bl)) { err = PTR_ERR(inode_bl); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 4d113ef..04a5c75 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3442,6 +3442,9 @@ static void ext4_mb_pa_callback(struct rcu_head *head) { struct ext4_prealloc_space *pa; pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); + + BUG_ON(atomic_read(&pa->pa_count)); + BUG_ON(pa->pa_deleted == 0); kmem_cache_free(ext4_pspace_cachep, pa); } @@ -3455,11 +3458,13 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, ext4_group_t grp; ext4_fsblk_t grp_blk; - if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) - return; - /* in this short window concurrent discard can set pa_deleted */ spin_lock(&pa->pa_lock); + if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) { + spin_unlock(&pa->pa_lock); + return; + } + if (pa->pa_deleted == 1) { spin_unlock(&pa->pa_lock); return; @@ -4121,7 +4126,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, ext4_get_group_no_and_offset(sb, goal, &group, &block); /* set up allocation goals */ - ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1); + ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical); ac->ac_status = AC_STATUS_CONTINUE; ac->ac_sb = sb; ac->ac_inode = ar->inode; @@ -4663,7 +4668,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, * blocks at the beginning or the end unless we are explicitly * requested to avoid doing so. */ - overflow = block & (sbi->s_cluster_ratio - 1); + overflow = EXT4_PBLK_COFF(sbi, block); if (overflow) { if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) { overflow = sbi->s_cluster_ratio - overflow; @@ -4677,7 +4682,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, count += overflow; } } - overflow = count & (sbi->s_cluster_ratio - 1); + overflow = EXT4_LBLK_COFF(sbi, count); if (overflow) { if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) { if (count > overflow) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 5a0408d..d050e04 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1425,9 +1425,8 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi return ERR_PTR(-EIO); } if (unlikely(ino == dir->i_ino)) { - EXT4_ERROR_INODE(dir, "'%.*s' linked to parent dir", - dentry->d_name.len, - dentry->d_name.name); + EXT4_ERROR_INODE(dir, "'%pd' linked to parent dir", + dentry); return ERR_PTR(-EIO); } inode = ext4_iget(dir->i_sb, ino); @@ -3225,6 +3224,7 @@ const struct inode_operations ext4_dir_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, .get_acl = ext4_get_acl, + .set_acl = ext4_set_acl, .fiemap = ext4_fiemap, }; @@ -3235,4 +3235,5 @@ const struct inode_operations ext4_special_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, .get_acl = ext4_get_acl, + .set_acl = ext4_set_acl, }; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index d488f80..ab95508 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -65,9 +65,9 @@ static void ext4_finish_bio(struct bio *bio) { int i; int error = !test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec; - for (i = 0; i < bio->bi_vcnt; i++) { - struct bio_vec *bvec = &bio->bi_io_vec[i]; + bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; struct buffer_head *bh, *head; unsigned bio_start = bvec->bv_offset; @@ -298,7 +298,7 @@ ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) static void ext4_end_bio(struct bio *bio, int error) { ext4_io_end_t *io_end = bio->bi_private; - sector_t bi_sector = bio->bi_sector; + sector_t bi_sector = bio->bi_iter.bi_sector; BUG_ON(!io_end); bio->bi_end_io = NULL; @@ -366,7 +366,7 @@ static int io_submit_init_bio(struct ext4_io_submit *io, bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); if (!bio) return -ENOMEM; - bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; bio->bi_end_io = ext4_end_bio; bio->bi_private = ext4_get_io_end(io->io_end); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c977f4e..1f7784d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -792,7 +792,7 @@ static void ext4_put_super(struct super_block *sb) } ext4_es_unregister_shrinker(sbi); - del_timer(&sbi->s_err_report); + del_timer_sync(&sbi->s_err_report); ext4_release_system_zone(sb); ext4_mb_release(sb); ext4_ext_release(sb); @@ -3316,11 +3316,19 @@ int ext4_calculate_overhead(struct super_block *sb) } -static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi) +static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb) { ext4_fsblk_t resv_clusters; /* + * There's no need to reserve anything when we aren't using extents. + * The space estimates are exact, there are no unwritten extents, + * hole punching doesn't need new metadata... This is needed especially + * to keep ext2/3 backward compatibility. + */ + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) + return 0; + /* * By default we reserve 2% or 4096 clusters, whichever is smaller. * This should cover the situations where we can not afford to run * out of space like for example punch hole, or converting @@ -3328,7 +3336,8 @@ static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi) * allocation would require 1, or 2 blocks, higher numbers are * very rare. */ - resv_clusters = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits; + resv_clusters = ext4_blocks_count(EXT4_SB(sb)->s_es) >> + EXT4_SB(sb)->s_cluster_bits; do_div(resv_clusters, 50); resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096); @@ -4071,10 +4080,10 @@ no_journal: "available"); } - err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sbi)); + err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sb)); if (err) { ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for " - "reserved pool", ext4_calculate_resv_clusters(sbi)); + "reserved pool", ext4_calculate_resv_clusters(sb)); goto failed_mount4a; } @@ -4184,7 +4193,7 @@ failed_mount_wq: } failed_mount3: ext4_es_unregister_shrinker(sbi); - del_timer(&sbi->s_err_report); + del_timer_sync(&sbi->s_err_report); if (sbi->s_flex_groups) ext4_kvfree(sbi->s_flex_groups); percpu_counter_destroy(&sbi->s_freeclusters_counter); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 1423c48..e175e94 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -95,8 +95,8 @@ static struct mb_cache *ext4_xattr_cache; static const struct xattr_handler *ext4_xattr_handler_map[] = { [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, #ifdef CONFIG_EXT4_FS_POSIX_ACL - [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler, - [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler, + [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, + [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, #endif [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler, #ifdef CONFIG_EXT4_FS_SECURITY @@ -108,8 +108,8 @@ const struct xattr_handler *ext4_xattr_handlers[] = { &ext4_xattr_user_handler, &ext4_xattr_trusted_handler, #ifdef CONFIG_EXT4_FS_POSIX_ACL - &ext4_xattr_acl_access_handler, - &ext4_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif #ifdef CONFIG_EXT4_FS_SECURITY &ext4_xattr_security_handler, diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index c767dbd..819d639 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -96,8 +96,6 @@ struct ext4_xattr_ibody_find { extern const struct xattr_handler ext4_xattr_user_handler; extern const struct xattr_handler ext4_xattr_trusted_handler; -extern const struct xattr_handler ext4_xattr_acl_access_handler; -extern const struct xattr_handler ext4_xattr_acl_default_handler; extern const struct xattr_handler ext4_xattr_security_handler; extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index 27a0820..2e35da1 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -1,6 +1,6 @@ obj-$(CONFIG_F2FS_FS) += f2fs.o -f2fs-y := dir.o file.o inode.o namei.o hash.o super.o +f2fs-y := dir.o file.o inode.o namei.o hash.o super.o inline.o f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index d0fc287..fa8da4c 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -17,9 +17,6 @@ #include "xattr.h" #include "acl.h" -#define get_inode_mode(i) ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \ - (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) - static inline size_t f2fs_acl_size(int count) { if (count <= 4) { @@ -167,19 +164,11 @@ fail: struct posix_acl *f2fs_get_acl(struct inode *inode, int type) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); int name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT; void *value = NULL; struct posix_acl *acl; int retval; - if (!test_opt(sbi, POSIX_ACL)) - return NULL; - - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - if (type == ACL_TYPE_ACCESS) name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -205,21 +194,15 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type) return acl; } -static int f2fs_set_acl(struct inode *inode, int type, +static int __f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl, struct page *ipage) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct f2fs_inode_info *fi = F2FS_I(inode); int name_index; void *value = NULL; size_t size = 0; int error; - if (!test_opt(sbi, POSIX_ACL)) - return 0; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - switch (type) { case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -261,154 +244,31 @@ static int f2fs_set_acl(struct inode *inode, int type, return error; } -int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage) +int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); - struct posix_acl *acl = NULL; - int error = 0; - - if (!S_ISLNK(inode->i_mode)) { - if (test_opt(sbi, POSIX_ACL)) { - acl = f2fs_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - if (!acl) - inode->i_mode &= ~current_umask(); - } - - if (!test_opt(sbi, POSIX_ACL) || !acl) - goto cleanup; - - if (S_ISDIR(inode->i_mode)) { - error = f2fs_set_acl(inode, ACL_TYPE_DEFAULT, acl, ipage); - if (error) - goto cleanup; - } - error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); - if (error < 0) - return error; - if (error > 0) - error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl, ipage); -cleanup: - posix_acl_release(acl); - return error; + return __f2fs_set_acl(inode, type, acl, NULL); } -int f2fs_acl_chmod(struct inode *inode) +int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct posix_acl *acl; - int error; - umode_t mode = get_inode_mode(inode); - - if (!test_opt(sbi, POSIX_ACL)) - return 0; - if (S_ISLNK(mode)) - return -EOPNOTSUPP; - - acl = f2fs_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); + struct posix_acl *default_acl, *acl; + int error = 0; - error = posix_acl_chmod(&acl, GFP_KERNEL, mode); + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); if (error) return error; - error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl, NULL); - posix_acl_release(acl); - return error; -} - -static size_t f2fs_xattr_list_acl(struct dentry *dentry, char *list, - size_t list_size, const char *name, size_t name_len, int type) -{ - struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); - const char *xname = POSIX_ACL_XATTR_DEFAULT; - size_t size; - - if (!test_opt(sbi, POSIX_ACL)) - return 0; - - if (type == ACL_TYPE_ACCESS) - xname = POSIX_ACL_XATTR_ACCESS; - - size = strlen(xname) + 1; - if (list && size <= list_size) - memcpy(list, xname, size); - return size; -} - -static int f2fs_xattr_get_acl(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) -{ - struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); - struct posix_acl *acl; - int error; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(sbi, POSIX_ACL)) - return -EOPNOTSUPP; - - acl = f2fs_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (!acl) - return -ENODATA; - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return error; -} - -static int f2fs_xattr_set_acl(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); - struct inode *inode = dentry->d_inode; - struct posix_acl *acl = NULL; - int error; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(sbi, POSIX_ACL)) - return -EOPNOTSUPP; - if (!inode_owner_or_capable(inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - error = posix_acl_valid(acl); - if (error) - goto release_and_out; - } - } else { - acl = NULL; + if (default_acl) { + error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl, + ipage); + posix_acl_release(default_acl); + } + if (acl) { + if (error) + error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl, + ipage); + posix_acl_release(acl); } - error = f2fs_set_acl(inode, type, acl, NULL); - -release_and_out: - posix_acl_release(acl); return error; } - -const struct xattr_handler f2fs_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = f2fs_xattr_list_acl, - .get = f2fs_xattr_get_acl, - .set = f2fs_xattr_set_acl, -}; - -const struct xattr_handler f2fs_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = f2fs_xattr_list_acl, - .get = f2fs_xattr_get_acl, - .set = f2fs_xattr_set_acl, -}; diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index 4963313..e086465 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -37,18 +37,13 @@ struct f2fs_acl_header { #ifdef CONFIG_F2FS_FS_POSIX_ACL extern struct posix_acl *f2fs_get_acl(struct inode *, int); -extern int f2fs_acl_chmod(struct inode *); +extern int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int f2fs_init_acl(struct inode *, struct inode *, struct page *); #else #define f2fs_check_acl NULL #define f2fs_get_acl NULL #define f2fs_set_acl NULL -static inline int f2fs_acl_chmod(struct inode *inode) -{ - return 0; -} - static inline int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *page) { diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 5716e5e..293d048 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -30,7 +30,7 @@ static struct kmem_cache *inode_entry_slab; */ struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) { - struct address_space *mapping = sbi->meta_inode->i_mapping; + struct address_space *mapping = META_MAPPING(sbi); struct page *page = NULL; repeat: page = grab_cache_page(mapping, index); @@ -50,7 +50,7 @@ repeat: */ struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) { - struct address_space *mapping = sbi->meta_inode->i_mapping; + struct address_space *mapping = META_MAPPING(sbi); struct page *page; repeat: page = grab_cache_page(mapping, index); @@ -61,11 +61,12 @@ repeat: if (PageUptodate(page)) goto out; - if (f2fs_readpage(sbi, page, index, READ_SYNC)) + if (f2fs_submit_page_bio(sbi, page, index, + READ_SYNC | REQ_META | REQ_PRIO)) goto repeat; lock_page(page); - if (page->mapping != mapping) { + if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } @@ -81,13 +82,12 @@ static int f2fs_write_meta_page(struct page *page, struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); /* Should not write any meta pages, if any IO error was occurred */ - if (wbc->for_reclaim || sbi->por_doing || - is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)) { - dec_page_count(sbi, F2FS_DIRTY_META); - wbc->pages_skipped++; - set_page_dirty(page); - return AOP_WRITEPAGE_ACTIVATE; - } + if (unlikely(sbi->por_doing || + is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG))) + goto redirty_out; + + if (wbc->for_reclaim) + goto redirty_out; wait_on_page_writeback(page); @@ -95,24 +95,31 @@ static int f2fs_write_meta_page(struct page *page, dec_page_count(sbi, F2FS_DIRTY_META); unlock_page(page); return 0; + +redirty_out: + dec_page_count(sbi, F2FS_DIRTY_META); + wbc->pages_skipped++; + set_page_dirty(page); + return AOP_WRITEPAGE_ACTIVATE; } static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); - struct block_device *bdev = sbi->sb->s_bdev; + int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); long written; if (wbc->for_kupdate) return 0; - if (get_pages(sbi, F2FS_DIRTY_META) == 0) + /* collect a number of dirty meta pages and write together */ + if (get_pages(sbi, F2FS_DIRTY_META) < nrpages) return 0; /* if mounting is failed, skip writing node pages */ mutex_lock(&sbi->cp_mutex); - written = sync_meta_pages(sbi, META, bio_get_nr_vecs(bdev)); + written = sync_meta_pages(sbi, META, nrpages); mutex_unlock(&sbi->cp_mutex); wbc->nr_to_write -= written; return 0; @@ -121,7 +128,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping, long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, long nr_to_write) { - struct address_space *mapping = sbi->meta_inode->i_mapping; + struct address_space *mapping = META_MAPPING(sbi); pgoff_t index = 0, end = LONG_MAX; struct pagevec pvec; long nwritten = 0; @@ -136,7 +143,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) + if (unlikely(nr_pages == 0)) break; for (i = 0; i < nr_pages; i++) { @@ -149,7 +156,8 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, unlock_page(page); break; } - if (nwritten++ >= nr_to_write) + nwritten++; + if (unlikely(nwritten >= nr_to_write)) break; } pagevec_release(&pvec); @@ -157,7 +165,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, } if (nwritten) - f2fs_submit_bio(sbi, type, nr_to_write == LONG_MAX); + f2fs_submit_merged_bio(sbi, type, WRITE); return nwritten; } @@ -186,31 +194,24 @@ const struct address_space_operations f2fs_meta_aops = { int acquire_orphan_inode(struct f2fs_sb_info *sbi) { - unsigned int max_orphans; int err = 0; - /* - * considering 512 blocks in a segment 5 blocks are needed for cp - * and log segment summaries. Remaining blocks are used to keep - * orphan entries with the limitation one reserved segment - * for cp pack we can have max 1020*507 orphan entries - */ - max_orphans = (sbi->blocks_per_seg - 5) * F2FS_ORPHANS_PER_BLOCK; - mutex_lock(&sbi->orphan_inode_mutex); - if (sbi->n_orphans >= max_orphans) + spin_lock(&sbi->orphan_inode_lock); + if (unlikely(sbi->n_orphans >= sbi->max_orphans)) err = -ENOSPC; else sbi->n_orphans++; - mutex_unlock(&sbi->orphan_inode_mutex); + spin_unlock(&sbi->orphan_inode_lock); + return err; } void release_orphan_inode(struct f2fs_sb_info *sbi) { - mutex_lock(&sbi->orphan_inode_mutex); + spin_lock(&sbi->orphan_inode_lock); f2fs_bug_on(sbi->n_orphans == 0); sbi->n_orphans--; - mutex_unlock(&sbi->orphan_inode_mutex); + spin_unlock(&sbi->orphan_inode_lock); } void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) @@ -218,27 +219,30 @@ void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) struct list_head *head, *this; struct orphan_inode_entry *new = NULL, *orphan = NULL; - mutex_lock(&sbi->orphan_inode_mutex); + new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC); + new->ino = ino; + + spin_lock(&sbi->orphan_inode_lock); head = &sbi->orphan_inode_list; list_for_each(this, head) { orphan = list_entry(this, struct orphan_inode_entry, list); - if (orphan->ino == ino) - goto out; + if (orphan->ino == ino) { + spin_unlock(&sbi->orphan_inode_lock); + kmem_cache_free(orphan_entry_slab, new); + return; + } + if (orphan->ino > ino) break; orphan = NULL; } - new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC); - new->ino = ino; - /* add new_oentry into list which is sorted by inode number */ if (orphan) list_add(&new->list, this->prev); else list_add_tail(&new->list, head); -out: - mutex_unlock(&sbi->orphan_inode_mutex); + spin_unlock(&sbi->orphan_inode_lock); } void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) @@ -246,7 +250,7 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) struct list_head *head; struct orphan_inode_entry *orphan; - mutex_lock(&sbi->orphan_inode_mutex); + spin_lock(&sbi->orphan_inode_lock); head = &sbi->orphan_inode_list; list_for_each_entry(orphan, head, list) { if (orphan->ino == ino) { @@ -257,7 +261,7 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) break; } } - mutex_unlock(&sbi->orphan_inode_mutex); + spin_unlock(&sbi->orphan_inode_lock); } static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) @@ -270,12 +274,12 @@ static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) iput(inode); } -int recover_orphan_inodes(struct f2fs_sb_info *sbi) +void recover_orphan_inodes(struct f2fs_sb_info *sbi) { block_t start_blk, orphan_blkaddr, i, j; if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG)) - return 0; + return; sbi->por_doing = true; start_blk = __start_cp_addr(sbi) + 1; @@ -295,29 +299,39 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) /* clear Orphan Flag */ clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG); sbi->por_doing = false; - return 0; + return; } static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) { - struct list_head *head, *this, *next; + struct list_head *head; struct f2fs_orphan_block *orphan_blk = NULL; - struct page *page = NULL; unsigned int nentries = 0; - unsigned short index = 1; - unsigned short orphan_blocks; - - orphan_blocks = (unsigned short)((sbi->n_orphans + + unsigned short index; + unsigned short orphan_blocks = (unsigned short)((sbi->n_orphans + (F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK); + struct page *page = NULL; + struct orphan_inode_entry *orphan = NULL; + + for (index = 0; index < orphan_blocks; index++) + grab_meta_page(sbi, start_blk + index); - mutex_lock(&sbi->orphan_inode_mutex); + index = 1; + spin_lock(&sbi->orphan_inode_lock); head = &sbi->orphan_inode_list; /* loop for each orphan inode entry and write them in Jornal block */ - list_for_each_safe(this, next, head) { - struct orphan_inode_entry *orphan; + list_for_each_entry(orphan, head, list) { + if (!page) { + page = find_get_page(META_MAPPING(sbi), start_blk++); + f2fs_bug_on(!page); + orphan_blk = + (struct f2fs_orphan_block *)page_address(page); + memset(orphan_blk, 0, sizeof(*orphan_blk)); + f2fs_put_page(page, 0); + } - orphan = list_entry(this, struct orphan_inode_entry, list); + orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino); if (nentries == F2FS_ORPHANS_PER_BLOCK) { /* @@ -331,29 +345,20 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) set_page_dirty(page); f2fs_put_page(page, 1); index++; - start_blk++; nentries = 0; page = NULL; } - if (page) - goto page_exist; + } - page = grab_meta_page(sbi, start_blk); - orphan_blk = (struct f2fs_orphan_block *)page_address(page); - memset(orphan_blk, 0, sizeof(*orphan_blk)); -page_exist: - orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino); + if (page) { + orphan_blk->blk_addr = cpu_to_le16(index); + orphan_blk->blk_count = cpu_to_le16(orphan_blocks); + orphan_blk->entry_count = cpu_to_le32(nentries); + set_page_dirty(page); + f2fs_put_page(page, 1); } - if (!page) - goto end; - orphan_blk->blk_addr = cpu_to_le16(index); - orphan_blk->blk_count = cpu_to_le16(orphan_blocks); - orphan_blk->entry_count = cpu_to_le32(nentries); - set_page_dirty(page); - f2fs_put_page(page, 1); -end: - mutex_unlock(&sbi->orphan_inode_mutex); + spin_unlock(&sbi->orphan_inode_lock); } static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, @@ -428,7 +433,8 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version); /* The second checkpoint pack should start at the next segment */ - cp_start_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg); + cp_start_blk_no += ((unsigned long long)1) << + le32_to_cpu(fsb->log_blocks_per_seg); cp2 = validate_checkpoint(sbi, cp_start_blk_no, &cp2_version); if (cp1 && cp2) { @@ -465,7 +471,7 @@ static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new) list_for_each(this, head) { struct dir_inode_entry *entry; entry = list_entry(this, struct dir_inode_entry, list); - if (entry->inode == inode) + if (unlikely(entry->inode == inode)) return -EEXIST; } list_add_tail(&new->list, head); @@ -513,8 +519,8 @@ void add_dirty_dir_inode(struct inode *inode) void remove_dirty_dir_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct list_head *head = &sbi->dir_inode_list; - struct list_head *this; + + struct list_head *this, *head; if (!S_ISDIR(inode->i_mode)) return; @@ -525,6 +531,7 @@ void remove_dirty_dir_inode(struct inode *inode) return; } + head = &sbi->dir_inode_list; list_for_each(this, head) { struct dir_inode_entry *entry; entry = list_entry(this, struct dir_inode_entry, list); @@ -546,11 +553,13 @@ void remove_dirty_dir_inode(struct inode *inode) struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino) { - struct list_head *head = &sbi->dir_inode_list; - struct list_head *this; + + struct list_head *this, *head; struct inode *inode = NULL; spin_lock(&sbi->dir_inode_lock); + + head = &sbi->dir_inode_list; list_for_each(this, head) { struct dir_inode_entry *entry; entry = list_entry(this, struct dir_inode_entry, list); @@ -565,11 +574,13 @@ struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino) void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi) { - struct list_head *head = &sbi->dir_inode_list; + struct list_head *head; struct dir_inode_entry *entry; struct inode *inode; retry: spin_lock(&sbi->dir_inode_lock); + + head = &sbi->dir_inode_list; if (list_empty(head)) { spin_unlock(&sbi->dir_inode_lock); return; @@ -585,7 +596,7 @@ retry: * We should submit bio, since it exists several * wribacking dentry pages in the freeing inode. */ - f2fs_submit_bio(sbi, DATA, true); + f2fs_submit_merged_bio(sbi, DATA, WRITE); } goto retry; } @@ -760,8 +771,8 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) /* wait for previous submitted node/meta pages writeback */ wait_on_all_pages_writeback(sbi); - filemap_fdatawait_range(sbi->node_inode->i_mapping, 0, LONG_MAX); - filemap_fdatawait_range(sbi->meta_inode->i_mapping, 0, LONG_MAX); + filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX); + filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX); /* update user_block_counts */ sbi->last_valid_block_count = sbi->total_valid_block_count; @@ -770,7 +781,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) /* Here, we only have one bio having CP pack */ sync_meta_pages(sbi, META_FLUSH, LONG_MAX); - if (!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) { + if (unlikely(!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) { clear_prefree_segments(sbi); F2FS_RESET_SB_DIRT(sbi); } @@ -791,9 +802,9 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops"); - f2fs_submit_bio(sbi, DATA, true); - f2fs_submit_bio(sbi, NODE, true); - f2fs_submit_bio(sbi, META, true); + f2fs_submit_merged_bio(sbi, DATA, WRITE); + f2fs_submit_merged_bio(sbi, NODE, WRITE); + f2fs_submit_merged_bio(sbi, META, WRITE); /* * update checkpoint pack index @@ -818,20 +829,28 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) void init_orphan_info(struct f2fs_sb_info *sbi) { - mutex_init(&sbi->orphan_inode_mutex); + spin_lock_init(&sbi->orphan_inode_lock); INIT_LIST_HEAD(&sbi->orphan_inode_list); sbi->n_orphans = 0; + /* + * considering 512 blocks in a segment 8 blocks are needed for cp + * and log segment summaries. Remaining blocks are used to keep + * orphan entries with the limitation one reserved segment + * for cp pack we can have max 1020*504 orphan entries + */ + sbi->max_orphans = (sbi->blocks_per_seg - 2 - NR_CURSEG_TYPE) + * F2FS_ORPHANS_PER_BLOCK; } int __init create_checkpoint_caches(void) { orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry", sizeof(struct orphan_inode_entry), NULL); - if (unlikely(!orphan_entry_slab)) + if (!orphan_entry_slab) return -ENOMEM; inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry", sizeof(struct dir_inode_entry), NULL); - if (unlikely(!inode_entry_slab)) { + if (!inode_entry_slab) { kmem_cache_destroy(orphan_entry_slab); return -ENOMEM; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index aa3438c..2261ccd 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -24,6 +24,188 @@ #include "segment.h" #include <trace/events/f2fs.h> +static void f2fs_read_end_io(struct bio *bio, int err) +{ + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + + if (!err) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } + bio_put(bio); +} + +static void f2fs_write_end_io(struct bio *bio, int err) +{ + struct f2fs_sb_info *sbi = F2FS_SB(bio->bi_io_vec->bv_page->mapping->host->i_sb); + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + + if (unlikely(err)) { + SetPageError(page); + set_bit(AS_EIO, &page->mapping->flags); + set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); + sbi->sb->s_flags |= MS_RDONLY; + } + end_page_writeback(page); + dec_page_count(sbi, F2FS_WRITEBACK); + } + + if (bio->bi_private) + complete(bio->bi_private); + + if (!get_pages(sbi, F2FS_WRITEBACK) && + !list_empty(&sbi->cp_wait.task_list)) + wake_up(&sbi->cp_wait); + + bio_put(bio); +} + +/* + * Low-level block read/write IO operations. + */ +static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, + int npages, bool is_read) +{ + struct bio *bio; + + /* No failure on bio allocation */ + bio = bio_alloc(GFP_NOIO, npages); + + bio->bi_bdev = sbi->sb->s_bdev; + bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); + bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; + + return bio; +} + +static void __submit_merged_bio(struct f2fs_bio_info *io) +{ + struct f2fs_io_info *fio = &io->fio; + int rw; + + if (!io->bio) + return; + + rw = fio->rw; + + if (is_read_io(rw)) { + trace_f2fs_submit_read_bio(io->sbi->sb, rw, + fio->type, io->bio); + submit_bio(rw, io->bio); + } else { + trace_f2fs_submit_write_bio(io->sbi->sb, rw, + fio->type, io->bio); + /* + * META_FLUSH is only from the checkpoint procedure, and we + * should wait this metadata bio for FS consistency. + */ + if (fio->type == META_FLUSH) { + DECLARE_COMPLETION_ONSTACK(wait); + io->bio->bi_private = &wait; + submit_bio(rw, io->bio); + wait_for_completion(&wait); + } else { + submit_bio(rw, io->bio); + } + } + + io->bio = NULL; +} + +void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, + enum page_type type, int rw) +{ + enum page_type btype = PAGE_TYPE_OF_BIO(type); + struct f2fs_bio_info *io; + + io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype]; + + mutex_lock(&io->io_mutex); + + /* change META to META_FLUSH in the checkpoint procedure */ + if (type >= META_FLUSH) { + io->fio.type = META_FLUSH; + io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; + } + __submit_merged_bio(io); + mutex_unlock(&io->io_mutex); +} + +/* + * Fill the locked page with data located in the block address. + * Return unlocked page. + */ +int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page, + block_t blk_addr, int rw) +{ + struct bio *bio; + + trace_f2fs_submit_page_bio(page, blk_addr, rw); + + /* Allocate a new bio */ + bio = __bio_alloc(sbi, blk_addr, 1, is_read_io(rw)); + + if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { + bio_put(bio); + f2fs_put_page(page, 1); + return -EFAULT; + } + + submit_bio(rw, bio); + return 0; +} + +void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page, + block_t blk_addr, struct f2fs_io_info *fio) +{ + enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); + struct f2fs_bio_info *io; + bool is_read = is_read_io(fio->rw); + + io = is_read ? &sbi->read_io : &sbi->write_io[btype]; + + verify_block_addr(sbi, blk_addr); + + mutex_lock(&io->io_mutex); + + if (!is_read) + inc_page_count(sbi, F2FS_WRITEBACK); + + if (io->bio && (io->last_block_in_bio != blk_addr - 1 || + io->fio.rw != fio->rw)) + __submit_merged_bio(io); +alloc_new: + if (io->bio == NULL) { + int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + + io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read); + io->fio = *fio; + } + + if (bio_add_page(io->bio, page, PAGE_CACHE_SIZE, 0) < + PAGE_CACHE_SIZE) { + __submit_merged_bio(io); + goto alloc_new; + } + + io->last_block_in_bio = blk_addr; + + mutex_unlock(&io->io_mutex); + trace_f2fs_submit_page_mbio(page, fio->rw, fio->type, blk_addr); +} + /* * Lock ordering for the change of data block address: * ->data_page @@ -37,7 +219,7 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr) struct page *node_page = dn->node_page; unsigned int ofs_in_node = dn->ofs_in_node; - f2fs_wait_on_page_writeback(node_page, NODE, false); + f2fs_wait_on_page_writeback(node_page, NODE); rn = F2FS_NODE(node_page); @@ -51,19 +233,39 @@ int reserve_new_block(struct dnode_of_data *dn) { struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); - if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)) + if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) return -EPERM; - if (!inc_valid_block_count(sbi, dn->inode, 1)) + if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) return -ENOSPC; trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node); __set_data_blkaddr(dn, NEW_ADDR); dn->data_blkaddr = NEW_ADDR; + mark_inode_dirty(dn->inode); sync_inode_page(dn); return 0; } +int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) +{ + bool need_put = dn->inode_page ? false : true; + int err; + + /* if inode_page exists, index should be zero */ + f2fs_bug_on(!need_put && index); + + err = get_dnode_of_data(dn, index, ALLOC_NODE); + if (err) + return err; + + if (dn->data_blkaddr == NULL_ADDR) + err = reserve_new_block(dn); + if (err || need_put) + f2fs_put_dnode(dn); + return err; +} + static int check_extent_cache(struct inode *inode, pgoff_t pgofs, struct buffer_head *bh_result) { @@ -71,6 +273,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs, pgoff_t start_fofs, end_fofs; block_t start_blkaddr; + if (is_inode_flag_set(fi, FI_NO_EXTENT)) + return 0; + read_lock(&fi->ext.ext_lock); if (fi->ext.len == 0) { read_unlock(&fi->ext.ext_lock); @@ -109,6 +314,7 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn) struct f2fs_inode_info *fi = F2FS_I(dn->inode); pgoff_t fofs, start_fofs, end_fofs; block_t start_blkaddr, end_blkaddr; + int need_update = true; f2fs_bug_on(blk_addr == NEW_ADDR); fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + @@ -117,6 +323,9 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn) /* Update the page address in the parent node */ __set_data_blkaddr(dn, blk_addr); + if (is_inode_flag_set(fi, FI_NO_EXTENT)) + return; + write_lock(&fi->ext.ext_lock); start_fofs = fi->ext.fofs; @@ -163,14 +372,21 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn) fofs - start_fofs + 1; fi->ext.len -= fofs - start_fofs + 1; } - goto end_update; + } else { + need_update = false; } - write_unlock(&fi->ext.ext_lock); - return; + /* Finally, if the extent is very fragmented, let's drop the cache. */ + if (fi->ext.len < F2FS_MIN_EXTENT_LEN) { + fi->ext.len = 0; + set_inode_flag(fi, FI_NO_EXTENT); + need_update = true; + } end_update: write_unlock(&fi->ext.ext_lock); - sync_inode_page(dn); + if (need_update) + sync_inode_page(dn); + return; } struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) @@ -196,7 +412,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) return ERR_PTR(-ENOENT); /* By fallocate(), there is no cached page, but with NEW_ADDR */ - if (dn.data_blkaddr == NEW_ADDR) + if (unlikely(dn.data_blkaddr == NEW_ADDR)) return ERR_PTR(-EINVAL); page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); @@ -208,11 +424,14 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) return page; } - err = f2fs_readpage(sbi, page, dn.data_blkaddr, + err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, sync ? READ_SYNC : READA); + if (err) + return ERR_PTR(err); + if (sync) { wait_on_page_locked(page); - if (!PageUptodate(page)) { + if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 0); return ERR_PTR(-EIO); } @@ -246,7 +465,7 @@ repeat: } f2fs_put_dnode(&dn); - if (dn.data_blkaddr == NULL_ADDR) { + if (unlikely(dn.data_blkaddr == NULL_ADDR)) { f2fs_put_page(page, 1); return ERR_PTR(-ENOENT); } @@ -266,16 +485,16 @@ repeat: return page; } - err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); + err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC); if (err) return ERR_PTR(err); lock_page(page); - if (!PageUptodate(page)) { + if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); return ERR_PTR(-EIO); } - if (page->mapping != mapping) { + if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } @@ -286,12 +505,12 @@ repeat: * Caller ensures that this data page is never allocated. * A new zero-filled data page is allocated in the page cache. * - * Also, caller should grab and release a mutex by calling mutex_lock_op() and - * mutex_unlock_op(). - * Note that, npage is set only by make_empty_dir. + * Also, caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). + * Note that, ipage is set only by make_empty_dir. */ struct page *get_new_data_page(struct inode *inode, - struct page *npage, pgoff_t index, bool new_i_size) + struct page *ipage, pgoff_t index, bool new_i_size) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct address_space *mapping = inode->i_mapping; @@ -299,24 +518,16 @@ struct page *get_new_data_page(struct inode *inode, struct dnode_of_data dn; int err; - set_new_dnode(&dn, inode, npage, npage, 0); - err = get_dnode_of_data(&dn, index, ALLOC_NODE); + set_new_dnode(&dn, inode, ipage, NULL, 0); + err = f2fs_reserve_block(&dn, index); if (err) return ERR_PTR(err); - - if (dn.data_blkaddr == NULL_ADDR) { - if (reserve_new_block(&dn)) { - if (!npage) - f2fs_put_dnode(&dn); - return ERR_PTR(-ENOSPC); - } - } - if (!npage) - f2fs_put_dnode(&dn); repeat: page = grab_cache_page(mapping, index); - if (!page) - return ERR_PTR(-ENOMEM); + if (!page) { + err = -ENOMEM; + goto put_err; + } if (PageUptodate(page)) return page; @@ -325,15 +536,18 @@ repeat: zero_user_segment(page, 0, PAGE_CACHE_SIZE); SetPageUptodate(page); } else { - err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); + err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, + READ_SYNC); if (err) - return ERR_PTR(err); + goto put_err; + lock_page(page); - if (!PageUptodate(page)) { + if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); - return ERR_PTR(-EIO); + err = -EIO; + goto put_err; } - if (page->mapping != mapping) { + if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } @@ -344,140 +558,187 @@ repeat: i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT)); /* Only the directory inode sets new_i_size */ set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); - mark_inode_dirty_sync(inode); } return page; -} -static void read_end_io(struct bio *bio, int err) -{ - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - - do { - struct page *page = bvec->bv_page; - - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - - if (uptodate) { - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - unlock_page(page); - } while (bvec >= bio->bi_io_vec); - bio_put(bio); +put_err: + f2fs_put_dnode(&dn); + return ERR_PTR(err); } -/* - * Fill the locked page with data located in the block address. - * Return unlocked page. - */ -int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page, - block_t blk_addr, int type) +static int __allocate_data_block(struct dnode_of_data *dn) { - struct block_device *bdev = sbi->sb->s_bdev; - struct bio *bio; + struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); + struct f2fs_summary sum; + block_t new_blkaddr; + struct node_info ni; + int type; - trace_f2fs_readpage(page, blk_addr, type); + if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) + return -EPERM; + if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) + return -ENOSPC; - down_read(&sbi->bio_sem); + __set_data_blkaddr(dn, NEW_ADDR); + dn->data_blkaddr = NEW_ADDR; - /* Allocate a new bio */ - bio = f2fs_bio_alloc(bdev, 1); + get_node_info(sbi, dn->nid, &ni); + set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); - /* Initialize the bio */ - bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); - bio->bi_end_io = read_end_io; + type = CURSEG_WARM_DATA; - if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { - bio_put(bio); - up_read(&sbi->bio_sem); - f2fs_put_page(page, 1); - return -EFAULT; - } + allocate_data_block(sbi, NULL, NULL_ADDR, &new_blkaddr, &sum, type); + + /* direct IO doesn't use extent cache to maximize the performance */ + set_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT); + update_extent_cache(new_blkaddr, dn); + clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT); - submit_bio(type, bio); - up_read(&sbi->bio_sem); + dn->data_blkaddr = new_blkaddr; return 0; } /* - * This function should be used by the data read flow only where it - * does not check the "create" flag that indicates block allocation. - * The reason for this special functionality is to exploit VFS readahead - * mechanism. + * get_data_block() now supported readahead/bmap/rw direct_IO with mapped bh. + * If original data blocks are allocated, then give them to blockdev. + * Otherwise, + * a. preallocate requested block addresses + * b. do not use extent cache for better performance + * c. give the block addresses to blockdev */ -static int get_data_block_ro(struct inode *inode, sector_t iblock, +static int get_data_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); unsigned int blkbits = inode->i_sb->s_blocksize_bits; unsigned maxblocks = bh_result->b_size >> blkbits; struct dnode_of_data dn; - pgoff_t pgofs; - int err; + int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA; + pgoff_t pgofs, end_offset; + int err = 0, ofs = 1; + bool allocated = false; /* Get the page offset from the block offset(iblock) */ pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits)); - if (check_extent_cache(inode, pgofs, bh_result)) { - trace_f2fs_get_data_block(inode, iblock, bh_result, 0); - return 0; - } + if (check_extent_cache(inode, pgofs, bh_result)) + goto out; + + if (create) + f2fs_lock_op(sbi); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA); + err = get_dnode_of_data(&dn, pgofs, mode); if (err) { - trace_f2fs_get_data_block(inode, iblock, bh_result, err); - return (err == -ENOENT) ? 0 : err; + if (err == -ENOENT) + err = 0; + goto unlock_out; + } + if (dn.data_blkaddr == NEW_ADDR) + goto put_out; + + if (dn.data_blkaddr != NULL_ADDR) { + map_bh(bh_result, inode->i_sb, dn.data_blkaddr); + } else if (create) { + err = __allocate_data_block(&dn); + if (err) + goto put_out; + allocated = true; + map_bh(bh_result, inode->i_sb, dn.data_blkaddr); + } else { + goto put_out; } - /* It does not support data allocation */ - f2fs_bug_on(create); + end_offset = IS_INODE(dn.node_page) ? + ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK; + bh_result->b_size = (((size_t)1) << blkbits); + dn.ofs_in_node++; + pgofs++; + +get_next: + if (dn.ofs_in_node >= end_offset) { + if (allocated) + sync_inode_page(&dn); + allocated = false; + f2fs_put_dnode(&dn); - if (dn.data_blkaddr != NEW_ADDR && dn.data_blkaddr != NULL_ADDR) { - int i; - unsigned int end_offset; + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = get_dnode_of_data(&dn, pgofs, mode); + if (err) { + if (err == -ENOENT) + err = 0; + goto unlock_out; + } + if (dn.data_blkaddr == NEW_ADDR) + goto put_out; end_offset = IS_INODE(dn.node_page) ? - ADDRS_PER_INODE(F2FS_I(inode)) : - ADDRS_PER_BLOCK; - - clear_buffer_new(bh_result); + ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK; + } + if (maxblocks > (bh_result->b_size >> blkbits)) { + block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + if (blkaddr == NULL_ADDR && create) { + err = __allocate_data_block(&dn); + if (err) + goto sync_out; + allocated = true; + blkaddr = dn.data_blkaddr; + } /* Give more consecutive addresses for the read ahead */ - for (i = 0; i < end_offset - dn.ofs_in_node; i++) - if (((datablock_addr(dn.node_page, - dn.ofs_in_node + i)) - != (dn.data_blkaddr + i)) || maxblocks == i) - break; - map_bh(bh_result, inode->i_sb, dn.data_blkaddr); - bh_result->b_size = (i << blkbits); + if (blkaddr == (bh_result->b_blocknr + ofs)) { + ofs++; + dn.ofs_in_node++; + pgofs++; + bh_result->b_size += (((size_t)1) << blkbits); + goto get_next; + } } +sync_out: + if (allocated) + sync_inode_page(&dn); +put_out: f2fs_put_dnode(&dn); - trace_f2fs_get_data_block(inode, iblock, bh_result, 0); - return 0; +unlock_out: + if (create) + f2fs_unlock_op(sbi); +out: + trace_f2fs_get_data_block(inode, iblock, bh_result, err); + return err; } static int f2fs_read_data_page(struct file *file, struct page *page) { - return mpage_readpage(page, get_data_block_ro); + struct inode *inode = page->mapping->host; + int ret; + + /* If the file has inline data, try to read it directlly */ + if (f2fs_has_inline_data(inode)) + ret = f2fs_read_inline_data(inode, page); + else + ret = mpage_readpage(page, get_data_block); + + return ret; } static int f2fs_read_data_pages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { - return mpage_readpages(mapping, pages, nr_pages, get_data_block_ro); + struct inode *inode = file->f_mapping->host; + + /* If the file has inline data, skip readpages */ + if (f2fs_has_inline_data(inode)) + return 0; + + return mpage_readpages(mapping, pages, nr_pages, get_data_block); } -int do_write_data_page(struct page *page) +int do_write_data_page(struct page *page, struct f2fs_io_info *fio) { struct inode *inode = page->mapping->host; - block_t old_blk_addr, new_blk_addr; + block_t old_blkaddr, new_blkaddr; struct dnode_of_data dn; int err = 0; @@ -486,10 +747,10 @@ int do_write_data_page(struct page *page) if (err) return err; - old_blk_addr = dn.data_blkaddr; + old_blkaddr = dn.data_blkaddr; /* This page is already truncated */ - if (old_blk_addr == NULL_ADDR) + if (old_blkaddr == NULL_ADDR) goto out_writepage; set_page_writeback(page); @@ -498,15 +759,13 @@ int do_write_data_page(struct page *page) * If current allocation needs SSR, * it had better in-place writes for updated data. */ - if (unlikely(old_blk_addr != NEW_ADDR && + if (unlikely(old_blkaddr != NEW_ADDR && !is_cold_data(page) && need_inplace_update(inode))) { - rewrite_data_page(F2FS_SB(inode->i_sb), page, - old_blk_addr); + rewrite_data_page(page, old_blkaddr, fio); } else { - write_data_page(inode, page, &dn, - old_blk_addr, &new_blk_addr); - update_extent_cache(new_blk_addr, &dn); + write_data_page(page, &dn, &new_blkaddr, fio); + update_extent_cache(new_blkaddr, &dn); } out_writepage: f2fs_put_dnode(&dn); @@ -521,9 +780,13 @@ static int f2fs_write_data_page(struct page *page, loff_t i_size = i_size_read(inode); const pgoff_t end_index = ((unsigned long long) i_size) >> PAGE_CACHE_SHIFT; - unsigned offset; + unsigned offset = 0; bool need_balance_fs = false; int err = 0; + struct f2fs_io_info fio = { + .type = DATA, + .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, + }; if (page->index < end_index) goto write; @@ -543,7 +806,7 @@ static int f2fs_write_data_page(struct page *page, zero_user_segment(page, offset, PAGE_CACHE_SIZE); write: - if (sbi->por_doing) { + if (unlikely(sbi->por_doing)) { err = AOP_WRITEPAGE_ACTIVATE; goto redirty_out; } @@ -552,10 +815,18 @@ write: if (S_ISDIR(inode->i_mode)) { dec_page_count(sbi, F2FS_DIRTY_DENTS); inode_dec_dirty_dents(inode); - err = do_write_data_page(page); + err = do_write_data_page(page, &fio); } else { f2fs_lock_op(sbi); - err = do_write_data_page(page); + + if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode)) { + err = f2fs_write_inline_data(inode, page, offset); + f2fs_unlock_op(sbi); + goto out; + } else { + err = do_write_data_page(page, &fio); + } + f2fs_unlock_op(sbi); need_balance_fs = true; } @@ -564,8 +835,10 @@ write: else if (err) goto redirty_out; - if (wbc->for_reclaim) - f2fs_submit_bio(sbi, DATA, true); + if (wbc->for_reclaim) { + f2fs_submit_merged_bio(sbi, DATA, WRITE); + need_balance_fs = false; + } clear_cold_data(page); out: @@ -617,7 +890,8 @@ static int f2fs_write_data_pages(struct address_space *mapping, ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); if (locked) mutex_unlock(&sbi->writepages); - f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL)); + + f2fs_submit_merged_bio(sbi, DATA, WRITE); remove_dirty_dir_inode(inode); @@ -638,27 +912,28 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, f2fs_balance_fs(sbi); repeat: + err = f2fs_convert_inline_data(inode, pos + len); + if (err) + return err; + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; *pagep = page; - f2fs_lock_op(sbi); + if (f2fs_has_inline_data(inode) && (pos + len) <= MAX_INLINE_DATA) + goto inline_data; + f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, ALLOC_NODE); - if (err) - goto err; - - if (dn.data_blkaddr == NULL_ADDR) - err = reserve_new_block(&dn); - - f2fs_put_dnode(&dn); - if (err) - goto err; - + err = f2fs_reserve_block(&dn, index); f2fs_unlock_op(sbi); + if (err) { + f2fs_put_page(page, 1); + return err; + } +inline_data: if ((len == PAGE_CACHE_SIZE) || PageUptodate(page)) return 0; @@ -674,15 +949,19 @@ repeat: if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_CACHE_SIZE); } else { - err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); + if (f2fs_has_inline_data(inode)) + err = f2fs_read_inline_data(inode, page); + else + err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, + READ_SYNC); if (err) return err; lock_page(page); - if (!PageUptodate(page)) { + if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); return -EIO; } - if (page->mapping != mapping) { + if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } @@ -691,11 +970,6 @@ out: SetPageUptodate(page); clear_cold_data(page); return 0; - -err: - f2fs_unlock_op(sbi); - f2fs_put_page(page, 1); - return err; } static int f2fs_write_end(struct file *file, @@ -714,23 +988,43 @@ static int f2fs_write_end(struct file *file, update_inode_page(inode); } - unlock_page(page); - page_cache_release(page); + f2fs_put_page(page, 1); return copied; } +static int check_direct_IO(struct inode *inode, int rw, + const struct iovec *iov, loff_t offset, unsigned long nr_segs) +{ + unsigned blocksize_mask = inode->i_sb->s_blocksize - 1; + int i; + + if (rw == READ) + return 0; + + if (offset & blocksize_mask) + return -EINVAL; + + for (i = 0; i < nr_segs; i++) + if (iov[i].iov_len & blocksize_mask) + return -EINVAL; + return 0; +} + static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - if (rw == WRITE) + /* Let buffer I/O handle the inline data case. */ + if (f2fs_has_inline_data(inode)) + return 0; + + if (check_direct_IO(inode, rw, iov, offset, nr_segs)) return 0; - /* Needs synchronization with the cleaner */ return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, - get_data_block_ro); + get_data_block); } static void f2fs_invalidate_data_page(struct page *page, unsigned int offset, @@ -759,6 +1053,8 @@ static int f2fs_set_data_page_dirty(struct page *page) trace_f2fs_set_page_dirty(page, DATA); SetPageUptodate(page); + mark_inode_dirty(inode); + if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); set_dirty_dir_page(inode, page); @@ -769,7 +1065,7 @@ static int f2fs_set_data_page_dirty(struct page *page) static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) { - return generic_block_bmap(mapping, block, get_data_block_ro); + return generic_block_bmap(mapping, block, get_data_block); } const struct address_space_operations f2fs_dblock_aops = { diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index a84b0a8..3de9d20 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -24,7 +24,7 @@ #include "gc.h" static LIST_HEAD(f2fs_stat_list); -static struct dentry *debugfs_root; +static struct dentry *f2fs_debugfs_root; static DEFINE_MUTEX(f2fs_stat_mutex); static void update_general_status(struct f2fs_sb_info *sbi) @@ -45,14 +45,15 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->valid_count = valid_user_blocks(sbi); si->valid_node_count = valid_node_count(sbi); si->valid_inode_count = valid_inode_count(sbi); + si->inline_inode = sbi->inline_inode; si->utilization = utilization(sbi); si->free_segs = free_segments(sbi); si->free_secs = free_sections(sbi); si->prefree_count = prefree_segments(sbi); si->dirty_count = dirty_segments(sbi); - si->node_pages = sbi->node_inode->i_mapping->nrpages; - si->meta_pages = sbi->meta_inode->i_mapping->nrpages; + si->node_pages = NODE_MAPPING(sbi)->nrpages; + si->meta_pages = META_MAPPING(sbi)->nrpages; si->nats = NM_I(sbi)->nat_cnt; si->sits = SIT_I(sbi)->dirty_sentries; si->fnids = NM_I(sbi)->fcnt; @@ -165,9 +166,9 @@ get_cache: /* free nids */ si->cache_mem = NM_I(sbi)->fcnt; si->cache_mem += NM_I(sbi)->nat_cnt; - npages = sbi->node_inode->i_mapping->nrpages; + npages = NODE_MAPPING(sbi)->nrpages; si->cache_mem += npages << PAGE_CACHE_SHIFT; - npages = sbi->meta_inode->i_mapping->nrpages; + npages = META_MAPPING(sbi)->nrpages; si->cache_mem += npages << PAGE_CACHE_SHIFT; si->cache_mem += sbi->n_orphans * sizeof(struct orphan_inode_entry); si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry); @@ -200,6 +201,8 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, "Other: %u)\n - Data: %u\n", si->valid_node_count - si->valid_inode_count, si->valid_count - si->valid_node_count); + seq_printf(s, " - Inline_data Inode: %u\n", + si->inline_inode); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); @@ -242,14 +245,14 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - node blocks : %d\n", si->node_blks); seq_printf(s, "\nExtent Hit Ratio: %d / %d\n", si->hit_ext, si->total_ext); - seq_printf(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - nodes %4d in %4d\n", + seq_puts(s, "\nBalancing F2FS Async:\n"); + seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); - seq_printf(s, " - dents %4d in dirs:%4d\n", + seq_printf(s, " - dents: %4d in dirs:%4d\n", si->ndirty_dent, si->ndirty_dirs); - seq_printf(s, " - meta %4d in %4d\n", + seq_printf(s, " - meta: %4d in %4d\n", si->ndirty_meta, si->meta_pages); - seq_printf(s, " - NATs %5d > %lu\n", + seq_printf(s, " - NATs: %5d > %lu\n", si->nats, NM_WOUT_THRESHOLD); seq_printf(s, " - SITs: %5d\n - free_nids: %5d\n", si->sits, si->fnids); @@ -340,14 +343,32 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) void __init f2fs_create_root_stats(void) { - debugfs_root = debugfs_create_dir("f2fs", NULL); - if (debugfs_root) - debugfs_create_file("status", S_IRUGO, debugfs_root, - NULL, &stat_fops); + struct dentry *file; + + f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); + if (!f2fs_debugfs_root) + goto bail; + + file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, + NULL, &stat_fops); + if (!file) + goto free_debugfs_dir; + + return; + +free_debugfs_dir: + debugfs_remove(f2fs_debugfs_root); + +bail: + f2fs_debugfs_root = NULL; + return; } void f2fs_destroy_root_stats(void) { - debugfs_remove_recursive(debugfs_root); - debugfs_root = NULL; + if (!f2fs_debugfs_root) + return; + + debugfs_remove_recursive(f2fs_debugfs_root); + f2fs_debugfs_root = NULL; } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 594fc1b..2b7c255 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -190,9 +190,6 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, unsigned int max_depth; unsigned int level; - if (namelen > F2FS_NAME_LEN) - return NULL; - if (npages == 0) return NULL; @@ -259,20 +256,17 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, dir->i_mtime = dir->i_ctime = CURRENT_TIME; mark_inode_dirty(dir); - /* update parent inode number before releasing dentry page */ - F2FS_I(inode)->i_pino = dir->i_ino; - f2fs_put_page(page, 1); } static void init_dent_inode(const struct qstr *name, struct page *ipage) { - struct f2fs_node *rn; + struct f2fs_inode *ri; /* copy name info. to this inode page */ - rn = F2FS_NODE(ipage); - rn->i.i_namelen = cpu_to_le32(name->len); - memcpy(rn->i.i_name, name->name, name->len); + ri = F2FS_INODE(ipage); + ri->i_namelen = cpu_to_le32(name->len); + memcpy(ri->i_name, name->name, name->len); set_page_dirty(ipage); } @@ -348,11 +342,11 @@ static struct page *init_inode_metadata(struct inode *inode, err = f2fs_init_acl(inode, dir, page); if (err) - goto error; + goto put_error; err = f2fs_init_security(inode, dir, name, page); if (err) - goto error; + goto put_error; wait_on_page_writeback(page); } else { @@ -376,8 +370,9 @@ static struct page *init_inode_metadata(struct inode *inode, } return page; -error: +put_error: f2fs_put_page(page, 1); +error: remove_inode_page(inode); return ERR_PTR(err); } @@ -393,6 +388,8 @@ static void update_parent_metadata(struct inode *dir, struct inode *inode, clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); } dir->i_mtime = dir->i_ctime = CURRENT_TIME; + mark_inode_dirty(dir); + if (F2FS_I(dir)->i_current_depth != current_depth) { F2FS_I(dir)->i_current_depth = current_depth; set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); @@ -400,8 +397,6 @@ static void update_parent_metadata(struct inode *dir, struct inode *inode, if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) update_inode_page(dir); - else - mark_inode_dirty(dir); if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) clear_inode_flag(F2FS_I(inode), FI_INC_LINK); @@ -432,10 +427,11 @@ next: } /* - * Caller should grab and release a mutex by calling mutex_lock_op() and - * mutex_unlock_op(). + * Caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). */ -int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode) +int __f2fs_add_link(struct inode *dir, const struct qstr *name, + struct inode *inode) { unsigned int bit_pos; unsigned int level; @@ -461,7 +457,7 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *in } start: - if (current_depth == MAX_DIR_HASH_DEPTH) + if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) return -ENOSPC; /* Increase the depth, if required */ @@ -554,14 +550,11 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, dir->i_ctime = dir->i_mtime = CURRENT_TIME; - if (inode && S_ISDIR(inode->i_mode)) { - drop_nlink(dir); - update_inode_page(dir); - } else { - mark_inode_dirty(dir); - } - if (inode) { + if (S_ISDIR(inode->i_mode)) { + drop_nlink(dir); + update_inode_page(dir); + } inode->i_ctime = CURRENT_TIME; drop_nlink(inode); if (S_ISDIR(inode->i_mode)) { @@ -636,7 +629,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK); - for ( ; n < npages; n++) { + for (; n < npages; n++) { dentry_page = get_lock_data_page(inode, n); if (IS_ERR(dentry_page)) continue; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 89dc750..fc3c558 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -22,8 +22,10 @@ #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(condition) BUG_ON(condition) +#define f2fs_down_write(x, y) down_write_nest_lock(x, y) #else #define f2fs_bug_on(condition) +#define f2fs_down_write(x, y) down_write(x) #endif /* @@ -37,6 +39,7 @@ #define F2FS_MOUNT_POSIX_ACL 0x00000020 #define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040 #define F2FS_MOUNT_INLINE_XATTR 0x00000080 +#define F2FS_MOUNT_INLINE_DATA 0x00000100 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) @@ -97,6 +100,13 @@ struct dir_inode_entry { struct inode *inode; /* vfs inode pointer */ }; +/* for the list of blockaddresses to be discarded */ +struct discard_entry { + struct list_head list; /* list head */ + block_t blkaddr; /* block address to be discarded */ + int len; /* # of consecutive blocks of the discard */ +}; + /* for the list of fsync inodes, used only during recovery */ struct fsync_inode_entry { struct list_head list; /* list head */ @@ -155,13 +165,15 @@ enum { LOOKUP_NODE, /* look up a node without readahead */ LOOKUP_NODE_RA, /* * look up a node with readahead called - * by get_datablock_ro. + * by get_data_block. */ }; #define F2FS_LINK_MAX 32000 /* maximum link count per file */ /* for in-memory extent cache entry */ +#define F2FS_MIN_EXTENT_LEN 16 /* minimum extent length */ + struct extent_info { rwlock_t ext_lock; /* rwlock for consistency */ unsigned int fofs; /* start offset in a file */ @@ -308,6 +320,14 @@ struct f2fs_sm_info { /* a threshold to reclaim prefree segments */ unsigned int rec_prefree_segments; + + /* for small discard management */ + struct list_head discard_list; /* 4KB discard list */ + int nr_discards; /* # of discards in the list */ + int max_discards; /* max. discards to be issued */ + + unsigned int ipu_policy; /* in-place-update policy */ + unsigned int min_ipu_util; /* in-place-update threshold */ }; /* @@ -338,6 +358,7 @@ enum count_type { * with waiting the bio's completion * ... Only can be used with META. */ +#define PAGE_TYPE_OF_BIO(type) ((type) > META ? META : (type)) enum page_type { DATA, NODE, @@ -346,6 +367,20 @@ enum page_type { META_FLUSH, }; +struct f2fs_io_info { + enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ + int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */ +}; + +#define is_read_io(rw) (((rw) & 1) == READ) +struct f2fs_bio_info { + struct f2fs_sb_info *sbi; /* f2fs superblock */ + struct bio *bio; /* bios to merge */ + sector_t last_block_in_bio; /* last block number */ + struct f2fs_io_info fio; /* store buffered io info. */ + struct mutex io_mutex; /* mutex for bio */ +}; + struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ @@ -359,9 +394,10 @@ struct f2fs_sb_info { /* for segment-related operations */ struct f2fs_sm_info *sm_info; /* segment manager */ - struct bio *bio[NR_PAGE_TYPE]; /* bios to merge */ - sector_t last_block_in_bio[NR_PAGE_TYPE]; /* last block number */ - struct rw_semaphore bio_sem; /* IO semaphore */ + + /* for bio operations */ + struct f2fs_bio_info read_io; /* for read bios */ + struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ @@ -376,8 +412,9 @@ struct f2fs_sb_info { /* for orphan inode management */ struct list_head orphan_inode_list; /* orphan inode list */ - struct mutex orphan_inode_mutex; /* for orphan inode list */ + spinlock_t orphan_inode_lock; /* for orphan inode list */ unsigned int n_orphans; /* # of orphan inodes */ + unsigned int max_orphans; /* max orphan inodes */ /* for directory inode management */ struct list_head dir_inode_list; /* dir inode list */ @@ -414,6 +451,9 @@ struct f2fs_sb_info { struct f2fs_gc_kthread *gc_thread; /* GC thread */ unsigned int cur_victim_sec; /* current victim section num */ + /* maximum # of trials to find a victim segment for SSR and GC */ + unsigned int max_victim_search; + /* * for stat information. * one is for the LFS mode, and the other is for the SSR mode. @@ -423,6 +463,7 @@ struct f2fs_sb_info { unsigned int segment_count[2]; /* # of allocated segments */ unsigned int block_count[2]; /* # of allocated blocks */ int total_hit_ext, read_hit_ext; /* extent cache hit ratio */ + int inline_inode; /* # of inline_data inodes */ int bg_gc; /* background gc calls */ unsigned int n_dirty_dirs; /* # of dir inodes */ #endif @@ -462,6 +503,11 @@ static inline struct f2fs_node *F2FS_NODE(struct page *page) return (struct f2fs_node *)page_address(page); } +static inline struct f2fs_inode *F2FS_INODE(struct page *page) +{ + return &((struct f2fs_node *)page_address(page))->i; +} + static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi) { return (struct f2fs_nm_info *)(sbi->nm_info); @@ -487,6 +533,16 @@ static inline struct dirty_seglist_info *DIRTY_I(struct f2fs_sb_info *sbi) return (struct dirty_seglist_info *)(SM_I(sbi)->dirty_info); } +static inline struct address_space *META_MAPPING(struct f2fs_sb_info *sbi) +{ + return sbi->meta_inode->i_mapping; +} + +static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi) +{ + return sbi->node_inode->i_mapping; +} + static inline void F2FS_SET_SB_DIRT(struct f2fs_sb_info *sbi) { sbi->s_dirty = 1; @@ -534,7 +590,7 @@ static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) { - down_write_nest_lock(&sbi->cp_rwsem, &sbi->cp_mutex); + f2fs_down_write(&sbi->cp_rwsem, &sbi->cp_mutex); } static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) @@ -548,7 +604,7 @@ static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) { WARN_ON((nid >= NM_I(sbi)->max_nid)); - if (nid >= NM_I(sbi)->max_nid) + if (unlikely(nid >= NM_I(sbi)->max_nid)) return -EINVAL; return 0; } @@ -561,9 +617,9 @@ static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) static inline int F2FS_HAS_BLOCKS(struct inode *inode) { if (F2FS_I(inode)->i_xattr_nid) - return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1); + return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1; else - return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS); + return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS; } static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, @@ -574,7 +630,7 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); valid_block_count = sbi->total_valid_block_count + (block_t)count; - if (valid_block_count > sbi->user_block_count) { + if (unlikely(valid_block_count > sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); return false; } @@ -585,7 +641,7 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, return true; } -static inline int dec_valid_block_count(struct f2fs_sb_info *sbi, +static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, blkcnt_t count) { @@ -595,7 +651,6 @@ static inline int dec_valid_block_count(struct f2fs_sb_info *sbi, inode->i_blocks -= count; sbi->total_valid_block_count -= (block_t)count; spin_unlock(&sbi->stat_lock); - return 0; } static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) @@ -686,50 +741,48 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) } static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, - struct inode *inode, - unsigned int count) + struct inode *inode) { block_t valid_block_count; unsigned int valid_node_count; spin_lock(&sbi->stat_lock); - valid_block_count = sbi->total_valid_block_count + (block_t)count; - sbi->alloc_valid_block_count += (block_t)count; - valid_node_count = sbi->total_valid_node_count + count; - - if (valid_block_count > sbi->user_block_count) { + valid_block_count = sbi->total_valid_block_count + 1; + if (unlikely(valid_block_count > sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); return false; } - if (valid_node_count > sbi->total_node_count) { + valid_node_count = sbi->total_valid_node_count + 1; + if (unlikely(valid_node_count > sbi->total_node_count)) { spin_unlock(&sbi->stat_lock); return false; } if (inode) - inode->i_blocks += count; - sbi->total_valid_node_count = valid_node_count; - sbi->total_valid_block_count = valid_block_count; + inode->i_blocks++; + + sbi->alloc_valid_block_count++; + sbi->total_valid_node_count++; + sbi->total_valid_block_count++; spin_unlock(&sbi->stat_lock); return true; } static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, - struct inode *inode, - unsigned int count) + struct inode *inode) { spin_lock(&sbi->stat_lock); - f2fs_bug_on(sbi->total_valid_block_count < count); - f2fs_bug_on(sbi->total_valid_node_count < count); - f2fs_bug_on(inode->i_blocks < count); + f2fs_bug_on(!sbi->total_valid_block_count); + f2fs_bug_on(!sbi->total_valid_node_count); + f2fs_bug_on(!inode->i_blocks); - inode->i_blocks -= count; - sbi->total_valid_node_count -= count; - sbi->total_valid_block_count -= (block_t)count; + inode->i_blocks--; + sbi->total_valid_node_count--; + sbi->total_valid_block_count--; spin_unlock(&sbi->stat_lock); } @@ -751,13 +804,12 @@ static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) spin_unlock(&sbi->stat_lock); } -static inline int dec_valid_inode_count(struct f2fs_sb_info *sbi) +static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi) { spin_lock(&sbi->stat_lock); f2fs_bug_on(!sbi->total_valid_inode_count); sbi->total_valid_inode_count--; spin_unlock(&sbi->stat_lock); - return 0; } static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) @@ -771,7 +823,7 @@ static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) static inline void f2fs_put_page(struct page *page, int unlock) { - if (!page || IS_ERR(page)) + if (!page) return; if (unlock) { @@ -876,7 +928,9 @@ enum { FI_NO_ALLOC, /* should not allocate any blocks */ FI_UPDATE_DIR, /* should update inode block for consistency */ FI_DELAY_IPUT, /* used for the recovery */ + FI_NO_EXTENT, /* not to use the extent cache */ FI_INLINE_XATTR, /* used for inline xattr */ + FI_INLINE_DATA, /* used for inline data*/ }; static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) @@ -914,6 +968,8 @@ static inline void get_inline_info(struct f2fs_inode_info *fi, { if (ri->i_inline & F2FS_INLINE_XATTR) set_inode_flag(fi, FI_INLINE_XATTR); + if (ri->i_inline & F2FS_INLINE_DATA) + set_inode_flag(fi, FI_INLINE_DATA); } static inline void set_raw_inline(struct f2fs_inode_info *fi, @@ -923,6 +979,8 @@ static inline void set_raw_inline(struct f2fs_inode_info *fi, if (is_inode_flag_set(fi, FI_INLINE_XATTR)) ri->i_inline |= F2FS_INLINE_XATTR; + if (is_inode_flag_set(fi, FI_INLINE_DATA)) + ri->i_inline |= F2FS_INLINE_DATA; } static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi) @@ -948,16 +1006,33 @@ static inline int inline_xattr_size(struct inode *inode) return 0; } +static inline int f2fs_has_inline_data(struct inode *inode) +{ + return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA); +} + +static inline void *inline_data_addr(struct page *page) +{ + struct f2fs_inode *ri; + ri = (struct f2fs_inode *)page_address(page); + return (void *)&(ri->i_addr[1]); +} + static inline int f2fs_readonly(struct super_block *sb) { return sb->s_flags & MS_RDONLY; } +#define get_inode_mode(i) \ + ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \ + (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) + /* * file.c */ int f2fs_sync_file(struct file *, loff_t, loff_t, int); void truncate_data_blocks(struct dnode_of_data *); +int truncate_blocks(struct inode *, u64); void f2fs_truncate(struct inode *); int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); int f2fs_setattr(struct dentry *, struct iattr *); @@ -1027,7 +1102,7 @@ int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); int truncate_inode_blocks(struct inode *, pgoff_t); int truncate_xattr_node(struct inode *, struct page *); int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t); -int remove_inode_page(struct inode *); +void remove_inode_page(struct inode *); struct page *new_inode_page(struct inode *, const struct qstr *); struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); void ra_node_page(struct f2fs_sb_info *, nid_t); @@ -1059,19 +1134,19 @@ void clear_prefree_segments(struct f2fs_sb_info *); int npages_for_summary_flush(struct f2fs_sb_info *); void allocate_new_segments(struct f2fs_sb_info *); struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); -struct bio *f2fs_bio_alloc(struct block_device *, int); -void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool); -void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool); void write_meta_page(struct f2fs_sb_info *, struct page *); -void write_node_page(struct f2fs_sb_info *, struct page *, unsigned int, - block_t, block_t *); -void write_data_page(struct inode *, struct page *, struct dnode_of_data*, - block_t, block_t *); -void rewrite_data_page(struct f2fs_sb_info *, struct page *, block_t); +void write_node_page(struct f2fs_sb_info *, struct page *, + struct f2fs_io_info *, unsigned int, block_t, block_t *); +void write_data_page(struct page *, struct dnode_of_data *, block_t *, + struct f2fs_io_info *); +void rewrite_data_page(struct page *, block_t, struct f2fs_io_info *); void recover_data_page(struct f2fs_sb_info *, struct page *, struct f2fs_summary *, block_t, block_t); void rewrite_node_page(struct f2fs_sb_info *, struct page *, struct f2fs_summary *, block_t, block_t); +void allocate_data_block(struct f2fs_sb_info *, struct page *, + block_t, block_t *, struct f2fs_summary *, int); +void f2fs_wait_on_page_writeback(struct page *, enum page_type); void write_data_summaries(struct f2fs_sb_info *, block_t); void write_node_summaries(struct f2fs_sb_info *, block_t); int lookup_journal_in_cursum(struct f2fs_summary_block *, @@ -1079,6 +1154,8 @@ int lookup_journal_in_cursum(struct f2fs_summary_block *, void flush_sit_entries(struct f2fs_sb_info *); int build_segment_manager(struct f2fs_sb_info *); void destroy_segment_manager(struct f2fs_sb_info *); +int __init create_segment_manager_caches(void); +void destroy_segment_manager_caches(void); /* * checkpoint.c @@ -1090,7 +1167,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *); void release_orphan_inode(struct f2fs_sb_info *); void add_orphan_inode(struct f2fs_sb_info *, nid_t); void remove_orphan_inode(struct f2fs_sb_info *, nid_t); -int recover_orphan_inodes(struct f2fs_sb_info *); +void recover_orphan_inodes(struct f2fs_sb_info *); int get_valid_checkpoint(struct f2fs_sb_info *); void set_dirty_dir_page(struct inode *, struct page *); void add_dirty_dir_inode(struct inode *); @@ -1105,13 +1182,17 @@ void destroy_checkpoint_caches(void); /* * data.c */ +void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int); +int f2fs_submit_page_bio(struct f2fs_sb_info *, struct page *, block_t, int); +void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *, block_t, + struct f2fs_io_info *); int reserve_new_block(struct dnode_of_data *); +int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); void update_extent_cache(block_t, struct dnode_of_data *); struct page *find_data_page(struct inode *, pgoff_t, bool); struct page *get_lock_data_page(struct inode *, pgoff_t); struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); -int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int); -int do_write_data_page(struct page *); +int do_write_data_page(struct page *, struct f2fs_io_info *); /* * gc.c @@ -1144,7 +1225,7 @@ struct f2fs_stat_info { int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; int nats, sits, fnids; int total_count, utilization; - int bg_gc; + int bg_gc, inline_inode; unsigned int valid_count, valid_node_count, valid_inode_count; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; @@ -1164,7 +1245,7 @@ struct f2fs_stat_info { static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) { - return (struct f2fs_stat_info*)sbi->stat_info; + return (struct f2fs_stat_info *)sbi->stat_info; } #define stat_inc_call_count(si) ((si)->call_count++) @@ -1173,6 +1254,17 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--) #define stat_inc_total_hit(sb) ((F2FS_SB(sb))->total_hit_ext++) #define stat_inc_read_hit(sb) ((F2FS_SB(sb))->read_hit_ext++) +#define stat_inc_inline_inode(inode) \ + do { \ + if (f2fs_has_inline_data(inode)) \ + ((F2FS_SB(inode->i_sb))->inline_inode++); \ + } while (0) +#define stat_dec_inline_inode(inode) \ + do { \ + if (f2fs_has_inline_data(inode)) \ + ((F2FS_SB(inode->i_sb))->inline_inode--); \ + } while (0) + #define stat_inc_seg_type(sbi, curseg) \ ((sbi)->segment_count[(curseg)->alloc_type]++) #define stat_inc_block_count(sbi, curseg) \ @@ -1216,6 +1308,8 @@ void f2fs_destroy_root_stats(void); #define stat_dec_dirty_dir(sbi) #define stat_inc_total_hit(sb) #define stat_inc_read_hit(sb) +#define stat_inc_inline_inode(inode) +#define stat_dec_inline_inode(inode) #define stat_inc_seg_type(sbi, curseg) #define stat_inc_block_count(sbi, curseg) #define stat_inc_seg_count(si, type) @@ -1238,4 +1332,13 @@ extern const struct address_space_operations f2fs_meta_aops; extern const struct inode_operations f2fs_dir_inode_operations; extern const struct inode_operations f2fs_symlink_inode_operations; extern const struct inode_operations f2fs_special_inode_operations; + +/* + * inline.c + */ +bool f2fs_may_inline(struct inode *); +int f2fs_read_inline_data(struct inode *, struct page *); +int f2fs_convert_inline_data(struct inode *, pgoff_t); +int f2fs_write_inline_data(struct inode *, struct page *, unsigned int); +int recover_inline_data(struct inode *, struct page *); #endif diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7d714f4..0dfcef5 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -33,7 +33,6 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page = vmf->page; struct inode *inode = file_inode(vma->vm_file); struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - block_t old_blk_addr; struct dnode_of_data dn; int err; @@ -44,30 +43,16 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, /* block allocation */ f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, page->index, ALLOC_NODE); - if (err) { - f2fs_unlock_op(sbi); - goto out; - } - - old_blk_addr = dn.data_blkaddr; - - if (old_blk_addr == NULL_ADDR) { - err = reserve_new_block(&dn); - if (err) { - f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); - goto out; - } - } - f2fs_put_dnode(&dn); + err = f2fs_reserve_block(&dn, page->index); f2fs_unlock_op(sbi); + if (err) + goto out; file_update_time(vma->vm_file); lock_page(page); - if (page->mapping != inode->i_mapping || + if (unlikely(page->mapping != inode->i_mapping || page_offset(page) > i_size_read(inode) || - !PageUptodate(page)) { + !PageUptodate(page))) { unlock_page(page); err = -EFAULT; goto out; @@ -130,12 +115,12 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) int ret = 0; bool need_cp = false; struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, + .sync_mode = WB_SYNC_NONE, .nr_to_write = LONG_MAX, .for_reclaim = 0, }; - if (f2fs_readonly(inode->i_sb)) + if (unlikely(f2fs_readonly(inode->i_sb))) return 0; trace_f2fs_sync_file_enter(inode); @@ -217,7 +202,7 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) raw_node = F2FS_NODE(dn->node_page); addr = blkaddr_in_node(raw_node) + ofs; - for ( ; count > 0; count--, addr++, dn->ofs_in_node++) { + for (; count > 0; count--, addr++, dn->ofs_in_node++) { block_t blkaddr = le32_to_cpu(*addr); if (blkaddr == NULL_ADDR) continue; @@ -256,7 +241,7 @@ static void truncate_partial_data_page(struct inode *inode, u64 from) return; lock_page(page); - if (page->mapping != inode->i_mapping) { + if (unlikely(page->mapping != inode->i_mapping)) { f2fs_put_page(page, 1); return; } @@ -266,21 +251,24 @@ static void truncate_partial_data_page(struct inode *inode, u64 from) f2fs_put_page(page, 1); } -static int truncate_blocks(struct inode *inode, u64 from) +int truncate_blocks(struct inode *inode, u64 from) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); unsigned int blocksize = inode->i_sb->s_blocksize; struct dnode_of_data dn; pgoff_t free_from; - int count = 0; - int err; + int count = 0, err = 0; trace_f2fs_truncate_blocks_enter(inode, from); + if (f2fs_has_inline_data(inode)) + goto done; + free_from = (pgoff_t) ((from + blocksize - 1) >> (sbi->log_blocksize)); f2fs_lock_op(sbi); + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE); if (err) { @@ -308,7 +296,7 @@ static int truncate_blocks(struct inode *inode, u64 from) free_next: err = truncate_inode_blocks(inode, free_from); f2fs_unlock_op(sbi); - +done: /* lastly zero out the first data page */ truncate_partial_data_page(inode, from); @@ -382,6 +370,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { + err = f2fs_convert_inline_data(inode, attr->ia_size); + if (err) + return err; + truncate_setsize(inode, attr->ia_size); f2fs_truncate(inode); f2fs_balance_fs(F2FS_SB(inode->i_sb)); @@ -390,7 +382,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) __setattr_copy(inode, attr); if (attr->ia_valid & ATTR_MODE) { - err = f2fs_acl_chmod(inode); + err = posix_acl_chmod(inode, get_inode_mode(inode)); if (err || is_inode_flag_set(fi, FI_ACL_MODE)) { inode->i_mode = fi->i_acl_mode; clear_inode_flag(fi, FI_ACL_MODE); @@ -405,6 +397,7 @@ const struct inode_operations f2fs_file_inode_operations = { .getattr = f2fs_getattr, .setattr = f2fs_setattr, .get_acl = f2fs_get_acl, + .set_acl = f2fs_set_acl, #ifdef CONFIG_F2FS_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, @@ -459,12 +452,16 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) return 0; } -static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode) +static int punch_hole(struct inode *inode, loff_t offset, loff_t len) { pgoff_t pg_start, pg_end; loff_t off_start, off_end; int ret = 0; + ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1); + if (ret) + return ret; + pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; @@ -499,12 +496,6 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode) } } - if (!(mode & FALLOC_FL_KEEP_SIZE) && - i_size_read(inode) <= (offset + len)) { - i_size_write(inode, offset); - mark_inode_dirty(inode); - } - return ret; } @@ -521,6 +512,10 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (ret) return ret; + ret = f2fs_convert_inline_data(inode, offset + len); + if (ret) + return ret; + pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; @@ -532,22 +527,10 @@ static int expand_inode_data(struct inode *inode, loff_t offset, f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, index, ALLOC_NODE); - if (ret) { - f2fs_unlock_op(sbi); - break; - } - - if (dn.data_blkaddr == NULL_ADDR) { - ret = reserve_new_block(&dn); - if (ret) { - f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); - break; - } - } - f2fs_put_dnode(&dn); + ret = f2fs_reserve_block(&dn, index); f2fs_unlock_op(sbi); + if (ret) + break; if (pg_start == pg_end) new_size = offset + len; @@ -578,7 +561,7 @@ static long f2fs_fallocate(struct file *file, int mode, return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) - ret = punch_hole(inode, offset, len, mode); + ret = punch_hole(inode, offset, len); else ret = expand_inode_data(inode, offset, len, mode); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b7ad1ec..ea0371e 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -119,7 +119,6 @@ int start_gc_thread(struct f2fs_sb_info *sbi) kfree(gc_th); sbi->gc_thread = NULL; } - out: return err; } @@ -164,8 +163,8 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, p->ofs_unit = sbi->segs_per_sec; } - if (p->max_search > MAX_VICTIM_SEARCH) - p->max_search = MAX_VICTIM_SEARCH; + if (p->max_search > sbi->max_victim_search) + p->max_search = sbi->max_victim_search; p->offset = sbi->last_victim[p->gc_mode]; } @@ -429,7 +428,7 @@ next_step: /* set page dirty and write it */ if (gc_type == FG_GC) { - f2fs_wait_on_page_writeback(node_page, NODE, true); + f2fs_wait_on_page_writeback(node_page, NODE); set_page_dirty(node_page); } else { if (!PageWriteback(node_page)) @@ -521,6 +520,11 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, static void move_data_page(struct inode *inode, struct page *page, int gc_type) { + struct f2fs_io_info fio = { + .type = DATA, + .rw = WRITE_SYNC, + }; + if (gc_type == BG_GC) { if (PageWriteback(page)) goto out; @@ -529,7 +533,7 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type) } else { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - f2fs_wait_on_page_writeback(page, DATA, true); + f2fs_wait_on_page_writeback(page, DATA); if (clear_page_dirty_for_io(page) && S_ISDIR(inode->i_mode)) { @@ -537,7 +541,7 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type) inode_dec_dirty_dents(inode); } set_cold_data(page); - do_write_data_page(page); + do_write_data_page(page, &fio); clear_cold_data(page); } out: @@ -631,7 +635,7 @@ next_iput: goto next_step; if (gc_type == FG_GC) { - f2fs_submit_bio(sbi, DATA, true); + f2fs_submit_merged_bio(sbi, DATA, WRITE); /* * In the case of FG_GC, it'd be better to reclaim this victim @@ -664,8 +668,6 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, /* read segment summary of victim */ sum_page = get_sum_page(sbi, segno); - if (IS_ERR(sum_page)) - return; blk_start_plug(&plug); @@ -697,7 +699,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&ilist); gc_more: - if (!(sbi->sb->s_flags & MS_ACTIVE)) + if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) goto stop; if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 507056d..5d5eb60 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -20,7 +20,7 @@ #define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ /* Search max. number of dirty segments to select a victim segment */ -#define MAX_VICTIM_SEARCH 4096 /* covers 8GB */ +#define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */ struct f2fs_gc_kthread { struct task_struct *f2fs_gc_task; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c new file mode 100644 index 0000000..31ee5b1 --- /dev/null +++ b/fs/f2fs/inline.c @@ -0,0 +1,222 @@ +/* + * fs/f2fs/inline.c + * Copyright (c) 2013, Intel Corporation + * Authors: Huajun Li <huajun.li@intel.com> + * Haicheng Li <haicheng.li@intel.com> + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/fs.h> +#include <linux/f2fs_fs.h> + +#include "f2fs.h" + +bool f2fs_may_inline(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + block_t nr_blocks; + loff_t i_size; + + if (!test_opt(sbi, INLINE_DATA)) + return false; + + nr_blocks = F2FS_I(inode)->i_xattr_nid ? 3 : 2; + if (inode->i_blocks > nr_blocks) + return false; + + i_size = i_size_read(inode); + if (i_size > MAX_INLINE_DATA) + return false; + + return true; +} + +int f2fs_read_inline_data(struct inode *inode, struct page *page) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct page *ipage; + void *src_addr, *dst_addr; + + if (page->index) { + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + goto out; + } + + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + + zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); + + /* Copy the whole inline data block */ + src_addr = inline_data_addr(ipage); + dst_addr = kmap(page); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + kunmap(page); + f2fs_put_page(ipage, 1); + +out: + SetPageUptodate(page); + unlock_page(page); + + return 0; +} + +static int __f2fs_convert_inline_data(struct inode *inode, struct page *page) +{ + int err; + struct page *ipage; + struct dnode_of_data dn; + void *src_addr, *dst_addr; + block_t new_blk_addr; + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_io_info fio = { + .type = DATA, + .rw = WRITE_SYNC | REQ_PRIO, + }; + + f2fs_lock_op(sbi); + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + + /* + * i_addr[0] is not used for inline data, + * so reserving new block will not destroy inline data + */ + set_new_dnode(&dn, inode, ipage, NULL, 0); + err = f2fs_reserve_block(&dn, 0); + if (err) { + f2fs_unlock_op(sbi); + return err; + } + + zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); + + /* Copy the whole inline data block */ + src_addr = inline_data_addr(ipage); + dst_addr = kmap(page); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + kunmap(page); + SetPageUptodate(page); + + /* write data page to try to make data consistent */ + set_page_writeback(page); + write_data_page(page, &dn, &new_blk_addr, &fio); + update_extent_cache(new_blk_addr, &dn); + f2fs_wait_on_page_writeback(page, DATA); + + /* clear inline data and flag after data writeback */ + zero_user_segment(ipage, INLINE_DATA_OFFSET, + INLINE_DATA_OFFSET + MAX_INLINE_DATA); + clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + stat_dec_inline_inode(inode); + + sync_inode_page(&dn); + f2fs_put_dnode(&dn); + f2fs_unlock_op(sbi); + return err; +} + +int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size) +{ + struct page *page; + int err; + + if (!f2fs_has_inline_data(inode)) + return 0; + else if (to_size <= MAX_INLINE_DATA) + return 0; + + page = grab_cache_page_write_begin(inode->i_mapping, 0, AOP_FLAG_NOFS); + if (!page) + return -ENOMEM; + + err = __f2fs_convert_inline_data(inode, page); + f2fs_put_page(page, 1); + return err; +} + +int f2fs_write_inline_data(struct inode *inode, + struct page *page, unsigned size) +{ + void *src_addr, *dst_addr; + struct page *ipage; + struct dnode_of_data dn; + int err; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = get_dnode_of_data(&dn, 0, LOOKUP_NODE); + if (err) + return err; + ipage = dn.inode_page; + + zero_user_segment(ipage, INLINE_DATA_OFFSET, + INLINE_DATA_OFFSET + MAX_INLINE_DATA); + src_addr = kmap(page); + dst_addr = inline_data_addr(ipage); + memcpy(dst_addr, src_addr, size); + kunmap(page); + + /* Release the first data block if it is allocated */ + if (!f2fs_has_inline_data(inode)) { + truncate_data_blocks_range(&dn, 1); + set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + stat_inc_inline_inode(inode); + } + + sync_inode_page(&dn); + f2fs_put_dnode(&dn); + + return 0; +} + +int recover_inline_data(struct inode *inode, struct page *npage) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_inode *ri = NULL; + void *src_addr, *dst_addr; + struct page *ipage; + + /* + * The inline_data recovery policy is as follows. + * [prev.] [next] of inline_data flag + * o o -> recover inline_data + * o x -> remove inline_data, and then recover data blocks + * x o -> remove inline_data, and then recover inline_data + * x x -> recover data blocks + */ + if (IS_INODE(npage)) + ri = F2FS_INODE(npage); + + if (f2fs_has_inline_data(inode) && + ri && ri->i_inline & F2FS_INLINE_DATA) { +process_inline: + ipage = get_node_page(sbi, inode->i_ino); + f2fs_bug_on(IS_ERR(ipage)); + + src_addr = inline_data_addr(npage); + dst_addr = inline_data_addr(ipage); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + update_inode(inode, ipage); + f2fs_put_page(ipage, 1); + return -1; + } + + if (f2fs_has_inline_data(inode)) { + ipage = get_node_page(sbi, inode->i_ino); + f2fs_bug_on(IS_ERR(ipage)); + zero_user_segment(ipage, INLINE_DATA_OFFSET, + INLINE_DATA_OFFSET + MAX_INLINE_DATA); + clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + update_inode(inode, ipage); + f2fs_put_page(ipage, 1); + } else if (ri && ri->i_inline & F2FS_INLINE_DATA) { + truncate_blocks(inode, 0); + set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + goto process_inline; + } + return 0; +} diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index d0eaa9f..4d67ed7 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -42,9 +42,11 @@ static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { if (ri->i_addr[0]) - inode->i_rdev = old_decode_dev(le32_to_cpu(ri->i_addr[0])); + inode->i_rdev = + old_decode_dev(le32_to_cpu(ri->i_addr[0])); else - inode->i_rdev = new_decode_dev(le32_to_cpu(ri->i_addr[1])); + inode->i_rdev = + new_decode_dev(le32_to_cpu(ri->i_addr[1])); } } @@ -52,11 +54,13 @@ static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri) { if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { if (old_valid_dev(inode->i_rdev)) { - ri->i_addr[0] = cpu_to_le32(old_encode_dev(inode->i_rdev)); + ri->i_addr[0] = + cpu_to_le32(old_encode_dev(inode->i_rdev)); ri->i_addr[1] = 0; } else { ri->i_addr[0] = 0; - ri->i_addr[1] = cpu_to_le32(new_encode_dev(inode->i_rdev)); + ri->i_addr[1] = + cpu_to_le32(new_encode_dev(inode->i_rdev)); ri->i_addr[2] = 0; } } @@ -67,7 +71,6 @@ static int do_read_inode(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct f2fs_inode_info *fi = F2FS_I(inode); struct page *node_page; - struct f2fs_node *rn; struct f2fs_inode *ri; /* Check if ino is within scope */ @@ -81,8 +84,7 @@ static int do_read_inode(struct inode *inode) if (IS_ERR(node_page)) return PTR_ERR(node_page); - rn = F2FS_NODE(node_page); - ri = &(rn->i); + ri = F2FS_INODE(node_page); inode->i_mode = le16_to_cpu(ri->i_mode); i_uid_write(inode, le32_to_cpu(ri->i_uid)); @@ -175,13 +177,11 @@ bad_inode: void update_inode(struct inode *inode, struct page *node_page) { - struct f2fs_node *rn; struct f2fs_inode *ri; - f2fs_wait_on_page_writeback(node_page, NODE, false); + f2fs_wait_on_page_writeback(node_page, NODE); - rn = F2FS_NODE(node_page); - ri = &(rn->i); + ri = F2FS_INODE(node_page); ri->i_mode = cpu_to_le16(inode->i_mode); ri->i_advise = F2FS_I(inode)->i_advise; @@ -281,6 +281,7 @@ void f2fs_evict_inode(struct inode *inode) f2fs_lock_op(sbi); remove_inode_page(inode); + stat_dec_inline_inode(inode); f2fs_unlock_op(sbi); sb_end_intwrite(inode->i_sb); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 575adac..397d459 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -424,11 +424,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } f2fs_set_link(new_dir, new_entry, new_page, old_inode); + F2FS_I(old_inode)->i_pino = new_dir->i_ino; new_inode->i_ctime = CURRENT_TIME; if (old_dir_entry) drop_nlink(new_inode); drop_nlink(new_inode); + mark_inode_dirty(new_inode); if (!new_inode->i_nlink) add_orphan_inode(sbi, new_inode->i_ino); @@ -457,11 +459,14 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_dir != new_dir) { f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); + F2FS_I(old_inode)->i_pino = new_dir->i_ino; + update_inode_page(old_inode); } else { kunmap(old_dir_page); f2fs_put_page(old_dir_page, 0); } drop_nlink(old_dir); + mark_inode_dirty(old_dir); update_inode_page(old_dir); } @@ -496,6 +501,7 @@ const struct inode_operations f2fs_dir_inode_operations = { .getattr = f2fs_getattr, .setattr = f2fs_setattr, .get_acl = f2fs_get_acl, + .set_acl = f2fs_set_acl, #ifdef CONFIG_F2FS_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, @@ -522,6 +528,7 @@ const struct inode_operations f2fs_special_inode_operations = { .getattr = f2fs_getattr, .setattr = f2fs_setattr, .get_acl = f2fs_get_acl, + .set_acl = f2fs_set_acl, #ifdef CONFIG_F2FS_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 4ac4150..b0649b7 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -87,17 +87,19 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) */ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) { - struct address_space *mapping = sbi->meta_inode->i_mapping; + struct address_space *mapping = META_MAPPING(sbi); struct f2fs_nm_info *nm_i = NM_I(sbi); - struct blk_plug plug; struct page *page; pgoff_t index; int i; + struct f2fs_io_info fio = { + .type = META, + .rw = READ_SYNC | REQ_META | REQ_PRIO + }; - blk_start_plug(&plug); for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) { - if (nid >= nm_i->max_nid) + if (unlikely(nid >= nm_i->max_nid)) nid = 0; index = current_nat_addr(sbi, nid); @@ -105,15 +107,15 @@ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) if (!page) continue; if (PageUptodate(page)) { + mark_page_accessed(page); f2fs_put_page(page, 1); continue; } - if (f2fs_readpage(sbi, page, index, READ)) - continue; - + f2fs_submit_page_mbio(sbi, page, index, &fio); + mark_page_accessed(page); f2fs_put_page(page, 0); } - blk_finish_plug(&plug); + f2fs_submit_merged_bio(sbi, META, READ); } static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) @@ -391,8 +393,8 @@ got: /* * Caller should call f2fs_put_dnode(dn). - * Also, it should grab and release a mutex by calling mutex_lock_op() and - * mutex_unlock_op() only if ro is not set RDONLY_NODE. + * Also, it should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op() only if ro is not set RDONLY_NODE. * In the case of RDONLY_NODE, we don't need to care about mutex. */ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) @@ -502,7 +504,7 @@ static void truncate_node(struct dnode_of_data *dn) /* Deallocate node address */ invalidate_blocks(sbi, ni.blk_addr); - dec_valid_node_count(sbi, dn->inode, 1); + dec_valid_node_count(sbi, dn->inode); set_node_addr(sbi, &ni, NULL_ADDR); if (dn->nid == dn->inode->i_ino) { @@ -516,6 +518,10 @@ invalidate: F2FS_SET_SB_DIRT(sbi); f2fs_put_page(dn->node_page, 1); + + invalidate_mapping_pages(NODE_MAPPING(sbi), + dn->node_page->index, dn->node_page->index); + dn->node_page = NULL; trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr); } @@ -631,19 +637,19 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, return 0; /* get indirect nodes in the path */ - for (i = 0; i < depth - 1; i++) { + for (i = 0; i < idx + 1; i++) { /* refernece count'll be increased */ pages[i] = get_node_page(sbi, nid[i]); if (IS_ERR(pages[i])) { - depth = i + 1; err = PTR_ERR(pages[i]); + idx = i - 1; goto fail; } nid[i + 1] = get_nid(pages[i], offset[i + 1], false); } /* free direct nodes linked to a partial indirect node */ - for (i = offset[depth - 1]; i < NIDS_PER_BLOCK; i++) { + for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) { child_nid = get_nid(pages[idx], i, false); if (!child_nid) continue; @@ -654,7 +660,7 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, set_nid(pages[idx], i, 0, false); } - if (offset[depth - 1] == 0) { + if (offset[idx + 1] == 0) { dn->node_page = pages[idx]; dn->nid = nid[idx]; truncate_node(dn); @@ -662,9 +668,10 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, f2fs_put_page(pages[idx], 1); } offset[idx]++; - offset[depth - 1] = 0; + offset[idx + 1] = 0; + idx--; fail: - for (i = depth - 3; i >= 0; i--) + for (i = idx; i >= 0; i--) f2fs_put_page(pages[i], 1); trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err); @@ -678,11 +685,10 @@ fail: int truncate_inode_blocks(struct inode *inode, pgoff_t from) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct address_space *node_mapping = sbi->node_inode->i_mapping; int err = 0, cont = 1; int level, offset[4], noffset[4]; unsigned int nofs = 0; - struct f2fs_node *rn; + struct f2fs_inode *ri; struct dnode_of_data dn; struct page *page; @@ -699,7 +705,7 @@ restart: set_new_dnode(&dn, inode, page, NULL, 0); unlock_page(page); - rn = F2FS_NODE(page); + ri = F2FS_INODE(page); switch (level) { case 0: case 1: @@ -709,7 +715,7 @@ restart: nofs = noffset[1]; if (!offset[level - 1]) goto skip_partial; - err = truncate_partial_nodes(&dn, &rn->i, offset, level); + err = truncate_partial_nodes(&dn, ri, offset, level); if (err < 0 && err != -ENOENT) goto fail; nofs += 1 + NIDS_PER_BLOCK; @@ -718,7 +724,7 @@ restart: nofs = 5 + 2 * NIDS_PER_BLOCK; if (!offset[level - 1]) goto skip_partial; - err = truncate_partial_nodes(&dn, &rn->i, offset, level); + err = truncate_partial_nodes(&dn, ri, offset, level); if (err < 0 && err != -ENOENT) goto fail; break; @@ -728,7 +734,7 @@ restart: skip_partial: while (cont) { - dn.nid = le32_to_cpu(rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]); + dn.nid = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]); switch (offset[0]) { case NODE_DIR1_BLOCK: case NODE_DIR2_BLOCK: @@ -751,14 +757,14 @@ skip_partial: if (err < 0 && err != -ENOENT) goto fail; if (offset[1] == 0 && - rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]) { + ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) { lock_page(page); - if (page->mapping != node_mapping) { + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { f2fs_put_page(page, 1); goto restart; } wait_on_page_writeback(page); - rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; + ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; set_page_dirty(page); unlock_page(page); } @@ -794,38 +800,34 @@ int truncate_xattr_node(struct inode *inode, struct page *page) set_new_dnode(&dn, inode, page, npage, nid); if (page) - dn.inode_page_locked = 1; + dn.inode_page_locked = true; truncate_node(&dn); return 0; } /* - * Caller should grab and release a mutex by calling mutex_lock_op() and - * mutex_unlock_op(). + * Caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). */ -int remove_inode_page(struct inode *inode) +void remove_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct page *page; nid_t ino = inode->i_ino; struct dnode_of_data dn; - int err; page = get_node_page(sbi, ino); if (IS_ERR(page)) - return PTR_ERR(page); + return; - err = truncate_xattr_node(inode, page); - if (err) { + if (truncate_xattr_node(inode, page)) { f2fs_put_page(page, 1); - return err; + return; } - /* 0 is possible, after f2fs_new_inode() is failed */ f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1); set_new_dnode(&dn, inode, page, page, ino); truncate_node(&dn); - return 0; } struct page *new_inode_page(struct inode *inode, const struct qstr *name) @@ -843,19 +845,18 @@ struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); - struct address_space *mapping = sbi->node_inode->i_mapping; struct node_info old_ni, new_ni; struct page *page; int err; - if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)) + if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) return ERR_PTR(-EPERM); - page = grab_cache_page(mapping, dn->nid); + page = grab_cache_page(NODE_MAPPING(sbi), dn->nid); if (!page) return ERR_PTR(-ENOMEM); - if (!inc_valid_node_count(sbi, dn->inode, 1)) { + if (unlikely(!inc_valid_node_count(sbi, dn->inode))) { err = -ENOSPC; goto fail; } @@ -898,14 +899,14 @@ fail: * LOCKED_PAGE: f2fs_put_page(page, 1) * error: nothing */ -static int read_node_page(struct page *page, int type) +static int read_node_page(struct page *page, int rw) { struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); struct node_info ni; get_node_info(sbi, page->index, &ni); - if (ni.blk_addr == NULL_ADDR) { + if (unlikely(ni.blk_addr == NULL_ADDR)) { f2fs_put_page(page, 1); return -ENOENT; } @@ -913,7 +914,7 @@ static int read_node_page(struct page *page, int type) if (PageUptodate(page)) return LOCKED_PAGE; - return f2fs_readpage(sbi, page, ni.blk_addr, type); + return f2fs_submit_page_bio(sbi, page, ni.blk_addr, rw); } /* @@ -921,18 +922,17 @@ static int read_node_page(struct page *page, int type) */ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) { - struct address_space *mapping = sbi->node_inode->i_mapping; struct page *apage; int err; - apage = find_get_page(mapping, nid); + apage = find_get_page(NODE_MAPPING(sbi), nid); if (apage && PageUptodate(apage)) { f2fs_put_page(apage, 0); return; } f2fs_put_page(apage, 0); - apage = grab_cache_page(mapping, nid); + apage = grab_cache_page(NODE_MAPPING(sbi), nid); if (!apage) return; @@ -945,11 +945,10 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) { - struct address_space *mapping = sbi->node_inode->i_mapping; struct page *page; int err; repeat: - page = grab_cache_page(mapping, nid); + page = grab_cache_page(NODE_MAPPING(sbi), nid); if (!page) return ERR_PTR(-ENOMEM); @@ -960,11 +959,11 @@ repeat: goto got_it; lock_page(page); - if (!PageUptodate(page)) { + if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); return ERR_PTR(-EIO); } - if (page->mapping != mapping) { + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { f2fs_put_page(page, 1); goto repeat; } @@ -981,7 +980,6 @@ got_it: struct page *get_node_page_ra(struct page *parent, int start) { struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb); - struct address_space *mapping = sbi->node_inode->i_mapping; struct blk_plug plug; struct page *page; int err, i, end; @@ -992,7 +990,7 @@ struct page *get_node_page_ra(struct page *parent, int start) if (!nid) return ERR_PTR(-ENOENT); repeat: - page = grab_cache_page(mapping, nid); + page = grab_cache_page(NODE_MAPPING(sbi), nid); if (!page) return ERR_PTR(-ENOMEM); @@ -1017,12 +1015,12 @@ repeat: blk_finish_plug(&plug); lock_page(page); - if (page->mapping != mapping) { + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { f2fs_put_page(page, 1); goto repeat; } page_hit: - if (!PageUptodate(page)) { + if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); return ERR_PTR(-EIO); } @@ -1048,7 +1046,6 @@ void sync_inode_page(struct dnode_of_data *dn) int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, struct writeback_control *wbc) { - struct address_space *mapping = sbi->node_inode->i_mapping; pgoff_t index, end; struct pagevec pvec; int step = ino ? 2 : 0; @@ -1062,7 +1059,7 @@ next_step: while (index <= end) { int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, PAGECACHE_TAG_DIRTY, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); if (nr_pages == 0) @@ -1095,7 +1092,7 @@ next_step: else if (!trylock_page(page)) continue; - if (unlikely(page->mapping != mapping)) { + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { continue_unlock: unlock_page(page); continue; @@ -1122,7 +1119,7 @@ continue_unlock: set_fsync_mark(page, 0); set_dentry_mark(page, 0); } - mapping->a_ops->writepage(page, wbc); + NODE_MAPPING(sbi)->a_ops->writepage(page, wbc); wrote++; if (--wbc->nr_to_write == 0) @@ -1143,31 +1140,31 @@ continue_unlock: } if (wrote) - f2fs_submit_bio(sbi, NODE, wbc->sync_mode == WB_SYNC_ALL); - + f2fs_submit_merged_bio(sbi, NODE, WRITE); return nwritten; } int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) { - struct address_space *mapping = sbi->node_inode->i_mapping; pgoff_t index = 0, end = LONG_MAX; struct pagevec pvec; - int nr_pages; int ret2 = 0, ret = 0; pagevec_init(&pvec, 0); - while ((index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_WRITEBACK, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { - unsigned i; + + while (index <= end) { + int i, nr_pages; + nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_WRITEBACK, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; /* until radix tree lookup accepts end_index */ - if (page->index > end) + if (unlikely(page->index > end)) continue; if (ino && ino_of_node(page) == ino) { @@ -1180,9 +1177,9 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) cond_resched(); } - if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) + if (unlikely(test_and_clear_bit(AS_ENOSPC, &NODE_MAPPING(sbi)->flags))) ret2 = -ENOSPC; - if (test_and_clear_bit(AS_EIO, &mapping->flags)) + if (unlikely(test_and_clear_bit(AS_EIO, &NODE_MAPPING(sbi)->flags))) ret2 = -EIO; if (!ret) ret = ret2; @@ -1196,8 +1193,12 @@ static int f2fs_write_node_page(struct page *page, nid_t nid; block_t new_addr; struct node_info ni; + struct f2fs_io_info fio = { + .type = NODE, + .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, + }; - if (sbi->por_doing) + if (unlikely(sbi->por_doing)) goto redirty_out; wait_on_page_writeback(page); @@ -1209,7 +1210,7 @@ static int f2fs_write_node_page(struct page *page, get_node_info(sbi, nid, &ni); /* This page is already truncated */ - if (ni.blk_addr == NULL_ADDR) { + if (unlikely(ni.blk_addr == NULL_ADDR)) { dec_page_count(sbi, F2FS_DIRTY_NODES); unlock_page(page); return 0; @@ -1220,7 +1221,7 @@ static int f2fs_write_node_page(struct page *page, mutex_lock(&sbi->node_write); set_page_writeback(page); - write_node_page(sbi, page, nid, ni.blk_addr, &new_addr); + write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr); set_node_addr(sbi, &ni, new_addr); dec_page_count(sbi, F2FS_DIRTY_NODES); mutex_unlock(&sbi->node_write); @@ -1255,6 +1256,7 @@ static int f2fs_write_node_pages(struct address_space *mapping, /* if mounting is failed, skip writing node pages */ wbc->nr_to_write = 3 * max_hw_blocks(sbi); + wbc->sync_mode = WB_SYNC_NONE; sync_node_pages(sbi, 0, wbc); wbc->nr_to_write = nr_to_write - (3 * max_hw_blocks(sbi) - wbc->nr_to_write); @@ -1333,7 +1335,7 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build) return -1; /* 0 nid should not be used */ - if (nid == 0) + if (unlikely(nid == 0)) return 0; if (build) { @@ -1386,7 +1388,7 @@ static void scan_nat_page(struct f2fs_nm_info *nm_i, for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) { - if (start_nid >= nm_i->max_nid) + if (unlikely(start_nid >= nm_i->max_nid)) break; blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); @@ -1420,7 +1422,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi) f2fs_put_page(page, 1); nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK)); - if (nid >= nm_i->max_nid) + if (unlikely(nid >= nm_i->max_nid)) nid = 0; if (i++ == FREE_NID_PAGES) @@ -1454,7 +1456,7 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct free_nid *i = NULL; struct list_head *this; retry: - if (sbi->total_valid_node_count + 1 >= nm_i->max_nid) + if (unlikely(sbi->total_valid_node_count + 1 >= nm_i->max_nid)) return false; spin_lock(&nm_i->free_nid_list_lock); @@ -1535,13 +1537,12 @@ void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) { - struct address_space *mapping = sbi->node_inode->i_mapping; - struct f2fs_node *src, *dst; + struct f2fs_inode *src, *dst; nid_t ino = ino_of_node(page); struct node_info old_ni, new_ni; struct page *ipage; - ipage = grab_cache_page(mapping, ino); + ipage = grab_cache_page(NODE_MAPPING(sbi), ino); if (!ipage) return -ENOMEM; @@ -1552,19 +1553,19 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) SetPageUptodate(ipage); fill_node_footer(ipage, ino, ino, 0, true); - src = F2FS_NODE(page); - dst = F2FS_NODE(ipage); + src = F2FS_INODE(page); + dst = F2FS_INODE(ipage); - memcpy(dst, src, (unsigned long)&src->i.i_ext - (unsigned long)&src->i); - dst->i.i_size = 0; - dst->i.i_blocks = cpu_to_le64(1); - dst->i.i_links = cpu_to_le32(1); - dst->i.i_xattr_nid = 0; + memcpy(dst, src, (unsigned long)&src->i_ext - (unsigned long)src); + dst->i_size = 0; + dst->i_blocks = cpu_to_le64(1); + dst->i_links = cpu_to_le32(1); + dst->i_xattr_nid = 0; new_ni = old_ni; new_ni.ino = ino; - if (!inc_valid_node_count(sbi, NULL, 1)) + if (unlikely(!inc_valid_node_count(sbi, NULL))) WARN_ON(1); set_node_addr(sbi, &new_ni, NEW_ADDR); inc_valid_inode_count(sbi); @@ -1572,47 +1573,88 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) return 0; } +/* + * ra_sum_pages() merge contiguous pages into one bio and submit. + * these pre-readed pages are linked in pages list. + */ +static int ra_sum_pages(struct f2fs_sb_info *sbi, struct list_head *pages, + int start, int nrpages) +{ + struct page *page; + int page_idx = start; + struct f2fs_io_info fio = { + .type = META, + .rw = READ_SYNC | REQ_META | REQ_PRIO + }; + + for (; page_idx < start + nrpages; page_idx++) { + /* alloc temporal page for read node summary info*/ + page = alloc_page(GFP_F2FS_ZERO); + if (!page) { + struct page *tmp; + list_for_each_entry_safe(page, tmp, pages, lru) { + list_del(&page->lru); + unlock_page(page); + __free_pages(page, 0); + } + return -ENOMEM; + } + + lock_page(page); + page->index = page_idx; + list_add_tail(&page->lru, pages); + } + + list_for_each_entry(page, pages, lru) + f2fs_submit_page_mbio(sbi, page, page->index, &fio); + + f2fs_submit_merged_bio(sbi, META, READ); + return 0; +} + int restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum) { struct f2fs_node *rn; struct f2fs_summary *sum_entry; - struct page *page; + struct page *page, *tmp; block_t addr; - int i, last_offset; - - /* alloc temporal page for read node */ - page = alloc_page(GFP_NOFS | __GFP_ZERO); - if (!page) - return -ENOMEM; - lock_page(page); + int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + int i, last_offset, nrpages, err = 0; + LIST_HEAD(page_list); /* scan the node segment */ last_offset = sbi->blocks_per_seg; addr = START_BLOCK(sbi, segno); sum_entry = &sum->entries[0]; - for (i = 0; i < last_offset; i++, sum_entry++) { - /* - * In order to read next node page, - * we must clear PageUptodate flag. - */ - ClearPageUptodate(page); + for (i = 0; i < last_offset; i += nrpages, addr += nrpages) { + nrpages = min(last_offset - i, bio_blocks); - if (f2fs_readpage(sbi, page, addr, READ_SYNC)) - goto out; + /* read ahead node pages */ + err = ra_sum_pages(sbi, &page_list, addr, nrpages); + if (err) + return err; - lock_page(page); - rn = F2FS_NODE(page); - sum_entry->nid = rn->footer.nid; - sum_entry->version = 0; - sum_entry->ofs_in_node = 0; - addr++; + list_for_each_entry_safe(page, tmp, &page_list, lru) { + + lock_page(page); + if (unlikely(!PageUptodate(page))) { + err = -EIO; + } else { + rn = F2FS_NODE(page); + sum_entry->nid = rn->footer.nid; + sum_entry->version = 0; + sum_entry->ofs_in_node = 0; + sum_entry++; + } + + list_del(&page->lru); + unlock_page(page); + __free_pages(page, 0); + } } - unlock_page(page); -out: - __free_pages(page, 0); - return 0; + return err; } static bool flush_nats_in_journal(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 3496bb3..c4c7988 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -224,7 +224,13 @@ static inline block_t next_blkaddr_of_node(struct page *node_page) * | `- direct node (5 + N => 5 + 2N - 1) * `- double indirect node (5 + 2N) * `- indirect node (6 + 2N) - * `- direct node (x(N + 1)) + * `- direct node + * ...... + * `- indirect node ((6 + 2N) + x(N + 1)) + * `- direct node + * ...... + * `- indirect node ((6 + 2N) + (N - 1)(N + 1)) + * `- direct node */ static inline bool IS_DNODE(struct page *node_page) { diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index fdc8116..976a7a9 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -40,8 +40,7 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, static int recover_dentry(struct page *ipage, struct inode *inode) { - struct f2fs_node *raw_node = F2FS_NODE(ipage); - struct f2fs_inode *raw_inode = &(raw_node->i); + struct f2fs_inode *raw_inode = F2FS_INODE(ipage); nid_t pino = le32_to_cpu(raw_inode->i_pino); struct f2fs_dir_entry *de; struct qstr name; @@ -62,6 +61,12 @@ static int recover_dentry(struct page *ipage, struct inode *inode) name.len = le32_to_cpu(raw_inode->i_namelen); name.name = raw_inode->i_name; + + if (unlikely(name.len > F2FS_NAME_LEN)) { + WARN_ON(1); + err = -ENAMETOOLONG; + goto out; + } retry: de = f2fs_find_entry(dir, &name, &page); if (de && inode->i_ino == le32_to_cpu(de->ino)) @@ -90,17 +95,16 @@ out_unmap_put: kunmap(page); f2fs_put_page(page, 0); out: - f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode and its dentry: " - "ino = %x, name = %s, dir = %lx, err = %d", - ino_of_node(ipage), raw_inode->i_name, + f2fs_msg(inode->i_sb, KERN_NOTICE, + "%s: ino = %x, name = %s, dir = %lx, err = %d", + __func__, ino_of_node(ipage), raw_inode->i_name, IS_ERR(dir) ? 0 : dir->i_ino, err); return err; } static int recover_inode(struct inode *inode, struct page *node_page) { - struct f2fs_node *raw_node = F2FS_NODE(node_page); - struct f2fs_inode *raw_inode = &(raw_node->i); + struct f2fs_inode *raw_inode = F2FS_INODE(node_page); if (!IS_INODE(node_page)) return 0; @@ -143,9 +147,9 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) while (1) { struct fsync_inode_entry *entry; - err = f2fs_readpage(sbi, page, blkaddr, READ_SYNC); + err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC); if (err) - goto out; + return err; lock_page(page); @@ -191,9 +195,10 @@ next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); } + unlock_page(page); -out: __free_pages(page, 0); + return err; } @@ -293,6 +298,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, struct node_info ni; int err = 0, recovered = 0; + if (recover_inline_data(inode, page)) + goto out; + start = start_bidx_of_node(ofs_of_node(page), fi); if (IS_INODE(page)) end = start + ADDRS_PER_INODE(fi); @@ -300,12 +308,13 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, end = start + ADDRS_PER_BLOCK; f2fs_lock_op(sbi); + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, start, ALLOC_NODE); if (err) { f2fs_unlock_op(sbi); - return err; + goto out; } wait_on_page_writeback(dn.node_page); @@ -356,10 +365,10 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, err: f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); - - f2fs_msg(sbi->sb, KERN_NOTICE, "recover_data: ino = %lx, " - "recovered_data = %d blocks, err = %d", - inode->i_ino, recovered, err); +out: + f2fs_msg(sbi->sb, KERN_NOTICE, + "recover_data: ino = %lx, recovered = %d blocks, err = %d", + inode->i_ino, recovered, err); return err; } @@ -377,7 +386,7 @@ static int recover_data(struct f2fs_sb_info *sbi, blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); /* read node page */ - page = alloc_page(GFP_NOFS | __GFP_ZERO); + page = alloc_page(GFP_F2FS_ZERO); if (!page) return -ENOMEM; @@ -386,9 +395,9 @@ static int recover_data(struct f2fs_sb_info *sbi, while (1) { struct fsync_inode_entry *entry; - err = f2fs_readpage(sbi, page, blkaddr, READ_SYNC); + err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC); if (err) - goto out; + return err; lock_page(page); @@ -412,8 +421,8 @@ next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); } + unlock_page(page); -out: __free_pages(page, 0); if (!err) @@ -429,7 +438,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", sizeof(struct fsync_inode_entry), NULL); - if (unlikely(!fsync_entry_slab)) + if (!fsync_entry_slab) return -ENOMEM; INIT_LIST_HEAD(&inode_list); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fa284d3..7caac5f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -14,12 +14,163 @@ #include <linux/blkdev.h> #include <linux/prefetch.h> #include <linux/vmalloc.h> +#include <linux/swap.h> #include "f2fs.h" #include "segment.h" #include "node.h" #include <trace/events/f2fs.h> +#define __reverse_ffz(x) __reverse_ffs(~(x)) + +static struct kmem_cache *discard_entry_slab; + +/* + * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since + * MSB and LSB are reversed in a byte by f2fs_set_bit. + */ +static inline unsigned long __reverse_ffs(unsigned long word) +{ + int num = 0; + +#if BITS_PER_LONG == 64 + if ((word & 0xffffffff) == 0) { + num += 32; + word >>= 32; + } +#endif + if ((word & 0xffff) == 0) { + num += 16; + word >>= 16; + } + if ((word & 0xff) == 0) { + num += 8; + word >>= 8; + } + if ((word & 0xf0) == 0) + num += 4; + else + word >>= 4; + if ((word & 0xc) == 0) + num += 2; + else + word >>= 2; + if ((word & 0x2) == 0) + num += 1; + return num; +} + +/* + * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c becasue + * f2fs_set_bit makes MSB and LSB reversed in a byte. + * Example: + * LSB <--> MSB + * f2fs_set_bit(0, bitmap) => 0000 0001 + * f2fs_set_bit(7, bitmap) => 1000 0000 + */ +static unsigned long __find_rev_next_bit(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + const unsigned long *p = addr + BIT_WORD(offset); + unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long tmp; + unsigned long mask, submask; + unsigned long quot, rest; + + if (offset >= size) + return size; + + size -= result; + offset %= BITS_PER_LONG; + if (!offset) + goto aligned; + + tmp = *(p++); + quot = (offset >> 3) << 3; + rest = offset & 0x7; + mask = ~0UL << quot; + submask = (unsigned char)(0xff << rest) >> rest; + submask <<= quot; + mask &= submask; + tmp &= mask; + if (size < BITS_PER_LONG) + goto found_first; + if (tmp) + goto found_middle; + + size -= BITS_PER_LONG; + result += BITS_PER_LONG; +aligned: + while (size & ~(BITS_PER_LONG-1)) { + tmp = *(p++); + if (tmp) + goto found_middle; + result += BITS_PER_LONG; + size -= BITS_PER_LONG; + } + if (!size) + return result; + tmp = *p; +found_first: + tmp &= (~0UL >> (BITS_PER_LONG - size)); + if (tmp == 0UL) /* Are any bits set? */ + return result + size; /* Nope. */ +found_middle: + return result + __reverse_ffs(tmp); +} + +static unsigned long __find_rev_next_zero_bit(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + const unsigned long *p = addr + BIT_WORD(offset); + unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long tmp; + unsigned long mask, submask; + unsigned long quot, rest; + + if (offset >= size) + return size; + + size -= result; + offset %= BITS_PER_LONG; + if (!offset) + goto aligned; + + tmp = *(p++); + quot = (offset >> 3) << 3; + rest = offset & 0x7; + mask = ~(~0UL << quot); + submask = (unsigned char)~((unsigned char)(0xff << rest) >> rest); + submask <<= quot; + mask += submask; + tmp |= mask; + if (size < BITS_PER_LONG) + goto found_first; + if (~tmp) + goto found_middle; + + size -= BITS_PER_LONG; + result += BITS_PER_LONG; +aligned: + while (size & ~(BITS_PER_LONG - 1)) { + tmp = *(p++); + if (~tmp) + goto found_middle; + result += BITS_PER_LONG; + size -= BITS_PER_LONG; + } + if (!size) + return result; + tmp = *p; + +found_first: + tmp |= ~0UL << size; + if (tmp == ~0UL) /* Are any bits zero? */ + return result + size; /* Nope. */ +found_middle: + return result + __reverse_ffz(tmp); +} + /* * This function balances dirty node and dentry pages. * In addition, it controls garbage collection. @@ -116,6 +267,56 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } +static void f2fs_issue_discard(struct f2fs_sb_info *sbi, + block_t blkstart, block_t blklen) +{ + sector_t start = SECTOR_FROM_BLOCK(sbi, blkstart); + sector_t len = SECTOR_FROM_BLOCK(sbi, blklen); + blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); + trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); +} + +static void add_discard_addrs(struct f2fs_sb_info *sbi, + unsigned int segno, struct seg_entry *se) +{ + struct list_head *head = &SM_I(sbi)->discard_list; + struct discard_entry *new; + int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); + int max_blocks = sbi->blocks_per_seg; + unsigned long *cur_map = (unsigned long *)se->cur_valid_map; + unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; + unsigned long dmap[entries]; + unsigned int start = 0, end = -1; + int i; + + if (!test_opt(sbi, DISCARD)) + return; + + /* zero block will be discarded through the prefree list */ + if (!se->valid_blocks || se->valid_blocks == max_blocks) + return; + + /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */ + for (i = 0; i < entries; i++) + dmap[i] = (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i]; + + while (SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) { + start = __find_rev_next_bit(dmap, max_blocks, end + 1); + if (start >= max_blocks) + break; + + end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1); + + new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS); + INIT_LIST_HEAD(&new->list); + new->blkaddr = START_BLOCK(sbi, segno) + start; + new->len = end - start; + + list_add_tail(&new->list, head); + SM_I(sbi)->nr_discards += end - start; + } +} + /* * Should call clear_prefree_segments after checkpoint is done. */ @@ -138,6 +339,9 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi) { + struct list_head *head = &(SM_I(sbi)->discard_list); + struct list_head *this, *next; + struct discard_entry *entry; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int total_segs = TOTAL_SEGS(sbi); @@ -160,14 +364,19 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi) if (!test_opt(sbi, DISCARD)) continue; - blkdev_issue_discard(sbi->sb->s_bdev, - START_BLOCK(sbi, start) << - sbi->log_sectors_per_block, - (1 << (sbi->log_sectors_per_block + - sbi->log_blocks_per_seg)) * (end - start), - GFP_NOFS, 0); + f2fs_issue_discard(sbi, START_BLOCK(sbi, start), + (end - start) << sbi->log_blocks_per_seg); } mutex_unlock(&dirty_i->seglist_lock); + + /* send small discards */ + list_for_each_safe(this, next, head) { + entry = list_entry(this, struct discard_entry, list); + f2fs_issue_discard(sbi, entry->blkaddr, entry->len); + list_del(&entry->list); + SM_I(sbi)->nr_discards -= entry->len; + kmem_cache_free(discard_entry_slab, entry); + } } static void __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) @@ -459,13 +668,18 @@ static void __next_free_blkoff(struct f2fs_sb_info *sbi, struct curseg_info *seg, block_t start) { struct seg_entry *se = get_seg_entry(sbi, seg->segno); - block_t ofs; - for (ofs = start; ofs < sbi->blocks_per_seg; ofs++) { - if (!f2fs_test_bit(ofs, se->ckpt_valid_map) - && !f2fs_test_bit(ofs, se->cur_valid_map)) - break; - } - seg->next_blkoff = ofs; + int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); + unsigned long target_map[entries]; + unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; + unsigned long *cur_map = (unsigned long *)se->cur_valid_map; + int i, pos; + + for (i = 0; i < entries; i++) + target_map[i] = ckpt_map[i] | cur_map[i]; + + pos = __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start); + + seg->next_blkoff = pos; } /* @@ -573,148 +787,6 @@ static const struct segment_allocation default_salloc_ops = { .allocate_segment = allocate_segment_by_default, }; -static void f2fs_end_io_write(struct bio *bio, int err) -{ - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - struct bio_private *p = bio->bi_private; - - do { - struct page *page = bvec->bv_page; - - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - if (!uptodate) { - SetPageError(page); - if (page->mapping) - set_bit(AS_EIO, &page->mapping->flags); - set_ckpt_flags(p->sbi->ckpt, CP_ERROR_FLAG); - p->sbi->sb->s_flags |= MS_RDONLY; - } - end_page_writeback(page); - dec_page_count(p->sbi, F2FS_WRITEBACK); - } while (bvec >= bio->bi_io_vec); - - if (p->is_sync) - complete(p->wait); - - if (!get_pages(p->sbi, F2FS_WRITEBACK) && - !list_empty(&p->sbi->cp_wait.task_list)) - wake_up(&p->sbi->cp_wait); - - kfree(p); - bio_put(bio); -} - -struct bio *f2fs_bio_alloc(struct block_device *bdev, int npages) -{ - struct bio *bio; - - /* No failure on bio allocation */ - bio = bio_alloc(GFP_NOIO, npages); - bio->bi_bdev = bdev; - bio->bi_private = NULL; - - return bio; -} - -static void do_submit_bio(struct f2fs_sb_info *sbi, - enum page_type type, bool sync) -{ - int rw = sync ? WRITE_SYNC : WRITE; - enum page_type btype = type > META ? META : type; - - if (type >= META_FLUSH) - rw = WRITE_FLUSH_FUA; - - if (btype == META) - rw |= REQ_META; - - if (sbi->bio[btype]) { - struct bio_private *p = sbi->bio[btype]->bi_private; - p->sbi = sbi; - sbi->bio[btype]->bi_end_io = f2fs_end_io_write; - - trace_f2fs_do_submit_bio(sbi->sb, btype, sync, sbi->bio[btype]); - - if (type == META_FLUSH) { - DECLARE_COMPLETION_ONSTACK(wait); - p->is_sync = true; - p->wait = &wait; - submit_bio(rw, sbi->bio[btype]); - wait_for_completion(&wait); - } else { - p->is_sync = false; - submit_bio(rw, sbi->bio[btype]); - } - sbi->bio[btype] = NULL; - } -} - -void f2fs_submit_bio(struct f2fs_sb_info *sbi, enum page_type type, bool sync) -{ - down_write(&sbi->bio_sem); - do_submit_bio(sbi, type, sync); - up_write(&sbi->bio_sem); -} - -static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page, - block_t blk_addr, enum page_type type) -{ - struct block_device *bdev = sbi->sb->s_bdev; - int bio_blocks; - - verify_block_addr(sbi, blk_addr); - - down_write(&sbi->bio_sem); - - inc_page_count(sbi, F2FS_WRITEBACK); - - if (sbi->bio[type] && sbi->last_block_in_bio[type] != blk_addr - 1) - do_submit_bio(sbi, type, false); -alloc_new: - if (sbi->bio[type] == NULL) { - struct bio_private *priv; -retry: - priv = kmalloc(sizeof(struct bio_private), GFP_NOFS); - if (!priv) { - cond_resched(); - goto retry; - } - - bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); - sbi->bio[type] = f2fs_bio_alloc(bdev, bio_blocks); - sbi->bio[type]->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); - sbi->bio[type]->bi_private = priv; - /* - * The end_io will be assigned at the sumbission phase. - * Until then, let bio_add_page() merge consecutive IOs as much - * as possible. - */ - } - - if (bio_add_page(sbi->bio[type], page, PAGE_CACHE_SIZE, 0) < - PAGE_CACHE_SIZE) { - do_submit_bio(sbi, type, false); - goto alloc_new; - } - - sbi->last_block_in_bio[type] = blk_addr; - - up_write(&sbi->bio_sem); - trace_f2fs_submit_write_page(page, blk_addr, type); -} - -void f2fs_wait_on_page_writeback(struct page *page, - enum page_type type, bool sync) -{ - struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); - if (PageWriteback(page)) { - f2fs_submit_bio(sbi, type, sync); - wait_on_page_writeback(page); - } -} - static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -782,16 +854,14 @@ static int __get_segment_type(struct page *page, enum page_type p_type) return __get_segment_type_6(page, p_type); } -static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, - block_t old_blkaddr, block_t *new_blkaddr, - struct f2fs_summary *sum, enum page_type p_type) +void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, + block_t old_blkaddr, block_t *new_blkaddr, + struct f2fs_summary *sum, int type) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg; unsigned int old_cursegno; - int type; - type = __get_segment_type(page, p_type); curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); @@ -824,49 +894,64 @@ static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); mutex_unlock(&sit_i->sentry_lock); - if (p_type == NODE) + if (page && IS_NODESEG(type)) fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); - /* writeout dirty page into bdev */ - submit_write_page(sbi, page, *new_blkaddr, p_type); - mutex_unlock(&curseg->curseg_mutex); } +static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, + block_t old_blkaddr, block_t *new_blkaddr, + struct f2fs_summary *sum, struct f2fs_io_info *fio) +{ + int type = __get_segment_type(page, fio->type); + + allocate_data_block(sbi, page, old_blkaddr, new_blkaddr, sum, type); + + /* writeout dirty page into bdev */ + f2fs_submit_page_mbio(sbi, page, *new_blkaddr, fio); +} + void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) { + struct f2fs_io_info fio = { + .type = META, + .rw = WRITE_SYNC | REQ_META | REQ_PRIO + }; + set_page_writeback(page); - submit_write_page(sbi, page, page->index, META); + f2fs_submit_page_mbio(sbi, page, page->index, &fio); } void write_node_page(struct f2fs_sb_info *sbi, struct page *page, + struct f2fs_io_info *fio, unsigned int nid, block_t old_blkaddr, block_t *new_blkaddr) { struct f2fs_summary sum; set_summary(&sum, nid, 0, 0); - do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, NODE); + do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, fio); } -void write_data_page(struct inode *inode, struct page *page, - struct dnode_of_data *dn, block_t old_blkaddr, - block_t *new_blkaddr) +void write_data_page(struct page *page, struct dnode_of_data *dn, + block_t *new_blkaddr, struct f2fs_io_info *fio) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); struct f2fs_summary sum; struct node_info ni; - f2fs_bug_on(old_blkaddr == NULL_ADDR); + f2fs_bug_on(dn->data_blkaddr == NULL_ADDR); get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); - do_write_page(sbi, page, old_blkaddr, - new_blkaddr, &sum, DATA); + do_write_page(sbi, page, dn->data_blkaddr, new_blkaddr, &sum, fio); } -void rewrite_data_page(struct f2fs_sb_info *sbi, struct page *page, - block_t old_blk_addr) +void rewrite_data_page(struct page *page, block_t old_blkaddr, + struct f2fs_io_info *fio) { - submit_write_page(sbi, page, old_blk_addr, DATA); + struct inode *inode = page->mapping->host; + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + f2fs_submit_page_mbio(sbi, page, old_blkaddr, fio); } void recover_data_page(struct f2fs_sb_info *sbi, @@ -925,6 +1010,10 @@ void rewrite_node_page(struct f2fs_sb_info *sbi, unsigned int segno, old_cursegno; block_t next_blkaddr = next_blkaddr_of_node(page); unsigned int next_segno = GET_SEGNO(sbi, next_blkaddr); + struct f2fs_io_info fio = { + .type = NODE, + .rw = WRITE_SYNC, + }; curseg = CURSEG_I(sbi, type); @@ -953,8 +1042,8 @@ void rewrite_node_page(struct f2fs_sb_info *sbi, /* rewrite node page */ set_page_writeback(page); - submit_write_page(sbi, page, new_blkaddr, NODE); - f2fs_submit_bio(sbi, NODE, true); + f2fs_submit_page_mbio(sbi, page, new_blkaddr, &fio); + f2fs_submit_merged_bio(sbi, NODE, WRITE); refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); locate_dirty_segment(sbi, old_cursegno); @@ -964,6 +1053,16 @@ void rewrite_node_page(struct f2fs_sb_info *sbi, mutex_unlock(&curseg->curseg_mutex); } +void f2fs_wait_on_page_writeback(struct page *page, + enum page_type type) +{ + struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); + if (PageWriteback(page)) { + f2fs_submit_merged_bio(sbi, type, WRITE); + wait_on_page_writeback(page); + } +} + static int read_compacted_summaries(struct f2fs_sb_info *sbi) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); @@ -1314,6 +1413,10 @@ void flush_sit_entries(struct f2fs_sb_info *sbi) sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); + /* add discard candidates */ + if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards) + add_discard_addrs(sbi, segno, se); + if (flushed) goto to_sit_page; @@ -1480,41 +1583,94 @@ static int build_curseg(struct f2fs_sb_info *sbi) return restore_curseg_summaries(sbi); } +static int ra_sit_pages(struct f2fs_sb_info *sbi, int start, int nrpages) +{ + struct address_space *mapping = META_MAPPING(sbi); + struct page *page; + block_t blk_addr, prev_blk_addr = 0; + int sit_blk_cnt = SIT_BLK_CNT(sbi); + int blkno = start; + struct f2fs_io_info fio = { + .type = META, + .rw = READ_SYNC | REQ_META | REQ_PRIO + }; + + for (; blkno < start + nrpages && blkno < sit_blk_cnt; blkno++) { + + blk_addr = current_sit_addr(sbi, blkno * SIT_ENTRY_PER_BLOCK); + + if (blkno != start && prev_blk_addr + 1 != blk_addr) + break; + prev_blk_addr = blk_addr; +repeat: + page = grab_cache_page(mapping, blk_addr); + if (!page) { + cond_resched(); + goto repeat; + } + if (PageUptodate(page)) { + mark_page_accessed(page); + f2fs_put_page(page, 1); + continue; + } + + f2fs_submit_page_mbio(sbi, page, blk_addr, &fio); + + mark_page_accessed(page); + f2fs_put_page(page, 0); + } + + f2fs_submit_merged_bio(sbi, META, READ); + return blkno - start; +} + static void build_sit_entries(struct f2fs_sb_info *sbi) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); struct f2fs_summary_block *sum = curseg->sum_blk; - unsigned int start; + int sit_blk_cnt = SIT_BLK_CNT(sbi); + unsigned int i, start, end; + unsigned int readed, start_blk = 0; + int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); - for (start = 0; start < TOTAL_SEGS(sbi); start++) { - struct seg_entry *se = &sit_i->sentries[start]; - struct f2fs_sit_block *sit_blk; - struct f2fs_sit_entry sit; - struct page *page; - int i; - - mutex_lock(&curseg->curseg_mutex); - for (i = 0; i < sits_in_cursum(sum); i++) { - if (le32_to_cpu(segno_in_journal(sum, i)) == start) { - sit = sit_in_journal(sum, i); - mutex_unlock(&curseg->curseg_mutex); - goto got_it; + do { + readed = ra_sit_pages(sbi, start_blk, nrpages); + + start = start_blk * sit_i->sents_per_block; + end = (start_blk + readed) * sit_i->sents_per_block; + + for (; start < end && start < TOTAL_SEGS(sbi); start++) { + struct seg_entry *se = &sit_i->sentries[start]; + struct f2fs_sit_block *sit_blk; + struct f2fs_sit_entry sit; + struct page *page; + + mutex_lock(&curseg->curseg_mutex); + for (i = 0; i < sits_in_cursum(sum); i++) { + if (le32_to_cpu(segno_in_journal(sum, i)) + == start) { + sit = sit_in_journal(sum, i); + mutex_unlock(&curseg->curseg_mutex); + goto got_it; + } } - } - mutex_unlock(&curseg->curseg_mutex); - page = get_current_sit_page(sbi, start); - sit_blk = (struct f2fs_sit_block *)page_address(page); - sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; - f2fs_put_page(page, 1); + mutex_unlock(&curseg->curseg_mutex); + + page = get_current_sit_page(sbi, start); + sit_blk = (struct f2fs_sit_block *)page_address(page); + sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; + f2fs_put_page(page, 1); got_it: - check_block_count(sbi, start, &sit); - seg_info_from_raw_sit(se, &sit); - if (sbi->segs_per_sec > 1) { - struct sec_entry *e = get_sec_entry(sbi, start); - e->valid_blocks += se->valid_blocks; + check_block_count(sbi, start, &sit); + seg_info_from_raw_sit(se, &sit); + if (sbi->segs_per_sec > 1) { + struct sec_entry *e = get_sec_entry(sbi, start); + e->valid_blocks += se->valid_blocks; + } } - } + start_blk += readed; + } while (start_blk < sit_blk_cnt); } static void init_free_segmap(struct f2fs_sb_info *sbi) @@ -1644,6 +1800,12 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main); sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); sm_info->rec_prefree_segments = DEF_RECLAIM_PREFREE_SEGMENTS; + sm_info->ipu_policy = F2FS_IPU_DISABLE; + sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; + + INIT_LIST_HEAD(&sm_info->discard_list); + sm_info->nr_discards = 0; + sm_info->max_discards = 0; err = build_sit_info(sbi); if (err) @@ -1760,3 +1922,17 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) sbi->sm_info = NULL; kfree(sm_info); } + +int __init create_segment_manager_caches(void) +{ + discard_entry_slab = f2fs_kmem_cache_create("discard_entry", + sizeof(struct discard_entry), NULL); + if (!discard_entry_slab) + return -ENOMEM; + return 0; +} + +void destroy_segment_manager_caches(void) +{ + kmem_cache_destroy(discard_entry_slab); +} diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 269f690..5731682 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -20,13 +20,8 @@ #define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) #define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) -#define IS_DATASEG(t) \ - ((t == CURSEG_HOT_DATA) || (t == CURSEG_COLD_DATA) || \ - (t == CURSEG_WARM_DATA)) - -#define IS_NODESEG(t) \ - ((t == CURSEG_HOT_NODE) || (t == CURSEG_COLD_NODE) || \ - (t == CURSEG_WARM_NODE)) +#define IS_DATASEG(t) (t <= CURSEG_COLD_DATA) +#define IS_NODESEG(t) (t >= CURSEG_HOT_NODE) #define IS_CURSEG(sbi, seg) \ ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ @@ -83,25 +78,20 @@ (segno / SIT_ENTRY_PER_BLOCK) #define START_SEGNO(sit_i, segno) \ (SIT_BLOCK_OFFSET(sit_i, segno) * SIT_ENTRY_PER_BLOCK) +#define SIT_BLK_CNT(sbi) \ + ((TOTAL_SEGS(sbi) + SIT_ENTRY_PER_BLOCK - 1) / SIT_ENTRY_PER_BLOCK) #define f2fs_bitmap_size(nr) \ (BITS_TO_LONGS(nr) * sizeof(unsigned long)) #define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments) #define TOTAL_SECS(sbi) (sbi->total_sections) #define SECTOR_FROM_BLOCK(sbi, blk_addr) \ - (blk_addr << ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE)) + (((sector_t)blk_addr) << (sbi)->log_sectors_per_block) #define SECTOR_TO_BLOCK(sbi, sectors) \ - (sectors >> ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE)) + (sectors >> (sbi)->log_sectors_per_block) #define MAX_BIO_BLOCKS(max_hw_blocks) \ (min((int)max_hw_blocks, BIO_MAX_PAGES)) -/* during checkpoint, bio_private is used to synchronize the last bio */ -struct bio_private { - struct f2fs_sb_info *sbi; - bool is_sync; - void *wait; -}; - /* * indicate a block allocation direction: RIGHT and LEFT. * RIGHT means allocating new sections towards the end of volume. @@ -458,8 +448,8 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi) static inline bool need_SSR(struct f2fs_sb_info *sbi) { - return ((prefree_segments(sbi) / sbi->segs_per_sec) - + free_sections(sbi) < overprovision_sections(sbi)); + return (prefree_segments(sbi) / sbi->segs_per_sec) + + free_sections(sbi) < overprovision_sections(sbi); } static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) @@ -467,38 +457,71 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); - if (sbi->por_doing) + if (unlikely(sbi->por_doing)) return false; - return ((free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + - reserved_sections(sbi))); + return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + + reserved_sections(sbi)); } static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) { - return (prefree_segments(sbi) > SM_I(sbi)->rec_prefree_segments); + return prefree_segments(sbi) > SM_I(sbi)->rec_prefree_segments; } static inline int utilization(struct f2fs_sb_info *sbi) { - return div_u64((u64)valid_user_blocks(sbi) * 100, sbi->user_block_count); + return div_u64((u64)valid_user_blocks(sbi) * 100, + sbi->user_block_count); } /* * Sometimes f2fs may be better to drop out-of-place update policy. - * So, if fs utilization is over MIN_IPU_UTIL, then f2fs tries to write - * data in the original place likewise other traditional file systems. - * But, currently set 100 in percentage, which means it is disabled. - * See below need_inplace_update(). + * And, users can control the policy through sysfs entries. + * There are five policies with triggering conditions as follows. + * F2FS_IPU_FORCE - all the time, + * F2FS_IPU_SSR - if SSR mode is activated, + * F2FS_IPU_UTIL - if FS utilization is over threashold, + * F2FS_IPU_SSR_UTIL - if SSR mode is activated and FS utilization is over + * threashold, + * F2FS_IPUT_DISABLE - disable IPU. (=default option) */ -#define MIN_IPU_UTIL 100 +#define DEF_MIN_IPU_UTIL 70 + +enum { + F2FS_IPU_FORCE, + F2FS_IPU_SSR, + F2FS_IPU_UTIL, + F2FS_IPU_SSR_UTIL, + F2FS_IPU_DISABLE, +}; + static inline bool need_inplace_update(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + + /* IPU can be done only for the user data */ if (S_ISDIR(inode->i_mode)) return false; - if (need_SSR(sbi) && utilization(sbi) > MIN_IPU_UTIL) + + switch (SM_I(sbi)->ipu_policy) { + case F2FS_IPU_FORCE: return true; + case F2FS_IPU_SSR: + if (need_SSR(sbi)) + return true; + break; + case F2FS_IPU_UTIL: + if (utilization(sbi) > SM_I(sbi)->min_ipu_util) + return true; + break; + case F2FS_IPU_SSR_UTIL: + if (need_SSR(sbi) && utilization(sbi) > SM_I(sbi)->min_ipu_util) + return true; + break; + case F2FS_IPU_DISABLE: + break; + } return false; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index bafff72..1a85f83 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -50,6 +50,7 @@ enum { Opt_active_logs, Opt_disable_ext_identify, Opt_inline_xattr, + Opt_inline_data, Opt_err, }; @@ -65,6 +66,7 @@ static match_table_t f2fs_tokens = { {Opt_active_logs, "active_logs=%u"}, {Opt_disable_ext_identify, "disable_ext_identify"}, {Opt_inline_xattr, "inline_xattr"}, + {Opt_inline_data, "inline_data"}, {Opt_err, NULL}, }; @@ -72,6 +74,7 @@ static match_table_t f2fs_tokens = { enum { GC_THREAD, /* struct f2fs_gc_thread */ SM_INFO, /* struct f2fs_sm_info */ + F2FS_SBI, /* struct f2fs_sb_info */ }; struct f2fs_attr { @@ -89,6 +92,8 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) return (unsigned char *)sbi->gc_thread; else if (struct_type == SM_INFO) return (unsigned char *)SM_I(sbi); + else if (struct_type == F2FS_SBI) + return (unsigned char *)sbi; return NULL; } @@ -175,6 +180,10 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -183,6 +192,10 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(gc_no_gc_sleep_time), ATTR_LIST(gc_idle), ATTR_LIST(reclaim_segments), + ATTR_LIST(max_small_discards), + ATTR_LIST(ipu_policy), + ATTR_LIST(min_ipu_util), + ATTR_LIST(max_victim_search), NULL, }; @@ -311,6 +324,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_disable_ext_identify: set_opt(sbi, DISABLE_EXT_IDENTIFY); break; + case Opt_inline_data: + set_opt(sbi, INLINE_DATA); + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -325,7 +341,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) { struct f2fs_inode_info *fi; - fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_NOFS | __GFP_ZERO); + fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_F2FS_ZERO); if (!fi) return NULL; @@ -508,7 +524,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) #endif if (test_opt(sbi, DISABLE_EXT_IDENTIFY)) seq_puts(seq, ",disable_ext_identify"); - + if (test_opt(sbi, INLINE_DATA)) + seq_puts(seq, ",inline_data"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); return 0; @@ -518,7 +535,8 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset) { struct super_block *sb = seq->private; struct f2fs_sb_info *sbi = F2FS_SB(sb); - unsigned int total_segs = le32_to_cpu(sbi->raw_super->segment_count_main); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); int i; for (i = 0; i < total_segs; i++) { @@ -618,7 +636,7 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb, struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; - if (ino < F2FS_ROOT_INO(sbi)) + if (unlikely(ino < F2FS_ROOT_INO(sbi))) return ERR_PTR(-ESTALE); /* @@ -629,7 +647,7 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb, inode = f2fs_iget(sb, ino); if (IS_ERR(inode)) return ERR_CAST(inode); - if (generation && inode->i_generation != generation) { + if (unlikely(generation && inode->i_generation != generation)) { /* we didn't find the right inode.. */ iput(inode); return ERR_PTR(-ESTALE); @@ -732,10 +750,10 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi) fsmeta += le32_to_cpu(ckpt->rsvd_segment_count); fsmeta += le32_to_cpu(raw_super->segment_count_ssa); - if (fsmeta >= total) + if (unlikely(fsmeta >= total)) return 1; - if (is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) { + if (unlikely(is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; } @@ -763,6 +781,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->node_ino_num = le32_to_cpu(raw_super->node_ino); sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino); sbi->cur_victim_sec = NULL_SECNO; + sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH; for (i = 0; i < NR_COUNT_TYPE; i++) atomic_set(&sbi->nr_pages[i], 0); @@ -798,9 +817,10 @@ retry: /* sanity checking of raw super */ if (sanity_check_raw_super(sb, *raw_super)) { brelse(*raw_super_buf); - f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem " - "in %dth superblock", block + 1); - if(block == 0) { + f2fs_msg(sb, KERN_ERR, + "Can't find valid F2FS filesystem in %dth superblock", + block + 1); + if (block == 0) { block++; goto retry; } else { @@ -818,6 +838,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) struct buffer_head *raw_super_buf; struct inode *root; long err = -EINVAL; + int i; /* allocate memory for f2fs-specific super block info */ sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL); @@ -825,7 +846,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; /* set a block size */ - if (!sb_set_blocksize(sb, F2FS_BLKSIZE)) { + if (unlikely(!sb_set_blocksize(sb, F2FS_BLKSIZE))) { f2fs_msg(sb, KERN_ERR, "unable to set blocksize"); goto free_sbi; } @@ -874,7 +895,16 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) mutex_init(&sbi->node_write); sbi->por_doing = false; spin_lock_init(&sbi->stat_lock); - init_rwsem(&sbi->bio_sem); + + mutex_init(&sbi->read_io.io_mutex); + sbi->read_io.sbi = sbi; + sbi->read_io.bio = NULL; + for (i = 0; i < NR_PAGE_TYPE; i++) { + mutex_init(&sbi->write_io[i].io_mutex); + sbi->write_io[i].sbi = sbi; + sbi->write_io[i].bio = NULL; + } + init_rwsem(&sbi->cp_rwsem); init_waitqueue_head(&sbi->cp_wait); init_sb_info(sbi); @@ -939,9 +969,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) } /* if there are nt orphan nodes free them */ - err = -EINVAL; - if (recover_orphan_inodes(sbi)) - goto free_node_inode; + recover_orphan_inodes(sbi); /* read root inode and dentry */ root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); @@ -950,8 +978,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) err = PTR_ERR(root); goto free_node_inode; } - if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) + if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { + err = -EINVAL; goto free_root_inode; + } sb->s_root = d_make_root(root); /* allocate root dentry */ if (!sb->s_root) { @@ -1053,7 +1083,7 @@ static int __init init_inodecache(void) { f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", sizeof(struct f2fs_inode_info), NULL); - if (f2fs_inode_cachep == NULL) + if (!f2fs_inode_cachep) return -ENOMEM; return 0; } @@ -1078,9 +1108,12 @@ static int __init init_f2fs_fs(void) err = create_node_manager_caches(); if (err) goto free_inodecache; - err = create_gc_caches(); + err = create_segment_manager_caches(); if (err) goto free_node_manager_caches; + err = create_gc_caches(); + if (err) + goto free_segment_manager_caches; err = create_checkpoint_caches(); if (err) goto free_gc_caches; @@ -1102,6 +1135,8 @@ free_checkpoint_caches: destroy_checkpoint_caches(); free_gc_caches: destroy_gc_caches(); +free_segment_manager_caches: + destroy_segment_manager_caches(); free_node_manager_caches: destroy_node_manager_caches(); free_inodecache: @@ -1117,6 +1152,7 @@ static void __exit exit_f2fs_fs(void) unregister_filesystem(&f2fs_fs_type); destroy_checkpoint_caches(); destroy_gc_caches(); + destroy_segment_manager_caches(); destroy_node_manager_caches(); destroy_inodecache(); kset_unregister(f2fs_kset); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index aa7a3f1..89d0422 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -21,6 +21,7 @@ #include <linux/rwsem.h> #include <linux/f2fs_fs.h> #include <linux/security.h> +#include <linux/posix_acl_xattr.h> #include "f2fs.h" #include "xattr.h" @@ -216,8 +217,8 @@ const struct xattr_handler f2fs_xattr_security_handler = { static const struct xattr_handler *f2fs_xattr_handler_map[] = { [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler, #ifdef CONFIG_F2FS_FS_POSIX_ACL - [F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &f2fs_xattr_acl_access_handler, - [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &f2fs_xattr_acl_default_handler, + [F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, + [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, #endif [F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler, #ifdef CONFIG_F2FS_FS_SECURITY @@ -229,8 +230,8 @@ static const struct xattr_handler *f2fs_xattr_handler_map[] = { const struct xattr_handler *f2fs_xattr_handlers[] = { &f2fs_xattr_user_handler, #ifdef CONFIG_F2FS_FS_POSIX_ACL - &f2fs_xattr_acl_access_handler, - &f2fs_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif &f2fs_xattr_trusted_handler, #ifdef CONFIG_F2FS_FS_SECURITY @@ -522,7 +523,7 @@ static int __f2fs_setxattr(struct inode *inode, int name_index, if (found) free = free + ENTRY_SIZE(here); - if (free < newsize) { + if (unlikely(free < newsize)) { error = -ENOSPC; goto exit; } diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 02a08fb..b21d9eb 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -108,8 +108,6 @@ struct f2fs_xattr_entry { #ifdef CONFIG_F2FS_FS_XATTR extern const struct xattr_handler f2fs_xattr_user_handler; extern const struct xattr_handler f2fs_xattr_trusted_handler; -extern const struct xattr_handler f2fs_xattr_acl_access_handler; -extern const struct xattr_handler f2fs_xattr_acl_default_handler; extern const struct xattr_handler f2fs_xattr_advise_handler; extern const struct xattr_handler f2fs_xattr_security_handler; @@ -348,21 +348,16 @@ out: return NULL; } -static void close_files(struct files_struct * files) +static struct fdtable *close_files(struct files_struct * files) { - int i, j; - struct fdtable *fdt; - - j = 0; - /* * It is safe to dereference the fd table without RCU or * ->file_lock because this is the last reference to the - * files structure. But use RCU to shut RCU-lockdep up. + * files structure. */ - rcu_read_lock(); - fdt = files_fdtable(files); - rcu_read_unlock(); + struct fdtable *fdt = rcu_dereference_raw(files->fdt); + int i, j = 0; + for (;;) { unsigned long set; i = j * BITS_PER_LONG; @@ -381,6 +376,8 @@ static void close_files(struct files_struct * files) set >>= 1; } } + + return fdt; } struct files_struct *get_files_struct(struct task_struct *task) @@ -398,14 +395,9 @@ struct files_struct *get_files_struct(struct task_struct *task) void put_files_struct(struct files_struct *files) { - struct fdtable *fdt; - if (atomic_dec_and_test(&files->count)) { - close_files(files); - /* not really needed, since nobody can see us */ - rcu_read_lock(); - fdt = files_fdtable(files); - rcu_read_unlock(); + struct fdtable *fdt = close_files(files); + /* free the arrays if they are not embedded */ if (fdt != &files->fdtab) __free_fdtable(fdt); @@ -645,16 +637,16 @@ void do_close_on_exec(struct files_struct *files) spin_unlock(&files->file_lock); } -struct file *fget(unsigned int fd) +static struct file *__fget(unsigned int fd, fmode_t mask) { - struct file *file; struct files_struct *files = current->files; + struct file *file; rcu_read_lock(); file = fcheck_files(files, fd); if (file) { /* File object ref couldn't be taken */ - if (file->f_mode & FMODE_PATH || + if ((file->f_mode & mask) || !atomic_long_inc_not_zero(&file->f_count)) file = NULL; } @@ -663,25 +655,16 @@ struct file *fget(unsigned int fd) return file; } +struct file *fget(unsigned int fd) +{ + return __fget(fd, FMODE_PATH); +} EXPORT_SYMBOL(fget); struct file *fget_raw(unsigned int fd) { - struct file *file; - struct files_struct *files = current->files; - - rcu_read_lock(); - file = fcheck_files(files, fd); - if (file) { - /* File object ref couldn't be taken */ - if (!atomic_long_inc_not_zero(&file->f_count)) - file = NULL; - } - rcu_read_unlock(); - - return file; + return __fget(fd, 0); } - EXPORT_SYMBOL(fget_raw); /* @@ -700,56 +683,33 @@ EXPORT_SYMBOL(fget_raw); * The fput_needed flag returned by fget_light should be passed to the * corresponding fput_light. */ -struct file *fget_light(unsigned int fd, int *fput_needed) +struct file *__fget_light(unsigned int fd, fmode_t mask, int *fput_needed) { - struct file *file; struct files_struct *files = current->files; + struct file *file; *fput_needed = 0; if (atomic_read(&files->count) == 1) { - file = fcheck_files(files, fd); - if (file && (file->f_mode & FMODE_PATH)) + file = __fcheck_files(files, fd); + if (file && (file->f_mode & mask)) file = NULL; } else { - rcu_read_lock(); - file = fcheck_files(files, fd); - if (file) { - if (!(file->f_mode & FMODE_PATH) && - atomic_long_inc_not_zero(&file->f_count)) - *fput_needed = 1; - else - /* Didn't get the reference, someone's freed */ - file = NULL; - } - rcu_read_unlock(); + file = __fget(fd, mask); + if (file) + *fput_needed = 1; } return file; } +struct file *fget_light(unsigned int fd, int *fput_needed) +{ + return __fget_light(fd, FMODE_PATH, fput_needed); +} EXPORT_SYMBOL(fget_light); struct file *fget_raw_light(unsigned int fd, int *fput_needed) { - struct file *file; - struct files_struct *files = current->files; - - *fput_needed = 0; - if (atomic_read(&files->count) == 1) { - file = fcheck_files(files, fd); - } else { - rcu_read_lock(); - file = fcheck_files(files, fd); - if (file) { - if (atomic_long_inc_not_zero(&file->f_count)) - *fput_needed = 1; - else - /* Didn't get the reference, someone's freed */ - file = NULL; - } - rcu_read_unlock(); - } - - return file; + return __fget_light(fd, 0, fput_needed); } void set_close_on_exec(unsigned int fd, int flag) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 1f4a10e..e0259a1 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -516,13 +516,16 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, } WARN_ON(inode->i_state & I_SYNC); /* - * Skip inode if it is clean. We don't want to mess with writeback - * lists in this function since flusher thread may be doing for example - * sync in parallel and if we move the inode, it could get skipped. So - * here we make sure inode is on some writeback list and leave it there - * unless we have completely cleaned the inode. + * Skip inode if it is clean and we have no outstanding writeback in + * WB_SYNC_ALL mode. We don't want to mess with writeback lists in this + * function since flusher thread may be doing for example sync in + * parallel and if we move the inode, it could get skipped. So here we + * make sure inode is on some writeback list and leave it there unless + * we have completely cleaned the inode. */ - if (!(inode->i_state & I_DIRTY)) + if (!(inode->i_state & I_DIRTY) && + (wbc->sync_mode != WB_SYNC_ALL || + !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) goto out; inode->i_state |= I_SYNC; spin_unlock(&inode->i_lock); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index ef74ad5..0a648bb 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1296,22 +1296,6 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs)); } -static int fuse_dev_pipe_buf_steal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - return 1; -} - -static const struct pipe_buf_operations fuse_dev_pipe_buf_ops = { - .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, - .confirm = generic_pipe_buf_confirm, - .release = generic_pipe_buf_release, - .steal = fuse_dev_pipe_buf_steal, - .get = generic_pipe_buf_get, -}; - static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) @@ -1358,7 +1342,11 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, buf->page = bufs[page_nr].page; buf->offset = bufs[page_nr].offset; buf->len = bufs[page_nr].len; - buf->ops = &fuse_dev_pipe_buf_ops; + /* + * Need to be careful about this. Having buf->ops in module + * code can Oops if the buffer persists after module unload. + */ + buf->ops = &nosteal_pipe_buf_ops; pipe->nrbufs++; page_nr++; @@ -1599,7 +1587,8 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset); err = fuse_copy_page(cs, &page, offset, this_num, 0); - if (!err && offset == 0 && (num != 0 || file_size == end)) + if (!err && offset == 0 && + (this_num == PAGE_CACHE_SIZE || file_size == end)) SetPageUptodate(page); unlock_page(page); page_cache_release(page); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index c3eb2c4..1d1292c 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -112,6 +112,16 @@ void fuse_invalidate_attr(struct inode *inode) get_fuse_inode(inode)->i_time = 0; } +/** + * Mark the attributes as stale due to an atime change. Avoid the invalidate if + * atime is not used. + */ +void fuse_invalidate_atime(struct inode *inode) +{ + if (!IS_RDONLY(inode)) + fuse_invalidate_attr(inode); +} + /* * Just mark the entry as stale, so that a next attempt to look it up * will result in a new lookup call to userspace @@ -1371,7 +1381,7 @@ static int fuse_readdir(struct file *file, struct dir_context *ctx) } __free_page(page); - fuse_invalidate_attr(inode); /* atime changed */ + fuse_invalidate_atime(inode); return err; } @@ -1404,7 +1414,7 @@ static char *read_link(struct dentry *dentry) link[req->out.args[0].size] = '\0'; out: fuse_put_request(fc, req); - fuse_invalidate_attr(inode); /* atime changed */ + fuse_invalidate_atime(inode); return link; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 7e70506..77bcc30 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -127,7 +127,15 @@ static void fuse_file_put(struct fuse_file *ff, bool sync) if (atomic_dec_and_test(&ff->count)) { struct fuse_req *req = ff->reserved_req; - if (sync) { + if (ff->fc->no_open) { + /* + * Drop the release request when client does not + * implement 'open' + */ + req->background = 0; + path_put(&req->misc.release.path); + fuse_put_request(ff->fc, req); + } else if (sync) { req->background = 0; fuse_request_send(ff->fc, req); path_put(&req->misc.release.path); @@ -144,27 +152,36 @@ static void fuse_file_put(struct fuse_file *ff, bool sync) int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, bool isdir) { - struct fuse_open_out outarg; struct fuse_file *ff; - int err; int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; ff = fuse_file_alloc(fc); if (!ff) return -ENOMEM; - err = fuse_send_open(fc, nodeid, file, opcode, &outarg); - if (err) { - fuse_file_free(ff); - return err; + ff->fh = 0; + ff->open_flags = FOPEN_KEEP_CACHE; /* Default for no-open */ + if (!fc->no_open || isdir) { + struct fuse_open_out outarg; + int err; + + err = fuse_send_open(fc, nodeid, file, opcode, &outarg); + if (!err) { + ff->fh = outarg.fh; + ff->open_flags = outarg.open_flags; + + } else if (err != -ENOSYS || isdir) { + fuse_file_free(ff); + return err; + } else { + fc->no_open = 1; + } } if (isdir) - outarg.open_flags &= ~FOPEN_DIRECT_IO; + ff->open_flags &= ~FOPEN_DIRECT_IO; - ff->fh = outarg.fh; ff->nodeid = nodeid; - ff->open_flags = outarg.open_flags; file->private_data = fuse_file_get(ff); return 0; @@ -687,7 +704,7 @@ static int fuse_readpage(struct file *file, struct page *page) SetPageUptodate(page); } - fuse_invalidate_attr(inode); /* atime changed */ + fuse_invalidate_atime(inode); out: unlock_page(page); return err; @@ -716,7 +733,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) fuse_read_update_size(inode, pos, req->misc.read.attr_ver); } - fuse_invalidate_attr(inode); /* atime changed */ + fuse_invalidate_atime(inode); } for (i = 0; i < req->num_pages; i++) { @@ -2710,6 +2727,9 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, inode = file->f_mapping->host; i_size = i_size_read(inode); + if ((rw == READ) && (offset > i_size)) + return 0; + /* optimization for short read */ if (async_dio && rw != WRITE && offset + count > i_size) { if (offset >= i_size) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 7d27309..2da5db2 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -485,6 +485,9 @@ struct fuse_conn { * and hence races in setting them will not cause malfunction */ + /** Is open/release not implemented by fs? */ + unsigned no_open:1; + /** Is fsync not implemented by fs? */ unsigned no_fsync:1; @@ -788,6 +791,8 @@ void fuse_invalidate_attr(struct inode *inode); void fuse_invalidate_entry_cache(struct dentry *entry); +void fuse_invalidate_atime(struct inode *inode); + /** * Acquire reference to fuse_conn */ diff --git a/fs/generic_acl.c b/fs/generic_acl.c deleted file mode 100644 index b3f3676..0000000 --- a/fs/generic_acl.c +++ /dev/null @@ -1,184 +0,0 @@ -/* - * (C) 2005 Andreas Gruenbacher <agruen@suse.de> - * - * This file is released under the GPL. - * - * Generic ACL support for in-memory filesystems. - */ - -#include <linux/sched.h> -#include <linux/gfp.h> -#include <linux/fs.h> -#include <linux/generic_acl.h> -#include <linux/posix_acl.h> -#include <linux/posix_acl_xattr.h> - - -static size_t -generic_acl_list(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int type) -{ - struct posix_acl *acl; - const char *xname; - size_t size; - - acl = get_cached_acl(dentry->d_inode, type); - if (!acl) - return 0; - posix_acl_release(acl); - - switch (type) { - case ACL_TYPE_ACCESS: - xname = POSIX_ACL_XATTR_ACCESS; - break; - case ACL_TYPE_DEFAULT: - xname = POSIX_ACL_XATTR_DEFAULT; - break; - default: - return 0; - } - size = strlen(xname) + 1; - if (list && size <= list_size) - memcpy(list, xname, size); - return size; -} - -static int -generic_acl_get(struct dentry *dentry, const char *name, void *buffer, - size_t size, int type) -{ - struct posix_acl *acl; - int error; - - if (strcmp(name, "") != 0) - return -EINVAL; - - acl = get_cached_acl(dentry->d_inode, type); - if (!acl) - return -ENODATA; - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return error; -} - -static int -generic_acl_set(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags, int type) -{ - struct inode *inode = dentry->d_inode; - struct posix_acl *acl = NULL; - int error; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - if (!inode_owner_or_capable(inode)) - return -EPERM; - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - if (acl) { - error = posix_acl_valid(acl); - if (error) - goto failed; - switch (type) { - case ACL_TYPE_ACCESS: - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) - goto failed; - inode->i_ctime = CURRENT_TIME; - if (error == 0) { - posix_acl_release(acl); - acl = NULL; - } - break; - case ACL_TYPE_DEFAULT: - if (!S_ISDIR(inode->i_mode)) { - error = -EINVAL; - goto failed; - } - break; - } - } - set_cached_acl(inode, type, acl); - error = 0; -failed: - posix_acl_release(acl); - return error; -} - -/** - * generic_acl_init - Take care of acl inheritance at @inode create time - * - * Files created inside a directory with a default ACL inherit the - * directory's default ACL. - */ -int -generic_acl_init(struct inode *inode, struct inode *dir) -{ - struct posix_acl *acl = NULL; - int error; - - if (!S_ISLNK(inode->i_mode)) - acl = get_cached_acl(dir, ACL_TYPE_DEFAULT); - if (acl) { - if (S_ISDIR(inode->i_mode)) - set_cached_acl(inode, ACL_TYPE_DEFAULT, acl); - error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); - if (error < 0) - return error; - if (error > 0) - set_cached_acl(inode, ACL_TYPE_ACCESS, acl); - } else { - inode->i_mode &= ~current_umask(); - } - error = 0; - - posix_acl_release(acl); - return error; -} - -/** - * generic_acl_chmod - change the access acl of @inode upon chmod() - * - * A chmod also changes the permissions of the owner, group/mask, and - * other ACL entries. - */ -int -generic_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; - int error = 0; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - acl = get_cached_acl(inode, ACL_TYPE_ACCESS); - if (acl) { - error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (error) - return error; - set_cached_acl(inode, ACL_TYPE_ACCESS, acl); - posix_acl_release(acl); - } - return error; -} - -const struct xattr_handler generic_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = generic_acl_list, - .get = generic_acl_get, - .set = generic_acl_set, -}; - -const struct xattr_handler generic_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = generic_acl_list, - .get = generic_acl_get, - .set = generic_acl_set, -}; diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index f69ac0a..ba94566 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -49,10 +49,6 @@ struct posix_acl *gfs2_get_acl(struct inode *inode, int type) if (!ip->i_eattr) return NULL; - acl = get_cached_acl(&ip->i_inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - name = gfs2_acl_name(type); if (name == NULL) return ERR_PTR(-EINVAL); @@ -80,7 +76,7 @@ static int gfs2_set_mode(struct inode *inode, umode_t mode) return error; } -static int gfs2_acl_set(struct inode *inode, int type, struct posix_acl *acl) +int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) { int error; int len; @@ -88,219 +84,49 @@ static int gfs2_acl_set(struct inode *inode, int type, struct posix_acl *acl) const char *name = gfs2_acl_name(type); BUG_ON(name == NULL); - len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0); - if (len == 0) - return 0; - data = kmalloc(len, GFP_NOFS); - if (data == NULL) - return -ENOMEM; - error = posix_acl_to_xattr(&init_user_ns, acl, data, len); - if (error < 0) - goto out; - error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS); - if (!error) - set_cached_acl(inode, type, acl); -out: - kfree(data); - return error; -} - -int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode) -{ - struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - struct posix_acl *acl; - umode_t mode = inode->i_mode; - int error = 0; - - if (!sdp->sd_args.ar_posix_acl) - return 0; - if (S_ISLNK(inode->i_mode)) - return 0; - - acl = gfs2_get_acl(&dip->i_inode, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (!acl) { - mode &= ~current_umask(); - return gfs2_set_mode(inode, mode); - } - - if (S_ISDIR(inode->i_mode)) { - error = gfs2_acl_set(inode, ACL_TYPE_DEFAULT, acl); - if (error) - goto out; - } - - error = posix_acl_create(&acl, GFP_NOFS, &mode); - if (error < 0) - return error; - if (error == 0) - goto munge; - - error = gfs2_acl_set(inode, ACL_TYPE_ACCESS, acl); - if (error) - goto out; -munge: - error = gfs2_set_mode(inode, mode); -out: - posix_acl_release(acl); - return error; -} - -int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) -{ - struct inode *inode = &ip->i_inode; - struct posix_acl *acl; - char *data; - unsigned int len; - int error; - - acl = gfs2_get_acl(&ip->i_inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (!acl) - return gfs2_setattr_simple(inode, attr); - - error = posix_acl_chmod(&acl, GFP_NOFS, attr->ia_mode); - if (error) - return error; - - len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0); - data = kmalloc(len, GFP_NOFS); - error = -ENOMEM; - if (data == NULL) - goto out; - posix_acl_to_xattr(&init_user_ns, acl, data, len); - error = gfs2_xattr_acl_chmod(ip, attr, data); - kfree(data); - set_cached_acl(&ip->i_inode, ACL_TYPE_ACCESS, acl); - -out: - posix_acl_release(acl); - return error; -} - -static int gfs2_acl_type(const char *name) -{ - if (strcmp(name, GFS2_POSIX_ACL_ACCESS) == 0) - return ACL_TYPE_ACCESS; - if (strcmp(name, GFS2_POSIX_ACL_DEFAULT) == 0) - return ACL_TYPE_DEFAULT; - return -EINVAL; -} - -static int gfs2_xattr_system_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int xtype) -{ - struct inode *inode = dentry->d_inode; - struct gfs2_sbd *sdp = GFS2_SB(inode); - struct posix_acl *acl; - int type; - int error; - - if (!sdp->sd_args.ar_posix_acl) - return -EOPNOTSUPP; - - type = gfs2_acl_type(name); - if (type < 0) - return type; - - acl = gfs2_get_acl(inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return error; -} - -static int gfs2_xattr_system_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, - int xtype) -{ - struct inode *inode = dentry->d_inode; - struct gfs2_sbd *sdp = GFS2_SB(inode); - struct posix_acl *acl = NULL; - int error = 0, type; - - if (!sdp->sd_args.ar_posix_acl) - return -EOPNOTSUPP; - - type = gfs2_acl_type(name); - if (type < 0) - return type; - if (flags & XATTR_CREATE) - return -EINVAL; - if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) - return value ? -EACCES : 0; - if (!uid_eq(current_fsuid(), inode->i_uid) && !capable(CAP_FOWNER)) - return -EPERM; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - if (!value) - goto set_acl; - - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (!acl) { - /* - * acl_set_file(3) may request that we set default ACLs with - * zero length -- defend (gracefully) against that here. - */ - goto out; - } - if (IS_ERR(acl)) { - error = PTR_ERR(acl); - goto out; - } - - error = posix_acl_valid(acl); - if (error) - goto out_release; - - error = -EINVAL; if (acl->a_count > GFS2_ACL_MAX_ENTRIES) - goto out_release; + return -EINVAL; if (type == ACL_TYPE_ACCESS) { umode_t mode = inode->i_mode; + error = posix_acl_equiv_mode(acl, &mode); + if (error < 0) + return error; - if (error <= 0) { - posix_acl_release(acl); + if (error == 0) acl = NULL; - if (error < 0) - return error; - } - error = gfs2_set_mode(inode, mode); if (error) - goto out_release; + return error; } -set_acl: - error = __gfs2_xattr_set(inode, name, value, size, 0, GFS2_EATYPE_SYS); - if (!error) { - if (acl) - set_cached_acl(inode, type, acl); - else - forget_cached_acl(inode, type); + if (acl) { + len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0); + if (len == 0) + return 0; + data = kmalloc(len, GFP_NOFS); + if (data == NULL) + return -ENOMEM; + error = posix_acl_to_xattr(&init_user_ns, acl, data, len); + if (error < 0) + goto out; + } else { + data = NULL; + len = 0; } -out_release: - posix_acl_release(acl); + + error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS); + if (error) + goto out; + + if (acl) + set_cached_acl(inode, type, acl); + else + forget_cached_acl(inode, type); out: + kfree(data); return error; } - -const struct xattr_handler gfs2_xattr_system_handler = { - .prefix = XATTR_SYSTEM_PREFIX, - .flags = GFS2_EATYPE_SYS, - .get = gfs2_xattr_system_get, - .set = gfs2_xattr_system_set, -}; - diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index 0da38dc..301260c 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -17,8 +17,6 @@ #define GFS2_ACL_MAX_ENTRIES 25 extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type); -extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode); -extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr); -extern const struct xattr_handler gfs2_xattr_system_handler; +extern int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); #endif /* __ACL_DOT_H__ */ diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index b7fc035..49436fa 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -986,6 +986,7 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + struct address_space *mapping = inode->i_mapping; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int rv; @@ -1006,6 +1007,36 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, if (rv != 1) goto out; /* dio not valid, fall back to buffered i/o */ + /* + * Now since we are holding a deferred (CW) lock at this point, you + * might be wondering why this is ever needed. There is a case however + * where we've granted a deferred local lock against a cached exclusive + * glock. That is ok provided all granted local locks are deferred, but + * it also means that it is possible to encounter pages which are + * cached and possibly also mapped. So here we check for that and sort + * them out ahead of the dio. The glock state machine will take care of + * everything else. + * + * If in fact the cached glock state (gl->gl_state) is deferred (CW) in + * the first place, mapping->nr_pages will always be zero. + */ + if (mapping->nrpages) { + loff_t lstart = offset & (PAGE_CACHE_SIZE - 1); + loff_t len = iov_length(iov, nr_segs); + loff_t end = PAGE_ALIGN(offset + len) - 1; + + rv = 0; + if (len == 0) + goto out; + if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) + unmap_shared_mapping_range(ip->i_inode.i_mapping, offset, len); + rv = filemap_write_and_wait_range(mapping, lstart, end); + if (rv) + goto out; + if (rw == WRITE) + truncate_inode_pages_range(mapping, lstart, end); + } + rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, gfs2_get_block_direct, NULL, NULL, 0); @@ -1050,30 +1081,22 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) bh = bh->b_this_page; } while(bh != head); spin_unlock(&sdp->sd_ail_lock); - gfs2_log_unlock(sdp); head = bh = page_buffers(page); do { - gfs2_log_lock(sdp); bd = bh->b_private; if (bd) { gfs2_assert_warn(sdp, bd->bd_bh == bh); - if (!list_empty(&bd->bd_list)) { - if (!buffer_pinned(bh)) - list_del_init(&bd->bd_list); - else - bd = NULL; - } - if (bd) - bd->bd_bh = NULL; + if (!list_empty(&bd->bd_list)) + list_del_init(&bd->bd_list); + bd->bd_bh = NULL; bh->b_private = NULL; - } - gfs2_log_unlock(sdp); - if (bd) kmem_cache_free(gfs2_bufdata_cachep, bd); + } bh = bh->b_this_page; } while (bh != head); + gfs2_log_unlock(sdp); return try_to_free_buffers(page); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 2e5fc26..fa32655 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -834,6 +834,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, struct gfs2_leaf *leaf; struct gfs2_dirent *dent; struct qstr name = { .name = "" }; + struct timespec tv = CURRENT_TIME; error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL); if (error) @@ -850,7 +851,11 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, leaf->lf_entries = 0; leaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE); leaf->lf_next = 0; - memset(leaf->lf_reserved, 0, sizeof(leaf->lf_reserved)); + leaf->lf_inode = cpu_to_be64(ip->i_no_addr); + leaf->lf_dist = cpu_to_be32(1); + leaf->lf_nsec = cpu_to_be32(tv.tv_nsec); + leaf->lf_sec = cpu_to_be64(tv.tv_sec); + memset(leaf->lf_reserved2, 0, sizeof(leaf->lf_reserved2)); dent = (struct gfs2_dirent *)(leaf+1); gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent); *pbh = bh; @@ -1612,11 +1617,31 @@ out: return ret; } +/** + * dir_new_leaf - Add a new leaf onto hash chain + * @inode: The directory + * @name: The name we are adding + * + * This adds a new dir leaf onto an existing leaf when there is not + * enough space to add a new dir entry. This is a last resort after + * we've expanded the hash table to max size and also split existing + * leaf blocks, so it will only occur for very large directories. + * + * The dist parameter is set to 1 for leaf blocks directly attached + * to the hash table, 2 for one layer of indirection, 3 for two layers + * etc. We are thus able to tell the difference between an old leaf + * with dist set to zero (i.e. "don't know") and a new one where we + * set this information for debug/fsck purposes. + * + * Returns: 0 on success, or -ve on error + */ + static int dir_new_leaf(struct inode *inode, const struct qstr *name) { struct buffer_head *bh, *obh; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_leaf *leaf, *oleaf; + u32 dist = 1; int error; u32 index; u64 bn; @@ -1626,6 +1651,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name) if (error) return error; do { + dist++; oleaf = (struct gfs2_leaf *)obh->b_data; bn = be64_to_cpu(oleaf->lf_next); if (!bn) @@ -1643,6 +1669,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name) brelse(obh); return -ENOSPC; } + leaf->lf_dist = cpu_to_be32(dist); oleaf->lf_next = cpu_to_be64(bh->b_blocknr); brelse(bh); brelse(obh); @@ -1659,39 +1686,53 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name) /** * gfs2_dir_add - Add new filename into directory - * @dip: The GFS2 inode - * @filename: The new name - * @inode: The inode number of the entry - * @type: The type of the entry + * @inode: The directory inode + * @name: The new name + * @nip: The GFS2 inode to be linked in to the directory + * @da: The directory addition info + * + * If the call to gfs2_diradd_alloc_required resulted in there being + * no need to allocate any new directory blocks, then it will contain + * a pointer to the directory entry and the bh in which it resides. We + * can use that without having to repeat the search. If there was no + * free space, then we must now create more space. * * Returns: 0 on success, error code on failure */ int gfs2_dir_add(struct inode *inode, const struct qstr *name, - const struct gfs2_inode *nip) + const struct gfs2_inode *nip, struct gfs2_diradd *da) { struct gfs2_inode *ip = GFS2_I(inode); - struct buffer_head *bh; - struct gfs2_dirent *dent; + struct buffer_head *bh = da->bh; + struct gfs2_dirent *dent = da->dent; + struct timespec tv; struct gfs2_leaf *leaf; int error; while(1) { - dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, - &bh); + if (da->bh == NULL) { + dent = gfs2_dirent_search(inode, name, + gfs2_dirent_find_space, &bh); + } if (dent) { if (IS_ERR(dent)) return PTR_ERR(dent); dent = gfs2_init_dirent(inode, dent, name, bh); gfs2_inum_out(nip, dent); dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode)); + tv = CURRENT_TIME; if (ip->i_diskflags & GFS2_DIF_EXHASH) { leaf = (struct gfs2_leaf *)bh->b_data; be16_add_cpu(&leaf->lf_entries, 1); + leaf->lf_nsec = cpu_to_be32(tv.tv_nsec); + leaf->lf_sec = cpu_to_be64(tv.tv_sec); } + da->dent = NULL; + da->bh = NULL; brelse(bh); ip->i_entries++; - ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + ip->i_inode.i_mtime = ip->i_inode.i_ctime = tv; if (S_ISDIR(nip->i_inode.i_mode)) inc_nlink(&ip->i_inode); mark_inode_dirty(inode); @@ -1742,6 +1783,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) const struct qstr *name = &dentry->d_name; struct gfs2_dirent *dent, *prev = NULL; struct buffer_head *bh; + struct timespec tv = CURRENT_TIME; /* Returns _either_ the entry (if its first in block) or the previous entry otherwise */ @@ -1767,13 +1809,15 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) if (!entries) gfs2_consist_inode(dip); leaf->lf_entries = cpu_to_be16(--entries); + leaf->lf_nsec = cpu_to_be32(tv.tv_nsec); + leaf->lf_sec = cpu_to_be64(tv.tv_sec); } brelse(bh); if (!dip->i_entries) gfs2_consist_inode(dip); dip->i_entries--; - dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; + dip->i_inode.i_mtime = dip->i_inode.i_ctime = tv; if (S_ISDIR(dentry->d_inode->i_mode)) drop_nlink(&dip->i_inode); mark_inode_dirty(&dip->i_inode); @@ -2017,22 +2061,36 @@ out: * gfs2_diradd_alloc_required - find if adding entry will require an allocation * @ip: the file being written to * @filname: the filename that's going to be added + * @da: The structure to return dir alloc info * - * Returns: 1 if alloc required, 0 if not, -ve on error + * Returns: 0 if ok, -ve on error */ -int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name) +int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name, + struct gfs2_diradd *da) { + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + const unsigned int extra = sizeof(struct gfs2_dinode) - sizeof(struct gfs2_leaf); struct gfs2_dirent *dent; struct buffer_head *bh; + da->nr_blocks = 0; + da->bh = NULL; + da->dent = NULL; + dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh); if (!dent) { - return 1; + da->nr_blocks = sdp->sd_max_dirres; + if (!(ip->i_diskflags & GFS2_DIF_EXHASH) && + (GFS2_DIRENT_SIZE(name->len) < extra)) + da->nr_blocks = 1; + return 0; } if (IS_ERR(dent)) return PTR_ERR(dent); - brelse(bh); + da->bh = bh; + da->dent = dent; return 0; } diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h index 4f03bbd..126c65d 100644 --- a/fs/gfs2/dir.h +++ b/fs/gfs2/dir.h @@ -16,6 +16,14 @@ struct inode; struct gfs2_inode; struct gfs2_inum; +struct buffer_head; +struct gfs2_dirent; + +struct gfs2_diradd { + unsigned nr_blocks; + struct gfs2_dirent *dent; + struct buffer_head *bh; +}; extern struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *filename, @@ -23,7 +31,13 @@ extern struct inode *gfs2_dir_search(struct inode *dir, extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename, const struct gfs2_inode *ip); extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename, - const struct gfs2_inode *ip); + const struct gfs2_inode *ip, struct gfs2_diradd *da); +static inline void gfs2_dir_no_add(struct gfs2_diradd *da) +{ + if (da->bh) + brelse(da->bh); + da->bh = NULL; +} extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry); extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, struct file_ra_state *f_ra); @@ -33,7 +47,8 @@ extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); extern int gfs2_diradd_alloc_required(struct inode *dir, - const struct qstr *filename); + const struct qstr *filename, + struct gfs2_diradd *da); extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, struct buffer_head **bhp); extern void gfs2_dir_hash_inval(struct gfs2_inode *ip); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index c8420f7..ca0be6c 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1552,13 +1552,11 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp) glock_hash_walk(thaw_glock, sdp); } -static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl) +static void dump_glock(struct seq_file *seq, struct gfs2_glock *gl) { - int ret; spin_lock(&gl->gl_spin); - ret = gfs2_dump_glock(seq, gl); + gfs2_dump_glock(seq, gl); spin_unlock(&gl->gl_spin); - return ret; } static void dump_glock_func(struct gfs2_glock *gl) @@ -1647,14 +1645,14 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags) * @seq: the seq_file struct * @gh: the glock holder * - * Returns: 0 on success, -ENOBUFS when we run out of space */ -static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) +static void dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) { struct task_struct *gh_owner = NULL; char flags_buf[32]; + rcu_read_lock(); if (gh->gh_owner_pid) gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n", @@ -1664,7 +1662,7 @@ static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1, gh_owner ? gh_owner->comm : "(ended)", (void *)gh->gh_ip); - return 0; + rcu_read_unlock(); } static const char *gflags2str(char *buf, const struct gfs2_glock *gl) @@ -1719,16 +1717,14 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl) * example. The field's are n = number (id of the object), f = flags, * t = type, s = state, r = refcount, e = error, p = pid. * - * Returns: 0 on success, -ENOBUFS when we run out of space */ -int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) +void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) { const struct gfs2_glock_operations *glops = gl->gl_ops; unsigned long long dtime; const struct gfs2_holder *gh; char gflags_buf[32]; - int error = 0; dtime = jiffies - gl->gl_demote_time; dtime *= 1000000/HZ; /* demote time in uSec */ @@ -1745,15 +1741,11 @@ int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) atomic_read(&gl->gl_revokes), (int)gl->gl_lockref.count, gl->gl_hold_time); - list_for_each_entry(gh, &gl->gl_holders, gh_list) { - error = dump_holder(seq, gh); - if (error) - goto out; - } + list_for_each_entry(gh, &gl->gl_holders, gh_list) + dump_holder(seq, gh); + if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump) - error = glops->go_dump(seq, gl); -out: - return error; + glops->go_dump(seq, gl); } static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr) @@ -1951,7 +1943,8 @@ static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) { - return dump_glock(seq, iter_ptr); + dump_glock(seq, iter_ptr); + return 0; } static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos) diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 6647d77..32572f7 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -199,7 +199,7 @@ extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, struct gfs2_holder *gh); extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); -extern int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); +extern void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); #define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0) extern __printf(2, 3) void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index db908f6..3bf0631 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -133,7 +133,8 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) static void rgrp_go_sync(struct gfs2_glock *gl) { - struct address_space *metamapping = gfs2_glock2aspace(gl); + struct gfs2_sbd *sdp = gl->gl_sbd; + struct address_space *mapping = &sdp->sd_aspace; struct gfs2_rgrpd *rgd; int error; @@ -141,10 +142,10 @@ static void rgrp_go_sync(struct gfs2_glock *gl) return; GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); - gfs2_log_flush(gl->gl_sbd, gl); - filemap_fdatawrite(metamapping); - error = filemap_fdatawait(metamapping); - mapping_set_error(metamapping, error); + gfs2_log_flush(sdp, gl); + filemap_fdatawrite_range(mapping, gl->gl_vm.start, gl->gl_vm.end); + error = filemap_fdatawait_range(mapping, gl->gl_vm.start, gl->gl_vm.end); + mapping_set_error(mapping, error); gfs2_ail_empty_gl(gl); spin_lock(&gl->gl_spin); @@ -166,11 +167,12 @@ static void rgrp_go_sync(struct gfs2_glock *gl) static void rgrp_go_inval(struct gfs2_glock *gl, int flags) { - struct address_space *mapping = gfs2_glock2aspace(gl); + struct gfs2_sbd *sdp = gl->gl_sbd; + struct address_space *mapping = &sdp->sd_aspace; WARN_ON_ONCE(!(flags & DIO_METADATA)); - gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); - truncate_inode_pages(mapping, 0); + gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); + truncate_inode_pages_range(mapping, gl->gl_vm.start, gl->gl_vm.end); if (gl->gl_object) { struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object; @@ -192,8 +194,11 @@ static void inode_go_sync(struct gfs2_glock *gl) if (ip && !S_ISREG(ip->i_inode.i_mode)) ip = NULL; - if (ip && test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) - unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0); + if (ip) { + if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) + unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0); + inode_dio_wait(&ip->i_inode); + } if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) return; @@ -410,6 +415,9 @@ static int inode_go_lock(struct gfs2_holder *gh) return error; } + if (gh->gh_state != LM_ST_DEFERRED) + inode_dio_wait(&ip->i_inode); + if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) && (gl->gl_state == LM_ST_EXCLUSIVE) && (gh->gh_state == LM_ST_EXCLUSIVE)) { @@ -429,21 +437,19 @@ static int inode_go_lock(struct gfs2_holder *gh) * @seq: The iterator * @ip: the inode * - * Returns: 0 on success, -ENOBUFS when we run out of space */ -static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) +static void inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) { const struct gfs2_inode *ip = gl->gl_object; if (ip == NULL) - return 0; + return; gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n", (unsigned long long)ip->i_no_formal_ino, (unsigned long long)ip->i_no_addr, IF2DT(ip->i_inode.i_mode), ip->i_flags, (unsigned int)ip->i_diskflags, (unsigned long long)i_size_read(&ip->i_inode)); - return 0; } /** @@ -552,7 +558,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = { .go_unlock = gfs2_rgrp_go_unlock, .go_dump = gfs2_rgrp_dump, .go_type = LM_TYPE_RGRP, - .go_flags = GLOF_ASPACE | GLOF_LVB, + .go_flags = GLOF_LVB, }; const struct gfs2_glock_operations gfs2_trans_glops = { diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index ba1ea67..cf0e344 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -93,6 +93,7 @@ struct gfs2_rgrpd { struct gfs2_rgrp_lvb *rd_rgl; u32 rd_last_alloc; u32 rd_flags; + u32 rd_extfail_pt; /* extent failure point */ #define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ #define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */ #define GFS2_RDF_ERROR 0x40000000 /* error in rg */ @@ -217,7 +218,7 @@ struct gfs2_glock_operations { int (*go_demote_ok) (const struct gfs2_glock *gl); int (*go_lock) (struct gfs2_holder *gh); void (*go_unlock) (struct gfs2_holder *gh); - int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); + void (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); void (*go_callback)(struct gfs2_glock *gl, bool remote); const int go_type; const unsigned long go_flags; @@ -350,7 +351,15 @@ struct gfs2_glock { atomic_t gl_ail_count; atomic_t gl_revokes; struct delayed_work gl_work; - struct work_struct gl_delete; + union { + /* For inode and iopen glocks only */ + struct work_struct gl_delete; + /* For rgrp glocks only */ + struct { + loff_t start; + loff_t end; + } gl_vm; + }; struct rcu_head gl_rcu; }; @@ -419,10 +428,13 @@ enum { }; struct gfs2_quota_data { + struct hlist_bl_node qd_hlist; struct list_head qd_list; struct kqid qd_id; + struct gfs2_sbd *qd_sbd; struct lockref qd_lockref; struct list_head qd_lru; + unsigned qd_hash; unsigned long qd_flags; /* QDF_... */ @@ -441,6 +453,7 @@ struct gfs2_quota_data { u64 qd_sync_gen; unsigned long qd_last_warn; + struct rcu_head qd_rcu; }; struct gfs2_trans { @@ -720,13 +733,15 @@ struct gfs2_sbd { spinlock_t sd_trunc_lock; unsigned int sd_quota_slots; - unsigned int sd_quota_chunks; - unsigned char **sd_quota_bitmap; + unsigned long *sd_quota_bitmap; + spinlock_t sd_bitmap_lock; u64 sd_quota_sync_gen; /* Log stuff */ + struct address_space sd_aspace; + spinlock_t sd_log_lock; struct gfs2_trans *sd_log_tr; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 7119504..5c52418 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -149,7 +149,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, ip = GFS2_I(inode); if (!inode) - return ERR_PTR(-ENOBUFS); + return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -469,14 +469,36 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, brelse(dibh); } +/** + * gfs2_trans_da_blocks - Calculate number of blocks to link inode + * @dip: The directory we are linking into + * @da: The dir add information + * @nr_inodes: The number of inodes involved + * + * This calculate the number of blocks we need to reserve in a + * transaction to link @nr_inodes into a directory. In most cases + * @nr_inodes will be 2 (the directory plus the inode being linked in) + * but in case of rename, 4 may be required. + * + * Returns: Number of blocks + */ + +static unsigned gfs2_trans_da_blks(const struct gfs2_inode *dip, + const struct gfs2_diradd *da, + unsigned nr_inodes) +{ + return da->nr_blocks + gfs2_rg_blocks(dip, da->nr_blocks) + + (nr_inodes * RES_DINODE) + RES_QUOTA + RES_STATFS; +} + static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, - struct gfs2_inode *ip, int arq) + struct gfs2_inode *ip, struct gfs2_diradd *da) { struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, }; + struct gfs2_alloc_parms ap = { .target = da->nr_blocks, }; int error; - if (arq) { + if (da->nr_blocks) { error = gfs2_quota_lock_check(dip); if (error) goto fail_quota_locks; @@ -485,10 +507,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, if (error) goto fail_quota_locks; - error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - dip->i_rgd->rd_length + - 2 * RES_DINODE + - RES_STATFS + RES_QUOTA, 0); + error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(dip, da, 2), 0); if (error) goto fail_ipreserv; } else { @@ -497,7 +516,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, goto fail_quota_locks; } - error = gfs2_dir_add(&dip->i_inode, name, ip); + error = gfs2_dir_add(&dip->i_inode, name, ip, da); if (error) goto fail_end_trans; @@ -552,6 +571,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, unsigned int size, int excl, int *opened) { const struct qstr *name = &dentry->d_name; + struct posix_acl *default_acl, *acl; struct gfs2_holder ghs[2]; struct inode *inode = NULL; struct gfs2_inode *dip = GFS2_I(dir), *ip; @@ -560,7 +580,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, struct dentry *d; int error; u32 aflags = 0; - int arq; + struct gfs2_diradd da = { .bh = NULL, }; if (!name->len || name->len > GFS2_FNAMESIZE) return -ENAMETOOLONG; @@ -585,6 +605,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = PTR_ERR(inode); if (!IS_ERR(inode)) { d = d_splice_alias(inode, dentry); + error = PTR_ERR(d); + if (IS_ERR(d)) + goto fail_gunlock; error = 0; if (file) { if (S_ISREG(inode->i_mode)) { @@ -602,7 +625,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, goto fail_gunlock; } - arq = error = gfs2_diradd_alloc_required(dir, name); + error = gfs2_diradd_alloc_required(dir, name, &da); if (error < 0) goto fail_gunlock; @@ -611,10 +634,14 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (!inode) goto fail_gunlock; + error = posix_acl_create(dir, &mode, &default_acl, &acl); + if (error) + goto fail_free_vfs_inode; + ip = GFS2_I(inode); error = gfs2_rs_alloc(ip); if (error) - goto fail_free_inode; + goto fail_free_acls; inode->i_mode = mode; set_nlink(inode, S_ISDIR(mode) ? 2 : 1); @@ -682,7 +709,16 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, gfs2_set_iop(inode); insert_inode_hash(inode); - error = gfs2_acl_create(dip, inode); + if (default_acl) { + error = gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + posix_acl_release(default_acl); + } + if (acl) { + if (!error) + error = gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS); + posix_acl_release(acl); + } + if (error) goto fail_gunlock3; @@ -690,7 +726,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_gunlock3; - error = link_dinode(dip, name, ip, arq); + error = link_dinode(dip, name, ip, &da); if (error) goto fail_gunlock3; @@ -716,9 +752,16 @@ fail_free_inode: if (ip->i_gl) gfs2_glock_put(ip->i_gl); gfs2_rs_delete(ip, NULL); +fail_free_acls: + if (default_acl) + posix_acl_release(default_acl); + if (acl) + posix_acl_release(acl); +fail_free_vfs_inode: free_inode_nonrcu(inode); inode = NULL; fail_gunlock: + gfs2_dir_no_add(&da); gfs2_glock_dq_uninit(ghs); if (inode && !IS_ERR(inode)) { clear_nlink(inode); @@ -779,6 +822,11 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry, } d = d_splice_alias(inode, dentry); + if (IS_ERR(d)) { + iput(inode); + gfs2_glock_dq_uninit(&gh); + return d; + } if (file && S_ISREG(inode->i_mode)) error = finish_open(file, dentry, gfs2_open_common, opened); @@ -817,7 +865,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder ghs[2]; struct buffer_head *dibh; - int alloc_required; + struct gfs2_diradd da = { .bh = NULL, }; int error; if (S_ISDIR(inode->i_mode)) @@ -872,13 +920,12 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (ip->i_inode.i_nlink == (u32)-1) goto out_gunlock; - alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name); + error = gfs2_diradd_alloc_required(dir, &dentry->d_name, &da); if (error < 0) goto out_gunlock; - error = 0; - if (alloc_required) { - struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, }; + if (da.nr_blocks) { + struct gfs2_alloc_parms ap = { .target = da.nr_blocks, }; error = gfs2_quota_lock_check(dip); if (error) goto out_gunlock; @@ -887,10 +934,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (error) goto out_gunlock_q; - error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - gfs2_rg_blocks(dip, sdp->sd_max_dirres) + - 2 * RES_DINODE + RES_STATFS + - RES_QUOTA, 0); + error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(dip, &da, 2), 0); if (error) goto out_ipres; } else { @@ -903,7 +947,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (error) goto out_end_trans; - error = gfs2_dir_add(dir, &dentry->d_name, ip); + error = gfs2_dir_add(dir, &dentry->d_name, ip, &da); if (error) goto out_brelse; @@ -919,12 +963,13 @@ out_brelse: out_end_trans: gfs2_trans_end(sdp); out_ipres: - if (alloc_required) + if (da.nr_blocks) gfs2_inplace_release(dip); out_gunlock_q: - if (alloc_required) + if (da.nr_blocks) gfs2_quota_unlock(dip); out_gunlock: + gfs2_dir_no_add(&da); gfs2_glock_dq(ghs + 1); out_child: gfs2_glock_dq(ghs); @@ -1254,7 +1299,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, struct gfs2_rgrpd *nrgd; unsigned int num_gh; int dir_rename = 0; - int alloc_required = 0; + struct gfs2_diradd da = { .nr_blocks = 0, }; unsigned int x; int error; @@ -1388,14 +1433,14 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, goto out_gunlock; } - if (nip == NULL) - alloc_required = gfs2_diradd_alloc_required(ndir, &ndentry->d_name); - error = alloc_required; - if (error < 0) - goto out_gunlock; + if (nip == NULL) { + error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name, &da); + if (error) + goto out_gunlock; + } - if (alloc_required) { - struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, }; + if (da.nr_blocks) { + struct gfs2_alloc_parms ap = { .target = da.nr_blocks, }; error = gfs2_quota_lock_check(ndip); if (error) goto out_gunlock; @@ -1404,10 +1449,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (error) goto out_gunlock_q; - error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - gfs2_rg_blocks(ndip, sdp->sd_max_dirres) + - 4 * RES_DINODE + 4 * RES_LEAF + - RES_STATFS + RES_QUOTA + 4, 0); + error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(ndip, &da, 4) + + 4 * RES_LEAF + 4, 0); if (error) goto out_ipreserv; } else { @@ -1441,19 +1484,20 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (error) goto out_end_trans; - error = gfs2_dir_add(ndir, &ndentry->d_name, ip); + error = gfs2_dir_add(ndir, &ndentry->d_name, ip, &da); if (error) goto out_end_trans; out_end_trans: gfs2_trans_end(sdp); out_ipreserv: - if (alloc_required) + if (da.nr_blocks) gfs2_inplace_release(ndip); out_gunlock_q: - if (alloc_required) + if (da.nr_blocks) gfs2_quota_unlock(ndip); out_gunlock: + gfs2_dir_no_add(&da); while (x--) { gfs2_glock_dq(ghs + x); gfs2_holder_uninit(ghs + x); @@ -1607,10 +1651,22 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid)) ogid = ngid = NO_GID_QUOTA_CHANGE; - error = gfs2_quota_lock(ip, nuid, ngid); + error = get_write_access(inode); if (error) return error; + error = gfs2_rs_alloc(ip); + if (error) + goto out; + + error = gfs2_rindex_update(sdp); + if (error) + goto out; + + error = gfs2_quota_lock(ip, nuid, ngid); + if (error) + goto out; + if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) || !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) { error = gfs2_quota_check(ip, nuid, ngid); @@ -1637,6 +1693,8 @@ out_end_trans: gfs2_trans_end(sdp); out_gunlock_q: gfs2_quota_unlock(ip); +out: + put_write_access(inode); return error; } @@ -1678,10 +1736,11 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) error = gfs2_setattr_size(inode, attr->ia_size); else if (attr->ia_valid & (ATTR_UID | ATTR_GID)) error = setattr_chown(inode, attr); - else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode)) - error = gfs2_acl_chmod(ip, attr); - else + else { error = gfs2_setattr_simple(inode, attr); + if (!error && attr->ia_valid & ATTR_MODE) + error = posix_acl_chmod(inode, inode->i_mode); + } out: if (!error) @@ -1841,6 +1900,7 @@ const struct inode_operations gfs2_file_iops = { .removexattr = gfs2_removexattr, .fiemap = gfs2_fiemap, .get_acl = gfs2_get_acl, + .set_acl = gfs2_set_acl, }; const struct inode_operations gfs2_dir_iops = { @@ -1862,6 +1922,7 @@ const struct inode_operations gfs2_dir_iops = { .removexattr = gfs2_removexattr, .fiemap = gfs2_fiemap, .get_acl = gfs2_get_acl, + .set_acl = gfs2_set_acl, .atomic_open = gfs2_atomic_open, }; @@ -1877,6 +1938,5 @@ const struct inode_operations gfs2_symlink_iops = { .listxattr = gfs2_listxattr, .removexattr = gfs2_removexattr, .fiemap = gfs2_fiemap, - .get_acl = gfs2_get_acl, }; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 610613f..9dcb977 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -551,10 +551,10 @@ void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) struct buffer_head *bh = bd->bd_bh; struct gfs2_glock *gl = bd->bd_gl; - gfs2_remove_from_ail(bd); - bd->bd_bh = NULL; bh->b_private = NULL; bd->bd_blkno = bh->b_blocknr; + gfs2_remove_from_ail(bd); /* drops ref on bh */ + bd->bd_bh = NULL; bd->bd_ops = &gfs2_revoke_lops; sdp->sd_log_num_revoke++; atomic_inc(&gl->gl_revokes); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 010b9fb..7669379 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -83,6 +83,7 @@ static void maybe_release_space(struct gfs2_bufdata *bd) bd->bd_bh->b_data + bi->bi_offset, bi->bi_len); clear_bit(GBF_FULL, &bi->bi_flags); rgd->rd_free_clone = rgd->rd_free; + rgd->rd_extfail_pt = rgd->rd_free; } /** @@ -272,7 +273,7 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno) nrvecs = max(nrvecs/2, 1U); } - bio->bi_sector = blkno * (sb->s_blocksize >> 9); + bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9); bio->bi_bdev = sb->s_bdev; bio->bi_end_io = gfs2_end_log_write; bio->bi_private = sdp; @@ -588,8 +589,12 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, static void gfs2_meta_sync(struct gfs2_glock *gl) { struct address_space *mapping = gfs2_glock2aspace(gl); + struct gfs2_sbd *sdp = gl->gl_sbd; int error; + if (mapping == NULL) + mapping = &sdp->sd_aspace; + filemap_fdatawrite(mapping); error = filemap_fdatawait(mapping); diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 0650db2..c272e73 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -76,6 +76,7 @@ static int __init init_gfs2_fs(void) gfs2_str2qstr(&gfs2_qdot, "."); gfs2_str2qstr(&gfs2_qdotdot, ".."); + gfs2_quota_hash_init(); error = gfs2_sys_init(); if (error) diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 9324150..c7f2469 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -116,6 +116,9 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) unsigned long index; unsigned int bufnum; + if (mapping == NULL) + mapping = &sdp->sd_aspace; + shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift; index = blkno >> shift; /* convert block to page */ bufnum = blkno - (index << shift); /* block buf index within page */ @@ -258,6 +261,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int struct address_space *mapping = bh->b_page->mapping; struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); struct gfs2_bufdata *bd = bh->b_private; + int was_pinned = 0; if (test_clear_buffer_pinned(bh)) { trace_gfs2_pin(bd, 0); @@ -273,12 +277,16 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int tr->tr_num_databuf_rm++; } tr->tr_touched = 1; + was_pinned = 1; brelse(bh); } if (bd) { spin_lock(&sdp->sd_ail_lock); if (bd->bd_tr) { gfs2_trans_add_revoke(sdp, bd); + } else if (was_pinned) { + bh->b_private = NULL; + kmem_cache_free(gfs2_bufdata_cachep, bd); } spin_unlock(&sdp->sd_ail_lock); } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 82303b4..c6872d0 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -36,6 +36,7 @@ #include "log.h" #include "quota.h" #include "dir.h" +#include "meta_io.h" #include "trace_gfs2.h" #define DO 0 @@ -62,6 +63,7 @@ static void gfs2_tune_init(struct gfs2_tune *gt) static struct gfs2_sbd *init_sbd(struct super_block *sb) { struct gfs2_sbd *sdp; + struct address_space *mapping; sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL); if (!sdp) @@ -97,6 +99,18 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) init_waitqueue_head(&sdp->sd_quota_wait); INIT_LIST_HEAD(&sdp->sd_trunc_list); spin_lock_init(&sdp->sd_trunc_lock); + spin_lock_init(&sdp->sd_bitmap_lock); + + mapping = &sdp->sd_aspace; + + address_space_init_once(mapping); + mapping->a_ops = &gfs2_meta_aops; + mapping->host = sb->s_bdev->bd_inode; + mapping->flags = 0; + mapping_set_gfp_mask(mapping, GFP_NOFS); + mapping->private_data = NULL; + mapping->backing_dev_info = sb->s_bdi; + mapping->writeback_index = 0; spin_lock_init(&sdp->sd_log_lock); atomic_set(&sdp->sd_log_pinned, 0); @@ -217,14 +231,14 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) page = alloc_page(GFP_NOFS); if (unlikely(!page)) - return -ENOBUFS; + return -ENOMEM; ClearPageUptodate(page); ClearPageDirty(page); lock_page(page); bio = bio_alloc(GFP_NOFS, 1); - bio->bi_sector = sector * (sb->s_blocksize >> 9); + bio->bi_iter.bi_sector = sector * (sb->s_blocksize >> 9); bio->bi_bdev = sb->s_bdev; bio_add_page(bio, page, PAGE_SIZE, 0); @@ -956,40 +970,6 @@ fail: return error; } -static int init_threads(struct gfs2_sbd *sdp, int undo) -{ - struct task_struct *p; - int error = 0; - - if (undo) - goto fail_quotad; - - p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); - if (IS_ERR(p)) { - error = PTR_ERR(p); - fs_err(sdp, "can't start logd thread: %d\n", error); - return error; - } - sdp->sd_logd_process = p; - - p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); - if (IS_ERR(p)) { - error = PTR_ERR(p); - fs_err(sdp, "can't start quotad thread: %d\n", error); - goto fail; - } - sdp->sd_quotad_process = p; - - return 0; - - -fail_quotad: - kthread_stop(sdp->sd_quotad_process); -fail: - kthread_stop(sdp->sd_logd_process); - return error; -} - static const match_table_t nolock_tokens = { { Opt_jid, "jid=%d\n", }, { Opt_err, NULL }, @@ -1254,15 +1234,11 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent goto fail_per_node; } - error = init_threads(sdp, DO); - if (error) - goto fail_per_node; - if (!(sb->s_flags & MS_RDONLY)) { error = gfs2_make_fs_rw(sdp); if (error) { fs_err(sdp, "can't make FS RW: %d\n", error); - goto fail_threads; + goto fail_per_node; } } @@ -1270,8 +1246,6 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent gfs2_online_uevent(sdp); return 0; -fail_threads: - init_threads(sdp, UNDO); fail_per_node: init_per_node(sdp, UNDO); fail_inodes: @@ -1366,8 +1340,18 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, if (IS_ERR(s)) goto error_bdev; - if (s->s_root) + if (s->s_root) { + /* + * s_umount nests inside bd_mutex during + * __invalidate_device(). blkdev_put() acquires + * bd_mutex and can't be called under s_umount. Drop + * s_umount temporarily. This is safe as we're + * holding an active reference. + */ + up_write(&s->s_umount); blkdev_put(bdev, mode); + down_write(&s->s_umount); + } memset(&args, 0, sizeof(args)); args.ar_quota = GFS2_QUOTA_DEFAULT; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 98236d0..8bec0e31 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -52,6 +52,11 @@ #include <linux/dqblk_xfs.h> #include <linux/lockref.h> #include <linux/list_lru.h> +#include <linux/rcupdate.h> +#include <linux/rculist_bl.h> +#include <linux/bit_spinlock.h> +#include <linux/jhash.h> +#include <linux/vmalloc.h> #include "gfs2.h" #include "incore.h" @@ -67,16 +72,44 @@ #include "inode.h" #include "util.h" -struct gfs2_quota_change_host { - u64 qc_change; - u32 qc_flags; /* GFS2_QCF_... */ - struct kqid qc_id; -}; +#define GFS2_QD_HASH_SHIFT 12 +#define GFS2_QD_HASH_SIZE (1 << GFS2_QD_HASH_SHIFT) +#define GFS2_QD_HASH_MASK (GFS2_QD_HASH_SIZE - 1) -/* Lock order: qd_lock -> qd->lockref.lock -> lru lock */ +/* Lock order: qd_lock -> bucket lock -> qd->lockref.lock -> lru lock */ +/* -> sd_bitmap_lock */ static DEFINE_SPINLOCK(qd_lock); struct list_lru gfs2_qd_lru; +static struct hlist_bl_head qd_hash_table[GFS2_QD_HASH_SIZE]; + +static unsigned int gfs2_qd_hash(const struct gfs2_sbd *sdp, + const struct kqid qid) +{ + unsigned int h; + + h = jhash(&sdp, sizeof(struct gfs2_sbd *), 0); + h = jhash(&qid, sizeof(struct kqid), h); + + return h & GFS2_QD_HASH_MASK; +} + +static inline void spin_lock_bucket(unsigned int hash) +{ + hlist_bl_lock(&qd_hash_table[hash]); +} + +static inline void spin_unlock_bucket(unsigned int hash) +{ + hlist_bl_unlock(&qd_hash_table[hash]); +} + +static void gfs2_qd_dealloc(struct rcu_head *rcu) +{ + struct gfs2_quota_data *qd = container_of(rcu, struct gfs2_quota_data, qd_rcu); + kmem_cache_free(gfs2_quotad_cachep, qd); +} + static void gfs2_qd_dispose(struct list_head *list) { struct gfs2_quota_data *qd; @@ -93,6 +126,10 @@ static void gfs2_qd_dispose(struct list_head *list) list_del(&qd->qd_list); spin_unlock(&qd_lock); + spin_lock_bucket(qd->qd_hash); + hlist_bl_del_rcu(&qd->qd_hlist); + spin_unlock_bucket(qd->qd_hash); + gfs2_assert_warn(sdp, !qd->qd_change); gfs2_assert_warn(sdp, !qd->qd_slot_count); gfs2_assert_warn(sdp, !qd->qd_bh_count); @@ -101,7 +138,7 @@ static void gfs2_qd_dispose(struct list_head *list) atomic_dec(&sdp->sd_quota_count); /* Delete it from the common reclaim list */ - kmem_cache_free(gfs2_quotad_cachep, qd); + call_rcu(&qd->qd_rcu, gfs2_qd_dealloc); } } @@ -171,83 +208,95 @@ static u64 qd2offset(struct gfs2_quota_data *qd) return offset; } -static int qd_alloc(struct gfs2_sbd *sdp, struct kqid qid, - struct gfs2_quota_data **qdp) +static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, struct kqid qid) { struct gfs2_quota_data *qd; int error; qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS); if (!qd) - return -ENOMEM; + return NULL; + qd->qd_sbd = sdp; qd->qd_lockref.count = 1; spin_lock_init(&qd->qd_lockref.lock); qd->qd_id = qid; qd->qd_slot = -1; INIT_LIST_HEAD(&qd->qd_lru); + qd->qd_hash = hash; error = gfs2_glock_get(sdp, qd2index(qd), &gfs2_quota_glops, CREATE, &qd->qd_gl); if (error) goto fail; - *qdp = qd; - - return 0; + return qd; fail: kmem_cache_free(gfs2_quotad_cachep, qd); - return error; + return NULL; } -static int qd_get(struct gfs2_sbd *sdp, struct kqid qid, - struct gfs2_quota_data **qdp) +static struct gfs2_quota_data *gfs2_qd_search_bucket(unsigned int hash, + const struct gfs2_sbd *sdp, + struct kqid qid) { - struct gfs2_quota_data *qd = NULL, *new_qd = NULL; - int error, found; - - *qdp = NULL; + struct gfs2_quota_data *qd; + struct hlist_bl_node *h; - for (;;) { - found = 0; - spin_lock(&qd_lock); - list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { - if (qid_eq(qd->qd_id, qid) && - lockref_get_not_dead(&qd->qd_lockref)) { - list_lru_del(&gfs2_qd_lru, &qd->qd_lru); - found = 1; - break; - } + hlist_bl_for_each_entry_rcu(qd, h, &qd_hash_table[hash], qd_hlist) { + if (!qid_eq(qd->qd_id, qid)) + continue; + if (qd->qd_sbd != sdp) + continue; + if (lockref_get_not_dead(&qd->qd_lockref)) { + list_lru_del(&gfs2_qd_lru, &qd->qd_lru); + return qd; } + } - if (!found) - qd = NULL; + return NULL; +} - if (!qd && new_qd) { - qd = new_qd; - list_add(&qd->qd_list, &sdp->sd_quota_list); - atomic_inc(&sdp->sd_quota_count); - new_qd = NULL; - } - spin_unlock(&qd_lock); +static int qd_get(struct gfs2_sbd *sdp, struct kqid qid, + struct gfs2_quota_data **qdp) +{ + struct gfs2_quota_data *qd, *new_qd; + unsigned int hash = gfs2_qd_hash(sdp, qid); - if (qd) { - if (new_qd) { - gfs2_glock_put(new_qd->qd_gl); - kmem_cache_free(gfs2_quotad_cachep, new_qd); - } - *qdp = qd; - return 0; - } + rcu_read_lock(); + *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid); + rcu_read_unlock(); - error = qd_alloc(sdp, qid, &new_qd); - if (error) - return error; + if (qd) + return 0; + + new_qd = qd_alloc(hash, sdp, qid); + if (!new_qd) + return -ENOMEM; + + spin_lock(&qd_lock); + spin_lock_bucket(hash); + *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid); + if (qd == NULL) { + *qdp = new_qd; + list_add(&new_qd->qd_list, &sdp->sd_quota_list); + hlist_bl_add_head_rcu(&new_qd->qd_hlist, &qd_hash_table[hash]); + atomic_inc(&sdp->sd_quota_count); } + spin_unlock_bucket(hash); + spin_unlock(&qd_lock); + + if (qd) { + gfs2_glock_put(new_qd->qd_gl); + kmem_cache_free(gfs2_quotad_cachep, new_qd); + } + + return 0; } + static void qd_hold(struct gfs2_quota_data *qd) { struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; @@ -268,88 +317,48 @@ static void qd_put(struct gfs2_quota_data *qd) static int slot_get(struct gfs2_quota_data *qd) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; - unsigned int c, o = 0, b; - unsigned char byte = 0; + struct gfs2_sbd *sdp = qd->qd_sbd; + unsigned int bit; + int error = 0; - spin_lock(&qd_lock); + spin_lock(&sdp->sd_bitmap_lock); + if (qd->qd_slot_count != 0) + goto out; - if (qd->qd_slot_count++) { - spin_unlock(&qd_lock); - return 0; + error = -ENOSPC; + bit = find_first_zero_bit(sdp->sd_quota_bitmap, sdp->sd_quota_slots); + if (bit < sdp->sd_quota_slots) { + set_bit(bit, sdp->sd_quota_bitmap); + qd->qd_slot = bit; +out: + qd->qd_slot_count++; } + spin_unlock(&sdp->sd_bitmap_lock); - for (c = 0; c < sdp->sd_quota_chunks; c++) - for (o = 0; o < PAGE_SIZE; o++) { - byte = sdp->sd_quota_bitmap[c][o]; - if (byte != 0xFF) - goto found; - } - - goto fail; - -found: - for (b = 0; b < 8; b++) - if (!(byte & (1 << b))) - break; - qd->qd_slot = c * (8 * PAGE_SIZE) + o * 8 + b; - - if (qd->qd_slot >= sdp->sd_quota_slots) - goto fail; - - sdp->sd_quota_bitmap[c][o] |= 1 << b; - - spin_unlock(&qd_lock); - - return 0; - -fail: - qd->qd_slot_count--; - spin_unlock(&qd_lock); - return -ENOSPC; + return error; } static void slot_hold(struct gfs2_quota_data *qd) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_sbd *sdp = qd->qd_sbd; - spin_lock(&qd_lock); + spin_lock(&sdp->sd_bitmap_lock); gfs2_assert(sdp, qd->qd_slot_count); qd->qd_slot_count++; - spin_unlock(&qd_lock); -} - -static void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap, - unsigned int bit, int new_value) -{ - unsigned int c, o, b = bit; - int old_value; - - c = b / (8 * PAGE_SIZE); - b %= 8 * PAGE_SIZE; - o = b / 8; - b %= 8; - - old_value = (bitmap[c][o] & (1 << b)); - gfs2_assert_withdraw(sdp, !old_value != !new_value); - - if (new_value) - bitmap[c][o] |= 1 << b; - else - bitmap[c][o] &= ~(1 << b); + spin_unlock(&sdp->sd_bitmap_lock); } static void slot_put(struct gfs2_quota_data *qd) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_sbd *sdp = qd->qd_sbd; - spin_lock(&qd_lock); + spin_lock(&sdp->sd_bitmap_lock); gfs2_assert(sdp, qd->qd_slot_count); if (!--qd->qd_slot_count) { - gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0); + BUG_ON(!test_and_clear_bit(qd->qd_slot, sdp->sd_quota_bitmap)); qd->qd_slot = -1; } - spin_unlock(&qd_lock); + spin_unlock(&sdp->sd_bitmap_lock); } static int bh_get(struct gfs2_quota_data *qd) @@ -427,8 +436,7 @@ static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd, list_move_tail(&qd->qd_list, &sdp->sd_quota_list); set_bit(QDF_LOCKED, &qd->qd_flags); qd->qd_change_sync = qd->qd_change; - gfs2_assert_warn(sdp, qd->qd_slot_count); - qd->qd_slot_count++; + slot_hold(qd); return 1; } @@ -1214,17 +1222,6 @@ int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid) return error; } -static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *buf) -{ - const struct gfs2_quota_change *str = buf; - - qc->qc_change = be64_to_cpu(str->qc_change); - qc->qc_flags = be32_to_cpu(str->qc_flags); - qc->qc_id = make_kqid(&init_user_ns, - (qc->qc_flags & GFS2_QCF_USER)?USRQUOTA:GRPQUOTA, - be32_to_cpu(str->qc_id)); -} - int gfs2_quota_init(struct gfs2_sbd *sdp) { struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); @@ -1232,6 +1229,8 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift; unsigned int x, slot = 0; unsigned int found = 0; + unsigned int hash; + unsigned int bm_size; u64 dblock; u32 extlen = 0; int error; @@ -1240,23 +1239,20 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) return -EIO; sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block; - sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE); - + bm_size = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * sizeof(unsigned long)); + bm_size *= sizeof(unsigned long); error = -ENOMEM; - - sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks, - sizeof(unsigned char *), GFP_NOFS); + sdp->sd_quota_bitmap = kmalloc(bm_size, GFP_NOFS|__GFP_NOWARN); + if (sdp->sd_quota_bitmap == NULL) + sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS, PAGE_KERNEL); if (!sdp->sd_quota_bitmap) return error; - for (x = 0; x < sdp->sd_quota_chunks; x++) { - sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_NOFS); - if (!sdp->sd_quota_bitmap[x]) - goto fail; - } + memset(sdp->sd_quota_bitmap, 0, bm_size); for (x = 0; x < blocks; x++) { struct buffer_head *bh; + const struct gfs2_quota_change *qc; unsigned int y; if (!extlen) { @@ -1274,34 +1270,42 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) goto fail; } + qc = (const struct gfs2_quota_change *)(bh->b_data + sizeof(struct gfs2_meta_header)); for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots; y++, slot++) { - struct gfs2_quota_change_host qc; struct gfs2_quota_data *qd; - - gfs2_quota_change_in(&qc, bh->b_data + - sizeof(struct gfs2_meta_header) + - y * sizeof(struct gfs2_quota_change)); - if (!qc.qc_change) + s64 qc_change = be64_to_cpu(qc->qc_change); + u32 qc_flags = be32_to_cpu(qc->qc_flags); + enum quota_type qtype = (qc_flags & GFS2_QCF_USER) ? + USRQUOTA : GRPQUOTA; + struct kqid qc_id = make_kqid(&init_user_ns, qtype, + be32_to_cpu(qc->qc_id)); + qc++; + if (!qc_change) continue; - error = qd_alloc(sdp, qc.qc_id, &qd); - if (error) { + hash = gfs2_qd_hash(sdp, qc_id); + qd = qd_alloc(hash, sdp, qc_id); + if (qd == NULL) { brelse(bh); goto fail; } set_bit(QDF_CHANGE, &qd->qd_flags); - qd->qd_change = qc.qc_change; + qd->qd_change = qc_change; qd->qd_slot = slot; qd->qd_slot_count = 1; spin_lock(&qd_lock); - gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1); + BUG_ON(test_and_set_bit(slot, sdp->sd_quota_bitmap)); list_add(&qd->qd_list, &sdp->sd_quota_list); atomic_inc(&sdp->sd_quota_count); spin_unlock(&qd_lock); + spin_lock_bucket(hash); + hlist_bl_add_head_rcu(&qd->qd_hlist, &qd_hash_table[hash]); + spin_unlock_bucket(hash); + found++; } @@ -1324,44 +1328,28 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) { struct list_head *head = &sdp->sd_quota_list; struct gfs2_quota_data *qd; - unsigned int x; spin_lock(&qd_lock); while (!list_empty(head)) { qd = list_entry(head->prev, struct gfs2_quota_data, qd_list); - /* - * To be removed in due course... we should be able to - * ensure that all refs to the qd have done by this point - * so that this rather odd test is not required - */ - spin_lock(&qd->qd_lockref.lock); - if (qd->qd_lockref.count > 1 || - (qd->qd_lockref.count && !test_bit(QDF_CHANGE, &qd->qd_flags))) { - spin_unlock(&qd->qd_lockref.lock); - list_move(&qd->qd_list, head); - spin_unlock(&qd_lock); - schedule(); - spin_lock(&qd_lock); - continue; - } - spin_unlock(&qd->qd_lockref.lock); - list_del(&qd->qd_list); + /* Also remove if this qd exists in the reclaim list */ list_lru_del(&gfs2_qd_lru, &qd->qd_lru); atomic_dec(&sdp->sd_quota_count); spin_unlock(&qd_lock); - if (!qd->qd_lockref.count) { - gfs2_assert_warn(sdp, !qd->qd_change); - gfs2_assert_warn(sdp, !qd->qd_slot_count); - } else - gfs2_assert_warn(sdp, qd->qd_slot_count == 1); + spin_lock_bucket(qd->qd_hash); + hlist_bl_del_rcu(&qd->qd_hlist); + spin_unlock_bucket(qd->qd_hash); + + gfs2_assert_warn(sdp, !qd->qd_change); + gfs2_assert_warn(sdp, !qd->qd_slot_count); gfs2_assert_warn(sdp, !qd->qd_bh_count); gfs2_glock_put(qd->qd_gl); - kmem_cache_free(gfs2_quotad_cachep, qd); + call_rcu(&qd->qd_rcu, gfs2_qd_dealloc); spin_lock(&qd_lock); } @@ -1370,9 +1358,11 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count)); if (sdp->sd_quota_bitmap) { - for (x = 0; x < sdp->sd_quota_chunks; x++) - kfree(sdp->sd_quota_bitmap[x]); - kfree(sdp->sd_quota_bitmap); + if (is_vmalloc_addr(sdp->sd_quota_bitmap)) + vfree(sdp->sd_quota_bitmap); + else + kfree(sdp->sd_quota_bitmap); + sdp->sd_quota_bitmap = NULL; } } @@ -1656,3 +1646,11 @@ const struct quotactl_ops gfs2_quotactl_ops = { .get_dqblk = gfs2_get_dqblk, .set_dqblk = gfs2_set_dqblk, }; + +void __init gfs2_quota_hash_init(void) +{ + unsigned i; + + for(i = 0; i < GFS2_QD_HASH_SIZE; i++) + INIT_HLIST_BL_HEAD(&qd_hash_table[i]); +} diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 96e4f34a..55d506e 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -57,5 +57,6 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) extern const struct quotactl_ops gfs2_quotactl_ops; extern struct shrinker gfs2_qd_shrinker; extern struct list_lru gfs2_qd_lru; +extern void __init gfs2_quota_hash_init(void); #endif /* __QUOTA_DOT_H__ */ diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index c8d6161..a1da213 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -57,6 +57,11 @@ * 3 = Used (metadata) */ +struct gfs2_extent { + struct gfs2_rbm rbm; + u32 len; +}; + static const char valid_change[16] = { /* current */ /* n */ 0, 1, 1, 1, @@ -65,8 +70,9 @@ static const char valid_change[16] = { 1, 0, 0, 0 }; -static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext, - const struct gfs2_inode *ip, bool nowrap); +static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, + const struct gfs2_inode *ip, bool nowrap, + const struct gfs2_alloc_parms *ap); /** @@ -635,9 +641,13 @@ static void __rs_deltree(struct gfs2_blkreserv *rs) /* return reserved blocks to the rgrp */ BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free); rs->rs_rbm.rgd->rd_reserved -= rs->rs_free; + /* The rgrp extent failure point is likely not to increase; + it will only do so if the freed blocks are somehow + contiguous with a span of free blocks that follows. Still, + it will force the number to be recalculated later. */ + rgd->rd_extfail_pt += rs->rs_free; rs->rs_free = 0; clear_bit(GBF_FULL, &bi->bi_flags); - smp_mb__after_clear_bit(); } } @@ -876,6 +886,7 @@ static int rgd_insert(struct gfs2_rgrpd *rgd) static int read_rindex_entry(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + const unsigned bsize = sdp->sd_sb.sb_bsize; loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex); struct gfs2_rindex buf; int error; @@ -913,6 +924,8 @@ static int read_rindex_entry(struct gfs2_inode *ip) goto fail; rgd->rd_gl->gl_object = rgd; + rgd->rd_gl->gl_vm.start = rgd->rd_addr * bsize; + rgd->rd_gl->gl_vm.end = rgd->rd_gl->gl_vm.start + (rgd->rd_length * bsize) - 1; rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr; rgd->rd_flags &= ~GFS2_RDF_UPTODATE; if (rgd->rd_data > sdp->sd_max_rg_data) @@ -1126,6 +1139,8 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); rgd->rd_free_clone = rgd->rd_free; + /* max out the rgrp allocation failure point */ + rgd->rd_extfail_pt = rgd->rd_free; } if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) { rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd)); @@ -1184,7 +1199,7 @@ int gfs2_rgrp_go_lock(struct gfs2_holder *gh) if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb) return 0; - return gfs2_rgrp_bh_get((struct gfs2_rgrpd *)gh->gh_gl->gl_object); + return gfs2_rgrp_bh_get(rgd); } /** @@ -1455,7 +1470,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, if (WARN_ON(gfs2_rbm_from_block(&rbm, goal))) return; - ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, extlen, ip, true); + ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true, ap); if (ret == 0) { rs->rs_rbm = rbm; rs->rs_free = extlen; @@ -1520,6 +1535,7 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block, * @rbm: The current position in the resource group * @ip: The inode for which we are searching for blocks * @minext: The minimum extent length + * @maxext: A pointer to the maximum extent structure * * This checks the current position in the rgrp to see whether there is * a reservation covering this block. If not then this function is a @@ -1532,7 +1548,8 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block, static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm, const struct gfs2_inode *ip, - u32 minext) + u32 minext, + struct gfs2_extent *maxext) { u64 block = gfs2_rbm_to_block(rbm); u32 extlen = 1; @@ -1545,8 +1562,7 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm, */ if (minext) { extlen = gfs2_free_extlen(rbm, minext); - nblock = block + extlen; - if (extlen < minext) + if (extlen <= maxext->len) goto fail; } @@ -1555,9 +1571,17 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm, * and skip if parts of it are already reserved */ nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip); - if (nblock == block) - return 0; + if (nblock == block) { + if (!minext || extlen >= minext) + return 0; + + if (extlen > maxext->len) { + maxext->len = extlen; + maxext->rbm = *rbm; + } fail: + nblock = block + extlen; + } ret = gfs2_rbm_from_block(rbm, nblock); if (ret < 0) return ret; @@ -1568,30 +1592,38 @@ fail: * gfs2_rbm_find - Look for blocks of a particular state * @rbm: Value/result starting position and final position * @state: The state which we want to find - * @minext: The requested extent length (0 for a single block) + * @minext: Pointer to the requested extent length (NULL for a single block) + * This is updated to be the actual reservation size. * @ip: If set, check for reservations * @nowrap: Stop looking at the end of the rgrp, rather than wrapping * around until we've reached the starting point. + * @ap: the allocation parameters * * Side effects: * - If looking for free blocks, we set GBF_FULL on each bitmap which * has no free blocks in it. + * - If looking for free blocks, we set rd_extfail_pt on each rgrp which + * has come up short on a free block search. * * Returns: 0 on success, -ENOSPC if there is no block of the requested state */ -static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext, - const struct gfs2_inode *ip, bool nowrap) +static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, + const struct gfs2_inode *ip, bool nowrap, + const struct gfs2_alloc_parms *ap) { struct buffer_head *bh; int initial_bii; u32 initial_offset; + int first_bii = rbm->bii; + u32 first_offset = rbm->offset; u32 offset; u8 *buffer; int n = 0; int iters = rbm->rgd->rd_length; int ret; struct gfs2_bitmap *bi; + struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, }; /* If we are not starting at the beginning of a bitmap, then we * need to add one to the bitmap count to ensure that we search @@ -1620,7 +1652,9 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext, return 0; initial_bii = rbm->bii; - ret = gfs2_reservation_check_and_update(rbm, ip, minext); + ret = gfs2_reservation_check_and_update(rbm, ip, + minext ? *minext : 0, + &maxext); if (ret == 0) return 0; if (ret > 0) { @@ -1655,6 +1689,24 @@ next_iter: break; } + if (minext == NULL || state != GFS2_BLKST_FREE) + return -ENOSPC; + + /* If the extent was too small, and it's smaller than the smallest + to have failed before, remember for future reference that it's + useless to search this rgrp again for this amount or more. */ + if ((first_offset == 0) && (first_bii == 0) && + (*minext < rbm->rgd->rd_extfail_pt)) + rbm->rgd->rd_extfail_pt = *minext; + + /* If the maximum extent we found is big enough to fulfill the + minimum requirements, use it anyway. */ + if (maxext.len) { + *rbm = maxext.rbm; + *minext = maxext.len; + return 0; + } + return -ENOSPC; } @@ -1680,7 +1732,8 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip while (1) { down_write(&sdp->sd_log_flush_lock); - error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, 0, NULL, true); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL, + true, NULL); up_write(&sdp->sd_log_flush_lock); if (error == -ENOSPC) break; @@ -1891,7 +1944,9 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a } /* Skip unuseable resource groups */ - if (rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) + if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC | + GFS2_RDF_ERROR)) || + (ap->target > rs->rs_rbm.rgd->rd_extfail_pt)) goto skip_rgrp; if (sdp->sd_args.ar_rgrplvb) @@ -1911,15 +1966,16 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a return 0; } - /* Drop reservation, if we couldn't use reserved rgrp */ - if (gfs2_rs_active(rs)) - gfs2_rs_deltree(rs); check_rgrp: /* Check for unlinked inodes which can be reclaimed */ if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK) try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked, ip->i_no_addr); skip_rgrp: + /* Drop reservation, if we couldn't use reserved rgrp */ + if (gfs2_rs_active(rs)) + gfs2_rs_deltree(rs); + /* Unlock rgrp if required */ if (!rg_locked) gfs2_glock_dq_uninit(&rs->rs_rgd_gh); @@ -2064,25 +2120,24 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, * */ -int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) +void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) { struct gfs2_rgrpd *rgd = gl->gl_object; struct gfs2_blkreserv *trs; const struct rb_node *n; if (rgd == NULL) - return 0; - gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u\n", + return; + gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n", (unsigned long long)rgd->rd_addr, rgd->rd_flags, rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes, - rgd->rd_reserved); + rgd->rd_reserved, rgd->rd_extfail_pt); spin_lock(&rgd->rd_rsspin); for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) { trs = rb_entry(n, struct gfs2_blkreserv, rs_node); dump_rs(seq, trs); } spin_unlock(&rgd->rd_rsspin); - return 0; } static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd) @@ -2184,18 +2239,20 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, int error; gfs2_set_alloc_start(&rbm, ip, dinode); - error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, ip, false); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false, NULL); if (error == -ENOSPC) { gfs2_set_alloc_start(&rbm, ip, dinode); - error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, NULL, false); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false, + NULL); } /* Since all blocks are reserved in advance, this shouldn't happen */ if (error) { - fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d\n", + fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n", (unsigned long long)ip->i_no_addr, error, *nblocks, - test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags)); + test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags), + rbm.rgd->rd_extfail_pt); goto rgrp_error; } diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 3a10d2f..463ab2e 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -68,7 +68,7 @@ extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); -extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); +extern void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, struct buffer_head *bh, const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 35da5b1..60f60f6 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -369,6 +369,33 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd) return 0; } +static int init_threads(struct gfs2_sbd *sdp) +{ + struct task_struct *p; + int error = 0; + + p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); + if (IS_ERR(p)) { + error = PTR_ERR(p); + fs_err(sdp, "can't start logd thread: %d\n", error); + return error; + } + sdp->sd_logd_process = p; + + p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); + if (IS_ERR(p)) { + error = PTR_ERR(p); + fs_err(sdp, "can't start quotad thread: %d\n", error); + goto fail; + } + sdp->sd_quotad_process = p; + return 0; + +fail: + kthread_stop(sdp->sd_logd_process); + return error; +} + /** * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one * @sdp: the filesystem @@ -384,10 +411,14 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) struct gfs2_log_header_host head; int error; - error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &t_gh); + error = init_threads(sdp); if (error) return error; + error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &t_gh); + if (error) + goto fail_threads; + j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); error = gfs2_find_jhead(sdp->sd_jdesc, &head); @@ -417,7 +448,9 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) fail: t_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_uninit(&t_gh); - +fail_threads: + kthread_stop(sdp->sd_quotad_process); + kthread_stop(sdp->sd_logd_process); return error; } @@ -800,6 +833,9 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) struct gfs2_holder t_gh; int error; + kthread_stop(sdp->sd_quotad_process); + kthread_stop(sdp->sd_logd_process); + flush_workqueue(gfs2_delete_workqueue); gfs2_quota_sync(sdp->sd_vfs, 0); gfs2_statfs_sync(sdp->sd_vfs, 0); @@ -857,9 +893,6 @@ restart: } spin_unlock(&sdp->sd_jindex_spin); - kthread_stop(sdp->sd_quotad_process); - kthread_stop(sdp->sd_logd_process); - if (!(sb->s_flags & MS_RDONLY)) { error = gfs2_make_fs_ro(sdp); if (error) diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 8c6a6f6..0b81f78 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -13,6 +13,7 @@ #include <linux/buffer_head.h> #include <linux/xattr.h> #include <linux/gfs2_ondisk.h> +#include <linux/posix_acl_xattr.h> #include <asm/uaccess.h> #include "gfs2.h" @@ -1500,7 +1501,8 @@ static const struct xattr_handler gfs2_xattr_security_handler = { const struct xattr_handler *gfs2_xattr_handlers[] = { &gfs2_xattr_user_handler, &gfs2_xattr_security_handler, - &gfs2_xattr_system_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, NULL, }; diff --git a/fs/hfsplus/acl.h b/fs/hfsplus/acl.h index 07c0d49..95c8ed9 100644 --- a/fs/hfsplus/acl.h +++ b/fs/hfsplus/acl.h @@ -12,16 +12,13 @@ /* posix_acl.c */ struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type); -extern int hfsplus_posix_acl_chmod(struct inode *); +int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, + int type); extern int hfsplus_init_posix_acl(struct inode *, struct inode *); #else /* CONFIG_HFSPLUS_FS_POSIX_ACL */ #define hfsplus_get_posix_acl NULL - -static inline int hfsplus_posix_acl_chmod(struct inode *inode) -{ - return 0; -} +#define hfsplus_set_posix_acl NULL static inline int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir) { diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 4a4fea0..9ee6298 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -532,6 +532,7 @@ const struct inode_operations hfsplus_dir_inode_operations = { .removexattr = hfsplus_removexattr, #ifdef CONFIG_HFSPLUS_FS_POSIX_ACL .get_acl = hfsplus_get_posix_acl, + .set_acl = hfsplus_set_posix_acl, #endif }; diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 37213d0..4551cbd 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -178,64 +178,6 @@ const struct dentry_operations hfsplus_dentry_operations = { .d_compare = hfsplus_compare_dentry, }; -static struct dentry *hfsplus_file_lookup(struct inode *dir, - struct dentry *dentry, unsigned int flags) -{ - struct hfs_find_data fd; - struct super_block *sb = dir->i_sb; - struct inode *inode = NULL; - struct hfsplus_inode_info *hip; - int err; - - if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc")) - goto out; - - inode = HFSPLUS_I(dir)->rsrc_inode; - if (inode) - goto out; - - inode = new_inode(sb); - if (!inode) - return ERR_PTR(-ENOMEM); - - hip = HFSPLUS_I(inode); - inode->i_ino = dir->i_ino; - INIT_LIST_HEAD(&hip->open_dir_list); - mutex_init(&hip->extents_lock); - hip->extent_state = 0; - hip->flags = 0; - hip->userflags = 0; - set_bit(HFSPLUS_I_RSRC, &hip->flags); - - err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); - if (!err) { - err = hfsplus_find_cat(sb, dir->i_ino, &fd); - if (!err) - err = hfsplus_cat_read_inode(inode, &fd); - hfs_find_exit(&fd); - } - if (err) { - iput(inode); - return ERR_PTR(err); - } - hip->rsrc_inode = dir; - HFSPLUS_I(dir)->rsrc_inode = inode; - igrab(dir); - - /* - * __mark_inode_dirty expects inodes to be hashed. Since we don't - * want resource fork inodes in the regular inode space, we make them - * appear hashed, but do not put on any lists. hlist_del() - * will work fine and require no locking. - */ - hlist_add_fake(&inode->i_hash); - - mark_inode_dirty(inode); -out: - d_add(dentry, inode); - return NULL; -} - static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, int dir) { @@ -319,7 +261,7 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr) mark_inode_dirty(inode); if (attr->ia_valid & ATTR_MODE) { - error = hfsplus_posix_acl_chmod(inode); + error = posix_acl_chmod(inode, inode->i_mode); if (unlikely(error)) return error; } @@ -385,7 +327,6 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, } static const struct inode_operations hfsplus_file_inode_operations = { - .lookup = hfsplus_file_lookup, .setattr = hfsplus_setattr, .setxattr = generic_setxattr, .getxattr = generic_getxattr, @@ -393,6 +334,7 @@ static const struct inode_operations hfsplus_file_inode_operations = { .removexattr = hfsplus_removexattr, #ifdef CONFIG_HFSPLUS_FS_POSIX_ACL .get_acl = hfsplus_get_posix_acl, + .set_acl = hfsplus_set_posix_acl, #endif }; diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c index b609cc1..df0c9af 100644 --- a/fs/hfsplus/posix_acl.c +++ b/fs/hfsplus/posix_acl.c @@ -17,9 +17,7 @@ struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type) char *value = NULL; ssize_t size; - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; + hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino); switch (type) { case ACL_TYPE_ACCESS: @@ -56,17 +54,15 @@ struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type) return acl; } -static int hfsplus_set_posix_acl(struct inode *inode, - int type, - struct posix_acl *acl) +int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, + int type) { int err; char *xattr_name; size_t size = 0; char *value = NULL; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; + hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino); switch (type) { case ACL_TYPE_ACCESS: @@ -115,7 +111,7 @@ end_set_acl: int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir) { int err = 0; - struct posix_acl *acl = NULL; + struct posix_acl *default_acl, *acl; hfs_dbg(ACL_MOD, "[%s]: ino %lu, dir->ino %lu\n", @@ -124,151 +120,21 @@ int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir) if (S_ISLNK(inode->i_mode)) return 0; - acl = hfsplus_get_posix_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - - if (acl) { - if (S_ISDIR(inode->i_mode)) { - err = hfsplus_set_posix_acl(inode, - ACL_TYPE_DEFAULT, - acl); - if (unlikely(err)) - goto init_acl_cleanup; - } - - err = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); - if (unlikely(err < 0)) - return err; - - if (err > 0) - err = hfsplus_set_posix_acl(inode, - ACL_TYPE_ACCESS, - acl); - } else - inode->i_mode &= ~current_umask(); - -init_acl_cleanup: - posix_acl_release(acl); - return err; -} - -int hfsplus_posix_acl_chmod(struct inode *inode) -{ - int err; - struct posix_acl *acl; - - hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino); - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - acl = hfsplus_get_posix_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - - err = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (unlikely(err)) + err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (err) return err; - err = hfsplus_set_posix_acl(inode, ACL_TYPE_ACCESS, acl); - posix_acl_release(acl); - return err; -} - -static int hfsplus_xattr_get_posix_acl(struct dentry *dentry, - const char *name, - void *buffer, - size_t size, - int type) -{ - int err = 0; - struct posix_acl *acl; - - hfs_dbg(ACL_MOD, - "[%s]: ino %lu, buffer %p, size %zu, type %#x\n", - __func__, dentry->d_inode->i_ino, buffer, size, type); - - if (strcmp(name, "") != 0) - return -EINVAL; - - acl = hfsplus_get_posix_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - - err = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return err; -} - -static int hfsplus_xattr_set_posix_acl(struct dentry *dentry, - const char *name, - const void *value, - size_t size, - int flags, - int type) -{ - int err = 0; - struct inode *inode = dentry->d_inode; - struct posix_acl *acl = NULL; - - hfs_dbg(ACL_MOD, - "[%s]: ino %lu, value %p, size %zu, flags %#x, type %#x\n", - __func__, inode->i_ino, value, size, flags, type); - - if (strcmp(name, "") != 0) - return -EINVAL; - - if (!inode_owner_or_capable(inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - err = posix_acl_valid(acl); - if (err) - goto end_xattr_set_acl; - } + if (default_acl) { + err = hfsplus_set_posix_acl(inode, default_acl, + ACL_TYPE_DEFAULT); + posix_acl_release(default_acl); } - err = hfsplus_set_posix_acl(inode, type, acl); - -end_xattr_set_acl: - posix_acl_release(acl); + if (acl) { + if (!err) + err = hfsplus_set_posix_acl(inode, acl, + ACL_TYPE_ACCESS); + posix_acl_release(acl); + } return err; } - -static size_t hfsplus_xattr_list_posix_acl(struct dentry *dentry, - char *list, - size_t list_size, - const char *name, - size_t name_len, - int type) -{ - /* - * This method is not used. - * It is used hfsplus_listxattr() instead of generic_listxattr(). - */ - return -EOPNOTSUPP; -} - -const struct xattr_handler hfsplus_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = hfsplus_xattr_list_posix_acl, - .get = hfsplus_xattr_get_posix_acl, - .set = hfsplus_xattr_set_posix_acl, -}; - -const struct xattr_handler hfsplus_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = hfsplus_xattr_list_posix_acl, - .get = hfsplus_xattr_get_posix_acl, - .set = hfsplus_xattr_set_posix_acl, -}; diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index e9a97a0..3f99964 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -63,7 +63,7 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1); bio = bio_alloc(GFP_NOIO, 1); - bio->bi_sector = sector; + bio->bi_iter.bi_sector = sector; bio->bi_bdev = sb->s_bdev; if (!(rw & WRITE) && data) diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index 3c6136f..0b4a5c9 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -7,6 +7,7 @@ */ #include "hfsplus_fs.h" +#include <linux/posix_acl_xattr.h> #include "xattr.h" #include "acl.h" @@ -15,8 +16,8 @@ const struct xattr_handler *hfsplus_xattr_handlers[] = { &hfsplus_xattr_user_handler, &hfsplus_xattr_trusted_handler, #ifdef CONFIG_HFSPLUS_FS_POSIX_ACL - &hfsplus_xattr_acl_access_handler, - &hfsplus_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif &hfsplus_xattr_security_handler, NULL @@ -51,82 +52,6 @@ static inline int is_known_namespace(const char *name) return true; } -static int can_set_system_xattr(struct inode *inode, const char *name, - const void *value, size_t size) -{ -#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL - struct posix_acl *acl; - int err; - - if (!inode_owner_or_capable(inode)) - return -EPERM; - - /* - * POSIX_ACL_XATTR_ACCESS is tied to i_mode - */ - if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - err = posix_acl_equiv_mode(acl, &inode->i_mode); - posix_acl_release(acl); - if (err < 0) - return err; - mark_inode_dirty(inode); - } - /* - * We're changing the ACL. Get rid of the cached one - */ - forget_cached_acl(inode, ACL_TYPE_ACCESS); - - return 0; - } else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - posix_acl_release(acl); - - /* - * We're changing the default ACL. Get rid of the cached one - */ - forget_cached_acl(inode, ACL_TYPE_DEFAULT); - - return 0; - } -#endif /* CONFIG_HFSPLUS_FS_POSIX_ACL */ - return -EOPNOTSUPP; -} - -static int can_set_xattr(struct inode *inode, const char *name, - const void *value, size_t value_len) -{ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return can_set_system_xattr(inode, name, value, value_len); - - if (!strncmp(name, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN)) { - /* - * This makes sure that we aren't trying to set an - * attribute in a different namespace by prefixing it - * with "osx." - */ - if (is_known_namespace(name + XATTR_MAC_OSX_PREFIX_LEN)) - return -EOPNOTSUPP; - - return 0; - } - - /* - * Don't allow setting an attribute in an unknown namespace. - */ - if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) && - strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && - strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) - return -EOPNOTSUPP; - - return 0; -} - static void hfsplus_init_header_node(struct inode *attr_file, u32 clump_size, char *buf, u16 node_size) @@ -349,10 +274,6 @@ int __hfsplus_setxattr(struct inode *inode, const char *name, HFSPLUS_IS_RSRC(inode)) return -EOPNOTSUPP; - err = can_set_xattr(inode, name, value, size); - if (err) - return err; - if (strncmp(name, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN) == 0) name += XATTR_MAC_OSX_PREFIX_LEN; @@ -840,10 +761,6 @@ int hfsplus_removexattr(struct dentry *dentry, const char *name) if (!HFSPLUS_SB(inode->i_sb)->attr_tree) return -EOPNOTSUPP; - err = can_set_xattr(inode, name, NULL, 0); - if (err) - return err; - if (strncmp(name, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN) == 0) name += XATTR_MAC_OSX_PREFIX_LEN; @@ -940,6 +857,9 @@ static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name, if (len > HFSPLUS_ATTR_MAX_STRLEN) return -EOPNOTSUPP; + if (is_known_namespace(name)) + return -EOPNOTSUPP; + strcpy(xattr_name, XATTR_MAC_OSX_PREFIX); strcpy(xattr_name + XATTR_MAC_OSX_PREFIX_LEN, name); diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h index 841b569..9e21449 100644 --- a/fs/hfsplus/xattr.h +++ b/fs/hfsplus/xattr.h @@ -14,8 +14,6 @@ extern const struct xattr_handler hfsplus_xattr_osx_handler; extern const struct xattr_handler hfsplus_xattr_user_handler; extern const struct xattr_handler hfsplus_xattr_trusted_handler; -extern const struct xattr_handler hfsplus_xattr_acl_access_handler; -extern const struct xattr_handler hfsplus_xattr_acl_default_handler; extern const struct xattr_handler hfsplus_xattr_security_handler; extern const struct xattr_handler *hfsplus_xattr_handlers[]; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index db23ce1..fe649d3 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -186,7 +186,7 @@ static struct inode *hostfs_iget(struct super_block *sb) return inode; } -int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf) +static int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf) { /* * do_statfs uses struct statfs64 internally, but the linux kernel @@ -268,7 +268,7 @@ static const struct super_operations hostfs_sbops = { .show_options = hostfs_show_options, }; -int hostfs_readdir(struct file *file, struct dir_context *ctx) +static int hostfs_readdir(struct file *file, struct dir_context *ctx) { void *dir; char *name; @@ -293,7 +293,7 @@ int hostfs_readdir(struct file *file, struct dir_context *ctx) return 0; } -int hostfs_file_open(struct inode *ino, struct file *file) +static int hostfs_file_open(struct inode *ino, struct file *file) { static DEFINE_MUTEX(open_mutex); char *name; @@ -359,7 +359,8 @@ static int hostfs_file_release(struct inode *inode, struct file *file) return 0; } -int hostfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) +static int hostfs_fsync(struct file *file, loff_t start, loff_t end, + int datasync) { struct inode *inode = file->f_mapping->host; int ret; @@ -394,7 +395,7 @@ static const struct file_operations hostfs_dir_fops = { .read = generic_read_dir, }; -int hostfs_writepage(struct page *page, struct writeback_control *wbc) +static int hostfs_writepage(struct page *page, struct writeback_control *wbc) { struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; @@ -430,7 +431,7 @@ int hostfs_writepage(struct page *page, struct writeback_control *wbc) return err; } -int hostfs_readpage(struct file *file, struct page *page) +static int hostfs_readpage(struct file *file, struct page *page) { char *buffer; long long start; @@ -455,9 +456,9 @@ int hostfs_readpage(struct file *file, struct page *page) return err; } -int hostfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) +static int hostfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) { pgoff_t index = pos >> PAGE_CACHE_SHIFT; @@ -467,9 +468,9 @@ int hostfs_write_begin(struct file *file, struct address_space *mapping, return 0; } -int hostfs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) +static int hostfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { struct inode *inode = mapping->host; void *buffer; @@ -549,8 +550,8 @@ static int read_name(struct inode *ino, char *name) return 0; } -int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) { struct inode *inode; char *name; @@ -591,8 +592,8 @@ int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, return error; } -struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry, - unsigned int flags) +static struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry, + unsigned int flags) { struct inode *inode; char *name; @@ -628,7 +629,8 @@ struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry, return ERR_PTR(err); } -int hostfs_link(struct dentry *to, struct inode *ino, struct dentry *from) +static int hostfs_link(struct dentry *to, struct inode *ino, + struct dentry *from) { char *from_name, *to_name; int err; @@ -646,7 +648,7 @@ int hostfs_link(struct dentry *to, struct inode *ino, struct dentry *from) return err; } -int hostfs_unlink(struct inode *ino, struct dentry *dentry) +static int hostfs_unlink(struct inode *ino, struct dentry *dentry) { char *file; int err; @@ -662,7 +664,8 @@ int hostfs_unlink(struct inode *ino, struct dentry *dentry) return err; } -int hostfs_symlink(struct inode *ino, struct dentry *dentry, const char *to) +static int hostfs_symlink(struct inode *ino, struct dentry *dentry, + const char *to) { char *file; int err; @@ -674,7 +677,7 @@ int hostfs_symlink(struct inode *ino, struct dentry *dentry, const char *to) return err; } -int hostfs_mkdir(struct inode *ino, struct dentry *dentry, umode_t mode) +static int hostfs_mkdir(struct inode *ino, struct dentry *dentry, umode_t mode) { char *file; int err; @@ -686,7 +689,7 @@ int hostfs_mkdir(struct inode *ino, struct dentry *dentry, umode_t mode) return err; } -int hostfs_rmdir(struct inode *ino, struct dentry *dentry) +static int hostfs_rmdir(struct inode *ino, struct dentry *dentry) { char *file; int err; @@ -738,8 +741,8 @@ static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, return err; } -int hostfs_rename(struct inode *from_ino, struct dentry *from, - struct inode *to_ino, struct dentry *to) +static int hostfs_rename(struct inode *from_ino, struct dentry *from, + struct inode *to_ino, struct dentry *to) { char *from_name, *to_name; int err; @@ -756,7 +759,7 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from, return err; } -int hostfs_permission(struct inode *ino, int desired) +static int hostfs_permission(struct inode *ino, int desired) { char *name; int r = 0, w = 0, x = 0, err; @@ -782,7 +785,7 @@ int hostfs_permission(struct inode *ino, int desired) return err; } -int hostfs_setattr(struct dentry *dentry, struct iattr *attr) +static int hostfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; struct hostfs_iattr attrs; diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 2d04f9a..06fe11e 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -573,7 +573,7 @@ int log_wait_commit(journal_t *journal, tid_t tid) #ifdef CONFIG_JBD_DEBUG spin_lock(&journal->j_state_lock); if (!tid_geq(journal->j_commit_request, tid)) { - printk(KERN_EMERG + printk(KERN_ERR "%s: error: j_commit_request=%d, tid=%d\n", __func__, journal->j_commit_request, tid); } @@ -604,10 +604,8 @@ int log_wait_commit(journal_t *journal, tid_t tid) out_unlock: spin_unlock(&journal->j_state_lock); - if (unlikely(is_journal_aborted(journal))) { - printk(KERN_EMERG "journal commit I/O error\n"); + if (unlikely(is_journal_aborted(journal))) err = -EIO; - } return err; } @@ -2136,7 +2134,7 @@ static void __exit journal_exit(void) #ifdef CONFIG_JBD_DEBUG int n = atomic_read(&nr_journal_heads); if (n) - printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); + printk(KERN_ERR "JBD: leaked %d journal_heads!\n", n); #endif jbd_remove_debugfs_entry(); journal_destroy_caches(); diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index aa603e0..1695ba8 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -675,7 +675,7 @@ repeat: jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS); if (!frozen_buffer) { - printk(KERN_EMERG + printk(KERN_ERR "%s: OOM for frozen_buffer\n", __func__); JBUFFER_TRACE(jh, "oom!"); @@ -898,7 +898,7 @@ repeat: if (!jh->b_committed_data) { committed_data = jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS); if (!committed_data) { - printk(KERN_EMERG "%s: No memory for committed data\n", + printk(KERN_ERR "%s: No memory for committed data\n", __func__); err = -ENOMEM; goto out; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 5203264..5fa344a 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -702,7 +702,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) read_lock(&journal->j_state_lock); #ifdef CONFIG_JBD2_DEBUG if (!tid_geq(journal->j_commit_request, tid)) { - printk(KERN_EMERG + printk(KERN_ERR "%s: error: j_commit_request=%d, tid=%d\n", __func__, journal->j_commit_request, tid); } @@ -718,10 +718,8 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) } read_unlock(&journal->j_state_lock); - if (unlikely(is_journal_aborted(journal))) { - printk(KERN_EMERG "journal commit I/O error\n"); + if (unlikely(is_journal_aborted(journal))) err = -EIO; - } return err; } @@ -1527,13 +1525,13 @@ static int journal_get_superblock(journal_t *journal) if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM) && JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) { /* Can't have checksum v1 and v2 on at the same time! */ - printk(KERN_ERR "JBD: Can't enable checksumming v1 and v2 " + printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2 " "at the same time!\n"); goto out; } if (!jbd2_verify_csum_type(journal, sb)) { - printk(KERN_ERR "JBD: Unknown checksum type\n"); + printk(KERN_ERR "JBD2: Unknown checksum type\n"); goto out; } @@ -1541,7 +1539,7 @@ static int journal_get_superblock(journal_t *journal) if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) { journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(journal->j_chksum_driver)) { - printk(KERN_ERR "JBD: Cannot load crc32c driver.\n"); + printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); err = PTR_ERR(journal->j_chksum_driver); journal->j_chksum_driver = NULL; goto out; @@ -1550,7 +1548,7 @@ static int journal_get_superblock(journal_t *journal) /* Check superblock checksum */ if (!jbd2_superblock_csum_verify(journal, sb)) { - printk(KERN_ERR "JBD: journal checksum error\n"); + printk(KERN_ERR "JBD2: journal checksum error\n"); goto out; } @@ -1836,7 +1834,7 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(journal->j_chksum_driver)) { - printk(KERN_ERR "JBD: Cannot load crc32c " + printk(KERN_ERR "JBD2: Cannot load crc32c " "driver.\n"); journal->j_chksum_driver = NULL; return 0; @@ -2645,7 +2643,7 @@ static void __exit journal_exit(void) #ifdef CONFIG_JBD2_DEBUG int n = atomic_read(&nr_journal_heads); if (n) - printk(KERN_EMERG "JBD2: leaked %d journal_heads!\n", n); + printk(KERN_ERR "JBD2: leaked %d journal_heads!\n", n); #endif jbd2_remove_jbd_stats_proc_entry(); jbd2_journal_destroy_caches(); diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 3929c50..3b6bb19 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -594,7 +594,7 @@ static int do_one_pass(journal_t *journal, be32_to_cpu(tmp->h_sequence))) { brelse(obh); success = -EIO; - printk(KERN_ERR "JBD: Invalid " + printk(KERN_ERR "JBD2: Invalid " "checksum recovering " "block %llu in log\n", blocknr); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 7aa9a32..8360674 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -932,7 +932,7 @@ repeat: jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS); if (!frozen_buffer) { - printk(KERN_EMERG + printk(KERN_ERR "%s: OOM for frozen_buffer\n", __func__); JBUFFER_TRACE(jh, "oom!"); @@ -1166,7 +1166,7 @@ repeat: if (!jh->b_committed_data) { committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS); if (!committed_data) { - printk(KERN_EMERG "%s: No memory for committed data\n", + printk(KERN_ERR "%s: No memory for committed data\n", __func__); err = -ENOMEM; goto out; @@ -1290,7 +1290,10 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) * once a transaction -bzzz */ jh->b_modified = 1; - J_ASSERT_JH(jh, handle->h_buffer_credits > 0); + if (handle->h_buffer_credits <= 0) { + ret = -ENOSPC; + goto out_unlock_bh; + } handle->h_buffer_credits--; } @@ -1305,7 +1308,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) JBUFFER_TRACE(jh, "fastpath"); if (unlikely(jh->b_transaction != journal->j_running_transaction)) { - printk(KERN_EMERG "JBD: %s: " + printk(KERN_ERR "JBD2: %s: " "jh->b_transaction (%llu, %p, %u) != " "journal->j_running_transaction (%p, %u)", journal->j_devname, @@ -1332,7 +1335,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) JBUFFER_TRACE(jh, "already on other transaction"); if (unlikely(jh->b_transaction != journal->j_committing_transaction)) { - printk(KERN_EMERG "JBD: %s: " + printk(KERN_ERR "JBD2: %s: " "jh->b_transaction (%llu, %p, %u) != " "journal->j_committing_transaction (%p, %u)", journal->j_devname, @@ -1345,7 +1348,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) ret = -EINVAL; } if (unlikely(jh->b_next_transaction != transaction)) { - printk(KERN_EMERG "JBD: %s: " + printk(KERN_ERR "JBD2: %s: " "jh->b_next_transaction (%llu, %p, %u) != " "transaction (%p, %u)", journal->j_devname, @@ -1373,7 +1376,6 @@ out_unlock_bh: jbd2_journal_put_journal_head(jh); out: JBUFFER_TRACE(jh, "exit"); - WARN_ON(ret); /* All errors are bugs, so dump the stack */ return ret; } diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 223283c..009ec0b 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -178,10 +178,6 @@ struct posix_acl *jffs2_get_acl(struct inode *inode, int type) char *value = NULL; int rc, xprefix; - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - switch (type) { case ACL_TYPE_ACCESS: xprefix = JFFS2_XPREFIX_ACL_ACCESS; @@ -232,13 +228,10 @@ static int __jffs2_set_acl(struct inode *inode, int xprefix, struct posix_acl *a return rc; } -static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) +int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) { int rc, xprefix; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - switch (type) { case ACL_TYPE_ACCESS: xprefix = JFFS2_XPREFIX_ACL_ACCESS; @@ -277,30 +270,21 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, umode_t *i_mode) { - struct posix_acl *acl; + struct posix_acl *default_acl, *acl; int rc; cache_no_acl(inode); - if (S_ISLNK(*i_mode)) - return 0; /* Symlink always has no-ACL */ - - acl = jffs2_get_acl(dir_i, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - - if (!acl) { - *i_mode &= ~current_umask(); - } else { - if (S_ISDIR(*i_mode)) - set_cached_acl(inode, ACL_TYPE_DEFAULT, acl); - - rc = posix_acl_create(&acl, GFP_KERNEL, i_mode); - if (rc < 0) - return rc; - if (rc > 0) - set_cached_acl(inode, ACL_TYPE_ACCESS, acl); + rc = posix_acl_create(dir_i, i_mode, &default_acl, &acl); + if (rc) + return rc; + if (default_acl) { + set_cached_acl(inode, ACL_TYPE_DEFAULT, default_acl); + posix_acl_release(default_acl); + } + if (acl) { + set_cached_acl(inode, ACL_TYPE_ACCESS, acl); posix_acl_release(acl); } return 0; @@ -324,106 +308,3 @@ int jffs2_init_acl_post(struct inode *inode) return 0; } - -int jffs2_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; - int rc; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - rc = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (rc) - return rc; - rc = jffs2_set_acl(inode, ACL_TYPE_ACCESS, acl); - posix_acl_release(acl); - return rc; -} - -static size_t jffs2_acl_access_listxattr(struct dentry *dentry, char *list, - size_t list_size, const char *name, size_t name_len, int type) -{ - const int retlen = sizeof(POSIX_ACL_XATTR_ACCESS); - - if (list && retlen <= list_size) - strcpy(list, POSIX_ACL_XATTR_ACCESS); - return retlen; -} - -static size_t jffs2_acl_default_listxattr(struct dentry *dentry, char *list, - size_t list_size, const char *name, size_t name_len, int type) -{ - const int retlen = sizeof(POSIX_ACL_XATTR_DEFAULT); - - if (list && retlen <= list_size) - strcpy(list, POSIX_ACL_XATTR_DEFAULT); - return retlen; -} - -static int jffs2_acl_getxattr(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) -{ - struct posix_acl *acl; - int rc; - - if (name[0] != '\0') - return -EINVAL; - - acl = jffs2_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (!acl) - return -ENODATA; - rc = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return rc; -} - -static int jffs2_acl_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - struct posix_acl *acl; - int rc; - - if (name[0] != '\0') - return -EINVAL; - if (!inode_owner_or_capable(dentry->d_inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - rc = posix_acl_valid(acl); - if (rc) - goto out; - } - } else { - acl = NULL; - } - rc = jffs2_set_acl(dentry->d_inode, type, acl); - out: - posix_acl_release(acl); - return rc; -} - -const struct xattr_handler jffs2_acl_access_xattr_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_DEFAULT, - .list = jffs2_acl_access_listxattr, - .get = jffs2_acl_getxattr, - .set = jffs2_acl_setxattr, -}; - -const struct xattr_handler jffs2_acl_default_xattr_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = jffs2_acl_default_listxattr, - .get = jffs2_acl_getxattr, - .set = jffs2_acl_setxattr, -}; diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index 9b47724..2e2b574 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -27,17 +27,14 @@ struct jffs2_acl_header { #ifdef CONFIG_JFFS2_FS_POSIX_ACL struct posix_acl *jffs2_get_acl(struct inode *inode, int type); -extern int jffs2_acl_chmod(struct inode *); +int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *); extern int jffs2_init_acl_post(struct inode *); -extern const struct xattr_handler jffs2_acl_access_xattr_handler; -extern const struct xattr_handler jffs2_acl_default_xattr_handler; - #else #define jffs2_get_acl (NULL) -#define jffs2_acl_chmod(inode) (0) +#define jffs2_set_acl (NULL) #define jffs2_init_acl_pre(dir_i,inode,mode) (0) #define jffs2_init_acl_post(inode) (0) diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index e3aac22..9385560 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -59,6 +59,7 @@ const struct inode_operations jffs2_dir_inode_operations = .mknod = jffs2_mknod, .rename = jffs2_rename, .get_acl = jffs2_get_acl, + .set_acl = jffs2_set_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 1506673..256cd19 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -66,6 +66,7 @@ const struct file_operations jffs2_file_operations = const struct inode_operations jffs2_file_inode_operations = { .get_acl = jffs2_get_acl, + .set_acl = jffs2_set_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 09b3ed4..a69e426 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -190,15 +190,16 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) int jffs2_setattr(struct dentry *dentry, struct iattr *iattr) { + struct inode *inode = dentry->d_inode; int rc; - rc = inode_change_ok(dentry->d_inode, iattr); + rc = inode_change_ok(inode, iattr); if (rc) return rc; - rc = jffs2_do_setattr(dentry->d_inode, iattr); + rc = jffs2_do_setattr(inode, iattr); if (!rc && (iattr->ia_valid & ATTR_MODE)) - rc = jffs2_acl_chmod(dentry->d_inode); + rc = posix_acl_chmod(inode, inode->i_mode); return rc; } diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c index 4f47aa2..b8fd651 100644 --- a/fs/jffs2/malloc.c +++ b/fs/jffs2/malloc.c @@ -288,6 +288,8 @@ struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void) struct jffs2_xattr_datum *xd; xd = kmem_cache_zalloc(xattr_datum_cache, GFP_KERNEL); dbg_memalloc("%p\n", xd); + if (!xd) + return NULL; xd->class = RAWNODE_CLASS_XATTR_DATUM; xd->node = (void *)xd; @@ -306,6 +308,8 @@ struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void) struct jffs2_xattr_ref *ref; ref = kmem_cache_zalloc(xattr_ref_cache, GFP_KERNEL); dbg_memalloc("%p\n", ref); + if (!ref) + return NULL; ref->class = RAWNODE_CLASS_XATTR_REF; ref->node = (void *)ref; diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c index 975a1f5..9a5449b 100644 --- a/fs/jffs2/nodelist.c +++ b/fs/jffs2/nodelist.c @@ -564,25 +564,10 @@ struct jffs2_node_frag *jffs2_lookup_node_frag(struct rb_root *fragtree, uint32_ they're killed. */ void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c) { - struct jffs2_node_frag *frag; - struct jffs2_node_frag *parent; - - if (!root->rb_node) - return; + struct jffs2_node_frag *frag, *next; dbg_fragtree("killing\n"); - - frag = (rb_entry(root->rb_node, struct jffs2_node_frag, rb)); - while(frag) { - if (frag->rb.rb_left) { - frag = frag_left(frag); - continue; - } - if (frag->rb.rb_right) { - frag = frag_right(frag); - continue; - } - + rbtree_postorder_for_each_entry_safe(frag, next, root, rb) { if (frag->node && !(--frag->node->frags)) { /* Not a hole, and it's the final remaining frag of this node. Free the node */ @@ -591,17 +576,8 @@ void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c) jffs2_free_full_dnode(frag->node); } - parent = frag_parent(frag); - if (parent) { - if (frag_left(parent) == frag) - parent->rb.rb_left = NULL; - else - parent->rb.rb_right = NULL; - } jffs2_free_node_frag(frag); - frag = parent; - cond_resched(); } } diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index ae81b01..386303d 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -543,33 +543,13 @@ static int jffs2_build_inode_fragtree(struct jffs2_sb_info *c, static void jffs2_free_tmp_dnode_info_list(struct rb_root *list) { - struct rb_node *this; - struct jffs2_tmp_dnode_info *tn; - - this = list->rb_node; + struct jffs2_tmp_dnode_info *tn, *next; - /* Now at bottom of tree */ - while (this) { - if (this->rb_left) - this = this->rb_left; - else if (this->rb_right) - this = this->rb_right; - else { - tn = rb_entry(this, struct jffs2_tmp_dnode_info, rb); + rbtree_postorder_for_each_entry_safe(tn, next, list, rb) { jffs2_free_full_dnode(tn->fn); jffs2_free_tmp_dnode_info(tn); - - this = rb_parent(this); - if (!this) - break; - - if (this->rb_left == &tn->rb) - this->rb_left = NULL; - else if (this->rb_right == &tn->rb) - this->rb_right = NULL; - else BUG(); - } } + *list = RB_ROOT; } diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c index 6e56333..c7c77b0 100644 --- a/fs/jffs2/symlink.c +++ b/fs/jffs2/symlink.c @@ -22,7 +22,6 @@ const struct inode_operations jffs2_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = jffs2_follow_link, - .get_acl = jffs2_get_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index 3034e97..ad0f2e2 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -22,6 +22,7 @@ #include <linux/crc32.h> #include <linux/jffs2.h> #include <linux/xattr.h> +#include <linux/posix_acl_xattr.h> #include <linux/mtd/mtd.h> #include "nodelist.h" /* -------- xdatum related functions ---------------- @@ -921,8 +922,8 @@ const struct xattr_handler *jffs2_xattr_handlers[] = { &jffs2_security_xattr_handler, #endif #ifdef CONFIG_JFFS2_FS_POSIX_ACL - &jffs2_acl_access_xattr_handler, - &jffs2_acl_default_xattr_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif &jffs2_trusted_xattr_handler, NULL @@ -942,10 +943,10 @@ static const struct xattr_handler *xprefix_to_handler(int xprefix) { #endif #ifdef CONFIG_JFFS2_FS_POSIX_ACL case JFFS2_XPREFIX_ACL_ACCESS: - ret = &jffs2_acl_access_xattr_handler; + ret = &posix_acl_access_xattr_handler; break; case JFFS2_XPREFIX_ACL_DEFAULT: - ret = &jffs2_acl_default_xattr_handler; + ret = &posix_acl_default_xattr_handler; break; #endif case JFFS2_XPREFIX_TRUSTED: diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index d254d6d..e973b85 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -72,7 +72,7 @@ struct posix_acl *jfs_get_acl(struct inode *inode, int type) return acl; } -static int jfs_set_acl(tid_t tid, struct inode *inode, int type, +static int __jfs_set_acl(tid_t tid, struct inode *inode, int type, struct posix_acl *acl) { char *ea_name; @@ -80,21 +80,22 @@ static int jfs_set_acl(tid_t tid, struct inode *inode, int type, int size = 0; char *value = NULL; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - switch(type) { - case ACL_TYPE_ACCESS: - ea_name = POSIX_ACL_XATTR_ACCESS; - break; - case ACL_TYPE_DEFAULT: - ea_name = POSIX_ACL_XATTR_DEFAULT; - if (!S_ISDIR(inode->i_mode)) - return acl ? -EACCES : 0; - break; - default: - return -EINVAL; + switch (type) { + case ACL_TYPE_ACCESS: + ea_name = POSIX_ACL_XATTR_ACCESS; + rc = posix_acl_equiv_mode(acl, &inode->i_mode); + if (rc < 0) + return rc; + if (rc == 0) + acl = NULL; + break; + case ACL_TYPE_DEFAULT: + ea_name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + return -EINVAL; } + if (acl) { size = posix_acl_xattr_size(acl->a_count); value = kmalloc(size, GFP_KERNEL); @@ -114,65 +115,43 @@ out: return rc; } +int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int rc; + tid_t tid; + + tid = txBegin(inode->i_sb, 0); + mutex_lock(&JFS_IP(inode)->commit_mutex); + rc = __jfs_set_acl(tid, inode, type, acl); + if (!rc) + rc = txCommit(tid, 1, &inode, 0); + txEnd(tid); + mutex_unlock(&JFS_IP(inode)->commit_mutex); + return rc; +} + int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) { - struct posix_acl *acl = NULL; + struct posix_acl *default_acl, *acl; int rc = 0; - if (S_ISLNK(inode->i_mode)) - return 0; + rc = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (rc) + return rc; - acl = jfs_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); + if (default_acl) { + rc = __jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, default_acl); + posix_acl_release(default_acl); + } if (acl) { - if (S_ISDIR(inode->i_mode)) { - rc = jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, acl); - if (rc) - goto cleanup; - } - rc = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); - if (rc < 0) - goto cleanup; /* posix_acl_release(NULL) is no-op */ - if (rc > 0) - rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl); -cleanup: + if (!rc) + rc = __jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl); posix_acl_release(acl); - } else - inode->i_mode &= ~current_umask(); + } JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) | inode->i_mode; return rc; } - -int jfs_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; - int rc; - tid_t tid; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - - rc = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (rc) - return rc; - - tid = txBegin(inode->i_sb, 0); - mutex_lock(&JFS_IP(inode)->commit_mutex); - rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl); - if (!rc) - rc = txCommit(tid, 1, &inode, 0); - txEnd(tid); - mutex_unlock(&JFS_IP(inode)->commit_mutex); - - posix_acl_release(acl); - return rc; -} diff --git a/fs/jfs/file.c b/fs/jfs/file.c index dd7442c..794da94 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -19,6 +19,7 @@ #include <linux/mm.h> #include <linux/fs.h> +#include <linux/posix_acl.h> #include <linux/quotaops.h> #include "jfs_incore.h" #include "jfs_inode.h" @@ -131,7 +132,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr) mark_inode_dirty(inode); if (iattr->ia_valid & ATTR_MODE) - rc = jfs_acl_chmod(inode); + rc = posix_acl_chmod(inode, inode->i_mode); return rc; } @@ -143,6 +144,7 @@ const struct inode_operations jfs_file_inode_operations = { .setattr = jfs_setattr, #ifdef CONFIG_JFS_POSIX_ACL .get_acl = jfs_get_acl, + .set_acl = jfs_set_acl, #endif }; diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index ad84fe5..489f993 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -21,8 +21,8 @@ #ifdef CONFIG_JFS_POSIX_ACL struct posix_acl *jfs_get_acl(struct inode *inode, int type); +int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); int jfs_init_acl(tid_t, struct inode *, struct inode *); -int jfs_acl_chmod(struct inode *inode); #else @@ -32,10 +32,5 @@ static inline int jfs_init_acl(tid_t tid, struct inode *inode, return 0; } -static inline int jfs_acl_chmod(struct inode *inode) -{ - return 0; -} - #endif #endif /* _H_JFS_ACL */ diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 360d27c..8d811e0 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1998,20 +1998,20 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bio = bio_alloc(GFP_NOFS, 1); - bio->bi_sector = bp->l_blkno << (log->l2bsize - 9); + bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); bio->bi_bdev = log->bdev; bio->bi_io_vec[0].bv_page = bp->l_page; bio->bi_io_vec[0].bv_len = LOGPSIZE; bio->bi_io_vec[0].bv_offset = bp->l_offset; bio->bi_vcnt = 1; - bio->bi_size = LOGPSIZE; + bio->bi_iter.bi_size = LOGPSIZE; bio->bi_end_io = lbmIODone; bio->bi_private = bp; /*check if journaling to disk has been disabled*/ if (log->no_integrity) { - bio->bi_size = 0; + bio->bi_iter.bi_size = 0; lbmIODone(bio, 0); } else { submit_bio(READ_SYNC, bio); @@ -2144,21 +2144,21 @@ static void lbmStartIO(struct lbuf * bp) jfs_info("lbmStartIO\n"); bio = bio_alloc(GFP_NOFS, 1); - bio->bi_sector = bp->l_blkno << (log->l2bsize - 9); + bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); bio->bi_bdev = log->bdev; bio->bi_io_vec[0].bv_page = bp->l_page; bio->bi_io_vec[0].bv_len = LOGPSIZE; bio->bi_io_vec[0].bv_offset = bp->l_offset; bio->bi_vcnt = 1; - bio->bi_size = LOGPSIZE; + bio->bi_iter.bi_size = LOGPSIZE; bio->bi_end_io = lbmIODone; bio->bi_private = bp; /* check if journaling to disk has been disabled */ if (log->no_integrity) { - bio->bi_size = 0; + bio->bi_iter.bi_size = 0; lbmIODone(bio, 0); } else { submit_bio(WRITE_SYNC, bio); diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index d165cde..49ba7ff 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -416,7 +416,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc) * count from hitting zero before we're through */ inc_io(page); - if (!bio->bi_size) + if (!bio->bi_iter.bi_size) goto dump_bio; submit_bio(WRITE, bio); nr_underway++; @@ -438,7 +438,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc) bio = bio_alloc(GFP_NOFS, 1); bio->bi_bdev = inode->i_sb->s_bdev; - bio->bi_sector = pblock << (inode->i_blkbits - 9); + bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 9); bio->bi_end_io = metapage_write_end_io; bio->bi_private = page; @@ -452,7 +452,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc) if (bio) { if (bio_add_page(bio, page, bio_bytes, bio_offset) < bio_bytes) goto add_failed; - if (!bio->bi_size) + if (!bio->bi_iter.bi_size) goto dump_bio; submit_bio(WRITE, bio); @@ -517,7 +517,8 @@ static int metapage_readpage(struct file *fp, struct page *page) bio = bio_alloc(GFP_NOFS, 1); bio->bi_bdev = inode->i_sb->s_bdev; - bio->bi_sector = pblock << (inode->i_blkbits - 9); + bio->bi_iter.bi_sector = + pblock << (inode->i_blkbits - 9); bio->bi_end_io = metapage_read_end_io; bio->bi_private = page; len = xlen << inode->i_blkbits; diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h index e9e100f..e8d717d 100644 --- a/fs/jfs/jfs_xattr.h +++ b/fs/jfs/jfs_xattr.h @@ -61,6 +61,8 @@ extern ssize_t jfs_getxattr(struct dentry *, const char *, void *, size_t); extern ssize_t jfs_listxattr(struct dentry *, char *, size_t); extern int jfs_removexattr(struct dentry *, const char *); +extern const struct xattr_handler *jfs_xattr_handlers[]; + #ifdef CONFIG_JFS_SECURITY extern int jfs_init_security(tid_t, struct inode *, struct inode *, const struct qstr *); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index aa8a337..d59c7de 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1524,6 +1524,7 @@ const struct inode_operations jfs_dir_inode_operations = { .setattr = jfs_setattr, #ifdef CONFIG_JFS_POSIX_ACL .get_acl = jfs_get_acl, + .set_acl = jfs_set_acl, #endif }; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 6669aa2..e2b7483 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -44,6 +44,7 @@ #include "jfs_imap.h" #include "jfs_acl.h" #include "jfs_debug.h" +#include "jfs_xattr.h" MODULE_DESCRIPTION("The Journaled Filesystem (JFS)"); MODULE_AUTHOR("Steve Best/Dave Kleikamp/Barry Arndt, IBM"); @@ -522,6 +523,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) */ sb->s_op = &jfs_super_operations; sb->s_export_op = &jfs_export_operations; + sb->s_xattr = jfs_xattr_handlers; #ifdef CONFIG_QUOTA sb->dq_op = &dquot_operations; sb->s_qcop = &dquot_quotactl_ops; diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index d3472f4..3bd5ee4 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -666,81 +666,12 @@ static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf, } /* - * can_set_system_xattr - * - * This code is specific to the system.* namespace. It contains policy - * which doesn't belong in the main xattr codepath. - */ -static int can_set_system_xattr(struct inode *inode, const char *name, - const void *value, size_t value_len) -{ -#ifdef CONFIG_JFS_POSIX_ACL - struct posix_acl *acl; - int rc; - - if (!inode_owner_or_capable(inode)) - return -EPERM; - - /* - * POSIX_ACL_XATTR_ACCESS is tied to i_mode - */ - if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) { - acl = posix_acl_from_xattr(&init_user_ns, value, value_len); - if (IS_ERR(acl)) { - rc = PTR_ERR(acl); - printk(KERN_ERR "posix_acl_from_xattr returned %d\n", - rc); - return rc; - } - if (acl) { - rc = posix_acl_equiv_mode(acl, &inode->i_mode); - posix_acl_release(acl); - if (rc < 0) { - printk(KERN_ERR - "posix_acl_equiv_mode returned %d\n", - rc); - return rc; - } - mark_inode_dirty(inode); - } - /* - * We're changing the ACL. Get rid of the cached one - */ - forget_cached_acl(inode, ACL_TYPE_ACCESS); - - return 0; - } else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) { - acl = posix_acl_from_xattr(&init_user_ns, value, value_len); - if (IS_ERR(acl)) { - rc = PTR_ERR(acl); - printk(KERN_ERR "posix_acl_from_xattr returned %d\n", - rc); - return rc; - } - posix_acl_release(acl); - - /* - * We're changing the default ACL. Get rid of the cached one - */ - forget_cached_acl(inode, ACL_TYPE_DEFAULT); - - return 0; - } -#endif /* CONFIG_JFS_POSIX_ACL */ - return -EOPNOTSUPP; -} - -/* * Most of the permission checking is done by xattr_permission in the vfs. - * The local file system is responsible for handling the system.* namespace. * We also need to verify that this is a namespace that we recognize. */ static int can_set_xattr(struct inode *inode, const char *name, const void *value, size_t value_len) { - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return can_set_system_xattr(inode, name, value, value_len); - if (!strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)) { /* * This makes sure that we aren't trying to set an @@ -748,7 +679,7 @@ static int can_set_xattr(struct inode *inode, const char *name, * with "os2." */ if (is_known_namespace(name + XATTR_OS2_PREFIX_LEN)) - return -EOPNOTSUPP; + return -EOPNOTSUPP; return 0; } @@ -860,6 +791,19 @@ int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name, /* Completely new ea list */ xattr_size = sizeof (struct jfs_ea_list); + /* + * The size of EA value is limitted by on-disk format up to + * __le16, there would be an overflow if the size is equal + * to XATTR_SIZE_MAX (65536). In order to avoid this issue, + * we can pre-checkup the value size against USHRT_MAX, and + * return -E2BIG in this case, which is consistent with the + * VFS setxattr interface. + */ + if (value_len >= USHRT_MAX) { + rc = -E2BIG; + goto release; + } + ea = (struct jfs_ea *) ((char *) ealist + xattr_size); ea->flag = 0; ea->namelen = namelen; @@ -874,7 +818,7 @@ int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name, /* DEBUG - If we did this right, these number match */ if (xattr_size != new_size) { printk(KERN_ERR - "jfs_xsetattr: xattr_size = %d, new_size = %d\n", + "__jfs_setxattr: xattr_size = %d, new_size = %d\n", xattr_size, new_size); rc = -EINVAL; @@ -913,6 +857,14 @@ int jfs_setxattr(struct dentry *dentry, const char *name, const void *value, if ((rc = can_set_xattr(inode, name, value, value_len))) return rc; + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_setxattr(dentry, name, value, value_len, flags); + if (value == NULL) { /* empty EA, do not remove */ value = ""; value_len = 0; @@ -986,6 +938,14 @@ ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data, { int err; + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_getxattr(dentry, name, data, buf_size); + if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { /* * skip past "os2." prefix @@ -1077,6 +1037,14 @@ int jfs_removexattr(struct dentry *dentry, const char *name) if ((rc = can_set_xattr(inode, name, NULL, 0))) return rc; + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_removexattr(dentry, name); + tid = txBegin(inode->i_sb, 0); mutex_lock(&ji->commit_mutex); rc = __jfs_setxattr(tid, dentry->d_inode, name, NULL, 0, XATTR_REPLACE); @@ -1088,6 +1056,19 @@ int jfs_removexattr(struct dentry *dentry, const char *name) return rc; } +/* + * List of handlers for synthetic system.* attributes. All real ondisk + * attributes are handled directly. + */ +const struct xattr_handler *jfs_xattr_handlers[] = { +#ifdef JFS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + NULL, +}; + + #ifdef CONFIG_JFS_SECURITY static int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, void *fs_info) diff --git a/fs/kernfs/Makefile b/fs/kernfs/Makefile new file mode 100644 index 0000000..674337c --- /dev/null +++ b/fs/kernfs/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the kernfs pseudo filesystem +# + +obj-y := mount.o inode.o dir.o file.o symlink.o diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c new file mode 100644 index 0000000..5104cf5 --- /dev/null +++ b/fs/kernfs/dir.c @@ -0,0 +1,1073 @@ +/* + * fs/kernfs/dir.c - kernfs directory implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> + * + * This file is released under the GPLv2. + */ + +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/idr.h> +#include <linux/slab.h> +#include <linux/security.h> +#include <linux/hash.h> + +#include "kernfs-internal.h" + +DEFINE_MUTEX(kernfs_mutex); + +#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) + +/** + * kernfs_name_hash + * @name: Null terminated string to hash + * @ns: Namespace tag to hash + * + * Returns 31 bit hash of ns + name (so it fits in an off_t ) + */ +static unsigned int kernfs_name_hash(const char *name, const void *ns) +{ + unsigned long hash = init_name_hash(); + unsigned int len = strlen(name); + while (len--) + hash = partial_name_hash(*name++, hash); + hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31)); + hash &= 0x7fffffffU; + /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */ + if (hash < 1) + hash += 2; + if (hash >= INT_MAX) + hash = INT_MAX - 1; + return hash; +} + +static int kernfs_name_compare(unsigned int hash, const char *name, + const void *ns, const struct kernfs_node *kn) +{ + if (hash != kn->hash) + return hash - kn->hash; + if (ns != kn->ns) + return ns - kn->ns; + return strcmp(name, kn->name); +} + +static int kernfs_sd_compare(const struct kernfs_node *left, + const struct kernfs_node *right) +{ + return kernfs_name_compare(left->hash, left->name, left->ns, right); +} + +/** + * kernfs_link_sibling - link kernfs_node into sibling rbtree + * @kn: kernfs_node of interest + * + * Link @kn into its sibling rbtree which starts from + * @kn->parent->dir.children. + * + * Locking: + * mutex_lock(kernfs_mutex) + * + * RETURNS: + * 0 on susccess -EEXIST on failure. + */ +static int kernfs_link_sibling(struct kernfs_node *kn) +{ + struct rb_node **node = &kn->parent->dir.children.rb_node; + struct rb_node *parent = NULL; + + if (kernfs_type(kn) == KERNFS_DIR) + kn->parent->dir.subdirs++; + + while (*node) { + struct kernfs_node *pos; + int result; + + pos = rb_to_kn(*node); + parent = *node; + result = kernfs_sd_compare(kn, pos); + if (result < 0) + node = &pos->rb.rb_left; + else if (result > 0) + node = &pos->rb.rb_right; + else + return -EEXIST; + } + /* add new node and rebalance the tree */ + rb_link_node(&kn->rb, parent, node); + rb_insert_color(&kn->rb, &kn->parent->dir.children); + return 0; +} + +/** + * kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree + * @kn: kernfs_node of interest + * + * Unlink @kn from its sibling rbtree which starts from + * kn->parent->dir.children. + * + * Locking: + * mutex_lock(kernfs_mutex) + */ +static void kernfs_unlink_sibling(struct kernfs_node *kn) +{ + if (kernfs_type(kn) == KERNFS_DIR) + kn->parent->dir.subdirs--; + + rb_erase(&kn->rb, &kn->parent->dir.children); +} + +/** + * kernfs_get_active - get an active reference to kernfs_node + * @kn: kernfs_node to get an active reference to + * + * Get an active reference of @kn. This function is noop if @kn + * is NULL. + * + * RETURNS: + * Pointer to @kn on success, NULL on failure. + */ +struct kernfs_node *kernfs_get_active(struct kernfs_node *kn) +{ + if (unlikely(!kn)) + return NULL; + + if (!atomic_inc_unless_negative(&kn->active)) + return NULL; + + if (kn->flags & KERNFS_LOCKDEP) + rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_); + return kn; +} + +/** + * kernfs_put_active - put an active reference to kernfs_node + * @kn: kernfs_node to put an active reference to + * + * Put an active reference to @kn. This function is noop if @kn + * is NULL. + */ +void kernfs_put_active(struct kernfs_node *kn) +{ + int v; + + if (unlikely(!kn)) + return; + + if (kn->flags & KERNFS_LOCKDEP) + rwsem_release(&kn->dep_map, 1, _RET_IP_); + v = atomic_dec_return(&kn->active); + if (likely(v != KN_DEACTIVATED_BIAS)) + return; + + /* + * atomic_dec_return() is a mb(), we'll always see the updated + * kn->u.completion. + */ + complete(kn->u.completion); +} + +/** + * kernfs_deactivate - deactivate kernfs_node + * @kn: kernfs_node to deactivate + * + * Deny new active references and drain existing ones. + */ +static void kernfs_deactivate(struct kernfs_node *kn) +{ + DECLARE_COMPLETION_ONSTACK(wait); + int v; + + BUG_ON(!(kn->flags & KERNFS_REMOVED)); + + if (!(kernfs_type(kn) & KERNFS_ACTIVE_REF)) + return; + + kn->u.completion = (void *)&wait; + + rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); + /* atomic_add_return() is a mb(), put_active() will always see + * the updated kn->u.completion. + */ + v = atomic_add_return(KN_DEACTIVATED_BIAS, &kn->active); + + if (v != KN_DEACTIVATED_BIAS) { + lock_contended(&kn->dep_map, _RET_IP_); + wait_for_completion(&wait); + } + + lock_acquired(&kn->dep_map, _RET_IP_); + rwsem_release(&kn->dep_map, 1, _RET_IP_); +} + +/** + * kernfs_get - get a reference count on a kernfs_node + * @kn: the target kernfs_node + */ +void kernfs_get(struct kernfs_node *kn) +{ + if (kn) { + WARN_ON(!atomic_read(&kn->count)); + atomic_inc(&kn->count); + } +} +EXPORT_SYMBOL_GPL(kernfs_get); + +/** + * kernfs_put - put a reference count on a kernfs_node + * @kn: the target kernfs_node + * + * Put a reference count of @kn and destroy it if it reached zero. + */ +void kernfs_put(struct kernfs_node *kn) +{ + struct kernfs_node *parent; + struct kernfs_root *root; + + if (!kn || !atomic_dec_and_test(&kn->count)) + return; + root = kernfs_root(kn); + repeat: + /* Moving/renaming is always done while holding reference. + * kn->parent won't change beneath us. + */ + parent = kn->parent; + + WARN(!(kn->flags & KERNFS_REMOVED), "kernfs: free using entry: %s/%s\n", + parent ? parent->name : "", kn->name); + + if (kernfs_type(kn) == KERNFS_LINK) + kernfs_put(kn->symlink.target_kn); + if (!(kn->flags & KERNFS_STATIC_NAME)) + kfree(kn->name); + if (kn->iattr) { + if (kn->iattr->ia_secdata) + security_release_secctx(kn->iattr->ia_secdata, + kn->iattr->ia_secdata_len); + simple_xattrs_free(&kn->iattr->xattrs); + } + kfree(kn->iattr); + ida_simple_remove(&root->ino_ida, kn->ino); + kmem_cache_free(kernfs_node_cache, kn); + + kn = parent; + if (kn) { + if (atomic_dec_and_test(&kn->count)) + goto repeat; + } else { + /* just released the root kn, free @root too */ + ida_destroy(&root->ino_ida); + kfree(root); + } +} +EXPORT_SYMBOL_GPL(kernfs_put); + +static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct kernfs_node *kn; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + /* Always perform fresh lookup for negatives */ + if (!dentry->d_inode) + goto out_bad_unlocked; + + kn = dentry->d_fsdata; + mutex_lock(&kernfs_mutex); + + /* The kernfs node has been deleted */ + if (kn->flags & KERNFS_REMOVED) + goto out_bad; + + /* The kernfs node has been moved? */ + if (dentry->d_parent->d_fsdata != kn->parent) + goto out_bad; + + /* The kernfs node has been renamed */ + if (strcmp(dentry->d_name.name, kn->name) != 0) + goto out_bad; + + /* The kernfs node has been moved to a different namespace */ + if (kn->parent && kernfs_ns_enabled(kn->parent) && + kernfs_info(dentry->d_sb)->ns != kn->ns) + goto out_bad; + + mutex_unlock(&kernfs_mutex); +out_valid: + return 1; +out_bad: + mutex_unlock(&kernfs_mutex); +out_bad_unlocked: + /* + * @dentry doesn't match the underlying kernfs node, drop the + * dentry and force lookup. If we have submounts we must allow the + * vfs caches to lie about the state of the filesystem to prevent + * leaks and other nasty things, so use check_submounts_and_drop() + * instead of d_drop(). + */ + if (check_submounts_and_drop(dentry) != 0) + goto out_valid; + + return 0; +} + +static void kernfs_dop_release(struct dentry *dentry) +{ + kernfs_put(dentry->d_fsdata); +} + +const struct dentry_operations kernfs_dops = { + .d_revalidate = kernfs_dop_revalidate, + .d_release = kernfs_dop_release, +}; + +static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, + const char *name, umode_t mode, + unsigned flags) +{ + char *dup_name = NULL; + struct kernfs_node *kn; + int ret; + + if (!(flags & KERNFS_STATIC_NAME)) { + name = dup_name = kstrdup(name, GFP_KERNEL); + if (!name) + return NULL; + } + + kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL); + if (!kn) + goto err_out1; + + ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL); + if (ret < 0) + goto err_out2; + kn->ino = ret; + + atomic_set(&kn->count, 1); + atomic_set(&kn->active, 0); + + kn->name = name; + kn->mode = mode; + kn->flags = flags | KERNFS_REMOVED; + + return kn; + + err_out2: + kmem_cache_free(kernfs_node_cache, kn); + err_out1: + kfree(dup_name); + return NULL; +} + +struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, + const char *name, umode_t mode, + unsigned flags) +{ + struct kernfs_node *kn; + + kn = __kernfs_new_node(kernfs_root(parent), name, mode, flags); + if (kn) { + kernfs_get(parent); + kn->parent = parent; + } + return kn; +} + +/** + * kernfs_addrm_start - prepare for kernfs_node add/remove + * @acxt: pointer to kernfs_addrm_cxt to be used + * + * This function is called when the caller is about to add or remove + * kernfs_node. This function acquires kernfs_mutex. @acxt is used + * to keep and pass context to other addrm functions. + * + * LOCKING: + * Kernel thread context (may sleep). kernfs_mutex is locked on + * return. + */ +void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt) + __acquires(kernfs_mutex) +{ + memset(acxt, 0, sizeof(*acxt)); + + mutex_lock(&kernfs_mutex); +} + +/** + * kernfs_add_one - add kernfs_node to parent without warning + * @acxt: addrm context to use + * @kn: kernfs_node to be added + * + * The caller must already have initialized @kn->parent. This + * function increments nlink of the parent's inode if @kn is a + * directory and link into the children list of the parent. + * + * This function should be called between calls to + * kernfs_addrm_start() and kernfs_addrm_finish() and should be passed + * the same @acxt as passed to kernfs_addrm_start(). + * + * LOCKING: + * Determined by kernfs_addrm_start(). + * + * RETURNS: + * 0 on success, -EEXIST if entry with the given name already + * exists. + */ +int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn) +{ + struct kernfs_node *parent = kn->parent; + bool has_ns = kernfs_ns_enabled(parent); + struct kernfs_iattrs *ps_iattr; + int ret; + + if (has_ns != (bool)kn->ns) { + WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", + has_ns ? "required" : "invalid", parent->name, kn->name); + return -EINVAL; + } + + if (kernfs_type(parent) != KERNFS_DIR) + return -EINVAL; + + if (parent->flags & KERNFS_REMOVED) + return -ENOENT; + + kn->hash = kernfs_name_hash(kn->name, kn->ns); + + ret = kernfs_link_sibling(kn); + if (ret) + return ret; + + /* Update timestamps on the parent */ + ps_iattr = parent->iattr; + if (ps_iattr) { + struct iattr *ps_iattrs = &ps_iattr->ia_iattr; + ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; + } + + /* Mark the entry added into directory tree */ + kn->flags &= ~KERNFS_REMOVED; + + return 0; +} + +/** + * kernfs_remove_one - remove kernfs_node from parent + * @acxt: addrm context to use + * @kn: kernfs_node to be removed + * + * Mark @kn removed and drop nlink of parent inode if @kn is a + * directory. @kn is unlinked from the children list. + * + * This function should be called between calls to + * kernfs_addrm_start() and kernfs_addrm_finish() and should be + * passed the same @acxt as passed to kernfs_addrm_start(). + * + * LOCKING: + * Determined by kernfs_addrm_start(). + */ +static void kernfs_remove_one(struct kernfs_addrm_cxt *acxt, + struct kernfs_node *kn) +{ + struct kernfs_iattrs *ps_iattr; + + /* + * Removal can be called multiple times on the same node. Only the + * first invocation is effective and puts the base ref. + */ + if (kn->flags & KERNFS_REMOVED) + return; + + if (kn->parent) { + kernfs_unlink_sibling(kn); + + /* Update timestamps on the parent */ + ps_iattr = kn->parent->iattr; + if (ps_iattr) { + ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME; + ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME; + } + } + + kn->flags |= KERNFS_REMOVED; + kn->u.removed_list = acxt->removed; + acxt->removed = kn; +} + +/** + * kernfs_addrm_finish - finish up kernfs_node add/remove + * @acxt: addrm context to finish up + * + * Finish up kernfs_node add/remove. Resources acquired by + * kernfs_addrm_start() are released and removed kernfs_nodes are + * cleaned up. + * + * LOCKING: + * kernfs_mutex is released. + */ +void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt) + __releases(kernfs_mutex) +{ + /* release resources acquired by kernfs_addrm_start() */ + mutex_unlock(&kernfs_mutex); + + /* kill removed kernfs_nodes */ + while (acxt->removed) { + struct kernfs_node *kn = acxt->removed; + + acxt->removed = kn->u.removed_list; + + kernfs_deactivate(kn); + kernfs_unmap_bin_file(kn); + kernfs_put(kn); + } +} + +/** + * kernfs_find_ns - find kernfs_node with the given name + * @parent: kernfs_node to search under + * @name: name to look for + * @ns: the namespace tag to use + * + * Look for kernfs_node with name @name under @parent. Returns pointer to + * the found kernfs_node on success, %NULL on failure. + */ +static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, + const unsigned char *name, + const void *ns) +{ + struct rb_node *node = parent->dir.children.rb_node; + bool has_ns = kernfs_ns_enabled(parent); + unsigned int hash; + + lockdep_assert_held(&kernfs_mutex); + + if (has_ns != (bool)ns) { + WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", + has_ns ? "required" : "invalid", parent->name, name); + return NULL; + } + + hash = kernfs_name_hash(name, ns); + while (node) { + struct kernfs_node *kn; + int result; + + kn = rb_to_kn(node); + result = kernfs_name_compare(hash, name, ns, kn); + if (result < 0) + node = node->rb_left; + else if (result > 0) + node = node->rb_right; + else + return kn; + } + return NULL; +} + +/** + * kernfs_find_and_get_ns - find and get kernfs_node with the given name + * @parent: kernfs_node to search under + * @name: name to look for + * @ns: the namespace tag to use + * + * Look for kernfs_node with name @name under @parent and get a reference + * if found. This function may sleep and returns pointer to the found + * kernfs_node on success, %NULL on failure. + */ +struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, + const char *name, const void *ns) +{ + struct kernfs_node *kn; + + mutex_lock(&kernfs_mutex); + kn = kernfs_find_ns(parent, name, ns); + kernfs_get(kn); + mutex_unlock(&kernfs_mutex); + + return kn; +} +EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns); + +/** + * kernfs_create_root - create a new kernfs hierarchy + * @kdops: optional directory syscall operations for the hierarchy + * @priv: opaque data associated with the new directory + * + * Returns the root of the new hierarchy on success, ERR_PTR() value on + * failure. + */ +struct kernfs_root *kernfs_create_root(struct kernfs_dir_ops *kdops, void *priv) +{ + struct kernfs_root *root; + struct kernfs_node *kn; + + root = kzalloc(sizeof(*root), GFP_KERNEL); + if (!root) + return ERR_PTR(-ENOMEM); + + ida_init(&root->ino_ida); + + kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO, + KERNFS_DIR); + if (!kn) { + ida_destroy(&root->ino_ida); + kfree(root); + return ERR_PTR(-ENOMEM); + } + + kn->flags &= ~KERNFS_REMOVED; + kn->priv = priv; + kn->dir.root = root; + + root->dir_ops = kdops; + root->kn = kn; + + return root; +} + +/** + * kernfs_destroy_root - destroy a kernfs hierarchy + * @root: root of the hierarchy to destroy + * + * Destroy the hierarchy anchored at @root by removing all existing + * directories and destroying @root. + */ +void kernfs_destroy_root(struct kernfs_root *root) +{ + kernfs_remove(root->kn); /* will also free @root */ +} + +/** + * kernfs_create_dir_ns - create a directory + * @parent: parent in which to create a new directory + * @name: name of the new directory + * @mode: mode of the new directory + * @priv: opaque data associated with the new directory + * @ns: optional namespace tag of the directory + * + * Returns the created node on success, ERR_PTR() value on failure. + */ +struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, + const char *name, umode_t mode, + void *priv, const void *ns) +{ + struct kernfs_addrm_cxt acxt; + struct kernfs_node *kn; + int rc; + + /* allocate */ + kn = kernfs_new_node(parent, name, mode | S_IFDIR, KERNFS_DIR); + if (!kn) + return ERR_PTR(-ENOMEM); + + kn->dir.root = parent->dir.root; + kn->ns = ns; + kn->priv = priv; + + /* link in */ + kernfs_addrm_start(&acxt); + rc = kernfs_add_one(&acxt, kn); + kernfs_addrm_finish(&acxt); + + if (!rc) + return kn; + + kernfs_put(kn); + return ERR_PTR(rc); +} + +static struct dentry *kernfs_iop_lookup(struct inode *dir, + struct dentry *dentry, + unsigned int flags) +{ + struct dentry *ret; + struct kernfs_node *parent = dentry->d_parent->d_fsdata; + struct kernfs_node *kn; + struct inode *inode; + const void *ns = NULL; + + mutex_lock(&kernfs_mutex); + + if (kernfs_ns_enabled(parent)) + ns = kernfs_info(dir->i_sb)->ns; + + kn = kernfs_find_ns(parent, dentry->d_name.name, ns); + + /* no such entry */ + if (!kn) { + ret = NULL; + goto out_unlock; + } + kernfs_get(kn); + dentry->d_fsdata = kn; + + /* attach dentry and inode */ + inode = kernfs_get_inode(dir->i_sb, kn); + if (!inode) { + ret = ERR_PTR(-ENOMEM); + goto out_unlock; + } + + /* instantiate and hash dentry */ + ret = d_materialise_unique(dentry, inode); + out_unlock: + mutex_unlock(&kernfs_mutex); + return ret; +} + +static int kernfs_iop_mkdir(struct inode *dir, struct dentry *dentry, + umode_t mode) +{ + struct kernfs_node *parent = dir->i_private; + struct kernfs_dir_ops *kdops = kernfs_root(parent)->dir_ops; + + if (!kdops || !kdops->mkdir) + return -EPERM; + + return kdops->mkdir(parent, dentry->d_name.name, mode); +} + +static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_dir_ops *kdops = kernfs_root(kn)->dir_ops; + + if (!kdops || !kdops->rmdir) + return -EPERM; + + return kdops->rmdir(kn); +} + +static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct kernfs_node *kn = old_dentry->d_fsdata; + struct kernfs_node *new_parent = new_dir->i_private; + struct kernfs_dir_ops *kdops = kernfs_root(kn)->dir_ops; + + if (!kdops || !kdops->rename) + return -EPERM; + + return kdops->rename(kn, new_parent, new_dentry->d_name.name); +} + +const struct inode_operations kernfs_dir_iops = { + .lookup = kernfs_iop_lookup, + .permission = kernfs_iop_permission, + .setattr = kernfs_iop_setattr, + .getattr = kernfs_iop_getattr, + .setxattr = kernfs_iop_setxattr, + .removexattr = kernfs_iop_removexattr, + .getxattr = kernfs_iop_getxattr, + .listxattr = kernfs_iop_listxattr, + + .mkdir = kernfs_iop_mkdir, + .rmdir = kernfs_iop_rmdir, + .rename = kernfs_iop_rename, +}; + +static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos) +{ + struct kernfs_node *last; + + while (true) { + struct rb_node *rbn; + + last = pos; + + if (kernfs_type(pos) != KERNFS_DIR) + break; + + rbn = rb_first(&pos->dir.children); + if (!rbn) + break; + + pos = rb_to_kn(rbn); + } + + return last; +} + +/** + * kernfs_next_descendant_post - find the next descendant for post-order walk + * @pos: the current position (%NULL to initiate traversal) + * @root: kernfs_node whose descendants to walk + * + * Find the next descendant to visit for post-order traversal of @root's + * descendants. @root is included in the iteration and the last node to be + * visited. + */ +static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, + struct kernfs_node *root) +{ + struct rb_node *rbn; + + lockdep_assert_held(&kernfs_mutex); + + /* if first iteration, visit leftmost descendant which may be root */ + if (!pos) + return kernfs_leftmost_descendant(root); + + /* if we visited @root, we're done */ + if (pos == root) + return NULL; + + /* if there's an unvisited sibling, visit its leftmost descendant */ + rbn = rb_next(&pos->rb); + if (rbn) + return kernfs_leftmost_descendant(rb_to_kn(rbn)); + + /* no sibling left, visit parent */ + return pos->parent; +} + +static void __kernfs_remove(struct kernfs_addrm_cxt *acxt, + struct kernfs_node *kn) +{ + struct kernfs_node *pos, *next; + + if (!kn) + return; + + pr_debug("kernfs %s: removing\n", kn->name); + + next = NULL; + do { + pos = next; + next = kernfs_next_descendant_post(pos, kn); + if (pos) + kernfs_remove_one(acxt, pos); + } while (next); +} + +/** + * kernfs_remove - remove a kernfs_node recursively + * @kn: the kernfs_node to remove + * + * Remove @kn along with all its subdirectories and files. + */ +void kernfs_remove(struct kernfs_node *kn) +{ + struct kernfs_addrm_cxt acxt; + + kernfs_addrm_start(&acxt); + __kernfs_remove(&acxt, kn); + kernfs_addrm_finish(&acxt); +} + +/** + * kernfs_remove_by_name_ns - find a kernfs_node by name and remove it + * @parent: parent of the target + * @name: name of the kernfs_node to remove + * @ns: namespace tag of the kernfs_node to remove + * + * Look for the kernfs_node with @name and @ns under @parent and remove it. + * Returns 0 on success, -ENOENT if such entry doesn't exist. + */ +int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, + const void *ns) +{ + struct kernfs_addrm_cxt acxt; + struct kernfs_node *kn; + + if (!parent) { + WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n", + name); + return -ENOENT; + } + + kernfs_addrm_start(&acxt); + + kn = kernfs_find_ns(parent, name, ns); + if (kn) + __kernfs_remove(&acxt, kn); + + kernfs_addrm_finish(&acxt); + + if (kn) + return 0; + else + return -ENOENT; +} + +/** + * kernfs_rename_ns - move and rename a kernfs_node + * @kn: target node + * @new_parent: new parent to put @sd under + * @new_name: new name + * @new_ns: new namespace tag + */ +int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, + const char *new_name, const void *new_ns) +{ + int error; + + mutex_lock(&kernfs_mutex); + + error = -ENOENT; + if ((kn->flags | new_parent->flags) & KERNFS_REMOVED) + goto out; + + error = 0; + if ((kn->parent == new_parent) && (kn->ns == new_ns) && + (strcmp(kn->name, new_name) == 0)) + goto out; /* nothing to rename */ + + error = -EEXIST; + if (kernfs_find_ns(new_parent, new_name, new_ns)) + goto out; + + /* rename kernfs_node */ + if (strcmp(kn->name, new_name) != 0) { + error = -ENOMEM; + new_name = kstrdup(new_name, GFP_KERNEL); + if (!new_name) + goto out; + + if (kn->flags & KERNFS_STATIC_NAME) + kn->flags &= ~KERNFS_STATIC_NAME; + else + kfree(kn->name); + + kn->name = new_name; + } + + /* + * Move to the appropriate place in the appropriate directories rbtree. + */ + kernfs_unlink_sibling(kn); + kernfs_get(new_parent); + kernfs_put(kn->parent); + kn->ns = new_ns; + kn->hash = kernfs_name_hash(kn->name, kn->ns); + kn->parent = new_parent; + kernfs_link_sibling(kn); + + error = 0; + out: + mutex_unlock(&kernfs_mutex); + return error; +} + +/* Relationship between s_mode and the DT_xxx types */ +static inline unsigned char dt_type(struct kernfs_node *kn) +{ + return (kn->mode >> 12) & 15; +} + +static int kernfs_dir_fop_release(struct inode *inode, struct file *filp) +{ + kernfs_put(filp->private_data); + return 0; +} + +static struct kernfs_node *kernfs_dir_pos(const void *ns, + struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos) +{ + if (pos) { + int valid = !(pos->flags & KERNFS_REMOVED) && + pos->parent == parent && hash == pos->hash; + kernfs_put(pos); + if (!valid) + pos = NULL; + } + if (!pos && (hash > 1) && (hash < INT_MAX)) { + struct rb_node *node = parent->dir.children.rb_node; + while (node) { + pos = rb_to_kn(node); + + if (hash < pos->hash) + node = node->rb_left; + else if (hash > pos->hash) + node = node->rb_right; + else + break; + } + } + /* Skip over entries in the wrong namespace */ + while (pos && pos->ns != ns) { + struct rb_node *node = rb_next(&pos->rb); + if (!node) + pos = NULL; + else + pos = rb_to_kn(node); + } + return pos; +} + +static struct kernfs_node *kernfs_dir_next_pos(const void *ns, + struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos) +{ + pos = kernfs_dir_pos(ns, parent, ino, pos); + if (pos) + do { + struct rb_node *node = rb_next(&pos->rb); + if (!node) + pos = NULL; + else + pos = rb_to_kn(node); + } while (pos && pos->ns != ns); + return pos; +} + +static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) +{ + struct dentry *dentry = file->f_path.dentry; + struct kernfs_node *parent = dentry->d_fsdata; + struct kernfs_node *pos = file->private_data; + const void *ns = NULL; + + if (!dir_emit_dots(file, ctx)) + return 0; + mutex_lock(&kernfs_mutex); + + if (kernfs_ns_enabled(parent)) + ns = kernfs_info(dentry->d_sb)->ns; + + for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos); + pos; + pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) { + const char *name = pos->name; + unsigned int type = dt_type(pos); + int len = strlen(name); + ino_t ino = pos->ino; + + ctx->pos = pos->hash; + file->private_data = pos; + kernfs_get(pos); + + mutex_unlock(&kernfs_mutex); + if (!dir_emit(ctx, name, len, ino, type)) + return 0; + mutex_lock(&kernfs_mutex); + } + mutex_unlock(&kernfs_mutex); + file->private_data = NULL; + ctx->pos = INT_MAX; + return 0; +} + +static loff_t kernfs_dir_fop_llseek(struct file *file, loff_t offset, + int whence) +{ + struct inode *inode = file_inode(file); + loff_t ret; + + mutex_lock(&inode->i_mutex); + ret = generic_file_llseek(file, offset, whence); + mutex_unlock(&inode->i_mutex); + + return ret; +} + +const struct file_operations kernfs_dir_fops = { + .read = generic_read_dir, + .iterate = kernfs_fop_readdir, + .release = kernfs_dir_fop_release, + .llseek = kernfs_dir_fop_llseek, +}; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c new file mode 100644 index 0000000..dbf397b --- /dev/null +++ b/fs/kernfs/file.c @@ -0,0 +1,867 @@ +/* + * fs/kernfs/file.c - kernfs file implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> + * + * This file is released under the GPLv2. + */ + +#include <linux/fs.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/poll.h> +#include <linux/pagemap.h> +#include <linux/sched.h> + +#include "kernfs-internal.h" + +/* + * There's one kernfs_open_file for each open file and one kernfs_open_node + * for each kernfs_node with one or more open files. + * + * kernfs_node->attr.open points to kernfs_open_node. attr.open is + * protected by kernfs_open_node_lock. + * + * filp->private_data points to seq_file whose ->private points to + * kernfs_open_file. kernfs_open_files are chained at + * kernfs_open_node->files, which is protected by kernfs_open_file_mutex. + */ +static DEFINE_SPINLOCK(kernfs_open_node_lock); +static DEFINE_MUTEX(kernfs_open_file_mutex); + +struct kernfs_open_node { + atomic_t refcnt; + atomic_t event; + wait_queue_head_t poll; + struct list_head files; /* goes through kernfs_open_file.list */ +}; + +static struct kernfs_open_file *kernfs_of(struct file *file) +{ + return ((struct seq_file *)file->private_data)->private; +} + +/* + * Determine the kernfs_ops for the given kernfs_node. This function must + * be called while holding an active reference. + */ +static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn) +{ + if (kn->flags & KERNFS_LOCKDEP) + lockdep_assert_held(kn); + return kn->attr.ops; +} + +/* + * As kernfs_seq_stop() is also called after kernfs_seq_start() or + * kernfs_seq_next() failure, it needs to distinguish whether it's stopping + * a seq_file iteration which is fully initialized with an active reference + * or an aborted kernfs_seq_start() due to get_active failure. The + * position pointer is the only context for each seq_file iteration and + * thus the stop condition should be encoded in it. As the return value is + * directly visible to userland, ERR_PTR(-ENODEV) is the only acceptable + * choice to indicate get_active failure. + * + * Unfortunately, this is complicated due to the optional custom seq_file + * operations which may return ERR_PTR(-ENODEV) too. kernfs_seq_stop() + * can't distinguish whether ERR_PTR(-ENODEV) is from get_active failure or + * custom seq_file operations and thus can't decide whether put_active + * should be performed or not only on ERR_PTR(-ENODEV). + * + * This is worked around by factoring out the custom seq_stop() and + * put_active part into kernfs_seq_stop_active(), skipping it from + * kernfs_seq_stop() if ERR_PTR(-ENODEV) while invoking it directly after + * custom seq_file operations fail with ERR_PTR(-ENODEV) - this ensures + * that kernfs_seq_stop_active() is skipped only after get_active failure. + */ +static void kernfs_seq_stop_active(struct seq_file *sf, void *v) +{ + struct kernfs_open_file *of = sf->private; + const struct kernfs_ops *ops = kernfs_ops(of->kn); + + if (ops->seq_stop) + ops->seq_stop(sf, v); + kernfs_put_active(of->kn); +} + +static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) +{ + struct kernfs_open_file *of = sf->private; + const struct kernfs_ops *ops; + + /* + * @of->mutex nests outside active ref and is just to ensure that + * the ops aren't called concurrently for the same open file. + */ + mutex_lock(&of->mutex); + if (!kernfs_get_active(of->kn)) + return ERR_PTR(-ENODEV); + + ops = kernfs_ops(of->kn); + if (ops->seq_start) { + void *next = ops->seq_start(sf, ppos); + /* see the comment above kernfs_seq_stop_active() */ + if (next == ERR_PTR(-ENODEV)) + kernfs_seq_stop_active(sf, next); + return next; + } else { + /* + * The same behavior and code as single_open(). Returns + * !NULL if pos is at the beginning; otherwise, NULL. + */ + return NULL + !*ppos; + } +} + +static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos) +{ + struct kernfs_open_file *of = sf->private; + const struct kernfs_ops *ops = kernfs_ops(of->kn); + + if (ops->seq_next) { + void *next = ops->seq_next(sf, v, ppos); + /* see the comment above kernfs_seq_stop_active() */ + if (next == ERR_PTR(-ENODEV)) + kernfs_seq_stop_active(sf, next); + return next; + } else { + /* + * The same behavior and code as single_open(), always + * terminate after the initial read. + */ + ++*ppos; + return NULL; + } +} + +static void kernfs_seq_stop(struct seq_file *sf, void *v) +{ + struct kernfs_open_file *of = sf->private; + + if (v != ERR_PTR(-ENODEV)) + kernfs_seq_stop_active(sf, v); + mutex_unlock(&of->mutex); +} + +static int kernfs_seq_show(struct seq_file *sf, void *v) +{ + struct kernfs_open_file *of = sf->private; + + of->event = atomic_read(&of->kn->attr.open->event); + + return of->kn->attr.ops->seq_show(sf, v); +} + +static const struct seq_operations kernfs_seq_ops = { + .start = kernfs_seq_start, + .next = kernfs_seq_next, + .stop = kernfs_seq_stop, + .show = kernfs_seq_show, +}; + +/* + * As reading a bin file can have side-effects, the exact offset and bytes + * specified in read(2) call should be passed to the read callback making + * it difficult to use seq_file. Implement simplistic custom buffering for + * bin files. + */ +static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, + char __user *user_buf, size_t count, + loff_t *ppos) +{ + ssize_t len = min_t(size_t, count, PAGE_SIZE); + const struct kernfs_ops *ops; + char *buf; + + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* + * @of->mutex nests outside active ref and is just to ensure that + * the ops aren't called concurrently for the same open file. + */ + mutex_lock(&of->mutex); + if (!kernfs_get_active(of->kn)) { + len = -ENODEV; + mutex_unlock(&of->mutex); + goto out_free; + } + + ops = kernfs_ops(of->kn); + if (ops->read) + len = ops->read(of, buf, len, *ppos); + else + len = -EINVAL; + + kernfs_put_active(of->kn); + mutex_unlock(&of->mutex); + + if (len < 0) + goto out_free; + + if (copy_to_user(user_buf, buf, len)) { + len = -EFAULT; + goto out_free; + } + + *ppos += len; + + out_free: + kfree(buf); + return len; +} + +/** + * kernfs_fop_read - kernfs vfs read callback + * @file: file pointer + * @user_buf: data to write + * @count: number of bytes + * @ppos: starting offset + */ +static ssize_t kernfs_fop_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct kernfs_open_file *of = kernfs_of(file); + + if (of->kn->flags & KERNFS_HAS_SEQ_SHOW) + return seq_read(file, user_buf, count, ppos); + else + return kernfs_file_direct_read(of, user_buf, count, ppos); +} + +/** + * kernfs_fop_write - kernfs vfs write callback + * @file: file pointer + * @user_buf: data to write + * @count: number of bytes + * @ppos: starting offset + * + * Copy data in from userland and pass it to the matching kernfs write + * operation. + * + * There is no easy way for us to know if userspace is only doing a partial + * write, so we don't support them. We expect the entire buffer to come on + * the first write. Hint: if you're writing a value, first read the file, + * modify only the the value you're changing, then write entire buffer + * back. + */ +static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct kernfs_open_file *of = kernfs_of(file); + ssize_t len = min_t(size_t, count, PAGE_SIZE); + const struct kernfs_ops *ops; + char *buf; + + buf = kmalloc(len + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, user_buf, len)) { + len = -EFAULT; + goto out_free; + } + buf[len] = '\0'; /* guarantee string termination */ + + /* + * @of->mutex nests outside active ref and is just to ensure that + * the ops aren't called concurrently for the same open file. + */ + mutex_lock(&of->mutex); + if (!kernfs_get_active(of->kn)) { + mutex_unlock(&of->mutex); + len = -ENODEV; + goto out_free; + } + + ops = kernfs_ops(of->kn); + if (ops->write) + len = ops->write(of, buf, len, *ppos); + else + len = -EINVAL; + + kernfs_put_active(of->kn); + mutex_unlock(&of->mutex); + + if (len > 0) + *ppos += len; +out_free: + kfree(buf); + return len; +} + +static void kernfs_vma_open(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + + if (!of->vm_ops) + return; + + if (!kernfs_get_active(of->kn)) + return; + + if (of->vm_ops->open) + of->vm_ops->open(vma); + + kernfs_put_active(of->kn); +} + +static int kernfs_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + int ret; + + if (!of->vm_ops) + return VM_FAULT_SIGBUS; + + if (!kernfs_get_active(of->kn)) + return VM_FAULT_SIGBUS; + + ret = VM_FAULT_SIGBUS; + if (of->vm_ops->fault) + ret = of->vm_ops->fault(vma, vmf); + + kernfs_put_active(of->kn); + return ret; +} + +static int kernfs_vma_page_mkwrite(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + int ret; + + if (!of->vm_ops) + return VM_FAULT_SIGBUS; + + if (!kernfs_get_active(of->kn)) + return VM_FAULT_SIGBUS; + + ret = 0; + if (of->vm_ops->page_mkwrite) + ret = of->vm_ops->page_mkwrite(vma, vmf); + else + file_update_time(file); + + kernfs_put_active(of->kn); + return ret; +} + +static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + int ret; + + if (!of->vm_ops) + return -EINVAL; + + if (!kernfs_get_active(of->kn)) + return -EINVAL; + + ret = -EINVAL; + if (of->vm_ops->access) + ret = of->vm_ops->access(vma, addr, buf, len, write); + + kernfs_put_active(of->kn); + return ret; +} + +#ifdef CONFIG_NUMA +static int kernfs_vma_set_policy(struct vm_area_struct *vma, + struct mempolicy *new) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + int ret; + + if (!of->vm_ops) + return 0; + + if (!kernfs_get_active(of->kn)) + return -EINVAL; + + ret = 0; + if (of->vm_ops->set_policy) + ret = of->vm_ops->set_policy(vma, new); + + kernfs_put_active(of->kn); + return ret; +} + +static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma, + unsigned long addr) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + struct mempolicy *pol; + + if (!of->vm_ops) + return vma->vm_policy; + + if (!kernfs_get_active(of->kn)) + return vma->vm_policy; + + pol = vma->vm_policy; + if (of->vm_ops->get_policy) + pol = of->vm_ops->get_policy(vma, addr); + + kernfs_put_active(of->kn); + return pol; +} + +static int kernfs_vma_migrate(struct vm_area_struct *vma, + const nodemask_t *from, const nodemask_t *to, + unsigned long flags) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + int ret; + + if (!of->vm_ops) + return 0; + + if (!kernfs_get_active(of->kn)) + return 0; + + ret = 0; + if (of->vm_ops->migrate) + ret = of->vm_ops->migrate(vma, from, to, flags); + + kernfs_put_active(of->kn); + return ret; +} +#endif + +static const struct vm_operations_struct kernfs_vm_ops = { + .open = kernfs_vma_open, + .fault = kernfs_vma_fault, + .page_mkwrite = kernfs_vma_page_mkwrite, + .access = kernfs_vma_access, +#ifdef CONFIG_NUMA + .set_policy = kernfs_vma_set_policy, + .get_policy = kernfs_vma_get_policy, + .migrate = kernfs_vma_migrate, +#endif +}; + +static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct kernfs_open_file *of = kernfs_of(file); + const struct kernfs_ops *ops; + int rc; + + /* + * mmap path and of->mutex are prone to triggering spurious lockdep + * warnings and we don't want to add spurious locking dependency + * between the two. Check whether mmap is actually implemented + * without grabbing @of->mutex by testing HAS_MMAP flag. See the + * comment in kernfs_file_open() for more details. + */ + if (!(of->kn->flags & KERNFS_HAS_MMAP)) + return -ENODEV; + + mutex_lock(&of->mutex); + + rc = -ENODEV; + if (!kernfs_get_active(of->kn)) + goto out_unlock; + + ops = kernfs_ops(of->kn); + rc = ops->mmap(of, vma); + + /* + * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() + * to satisfy versions of X which crash if the mmap fails: that + * substitutes a new vm_file, and we don't then want bin_vm_ops. + */ + if (vma->vm_file != file) + goto out_put; + + rc = -EINVAL; + if (of->mmapped && of->vm_ops != vma->vm_ops) + goto out_put; + + /* + * It is not possible to successfully wrap close. + * So error if someone is trying to use close. + */ + rc = -EINVAL; + if (vma->vm_ops && vma->vm_ops->close) + goto out_put; + + rc = 0; + of->mmapped = 1; + of->vm_ops = vma->vm_ops; + vma->vm_ops = &kernfs_vm_ops; +out_put: + kernfs_put_active(of->kn); +out_unlock: + mutex_unlock(&of->mutex); + + return rc; +} + +/** + * kernfs_get_open_node - get or create kernfs_open_node + * @kn: target kernfs_node + * @of: kernfs_open_file for this instance of open + * + * If @kn->attr.open exists, increment its reference count; otherwise, + * create one. @of is chained to the files list. + * + * LOCKING: + * Kernel thread context (may sleep). + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int kernfs_get_open_node(struct kernfs_node *kn, + struct kernfs_open_file *of) +{ + struct kernfs_open_node *on, *new_on = NULL; + + retry: + mutex_lock(&kernfs_open_file_mutex); + spin_lock_irq(&kernfs_open_node_lock); + + if (!kn->attr.open && new_on) { + kn->attr.open = new_on; + new_on = NULL; + } + + on = kn->attr.open; + if (on) { + atomic_inc(&on->refcnt); + list_add_tail(&of->list, &on->files); + } + + spin_unlock_irq(&kernfs_open_node_lock); + mutex_unlock(&kernfs_open_file_mutex); + + if (on) { + kfree(new_on); + return 0; + } + + /* not there, initialize a new one and retry */ + new_on = kmalloc(sizeof(*new_on), GFP_KERNEL); + if (!new_on) + return -ENOMEM; + + atomic_set(&new_on->refcnt, 0); + atomic_set(&new_on->event, 1); + init_waitqueue_head(&new_on->poll); + INIT_LIST_HEAD(&new_on->files); + goto retry; +} + +/** + * kernfs_put_open_node - put kernfs_open_node + * @kn: target kernfs_nodet + * @of: associated kernfs_open_file + * + * Put @kn->attr.open and unlink @of from the files list. If + * reference count reaches zero, disassociate and free it. + * + * LOCKING: + * None. + */ +static void kernfs_put_open_node(struct kernfs_node *kn, + struct kernfs_open_file *of) +{ + struct kernfs_open_node *on = kn->attr.open; + unsigned long flags; + + mutex_lock(&kernfs_open_file_mutex); + spin_lock_irqsave(&kernfs_open_node_lock, flags); + + if (of) + list_del(&of->list); + + if (atomic_dec_and_test(&on->refcnt)) + kn->attr.open = NULL; + else + on = NULL; + + spin_unlock_irqrestore(&kernfs_open_node_lock, flags); + mutex_unlock(&kernfs_open_file_mutex); + + kfree(on); +} + +static int kernfs_fop_open(struct inode *inode, struct file *file) +{ + struct kernfs_node *kn = file->f_path.dentry->d_fsdata; + const struct kernfs_ops *ops; + struct kernfs_open_file *of; + bool has_read, has_write, has_mmap; + int error = -EACCES; + + if (!kernfs_get_active(kn)) + return -ENODEV; + + ops = kernfs_ops(kn); + + has_read = ops->seq_show || ops->read || ops->mmap; + has_write = ops->write || ops->mmap; + has_mmap = ops->mmap; + + /* check perms and supported operations */ + if ((file->f_mode & FMODE_WRITE) && + (!(inode->i_mode & S_IWUGO) || !has_write)) + goto err_out; + + if ((file->f_mode & FMODE_READ) && + (!(inode->i_mode & S_IRUGO) || !has_read)) + goto err_out; + + /* allocate a kernfs_open_file for the file */ + error = -ENOMEM; + of = kzalloc(sizeof(struct kernfs_open_file), GFP_KERNEL); + if (!of) + goto err_out; + + /* + * The following is done to give a different lockdep key to + * @of->mutex for files which implement mmap. This is a rather + * crude way to avoid false positive lockdep warning around + * mm->mmap_sem - mmap nests @of->mutex under mm->mmap_sem and + * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under + * which mm->mmap_sem nests, while holding @of->mutex. As each + * open file has a separate mutex, it's okay as long as those don't + * happen on the same file. At this point, we can't easily give + * each file a separate locking class. Let's differentiate on + * whether the file has mmap or not for now. + * + * Both paths of the branch look the same. They're supposed to + * look that way and give @of->mutex different static lockdep keys. + */ + if (has_mmap) + mutex_init(&of->mutex); + else + mutex_init(&of->mutex); + + of->kn = kn; + of->file = file; + + /* + * Always instantiate seq_file even if read access doesn't use + * seq_file or is not requested. This unifies private data access + * and readable regular files are the vast majority anyway. + */ + if (ops->seq_show) + error = seq_open(file, &kernfs_seq_ops); + else + error = seq_open(file, NULL); + if (error) + goto err_free; + + ((struct seq_file *)file->private_data)->private = of; + + /* seq_file clears PWRITE unconditionally, restore it if WRITE */ + if (file->f_mode & FMODE_WRITE) + file->f_mode |= FMODE_PWRITE; + + /* make sure we have open node struct */ + error = kernfs_get_open_node(kn, of); + if (error) + goto err_close; + + /* open succeeded, put active references */ + kernfs_put_active(kn); + return 0; + +err_close: + seq_release(inode, file); +err_free: + kfree(of); +err_out: + kernfs_put_active(kn); + return error; +} + +static int kernfs_fop_release(struct inode *inode, struct file *filp) +{ + struct kernfs_node *kn = filp->f_path.dentry->d_fsdata; + struct kernfs_open_file *of = kernfs_of(filp); + + kernfs_put_open_node(kn, of); + seq_release(inode, filp); + kfree(of); + + return 0; +} + +void kernfs_unmap_bin_file(struct kernfs_node *kn) +{ + struct kernfs_open_node *on; + struct kernfs_open_file *of; + + if (!(kn->flags & KERNFS_HAS_MMAP)) + return; + + spin_lock_irq(&kernfs_open_node_lock); + on = kn->attr.open; + if (on) + atomic_inc(&on->refcnt); + spin_unlock_irq(&kernfs_open_node_lock); + if (!on) + return; + + mutex_lock(&kernfs_open_file_mutex); + list_for_each_entry(of, &on->files, list) { + struct inode *inode = file_inode(of->file); + unmap_mapping_range(inode->i_mapping, 0, 0, 1); + } + mutex_unlock(&kernfs_open_file_mutex); + + kernfs_put_open_node(kn, NULL); +} + +/* + * Kernfs attribute files are pollable. The idea is that you read + * the content and then you use 'poll' or 'select' to wait for + * the content to change. When the content changes (assuming the + * manager for the kobject supports notification), poll will + * return POLLERR|POLLPRI, and select will return the fd whether + * it is waiting for read, write, or exceptions. + * Once poll/select indicates that the value has changed, you + * need to close and re-open the file, or seek to 0 and read again. + * Reminder: this only works for attributes which actively support + * it, and it is not possible to test an attribute from userspace + * to see if it supports poll (Neither 'poll' nor 'select' return + * an appropriate error code). When in doubt, set a suitable timeout value. + */ +static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait) +{ + struct kernfs_open_file *of = kernfs_of(filp); + struct kernfs_node *kn = filp->f_path.dentry->d_fsdata; + struct kernfs_open_node *on = kn->attr.open; + + /* need parent for the kobj, grab both */ + if (!kernfs_get_active(kn)) + goto trigger; + + poll_wait(filp, &on->poll, wait); + + kernfs_put_active(kn); + + if (of->event != atomic_read(&on->event)) + goto trigger; + + return DEFAULT_POLLMASK; + + trigger: + return DEFAULT_POLLMASK|POLLERR|POLLPRI; +} + +/** + * kernfs_notify - notify a kernfs file + * @kn: file to notify + * + * Notify @kn such that poll(2) on @kn wakes up. + */ +void kernfs_notify(struct kernfs_node *kn) +{ + struct kernfs_open_node *on; + unsigned long flags; + + spin_lock_irqsave(&kernfs_open_node_lock, flags); + + if (!WARN_ON(kernfs_type(kn) != KERNFS_FILE)) { + on = kn->attr.open; + if (on) { + atomic_inc(&on->event); + wake_up_interruptible(&on->poll); + } + } + + spin_unlock_irqrestore(&kernfs_open_node_lock, flags); +} +EXPORT_SYMBOL_GPL(kernfs_notify); + +const struct file_operations kernfs_file_fops = { + .read = kernfs_fop_read, + .write = kernfs_fop_write, + .llseek = generic_file_llseek, + .mmap = kernfs_fop_mmap, + .open = kernfs_fop_open, + .release = kernfs_fop_release, + .poll = kernfs_fop_poll, +}; + +/** + * __kernfs_create_file - kernfs internal function to create a file + * @parent: directory to create the file in + * @name: name of the file + * @mode: mode of the file + * @size: size of the file + * @ops: kernfs operations for the file + * @priv: private data for the file + * @ns: optional namespace tag of the file + * @static_name: don't copy file name + * @key: lockdep key for the file's active_ref, %NULL to disable lockdep + * + * Returns the created node on success, ERR_PTR() value on error. + */ +struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, + const char *name, + umode_t mode, loff_t size, + const struct kernfs_ops *ops, + void *priv, const void *ns, + bool name_is_static, + struct lock_class_key *key) +{ + struct kernfs_addrm_cxt acxt; + struct kernfs_node *kn; + unsigned flags; + int rc; + + flags = KERNFS_FILE; + if (name_is_static) + flags |= KERNFS_STATIC_NAME; + + kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, flags); + if (!kn) + return ERR_PTR(-ENOMEM); + + kn->attr.ops = ops; + kn->attr.size = size; + kn->ns = ns; + kn->priv = priv; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (key) { + lockdep_init_map(&kn->dep_map, "s_active", key, 0); + kn->flags |= KERNFS_LOCKDEP; + } +#endif + + /* + * kn->attr.ops is accesible only while holding active ref. We + * need to know whether some ops are implemented outside active + * ref. Cache their existence in flags. + */ + if (ops->seq_show) + kn->flags |= KERNFS_HAS_SEQ_SHOW; + if (ops->mmap) + kn->flags |= KERNFS_HAS_MMAP; + + kernfs_addrm_start(&acxt); + rc = kernfs_add_one(&acxt, kn); + kernfs_addrm_finish(&acxt); + + if (rc) { + kernfs_put(kn); + return ERR_PTR(rc); + } + return kn; +} diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c new file mode 100644 index 0000000..e55126f --- /dev/null +++ b/fs/kernfs/inode.c @@ -0,0 +1,377 @@ +/* + * fs/kernfs/inode.c - kernfs inode implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> + * + * This file is released under the GPLv2. + */ + +#include <linux/pagemap.h> +#include <linux/backing-dev.h> +#include <linux/capability.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/xattr.h> +#include <linux/security.h> + +#include "kernfs-internal.h" + +static const struct address_space_operations kernfs_aops = { + .readpage = simple_readpage, + .write_begin = simple_write_begin, + .write_end = simple_write_end, +}; + +static struct backing_dev_info kernfs_bdi = { + .name = "kernfs", + .ra_pages = 0, /* No readahead */ + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, +}; + +static const struct inode_operations kernfs_iops = { + .permission = kernfs_iop_permission, + .setattr = kernfs_iop_setattr, + .getattr = kernfs_iop_getattr, + .setxattr = kernfs_iop_setxattr, + .removexattr = kernfs_iop_removexattr, + .getxattr = kernfs_iop_getxattr, + .listxattr = kernfs_iop_listxattr, +}; + +void __init kernfs_inode_init(void) +{ + if (bdi_init(&kernfs_bdi)) + panic("failed to init kernfs_bdi"); +} + +static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) +{ + struct iattr *iattrs; + + if (kn->iattr) + return kn->iattr; + + kn->iattr = kzalloc(sizeof(struct kernfs_iattrs), GFP_KERNEL); + if (!kn->iattr) + return NULL; + iattrs = &kn->iattr->ia_iattr; + + /* assign default attributes */ + iattrs->ia_mode = kn->mode; + iattrs->ia_uid = GLOBAL_ROOT_UID; + iattrs->ia_gid = GLOBAL_ROOT_GID; + iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; + + simple_xattrs_init(&kn->iattr->xattrs); + + return kn->iattr; +} + +static int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) +{ + struct kernfs_iattrs *attrs; + struct iattr *iattrs; + unsigned int ia_valid = iattr->ia_valid; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + iattrs = &attrs->ia_iattr; + + if (ia_valid & ATTR_UID) + iattrs->ia_uid = iattr->ia_uid; + if (ia_valid & ATTR_GID) + iattrs->ia_gid = iattr->ia_gid; + if (ia_valid & ATTR_ATIME) + iattrs->ia_atime = iattr->ia_atime; + if (ia_valid & ATTR_MTIME) + iattrs->ia_mtime = iattr->ia_mtime; + if (ia_valid & ATTR_CTIME) + iattrs->ia_ctime = iattr->ia_ctime; + if (ia_valid & ATTR_MODE) { + umode_t mode = iattr->ia_mode; + iattrs->ia_mode = kn->mode = mode; + } + return 0; +} + +/** + * kernfs_setattr - set iattr on a node + * @kn: target node + * @iattr: iattr to set + * + * Returns 0 on success, -errno on failure. + */ +int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) +{ + int ret; + + mutex_lock(&kernfs_mutex); + ret = __kernfs_setattr(kn, iattr); + mutex_unlock(&kernfs_mutex); + return ret; +} + +int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + struct kernfs_node *kn = dentry->d_fsdata; + int error; + + if (!kn) + return -EINVAL; + + mutex_lock(&kernfs_mutex); + error = inode_change_ok(inode, iattr); + if (error) + goto out; + + error = __kernfs_setattr(kn, iattr); + if (error) + goto out; + + /* this ignores size changes */ + setattr_copy(inode, iattr); + +out: + mutex_unlock(&kernfs_mutex); + return error; +} + +static int kernfs_node_setsecdata(struct kernfs_node *kn, void **secdata, + u32 *secdata_len) +{ + struct kernfs_iattrs *attrs; + void *old_secdata; + size_t old_secdata_len; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + old_secdata = attrs->ia_secdata; + old_secdata_len = attrs->ia_secdata_len; + + attrs->ia_secdata = *secdata; + attrs->ia_secdata_len = *secdata_len; + + *secdata = old_secdata; + *secdata_len = old_secdata_len; + return 0; +} + +int kernfs_iop_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_iattrs *attrs; + void *secdata; + int error; + u32 secdata_len = 0; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { + const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; + error = security_inode_setsecurity(dentry->d_inode, suffix, + value, size, flags); + if (error) + return error; + error = security_inode_getsecctx(dentry->d_inode, + &secdata, &secdata_len); + if (error) + return error; + + mutex_lock(&kernfs_mutex); + error = kernfs_node_setsecdata(kn, &secdata, &secdata_len); + mutex_unlock(&kernfs_mutex); + + if (secdata) + security_release_secctx(secdata, secdata_len); + return error; + } else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) { + return simple_xattr_set(&attrs->xattrs, name, value, size, + flags); + } + + return -EINVAL; +} + +int kernfs_iop_removexattr(struct dentry *dentry, const char *name) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_iattrs *attrs; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + return simple_xattr_remove(&attrs->xattrs, name); +} + +ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf, + size_t size) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_iattrs *attrs; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + return simple_xattr_get(&attrs->xattrs, name, buf, size); +} + +ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_iattrs *attrs; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + return simple_xattr_list(&attrs->xattrs, buf, size); +} + +static inline void set_default_inode_attr(struct inode *inode, umode_t mode) +{ + inode->i_mode = mode; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; +} + +static inline void set_inode_attr(struct inode *inode, struct iattr *iattr) +{ + inode->i_uid = iattr->ia_uid; + inode->i_gid = iattr->ia_gid; + inode->i_atime = iattr->ia_atime; + inode->i_mtime = iattr->ia_mtime; + inode->i_ctime = iattr->ia_ctime; +} + +static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) +{ + struct kernfs_iattrs *attrs = kn->iattr; + + inode->i_mode = kn->mode; + if (attrs) { + /* + * kernfs_node has non-default attributes get them from + * persistent copy in kernfs_node. + */ + set_inode_attr(inode, &attrs->ia_iattr); + security_inode_notifysecctx(inode, attrs->ia_secdata, + attrs->ia_secdata_len); + } + + if (kernfs_type(kn) == KERNFS_DIR) + set_nlink(inode, kn->dir.subdirs + 2); +} + +int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct inode *inode = dentry->d_inode; + + mutex_lock(&kernfs_mutex); + kernfs_refresh_inode(kn, inode); + mutex_unlock(&kernfs_mutex); + + generic_fillattr(inode, stat); + return 0; +} + +static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) +{ + kernfs_get(kn); + inode->i_private = kn; + inode->i_mapping->a_ops = &kernfs_aops; + inode->i_mapping->backing_dev_info = &kernfs_bdi; + inode->i_op = &kernfs_iops; + + set_default_inode_attr(inode, kn->mode); + kernfs_refresh_inode(kn, inode); + + /* initialize inode according to type */ + switch (kernfs_type(kn)) { + case KERNFS_DIR: + inode->i_op = &kernfs_dir_iops; + inode->i_fop = &kernfs_dir_fops; + break; + case KERNFS_FILE: + inode->i_size = kn->attr.size; + inode->i_fop = &kernfs_file_fops; + break; + case KERNFS_LINK: + inode->i_op = &kernfs_symlink_iops; + break; + default: + BUG(); + } + + unlock_new_inode(inode); +} + +/** + * kernfs_get_inode - get inode for kernfs_node + * @sb: super block + * @kn: kernfs_node to allocate inode for + * + * Get inode for @kn. If such inode doesn't exist, a new inode is + * allocated and basics are initialized. New inode is returned + * locked. + * + * LOCKING: + * Kernel thread context (may sleep). + * + * RETURNS: + * Pointer to allocated inode on success, NULL on failure. + */ +struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) +{ + struct inode *inode; + + inode = iget_locked(sb, kn->ino); + if (inode && (inode->i_state & I_NEW)) + kernfs_init_inode(kn, inode); + + return inode; +} + +/* + * The kernfs_node serves as both an inode and a directory entry for + * kernfs. To prevent the kernfs inode numbers from being freed + * prematurely we take a reference to kernfs_node from the kernfs inode. A + * super_operations.evict_inode() implementation is needed to drop that + * reference upon inode destruction. + */ +void kernfs_evict_inode(struct inode *inode) +{ + struct kernfs_node *kn = inode->i_private; + + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); + kernfs_put(kn); +} + +int kernfs_iop_permission(struct inode *inode, int mask) +{ + struct kernfs_node *kn; + + if (mask & MAY_NOT_BLOCK) + return -ECHILD; + + kn = inode->i_private; + + mutex_lock(&kernfs_mutex); + kernfs_refresh_inode(kn, inode); + mutex_unlock(&kernfs_mutex); + + return generic_permission(inode, mask); +} diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h new file mode 100644 index 0000000..eb536b7 --- /dev/null +++ b/fs/kernfs/kernfs-internal.h @@ -0,0 +1,122 @@ +/* + * fs/kernfs/kernfs-internal.h - kernfs internal header file + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo <teheo@suse.de> + * + * This file is released under the GPLv2. + */ + +#ifndef __KERNFS_INTERNAL_H +#define __KERNFS_INTERNAL_H + +#include <linux/lockdep.h> +#include <linux/fs.h> +#include <linux/mutex.h> +#include <linux/xattr.h> + +#include <linux/kernfs.h> + +struct kernfs_iattrs { + struct iattr ia_iattr; + void *ia_secdata; + u32 ia_secdata_len; + + struct simple_xattrs xattrs; +}; + +#define KN_DEACTIVATED_BIAS INT_MIN + +/* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */ + +/** + * kernfs_root - find out the kernfs_root a kernfs_node belongs to + * @kn: kernfs_node of interest + * + * Return the kernfs_root @kn belongs to. + */ +static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn) +{ + /* if parent exists, it's always a dir; otherwise, @sd is a dir */ + if (kn->parent) + kn = kn->parent; + return kn->dir.root; +} + +/* + * Context structure to be used while adding/removing nodes. + */ +struct kernfs_addrm_cxt { + struct kernfs_node *removed; +}; + +/* + * mount.c + */ +struct kernfs_super_info { + /* + * The root associated with this super_block. Each super_block is + * identified by the root and ns it's associated with. + */ + struct kernfs_root *root; + + /* + * Each sb is associated with one namespace tag, currently the + * network namespace of the task which mounted this kernfs + * instance. If multiple tags become necessary, make the following + * an array and compare kernfs_node tag against every entry. + */ + const void *ns; +}; +#define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info)) + +extern struct kmem_cache *kernfs_node_cache; + +/* + * inode.c + */ +struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn); +void kernfs_evict_inode(struct inode *inode); +int kernfs_iop_permission(struct inode *inode, int mask); +int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr); +int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); +int kernfs_iop_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags); +int kernfs_iop_removexattr(struct dentry *dentry, const char *name); +ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf, + size_t size); +ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size); +void kernfs_inode_init(void); + +/* + * dir.c + */ +extern struct mutex kernfs_mutex; +extern const struct dentry_operations kernfs_dops; +extern const struct file_operations kernfs_dir_fops; +extern const struct inode_operations kernfs_dir_iops; + +struct kernfs_node *kernfs_get_active(struct kernfs_node *kn); +void kernfs_put_active(struct kernfs_node *kn); +void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt); +int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn); +void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt); +struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, + const char *name, umode_t mode, + unsigned flags); + +/* + * file.c + */ +extern const struct file_operations kernfs_file_fops; + +void kernfs_unmap_bin_file(struct kernfs_node *kn); + +/* + * symlink.c + */ +extern const struct inode_operations kernfs_symlink_iops; + +#endif /* __KERNFS_INTERNAL_H */ diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c new file mode 100644 index 0000000..0d6ce89 --- /dev/null +++ b/fs/kernfs/mount.c @@ -0,0 +1,165 @@ +/* + * fs/kernfs/mount.c - kernfs mount implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> + * + * This file is released under the GPLv2. + */ + +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/init.h> +#include <linux/magic.h> +#include <linux/slab.h> +#include <linux/pagemap.h> + +#include "kernfs-internal.h" + +struct kmem_cache *kernfs_node_cache; + +static const struct super_operations kernfs_sops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, + .evict_inode = kernfs_evict_inode, +}; + +static int kernfs_fill_super(struct super_block *sb) +{ + struct kernfs_super_info *info = kernfs_info(sb); + struct inode *inode; + struct dentry *root; + + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = SYSFS_MAGIC; + sb->s_op = &kernfs_sops; + sb->s_time_gran = 1; + + /* get root inode, initialize and unlock it */ + mutex_lock(&kernfs_mutex); + inode = kernfs_get_inode(sb, info->root->kn); + mutex_unlock(&kernfs_mutex); + if (!inode) { + pr_debug("kernfs: could not get root inode\n"); + return -ENOMEM; + } + + /* instantiate and link root dentry */ + root = d_make_root(inode); + if (!root) { + pr_debug("%s: could not get root dentry!\n", __func__); + return -ENOMEM; + } + kernfs_get(info->root->kn); + root->d_fsdata = info->root->kn; + sb->s_root = root; + sb->s_d_op = &kernfs_dops; + return 0; +} + +static int kernfs_test_super(struct super_block *sb, void *data) +{ + struct kernfs_super_info *sb_info = kernfs_info(sb); + struct kernfs_super_info *info = data; + + return sb_info->root == info->root && sb_info->ns == info->ns; +} + +static int kernfs_set_super(struct super_block *sb, void *data) +{ + int error; + error = set_anon_super(sb, data); + if (!error) + sb->s_fs_info = data; + return error; +} + +/** + * kernfs_super_ns - determine the namespace tag of a kernfs super_block + * @sb: super_block of interest + * + * Return the namespace tag associated with kernfs super_block @sb. + */ +const void *kernfs_super_ns(struct super_block *sb) +{ + struct kernfs_super_info *info = kernfs_info(sb); + + return info->ns; +} + +/** + * kernfs_mount_ns - kernfs mount helper + * @fs_type: file_system_type of the fs being mounted + * @flags: mount flags specified for the mount + * @root: kernfs_root of the hierarchy being mounted + * @ns: optional namespace tag of the mount + * + * This is to be called from each kernfs user's file_system_type->mount() + * implementation, which should pass through the specified @fs_type and + * @flags, and specify the hierarchy and namespace tag to mount via @root + * and @ns, respectively. + * + * The return value can be passed to the vfs layer verbatim. + */ +struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, + struct kernfs_root *root, const void *ns) +{ + struct super_block *sb; + struct kernfs_super_info *info; + int error; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return ERR_PTR(-ENOMEM); + + info->root = root; + info->ns = ns; + + sb = sget(fs_type, kernfs_test_super, kernfs_set_super, flags, info); + if (IS_ERR(sb) || sb->s_fs_info != info) + kfree(info); + if (IS_ERR(sb)) + return ERR_CAST(sb); + if (!sb->s_root) { + error = kernfs_fill_super(sb); + if (error) { + deactivate_locked_super(sb); + return ERR_PTR(error); + } + sb->s_flags |= MS_ACTIVE; + } + + return dget(sb->s_root); +} + +/** + * kernfs_kill_sb - kill_sb for kernfs + * @sb: super_block being killed + * + * This can be used directly for file_system_type->kill_sb(). If a kernfs + * user needs extra cleanup, it can implement its own kill_sb() and call + * this function at the end. + */ +void kernfs_kill_sb(struct super_block *sb) +{ + struct kernfs_super_info *info = kernfs_info(sb); + struct kernfs_node *root_kn = sb->s_root->d_fsdata; + + /* + * Remove the superblock from fs_supers/s_instances + * so we can't find it, before freeing kernfs_super_info. + */ + kill_anon_super(sb); + kfree(info); + kernfs_put(root_kn); +} + +void __init kernfs_init(void) +{ + kernfs_node_cache = kmem_cache_create("kernfs_node_cache", + sizeof(struct kernfs_node), + 0, SLAB_PANIC, NULL); + kernfs_inode_init(); +} diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c new file mode 100644 index 0000000..4d45705 --- /dev/null +++ b/fs/kernfs/symlink.c @@ -0,0 +1,151 @@ +/* + * fs/kernfs/symlink.c - kernfs symlink implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> + * + * This file is released under the GPLv2. + */ + +#include <linux/fs.h> +#include <linux/gfp.h> +#include <linux/namei.h> + +#include "kernfs-internal.h" + +/** + * kernfs_create_link - create a symlink + * @parent: directory to create the symlink in + * @name: name of the symlink + * @target: target node for the symlink to point to + * + * Returns the created node on success, ERR_PTR() value on error. + */ +struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, + const char *name, + struct kernfs_node *target) +{ + struct kernfs_node *kn; + struct kernfs_addrm_cxt acxt; + int error; + + kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, KERNFS_LINK); + if (!kn) + return ERR_PTR(-ENOMEM); + + if (kernfs_ns_enabled(parent)) + kn->ns = target->ns; + kn->symlink.target_kn = target; + kernfs_get(target); /* ref owned by symlink */ + + kernfs_addrm_start(&acxt); + error = kernfs_add_one(&acxt, kn); + kernfs_addrm_finish(&acxt); + + if (!error) + return kn; + + kernfs_put(kn); + return ERR_PTR(error); +} + +static int kernfs_get_target_path(struct kernfs_node *parent, + struct kernfs_node *target, char *path) +{ + struct kernfs_node *base, *kn; + char *s = path; + int len = 0; + + /* go up to the root, stop at the base */ + base = parent; + while (base->parent) { + kn = target->parent; + while (kn->parent && base != kn) + kn = kn->parent; + + if (base == kn) + break; + + strcpy(s, "../"); + s += 3; + base = base->parent; + } + + /* determine end of target string for reverse fillup */ + kn = target; + while (kn->parent && kn != base) { + len += strlen(kn->name) + 1; + kn = kn->parent; + } + + /* check limits */ + if (len < 2) + return -EINVAL; + len--; + if ((s - path) + len > PATH_MAX) + return -ENAMETOOLONG; + + /* reverse fillup of target string from target to base */ + kn = target; + while (kn->parent && kn != base) { + int slen = strlen(kn->name); + + len -= slen; + strncpy(s + len, kn->name, slen); + if (len) + s[--len] = '/'; + + kn = kn->parent; + } + + return 0; +} + +static int kernfs_getlink(struct dentry *dentry, char *path) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_node *parent = kn->parent; + struct kernfs_node *target = kn->symlink.target_kn; + int error; + + mutex_lock(&kernfs_mutex); + error = kernfs_get_target_path(parent, target, path); + mutex_unlock(&kernfs_mutex); + + return error; +} + +static void *kernfs_iop_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + int error = -ENOMEM; + unsigned long page = get_zeroed_page(GFP_KERNEL); + if (page) { + error = kernfs_getlink(dentry, (char *) page); + if (error < 0) + free_page((unsigned long)page); + } + nd_set_link(nd, error ? ERR_PTR(error) : (char *)page); + return NULL; +} + +static void kernfs_iop_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + char *page = nd_get_link(nd); + if (!IS_ERR(page)) + free_page((unsigned long)page); +} + +const struct inode_operations kernfs_symlink_iops = { + .setxattr = kernfs_iop_setxattr, + .removexattr = kernfs_iop_removexattr, + .getxattr = kernfs_iop_getxattr, + .listxattr = kernfs_iop_listxattr, + .readlink = generic_readlink, + .follow_link = kernfs_iop_follow_link, + .put_link = kernfs_iop_put_link, + .setattr = kernfs_iop_setattr, + .getattr = kernfs_iop_getattr, + .permission = kernfs_iop_permission, +}; diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index 0f95f0d..76279e1 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -26,9 +26,9 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw) bio_vec.bv_len = PAGE_SIZE; bio_vec.bv_offset = 0; bio.bi_vcnt = 1; - bio.bi_size = PAGE_SIZE; bio.bi_bdev = bdev; - bio.bi_sector = page->index * (PAGE_SIZE >> 9); + bio.bi_iter.bi_sector = page->index * (PAGE_SIZE >> 9); + bio.bi_iter.bi_size = PAGE_SIZE; return submit_bio_wait(rw, &bio); } @@ -56,22 +56,18 @@ static DECLARE_WAIT_QUEUE_HEAD(wq); static void writeseg_end_io(struct bio *bio, int err) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bvec; + int i; struct super_block *sb = bio->bi_private; struct logfs_super *super = logfs_super(sb); - struct page *page; BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */ BUG_ON(err); - BUG_ON(bio->bi_vcnt == 0); - do { - page = bvec->bv_page; - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - - end_page_writeback(page); - page_cache_release(page); - } while (bvec >= bio->bi_io_vec); + + bio_for_each_segment_all(bvec, bio, i) { + end_page_writeback(bvec->bv_page); + page_cache_release(bvec->bv_page); + } bio_put(bio); if (atomic_dec_and_test(&super->s_pending_writes)) wake_up(&wq); @@ -96,9 +92,9 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, if (i >= max_pages) { /* Block layer cannot split bios :( */ bio->bi_vcnt = i; - bio->bi_size = i * PAGE_SIZE; + bio->bi_iter.bi_size = i * PAGE_SIZE; bio->bi_bdev = super->s_bdev; - bio->bi_sector = ofs >> 9; + bio->bi_iter.bi_sector = ofs >> 9; bio->bi_private = sb; bio->bi_end_io = writeseg_end_io; atomic_inc(&super->s_pending_writes); @@ -123,9 +119,9 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, unlock_page(page); } bio->bi_vcnt = nr_pages; - bio->bi_size = nr_pages * PAGE_SIZE; + bio->bi_iter.bi_size = nr_pages * PAGE_SIZE; bio->bi_bdev = super->s_bdev; - bio->bi_sector = ofs >> 9; + bio->bi_iter.bi_sector = ofs >> 9; bio->bi_private = sb; bio->bi_end_io = writeseg_end_io; atomic_inc(&super->s_pending_writes); @@ -188,9 +184,9 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, if (i >= max_pages) { /* Block layer cannot split bios :( */ bio->bi_vcnt = i; - bio->bi_size = i * PAGE_SIZE; + bio->bi_iter.bi_size = i * PAGE_SIZE; bio->bi_bdev = super->s_bdev; - bio->bi_sector = ofs >> 9; + bio->bi_iter.bi_sector = ofs >> 9; bio->bi_private = sb; bio->bi_end_io = erase_end_io; atomic_inc(&super->s_pending_writes); @@ -209,9 +205,9 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, bio->bi_io_vec[i].bv_offset = 0; } bio->bi_vcnt = nr_pages; - bio->bi_size = nr_pages * PAGE_SIZE; + bio->bi_iter.bi_size = nr_pages * PAGE_SIZE; bio->bi_bdev = super->s_bdev; - bio->bi_sector = ofs >> 9; + bio->bi_iter.bi_sector = ofs >> 9; bio->bi_private = sb; bio->bi_end_io = erase_end_io; atomic_inc(&super->s_pending_writes); diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c index d448a77..7f9b096 100644 --- a/fs/logfs/segment.c +++ b/fs/logfs/segment.c @@ -62,7 +62,8 @@ static struct page *get_mapping_page(struct super_block *sb, pgoff_t index, page = read_cache_page(mapping, index, filler, sb); else { page = find_or_create_page(mapping, index, GFP_NOFS); - unlock_page(page); + if (page) + unlock_page(page); } return page; } @@ -74,7 +74,7 @@ static inline int mnt_has_parent(struct mount *mnt) static inline int is_mounted(struct vfsmount *mnt) { /* neither detached nor internal? */ - return !IS_ERR_OR_NULL(real_mount(mnt)); + return !IS_ERR_OR_NULL(real_mount(mnt)->mnt_ns); } extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *); @@ -43,16 +43,14 @@ */ static void mpage_end_io(struct bio *bio, int err) { - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bv; + int i; - do { - struct page *page = bvec->bv_page; + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); if (bio_data_dir(bio) == READ) { - if (uptodate) { + if (!err) { SetPageUptodate(page); } else { ClearPageUptodate(page); @@ -60,14 +58,15 @@ static void mpage_end_io(struct bio *bio, int err) } unlock_page(page); } else { /* bio_data_dir(bio) == WRITE */ - if (!uptodate) { + if (err) { SetPageError(page); if (page->mapping) set_bit(AS_EIO, &page->mapping->flags); } end_page_writeback(page); } - } while (bvec >= bio->bi_io_vec); + } + bio_put(bio); } @@ -94,7 +93,7 @@ mpage_alloc(struct block_device *bdev, if (bio) { bio->bi_bdev = bdev; - bio->bi_sector = first_sector; + bio->bi_iter.bi_sector = first_sector; } return bio; } @@ -235,27 +235,9 @@ static int check_acl(struct inode *inode, int mask) return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK); } - acl = get_cached_acl(inode, ACL_TYPE_ACCESS); - - /* - * A filesystem can force a ACL callback by just never filling the - * ACL cache. But normally you'd fill the cache either at inode - * instantiation time, or on the first ->get_acl call. - * - * If the filesystem doesn't have a get_acl() function at all, we'll - * just create the negative cache entry. - */ - if (acl == ACL_NOT_CACHED) { - if (inode->i_op->get_acl) { - acl = inode->i_op->get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } else { - set_cached_acl(inode, ACL_TYPE_ACCESS, NULL); - return -EAGAIN; - } - } - + acl = get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); if (acl) { int error = posix_acl_permission(inode, acl, mask); posix_acl_release(acl); @@ -1598,11 +1580,6 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd) * do a "get_unaligned()" if this helps and is sufficiently * fast. * - * - Little-endian machines (so that we can generate the mask - * of low bytes efficiently). Again, we *could* do a byte - * swapping load on big-endian architectures if that is not - * expensive enough to make the optimization worthless. - * * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we * do not trap on the (extremely unlikely) case of a page * crossing operation. @@ -1646,7 +1623,7 @@ unsigned int full_name_hash(const unsigned char *name, unsigned int len) if (!len) goto done; } - mask = ~(~0ul << len*8); + mask = bytemask_from_count(len); hash += mask & a; done: return fold_hash(hash); diff --git a/fs/namespace.c b/fs/namespace.c index ac2ce8a..22e5367 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2790,6 +2790,8 @@ void __init mnt_init(void) for (u = 0; u < HASH_SIZE; u++) INIT_LIST_HEAD(&mountpoint_hashtable[u]); + kernfs_init(); + err = sysfs_init(); if (err) printk(KERN_WARNING "%s: sysfs_init error: %d\n", @@ -2886,7 +2888,7 @@ bool fs_fully_visible(struct file_system_type *type) struct inode *inode = child->mnt_mountpoint->d_inode; if (!S_ISDIR(inode->i_mode)) goto next; - if (inode->i_nlink != 2) + if (inode->i_nlink > 2) goto next; } visible = true; diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index e242bbf..56ff823 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -134,8 +134,8 @@ bl_submit_bio(int rw, struct bio *bio) if (bio) { get_parallel(bio->bi_private); dprintk("%s submitting %s bio %u@%llu\n", __func__, - rw == READ ? "read" : "write", - bio->bi_size, (unsigned long long)bio->bi_sector); + rw == READ ? "read" : "write", bio->bi_iter.bi_size, + (unsigned long long)bio->bi_iter.bi_sector); submit_bio(rw, bio); } return NULL; @@ -156,7 +156,8 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect, } if (bio) { - bio->bi_sector = isect - be->be_f_offset + be->be_v_offset; + bio->bi_iter.bi_sector = isect - be->be_f_offset + + be->be_v_offset; bio->bi_bdev = be->be_mdev; bio->bi_end_io = end_io; bio->bi_private = par; @@ -201,18 +202,14 @@ static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw, static void bl_end_io_read(struct bio *bio, int err) { struct parallel_io *par = bio->bi_private; - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bvec; + int i; - do { - struct page *page = bvec->bv_page; + if (!err) + bio_for_each_segment_all(bvec, bio, i) + SetPageUptodate(bvec->bv_page); - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - if (uptodate) - SetPageUptodate(page); - } while (bvec >= bio->bi_io_vec); - if (!uptodate) { + if (err) { struct nfs_read_data *rdata = par->data; struct nfs_pgio_header *header = rdata->header; @@ -383,20 +380,16 @@ static void mark_extents_written(struct pnfs_block_layout *bl, static void bl_end_io_write_zero(struct bio *bio, int err) { struct parallel_io *par = bio->bi_private; - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - - do { - struct page *page = bvec->bv_page; + struct bio_vec *bvec; + int i; - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); + bio_for_each_segment_all(bvec, bio, i) { /* This is the zeroing page we added */ - end_page_writeback(page); - page_cache_release(page); - } while (bvec >= bio->bi_io_vec); + end_page_writeback(bvec->bv_page); + page_cache_release(bvec->bv_page); + } - if (unlikely(!uptodate)) { + if (unlikely(err)) { struct nfs_write_data *data = par->data; struct nfs_pgio_header *header = data->header; @@ -519,7 +512,7 @@ bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be, isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) + (offset / SECTOR_SIZE); - bio->bi_sector = isect - be->be_f_offset + be->be_v_offset; + bio->bi_iter.bi_sector = isect - be->be_f_offset + be->be_v_offset; bio->bi_bdev = be->be_mdev; bio->bi_end_io = bl_read_single_end_io; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 02e1851..28a0a3c 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1692,10 +1692,6 @@ struct inode *nfs_alloc_inode(struct super_block *sb) return NULL; nfsi->flags = 0UL; nfsi->cache_validity = 0UL; -#ifdef CONFIG_NFS_V3_ACL - nfsi->acl_access = ERR_PTR(-EAGAIN); - nfsi->acl_default = ERR_PTR(-EAGAIN); -#endif #if IS_ENABLED(CONFIG_NFS_V4) nfsi->nfs4_acl = NULL; #endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 34e918d..9a5ca03 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -10,179 +10,7 @@ #define NFSDBG_FACILITY NFSDBG_PROC -ssize_t nfs3_listxattr(struct dentry *dentry, char *buffer, size_t size) -{ - struct inode *inode = dentry->d_inode; - struct posix_acl *acl; - int pos=0, len=0; - -# define output(s) do { \ - if (pos + sizeof(s) <= size) { \ - memcpy(buffer + pos, s, sizeof(s)); \ - pos += sizeof(s); \ - } \ - len += sizeof(s); \ - } while(0) - - acl = nfs3_proc_getacl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - output("system.posix_acl_access"); - posix_acl_release(acl); - } - - if (S_ISDIR(inode->i_mode)) { - acl = nfs3_proc_getacl(inode, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - output("system.posix_acl_default"); - posix_acl_release(acl); - } - } - -# undef output - - if (!buffer || len <= size) - return len; - return -ERANGE; -} - -ssize_t nfs3_getxattr(struct dentry *dentry, const char *name, - void *buffer, size_t size) -{ - struct inode *inode = dentry->d_inode; - struct posix_acl *acl; - int type, error = 0; - - if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) - type = ACL_TYPE_ACCESS; - else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) - type = ACL_TYPE_DEFAULT; - else - return -EOPNOTSUPP; - - acl = nfs3_proc_getacl(inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - if (type == ACL_TYPE_ACCESS && acl->a_count == 0) - error = -ENODATA; - else - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - } else - error = -ENODATA; - - return error; -} - -int nfs3_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) -{ - struct inode *inode = dentry->d_inode; - struct posix_acl *acl; - int type, error; - - if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) - type = ACL_TYPE_ACCESS; - else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) - type = ACL_TYPE_DEFAULT; - else - return -EOPNOTSUPP; - - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - error = nfs3_proc_setacl(inode, type, acl); - posix_acl_release(acl); - - return error; -} - -int nfs3_removexattr(struct dentry *dentry, const char *name) -{ - struct inode *inode = dentry->d_inode; - int type; - - if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) - type = ACL_TYPE_ACCESS; - else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) - type = ACL_TYPE_DEFAULT; - else - return -EOPNOTSUPP; - - return nfs3_proc_setacl(inode, type, NULL); -} - -static void __nfs3_forget_cached_acls(struct nfs_inode *nfsi) -{ - if (!IS_ERR(nfsi->acl_access)) { - posix_acl_release(nfsi->acl_access); - nfsi->acl_access = ERR_PTR(-EAGAIN); - } - if (!IS_ERR(nfsi->acl_default)) { - posix_acl_release(nfsi->acl_default); - nfsi->acl_default = ERR_PTR(-EAGAIN); - } -} - -void nfs3_forget_cached_acls(struct inode *inode) -{ - dprintk("NFS: nfs3_forget_cached_acls(%s/%lu)\n", inode->i_sb->s_id, - inode->i_ino); - spin_lock(&inode->i_lock); - __nfs3_forget_cached_acls(NFS_I(inode)); - spin_unlock(&inode->i_lock); -} - -static struct posix_acl *nfs3_get_cached_acl(struct inode *inode, int type) -{ - struct nfs_inode *nfsi = NFS_I(inode); - struct posix_acl *acl = ERR_PTR(-EINVAL); - - spin_lock(&inode->i_lock); - switch(type) { - case ACL_TYPE_ACCESS: - acl = nfsi->acl_access; - break; - - case ACL_TYPE_DEFAULT: - acl = nfsi->acl_default; - break; - - default: - goto out; - } - if (IS_ERR(acl)) - acl = ERR_PTR(-EAGAIN); - else - acl = posix_acl_dup(acl); -out: - spin_unlock(&inode->i_lock); - dprintk("NFS: nfs3_get_cached_acl(%s/%lu, %d) = %p\n", inode->i_sb->s_id, - inode->i_ino, type, acl); - return acl; -} - -static void nfs3_cache_acls(struct inode *inode, struct posix_acl *acl, - struct posix_acl *dfacl) -{ - struct nfs_inode *nfsi = NFS_I(inode); - - dprintk("nfs3_cache_acls(%s/%ld, %p, %p)\n", inode->i_sb->s_id, - inode->i_ino, acl, dfacl); - spin_lock(&inode->i_lock); - __nfs3_forget_cached_acls(NFS_I(inode)); - if (!IS_ERR(acl)) - nfsi->acl_access = posix_acl_dup(acl); - if (!IS_ERR(dfacl)) - nfsi->acl_default = posix_acl_dup(dfacl); - spin_unlock(&inode->i_lock); -} - -struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) +struct posix_acl *nfs3_get_acl(struct inode *inode, int type) { struct nfs_server *server = NFS_SERVER(inode); struct page *pages[NFSACL_MAXPAGES] = { }; @@ -198,7 +26,6 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) .rpc_argp = &args, .rpc_resp = &res, }; - struct posix_acl *acl; int status, count; if (!nfs_server_capable(inode, NFS_CAP_ACLS)) @@ -207,10 +34,6 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) status = nfs_revalidate_inode(server, inode); if (status < 0) return ERR_PTR(status); - acl = nfs3_get_cached_acl(inode, type); - if (acl != ERR_PTR(-EAGAIN)) - return acl; - acl = NULL; /* * Only get the access acl when explicitly requested: We don't @@ -257,40 +80,41 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) } if (res.acl_access != NULL) { - if (posix_acl_equiv_mode(res.acl_access, NULL) == 0) { + if (posix_acl_equiv_mode(res.acl_access, NULL) || + res.acl_access->a_count == 0) { posix_acl_release(res.acl_access); res.acl_access = NULL; } } - nfs3_cache_acls(inode, - (res.mask & NFS_ACL) ? res.acl_access : ERR_PTR(-EINVAL), - (res.mask & NFS_DFACL) ? res.acl_default : ERR_PTR(-EINVAL)); - switch(type) { - case ACL_TYPE_ACCESS: - acl = res.acl_access; - res.acl_access = NULL; - break; + if (res.mask & NFS_ACL) + set_cached_acl(inode, ACL_TYPE_ACCESS, res.acl_access); + else + forget_cached_acl(inode, ACL_TYPE_ACCESS); - case ACL_TYPE_DEFAULT: - acl = res.acl_default; - res.acl_default = NULL; + if (res.mask & NFS_DFACL) + set_cached_acl(inode, ACL_TYPE_DEFAULT, res.acl_default); + else + forget_cached_acl(inode, ACL_TYPE_DEFAULT); + + nfs_free_fattr(res.fattr); + if (type == ACL_TYPE_ACCESS) { + posix_acl_release(res.acl_default); + return res.acl_access; + } else { + posix_acl_release(res.acl_access); + return res.acl_default; } getout: posix_acl_release(res.acl_access); posix_acl_release(res.acl_default); nfs_free_fattr(res.fattr); - - if (status != 0) { - posix_acl_release(acl); - acl = ERR_PTR(status); - } - return acl; + return ERR_PTR(status); } -static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, - struct posix_acl *dfacl) +int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, + struct posix_acl *dfacl) { struct nfs_server *server = NFS_SERVER(inode); struct nfs_fattr *fattr; @@ -353,7 +177,8 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, switch (status) { case 0: status = nfs_refresh_inode(inode, fattr); - nfs3_cache_acls(inode, acl, dfacl); + set_cached_acl(inode, ACL_TYPE_ACCESS, acl); + set_cached_acl(inode, ACL_TYPE_DEFAULT, dfacl); break; case -EPFNOSUPPORT: case -EPROTONOSUPPORT: @@ -373,33 +198,27 @@ out: return status; } -int nfs3_proc_setacl(struct inode *inode, int type, struct posix_acl *acl) +int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type) { struct posix_acl *alloc = NULL, *dfacl = NULL; int status; if (S_ISDIR(inode->i_mode)) { switch(type) { - case ACL_TYPE_ACCESS: - alloc = dfacl = nfs3_proc_getacl(inode, - ACL_TYPE_DEFAULT); - if (IS_ERR(alloc)) - goto fail; - break; - - case ACL_TYPE_DEFAULT: - dfacl = acl; - alloc = acl = nfs3_proc_getacl(inode, - ACL_TYPE_ACCESS); - if (IS_ERR(alloc)) - goto fail; - break; - - default: - return -EINVAL; + case ACL_TYPE_ACCESS: + alloc = dfacl = get_acl(inode, ACL_TYPE_DEFAULT); + if (IS_ERR(alloc)) + goto fail; + break; + + case ACL_TYPE_DEFAULT: + dfacl = acl; + alloc = acl = get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(alloc)) + goto fail; + break; } - } else if (type != ACL_TYPE_ACCESS) - return -EINVAL; + } if (acl == NULL) { alloc = acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); @@ -417,24 +236,24 @@ fail: int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode, umode_t mode) { - struct posix_acl *dfacl, *acl; - int error = 0; + struct posix_acl *default_acl, *acl; + int error; - dfacl = nfs3_proc_getacl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(dfacl)) { - error = PTR_ERR(dfacl); + error = posix_acl_create(dir, &mode, &default_acl, &acl); + if (error) return (error == -EOPNOTSUPP) ? 0 : error; - } - if (!dfacl) - return 0; - acl = posix_acl_dup(dfacl); - error = posix_acl_create(&acl, GFP_KERNEL, &mode); - if (error < 0) - goto out_release_dfacl; - error = nfs3_proc_setacls(inode, acl, S_ISDIR(inode->i_mode) ? - dfacl : NULL); - posix_acl_release(acl); -out_release_dfacl: - posix_acl_release(dfacl); + + error = nfs3_proc_setacls(inode, acl, default_acl); + + if (acl) + posix_acl_release(acl); + if (default_acl) + posix_acl_release(default_acl); return error; } + +const struct xattr_handler *nfs3_xattr_handlers[] = { + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, + NULL, +}; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 01b6f6a..aa9bc973 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -317,8 +317,8 @@ static int nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, int flags) { + struct posix_acl *default_acl, *acl; struct nfs3_createdata *data; - umode_t mode = sattr->ia_mode; int status = -ENOMEM; dprintk("NFS call create %pd\n", dentry); @@ -340,7 +340,9 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, data->arg.create.verifier[1] = cpu_to_be32(current->pid); } - sattr->ia_mode &= ~current_umask(); + status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl); + if (status) + goto out; for (;;) { status = nfs3_do_create(dir, dentry, data); @@ -366,7 +368,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, } if (status != 0) - goto out; + goto out_release_acls; /* When we created the file with exclusive semantics, make * sure we set the attributes afterwards. */ @@ -385,9 +387,14 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, nfs_post_op_update_inode(dentry->d_inode, data->res.fattr); dprintk("NFS reply setattr (post-create): %d\n", status); if (status != 0) - goto out; + goto out_release_acls; } - status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); + + status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl); + +out_release_acls: + posix_acl_release(acl); + posix_acl_release(default_acl); out: nfs3_free_createdata(data); dprintk("NFS reply create: %d\n", status); @@ -572,18 +579,20 @@ out: static int nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { + struct posix_acl *default_acl, *acl; struct nfs3_createdata *data; - umode_t mode = sattr->ia_mode; int status = -ENOMEM; dprintk("NFS call mkdir %pd\n", dentry); - sattr->ia_mode &= ~current_umask(); - data = nfs3_alloc_createdata(); if (data == NULL) goto out; + status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl); + if (status) + goto out; + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR]; data->arg.mkdir.fh = NFS_FH(dir); data->arg.mkdir.name = dentry->d_name.name; @@ -592,9 +601,13 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) status = nfs3_do_create(dir, dentry, data); if (status != 0) - goto out; + goto out_release_acls; - status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); + status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl); + +out_release_acls: + posix_acl_release(acl); + posix_acl_release(default_acl); out: nfs3_free_createdata(data); dprintk("NFS reply mkdir: %d\n", status); @@ -691,19 +704,21 @@ static int nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, dev_t rdev) { + struct posix_acl *default_acl, *acl; struct nfs3_createdata *data; - umode_t mode = sattr->ia_mode; int status = -ENOMEM; dprintk("NFS call mknod %pd %u:%u\n", dentry, MAJOR(rdev), MINOR(rdev)); - sattr->ia_mode &= ~current_umask(); - data = nfs3_alloc_createdata(); if (data == NULL) goto out; + status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl); + if (status) + goto out; + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD]; data->arg.mknod.fh = NFS_FH(dir); data->arg.mknod.name = dentry->d_name.name; @@ -731,8 +746,13 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, status = nfs3_do_create(dir, dentry, data); if (status != 0) - goto out; - status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); + goto out_release_acls; + + status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl); + +out_release_acls: + posix_acl_release(acl); + posix_acl_release(default_acl); out: nfs3_free_createdata(data); dprintk("NFS reply mknod: %d\n", status); @@ -904,20 +924,28 @@ static const struct inode_operations nfs3_dir_inode_operations = { .permission = nfs_permission, .getattr = nfs_getattr, .setattr = nfs_setattr, - .listxattr = nfs3_listxattr, - .getxattr = nfs3_getxattr, - .setxattr = nfs3_setxattr, - .removexattr = nfs3_removexattr, +#ifdef CONFIG_NFS_V3_ACL + .listxattr = generic_listxattr, + .getxattr = generic_getxattr, + .setxattr = generic_setxattr, + .removexattr = generic_removexattr, + .get_acl = nfs3_get_acl, + .set_acl = nfs3_set_acl, +#endif }; static const struct inode_operations nfs3_file_inode_operations = { .permission = nfs_permission, .getattr = nfs_getattr, .setattr = nfs_setattr, - .listxattr = nfs3_listxattr, - .getxattr = nfs3_getxattr, - .setxattr = nfs3_setxattr, - .removexattr = nfs3_removexattr, +#ifdef CONFIG_NFS_V3_ACL + .listxattr = generic_listxattr, + .getxattr = generic_getxattr, + .setxattr = generic_setxattr, + .removexattr = generic_removexattr, + .get_acl = nfs3_get_acl, + .set_acl = nfs3_set_acl, +#endif }; const struct nfs_rpc_ops nfs_v3_clientops = { @@ -965,7 +993,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .commit_rpc_prepare = nfs3_proc_commit_rpc_prepare, .commit_done = nfs3_commit_done, .lock = nfs3_proc_lock, - .clear_acl_cache = nfs3_forget_cached_acls, + .clear_acl_cache = forget_all_cached_acls, .close_context = nfs_close_context, .have_delegation = nfs3_have_delegation, .return_delegation = nfs3_return_delegation, diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c index cc471c7..d6a9894 100644 --- a/fs/nfs/nfs3super.c +++ b/fs/nfs/nfs3super.c @@ -12,6 +12,9 @@ static struct nfs_subversion nfs_v3 = { .rpc_vers = &nfs_version3, .rpc_ops = &nfs_v3_clientops, .sops = &nfs_sops, +#ifdef CONFIG_NFS_V3_ACL + .xattr = nfs3_xattr_handlers, +#endif }; static int __init init_nfs_v3(void) diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h index 8b186a4..a812fd1 100644 --- a/fs/nfsd/acl.h +++ b/fs/nfsd/acl.h @@ -35,7 +35,9 @@ #ifndef LINUX_NFS4_ACL_H #define LINUX_NFS4_ACL_H -#include <linux/posix_acl.h> +struct nfs4_acl; +struct svc_fh; +struct svc_rqst; /* Maximum ACL we'll accept from client; chosen (somewhat arbitrarily) to * fit in a page: */ @@ -43,15 +45,11 @@ struct nfs4_acl *nfs4_acl_new(int); int nfs4_acl_get_whotype(char *, u32); -int nfs4_acl_write_who(int who, char *p); +__be32 nfs4_acl_write_who(int who, __be32 **p, int *len); -#define NFS4_ACL_TYPE_DEFAULT 0x01 -#define NFS4_ACL_DIR 0x02 -#define NFS4_ACL_OWNER 0x04 - -struct nfs4_acl *nfs4_acl_posix_to_nfsv4(struct posix_acl *, - struct posix_acl *, unsigned int flags); -int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *, struct posix_acl **, - struct posix_acl **, unsigned int flags); +int nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, + struct nfs4_acl **acl); +__be32 nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfs4_acl *acl); #endif /* LINUX_NFS4_ACL_H */ diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index d5c5b3e..b582f9a 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -84,12 +84,4 @@ int nfsd_cache_lookup(struct svc_rqst *); void nfsd_cache_update(struct svc_rqst *, int, __be32 *); int nfsd_reply_cache_stats_open(struct inode *, struct file *); -#ifdef CONFIG_NFSD_V4 -void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp); -#else /* CONFIG_NFSD_V4 */ -static inline void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp) -{ -} -#endif /* CONFIG_NFSD_V4 */ - #endif /* NFSCACHE_H */ diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h index bf95f6b..66e58db 100644 --- a/fs/nfsd/idmap.h +++ b/fs/nfsd/idmap.h @@ -56,7 +56,7 @@ static inline void nfsd_idmap_shutdown(struct net *net) __be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, kuid_t *); __be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, kgid_t *); -int nfsd_map_uid_to_name(struct svc_rqst *, kuid_t, char *); -int nfsd_map_gid_to_name(struct svc_rqst *, kgid_t, char *); +__be32 nfsd4_encode_user(struct svc_rqst *, kuid_t, __be32 **, int *); +__be32 nfsd4_encode_group(struct svc_rqst *, kgid_t, __be32 **, int *); #endif /* LINUX_NFSD_IDMAP_H */ diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 849a7c3..d32b3aa 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -95,6 +95,7 @@ struct nfsd_net { time_t nfsd4_grace; bool nfsd_net_up; + bool lockd_up; /* * Time of server startup diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 95d76dc..11c1fba 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -30,8 +30,9 @@ nfsacld_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp, struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp) { - svc_fh *fh; struct posix_acl *acl; + struct inode *inode; + svc_fh *fh; __be32 nfserr = 0; dprintk("nfsd: GETACL(2acl) %s\n", SVCFH_fmt(&argp->fh)); @@ -41,6 +42,8 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp, if (nfserr) RETURN_STATUS(nfserr); + inode = fh->fh_dentry->d_inode; + if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) RETURN_STATUS(nfserr_inval); resp->mask = argp->mask; @@ -50,21 +53,13 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp, goto fail; if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { - acl = nfsd_get_posix_acl(fh, ACL_TYPE_ACCESS); + acl = get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) { - int err = PTR_ERR(acl); - - if (err == -ENODATA || err == -EOPNOTSUPP) - acl = NULL; - else { - nfserr = nfserrno(err); - goto fail; - } + nfserr = nfserrno(PTR_ERR(acl)); + goto fail; } if (acl == NULL) { /* Solaris returns the inode's minimum ACL. */ - - struct inode *inode = fh->fh_dentry->d_inode; acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); } resp->acl_access = acl; @@ -72,17 +67,10 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp, if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { /* Check how Solaris handles requests for the Default ACL of a non-directory! */ - - acl = nfsd_get_posix_acl(fh, ACL_TYPE_DEFAULT); + acl = get_acl(inode, ACL_TYPE_DEFAULT); if (IS_ERR(acl)) { - int err = PTR_ERR(acl); - - if (err == -ENODATA || err == -EOPNOTSUPP) - acl = NULL; - else { - nfserr = nfserrno(err); - goto fail; - } + nfserr = nfserrno(PTR_ERR(acl)); + goto fail; } resp->acl_default = acl; } @@ -103,31 +91,51 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp, struct nfsd3_setaclargs *argp, struct nfsd_attrstat *resp) { + struct inode *inode; svc_fh *fh; __be32 nfserr = 0; + int error; dprintk("nfsd: SETACL(2acl) %s\n", SVCFH_fmt(&argp->fh)); fh = fh_copy(&resp->fh, &argp->fh); nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR); + if (nfserr) + goto out; - if (!nfserr) { - nfserr = nfserrno( nfsd_set_posix_acl( - fh, ACL_TYPE_ACCESS, argp->acl_access) ); - } - if (!nfserr) { - nfserr = nfserrno( nfsd_set_posix_acl( - fh, ACL_TYPE_DEFAULT, argp->acl_default) ); - } - if (!nfserr) { - nfserr = fh_getattr(fh, &resp->stat); + inode = fh->fh_dentry->d_inode; + if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) { + error = -EOPNOTSUPP; + goto out_errno; } + error = fh_want_write(fh); + if (error) + goto out_errno; + + error = inode->i_op->set_acl(inode, argp->acl_access, ACL_TYPE_ACCESS); + if (error) + goto out_drop_write; + error = inode->i_op->set_acl(inode, argp->acl_default, + ACL_TYPE_DEFAULT); + if (error) + goto out_drop_write; + + fh_drop_write(fh); + + nfserr = fh_getattr(fh, &resp->stat); + +out: /* argp->acl_{access,default} may have been allocated in nfssvc_decode_setaclargs. */ posix_acl_release(argp->acl_access); posix_acl_release(argp->acl_default); return nfserr; +out_drop_write: + fh_drop_write(fh); +out_errno: + nfserr = nfserrno(error); + goto out; } /* diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 9cbc1a8..adc5f1b 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -29,8 +29,9 @@ nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp, struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp) { - svc_fh *fh; struct posix_acl *acl; + struct inode *inode; + svc_fh *fh; __be32 nfserr = 0; fh = fh_copy(&resp->fh, &argp->fh); @@ -38,26 +39,20 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp, if (nfserr) RETURN_STATUS(nfserr); + inode = fh->fh_dentry->d_inode; + if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) RETURN_STATUS(nfserr_inval); resp->mask = argp->mask; if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { - acl = nfsd_get_posix_acl(fh, ACL_TYPE_ACCESS); + acl = get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) { - int err = PTR_ERR(acl); - - if (err == -ENODATA || err == -EOPNOTSUPP) - acl = NULL; - else { - nfserr = nfserrno(err); - goto fail; - } + nfserr = nfserrno(PTR_ERR(acl)); + goto fail; } if (acl == NULL) { /* Solaris returns the inode's minimum ACL. */ - - struct inode *inode = fh->fh_dentry->d_inode; acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); } resp->acl_access = acl; @@ -65,17 +60,10 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp, if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { /* Check how Solaris handles requests for the Default ACL of a non-directory! */ - - acl = nfsd_get_posix_acl(fh, ACL_TYPE_DEFAULT); + acl = get_acl(inode, ACL_TYPE_DEFAULT); if (IS_ERR(acl)) { - int err = PTR_ERR(acl); - - if (err == -ENODATA || err == -EOPNOTSUPP) - acl = NULL; - else { - nfserr = nfserrno(err); - goto fail; - } + nfserr = nfserrno(PTR_ERR(acl)); + goto fail; } resp->acl_default = acl; } @@ -96,21 +84,37 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst * rqstp, struct nfsd3_setaclargs *argp, struct nfsd3_attrstat *resp) { + struct inode *inode; svc_fh *fh; __be32 nfserr = 0; + int error; fh = fh_copy(&resp->fh, &argp->fh); nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR); + if (nfserr) + goto out; - if (!nfserr) { - nfserr = nfserrno( nfsd_set_posix_acl( - fh, ACL_TYPE_ACCESS, argp->acl_access) ); - } - if (!nfserr) { - nfserr = nfserrno( nfsd_set_posix_acl( - fh, ACL_TYPE_DEFAULT, argp->acl_default) ); + inode = fh->fh_dentry->d_inode; + if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) { + error = -EOPNOTSUPP; + goto out_errno; } + error = fh_want_write(fh); + if (error) + goto out_errno; + + error = inode->i_op->set_acl(inode, argp->acl_access, ACL_TYPE_ACCESS); + if (error) + goto out_drop_write; + error = inode->i_op->set_acl(inode, argp->acl_default, + ACL_TYPE_DEFAULT); + +out_drop_write: + fh_drop_write(fh); +out_errno: + nfserr = nfserrno(error); +out: /* argp->acl_{access,default} may have been allocated in nfs3svc_decode_setaclargs. */ posix_acl_release(argp->acl_access); diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 14d9ecb..de6e39e 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -168,7 +168,7 @@ encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat) { *p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]); - *p++ = htonl((u32) stat->mode); + *p++ = htonl((u32) (stat->mode & S_IALLUGO)); *p++ = htonl((u32) stat->nlink); *p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid)); *p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid)); @@ -842,21 +842,21 @@ out: static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen) { - struct svc_fh fh; + struct svc_fh *fh = &cd->scratch; __be32 err; - fh_init(&fh, NFS3_FHSIZE); - err = compose_entry_fh(cd, &fh, name, namlen); + fh_init(fh, NFS3_FHSIZE); + err = compose_entry_fh(cd, fh, name, namlen); if (err) { *p++ = 0; *p++ = 0; goto out; } - p = encode_post_op_attr(cd->rqstp, p, &fh); + p = encode_post_op_attr(cd->rqstp, p, fh); *p++ = xdr_one; /* yes, a file handle follows */ - p = encode_fh(p, &fh); + p = encode_fh(p, fh); out: - fh_put(&fh); + fh_put(fh); return p; } diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 8a50b3c..d3a5871 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -37,8 +37,14 @@ #include <linux/slab.h> #include <linux/nfs_fs.h> #include <linux/export.h> +#include "nfsfh.h" +#include "nfsd.h" #include "acl.h" +#include "vfs.h" +#define NFS4_ACL_TYPE_DEFAULT 0x01 +#define NFS4_ACL_DIR 0x02 +#define NFS4_ACL_OWNER 0x04 /* mode bit translations: */ #define NFS4_READ_MODE (NFS4_ACE_READ_DATA) @@ -130,36 +136,50 @@ static short ace2type(struct nfs4_ace *); static void _posix_to_nfsv4_one(struct posix_acl *, struct nfs4_acl *, unsigned int); -struct nfs4_acl * -nfs4_acl_posix_to_nfsv4(struct posix_acl *pacl, struct posix_acl *dpacl, - unsigned int flags) +int +nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, + struct nfs4_acl **acl) { - struct nfs4_acl *acl; + struct inode *inode = dentry->d_inode; + int error = 0; + struct posix_acl *pacl = NULL, *dpacl = NULL; + unsigned int flags = 0; int size = 0; - if (pacl) { - if (posix_acl_valid(pacl) < 0) - return ERR_PTR(-EINVAL); - size += 2*pacl->a_count; + pacl = get_acl(inode, ACL_TYPE_ACCESS); + if (!pacl) { + pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); + if (IS_ERR(pacl)) + return PTR_ERR(pacl); + /* allocate for worst case: one (deny, allow) pair each: */ + size += 2 * pacl->a_count; } - if (dpacl) { - if (posix_acl_valid(dpacl) < 0) - return ERR_PTR(-EINVAL); - size += 2*dpacl->a_count; + + if (S_ISDIR(inode->i_mode)) { + flags = NFS4_ACL_DIR; + dpacl = get_acl(inode, ACL_TYPE_DEFAULT); + if (dpacl) + size += 2 * dpacl->a_count; + } else { + dpacl = NULL; } - /* Allocate for worst case: one (deny, allow) pair each: */ - acl = nfs4_acl_new(size); - if (acl == NULL) - return ERR_PTR(-ENOMEM); + *acl = nfs4_acl_new(size); + if (*acl == NULL) { + error = -ENOMEM; + goto out; + } if (pacl) - _posix_to_nfsv4_one(pacl, acl, flags & ~NFS4_ACL_TYPE_DEFAULT); + _posix_to_nfsv4_one(pacl, *acl, flags & ~NFS4_ACL_TYPE_DEFAULT); if (dpacl) - _posix_to_nfsv4_one(dpacl, acl, flags | NFS4_ACL_TYPE_DEFAULT); + _posix_to_nfsv4_one(dpacl, *acl, flags | NFS4_ACL_TYPE_DEFAULT); - return acl; + out: + posix_acl_release(pacl); + posix_acl_release(dpacl); + return error; } struct posix_acl_summary { @@ -719,8 +739,9 @@ static void process_one_v4_ace(struct posix_acl_state *state, } } -int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, struct posix_acl **pacl, - struct posix_acl **dpacl, unsigned int flags) +static int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, + struct posix_acl **pacl, struct posix_acl **dpacl, + unsigned int flags) { struct posix_acl_state effective_acl_state, default_acl_state; struct nfs4_ace *ace; @@ -780,6 +801,57 @@ out_estate: return ret; } +__be32 +nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfs4_acl *acl) +{ + __be32 error; + int host_error; + struct dentry *dentry; + struct inode *inode; + struct posix_acl *pacl = NULL, *dpacl = NULL; + unsigned int flags = 0; + + /* Get inode */ + error = fh_verify(rqstp, fhp, 0, NFSD_MAY_SATTR); + if (error) + return error; + + dentry = fhp->fh_dentry; + inode = dentry->d_inode; + + if (!inode->i_op->set_acl || !IS_POSIXACL(inode)) + return nfserr_attrnotsupp; + + if (S_ISDIR(inode->i_mode)) + flags = NFS4_ACL_DIR; + + host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags); + if (host_error == -EINVAL) + return nfserr_attrnotsupp; + if (host_error < 0) + goto out_nfserr; + + host_error = inode->i_op->set_acl(inode, pacl, ACL_TYPE_ACCESS); + if (host_error < 0) + goto out_release; + + if (S_ISDIR(inode->i_mode)) { + host_error = inode->i_op->set_acl(inode, dpacl, + ACL_TYPE_DEFAULT); + } + +out_release: + posix_acl_release(pacl); + posix_acl_release(dpacl); +out_nfserr: + if (host_error == -EOPNOTSUPP) + return nfserr_attrnotsupp; + else + return nfserrno(host_error); +} + + static short ace2type(struct nfs4_ace *ace) { @@ -798,9 +870,6 @@ ace2type(struct nfs4_ace *ace) return -1; } -EXPORT_SYMBOL(nfs4_acl_posix_to_nfsv4); -EXPORT_SYMBOL(nfs4_acl_nfsv4_to_posix); - struct nfs4_acl * nfs4_acl_new(int n) { @@ -848,21 +917,22 @@ nfs4_acl_get_whotype(char *p, u32 len) return NFS4_ACL_WHO_NAMED; } -int -nfs4_acl_write_who(int who, char *p) +__be32 nfs4_acl_write_who(int who, __be32 **p, int *len) { int i; + int bytes; for (i = 0; i < ARRAY_SIZE(s2t_map); i++) { - if (s2t_map[i].type == who) { - memcpy(p, s2t_map[i].string, s2t_map[i].stringlen); - return s2t_map[i].stringlen; - } + if (s2t_map[i].type != who) + continue; + bytes = 4 + (XDR_QUADLEN(s2t_map[i].stringlen) << 2); + if (bytes > *len) + return nfserr_resource; + *p = xdr_encode_opaque(*p, s2t_map[i].string, + s2t_map[i].stringlen); + *len -= bytes; + return 0; } - BUG(); + WARN_ON_ONCE(1); return -1; } - -EXPORT_SYMBOL(nfs4_acl_new); -EXPORT_SYMBOL(nfs4_acl_get_whotype); -EXPORT_SYMBOL(nfs4_acl_write_who); diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 4832fd8..c0dfde6 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -551,27 +551,46 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen return 0; } -static int -idmap_id_to_name(struct svc_rqst *rqstp, int type, u32 id, char *name) +static __be32 encode_ascii_id(u32 id, __be32 **p, int *buflen) +{ + char buf[11]; + int len; + int bytes; + + len = sprintf(buf, "%u", id); + bytes = 4 + (XDR_QUADLEN(len) << 2); + if (bytes > *buflen) + return nfserr_resource; + *p = xdr_encode_opaque(*p, buf, len); + *buflen -= bytes; + return 0; +} + +static __be32 idmap_id_to_name(struct svc_rqst *rqstp, int type, u32 id, __be32 **p, int *buflen) { struct ent *item, key = { .id = id, .type = type, }; int ret; + int bytes; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); ret = idmap_lookup(rqstp, idtoname_lookup, &key, nn->idtoname_cache, &item); if (ret == -ENOENT) - return sprintf(name, "%u", id); + return encode_ascii_id(id, p, buflen); if (ret) - return ret; + return nfserrno(ret); ret = strlen(item->name); - BUG_ON(ret > IDMAP_NAMESZ); - memcpy(name, item->name, ret); + WARN_ON_ONCE(ret > IDMAP_NAMESZ); + bytes = 4 + (XDR_QUADLEN(ret) << 2); + if (bytes > *buflen) + return nfserr_resource; + *p = xdr_encode_opaque(*p, item->name, ret); + *buflen -= bytes; cache_put(&item->h, nn->idtoname_cache); - return ret; + return 0; } static bool @@ -603,12 +622,11 @@ do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u return idmap_name_to_id(rqstp, type, name, namelen, id); } -static int -do_id_to_name(struct svc_rqst *rqstp, int type, u32 id, char *name) +static __be32 encode_name_from_id(struct svc_rqst *rqstp, int type, u32 id, __be32 **p, int *buflen) { if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS) - return sprintf(name, "%u", id); - return idmap_id_to_name(rqstp, type, id, name); + return encode_ascii_id(id, p, buflen); + return idmap_id_to_name(rqstp, type, id, p, buflen); } __be32 @@ -637,16 +655,14 @@ nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen, return status; } -int -nfsd_map_uid_to_name(struct svc_rqst *rqstp, kuid_t uid, char *name) +__be32 nfsd4_encode_user(struct svc_rqst *rqstp, kuid_t uid, __be32 **p, int *buflen) { u32 id = from_kuid(&init_user_ns, uid); - return do_id_to_name(rqstp, IDMAP_TYPE_USER, id, name); + return encode_name_from_id(rqstp, IDMAP_TYPE_USER, id, p, buflen); } -int -nfsd_map_gid_to_name(struct svc_rqst *rqstp, kgid_t gid, char *name) +__be32 nfsd4_encode_group(struct svc_rqst *rqstp, kgid_t gid, __be32 **p, int *buflen) { u32 id = from_kgid(&init_user_ns, gid); - return do_id_to_name(rqstp, IDMAP_TYPE_GROUP, id, name); + return encode_name_from_id(rqstp, IDMAP_TYPE_GROUP, id, p, buflen); } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 419572f..82189b2 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -41,6 +41,7 @@ #include "vfs.h" #include "current_stateid.h" #include "netns.h" +#include "acl.h" #ifdef CONFIG_NFSD_V4_SECURITY_LABEL #include <linux/security.h> @@ -230,17 +231,16 @@ static void nfsd4_set_open_owner_reply_cache(struct nfsd4_compound_state *cstate } static __be32 -do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open) +do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh **resfh) { struct svc_fh *current_fh = &cstate->current_fh; - struct svc_fh *resfh; int accmode; __be32 status; - resfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL); - if (!resfh) + *resfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL); + if (!*resfh) return nfserr_jukebox; - fh_init(resfh, NFS4_FHSIZE); + fh_init(*resfh, NFS4_FHSIZE); open->op_truncate = 0; if (open->op_create) { @@ -265,12 +265,12 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru */ status = do_nfsd_create(rqstp, current_fh, open->op_fname.data, open->op_fname.len, &open->op_iattr, - resfh, open->op_createmode, + *resfh, open->op_createmode, (u32 *)open->op_verf.data, &open->op_truncate, &open->op_created); if (!status && open->op_label.len) - nfsd4_security_inode_setsecctx(resfh, &open->op_label, open->op_bmval); + nfsd4_security_inode_setsecctx(*resfh, &open->op_label, open->op_bmval); /* * Following rfc 3530 14.2.16, use the returned bitmask @@ -280,31 +280,32 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0) open->op_bmval[1] = (FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_MODIFY); - } else { + } else + /* + * Note this may exit with the parent still locked. + * We will hold the lock until nfsd4_open's final + * lookup, to prevent renames or unlinks until we've had + * a chance to an acquire a delegation if appropriate. + */ status = nfsd_lookup(rqstp, current_fh, - open->op_fname.data, open->op_fname.len, resfh); - fh_unlock(current_fh); - } + open->op_fname.data, open->op_fname.len, *resfh); if (status) goto out; - status = nfsd_check_obj_isreg(resfh); + status = nfsd_check_obj_isreg(*resfh); if (status) goto out; if (is_create_with_attrs(open) && open->op_acl != NULL) - do_set_nfs4_acl(rqstp, resfh, open->op_acl, open->op_bmval); + do_set_nfs4_acl(rqstp, *resfh, open->op_acl, open->op_bmval); - nfsd4_set_open_owner_reply_cache(cstate, open, resfh); + nfsd4_set_open_owner_reply_cache(cstate, open, *resfh); accmode = NFSD_MAY_NOP; if (open->op_created || open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) accmode |= NFSD_MAY_OWNER_OVERRIDE; - status = do_open_permission(rqstp, resfh, open, accmode); + status = do_open_permission(rqstp, *resfh, open, accmode); set_change_info(&open->op_cinfo, current_fh); - fh_dup2(current_fh, resfh); out: - fh_put(resfh); - kfree(resfh); return status; } @@ -357,6 +358,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open) { __be32 status; + struct svc_fh *resfh = NULL; struct nfsd4_compoundres *resp; struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -423,7 +425,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, switch (open->op_claim_type) { case NFS4_OPEN_CLAIM_DELEGATE_CUR: case NFS4_OPEN_CLAIM_NULL: - status = do_open_lookup(rqstp, cstate, open); + status = do_open_lookup(rqstp, cstate, open, &resfh); if (status) goto out; break; @@ -439,6 +441,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = do_open_fhandle(rqstp, cstate, open); if (status) goto out; + resfh = &cstate->current_fh; break; case NFS4_OPEN_CLAIM_DELEG_PREV_FH: case NFS4_OPEN_CLAIM_DELEGATE_PREV: @@ -458,9 +461,14 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * successful, it (1) truncates the file if open->op_truncate was * set, (2) sets open->op_stateid, (3) sets open->op_delegation. */ - status = nfsd4_process_open2(rqstp, &cstate->current_fh, open); + status = nfsd4_process_open2(rqstp, resfh, open); WARN_ON(status && open->op_created); out: + if (resfh && resfh != &cstate->current_fh) { + fh_dup2(&cstate->current_fh, resfh); + fh_put(resfh); + kfree(resfh); + } nfsd4_cleanup_open_state(open, status); if (open->op_openowner && !nfsd4_has_session(cstate)) cstate->replay_owner = &open->op_openowner->oo_owner; @@ -1069,8 +1077,10 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, cstate->current_fh.fh_dentry, &p, count, verify->ve_bmval, rqstp, 0); - - /* this means that nfsd4_encode_fattr() ran out of space */ + /* + * If nfsd4_encode_fattr() ran out of space, assume that's because + * the attributes are longer (hence different) than those given: + */ if (status == nfserr_resource) status = nfserr_not_same; if (status) @@ -1524,7 +1534,8 @@ static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\ - 1 + 1 + 2 + /* eir_flags, spr_how, spo_must_enforce & _allow */\ + 1 + 1 + /* eir_flags, spr_how */\ + 4 + /* spo_must_enforce & _allow with bitmap */\ 2 + /*eir_server_owner.so_minor_id */\ /* eir_server_owner.so_major_id<> */\ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\ @@ -1881,6 +1892,7 @@ struct svc_version nfsd_version4 = { .vs_proc = nfsd_procedures4, .vs_dispatch = nfsd_dispatch, .vs_xdrsize = NFS4_SVC_XDRSIZE, + .vs_rpcb_optnl = 1, }; /* diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 105d6fa..d5d070f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -832,10 +832,11 @@ static void nfsd4_put_drc_mem(struct nfsd4_channel_attrs *ca) spin_unlock(&nfsd_drc_lock); } -static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *attrs) +static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs, + struct nfsd4_channel_attrs *battrs) { - int numslots = attrs->maxreqs; - int slotsize = slot_bytes(attrs); + int numslots = fattrs->maxreqs; + int slotsize = slot_bytes(fattrs); struct nfsd4_session *new; int mem, i; @@ -852,6 +853,10 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *attrs) if (!new->se_slots[i]) goto out_free; } + + memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs)); + memcpy(&new->se_bchannel, battrs, sizeof(struct nfsd4_channel_attrs)); + return new; out_free: while (i--) @@ -997,8 +1002,7 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru list_add(&new->se_perclnt, &clp->cl_sessions); spin_unlock(&clp->cl_lock); spin_unlock(&nn->client_lock); - memcpy(&new->se_fchannel, &cses->fore_channel, - sizeof(struct nfsd4_channel_attrs)); + if (cses->flags & SESSION4_BACK_CHAN) { struct sockaddr *sa = svc_addr(rqstp); /* @@ -1851,6 +1855,11 @@ static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfs return nfs_ok; } +#define NFSD_CB_MAX_REQ_SZ ((NFS4_enc_cb_recall_sz + \ + RPC_MAX_HEADER_WITH_AUTH) * sizeof(__be32)) +#define NFSD_CB_MAX_RESP_SZ ((NFS4_dec_cb_recall_sz + \ + RPC_MAX_REPHEADER_WITH_AUTH) * sizeof(__be32)) + static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca) { ca->headerpadsz = 0; @@ -1861,9 +1870,9 @@ static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca) * less than 1k. Tighten up this estimate in the unlikely event * it turns out to be a problem for some client: */ - if (ca->maxreq_sz < NFS4_enc_cb_recall_sz + RPC_MAX_HEADER_WITH_AUTH) + if (ca->maxreq_sz < NFSD_CB_MAX_REQ_SZ) return nfserr_toosmall; - if (ca->maxresp_sz < NFS4_dec_cb_recall_sz + RPC_MAX_REPHEADER_WITH_AUTH) + if (ca->maxresp_sz < NFSD_CB_MAX_RESP_SZ) return nfserr_toosmall; ca->maxresp_cached = 0; if (ca->maxops < 2) @@ -1913,9 +1922,9 @@ nfsd4_create_session(struct svc_rqst *rqstp, return status; status = check_backchannel_attrs(&cr_ses->back_channel); if (status) - return status; + goto out_release_drc_mem; status = nfserr_jukebox; - new = alloc_session(&cr_ses->fore_channel); + new = alloc_session(&cr_ses->fore_channel, &cr_ses->back_channel); if (!new) goto out_release_drc_mem; conn = alloc_conn_from_crses(rqstp, cr_ses); @@ -3034,18 +3043,18 @@ static int nfs4_setlease(struct nfs4_delegation *dp) if (!fl) return -ENOMEM; fl->fl_file = find_readable_file(fp); - list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); status = vfs_setlease(fl->fl_file, fl->fl_type, &fl); - if (status) { - list_del_init(&dp->dl_perclnt); - locks_free_lock(fl); - return status; - } + if (status) + goto out_free; + list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); fp->fi_lease = fl; fp->fi_deleg_file = get_file(fl->fl_file); atomic_set(&fp->fi_delegees, 1); list_add(&dp->dl_perfile, &fp->fi_delegations); return 0; +out_free: + locks_free_lock(fl); + return status; } static int nfs4_set_delegation(struct nfs4_delegation *dp, struct nfs4_file *fp) @@ -3125,6 +3134,7 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh, goto out_no_deleg; break; case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_FH: /* * Let's not give out any delegations till everyone's * had the chance to reclaim theirs.... diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index ee7237f..63f2395 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -103,11 +103,6 @@ xdr_error: \ (x) = (u64)ntohl(*p++) << 32; \ (x) |= ntohl(*p++); \ } while (0) -#define READTIME(x) do { \ - p++; \ - (x) = ntohl(*p++); \ - p++; \ -} while (0) #define READMEM(x,nbytes) do { \ x = (char *)p; \ p += XDR_QUADLEN(nbytes); \ @@ -190,6 +185,15 @@ static int zero_clientid(clientid_t *clid) return (clid->cl_boot == 0) && (clid->cl_id == 0); } +/** + * defer_free - mark an allocation as deferred freed + * @argp: NFSv4 compound argument structure to be freed with + * @release: release callback to free @p, typically kfree() + * @p: pointer to be freed + * + * Marks @p to be freed when processing the compound operation + * described in @argp finishes. + */ static int defer_free(struct nfsd4_compoundargs *argp, void (*release)(const void *), void *p) @@ -206,6 +210,16 @@ defer_free(struct nfsd4_compoundargs *argp, return 0; } +/** + * savemem - duplicate a chunk of memory for later processing + * @argp: NFSv4 compound argument structure to be freed with + * @p: pointer to be duplicated + * @nbytes: length to be duplicated + * + * Returns a pointer to a copy of @nbytes bytes of memory at @p + * that are preserved until processing of the NFSv4 compound + * operation described by @argp finishes. + */ static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes) { if (p == argp->tmp) { @@ -257,7 +271,6 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, int expected_len, len = 0; u32 dummy32; char *buf; - int host_err; DECODE_HEAD; iattr->ia_valid = 0; @@ -284,10 +297,9 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, return nfserr_resource; *acl = nfs4_acl_new(nace); - if (*acl == NULL) { - host_err = -ENOMEM; - goto out_nfserr; - } + if (*acl == NULL) + return nfserr_jukebox; + defer_free(argp, kfree, *acl); (*acl)->naces = nace; @@ -425,10 +437,6 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, goto xdr_error; DECODE_TAIL; - -out_nfserr: - status = nfserrno(host_err); - goto out; } static __be32 @@ -1957,56 +1965,16 @@ static u32 nfs4_file_type(umode_t mode) }; } -static __be32 -nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, kuid_t uid, kgid_t gid, - __be32 **p, int *buflen) -{ - int status; - - if (*buflen < (XDR_QUADLEN(IDMAP_NAMESZ) << 2) + 4) - return nfserr_resource; - if (whotype != NFS4_ACL_WHO_NAMED) - status = nfs4_acl_write_who(whotype, (u8 *)(*p + 1)); - else if (gid_valid(gid)) - status = nfsd_map_gid_to_name(rqstp, gid, (u8 *)(*p + 1)); - else - status = nfsd_map_uid_to_name(rqstp, uid, (u8 *)(*p + 1)); - if (status < 0) - return nfserrno(status); - *p = xdr_encode_opaque(*p, NULL, status); - *buflen -= (XDR_QUADLEN(status) << 2) + 4; - BUG_ON(*buflen < 0); - return 0; -} - -static inline __be32 -nfsd4_encode_user(struct svc_rqst *rqstp, kuid_t user, __be32 **p, int *buflen) -{ - return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, user, INVALID_GID, - p, buflen); -} - -static inline __be32 -nfsd4_encode_group(struct svc_rqst *rqstp, kgid_t group, __be32 **p, int *buflen) -{ - return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, INVALID_UID, group, - p, buflen); -} - static inline __be32 nfsd4_encode_aclname(struct svc_rqst *rqstp, struct nfs4_ace *ace, __be32 **p, int *buflen) { - kuid_t uid = INVALID_UID; - kgid_t gid = INVALID_GID; - - if (ace->whotype == NFS4_ACL_WHO_NAMED) { - if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) - gid = ace->who_gid; - else - uid = ace->who_uid; - } - return nfsd4_encode_name(rqstp, ace->whotype, uid, gid, p, buflen); + if (ace->whotype != NFS4_ACL_WHO_NAMED) + return nfs4_acl_write_who(ace->whotype, p, buflen); + else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) + return nfsd4_encode_group(rqstp, ace->who_gid, p, buflen); + else + return nfsd4_encode_user(rqstp, ace->who_uid, p, buflen); } #define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \ @@ -2090,7 +2058,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, u32 bmval1 = bmval[1]; u32 bmval2 = bmval[2]; struct kstat stat; - struct svc_fh tempfh; + struct svc_fh *tempfh = NULL; struct kstatfs statfs; int buflen = count << 2; __be32 *attrlenp; @@ -2137,11 +2105,15 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, goto out_nfserr; } if ((bmval0 & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) { - fh_init(&tempfh, NFS4_FHSIZE); - status = fh_compose(&tempfh, exp, dentry, NULL); + tempfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL); + status = nfserr_jukebox; + if (!tempfh) + goto out; + fh_init(tempfh, NFS4_FHSIZE); + status = fh_compose(tempfh, exp, dentry, NULL); if (status) goto out; - fhp = &tempfh; + fhp = tempfh; } if (bmval0 & (FATTR4_WORD0_ACL | FATTR4_WORD0_ACLSUPPORT | FATTR4_WORD0_SUPPORTED_ATTRS)) { @@ -2222,8 +2194,10 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, if ((buflen -= 4) < 0) goto out_resource; dummy = nfs4_file_type(stat.mode); - if (dummy == NF4BAD) - goto out_serverfault; + if (dummy == NF4BAD) { + status = nfserr_serverfault; + goto out; + } WRITE32(dummy); } if (bmval0 & FATTR4_WORD0_FH_EXPIRE_TYPE) { @@ -2317,8 +2291,6 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, WRITE32(ace->flag); WRITE32(ace->access_mask & NFS4_ACE_MASK_ALL); status = nfsd4_encode_aclname(rqstp, ace, &p, &buflen); - if (status == nfserr_resource) - goto out_resource; if (status) goto out; } @@ -2379,8 +2351,6 @@ out_acl: } if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) { status = nfsd4_encode_fs_locations(rqstp, exp, &p, &buflen); - if (status == nfserr_resource) - goto out_resource; if (status) goto out; } @@ -2431,15 +2401,11 @@ out_acl: } if (bmval1 & FATTR4_WORD1_OWNER) { status = nfsd4_encode_user(rqstp, stat.uid, &p, &buflen); - if (status == nfserr_resource) - goto out_resource; if (status) goto out; } if (bmval1 & FATTR4_WORD1_OWNER_GROUP) { status = nfsd4_encode_group(rqstp, stat.gid, &p, &buflen); - if (status == nfserr_resource) - goto out_resource; if (status) goto out; } @@ -2533,8 +2499,8 @@ out: security_release_secctx(context, contextlen); #endif /* CONFIG_NFSD_V4_SECURITY_LABEL */ kfree(acl); - if (fhp == &tempfh) - fh_put(&tempfh); + if (tempfh) + fh_put(tempfh); return status; out_nfserr: status = nfserrno(err); @@ -2542,9 +2508,6 @@ out_nfserr: out_resource: status = nfserr_resource; goto out; -out_serverfault: - status = nfserr_serverfault; - goto out; } static inline int attributes_need_mount(u32 *bmval) @@ -2621,17 +2584,14 @@ out_put: static __be32 * nfsd4_encode_rdattr_error(__be32 *p, int buflen, __be32 nfserr) { - __be32 *attrlenp; - if (buflen < 6) return NULL; *p++ = htonl(2); *p++ = htonl(FATTR4_WORD0_RDATTR_ERROR); /* bmval0 */ *p++ = htonl(0); /* bmval1 */ - attrlenp = p++; + *p++ = htonl(4); /* attribute length */ *p++ = nfserr; /* no htonl */ - *attrlenp = htonl((char *)p - (char *)attrlenp - 4); return p; } @@ -3244,7 +3204,7 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp, if (rpcauth_get_gssinfo(pf, &info) == 0) { supported++; - RESERVE_SPACE(4 + 4 + info.oid.len + 4 + 4); + RESERVE_SPACE(4 + 4 + XDR_LEN(info.oid.len) + 4 + 4); WRITE32(RPC_AUTH_GSS); WRITE32(info.oid.len); WRITEMEM(info.oid.data, info.oid.len); @@ -3379,35 +3339,43 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, 8 /* eir_clientid */ + 4 /* eir_sequenceid */ + 4 /* eir_flags */ + - 4 /* spr_how */ + - 8 /* spo_must_enforce, spo_must_allow */ + - 8 /* so_minor_id */ + - 4 /* so_major_id.len */ + - (XDR_QUADLEN(major_id_sz) * 4) + - 4 /* eir_server_scope.len */ + - (XDR_QUADLEN(server_scope_sz) * 4) + - 4 /* eir_server_impl_id.count (0) */); + 4 /* spr_how */); WRITEMEM(&exid->clientid, 8); WRITE32(exid->seqid); WRITE32(exid->flags); WRITE32(exid->spa_how); + ADJUST_ARGS(); + switch (exid->spa_how) { case SP4_NONE: break; case SP4_MACH_CRED: + /* spo_must_enforce, spo_must_allow */ + RESERVE_SPACE(16); + /* spo_must_enforce bitmap: */ WRITE32(2); WRITE32(nfs4_minimal_spo_must_enforce[0]); WRITE32(nfs4_minimal_spo_must_enforce[1]); /* empty spo_must_allow bitmap: */ WRITE32(0); + + ADJUST_ARGS(); break; default: WARN_ON_ONCE(1); } + RESERVE_SPACE( + 8 /* so_minor_id */ + + 4 /* so_major_id.len */ + + (XDR_QUADLEN(major_id_sz) * 4) + + 4 /* eir_server_scope.len */ + + (XDR_QUADLEN(server_scope_sz) * 4) + + 4 /* eir_server_impl_id.count (0) */); + /* The server_owner struct */ WRITE64(minor_id); /* Minor id */ /* major id */ @@ -3474,28 +3442,6 @@ nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr, } static __be32 -nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_destroy_session *destroy_session) -{ - return nfserr; -} - -static __be32 -nfsd4_encode_free_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_free_stateid *free_stateid) -{ - __be32 *p; - - if (nfserr) - return nfserr; - - RESERVE_SPACE(4); - *p++ = nfserr; - ADJUST_ARGS(); - return nfserr; -} - -static __be32 nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_sequence *seq) { @@ -3593,8 +3539,8 @@ static nfsd4_enc nfsd4_enc_ops[] = { [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session, [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id, [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session, - [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, - [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_free_stateid, + [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop, + [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 9186c7c..f8f060f 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -409,22 +409,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) /* * Since the common case is a cache miss followed by an insert, - * preallocate an entry. First, try to reuse the first entry on the LRU - * if it works, then go ahead and prune the LRU list. + * preallocate an entry. */ - spin_lock(&cache_lock); - if (!list_empty(&lru_head)) { - rp = list_first_entry(&lru_head, struct svc_cacherep, c_lru); - if (nfsd_cache_entry_expired(rp) || - num_drc_entries >= max_drc_entries) { - lru_put_end(rp); - prune_cache_entries(); - goto search_cache; - } - } - - /* No expired ones available, allocate a new one. */ - spin_unlock(&cache_lock); rp = nfsd_reply_cache_alloc(); spin_lock(&cache_lock); if (likely(rp)) { @@ -432,7 +418,9 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) drc_mem_usage += sizeof(*rp); } -search_cache: + /* go ahead and prune the cache */ + prune_cache_entries(); + found = nfsd_cache_search(rqstp, csum); if (found) { if (likely(rp)) @@ -446,15 +434,6 @@ search_cache: goto out; } - /* - * We're keeping the one we just allocated. Are we now over the - * limit? Prune one off the tip of the LRU in trade for the one we - * just allocated if so. - */ - if (num_drc_entries >= max_drc_entries) - nfsd_reply_cache_free_locked(list_first_entry(&lru_head, - struct svc_cacherep, c_lru)); - nfsdstats.rcmisses++; rqstp->rq_cacherep = rp; rp->c_state = RC_INPROG; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 760c85a..9a4a5f9 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -241,6 +241,15 @@ static void nfsd_shutdown_generic(void) nfsd_racache_shutdown(); } +static bool nfsd_needs_lockd(void) +{ +#if defined(CONFIG_NFSD_V3) + return (nfsd_versions[2] != NULL) || (nfsd_versions[3] != NULL); +#else + return (nfsd_versions[2] != NULL); +#endif +} + static int nfsd_startup_net(int nrservs, struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -255,9 +264,14 @@ static int nfsd_startup_net(int nrservs, struct net *net) ret = nfsd_init_socks(net); if (ret) goto out_socks; - ret = lockd_up(net); - if (ret) - goto out_socks; + + if (nfsd_needs_lockd() && !nn->lockd_up) { + ret = lockd_up(net); + if (ret) + goto out_socks; + nn->lockd_up = 1; + } + ret = nfs4_state_start_net(net); if (ret) goto out_lockd; @@ -266,7 +280,10 @@ static int nfsd_startup_net(int nrservs, struct net *net) return 0; out_lockd: - lockd_down(net); + if (nn->lockd_up) { + lockd_down(net); + nn->lockd_up = 0; + } out_socks: nfsd_shutdown_generic(); return ret; @@ -277,7 +294,10 @@ static void nfsd_shutdown_net(struct net *net) struct nfsd_net *nn = net_generic(net, nfsd_net_id); nfs4_state_shutdown_net(net); - lockd_down(net); + if (nn->lockd_up) { + lockd_down(net); + nn->lockd_up = 0; + } nn->nfsd_net_up = false; nfsd_shutdown_generic(); } diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 9c769a4..b17d932 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -152,7 +152,7 @@ encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, type = (stat->mode & S_IFMT); *p++ = htonl(nfs_ftypes[type >> 12]); - *p++ = htonl((u32) stat->mode); + *p++ = htonl((u32) (stat->mode & S_IALLUGO)); *p++ = htonl((u32) stat->nlink); *p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid)); *p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid)); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 7eea63c..017d3cb5 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -207,7 +207,12 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out_nfserr; } } else { - fh_lock(fhp); + /* + * In the nfsd4_open() case, this may be held across + * subsequent open and delegation acquisition which may + * need to take the child's i_mutex: + */ + fh_lock_nested(fhp, I_MUTEX_PARENT); dentry = lookup_one_len(name, dparent, len); host_err = PTR_ERR(dentry); if (IS_ERR(dentry)) @@ -273,13 +278,6 @@ out: return err; } -static int nfsd_break_lease(struct inode *inode) -{ - if (!S_ISREG(inode->i_mode)) - return 0; - return break_lease(inode, O_WRONLY | O_NONBLOCK); -} - /* * Commit metadata changes to stable storage. */ @@ -348,8 +346,7 @@ nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap) /* Revoke setuid/setgid on chown */ if (!S_ISDIR(inode->i_mode) && - (((iap->ia_valid & ATTR_UID) && !uid_eq(iap->ia_uid, inode->i_uid)) || - ((iap->ia_valid & ATTR_GID) && !gid_eq(iap->ia_gid, inode->i_gid)))) { + ((iap->ia_valid & ATTR_UID) || (iap->ia_valid & ATTR_GID))) { iap->ia_valid |= ATTR_KILL_PRIV; if (iap->ia_valid & ATTR_MODE) { /* we're setting mode too, just clear the s*id bits */ @@ -449,16 +446,10 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, goto out_put_write_access; } - host_err = nfsd_break_lease(inode); - if (host_err) - goto out_put_write_access_nfserror; - fh_lock(fhp); host_err = notify_change(dentry, iap, NULL); fh_unlock(fhp); -out_put_write_access_nfserror: - err = nfserrno(host_err); out_put_write_access: if (size_change) put_write_access(inode); @@ -468,158 +459,7 @@ out: return err; } -#if defined(CONFIG_NFSD_V2_ACL) || \ - defined(CONFIG_NFSD_V3_ACL) || \ - defined(CONFIG_NFSD_V4) -static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf) -{ - ssize_t buflen; - ssize_t ret; - - buflen = vfs_getxattr(dentry, key, NULL, 0); - if (buflen <= 0) - return buflen; - - *buf = kmalloc(buflen, GFP_KERNEL); - if (!*buf) - return -ENOMEM; - - ret = vfs_getxattr(dentry, key, *buf, buflen); - if (ret < 0) - kfree(*buf); - return ret; -} -#endif - #if defined(CONFIG_NFSD_V4) -static int -set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key) -{ - int len; - size_t buflen; - char *buf = NULL; - int error = 0; - - buflen = posix_acl_xattr_size(pacl->a_count); - buf = kmalloc(buflen, GFP_KERNEL); - error = -ENOMEM; - if (buf == NULL) - goto out; - - len = posix_acl_to_xattr(&init_user_ns, pacl, buf, buflen); - if (len < 0) { - error = len; - goto out; - } - - error = vfs_setxattr(dentry, key, buf, len, 0); -out: - kfree(buf); - return error; -} - -__be32 -nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, - struct nfs4_acl *acl) -{ - __be32 error; - int host_error; - struct dentry *dentry; - struct inode *inode; - struct posix_acl *pacl = NULL, *dpacl = NULL; - unsigned int flags = 0; - - /* Get inode */ - error = fh_verify(rqstp, fhp, 0, NFSD_MAY_SATTR); - if (error) - return error; - - dentry = fhp->fh_dentry; - inode = dentry->d_inode; - if (S_ISDIR(inode->i_mode)) - flags = NFS4_ACL_DIR; - - host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags); - if (host_error == -EINVAL) { - return nfserr_attrnotsupp; - } else if (host_error < 0) - goto out_nfserr; - - host_error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS); - if (host_error < 0) - goto out_release; - - if (S_ISDIR(inode->i_mode)) - host_error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT); - -out_release: - posix_acl_release(pacl); - posix_acl_release(dpacl); -out_nfserr: - if (host_error == -EOPNOTSUPP) - return nfserr_attrnotsupp; - else - return nfserrno(host_error); -} - -static struct posix_acl * -_get_posix_acl(struct dentry *dentry, char *key) -{ - void *buf = NULL; - struct posix_acl *pacl = NULL; - int buflen; - - buflen = nfsd_getxattr(dentry, key, &buf); - if (!buflen) - buflen = -ENODATA; - if (buflen <= 0) - return ERR_PTR(buflen); - - pacl = posix_acl_from_xattr(&init_user_ns, buf, buflen); - kfree(buf); - return pacl; -} - -int -nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_acl **acl) -{ - struct inode *inode = dentry->d_inode; - int error = 0; - struct posix_acl *pacl = NULL, *dpacl = NULL; - unsigned int flags = 0; - - pacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_ACCESS); - if (IS_ERR(pacl) && PTR_ERR(pacl) == -ENODATA) - pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); - if (IS_ERR(pacl)) { - error = PTR_ERR(pacl); - pacl = NULL; - goto out; - } - - if (S_ISDIR(inode->i_mode)) { - dpacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_DEFAULT); - if (IS_ERR(dpacl) && PTR_ERR(dpacl) == -ENODATA) - dpacl = NULL; - else if (IS_ERR(dpacl)) { - error = PTR_ERR(dpacl); - dpacl = NULL; - goto out; - } - flags = NFS4_ACL_DIR; - } - - *acl = nfs4_acl_posix_to_nfsv4(pacl, dpacl, flags); - if (IS_ERR(*acl)) { - error = PTR_ERR(*acl); - *acl = NULL; - } - out: - posix_acl_release(pacl); - posix_acl_release(dpacl); - return error; -} - /* * NFS junction information is stored in an extended attribute. */ @@ -1760,11 +1600,6 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, err = nfserr_noent; if (!dold->d_inode) goto out_dput; - host_err = nfsd_break_lease(dold->d_inode); - if (host_err) { - err = nfserrno(host_err); - goto out_dput; - } host_err = vfs_link(dold, dirp, dnew, NULL); if (!host_err) { err = nfserrno(commit_metadata(ffhp)); @@ -1858,14 +1693,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry) goto out_dput_new; - host_err = nfsd_break_lease(odentry->d_inode); - if (host_err) - goto out_dput_new; - if (ndentry->d_inode) { - host_err = nfsd_break_lease(ndentry->d_inode); - if (host_err) - goto out_dput_new; - } host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL); if (!host_err) { host_err = commit_metadata(tfhp); @@ -1935,16 +1762,12 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (!type) type = rdentry->d_inode->i_mode & S_IFMT; - host_err = nfsd_break_lease(rdentry->d_inode); - if (host_err) - goto out_put; if (type != S_IFDIR) host_err = vfs_unlink(dirp, rdentry, NULL); else host_err = vfs_rmdir(dirp, rdentry); if (!host_err) host_err = commit_metadata(fhp); -out_put: dput(rdentry); out_nfserr: @@ -2284,93 +2107,3 @@ out_nomem: nfsd_racache_shutdown(); return -ENOMEM; } - -#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) -struct posix_acl * -nfsd_get_posix_acl(struct svc_fh *fhp, int type) -{ - struct inode *inode = fhp->fh_dentry->d_inode; - char *name; - void *value = NULL; - ssize_t size; - struct posix_acl *acl; - - if (!IS_POSIXACL(inode)) - return ERR_PTR(-EOPNOTSUPP); - - switch (type) { - case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; - break; - case ACL_TYPE_DEFAULT: - name = POSIX_ACL_XATTR_DEFAULT; - break; - default: - return ERR_PTR(-EOPNOTSUPP); - } - - size = nfsd_getxattr(fhp->fh_dentry, name, &value); - if (size < 0) - return ERR_PTR(size); - - acl = posix_acl_from_xattr(&init_user_ns, value, size); - kfree(value); - return acl; -} - -int -nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl) -{ - struct inode *inode = fhp->fh_dentry->d_inode; - char *name; - void *value = NULL; - size_t size; - int error; - - if (!IS_POSIXACL(inode) || - !inode->i_op->setxattr || !inode->i_op->removexattr) - return -EOPNOTSUPP; - switch(type) { - case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; - break; - case ACL_TYPE_DEFAULT: - name = POSIX_ACL_XATTR_DEFAULT; - break; - default: - return -EOPNOTSUPP; - } - - if (acl && acl->a_count) { - size = posix_acl_xattr_size(acl->a_count); - value = kmalloc(size, GFP_KERNEL); - if (!value) - return -ENOMEM; - error = posix_acl_to_xattr(&init_user_ns, acl, value, size); - if (error < 0) - goto getout; - size = error; - } else - size = 0; - - error = fh_want_write(fhp); - if (error) - goto getout; - if (size) - error = vfs_setxattr(fhp->fh_dentry, name, value, size, 0); - else { - if (!S_ISDIR(inode->i_mode) && type == ACL_TYPE_DEFAULT) - error = 0; - else { - error = vfs_removexattr(fhp->fh_dentry, name); - if (error == -ENODATA) - error = 0; - } - } - fh_drop_write(fhp); - -getout: - kfree(value); - return error; -} -#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index a4be2e3..fbe90bd 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -52,9 +52,6 @@ __be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *, struct iattr *, int, time_t); int nfsd_mountpoint(struct dentry *, struct svc_export *); #ifdef CONFIG_NFSD_V4 -__be32 nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *, - struct nfs4_acl *); -int nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **); __be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *, struct xdr_netobj *); #endif /* CONFIG_NFSD_V4 */ @@ -89,8 +86,6 @@ __be32 nfsd_link(struct svc_rqst *, struct svc_fh *, __be32 nfsd_rename(struct svc_rqst *, struct svc_fh *, char *, int, struct svc_fh *, char *, int); -__be32 nfsd_remove(struct svc_rqst *, - struct svc_fh *, char *, int); __be32 nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type, char *name, int len); __be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *, @@ -101,11 +96,6 @@ __be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *, __be32 nfsd_permission(struct svc_rqst *, struct svc_export *, struct dentry *, int); -#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) -struct posix_acl *nfsd_get_posix_acl(struct svc_fh *, int); -int nfsd_set_posix_acl(struct svc_fh *, int, struct posix_acl *); -#endif - static inline int fh_want_write(struct svc_fh *fh) { int ret = mnt_want_write(fh->fh_export->ex_path.mnt); diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index b6d5542..335e04a 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -174,6 +174,9 @@ struct nfsd3_linkres { struct nfsd3_readdirres { __be32 status; struct svc_fh fh; + /* Just to save kmalloc on every readdirplus entry (svc_fh is a + * little large for the stack): */ + struct svc_fh scratch; int count; __be32 verf[2]; diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index b3ed644..d278a0d 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -228,7 +228,7 @@ struct nfsd4_open { u32 op_create; /* request */ u32 op_createmode; /* request */ u32 op_bmval[3]; /* request */ - struct iattr iattr; /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */ + struct iattr op_iattr; /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */ nfs4_verifier op_verf __attribute__((aligned(32))); /* EXCLUSIVE4 */ clientid_t op_clientid; /* request */ @@ -250,7 +250,6 @@ struct nfsd4_open { struct nfs4_acl *op_acl; struct xdr_netobj op_label; }; -#define op_iattr iattr struct nfsd4_open_confirm { stateid_t oc_req_stateid /* request */; @@ -374,7 +373,6 @@ struct nfsd4_test_stateid { struct nfsd4_free_stateid { stateid_t fr_stateid; /* request */ - __be32 fr_status; /* response */ }; /* also used for NVERIFY */ diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index b44bdb29..2b34021 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -37,7 +37,26 @@ #include "sufile.h" #include "dat.h" - +/** + * nilfs_ioctl_wrap_copy - wrapping function of get/set metadata info + * @nilfs: nilfs object + * @argv: vector of arguments from userspace + * @dir: set of direction flags + * @dofunc: concrete function of get/set metadata info + * + * Description: nilfs_ioctl_wrap_copy() gets/sets metadata info by means of + * calling dofunc() function on the basis of @argv argument. + * + * Return Value: On success, 0 is returned and requested metadata info + * is copied into userspace. On error, one of the following + * negative error codes is returned. + * + * %-EINVAL - Invalid arguments from userspace. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EFAULT - Failure during execution of requested operation. + */ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs, struct nilfs_argv *argv, int dir, ssize_t (*dofunc)(struct the_nilfs *, @@ -57,6 +76,14 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs, if (argv->v_size > PAGE_SIZE) return -EINVAL; + /* + * Reject pairs of a start item position (argv->v_index) and a + * total count (argv->v_nmembs) which leads position 'pos' to + * overflow by the increment at the end of the loop. + */ + if (argv->v_index > ~(__u64)0 - argv->v_nmembs) + return -EINVAL; + buf = (void *)__get_free_pages(GFP_NOFS, 0); if (unlikely(!buf)) return -ENOMEM; @@ -99,6 +126,9 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs, return ret; } +/** + * nilfs_ioctl_getflags - ioctl to support lsattr + */ static int nilfs_ioctl_getflags(struct inode *inode, void __user *argp) { unsigned int flags = NILFS_I(inode)->i_flags & FS_FL_USER_VISIBLE; @@ -106,6 +136,9 @@ static int nilfs_ioctl_getflags(struct inode *inode, void __user *argp) return put_user(flags, (int __user *)argp); } +/** + * nilfs_ioctl_setflags - ioctl to support chattr + */ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp, void __user *argp) { @@ -158,11 +191,33 @@ out: return ret; } +/** + * nilfs_ioctl_getversion - get info about a file's version (generation number) + */ static int nilfs_ioctl_getversion(struct inode *inode, void __user *argp) { return put_user(inode->i_generation, (int __user *)argp); } +/** + * nilfs_ioctl_change_cpmode - change checkpoint mode (checkpoint/snapshot) + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_change_cpmode() function changes mode of + * given checkpoint between checkpoint and snapshot state. This ioctl + * is used in chcp and mkcp utilities. + * + * Return Value: On success, 0 is returned and mode of a checkpoint is + * changed. On error, one of the following negative error codes + * is returned. + * + * %-EPERM - Operation not permitted. + * + * %-EFAULT - Failure during checkpoint mode changing. + */ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) { @@ -198,6 +253,25 @@ out: return ret; } +/** + * nilfs_ioctl_delete_checkpoint - remove checkpoint + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_delete_checkpoint() function removes + * checkpoint from NILFS2 file system. This ioctl is used in rmcp + * utility. + * + * Return Value: On success, 0 is returned and a checkpoint is + * removed. On error, one of the following negative error codes + * is returned. + * + * %-EPERM - Operation not permitted. + * + * %-EFAULT - Failure during checkpoint removing. + */ static int nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) @@ -229,6 +303,21 @@ out: return ret; } +/** + * nilfs_ioctl_do_get_cpinfo - callback method getting info about checkpoints + * @nilfs: nilfs object + * @posp: pointer on array of checkpoint's numbers + * @flags: checkpoint mode (checkpoint or snapshot) + * @buf: buffer for storing checkponts' info + * @size: size in bytes of one checkpoint info item in array + * @nmembs: number of checkpoints in array (numbers and infos) + * + * Description: nilfs_ioctl_do_get_cpinfo() function returns info about + * requested checkpoints. The NILFS_IOCTL_GET_CPINFO ioctl is used in + * lscp utility and by nilfs_cleanerd daemon. + * + * Return value: count of nilfs_cpinfo structures in output buffer. + */ static ssize_t nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, void *buf, size_t size, size_t nmembs) @@ -242,6 +331,27 @@ nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, return ret; } +/** + * nilfs_ioctl_get_cpstat - get checkpoints statistics + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_get_cpstat() returns information about checkpoints. + * The NILFS_IOCTL_GET_CPSTAT ioctl is used by lscp, rmcp utilities + * and by nilfs_cleanerd daemon. + * + * Return Value: On success, 0 is returned, and checkpoints information is + * copied into userspace pointer @argp. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EFAULT - Failure during getting checkpoints statistics. + */ static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) { @@ -260,6 +370,21 @@ static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp, return ret; } +/** + * nilfs_ioctl_do_get_suinfo - callback method getting segment usage info + * @nilfs: nilfs object + * @posp: pointer on array of segment numbers + * @flags: *not used* + * @buf: buffer for storing suinfo array + * @size: size in bytes of one suinfo item in array + * @nmembs: count of segment numbers and suinfos in array + * + * Description: nilfs_ioctl_do_get_suinfo() function returns segment usage + * info about requested segments. The NILFS_IOCTL_GET_SUINFO ioctl is used + * in lssu, nilfs_resize utilities and by nilfs_cleanerd daemon. + * + * Return value: count of nilfs_suinfo structures in output buffer. + */ static ssize_t nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, void *buf, size_t size, size_t nmembs) @@ -273,6 +398,27 @@ nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, return ret; } +/** + * nilfs_ioctl_get_sustat - get segment usage statistics + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_get_sustat() returns segment usage statistics. + * The NILFS_IOCTL_GET_SUSTAT ioctl is used in lssu, nilfs_resize utilities + * and by nilfs_cleanerd daemon. + * + * Return Value: On success, 0 is returned, and segment usage information is + * copied into userspace pointer @argp. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EFAULT - Failure during getting segment usage statistics. + */ static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) { @@ -291,6 +437,21 @@ static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp, return ret; } +/** + * nilfs_ioctl_do_get_vinfo - callback method getting virtual blocks info + * @nilfs: nilfs object + * @posp: *not used* + * @flags: *not used* + * @buf: buffer for storing array of nilfs_vinfo structures + * @size: size in bytes of one vinfo item in array + * @nmembs: count of vinfos in array + * + * Description: nilfs_ioctl_do_get_vinfo() function returns information + * on virtual block addresses. The NILFS_IOCTL_GET_VINFO ioctl is used + * by nilfs_cleanerd daemon. + * + * Return value: count of nilfs_vinfo structures in output buffer. + */ static ssize_t nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, void *buf, size_t size, size_t nmembs) @@ -303,6 +464,21 @@ nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, return ret; } +/** + * nilfs_ioctl_do_get_bdescs - callback method getting disk block descriptors + * @nilfs: nilfs object + * @posp: *not used* + * @flags: *not used* + * @buf: buffer for storing array of nilfs_bdesc structures + * @size: size in bytes of one bdesc item in array + * @nmembs: count of bdescs in array + * + * Description: nilfs_ioctl_do_get_bdescs() function returns information + * about descriptors of disk block numbers. The NILFS_IOCTL_GET_BDESCS ioctl + * is used by nilfs_cleanerd daemon. + * + * Return value: count of nilfs_bdescs structures in output buffer. + */ static ssize_t nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags, void *buf, size_t size, size_t nmembs) @@ -329,6 +505,29 @@ nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags, return nmembs; } +/** + * nilfs_ioctl_get_bdescs - get disk block descriptors + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_do_get_bdescs() function returns information + * about descriptors of disk block numbers. The NILFS_IOCTL_GET_BDESCS ioctl + * is used by nilfs_cleanerd daemon. + * + * Return Value: On success, 0 is returned, and disk block descriptors are + * copied into userspace pointer @argp. On error, one of the following + * negative error codes is returned. + * + * %-EINVAL - Invalid arguments from userspace. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EFAULT - Failure during getting disk block descriptors. + */ static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) { @@ -352,6 +551,26 @@ static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp, return ret; } +/** + * nilfs_ioctl_move_inode_block - prepare data/node block for moving by GC + * @inode: inode object + * @vdesc: descriptor of virtual block number + * @buffers: list of moving buffers + * + * Description: nilfs_ioctl_move_inode_block() function registers data/node + * buffer in the GC pagecache and submit read request. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - Requested block doesn't exist. + * + * %-EEXIST - Blocks conflict is detected. + */ static int nilfs_ioctl_move_inode_block(struct inode *inode, struct nilfs_vdesc *vdesc, struct list_head *buffers) @@ -397,6 +616,19 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode, return 0; } +/** + * nilfs_ioctl_move_blocks - move valid inode's blocks during garbage collection + * @sb: superblock object + * @argv: vector of arguments from userspace + * @buf: array of nilfs_vdesc structures + * + * Description: nilfs_ioctl_move_blocks() function reads valid data/node + * blocks that garbage collector specified with the array of nilfs_vdesc + * structures and stores them into page caches of GC inodes. + * + * Return Value: Number of processed nilfs_vdesc structures or + * error code, otherwise. + */ static int nilfs_ioctl_move_blocks(struct super_block *sb, struct nilfs_argv *argv, void *buf) { @@ -462,6 +694,25 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb, return ret; } +/** + * nilfs_ioctl_delete_checkpoints - delete checkpoints + * @nilfs: nilfs object + * @argv: vector of arguments from userspace + * @buf: array of periods of checkpoints numbers + * + * Description: nilfs_ioctl_delete_checkpoints() function deletes checkpoints + * in the period from p_start to p_end, excluding p_end itself. The checkpoints + * which have been already deleted are ignored. + * + * Return Value: Number of processed nilfs_period structures or + * error code, otherwise. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - invalid checkpoints. + */ static int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs, struct nilfs_argv *argv, void *buf) { @@ -479,6 +730,24 @@ static int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs, return nmembs; } +/** + * nilfs_ioctl_free_vblocknrs - free virtual block numbers + * @nilfs: nilfs object + * @argv: vector of arguments from userspace + * @buf: array of virtual block numbers + * + * Description: nilfs_ioctl_free_vblocknrs() function frees + * the virtual block numbers specified by @buf and @argv->v_nmembs. + * + * Return Value: Number of processed virtual block numbers or + * error code, otherwise. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - The virtual block number have not been allocated. + */ static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs, struct nilfs_argv *argv, void *buf) { @@ -490,6 +759,24 @@ static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs, return (ret < 0) ? ret : nmembs; } +/** + * nilfs_ioctl_mark_blocks_dirty - mark blocks dirty + * @nilfs: nilfs object + * @argv: vector of arguments from userspace + * @buf: array of block descriptors + * + * Description: nilfs_ioctl_mark_blocks_dirty() function marks + * metadata file or data blocks as dirty. + * + * Return Value: Number of processed block descriptors or + * error code, otherwise. + * + * %-ENOMEM - Insufficient memory available. + * + * %-EIO - I/O error + * + * %-ENOENT - the specified block does not exist (hole block) + */ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs, struct nilfs_argv *argv, void *buf) { @@ -571,6 +858,20 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs, return ret; } +/** + * nilfs_ioctl_clean_segments - clean segments + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_clean_segments() function makes garbage + * collection operation in the environment of requested parameters + * from userspace. The NILFS_IOCTL_CLEAN_SEGMENTS ioctl is used by + * nilfs_cleanerd daemon. + * + * Return Value: On success, 0 is returned or error code, otherwise. + */ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) { @@ -682,6 +983,33 @@ out: return ret; } +/** + * nilfs_ioctl_sync - make a checkpoint + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_sync() function constructs a logical segment + * for checkpointing. This function guarantees that all modified data + * and metadata are written out to the device when it successfully + * returned. + * + * Return Value: On success, 0 is retured. On errors, one of the following + * negative error code is returned. + * + * %-EROFS - Read only filesystem. + * + * %-EIO - I/O error + * + * %-ENOSPC - No space left on device (only in a panic state). + * + * %-ERESTARTSYS - Interrupted. + * + * %-ENOMEM - Insufficient memory available. + * + * %-EFAULT - Failure during execution of requested operation. + */ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) { @@ -710,6 +1038,14 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp, return 0; } +/** + * nilfs_ioctl_resize - resize NILFS2 volume + * @inode: inode object + * @filp: file object + * @argp: pointer on argument from userspace + * + * Return Value: On success, 0 is returned or error code, otherwise. + */ static int nilfs_ioctl_resize(struct inode *inode, struct file *filp, void __user *argp) { @@ -735,6 +1071,17 @@ out: return ret; } +/** + * nilfs_ioctl_set_alloc_range - limit range of segments to be allocated + * @inode: inode object + * @argp: pointer on argument from userspace + * + * Decription: nilfs_ioctl_set_alloc_range() function defines lower limit + * of segments in bytes and upper limit of segments in bytes. + * The NILFS_IOCTL_SET_ALLOC_RANGE is used by nilfs_resize utility. + * + * Return Value: On success, 0 is returned or error code, otherwise. + */ static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp) { struct the_nilfs *nilfs = inode->i_sb->s_fs_info; @@ -767,6 +1114,28 @@ out: return ret; } +/** + * nilfs_ioctl_get_info - wrapping function of get metadata info + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * @membsz: size of an item in bytes + * @dofunc: concrete function of getting metadata info + * + * Description: nilfs_ioctl_get_info() gets metadata info by means of + * calling dofunc() function. + * + * Return Value: On success, 0 is returned and requested metadata info + * is copied into userspace. On error, one of the following + * negative error codes is returned. + * + * %-EINVAL - Invalid arguments from userspace. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EFAULT - Failure during execution of requested operation. + */ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp, size_t membsz, diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 2d8be51..dc3a9efd 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -416,7 +416,8 @@ static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start, } if (likely(bio)) { bio->bi_bdev = nilfs->ns_bdev; - bio->bi_sector = start << (nilfs->ns_blocksize_bits - 9); + bio->bi_iter.bi_sector = + start << (nilfs->ns_blocksize_bits - 9); } return bio; } diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 9f6b486..a1a1916 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1440,17 +1440,19 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci, nilfs_clear_logs(&sci->sc_segbufs); - err = nilfs_segctor_extend_segments(sci, nilfs, nadd); - if (unlikely(err)) - return err; - if (sci->sc_stage.flags & NILFS_CF_SUFREED) { err = nilfs_sufile_cancel_freev(nilfs->ns_sufile, sci->sc_freesegs, sci->sc_nfreesegs, NULL); WARN_ON(err); /* do not happen */ + sci->sc_stage.flags &= ~NILFS_CF_SUFREED; } + + err = nilfs_segctor_extend_segments(sci, nilfs, nadd); + if (unlikely(err)) + return err; + nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA); sci->sc_stage = prev_stage; } diff --git a/fs/nls/mac-celtic.c b/fs/nls/mac-celtic.c index 634a8b7..266c2d7d 100644 --- a/fs/nls/mac-celtic.c +++ b/fs/nls/mac-celtic.c @@ -583,7 +583,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_macceltic(void) diff --git a/fs/nls/mac-centeuro.c b/fs/nls/mac-centeuro.c index 979e626..9789c60 100644 --- a/fs/nls/mac-centeuro.c +++ b/fs/nls/mac-centeuro.c @@ -513,7 +513,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_maccenteuro(void) diff --git a/fs/nls/mac-croatian.c b/fs/nls/mac-croatian.c index dd3f675..bb19e7a 100644 --- a/fs/nls/mac-croatian.c +++ b/fs/nls/mac-croatian.c @@ -583,7 +583,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_maccroatian(void) diff --git a/fs/nls/mac-cyrillic.c b/fs/nls/mac-cyrillic.c index 1112c84..2a7dea3 100644 --- a/fs/nls/mac-cyrillic.c +++ b/fs/nls/mac-cyrillic.c @@ -478,7 +478,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_maccyrillic(void) diff --git a/fs/nls/mac-gaelic.c b/fs/nls/mac-gaelic.c index 2de9158..77b0016 100644 --- a/fs/nls/mac-gaelic.c +++ b/fs/nls/mac-gaelic.c @@ -548,7 +548,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_macgaelic(void) diff --git a/fs/nls/mac-greek.c b/fs/nls/mac-greek.c index a863100..1eccf499 100644 --- a/fs/nls/mac-greek.c +++ b/fs/nls/mac-greek.c @@ -478,7 +478,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_macgreek(void) diff --git a/fs/nls/mac-iceland.c b/fs/nls/mac-iceland.c index babe299..cbd0875 100644 --- a/fs/nls/mac-iceland.c +++ b/fs/nls/mac-iceland.c @@ -583,7 +583,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_maciceland(void) diff --git a/fs/nls/mac-inuit.c b/fs/nls/mac-inuit.c index 312364f..fba8357 100644 --- a/fs/nls/mac-inuit.c +++ b/fs/nls/mac-inuit.c @@ -513,7 +513,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_macinuit(void) diff --git a/fs/nls/mac-roman.c b/fs/nls/mac-roman.c index 53ce080..b6a98a5 100644 --- a/fs/nls/mac-roman.c +++ b/fs/nls/mac-roman.c @@ -618,7 +618,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_macroman(void) diff --git a/fs/nls/mac-romanian.c b/fs/nls/mac-romanian.c index add6f7a..25547f0 100644 --- a/fs/nls/mac-romanian.c +++ b/fs/nls/mac-romanian.c @@ -583,7 +583,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_macromanian(void) diff --git a/fs/nls/mac-turkish.c b/fs/nls/mac-turkish.c index dffa96d..b5454bc 100644 --- a/fs/nls/mac-turkish.c +++ b/fs/nls/mac-turkish.c @@ -583,7 +583,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_macturkish(void) diff --git a/fs/nls/nls_ascii.c b/fs/nls/nls_ascii.c index 7020e94..a262065 100644 --- a/fs/nls/nls_ascii.c +++ b/fs/nls/nls_ascii.c @@ -148,7 +148,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_ascii(void) diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c index fea6bd5..52ccd34 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_base.c @@ -232,13 +232,14 @@ int utf16s_to_utf8s(const wchar_t *pwcs, int inlen, enum utf16_endian endian, } EXPORT_SYMBOL(utf16s_to_utf8s); -int register_nls(struct nls_table * nls) +int __register_nls(struct nls_table *nls, struct module *owner) { struct nls_table ** tmp = &tables; if (nls->next) return -EBUSY; + nls->owner = owner; spin_lock(&nls_lock); while (*tmp) { if (nls == *tmp) { @@ -252,6 +253,7 @@ int register_nls(struct nls_table * nls) spin_unlock(&nls_lock); return 0; } +EXPORT_SYMBOL(__register_nls); int unregister_nls(struct nls_table * nls) { @@ -538,7 +540,6 @@ struct nls_table *load_nls_default(void) return &default_table; } -EXPORT_SYMBOL(register_nls); EXPORT_SYMBOL(unregister_nls); EXPORT_SYMBOL(unload_nls); EXPORT_SYMBOL(load_nls); diff --git a/fs/nls/nls_cp1250.c b/fs/nls/nls_cp1250.c index c8471fe..ace3e19 100644 --- a/fs/nls/nls_cp1250.c +++ b/fs/nls/nls_cp1250.c @@ -329,7 +329,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp1250(void) diff --git a/fs/nls/nls_cp1251.c b/fs/nls/nls_cp1251.c index 1939b46..9273ddf 100644 --- a/fs/nls/nls_cp1251.c +++ b/fs/nls/nls_cp1251.c @@ -283,7 +283,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp1251(void) diff --git a/fs/nls/nls_cp1255.c b/fs/nls/nls_cp1255.c index 8120ae2..1caf5df 100644 --- a/fs/nls/nls_cp1255.c +++ b/fs/nls/nls_cp1255.c @@ -365,7 +365,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp1255(void) diff --git a/fs/nls/nls_cp437.c b/fs/nls/nls_cp437.c index ff37a462..7ddb830 100644 --- a/fs/nls/nls_cp437.c +++ b/fs/nls/nls_cp437.c @@ -369,7 +369,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp437(void) diff --git a/fs/nls/nls_cp737.c b/fs/nls/nls_cp737.c index f5576b8..c593f68 100644 --- a/fs/nls/nls_cp737.c +++ b/fs/nls/nls_cp737.c @@ -332,7 +332,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp737(void) diff --git a/fs/nls/nls_cp775.c b/fs/nls/nls_cp775.c index 4905635..554c863 100644 --- a/fs/nls/nls_cp775.c +++ b/fs/nls/nls_cp775.c @@ -301,7 +301,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp775(void) diff --git a/fs/nls/nls_cp850.c b/fs/nls/nls_cp850.c index fe5bdad..56cccd1 100644 --- a/fs/nls/nls_cp850.c +++ b/fs/nls/nls_cp850.c @@ -297,7 +297,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp850(void) diff --git a/fs/nls/nls_cp852.c b/fs/nls/nls_cp852.c index ceb1c01..7cdc05a 100644 --- a/fs/nls/nls_cp852.c +++ b/fs/nls/nls_cp852.c @@ -319,7 +319,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp852(void) diff --git a/fs/nls/nls_cp855.c b/fs/nls/nls_cp855.c index cc7f5fb2..7426eea 100644 --- a/fs/nls/nls_cp855.c +++ b/fs/nls/nls_cp855.c @@ -281,7 +281,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp855(void) diff --git a/fs/nls/nls_cp857.c b/fs/nls/nls_cp857.c index e418e19..0983097 100644 --- a/fs/nls/nls_cp857.c +++ b/fs/nls/nls_cp857.c @@ -283,7 +283,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp857(void) diff --git a/fs/nls/nls_cp860.c b/fs/nls/nls_cp860.c index a86c97d..8422447 100644 --- a/fs/nls/nls_cp860.c +++ b/fs/nls/nls_cp860.c @@ -346,7 +346,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp860(void) diff --git a/fs/nls/nls_cp861.c b/fs/nls/nls_cp861.c index bd92022..dc873e4 100644 --- a/fs/nls/nls_cp861.c +++ b/fs/nls/nls_cp861.c @@ -369,7 +369,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp861(void) diff --git a/fs/nls/nls_cp862.c b/fs/nls/nls_cp862.c index e9b68eb..d5263e3 100644 --- a/fs/nls/nls_cp862.c +++ b/fs/nls/nls_cp862.c @@ -403,7 +403,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp862(void) diff --git a/fs/nls/nls_cp863.c b/fs/nls/nls_cp863.c index f8a9b07..051c983 100644 --- a/fs/nls/nls_cp863.c +++ b/fs/nls/nls_cp863.c @@ -363,7 +363,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp863(void) diff --git a/fs/nls/nls_cp864.c b/fs/nls/nls_cp864.c index 8d31f43..97eb127 100644 --- a/fs/nls/nls_cp864.c +++ b/fs/nls/nls_cp864.c @@ -389,7 +389,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp864(void) diff --git a/fs/nls/nls_cp865.c b/fs/nls/nls_cp865.c index 4bd902fe..1112142 100644 --- a/fs/nls/nls_cp865.c +++ b/fs/nls/nls_cp865.c @@ -369,7 +369,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp865(void) diff --git a/fs/nls/nls_cp866.c b/fs/nls/nls_cp866.c index bdc7cb3..ffdcbc3 100644 --- a/fs/nls/nls_cp866.c +++ b/fs/nls/nls_cp866.c @@ -287,7 +287,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp866(void) diff --git a/fs/nls/nls_cp869.c b/fs/nls/nls_cp869.c index 9f283a2..3b5a345 100644 --- a/fs/nls/nls_cp869.c +++ b/fs/nls/nls_cp869.c @@ -297,7 +297,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp869(void) diff --git a/fs/nls/nls_cp874.c b/fs/nls/nls_cp874.c index 0b3c488..8dfaa10 100644 --- a/fs/nls/nls_cp874.c +++ b/fs/nls/nls_cp874.c @@ -256,7 +256,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp874(void) diff --git a/fs/nls/nls_cp932.c b/fs/nls/nls_cp932.c index 0ffed6f..67b7398 100644 --- a/fs/nls/nls_cp932.c +++ b/fs/nls/nls_cp932.c @@ -7914,7 +7914,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp932(void) diff --git a/fs/nls/nls_cp936.c b/fs/nls/nls_cp936.c index 8277030..c96546c 100644 --- a/fs/nls/nls_cp936.c +++ b/fs/nls/nls_cp936.c @@ -11092,7 +11092,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp936(void) diff --git a/fs/nls/nls_cp949.c b/fs/nls/nls_cp949.c index 8a7a2fe..199171e 100644 --- a/fs/nls/nls_cp949.c +++ b/fs/nls/nls_cp949.c @@ -13927,7 +13927,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp949(void) diff --git a/fs/nls/nls_cp950.c b/fs/nls/nls_cp950.c index ef25368..8e14187 100644 --- a/fs/nls/nls_cp950.c +++ b/fs/nls/nls_cp950.c @@ -9463,7 +9463,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp950(void) diff --git a/fs/nls/nls_euc-jp.c b/fs/nls/nls_euc-jp.c index 7424929..162b3f1 100644 --- a/fs/nls/nls_euc-jp.c +++ b/fs/nls/nls_euc-jp.c @@ -553,7 +553,6 @@ static struct nls_table table = { .charset = "euc-jp", .uni2char = uni2char, .char2uni = char2uni, - .owner = THIS_MODULE, }; static int __init init_nls_euc_jp(void) diff --git a/fs/nls/nls_iso8859-1.c b/fs/nls/nls_iso8859-1.c index 7b951bb..69ac020 100644 --- a/fs/nls/nls_iso8859-1.c +++ b/fs/nls/nls_iso8859-1.c @@ -239,7 +239,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_1(void) diff --git a/fs/nls/nls_iso8859-13.c b/fs/nls/nls_iso8859-13.c index c4d52ea..afb3f8f 100644 --- a/fs/nls/nls_iso8859-13.c +++ b/fs/nls/nls_iso8859-13.c @@ -267,7 +267,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_13(void) diff --git a/fs/nls/nls_iso8859-14.c b/fs/nls/nls_iso8859-14.c index dc02600..046370f 100644 --- a/fs/nls/nls_iso8859-14.c +++ b/fs/nls/nls_iso8859-14.c @@ -323,7 +323,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_14(void) diff --git a/fs/nls/nls_iso8859-15.c b/fs/nls/nls_iso8859-15.c index 3c7dfc8..7e34a84 100644 --- a/fs/nls/nls_iso8859-15.c +++ b/fs/nls/nls_iso8859-15.c @@ -289,7 +289,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_15(void) diff --git a/fs/nls/nls_iso8859-2.c b/fs/nls/nls_iso8859-2.c index a2d2197..7dd5711 100644 --- a/fs/nls/nls_iso8859-2.c +++ b/fs/nls/nls_iso8859-2.c @@ -290,7 +290,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_2(void) diff --git a/fs/nls/nls_iso8859-3.c b/fs/nls/nls_iso8859-3.c index a61e0da..740b75e 100644 --- a/fs/nls/nls_iso8859-3.c +++ b/fs/nls/nls_iso8859-3.c @@ -290,7 +290,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_3(void) diff --git a/fs/nls/nls_iso8859-4.c b/fs/nls/nls_iso8859-4.c index e8ff555..8826021 100644 --- a/fs/nls/nls_iso8859-4.c +++ b/fs/nls/nls_iso8859-4.c @@ -290,7 +290,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_4(void) diff --git a/fs/nls/nls_iso8859-5.c b/fs/nls/nls_iso8859-5.c index 4721e89..7c04057 100644 --- a/fs/nls/nls_iso8859-5.c +++ b/fs/nls/nls_iso8859-5.c @@ -254,7 +254,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_5(void) diff --git a/fs/nls/nls_iso8859-6.c b/fs/nls/nls_iso8859-6.c index 01a517d..d4a8814 100644 --- a/fs/nls/nls_iso8859-6.c +++ b/fs/nls/nls_iso8859-6.c @@ -245,7 +245,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_6(void) diff --git a/fs/nls/nls_iso8859-7.c b/fs/nls/nls_iso8859-7.c index 2d27b93..37b75d8 100644 --- a/fs/nls/nls_iso8859-7.c +++ b/fs/nls/nls_iso8859-7.c @@ -299,7 +299,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_7(void) diff --git a/fs/nls/nls_iso8859-9.c b/fs/nls/nls_iso8859-9.c index 694bf07..557b982 100644 --- a/fs/nls/nls_iso8859-9.c +++ b/fs/nls/nls_iso8859-9.c @@ -254,7 +254,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_9(void) diff --git a/fs/nls/nls_koi8-r.c b/fs/nls/nls_koi8-r.c index 4387531..811f232 100644 --- a/fs/nls/nls_koi8-r.c +++ b/fs/nls/nls_koi8-r.c @@ -305,7 +305,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_koi8_r(void) diff --git a/fs/nls/nls_koi8-ru.c b/fs/nls/nls_koi8-ru.c index e7bc1d7..a80a741 100644 --- a/fs/nls/nls_koi8-ru.c +++ b/fs/nls/nls_koi8-ru.c @@ -55,7 +55,6 @@ static struct nls_table table = { .charset = "koi8-ru", .uni2char = uni2char, .char2uni = char2uni, - .owner = THIS_MODULE, }; static int __init init_nls_koi8_ru(void) diff --git a/fs/nls/nls_koi8-u.c b/fs/nls/nls_koi8-u.c index 8c9f029..7e029e4 100644 --- a/fs/nls/nls_koi8-u.c +++ b/fs/nls/nls_koi8-u.c @@ -312,7 +312,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_koi8_u(void) diff --git a/fs/nls/nls_utf8.c b/fs/nls/nls_utf8.c index 0d60a44..afcfbc4 100644 --- a/fs/nls/nls_utf8.c +++ b/fs/nls/nls_utf8.c @@ -46,7 +46,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = identity, /* no conversion */ .charset2upper = identity, - .owner = THIS_MODULE, }; static int __init init_nls_utf8(void) diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 1fedd5f..0b9ff43 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -82,20 +82,23 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) * events. */ static int dnotify_handle_event(struct fsnotify_group *group, + struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - struct fsnotify_event *event) + u32 mask, void *data, int data_type, + const unsigned char *file_name) { struct dnotify_mark *dn_mark; - struct inode *to_tell; struct dnotify_struct *dn; struct dnotify_struct **prev; struct fown_struct *fown; - __u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD; + __u32 test_mask = mask & ~FS_EVENT_ON_CHILD; - BUG_ON(vfsmount_mark); + /* not a dir, dnotify doesn't care */ + if (!S_ISDIR(inode->i_mode)) + return 0; - to_tell = event->to_tell; + BUG_ON(vfsmount_mark); dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark); @@ -122,23 +125,6 @@ static int dnotify_handle_event(struct fsnotify_group *group, return 0; } -/* - * Given an inode and mask determine if dnotify would be interested in sending - * userspace notification for that pair. - */ -static bool dnotify_should_send_event(struct fsnotify_group *group, - struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type) -{ - /* not a dir, dnotify doesn't care */ - if (!S_ISDIR(inode->i_mode)) - return false; - - return true; -} - static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) { struct dnotify_mark *dn_mark = container_of(fsn_mark, @@ -152,10 +138,6 @@ static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) static struct fsnotify_ops dnotify_fsnotify_ops = { .handle_event = dnotify_handle_event, - .should_send_event = dnotify_should_send_event, - .free_group_priv = NULL, - .freeing_mark = NULL, - .free_event_priv = NULL, }; /* diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 0c2f912..0e792f5 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -9,91 +9,59 @@ #include <linux/types.h> #include <linux/wait.h> -static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new) +#include "fanotify.h" + +static bool should_merge(struct fsnotify_event *old_fsn, + struct fsnotify_event *new_fsn) { - pr_debug("%s: old=%p new=%p\n", __func__, old, new); + struct fanotify_event_info *old, *new; - if (old->to_tell == new->to_tell && - old->data_type == new->data_type && - old->tgid == new->tgid) { - switch (old->data_type) { - case (FSNOTIFY_EVENT_PATH): -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - /* dont merge two permission events */ - if ((old->mask & FAN_ALL_PERM_EVENTS) && - (new->mask & FAN_ALL_PERM_EVENTS)) - return false; -#endif - if ((old->path.mnt == new->path.mnt) && - (old->path.dentry == new->path.dentry)) - return true; - break; - case (FSNOTIFY_EVENT_NONE): - return true; - default: - BUG(); - }; - } + pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn); + old = FANOTIFY_E(old_fsn); + new = FANOTIFY_E(new_fsn); + + if (old_fsn->inode == new_fsn->inode && old->tgid == new->tgid && + old->path.mnt == new->path.mnt && + old->path.dentry == new->path.dentry) + return true; return false; } /* and the list better be locked by something too! */ -static struct fsnotify_event *fanotify_merge(struct list_head *list, - struct fsnotify_event *event) +static int fanotify_merge(struct list_head *list, struct fsnotify_event *event) { - struct fsnotify_event_holder *test_holder; - struct fsnotify_event *test_event = NULL; - struct fsnotify_event *new_event; + struct fsnotify_event *test_event; + bool do_merge = false; pr_debug("%s: list=%p event=%p\n", __func__, list, event); +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + /* + * Don't merge a permission event with any other event so that we know + * the event structure we have created in fanotify_handle_event() is the + * one we should check for permission response. + */ + if (event->mask & FAN_ALL_PERM_EVENTS) + return 0; +#endif - list_for_each_entry_reverse(test_holder, list, event_list) { - if (should_merge(test_holder->event, event)) { - test_event = test_holder->event; + list_for_each_entry_reverse(test_event, list, list) { + if (should_merge(test_event, event)) { + do_merge = true; break; } } - if (!test_event) - return NULL; - - fsnotify_get_event(test_event); - - /* if they are exactly the same we are done */ - if (test_event->mask == event->mask) - return test_event; - - /* - * if the refcnt == 2 this is the only queue - * for this event and so we can update the mask - * in place. - */ - if (atomic_read(&test_event->refcnt) == 2) { - test_event->mask |= event->mask; - return test_event; - } - - new_event = fsnotify_clone_event(test_event); - - /* done with test_event */ - fsnotify_put_event(test_event); - - /* couldn't allocate memory, merge was not possible */ - if (unlikely(!new_event)) - return ERR_PTR(-ENOMEM); - - /* build new event and replace it on the list */ - new_event->mask = (test_event->mask | event->mask); - fsnotify_replace_event(test_holder, new_event); + if (!do_merge) + return 0; - /* we hold a reference on new_event from clone_event */ - return new_event; + test_event->mask |= event->mask; + return 1; } #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS static int fanotify_get_response_from_access(struct fsnotify_group *group, - struct fsnotify_event *event) + struct fanotify_event_info *event) { int ret; @@ -106,7 +74,6 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group, return 0; /* userspace responded, convert to something usable */ - spin_lock(&event->lock); switch (event->response) { case FAN_ALLOW: ret = 0; @@ -116,7 +83,6 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group, ret = -EPERM; } event->response = 0; - spin_unlock(&event->lock); pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__, group, event, ret); @@ -125,58 +91,17 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group, } #endif -static int fanotify_handle_event(struct fsnotify_group *group, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *fanotify_mark, - struct fsnotify_event *event) -{ - int ret = 0; - struct fsnotify_event *notify_event = NULL; - - BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); - BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); - BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); - BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE); - BUILD_BUG_ON(FAN_OPEN != FS_OPEN); - BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD); - BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW); - BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM); - BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM); - BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR); - - pr_debug("%s: group=%p event=%p\n", __func__, group, event); - - notify_event = fsnotify_add_notify_event(group, event, NULL, fanotify_merge); - if (IS_ERR(notify_event)) - return PTR_ERR(notify_event); - -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - if (event->mask & FAN_ALL_PERM_EVENTS) { - /* if we merged we need to wait on the new event */ - if (notify_event) - event = notify_event; - ret = fanotify_get_response_from_access(group, event); - } -#endif - - if (notify_event) - fsnotify_put_event(notify_event); - - return ret; -} - -static bool fanotify_should_send_event(struct fsnotify_group *group, - struct inode *to_tell, - struct fsnotify_mark *inode_mark, +static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmnt_mark, - __u32 event_mask, void *data, int data_type) + u32 event_mask, + void *data, int data_type) { __u32 marks_mask, marks_ignored_mask; struct path *path = data; - pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p " - "mask=%x data=%p data_type=%d\n", __func__, group, to_tell, - inode_mark, vfsmnt_mark, event_mask, data, data_type); + pr_debug("%s: inode_mark=%p vfsmnt_mark=%p mask=%x data=%p" + " data_type=%d\n", __func__, inode_mark, vfsmnt_mark, + event_mask, data, data_type); /* if we don't have enough info to send an event to userspace say no */ if (data_type != FSNOTIFY_EVENT_PATH) @@ -217,6 +142,71 @@ static bool fanotify_should_send_event(struct fsnotify_group *group, return false; } +static int fanotify_handle_event(struct fsnotify_group *group, + struct inode *inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *fanotify_mark, + u32 mask, void *data, int data_type, + const unsigned char *file_name) +{ + int ret = 0; + struct fanotify_event_info *event; + struct fsnotify_event *fsn_event; + + BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); + BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); + BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); + BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE); + BUILD_BUG_ON(FAN_OPEN != FS_OPEN); + BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD); + BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW); + BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM); + BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM); + BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR); + + if (!fanotify_should_send_event(inode_mark, fanotify_mark, mask, data, + data_type)) + return 0; + + pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode, + mask); + + event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL); + if (unlikely(!event)) + return -ENOMEM; + + fsn_event = &event->fse; + fsnotify_init_event(fsn_event, inode, mask); + event->tgid = get_pid(task_tgid(current)); + if (data_type == FSNOTIFY_EVENT_PATH) { + struct path *path = data; + event->path = *path; + path_get(&event->path); + } else { + event->path.mnt = NULL; + event->path.dentry = NULL; + } +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + event->response = 0; +#endif + + ret = fsnotify_add_notify_event(group, fsn_event, fanotify_merge); + if (ret) { + BUG_ON(mask & FAN_ALL_PERM_EVENTS); + /* Our event wasn't used in the end. Free it. */ + fsnotify_destroy_event(group, fsn_event); + ret = 0; + } + +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + if (mask & FAN_ALL_PERM_EVENTS) { + ret = fanotify_get_response_from_access(group, event); + fsnotify_destroy_event(group, fsn_event); + } +#endif + return ret; +} + static void fanotify_free_group_priv(struct fsnotify_group *group) { struct user_struct *user; @@ -226,10 +216,18 @@ static void fanotify_free_group_priv(struct fsnotify_group *group) free_uid(user); } +static void fanotify_free_event(struct fsnotify_event *fsn_event) +{ + struct fanotify_event_info *event; + + event = FANOTIFY_E(fsn_event); + path_put(&event->path); + put_pid(event->tgid); + kmem_cache_free(fanotify_event_cachep, event); +} + const struct fsnotify_ops fanotify_fsnotify_ops = { .handle_event = fanotify_handle_event, - .should_send_event = fanotify_should_send_event, .free_group_priv = fanotify_free_group_priv, - .free_event_priv = NULL, - .freeing_mark = NULL, + .free_event = fanotify_free_event, }; diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h new file mode 100644 index 0000000..32a2f03 --- /dev/null +++ b/fs/notify/fanotify/fanotify.h @@ -0,0 +1,30 @@ +#include <linux/fsnotify_backend.h> +#include <linux/path.h> +#include <linux/slab.h> + +extern struct kmem_cache *fanotify_event_cachep; + +/* + * Lifetime of the structure differs for normal and permission events. In both + * cases the structure is allocated in fanotify_handle_event(). For normal + * events the structure is freed immediately after reporting it to userspace. + * For permission events we free it only after we receive response from + * userspace. + */ +struct fanotify_event_info { + struct fsnotify_event fse; + /* + * We hold ref to this path so it may be dereferenced at any point + * during this object's lifetime + */ + struct path path; + struct pid *tgid; +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + u32 response; /* userspace answer to question */ +#endif +}; + +static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse) +{ + return container_of(fse, struct fanotify_event_info, fse); +} diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index e44cb64..b6175fa 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -19,6 +19,7 @@ #include "../../mount.h" #include "../fdinfo.h" +#include "fanotify.h" #define FANOTIFY_DEFAULT_MAX_EVENTS 16384 #define FANOTIFY_DEFAULT_MAX_MARKS 8192 @@ -28,11 +29,12 @@ extern const struct fsnotify_ops fanotify_fsnotify_ops; static struct kmem_cache *fanotify_mark_cache __read_mostly; static struct kmem_cache *fanotify_response_event_cache __read_mostly; +struct kmem_cache *fanotify_event_cachep __read_mostly; struct fanotify_response_event { struct list_head list; __s32 fd; - struct fsnotify_event *event; + struct fanotify_event_info *event; }; /* @@ -61,8 +63,8 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, } static int create_fd(struct fsnotify_group *group, - struct fsnotify_event *event, - struct file **file) + struct fanotify_event_info *event, + struct file **file) { int client_fd; struct file *new_file; @@ -73,12 +75,6 @@ static int create_fd(struct fsnotify_group *group, if (client_fd < 0) return client_fd; - if (event->data_type != FSNOTIFY_EVENT_PATH) { - WARN_ON(1); - put_unused_fd(client_fd); - return -EINVAL; - } - /* * we need a new file handle for the userspace program so it can read even if it was * originally opened O_WRONLY. @@ -109,23 +105,25 @@ static int create_fd(struct fsnotify_group *group, } static int fill_event_metadata(struct fsnotify_group *group, - struct fanotify_event_metadata *metadata, - struct fsnotify_event *event, - struct file **file) + struct fanotify_event_metadata *metadata, + struct fsnotify_event *fsn_event, + struct file **file) { int ret = 0; + struct fanotify_event_info *event; pr_debug("%s: group=%p metadata=%p event=%p\n", __func__, - group, metadata, event); + group, metadata, fsn_event); *file = NULL; + event = container_of(fsn_event, struct fanotify_event_info, fse); metadata->event_len = FAN_EVENT_METADATA_LEN; metadata->metadata_len = FAN_EVENT_METADATA_LEN; metadata->vers = FANOTIFY_METADATA_VERSION; metadata->reserved = 0; - metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS; + metadata->mask = fsn_event->mask & FAN_ALL_OUTGOING_EVENTS; metadata->pid = pid_vnr(event->tgid); - if (unlikely(event->mask & FAN_Q_OVERFLOW)) + if (unlikely(fsn_event->mask & FAN_Q_OVERFLOW)) metadata->fd = FAN_NOFD; else { metadata->fd = create_fd(group, event, file); @@ -209,7 +207,7 @@ static int prepare_for_access_response(struct fsnotify_group *group, if (!re) return -ENOMEM; - re->event = event; + re->event = FANOTIFY_E(event); re->fd = fd; mutex_lock(&group->fanotify_data.access_mutex); @@ -217,7 +215,7 @@ static int prepare_for_access_response(struct fsnotify_group *group, if (atomic_read(&group->fanotify_data.bypass_perm)) { mutex_unlock(&group->fanotify_data.access_mutex); kmem_cache_free(fanotify_response_event_cache, re); - event->response = FAN_ALLOW; + FANOTIFY_E(event)->response = FAN_ALLOW; return 0; } @@ -273,7 +271,7 @@ out_close_fd: out: #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS if (event->mask & FAN_ALL_PERM_EVENTS) { - event->response = FAN_DENY; + FANOTIFY_E(event)->response = FAN_DENY; wake_up(&group->fanotify_data.access_waitq); } #endif @@ -321,7 +319,12 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, if (IS_ERR(kevent)) break; ret = copy_event_to_user(group, kevent, buf); - fsnotify_put_event(kevent); + /* + * Permission events get destroyed after we + * receive response + */ + if (!(kevent->mask & FAN_ALL_PERM_EVENTS)) + fsnotify_destroy_event(group, kevent); if (ret < 0) break; buf += ret; @@ -409,7 +412,7 @@ static int fanotify_release(struct inode *ignored, struct file *file) static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fsnotify_group *group; - struct fsnotify_event_holder *holder; + struct fsnotify_event *fsn_event; void __user *p; int ret = -ENOTTY; size_t send_len = 0; @@ -421,7 +424,7 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar switch (cmd) { case FIONREAD: mutex_lock(&group->notification_mutex); - list_for_each_entry(holder, &group->notification_list, event_list) + list_for_each_entry(fsn_event, &group->notification_list, list) send_len += FAN_EVENT_METADATA_LEN; mutex_unlock(&group->notification_mutex); ret = put_user(send_len, (int __user *) p); @@ -888,9 +891,9 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark, { return sys_fanotify_mark(fanotify_fd, flags, #ifdef __BIG_ENDIAN - ((__u64)mask1 << 32) | mask0, -#else ((__u64)mask0 << 32) | mask1, +#else + ((__u64)mask1 << 32) | mask0, #endif dfd, pathname); } @@ -906,6 +909,7 @@ static int __init fanotify_user_setup(void) fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC); fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event, SLAB_PANIC); + fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC); return 0; } diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 4bb21d6..1d4e1ea 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -128,8 +128,7 @@ static int send_to_group(struct inode *to_tell, struct fsnotify_mark *vfsmount_mark, __u32 mask, void *data, int data_is, u32 cookie, - const unsigned char *file_name, - struct fsnotify_event **event) + const unsigned char *file_name) { struct fsnotify_group *group = NULL; __u32 inode_test_mask = 0; @@ -170,27 +169,17 @@ static int send_to_group(struct inode *to_tell, pr_debug("%s: group=%p to_tell=%p mask=%x inode_mark=%p" " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x" - " data=%p data_is=%d cookie=%d event=%p\n", + " data=%p data_is=%d cookie=%d\n", __func__, group, to_tell, mask, inode_mark, inode_test_mask, vfsmount_mark, vfsmount_test_mask, data, - data_is, cookie, *event); + data_is, cookie); if (!inode_test_mask && !vfsmount_test_mask) return 0; - if (group->ops->should_send_event(group, to_tell, inode_mark, - vfsmount_mark, mask, data, - data_is) == false) - return 0; - - if (!*event) { - *event = fsnotify_create_event(to_tell, mask, data, - data_is, file_name, - cookie, GFP_KERNEL); - if (!*event) - return -ENOMEM; - } - return group->ops->handle_event(group, inode_mark, vfsmount_mark, *event); + return group->ops->handle_event(group, to_tell, inode_mark, + vfsmount_mark, mask, data, data_is, + file_name); } /* @@ -205,7 +194,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, struct hlist_node *inode_node = NULL, *vfsmount_node = NULL; struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL; struct fsnotify_group *inode_group, *vfsmount_group; - struct fsnotify_event *event = NULL; struct mount *mnt; int idx, ret = 0; /* global tests shouldn't care about events on child only the specific event */ @@ -258,18 +246,18 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, if (inode_group > vfsmount_group) { /* handle inode */ - ret = send_to_group(to_tell, inode_mark, NULL, mask, data, - data_is, cookie, file_name, &event); + ret = send_to_group(to_tell, inode_mark, NULL, mask, + data, data_is, cookie, file_name); /* we didn't use the vfsmount_mark */ vfsmount_group = NULL; } else if (vfsmount_group > inode_group) { - ret = send_to_group(to_tell, NULL, vfsmount_mark, mask, data, - data_is, cookie, file_name, &event); + ret = send_to_group(to_tell, NULL, vfsmount_mark, mask, + data, data_is, cookie, file_name); inode_group = NULL; } else { ret = send_to_group(to_tell, inode_mark, vfsmount_mark, - mask, data, data_is, cookie, file_name, - &event); + mask, data, data_is, cookie, + file_name); } if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) @@ -285,12 +273,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, ret = 0; out: srcu_read_unlock(&fsnotify_mark_srcu, idx); - /* - * fsnotify_create_event() took a reference so the event can't be cleaned - * up while we are still trying to add it to lists, drop that one. - */ - if (event) - fsnotify_put_event(event); return ret; } diff --git a/fs/notify/group.c b/fs/notify/group.c index bd2625b..ee674fe 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -99,6 +99,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) INIT_LIST_HEAD(&group->marks_list); group->ops = ops; + fsnotify_init_event(&group->overflow_event, NULL, FS_Q_OVERFLOW); return group; } diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index b6642e4..485eef3 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -2,11 +2,12 @@ #include <linux/inotify.h> #include <linux/slab.h> /* struct kmem_cache */ -extern struct kmem_cache *event_priv_cachep; - -struct inotify_event_private_data { - struct fsnotify_event_private_data fsnotify_event_priv_data; +struct inotify_event_info { + struct fsnotify_event fse; int wd; + u32 sync_cookie; + int name_len; + char name[]; }; struct inotify_inode_mark { @@ -14,8 +15,18 @@ struct inotify_inode_mark { int wd; }; +static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse) +{ + return container_of(fse, struct inotify_event_info, fse); +} + extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group); -extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv); +extern int inotify_handle_event(struct fsnotify_group *group, + struct inode *inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + u32 mask, void *data, int data_type, + const unsigned char *file_name); extern const struct fsnotify_ops inotify_fsnotify_ops; diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 4216308..d5ee563 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -34,107 +34,89 @@ #include "inotify.h" /* - * Check if 2 events contain the same information. We do not compare private data - * but at this moment that isn't a problem for any know fsnotify listeners. + * Check if 2 events contain the same information. */ -static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new) +static bool event_compare(struct fsnotify_event *old_fsn, + struct fsnotify_event *new_fsn) { - if ((old->mask == new->mask) && - (old->to_tell == new->to_tell) && - (old->data_type == new->data_type) && - (old->name_len == new->name_len)) { - switch (old->data_type) { - case (FSNOTIFY_EVENT_INODE): - /* remember, after old was put on the wait_q we aren't - * allowed to look at the inode any more, only thing - * left to check was if the file_name is the same */ - if (!old->name_len || - !strcmp(old->file_name, new->file_name)) - return true; - break; - case (FSNOTIFY_EVENT_PATH): - if ((old->path.mnt == new->path.mnt) && - (old->path.dentry == new->path.dentry)) - return true; - break; - case (FSNOTIFY_EVENT_NONE): - if (old->mask & FS_Q_OVERFLOW) - return true; - else if (old->mask & FS_IN_IGNORED) - return false; - return true; - }; - } + struct inotify_event_info *old, *new; + + if (old_fsn->mask & FS_IN_IGNORED) + return false; + old = INOTIFY_E(old_fsn); + new = INOTIFY_E(new_fsn); + if ((old_fsn->mask == new_fsn->mask) && + (old_fsn->inode == new_fsn->inode) && + (old->name_len == new->name_len) && + (!old->name_len || !strcmp(old->name, new->name))) + return true; return false; } -static struct fsnotify_event *inotify_merge(struct list_head *list, - struct fsnotify_event *event) +static int inotify_merge(struct list_head *list, + struct fsnotify_event *event) { - struct fsnotify_event_holder *last_holder; struct fsnotify_event *last_event; - /* and the list better be locked by something too */ - spin_lock(&event->lock); - - last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list); - last_event = last_holder->event; - if (event_compare(last_event, event)) - fsnotify_get_event(last_event); - else - last_event = NULL; - - spin_unlock(&event->lock); - - return last_event; + last_event = list_entry(list->prev, struct fsnotify_event, list); + return event_compare(last_event, event); } -static int inotify_handle_event(struct fsnotify_group *group, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - struct fsnotify_event *event) +int inotify_handle_event(struct fsnotify_group *group, + struct inode *inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + u32 mask, void *data, int data_type, + const unsigned char *file_name) { struct inotify_inode_mark *i_mark; - struct inode *to_tell; - struct inotify_event_private_data *event_priv; - struct fsnotify_event_private_data *fsn_event_priv; - struct fsnotify_event *added_event; - int wd, ret = 0; + struct inotify_event_info *event; + struct fsnotify_event *fsn_event; + int ret; + int len = 0; + int alloc_len = sizeof(struct inotify_event_info); BUG_ON(vfsmount_mark); - pr_debug("%s: group=%p event=%p to_tell=%p mask=%x\n", __func__, group, - event, event->to_tell, event->mask); + if ((inode_mark->mask & FS_EXCL_UNLINK) && + (data_type == FSNOTIFY_EVENT_PATH)) { + struct path *path = data; + + if (d_unlinked(path->dentry)) + return 0; + } + if (file_name) { + len = strlen(file_name); + alloc_len += len + 1; + } - to_tell = event->to_tell; + pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode, + mask); i_mark = container_of(inode_mark, struct inotify_inode_mark, fsn_mark); - wd = i_mark->wd; - event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); - if (unlikely(!event_priv)) + event = kmalloc(alloc_len, GFP_KERNEL); + if (unlikely(!event)) return -ENOMEM; - fsn_event_priv = &event_priv->fsnotify_event_priv_data; - - fsnotify_get_group(group); - fsn_event_priv->group = group; - event_priv->wd = wd; - - added_event = fsnotify_add_notify_event(group, event, fsn_event_priv, inotify_merge); - if (added_event) { - inotify_free_event_priv(fsn_event_priv); - if (!IS_ERR(added_event)) - fsnotify_put_event(added_event); - else - ret = PTR_ERR(added_event); + fsn_event = &event->fse; + fsnotify_init_event(fsn_event, inode, mask); + event->wd = i_mark->wd; + event->name_len = len; + if (len) + strcpy(event->name, file_name); + + ret = fsnotify_add_notify_event(group, fsn_event, inotify_merge); + if (ret) { + /* Our event wasn't used in the end. Free it. */ + fsnotify_destroy_event(group, fsn_event); } if (inode_mark->mask & IN_ONESHOT) fsnotify_destroy_mark(inode_mark, group); - return ret; + return 0; } static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group) @@ -142,22 +124,6 @@ static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify inotify_ignored_and_remove_idr(fsn_mark, group); } -static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type) -{ - if ((inode_mark->mask & FS_EXCL_UNLINK) && - (data_type == FSNOTIFY_EVENT_PATH)) { - struct path *path = data; - - if (d_unlinked(path->dentry)) - return false; - } - - return true; -} - /* * This is NEVER supposed to be called. Inotify marks should either have been * removed from the idr when the watch was removed or in the @@ -202,22 +168,14 @@ static void inotify_free_group_priv(struct fsnotify_group *group) free_uid(group->inotify_data.user); } -void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv) +static void inotify_free_event(struct fsnotify_event *fsn_event) { - struct inotify_event_private_data *event_priv; - - - event_priv = container_of(fsn_event_priv, struct inotify_event_private_data, - fsnotify_event_priv_data); - - fsnotify_put_group(fsn_event_priv->group); - kmem_cache_free(event_priv_cachep, event_priv); + kfree(INOTIFY_E(fsn_event)); } const struct fsnotify_ops inotify_fsnotify_ops = { .handle_event = inotify_handle_event, - .should_send_event = inotify_should_send_event, .free_group_priv = inotify_free_group_priv, - .free_event_priv = inotify_free_event_priv, + .free_event = inotify_free_event, .freeing_mark = inotify_freeing_mark, }; diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 60f954a..497395c 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -50,7 +50,6 @@ static int inotify_max_queued_events __read_mostly; static int inotify_max_user_watches __read_mostly; static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; -struct kmem_cache *event_priv_cachep __read_mostly; #ifdef CONFIG_SYSCTL @@ -124,6 +123,16 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait) return ret; } +static int round_event_name_len(struct fsnotify_event *fsn_event) +{ + struct inotify_event_info *event; + + event = INOTIFY_E(fsn_event); + if (!event->name_len) + return 0; + return roundup(event->name_len + 1, sizeof(struct inotify_event)); +} + /* * Get an inotify_kernel_event if one exists and is small * enough to fit in "count". Return an error pointer if @@ -144,9 +153,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, pr_debug("%s: group=%p event=%p\n", __func__, group, event); - if (event->name_len) - event_size += roundup(event->name_len + 1, event_size); - + event_size += round_event_name_len(event); if (event_size > count) return ERR_PTR(-EINVAL); @@ -164,40 +171,27 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, * buffer we had in "get_one_event()" above. */ static ssize_t copy_event_to_user(struct fsnotify_group *group, - struct fsnotify_event *event, + struct fsnotify_event *fsn_event, char __user *buf) { struct inotify_event inotify_event; - struct fsnotify_event_private_data *fsn_priv; - struct inotify_event_private_data *priv; + struct inotify_event_info *event; size_t event_size = sizeof(struct inotify_event); - size_t name_len = 0; - - pr_debug("%s: group=%p event=%p\n", __func__, group, event); + size_t name_len; + size_t pad_name_len; - /* we get the inotify watch descriptor from the event private data */ - spin_lock(&event->lock); - fsn_priv = fsnotify_remove_priv_from_event(group, event); - spin_unlock(&event->lock); - - if (!fsn_priv) - inotify_event.wd = -1; - else { - priv = container_of(fsn_priv, struct inotify_event_private_data, - fsnotify_event_priv_data); - inotify_event.wd = priv->wd; - inotify_free_event_priv(fsn_priv); - } + pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event); + event = INOTIFY_E(fsn_event); + name_len = event->name_len; /* - * round up event->name_len so it is a multiple of event_size + * round up name length so it is a multiple of event_size * plus an extra byte for the terminating '\0'. */ - if (event->name_len) - name_len = roundup(event->name_len + 1, event_size); - inotify_event.len = name_len; - - inotify_event.mask = inotify_mask_to_arg(event->mask); + pad_name_len = round_event_name_len(fsn_event); + inotify_event.len = pad_name_len; + inotify_event.mask = inotify_mask_to_arg(fsn_event->mask); + inotify_event.wd = event->wd; inotify_event.cookie = event->sync_cookie; /* send the main event */ @@ -209,20 +203,18 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, /* * fsnotify only stores the pathname, so here we have to send the pathname * and then pad that pathname out to a multiple of sizeof(inotify_event) - * with zeros. I get my zeros from the nul_inotify_event. + * with zeros. */ - if (name_len) { - unsigned int len_to_zero = name_len - event->name_len; + if (pad_name_len) { /* copy the path name */ - if (copy_to_user(buf, event->file_name, event->name_len)) + if (copy_to_user(buf, event->name, name_len)) return -EFAULT; - buf += event->name_len; + buf += name_len; /* fill userspace with 0's */ - if (clear_user(buf, len_to_zero)) + if (clear_user(buf, pad_name_len - name_len)) return -EFAULT; - buf += len_to_zero; - event_size += name_len; + event_size += pad_name_len; } return event_size; @@ -254,7 +246,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf, if (IS_ERR(kevent)) break; ret = copy_event_to_user(group, kevent, buf); - fsnotify_put_event(kevent); + fsnotify_destroy_event(group, kevent); if (ret < 0) break; buf += ret; @@ -297,8 +289,7 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fsnotify_group *group; - struct fsnotify_event_holder *holder; - struct fsnotify_event *event; + struct fsnotify_event *fsn_event; void __user *p; int ret = -ENOTTY; size_t send_len = 0; @@ -311,12 +302,10 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case FIONREAD: mutex_lock(&group->notification_mutex); - list_for_each_entry(holder, &group->notification_list, event_list) { - event = holder->event; + list_for_each_entry(fsn_event, &group->notification_list, + list) { send_len += sizeof(struct inotify_event); - if (event->name_len) - send_len += roundup(event->name_len + 1, - sizeof(struct inotify_event)); + send_len += round_event_name_len(fsn_event); } mutex_unlock(&group->notification_mutex); ret = put_user(send_len, (int __user *) p); @@ -503,43 +492,12 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group) { struct inotify_inode_mark *i_mark; - struct fsnotify_event *ignored_event, *notify_event; - struct inotify_event_private_data *event_priv; - struct fsnotify_event_private_data *fsn_event_priv; - int ret; - - i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); - - ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, - FSNOTIFY_EVENT_NONE, NULL, 0, - GFP_NOFS); - if (!ignored_event) - goto skip_send_ignore; - - event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS); - if (unlikely(!event_priv)) - goto skip_send_ignore; - - fsn_event_priv = &event_priv->fsnotify_event_priv_data; - - fsnotify_get_group(group); - fsn_event_priv->group = group; - event_priv->wd = i_mark->wd; - - notify_event = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv, NULL); - if (notify_event) { - if (IS_ERR(notify_event)) - ret = PTR_ERR(notify_event); - else - fsnotify_put_event(notify_event); - inotify_free_event_priv(fsn_event_priv); - } -skip_send_ignore: - /* matches the reference taken when the event was created */ - if (ignored_event) - fsnotify_put_event(ignored_event); + /* Queue ignore event for the watch */ + inotify_handle_event(group, NULL, fsn_mark, NULL, FS_IN_IGNORED, + NULL, FSNOTIFY_EVENT_NONE, NULL); + i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); /* remove this mark from the idr */ inotify_remove_from_idr(group, i_mark); @@ -836,7 +794,6 @@ static int __init inotify_user_setup(void) BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21); inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC); - event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); inotify_max_queued_events = 16384; inotify_max_user_instances = 128; diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 7b51b05..18b3c44 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -48,15 +48,6 @@ #include <linux/fsnotify_backend.h> #include "fsnotify.h" -static struct kmem_cache *fsnotify_event_cachep; -static struct kmem_cache *fsnotify_event_holder_cachep; -/* - * This is a magic event we send when the q is too full. Since it doesn't - * hold real event information we just keep one system wide and use it any time - * it is needed. It's refcnt is set 1 at kernel init time and will never - * get set to 0 so it will never get 'freed' - */ -static struct fsnotify_event *q_overflow_event; static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0); /** @@ -76,186 +67,72 @@ bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) return list_empty(&group->notification_list) ? true : false; } -void fsnotify_get_event(struct fsnotify_event *event) -{ - atomic_inc(&event->refcnt); -} - -void fsnotify_put_event(struct fsnotify_event *event) +void fsnotify_destroy_event(struct fsnotify_group *group, + struct fsnotify_event *event) { - if (!event) + /* Overflow events are per-group and we don't want to free them */ + if (!event || event->mask == FS_Q_OVERFLOW) return; - if (atomic_dec_and_test(&event->refcnt)) { - pr_debug("%s: event=%p\n", __func__, event); - - if (event->data_type == FSNOTIFY_EVENT_PATH) - path_put(&event->path); - - BUG_ON(!list_empty(&event->private_data_list)); - - kfree(event->file_name); - put_pid(event->tgid); - kmem_cache_free(fsnotify_event_cachep, event); - } -} - -struct fsnotify_event_holder *fsnotify_alloc_event_holder(void) -{ - return kmem_cache_alloc(fsnotify_event_holder_cachep, GFP_KERNEL); -} - -void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder) -{ - if (holder) - kmem_cache_free(fsnotify_event_holder_cachep, holder); -} - -/* - * Find the private data that the group previously attached to this event when - * the group added the event to the notification queue (fsnotify_add_notify_event) - */ -struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, struct fsnotify_event *event) -{ - struct fsnotify_event_private_data *lpriv; - struct fsnotify_event_private_data *priv = NULL; - - assert_spin_locked(&event->lock); - - list_for_each_entry(lpriv, &event->private_data_list, event_list) { - if (lpriv->group == group) { - priv = lpriv; - list_del(&priv->event_list); - break; - } - } - return priv; + group->ops->free_event(event); } /* * Add an event to the group notification queue. The group can later pull this - * event off the queue to deal with. If the event is successfully added to the - * group's notification queue, a reference is taken on event. + * event off the queue to deal with. The function returns 0 if the event was + * added to the queue, 1 if the event was merged with some other queued event. */ -struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event, - struct fsnotify_event_private_data *priv, - struct fsnotify_event *(*merge)(struct list_head *, - struct fsnotify_event *)) +int fsnotify_add_notify_event(struct fsnotify_group *group, + struct fsnotify_event *event, + int (*merge)(struct list_head *, + struct fsnotify_event *)) { - struct fsnotify_event *return_event = NULL; - struct fsnotify_event_holder *holder = NULL; + int ret = 0; struct list_head *list = &group->notification_list; - pr_debug("%s: group=%p event=%p priv=%p\n", __func__, group, event, priv); - - /* - * There is one fsnotify_event_holder embedded inside each fsnotify_event. - * Check if we expect to be able to use that holder. If not alloc a new - * holder. - * For the overflow event it's possible that something will use the in - * event holder before we get the lock so we may need to jump back and - * alloc a new holder, this can't happen for most events... - */ - if (!list_empty(&event->holder.event_list)) { -alloc_holder: - holder = fsnotify_alloc_event_holder(); - if (!holder) - return ERR_PTR(-ENOMEM); - } + pr_debug("%s: group=%p event=%p\n", __func__, group, event); mutex_lock(&group->notification_mutex); if (group->q_len >= group->max_events) { - event = q_overflow_event; - - /* - * we need to return the overflow event - * which means we need a ref - */ - fsnotify_get_event(event); - return_event = event; - - /* sorry, no private data on the overflow event */ - priv = NULL; + /* Queue overflow event only if it isn't already queued */ + if (list_empty(&group->overflow_event.list)) + event = &group->overflow_event; + ret = 1; } if (!list_empty(list) && merge) { - struct fsnotify_event *tmp; - - tmp = merge(list, event); - if (tmp) { + ret = merge(list, event); + if (ret) { mutex_unlock(&group->notification_mutex); - - if (return_event) - fsnotify_put_event(return_event); - if (holder != &event->holder) - fsnotify_destroy_event_holder(holder); - return tmp; + return ret; } } - spin_lock(&event->lock); - - if (list_empty(&event->holder.event_list)) { - if (unlikely(holder)) - fsnotify_destroy_event_holder(holder); - holder = &event->holder; - } else if (unlikely(!holder)) { - /* between the time we checked above and got the lock the in - * event holder was used, go back and get a new one */ - spin_unlock(&event->lock); - mutex_unlock(&group->notification_mutex); - - if (return_event) { - fsnotify_put_event(return_event); - return_event = NULL; - } - - goto alloc_holder; - } - group->q_len++; - holder->event = event; - - fsnotify_get_event(event); - list_add_tail(&holder->event_list, list); - if (priv) - list_add_tail(&priv->event_list, &event->private_data_list); - spin_unlock(&event->lock); + list_add_tail(&event->list, list); mutex_unlock(&group->notification_mutex); wake_up(&group->notification_waitq); kill_fasync(&group->fsn_fa, SIGIO, POLL_IN); - return return_event; + return ret; } /* - * Remove and return the first event from the notification list. There is a - * reference held on this event since it was on the list. It is the responsibility - * of the caller to drop this reference. + * Remove and return the first event from the notification list. It is the + * responsibility of the caller to destroy the obtained event */ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group) { struct fsnotify_event *event; - struct fsnotify_event_holder *holder; BUG_ON(!mutex_is_locked(&group->notification_mutex)); pr_debug("%s: group=%p\n", __func__, group); - holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); - - event = holder->event; - - spin_lock(&event->lock); - holder->event = NULL; - list_del_init(&holder->event_list); - spin_unlock(&event->lock); - - /* event == holder means we are referenced through the in event holder */ - if (holder != &event->holder) - fsnotify_destroy_event_holder(holder); - + event = list_first_entry(&group->notification_list, + struct fsnotify_event, list); + list_del(&event->list); group->q_len--; return event; @@ -266,15 +143,10 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group */ struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) { - struct fsnotify_event *event; - struct fsnotify_event_holder *holder; - BUG_ON(!mutex_is_locked(&group->notification_mutex)); - holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); - event = holder->event; - - return event; + return list_first_entry(&group->notification_list, + struct fsnotify_event, list); } /* @@ -284,181 +156,31 @@ struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) void fsnotify_flush_notify(struct fsnotify_group *group) { struct fsnotify_event *event; - struct fsnotify_event_private_data *priv; mutex_lock(&group->notification_mutex); while (!fsnotify_notify_queue_is_empty(group)) { event = fsnotify_remove_notify_event(group); - /* if they don't implement free_event_priv they better not have attached any */ - if (group->ops->free_event_priv) { - spin_lock(&event->lock); - priv = fsnotify_remove_priv_from_event(group, event); - spin_unlock(&event->lock); - if (priv) - group->ops->free_event_priv(priv); - } - fsnotify_put_event(event); /* matches fsnotify_add_notify_event */ + fsnotify_destroy_event(group, event); } mutex_unlock(&group->notification_mutex); } -static void initialize_event(struct fsnotify_event *event) -{ - INIT_LIST_HEAD(&event->holder.event_list); - atomic_set(&event->refcnt, 1); - - spin_lock_init(&event->lock); - - INIT_LIST_HEAD(&event->private_data_list); -} - -/* - * Caller damn well better be holding whatever mutex is protecting the - * old_holder->event_list and the new_event must be a clean event which - * cannot be found anywhere else in the kernel. - */ -int fsnotify_replace_event(struct fsnotify_event_holder *old_holder, - struct fsnotify_event *new_event) -{ - struct fsnotify_event *old_event = old_holder->event; - struct fsnotify_event_holder *new_holder = &new_event->holder; - - enum event_spinlock_class { - SPINLOCK_OLD, - SPINLOCK_NEW, - }; - - pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, new_event); - - /* - * if the new_event's embedded holder is in use someone - * screwed up and didn't give us a clean new event. - */ - BUG_ON(!list_empty(&new_holder->event_list)); - - spin_lock_nested(&old_event->lock, SPINLOCK_OLD); - spin_lock_nested(&new_event->lock, SPINLOCK_NEW); - - new_holder->event = new_event; - list_replace_init(&old_holder->event_list, &new_holder->event_list); - - spin_unlock(&new_event->lock); - spin_unlock(&old_event->lock); - - /* event == holder means we are referenced through the in event holder */ - if (old_holder != &old_event->holder) - fsnotify_destroy_event_holder(old_holder); - - fsnotify_get_event(new_event); /* on the list take reference */ - fsnotify_put_event(old_event); /* off the list, drop reference */ - - return 0; -} - -struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event) -{ - struct fsnotify_event *event; - - event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL); - if (!event) - return NULL; - - pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, event); - - memcpy(event, old_event, sizeof(*event)); - initialize_event(event); - - if (event->name_len) { - event->file_name = kstrdup(old_event->file_name, GFP_KERNEL); - if (!event->file_name) { - kmem_cache_free(fsnotify_event_cachep, event); - return NULL; - } - } - event->tgid = get_pid(old_event->tgid); - if (event->data_type == FSNOTIFY_EVENT_PATH) - path_get(&event->path); - - return event; -} - /* * fsnotify_create_event - Allocate a new event which will be sent to each * group's handle_event function if the group was interested in this * particular event. * - * @to_tell the inode which is supposed to receive the event (sometimes a + * @inode the inode which is supposed to receive the event (sometimes a * parent of the inode to which the event happened. * @mask what actually happened. * @data pointer to the object which was actually affected * @data_type flag indication if the data is a file, path, inode, nothing... * @name the filename, if available */ -struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, - int data_type, const unsigned char *name, - u32 cookie, gfp_t gfp) +void fsnotify_init_event(struct fsnotify_event *event, struct inode *inode, + u32 mask) { - struct fsnotify_event *event; - - event = kmem_cache_zalloc(fsnotify_event_cachep, gfp); - if (!event) - return NULL; - - pr_debug("%s: event=%p to_tell=%p mask=%x data=%p data_type=%d\n", - __func__, event, to_tell, mask, data, data_type); - - initialize_event(event); - - if (name) { - event->file_name = kstrdup(name, gfp); - if (!event->file_name) { - kmem_cache_free(fsnotify_event_cachep, event); - return NULL; - } - event->name_len = strlen(event->file_name); - } - - event->tgid = get_pid(task_tgid(current)); - event->sync_cookie = cookie; - event->to_tell = to_tell; - event->data_type = data_type; - - switch (data_type) { - case FSNOTIFY_EVENT_PATH: { - struct path *path = data; - event->path.dentry = path->dentry; - event->path.mnt = path->mnt; - path_get(&event->path); - break; - } - case FSNOTIFY_EVENT_INODE: - event->inode = data; - break; - case FSNOTIFY_EVENT_NONE: - event->inode = NULL; - event->path.dentry = NULL; - event->path.mnt = NULL; - break; - default: - BUG(); - } - + INIT_LIST_HEAD(&event->list); + event->inode = inode; event->mask = mask; - - return event; -} - -static __init int fsnotify_notification_init(void) -{ - fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC); - fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC); - - q_overflow_event = fsnotify_create_event(NULL, FS_Q_OVERFLOW, NULL, - FSNOTIFY_EVENT_NONE, NULL, 0, - GFP_KERNEL); - if (!q_overflow_event) - panic("unable to allocate fsnotify q_overflow_event\n"); - - return 0; } -subsys_initcall(fsnotify_notification_init); diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index f17e58b..ce210d4 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -38,7 +38,6 @@ ocfs2-objs := \ symlink.o \ sysfile.o \ uptodate.o \ - ver.o \ quota_local.o \ quota_global.o \ xattr.o \ diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index b4f788e..555f4cd 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -160,36 +160,6 @@ static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode, return acl; } - -/* - * Get posix acl. - */ -static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type) -{ - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct buffer_head *di_bh = NULL; - struct posix_acl *acl; - int ret; - - if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return NULL; - - ret = ocfs2_inode_lock(inode, &di_bh, 0); - if (ret < 0) { - mlog_errno(ret); - acl = ERR_PTR(ret); - return acl; - } - - acl = ocfs2_get_acl_nolock(inode, type, di_bh); - - ocfs2_inode_unlock(inode, 0); - - brelse(di_bh); - - return acl; -} - /* * Helper function to set i_mode in memory and disk. Some call paths * will not have di_bh or a journal handle to pass, in which case it @@ -250,7 +220,7 @@ out: /* * Set the access or default ACL of an inode. */ -static int ocfs2_set_acl(handle_t *handle, +int ocfs2_set_acl(handle_t *handle, struct inode *inode, struct buffer_head *di_bh, int type, @@ -313,6 +283,11 @@ static int ocfs2_set_acl(handle_t *handle, return ret; } +int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + return ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL); +} + struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) { struct ocfs2_super *osb; @@ -334,200 +309,3 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) return acl; } - -int ocfs2_acl_chmod(struct inode *inode) -{ - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct posix_acl *acl; - int ret; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return 0; - - acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - ret = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (ret) - return ret; - ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS, - acl, NULL, NULL); - posix_acl_release(acl); - return ret; -} - -/* - * Initialize the ACLs of a new inode. If parent directory has default ACL, - * then clone to new inode. Called from ocfs2_mknod. - */ -int ocfs2_init_acl(handle_t *handle, - struct inode *inode, - struct inode *dir, - struct buffer_head *di_bh, - struct buffer_head *dir_bh, - struct ocfs2_alloc_context *meta_ac, - struct ocfs2_alloc_context *data_ac) -{ - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct posix_acl *acl = NULL; - int ret = 0, ret2; - umode_t mode; - - if (!S_ISLNK(inode->i_mode)) { - if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { - acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT, - dir_bh); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - if (!acl) { - mode = inode->i_mode & ~current_umask(); - ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode); - if (ret) { - mlog_errno(ret); - goto cleanup; - } - } - } - if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { - if (S_ISDIR(inode->i_mode)) { - ret = ocfs2_set_acl(handle, inode, di_bh, - ACL_TYPE_DEFAULT, acl, - meta_ac, data_ac); - if (ret) - goto cleanup; - } - mode = inode->i_mode; - ret = posix_acl_create(&acl, GFP_NOFS, &mode); - if (ret < 0) - return ret; - - ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode); - if (ret2) { - mlog_errno(ret2); - ret = ret2; - goto cleanup; - } - if (ret > 0) { - ret = ocfs2_set_acl(handle, inode, - di_bh, ACL_TYPE_ACCESS, - acl, meta_ac, data_ac); - } - } -cleanup: - posix_acl_release(acl); - return ret; -} - -static size_t ocfs2_xattr_list_acl_access(struct dentry *dentry, - char *list, - size_t list_len, - const char *name, - size_t name_len, - int type) -{ - struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); - const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - - if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return 0; - - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_ACCESS, size); - return size; -} - -static size_t ocfs2_xattr_list_acl_default(struct dentry *dentry, - char *list, - size_t list_len, - const char *name, - size_t name_len, - int type) -{ - struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); - const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - - if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return 0; - - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); - return size; -} - -static int ocfs2_xattr_get_acl(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) -{ - struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); - struct posix_acl *acl; - int ret; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return -EOPNOTSUPP; - - acl = ocfs2_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - ret = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return ret; -} - -static int ocfs2_xattr_set_acl(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - struct inode *inode = dentry->d_inode; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct posix_acl *acl; - int ret = 0; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return -EOPNOTSUPP; - - if (!inode_owner_or_capable(inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - ret = posix_acl_valid(acl); - if (ret) - goto cleanup; - } - } else - acl = NULL; - - ret = ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL); - -cleanup: - posix_acl_release(acl); - return ret; -} - -const struct xattr_handler ocfs2_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = ocfs2_xattr_list_acl_access, - .get = ocfs2_xattr_get_acl, - .set = ocfs2_xattr_set_acl, -}; - -const struct xattr_handler ocfs2_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = ocfs2_xattr_list_acl_default, - .get = ocfs2_xattr_get_acl, - .set = ocfs2_xattr_set_acl, -}; diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 071fbd38..3fce68d 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -27,10 +27,13 @@ struct ocfs2_acl_entry { }; struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type); -extern int ocfs2_acl_chmod(struct inode *); -extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, - struct buffer_head *, struct buffer_head *, - struct ocfs2_alloc_context *, - struct ocfs2_alloc_context *); +int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int ocfs2_set_acl(handle_t *handle, + struct inode *inode, + struct buffer_head *di_bh, + int type, + struct posix_acl *acl, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac); #endif /* OCFS2_ACL_H */ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index dc7411f..8750ae1 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -7260,14 +7260,8 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) start = range->start >> osb->s_clustersize_bits; len = range->len >> osb->s_clustersize_bits; minlen = range->minlen >> osb->s_clustersize_bits; - trimmed = 0; - - if (!len) { - range->len = 0; - return 0; - } - if (minlen >= osb->bitmap_cpg) + if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize) return -EINVAL; main_bm_inode = ocfs2_get_system_file_inode(osb, @@ -7293,6 +7287,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) goto out_unlock; } + len = range->len >> osb->s_clustersize_bits; if (start + len > le32_to_cpu(main_bm->i_clusters)) len = le32_to_cpu(main_bm->i_clusters) - start; @@ -7307,6 +7302,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1); last_bit = osb->bitmap_cpg; + trimmed = 0; for (group = first_group; group <= last_group;) { if (first_bit + len >= osb->bitmap_cpg) last_bit = osb->bitmap_cpg; diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile index bc8c5e7..1aefc03 100644 --- a/fs/ocfs2/cluster/Makefile +++ b/fs/ocfs2/cluster/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \ - quorum.o tcp.o netdebug.o ver.o + quorum.o tcp.o netdebug.o diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 73920ff..bf482df 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -413,7 +413,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, } /* Must put everything in 512 byte sectors for the bio... */ - bio->bi_sector = (reg->hr_start_block + cs) << (bits - 9); + bio->bi_iter.bi_sector = (reg->hr_start_block + cs) << (bits - 9); bio->bi_bdev = reg->hr_bdev; bio->bi_private = wc; bio->bi_end_io = o2hb_bio_end_io; diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index bb24064..441c84e 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -29,7 +29,6 @@ #include "heartbeat.h" #include "masklog.h" #include "sys.h" -#include "ver.h" /* for now we operate under the assertion that there can be only one * cluster active at a time. Changing this will require trickling @@ -945,8 +944,6 @@ static int __init init_o2nm(void) { int ret = -1; - cluster_print_version(); - ret = o2hb_init(); if (ret) goto out; @@ -984,6 +981,7 @@ out: MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OCFS2 cluster management"); module_init(init_o2nm) module_exit(exit_o2nm) diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c deleted file mode 100644 index a56eee6..0000000 --- a/fs/ocfs2/cluster/ver.c +++ /dev/null @@ -1,42 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ver.c - * - * version string - * - * Copyright (C) 2002, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/module.h> -#include <linux/kernel.h> - -#include "ver.h" - -#define CLUSTER_BUILD_VERSION "1.5.0" - -#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION - -void cluster_print_version(void) -{ - printk(KERN_INFO "%s\n", VERSION_STR); -} - -MODULE_DESCRIPTION(VERSION_STR); - -MODULE_VERSION(CLUSTER_BUILD_VERSION); diff --git a/fs/ocfs2/cluster/ver.h b/fs/ocfs2/cluster/ver.h deleted file mode 100644 index 32554c3..0000000 --- a/fs/ocfs2/cluster/ver.h +++ /dev/null @@ -1,31 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ver.h - * - * Function prototypes - * - * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef O2CLUSTER_VER_H -#define O2CLUSTER_VER_H - -void cluster_print_version(void); - -#endif /* O2CLUSTER_VER_H */ diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile index c8a044e..bd1aab1f 100644 --- a/fs/ocfs2/dlm/Makefile +++ b/fs/ocfs2/dlm/Makefile @@ -3,5 +3,5 @@ ccflags-y := -Ifs/ocfs2 obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ - dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o + dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 8b3382a..33660a4 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -43,8 +43,6 @@ #include "dlmdomain.h" #include "dlmdebug.h" -#include "dlmver.h" - #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) #include "cluster/masklog.h" @@ -2328,8 +2326,6 @@ static int __init dlm_init(void) { int status; - dlm_print_version(); - status = dlm_init_mle_cache(); if (status) { mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n"); @@ -2379,6 +2375,7 @@ static void __exit dlm_exit (void) MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OCFS2 Distributed Lock Management"); module_init(dlm_init); module_exit(dlm_exit); diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c deleted file mode 100644 index dfc0da4..0000000 --- a/fs/ocfs2/dlm/dlmver.c +++ /dev/null @@ -1,42 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmver.c - * - * version string - * - * Copyright (C) 2002, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/module.h> -#include <linux/kernel.h> - -#include "dlmver.h" - -#define DLM_BUILD_VERSION "1.5.0" - -#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION - -void dlm_print_version(void) -{ - printk(KERN_INFO "%s\n", VERSION_STR); -} - -MODULE_DESCRIPTION(VERSION_STR); - -MODULE_VERSION(DLM_BUILD_VERSION); diff --git a/fs/ocfs2/dlm/dlmver.h b/fs/ocfs2/dlm/dlmver.h deleted file mode 100644 index f674aee..0000000 --- a/fs/ocfs2/dlm/dlmver.h +++ /dev/null @@ -1,31 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmfsver.h - * - * Function prototypes - * - * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef DLM_VER_H -#define DLM_VER_H - -void dlm_print_version(void); - -#endif /* DLM_VER_H */ diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile index f14be89..eed3db8c 100644 --- a/fs/ocfs2/dlmfs/Makefile +++ b/fs/ocfs2/dlmfs/Makefile @@ -2,4 +2,4 @@ ccflags-y := -Ifs/ocfs2 obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o -ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o +ocfs2_dlmfs-objs := userdlm.o dlmfs.o diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index efa2b3d..09b7d9d 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -49,7 +49,6 @@ #include "stackglue.h" #include "userdlm.h" -#include "dlmfsver.h" #define MLOG_MASK_PREFIX ML_DLMFS #include "cluster/masklog.h" @@ -644,8 +643,6 @@ static int __init init_dlmfs_fs(void) int status; int cleanup_inode = 0, cleanup_worker = 0; - dlmfs_print_version(); - status = bdi_init(&dlmfs_backing_dev_info); if (status) return status; @@ -701,6 +698,7 @@ static void __exit exit_dlmfs_fs(void) MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OCFS2 DLM-Filesystem"); module_init(init_dlmfs_fs) module_exit(exit_dlmfs_fs) diff --git a/fs/ocfs2/dlmfs/dlmfsver.c b/fs/ocfs2/dlmfs/dlmfsver.c deleted file mode 100644 index a733b33..0000000 --- a/fs/ocfs2/dlmfs/dlmfsver.c +++ /dev/null @@ -1,42 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmfsver.c - * - * version string - * - * Copyright (C) 2002, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/module.h> -#include <linux/kernel.h> - -#include "dlmfsver.h" - -#define DLM_BUILD_VERSION "1.5.0" - -#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION - -void dlmfs_print_version(void) -{ - printk(KERN_INFO "%s\n", VERSION_STR); -} - -MODULE_DESCRIPTION(VERSION_STR); - -MODULE_VERSION(DLM_BUILD_VERSION); diff --git a/fs/ocfs2/dlmfs/dlmfsver.h b/fs/ocfs2/dlmfs/dlmfsver.h deleted file mode 100644 index f35eadb..0000000 --- a/fs/ocfs2/dlmfs/dlmfsver.h +++ /dev/null @@ -1,31 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmver.h - * - * Function prototypes - * - * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef DLMFS_VER_H -#define DLMFS_VER_H - -void dlmfs_print_version(void); - -#endif /* DLMFS_VER_H */ diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 3407b2c..1998695 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2996,6 +2996,8 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) /* for now, uuid == domain */ status = ocfs2_cluster_connect(osb->osb_cluster_stack, + osb->osb_cluster_name, + strlen(osb->osb_cluster_name), osb->uuid_str, strlen(osb->uuid_str), &lproto, ocfs2_do_node_down, osb, @@ -3005,7 +3007,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) goto bail; } - status = ocfs2_cluster_this_node(&osb->node_num); + status = ocfs2_cluster_this_node(conn, &osb->node_num); if (status < 0) { mlog_errno(status); mlog(ML_ERROR, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6fff128..d77d71e 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1236,7 +1236,7 @@ bail: dqput(transfer_to[qtype]); if (!status && attr->ia_valid & ATTR_MODE) { - status = ocfs2_acl_chmod(inode); + status = posix_acl_chmod(inode, inode->i_mode); if (status < 0) mlog_errno(status); } @@ -1869,7 +1869,8 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, } size = sr->l_start + sr->l_len; - if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) { + if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64 || + cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) { if (sr->l_len <= 0) { ret = -EINVAL; goto out_inode_unlock; @@ -2661,6 +2662,7 @@ const struct inode_operations ocfs2_file_iops = { .removexattr = generic_removexattr, .fiemap = ocfs2_fiemap, .get_acl = ocfs2_iop_get_acl, + .set_acl = ocfs2_iop_set_acl, }; const struct inode_operations ocfs2_special_file_iops = { @@ -2668,6 +2670,7 @@ const struct inode_operations ocfs2_special_file_iops = { .getattr = ocfs2_getattr, .permission = ocfs2_permission, .get_acl = ocfs2_iop_get_acl, + .set_acl = ocfs2_iop_set_acl, }; /* diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index fa32ce9..8ca3c29 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -7,6 +7,7 @@ #include <linux/fs.h> #include <linux/mount.h> +#include <linux/blkdev.h> #include <linux/compat.h> #include <cluster/masklog.h> @@ -966,15 +967,21 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case FITRIM: { struct super_block *sb = inode->i_sb; + struct request_queue *q = bdev_get_queue(sb->s_bdev); struct fstrim_range range; int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + if (copy_from_user(&range, argp, sizeof(range))) return -EFAULT; + range.minlen = max_t(u64, q->limits.discard_granularity, + range.minlen); ret = ocfs2_trim_fs(sb, &range); if (ret < 0) return ret; diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 631a982..64c304d 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -561,83 +561,6 @@ static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh, mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos); } -static int ocfs2_alloc_dinode_update_counts(struct inode *inode, - handle_t *handle, - struct buffer_head *di_bh, - u32 num_bits, - u16 chain) -{ - int ret; - u32 tmp_used; - struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; - struct ocfs2_chain_list *cl = - (struct ocfs2_chain_list *) &di->id2.i_chain; - - ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - - tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); - di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); - le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); - ocfs2_journal_dirty(handle, di_bh); - -out: - return ret; -} - -static inline int ocfs2_block_group_set_bits(handle_t *handle, - struct inode *alloc_inode, - struct ocfs2_group_desc *bg, - struct buffer_head *group_bh, - unsigned int bit_off, - unsigned int num_bits) -{ - int status; - void *bitmap = bg->bg_bitmap; - int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; - - /* All callers get the descriptor via - * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ - BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); - BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); - - mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, - num_bits); - - if (ocfs2_is_cluster_bitmap(alloc_inode)) - journal_type = OCFS2_JOURNAL_ACCESS_UNDO; - - status = ocfs2_journal_access_gd(handle, - INODE_CACHE(alloc_inode), - group_bh, - journal_type); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - le16_add_cpu(&bg->bg_free_bits_count, -num_bits); - if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { - ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" - " count %u but claims %u are freed. num_bits %d", - (unsigned long long)le64_to_cpu(bg->bg_blkno), - le16_to_cpu(bg->bg_bits), - le16_to_cpu(bg->bg_free_bits_count), num_bits); - return -EROFS; - } - while (num_bits--) - ocfs2_set_bit(bit_off++, bitmap); - - ocfs2_journal_dirty(handle, group_bh); - -bail: - return status; -} - static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, u32 cpos, u32 phys_cpos, u32 *new_phys_cpos, u32 len, int ext_flags) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 4f791f6..f4d609b 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -230,6 +230,7 @@ static int ocfs2_mknod(struct inode *dir, struct ocfs2_dir_lookup_result lookup = { NULL, }; sigset_t oldset; int did_block_signals = 0; + struct posix_acl *default_acl = NULL, *acl = NULL; trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno, @@ -331,6 +332,12 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } + status = posix_acl_create(dir, &mode, &default_acl, &acl); + if (status) { + mlog_errno(status); + goto leave; + } + handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb, S_ISDIR(mode), xattr_credits)); @@ -379,8 +386,17 @@ static int ocfs2_mknod(struct inode *dir, inc_nlink(dir); } - status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh, - meta_ac, data_ac); + if (default_acl) { + status = ocfs2_set_acl(handle, inode, new_fe_bh, + ACL_TYPE_DEFAULT, default_acl, + meta_ac, data_ac); + } + if (!status && acl) { + status = ocfs2_set_acl(handle, inode, new_fe_bh, + ACL_TYPE_ACCESS, acl, + meta_ac, data_ac); + } + if (status < 0) { mlog_errno(status); goto leave; @@ -419,6 +435,10 @@ static int ocfs2_mknod(struct inode *dir, d_instantiate(dentry, inode); status = 0; leave: + if (default_acl) + posix_acl_release(default_acl); + if (acl) + posix_acl_release(acl); if (status < 0 && did_quota_inode) dquot_free_inode(inode); if (handle) @@ -948,7 +968,7 @@ leave: ocfs2_free_dir_lookup_result(&orphan_insert); ocfs2_free_dir_lookup_result(&lookup); - if (status && (status != -ENOTEMPTY)) + if (status && (status != -ENOTEMPTY) && (status != -ENOENT)) mlog_errno(status); return status; @@ -2504,4 +2524,5 @@ const struct inode_operations ocfs2_dir_iops = { .removexattr = generic_removexattr, .fiemap = ocfs2_fiemap, .get_acl = ocfs2_iop_get_acl, + .set_acl = ocfs2_iop_set_acl, }; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 3a90347..553f53c 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -387,6 +387,7 @@ struct ocfs2_super u8 osb_stackflags; char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; + char osb_cluster_name[OCFS2_CLUSTER_NAME_LEN + 1]; struct ocfs2_cluster_connection *cconn; struct ocfs2_lock_res osb_super_lockres; struct ocfs2_lock_res osb_rename_lockres; diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 55767e1..6ba4bcb 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -46,6 +46,7 @@ #include <linux/quotaops.h> #include <linux/namei.h> #include <linux/mount.h> +#include <linux/posix_acl.h> struct ocfs2_cow_context { struct inode *inode; @@ -4268,11 +4269,20 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, struct inode *inode = old_dentry->d_inode; struct buffer_head *old_bh = NULL; struct inode *new_orphan_inode = NULL; + struct posix_acl *default_acl, *acl; + umode_t mode; if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) return -EOPNOTSUPP; - error = ocfs2_create_inode_in_orphan(dir, inode->i_mode, + mode = inode->i_mode; + error = posix_acl_create(dir, &mode, &default_acl, &acl); + if (error) { + mlog_errno(error); + goto out; + } + + error = ocfs2_create_inode_in_orphan(dir, mode, &new_orphan_inode); if (error) { mlog_errno(error); @@ -4303,11 +4313,16 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, /* If the security isn't preserved, we need to re-initialize them. */ if (!preserve) { error = ocfs2_init_security_and_acl(dir, new_orphan_inode, - &new_dentry->d_name); + &new_dentry->d_name, + default_acl, acl); if (error) mlog_errno(error); } out: + if (default_acl) + posix_acl_release(default_acl); + if (acl) + posix_acl_release(acl); if (!error) { error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, new_dentry); diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index bf1f893..1724d43 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -398,7 +398,8 @@ static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn) return 0; } -static int o2cb_cluster_this_node(unsigned int *node) +static int o2cb_cluster_this_node(struct ocfs2_cluster_connection *conn, + unsigned int *node) { int node_num; diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 286edf1..13a8537 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -23,6 +23,7 @@ #include <linux/mutex.h> #include <linux/slab.h> #include <linux/reboot.h> +#include <linux/sched.h> #include <asm/uaccess.h> #include "stackglue.h" @@ -102,6 +103,12 @@ #define OCFS2_TEXT_UUID_LEN 32 #define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2 #define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8 +#define VERSION_LOCK "version_lock" + +enum ocfs2_connection_type { + WITH_CONTROLD, + NO_CONTROLD +}; /* * ocfs2_live_connection is refcounted because the filesystem and @@ -110,6 +117,13 @@ struct ocfs2_live_connection { struct list_head oc_list; struct ocfs2_cluster_connection *oc_conn; + enum ocfs2_connection_type oc_type; + atomic_t oc_this_node; + int oc_our_slot; + struct dlm_lksb oc_version_lksb; + char oc_lvb[DLM_LVB_LEN]; + struct completion oc_sync_wait; + wait_queue_head_t oc_wait; }; struct ocfs2_control_private { @@ -198,20 +212,15 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name) * mount path. Since the VFS prevents multiple calls to * fill_super(), we can't get dupes here. */ -static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn, - struct ocfs2_live_connection **c_ret) +static int ocfs2_live_connection_attach(struct ocfs2_cluster_connection *conn, + struct ocfs2_live_connection *c) { int rc = 0; - struct ocfs2_live_connection *c; - - c = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL); - if (!c) - return -ENOMEM; mutex_lock(&ocfs2_control_lock); c->oc_conn = conn; - if (atomic_read(&ocfs2_control_opened)) + if ((c->oc_type == NO_CONTROLD) || atomic_read(&ocfs2_control_opened)) list_add(&c->oc_list, &ocfs2_live_connection_list); else { printk(KERN_ERR @@ -220,12 +229,6 @@ static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn, } mutex_unlock(&ocfs2_control_lock); - - if (!rc) - *c_ret = c; - else - kfree(c); - return rc; } @@ -799,18 +802,251 @@ static int fs_protocol_compare(struct ocfs2_protocol_version *existing, return 0; } +static void lvb_to_version(char *lvb, struct ocfs2_protocol_version *ver) +{ + struct ocfs2_protocol_version *pv = + (struct ocfs2_protocol_version *)lvb; + /* + * ocfs2_protocol_version has two u8 variables, so we don't + * need any endian conversion. + */ + ver->pv_major = pv->pv_major; + ver->pv_minor = pv->pv_minor; +} + +static void version_to_lvb(struct ocfs2_protocol_version *ver, char *lvb) +{ + struct ocfs2_protocol_version *pv = + (struct ocfs2_protocol_version *)lvb; + /* + * ocfs2_protocol_version has two u8 variables, so we don't + * need any endian conversion. + */ + pv->pv_major = ver->pv_major; + pv->pv_minor = ver->pv_minor; +} + +static void sync_wait_cb(void *arg) +{ + struct ocfs2_cluster_connection *conn = arg; + struct ocfs2_live_connection *lc = conn->cc_private; + complete(&lc->oc_sync_wait); +} + +static int sync_unlock(struct ocfs2_cluster_connection *conn, + struct dlm_lksb *lksb, char *name) +{ + int error; + struct ocfs2_live_connection *lc = conn->cc_private; + + error = dlm_unlock(conn->cc_lockspace, lksb->sb_lkid, 0, lksb, conn); + if (error) { + printk(KERN_ERR "%s lkid %x error %d\n", + name, lksb->sb_lkid, error); + return error; + } + + wait_for_completion(&lc->oc_sync_wait); + + if (lksb->sb_status != -DLM_EUNLOCK) { + printk(KERN_ERR "%s lkid %x status %d\n", + name, lksb->sb_lkid, lksb->sb_status); + return -1; + } + return 0; +} + +static int sync_lock(struct ocfs2_cluster_connection *conn, + int mode, uint32_t flags, + struct dlm_lksb *lksb, char *name) +{ + int error, status; + struct ocfs2_live_connection *lc = conn->cc_private; + + error = dlm_lock(conn->cc_lockspace, mode, lksb, flags, + name, strlen(name), + 0, sync_wait_cb, conn, NULL); + if (error) { + printk(KERN_ERR "%s lkid %x flags %x mode %d error %d\n", + name, lksb->sb_lkid, flags, mode, error); + return error; + } + + wait_for_completion(&lc->oc_sync_wait); + + status = lksb->sb_status; + + if (status && status != -EAGAIN) { + printk(KERN_ERR "%s lkid %x flags %x mode %d status %d\n", + name, lksb->sb_lkid, flags, mode, status); + } + + return status; +} + + +static int version_lock(struct ocfs2_cluster_connection *conn, int mode, + int flags) +{ + struct ocfs2_live_connection *lc = conn->cc_private; + return sync_lock(conn, mode, flags, + &lc->oc_version_lksb, VERSION_LOCK); +} + +static int version_unlock(struct ocfs2_cluster_connection *conn) +{ + struct ocfs2_live_connection *lc = conn->cc_private; + return sync_unlock(conn, &lc->oc_version_lksb, VERSION_LOCK); +} + +/* get_protocol_version() + * + * To exchange ocfs2 versioning, we use the LVB of the version dlm lock. + * The algorithm is: + * 1. Attempt to take the lock in EX mode (non-blocking). + * 2. If successful (which means it is the first mount), write the + * version number and downconvert to PR lock. + * 3. If unsuccessful (returns -EAGAIN), read the version from the LVB after + * taking the PR lock. + */ + +static int get_protocol_version(struct ocfs2_cluster_connection *conn) +{ + int ret; + struct ocfs2_live_connection *lc = conn->cc_private; + struct ocfs2_protocol_version pv; + + running_proto.pv_major = + ocfs2_user_plugin.sp_max_proto.pv_major; + running_proto.pv_minor = + ocfs2_user_plugin.sp_max_proto.pv_minor; + + lc->oc_version_lksb.sb_lvbptr = lc->oc_lvb; + ret = version_lock(conn, DLM_LOCK_EX, + DLM_LKF_VALBLK|DLM_LKF_NOQUEUE); + if (!ret) { + conn->cc_version.pv_major = running_proto.pv_major; + conn->cc_version.pv_minor = running_proto.pv_minor; + version_to_lvb(&running_proto, lc->oc_lvb); + version_lock(conn, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_VALBLK); + } else if (ret == -EAGAIN) { + ret = version_lock(conn, DLM_LOCK_PR, DLM_LKF_VALBLK); + if (ret) + goto out; + lvb_to_version(lc->oc_lvb, &pv); + + if ((pv.pv_major != running_proto.pv_major) || + (pv.pv_minor > running_proto.pv_minor)) { + ret = -EINVAL; + goto out; + } + + conn->cc_version.pv_major = pv.pv_major; + conn->cc_version.pv_minor = pv.pv_minor; + } +out: + return ret; +} + +static void user_recover_prep(void *arg) +{ +} + +static void user_recover_slot(void *arg, struct dlm_slot *slot) +{ + struct ocfs2_cluster_connection *conn = arg; + printk(KERN_INFO "ocfs2: Node %d/%d down. Initiating recovery.\n", + slot->nodeid, slot->slot); + conn->cc_recovery_handler(slot->nodeid, conn->cc_recovery_data); + +} + +static void user_recover_done(void *arg, struct dlm_slot *slots, + int num_slots, int our_slot, + uint32_t generation) +{ + struct ocfs2_cluster_connection *conn = arg; + struct ocfs2_live_connection *lc = conn->cc_private; + int i; + + for (i = 0; i < num_slots; i++) + if (slots[i].slot == our_slot) { + atomic_set(&lc->oc_this_node, slots[i].nodeid); + break; + } + + lc->oc_our_slot = our_slot; + wake_up(&lc->oc_wait); +} + +static const struct dlm_lockspace_ops ocfs2_ls_ops = { + .recover_prep = user_recover_prep, + .recover_slot = user_recover_slot, + .recover_done = user_recover_done, +}; + +static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) +{ + version_unlock(conn); + dlm_release_lockspace(conn->cc_lockspace, 2); + conn->cc_lockspace = NULL; + ocfs2_live_connection_drop(conn->cc_private); + conn->cc_private = NULL; + return 0; +} + static int user_cluster_connect(struct ocfs2_cluster_connection *conn) { dlm_lockspace_t *fsdlm; - struct ocfs2_live_connection *uninitialized_var(control); - int rc = 0; + struct ocfs2_live_connection *lc; + int rc, ops_rv; BUG_ON(conn == NULL); - rc = ocfs2_live_connection_new(conn, &control); + lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL); + if (!lc) { + rc = -ENOMEM; + goto out; + } + + init_waitqueue_head(&lc->oc_wait); + init_completion(&lc->oc_sync_wait); + atomic_set(&lc->oc_this_node, 0); + conn->cc_private = lc; + lc->oc_type = NO_CONTROLD; + + rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name, + DLM_LSFL_FS, DLM_LVB_LEN, + &ocfs2_ls_ops, conn, &ops_rv, &fsdlm); + if (rc) + goto out; + + if (ops_rv == -EOPNOTSUPP) { + lc->oc_type = WITH_CONTROLD; + printk(KERN_NOTICE "ocfs2: You seem to be using an older " + "version of dlm_controld and/or ocfs2-tools." + " Please consider upgrading.\n"); + } else if (ops_rv) { + rc = ops_rv; + goto out; + } + conn->cc_lockspace = fsdlm; + + rc = ocfs2_live_connection_attach(conn, lc); if (rc) goto out; + if (lc->oc_type == NO_CONTROLD) { + rc = get_protocol_version(conn); + if (rc) { + printk(KERN_ERR "ocfs2: Could not determine" + " locking version\n"); + user_cluster_disconnect(conn); + goto out; + } + wait_event(lc->oc_wait, (atomic_read(&lc->oc_this_node) > 0)); + } + /* * running_proto must have been set before we allowed any mounts * to proceed. @@ -818,42 +1054,34 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) if (fs_protocol_compare(&running_proto, &conn->cc_version)) { printk(KERN_ERR "Unable to mount with fs locking protocol version " - "%u.%u because the userspace control daemon has " - "negotiated %u.%u\n", + "%u.%u because negotiated protocol is %u.%u\n", conn->cc_version.pv_major, conn->cc_version.pv_minor, running_proto.pv_major, running_proto.pv_minor); rc = -EPROTO; - ocfs2_live_connection_drop(control); - goto out; - } - - rc = dlm_new_lockspace(conn->cc_name, NULL, DLM_LSFL_FS, DLM_LVB_LEN, - NULL, NULL, NULL, &fsdlm); - if (rc) { - ocfs2_live_connection_drop(control); - goto out; + ocfs2_live_connection_drop(lc); + lc = NULL; } - conn->cc_private = control; - conn->cc_lockspace = fsdlm; out: + if (rc && lc) + kfree(lc); return rc; } -static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) -{ - dlm_release_lockspace(conn->cc_lockspace, 2); - conn->cc_lockspace = NULL; - ocfs2_live_connection_drop(conn->cc_private); - conn->cc_private = NULL; - return 0; -} -static int user_cluster_this_node(unsigned int *this_node) +static int user_cluster_this_node(struct ocfs2_cluster_connection *conn, + unsigned int *this_node) { int rc; + struct ocfs2_live_connection *lc = conn->cc_private; + + if (lc->oc_type == WITH_CONTROLD) + rc = ocfs2_control_get_this_node(); + else if (lc->oc_type == NO_CONTROLD) + rc = atomic_read(&lc->oc_this_node); + else + rc = -EINVAL; - rc = ocfs2_control_get_this_node(); if (rc < 0) return rc; diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index cb7ec0b..1324e66 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -309,6 +309,8 @@ int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, EXPORT_SYMBOL_GPL(ocfs2_plock); int ocfs2_cluster_connect(const char *stack_name, + const char *cluster_name, + int cluster_name_len, const char *group, int grouplen, struct ocfs2_locking_protocol *lproto, @@ -342,8 +344,10 @@ int ocfs2_cluster_connect(const char *stack_name, goto out; } - memcpy(new_conn->cc_name, group, grouplen); + strlcpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1); new_conn->cc_namelen = grouplen; + strlcpy(new_conn->cc_cluster_name, cluster_name, CLUSTER_NAME_MAX + 1); + new_conn->cc_cluster_name_len = cluster_name_len; new_conn->cc_recovery_handler = recovery_handler; new_conn->cc_recovery_data = recovery_data; @@ -386,8 +390,9 @@ int ocfs2_cluster_connect_agnostic(const char *group, if (cluster_stack_name[0]) stack_name = cluster_stack_name; - return ocfs2_cluster_connect(stack_name, group, grouplen, lproto, - recovery_handler, recovery_data, conn); + return ocfs2_cluster_connect(stack_name, NULL, 0, group, grouplen, + lproto, recovery_handler, recovery_data, + conn); } EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic); @@ -460,9 +465,10 @@ void ocfs2_cluster_hangup(const char *group, int grouplen) } EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup); -int ocfs2_cluster_this_node(unsigned int *node) +int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn, + unsigned int *node) { - return active_stack->sp_ops->this_node(node); + return active_stack->sp_ops->this_node(conn, node); } EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node); diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index 1ec56fd..66334a3 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -45,6 +45,9 @@ struct file_lock; */ #define GROUP_NAME_MAX 64 +/* This shadows OCFS2_CLUSTER_NAME_LEN */ +#define CLUSTER_NAME_MAX 16 + /* * ocfs2_protocol_version changes when ocfs2 does something different in @@ -97,8 +100,10 @@ struct ocfs2_locking_protocol { * locking compatibility. */ struct ocfs2_cluster_connection { - char cc_name[GROUP_NAME_MAX]; + char cc_name[GROUP_NAME_MAX + 1]; int cc_namelen; + char cc_cluster_name[CLUSTER_NAME_MAX + 1]; + int cc_cluster_name_len; struct ocfs2_protocol_version cc_version; struct ocfs2_locking_protocol *cc_proto; void (*cc_recovery_handler)(int node_num, void *recovery_data); @@ -152,7 +157,8 @@ struct ocfs2_stack_operations { * ->this_node() returns the cluster's unique identifier for the * local node. */ - int (*this_node)(unsigned int *node); + int (*this_node)(struct ocfs2_cluster_connection *conn, + unsigned int *node); /* * Call the underlying dlm lock function. The ->dlm_lock() @@ -239,6 +245,8 @@ struct ocfs2_stack_plugin { /* Used by the filesystem */ int ocfs2_cluster_connect(const char *stack_name, + const char *cluster_name, + int cluster_name_len, const char *group, int grouplen, struct ocfs2_locking_protocol *lproto, @@ -260,7 +268,8 @@ int ocfs2_cluster_connect_agnostic(const char *group, int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, int hangup_pending); void ocfs2_cluster_hangup(const char *group, int grouplen); -int ocfs2_cluster_this_node(unsigned int *node); +int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn, + unsigned int *node); struct ocfs2_lock_res; int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 2c91452..47ae266 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -113,12 +113,6 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, struct ocfs2_suballoc_result *res); static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, int nr); -static inline int ocfs2_block_group_set_bits(handle_t *handle, - struct inode *alloc_inode, - struct ocfs2_group_desc *bg, - struct buffer_head *group_bh, - unsigned int bit_off, - unsigned int num_bits); static int ocfs2_relink_block_group(handle_t *handle, struct inode *alloc_inode, struct buffer_head *fe_bh, @@ -1343,7 +1337,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, return status; } -static inline int ocfs2_block_group_set_bits(handle_t *handle, +int ocfs2_block_group_set_bits(handle_t *handle, struct inode *alloc_inode, struct ocfs2_group_desc *bg, struct buffer_head *group_bh, @@ -1388,8 +1382,6 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle, ocfs2_journal_dirty(handle, group_bh); bail: - if (status) - mlog_errno(status); return status; } @@ -1588,7 +1580,7 @@ static int ocfs2_block_group_search(struct inode *inode, return ret; } -static int ocfs2_alloc_dinode_update_counts(struct inode *inode, +int ocfs2_alloc_dinode_update_counts(struct inode *inode, handle_t *handle, struct buffer_head *di_bh, u32 num_bits, diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index a36d0aa..218d803 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -86,6 +86,18 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb, u32 bits_wanted, struct ocfs2_alloc_context **ac); +int ocfs2_alloc_dinode_update_counts(struct inode *inode, + handle_t *handle, + struct buffer_head *di_bh, + u32 num_bits, + u16 chain); +int ocfs2_block_group_set_bits(handle_t *handle, + struct inode *alloc_inode, + struct ocfs2_group_desc *bg, + struct buffer_head *group_bh, + unsigned int bit_off, + unsigned int num_bits); + int ocfs2_claim_metadata(handle_t *handle, struct ocfs2_alloc_context *ac, u32 bits_wanted, diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index c414929..49d84f8 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -68,7 +68,6 @@ #include "super.h" #include "sysfile.h" #include "uptodate.h" -#include "ver.h" #include "xattr.h" #include "quota.h" #include "refcounttree.h" @@ -90,6 +89,7 @@ static struct dentry *ocfs2_debugfs_root = NULL; MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OCFS2 cluster file system"); struct mount_options { @@ -1618,8 +1618,6 @@ static int __init ocfs2_init(void) { int status, i; - ocfs2_print_version(); - for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++) init_waitqueue_head(&ocfs2__ioend_wq[i]); @@ -1947,11 +1945,15 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) ocfs2_shutdown_local_alloc(osb); - ocfs2_truncate_log_shutdown(osb); - /* This will disable recovery and flush any recovery work. */ ocfs2_recovery_exit(osb); + /* + * During dismount, when it recovers another node it will call + * ocfs2_recover_orphans and queue delayed work osb_truncate_log_wq. + */ + ocfs2_truncate_log_shutdown(osb); + ocfs2_journal_shutdown(osb); ocfs2_sync_blockdev(sb); @@ -2225,10 +2227,9 @@ static int ocfs2_initialize_super(struct super_block *sb, if (ocfs2_clusterinfo_valid(osb)) { osb->osb_stackflags = OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags; - memcpy(osb->osb_cluster_stack, + strlcpy(osb->osb_cluster_stack, OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, - OCFS2_STACK_LABEL_LEN); - osb->osb_cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; + OCFS2_STACK_LABEL_LEN + 1); if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) { mlog(ML_ERROR, "couldn't mount because of an invalid " @@ -2237,6 +2238,9 @@ static int ocfs2_initialize_super(struct super_block *sb, status = -EINVAL; goto bail; } + strlcpy(osb->osb_cluster_name, + OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster, + OCFS2_CLUSTER_NAME_LEN + 1); } else { /* The empty string is identical with classic tools that * don't know about s_cluster_info. */ diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c deleted file mode 100644 index e2488f4..0000000 --- a/fs/ocfs2/ver.c +++ /dev/null @@ -1,43 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ver.c - * - * version string - * - * Copyright (C) 2002, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/module.h> -#include <linux/string.h> -#include <linux/kernel.h> - -#include "ver.h" - -#define OCFS2_BUILD_VERSION "1.5.0" - -#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION - -void ocfs2_print_version(void) -{ - printk(KERN_INFO "%s\n", VERSION_STR); -} - -MODULE_DESCRIPTION(VERSION_STR); - -MODULE_VERSION(OCFS2_BUILD_VERSION); diff --git a/fs/ocfs2/ver.h b/fs/ocfs2/ver.h deleted file mode 100644 index d7395cb..0000000 --- a/fs/ocfs2/ver.h +++ /dev/null @@ -1,31 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ver.h - * - * Function prototypes - * - * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef OCFS2_VER_H -#define OCFS2_VER_H - -void ocfs2_print_version(void); - -#endif /* OCFS2_VER_H */ diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index f0a1326..185fa3b7 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -99,8 +99,8 @@ static struct ocfs2_xattr_def_value_root def_xv = { const struct xattr_handler *ocfs2_xattr_handlers[] = { &ocfs2_xattr_user_handler, - &ocfs2_xattr_acl_access_handler, - &ocfs2_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, &ocfs2_xattr_trusted_handler, &ocfs2_xattr_security_handler, NULL @@ -109,9 +109,9 @@ const struct xattr_handler *ocfs2_xattr_handlers[] = { static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] - = &ocfs2_xattr_acl_access_handler, + = &posix_acl_access_xattr_handler, [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] - = &ocfs2_xattr_acl_default_handler, + = &posix_acl_default_xattr_handler, [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler, }; @@ -7190,10 +7190,12 @@ out: */ int ocfs2_init_security_and_acl(struct inode *dir, struct inode *inode, - const struct qstr *qstr) + const struct qstr *qstr, + struct posix_acl *default_acl, + struct posix_acl *acl) { - int ret = 0; struct buffer_head *dir_bh = NULL; + int ret = 0; ret = ocfs2_init_security_get(inode, dir, qstr, NULL); if (ret) { @@ -7207,9 +7209,10 @@ int ocfs2_init_security_and_acl(struct inode *dir, goto leave; } - ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL); - if (ret) - mlog_errno(ret); + if (!ret && default_acl) + ret = ocfs2_iop_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + if (!ret && acl) + ret = ocfs2_iop_set_acl(inode, acl, ACL_TYPE_ACCESS); ocfs2_inode_unlock(dir, 0); brelse(dir_bh); diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index 19f134e..f10d5b9 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -40,8 +40,6 @@ struct ocfs2_security_xattr_info { extern const struct xattr_handler ocfs2_xattr_user_handler; extern const struct xattr_handler ocfs2_xattr_trusted_handler; extern const struct xattr_handler ocfs2_xattr_security_handler; -extern const struct xattr_handler ocfs2_xattr_acl_access_handler; -extern const struct xattr_handler ocfs2_xattr_acl_default_handler; extern const struct xattr_handler *ocfs2_xattr_handlers[]; ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); @@ -96,5 +94,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, bool preserve_security); int ocfs2_init_security_and_acl(struct inode *dir, struct inode *inode, - const struct qstr *qstr); + const struct qstr *qstr, + struct posix_acl *default_acl, + struct posix_acl *acl); #endif /* OCFS2_XATTR_H */ @@ -663,10 +663,11 @@ out: wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } - if (ret > 0) { + if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) { int err = file_update_time(filp); if (err) ret = err; + sb_end_write(file_inode(filp)->i_sb); } return ret; } diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 8bd2135..38bae5a 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -1,10 +1,8 @@ /* - * linux/fs/posix_acl.c + * Copyright (C) 2002,2003 by Andreas Gruenbacher <a.gruenbacher@computer.org> * - * Copyright (C) 2002 by Andreas Gruenbacher <a.gruenbacher@computer.org> - * - * Fixes from William Schumacher incorporated on 15 March 2001. - * (Reported by Charles Bertsch, <CBertsch@microtest.com>). + * Fixes from William Schumacher incorporated on 15 March 2001. + * (Reported by Charles Bertsch, <CBertsch@microtest.com>). */ /* @@ -18,15 +16,112 @@ #include <linux/fs.h> #include <linux/sched.h> #include <linux/posix_acl.h> +#include <linux/posix_acl_xattr.h> +#include <linux/xattr.h> #include <linux/export.h> +#include <linux/user_namespace.h> -#include <linux/errno.h> +struct posix_acl **acl_by_type(struct inode *inode, int type) +{ + switch (type) { + case ACL_TYPE_ACCESS: + return &inode->i_acl; + case ACL_TYPE_DEFAULT: + return &inode->i_default_acl; + default: + BUG(); + } +} +EXPORT_SYMBOL(acl_by_type); -EXPORT_SYMBOL(posix_acl_init); -EXPORT_SYMBOL(posix_acl_alloc); -EXPORT_SYMBOL(posix_acl_valid); -EXPORT_SYMBOL(posix_acl_equiv_mode); -EXPORT_SYMBOL(posix_acl_from_mode); +struct posix_acl *get_cached_acl(struct inode *inode, int type) +{ + struct posix_acl **p = acl_by_type(inode, type); + struct posix_acl *acl = ACCESS_ONCE(*p); + if (acl) { + spin_lock(&inode->i_lock); + acl = *p; + if (acl != ACL_NOT_CACHED) + acl = posix_acl_dup(acl); + spin_unlock(&inode->i_lock); + } + return acl; +} +EXPORT_SYMBOL(get_cached_acl); + +struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type) +{ + return rcu_dereference(*acl_by_type(inode, type)); +} +EXPORT_SYMBOL(get_cached_acl_rcu); + +void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl) +{ + struct posix_acl **p = acl_by_type(inode, type); + struct posix_acl *old; + spin_lock(&inode->i_lock); + old = *p; + rcu_assign_pointer(*p, posix_acl_dup(acl)); + spin_unlock(&inode->i_lock); + if (old != ACL_NOT_CACHED) + posix_acl_release(old); +} +EXPORT_SYMBOL(set_cached_acl); + +void forget_cached_acl(struct inode *inode, int type) +{ + struct posix_acl **p = acl_by_type(inode, type); + struct posix_acl *old; + spin_lock(&inode->i_lock); + old = *p; + *p = ACL_NOT_CACHED; + spin_unlock(&inode->i_lock); + if (old != ACL_NOT_CACHED) + posix_acl_release(old); +} +EXPORT_SYMBOL(forget_cached_acl); + +void forget_all_cached_acls(struct inode *inode) +{ + struct posix_acl *old_access, *old_default; + spin_lock(&inode->i_lock); + old_access = inode->i_acl; + old_default = inode->i_default_acl; + inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; + spin_unlock(&inode->i_lock); + if (old_access != ACL_NOT_CACHED) + posix_acl_release(old_access); + if (old_default != ACL_NOT_CACHED) + posix_acl_release(old_default); +} +EXPORT_SYMBOL(forget_all_cached_acls); + +struct posix_acl *get_acl(struct inode *inode, int type) +{ + struct posix_acl *acl; + + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + if (!IS_POSIXACL(inode)) + return NULL; + + /* + * A filesystem can force a ACL callback by just never filling the + * ACL cache. But normally you'd fill the cache either at inode + * instantiation time, or on the first ->get_acl call. + * + * If the filesystem doesn't have a get_acl() function at all, we'll + * just create the negative cache entry. + */ + if (!inode->i_op->get_acl) { + set_cached_acl(inode, type, NULL); + return NULL; + } + return inode->i_op->get_acl(inode, type); +} +EXPORT_SYMBOL(get_acl); /* * Init a fresh posix_acl @@ -37,6 +132,7 @@ posix_acl_init(struct posix_acl *acl, int count) atomic_set(&acl->a_refcount, 1); acl->a_count = count; } +EXPORT_SYMBOL(posix_acl_init); /* * Allocate a new ACL with the specified number of entries. @@ -51,6 +147,7 @@ posix_acl_alloc(int count, gfp_t flags) posix_acl_init(acl, count); return acl; } +EXPORT_SYMBOL(posix_acl_alloc); /* * Clone an ACL. @@ -78,8 +175,6 @@ posix_acl_valid(const struct posix_acl *acl) { const struct posix_acl_entry *pa, *pe; int state = ACL_USER_OBJ; - kuid_t prev_uid = INVALID_UID; - kgid_t prev_gid = INVALID_GID; int needs_mask = 0; FOREACH_ACL_ENTRY(pa, acl, pe) { @@ -98,10 +193,6 @@ posix_acl_valid(const struct posix_acl *acl) return -EINVAL; if (!uid_valid(pa->e_uid)) return -EINVAL; - if (uid_valid(prev_uid) && - uid_lte(pa->e_uid, prev_uid)) - return -EINVAL; - prev_uid = pa->e_uid; needs_mask = 1; break; @@ -117,10 +208,6 @@ posix_acl_valid(const struct posix_acl *acl) return -EINVAL; if (!gid_valid(pa->e_gid)) return -EINVAL; - if (gid_valid(prev_gid) && - gid_lte(pa->e_gid, prev_gid)) - return -EINVAL; - prev_gid = pa->e_gid; needs_mask = 1; break; @@ -146,6 +233,7 @@ posix_acl_valid(const struct posix_acl *acl) return 0; return -EINVAL; } +EXPORT_SYMBOL(posix_acl_valid); /* * Returns 0 if the acl can be exactly represented in the traditional @@ -186,6 +274,7 @@ posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p) *mode_p = (*mode_p & ~S_IRWXUGO) | mode; return not_equiv; } +EXPORT_SYMBOL(posix_acl_equiv_mode); /* * Create an ACL representing the file mode permission bits of an inode. @@ -207,6 +296,7 @@ posix_acl_from_mode(umode_t mode, gfp_t flags) acl->a_entries[2].e_perm = (mode & S_IRWXO); return acl; } +EXPORT_SYMBOL(posix_acl_from_mode); /* * Return 0 if current is granted want access to the inode @@ -338,7 +428,7 @@ static int posix_acl_create_masq(struct posix_acl *acl, umode_t *mode_p) /* * Modify the ACL for the chmod syscall. */ -static int posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode) +static int __posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode) { struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; struct posix_acl_entry *pa, *pe; @@ -384,7 +474,7 @@ static int posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode) } int -posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p) +__posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p) { struct posix_acl *clone = posix_acl_clone(*acl, gfp); int err = -ENOMEM; @@ -399,15 +489,15 @@ posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p) *acl = clone; return err; } -EXPORT_SYMBOL(posix_acl_create); +EXPORT_SYMBOL(__posix_acl_create); int -posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode) +__posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode) { struct posix_acl *clone = posix_acl_clone(*acl, gfp); int err = -ENOMEM; if (clone) { - err = posix_acl_chmod_masq(clone, mode); + err = __posix_acl_chmod_masq(clone, mode); if (err) { posix_acl_release(clone); clone = NULL; @@ -417,4 +507,382 @@ posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode) *acl = clone; return err; } +EXPORT_SYMBOL(__posix_acl_chmod); + +int +posix_acl_chmod(struct inode *inode, umode_t mode) +{ + struct posix_acl *acl; + int ret = 0; + + if (!IS_POSIXACL(inode)) + return 0; + if (!inode->i_op->set_acl) + return -EOPNOTSUPP; + + acl = get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR_OR_NULL(acl)) + return PTR_ERR(acl); + + ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); + if (ret) + return ret; + ret = inode->i_op->set_acl(inode, acl, ACL_TYPE_ACCESS); + posix_acl_release(acl); + return ret; +} EXPORT_SYMBOL(posix_acl_chmod); + +int +posix_acl_create(struct inode *dir, umode_t *mode, + struct posix_acl **default_acl, struct posix_acl **acl) +{ + struct posix_acl *p; + int ret; + + if (S_ISLNK(*mode) || !IS_POSIXACL(dir)) + goto no_acl; + + p = get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(p)) + return PTR_ERR(p); + + if (!p) { + *mode &= ~current_umask(); + goto no_acl; + } + + *acl = posix_acl_clone(p, GFP_NOFS); + if (!*acl) + return -ENOMEM; + + ret = posix_acl_create_masq(*acl, mode); + if (ret < 0) { + posix_acl_release(*acl); + return -ENOMEM; + } + + if (ret == 0) { + posix_acl_release(*acl); + *acl = NULL; + } + + if (!S_ISDIR(*mode)) { + posix_acl_release(p); + *default_acl = NULL; + } else { + *default_acl = p; + } + return 0; + +no_acl: + *default_acl = NULL; + *acl = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(posix_acl_create); + +/* + * Fix up the uids and gids in posix acl extended attributes in place. + */ +static void posix_acl_fix_xattr_userns( + struct user_namespace *to, struct user_namespace *from, + void *value, size_t size) +{ + posix_acl_xattr_header *header = (posix_acl_xattr_header *)value; + posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end; + int count; + kuid_t uid; + kgid_t gid; + + if (!value) + return; + if (size < sizeof(posix_acl_xattr_header)) + return; + if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) + return; + + count = posix_acl_xattr_count(size); + if (count < 0) + return; + if (count == 0) + return; + + for (end = entry + count; entry != end; entry++) { + switch(le16_to_cpu(entry->e_tag)) { + case ACL_USER: + uid = make_kuid(from, le32_to_cpu(entry->e_id)); + entry->e_id = cpu_to_le32(from_kuid(to, uid)); + break; + case ACL_GROUP: + gid = make_kgid(from, le32_to_cpu(entry->e_id)); + entry->e_id = cpu_to_le32(from_kgid(to, gid)); + break; + default: + break; + } + } +} + +void posix_acl_fix_xattr_from_user(void *value, size_t size) +{ + struct user_namespace *user_ns = current_user_ns(); + if (user_ns == &init_user_ns) + return; + posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size); +} + +void posix_acl_fix_xattr_to_user(void *value, size_t size) +{ + struct user_namespace *user_ns = current_user_ns(); + if (user_ns == &init_user_ns) + return; + posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size); +} + +/* + * Convert from extended attribute to in-memory representation. + */ +struct posix_acl * +posix_acl_from_xattr(struct user_namespace *user_ns, + const void *value, size_t size) +{ + posix_acl_xattr_header *header = (posix_acl_xattr_header *)value; + posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end; + int count; + struct posix_acl *acl; + struct posix_acl_entry *acl_e; + + if (!value) + return NULL; + if (size < sizeof(posix_acl_xattr_header)) + return ERR_PTR(-EINVAL); + if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) + return ERR_PTR(-EOPNOTSUPP); + + count = posix_acl_xattr_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + + acl = posix_acl_alloc(count, GFP_NOFS); + if (!acl) + return ERR_PTR(-ENOMEM); + acl_e = acl->a_entries; + + for (end = entry + count; entry != end; acl_e++, entry++) { + acl_e->e_tag = le16_to_cpu(entry->e_tag); + acl_e->e_perm = le16_to_cpu(entry->e_perm); + + switch(acl_e->e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + break; + + case ACL_USER: + acl_e->e_uid = + make_kuid(user_ns, + le32_to_cpu(entry->e_id)); + if (!uid_valid(acl_e->e_uid)) + goto fail; + break; + case ACL_GROUP: + acl_e->e_gid = + make_kgid(user_ns, + le32_to_cpu(entry->e_id)); + if (!gid_valid(acl_e->e_gid)) + goto fail; + break; + + default: + goto fail; + } + } + return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} +EXPORT_SYMBOL (posix_acl_from_xattr); + +/* + * Convert from in-memory to extended attribute representation. + */ +int +posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, + void *buffer, size_t size) +{ + posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer; + posix_acl_xattr_entry *ext_entry = ext_acl->a_entries; + int real_size, n; + + real_size = posix_acl_xattr_size(acl->a_count); + if (!buffer) + return real_size; + if (real_size > size) + return -ERANGE; + + ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); + + for (n=0; n < acl->a_count; n++, ext_entry++) { + const struct posix_acl_entry *acl_e = &acl->a_entries[n]; + ext_entry->e_tag = cpu_to_le16(acl_e->e_tag); + ext_entry->e_perm = cpu_to_le16(acl_e->e_perm); + switch(acl_e->e_tag) { + case ACL_USER: + ext_entry->e_id = + cpu_to_le32(from_kuid(user_ns, acl_e->e_uid)); + break; + case ACL_GROUP: + ext_entry->e_id = + cpu_to_le32(from_kgid(user_ns, acl_e->e_gid)); + break; + default: + ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID); + break; + } + } + return real_size; +} +EXPORT_SYMBOL (posix_acl_to_xattr); + +static int +posix_acl_xattr_get(struct dentry *dentry, const char *name, + void *value, size_t size, int type) +{ + struct posix_acl *acl; + int error; + + if (!IS_POSIXACL(dentry->d_inode)) + return -EOPNOTSUPP; + if (S_ISLNK(dentry->d_inode->i_mode)) + return -EOPNOTSUPP; + + acl = get_acl(dentry->d_inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + + error = posix_acl_to_xattr(&init_user_ns, acl, value, size); + posix_acl_release(acl); + + return error; +} + +static int +posix_acl_xattr_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + struct inode *inode = dentry->d_inode; + struct posix_acl *acl = NULL; + int ret; + + if (!IS_POSIXACL(inode)) + return -EOPNOTSUPP; + if (!inode->i_op->set_acl) + return -EOPNOTSUPP; + + if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) + return value ? -EACCES : 0; + if (!inode_owner_or_capable(inode)) + return -EPERM; + + if (value) { + acl = posix_acl_from_xattr(&init_user_ns, value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + + if (acl) { + ret = posix_acl_valid(acl); + if (ret) + goto out; + } + } + + ret = inode->i_op->set_acl(inode, acl, type); +out: + posix_acl_release(acl); + return ret; +} + +static size_t +posix_acl_xattr_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const char *xname; + size_t size; + + if (!IS_POSIXACL(dentry->d_inode)) + return -EOPNOTSUPP; + if (S_ISLNK(dentry->d_inode->i_mode)) + return -EOPNOTSUPP; + + if (type == ACL_TYPE_ACCESS) + xname = POSIX_ACL_XATTR_ACCESS; + else + xname = POSIX_ACL_XATTR_DEFAULT; + + size = strlen(xname) + 1; + if (list && size <= list_size) + memcpy(list, xname, size); + return size; +} + +const struct xattr_handler posix_acl_access_xattr_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .list = posix_acl_xattr_list, + .get = posix_acl_xattr_get, + .set = posix_acl_xattr_set, +}; +EXPORT_SYMBOL_GPL(posix_acl_access_xattr_handler); + +const struct xattr_handler posix_acl_default_xattr_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .list = posix_acl_xattr_list, + .get = posix_acl_xattr_get, + .set = posix_acl_xattr_set, +}; +EXPORT_SYMBOL_GPL(posix_acl_default_xattr_handler); + +int simple_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int error; + + if (type == ACL_TYPE_ACCESS) { + error = posix_acl_equiv_mode(acl, &inode->i_mode); + if (error < 0) + return 0; + if (error == 0) + acl = NULL; + } + + inode->i_ctime = CURRENT_TIME; + set_cached_acl(inode, type, acl); + return 0; +} + +int simple_acl_create(struct inode *dir, struct inode *inode) +{ + struct posix_acl *default_acl, *acl; + int error; + + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (error) + return error; + + set_cached_acl(inode, ACL_TYPE_DEFAULT, default_acl); + set_cached_acl(inode, ACL_TYPE_ACCESS, acl); + + if (default_acl) + posix_acl_release(default_acl); + if (acl) + posix_acl_release(acl); + return 0; +} diff --git a/fs/proc/array.c b/fs/proc/array.c index 1bd2077..656e401 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -140,24 +140,15 @@ static const char * const task_state_array[] = { "t (tracing stop)", /* 8 */ "Z (zombie)", /* 16 */ "X (dead)", /* 32 */ - "x (dead)", /* 64 */ - "K (wakekill)", /* 128 */ - "W (waking)", /* 256 */ - "P (parked)", /* 512 */ }; static inline const char *get_task_state(struct task_struct *tsk) { - unsigned int state = (tsk->state & TASK_REPORT) | tsk->exit_state; - const char * const *p = &task_state_array[0]; + unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT; - BUILD_BUG_ON(1 + ilog2(TASK_STATE_MAX) != ARRAY_SIZE(task_state_array)); + BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1); - while (state) { - p++; - state >>= 1; - } - return *p; + return task_state_array[fls(state)]; } static inline void task_state(struct seq_file *m, struct pid_namespace *ns, @@ -453,8 +444,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, min_flt += t->min_flt; maj_flt += t->maj_flt; gtime += task_gtime(t); - t = next_thread(t); - } while (t != task); + } while_each_thread(task, t); min_flt += sig->min_flt; maj_flt += sig->maj_flt; diff --git a/fs/proc/base.c b/fs/proc/base.c index 03c8d74..5150706 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1658,13 +1658,18 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags) return 0; } +static inline bool proc_inode_is_dead(struct inode *inode) +{ + return !proc_pid(inode)->tasks[PIDTYPE_PID].first; +} + int pid_delete_dentry(const struct dentry *dentry) { /* Is the task we represent dead? * If so, then don't put the dentry on the lru list, * kill it immediately. */ - return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; + return proc_inode_is_dead(dentry->d_inode); } const struct dentry_operations pid_dentry_operations = @@ -3092,34 +3097,42 @@ out_no_task: * In the case of a seek we start with the leader and walk nr * threads past it. */ -static struct task_struct *first_tid(struct task_struct *leader, - int tid, int nr, struct pid_namespace *ns) +static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos, + struct pid_namespace *ns) { - struct task_struct *pos; + struct task_struct *pos, *task; + unsigned long nr = f_pos; + + if (nr != f_pos) /* 32bit overflow? */ + return NULL; rcu_read_lock(); - /* Attempt to start with the pid of a thread */ - if (tid && (nr > 0)) { + task = pid_task(pid, PIDTYPE_PID); + if (!task) + goto fail; + + /* Attempt to start with the tid of a thread */ + if (tid && nr) { pos = find_task_by_pid_ns(tid, ns); - if (pos && (pos->group_leader == leader)) + if (pos && same_thread_group(pos, task)) goto found; } /* If nr exceeds the number of threads there is nothing todo */ - pos = NULL; - if (nr && nr >= get_nr_threads(leader)) - goto out; + if (nr >= get_nr_threads(task)) + goto fail; /* If we haven't found our starting place yet start * with the leader and walk nr threads forward. */ - for (pos = leader; nr > 0; --nr) { - pos = next_thread(pos); - if (pos == leader) { - pos = NULL; - goto out; - } - } + pos = task = task->group_leader; + do { + if (!nr--) + goto found; + } while_each_thread(task, pos); +fail: + pos = NULL; + goto out; found: get_task_struct(pos); out: @@ -3152,25 +3165,16 @@ static struct task_struct *next_tid(struct task_struct *start) /* for the /proc/TGID/task/ directories */ static int proc_task_readdir(struct file *file, struct dir_context *ctx) { - struct task_struct *leader = NULL; - struct task_struct *task = get_proc_task(file_inode(file)); + struct inode *inode = file_inode(file); + struct task_struct *task; struct pid_namespace *ns; int tid; - if (!task) - return -ENOENT; - rcu_read_lock(); - if (pid_alive(task)) { - leader = task->group_leader; - get_task_struct(leader); - } - rcu_read_unlock(); - put_task_struct(task); - if (!leader) + if (proc_inode_is_dead(inode)) return -ENOENT; if (!dir_emit_dots(file, ctx)) - goto out; + return 0; /* f_version caches the tgid value that the last readdir call couldn't * return. lseek aka telldir automagically resets f_version to 0. @@ -3178,7 +3182,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) ns = file->f_dentry->d_sb->s_fs_info; tid = (int)file->f_version; file->f_version = 0; - for (task = first_tid(leader, tid, ctx->pos - 2, ns); + for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns); task; task = next_tid(task), ctx->pos++) { char name[PROC_NUMBUF]; @@ -3194,8 +3198,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) break; } } -out: - put_task_struct(leader); + return 0; } diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c index 82676e3..cbd82df 100644 --- a/fs/proc/cmdline.c +++ b/fs/proc/cmdline.c @@ -26,4 +26,4 @@ static int __init proc_cmdline_init(void) proc_create("cmdline", 0, NULL, &cmdline_proc_fops); return 0; } -module_init(proc_cmdline_init); +fs_initcall(proc_cmdline_init); diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c index 51942d5..290ba85 100644 --- a/fs/proc/consoles.c +++ b/fs/proc/consoles.c @@ -109,4 +109,4 @@ static int __init proc_consoles_init(void) proc_create("consoles", 0, NULL, &proc_consoles_operations); return 0; } -module_init(proc_consoles_init); +fs_initcall(proc_consoles_init); diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c index 5a1e539..06f4d31 100644 --- a/fs/proc/cpuinfo.c +++ b/fs/proc/cpuinfo.c @@ -21,4 +21,4 @@ static int __init proc_cpuinfo_init(void) proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations); return 0; } -module_init(proc_cpuinfo_init); +fs_initcall(proc_cpuinfo_init); diff --git a/fs/proc/devices.c b/fs/proc/devices.c index b143471..50493ed 100644 --- a/fs/proc/devices.c +++ b/fs/proc/devices.c @@ -67,4 +67,4 @@ static int __init proc_devices_init(void) proc_create("devices", 0, NULL, &proc_devinfo_operations); return 0; } -module_init(proc_devices_init); +fs_initcall(proc_devices_init); diff --git a/fs/proc/generic.c b/fs/proc/generic.c index cca93b6..b7f268e 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -49,8 +49,7 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) setattr_copy(inode, iattr); mark_inode_dirty(inode); - de->uid = inode->i_uid; - de->gid = inode->i_gid; + proc_set_user(de, inode->i_uid, inode->i_gid); de->mode = inode->i_mode; return 0; } diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 28955d4..124fc43 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -292,16 +292,20 @@ proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, { struct proc_dir_entry *pde = PDE(file_inode(file)); unsigned long rv = -EIO; - unsigned long (*get_area)(struct file *, unsigned long, unsigned long, - unsigned long, unsigned long) = NULL; + if (use_pde(pde)) { + typeof(proc_reg_get_unmapped_area) *get_area; + + get_area = pde->proc_fops->get_unmapped_area; #ifdef CONFIG_MMU - get_area = current->mm->get_unmapped_area; + if (!get_area) + get_area = current->mm->get_unmapped_area; #endif - if (pde->proc_fops->get_unmapped_area) - get_area = pde->proc_fops->get_unmapped_area; + if (get_area) rv = get_area(file, orig_addr, len, pgoff, flags); + else + rv = orig_addr; unuse_pde(pde); } return rv; diff --git a/fs/proc/interrupts.c b/fs/proc/interrupts.c index 05029c0..a352d57 100644 --- a/fs/proc/interrupts.c +++ b/fs/proc/interrupts.c @@ -50,4 +50,4 @@ static int __init proc_interrupts_init(void) proc_create("interrupts", 0, NULL, &proc_interrupts_operations); return 0; } -module_init(proc_interrupts_init); +fs_initcall(proc_interrupts_init); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 5ed0e52..39e6ef3 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -639,4 +639,4 @@ static int __init proc_kcore_init(void) return 0; } -module_init(proc_kcore_init); +fs_initcall(proc_kcore_init); diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index bdfabda..05f8dcd 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -61,4 +61,4 @@ static int __init proc_kmsg_init(void) proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations); return 0; } -module_init(proc_kmsg_init); +fs_initcall(proc_kmsg_init); diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index 1afa4dd..aec66e6 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -42,4 +42,4 @@ static int __init proc_loadavg_init(void) proc_create("loadavg", 0, NULL, &loadavg_proc_fops); return 0; } -module_init(proc_loadavg_init); +fs_initcall(proc_loadavg_init); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index a77d2b2..136e548 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -26,7 +26,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v) unsigned long committed; struct vmalloc_info vmi; long cached; + long available; + unsigned long pagecache; + unsigned long wmark_low = 0; unsigned long pages[NR_LRU_LISTS]; + struct zone *zone; int lru; /* @@ -47,12 +51,44 @@ static int meminfo_proc_show(struct seq_file *m, void *v) for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) pages[lru] = global_page_state(NR_LRU_BASE + lru); + for_each_zone(zone) + wmark_low += zone->watermark[WMARK_LOW]; + + /* + * Estimate the amount of memory available for userspace allocations, + * without causing swapping. + * + * Free memory cannot be taken below the low watermark, before the + * system starts swapping. + */ + available = i.freeram - wmark_low; + + /* + * Not all the page cache can be freed, otherwise the system will + * start swapping. Assume at least half of the page cache, or the + * low watermark worth of cache, needs to stay. + */ + pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; + pagecache -= min(pagecache / 2, wmark_low); + available += pagecache; + + /* + * Part of the reclaimable swap consists of items that are in use, + * and cannot be freed. Cap this estimate at the low watermark. + */ + available += global_page_state(NR_SLAB_RECLAIMABLE) - + min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low); + + if (available < 0) + available = 0; + /* * Tagged format, for easy grepping and expansion. */ seq_printf(m, "MemTotal: %8lu kB\n" "MemFree: %8lu kB\n" + "MemAvailable: %8lu kB\n" "Buffers: %8lu kB\n" "Cached: %8lu kB\n" "SwapCached: %8lu kB\n" @@ -105,6 +141,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) , K(i.totalram), K(i.freeram), + K(available), K(i.bufferram), K(cached), K(total_swapcache_pages()), @@ -183,4 +220,4 @@ static int __init proc_meminfo_init(void) proc_create("meminfo", 0, NULL, &meminfo_proc_fops); return 0; } -module_init(proc_meminfo_init); +fs_initcall(proc_meminfo_init); diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 5f9bc8a..d4a3574 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -131,4 +131,4 @@ static int __init proc_nommu_init(void) return 0; } -module_init(proc_nommu_init); +fs_initcall(proc_nommu_init); diff --git a/fs/proc/page.c b/fs/proc/page.c index b8730d9..02174a6 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -118,10 +118,12 @@ u64 stable_page_flags(struct page *page) /* * PageTransCompound can be true for non-huge compound pages (slab * pages or pages allocated by drivers with __GFP_COMP) because it - * just checks PG_head/PG_tail, so we need to check PageLRU to make - * sure a given page is a thp, not a non-huge compound page. + * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon + * to make sure a given page is a thp, not a non-huge compound page. */ - else if (PageTransCompound(page) && PageLRU(compound_trans_head(page))) + else if (PageTransCompound(page) && + (PageLRU(compound_trans_head(page)) || + PageAnon(compound_trans_head(page)))) u |= 1 << KPF_THP; /* @@ -217,4 +219,4 @@ static int __init proc_page_init(void) proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations); return 0; } -module_init(proc_page_init); +fs_initcall(proc_page_init); diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c index 70779b2..c82dd51 100644 --- a/fs/proc/proc_devtree.c +++ b/fs/proc/proc_devtree.c @@ -74,9 +74,9 @@ __proc_device_tree_add_prop(struct proc_dir_entry *de, struct property *pp, return NULL; if (!strncmp(name, "security-", 9)) - ent->size = 0; /* don't leak number of password chars */ + proc_set_size(ent, 0); /* don't leak number of password chars */ else - ent->size = pp->length; + proc_set_size(ent, pp->length); return ent; } @@ -232,6 +232,7 @@ void __init proc_device_tree_init(void) return; root = of_find_node_by_path("/"); if (root == NULL) { + remove_proc_entry("device-tree", NULL); pr_debug("/proc/device-tree: can't find root\n"); return; } diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c index 62604be..ad8a77f 100644 --- a/fs/proc/softirqs.c +++ b/fs/proc/softirqs.c @@ -41,4 +41,4 @@ static int __init proc_softirqs_init(void) proc_create("softirqs", 0, NULL, &proc_softirqs_operations); return 0; } -module_init(proc_softirqs_init); +fs_initcall(proc_softirqs_init); diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 1cf86c0..6f599c6 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -221,4 +221,4 @@ static int __init proc_stat_init(void) proc_create("stat", 0, NULL, &proc_stat_operations); return 0; } -module_init(proc_stat_init); +fs_initcall(proc_stat_init); diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index 0618946..7141b8d 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -49,4 +49,4 @@ static int __init proc_uptime_init(void) proc_create("uptime", 0, NULL, &uptime_proc_fops); return 0; } -module_init(proc_uptime_init); +fs_initcall(proc_uptime_init); diff --git a/fs/proc/version.c b/fs/proc/version.c index 76817a6..d2154eb 100644 --- a/fs/proc/version.c +++ b/fs/proc/version.c @@ -31,4 +31,4 @@ static int __init proc_version_init(void) proc_create("version", 0, NULL, &version_proc_fops); return 0; } -module_init(proc_version_init); +fs_initcall(proc_version_init); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 9100d69..2ca7ba0 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -1082,7 +1082,7 @@ static int __init vmcore_init(void) proc_vmcore->size = vmcore_size; return 0; } -module_init(vmcore_init) +fs_initcall(vmcore_init); /* Cleanup function for vmcore module. */ void vmcore_cleanup(void) diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 439406e..7be26f0 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -234,17 +234,12 @@ static int mounts_open_common(struct inode *inode, struct file *file, rcu_read_lock(); nsp = task_nsproxy(task); - if (!nsp) { + if (!nsp || !nsp->mnt_ns) { rcu_read_unlock(); put_task_struct(task); goto err; } ns = nsp->mnt_ns; - if (!ns) { - rcu_read_unlock(); - put_task_struct(task); - goto err; - } get_mnt_ns(ns); rcu_read_unlock(); task_lock(task); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index b8e93a4..78c3c20 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -443,8 +443,11 @@ int pstore_register(struct pstore_info *psi) pstore_get_records(0); kmsg_dump_register(&pstore_dumper); - pstore_register_console(); - pstore_register_ftrace(); + + if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) { + pstore_register_console(); + pstore_register_ftrace(); + } if (pstore_update_ms >= 0) { pstore_timer.expires = jiffies + diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 2e8caa6..89558810 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -27,7 +27,6 @@ static const struct super_operations qnx4_sops; -static void qnx4_put_super(struct super_block *sb); static struct inode *qnx4_alloc_inode(struct super_block *sb); static void qnx4_destroy_inode(struct inode *inode); static int qnx4_remount(struct super_block *sb, int *flags, char *data); @@ -37,7 +36,6 @@ static const struct super_operations qnx4_sops = { .alloc_inode = qnx4_alloc_inode, .destroy_inode = qnx4_destroy_inode, - .put_super = qnx4_put_super, .statfs = qnx4_statfs, .remount_fs = qnx4_remount, }; @@ -148,18 +146,19 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf) * it really _is_ a qnx4 filesystem, and to check the size * of the directory entry. */ -static const char *qnx4_checkroot(struct super_block *sb) +static const char *qnx4_checkroot(struct super_block *sb, + struct qnx4_super_block *s) { struct buffer_head *bh; struct qnx4_inode_entry *rootdir; int rd, rl; int i, j; - if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/') + if (s->RootDir.di_fname[0] != '/' || s->RootDir.di_fname[1] != '\0') return "no qnx4 filesystem (no root dir)."; QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id)); - rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1; - rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size); + rd = le32_to_cpu(s->RootDir.di_first_xtnt.xtnt_blk) - 1; + rl = le32_to_cpu(s->RootDir.di_first_xtnt.xtnt_size); for (j = 0; j < rl; j++) { bh = sb_bread(sb, rd + j); /* root dir, first block */ if (bh == NULL) @@ -189,7 +188,6 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent) struct inode *root; const char *errmsg; struct qnx4_sb_info *qs; - int ret = -EINVAL; qs = kzalloc(sizeof(struct qnx4_sb_info), GFP_KERNEL); if (!qs) @@ -198,67 +196,50 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent) sb_set_blocksize(s, QNX4_BLOCK_SIZE); + s->s_op = &qnx4_sops; + s->s_magic = QNX4_SUPER_MAGIC; + s->s_flags |= MS_RDONLY; /* Yup, read-only yet */ + /* Check the superblock signature. Since the qnx4 code is dangerous, we should leave as quickly as possible if we don't belong here... */ bh = sb_bread(s, 1); if (!bh) { printk(KERN_ERR "qnx4: unable to read the superblock\n"); - goto outnobh; + return -EINVAL; } - if ( le32_to_cpup((__le32*) bh->b_data) != QNX4_SUPER_MAGIC ) { - if (!silent) - printk(KERN_ERR "qnx4: wrong fsid in superblock.\n"); - goto out; - } - s->s_op = &qnx4_sops; - s->s_magic = QNX4_SUPER_MAGIC; - s->s_flags |= MS_RDONLY; /* Yup, read-only yet */ - qnx4_sb(s)->sb_buf = bh; - qnx4_sb(s)->sb = (struct qnx4_super_block *) bh->b_data; - /* check before allocating dentries, inodes, .. */ - errmsg = qnx4_checkroot(s); + errmsg = qnx4_checkroot(s, (struct qnx4_super_block *) bh->b_data); + brelse(bh); if (errmsg != NULL) { if (!silent) printk(KERN_ERR "qnx4: %s\n", errmsg); - goto out; + return -EINVAL; } /* does root not have inode number QNX4_ROOT_INO ?? */ root = qnx4_iget(s, QNX4_ROOT_INO * QNX4_INODES_PER_BLOCK); if (IS_ERR(root)) { printk(KERN_ERR "qnx4: get inode failed\n"); - ret = PTR_ERR(root); - goto outb; + return PTR_ERR(root); } - ret = -ENOMEM; s->s_root = d_make_root(root); if (s->s_root == NULL) - goto outb; + return -ENOMEM; - brelse(bh); return 0; - - outb: - kfree(qs->BitMap); - out: - brelse(bh); - outnobh: - kfree(qs); - s->s_fs_info = NULL; - return ret; } -static void qnx4_put_super(struct super_block *sb) +static void qnx4_kill_sb(struct super_block *sb) { struct qnx4_sb_info *qs = qnx4_sb(sb); - kfree( qs->BitMap ); - kfree( qs ); - sb->s_fs_info = NULL; - return; + kill_block_super(sb); + if (qs) { + kfree(qs->BitMap); + kfree(qs); + } } static int qnx4_readpage(struct file *file, struct page *page) @@ -409,7 +390,7 @@ static struct file_system_type qnx4_fs_type = { .owner = THIS_MODULE, .name = "qnx4", .mount = qnx4_mount, - .kill_sb = kill_block_super, + .kill_sb = qnx4_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("qnx4"); diff --git a/fs/qnx4/qnx4.h b/fs/qnx4/qnx4.h index 34e2d32..c9b1be2 100644 --- a/fs/qnx4/qnx4.h +++ b/fs/qnx4/qnx4.h @@ -10,8 +10,6 @@ #endif struct qnx4_sb_info { - struct buffer_head *sb_buf; /* superblock buffer */ - struct qnx4_super_block *sb; /* our superblock */ unsigned int Version; /* may be useful */ struct qnx4_inode_entry *BitMap; /* useful */ }; diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index 4884ac5..1e56a4e 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -30,13 +30,6 @@ #include "internal.h" -const struct address_space_operations ramfs_aops = { - .readpage = simple_readpage, - .write_begin = simple_write_begin, - .write_end = simple_write_end, - .set_page_dirty = __set_page_dirty_no_writeback, -}; - const struct file_operations ramfs_file_operations = { .read = do_sync_read, .aio_read = generic_file_aio_read, diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 8d5b438..0b3d8e4 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -27,13 +27,12 @@ #include "internal.h" static int ramfs_nommu_setattr(struct dentry *, struct iattr *); - -const struct address_space_operations ramfs_aops = { - .readpage = simple_readpage, - .write_begin = simple_write_begin, - .write_end = simple_write_end, - .set_page_dirty = __set_page_dirty_no_writeback, -}; +static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags); +static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma); const struct file_operations ramfs_file_operations = { .mmap = ramfs_nommu_mmap, @@ -197,7 +196,7 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia) * - the pages to be mapped must exist * - the pages be physically contiguous in sequence */ -unsigned long ramfs_nommu_get_unmapped_area(struct file *file, +static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { @@ -256,7 +255,7 @@ out: /* * set up a mapping for shared memory segments */ -int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma) +static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma) { if (!(vma->vm_flags & VM_SHARED)) return -ENOSYS; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 39d1465..d365b1c 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -43,6 +43,13 @@ static const struct super_operations ramfs_ops; static const struct inode_operations ramfs_dir_inode_operations; +static const struct address_space_operations ramfs_aops = { + .readpage = simple_readpage, + .write_begin = simple_write_begin, + .write_end = simple_write_end, + .set_page_dirty = __set_page_dirty_no_writeback, +}; + static struct backing_dev_info ramfs_backing_dev_info = { .name = "ramfs", .ra_pages = 0, /* No readahead */ @@ -275,4 +282,4 @@ int __init init_ramfs_fs(void) return err; } -module_init(init_ramfs_fs) +fs_initcall(init_ramfs_fs); diff --git a/fs/ramfs/internal.h b/fs/ramfs/internal.h index 6b33063..a9d8ae8 100644 --- a/fs/ramfs/internal.h +++ b/fs/ramfs/internal.h @@ -10,5 +10,4 @@ */ -extern const struct address_space_operations ramfs_aops; extern const struct inode_operations ramfs_file_inode_operations; diff --git a/fs/read_write.c b/fs/read_write.c index 58e440d..edc5746 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -901,10 +901,6 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, io_fn_t fn; iov_fn_t fnv; - ret = -EFAULT; - if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) - goto out; - ret = compat_rw_copy_check_uvector(type, uvector, nr_segs, UIO_FASTIOV, iovstack, &iov); if (ret <= 0) @@ -968,9 +964,9 @@ out: return ret; } -COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd, +COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, const struct compat_iovec __user *,vec, - unsigned long, vlen) + compat_ulong_t, vlen) { struct fd f = fdget(fd); ssize_t ret; @@ -1005,9 +1001,9 @@ COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, return ret; } -COMPAT_SYSCALL_DEFINE5(preadv, unsigned long, fd, +COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, const struct compat_iovec __user *,vec, - unsigned long, vlen, u32, pos_low, u32, pos_high) + compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return compat_sys_preadv64(fd, vec, vlen, pos); @@ -1035,9 +1031,9 @@ out: return ret; } -COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd, +COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, const struct compat_iovec __user *, vec, - unsigned long, vlen) + compat_ulong_t, vlen) { struct fd f = fdget(fd); ssize_t ret; @@ -1072,9 +1068,9 @@ COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, return ret; } -COMPAT_SYSCALL_DEFINE5(pwritev, unsigned long, fd, +COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, const struct compat_iovec __user *,vec, - unsigned long, vlen, u32, pos_low, u32, pos_high) + compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return compat_sys_pwritev64(fd, vec, vlen, pos); diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h index f096b80..4a211f5 100644 --- a/fs/reiserfs/acl.h +++ b/fs/reiserfs/acl.h @@ -48,18 +48,18 @@ static inline int reiserfs_acl_count(size_t size) #ifdef CONFIG_REISERFS_FS_POSIX_ACL struct posix_acl *reiserfs_get_acl(struct inode *inode, int type); +int reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); int reiserfs_acl_chmod(struct inode *inode); int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, struct inode *dir, struct dentry *dentry, struct inode *inode); int reiserfs_cache_default_acl(struct inode *dir); -extern const struct xattr_handler reiserfs_posix_acl_default_handler; -extern const struct xattr_handler reiserfs_posix_acl_access_handler; #else #define reiserfs_cache_default_acl(inode) 0 #define reiserfs_get_acl NULL +#define reiserfs_set_acl NULL static inline int reiserfs_acl_chmod(struct inode *inode) { diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index dcaafcf..ed58d84 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -260,4 +260,5 @@ const struct inode_operations reiserfs_file_inode_operations = { .removexattr = reiserfs_removexattr, .permission = reiserfs_permission, .get_acl = reiserfs_get_acl, + .set_acl = reiserfs_set_acl, }; diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index dc5236f..e825f8b 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -1522,6 +1522,7 @@ const struct inode_operations reiserfs_dir_inode_operations = { .removexattr = reiserfs_removexattr, .permission = reiserfs_permission, .get_acl = reiserfs_get_acl, + .set_acl = reiserfs_set_acl, }; /* @@ -1538,8 +1539,6 @@ const struct inode_operations reiserfs_symlink_inode_operations = { .listxattr = reiserfs_listxattr, .removexattr = reiserfs_removexattr, .permission = reiserfs_permission, - .get_acl = reiserfs_get_acl, - }; /* @@ -1553,4 +1552,5 @@ const struct inode_operations reiserfs_special_inode_operations = { .removexattr = reiserfs_removexattr, .permission = reiserfs_permission, .get_acl = reiserfs_get_acl, + .set_acl = reiserfs_set_acl, }; diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index a958444..02b0b7d 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -419,7 +419,7 @@ int reiserfs_proc_info_init(struct super_block *sb) char *s; /* Some block devices use /'s */ - strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE); + strlcpy(b, sb->s_id, BDEVNAME_SIZE); s = strchr(b, '/'); if (s) *s = '!'; @@ -449,7 +449,7 @@ int reiserfs_proc_info_done(struct super_block *sb) char *s; /* Some block devices use /'s */ - strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE); + strlcpy(b, sb->s_id, BDEVNAME_SIZE); s = strchr(b, '/'); if (s) *s = '!'; diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index f8adaee..8d06adf 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -608,14 +608,6 @@ int reiserfs_resize(struct super_block *, unsigned long); #define SB_DISK_JOURNAL_HEAD(s) (SB_JOURNAL(s)->j_header_bh->) -/* A safe version of the "bdevname", which returns the "s_id" field of - * a superblock or else "Null superblock" if the super block is NULL. - */ -static inline char *reiserfs_bdevname(struct super_block *s) -{ - return (s == NULL) ? "Null superblock" : s->s_id; -} - #define reiserfs_is_journal_aborted(journal) (unlikely (__reiserfs_is_journal_aborted (journal))) static inline int __reiserfs_is_journal_aborted(struct reiserfs_journal *journal) @@ -1958,8 +1950,6 @@ struct treepath var = {.path_length = ILLEGAL_PATH_ELEMENT_OFFSET, .reada = 0,} #define MAX_US_INT 0xffff // reiserfs version 2 has max offset 60 bits. Version 1 - 32 bit offset -#define U32_MAX (~(__u32)0) - static inline loff_t max_reiserfs_offset(struct inode *inode) { if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5) diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 3ead145..2c80335 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1479,7 +1479,7 @@ static int read_super_block(struct super_block *s, int offset) if (!bh) { reiserfs_warning(s, "sh-2006", "bread failed (dev %s, block %lu, size %lu)", - reiserfs_bdevname(s), offset / s->s_blocksize, + s->s_id, offset / s->s_blocksize, s->s_blocksize); return 1; } @@ -1500,7 +1500,7 @@ static int read_super_block(struct super_block *s, int offset) if (!bh) { reiserfs_warning(s, "sh-2007", "bread failed (dev %s, block %lu, size %lu)", - reiserfs_bdevname(s), offset / s->s_blocksize, + s->s_id, offset / s->s_blocksize, s->s_blocksize); return 1; } @@ -1509,7 +1509,7 @@ static int read_super_block(struct super_block *s, int offset) if (sb_blocksize(rs) != s->s_blocksize) { reiserfs_warning(s, "sh-2011", "can't find a reiserfs " "filesystem on (dev %s, block %Lu, size %lu)", - reiserfs_bdevname(s), + s->s_id, (unsigned long long)bh->b_blocknr, s->s_blocksize); brelse(bh); @@ -1825,7 +1825,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) /* try new format (64-th 1k block), which can contain reiserfs super block */ else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) { SWARN(silent, s, "sh-2021", "can not find reiserfs on %s", - reiserfs_bdevname(s)); + s->s_id); goto error_unlocked; } diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 8a9e2dc..5cdfbd6 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -50,6 +50,7 @@ #include <linux/stat.h> #include <linux/quotaops.h> #include <linux/security.h> +#include <linux/posix_acl_xattr.h> #define PRIVROOT_NAME ".reiserfs_priv" #define XAROOT_NAME "xattrs" @@ -904,8 +905,8 @@ static const struct xattr_handler *reiserfs_xattr_handlers[] = { &reiserfs_xattr_security_handler, #endif #ifdef CONFIG_REISERFS_FS_POSIX_ACL - &reiserfs_posix_acl_access_handler, - &reiserfs_posix_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif NULL }; diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 06c04f7..a6ce532 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -11,35 +11,19 @@ #include "acl.h" #include <asm/uaccess.h> -static int reiserfs_set_acl(struct reiserfs_transaction_handle *th, +static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, int type, struct posix_acl *acl); -static int -posix_acl_set(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags, int type) + +int +reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - struct inode *inode = dentry->d_inode; - struct posix_acl *acl; int error, error2; struct reiserfs_transaction_handle th; size_t jcreate_blocks; - if (!reiserfs_posixacl(inode->i_sb)) - return -EOPNOTSUPP; - if (!inode_owner_or_capable(inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) { - return PTR_ERR(acl); - } else if (acl) { - error = posix_acl_valid(acl); - if (error) - goto release_and_out; - } - } else - acl = NULL; + int size = acl ? posix_acl_xattr_size(acl->a_count) : 0; + /* Pessimism: We can't assume that anything from the xattr root up * has been created. */ @@ -51,7 +35,7 @@ posix_acl_set(struct dentry *dentry, const char *name, const void *value, error = journal_begin(&th, inode->i_sb, jcreate_blocks); reiserfs_write_unlock(inode->i_sb); if (error == 0) { - error = reiserfs_set_acl(&th, inode, type, acl); + error = __reiserfs_set_acl(&th, inode, type, acl); reiserfs_write_lock(inode->i_sb); error2 = journal_end(&th, inode->i_sb, jcreate_blocks); reiserfs_write_unlock(inode->i_sb); @@ -59,36 +43,13 @@ posix_acl_set(struct dentry *dentry, const char *name, const void *value, error = error2; } - release_and_out: - posix_acl_release(acl); - return error; -} - -static int -posix_acl_get(struct dentry *dentry, const char *name, void *buffer, - size_t size, int type) -{ - struct posix_acl *acl; - int error; - - if (!reiserfs_posixacl(dentry->d_sb)) - return -EOPNOTSUPP; - - acl = reiserfs_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - return error; } /* * Convert from filesystem to in-memory representation. */ -static struct posix_acl *posix_acl_from_disk(const void *value, size_t size) +static struct posix_acl *reiserfs_posix_acl_from_disk(const void *value, size_t size) { const char *end = (char *)value + size; int n, count; @@ -158,7 +119,7 @@ static struct posix_acl *posix_acl_from_disk(const void *value, size_t size) /* * Convert from in-memory to filesystem representation. */ -static void *posix_acl_to_disk(const struct posix_acl *acl, size_t * size) +static void *reiserfs_posix_acl_to_disk(const struct posix_acl *acl, size_t * size) { reiserfs_acl_header *ext_acl; char *e; @@ -221,10 +182,6 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) int size; int retval; - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - switch (type) { case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; @@ -257,7 +214,7 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) } else if (retval < 0) { acl = ERR_PTR(retval); } else { - acl = posix_acl_from_disk(value, retval); + acl = reiserfs_posix_acl_from_disk(value, retval); } if (!IS_ERR(acl)) set_cached_acl(inode, type, acl); @@ -273,7 +230,7 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) * BKL held [before 2.5.x] */ static int -reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, +__reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, int type, struct posix_acl *acl) { char *name; @@ -281,9 +238,6 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, size_t size = 0; int error; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - switch (type) { case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; @@ -307,7 +261,7 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, } if (acl) { - value = posix_acl_to_disk(acl, &size); + value = reiserfs_posix_acl_to_disk(acl, &size); if (IS_ERR(value)) return (int)PTR_ERR(value); } @@ -343,7 +297,7 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, struct inode *dir, struct dentry *dentry, struct inode *inode) { - struct posix_acl *acl; + struct posix_acl *default_acl, *acl; int err = 0; /* ACLs only get applied to files and directories */ @@ -363,37 +317,28 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, goto apply_umask; } - acl = reiserfs_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); + err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (err) + return err; + if (default_acl) { + err = __reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT, + default_acl); + posix_acl_release(default_acl); + } if (acl) { - /* Copy the default ACL to the default ACL of a new directory */ - if (S_ISDIR(inode->i_mode)) { - err = reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT, - acl); - if (err) - goto cleanup; - } - - /* Now we reconcile the new ACL and the mode, - potentially modifying both */ - err = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); - if (err < 0) - return err; - - /* If we need an ACL.. */ - if (err > 0) - err = reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS, acl); - cleanup: + if (!err) + err = __reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS, + acl); posix_acl_release(acl); - } else { - apply_umask: - /* no ACL, apply umask */ - inode->i_mode &= ~current_umask(); } return err; + + apply_umask: + /* no ACL, apply umask */ + inode->i_mode &= ~current_umask(); + return err; } /* This is used to cache the default acl before a new object is created. @@ -442,84 +387,11 @@ int reiserfs_cache_default_acl(struct inode *inode) */ int reiserfs_acl_chmod(struct inode *inode) { - struct reiserfs_transaction_handle th; - struct posix_acl *acl; - size_t size; - int error; - if (IS_PRIVATE(inode)) return 0; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - if (get_inode_sd_version(inode) == STAT_DATA_V1 || - !reiserfs_posixacl(inode->i_sb)) { + !reiserfs_posixacl(inode->i_sb)) return 0; - } - acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); - if (!acl) - return 0; - if (IS_ERR(acl)) - return PTR_ERR(acl); - error = posix_acl_chmod(&acl, GFP_NOFS, inode->i_mode); - if (error) - return error; - - size = reiserfs_xattr_nblocks(inode, reiserfs_acl_size(acl->a_count)); - reiserfs_write_lock(inode->i_sb); - error = journal_begin(&th, inode->i_sb, size * 2); - reiserfs_write_unlock(inode->i_sb); - if (!error) { - int error2; - error = reiserfs_set_acl(&th, inode, ACL_TYPE_ACCESS, acl); - reiserfs_write_lock(inode->i_sb); - error2 = journal_end(&th, inode->i_sb, size * 2); - reiserfs_write_unlock(inode->i_sb); - if (error2) - error = error2; - } - posix_acl_release(acl); - return error; -} - -static size_t posix_acl_access_list(struct dentry *dentry, char *list, - size_t list_size, const char *name, - size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - if (!reiserfs_posixacl(dentry->d_sb)) - return 0; - if (list && size <= list_size) - memcpy(list, POSIX_ACL_XATTR_ACCESS, size); - return size; + return posix_acl_chmod(inode, inode->i_mode); } - -const struct xattr_handler reiserfs_posix_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .get = posix_acl_get, - .set = posix_acl_set, - .list = posix_acl_access_list, -}; - -static size_t posix_acl_default_list(struct dentry *dentry, char *list, - size_t list_size, const char *name, - size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - if (!reiserfs_posixacl(dentry->d_sb)) - return 0; - if (list && size <= list_size) - memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); - return size; -} - -const struct xattr_handler reiserfs_posix_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .get = posix_acl_get, - .set = posix_acl_set, - .list = posix_acl_default_list, -}; diff --git a/fs/romfs/super.c b/fs/romfs/super.c index ff1d3d4..d841878 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -533,16 +533,14 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent) root = romfs_iget(sb, pos); if (IS_ERR(root)) - goto error; + return PTR_ERR(root); sb->s_root = d_make_root(root); if (!sb->s_root) - goto error; + return -ENOMEM; return 0; -error: - return -EINVAL; error_rsb_inval: ret = -EINVAL; error_rsb: diff --git a/fs/splice.c b/fs/splice.c index 46a08f7..12028fa 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -555,6 +555,24 @@ static const struct pipe_buf_operations default_pipe_buf_ops = { .get = generic_pipe_buf_get, }; +static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return 1; +} + +/* Pipe buffer operations for a socket and similar. */ +const struct pipe_buf_operations nosteal_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = generic_pipe_buf_release, + .steal = generic_pipe_buf_nosteal, + .get = generic_pipe_buf_get, +}; +EXPORT_SYMBOL(nosteal_pipe_buf_ops); + static ssize_t kernel_readv(struct file *file, const struct iovec *vec, unsigned long vlen, loff_t offset) { @@ -166,6 +166,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) if (!s) return NULL; + INIT_LIST_HEAD(&s->s_mounts); + if (security_sb_alloc(s)) goto fail; @@ -188,7 +190,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) if (list_lru_init(&s->s_inode_lru)) goto fail; - INIT_LIST_HEAD(&s->s_mounts); init_rwsem(&s->s_umount); lockdep_set_class(&s->s_umount, &type->s_umount_key); /* diff --git a/fs/sysfs/Makefile b/fs/sysfs/Makefile index 8876ac1..6eff6e1 100644 --- a/fs/sysfs/Makefile +++ b/fs/sysfs/Makefile @@ -2,4 +2,4 @@ # Makefile for the sysfs virtual filesystem # -obj-y := inode.o file.o dir.o symlink.o mount.o group.o +obj-y := file.o dir.o symlink.o mount.o group.o diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 5e73d66..ee0d761 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -13,465 +13,31 @@ #undef DEBUG #include <linux/fs.h> -#include <linux/mount.h> -#include <linux/module.h> #include <linux/kobject.h> -#include <linux/namei.h> -#include <linux/idr.h> -#include <linux/completion.h> -#include <linux/mutex.h> #include <linux/slab.h> -#include <linux/security.h> -#include <linux/hash.h> #include "sysfs.h" -DEFINE_MUTEX(sysfs_mutex); DEFINE_SPINLOCK(sysfs_symlink_target_lock); -#define to_sysfs_dirent(X) rb_entry((X), struct sysfs_dirent, s_rb) - -static DEFINE_SPINLOCK(sysfs_ino_lock); -static DEFINE_IDA(sysfs_ino_ida); - -/** - * sysfs_name_hash - * @name: Null terminated string to hash - * @ns: Namespace tag to hash - * - * Returns 31 bit hash of ns + name (so it fits in an off_t ) - */ -static unsigned int sysfs_name_hash(const char *name, const void *ns) -{ - unsigned long hash = init_name_hash(); - unsigned int len = strlen(name); - while (len--) - hash = partial_name_hash(*name++, hash); - hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31)); - hash &= 0x7fffffffU; - /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */ - if (hash < 1) - hash += 2; - if (hash >= INT_MAX) - hash = INT_MAX - 1; - return hash; -} - -static int sysfs_name_compare(unsigned int hash, const char *name, - const void *ns, const struct sysfs_dirent *sd) -{ - if (hash != sd->s_hash) - return hash - sd->s_hash; - if (ns != sd->s_ns) - return ns - sd->s_ns; - return strcmp(name, sd->s_name); -} - -static int sysfs_sd_compare(const struct sysfs_dirent *left, - const struct sysfs_dirent *right) -{ - return sysfs_name_compare(left->s_hash, left->s_name, left->s_ns, - right); -} - -/** - * sysfs_link_sibling - link sysfs_dirent into sibling rbtree - * @sd: sysfs_dirent of interest - * - * Link @sd into its sibling rbtree which starts from - * sd->s_parent->s_dir.children. - * - * Locking: - * mutex_lock(sysfs_mutex) - * - * RETURNS: - * 0 on susccess -EEXIST on failure. - */ -static int sysfs_link_sibling(struct sysfs_dirent *sd) -{ - struct rb_node **node = &sd->s_parent->s_dir.children.rb_node; - struct rb_node *parent = NULL; - - if (sysfs_type(sd) == SYSFS_DIR) - sd->s_parent->s_dir.subdirs++; - - while (*node) { - struct sysfs_dirent *pos; - int result; - - pos = to_sysfs_dirent(*node); - parent = *node; - result = sysfs_sd_compare(sd, pos); - if (result < 0) - node = &pos->s_rb.rb_left; - else if (result > 0) - node = &pos->s_rb.rb_right; - else - return -EEXIST; - } - /* add new node and rebalance the tree */ - rb_link_node(&sd->s_rb, parent, node); - rb_insert_color(&sd->s_rb, &sd->s_parent->s_dir.children); - return 0; -} - -/** - * sysfs_unlink_sibling - unlink sysfs_dirent from sibling rbtree - * @sd: sysfs_dirent of interest - * - * Unlink @sd from its sibling rbtree which starts from - * sd->s_parent->s_dir.children. - * - * Locking: - * mutex_lock(sysfs_mutex) - */ -static void sysfs_unlink_sibling(struct sysfs_dirent *sd) -{ - if (sysfs_type(sd) == SYSFS_DIR) - sd->s_parent->s_dir.subdirs--; - - rb_erase(&sd->s_rb, &sd->s_parent->s_dir.children); -} - -/** - * sysfs_get_active - get an active reference to sysfs_dirent - * @sd: sysfs_dirent to get an active reference to - * - * Get an active reference of @sd. This function is noop if @sd - * is NULL. - * - * RETURNS: - * Pointer to @sd on success, NULL on failure. - */ -struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) -{ - if (unlikely(!sd)) - return NULL; - - if (!atomic_inc_unless_negative(&sd->s_active)) - return NULL; - - if (likely(!sysfs_ignore_lockdep(sd))) - rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_); - return sd; -} - -/** - * sysfs_put_active - put an active reference to sysfs_dirent - * @sd: sysfs_dirent to put an active reference to - * - * Put an active reference to @sd. This function is noop if @sd - * is NULL. - */ -void sysfs_put_active(struct sysfs_dirent *sd) -{ - int v; - - if (unlikely(!sd)) - return; - - if (likely(!sysfs_ignore_lockdep(sd))) - rwsem_release(&sd->dep_map, 1, _RET_IP_); - v = atomic_dec_return(&sd->s_active); - if (likely(v != SD_DEACTIVATED_BIAS)) - return; - - /* atomic_dec_return() is a mb(), we'll always see the updated - * sd->u.completion. - */ - complete(sd->u.completion); -} - -/** - * sysfs_deactivate - deactivate sysfs_dirent - * @sd: sysfs_dirent to deactivate - * - * Deny new active references and drain existing ones. - */ -static void sysfs_deactivate(struct sysfs_dirent *sd) -{ - DECLARE_COMPLETION_ONSTACK(wait); - int v; - - BUG_ON(!(sd->s_flags & SYSFS_FLAG_REMOVED)); - - if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF)) - return; - - sd->u.completion = (void *)&wait; - - rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_); - /* atomic_add_return() is a mb(), put_active() will always see - * the updated sd->u.completion. - */ - v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active); - - if (v != SD_DEACTIVATED_BIAS) { - lock_contended(&sd->dep_map, _RET_IP_); - wait_for_completion(&wait); - } - - lock_acquired(&sd->dep_map, _RET_IP_); - rwsem_release(&sd->dep_map, 1, _RET_IP_); -} - -static int sysfs_alloc_ino(unsigned int *pino) -{ - int ino, rc; - - retry: - spin_lock(&sysfs_ino_lock); - rc = ida_get_new_above(&sysfs_ino_ida, 2, &ino); - spin_unlock(&sysfs_ino_lock); - - if (rc == -EAGAIN) { - if (ida_pre_get(&sysfs_ino_ida, GFP_KERNEL)) - goto retry; - rc = -ENOMEM; - } - - *pino = ino; - return rc; -} - -static void sysfs_free_ino(unsigned int ino) -{ - spin_lock(&sysfs_ino_lock); - ida_remove(&sysfs_ino_ida, ino); - spin_unlock(&sysfs_ino_lock); -} - -void release_sysfs_dirent(struct sysfs_dirent *sd) -{ - struct sysfs_dirent *parent_sd; - - repeat: - /* Moving/renaming is always done while holding reference. - * sd->s_parent won't change beneath us. - */ - parent_sd = sd->s_parent; - - WARN(!(sd->s_flags & SYSFS_FLAG_REMOVED), - "sysfs: free using entry: %s/%s\n", - parent_sd ? parent_sd->s_name : "", sd->s_name); - - if (sysfs_type(sd) == SYSFS_KOBJ_LINK) - sysfs_put(sd->s_symlink.target_sd); - if (sysfs_type(sd) & SYSFS_COPY_NAME) - kfree(sd->s_name); - if (sd->s_iattr && sd->s_iattr->ia_secdata) - security_release_secctx(sd->s_iattr->ia_secdata, - sd->s_iattr->ia_secdata_len); - kfree(sd->s_iattr); - sysfs_free_ino(sd->s_ino); - kmem_cache_free(sysfs_dir_cachep, sd); - - sd = parent_sd; - if (sd && atomic_dec_and_test(&sd->s_count)) - goto repeat; -} - -static int sysfs_dentry_delete(const struct dentry *dentry) -{ - struct sysfs_dirent *sd = dentry->d_fsdata; - return !(sd && !(sd->s_flags & SYSFS_FLAG_REMOVED)); -} - -static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags) -{ - struct sysfs_dirent *sd; - int type; - - if (flags & LOOKUP_RCU) - return -ECHILD; - - sd = dentry->d_fsdata; - mutex_lock(&sysfs_mutex); - - /* The sysfs dirent has been deleted */ - if (sd->s_flags & SYSFS_FLAG_REMOVED) - goto out_bad; - - /* The sysfs dirent has been moved? */ - if (dentry->d_parent->d_fsdata != sd->s_parent) - goto out_bad; - - /* The sysfs dirent has been renamed */ - if (strcmp(dentry->d_name.name, sd->s_name) != 0) - goto out_bad; - - /* The sysfs dirent has been moved to a different namespace */ - type = KOBJ_NS_TYPE_NONE; - if (sd->s_parent) { - type = sysfs_ns_type(sd->s_parent); - if (type != KOBJ_NS_TYPE_NONE && - sysfs_info(dentry->d_sb)->ns[type] != sd->s_ns) - goto out_bad; - } - - mutex_unlock(&sysfs_mutex); -out_valid: - return 1; -out_bad: - /* Remove the dentry from the dcache hashes. - * If this is a deleted dentry we use d_drop instead of d_delete - * so sysfs doesn't need to cope with negative dentries. - * - * If this is a dentry that has simply been renamed we - * use d_drop to remove it from the dcache lookup on its - * old parent. If this dentry persists later when a lookup - * is performed at its new name the dentry will be readded - * to the dcache hashes. - */ - mutex_unlock(&sysfs_mutex); - - /* If we have submounts we must allow the vfs caches - * to lie about the state of the filesystem to prevent - * leaks and other nasty things. - */ - if (check_submounts_and_drop(dentry) != 0) - goto out_valid; - - return 0; -} - -static void sysfs_dentry_release(struct dentry *dentry) -{ - sysfs_put(dentry->d_fsdata); -} - -const struct dentry_operations sysfs_dentry_ops = { - .d_revalidate = sysfs_dentry_revalidate, - .d_delete = sysfs_dentry_delete, - .d_release = sysfs_dentry_release, -}; - -struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type) -{ - char *dup_name = NULL; - struct sysfs_dirent *sd; - - if (type & SYSFS_COPY_NAME) { - name = dup_name = kstrdup(name, GFP_KERNEL); - if (!name) - return NULL; - } - - sd = kmem_cache_zalloc(sysfs_dir_cachep, GFP_KERNEL); - if (!sd) - goto err_out1; - - if (sysfs_alloc_ino(&sd->s_ino)) - goto err_out2; - - atomic_set(&sd->s_count, 1); - atomic_set(&sd->s_active, 0); - - sd->s_name = name; - sd->s_mode = mode; - sd->s_flags = type | SYSFS_FLAG_REMOVED; - - return sd; - - err_out2: - kmem_cache_free(sysfs_dir_cachep, sd); - err_out1: - kfree(dup_name); - return NULL; -} - -/** - * sysfs_addrm_start - prepare for sysfs_dirent add/remove - * @acxt: pointer to sysfs_addrm_cxt to be used - * - * This function is called when the caller is about to add or remove - * sysfs_dirent. This function acquires sysfs_mutex. @acxt is used - * to keep and pass context to other addrm functions. - * - * LOCKING: - * Kernel thread context (may sleep). sysfs_mutex is locked on - * return. - */ -void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt) - __acquires(sysfs_mutex) -{ - memset(acxt, 0, sizeof(*acxt)); - - mutex_lock(&sysfs_mutex); -} - -/** - * __sysfs_add_one - add sysfs_dirent to parent without warning - * @acxt: addrm context to use - * @sd: sysfs_dirent to be added - * @parent_sd: the parent sysfs_dirent to add @sd to - * - * Get @parent_sd and set @sd->s_parent to it and increment nlink of - * the parent inode if @sd is a directory and link into the children - * list of the parent. - * - * This function should be called between calls to - * sysfs_addrm_start() and sysfs_addrm_finish() and should be - * passed the same @acxt as passed to sysfs_addrm_start(). - * - * LOCKING: - * Determined by sysfs_addrm_start(). - * - * RETURNS: - * 0 on success, -EEXIST if entry with the given name already - * exists. - */ -int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, - struct sysfs_dirent *parent_sd) -{ - struct sysfs_inode_attrs *ps_iattr; - int ret; - - if (!!sysfs_ns_type(parent_sd) != !!sd->s_ns) { - WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", - sysfs_ns_type(parent_sd) ? "required" : "invalid", - parent_sd->s_name, sd->s_name); - return -EINVAL; - } - - sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns); - sd->s_parent = sysfs_get(parent_sd); - - ret = sysfs_link_sibling(sd); - if (ret) - return ret; - - /* Update timestamps on the parent */ - ps_iattr = parent_sd->s_iattr; - if (ps_iattr) { - struct iattr *ps_iattrs = &ps_iattr->ia_iattr; - ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; - } - - /* Mark the entry added into directory tree */ - sd->s_flags &= ~SYSFS_FLAG_REMOVED; - - return 0; -} - /** * sysfs_pathname - return full path to sysfs dirent - * @sd: sysfs_dirent whose path we want + * @kn: kernfs_node whose path we want * @path: caller allocated buffer of size PATH_MAX * * Gives the name "/" to the sysfs_root entry; any path returned * is relative to wherever sysfs is mounted. */ -static char *sysfs_pathname(struct sysfs_dirent *sd, char *path) +static char *sysfs_pathname(struct kernfs_node *kn, char *path) { - if (sd->s_parent) { - sysfs_pathname(sd->s_parent, path); + if (kn->parent) { + sysfs_pathname(kn->parent, path); strlcat(path, "/", PATH_MAX); } - strlcat(path, sd->s_name, PATH_MAX); + strlcat(path, kn->name, PATH_MAX); return path; } -void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name) +void sysfs_warn_dup(struct kernfs_node *parent, const char *name) { char *path; @@ -489,445 +55,34 @@ void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name) } /** - * sysfs_add_one - add sysfs_dirent to parent - * @acxt: addrm context to use - * @sd: sysfs_dirent to be added - * @parent_sd: the parent sysfs_dirent to add @sd to - * - * Get @parent_sd and set @sd->s_parent to it and increment nlink of - * the parent inode if @sd is a directory and link into the children - * list of the parent. - * - * This function should be called between calls to - * sysfs_addrm_start() and sysfs_addrm_finish() and should be - * passed the same @acxt as passed to sysfs_addrm_start(). - * - * LOCKING: - * Determined by sysfs_addrm_start(). - * - * RETURNS: - * 0 on success, -EEXIST if entry with the given name already - * exists. - */ -int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, - struct sysfs_dirent *parent_sd) -{ - int ret; - - ret = __sysfs_add_one(acxt, sd, parent_sd); - - if (ret == -EEXIST) - sysfs_warn_dup(parent_sd, sd->s_name); - return ret; -} - -/** - * sysfs_remove_one - remove sysfs_dirent from parent - * @acxt: addrm context to use - * @sd: sysfs_dirent to be removed - * - * Mark @sd removed and drop nlink of parent inode if @sd is a - * directory. @sd is unlinked from the children list. - * - * This function should be called between calls to - * sysfs_addrm_start() and sysfs_addrm_finish() and should be - * passed the same @acxt as passed to sysfs_addrm_start(). - * - * LOCKING: - * Determined by sysfs_addrm_start(). - */ -static void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, - struct sysfs_dirent *sd) -{ - struct sysfs_inode_attrs *ps_iattr; - - /* - * Removal can be called multiple times on the same node. Only the - * first invocation is effective and puts the base ref. - */ - if (sd->s_flags & SYSFS_FLAG_REMOVED) - return; - - sysfs_unlink_sibling(sd); - - /* Update timestamps on the parent */ - ps_iattr = sd->s_parent->s_iattr; - if (ps_iattr) { - struct iattr *ps_iattrs = &ps_iattr->ia_iattr; - ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; - } - - sd->s_flags |= SYSFS_FLAG_REMOVED; - sd->u.removed_list = acxt->removed; - acxt->removed = sd; -} - -/** - * sysfs_addrm_finish - finish up sysfs_dirent add/remove - * @acxt: addrm context to finish up - * - * Finish up sysfs_dirent add/remove. Resources acquired by - * sysfs_addrm_start() are released and removed sysfs_dirents are - * cleaned up. - * - * LOCKING: - * sysfs_mutex is released. - */ -void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt) - __releases(sysfs_mutex) -{ - /* release resources acquired by sysfs_addrm_start() */ - mutex_unlock(&sysfs_mutex); - - /* kill removed sysfs_dirents */ - while (acxt->removed) { - struct sysfs_dirent *sd = acxt->removed; - - acxt->removed = sd->u.removed_list; - - sysfs_deactivate(sd); - sysfs_unmap_bin_file(sd); - sysfs_put(sd); - } -} - -/** - * sysfs_find_dirent - find sysfs_dirent with the given name - * @parent_sd: sysfs_dirent to search under - * @name: name to look for - * @ns: the namespace tag to use - * - * Look for sysfs_dirent with name @name under @parent_sd. - * - * LOCKING: - * mutex_lock(sysfs_mutex) - * - * RETURNS: - * Pointer to sysfs_dirent if found, NULL if not. - */ -struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, - const unsigned char *name, - const void *ns) -{ - struct rb_node *node = parent_sd->s_dir.children.rb_node; - unsigned int hash; - - if (!!sysfs_ns_type(parent_sd) != !!ns) { - WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", - sysfs_ns_type(parent_sd) ? "required" : "invalid", - parent_sd->s_name, name); - return NULL; - } - - hash = sysfs_name_hash(name, ns); - while (node) { - struct sysfs_dirent *sd; - int result; - - sd = to_sysfs_dirent(node); - result = sysfs_name_compare(hash, name, ns, sd); - if (result < 0) - node = node->rb_left; - else if (result > 0) - node = node->rb_right; - else - return sd; - } - return NULL; -} - -/** - * sysfs_get_dirent_ns - find and get sysfs_dirent with the given name - * @parent_sd: sysfs_dirent to search under - * @name: name to look for - * @ns: the namespace tag to use - * - * Look for sysfs_dirent with name @name under @parent_sd and get - * it if found. - * - * LOCKING: - * Kernel thread context (may sleep). Grabs sysfs_mutex. - * - * RETURNS: - * Pointer to sysfs_dirent if found, NULL if not. - */ -struct sysfs_dirent *sysfs_get_dirent_ns(struct sysfs_dirent *parent_sd, - const unsigned char *name, - const void *ns) -{ - struct sysfs_dirent *sd; - - mutex_lock(&sysfs_mutex); - sd = sysfs_find_dirent(parent_sd, name, ns); - sysfs_get(sd); - mutex_unlock(&sysfs_mutex); - - return sd; -} -EXPORT_SYMBOL_GPL(sysfs_get_dirent_ns); - -static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, - enum kobj_ns_type type, - const char *name, const void *ns, - struct sysfs_dirent **p_sd) -{ - umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; - struct sysfs_addrm_cxt acxt; - struct sysfs_dirent *sd; - int rc; - - /* allocate */ - sd = sysfs_new_dirent(name, mode, SYSFS_DIR); - if (!sd) - return -ENOMEM; - - sd->s_flags |= (type << SYSFS_NS_TYPE_SHIFT); - sd->s_ns = ns; - sd->s_dir.kobj = kobj; - - /* link in */ - sysfs_addrm_start(&acxt); - rc = sysfs_add_one(&acxt, sd, parent_sd); - sysfs_addrm_finish(&acxt); - - if (rc == 0) - *p_sd = sd; - else - sysfs_put(sd); - - return rc; -} - -int sysfs_create_subdir(struct kobject *kobj, const char *name, - struct sysfs_dirent **p_sd) -{ - return create_dir(kobj, kobj->sd, - KOBJ_NS_TYPE_NONE, name, NULL, p_sd); -} - -/** - * sysfs_read_ns_type: return associated ns_type - * @kobj: the kobject being queried - * - * Each kobject can be tagged with exactly one namespace type - * (i.e. network or user). Return the ns_type associated with - * this object if any - */ -static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj) -{ - const struct kobj_ns_type_operations *ops; - enum kobj_ns_type type; - - ops = kobj_child_ns_ops(kobj); - if (!ops) - return KOBJ_NS_TYPE_NONE; - - type = ops->type; - BUG_ON(type <= KOBJ_NS_TYPE_NONE); - BUG_ON(type >= KOBJ_NS_TYPES); - BUG_ON(!kobj_ns_type_registered(type)); - - return type; -} - -/** * sysfs_create_dir_ns - create a directory for an object with a namespace tag * @kobj: object we're creating directory for * @ns: the namespace tag to use */ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) { - enum kobj_ns_type type; - struct sysfs_dirent *parent_sd, *sd; - int error = 0; + struct kernfs_node *parent, *kn; BUG_ON(!kobj); if (kobj->parent) - parent_sd = kobj->parent->sd; + parent = kobj->parent->sd; else - parent_sd = &sysfs_root; + parent = sysfs_root_kn; - if (!parent_sd) + if (!parent) return -ENOENT; - type = sysfs_read_ns_type(kobj); - - error = create_dir(kobj, parent_sd, type, kobject_name(kobj), ns, &sd); - if (!error) - kobj->sd = sd; - return error; -} - -static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry, - unsigned int flags) -{ - struct dentry *ret = NULL; - struct dentry *parent = dentry->d_parent; - struct sysfs_dirent *parent_sd = parent->d_fsdata; - struct sysfs_dirent *sd; - struct inode *inode; - enum kobj_ns_type type; - const void *ns; - - mutex_lock(&sysfs_mutex); - - type = sysfs_ns_type(parent_sd); - ns = sysfs_info(dir->i_sb)->ns[type]; - - sd = sysfs_find_dirent(parent_sd, dentry->d_name.name, ns); - - /* no such entry */ - if (!sd) { - ret = ERR_PTR(-ENOENT); - goto out_unlock; - } - dentry->d_fsdata = sysfs_get(sd); - - /* attach dentry and inode */ - inode = sysfs_get_inode(dir->i_sb, sd); - if (!inode) { - ret = ERR_PTR(-ENOMEM); - goto out_unlock; - } - - /* instantiate and hash dentry */ - ret = d_materialise_unique(dentry, inode); - out_unlock: - mutex_unlock(&sysfs_mutex); - return ret; -} - -const struct inode_operations sysfs_dir_inode_operations = { - .lookup = sysfs_lookup, - .permission = sysfs_permission, - .setattr = sysfs_setattr, - .getattr = sysfs_getattr, - .setxattr = sysfs_setxattr, -}; - -static struct sysfs_dirent *sysfs_leftmost_descendant(struct sysfs_dirent *pos) -{ - struct sysfs_dirent *last; - - while (true) { - struct rb_node *rbn; - - last = pos; - - if (sysfs_type(pos) != SYSFS_DIR) - break; - - rbn = rb_first(&pos->s_dir.children); - if (!rbn) - break; - - pos = to_sysfs_dirent(rbn); - } - - return last; -} - -/** - * sysfs_next_descendant_post - find the next descendant for post-order walk - * @pos: the current position (%NULL to initiate traversal) - * @root: sysfs_dirent whose descendants to walk - * - * Find the next descendant to visit for post-order traversal of @root's - * descendants. @root is included in the iteration and the last node to be - * visited. - */ -static struct sysfs_dirent *sysfs_next_descendant_post(struct sysfs_dirent *pos, - struct sysfs_dirent *root) -{ - struct rb_node *rbn; - - lockdep_assert_held(&sysfs_mutex); - - /* if first iteration, visit leftmost descendant which may be root */ - if (!pos) - return sysfs_leftmost_descendant(root); - - /* if we visited @root, we're done */ - if (pos == root) - return NULL; - - /* if there's an unvisited sibling, visit its leftmost descendant */ - rbn = rb_next(&pos->s_rb); - if (rbn) - return sysfs_leftmost_descendant(to_sysfs_dirent(rbn)); - - /* no sibling left, visit parent */ - return pos->s_parent; -} - -static void __sysfs_remove(struct sysfs_addrm_cxt *acxt, - struct sysfs_dirent *sd) -{ - struct sysfs_dirent *pos, *next; - - if (!sd) - return; - - pr_debug("sysfs %s: removing\n", sd->s_name); - - next = NULL; - do { - pos = next; - next = sysfs_next_descendant_post(pos, sd); - if (pos) - sysfs_remove_one(acxt, pos); - } while (next); -} - -/** - * sysfs_remove - remove a sysfs_dirent recursively - * @sd: the sysfs_dirent to remove - * - * Remove @sd along with all its subdirectories and files. - */ -void sysfs_remove(struct sysfs_dirent *sd) -{ - struct sysfs_addrm_cxt acxt; - - sysfs_addrm_start(&acxt); - __sysfs_remove(&acxt, sd); - sysfs_addrm_finish(&acxt); -} - -/** - * sysfs_hash_and_remove - find a sysfs_dirent by name and remove it - * @dir_sd: parent of the target - * @name: name of the sysfs_dirent to remove - * @ns: namespace tag of the sysfs_dirent to remove - * - * Look for the sysfs_dirent with @name and @ns under @dir_sd and remove - * it. Returns 0 on success, -ENOENT if such entry doesn't exist. - */ -int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name, - const void *ns) -{ - struct sysfs_addrm_cxt acxt; - struct sysfs_dirent *sd; - - if (!dir_sd) { - WARN(1, KERN_WARNING "sysfs: can not remove '%s', no directory\n", - name); - return -ENOENT; + kn = kernfs_create_dir_ns(parent, kobject_name(kobj), + S_IRWXU | S_IRUGO | S_IXUGO, kobj, ns); + if (IS_ERR(kn)) { + if (PTR_ERR(kn) == -EEXIST) + sysfs_warn_dup(parent, kobject_name(kobj)); + return PTR_ERR(kn); } - sysfs_addrm_start(&acxt); - - sd = sysfs_find_dirent(dir_sd, name, ns); - if (sd) - __sysfs_remove(&acxt, sd); - - sysfs_addrm_finish(&acxt); - - if (sd) - return 0; - else - return -ENOENT; + kobj->sd = kn; + return 0; } /** @@ -940,207 +95,47 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name, */ void sysfs_remove_dir(struct kobject *kobj) { - struct sysfs_dirent *sd = kobj->sd; + struct kernfs_node *kn = kobj->sd; /* * In general, kboject owner is responsible for ensuring removal * doesn't race with other operations and sysfs doesn't provide any * protection; however, when @kobj is used as a symlink target, the * symlinking entity usually doesn't own @kobj and thus has no - * control over removal. @kobj->sd may be removed anytime and - * symlink code may end up dereferencing an already freed sd. + * control over removal. @kobj->sd may be removed anytime + * and symlink code may end up dereferencing an already freed node. * - * sysfs_symlink_target_lock synchronizes @kobj->sd disassociation - * against symlink operations so that symlink code can safely - * dereference @kobj->sd. + * sysfs_symlink_target_lock synchronizes @kobj->sd + * disassociation against symlink operations so that symlink code + * can safely dereference @kobj->sd. */ spin_lock(&sysfs_symlink_target_lock); kobj->sd = NULL; spin_unlock(&sysfs_symlink_target_lock); - if (sd) { - WARN_ON_ONCE(sysfs_type(sd) != SYSFS_DIR); - sysfs_remove(sd); + if (kn) { + WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR); + kernfs_remove(kn); } } -int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd, - const char *new_name, const void *new_ns) -{ - int error; - - mutex_lock(&sysfs_mutex); - - error = 0; - if ((sd->s_parent == new_parent_sd) && (sd->s_ns == new_ns) && - (strcmp(sd->s_name, new_name) == 0)) - goto out; /* nothing to rename */ - - error = -EEXIST; - if (sysfs_find_dirent(new_parent_sd, new_name, new_ns)) - goto out; - - /* rename sysfs_dirent */ - if (strcmp(sd->s_name, new_name) != 0) { - error = -ENOMEM; - new_name = kstrdup(new_name, GFP_KERNEL); - if (!new_name) - goto out; - - kfree(sd->s_name); - sd->s_name = new_name; - } - - /* - * Move to the appropriate place in the appropriate directories rbtree. - */ - sysfs_unlink_sibling(sd); - sysfs_get(new_parent_sd); - sysfs_put(sd->s_parent); - sd->s_ns = new_ns; - sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns); - sd->s_parent = new_parent_sd; - sysfs_link_sibling(sd); - - error = 0; - out: - mutex_unlock(&sysfs_mutex); - return error; -} - int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, const void *new_ns) { - struct sysfs_dirent *parent_sd = kobj->sd->s_parent; + struct kernfs_node *parent = kobj->sd->parent; - return sysfs_rename(kobj->sd, parent_sd, new_name, new_ns); + return kernfs_rename_ns(kobj->sd, parent, new_name, new_ns); } int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, const void *new_ns) { - struct sysfs_dirent *sd = kobj->sd; - struct sysfs_dirent *new_parent_sd; + struct kernfs_node *kn = kobj->sd; + struct kernfs_node *new_parent; - BUG_ON(!sd->s_parent); - new_parent_sd = new_parent_kobj && new_parent_kobj->sd ? - new_parent_kobj->sd : &sysfs_root; + BUG_ON(!kn->parent); + new_parent = new_parent_kobj && new_parent_kobj->sd ? + new_parent_kobj->sd : sysfs_root_kn; - return sysfs_rename(sd, new_parent_sd, sd->s_name, new_ns); + return kernfs_rename_ns(kn, new_parent, kn->name, new_ns); } - -/* Relationship between s_mode and the DT_xxx types */ -static inline unsigned char dt_type(struct sysfs_dirent *sd) -{ - return (sd->s_mode >> 12) & 15; -} - -static int sysfs_dir_release(struct inode *inode, struct file *filp) -{ - sysfs_put(filp->private_data); - return 0; -} - -static struct sysfs_dirent *sysfs_dir_pos(const void *ns, - struct sysfs_dirent *parent_sd, loff_t hash, struct sysfs_dirent *pos) -{ - if (pos) { - int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) && - pos->s_parent == parent_sd && - hash == pos->s_hash; - sysfs_put(pos); - if (!valid) - pos = NULL; - } - if (!pos && (hash > 1) && (hash < INT_MAX)) { - struct rb_node *node = parent_sd->s_dir.children.rb_node; - while (node) { - pos = to_sysfs_dirent(node); - - if (hash < pos->s_hash) - node = node->rb_left; - else if (hash > pos->s_hash) - node = node->rb_right; - else - break; - } - } - /* Skip over entries in the wrong namespace */ - while (pos && pos->s_ns != ns) { - struct rb_node *node = rb_next(&pos->s_rb); - if (!node) - pos = NULL; - else - pos = to_sysfs_dirent(node); - } - return pos; -} - -static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns, - struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos) -{ - pos = sysfs_dir_pos(ns, parent_sd, ino, pos); - if (pos) - do { - struct rb_node *node = rb_next(&pos->s_rb); - if (!node) - pos = NULL; - else - pos = to_sysfs_dirent(node); - } while (pos && pos->s_ns != ns); - return pos; -} - -static int sysfs_readdir(struct file *file, struct dir_context *ctx) -{ - struct dentry *dentry = file->f_path.dentry; - struct sysfs_dirent *parent_sd = dentry->d_fsdata; - struct sysfs_dirent *pos = file->private_data; - enum kobj_ns_type type; - const void *ns; - - type = sysfs_ns_type(parent_sd); - ns = sysfs_info(dentry->d_sb)->ns[type]; - - if (!dir_emit_dots(file, ctx)) - return 0; - mutex_lock(&sysfs_mutex); - for (pos = sysfs_dir_pos(ns, parent_sd, ctx->pos, pos); - pos; - pos = sysfs_dir_next_pos(ns, parent_sd, ctx->pos, pos)) { - const char *name = pos->s_name; - unsigned int type = dt_type(pos); - int len = strlen(name); - ino_t ino = pos->s_ino; - ctx->pos = pos->s_hash; - file->private_data = sysfs_get(pos); - - mutex_unlock(&sysfs_mutex); - if (!dir_emit(ctx, name, len, ino, type)) - return 0; - mutex_lock(&sysfs_mutex); - } - mutex_unlock(&sysfs_mutex); - file->private_data = NULL; - ctx->pos = INT_MAX; - return 0; -} - -static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence) -{ - struct inode *inode = file_inode(file); - loff_t ret; - - mutex_lock(&inode->i_mutex); - ret = generic_file_llseek(file, offset, whence); - mutex_unlock(&inode->i_mutex); - - return ret; -} - -const struct file_operations sysfs_dir_operations = { - .read = generic_read_dir, - .iterate = sysfs_readdir, - .release = sysfs_dir_release, - .llseek = sysfs_dir_llseek, -}; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index b94f936..810cf6e 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -14,70 +14,23 @@ #include <linux/kobject.h> #include <linux/kallsyms.h> #include <linux/slab.h> -#include <linux/fsnotify.h> -#include <linux/namei.h> -#include <linux/poll.h> #include <linux/list.h> #include <linux/mutex.h> -#include <linux/limits.h> -#include <linux/uaccess.h> #include <linux/seq_file.h> -#include <linux/mm.h> #include "sysfs.h" +#include "../kernfs/kernfs-internal.h" /* - * There's one sysfs_open_file for each open file and one sysfs_open_dirent - * for each sysfs_dirent with one or more open files. - * - * sysfs_dirent->s_attr.open points to sysfs_open_dirent. s_attr.open is - * protected by sysfs_open_dirent_lock. - * - * filp->private_data points to seq_file whose ->private points to - * sysfs_open_file. sysfs_open_files are chained at - * sysfs_open_dirent->files, which is protected by sysfs_open_file_mutex. - */ -static DEFINE_SPINLOCK(sysfs_open_dirent_lock); -static DEFINE_MUTEX(sysfs_open_file_mutex); - -struct sysfs_open_dirent { - atomic_t refcnt; - atomic_t event; - wait_queue_head_t poll; - struct list_head files; /* goes through sysfs_open_file.list */ -}; - -struct sysfs_open_file { - struct sysfs_dirent *sd; - struct file *file; - struct mutex mutex; - int event; - struct list_head list; - - bool mmapped; - const struct vm_operations_struct *vm_ops; -}; - -static bool sysfs_is_bin(struct sysfs_dirent *sd) -{ - return sysfs_type(sd) == SYSFS_KOBJ_BIN_ATTR; -} - -static struct sysfs_open_file *sysfs_of(struct file *file) -{ - return ((struct seq_file *)file->private_data)->private; -} - -/* - * Determine ktype->sysfs_ops for the given sysfs_dirent. This function + * Determine ktype->sysfs_ops for the given kernfs_node. This function * must be called while holding an active reference. */ -static const struct sysfs_ops *sysfs_file_ops(struct sysfs_dirent *sd) +static const struct sysfs_ops *sysfs_file_ops(struct kernfs_node *kn) { - struct kobject *kobj = sd->s_parent->s_dir.kobj; + struct kobject *kobj = kn->parent->priv; - if (!sysfs_ignore_lockdep(sd)) - lockdep_assert_held(sd); + if (kn->flags & KERNFS_LOCKDEP) + lockdep_assert_held(kn); return kobj->ktype ? kobj->ktype->sysfs_ops : NULL; } @@ -86,13 +39,13 @@ static const struct sysfs_ops *sysfs_file_ops(struct sysfs_dirent *sd) * details like buffering and seeking. The following function pipes * sysfs_ops->show() result through seq_file. */ -static int sysfs_seq_show(struct seq_file *sf, void *v) +static int sysfs_kf_seq_show(struct seq_file *sf, void *v) { - struct sysfs_open_file *of = sf->private; - struct kobject *kobj = of->sd->s_parent->s_dir.kobj; - const struct sysfs_ops *ops; - char *buf; + struct kernfs_open_file *of = sf->private; + struct kobject *kobj = of->kn->parent->priv; + const struct sysfs_ops *ops = sysfs_file_ops(of->kn); ssize_t count; + char *buf; /* acquire buffer and ensure that it's >= PAGE_SIZE */ count = seq_get_buf(sf, &buf); @@ -102,34 +55,15 @@ static int sysfs_seq_show(struct seq_file *sf, void *v) } /* - * Need @of->sd for attr and ops, its parent for kobj. @of->mutex - * nests outside active ref and is just to ensure that the ops - * aren't called concurrently for the same open file. + * Invoke show(). Control may reach here via seq file lseek even + * if @ops->show() isn't implemented. */ - mutex_lock(&of->mutex); - if (!sysfs_get_active(of->sd)) { - mutex_unlock(&of->mutex); - return -ENODEV; + if (ops->show) { + count = ops->show(kobj, of->kn->priv, buf); + if (count < 0) + return count; } - of->event = atomic_read(&of->sd->s_attr.open->event); - - /* - * Lookup @ops and invoke show(). Control may reach here via seq - * file lseek even if @ops->show() isn't implemented. - */ - ops = sysfs_file_ops(of->sd); - if (ops->show) - count = ops->show(kobj, of->sd->s_attr.attr, buf); - else - count = 0; - - sysfs_put_active(of->sd); - mutex_unlock(&of->mutex); - - if (count < 0) - return count; - /* * The code works fine with PAGE_SIZE return but it's likely to * indicate truncated result or overflow in normal use cases. @@ -144,728 +78,194 @@ static int sysfs_seq_show(struct seq_file *sf, void *v) return 0; } -/* - * Read method for bin files. As reading a bin file can have side-effects, - * the exact offset and bytes specified in read(2) call should be passed to - * the read callback making it difficult to use seq_file. Implement - * simplistic custom buffering for bin files. - */ -static ssize_t sysfs_bin_read(struct file *file, char __user *userbuf, - size_t bytes, loff_t *off) +static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf, + size_t count, loff_t pos) { - struct sysfs_open_file *of = sysfs_of(file); - struct bin_attribute *battr = of->sd->s_attr.bin_attr; - struct kobject *kobj = of->sd->s_parent->s_dir.kobj; - loff_t size = file_inode(file)->i_size; - int count = min_t(size_t, bytes, PAGE_SIZE); - loff_t offs = *off; - char *buf; + struct bin_attribute *battr = of->kn->priv; + struct kobject *kobj = of->kn->parent->priv; + loff_t size = file_inode(of->file)->i_size; - if (!bytes) + if (!count) return 0; if (size) { - if (offs > size) + if (pos > size) return 0; - if (offs + count > size) - count = size - offs; - } - - buf = kmalloc(count, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - /* need of->sd for battr, its parent for kobj */ - mutex_lock(&of->mutex); - if (!sysfs_get_active(of->sd)) { - count = -ENODEV; - mutex_unlock(&of->mutex); - goto out_free; - } - - if (battr->read) - count = battr->read(file, kobj, battr, buf, offs, count); - else - count = -EIO; - - sysfs_put_active(of->sd); - mutex_unlock(&of->mutex); - - if (count < 0) - goto out_free; - - if (copy_to_user(userbuf, buf, count)) { - count = -EFAULT; - goto out_free; + if (pos + count > size) + count = size - pos; } - pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count); - - *off = offs + count; + if (!battr->read) + return -EIO; - out_free: - kfree(buf); - return count; + return battr->read(of->file, kobj, battr, buf, pos, count); } -/** - * flush_write_buffer - push buffer to kobject - * @of: open file - * @buf: data buffer for file - * @off: file offset to write to - * @count: number of bytes - * - * Get the correct pointers for the kobject and the attribute we're dealing - * with, then call the store() method for it with @buf. - */ -static int flush_write_buffer(struct sysfs_open_file *of, char *buf, loff_t off, - size_t count) +/* kernfs write callback for regular sysfs files */ +static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf, + size_t count, loff_t pos) { - struct kobject *kobj = of->sd->s_parent->s_dir.kobj; - int rc = 0; - - /* - * Need @of->sd for attr and ops, its parent for kobj. @of->mutex - * nests outside active ref and is just to ensure that the ops - * aren't called concurrently for the same open file. - */ - mutex_lock(&of->mutex); - if (!sysfs_get_active(of->sd)) { - mutex_unlock(&of->mutex); - return -ENODEV; - } + const struct sysfs_ops *ops = sysfs_file_ops(of->kn); + struct kobject *kobj = of->kn->parent->priv; - if (sysfs_is_bin(of->sd)) { - struct bin_attribute *battr = of->sd->s_attr.bin_attr; - - rc = -EIO; - if (battr->write) - rc = battr->write(of->file, kobj, battr, buf, off, - count); - } else { - const struct sysfs_ops *ops = sysfs_file_ops(of->sd); - - rc = ops->store(kobj, of->sd->s_attr.attr, buf, count); - } - - sysfs_put_active(of->sd); - mutex_unlock(&of->mutex); + if (!count) + return 0; - return rc; + return ops->store(kobj, of->kn->priv, buf, count); } -/** - * sysfs_write_file - write an attribute - * @file: file pointer - * @user_buf: data to write - * @count: number of bytes - * @ppos: starting offset - * - * Copy data in from userland and pass it to the matching - * sysfs_ops->store() by invoking flush_write_buffer(). - * - * There is no easy way for us to know if userspace is only doing a partial - * write, so we don't support them. We expect the entire buffer to come on - * the first write. Hint: if you're writing a value, first read the file, - * modify only the the value you're changing, then write entire buffer - * back. - */ -static ssize_t sysfs_write_file(struct file *file, const char __user *user_buf, - size_t count, loff_t *ppos) +/* kernfs write callback for bin sysfs files */ +static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf, + size_t count, loff_t pos) { - struct sysfs_open_file *of = sysfs_of(file); - ssize_t len = min_t(size_t, count, PAGE_SIZE); - loff_t size = file_inode(file)->i_size; - char *buf; + struct bin_attribute *battr = of->kn->priv; + struct kobject *kobj = of->kn->parent->priv; + loff_t size = file_inode(of->file)->i_size; - if (sysfs_is_bin(of->sd) && size) { - if (size <= *ppos) + if (size) { + if (size <= pos) return 0; - len = min_t(ssize_t, len, size - *ppos); + count = min_t(ssize_t, count, size - pos); } - - if (!len) + if (!count) return 0; - buf = kmalloc(len + 1, GFP_KERNEL); - if (!buf) - return -ENOMEM; + if (!battr->write) + return -EIO; - if (copy_from_user(buf, user_buf, len)) { - len = -EFAULT; - goto out_free; - } - buf[len] = '\0'; /* guarantee string termination */ - - len = flush_write_buffer(of, buf, *ppos, len); - if (len > 0) - *ppos += len; -out_free: - kfree(buf); - return len; -} - -static void sysfs_bin_vma_open(struct vm_area_struct *vma) -{ - struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); - - if (!of->vm_ops) - return; - - if (!sysfs_get_active(of->sd)) - return; - - if (of->vm_ops->open) - of->vm_ops->open(vma); - - sysfs_put_active(of->sd); + return battr->write(of->file, kobj, battr, buf, pos, count); } -static int sysfs_bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int sysfs_kf_bin_mmap(struct kernfs_open_file *of, + struct vm_area_struct *vma) { - struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); - int ret; + struct bin_attribute *battr = of->kn->priv; + struct kobject *kobj = of->kn->parent->priv; - if (!of->vm_ops) - return VM_FAULT_SIGBUS; - - if (!sysfs_get_active(of->sd)) - return VM_FAULT_SIGBUS; - - ret = VM_FAULT_SIGBUS; - if (of->vm_ops->fault) - ret = of->vm_ops->fault(vma, vmf); - - sysfs_put_active(of->sd); - return ret; + return battr->mmap(of->file, kobj, battr, vma); } -static int sysfs_bin_page_mkwrite(struct vm_area_struct *vma, - struct vm_fault *vmf) +void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr) { - struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); - int ret; - - if (!of->vm_ops) - return VM_FAULT_SIGBUS; + struct kernfs_node *kn = kobj->sd, *tmp; - if (!sysfs_get_active(of->sd)) - return VM_FAULT_SIGBUS; - - ret = 0; - if (of->vm_ops->page_mkwrite) - ret = of->vm_ops->page_mkwrite(vma, vmf); + if (kn && dir) + kn = kernfs_find_and_get(kn, dir); else - file_update_time(file); - - sysfs_put_active(of->sd); - return ret; -} - -static int sysfs_bin_access(struct vm_area_struct *vma, unsigned long addr, - void *buf, int len, int write) -{ - struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); - int ret; - - if (!of->vm_ops) - return -EINVAL; - - if (!sysfs_get_active(of->sd)) - return -EINVAL; - - ret = -EINVAL; - if (of->vm_ops->access) - ret = of->vm_ops->access(vma, addr, buf, len, write); - - sysfs_put_active(of->sd); - return ret; -} - -#ifdef CONFIG_NUMA -static int sysfs_bin_set_policy(struct vm_area_struct *vma, - struct mempolicy *new) -{ - struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); - int ret; - - if (!of->vm_ops) - return 0; - - if (!sysfs_get_active(of->sd)) - return -EINVAL; - - ret = 0; - if (of->vm_ops->set_policy) - ret = of->vm_ops->set_policy(vma, new); - - sysfs_put_active(of->sd); - return ret; -} - -static struct mempolicy *sysfs_bin_get_policy(struct vm_area_struct *vma, - unsigned long addr) -{ - struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); - struct mempolicy *pol; - - if (!of->vm_ops) - return vma->vm_policy; - - if (!sysfs_get_active(of->sd)) - return vma->vm_policy; - - pol = vma->vm_policy; - if (of->vm_ops->get_policy) - pol = of->vm_ops->get_policy(vma, addr); - - sysfs_put_active(of->sd); - return pol; -} - -static int sysfs_bin_migrate(struct vm_area_struct *vma, const nodemask_t *from, - const nodemask_t *to, unsigned long flags) -{ - struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); - int ret; - - if (!of->vm_ops) - return 0; - - if (!sysfs_get_active(of->sd)) - return 0; - - ret = 0; - if (of->vm_ops->migrate) - ret = of->vm_ops->migrate(vma, from, to, flags); - - sysfs_put_active(of->sd); - return ret; -} -#endif - -static const struct vm_operations_struct sysfs_bin_vm_ops = { - .open = sysfs_bin_vma_open, - .fault = sysfs_bin_fault, - .page_mkwrite = sysfs_bin_page_mkwrite, - .access = sysfs_bin_access, -#ifdef CONFIG_NUMA - .set_policy = sysfs_bin_set_policy, - .get_policy = sysfs_bin_get_policy, - .migrate = sysfs_bin_migrate, -#endif -}; - -static int sysfs_bin_mmap(struct file *file, struct vm_area_struct *vma) -{ - struct sysfs_open_file *of = sysfs_of(file); - struct bin_attribute *battr = of->sd->s_attr.bin_attr; - struct kobject *kobj = of->sd->s_parent->s_dir.kobj; - int rc; - - mutex_lock(&of->mutex); - - /* need of->sd for battr, its parent for kobj */ - rc = -ENODEV; - if (!sysfs_get_active(of->sd)) - goto out_unlock; - - if (!battr->mmap) - goto out_put; - - rc = battr->mmap(file, kobj, battr, vma); - if (rc) - goto out_put; - - /* - * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() - * to satisfy versions of X which crash if the mmap fails: that - * substitutes a new vm_file, and we don't then want bin_vm_ops. - */ - if (vma->vm_file != file) - goto out_put; - - rc = -EINVAL; - if (of->mmapped && of->vm_ops != vma->vm_ops) - goto out_put; + kernfs_get(kn); - /* - * It is not possible to successfully wrap close. - * So error if someone is trying to use close. - */ - rc = -EINVAL; - if (vma->vm_ops && vma->vm_ops->close) - goto out_put; - - rc = 0; - of->mmapped = 1; - of->vm_ops = vma->vm_ops; - vma->vm_ops = &sysfs_bin_vm_ops; -out_put: - sysfs_put_active(of->sd); -out_unlock: - mutex_unlock(&of->mutex); - - return rc; -} - -/** - * sysfs_get_open_dirent - get or create sysfs_open_dirent - * @sd: target sysfs_dirent - * @of: sysfs_open_file for this instance of open - * - * If @sd->s_attr.open exists, increment its reference count; - * otherwise, create one. @of is chained to the files list. - * - * LOCKING: - * Kernel thread context (may sleep). - * - * RETURNS: - * 0 on success, -errno on failure. - */ -static int sysfs_get_open_dirent(struct sysfs_dirent *sd, - struct sysfs_open_file *of) -{ - struct sysfs_open_dirent *od, *new_od = NULL; - - retry: - mutex_lock(&sysfs_open_file_mutex); - spin_lock_irq(&sysfs_open_dirent_lock); - - if (!sd->s_attr.open && new_od) { - sd->s_attr.open = new_od; - new_od = NULL; + if (kn && attr) { + tmp = kernfs_find_and_get(kn, attr); + kernfs_put(kn); + kn = tmp; } - od = sd->s_attr.open; - if (od) { - atomic_inc(&od->refcnt); - list_add_tail(&of->list, &od->files); - } - - spin_unlock_irq(&sysfs_open_dirent_lock); - mutex_unlock(&sysfs_open_file_mutex); - - if (od) { - kfree(new_od); - return 0; + if (kn) { + kernfs_notify(kn); + kernfs_put(kn); } +} +EXPORT_SYMBOL_GPL(sysfs_notify); - /* not there, initialize a new one and retry */ - new_od = kmalloc(sizeof(*new_od), GFP_KERNEL); - if (!new_od) - return -ENOMEM; +static const struct kernfs_ops sysfs_file_kfops_empty = { +}; - atomic_set(&new_od->refcnt, 0); - atomic_set(&new_od->event, 1); - init_waitqueue_head(&new_od->poll); - INIT_LIST_HEAD(&new_od->files); - goto retry; -} +static const struct kernfs_ops sysfs_file_kfops_ro = { + .seq_show = sysfs_kf_seq_show, +}; -/** - * sysfs_put_open_dirent - put sysfs_open_dirent - * @sd: target sysfs_dirent - * @of: associated sysfs_open_file - * - * Put @sd->s_attr.open and unlink @of from the files list. If - * reference count reaches zero, disassociate and free it. - * - * LOCKING: - * None. - */ -static void sysfs_put_open_dirent(struct sysfs_dirent *sd, - struct sysfs_open_file *of) -{ - struct sysfs_open_dirent *od = sd->s_attr.open; - unsigned long flags; +static const struct kernfs_ops sysfs_file_kfops_wo = { + .write = sysfs_kf_write, +}; - mutex_lock(&sysfs_open_file_mutex); - spin_lock_irqsave(&sysfs_open_dirent_lock, flags); +static const struct kernfs_ops sysfs_file_kfops_rw = { + .seq_show = sysfs_kf_seq_show, + .write = sysfs_kf_write, +}; - if (of) - list_del(&of->list); +static const struct kernfs_ops sysfs_bin_kfops_ro = { + .read = sysfs_kf_bin_read, +}; - if (atomic_dec_and_test(&od->refcnt)) - sd->s_attr.open = NULL; - else - od = NULL; +static const struct kernfs_ops sysfs_bin_kfops_wo = { + .write = sysfs_kf_bin_write, +}; - spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); - mutex_unlock(&sysfs_open_file_mutex); +static const struct kernfs_ops sysfs_bin_kfops_rw = { + .read = sysfs_kf_bin_read, + .write = sysfs_kf_bin_write, +}; - kfree(od); -} +static const struct kernfs_ops sysfs_bin_kfops_mmap = { + .read = sysfs_kf_bin_read, + .write = sysfs_kf_bin_write, + .mmap = sysfs_kf_bin_mmap, +}; -static int sysfs_open_file(struct inode *inode, struct file *file) +int sysfs_add_file_mode_ns(struct kernfs_node *parent, + const struct attribute *attr, bool is_bin, + umode_t mode, const void *ns) { - struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; - struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; - struct sysfs_open_file *of; - bool has_read, has_write, has_mmap; - int error = -EACCES; - - /* need attr_sd for attr and ops, its parent for kobj */ - if (!sysfs_get_active(attr_sd)) - return -ENODEV; + struct lock_class_key *key = NULL; + const struct kernfs_ops *ops; + struct kernfs_node *kn; + loff_t size; - if (sysfs_is_bin(attr_sd)) { - struct bin_attribute *battr = attr_sd->s_attr.bin_attr; - - has_read = battr->read || battr->mmap; - has_write = battr->write || battr->mmap; - has_mmap = battr->mmap; - } else { - const struct sysfs_ops *ops = sysfs_file_ops(attr_sd); + if (!is_bin) { + struct kobject *kobj = parent->priv; + const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops; /* every kobject with an attribute needs a ktype assigned */ - if (WARN(!ops, KERN_ERR + if (WARN(!sysfs_ops, KERN_ERR "missing sysfs attribute operations for kobject: %s\n", kobject_name(kobj))) - goto err_out; - - has_read = ops->show; - has_write = ops->store; - has_mmap = false; - } - - /* check perms and supported operations */ - if ((file->f_mode & FMODE_WRITE) && - (!(inode->i_mode & S_IWUGO) || !has_write)) - goto err_out; - - if ((file->f_mode & FMODE_READ) && - (!(inode->i_mode & S_IRUGO) || !has_read)) - goto err_out; - - /* allocate a sysfs_open_file for the file */ - error = -ENOMEM; - of = kzalloc(sizeof(struct sysfs_open_file), GFP_KERNEL); - if (!of) - goto err_out; - - /* - * The following is done to give a different lockdep key to - * @of->mutex for files which implement mmap. This is a rather - * crude way to avoid false positive lockdep warning around - * mm->mmap_sem - mmap nests @of->mutex under mm->mmap_sem and - * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under - * which mm->mmap_sem nests, while holding @of->mutex. As each - * open file has a separate mutex, it's okay as long as those don't - * happen on the same file. At this point, we can't easily give - * each file a separate locking class. Let's differentiate on - * whether the file has mmap or not for now. - */ - if (has_mmap) - mutex_init(&of->mutex); - else - mutex_init(&of->mutex); - - of->sd = attr_sd; - of->file = file; - - /* - * Always instantiate seq_file even if read access doesn't use - * seq_file or is not requested. This unifies private data access - * and readable regular files are the vast majority anyway. - */ - if (sysfs_is_bin(attr_sd)) - error = single_open(file, NULL, of); - else - error = single_open(file, sysfs_seq_show, of); - if (error) - goto err_free; - - /* seq_file clears PWRITE unconditionally, restore it if WRITE */ - if (file->f_mode & FMODE_WRITE) - file->f_mode |= FMODE_PWRITE; - - /* make sure we have open dirent struct */ - error = sysfs_get_open_dirent(attr_sd, of); - if (error) - goto err_close; - - /* open succeeded, put active references */ - sysfs_put_active(attr_sd); - return 0; - -err_close: - single_release(inode, file); -err_free: - kfree(of); -err_out: - sysfs_put_active(attr_sd); - return error; -} - -static int sysfs_release(struct inode *inode, struct file *filp) -{ - struct sysfs_dirent *sd = filp->f_path.dentry->d_fsdata; - struct sysfs_open_file *of = sysfs_of(filp); - - sysfs_put_open_dirent(sd, of); - single_release(inode, filp); - kfree(of); - - return 0; -} - -void sysfs_unmap_bin_file(struct sysfs_dirent *sd) -{ - struct sysfs_open_dirent *od; - struct sysfs_open_file *of; - - if (!sysfs_is_bin(sd)) - return; - - spin_lock_irq(&sysfs_open_dirent_lock); - od = sd->s_attr.open; - if (od) - atomic_inc(&od->refcnt); - spin_unlock_irq(&sysfs_open_dirent_lock); - if (!od) - return; - - mutex_lock(&sysfs_open_file_mutex); - list_for_each_entry(of, &od->files, list) { - struct inode *inode = file_inode(of->file); - unmap_mapping_range(inode->i_mapping, 0, 0, 1); + return -EINVAL; + + if (sysfs_ops->show && sysfs_ops->store) + ops = &sysfs_file_kfops_rw; + else if (sysfs_ops->show) + ops = &sysfs_file_kfops_ro; + else if (sysfs_ops->store) + ops = &sysfs_file_kfops_wo; + else + ops = &sysfs_file_kfops_empty; + + size = PAGE_SIZE; + } else { + struct bin_attribute *battr = (void *)attr; + + if (battr->mmap) + ops = &sysfs_bin_kfops_mmap; + else if (battr->read && battr->write) + ops = &sysfs_bin_kfops_rw; + else if (battr->read) + ops = &sysfs_bin_kfops_ro; + else if (battr->write) + ops = &sysfs_bin_kfops_wo; + else + ops = &sysfs_file_kfops_empty; + + size = battr->size; } - mutex_unlock(&sysfs_open_file_mutex); - - sysfs_put_open_dirent(sd, NULL); -} - -/* Sysfs attribute files are pollable. The idea is that you read - * the content and then you use 'poll' or 'select' to wait for - * the content to change. When the content changes (assuming the - * manager for the kobject supports notification), poll will - * return POLLERR|POLLPRI, and select will return the fd whether - * it is waiting for read, write, or exceptions. - * Once poll/select indicates that the value has changed, you - * need to close and re-open the file, or seek to 0 and read again. - * Reminder: this only works for attributes which actively support - * it, and it is not possible to test an attribute from userspace - * to see if it supports poll (Neither 'poll' nor 'select' return - * an appropriate error code). When in doubt, set a suitable timeout value. - */ -static unsigned int sysfs_poll(struct file *filp, poll_table *wait) -{ - struct sysfs_open_file *of = sysfs_of(filp); - struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata; - struct sysfs_open_dirent *od = attr_sd->s_attr.open; - - /* need parent for the kobj, grab both */ - if (!sysfs_get_active(attr_sd)) - goto trigger; - - poll_wait(filp, &od->poll, wait); - sysfs_put_active(attr_sd); - - if (of->event != atomic_read(&od->event)) - goto trigger; - - return DEFAULT_POLLMASK; - - trigger: - return DEFAULT_POLLMASK|POLLERR|POLLPRI; -} - -void sysfs_notify_dirent(struct sysfs_dirent *sd) -{ - struct sysfs_open_dirent *od; - unsigned long flags; - - spin_lock_irqsave(&sysfs_open_dirent_lock, flags); - - if (!WARN_ON(sysfs_type(sd) != SYSFS_KOBJ_ATTR)) { - od = sd->s_attr.open; - if (od) { - atomic_inc(&od->event); - wake_up_interruptible(&od->poll); - } +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (!attr->ignore_lockdep) + key = attr->key ?: (struct lock_class_key *)&attr->skey; +#endif + kn = __kernfs_create_file(parent, attr->name, mode, size, ops, + (void *)attr, ns, true, key); + if (IS_ERR(kn)) { + if (PTR_ERR(kn) == -EEXIST) + sysfs_warn_dup(parent, attr->name); + return PTR_ERR(kn); } - - spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); -} -EXPORT_SYMBOL_GPL(sysfs_notify_dirent); - -void sysfs_notify(struct kobject *k, const char *dir, const char *attr) -{ - struct sysfs_dirent *sd = k->sd; - - mutex_lock(&sysfs_mutex); - - if (sd && dir) - sd = sysfs_find_dirent(sd, dir, NULL); - if (sd && attr) - sd = sysfs_find_dirent(sd, attr, NULL); - if (sd) - sysfs_notify_dirent(sd); - - mutex_unlock(&sysfs_mutex); -} -EXPORT_SYMBOL_GPL(sysfs_notify); - -const struct file_operations sysfs_file_operations = { - .read = seq_read, - .write = sysfs_write_file, - .llseek = generic_file_llseek, - .open = sysfs_open_file, - .release = sysfs_release, - .poll = sysfs_poll, -}; - -const struct file_operations sysfs_bin_operations = { - .read = sysfs_bin_read, - .write = sysfs_write_file, - .llseek = generic_file_llseek, - .mmap = sysfs_bin_mmap, - .open = sysfs_open_file, - .release = sysfs_release, - .poll = sysfs_poll, -}; - -int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, - const struct attribute *attr, int type, - umode_t amode, const void *ns) -{ - umode_t mode = (amode & S_IALLUGO) | S_IFREG; - struct sysfs_addrm_cxt acxt; - struct sysfs_dirent *sd; - int rc; - - sd = sysfs_new_dirent(attr->name, mode, type); - if (!sd) - return -ENOMEM; - - sd->s_ns = ns; - sd->s_attr.attr = (void *)attr; - sysfs_dirent_init_lockdep(sd); - - sysfs_addrm_start(&acxt); - rc = sysfs_add_one(&acxt, sd, dir_sd); - sysfs_addrm_finish(&acxt); - - if (rc) - sysfs_put(sd); - - return rc; + return 0; } - -int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr, - int type) +int sysfs_add_file(struct kernfs_node *parent, const struct attribute *attr, + bool is_bin) { - return sysfs_add_file_mode_ns(dir_sd, attr, type, attr->mode, NULL); + return sysfs_add_file_mode_ns(parent, attr, is_bin, attr->mode, NULL); } /** @@ -879,8 +279,7 @@ int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, { BUG_ON(!kobj || !kobj->sd || !attr); - return sysfs_add_file_mode_ns(kobj->sd, attr, SYSFS_KOBJ_ATTR, - attr->mode, ns); + return sysfs_add_file_mode_ns(kobj->sd, attr, false, attr->mode, ns); } EXPORT_SYMBOL_GPL(sysfs_create_file_ns); @@ -908,19 +307,21 @@ EXPORT_SYMBOL_GPL(sysfs_create_files); int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group) { - struct sysfs_dirent *dir_sd; + struct kernfs_node *parent; int error; - if (group) - dir_sd = sysfs_get_dirent(kobj->sd, group); - else - dir_sd = sysfs_get(kobj->sd); + if (group) { + parent = kernfs_find_and_get(kobj->sd, group); + } else { + parent = kobj->sd; + kernfs_get(parent); + } - if (!dir_sd) + if (!parent) return -ENOENT; - error = sysfs_add_file(dir_sd, attr, SYSFS_KOBJ_ATTR); - sysfs_put(dir_sd); + error = sysfs_add_file(parent, attr, false); + kernfs_put(parent); return error; } @@ -936,23 +337,20 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group); int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, umode_t mode) { - struct sysfs_dirent *sd; + struct kernfs_node *kn; struct iattr newattrs; int rc; - mutex_lock(&sysfs_mutex); - - rc = -ENOENT; - sd = sysfs_find_dirent(kobj->sd, attr->name, NULL); - if (!sd) - goto out; + kn = kernfs_find_and_get(kobj->sd, attr->name); + if (!kn) + return -ENOENT; - newattrs.ia_mode = (mode & S_IALLUGO) | (sd->s_mode & ~S_IALLUGO); + newattrs.ia_mode = (mode & S_IALLUGO) | (kn->mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE; - rc = sysfs_sd_setattr(sd, &newattrs); - out: - mutex_unlock(&sysfs_mutex); + rc = kernfs_setattr(kn, &newattrs); + + kernfs_put(kn); return rc; } EXPORT_SYMBOL_GPL(sysfs_chmod_file); @@ -968,9 +366,9 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file); void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns) { - struct sysfs_dirent *dir_sd = kobj->sd; + struct kernfs_node *parent = kobj->sd; - sysfs_hash_and_remove(dir_sd, attr->name, ns); + kernfs_remove_by_name_ns(parent, attr->name, ns); } EXPORT_SYMBOL_GPL(sysfs_remove_file_ns); @@ -991,15 +389,18 @@ EXPORT_SYMBOL_GPL(sysfs_remove_files); void sysfs_remove_file_from_group(struct kobject *kobj, const struct attribute *attr, const char *group) { - struct sysfs_dirent *dir_sd; + struct kernfs_node *parent; - if (group) - dir_sd = sysfs_get_dirent(kobj->sd, group); - else - dir_sd = sysfs_get(kobj->sd); - if (dir_sd) { - sysfs_hash_and_remove(dir_sd, attr->name, NULL); - sysfs_put(dir_sd); + if (group) { + parent = kernfs_find_and_get(kobj->sd, group); + } else { + parent = kobj->sd; + kernfs_get(parent); + } + + if (parent) { + kernfs_remove_by_name(parent, attr->name); + kernfs_put(parent); } } EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group); @@ -1014,7 +415,7 @@ int sysfs_create_bin_file(struct kobject *kobj, { BUG_ON(!kobj || !kobj->sd || !attr); - return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR); + return sysfs_add_file(kobj->sd, &attr->attr, true); } EXPORT_SYMBOL_GPL(sysfs_create_bin_file); @@ -1026,7 +427,7 @@ EXPORT_SYMBOL_GPL(sysfs_create_bin_file); void sysfs_remove_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { - sysfs_hash_and_remove(kobj->sd, attr->attr.name, NULL); + kernfs_remove_by_name(kobj->sd, attr->attr.name); } EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 1898a10..6b57938 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -18,7 +18,7 @@ #include "sysfs.h" -static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, +static void remove_files(struct kernfs_node *parent, struct kobject *kobj, const struct attribute_group *grp) { struct attribute *const *attr; @@ -26,13 +26,13 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, if (grp->attrs) for (attr = grp->attrs; *attr; attr++) - sysfs_hash_and_remove(dir_sd, (*attr)->name, NULL); + kernfs_remove_by_name(parent, (*attr)->name); if (grp->bin_attrs) for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) sysfs_remove_bin_file(kobj, *bin_attr); } -static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, +static int create_files(struct kernfs_node *parent, struct kobject *kobj, const struct attribute_group *grp, int update) { struct attribute *const *attr; @@ -49,22 +49,20 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, * re-adding (if required) the file. */ if (update) - sysfs_hash_and_remove(dir_sd, (*attr)->name, - NULL); + kernfs_remove_by_name(parent, (*attr)->name); if (grp->is_visible) { mode = grp->is_visible(kobj, *attr, i); if (!mode) continue; } - error = sysfs_add_file_mode_ns(dir_sd, *attr, - SYSFS_KOBJ_ATTR, + error = sysfs_add_file_mode_ns(parent, *attr, false, (*attr)->mode | mode, NULL); if (unlikely(error)) break; } if (error) { - remove_files(dir_sd, kobj, grp); + remove_files(parent, kobj, grp); goto exit; } } @@ -78,7 +76,7 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, break; } if (error) - remove_files(dir_sd, kobj, grp); + remove_files(parent, kobj, grp); } exit: return error; @@ -88,7 +86,7 @@ exit: static int internal_create_group(struct kobject *kobj, int update, const struct attribute_group *grp) { - struct sysfs_dirent *sd; + struct kernfs_node *kn; int error; BUG_ON(!kobj || (!update && !kobj->sd)); @@ -102,18 +100,22 @@ static int internal_create_group(struct kobject *kobj, int update, return -EINVAL; } if (grp->name) { - error = sysfs_create_subdir(kobj, grp->name, &sd); - if (error) - return error; + kn = kernfs_create_dir(kobj->sd, grp->name, + S_IRWXU | S_IRUGO | S_IXUGO, kobj); + if (IS_ERR(kn)) { + if (PTR_ERR(kn) == -EEXIST) + sysfs_warn_dup(kobj->sd, grp->name); + return PTR_ERR(kn); + } } else - sd = kobj->sd; - sysfs_get(sd); - error = create_files(sd, kobj, grp, update); + kn = kobj->sd; + kernfs_get(kn); + error = create_files(kn, kobj, grp, update); if (error) { if (grp->name) - sysfs_remove(sd); + kernfs_remove(kn); } - sysfs_put(sd); + kernfs_put(kn); return error; } @@ -203,25 +205,27 @@ EXPORT_SYMBOL_GPL(sysfs_update_group); void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp) { - struct sysfs_dirent *dir_sd = kobj->sd; - struct sysfs_dirent *sd; + struct kernfs_node *parent = kobj->sd; + struct kernfs_node *kn; if (grp->name) { - sd = sysfs_get_dirent(dir_sd, grp->name); - if (!sd) { - WARN(!sd, KERN_WARNING + kn = kernfs_find_and_get(parent, grp->name); + if (!kn) { + WARN(!kn, KERN_WARNING "sysfs group %p not found for kobject '%s'\n", grp, kobject_name(kobj)); return; } - } else - sd = sysfs_get(dir_sd); + } else { + kn = parent; + kernfs_get(kn); + } - remove_files(sd, kobj, grp); + remove_files(kn, kobj, grp); if (grp->name) - sysfs_remove(sd); + kernfs_remove(kn); - sysfs_put(sd); + kernfs_put(kn); } EXPORT_SYMBOL_GPL(sysfs_remove_group); @@ -257,22 +261,22 @@ EXPORT_SYMBOL_GPL(sysfs_remove_groups); int sysfs_merge_group(struct kobject *kobj, const struct attribute_group *grp) { - struct sysfs_dirent *dir_sd; + struct kernfs_node *parent; int error = 0; struct attribute *const *attr; int i; - dir_sd = sysfs_get_dirent(kobj->sd, grp->name); - if (!dir_sd) + parent = kernfs_find_and_get(kobj->sd, grp->name); + if (!parent) return -ENOENT; for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr)) - error = sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR); + error = sysfs_add_file(parent, *attr, false); if (error) { while (--i >= 0) - sysfs_hash_and_remove(dir_sd, (*--attr)->name, NULL); + kernfs_remove_by_name(parent, (*--attr)->name); } - sysfs_put(dir_sd); + kernfs_put(parent); return error; } @@ -286,14 +290,14 @@ EXPORT_SYMBOL_GPL(sysfs_merge_group); void sysfs_unmerge_group(struct kobject *kobj, const struct attribute_group *grp) { - struct sysfs_dirent *dir_sd; + struct kernfs_node *parent; struct attribute *const *attr; - dir_sd = sysfs_get_dirent(kobj->sd, grp->name); - if (dir_sd) { + parent = kernfs_find_and_get(kobj->sd, grp->name); + if (parent) { for (attr = grp->attrs; *attr; ++attr) - sysfs_hash_and_remove(dir_sd, (*attr)->name, NULL); - sysfs_put(dir_sd); + kernfs_remove_by_name(parent, (*attr)->name); + kernfs_put(parent); } } EXPORT_SYMBOL_GPL(sysfs_unmerge_group); @@ -308,15 +312,15 @@ EXPORT_SYMBOL_GPL(sysfs_unmerge_group); int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, struct kobject *target, const char *link_name) { - struct sysfs_dirent *dir_sd; + struct kernfs_node *parent; int error = 0; - dir_sd = sysfs_get_dirent(kobj->sd, group_name); - if (!dir_sd) + parent = kernfs_find_and_get(kobj->sd, group_name); + if (!parent) return -ENOENT; - error = sysfs_create_link_sd(dir_sd, target, link_name); - sysfs_put(dir_sd); + error = sysfs_create_link_sd(parent, target, link_name); + kernfs_put(parent); return error; } @@ -331,12 +335,12 @@ EXPORT_SYMBOL_GPL(sysfs_add_link_to_group); void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, const char *link_name) { - struct sysfs_dirent *dir_sd; + struct kernfs_node *parent; - dir_sd = sysfs_get_dirent(kobj->sd, group_name); - if (dir_sd) { - sysfs_hash_and_remove(dir_sd, link_name, NULL); - sysfs_put(dir_sd); + parent = kernfs_find_and_get(kobj->sd, group_name); + if (parent) { + kernfs_remove_by_name(parent, link_name); + kernfs_put(parent); } } EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c deleted file mode 100644 index 1750f79..0000000 --- a/fs/sysfs/inode.c +++ /dev/null @@ -1,331 +0,0 @@ -/* - * fs/sysfs/inode.c - basic sysfs inode and dentry operations - * - * Copyright (c) 2001-3 Patrick Mochel - * Copyright (c) 2007 SUSE Linux Products GmbH - * Copyright (c) 2007 Tejun Heo <teheo@suse.de> - * - * This file is released under the GPLv2. - * - * Please see Documentation/filesystems/sysfs.txt for more information. - */ - -#undef DEBUG - -#include <linux/pagemap.h> -#include <linux/namei.h> -#include <linux/backing-dev.h> -#include <linux/capability.h> -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/sysfs.h> -#include <linux/xattr.h> -#include <linux/security.h> -#include "sysfs.h" - -static const struct address_space_operations sysfs_aops = { - .readpage = simple_readpage, - .write_begin = simple_write_begin, - .write_end = simple_write_end, -}; - -static struct backing_dev_info sysfs_backing_dev_info = { - .name = "sysfs", - .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, -}; - -static const struct inode_operations sysfs_inode_operations = { - .permission = sysfs_permission, - .setattr = sysfs_setattr, - .getattr = sysfs_getattr, - .setxattr = sysfs_setxattr, -}; - -int __init sysfs_inode_init(void) -{ - return bdi_init(&sysfs_backing_dev_info); -} - -static struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd) -{ - struct sysfs_inode_attrs *attrs; - struct iattr *iattrs; - - attrs = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL); - if (!attrs) - return NULL; - iattrs = &attrs->ia_iattr; - - /* assign default attributes */ - iattrs->ia_mode = sd->s_mode; - iattrs->ia_uid = GLOBAL_ROOT_UID; - iattrs->ia_gid = GLOBAL_ROOT_GID; - iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; - - return attrs; -} - -int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr) -{ - struct sysfs_inode_attrs *sd_attrs; - struct iattr *iattrs; - unsigned int ia_valid = iattr->ia_valid; - - sd_attrs = sd->s_iattr; - - if (!sd_attrs) { - /* setting attributes for the first time, allocate now */ - sd_attrs = sysfs_init_inode_attrs(sd); - if (!sd_attrs) - return -ENOMEM; - sd->s_iattr = sd_attrs; - } - /* attributes were changed at least once in past */ - iattrs = &sd_attrs->ia_iattr; - - if (ia_valid & ATTR_UID) - iattrs->ia_uid = iattr->ia_uid; - if (ia_valid & ATTR_GID) - iattrs->ia_gid = iattr->ia_gid; - if (ia_valid & ATTR_ATIME) - iattrs->ia_atime = iattr->ia_atime; - if (ia_valid & ATTR_MTIME) - iattrs->ia_mtime = iattr->ia_mtime; - if (ia_valid & ATTR_CTIME) - iattrs->ia_ctime = iattr->ia_ctime; - if (ia_valid & ATTR_MODE) { - umode_t mode = iattr->ia_mode; - iattrs->ia_mode = sd->s_mode = mode; - } - return 0; -} - -int sysfs_setattr(struct dentry *dentry, struct iattr *iattr) -{ - struct inode *inode = dentry->d_inode; - struct sysfs_dirent *sd = dentry->d_fsdata; - int error; - - if (!sd) - return -EINVAL; - - mutex_lock(&sysfs_mutex); - error = inode_change_ok(inode, iattr); - if (error) - goto out; - - error = sysfs_sd_setattr(sd, iattr); - if (error) - goto out; - - /* this ignores size changes */ - setattr_copy(inode, iattr); - -out: - mutex_unlock(&sysfs_mutex); - return error; -} - -static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata, - u32 *secdata_len) -{ - struct sysfs_inode_attrs *iattrs; - void *old_secdata; - size_t old_secdata_len; - - if (!sd->s_iattr) { - sd->s_iattr = sysfs_init_inode_attrs(sd); - if (!sd->s_iattr) - return -ENOMEM; - } - - iattrs = sd->s_iattr; - old_secdata = iattrs->ia_secdata; - old_secdata_len = iattrs->ia_secdata_len; - - iattrs->ia_secdata = *secdata; - iattrs->ia_secdata_len = *secdata_len; - - *secdata = old_secdata; - *secdata_len = old_secdata_len; - return 0; -} - -int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags) -{ - struct sysfs_dirent *sd = dentry->d_fsdata; - void *secdata; - int error; - u32 secdata_len = 0; - - if (!sd) - return -EINVAL; - - if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { - const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; - error = security_inode_setsecurity(dentry->d_inode, suffix, - value, size, flags); - if (error) - goto out; - error = security_inode_getsecctx(dentry->d_inode, - &secdata, &secdata_len); - if (error) - goto out; - - mutex_lock(&sysfs_mutex); - error = sysfs_sd_setsecdata(sd, &secdata, &secdata_len); - mutex_unlock(&sysfs_mutex); - - if (secdata) - security_release_secctx(secdata, secdata_len); - } else - return -EINVAL; -out: - return error; -} - -static inline void set_default_inode_attr(struct inode *inode, umode_t mode) -{ - inode->i_mode = mode; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; -} - -static inline void set_inode_attr(struct inode *inode, struct iattr *iattr) -{ - inode->i_uid = iattr->ia_uid; - inode->i_gid = iattr->ia_gid; - inode->i_atime = iattr->ia_atime; - inode->i_mtime = iattr->ia_mtime; - inode->i_ctime = iattr->ia_ctime; -} - -static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode) -{ - struct sysfs_inode_attrs *iattrs = sd->s_iattr; - - inode->i_mode = sd->s_mode; - if (iattrs) { - /* sysfs_dirent has non-default attributes - * get them from persistent copy in sysfs_dirent - */ - set_inode_attr(inode, &iattrs->ia_iattr); - security_inode_notifysecctx(inode, - iattrs->ia_secdata, - iattrs->ia_secdata_len); - } - - if (sysfs_type(sd) == SYSFS_DIR) - set_nlink(inode, sd->s_dir.subdirs + 2); -} - -int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) -{ - struct sysfs_dirent *sd = dentry->d_fsdata; - struct inode *inode = dentry->d_inode; - - mutex_lock(&sysfs_mutex); - sysfs_refresh_inode(sd, inode); - mutex_unlock(&sysfs_mutex); - - generic_fillattr(inode, stat); - return 0; -} - -static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) -{ - struct bin_attribute *bin_attr; - - inode->i_private = sysfs_get(sd); - inode->i_mapping->a_ops = &sysfs_aops; - inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; - inode->i_op = &sysfs_inode_operations; - - set_default_inode_attr(inode, sd->s_mode); - sysfs_refresh_inode(sd, inode); - - /* initialize inode according to type */ - switch (sysfs_type(sd)) { - case SYSFS_DIR: - inode->i_op = &sysfs_dir_inode_operations; - inode->i_fop = &sysfs_dir_operations; - break; - case SYSFS_KOBJ_ATTR: - inode->i_size = PAGE_SIZE; - inode->i_fop = &sysfs_file_operations; - break; - case SYSFS_KOBJ_BIN_ATTR: - bin_attr = sd->s_attr.bin_attr; - inode->i_size = bin_attr->size; - inode->i_fop = &sysfs_bin_operations; - break; - case SYSFS_KOBJ_LINK: - inode->i_op = &sysfs_symlink_inode_operations; - break; - default: - BUG(); - } - - unlock_new_inode(inode); -} - -/** - * sysfs_get_inode - get inode for sysfs_dirent - * @sb: super block - * @sd: sysfs_dirent to allocate inode for - * - * Get inode for @sd. If such inode doesn't exist, a new inode - * is allocated and basics are initialized. New inode is - * returned locked. - * - * LOCKING: - * Kernel thread context (may sleep). - * - * RETURNS: - * Pointer to allocated inode on success, NULL on failure. - */ -struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd) -{ - struct inode *inode; - - inode = iget_locked(sb, sd->s_ino); - if (inode && (inode->i_state & I_NEW)) - sysfs_init_inode(sd, inode); - - return inode; -} - -/* - * The sysfs_dirent serves as both an inode and a directory entry for sysfs. - * To prevent the sysfs inode numbers from being freed prematurely we take a - * reference to sysfs_dirent from the sysfs inode. A - * super_operations.evict_inode() implementation is needed to drop that - * reference upon inode destruction. - */ -void sysfs_evict_inode(struct inode *inode) -{ - struct sysfs_dirent *sd = inode->i_private; - - truncate_inode_pages(&inode->i_data, 0); - clear_inode(inode); - sysfs_put(sd); -} - -int sysfs_permission(struct inode *inode, int mask) -{ - struct sysfs_dirent *sd; - - if (mask & MAY_NOT_BLOCK) - return -ECHILD; - - sd = inode->i_private; - - mutex_lock(&sysfs_mutex); - sysfs_refresh_inode(sd, inode); - mutex_unlock(&sysfs_mutex); - - return generic_permission(inode, mask); -} diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 834ec2c..6211230 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -14,146 +14,41 @@ #include <linux/fs.h> #include <linux/mount.h> -#include <linux/pagemap.h> #include <linux/init.h> -#include <linux/module.h> -#include <linux/magic.h> -#include <linux/slab.h> #include <linux/user_namespace.h> #include "sysfs.h" - -static struct vfsmount *sysfs_mnt; -struct kmem_cache *sysfs_dir_cachep; - -static const struct super_operations sysfs_ops = { - .statfs = simple_statfs, - .drop_inode = generic_delete_inode, - .evict_inode = sysfs_evict_inode, -}; - -struct sysfs_dirent sysfs_root = { - .s_name = "", - .s_count = ATOMIC_INIT(1), - .s_flags = SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT), - .s_mode = S_IFDIR | S_IRUGO | S_IXUGO, - .s_ino = 1, -}; - -static int sysfs_fill_super(struct super_block *sb, void *data, int silent) -{ - struct inode *inode; - struct dentry *root; - - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; - sb->s_magic = SYSFS_MAGIC; - sb->s_op = &sysfs_ops; - sb->s_time_gran = 1; - - /* get root inode, initialize and unlock it */ - mutex_lock(&sysfs_mutex); - inode = sysfs_get_inode(sb, &sysfs_root); - mutex_unlock(&sysfs_mutex); - if (!inode) { - pr_debug("sysfs: could not get root inode\n"); - return -ENOMEM; - } - - /* instantiate and link root dentry */ - root = d_make_root(inode); - if (!root) { - pr_debug("%s: could not get root dentry!\n", __func__); - return -ENOMEM; - } - root->d_fsdata = &sysfs_root; - sb->s_root = root; - sb->s_d_op = &sysfs_dentry_ops; - return 0; -} - -static int sysfs_test_super(struct super_block *sb, void *data) -{ - struct sysfs_super_info *sb_info = sysfs_info(sb); - struct sysfs_super_info *info = data; - enum kobj_ns_type type; - int found = 1; - - for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) { - if (sb_info->ns[type] != info->ns[type]) - found = 0; - } - return found; -} - -static int sysfs_set_super(struct super_block *sb, void *data) -{ - int error; - error = set_anon_super(sb, data); - if (!error) - sb->s_fs_info = data; - return error; -} - -static void free_sysfs_super_info(struct sysfs_super_info *info) -{ - int type; - for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) - kobj_ns_drop(type, info->ns[type]); - kfree(info); -} +static struct kernfs_root *sysfs_root; +struct kernfs_node *sysfs_root_kn; static struct dentry *sysfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - struct sysfs_super_info *info; - enum kobj_ns_type type; - struct super_block *sb; - int error; + struct dentry *root; + void *ns; if (!(flags & MS_KERNMOUNT)) { if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type)) return ERR_PTR(-EPERM); - for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) { - if (!kobj_ns_current_may_mount(type)) - return ERR_PTR(-EPERM); - } - } - - info = kzalloc(sizeof(*info), GFP_KERNEL); - if (!info) - return ERR_PTR(-ENOMEM); - - for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) - info->ns[type] = kobj_ns_grab_current(type); - - sb = sget(fs_type, sysfs_test_super, sysfs_set_super, flags, info); - if (IS_ERR(sb) || sb->s_fs_info != info) - free_sysfs_super_info(info); - if (IS_ERR(sb)) - return ERR_CAST(sb); - if (!sb->s_root) { - error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); - if (error) { - deactivate_locked_super(sb); - return ERR_PTR(error); - } - sb->s_flags |= MS_ACTIVE; + if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET)) + return ERR_PTR(-EPERM); } - return dget(sb->s_root); + ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); + root = kernfs_mount_ns(fs_type, flags, sysfs_root, ns); + if (IS_ERR(root)) + kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); + return root; } static void sysfs_kill_sb(struct super_block *sb) { - struct sysfs_super_info *info = sysfs_info(sb); - /* Remove the superblock from fs_supers/s_instances - * so we can't find it, before freeing sysfs_super_info. - */ - kill_anon_super(sb); - free_sysfs_super_info(info); + void *ns = (void *)kernfs_super_ns(sb); + + kernfs_kill_sb(sb); + kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); } static struct file_system_type sysfs_fs_type = { @@ -165,48 +60,19 @@ static struct file_system_type sysfs_fs_type = { int __init sysfs_init(void) { - int err = -ENOMEM; + int err; - sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache", - sizeof(struct sysfs_dirent), - 0, 0, NULL); - if (!sysfs_dir_cachep) - goto out; + sysfs_root = kernfs_create_root(NULL, NULL); + if (IS_ERR(sysfs_root)) + return PTR_ERR(sysfs_root); - err = sysfs_inode_init(); - if (err) - goto out_err; + sysfs_root_kn = sysfs_root->kn; err = register_filesystem(&sysfs_fs_type); - if (!err) { - sysfs_mnt = kern_mount(&sysfs_fs_type); - if (IS_ERR(sysfs_mnt)) { - printk(KERN_ERR "sysfs: could not mount!\n"); - err = PTR_ERR(sysfs_mnt); - sysfs_mnt = NULL; - unregister_filesystem(&sysfs_fs_type); - goto out_err; - } - } else - goto out_err; -out: - return err; -out_err: - kmem_cache_destroy(sysfs_dir_cachep); - sysfs_dir_cachep = NULL; - goto out; -} - -#undef sysfs_get -struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd) -{ - return __sysfs_get(sd); -} -EXPORT_SYMBOL_GPL(sysfs_get); + if (err) { + kernfs_destroy_root(sysfs_root); + return err; + } -#undef sysfs_put -void sysfs_put(struct sysfs_dirent *sd) -{ - __sysfs_put(sd); + return 0; } -EXPORT_SYMBOL_GPL(sysfs_put); diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 3ae3f1b..aecb15f 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -11,109 +11,73 @@ */ #include <linux/fs.h> -#include <linux/gfp.h> -#include <linux/mount.h> #include <linux/module.h> #include <linux/kobject.h> -#include <linux/namei.h> #include <linux/mutex.h> #include <linux/security.h> #include "sysfs.h" -static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd, - struct kobject *target, +static int sysfs_do_create_link_sd(struct kernfs_node *parent, + struct kobject *target_kobj, const char *name, int warn) { - struct sysfs_dirent *target_sd = NULL; - struct sysfs_dirent *sd = NULL; - struct sysfs_addrm_cxt acxt; - enum kobj_ns_type ns_type; - int error; + struct kernfs_node *kn, *target = NULL; - BUG_ON(!name || !parent_sd); + BUG_ON(!name || !parent); /* - * We don't own @target and it may be removed at any time. + * We don't own @target_kobj and it may be removed at any time. * Synchronize using sysfs_symlink_target_lock. See * sysfs_remove_dir() for details. */ spin_lock(&sysfs_symlink_target_lock); - if (target->sd) - target_sd = sysfs_get(target->sd); + if (target_kobj->sd) { + target = target_kobj->sd; + kernfs_get(target); + } spin_unlock(&sysfs_symlink_target_lock); - error = -ENOENT; - if (!target_sd) - goto out_put; - - error = -ENOMEM; - sd = sysfs_new_dirent(name, S_IFLNK|S_IRWXUGO, SYSFS_KOBJ_LINK); - if (!sd) - goto out_put; + if (!target) + return -ENOENT; - ns_type = sysfs_ns_type(parent_sd); - if (ns_type) - sd->s_ns = target_sd->s_ns; - sd->s_symlink.target_sd = target_sd; - target_sd = NULL; /* reference is now owned by the symlink */ - - sysfs_addrm_start(&acxt); - /* Symlinks must be between directories with the same ns_type */ - if (!ns_type || - (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent))) { - if (warn) - error = sysfs_add_one(&acxt, sd, parent_sd); - else - error = __sysfs_add_one(&acxt, sd, parent_sd); - } else { - error = -EINVAL; - WARN(1, KERN_WARNING - "sysfs: symlink across ns_types %s/%s -> %s/%s\n", - parent_sd->s_name, - sd->s_name, - sd->s_symlink.target_sd->s_parent->s_name, - sd->s_symlink.target_sd->s_name); - } - sysfs_addrm_finish(&acxt); + kn = kernfs_create_link(parent, name, target); + kernfs_put(target); - if (error) - goto out_put; + if (!IS_ERR(kn)) + return 0; - return 0; - - out_put: - sysfs_put(target_sd); - sysfs_put(sd); - return error; + if (warn && PTR_ERR(kn) == -EEXIST) + sysfs_warn_dup(parent, name); + return PTR_ERR(kn); } /** * sysfs_create_link_sd - create symlink to a given object. - * @sd: directory we're creating the link in. + * @kn: directory we're creating the link in. * @target: object we're pointing to. * @name: name of the symlink. */ -int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target, +int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target, const char *name) { - return sysfs_do_create_link_sd(sd, target, name, 1); + return sysfs_do_create_link_sd(kn, target, name, 1); } static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, const char *name, int warn) { - struct sysfs_dirent *parent_sd = NULL; + struct kernfs_node *parent = NULL; if (!kobj) - parent_sd = &sysfs_root; + parent = sysfs_root_kn; else - parent_sd = kobj->sd; + parent = kobj->sd; - if (!parent_sd) + if (!parent) return -EFAULT; - return sysfs_do_create_link_sd(parent_sd, target, name, warn); + return sysfs_do_create_link_sd(parent, target, name, warn); } /** @@ -164,10 +128,10 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ, * sysfs_remove_dir() for details. */ spin_lock(&sysfs_symlink_target_lock); - if (targ->sd && sysfs_ns_type(kobj->sd)) - ns = targ->sd->s_ns; + if (targ->sd && kernfs_ns_enabled(kobj->sd)) + ns = targ->sd->ns; spin_unlock(&sysfs_symlink_target_lock); - sysfs_hash_and_remove(kobj->sd, name, ns); + kernfs_remove_by_name_ns(kobj->sd, name, ns); } /** @@ -177,14 +141,14 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ, */ void sysfs_remove_link(struct kobject *kobj, const char *name) { - struct sysfs_dirent *parent_sd = NULL; + struct kernfs_node *parent = NULL; if (!kobj) - parent_sd = &sysfs_root; + parent = sysfs_root_kn; else - parent_sd = kobj->sd; + parent = kobj->sd; - sysfs_hash_and_remove(parent_sd, name, NULL); + kernfs_remove_by_name(parent, name); } EXPORT_SYMBOL_GPL(sysfs_remove_link); @@ -201,130 +165,33 @@ EXPORT_SYMBOL_GPL(sysfs_remove_link); int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ, const char *old, const char *new, const void *new_ns) { - struct sysfs_dirent *parent_sd, *sd = NULL; + struct kernfs_node *parent, *kn = NULL; const void *old_ns = NULL; int result; if (!kobj) - parent_sd = &sysfs_root; + parent = sysfs_root_kn; else - parent_sd = kobj->sd; + parent = kobj->sd; if (targ->sd) - old_ns = targ->sd->s_ns; + old_ns = targ->sd->ns; result = -ENOENT; - sd = sysfs_get_dirent_ns(parent_sd, old, old_ns); - if (!sd) + kn = kernfs_find_and_get_ns(parent, old, old_ns); + if (!kn) goto out; result = -EINVAL; - if (sysfs_type(sd) != SYSFS_KOBJ_LINK) + if (kernfs_type(kn) != KERNFS_LINK) goto out; - if (sd->s_symlink.target_sd->s_dir.kobj != targ) + if (kn->symlink.target_kn->priv != targ) goto out; - result = sysfs_rename(sd, parent_sd, new, new_ns); + result = kernfs_rename_ns(kn, parent, new, new_ns); out: - sysfs_put(sd); + kernfs_put(kn); return result; } EXPORT_SYMBOL_GPL(sysfs_rename_link_ns); - -static int sysfs_get_target_path(struct sysfs_dirent *parent_sd, - struct sysfs_dirent *target_sd, char *path) -{ - struct sysfs_dirent *base, *sd; - char *s = path; - int len = 0; - - /* go up to the root, stop at the base */ - base = parent_sd; - while (base->s_parent) { - sd = target_sd->s_parent; - while (sd->s_parent && base != sd) - sd = sd->s_parent; - - if (base == sd) - break; - - strcpy(s, "../"); - s += 3; - base = base->s_parent; - } - - /* determine end of target string for reverse fillup */ - sd = target_sd; - while (sd->s_parent && sd != base) { - len += strlen(sd->s_name) + 1; - sd = sd->s_parent; - } - - /* check limits */ - if (len < 2) - return -EINVAL; - len--; - if ((s - path) + len > PATH_MAX) - return -ENAMETOOLONG; - - /* reverse fillup of target string from target to base */ - sd = target_sd; - while (sd->s_parent && sd != base) { - int slen = strlen(sd->s_name); - - len -= slen; - strncpy(s + len, sd->s_name, slen); - if (len) - s[--len] = '/'; - - sd = sd->s_parent; - } - - return 0; -} - -static int sysfs_getlink(struct dentry *dentry, char *path) -{ - struct sysfs_dirent *sd = dentry->d_fsdata; - struct sysfs_dirent *parent_sd = sd->s_parent; - struct sysfs_dirent *target_sd = sd->s_symlink.target_sd; - int error; - - mutex_lock(&sysfs_mutex); - error = sysfs_get_target_path(parent_sd, target_sd, path); - mutex_unlock(&sysfs_mutex); - - return error; -} - -static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - int error = -ENOMEM; - unsigned long page = get_zeroed_page(GFP_KERNEL); - if (page) { - error = sysfs_getlink(dentry, (char *) page); - if (error < 0) - free_page((unsigned long)page); - } - nd_set_link(nd, error ? ERR_PTR(error) : (char *)page); - return NULL; -} - -static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, - void *cookie) -{ - char *page = nd_get_link(nd); - if (!IS_ERR(page)) - free_page((unsigned long)page); -} - -const struct inode_operations sysfs_symlink_inode_operations = { - .setxattr = sysfs_setxattr, - .readlink = generic_readlink, - .follow_link = sysfs_follow_link, - .put_link = sysfs_put_link, - .setattr = sysfs_setattr, - .getattr = sysfs_getattr, - .permission = sysfs_permission, -}; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 0af09fb..0e2f1cc 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -8,248 +8,36 @@ * This file is released under the GPLv2. */ -#include <linux/lockdep.h> -#include <linux/kobject_ns.h> -#include <linux/fs.h> -#include <linux/rbtree.h> +#ifndef __SYSFS_INTERNAL_H +#define __SYSFS_INTERNAL_H -struct sysfs_open_dirent; - -/* type-specific structures for sysfs_dirent->s_* union members */ -struct sysfs_elem_dir { - struct kobject *kobj; - - unsigned long subdirs; - /* children rbtree starts here and goes through sd->s_rb */ - struct rb_root children; -}; - -struct sysfs_elem_symlink { - struct sysfs_dirent *target_sd; -}; - -struct sysfs_elem_attr { - union { - struct attribute *attr; - struct bin_attribute *bin_attr; - }; - struct sysfs_open_dirent *open; -}; - -struct sysfs_inode_attrs { - struct iattr ia_iattr; - void *ia_secdata; - u32 ia_secdata_len; -}; - -/* - * sysfs_dirent - the building block of sysfs hierarchy. Each and - * every sysfs node is represented by single sysfs_dirent. - * - * As long as s_count reference is held, the sysfs_dirent itself is - * accessible. Dereferencing s_elem or any other outer entity - * requires s_active reference. - */ -struct sysfs_dirent { - atomic_t s_count; - atomic_t s_active; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif - struct sysfs_dirent *s_parent; - const char *s_name; - - struct rb_node s_rb; - - union { - struct completion *completion; - struct sysfs_dirent *removed_list; - } u; - - const void *s_ns; /* namespace tag */ - unsigned int s_hash; /* ns + name hash */ - union { - struct sysfs_elem_dir s_dir; - struct sysfs_elem_symlink s_symlink; - struct sysfs_elem_attr s_attr; - }; - - unsigned short s_flags; - umode_t s_mode; - unsigned int s_ino; - struct sysfs_inode_attrs *s_iattr; -}; - -#define SD_DEACTIVATED_BIAS INT_MIN - -#define SYSFS_TYPE_MASK 0x00ff -#define SYSFS_DIR 0x0001 -#define SYSFS_KOBJ_ATTR 0x0002 -#define SYSFS_KOBJ_BIN_ATTR 0x0004 -#define SYSFS_KOBJ_LINK 0x0008 -#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) -#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR) - -/* identify any namespace tag on sysfs_dirents */ -#define SYSFS_NS_TYPE_MASK 0xf00 -#define SYSFS_NS_TYPE_SHIFT 8 - -#define SYSFS_FLAG_MASK ~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK) -#define SYSFS_FLAG_REMOVED 0x02000 - -static inline unsigned int sysfs_type(struct sysfs_dirent *sd) -{ - return sd->s_flags & SYSFS_TYPE_MASK; -} - -/* - * Return any namespace tags on this dirent. - * enum kobj_ns_type is defined in linux/kobject.h - */ -static inline enum kobj_ns_type sysfs_ns_type(struct sysfs_dirent *sd) -{ - return (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT; -} - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - -#define sysfs_dirent_init_lockdep(sd) \ -do { \ - struct attribute *attr = sd->s_attr.attr; \ - struct lock_class_key *key = attr->key; \ - if (!key) \ - key = &attr->skey; \ - \ - lockdep_init_map(&sd->dep_map, "s_active", key, 0); \ -} while (0) - -/* Test for attributes that want to ignore lockdep for read-locking */ -static inline bool sysfs_ignore_lockdep(struct sysfs_dirent *sd) -{ - int type = sysfs_type(sd); - - return (type == SYSFS_KOBJ_ATTR || type == SYSFS_KOBJ_BIN_ATTR) && - sd->s_attr.attr->ignore_lockdep; -} - -#else - -#define sysfs_dirent_init_lockdep(sd) do {} while (0) - -static inline bool sysfs_ignore_lockdep(struct sysfs_dirent *sd) -{ - return true; -} - -#endif - -/* - * Context structure to be used while adding/removing nodes. - */ -struct sysfs_addrm_cxt { - struct sysfs_dirent *removed; -}; +#include <linux/sysfs.h> /* * mount.c */ - -/* - * Each sb is associated with a set of namespace tags (i.e. - * the network namespace of the task which mounted this sysfs - * instance). - */ -struct sysfs_super_info { - void *ns[KOBJ_NS_TYPES]; -}; -#define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info)) -extern struct sysfs_dirent sysfs_root; -extern struct kmem_cache *sysfs_dir_cachep; +extern struct kernfs_node *sysfs_root_kn; /* * dir.c */ -extern struct mutex sysfs_mutex; extern spinlock_t sysfs_symlink_target_lock; -extern const struct dentry_operations sysfs_dentry_ops; - -extern const struct file_operations sysfs_dir_operations; -extern const struct inode_operations sysfs_dir_inode_operations; -struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd); -void sysfs_put_active(struct sysfs_dirent *sd); -void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt); -void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name); -int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, - struct sysfs_dirent *parent_sd); -int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, - struct sysfs_dirent *parent_sd); -void sysfs_remove(struct sysfs_dirent *sd); -int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name, - const void *ns); -void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt); - -struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, - const unsigned char *name, - const void *ns); -struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type); - -void release_sysfs_dirent(struct sysfs_dirent *sd); - -int sysfs_create_subdir(struct kobject *kobj, const char *name, - struct sysfs_dirent **p_sd); - -int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd, - const char *new_name, const void *new_ns); - -static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd) -{ - if (sd) { - WARN_ON(!atomic_read(&sd->s_count)); - atomic_inc(&sd->s_count); - } - return sd; -} -#define sysfs_get(sd) __sysfs_get(sd) - -static inline void __sysfs_put(struct sysfs_dirent *sd) -{ - if (sd && atomic_dec_and_test(&sd->s_count)) - release_sysfs_dirent(sd); -} -#define sysfs_put(sd) __sysfs_put(sd) - -/* - * inode.c - */ -struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd); -void sysfs_evict_inode(struct inode *inode); -int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr); -int sysfs_permission(struct inode *inode, int mask); -int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); -int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat); -int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags); -int sysfs_inode_init(void); +void sysfs_warn_dup(struct kernfs_node *parent, const char *name); /* * file.c */ -extern const struct file_operations sysfs_file_operations; -extern const struct file_operations sysfs_bin_operations; - -int sysfs_add_file(struct sysfs_dirent *dir_sd, - const struct attribute *attr, int type); - -int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, - const struct attribute *attr, int type, +int sysfs_add_file(struct kernfs_node *parent, + const struct attribute *attr, bool is_bin); +int sysfs_add_file_mode_ns(struct kernfs_node *parent, + const struct attribute *attr, bool is_bin, umode_t amode, const void *ns); -void sysfs_unmap_bin_file(struct sysfs_dirent *sd); /* * symlink.c */ -extern const struct inode_operations sysfs_symlink_inode_operations; -int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target, +int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target, const char *name); + +#endif /* __SYSFS_INTERNAL_H */ diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index cc1febd..5157b86 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2118,26 +2118,10 @@ out_free: */ static void free_inodes(struct fsck_data *fsckd) { - struct rb_node *this = fsckd->inodes.rb_node; - struct fsck_inode *fscki; + struct fsck_inode *fscki, *n; - while (this) { - if (this->rb_left) - this = this->rb_left; - else if (this->rb_right) - this = this->rb_right; - else { - fscki = rb_entry(this, struct fsck_inode, rb); - this = rb_parent(this); - if (this) { - if (this->rb_left == &fscki->rb) - this->rb_left = NULL; - else - this->rb_right = NULL; - } - kfree(fscki); - } - } + rbtree_postorder_for_each_entry_safe(fscki, n, &fsckd->inodes, rb) + kfree(fscki); } /** diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c index 36bd4ef..a902c59 100644 --- a/fs/ubifs/log.c +++ b/fs/ubifs/log.c @@ -574,27 +574,10 @@ static int done_already(struct rb_root *done_tree, int lnum) */ static void destroy_done_tree(struct rb_root *done_tree) { - struct rb_node *this = done_tree->rb_node; - struct done_ref *dr; + struct done_ref *dr, *n; - while (this) { - if (this->rb_left) { - this = this->rb_left; - continue; - } else if (this->rb_right) { - this = this->rb_right; - continue; - } - dr = rb_entry(this, struct done_ref, rb); - this = rb_parent(this); - if (this) { - if (this->rb_left == &dr->rb) - this->rb_left = NULL; - else - this->rb_right = NULL; - } + rbtree_postorder_for_each_entry_safe(dr, n, done_tree, rb) kfree(dr); - } } /** diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index ba32da3..f1c3e5a1 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -815,27 +815,10 @@ static int dbg_find_check_orphan(struct rb_root *root, ino_t inum) static void dbg_free_check_tree(struct rb_root *root) { - struct rb_node *this = root->rb_node; - struct check_orphan *o; + struct check_orphan *o, *n; - while (this) { - if (this->rb_left) { - this = this->rb_left; - continue; - } else if (this->rb_right) { - this = this->rb_right; - continue; - } - o = rb_entry(this, struct check_orphan, rb); - this = rb_parent(this); - if (this) { - if (this->rb_left == &o->rb) - this->rb_left = NULL; - else - this->rb_right = NULL; - } + rbtree_postorder_for_each_entry_safe(o, n, root, rb) kfree(o); - } } static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr, diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 065096e..c14adb2 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -1335,29 +1335,14 @@ static void remove_ino(struct ubifs_info *c, ino_t inum) */ void ubifs_destroy_size_tree(struct ubifs_info *c) { - struct rb_node *this = c->size_tree.rb_node; - struct size_entry *e; + struct size_entry *e, *n; - while (this) { - if (this->rb_left) { - this = this->rb_left; - continue; - } else if (this->rb_right) { - this = this->rb_right; - continue; - } - e = rb_entry(this, struct size_entry, rb); + rbtree_postorder_for_each_entry_safe(e, n, &c->size_tree, rb) { if (e->inode) iput(e->inode); - this = rb_parent(this); - if (this) { - if (this->rb_left == &e->rb) - this->rb_left = NULL; - else - this->rb_right = NULL; - } kfree(e); } + c->size_tree = RB_ROOT; } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index f69daa5..5ded849 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -873,26 +873,10 @@ static void free_orphans(struct ubifs_info *c) */ static void free_buds(struct ubifs_info *c) { - struct rb_node *this = c->buds.rb_node; - struct ubifs_bud *bud; - - while (this) { - if (this->rb_left) - this = this->rb_left; - else if (this->rb_right) - this = this->rb_right; - else { - bud = rb_entry(this, struct ubifs_bud, rb); - this = rb_parent(this); - if (this) { - if (this->rb_left == &bud->rb) - this->rb_left = NULL; - else - this->rb_right = NULL; - } - kfree(bud); - } - } + struct ubifs_bud *bud, *n; + + rbtree_postorder_for_each_entry_safe(bud, n, &c->buds, rb) + kfree(bud); } /** diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 349f31a..9083bc7 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -178,27 +178,11 @@ static int ins_clr_old_idx_znode(struct ubifs_info *c, */ void destroy_old_idx(struct ubifs_info *c) { - struct rb_node *this = c->old_idx.rb_node; - struct ubifs_old_idx *old_idx; + struct ubifs_old_idx *old_idx, *n; - while (this) { - if (this->rb_left) { - this = this->rb_left; - continue; - } else if (this->rb_right) { - this = this->rb_right; - continue; - } - old_idx = rb_entry(this, struct ubifs_old_idx, rb); - this = rb_parent(this); - if (this) { - if (this->rb_left == &old_idx->rb) - this->rb_left = NULL; - else - this->rb_right = NULL; - } + rbtree_postorder_for_each_entry_safe(old_idx, n, &c->old_idx, rb) kfree(old_idx); - } + c->old_idx = RB_ROOT; } diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 5f6fc17..9737cba 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -1010,6 +1010,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, else udf_truncate_tail_extent(inode); mark_inode_dirty(inode); + up_write(&iinfo->i_data_sem); fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); if (!fi) @@ -1023,7 +1024,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) mark_inode_dirty(dir); - up_write(&iinfo->i_data_sem); if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); brelse(fibh.sbh); diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c deleted file mode 100644 index 9fbea87..0000000 --- a/fs/xattr_acl.c +++ /dev/null @@ -1,180 +0,0 @@ -/* - * linux/fs/xattr_acl.c - * - * Almost all from linux/fs/ext2/acl.c: - * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org> - */ - -#include <linux/export.h> -#include <linux/fs.h> -#include <linux/posix_acl_xattr.h> -#include <linux/gfp.h> -#include <linux/user_namespace.h> - -/* - * Fix up the uids and gids in posix acl extended attributes in place. - */ -static void posix_acl_fix_xattr_userns( - struct user_namespace *to, struct user_namespace *from, - void *value, size_t size) -{ - posix_acl_xattr_header *header = (posix_acl_xattr_header *)value; - posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end; - int count; - kuid_t uid; - kgid_t gid; - - if (!value) - return; - if (size < sizeof(posix_acl_xattr_header)) - return; - if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) - return; - - count = posix_acl_xattr_count(size); - if (count < 0) - return; - if (count == 0) - return; - - for (end = entry + count; entry != end; entry++) { - switch(le16_to_cpu(entry->e_tag)) { - case ACL_USER: - uid = make_kuid(from, le32_to_cpu(entry->e_id)); - entry->e_id = cpu_to_le32(from_kuid(to, uid)); - break; - case ACL_GROUP: - gid = make_kgid(from, le32_to_cpu(entry->e_id)); - entry->e_id = cpu_to_le32(from_kgid(to, gid)); - break; - default: - break; - } - } -} - -void posix_acl_fix_xattr_from_user(void *value, size_t size) -{ - struct user_namespace *user_ns = current_user_ns(); - if (user_ns == &init_user_ns) - return; - posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size); -} - -void posix_acl_fix_xattr_to_user(void *value, size_t size) -{ - struct user_namespace *user_ns = current_user_ns(); - if (user_ns == &init_user_ns) - return; - posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size); -} - -/* - * Convert from extended attribute to in-memory representation. - */ -struct posix_acl * -posix_acl_from_xattr(struct user_namespace *user_ns, - const void *value, size_t size) -{ - posix_acl_xattr_header *header = (posix_acl_xattr_header *)value; - posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end; - int count; - struct posix_acl *acl; - struct posix_acl_entry *acl_e; - - if (!value) - return NULL; - if (size < sizeof(posix_acl_xattr_header)) - return ERR_PTR(-EINVAL); - if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) - return ERR_PTR(-EOPNOTSUPP); - - count = posix_acl_xattr_count(size); - if (count < 0) - return ERR_PTR(-EINVAL); - if (count == 0) - return NULL; - - acl = posix_acl_alloc(count, GFP_NOFS); - if (!acl) - return ERR_PTR(-ENOMEM); - acl_e = acl->a_entries; - - for (end = entry + count; entry != end; acl_e++, entry++) { - acl_e->e_tag = le16_to_cpu(entry->e_tag); - acl_e->e_perm = le16_to_cpu(entry->e_perm); - - switch(acl_e->e_tag) { - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - break; - - case ACL_USER: - acl_e->e_uid = - make_kuid(user_ns, - le32_to_cpu(entry->e_id)); - if (!uid_valid(acl_e->e_uid)) - goto fail; - break; - case ACL_GROUP: - acl_e->e_gid = - make_kgid(user_ns, - le32_to_cpu(entry->e_id)); - if (!gid_valid(acl_e->e_gid)) - goto fail; - break; - - default: - goto fail; - } - } - return acl; - -fail: - posix_acl_release(acl); - return ERR_PTR(-EINVAL); -} -EXPORT_SYMBOL (posix_acl_from_xattr); - -/* - * Convert from in-memory to extended attribute representation. - */ -int -posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, - void *buffer, size_t size) -{ - posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer; - posix_acl_xattr_entry *ext_entry = ext_acl->a_entries; - int real_size, n; - - real_size = posix_acl_xattr_size(acl->a_count); - if (!buffer) - return real_size; - if (real_size > size) - return -ERANGE; - - ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); - - for (n=0; n < acl->a_count; n++, ext_entry++) { - const struct posix_acl_entry *acl_e = &acl->a_entries[n]; - ext_entry->e_tag = cpu_to_le16(acl_e->e_tag); - ext_entry->e_perm = cpu_to_le16(acl_e->e_perm); - switch(acl_e->e_tag) { - case ACL_USER: - ext_entry->e_id = - cpu_to_le32(from_kuid(user_ns, acl_e->e_uid)); - break; - case ACL_GROUP: - ext_entry->e_id = - cpu_to_le32(from_kgid(user_ns, acl_e->e_gid)); - break; - default: - ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID); - break; - } - } - return real_size; -} -EXPORT_SYMBOL (posix_acl_to_xattr); diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 370eb3e..0ecec18 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -124,16 +124,12 @@ struct posix_acl * xfs_get_acl(struct inode *inode, int type) { struct xfs_inode *ip = XFS_I(inode); - struct posix_acl *acl; + struct posix_acl *acl = NULL; struct xfs_acl *xfs_acl; unsigned char *ea_name; int error; int len; - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - trace_xfs_get_acl(ip); switch (type) { @@ -164,10 +160,8 @@ xfs_get_acl(struct inode *inode, int type) * cache entry, for any other error assume it is transient and * leave the cache entry as ACL_NOT_CACHED. */ - if (error == -ENOATTR) { - acl = NULL; + if (error == -ENOATTR) goto out_update_cache; - } goto out; } @@ -183,15 +177,12 @@ out: } STATIC int -xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) +__xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) { struct xfs_inode *ip = XFS_I(inode); unsigned char *ea_name; int error; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - switch (type) { case ACL_TYPE_ACCESS: ea_name = SGI_ACL_FILE; @@ -282,131 +273,23 @@ posix_acl_default_exists(struct inode *inode) return xfs_acl_exists(inode, SGI_ACL_DEFAULT); } -/* - * No need for i_mutex because the inode is not yet exposed to the VFS. - */ int -xfs_inherit_acl(struct inode *inode, struct posix_acl *acl) +xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - umode_t mode = inode->i_mode; - int error = 0, inherit = 0; - - if (S_ISDIR(inode->i_mode)) { - error = xfs_set_acl(inode, ACL_TYPE_DEFAULT, acl); - if (error) - goto out; - } - - error = posix_acl_create(&acl, GFP_KERNEL, &mode); - if (error < 0) - return error; - - /* - * If posix_acl_create returns a positive value we need to - * inherit a permission that can't be represented using the Unix - * mode bits and we actually need to set an ACL. - */ - if (error > 0) - inherit = 1; - - error = xfs_set_mode(inode, mode); - if (error) - goto out; - - if (inherit) - error = xfs_set_acl(inode, ACL_TYPE_ACCESS, acl); - -out: - posix_acl_release(acl); - return error; -} - -int -xfs_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; - int error; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - acl = xfs_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - - error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (error) - return error; - - error = xfs_set_acl(inode, ACL_TYPE_ACCESS, acl); - posix_acl_release(acl); - return error; -} - -static int -xfs_xattr_acl_get(struct dentry *dentry, const char *name, - void *value, size_t size, int type) -{ - struct posix_acl *acl; - int error; - - acl = xfs_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - - error = posix_acl_to_xattr(&init_user_ns, acl, value, size); - posix_acl_release(acl); - - return error; -} - -static int -xfs_xattr_acl_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - struct inode *inode = dentry->d_inode; - struct posix_acl *acl = NULL; int error = 0; - if (flags & XATTR_CREATE) - return -EINVAL; - if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) - return value ? -EACCES : 0; - if (!inode_owner_or_capable(inode)) - return -EPERM; - - if (!value) + if (!acl) goto set_acl; - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (!acl) { - /* - * acl_set_file(3) may request that we set default ACLs with - * zero length -- defend (gracefully) against that here. - */ - goto out; - } - if (IS_ERR(acl)) { - error = PTR_ERR(acl); - goto out; - } - - error = posix_acl_valid(acl); - if (error) - goto out_release; - error = -EINVAL; if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb))) - goto out_release; + return error; if (type == ACL_TYPE_ACCESS) { umode_t mode = inode->i_mode; error = posix_acl_equiv_mode(acl, &mode); if (error <= 0) { - posix_acl_release(acl); acl = NULL; if (error < 0) @@ -415,27 +298,9 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name, error = xfs_set_mode(inode, mode); if (error) - goto out_release; + return error; } set_acl: - error = xfs_set_acl(inode, type, acl); - out_release: - posix_acl_release(acl); - out: - return error; + return __xfs_set_acl(inode, type, acl); } - -const struct xattr_handler xfs_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .get = xfs_xattr_acl_get, - .set = xfs_xattr_acl_set, -}; - -const struct xattr_handler xfs_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .get = xfs_xattr_acl_get, - .set = xfs_xattr_acl_set, -}; diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 4016a56..5dc1637 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -60,20 +60,15 @@ struct xfs_acl { #ifdef CONFIG_XFS_POSIX_ACL extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); -extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl); -extern int xfs_acl_chmod(struct inode *inode); +extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int posix_acl_access_exists(struct inode *inode); extern int posix_acl_default_exists(struct inode *inode); - -extern const struct xattr_handler xfs_xattr_acl_access_handler; -extern const struct xattr_handler xfs_xattr_acl_default_handler; #else static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type) { return NULL; } -# define xfs_inherit_acl(inode, default_acl) 0 -# define xfs_acl_chmod(inode) 0 +# define xfs_set_acl NULL # define posix_acl_access_exists(inode) 0 # define posix_acl_default_exists(inode) 0 #endif /* CONFIG_XFS_POSIX_ACL */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 71c8c9d..db2cfb0 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -407,7 +407,7 @@ xfs_alloc_ioend_bio( struct bio *bio = bio_alloc(GFP_NOIO, nvecs); ASSERT(bio->bi_private == NULL); - bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; return bio; } @@ -1217,7 +1217,7 @@ __xfs_get_blocks( lockmode = XFS_ILOCK_EXCL; xfs_ilock(ip, lockmode); } else { - lockmode = xfs_ilock_map_shared(ip); + lockmode = xfs_ilock_data_map_shared(ip); } ASSERT(offset <= mp->m_super->s_maxbytes); diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index b861270..01b6a01 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -164,6 +164,7 @@ xfs_attr_get( { int error; struct xfs_name xname; + uint lock_mode; XFS_STATS_INC(xs_attr_get); @@ -174,9 +175,9 @@ xfs_attr_get( if (error) return error; - xfs_ilock(ip, XFS_ILOCK_SHARED); + lock_mode = xfs_ilock_attr_map_shared(ip); error = xfs_attr_get_int(ip, &xname, value, valuelenp, flags); - xfs_iunlock(ip, XFS_ILOCK_SHARED); + xfs_iunlock(ip, lock_mode); return(error); } diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 2d174b1..01db96f 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -507,17 +507,17 @@ xfs_attr_list_int( { int error; xfs_inode_t *dp = context->dp; + uint lock_mode; XFS_STATS_INC(xs_attr_list); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return EIO; - xfs_ilock(dp, XFS_ILOCK_SHARED); - /* * Decide on what work routines to call based on the inode size. */ + lock_mode = xfs_ilock_attr_map_shared(dp); if (!xfs_inode_hasattr(dp)) { error = 0; } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { @@ -527,9 +527,7 @@ xfs_attr_list_int( } else { error = xfs_attr_node_list(context); } - - xfs_iunlock(dp, XFS_ILOCK_SHARED); - + xfs_iunlock(dp, lock_mode); return error; } diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c index 739e0a52..5549d69 100644 --- a/fs/xfs/xfs_attr_remote.c +++ b/fs/xfs/xfs_attr_remote.c @@ -110,7 +110,7 @@ xfs_attr3_rmt_verify( if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt)) return false; if (be32_to_cpu(rmt->rm_offset) + - be32_to_cpu(rmt->rm_bytes) >= XATTR_SIZE_MAX) + be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX) return false; if (rmt->rm_owner == 0) return false; diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 3ef11b2..152543c 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -1635,7 +1635,7 @@ xfs_bmap_last_extent( * blocks at the end of the file which do not start at the previous data block, * we will try to align the new blocks at stripe unit boundaries. * - * Returns 0 in bma->aeof if the file (fork) is empty as any new write will be + * Returns 1 in bma->aeof if the file (fork) is empty as any new write will be * at, or past the EOF. */ STATIC int @@ -1650,9 +1650,14 @@ xfs_bmap_isaeof( bma->aeof = 0; error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec, &is_empty); - if (error || is_empty) + if (error) return error; + if (is_empty) { + bma->aeof = 1; + return 0; + } + /* * Check if we are allocation or past the last extent, or at least into * the last delayed allocated extent. @@ -3643,10 +3648,19 @@ xfs_bmap_btalloc( int isaligned; int tryagain; int error; + int stripe_align; ASSERT(ap->length); mp = ap->ip->i_mount; + + /* stripe alignment for allocation is determined by mount parameters */ + stripe_align = 0; + if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC)) + stripe_align = mp->m_swidth; + else if (mp->m_dalign) + stripe_align = mp->m_dalign; + align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; if (unlikely(align)) { error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, @@ -3655,6 +3669,8 @@ xfs_bmap_btalloc( ASSERT(!error); ASSERT(ap->length); } + + nullfb = *ap->firstblock == NULLFSBLOCK; fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); if (nullfb) { @@ -3730,7 +3746,7 @@ xfs_bmap_btalloc( */ if (!ap->flist->xbf_low && ap->aeof) { if (!ap->offset) { - args.alignment = mp->m_dalign; + args.alignment = stripe_align; atype = args.type; isaligned = 1; /* @@ -3755,13 +3771,13 @@ xfs_bmap_btalloc( * of minlen+alignment+slop doesn't go up * between the calls. */ - if (blen > mp->m_dalign && blen <= args.maxlen) - nextminlen = blen - mp->m_dalign; + if (blen > stripe_align && blen <= args.maxlen) + nextminlen = blen - stripe_align; else nextminlen = args.minlen; - if (nextminlen + mp->m_dalign > args.minlen + 1) + if (nextminlen + stripe_align > args.minlen + 1) args.minalignslop = - nextminlen + mp->m_dalign - + nextminlen + stripe_align - args.minlen - 1; else args.minalignslop = 0; @@ -3783,7 +3799,7 @@ xfs_bmap_btalloc( */ args.type = atype; args.fsbno = ap->blkno; - args.alignment = mp->m_dalign; + args.alignment = stripe_align; args.minlen = nextminlen; args.minalignslop = 0; isaligned = 1; @@ -3997,6 +4013,7 @@ xfs_bmapi_read( ASSERT(*nmap >= 1); ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE| XFS_BMAPI_IGSTATE))); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)); if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && @@ -4191,6 +4208,7 @@ xfs_bmapi_delay( ASSERT(*nmap >= 1); ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); ASSERT(!(flags & ~XFS_BMAPI_ENTIRE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && @@ -4484,6 +4502,7 @@ xfs_bmapi_write( ASSERT(tp != NULL); ASSERT(len > 0); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && @@ -5035,6 +5054,7 @@ xfs_bunmapi( if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(len > 0); ASSERT(nexts >= 0); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 5887e41..f264616 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -287,6 +287,7 @@ xfs_bmapi_allocate( INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker); queue_work(xfs_alloc_wq, &args->work); wait_for_completion(&done); + destroy_work_on_stack(&args->work); return args->result; } @@ -617,22 +618,27 @@ xfs_getbmap( return XFS_ERROR(ENOMEM); xfs_ilock(ip, XFS_IOLOCK_SHARED); - if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { - if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) { + if (whichfork == XFS_DATA_FORK) { + if (!(iflags & BMV_IF_DELALLOC) && + (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) { error = -filemap_write_and_wait(VFS_I(ip)->i_mapping); if (error) goto out_unlock_iolock; + + /* + * Even after flushing the inode, there can still be + * delalloc blocks on the inode beyond EOF due to + * speculative preallocation. These are not removed + * until the release function is called or the inode + * is inactivated. Hence we cannot assert here that + * ip->i_delayed_blks == 0. + */ } - /* - * even after flushing the inode, there can still be delalloc - * blocks on the inode beyond EOF due to speculative - * preallocation. These are not removed until the release - * function is called or the inode is inactivated. Hence we - * cannot assert here that ip->i_delayed_blks == 0. - */ - } - lock = xfs_ilock_map_shared(ip); + lock = xfs_ilock_data_map_shared(ip); + } else { + lock = xfs_ilock_attr_map_shared(ip); + } /* * Don't let nex be bigger than the number of extents @@ -737,7 +743,7 @@ xfs_getbmap( out_free_map: kmem_free(map); out_unlock_ilock: - xfs_iunlock_map_shared(ip, lock); + xfs_iunlock(ip, lock); out_unlock_iolock: xfs_iunlock(ip, XFS_IOLOCK_SHARED); @@ -1168,9 +1174,15 @@ xfs_zero_remaining_bytes( xfs_buf_unlock(bp); for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { + uint lock_mode; + offset_fsb = XFS_B_TO_FSBT(mp, offset); nimap = 1; + + lock_mode = xfs_ilock_data_map_shared(ip); error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0); + xfs_iunlock(ip, lock_mode); + if (error || nimap < 1) break; ASSERT(imap.br_blockcount >= 1); @@ -1187,7 +1199,12 @@ xfs_zero_remaining_bytes( XFS_BUF_UNWRITE(bp); XFS_BUF_READ(bp); XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock)); - xfsbdstrat(mp, bp); + + if (XFS_FORCED_SHUTDOWN(mp)) { + error = XFS_ERROR(EIO); + break; + } + xfs_buf_iorequest(bp); error = xfs_buf_iowait(bp); if (error) { xfs_buf_ioerror_alert(bp, @@ -1200,7 +1217,12 @@ xfs_zero_remaining_bytes( XFS_BUF_UNDONE(bp); XFS_BUF_UNREAD(bp); XFS_BUF_WRITE(bp); - xfsbdstrat(mp, bp); + + if (XFS_FORCED_SHUTDOWN(mp)) { + error = XFS_ERROR(EIO); + break; + } + xfs_buf_iorequest(bp); error = xfs_buf_iowait(bp); if (error) { xfs_buf_ioerror_alert(bp, diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index c7f0b77..9c061ef 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -445,8 +445,8 @@ _xfs_buf_find( numbytes = BBTOB(numblks); /* Check for IOs smaller than the sector size / not sector aligned */ - ASSERT(!(numbytes < (1 << btp->bt_sshift))); - ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask)); + ASSERT(!(numbytes < btp->bt_meta_sectorsize)); + ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask)); /* * Corrupted block numbers can get through to here, unfortunately, so we @@ -698,7 +698,11 @@ xfs_buf_read_uncached( bp->b_flags |= XBF_READ; bp->b_ops = ops; - xfsbdstrat(target->bt_mount, bp); + if (XFS_FORCED_SHUTDOWN(target->bt_mount)) { + xfs_buf_relse(bp); + return NULL; + } + xfs_buf_iorequest(bp); xfs_buf_iowait(bp); return bp; } @@ -1089,7 +1093,7 @@ xfs_bioerror( * This is meant for userdata errors; metadata bufs come with * iodone functions attached, so that we can track down errors. */ -STATIC int +int xfs_bioerror_relse( struct xfs_buf *bp) { @@ -1152,7 +1156,7 @@ xfs_bwrite( ASSERT(xfs_buf_islocked(bp)); bp->b_flags |= XBF_WRITE; - bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q); + bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_WRITE_FAIL); xfs_bdstrat_cb(bp); @@ -1164,25 +1168,6 @@ xfs_bwrite( return error; } -/* - * Wrapper around bdstrat so that we can stop data from going to disk in case - * we are shutting down the filesystem. Typically user data goes thru this - * path; one of the exceptions is the superblock. - */ -void -xfsbdstrat( - struct xfs_mount *mp, - struct xfs_buf *bp) -{ - if (XFS_FORCED_SHUTDOWN(mp)) { - trace_xfs_bdstrat_shut(bp, _RET_IP_); - xfs_bioerror_relse(bp); - return; - } - - xfs_buf_iorequest(bp); -} - STATIC void _xfs_buf_ioend( xfs_buf_t *bp, @@ -1255,7 +1240,7 @@ next_chunk: bio = bio_alloc(GFP_NOIO, nr_pages); bio->bi_bdev = bp->b_target->bt_bdev; - bio->bi_sector = sector; + bio->bi_iter.bi_sector = sector; bio->bi_end_io = xfs_buf_bio_end_io; bio->bi_private = bp; @@ -1277,7 +1262,7 @@ next_chunk: total_nr_pages--; } - if (likely(bio->bi_size)) { + if (likely(bio->bi_iter.bi_size)) { if (xfs_buf_is_vmapped(bp)) { flush_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); @@ -1516,6 +1501,12 @@ xfs_wait_buftarg( struct xfs_buf *bp; bp = list_first_entry(&dispose, struct xfs_buf, b_lru); list_del_init(&bp->b_lru); + if (bp->b_flags & XBF_WRITE_FAIL) { + xfs_alert(btp->bt_mount, +"Corruption Alert: Buffer at block 0x%llx had permanent write failures!\n" +"Please run xfs_repair to determine the extent of the problem.", + (long long)bp->b_bn); + } xfs_buf_rele(bp); } if (loop++ != 0) @@ -1602,16 +1593,15 @@ xfs_free_buftarg( kmem_free(btp); } -STATIC int -xfs_setsize_buftarg_flags( +int +xfs_setsize_buftarg( xfs_buftarg_t *btp, unsigned int blocksize, - unsigned int sectorsize, - int verbose) + unsigned int sectorsize) { - btp->bt_bsize = blocksize; - btp->bt_sshift = ffs(sectorsize) - 1; - btp->bt_smask = sectorsize - 1; + /* Set up metadata sector size info */ + btp->bt_meta_sectorsize = sectorsize; + btp->bt_meta_sectormask = sectorsize - 1; if (set_blocksize(btp->bt_bdev, sectorsize)) { char name[BDEVNAME_SIZE]; @@ -1624,30 +1614,25 @@ xfs_setsize_buftarg_flags( return EINVAL; } + /* Set up device logical sector size mask */ + btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev); + btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1; + return 0; } /* - * When allocating the initial buffer target we have not yet - * read in the superblock, so don't know what sized sectors - * are being used at this early stage. Play safe. + * When allocating the initial buffer target we have not yet + * read in the superblock, so don't know what sized sectors + * are being used at this early stage. Play safe. */ STATIC int xfs_setsize_buftarg_early( xfs_buftarg_t *btp, struct block_device *bdev) { - return xfs_setsize_buftarg_flags(btp, - PAGE_SIZE, bdev_logical_block_size(bdev), 0); -} - -int -xfs_setsize_buftarg( - xfs_buftarg_t *btp, - unsigned int blocksize, - unsigned int sectorsize) -{ - return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1); + return xfs_setsize_buftarg(btp, PAGE_SIZE, + bdev_logical_block_size(bdev)); } xfs_buftarg_t * @@ -1799,7 +1784,7 @@ __xfs_buf_delwri_submit( blk_start_plug(&plug); list_for_each_entry_safe(bp, n, io_list, b_list) { - bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC); + bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL); bp->b_flags |= XBF_WRITE; if (!wait) { diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index e656833..9953395 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -45,6 +45,7 @@ typedef enum { #define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ #define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ #define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ +#define XBF_WRITE_FAIL (1 << 24)/* async writes have failed on this buffer */ /* I/O hints for the BIO layer */ #define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */ @@ -70,6 +71,7 @@ typedef unsigned int xfs_buf_flags_t; { XBF_ASYNC, "ASYNC" }, \ { XBF_DONE, "DONE" }, \ { XBF_STALE, "STALE" }, \ + { XBF_WRITE_FAIL, "WRITE_FAIL" }, \ { XBF_SYNCIO, "SYNCIO" }, \ { XBF_FUA, "FUA" }, \ { XBF_FLUSH, "FLUSH" }, \ @@ -80,19 +82,34 @@ typedef unsigned int xfs_buf_flags_t; { _XBF_DELWRI_Q, "DELWRI_Q" }, \ { _XBF_COMPOUND, "COMPOUND" } + /* * Internal state flags. */ #define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */ +/* + * The xfs_buftarg contains 2 notions of "sector size" - + * + * 1) The metadata sector size, which is the minimum unit and + * alignment of IO which will be performed by metadata operations. + * 2) The device logical sector size + * + * The first is specified at mkfs time, and is stored on-disk in the + * superblock's sb_sectsize. + * + * The latter is derived from the underlying device, and controls direct IO + * alignment constraints. + */ typedef struct xfs_buftarg { dev_t bt_dev; struct block_device *bt_bdev; struct backing_dev_info *bt_bdi; struct xfs_mount *bt_mount; - unsigned int bt_bsize; - unsigned int bt_sshift; - size_t bt_smask; + unsigned int bt_meta_sectorsize; + size_t bt_meta_sectormask; + size_t bt_logical_sectorsize; + size_t bt_logical_sectormask; /* LRU control structures */ struct shrinker bt_shrinker; @@ -269,9 +286,6 @@ extern void xfs_buf_unlock(xfs_buf_t *); /* Buffer Read and Write Routines */ extern int xfs_bwrite(struct xfs_buf *bp); - -extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *); - extern void xfs_buf_ioend(xfs_buf_t *, int); extern void xfs_buf_ioerror(xfs_buf_t *, int); extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func); @@ -282,6 +296,8 @@ extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, #define xfs_buf_zero(bp, off, len) \ xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) +extern int xfs_bioerror_relse(struct xfs_buf *); + static inline int xfs_buf_geterror(xfs_buf_t *bp) { return bp ? bp->b_error : ENOMEM; @@ -301,7 +317,8 @@ extern void xfs_buf_terminate(void); #define XFS_BUF_ZEROFLAGS(bp) \ ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC| \ - XBF_SYNCIO|XBF_FUA|XBF_FLUSH)) + XBF_SYNCIO|XBF_FUA|XBF_FLUSH| \ + XBF_WRITE_FAIL)) void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index a64f67b..3314911 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -182,21 +182,47 @@ xfs_buf_item_size( trace_xfs_buf_item_size(bip); } -static struct xfs_log_iovec * +static inline void +xfs_buf_item_copy_iovec( + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp, + struct xfs_buf *bp, + uint offset, + int first_bit, + uint nbits) +{ + offset += first_bit * XFS_BLF_CHUNK; + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BCHUNK, + xfs_buf_offset(bp, offset), + nbits * XFS_BLF_CHUNK); +} + +static inline bool +xfs_buf_item_straddle( + struct xfs_buf *bp, + uint offset, + int next_bit, + int last_bit) +{ + return xfs_buf_offset(bp, offset + (next_bit << XFS_BLF_SHIFT)) != + (xfs_buf_offset(bp, offset + (last_bit << XFS_BLF_SHIFT)) + + XFS_BLF_CHUNK); +} + +static void xfs_buf_item_format_segment( struct xfs_buf_log_item *bip, - struct xfs_log_iovec *vecp, + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp, uint offset, struct xfs_buf_log_format *blfp) { struct xfs_buf *bp = bip->bli_buf; uint base_size; - uint nvecs; int first_bit; int last_bit; int next_bit; uint nbits; - uint buffer_offset; /* copy the flags across from the base format item */ blfp->blf_flags = bip->__bli_format.blf_flags; @@ -208,21 +234,17 @@ xfs_buf_item_format_segment( */ base_size = xfs_buf_log_format_size(blfp); - nvecs = 0; first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) { /* * If the map is not be dirty in the transaction, mark * the size as zero and do not advance the vector pointer. */ - goto out; + return; } - vecp->i_addr = blfp; - vecp->i_len = base_size; - vecp->i_type = XLOG_REG_TYPE_BFORMAT; - vecp++; - nvecs = 1; + blfp = xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size); + blfp->blf_size = 1; if (bip->bli_flags & XFS_BLI_STALE) { /* @@ -232,14 +254,13 @@ xfs_buf_item_format_segment( */ trace_xfs_buf_item_format_stale(bip); ASSERT(blfp->blf_flags & XFS_BLF_CANCEL); - goto out; + return; } /* * Fill in an iovec for each set of contiguous chunks. */ - last_bit = first_bit; nbits = 1; for (;;) { @@ -252,42 +273,22 @@ xfs_buf_item_format_segment( next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, (uint)last_bit + 1); /* - * If we run out of bits fill in the last iovec and get - * out of the loop. - * Else if we start a new set of bits then fill in the - * iovec for the series we were looking at and start - * counting the bits in the new one. - * Else we're still in the same set of bits so just - * keep counting and scanning. + * If we run out of bits fill in the last iovec and get out of + * the loop. Else if we start a new set of bits then fill in + * the iovec for the series we were looking at and start + * counting the bits in the new one. Else we're still in the + * same set of bits so just keep counting and scanning. */ if (next_bit == -1) { - buffer_offset = offset + first_bit * XFS_BLF_CHUNK; - vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLF_CHUNK; - vecp->i_type = XLOG_REG_TYPE_BCHUNK; - nvecs++; + xfs_buf_item_copy_iovec(lv, vecp, bp, offset, + first_bit, nbits); + blfp->blf_size++; break; - } else if (next_bit != last_bit + 1) { - buffer_offset = offset + first_bit * XFS_BLF_CHUNK; - vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLF_CHUNK; - vecp->i_type = XLOG_REG_TYPE_BCHUNK; - nvecs++; - vecp++; - first_bit = next_bit; - last_bit = next_bit; - nbits = 1; - } else if (xfs_buf_offset(bp, offset + - (next_bit << XFS_BLF_SHIFT)) != - (xfs_buf_offset(bp, offset + - (last_bit << XFS_BLF_SHIFT)) + - XFS_BLF_CHUNK)) { - buffer_offset = offset + first_bit * XFS_BLF_CHUNK; - vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLF_CHUNK; - vecp->i_type = XLOG_REG_TYPE_BCHUNK; - nvecs++; - vecp++; + } else if (next_bit != last_bit + 1 || + xfs_buf_item_straddle(bp, offset, next_bit, last_bit)) { + xfs_buf_item_copy_iovec(lv, vecp, bp, offset, + first_bit, nbits); + blfp->blf_size++; first_bit = next_bit; last_bit = next_bit; nbits = 1; @@ -296,9 +297,6 @@ xfs_buf_item_format_segment( nbits++; } } -out: - blfp->blf_size = nvecs; - return vecp; } /* @@ -310,10 +308,11 @@ out: STATIC void xfs_buf_item_format( struct xfs_log_item *lip, - struct xfs_log_iovec *vecp) + struct xfs_log_vec *lv) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; + struct xfs_log_iovec *vecp = NULL; uint offset = 0; int i; @@ -354,8 +353,8 @@ xfs_buf_item_format( } for (i = 0; i < bip->bli_format_count; i++) { - vecp = xfs_buf_item_format_segment(bip, vecp, offset, - &bip->bli_formats[i]); + xfs_buf_item_format_segment(bip, lv, &vecp, offset, + &bip->bli_formats[i]); offset += bp->b_maps[i].bm_len; } @@ -496,6 +495,14 @@ xfs_buf_item_unpin( } } +/* + * Buffer IO error rate limiting. Limit it to no more than 10 messages per 30 + * seconds so as to not spam logs too much on repeated detection of the same + * buffer being bad.. + */ + +DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10); + STATIC uint xfs_buf_item_push( struct xfs_log_item *lip, @@ -524,6 +531,14 @@ xfs_buf_item_push( trace_xfs_buf_item_push(bip); + /* has a previous flush failed due to IO errors? */ + if ((bp->b_flags & XBF_WRITE_FAIL) && + ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS:")) { + xfs_warn(bp->b_target->bt_mount, +"Detected failing async write on buffer block 0x%llx. Retrying async write.\n", + (long long)bp->b_bn); + } + if (!xfs_buf_delwri_queue(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; xfs_buf_unlock(bp); @@ -1096,8 +1111,9 @@ xfs_buf_iodone_callbacks( xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */ - if (!XFS_BUF_ISSTALE(bp)) { - bp->b_flags |= XBF_WRITE | XBF_ASYNC | XBF_DONE; + if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) { + bp->b_flags |= XBF_WRITE | XBF_ASYNC | + XBF_DONE | XBF_WRITE_FAIL; xfs_buf_iorequest(bp); } else { xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 56369d4..48c7d18 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -2067,12 +2067,12 @@ xfs_dir2_node_lookup( */ int /* error */ xfs_dir2_node_removename( - xfs_da_args_t *args) /* operation arguments */ + struct xfs_da_args *args) /* operation arguments */ { - xfs_da_state_blk_t *blk; /* leaf block */ + struct xfs_da_state_blk *blk; /* leaf block */ int error; /* error return value */ int rval; /* operation return value */ - xfs_da_state_t *state; /* btree cursor */ + struct xfs_da_state *state; /* btree cursor */ trace_xfs_dir2_node_removename(args); @@ -2084,19 +2084,18 @@ xfs_dir2_node_removename( state->mp = args->dp->i_mount; state->blocksize = state->mp->m_dirblksize; state->node_ents = state->mp->m_dir_node_ents; - /* - * Look up the entry we're deleting, set up the cursor. - */ + + /* Look up the entry we're deleting, set up the cursor. */ error = xfs_da3_node_lookup_int(state, &rval); if (error) - rval = error; - /* - * Didn't find it, upper layer screwed up. - */ + goto out_free; + + /* Didn't find it, upper layer screwed up. */ if (rval != EEXIST) { - xfs_da_state_free(state); - return rval; + error = rval; + goto out_free; } + blk = &state->path.blk[state->path.active - 1]; ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); ASSERT(state->extravalid); @@ -2107,7 +2106,7 @@ xfs_dir2_node_removename( error = xfs_dir2_leafn_remove(args, blk->bp, blk->index, &state->extrablk, &rval); if (error) - return error; + goto out_free; /* * Fix the hash values up the btree. */ @@ -2122,6 +2121,7 @@ xfs_dir2_node_removename( */ if (!error) error = xfs_dir2_node_to_leaf(state); +out_free: xfs_da_state_free(state); return error; } diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index c4e50c6..aead369 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -674,6 +674,7 @@ xfs_readdir( { int rval; /* return value */ int v; /* type-checking value */ + uint lock_mode; trace_xfs_readdir(dp); @@ -683,6 +684,7 @@ xfs_readdir( ASSERT(S_ISDIR(dp->i_d.di_mode)); XFS_STATS_INC(xs_dir_getdents); + lock_mode = xfs_ilock_data_map_shared(dp); if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) rval = xfs_dir2_sf_getdents(dp, ctx); else if ((rval = xfs_dir2_isblock(NULL, dp, &v))) @@ -691,5 +693,7 @@ xfs_readdir( rval = xfs_dir2_block_getdents(dp, ctx); else rval = xfs_dir2_leaf_getdents(dp, ctx, bufsize); + xfs_iunlock(dp, lock_mode); + return rval; } diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index aafc6e4..3725fb1 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c @@ -170,6 +170,7 @@ xfs_dir2_block_to_sf( char *ptr; /* current data pointer */ xfs_dir2_sf_entry_t *sfep; /* shortform entry */ xfs_dir2_sf_hdr_t *sfp; /* shortform directory header */ + xfs_dir2_sf_hdr_t *dst; /* temporary data buffer */ trace_xfs_dir2_block_to_sf(args); @@ -177,35 +178,20 @@ xfs_dir2_block_to_sf( mp = dp->i_mount; /* - * Make a copy of the block data, so we can shrink the inode - * and add local data. + * allocate a temporary destination buffer the size of the inode + * to format the data into. Once we have formatted the data, we + * can free the block and copy the formatted data into the inode literal + * area. */ - hdr = kmem_alloc(mp->m_dirblksize, KM_SLEEP); - memcpy(hdr, bp->b_addr, mp->m_dirblksize); - logflags = XFS_ILOG_CORE; - if ((error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp))) { - ASSERT(error != ENOSPC); - goto out; - } + dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP); + hdr = bp->b_addr; /* - * The buffer is now unconditionally gone, whether - * xfs_dir2_shrink_inode worked or not. - * - * Convert the inode to local format. - */ - dp->i_df.if_flags &= ~XFS_IFEXTENTS; - dp->i_df.if_flags |= XFS_IFINLINE; - dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; - ASSERT(dp->i_df.if_bytes == 0); - xfs_idata_realloc(dp, size, XFS_DATA_FORK); - logflags |= XFS_ILOG_DDATA; - /* * Copy the header into the newly allocate local space. */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = (xfs_dir2_sf_hdr_t *)dst; memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count)); - dp->i_d.di_size = size; + /* * Set up to loop over the block's entries. */ @@ -258,10 +244,34 @@ xfs_dir2_block_to_sf( ptr += dp->d_ops->data_entsize(dep->namelen); } ASSERT((char *)sfep - (char *)sfp == size); + + /* now we are done with the block, we can shrink the inode */ + logflags = XFS_ILOG_CORE; + error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp); + if (error) { + ASSERT(error != ENOSPC); + goto out; + } + + /* + * The buffer is now unconditionally gone, whether + * xfs_dir2_shrink_inode worked or not. + * + * Convert the inode to local format and copy the data in. + */ + dp->i_df.if_flags &= ~XFS_IFEXTENTS; + dp->i_df.if_flags |= XFS_IFINLINE; + dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; + ASSERT(dp->i_df.if_bytes == 0); + xfs_idata_realloc(dp, size, XFS_DATA_FORK); + + logflags |= XFS_ILOG_DDATA; + memcpy(dp->i_df.if_u1.if_data, dst, size); + dp->i_d.di_size = size; xfs_dir2_sf_check(args); out: xfs_trans_log_inode(args->trans, dp, logflags); - kmem_free(hdr); + kmem_free(dst); return error; } diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 8367d6d..4f11ef0 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -157,7 +157,7 @@ xfs_ioc_trim( struct xfs_mount *mp, struct fstrim_range __user *urange) { - struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue; + struct request_queue *q = bdev_get_queue(mp->m_ddev_targp->bt_bdev); unsigned int granularity = q->limits.discard_granularity; struct fstrim_range range; xfs_daddr_t start, end, minlen; @@ -180,7 +180,8 @@ xfs_ioc_trim( * matter as trimming blocks is an advisory interface. */ if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) || - range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp))) + range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)) || + range.len < mp->m_sb.sb_blocksize) return -XFS_ERROR(EINVAL); start = BTOBB(range.start); diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 6b1e695..7aeb4c8 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -469,16 +469,17 @@ xfs_qm_dqtobp( struct xfs_mount *mp = dqp->q_mount; xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); struct xfs_trans *tp = (tpp ? *tpp : NULL); + uint lock_mode; dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; - xfs_ilock(quotip, XFS_ILOCK_SHARED); + lock_mode = xfs_ilock_data_map_shared(quotip); if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) { /* * Return if this type of quotas is turned off while we * didn't have the quota inode lock. */ - xfs_iunlock(quotip, XFS_ILOCK_SHARED); + xfs_iunlock(quotip, lock_mode); return ESRCH; } @@ -488,7 +489,7 @@ xfs_qm_dqtobp( error = xfs_bmapi_read(quotip, dqp->q_fileoffset, XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0); - xfs_iunlock(quotip, XFS_ILOCK_SHARED); + xfs_iunlock(quotip, lock_mode); if (error) return error; diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 92e5f62..f33fbaa 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -57,20 +57,24 @@ xfs_qm_dquot_logitem_size( STATIC void xfs_qm_dquot_logitem_format( struct xfs_log_item *lip, - struct xfs_log_iovec *logvec) + struct xfs_log_vec *lv) { struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); - - logvec->i_addr = &qlip->qli_format; - logvec->i_len = sizeof(xfs_dq_logformat_t); - logvec->i_type = XLOG_REG_TYPE_QFORMAT; - logvec++; - logvec->i_addr = &qlip->qli_dquot->q_core; - logvec->i_len = sizeof(xfs_disk_dquot_t); - logvec->i_type = XLOG_REG_TYPE_DQUOT; - - qlip->qli_format.qlf_size = 2; - + struct xfs_log_iovec *vecp = NULL; + struct xfs_dq_logformat *qlf; + + qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QFORMAT); + qlf->qlf_type = XFS_LI_DQUOT; + qlf->qlf_size = 2; + qlf->qlf_id = be32_to_cpu(qlip->qli_dquot->q_core.d_id); + qlf->qlf_blkno = qlip->qli_dquot->q_blkno; + qlf->qlf_len = 1; + qlf->qlf_boffset = qlip->qli_dquot->q_bufoffset; + xlog_finish_iovec(lv, vecp, sizeof(struct xfs_dq_logformat)); + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_DQUOT, + &qlip->qli_dquot->q_core, + sizeof(struct xfs_disk_dquot)); } /* @@ -257,18 +261,6 @@ xfs_qm_dquot_logitem_init( xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT, &xfs_dquot_item_ops); lp->qli_dquot = dqp; - lp->qli_format.qlf_type = XFS_LI_DQUOT; - lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id); - lp->qli_format.qlf_blkno = dqp->q_blkno; - lp->qli_format.qlf_len = 1; - /* - * This is just the offset of this dquot within its buffer - * (which is currently 1 FSB and probably won't change). - * Hence 32 bits for this offset should be just fine. - * Alternatively, we can store (bufoffset / sizeof(xfs_dqblk_t)) - * here, and recompute it at recovery time. - */ - lp->qli_format.qlf_boffset = (__uint32_t)dqp->q_bufoffset; } /*------------------ QUOTAOFF LOG ITEMS -------------------*/ @@ -294,26 +286,20 @@ xfs_qm_qoff_logitem_size( *nbytes += sizeof(struct xfs_qoff_logitem); } -/* - * This is called to fill in the vector of log iovecs for the - * given quotaoff log item. We use only 1 iovec, and we point that - * at the quotaoff_log_format structure embedded in the quotaoff item. - * It is at this point that we assert that all of the extent - * slots in the quotaoff item have been filled. - */ STATIC void xfs_qm_qoff_logitem_format( struct xfs_log_item *lip, - struct xfs_log_iovec *log_vector) + struct xfs_log_vec *lv) { struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip); - - ASSERT(qflip->qql_format.qf_type == XFS_LI_QUOTAOFF); - - log_vector->i_addr = &qflip->qql_format; - log_vector->i_len = sizeof(xfs_qoff_logitem_t); - log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF; - qflip->qql_format.qf_size = 1; + struct xfs_log_iovec *vecp = NULL; + struct xfs_qoff_logformat *qlf; + + qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QUOTAOFF); + qlf->qf_type = XFS_LI_QUOTAOFF; + qlf->qf_size = 1; + qlf->qf_flags = qflip->qql_flags; + xlog_finish_iovec(lv, vecp, sizeof(struct xfs_qoff_logitem)); } /* @@ -453,8 +439,7 @@ xfs_qm_qoff_logitem_init( xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ? &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops); qf->qql_item.li_mountp = mp; - qf->qql_format.qf_type = XFS_LI_QUOTAOFF; - qf->qql_format.qf_flags = flags; qf->qql_start_lip = start; + qf->qql_flags = flags; return qf; } diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h index 5acae2a..502e946 100644 --- a/fs/xfs/xfs_dquot_item.h +++ b/fs/xfs/xfs_dquot_item.h @@ -27,13 +27,12 @@ typedef struct xfs_dq_logitem { xfs_log_item_t qli_item; /* common portion */ struct xfs_dquot *qli_dquot; /* dquot ptr */ xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ - xfs_dq_logformat_t qli_format; /* logged structure */ } xfs_dq_logitem_t; typedef struct xfs_qoff_logitem { xfs_log_item_t qql_item; /* common portion */ struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */ - xfs_qoff_logformat_t qql_format; /* logged structure */ + unsigned int qql_flags; } xfs_qoff_logitem_t; diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 3680d04..fb7a4c1 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -26,6 +26,7 @@ #include "xfs_trans_priv.h" #include "xfs_buf_item.h" #include "xfs_extfree_item.h" +#include "xfs_log.h" kmem_zone_t *xfs_efi_zone; @@ -101,9 +102,10 @@ xfs_efi_item_size( STATIC void xfs_efi_item_format( struct xfs_log_item *lip, - struct xfs_log_iovec *log_vector) + struct xfs_log_vec *lv) { struct xfs_efi_log_item *efip = EFI_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; ASSERT(atomic_read(&efip->efi_next_extent) == efip->efi_format.efi_nextents); @@ -111,10 +113,9 @@ xfs_efi_item_format( efip->efi_format.efi_type = XFS_LI_EFI; efip->efi_format.efi_size = 1; - log_vector->i_addr = &efip->efi_format; - log_vector->i_len = xfs_efi_item_sizeof(efip); - log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT; - ASSERT(log_vector->i_len >= sizeof(xfs_efi_log_format_t)); + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFI_FORMAT, + &efip->efi_format, + xfs_efi_item_sizeof(efip)); } @@ -368,19 +369,19 @@ xfs_efd_item_size( STATIC void xfs_efd_item_format( struct xfs_log_item *lip, - struct xfs_log_iovec *log_vector) + struct xfs_log_vec *lv) { struct xfs_efd_log_item *efdp = EFD_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents); efdp->efd_format.efd_type = XFS_LI_EFD; efdp->efd_format.efd_size = 1; - log_vector->i_addr = &efdp->efd_format; - log_vector->i_len = xfs_efd_item_sizeof(efdp); - log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT; - ASSERT(log_vector->i_len >= sizeof(xfs_efd_log_format_t)); + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFD_FORMAT, + &efdp->efd_format, + xfs_efd_item_sizeof(efdp)); } /* diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 52c91e1..2e7989e 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -261,7 +261,8 @@ xfs_file_aio_read( xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; - if ((pos & target->bt_smask) || (size & target->bt_smask)) { + /* DIO must be aligned to device logical sector size */ + if ((pos | size) & target->bt_logical_sectormask) { if (pos == i_size_read(inode)) return 0; return -XFS_ERROR(EINVAL); @@ -641,9 +642,11 @@ xfs_file_dio_aio_write( struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; - if ((pos & target->bt_smask) || (count & target->bt_smask)) + /* DIO must be aligned to device logical sector size */ + if ((pos | count) & target->bt_logical_sectormask) return -XFS_ERROR(EINVAL); + /* "unaligned" here means not aligned to a filesystem block */ if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) unaligned_io = 1; @@ -912,7 +915,7 @@ xfs_dir_open( * If there are any blocks, read-ahead block 0 as we're almost * certain to have the next operation be a read there. */ - mode = xfs_ilock_map_shared(ip); + mode = xfs_ilock_data_map_shared(ip); if (ip->i_d.di_nextents > 0) xfs_dir3_data_readahead(NULL, ip, 0, -1); xfs_iunlock(ip, mode); @@ -1215,7 +1218,7 @@ xfs_seek_data( uint lock; int error; - lock = xfs_ilock_map_shared(ip); + lock = xfs_ilock_data_map_shared(ip); isize = i_size_read(inode); if (start >= isize) { @@ -1294,7 +1297,7 @@ out: offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out_unlock: - xfs_iunlock_map_shared(ip, lock); + xfs_iunlock(ip, lock); if (error) return -error; @@ -1319,7 +1322,7 @@ xfs_seek_hole( if (XFS_FORCED_SHUTDOWN(mp)) return -XFS_ERROR(EIO); - lock = xfs_ilock_map_shared(ip); + lock = xfs_ilock_data_map_shared(ip); isize = i_size_read(inode); if (start >= isize) { @@ -1402,7 +1405,7 @@ out: offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out_unlock: - xfs_iunlock_map_shared(ip, lock); + xfs_iunlock(ip, lock); if (error) return -error; diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index a6e54b3..02fb943 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -220,6 +220,8 @@ xfs_growfs_data_private( */ nfree = 0; for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) { + __be32 *agfl_bno; + /* * AG freespace header block */ @@ -279,8 +281,10 @@ xfs_growfs_data_private( agfl->agfl_seqno = cpu_to_be32(agno); uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_uuid); } + + agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp); for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++) - agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); + agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); error = xfs_bwrite(bp); xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index e87719c..5d7f105 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -52,7 +52,7 @@ xfs_ialloc_cluster_alignment( { if (xfs_sb_version_hasalign(&args->mp->m_sb) && args->mp->m_sb.sb_inoalignmt >= - XFS_B_TO_FSBT(args->mp, XFS_INODE_CLUSTER_SIZE(args->mp))) + XFS_B_TO_FSBT(args->mp, args->mp->m_inode_cluster_size)) return args->mp->m_sb.sb_inoalignmt; return 1; } @@ -170,27 +170,20 @@ xfs_ialloc_inode_init( { struct xfs_buf *fbuf; struct xfs_dinode *free; - int blks_per_cluster, nbufs, ninodes; + int nbufs, blks_per_cluster, inodes_per_cluster; int version; int i, j; xfs_daddr_t d; xfs_ino_t ino = 0; /* - * Loop over the new block(s), filling in the inodes. - * For small block sizes, manipulate the inodes in buffers - * which are multiples of the blocks size. + * Loop over the new block(s), filling in the inodes. For small block + * sizes, manipulate the inodes in buffers which are multiples of the + * blocks size. */ - if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { - blks_per_cluster = 1; - nbufs = length; - ninodes = mp->m_sb.sb_inopblock; - } else { - blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) / - mp->m_sb.sb_blocksize; - nbufs = length / blks_per_cluster; - ninodes = blks_per_cluster * mp->m_sb.sb_inopblock; - } + blks_per_cluster = xfs_icluster_size_fsb(mp); + inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; + nbufs = length / blks_per_cluster; /* * Figure out what version number to use in the inodes we create. If @@ -225,7 +218,7 @@ xfs_ialloc_inode_init( * they track in the AIL as if they were physically logged. */ if (tp) - xfs_icreate_log(tp, agno, agbno, XFS_IALLOC_INODES(mp), + xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos, mp->m_sb.sb_inodesize, length, gen); } else if (xfs_sb_version_hasnlink(&mp->m_sb)) version = 2; @@ -246,7 +239,7 @@ xfs_ialloc_inode_init( /* Initialize the inode buffers and log them appropriately. */ fbuf->b_ops = &xfs_inode_buf_ops; xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); - for (i = 0; i < ninodes; i++) { + for (i = 0; i < inodes_per_cluster; i++) { int ioffset = i << mp->m_sb.sb_inodelog; uint isize = xfs_dinode_size(version); @@ -329,11 +322,11 @@ xfs_ialloc_ag_alloc( * Locking will ensure that we don't have two callers in here * at one time. */ - newlen = XFS_IALLOC_INODES(args.mp); + newlen = args.mp->m_ialloc_inos; if (args.mp->m_maxicount && args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount) return XFS_ERROR(ENOSPC); - args.minlen = args.maxlen = XFS_IALLOC_BLOCKS(args.mp); + args.minlen = args.maxlen = args.mp->m_ialloc_blks; /* * First try to allocate inodes contiguous with the last-allocated * chunk of inodes. If the filesystem is striped, this will fill @@ -343,7 +336,7 @@ xfs_ialloc_ag_alloc( newino = be32_to_cpu(agi->agi_newino); agno = be32_to_cpu(agi->agi_seqno); args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + - XFS_IALLOC_BLOCKS(args.mp); + args.mp->m_ialloc_blks; if (likely(newino != NULLAGINO && (args.agbno < be32_to_cpu(agi->agi_length)))) { args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); @@ -585,7 +578,7 @@ xfs_ialloc_ag_select( * Is there enough free space for the file plus a block of * inodes? (if we need to allocate some)? */ - ineed = XFS_IALLOC_BLOCKS(mp); + ineed = mp->m_ialloc_blks; longest = pag->pagf_longest; if (!longest) longest = pag->pagf_flcount > 0; @@ -999,7 +992,7 @@ xfs_dialloc( * inode. */ if (mp->m_maxicount && - mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) { + mp->m_sb.sb_icount + mp->m_ialloc_inos > mp->m_maxicount) { noroom = 1; okalloc = 0; } @@ -1202,7 +1195,7 @@ xfs_difree( * When an inode cluster is free, it becomes eligible for removal */ if (!(mp->m_flags & XFS_MOUNT_IKEEP) && - (rec.ir_freecount == XFS_IALLOC_INODES(mp))) { + (rec.ir_freecount == mp->m_ialloc_inos)) { *delete = 1; *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); @@ -1212,7 +1205,7 @@ xfs_difree( * AGI and Superblock inode counts, and mark the disk space * to be freed when the transaction is committed. */ - ilen = XFS_IALLOC_INODES(mp); + ilen = mp->m_ialloc_inos; be32_add_cpu(&agi->agi_count, -ilen); be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); @@ -1228,9 +1221,9 @@ xfs_difree( goto error0; } - xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, - agno, XFS_INO_TO_AGBNO(mp,rec.ir_startino)), - XFS_IALLOC_BLOCKS(mp), flist, mp); + xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, + XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)), + mp->m_ialloc_blks, flist, mp); } else { *delete = 0; @@ -1311,7 +1304,7 @@ xfs_imap_lookup( /* check that the returned record contains the required inode */ if (rec.ir_startino > agino || - rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino) + rec.ir_startino + mp->m_ialloc_inos <= agino) return EINVAL; /* for untrusted inodes check it is allocated first */ @@ -1384,7 +1377,7 @@ xfs_imap( return XFS_ERROR(EINVAL); } - blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog; + blks_per_cluster = xfs_icluster_size_fsb(mp); /* * For bulkstat and handle lookups, we have an untrusted inode number @@ -1405,7 +1398,7 @@ xfs_imap( * If the inode cluster size is the same as the blocksize or * smaller we get to the buffer by simple arithmetics. */ - if (XFS_INODE_CLUSTER_SIZE(mp) <= mp->m_sb.sb_blocksize) { + if (blks_per_cluster == 1) { offset = XFS_INO_TO_OFFSET(mp, ino); ASSERT(offset < mp->m_sb.sb_inopblock); diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index a8f76a5..812365d 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -25,17 +25,18 @@ struct xfs_mount; struct xfs_trans; struct xfs_btree_cur; -/* - * Allocation parameters for inode allocation. - */ -#define XFS_IALLOC_INODES(mp) (mp)->m_ialloc_inos -#define XFS_IALLOC_BLOCKS(mp) (mp)->m_ialloc_blks - -/* - * Move inodes in clusters of this size. - */ +/* Move inodes in clusters of this size */ #define XFS_INODE_BIG_CLUSTER_SIZE 8192 -#define XFS_INODE_CLUSTER_SIZE(mp) (mp)->m_inode_cluster_size + +/* Calculate and return the number of filesystem blocks per inode cluster */ +static inline int +xfs_icluster_size_fsb( + struct xfs_mount *mp) +{ + if (mp->m_sb.sb_blocksize >= mp->m_inode_cluster_size) + return 1; + return mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog; +} /* * Make an inode pointer out of the buffer/offset. diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index d2eaccf..7e45492 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -28,6 +28,7 @@ #include "xfs_trans_priv.h" #include "xfs_error.h" #include "xfs_icreate_item.h" +#include "xfs_log.h" kmem_zone_t *xfs_icreate_zone; /* inode create item zone */ @@ -58,13 +59,14 @@ xfs_icreate_item_size( STATIC void xfs_icreate_item_format( struct xfs_log_item *lip, - struct xfs_log_iovec *log_vector) + struct xfs_log_vec *lv) { struct xfs_icreate_item *icp = ICR_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; - log_vector->i_addr = (xfs_caddr_t)&icp->ic_format; - log_vector->i_len = sizeof(struct xfs_icreate_log); - log_vector->i_type = XLOG_REG_TYPE_ICREATE; + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICREATE, + &icp->ic_format, + sizeof(struct xfs_icreate_log)); } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 001aa89..3a137e9 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -77,48 +77,44 @@ xfs_get_extsz_hint( } /* - * This is a wrapper routine around the xfs_ilock() routine used to centralize - * some grungy code. It is used in places that wish to lock the inode solely - * for reading the extents. The reason these places can't just call - * xfs_ilock(SHARED) is that the inode lock also guards to bringing in of the - * extents from disk for a file in b-tree format. If the inode is in b-tree - * format, then we need to lock the inode exclusively until the extents are read - * in. Locking it exclusively all the time would limit our parallelism - * unnecessarily, though. What we do instead is check to see if the extents - * have been read in yet, and only lock the inode exclusively if they have not. + * These two are wrapper routines around the xfs_ilock() routine used to + * centralize some grungy code. They are used in places that wish to lock the + * inode solely for reading the extents. The reason these places can't just + * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards to + * bringing in of the extents from disk for a file in b-tree format. If the + * inode is in b-tree format, then we need to lock the inode exclusively until + * the extents are read in. Locking it exclusively all the time would limit + * our parallelism unnecessarily, though. What we do instead is check to see + * if the extents have been read in yet, and only lock the inode exclusively + * if they have not. * - * The function returns a value which should be given to the corresponding - * xfs_iunlock_map_shared(). This value is the mode in which the lock was - * actually taken. + * The functions return a value which should be given to the corresponding + * xfs_iunlock() call. */ uint -xfs_ilock_map_shared( - xfs_inode_t *ip) +xfs_ilock_data_map_shared( + struct xfs_inode *ip) { - uint lock_mode; + uint lock_mode = XFS_ILOCK_SHARED; - if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) && - ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) { + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && + (ip->i_df.if_flags & XFS_IFEXTENTS) == 0) lock_mode = XFS_ILOCK_EXCL; - } else { - lock_mode = XFS_ILOCK_SHARED; - } - xfs_ilock(ip, lock_mode); - return lock_mode; } -/* - * This is simply the unlock routine to go with xfs_ilock_map_shared(). - * All it does is call xfs_iunlock() with the given lock_mode. - */ -void -xfs_iunlock_map_shared( - xfs_inode_t *ip, - unsigned int lock_mode) +uint +xfs_ilock_attr_map_shared( + struct xfs_inode *ip) { - xfs_iunlock(ip, lock_mode); + uint lock_mode = XFS_ILOCK_SHARED; + + if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE && + (ip->i_afp->if_flags & XFS_IFEXTENTS) == 0) + lock_mode = XFS_ILOCK_EXCL; + xfs_ilock(ip, lock_mode); + return lock_mode; } /* @@ -588,9 +584,9 @@ xfs_lookup( if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return XFS_ERROR(EIO); - lock_mode = xfs_ilock_map_shared(dp); + lock_mode = xfs_ilock_data_map_shared(dp); error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); - xfs_iunlock_map_shared(dp, lock_mode); + xfs_iunlock(dp, lock_mode); if (error) goto out; @@ -2141,8 +2137,8 @@ xfs_ifree_cluster( { xfs_mount_t *mp = free_ip->i_mount; int blks_per_cluster; + int inodes_per_cluster; int nbufs; - int ninodes; int i, j; xfs_daddr_t blkno; xfs_buf_t *bp; @@ -2152,18 +2148,11 @@ xfs_ifree_cluster( struct xfs_perag *pag; pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); - if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { - blks_per_cluster = 1; - ninodes = mp->m_sb.sb_inopblock; - nbufs = XFS_IALLOC_BLOCKS(mp); - } else { - blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) / - mp->m_sb.sb_blocksize; - ninodes = blks_per_cluster * mp->m_sb.sb_inopblock; - nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster; - } + blks_per_cluster = xfs_icluster_size_fsb(mp); + inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; + nbufs = mp->m_ialloc_blks / blks_per_cluster; - for (j = 0; j < nbufs; j++, inum += ninodes) { + for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) { blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), XFS_INO_TO_AGBNO(mp, inum)); @@ -2225,7 +2214,7 @@ xfs_ifree_cluster( * transaction stale above, which means there is no point in * even trying to lock them. */ - for (i = 0; i < ninodes; i++) { + for (i = 0; i < inodes_per_cluster; i++) { retry: rcu_read_lock(); ip = radix_tree_lookup(&pag->pag_ici_root, @@ -2906,13 +2895,13 @@ xfs_iflush_cluster( pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog; + inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); if (!ilist) goto out_put; - mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); + mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1); first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; rcu_read_lock(); /* really need a gang lookup range call here */ diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 9e6efccb..65e2350 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -337,8 +337,8 @@ int xfs_ilock_nowait(xfs_inode_t *, uint); void xfs_iunlock(xfs_inode_t *, uint); void xfs_ilock_demote(xfs_inode_t *, uint); int xfs_isilocked(xfs_inode_t *, uint); -uint xfs_ilock_map_shared(xfs_inode_t *); -void xfs_iunlock_map_shared(xfs_inode_t *, uint); +uint xfs_ilock_data_map_shared(struct xfs_inode *); +uint xfs_ilock_attr_map_shared(struct xfs_inode *); int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, xfs_nlink_t, xfs_dev_t, prid_t, int, struct xfs_buf **, xfs_inode_t **); diff --git a/fs/xfs/xfs_inode_fork.c b/fs/xfs/xfs_inode_fork.c index cfee14a..73514c0 100644 --- a/fs/xfs/xfs_inode_fork.c +++ b/fs/xfs/xfs_inode_fork.c @@ -431,6 +431,8 @@ xfs_iread_extents( xfs_ifork_t *ifp; xfs_extnum_t nextents; + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW, ip->i_mount); @@ -721,15 +723,16 @@ xfs_idestroy_fork( } /* - * xfs_iextents_copy() + * Convert in-core extents to on-disk form * - * This is called to copy the REAL extents (as opposed to the delayed - * allocation extents) from the inode into the given buffer. It - * returns the number of bytes copied into the buffer. + * For either the data or attr fork in extent format, we need to endian convert + * the in-core extent as we place them into the on-disk inode. * - * If there are no delayed allocation extents, then we can just - * memcpy() the extents into the buffer. Otherwise, we need to - * examine each extent in turn and skip those which are delayed. + * In the case of the data fork, the in-core and on-disk fork sizes can be + * different due to delayed allocation extents. We only copy on-disk extents + * here, so callers must always use the physical fork size to determine the + * size of the buffer passed to this routine. We will return the size actually + * used. */ int xfs_iextents_copy( diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 7c0d391f..686889b 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -30,6 +30,7 @@ #include "xfs_trace.h" #include "xfs_trans_priv.h" #include "xfs_dinode.h" +#include "xfs_log.h" kmem_zone_t *xfs_ili_zone; /* inode log item zone */ @@ -39,27 +40,14 @@ static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip) return container_of(lip, struct xfs_inode_log_item, ili_item); } - -/* - * This returns the number of iovecs needed to log the given inode item. - * - * We need one iovec for the inode log format structure, one for the - * inode core, and possibly one for the inode data/extents/b-tree root - * and one for the inode attribute data/extents/b-tree root. - */ STATIC void -xfs_inode_item_size( - struct xfs_log_item *lip, +xfs_inode_item_data_fork_size( + struct xfs_inode_log_item *iip, int *nvecs, int *nbytes) { - struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; - *nvecs += 2; - *nbytes += sizeof(struct xfs_inode_log_format) + - xfs_icdinode_size(ip->i_d.di_version); - switch (ip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: if ((iip->ili_fields & XFS_ILOG_DEXT) && @@ -70,7 +58,6 @@ xfs_inode_item_size( *nvecs += 1; } break; - case XFS_DINODE_FMT_BTREE: if ((iip->ili_fields & XFS_ILOG_DBROOT) && ip->i_df.if_broot_bytes > 0) { @@ -78,7 +65,6 @@ xfs_inode_item_size( *nvecs += 1; } break; - case XFS_DINODE_FMT_LOCAL: if ((iip->ili_fields & XFS_ILOG_DDATA) && ip->i_df.if_bytes > 0) { @@ -90,19 +76,20 @@ xfs_inode_item_size( case XFS_DINODE_FMT_DEV: case XFS_DINODE_FMT_UUID: break; - default: ASSERT(0); break; } +} - if (!XFS_IFORK_Q(ip)) - return; - +STATIC void +xfs_inode_item_attr_fork_size( + struct xfs_inode_log_item *iip, + int *nvecs, + int *nbytes) +{ + struct xfs_inode *ip = iip->ili_inode; - /* - * Log any necessary attribute data. - */ switch (ip->i_d.di_aformat) { case XFS_DINODE_FMT_EXTENTS: if ((iip->ili_fields & XFS_ILOG_AEXT) && @@ -113,7 +100,6 @@ xfs_inode_item_size( *nvecs += 1; } break; - case XFS_DINODE_FMT_BTREE: if ((iip->ili_fields & XFS_ILOG_ABROOT) && ip->i_afp->if_broot_bytes > 0) { @@ -121,7 +107,6 @@ xfs_inode_item_size( *nvecs += 1; } break; - case XFS_DINODE_FMT_LOCAL: if ((iip->ili_fields & XFS_ILOG_ADATA) && ip->i_afp->if_bytes > 0) { @@ -129,7 +114,6 @@ xfs_inode_item_size( *nvecs += 1; } break; - default: ASSERT(0); break; @@ -137,98 +121,67 @@ xfs_inode_item_size( } /* - * xfs_inode_item_format_extents - convert in-core extents to on-disk form - * - * For either the data or attr fork in extent format, we need to endian convert - * the in-core extent as we place them into the on-disk inode. In this case, we - * need to do this conversion before we write the extents into the log. Because - * we don't have the disk inode to write into here, we allocate a buffer and - * format the extents into it via xfs_iextents_copy(). We free the buffer in - * the unlock routine after the copy for the log has been made. + * This returns the number of iovecs needed to log the given inode item. * - * In the case of the data fork, the in-core and on-disk fork sizes can be - * different due to delayed allocation extents. We only log on-disk extents - * here, so always use the physical fork size to determine the size of the - * buffer we need to allocate. + * We need one iovec for the inode log format structure, one for the + * inode core, and possibly one for the inode data/extents/b-tree root + * and one for the inode attribute data/extents/b-tree root. */ STATIC void -xfs_inode_item_format_extents( - struct xfs_inode *ip, - struct xfs_log_iovec *vecp, - int whichfork, - int type) +xfs_inode_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) { - xfs_bmbt_rec_t *ext_buffer; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; - ext_buffer = kmem_alloc(XFS_IFORK_SIZE(ip, whichfork), KM_SLEEP); - if (whichfork == XFS_DATA_FORK) - ip->i_itemp->ili_extents_buf = ext_buffer; - else - ip->i_itemp->ili_aextents_buf = ext_buffer; + *nvecs += 2; + *nbytes += sizeof(struct xfs_inode_log_format) + + xfs_icdinode_size(ip->i_d.di_version); - vecp->i_addr = ext_buffer; - vecp->i_len = xfs_iextents_copy(ip, ext_buffer, whichfork); - vecp->i_type = type; + xfs_inode_item_data_fork_size(iip, nvecs, nbytes); + if (XFS_IFORK_Q(ip)) + xfs_inode_item_attr_fork_size(iip, nvecs, nbytes); } /* - * This is called to fill in the vector of log iovecs for the - * given inode log item. It fills the first item with an inode - * log format structure, the second with the on-disk inode structure, - * and a possible third and/or fourth with the inode data/extents/b-tree - * root and inode attributes data/extents/b-tree root. + * If this is a v1 format inode, then we need to log it as such. This means + * that we have to copy the link count from the new field to the old. We + * don't have to worry about the new fields, because nothing trusts them as + * long as the old inode version number is there. */ STATIC void -xfs_inode_item_format( - struct xfs_log_item *lip, - struct xfs_log_iovec *vecp) +xfs_inode_item_format_v1_inode( + struct xfs_inode *ip) +{ + if (!xfs_sb_version_hasnlink(&ip->i_mount->m_sb)) { + /* + * Convert it back. + */ + ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); + ip->i_d.di_onlink = ip->i_d.di_nlink; + } else { + /* + * The superblock version has already been bumped, + * so just make the conversion to the new inode + * format permanent. + */ + ip->i_d.di_version = 2; + ip->i_d.di_onlink = 0; + memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); + } +} + +STATIC void +xfs_inode_item_format_data_fork( + struct xfs_inode_log_item *iip, + struct xfs_inode_log_format *ilf, + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp) { - struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; - uint nvecs; size_t data_bytes; - xfs_mount_t *mp; - - vecp->i_addr = &iip->ili_format; - vecp->i_len = sizeof(xfs_inode_log_format_t); - vecp->i_type = XLOG_REG_TYPE_IFORMAT; - vecp++; - nvecs = 1; - - vecp->i_addr = &ip->i_d; - vecp->i_len = xfs_icdinode_size(ip->i_d.di_version); - vecp->i_type = XLOG_REG_TYPE_ICORE; - vecp++; - nvecs++; - - /* - * If this is really an old format inode, then we need to - * log it as such. This means that we have to copy the link - * count from the new field to the old. We don't have to worry - * about the new fields, because nothing trusts them as long as - * the old inode version number is there. If the superblock already - * has a new version number, then we don't bother converting back. - */ - mp = ip->i_mount; - ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb)); - if (ip->i_d.di_version == 1) { - if (!xfs_sb_version_hasnlink(&mp->m_sb)) { - /* - * Convert it back. - */ - ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); - ip->i_d.di_onlink = ip->i_d.di_nlink; - } else { - /* - * The superblock version has already been bumped, - * so just make the conversion to the new inode - * format permanent. - */ - ip->i_d.di_version = 2; - ip->i_d.di_onlink = 0; - memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); - } - } switch (ip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: @@ -239,36 +192,23 @@ xfs_inode_item_format( if ((iip->ili_fields & XFS_ILOG_DEXT) && ip->i_d.di_nextents > 0 && ip->i_df.if_bytes > 0) { + struct xfs_bmbt_rec *p; + ASSERT(ip->i_df.if_u1.if_extents != NULL); ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0); - ASSERT(iip->ili_extents_buf == NULL); - -#ifdef XFS_NATIVE_HOST - if (ip->i_d.di_nextents == ip->i_df.if_bytes / - (uint)sizeof(xfs_bmbt_rec_t)) { - /* - * There are no delayed allocation - * extents, so just point to the - * real extents array. - */ - vecp->i_addr = ip->i_df.if_u1.if_extents; - vecp->i_len = ip->i_df.if_bytes; - vecp->i_type = XLOG_REG_TYPE_IEXT; - } else -#endif - { - xfs_inode_item_format_extents(ip, vecp, - XFS_DATA_FORK, XLOG_REG_TYPE_IEXT); - } - ASSERT(vecp->i_len <= ip->i_df.if_bytes); - iip->ili_format.ilf_dsize = vecp->i_len; - vecp++; - nvecs++; + + p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IEXT); + data_bytes = xfs_iextents_copy(ip, p, XFS_DATA_FORK); + xlog_finish_iovec(lv, *vecp, data_bytes); + + ASSERT(data_bytes <= ip->i_df.if_bytes); + + ilf->ilf_dsize = data_bytes; + ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_DEXT; } break; - case XFS_DINODE_FMT_BTREE: iip->ili_fields &= ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | @@ -277,80 +217,70 @@ xfs_inode_item_format( if ((iip->ili_fields & XFS_ILOG_DBROOT) && ip->i_df.if_broot_bytes > 0) { ASSERT(ip->i_df.if_broot != NULL); - vecp->i_addr = ip->i_df.if_broot; - vecp->i_len = ip->i_df.if_broot_bytes; - vecp->i_type = XLOG_REG_TYPE_IBROOT; - vecp++; - nvecs++; - iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes; + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IBROOT, + ip->i_df.if_broot, + ip->i_df.if_broot_bytes); + ilf->ilf_dsize = ip->i_df.if_broot_bytes; + ilf->ilf_size++; } else { ASSERT(!(iip->ili_fields & XFS_ILOG_DBROOT)); iip->ili_fields &= ~XFS_ILOG_DBROOT; } break; - case XFS_DINODE_FMT_LOCAL: iip->ili_fields &= ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | XFS_ILOG_DEV | XFS_ILOG_UUID); if ((iip->ili_fields & XFS_ILOG_DDATA) && ip->i_df.if_bytes > 0) { - ASSERT(ip->i_df.if_u1.if_data != NULL); - ASSERT(ip->i_d.di_size > 0); - - vecp->i_addr = ip->i_df.if_u1.if_data; /* * Round i_bytes up to a word boundary. * The underlying memory is guaranteed to * to be there by xfs_idata_realloc(). */ data_bytes = roundup(ip->i_df.if_bytes, 4); - ASSERT((ip->i_df.if_real_bytes == 0) || - (ip->i_df.if_real_bytes == data_bytes)); - vecp->i_len = (int)data_bytes; - vecp->i_type = XLOG_REG_TYPE_ILOCAL; - vecp++; - nvecs++; - iip->ili_format.ilf_dsize = (unsigned)data_bytes; + ASSERT(ip->i_df.if_real_bytes == 0 || + ip->i_df.if_real_bytes == data_bytes); + ASSERT(ip->i_df.if_u1.if_data != NULL); + ASSERT(ip->i_d.di_size > 0); + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL, + ip->i_df.if_u1.if_data, data_bytes); + ilf->ilf_dsize = (unsigned)data_bytes; + ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_DDATA; } break; - case XFS_DINODE_FMT_DEV: iip->ili_fields &= ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEXT | XFS_ILOG_UUID); - if (iip->ili_fields & XFS_ILOG_DEV) { - iip->ili_format.ilf_u.ilfu_rdev = - ip->i_df.if_u2.if_rdev; - } + if (iip->ili_fields & XFS_ILOG_DEV) + ilf->ilf_u.ilfu_rdev = ip->i_df.if_u2.if_rdev; break; - case XFS_DINODE_FMT_UUID: iip->ili_fields &= ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEXT | XFS_ILOG_DEV); - if (iip->ili_fields & XFS_ILOG_UUID) { - iip->ili_format.ilf_u.ilfu_uuid = - ip->i_df.if_u2.if_uuid; - } + if (iip->ili_fields & XFS_ILOG_UUID) + ilf->ilf_u.ilfu_uuid = ip->i_df.if_u2.if_uuid; break; - default: ASSERT(0); break; } +} - /* - * If there are no attributes associated with the file, then we're done. - */ - if (!XFS_IFORK_Q(ip)) { - iip->ili_fields &= - ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT); - goto out; - } +STATIC void +xfs_inode_item_format_attr_fork( + struct xfs_inode_log_item *iip, + struct xfs_inode_log_format *ilf, + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp) +{ + struct xfs_inode *ip = iip->ili_inode; + size_t data_bytes; switch (ip->i_d.di_aformat) { case XFS_DINODE_FMT_EXTENTS: @@ -360,30 +290,22 @@ xfs_inode_item_format( if ((iip->ili_fields & XFS_ILOG_AEXT) && ip->i_d.di_anextents > 0 && ip->i_afp->if_bytes > 0) { + struct xfs_bmbt_rec *p; + ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) == ip->i_d.di_anextents); ASSERT(ip->i_afp->if_u1.if_extents != NULL); -#ifdef XFS_NATIVE_HOST - /* - * There are not delayed allocation extents - * for attributes, so just point at the array. - */ - vecp->i_addr = ip->i_afp->if_u1.if_extents; - vecp->i_len = ip->i_afp->if_bytes; - vecp->i_type = XLOG_REG_TYPE_IATTR_EXT; -#else - ASSERT(iip->ili_aextents_buf == NULL); - xfs_inode_item_format_extents(ip, vecp, - XFS_ATTR_FORK, XLOG_REG_TYPE_IATTR_EXT); -#endif - iip->ili_format.ilf_asize = vecp->i_len; - vecp++; - nvecs++; + + p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT); + data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK); + xlog_finish_iovec(lv, *vecp, data_bytes); + + ilf->ilf_asize = data_bytes; + ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_AEXT; } break; - case XFS_DINODE_FMT_BTREE: iip->ili_fields &= ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT); @@ -392,61 +314,89 @@ xfs_inode_item_format( ip->i_afp->if_broot_bytes > 0) { ASSERT(ip->i_afp->if_broot != NULL); - vecp->i_addr = ip->i_afp->if_broot; - vecp->i_len = ip->i_afp->if_broot_bytes; - vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT; - vecp++; - nvecs++; - iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes; + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_BROOT, + ip->i_afp->if_broot, + ip->i_afp->if_broot_bytes); + ilf->ilf_asize = ip->i_afp->if_broot_bytes; + ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_ABROOT; } break; - case XFS_DINODE_FMT_LOCAL: iip->ili_fields &= ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT); if ((iip->ili_fields & XFS_ILOG_ADATA) && ip->i_afp->if_bytes > 0) { - ASSERT(ip->i_afp->if_u1.if_data != NULL); - - vecp->i_addr = ip->i_afp->if_u1.if_data; /* * Round i_bytes up to a word boundary. * The underlying memory is guaranteed to * to be there by xfs_idata_realloc(). */ data_bytes = roundup(ip->i_afp->if_bytes, 4); - ASSERT((ip->i_afp->if_real_bytes == 0) || - (ip->i_afp->if_real_bytes == data_bytes)); - vecp->i_len = (int)data_bytes; - vecp->i_type = XLOG_REG_TYPE_IATTR_LOCAL; - vecp++; - nvecs++; - iip->ili_format.ilf_asize = (unsigned)data_bytes; + ASSERT(ip->i_afp->if_real_bytes == 0 || + ip->i_afp->if_real_bytes == data_bytes); + ASSERT(ip->i_afp->if_u1.if_data != NULL); + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL, + ip->i_afp->if_u1.if_data, + data_bytes); + ilf->ilf_asize = (unsigned)data_bytes; + ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_ADATA; } break; - default: ASSERT(0); break; } - -out: - /* - * Now update the log format that goes out to disk from the in-core - * values. We always write the inode core to make the arithmetic - * games in recovery easier, which isn't a big deal as just about any - * transaction would dirty it anyway. - */ - iip->ili_format.ilf_fields = XFS_ILOG_CORE | - (iip->ili_fields & ~XFS_ILOG_TIMESTAMP); - iip->ili_format.ilf_size = nvecs; } +/* + * This is called to fill in the vector of log iovecs for the given inode + * log item. It fills the first item with an inode log format structure, + * the second with the on-disk inode structure, and a possible third and/or + * fourth with the inode data/extents/b-tree root and inode attributes + * data/extents/b-tree root. + */ +STATIC void +xfs_inode_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + struct xfs_inode_log_format *ilf; + struct xfs_log_iovec *vecp = NULL; + + ilf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_IFORMAT); + ilf->ilf_type = XFS_LI_INODE; + ilf->ilf_ino = ip->i_ino; + ilf->ilf_blkno = ip->i_imap.im_blkno; + ilf->ilf_len = ip->i_imap.im_len; + ilf->ilf_boffset = ip->i_imap.im_boffset; + ilf->ilf_fields = XFS_ILOG_CORE; + ilf->ilf_size = 2; /* format + core */ + xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format)); + + if (ip->i_d.di_version == 1) + xfs_inode_item_format_v1_inode(ip); + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICORE, + &ip->i_d, + xfs_icdinode_size(ip->i_d.di_version)); + + xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp); + if (XFS_IFORK_Q(ip)) { + xfs_inode_item_format_attr_fork(iip, ilf, lv, &vecp); + } else { + iip->ili_fields &= + ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT); + } + + /* update the format with the exact fields we actually logged */ + ilf->ilf_fields |= (iip->ili_fields & ~XFS_ILOG_TIMESTAMP); +} /* * This is called to pin the inode associated with the inode log @@ -563,27 +513,6 @@ xfs_inode_item_unlock( ASSERT(ip->i_itemp != NULL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - /* - * If the inode needed a separate buffer with which to log - * its extents, then free it now. - */ - if (iip->ili_extents_buf != NULL) { - ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS); - ASSERT(ip->i_d.di_nextents > 0); - ASSERT(iip->ili_fields & XFS_ILOG_DEXT); - ASSERT(ip->i_df.if_bytes > 0); - kmem_free(iip->ili_extents_buf); - iip->ili_extents_buf = NULL; - } - if (iip->ili_aextents_buf != NULL) { - ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS); - ASSERT(ip->i_d.di_anextents > 0); - ASSERT(iip->ili_fields & XFS_ILOG_AEXT); - ASSERT(ip->i_afp->if_bytes > 0); - kmem_free(iip->ili_aextents_buf); - iip->ili_aextents_buf = NULL; - } - lock_flags = iip->ili_lock_flags; iip->ili_lock_flags = 0; if (lock_flags) @@ -670,11 +599,6 @@ xfs_inode_item_init( iip->ili_inode = ip; xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE, &xfs_inode_item_ops); - iip->ili_format.ilf_type = XFS_LI_INODE; - iip->ili_format.ilf_ino = ip->i_ino; - iip->ili_format.ilf_blkno = ip->i_imap.im_blkno; - iip->ili_format.ilf_len = ip->i_imap.im_len; - iip->ili_format.ilf_boffset = ip->i_imap.im_boffset; } /* diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index dce4d65..488d812 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -34,11 +34,6 @@ typedef struct xfs_inode_log_item { unsigned short ili_logged; /* flushed logged data */ unsigned int ili_last_fields; /* fields when flushed */ unsigned int ili_fields; /* fields to be logged */ - struct xfs_bmbt_rec *ili_extents_buf; /* array of logged - data exts */ - struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged - attr exts */ - xfs_inode_log_format_t ili_format; /* logged structure */ } xfs_inode_log_item_t; static inline int xfs_inode_clean(xfs_inode_t *ip) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 4d61340..bcfe612 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -112,15 +112,11 @@ xfs_find_handle( memset(&handle.ha_fid, 0, sizeof(handle.ha_fid)); hsize = sizeof(xfs_fsid_t); } else { - int lock_mode; - - lock_mode = xfs_ilock_map_shared(ip); handle.ha_fid.fid_len = sizeof(xfs_fid_t) - sizeof(handle.ha_fid.fid_len); handle.ha_fid.fid_pad = 0; handle.ha_fid.fid_gen = ip->i_d.di_gen; handle.ha_fid.fid_ino = ip->i_ino; - xfs_iunlock_map_shared(ip, lock_mode); hsize = XFS_HSIZE(handle); } @@ -442,7 +438,8 @@ xfs_attrlist_by_handle( return -XFS_ERROR(EPERM); if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t))) return -XFS_ERROR(EFAULT); - if (al_hreq.buflen > XATTR_LIST_MAX) + if (al_hreq.buflen < sizeof(struct attrlist) || + al_hreq.buflen > XATTR_LIST_MAX) return -XFS_ERROR(EINVAL); /* @@ -1586,7 +1583,7 @@ xfs_file_ioctl( XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; - da.d_mem = da.d_miniosz = 1 << target->bt_sshift; + da.d_mem = da.d_miniosz = target->bt_logical_sectorsize; da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); if (copy_to_user(arg, &da, sizeof(da))) diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index e8fb123..a7992f8 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -356,7 +356,8 @@ xfs_compat_attrlist_by_handle( if (copy_from_user(&al_hreq, arg, sizeof(compat_xfs_fsop_attrlist_handlereq_t))) return -XFS_ERROR(EFAULT); - if (al_hreq.buflen > XATTR_LIST_MAX) + if (al_hreq.buflen < sizeof(struct attrlist) || + al_hreq.buflen > XATTR_LIST_MAX) return -XFS_ERROR(EINVAL); /* diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 27e0e54..f35d5c9 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -123,7 +123,7 @@ xfs_vn_mknod( { struct inode *inode; struct xfs_inode *ip = NULL; - struct posix_acl *default_acl = NULL; + struct posix_acl *default_acl, *acl; struct xfs_name name; int error; @@ -139,14 +139,9 @@ xfs_vn_mknod( rdev = 0; } - if (IS_POSIXACL(dir)) { - default_acl = xfs_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(default_acl)) - return PTR_ERR(default_acl); - - if (!default_acl) - mode &= ~current_umask(); - } + error = posix_acl_create(dir, &mode, &default_acl, &acl); + if (error) + return error; xfs_dentry_to_name(&name, dentry, mode); error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); @@ -159,22 +154,30 @@ xfs_vn_mknod( if (unlikely(error)) goto out_cleanup_inode; +#ifdef CONFIG_XFS_POSIX_ACL if (default_acl) { - error = -xfs_inherit_acl(inode, default_acl); - default_acl = NULL; - if (unlikely(error)) + error = xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + if (error) goto out_cleanup_inode; } - + if (acl) { + error = xfs_set_acl(inode, acl, ACL_TYPE_ACCESS); + if (error) + goto out_cleanup_inode; + } +#endif d_instantiate(dentry, inode); + out_free_acl: + if (default_acl) + posix_acl_release(default_acl); + if (acl) + posix_acl_release(acl); return -error; out_cleanup_inode: xfs_cleanup_inode(dir, inode, dentry); - out_free_acl: - posix_acl_release(default_acl); - return -error; + goto out_free_acl; } STATIC int @@ -391,18 +394,6 @@ xfs_vn_follow_link( return NULL; } -STATIC void -xfs_vn_put_link( - struct dentry *dentry, - struct nameidata *nd, - void *p) -{ - char *s = nd_get_link(nd); - - if (!IS_ERR(s)) - kfree(s); -} - STATIC int xfs_vn_getattr( struct vfsmount *mnt, @@ -459,14 +450,12 @@ xfs_vn_getattr( static void xfs_setattr_mode( - struct xfs_trans *tp, struct xfs_inode *ip, struct iattr *iattr) { - struct inode *inode = VFS_I(ip); - umode_t mode = iattr->ia_mode; + struct inode *inode = VFS_I(ip); + umode_t mode = iattr->ia_mode; - ASSERT(tp); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ip->i_d.di_mode &= S_IFMT; @@ -476,6 +465,32 @@ xfs_setattr_mode( inode->i_mode |= mode & ~S_IFMT; } +static void +xfs_setattr_time( + struct xfs_inode *ip, + struct iattr *iattr) +{ + struct inode *inode = VFS_I(ip); + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (iattr->ia_valid & ATTR_ATIME) { + inode->i_atime = iattr->ia_atime; + ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; + ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; + } + if (iattr->ia_valid & ATTR_CTIME) { + inode->i_ctime = iattr->ia_ctime; + ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; + ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; + } + if (iattr->ia_valid & ATTR_MTIME) { + inode->i_mtime = iattr->ia_mtime; + ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; + ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; + } +} + int xfs_setattr_nonsize( struct xfs_inode *ip, @@ -618,7 +633,8 @@ xfs_setattr_nonsize( } if (!gid_eq(igid, gid)) { if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) { - ASSERT(!XFS_IS_PQUOTA_ON(mp)); + ASSERT(xfs_sb_version_has_pquotino(&mp->m_sb) || + !XFS_IS_PQUOTA_ON(mp)); ASSERT(mask & ATTR_GID); ASSERT(gdqp); olddquot2 = xfs_qm_vop_chown(tp, ip, @@ -629,30 +645,10 @@ xfs_setattr_nonsize( } } - /* - * Change file access modes. - */ if (mask & ATTR_MODE) - xfs_setattr_mode(tp, ip, iattr); - - /* - * Change file access or modified times. - */ - if (mask & ATTR_ATIME) { - inode->i_atime = iattr->ia_atime; - ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; - ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; - } - if (mask & ATTR_CTIME) { - inode->i_ctime = iattr->ia_ctime; - ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; - ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; - } - if (mask & ATTR_MTIME) { - inode->i_mtime = iattr->ia_mtime; - ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; - ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; - } + xfs_setattr_mode(ip, iattr); + if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) + xfs_setattr_time(ip, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -683,7 +679,7 @@ xfs_setattr_nonsize( * Posix ACL code seems to care about this issue either. */ if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) { - error = -xfs_acl_chmod(inode); + error = -posix_acl_chmod(inode, inode->i_mode); if (error) return XFS_ERROR(error); } @@ -867,22 +863,10 @@ xfs_setattr_size( xfs_inode_clear_eofblocks_tag(ip); } - /* - * Change file access modes. - */ if (mask & ATTR_MODE) - xfs_setattr_mode(tp, ip, iattr); - - if (mask & ATTR_CTIME) { - inode->i_ctime = iattr->ia_ctime; - ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; - ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; - } - if (mask & ATTR_MTIME) { - inode->i_mtime = iattr->ia_mtime; - ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; - ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; - } + xfs_setattr_mode(ip, iattr); + if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) + xfs_setattr_time(ip, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -1052,6 +1036,7 @@ xfs_vn_fiemap( static const struct inode_operations xfs_inode_operations = { .get_acl = xfs_get_acl, + .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -1079,6 +1064,7 @@ static const struct inode_operations xfs_dir_inode_operations = { .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, .get_acl = xfs_get_acl, + .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -1105,6 +1091,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, .get_acl = xfs_get_acl, + .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -1117,8 +1104,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { static const struct inode_operations xfs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = xfs_vn_follow_link, - .put_link = xfs_vn_put_link, - .get_acl = xfs_get_acl, + .put_link = kfree_put_link, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index d2c5057..1c34e43 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -30,7 +30,7 @@ extern void xfs_setup_inode(struct xfs_inode *); /* * Internal setattr interfaces. */ -#define XFS_ATTR_NOACL 0x01 /* Don't call xfs_acl_chmod */ +#define XFS_ATTR_NOACL 0x01 /* Don't call posix_acl_chmod */ extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, int flags); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index c237ad1..f463382 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -209,9 +209,8 @@ xfs_bulkstat( xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */ xfs_inobt_rec_incore_t *irbufend; /* end of good irec buffer entries */ xfs_ino_t lastino; /* last inode number returned */ - int nbcluster; /* # of blocks in a cluster */ - int nicluster; /* # of inodes in a cluster */ - int nimask; /* mask for inode clusters */ + int blks_per_cluster; /* # of blocks per cluster */ + int inodes_per_cluster;/* # of inodes per cluster */ int nirbuf; /* size of irbuf */ int rval; /* return value error code */ int tmp; /* result value from btree calls */ @@ -243,11 +242,8 @@ xfs_bulkstat( *done = 0; fmterror = 0; ubufp = ubuffer; - nicluster = mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp) ? - mp->m_sb.sb_inopblock : - (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog); - nimask = ~(nicluster - 1); - nbcluster = nicluster >> mp->m_sb.sb_inopblog; + blks_per_cluster = xfs_icluster_size_fsb(mp); + inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4); if (!irbuf) return ENOMEM; @@ -390,12 +386,12 @@ xfs_bulkstat( agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino); for (chunkidx = 0; chunkidx < XFS_INODES_PER_CHUNK; - chunkidx += nicluster, - agbno += nbcluster) { - if (xfs_inobt_maskn(chunkidx, nicluster) - & ~r.ir_free) + chunkidx += inodes_per_cluster, + agbno += blks_per_cluster) { + if (xfs_inobt_maskn(chunkidx, + inodes_per_cluster) & ~r.ir_free) xfs_btree_reada_bufs(mp, agno, - agbno, nbcluster, + agbno, blks_per_cluster, &xfs_inode_buf_ops); } blk_finish_plug(&plug); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index e148719..b0f4ef7 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -30,6 +30,52 @@ struct xfs_log_vec { #define XFS_LOG_VEC_ORDERED (-1) +static inline void * +xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, + uint type) +{ + struct xfs_log_iovec *vec = *vecp; + + if (vec) { + ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs); + vec++; + } else { + vec = &lv->lv_iovecp[0]; + } + + vec->i_type = type; + vec->i_addr = lv->lv_buf + lv->lv_buf_len; + + ASSERT(IS_ALIGNED((unsigned long)vec->i_addr, sizeof(uint64_t))); + + *vecp = vec; + return vec->i_addr; +} + +static inline void +xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, int len) +{ + /* + * We need to make sure the next buffer is naturally aligned for the + * biggest basic data type we put into it. We already accounted for + * this when sizing the buffer. + */ + lv->lv_buf_len += round_up(len, sizeof(uint64_t)); + vec->i_len = len; +} + +static inline void * +xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, + uint type, void *data, int len) +{ + void *buf; + + buf = xlog_prepare_iovec(lv, vecp, type); + memcpy(buf, data, len); + xlog_finish_iovec(lv, *vecp, len); + return buf; +} + /* * Structure used to pass callback function and the function's argument * to the log manager. diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 5eb51fc..cdebd83 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -82,36 +82,6 @@ xlog_cil_init_post_recovery( log->l_curr_block); } -STATIC int -xlog_cil_lv_item_format( - struct xfs_log_item *lip, - struct xfs_log_vec *lv) -{ - int index; - char *ptr; - - /* format new vectors into array */ - lip->li_ops->iop_format(lip, lv->lv_iovecp); - - /* copy data into existing array */ - ptr = lv->lv_buf; - for (index = 0; index < lv->lv_niovecs; index++) { - struct xfs_log_iovec *vec = &lv->lv_iovecp[index]; - - memcpy(ptr, vec->i_addr, vec->i_len); - vec->i_addr = ptr; - ptr += vec->i_len; - } - - /* - * some size calculations for log vectors over-estimate, so the caller - * doesn't know the amount of space actually used by the item. Return - * the byte count to the caller so they can check and store it - * appropriately. - */ - return ptr - lv->lv_buf; -} - /* * Prepare the log item for insertion into the CIL. Calculate the difference in * log space and vectors it will consume, and if it is a new item pin it as @@ -232,6 +202,13 @@ xlog_cil_insert_format_items( nbytes = 0; } + /* + * We 64-bit align the length of each iovec so that the start + * of the next one is naturally aligned. We'll need to + * account for that slack space here. + */ + nbytes += niovecs * sizeof(uint64_t); + /* grab the old item if it exists for reservation accounting */ old_lv = lip->li_lv; @@ -254,34 +231,27 @@ xlog_cil_insert_format_items( */ *diff_iovecs -= lv->lv_niovecs; *diff_len -= lv->lv_buf_len; - - /* Ensure the lv is set up according to ->iop_size */ - lv->lv_niovecs = niovecs; - lv->lv_buf = (char *)lv + buf_size - nbytes; - - lv->lv_buf_len = xlog_cil_lv_item_format(lip, lv); - goto insert; + } else { + /* allocate new data chunk */ + lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS); + lv->lv_item = lip; + lv->lv_size = buf_size; + if (ordered) { + /* track as an ordered logvec */ + ASSERT(lip->li_lv == NULL); + lv->lv_buf_len = XFS_LOG_VEC_ORDERED; + goto insert; + } + lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1]; } - /* allocate new data chunk */ - lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS); - lv->lv_item = lip; - lv->lv_size = buf_size; + /* Ensure the lv is set up according to ->iop_size */ lv->lv_niovecs = niovecs; - if (ordered) { - /* track as an ordered logvec */ - ASSERT(lip->li_lv == NULL); - lv->lv_buf_len = XFS_LOG_VEC_ORDERED; - goto insert; - } - - /* The allocated iovec region lies beyond the log vector. */ - lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1]; /* The allocated data region lies beyond the iovec region */ + lv->lv_buf_len = 0; lv->lv_buf = (char *)lv + buf_size - nbytes; - - lv->lv_buf_len = xlog_cil_lv_item_format(lip, lv); + lip->li_ops->iop_format(lip, lv); insert: ASSERT(lv->lv_buf_len <= nbytes); xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index b6b669d..bce53ac 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -193,7 +193,10 @@ xlog_bread_noalign( bp->b_io_length = nbblks; bp->b_error = 0; - xfsbdstrat(log->l_mp, bp); + if (XFS_FORCED_SHUTDOWN(log->l_mp)) + return XFS_ERROR(EIO); + + xfs_buf_iorequest(bp); error = xfs_buf_iowait(bp); if (error) xfs_buf_ioerror_alert(bp, __func__); @@ -1651,6 +1654,7 @@ xlog_recover_reorder_trans( int pass) { xlog_recover_item_t *item, *n; + int error = 0; LIST_HEAD(sort_list); LIST_HEAD(cancel_list); LIST_HEAD(buffer_list); @@ -1692,9 +1696,17 @@ xlog_recover_reorder_trans( "%s: unrecognized type of log operation", __func__); ASSERT(0); - return XFS_ERROR(EIO); + /* + * return the remaining items back to the transaction + * item list so they can be freed in caller. + */ + if (!list_empty(&sort_list)) + list_splice_init(&sort_list, &trans->r_itemq); + error = XFS_ERROR(EIO); + goto out; } } +out: ASSERT(list_empty(&sort_list)); if (!list_empty(&buffer_list)) list_splice(&buffer_list, &trans->r_itemq); @@ -1704,7 +1716,7 @@ xlog_recover_reorder_trans( list_splice_tail(&inode_buffer_list, &trans->r_itemq); if (!list_empty(&cancel_list)) list_splice_tail(&cancel_list, &trans->r_itemq); - return 0; + return error; } /* @@ -2514,19 +2526,19 @@ xlog_recover_buffer_pass2( * * Also make sure that only inode buffers with good sizes stay in * the buffer cache. The kernel moves inodes in buffers of 1 block - * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger. The inode + * or mp->m_inode_cluster_size bytes, whichever is bigger. The inode * buffers in the log can be a different size if the log was generated * by an older kernel using unclustered inode buffers or a newer kernel * running with a different inode cluster size. Regardless, if the - * the inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE) - * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep + * the inode buffer size isn't MAX(blocksize, mp->m_inode_cluster_size) + * for *our* value of mp->m_inode_cluster_size, then we need to keep * the buffer out of the buffer cache so that the buffer won't * overlap with future reads of those inodes. */ if (XFS_DINODE_MAGIC == be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize, - (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { + (__uint32_t)log->l_mp->m_inode_cluster_size))) { xfs_buf_stale(bp); error = xfs_bwrite(bp); } else { @@ -3199,10 +3211,10 @@ xlog_recover_do_icreate_pass2( } /* existing allocation is fixed value */ - ASSERT(count == XFS_IALLOC_INODES(mp)); - ASSERT(length == XFS_IALLOC_BLOCKS(mp)); - if (count != XFS_IALLOC_INODES(mp) || - length != XFS_IALLOC_BLOCKS(mp)) { + ASSERT(count == mp->m_ialloc_inos); + ASSERT(length == mp->m_ialloc_blks); + if (count != mp->m_ialloc_inos || + length != mp->m_ialloc_blks) { xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2"); return EINVAL; } @@ -3608,8 +3620,10 @@ xlog_recover_process_data( error = XFS_ERROR(EIO); break; } - if (error) + if (error) { + xlog_recover_free_trans(trans); return error; + } } dp += be32_to_cpu(ohead->oh_len); num_logops--; @@ -4397,7 +4411,13 @@ xlog_do_recover( XFS_BUF_READ(bp); XFS_BUF_UNASYNC(bp); bp->b_ops = &xfs_sb_buf_ops; - xfsbdstrat(log->l_mp, bp); + + if (XFS_FORCED_SHUTDOWN(log->l_mp)) { + xfs_buf_relse(bp); + return XFS_ERROR(EIO); + } + + xfs_buf_iorequest(bp); error = xfs_buf_iowait(bp); if (error) { xfs_buf_ioerror_alert(bp, __func__); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 14a4996..348e4d2 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -134,8 +134,6 @@ xfs_qm_dqpurge( { struct xfs_mount *mp = dqp->q_mount; struct xfs_quotainfo *qi = mp->m_quotainfo; - struct xfs_dquot *gdqp = NULL; - struct xfs_dquot *pdqp = NULL; xfs_dqlock(dqp); if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) { @@ -143,21 +141,6 @@ xfs_qm_dqpurge( return EAGAIN; } - /* - * If this quota has a hint attached, prepare for releasing it now. - */ - gdqp = dqp->q_gdquot; - if (gdqp) { - xfs_dqlock(gdqp); - dqp->q_gdquot = NULL; - } - - pdqp = dqp->q_pdquot; - if (pdqp) { - xfs_dqlock(pdqp); - dqp->q_pdquot = NULL; - } - dqp->dq_flags |= XFS_DQ_FREEING; xfs_dqflock(dqp); @@ -206,11 +189,47 @@ xfs_qm_dqpurge( XFS_STATS_DEC(xs_qm_dquot_unused); xfs_qm_dqdestroy(dqp); + return 0; +} + +/* + * Release the group or project dquot pointers the user dquots maybe carrying + * around as a hint, and proceed to purge the user dquot cache if requested. +*/ +STATIC int +xfs_qm_dqpurge_hints( + struct xfs_dquot *dqp, + void *data) +{ + struct xfs_dquot *gdqp = NULL; + struct xfs_dquot *pdqp = NULL; + uint flags = *((uint *)data); + + xfs_dqlock(dqp); + if (dqp->dq_flags & XFS_DQ_FREEING) { + xfs_dqunlock(dqp); + return EAGAIN; + } + /* If this quota has a hint attached, prepare for releasing it now */ + gdqp = dqp->q_gdquot; if (gdqp) - xfs_qm_dqput(gdqp); + dqp->q_gdquot = NULL; + + pdqp = dqp->q_pdquot; if (pdqp) - xfs_qm_dqput(pdqp); + dqp->q_pdquot = NULL; + + xfs_dqunlock(dqp); + + if (gdqp) + xfs_qm_dqrele(gdqp); + if (pdqp) + xfs_qm_dqrele(pdqp); + + if (flags & XFS_QMOPT_UQUOTA) + return xfs_qm_dqpurge(dqp, NULL); + return 0; } @@ -222,8 +241,18 @@ xfs_qm_dqpurge_all( struct xfs_mount *mp, uint flags) { - if (flags & XFS_QMOPT_UQUOTA) - xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL); + /* + * We have to release group/project dquot hint(s) from the user dquot + * at first if they are there, otherwise we would run into an infinite + * loop while walking through radix tree to purge other type of dquots + * since their refcount is not zero if the user dquot refers to them + * as hint. + * + * Call the special xfs_qm_dqpurge_hints() will end up go through the + * general xfs_qm_dqpurge() against user dquot cache if requested. + */ + xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge_hints, &flags); + if (flags & XFS_QMOPT_GQUOTA) xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL); if (flags & XFS_QMOPT_PQUOTA) @@ -1193,16 +1222,18 @@ xfs_qm_dqiterate( lblkno = 0; maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); do { + uint lock_mode; + nmaps = XFS_DQITER_MAP_SIZE; /* * We aren't changing the inode itself. Just changing * some of its data. No new blocks are added here, and * the inode is never added to the transaction. */ - xfs_ilock(qip, XFS_ILOCK_SHARED); + lock_mode = xfs_ilock_data_map_shared(qip); error = xfs_bmapi_read(qip, lblkno, maxlblkcnt - lblkno, map, &nmaps, 0); - xfs_iunlock(qip, XFS_ILOCK_SHARED); + xfs_iunlock(qip, lock_mode); if (error) break; @@ -2082,24 +2113,21 @@ xfs_qm_vop_create_dqattach( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(XFS_IS_QUOTA_RUNNING(mp)); - if (udqp) { + if (udqp && XFS_IS_UQUOTA_ON(mp)) { ASSERT(ip->i_udquot == NULL); - ASSERT(XFS_IS_UQUOTA_ON(mp)); ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id)); ip->i_udquot = xfs_qm_dqhold(udqp); xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); } - if (gdqp) { + if (gdqp && XFS_IS_GQUOTA_ON(mp)) { ASSERT(ip->i_gdquot == NULL); - ASSERT(XFS_IS_GQUOTA_ON(mp)); ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id)); ip->i_gdquot = xfs_qm_dqhold(gdqp); xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); } - if (pdqp) { + if (pdqp && XFS_IS_PQUOTA_ON(mp)) { ASSERT(ip->i_pdquot == NULL); - ASSERT(XFS_IS_PQUOTA_ON(mp)); ASSERT(xfs_get_projid(ip) == be32_to_cpu(pdqp->q_core.d_id)); ip->i_pdquot = xfs_qm_dqhold(pdqp); diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index a788b66..797fd46 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -20,13 +20,29 @@ #include "xfs_dquot_item.h" #include "xfs_dquot.h" -#include "xfs_quota_priv.h" struct xfs_inode; extern struct kmem_zone *xfs_qm_dqtrxzone; /* + * Number of bmaps that we ask from bmapi when doing a quotacheck. + * We make this restriction to keep the memory usage to a minimum. + */ +#define XFS_DQITER_MAP_SIZE 10 + +#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ + !dqp->q_core.d_blk_hardlimit && \ + !dqp->q_core.d_blk_softlimit && \ + !dqp->q_core.d_rtb_hardlimit && \ + !dqp->q_core.d_rtb_softlimit && \ + !dqp->q_core.d_ino_hardlimit && \ + !dqp->q_core.d_ino_softlimit && \ + !dqp->q_core.d_bcount && \ + !dqp->q_core.d_rtbcount && \ + !dqp->q_core.d_icount) + +/* * This defines the unit of allocation of dquots. * Currently, it is just one file system block, and a 4K blk contains 30 * (136 * 30 = 4080) dquots. It's probably not worth trying to make diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 437c919..3daf5ea 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -278,7 +278,7 @@ xfs_qm_scall_trunc_qfiles( xfs_mount_t *mp, uint flags) { - int error = 0, error2 = 0; + int error; if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) { xfs_debug(mp, "%s: flags=%x m_qflags=%x", @@ -286,14 +286,20 @@ xfs_qm_scall_trunc_qfiles( return XFS_ERROR(EINVAL); } - if (flags & XFS_DQ_USER) + if (flags & XFS_DQ_USER) { error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino); - if (flags & XFS_DQ_GROUP) - error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino); + if (error) + return error; + } + if (flags & XFS_DQ_GROUP) { + error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino); + if (error) + return error; + } if (flags & XFS_DQ_PROJ) - error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino); + error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino); - return error ? error : error2; + return error; } /* diff --git a/fs/xfs/xfs_quota_priv.h b/fs/xfs/xfs_quota_priv.h deleted file mode 100644 index 6d86219..0000000 --- a/fs/xfs/xfs_quota_priv.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2000-2003 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_QUOTA_PRIV_H__ -#define __XFS_QUOTA_PRIV_H__ - -/* - * Number of bmaps that we ask from bmapi when doing a quotacheck. - * We make this restriction to keep the memory usage to a minimum. - */ -#define XFS_DQITER_MAP_SIZE 10 - -#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ - !dqp->q_core.d_blk_hardlimit && \ - !dqp->q_core.d_blk_softlimit && \ - !dqp->q_core.d_rtb_hardlimit && \ - !dqp->q_core.d_rtb_softlimit && \ - !dqp->q_core.d_ino_hardlimit && \ - !dqp->q_core.d_ino_softlimit && \ - !dqp->q_core.d_bcount && \ - !dqp->q_core.d_rtbcount && \ - !dqp->q_core.d_icount) - -#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \ - (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \ - (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???"))) - -#endif /* __XFS_QUOTA_PRIV_H__ */ diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 9b96d35..b5bc1ab 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -64,7 +64,7 @@ typedef struct xfs_log_item { struct xfs_item_ops { void (*iop_size)(xfs_log_item_t *, int *, int *); - void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); + void (*iop_format)(xfs_log_item_t *, struct xfs_log_vec *); void (*iop_pin)(xfs_log_item_t *); void (*iop_unpin)(xfs_log_item_t *, int remove); uint (*iop_push)(struct xfs_log_item *, struct list_head *); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index c035d11..647b6f1 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -314,7 +314,18 @@ xfs_trans_read_buf_map( ASSERT(bp->b_iodone == NULL); XFS_BUF_READ(bp); bp->b_ops = ops; - xfsbdstrat(tp->t_mountp, bp); + + /* + * XXX(hch): clean up the error handling here to be less + * of a mess.. + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + trace_xfs_bdstrat_shut(bp, _RET_IP_); + xfs_bioerror_relse(bp); + } else { + xfs_buf_iorequest(bp); + } + error = xfs_buf_iowait(bp); if (error) { xfs_buf_ioerror_alert(bp, __func__); diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index cd2a10e..4117286 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -295,8 +295,8 @@ xfs_trans_mod_dquot( /* * Given an array of dqtrx structures, lock all the dquots associated and join * them to the transaction, provided they have been modified. We know that the - * highest number of dquots of one type - usr, grp OR prj - involved in a - * transaction is 2 so we don't need to make this very generic. + * highest number of dquots of one type - usr, grp and prj - involved in a + * transaction is 3 so we don't need to make this very generic. */ STATIC void xfs_trans_dqlockedjoin( diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c index 2fd59c0..2ffd3e33 100644 --- a/fs/xfs/xfs_trans_resv.c +++ b/fs/xfs/xfs_trans_resv.c @@ -174,7 +174,7 @@ xfs_calc_itruncate_reservation( xfs_calc_buf_res(5, 0) + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) + + xfs_calc_buf_res(2 + mp->m_ialloc_blks + mp->m_in_maxlevels, 0))); } @@ -282,7 +282,7 @@ xfs_calc_create_resv_modify( * For create we can allocate some inodes giving: * the agi and agf of the ag getting the new inodes: 2 * sectorsize * the superblock for the nlink flag: sector size - * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize + * the inode blocks allocated: mp->m_ialloc_blks * blocksize * the inode btree: max depth * blocksize * the allocation btrees: 2 trees * (max depth - 1) * block size */ @@ -292,7 +292,7 @@ xfs_calc_create_resv_alloc( { return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + mp->m_sb.sb_sectsize + - xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(mp->m_ialloc_blks, XFS_FSB_TO_B(mp, 1)) + xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), XFS_FSB_TO_B(mp, 1)); @@ -385,9 +385,9 @@ xfs_calc_ifree_reservation( xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + - max_t(uint, XFS_FSB_TO_B(mp, 1), XFS_INODE_CLUSTER_SIZE(mp)) + + max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size) + xfs_calc_buf_res(1, 0) + - xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) + + xfs_calc_buf_res(2 + mp->m_ialloc_blks + mp->m_in_maxlevels, 0) + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), XFS_FSB_TO_B(mp, 1)); diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h index 7d2c920..af5dbe0 100644 --- a/fs/xfs/xfs_trans_space.h +++ b/fs/xfs/xfs_trans_space.h @@ -47,7 +47,7 @@ #define XFS_DIRREMOVE_SPACE_RES(mp) \ XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK) #define XFS_IALLOC_SPACE_RES(mp) \ - (XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels - 1) + ((mp)->m_ialloc_blks + (mp)->m_in_maxlevels - 1) /* * Space reservation values for various transactions. diff --git a/fs/xfs/xfs_vnode.h b/fs/xfs/xfs_vnode.h index 3e8e797..e8a7738 100644 --- a/fs/xfs/xfs_vnode.h +++ b/fs/xfs/xfs_vnode.h @@ -35,15 +35,6 @@ struct attrlist_cursor_kern; { IO_INVIS, "INVIS"} /* - * Flush/Invalidate options for vop_toss/flush/flushinval_pages. - */ -#define FI_NONE 0 /* none */ -#define FI_REMAPF 1 /* Do a remapf prior to the operation */ -#define FI_REMAPF_LOCKED 2 /* Do a remapf prior to the operation. - Prevent VM access to the pages until - the operation completes. */ - -/* * Some useful predicates. */ #define VN_MAPPED(vp) mapping_mapped(vp->i_mapping) diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 9d47907..78ed92a 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -102,8 +102,8 @@ const struct xattr_handler *xfs_xattr_handlers[] = { &xfs_xattr_trusted_handler, &xfs_xattr_security_handler, #ifdef CONFIG_XFS_POSIX_ACL - &xfs_xattr_acl_access_handler, - &xfs_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif NULL }; |