Diffstat (limited to 'fs')
258 files changed, 5004 insertions, 3137 deletions
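Most of the per-filesystem churn below comes from two tree-wide changes: the split of <linux/sched.h> into finer-grained headers (hence the added <linux/sched/signal.h>, <linux/sched/task_stack.h>, <linux/cred.h> and similar includes) and the new ->getattr() inode-operation prototype added for statx(), which replaces the (vfsmount, dentry) pair with a struct path pointer plus a STATX_* request mask and AT_STATX_* query flags. As a reference for reading those hunks, here is a minimal sketch of the before/after shape of a getattr implementation; it is illustrative only and not part of this patch set — the filesystem name "foo" and the use of generic_fillattr() are assumptions, though generic_fillattr(inode, stat) is the helper most in-tree filesystems of this era use.

/*
 * Illustrative sketch of the ->getattr() conversion seen throughout
 * this diff (9p, AFS, bad_inode, ...).  "foo" is a hypothetical
 * filesystem, not one touched by this series.
 */
#include <linux/fs.h>
#include <linux/stat.h>

/*
 * Old prototype, removed by this series:
 *
 *   static int foo_getattr(struct vfsmount *mnt, struct dentry *dentry,
 *                          struct kstat *stat);
 */

static int foo_getattr(const struct path *path, struct kstat *stat,
		       u32 request_mask, unsigned int query_flags)
{
	/* The dentry formerly passed directly is now taken from the path. */
	struct inode *inode = d_inode(path->dentry);

	/*
	 * request_mask says which STATX_* fields the caller wants and
	 * query_flags carries AT_STATX_* behaviour; a filesystem may use
	 * them to skip expensive work, or ignore them and fill everything,
	 * as the conversions in this diff do.
	 */
	generic_fillattr(inode, stat);
	return 0;
}

static const struct inode_operations foo_inode_operations = {
	.getattr = foo_getattr,
};

Filesystems that previously dereferenced the dentry argument now recover it via path->dentry, which is exactly the pattern the v9fs_vfs_getattr, afs_getattr and bad_inode_getattr hunks below follow.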
diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 60fb474..ed4f851 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -91,10 +91,10 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any) * dentry names. */ static int build_path_from_dentry(struct v9fs_session_info *v9ses, - struct dentry *dentry, char ***names) + struct dentry *dentry, const unsigned char ***names) { int n = 0, i; - char **wnames; + const unsigned char **wnames; struct dentry *ds; for (ds = dentry; !IS_ROOT(ds); ds = ds->d_parent) @@ -105,7 +105,7 @@ static int build_path_from_dentry(struct v9fs_session_info *v9ses, goto err_out; for (ds = dentry, i = (n-1); i >= 0; i--, ds = ds->d_parent) - wnames[i] = (char *)ds->d_name.name; + wnames[i] = ds->d_name.name; *names = wnames; return n; @@ -117,7 +117,7 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, kuid_t uid, int any) { struct dentry *ds; - char **wnames, *uname; + const unsigned char **wnames, *uname; int i, n, l, clone, access; struct v9fs_session_info *v9ses; struct p9_fid *fid, *old_fid = NULL; @@ -137,7 +137,7 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, fid = v9fs_fid_find(ds, uid, any); if (fid) { /* Found the parent fid do a lookup with that */ - fid = p9_client_walk(fid, 1, (char **)&dentry->d_name.name, 1); + fid = p9_client_walk(fid, 1, &dentry->d_name.name, 1); goto fid_out; } up_read(&v9ses->rename_sem); diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 072e759..a89f3cf 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -29,6 +29,7 @@ #include <linux/errno.h> #include <linux/fs.h> #include <linux/sched.h> +#include <linux/cred.h> #include <linux/parser.h> #include <linux/idr.h> #include <linux/slab.h> diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index f4f4450..2a5de61 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -643,7 +643,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, struct dentry *dentry, char *extension, u32 perm, u8 mode) { int err; - char *name; + const unsigned char *name; struct p9_fid *dfid, *ofid, *fid; struct inode *inode; @@ -652,7 +652,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, err = 0; ofid = NULL; fid = NULL; - name = (char *) dentry->d_name.name; + name = dentry->d_name.name; dfid = v9fs_parent_fid(dentry); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); @@ -788,7 +788,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, struct v9fs_session_info *v9ses; struct p9_fid *dfid, *fid; struct inode *inode; - char *name; + const unsigned char *name; p9_debug(P9_DEBUG_VFS, "dir: %p dentry: (%pd) %p flags: %x\n", dir, dentry, dentry, flags); @@ -802,7 +802,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, if (IS_ERR(dfid)) return ERR_CAST(dfid); - name = (char *) dentry->d_name.name; + name = dentry->d_name.name; fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { if (fid == ERR_PTR(-ENOENT)) { @@ -1012,7 +1012,7 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, } v9fs_blank_wstat(&wstat); wstat.muid = v9ses->uname; - wstat.name = (char *) new_dentry->d_name.name; + wstat.name = new_dentry->d_name.name; retval = p9_client_wstat(oldfid, &wstat); clunk_newdir: @@ -1047,16 +1047,18 @@ done: /** * v9fs_vfs_getattr - retrieve file metadata - * @mnt: mount information - * @dentry: file to get attributes on + * @path: Object to query * @stat: metadata structure to populate + * @request_mask: Mask of STATX_xxx flags indicating the caller's interests + * 
@flags: AT_STATX_xxx setting * */ static int -v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +v9fs_vfs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { + struct dentry *dentry = path->dentry; struct v9fs_session_info *v9ses; struct p9_fid *fid; struct p9_wstat *st; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 5999bd0..70f9887 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -244,7 +244,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, int err = 0; kgid_t gid; umode_t mode; - char *name = NULL; + const unsigned char *name = NULL; struct p9_qid qid; struct inode *inode; struct p9_fid *fid = NULL; @@ -269,7 +269,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, v9ses = v9fs_inode2v9ses(dir); - name = (char *) dentry->d_name.name; + name = dentry->d_name.name; p9_debug(P9_DEBUG_VFS, "name:%s flags:0x%x mode:0x%hx\n", name, flags, omode); @@ -385,7 +385,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct v9fs_session_info *v9ses; struct p9_fid *fid = NULL, *dfid = NULL; kgid_t gid; - char *name; + const unsigned char *name; umode_t mode; struct inode *inode; struct p9_qid qid; @@ -416,7 +416,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, err); goto error; } - name = (char *) dentry->d_name.name; + name = dentry->d_name.name; err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid); if (err < 0) goto error; @@ -468,9 +468,10 @@ error: } static int -v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +v9fs_vfs_getattr_dotl(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { + struct dentry *dentry = path->dentry; struct v9fs_session_info *v9ses; struct p9_fid *fid; struct p9_stat_dotl *st; @@ -678,14 +679,14 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, { int err; kgid_t gid; - char *name; + const unsigned char *name; struct p9_qid qid; struct inode *inode; struct p9_fid *dfid; struct p9_fid *fid = NULL; struct v9fs_session_info *v9ses; - name = (char *) dentry->d_name.name; + name = dentry->d_name.name; p9_debug(P9_DEBUG_VFS, "%lu,%s,%s\n", dir->i_ino, name, symname); v9ses = v9fs_inode2v9ses(dir); @@ -699,7 +700,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, gid = v9fs_get_fsgid_for_create(dir); /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. 
*/ - err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid); + err = p9_client_symlink(dfid, name, symname, gid, &qid); if (err < 0) { p9_debug(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err); @@ -775,7 +776,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, if (IS_ERR(oldfid)) return PTR_ERR(oldfid); - err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name); + err = p9_client_link(dfid, oldfid, dentry->d_name.name); if (err < 0) { p9_debug(P9_DEBUG_VFS, "p9_client_link failed %d\n", err); @@ -812,7 +813,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, { int err; kgid_t gid; - char *name; + const unsigned char *name; umode_t mode; struct v9fs_session_info *v9ses; struct p9_fid *fid = NULL, *dfid = NULL; @@ -842,7 +843,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, err); goto error; } - name = (char *) dentry->d_name.name; + name = dentry->d_name.name; err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid); if (err < 0) diff --git a/fs/affs/inode.c b/fs/affs/inode.c index a5e6097..abcc598 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -10,6 +10,7 @@ * (C) 1991 Linus Torvalds - minix filesystem */ #include <linux/sched.h> +#include <linux/cred.h> #include <linux/gfp.h> #include "affs.h" diff --git a/fs/affs/super.c b/fs/affs/super.c index 37532538..c2c27a8 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -16,6 +16,7 @@ #include <linux/parser.h> #include <linux/magic.h> #include <linux/sched.h> +#include <linux/cred.h> #include <linux/slab.h> #include <linux/writeback.h> #include <linux/blkdev.h> diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 86cc726..1e4897a 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -375,12 +375,10 @@ error_unlock: /* * read the attributes of an inode */ -int afs_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +int afs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) { - struct inode *inode; - - inode = d_inode(dentry); + struct inode *inode = d_inode(path->dentry); _enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 8acf367..5dfa569 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -533,7 +533,7 @@ extern struct inode *afs_iget(struct super_block *, struct key *, struct afs_callback *); extern void afs_zap_data(struct afs_vnode *); extern int afs_validate(struct afs_vnode *, struct key *); -extern int afs_getattr(struct vfsmount *, struct dentry *, struct kstat *); +extern int afs_getattr(const struct path *, struct kstat *, u32, unsigned int); extern int afs_setattr(struct dentry *, struct iattr *); extern void afs_evict_inode(struct inode *); extern int afs_drop_inode(struct inode *); diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 95f4287..419ef05 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -10,6 +10,8 @@ */ #include <linux/slab.h> +#include <linux/sched/signal.h> + #include <net/sock.h> #include <net/af_rxrpc.h> #include <rxrpc/packet.h> @@ -260,8 +262,7 @@ void afs_flat_call_destructor(struct afs_call *call) /* * attach the data from a bunch of pages on an inode to a call */ -static int afs_send_pages(struct afs_call *call, struct msghdr *msg, - struct kvec *iov) +static int afs_send_pages(struct afs_call *call, struct msghdr *msg) { struct page *pages[8]; unsigned count, n, loop, offset, to; @@ -284,20 +285,21 @@ static int afs_send_pages(struct afs_call *call, struct 
msghdr *msg, loop = 0; do { + struct bio_vec bvec = {.bv_page = pages[loop], + .bv_offset = offset}; msg->msg_flags = 0; to = PAGE_SIZE; if (first + loop >= last) to = call->last_to; else msg->msg_flags = MSG_MORE; - iov->iov_base = kmap(pages[loop]) + offset; - iov->iov_len = to - offset; + bvec.bv_len = to - offset; offset = 0; _debug("- range %u-%u%s", offset, to, msg->msg_flags ? " [more]" : ""); - iov_iter_kvec(&msg->msg_iter, WRITE | ITER_KVEC, - iov, 1, to - offset); + iov_iter_bvec(&msg->msg_iter, WRITE | ITER_BVEC, + &bvec, 1, to - offset); /* have to change the state *before* sending the last * packet as RxRPC might give us the reply before it @@ -306,7 +308,6 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg, call->state = AFS_CALL_AWAIT_REPLY; ret = rxrpc_kernel_send_data(afs_socket, call->rxcall, msg, to - offset); - kunmap(pages[loop]); if (ret < 0) break; } while (++loop < count); @@ -391,7 +392,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, goto error_do_abort; if (call->send_pages) { - ret = afs_send_pages(call, &msg, iov); + ret = afs_send_pages(call, &msg); if (ret < 0) goto error_do_abort; } @@ -20,7 +20,7 @@ #include <linux/backing-dev.h> #include <linux/uio.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mm.h> @@ -1495,7 +1495,7 @@ static ssize_t aio_read(struct kiocb *req, struct iocb *iocb, bool vectored, return ret; ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) - ret = aio_ret(req, file->f_op->read_iter(req, &iter)); + ret = aio_ret(req, call_read_iter(file, req, &iter)); kfree(iovec); return ret; } @@ -1520,7 +1520,7 @@ static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored, if (!ret) { req->ki_flags |= IOCB_WRITE; file_start_write(file); - ret = aio_ret(req, file->f_op->write_iter(req, &iter)); + ret = aio_ret(req, call_write_iter(file, req, &iter)); /* * We release freeze protection in aio_complete(). 
Fool lockdep * by telling it the lock got released so that it doesn't @@ -9,6 +9,7 @@ #include <linux/time.h> #include <linux/mm.h> #include <linux/string.h> +#include <linux/sched/signal.h> #include <linux/capability.h> #include <linux/fsnotify.h> #include <linux/fcntl.h> diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index c885daa..beef981 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -14,6 +14,7 @@ #include <linux/mutex.h> #include <linux/spinlock.h> #include <linux/list.h> +#include <linux/completion.h> /* This is the range of ioctl() numbers we claim as ours */ #define AUTOFS_IOC_FIRST AUTOFS_IOC_READY diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 806df74..734cbf8 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -17,6 +17,7 @@ #include <linux/file.h> #include <linux/fdtable.h> #include <linux/sched.h> +#include <linux/cred.h> #include <linux/compat.h> #include <linux/syscalls.h> #include <linux/magic.h> diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 79fbd85..24a58bf 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -10,6 +10,7 @@ #include <linux/slab.h> #include <linux/time.h> #include <linux/signal.h> +#include <linux/sched/signal.h> #include <linux/file.h> #include "autofs_i.h" diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 5f685c8..bb53728 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -89,8 +89,8 @@ static int bad_inode_permission(struct inode *inode, int mask) return -EIO; } -static int bad_inode_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +static int bad_inode_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) { return -EIO; } diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 1940716..c500e95 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -18,6 +18,7 @@ #include <linux/parser.h> #include <linux/namei.h> #include <linux/sched.h> +#include <linux/cred.h> #include <linux/exportfs.h> #include "befs.h" diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 2a59139..9be82c4 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -25,6 +25,7 @@ #include <linux/init.h> #include <linux/coredump.h> #include <linux/slab.h> +#include <linux/sched/task_stack.h> #include <linux/uaccess.h> #include <asm/cacheflush.h> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 443a6f5..5075fd5 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -35,6 +35,10 @@ #include <linux/utsname.h> #include <linux/coredump.h> #include <linux/sched.h> +#include <linux/sched/coredump.h> +#include <linux/sched/task_stack.h> +#include <linux/sched/cputime.h> +#include <linux/cred.h> #include <linux/dax.h> #include <linux/uaccess.h> #include <asm/param.h> diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index ffca4bb..cf93a4f 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -15,6 +15,9 @@ #include <linux/fs.h> #include <linux/stat.h> #include <linux/sched.h> +#include <linux/sched/coredump.h> +#include <linux/sched/task_stack.h> +#include <linux/sched/cputime.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/errno.h> diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 9b2917a..2edcefc 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -19,6 +19,7 @@ #include <linux/kernel.h> #include <linux/sched.h> +#include <linux/sched/task_stack.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/errno.h> diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 
9b4688a..bee1a36 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -12,7 +12,7 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> -#include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/magic.h> #include <linux/binfmts.h> #include <linux/slab.h> diff --git a/fs/block_dev.c b/fs/block_dev.c index 77c30f1..2eca00e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -870,6 +870,7 @@ static void init_once(void *foo) #ifdef CONFIG_SYSFS INIT_LIST_HEAD(&bdev->bd_holder_disks); #endif + bdev->bd_bdi = &noop_backing_dev_info; inode_init_once(&ei->vfs_inode); /* Initialize mutex for freeze. */ mutex_init(&bdev->bd_fsfreeze_mutex); @@ -884,8 +885,10 @@ static void bdev_evict_inode(struct inode *inode) spin_lock(&bdev_lock); list_del_init(&bdev->bd_list); spin_unlock(&bdev_lock); - if (bdev->bd_bdi != &noop_backing_dev_info) + if (bdev->bd_bdi != &noop_backing_dev_info) { bdi_put(bdev->bd_bdi); + bdev->bd_bdi = &noop_backing_dev_info; + } } static const struct super_operations bdev_sops = { @@ -988,7 +991,6 @@ struct block_device *bdget(dev_t dev) bdev->bd_contains = NULL; bdev->bd_super = NULL; bdev->bd_inode = inode; - bdev->bd_bdi = &noop_backing_dev_info; bdev->bd_block_size = i_blocksize(inode); bdev->bd_part_count = 0; bdev->bd_invalidated = 0; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 819a6d2..0c6baab 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -237,20 +237,20 @@ static inline u64 btrfs_ino(struct btrfs_inode *inode) return ino; } -static inline void btrfs_i_size_write(struct inode *inode, u64 size) +static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size) { - i_size_write(inode, size); - BTRFS_I(inode)->disk_i_size = size; + i_size_write(&inode->vfs_inode, size); + inode->disk_i_size = size; } -static inline bool btrfs_is_free_space_inode(struct inode *inode) +static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; if (root == root->fs_info->tree_root && - btrfs_ino(BTRFS_I(inode)) != BTRFS_BTREE_INODE_OBJECTID) + btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID) return true; - if (BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) + if (inode->location.objectid == BTRFS_FREE_INO_OBJECTID) return true; return false; } @@ -311,34 +311,33 @@ struct btrfs_dio_private { * to grab i_mutex. It is used to avoid the endless truncate due to * nonlocked dio read. 
*/ -static inline void btrfs_inode_block_unlocked_dio(struct inode *inode) +static inline void btrfs_inode_block_unlocked_dio(struct btrfs_inode *inode) { - set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags); + set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags); smp_mb(); } -static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode) +static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode) { smp_mb__before_atomic(); - clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, - &BTRFS_I(inode)->runtime_flags); + clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags); } -static inline void btrfs_print_data_csum_error(struct inode *inode, +static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode, u64 logical_start, u32 csum, u32 csum_expected, int mirror_num) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; /* Output minus objectid, which is more meaningful */ if (root->objectid >= BTRFS_LAST_FREE_OBJECTID) btrfs_warn_rl(root->fs_info, "csum failed root %lld ino %lld off %llu csum 0x%08x expected csum 0x%08x mirror %d", - root->objectid, btrfs_ino(BTRFS_I(inode)), + root->objectid, btrfs_ino(inode), logical_start, csum, csum_expected, mirror_num); else btrfs_warn_rl(root->fs_info, "csum failed root %llu ino %llu off %llu csum 0x%08x expected csum 0x%08x mirror %d", - root->objectid, btrfs_ino(BTRFS_I(inode)), + root->objectid, btrfs_ino(inode), logical_start, csum, csum_expected, mirror_num); } diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 903c32c..c7721a6 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -100,7 +100,7 @@ static struct bio *compressed_bio_alloc(struct block_device *bdev, return btrfs_bio_alloc(bdev, first_byte >> 9, BIO_MAX_PAGES, gfp_flags); } -static int check_compressed_csum(struct inode *inode, +static int check_compressed_csum(struct btrfs_inode *inode, struct compressed_bio *cb, u64 disk_start) { @@ -111,7 +111,7 @@ static int check_compressed_csum(struct inode *inode, u32 csum; u32 *cb_sum = &cb->sums; - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + if (inode->flags & BTRFS_INODE_NODATASUM) return 0; for (i = 0; i < cb->nr_pages; i++) { @@ -125,7 +125,7 @@ static int check_compressed_csum(struct inode *inode, if (csum != *cb_sum) { btrfs_print_data_csum_error(inode, disk_start, csum, - *cb_sum, cb->mirror_num); + *cb_sum, cb->mirror_num); ret = -EIO; goto fail; } @@ -165,7 +165,7 @@ static void end_compressed_bio_read(struct bio *bio) goto out; inode = cb->inode; - ret = check_compressed_csum(inode, cb, + ret = check_compressed_csum(BTRFS_I(inode), cb, (u64)bio->bi_iter.bi_sector << 9); if (ret) goto csum_failed; @@ -911,32 +911,28 @@ static void free_workspaces(void) } /* - * given an address space and start/len, compress the bytes. + * Given an address space and start and length, compress the bytes into @pages + * that are allocated on demand. * - * pages are allocated to hold the compressed result and stored - * in 'pages' + * @out_pages is an in/out parameter, holds maximum number of pages to allocate + * and returns number of actually allocated pages * - * out_pages is used to return the number of pages allocated. There - * may be pages allocated even if we return an error - * - * total_in is used to return the number of bytes actually read. It - * may be smaller then len if we had to exit early because we + * @total_in is used to return the number of bytes actually read. 
It + * may be smaller than the input length if we had to exit early because we * ran out of room in the pages array or because we cross the * max_out threshold. * - * total_out is used to return the total number of compressed bytes + * @total_out is an in/out parameter, must be set to the input length and will + * be also used to return the total number of compressed bytes * - * max_out tells us the max number of bytes that we're allowed to + * @max_out tells us the max number of bytes that we're allowed to * stuff into pages */ int btrfs_compress_pages(int type, struct address_space *mapping, - u64 start, unsigned long len, - struct page **pages, - unsigned long nr_dest_pages, + u64 start, struct page **pages, unsigned long *out_pages, unsigned long *total_in, - unsigned long *total_out, - unsigned long max_out) + unsigned long *total_out) { struct list_head *workspace; int ret; @@ -944,10 +940,9 @@ int btrfs_compress_pages(int type, struct address_space *mapping, workspace = find_workspace(type); ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping, - start, len, pages, - nr_dest_pages, out_pages, - total_in, total_out, - max_out); + start, pages, + out_pages, + total_in, total_out); free_workspace(type, workspace); return ret; } @@ -1015,7 +1010,7 @@ void btrfs_exit_compress(void) * * total_out is the last byte of the buffer */ -int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, +int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, unsigned long total_out, u64 disk_start, struct bio *bio) { diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 0987957..39ec43a 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -19,20 +19,32 @@ #ifndef __BTRFS_COMPRESSION_ #define __BTRFS_COMPRESSION_ +/* + * We want to make sure that amount of RAM required to uncompress an extent is + * reasonable, so we limit the total size in ram of a compressed extent to + * 128k. This is a crucial number because it also controls how easily we can + * spread reads across cpus for decompression. + * + * We also want to make sure the amount of IO required to do a random read is + * reasonably small, so we limit the size of a compressed extent to 128k. 
+ */ + +/* Maximum length of compressed data stored on disk */ +#define BTRFS_MAX_COMPRESSED (SZ_128K) +/* Maximum size of data before compression */ +#define BTRFS_MAX_UNCOMPRESSED (SZ_128K) + void btrfs_init_compress(void); void btrfs_exit_compress(void); int btrfs_compress_pages(int type, struct address_space *mapping, - u64 start, unsigned long len, - struct page **pages, - unsigned long nr_dest_pages, + u64 start, struct page **pages, unsigned long *out_pages, unsigned long *total_in, - unsigned long *total_out, - unsigned long max_out); + unsigned long *total_out); int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); -int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, +int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, unsigned long total_out, u64 disk_start, struct bio *bio); @@ -59,13 +71,11 @@ struct btrfs_compress_op { int (*compress_pages)(struct list_head *workspace, struct address_space *mapping, - u64 start, unsigned long len, + u64 start, struct page **pages, - unsigned long nr_dest_pages, unsigned long *out_pages, unsigned long *total_in, - unsigned long *total_out, - unsigned long max_out); + unsigned long *total_out); int (*decompress_bio)(struct list_head *workspace, struct page **pages_in, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 1192bc7..7dc8844 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -453,8 +453,6 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) struct rb_node *parent = NULL; struct tree_mod_elem *cur; - BUG_ON(!tm); - tm->seq = btrfs_inc_tree_mod_seq(fs_info); tm_root = &fs_info->tree_mod_log; @@ -4159,6 +4157,9 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans, /* try to push all the items before our slot into the next leaf */ slot = path->slots[0]; + space_needed = data_size; + if (slot > 0) + space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]); ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot); if (ret < 0) return ret; @@ -4214,6 +4215,10 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, if (wret < 0) return wret; if (wret) { + space_needed = data_size; + if (slot > 0) + space_needed -= btrfs_leaf_free_space(fs_info, + l); wret = push_leaf_left(trans, root, path, space_needed, space_needed, 0, (u32)-1); if (wret < 0) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 105d4d4..29b7fc2 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -20,6 +20,7 @@ #define __BTRFS_CTREE__ #include <linux/mm.h> +#include <linux/sched/signal.h> #include <linux/highmem.h> #include <linux/fs.h> #include <linux/rwsem.h> @@ -2687,7 +2688,7 @@ enum btrfs_flush_state { }; int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len); -int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes); +int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len); void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, u64 len); @@ -2695,16 +2696,16 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, - struct inode *inode); -void btrfs_orphan_release_metadata(struct inode *inode); + struct btrfs_inode *inode); +void 
btrfs_orphan_release_metadata(struct btrfs_inode *inode); int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, u64 *qgroup_reserved, bool use_global_rsv); void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); -int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes); -void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes); +int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); +void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes); int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len); void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len); void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); @@ -2982,7 +2983,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, const char *name, int name_len); int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, - int name_len, struct inode *dir, + int name_len, struct btrfs_inode *dir, struct btrfs_key *location, u8 type, u64 index); struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -3081,7 +3082,7 @@ int btrfs_csum_one_bio(struct inode *inode, struct bio *bio, u64 file_start, int contig); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); -void btrfs_extent_item_to_extent_map(struct inode *inode, +void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, struct btrfs_file_extent_item *fi, const bool new_inline, @@ -3100,9 +3101,9 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, int delay_iput); void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work); -struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, - size_t pg_offset, u64 start, u64 len, - int create); +struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, + struct page *page, size_t pg_offset, u64 start, + u64 len, int create); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, u64 *ram_bytes); @@ -3123,13 +3124,13 @@ static inline void btrfs_force_ra(struct address_space *mapping, } struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); -int btrfs_set_inode_index(struct inode *dir, u64 *index); +int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); int btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *dir, struct btrfs_inode *inode, const char *name, int name_len); int btrfs_add_link(struct btrfs_trans_handle *trans, - struct inode *parent_inode, struct inode *inode, + struct btrfs_inode *parent_inode, struct btrfs_inode *inode, const char *name, int name_len, int add_backref, u64 index); int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -3166,15 +3167,16 @@ void btrfs_destroy_cachep(void); long btrfs_ioctl_trans_end(struct file *file); struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, struct btrfs_root *root, int *was_new); -struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, - size_t pg_offset, u64 start, u64 end, - int create); +struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, + struct page *page, size_t 
pg_offset, + u64 start, u64 end, int create); int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); -int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); +int btrfs_orphan_add(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode); int btrfs_orphan_cleanup(struct btrfs_root *root); void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); @@ -3215,11 +3217,11 @@ ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen, int btrfs_auto_defrag_init(void); void btrfs_auto_defrag_exit(void); int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct inode *inode); + struct btrfs_inode *inode); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); -void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, +void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end, int skip_pinned); extern const struct file_operations btrfs_file_operations; int __btrfs_drop_extents(struct btrfs_trans_handle *trans, @@ -3233,7 +3235,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, int drop_cache); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, - struct inode *inode, u64 start, u64 end); + struct btrfs_inode *inode, u64 start, u64 end); int btrfs_release_file(struct inode *inode, struct file *file); int btrfs_dirty_pages(struct inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index f7a6ee5..1aff676 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1790,7 +1790,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) i_uid_write(inode, btrfs_stack_inode_uid(inode_item)); i_gid_write(inode, btrfs_stack_inode_gid(inode_item)); - btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); + btrfs_i_size_write(BTRFS_I(inode), btrfs_stack_inode_size(inode_item)); inode->i_mode = btrfs_stack_inode_mode(inode_item); set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 5de280b9..e653921 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -304,8 +304,9 @@ void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info) dev_replace->cursor_left_last_write_of_item; } -int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, char *tgtdev_name, - u64 srcdevid, char *srcdev_name, int read_src) +int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, + const char *tgtdev_name, u64 srcdevid, const char *srcdev_name, + int read_src) { struct btrfs_root *root = fs_info->dev_root; struct btrfs_trans_handle *trans; diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 54ea12b..f94a768 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -27,8 +27,9 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info); int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args); -int 
btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, char *tgtdev_name, - u64 srcdevid, char *srcdev_name, int read_src); +int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, + const char *tgtdev_name, u64 srcdevid, const char *srcdev_name, + int read_src); void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args); int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 724504a..60a7506 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -80,7 +80,8 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; u32 data_size; - BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root->fs_info)); + if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root->fs_info)) + return -ENOSPC; key.objectid = objectid; key.type = BTRFS_XATTR_ITEM_KEY; @@ -120,7 +121,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, */ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, - struct inode *dir, struct btrfs_key *location, + struct btrfs_inode *dir, struct btrfs_key *location, u8 type, u64 index) { int ret = 0; @@ -133,7 +134,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root struct btrfs_disk_key disk_key; u32 data_size; - key.objectid = btrfs_ino(BTRFS_I(dir)); + key.objectid = btrfs_ino(dir); key.type = BTRFS_DIR_ITEM_KEY; key.offset = btrfs_name_hash(name, name_len); @@ -174,7 +175,7 @@ second_insert: btrfs_release_path(path); ret2 = btrfs_insert_delayed_dir_index(trans, root->fs_info, name, - name_len, BTRFS_I(dir), &disk_key, type, index); + name_len, dir, &disk_key, type, index); out_free: btrfs_free_path(path); if (ret) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 207db02..08b74da 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -219,12 +219,12 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, * extents on the btree inode are pretty simple, there's one extent * that covers the entire device */ -static struct extent_map *btree_get_extent(struct inode *inode, +static struct extent_map *btree_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, u64 start, u64 len, int create) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; int ret; @@ -265,7 +265,7 @@ out: return em; } -u32 btrfs_csum_data(char *data, u32 seed, size_t len) +u32 btrfs_csum_data(const char *data, u32 seed, size_t len) { return btrfs_crc32c(seed, data, len); } @@ -2205,11 +2205,9 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->delalloc_workers); btrfs_destroy_workqueue(fs_info->workers); btrfs_destroy_workqueue(fs_info->endio_workers); - btrfs_destroy_workqueue(fs_info->endio_meta_workers); btrfs_destroy_workqueue(fs_info->endio_raid56_workers); btrfs_destroy_workqueue(fs_info->endio_repair_workers); btrfs_destroy_workqueue(fs_info->rmw_workers); - btrfs_destroy_workqueue(fs_info->endio_meta_write_workers); btrfs_destroy_workqueue(fs_info->endio_write_workers); btrfs_destroy_workqueue(fs_info->endio_freespace_worker); btrfs_destroy_workqueue(fs_info->submit_workers); @@ -2219,6 +2217,13 @@ static void btrfs_stop_all_workers(struct 
btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->flush_workers); btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers); btrfs_destroy_workqueue(fs_info->extent_workers); + /* + * Now that all other work queues are destroyed, we can safely destroy + * the queues used for metadata I/O, since tasks from those other work + * queues can do metadata I/O operations. + */ + btrfs_destroy_workqueue(fs_info->endio_meta_workers); + btrfs_destroy_workqueue(fs_info->endio_meta_write_workers); } static void free_root_extent_buffers(struct btrfs_root *root) @@ -3261,7 +3266,6 @@ fail_fsdev_sysfs: fail_block_groups: btrfs_put_block_group_cache(fs_info); - btrfs_free_block_groups(fs_info); fail_tree_roots: free_root_pointers(fs_info, 1); @@ -3269,6 +3273,7 @@ fail_tree_roots: fail_sb_buffer: btrfs_stop_all_workers(fs_info); + btrfs_free_block_groups(fs_info); fail_alloc: fail_iput: btrfs_mapping_tree_free(&fs_info->mapping_tree); @@ -3448,7 +3453,7 @@ static int write_dev_supers(struct btrfs_device *device, btrfs_set_super_bytenr(sb, bytenr); crc = ~(u32)0; - crc = btrfs_csum_data((char *)sb + + crc = btrfs_csum_data((const char *)sb + BTRFS_CSUM_SIZE, crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); @@ -3977,8 +3982,6 @@ void close_ctree(struct btrfs_fs_info *fs_info) btrfs_put_block_group_cache(fs_info); - btrfs_free_block_groups(fs_info); - /* * we must make sure there is not any read request to * submit after we stopping all workers. @@ -3986,6 +3989,8 @@ void close_ctree(struct btrfs_fs_info *fs_info) invalidate_inode_pages2(fs_info->btree_inode->i_mapping); btrfs_stop_all_workers(fs_info); + btrfs_free_block_groups(fs_info); + clear_bit(BTRFS_FS_OPEN, &fs_info->flags); free_root_pointers(fs_info, 1); @@ -4653,9 +4658,12 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) } static const struct extent_io_ops btree_extent_io_ops = { - .readpage_end_io_hook = btree_readpage_end_io_hook, - .readpage_io_failed_hook = btree_io_failed_hook, + /* mandatory callbacks */ .submit_bio_hook = btree_submit_bio_hook, + .readpage_end_io_hook = btree_readpage_end_io_hook, /* note we're sharing with inode.c for the merge bio hook */ .merge_bio_hook = btrfs_merge_bio_hook, + .readpage_io_failed_hook = btree_io_failed_hook, + + /* optional callbacks */ }; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 0be2d4f..2e0ec29 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -116,7 +116,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); -u32 btrfs_csum_data(char *data, u32 seed, size_t len); +u32 btrfs_csum_data(const char *data, u32 seed, size_t len); void btrfs_csum_final(u32 crc, u8 *result); int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, enum btrfs_wq_endio_type metadata); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c35b966..be54776 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -16,6 +16,7 @@ * Boston, MA 021110-1307, USA. */ #include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/blkdev.h> @@ -4135,10 +4136,10 @@ static u64 btrfs_space_info_used(struct btrfs_space_info *s_info, (may_use_included ? 
s_info->bytes_may_use : 0); } -int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes) +int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) { struct btrfs_space_info *data_sinfo; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; u64 used; int ret = 0; @@ -4281,7 +4282,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) round_down(start, fs_info->sectorsize); start = round_down(start, fs_info->sectorsize); - ret = btrfs_alloc_data_chunk_ondemand(inode, len); + ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len); if (ret < 0) return ret; @@ -5742,10 +5743,10 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) /* Can only return 0 or -ENOSPC */ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, - struct inode *inode) + struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_root *root = inode->root; /* * We always use trans->block_rsv here as we will have reserved space * for our orphan when starting the transaction, using get_block_rsv() @@ -5762,19 +5763,19 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, */ u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); - trace_btrfs_space_reservation(fs_info, "orphan", - btrfs_ino(BTRFS_I(inode)), num_bytes, 1); + trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode), + num_bytes, 1); return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); } -void btrfs_orphan_release_metadata(struct inode *inode) +void btrfs_orphan_release_metadata(struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_root *root = inode->root; u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); - trace_btrfs_space_reservation(fs_info, "orphan", - btrfs_ino(BTRFS_I(inode)), num_bytes, 0); + trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode), + num_bytes, 0); btrfs_block_rsv_release(fs_info, root->orphan_block_rsv, num_bytes); } @@ -5846,7 +5847,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, * reserved extents that need to be freed. This must be called with * BTRFS_I(inode)->lock held. */ -static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes) +static unsigned drop_outstanding_extent(struct btrfs_inode *inode, + u64 num_bytes) { unsigned drop_inode_space = 0; unsigned dropped_extents = 0; @@ -5854,25 +5856,23 @@ static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes) num_extents = count_max_extents(num_bytes); ASSERT(num_extents); - ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents); - BTRFS_I(inode)->outstanding_extents -= num_extents; + ASSERT(inode->outstanding_extents >= num_extents); + inode->outstanding_extents -= num_extents; - if (BTRFS_I(inode)->outstanding_extents == 0 && + if (inode->outstanding_extents == 0 && test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, - &BTRFS_I(inode)->runtime_flags)) + &inode->runtime_flags)) drop_inode_space = 1; /* * If we have more or the same amount of outstanding extents than we have * reserved then we need to leave the reserved extents count alone. 
*/ - if (BTRFS_I(inode)->outstanding_extents >= - BTRFS_I(inode)->reserved_extents) + if (inode->outstanding_extents >= inode->reserved_extents) return drop_inode_space; - dropped_extents = BTRFS_I(inode)->reserved_extents - - BTRFS_I(inode)->outstanding_extents; - BTRFS_I(inode)->reserved_extents -= dropped_extents; + dropped_extents = inode->reserved_extents - inode->outstanding_extents; + inode->reserved_extents -= dropped_extents; return dropped_extents + drop_inode_space; } @@ -5894,24 +5894,21 @@ static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes) * * This must be called with BTRFS_I(inode)->lock held. */ -static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes, +static u64 calc_csum_metadata_size(struct btrfs_inode *inode, u64 num_bytes, int reserve) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); u64 old_csums, num_csums; - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && - BTRFS_I(inode)->csum_bytes == 0) + if (inode->flags & BTRFS_INODE_NODATASUM && inode->csum_bytes == 0) return 0; - old_csums = btrfs_csum_bytes_to_leaves(fs_info, - BTRFS_I(inode)->csum_bytes); + old_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes); if (reserve) - BTRFS_I(inode)->csum_bytes += num_bytes; + inode->csum_bytes += num_bytes; else - BTRFS_I(inode)->csum_bytes -= num_bytes; - num_csums = btrfs_csum_bytes_to_leaves(fs_info, - BTRFS_I(inode)->csum_bytes); + inode->csum_bytes -= num_bytes; + num_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes); /* No change, no need to reserve more */ if (old_csums == num_csums) @@ -5924,10 +5921,10 @@ static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes, return btrfs_calc_trans_metadata_size(fs_info, old_csums - num_csums); } -int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) +int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_root *root = inode->root; struct btrfs_block_rsv *block_rsv = &fs_info->delalloc_block_rsv; u64 to_reserve = 0; u64 csum_bytes; @@ -5959,25 +5956,24 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) schedule_timeout(1); if (delalloc_lock) - mutex_lock(&BTRFS_I(inode)->delalloc_mutex); + mutex_lock(&inode->delalloc_mutex); num_bytes = ALIGN(num_bytes, fs_info->sectorsize); - spin_lock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); nr_extents = count_max_extents(num_bytes); - BTRFS_I(inode)->outstanding_extents += nr_extents; + inode->outstanding_extents += nr_extents; nr_extents = 0; - if (BTRFS_I(inode)->outstanding_extents > - BTRFS_I(inode)->reserved_extents) - nr_extents += BTRFS_I(inode)->outstanding_extents - - BTRFS_I(inode)->reserved_extents; + if (inode->outstanding_extents > inode->reserved_extents) + nr_extents += inode->outstanding_extents - + inode->reserved_extents; /* We always want to reserve a slot for updating the inode. 
*/ to_reserve = btrfs_calc_trans_metadata_size(fs_info, nr_extents + 1); to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); - csum_bytes = BTRFS_I(inode)->csum_bytes; - spin_unlock(&BTRFS_I(inode)->lock); + csum_bytes = inode->csum_bytes; + spin_unlock(&inode->lock); if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { ret = btrfs_qgroup_reserve_meta(root, @@ -5993,38 +5989,38 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) goto out_fail; } - spin_lock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, - &BTRFS_I(inode)->runtime_flags)) { + &inode->runtime_flags)) { to_reserve -= btrfs_calc_trans_metadata_size(fs_info, 1); release_extra = true; } - BTRFS_I(inode)->reserved_extents += nr_extents; - spin_unlock(&BTRFS_I(inode)->lock); + inode->reserved_extents += nr_extents; + spin_unlock(&inode->lock); if (delalloc_lock) - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); + mutex_unlock(&inode->delalloc_mutex); if (to_reserve) trace_btrfs_space_reservation(fs_info, "delalloc", - btrfs_ino(BTRFS_I(inode)), to_reserve, 1); + btrfs_ino(inode), to_reserve, 1); if (release_extra) btrfs_block_rsv_release(fs_info, block_rsv, btrfs_calc_trans_metadata_size(fs_info, 1)); return 0; out_fail: - spin_lock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); dropped = drop_outstanding_extent(inode, num_bytes); /* * If the inodes csum_bytes is the same as the original * csum_bytes then we know we haven't raced with any free()ers * so we can just reduce our inodes csum bytes and carry on. */ - if (BTRFS_I(inode)->csum_bytes == csum_bytes) { + if (inode->csum_bytes == csum_bytes) { calc_csum_metadata_size(inode, num_bytes, 0); } else { - u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes; + u64 orig_csum_bytes = inode->csum_bytes; u64 bytes; /* @@ -6035,8 +6031,8 @@ out_fail: * number of bytes that were freed while we were trying our * reservation. */ - bytes = csum_bytes - BTRFS_I(inode)->csum_bytes; - BTRFS_I(inode)->csum_bytes = csum_bytes; + bytes = csum_bytes - inode->csum_bytes; + inode->csum_bytes = csum_bytes; to_free = calc_csum_metadata_size(inode, bytes, 0); @@ -6045,7 +6041,7 @@ out_fail: * been making this reservation and our ->csum_bytes were not * artificially inflated. */ - BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes; + inode->csum_bytes = csum_bytes - num_bytes; bytes = csum_bytes - orig_csum_bytes; bytes = calc_csum_metadata_size(inode, bytes, 0); @@ -6057,23 +6053,23 @@ out_fail: * need to do anything, the other free-ers did the correct * thing. */ - BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes; + inode->csum_bytes = orig_csum_bytes - num_bytes; if (bytes > to_free) to_free = bytes - to_free; else to_free = 0; } - spin_unlock(&BTRFS_I(inode)->lock); + spin_unlock(&inode->lock); if (dropped) to_free += btrfs_calc_trans_metadata_size(fs_info, dropped); if (to_free) { btrfs_block_rsv_release(fs_info, block_rsv, to_free); trace_btrfs_space_reservation(fs_info, "delalloc", - btrfs_ino(BTRFS_I(inode)), to_free, 0); + btrfs_ino(inode), to_free, 0); } if (delalloc_lock) - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); + mutex_unlock(&inode->delalloc_mutex); return ret; } @@ -6086,27 +6082,27 @@ out_fail: * once we complete IO for a given set of bytes to release their metadata * reservations. 
*/ -void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) +void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); u64 to_free = 0; unsigned dropped; num_bytes = ALIGN(num_bytes, fs_info->sectorsize); - spin_lock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); dropped = drop_outstanding_extent(inode, num_bytes); if (num_bytes) to_free = calc_csum_metadata_size(inode, num_bytes, 0); - spin_unlock(&BTRFS_I(inode)->lock); + spin_unlock(&inode->lock); if (dropped > 0) to_free += btrfs_calc_trans_metadata_size(fs_info, dropped); if (btrfs_is_testing(fs_info)) return; - trace_btrfs_space_reservation(fs_info, "delalloc", - btrfs_ino(BTRFS_I(inode)), to_free, 0); + trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode), + to_free, 0); btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free); } @@ -6141,7 +6137,7 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len) ret = btrfs_check_data_free_space(inode, start, len); if (ret < 0) return ret; - ret = btrfs_delalloc_reserve_metadata(inode, len); + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); if (ret < 0) btrfs_free_reserved_data_space(inode, start, len); return ret; @@ -6164,7 +6160,7 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len) */ void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len) { - btrfs_delalloc_release_metadata(inode, len); + btrfs_delalloc_release_metadata(BTRFS_I(inode), len); btrfs_free_reserved_data_space(inode, start, len); } @@ -9740,6 +9736,11 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info) } } +/* + * Must be called only after stopping all workers, since we could have block + * group caching kthreads running, and therefore they could race with us if we + * freed the block groups before stopping them. + */ int btrfs_free_block_groups(struct btrfs_fs_info *info) { struct btrfs_block_group_cache *block_group; @@ -9779,9 +9780,6 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) list_del(&block_group->list); up_write(&block_group->space_info->groups_sem); - if (block_group->cached == BTRFS_CACHE_STARTED) - wait_block_group_cache_done(block_group); - /* * We haven't cached this block group, which means we could * possibly have excluded extents on this block group. 
@@ -9791,6 +9789,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) free_excluded_extents(info, block_group); btrfs_remove_free_space_cache(block_group); + ASSERT(block_group->cached != BTRFS_CACHE_STARTED); ASSERT(list_empty(&block_group->dirty_list)); ASSERT(list_empty(&block_group->io_list)); ASSERT(list_empty(&block_group->bg_list)); @@ -10342,7 +10341,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, mutex_unlock(&trans->transaction->cache_write_mutex); if (!IS_ERR(inode)) { - ret = btrfs_orphan_add(trans, inode); + ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) { btrfs_add_delayed_iput(inode); goto out; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d15b5dd..28e8192 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -428,7 +428,8 @@ static void clear_state_cb(struct extent_io_tree *tree, struct extent_state *state, unsigned *bits) { if (tree->ops && tree->ops->clear_bit_hook) - tree->ops->clear_bit_hook(tree->mapping->host, state, bits); + tree->ops->clear_bit_hook(BTRFS_I(tree->mapping->host), + state, bits); } static void set_state_bits(struct extent_io_tree *tree, @@ -1959,11 +1960,11 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) SetPageUptodate(page); } -int free_io_failure(struct inode *inode, struct io_failure_record *rec) +int free_io_failure(struct btrfs_inode *inode, struct io_failure_record *rec) { int ret; int err = 0; - struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + struct extent_io_tree *failure_tree = &inode->io_failure_tree; set_state_failrec(failure_tree, rec->start, NULL); ret = clear_extent_bits(failure_tree, rec->start, @@ -1972,7 +1973,7 @@ int free_io_failure(struct inode *inode, struct io_failure_record *rec) if (ret) err = ret; - ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, + ret = clear_extent_bits(&inode->io_tree, rec->start, rec->start + rec->len - 1, EXTENT_DAMAGED); if (ret && !err) @@ -1992,10 +1993,11 @@ int free_io_failure(struct inode *inode, struct io_failure_record *rec) * currently, there can be no more than two copies of every data bit. thus, * exactly one rewrite is required. 
*/ -int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, - struct page *page, unsigned int pg_offset, int mirror_num) +int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length, + u64 logical, struct page *page, + unsigned int pg_offset, int mirror_num) { - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio; struct btrfs_device *dev; u64 map_length = 0; @@ -2054,7 +2056,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, btrfs_info_rl_in_rcu(fs_info, "read error corrected: ino %llu off %llu (dev %s sector %llu)", - btrfs_ino(BTRFS_I(inode)), start, + btrfs_ino(inode), start, rcu_str_deref(dev->name), sector); btrfs_bio_counter_dec(fs_info); bio_put(bio); @@ -2074,7 +2076,7 @@ int repair_eb_io_failure(struct btrfs_fs_info *fs_info, for (i = 0; i < num_pages; i++) { struct page *p = eb->pages[i]; - ret = repair_io_failure(fs_info->btree_inode, start, + ret = repair_io_failure(BTRFS_I(fs_info->btree_inode), start, PAGE_SIZE, start, p, start - page_offset(p), mirror_num); if (ret) @@ -2089,23 +2091,23 @@ int repair_eb_io_failure(struct btrfs_fs_info *fs_info, * each time an IO finishes, we do a fast check in the IO failure tree * to see if we need to process or clean up an io_failure_record */ -int clean_io_failure(struct inode *inode, u64 start, struct page *page, +int clean_io_failure(struct btrfs_inode *inode, u64 start, struct page *page, unsigned int pg_offset) { u64 private; struct io_failure_record *failrec; - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_state *state; int num_copies; int ret; private = 0; - ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, + ret = count_range_bits(&inode->io_failure_tree, &private, (u64)-1, 1, EXTENT_DIRTY, 0); if (!ret) return 0; - ret = get_state_failrec(&BTRFS_I(inode)->io_failure_tree, start, + ret = get_state_failrec(&inode->io_failure_tree, start, &failrec); if (ret) return 0; @@ -2122,11 +2124,11 @@ int clean_io_failure(struct inode *inode, u64 start, struct page *page, if (fs_info->sb->s_flags & MS_RDONLY) goto out; - spin_lock(&BTRFS_I(inode)->io_tree.lock); - state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, + spin_lock(&inode->io_tree.lock); + state = find_first_extent_bit_state(&inode->io_tree, failrec->start, EXTENT_LOCKED); - spin_unlock(&BTRFS_I(inode)->io_tree.lock); + spin_unlock(&inode->io_tree.lock); if (state && state->start <= failrec->start && state->end >= failrec->start + failrec->len - 1) { @@ -2151,9 +2153,9 @@ out: * - under ordered extent * - the inode is freeing */ -void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end) +void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) { - struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + struct extent_io_tree *failure_tree = &inode->io_failure_tree; struct io_failure_record *failrec; struct extent_state *state, *next; @@ -2393,7 +2395,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, ret = btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror); if (!ret) { - free_io_failure(inode, failrec); + free_io_failure(BTRFS_I(inode), failrec); return -EIO; } @@ -2406,7 +2408,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, (int)phy_offset, failed_bio->bi_end_io, NULL); if (!bio) { - 
free_io_failure(inode, failrec); + free_io_failure(BTRFS_I(inode), failrec); return -EIO; } bio_set_op_attrs(bio, REQ_OP_READ, read_mode); @@ -2418,7 +2420,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, ret = tree->ops->submit_bio_hook(inode, bio, failrec->this_mirror, failrec->bio_flags, 0); if (ret) { - free_io_failure(inode, failrec); + free_io_failure(BTRFS_I(inode), failrec); bio_put(bio); } @@ -2435,12 +2437,9 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end) tree = &BTRFS_I(page->mapping->host)->io_tree; - if (tree->ops && tree->ops->writepage_end_io_hook) { - ret = tree->ops->writepage_end_io_hook(page, start, - end, NULL, uptodate); - if (ret) - uptodate = 0; - } + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, start, end, NULL, + uptodate); if (!uptodate) { ClearPageUptodate(page); @@ -2568,21 +2567,21 @@ static void end_bio_extent_readpage(struct bio *bio) len = bvec->bv_len; mirror = io_bio->mirror_num; - if (likely(uptodate && tree->ops && - tree->ops->readpage_end_io_hook)) { + if (likely(uptodate && tree->ops)) { ret = tree->ops->readpage_end_io_hook(io_bio, offset, page, start, end, mirror); if (ret) uptodate = 0; else - clean_io_failure(inode, start, page, 0); + clean_io_failure(BTRFS_I(inode), start, + page, 0); } if (likely(uptodate)) goto readpage_ok; - if (tree->ops && tree->ops->readpage_io_failed_hook) { + if (tree->ops) { ret = tree->ops->readpage_io_failed_hook(page, mirror); if (!ret && !bio->bi_error) uptodate = 1; @@ -2731,7 +2730,7 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num, bio->bi_private = NULL; bio_get(bio); - if (tree->ops && tree->ops->submit_bio_hook) + if (tree->ops) ret = tree->ops->submit_bio_hook(page->mapping->host, bio, mirror_num, bio_flags, start); else @@ -2746,7 +2745,7 @@ static int merge_bio(struct extent_io_tree *tree, struct page *page, unsigned long bio_flags) { int ret = 0; - if (tree->ops && tree->ops->merge_bio_hook) + if (tree->ops) ret = tree->ops->merge_bio_hook(page, offset, size, bio, bio_flags); return ret; @@ -2857,7 +2856,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, *em_cached = NULL; } - em = get_extent(inode, page, pg_offset, start, len, 0); + em = get_extent(BTRFS_I(inode), page, pg_offset, start, len, 0); if (em_cached && !IS_ERR_OR_NULL(em)) { BUG_ON(*em_cached); atomic_inc(&em->refs); @@ -3101,7 +3100,7 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree, inode = pages[0]->mapping->host; while (1) { lock_extent(tree, start, end); - ordered = btrfs_lookup_ordered_range(inode, start, + ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, end - start + 1); if (!ordered) break; @@ -3173,7 +3172,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, while (1) { lock_extent(tree, start, end); - ordered = btrfs_lookup_ordered_range(inode, start, + ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, PAGE_SIZE); if (!ordered) break; @@ -3370,7 +3369,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, page_end, NULL, 1); break; } - em = epd->get_extent(inode, page, pg_offset, cur, + em = epd->get_extent(BTRFS_I(inode), page, pg_offset, cur, end - cur + 1, 1); if (IS_ERR_OR_NULL(em)) { SetPageError(page); @@ -4335,7 +4334,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, if (len == 0) break; len = ALIGN(len, sectorsize); - em = get_extent(inode, NULL, 0, offset, len, 
0); + em = get_extent(BTRFS_I(inode), NULL, 0, offset, len, 0); if (IS_ERR_OR_NULL(em)) return em; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 270d03b..3e4fad4 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -84,6 +84,7 @@ extern void le_bitmap_clear(u8 *map, unsigned int start, int len); struct extent_state; struct btrfs_root; +struct btrfs_inode; struct btrfs_io_bio; struct io_failure_record; @@ -91,24 +92,34 @@ typedef int (extent_submit_bio_hook_t)(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset); struct extent_io_ops { - int (*fill_delalloc)(struct inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written); - int (*writepage_start_hook)(struct page *page, u64 start, u64 end); + /* + * The following callbacks must always be defined, the function + * pointer will be called unconditionally. + */ extent_submit_bio_hook_t *submit_bio_hook; + int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, + int mirror); int (*merge_bio_hook)(struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); - int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset, - struct page *page, u64 start, u64 end, - int mirror); - int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, + + /* + * Optional hooks, called if the pointer is not NULL + */ + int (*fill_delalloc)(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written); + + int (*writepage_start_hook)(struct page *page, u64 start, u64 end); + void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate); void (*set_bit_hook)(struct inode *inode, struct extent_state *state, unsigned *bits); - void (*clear_bit_hook)(struct inode *inode, struct extent_state *state, - unsigned *bits); + void (*clear_bit_hook)(struct btrfs_inode *inode, + struct extent_state *state, + unsigned *bits); void (*merge_extent_hook)(struct inode *inode, struct extent_state *new, struct extent_state *other); @@ -209,7 +220,7 @@ static inline int extent_compress_type(unsigned long bio_flags) struct extent_map_tree; -typedef struct extent_map *(get_extent_t)(struct inode *inode, +typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, struct page *page, size_t pg_offset, u64 start, u64 len, @@ -451,12 +462,13 @@ struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs); struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask); struct btrfs_fs_info; +struct btrfs_inode; -int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, - struct page *page, unsigned int pg_offset, - int mirror_num); -int clean_io_failure(struct inode *inode, u64 start, struct page *page, - unsigned int pg_offset); +int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length, + u64 logical, struct page *page, + unsigned int pg_offset, int mirror_num); +int clean_io_failure(struct btrfs_inode *inode, u64 start, + struct page *page, unsigned int pg_offset); void end_extent_writepage(struct page *page, int err, u64 start, u64 end); int repair_eb_io_failure(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, int mirror_num); @@ -480,7 +492,9 @@ struct io_failure_record { int in_validation; }; -void
btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end); + +void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, + u64 end); int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, struct io_failure_record **failrec_ret); int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, @@ -489,7 +503,7 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, struct io_failure_record *failrec, struct page *page, int pg_offset, int icsum, bio_end_io_t *endio_func, void *data); -int free_io_failure(struct inode *inode, struct io_failure_record *rec); +int free_io_failure(struct btrfs_inode *inode, struct io_failure_record *rec); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS noinline u64 find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index f7b9a92..64fcb31 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -214,7 +214,7 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, * read from the commit root and sidestep a nasty deadlock * between reading the free space cache and updating the csum tree. */ - if (btrfs_is_free_space_inode(inode)) { + if (btrfs_is_free_space_inode(BTRFS_I(inode))) { path->search_commit_root = 1; path->skip_locking = 1; } @@ -643,7 +643,33 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, /* delete the entire item, it is inside our range */ if (key.offset >= bytenr && csum_end <= end_byte) { - ret = btrfs_del_item(trans, root, path); + int del_nr = 1; + + /* + * Check how many csum items preceding this one in this + * leaf correspond to our range and then delete them all + * at once. + */ + if (key.offset > bytenr && path->slots[0] > 0) { + int slot = path->slots[0] - 1; + + while (slot >= 0) { + struct btrfs_key pk; + + btrfs_item_key_to_cpu(leaf, &pk, slot); + if (pk.offset < bytenr || + pk.type != BTRFS_EXTENT_CSUM_KEY || + pk.objectid != + BTRFS_EXTENT_CSUM_OBJECTID) + break; + path->slots[0] = slot; + del_nr++; + key.offset = pk.offset; + slot--; + } + } + ret = btrfs_del_items(trans, root, path, + path->slots[0], del_nr); if (ret) goto out; if (key.offset == bytenr) @@ -904,14 +930,14 @@ fail_unlock: goto out; } -void btrfs_extent_item_to_extent_map(struct inode *inode, +void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, struct btrfs_file_extent_item *fi, const bool new_inline, struct extent_map *em) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_root *root = inode->root; struct extent_buffer *leaf = path->nodes[0]; const int slot = path->slots[0]; struct btrfs_key key; @@ -976,8 +1002,8 @@ void btrfs_extent_item_to_extent_map(struct inode *inode, } } else { btrfs_err(fs_info, - "unknown file extent item type %d, inode %llu, offset %llu, root %llu", - type, btrfs_ino(BTRFS_I(inode)), extent_start, + "unknown file extent item type %d, inode %llu, offset %llu, " + "root %llu", type, btrfs_ino(inode), extent_start, root->root_key.objectid); } } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index c1d2a07..520cb72 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -92,10 +92,10 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1, * If an existing record is found the defrag item you * pass in is freed */ -static int __btrfs_add_inode_defrag(struct inode *inode, 
+static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, struct inode_defrag *defrag) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); struct inode_defrag *entry; struct rb_node **p; struct rb_node *parent = NULL; @@ -123,7 +123,7 @@ static int __btrfs_add_inode_defrag(struct inode *inode, return -EEXIST; } } - set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); + set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags); rb_link_node(&defrag->rb_node, parent, p); rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes); return 0; @@ -145,10 +145,10 @@ static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info) * enabled */ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct inode *inode) + struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_root *root = inode->root; struct inode_defrag *defrag; u64 transid; int ret; @@ -156,24 +156,24 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, if (!__need_auto_defrag(fs_info)) return 0; - if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) + if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) return 0; if (trans) transid = trans->transid; else - transid = BTRFS_I(inode)->root->last_trans; + transid = inode->root->last_trans; defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); if (!defrag) return -ENOMEM; - defrag->ino = btrfs_ino(BTRFS_I(inode)); + defrag->ino = btrfs_ino(inode); defrag->transid = transid; defrag->root = root->root_key.objectid; spin_lock(&fs_info->defrag_inodes_lock); - if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) { + if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) { /* * If we set IN_DEFRAG flag and evict the inode from memory, * and then re-read this inode, this new inode doesn't have @@ -194,10 +194,10 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, * the same inode in the tree, we will merge them together (by * __btrfs_add_inode_defrag()) and free the one that we want to requeue. */ -static void btrfs_requeue_inode_defrag(struct inode *inode, +static void btrfs_requeue_inode_defrag(struct btrfs_inode *inode, struct inode_defrag *defrag) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); int ret; if (!__need_auto_defrag(fs_info)) @@ -334,7 +334,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, */ if (num_defrag == BTRFS_DEFRAG_BATCH) { defrag->last_offset = range.start; - btrfs_requeue_inode_defrag(inode, defrag); + btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag); } else if (defrag->last_offset && !defrag->cycled) { /* * we didn't fill our defrag batch, but @@ -343,7 +343,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, */ defrag->last_offset = 0; defrag->cycled = 1; - btrfs_requeue_inode_defrag(inode, defrag); + btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag); } else { kmem_cache_free(btrfs_inode_defrag_cachep, defrag); } @@ -529,13 +529,13 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, * this drops all the extents in the cache that intersect the range * [start, end]. Existing extents are split as required. 
*/ -void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, +void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end, int skip_pinned) { struct extent_map *em; struct extent_map *split = NULL; struct extent_map *split2 = NULL; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map_tree *em_tree = &inode->extent_tree; u64 len = end - start + 1; u64 gen; int ret; @@ -720,7 +720,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, int leafs_visited = 0; if (drop_cache) - btrfs_drop_extent_cache(inode, start, end - 1, 0); + btrfs_drop_extent_cache(BTRFS_I(inode), start, end - 1, 0); if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent) modify_tree = 0; @@ -1082,10 +1082,10 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot, * two or three. */ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, - struct inode *inode, u64 start, u64 end) + struct btrfs_inode *inode, u64 start, u64 end) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_root *root = inode->root; struct extent_buffer *leaf; struct btrfs_path *path; struct btrfs_file_extent_item *fi; @@ -1102,7 +1102,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, int del_slot = 0; int recow; int ret; - u64 ino = btrfs_ino(BTRFS_I(inode)); + u64 ino = btrfs_ino(inode); path = btrfs_alloc_path(); if (!path) @@ -1415,13 +1415,13 @@ fail: * the other < 0 number - Something wrong happens */ static noinline int -lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages, +lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, u64 *lockstart, u64 *lockend, struct extent_state **cached_state) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); u64 start_pos; u64 last_pos; int i; @@ -1432,30 +1432,30 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages, + round_up(pos + write_bytes - start_pos, fs_info->sectorsize) - 1; - if (start_pos < inode->i_size) { + if (start_pos < inode->vfs_inode.i_size) { struct btrfs_ordered_extent *ordered; - lock_extent_bits(&BTRFS_I(inode)->io_tree, - start_pos, last_pos, cached_state); + lock_extent_bits(&inode->io_tree, start_pos, last_pos, + cached_state); ordered = btrfs_lookup_ordered_range(inode, start_pos, last_pos - start_pos + 1); if (ordered && ordered->file_offset + ordered->len > start_pos && ordered->file_offset <= last_pos) { - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - start_pos, last_pos, - cached_state, GFP_NOFS); + unlock_extent_cached(&inode->io_tree, start_pos, + last_pos, cached_state, GFP_NOFS); for (i = 0; i < num_pages; i++) { unlock_page(pages[i]); put_page(pages[i]); } - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(&inode->vfs_inode, + ordered, 1); btrfs_put_ordered_extent(ordered); return -EAGAIN; } if (ordered) btrfs_put_ordered_extent(ordered); - clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, + clear_extent_bit(&inode->io_tree, start_pos, last_pos, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, cached_state, GFP_NOFS); @@ -1474,11 +1474,11 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages, return ret; } -static noinline int check_can_nocow(struct 
inode *inode, loff_t pos, +static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos, size_t *write_bytes) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_root *root = inode->root; struct btrfs_ordered_extent *ordered; u64 lockstart, lockend; u64 num_bytes; @@ -1493,19 +1493,20 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos, fs_info->sectorsize) - 1; while (1) { - lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); + lock_extent(&inode->io_tree, lockstart, lockend); ordered = btrfs_lookup_ordered_range(inode, lockstart, lockend - lockstart + 1); if (!ordered) { break; } - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); - btrfs_start_ordered_extent(inode, ordered, 1); + unlock_extent(&inode->io_tree, lockstart, lockend); + btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1); btrfs_put_ordered_extent(ordered); } num_bytes = lockend - lockstart + 1; - ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL); + ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, + NULL, NULL, NULL); if (ret <= 0) { ret = 0; btrfs_end_write_no_snapshoting(root); @@ -1514,7 +1515,7 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos, num_bytes - pos + lockstart); } - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); + unlock_extent(&inode->io_tree, lockstart, lockend); return ret; } @@ -1579,7 +1580,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, if (ret < 0) { if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) && - check_can_nocow(inode, pos, &write_bytes) > 0) { + check_can_nocow(BTRFS_I(inode), pos, + &write_bytes) > 0) { /* * For nodata cow case, no need to reserve * data space. 
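Most of the btrfs conversions in these hunks follow one pattern: helpers that only touch btrfs-private state now take a struct btrfs_inode * directly, callers that hold a generic VFS struct inode * wrap it with BTRFS_I(), and the helpers reach back to generic VFS fields through the embedded vfs_inode member. A minimal standalone sketch of that embedding pattern, using simplified stand-in structures rather than the kernel's real definitions, looks like this:

#include <stddef.h>
#include <stdio.h>

/* Simplified stand-ins for the kernel structures (illustrative only). */
struct inode {
	unsigned long i_size;
};

struct btrfs_inode {
	unsigned long disk_i_size;
	struct inode vfs_inode;		/* VFS inode embedded in the fs-private one */
};

/* container_of(): recover the enclosing structure from a member pointer. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Mirrors the role of the kernel's BTRFS_I(): VFS inode -> btrfs inode. */
static struct btrfs_inode *BTRFS_I(struct inode *inode)
{
	return container_of(inode, struct btrfs_inode, vfs_inode);
}

/*
 * A helper in the new style: it takes the btrfs inode directly and uses
 * &inode->vfs_inode (here inode->vfs_inode.i_size) only where generic
 * VFS data is needed.
 */
static void print_sizes(struct btrfs_inode *inode)
{
	printf("i_size=%lu disk_i_size=%lu\n",
	       inode->vfs_inode.i_size, inode->disk_i_size);
}

int main(void)
{
	struct btrfs_inode bi = {
		.disk_i_size = 4096,
		.vfs_inode = { .i_size = 8192 },
	};

	/* Call sites that only have the VFS inode wrap it with BTRFS_I(). */
	print_sizes(BTRFS_I(&bi.vfs_inode));
	return 0;
}

Because the VFS inode is embedded, no allocation or lookup is needed in either direction: BTRFS_I() is pure pointer arithmetic, and going back is just taking the address of the vfs_inode member, which is exactly what the &inode->vfs_inode expressions in the surrounding hunks do.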
@@ -1599,7 +1601,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, } } - ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes); + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), + reserve_bytes); if (ret) { if (!only_release_metadata) btrfs_free_reserved_data_space(inode, pos, @@ -1623,9 +1626,9 @@ again: if (ret) break; - ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages, - pos, write_bytes, &lockstart, - &lockend, &cached_state); + ret = lock_and_cleanup_extent_if_need(BTRFS_I(inode), pages, + num_pages, pos, write_bytes, &lockstart, + &lockend, &cached_state); if (ret < 0) { if (ret == -EAGAIN) goto again; @@ -1677,7 +1680,7 @@ again: spin_unlock(&BTRFS_I(inode)->lock); } if (only_release_metadata) { - btrfs_delalloc_release_metadata(inode, + btrfs_delalloc_release_metadata(BTRFS_I(inode), release_bytes); } else { u64 __pos; @@ -1738,7 +1741,8 @@ again: if (release_bytes) { if (only_release_metadata) { btrfs_end_write_no_snapshoting(root); - btrfs_delalloc_release_metadata(inode, release_bytes); + btrfs_delalloc_release_metadata(BTRFS_I(inode), + release_bytes); } else { btrfs_delalloc_release_space(inode, round_down(pos, fs_info->sectorsize), @@ -2193,7 +2197,7 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) return 0; } -static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf, +static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf, int slot, u64 start, u64 end) { struct btrfs_file_extent_item *fi; @@ -2203,7 +2207,7 @@ static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf, return 0; btrfs_item_key_to_cpu(leaf, &key, slot); - if (key.objectid != btrfs_ino(BTRFS_I(inode)) || + if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) return 0; @@ -2222,22 +2226,23 @@ static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf, return 0; } -static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, - struct btrfs_path *path, u64 offset, u64 end) +static int fill_holes(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, u64 offset, u64 end) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_root *root = inode->root; struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; struct extent_map *hole_em; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map_tree *em_tree = &inode->extent_tree; struct btrfs_key key; int ret; if (btrfs_fs_incompat(fs_info, NO_HOLES)) goto out; - key.objectid = btrfs_ino(BTRFS_I(inode)); + key.objectid = btrfs_ino(inode); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = offset; @@ -2253,7 +2258,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, } leaf = path->nodes[0]; - if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) { + if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) { u64 num_bytes; path->slots[0]--; @@ -2285,7 +2290,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, } btrfs_release_path(path); - ret = btrfs_insert_file_extent(trans, root, btrfs_ino(BTRFS_I(inode)), + ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset, 0, 0, end - offset, 0, end - offset, 0, 0, 0); if (ret) return ret; @@ -2296,8 +2301,7 @@ out: hole_em = alloc_extent_map(); if (!hole_em) 
{ btrfs_drop_extent_cache(inode, offset, end - 1, 0); - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); } else { hole_em->start = offset; hole_em->len = end - offset; @@ -2320,7 +2324,7 @@ out: free_extent_map(hole_em); if (ret) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + &inode->runtime_flags); } return 0; @@ -2337,7 +2341,7 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len) struct extent_map *em; int ret = 0; - em = btrfs_get_extent(inode, NULL, 0, *start, *len, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, *start, *len, 0); if (IS_ERR_OR_NULL(em)) { if (!em) ret = -ENOMEM; @@ -2550,8 +2554,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) trans->block_rsv = &fs_info->trans_block_rsv; if (cur_offset < drop_end && cur_offset < ino_size) { - ret = fill_holes(trans, inode, path, cur_offset, - drop_end); + ret = fill_holes(trans, BTRFS_I(inode), path, + cur_offset, drop_end); if (ret) { /* * If we failed then we didn't insert our hole @@ -2622,7 +2626,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) * cur_offset == drop_end). */ if (cur_offset < ino_size && cur_offset < drop_end) { - ret = fill_holes(trans, inode, path, cur_offset, drop_end); + ret = fill_holes(trans, BTRFS_I(inode), path, + cur_offset, drop_end); if (ret) { /* Same comment as above. */ btrfs_abort_transaction(trans, ret); @@ -2747,7 +2752,8 @@ static long btrfs_fallocate(struct file *file, int mode, * * For qgroup space, it will be checked later. */ - ret = btrfs_alloc_data_chunk_ondemand(inode, alloc_end - alloc_start); + ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), + alloc_end - alloc_start); if (ret < 0) return ret; @@ -2827,7 +2833,7 @@ static long btrfs_fallocate(struct file *file, int mode, /* First, check if we exceed the qgroup limit */ INIT_LIST_HEAD(&reserve_list); while (1) { - em = btrfs_get_extent(inode, NULL, 0, cur_offset, + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, alloc_end - cur_offset, 0); if (IS_ERR_OR_NULL(em)) { if (!em) @@ -2954,7 +2960,8 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) &cached_state); while (start < inode->i_size) { - em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0); + em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0, + start, len, 0); if (IS_ERR(em)) { ret = PTR_ERR(em); em = NULL; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 1a131f7..da6841e 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -18,6 +18,7 @@ #include <linux/pagemap.h> #include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/math64.h> #include <linux/ratelimit.h> @@ -260,7 +261,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, btrfs_free_path(path); } - btrfs_i_size_write(inode, 0); + btrfs_i_size_write(BTRFS_I(inode), 0); truncate_pagecache(inode, 0); /* @@ -3545,7 +3546,8 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, if (ret) { if (release_metadata) - btrfs_delalloc_release_metadata(inode, inode->i_size); + btrfs_delalloc_release_metadata(BTRFS_I(inode), + inode->i_size); #ifdef DEBUG btrfs_err(fs_info, "failed to write free ino cache for root %llu", diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 3bbb8f0..5c6c20e 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -499,7 
+499,7 @@ again: ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, prealloc, prealloc, &alloc_hint); if (ret) { - btrfs_delalloc_release_metadata(inode, prealloc); + btrfs_delalloc_release_metadata(BTRFS_I(inode), prealloc); goto out_put; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f02823f..c40060c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -316,8 +316,8 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, } set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); - btrfs_delalloc_release_metadata(inode, end + 1 - start); - btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); + btrfs_delalloc_release_metadata(BTRFS_I(inode), end + 1 - start); + btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0); out: /* * Don't forget to free the reserved space, as for inlined extent @@ -389,12 +389,12 @@ static inline int inode_need_compress(struct inode *inode) return 0; } -static inline void inode_should_defrag(struct inode *inode, +static inline void inode_should_defrag(struct btrfs_inode *inode, u64 start, u64 end, u64 num_bytes, u64 small_write) { /* If this is a small write inside eof, kick off a defrag */ if (num_bytes < small_write && - (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) + (start > 0 || end + 1 < inode->disk_i_size)) btrfs_add_inode_defrag(NULL, inode); } @@ -430,23 +430,23 @@ static noinline void compress_file_range(struct inode *inode, int ret = 0; struct page **pages = NULL; unsigned long nr_pages; - unsigned long nr_pages_ret = 0; unsigned long total_compressed = 0; unsigned long total_in = 0; - unsigned long max_compressed = SZ_128K; - unsigned long max_uncompressed = SZ_128K; int i; int will_compress; int compress_type = fs_info->compress_type; int redirty = 0; - inode_should_defrag(inode, start, end, end - start + 1, SZ_16K); + inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1, + SZ_16K); actual_end = min_t(u64, isize, end + 1); again: will_compress = 0; nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; - nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_SIZE); + BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0); + nr_pages = min_t(unsigned long, nr_pages, + BTRFS_MAX_COMPRESSED / PAGE_SIZE); /* * we don't want to send crud past the end of i_size through @@ -471,17 +471,8 @@ again: (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) goto cleanup_and_bail_uncompressed; - /* we want to make sure that amount of ram required to uncompress - * an extent is reasonable, so we limit the total size in ram - * of a compressed extent to 128k. This is a crucial number - * because it also controls how easily we can spread reads across - * cpus for decompression. - * - * We also want to make sure the amount of IO required to do - * a random read is reasonably small, so we limit the size of - * a compressed extent to 128k. 
- */ - total_compressed = min(total_compressed, max_uncompressed); + total_compressed = min_t(unsigned long, total_compressed, + BTRFS_MAX_UNCOMPRESSED); num_bytes = ALIGN(end - start + 1, blocksize); num_bytes = max(blocksize, num_bytes); total_in = 0; @@ -516,16 +507,15 @@ again: redirty = 1; ret = btrfs_compress_pages(compress_type, inode->i_mapping, start, - total_compressed, pages, - nr_pages, &nr_pages_ret, + pages, + &nr_pages, &total_in, - &total_compressed, - max_compressed); + &total_compressed); if (!ret) { unsigned long offset = total_compressed & (PAGE_SIZE - 1); - struct page *page = pages[nr_pages_ret - 1]; + struct page *page = pages[nr_pages - 1]; char *kaddr; /* zero the tail end of the last page, we might be @@ -606,7 +596,7 @@ cont: * will submit them to the elevator. */ add_async_extent(async_cow, start, num_bytes, - total_compressed, pages, nr_pages_ret, + total_compressed, pages, nr_pages, compress_type); if (start + num_bytes < end) { @@ -623,14 +613,14 @@ cont: * the compression code ran but failed to make things smaller, * free any pages it allocated and our page pointer array */ - for (i = 0; i < nr_pages_ret; i++) { + for (i = 0; i < nr_pages; i++) { WARN_ON(pages[i]->mapping); put_page(pages[i]); } kfree(pages); pages = NULL; total_compressed = 0; - nr_pages_ret = 0; + nr_pages = 0; /* flag the file so we don't compress in the future */ if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && @@ -659,7 +649,7 @@ cleanup_and_bail_uncompressed: return; free_pages_out: - for (i = 0; i < nr_pages_ret; i++) { + for (i = 0; i < nr_pages; i++) { WARN_ON(pages[i]->mapping); put_page(pages[i]); } @@ -806,7 +796,8 @@ retry: BTRFS_ORDERED_COMPRESSED, async_extent->compress_type); if (ret) { - btrfs_drop_extent_cache(inode, async_extent->start, + btrfs_drop_extent_cache(BTRFS_I(inode), + async_extent->start, async_extent->start + async_extent->ram_size - 1, 0); goto out_free_reserve; @@ -933,7 +924,7 @@ static noinline int cow_file_range(struct inode *inode, struct extent_map *em; int ret = 0; - if (btrfs_is_free_space_inode(inode)) { + if (btrfs_is_free_space_inode(BTRFS_I(inode))) { WARN_ON_ONCE(1); ret = -EINVAL; goto out_unlock; @@ -943,7 +934,7 @@ static noinline int cow_file_range(struct inode *inode, num_bytes = max(blocksize, num_bytes); disk_num_bytes = num_bytes; - inode_should_defrag(inode, start, end, num_bytes, SZ_64K); + inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K); if (start == 0) { /* lets try to make an inline extent */ @@ -971,7 +962,8 @@ static noinline int cow_file_range(struct inode *inode, btrfs_super_total_bytes(fs_info->super_copy)); alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); - btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); + btrfs_drop_extent_cache(BTRFS_I(inode), start, + start + num_bytes - 1, 0); while (disk_num_bytes > 0) { unsigned long op; @@ -1039,7 +1031,7 @@ out: return ret; out_drop_extent_cache: - btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); + btrfs_drop_extent_cache(BTRFS_I(inode), start, start + ram_size - 1, 0); out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); @@ -1231,7 +1223,7 @@ static noinline int run_delalloc_nocow(struct inode *inode, return -ENOMEM; } - nolock = btrfs_is_free_space_inode(inode); + nolock = btrfs_is_free_space_inode(BTRFS_I(inode)); cow_start = (u64)-1; cur_offset = start; @@ -1331,10 +1323,16 @@ next_slot: * either valid or do not exist. 
*/ if (csum_exist_in_range(fs_info, disk_bytenr, - num_bytes)) + num_bytes)) { + if (!nolock) + btrfs_end_write_no_snapshoting(root); goto out_check; - if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) + } + if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) { + if (!nolock) + btrfs_end_write_no_snapshoting(root); goto out_check; + } nocow = 1; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { extent_end = found_key.offset + @@ -1629,15 +1627,15 @@ static void btrfs_add_delalloc_inodes(struct btrfs_root *root, } static void btrfs_del_delalloc_inode(struct btrfs_root *root, - struct inode *inode) + struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); spin_lock(&root->delalloc_lock); - if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) { - list_del_init(&BTRFS_I(inode)->delalloc_inodes); + if (!list_empty(&inode->delalloc_inodes)) { + list_del_init(&inode->delalloc_inodes); clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &BTRFS_I(inode)->runtime_flags); + &inode->runtime_flags); root->nr_delalloc_inodes--; if (!root->nr_delalloc_inodes) { spin_lock(&fs_info->delalloc_root_lock); @@ -1670,7 +1668,7 @@ static void btrfs_set_bit_hook(struct inode *inode, if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 len = state->end + 1 - state->start; - bool do_list = !btrfs_is_free_space_inode(inode); + bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode)); if (*bits & EXTENT_FIRST_DELALLOC) { *bits &= ~EXTENT_FIRST_DELALLOC; @@ -1700,18 +1698,18 @@ static void btrfs_set_bit_hook(struct inode *inode, /* * extent_io.c clear_bit_hook, see set_bit_hook for why */ -static void btrfs_clear_bit_hook(struct inode *inode, +static void btrfs_clear_bit_hook(struct btrfs_inode *inode, struct extent_state *state, unsigned *bits) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(len); - spin_lock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) - BTRFS_I(inode)->defrag_bytes -= len; - spin_unlock(&BTRFS_I(inode)->lock); + inode->defrag_bytes -= len; + spin_unlock(&inode->lock); /* * set_bit and clear bit hooks normally require _irqsave/restore @@ -1719,15 +1717,15 @@ static void btrfs_clear_bit_hook(struct inode *inode, * bit, which is only set or cleared with irqs on */ if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; bool do_list = !btrfs_is_free_space_inode(inode); if (*bits & EXTENT_FIRST_DELALLOC) { *bits &= ~EXTENT_FIRST_DELALLOC; } else if (!(*bits & EXTENT_DO_ACCOUNTING)) { - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents -= num_extents; - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + inode->outstanding_extents -= num_extents; + spin_unlock(&inode->lock); } /* @@ -1747,18 +1745,19 @@ static void btrfs_clear_bit_hook(struct inode *inode, && do_list && !(state->state & EXTENT_NORESERVE) && (*bits & (EXTENT_DO_ACCOUNTING | EXTENT_CLEAR_DATA_RESV))) - btrfs_free_reserved_data_space_noquota(inode, + btrfs_free_reserved_data_space_noquota( + &inode->vfs_inode, state->start, len); __percpu_counter_add(&fs_info->delalloc_bytes, -len, fs_info->delalloc_batch); - 
spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->delalloc_bytes -= len; - if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && + spin_lock(&inode->lock); + inode->delalloc_bytes -= len; + if (do_list && inode->delalloc_bytes == 0 && test_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &BTRFS_I(inode)->runtime_flags)) + &inode->runtime_flags)) btrfs_del_delalloc_inode(root, inode); - spin_unlock(&BTRFS_I(inode)->lock); + spin_unlock(&inode->lock); } } @@ -1854,7 +1853,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - if (btrfs_is_free_space_inode(inode)) + if (btrfs_is_free_space_inode(BTRFS_I(inode))) metadata = BTRFS_WQ_ENDIO_FREE_SPACE; if (bio_op(bio) != REQ_OP_WRITE) { @@ -1963,7 +1962,7 @@ again: if (PagePrivate2(page)) goto out; - ordered = btrfs_lookup_ordered_range(inode, page_start, + ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE); if (ordered) { unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, @@ -2793,16 +2792,17 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) bool nolock; bool truncated = false; - nolock = btrfs_is_free_space_inode(inode); + nolock = btrfs_is_free_space_inode(BTRFS_I(inode)); if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { ret = -EIO; goto out; } - btrfs_free_io_failure_record(inode, ordered_extent->file_offset, - ordered_extent->file_offset + - ordered_extent->len - 1); + btrfs_free_io_failure_record(BTRFS_I(inode), + ordered_extent->file_offset, + ordered_extent->file_offset + + ordered_extent->len - 1); if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { truncated = true; @@ -2873,7 +2873,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) compress_type = ordered_extent->compress_type; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { BUG_ON(compress_type); - ret = btrfs_mark_extent_written(trans, inode, + ret = btrfs_mark_extent_written(trans, BTRFS_I(inode), ordered_extent->file_offset, ordered_extent->file_offset + logical_len); @@ -2914,7 +2914,8 @@ out_unlock: ordered_extent->len - 1, &cached_state, GFP_NOFS); out: if (root != fs_info->tree_root) - btrfs_delalloc_release_metadata(inode, ordered_extent->len); + btrfs_delalloc_release_metadata(BTRFS_I(inode), + ordered_extent->len); if (trans) btrfs_end_transaction(trans); @@ -2929,7 +2930,7 @@ out: clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS); /* Drop the cache for the part of the extent we didn't write. 
*/ - btrfs_drop_extent_cache(inode, start, end, 0); + btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0); /* * If the ordered extent had an IOERR or something else went @@ -2977,7 +2978,7 @@ static void finish_ordered_fn(struct btrfs_work *work) btrfs_finish_ordered_io(ordered_extent); } -static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, +static void btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate) { struct inode *inode = page->mapping->host; @@ -2991,9 +2992,9 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, ClearPagePrivate2(page); if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, end - start + 1, uptodate)) - return 0; + return; - if (btrfs_is_free_space_inode(inode)) { + if (btrfs_is_free_space_inode(BTRFS_I(inode))) { wq = fs_info->endio_freespace_worker; func = btrfs_freespace_write_helper; } else { @@ -3004,8 +3005,6 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL, NULL); btrfs_queue_work(wq, &ordered_extent->work); - - return 0; } static int __readpage_endio_check(struct inode *inode, @@ -3028,7 +3027,7 @@ static int __readpage_endio_check(struct inode *inode, kunmap_atomic(kaddr); return 0; zeroit: - btrfs_print_data_csum_error(inode, start, csum, csum_expected, + btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected, io_bio->mirror_num); memset(kaddr + pgoff, 1, len); flush_dcache_page(page); @@ -3167,10 +3166,11 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, * NOTE: caller of this function should reserve 5 units of metadata for * this function. */ -int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) +int btrfs_orphan_add(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_root *root = inode->root; struct btrfs_block_rsv *block_rsv = NULL; int reserve = 0; int insert = 0; @@ -3192,7 +3192,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) } if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, - &BTRFS_I(inode)->runtime_flags)) { + &inode->runtime_flags)) { #if 0 /* * For proper ENOSPC handling, we should do orphan @@ -3209,7 +3209,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) } if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, - &BTRFS_I(inode)->runtime_flags)) + &inode->runtime_flags)) reserve = 1; spin_unlock(&root->orphan_lock); @@ -3220,28 +3220,27 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) if (ret) { atomic_dec(&root->orphan_inodes); clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, - &BTRFS_I(inode)->runtime_flags); + &inode->runtime_flags); if (insert) clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, - &BTRFS_I(inode)->runtime_flags); + &inode->runtime_flags); return ret; } } /* insert an orphan item to track this unlinked/truncated file */ if (insert >= 1) { - ret = btrfs_insert_orphan_item(trans, root, - btrfs_ino(BTRFS_I(inode))); + ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); if (ret) { atomic_dec(&root->orphan_inodes); if (reserve) { clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, - &BTRFS_I(inode)->runtime_flags); + &inode->runtime_flags); 
btrfs_orphan_release_metadata(inode); } if (ret != -EEXIST) { clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, - &BTRFS_I(inode)->runtime_flags); + &inode->runtime_flags); btrfs_abort_transaction(trans, ret); return ret; } @@ -3266,20 +3265,20 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) * item for this particular inode. */ static int btrfs_orphan_del(struct btrfs_trans_handle *trans, - struct inode *inode) + struct btrfs_inode *inode) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; int delete_item = 0; int release_rsv = 0; int ret = 0; spin_lock(&root->orphan_lock); if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, - &BTRFS_I(inode)->runtime_flags)) + &inode->runtime_flags)) delete_item = 1; if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, - &BTRFS_I(inode)->runtime_flags)) + &inode->runtime_flags)) release_rsv = 1; spin_unlock(&root->orphan_lock); @@ -3287,7 +3286,7 @@ static int btrfs_orphan_del(struct btrfs_trans_handle *trans, atomic_dec(&root->orphan_inodes); if (trans) ret = btrfs_del_orphan_item(trans, root, - btrfs_ino(BTRFS_I(inode))); + btrfs_ino(inode)); } if (release_rsv) @@ -3453,7 +3452,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) ret = PTR_ERR(trans); goto out; } - ret = btrfs_orphan_add(trans, inode); + ret = btrfs_orphan_add(trans, BTRFS_I(inode)); btrfs_end_transaction(trans); if (ret) { iput(inode); @@ -3462,7 +3461,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) ret = btrfs_truncate(inode); if (ret) - btrfs_orphan_del(NULL, inode); + btrfs_orphan_del(NULL, BTRFS_I(inode)); } else { nr_unlink++; } @@ -3617,7 +3616,7 @@ static int btrfs_read_locked_inode(struct inode *inode) set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); i_uid_write(inode, btrfs_inode_uid(leaf, inode_item)); i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); - btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); + btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item)); inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime); inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime); @@ -3865,7 +3864,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, * The data relocation inode should also be directly updated * without delay */ - if (!btrfs_is_free_space_inode(inode) + if (!btrfs_is_free_space_inode(BTRFS_I(inode)) && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { btrfs_update_root_times(trans, root); @@ -3988,8 +3987,7 @@ err: if (ret) goto out; - btrfs_i_size_write(&dir->vfs_inode, - dir->vfs_inode.i_size - name_len * 2); + btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2); inode_inc_iversion(&inode->vfs_inode); inode_inc_iversion(&dir->vfs_inode); inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime = @@ -4056,7 +4054,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) goto out; if (inode->i_nlink == 0) { - ret = btrfs_orphan_add(trans, inode); + ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) goto out; } @@ -4137,7 +4135,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, goto out; } - btrfs_i_size_write(dir, dir->i_size - name_len * 2); + btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2); inode_inc_iversion(dir); dir->i_mtime = dir->i_ctime = current_time(dir); ret = btrfs_update_inode_fallback(trans, root, dir); @@ -4173,7 +4171,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) 
goto out; } - err = btrfs_orphan_add(trans, inode); + err = btrfs_orphan_add(trans, BTRFS_I(inode)); if (err) goto out; @@ -4184,7 +4182,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) BTRFS_I(d_inode(dentry)), dentry->d_name.name, dentry->d_name.len); if (!err) { - btrfs_i_size_write(inode, 0); + btrfs_i_size_write(BTRFS_I(inode), 0); /* * Propagate the last_unlink_trans value of the deleted dir to * its parent directory. This is to prevent an unrecoverable @@ -4320,7 +4318,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, * for non-free space inodes and ref cows, we want to back off from * time to time */ - if (!btrfs_is_free_space_inode(inode) && + if (!btrfs_is_free_space_inode(BTRFS_I(inode)) && test_bit(BTRFS_ROOT_REF_COWS, &root->state)) be_nice = 1; @@ -4336,7 +4334,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, */ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || root == fs_info->tree_root) - btrfs_drop_extent_cache(inode, ALIGN(new_size, + btrfs_drop_extent_cache(BTRFS_I(inode), ALIGN(new_size, fs_info->sectorsize), (u64)-1, 0); @@ -4412,19 +4410,8 @@ search_again: if (found_type > min_type) { del_item = 1; } else { - if (item_end < new_size) { - /* - * With NO_HOLES mode, for the following mapping - * - * [0-4k][hole][8k-12k] - * - * if truncating isize down to 6k, it ends up - * isize being 8k. - */ - if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) - last_size = new_size; + if (item_end < new_size) break; - } if (found_key.offset >= new_size) del_item = 1; else @@ -4607,8 +4594,12 @@ out: btrfs_abort_transaction(trans, ret); } error: - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + ASSERT(last_size >= new_size); + if (!err && last_size > new_size) + last_size = new_size; btrfs_ordered_update_i_size(inode, last_size, NULL); + } btrfs_free_path(path); @@ -4835,7 +4826,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) lock_extent_bits(io_tree, hole_start, block_end - 1, &cached_state); - ordered = btrfs_lookup_ordered_range(inode, hole_start, + ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), hole_start, block_end - hole_start); if (!ordered) break; @@ -4847,7 +4838,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) cur_offset = hole_start; while (1) { - em = btrfs_get_extent(inode, NULL, 0, cur_offset, + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, block_end - cur_offset, 0); if (IS_ERR(em)) { err = PTR_ERR(em); @@ -4864,7 +4855,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) hole_size); if (err) break; - btrfs_drop_extent_cache(inode, cur_offset, + btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset, cur_offset + hole_size - 1, 0); hole_em = alloc_extent_map(); if (!hole_em) { @@ -4890,7 +4881,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) write_unlock(&em_tree->lock); if (err != -EEXIST) break; - btrfs_drop_extent_cache(inode, cur_offset, + btrfs_drop_extent_cache(BTRFS_I(inode), + cur_offset, cur_offset + hole_size - 1, 0); } @@ -4987,7 +4979,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * so we need to guarantee from this point on that everything * will be consistent. 
*/ - ret = btrfs_orphan_add(trans, inode); + ret = btrfs_orphan_add(trans, BTRFS_I(inode)); btrfs_end_transaction(trans); if (ret) return ret; @@ -4996,9 +4988,9 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) truncate_setsize(inode, newsize); /* Disable nonlocked read DIO to avoid the end less truncate */ - btrfs_inode_block_unlocked_dio(inode); + btrfs_inode_block_unlocked_dio(BTRFS_I(inode)); inode_dio_wait(inode); - btrfs_inode_resume_unlocked_dio(inode); + btrfs_inode_resume_unlocked_dio(BTRFS_I(inode)); ret = btrfs_truncate(inode); if (ret && inode->i_nlink) { @@ -5007,7 +4999,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) /* To get a stable disk_i_size */ err = btrfs_wait_ordered_range(inode, 0, (u64)-1); if (err) { - btrfs_orphan_del(NULL, inode); + btrfs_orphan_del(NULL, BTRFS_I(inode)); return err; } @@ -5019,11 +5011,11 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) */ trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { - btrfs_orphan_del(NULL, inode); + btrfs_orphan_del(NULL, BTRFS_I(inode)); return ret; } i_size_write(inode, BTRFS_I(inode)->disk_i_size); - err = btrfs_orphan_del(trans, inode); + err = btrfs_orphan_del(trans, BTRFS_I(inode)); if (err) btrfs_abort_transaction(trans, err); btrfs_end_transaction(trans); @@ -5181,18 +5173,18 @@ void btrfs_evict_inode(struct inode *inode) if (inode->i_nlink && ((btrfs_root_refs(&root->root_item) != 0 && root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) || - btrfs_is_free_space_inode(inode))) + btrfs_is_free_space_inode(BTRFS_I(inode)))) goto no_delete; if (is_bad_inode(inode)) { - btrfs_orphan_del(NULL, inode); + btrfs_orphan_del(NULL, BTRFS_I(inode)); goto no_delete; } /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? 
*/ if (!special_file(inode->i_mode)) btrfs_wait_ordered_range(inode, 0, (u64)-1); - btrfs_free_io_failure_record(inode, 0, (u64)-1); + btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1); if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, @@ -5208,20 +5200,20 @@ void btrfs_evict_inode(struct inode *inode) ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode)); if (ret) { - btrfs_orphan_del(NULL, inode); + btrfs_orphan_del(NULL, BTRFS_I(inode)); goto no_delete; } rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); if (!rsv) { - btrfs_orphan_del(NULL, inode); + btrfs_orphan_del(NULL, BTRFS_I(inode)); goto no_delete; } rsv->size = min_size; rsv->failfast = 1; global_rsv = &fs_info->global_block_rsv; - btrfs_i_size_write(inode, 0); + btrfs_i_size_write(BTRFS_I(inode), 0); /* * This is a bit simpler than btrfs_truncate since we've already @@ -5256,14 +5248,14 @@ void btrfs_evict_inode(struct inode *inode) btrfs_warn(fs_info, "Could not get space for a delete, will truncate on mount %d", ret); - btrfs_orphan_del(NULL, inode); + btrfs_orphan_del(NULL, BTRFS_I(inode)); btrfs_free_block_rsv(fs_info, rsv); goto no_delete; } trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { - btrfs_orphan_del(NULL, inode); + btrfs_orphan_del(NULL, BTRFS_I(inode)); btrfs_free_block_rsv(fs_info, rsv); goto no_delete; } @@ -5289,7 +5281,7 @@ void btrfs_evict_inode(struct inode *inode) if (ret) { ret = btrfs_commit_transaction(trans); if (ret) { - btrfs_orphan_del(NULL, inode); + btrfs_orphan_del(NULL, BTRFS_I(inode)); btrfs_free_block_rsv(fs_info, rsv); goto no_delete; } @@ -5318,9 +5310,9 @@ void btrfs_evict_inode(struct inode *inode) */ if (ret == 0) { trans->block_rsv = root->orphan_block_rsv; - btrfs_orphan_del(trans, inode); + btrfs_orphan_del(trans, BTRFS_I(inode)); } else { - btrfs_orphan_del(NULL, inode); + btrfs_orphan_del(NULL, BTRFS_I(inode)); } trans->block_rsv = &fs_info->trans_block_rsv; @@ -5898,7 +5890,8 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) return 0; - if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode)) + if (btrfs_fs_closing(root->fs_info) && + btrfs_is_free_space_inode(BTRFS_I(inode))) nolock = true; if (wbc->sync_mode == WB_SYNC_ALL) { @@ -5978,15 +5971,15 @@ static int btrfs_update_time(struct inode *inode, struct timespec *now, * and then set the in-memory index_cnt variable to reflect * free sequence numbers */ -static int btrfs_set_inode_index_count(struct inode *inode) +static int btrfs_set_inode_index_count(struct btrfs_inode *inode) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct btrfs_key key, found_key; struct btrfs_path *path; struct extent_buffer *leaf; int ret; - key.objectid = btrfs_ino(BTRFS_I(inode)); + key.objectid = btrfs_ino(inode); key.type = BTRFS_DIR_INDEX_KEY; key.offset = (u64)-1; @@ -6009,7 +6002,7 @@ static int btrfs_set_inode_index_count(struct inode *inode) * else has to start at 2 */ if (path->slots[0] == 0) { - BTRFS_I(inode)->index_cnt = 2; + inode->index_cnt = 2; goto out; } @@ -6018,13 +6011,13 @@ static int btrfs_set_inode_index_count(struct inode *inode) leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if (found_key.objectid != btrfs_ino(BTRFS_I(inode)) || + if (found_key.objectid != btrfs_ino(inode) || found_key.type != BTRFS_DIR_INDEX_KEY) { - BTRFS_I(inode)->index_cnt = 2; + 
inode->index_cnt = 2; goto out; } - BTRFS_I(inode)->index_cnt = found_key.offset + 1; + inode->index_cnt = found_key.offset + 1; out: btrfs_free_path(path); return ret; @@ -6034,12 +6027,12 @@ out: * helper to find a free sequence number in a given directory. This current * code is very simple, later versions will do smarter things in the btree */ -int btrfs_set_inode_index(struct inode *dir, u64 *index) +int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index) { int ret = 0; - if (BTRFS_I(dir)->index_cnt == (u64)-1) { - ret = btrfs_inode_delayed_dir_index_count(BTRFS_I(dir)); + if (dir->index_cnt == (u64)-1) { + ret = btrfs_inode_delayed_dir_index_count(dir); if (ret) { ret = btrfs_set_inode_index_count(dir); if (ret) @@ -6047,8 +6040,8 @@ int btrfs_set_inode_index(struct inode *dir, u64 *index) } } - *index = BTRFS_I(dir)->index_cnt; - BTRFS_I(dir)->index_cnt++; + *index = dir->index_cnt; + dir->index_cnt++; return ret; } @@ -6109,7 +6102,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, if (dir && name) { trace_btrfs_inode_request(dir); - ret = btrfs_set_inode_index(dir, index); + ret = btrfs_set_inode_index(BTRFS_I(dir), index); if (ret) { btrfs_free_path(path); iput(inode); @@ -6244,18 +6237,18 @@ static inline u8 btrfs_inode_type(struct inode *inode) * inode to the parent directory. */ int btrfs_add_link(struct btrfs_trans_handle *trans, - struct inode *parent_inode, struct inode *inode, + struct btrfs_inode *parent_inode, struct btrfs_inode *inode, const char *name, int name_len, int add_backref, u64 index) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); int ret = 0; struct btrfs_key key; - struct btrfs_root *root = BTRFS_I(parent_inode)->root; - u64 ino = btrfs_ino(BTRFS_I(inode)); - u64 parent_ino = btrfs_ino(BTRFS_I(parent_inode)); + struct btrfs_root *root = parent_inode->root; + u64 ino = btrfs_ino(inode); + u64 parent_ino = btrfs_ino(parent_inode); if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { - memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key)); + memcpy(&key, &inode->root->root_key, sizeof(key)); } else { key.objectid = ino; key.type = BTRFS_INODE_ITEM_KEY; @@ -6277,7 +6270,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, ret = btrfs_insert_dir_item(trans, root, name, name_len, parent_inode, &key, - btrfs_inode_type(inode), index); + btrfs_inode_type(&inode->vfs_inode), index); if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; else if (ret) { @@ -6285,12 +6278,12 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, return ret; } - btrfs_i_size_write(parent_inode, parent_inode->i_size + + btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size + name_len * 2); - inode_inc_iversion(parent_inode); - parent_inode->i_mtime = parent_inode->i_ctime = - current_time(parent_inode); - ret = btrfs_update_inode(trans, root, parent_inode); + inode_inc_iversion(&parent_inode->vfs_inode); + parent_inode->vfs_inode.i_mtime = parent_inode->vfs_inode.i_ctime = + current_time(&parent_inode->vfs_inode); + ret = btrfs_update_inode(trans, root, &parent_inode->vfs_inode); if (ret) btrfs_abort_transaction(trans, ret); return ret; @@ -6314,8 +6307,8 @@ fail_dir_item: } static int btrfs_add_nondir(struct btrfs_trans_handle *trans, - struct inode *dir, struct dentry *dentry, - struct inode *inode, int backref, u64 index) + struct btrfs_inode *dir, struct dentry *dentry, + struct btrfs_inode *inode, int backref, u64 index) { int err = 
btrfs_add_link(trans, dir, inode, dentry->d_name.name, dentry->d_name.len, @@ -6371,7 +6364,8 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (err) goto out_unlock_inode; - err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); + err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), + 0, index); if (err) { goto out_unlock_inode; } else { @@ -6448,7 +6442,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, if (err) goto out_unlock_inode; - err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); + err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), + 0, index); if (err) goto out_unlock_inode; @@ -6490,7 +6485,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (inode->i_nlink >= BTRFS_LINK_MAX) return -EMLINK; - err = btrfs_set_inode_index(dir, &index); + err = btrfs_set_inode_index(BTRFS_I(dir), &index); if (err) goto fail; @@ -6514,7 +6509,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, ihold(inode); set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); - err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); + err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), + 1, index); if (err) { drop_inode = 1; @@ -6528,7 +6524,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, * If new hard link count is 1, it's a file created * with open(2) O_TMPFILE flag. */ - err = btrfs_orphan_del(trans, inode); + err = btrfs_orphan_del(trans, BTRFS_I(inode)); if (err) goto fail; } @@ -6589,13 +6585,14 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (err) goto out_fail_inode; - btrfs_i_size_write(inode, 0); + btrfs_i_size_write(BTRFS_I(inode), 0); err = btrfs_update_inode(trans, root, inode); if (err) goto out_fail_inode; - err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, - dentry->d_name.len, 0, index); + err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), + dentry->d_name.name, + dentry->d_name.len, 0, index); if (err) goto out_fail_inode; @@ -6725,25 +6722,26 @@ static noinline int uncompress_inline(struct btrfs_path *path, * This also copies inline extents directly into the page. 
*/ -struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, - size_t pg_offset, u64 start, u64 len, - int create) +struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, + struct page *page, + size_t pg_offset, u64 start, u64 len, + int create) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); int ret; int err = 0; u64 extent_start = 0; u64 extent_end = 0; - u64 objectid = btrfs_ino(BTRFS_I(inode)); + u64 objectid = btrfs_ino(inode); u32 found_type; struct btrfs_path *path = NULL; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct btrfs_file_extent_item *item; struct extent_buffer *leaf; struct btrfs_key found_key; struct extent_map *em = NULL; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_map_tree *em_tree = &inode->extent_tree; + struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_trans_handle *trans = NULL; const bool new_inline = !page || create; @@ -6856,7 +6854,8 @@ next: goto not_found_em; } - btrfs_extent_item_to_extent_map(inode, path, item, new_inline, em); + btrfs_extent_item_to_extent_map(inode, path, item, + new_inline, em); if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { @@ -6992,7 +6991,7 @@ insert: write_unlock(&em_tree->lock); out: - trace_btrfs_get_extent(root, BTRFS_I(inode), em); + trace_btrfs_get_extent(root, inode, em); btrfs_free_path(path); if (trans) { @@ -7008,9 +7007,10 @@ out: return em; } -struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, - size_t pg_offset, u64 start, u64 len, - int create) +struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, + struct page *page, + size_t pg_offset, u64 start, u64 len, + int create) { struct extent_map *em; struct extent_map *hole_em = NULL; @@ -7047,7 +7047,7 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag em = NULL; /* ok, we didn't find anything, lets look for delalloc */ - found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start, + found = count_range_bits(&inode->io_tree, &range_start, end, len, EXTENT_DELALLOC, 1); found_end = range_start + found; if (found_end < range_start) @@ -7162,7 +7162,7 @@ static struct extent_map *btrfs_create_dio_extent(struct inode *inode, if (ret) { if (em) { free_extent_map(em); - btrfs_drop_extent_cache(inode, start, + btrfs_drop_extent_cache(BTRFS_I(inode), start, start + len - 1, 0); } em = ERR_PTR(ret); @@ -7423,7 +7423,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, * doing DIO to, so we need to make sure there's no ordered * extents in this range. 
*/ - ordered = btrfs_lookup_ordered_range(inode, lockstart, + ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart, lockend - lockstart + 1); /* @@ -7529,7 +7529,7 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, } do { - btrfs_drop_extent_cache(inode, em->start, + btrfs_drop_extent_cache(BTRFS_I(inode), em->start, em->start + em->len - 1, 0); write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 1); @@ -7617,7 +7617,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, goto err; } - em = btrfs_get_extent(inode, NULL, 0, start, len, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0); if (IS_ERR(em)) { ret = PTR_ERR(em); goto unlock_err; @@ -7854,7 +7854,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, ret = btrfs_check_dio_repairable(inode, failed_bio, failrec, failed_mirror); if (!ret) { - free_io_failure(inode, failrec); + free_io_failure(BTRFS_I(inode), failrec); return -EIO; } @@ -7868,7 +7868,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, pgoff, isector, repair_endio, repair_arg); if (!bio) { - free_io_failure(inode, failrec); + free_io_failure(BTRFS_I(inode), failrec); return -EIO; } bio_set_op_attrs(bio, REQ_OP_READ, read_mode); @@ -7879,7 +7879,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, ret = submit_dio_repair_bio(inode, bio, failrec->this_mirror); if (ret) { - free_io_failure(inode, failrec); + free_io_failure(BTRFS_I(inode), failrec); bio_put(bio); } @@ -7909,7 +7909,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio) done->uptodate = 1; bio_for_each_segment_all(bvec, bio, i) - clean_io_failure(done->inode, done->start, bvec->bv_page, 0); + clean_io_failure(BTRFS_I(done->inode), done->start, bvec->bv_page, 0); end: complete(&done->done); bio_put(bio); @@ -7995,7 +7995,7 @@ static void btrfs_retry_endio(struct bio *bio) bvec->bv_page, bvec->bv_offset, done->start, bvec->bv_len); if (!ret) - clean_io_failure(done->inode, done->start, + clean_io_failure(BTRFS_I(done->inode), done->start, bvec->bv_page, bvec->bv_offset); else uptodate = 0; @@ -8796,7 +8796,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, lock_extent_bits(tree, page_start, page_end, &cached_state); again: start = page_start; - ordered = btrfs_lookup_ordered_range(inode, start, + ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, page_end - start + 1); if (ordered) { end = min(page_end, ordered->file_offset + ordered->len - 1); @@ -8962,7 +8962,8 @@ again: * we can't set the delalloc bits if there are pending ordered * extents. 
Drop our locks and wait for them to finish */ - ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); + ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, + PAGE_SIZE); if (ordered) { unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); @@ -9160,7 +9161,7 @@ static int btrfs_truncate(struct inode *inode) if (ret == 0 && inode->i_nlink > 0) { trans->block_rsv = root->orphan_block_rsv; - ret = btrfs_orphan_del(trans, inode); + ret = btrfs_orphan_del(trans, BTRFS_I(inode)); if (ret) err = ret; } @@ -9205,7 +9206,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, inode->i_fop = &btrfs_dir_file_operations; set_nlink(inode, 1); - btrfs_i_size_write(inode, 0); + btrfs_i_size_write(BTRFS_I(inode), 0); unlock_new_inode(inode); err = btrfs_subvol_inherit_props(trans, new_root, parent_root); @@ -9278,7 +9279,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_destroy_inode(struct inode *inode) { - btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); + btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } #endif @@ -9333,7 +9334,7 @@ void btrfs_destroy_inode(struct inode *inode) } btrfs_qgroup_check_reserved_leak(inode); inode_tree_del(inode); - btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); + btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); free: call_rcu(&inode->i_rcu, btrfs_i_callback); } @@ -9412,11 +9413,11 @@ fail: return -ENOMEM; } -static int btrfs_getattr(struct vfsmount *mnt, - struct dentry *dentry, struct kstat *stat) +static int btrfs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { u64 delalloc_bytes; - struct inode *inode = d_inode(dentry); + struct inode *inode = d_inode(path->dentry); u32 blocksize = inode->i_sb->s_blocksize; generic_fillattr(inode, stat); @@ -9480,10 +9481,10 @@ static int btrfs_rename_exchange(struct inode *old_dir, * We need to find a free sequence number both in the source and * in the destination directory for the exchange. 
*/ - ret = btrfs_set_inode_index(new_dir, &old_idx); + ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx); if (ret) goto out_fail; - ret = btrfs_set_inode_index(old_dir, &new_idx); + ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx); if (ret) goto out_fail; @@ -9581,7 +9582,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, goto out_fail; } - ret = btrfs_add_link(trans, new_dir, old_inode, + ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), new_dentry->d_name.name, new_dentry->d_name.len, 0, old_idx); if (ret) { @@ -9589,7 +9590,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, goto out_fail; } - ret = btrfs_add_link(trans, old_dir, new_inode, + ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode), old_dentry->d_name.name, old_dentry->d_name.len, 0, new_idx); if (ret) { @@ -9691,8 +9692,8 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = btrfs_add_nondir(trans, dir, dentry, - inode, 0, index); + ret = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, + BTRFS_I(inode), 0, index); if (ret) goto out; @@ -9791,7 +9792,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (dest != root) btrfs_record_root_in_trans(trans, dest); - ret = btrfs_set_inode_index(new_dir, &index); + ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index); if (ret) goto out_fail; @@ -9858,14 +9859,15 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_dentry->d_name.len); } if (!ret && new_inode->i_nlink == 0) - ret = btrfs_orphan_add(trans, d_inode(new_dentry)); + ret = btrfs_orphan_add(trans, + BTRFS_I(d_inode(new_dentry))); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } } - ret = btrfs_add_link(trans, new_dir, old_inode, + ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), new_dentry->d_name.name, new_dentry->d_name.len, 0, index); if (ret) { @@ -10232,7 +10234,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode_nohighmem(inode); inode->i_mapping->a_ops = &btrfs_symlink_aops; inode_set_bytes(inode, name_len); - btrfs_i_size_write(inode, name_len); + btrfs_i_size_write(BTRFS_I(inode), name_len); err = btrfs_update_inode(trans, root, inode); /* * Last step, add directory indexes for our symlink inode. This is the @@ -10240,7 +10242,8 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, * elsewhere above. 
*/ if (!err) - err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); + err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, + BTRFS_I(inode), 0, index); if (err) { drop_inode = 1; goto out_unlock_inode; @@ -10326,7 +10329,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, break; } - btrfs_drop_extent_cache(inode, cur_offset, + btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset, cur_offset + ins.offset -1, 0); em = alloc_extent_map(); @@ -10353,7 +10356,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, write_unlock(&em_tree->lock); if (ret != -EEXIST) break; - btrfs_drop_extent_cache(inode, cur_offset, + btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset, cur_offset + ins.offset - 1, 0); } @@ -10475,7 +10478,7 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) ret = btrfs_update_inode(trans, root, inode); if (ret) goto out_inode; - ret = btrfs_orphan_add(trans, inode); + ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) goto out_inode; @@ -10505,6 +10508,12 @@ out_inode: } +__attribute__((const)) +static int dummy_readpage_io_failed_hook(struct page *page, int failed_mirror) +{ + return 0; +} + static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, @@ -10543,10 +10552,14 @@ static const struct file_operations btrfs_dir_file_operations = { }; static const struct extent_io_ops btrfs_extent_io_ops = { - .fill_delalloc = run_delalloc_range, + /* mandatory callbacks */ .submit_bio_hook = btrfs_submit_bio_hook, - .merge_bio_hook = btrfs_merge_bio_hook, .readpage_end_io_hook = btrfs_readpage_end_io_hook, + .merge_bio_hook = btrfs_merge_bio_hook, + .readpage_io_failed_hook = dummy_readpage_io_failed_hook, + + /* optional callbacks */ + .fill_delalloc = run_delalloc_range, .writepage_end_io_hook = btrfs_writepage_end_io_hook, .writepage_start_hook = btrfs_writepage_start_hook, .set_bit_hook = btrfs_set_bit_hook, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d853997..dabfc7a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -434,7 +434,7 @@ int btrfs_is_empty_uuid(u8 *uuid) static noinline int create_subvol(struct inode *dir, struct dentry *dentry, - char *name, int namelen, + const char *name, int namelen, u64 *async_transid, struct btrfs_qgroup_inherit *inherit) { @@ -580,21 +580,21 @@ static noinline int create_subvol(struct inode *dir, /* * insert the directory item */ - ret = btrfs_set_inode_index(dir, &index); + ret = btrfs_set_inode_index(BTRFS_I(dir), &index); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } ret = btrfs_insert_dir_item(trans, root, - name, namelen, dir, &key, + name, namelen, BTRFS_I(dir), &key, BTRFS_FT_DIR, index); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } - btrfs_i_size_write(dir, dir->i_size + namelen * 2); + btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2); ret = btrfs_update_inode(trans, root, dir); BUG_ON(ret); @@ -832,7 +832,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) * inside this filesystem so it's quite a bit simpler. 
*/ static noinline int btrfs_mksubvol(const struct path *parent, - char *name, int namelen, + const char *name, int namelen, struct btrfs_root *snap_src, u64 *async_transid, bool readonly, struct btrfs_qgroup_inherit *inherit) @@ -1009,7 +1009,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) /* get the big lock and read metadata off disk */ lock_extent_bits(io_tree, start, end, &cached); - em = btrfs_get_extent(inode, NULL, 0, start, len, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0); unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS); if (IS_ERR(em)) @@ -1625,7 +1625,7 @@ out: } static noinline int btrfs_ioctl_snap_create_transid(struct file *file, - char *name, unsigned long fd, int subvol, + const char *name, unsigned long fd, int subvol, u64 *transid, bool readonly, struct btrfs_qgroup_inherit *inherit) { @@ -3298,7 +3298,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, if (endoff > destoff + olen) endoff = destoff + olen; if (endoff > inode->i_size) - btrfs_i_size_write(inode, endoff); + btrfs_i_size_write(BTRFS_I(inode), endoff); ret = btrfs_update_inode(trans, root, inode); if (ret) { @@ -3311,20 +3311,19 @@ out: return ret; } -static void clone_update_extent_map(struct inode *inode, +static void clone_update_extent_map(struct btrfs_inode *inode, const struct btrfs_trans_handle *trans, const struct btrfs_path *path, const u64 hole_offset, const u64 hole_len) { - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; int ret; em = alloc_extent_map(); if (!em) { - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); return; } @@ -3338,7 +3337,7 @@ static void clone_update_extent_map(struct inode *inode, if (btrfs_file_extent_type(path->nodes[0], fi) == BTRFS_FILE_EXTENT_INLINE) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + &inode->runtime_flags); } else { em->start = hole_offset; em->len = hole_len; @@ -3364,8 +3363,7 @@ static void clone_update_extent_map(struct inode *inode, } if (ret) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); } /* @@ -3791,11 +3789,12 @@ process_slot: /* If we have an implicit hole (NO_HOLES feature). 
*/ if (drop_start < new_key.offset) - clone_update_extent_map(inode, trans, + clone_update_extent_map(BTRFS_I(inode), trans, NULL, drop_start, new_key.offset - drop_start); - clone_update_extent_map(inode, trans, path, 0, 0); + clone_update_extent_map(BTRFS_I(inode), trans, + path, 0, 0); btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); @@ -3845,8 +3844,9 @@ process_slot: btrfs_end_transaction(trans); goto out; } - clone_update_extent_map(inode, trans, NULL, last_dest_end, - destoff + len - last_dest_end); + clone_update_extent_map(BTRFS_I(inode), trans, NULL, + last_dest_end, + destoff + len - last_dest_end); ret = clone_finish_inode_update(trans, inode, destoff + len, destoff, olen, no_time_update); } diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 45d2698..f48c8c1 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -76,7 +76,7 @@ static inline void write_compress_length(char *buf, size_t len) memcpy(buf, &dlen, LZO_LEN); } -static inline size_t read_compress_length(char *buf) +static inline size_t read_compress_length(const char *buf) { __le32 dlen; @@ -86,13 +86,11 @@ static inline size_t read_compress_length(char *buf) static int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, unsigned long len, + u64 start, struct page **pages, - unsigned long nr_dest_pages, unsigned long *out_pages, unsigned long *total_in, - unsigned long *total_out, - unsigned long max_out) + unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0; @@ -102,7 +100,9 @@ static int lzo_compress_pages(struct list_head *ws, struct page *in_page = NULL; struct page *out_page = NULL; unsigned long bytes_left; - + unsigned long len = *total_out; + unsigned long nr_dest_pages = *out_pages; + const unsigned long max_out = nr_dest_pages * PAGE_SIZE; size_t in_len; size_t out_len; char *buf; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index bc2aba8..9a46878 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -879,15 +879,14 @@ out: /* Since the DIO code tries to lock a wide area we need to look for any ordered * extents that exist in the range, rather than just the start of the range. 
*/ -struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, - u64 file_offset, - u64 len) +struct btrfs_ordered_extent *btrfs_lookup_ordered_range( + struct btrfs_inode *inode, u64 file_offset, u64 len) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - tree = &BTRFS_I(inode)->ordered_tree; + tree = &inode->ordered_tree; spin_lock_irq(&tree->lock); node = tree_search(tree, file_offset); if (!node) { @@ -923,7 +922,7 @@ bool btrfs_have_ordered_extents_in_range(struct inode *inode, { struct btrfs_ordered_extent *oe; - oe = btrfs_lookup_ordered_range(inode, file_offset, len); + oe = btrfs_lookup_ordered_range(BTRFS_I(inode), file_offset, len); if (oe) { btrfs_put_ordered_extent(oe); return true; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index a8cb8ef..195c93b 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -189,9 +189,10 @@ void btrfs_start_ordered_extent(struct inode *inode, int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); -struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, - u64 file_offset, - u64 len); +struct btrfs_ordered_extent *btrfs_lookup_ordered_range( + struct btrfs_inode *inode, + u64 file_offset, + u64 len); bool btrfs_have_ordered_extents_in_range(struct inode *inode, u64 file_offset, u64 len); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index ddbde0f..d60df51 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1714,8 +1714,8 @@ int replace_file_extents(struct btrfs_trans_handle *trans, if (!ret) continue; - btrfs_drop_extent_cache(inode, key.offset, end, - 1); + btrfs_drop_extent_cache(BTRFS_I(inode), + key.offset, end, 1); unlock_extent(&BTRFS_I(inode)->io_tree, key.offset, end); } @@ -2130,7 +2130,7 @@ static int invalidate_extent_cache(struct btrfs_root *root, /* the lock_extent waits for readpage to complete */ lock_extent(&BTRFS_I(inode)->io_tree, start, end); - btrfs_drop_extent_cache(inode, start, end, 1); + btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1); unlock_extent(&BTRFS_I(inode)->io_tree, start, end); } return 0; @@ -3161,7 +3161,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end, free_extent_map(em); break; } - btrfs_drop_extent_cache(inode, start, end, 0); + btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0); } unlock_extent(&BTRFS_I(inode)->io_tree, start, end); return ret; @@ -3203,7 +3203,8 @@ static int relocate_file_extent_cluster(struct inode *inode, index = (cluster->start - offset) >> PAGE_SHIFT; last_index = (cluster->end - offset) >> PAGE_SHIFT; while (index <= last_index) { - ret = btrfs_delalloc_reserve_metadata(inode, PAGE_SIZE); + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), + PAGE_SIZE); if (ret) goto out; @@ -3215,7 +3216,7 @@ static int relocate_file_extent_cluster(struct inode *inode, page = find_or_create_page(inode->i_mapping, index, mask); if (!page) { - btrfs_delalloc_release_metadata(inode, + btrfs_delalloc_release_metadata(BTRFS_I(inode), PAGE_SIZE); ret = -ENOMEM; goto out; @@ -3234,7 +3235,7 @@ static int relocate_file_extent_cluster(struct inode *inode, if (!PageUptodate(page)) { unlock_page(page); put_page(page); - btrfs_delalloc_release_metadata(inode, + btrfs_delalloc_release_metadata(BTRFS_I(inode), PAGE_SIZE); ret = -EIO; goto out; @@ -4245,7 +4246,7 @@ struct inode *create_reloc_inode(struct 
btrfs_fs_info *fs_info, BUG_ON(IS_ERR(inode) || is_bad_inode(inode)); BTRFS_I(inode)->index_cnt = group->key.objectid; - err = btrfs_orphan_add(trans, inode); + err = btrfs_orphan_add(trans, BTRFS_I(inode)); out: btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ff9a11c..b0251eb 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -731,7 +731,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) ret = -EIO; goto out; } - ret = repair_io_failure(inode, offset, PAGE_SIZE, + ret = repair_io_failure(BTRFS_I(inode), offset, PAGE_SIZE, fixup->logical, page, offset - page_offset(page), fixup->mirror_num); @@ -4236,7 +4236,7 @@ out: scrub_pending_trans_workers_dec(sctx); } -static int check_extent_to_block(struct inode *inode, u64 start, u64 len, +static int check_extent_to_block(struct btrfs_inode *inode, u64 start, u64 len, u64 logical) { struct extent_state *cached_state = NULL; @@ -4246,7 +4246,7 @@ static int check_extent_to_block(struct inode *inode, u64 start, u64 len, u64 lockstart = start, lockend = start + len - 1; int ret = 0; - io_tree = &BTRFS_I(inode)->io_tree; + io_tree = &inode->io_tree; lock_extent_bits(io_tree, lockstart, lockend, &cached_state); ordered = btrfs_lookup_ordered_range(inode, lockstart, len); @@ -4325,7 +4325,8 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, io_tree = &BTRFS_I(inode)->io_tree; nocow_ctx_logical = nocow_ctx->logical; - ret = check_extent_to_block(inode, offset, len, nocow_ctx_logical); + ret = check_extent_to_block(BTRFS_I(inode), offset, len, + nocow_ctx_logical); if (ret) { ret = ret > 0 ? 0 : ret; goto out; @@ -4372,7 +4373,7 @@ again: } } - ret = check_extent_to_block(inode, offset, len, + ret = check_extent_to_block(BTRFS_I(inode), offset, len, nocow_ctx_logical); if (ret) { ret = ret > 0 ? 0 : ret; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d145ce8..456c890 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1681,6 +1681,9 @@ static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen) { int ret; + if (ino == BTRFS_FIRST_FREE_OBJECTID) + return 1; + ret = get_cur_inode_state(sctx, ino, gen); if (ret < 0) goto out; @@ -1866,7 +1869,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, * not deleted and then re-created, if it was then we have no overwrite * and we can just unlink this entry. */ - if (sctx->parent_root) { + if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) { ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL, NULL, NULL); if (ret < 0 && ret != -ENOENT) @@ -1934,6 +1937,19 @@ static int did_overwrite_ref(struct send_ctx *sctx, if (ret <= 0) goto out; + if (dir != BTRFS_FIRST_FREE_OBJECTID) { + ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, + NULL, NULL, NULL); + if (ret < 0 && ret != -ENOENT) + goto out; + if (ret) { + ret = 0; + goto out; + } + if (gen != dir_gen) + goto out; + } + /* check if the ref was overwritten by another ref */ ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len, &ow_inode, &other_type); @@ -3556,6 +3572,7 @@ static int wait_for_parent_move(struct send_ctx *sctx, { int ret = 0; u64 ino = parent_ref->dir; + u64 ino_gen = parent_ref->dir_gen; u64 parent_ino_before, parent_ino_after; struct fs_path *path_before = NULL; struct fs_path *path_after = NULL; @@ -3576,6 +3593,8 @@ static int wait_for_parent_move(struct send_ctx *sctx, * at get_cur_path()). 
*/ while (ino > BTRFS_FIRST_FREE_OBJECTID) { + u64 parent_ino_after_gen; + if (is_waiting_for_move(sctx, ino)) { /* * If the current inode is an ancestor of ino in the @@ -3598,7 +3617,7 @@ static int wait_for_parent_move(struct send_ctx *sctx, fs_path_reset(path_after); ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, - NULL, path_after); + &parent_ino_after_gen, path_after); if (ret < 0) goto out; ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before, @@ -3615,10 +3634,20 @@ static int wait_for_parent_move(struct send_ctx *sctx, if (ino > sctx->cur_ino && (parent_ino_before != parent_ino_after || len1 != len2 || memcmp(path_before->start, path_after->start, len1))) { - ret = 1; - break; + u64 parent_ino_gen; + + ret = get_inode_info(sctx->parent_root, ino, NULL, + &parent_ino_gen, NULL, NULL, NULL, + NULL); + if (ret < 0) + goto out; + if (ino_gen == parent_ino_gen) { + ret = 1; + break; + } } ino = parent_ino_after; + ino_gen = parent_ino_after_gen; } out: @@ -5277,6 +5306,81 @@ out: return ret; } +static int range_is_hole_in_parent(struct send_ctx *sctx, + const u64 start, + const u64 end) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_root *root = sctx->parent_root; + u64 search_start = start; + int ret; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + key.objectid = sctx->cur_ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = search_start; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0 && path->slots[0] > 0) + path->slots[0]--; + + while (search_start < end) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + struct btrfs_file_extent_item *fi; + u64 extent_end; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid < sctx->cur_ino || + key.type < BTRFS_EXTENT_DATA_KEY) + goto next; + if (key.objectid > sctx->cur_ino || + key.type > BTRFS_EXTENT_DATA_KEY || + key.offset >= end) + break; + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) { + u64 size = btrfs_file_extent_inline_len(leaf, slot, fi); + + extent_end = ALIGN(key.offset + size, + root->fs_info->sectorsize); + } else { + extent_end = key.offset + + btrfs_file_extent_num_bytes(leaf, fi); + } + if (extent_end <= start) + goto next; + if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) { + search_start = extent_end; + goto next; + } + ret = 0; + goto out; +next: + path->slots[0]++; + } + ret = 1; +out: + btrfs_free_path(path); + return ret; +} + static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, struct btrfs_key *key) { @@ -5321,8 +5425,17 @@ static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, return ret; } - if (sctx->cur_inode_last_extent < key->offset) - ret = send_hole(sctx, key->offset); + if (sctx->cur_inode_last_extent < key->offset) { + ret = range_is_hole_in_parent(sctx, + sctx->cur_inode_last_extent, + key->offset); + if (ret < 0) + return ret; + else if (ret == 0) + ret = send_hole(sctx, key->offset); + else + ret = 0; + } sctx->cur_inode_last_extent = extent_end; return ret; } diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 4d0f038..8c91d03 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -278,7 +278,7 @@ static noinline 
int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) /* First with no extents */ BTRFS_I(inode)->root = root; - em = btrfs_get_extent(inode, NULL, 0, 0, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize, 0); if (IS_ERR(em)) { em = NULL; test_msg("Got an error when we shouldn't have\n"); @@ -293,7 +293,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } free_extent_map(em); - btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); + btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); /* * All of the magic numbers are based on the mapping setup in @@ -302,7 +302,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) */ setup_file_extents(root, sectorsize); - em = btrfs_get_extent(inode, NULL, 0, 0, (u64)-1, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -323,7 +323,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -350,7 +350,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -372,7 +372,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Regular extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -399,7 +399,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* The next 3 are split extents */ - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -428,7 +428,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -450,7 +450,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -484,7 +484,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Prealloc extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -513,7 +513,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) 
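The change running through these btrfs hunks is an interface cleanup: helpers such as btrfs_get_extent(), btrfs_add_link(), btrfs_set_inode_index(), btrfs_i_size_write() and btrfs_drop_extent_cache() now take a struct btrfs_inode * rather than a VFS struct inode *, so a caller that still holds a VFS inode converts once with BTRFS_I() at the call boundary. A minimal sketch of the calling convention, assuming a caller that already has a VFS inode plus a start/len pair (an illustrative aside, not part of the patch; the error-handling shape is simplified):

    struct extent_map *em;

    /* before this series: helpers took the VFS inode and called BTRFS_I() internally */
    /* em = btrfs_get_extent(inode, NULL, 0, start, len, 0); */

    /* after: the caller converts via BTRFS_I() and the helper works on btrfs_inode */
    em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0);
    if (IS_ERR(em))
        return PTR_ERR(em);

Inside the converted helpers the pattern is the mirror image: fields formerly reached as BTRFS_I(inode)->extent_tree or BTRFS_I(inode)->root become inode->extent_tree and inode->root, and the embedded VFS inode is reached as inode->vfs_inode (for example inode->vfs_inode.i_sb) where VFS state is still needed, as the btrfs_get_extent() and btrfs_add_link() hunks above show.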
free_extent_map(em); /* The next 3 are a half written prealloc extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -543,7 +543,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -576,7 +576,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -611,7 +611,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Now for the compressed extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -645,7 +645,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Split compressed extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -680,7 +680,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -707,7 +707,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -742,7 +742,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* A hole between regular extents but no hole extent */ - em = btrfs_get_extent(inode, NULL, 0, offset + 6, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6, + sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -769,7 +770,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096 * 1024, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, 4096 * 1024, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -802,7 +803,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); 
goto out; @@ -885,7 +886,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) insert_inode_item_key(root); insert_extent(root, sectorsize, sectorsize, sectorsize, 0, sectorsize, sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1); - em = btrfs_get_extent(inode, NULL, 0, 0, 2 * sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -907,7 +908,8 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) } free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, sectorsize, 2 * sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize, + 2 * sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 6b3e0fc..61b807d 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1505,7 +1505,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, /* * insert the directory item */ - ret = btrfs_set_inode_index(parent_inode, &index); + ret = btrfs_set_inode_index(BTRFS_I(parent_inode), &index); BUG_ON(ret); /* -ENOMEM */ /* check if there is a file/dir which has the same name. */ @@ -1644,7 +1644,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_insert_dir_item(trans, parent_root, dentry->d_name.name, dentry->d_name.len, - parent_inode, &key, + BTRFS_I(parent_inode), &key, BTRFS_FT_DIR, index); /* We have check then name at the beginning, so it is impossible. */ BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); @@ -1653,7 +1653,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } - btrfs_i_size_write(parent_inode, parent_inode->i_size + + btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size + dentry->d_name.len * 2); parent_inode->i_mtime = parent_inode->i_ctime = current_time(parent_inode); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 3806853..a59674c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -673,6 +673,10 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, unsigned long dest_offset; struct btrfs_key ins; + if (btrfs_file_extent_disk_bytenr(eb, item) == 0 && + btrfs_fs_incompat(fs_info, NO_HOLES)) + goto update_inode; + ret = btrfs_insert_empty_item(trans, root, path, key, sizeof(*item)); if (ret) @@ -825,6 +829,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, } inode_add_bytes(inode, nbytes); +update_inode: ret = btrfs_update_inode(trans, root, inode); out: if (inode) @@ -1322,8 +1327,9 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, } /* insert our name */ - ret = btrfs_add_link(trans, dir, inode, name, namelen, - 0, ref_index); + ret = btrfs_add_link(trans, BTRFS_I(dir), + BTRFS_I(inode), + name, namelen, 0, ref_index); if (ret) goto out; @@ -1641,7 +1647,8 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, return -EIO; } - ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index); + ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, + name_len, 1, index); /* FIXME, put inode into FIXUP list */ @@ -1780,7 +1787,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, out: btrfs_release_path(path); if (!ret && update_size) { - btrfs_i_size_write(dir, dir->i_size + name_len * 2); + btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2); ret = 
btrfs_update_inode(trans, root, dir); } kfree(name); @@ -5045,14 +5052,14 @@ static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, * a full commit is required. */ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, - struct inode *inode, + struct btrfs_inode *inode, struct dentry *parent, struct super_block *sb, u64 last_committed) { int ret = 0; struct dentry *old_parent = NULL; - struct inode *orig_inode = inode; + struct btrfs_inode *orig_inode = inode; /* * for regular files, if its inode is already on disk, we don't @@ -5060,15 +5067,15 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, * we can use the last_unlink_trans field to record renames * and other fun in this file. */ - if (S_ISREG(inode->i_mode) && - BTRFS_I(inode)->generation <= last_committed && - BTRFS_I(inode)->last_unlink_trans <= last_committed) - goto out; + if (S_ISREG(inode->vfs_inode.i_mode) && + inode->generation <= last_committed && + inode->last_unlink_trans <= last_committed) + goto out; - if (!S_ISDIR(inode->i_mode)) { + if (!S_ISDIR(inode->vfs_inode.i_mode)) { if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) goto out; - inode = d_inode(parent); + inode = BTRFS_I(d_inode(parent)); } while (1) { @@ -5079,10 +5086,10 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, * think this inode has already been logged. */ if (inode != orig_inode) - BTRFS_I(inode)->logged_trans = trans->transid; + inode->logged_trans = trans->transid; smp_mb(); - if (btrfs_must_commit_transaction(trans, BTRFS_I(inode))) { + if (btrfs_must_commit_transaction(trans, inode)) { ret = 1; break; } @@ -5091,8 +5098,8 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, break; if (IS_ROOT(parent)) { - inode = d_inode(parent); - if (btrfs_must_commit_transaction(trans, BTRFS_I(inode))) + inode = BTRFS_I(d_inode(parent)); + if (btrfs_must_commit_transaction(trans, inode)) ret = 1; break; } @@ -5100,7 +5107,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, parent = dget_parent(parent); dput(old_parent); old_parent = parent; - inode = d_inode(parent); + inode = BTRFS_I(d_inode(parent)); } dput(old_parent); @@ -5287,15 +5294,15 @@ next_dir_inode: } static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, - struct inode *inode, + struct btrfs_inode *inode, struct btrfs_log_ctx *ctx) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); int ret; struct btrfs_path *path; struct btrfs_key key; - struct btrfs_root *root = BTRFS_I(inode)->root; - const u64 ino = btrfs_ino(BTRFS_I(inode)); + struct btrfs_root *root = inode->root; + const u64 ino = btrfs_ino(inode); path = btrfs_alloc_path(); if (!path) @@ -5390,7 +5397,8 @@ out: * the last committed transaction */ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, + struct btrfs_root *root, + struct btrfs_inode *inode, struct dentry *parent, const loff_t start, const loff_t end, @@ -5404,9 +5412,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, int ret = 0; u64 last_committed = fs_info->last_trans_committed; bool log_dentries = false; - struct inode *orig_inode = inode; + struct btrfs_inode *orig_inode = inode; - sb = inode->i_sb; + sb = inode->vfs_inode.i_sb; if (btrfs_test_opt(fs_info, NOTREELOG)) { ret = 1; @@ -5423,18 +5431,17 @@ static 
int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_no_trans; } - if (root != BTRFS_I(inode)->root || - btrfs_root_refs(&root->root_item) == 0) { + if (root != inode->root || btrfs_root_refs(&root->root_item) == 0) { ret = 1; goto end_no_trans; } - ret = check_parent_dirs_for_sync(trans, inode, parent, - sb, last_committed); + ret = check_parent_dirs_for_sync(trans, inode, parent, sb, + last_committed); if (ret) goto end_no_trans; - if (btrfs_inode_in_log(BTRFS_I(inode), trans->transid)) { + if (btrfs_inode_in_log(inode, trans->transid)) { ret = BTRFS_NO_LOG_SYNC; goto end_no_trans; } @@ -5443,8 +5450,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (ret) goto end_no_trans; - ret = btrfs_log_inode(trans, root, BTRFS_I(inode), inode_only, - start, end, ctx); + ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx); if (ret) goto end_trans; @@ -5454,14 +5460,14 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, * we can use the last_unlink_trans field to record renames * and other fun in this file. */ - if (S_ISREG(inode->i_mode) && - BTRFS_I(inode)->generation <= last_committed && - BTRFS_I(inode)->last_unlink_trans <= last_committed) { + if (S_ISREG(inode->vfs_inode.i_mode) && + inode->generation <= last_committed && + inode->last_unlink_trans <= last_committed) { ret = 0; goto end_trans; } - if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries) + if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries) log_dentries = true; /* @@ -5505,7 +5511,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, * but the file inode does not have a matching BTRFS_INODE_REF_KEY item * and has a link count of 2. */ - if (BTRFS_I(inode)->last_unlink_trans > last_committed) { + if (inode->last_unlink_trans > last_committed) { ret = btrfs_log_all_parents(trans, orig_inode, ctx); if (ret) goto end_trans; @@ -5515,14 +5521,13 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) break; - inode = d_inode(parent); - if (root != BTRFS_I(inode)->root) + inode = BTRFS_I(d_inode(parent)); + if (root != inode->root) break; - if (BTRFS_I(inode)->generation > last_committed) { - ret = btrfs_log_inode(trans, root, BTRFS_I(inode), - LOG_INODE_EXISTS, - 0, LLONG_MAX, ctx); + if (inode->generation > last_committed) { + ret = btrfs_log_inode(trans, root, inode, + LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); if (ret) goto end_trans; } @@ -5534,7 +5539,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, old_parent = parent; } if (log_dentries) - ret = log_new_dir_dentries(trans, root, BTRFS_I(orig_inode), ctx); + ret = log_new_dir_dentries(trans, root, orig_inode, ctx); else ret = 0; end_trans: @@ -5566,8 +5571,8 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct dentry *parent = dget_parent(dentry); int ret; - ret = btrfs_log_inode_parent(trans, root, d_inode(dentry), parent, - start, end, 0, ctx); + ret = btrfs_log_inode_parent(trans, root, BTRFS_I(d_inode(dentry)), + parent, start, end, 0, ctx); dput(parent); return ret; @@ -5829,7 +5834,7 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed)) return 0; - return btrfs_log_inode_parent(trans, root, &inode->vfs_inode, parent, 0, + return btrfs_log_inode_parent(trans, root, inode, parent, 0, LLONG_MAX, 1, NULL); } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 
13e55d1..73d56ee 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1725,7 +1725,7 @@ out: * Function to update ctime/mtime for a given device path. * Mainly used for ctime/mtime based probe like libblkid. */ -static void update_dev_time(char *path_name) +static void update_dev_time(const char *path_name) { struct file *filp; @@ -1851,7 +1851,8 @@ void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info, fs_info->fs_devices->latest_bdev = next_device->bdev; } -int btrfs_rm_device(struct btrfs_fs_info *fs_info, char *device_path, u64 devid) +int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, + u64 devid) { struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; @@ -2091,7 +2092,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, } static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info, - char *device_path, + const char *device_path, struct btrfs_device **device) { int ret = 0; @@ -2118,7 +2119,7 @@ static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info, } int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, - char *device_path, + const char *device_path, struct btrfs_device **device) { *device = NULL; @@ -2151,7 +2152,8 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, * Lookup a device given by device id, or the path if the id is 0. */ int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid, - char *devpath, struct btrfs_device **device) + const char *devpath, + struct btrfs_device **device) { int ret; @@ -2307,7 +2309,7 @@ error: return ret; } -int btrfs_init_new_device(struct btrfs_fs_info *fs_info, char *device_path) +int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) { struct btrfs_root *root = fs_info->dev_root; struct request_queue *q; @@ -2515,7 +2517,7 @@ error: } int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, - char *device_path, + const char *device_path, struct btrfs_device *srcdev, struct btrfs_device **device_out) { @@ -6954,7 +6956,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, key.offset = device->devid; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); if (ret < 0) { btrfs_warn_in_rcu(fs_info, @@ -7102,7 +7105,7 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, return 0; } -void btrfs_scratch_superblocks(struct block_device *bdev, char *device_path) +void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path) { struct buffer_head *bh; struct btrfs_super_block *disk_super; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 24ba6bc..59be812 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -422,16 +422,16 @@ void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step); void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info, struct btrfs_device *device, struct btrfs_device *this_dev); int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, - char *device_path, + const char *device_path, struct btrfs_device **device); int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid, - char *devpath, + const char *devpath, struct btrfs_device **device); struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, const u64 *devid, const u8 *uuid); int btrfs_rm_device(struct btrfs_fs_info *fs_info, - char *device_path, u64 devid); + const char 
*device_path, u64 devid); void btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, @@ -439,9 +439,9 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, u8 *uuid, u8 *fsid); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); -int btrfs_init_new_device(struct btrfs_fs_info *fs_info, char *path); +int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path); int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, - char *device_path, + const char *device_path, struct btrfs_device *srcdev, struct btrfs_device **device_out); int btrfs_balance(struct btrfs_balance_control *bctl, @@ -474,7 +474,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev); void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev); -void btrfs_scratch_superblocks(struct block_device *bdev, char *device_path); +void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path); int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len, int mirror_num); unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index da497f1..135b108 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -73,13 +73,11 @@ fail: static int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, unsigned long len, + u64 start, struct page **pages, - unsigned long nr_dest_pages, unsigned long *out_pages, unsigned long *total_in, - unsigned long *total_out, - unsigned long max_out) + unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret; @@ -89,6 +87,9 @@ static int zlib_compress_pages(struct list_head *ws, struct page *in_page = NULL; struct page *out_page = NULL; unsigned long bytes_left; + unsigned long len = *total_out; + unsigned long nr_dest_pages = *out_pages; + const unsigned long max_out = nr_dest_pages * PAGE_SIZE; *out_pages = 0; *total_out = 0; diff --git a/fs/buffer.c b/fs/buffer.c index 28484b3..9196f2a 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -19,6 +19,7 @@ */ #include <linux/kernel.h> +#include <linux/sched/signal.h> #include <linux/syscalls.h> #include <linux/fs.h> #include <linux/iomap.h> diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index cd1effe..9bf90bc 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -19,6 +19,7 @@ #include <linux/fscache-cache.h> #include <linux/timer.h> #include <linux/wait.h> +#include <linux/cred.h> #include <linux/workqueue.h> #include <linux/security.h> diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 7ce35ae..1a3e1b4 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -8,6 +8,7 @@ #include <linux/slab.h> #include <linux/pagevec.h> #include <linux/task_io_accounting_ops.h> +#include <linux/signal.h> #include "super.h" #include "mds_client.h" @@ -391,6 +392,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) nr_pages = i; if (nr_pages > 0) { len = nr_pages << PAGE_SHIFT; + osd_req_op_extent_update(req, 0, len); break; } goto out_pages; @@ -771,7 +773,7 @@ static int ceph_writepages_start(struct address_space *mapping, wbc->sync_mode == WB_SYNC_NONE ? "NONE" : (wbc->sync_mode == WB_SYNC_ALL ? 
"ALL" : "HOLD")); - if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { if (ci->i_wrbuffer_ref > 0) { pr_warn_ratelimited( "writepage_start %p %lld forced umount\n", @@ -1017,8 +1019,7 @@ new_request: &ci->i_layout, vino, offset, &len, 0, num_ops, CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_WRITE | - CEPH_OSD_FLAG_ONDISK, + CEPH_OSD_FLAG_WRITE, snapc, truncate_seq, truncate_size, false); if (IS_ERR(req)) { @@ -1028,8 +1029,7 @@ new_request: min(num_ops, CEPH_OSD_SLAB_OPS), CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_WRITE | - CEPH_OSD_FLAG_ONDISK, + CEPH_OSD_FLAG_WRITE, snapc, truncate_seq, truncate_size, true); BUG_ON(IS_ERR(req)); @@ -1194,7 +1194,7 @@ static int ceph_update_writeable_page(struct file *file, int r; struct ceph_snap_context *snapc, *oldest; - if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { dout(" page %p forced umount\n", page); unlock_page(page); return -EIO; @@ -1681,8 +1681,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), 0, &len, 0, 1, - CEPH_OSD_OP_CREATE, - CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, + CEPH_OSD_OP_CREATE, CEPH_OSD_FLAG_WRITE, NULL, 0, 0, false); if (IS_ERR(req)) { err = PTR_ERR(req); @@ -1699,8 +1698,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), 0, &len, 1, 3, - CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, + CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, NULL, ci->i_truncate_seq, ci->i_truncate_size, false); if (IS_ERR(req)) { @@ -1873,7 +1871,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, goto out_unlock; } - wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK; + wr_req->r_flags = CEPH_OSD_FLAG_WRITE; osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL); ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc); ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid); diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 5bc5d37..4e7421c 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -234,7 +234,7 @@ void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp) fscache_enable_cookie(ci->fscache, ceph_fscache_can_enable, inode); if (fscache_cookie_enabled(ci->fscache)) { - dout("fscache_file_set_cookie %p %p enabing cache\n", + dout("fscache_file_set_cookie %p %p enabling cache\n", inode, filp); } } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 94fd76d..68c78be 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2,7 +2,7 @@ #include <linux/fs.h> #include <linux/kernel.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/wait.h> @@ -867,7 +867,7 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci) /* * Return caps we have registered with the MDS(s) as 'wanted'. 
*/ -int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) +int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check) { struct ceph_cap *cap; struct rb_node *p; @@ -875,7 +875,7 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); - if (!__cap_is_valid(cap)) + if (check && !__cap_is_valid(cap)) continue; if (cap == ci->i_auth_cap) mds_wanted |= cap->mds_wanted; @@ -1184,6 +1184,13 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, delayed = 1; } ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH); + if (want & ~cap->mds_wanted) { + /* user space may open/close a single file frequently. * This avoids dropping mds_wanted immediately after * requesting new mds_wanted. */ + __cap_set_timeouts(mdsc, ci); + } cap->issued &= retain; /* drop bits we don't want */ if (cap->implemented & ~cap->issued) { @@ -2084,8 +2091,6 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); - ceph_sync_write_wait(inode); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret < 0) goto out; @@ -2477,23 +2482,22 @@ again: if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) { int mds_wanted; - if (ACCESS_ONCE(mdsc->fsc->mount_state) == + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { dout("get_cap_refs %p forced umount\n", inode); *err = -EIO; ret = 1; goto out_unlock; } - mds_wanted = __ceph_caps_mds_wanted(ci); - if ((mds_wanted & need) != need) { + mds_wanted = __ceph_caps_mds_wanted(ci, false); + if (need & ~(mds_wanted & need)) { dout("get_cap_refs %p caps were dropped" " (session killed?)\n", inode); *err = -ESTALE; ret = 1; goto out_unlock; } - if ((mds_wanted & file_wanted) == - (file_wanted & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)) + if (!(file_wanted & ~mds_wanted)) ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED; } @@ -3404,6 +3408,7 @@ retry: tcap->implemented |= issued; if (cap == ci->i_auth_cap) ci->i_auth_cap = tcap; + if (!list_empty(&ci->i_cap_flush_list) && ci->i_auth_cap == tcap) { spin_lock(&mdsc->cap_dirty_lock); @@ -3417,9 +3422,18 @@ } else if (tsession) { /* add placeholder for the export target */ int flag = (cap == ci->i_auth_cap) ?
CEPH_CAP_FLAG_AUTH : 0; + tcap = new_cap; ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0, t_seq - 1, t_mseq, (u64)-1, flag, &new_cap); + if (!list_empty(&ci->i_cap_flush_list) && + ci->i_auth_cap == tcap) { + spin_lock(&mdsc->cap_dirty_lock); + list_move_tail(&ci->i_flushing_item, + &tcap->session->s_cap_flushing); + spin_unlock(&mdsc->cap_dirty_lock); + } + __ceph_remove_cap(cap, false); goto out_unlock; } @@ -3924,9 +3938,10 @@ int ceph_encode_inode_release(void **p, struct inode *inode, } int ceph_encode_dentry_release(void **p, struct dentry *dentry, + struct inode *dir, int mds, int drop, int unless) { - struct inode *dir = d_inode(dentry->d_parent); + struct dentry *parent = NULL; struct ceph_mds_request_release *rel = *p; struct ceph_dentry_info *di = ceph_dentry(dentry); int force = 0; @@ -3941,9 +3956,14 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, spin_lock(&dentry->d_lock); if (di->lease_session && di->lease_session->s_mds == mds) force = 1; + if (!dir) { + parent = dget(dentry->d_parent); + dir = d_inode(parent); + } spin_unlock(&dentry->d_lock); ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force); + dput(parent); spin_lock(&dentry->d_lock); if (ret && di->lease_session && di->lease_session->s_mds == mds) { diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 39ff678..f2ae393 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -70,7 +70,7 @@ static int mdsc_show(struct seq_file *s, void *p) seq_printf(s, "%s", ceph_mds_op_name(req->r_op)); - if (req->r_got_unsafe) + if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) seq_puts(s, "\t(unsafe)"); else seq_puts(s, "\t"); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 8ab1fdf..3e9ad50 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -371,7 +371,7 @@ more: /* hints to request -> mds selection code */ req->r_direct_mode = USE_AUTH_MDS; req->r_direct_hash = ceph_frag_value(frag); - req->r_direct_is_hash = true; + __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); if (fi->last_name) { req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL); if (!req->r_path2) { @@ -417,7 +417,7 @@ more: fi->frag = frag; fi->last_readdir = req; - if (req->r_did_prepopulate) { + if (test_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags)) { fi->readdir_cache_idx = req->r_readdir_cache_idx; if (fi->readdir_cache_idx < 0) { /* preclude from marking dir ordered */ @@ -752,7 +752,8 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, mask |= CEPH_CAP_XATTR_SHARED; req->r_args.getattr.mask = cpu_to_le32(mask); - req->r_locked_dir = dir; + req->r_parent = dir; + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); err = ceph_mdsc_do_request(mdsc, NULL, req); err = ceph_handle_snapdir(req, dentry, err); dentry = ceph_finish_lookup(req, dentry, err); @@ -813,7 +814,8 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry, } req->r_dentry = dget(dentry); req->r_num_caps = 2; - req->r_locked_dir = dir; + req->r_parent = dir; + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_args.mknod.mode = cpu_to_le32(mode); req->r_args.mknod.rdev = cpu_to_le32(rdev); req->r_dentry_drop = CEPH_CAP_FILE_SHARED; @@ -864,7 +866,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, ceph_mdsc_put_request(req); goto out; } - req->r_locked_dir = dir; + req->r_parent = dir; + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_dentry = dget(dentry); req->r_num_caps = 2; req->r_dentry_drop = CEPH_CAP_FILE_SHARED; @@ -913,7 +916,8 @@ static int 
ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) req->r_dentry = dget(dentry); req->r_num_caps = 2; - req->r_locked_dir = dir; + req->r_parent = dir; + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_args.mkdir.mode = cpu_to_le32(mode); req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; @@ -957,7 +961,8 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, req->r_dentry = dget(dentry); req->r_num_caps = 2; req->r_old_dentry = dget(old_dentry); - req->r_locked_dir = dir; + req->r_parent = dir; + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; /* release LINK_SHARED on source inode (mds will lock it) */ @@ -1023,7 +1028,8 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) } req->r_dentry = dget(dentry); req->r_num_caps = 2; - req->r_locked_dir = dir; + req->r_parent = dir; + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; req->r_inode_drop = drop_caps_for_unlink(inode); @@ -1066,7 +1072,8 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, req->r_num_caps = 2; req->r_old_dentry = dget(old_dentry); req->r_old_dentry_dir = old_dir; - req->r_locked_dir = new_dir; + req->r_parent = new_dir; + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL; req->r_dentry_drop = CEPH_CAP_FILE_SHARED; @@ -1194,7 +1201,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) struct inode *dir; if (flags & LOOKUP_RCU) { - parent = ACCESS_ONCE(dentry->d_parent); + parent = READ_ONCE(dentry->d_parent); dir = d_inode_rcu(parent); if (!dir) return -ECHILD; @@ -1237,11 +1244,12 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; op = ceph_snap(dir) == CEPH_SNAPDIR ? - CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_GETATTR; + CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); if (!IS_ERR(req)) { req->r_dentry = dget(dentry); - req->r_num_caps = op == CEPH_MDS_OP_GETATTR ? 
1 : 2; + req->r_num_caps = 2; + req->r_parent = dir; mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; if (ceph_security_xattr_wanted(dir)) diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 180bbef..e8f11fa 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -207,7 +207,8 @@ static int ceph_get_name(struct dentry *parent, char *name, req->r_inode = d_inode(child); ihold(d_inode(child)); req->r_ino2 = ceph_vino(d_inode(parent)); - req->r_locked_dir = d_inode(parent); + req->r_parent = d_inode(parent); + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_num_caps = 2; err = ceph_mdsc_do_request(mdsc, NULL, req); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 045d30d..26cc954 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -283,7 +283,7 @@ int ceph_open(struct inode *inode, struct file *file) spin_lock(&ci->i_ceph_lock); if (__ceph_is_any_real_caps(ci) && (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) { - int mds_wanted = __ceph_caps_mds_wanted(ci); + int mds_wanted = __ceph_caps_mds_wanted(ci, true); int issued = __ceph_caps_issued(ci, NULL); dout("open %p fmode %d want %s issued %s using existing\n", @@ -379,7 +379,8 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, mask |= CEPH_CAP_XATTR_SHARED; req->r_args.open.mask = cpu_to_le32(mask); - req->r_locked_dir = dir; /* caller holds dir->i_mutex */ + req->r_parent = dir; + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); err = ceph_mdsc_do_request(mdsc, (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, req); @@ -758,9 +759,7 @@ static void ceph_aio_retry_work(struct work_struct *work) goto out; } - req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | - CEPH_OSD_FLAG_ONDISK | - CEPH_OSD_FLAG_WRITE; + req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE; ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc); ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); @@ -794,89 +793,6 @@ out: kfree(aio_work); } -/* - * Write commit request unsafe callback, called to tell us when a - * request is unsafe (that is, in flight--has been handed to the - * messenger to send to its target osd). It is called again when - * we've received a response message indicating the request is - * "safe" (its CEPH_OSD_FLAG_ONDISK flag is set), or when a request - * is completed early (and unsuccessfully) due to a timeout or - * interrupt. - * - * This is used if we requested both an ACK and ONDISK commit reply - * from the OSD. - */ -static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) -{ - struct ceph_inode_info *ci = ceph_inode(req->r_inode); - - dout("%s %p tid %llu %ssafe\n", __func__, req, req->r_tid, - unsafe ? "un" : ""); - if (unsafe) { - ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); - spin_lock(&ci->i_unsafe_lock); - list_add_tail(&req->r_unsafe_item, - &ci->i_unsafe_writes); - spin_unlock(&ci->i_unsafe_lock); - - complete_all(&req->r_completion); - } else { - spin_lock(&ci->i_unsafe_lock); - list_del_init(&req->r_unsafe_item); - spin_unlock(&ci->i_unsafe_lock); - ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); - } -} - -/* - * Wait on any unsafe replies for the given inode. First wait on the - * newest request, and make that the upper bound. Then, if there are - * more requests, keep waiting on the oldest as long as it is still older - * than the original request. 
- */ -void ceph_sync_write_wait(struct inode *inode) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - struct list_head *head = &ci->i_unsafe_writes; - struct ceph_osd_request *req; - u64 last_tid; - - if (!S_ISREG(inode->i_mode)) - return; - - spin_lock(&ci->i_unsafe_lock); - if (list_empty(head)) - goto out; - - /* set upper bound as _last_ entry in chain */ - - req = list_last_entry(head, struct ceph_osd_request, - r_unsafe_item); - last_tid = req->r_tid; - - do { - ceph_osdc_get_request(req); - spin_unlock(&ci->i_unsafe_lock); - - dout("sync_write_wait on tid %llu (until %llu)\n", - req->r_tid, last_tid); - wait_for_completion(&req->r_done_completion); - ceph_osdc_put_request(req); - - spin_lock(&ci->i_unsafe_lock); - /* - * from here on look at first entry in chain, since we - * only want to wait for anything older than last_tid - */ - if (list_empty(head)) - break; - req = list_first_entry(head, struct ceph_osd_request, - r_unsafe_item); - } while (req->r_tid < last_tid); -out: - spin_unlock(&ci->i_unsafe_lock); -} - static ssize_t ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, struct ceph_snap_context *snapc, @@ -915,9 +831,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, if (ret2 < 0) dout("invalidate_inode_pages2_range returned %d\n", ret2); - flags = CEPH_OSD_FLAG_ORDERSNAP | - CEPH_OSD_FLAG_ONDISK | - CEPH_OSD_FLAG_WRITE; + flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE; } else { flags = CEPH_OSD_FLAG_READ; } @@ -1116,10 +1030,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, if (ret < 0) dout("invalidate_inode_pages2_range returned %d\n", ret); - flags = CEPH_OSD_FLAG_ORDERSNAP | - CEPH_OSD_FLAG_ONDISK | - CEPH_OSD_FLAG_WRITE | - CEPH_OSD_FLAG_ACK; + flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE; while ((len = iov_iter_count(from)) > 0) { size_t left; @@ -1165,8 +1076,6 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, goto out; } - /* get a second commit callback */ - req->r_unsafe_callback = ceph_sync_write_unsafe; req->r_inode = inode; osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, @@ -1616,8 +1525,7 @@ static int ceph_zero_partial_object(struct inode *inode, ceph_vino(inode), offset, length, 0, 1, op, - CEPH_OSD_FLAG_WRITE | - CEPH_OSD_FLAG_ONDISK, + CEPH_OSD_FLAG_WRITE, NULL, 0, 0, false); if (IS_ERR(req)) { ret = PTR_ERR(req); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 5e659d0..d449e1c 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -499,7 +499,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_rdcache_gen = 0; ci->i_rdcache_revoking = 0; - INIT_LIST_HEAD(&ci->i_unsafe_writes); INIT_LIST_HEAD(&ci->i_unsafe_dirops); INIT_LIST_HEAD(&ci->i_unsafe_iops); spin_lock_init(&ci->i_unsafe_lock); @@ -583,14 +582,6 @@ int ceph_drop_inode(struct inode *inode) return 1; } -void ceph_evict_inode(struct inode *inode) -{ - /* wait unsafe sync writes */ - ceph_sync_write_wait(inode); - truncate_inode_pages_final(&inode->i_data); - clear_inode(inode); -} - static inline blkcnt_t calc_inode_blocks(u64 size) { return (size + (1<<9) - 1) >> 9; @@ -1016,7 +1007,9 @@ out: static void update_dentry_lease(struct dentry *dentry, struct ceph_mds_reply_lease *lease, struct ceph_mds_session *session, - unsigned long from_time) + unsigned long from_time, + struct ceph_vino *tgt_vino, + struct ceph_vino *dir_vino) { struct ceph_dentry_info *di = ceph_dentry(dentry); long unsigned duration = le32_to_cpu(lease->duration_ms); @@ -1024,13 +1017,27 @@ static 
void update_dentry_lease(struct dentry *dentry, long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000; struct inode *dir; + /* + * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that + * we expect a negative dentry. + */ + if (!tgt_vino && d_really_is_positive(dentry)) + return; + + if (tgt_vino && (d_really_is_negative(dentry) || + !ceph_ino_compare(d_inode(dentry), tgt_vino))) + return; + spin_lock(&dentry->d_lock); dout("update_dentry_lease %p duration %lu ms ttl %lu\n", dentry, duration, ttl); - /* make lease_rdcache_gen match directory */ dir = d_inode(dentry->d_parent); + /* make sure parent matches dir_vino */ + if (!ceph_ino_compare(dir, dir_vino)) + goto out_unlock; + /* only track leases on regular dentries */ if (ceph_snap(dir) != CEPH_NOSNAP) goto out_unlock; @@ -1108,61 +1115,27 @@ out: * * Called with snap_rwsem (read). */ -int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, - struct ceph_mds_session *session) +int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) { + struct ceph_mds_session *session = req->r_session; struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct inode *in = NULL; - struct ceph_vino vino; + struct ceph_vino tvino, dvino; struct ceph_fs_client *fsc = ceph_sb_to_client(sb); int err = 0; dout("fill_trace %p is_dentry %d is_target %d\n", req, rinfo->head->is_dentry, rinfo->head->is_target); -#if 0 - /* - * Debugging hook: - * - * If we resend completed ops to a recovering mds, we get no - * trace. Since that is very rare, pretend this is the case - * to ensure the 'no trace' handlers in the callers behave. - * - * Fill in inodes unconditionally to avoid breaking cap - * invariants. - */ - if (rinfo->head->op & CEPH_MDS_OP_WRITE) { - pr_info("fill_trace faking empty trace on %lld %s\n", - req->r_tid, ceph_mds_op_name(rinfo->head->op)); - if (rinfo->head->is_dentry) { - rinfo->head->is_dentry = 0; - err = fill_inode(req->r_locked_dir, - &rinfo->diri, rinfo->dirfrag, - session, req->r_request_started, -1); - } - if (rinfo->head->is_target) { - rinfo->head->is_target = 0; - ininfo = rinfo->targeti.in; - vino.ino = le64_to_cpu(ininfo->ino); - vino.snap = le64_to_cpu(ininfo->snapid); - in = ceph_get_inode(sb, vino); - err = fill_inode(in, &rinfo->targeti, NULL, - session, req->r_request_started, - req->r_fmode); - iput(in); - } - } -#endif - if (!rinfo->head->is_target && !rinfo->head->is_dentry) { dout("fill_trace reply is empty!\n"); - if (rinfo->head->result == 0 && req->r_locked_dir) + if (rinfo->head->result == 0 && req->r_parent) ceph_invalidate_dir_request(req); return 0; } if (rinfo->head->is_dentry) { - struct inode *dir = req->r_locked_dir; + struct inode *dir = req->r_parent; if (dir) { err = fill_inode(dir, NULL, @@ -1188,8 +1161,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, dname.name = rinfo->dname; dname.len = rinfo->dname_len; dname.hash = full_name_hash(parent, dname.name, dname.len); - vino.ino = le64_to_cpu(rinfo->targeti.in->ino); - vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); + tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); + tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); retry_lookup: dn = d_lookup(parent, &dname); dout("d_lookup on parent=%p name=%.*s got %p\n", @@ -1206,8 +1179,8 @@ retry_lookup: } err = 0; } else if (d_really_is_positive(dn) && - (ceph_ino(d_inode(dn)) != vino.ino || - ceph_snap(d_inode(dn)) != vino.snap)) { + (ceph_ino(d_inode(dn)) != tvino.ino || + ceph_snap(d_inode(dn)) != tvino.snap)) { 
dout(" dn %p points to wrong inode %p\n", dn, d_inode(dn)); d_delete(dn); @@ -1221,10 +1194,10 @@ retry_lookup: } if (rinfo->head->is_target) { - vino.ino = le64_to_cpu(rinfo->targeti.in->ino); - vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); + tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); + tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); - in = ceph_get_inode(sb, vino); + in = ceph_get_inode(sb, tvino); if (IS_ERR(in)) { err = PTR_ERR(in); goto done; @@ -1233,8 +1206,8 @@ retry_lookup: err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL, session, req->r_request_started, - (!req->r_aborted && rinfo->head->result == 0) ? - req->r_fmode : -1, + (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) && + rinfo->head->result == 0) ? req->r_fmode : -1, &req->r_caps_reservation); if (err < 0) { pr_err("fill_inode badness %p %llx.%llx\n", @@ -1247,8 +1220,9 @@ retry_lookup: * ignore null lease/binding on snapdir ENOENT, or else we * will have trouble splicing in the virtual snapdir later */ - if (rinfo->head->is_dentry && !req->r_aborted && - req->r_locked_dir && + if (rinfo->head->is_dentry && + !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) && + test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) && (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name, fsc->mount_options->snapdir_name, req->r_dentry->d_name.len))) { @@ -1257,17 +1231,19 @@ retry_lookup: * mknod symlink mkdir : null -> new inode * unlink : linked -> null */ - struct inode *dir = req->r_locked_dir; + struct inode *dir = req->r_parent; struct dentry *dn = req->r_dentry; bool have_dir_cap, have_lease; BUG_ON(!dn); BUG_ON(!dir); BUG_ON(d_inode(dn->d_parent) != dir); - BUG_ON(ceph_ino(dir) != - le64_to_cpu(rinfo->diri.in->ino)); - BUG_ON(ceph_snap(dir) != - le64_to_cpu(rinfo->diri.in->snapid)); + + dvino.ino = le64_to_cpu(rinfo->diri.in->ino); + dvino.snap = le64_to_cpu(rinfo->diri.in->snapid); + + BUG_ON(ceph_ino(dir) != dvino.ino); + BUG_ON(ceph_snap(dir) != dvino.snap); /* do we have a lease on the whole dir? 
*/ have_dir_cap = @@ -1319,12 +1295,13 @@ retry_lookup: ceph_dir_clear_ordered(dir); dout("d_delete %p\n", dn); d_delete(dn); - } else { - if (have_lease && d_unhashed(dn)) + } else if (have_lease) { + if (d_unhashed(dn)) d_add(dn, NULL); update_dentry_lease(dn, rinfo->dlease, session, - req->r_request_started); + req->r_request_started, + NULL, &dvino); } goto done; } @@ -1347,15 +1324,19 @@ retry_lookup: have_lease = false; } - if (have_lease) + if (have_lease) { + tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); + tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); update_dentry_lease(dn, rinfo->dlease, session, - req->r_request_started); + req->r_request_started, + &tvino, &dvino); + } dout(" final dn %p\n", dn); - } else if (!req->r_aborted && - (req->r_op == CEPH_MDS_OP_LOOKUPSNAP || - req->r_op == CEPH_MDS_OP_MKSNAP)) { + } else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP || + req->r_op == CEPH_MDS_OP_MKSNAP) && + !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { struct dentry *dn = req->r_dentry; - struct inode *dir = req->r_locked_dir; + struct inode *dir = req->r_parent; /* fill out a snapdir LOOKUPSNAP dentry */ BUG_ON(!dn); @@ -1370,6 +1351,26 @@ retry_lookup: goto done; } req->r_dentry = dn; /* may have spliced */ + } else if (rinfo->head->is_dentry) { + struct ceph_vino *ptvino = NULL; + + if ((le32_to_cpu(rinfo->diri.in->cap.caps) & CEPH_CAP_FILE_SHARED) || + le32_to_cpu(rinfo->dlease->duration_ms)) { + dvino.ino = le64_to_cpu(rinfo->diri.in->ino); + dvino.snap = le64_to_cpu(rinfo->diri.in->snapid); + + if (rinfo->head->is_target) { + tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); + tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); + ptvino = &tvino; + } + + update_dentry_lease(req->r_dentry, rinfo->dlease, + session, req->r_request_started, ptvino, + &dvino); + } else { + dout("%s: no dentry lease or dir cap\n", __func__); + } } done: dout("fill_trace done err=%d\n", err); @@ -1478,7 +1479,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, u32 fpos_offset; struct ceph_readdir_cache_control cache_ctl = {}; - if (req->r_aborted) + if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) return readdir_prepopulate_inodes_only(req, session); if (rinfo->hash_order && req->r_path2) { @@ -1523,14 +1524,14 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, /* FIXME: release caps/leases if error occurs */ for (i = 0; i < rinfo->dir_nr; i++) { struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; - struct ceph_vino vino; + struct ceph_vino tvino, dvino; dname.name = rde->name; dname.len = rde->name_len; dname.hash = full_name_hash(parent, dname.name, dname.len); - vino.ino = le64_to_cpu(rde->inode.in->ino); - vino.snap = le64_to_cpu(rde->inode.in->snapid); + tvino.ino = le64_to_cpu(rde->inode.in->ino); + tvino.snap = le64_to_cpu(rde->inode.in->snapid); if (rinfo->hash_order) { u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, @@ -1559,8 +1560,8 @@ retry_lookup: goto out; } } else if (d_really_is_positive(dn) && - (ceph_ino(d_inode(dn)) != vino.ino || - ceph_snap(d_inode(dn)) != vino.snap)) { + (ceph_ino(d_inode(dn)) != tvino.ino || + ceph_snap(d_inode(dn)) != tvino.snap)) { dout(" dn %p points to wrong inode %p\n", dn, d_inode(dn)); d_delete(dn); @@ -1572,7 +1573,7 @@ retry_lookup: if (d_really_is_positive(dn)) { in = d_inode(dn); } else { - in = ceph_get_inode(parent->d_sb, vino); + in = ceph_get_inode(parent->d_sb, tvino); if (IS_ERR(in)) { dout("new_inode badness\n"); d_drop(dn); @@ -1617,8 +1618,9 @@ retry_lookup: ceph_dentry(dn)->offset = 
rde->offset; + dvino = ceph_vino(d_inode(parent)); update_dentry_lease(dn, rde->lease, req->r_session, - req->r_request_started); + req->r_request_started, &tvino, &dvino); if (err == 0 && skipped == 0 && cache_ctl.index >= 0) { ret = fill_readdir_cache(d_inode(parent), dn, @@ -1632,7 +1634,7 @@ next_item: } out: if (err == 0 && skipped == 0) { - req->r_did_prepopulate = true; + set_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags); req->r_readdir_cache_idx = cache_ctl.index; } ceph_readdir_cache_release(&cache_ctl); @@ -1720,7 +1722,7 @@ static void ceph_invalidate_work(struct work_struct *work) mutex_lock(&ci->i_truncate_mutex); - if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n", inode, ceph_ino(inode)); mapping_set_error(inode->i_mapping, -EIO); @@ -2185,10 +2187,10 @@ int ceph_permission(struct inode *inode, int mask) * Get all attributes. Hopefully someday we'll have a statlite() * and can limit the fields we require to be accurate. */ -int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +int ceph_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { - struct inode *inode = d_inode(dentry); + struct inode *inode = d_inode(path->dentry); struct ceph_inode_info *ci = ceph_inode(inode); int err; diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 7d752d5..4c9c72f 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -25,7 +25,7 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg) l.stripe_count = ci->i_layout.stripe_count; l.object_size = ci->i_layout.object_size; l.data_pool = ci->i_layout.pool_id; - l.preferred_osd = (s32)-1; + l.preferred_osd = -1; if (copy_to_user(arg, &l, sizeof(l))) return -EFAULT; } @@ -97,7 +97,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) nl.data_pool = ci->i_layout.pool_id; /* this is obsolete, and always -1 */ - nl.preferred_osd = le64_to_cpu(-1); + nl.preferred_osd = -1; err = __validate_layout(mdsc, &nl); if (err) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index c9d2e55..c681762 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -547,8 +547,8 @@ void ceph_mdsc_release_request(struct kref *kref) ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); iput(req->r_inode); } - if (req->r_locked_dir) - ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); + if (req->r_parent) + ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN); iput(req->r_target_inode); if (req->r_dentry) dput(req->r_dentry); @@ -628,6 +628,9 @@ static void __unregister_request(struct ceph_mds_client *mdsc, { dout("__unregister_request %p tid %lld\n", req, req->r_tid); + /* Never leave an unregistered request on an unsafe list!
*/ + list_del_init(&req->r_unsafe_item); + if (req->r_tid == mdsc->oldest_tid) { struct rb_node *p = rb_next(&req->r_node); mdsc->oldest_tid = 0; @@ -644,13 +647,15 @@ static void __unregister_request(struct ceph_mds_client *mdsc, erase_request(&mdsc->request_tree, req); - if (req->r_unsafe_dir && req->r_got_unsafe) { + if (req->r_unsafe_dir && + test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); spin_lock(&ci->i_unsafe_lock); list_del_init(&req->r_unsafe_dir_item); spin_unlock(&ci->i_unsafe_lock); } - if (req->r_target_inode && req->r_got_unsafe) { + if (req->r_target_inode && + test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); spin_lock(&ci->i_unsafe_lock); list_del_init(&req->r_unsafe_target_item); @@ -668,6 +673,28 @@ static void __unregister_request(struct ceph_mds_client *mdsc, } /* + * Walk back up the dentry tree until we hit a dentry representing a + * non-snapshot inode. We do this using the rcu_read_lock (which must be held + * when calling this) to ensure that the objects won't disappear while we're + * working with them. Once we hit a candidate dentry, we attempt to take a + * reference to it, and return that as the result. + */ +static struct inode *get_nonsnap_parent(struct dentry *dentry) +{ + struct inode *inode = NULL; + + while (dentry && !IS_ROOT(dentry)) { + inode = d_inode_rcu(dentry); + if (!inode || ceph_snap(inode) == CEPH_NOSNAP) + break; + dentry = dentry->d_parent; + } + if (inode) + inode = igrab(inode); + return inode; +} + +/* * Choose mds to send request to next. If there is a hint set in the * request (e.g., due to a prior forward hint from the mds), use that. * Otherwise, consult frag tree and/or caps to identify the @@ -675,19 +702,6 @@ static void __unregister_request(struct ceph_mds_client *mdsc, * * Called under mdsc->mutex. */ -static struct dentry *get_nonsnap_parent(struct dentry *dentry) -{ - /* - * we don't need to worry about protecting the d_parent access - * here because we never renaming inside the snapped namespace - * except to resplice to another snapdir, and either the old or new - * result is a valid result. - */ - while (!IS_ROOT(dentry) && ceph_snap(d_inode(dentry)) != CEPH_NOSNAP) - dentry = dentry->d_parent; - return dentry; -} - static int __choose_mds(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { @@ -697,7 +711,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, int mode = req->r_direct_mode; int mds = -1; u32 hash = req->r_direct_hash; - bool is_hash = req->r_direct_is_hash; + bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); /* * is there a specific mds we should try? ignore hint if we have @@ -717,30 +731,39 @@ static int __choose_mds(struct ceph_mds_client *mdsc, inode = NULL; if (req->r_inode) { inode = req->r_inode; + ihold(inode); } else if (req->r_dentry) { /* ignore race with rename; old or new d_parent is okay */ - struct dentry *parent = req->r_dentry->d_parent; - struct inode *dir = d_inode(parent); + struct dentry *parent; + struct inode *dir; + + rcu_read_lock(); + parent = req->r_dentry->d_parent; + dir = req->r_parent ? : d_inode_rcu(parent); - if (dir->i_sb != mdsc->fsc->sb) { - /* not this fs! 
*/ + if (!dir || dir->i_sb != mdsc->fsc->sb) { + /* not this fs or parent went negative */ inode = d_inode(req->r_dentry); + if (inode) + ihold(inode); } else if (ceph_snap(dir) != CEPH_NOSNAP) { /* direct snapped/virtual snapdir requests * based on parent dir inode */ - struct dentry *dn = get_nonsnap_parent(parent); - inode = d_inode(dn); + inode = get_nonsnap_parent(parent); dout("__choose_mds using nonsnap parent %p\n", inode); } else { /* dentry target */ inode = d_inode(req->r_dentry); if (!inode || mode == USE_AUTH_MDS) { /* dir + name */ - inode = dir; + inode = igrab(dir); hash = ceph_dentry_hash(dir, req->r_dentry); is_hash = true; + } else { + ihold(inode); } } + rcu_read_unlock(); } dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash, @@ -769,7 +792,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, (int)r, frag.ndist); if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= CEPH_MDS_STATE_ACTIVE) - return mds; + goto out; } /* since this file/dir wasn't known to be @@ -784,7 +807,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, inode, ceph_vinop(inode), frag.frag, mds); if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= CEPH_MDS_STATE_ACTIVE) - return mds; + goto out; } } } @@ -797,6 +820,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node); if (!cap) { spin_unlock(&ci->i_ceph_lock); + iput(inode); goto random; } mds = cap->session->s_mds; @@ -804,6 +828,8 @@ static int __choose_mds(struct ceph_mds_client *mdsc, inode, ceph_vinop(inode), mds, cap == ci->i_auth_cap ? "auth " : "", cap); spin_unlock(&ci->i_ceph_lock); +out: + iput(inode); return mds; random: @@ -1036,7 +1062,6 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, while (!list_empty(&session->s_unsafe)) { req = list_first_entry(&session->s_unsafe, struct ceph_mds_request, r_unsafe_item); - list_del_init(&req->r_unsafe_item); pr_warn_ratelimited(" dropping unsafe request %llu\n", req->r_tid); __unregister_request(mdsc, req); @@ -1146,7 +1171,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, ci->i_ceph_flags |= CEPH_I_CAP_DROPPED; if (ci->i_wrbuffer_ref > 0 && - ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) + READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) invalidate = true; while (!list_empty(&ci->i_cap_flush_list)) { @@ -1775,18 +1800,23 @@ retry: return path; } -static int build_dentry_path(struct dentry *dentry, +static int build_dentry_path(struct dentry *dentry, struct inode *dir, const char **ppath, int *ppathlen, u64 *pino, int *pfreepath) { char *path; - if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_NOSNAP) { - *pino = ceph_ino(d_inode(dentry->d_parent)); + rcu_read_lock(); + if (!dir) + dir = d_inode_rcu(dentry->d_parent); + if (dir && ceph_snap(dir) == CEPH_NOSNAP) { + *pino = ceph_ino(dir); + rcu_read_unlock(); *ppath = dentry->d_name.name; *ppathlen = dentry->d_name.len; return 0; } + rcu_read_unlock(); path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1); if (IS_ERR(path)) return PTR_ERR(path); @@ -1822,8 +1852,8 @@ static int build_inode_path(struct inode *inode, * an explicit ino+path. 
*/ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, - const char *rpath, u64 rino, - const char **ppath, int *pathlen, + struct inode *rdiri, const char *rpath, + u64 rino, const char **ppath, int *pathlen, u64 *ino, int *freepath) { int r = 0; @@ -1833,7 +1863,8 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode), ceph_snap(rinode)); } else if (rdentry) { - r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath); + r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino, + freepath); dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, *ppath); } else if (rpath || rino) { @@ -1866,7 +1897,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, int ret; ret = set_request_path_attr(req->r_inode, req->r_dentry, - req->r_path1, req->r_ino1.ino, + req->r_parent, req->r_path1, req->r_ino1.ino, &path1, &pathlen1, &ino1, &freepath1); if (ret < 0) { msg = ERR_PTR(ret); @@ -1874,6 +1905,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, } ret = set_request_path_attr(NULL, req->r_old_dentry, + req->r_old_dentry_dir, req->r_path2, req->r_ino2.ino, &path2, &pathlen2, &ino2, &freepath2); if (ret < 0) { @@ -1927,10 +1959,13 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, mds, req->r_inode_drop, req->r_inode_unless, 0); if (req->r_dentry_drop) releases += ceph_encode_dentry_release(&p, req->r_dentry, - mds, req->r_dentry_drop, req->r_dentry_unless); + req->r_parent, mds, req->r_dentry_drop, + req->r_dentry_unless); if (req->r_old_dentry_drop) releases += ceph_encode_dentry_release(&p, req->r_old_dentry, - mds, req->r_old_dentry_drop, req->r_old_dentry_unless); + req->r_old_dentry_dir, mds, + req->r_old_dentry_drop, + req->r_old_dentry_unless); if (req->r_old_inode_drop) releases += ceph_encode_inode_release(&p, d_inode(req->r_old_dentry), @@ -2012,7 +2047,7 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); - if (req->r_got_unsafe) { + if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { void *p; /* * Replay. 
Do not regenerate message (and rebuild @@ -2061,16 +2096,16 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, rhead = msg->front.iov_base; rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); - if (req->r_got_unsafe) + if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) flags |= CEPH_MDS_FLAG_REPLAY; - if (req->r_locked_dir) + if (req->r_parent) flags |= CEPH_MDS_FLAG_WANT_DENTRY; rhead->flags = cpu_to_le32(flags); rhead->num_fwd = req->r_num_fwd; rhead->num_retry = req->r_attempts - 1; rhead->ino = 0; - dout(" r_locked_dir = %p\n", req->r_locked_dir); + dout(" r_parent = %p\n", req->r_parent); return 0; } @@ -2084,8 +2119,8 @@ static int __do_request(struct ceph_mds_client *mdsc, int mds = -1; int err = 0; - if (req->r_err || req->r_got_result) { - if (req->r_aborted) + if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) { + if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) __unregister_request(mdsc, req); goto out; } @@ -2096,12 +2131,12 @@ static int __do_request(struct ceph_mds_client *mdsc, err = -EIO; goto finish; } - if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { dout("do_request forced umount\n"); err = -EIO; goto finish; } - if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) { + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) { if (mdsc->mdsmap_err) { err = mdsc->mdsmap_err; dout("do_request mdsmap err %d\n", err); @@ -2215,7 +2250,7 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds) while (p) { req = rb_entry(p, struct ceph_mds_request, r_node); p = rb_next(p); - if (req->r_got_unsafe) + if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) continue; if (req->r_attempts > 0) continue; /* only new requests */ @@ -2250,11 +2285,11 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, dout("do_request on %p\n", req); - /* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */ + /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */ if (req->r_inode) ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); - if (req->r_locked_dir) - ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); + if (req->r_parent) + ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN); if (req->r_old_dentry_dir) ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), CEPH_CAP_PIN); @@ -2289,7 +2324,7 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, mutex_lock(&mdsc->mutex); /* only abort if we didn't race with a real reply */ - if (req->r_got_result) { + if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) { err = le32_to_cpu(req->r_reply_info.head->result); } else if (err < 0) { dout("aborted request %lld with %d\n", req->r_tid, err); @@ -2301,10 +2336,10 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, */ mutex_lock(&req->r_fill_mutex); req->r_err = err; - req->r_aborted = true; + set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); mutex_unlock(&req->r_fill_mutex); - if (req->r_locked_dir && + if (req->r_parent && (req->r_op & CEPH_MDS_OP_WRITE)) ceph_invalidate_dir_request(req); } else { @@ -2323,7 +2358,7 @@ out: */ void ceph_invalidate_dir_request(struct ceph_mds_request *req) { - struct inode *inode = req->r_locked_dir; + struct inode *inode = req->r_parent; dout("invalidate_dir_request %p (complete, lease(s))\n", inode); @@ -2379,14 +2414,14 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) } /* dup? 
*/ - if ((req->r_got_unsafe && !head->safe) || - (req->r_got_safe && head->safe)) { + if ((test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags) && !head->safe) || + (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags) && head->safe)) { pr_warn("got a dup %s reply on %llu from mds%d\n", head->safe ? "safe" : "unsafe", tid, mds); mutex_unlock(&mdsc->mutex); goto out; } - if (req->r_got_safe) { + if (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags)) { pr_warn("got unsafe after safe on %llu from mds%d\n", tid, mds); mutex_unlock(&mdsc->mutex); @@ -2425,10 +2460,10 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) if (head->safe) { - req->r_got_safe = true; + set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags); __unregister_request(mdsc, req); - if (req->r_got_unsafe) { + if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { /* * We already handled the unsafe response, now do the * cleanup. No need to examine the response; the MDS @@ -2437,7 +2472,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) * useful we could do with a revised return value. */ dout("got safe reply %llu, mds%d\n", tid, mds); - list_del_init(&req->r_unsafe_item); /* last unsafe request during umount? */ if (mdsc->stopping && !__get_oldest_req(mdsc)) @@ -2446,7 +2480,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) goto out; } } else { - req->r_got_unsafe = true; + set_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags); list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe); if (req->r_unsafe_dir) { struct ceph_inode_info *ci = @@ -2486,7 +2520,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) /* insert trace into our cache */ mutex_lock(&req->r_fill_mutex); current->journal_info = req; - err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); + err = ceph_fill_trace(mdsc->fsc->sb, req); if (err == 0) { if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || req->r_op == CEPH_MDS_OP_LSSNAP)) @@ -2500,7 +2534,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) if (realm) ceph_put_snap_realm(mdsc, realm); - if (err == 0 && req->r_got_unsafe && req->r_target_inode) { + if (err == 0 && req->r_target_inode && + test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); spin_lock(&ci->i_unsafe_lock); list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops); @@ -2508,12 +2543,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) } out_err: mutex_lock(&mdsc->mutex); - if (!req->r_aborted) { + if (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { if (err) { req->r_err = err; } else { req->r_reply = ceph_msg_get(msg); - req->r_got_result = true; + set_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags); } } else { dout("reply arrived after request %lld was aborted\n", tid); @@ -2557,7 +2592,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, goto out; /* dup reply? */ } - if (req->r_aborted) { + if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { dout("forward tid %llu aborted, unregistering\n", tid); __unregister_request(mdsc, req); } else if (fwd_seq <= req->r_num_fwd) { @@ -2567,7 +2602,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, /* resend. 
forward race not possible; mds would drop */ dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds); BUG_ON(req->r_err); - BUG_ON(req->r_got_result); + BUG_ON(test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)); req->r_attempts = 0; req->r_num_fwd = fwd_seq; req->r_resend_mds = next_mds; @@ -2732,7 +2767,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, while (p) { req = rb_entry(p, struct ceph_mds_request, r_node); p = rb_next(p); - if (req->r_got_unsafe) + if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) continue; if (req->r_attempts == 0) continue; /* only old requests */ @@ -3556,7 +3591,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) { u64 want_tid, want_flush; - if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) return; dout("sync\n"); @@ -3587,7 +3622,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) */ static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped) { - if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) return true; return atomic_read(&mdsc->num_sessions) <= skipped; } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 3c6f77b..ac0475a 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -202,9 +202,18 @@ struct ceph_mds_request { char *r_path1, *r_path2; struct ceph_vino r_ino1, r_ino2; - struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */ + struct inode *r_parent; /* parent dir inode */ struct inode *r_target_inode; /* resulting inode */ +#define CEPH_MDS_R_DIRECT_IS_HASH (1) /* r_direct_hash is valid */ +#define CEPH_MDS_R_ABORTED (2) /* call was aborted */ +#define CEPH_MDS_R_GOT_UNSAFE (3) /* got an unsafe reply */ +#define CEPH_MDS_R_GOT_SAFE (4) /* got a safe reply */ +#define CEPH_MDS_R_GOT_RESULT (5) /* got a result */ +#define CEPH_MDS_R_DID_PREPOPULATE (6) /* prepopulated readdir */ +#define CEPH_MDS_R_PARENT_LOCKED (7) /* is r_parent->i_rwsem wlocked? */ + unsigned long r_req_flags; + struct mutex r_fill_mutex; union ceph_mds_request_args r_args; @@ -216,7 +225,6 @@ struct ceph_mds_request { /* for choosing which mds to send this request to */ int r_direct_mode; u32 r_direct_hash; /* choose dir frag based on this dentry hash */ - bool r_direct_is_hash; /* true if r_direct_hash is valid */ /* data payload is used for xattr ops */ struct ceph_pagelist *r_pagelist; @@ -234,7 +242,6 @@ struct ceph_mds_request { struct ceph_mds_reply_info_parsed r_reply_info; struct page *r_locked_page; int r_err; - bool r_aborted; unsigned long r_timeout; /* optional. 
jiffies, 0 is "wait forever" */ unsigned long r_started; /* start time to measure timeout against */ @@ -262,9 +269,7 @@ struct ceph_mds_request { ceph_mds_request_callback_t r_callback; ceph_mds_request_wait_callback_t r_wait_for_completion; struct list_head r_unsafe_item; /* per-session unsafe list item */ - bool r_got_unsafe, r_got_safe, r_got_result; - bool r_did_prepopulate; long long r_dir_release_cnt; long long r_dir_ordered_cnt; int r_readdir_cache_idx; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 6bd20d7..0ec8d01 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -757,7 +757,6 @@ static const struct super_operations ceph_super_ops = { .destroy_inode = ceph_destroy_inode, .write_inode = ceph_write_inode, .drop_inode = ceph_drop_inode, - .evict_inode = ceph_evict_inode, .sync_fs = ceph_sync_fs, .put_super = ceph_put_super, .show_options = ceph_show_options, @@ -952,6 +951,14 @@ static int ceph_register_bdi(struct super_block *sb, fsc->backing_dev_info.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; + if (fsc->mount_options->rsize > fsc->mount_options->rasize && + fsc->mount_options->rsize >= PAGE_SIZE) + fsc->backing_dev_info.io_pages = + (fsc->mount_options->rsize + PAGE_SIZE - 1) + >> PAGE_SHIFT; + else if (fsc->mount_options->rsize == 0) + fsc->backing_dev_info.io_pages = ULONG_MAX; + err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld", atomic_long_inc_return(&bdi_seq)); if (!err) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 3373b61..fe6b9cf 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -45,8 +45,8 @@ #define ceph_test_mount_opt(fsc, opt) \ (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) -#define CEPH_RSIZE_DEFAULT 0 /* max read size */ -#define CEPH_RASIZE_DEFAULT (8192*1024) /* readahead */ +#define CEPH_RSIZE_DEFAULT (64*1024*1024) /* max read size */ +#define CEPH_RASIZE_DEFAULT (8192*1024) /* max readahead */ #define CEPH_MAX_READDIR_DEFAULT 1024 #define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) #define CEPH_SNAPDIRNAME_DEFAULT ".snap" @@ -343,7 +343,6 @@ struct ceph_inode_info { u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. 
*/ u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */ - struct list_head i_unsafe_writes; /* uncommitted sync writes */ struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */ struct list_head i_unsafe_iops; /* uncommitted mds inode ops */ spinlock_t i_unsafe_lock; @@ -602,7 +601,7 @@ static inline int __ceph_caps_wanted(struct ceph_inode_info *ci) } /* what the mds thinks we want */ -extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci); +extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check); extern void ceph_caps_init(struct ceph_mds_client *mdsc); extern void ceph_caps_finalize(struct ceph_mds_client *mdsc); @@ -753,7 +752,6 @@ extern const struct inode_operations ceph_file_iops; extern struct inode *ceph_alloc_inode(struct super_block *sb); extern void ceph_destroy_inode(struct inode *inode); extern int ceph_drop_inode(struct inode *inode); -extern void ceph_evict_inode(struct inode *inode); extern struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino); @@ -764,8 +762,7 @@ extern void ceph_fill_file_time(struct inode *inode, int issued, u64 time_warp_seq, struct timespec *ctime, struct timespec *mtime, struct timespec *atime); extern int ceph_fill_trace(struct super_block *sb, - struct ceph_mds_request *req, - struct ceph_mds_session *session); + struct ceph_mds_request *req); extern int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct ceph_mds_session *session); @@ -787,8 +784,8 @@ static inline int ceph_do_getattr(struct inode *inode, int mask, bool force) extern int ceph_permission(struct inode *inode, int mask); extern int __ceph_setattr(struct inode *inode, struct iattr *attr); extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); -extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat); +extern int ceph_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags); /* xattr.c */ int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int); @@ -904,6 +901,7 @@ extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc); extern int ceph_encode_inode_release(void **p, struct inode *inode, int mds, int drop, int unless, int force); extern int ceph_encode_dentry_release(void **p, struct dentry *dn, + struct inode *dir, int mds, int drop, int unless); extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, @@ -933,7 +931,7 @@ extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, extern int ceph_release(struct inode *inode, struct file *filp); extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, char *data, size_t len); -extern void ceph_sync_write_wait(struct inode *inode); + /* dir.c */ extern const struct file_operations ceph_dir_fops; extern const struct file_operations ceph_snapdir_fops; diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 9156be5..6b61df1 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -303,7 +303,9 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) * gives us the latter, so we must adjust the result. 
*/ mnt = ERR_PTR(-ENOMEM); - full_path = build_path_from_dentry(mntpt); + + /* always use tree name prefix */ + full_path = build_path_from_dentry_optional_prefix(mntpt, true); if (full_path == NULL) goto cdda_exit; diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h index 479bc0a..3d7298c 100644 --- a/fs/cifs/cifs_unicode.h +++ b/fs/cifs/cifs_unicode.h @@ -130,10 +130,10 @@ wchar_t cifs_toupper(wchar_t in); * Returns: * Address of the first string */ -static inline wchar_t * -UniStrcat(wchar_t *ucs1, const wchar_t *ucs2) +static inline __le16 * +UniStrcat(__le16 *ucs1, const __le16 *ucs2) { - wchar_t *anchor = ucs1; /* save a pointer to start of ucs1 */ + __le16 *anchor = ucs1; /* save a pointer to start of ucs1 */ while (*ucs1++) ; /* To end of first string */ ucs1--; /* Return to the null */ diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index c9c00a8..da717fe 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -83,7 +83,7 @@ extern int cifs_revalidate_dentry(struct dentry *); extern int cifs_invalidate_mapping(struct inode *inode); extern int cifs_revalidate_mapping(struct inode *inode); extern int cifs_zap_mapping(struct inode *inode); -extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *); +extern int cifs_getattr(const struct path *, struct kstat *, u32, unsigned int); extern int cifs_setattr(struct dentry *, struct iattr *); extern const struct inode_operations cifs_file_inode_ops; diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 1a90bb3..d42dd32 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -443,6 +443,9 @@ struct smb_version_operations { int (*is_transform_hdr)(void *buf); int (*receive_transform)(struct TCP_Server_Info *, struct mid_q_entry **); + enum securityEnum (*select_sectype)(struct TCP_Server_Info *, + enum securityEnum); + }; struct smb_version_values { @@ -822,7 +825,7 @@ struct cifs_ses { int ses_count; /* reference counter */ enum statusEnum status; unsigned overrideSecFlg; /* if non-zero override global sec flags */ - __u16 ipc_tid; /* special tid for connection to IPC share */ + __u32 ipc_tid; /* special tid for connection to IPC share */ char *serverOS; /* name of operating system underlying server */ char *serverNOS; /* name of network operating system of server */ char *serverDomain; /* security realm of server */ diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index f5b8730..1ce733f 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -2086,17 +2086,21 @@ typedef struct dfs_referral_level_3 { /* version 4 is same, + one flag bit */ __u8 ServiceSiteGuid[16]; /* MBZ, ignored */ } __attribute__((packed)) REFERRAL3; -typedef struct smb_com_transaction_get_dfs_refer_rsp { - struct smb_hdr hdr; /* wct = 10 */ - struct trans2_resp t2; - __u16 ByteCount; - __u8 Pad; +struct get_dfs_referral_rsp { __le16 PathConsumed; __le16 NumberOfReferrals; __le32 DFSFlags; REFERRAL3 referrals[1]; /* array of level 3 dfs_referral structures */ /* followed by the strings pointed to by the referral structures */ -} __attribute__((packed)) TRANSACTION2_GET_DFS_REFER_RSP; +} __packed; + +typedef struct smb_com_transaction_get_dfs_refer_rsp { + struct smb_hdr hdr; /* wct = 10 */ + struct trans2_resp t2; + __u16 ByteCount; + __u8 Pad; + struct get_dfs_referral_rsp dfs_data; +} __packed TRANSACTION2_GET_DFS_REFER_RSP; /* DFS Flags */ #define DFSREF_REFERRAL_SERVER 0x00000001 /* all targets are DFS roots */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 406d2c1..97e5d23 100644 --- a/fs/cifs/cifsproto.h +++ 
b/fs/cifs/cifsproto.h @@ -61,6 +61,8 @@ extern void exit_cifs_idmap(void); extern int init_cifs_spnego(void); extern void exit_cifs_spnego(void); extern char *build_path_from_dentry(struct dentry *); +extern char *build_path_from_dentry_optional_prefix(struct dentry *direntry, + bool prefix); extern char *cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, struct cifs_tcon *tcon, @@ -284,6 +286,11 @@ extern int get_dfs_path(const unsigned int xid, struct cifs_ses *ses, const struct nls_table *nls_codepage, unsigned int *num_referrals, struct dfs_info3_param **referrals, int remap); +extern int parse_dfs_referrals(struct get_dfs_referral_rsp *rsp, u32 rsp_size, + unsigned int *num_of_nodes, + struct dfs_info3_param **target_nodes, + const struct nls_table *nls_codepage, int remap, + const char *searchName, bool is_unicode); extern void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, struct smb_vol *vol); @@ -526,4 +533,6 @@ int cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, int __cifs_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, char *signature, struct shash_desc *shash); +enum securityEnum cifs_select_sectype(struct TCP_Server_Info *, + enum securityEnum); #endif /* _CIFSPROTO_H */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index f5099fb..0669506 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -4786,117 +4786,6 @@ GetInodeNumOut: return rc; } -/* parses DFS refferal V3 structure - * caller is responsible for freeing target_nodes - * returns: - * on success - 0 - * on failure - errno - */ -static int -parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, - unsigned int *num_of_nodes, - struct dfs_info3_param **target_nodes, - const struct nls_table *nls_codepage, int remap, - const char *searchName) -{ - int i, rc = 0; - char *data_end; - bool is_unicode; - struct dfs_referral_level_3 *ref; - - if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) - is_unicode = true; - else - is_unicode = false; - *num_of_nodes = le16_to_cpu(pSMBr->NumberOfReferrals); - - if (*num_of_nodes < 1) { - cifs_dbg(VFS, "num_referrals: must be at least > 0, but we get num_referrals = %d\n", - *num_of_nodes); - rc = -EINVAL; - goto parse_DFS_referrals_exit; - } - - ref = (struct dfs_referral_level_3 *) &(pSMBr->referrals); - if (ref->VersionNumber != cpu_to_le16(3)) { - cifs_dbg(VFS, "Referrals of V%d version are not supported, should be V3\n", - le16_to_cpu(ref->VersionNumber)); - rc = -EINVAL; - goto parse_DFS_referrals_exit; - } - - /* get the upper boundary of the resp buffer */ - data_end = (char *)(&(pSMBr->PathConsumed)) + - le16_to_cpu(pSMBr->t2.DataCount); - - cifs_dbg(FYI, "num_referrals: %d dfs flags: 0x%x ...\n", - *num_of_nodes, le32_to_cpu(pSMBr->DFSFlags)); - - *target_nodes = kcalloc(*num_of_nodes, sizeof(struct dfs_info3_param), - GFP_KERNEL); - if (*target_nodes == NULL) { - rc = -ENOMEM; - goto parse_DFS_referrals_exit; - } - - /* collect necessary data from referrals */ - for (i = 0; i < *num_of_nodes; i++) { - char *temp; - int max_len; - struct dfs_info3_param *node = (*target_nodes)+i; - - node->flags = le32_to_cpu(pSMBr->DFSFlags); - if (is_unicode) { - __le16 *tmp = kmalloc(strlen(searchName)*2 + 2, - GFP_KERNEL); - if (tmp == NULL) { - rc = -ENOMEM; - goto parse_DFS_referrals_exit; - } - cifsConvertToUTF16((__le16 *) tmp, searchName, - PATH_MAX, nls_codepage, remap); - node->path_consumed = cifs_utf16_bytes(tmp, - le16_to_cpu(pSMBr->PathConsumed), - nls_codepage); - 
kfree(tmp); - } else - node->path_consumed = le16_to_cpu(pSMBr->PathConsumed); - - node->server_type = le16_to_cpu(ref->ServerType); - node->ref_flag = le16_to_cpu(ref->ReferralEntryFlags); - - /* copy DfsPath */ - temp = (char *)ref + le16_to_cpu(ref->DfsPathOffset); - max_len = data_end - temp; - node->path_name = cifs_strndup_from_utf16(temp, max_len, - is_unicode, nls_codepage); - if (!node->path_name) { - rc = -ENOMEM; - goto parse_DFS_referrals_exit; - } - - /* copy link target UNC */ - temp = (char *)ref + le16_to_cpu(ref->NetworkAddressOffset); - max_len = data_end - temp; - node->node_name = cifs_strndup_from_utf16(temp, max_len, - is_unicode, nls_codepage); - if (!node->node_name) { - rc = -ENOMEM; - goto parse_DFS_referrals_exit; - } - - ref++; - } - -parse_DFS_referrals_exit: - if (rc) { - free_dfs_info_array(*target_nodes, *num_of_nodes); - *target_nodes = NULL; - *num_of_nodes = 0; - } - return rc; -} - int CIFSGetDFSRefer(const unsigned int xid, struct cifs_ses *ses, const char *search_name, struct dfs_info3_param **target_nodes, @@ -4993,9 +4882,11 @@ getDFSRetry: get_bcc(&pSMBr->hdr), le16_to_cpu(pSMBr->t2.DataOffset)); /* parse returned result into more usable form */ - rc = parse_DFS_referrals(pSMBr, num_of_nodes, - target_nodes, nls_codepage, remap, - search_name); + rc = parse_dfs_referrals(&pSMBr->dfs_data, + le16_to_cpu(pSMBr->t2.DataCount), + num_of_nodes, target_nodes, nls_codepage, + remap, search_name, + (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) != 0); GetDFSRefExit: cifs_buf_release(pSMB); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 777ad9f..9ae695a 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -21,6 +21,7 @@ #include <linux/fs.h> #include <linux/net.h> #include <linux/string.h> +#include <linux/sched/signal.h> #include <linux/list.h> #include <linux/wait.h> #include <linux/slab.h> @@ -2073,7 +2074,8 @@ match_security(struct TCP_Server_Info *server, struct smb_vol *vol) * that was specified, or "Unspecified" if that sectype was not * compatible with the given NEGOTIATE request. */ - if (select_sectype(server, vol->sectype) == Unspecified) + if (server->ops->select_sectype(server, vol->sectype) + == Unspecified) return false; /* @@ -2455,7 +2457,7 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses) } down_read(&key->sem); - upayload = user_key_payload(key); + upayload = user_key_payload_locked(key); if (IS_ERR_OR_NULL(upayload)) { rc = upayload ? 
PTR_ERR(upayload) : -EINVAL; goto out_key_put; diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 2c227a9..56366e9 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -81,6 +81,17 @@ cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, char * build_path_from_dentry(struct dentry *direntry) { + struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + bool prefix = tcon->Flags & SMB_SHARE_IS_IN_DFS; + + return build_path_from_dentry_optional_prefix(direntry, + prefix); +} + +char * +build_path_from_dentry_optional_prefix(struct dentry *direntry, bool prefix) +{ struct dentry *temp; int namelen; int dfsplen; @@ -92,7 +103,7 @@ build_path_from_dentry(struct dentry *direntry) unsigned seq; dirsep = CIFS_DIR_SEP(cifs_sb); - if (tcon->Flags & SMB_SHARE_IS_IN_DFS) + if (prefix) dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); else dfsplen = 0; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 7ab5be7..b261db3 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -23,6 +23,8 @@ #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/freezer.h> +#include <linux/sched/signal.h> + #include <asm/div64.h> #include "cifsfs.h" #include "cifspdu.h" @@ -1990,9 +1992,10 @@ int cifs_revalidate_dentry(struct dentry *dentry) return cifs_revalidate_mapping(inode); } -int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +int cifs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { + struct dentry *dentry = path->dentry; struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); struct inode *inode = d_inode(dentry); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index c672915..d3fb115 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -640,3 +640,108 @@ cifs_add_pending_open(struct cifs_fid *fid, struct tcon_link *tlink, cifs_add_pending_open_locked(fid, tlink, open); spin_unlock(&tlink_tcon(open->tlink)->open_file_lock); } + +/* parses DFS refferal V3 structure + * caller is responsible for freeing target_nodes + * returns: + * - on success - 0 + * - on failure - errno + */ +int +parse_dfs_referrals(struct get_dfs_referral_rsp *rsp, u32 rsp_size, + unsigned int *num_of_nodes, + struct dfs_info3_param **target_nodes, + const struct nls_table *nls_codepage, int remap, + const char *searchName, bool is_unicode) +{ + int i, rc = 0; + char *data_end; + struct dfs_referral_level_3 *ref; + + *num_of_nodes = le16_to_cpu(rsp->NumberOfReferrals); + + if (*num_of_nodes < 1) { + cifs_dbg(VFS, "num_referrals: must be at least > 0, but we get num_referrals = %d\n", + *num_of_nodes); + rc = -EINVAL; + goto parse_DFS_referrals_exit; + } + + ref = (struct dfs_referral_level_3 *) &(rsp->referrals); + if (ref->VersionNumber != cpu_to_le16(3)) { + cifs_dbg(VFS, "Referrals of V%d version are not supported, should be V3\n", + le16_to_cpu(ref->VersionNumber)); + rc = -EINVAL; + goto parse_DFS_referrals_exit; + } + + /* get the upper boundary of the resp buffer */ + data_end = (char *)rsp + rsp_size; + + cifs_dbg(FYI, "num_referrals: %d dfs flags: 0x%x ...\n", + *num_of_nodes, le32_to_cpu(rsp->DFSFlags)); + + *target_nodes = kcalloc(*num_of_nodes, sizeof(struct dfs_info3_param), + GFP_KERNEL); + if (*target_nodes == NULL) { + rc = -ENOMEM; + goto parse_DFS_referrals_exit; + } + + /* collect necessary data from referrals */ + for (i = 0; i < *num_of_nodes; i++) { + char *temp; + int max_len; + struct dfs_info3_param *node 
= (*target_nodes)+i; + + node->flags = le32_to_cpu(rsp->DFSFlags); + if (is_unicode) { + __le16 *tmp = kmalloc(strlen(searchName)*2 + 2, + GFP_KERNEL); + if (tmp == NULL) { + rc = -ENOMEM; + goto parse_DFS_referrals_exit; + } + cifsConvertToUTF16((__le16 *) tmp, searchName, + PATH_MAX, nls_codepage, remap); + node->path_consumed = cifs_utf16_bytes(tmp, + le16_to_cpu(rsp->PathConsumed), + nls_codepage); + kfree(tmp); + } else + node->path_consumed = le16_to_cpu(rsp->PathConsumed); + + node->server_type = le16_to_cpu(ref->ServerType); + node->ref_flag = le16_to_cpu(ref->ReferralEntryFlags); + + /* copy DfsPath */ + temp = (char *)ref + le16_to_cpu(ref->DfsPathOffset); + max_len = data_end - temp; + node->path_name = cifs_strndup_from_utf16(temp, max_len, + is_unicode, nls_codepage); + if (!node->path_name) { + rc = -ENOMEM; + goto parse_DFS_referrals_exit; + } + + /* copy link target UNC */ + temp = (char *)ref + le16_to_cpu(ref->NetworkAddressOffset); + max_len = data_end - temp; + node->node_name = cifs_strndup_from_utf16(temp, max_len, + is_unicode, nls_codepage); + if (!node->node_name) { + rc = -ENOMEM; + goto parse_DFS_referrals_exit; + } + + ref++; + } + +parse_DFS_referrals_exit: + if (rc) { + free_dfs_info_array(*target_nodes, *num_of_nodes); + *target_nodes = NULL; + *num_of_nodes = 0; + } + return rc; +} diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index dcbcc92..8b0502c 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -498,7 +498,7 @@ setup_ntlmv2_ret: } enum securityEnum -select_sectype(struct TCP_Server_Info *server, enum securityEnum requested) +cifs_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested) { switch (server->negflavor) { case CIFS_NEGFLAVOR_EXTENDED: @@ -1391,7 +1391,7 @@ static int select_sec(struct cifs_ses *ses, struct sess_data *sess_data) { int type; - type = select_sectype(ses->server, ses->sectype); + type = cifs_select_sectype(ses->server, ses->sectype); cifs_dbg(FYI, "sess setup type %d\n", type); if (type == Unspecified) { cifs_dbg(VFS, diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 67a987e..cc93ba4 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -1087,6 +1087,7 @@ struct smb_version_operations smb1_operations = { .is_read_op = cifs_is_read_op, .wp_retry_size = cifs_wp_retry_size, .dir_needs_close = cifs_dir_needs_close, + .select_sectype = cifs_select_sectype, #ifdef CONFIG_CIFS_XATTR .query_all_EAs = CIFSSMBQAllEAs, .set_EA = CIFSSMBSetEA, diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index b2aff0c..b4b1f03 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -73,7 +73,8 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, nr_ioctl_req.Timeout = 0; /* use server default (120 seconds) */ nr_ioctl_req.Reserved = 0; rc = SMB2_ioctl(xid, oparms->tcon, fid->persistent_fid, - fid->volatile_fid, FSCTL_LMR_REQUEST_RESILIENCY, true, + fid->volatile_fid, FSCTL_LMR_REQUEST_RESILIENCY, + true /* is_fsctl */, false /* use_ipc */, (char *)&nr_ioctl_req, sizeof(nr_ioctl_req), NULL, NULL /* no return info */); if (rc == -EOPNOTSUPP) { diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index a44b4db..0231108 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -282,6 +282,7 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, FSCTL_QUERY_NETWORK_INTERFACE_INFO, true /* is_fsctl */, + false /* use_ipc */, NULL /* no data input */, 0 /* no data input */, (char **)&out_buf, &ret_data_len); if (rc != 0) @@ -571,6 
+572,7 @@ SMB2_request_res_key(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid, FSCTL_SRV_REQUEST_RESUME_KEY, true /* is_fsctl */, + false /* use_ipc */, NULL, 0 /* no input */, (char **)&res_key, &ret_data_len); @@ -635,7 +637,8 @@ smb2_clone_range(const unsigned int xid, /* Request server copy to target from src identified by key */ rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid, trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE, - true /* is_fsctl */, (char *)pcchunk, + true /* is_fsctl */, false /* use_ipc */, + (char *)pcchunk, sizeof(struct copychunk_ioctl), (char **)&retbuf, &ret_data_len); if (rc == 0) { @@ -787,7 +790,8 @@ static bool smb2_set_sparse(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, FSCTL_SET_SPARSE, - true /* is_fctl */, &setsparse, 1, NULL, NULL); + true /* is_fctl */, false /* use_ipc */, + &setsparse, 1, NULL, NULL); if (rc) { tcon->broken_sparse_sup = true; cifs_dbg(FYI, "set sparse rc = %d\n", rc); @@ -857,7 +861,8 @@ smb2_duplicate_extents(const unsigned int xid, rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid, trgtfile->fid.volatile_fid, FSCTL_DUPLICATE_EXTENTS_TO_FILE, - true /* is_fsctl */, (char *)&dup_ext_buf, + true /* is_fsctl */, false /* use_ipc */, + (char *)&dup_ext_buf, sizeof(struct duplicate_extents_to_file), NULL, &ret_data_len); @@ -891,7 +896,8 @@ smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon, return SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, FSCTL_SET_INTEGRITY_INFORMATION, - true /* is_fsctl */, (char *)&integr_info, + true /* is_fsctl */, false /* use_ipc */, + (char *)&integr_info, sizeof(struct fsctl_set_integrity_information_req), NULL, &ret_data_len); @@ -910,7 +916,8 @@ smb3_enum_snapshots(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, FSCTL_SRV_ENUMERATE_SNAPSHOTS, - true /* is_fsctl */, NULL, 0 /* no input data */, + true /* is_fsctl */, false /* use_ipc */, + NULL, 0 /* no input data */, (char **)&retbuf, &ret_data_len); cifs_dbg(FYI, "enum snaphots ioctl returned %d and ret buflen is %d\n", @@ -1097,6 +1104,103 @@ smb2_new_lease_key(struct cifs_fid *fid) generate_random_uuid(fid->lease_key); } +static int +smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses, + const char *search_name, + struct dfs_info3_param **target_nodes, + unsigned int *num_of_nodes, + const struct nls_table *nls_codepage, int remap) +{ + int rc; + __le16 *utf16_path = NULL; + int utf16_path_len = 0; + struct cifs_tcon *tcon; + struct fsctl_get_dfs_referral_req *dfs_req = NULL; + struct get_dfs_referral_rsp *dfs_rsp = NULL; + u32 dfs_req_size = 0, dfs_rsp_size = 0; + + cifs_dbg(FYI, "smb2_get_dfs_refer path <%s>\n", search_name); + + /* + * Use any tcon from the current session. Here, the first one. 
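
For context on the loop that follows: the new smb2_get_dfs_refer() prefers the session's IPC tree id and only falls back to the borrowed tcon when no IPC tid is available, retrying the whole exchange while the transport reports -EAGAIN. Below is a minimal userspace sketch of that control flow; issue_over_ipc() and issue_over_tcon() are invented stubs standing in for SMB2_ioctl() with use_ipc true and false respectively.

#include <errno.h>
#include <stdio.h>

/* Invented stand-ins for "send the FSCTL over the IPC$ tree" and over a
 * regular tree connect; they exist only to show the control flow. */
static int issue_over_ipc(const char *path)  { (void)path; return -ENOTCONN; }
static int issue_over_tcon(const char *path) { (void)path; return 0; }

/*
 * Shape of the referral request in the patch: try the dedicated IPC tid
 * first, fall back to an ordinary tcon if the session has no IPC tid,
 * and repeat the exchange while it fails with a transient -EAGAIN.
 */
static int get_referral(const char *path)
{
        int rc;

        do {
                rc = issue_over_ipc(path);
                if (rc == -ENOTCONN)
                        rc = issue_over_tcon(path);
        } while (rc == -EAGAIN);

        return rc;
}

int main(void)
{
        printf("rc = %d\n", get_referral("\\\\srv\\dfsroot\\link"));
        return 0;
}

With the stubs above the IPC attempt reports -ENOTCONN, the fallback succeeds, and the program prints "rc = 0"; in the driver the same shape lets one code path serve servers with and without an established IPC tree.
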
+ */ + spin_lock(&cifs_tcp_ses_lock); + tcon = list_first_entry_or_null(&ses->tcon_list, struct cifs_tcon, + tcon_list); + if (tcon) + tcon->tc_count++; + spin_unlock(&cifs_tcp_ses_lock); + + if (!tcon) { + cifs_dbg(VFS, "session %p has no tcon available for a dfs referral request\n", + ses); + rc = -ENOTCONN; + goto out; + } + + utf16_path = cifs_strndup_to_utf16(search_name, PATH_MAX, + &utf16_path_len, + nls_codepage, remap); + if (!utf16_path) { + rc = -ENOMEM; + goto out; + } + + dfs_req_size = sizeof(*dfs_req) + utf16_path_len; + dfs_req = kzalloc(dfs_req_size, GFP_KERNEL); + if (!dfs_req) { + rc = -ENOMEM; + goto out; + } + + /* Highest DFS referral version understood */ + dfs_req->MaxReferralLevel = DFS_VERSION; + + /* Path to resolve in an UTF-16 null-terminated string */ + memcpy(dfs_req->RequestFileName, utf16_path, utf16_path_len); + + do { + /* try first with IPC */ + rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, + FSCTL_DFS_GET_REFERRALS, + true /* is_fsctl */, true /* use_ipc */, + (char *)dfs_req, dfs_req_size, + (char **)&dfs_rsp, &dfs_rsp_size); + if (rc == -ENOTCONN) { + /* try with normal tcon */ + rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, + FSCTL_DFS_GET_REFERRALS, + true /* is_fsctl */, false /*use_ipc*/, + (char *)dfs_req, dfs_req_size, + (char **)&dfs_rsp, &dfs_rsp_size); + } + } while (rc == -EAGAIN); + + if (rc) { + cifs_dbg(VFS, "ioctl error in smb2_get_dfs_refer rc=%d\n", rc); + goto out; + } + + rc = parse_dfs_referrals(dfs_rsp, dfs_rsp_size, + num_of_nodes, target_nodes, + nls_codepage, remap, search_name, + true /* is_unicode */); + if (rc) { + cifs_dbg(VFS, "parse error in smb2_get_dfs_refer rc=%d\n", rc); + goto out; + } + + out: + if (tcon) { + spin_lock(&cifs_tcp_ses_lock); + tcon->tc_count--; + spin_unlock(&cifs_tcp_ses_lock); + } + kfree(utf16_path); + kfree(dfs_req); + kfree(dfs_rsp); + return rc; +} #define SMB2_SYMLINK_STRUCT_SIZE \ (sizeof(struct smb2_err_rsp) - 1 + sizeof(struct smb2_symlink_err_rsp)) @@ -1220,7 +1324,8 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, - true /* is_fctl */, (char *)&fsctl_buf, + true /* is_fctl */, false /* use_ipc */, + (char *)&fsctl_buf, sizeof(struct file_zero_data_information), NULL, NULL); free_xid(xid); return rc; @@ -1254,7 +1359,8 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, - true /* is_fctl */, (char *)&fsctl_buf, + true /* is_fctl */, false /* use_ipc */, + (char *)&fsctl_buf, sizeof(struct file_zero_data_information), NULL, NULL); free_xid(xid); return rc; @@ -1609,6 +1715,26 @@ static void cifs_crypt_complete(struct crypto_async_request *req, int err) complete(&res->completion); } +static int +smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key) +{ + struct cifs_ses *ses; + u8 *ses_enc_key; + + spin_lock(&cifs_tcp_ses_lock); + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + if (ses->Suid != ses_id) + continue; + ses_enc_key = enc ? ses->smb3encryptionkey : + ses->smb3decryptionkey; + memcpy(key, ses_enc_key, SMB3_SIGN_KEY_SIZE); + spin_unlock(&cifs_tcp_ses_lock); + return 0; + } + spin_unlock(&cifs_tcp_ses_lock); + + return 1; +} /* * Encrypt or decrypt @rqst message. 
@rqst has the following format: * iov[0] - transform header (associate data), @@ -1622,10 +1748,10 @@ crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc) struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)rqst->rq_iov[0].iov_base; unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 24; - struct cifs_ses *ses; int rc = 0; struct scatterlist *sg; u8 sign[SMB2_SIGNATURE_SIZE] = {}; + u8 key[SMB3_SIGN_KEY_SIZE]; struct aead_request *req; char *iv; unsigned int iv_len; @@ -1635,9 +1761,10 @@ crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc) init_completion(&result.completion); - ses = smb2_find_smb_ses(server, tr_hdr->SessionId); - if (!ses) { - cifs_dbg(VFS, "%s: Could not find session\n", __func__); + rc = smb2_get_enc_key(server, tr_hdr->SessionId, enc, key); + if (rc) { + cifs_dbg(VFS, "%s: Could not get %scryption key\n", __func__, + enc ? "en" : "de"); return 0; } @@ -1649,8 +1776,7 @@ crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc) tfm = enc ? server->secmech.ccmaesencrypt : server->secmech.ccmaesdecrypt; - rc = crypto_aead_setkey(tfm, enc ? ses->smb3encryptionkey : - ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE); + rc = crypto_aead_setkey(tfm, key, SMB3_SIGN_KEY_SIZE); if (rc) { cifs_dbg(VFS, "%s: Failed to set aead key %d\n", __func__, rc); return rc; @@ -2254,6 +2380,8 @@ struct smb_version_operations smb20_operations = { .clone_range = smb2_clone_range, .wp_retry_size = smb2_wp_retry_size, .dir_needs_close = smb2_dir_needs_close, + .get_dfs_refer = smb2_get_dfs_refer, + .select_sectype = smb2_select_sectype, }; struct smb_version_operations smb21_operations = { @@ -2335,6 +2463,8 @@ struct smb_version_operations smb21_operations = { .wp_retry_size = smb2_wp_retry_size, .dir_needs_close = smb2_dir_needs_close, .enum_snapshots = smb3_enum_snapshots, + .get_dfs_refer = smb2_get_dfs_refer, + .select_sectype = smb2_select_sectype, }; struct smb_version_operations smb30_operations = { @@ -2426,6 +2556,8 @@ struct smb_version_operations smb30_operations = { .free_transform_rq = smb3_free_transform_rq, .is_transform_hdr = smb3_is_transform_hdr, .receive_transform = smb3_receive_transform, + .get_dfs_refer = smb2_get_dfs_refer, + .select_sectype = smb2_select_sectype, }; #ifdef CONFIG_CIFS_SMB311 @@ -2518,6 +2650,8 @@ struct smb_version_operations smb311_operations = { .free_transform_rq = smb3_free_transform_rq, .is_transform_hdr = smb3_is_transform_hdr, .receive_transform = smb3_receive_transform, + .get_dfs_refer = smb2_get_dfs_refer, + .select_sectype = smb2_select_sectype, }; #endif /* CIFS_SMB311 */ diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index ad83b3d..7446496 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -620,6 +620,7 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */, + false /* use_ipc */, (char *)&vneg_inbuf, sizeof(struct validate_negotiate_info_req), (char **)&pneg_rsp, &rsplen); @@ -656,6 +657,28 @@ vneg_out: return -EIO; } +enum securityEnum +smb2_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested) +{ + switch (requested) { + case Kerberos: + case RawNTLMSSP: + return requested; + case NTLMv2: + return RawNTLMSSP; + case Unspecified: + if (server->sec_ntlmssp && + (global_secflags & CIFSSEC_MAY_NTLMSSP)) + return RawNTLMSSP; + if ((server->sec_kerberos || server->sec_mskerberos) && + 
(global_secflags & CIFSSEC_MAY_KRB5)) + return Kerberos; + /* Fallthrough */ + default: + return Unspecified; + } +} + struct SMB2_sess_data { unsigned int xid; struct cifs_ses *ses; @@ -1008,10 +1031,17 @@ out: static int SMB2_select_sec(struct cifs_ses *ses, struct SMB2_sess_data *sess_data) { - if (ses->sectype != Kerberos && ses->sectype != RawNTLMSSP) - ses->sectype = RawNTLMSSP; + int type; + + type = smb2_select_sectype(ses->server, ses->sectype); + cifs_dbg(FYI, "sess setup type %d\n", type); + if (type == Unspecified) { + cifs_dbg(VFS, + "Unable to select appropriate authentication method!"); + return -EINVAL; + } - switch (ses->sectype) { + switch (type) { case Kerberos: sess_data->func = SMB2_auth_kerberos; break; @@ -1019,7 +1049,7 @@ SMB2_select_sec(struct cifs_ses *ses, struct SMB2_sess_data *sess_data) sess_data->func = SMB2_sess_auth_rawntlmssp_negotiate; break; default: - cifs_dbg(VFS, "secType %d not supported!\n", ses->sectype); + cifs_dbg(VFS, "secType %d not supported!\n", type); return -EOPNOTSUPP; } @@ -1167,8 +1197,8 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, /* since no tcon, smb2_init can not do this, so do here */ req->hdr.sync_hdr.SessionId = ses->Suid; - /* if (ses->server->sec_mode & SECMODE_SIGN_REQUIRED) - req->hdr.Flags |= SMB2_FLAGS_SIGNED; */ + if (ses->server->sign) + req->hdr.sync_hdr.Flags |= SMB2_FLAGS_SIGNED; } else if (encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; @@ -1527,6 +1557,51 @@ add_durable_context(struct kvec *iov, unsigned int *num_iovec, return 0; } +static int +alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len, + const char *treename, const __le16 *path) +{ + int treename_len, path_len; + struct nls_table *cp; + const __le16 sep[] = {cpu_to_le16('\\'), cpu_to_le16(0x0000)}; + + /* + * skip leading "\\" + */ + treename_len = strlen(treename); + if (treename_len < 2 || !(treename[0] == '\\' && treename[1] == '\\')) + return -EINVAL; + + treename += 2; + treename_len -= 2; + + path_len = UniStrnlen((wchar_t *)path, PATH_MAX); + + /* + * make room for one path separator between the treename and + * path + */ + *out_len = treename_len + 1 + path_len; + + /* + * final path needs to be null-terminated UTF16 with a + * size aligned to 8 + */ + + *out_size = roundup((*out_len+1)*2, 8); + *out_path = kzalloc(*out_size, GFP_KERNEL); + if (!*out_path) + return -ENOMEM; + + cp = load_nls_default(); + cifs_strtoUTF16(*out_path, treename, treename_len, cp); + UniStrcat(*out_path, sep); + UniStrcat(*out_path, path); + unload_nls(cp); + + return 0; +} + int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, __u8 *oplock, struct smb2_file_all_info *buf, @@ -1575,30 +1650,49 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, req->ShareAccess = FILE_SHARE_ALL_LE; req->CreateDisposition = cpu_to_le32(oparms->disposition); req->CreateOptions = cpu_to_le32(oparms->create_options & CREATE_OPTIONS_MASK); - uni_path_len = (2 * UniStrnlen((wchar_t *)path, PATH_MAX)) + 2; - /* do not count rfc1001 len field */ - req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req) - 4); iov[0].iov_base = (char *)req; /* 4 for rfc1002 length field */ iov[0].iov_len = get_rfc1002_length(req) + 4; - - /* MUST set path len (NameLength) to 0 opening root of share */ - req->NameLength = cpu_to_le16(uni_path_len - 2); /* -1 since last byte is buf[0] which is sent below (path) */ iov[0].iov_len--; - if (uni_path_len % 8 != 0) { - copy_size = uni_path_len 
/ 8 * 8; - if (copy_size < uni_path_len) - copy_size += 8; - - copy_path = kzalloc(copy_size, GFP_KERNEL); - if (!copy_path) - return -ENOMEM; - memcpy((char *)copy_path, (const char *)path, - uni_path_len); + + req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req) - 4); + + /* [MS-SMB2] 2.2.13 NameOffset: + * If SMB2_FLAGS_DFS_OPERATIONS is set in the Flags field of + * the SMB2 header, the file name includes a prefix that will + * be processed during DFS name normalization as specified in + * section 3.3.5.9. Otherwise, the file name is relative to + * the share that is identified by the TreeId in the SMB2 + * header. + */ + if (tcon->share_flags & SHI1005_FLAGS_DFS) { + int name_len; + + req->hdr.sync_hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS; + rc = alloc_path_with_tree_prefix(©_path, ©_size, + &name_len, + tcon->treeName, path); + if (rc) + return rc; + req->NameLength = cpu_to_le16(name_len * 2); uni_path_len = copy_size; path = copy_path; + } else { + uni_path_len = (2 * UniStrnlen((wchar_t *)path, PATH_MAX)) + 2; + /* MUST set path len (NameLength) to 0 opening root of share */ + req->NameLength = cpu_to_le16(uni_path_len - 2); + if (uni_path_len % 8 != 0) { + copy_size = roundup(uni_path_len, 8); + copy_path = kzalloc(copy_size, GFP_KERNEL); + if (!copy_path) + return -ENOMEM; + memcpy((char *)copy_path, (const char *)path, + uni_path_len); + uni_path_len = copy_size; + path = copy_path; + } } iov[1].iov_len = uni_path_len; @@ -1683,8 +1777,9 @@ creat_exit: */ int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, - u64 volatile_fid, u32 opcode, bool is_fsctl, char *in_data, - u32 indatalen, char **out_data, u32 *plen /* returned data len */) + u64 volatile_fid, u32 opcode, bool is_fsctl, bool use_ipc, + char *in_data, u32 indatalen, + char **out_data, u32 *plen /* returned data len */) { struct smb2_ioctl_req *req; struct smb2_ioctl_rsp *rsp; @@ -1721,6 +1816,16 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, if (rc) return rc; + if (use_ipc) { + if (ses->ipc_tid == 0) { + cifs_small_buf_release(req); + return -ENOTCONN; + } + + cifs_dbg(FYI, "replacing tid 0x%x with IPC tid 0x%x\n", + req->hdr.sync_hdr.TreeId, ses->ipc_tid); + req->hdr.sync_hdr.TreeId = ses->ipc_tid; + } if (encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; @@ -1843,6 +1948,7 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid, FSCTL_SET_COMPRESSION, true /* is_fsctl */, + false /* use_ipc */, (char *)&fsctl_input /* data input */, 2 /* in data len */, &ret_data /* out data */, NULL); diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index c03b252..18700fd 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -695,6 +695,14 @@ struct fsctl_get_integrity_information_rsp { /* Integrity flags for above */ #define FSCTL_INTEGRITY_FLAG_CHECKSUM_ENFORCEMENT_OFF 0x00000001 +/* See MS-DFSC 2.2.2 */ +struct fsctl_get_dfs_referral_req { + __le16 MaxReferralLevel; + __u8 RequestFileName[]; +} __packed; + +/* DFS response is struct get_dfs_refer_rsp */ + /* See MS-SMB2 2.2.31.3 */ struct network_resiliency_req { __le32 Timeout; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 85fc7a7..69e3587 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -121,7 +121,8 @@ extern int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, struct smb2_err_rsp **err_buf); extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 
persistent_fid, u64 volatile_fid, u32 opcode, - bool is_fsctl, char *in_data, u32 indatalen, + bool is_fsctl, bool use_ipc, + char *in_data, u32 indatalen, char **out_data, u32 *plen /* returned data len */); extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id); @@ -180,4 +181,6 @@ extern int SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, __u8 *lease_key, const __le32 lease_state); extern int smb3_validate_negotiate(const unsigned int, struct cifs_tcon *); +extern enum securityEnum smb2_select_sectype(struct TCP_Server_Info *, + enum securityEnum); #endif /* _SMB2PROTO_H */ diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h index 5104d84..d3c3618 100644 --- a/fs/coda/coda_linux.h +++ b/fs/coda/coda_linux.h @@ -47,7 +47,7 @@ int coda_open(struct inode *i, struct file *f); int coda_release(struct inode *i, struct file *f); int coda_permission(struct inode *inode, int mask); int coda_revalidate_inode(struct inode *); -int coda_getattr(struct vfsmount *, struct dentry *, struct kstat *); +int coda_getattr(const struct path *, struct kstat *, u32, unsigned int); int coda_setattr(struct dentry *, struct iattr *); /* this file: heloers */ diff --git a/fs/coda/file.c b/fs/coda/file.c index 6e0154e..9d956cd 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -96,7 +96,7 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma) cfi->cfi_mapcount++; spin_unlock(&cii->c_lock); - return host_file->f_op->mmap(host_file, vma); + return call_mmap(host_file, vma); } int coda_open(struct inode *coda_inode, struct file *coda_file) diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 71dbe7e..2dea594 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -255,11 +255,12 @@ static void coda_evict_inode(struct inode *inode) coda_cache_clear_inode(inode); } -int coda_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +int coda_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { - int err = coda_revalidate_inode(d_inode(dentry)); + int err = coda_revalidate_inode(d_inode(path->dentry)); if (!err) - generic_fillattr(d_inode(dentry), stat); + generic_fillattr(d_inode(path->dentry), stat); return err; } diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 8226291..f40e395 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -22,7 +22,7 @@ #include <linux/kernel.h> #include <linux/major.h> #include <linux/time.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/ioport.h> #include <linux/fcntl.h> diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index f6c6c8a..e82357c 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -15,7 +15,7 @@ */ #include <linux/signal.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> diff --git a/fs/compat.c b/fs/compat.c index e50a211..c61b506 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -21,6 +21,7 @@ #include <linux/compat.h> #include <linux/errno.h> #include <linux/time.h> +#include <linux/cred.h> #include <linux/fs.h> #include <linux/fcntl.h> #include <linux/namei.h> diff --git a/fs/coredump.c b/fs/coredump.c index ae6b056..5926837 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -16,6 +16,9 @@ #include <linux/personality.h> #include <linux/binfmts.h> #include <linux/coredump.h> +#include <linux/sched/coredump.h> +#include <linux/sched/signal.h> +#include <linux/sched/task_stack.h> #include 
<linux/utsname.h> #include <linux/pid_namespace.h> #include <linux/module.h> @@ -33,7 +36,6 @@ #include <linux/pipe_fs_i.h> #include <linux/oom.h> #include <linux/compat.h> -#include <linux/sched.h> #include <linux/fs.h> #include <linux/path.h> #include <linux/timekeeping.h> diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 02eb6b9..d5d896f 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -103,7 +103,7 @@ static int validate_user_key(struct fscrypt_info *crypt_info, goto out; } down_read(&keyring_key->sem); - ukp = user_key_payload(keyring_key); + ukp = user_key_payload_locked(keyring_key); if (ukp->datalen != sizeof(struct fscrypt_key)) { res = -EINVAL; up_read(&keyring_key->sem); @@ -27,6 +27,7 @@ #include <linux/pagevec.h> #include <linux/pmem.h> #include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/uio.h> #include <linux/vmstat.h> #include <linux/pfn_t.h> diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 1ce908c..23488f5 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -17,6 +17,7 @@ #include <linux/dlm.h> #include <linux/dlm_device.h> #include <linux/slab.h> +#include <linux/sched/signal.h> #include "dlm_internal.h" #include "lockspace.h" diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 599a292..95c1c8d 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -117,7 +117,7 @@ ecryptfs_get_key_payload_data(struct key *key) auth_tok = ecryptfs_get_encrypted_key_payload_data(key); if (!auth_tok) - return (struct ecryptfs_auth_tok *)user_key_payload(key)->data; + return (struct ecryptfs_auth_tok *)user_key_payload_locked(key)->data; else return auth_tok; } diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index e7413f8..efc2db4 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -959,9 +959,10 @@ out: return rc; } -static int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +static int ecryptfs_getattr_link(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { + struct dentry *dentry = path->dentry; struct ecryptfs_mount_crypt_stat *mount_crypt_stat; int rc = 0; @@ -983,13 +984,15 @@ static int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry, return rc; } -static int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +static int ecryptfs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { + struct dentry *dentry = path->dentry; struct kstat lower_stat; int rc; - rc = vfs_getattr(ecryptfs_dentry_to_lower_path(dentry), &lower_stat); + rc = vfs_getattr(ecryptfs_dentry_to_lower_path(dentry), &lower_stat, + request_mask, flags); if (!rc) { fsstack_copy_attr_all(d_inode(dentry), ecryptfs_inode_to_lower(d_inode(dentry))); diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index 158a3a3..039e627 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -22,6 +22,8 @@ #include <linux/fs.h> #include <linux/pagemap.h> +#include <linux/sched/signal.h> + #include "ecryptfs_kernel.h" /** diff --git a/fs/eventfd.c b/fs/eventfd.c index 1231cd1..68b9fff 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -9,7 +9,7 @@ #include <linux/poll.h> #include <linux/init.h> #include <linux/fs.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/list.h> diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 5ec1631..3412514 100644 --- 
a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -13,7 +13,7 @@ #include <linux/init.h> #include <linux/kernel.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/signal.h> @@ -32,6 +32,11 @@ #include <linux/swap.h> #include <linux/string.h> #include <linux/init.h> +#include <linux/sched/mm.h> +#include <linux/sched/coredump.h> +#include <linux/sched/signal.h> +#include <linux/sched/numa_balancing.h> +#include <linux/sched/task.h> #include <linux/pagemap.h> #include <linux/perf_event.h> #include <linux/highmem.h> @@ -1088,7 +1093,7 @@ static int de_thread(struct task_struct *tsk) struct task_struct *leader = tsk->group_leader; for (;;) { - threadgroup_change_begin(tsk); + cgroup_threadgroup_change_begin(tsk); write_lock_irq(&tasklist_lock); /* * Do this under tasklist_lock to ensure that @@ -1099,7 +1104,7 @@ static int de_thread(struct task_struct *tsk) break; __set_current_state(TASK_KILLABLE); write_unlock_irq(&tasklist_lock); - threadgroup_change_end(tsk); + cgroup_threadgroup_change_end(tsk); schedule(); if (unlikely(__fatal_signal_pending(tsk))) goto killed; @@ -1157,7 +1162,7 @@ static int de_thread(struct task_struct *tsk) if (unlikely(leader->ptrace)) __wake_up_parent(leader, leader->parent); write_unlock_irq(&tasklist_lock); - threadgroup_change_end(tsk); + cgroup_threadgroup_change_end(tsk); release_task(leader); } diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index a4b531b..329a5d1 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -15,6 +15,7 @@ #include <linux/mount.h> #include <linux/namei.h> #include <linux/sched.h> +#include <linux/cred.h> #define dprintk(fmt, args...) do{}while(0) @@ -299,7 +300,8 @@ static int get_name(const struct path *path, char *name, struct dentry *child) * filesystem supports 64-bit inode numbers. 
So we need to * actually call ->getattr, not just read i_ino: */ - error = vfs_getattr_nosec(&child_path, &stat); + error = vfs_getattr_nosec(&child_path, &stat, + STATX_INO, AT_STATX_SYNC_AS_STAT); if (error) return error; buffer.ino = stat.ino; diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index 4c40c07..d0bdb74 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -15,6 +15,7 @@ #include <linux/quotaops.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/cred.h> #include <linux/buffer_head.h> #include <linux/capability.h> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2fd17e8..f493af6 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -28,6 +28,7 @@ #include <linux/timer.h> #include <linux/version.h> #include <linux/wait.h> +#include <linux/sched/signal.h> #include <linux/blockgroup_lock.h> #include <linux/percpu_counter.h> #include <linux/ratelimit.h> @@ -2462,8 +2463,7 @@ extern struct inode *ext4_iget(struct super_block *, unsigned long); extern struct inode *ext4_iget_normal(struct super_block *, unsigned long); extern int ext4_write_inode(struct inode *, struct writeback_control *); extern int ext4_setattr(struct dentry *, struct iattr *); -extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat); +extern int ext4_getattr(const struct path *, struct kstat *, u32, unsigned int); extern void ext4_evict_inode(struct inode *); extern void ext4_clear_inode(struct inode *); extern int ext4_sync_inode(handle_t *, struct inode *); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index b14bae2..17bc043 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -21,6 +21,8 @@ #include <linux/random.h> #include <linux/bitops.h> #include <linux/blkdev.h> +#include <linux/cred.h> + #include <asm/byteorder.h> #include "ext4.h" diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 971f663..7385e6a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5387,13 +5387,13 @@ err_out: return error; } -int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +int ext4_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) { struct inode *inode; unsigned long long delalloc_blocks; - inode = d_inode(dentry); + inode = d_inode(path->dentry); generic_fillattr(inode, stat); /* diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f73ee95..0339daf 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -249,7 +249,8 @@ static int f2fs_write_meta_page(struct page *page, dec_page_count(sbi, F2FS_DIRTY_META); if (wbc->for_reclaim) - f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, META, WRITE); + f2fs_submit_merged_bio_cond(sbi, page->mapping->host, + 0, page->index, META, WRITE); unlock_page(page); @@ -493,6 +494,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi) #ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_ORPHAN)) { spin_unlock(&im->ino_lock); + f2fs_show_injection_info(FAULT_ORPHAN); return -ENOSPC; } #endif @@ -681,8 +683,7 @@ static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, return -EINVAL; } - crc = le32_to_cpu(*((__le32 *)((unsigned char *)*cp_block - + crc_offset))); + crc = cur_cp_crc(*cp_block); if (!f2fs_crc_valid(sbi, crc, *cp_block, crc_offset)) { f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc value"); return -EINVAL; @@ -891,7 +892,7 @@ retry: F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); return 0; } - fi = list_entry(head->next, struct f2fs_inode_info, dirty_list); + fi = list_first_entry(head, struct 
f2fs_inode_info, dirty_list); inode = igrab(&fi->vfs_inode); spin_unlock(&sbi->inode_lock[type]); if (inode) { @@ -924,7 +925,7 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) spin_unlock(&sbi->inode_lock[DIRTY_META]); return 0; } - fi = list_entry(head->next, struct f2fs_inode_info, + fi = list_first_entry(head, struct f2fs_inode_info, gdirty_list); inode = igrab(&fi->vfs_inode); spin_unlock(&sbi->inode_lock[DIRTY_META]); @@ -998,8 +999,6 @@ out: static void unblock_operations(struct f2fs_sb_info *sbi) { up_write(&sbi->node_write); - - build_free_nids(sbi, false); f2fs_unlock_all(sbi); } @@ -1025,6 +1024,10 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) spin_lock(&sbi->cp_lock); + if (cpc->reason == CP_UMOUNT && ckpt->cp_pack_total_block_count > + sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) + disable_nat_bits(sbi, false); + if (cpc->reason == CP_UMOUNT) __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); else @@ -1137,6 +1140,28 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) start_blk = __start_cp_next_addr(sbi); + /* write nat bits */ + if (enabled_nat_bits(sbi, cpc)) { + __u64 cp_ver = cur_cp_version(ckpt); + unsigned int i; + block_t blk; + + cp_ver |= ((__u64)crc32 << 32); + *(__le64 *)nm_i->nat_bits = cpu_to_le64(cp_ver); + + blk = start_blk + sbi->blocks_per_seg - nm_i->nat_bits_blocks; + for (i = 0; i < nm_i->nat_bits_blocks; i++) + update_meta_page(sbi, nm_i->nat_bits + + (i << F2FS_BLKSIZE_BITS), blk + i); + + /* Flush all the NAT BITS pages */ + while (get_pages(sbi, F2FS_DIRTY_META)) { + sync_meta_pages(sbi, META, LONG_MAX); + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + } + } + /* need to wait for end_io results */ wait_on_all_pages_writeback(sbi); if (unlikely(f2fs_cp_error(sbi))) @@ -1248,15 +1273,20 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_flush_merged_bios(sbi); /* this is the case of multiple fstrims without any changes */ - if (cpc->reason == CP_DISCARD && !is_sbi_flag_set(sbi, SBI_IS_DIRTY)) { - f2fs_bug_on(sbi, NM_I(sbi)->dirty_nat_cnt); - f2fs_bug_on(sbi, SIT_I(sbi)->dirty_sentries); - f2fs_bug_on(sbi, prefree_segments(sbi)); - flush_sit_entries(sbi, cpc); - clear_prefree_segments(sbi, cpc); - f2fs_wait_all_discard_bio(sbi); - unblock_operations(sbi); - goto out; + if (cpc->reason == CP_DISCARD) { + if (!exist_trim_candidates(sbi, cpc)) { + unblock_operations(sbi); + goto out; + } + + if (NM_I(sbi)->dirty_nat_cnt == 0 && + SIT_I(sbi)->dirty_sentries == 0 && + prefree_segments(sbi) == 0) { + flush_sit_entries(sbi, cpc); + clear_prefree_segments(sbi, cpc); + unblock_operations(sbi); + goto out; + } } /* @@ -1268,17 +1298,15 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver); /* write cached NAT/SIT entries to NAT/SIT area */ - flush_nat_entries(sbi); + flush_nat_entries(sbi, cpc); flush_sit_entries(sbi, cpc); /* unlock all the fs_lock[] in do_checkpoint() */ err = do_checkpoint(sbi, cpc); - if (err) { + if (err) release_discard_addrs(sbi); - } else { + else clear_prefree_segments(sbi, cpc); - f2fs_wait_all_discard_bio(sbi); - } unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9ac2625..1602b4b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -22,6 +22,7 @@ #include <linux/mm.h> #include <linux/memcontrol.h> #include <linux/cleancache.h> +#include <linux/sched/signal.h> #include "f2fs.h" #include "node.h" @@ -55,8 +56,10 @@ static 
void f2fs_read_end_io(struct bio *bio) int i; #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) + if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) { + f2fs_show_injection_info(FAULT_IO); bio->bi_error = -EIO; + } #endif if (f2fs_bio_encrypted(bio)) { @@ -93,6 +96,17 @@ static void f2fs_write_end_io(struct bio *bio) struct page *page = bvec->bv_page; enum count_type type = WB_DATA_TYPE(page); + if (IS_DUMMY_WRITTEN_PAGE(page)) { + set_page_private(page, (unsigned long)NULL); + ClearPagePrivate(page); + unlock_page(page); + mempool_free(page, sbi->write_io_dummy); + + if (unlikely(bio->bi_error)) + f2fs_stop_checkpoint(sbi, true); + continue; + } + fscrypt_pullback_bio_page(&page, true); if (unlikely(bio->bi_error)) { @@ -171,10 +185,46 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type) { if (!is_read_io(bio_op(bio))) { + unsigned int start; + if (f2fs_sb_mounted_blkzoned(sbi->sb) && current->plug && (type == DATA || type == NODE)) blk_finish_plug(current->plug); + + if (type != DATA && type != NODE) + goto submit_io; + + start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS; + start %= F2FS_IO_SIZE(sbi); + + if (start == 0) + goto submit_io; + + /* fill dummy pages */ + for (; start < F2FS_IO_SIZE(sbi); start++) { + struct page *page = + mempool_alloc(sbi->write_io_dummy, + GFP_NOIO | __GFP_ZERO | __GFP_NOFAIL); + f2fs_bug_on(sbi, !page); + + SetPagePrivate(page); + set_page_private(page, (unsigned long)DUMMY_WRITTEN_PAGE); + lock_page(page); + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) + f2fs_bug_on(sbi, 1); + } + /* + * In the NODE case, we lose next block address chain. So, we + * need to do checkpoint in f2fs_sync_file. + */ + if (type == NODE) + set_sbi_flag(sbi, SBI_NEED_CP); } +submit_io: + if (is_read_io(bio_op(bio))) + trace_f2fs_submit_read_bio(sbi->sb, type, bio); + else + trace_f2fs_submit_write_bio(sbi->sb, type, bio); submit_bio(bio); } @@ -185,19 +235,19 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) if (!io->bio) return; + bio_set_op_attrs(io->bio, fio->op, fio->op_flags); + if (is_read_io(fio->op)) - trace_f2fs_submit_read_bio(io->sbi->sb, fio, io->bio); + trace_f2fs_prepare_read_bio(io->sbi->sb, fio->type, io->bio); else - trace_f2fs_submit_write_bio(io->sbi->sb, fio, io->bio); - - bio_set_op_attrs(io->bio, fio->op, fio->op_flags); + trace_f2fs_prepare_write_bio(io->sbi->sb, fio->type, io->bio); __submit_bio(io->sbi, io->bio, fio->type); io->bio = NULL; } -static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, - struct page *page, nid_t ino) +static bool __has_merged_page(struct f2fs_bio_info *io, + struct inode *inode, nid_t ino, pgoff_t idx) { struct bio_vec *bvec; struct page *target; @@ -206,7 +256,7 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, if (!io->bio) return false; - if (!inode && !page && !ino) + if (!inode && !ino) return true; bio_for_each_segment_all(bvec, io->bio, i) { @@ -216,10 +266,11 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, else target = fscrypt_control_page(bvec->bv_page); + if (idx != target->index) + continue; + if (inode && inode == target->mapping->host) return true; - if (page && page == target) - return true; if (ino && ino == ino_of_node(target)) return true; } @@ -228,22 +279,21 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, } static bool has_merged_page(struct f2fs_sb_info *sbi, 
struct inode *inode, - struct page *page, nid_t ino, - enum page_type type) + nid_t ino, pgoff_t idx, enum page_type type) { enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io = &sbi->write_io[btype]; bool ret; down_read(&io->io_rwsem); - ret = __has_merged_page(io, inode, page, ino); + ret = __has_merged_page(io, inode, ino, idx); up_read(&io->io_rwsem); return ret; } static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, - struct inode *inode, struct page *page, - nid_t ino, enum page_type type, int rw) + struct inode *inode, nid_t ino, pgoff_t idx, + enum page_type type, int rw) { enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io; @@ -252,16 +302,16 @@ static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, down_write(&io->io_rwsem); - if (!__has_merged_page(io, inode, page, ino)) + if (!__has_merged_page(io, inode, ino, idx)) goto out; /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; io->fio.op = REQ_OP_WRITE; - io->fio.op_flags = REQ_PREFLUSH | REQ_META | REQ_PRIO; + io->fio.op_flags = REQ_META | REQ_PRIO; if (!test_opt(sbi, NOBARRIER)) - io->fio.op_flags |= REQ_FUA; + io->fio.op_flags |= REQ_PREFLUSH | REQ_FUA; } __submit_merged_bio(io); out: @@ -271,15 +321,15 @@ out: void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, enum page_type type, int rw) { - __f2fs_submit_merged_bio(sbi, NULL, NULL, 0, type, rw); + __f2fs_submit_merged_bio(sbi, NULL, 0, 0, type, rw); } void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *sbi, - struct inode *inode, struct page *page, - nid_t ino, enum page_type type, int rw) + struct inode *inode, nid_t ino, pgoff_t idx, + enum page_type type, int rw) { - if (has_merged_page(sbi, inode, page, ino, type)) - __f2fs_submit_merged_bio(sbi, inode, page, ino, type, rw); + if (has_merged_page(sbi, inode, ino, idx, type)) + __f2fs_submit_merged_bio(sbi, inode, ino, idx, type, rw); } void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi) @@ -315,13 +365,14 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) return 0; } -void f2fs_submit_page_mbio(struct f2fs_io_info *fio) +int f2fs_submit_page_mbio(struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); struct f2fs_bio_info *io; bool is_read = is_read_io(fio->op); struct page *bio_page; + int err = 0; io = is_read ? &sbi->read_io : &sbi->write_io[btype]; @@ -331,6 +382,9 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) bio_page = fio->encrypted_page ? 
fio->encrypted_page : fio->page; + /* set submitted = 1 as a return value */ + fio->submitted = 1; + if (!is_read) inc_page_count(sbi, WB_DATA_TYPE(bio_page)); @@ -342,6 +396,13 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { + if ((fio->type == DATA || fio->type == NODE) && + fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) { + err = -EAGAIN; + if (!is_read) + dec_page_count(sbi, WB_DATA_TYPE(bio_page)); + goto out_fail; + } io->bio = __bio_alloc(sbi, fio->new_blkaddr, BIO_MAX_PAGES, is_read); io->fio = *fio; @@ -355,9 +416,10 @@ alloc_new: io->last_block_in_bio = fio->new_blkaddr; f2fs_trace_ios(fio, 0); - +out_fail: up_write(&io->io_rwsem); trace_f2fs_submit_page_mbio(fio->page, fio); + return err; } static void __set_data_blkaddr(struct dnode_of_data *dn) @@ -453,7 +515,7 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) { - struct extent_info ei; + struct extent_info ei = {0,0,0}; struct inode *inode = dn->inode; if (f2fs_lookup_extent_cache(inode, index, &ei)) { @@ -470,7 +532,7 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct page *page; - struct extent_info ei; + struct extent_info ei = {0,0,0}; int err; struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), @@ -694,6 +756,9 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) struct f2fs_map_blocks map; int err = 0; + if (is_inode_flag_set(inode, FI_NO_PREALLOC)) + return 0; + map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); if (map.m_len > map.m_lblk) @@ -742,7 +807,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int err = 0, ofs = 1; unsigned int ofs_in_node, last_ofs_in_node; blkcnt_t prealloc; - struct extent_info ei; + struct extent_info ei = {0,0,0}; block_t blkaddr; if (!maxblocks) @@ -806,7 +871,7 @@ next_block: } if (err) goto sync_out; - map->m_flags = F2FS_MAP_NEW; + map->m_flags |= F2FS_MAP_NEW; blkaddr = dn.data_blkaddr; } else { if (flag == F2FS_GET_BLOCK_BMAP) { @@ -906,7 +971,7 @@ static int __get_data_block(struct inode *inode, sector_t iblock, if (!err) { map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags; - bh->b_size = map.m_len << inode->i_blkbits; + bh->b_size = (u64)map.m_len << inode->i_blkbits; } return err; } @@ -1088,7 +1153,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, prefetchw(&page->flags); if (pages) { - page = list_entry(pages->prev, struct page, lru); + page = list_last_entry(pages, struct page, lru); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, @@ -1207,7 +1272,7 @@ static int f2fs_read_data_pages(struct file *file, struct list_head *pages, unsigned nr_pages) { struct inode *inode = file->f_mapping->host; - struct page *page = list_entry(pages->prev, struct page, lru); + struct page *page = list_last_entry(pages, struct page, lru); trace_f2fs_readpages(inode, page, nr_pages); @@ -1288,8 +1353,8 @@ out_writepage: return err; } -static int f2fs_write_data_page(struct page *page, - struct writeback_control *wbc) +static int __write_data_page(struct page *page, bool *submitted, + struct writeback_control *wbc) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1307,6 +1372,7 @@ static int f2fs_write_data_page(struct 
page *page, .op_flags = wbc_to_write_flags(wbc), .page = page, .encrypted_page = NULL, + .submitted = false, }; trace_f2fs_writepage(page, DATA); @@ -1352,9 +1418,12 @@ write: goto redirty_out; err = -EAGAIN; - f2fs_lock_op(sbi); - if (f2fs_has_inline_data(inode)) + if (f2fs_has_inline_data(inode)) { err = f2fs_write_inline_data(inode, page); + if (!err) + goto out; + } + f2fs_lock_op(sbi); if (err == -EAGAIN) err = do_write_data_page(&fio); if (F2FS_I(inode)->last_disk_size < psize) @@ -1370,15 +1439,22 @@ out: ClearPageUptodate(page); if (wbc->for_reclaim) { - f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, DATA, WRITE); + f2fs_submit_merged_bio_cond(sbi, inode, 0, page->index, + DATA, WRITE); remove_dirty_inode(inode); + submitted = NULL; } unlock_page(page); f2fs_balance_fs(sbi, need_balance_fs); - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { f2fs_submit_merged_bio(sbi, DATA, WRITE); + submitted = NULL; + } + + if (submitted) + *submitted = fio.submitted; return 0; @@ -1390,6 +1466,12 @@ redirty_out: return err; } +static int f2fs_write_data_page(struct page *page, + struct writeback_control *wbc) +{ + return __write_data_page(page, NULL, wbc); +} + /* * This function was copied from write_cche_pages from mm/page-writeback.c. * The major change is making write step of cold data page separately from @@ -1406,10 +1488,10 @@ static int f2fs_write_cache_pages(struct address_space *mapping, pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; + pgoff_t last_idx = ULONG_MAX; int cycled; int range_whole = 0; int tag; - int nwritten = 0; pagevec_init(&pvec, 0); @@ -1446,6 +1528,7 @@ retry: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + bool submitted = false; if (page->index > end) { done = 1; @@ -1479,7 +1562,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = mapping->a_ops->writepage(page, wbc); + ret = __write_data_page(page, &submitted, wbc); if (unlikely(ret)) { /* * keep nr_to_write, since vfs uses this to @@ -1493,8 +1576,8 @@ continue_unlock: done_index = page->index + 1; done = 1; break; - } else { - nwritten++; + } else if (submitted) { + last_idx = page->index; } if (--wbc->nr_to_write <= 0 && @@ -1516,9 +1599,9 @@ continue_unlock: if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; - if (nwritten) + if (last_idx != ULONG_MAX) f2fs_submit_merged_bio_cond(F2FS_M_SB(mapping), mapping->host, - NULL, 0, DATA, WRITE); + 0, last_idx, DATA, WRITE); return ret; } @@ -1591,14 +1674,15 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, struct dnode_of_data dn; struct page *ipage; bool locked = false; - struct extent_info ei; + struct extent_info ei = {0,0,0}; int err = 0; /* * we already allocated all the blocks, so we don't need to get * the block addresses when there is no need to fill the page. */ - if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE) + if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE && + !is_inode_flag_set(inode, FI_NO_PREALLOC)) return 0; if (f2fs_has_inline_data(inode) || @@ -1682,7 +1766,12 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, goto fail; } repeat: - page = grab_cache_page_write_begin(mapping, index, flags); + /* + * Do not use grab_cache_page_write_begin() to avoid deadlock due to + * wait_for_stable_page. Will wait that below with our IO control. 
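
Aside on the f2fs_write_cache_pages() change in this hunk: instead of counting written pages, the writeback loop now remembers the index of the last page whose bio was actually submitted, using ULONG_MAX as the "nothing submitted" sentinel, and keys the conditional flush on that index. A toy userspace rendering of the pattern follows; write_one() is an invented stub, not an f2fs function.

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

/* Toy stand-in for "write one page; report whether a bio was queued". */
static bool write_one(unsigned long idx)
{
        return (idx % 3) == 0;          /* pretend every third page hits the queue */
}

int main(void)
{
        unsigned long idx, last_idx = ULONG_MAX;        /* sentinel: nothing queued */

        for (idx = 0; idx < 10; idx++)
                if (write_one(idx))
                        last_idx = idx; /* remember the last page actually queued */

        if (last_idx != ULONG_MAX)
                printf("flush queued bios up to page %lu\n", last_idx);
        else
                printf("nothing queued, skip the flush\n");
        return 0;
}
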
+ */ + page = pagecache_get_page(mapping, index, + FGP_LOCK | FGP_WRITE | FGP_CREAT, GFP_NOFS); if (!page) { err = -ENOMEM; goto fail; @@ -1715,6 +1804,11 @@ repeat: if (len == PAGE_SIZE || PageUptodate(page)) return 0; + if (!(pos & (PAGE_SIZE - 1)) && (pos + len) >= i_size_read(inode)) { + zero_user_segment(page, len, PAGE_SIZE); + return 0; + } + if (blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_SIZE); SetPageUptodate(page); @@ -1768,7 +1862,7 @@ static int f2fs_write_end(struct file *file, * let generic_perform_write() try to copy data again through copied=0. */ if (!PageUptodate(page)) { - if (unlikely(copied != PAGE_SIZE)) + if (unlikely(copied != len)) copied = 0; else SetPageUptodate(page); @@ -1917,7 +2011,7 @@ static int f2fs_set_data_page_dirty(struct page *page) if (!PageUptodate(page)) SetPageUptodate(page); - if (f2fs_is_atomic_file(inode)) { + if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) { if (!IS_ATOMIC_WRITTEN_PAGE(page)) { register_inmem_page(inode, page); return 1; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index fbd5184..a77df37 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -50,8 +50,16 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); + si->aw_cnt = atomic_read(&sbi->aw_cnt); + si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); + if (SM_I(sbi) && SM_I(sbi)->fcc_info) + si->nr_flush = + atomic_read(&SM_I(sbi)->fcc_info->submit_flush); + if (SM_I(sbi) && SM_I(sbi)->dcc_info) + si->nr_discard = + atomic_read(&SM_I(sbi)->dcc_info->submit_discard); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); @@ -62,6 +70,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->inline_xattr = atomic_read(&sbi->inline_xattr); si->inline_inode = atomic_read(&sbi->inline_inode); si->inline_dir = atomic_read(&sbi->inline_dir); + si->append = sbi->im[APPEND_INO].ino_num; + si->update = sbi->im[UPDATE_INO].ino_num; si->orphans = sbi->im[ORPHAN_INO].ino_num; si->utilization = utilization(sbi); @@ -183,6 +193,9 @@ static void update_mem_info(struct f2fs_sb_info *sbi) /* build nm */ si->base_mem += sizeof(struct f2fs_nm_info); si->base_mem += __bitmap_size(sbi, NAT_BITMAP); + si->base_mem += (NM_I(sbi)->nat_bits_blocks << F2FS_BLKSIZE_BITS); + si->base_mem += NM_I(sbi)->nat_blocks * NAT_ENTRY_BITMAP_SIZE; + si->base_mem += NM_I(sbi)->nat_blocks / 8; get_cache: si->cache_mem = 0; @@ -192,8 +205,10 @@ get_cache: si->cache_mem += sizeof(struct f2fs_gc_kthread); /* build merge flush thread */ - if (SM_I(sbi)->cmd_control_info) + if (SM_I(sbi)->fcc_info) si->cache_mem += sizeof(struct flush_cmd_control); + if (SM_I(sbi)->dcc_info) + si->cache_mem += sizeof(struct discard_cmd_control); /* free nids */ si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] + @@ -254,8 +269,8 @@ static int stat_show(struct seq_file *s, void *v) si->inline_inode); seq_printf(s, " - Inline_dentry Inode: %u\n", si->inline_dir); - seq_printf(s, " - Orphan Inode: %u\n", - si->orphans); + seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n", + si->orphans, si->append, si->update); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); @@ 
-314,8 +329,11 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - inmem: %4d, wb_cp_data: %4d, wb_data: %4d\n", - si->inmem_pages, si->nr_wb_cp_data, si->nr_wb_data); + seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: %4d, Discard: %4d)\n", + si->nr_wb_cp_data, si->nr_wb_data, + si->nr_flush, si->nr_discard); + seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d)\n", + si->inmem_pages, si->aw_cnt, si->max_aw_cnt); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", @@ -414,6 +432,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->inline_dir, 0); atomic_set(&sbi->inplace_count, 0); + atomic_set(&sbi->aw_cnt, 0); + atomic_set(&sbi->max_aw_cnt, 0); + mutex_lock(&f2fs_stat_mutex); list_add_tail(&si->stat_list, &f2fs_stat_list); mutex_unlock(&f2fs_stat_mutex); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 18607fc..4650c9b 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -207,9 +207,13 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, f2fs_put_page(dentry_page, 0); } - if (!de && room && F2FS_I(dir)->chash != namehash) { - F2FS_I(dir)->chash = namehash; - F2FS_I(dir)->clevel = level; + /* This is to increase the speed of f2fs_create */ + if (!de && room) { + F2FS_I(dir)->task = current; + if (F2FS_I(dir)->chash != namehash) { + F2FS_I(dir)->chash = namehash; + F2FS_I(dir)->clevel = level; + } } return de; @@ -548,8 +552,10 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, start: #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) + if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) { + f2fs_show_injection_info(FAULT_DIR_DEPTH); return -ENOSPC; + } #endif if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) return -ENOSPC; @@ -646,14 +652,34 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode, nid_t ino, umode_t mode) { struct fscrypt_name fname; + struct page *page = NULL; + struct f2fs_dir_entry *de = NULL; int err; err = fscrypt_setup_filename(dir, name, 0, &fname); if (err) return err; - err = __f2fs_do_add_link(dir, &fname, inode, ino, mode); - + /* + * An immature stakable filesystem shows a race condition between lookup + * and create. If we have same task when doing lookup and create, it's + * definitely fine as expected by VFS normally. Otherwise, let's just + * verify on-disk dentry one more time, which guarantees filesystem + * consistency more. 
+ */ + if (current != F2FS_I(dir)->task) { + de = __f2fs_find_entry(dir, &fname, &page); + F2FS_I(dir)->task = NULL; + } + if (de) { + f2fs_dentry_kunmap(dir, page); + f2fs_put_page(page, 0); + err = -EEXIST; + } else if (IS_ERR(page)) { + err = PTR_ERR(page); + } else { + err = __f2fs_do_add_link(dir, &fname, inode, ino, mode); + } fscrypt_free_filename(&fname); return err; } diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 4db44da..c6934f0 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -77,7 +77,7 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) struct extent_tree *et; nid_t ino = inode->i_ino; - down_write(&sbi->extent_tree_lock); + mutex_lock(&sbi->extent_tree_lock); et = radix_tree_lookup(&sbi->extent_tree_root, ino); if (!et) { et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS); @@ -94,7 +94,7 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) atomic_dec(&sbi->total_zombie_tree); list_del_init(&et->list); } - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); /* never died until evict_inode */ F2FS_I(inode)->extent_tree = et; @@ -311,28 +311,24 @@ static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et, tmp_node = parent; if (parent && fofs > en->ei.fofs) tmp_node = rb_next(parent); - *next_ex = tmp_node ? - rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); tmp_node = parent; if (parent && fofs < en->ei.fofs) tmp_node = rb_prev(parent); - *prev_ex = tmp_node ? - rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); return NULL; lookup_neighbors: if (fofs == en->ei.fofs) { /* lookup prev node for merging backward later */ tmp_node = rb_prev(&en->rb_node); - *prev_ex = tmp_node ? - rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); } if (fofs == en->ei.fofs + en->ei.len - 1) { /* lookup next node for merging frontward later */ tmp_node = rb_next(&en->rb_node); - *next_ex = tmp_node ? - rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); } return en; } @@ -352,11 +348,12 @@ static struct extent_node *__try_merge_extent_node(struct inode *inode, } if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) { - if (en) - __release_extent_node(sbi, et, prev_ex); next_ex->ei.fofs = ei->fofs; next_ex->ei.blk = ei->blk; next_ex->ei.len += ei->len; + if (en) + __release_extent_node(sbi, et, prev_ex); + en = next_ex; } @@ -416,7 +413,7 @@ do_insert: return en; } -static unsigned int f2fs_update_extent_tree_range(struct inode *inode, +static void f2fs_update_extent_tree_range(struct inode *inode, pgoff_t fofs, block_t blkaddr, unsigned int len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -429,7 +426,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, unsigned int pos = (unsigned int)fofs; if (!et) - return false; + return; trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len); @@ -437,7 +434,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, if (is_inode_flag_set(inode, FI_NO_EXTENT)) { write_unlock(&et->lock); - return false; + return; } prev = et->largest; @@ -492,9 +489,8 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, if (!next_en) { struct rb_node *node = rb_next(&en->rb_node); - next_en = node ? 
- rb_entry(node, struct extent_node, rb_node) - : NULL; + next_en = rb_entry_safe(node, struct extent_node, + rb_node); } if (parts) @@ -535,8 +531,6 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, __free_extent_tree(sbi, et); write_unlock(&et->lock); - - return !__is_extent_same(&prev, &et->largest); } unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) @@ -552,7 +546,7 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) if (!atomic_read(&sbi->total_zombie_tree)) goto free_node; - if (!down_write_trylock(&sbi->extent_tree_lock)) + if (!mutex_trylock(&sbi->extent_tree_lock)) goto out; /* 1. remove unreferenced extent tree */ @@ -574,11 +568,11 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) goto unlock_out; cond_resched(); } - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); free_node: /* 2. remove LRU extent entries */ - if (!down_write_trylock(&sbi->extent_tree_lock)) + if (!mutex_trylock(&sbi->extent_tree_lock)) goto out; remained = nr_shrink - (node_cnt + tree_cnt); @@ -608,7 +602,7 @@ free_node: spin_unlock(&sbi->extent_lock); unlock_out: - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); out: trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt); @@ -655,10 +649,10 @@ void f2fs_destroy_extent_tree(struct inode *inode) if (inode->i_nlink && !is_bad_inode(inode) && atomic_read(&et->node_cnt)) { - down_write(&sbi->extent_tree_lock); + mutex_lock(&sbi->extent_tree_lock); list_add_tail(&et->list, &sbi->zombie_list); atomic_inc(&sbi->total_zombie_tree); - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); return; } @@ -666,12 +660,12 @@ void f2fs_destroy_extent_tree(struct inode *inode) node_cnt = f2fs_destroy_extent_node(inode); /* delete extent tree entry in radix tree */ - down_write(&sbi->extent_tree_lock); + mutex_lock(&sbi->extent_tree_lock); f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); kmem_cache_free(extent_tree_slab, et); atomic_dec(&sbi->total_ext_tree); - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); F2FS_I(inode)->extent_tree = NULL; @@ -718,7 +712,7 @@ void f2fs_update_extent_cache_range(struct dnode_of_data *dn, void init_extent_cache_info(struct f2fs_sb_info *sbi) { INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO); - init_rwsem(&sbi->extent_tree_lock); + mutex_init(&sbi->extent_tree_lock); INIT_LIST_HEAD(&sbi->extent_list); spin_lock_init(&sbi->extent_lock); atomic_set(&sbi->total_ext_tree, 0); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 069fc72..e849f83 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -112,9 +112,9 @@ struct f2fs_mount_info { #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) #define F2FS_SET_FEATURE(sb, mask) \ - F2FS_SB(sb)->raw_super->feature |= cpu_to_le32(mask) + (F2FS_SB(sb)->raw_super->feature |= cpu_to_le32(mask)) #define F2FS_CLEAR_FEATURE(sb, mask) \ - F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask) + (F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask)) /* * For checkpoint manager @@ -132,11 +132,14 @@ enum { CP_DISCARD, }; -#define DEF_BATCHED_TRIM_SECTIONS 2 +#define DEF_BATCHED_TRIM_SECTIONS 2048 #define BATCHED_TRIM_SEGMENTS(sbi) \ (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) #define BATCHED_TRIM_BLOCKS(sbi) \ (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) +#define MAX_DISCARD_BLOCKS(sbi) \ + 
((1 << (sbi)->log_blocks_per_seg) * (sbi)->segs_per_sec) +#define DISCARD_ISSUE_RATE 8 #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ @@ -185,11 +188,30 @@ struct discard_entry { int len; /* # of consecutive blocks of the discard */ }; -struct bio_entry { - struct list_head list; - struct bio *bio; - struct completion event; - int error; +enum { + D_PREP, + D_SUBMIT, + D_DONE, +}; + +struct discard_cmd { + struct list_head list; /* command list */ + struct completion wait; /* compleation */ + block_t lstart; /* logical start address */ + block_t len; /* length */ + struct bio *bio; /* bio */ + int state; /* state */ +}; + +struct discard_cmd_control { + struct task_struct *f2fs_issue_discard; /* discard thread */ + struct list_head discard_entry_list; /* 4KB discard entry list */ + int nr_discards; /* # of discards in the list */ + struct list_head discard_cmd_list; /* discard cmd list */ + wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ + struct mutex cmd_lock; + int max_discards; /* max. discards to be issued */ + atomic_t submit_discard; /* # of issued discard */ }; /* for the list of fsync inodes, used only during recovery */ @@ -214,6 +236,7 @@ struct fsync_inode_entry { static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i) { int before = nats_in_cursum(journal); + journal->n_nats = cpu_to_le16(before + i); return before; } @@ -221,6 +244,7 @@ static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i) static inline int update_sits_in_cursum(struct f2fs_journal *journal, int i) { int before = sits_in_cursum(journal); + journal->n_sits = cpu_to_le16(before + i); return before; } @@ -306,12 +330,14 @@ static inline void make_dentry_ptr(struct inode *inode, if (type == 1) { struct f2fs_dentry_block *t = (struct f2fs_dentry_block *)src; + d->max = NR_DENTRY_IN_BLOCK; d->bitmap = &t->dentry_bitmap; d->dentry = t->dentry; d->filename = t->filename; } else { struct f2fs_inline_dentry *t = (struct f2fs_inline_dentry *)src; + d->max = NR_INLINE_DENTRY; d->bitmap = &t->dentry_bitmap; d->dentry = t->dentry; @@ -438,8 +464,8 @@ struct f2fs_inode_info { atomic_t dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ + struct task_struct *task; /* lookup and create consistency */ nid_t i_xattr_nid; /* node id that contains xattrs */ - unsigned long long xattr_ver; /* cp version of xattr modification */ loff_t last_disk_size; /* lastly written file size */ struct list_head dirty_list; /* dirty list for dirs and files */ @@ -474,13 +500,6 @@ static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, ei->len = len; } -static inline bool __is_extent_same(struct extent_info *ei1, - struct extent_info *ei2) -{ - return (ei1->fofs == ei2->fofs && ei1->blk == ei2->blk && - ei1->len == ei2->len); -} - static inline bool __is_extent_mergeable(struct extent_info *back, struct extent_info *front) { @@ -500,7 +519,7 @@ static inline bool __is_front_mergeable(struct extent_info *cur, return __is_extent_mergeable(cur, front); } -extern void f2fs_mark_inode_dirty_sync(struct inode *, bool); +extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); static inline void __try_update_largest_extent(struct inode *inode, struct extent_tree *et, struct extent_node *en) { @@ -532,6 +551,7 @@ struct f2fs_nm_info { struct list_head nat_entries; /* cached nat entry list (clean) */ unsigned int nat_cnt; 
/* the # of cached nat entries */ unsigned int dirty_nat_cnt; /* total num of nat entries in set */ + unsigned int nat_blocks; /* # of nat blocks */ /* free node ids management */ struct radix_tree_root free_nid_root;/* root of the free_nid cache */ @@ -539,9 +559,19 @@ struct f2fs_nm_info { unsigned int nid_cnt[MAX_NID_LIST]; /* the number of free node id */ spinlock_t nid_list_lock; /* protect nid lists ops */ struct mutex build_lock; /* lock for build free nids */ + unsigned char (*free_nid_bitmap)[NAT_ENTRY_BITMAP_SIZE]; + unsigned char *nat_block_bitmap; /* for checkpoint */ char *nat_bitmap; /* NAT bitmap pointer */ + + unsigned int nat_bits_blocks; /* # of nat bits blocks */ + unsigned char *nat_bits; /* NAT bits blocks */ + unsigned char *full_nat_bits; /* full NAT pages */ + unsigned char *empty_nat_bits; /* empty NAT pages */ +#ifdef CONFIG_F2FS_CHECK_FS + char *nat_bitmap_mir; /* NAT bitmap mirror */ +#endif int bitmap_size; /* bitmap size */ }; @@ -632,12 +662,6 @@ struct f2fs_sm_info { /* a threshold to reclaim prefree segments */ unsigned int rec_prefree_segments; - /* for small discard management */ - struct list_head discard_list; /* 4KB discard list */ - struct list_head wait_list; /* linked with issued discard bio */ - int nr_discards; /* # of discards in the list */ - int max_discards; /* max. discards to be issued */ - /* for batched trimming */ unsigned int trim_sections; /* # of sections to trim */ @@ -648,8 +672,10 @@ struct f2fs_sm_info { unsigned int min_fsync_blocks; /* threshold for fsync */ /* for flush command control */ - struct flush_cmd_control *cmd_control_info; + struct flush_cmd_control *fcc_info; + /* for discard command control */ + struct discard_cmd_control *dcc_info; }; /* @@ -708,6 +734,7 @@ struct f2fs_io_info { block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written */ struct page *encrypted_page; /* encrypted page */ + bool submitted; /* indicate IO submission */ }; #define is_read_io(rw) (rw == READ) @@ -787,6 +814,8 @@ struct f2fs_sb_info { struct f2fs_bio_info read_io; /* for read bios */ struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ struct mutex wio_mutex[NODE + 1]; /* bio ordering for NODE/DATA */ + int write_io_size_bits; /* Write IO size bits */ + mempool_t *write_io_dummy; /* Dummy pages */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ @@ -811,7 +840,7 @@ struct f2fs_sb_info { /* for extent tree cache */ struct radix_tree_root extent_tree_root;/* cache extent cache entries */ - struct rw_semaphore extent_tree_lock; /* locking extent radix tree */ + struct mutex extent_tree_lock; /* locking extent radix tree */ struct list_head extent_list; /* lru list for shrinker */ spinlock_t extent_lock; /* locking extent lru list */ atomic_t total_ext_tree; /* extent tree count */ @@ -858,6 +887,9 @@ struct f2fs_sb_info { struct f2fs_gc_kthread *gc_thread; /* GC thread */ unsigned int cur_victim_sec; /* current victim section num */ + /* threshold for converting bg victims for fg */ + u64 fggc_threshold; + /* maximum # of trials to find a victim segment for SSR and GC */ unsigned int max_victim_search; @@ -877,6 +909,8 @@ struct f2fs_sb_info { atomic_t inline_xattr; /* # of inline_xattr inodes */ atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ + atomic_t aw_cnt; /* # of atomic writes */ + atomic_t max_aw_cnt; /* max # of atomic writes */ int bg_gc; /* background gc calls */ unsigned int 
ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ #endif @@ -908,6 +942,10 @@ struct f2fs_sb_info { }; #ifdef CONFIG_F2FS_FAULT_INJECTION +#define f2fs_show_injection_info(type) \ + printk("%sF2FS-fs : inject %s in %s of %pF\n", \ + KERN_INFO, fault_name[type], \ + __func__, __builtin_return_address(0)) static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) { struct f2fs_fault_info *ffi = &sbi->fault_info; @@ -921,10 +959,6 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) atomic_inc(&ffi->inject_ops); if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) { atomic_set(&ffi->inject_ops, 0); - printk("%sF2FS-fs : inject %s in %pF\n", - KERN_INFO, - fault_name[type], - __builtin_return_address(0)); return true; } return false; @@ -1089,6 +1123,12 @@ static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) return le64_to_cpu(cp->checkpoint_ver); } +static inline __u64 cur_cp_crc(struct f2fs_checkpoint *cp) +{ + size_t crc_offset = le32_to_cpu(cp->checksum_offset); + return le32_to_cpu(*((__le32 *)((unsigned char *)cp + crc_offset))); +} + static inline bool __is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) { unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); @@ -1133,6 +1173,27 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) spin_unlock(&sbi->cp_lock); } +static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock) +{ + set_sbi_flag(sbi, SBI_NEED_FSCK); + + if (lock) + spin_lock(&sbi->cp_lock); + __clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG); + kfree(NM_I(sbi)->nat_bits); + NM_I(sbi)->nat_bits = NULL; + if (lock) + spin_unlock(&sbi->cp_lock); +} + +static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi, + struct cp_control *cpc) +{ + bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); + + return (cpc) ? 
(cpc->reason == CP_UMOUNT) && set : set; +} + static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { down_read(&sbi->cp_rwsem); @@ -1212,8 +1273,10 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, blkcnt_t diff; #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_BLOCK)) + if (time_to_inject(sbi, FAULT_BLOCK)) { + f2fs_show_injection_info(FAULT_BLOCK); return false; + } #endif /* * let's increase this in prior to actual block count change in order @@ -1449,11 +1512,14 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, { #ifdef CONFIG_F2FS_FAULT_INJECTION struct page *page = find_lock_page(mapping, index); + if (page) return page; - if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) { + f2fs_show_injection_info(FAULT_PAGE_ALLOC); return NULL; + } #endif if (!for_write) return grab_cache_page(mapping, index); @@ -1532,6 +1598,7 @@ static inline void f2fs_radix_tree_insert(struct radix_tree_root *root, static inline bool IS_INODE(struct page *page) { struct f2fs_node *p = F2FS_NODE(page); + return RAW_IS_INODE(p); } @@ -1545,6 +1612,7 @@ static inline block_t datablock_addr(struct page *node_page, { struct f2fs_node *raw_node; __le32 *addr_array; + raw_node = F2FS_NODE(node_page); addr_array = blkaddr_in_node(raw_node); return le32_to_cpu(addr_array[offset]); @@ -1628,6 +1696,7 @@ enum { FI_UPDATE_WRITE, /* inode has in-place-update data */ FI_NEED_IPU, /* used for ipu per file */ FI_ATOMIC_FILE, /* indicate atomic file */ + FI_ATOMIC_COMMIT, /* indicate the state of atomical committing */ FI_VOLATILE_FILE, /* indicate volatile file */ FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */ FI_DROP_CACHE, /* drop dirty page cache */ @@ -1635,6 +1704,7 @@ enum { FI_INLINE_DOTS, /* indicate inline dot dentries */ FI_DO_DEFRAG, /* indicate defragment is running */ FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ + FI_NO_PREALLOC, /* indicate skipped preallocated blocks */ }; static inline void __mark_inode_dirty_flag(struct inode *inode, @@ -1779,6 +1849,7 @@ static inline unsigned int addrs_per_inode(struct inode *inode) static inline void *inline_xattr_addr(struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); + return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS]); } @@ -1817,6 +1888,11 @@ static inline bool f2fs_is_atomic_file(struct inode *inode) return is_inode_flag_set(inode, FI_ATOMIC_FILE); } +static inline bool f2fs_is_commit_atomic_write(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_ATOMIC_COMMIT); +} + static inline bool f2fs_is_volatile_file(struct inode *inode) { return is_inode_flag_set(inode, FI_VOLATILE_FILE); @@ -1835,6 +1911,7 @@ static inline bool f2fs_is_drop_cache(struct inode *inode) static inline void *inline_data_addr(struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); + return (void *)&(ri->i_addr[1]); } @@ -1918,8 +1995,10 @@ static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_KMALLOC)) + if (time_to_inject(sbi, FAULT_KMALLOC)) { + f2fs_show_injection_info(FAULT_KMALLOC); return NULL; + } #endif return kmalloc(size, flags); } @@ -1957,29 +2036,30 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) /* * file.c */ -int f2fs_sync_file(struct file *, loff_t, loff_t, int); -void truncate_data_blocks(struct dnode_of_data *); -int 
truncate_blocks(struct inode *, u64, bool); -int f2fs_truncate(struct inode *); -int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); -int f2fs_setattr(struct dentry *, struct iattr *); -int truncate_hole(struct inode *, pgoff_t, pgoff_t); -int truncate_data_blocks_range(struct dnode_of_data *, int); -long f2fs_ioctl(struct file *, unsigned int, unsigned long); -long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); +int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); +void truncate_data_blocks(struct dnode_of_data *dn); +int truncate_blocks(struct inode *inode, u64 from, bool lock); +int f2fs_truncate(struct inode *inode); +int f2fs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags); +int f2fs_setattr(struct dentry *dentry, struct iattr *attr); +int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); +int truncate_data_blocks_range(struct dnode_of_data *dn, int count); +long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); +long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); /* * inode.c */ -void f2fs_set_inode_flags(struct inode *); -struct inode *f2fs_iget(struct super_block *, unsigned long); -struct inode *f2fs_iget_retry(struct super_block *, unsigned long); -int try_to_free_nats(struct f2fs_sb_info *, int); -int update_inode(struct inode *, struct page *); -int update_inode_page(struct inode *); -int f2fs_write_inode(struct inode *, struct writeback_control *); -void f2fs_evict_inode(struct inode *); -void handle_failed_inode(struct inode *); +void f2fs_set_inode_flags(struct inode *inode); +struct inode *f2fs_iget(struct super_block *sb, unsigned long ino); +struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino); +int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink); +int update_inode(struct inode *inode, struct page *node_page); +int update_inode_page(struct inode *inode); +int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc); +void f2fs_evict_inode(struct inode *inode); +void handle_failed_inode(struct inode *inode); /* * namei.c @@ -1989,40 +2069,47 @@ struct dentry *f2fs_get_parent(struct dentry *child); /* * dir.c */ -void set_de_type(struct f2fs_dir_entry *, umode_t); -unsigned char get_de_type(struct f2fs_dir_entry *); -struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *, - f2fs_hash_t, int *, struct f2fs_dentry_ptr *); -int f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, - unsigned int, struct fscrypt_str *); -void do_make_empty_dir(struct inode *, struct inode *, - struct f2fs_dentry_ptr *); -struct page *init_inode_metadata(struct inode *, struct inode *, - const struct qstr *, const struct qstr *, struct page *); -void update_parent_metadata(struct inode *, struct inode *, unsigned int); -int room_for_filename(const void *, int, int); -void f2fs_drop_nlink(struct inode *, struct inode *); -struct f2fs_dir_entry *__f2fs_find_entry(struct inode *, struct fscrypt_name *, - struct page **); -struct f2fs_dir_entry *f2fs_find_entry(struct inode *, const struct qstr *, - struct page **); -struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **); -ino_t f2fs_inode_by_name(struct inode *, const struct qstr *, struct page **); -void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, - struct page *, struct inode *); -int update_dent_inode(struct inode *, struct inode *, const struct qstr *); -void f2fs_update_dentry(nid_t 
ino, umode_t mode, struct f2fs_dentry_ptr *, - const struct qstr *, f2fs_hash_t , unsigned int); -int f2fs_add_regular_entry(struct inode *, const struct qstr *, - const struct qstr *, struct inode *, nid_t, umode_t); -int __f2fs_do_add_link(struct inode *, struct fscrypt_name*, struct inode *, - nid_t, umode_t); -int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t, - umode_t); -void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *, - struct inode *); -int f2fs_do_tmpfile(struct inode *, struct inode *); -bool f2fs_empty_dir(struct inode *); +void set_de_type(struct f2fs_dir_entry *de, umode_t mode); +unsigned char get_de_type(struct f2fs_dir_entry *de); +struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, + f2fs_hash_t namehash, int *max_slots, + struct f2fs_dentry_ptr *d); +int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, + unsigned int start_pos, struct fscrypt_str *fstr); +void do_make_empty_dir(struct inode *inode, struct inode *parent, + struct f2fs_dentry_ptr *d); +struct page *init_inode_metadata(struct inode *inode, struct inode *dir, + const struct qstr *new_name, + const struct qstr *orig_name, struct page *dpage); +void update_parent_metadata(struct inode *dir, struct inode *inode, + unsigned int current_depth); +int room_for_filename(const void *bitmap, int slots, int max_slots); +void f2fs_drop_nlink(struct inode *dir, struct inode *inode); +struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, + struct fscrypt_name *fname, struct page **res_page); +struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, + const struct qstr *child, struct page **res_page); +struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p); +ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, + struct page **page); +void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, + struct page *page, struct inode *inode); +int update_dent_inode(struct inode *inode, struct inode *to, + const struct qstr *name); +void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, + const struct qstr *name, f2fs_hash_t name_hash, + unsigned int bit_pos); +int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, + struct inode *inode, nid_t ino, umode_t mode); +int __f2fs_do_add_link(struct inode *dir, struct fscrypt_name *fname, + struct inode *inode, nid_t ino, umode_t mode); +int __f2fs_add_link(struct inode *dir, const struct qstr *name, + struct inode *inode, nid_t ino, umode_t mode); +void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, + struct inode *dir, struct inode *inode); +int f2fs_do_tmpfile(struct inode *inode, struct inode *dir); +bool f2fs_empty_dir(struct inode *dir); static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) { @@ -2033,18 +2120,18 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) /* * super.c */ -int f2fs_inode_dirtied(struct inode *, bool); -void f2fs_inode_synced(struct inode *); -int f2fs_commit_super(struct f2fs_sb_info *, bool); -int f2fs_sync_fs(struct super_block *, int); +int f2fs_inode_dirtied(struct inode *inode, bool sync); +void f2fs_inode_synced(struct inode *inode); +int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); +int f2fs_sync_fs(struct super_block *sb, int sync); extern __printf(3, 4) -void f2fs_msg(struct super_block *, const char *, const char *, ...); +void f2fs_msg(struct 
super_block *sb, const char *level, const char *fmt, ...); int sanity_check_ckpt(struct f2fs_sb_info *sbi); /* * hash.c */ -f2fs_hash_t f2fs_dentry_hash(const struct qstr *); +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info); /* * node.c @@ -2052,163 +2139,183 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *); struct dnode_of_data; struct node_info; -bool available_free_memory(struct f2fs_sb_info *, int); -int need_dentry_mark(struct f2fs_sb_info *, nid_t); -bool is_checkpointed_node(struct f2fs_sb_info *, nid_t); -bool need_inode_block_update(struct f2fs_sb_info *, nid_t); -void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); -pgoff_t get_next_page_offset(struct dnode_of_data *, pgoff_t); -int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); -int truncate_inode_blocks(struct inode *, pgoff_t); -int truncate_xattr_node(struct inode *, struct page *); -int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t); -int remove_inode_page(struct inode *); -struct page *new_inode_page(struct inode *); -struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); -void ra_node_page(struct f2fs_sb_info *, nid_t); -struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); -struct page *get_node_page_ra(struct page *, int); -void move_node_page(struct page *, int); -int fsync_node_pages(struct f2fs_sb_info *, struct inode *, - struct writeback_control *, bool); -int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *); -void build_free_nids(struct f2fs_sb_info *, bool); -bool alloc_nid(struct f2fs_sb_info *, nid_t *); -void alloc_nid_done(struct f2fs_sb_info *, nid_t); -void alloc_nid_failed(struct f2fs_sb_info *, nid_t); -int try_to_free_nids(struct f2fs_sb_info *, int); -void recover_inline_xattr(struct inode *, struct page *); -void recover_xattr_data(struct inode *, struct page *, block_t); -int recover_inode_page(struct f2fs_sb_info *, struct page *); -int restore_node_summary(struct f2fs_sb_info *, unsigned int, - struct f2fs_summary_block *); -void flush_nat_entries(struct f2fs_sb_info *); -int build_node_manager(struct f2fs_sb_info *); -void destroy_node_manager(struct f2fs_sb_info *); +bool available_free_memory(struct f2fs_sb_info *sbi, int type); +int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid); +bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid); +bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino); +void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni); +pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); +int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); +int truncate_inode_blocks(struct inode *inode, pgoff_t from); +int truncate_xattr_node(struct inode *inode, struct page *page); +int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino); +int remove_inode_page(struct inode *inode); +struct page *new_inode_page(struct inode *inode); +struct page *new_node_page(struct dnode_of_data *dn, + unsigned int ofs, struct page *ipage); +void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); +struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); +struct page *get_node_page_ra(struct page *parent, int start); +void move_node_page(struct page *node_page, int gc_type); +int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, + struct writeback_control *wbc, bool atomic); +int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc); +void build_free_nids(struct 
f2fs_sb_info *sbi, bool sync, bool mount); +bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); +void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); +void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); +int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); +void recover_inline_xattr(struct inode *inode, struct page *page); +int recover_xattr_data(struct inode *inode, struct page *page, + block_t blkaddr); +int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); +int restore_node_summary(struct f2fs_sb_info *sbi, + unsigned int segno, struct f2fs_summary_block *sum); +void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); +int build_node_manager(struct f2fs_sb_info *sbi); +void destroy_node_manager(struct f2fs_sb_info *sbi); int __init create_node_manager_caches(void); void destroy_node_manager_caches(void); /* * segment.c */ -void register_inmem_page(struct inode *, struct page *); -void drop_inmem_pages(struct inode *); -int commit_inmem_pages(struct inode *); -void f2fs_balance_fs(struct f2fs_sb_info *, bool); -void f2fs_balance_fs_bg(struct f2fs_sb_info *); -int f2fs_issue_flush(struct f2fs_sb_info *); -int create_flush_cmd_control(struct f2fs_sb_info *); -void destroy_flush_cmd_control(struct f2fs_sb_info *, bool); -void invalidate_blocks(struct f2fs_sb_info *, block_t); -bool is_checkpointed_data(struct f2fs_sb_info *, block_t); -void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); -void f2fs_wait_all_discard_bio(struct f2fs_sb_info *); -void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); -void release_discard_addrs(struct f2fs_sb_info *); -int npages_for_summary_flush(struct f2fs_sb_info *, bool); -void allocate_new_segments(struct f2fs_sb_info *); -int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *); -struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); -void update_meta_page(struct f2fs_sb_info *, void *, block_t); -void write_meta_page(struct f2fs_sb_info *, struct page *); -void write_node_page(unsigned int, struct f2fs_io_info *); -void write_data_page(struct dnode_of_data *, struct f2fs_io_info *); -void rewrite_data_page(struct f2fs_io_info *); -void __f2fs_replace_block(struct f2fs_sb_info *, struct f2fs_summary *, - block_t, block_t, bool, bool); -void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *, - block_t, block_t, unsigned char, bool, bool); -void allocate_data_block(struct f2fs_sb_info *, struct page *, - block_t, block_t *, struct f2fs_summary *, int); -void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool); -void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *, block_t); -void write_data_summaries(struct f2fs_sb_info *, block_t); -void write_node_summaries(struct f2fs_sb_info *, block_t); -int lookup_journal_in_cursum(struct f2fs_journal *, int, unsigned int, int); -void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *); -int build_segment_manager(struct f2fs_sb_info *); -void destroy_segment_manager(struct f2fs_sb_info *); +void register_inmem_page(struct inode *inode, struct page *page); +void drop_inmem_pages(struct inode *inode); +int commit_inmem_pages(struct inode *inode); +void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); +void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi); +int f2fs_issue_flush(struct f2fs_sb_info *sbi); +int create_flush_cmd_control(struct f2fs_sb_info *sbi); +void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); +void invalidate_blocks(struct 
f2fs_sb_info *sbi, block_t addr); +bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); +void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new); +void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr); +void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); +void release_discard_addrs(struct f2fs_sb_info *sbi); +int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); +void allocate_new_segments(struct f2fs_sb_info *sbi); +int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); +bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc); +struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno); +void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr); +void write_meta_page(struct f2fs_sb_info *sbi, struct page *page); +void write_node_page(unsigned int nid, struct f2fs_io_info *fio); +void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio); +void rewrite_data_page(struct f2fs_io_info *fio); +void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, + block_t old_blkaddr, block_t new_blkaddr, + bool recover_curseg, bool recover_newaddr); +void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, + block_t old_addr, block_t new_addr, + unsigned char version, bool recover_curseg, + bool recover_newaddr); +void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, + block_t old_blkaddr, block_t *new_blkaddr, + struct f2fs_summary *sum, int type); +void f2fs_wait_on_page_writeback(struct page *page, + enum page_type type, bool ordered); +void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi, + block_t blkaddr); +void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk); +void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); +int lookup_journal_in_cursum(struct f2fs_journal *journal, int type, + unsigned int val, int alloc); +void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); +int build_segment_manager(struct f2fs_sb_info *sbi); +void destroy_segment_manager(struct f2fs_sb_info *sbi); int __init create_segment_manager_caches(void); void destroy_segment_manager_caches(void); /* * checkpoint.c */ -void f2fs_stop_checkpoint(struct f2fs_sb_info *, bool); -struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); -struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); -struct page *get_tmp_page(struct f2fs_sb_info *, pgoff_t); -bool is_valid_blkaddr(struct f2fs_sb_info *, block_t, int); -int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int, bool); -void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t); -long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); -void add_ino_entry(struct f2fs_sb_info *, nid_t, int type); -void remove_ino_entry(struct f2fs_sb_info *, nid_t, int type); -void release_ino_entry(struct f2fs_sb_info *, bool); -bool exist_written_data(struct f2fs_sb_info *, nid_t, int); -int f2fs_sync_inode_meta(struct f2fs_sb_info *); -int acquire_orphan_inode(struct f2fs_sb_info *); -void release_orphan_inode(struct f2fs_sb_info *); -void add_orphan_inode(struct inode *); -void remove_orphan_inode(struct f2fs_sb_info *, nid_t); -int recover_orphan_inodes(struct f2fs_sb_info *); -int get_valid_checkpoint(struct f2fs_sb_info *); -void update_dirty_page(struct inode *, struct page *); -void remove_dirty_inode(struct inode *); -int sync_dirty_inodes(struct f2fs_sb_info *, enum 
inode_type); -int write_checkpoint(struct f2fs_sb_info *, struct cp_control *); -void init_ino_entry_info(struct f2fs_sb_info *); +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io); +struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); +struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); +struct page *get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index); +bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); +int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, + int type, bool sync); +void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index); +long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, + long nr_to_write); +void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); +void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); +void release_ino_entry(struct f2fs_sb_info *sbi, bool all); +bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode); +int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi); +int acquire_orphan_inode(struct f2fs_sb_info *sbi); +void release_orphan_inode(struct f2fs_sb_info *sbi); +void add_orphan_inode(struct inode *inode); +void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino); +int recover_orphan_inodes(struct f2fs_sb_info *sbi); +int get_valid_checkpoint(struct f2fs_sb_info *sbi); +void update_dirty_page(struct inode *inode, struct page *page); +void remove_dirty_inode(struct inode *inode); +int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type); +int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); +void init_ino_entry_info(struct f2fs_sb_info *sbi); int __init create_checkpoint_caches(void); void destroy_checkpoint_caches(void); /* * data.c */ -void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int); -void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *, - struct page *, nid_t, enum page_type, int); -void f2fs_flush_merged_bios(struct f2fs_sb_info *); -int f2fs_submit_page_bio(struct f2fs_io_info *); -void f2fs_submit_page_mbio(struct f2fs_io_info *); -struct block_device *f2fs_target_device(struct f2fs_sb_info *, - block_t, struct bio *); -int f2fs_target_device_index(struct f2fs_sb_info *, block_t); -void set_data_blkaddr(struct dnode_of_data *); -void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t); -int reserve_new_blocks(struct dnode_of_data *, blkcnt_t); -int reserve_new_block(struct dnode_of_data *); -int f2fs_get_block(struct dnode_of_data *, pgoff_t); -int f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); -int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); -struct page *get_read_data_page(struct inode *, pgoff_t, int, bool); -struct page *find_data_page(struct inode *, pgoff_t); -struct page *get_lock_data_page(struct inode *, pgoff_t, bool); -struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); -int do_write_data_page(struct f2fs_io_info *); -int f2fs_map_blocks(struct inode *, struct f2fs_map_blocks *, int, int); -int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); -void f2fs_set_page_dirty_nobuffers(struct page *); -void f2fs_invalidate_page(struct page *, unsigned int, unsigned int); -int f2fs_release_page(struct page *, gfp_t); +void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, enum page_type type, + int rw); +void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *sbi, + struct inode *inode, nid_t ino, pgoff_t idx, + enum page_type type, int rw); 
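/*
 * Editor's note (annotation added for readability; not part of the patch):
 * the f2fs_submit_merged_bio_cond() prototype just above now identifies the
 * page of interest by an (nid_t ino, pgoff_t idx) pair instead of a
 * struct page pointer, presumably so callers can check for pending merged
 * write IO on an inode/offset without having to hold the page itself.
 */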
+void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi); +int f2fs_submit_page_bio(struct f2fs_io_info *fio); +int f2fs_submit_page_mbio(struct f2fs_io_info *fio); +struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, + block_t blk_addr, struct bio *bio); +int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr); +void set_data_blkaddr(struct dnode_of_data *dn); +void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); +int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count); +int reserve_new_block(struct dnode_of_data *dn); +int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); +int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from); +int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); +struct page *get_read_data_page(struct inode *inode, pgoff_t index, + int op_flags, bool for_write); +struct page *find_data_page(struct inode *inode, pgoff_t index); +struct page *get_lock_data_page(struct inode *inode, pgoff_t index, + bool for_write); +struct page *get_new_data_page(struct inode *inode, + struct page *ipage, pgoff_t index, bool new_i_size); +int do_write_data_page(struct f2fs_io_info *fio); +int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, + int create, int flag); +int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len); +void f2fs_set_page_dirty_nobuffers(struct page *page); +void f2fs_invalidate_page(struct page *page, unsigned int offset, + unsigned int length); +int f2fs_release_page(struct page *page, gfp_t wait); #ifdef CONFIG_MIGRATION -int f2fs_migrate_page(struct address_space *, struct page *, struct page *, - enum migrate_mode); +int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, + struct page *page, enum migrate_mode mode); #endif /* * gc.c */ -int start_gc_thread(struct f2fs_sb_info *); -void stop_gc_thread(struct f2fs_sb_info *); -block_t start_bidx_of_node(unsigned int, struct inode *); -int f2fs_gc(struct f2fs_sb_info *, bool, bool); -void build_gc_manager(struct f2fs_sb_info *); +int start_gc_thread(struct f2fs_sb_info *sbi); +void stop_gc_thread(struct f2fs_sb_info *sbi); +block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode); +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background); +void build_gc_manager(struct f2fs_sb_info *sbi); /* * recovery.c */ -int recover_fsync_data(struct f2fs_sb_info *, bool); -bool space_for_roll_forward(struct f2fs_sb_info *); +int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only); +bool space_for_roll_forward(struct f2fs_sb_info *sbi); /* * debug.c @@ -2227,8 +2334,9 @@ struct f2fs_stat_info { unsigned int ndirty_dirs, ndirty_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; int total_count, utilization; - int bg_gc, nr_wb_cp_data, nr_wb_data; - int inline_xattr, inline_inode, inline_dir, orphans; + int bg_gc, nr_wb_cp_data, nr_wb_data, nr_flush, nr_discard; + int inline_xattr, inline_inode, inline_dir, append, update, orphans; + int aw_cnt, max_aw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; @@ -2300,6 +2408,17 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) ((sbi)->block_count[(curseg)->alloc_type]++) #define stat_inc_inplace_blocks(sbi) \ (atomic_inc(&(sbi)->inplace_count)) +#define stat_inc_atomic_write(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->aw_cnt)) 
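/*
 * Editor's note -- illustrative sketch, not part of the patch: the new
 * aw_cnt/max_aw_cnt statistics count files currently open for atomic writes
 * and remember the peak value seen, via the stat_*_atomic_write() macros
 * shown here.  Below is a minimal userspace analogue of that high-water-mark
 * pattern, assuming C11 <stdatomic.h>; the function names are invented for
 * the example.  As in the kernel macro, the read/compare/store pair is not
 * atomic as a whole, which is acceptable for debugging counters.
 */
#include <stdatomic.h>

static atomic_int aw_cnt;	/* current # of atomic-write files */
static atomic_int max_aw_cnt;	/* peak value, for statistics only */

static void start_atomic_write(void)
{
	int cur, max;

	atomic_fetch_add(&aw_cnt, 1);
	cur = atomic_load(&aw_cnt);
	max = atomic_load(&max_aw_cnt);
	/* racy high-water-mark update: good enough for debug statistics */
	if (cur > max)
		atomic_store(&max_aw_cnt, cur);
}

static void end_atomic_write(void)
{
	atomic_fetch_sub(&aw_cnt, 1);
}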
+#define stat_dec_atomic_write(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->aw_cnt)) +#define stat_update_max_atomic_write(inode) \ + do { \ + int cur = atomic_read(&F2FS_I_SB(inode)->aw_cnt); \ + int max = atomic_read(&F2FS_I_SB(inode)->max_aw_cnt); \ + if (cur > max) \ + atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ + } while (0) #define stat_inc_seg_count(sbi, type, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ @@ -2332,8 +2451,8 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) si->bg_node_blks += (gc_type == BG_GC) ? (blks) : 0; \ } while (0) -int f2fs_build_stats(struct f2fs_sb_info *); -void f2fs_destroy_stats(struct f2fs_sb_info *); +int f2fs_build_stats(struct f2fs_sb_info *sbi); +void f2fs_destroy_stats(struct f2fs_sb_info *sbi); int __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else @@ -2353,6 +2472,9 @@ void f2fs_destroy_root_stats(void); #define stat_dec_inline_inode(inode) #define stat_inc_inline_dir(inode) #define stat_dec_inline_dir(inode) +#define stat_inc_atomic_write(inode) +#define stat_dec_atomic_write(inode) +#define stat_update_max_atomic_write(inode) #define stat_inc_seg_type(sbi, curseg) #define stat_inc_block_count(sbi, curseg) #define stat_inc_inplace_blocks(sbi) @@ -2382,49 +2504,55 @@ extern struct kmem_cache *inode_entry_slab; /* * inline.c */ -bool f2fs_may_inline_data(struct inode *); -bool f2fs_may_inline_dentry(struct inode *); -void read_inline_data(struct page *, struct page *); -bool truncate_inline_inode(struct page *, u64); -int f2fs_read_inline_data(struct inode *, struct page *); -int f2fs_convert_inline_page(struct dnode_of_data *, struct page *); -int f2fs_convert_inline_inode(struct inode *); -int f2fs_write_inline_data(struct inode *, struct page *); -bool recover_inline_data(struct inode *, struct page *); -struct f2fs_dir_entry *find_in_inline_dir(struct inode *, - struct fscrypt_name *, struct page **); -int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *); -int f2fs_add_inline_entry(struct inode *, const struct qstr *, - const struct qstr *, struct inode *, nid_t, umode_t); -void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *, - struct inode *, struct inode *); -bool f2fs_empty_inline_dir(struct inode *); -int f2fs_read_inline_dir(struct file *, struct dir_context *, - struct fscrypt_str *); -int f2fs_inline_data_fiemap(struct inode *, - struct fiemap_extent_info *, __u64, __u64); +bool f2fs_may_inline_data(struct inode *inode); +bool f2fs_may_inline_dentry(struct inode *inode); +void read_inline_data(struct page *page, struct page *ipage); +bool truncate_inline_inode(struct page *ipage, u64 from); +int f2fs_read_inline_data(struct inode *inode, struct page *page); +int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page); +int f2fs_convert_inline_inode(struct inode *inode); +int f2fs_write_inline_data(struct inode *inode, struct page *page); +bool recover_inline_data(struct inode *inode, struct page *npage); +struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, + struct fscrypt_name *fname, struct page **res_page); +int make_empty_inline_dir(struct inode *inode, struct inode *parent, + struct page *ipage); +int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, + struct inode *inode, nid_t ino, umode_t mode); +void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, + struct inode *dir, struct inode *inode); +bool 
f2fs_empty_inline_dir(struct inode *dir); +int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, + struct fscrypt_str *fstr); +int f2fs_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); /* * shrinker.c */ -unsigned long f2fs_shrink_count(struct shrinker *, struct shrink_control *); -unsigned long f2fs_shrink_scan(struct shrinker *, struct shrink_control *); -void f2fs_join_shrinker(struct f2fs_sb_info *); -void f2fs_leave_shrinker(struct f2fs_sb_info *); +unsigned long f2fs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc); +unsigned long f2fs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc); +void f2fs_join_shrinker(struct f2fs_sb_info *sbi); +void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); /* * extent_cache.c */ -unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int); -bool f2fs_init_extent_tree(struct inode *, struct f2fs_extent *); -void f2fs_drop_extent_tree(struct inode *); -unsigned int f2fs_destroy_extent_node(struct inode *); -void f2fs_destroy_extent_tree(struct inode *); -bool f2fs_lookup_extent_cache(struct inode *, pgoff_t, struct extent_info *); -void f2fs_update_extent_cache(struct dnode_of_data *); +unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); +bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext); +void f2fs_drop_extent_tree(struct inode *inode); +unsigned int f2fs_destroy_extent_node(struct inode *inode); +void f2fs_destroy_extent_tree(struct inode *inode); +bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei); +void f2fs_update_extent_cache(struct dnode_of_data *dn); void f2fs_update_extent_cache_range(struct dnode_of_data *dn, - pgoff_t, block_t, unsigned int); -void init_extent_cache_info(struct f2fs_sb_info *); + pgoff_t fofs, block_t blkaddr, unsigned int len); +void init_extent_cache_info(struct f2fs_sb_info *sbi); int __init create_extent_cache(void); void destroy_extent_cache(void); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 1edc86e..5f73178 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -20,6 +20,7 @@ #include <linux/uaccess.h> #include <linux/mount.h> #include <linux/pagevec.h> +#include <linux/uio.h> #include <linux/uuid.h> #include <linux/file.h> @@ -140,8 +141,6 @@ static inline bool need_do_checkpoint(struct inode *inode) need_cp = true; else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) need_cp = true; - else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi))) - need_cp = true; else if (test_opt(sbi, FASTBOOT)) need_cp = true; else if (sbi->active_logs == 2) @@ -167,7 +166,6 @@ static void try_to_fix_pino(struct inode *inode) nid_t pino; down_write(&fi->i_sem); - fi->xattr_ver = 0; if (file_wrong_pino(inode) && inode->i_nlink == 1 && get_parent_ino(inode, &pino)) { f2fs_i_pino_write(inode, pino); @@ -276,7 +274,8 @@ sync_nodes: flush_out: remove_ino_entry(sbi, ino, UPDATE_INO); clear_inode_flag(inode, FI_UPDATE_WRITE); - ret = f2fs_issue_flush(sbi); + if (!atomic) + ret = f2fs_issue_flush(sbi); f2fs_update_time(sbi, REQ_TIME); out: trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); @@ -567,8 +566,9 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) } if (f2fs_has_inline_data(inode)) { - if (truncate_inline_inode(ipage, from)) - set_page_dirty(ipage); + truncate_inline_inode(ipage, from); + if (from == 0) + clear_inode_flag(inode, FI_DATA_EXIST); f2fs_put_page(ipage, 1); truncate_page = true; 
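/*
 * Editor's note (annotation only, not part of the patch): the
 * truncate_blocks() hunk just above stops keying off the return value of
 * truncate_inline_inode(); it now always truncates the inline area and, when
 * truncating to offset 0, clears FI_DATA_EXIST so the inode no longer claims
 * to carry inline data.
 */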
goto out; @@ -633,10 +633,10 @@ int f2fs_truncate(struct inode *inode) return 0; } -int f2fs_getattr(struct vfsmount *mnt, - struct dentry *dentry, struct kstat *stat) +int f2fs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { - struct inode *inode = d_inode(dentry); + struct inode *inode = d_inode(path->dentry); generic_fillattr(inode, stat); stat->blocks <<= 3; return 0; @@ -1541,6 +1541,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (ret) clear_inode_flag(inode, FI_ATOMIC_FILE); out: + stat_inc_atomic_write(inode); + stat_update_max_atomic_write(inode); inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -1564,15 +1566,18 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) goto err_out; if (f2fs_is_atomic_file(inode)) { - clear_inode_flag(inode, FI_ATOMIC_FILE); ret = commit_inmem_pages(inode); - if (ret) { - set_inode_flag(inode, FI_ATOMIC_FILE); + if (ret) goto err_out; + + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + if (!ret) { + clear_inode_flag(inode, FI_ATOMIC_FILE); + stat_dec_atomic_write(inode); } + } else { + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); } - - ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); err_out: inode_unlock(inode); mnt_drop_write_file(filp); @@ -1870,7 +1875,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, { struct inode *inode = file_inode(filp); struct f2fs_map_blocks map = { .m_next_pgofs = NULL }; - struct extent_info ei; + struct extent_info ei = {0,0,0}; pgoff_t pg_start, pg_end; unsigned int blk_per_seg = sbi->blocks_per_seg; unsigned int total = 0, sec_num; @@ -2250,8 +2255,12 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) { - int err = f2fs_preallocate_blocks(iocb, from); + int err; + + if (iov_iter_fault_in_readable(from, iov_iter_count(from))) + set_inode_flag(inode, FI_NO_PREALLOC); + err = f2fs_preallocate_blocks(iocb, from); if (err) { inode_unlock(inode); return err; @@ -2259,6 +2268,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) blk_start_plug(&plug); ret = __generic_file_write_iter(iocb, from); blk_finish_plug(&plug); + clear_inode_flag(inode, FI_NO_PREALLOC); } inode_unlock(inode); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 88bfc3d..418fd98 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -48,8 +48,10 @@ static int gc_thread_func(void *data) } #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_CHECKPOINT)) + if (time_to_inject(sbi, FAULT_CHECKPOINT)) { + f2fs_show_injection_info(FAULT_CHECKPOINT); f2fs_stop_checkpoint(sbi, false); + } #endif /* @@ -166,7 +168,8 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, p->ofs_unit = sbi->segs_per_sec; } - if (p->max_search > sbi->max_victim_search) + /* we need to check every dirty segments in the FG_GC case */ + if (gc_type != FG_GC && p->max_search > sbi->max_victim_search) p->max_search = sbi->max_victim_search; p->offset = sbi->last_victim[p->gc_mode]; @@ -199,6 +202,10 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) { if (sec_usage_check(sbi, secno)) continue; + + if (no_fggc_candidate(sbi, secno)) + continue; + clear_bit(secno, dirty_i->victim_secmap); return secno * sbi->segs_per_sec; } @@ -237,6 +244,16 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) return UINT_MAX - ((100 
* (100 - u) * age) / (100 + u)); } +static unsigned int get_greedy_cost(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + unsigned int valid_blocks = + get_valid_blocks(sbi, segno, sbi->segs_per_sec); + + return IS_DATASEG(get_seg_entry(sbi, segno)->type) ? + valid_blocks * 2 : valid_blocks; +} + static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, struct victim_sel_policy *p) { @@ -245,7 +262,7 @@ static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, /* alloc_mode == LFS */ if (p->gc_mode == GC_GREEDY) - return get_valid_blocks(sbi, segno, sbi->segs_per_sec); + return get_greedy_cost(sbi, segno); else return get_cb_cost(sbi, segno); } @@ -322,13 +339,15 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, nsearched++; } - secno = GET_SECNO(sbi, segno); if (sec_usage_check(sbi, secno)) goto next; if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) goto next; + if (gc_type == FG_GC && p.alloc_mode == LFS && + no_fggc_candidate(sbi, secno)) + goto next; cost = get_gc_cost(sbi, segno, &p); @@ -569,6 +588,9 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, if (!check_valid_map(F2FS_I_SB(inode), segno, off)) goto out; + if (f2fs_is_atomic_file(inode)) + goto out; + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE); if (err) @@ -661,6 +683,9 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, if (!check_valid_map(F2FS_I_SB(inode), segno, off)) goto out; + if (f2fs_is_atomic_file(inode)) + goto out; + if (gc_type == BG_GC) { if (PageWriteback(page)) goto out; @@ -921,8 +946,6 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background) cpc.reason = __get_cp_reason(sbi); gc_more: - segno = NULL_SEGNO; - if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) goto stop; if (unlikely(f2fs_cp_error(sbi))) { @@ -930,30 +953,23 @@ gc_more: goto stop; } - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed, 0)) { - gc_type = FG_GC; + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) { /* - * If there is no victim and no prefree segment but still not - * enough free sections, we should flush dent/node blocks and do - * garbage collections. + * For example, if there are many prefree_segments below given + * threshold, we can make them free by checkpoint. Then, we + * secure free segments which doesn't need fggc any more. */ - if (__get_victim(sbi, &segno, gc_type) || - prefree_segments(sbi)) { - ret = write_checkpoint(sbi, &cpc); - if (ret) - goto stop; - segno = NULL_SEGNO; - } else if (has_not_enough_free_secs(sbi, 0, 0)) { - ret = write_checkpoint(sbi, &cpc); - if (ret) - goto stop; - } - } else if (gc_type == BG_GC && !background) { - /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */ - goto stop; + ret = write_checkpoint(sbi, &cpc); + if (ret) + goto stop; + if (has_not_enough_free_secs(sbi, 0, 0)) + gc_type = FG_GC; } - if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type)) + /* f2fs_balance_fs doesn't need to do BG_GC in critical path. 
*/ + if (gc_type == BG_GC && !background) + goto stop; + if (!__get_victim(sbi, &segno, gc_type)) goto stop; ret = 0; @@ -983,5 +999,16 @@ stop: void build_gc_manager(struct f2fs_sb_info *sbi) { + u64 main_count, resv_count, ovp_count, blocks_per_sec; + DIRTY_I(sbi)->v_ops = &default_v_ops; + + /* threshold of # of valid blocks in a section for victims of FG_GC */ + main_count = SM_I(sbi)->main_segments << sbi->log_blocks_per_seg; + resv_count = SM_I(sbi)->reserved_segments << sbi->log_blocks_per_seg; + ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg; + blocks_per_sec = sbi->blocks_per_seg * sbi->segs_per_sec; + + sbi->fggc_threshold = div64_u64((main_count - ovp_count) * blocks_per_sec, + (main_count - resv_count)); } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index af06bda..24bb821 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -373,8 +373,10 @@ void f2fs_evict_inode(struct inode *inode) goto no_delete; #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_EVICT_INODE)) + if (time_to_inject(sbi, FAULT_EVICT_INODE)) { + f2fs_show_injection_info(FAULT_EVICT_INODE); goto no_delete; + } #endif remove_ino_entry(sbi, inode->i_ino, APPEND_INO); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 11cabca..98f00a3 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -321,9 +321,9 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, if (err) goto err_out; } - if (!IS_ERR(inode) && f2fs_encrypted_inode(dir) && - (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && - !fscrypt_has_permitted_context(dir, inode)) { + if (f2fs_encrypted_inode(dir) && + (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && + !fscrypt_has_permitted_context(dir, inode)) { bool nokey = f2fs_encrypted_inode(inode) && !fscrypt_has_encryption_key(inode); err = nokey ? 
-ENOKEY : -EPERM; @@ -663,6 +663,12 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, bool is_old_inline = f2fs_has_inline_dentry(old_dir); int err = -ENOENT; + if ((f2fs_encrypted_inode(old_dir) && + !fscrypt_has_encryption_key(old_dir)) || + (f2fs_encrypted_inode(new_dir) && + !fscrypt_has_encryption_key(new_dir))) + return -ENOKEY; + if ((old_dir != new_dir) && f2fs_encrypted_inode(new_dir) && !fscrypt_has_permitted_context(new_dir, old_inode)) { err = -EPERM; @@ -843,6 +849,12 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, int old_nlink = 0, new_nlink = 0; int err = -ENOENT; + if ((f2fs_encrypted_inode(old_dir) && + !fscrypt_has_encryption_key(old_dir)) || + (f2fs_encrypted_inode(new_dir) && + !fscrypt_has_encryption_key(new_dir))) + return -ENOKEY; + if ((f2fs_encrypted_inode(old_dir) || f2fs_encrypted_inode(new_dir)) && (old_dir != new_dir) && (!fscrypt_has_permitted_context(new_dir, old_inode) || diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b9078fd..9496717 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -245,12 +245,24 @@ bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) return need_update; } -static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) +static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid, + bool no_fail) { struct nat_entry *new; - new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_NOFS); - f2fs_radix_tree_insert(&nm_i->nat_root, nid, new); + if (no_fail) { + new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_NOFS); + f2fs_radix_tree_insert(&nm_i->nat_root, nid, new); + } else { + new = kmem_cache_alloc(nat_entry_slab, GFP_NOFS); + if (!new) + return NULL; + if (radix_tree_insert(&nm_i->nat_root, nid, new)) { + kmem_cache_free(nat_entry_slab, new); + return NULL; + } + } + memset(new, 0, sizeof(struct nat_entry)); nat_set_nid(new, nid); nat_reset_flag(new); @@ -267,8 +279,9 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, e = __lookup_nat_cache(nm_i, nid); if (!e) { - e = grab_nat_entry(nm_i, nid); - node_info_from_raw_nat(&e->ni, ne); + e = grab_nat_entry(nm_i, nid, false); + if (e) + node_info_from_raw_nat(&e->ni, ne); } else { f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) || nat_get_blkaddr(e) != @@ -286,7 +299,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ni->nid); if (!e) { - e = grab_nat_entry(nm_i, ni->nid); + e = grab_nat_entry(nm_i, ni->nid, true); copy_node_info(&e->ni, ni); f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR); } else if (new_blkaddr == NEW_ADDR) { @@ -325,6 +338,9 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, set_nat_flag(e, IS_CHECKPOINTED, false); __set_nat_cache_dirty(nm_i, e); + if (enabled_nat_bits(sbi, NULL) && new_blkaddr == NEW_ADDR) + clear_bit_le(NAT_BLOCK_OFFSET(ni->nid), nm_i->empty_nat_bits); + /* update fsync_mark if its inode nat entry is still alive */ if (ni->nid != ni->ino) e = __lookup_nat_cache(nm_i, ni->ino); @@ -958,9 +974,6 @@ int truncate_xattr_node(struct inode *inode, struct page *page) f2fs_i_xnid_write(inode, 0); - /* need to do checkpoint during fsync */ - F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi)); - set_new_dnode(&dn, inode, page, npage, nid); if (page) @@ -1018,7 +1031,7 @@ struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); - struct 
node_info old_ni, new_ni; + struct node_info new_ni; struct page *page; int err; @@ -1033,13 +1046,15 @@ struct page *new_node_page(struct dnode_of_data *dn, err = -ENOSPC; goto fail; } - - get_node_info(sbi, dn->nid, &old_ni); - - /* Reinitialize old_ni with new node page */ - f2fs_bug_on(sbi, old_ni.blk_addr != NULL_ADDR); - new_ni = old_ni; +#ifdef CONFIG_F2FS_CHECK_FS + get_node_info(sbi, dn->nid, &new_ni); + f2fs_bug_on(sbi, new_ni.blk_addr != NULL_ADDR); +#endif + new_ni.nid = dn->nid; new_ni.ino = dn->inode->i_ino; + new_ni.blk_addr = NULL_ADDR; + new_ni.flag = 0; + new_ni.version = 0; set_node_addr(sbi, &new_ni, NEW_ADDR, false); f2fs_wait_on_page_writeback(page, NODE, true); @@ -1305,16 +1320,99 @@ continue_unlock: return last_page; } +static int __write_node_page(struct page *page, bool atomic, bool *submitted, + struct writeback_control *wbc) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(page); + nid_t nid; + struct node_info ni; + struct f2fs_io_info fio = { + .sbi = sbi, + .type = NODE, + .op = REQ_OP_WRITE, + .op_flags = wbc_to_write_flags(wbc), + .page = page, + .encrypted_page = NULL, + .submitted = false, + }; + + trace_f2fs_writepage(page, NODE); + + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto redirty_out; + if (unlikely(f2fs_cp_error(sbi))) + goto redirty_out; + + /* get old block addr of this node page */ + nid = nid_of_node(page); + f2fs_bug_on(sbi, page->index != nid); + + if (wbc->for_reclaim) { + if (!down_read_trylock(&sbi->node_write)) + goto redirty_out; + } else { + down_read(&sbi->node_write); + } + + get_node_info(sbi, nid, &ni); + + /* This page is already truncated */ + if (unlikely(ni.blk_addr == NULL_ADDR)) { + ClearPageUptodate(page); + dec_page_count(sbi, F2FS_DIRTY_NODES); + up_read(&sbi->node_write); + unlock_page(page); + return 0; + } + + if (atomic && !test_opt(sbi, NOBARRIER)) + fio.op_flags |= REQ_PREFLUSH | REQ_FUA; + + set_page_writeback(page); + fio.old_blkaddr = ni.blk_addr; + write_node_page(nid, &fio); + set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); + dec_page_count(sbi, F2FS_DIRTY_NODES); + up_read(&sbi->node_write); + + if (wbc->for_reclaim) { + f2fs_submit_merged_bio_cond(sbi, page->mapping->host, 0, + page->index, NODE, WRITE); + submitted = NULL; + } + + unlock_page(page); + + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_submit_merged_bio(sbi, NODE, WRITE); + submitted = NULL; + } + if (submitted) + *submitted = fio.submitted; + + return 0; + +redirty_out: + redirty_page_for_writepage(wbc, page); + return AOP_WRITEPAGE_ACTIVATE; +} + +static int f2fs_write_node_page(struct page *page, + struct writeback_control *wbc) +{ + return __write_node_page(page, false, NULL, wbc); +} + int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic) { pgoff_t index, end; + pgoff_t last_idx = ULONG_MAX; struct pagevec pvec; int ret = 0; struct page *last_page = NULL; bool marked = false; nid_t ino = inode->i_ino; - int nwritten = 0; if (atomic) { last_page = last_fsync_dnode(sbi, ino); @@ -1336,6 +1434,7 @@ retry: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + bool submitted = false; if (unlikely(f2fs_cp_error(sbi))) { f2fs_put_page(last_page, 0); @@ -1384,13 +1483,15 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = NODE_MAPPING(sbi)->a_ops->writepage(page, wbc); + ret = __write_node_page(page, atomic && + page == last_page, + &submitted, wbc); if (ret) { unlock_page(page); f2fs_put_page(last_page, 0); break; - } 
else { - nwritten++; + } else if (submitted) { + last_idx = page->index; } if (page == last_page) { @@ -1416,8 +1517,9 @@ continue_unlock: goto retry; } out: - if (nwritten) - f2fs_submit_merged_bio_cond(sbi, NULL, NULL, ino, NODE, WRITE); + if (last_idx != ULONG_MAX) + f2fs_submit_merged_bio_cond(sbi, NULL, ino, last_idx, + NODE, WRITE); return ret ? -EIO: 0; } @@ -1445,6 +1547,7 @@ next_step: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + bool submitted = false; if (unlikely(f2fs_cp_error(sbi))) { pagevec_release(&pvec); @@ -1498,9 +1601,10 @@ continue_unlock: set_fsync_mark(page, 0); set_dentry_mark(page, 0); - if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc)) + ret = __write_node_page(page, false, &submitted, wbc); + if (ret) unlock_page(page); - else + else if (submitted) nwritten++; if (--wbc->nr_to_write == 0) @@ -1564,72 +1668,6 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) return ret; } -static int f2fs_write_node_page(struct page *page, - struct writeback_control *wbc) -{ - struct f2fs_sb_info *sbi = F2FS_P_SB(page); - nid_t nid; - struct node_info ni; - struct f2fs_io_info fio = { - .sbi = sbi, - .type = NODE, - .op = REQ_OP_WRITE, - .op_flags = wbc_to_write_flags(wbc), - .page = page, - .encrypted_page = NULL, - }; - - trace_f2fs_writepage(page, NODE); - - if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) - goto redirty_out; - if (unlikely(f2fs_cp_error(sbi))) - goto redirty_out; - - /* get old block addr of this node page */ - nid = nid_of_node(page); - f2fs_bug_on(sbi, page->index != nid); - - if (wbc->for_reclaim) { - if (!down_read_trylock(&sbi->node_write)) - goto redirty_out; - } else { - down_read(&sbi->node_write); - } - - get_node_info(sbi, nid, &ni); - - /* This page is already truncated */ - if (unlikely(ni.blk_addr == NULL_ADDR)) { - ClearPageUptodate(page); - dec_page_count(sbi, F2FS_DIRTY_NODES); - up_read(&sbi->node_write); - unlock_page(page); - return 0; - } - - set_page_writeback(page); - fio.old_blkaddr = ni.blk_addr; - write_node_page(nid, &fio); - set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); - dec_page_count(sbi, F2FS_DIRTY_NODES); - up_read(&sbi->node_write); - - if (wbc->for_reclaim) - f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, NODE, WRITE); - - unlock_page(page); - - if (unlikely(f2fs_cp_error(sbi))) - f2fs_submit_merged_bio(sbi, NODE, WRITE); - - return 0; - -redirty_out: - redirty_page_for_writepage(wbc, page); - return AOP_WRITEPAGE_ACTIVATE; -} - static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { @@ -1727,7 +1765,8 @@ static void __remove_nid_from_list(struct f2fs_sb_info *sbi, radix_tree_delete(&nm_i->free_nid_root, i->nid); } -static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) +/* return if the nid is recognized as free */ +static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; @@ -1736,14 +1775,14 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) /* 0 nid should not be used */ if (unlikely(nid == 0)) - return 0; + return false; if (build) { /* do not add allocated nids */ ne = __lookup_nat_cache(nm_i, nid); if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || nat_get_blkaddr(ne) != NULL_ADDR)) - return 0; + return false; } i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS); @@ -1752,7 +1791,7 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) if (radix_tree_preload(GFP_NOFS)) { 
kmem_cache_free(free_nid_slab, i); - return 0; + return true; } spin_lock(&nm_i->nid_list_lock); @@ -1761,9 +1800,9 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) radix_tree_preload_end(); if (err) { kmem_cache_free(free_nid_slab, i); - return 0; + return true; } - return 1; + return true; } static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) @@ -1784,17 +1823,36 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) kmem_cache_free(free_nid_slab, i); } +void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, bool set) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid); + unsigned int nid_ofs = nid - START_NID(nid); + + if (!test_bit_le(nat_ofs, nm_i->nat_block_bitmap)) + return; + + if (set) + set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + else + clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); +} + static void scan_nat_page(struct f2fs_sb_info *sbi, struct page *nat_page, nid_t start_nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct f2fs_nat_block *nat_blk = page_address(nat_page); block_t blk_addr; + unsigned int nat_ofs = NAT_BLOCK_OFFSET(start_nid); int i; + set_bit_le(nat_ofs, nm_i->nat_block_bitmap); + i = start_nid % NAT_ENTRY_PER_BLOCK; for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) { + bool freed = false; if (unlikely(start_nid >= nm_i->max_nid)) break; @@ -1802,11 +1860,106 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); f2fs_bug_on(sbi, blk_addr == NEW_ADDR); if (blk_addr == NULL_ADDR) - add_free_nid(sbi, start_nid, true); + freed = add_free_nid(sbi, start_nid, true); + update_free_nid_bitmap(sbi, start_nid, freed); + } +} + +static void scan_free_nid_bits(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); + struct f2fs_journal *journal = curseg->journal; + unsigned int i, idx; + + down_read(&nm_i->nat_tree_lock); + + for (i = 0; i < nm_i->nat_blocks; i++) { + if (!test_bit_le(i, nm_i->nat_block_bitmap)) + continue; + for (idx = 0; idx < NAT_ENTRY_PER_BLOCK; idx++) { + nid_t nid; + + if (!test_bit_le(idx, nm_i->free_nid_bitmap[i])) + continue; + + nid = i * NAT_ENTRY_PER_BLOCK + idx; + add_free_nid(sbi, nid, true); + + if (nm_i->nid_cnt[FREE_NID_LIST] >= MAX_FREE_NIDS) + goto out; + } + } +out: + down_read(&curseg->journal_rwsem); + for (i = 0; i < nats_in_cursum(journal); i++) { + block_t addr; + nid_t nid; + + addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); + nid = le32_to_cpu(nid_in_journal(journal, i)); + if (addr == NULL_ADDR) + add_free_nid(sbi, nid, true); + else + remove_free_nid(sbi, nid); } + up_read(&curseg->journal_rwsem); + up_read(&nm_i->nat_tree_lock); } -static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) +static int scan_nat_bits(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct page *page; + unsigned int i = 0; + nid_t nid; + + if (!enabled_nat_bits(sbi, NULL)) + return -EAGAIN; + + down_read(&nm_i->nat_tree_lock); +check_empty: + i = find_next_bit_le(nm_i->empty_nat_bits, nm_i->nat_blocks, i); + if (i >= nm_i->nat_blocks) { + i = 0; + goto check_partial; + } + + for (nid = i * NAT_ENTRY_PER_BLOCK; nid < (i + 1) * NAT_ENTRY_PER_BLOCK; + nid++) { + if (unlikely(nid >= nm_i->max_nid)) + break; + add_free_nid(sbi, nid, true); + } + + if (nm_i->nid_cnt[FREE_NID_LIST] >= MAX_FREE_NIDS) + goto out; + i++; + goto check_empty; + +check_partial: + i = 
find_next_zero_bit_le(nm_i->full_nat_bits, nm_i->nat_blocks, i); + if (i >= nm_i->nat_blocks) { + disable_nat_bits(sbi, true); + up_read(&nm_i->nat_tree_lock); + return -EINVAL; + } + + nid = i * NAT_ENTRY_PER_BLOCK; + page = get_current_nat_page(sbi, nid); + scan_nat_page(sbi, page, nid); + f2fs_put_page(page, 1); + + if (nm_i->nid_cnt[FREE_NID_LIST] < MAX_FREE_NIDS) { + i++; + goto check_partial; + } +out: + up_read(&nm_i->nat_tree_lock); + return 0; +} + +static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -1821,6 +1974,29 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) if (!sync && !available_free_memory(sbi, FREE_NIDS)) return; + if (!mount) { + /* try to find free nids in free_nid_bitmap */ + scan_free_nid_bits(sbi); + + if (nm_i->nid_cnt[FREE_NID_LIST]) + return; + + /* try to find free nids with nat_bits */ + if (!scan_nat_bits(sbi) && nm_i->nid_cnt[FREE_NID_LIST]) + return; + } + + /* find next valid candidate */ + if (enabled_nat_bits(sbi, NULL)) { + int idx = find_next_zero_bit_le(nm_i->full_nat_bits, + nm_i->nat_blocks, 0); + + if (idx >= nm_i->nat_blocks) + set_sbi_flag(sbi, SBI_NEED_FSCK); + else + nid = idx * NAT_ENTRY_PER_BLOCK; + } + /* readahead nat pages to be scanned */ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); @@ -1863,10 +2039,10 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) nm_i->ra_nid_pages, META_NAT, false); } -void build_free_nids(struct f2fs_sb_info *sbi, bool sync) +void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) { mutex_lock(&NM_I(sbi)->build_lock); - __build_free_nids(sbi, sync); + __build_free_nids(sbi, sync, mount); mutex_unlock(&NM_I(sbi)->build_lock); } @@ -1881,8 +2057,10 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct free_nid *i = NULL; retry: #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_ALLOC_NID)) + if (time_to_inject(sbi, FAULT_ALLOC_NID)) { + f2fs_show_injection_info(FAULT_ALLOC_NID); return false; + } #endif spin_lock(&nm_i->nid_list_lock); @@ -1902,13 +2080,16 @@ retry: i->state = NID_ALLOC; __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); nm_i->available_nids--; + + update_free_nid_bitmap(sbi, *nid, false); + spin_unlock(&nm_i->nid_list_lock); return true; } spin_unlock(&nm_i->nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ - build_free_nids(sbi, true); + build_free_nids(sbi, true, false); goto retry; } @@ -1956,6 +2137,8 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) nm_i->available_nids++; + update_free_nid_bitmap(sbi, nid, true); + spin_unlock(&nm_i->nid_list_lock); if (need_free) @@ -2018,18 +2201,18 @@ update_inode: f2fs_put_page(ipage, 1); } -void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) +int recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; nid_t new_xnid = nid_of_node(page); struct node_info ni; + struct page *xpage; - /* 1: invalidate the previous xattr nid */ if (!prev_xnid) goto recover_xnid; - /* Deallocate node address */ + /* 1: invalidate the previous xattr nid */ get_node_info(sbi, prev_xnid, &ni); f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR); invalidate_blocks(sbi, ni.blk_addr); @@ -2037,19 +2220,27 @@ void recover_xattr_data(struct inode *inode, struct page *page, 
block_t blkaddr) set_node_addr(sbi, &ni, NULL_ADDR, false); recover_xnid: - /* 2: allocate new xattr nid */ + /* 2: update xattr nid in inode */ + remove_free_nid(sbi, new_xnid); + f2fs_i_xnid_write(inode, new_xnid); if (unlikely(!inc_valid_node_count(sbi, inode))) f2fs_bug_on(sbi, 1); + update_inode_page(inode); + + /* 3: update and set xattr node page dirty */ + xpage = grab_cache_page(NODE_MAPPING(sbi), new_xnid); + if (!xpage) + return -ENOMEM; + + memcpy(F2FS_NODE(xpage), F2FS_NODE(page), PAGE_SIZE); - remove_free_nid(sbi, new_xnid); get_node_info(sbi, new_xnid, &ni); ni.ino = inode->i_ino; set_node_addr(sbi, &ni, NEW_ADDR, false); - f2fs_i_xnid_write(inode, new_xnid); + set_page_dirty(xpage); + f2fs_put_page(xpage, 1); - /* 3: update xattr blkaddr */ - refresh_sit_entry(sbi, NEW_ADDR, blkaddr); - set_node_addr(sbi, &ni, blkaddr, false); + return 0; } int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) @@ -2152,7 +2343,7 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) ne = __lookup_nat_cache(nm_i, nid); if (!ne) { - ne = grab_nat_entry(nm_i, nid); + ne = grab_nat_entry(nm_i, nid, true); node_info_from_raw_nat(&ne->ni, &raw_ne); } @@ -2192,8 +2383,39 @@ add_out: list_add_tail(&nes->set_list, head); } +void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, + struct page *page) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK; + struct f2fs_nat_block *nat_blk = page_address(page); + int valid = 0; + int i; + + if (!enabled_nat_bits(sbi, NULL)) + return; + + for (i = 0; i < NAT_ENTRY_PER_BLOCK; i++) { + if (start_nid == 0 && i == 0) + valid++; + if (nat_blk->entries[i].block_addr) + valid++; + } + if (valid == 0) { + set_bit_le(nat_index, nm_i->empty_nat_bits); + clear_bit_le(nat_index, nm_i->full_nat_bits); + return; + } + + clear_bit_le(nat_index, nm_i->empty_nat_bits); + if (valid == NAT_ENTRY_PER_BLOCK) + set_bit_le(nat_index, nm_i->full_nat_bits); + else + clear_bit_le(nat_index, nm_i->full_nat_bits); +} + static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, - struct nat_entry_set *set) + struct nat_entry_set *set, struct cp_control *cpc) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_journal *journal = curseg->journal; @@ -2208,7 +2430,8 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, * #1, flush nat entries to journal in current hot data summary block. * #2, flush nat entries to nat page. */ - if (!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) + if (enabled_nat_bits(sbi, cpc) || + !__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) to_journal = false; if (to_journal) { @@ -2244,14 +2467,21 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, add_free_nid(sbi, nid, false); spin_lock(&NM_I(sbi)->nid_list_lock); NM_I(sbi)->available_nids++; + update_free_nid_bitmap(sbi, nid, true); + spin_unlock(&NM_I(sbi)->nid_list_lock); + } else { + spin_lock(&NM_I(sbi)->nid_list_lock); + update_free_nid_bitmap(sbi, nid, false); spin_unlock(&NM_I(sbi)->nid_list_lock); } } - if (to_journal) + if (to_journal) { up_write(&curseg->journal_rwsem); - else + } else { + __update_nat_bits(sbi, start_nid, page); f2fs_put_page(page, 1); + } f2fs_bug_on(sbi, set->entry_cnt); @@ -2262,7 +2492,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, /* * This function is called during the checkpointing process. 
*/ -void flush_nat_entries(struct f2fs_sb_info *sbi) +void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -2283,7 +2513,8 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) * entries, remove all entries from journal and merge them * into nat entry set. */ - if (!__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL)) + if (enabled_nat_bits(sbi, cpc) || + !__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL)) remove_nats_in_journal(sbi); while ((found = __gang_lookup_nat_set(nm_i, @@ -2297,27 +2528,69 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) /* flush dirty nats in nat entry set */ list_for_each_entry_safe(set, tmp, &sets, set_list) - __flush_nat_entry_set(sbi, set); + __flush_nat_entry_set(sbi, set, cpc); up_write(&nm_i->nat_tree_lock); f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); } +static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_bits_bytes = nm_i->nat_blocks / BITS_PER_BYTE; + unsigned int i; + __u64 cp_ver = cur_cp_version(ckpt); + block_t nat_bits_addr; + + if (!enabled_nat_bits(sbi, NULL)) + return 0; + + nm_i->nat_bits_blocks = F2FS_BYTES_TO_BLK((nat_bits_bytes << 1) + 8 + + F2FS_BLKSIZE - 1); + nm_i->nat_bits = kzalloc(nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, + GFP_KERNEL); + if (!nm_i->nat_bits) + return -ENOMEM; + + nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg - + nm_i->nat_bits_blocks; + for (i = 0; i < nm_i->nat_bits_blocks; i++) { + struct page *page = get_meta_page(sbi, nat_bits_addr++); + + memcpy(nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS), + page_address(page), F2FS_BLKSIZE); + f2fs_put_page(page, 1); + } + + cp_ver |= (cur_cp_crc(ckpt) << 32); + if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) { + disable_nat_bits(sbi, true); + return 0; + } + + nm_i->full_nat_bits = nm_i->nat_bits + 8; + nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes; + + f2fs_msg(sbi->sb, KERN_NOTICE, "Found nat_bits in checkpoint"); + return 0; +} + static int init_node_manager(struct f2fs_sb_info *sbi) { struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi); struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned char *version_bitmap; - unsigned int nat_segs, nat_blocks; + unsigned int nat_segs; + int err; nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr); /* segment_count_nat includes pair segment so divide to 2. 
*/ nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1; - nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); - - nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; + nm_i->nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); + nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nm_i->nat_blocks; /* not used nids: 0, node, meta, (and root counted as valid node) */ nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count - @@ -2350,6 +2623,34 @@ static int init_node_manager(struct f2fs_sb_info *sbi) GFP_KERNEL); if (!nm_i->nat_bitmap) return -ENOMEM; + + err = __get_nat_bitmaps(sbi); + if (err) + return err; + +#ifdef CONFIG_F2FS_CHECK_FS + nm_i->nat_bitmap_mir = kmemdup(version_bitmap, nm_i->bitmap_size, + GFP_KERNEL); + if (!nm_i->nat_bitmap_mir) + return -ENOMEM; +#endif + + return 0; +} + +int init_free_nid_cache(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + nm_i->free_nid_bitmap = f2fs_kvzalloc(nm_i->nat_blocks * + NAT_ENTRY_BITMAP_SIZE, GFP_KERNEL); + if (!nm_i->free_nid_bitmap) + return -ENOMEM; + + nm_i->nat_block_bitmap = f2fs_kvzalloc(nm_i->nat_blocks / 8, + GFP_KERNEL); + if (!nm_i->nat_block_bitmap) + return -ENOMEM; return 0; } @@ -2365,7 +2666,11 @@ int build_node_manager(struct f2fs_sb_info *sbi) if (err) return err; - build_free_nids(sbi, true); + err = init_free_nid_cache(sbi); + if (err) + return err; + + build_free_nids(sbi, true, true); return 0; } @@ -2423,7 +2728,14 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) } up_write(&nm_i->nat_tree_lock); + kvfree(nm_i->nat_block_bitmap); + kvfree(nm_i->free_nid_bitmap); + kfree(nm_i->nat_bitmap); + kfree(nm_i->nat_bits); +#ifdef CONFIG_F2FS_CHECK_FS + kfree(nm_i->nat_bitmap_mir); +#endif sbi->nm_info = NULL; kfree(nm_i); } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index e7997e2..2f9603f 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -174,7 +174,7 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) spin_unlock(&nm_i->nid_list_lock); return; } - fnid = list_entry(nm_i->nid_list[FREE_NID_LIST].next, + fnid = list_first_entry(&nm_i->nid_list[FREE_NID_LIST], struct free_nid, list); *nid = fnid->nid; spin_unlock(&nm_i->nid_list_lock); @@ -186,6 +186,12 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) static inline void get_nat_bitmap(struct f2fs_sb_info *sbi, void *addr) { struct f2fs_nm_info *nm_i = NM_I(sbi); + +#ifdef CONFIG_F2FS_CHECK_FS + if (memcmp(nm_i->nat_bitmap, nm_i->nat_bitmap_mir, + nm_i->bitmap_size)) + f2fs_bug_on(sbi, 1); +#endif memcpy(addr, nm_i->nat_bitmap, nm_i->bitmap_size); } @@ -228,6 +234,9 @@ static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid) unsigned int block_off = NAT_BLOCK_OFFSET(start_nid); f2fs_change_bit(block_off, nm_i->nat_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_change_bit(block_off, nm_i->nat_bitmap_mir); +#endif } static inline nid_t ino_of_node(struct page *node_page) @@ -291,14 +300,11 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); struct f2fs_node *rn = F2FS_NODE(page); - size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); - __u64 cp_ver = le64_to_cpu(ckpt->checkpoint_ver); + __u64 cp_ver = cur_cp_version(ckpt); + + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) + cp_ver |= (cur_cp_crc(ckpt) << 32); - if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) { - __u64 crc = le32_to_cpu(*((__le32 *) - ((unsigned char *)ckpt + crc_offset))); - cp_ver 
|= (crc << 32); - } rn->footer.cp_ver = cpu_to_le64(cp_ver); rn->footer.next_blkaddr = cpu_to_le32(blkaddr); } @@ -306,14 +312,11 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) static inline bool is_recoverable_dnode(struct page *page) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); - size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); __u64 cp_ver = cur_cp_version(ckpt); - if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) { - __u64 crc = le32_to_cpu(*((__le32 *) - ((unsigned char *)ckpt + crc_offset))); - cp_ver |= (crc << 32); - } + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) + cp_ver |= (cur_cp_crc(ckpt) << 32); + return cp_ver == cpver_of_node(page); } @@ -343,7 +346,7 @@ static inline bool IS_DNODE(struct page *node_page) unsigned int ofs = ofs_of_node(node_page); if (f2fs_has_xattr_block(ofs)) - return false; + return true; if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK || ofs == 5 + 2 * NIDS_PER_BLOCK) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 981a958..d025aa8 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -378,11 +378,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (IS_INODE(page)) { recover_inline_xattr(inode, page); } else if (f2fs_has_xattr_block(ofs_of_node(page))) { - /* - * Deprecated; xattr blocks should be found from cold log. - * But, we should remain this for backward compatibility. - */ - recover_xattr_data(inode, page, blkaddr); + err = recover_xattr_data(inode, page, blkaddr); + if (!err) + recovered++; goto out; } @@ -428,8 +426,9 @@ retry_dn: } if (!file_keep_isize(inode) && - (i_size_read(inode) <= (start << PAGE_SHIFT))) - f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT); + (i_size_read(inode) <= ((loff_t)start << PAGE_SHIFT))) + f2fs_i_size_write(inode, + (loff_t)(start + 1) << PAGE_SHIFT); /* * dest is reserved block, invalidate src block @@ -552,10 +551,8 @@ next: int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) { - struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); struct list_head inode_list; struct list_head dir_list; - block_t blkaddr; int err; int ret = 0; bool need_writecp = false; @@ -571,8 +568,6 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) /* prevent checkpoint */ mutex_lock(&sbi->cp_mutex); - blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); - /* step #1: find fsynced inode numbers */ err = find_fsync_dnodes(sbi, &inode_list); if (err || list_empty(&inode_list)) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0d88024..4bd7a8b 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -26,7 +26,7 @@ #define __reverse_ffz(x) __reverse_ffs(~(x)) static struct kmem_cache *discard_entry_slab; -static struct kmem_cache *bio_entry_slab; +static struct kmem_cache *discard_cmd_slab; static struct kmem_cache *sit_entry_set_slab; static struct kmem_cache *inmem_entry_slab; @@ -242,11 +242,12 @@ void drop_inmem_pages(struct inode *inode) { struct f2fs_inode_info *fi = F2FS_I(inode); - clear_inode_flag(inode, FI_ATOMIC_FILE); - mutex_lock(&fi->inmem_lock); __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); mutex_unlock(&fi->inmem_lock); + + clear_inode_flag(inode, FI_ATOMIC_FILE); + stat_dec_atomic_write(inode); } static int __commit_inmem_pages(struct inode *inode, @@ -262,7 +263,7 @@ static int __commit_inmem_pages(struct inode *inode, .op_flags = REQ_SYNC | REQ_PRIO, .encrypted_page = NULL, }; - bool submit_bio = false; + pgoff_t last_idx = ULONG_MAX; int err = 0; 
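The __commit_inmem_pages() rework in this hunk drops the old submit_bio flag and instead remembers the index of the last page it actually dispatched; the trailing f2fs_submit_merged_bio_cond() call is then made only when something was written, keyed to that last index. Below is a minimal, self-contained sketch of the same bookkeeping pattern; write_one() and flush_range() are hypothetical callbacks standing in for the real f2fs write and merged-bio paths, not helpers introduced by this patch.

#include <limits.h>

/* Sketch only: mirrors the last_idx bookkeeping used above. */
static int commit_pages_sketch(const int *dirty, unsigned long nr,
			       int (*write_one)(unsigned long idx),
			       void (*flush_range)(unsigned long last_idx))
{
	unsigned long idx, last_idx = ULONG_MAX;
	int err = 0;

	for (idx = 0; idx < nr; idx++) {
		if (!dirty[idx])
			continue;
		err = write_one(idx);
		if (err)
			break;
		last_idx = idx;		/* furthest page actually dispatched */
	}

	/* if nothing was written there is nothing to flush */
	if (last_idx != ULONG_MAX)
		flush_range(last_idx);

	return err;
}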
list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { @@ -288,15 +289,15 @@ static int __commit_inmem_pages(struct inode *inode, /* record old blkaddr for revoking */ cur->old_addr = fio.old_blkaddr; - - submit_bio = true; + last_idx = page->index; } unlock_page(page); list_move_tail(&cur->list, revoke_list); } - if (submit_bio) - f2fs_submit_merged_bio_cond(sbi, inode, NULL, 0, DATA, WRITE); + if (last_idx != ULONG_MAX) + f2fs_submit_merged_bio_cond(sbi, inode, 0, last_idx, + DATA, WRITE); if (!err) __revoke_inmem_pages(inode, revoke_list, false, false); @@ -315,6 +316,8 @@ int commit_inmem_pages(struct inode *inode) f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); + set_inode_flag(inode, FI_ATOMIC_COMMIT); + mutex_lock(&fi->inmem_lock); err = __commit_inmem_pages(inode, &revoke_list); if (err) { @@ -336,6 +339,8 @@ int commit_inmem_pages(struct inode *inode) } mutex_unlock(&fi->inmem_lock); + clear_inode_flag(inode, FI_ATOMIC_COMMIT); + f2fs_unlock_op(sbi); return err; } @@ -347,8 +352,10 @@ int commit_inmem_pages(struct inode *inode) void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_CHECKPOINT)) + if (time_to_inject(sbi, FAULT_CHECKPOINT)) { + f2fs_show_injection_info(FAULT_CHECKPOINT); f2fs_stop_checkpoint(sbi, false); + } #endif if (!need) @@ -381,7 +388,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) if (!available_free_memory(sbi, FREE_NIDS)) try_to_free_nids(sbi, MAX_FREE_NIDS); else - build_free_nids(sbi, false); + build_free_nids(sbi, false, false); if (!is_idle(sbi)) return; @@ -423,6 +430,9 @@ static int submit_flush_wait(struct f2fs_sb_info *sbi) if (sbi->s_ndevs && !ret) { for (i = 1; i < sbi->s_ndevs; i++) { + trace_f2fs_issue_flush(FDEV(i).bdev, + test_opt(sbi, NOBARRIER), + test_opt(sbi, FLUSH_MERGE)); ret = __submit_flush_wait(FDEV(i).bdev); if (ret) break; @@ -434,7 +444,7 @@ static int submit_flush_wait(struct f2fs_sb_info *sbi) static int issue_flush_thread(void *data) { struct f2fs_sb_info *sbi = data; - struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; + struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; wait_queue_head_t *q = &fcc->flush_wait_queue; repeat: if (kthread_should_stop()) @@ -463,16 +473,16 @@ repeat: int f2fs_issue_flush(struct f2fs_sb_info *sbi) { - struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; + struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; struct flush_cmd cmd; - trace_f2fs_issue_flush(sbi->sb, test_opt(sbi, NOBARRIER), - test_opt(sbi, FLUSH_MERGE)); - if (test_opt(sbi, NOBARRIER)) return 0; - if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) { + if (!test_opt(sbi, FLUSH_MERGE)) + return submit_flush_wait(sbi); + + if (!atomic_read(&fcc->submit_flush)) { int ret; atomic_inc(&fcc->submit_flush); @@ -506,8 +516,8 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) struct flush_cmd_control *fcc; int err = 0; - if (SM_I(sbi)->cmd_control_info) { - fcc = SM_I(sbi)->cmd_control_info; + if (SM_I(sbi)->fcc_info) { + fcc = SM_I(sbi)->fcc_info; goto init_thread; } @@ -517,14 +527,14 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) atomic_set(&fcc->submit_flush, 0); init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); - SM_I(sbi)->cmd_control_info = fcc; + SM_I(sbi)->fcc_info = fcc; init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { err = PTR_ERR(fcc->f2fs_issue_flush); 
kfree(fcc); - SM_I(sbi)->cmd_control_info = NULL; + SM_I(sbi)->fcc_info = NULL; return err; } @@ -533,7 +543,7 @@ init_thread: void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) { - struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; + struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; if (fcc && fcc->f2fs_issue_flush) { struct task_struct *flush_thread = fcc->f2fs_issue_flush; @@ -543,7 +553,7 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) } if (free) { kfree(fcc); - SM_I(sbi)->cmd_control_info = NULL; + SM_I(sbi)->fcc_info = NULL; } } @@ -623,60 +633,144 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } -static struct bio_entry *__add_bio_entry(struct f2fs_sb_info *sbi, - struct bio *bio) +static void __add_discard_cmd(struct f2fs_sb_info *sbi, + struct bio *bio, block_t lstart, block_t len) { - struct list_head *wait_list = &(SM_I(sbi)->wait_list); - struct bio_entry *be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *cmd_list = &(dcc->discard_cmd_list); + struct discard_cmd *dc; - INIT_LIST_HEAD(&be->list); - be->bio = bio; - init_completion(&be->event); - list_add_tail(&be->list, wait_list); + dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS); + INIT_LIST_HEAD(&dc->list); + dc->bio = bio; + bio->bi_private = dc; + dc->lstart = lstart; + dc->len = len; + dc->state = D_PREP; + init_completion(&dc->wait); - return be; + mutex_lock(&dcc->cmd_lock); + list_add_tail(&dc->list, cmd_list); + mutex_unlock(&dcc->cmd_lock); } -void f2fs_wait_all_discard_bio(struct f2fs_sb_info *sbi) +static void __remove_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc) { - struct list_head *wait_list = &(SM_I(sbi)->wait_list); - struct bio_entry *be, *tmp; + int err = dc->bio->bi_error; - list_for_each_entry_safe(be, tmp, wait_list, list) { - struct bio *bio = be->bio; - int err; + if (dc->state == D_DONE) + atomic_dec(&(SM_I(sbi)->dcc_info->submit_discard)); - wait_for_completion_io(&be->event); - err = be->error; - if (err == -EOPNOTSUPP) - err = 0; + if (err == -EOPNOTSUPP) + err = 0; - if (err) - f2fs_msg(sbi->sb, KERN_INFO, + if (err) + f2fs_msg(sbi->sb, KERN_INFO, "Issue discard failed, ret: %d", err); + bio_put(dc->bio); + list_del(&dc->list); + kmem_cache_free(discard_cmd_slab, dc); +} + +/* This should be covered by global mutex, &sit_i->sentry_lock */ +void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *wait_list = &(dcc->discard_cmd_list); + struct discard_cmd *dc, *tmp; + struct blk_plug plug; + + mutex_lock(&dcc->cmd_lock); - bio_put(bio); - list_del(&be->list); - kmem_cache_free(bio_entry_slab, be); + blk_start_plug(&plug); + + list_for_each_entry_safe(dc, tmp, wait_list, list) { + + if (blkaddr == NULL_ADDR) { + if (dc->state == D_PREP) { + dc->state = D_SUBMIT; + submit_bio(dc->bio); + atomic_inc(&dcc->submit_discard); + } + continue; + } + + if (dc->lstart <= blkaddr && blkaddr < dc->lstart + dc->len) { + if (dc->state == D_SUBMIT) + wait_for_completion_io(&dc->wait); + else + __remove_discard_cmd(sbi, dc); + } + } + blk_finish_plug(&plug); + + /* this comes from f2fs_put_super */ + if (blkaddr == NULL_ADDR) { + list_for_each_entry_safe(dc, tmp, wait_list, list) { + wait_for_completion_io(&dc->wait); + __remove_discard_cmd(sbi, dc); + } } + mutex_unlock(&dcc->cmd_lock); +} + 
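The discard path above replaces the old bio_entry wait list with discard_cmd objects managed by a dedicated discard_cmd_control: each command is queued in D_PREP, moves to D_SUBMIT when its bio is issued (by the issue thread or by f2fs_wait_discard_bio()), and is marked D_DONE from the bio completion so it can later be reaped. The following compact sketch shows that three-state lifecycle; issue_cmd(), complete_cmd(), cmd_covers() and can_reap_cmd() are illustrative stand-ins, not helpers defined by this patch.

/* Sketch of the discard_cmd lifecycle: D_PREP -> D_SUBMIT -> D_DONE. */
enum dc_state { D_PREP, D_SUBMIT, D_DONE };

struct sketch_cmd {
	enum dc_state state;
	unsigned long lstart;	/* first logical block covered */
	unsigned long len;	/* number of blocks covered */
};

/* issue a prepared command; in the patch this is where submit_bio() runs */
static void issue_cmd(struct sketch_cmd *dc)
{
	if (dc->state == D_PREP)
		dc->state = D_SUBMIT;
}

/* completion callback flips the command to D_DONE (cf. f2fs_submit_discard_endio) */
static void complete_cmd(struct sketch_cmd *dc)
{
	dc->state = D_DONE;
}

/* range test used when a caller waits on a specific block address */
static int cmd_covers(const struct sketch_cmd *dc, unsigned long blkaddr)
{
	return dc->lstart <= blkaddr && blkaddr < dc->lstart + dc->len;
}

/* a command can be reclaimed once it was never issued or has completed */
static int can_reap_cmd(const struct sketch_cmd *dc)
{
	return dc->state == D_PREP || dc->state == D_DONE;
}

Keeping the state on each command, rather than a single per-filesystem flag, is what lets the issue thread batch D_PREP entries under a block plug while writers wait only on the commands that overlap the block they are about to reuse.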
+static void f2fs_submit_discard_endio(struct bio *bio) +{ + struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; + + complete(&dc->wait); + dc->state = D_DONE; } -static void f2fs_submit_bio_wait_endio(struct bio *bio) +static int issue_discard_thread(void *data) { - struct bio_entry *be = (struct bio_entry *)bio->bi_private; + struct f2fs_sb_info *sbi = data; + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + wait_queue_head_t *q = &dcc->discard_wait_queue; + struct list_head *cmd_list = &dcc->discard_cmd_list; + struct discard_cmd *dc, *tmp; + struct blk_plug plug; + int iter = 0; +repeat: + if (kthread_should_stop()) + return 0; + + blk_start_plug(&plug); + + mutex_lock(&dcc->cmd_lock); + list_for_each_entry_safe(dc, tmp, cmd_list, list) { + if (dc->state == D_PREP) { + dc->state = D_SUBMIT; + submit_bio(dc->bio); + atomic_inc(&dcc->submit_discard); + if (iter++ > DISCARD_ISSUE_RATE) + break; + } else if (dc->state == D_DONE) { + __remove_discard_cmd(sbi, dc); + } + } + mutex_unlock(&dcc->cmd_lock); + + blk_finish_plug(&plug); + + iter = 0; + congestion_wait(BLK_RW_SYNC, HZ/50); - be->error = bio->bi_error; - complete(&be->event); + wait_event_interruptible(*q, + kthread_should_stop() || !list_empty(&dcc->discard_cmd_list)); + goto repeat; } + /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { struct bio *bio = NULL; + block_t lblkstart = blkstart; int err; - trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); + trace_f2fs_issue_discard(bdev, blkstart, blklen); if (sbi->s_ndevs) { int devi = f2fs_target_device_index(sbi, blkstart); @@ -688,14 +782,12 @@ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, SECTOR_FROM_BLOCK(blklen), GFP_NOFS, 0, &bio); if (!err && bio) { - struct bio_entry *be = __add_bio_entry(sbi, bio); - - bio->bi_private = be; - bio->bi_end_io = f2fs_submit_bio_wait_endio; + bio->bi_end_io = f2fs_submit_discard_endio; bio->bi_opf |= REQ_SYNC; - submit_bio(bio); - } + __add_discard_cmd(sbi, bio, lblkstart, blklen); + wake_up(&SM_I(sbi)->dcc_info->discard_wait_queue); + } return err; } @@ -703,24 +795,13 @@ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { - sector_t nr_sects = SECTOR_FROM_BLOCK(blklen); - sector_t sector; + sector_t sector, nr_sects; int devi = 0; if (sbi->s_ndevs) { devi = f2fs_target_device_index(sbi, blkstart); blkstart -= FDEV(devi).start_blk; } - sector = SECTOR_FROM_BLOCK(blkstart); - - if (sector & (bdev_zone_sectors(bdev) - 1) || - nr_sects != bdev_zone_sectors(bdev)) { - f2fs_msg(sbi->sb, KERN_INFO, - "(%d) %s: Unaligned discard attempted (block %x + %x)", - devi, sbi->s_ndevs ? 
FDEV(devi).path: "", - blkstart, blklen); - return -EIO; - } /* * We need to know the type of the zone: for conventional zones, @@ -735,7 +816,18 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: - trace_f2fs_issue_reset_zone(sbi->sb, blkstart); + sector = SECTOR_FROM_BLOCK(blkstart); + nr_sects = SECTOR_FROM_BLOCK(blklen); + + if (sector & (bdev_zone_sectors(bdev) - 1) || + nr_sects != bdev_zone_sectors(bdev)) { + f2fs_msg(sbi->sb, KERN_INFO, + "(%d) %s: Unaligned discard attempted (block %x + %x)", + devi, sbi->s_ndevs ? FDEV(devi).path: "", + blkstart, blklen); + return -EIO; + } + trace_f2fs_issue_reset_zone(bdev, blkstart); return blkdev_reset_zones(bdev, sector, nr_sects, GFP_NOFS); default: @@ -800,13 +892,14 @@ static void __add_discard_entry(struct f2fs_sb_info *sbi, struct cp_control *cpc, struct seg_entry *se, unsigned int start, unsigned int end) { - struct list_head *head = &SM_I(sbi)->discard_list; + struct list_head *head = &SM_I(sbi)->dcc_info->discard_entry_list; struct discard_entry *new, *last; if (!list_empty(head)) { last = list_last_entry(head, struct discard_entry, list); if (START_BLOCK(sbi, cpc->trim_start) + start == - last->blkaddr + last->len) { + last->blkaddr + last->len && + last->len < MAX_DISCARD_BLOCKS(sbi)) { last->len += end - start; goto done; } @@ -818,10 +911,11 @@ static void __add_discard_entry(struct f2fs_sb_info *sbi, new->len = end - start; list_add_tail(&new->list, head); done: - SM_I(sbi)->nr_discards += end - start; + SM_I(sbi)->dcc_info->nr_discards += end - start; } -static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) +static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, + bool check_only) { int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); int max_blocks = sbi->blocks_per_seg; @@ -835,12 +929,13 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) int i; if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi)) - return; + return false; if (!force) { if (!test_opt(sbi, DISCARD) || !se->valid_blocks || - SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards) - return; + SM_I(sbi)->dcc_info->nr_discards >= + SM_I(sbi)->dcc_info->max_discards) + return false; } /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */ @@ -848,7 +943,8 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) dmap[i] = force ? 
~ckpt_map[i] & ~discard_map[i] : (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i]; - while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) { + while (force || SM_I(sbi)->dcc_info->nr_discards <= + SM_I(sbi)->dcc_info->max_discards) { start = __find_rev_next_bit(dmap, max_blocks, end + 1); if (start >= max_blocks) break; @@ -858,13 +954,17 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) && (end - start) < cpc->trim_minlen) continue; + if (check_only) + return true; + __add_discard_entry(sbi, cpc, se, start, end); } + return false; } void release_discard_addrs(struct f2fs_sb_info *sbi) { - struct list_head *head = &(SM_I(sbi)->discard_list); + struct list_head *head = &(SM_I(sbi)->dcc_info->discard_entry_list); struct discard_entry *entry, *this; /* drop caches */ @@ -890,17 +990,14 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) { - struct list_head *head = &(SM_I(sbi)->discard_list); + struct list_head *head = &(SM_I(sbi)->dcc_info->discard_entry_list); struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - struct blk_plug plug; unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int start = 0, end = -1; unsigned int secno, start_segno; bool force = (cpc->reason == CP_DISCARD); - blk_start_plug(&plug); - mutex_lock(&dirty_i->seglist_lock); while (1) { @@ -916,9 +1013,13 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) dirty_i->nr_dirty[PRE] -= end - start; - if (force || !test_opt(sbi, DISCARD)) + if (!test_opt(sbi, DISCARD)) continue; + if (force && start >= cpc->trim_start && + (end - 1) <= cpc->trim_end) + continue; + if (!test_opt(sbi, LFS) || sbi->segs_per_sec == 1) { f2fs_issue_discard(sbi, START_BLOCK(sbi, start), (end - start) << sbi->log_blocks_per_seg); @@ -935,6 +1036,8 @@ next: start = start_segno + sbi->segs_per_sec; if (start < end) goto next; + else + end = start - 1; } mutex_unlock(&dirty_i->seglist_lock); @@ -946,11 +1049,62 @@ next: cpc->trimmed += entry->len; skip: list_del(&entry->list); - SM_I(sbi)->nr_discards -= entry->len; + SM_I(sbi)->dcc_info->nr_discards -= entry->len; kmem_cache_free(discard_entry_slab, entry); } +} - blk_finish_plug(&plug); +static int create_discard_cmd_control(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + struct discard_cmd_control *dcc; + int err = 0; + + if (SM_I(sbi)->dcc_info) { + dcc = SM_I(sbi)->dcc_info; + goto init_thread; + } + + dcc = kzalloc(sizeof(struct discard_cmd_control), GFP_KERNEL); + if (!dcc) + return -ENOMEM; + + INIT_LIST_HEAD(&dcc->discard_entry_list); + INIT_LIST_HEAD(&dcc->discard_cmd_list); + mutex_init(&dcc->cmd_lock); + atomic_set(&dcc->submit_discard, 0); + dcc->nr_discards = 0; + dcc->max_discards = 0; + + init_waitqueue_head(&dcc->discard_wait_queue); + SM_I(sbi)->dcc_info = dcc; +init_thread: + dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi, + "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(dcc->f2fs_issue_discard)) { + err = PTR_ERR(dcc->f2fs_issue_discard); + kfree(dcc); + SM_I(sbi)->dcc_info = NULL; + return err; + } + + return err; +} + +static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi, bool free) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + if (dcc && dcc->f2fs_issue_discard) { + struct task_struct *discard_thread = dcc->f2fs_issue_discard; + + dcc->f2fs_issue_discard = NULL; + 
kthread_stop(discard_thread); + } + if (free) { + kfree(dcc); + SM_I(sbi)->dcc_info = NULL; + } } static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) @@ -995,14 +1149,32 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) /* Update valid block bitmap */ if (del > 0) { - if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) + if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) { +#ifdef CONFIG_F2FS_CHECK_FS + if (f2fs_test_and_set_bit(offset, + se->cur_valid_map_mir)) + f2fs_bug_on(sbi, 1); + else + WARN_ON(1); +#else f2fs_bug_on(sbi, 1); +#endif + } if (f2fs_discard_en(sbi) && !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; } else { - if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) + if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) { +#ifdef CONFIG_F2FS_CHECK_FS + if (!f2fs_test_and_clear_bit(offset, + se->cur_valid_map_mir)) + f2fs_bug_on(sbi, 1); + else + WARN_ON(1); +#else f2fs_bug_on(sbi, 1); +#endif + } if (f2fs_discard_en(sbi) && f2fs_test_and_clear_bit(offset, se->discard_map)) sbi->discard_blks++; @@ -1167,17 +1339,6 @@ static void write_current_sum_page(struct f2fs_sb_info *sbi, f2fs_put_page(page, 1); } -static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) -{ - struct curseg_info *curseg = CURSEG_I(sbi, type); - unsigned int segno = curseg->segno + 1; - struct free_segmap_info *free_i = FREE_I(sbi); - - if (segno < MAIN_SEGS(sbi) && segno % sbi->segs_per_sec) - return !test_bit(segno, free_i->free_segmap); - return 0; -} - /* * Find a new segment from the free segments bitmap to right order * This function should be returned with success, otherwise BUG @@ -1382,16 +1543,39 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; + int i, cnt; + bool reversed = false; + + /* need_SSR() already forces to do this */ + if (v_ops->get_victim(sbi, &(curseg)->next_segno, BG_GC, type, SSR)) + return 1; - if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0, 0)) - return v_ops->get_victim(sbi, - &(curseg)->next_segno, BG_GC, type, SSR); + /* For node segments, let's do SSR more intensively */ + if (IS_NODESEG(type)) { + if (type >= CURSEG_WARM_NODE) { + reversed = true; + i = CURSEG_COLD_NODE; + } else { + i = CURSEG_HOT_NODE; + } + cnt = NR_CURSEG_NODE_TYPE; + } else { + if (type >= CURSEG_WARM_DATA) { + reversed = true; + i = CURSEG_COLD_DATA; + } else { + i = CURSEG_HOT_DATA; + } + cnt = NR_CURSEG_DATA_TYPE; + } - /* For data segments, let's do SSR more intensively */ - for (; type >= CURSEG_HOT_DATA; type--) + for (; cnt-- > 0; reversed ? 
i-- : i++) { + if (i == type) + continue; if (v_ops->get_victim(sbi, &(curseg)->next_segno, - BG_GC, type, SSR)) + BG_GC, i, SSR)) return 1; + } return 0; } @@ -1402,20 +1586,17 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) static void allocate_segment_by_default(struct f2fs_sb_info *sbi, int type, bool force) { - struct curseg_info *curseg = CURSEG_I(sbi, type); - if (force) new_curseg(sbi, type, true); - else if (type == CURSEG_WARM_NODE) - new_curseg(sbi, type, false); - else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) + else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && + type == CURSEG_WARM_NODE) new_curseg(sbi, type, false); else if (need_SSR(sbi) && get_ssr_segment(sbi, type)) change_curseg(sbi, type, true); else new_curseg(sbi, type, false); - stat_inc_seg_type(sbi, curseg); + stat_inc_seg_type(sbi, CURSEG_I(sbi, type)); } void allocate_new_segments(struct f2fs_sb_info *sbi) @@ -1424,9 +1605,6 @@ void allocate_new_segments(struct f2fs_sb_info *sbi) unsigned int old_segno; int i; - if (test_opt(sbi, LFS)) - return; - for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { curseg = CURSEG_I(sbi, i); old_segno = curseg->segno; @@ -1439,6 +1617,24 @@ static const struct segment_allocation default_salloc_ops = { .allocate_segment = allocate_segment_by_default, }; +bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ + __u64 trim_start = cpc->trim_start; + bool has_candidate = false; + + mutex_lock(&SIT_I(sbi)->sentry_lock); + for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) { + if (add_discard_addrs(sbi, cpc, true)) { + has_candidate = true; + break; + } + } + mutex_unlock(&SIT_I(sbi)->sentry_lock); + + cpc->trim_start = trim_start; + return has_candidate; +} + int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) { __u64 start = F2FS_BYTES_TO_BLK(range->start); @@ -1573,6 +1769,8 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + f2fs_wait_discard_bio(sbi, *new_blkaddr); + /* * __add_sum_entry should be resided under the curseg_mutex * because, this function updates a summary entry in the @@ -1584,14 +1782,15 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, stat_inc_block_count(sbi, curseg); - if (!__has_curseg_space(sbi, type)) - sit_i->s_ops->allocate_segment(sbi, type, false); /* * SIT information should be updated before segment allocation, * since SSR needs latest valid block information. 
*/ refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); + if (!__has_curseg_space(sbi, type)) + sit_i->s_ops->allocate_segment(sbi, type, false); + mutex_unlock(&sit_i->sentry_lock); if (page && IS_NODESEG(type)) @@ -1603,15 +1802,20 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { int type = __get_segment_type(fio->page, fio->type); + int err; if (fio->type == NODE || fio->type == DATA) mutex_lock(&fio->sbi->wio_mutex[fio->type]); - +reallocate: allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, &fio->new_blkaddr, sum, type); /* writeout dirty page into bdev */ - f2fs_submit_page_mbio(fio); + err = f2fs_submit_page_mbio(fio); + if (err == -EAGAIN) { + fio->old_blkaddr = fio->new_blkaddr; + goto reallocate; + } if (fio->type == NODE || fio->type == DATA) mutex_unlock(&fio->sbi->wio_mutex[fio->type]); @@ -1753,7 +1957,8 @@ void f2fs_wait_on_page_writeback(struct page *page, if (PageWriteback(page)) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); - f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, type, WRITE); + f2fs_submit_merged_bio_cond(sbi, page->mapping->host, + 0, page->index, type, WRITE); if (ordered) wait_on_page_writeback(page); else @@ -2228,7 +2433,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* add discard candidates */ if (cpc->reason != CP_DISCARD) { cpc->trim_start = segno; - add_discard_addrs(sbi, cpc); + add_discard_addrs(sbi, cpc, false); } if (to_journal) { @@ -2263,8 +2468,12 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_bug_on(sbi, sit_i->dirty_sentries); out: if (cpc->reason == CP_DISCARD) { + __u64 trim_start = cpc->trim_start; + for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) - add_discard_addrs(sbi, cpc); + add_discard_addrs(sbi, cpc, false); + + cpc->trim_start = trim_start; } mutex_unlock(&sit_i->sentry_lock); @@ -2276,7 +2485,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct sit_info *sit_i; unsigned int sit_segs, start; - char *src_bitmap, *dst_bitmap; + char *src_bitmap; unsigned int bitmap_size; /* allocate memory for SIT information */ @@ -2305,6 +2514,13 @@ static int build_sit_info(struct f2fs_sb_info *sbi) !sit_i->sentries[start].ckpt_valid_map) return -ENOMEM; +#ifdef CONFIG_F2FS_CHECK_FS + sit_i->sentries[start].cur_valid_map_mir + = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + if (!sit_i->sentries[start].cur_valid_map_mir) + return -ENOMEM; +#endif + if (f2fs_discard_en(sbi)) { sit_i->sentries[start].discard_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); @@ -2331,17 +2547,22 @@ static int build_sit_info(struct f2fs_sb_info *sbi) bitmap_size = __bitmap_size(sbi, SIT_BITMAP); src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP); - dst_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); - if (!dst_bitmap) + sit_i->sit_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); + if (!sit_i->sit_bitmap) return -ENOMEM; +#ifdef CONFIG_F2FS_CHECK_FS + sit_i->sit_bitmap_mir = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); + if (!sit_i->sit_bitmap_mir) + return -ENOMEM; +#endif + /* init SIT information */ sit_i->s_ops = &default_salloc_ops; sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg; sit_i->written_valid_blocks = 0; - sit_i->sit_bitmap = dst_bitmap; sit_i->bitmap_size = bitmap_size; sit_i->dirty_sentries = 0; sit_i->sents_per_block = 
SIT_ENTRY_PER_BLOCK; @@ -2626,11 +2847,6 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; - INIT_LIST_HEAD(&sm_info->discard_list); - INIT_LIST_HEAD(&sm_info->wait_list); - sm_info->nr_discards = 0; - sm_info->max_discards = 0; - sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS; INIT_LIST_HEAD(&sm_info->sit_entry_set); @@ -2641,6 +2857,10 @@ int build_segment_manager(struct f2fs_sb_info *sbi) return err; } + err = create_discard_cmd_control(sbi); + if (err) + return err; + err = build_sit_info(sbi); if (err) return err; @@ -2734,6 +2954,9 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) if (sit_i->sentries) { for (start = 0; start < MAIN_SEGS(sbi); start++) { kfree(sit_i->sentries[start].cur_valid_map); +#ifdef CONFIG_F2FS_CHECK_FS + kfree(sit_i->sentries[start].cur_valid_map_mir); +#endif kfree(sit_i->sentries[start].ckpt_valid_map); kfree(sit_i->sentries[start].discard_map); } @@ -2746,6 +2969,9 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) SM_I(sbi)->sit_info = NULL; kfree(sit_i->sit_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + kfree(sit_i->sit_bitmap_mir); +#endif kfree(sit_i); } @@ -2756,6 +2982,7 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) if (!sm_info) return; destroy_flush_cmd_control(sbi, true); + destroy_discard_cmd_control(sbi, true); destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); @@ -2771,15 +2998,15 @@ int __init create_segment_manager_caches(void) if (!discard_entry_slab) goto fail; - bio_entry_slab = f2fs_kmem_cache_create("bio_entry", - sizeof(struct bio_entry)); - if (!bio_entry_slab) + discard_cmd_slab = f2fs_kmem_cache_create("discard_cmd", + sizeof(struct discard_cmd)); + if (!discard_cmd_slab) goto destroy_discard_entry; sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set", sizeof(struct sit_entry_set)); if (!sit_entry_set_slab) - goto destroy_bio_entry; + goto destroy_discard_cmd; inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry", sizeof(struct inmem_pages)); @@ -2789,8 +3016,8 @@ int __init create_segment_manager_caches(void) destroy_sit_entry_set: kmem_cache_destroy(sit_entry_set_slab); -destroy_bio_entry: - kmem_cache_destroy(bio_entry_slab); +destroy_discard_cmd: + kmem_cache_destroy(discard_cmd_slab); destroy_discard_entry: kmem_cache_destroy(discard_entry_slab); fail: @@ -2800,7 +3027,7 @@ fail: void destroy_segment_manager_caches(void) { kmem_cache_destroy(sit_entry_set_slab); - kmem_cache_destroy(bio_entry_slab); + kmem_cache_destroy(discard_cmd_slab); kmem_cache_destroy(discard_entry_slab); kmem_cache_destroy(inmem_entry_slab); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 9d44ce8..5e8ad42 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -164,6 +164,9 @@ struct seg_entry { unsigned int ckpt_valid_blocks:10; /* # of valid blocks last cp */ unsigned int padding:6; /* padding */ unsigned char *cur_valid_map; /* validity bitmap of blocks */ +#ifdef CONFIG_F2FS_CHECK_FS + unsigned char *cur_valid_map_mir; /* mirror of current valid bitmap */ +#endif /* * # of valid blocks and the validity bitmap stored in the the last * checkpoint pack. This information is used by the SSR mode. @@ -186,9 +189,12 @@ struct segment_allocation { * the page is atomically written, and it is in inmem_pages list. 
*/ #define ATOMIC_WRITTEN_PAGE ((unsigned long)-1) +#define DUMMY_WRITTEN_PAGE ((unsigned long)-2) #define IS_ATOMIC_WRITTEN_PAGE(page) \ (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE) +#define IS_DUMMY_WRITTEN_PAGE(page) \ + (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) struct inmem_pages { struct list_head list; @@ -203,6 +209,9 @@ struct sit_info { block_t sit_blocks; /* # of blocks used by SIT area */ block_t written_valid_blocks; /* # of valid blocks in main area */ char *sit_bitmap; /* SIT bitmap pointer */ +#ifdef CONFIG_F2FS_CHECK_FS + char *sit_bitmap_mir; /* SIT bitmap mirror */ +#endif unsigned int bitmap_size; /* SIT bitmap size */ unsigned long *tmp_map; /* bitmap for temporal use */ @@ -317,6 +326,9 @@ static inline void seg_info_from_raw_sit(struct seg_entry *se, se->ckpt_valid_blocks = GET_SIT_VBLOCKS(rs); memcpy(se->cur_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); +#ifdef CONFIG_F2FS_CHECK_FS + memcpy(se->cur_valid_map_mir, rs->valid_map, SIT_VBLOCK_MAP_SIZE); +#endif se->type = GET_SIT_TYPE(rs); se->mtime = le64_to_cpu(rs->mtime); } @@ -414,6 +426,12 @@ static inline void get_sit_bitmap(struct f2fs_sb_info *sbi, void *dst_addr) { struct sit_info *sit_i = SIT_I(sbi); + +#ifdef CONFIG_F2FS_CHECK_FS + if (memcmp(sit_i->sit_bitmap, sit_i->sit_bitmap_mir, + sit_i->bitmap_size)) + f2fs_bug_on(sbi, 1); +#endif memcpy(dst_addr, sit_i->sit_bitmap, sit_i->bitmap_size); } @@ -634,6 +652,12 @@ static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi, check_seg_range(sbi, start); +#ifdef CONFIG_F2FS_CHECK_FS + if (f2fs_test_bit(offset, sit_i->sit_bitmap) != + f2fs_test_bit(offset, sit_i->sit_bitmap_mir)) + f2fs_bug_on(sbi, 1); +#endif + /* calculate sit block address */ if (f2fs_test_bit(offset, sit_i->sit_bitmap)) blk_addr += sit_i->sit_blocks; @@ -659,6 +683,9 @@ static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start) unsigned int block_off = SIT_BLOCK_OFFSET(start); f2fs_change_bit(block_off, sit_i->sit_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_change_bit(block_off, sit_i->sit_bitmap_mir); +#endif } static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi) @@ -689,6 +716,15 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) - (base + 1) + type; } +static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi, + unsigned int secno) +{ + if (get_valid_blocks(sbi, secno, sbi->segs_per_sec) >= + sbi->fggc_threshold) + return true; + return false; +} + static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) { if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno)) @@ -700,8 +736,8 @@ static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) * It is very important to gather dirty pages and write at once, so that we can * submit a big bio without interfering other data writes. * By default, 512 pages for directory data, - * 512 pages (2MB) * 3 for three types of nodes, and - * max_bio_blocks for meta are set. + * 512 pages (2MB) * 8 for nodes, and + * 256 pages * 8 for meta are set. 
*/ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) { diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a831303..96fe8ed 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -89,6 +89,7 @@ enum { Opt_active_logs, Opt_disable_ext_identify, Opt_inline_xattr, + Opt_noinline_xattr, Opt_inline_data, Opt_inline_dentry, Opt_noinline_dentry, @@ -101,6 +102,7 @@ enum { Opt_noinline_data, Opt_data_flush, Opt_mode, + Opt_io_size_bits, Opt_fault_injection, Opt_lazytime, Opt_nolazytime, @@ -121,6 +123,7 @@ static match_table_t f2fs_tokens = { {Opt_active_logs, "active_logs=%u"}, {Opt_disable_ext_identify, "disable_ext_identify"}, {Opt_inline_xattr, "inline_xattr"}, + {Opt_noinline_xattr, "noinline_xattr"}, {Opt_inline_data, "inline_data"}, {Opt_inline_dentry, "inline_dentry"}, {Opt_noinline_dentry, "noinline_dentry"}, @@ -133,6 +136,7 @@ static match_table_t f2fs_tokens = { {Opt_noinline_data, "noinline_data"}, {Opt_data_flush, "data_flush"}, {Opt_mode, "mode=%s"}, + {Opt_io_size_bits, "io_bits=%u"}, {Opt_fault_injection, "fault_injection=%u"}, {Opt_lazytime, "lazytime"}, {Opt_nolazytime, "nolazytime"}, @@ -143,6 +147,7 @@ static match_table_t f2fs_tokens = { enum { GC_THREAD, /* struct f2fs_gc_thread */ SM_INFO, /* struct f2fs_sm_info */ + DCC_INFO, /* struct discard_cmd_control */ NM_INFO, /* struct f2fs_nm_info */ F2FS_SBI, /* struct f2fs_sb_info */ #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -166,6 +171,8 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) return (unsigned char *)sbi->gc_thread; else if (struct_type == SM_INFO) return (unsigned char *)SM_I(sbi); + else if (struct_type == DCC_INFO) + return (unsigned char *)SM_I(sbi)->dcc_info; else if (struct_type == NM_INFO) return (unsigned char *)NM_I(sbi); else if (struct_type == F2FS_SBI) @@ -281,7 +288,7 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); @@ -439,6 +446,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_inline_xattr: set_opt(sbi, INLINE_XATTR); break; + case Opt_noinline_xattr: + clear_opt(sbi, INLINE_XATTR); + break; #else case Opt_user_xattr: f2fs_msg(sb, KERN_INFO, @@ -452,6 +462,10 @@ static int parse_options(struct super_block *sb, char *options) f2fs_msg(sb, KERN_INFO, "inline_xattr options not supported"); break; + case Opt_noinline_xattr: + f2fs_msg(sb, KERN_INFO, + "noinline_xattr options not supported"); + break; #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL case Opt_acl: @@ -535,11 +549,23 @@ static int parse_options(struct super_block *sb, char *options) } kfree(name); break; + case Opt_io_size_bits: + if (args->from && match_int(args, &arg)) + return -EINVAL; + if (arg > __ilog2_u32(BIO_MAX_PAGES)) { + f2fs_msg(sb, KERN_WARNING, + "Not support %d, larger than %d", + 1 << arg, BIO_MAX_PAGES); + return -EINVAL; + } + sbi->write_io_size_bits = arg; + break; case Opt_fault_injection: if (args->from && match_int(args, &arg)) return -EINVAL; #ifdef 
CONFIG_F2FS_FAULT_INJECTION f2fs_build_fault_attr(sbi, arg); + set_opt(sbi, FAULT_INJECTION); #else f2fs_msg(sb, KERN_INFO, "FAULT_INJECTION was not selected"); @@ -558,6 +584,13 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; } } + + if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) { + f2fs_msg(sb, KERN_ERR, + "Should set mode=lfs with %uKB-sized IO", + F2FS_IO_SIZE_KB(sbi)); + return -EINVAL; + } return 0; } @@ -591,6 +624,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) static int f2fs_drop_inode(struct inode *inode) { + int ret; /* * This is to avoid a deadlock condition like below. * writeback_single_inode(inode) @@ -623,10 +657,12 @@ static int f2fs_drop_inode(struct inode *inode) spin_lock(&inode->i_lock); atomic_dec(&inode->i_count); } + trace_f2fs_drop_inode(inode, 0); return 0; } - - return generic_drop_inode(inode); + ret = generic_drop_inode(inode); + trace_f2fs_drop_inode(inode, ret); + return ret; } int f2fs_inode_dirtied(struct inode *inode, bool sync) @@ -750,6 +786,9 @@ static void f2fs_put_super(struct super_block *sb) write_checkpoint(sbi, &cpc); } + /* be sure to wait for any on-going discard commands */ + f2fs_wait_discard_bio(sbi, NULL_ADDR); + /* write_checkpoint can update stat informaion */ f2fs_destroy_stats(sbi); @@ -782,7 +821,7 @@ static void f2fs_put_super(struct super_block *sb) kfree(sbi->raw_super); destroy_device_list(sbi); - + mempool_destroy(sbi->write_io_dummy); destroy_percpu_info(sbi); kfree(sbi); } @@ -882,6 +921,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",nouser_xattr"); if (test_opt(sbi, INLINE_XATTR)) seq_puts(seq, ",inline_xattr"); + else + seq_puts(seq, ",noinline_xattr"); #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL if (test_opt(sbi, POSIX_ACL)) @@ -918,6 +959,12 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) else if (test_opt(sbi, LFS)) seq_puts(seq, "lfs"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); + if (F2FS_IO_SIZE_BITS(sbi)) + seq_printf(seq, ",io_size=%uKB", F2FS_IO_SIZE_KB(sbi)); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (test_opt(sbi, FAULT_INJECTION)) + seq_puts(seq, ",fault_injection"); +#endif return 0; } @@ -995,6 +1042,7 @@ static void default_options(struct f2fs_sb_info *sbi) sbi->active_logs = NR_CURSEG_TYPE; set_opt(sbi, BG_GC); + set_opt(sbi, INLINE_XATTR); set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); set_opt(sbi, EXTENT_CACHE); @@ -1686,36 +1734,55 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) static int f2fs_scan_devices(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + unsigned int max_devices = MAX_DEVICES; int i; - for (i = 0; i < MAX_DEVICES; i++) { - if (!RDEV(i).path[0]) + /* Initialize single device information */ + if (!RDEV(0).path[0]) { + if (!bdev_is_zoned(sbi->sb->s_bdev)) return 0; + max_devices = 1; + } - if (i == 0) { - sbi->devs = kzalloc(sizeof(struct f2fs_dev_info) * - MAX_DEVICES, GFP_KERNEL); - if (!sbi->devs) - return -ENOMEM; - } + /* + * Initialize multiple devices information, or single + * zoned block device information. 
+ */ + sbi->devs = kcalloc(max_devices, sizeof(struct f2fs_dev_info), + GFP_KERNEL); + if (!sbi->devs) + return -ENOMEM; - memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); - FDEV(i).total_segments = le32_to_cpu(RDEV(i).total_segments); - if (i == 0) { - FDEV(i).start_blk = 0; - FDEV(i).end_blk = FDEV(i).start_blk + - (FDEV(i).total_segments << - sbi->log_blocks_per_seg) - 1 + - le32_to_cpu(raw_super->segment0_blkaddr); - } else { - FDEV(i).start_blk = FDEV(i - 1).end_blk + 1; - FDEV(i).end_blk = FDEV(i).start_blk + - (FDEV(i).total_segments << - sbi->log_blocks_per_seg) - 1; - } + for (i = 0; i < max_devices; i++) { - FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, + if (i > 0 && !RDEV(i).path[0]) + break; + + if (max_devices == 1) { + /* Single zoned block device mount */ + FDEV(0).bdev = + blkdev_get_by_dev(sbi->sb->s_bdev->bd_dev, + sbi->sb->s_mode, sbi->sb->s_type); + } else { + /* Multi-device mount */ + memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); + FDEV(i).total_segments = + le32_to_cpu(RDEV(i).total_segments); + if (i == 0) { + FDEV(i).start_blk = 0; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1 + + le32_to_cpu(raw_super->segment0_blkaddr); + } else { + FDEV(i).start_blk = FDEV(i - 1).end_blk + 1; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1; + } + FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, sbi->sb->s_mode, sbi->sb->s_type); + } if (IS_ERR(FDEV(i).bdev)) return PTR_ERR(FDEV(i).bdev); @@ -1735,6 +1802,8 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) "Failed to initialize F2FS blkzone information"); return -EINVAL; } + if (max_devices == 1) + break; f2fs_msg(sbi->sb, KERN_INFO, "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)", i, FDEV(i).path, @@ -1751,6 +1820,8 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) FDEV(i).total_segments, FDEV(i).start_blk, FDEV(i).end_blk); } + f2fs_msg(sbi->sb, KERN_INFO, + "IO Block Size: %8d KB", F2FS_IO_SIZE_KB(sbi)); return 0; } @@ -1868,12 +1939,19 @@ try_onemore: if (err) goto free_options; + if (F2FS_IO_SIZE(sbi) > 1) { + sbi->write_io_dummy = + mempool_create_page_pool(2 * (F2FS_IO_SIZE(sbi) - 1), 0); + if (!sbi->write_io_dummy) + goto free_options; + } + /* get an inode for meta space */ sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); if (IS_ERR(sbi->meta_inode)) { f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode"); err = PTR_ERR(sbi->meta_inode); - goto free_options; + goto free_io_dummy; } err = get_valid_checkpoint(sbi); @@ -2048,6 +2126,8 @@ skip_recovery: sbi->valid_super_block ? 
1 : 2, err); } + f2fs_msg(sbi->sb, KERN_NOTICE, "Mounted with checkpoint version = %llx", + cur_cp_version(F2FS_CKPT(sbi))); f2fs_update_time(sbi, CP_TIME); f2fs_update_time(sbi, REQ_TIME); return 0; @@ -2091,6 +2171,8 @@ free_devices: free_meta_inode: make_bad_inode(sbi->meta_inode); iput(sbi->meta_inode); +free_io_dummy: + mempool_destroy(sbi->write_io_dummy); free_options: destroy_percpu_info(sbi); kfree(options); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index c47ce2f..7298a44 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -217,6 +217,112 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index, return entry; } +static struct f2fs_xattr_entry *__find_inline_xattr(void *base_addr, + void **last_addr, int index, + size_t len, const char *name) +{ + struct f2fs_xattr_entry *entry; + unsigned int inline_size = F2FS_INLINE_XATTR_ADDRS << 2; + + list_for_each_xattr(entry, base_addr) { + if ((void *)entry + sizeof(__u32) > base_addr + inline_size || + (void *)XATTR_NEXT_ENTRY(entry) + sizeof(__u32) > + base_addr + inline_size) { + *last_addr = entry; + return NULL; + } + if (entry->e_name_index != index) + continue; + if (entry->e_name_len != len) + continue; + if (!memcmp(entry->e_name, name, len)) + break; + } + return entry; +} + +static int lookup_all_xattrs(struct inode *inode, struct page *ipage, + unsigned int index, unsigned int len, + const char *name, struct f2fs_xattr_entry **xe, + void **base_addr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + void *cur_addr, *txattr_addr, *last_addr = NULL; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + unsigned int size = xnid ? VALID_XATTR_BLOCK_SIZE : 0; + unsigned int inline_size = 0; + int err = 0; + + inline_size = inline_xattr_size(inode); + + if (!size && !inline_size) + return -ENODATA; + + txattr_addr = kzalloc(inline_size + size + sizeof(__u32), + GFP_F2FS_ZERO); + if (!txattr_addr) + return -ENOMEM; + + /* read from inline xattr */ + if (inline_size) { + struct page *page = NULL; + void *inline_addr; + + if (ipage) { + inline_addr = inline_xattr_addr(ipage); + } else { + page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out; + } + inline_addr = inline_xattr_addr(page); + } + memcpy(txattr_addr, inline_addr, inline_size); + f2fs_put_page(page, 1); + + *xe = __find_inline_xattr(txattr_addr, &last_addr, + index, len, name); + if (*xe) + goto check; + } + + /* read from xattr node block */ + if (xnid) { + struct page *xpage; + void *xattr_addr; + + /* The inode already has an extended attribute block. 
*/ + xpage = get_node_page(sbi, xnid); + if (IS_ERR(xpage)) { + err = PTR_ERR(xpage); + goto out; + } + + xattr_addr = page_address(xpage); + memcpy(txattr_addr + inline_size, xattr_addr, size); + f2fs_put_page(xpage, 1); + } + + if (last_addr) + cur_addr = XATTR_HDR(last_addr) - 1; + else + cur_addr = txattr_addr; + + *xe = __find_xattr(cur_addr, index, len, name); +check: + if (IS_XATTR_LAST_ENTRY(*xe)) { + err = -ENODATA; + goto out; + } + + *base_addr = txattr_addr; + return 0; +out: + kzfree(txattr_addr); + return err; +} + static int read_all_xattrs(struct inode *inode, struct page *ipage, void **base_addr) { @@ -348,23 +454,20 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, } xattr_addr = page_address(xpage); - memcpy(xattr_addr, txattr_addr + inline_size, PAGE_SIZE - - sizeof(struct node_footer)); + memcpy(xattr_addr, txattr_addr + inline_size, MAX_XATTR_BLOCK_SIZE); set_page_dirty(xpage); f2fs_put_page(xpage, 1); - /* need to checkpoint during fsync */ - F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi)); return 0; } int f2fs_getxattr(struct inode *inode, int index, const char *name, void *buffer, size_t buffer_size, struct page *ipage) { - struct f2fs_xattr_entry *entry; - void *base_addr; + struct f2fs_xattr_entry *entry = NULL; int error = 0; - size_t size, len; + unsigned int size, len; + void *base_addr = NULL; if (name == NULL) return -EINVAL; @@ -373,21 +476,16 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, if (len > F2FS_NAME_LEN) return -ERANGE; - error = read_all_xattrs(inode, ipage, &base_addr); + error = lookup_all_xattrs(inode, ipage, index, len, name, + &entry, &base_addr); if (error) return error; - entry = __find_xattr(base_addr, index, len, name); - if (IS_XATTR_LAST_ENTRY(entry)) { - error = -ENODATA; - goto cleanup; - } - size = le16_to_cpu(entry->e_value_size); if (buffer && size > buffer_size) { error = -ERANGE; - goto cleanup; + goto out; } if (buffer) { @@ -395,8 +493,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, memcpy(buffer, pval, size); } error = size; - -cleanup: +out: kzfree(base_addr); return error; } @@ -445,6 +542,13 @@ cleanup: return error; } +static bool f2fs_xattr_value_same(struct f2fs_xattr_entry *entry, + const void *value, size_t size) +{ + void *pval = entry->e_name + entry->e_name_len; + return (entry->e_value_size == size) && !memcmp(pval, value, size); +} + static int __f2fs_setxattr(struct inode *inode, int index, const char *name, const void *value, size_t size, struct page *ipage, int flags) @@ -479,12 +583,17 @@ static int __f2fs_setxattr(struct inode *inode, int index, found = IS_XATTR_LAST_ENTRY(here) ? 
0 : 1; - if ((flags & XATTR_REPLACE) && !found) { + if (found) { + if ((flags & XATTR_CREATE)) { + error = -EEXIST; + goto exit; + } + + if (f2fs_xattr_value_same(here, value, size)) + goto exit; + } else if ((flags & XATTR_REPLACE)) { error = -ENODATA; goto exit; - } else if ((flags & XATTR_CREATE) && found) { - error = -EEXIST; - goto exit; } last = here; diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index f990de2..d5a9492 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -72,9 +72,10 @@ struct f2fs_xattr_entry { for (entry = XATTR_FIRST_ENTRY(addr);\ !IS_XATTR_LAST_ENTRY(entry);\ entry = XATTR_NEXT_ENTRY(entry)) - -#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + PAGE_SIZE - \ - sizeof(struct node_footer) - sizeof(__u32)) +#define MAX_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer)) +#define VALID_XATTR_BLOCK_SIZE (MAX_XATTR_BLOCK_SIZE - sizeof(__u32)) +#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + \ + VALID_XATTR_BLOCK_SIZE) #define MAX_VALUE_LEN(i) (MIN_OFFSET(i) - \ sizeof(struct f2fs_xattr_header) - \ diff --git a/fs/fat/fat.h b/fs/fat/fat.h index e6b764a..051dac1 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -364,8 +364,8 @@ extern const struct file_operations fat_file_operations; extern const struct inode_operations fat_file_inode_operations; extern int fat_setattr(struct dentry *dentry, struct iattr *attr); extern void fat_truncate_blocks(struct inode *inode, loff_t offset); -extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat); +extern int fat_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags); extern int fat_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); diff --git a/fs/fat/file.c b/fs/fat/file.c index 3d04b12..4724cc9 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -365,9 +365,10 @@ void fat_truncate_blocks(struct inode *inode, loff_t offset) fat_flush_inodes(inode->i_sb, inode, NULL); } -int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +int fat_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { - struct inode *inode = d_inode(dentry); + struct inode *inode = d_inode(path->dentry); generic_fillattr(inode, stat); stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size; @@ -7,6 +7,7 @@ #include <linux/syscalls.h> #include <linux/init.h> #include <linux/mm.h> +#include <linux/sched/task.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/fdtable.h> @@ -12,7 +12,7 @@ #include <linux/mm.h> #include <linux/mmzone.h> #include <linux/time.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/file.h> diff --git a/fs/file_table.c b/fs/file_table.c index 6d982b5..954d510 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -13,6 +13,7 @@ #include <linux/module.h> #include <linux/fs.h> #include <linux/security.h> +#include <linux/cred.h> #include <linux/eventpoll.h> #include <linux/rcupdate.h> #include <linux/mount.h> diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 7dca743..be02507 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -1,5 +1,6 @@ #include <linux/export.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> +#include <linux/sched/task.h> #include <linux/fs.h> #include <linux/path.h> #include <linux/slab.h> diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index 5d5ddaa..67f9408 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ 
-329,7 +329,7 @@ static void fscache_objlist_config(struct fscache_objlist_data *data) config = 0; rcu_read_lock(); - confkey = user_key_payload(key); + confkey = user_key_payload_rcu(key); buf = confkey->data; for (len = confkey->datalen - 1; len >= 0; len--) { diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index f117926..b681b43 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -11,6 +11,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/poll.h> +#include <linux/sched/signal.h> #include <linux/uio.h> #include <linux/miscdevice.h> #include <linux/pagemap.h> diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 811fd89..00800c0 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -473,7 +473,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, if (err) { fuse_sync_release(ff, flags); } else { - file->private_data = fuse_file_get(ff); + file->private_data = ff; fuse_finish_open(inode, file); } return err; @@ -1777,10 +1777,10 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr) return ret; } -static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, - struct kstat *stat) +static int fuse_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { - struct inode *inode = d_inode(entry); + struct inode *inode = d_inode(path->dentry); struct fuse_conn *fc = get_fuse_conn(inode); if (!fuse_allow_current_process(fc)) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e80bfd0..ec238fb 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -58,7 +58,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) } INIT_LIST_HEAD(&ff->write_entry); - atomic_set(&ff->count, 0); + atomic_set(&ff->count, 1); RB_CLEAR_NODE(&ff->polled_node); init_waitqueue_head(&ff->poll_wait); @@ -75,7 +75,7 @@ void fuse_file_free(struct fuse_file *ff) kfree(ff); } -struct fuse_file *fuse_file_get(struct fuse_file *ff) +static struct fuse_file *fuse_file_get(struct fuse_file *ff) { atomic_inc(&ff->count); return ff; @@ -100,6 +100,7 @@ static void fuse_file_put(struct fuse_file *ff, bool sync) iput(req->misc.release.inode); fuse_put_request(ff->fc, req); } else if (sync) { + __set_bit(FR_FORCE, &req->flags); __clear_bit(FR_BACKGROUND, &req->flags); fuse_request_send(ff->fc, req); iput(req->misc.release.inode); @@ -146,7 +147,7 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, ff->open_flags &= ~FOPEN_DIRECT_IO; ff->nodeid = nodeid; - file->private_data = fuse_file_get(ff); + file->private_data = ff; return 0; } @@ -245,14 +246,9 @@ static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode) void fuse_release_common(struct file *file, int opcode) { - struct fuse_file *ff; - struct fuse_req *req; - - ff = file->private_data; - if (unlikely(!ff)) - return; + struct fuse_file *ff = file->private_data; + struct fuse_req *req = ff->reserved_req; - req = ff->reserved_req; fuse_prepare_release(ff, file->f_flags, opcode); if (ff->flock) { @@ -297,13 +293,13 @@ static int fuse_release(struct inode *inode, struct file *file) void fuse_sync_release(struct fuse_file *ff, int flags) { - WARN_ON(atomic_read(&ff->count) > 1); + WARN_ON(atomic_read(&ff->count) != 1); fuse_prepare_release(ff, flags, FUSE_RELEASE); - __set_bit(FR_FORCE, &ff->reserved_req->flags); - __clear_bit(FR_BACKGROUND, &ff->reserved_req->flags); - fuse_request_send(ff->fc, ff->reserved_req); - fuse_put_request(ff->fc, ff->reserved_req); - kfree(ff); + /* + * iput(NULL) is a no-op and since the refcount is 1 and everything's + * synchronous, 
we are fine with not doing igrab() here + */ + fuse_file_put(ff, true); } EXPORT_SYMBOL_GPL(fuse_sync_release); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 052f8d3..32ac2c9 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -732,7 +732,6 @@ void fuse_read_fill(struct fuse_req *req, struct file *file, int fuse_open_common(struct inode *inode, struct file *file, bool isdir); struct fuse_file *fuse_file_alloc(struct fuse_conn *fc); -struct fuse_file *fuse_file_get(struct fuse_file *ff); void fuse_file_free(struct fuse_file *ff); void fuse_finish_open(struct inode *inode, struct file *file); diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index eb7724b..e279c3c 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -13,6 +13,7 @@ #include <linux/buffer_head.h> #include <linux/namei.h> #include <linux/mm.h> +#include <linux/cred.h> #include <linux/xattr.h> #include <linux/posix_acl.h> #include <linux/gfs2_ondisk.h> @@ -1959,9 +1960,10 @@ out: /** * gfs2_getattr - Read out an inode's attributes - * @mnt: The vfsmount the inode is being accessed from - * @dentry: The dentry to stat + * @path: Object to query * @stat: The inode's stats + * @request_mask: Mask of STATX_xxx flags indicating the caller's interests + * @flags: AT_STATX_xxx setting * * This may be called from the VFS directly, or from within GFS2 with the * inode locked, so we look to see if the glock is already locked and only @@ -1972,10 +1974,10 @@ out: * Returns: errno */ -static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +static int gfs2_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { - struct inode *inode = d_inode(dentry); + struct inode *inode = d_inode(path->dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int error; diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 8b907c5..0515f0a 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -15,6 +15,7 @@ #include <linux/types.h> #include <linux/delay.h> #include <linux/gfs2_ondisk.h> +#include <linux/sched/signal.h> #include "incore.h" #include "glock.h" diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index e3ee387..361796a 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -10,7 +10,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/bio.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index f8d30e4..7a51534 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -10,6 +10,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/sched.h> +#include <linux/cred.h> #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 5de5c48..75b2542 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -169,7 +169,7 @@ static int hfs_readdir(struct file *file, struct dir_context *ctx) * Can be done after the list insertion; exclusion with * hfs_delete_cat() is provided by directory lock. 
*/ - memcpy(&rd->key, &fd.key, sizeof(struct hfs_cat_key)); + memcpy(&rd->key, &fd.key->cat, sizeof(struct hfs_cat_key)); out: hfs_find_exit(&fd); return err; diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index f776acf..bfbba79 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -14,6 +14,7 @@ #include <linux/pagemap.h> #include <linux/mpage.h> #include <linux/sched.h> +#include <linux/cred.h> #include <linux/uio.h> #include <linux/xattr.h> diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 2e796f8..e8638d5 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -14,6 +14,7 @@ #include <linux/pagemap.h> #include <linux/mpage.h> #include <linux/sched.h> +#include <linux/cred.h> #include <linux/uio.h> #include "hfsplus_fs.h" diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index aebb78f..d352f3a 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -18,7 +18,7 @@ #include <linux/pagemap.h> #include <linux/buffer_head.h> #include <linux/slab.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/blkdev.h> #include <asm/unaligned.h> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 54de77e..8f96461 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -11,7 +11,7 @@ #include <linux/thread_info.h> #include <asm/current.h> -#include <linux/sched.h> /* remove ASAP */ +#include <linux/sched/signal.h> /* remove ASAP */ #include <linux/falloc.h> #include <linux/fs.h> #include <linux/mount.h> @@ -15,6 +15,8 @@ #include <linux/writeback.h> #include <linux/buffer_head.h> #include <linux/falloc.h> +#include <linux/sched/signal.h> + #include "internal.h" #include <asm/ioctls.h> @@ -26,6 +26,8 @@ #include <linux/buffer_head.h> #include <linux/task_io_accounting_ops.h> #include <linux/dax.h> +#include <linux/sched/signal.h> + #include "internal.h" /* diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 871c8b3..020ba09 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -15,6 +15,7 @@ #include <linux/module.h> #include <linux/slab.h> +#include <linux/cred.h> #include <linux/nls.h> #include <linux/ctype.h> #include <linux/statfs.h> diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c index e5c1783..453a6a1 100644 --- a/fs/jffs2/background.c +++ b/fs/jffs2/background.c @@ -16,7 +16,7 @@ #include <linux/jffs2.h> #include <linux/mtd/mtd.h> #include <linux/completion.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/freezer.h> #include <linux/kthread.h> #include "nodelist.h" diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 567653f..76fa814 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -15,6 +15,7 @@ #include <linux/capability.h> #include <linux/kernel.h> #include <linux/sched.h> +#include <linux/cred.h> #include <linux/fs.h> #include <linux/list.h> #include <linux/mtd/mtd.h> diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c index cda0774..a7bbe87 100644 --- a/fs/jffs2/nodemgmt.c +++ b/fs/jffs2/nodemgmt.c @@ -14,7 +14,7 @@ #include <linux/kernel.h> #include <linux/mtd/mtd.h> #include <linux/compiler.h> -#include <linux/sched.h> /* For cond_resched() */ +#include <linux/sched/signal.h> #include "nodelist.h" #include "debug.h" diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 35043a8..8e4dc7a 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -13,7 +13,7 @@ #include <linux/slab.h> #include <linux/poll.h> #include <linux/pagemap.h> -#include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/fsnotify.h> #include "kernfs-internal.h" diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c 
index ac9e108..fb4b4a7 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -200,11 +200,11 @@ static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) set_nlink(inode, kn->dir.subdirs + 2); } -int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +int kernfs_iop_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) { - struct kernfs_node *kn = dentry->d_fsdata; - struct inode *inode = d_inode(dentry); + struct kernfs_node *kn = path->dentry->d_fsdata; + struct inode *inode = d_inode(path->dentry); mutex_lock(&kernfs_mutex); kernfs_refresh_inode(kn, inode); diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 3100987..2d5144a 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -80,8 +80,8 @@ extern const struct xattr_handler *kernfs_xattr_handlers[]; void kernfs_evict_inode(struct inode *inode); int kernfs_iop_permission(struct inode *inode, int mask); int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr); -int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat); +int kernfs_iop_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags); ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size); /* @@ -7,6 +7,7 @@ #include <linux/export.h> #include <linux/pagemap.h> #include <linux/slab.h> +#include <linux/cred.h> #include <linux/mount.h> #include <linux/vfs.h> #include <linux/quotaops.h> @@ -20,10 +21,10 @@ #include "internal.h" -int simple_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +int simple_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) { - struct inode *inode = d_inode(dentry); + struct inode *inode = d_inode(path->dentry); generic_fillattr(inode, stat); stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9); return 0; @@ -1143,10 +1144,10 @@ static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(-ENOENT); } -static int empty_dir_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +static int empty_dir_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) { - struct inode *inode = d_inode(dentry); + struct inode *inode = d_inode(path->dentry); generic_fillattr(inode, stat); return 0; } diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 1c13dd8..e7c8b9c 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -17,7 +17,7 @@ #include <linux/sysctl.h> #include <linux/moduleparam.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/uio.h> @@ -322,6 +322,8 @@ static int lockd_inet6addr_event(struct notifier_block *this, dprintk("lockd_inet6addr_event: removed %pI6\n", &ifa->addr); sin6.sin6_family = AF_INET6; sin6.sin6_addr = ifa->addr; + if (ipv6_addr_type(&sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin6.sin6_scope_id = ifa->idev->dev->ifindex; svc_age_temp_xprts_now(nlmsvc_rqst->rq_server, (struct sockaddr *)&sin6); } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index e7d9bf8..6ac76b0 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -622,11 +622,14 @@ static int minix_write_inode(struct inode *inode, struct writeback_control *wbc) return err; } -int minix_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +int minix_getattr(const struct 
path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { - struct super_block *sb = dentry->d_sb; - generic_fillattr(d_inode(dentry), stat); - if (INODE_VERSION(d_inode(dentry)) == MINIX_V1) + struct super_block *sb = path->dentry->d_sb; + struct inode *inode = d_inode(path->dentry); + + generic_fillattr(inode, stat); + if (INODE_VERSION(inode) == MINIX_V1) stat->blocks = (BLOCK_SIZE / 512) * V1_minix_blocks(stat->size, sb); else stat->blocks = (sb->s_blocksize / 512) * V2_minix_blocks(stat->size, sb); diff --git a/fs/minix/minix.h b/fs/minix/minix.h index 01ad81d..663d661 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -51,7 +51,7 @@ extern unsigned long minix_count_free_inodes(struct super_block *sb); extern int minix_new_block(struct inode * inode); extern void minix_free_block(struct inode *inode, unsigned long block); extern unsigned long minix_count_free_blocks(struct super_block *sb); -extern int minix_getattr(struct vfsmount *, struct dentry *, struct kstat *); +extern int minix_getattr(const struct path *, struct kstat *, u32, unsigned int); extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len); extern void V1_minix_truncate(struct inode *); @@ -672,17 +672,15 @@ static bool legitimize_links(struct nameidata *nd) /** * unlazy_walk - try to switch to ref-walk mode. * @nd: nameidata pathwalk data - * @dentry: child of nd->path.dentry or NULL - * @seq: seq number to check dentry against * Returns: 0 on success, -ECHILD on failure * - * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry - * for ref-walk mode. @dentry must be a path found by a do_lookup call on - * @nd or NULL. Must be called from rcu-walk context. + * unlazy_walk attempts to legitimize the current nd->path and nd->root + * for ref-walk mode. + * Must be called from rcu-walk context. * Nothing should touch nameidata between unlazy_walk() failure and * terminate_walk(). */ -static int unlazy_walk(struct nameidata *nd, struct dentry *dentry, unsigned seq) +static int unlazy_walk(struct nameidata *nd) { struct dentry *parent = nd->path.dentry; @@ -691,33 +689,66 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry, unsigned seq nd->flags &= ~LOOKUP_RCU; if (unlikely(!legitimize_links(nd))) goto out2; + if (unlikely(!legitimize_path(nd, &nd->path, nd->seq))) + goto out1; + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq))) + goto out; + } + rcu_read_unlock(); + BUG_ON(nd->inode != parent->d_inode); + return 0; + +out2: + nd->path.mnt = NULL; + nd->path.dentry = NULL; +out1: + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; +out: + rcu_read_unlock(); + return -ECHILD; +} + +/** + * unlazy_child - try to switch to ref-walk mode. + * @nd: nameidata pathwalk data + * @dentry: child of nd->path.dentry + * @seq: seq number to check dentry against + * Returns: 0 on success, -ECHILD on failure + * + * unlazy_child attempts to legitimize the current nd->path, nd->root and dentry + * for ref-walk mode. @dentry must be a path found by a do_lookup call on + * @nd. Must be called from rcu-walk context. + * Nothing should touch nameidata between unlazy_child() failure and + * terminate_walk(). 
+ */ +static int unlazy_child(struct nameidata *nd, struct dentry *dentry, unsigned seq) +{ + BUG_ON(!(nd->flags & LOOKUP_RCU)); + + nd->flags &= ~LOOKUP_RCU; + if (unlikely(!legitimize_links(nd))) + goto out2; if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq))) goto out2; - if (unlikely(!lockref_get_not_dead(&parent->d_lockref))) + if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref))) goto out1; /* - * For a negative lookup, the lookup sequence point is the parents - * sequence point, and it only needs to revalidate the parent dentry. - * - * For a positive lookup, we need to move both the parent and the - * dentry from the RCU domain to be properly refcounted. And the - * sequence number in the dentry validates *both* dentry counters, - * since we checked the sequence number of the parent after we got - * the child sequence number. So we know the parent must still - * be valid if the child sequence number is still valid. + * We need to move both the parent and the dentry from the RCU domain + * to be properly refcounted. And the sequence number in the dentry + * validates *both* dentry counters, since we checked the sequence + * number of the parent after we got the child sequence number. So we + * know the parent must still be valid if the child sequence number is */ - if (!dentry) { - if (read_seqcount_retry(&parent->d_seq, nd->seq)) - goto out; - BUG_ON(nd->inode != parent->d_inode); - } else { - if (!lockref_get_not_dead(&dentry->d_lockref)) - goto out; - if (read_seqcount_retry(&dentry->d_seq, seq)) - goto drop_dentry; + if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) + goto out; + if (unlikely(read_seqcount_retry(&dentry->d_seq, seq))) { + rcu_read_unlock(); + dput(dentry); + goto drop_root_mnt; } - /* * Sequence counts matched. Now make sure that the root is * still valid and get it if required. 
@@ -733,10 +764,6 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry, unsigned seq rcu_read_unlock(); return 0; -drop_dentry: - rcu_read_unlock(); - dput(dentry); - goto drop_root_mnt; out2: nd->path.mnt = NULL; out1: @@ -749,27 +776,12 @@ drop_root_mnt: return -ECHILD; } -static int unlazy_link(struct nameidata *nd, struct path *link, unsigned seq) -{ - if (unlikely(!legitimize_path(nd, link, seq))) { - drop_links(nd); - nd->depth = 0; - nd->flags &= ~LOOKUP_RCU; - nd->path.mnt = NULL; - nd->path.dentry = NULL; - if (!(nd->flags & LOOKUP_ROOT)) - nd->root.mnt = NULL; - rcu_read_unlock(); - } else if (likely(unlazy_walk(nd, NULL, 0)) == 0) { - return 0; - } - path_put(link); - return -ECHILD; -} - static inline int d_revalidate(struct dentry *dentry, unsigned int flags) { - return dentry->d_op->d_revalidate(dentry, flags); + if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) + return dentry->d_op->d_revalidate(dentry, flags); + else + return 1; } /** @@ -790,7 +802,7 @@ static int complete_walk(struct nameidata *nd) if (nd->flags & LOOKUP_RCU) { if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; - if (unlikely(unlazy_walk(nd, NULL, 0))) + if (unlikely(unlazy_walk(nd))) return -ECHILD; } @@ -1016,7 +1028,7 @@ const char *get_link(struct nameidata *nd) touch_atime(&last->link); cond_resched(); } else if (atime_needs_update_rcu(&last->link, inode)) { - if (unlikely(unlazy_walk(nd, NULL, 0))) + if (unlikely(unlazy_walk(nd))) return ERR_PTR(-ECHILD); touch_atime(&last->link); } @@ -1035,7 +1047,7 @@ const char *get_link(struct nameidata *nd) if (nd->flags & LOOKUP_RCU) { res = get(NULL, inode, &last->done); if (res == ERR_PTR(-ECHILD)) { - if (unlikely(unlazy_walk(nd, NULL, 0))) + if (unlikely(unlazy_walk(nd))) return ERR_PTR(-ECHILD); res = get(dentry, inode, &last->done); } @@ -1469,19 +1481,14 @@ static struct dentry *lookup_dcache(const struct qstr *name, struct dentry *dir, unsigned int flags) { - struct dentry *dentry; - int error; - - dentry = d_lookup(dir, name); + struct dentry *dentry = d_lookup(dir, name); if (dentry) { - if (dentry->d_flags & DCACHE_OP_REVALIDATE) { - error = d_revalidate(dentry, flags); - if (unlikely(error <= 0)) { - if (!error) - d_invalidate(dentry); - dput(dentry); - return ERR_PTR(error); - } + int error = d_revalidate(dentry, flags); + if (unlikely(error <= 0)) { + if (!error) + d_invalidate(dentry); + dput(dentry); + return ERR_PTR(error); } } return dentry; @@ -1546,7 +1553,7 @@ static int lookup_fast(struct nameidata *nd, bool negative; dentry = __d_lookup_rcu(parent, &nd->last, &seq); if (unlikely(!dentry)) { - if (unlazy_walk(nd, NULL, 0)) + if (unlazy_walk(nd)) return -ECHILD; return 0; } @@ -1571,14 +1578,8 @@ static int lookup_fast(struct nameidata *nd, return -ECHILD; *seqp = seq; - if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) - status = d_revalidate(dentry, nd->flags); - if (unlikely(status <= 0)) { - if (unlazy_walk(nd, dentry, seq)) - return -ECHILD; - if (status == -ECHILD) - status = d_revalidate(dentry, nd->flags); - } else { + status = d_revalidate(dentry, nd->flags); + if (likely(status > 0)) { /* * Note: do negative dentry check after revalidation in * case that drops it. 
@@ -1589,15 +1590,17 @@ static int lookup_fast(struct nameidata *nd, path->dentry = dentry; if (likely(__follow_mount_rcu(nd, path, inode, seqp))) return 1; - if (unlazy_walk(nd, dentry, seq)) - return -ECHILD; } + if (unlazy_child(nd, dentry, seq)) + return -ECHILD; + if (unlikely(status == -ECHILD)) + /* we'd been told to redo it in non-rcu mode */ + status = d_revalidate(dentry, nd->flags); } else { dentry = __d_lookup(parent, &nd->last); if (unlikely(!dentry)) return 0; - if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) - status = d_revalidate(dentry, nd->flags); + status = d_revalidate(dentry, nd->flags); } if (unlikely(status <= 0)) { if (!status) @@ -1636,8 +1639,7 @@ again: if (IS_ERR(dentry)) goto out; if (unlikely(!d_in_lookup(dentry))) { - if ((dentry->d_flags & DCACHE_OP_REVALIDATE) && - !(flags & LOOKUP_NO_REVAL)) { + if (!(flags & LOOKUP_NO_REVAL)) { int error = d_revalidate(dentry, flags); if (unlikely(error <= 0)) { if (!error) { @@ -1668,7 +1670,7 @@ static inline int may_lookup(struct nameidata *nd) int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK); if (err != -ECHILD) return err; - if (unlazy_walk(nd, NULL, 0)) + if (unlazy_walk(nd)) return -ECHILD; } return inode_permission(nd->inode, MAY_EXEC); @@ -1703,9 +1705,17 @@ static int pick_link(struct nameidata *nd, struct path *link, error = nd_alloc_stack(nd); if (unlikely(error)) { if (error == -ECHILD) { - if (unlikely(unlazy_link(nd, link, seq))) - return -ECHILD; - error = nd_alloc_stack(nd); + if (unlikely(!legitimize_path(nd, link, seq))) { + drop_links(nd); + nd->depth = 0; + nd->flags &= ~LOOKUP_RCU; + nd->path.mnt = NULL; + nd->path.dentry = NULL; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + rcu_read_unlock(); + } else if (likely(unlazy_walk(nd)) == 0) + error = nd_alloc_stack(nd); } if (error) { path_put(link); @@ -2122,7 +2132,7 @@ OK: } if (unlikely(!d_can_lookup(nd->path.dentry))) { if (nd->flags & LOOKUP_RCU) { - if (unlazy_walk(nd, NULL, 0)) + if (unlazy_walk(nd)) return -ECHILD; } return -ENOTDIR; @@ -2579,7 +2589,7 @@ mountpoint_last(struct nameidata *nd) /* If we're in rcuwalk, drop out of it to handle last component */ if (nd->flags & LOOKUP_RCU) { - if (unlazy_walk(nd, NULL, 0)) + if (unlazy_walk(nd)) return -ECHILD; } @@ -3072,9 +3082,6 @@ static int lookup_open(struct nameidata *nd, struct path *path, if (d_in_lookup(dentry)) break; - if (!(dentry->d_flags & DCACHE_OP_REVALIDATE)) - break; - error = d_revalidate(dentry, nd->flags); if (likely(error > 0)) break; @@ -3356,13 +3363,50 @@ out: return error; } +struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag) +{ + static const struct qstr name = QSTR_INIT("/", 1); + struct dentry *child = NULL; + struct inode *dir = dentry->d_inode; + struct inode *inode; + int error; + + /* we want directory to be writable */ + error = inode_permission(dir, MAY_WRITE | MAY_EXEC); + if (error) + goto out_err; + error = -EOPNOTSUPP; + if (!dir->i_op->tmpfile) + goto out_err; + error = -ENOMEM; + child = d_alloc(dentry, &name); + if (unlikely(!child)) + goto out_err; + error = dir->i_op->tmpfile(dir, child, mode); + if (error) + goto out_err; + error = -ENOENT; + inode = child->d_inode; + if (unlikely(!inode)) + goto out_err; + if (!(open_flag & O_EXCL)) { + spin_lock(&inode->i_lock); + inode->i_state |= I_LINKABLE; + spin_unlock(&inode->i_lock); + } + return child; + +out_err: + dput(child); + return ERR_PTR(error); +} +EXPORT_SYMBOL(vfs_tmpfile); + static int do_tmpfile(struct nameidata *nd, unsigned flags, const 
struct open_flags *op, struct file *file, int *opened) { - static const struct qstr name = QSTR_INIT("/", 1); struct dentry *child; - struct inode *dir; struct path path; int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path); if (unlikely(error)) @@ -3370,25 +3414,12 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags, error = mnt_want_write(path.mnt); if (unlikely(error)) goto out; - dir = path.dentry->d_inode; - /* we want directory to be writable */ - error = inode_permission(dir, MAY_WRITE | MAY_EXEC); - if (error) + child = vfs_tmpfile(path.dentry, op->mode, op->open_flag); + error = PTR_ERR(child); + if (unlikely(IS_ERR(child))) goto out2; - if (!dir->i_op->tmpfile) { - error = -EOPNOTSUPP; - goto out2; - } - child = d_alloc(path.dentry, &name); - if (unlikely(!child)) { - error = -ENOMEM; - goto out2; - } dput(path.dentry); path.dentry = child; - error = dir->i_op->tmpfile(dir, child, op->mode); - if (error) - goto out2; audit_inode(nd->name, child, 0); /* Don't check for other permissions, the inode was just created */ error = may_open(&path, 0, op->open_flag); @@ -3399,14 +3430,8 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags, if (error) goto out2; error = open_check_o_direct(file); - if (error) { + if (error) fput(file); - } else if (!(op->open_flag & O_EXCL)) { - struct inode *inode = file_inode(file); - spin_lock(&inode->i_lock); - inode->i_state |= I_LINKABLE; - spin_unlock(&inode->i_lock); - } out2: mnt_drop_write(path.mnt); out: diff --git a/fs/namespace.c b/fs/namespace.c index 8bfad42..cc1375ef 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -15,6 +15,7 @@ #include <linux/user_namespace.h> #include <linux/namei.h> #include <linux/security.h> +#include <linux/cred.h> #include <linux/idr.h> #include <linux/init.h> /* init_rootfs */ #include <linux/fs_struct.h> /* get_fs_root et.al. 
*/ @@ -24,6 +25,8 @@ #include <linux/magic.h> #include <linux/bootmem.h> #include <linux/task_work.h> +#include <linux/sched/task.h> + #include "pnode.h" #include "internal.h" diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 7eb89c2..d560609 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -30,6 +30,7 @@ #include <linux/vfs.h> #include <linux/mount.h> #include <linux/seq_file.h> +#include <linux/sched/signal.h> #include <linux/namei.h> #include <net/sock.h> diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index 4434e49..12550c2 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -19,6 +19,7 @@ #include <linux/highuid.h> #include <linux/vmalloc.h> #include <linux/sched.h> +#include <linux/cred.h> #include <linux/uaccess.h> diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c index 97b111d..98b6db0 100644 --- a/fs/ncpfs/sock.c +++ b/fs/ncpfs/sock.c @@ -16,6 +16,7 @@ #include <linux/fcntl.h> #include <linux/stat.h> #include <linux/string.h> +#include <linux/sched/signal.h> #include <linux/uaccess.h> #include <linux/in.h> #include <linux/net.h> @@ -40,19 +41,12 @@ static int _recv(struct socket *sock, void *buf, int size, unsigned flags) return kernel_recvmsg(sock, &msg, &iov, 1, size, flags); } -static inline int do_send(struct socket *sock, struct kvec *vec, int count, - int len, unsigned flags) -{ - struct msghdr msg = { .msg_flags = flags }; - return kernel_sendmsg(sock, &msg, vec, count, len); -} - static int _send(struct socket *sock, const void *buff, int len) { - struct kvec vec; - vec.iov_base = (void *) buff; - vec.iov_len = len; - return do_send(sock, &vec, 1, len, 0); + struct msghdr msg = { .msg_flags = 0 }; + struct kvec vec = {.iov_base = (void *)buff, .iov_len = len}; + iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, &vec, 1, len); + return sock_sendmsg(sock, &msg); } struct ncp_request_reply { @@ -63,9 +57,7 @@ struct ncp_request_reply { size_t datalen; int result; enum { RQ_DONE, RQ_INPROGRESS, RQ_QUEUED, RQ_IDLE, RQ_ABANDONED } status; - struct kvec* tx_ciov; - size_t tx_totallen; - size_t tx_iovlen; + struct iov_iter from; struct kvec tx_iov[3]; u_int16_t tx_type; u_int32_t sign[6]; @@ -205,28 +197,22 @@ static inline void __ncptcp_abort(struct ncp_server *server) static int ncpdgram_send(struct socket *sock, struct ncp_request_reply *req) { - struct kvec vec[3]; - /* sock_sendmsg updates iov pointers for us :-( */ - memcpy(vec, req->tx_ciov, req->tx_iovlen * sizeof(vec[0])); - return do_send(sock, vec, req->tx_iovlen, - req->tx_totallen, MSG_DONTWAIT); + struct msghdr msg = { .msg_iter = req->from, .msg_flags = MSG_DONTWAIT }; + return sock_sendmsg(sock, &msg); } static void __ncptcp_try_send(struct ncp_server *server) { struct ncp_request_reply *rq; - struct kvec *iov; - struct kvec iovc[3]; + struct msghdr msg = { .msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT }; int result; rq = server->tx.creq; if (!rq) return; - /* sock_sendmsg updates iov pointers for us :-( */ - memcpy(iovc, rq->tx_ciov, rq->tx_iovlen * sizeof(iov[0])); - result = do_send(server->ncp_sock, iovc, rq->tx_iovlen, - rq->tx_totallen, MSG_NOSIGNAL | MSG_DONTWAIT); + msg.msg_iter = rq->from; + result = sock_sendmsg(server->ncp_sock, &msg); if (result == -EAGAIN) return; @@ -236,21 +222,12 @@ static void __ncptcp_try_send(struct ncp_server *server) __ncp_abort_request(server, rq, result); return; } - if (result >= rq->tx_totallen) { + if (!msg_data_left(&msg)) { server->rcv.creq = rq; server->tx.creq = NULL; return; } - rq->tx_totallen -= result; - iov = rq->tx_ciov; - while (iov->iov_len <= result) { 
- result -= iov->iov_len; - iov++; - rq->tx_iovlen--; - } - iov->iov_base += result; - iov->iov_len -= result; - rq->tx_ciov = iov; + rq->from = msg.msg_iter; } static inline void ncp_init_header(struct ncp_server *server, struct ncp_request_reply *req, struct ncp_request_header *h) @@ -263,22 +240,21 @@ static inline void ncp_init_header(struct ncp_server *server, struct ncp_request static void ncpdgram_start_request(struct ncp_server *server, struct ncp_request_reply *req) { - size_t signlen; - struct ncp_request_header* h; + size_t signlen, len = req->tx_iov[1].iov_len; + struct ncp_request_header *h = req->tx_iov[1].iov_base; - req->tx_ciov = req->tx_iov + 1; - - h = req->tx_iov[1].iov_base; ncp_init_header(server, req, h); - signlen = sign_packet(server, req->tx_iov[1].iov_base + sizeof(struct ncp_request_header) - 1, - req->tx_iov[1].iov_len - sizeof(struct ncp_request_header) + 1, - cpu_to_le32(req->tx_totallen), req->sign); + signlen = sign_packet(server, + req->tx_iov[1].iov_base + sizeof(struct ncp_request_header) - 1, + len - sizeof(struct ncp_request_header) + 1, + cpu_to_le32(len), req->sign); if (signlen) { - req->tx_ciov[1].iov_base = req->sign; - req->tx_ciov[1].iov_len = signlen; - req->tx_iovlen += 1; - req->tx_totallen += signlen; + /* NCP over UDP appends signature */ + req->tx_iov[2].iov_base = req->sign; + req->tx_iov[2].iov_len = signlen; } + iov_iter_kvec(&req->from, WRITE | ITER_KVEC, + req->tx_iov + 1, signlen ? 2 : 1, len + signlen); server->rcv.creq = req; server->timeout_last = server->m.time_out; server->timeout_retries = server->m.retry_count; @@ -292,24 +268,23 @@ static void ncpdgram_start_request(struct ncp_server *server, struct ncp_request static void ncptcp_start_request(struct ncp_server *server, struct ncp_request_reply *req) { - size_t signlen; - struct ncp_request_header* h; + size_t signlen, len = req->tx_iov[1].iov_len; + struct ncp_request_header *h = req->tx_iov[1].iov_base; - req->tx_ciov = req->tx_iov; - h = req->tx_iov[1].iov_base; ncp_init_header(server, req, h); signlen = sign_packet(server, req->tx_iov[1].iov_base + sizeof(struct ncp_request_header) - 1, - req->tx_iov[1].iov_len - sizeof(struct ncp_request_header) + 1, - cpu_to_be32(req->tx_totallen + 24), req->sign + 4) + 16; + len - sizeof(struct ncp_request_header) + 1, + cpu_to_be32(len + 24), req->sign + 4) + 16; req->sign[0] = htonl(NCP_TCP_XMIT_MAGIC); - req->sign[1] = htonl(req->tx_totallen + signlen); + req->sign[1] = htonl(len + signlen); req->sign[2] = htonl(NCP_TCP_XMIT_VERSION); req->sign[3] = htonl(req->datalen + 8); + /* NCP over TCP prepends signature */ req->tx_iov[0].iov_base = req->sign; req->tx_iov[0].iov_len = signlen; - req->tx_iovlen += 1; - req->tx_totallen += signlen; + iov_iter_kvec(&req->from, WRITE | ITER_KVEC, + req->tx_iov, 2, len + signlen); server->tx.creq = req; __ncptcp_try_send(server); @@ -364,18 +339,17 @@ static void __ncp_next_request(struct ncp_server *server) static void info_server(struct ncp_server *server, unsigned int id, const void * data, size_t len) { if (server->info_sock) { - struct kvec iov[2]; - __be32 hdr[2]; - - hdr[0] = cpu_to_be32(len + 8); - hdr[1] = cpu_to_be32(id); - - iov[0].iov_base = hdr; - iov[0].iov_len = 8; - iov[1].iov_base = (void *) data; - iov[1].iov_len = len; + struct msghdr msg = { .msg_flags = MSG_NOSIGNAL }; + __be32 hdr[2] = {cpu_to_be32(len + 8), cpu_to_be32(id)}; + struct kvec iov[2] = { + {.iov_base = hdr, .iov_len = 8}, + {.iov_base = (void *)data, .iov_len = len}, + }; + + iov_iter_kvec(&msg.msg_iter, ITER_KVEC 
| WRITE, + iov, 2, len + 8); - do_send(server->info_sock, iov, 2, len + 8, MSG_NOSIGNAL); + sock_sendmsg(server->info_sock, &msg); } } @@ -711,8 +685,6 @@ static int do_ncp_rpc_call(struct ncp_server *server, int size, req->datalen = max_reply_size; req->tx_iov[1].iov_base = server->packet; req->tx_iov[1].iov_len = size; - req->tx_iovlen = 1; - req->tx_totallen = size; req->tx_type = *(u_int16_t*)server->packet; result = ncp_add_request(server, req); diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c index 6de1570..2ae676f 100644 --- a/fs/nfs/cache_lib.c +++ b/fs/nfs/cache_lib.c @@ -141,8 +141,7 @@ int nfs_cache_register_net(struct net *net, struct cache_detail *cd) void nfs_cache_unregister_sb(struct super_block *sb, struct cache_detail *cd) { - if (cd->u.pipefs.dir) - sunrpc_cache_unregister_pipefs(cd); + sunrpc_cache_unregister_pipefs(cd); } void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 484bebc..bb79972 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -9,6 +9,7 @@ #include <linux/completion.h> #include <linux/ip.h> #include <linux/module.h> +#include <linux/sched/signal.h> #include <linux/sunrpc/svc.h> #include <linux/sunrpc/svcsock.h> #include <linux/nfs_fs.h> diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index eb094c6..d051fc3 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -83,23 +83,15 @@ static __be32 *read_buf(struct xdr_stream *xdr, size_t nbytes) return p; } -static __be32 decode_string(struct xdr_stream *xdr, unsigned int *len, const char **str) +static __be32 decode_string(struct xdr_stream *xdr, unsigned int *len, + const char **str, size_t maxlen) { - __be32 *p; - - p = read_buf(xdr, 4); - if (unlikely(p == NULL)) - return htonl(NFS4ERR_RESOURCE); - *len = ntohl(*p); - - if (*len != 0) { - p = read_buf(xdr, *len); - if (unlikely(p == NULL)) - return htonl(NFS4ERR_RESOURCE); - *str = (const char *)p; - } else - *str = NULL; + ssize_t err; + err = xdr_stream_decode_opaque_inline(xdr, (void **)str, maxlen); + if (err < 0) + return cpu_to_be32(NFS4ERR_RESOURCE); + *len = err; return 0; } @@ -162,15 +154,9 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound __be32 *p; __be32 status; - status = decode_string(xdr, &hdr->taglen, &hdr->tag); + status = decode_string(xdr, &hdr->taglen, &hdr->tag, CB_OP_TAGLEN_MAXSZ); if (unlikely(status != 0)) return status; - /* We do not like overly long tags! 
*/ - if (hdr->taglen > CB_OP_TAGLEN_MAXSZ) { - printk("NFS: NFSv4 CALLBACK %s: client sent tag of length %u\n", - __func__, hdr->taglen); - return htonl(NFS4ERR_RESOURCE); - } p = read_buf(xdr, 12); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); @@ -582,12 +568,8 @@ out: static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) { - __be32 *p; - - p = xdr_reserve_space(xdr, 4 + len); - if (unlikely(p == NULL)) - return htonl(NFS4ERR_RESOURCE); - xdr_encode_opaque(p, str, len); + if (unlikely(xdr_stream_encode_opaque(xdr, str, len) < 0)) + return cpu_to_be32(NFS4ERR_RESOURCE); return 0; } @@ -1083,7 +1065,8 @@ struct svc_version nfs4_callback_version1 = { .vs_proc = nfs4_callback_procedures1, .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, .vs_dispatch = NULL, - .vs_hidden = 1, + .vs_hidden = true, + .vs_need_cong_ctrl = true, }; struct svc_version nfs4_callback_version4 = { @@ -1092,5 +1075,6 @@ struct svc_version nfs4_callback_version4 = { .vs_proc = nfs4_callback_procedures1, .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, .vs_dispatch = NULL, - .vs_hidden = 1, + .vs_hidden = true, + .vs_need_cong_ctrl = true, }; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index fad8104..fb499a3 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2002,6 +2002,29 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) } EXPORT_SYMBOL_GPL(nfs_link); +static void +nfs_complete_rename(struct rpc_task *task, struct nfs_renamedata *data) +{ + struct dentry *old_dentry = data->old_dentry; + struct dentry *new_dentry = data->new_dentry; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); + + nfs_mark_for_revalidate(old_inode); + + switch (task->tk_status) { + case 0: + if (new_inode != NULL) + nfs_drop_nlink(new_inode); + d_move(old_dentry, new_dentry); + nfs_set_verifier(new_dentry, + nfs_save_change_attribute(data->new_dir)); + break; + case -ENOENT: + nfs_dentry_handle_enoent(old_dentry); + } +} + /* * RENAME * FIXME: Some nfsds, like the Linux user space nfsd, may generate a @@ -2084,7 +2107,8 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (new_inode != NULL) NFS_PROTO(new_inode)->return_delegation(new_inode); - task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL); + task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, + nfs_complete_rename); if (IS_ERR(task)) { error = PTR_ERR(task); goto out; @@ -2094,21 +2118,11 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (error == 0) error = task->tk_status; rpc_put_task(task); - nfs_mark_for_revalidate(old_inode); out: if (rehash) d_rehash(rehash); trace_nfs_rename_exit(old_dir, old_dentry, new_dir, new_dentry, error); - if (!error) { - if (new_inode != NULL) - nfs_drop_nlink(new_inode); - d_move(old_dentry, new_dentry); - nfs_set_verifier(new_dentry, - nfs_save_change_attribute(new_dir)); - } else if (error == -ENOENT) - nfs_dentry_handle_enoent(old_dentry); - /* new dentry created? 
*/ if (dentry) dput(dentry); diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 18f98e0..44347f4 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -305,7 +305,7 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data) } hdr->pgio_done_cb = filelayout_read_done_cb; - if (nfs41_setup_sequence(hdr->ds_clp->cl_session, + if (nfs4_setup_sequence(hdr->ds_clp, &hdr->args.seq_args, &hdr->res.seq_res, task)) @@ -403,7 +403,7 @@ static void filelayout_write_prepare(struct rpc_task *task, void *data) rpc_exit(task, 0); return; } - if (nfs41_setup_sequence(hdr->ds_clp->cl_session, + if (nfs4_setup_sequence(hdr->ds_clp, &hdr->args.seq_args, &hdr->res.seq_res, task)) @@ -438,7 +438,7 @@ static void filelayout_commit_prepare(struct rpc_task *task, void *data) { struct nfs_commit_data *wdata = data; - nfs41_setup_sequence(wdata->ds_clp->cl_session, + nfs4_setup_sequence(wdata->ds_clp, &wdata->args.seq_args, &wdata->res.seq_res, task); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index d6acc68..42dedf2 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1053,9 +1053,6 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, struct nfs_client *mds_client = mds_server->nfs_client; struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table; - if (task->tk_status >= 0) - return 0; - switch (task->tk_status) { /* MDS state errors */ case -NFS4ERR_DELEG_REVOKED: @@ -1157,9 +1154,6 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task, { struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); - if (task->tk_status >= 0) - return 0; - switch (task->tk_status) { /* File access problems. 
Don't mark the device as unavailable */ case -EACCES: @@ -1195,6 +1189,13 @@ static int ff_layout_async_handle_error(struct rpc_task *task, { int vers = clp->cl_nfs_mod->rpc_vers->number; + if (task->tk_status >= 0) + return 0; + + /* Handle the case of an invalid layout segment */ + if (!pnfs_is_valid_lseg(lseg)) + return -NFS4ERR_RESET_TO_PNFS; + switch (vers) { case 3: return ff_layout_async_handle_error_v3(task, lseg, idx); @@ -1384,30 +1385,14 @@ static void ff_layout_read_prepare_v3(struct rpc_task *task, void *data) rpc_call_start(task); } -static int ff_layout_setup_sequence(struct nfs_client *ds_clp, - struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, - struct rpc_task *task) -{ - if (ds_clp->cl_session) - return nfs41_setup_sequence(ds_clp->cl_session, - args, - res, - task); - return nfs40_setup_sequence(ds_clp->cl_slot_tbl, - args, - res, - task); -} - static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; - if (ff_layout_setup_sequence(hdr->ds_clp, - &hdr->args.seq_args, - &hdr->res.seq_res, - task)) + if (nfs4_setup_sequence(hdr->ds_clp, + &hdr->args.seq_args, + &hdr->res.seq_res, + task)) return; if (ff_layout_read_prepare_common(task, hdr)) @@ -1578,10 +1563,10 @@ static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; - if (ff_layout_setup_sequence(hdr->ds_clp, - &hdr->args.seq_args, - &hdr->res.seq_res, - task)) + if (nfs4_setup_sequence(hdr->ds_clp, + &hdr->args.seq_args, + &hdr->res.seq_res, + task)) return; if (ff_layout_write_prepare_common(task, hdr)) @@ -1667,10 +1652,10 @@ static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data) { struct nfs_commit_data *wdata = data; - if (ff_layout_setup_sequence(wdata->ds_clp, - &wdata->args.seq_args, - &wdata->res.seq_res, - task)) + if (nfs4_setup_sequence(wdata->ds_clp, + &wdata->args.seq_args, + &wdata->res.seq_res, + task)) return; ff_layout_commit_prepare_common(task, data); } @@ -1965,10 +1950,7 @@ static int ff_layout_encode_ioerr(struct xdr_stream *xdr, static void encode_opaque_fixed(struct xdr_stream *xdr, const void *buf, size_t len) { - __be32 *p; - - p = xdr_reserve_space(xdr, len); - xdr_encode_opaque_fixed(p, buf, len); + WARN_ON_ONCE(xdr_stream_encode_opaque_fixed(xdr, buf, len) < 0); } static void @@ -2092,7 +2074,7 @@ ff_layout_free_layoutreturn(struct nfs4_xdr_opaque_data *args) kfree(ff_args); } -const struct nfs4_xdr_opaque_ops layoutreturn_ops = { +static const struct nfs4_xdr_opaque_ops layoutreturn_ops = { .encode = ff_layout_encode_layoutreturn, .free = ff_layout_free_layoutreturn, }; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 5ca4d96..f489a5a 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -15,7 +15,7 @@ #include <linux/module.h> #include <linux/init.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/time.h> #include <linux/kernel.h> #include <linux/mm.h> @@ -703,9 +703,10 @@ static bool nfs_need_revalidate_inode(struct inode *inode) return false; } -int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +int nfs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) { - struct inode *inode = d_inode(dentry); + struct inode *inode = d_inode(path->dentry); int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME; int err = 0; @@ -726,17 +727,17 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) * - NFS never 
sets MS_NOATIME or MS_NODIRATIME so there is * no point in checking those. */ - if ((mnt->mnt_flags & MNT_NOATIME) || - ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) + if ((path->mnt->mnt_flags & MNT_NOATIME) || + ((path->mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) need_atime = 0; if (need_atime || nfs_need_revalidate_inode(inode)) { struct nfs_server *server = NFS_SERVER(inode); - nfs_readdirplus_parent_cache_miss(dentry); + nfs_readdirplus_parent_cache_miss(path->dentry); err = __nfs_revalidate_inode(server, inode); } else - nfs_readdirplus_parent_cache_hit(dentry); + nfs_readdirplus_parent_cache_hit(path->dentry); if (!err) { generic_fillattr(inode, stat); stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index e49d831..786f175 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -178,11 +178,12 @@ out_nofree: } static int -nfs_namespace_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +nfs_namespace_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) { - if (NFS_FH(d_inode(dentry))->size != 0) - return nfs_getattr(mnt, dentry, stat); - generic_fillattr(d_inode(dentry), stat); + if (NFS_FH(d_inode(path->dentry))->size != 0) + return nfs_getattr(path, stat, request_mask, query_flags); + generic_fillattr(d_inode(path->dentry), stat); return 0; } diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index d12ff93..1e486c7 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -12,6 +12,7 @@ #include "nfs42.h" #include "iostat.h" #include "pnfs.h" +#include "nfs4session.h" #include "internal.h" #define NFSDBG_FACILITY NFSDBG_PROC @@ -128,30 +129,26 @@ out_unlock: return err; } -static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src, +static ssize_t _nfs42_proc_copy(struct file *src, struct nfs_lock_context *src_lock, - struct file *dst, loff_t pos_dst, + struct file *dst, struct nfs_lock_context *dst_lock, - size_t count) + struct nfs42_copy_args *args, + struct nfs42_copy_res *res) { - struct nfs42_copy_args args = { - .src_fh = NFS_FH(file_inode(src)), - .src_pos = pos_src, - .dst_fh = NFS_FH(file_inode(dst)), - .dst_pos = pos_dst, - .count = count, - }; - struct nfs42_copy_res res; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COPY], - .rpc_argp = &args, - .rpc_resp = &res, + .rpc_argp = args, + .rpc_resp = res, }; struct inode *dst_inode = file_inode(dst); struct nfs_server *server = NFS_SERVER(dst_inode); + loff_t pos_src = args->src_pos; + loff_t pos_dst = args->dst_pos; + size_t count = args->count; int status; - status = nfs4_set_rw_stateid(&args.src_stateid, src_lock->open_context, + status = nfs4_set_rw_stateid(&args->src_stateid, src_lock->open_context, src_lock, FMODE_READ); if (status) return status; @@ -161,7 +158,7 @@ static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src, if (status) return status; - status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context, + status = nfs4_set_rw_stateid(&args->dst_stateid, dst_lock->open_context, dst_lock, FMODE_WRITE); if (status) return status; @@ -171,22 +168,22 @@ static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src, return status; status = nfs4_call_sync(server->client, server, &msg, - &args.seq_args, &res.seq_res, 0); + &args->seq_args, &res->seq_res, 0); if (status == -ENOTSUPP) server->caps &= ~NFS_CAP_COPY; if (status) return status; - if (res.write_res.verifier.committed != NFS_FILE_SYNC) { - 
status = nfs_commit_file(dst, &res.write_res.verifier.verifier); + if (res->write_res.verifier.committed != NFS_FILE_SYNC) { + status = nfs_commit_file(dst, &res->write_res.verifier.verifier); if (status) return status; } truncate_pagecache_range(dst_inode, pos_dst, - pos_dst + res.write_res.count); + pos_dst + res->write_res.count); - return res.write_res.count; + return res->write_res.count; } ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, @@ -196,8 +193,22 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, struct nfs_server *server = NFS_SERVER(file_inode(dst)); struct nfs_lock_context *src_lock; struct nfs_lock_context *dst_lock; - struct nfs4_exception src_exception = { }; - struct nfs4_exception dst_exception = { }; + struct nfs42_copy_args args = { + .src_fh = NFS_FH(file_inode(src)), + .src_pos = pos_src, + .dst_fh = NFS_FH(file_inode(dst)), + .dst_pos = pos_dst, + .count = count, + }; + struct nfs42_copy_res res; + struct nfs4_exception src_exception = { + .inode = file_inode(src), + .stateid = &args.src_stateid, + }; + struct nfs4_exception dst_exception = { + .inode = file_inode(dst), + .stateid = &args.dst_stateid, + }; ssize_t err, err2; if (!nfs_server_capable(file_inode(dst), NFS_CAP_COPY)) @@ -207,7 +218,6 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, if (IS_ERR(src_lock)) return PTR_ERR(src_lock); - src_exception.inode = file_inode(src); src_exception.state = src_lock->open_context->state; dst_lock = nfs_get_lock_context(nfs_file_open_context(dst)); @@ -216,15 +226,17 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, goto out_put_src_lock; } - dst_exception.inode = file_inode(dst); dst_exception.state = dst_lock->open_context->state; do { inode_lock(file_inode(dst)); - err = _nfs42_proc_copy(src, pos_src, src_lock, - dst, pos_dst, dst_lock, count); + err = _nfs42_proc_copy(src, src_lock, + dst, dst_lock, + &args, &res); inode_unlock(file_inode(dst)); + if (err >= 0) + break; if (err == -ENOTSUPP) { err = -EOPNOTSUPP; break; @@ -331,9 +343,8 @@ nfs42_layoutstat_prepare(struct rpc_task *task, void *calldata) } nfs4_stateid_copy(&data->args.stateid, &lo->plh_stateid); spin_unlock(&inode->i_lock); - nfs41_setup_sequence(nfs4_get_session(server), &data->args.seq_args, - &data->res.seq_res, task); - + nfs4_setup_sequence(server->nfs_client, &data->args.seq_args, + &data->res.seq_res, task); } static void diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 6651658..af285cc 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -273,14 +273,6 @@ extern int nfs4_set_rw_stateid(nfs4_stateid *stateid, fmode_t fmode); #if defined(CONFIG_NFS_V4_1) -static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) -{ - return server->nfs_client->cl_session; -} - -extern int nfs41_setup_sequence(struct nfs4_session *session, - struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, - struct rpc_task *task); extern int nfs41_sequence_done(struct rpc_task *, struct nfs4_sequence_res *); extern int nfs4_proc_create_session(struct nfs_client *, struct rpc_cred *); extern int nfs4_proc_destroy_session(struct nfs4_session *, struct rpc_cred *); @@ -357,11 +349,6 @@ nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp, hdr->args.stable = NFS_FILE_SYNC; } #else /* CONFIG_NFS_v4_1 */ -static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) -{ - return NULL; -} - static inline bool is_ds_only_client(struct nfs_client *clp) { @@ -466,7 +453,7 @@ extern void 
nfs_increment_open_seqid(int status, struct nfs_seqid *seqid); extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid); extern void nfs_release_seqid(struct nfs_seqid *seqid); extern void nfs_free_seqid(struct nfs_seqid *seqid); -extern int nfs40_setup_sequence(struct nfs4_slot_table *tbl, +extern int nfs4_setup_sequence(const struct nfs_client *client, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, struct rpc_task *task); diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index c444285..835c163 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -316,7 +316,7 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, if (ret < 0) goto out_up; - payload = user_key_payload(rkey); + payload = user_key_payload_rcu(rkey); if (IS_ERR_OR_NULL(payload)) { ret = PTR_ERR(payload); goto out_up; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 0a0eaec..1b18368 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -577,12 +577,7 @@ nfs4_async_handle_error(struct rpc_task *task, struct nfs_server *server, static bool _nfs4_is_integrity_protected(struct nfs_client *clp) { rpc_authflavor_t flavor = clp->cl_rpcclient->cl_auth->au_flavor; - - if (flavor == RPC_AUTH_GSS_KRB5I || - flavor == RPC_AUTH_GSS_KRB5P) - return true; - - return false; + return (flavor == RPC_AUTH_GSS_KRB5I) || (flavor == RPC_AUTH_GSS_KRB5P); } static void do_renew_lease(struct nfs_client *clp, unsigned long timestamp) @@ -622,48 +617,6 @@ static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args) args->sa_privileged = 1; } -int nfs40_setup_sequence(struct nfs4_slot_table *tbl, - struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, - struct rpc_task *task) -{ - struct nfs4_slot *slot; - - /* slot already allocated? */ - if (res->sr_slot != NULL) - goto out_start; - - spin_lock(&tbl->slot_tbl_lock); - if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged) - goto out_sleep; - - slot = nfs4_alloc_slot(tbl); - if (IS_ERR(slot)) { - if (slot == ERR_PTR(-ENOMEM)) - task->tk_timeout = HZ >> 2; - goto out_sleep; - } - spin_unlock(&tbl->slot_tbl_lock); - - slot->privileged = args->sa_privileged ? 1 : 0; - args->sa_slot = slot; - res->sr_slot = slot; - -out_start: - rpc_call_start(task); - return 0; - -out_sleep: - if (args->sa_privileged) - rpc_sleep_on_priority(&tbl->slot_tbl_waitq, task, - NULL, RPC_PRIORITY_PRIVILEGED); - else - rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); - spin_unlock(&tbl->slot_tbl_lock); - return -EAGAIN; -} -EXPORT_SYMBOL_GPL(nfs40_setup_sequence); - static void nfs40_sequence_free_slot(struct nfs4_sequence_res *res) { struct nfs4_slot *slot = res->sr_slot; @@ -815,10 +768,6 @@ static int nfs41_sequence_process(struct rpc_task *task, case -NFS4ERR_SEQ_FALSE_RETRY: ++slot->seq_nr; goto retry_nowait; - case -NFS4ERR_DEADSESSION: - case -NFS4ERR_BADSESSION: - nfs4_schedule_session_recovery(session, res->sr_status); - goto retry_nowait; default: /* Just update the slot sequence no. */ slot->seq_done = 1; @@ -882,101 +831,14 @@ int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) } EXPORT_SYMBOL_GPL(nfs4_sequence_done); -int nfs41_setup_sequence(struct nfs4_session *session, - struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, - struct rpc_task *task) -{ - struct nfs4_slot *slot; - struct nfs4_slot_table *tbl; - - dprintk("--> %s\n", __func__); - /* slot already allocated? 
*/ - if (res->sr_slot != NULL) - goto out_success; - - tbl = &session->fc_slot_table; - - task->tk_timeout = 0; - - spin_lock(&tbl->slot_tbl_lock); - if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state) && - !args->sa_privileged) { - /* The state manager will wait until the slot table is empty */ - dprintk("%s session is draining\n", __func__); - goto out_sleep; - } - - slot = nfs4_alloc_slot(tbl); - if (IS_ERR(slot)) { - /* If out of memory, try again in 1/4 second */ - if (slot == ERR_PTR(-ENOMEM)) - task->tk_timeout = HZ >> 2; - dprintk("<-- %s: no free slots\n", __func__); - goto out_sleep; - } - spin_unlock(&tbl->slot_tbl_lock); - - slot->privileged = args->sa_privileged ? 1 : 0; - args->sa_slot = slot; - - dprintk("<-- %s slotid=%u seqid=%u\n", __func__, - slot->slot_nr, slot->seq_nr); - - res->sr_slot = slot; - res->sr_timestamp = jiffies; - res->sr_status_flags = 0; - /* - * sr_status is only set in decode_sequence, and so will remain - * set to 1 if an rpc level failure occurs. - */ - res->sr_status = 1; - trace_nfs4_setup_sequence(session, args); -out_success: - rpc_call_start(task); - return 0; -out_sleep: - /* Privileged tasks are queued with top priority */ - if (args->sa_privileged) - rpc_sleep_on_priority(&tbl->slot_tbl_waitq, task, - NULL, RPC_PRIORITY_PRIVILEGED); - else - rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); - spin_unlock(&tbl->slot_tbl_lock); - return -EAGAIN; -} -EXPORT_SYMBOL_GPL(nfs41_setup_sequence); - -static int nfs4_setup_sequence(const struct nfs_server *server, - struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, - struct rpc_task *task) -{ - struct nfs4_session *session = nfs4_get_session(server); - int ret = 0; - - if (!session) - return nfs40_setup_sequence(server->nfs_client->cl_slot_tbl, - args, res, task); - - dprintk("--> %s clp %p session %p sr_slot %u\n", - __func__, session->clp, session, res->sr_slot ? - res->sr_slot->slot_nr : NFS4_NO_SLOT); - - ret = nfs41_setup_sequence(session, args, res, task); - - dprintk("<-- %s status=%d\n", __func__, ret); - return ret; -} - static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata) { struct nfs4_call_sync_data *data = calldata; - struct nfs4_session *session = nfs4_get_session(data->seq_server); dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server); - nfs41_setup_sequence(session, data->seq_args, data->seq_res, task); + nfs4_setup_sequence(data->seq_server->nfs_client, + data->seq_args, data->seq_res, task); } static void nfs41_call_sync_done(struct rpc_task *task, void *calldata) @@ -993,15 +855,6 @@ static const struct rpc_call_ops nfs41_call_sync_ops = { #else /* !CONFIG_NFS_V4_1 */ -static int nfs4_setup_sequence(const struct nfs_server *server, - struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, - struct rpc_task *task) -{ - return nfs40_setup_sequence(server->nfs_client->cl_slot_tbl, - args, res, task); -} - static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res) { return nfs40_sequence_done(task, res); @@ -1022,10 +875,68 @@ EXPORT_SYMBOL_GPL(nfs4_sequence_done); #endif /* !CONFIG_NFS_V4_1 */ +int nfs4_setup_sequence(const struct nfs_client *client, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + struct rpc_task *task) +{ + struct nfs4_session *session = nfs4_get_session(client); + struct nfs4_slot_table *tbl = client->cl_slot_tbl; + struct nfs4_slot *slot; + + /* slot already allocated? 
*/ + if (res->sr_slot != NULL) + goto out_start; + + if (session) { + tbl = &session->fc_slot_table; + task->tk_timeout = 0; + } + + spin_lock(&tbl->slot_tbl_lock); + /* The state manager will wait until the slot table is empty */ + if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged) + goto out_sleep; + + slot = nfs4_alloc_slot(tbl); + if (IS_ERR(slot)) { + /* Try again in 1/4 second */ + if (slot == ERR_PTR(-ENOMEM)) + task->tk_timeout = HZ >> 2; + goto out_sleep; + } + spin_unlock(&tbl->slot_tbl_lock); + + slot->privileged = args->sa_privileged ? 1 : 0; + args->sa_slot = slot; + + res->sr_slot = slot; + if (session) { + res->sr_timestamp = jiffies; + res->sr_status_flags = 0; + res->sr_status = 1; + } + + trace_nfs4_setup_sequence(session, args); +out_start: + rpc_call_start(task); + return 0; + +out_sleep: + if (args->sa_privileged) + rpc_sleep_on_priority(&tbl->slot_tbl_waitq, task, + NULL, RPC_PRIORITY_PRIVILEGED); + else + rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); + spin_unlock(&tbl->slot_tbl_lock); + return -EAGAIN; +} +EXPORT_SYMBOL_GPL(nfs4_setup_sequence); + static void nfs40_call_sync_prepare(struct rpc_task *task, void *calldata) { struct nfs4_call_sync_data *data = calldata; - nfs4_setup_sequence(data->seq_server, + nfs4_setup_sequence(data->seq_server->nfs_client, data->seq_args, data->seq_res, task); } @@ -1330,14 +1241,6 @@ static void nfs4_opendata_put(struct nfs4_opendata *p) kref_put(&p->kref, nfs4_opendata_free); } -static int nfs4_wait_for_completion_rpc_task(struct rpc_task *task) -{ - int ret; - - ret = rpc_wait_for_completion_task(task); - return ret; -} - static bool nfs4_mode_match_open_stateid(struct nfs4_state *state, fmode_t fmode) { @@ -1732,17 +1635,15 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data) int ret; if (!data->rpc_done) { - if (data->rpc_status) { - ret = data->rpc_status; - goto err; - } + if (data->rpc_status) + return ERR_PTR(data->rpc_status); /* cached opens have already been processed */ goto update; } ret = nfs_refresh_inode(inode, &data->f_attr); if (ret) - goto err; + return ERR_PTR(ret); if (data->o_res.delegation_type != 0) nfs4_opendata_check_deleg(data, state); @@ -1752,9 +1653,6 @@ update: atomic_inc(&state->count); return state; -err: - return ERR_PTR(ret); - } static struct nfs4_state * @@ -2048,8 +1946,8 @@ static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata) { struct nfs4_opendata *data = calldata; - nfs40_setup_sequence(data->o_arg.server->nfs_client->cl_slot_tbl, - &data->c_arg.seq_args, &data->c_res.seq_res, task); + nfs4_setup_sequence(data->o_arg.server->nfs_client, + &data->c_arg.seq_args, &data->c_res.seq_res, task); } static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) @@ -2124,7 +2022,7 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data) task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); - status = nfs4_wait_for_completion_rpc_task(task); + status = rpc_wait_for_completion_task(task); if (status != 0) { data->cancelled = 1; smp_wmb(); @@ -2172,7 +2070,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); } data->timestamp = jiffies; - if (nfs4_setup_sequence(data->o_arg.server, + if (nfs4_setup_sequence(data->o_arg.server->nfs_client, &data->o_arg.seq_args, &data->o_res.seq_res, task) != 0) @@ -2289,15 +2187,15 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover) data->is_recover = 1; } task = 
rpc_run_task(&task_setup_data); - if (IS_ERR(task)) - return PTR_ERR(task); - status = nfs4_wait_for_completion_rpc_task(task); - if (status != 0) { - data->cancelled = 1; - smp_wmb(); - } else - status = data->rpc_status; - rpc_put_task(task); + if (IS_ERR(task)) + return PTR_ERR(task); + status = rpc_wait_for_completion_task(task); + if (status != 0) { + data->cancelled = 1; + smp_wmb(); + } else + status = data->rpc_status; + rpc_put_task(task); return status; } @@ -2306,7 +2204,7 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data) { struct inode *dir = d_inode(data->dir); struct nfs_openres *o_res = &data->o_res; - int status; + int status; status = nfs4_run_open_task(data, 1); if (status != 0 || !data->rpc_done) @@ -2314,11 +2212,8 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data) nfs_fattr_map_and_free_names(NFS_SERVER(dir), &data->f_attr); - if (o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { + if (o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) status = _nfs4_proc_open_confirm(data); - if (status != 0) - return status; - } return status; } @@ -2412,11 +2307,6 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) return 0; } -static int nfs4_recover_expired_lease(struct nfs_server *server) -{ - return nfs4_client_recover_expired_lease(server->nfs_client); -} - /* * OPEN_EXPIRED: * reclaim state on the server after a network partition. @@ -2730,6 +2620,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, ret = PTR_ERR(state); if (IS_ERR(state)) goto out; + ctx->state = state; if (server->caps & NFS_CAP_POSIX_LOCK) set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); if (opendata->o_res.rflags & NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK) @@ -2755,7 +2646,6 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, if (ret != 0) goto out; - ctx->state = state; if (d_inode(dentry) == state->inode) { nfs_inode_attach_open_context(ctx); if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) @@ -2794,7 +2684,7 @@ static int _nfs4_do_open(struct inode *dir, dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n"); goto out_err; } - status = nfs4_recover_expired_lease(server); + status = nfs4_client_recover_expired_lease(server->nfs_client); if (status != 0) goto err_put_state_owner; if (d_really_is_positive(dentry)) @@ -2940,12 +2830,12 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_open_context *ctx) { struct nfs_server *server = NFS_SERVER(inode); - struct rpc_message msg = { + struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], .rpc_argp = arg, .rpc_resp = res, .rpc_cred = cred, - }; + }; struct rpc_cred *delegation_cred = NULL; unsigned long timestamp = jiffies; fmode_t fmode; @@ -2993,18 +2883,18 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, { struct nfs_server *server = NFS_SERVER(inode); struct nfs4_state *state = ctx ? ctx->state : NULL; - struct nfs_setattrargs arg = { - .fh = NFS_FH(inode), - .iap = sattr, + struct nfs_setattrargs arg = { + .fh = NFS_FH(inode), + .iap = sattr, .server = server, .bitmask = server->attr_bitmask, .label = ilabel, - }; - struct nfs_setattrres res = { + }; + struct nfs_setattrres res = { .fattr = fattr, .label = olabel, .server = server, - }; + }; struct nfs4_exception exception = { .state = state, .inode = inode, @@ -3118,7 +3008,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) } } - /* hmm. we are done with the inode, and in the process of freeing + /* hmm. 
we are done with the inode, and in the process of freeing * the state_owner. we keep this around to process errors */ switch (task->tk_status) { @@ -3234,7 +3124,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) else if (calldata->arg.bitmask == NULL) calldata->res.fattr = NULL; calldata->timestamp = jiffies; - if (nfs4_setup_sequence(NFS_SERVER(inode), + if (nfs4_setup_sequence(NFS_SERVER(inode)->nfs_client, &calldata->arg.seq_args, &calldata->res.seq_res, task) != 0) @@ -3522,16 +3412,11 @@ static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandl .pseudoflavor = flavor, }; struct rpc_auth *auth; - int ret; auth = rpcauth_create(&auth_args, server->client); - if (IS_ERR(auth)) { - ret = -EACCES; - goto out; - } - ret = nfs4_lookup_root(server, fhandle, info); -out: - return ret; + if (IS_ERR(auth)) + return -EACCES; + return nfs4_lookup_root(server, fhandle, info); } /* @@ -4114,7 +3999,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) { - nfs4_setup_sequence(NFS_SB(data->dentry->d_sb), + nfs4_setup_sequence(NFS_SB(data->dentry->d_sb)->nfs_client, &data->args.seq_args, &data->res.seq_res, task); @@ -4148,7 +4033,7 @@ static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir) static void nfs4_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data) { - nfs4_setup_sequence(NFS_SERVER(data->old_dir), + nfs4_setup_sequence(NFS_SERVER(data->old_dir)->nfs_client, &data->args.seq_args, &data->res.seq_res, task); @@ -4723,7 +4608,7 @@ static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr, static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_header *hdr) { - if (nfs4_setup_sequence(NFS_SERVER(hdr->inode), + if (nfs4_setup_sequence(NFS_SERVER(hdr->inode)->nfs_client, &hdr->args.seq_args, &hdr->res.seq_res, task)) @@ -4822,7 +4707,7 @@ static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr, static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) { - nfs4_setup_sequence(NFS_SERVER(data->inode), + nfs4_setup_sequence(NFS_SERVER(data->inode)->nfs_client, &data->args.seq_args, &data->res.seq_res, task); @@ -4975,8 +4860,8 @@ static int buf_to_pages_noslab(const void *buf, size_t buflen, if (newpage == NULL) goto unwind; memcpy(page_address(newpage), buf, len); - buf += len; - buflen -= len; + buf += len; + buflen -= len; *pages++ = newpage; rc++; } while (buflen != 0); @@ -5069,7 +4954,7 @@ out: */ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) { - struct page *pages[NFS4ACL_MAXPAGES] = {NULL, }; + struct page *pages[NFS4ACL_MAXPAGES + 1] = {NULL, }; struct nfs_getaclargs args = { .fh = NFS_FH(inode), .acl_pages = pages, @@ -5083,13 +4968,9 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu .rpc_argp = &args, .rpc_resp = &res, }; - unsigned int npages = DIV_ROUND_UP(buflen, PAGE_SIZE); + unsigned int npages = DIV_ROUND_UP(buflen, PAGE_SIZE) + 1; int ret = -ENOMEM, i; - /* As long as we're doing a round trip to the server anyway, - * let's be prepared for a page of acl data. 
*/ - if (npages == 0) - npages = 1; if (npages > ARRAY_SIZE(pages)) return -ERANGE; @@ -5299,8 +5180,8 @@ static int _nfs4_do_set_security_label(struct inode *inode, struct nfs_server *server = NFS_SERVER(inode); const u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL }; struct nfs_setattrargs arg = { - .fh = NFS_FH(inode), - .iap = &sattr, + .fh = NFS_FH(inode), + .iap = &sattr, .server = server, .bitmask = bitmask, .label = ilabel, @@ -5311,9 +5192,9 @@ static int _nfs4_do_set_security_label(struct inode *inode, .server = server, }; struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], - .rpc_argp = &arg, - .rpc_resp = &res, + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], + .rpc_argp = &arg, + .rpc_resp = &res, }; int status; @@ -5747,7 +5628,7 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) if (!d_data->lr.roc && nfs4_wait_on_layoutreturn(d_data->inode, task)) return; - nfs4_setup_sequence(d_data->res.server, + nfs4_setup_sequence(d_data->res.server->nfs_client, &d_data->args.seq_args, &d_data->res.seq_res, task); @@ -5817,7 +5698,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co return PTR_ERR(task); if (!issync) goto out; - status = nfs4_wait_for_completion_rpc_task(task); + status = rpc_wait_for_completion_task(task); if (status != 0) goto out; status = data->rpc_status; @@ -5859,8 +5740,8 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCKT], - .rpc_argp = &arg, - .rpc_resp = &res, + .rpc_argp = &arg, + .rpc_resp = &res, .rpc_cred = state->owner->so_cred, }; struct nfs4_lock_state *lsp; @@ -5989,7 +5870,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) goto out_no_action; } calldata->timestamp = jiffies; - if (nfs4_setup_sequence(calldata->server, + if (nfs4_setup_sequence(calldata->server->nfs_client, &calldata->arg.seq_args, &calldata->res.seq_res, task) != 0) @@ -6087,7 +5968,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * status = PTR_ERR(task); if (IS_ERR(task)) goto out; - status = nfs4_wait_for_completion_rpc_task(task); + status = rpc_wait_for_completion_task(task); rpc_put_task(task); out: request->fl_flags = fl_flags; @@ -6174,7 +6055,7 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) goto out_release_open_seqid; } data->timestamp = jiffies; - if (nfs4_setup_sequence(data->server, + if (nfs4_setup_sequence(data->server->nfs_client, &data->arg.seq_args, &data->res.seq_res, task) == 0) @@ -6314,7 +6195,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); - ret = nfs4_wait_for_completion_rpc_task(task); + ret = rpc_wait_for_completion_task(task); if (ret == 0) { ret = data->rpc_status; if (ret) @@ -6393,8 +6274,7 @@ static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *reques if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) || test_bit(NFS_LOCK_LOST, &lsp->ls_flags)) return 0; - status = nfs4_lock_expired(state, request); - return status; + return nfs4_lock_expired(state, request); } #endif @@ -6640,8 +6520,8 @@ static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata { struct nfs_release_lockowner_data *data = calldata; struct nfs_server *server = data->server; - nfs40_setup_sequence(server->nfs_client->cl_slot_tbl, - 
&data->args.seq_args, &data->res.seq_res, task); + nfs4_setup_sequence(server->nfs_client, &data->args.seq_args, + &data->res.seq_res, task); data->args.lock_owner.clientid = server->nfs_client->cl_clientid; data->timestamp = jiffies; } @@ -7232,11 +7112,9 @@ static bool nfs41_same_server_scope(struct nfs41_server_scope *a, struct nfs41_server_scope *b) { - if (a->server_scope_sz == b->server_scope_sz && - memcmp(a->server_scope, b->server_scope, a->server_scope_sz) == 0) - return true; - - return false; + if (a->server_scope_sz != b->server_scope_sz) + return false; + return memcmp(a->server_scope, b->server_scope, a->server_scope_sz) == 0; } static void @@ -7831,7 +7709,7 @@ static void nfs4_get_lease_time_prepare(struct rpc_task *task, dprintk("--> %s\n", __func__); /* just setup sequence, do not trigger session recovery since we're invoked within one */ - nfs41_setup_sequence(data->clp->cl_session, + nfs4_setup_sequence(data->clp, &data->args->la_seq_args, &data->res->lr_seq_res, task); @@ -8202,7 +8080,7 @@ static void nfs41_sequence_prepare(struct rpc_task *task, void *data) args = task->tk_msg.rpc_argp; res = task->tk_msg.rpc_resp; - nfs41_setup_sequence(clp->cl_session, args, res, task); + nfs4_setup_sequence(clp, args, res, task); } static const struct rpc_call_ops nfs41_sequence_ops = { @@ -8290,7 +8168,7 @@ static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data) { struct nfs4_reclaim_complete_data *calldata = data; - nfs41_setup_sequence(calldata->clp->cl_session, + nfs4_setup_sequence(calldata->clp, &calldata->arg.seq_args, &calldata->res.seq_res, task); @@ -8382,7 +8260,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp, status = PTR_ERR(task); goto out; } - status = nfs4_wait_for_completion_rpc_task(task); + status = rpc_wait_for_completion_task(task); if (status == 0) status = task->tk_status; rpc_put_task(task); @@ -8397,10 +8275,9 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) { struct nfs4_layoutget *lgp = calldata; struct nfs_server *server = NFS_SERVER(lgp->args.inode); - struct nfs4_session *session = nfs4_get_session(server); dprintk("--> %s\n", __func__); - nfs41_setup_sequence(session, &lgp->args.seq_args, + nfs4_setup_sequence(server->nfs_client, &lgp->args.seq_args, &lgp->res.seq_res, task); dprintk("<-- %s\n", __func__); } @@ -8615,7 +8492,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags) task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return ERR_CAST(task); - status = nfs4_wait_for_completion_rpc_task(task); + status = rpc_wait_for_completion_task(task); if (status == 0) { status = nfs4_layoutget_handle_exception(task, lgp, &exception); *timeout = exception.timeout; @@ -8644,7 +8521,7 @@ nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata) struct nfs4_layoutreturn *lrp = calldata; dprintk("--> %s\n", __func__); - nfs41_setup_sequence(lrp->clp->cl_session, + nfs4_setup_sequence(lrp->clp, &lrp->args.seq_args, &lrp->res.seq_res, task); @@ -8794,9 +8671,8 @@ static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *calldata) { struct nfs4_layoutcommit_data *data = calldata; struct nfs_server *server = NFS_SERVER(data->args.inode); - struct nfs4_session *session = nfs4_get_session(server); - nfs41_setup_sequence(session, + nfs4_setup_sequence(server->nfs_client, &data->args.seq_args, &data->res.seq_res, task); @@ -9120,7 +8996,7 @@ struct nfs_free_stateid_data { static void nfs41_free_stateid_prepare(struct rpc_task *task, void *calldata) { struct 
nfs_free_stateid_data *data = calldata; - nfs41_setup_sequence(nfs4_get_session(data->server), + nfs4_setup_sequence(data->server->nfs_client, &data->args.seq_args, &data->res.seq_res, task); @@ -9232,10 +9108,8 @@ static bool nfs41_match_stateid(const nfs4_stateid *s1, if (s1->seqid == s2->seqid) return true; - if (s1->seqid == 0 || s2->seqid == 0) - return true; - return false; + return s1->seqid == 0 || s2->seqid == 0; } #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index 82e7719..1f8c2ae 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -153,7 +153,7 @@ void nfs4_set_lease_period(struct nfs_client *clp, spin_unlock(&clp->cl_lock); /* Cap maximum reconnect timeout at 1/2 lease period */ - rpc_cap_max_reconnect_timeout(clp->cl_rpcclient, lease >> 1); + rpc_set_connect_timeout(clp->cl_rpcclient, lease, lease >> 1); } /* diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index dae3855..dfae488 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -103,6 +103,11 @@ static inline bool nfs4_test_locked_slot(const struct nfs4_slot_table *tbl, return !!test_bit(slotid, tbl->used_slots); } +static inline struct nfs4_session *nfs4_get_session(const struct nfs_client *clp) +{ + return clp->cl_session; +} + #if defined(CONFIG_NFS_V4_1) extern void nfs41_set_target_slotid(struct nfs4_slot_table *tbl, u32 target_highest_slotid); @@ -170,6 +175,8 @@ static inline int nfs4_has_persistent_session(const struct nfs_client *clp) return 0; } +#define nfs_session_id_hash(session) (0) + #endif /* defined(CONFIG_NFS_V4_1) */ #endif /* IS_ENABLED(CONFIG_NFS_V4) */ #endif /* __LINUX_FS_NFS_NFS4SESSION_H */ diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index daeb94e..8156bad 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -868,7 +868,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_ for(;;) { spin_lock(&state->state_lock); - lsp = __nfs4_find_lock_state(state, owner, 0); + lsp = __nfs4_find_lock_state(state, owner, NULL); if (lsp != NULL) break; if (new != NULL) { diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index cfb8f7c..845d0ea 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -241,38 +241,6 @@ DEFINE_NFS4_CLIENTID_EVENT(nfs4_bind_conn_to_session); DEFINE_NFS4_CLIENTID_EVENT(nfs4_sequence); DEFINE_NFS4_CLIENTID_EVENT(nfs4_reclaim_complete); -TRACE_EVENT(nfs4_setup_sequence, - TP_PROTO( - const struct nfs4_session *session, - const struct nfs4_sequence_args *args - ), - TP_ARGS(session, args), - - TP_STRUCT__entry( - __field(unsigned int, session) - __field(unsigned int, slot_nr) - __field(unsigned int, seq_nr) - __field(unsigned int, highest_used_slotid) - ), - - TP_fast_assign( - const struct nfs4_slot *sa_slot = args->sa_slot; - __entry->session = nfs_session_id_hash(&session->sess_id); - __entry->slot_nr = sa_slot->slot_nr; - __entry->seq_nr = sa_slot->seq_nr; - __entry->highest_used_slotid = - sa_slot->table->highest_used_slotid; - ), - TP_printk( - "session=0x%08x slot_nr=%u seq_nr=%u " - "highest_used_slotid=%u", - __entry->session, - __entry->slot_nr, - __entry->seq_nr, - __entry->highest_used_slotid - ) -); - #define show_nfs4_sequence_status_flags(status) \ __print_flags((unsigned long)status, "|", \ { SEQ4_STATUS_CB_PATH_DOWN, "CB_PATH_DOWN" }, \ @@ -382,6 +350,38 @@ TRACE_EVENT(nfs4_cb_sequence, ); #endif /* CONFIG_NFS_V4_1 */ +TRACE_EVENT(nfs4_setup_sequence, + TP_PROTO( + const struct nfs4_session *session, + const struct nfs4_sequence_args *args + ), + 
TP_ARGS(session, args), + + TP_STRUCT__entry( + __field(unsigned int, session) + __field(unsigned int, slot_nr) + __field(unsigned int, seq_nr) + __field(unsigned int, highest_used_slotid) + ), + + TP_fast_assign( + const struct nfs4_slot *sa_slot = args->sa_slot; + __entry->session = session ? nfs_session_id_hash(&session->sess_id) : 0; + __entry->slot_nr = sa_slot->slot_nr; + __entry->seq_nr = sa_slot->seq_nr; + __entry->highest_used_slotid = + sa_slot->table->highest_used_slotid; + ), + TP_printk( + "session=0x%08x slot_nr=%u seq_nr=%u " + "highest_used_slotid=%u", + __entry->session, + __entry->slot_nr, + __entry->seq_nr, + __entry->highest_used_slotid + ) +); + DECLARE_EVENT_CLASS(nfs4_open_event, TP_PROTO( const struct nfs_open_context *ctx, diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index e9255cb..f0369e3 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -169,8 +169,10 @@ static int nfs4_stat_to_errno(int); open_owner_id_maxsz + \ encode_opentype_maxsz + \ encode_claim_null_maxsz) +#define decode_space_limit_maxsz (3) #define decode_ace_maxsz (3 + nfs4_owner_maxsz) #define decode_delegation_maxsz (1 + decode_stateid_maxsz + 1 + \ + decode_space_limit_maxsz + \ decode_ace_maxsz) #define decode_change_info_maxsz (5) #define decode_open_maxsz (op_decode_hdr_maxsz + \ @@ -924,34 +926,22 @@ static __be32 *reserve_space(struct xdr_stream *xdr, size_t nbytes) static void encode_opaque_fixed(struct xdr_stream *xdr, const void *buf, size_t len) { - __be32 *p; - - p = xdr_reserve_space(xdr, len); - xdr_encode_opaque_fixed(p, buf, len); + WARN_ON_ONCE(xdr_stream_encode_opaque_fixed(xdr, buf, len) < 0); } static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) { - __be32 *p; - - p = reserve_space(xdr, 4 + len); - xdr_encode_opaque(p, str, len); + WARN_ON_ONCE(xdr_stream_encode_opaque(xdr, str, len) < 0); } static void encode_uint32(struct xdr_stream *xdr, u32 n) { - __be32 *p; - - p = reserve_space(xdr, 4); - *p = cpu_to_be32(n); + WARN_ON_ONCE(xdr_stream_encode_u32(xdr, n) < 0); } static void encode_uint64(struct xdr_stream *xdr, u64 n) { - __be32 *p; - - p = reserve_space(xdr, 8); - xdr_encode_hyper(p, n); + WARN_ON_ONCE(xdr_stream_encode_u64(xdr, n) < 0); } static void encode_nfs4_seqid(struct xdr_stream *xdr, @@ -2524,7 +2514,7 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); - replen = hdr.replen + op_decode_hdr_maxsz + 1; + replen = hdr.replen + op_decode_hdr_maxsz; encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr); xdr_inline_pages(&req->rq_rcv_buf, replen << 2, @@ -3062,20 +3052,15 @@ static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string) { - __be32 *p; - - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - goto out_overflow; - *len = be32_to_cpup(p); - p = xdr_inline_decode(xdr, *len); - if (unlikely(!p)) - goto out_overflow; - *string = (char *)p; + ssize_t ret = xdr_stream_decode_opaque_inline(xdr, (void **)string, + NFS4_OPAQUE_LIMIT); + if (unlikely(ret < 0)) { + if (ret == -EBADMSG) + print_overflow_msg(__func__, xdr); + return -EIO; + } + *len = ret; return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) @@ -3142,7 +3127,7 @@ static int decode_op_hdr(struct xdr_stream *xdr, 
enum nfs_opnum4 expected) } /* Dummy routine */ -static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp) +static int decode_ace(struct xdr_stream *xdr, void *ace) { __be32 *p; unsigned int strlen; @@ -3890,45 +3875,50 @@ out_overflow: return -EIO; } +static ssize_t decode_nfs4_string(struct xdr_stream *xdr, + struct nfs4_string *name, gfp_t gfp_flags) +{ + ssize_t ret; + + ret = xdr_stream_decode_string_dup(xdr, &name->data, + XDR_MAX_NETOBJ, gfp_flags); + name->len = 0; + if (ret > 0) + name->len = ret; + return ret; +} + static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, const struct nfs_server *server, kuid_t *uid, struct nfs4_string *owner_name) { - uint32_t len; - __be32 *p; - int ret = 0; + ssize_t len; + char *p; *uid = make_kuid(&init_user_ns, -2); if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U))) return -EIO; - if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) { - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - goto out_overflow; - len = be32_to_cpup(p); - p = xdr_inline_decode(xdr, len); - if (unlikely(!p)) - goto out_overflow; - if (owner_name != NULL) { - owner_name->data = kmemdup(p, len, GFP_NOWAIT); - if (owner_name->data != NULL) { - owner_name->len = len; - ret = NFS_ATTR_FATTR_OWNER_NAME; - } - } else if (len < XDR_MAX_NETOBJ) { - if (nfs_map_name_to_uid(server, (char *)p, len, uid) == 0) - ret = NFS_ATTR_FATTR_OWNER; - else - dprintk("%s: nfs_map_name_to_uid failed!\n", - __func__); - } else - dprintk("%s: name too long (%u)!\n", - __func__, len); - bitmap[1] &= ~FATTR4_WORD1_OWNER; + if (!(bitmap[1] & FATTR4_WORD1_OWNER)) + return 0; + bitmap[1] &= ~FATTR4_WORD1_OWNER; + + if (owner_name != NULL) { + len = decode_nfs4_string(xdr, owner_name, GFP_NOWAIT); + if (len <= 0) + goto out; + dprintk("%s: name=%s\n", __func__, owner_name->data); + return NFS_ATTR_FATTR_OWNER_NAME; + } else { + len = xdr_stream_decode_opaque_inline(xdr, (void **)&p, + XDR_MAX_NETOBJ); + if (len <= 0 || nfs_map_name_to_uid(server, p, len, uid) != 0) + goto out; + dprintk("%s: uid=%d\n", __func__, (int)from_kuid(&init_user_ns, *uid)); + return NFS_ATTR_FATTR_OWNER; } - dprintk("%s: uid=%d\n", __func__, (int)from_kuid(&init_user_ns, *uid)); - return ret; -out_overflow: +out: + if (len != -EBADMSG) + return 0; print_overflow_msg(__func__, xdr); return -EIO; } @@ -3937,41 +3927,33 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, const struct nfs_server *server, kgid_t *gid, struct nfs4_string *group_name) { - uint32_t len; - __be32 *p; - int ret = 0; + ssize_t len; + char *p; *gid = make_kgid(&init_user_ns, -2); if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U))) return -EIO; - if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) { - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - goto out_overflow; - len = be32_to_cpup(p); - p = xdr_inline_decode(xdr, len); - if (unlikely(!p)) - goto out_overflow; - if (group_name != NULL) { - group_name->data = kmemdup(p, len, GFP_NOWAIT); - if (group_name->data != NULL) { - group_name->len = len; - ret = NFS_ATTR_FATTR_GROUP_NAME; - } - } else if (len < XDR_MAX_NETOBJ) { - if (nfs_map_group_to_gid(server, (char *)p, len, gid) == 0) - ret = NFS_ATTR_FATTR_GROUP; - else - dprintk("%s: nfs_map_group_to_gid failed!\n", - __func__); - } else - dprintk("%s: name too long (%u)!\n", - __func__, len); - bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP; + if (!(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) + return 0; + bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP; + + if (group_name != NULL) { + len = 
decode_nfs4_string(xdr, group_name, GFP_NOWAIT); + if (len <= 0) + goto out; + dprintk("%s: name=%s\n", __func__, group_name->data); + return NFS_ATTR_FATTR_OWNER_NAME; + } else { + len = xdr_stream_decode_opaque_inline(xdr, (void **)&p, + XDR_MAX_NETOBJ); + if (len <= 0 || nfs_map_group_to_gid(server, p, len, gid) != 0) + goto out; + dprintk("%s: gid=%d\n", __func__, (int)from_kgid(&init_user_ns, *gid)); + return NFS_ATTR_FATTR_GROUP; } - dprintk("%s: gid=%d\n", __func__, (int)from_kgid(&init_user_ns, *gid)); - return ret; -out_overflow: +out: + if (len != -EBADMSG) + return 0; print_overflow_msg(__func__, xdr); return -EIO; } @@ -4294,15 +4276,12 @@ out_overflow: static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len) { - __be32 *p; - - p = xdr_inline_decode(xdr, len); - if (likely(p)) { - memcpy(buf, p, len); - return 0; + ssize_t ret = xdr_stream_decode_opaque_fixed(xdr, buf, len); + if (unlikely(ret < 0)) { + print_overflow_msg(__func__, xdr); + return -EIO; } - print_overflow_msg(__func__, xdr); - return -EIO; + return 0; } static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) @@ -5093,7 +5072,7 @@ static int decode_rw_delegation(struct xdr_stream *xdr, if (decode_space_limit(xdr, &res->pagemod_limit) < 0) return -EIO; } - return decode_ace(xdr, NULL, res->server->nfs_client); + return decode_ace(xdr, NULL); out_overflow: print_overflow_msg(__func__, xdr); return -EIO; @@ -5660,8 +5639,6 @@ static int decode_exchange_id(struct xdr_stream *xdr, status = decode_opaque_inline(xdr, &dummy, &dummy_str); if (unlikely(status)) return status; - if (unlikely(dummy > NFS4_OPAQUE_LIMIT)) - return -EIO; memcpy(res->server_owner->major_id, dummy_str, dummy); res->server_owner->major_id_sz = dummy; @@ -5669,8 +5646,6 @@ static int decode_exchange_id(struct xdr_stream *xdr, status = decode_opaque_inline(xdr, &dummy, &dummy_str); if (unlikely(status)) return status; - if (unlikely(dummy > NFS4_OPAQUE_LIMIT)) - return -EIO; memcpy(res->server_scope->server_scope, dummy_str, dummy); res->server_scope->server_scope_sz = dummy; @@ -5685,16 +5660,12 @@ static int decode_exchange_id(struct xdr_stream *xdr, status = decode_opaque_inline(xdr, &dummy, &dummy_str); if (unlikely(status)) return status; - if (unlikely(dummy > NFS4_OPAQUE_LIMIT)) - return -EIO; memcpy(res->impl_id->domain, dummy_str, dummy); /* nii_name */ status = decode_opaque_inline(xdr, &dummy, &dummy_str); if (unlikely(status)) return status; - if (unlikely(dummy > NFS4_OPAQUE_LIMIT)) - return -EIO; memcpy(res->impl_id->name, dummy_str, dummy); /* nii_date */ diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 6bca178..54e0f9f 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -531,39 +531,32 @@ static void nfs_show_mountd_netid(struct seq_file *m, struct nfs_server *nfss, int showdefaults) { struct sockaddr *sap = (struct sockaddr *) &nfss->mountd_address; + char *proto = NULL; - seq_printf(m, ",mountproto="); switch (sap->sa_family) { case AF_INET: switch (nfss->mountd_protocol) { case IPPROTO_UDP: - seq_printf(m, RPCBIND_NETID_UDP); + proto = RPCBIND_NETID_UDP; break; case IPPROTO_TCP: - seq_printf(m, RPCBIND_NETID_TCP); + proto = RPCBIND_NETID_TCP; break; - default: - if (showdefaults) - seq_printf(m, "auto"); } break; case AF_INET6: switch (nfss->mountd_protocol) { case IPPROTO_UDP: - seq_printf(m, RPCBIND_NETID_UDP6); + proto = RPCBIND_NETID_UDP6; break; case IPPROTO_TCP: - seq_printf(m, RPCBIND_NETID_TCP6); + proto = RPCBIND_NETID_TCP6; break; - default: - if (showdefaults) - seq_printf(m, 
"auto"); } break; - default: - if (showdefaults) - seq_printf(m, "auto"); } + if (proto || showdefaults) + seq_printf(m, ",mountproto=%s", proto ?: "auto"); } static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss, diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 0060685..e75b056 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1785,7 +1785,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) if (status < 0) { nfs_context_set_write_error(req->wb_context, status); nfs_inode_remove_request(req); - dprintk(", error = %d\n", status); + dprintk_cont(", error = %d\n", status); goto next; } @@ -1794,11 +1794,11 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) if (!nfs_write_verifier_cmp(&req->wb_verf, &data->verf.verifier)) { /* We have a match */ nfs_inode_remove_request(req); - dprintk(" OK\n"); + dprintk_cont(" OK\n"); goto next; } /* We have a mismatch. Write the page again */ - dprintk(" mismatch\n"); + dprintk_cont(" mismatch\n"); nfs_mark_request_dirty(req); set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags); next: diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 43e109c..e71f11b 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1102,6 +1102,7 @@ static struct flags { { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, { NFSEXP_V4ROOT, {"v4root", ""}}, { NFSEXP_PNFS, {"pnfs", ""}}, + { NFSEXP_SECURITY_LABEL, {"security_label", ""}}, { 0, {"", ""}} }; diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index d08cd88..838f90f 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -376,5 +376,4 @@ struct svc_version nfsd_acl_version2 = { .vs_proc = nfsd_acl_procedures2, .vs_dispatch = nfsd_dispatch, .vs_xdrsize = NFS3_SVC_XDRSIZE, - .vs_hidden = 0, }; diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 0c89034..dcb5f79 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -266,6 +266,5 @@ struct svc_version nfsd_acl_version3 = { .vs_proc = nfsd_acl_procedures3, .vs_dispatch = nfsd_dispatch, .vs_xdrsize = NFS3_SVC_XDRSIZE, - .vs_hidden = 0, }; diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index d818e4f..045c908 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -193,11 +193,9 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp, fh_copy(&resp->fh, &argp->fh); resp->committed = argp->stable; - nfserr = nfsd_write(rqstp, &resp->fh, NULL, - argp->offset, - rqstp->rq_vec, argp->vlen, - &cnt, - &resp->committed); + nfserr = nfsd_write(rqstp, &resp->fh, argp->offset, + rqstp->rq_vec, argp->vlen, + &cnt, resp->committed); resp->count = cnt; RETURN_STATUS(nfserr); } diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index eb78109..0274db6 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -303,6 +303,7 @@ static int decode_cb_compound4res(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, length + 4); if (unlikely(p == NULL)) goto out_overflow; + p += XDR_QUADLEN(length); hdr->nops = be32_to_cpup(p); return 0; out_overflow: @@ -396,13 +397,10 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr, struct nfsd4_callback *cb) { struct nfsd4_session *session = cb->cb_clp->cl_cb_session; - struct nfs4_sessionid id; - int status; + int status = -ESERVERFAULT; __be32 *p; u32 dummy; - status = -ESERVERFAULT; - /* * If the server returns different values for sessionID, slotID or * sequence number, the server is looney tunes. 
@@ -410,9 +408,8 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4 + 4 + 4); if (unlikely(p == NULL)) goto out_overflow; - memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN); - if (memcmp(id.data, session->se_sessionid.data, - NFS4_MAX_SESSIONID_LEN) != 0) { + + if (memcmp(p, session->se_sessionid.data, NFS4_MAX_SESSIONID_LEN)) { dprintk("NFS: %s Invalid session id\n", __func__); goto out; } @@ -753,6 +750,14 @@ int set_callback_cred(void) return 0; } +void cleanup_callback_cred(void) +{ + if (callback_cred) { + put_rpccred(callback_cred); + callback_cred = NULL; + } +} + static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc_clnt *client, struct nfsd4_session *ses) { if (clp->cl_minorversion == 0) { diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 5b20577..6b9b6cc 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -628,6 +628,10 @@ nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen, { __be32 status; u32 id = -1; + + if (name == NULL || namelen == 0) + return nfserr_inval; + status = do_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, &id); *uid = make_kuid(&init_user_ns, id); if (!uid_valid(*uid)) @@ -641,6 +645,10 @@ nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen, { __be32 status; u32 id = -1; + + if (name == NULL || namelen == 0) + return nfserr_inval; + status = do_name_to_id(rqstp, IDMAP_TYPE_GROUP, name, namelen, &id); *gid = make_kgid(&init_user_ns, id); if (!gid_valid(*gid)) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 74a6e57..cbeeda1 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -95,11 +95,15 @@ check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, u32 *bmval, u32 *writable) { struct dentry *dentry = cstate->current_fh.fh_dentry; + struct svc_export *exp = cstate->current_fh.fh_export; if (!nfsd_attrs_supported(cstate->minorversion, bmval)) return nfserr_attrnotsupp; if ((bmval[0] & FATTR4_WORD0_ACL) && !IS_POSIXACL(d_inode(dentry))) return nfserr_attrnotsupp; + if ((bmval[2] & FATTR4_WORD2_SECURITY_LABEL) && + !(exp->ex_flags & NFSEXP_SECURITY_LABEL)) + return nfserr_attrnotsupp; if (writable && !bmval_is_subset(bmval, writable)) return nfserr_inval; if (writable && (bmval[2] & FATTR4_WORD2_MODE_UMASK) && @@ -983,7 +987,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfsd_vfs_write(rqstp, &cstate->current_fh, filp, write->wr_offset, rqstp->rq_vec, nvecs, &cnt, - &write->wr_how_written); + write->wr_how_written); fput(filp); write->wr_bytes_written = cnt; @@ -1838,6 +1842,12 @@ static inline u32 nfsd4_status_stateid_rsize(struct svc_rqst *rqstp, struct nfsd return (op_encode_hdr_size + op_encode_stateid_maxsz)* sizeof(__be32); } +static inline u32 nfsd4_access_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + /* ac_supported, ac_resp_access */ + return (op_encode_hdr_size + 2)* sizeof(__be32); +} + static inline u32 nfsd4_commit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32); @@ -1892,6 +1902,11 @@ static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp, return ret; } +static inline u32 nfsd4_getfh_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + 1) * sizeof(__be32) + NFS4_FHSIZE; +} + static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { return (op_encode_hdr_size + 
op_encode_change_info_maxsz) @@ -1933,6 +1948,11 @@ static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *o XDR_QUADLEN(rlen)) * sizeof(__be32); } +static inline u32 nfsd4_readlink_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + 1) * sizeof(__be32) + PAGE_SIZE; +} + static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_change_info_maxsz) @@ -1952,11 +1972,23 @@ static inline u32 nfsd4_sequence_rsize(struct svc_rqst *rqstp, + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) * sizeof(__be32); } +static inline u32 nfsd4_test_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + 1 + op->u.test_stateid.ts_num_ids) + * sizeof(__be32); +} + static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32); } +static inline u32 nfsd4_secinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + RPC_AUTH_MAXFLAVOR * + (4 + XDR_QUADLEN(GSS_OID_MAX_LEN))) * sizeof(__be32); +} + static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { return (op_encode_hdr_size + 2 + XDR_QUADLEN(NFS4_VERIFIER_SIZE)) * @@ -2011,6 +2043,19 @@ static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) } #ifdef CONFIG_NFSD_PNFS +static inline u32 nfsd4_getdeviceinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + u32 maxcount = 0, rlen = 0; + + maxcount = svc_max_payload(rqstp); + rlen = min(op->u.getdeviceinfo.gd_maxcount, maxcount); + + return (op_encode_hdr_size + + 1 /* gd_layout_type*/ + + XDR_QUADLEN(rlen) + + 2 /* gd_notify_types */) * sizeof(__be32); +} + /* * At this stage we don't really know what layout driver will handle the request, * so we need to define an arbitrary upper bound here. 
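Every .op_rsize_bop estimator added here follows the same recipe: count the 32-bit XDR words the operation can put in its reply, add the per-op header, scale by sizeof(__be32), and add any raw byte payload such as a filehandle. A worked sketch of two of the fixed-size cases above; the sketch_* names are hypothetical, and it assumes op_encode_hdr_size stands for the per-op opcode and status words, which is how the surrounding estimators use it:

	/* Editorial sketch mirroring nfsd4_access_rsize() and nfsd4_getfh_rsize()
	 * above; not part of the patch. */
	static inline u32 sketch_access_rsize(void)
	{
		/* opcode + status, then ac_supported and ac_resp_access */
		return (op_encode_hdr_size + 2) * sizeof(__be32);
	}

	static inline u32 sketch_getfh_rsize(void)
	{
		/* opcode + status + filehandle length word, plus up to
		 * NFS4_FHSIZE bytes of filehandle data */
		return (op_encode_hdr_size + 1) * sizeof(__be32) + NFS4_FHSIZE;
	}

nfsd4_decode_compound() accumulates these per-op estimates into max_reply, so once every entry in nfsd4_ops[] carries an estimator the old PAGE_SIZE fallback can be dropped, which is why nfsd4_max_reply() below is rewritten to BUG_ON a missing .op_rsize_bop.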
@@ -2040,10 +2085,17 @@ static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_ } #endif /* CONFIG_NFSD_PNFS */ + +static inline u32 nfsd4_seek_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + 3) * sizeof(__be32); +} + static struct nfsd4_operation nfsd4_ops[] = { [OP_ACCESS] = { .op_func = (nfsd4op_func)nfsd4_access, .op_name = "OP_ACCESS", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_access_rsize, }, [OP_CLOSE] = { .op_func = (nfsd4op_func)nfsd4_close, @@ -2081,6 +2133,7 @@ static struct nfsd4_operation nfsd4_ops[] = { [OP_GETFH] = { .op_func = (nfsd4op_func)nfsd4_getfh, .op_name = "OP_GETFH", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_getfh_rsize, }, [OP_LINK] = { .op_func = (nfsd4op_func)nfsd4_link, @@ -2099,6 +2152,7 @@ static struct nfsd4_operation nfsd4_ops[] = { [OP_LOCKT] = { .op_func = (nfsd4op_func)nfsd4_lockt, .op_name = "OP_LOCKT", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_lock_rsize, }, [OP_LOCKU] = { .op_func = (nfsd4op_func)nfsd4_locku, @@ -2111,15 +2165,18 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_func = (nfsd4op_func)nfsd4_lookup, .op_flags = OP_HANDLES_WRONGSEC | OP_CLEAR_STATEID, .op_name = "OP_LOOKUP", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_LOOKUPP] = { .op_func = (nfsd4op_func)nfsd4_lookupp, .op_flags = OP_HANDLES_WRONGSEC | OP_CLEAR_STATEID, .op_name = "OP_LOOKUPP", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_NVERIFY] = { .op_func = (nfsd4op_func)nfsd4_nverify, .op_name = "OP_NVERIFY", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_OPEN] = { .op_func = (nfsd4op_func)nfsd4_open, @@ -2177,6 +2234,7 @@ static struct nfsd4_operation nfsd4_ops[] = { [OP_READLINK] = { .op_func = (nfsd4op_func)nfsd4_readlink, .op_name = "OP_READLINK", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_readlink_rsize, }, [OP_REMOVE] = { .op_func = (nfsd4op_func)nfsd4_remove, @@ -2215,6 +2273,7 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_func = (nfsd4op_func)nfsd4_secinfo, .op_flags = OP_HANDLES_WRONGSEC, .op_name = "OP_SECINFO", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_secinfo_rsize, }, [OP_SETATTR] = { .op_func = (nfsd4op_func)nfsd4_setattr, @@ -2240,6 +2299,7 @@ static struct nfsd4_operation nfsd4_ops[] = { [OP_VERIFY] = { .op_func = (nfsd4op_func)nfsd4_verify, .op_name = "OP_VERIFY", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_WRITE] = { .op_func = (nfsd4op_func)nfsd4_write, @@ -2314,11 +2374,13 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_func = (nfsd4op_func)nfsd4_secinfo_no_name, .op_flags = OP_HANDLES_WRONGSEC, .op_name = "OP_SECINFO_NO_NAME", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_secinfo_rsize, }, [OP_TEST_STATEID] = { .op_func = (nfsd4op_func)nfsd4_test_stateid, .op_flags = ALLOWED_WITHOUT_FH, .op_name = "OP_TEST_STATEID", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_test_stateid_rsize, }, [OP_FREE_STATEID] = { .op_func = (nfsd4op_func)nfsd4_free_stateid, @@ -2332,6 +2394,7 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_func = (nfsd4op_func)nfsd4_getdeviceinfo, .op_flags = ALLOWED_WITHOUT_FH, .op_name = "OP_GETDEVICEINFO", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_getdeviceinfo_rsize, }, [OP_LAYOUTGET] = { .op_func = (nfsd4op_func)nfsd4_layoutget, @@ -2381,6 +2444,7 @@ static struct nfsd4_operation nfsd4_ops[] = { [OP_SEEK] = { .op_func = (nfsd4op_func)nfsd4_seek, .op_name = "OP_SEEK", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_seek_rsize, }, }; @@ -2425,14 +2489,11 @@ bool nfsd4_spo_must_allow(struct svc_rqst *rqstp) int 
nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op) { - struct nfsd4_operation *opdesc; - nfsd4op_rsize estimator; - if (op->opnum == OP_ILLEGAL) return op_encode_hdr_size * sizeof(__be32); - opdesc = OPDESC(op); - estimator = opdesc->op_rsize_bop; - return estimator ? estimator(rqstp, op) : PAGE_SIZE; + + BUG_ON(OPDESC(op)->op_rsize_bop == NULL); + return OPDESC(op)->op_rsize_bop(rqstp, op); } void warn_on_nonidempotent_op(struct nfsd4_op *op) @@ -2476,12 +2537,13 @@ static struct svc_procedure nfsd_procedures4[2] = { }; struct svc_version nfsd_version4 = { - .vs_vers = 4, - .vs_nproc = 2, - .vs_proc = nfsd_procedures4, - .vs_dispatch = nfsd_dispatch, - .vs_xdrsize = NFS4_SVC_XDRSIZE, - .vs_rpcb_optnl = 1, + .vs_vers = 4, + .vs_nproc = 2, + .vs_proc = nfsd_procedures4, + .vs_dispatch = nfsd_dispatch, + .vs_xdrsize = NFS4_SVC_XDRSIZE, + .vs_rpcb_optnl = true, + .vs_need_cong_ctrl = true, }; /* diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a0dee8a..e9ef50a 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2281,7 +2281,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_r out_err: conn->cb_addr.ss_family = AF_UNSPEC; conn->cb_addrlen = 0; - dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) " + dprintk("NFSD: this client (clientid %08x/%08x) " "will not receive delegations\n", clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); @@ -7012,23 +7012,24 @@ nfs4_state_start(void) ret = set_callback_cred(); if (ret) - return -ENOMEM; + return ret; + laundry_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "nfsd4"); if (laundry_wq == NULL) { ret = -ENOMEM; - goto out_recovery; + goto out_cleanup_cred; } ret = nfsd4_create_callback_queue(); if (ret) goto out_free_laundry; set_max_delegations(); - return 0; out_free_laundry: destroy_workqueue(laundry_wq); -out_recovery: +out_cleanup_cred: + cleanup_callback_cred(); return ret; } @@ -7086,6 +7087,7 @@ nfs4_state_shutdown(void) { destroy_workqueue(laundry_wq); nfsd4_destroy_callback_queue(); + cleanup_callback_cred(); } static void diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 8fae53c..33017d6 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -58,7 +58,7 @@ #define NFSDDBG_FACILITY NFSDDBG_XDR -u32 nfsd_suppattrs[3][3] = { +const u32 nfsd_suppattrs[3][3] = { {NFSD4_SUPPORTED_ATTRS_WORD0, NFSD4_SUPPORTED_ATTRS_WORD1, NFSD4_SUPPORTED_ATTRS_WORD2}, @@ -1250,7 +1250,7 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) READ_BUF(16); p = xdr_decode_hyper(p, &write->wr_offset); write->wr_stable_how = be32_to_cpup(p++); - if (write->wr_stable_how > 2) + if (write->wr_stable_how > NFS_FILE_SYNC) goto xdr_error; write->wr_buflen = be32_to_cpup(p++); @@ -1941,12 +1941,12 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) } else max_reply += nfsd4_max_reply(argp->rqstp, op); /* - * OP_LOCK may return a conflicting lock. (Special case - * because it will just skip encoding this if it runs - * out of xdr buffer space, and it is the only operation - * that behaves this way.) + * OP_LOCK and OP_LOCKT may return a conflicting lock. + * (Special case because it will just skip encoding this + * if it runs out of xdr buffer space, and it is the only + * operation that behaves this way.) 
*/ - if (op->opnum == OP_LOCK) + if (op->opnum == OP_LOCK || op->opnum == OP_LOCKT) max_reply += NFS4_OPAQUE_LIMIT; if (op->status) { @@ -1966,9 +1966,13 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) DECODE_TAIL; } -static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode) +static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode, + struct svc_export *exp) { - if (IS_I_VERSION(inode)) { + if (exp->ex_flags & NFSEXP_V4ROOT) { + *p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time)); + *p++ = 0; + } else if (IS_I_VERSION(inode)) { p = xdr_encode_hyper(p, inode->i_version); } else { *p++ = cpu_to_be32(stat->ctime.tv_sec); @@ -2297,7 +2301,7 @@ static int get_parent_attributes(struct svc_export *exp, struct kstat *stat) if (path.dentry != path.mnt->mnt_root) break; } - err = vfs_getattr(&path, stat); + err = vfs_getattr(&path, stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT); path_put(&path); return err; } @@ -2381,7 +2385,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, goto out; } - err = vfs_getattr(&path, &stat); + err = vfs_getattr(&path, &stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT); if (err) goto out_nfserr; if ((bmval0 & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE | @@ -2417,8 +2421,11 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, #ifdef CONFIG_NFSD_V4_SECURITY_LABEL if ((bmval2 & FATTR4_WORD2_SECURITY_LABEL) || bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { - err = security_inode_getsecctx(d_inode(dentry), + if (exp->ex_flags & NFSEXP_SECURITY_LABEL) + err = security_inode_getsecctx(d_inode(dentry), &context, &contextlen); + else + err = -EOPNOTSUPP; contextsupport = (err == 0); if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { if (err == -EOPNOTSUPP) @@ -2490,7 +2497,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, p = xdr_reserve_space(xdr, 8); if (!p) goto out_resource; - p = encode_change(p, &stat, d_inode(dentry)); + p = encode_change(p, &stat, d_inode(dentry), exp); } if (bmval0 & FATTR4_WORD0_SIZE) { p = xdr_reserve_space(xdr, 8); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index f3b2f34..73e75ac 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -536,6 +536,19 @@ out_free: return rv; } +static ssize_t +nfsd_print_version_support(char *buf, int remaining, const char *sep, + unsigned vers, unsigned minor) +{ + const char *format = (minor == 0) ? "%s%c%u" : "%s%c%u.%u"; + bool supported = !!nfsd_vers(vers, NFSD_TEST); + + if (vers == 4 && !nfsd_minorversion(minor, NFSD_TEST)) + supported = false; + return snprintf(buf, remaining, format, sep, + supported ? '+' : '-', vers, minor); +} + static ssize_t __write_versions(struct file *file, char *buf, size_t size) { char *mesg = buf; @@ -561,6 +574,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) len = qword_get(&mesg, vers, size); if (len <= 0) return -EINVAL; do { + enum vers_op cmd; sign = *vers; if (sign == '+' || sign == '-') num = simple_strtol((vers+1), &minorp, 0); @@ -569,24 +583,22 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) if (*minorp == '.') { if (num != 4) return -EINVAL; - minor = simple_strtoul(minorp+1, NULL, 0); - if (minor == 0) - return -EINVAL; - if (nfsd_minorversion(minor, sign == '-' ? - NFSD_CLEAR : NFSD_SET) < 0) + if (kstrtouint(minorp+1, 0, &minor) < 0) return -EINVAL; - goto next; - } + } else + minor = 0; + cmd = sign == '-' ? 
NFSD_CLEAR : NFSD_SET; switch(num) { case 2: case 3: - case 4: - nfsd_vers(num, sign == '-' ? NFSD_CLEAR : NFSD_SET); + nfsd_vers(num, cmd); break; + case 4: + if (nfsd_minorversion(minor, cmd) >= 0) + break; default: return -EINVAL; } - next: vers += len + 1; } while ((len = qword_get(&mesg, vers, size)) > 0); /* If all get turned off, turn them back on, as @@ -599,35 +611,23 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) len = 0; sep = ""; remaining = SIMPLE_TRANSACTION_LIMIT; - for (num=2 ; num <= 4 ; num++) - if (nfsd_vers(num, NFSD_AVAIL)) { - len = snprintf(buf, remaining, "%s%c%d", sep, - nfsd_vers(num, NFSD_TEST)?'+':'-', - num); - sep = " "; - - if (len >= remaining) - break; - remaining -= len; - buf += len; - tlen += len; - } - if (nfsd_vers(4, NFSD_AVAIL)) - for (minor = 1; minor <= NFSD_SUPPORTED_MINOR_VERSION; - minor++) { - len = snprintf(buf, remaining, " %c4.%u", - (nfsd_vers(4, NFSD_TEST) && - nfsd_minorversion(minor, NFSD_TEST)) ? - '+' : '-', - minor); - + for (num=2 ; num <= 4 ; num++) { + if (!nfsd_vers(num, NFSD_AVAIL)) + continue; + minor = 0; + do { + len = nfsd_print_version_support(buf, remaining, + sep, num, minor); if (len >= remaining) - break; + goto out; remaining -= len; buf += len; tlen += len; - } - + minor++; + sep = " "; + } while (num == 4 && minor <= NFSD_SUPPORTED_MINOR_VERSION); + } +out: len = snprintf(buf, remaining, "\n"); if (len >= remaining) return -EINVAL; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index d74c8c4..d966068 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -362,16 +362,16 @@ void nfsd_lockd_shutdown(void); FATTR4_WORD2_MODE_UMASK | \ NFSD4_2_SECURITY_ATTRS) -extern u32 nfsd_suppattrs[3][3]; +extern const u32 nfsd_suppattrs[3][3]; -static inline bool bmval_is_subset(u32 *bm1, u32 *bm2) +static inline bool bmval_is_subset(const u32 *bm1, const u32 *bm2) { return !((bm1[0] & ~bm2[0]) || (bm1[1] & ~bm2[1]) || (bm1[2] & ~bm2[2])); } -static inline bool nfsd_attrs_supported(u32 minorversion, u32 *bmval) +static inline bool nfsd_attrs_supported(u32 minorversion, const u32 *bmval) { return bmval_is_subset(bmval, nfsd_suppattrs[minorversion]); } diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 010aff5..fa82b77 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -204,18 +204,14 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp, struct nfsd_attrstat *resp) { __be32 nfserr; - int stable = 1; unsigned long cnt = argp->len; dprintk("nfsd: WRITE %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), argp->len, argp->offset); - nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL, - argp->offset, - rqstp->rq_vec, argp->vlen, - &cnt, - &stable); + nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), argp->offset, + rqstp->rq_vec, argp->vlen, &cnt, NFS_DATA_SYNC); return nfsd_return_attrs(nfserr, resp); } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index e6bfd96..786a4a2 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -6,7 +6,7 @@ * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> */ -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/freezer.h> #include <linux/module.h> #include <linux/fs_struct.h> @@ -153,6 +153,18 @@ int nfsd_vers(int vers, enum vers_op change) return 0; } +static void +nfsd_adjust_nfsd_versions4(void) +{ + unsigned i; + + for (i = 0; i <= NFSD_SUPPORTED_MINOR_VERSION; i++) { + if (nfsd_supported_minorversions[i]) + return; + } + nfsd_vers(4, NFSD_CLEAR); +} + int nfsd_minorversion(u32 minorversion, enum 
vers_op change) { if (minorversion > NFSD_SUPPORTED_MINOR_VERSION) @@ -160,9 +172,11 @@ int nfsd_minorversion(u32 minorversion, enum vers_op change) switch(change) { case NFSD_SET: nfsd_supported_minorversions[minorversion] = true; + nfsd_vers(4, NFSD_SET); break; case NFSD_CLEAR: nfsd_supported_minorversions[minorversion] = false; + nfsd_adjust_nfsd_versions4(); break; case NFSD_TEST: return nfsd_supported_minorversions[minorversion]; @@ -354,6 +368,8 @@ static int nfsd_inet6addr_event(struct notifier_block *this, dprintk("nfsd_inet6addr_event: removed %pI6\n", &ifa->addr); sin6.sin6_family = AF_INET6; sin6.sin6_addr = ifa->addr; + if (ipv6_addr_type(&sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin6.sin6_scope_id = ifa->idev->dev->ifindex; svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6); } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 4516e8b..005c911 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -615,6 +615,7 @@ extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir, extern __be32 nfs4_check_open_reclaim(clientid_t *clid, struct nfsd4_compound_state *cstate, struct nfsd_net *nn); extern int set_callback_cred(void); +extern void cleanup_callback_cred(void); extern void nfsd4_probe_callback(struct nfs4_client *clp); extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 26c6fdb..19d50f6 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -377,7 +377,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, __be32 err; int host_err; bool get_write_count; - int size_change = 0; + bool size_change = (iap->ia_valid & ATTR_SIZE); if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE)) accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE; @@ -390,11 +390,11 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, /* Get inode */ err = fh_verify(rqstp, fhp, ftype, accmode); if (err) - goto out; + return err; if (get_write_count) { host_err = fh_want_write(fhp); if (host_err) - return nfserrno(host_err); + goto out; } dentry = fhp->fh_dentry; @@ -405,20 +405,28 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, iap->ia_valid &= ~ATTR_MODE; if (!iap->ia_valid) - goto out; + return 0; nfsd_sanitize_attrs(inode, iap); + if (check_guard && guardtime != inode->i_ctime.tv_sec) + return nfserr_notsync; + /* * The size case is special, it changes the file in addition to the - * attributes. + * attributes, and file systems don't expect it to be mixed with + * "random" attribute changes. We thus split out the size change + * into a separate call to ->setattr, and do the rest as a separate + * setattr call. 
*/ - if (iap->ia_valid & ATTR_SIZE) { + if (size_change) { err = nfsd_get_write_access(rqstp, fhp, iap); if (err) - goto out; - size_change = 1; + return err; + } + fh_lock(fhp); + if (size_change) { /* * RFC5661, Section 18.30.4: * Changing the size of a file with SETATTR indirectly @@ -426,29 +434,36 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, * * (and similar for the older RFCs) */ - if (iap->ia_size != i_size_read(inode)) - iap->ia_valid |= ATTR_MTIME; - } + struct iattr size_attr = { + .ia_valid = ATTR_SIZE | ATTR_CTIME | ATTR_MTIME, + .ia_size = iap->ia_size, + }; - iap->ia_valid |= ATTR_CTIME; + host_err = notify_change(dentry, &size_attr, NULL); + if (host_err) + goto out_unlock; + iap->ia_valid &= ~ATTR_SIZE; - if (check_guard && guardtime != inode->i_ctime.tv_sec) { - err = nfserr_notsync; - goto out_put_write_access; + /* + * Avoid the additional setattr call below if the only other + * attribute that the client sends is the mtime, as we update + * it as part of the size change above. + */ + if ((iap->ia_valid & ~ATTR_MTIME) == 0) + goto out_unlock; } - fh_lock(fhp); + iap->ia_valid |= ATTR_CTIME; host_err = notify_change(dentry, iap, NULL); - fh_unlock(fhp); - err = nfserrno(host_err); -out_put_write_access: +out_unlock: + fh_unlock(fhp); if (size_change) put_write_access(inode); - if (!err) - err = nfserrno(commit_metadata(fhp)); out: - return err; + if (!host_err) + host_err = commit_metadata(fhp); + return nfserrno(host_err); } #if defined(CONFIG_NFSD_V4) @@ -940,14 +955,12 @@ static int wait_for_concurrent_writes(struct file *file) __be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, struct kvec *vec, int vlen, - unsigned long *cnt, int *stablep) + unsigned long *cnt, int stable) { struct svc_export *exp; - struct inode *inode; mm_segment_t oldfs; __be32 err = 0; int host_err; - int stable = *stablep; int use_wgather; loff_t pos = offset; unsigned int pflags = current->flags; @@ -962,13 +975,11 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, */ current->flags |= PF_LESS_THROTTLE; - inode = file_inode(file); - exp = fhp->fh_export; - + exp = fhp->fh_export; use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp); if (!EX_ISSYNC(exp)) - stable = 0; + stable = NFS_UNSTABLE; if (stable && !use_wgather) flags |= RWF_SYNC; @@ -1035,35 +1046,22 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, * N.B. 
After this call fhp needs an fh_put */ __be32 -nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, - loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt, - int *stablep) +nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, + struct kvec *vec, int vlen, unsigned long *cnt, int stable) { - __be32 err = 0; + struct file *file = NULL; + __be32 err = 0; trace_write_start(rqstp, fhp, offset, vlen); - if (file) { - err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, - NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE); - if (err) - goto out; - trace_write_opened(rqstp, fhp, offset, vlen); - err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, - stablep); - trace_write_io_done(rqstp, fhp, offset, vlen); - } else { - err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); - if (err) - goto out; + err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); + if (err) + goto out; - trace_write_opened(rqstp, fhp, offset, vlen); - if (cnt) - err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, - cnt, stablep); - trace_write_io_done(rqstp, fhp, offset, vlen); - fput(file); - } + trace_write_opened(rqstp, fhp, offset, vlen); + err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stable); + trace_write_io_done(rqstp, fhp, offset, vlen); + fput(file); out: trace_write_done(rqstp, fhp, offset, vlen); return err; diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 0bf9e7b..1bbdcce 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -83,12 +83,12 @@ __be32 nfsd_readv(struct file *, loff_t, struct kvec *, int, unsigned long *); __be32 nfsd_read(struct svc_rqst *, struct svc_fh *, loff_t, struct kvec *, int, unsigned long *); -__be32 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *, - loff_t, struct kvec *,int, unsigned long *, int *); +__be32 nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t, + struct kvec *, int, unsigned long *, int); __be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt, - int *stablep); + int stable); __be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *, char *, int *); __be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *, @@ -135,7 +135,8 @@ static inline __be32 fh_getattr(struct svc_fh *fh, struct kstat *stat) { struct path p = {.mnt = fh->fh_export->ex_path.mnt, .dentry = fh->fh_dentry}; - return nfserrno(vfs_getattr(&p, stat)); + return nfserrno(vfs_getattr(&p, stat, STATX_BASIC_STATS, + AT_STATX_SYNC_AS_STAT)); } static inline int nfsd_create_is_exclusive(int createmode) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 7d18d62..febed12 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -30,6 +30,8 @@ #include <linux/crc32.h> #include <linux/pagevec.h> #include <linux/slab.h> +#include <linux/sched/signal.h> + #include "nilfs.h" #include "btnode.h" #include "page.h" diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index a4c4622..e5f7e47 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -6,6 +6,7 @@ #include <linux/kernel.h> /* UINT_MAX */ #include <linux/mount.h> #include <linux/sched.h> +#include <linux/sched/user.h> #include <linux/types.h> #include <linux/wait.h> diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 7ebfca6..2b37f27 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -14,6 +14,7 @@ #include <linux/types.h> #include 
<linux/uaccess.h> #include <linux/compat.h> +#include <linux/sched/signal.h> #include <asm/ioctls.h> diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index f36c293..1aeb837 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -30,6 +30,7 @@ #include <linux/slab.h> /* kmem_* */ #include <linux/types.h> #include <linux/sched.h> +#include <linux/sched/user.h> #include "inotify.h" diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 1cf41c6..498d609 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -30,7 +30,7 @@ #include <linux/inotify.h> #include <linux/kernel.h> /* roundup() */ #include <linux/namei.h> /* LOOKUP_FOLLOW */ -#include <linux/sched.h> /* struct user */ +#include <linux/sched/signal.h> #include <linux/slab.h> /* struct kmem_cache */ #include <linux/syscalls.h> #include <linux/types.h> diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 358ed7e..c4f68c3 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -24,7 +24,7 @@ #include <linux/gfp.h> #include <linux/pagemap.h> #include <linux/pagevec.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/swap.h> #include <linux/uio.h> #include <linux/writeback.h> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index d4ec0d8..fb15a96 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -30,6 +30,7 @@ #include <linux/swap.h> #include <linux/quotaops.h> #include <linux/blkdev.h> +#include <linux/sched/signal.h> #include <cluster/masklog.h> diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index ec00057..4348027 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -54,6 +54,7 @@ */ #include <linux/kernel.h> +#include <linux/sched/mm.h> #include <linux/jiffies.h> #include <linux/slab.h> #include <linux/idr.h> diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 32fd261..a2b19fb 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -33,6 +33,7 @@ #include <linux/delay.h> #include <linux/err.h> #include <linux/debugfs.h> +#include <linux/sched/signal.h> #include "cluster/heartbeat.h" #include "cluster/nodemanager.h" diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c index f70cda2..9cecf48 100644 --- a/fs/ocfs2/dlmfs/userdlm.c +++ b/fs/ocfs2/dlmfs/userdlm.c @@ -28,6 +28,7 @@ */ #include <linux/signal.h> +#include <linux/sched/signal.h> #include <linux/module.h> #include <linux/fs.h> diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 8dce409..3b7c937 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -33,6 +33,7 @@ #include <linux/seq_file.h> #include <linux/time.h> #include <linux/quotaops.h> +#include <linux/sched/signal.h> #define MLOG_MASK_PREFIX ML_DLM_GLUE #include <cluster/masklog.h> diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 8836305..bfeb647 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1306,16 +1306,15 @@ bail: return status; } -int ocfs2_getattr(struct vfsmount *mnt, - struct dentry *dentry, - struct kstat *stat) +int ocfs2_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { - struct inode *inode = d_inode(dentry); - struct super_block *sb = dentry->d_sb; + struct inode *inode = d_inode(path->dentry); + struct super_block *sb = path->dentry->d_sb; struct ocfs2_super *osb = sb->s_fs_info; int err; - err = ocfs2_inode_revalidate(dentry); + err = ocfs2_inode_revalidate(path->dentry); if (err) { if (err != 
-ENOENT) mlog_errno(err); diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 897fd9a..1fdc983 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -68,8 +68,8 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, u32 clusters_to_add, int mark_unwritten); int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); -int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat); +int ocfs2_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags); int ocfs2_permission(struct inode *inode, int mask); int ocfs2_should_update_atime(struct inode *inode, diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index a24e42f..ca1646f 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -42,6 +42,7 @@ #include <linux/seq_file.h> #include <linux/quotaops.h> #include <linux/cleancache.h> +#include <linux/signal.h> #define CREATE_TRACE_POINTS #include "ocfs2_trace.h" diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index df7ea85..8c9034e 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -8,6 +8,7 @@ #include <linux/slab.h> #include <linux/fs.h> #include <linux/vfs.h> +#include <linux/cred.h> #include <linux/parser.h> #include <linux/buffer_head.h> #include <linux/vmalloc.h> @@ -301,12 +301,10 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (S_ISFIFO(inode->i_mode)) return -ESPIPE; - /* - * Let individual file system decide if it supports preallocation - * for directories or not. - */ - if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) && - !S_ISBLK(inode->i_mode)) + if (S_ISDIR(inode->i_mode)) + return -EISDIR; + + if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) return -ENODEV; /* Check for wrap through zero too */ @@ -316,7 +314,7 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (!file->f_op->fallocate) return -EOPNOTSUPP; - sb_start_write(inode->i_sb); + file_start_write(file); ret = file->f_op->fallocate(file, mode, offset, len); /* @@ -329,7 +327,7 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (ret == 0) fsnotify_modify(file); - sb_end_write(inode->i_sb); + file_end_write(file); return ret; } EXPORT_SYMBOL_GPL(vfs_fallocate); diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 5cd6179..a304bf3 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -245,25 +245,24 @@ out: /* * Obtain attributes of an object given a dentry */ -int orangefs_getattr(struct vfsmount *mnt, - struct dentry *dentry, - struct kstat *kstat) +int orangefs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { int ret = -ENOENT; - struct inode *inode = dentry->d_inode; + struct inode *inode = path->dentry->d_inode; struct orangefs_inode_s *orangefs_inode = NULL; gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_getattr: called on %pd\n", - dentry); + path->dentry); ret = orangefs_inode_getattr(inode, 0, 0); if (ret == 0) { - generic_fillattr(inode, kstat); + generic_fillattr(inode, stat); /* override block size reported to stat */ orangefs_inode = ORANGEFS_I(inode); - kstat->blksize = orangefs_inode->blksize; + stat->blksize = orangefs_inode->blksize; } return ret; } diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 70355a9..5e48a0b 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -41,7 +41,7 @@ #include <linux/uaccess.h> #include <linux/atomic.h> #include 
<linux/uio.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/mm.h> #include <linux/wait.h> #include <linux/dcache.h> @@ -439,9 +439,8 @@ struct inode *orangefs_new_inode(struct super_block *sb, int orangefs_setattr(struct dentry *dentry, struct iattr *iattr); -int orangefs_getattr(struct vfsmount *mnt, - struct dentry *dentry, - struct kstat *kstat); +int orangefs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags); int orangefs_permission(struct inode *inode, int mask); diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index c48859f..67c2435 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -115,6 +115,13 @@ static struct inode *orangefs_alloc_inode(struct super_block *sb) return &orangefs_inode->vfs_inode; } +static void orangefs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); + kmem_cache_free(orangefs_inode_cache, orangefs_inode); +} + static void orangefs_destroy_inode(struct inode *inode) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); @@ -123,7 +130,7 @@ static void orangefs_destroy_inode(struct inode *inode) "%s: deallocated %p destroying inode %pU\n", __func__, orangefs_inode, get_khandle_from_ino(inode)); - kmem_cache_free(orangefs_inode_cache, orangefs_inode); + call_rcu(&inode->i_rcu, orangefs_i_callback); } /* diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index f57043d..906ea6c 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -15,11 +15,13 @@ #include <linux/xattr.h> #include <linux/security.h> #include <linux/uaccess.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> +#include <linux/cred.h> #include <linux/namei.h> #include <linux/fdtable.h> #include <linux/ratelimit.h> #include "overlayfs.h" +#include "ovl_entry.h" #define OVL_COPY_UP_CHUNK_SIZE (1 << 20) @@ -232,12 +234,14 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, struct dentry *dentry, struct path *lowerpath, - struct kstat *stat, const char *link) + struct kstat *stat, const char *link, + struct kstat *pstat, bool tmpfile) { struct inode *wdir = workdir->d_inode; struct inode *udir = upperdir->d_inode; struct dentry *newdentry = NULL; struct dentry *upper = NULL; + struct dentry *temp = NULL; int err; const struct cred *old_creds = NULL; struct cred *new_creds = NULL; @@ -248,25 +252,30 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, .link = link }; - newdentry = ovl_lookup_temp(workdir, dentry); - err = PTR_ERR(newdentry); - if (IS_ERR(newdentry)) - goto out; - upper = lookup_one_len(dentry->d_name.name, upperdir, dentry->d_name.len); err = PTR_ERR(upper); if (IS_ERR(upper)) - goto out1; + goto out; err = security_inode_copy_up(dentry, &new_creds); if (err < 0) - goto out2; + goto out1; if (new_creds) old_creds = override_creds(new_creds); - err = ovl_create_real(wdir, newdentry, &cattr, NULL, true); + if (tmpfile) + temp = ovl_do_tmpfile(upperdir, stat->mode); + else + temp = ovl_lookup_temp(workdir, dentry); + err = PTR_ERR(temp); + if (IS_ERR(temp)) + goto out1; + + err = 0; + if (!tmpfile) + err = ovl_create_real(wdir, temp, &cattr, NULL, true); if (new_creds) { revert_creds(old_creds); @@ -281,39 +290,55 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, ovl_path_upper(dentry, &upperpath); 
BUG_ON(upperpath.dentry != NULL); - upperpath.dentry = newdentry; + upperpath.dentry = temp; + + if (tmpfile) { + inode_unlock(udir); + err = ovl_copy_up_data(lowerpath, &upperpath, + stat->size); + inode_lock_nested(udir, I_MUTEX_PARENT); + } else { + err = ovl_copy_up_data(lowerpath, &upperpath, + stat->size); + } - err = ovl_copy_up_data(lowerpath, &upperpath, stat->size); if (err) goto out_cleanup; } - err = ovl_copy_xattr(lowerpath->dentry, newdentry); + err = ovl_copy_xattr(lowerpath->dentry, temp); if (err) goto out_cleanup; - inode_lock(newdentry->d_inode); - err = ovl_set_attr(newdentry, stat); - inode_unlock(newdentry->d_inode); + inode_lock(temp->d_inode); + err = ovl_set_attr(temp, stat); + inode_unlock(temp->d_inode); if (err) goto out_cleanup; - err = ovl_do_rename(wdir, newdentry, udir, upper, 0); + if (tmpfile) + err = ovl_do_link(temp, udir, upper, true); + else + err = ovl_do_rename(wdir, temp, udir, upper, 0); if (err) goto out_cleanup; + newdentry = dget(tmpfile ? upper : temp); ovl_dentry_update(dentry, newdentry); ovl_inode_update(d_inode(dentry), d_inode(newdentry)); - newdentry = NULL; + + /* Restore timestamps on parent (best effort) */ + ovl_set_timestamps(upperdir, pstat); out2: - dput(upper); + dput(temp); out1: - dput(newdentry); + dput(upper); out: return err; out_cleanup: - ovl_cleanup(wdir, newdentry); + if (!tmpfile) + ovl_cleanup(wdir, temp); goto out2; } @@ -337,6 +362,7 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, struct dentry *lowerdentry = lowerpath->dentry; struct dentry *upperdir; const char *link = NULL; + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; if (WARN_ON(!workdir)) return -EROFS; @@ -346,7 +372,8 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, ovl_path_upper(parent, &parentpath); upperdir = parentpath.dentry; - err = vfs_getattr(&parentpath, &pstat); + err = vfs_getattr(&parentpath, &pstat, + STATX_ATIME | STATX_MTIME, AT_STATX_SYNC_AS_STAT); if (err) return err; @@ -356,6 +383,25 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, return PTR_ERR(link); } + /* Should we copyup with O_TMPFILE or with workdir? */ + if (S_ISREG(stat->mode) && ofs->tmpfile) { + err = ovl_copy_up_start(dentry); + /* err < 0: interrupted, err > 0: raced with another copy-up */ + if (unlikely(err)) { + pr_debug("ovl_copy_up_start(%pd2) = %i\n", dentry, err); + if (err > 0) + err = 0; + goto out_done; + } + + inode_lock_nested(upperdir->d_inode, I_MUTEX_PARENT); + err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath, + stat, link, &pstat, true); + inode_unlock(upperdir->d_inode); + ovl_copy_up_end(dentry); + goto out_done; + } + err = -EIO; if (lock_rename(workdir, upperdir) != NULL) { pr_err("overlayfs: failed to lock workdir+upperdir\n"); @@ -368,13 +414,10 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, } err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath, - stat, link); - if (!err) { - /* Restore timestamps on parent (best effort) */ - ovl_set_timestamps(upperdir, &pstat); - } + stat, link, &pstat, false); out_unlock: unlock_rename(workdir, upperdir); +out_done: do_delayed_call(&done); return err; @@ -409,7 +452,8 @@ int ovl_copy_up_flags(struct dentry *dentry, int flags) } ovl_path_lower(next, &lowerpath); - err = vfs_getattr(&lowerpath, &stat); + err = vfs_getattr(&lowerpath, &stat, + STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT); /* maybe truncate regular file. 
this has no effect on dirs */ if (flags & O_TRUNC) stat.size = 0; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 16e06dd..6515796 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -138,9 +138,10 @@ static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry) return err; } -static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +static int ovl_dir_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { + struct dentry *dentry = path->dentry; int err; enum ovl_path_type type; struct path realpath; @@ -148,7 +149,7 @@ static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry, type = ovl_path_real(dentry, &realpath); old_cred = ovl_override_creds(dentry->d_sb); - err = vfs_getattr(&realpath, stat); + err = vfs_getattr(&realpath, stat, request_mask, flags); revert_creds(old_cred); if (err) return err; @@ -264,7 +265,8 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, goto out; ovl_path_upper(dentry, &upperpath); - err = vfs_getattr(&upperpath, &stat); + err = vfs_getattr(&upperpath, &stat, + STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT); if (err) goto out_unlock; diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 08643ac..f8fe6bf 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -9,6 +9,7 @@ #include <linux/fs.h> #include <linux/slab.h> +#include <linux/cred.h> #include <linux/xattr.h> #include <linux/posix_acl.h> #include "overlayfs.h" @@ -56,16 +57,17 @@ out: return err; } -static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +static int ovl_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { + struct dentry *dentry = path->dentry; struct path realpath; const struct cred *old_cred; int err; ovl_path_real(dentry, &realpath); old_cred = ovl_override_creds(dentry->d_sb); - err = vfs_getattr(&realpath, stat); + err = vfs_getattr(&realpath, stat, request_mask, flags); revert_creds(old_cred); return err; } diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 023bb0b..b8b0778 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -8,6 +8,7 @@ */ #include <linux/fs.h> +#include <linux/cred.h> #include <linux/namei.h> #include <linux/xattr.h> #include <linux/ratelimit.h> diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 8af450b..741dc0b 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -127,6 +127,15 @@ static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry) return err; } +static inline struct dentry *ovl_do_tmpfile(struct dentry *dentry, umode_t mode) +{ + struct dentry *ret = vfs_tmpfile(dentry, mode, 0); + int err = IS_ERR(ret) ? 
PTR_ERR(ret) : 0; + + pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err); + return ret; +} + static inline struct inode *ovl_inode_real(struct inode *inode, bool *is_upper) { unsigned long x = (unsigned long) READ_ONCE(inode->i_private); @@ -169,6 +178,8 @@ void ovl_dentry_version_inc(struct dentry *dentry); u64 ovl_dentry_version_get(struct dentry *dentry); bool ovl_is_whiteout(struct dentry *dentry); struct file *ovl_path_open(struct path *path, int flags); +int ovl_copy_up_start(struct dentry *dentry); +void ovl_copy_up_end(struct dentry *dentry); /* namei.c */ int ovl_path_next(int idx, struct dentry *dentry, struct path *path); diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index d14bca1..59614fa 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -27,6 +27,8 @@ struct ovl_fs { struct ovl_config config; /* creds of process who forced instantiation of super block */ const struct cred *creator_cred; + bool tmpfile; + wait_queue_head_t copyup_wq; }; /* private information held for every overlayfs dentry */ @@ -38,6 +40,7 @@ struct ovl_entry { u64 version; const char *redirect; bool opaque; + bool copying; }; struct rcu_head rcu; }; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 20f48ab..c9e70d3 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -7,6 +7,7 @@ * the Free Software Foundation. */ +#include <uapi/linux/magic.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/xattr.h> @@ -160,6 +161,25 @@ static void ovl_put_super(struct super_block *sb) kfree(ufs); } +static int ovl_sync_fs(struct super_block *sb, int wait) +{ + struct ovl_fs *ufs = sb->s_fs_info; + struct super_block *upper_sb; + int ret; + + if (!ufs->upper_mnt) + return 0; + upper_sb = ufs->upper_mnt->mnt_sb; + if (!upper_sb->s_op->sync_fs) + return 0; + + /* real inodes have already been synced by sync_filesystem(ovl_sb) */ + down_read(&upper_sb->s_umount); + ret = upper_sb->s_op->sync_fs(upper_sb, wait); + up_read(&upper_sb->s_umount); + return ret; +} + /** * ovl_statfs * @sb: The overlayfs super block @@ -222,6 +242,7 @@ static int ovl_remount(struct super_block *sb, int *flags, char *data) static const struct super_operations ovl_super_operations = { .put_super = ovl_put_super, + .sync_fs = ovl_sync_fs, .statfs = ovl_statfs, .show_options = ovl_show_options, .remount_fs = ovl_remount, @@ -701,6 +722,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) unsigned int stacklen = 0; unsigned int i; bool remote = false; + struct cred *cred; int err; err = -ENOMEM; @@ -708,6 +730,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (!ufs) goto out; + init_waitqueue_head(&ufs->copyup_wq); ufs->config.redirect_dir = ovl_redirect_dir_def; err = ovl_parse_opt((char *) data, &ufs->config); if (err) @@ -825,6 +848,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) * creation of workdir in previous step. 
*/ if (ufs->workdir) { + struct dentry *temp; + err = ovl_check_d_type_supported(&workpath); if (err < 0) goto out_put_workdir; @@ -836,6 +861,14 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) */ if (!err) pr_warn("overlayfs: upper fs needs to support d_type.\n"); + + /* Check if upper/work fs supports O_TMPFILE */ + temp = ovl_do_tmpfile(ufs->workdir, S_IFREG | 0); + ufs->tmpfile = !IS_ERR(temp); + if (ufs->tmpfile) + dput(temp); + else + pr_warn("overlayfs: upper fs does not support tmpfile.\n"); } } @@ -870,10 +903,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) else sb->s_d_op = &ovl_dentry_operations; - ufs->creator_cred = prepare_creds(); - if (!ufs->creator_cred) + ufs->creator_cred = cred = prepare_creds(); + if (!cred) goto out_put_lower_mnt; + /* Never override disk quota limits or use reserved space */ + cap_lower(cred->cap_effective, CAP_SYS_RESOURCE); + err = -ENOMEM; oe = ovl_alloc_entry(numlower); if (!oe) diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 952286f..1953986 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -10,7 +10,9 @@ #include <linux/fs.h> #include <linux/mount.h> #include <linux/slab.h> +#include <linux/cred.h> #include <linux/xattr.h> +#include <linux/sched/signal.h> #include "overlayfs.h" #include "ovl_entry.h" @@ -263,3 +265,33 @@ struct file *ovl_path_open(struct path *path, int flags) { return dentry_open(path, flags | O_NOATIME, current_cred()); } + +int ovl_copy_up_start(struct dentry *dentry) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_entry *oe = dentry->d_fsdata; + int err; + + spin_lock(&ofs->copyup_wq.lock); + err = wait_event_interruptible_locked(ofs->copyup_wq, !oe->copying); + if (!err) { + if (oe->__upperdentry) + err = 1; /* Already copied up */ + else + oe->copying = true; + } + spin_unlock(&ofs->copyup_wq.lock); + + return err; +} + +void ovl_copy_up_end(struct dentry *dentry) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_entry *oe = dentry->d_fsdata; + + spin_lock(&ofs->copyup_wq.lock); + oe->copying = false; + wake_up_locked(&ofs->copyup_wq); + spin_unlock(&ofs->copyup_wq.lock); +} diff --git a/fs/posix_acl.c b/fs/posix_acl.c index c9d48dc..eebf5f6 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -15,6 +15,7 @@ #include <linux/atomic.h> #include <linux/fs.h> #include <linux/sched.h> +#include <linux/cred.h> #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> #include <linux/xattr.h> diff --git a/fs/proc/array.c b/fs/proc/array.c index fe12b51..88c3555 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -60,6 +60,10 @@ #include <linux/tty.h> #include <linux/string.h> #include <linux/mman.h> +#include <linux/sched/mm.h> +#include <linux/sched/numa_balancing.h> +#include <linux/sched/task.h> +#include <linux/sched/cputime.h> #include <linux/proc_fs.h> #include <linux/ioport.h> #include <linux/uaccess.h> diff --git a/fs/proc/base.c b/fs/proc/base.c index 1e1e182..c87b6b9 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -85,6 +85,11 @@ #include <linux/user_namespace.h> #include <linux/fs_struct.h> #include <linux/slab.h> +#include <linux/sched/autogroup.h> +#include <linux/sched/mm.h> +#include <linux/sched/coredump.h> +#include <linux/sched/debug.h> +#include <linux/sched/stat.h> #include <linux/flex_array.h> #include <linux/posix-timers.h> #ifdef CONFIG_HARDWALL @@ -1724,11 +1729,12 @@ out_unlock: return NULL; } -int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +int 
pid_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) { - struct inode *inode = d_inode(dentry); + struct inode *inode = d_inode(path->dentry); struct task_struct *task; - struct pid_namespace *pid = dentry->d_sb->s_fs_info; + struct pid_namespace *pid = path->dentry->d_sb->s_fs_info; generic_fillattr(inode, stat); @@ -3511,9 +3517,10 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) return 0; } -static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +static int proc_task_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) { - struct inode *inode = d_inode(dentry); + struct inode *inode = d_inode(path->dentry); struct task_struct *p = get_proc_task(inode); generic_fillattr(inode, stat); diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 00ce153..c330495 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -1,4 +1,4 @@ -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/errno.h> #include <linux/dcache.h> #include <linux/path.h> diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 06c7390..ee27feb 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -118,10 +118,10 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) return 0; } -static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +static int proc_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) { - struct inode *inode = d_inode(dentry); + struct inode *inode = d_inode(path->dentry); struct proc_dir_entry *de = PDE(inode); if (de && de->nlink) set_nlink(inode, de->nlink); diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 5d6960f..c5ae09b 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -14,6 +14,8 @@ #include <linux/spinlock.h> #include <linux/atomic.h> #include <linux/binfmts.h> +#include <linux/sched/coredump.h> +#include <linux/sched/task.h> struct ctl_table_header; struct mempolicy; @@ -149,7 +151,7 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *, * base.c */ extern const struct dentry_operations pid_dentry_operations; -extern int pid_getattr(struct vfsmount *, struct dentry *, struct kstat *); +extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int); extern int proc_setattr(struct dentry *, struct iattr *); extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t); extern int pid_revalidate(struct dentry *, unsigned int); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index ea9f3d1..4ee5527 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -28,6 +28,7 @@ #include <linux/list.h> #include <linux/ioport.h> #include <linux/memory.h> +#include <linux/sched/task.h> #include <asm/sections.h> #include "internal.h" diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index aec66e6..983fce5 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -3,6 +3,8 @@ #include <linux/pid_namespace.h> #include <linux/proc_fs.h> #include <linux/sched.h> +#include <linux/sched/loadavg.h> +#include <linux/sched/stat.h> #include <linux/seq_file.h> #include <linux/seqlock.h> #include <linux/time.h> diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index ffd72a6..d72fc40 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -17,6 +17,7 @@ #include <linux/slab.h> #include <linux/init.h> #include <linux/sched.h> +#include <linux/sched/task.h> #include 
<linux/module.h> #include <linux/bitops.h> #include <linux/mount.h> @@ -140,10 +141,10 @@ static struct dentry *proc_tgid_net_lookup(struct inode *dir, return de; } -static int proc_tgid_net_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +static int proc_tgid_net_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) { - struct inode *inode = d_inode(dentry); + struct inode *inode = d_inode(path->dentry); struct net *net; net = get_proc_task_net(inode); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 3e64c65..8f91ec6 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -8,6 +8,7 @@ #include <linux/printk.h> #include <linux/security.h> #include <linux/sched.h> +#include <linux/cred.h> #include <linux/namei.h> #include <linux/mm.h> #include <linux/module.h> @@ -801,9 +802,10 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr) return 0; } -static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +static int proc_sys_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) { - struct inode *inode = d_inode(dentry); + struct inode *inode = d_inode(path->dentry); struct ctl_table_header *head = grab_header(inode); struct ctl_table *table = PROC_I(inode)->sysctl_entry; diff --git a/fs/proc/root.c b/fs/proc/root.c index b90da88..deecb39 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -14,12 +14,14 @@ #include <linux/stat.h> #include <linux/init.h> #include <linux/sched.h> +#include <linux/sched/stat.h> #include <linux/module.h> #include <linux/bitops.h> #include <linux/user_namespace.h> #include <linux/mount.h> #include <linux/pid_namespace.h> #include <linux/parser.h> +#include <linux/cred.h> #include "internal.h" @@ -149,10 +151,10 @@ void __init proc_root_init(void) proc_sys_init(); } -static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat -) +static int proc_root_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) { - generic_fillattr(d_inode(dentry), stat); + generic_fillattr(d_inode(path->dentry), stat); stat->nlink = proc_root.nlink + nr_processes(); return 0; } diff --git a/fs/proc/stat.c b/fs/proc/stat.c index e47c3e8..bd4e55f 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -5,11 +5,12 @@ #include <linux/kernel_stat.h> #include <linux/proc_fs.h> #include <linux/sched.h> +#include <linux/sched/stat.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/time.h> #include <linux/irqnr.h> -#include <linux/cputime.h> +#include <linux/sched/cputime.h> #include <linux/tick.h> #ifndef arch_irq_stat_cpu diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ee3efb2..f08bd31 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -11,6 +11,7 @@ #include <linux/mempolicy.h> #include <linux/rmap.h> #include <linux/swap.h> +#include <linux/sched/mm.h> #include <linux/swapops.h> #include <linux/mmu_notifier.h> #include <linux/page_idle.h> diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 1ef97cf..23266694 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -7,6 +7,8 @@ #include <linux/ptrace.h> #include <linux/slab.h> #include <linux/seq_file.h> +#include <linux/sched/mm.h> + #include "internal.h" /* diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 3f1190d..b5713fe 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -10,6 +10,8 @@ #include 
<linux/nsproxy.h> #include <linux/security.h> #include <linux/fs_struct.h> +#include <linux/sched/task.h> + #include "proc/internal.h" /* only for get_proc_task() in ->open() */ #include "pnode.h" diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 406fed9..74b489e 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -72,6 +72,7 @@ #include <linux/proc_fs.h> #include <linux/security.h> #include <linux/sched.h> +#include <linux/cred.h> #include <linux/kmod.h> #include <linux/namei.h> #include <linux/capability.h> diff --git a/fs/read_write.c b/fs/read_write.c index 5816d4c..c4f88af 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -4,8 +4,9 @@ * Copyright (C) 1991, 1992 Linus Torvalds */ -#include <linux/slab.h> +#include <linux/slab.h> #include <linux/stat.h> +#include <linux/sched/xacct.h> #include <linux/fcntl.h> #include <linux/file.h> #include <linux/uio.h> @@ -23,9 +24,6 @@ #include <linux/uaccess.h> #include <asm/unistd.h> -typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *); -typedef ssize_t (*iter_fn_t)(struct kiocb *, struct iov_iter *); - const struct file_operations generic_ro_fops = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, @@ -370,7 +368,7 @@ ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos) kiocb.ki_pos = *ppos; iter->type |= READ; - ret = file->f_op->read_iter(&kiocb, iter); + ret = call_read_iter(file, &kiocb, iter); BUG_ON(ret == -EIOCBQUEUED); if (ret > 0) *ppos = kiocb.ki_pos; @@ -390,7 +388,7 @@ ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos) kiocb.ki_pos = *ppos; iter->type |= WRITE; - ret = file->f_op->write_iter(&kiocb, iter); + ret = call_write_iter(file, &kiocb, iter); BUG_ON(ret == -EIOCBQUEUED); if (ret > 0) *ppos = kiocb.ki_pos; @@ -439,7 +437,7 @@ static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, lo kiocb.ki_pos = *ppos; iov_iter_init(&iter, READ, &iov, 1, len); - ret = filp->f_op->read_iter(&kiocb, &iter); + ret = call_read_iter(filp, &kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); *ppos = kiocb.ki_pos; return ret; @@ -496,7 +494,7 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t kiocb.ki_pos = *ppos; iov_iter_init(&iter, WRITE, &iov, 1, len); - ret = filp->f_op->write_iter(&kiocb, &iter); + ret = call_write_iter(filp, &kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); if (ret > 0) *ppos = kiocb.ki_pos; @@ -675,7 +673,7 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to) EXPORT_SYMBOL(iov_shorten); static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, - loff_t *ppos, iter_fn_t fn, int flags) + loff_t *ppos, int type, int flags) { struct kiocb kiocb; ssize_t ret; @@ -692,7 +690,10 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, kiocb.ki_flags |= (IOCB_DSYNC | IOCB_SYNC); kiocb.ki_pos = *ppos; - ret = fn(&kiocb, iter); + if (type == READ) + ret = call_read_iter(filp, &kiocb, iter); + else + ret = call_write_iter(filp, &kiocb, iter); BUG_ON(ret == -EIOCBQUEUED); *ppos = kiocb.ki_pos; return ret; @@ -700,7 +701,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, /* Do it by hand, with file-ops */ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, - loff_t *ppos, io_fn_t fn, int flags) + loff_t *ppos, int type, int flags) { ssize_t ret = 0; @@ -711,7 +712,13 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, struct iovec 
iovec = iov_iter_iovec(iter); ssize_t nr; - nr = fn(filp, iovec.iov_base, iovec.iov_len, ppos); + if (type == READ) { + nr = filp->f_op->read(filp, iovec.iov_base, + iovec.iov_len, ppos); + } else { + nr = filp->f_op->write(filp, iovec.iov_base, + iovec.iov_len, ppos); + } if (nr < 0) { if (!ret) @@ -834,50 +841,32 @@ out: return ret; } -static ssize_t do_readv_writev(int type, struct file *file, - const struct iovec __user * uvector, - unsigned long nr_segs, loff_t *pos, - int flags) +static ssize_t __do_readv_writev(int type, struct file *file, + struct iov_iter *iter, loff_t *pos, int flags) { size_t tot_len; - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov = iovstack; - struct iov_iter iter; - ssize_t ret; - io_fn_t fn; - iter_fn_t iter_fn; - - ret = import_iovec(type, uvector, nr_segs, - ARRAY_SIZE(iovstack), &iov, &iter); - if (ret < 0) - return ret; + ssize_t ret = 0; - tot_len = iov_iter_count(&iter); + tot_len = iov_iter_count(iter); if (!tot_len) goto out; ret = rw_verify_area(type, file, pos, tot_len); if (ret < 0) goto out; - if (type == READ) { - fn = file->f_op->read; - iter_fn = file->f_op->read_iter; - } else { - fn = (io_fn_t)file->f_op->write; - iter_fn = file->f_op->write_iter; + if (type != READ) file_start_write(file); - } - if (iter_fn) - ret = do_iter_readv_writev(file, &iter, pos, iter_fn, flags); + if ((type == READ && file->f_op->read_iter) || + (type == WRITE && file->f_op->write_iter)) + ret = do_iter_readv_writev(file, iter, pos, type, flags); else - ret = do_loop_readv_writev(file, &iter, pos, fn, flags); + ret = do_loop_readv_writev(file, iter, pos, type, flags); if (type != READ) file_end_write(file); out: - kfree(iov); if ((ret + (type == READ)) > 0) { if (type == READ) fsnotify_access(file); @@ -887,6 +876,27 @@ out: return ret; } +static ssize_t do_readv_writev(int type, struct file *file, + const struct iovec __user *uvector, + unsigned long nr_segs, loff_t *pos, + int flags) +{ + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + ssize_t ret; + + ret = import_iovec(type, uvector, nr_segs, + ARRAY_SIZE(iovstack), &iov, &iter); + if (ret < 0) + return ret; + + ret = __do_readv_writev(type, file, &iter, pos, flags); + kfree(iov); + + return ret; +} + ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, int flags) { @@ -1064,51 +1074,19 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, unsigned long nr_segs, loff_t *pos, int flags) { - compat_ssize_t tot_len; struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; ssize_t ret; - io_fn_t fn; - iter_fn_t iter_fn; ret = compat_import_iovec(type, uvector, nr_segs, UIO_FASTIOV, &iov, &iter); if (ret < 0) return ret; - tot_len = iov_iter_count(&iter); - if (!tot_len) - goto out; - ret = rw_verify_area(type, file, pos, tot_len); - if (ret < 0) - goto out; - - if (type == READ) { - fn = file->f_op->read; - iter_fn = file->f_op->read_iter; - } else { - fn = (io_fn_t)file->f_op->write; - iter_fn = file->f_op->write_iter; - file_start_write(file); - } - - if (iter_fn) - ret = do_iter_readv_writev(file, &iter, pos, iter_fn, flags); - else - ret = do_loop_readv_writev(file, &iter, pos, fn, flags); - - if (type != READ) - file_end_write(file); - -out: + ret = __do_readv_writev(type, file, &iter, pos, flags); kfree(iov); - if ((ret + (type == READ)) > 0) { - if (type == READ) - fsnotify_access(file); - else - fsnotify_modify(file); - } + return ret; } @@ -1518,6 +1496,11 
@@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, if (flags != 0) return -EINVAL; + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) + return -EISDIR; + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) + return -EINVAL; + ret = rw_verify_area(READ, file_in, &pos_in, len); if (unlikely(ret)) return ret; @@ -1538,7 +1521,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, if (len == 0) return 0; - sb_start_write(inode_out->i_sb); + file_start_write(file_out); /* * Try cloning first, this is supported by more file systems, and @@ -1574,7 +1557,7 @@ done: inc_syscr(current); inc_syscw(current); - sb_end_write(inode_out->i_sb); + file_end_write(file_out); return ret; } diff --git a/fs/select.c b/fs/select.c index 305c0da..e211227 100644 --- a/fs/select.c +++ b/fs/select.c @@ -15,7 +15,8 @@ */ #include <linux/kernel.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> +#include <linux/sched/rt.h> #include <linux/syscalls.h> #include <linux/export.h> #include <linux/slab.h> @@ -26,7 +27,6 @@ #include <linux/fs.h> #include <linux/rcupdate.h> #include <linux/hrtimer.h> -#include <linux/sched/rt.h> #include <linux/freezer.h> #include <net/busy_poll.h> #include <linux/vmalloc.h> diff --git a/fs/splice.c b/fs/splice.c index 4ef78aa..006ba50 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -33,6 +33,8 @@ #include <linux/gfp.h> #include <linux/socket.h> #include <linux/compat.h> +#include <linux/sched/signal.h> + #include "internal.h" /* @@ -307,7 +309,7 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, idx = to.idx; init_sync_kiocb(&kiocb, in); kiocb.ki_pos = *ppos; - ret = in->f_op->read_iter(&kiocb, &to); + ret = call_read_iter(in, &kiocb, &to); if (ret > 0) { *ppos = kiocb.ki_pos; file_accessed(in); @@ -12,12 +12,22 @@ #include <linux/fs.h> #include <linux/namei.h> #include <linux/security.h> +#include <linux/cred.h> #include <linux/syscalls.h> #include <linux/pagemap.h> #include <linux/uaccess.h> #include <asm/unistd.h> +/** + * generic_fillattr - Fill in the basic attributes from the inode struct + * @inode: Inode to use as the source + * @stat: Where to fill in the attributes + * + * Fill in the basic attributes in the kstat structure from data that's to be + * found on the VFS inode structure. This is the default if no getattr inode + * operation is supplied. + */ void generic_fillattr(struct inode *inode, struct kstat *stat) { stat->dev = inode->i_sb->s_dev; @@ -33,81 +43,147 @@ void generic_fillattr(struct inode *inode, struct kstat *stat) stat->ctime = inode->i_ctime; stat->blksize = i_blocksize(inode); stat->blocks = inode->i_blocks; -} + if (IS_NOATIME(inode)) + stat->result_mask &= ~STATX_ATIME; + if (IS_AUTOMOUNT(inode)) + stat->attributes |= STATX_ATTR_AUTOMOUNT; +} EXPORT_SYMBOL(generic_fillattr); /** * vfs_getattr_nosec - getattr without security checks * @path: file to get attributes from * @stat: structure to return attributes in + * @request_mask: STATX_xxx flags indicating what the caller wants + * @query_flags: Query mode (KSTAT_QUERY_FLAGS) * * Get attributes without calling security_inode_getattr. * * Currently the only caller other than vfs_getattr is internal to the - * filehandle lookup code, which uses only the inode number and returns - * no attributes to any user. Any other code probably wants - * vfs_getattr. + * filehandle lookup code, which uses only the inode number and returns no + * attributes to any user. Any other code probably wants vfs_getattr. 
*/ -int vfs_getattr_nosec(struct path *path, struct kstat *stat) +int vfs_getattr_nosec(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) { struct inode *inode = d_backing_inode(path->dentry); + memset(stat, 0, sizeof(*stat)); + stat->result_mask |= STATX_BASIC_STATS; + request_mask &= STATX_ALL; + query_flags &= KSTAT_QUERY_FLAGS; if (inode->i_op->getattr) - return inode->i_op->getattr(path->mnt, path->dentry, stat); + return inode->i_op->getattr(path, stat, request_mask, + query_flags); generic_fillattr(inode, stat); return 0; } - EXPORT_SYMBOL(vfs_getattr_nosec); -int vfs_getattr(struct path *path, struct kstat *stat) +/* + * vfs_getattr - Get the enhanced basic attributes of a file + * @path: The file of interest + * @stat: Where to return the statistics + * @request_mask: STATX_xxx flags indicating what the caller wants + * @query_flags: Query mode (KSTAT_QUERY_FLAGS) + * + * Ask the filesystem for a file's attributes. The caller must indicate in + * request_mask and query_flags to indicate what they want. + * + * If the file is remote, the filesystem can be forced to update the attributes + * from the backing store by passing AT_STATX_FORCE_SYNC in query_flags or can + * suppress the update by passing AT_STATX_DONT_SYNC. + * + * Bits must have been set in request_mask to indicate which attributes the + * caller wants retrieving. Any such attribute not requested may be returned + * anyway, but the value may be approximate, and, if remote, may not have been + * synchronised with the server. + * + * 0 will be returned on success, and a -ve error code if unsuccessful. + */ +int vfs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) { int retval; retval = security_inode_getattr(path); if (retval) return retval; - return vfs_getattr_nosec(path, stat); + return vfs_getattr_nosec(path, stat, request_mask, query_flags); } - EXPORT_SYMBOL(vfs_getattr); -int vfs_fstat(unsigned int fd, struct kstat *stat) +/** + * vfs_statx_fd - Get the enhanced basic attributes by file descriptor + * @fd: The file descriptor referring to the file of interest + * @stat: The result structure to fill in. + * @request_mask: STATX_xxx flags indicating what the caller wants + * @query_flags: Query mode (KSTAT_QUERY_FLAGS) + * + * This function is a wrapper around vfs_getattr(). The main difference is + * that it uses a file descriptor to determine the file location. + * + * 0 will be returned on success, and a -ve error code if unsuccessful. + */ +int vfs_statx_fd(unsigned int fd, struct kstat *stat, + u32 request_mask, unsigned int query_flags) { struct fd f = fdget_raw(fd); int error = -EBADF; if (f.file) { - error = vfs_getattr(&f.file->f_path, stat); + error = vfs_getattr(&f.file->f_path, stat, + request_mask, query_flags); fdput(f); } return error; } -EXPORT_SYMBOL(vfs_fstat); +EXPORT_SYMBOL(vfs_statx_fd); -int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, - int flag) +/** + * vfs_statx - Get basic and extra attributes by filename + * @dfd: A file descriptor representing the base dir for a relative filename + * @filename: The name of the file of interest + * @flags: Flags to control the query + * @stat: The result structure to fill in. + * @request_mask: STATX_xxx flags indicating what the caller wants + * + * This function is a wrapper around vfs_getattr(). The main difference is + * that it uses a filename and base directory to determine the file location. 
+ * Additionally, the use of AT_SYMLINK_NOFOLLOW in flags will prevent a symlink + * at the given name from being referenced. + * + * The caller must have preset stat->request_mask as for vfs_getattr(). The + * flags are also used to load up stat->query_flags. + * + * 0 will be returned on success, and a -ve error code if unsuccessful. + */ +int vfs_statx(int dfd, const char __user *filename, int flags, + struct kstat *stat, u32 request_mask) { struct path path; int error = -EINVAL; - unsigned int lookup_flags = 0; + unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT; - if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | - AT_EMPTY_PATH)) != 0) - goto out; + if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | + AT_EMPTY_PATH | KSTAT_QUERY_FLAGS)) != 0) + return -EINVAL; - if (!(flag & AT_SYMLINK_NOFOLLOW)) - lookup_flags |= LOOKUP_FOLLOW; - if (flag & AT_EMPTY_PATH) + if (flags & AT_SYMLINK_NOFOLLOW) + lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & AT_NO_AUTOMOUNT) + lookup_flags &= ~LOOKUP_AUTOMOUNT; + if (flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; + retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (error) goto out; - error = vfs_getattr(&path, stat); + error = vfs_getattr(&path, stat, request_mask, flags); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -116,19 +192,7 @@ retry: out: return error; } -EXPORT_SYMBOL(vfs_fstatat); - -int vfs_stat(const char __user *name, struct kstat *stat) -{ - return vfs_fstatat(AT_FDCWD, name, stat, 0); -} -EXPORT_SYMBOL(vfs_stat); - -int vfs_lstat(const char __user *name, struct kstat *stat) -{ - return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW); -} -EXPORT_SYMBOL(vfs_lstat); +EXPORT_SYMBOL(vfs_statx); #ifdef __ARCH_WANT_OLD_STAT @@ -141,7 +205,7 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta { static int warncount = 5; struct __old_kernel_stat tmp; - + if (warncount > 0) { warncount--; printk(KERN_WARNING "VFS: Warning: %s using old stat() call. Recompile your binary.\n", @@ -166,7 +230,7 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta #if BITS_PER_LONG == 32 if (stat->size > MAX_NON_LFS) return -EOVERFLOW; -#endif +#endif tmp.st_size = stat->size; tmp.st_atime = stat->atime.tv_sec; tmp.st_mtime = stat->mtime.tv_sec; @@ -445,6 +509,81 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename, } #endif /* __ARCH_WANT_STAT64 || __ARCH_WANT_COMPAT_STAT64 */ +static inline int __put_timestamp(struct timespec *kts, + struct statx_timestamp __user *uts) +{ + return (__put_user(kts->tv_sec, &uts->tv_sec ) || + __put_user(kts->tv_nsec, &uts->tv_nsec ) || + __put_user(0, &uts->__reserved )); +} + +/* + * Set the statx results. 
+ */ +static long statx_set_result(struct kstat *stat, struct statx __user *buffer) +{ + uid_t uid = from_kuid_munged(current_user_ns(), stat->uid); + gid_t gid = from_kgid_munged(current_user_ns(), stat->gid); + + if (__put_user(stat->result_mask, &buffer->stx_mask ) || + __put_user(stat->mode, &buffer->stx_mode ) || + __clear_user(&buffer->__spare0, sizeof(buffer->__spare0)) || + __put_user(stat->nlink, &buffer->stx_nlink ) || + __put_user(uid, &buffer->stx_uid ) || + __put_user(gid, &buffer->stx_gid ) || + __put_user(stat->attributes, &buffer->stx_attributes ) || + __put_user(stat->blksize, &buffer->stx_blksize ) || + __put_user(MAJOR(stat->rdev), &buffer->stx_rdev_major ) || + __put_user(MINOR(stat->rdev), &buffer->stx_rdev_minor ) || + __put_user(MAJOR(stat->dev), &buffer->stx_dev_major ) || + __put_user(MINOR(stat->dev), &buffer->stx_dev_minor ) || + __put_timestamp(&stat->atime, &buffer->stx_atime ) || + __put_timestamp(&stat->btime, &buffer->stx_btime ) || + __put_timestamp(&stat->ctime, &buffer->stx_ctime ) || + __put_timestamp(&stat->mtime, &buffer->stx_mtime ) || + __put_user(stat->ino, &buffer->stx_ino ) || + __put_user(stat->size, &buffer->stx_size ) || + __put_user(stat->blocks, &buffer->stx_blocks ) || + __clear_user(&buffer->__spare1, sizeof(buffer->__spare1)) || + __clear_user(&buffer->__spare2, sizeof(buffer->__spare2))) + return -EFAULT; + + return 0; +} + +/** + * sys_statx - System call to get enhanced stats + * @dfd: Base directory to pathwalk from *or* fd to stat. + * @filename: File to stat *or* NULL. + * @flags: AT_* flags to control pathwalk. + * @mask: Parts of statx struct actually required. + * @buffer: Result buffer. + * + * Note that if filename is NULL, then it does the equivalent of fstat() using + * dfd to indicate the file of interest. + */ +SYSCALL_DEFINE5(statx, + int, dfd, const char __user *, filename, unsigned, flags, + unsigned int, mask, + struct statx __user *, buffer) +{ + struct kstat stat; + int error; + + if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE) + return -EINVAL; + if (!access_ok(VERIFY_WRITE, buffer, sizeof(*buffer))) + return -EFAULT; + + if (filename) + error = vfs_statx(dfd, filename, flags, &stat, mask); + else + error = vfs_statx_fd(dfd, &stat, mask, flags); + if (error) + return error; + return statx_set_result(&stat, buffer); +} + /* Caller is here responsible for sufficient locking (ie. 
inode->i_lock) */ void __inode_add_bytes(struct inode *inode, loff_t bytes) { @@ -192,7 +192,7 @@ int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync) spin_unlock(&inode->i_lock); mark_inode_dirty_sync(inode); } - return file->f_op->fsync(file, start, end, datasync); + return call_fsync(file, start, end, datasync); } EXPORT_SYMBOL(vfs_fsync_range); diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index 08d3e63..83809f5 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -440,10 +440,11 @@ static unsigned sysv_nblocks(struct super_block *s, loff_t size) return blocks; } -int sysv_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +int sysv_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { - struct super_block *s = dentry->d_sb; - generic_fillattr(d_inode(dentry), stat); + struct super_block *s = path->dentry->d_sb; + generic_fillattr(d_inode(path->dentry), stat); stat->blocks = (s->s_blocksize / 512) * sysv_nblocks(s, stat->size); stat->blksize = s->s_blocksize; return 0; diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index 6c21228..1e7e27c 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h @@ -142,7 +142,7 @@ extern struct inode *sysv_iget(struct super_block *, unsigned int); extern int sysv_write_inode(struct inode *, struct writeback_control *wbc); extern int sysv_sync_inode(struct inode *); extern void sysv_set_inode(struct inode *, dev_t); -extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *); +extern int sysv_getattr(const struct path *, struct kstat *, u32, unsigned int); extern int sysv_init_icache(void); extern void sysv_destroy_icache(void); diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 528369f..30825d88 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1622,11 +1622,11 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, return do_rename(old_dir, old_dentry, new_dir, new_dentry, flags); } -int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +int ubifs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { loff_t size; - struct inode *inode = d_inode(dentry); + struct inode *inode = d_inode(path->dentry); struct ubifs_inode *ui = ubifs_inode(inode); mutex_lock(&ui->ui_mutex); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index f0c86f0..4d57e48 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1749,8 +1749,8 @@ int ubifs_update_time(struct inode *inode, struct timespec *time, int flags); /* dir.c */ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, umode_t mode); -int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat); +int ubifs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags); int ubifs_check_dir_empty(struct inode *dir); /* xattr.c */ diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index f7dfef5..6023c97 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -152,9 +152,10 @@ out_unmap: return err; } -static int udf_symlink_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +static int udf_symlink_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { + struct dentry *dentry = path->dentry; struct inode *inode = d_backing_inode(dentry); struct page *page; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 3c421d0..973607d 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -14,7 +14,8 @@ 
#include <linux/list.h> #include <linux/hashtable.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> +#include <linux/sched/mm.h> #include <linux/mm.h> #include <linux/poll.h> #include <linux/slab.h> diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 339c696..2dfdc62 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -16,6 +16,7 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include <linux/mm.h> +#include <linux/sched/mm.h> #include <linux/highmem.h> #include <linux/slab.h> #include <linux/swap.h> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 8c7d01b..b620872 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -33,6 +33,7 @@ #include <linux/migrate.h> #include <linux/backing-dev.h> #include <linux/freezer.h> +#include <linux/sched/mm.h> #include "xfs_format.h" #include "xfs_log_format.h" diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index cf1363db..2fd7fdf 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -43,6 +43,7 @@ #include "xfs_acl.h" #include <linux/capability.h> +#include <linux/cred.h> #include <linux/dcache.h> #include <linux/mount.h> #include <linux/namei.h> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 22c1615..229cc6a 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -489,11 +489,12 @@ xfs_vn_get_link_inline( STATIC int xfs_vn_getattr( - struct vfsmount *mnt, - struct dentry *dentry, - struct kstat *stat) + const struct path *path, + struct kstat *stat, + u32 request_mask, + unsigned int query_flags) { - struct inode *inode = d_inode(dentry); + struct inode *inode = d_inode(path->dentry); struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 7a989de..592fdf7 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -55,7 +55,7 @@ typedef __u32 xfs_nlink_t; #include <linux/file.h> #include <linux/swap.h> #include <linux/errno.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/bitops.h> #include <linux/major.h> #include <linux/pagemap.h>
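
Editorial note on the overlayfs hunks above: ovl_copy_up_start() sleeps on the per-superblock copyup_wq until no other task is copying the same dentry up, then returns 1 if the upper dentry already appeared (another task won the race), 0 if the caller now owns the copy-up, or -ERESTARTSYS if the interruptible wait was cut short by a signal. A minimal sketch of the intended calling pattern follows; do_copy_up_work() is a hypothetical placeholder, not a function in this diff.

static int example_copy_up_one(struct dentry *dentry)
{
	int err;

	err = ovl_copy_up_start(dentry);
	/* err < 0: interrupted by a signal; err > 0: already copied up */
	if (err)
		return err > 0 ? 0 : err;

	err = do_copy_up_work(dentry);	/* placeholder for the real copy-up */

	ovl_copy_up_end(dentry);	/* clear the copying flag, wake waiters */
	return err;
}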
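
The ->getattr() conversions above (proc, sysv, ubifs, udf, xfs) all follow one shape: the vfsmount/dentry pair becomes a const struct path *, and two new parameters tell the filesystem which STATX_* fields the caller asked for and how the query should be synchronised. A filesystem with nothing special to report can ignore both new parameters; a hypothetical minimal implementation (examplefs is not part of this diff) would look like this:

static int examplefs_getattr(const struct path *path, struct kstat *stat,
			     u32 request_mask, unsigned int query_flags)
{
	struct inode *inode = d_inode(path->dentry);

	generic_fillattr(inode, stat);

	/*
	 * A filesystem that tracks extra attributes could fill them in here
	 * and advertise them, e.g. a creation time:
	 *	stat->btime = birth_time_of(inode);
	 *	stat->result_mask |= STATX_BTIME;
	 */
	return 0;
}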
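
The fs/read_write.c, fs/splice.c and vfs_fsync_range() hunks above stop calling f_op->read_iter/->write_iter/->fsync directly and go through call_read_iter(), call_write_iter() and call_fsync() instead; do_iter_readv_writev() and do_loop_readv_writev() now dispatch on a READ/WRITE type rather than carrying io_fn_t/iter_fn_t function pointers. The helpers themselves live in include/linux/fs.h, outside this fs/-limited diffstat; to the best of my understanding they are thin static inline wrappers roughly like the sketch below, which gives stacking and instrumentation code a single place to hook these methods.

static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio,
				     struct iov_iter *iter)
{
	return file->f_op->read_iter(kio, iter);
}

static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio,
				      struct iov_iter *iter)
{
	return file->f_op->write_iter(kio, iter);
}

static inline int call_fsync(struct file *file, loff_t start, loff_t end,
			     int datasync)
{
	return file->f_op->fsync(file, start, end, datasync);
}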
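
The vfs_copy_file_range() hunk above adds up-front type checks (directories now fail with -EISDIR, any other non-regular file with -EINVAL) and switches the write protection from sb_start_write() on the superblock to file_start_write() on the destination file. A small user-space sketch of the syscall this backs; it goes through syscall(2) in case the libc wrapper is unavailable, and the file names are placeholders.

#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	int in = open("input.txt", O_RDONLY);
	int out = open("output.txt", O_WRONLY | O_CREAT | O_TRUNC, 0644);
	long n;

	if (in < 0 || out < 0) {
		perror("open");
		return 1;
	}

	/* With the hunk above, the kernel rejects non-regular files before
	 * any data is moved.  NULL offsets mean "use and update f_pos". */
	n = syscall(__NR_copy_file_range, in, NULL, out, NULL, 65536, 0);
	if (n < 0)
		perror("copy_file_range");
	else
		printf("copied %ld bytes\n", n);

	close(in);
	close(out);
	return 0;
}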
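
The fs/stat.c rework above culminates in the new statx(2) system call: vfs_statx() and vfs_statx_fd() carry a request mask and query flags down to ->getattr(), statx_set_result() copies the kstat into the user-space struct statx, and generic_fillattr() now reports STATX_ATTR_AUTOMOUNT and withholds STATX_ATIME on no-atime inodes. A user-space sketch of calling the new syscall follows; it uses syscall(2) because a libc wrapper may not exist yet, and it assumes __NR_statx and the UAPI <linux/stat.h> from a kernel tree that carries this change.

#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/syscall.h>
#include <linux/stat.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : ".";
	struct statx stx;

	/* Ask only for the size and birth time; no AT_STATX_* sync flag is
	 * passed, so the filesystem's default sync behaviour applies. */
	if (syscall(__NR_statx, AT_FDCWD, path, AT_SYMLINK_NOFOLLOW,
		    STATX_SIZE | STATX_BTIME, &stx) == -1) {
		perror("statx");
		return 1;
	}

	printf("size:  %llu\n", (unsigned long long)stx.stx_size);
	if (stx.stx_mask & STATX_BTIME)
		printf("btime: %lld\n", (long long)stx.stx_btime.tv_sec);
	else
		printf("btime: not provided by this filesystem\n");
	return 0;
}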